{"step": 0, "timestamp": 1778194514.0954397, "grad/layer_0/attn": 0.007365311495959759, "grad/layer_0/mlp": 0.007989591918885708, "grad/layer_0/attn_mlp_ratio": 0.9218632789446217, "grad/layer_4/attn": 0.01555747352540493, "grad/layer_4/mlp": 0.012975124642252922, "grad/layer_4/attn_mlp_ratio": 1.1990230409687472, "grad/layer_8/attn": 0.011175042949616909, "grad/layer_8/mlp": 0.00597722502425313, "grad/layer_8/attn_mlp_ratio": 1.869603824067654, "grad/layer_12/attn": 0.0026964677963405848, "grad/layer_12/mlp": 0.0010275683598592877, "grad/layer_12/attn_mlp_ratio": 2.6241247193494353, "grad/layer_16/attn": 0.003890687134116888, "grad/layer_16/mlp": 0.0069762468338012695, "grad/layer_16/attn_mlp_ratio": 0.557704905092415, "grad/layer_20/attn": 0.006082314532250166, "grad/layer_20/mlp": 0.005152135156095028, "grad/layer_20/attn_mlp_ratio": 1.1805424799464896, "grad/layer_24/attn": 0.016979975625872612, "grad/layer_24/mlp": 0.031011298298835754, "grad/layer_24/attn_mlp_ratio": 0.5475415897616878, "grad/layer_27/attn": 0.0326603502035141, "grad/layer_27/mlp": 0.05628252774477005, "grad/layer_27/attn_mlp_ratio": 0.5802928804760321} {"step": 0, "timestamp": 1778194515.8061287, "eos/sharpness": 2.7702331542968746, "eos/L0_probe": 10.833698272705078, "eos/L_plus": 11.034712791442871, "eos/L_minus": 10.660386085510254, "eos/grad_norm": 1.0028200149536133, "eos/embed_grad_frac": 0.8117799162864685, "eos/time_s": 1.7057840824127197} {"step": 0, "timestamp": 1778194516.0339134, "train/loss": 10.833086967468262, "train/z_loss": 0.001183597487397492, "train/perplexity": 50669.88188828184, "train/grad_norm": 1.84375, "optim/muon_lr": 0.0, "optim/adamw_lr": 0.0, "perf/tokens_per_sec": 24875.65505964738, "perf/iters_per_sec": 0.011861636667083444, "perf/gpu_mem_gb": 78.058811392, "perf/step_time_s": 84.30539798736572, "data/tokens_consumed": 2097152, "data/tokens_consumed_B": 0.002097152} {"step": 0, "timestamp": 1778194517.8124838, "geo/rankme_last": 29.792739868164062, "geo/layer_0/stable_rank_q_proj": 53.602561950683594, "geo/layer_0/stable_rank_k_proj": 45.90247344970703, "geo/layer_0/stable_rank_o_proj": 67.8011474609375, "geo/layer_0/stable_rank_gate_proj": 161.25030517578125, "geo/layer_0/stable_rank_down_proj": 56.06612777709961, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.007244851905852556, "geo/layer_0/attn_entropy_mean": 7.01924991607666, "geo/layer_0/attn_entropy_std": 0.015764569863677025, "geo/layer_7/stable_rank_q_proj": 27.272844314575195, "geo/layer_7/stable_rank_k_proj": 28.87115478515625, "geo/layer_7/stable_rank_o_proj": 117.2869644165039, "geo/layer_7/stable_rank_gate_proj": 172.76690673828125, "geo/layer_7/stable_rank_down_proj": 200.05337524414062, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.8733821511268616, "geo/layer_7/attn_entropy_mean": 6.119824409484863, "geo/layer_7/attn_entropy_std": 0.6805421710014343, "geo/layer_14/stable_rank_q_proj": 33.9923095703125, "geo/layer_14/stable_rank_k_proj": 22.279741287231445, "geo/layer_14/stable_rank_o_proj": 92.39765167236328, "geo/layer_14/stable_rank_gate_proj": 193.68536376953125, "geo/layer_14/stable_rank_down_proj": 164.38673400878906, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.8801140785217285, "geo/layer_14/attn_entropy_mean": 6.653115749359131, "geo/layer_14/attn_entropy_std": 0.40915676951408386, "geo/layer_21/stable_rank_q_proj": 64.25438690185547, "geo/layer_21/stable_rank_k_proj": 36.4431266784668, "geo/layer_21/stable_rank_o_proj": 65.61846160888672, "geo/layer_21/stable_rank_gate_proj": 150.0904541015625, "geo/layer_21/stable_rank_down_proj": 156.18081665039062, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.8829315304756165, "geo/layer_21/attn_entropy_mean": 6.895988464355469, "geo/layer_21/attn_entropy_std": 0.08706562966108322, "geo/layer_27/stable_rank_q_proj": 59.485591888427734, "geo/layer_27/stable_rank_k_proj": 20.411039352416992, "geo/layer_27/stable_rank_o_proj": 75.03089904785156, "geo/layer_27/stable_rank_gate_proj": 82.33755493164062, "geo/layer_27/stable_rank_down_proj": 60.85941696166992, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.9660035371780396, "geo/layer_27/attn_entropy_mean": 6.602514743804932, "geo/layer_27/attn_entropy_std": 0.10041193664073944, "attnres/final_alpha/block_0": 0.10114791989326477, "attnres/block_norm/0": 0.45335036516189575, "attnres/final_alpha/block_1": 0.0055791521444916725, "attnres/block_norm/1": 3165.49658203125, "attnres/final_alpha/block_2": 0.10953284054994583, "attnres/block_norm/2": 845.4291381835938, "attnres/final_alpha/block_3": 0.029725216329097748, "attnres/block_norm/3": 750.7996826171875, "attnres/final_alpha/block_4": 0.05867605283856392, "attnres/block_norm/4": 3077.130859375, "attnres/final_alpha/block_5": 0.18950068950653076, "attnres/block_norm/5": 1145.855712890625, "attnres/final_alpha/block_6": 0.5058380961418152, "attnres/block_norm/6": 1700.1439208984375, "geo/tier1_time_s": 1.7728288173675537, "geo/step": 0.0} {"step": 0, "timestamp": 1778194526.4629009, "geo/ww_alpha_mean": 7.811904989987941, "geo/ww_alpha_std": 7.1145409627758305, "geo/ww_alpha_min": 1.874115720098375, "geo/ww_alpha_max": 59.42035244555838, "geo/ww_alpha_healthy_frac": 0.29949238578680204, "geo/ww_alpha_by_type/q_proj": 4.280761732179406, "geo/ww_alpha_by_type/k_proj": 4.0968052964907375, "geo/ww_alpha_by_type/v_proj": 5.7560136936389865, "geo/ww_alpha_by_type/o_proj": 5.500031112244558, "geo/ww_alpha_by_type/gate_proj": 12.213385310352331, "geo/ww_alpha_by_type/up_proj": 12.16653163349539, "geo/ww_alpha_by_type/down_proj": 9.322982586302222, "geo/twonn_id/layer_0": 0.5951379537582397, "geo/twonn_id/layer_7": 2.9946630001068115, "geo/twonn_id/layer_14": 2.711090564727783, "geo/twonn_id/layer_21": 2.903562068939209, "geo/twonn_id/layer_27": 5.67042350769043, "geo/tier2_time_s": 8.642704963684082} {"step": 0, "timestamp": 1778194527.643193, "eoc/jacobian_sigma/layer_0/attn": 1551.9493408203125, "eoc/jacobian_sigma/layer_0/mlp": 1071.244384765625, "eoc/jacobian_sigma/layer_0": 1551.9493408203125, "eoc/jacobian_sigma/layer_7/attn": 1.0068025588989258, "eoc/jacobian_sigma/layer_7/mlp": 1.2488306760787964, "eoc/jacobian_sigma/layer_7": 1.2488306760787964, "eoc/jacobian_sigma/layer_14/attn": 1.0537097454071045, "eoc/jacobian_sigma/layer_14/mlp": 2.181209087371826, "eoc/jacobian_sigma/layer_14": 2.181209087371826, "eoc/jacobian_sigma/layer_21/attn": 0.9994311332702637, "eoc/jacobian_sigma/layer_21/mlp": 2.007694721221924, "eoc/jacobian_sigma/layer_21": 2.007694721221924, "eoc/jacobian_sigma/layer_27/attn": 1.0020406246185303, "eoc/jacobian_sigma/layer_27/mlp": 2.590712547302246, "eoc/jacobian_sigma/layer_27": 2.590712547302246, "eoc/layer0_sigma": 1551.9493408203125, "eoc/sigma_max": 2.590712547302246, "eoc/sigma_min": 1.2488306760787964, "eoc/sigma_mean": 2.007111757993698, "eoc/time_s": 1.1716713905334473} {"step": 10, "timestamp": 1778194538.177148, "train/loss": 10.627069473266602, "train/z_loss": 0.0011657053604722024, "train/perplexity": 41236.10445299555, "train/grad_norm": 2.125, "optim/muon_lr": 4e-05, "optim/adamw_lr": 1.2e-06, "perf/tokens_per_sec": 947324.8702240579, "perf/iters_per_sec": 0.45171969901278397, "perf/gpu_mem_gb": 78.325295104, "perf/step_time_s": 2.2137622117996214, "data/tokens_consumed": 23068672, "data/tokens_consumed_B": 0.023068672} {"step": 20, "timestamp": 1778194548.668364, "train/loss": 10.628330612182618, "train/z_loss": 0.0011659996584057809, "train/perplexity": 41288.14171527741, "train/grad_norm": 2.015625, "optim/muon_lr": 8e-05, "optim/adamw_lr": 2.4e-06, "perf/tokens_per_sec": 2000549.2632793696, "perf/iters_per_sec": 0.9539362255474899, "perf/gpu_mem_gb": 78.325295104, "perf/step_time_s": 1.048288106918335, "data/tokens_consumed": 44040192, "data/tokens_consumed_B": 0.044040192} {"step": 30, "timestamp": 1778194559.1433961, "train/loss": 10.627608489990234, "train/z_loss": 0.0011657538823783397, "train/perplexity": 41258.33739433846, "train/grad_norm": 2.140625, "optim/muon_lr": 0.00012, "optim/adamw_lr": 3.6e-06, "perf/tokens_per_sec": 2003400.237646854, "perf/iters_per_sec": 0.9552956760629912, "perf/gpu_mem_gb": 78.325295104, "perf/step_time_s": 1.0467963218688965, "data/tokens_consumed": 65011712, "data/tokens_consumed_B": 0.065011712} {"step": 40, "timestamp": 1778194569.6097271, "train/loss": 10.629000854492187, "train/z_loss": 0.001165717386174947, "train/perplexity": 41315.82405063837, "train/grad_norm": 2.09375, "optim/muon_lr": 0.00016, "optim/adamw_lr": 4.8e-06, "perf/tokens_per_sec": 2004984.8333500489, "perf/iters_per_sec": 0.9560512701750035, "perf/gpu_mem_gb": 78.325295104, "perf/step_time_s": 1.045969009399414, "data/tokens_consumed": 85983232, "data/tokens_consumed_B": 0.085983232} {"step": 50, "timestamp": 1778194580.057017, "grad/layer_0/attn": 0.025952370837330818, "grad/layer_0/mlp": 0.030824076384305954, "grad/layer_0/attn_mlp_ratio": 0.8419512860521365, "grad/layer_4/attn": 0.05456765368580818, "grad/layer_4/mlp": 0.029880182817578316, "grad/layer_4/attn_mlp_ratio": 1.8262155166963983, "grad/layer_8/attn": 0.013222643174231052, "grad/layer_8/mlp": 0.00485390005633235, "grad/layer_8/attn_mlp_ratio": 2.724127556884524, "grad/layer_12/attn": 0.0074904970824718475, "grad/layer_12/mlp": 0.0011300569167360663, "grad/layer_12/attn_mlp_ratio": 6.628424027759753, "grad/layer_16/attn": 0.005154724698513746, "grad/layer_16/mlp": 0.006997248623520136, "grad/layer_16/attn_mlp_ratio": 0.7366787865046103, "grad/layer_20/attn": 0.008730883710086346, "grad/layer_20/mlp": 0.010630784556269646, "grad/layer_20/attn_mlp_ratio": 0.8212830936178537, "grad/layer_24/attn": 0.044183291494846344, "grad/layer_24/mlp": 0.028801405802369118, "grad/layer_24/attn_mlp_ratio": 1.5340671786863, "grad/layer_27/attn": 0.055015191435813904, "grad/layer_27/mlp": 0.06967439502477646, "grad/layer_27/attn_mlp_ratio": 0.7896041485152457} {"step": 50, "timestamp": 1778194580.0727983, "train/loss": 10.626634311676025, "train/z_loss": 0.0011653616442345084, "train/perplexity": 41218.16398797634, "train/grad_norm": 2.140625, "optim/muon_lr": 0.0002, "optim/adamw_lr": 5.999999999999999e-06, "perf/tokens_per_sec": 2005722.9589591604, "perf/iters_per_sec": 0.9564032358928491, "perf/gpu_mem_gb": 78.325295104, "perf/step_time_s": 1.0455840826034546, "data/tokens_consumed": 106954752, "data/tokens_consumed_B": 0.106954752} {"step": 60, "timestamp": 1778194590.91702, "train/loss": 10.622785949707032, "train/z_loss": 0.0011651989421807229, "train/perplexity": 41059.846400333736, "train/grad_norm": 2.015625, "optim/muon_lr": 0.00024, "optim/adamw_lr": 7.2e-06, "perf/tokens_per_sec": 1934944.4975769345, "perf/iters_per_sec": 0.9226534355053589, "perf/gpu_mem_gb": 78.325295104, "perf/step_time_s": 1.0838305711746217, "data/tokens_consumed": 127926272, "data/tokens_consumed_B": 0.127926272} {"step": 70, "timestamp": 1778194601.3491879, "train/loss": 10.61846342086792, "train/z_loss": 0.001164692104794085, "train/perplexity": 40882.747064379684, "train/grad_norm": 2.078125, "optim/muon_lr": 0.00028000000000000003, "optim/adamw_lr": 8.4e-06, "perf/tokens_per_sec": 2011444.6749504565, "perf/iters_per_sec": 0.9591315626861842, "perf/gpu_mem_gb": 78.325295104, "perf/step_time_s": 1.0426098346710204, "data/tokens_consumed": 148897792, "data/tokens_consumed_B": 0.148897792} {"step": 75, "timestamp": 1778194607.1728501, "eos/sharpness": 2.026748657226562, "eos/L0_probe": 10.597017288208008, "eos/L_plus": 10.792549133300781, "eos/L_minus": 10.4217529296875, "eos/grad_norm": 0.9954532384872437, "eos/embed_grad_frac": 0.7320650815963745, "eos/time_s": 0.6152949333190918} {"step": 75, "timestamp": 1778194608.5555706, "geo/rankme_last": 29.126178741455078, "geo/layer_0/stable_rank_q_proj": 53.60388946533203, "geo/layer_0/stable_rank_k_proj": 45.903289794921875, "geo/layer_0/stable_rank_o_proj": 67.80142974853516, "geo/layer_0/stable_rank_gate_proj": 161.2521514892578, "geo/layer_0/stable_rank_down_proj": 56.06708908081055, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.007633983623236418, "geo/layer_0/attn_entropy_mean": 7.0193071365356445, "geo/layer_0/attn_entropy_std": 0.01574062928557396, "geo/layer_7/stable_rank_q_proj": 27.27210235595703, "geo/layer_7/stable_rank_k_proj": 28.871591567993164, "geo/layer_7/stable_rank_o_proj": 117.28776550292969, "geo/layer_7/stable_rank_gate_proj": 172.76669311523438, "geo/layer_7/stable_rank_down_proj": 200.05274963378906, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.8716515302658081, "geo/layer_7/attn_entropy_mean": 6.132396697998047, "geo/layer_7/attn_entropy_std": 0.6756249070167542, "geo/layer_14/stable_rank_q_proj": 33.99350357055664, "geo/layer_14/stable_rank_k_proj": 22.279525756835938, "geo/layer_14/stable_rank_o_proj": 92.39201354980469, "geo/layer_14/stable_rank_gate_proj": 193.6787109375, "geo/layer_14/stable_rank_down_proj": 164.38470458984375, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.8818008303642273, "geo/layer_14/attn_entropy_mean": 6.661342620849609, "geo/layer_14/attn_entropy_std": 0.40677860379219055, "geo/layer_21/stable_rank_q_proj": 64.2526626586914, "geo/layer_21/stable_rank_k_proj": 36.44309616088867, "geo/layer_21/stable_rank_o_proj": 65.61468505859375, "geo/layer_21/stable_rank_gate_proj": 150.09445190429688, "geo/layer_21/stable_rank_down_proj": 156.18621826171875, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.8848254084587097, "geo/layer_21/attn_entropy_mean": 6.897251129150391, "geo/layer_21/attn_entropy_std": 0.08551429957151413, "geo/layer_27/stable_rank_q_proj": 59.48743438720703, "geo/layer_27/stable_rank_k_proj": 20.410993576049805, "geo/layer_27/stable_rank_o_proj": 75.03148651123047, "geo/layer_27/stable_rank_gate_proj": 82.33563995361328, "geo/layer_27/stable_rank_down_proj": 60.858604431152344, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.9644467234611511, "geo/layer_27/attn_entropy_mean": 6.633141994476318, "geo/layer_27/attn_entropy_std": 0.10411660373210907, "attnres/final_alpha/block_0": 0.10148974508047104, "attnres/block_norm/0": 0.4533553719520569, "attnres/final_alpha/block_1": 0.005619227886199951, "attnres/block_norm/1": 3175.3212890625, "attnres/final_alpha/block_2": 0.11211113631725311, "attnres/block_norm/2": 905.3133544921875, "attnres/final_alpha/block_3": 0.029421312734484673, "attnres/block_norm/3": 750.0589599609375, "attnres/final_alpha/block_4": 0.05885053798556328, "attnres/block_norm/4": 3180.68115234375, "attnres/final_alpha/block_5": 0.19105607271194458, "attnres/block_norm/5": 1156.1163330078125, "attnres/final_alpha/block_6": 0.5014519691467285, "attnres/block_norm/6": 1711.2501220703125, "geo/tier1_time_s": 1.362126111984253, "geo/step": 75.0} {"step": 80, "timestamp": 1778194613.7553341, "train/loss": 10.612710285186768, "train/z_loss": 0.0011638636351563036, "train/perplexity": 40648.218358114274, "train/grad_norm": 2.046875, "optim/muon_lr": 0.00032, "optim/adamw_lr": 9.6e-06, "perf/tokens_per_sec": 1691303.533071489, "perf/iters_per_sec": 0.8064763703687139, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.2399619340896606, "data/tokens_consumed": 169869312, "data/tokens_consumed_B": 0.169869312} {"step": 90, "timestamp": 1778194624.1543403, "train/loss": 10.602851295471192, "train/z_loss": 0.0011629171320237219, "train/perplexity": 40249.43701207278, "train/grad_norm": 2.015625, "optim/muon_lr": 0.00035999999999999997, "optim/adamw_lr": 1.0799999999999998e-05, "perf/tokens_per_sec": 2017715.3400801907, "perf/iters_per_sec": 0.96212164882669, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0393696069717406, "data/tokens_consumed": 190840832, "data/tokens_consumed_B": 0.190840832, "train/loss_slope": -0.0013568518089525558} {"step": 100, "timestamp": 1778194634.5459387, "grad/layer_0/attn": 0.023796651512384415, "grad/layer_0/mlp": 0.028703805059194565, "grad/layer_0/attn_mlp_ratio": 0.8290417030217938, "grad/layer_4/attn": 0.04920732602477074, "grad/layer_4/mlp": 0.02677450142800808, "grad/layer_4/attn_mlp_ratio": 1.837842843621058, "grad/layer_8/attn": 0.01261156890541315, "grad/layer_8/mlp": 0.00465004937723279, "grad/layer_8/attn_mlp_ratio": 2.712136498151457, "grad/layer_12/attn": 0.007556399796158075, "grad/layer_12/mlp": 0.001128539675846696, "grad/layer_12/attn_mlp_ratio": 6.695731916483694, "grad/layer_16/attn": 0.00505185779184103, "grad/layer_16/mlp": 0.007169736549258232, "grad/layer_16/attn_mlp_ratio": 0.7046085566286018, "grad/layer_20/attn": 0.012036586180329323, "grad/layer_20/mlp": 0.010767949745059013, "grad/layer_20/attn_mlp_ratio": 1.1178159587966912, "grad/layer_24/attn": 0.0538054034113884, "grad/layer_24/mlp": 0.03077705018222332, "grad/layer_24/attn_mlp_ratio": 1.748231325549289, "grad/layer_27/attn": 0.04556918889284134, "grad/layer_27/mlp": 0.07072800397872925, "grad/layer_27/attn_mlp_ratio": 0.6442877822781066} {"step": 100, "timestamp": 1778194634.5617387, "train/loss": 10.594070148468017, "train/z_loss": 0.0011618765885941685, "train/perplexity": 39897.54804438599, "train/grad_norm": 2.03125, "optim/muon_lr": 0.0004, "optim/adamw_lr": 1.1999999999999999e-05, "perf/tokens_per_sec": 2016114.0317733095, "perf/iters_per_sec": 0.9613580855242297, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0401951313018798, "data/tokens_consumed": 211812352, "data/tokens_consumed_B": 0.211812352, "train/loss_slope": -0.0012393843910910847} {"step": 110, "timestamp": 1778194645.5993795, "train/loss": 10.587785720825195, "train/z_loss": 0.0011608924833126366, "train/perplexity": 39647.60100024924, "train/grad_norm": 1.9921875, "optim/muon_lr": 0.00043999999999999996, "optim/adamw_lr": 1.3199999999999997e-05, "perf/tokens_per_sec": 1901061.4368336657, "perf/iters_per_sec": 0.9064967331093148, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.1031479358673095, "data/tokens_consumed": 232783872, "data/tokens_consumed_B": 0.232783872, "train/loss_slope": -0.0011481169720629694} {"step": 120, "timestamp": 1778194656.0061576, "train/loss": 10.574601173400879, "train/z_loss": 0.001159697154071182, "train/perplexity": 39128.29624631847, "train/grad_norm": 2.046875, "optim/muon_lr": 0.00048, "optim/adamw_lr": 1.44e-05, "perf/tokens_per_sec": 2016936.081368523, "perf/iters_per_sec": 0.9617500693171134, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0397711753845216, "data/tokens_consumed": 253755392, "data/tokens_consumed_B": 0.253755392, "train/loss_slope": -0.0010985708760691197} {"step": 130, "timestamp": 1778194666.3838773, "train/loss": 10.565117454528808, "train/z_loss": 0.0011584827909246088, "train/perplexity": 38758.96855291856, "train/grad_norm": 2.0, "optim/muon_lr": 0.00052, "optim/adamw_lr": 1.5599999999999996e-05, "perf/tokens_per_sec": 2021982.4179058764, "perf/iters_per_sec": 0.9641563500909216, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0371761798858643, "data/tokens_consumed": 274726912, "data/tokens_consumed_B": 0.274726912, "train/loss_slope": -0.0010631362160483555} {"step": 140, "timestamp": 1778194676.7638013, "train/loss": 10.547153854370118, "train/z_loss": 0.0011562815750949084, "train/perplexity": 38068.93424502336, "train/grad_norm": 2.0, "optim/muon_lr": 0.0005600000000000001, "optim/adamw_lr": 1.68e-05, "perf/tokens_per_sec": 2021619.3823278043, "perf/iters_per_sec": 0.9639832412375471, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0373624324798585, "data/tokens_consumed": 295698432, "data/tokens_consumed_B": 0.295698432, "train/loss_slope": -0.0010584342820303767} {"step": 150, "timestamp": 1778194687.1209724, "grad/layer_0/attn": 0.016726912930607796, "grad/layer_0/mlp": 0.021565603092312813, "grad/layer_0/attn_mlp_ratio": 0.7756292639461252, "grad/layer_4/attn": 0.03224452584981918, "grad/layer_4/mlp": 0.01968320831656456, "grad/layer_4/attn_mlp_ratio": 1.6381742837556679, "grad/layer_8/attn": 0.010500282980501652, "grad/layer_8/mlp": 0.00463958689942956, "grad/layer_8/attn_mlp_ratio": 2.263193465666808, "grad/layer_12/attn": 0.006715895142406225, "grad/layer_12/mlp": 0.0011849213624373078, "grad/layer_12/attn_mlp_ratio": 5.667797702467181, "grad/layer_16/attn": 0.0037506376393139362, "grad/layer_16/mlp": 0.007996457628905773, "grad/layer_16/attn_mlp_ratio": 0.4690373871115517, "grad/layer_20/attn": 0.013153454288840294, "grad/layer_20/mlp": 0.009003633633255959, "grad/layer_20/attn_mlp_ratio": 1.460905083272823, "grad/layer_24/attn": 0.057747676968574524, "grad/layer_24/mlp": 0.03525901585817337, "grad/layer_24/attn_mlp_ratio": 1.6378130642408943, "grad/layer_27/attn": 0.029217995703220367, "grad/layer_27/mlp": 0.06969624012708664, "grad/layer_27/attn_mlp_ratio": 0.4192191086351475} {"step": 150, "timestamp": 1778194687.7255483, "eos/sharpness": 2.126502990722656, "eos/L0_probe": 10.487151145935059, "eos/L_plus": 10.684636116027832, "eos/L_minus": 10.310931205749512, "eos/grad_norm": 1.003225326538086, "eos/embed_grad_frac": 0.7975086569786072, "eos/time_s": 0.6018099784851074} {"step": 150, "timestamp": 1778194687.745361, "train/loss": 10.521398830413819, "train/z_loss": 0.0011533921817317605, "train/perplexity": 37100.98619856607, "train/grad_norm": 1.9453125, "optim/muon_lr": 0.0006, "optim/adamw_lr": 1.7999999999999997e-05, "perf/tokens_per_sec": 1910854.4186135933, "perf/iters_per_sec": 0.9111663907115904, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0974943876266479, "data/tokens_consumed": 316669952, "data/tokens_consumed_B": 0.316669952, "train/loss_slope": -0.001088752872803629} {"step": 150, "timestamp": 1778194689.1096292, "geo/rankme_last": 27.719093322753906, "geo/layer_0/stable_rank_q_proj": 53.60813903808594, "geo/layer_0/stable_rank_k_proj": 45.9100456237793, "geo/layer_0/stable_rank_o_proj": 67.8039321899414, "geo/layer_0/stable_rank_gate_proj": 161.24127197265625, "geo/layer_0/stable_rank_down_proj": 56.06315612792969, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.007345352787524462, "geo/layer_0/attn_entropy_mean": 7.019338607788086, "geo/layer_0/attn_entropy_std": 0.015477998182177544, "geo/layer_7/stable_rank_q_proj": 27.270296096801758, "geo/layer_7/stable_rank_k_proj": 28.870080947875977, "geo/layer_7/stable_rank_o_proj": 117.28844451904297, "geo/layer_7/stable_rank_gate_proj": 172.7693634033203, "geo/layer_7/stable_rank_down_proj": 200.05458068847656, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.8781515955924988, "geo/layer_7/attn_entropy_mean": 6.19920539855957, "geo/layer_7/attn_entropy_std": 0.6347629427909851, "geo/layer_14/stable_rank_q_proj": 33.994224548339844, "geo/layer_14/stable_rank_k_proj": 22.28030014038086, "geo/layer_14/stable_rank_o_proj": 92.3937759399414, "geo/layer_14/stable_rank_gate_proj": 193.68408203125, "geo/layer_14/stable_rank_down_proj": 164.3942413330078, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.8886758089065552, "geo/layer_14/attn_entropy_mean": 6.681156158447266, "geo/layer_14/attn_entropy_std": 0.4162628948688507, "geo/layer_21/stable_rank_q_proj": 64.25050354003906, "geo/layer_21/stable_rank_k_proj": 36.4412841796875, "geo/layer_21/stable_rank_o_proj": 65.61756896972656, "geo/layer_21/stable_rank_gate_proj": 150.09317016601562, "geo/layer_21/stable_rank_down_proj": 156.1790771484375, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.8895740509033203, "geo/layer_21/attn_entropy_mean": 6.905155181884766, "geo/layer_21/attn_entropy_std": 0.08243340998888016, "geo/layer_27/stable_rank_q_proj": 59.48427200317383, "geo/layer_27/stable_rank_k_proj": 20.410823822021484, "geo/layer_27/stable_rank_o_proj": 75.03592681884766, "geo/layer_27/stable_rank_gate_proj": 82.33594512939453, "geo/layer_27/stable_rank_down_proj": 60.856632232666016, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.9733172655105591, "geo/layer_27/attn_entropy_mean": 6.775613307952881, "geo/layer_27/attn_entropy_std": 0.11312000453472137, "attnres/final_alpha/block_0": 0.10073784738779068, "attnres/block_norm/0": 0.453434020280838, "attnres/final_alpha/block_1": 0.005660992581397295, "attnres/block_norm/1": 3219.25830078125, "attnres/final_alpha/block_2": 0.12352403998374939, "attnres/block_norm/2": 1254.91796875, "attnres/final_alpha/block_3": 0.027996284887194633, "attnres/block_norm/3": 752.289306640625, "attnres/final_alpha/block_4": 0.06012151017785072, "attnres/block_norm/4": 3527.934326171875, "attnres/final_alpha/block_5": 0.22933262586593628, "attnres/block_norm/5": 1102.0450439453125, "attnres/final_alpha/block_6": 0.45262670516967773, "attnres/block_norm/6": 1773.5037841796875, "geo/tier1_time_s": 1.3601300716400146, "geo/step": 150.0} {"step": 160, "timestamp": 1778194699.4897037, "train/loss": 10.492650508880615, "train/z_loss": 0.001150240341667086, "train/perplexity": 36049.580602778704, "train/grad_norm": 1.921875, "optim/muon_lr": 0.00064, "optim/adamw_lr": 1.92e-05, "perf/tokens_per_sec": 1786302.5595759416, "perf/iters_per_sec": 0.8517754361991604, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.1740183591842652, "data/tokens_consumed": 337641472, "data/tokens_consumed_B": 0.337641472, "train/loss_slope": -0.0011445809111875621} {"step": 170, "timestamp": 1778194709.8514988, "train/loss": 10.462341499328613, "train/z_loss": 0.0011471013072878121, "train/perplexity": 34973.34571445203, "train/grad_norm": 1.921875, "optim/muon_lr": 0.00068, "optim/adamw_lr": 2.04e-05, "perf/tokens_per_sec": 2025084.6354194377, "perf/iters_per_sec": 0.9656356026742161, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0355873346328734, "data/tokens_consumed": 358612992, "data/tokens_consumed_B": 0.358612992, "train/loss_slope": -0.0012168518160887909} {"step": 180, "timestamp": 1778194720.8437107, "train/loss": 10.434474563598632, "train/z_loss": 0.0011440325761213899, "train/perplexity": 34012.20002832158, "train/grad_norm": 1.9921875, "optim/muon_lr": 0.0007199999999999999, "optim/adamw_lr": 2.1599999999999996e-05, "perf/tokens_per_sec": 1908816.2359570519, "perf/iters_per_sec": 0.9101945094857463, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.098666262626648, "data/tokens_consumed": 379584512, "data/tokens_consumed_B": 0.379584512, "train/loss_slope": -0.001293369544179816} {"step": 190, "timestamp": 1778194731.2022715, "train/loss": 10.407581043243407, "train/z_loss": 0.0011408803635276853, "train/perplexity": 33109.68256608871, "train/grad_norm": 1.96875, "optim/muon_lr": 0.00076, "optim/adamw_lr": 2.28e-05, "perf/tokens_per_sec": 2025730.9843442314, "perf/iters_per_sec": 0.9659438058587224, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0352569103240967, "data/tokens_consumed": 400556032, "data/tokens_consumed_B": 0.400556032, "train/loss_slope": -0.0013690607816653152} {"step": 200, "timestamp": 1778194741.5539842, "grad/layer_0/attn": 0.008728940971195698, "grad/layer_0/mlp": 0.012543433345854282, "grad/layer_0/attn_mlp_ratio": 0.6958972604172179, "grad/layer_4/attn": 0.013137747533619404, "grad/layer_4/mlp": 0.010660930536687374, "grad/layer_4/attn_mlp_ratio": 1.2323265183255747, "grad/layer_8/attn": 0.010274559259414673, "grad/layer_8/mlp": 0.006107326131314039, "grad/layer_8/attn_mlp_ratio": 1.6823334582544514, "grad/layer_12/attn": 0.0054818205535411835, "grad/layer_12/mlp": 0.0013300193240866065, "grad/layer_12/attn_mlp_ratio": 4.121609394769429, "grad/layer_16/attn": 0.0023407512344419956, "grad/layer_16/mlp": 0.008527334779500961, "grad/layer_16/attn_mlp_ratio": 0.2744997431810702, "grad/layer_20/attn": 0.012349479831755161, "grad/layer_20/mlp": 0.0077554830349981785, "grad/layer_20/attn_mlp_ratio": 1.5923546756262867, "grad/layer_24/attn": 0.031451091170310974, "grad/layer_24/mlp": 0.041173502802848816, "grad/layer_24/attn_mlp_ratio": 0.7638672678523755, "grad/layer_27/attn": 0.034139037132263184, "grad/layer_27/mlp": 0.06705893576145172, "grad/layer_27/attn_mlp_ratio": 0.5090900518134784} {"step": 200, "timestamp": 1778194741.5701597, "train/loss": 10.376953315734863, "train/z_loss": 0.0011376169743016363, "train/perplexity": 32110.980289956726, "train/grad_norm": 1.984375, "optim/muon_lr": 0.0008, "optim/adamw_lr": 2.3999999999999997e-05, "perf/tokens_per_sec": 2023771.5225245506, "perf/iters_per_sec": 0.9650094616530183, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0362592697143556, "data/tokens_consumed": 421527552, "data/tokens_consumed_B": 0.421527552, "train/loss_slope": -0.0014470882168063884} {"step": 210, "timestamp": 1778194751.9450345, "train/loss": 10.34744176864624, "train/z_loss": 0.0011342329089529812, "train/perplexity": 31177.182267081615, "train/grad_norm": 1.9765625, "optim/muon_lr": 0.00084, "optim/adamw_lr": 2.52e-05, "perf/tokens_per_sec": 2022630.6480508125, "perf/iters_per_sec": 0.9644654503110945, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0368437767028809, "data/tokens_consumed": 442499072, "data/tokens_consumed_B": 0.442499072, "train/loss_slope": -0.001523520675909043} {"step": 220, "timestamp": 1778194762.328192, "train/loss": 10.315618228912353, "train/z_loss": 0.0011307165957987308, "train/perplexity": 30200.634978266993, "train/grad_norm": 2.0, "optim/muon_lr": 0.0008799999999999999, "optim/adamw_lr": 2.6399999999999995e-05, "perf/tokens_per_sec": 2020724.9901877793, "perf/iters_per_sec": 0.9635567618311783, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0378215789794922, "data/tokens_consumed": 463470592, "data/tokens_consumed_B": 0.463470592, "train/loss_slope": -0.001599706604546711} {"step": 225, "timestamp": 1778194768.1242409, "eos/sharpness": 2.280044555664062, "eos/L0_probe": 10.25532054901123, "eos/L_plus": 10.461673736572266, "eos/L_minus": 10.071767807006836, "eos/grad_norm": 0.9968438148498535, "eos/embed_grad_frac": 0.7543440461158752, "eos/time_s": 0.619575023651123} {"step": 225, "timestamp": 1778194769.5052602, "geo/rankme_last": 29.308853149414062, "geo/layer_0/stable_rank_q_proj": 53.61316680908203, "geo/layer_0/stable_rank_k_proj": 45.92123031616211, "geo/layer_0/stable_rank_o_proj": 67.81765747070312, "geo/layer_0/stable_rank_gate_proj": 161.2429962158203, "geo/layer_0/stable_rank_down_proj": 56.06039810180664, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.00914669781923294, "geo/layer_0/attn_entropy_mean": 7.019333362579346, "geo/layer_0/attn_entropy_std": 0.01501933392137289, "geo/layer_7/stable_rank_q_proj": 27.271907806396484, "geo/layer_7/stable_rank_k_proj": 28.869220733642578, "geo/layer_7/stable_rank_o_proj": 117.28986358642578, "geo/layer_7/stable_rank_gate_proj": 172.77560424804688, "geo/layer_7/stable_rank_down_proj": 200.0470428466797, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.8933274745941162, "geo/layer_7/attn_entropy_mean": 6.261927604675293, "geo/layer_7/attn_entropy_std": 0.586933434009552, "geo/layer_14/stable_rank_q_proj": 33.99262237548828, "geo/layer_14/stable_rank_k_proj": 22.279739379882812, "geo/layer_14/stable_rank_o_proj": 92.39039611816406, "geo/layer_14/stable_rank_gate_proj": 193.67507934570312, "geo/layer_14/stable_rank_down_proj": 164.39031982421875, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.895706832408905, "geo/layer_14/attn_entropy_mean": 6.668040752410889, "geo/layer_14/attn_entropy_std": 0.4634210765361786, "geo/layer_21/stable_rank_q_proj": 64.25440979003906, "geo/layer_21/stable_rank_k_proj": 36.439491271972656, "geo/layer_21/stable_rank_o_proj": 65.61689758300781, "geo/layer_21/stable_rank_gate_proj": 150.0894775390625, "geo/layer_21/stable_rank_down_proj": 156.17945861816406, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.8947151899337769, "geo/layer_21/attn_entropy_mean": 6.91361141204834, "geo/layer_21/attn_entropy_std": 0.08228541910648346, "geo/layer_27/stable_rank_q_proj": 59.487056732177734, "geo/layer_27/stable_rank_k_proj": 20.409374237060547, "geo/layer_27/stable_rank_o_proj": 75.03278350830078, "geo/layer_27/stable_rank_gate_proj": 82.34026336669922, "geo/layer_27/stable_rank_down_proj": 60.85678482055664, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.9819937944412231, "geo/layer_27/attn_entropy_mean": 6.928737640380859, "geo/layer_27/attn_entropy_std": 0.11441196501255035, "attnres/final_alpha/block_0": 0.09191017597913742, "attnres/block_norm/0": 0.45385003089904785, "attnres/final_alpha/block_1": 0.0052231065928936005, "attnres/block_norm/1": 3252.30908203125, "attnres/final_alpha/block_2": 0.12745335698127747, "attnres/block_norm/2": 1945.811767578125, "attnres/final_alpha/block_3": 0.02547331154346466, "attnres/block_norm/3": 767.693603515625, "attnres/final_alpha/block_4": 0.05674111098051071, "attnres/block_norm/4": 3809.0576171875, "attnres/final_alpha/block_5": 0.34423649311065674, "attnres/block_norm/5": 924.1912841796875, "attnres/final_alpha/block_6": 0.3489624261856079, "attnres/block_norm/6": 1830.121826171875, "geo/tier1_time_s": 1.3602616786956787, "geo/step": 225.0} {"step": 230, "timestamp": 1778194774.6874967, "train/loss": 10.28367338180542, "train/z_loss": 0.0011270981631241739, "train/perplexity": 29251.126999214466, "train/grad_norm": 2.09375, "optim/muon_lr": 0.00092, "optim/adamw_lr": 2.7599999999999997e-05, "perf/tokens_per_sec": 1697936.9034340256, "perf/iters_per_sec": 0.8096394078416946, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.235117745399475, "data/tokens_consumed": 484442112, "data/tokens_consumed_B": 0.484442112, "train/loss_slope": -0.0016743175506591817} {"step": 240, "timestamp": 1778194785.0367436, "train/loss": 10.2552921295166, "train/z_loss": 0.0011233732453547417, "train/perplexity": 28432.613544221338, "train/grad_norm": 2.109375, "optim/muon_lr": 0.00096, "optim/adamw_lr": 2.88e-05, "perf/tokens_per_sec": 2027255.9845959675, "perf/iters_per_sec": 0.9666709826450193, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0344781398773193, "data/tokens_consumed": 505413632, "data/tokens_consumed_B": 0.505413632, "train/loss_slope": -0.0017431421426626374} {"step": 250, "timestamp": 1778194795.3845744, "grad/layer_0/attn": 0.0092698372900486, "grad/layer_0/mlp": 0.013624953106045723, "grad/layer_0/attn_mlp_ratio": 0.6803573670943213, "grad/layer_4/attn": 0.010822039097547531, "grad/layer_4/mlp": 0.01120027620345354, "grad/layer_4/attn_mlp_ratio": 0.9662296540140369, "grad/layer_8/attn": 0.011705758981406689, "grad/layer_8/mlp": 0.0073920367285609245, "grad/layer_8/attn_mlp_ratio": 1.5835634011154618, "grad/layer_12/attn": 0.006482475437223911, "grad/layer_12/mlp": 0.0014829235151410103, "grad/layer_12/attn_mlp_ratio": 4.371415608353837, "grad/layer_16/attn": 0.0034733072388917208, "grad/layer_16/mlp": 0.009752294048666954, "grad/layer_16/attn_mlp_ratio": 0.3561528380854354, "grad/layer_20/attn": 0.014313471503555775, "grad/layer_20/mlp": 0.008173199370503426, "grad/layer_20/attn_mlp_ratio": 1.7512690783104234, "grad/layer_24/attn": 0.036067042499780655, "grad/layer_24/mlp": 0.04620156064629555, "grad/layer_24/attn_mlp_ratio": 0.7806455435095343, "grad/layer_27/attn": 0.03405807167291641, "grad/layer_27/mlp": 0.06341952830553055, "grad/layer_27/attn_mlp_ratio": 0.5370281446297598} {"step": 250, "timestamp": 1778194795.4005718, "train/loss": 10.210812377929688, "train/z_loss": 0.0011191839585080743, "train/perplexity": 27195.651765155188, "train/grad_norm": 2.125, "optim/muon_lr": 0.001, "optim/adamw_lr": 2.9999999999999997e-05, "perf/tokens_per_sec": 2024620.8467332423, "perf/iters_per_sec": 0.9654144509950839, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0358245611190795, "data/tokens_consumed": 526385152, "data/tokens_consumed_B": 0.526385152, "train/loss_slope": -0.001820378856169873} {"step": 260, "timestamp": 1778194805.7578616, "train/loss": 10.153226947784423, "train/z_loss": 0.0011130968574434518, "train/perplexity": 25673.81674322488, "train/grad_norm": 2.21875, "optim/muon_lr": 0.00104, "optim/adamw_lr": 3.119999999999999e-05, "perf/tokens_per_sec": 2025885.3688075407, "perf/iters_per_sec": 0.9660174221074775, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.035178017616272, "data/tokens_consumed": 547356672, "data/tokens_consumed_B": 0.547356672, "train/loss_slope": -0.001912933106241938} {"step": 270, "timestamp": 1778194816.1192355, "train/loss": 10.07876901626587, "train/z_loss": 0.0011041922378353774, "train/perplexity": 23831.631128126148, "train/grad_norm": 2.296875, "optim/muon_lr": 0.00108, "optim/adamw_lr": 3.2399999999999995e-05, "perf/tokens_per_sec": 2024949.7185872234, "perf/iters_per_sec": 0.9655712693153493, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0356563329696655, "data/tokens_consumed": 568328192, "data/tokens_consumed_B": 0.568328192, "train/loss_slope": -0.0020279053018337147} {"step": 280, "timestamp": 1778194827.267741, "train/loss": 9.970972347259522, "train/z_loss": 0.0010933954967185856, "train/perplexity": 21396.27985313111, "train/grad_norm": 2.421875, "optim/muon_lr": 0.0011200000000000001, "optim/adamw_lr": 3.36e-05, "perf/tokens_per_sec": 1882052.4112692068, "perf/iters_per_sec": 0.8974325233789476, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.114289903640747, "data/tokens_consumed": 589299712, "data/tokens_consumed_B": 0.589299712, "train/loss_slope": -0.0021810329465443276} {"step": 290, "timestamp": 1778194838.2454548, "train/loss": 9.859507942199707, "train/z_loss": 0.0010813324945047498, "train/perplexity": 19139.46939282446, "train/grad_norm": 2.453125, "optim/muon_lr": 0.00116, "optim/adamw_lr": 3.48e-05, "perf/tokens_per_sec": 1911668.3018055924, "perf/iters_per_sec": 0.9115544804599726, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0970271348953247, "data/tokens_consumed": 610271232, "data/tokens_consumed_B": 0.610271232, "train/loss_slope": -0.002363352241982872} {"step": 300, "timestamp": 1778194848.6019778, "grad/layer_0/attn": 0.01239701732993126, "grad/layer_0/mlp": 0.019786369055509567, "grad/layer_0/attn_mlp_ratio": 0.6265433153752352, "grad/layer_4/attn": 0.007557359524071217, "grad/layer_4/mlp": 0.014380459673702717, "grad/layer_4/attn_mlp_ratio": 0.5255297565583558, "grad/layer_8/attn": 0.018504144623875618, "grad/layer_8/mlp": 0.011960918083786964, "grad/layer_8/attn_mlp_ratio": 1.5470505139779323, "grad/layer_12/attn": 0.0060059442184865475, "grad/layer_12/mlp": 0.0018064966425299644, "grad/layer_12/attn_mlp_ratio": 3.324636063320732, "grad/layer_16/attn": 0.013056066818535328, "grad/layer_16/mlp": 0.012028384953737259, "grad/layer_16/attn_mlp_ratio": 1.0854380500962402, "grad/layer_20/attn": 0.015244219452142715, "grad/layer_20/mlp": 0.010974843986332417, "grad/layer_20/attn_mlp_ratio": 1.3890146713908391, "grad/layer_24/attn": 0.06046800687909126, "grad/layer_24/mlp": 0.05467972159385681, "grad/layer_24/attn_mlp_ratio": 1.1058579854821162, "grad/layer_27/attn": 0.03024110198020935, "grad/layer_27/mlp": 0.04932301491498947, "grad/layer_27/attn_mlp_ratio": 0.6131235483276712} {"step": 300, "timestamp": 1778194849.2075987, "eos/sharpness": 10.782718658447264, "eos/L0_probe": 9.629650115966797, "eos/L_plus": 9.929368019104004, "eos/L_minus": 9.437759399414062, "eos/grad_norm": 1.0018365383148193, "eos/embed_grad_frac": 0.5889009833335876, "eos/time_s": 0.6028597354888916} {"step": 300, "timestamp": 1778194849.2277255, "train/loss": 9.73781623840332, "train/z_loss": 0.0010682500200346113, "train/perplexity": 16946.49384151031, "train/grad_norm": 2.5, "optim/muon_lr": 0.0012, "optim/adamw_lr": 3.5999999999999994e-05, "perf/tokens_per_sec": 1910639.705255446, "perf/iters_per_sec": 0.9110640074040632, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0976177215576173, "data/tokens_consumed": 631242752, "data/tokens_consumed_B": 0.631242752, "train/loss_slope": -0.0025718987910978252} {"step": 300, "timestamp": 1778194850.5911996, "geo/rankme_last": 38.624813079833984, "geo/layer_0/stable_rank_q_proj": 53.625553131103516, "geo/layer_0/stable_rank_k_proj": 45.929168701171875, "geo/layer_0/stable_rank_o_proj": 67.82169342041016, "geo/layer_0/stable_rank_gate_proj": 161.2294921875, "geo/layer_0/stable_rank_down_proj": 56.065093994140625, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.013173182494938374, "geo/layer_0/attn_entropy_mean": 7.018764019012451, "geo/layer_0/attn_entropy_std": 0.013912415131926537, "geo/layer_7/stable_rank_q_proj": 27.27002716064453, "geo/layer_7/stable_rank_k_proj": 28.86813735961914, "geo/layer_7/stable_rank_o_proj": 117.28638458251953, "geo/layer_7/stable_rank_gate_proj": 172.78103637695312, "geo/layer_7/stable_rank_down_proj": 200.03550720214844, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.9012278914451599, "geo/layer_7/attn_entropy_mean": 6.246671676635742, "geo/layer_7/attn_entropy_std": 0.5927559733390808, "geo/layer_14/stable_rank_q_proj": 33.991180419921875, "geo/layer_14/stable_rank_k_proj": 22.27823257446289, "geo/layer_14/stable_rank_o_proj": 92.40023803710938, "geo/layer_14/stable_rank_gate_proj": 193.68553161621094, "geo/layer_14/stable_rank_down_proj": 164.3931884765625, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.9100461006164551, "geo/layer_14/attn_entropy_mean": 6.630692005157471, "geo/layer_14/attn_entropy_std": 0.5231790542602539, "geo/layer_21/stable_rank_q_proj": 64.25245666503906, "geo/layer_21/stable_rank_k_proj": 36.44606399536133, "geo/layer_21/stable_rank_o_proj": 65.62283325195312, "geo/layer_21/stable_rank_gate_proj": 150.09536743164062, "geo/layer_21/stable_rank_down_proj": 156.1739044189453, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.9069671630859375, "geo/layer_21/attn_entropy_mean": 6.927047252655029, "geo/layer_21/attn_entropy_std": 0.0853780210018158, "geo/layer_27/stable_rank_q_proj": 59.485469818115234, "geo/layer_27/stable_rank_k_proj": 20.406951904296875, "geo/layer_27/stable_rank_o_proj": 75.02827453613281, "geo/layer_27/stable_rank_gate_proj": 82.33653259277344, "geo/layer_27/stable_rank_down_proj": 60.85643768310547, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.9810419082641602, "geo/layer_27/attn_entropy_mean": 7.015541076660156, "geo/layer_27/attn_entropy_std": 0.08008982241153717, "attnres/final_alpha/block_0": 0.07998689264059067, "attnres/block_norm/0": 0.45581430196762085, "attnres/final_alpha/block_1": 0.004398995079100132, "attnres/block_norm/1": 3255.614990234375, "attnres/final_alpha/block_2": 0.14418524503707886, "attnres/block_norm/2": 2926.93701171875, "attnres/final_alpha/block_3": 0.02485349401831627, "attnres/block_norm/3": 785.2069091796875, "attnres/final_alpha/block_4": 0.04654419422149658, "attnres/block_norm/4": 3490.26171875, "attnres/final_alpha/block_5": 0.5020025968551636, "attnres/block_norm/5": 771.590576171875, "attnres/final_alpha/block_6": 0.1980285495519638, "attnres/block_norm/6": 1652.4329833984375, "geo/tier1_time_s": 1.3592603206634521, "geo/step": 300.0, "geo/rankme_slope": 0.023795761108398438} {"step": 310, "timestamp": 1778194860.946135, "train/loss": 9.60523509979248, "train/z_loss": 0.0010545485885813833, "train/perplexity": 14842.279347805395, "train/grad_norm": 2.515625, "optim/muon_lr": 0.00124, "optim/adamw_lr": 3.7199999999999996e-05, "perf/tokens_per_sec": 1790234.087770081, "perf/iters_per_sec": 0.8536501349306492, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.171440100669861, "data/tokens_consumed": 652214272, "data/tokens_consumed_B": 0.652214272, "train/loss_slope": -0.002804429840482225} {"step": 320, "timestamp": 1778194871.298028, "train/loss": 9.473875427246094, "train/z_loss": 0.0010408036061562598, "train/perplexity": 13015.229193791854, "train/grad_norm": 2.515625, "optim/muon_lr": 0.00128, "optim/adamw_lr": 3.84e-05, "perf/tokens_per_sec": 2027033.2822412355, "perf/iters_per_sec": 0.9665647898870637, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.034591794013977, "data/tokens_consumed": 673185792, "data/tokens_consumed_B": 0.673185792, "train/loss_slope": -0.003052418343523606} {"step": 330, "timestamp": 1778194881.6511202, "train/loss": 9.350650215148926, "train/z_loss": 0.0010270376573316754, "train/perplexity": 11506.302585610398, "train/grad_norm": 2.5, "optim/muon_lr": 0.00132, "optim/adamw_lr": 3.96e-05, "perf/tokens_per_sec": 2026747.9098134367, "perf/iters_per_sec": 0.9664287137095626, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.034737467765808, "data/tokens_consumed": 694157312, "data/tokens_consumed_B": 0.694157312, "train/loss_slope": -0.0033058849084094967} {"step": 340, "timestamp": 1778194892.557268, "train/loss": 9.219283103942871, "train/z_loss": 0.0010135549237020314, "train/perplexity": 10089.828376565345, "train/grad_norm": 2.328125, "optim/muon_lr": 0.00136, "optim/adamw_lr": 4.08e-05, "perf/tokens_per_sec": 1923856.0487546686, "perf/iters_per_sec": 0.9173660510800689, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0900774002075195, "data/tokens_consumed": 715128832, "data/tokens_consumed_B": 0.715128832, "train/loss_slope": -0.0035651274876100335} {"step": 350, "timestamp": 1778194903.6111035, "grad/layer_0/attn": 0.017275197431445122, "grad/layer_0/mlp": 0.025269635021686554, "grad/layer_0/attn_mlp_ratio": 0.6836346210879571, "grad/layer_4/attn": 0.008365529589354992, "grad/layer_4/mlp": 0.01928168535232544, "grad/layer_4/attn_mlp_ratio": 0.4338588351126783, "grad/layer_8/attn": 0.023672886192798615, "grad/layer_8/mlp": 0.015303178690373898, "grad/layer_8/attn_mlp_ratio": 1.546926067915346, "grad/layer_12/attn": 0.005939573515206575, "grad/layer_12/mlp": 0.0023775652516633272, "grad/layer_12/attn_mlp_ratio": 2.4981746604993607, "grad/layer_16/attn": 0.01706462912261486, "grad/layer_16/mlp": 0.011715395376086235, "grad/layer_16/attn_mlp_ratio": 1.4565986404340867, "grad/layer_20/attn": 0.02240634337067604, "grad/layer_20/mlp": 0.01489044725894928, "grad/layer_20/attn_mlp_ratio": 1.5047461523853847, "grad/layer_24/attn": 0.0626254677772522, "grad/layer_24/mlp": 0.06957266479730606, "grad/layer_24/attn_mlp_ratio": 0.9001447316944322, "grad/layer_27/attn": 0.03984756022691727, "grad/layer_27/mlp": 0.03915683180093765, "grad/layer_27/attn_mlp_ratio": 1.0176400462562212} {"step": 350, "timestamp": 1778194903.6269433, "train/loss": 9.10163917541504, "train/z_loss": 0.001000492798630148, "train/perplexity": 8969.984036663118, "train/grad_norm": 2.53125, "optim/muon_lr": 0.0014000000000000002, "optim/adamw_lr": 4.2e-05, "perf/tokens_per_sec": 1895847.1928423257, "perf/iters_per_sec": 0.9040103878223065, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.1061819791793823, "data/tokens_consumed": 736100352, "data/tokens_consumed_B": 0.736100352, "train/loss_slope": -0.003820432116779734} {"step": 360, "timestamp": 1778194913.970586, "train/loss": 8.966748905181884, "train/z_loss": 0.000987759802956134, "train/perplexity": 7838.0778036842, "train/grad_norm": 2.3125, "optim/muon_lr": 0.0014399999999999999, "optim/adamw_lr": 4.319999999999999e-05, "perf/tokens_per_sec": 2028519.5912211877, "perf/iters_per_sec": 0.967273517237276, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0338337421417236, "data/tokens_consumed": 757071872, "data/tokens_consumed_B": 0.757071872, "train/loss_slope": -0.0040777749888644835} {"step": 370, "timestamp": 1778194924.3221977, "train/loss": 8.859398937225341, "train/z_loss": 0.000975601188838482, "train/perplexity": 7040.249840468856, "train/grad_norm": 2.359375, "optim/muon_lr": 0.00148, "optim/adamw_lr": 4.4399999999999995e-05, "perf/tokens_per_sec": 2027007.2637825243, "perf/iters_per_sec": 0.966552383319151, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.034605073928833, "data/tokens_consumed": 778043392, "data/tokens_consumed_B": 0.778043392, "train/loss_slope": -0.00432352083884257} {"step": 375, "timestamp": 1778194930.1021886, "eos/sharpness": 3.0027389526367183, "eos/L0_probe": 8.64775276184082, "eos/L_plus": 8.88754940032959, "eos/L_minus": 8.437983512878418, "eos/grad_norm": 0.9990360736846924, "eos/embed_grad_frac": 0.8186808228492737, "eos/time_s": 0.6078650951385498} {"step": 375, "timestamp": 1778194931.4812458, "geo/rankme_last": 50.79903793334961, "geo/layer_0/stable_rank_q_proj": 53.62200927734375, "geo/layer_0/stable_rank_k_proj": 45.92001724243164, "geo/layer_0/stable_rank_o_proj": 67.82071685791016, "geo/layer_0/stable_rank_gate_proj": 161.22923278808594, "geo/layer_0/stable_rank_down_proj": 56.06697082519531, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.022278673946857452, "geo/layer_0/attn_entropy_mean": 7.017113208770752, "geo/layer_0/attn_entropy_std": 0.012090680189430714, "geo/layer_7/stable_rank_q_proj": 27.27029800415039, "geo/layer_7/stable_rank_k_proj": 28.863975524902344, "geo/layer_7/stable_rank_o_proj": 117.28482055664062, "geo/layer_7/stable_rank_gate_proj": 172.76004028320312, "geo/layer_7/stable_rank_down_proj": 200.0210418701172, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.9018151164054871, "geo/layer_7/attn_entropy_mean": 6.254116058349609, "geo/layer_7/attn_entropy_std": 0.5697097182273865, "geo/layer_14/stable_rank_q_proj": 33.99488067626953, "geo/layer_14/stable_rank_k_proj": 22.279529571533203, "geo/layer_14/stable_rank_o_proj": 92.39582824707031, "geo/layer_14/stable_rank_gate_proj": 193.69313049316406, "geo/layer_14/stable_rank_down_proj": 164.39755249023438, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.9084038138389587, "geo/layer_14/attn_entropy_mean": 6.612309455871582, "geo/layer_14/attn_entropy_std": 0.5471448302268982, "geo/layer_21/stable_rank_q_proj": 64.25049591064453, "geo/layer_21/stable_rank_k_proj": 36.44965362548828, "geo/layer_21/stable_rank_o_proj": 65.62379455566406, "geo/layer_21/stable_rank_gate_proj": 150.115966796875, "geo/layer_21/stable_rank_down_proj": 156.169189453125, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.9197524785995483, "geo/layer_21/attn_entropy_mean": 6.947605133056641, "geo/layer_21/attn_entropy_std": 0.09567360579967499, "geo/layer_27/stable_rank_q_proj": 59.47829055786133, "geo/layer_27/stable_rank_k_proj": 20.402559280395508, "geo/layer_27/stable_rank_o_proj": 75.03015899658203, "geo/layer_27/stable_rank_gate_proj": 82.34685516357422, "geo/layer_27/stable_rank_down_proj": 60.859771728515625, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.9841923117637634, "geo/layer_27/attn_entropy_mean": 6.8983869552612305, "geo/layer_27/attn_entropy_std": 0.0631026104092598, "attnres/final_alpha/block_0": 0.0723130851984024, "attnres/block_norm/0": 0.46039530634880066, "attnres/final_alpha/block_1": 0.004030080512166023, "attnres/block_norm/1": 3223.34912109375, "attnres/final_alpha/block_2": 0.13648644089698792, "attnres/block_norm/2": 3286.399658203125, "attnres/final_alpha/block_3": 0.022500038146972656, "attnres/block_norm/3": 774.2010498046875, "attnres/final_alpha/block_4": 0.03136403113603592, "attnres/block_norm/4": 3109.05029296875, "attnres/final_alpha/block_5": 0.6215443015098572, "attnres/block_norm/5": 718.2799072265625, "attnres/final_alpha/block_6": 0.11176201701164246, "attnres/block_norm/6": 1494.4031982421875, "geo/tier1_time_s": 1.3566153049468994, "geo/step": 375.0, "geo/rankme_slope": 0.05147320120675223} {"step": 380, "timestamp": 1778194936.6549249, "train/loss": 8.74164571762085, "train/z_loss": 0.0009636891249101609, "train/perplexity": 6258.18644982475, "train/grad_norm": 2.28125, "optim/muon_lr": 0.00152, "optim/adamw_lr": 4.56e-05, "perf/tokens_per_sec": 1701327.3350524888, "perf/iters_per_sec": 0.8112560916197247, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.2326563835144042, "data/tokens_consumed": 799014912, "data/tokens_consumed_B": 0.799014912, "train/loss_slope": -0.004562010761214653} {"step": 390, "timestamp": 1778194946.9960415, "train/loss": 8.615703296661376, "train/z_loss": 0.0009522549749817699, "train/perplexity": 5517.627778671527, "train/grad_norm": 2.359375, "optim/muon_lr": 0.00156, "optim/adamw_lr": 4.68e-05, "perf/tokens_per_sec": 2028756.377006942, "perf/iters_per_sec": 0.9673864254984579, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0337130784988404, "data/tokens_consumed": 819986432, "data/tokens_consumed_B": 0.819986432, "train/loss_slope": -0.004795858455643645} {"step": 400, "timestamp": 1778194957.3282707, "grad/layer_0/attn": 0.01765945367515087, "grad/layer_0/mlp": 0.026621034368872643, "grad/layer_0/attn_mlp_ratio": 0.6633646673572983, "grad/layer_4/attn": 0.008318878710269928, "grad/layer_4/mlp": 0.02403184212744236, "grad/layer_4/attn_mlp_ratio": 0.34616067430612796, "grad/layer_8/attn": 0.03257867321372032, "grad/layer_8/mlp": 0.020745230838656425, "grad/layer_8/attn_mlp_ratio": 1.5704174761927376, "grad/layer_12/attn": 0.006212274543941021, "grad/layer_12/mlp": 0.0033618982415646315, "grad/layer_12/attn_mlp_ratio": 1.8478472317666277, "grad/layer_16/attn": 0.016880033537745476, "grad/layer_16/mlp": 0.009724851697683334, "grad/layer_16/attn_mlp_ratio": 1.7357625482546333, "grad/layer_20/attn": 0.021149760112166405, "grad/layer_20/mlp": 0.017835766077041626, "grad/layer_20/attn_mlp_ratio": 1.1858060877356973, "grad/layer_24/attn": 0.07378187030553818, "grad/layer_24/mlp": 0.08073528110980988, "grad/layer_24/attn_mlp_ratio": 0.9138739495289352, "grad/layer_27/attn": 0.05047507956624031, "grad/layer_27/mlp": 0.029454708099365234, "grad/layer_27/attn_mlp_ratio": 1.713650640318619} {"step": 400, "timestamp": 1778194957.3432484, "train/loss": 8.503699684143067, "train/z_loss": 0.00094141082954593, "train/perplexity": 4932.985610012706, "train/grad_norm": 2.265625, "optim/muon_lr": 0.0016, "optim/adamw_lr": 4.7999999999999994e-05, "perf/tokens_per_sec": 2028041.2757388307, "perf/iters_per_sec": 0.9670454386419443, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0340775728225708, "data/tokens_consumed": 840957952, "data/tokens_consumed_B": 0.840957952, "train/loss_slope": -0.005019429477665067} {"step": 410, "timestamp": 1778194967.6879036, "train/loss": 8.40148630142212, "train/z_loss": 0.0009308232110925019, "train/perplexity": 4453.681343755297, "train/grad_norm": 2.359375, "optim/muon_lr": 0.0016400000000000002, "optim/adamw_lr": 4.9199999999999997e-05, "perf/tokens_per_sec": 2028792.3605584425, "perf/iters_per_sec": 0.9674035837928975, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0336947441101074, "data/tokens_consumed": 861929472, "data/tokens_consumed_B": 0.861929472, "train/loss_slope": -0.005229829393013902} {"step": 420, "timestamp": 1778194978.027329, "train/loss": 8.28266658782959, "train/z_loss": 0.0009202459652442486, "train/perplexity": 3954.7259581627054, "train/grad_norm": 2.15625, "optim/muon_lr": 0.00168, "optim/adamw_lr": 5.04e-05, "perf/tokens_per_sec": 2029514.4069444838, "perf/iters_per_sec": 0.9677478823397082, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0333269834518433, "data/tokens_consumed": 882900992, "data/tokens_consumed_B": 0.882900992, "train/loss_slope": -0.005433301228621473} {"step": 430, "timestamp": 1778194988.3880875, "train/loss": 8.217171573638916, "train/z_loss": 0.0009099613293074071, "train/perplexity": 3704.011030057446, "train/grad_norm": 2.21875, "optim/muon_lr": 0.00172, "optim/adamw_lr": 5.1599999999999994e-05, "perf/tokens_per_sec": 2025668.1921982293, "perf/iters_per_sec": 0.9659138642302653, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0352890014648437, "data/tokens_consumed": 903872512, "data/tokens_consumed_B": 0.903872512, "train/loss_slope": -0.00561364249228759} {"step": 440, "timestamp": 1778194998.7382932, "train/loss": 8.106709957122803, "train/z_loss": 0.0008998454315587879, "train/perplexity": 3316.6481452519906, "train/grad_norm": 2.203125, "optim/muon_lr": 0.0017599999999999998, "optim/adamw_lr": 5.279999999999999e-05, "perf/tokens_per_sec": 2027294.2044355695, "perf/iters_per_sec": 0.9666892072847221, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0344586372375488, "data/tokens_consumed": 924844032, "data/tokens_consumed_B": 0.924844032, "train/loss_slope": -0.005786730143078387} {"step": 450, "timestamp": 1778195009.0671945, "grad/layer_0/attn": 0.0158880352973938, "grad/layer_0/mlp": 0.02628970332443714, "grad/layer_0/attn_mlp_ratio": 0.6043444097062483, "grad/layer_4/attn": 0.007316826842725277, "grad/layer_4/mlp": 0.02394983544945717, "grad/layer_4/attn_mlp_ratio": 0.30550635003801163, "grad/layer_8/attn": 0.03002430871129036, "grad/layer_8/mlp": 0.021951772272586823, "grad/layer_8/attn_mlp_ratio": 1.367739615812728, "grad/layer_12/attn": 0.004945460706949234, "grad/layer_12/mlp": 0.003432761412113905, "grad/layer_12/attn_mlp_ratio": 1.4406653912592384, "grad/layer_16/attn": 0.011387282982468605, "grad/layer_16/mlp": 0.006262615788727999, "grad/layer_16/attn_mlp_ratio": 1.8182949720682102, "grad/layer_20/attn": 0.012062766589224339, "grad/layer_20/mlp": 0.01567227393388748, "grad/layer_20/attn_mlp_ratio": 0.7696883402588252, "grad/layer_24/attn": 0.07914526760578156, "grad/layer_24/mlp": 0.08590715378522873, "grad/layer_24/attn_mlp_ratio": 0.9212884378816577, "grad/layer_27/attn": 0.05285302549600601, "grad/layer_27/mlp": 0.0257010105997324, "grad/layer_27/attn_mlp_ratio": 2.056457083088811} {"step": 450, "timestamp": 1778195009.6764526, "eos/sharpness": 1.6400814056396482, "eos/L0_probe": 7.8156352043151855, "eos/L_plus": 8.025771141052246, "eos/L_minus": 7.6219000816345215, "eos/grad_norm": 0.9946985840797424, "eos/embed_grad_frac": 0.9339469075202942, "eos/time_s": 0.6065237522125244} {"step": 450, "timestamp": 1778195009.6963024, "train/loss": 8.028769397735596, "train/z_loss": 0.000890279735904187, "train/perplexity": 3067.9639059545616, "train/grad_norm": 2.125, "optim/muon_lr": 0.0018, "optim/adamw_lr": 5.399999999999999e-05, "perf/tokens_per_sec": 1914780.6292617486, "perf/iters_per_sec": 0.9130385538395637, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.095244002342224, "data/tokens_consumed": 945815552, "data/tokens_consumed_B": 0.945815552, "train/loss_slope": -0.005943772599169989} {"step": 450, "timestamp": 1778195011.0564523, "geo/rankme_last": 75.9095230102539, "geo/layer_0/stable_rank_q_proj": 53.607398986816406, "geo/layer_0/stable_rank_k_proj": 45.90536117553711, "geo/layer_0/stable_rank_o_proj": 67.81135559082031, "geo/layer_0/stable_rank_gate_proj": 161.27015686035156, "geo/layer_0/stable_rank_down_proj": 56.059654235839844, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.022401753813028336, "geo/layer_0/attn_entropy_mean": 7.014403820037842, "geo/layer_0/attn_entropy_std": 0.010717283934354782, "geo/layer_7/stable_rank_q_proj": 27.269712448120117, "geo/layer_7/stable_rank_k_proj": 28.850008010864258, "geo/layer_7/stable_rank_o_proj": 117.29092407226562, "geo/layer_7/stable_rank_gate_proj": 172.7593994140625, "geo/layer_7/stable_rank_down_proj": 200.0015106201172, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.9064839482307434, "geo/layer_7/attn_entropy_mean": 6.274984359741211, "geo/layer_7/attn_entropy_std": 0.5487991571426392, "geo/layer_14/stable_rank_q_proj": 33.994468688964844, "geo/layer_14/stable_rank_k_proj": 22.28085708618164, "geo/layer_14/stable_rank_o_proj": 92.39627838134766, "geo/layer_14/stable_rank_gate_proj": 193.68382263183594, "geo/layer_14/stable_rank_down_proj": 164.39015197753906, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.9058893918991089, "geo/layer_14/attn_entropy_mean": 6.616078853607178, "geo/layer_14/attn_entropy_std": 0.5371800661087036, "geo/layer_21/stable_rank_q_proj": 64.24652862548828, "geo/layer_21/stable_rank_k_proj": 36.452579498291016, "geo/layer_21/stable_rank_o_proj": 65.635009765625, "geo/layer_21/stable_rank_gate_proj": 150.1609344482422, "geo/layer_21/stable_rank_down_proj": 156.17855834960938, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.9264703989028931, "geo/layer_21/attn_entropy_mean": 6.959164142608643, "geo/layer_21/attn_entropy_std": 0.10474484413862228, "geo/layer_27/stable_rank_q_proj": 59.47467041015625, "geo/layer_27/stable_rank_k_proj": 20.399333953857422, "geo/layer_27/stable_rank_o_proj": 75.03324890136719, "geo/layer_27/stable_rank_gate_proj": 82.35978698730469, "geo/layer_27/stable_rank_down_proj": 60.85517501831055, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.9771982431411743, "geo/layer_27/attn_entropy_mean": 6.666649341583252, "geo/layer_27/attn_entropy_std": 0.10541775822639465, "attnres/final_alpha/block_0": 0.07724964618682861, "attnres/block_norm/0": 0.46382439136505127, "attnres/final_alpha/block_1": 0.004596028942614794, "attnres/block_norm/1": 3195.8212890625, "attnres/final_alpha/block_2": 0.12584960460662842, "attnres/block_norm/2": 3023.130615234375, "attnres/final_alpha/block_3": 0.024981500580906868, "attnres/block_norm/3": 739.61376953125, "attnres/final_alpha/block_4": 0.024634981527924538, "attnres/block_norm/4": 2655.89453125, "attnres/final_alpha/block_5": 0.6524057388305664, "attnres/block_norm/5": 751.0576171875, "attnres/final_alpha/block_6": 0.09028250724077225, "attnres/block_norm/6": 1444.431884765625, "geo/tier1_time_s": 1.3573973178863525, "geo/step": 450.0, "geo/rankme_slope": 0.0917151369367327} {"step": 460, "timestamp": 1778195021.4235833, "train/loss": 7.904272890090942, "train/z_loss": 0.0008812162675894796, "train/perplexity": 2708.83217724637, "train/grad_norm": 2.078125, "optim/muon_lr": 0.00184, "optim/adamw_lr": 5.519999999999999e-05, "perf/tokens_per_sec": 1788689.445253271, "perf/iters_per_sec": 0.8529135919824938, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.1724517107009889, "data/tokens_consumed": 966787072, "data/tokens_consumed_B": 0.966787072, "train/loss_slope": -0.0060989052818176594} {"step": 470, "timestamp": 1778195031.767376, "train/loss": 7.809734678268432, "train/z_loss": 0.0008727910346351564, "train/perplexity": 2464.476469369467, "train/grad_norm": 2.046875, "optim/muon_lr": 0.00188, "optim/adamw_lr": 5.6399999999999995e-05, "perf/tokens_per_sec": 2028710.007368411, "perf/iters_per_sec": 0.9673643147317939, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0337367057800293, "data/tokens_consumed": 987758592, "data/tokens_consumed_B": 0.987758592, "train/loss_slope": -0.006243996331342861} {"step": 480, "timestamp": 1778195042.1145782, "train/loss": 7.729195499420166, "train/z_loss": 0.0008650889038108289, "train/perplexity": 2273.7722137088476, "train/grad_norm": 2.09375, "optim/muon_lr": 0.00192, "optim/adamw_lr": 5.76e-05, "perf/tokens_per_sec": 2028145.7871407417, "perf/iters_per_sec": 0.9670952735618313, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0340242862701416, "data/tokens_consumed": 1008730112, "data/tokens_consumed_B": 1.008730112, "train/loss_slope": -0.0063764642408915915} {"step": 490, "timestamp": 1778195052.4747431, "train/loss": 7.661282777786255, "train/z_loss": 0.0008582293521612883, "train/perplexity": 2124.480918318279, "train/grad_norm": 1.9921875, "optim/muon_lr": 0.00196, "optim/adamw_lr": 5.88e-05, "perf/tokens_per_sec": 2025239.247783997, "perf/iters_per_sec": 0.9657093275947557, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0355082750320435, "data/tokens_consumed": 1029701632, "data/tokens_consumed_B": 1.029701632, "train/loss_slope": -0.006494635398333527} {"step": 500, "timestamp": 1778195062.8083868, "grad/layer_0/attn": 0.011967991478741169, "grad/layer_0/mlp": 0.019436677917838097, "grad/layer_0/attn_mlp_ratio": 0.6157426422229916, "grad/layer_4/attn": 0.00635393476113677, "grad/layer_4/mlp": 0.02134820446372032, "grad/layer_4/attn_mlp_ratio": 0.29763321511050195, "grad/layer_8/attn": 0.024291520938277245, "grad/layer_8/mlp": 0.018911508843302727, "grad/layer_8/attn_mlp_ratio": 1.2844834862783268, "grad/layer_12/attn": 0.004050862975418568, "grad/layer_12/mlp": 0.002866310765966773, "grad/layer_12/attn_mlp_ratio": 1.4132671454155965, "grad/layer_16/attn": 0.007787852548062801, "grad/layer_16/mlp": 0.00439818762242794, "grad/layer_16/attn_mlp_ratio": 1.7706958046264698, "grad/layer_20/attn": 0.011171644553542137, "grad/layer_20/mlp": 0.013341368176043034, "grad/layer_20/attn_mlp_ratio": 0.8373687257852669, "grad/layer_24/attn": 0.06751637905836105, "grad/layer_24/mlp": 0.0780634731054306, "grad/layer_24/attn_mlp_ratio": 0.864890790609406, "grad/layer_27/attn": 0.03017599694430828, "grad/layer_27/mlp": 0.03071599081158638, "grad/layer_27/attn_mlp_ratio": 0.982419777085088} {"step": 500, "timestamp": 1778195062.8242476, "train/loss": 7.56210241317749, "train/z_loss": 0.0008523770957253873, "train/perplexity": 1923.8860678487895, "train/grad_norm": 1.6953125, "optim/muon_lr": 0.002, "optim/adamw_lr": 5.9999999999999995e-05, "perf/tokens_per_sec": 2027845.7023561585, "perf/iters_per_sec": 0.9669521819859307, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.034177303314209, "data/tokens_consumed": 1050673152, "data/tokens_consumed_B": 1.050673152, "train/loss_slope": -0.006607184029057014} {"step": 500, "timestamp": 1778195069.9500802, "geo/ww_alpha_mean": 7.834673883357575, "geo/ww_alpha_std": 6.934036666001329, "geo/ww_alpha_min": 1.8748196369700807, "geo/ww_alpha_max": 59.314740176068455, "geo/ww_alpha_healthy_frac": 0.29949238578680204, "geo/ww_alpha_by_type/q_proj": 4.348515384746518, "geo/ww_alpha_by_type/k_proj": 4.103914929529395, "geo/ww_alpha_by_type/v_proj": 5.556229458904582, "geo/ww_alpha_by_type/o_proj": 5.489519952498363, "geo/ww_alpha_by_type/gate_proj": 12.992613760798688, "geo/ww_alpha_by_type/up_proj": 12.742231378435557, "geo/ww_alpha_by_type/down_proj": 9.561192614615527, "geo/twonn_id/layer_0": 0.6247621774673462, "geo/twonn_id/layer_7": 2.6403465270996094, "geo/twonn_id/layer_14": 2.562387228012085, "geo/twonn_id/layer_21": 2.780791997909546, "geo/twonn_id/layer_27": 7.5564680099487305, "geo/tier2_time_s": 7.116715669631958} {"step": 500, "timestamp": 1778195070.7101407, "eoc/jacobian_sigma/layer_0/attn": 1263.8612060546875, "eoc/jacobian_sigma/layer_0/mlp": 1217.1339111328125, "eoc/jacobian_sigma/layer_0": 1263.8612060546875, "eoc/jacobian_sigma/layer_7/attn": 1.0023224353790283, "eoc/jacobian_sigma/layer_7/mlp": 1.1297527551651, "eoc/jacobian_sigma/layer_7": 1.1297527551651, "eoc/jacobian_sigma/layer_14/attn": 1.1104538440704346, "eoc/jacobian_sigma/layer_14/mlp": 1.7032380104064941, "eoc/jacobian_sigma/layer_14": 1.7032380104064941, "eoc/jacobian_sigma/layer_21/attn": 1.000510811805725, "eoc/jacobian_sigma/layer_21/mlp": 2.325498580932617, "eoc/jacobian_sigma/layer_21": 2.325498580932617, "eoc/jacobian_sigma/layer_27/attn": 1.0489614009857178, "eoc/jacobian_sigma/layer_27/mlp": 3.0183815956115723, "eoc/jacobian_sigma/layer_27": 3.0183815956115723, "eoc/layer0_sigma": 1263.8612060546875, "eoc/sigma_max": 3.0183815956115723, "eoc/sigma_min": 1.1297527551651, "eoc/sigma_mean": 2.044217735528946, "eoc/time_s": 0.7538981437683105} {"step": 510, "timestamp": 1778195081.078228, "train/loss": 7.484774732589722, "train/z_loss": 0.0008482620294671505, "train/perplexity": 1780.7229857488464, "train/grad_norm": 1.5078125, "optim/muon_lr": 0.0020399999999999997, "optim/adamw_lr": 6.12e-05, "perf/tokens_per_sec": 1149364.6139189065, "perf/iters_per_sec": 0.5480597562403233, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.8246185541152955, "data/tokens_consumed": 1071644672, "data/tokens_consumed_B": 1.071644672, "train/loss_slope": -0.006709686804619737} {"step": 520, "timestamp": 1778195091.4194307, "train/loss": 7.381152868270874, "train/z_loss": 0.0008472315210383385, "train/perplexity": 1605.439561674781, "train/grad_norm": 1.2890625, "optim/muon_lr": 0.00208, "optim/adamw_lr": 6.239999999999999e-05, "perf/tokens_per_sec": 2029160.7857632115, "perf/iters_per_sec": 0.9675792626205499, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0335070610046386, "data/tokens_consumed": 1092616192, "data/tokens_consumed_B": 1.092616192, "train/loss_slope": -0.006808672505335507} {"step": 525, "timestamp": 1778195097.1990023, "eos/sharpness": 9.12971496582031, "eos/L0_probe": 7.100594520568848, "eos/L_plus": 7.227085113525391, "eos/L_minus": 7.065401077270508, "eos/grad_norm": 0.998841404914856, "eos/embed_grad_frac": 0.9407016038894653, "eos/time_s": 0.617748498916626} {"step": 525, "timestamp": 1778195098.5764782, "geo/rankme_last": 94.53275299072266, "geo/layer_0/stable_rank_q_proj": 53.597740173339844, "geo/layer_0/stable_rank_k_proj": 45.87440872192383, "geo/layer_0/stable_rank_o_proj": 67.78486633300781, "geo/layer_0/stable_rank_gate_proj": 161.3246307373047, "geo/layer_0/stable_rank_down_proj": 56.03753662109375, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.03392110764980316, "geo/layer_0/attn_entropy_mean": 7.008700370788574, "geo/layer_0/attn_entropy_std": 0.009117839857935905, "geo/layer_7/stable_rank_q_proj": 27.274110794067383, "geo/layer_7/stable_rank_k_proj": 28.83620834350586, "geo/layer_7/stable_rank_o_proj": 117.30107879638672, "geo/layer_7/stable_rank_gate_proj": 172.77853393554688, "geo/layer_7/stable_rank_down_proj": 200.0183868408203, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.9047333598136902, "geo/layer_7/attn_entropy_mean": 6.308781623840332, "geo/layer_7/attn_entropy_std": 0.52081298828125, "geo/layer_14/stable_rank_q_proj": 33.99421691894531, "geo/layer_14/stable_rank_k_proj": 22.27796745300293, "geo/layer_14/stable_rank_o_proj": 92.400390625, "geo/layer_14/stable_rank_gate_proj": 193.6947021484375, "geo/layer_14/stable_rank_down_proj": 164.4166259765625, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.9062811732292175, "geo/layer_14/attn_entropy_mean": 6.60915470123291, "geo/layer_14/attn_entropy_std": 0.5361649394035339, "geo/layer_21/stable_rank_q_proj": 64.2425537109375, "geo/layer_21/stable_rank_k_proj": 36.458839416503906, "geo/layer_21/stable_rank_o_proj": 65.63127136230469, "geo/layer_21/stable_rank_gate_proj": 150.22425842285156, "geo/layer_21/stable_rank_down_proj": 156.17962646484375, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.9252816438674927, "geo/layer_21/attn_entropy_mean": 6.96382999420166, "geo/layer_21/attn_entropy_std": 0.11405786126852036, "geo/layer_27/stable_rank_q_proj": 59.46086502075195, "geo/layer_27/stable_rank_k_proj": 20.400344848632812, "geo/layer_27/stable_rank_o_proj": 75.02449798583984, "geo/layer_27/stable_rank_gate_proj": 82.36335754394531, "geo/layer_27/stable_rank_down_proj": 60.8714599609375, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.972405195236206, "geo/layer_27/attn_entropy_mean": 6.537088394165039, "geo/layer_27/attn_entropy_std": 0.16922588646411896, "attnres/final_alpha/block_0": 0.09437788277864456, "attnres/block_norm/0": 0.4713900089263916, "attnres/final_alpha/block_1": 0.00537223881110549, "attnres/block_norm/1": 3164.26513671875, "attnres/final_alpha/block_2": 0.11930084228515625, "attnres/block_norm/2": 2658.101806640625, "attnres/final_alpha/block_3": 0.038610197603702545, "attnres/block_norm/3": 738.4912719726562, "attnres/final_alpha/block_4": 0.023010239005088806, "attnres/block_norm/4": 2142.20263671875, "attnres/final_alpha/block_5": 0.5987387895584106, "attnres/block_norm/5": 795.0228271484375, "attnres/final_alpha/block_6": 0.12058980762958527, "attnres/block_norm/6": 1351.8853759765625, "geo/tier1_time_s": 1.3584725856781006, "geo/step": 525.0, "geo/rankme_slope": 0.12153215983557322} {"step": 530, "timestamp": 1778195103.7466934, "train/loss": 7.311044597625733, "train/z_loss": 0.0008504689496476204, "train/perplexity": 1496.7398637743133, "train/grad_norm": 0.734375, "optim/muon_lr": 0.00212, "optim/adamw_lr": 6.359999999999999e-05, "perf/tokens_per_sec": 1702077.5793198755, "perf/iters_per_sec": 0.8116138359641435, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.2321130514144898, "data/tokens_consumed": 1113587712, "data/tokens_consumed_B": 1.113587712, "train/loss_slope": -0.006897468130134595} {"step": 540, "timestamp": 1778195114.0934937, "train/loss": 7.229951620101929, "train/z_loss": 0.0008565863943658769, "train/perplexity": 1380.1557306782477, "train/grad_norm": 0.4921875, "optim/muon_lr": 0.00216, "optim/adamw_lr": 6.479999999999999e-05, "perf/tokens_per_sec": 2027764.7348903157, "perf/iters_per_sec": 0.9669135736896113, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0342185974121094, "data/tokens_consumed": 1134559232, "data/tokens_consumed_B": 1.134559232, "train/loss_slope": -0.006979283623537116} {"step": 550, "timestamp": 1778195124.4282434, "grad/layer_0/attn": 0.014717315323650837, "grad/layer_0/mlp": 0.020534269511699677, "grad/layer_0/attn_mlp_ratio": 0.7167196886937458, "grad/layer_4/attn": 0.004515049513429403, "grad/layer_4/mlp": 0.0138735082000494, "grad/layer_4/attn_mlp_ratio": 0.32544396239077655, "grad/layer_8/attn": 0.022717749699950218, "grad/layer_8/mlp": 0.018555309623479843, "grad/layer_8/attn_mlp_ratio": 1.2243260844739892, "grad/layer_12/attn": 0.0025317594408988953, "grad/layer_12/mlp": 0.0017834317404776812, "grad/layer_12/attn_mlp_ratio": 1.4195997758012322, "grad/layer_16/attn": 0.006294414401054382, "grad/layer_16/mlp": 0.0026230523362755775, "grad/layer_16/attn_mlp_ratio": 2.3996525246714864, "grad/layer_20/attn": 0.01419683825224638, "grad/layer_20/mlp": 0.007945661433041096, "grad/layer_20/attn_mlp_ratio": 1.7867408765413055, "grad/layer_24/attn": 0.026566287502646446, "grad/layer_24/mlp": 0.03738683462142944, "grad/layer_24/attn_mlp_ratio": 0.7105786756379014, "grad/layer_27/attn": 0.010246548801660538, "grad/layer_27/mlp": 0.015173246152698994, "grad/layer_27/attn_mlp_ratio": 0.6753036648197743} {"step": 550, "timestamp": 1778195124.446653, "train/loss": 7.173527002334595, "train/z_loss": 0.0008625872957054526, "train/perplexity": 1304.4372521342339, "train/grad_norm": 0.419921875, "optim/muon_lr": 0.0022, "optim/adamw_lr": 6.599999999999999e-05, "perf/tokens_per_sec": 2026998.8090950262, "perf/iters_per_sec": 0.966548351809991, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0346093893051147, "data/tokens_consumed": 1155530752, "data/tokens_consumed_B": 1.155530752, "train/loss_slope": -0.007050127925240497} {"step": 560, "timestamp": 1778195134.7950706, "train/loss": 7.207624912261963, "train/z_loss": 0.0008676004479639232, "train/perplexity": 1349.682842380813, "train/grad_norm": 0.455078125, "optim/muon_lr": 0.0022400000000000002, "optim/adamw_lr": 6.72e-05, "perf/tokens_per_sec": 2028070.9680394982, "perf/iters_per_sec": 0.9670595970342151, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.034062433242798, "data/tokens_consumed": 1176502272, "data/tokens_consumed_B": 1.176502272, "train/loss_slope": -0.007094788561220889} {"step": 570, "timestamp": 1778195145.1376104, "train/loss": 7.089724826812744, "train/z_loss": 0.0008700942329596728, "train/perplexity": 1199.5776635814461, "train/grad_norm": 0.46484375, "optim/muon_lr": 0.0022800000000000003, "optim/adamw_lr": 6.84e-05, "perf/tokens_per_sec": 2028798.4905234594, "perf/iters_per_sec": 0.967406506787996, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0336916208267213, "data/tokens_consumed": 1197473792, "data/tokens_consumed_B": 1.197473792, "train/loss_slope": -0.007143218157574428} {"step": 580, "timestamp": 1778195155.4864924, "train/loss": 7.1223304748535154, "train/z_loss": 0.0008704530773684382, "train/perplexity": 1239.3353103693091, "train/grad_norm": 0.388671875, "optim/muon_lr": 0.00232, "optim/adamw_lr": 6.96e-05, "perf/tokens_per_sec": 2028008.3112896204, "perf/iters_per_sec": 0.9670297199676611, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0340943813323975, "data/tokens_consumed": 1218445312, "data/tokens_consumed_B": 1.218445312, "train/loss_slope": -0.007169253347910754} {"step": 590, "timestamp": 1778195165.831902, "train/loss": 7.080363750457764, "train/z_loss": 0.000872576207621023, "train/perplexity": 1188.4007212037081, "train/grad_norm": 0.384765625, "optim/muon_lr": 0.00236, "optim/adamw_lr": 7.079999999999999e-05, "perf/tokens_per_sec": 2028222.5759832948, "perf/iters_per_sec": 0.967131889335296, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0339851379394531, "data/tokens_consumed": 1239416832, "data/tokens_consumed_B": 1.239416832, "train/loss_slope": -0.007187897297036949} {"step": 600, "timestamp": 1778195176.1658194, "grad/layer_0/attn": 0.018630163744091988, "grad/layer_0/mlp": 0.025657016783952713, "grad/layer_0/attn_mlp_ratio": 0.7261235329249949, "grad/layer_4/attn": 0.007834000512957573, "grad/layer_4/mlp": 0.021180883049964905, "grad/layer_4/attn_mlp_ratio": 0.3698618446403427, "grad/layer_8/attn": 0.04847249388694763, "grad/layer_8/mlp": 0.027349667623639107, "grad/layer_8/attn_mlp_ratio": 1.7723247820320485, "grad/layer_12/attn": 0.0025844196788966656, "grad/layer_12/mlp": 0.0020568689797073603, "grad/layer_12/attn_mlp_ratio": 1.2564823422132254, "grad/layer_16/attn": 0.0033222255297005177, "grad/layer_16/mlp": 0.003131189849227667, "grad/layer_16/attn_mlp_ratio": 1.0610105370707297, "grad/layer_20/attn": 0.008133573457598686, "grad/layer_20/mlp": 0.009147203527390957, "grad/layer_20/attn_mlp_ratio": 0.8891868803754415, "grad/layer_24/attn": 0.028446678072214127, "grad/layer_24/mlp": 0.04084426164627075, "grad/layer_24/attn_mlp_ratio": 0.6964669418908379, "grad/layer_27/attn": 0.013033876195549965, "grad/layer_27/mlp": 0.017522551119327545, "grad/layer_27/attn_mlp_ratio": 0.7438343898902958} {"step": 600, "timestamp": 1778195176.7874284, "eos/sharpness": 29.753684997558587, "eos/L0_probe": 6.807113170623779, "eos/L_plus": 7.025510787963867, "eos/L_minus": 6.886252403259277, "eos/grad_norm": 0.4632958173751831, "eos/embed_grad_frac": 0.7088658213615417, "eos/time_s": 0.618715763092041} {"step": 600, "timestamp": 1778195176.8079321, "train/loss": 7.053962564468383, "train/z_loss": 0.0008748586638830602, "train/perplexity": 1157.4360828878189, "train/grad_norm": 0.462890625, "optim/muon_lr": 0.0024, "optim/adamw_lr": 7.199999999999999e-05, "perf/tokens_per_sec": 1911561.242268701, "perf/iters_per_sec": 0.9115034304946428, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0970885753631592, "data/tokens_consumed": 1260388352, "data/tokens_consumed_B": 1.260388352, "train/loss_slope": -0.00719755169199233} {"step": 600, "timestamp": 1778195178.1734436, "geo/rankme_last": 171.71005249023438, "geo/layer_0/stable_rank_q_proj": 53.65927505493164, "geo/layer_0/stable_rank_k_proj": 45.89857864379883, "geo/layer_0/stable_rank_o_proj": 67.80802917480469, "geo/layer_0/stable_rank_gate_proj": 161.37066650390625, "geo/layer_0/stable_rank_down_proj": 55.9989128112793, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.032827991992235184, "geo/layer_0/attn_entropy_mean": 7.0034332275390625, "geo/layer_0/attn_entropy_std": 0.008186784572899342, "geo/layer_7/stable_rank_q_proj": 27.27775764465332, "geo/layer_7/stable_rank_k_proj": 28.83304786682129, "geo/layer_7/stable_rank_o_proj": 117.34831237792969, "geo/layer_7/stable_rank_gate_proj": 172.80381774902344, "geo/layer_7/stable_rank_down_proj": 200.09323120117188, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.8730692267417908, "geo/layer_7/attn_entropy_mean": 6.388195991516113, "geo/layer_7/attn_entropy_std": 0.47431573271751404, "geo/layer_14/stable_rank_q_proj": 33.98918914794922, "geo/layer_14/stable_rank_k_proj": 22.287551879882812, "geo/layer_14/stable_rank_o_proj": 92.4002685546875, "geo/layer_14/stable_rank_gate_proj": 193.736572265625, "geo/layer_14/stable_rank_down_proj": 164.5087127685547, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.8646008372306824, "geo/layer_14/attn_entropy_mean": 6.5687713623046875, "geo/layer_14/attn_entropy_std": 0.5932270288467407, "geo/layer_21/stable_rank_q_proj": 64.2405014038086, "geo/layer_21/stable_rank_k_proj": 36.461063385009766, "geo/layer_21/stable_rank_o_proj": 65.6396484375, "geo/layer_21/stable_rank_gate_proj": 150.27743530273438, "geo/layer_21/stable_rank_down_proj": 156.1737823486328, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.8774528503417969, "geo/layer_21/attn_entropy_mean": 6.944369316101074, "geo/layer_21/attn_entropy_std": 0.13737952709197998, "geo/layer_27/stable_rank_q_proj": 59.459102630615234, "geo/layer_27/stable_rank_k_proj": 20.407939910888672, "geo/layer_27/stable_rank_o_proj": 75.02825164794922, "geo/layer_27/stable_rank_gate_proj": 82.38778686523438, "geo/layer_27/stable_rank_down_proj": 60.87617874145508, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.8322605490684509, "geo/layer_27/attn_entropy_mean": 6.370878219604492, "geo/layer_27/attn_entropy_std": 0.2978239953517914, "attnres/final_alpha/block_0": 0.11812694370746613, "attnres/block_norm/0": 0.4774287939071655, "attnres/final_alpha/block_1": 0.006408396176993847, "attnres/block_norm/1": 2982.27197265625, "attnres/final_alpha/block_2": 0.13854017853736877, "attnres/block_norm/2": 2098.562255859375, "attnres/final_alpha/block_3": 0.05117961764335632, "attnres/block_norm/3": 730.3756103515625, "attnres/final_alpha/block_4": 0.030653759837150574, "attnres/block_norm/4": 1738.7540283203125, "attnres/final_alpha/block_5": 0.5269680023193359, "attnres/block_norm/5": 790.924560546875, "attnres/final_alpha/block_6": 0.12812311947345734, "attnres/block_norm/6": 1190.184326171875, "geo/tier1_time_s": 1.3612616062164307, "geo/step": 600.0, "geo/rankme_slope": 0.19594667053222656} {"step": 610, "timestamp": 1778195188.5249984, "train/loss": 6.981416034698486, "train/z_loss": 0.0008789484156295658, "train/perplexity": 1076.441566898915, "train/grad_norm": 0.466796875, "optim/muon_lr": 0.00244, "optim/adamw_lr": 7.319999999999999e-05, "perf/tokens_per_sec": 1790419.0920912554, "perf/iters_per_sec": 0.8537383518654134, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.171319055557251, "data/tokens_consumed": 1281359872, "data/tokens_consumed_B": 1.281359872, "train/loss_slope": -0.007206389164048661} {"step": 620, "timestamp": 1778195198.8730474, "train/loss": 6.9644513607025145, "train/z_loss": 0.0008826933742966502, "train/perplexity": 1058.3341144300107, "train/grad_norm": 0.455078125, "optim/muon_lr": 0.00248, "optim/adamw_lr": 7.439999999999999e-05, "perf/tokens_per_sec": 2027632.1248974986, "perf/iters_per_sec": 0.9668503403174871, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0342862367630006, "data/tokens_consumed": 1302331392, "data/tokens_consumed_B": 1.302331392, "train/loss_slope": -0.007206211985759838} {"step": 630, "timestamp": 1778195209.219241, "train/loss": 6.821253299713135, "train/z_loss": 0.0008891122473869473, "train/perplexity": 917.1337335285394, "train/grad_norm": 0.45703125, "optim/muon_lr": 0.00252, "optim/adamw_lr": 7.56e-05, "perf/tokens_per_sec": 2028350.2117161537, "perf/iters_per_sec": 0.9671927507954377, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0339200735092162, "data/tokens_consumed": 1323302912, "data/tokens_consumed_B": 1.323302912, "train/loss_slope": -0.007216310898353766} {"step": 640, "timestamp": 1778195219.5738552, "train/loss": 6.839739656448364, "train/z_loss": 0.0008929953968618065, "train/perplexity": 934.245878175421, "train/grad_norm": 0.53125, "optim/muon_lr": 0.00256, "optim/adamw_lr": 7.68e-05, "perf/tokens_per_sec": 2026532.3224358533, "perf/iters_per_sec": 0.9663259136370913, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0348475456237793, "data/tokens_consumed": 1344274432, "data/tokens_consumed_B": 1.344274432, "train/loss_slope": -0.007212827605384213} {"step": 650, "timestamp": 1778195229.9188268, "grad/layer_0/attn": 0.027422405779361725, "grad/layer_0/mlp": 0.030349567532539368, "grad/layer_0/attn_mlp_ratio": 0.9035517774546719, "grad/layer_4/attn": 0.00938005093485117, "grad/layer_4/mlp": 0.025644291192293167, "grad/layer_4/attn_mlp_ratio": 0.36577540115800117, "grad/layer_8/attn": 0.03411194309592247, "grad/layer_8/mlp": 0.02416781522333622, "grad/layer_8/attn_mlp_ratio": 1.4114615921855498, "grad/layer_12/attn": 0.0031448621302843094, "grad/layer_12/mlp": 0.003071000799536705, "grad/layer_12/attn_mlp_ratio": 1.024051191505267, "grad/layer_16/attn": 0.006615301128476858, "grad/layer_16/mlp": 0.0037703244015574455, "grad/layer_16/attn_mlp_ratio": 1.754570760618671, "grad/layer_20/attn": 0.011121255345642567, "grad/layer_20/mlp": 0.011247589252889156, "grad/layer_20/attn_mlp_ratio": 0.9887679036562499, "grad/layer_24/attn": 0.026543881744146347, "grad/layer_24/mlp": 0.03283807262778282, "grad/layer_24/attn_mlp_ratio": 0.8083264192812618, "grad/layer_27/attn": 0.010323851369321346, "grad/layer_27/mlp": 0.014564977027475834, "grad/layer_27/attn_mlp_ratio": 0.7088134281959225} {"step": 650, "timestamp": 1778195229.9346473, "train/loss": 6.7392984390258786, "train/z_loss": 0.0008992297807708383, "train/perplexity": 844.9677314761999, "train/grad_norm": 0.373046875, "optim/muon_lr": 0.0026000000000000003, "optim/adamw_lr": 7.8e-05, "perf/tokens_per_sec": 2025380.9654351992, "perf/iters_per_sec": 0.9657769038368221, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0354358196258544, "data/tokens_consumed": 1365245952, "data/tokens_consumed_B": 1.365245952, "train/loss_slope": -0.007213493170189268} {"step": 660, "timestamp": 1778195240.2891648, "train/loss": 6.632194232940674, "train/z_loss": 0.0009053121553733945, "train/perplexity": 759.1460878159427, "train/grad_norm": 0.427734375, "optim/muon_lr": 0.00264, "optim/adamw_lr": 7.92e-05, "perf/tokens_per_sec": 2026188.7008575252, "perf/iters_per_sec": 0.9661620621001841, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.035023045539856, "data/tokens_consumed": 1386217472, "data/tokens_consumed_B": 1.386217472, "train/loss_slope": -0.00721870614521539} {"step": 670, "timestamp": 1778195250.6406465, "train/loss": 6.620101642608643, "train/z_loss": 0.000910749682225287, "train/perplexity": 750.0213273619468, "train/grad_norm": 0.326171875, "optim/muon_lr": 0.00268, "optim/adamw_lr": 8.04e-05, "perf/tokens_per_sec": 2027011.7947766744, "perf/iters_per_sec": 0.9665545438655254, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0346027612686157, "data/tokens_consumed": 1407188992, "data/tokens_consumed_B": 1.407188992, "train/loss_slope": -0.007215787769296804} {"step": 675, "timestamp": 1778195256.4087825, "eos/sharpness": 40.48147201538085, "eos/L0_probe": 6.300657272338867, "eos/L_plus": 6.538933753967285, "eos/L_minus": 6.467195510864258, "eos/grad_norm": 0.4003520607948303, "eos/embed_grad_frac": 0.591850757598877, "eos/time_s": 0.6048731803894043} {"step": 675, "timestamp": 1778195257.7875583, "geo/rankme_last": 220.5989990234375, "geo/layer_0/stable_rank_q_proj": 53.70150375366211, "geo/layer_0/stable_rank_k_proj": 45.95170974731445, "geo/layer_0/stable_rank_o_proj": 67.79117584228516, "geo/layer_0/stable_rank_gate_proj": 161.4892578125, "geo/layer_0/stable_rank_down_proj": 55.91875457763672, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.036438118666410446, "geo/layer_0/attn_entropy_mean": 6.99299955368042, "geo/layer_0/attn_entropy_std": 0.007851370610296726, "geo/layer_7/stable_rank_q_proj": 27.28011703491211, "geo/layer_7/stable_rank_k_proj": 28.850069046020508, "geo/layer_7/stable_rank_o_proj": 117.40959930419922, "geo/layer_7/stable_rank_gate_proj": 172.8680419921875, "geo/layer_7/stable_rank_down_proj": 200.229736328125, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.8469047546386719, "geo/layer_7/attn_entropy_mean": 6.452199935913086, "geo/layer_7/attn_entropy_std": 0.4133458137512207, "geo/layer_14/stable_rank_q_proj": 33.988155364990234, "geo/layer_14/stable_rank_k_proj": 22.3093318939209, "geo/layer_14/stable_rank_o_proj": 92.4094009399414, "geo/layer_14/stable_rank_gate_proj": 193.70677185058594, "geo/layer_14/stable_rank_down_proj": 164.570556640625, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.8404231667518616, "geo/layer_14/attn_entropy_mean": 6.570333480834961, "geo/layer_14/attn_entropy_std": 0.5996370911598206, "geo/layer_21/stable_rank_q_proj": 64.21915435791016, "geo/layer_21/stable_rank_k_proj": 36.458717346191406, "geo/layer_21/stable_rank_o_proj": 65.6390151977539, "geo/layer_21/stable_rank_gate_proj": 150.35104370117188, "geo/layer_21/stable_rank_down_proj": 156.11912536621094, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.8408520817756653, "geo/layer_21/attn_entropy_mean": 6.928847312927246, "geo/layer_21/attn_entropy_std": 0.1499568521976471, "geo/layer_27/stable_rank_q_proj": 59.46834945678711, "geo/layer_27/stable_rank_k_proj": 20.422761917114258, "geo/layer_27/stable_rank_o_proj": 75.03812408447266, "geo/layer_27/stable_rank_gate_proj": 82.41206359863281, "geo/layer_27/stable_rank_down_proj": 60.873497009277344, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.5801749229431152, "geo/layer_27/attn_entropy_mean": 6.360488414764404, "geo/layer_27/attn_entropy_std": 0.2406117469072342, "attnres/final_alpha/block_0": 0.1406630128622055, "attnres/block_norm/0": 0.4896619915962219, "attnres/final_alpha/block_1": 0.008064803667366505, "attnres/block_norm/1": 2931.848388671875, "attnres/final_alpha/block_2": 0.13301944732666016, "attnres/block_norm/2": 1764.095703125, "attnres/final_alpha/block_3": 0.058341845870018005, "attnres/block_norm/3": 735.8775634765625, "attnres/final_alpha/block_4": 0.041585713624954224, "attnres/block_norm/4": 1522.3466796875, "attnres/final_alpha/block_5": 0.4821341633796692, "attnres/block_norm/5": 797.09423828125, "attnres/final_alpha/block_6": 0.1361910104751587, "attnres/block_norm/6": 1078.596435546875, "geo/tier1_time_s": 1.3595097064971924, "geo/step": 675.0, "geo/rankme_slope": 0.25869801870018544} {"step": 680, "timestamp": 1778195262.964388, "train/loss": 6.541336917877198, "train/z_loss": 0.0009161023946944625, "train/perplexity": 693.212727293616, "train/grad_norm": 0.44921875, "optim/muon_lr": 0.00272, "optim/adamw_lr": 8.16e-05, "perf/tokens_per_sec": 1702438.5669187827, "perf/iters_per_sec": 0.811785968264953, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.2318517923355103, "data/tokens_consumed": 1428160512, "data/tokens_consumed_B": 1.428160512, "train/loss_slope": -0.007213936640914008} {"step": 690, "timestamp": 1778195273.3085845, "train/loss": 6.470947265625, "train/z_loss": 0.0009226729918736965, "train/perplexity": 646.0954612027771, "train/grad_norm": 0.361328125, "optim/muon_lr": 0.0027600000000000003, "optim/adamw_lr": 8.28e-05, "perf/tokens_per_sec": 2028600.7593353335, "perf/iters_per_sec": 0.9673122212101619, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0337923765182495, "data/tokens_consumed": 1449132032, "data/tokens_consumed_B": 1.449132032, "train/loss_slope": -0.007212028478055622} {"step": 700, "timestamp": 1778195283.6393073, "grad/layer_0/attn": 0.019124355167150497, "grad/layer_0/mlp": 0.028761301189661026, "grad/layer_0/attn_mlp_ratio": 0.6649335846992858, "grad/layer_4/attn": 0.00843801163136959, "grad/layer_4/mlp": 0.024515271186828613, "grad/layer_4/attn_mlp_ratio": 0.3441940956983455, "grad/layer_8/attn": 0.03078361228108406, "grad/layer_8/mlp": 0.030334508046507835, "grad/layer_8/attn_mlp_ratio": 1.0148050574087824, "grad/layer_12/attn": 0.0037609233986586332, "grad/layer_12/mlp": 0.0028127525001764297, "grad/layer_12/attn_mlp_ratio": 1.3370971191788177, "grad/layer_16/attn": 0.0054542263969779015, "grad/layer_16/mlp": 0.0038003914523869753, "grad/layer_16/attn_mlp_ratio": 1.4351748554835553, "grad/layer_20/attn": 0.03957623988389969, "grad/layer_20/mlp": 0.012508152984082699, "grad/layer_20/attn_mlp_ratio": 3.1640354589409845, "grad/layer_24/attn": 0.015891794115304947, "grad/layer_24/mlp": 0.02840917930006981, "grad/layer_24/attn_mlp_ratio": 0.5593894104264728, "grad/layer_27/attn": 0.013796052895486355, "grad/layer_27/mlp": 0.013304931111633778, "grad/layer_27/attn_mlp_ratio": 1.0369127563337677} {"step": 700, "timestamp": 1778195283.6548479, "train/loss": 6.437418794631958, "train/z_loss": 0.0009279416932258755, "train/perplexity": 624.7920001785442, "train/grad_norm": 0.37109375, "optim/muon_lr": 0.0028000000000000004, "optim/adamw_lr": 8.4e-05, "perf/tokens_per_sec": 2027936.0737972795, "perf/iters_per_sec": 0.9669952744470975, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0341312170028687, "data/tokens_consumed": 1470103552, "data/tokens_consumed_B": 1.470103552, "train/loss_slope": -0.007205747533532135} {"step": 710, "timestamp": 1778195293.9979706, "train/loss": 6.405519580841064, "train/z_loss": 0.0009334744710940868, "train/perplexity": 605.1761549403012, "train/grad_norm": 0.435546875, "optim/muon_lr": 0.0028399999999999996, "optim/adamw_lr": 8.519999999999998e-05, "perf/tokens_per_sec": 2028480.9041523621, "perf/iters_per_sec": 0.967255069805318, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0338534593582154, "data/tokens_consumed": 1491075072, "data/tokens_consumed_B": 1.491075072, "train/loss_slope": -0.007195391384564272} {"step": 720, "timestamp": 1778195304.3786335, "train/loss": 6.348507452011108, "train/z_loss": 0.0009394535911269486, "train/perplexity": 571.638873525905, "train/grad_norm": 0.3515625, "optim/muon_lr": 0.0028799999999999997, "optim/adamw_lr": 8.639999999999999e-05, "perf/tokens_per_sec": 2021738.2421415525, "perf/iters_per_sec": 0.9640399180133593, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0373014450073241, "data/tokens_consumed": 1512046592, "data/tokens_consumed_B": 1.512046592, "train/loss_slope": -0.007184203837104128} {"step": 730, "timestamp": 1778195314.7328005, "train/loss": 6.296372413635254, "train/z_loss": 0.0009442573704291135, "train/perplexity": 542.6000072764953, "train/grad_norm": 0.359375, "optim/muon_lr": 0.00292, "optim/adamw_lr": 8.759999999999999e-05, "perf/tokens_per_sec": 2026337.6468055272, "perf/iters_per_sec": 0.9662330850627552, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0349469661712647, "data/tokens_consumed": 1533018112, "data/tokens_consumed_B": 1.533018112, "train/loss_slope": -0.0071717687123089265} {"step": 740, "timestamp": 1778195325.0868337, "train/loss": 6.168738889694214, "train/z_loss": 0.0009534289827570319, "train/perplexity": 477.5834407563922, "train/grad_norm": 0.36328125, "optim/muon_lr": 0.00296, "optim/adamw_lr": 8.879999999999999e-05, "perf/tokens_per_sec": 2026419.4340249181, "perf/iters_per_sec": 0.9662720842480269, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.034905195236206, "data/tokens_consumed": 1553989632, "data/tokens_consumed_B": 1.553989632, "train/loss_slope": -0.00716618809584703} {"step": 750, "timestamp": 1778195335.436736, "grad/layer_0/attn": 0.023532312363386154, "grad/layer_0/mlp": 0.03325776010751724, "grad/layer_0/attn_mlp_ratio": 0.7075735773110526, "grad/layer_4/attn": 0.011055517010390759, "grad/layer_4/mlp": 0.03264372795820236, "grad/layer_4/attn_mlp_ratio": 0.3386720104603018, "grad/layer_8/attn": 0.08024422824382782, "grad/layer_8/mlp": 0.04022609442472458, "grad/layer_8/attn_mlp_ratio": 1.9948302014381853, "grad/layer_12/attn": 0.0040375711396336555, "grad/layer_12/mlp": 0.0031674099154770374, "grad/layer_12/attn_mlp_ratio": 1.2747232344106751, "grad/layer_16/attn": 0.004551580175757408, "grad/layer_16/mlp": 0.004724679980427027, "grad/layer_16/attn_mlp_ratio": 0.9633626189026595, "grad/layer_20/attn": 0.02462703548371792, "grad/layer_20/mlp": 0.01576380804181099, "grad/layer_20/attn_mlp_ratio": 1.5622516629340741, "grad/layer_24/attn": 0.02490127831697464, "grad/layer_24/mlp": 0.025184158235788345, "grad/layer_24/attn_mlp_ratio": 0.9887675412836125, "grad/layer_27/attn": 0.009808707982301712, "grad/layer_27/mlp": 0.01095470692962408, "grad/layer_27/attn_mlp_ratio": 0.895387522073998} {"step": 750, "timestamp": 1778195336.049564, "eos/sharpness": 76.84602737426756, "eos/L0_probe": 5.872945785522461, "eos/L_plus": 6.281510353088379, "eos/L_minus": 6.232841491699219, "eos/grad_norm": 0.4232307970523834, "eos/embed_grad_frac": 0.33051997423171997, "eos/time_s": 0.610053300857544} {"step": 750, "timestamp": 1778195336.0692225, "train/loss": 6.172613954544067, "train/z_loss": 0.0009580438723787666, "train/perplexity": 479.437697923612, "train/grad_norm": 0.423828125, "optim/muon_lr": 0.003, "optim/adamw_lr": 8.999999999999999e-05, "perf/tokens_per_sec": 1911065.6509956957, "perf/iters_per_sec": 0.9112671141603926, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.097373080253601, "data/tokens_consumed": 1574961152, "data/tokens_consumed_B": 1.574961152, "train/loss_slope": -0.007153291879577011} {"step": 750, "timestamp": 1778195337.431085, "geo/rankme_last": 248.08596801757812, "geo/layer_0/stable_rank_q_proj": 53.74594497680664, "geo/layer_0/stable_rank_k_proj": 46.08567428588867, "geo/layer_0/stable_rank_o_proj": 67.83948516845703, "geo/layer_0/stable_rank_gate_proj": 161.6940155029297, "geo/layer_0/stable_rank_down_proj": 55.76530075073242, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.041127678006887436, "geo/layer_0/attn_entropy_mean": 6.984000205993652, "geo/layer_0/attn_entropy_std": 0.010321698151528835, "geo/layer_7/stable_rank_q_proj": 27.292695999145508, "geo/layer_7/stable_rank_k_proj": 28.86719512939453, "geo/layer_7/stable_rank_o_proj": 117.48016357421875, "geo/layer_7/stable_rank_gate_proj": 172.83811950683594, "geo/layer_7/stable_rank_down_proj": 200.58660888671875, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.8355101346969604, "geo/layer_7/attn_entropy_mean": 6.530071258544922, "geo/layer_7/attn_entropy_std": 0.34948816895484924, "geo/layer_14/stable_rank_q_proj": 33.97791290283203, "geo/layer_14/stable_rank_k_proj": 22.33348846435547, "geo/layer_14/stable_rank_o_proj": 92.3944091796875, "geo/layer_14/stable_rank_gate_proj": 193.63600158691406, "geo/layer_14/stable_rank_down_proj": 164.7801055908203, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.8243632912635803, "geo/layer_14/attn_entropy_mean": 6.600905895233154, "geo/layer_14/attn_entropy_std": 0.5491405725479126, "geo/layer_21/stable_rank_q_proj": 64.19541931152344, "geo/layer_21/stable_rank_k_proj": 36.46087646484375, "geo/layer_21/stable_rank_o_proj": 65.64884948730469, "geo/layer_21/stable_rank_gate_proj": 150.34292602539062, "geo/layer_21/stable_rank_down_proj": 156.07395935058594, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.8122325539588928, "geo/layer_21/attn_entropy_mean": 6.865647315979004, "geo/layer_21/attn_entropy_std": 0.13315704464912415, "geo/layer_27/stable_rank_q_proj": 59.492218017578125, "geo/layer_27/stable_rank_k_proj": 20.439220428466797, "geo/layer_27/stable_rank_o_proj": 75.01423645019531, "geo/layer_27/stable_rank_gate_proj": 82.51544952392578, "geo/layer_27/stable_rank_down_proj": 60.89278030395508, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.4105471968650818, "geo/layer_27/attn_entropy_mean": 6.289881706237793, "geo/layer_27/attn_entropy_std": 0.2627735137939453, "attnres/final_alpha/block_0": 0.1597878336906433, "attnres/block_norm/0": 0.5013538599014282, "attnres/final_alpha/block_1": 0.009607411921024323, "attnres/block_norm/1": 2907.1708984375, "attnres/final_alpha/block_2": 0.12494468688964844, "attnres/block_norm/2": 1584.18212890625, "attnres/final_alpha/block_3": 0.059181734919548035, "attnres/block_norm/3": 783.265625, "attnres/final_alpha/block_4": 0.055550381541252136, "attnres/block_norm/4": 1352.1944580078125, "attnres/final_alpha/block_5": 0.43878722190856934, "attnres/block_norm/5": 813.2447509765625, "attnres/final_alpha/block_6": 0.15214073657989502, "attnres/block_norm/6": 1035.636962890625, "geo/tier1_time_s": 1.3578238487243652, "geo/step": 750.0, "geo/rankme_slope": 0.2978257950291489} {"step": 760, "timestamp": 1778195347.7836993, "train/loss": 6.173643636703491, "train/z_loss": 0.0009628941887058317, "train/perplexity": 479.9316206157714, "train/grad_norm": 0.330078125, "optim/muon_lr": 0.00304, "optim/adamw_lr": 9.12e-05, "perf/tokens_per_sec": 1790816.5233220337, "perf/iters_per_sec": 0.8539278618440789, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.1710591077804566, "data/tokens_consumed": 1595932672, "data/tokens_consumed_B": 1.595932672, "train/loss_slope": -0.0071341257855257745} {"step": 770, "timestamp": 1778195358.142936, "train/loss": 6.065651273727417, "train/z_loss": 0.0009691795974504202, "train/perplexity": 430.8031571057532, "train/grad_norm": 0.427734375, "optim/muon_lr": 0.00308, "optim/adamw_lr": 9.24e-05, "perf/tokens_per_sec": 2025373.2238470577, "perf/iters_per_sec": 0.9657732123599327, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0354397773742676, "data/tokens_consumed": 1616904192, "data/tokens_consumed_B": 1.616904192, "train/loss_slope": -0.007119965436123055} {"step": 780, "timestamp": 1778195368.5031257, "train/loss": 6.076366281509399, "train/z_loss": 0.0009739795117639005, "train/perplexity": 435.4440354081502, "train/grad_norm": 0.7109375, "optim/muon_lr": 0.00312, "optim/adamw_lr": 9.36e-05, "perf/tokens_per_sec": 2025907.205680956, "perf/iters_per_sec": 0.9660278347401409, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.03516685962677, "data/tokens_consumed": 1637875712, "data/tokens_consumed_B": 1.637875712, "train/loss_slope": -0.007099076961124535} {"step": 790, "timestamp": 1778195378.850987, "train/loss": 5.993340396881104, "train/z_loss": 0.0009806924732401966, "train/perplexity": 400.7510441119804, "train/grad_norm": 0.5546875, "optim/muon_lr": 0.00316, "optim/adamw_lr": 9.479999999999999e-05, "perf/tokens_per_sec": 2027636.8923865692, "perf/iters_per_sec": 0.9668526136334272, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0342838048934937, "data/tokens_consumed": 1658847232, "data/tokens_consumed_B": 1.658847232, "train/loss_slope": -0.007080830801891375} {"step": 800, "timestamp": 1778195389.1975868, "grad/layer_0/attn": 0.02541128732264042, "grad/layer_0/mlp": 0.03620198741555214, "grad/layer_0/attn_mlp_ratio": 0.7019307244311902, "grad/layer_4/attn": 0.013973573222756386, "grad/layer_4/mlp": 0.04265674576163292, "grad/layer_4/attn_mlp_ratio": 0.32758179135564913, "grad/layer_8/attn": 0.06599562615156174, "grad/layer_8/mlp": 0.04248671978712082, "grad/layer_8/attn_mlp_ratio": 1.5533236344650665, "grad/layer_12/attn": 0.008174004964530468, "grad/layer_12/mlp": 0.0038265653420239687, "grad/layer_12/attn_mlp_ratio": 2.1361205207056453, "grad/layer_16/attn": 0.014010540209710598, "grad/layer_16/mlp": 0.00589338131248951, "grad/layer_16/attn_mlp_ratio": 2.377334713144283, "grad/layer_20/attn": 0.05034959316253662, "grad/layer_20/mlp": 0.017839137464761734, "grad/layer_20/attn_mlp_ratio": 2.8224230560334886, "grad/layer_24/attn": 0.016816122457385063, "grad/layer_24/mlp": 0.023756256327033043, "grad/layer_24/attn_mlp_ratio": 0.7078607906525807, "grad/layer_27/attn": 0.007688023615628481, "grad/layer_27/mlp": 0.011772002093493938, "grad/layer_27/attn_mlp_ratio": 0.6530769778379281} {"step": 800, "timestamp": 1778195389.2134526, "train/loss": 5.934750747680664, "train/z_loss": 0.0009873937466181814, "train/perplexity": 377.9457805680128, "train/grad_norm": 0.423828125, "optim/muon_lr": 0.0032, "optim/adamw_lr": 9.599999999999999e-05, "perf/tokens_per_sec": 2024828.709660704, "perf/iters_per_sec": 0.9655135677627106, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0357182264328002, "data/tokens_consumed": 1679818752, "data/tokens_consumed_B": 1.679818752, "train/loss_slope": -0.00706279948173392} {"step": 810, "timestamp": 1778195399.5622072, "train/loss": 5.95963683128357, "train/z_loss": 0.0009896368836052715, "train/perplexity": 387.46938192463807, "train/grad_norm": 0.40234375, "optim/muon_lr": 0.0032400000000000003, "optim/adamw_lr": 9.719999999999999e-05, "perf/tokens_per_sec": 2027929.6217667677, "perf/iters_per_sec": 0.9669921978792037, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0341345071792603, "data/tokens_consumed": 1700790272, "data/tokens_consumed_B": 1.700790272, "train/loss_slope": -0.007037635451400793} {"step": 820, "timestamp": 1778195409.9159403, "train/loss": 5.870762634277344, "train/z_loss": 0.0009958260576240717, "train/perplexity": 354.5192457265839, "train/grad_norm": 0.640625, "optim/muon_lr": 0.0032800000000000004, "optim/adamw_lr": 9.839999999999999e-05, "perf/tokens_per_sec": 2026432.4589689204, "perf/iters_per_sec": 0.9662782950253107, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.034898543357849, "data/tokens_consumed": 1721761792, "data/tokens_consumed_B": 1.721761792, "train/loss_slope": -0.007015839092957974} {"step": 825, "timestamp": 1778195415.6957698, "eos/sharpness": 116.34531021118161, "eos/L0_probe": 5.546693325042725, "eos/L_plus": 5.9119157791137695, "eos/L_minus": 6.344923973083496, "eos/grad_norm": 0.7244237661361694, "eos/embed_grad_frac": 0.13554489612579346, "eos/time_s": 0.6105337142944336} {"step": 825, "timestamp": 1778195417.0784197, "geo/rankme_last": 264.0623474121094, "geo/layer_0/stable_rank_q_proj": 53.78622817993164, "geo/layer_0/stable_rank_k_proj": 46.24275207519531, "geo/layer_0/stable_rank_o_proj": 67.93403625488281, "geo/layer_0/stable_rank_gate_proj": 162.0727996826172, "geo/layer_0/stable_rank_down_proj": 55.56963348388672, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.04137041047215462, "geo/layer_0/attn_entropy_mean": 6.978416442871094, "geo/layer_0/attn_entropy_std": 0.011620637029409409, "geo/layer_7/stable_rank_q_proj": 27.30130958557129, "geo/layer_7/stable_rank_k_proj": 28.884288787841797, "geo/layer_7/stable_rank_o_proj": 117.58027648925781, "geo/layer_7/stable_rank_gate_proj": 172.97262573242188, "geo/layer_7/stable_rank_down_proj": 200.96949768066406, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.8213830590248108, "geo/layer_7/attn_entropy_mean": 6.579424858093262, "geo/layer_7/attn_entropy_std": 0.3557809293270111, "geo/layer_14/stable_rank_q_proj": 33.98249435424805, "geo/layer_14/stable_rank_k_proj": 22.351980209350586, "geo/layer_14/stable_rank_o_proj": 92.41644287109375, "geo/layer_14/stable_rank_gate_proj": 193.6184539794922, "geo/layer_14/stable_rank_down_proj": 165.16281127929688, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.8162226676940918, "geo/layer_14/attn_entropy_mean": 6.644399166107178, "geo/layer_14/attn_entropy_std": 0.4805825352668762, "geo/layer_21/stable_rank_q_proj": 64.16857147216797, "geo/layer_21/stable_rank_k_proj": 36.45558547973633, "geo/layer_21/stable_rank_o_proj": 65.6203384399414, "geo/layer_21/stable_rank_gate_proj": 150.22743225097656, "geo/layer_21/stable_rank_down_proj": 155.83738708496094, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.785084068775177, "geo/layer_21/attn_entropy_mean": 6.670372009277344, "geo/layer_21/attn_entropy_std": 0.23224952816963196, "geo/layer_27/stable_rank_q_proj": 59.537532806396484, "geo/layer_27/stable_rank_k_proj": 20.453643798828125, "geo/layer_27/stable_rank_o_proj": 75.09014129638672, "geo/layer_27/stable_rank_gate_proj": 82.63288879394531, "geo/layer_27/stable_rank_down_proj": 60.929264068603516, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.3167335093021393, "geo/layer_27/attn_entropy_mean": 6.275091648101807, "geo/layer_27/attn_entropy_std": 0.28561273217201233, "attnres/final_alpha/block_0": 0.16738563776016235, "attnres/block_norm/0": 0.510870099067688, "attnres/final_alpha/block_1": 0.010088224895298481, "attnres/block_norm/1": 2892.88916015625, "attnres/final_alpha/block_2": 0.1235022246837616, "attnres/block_norm/2": 1444.55419921875, "attnres/final_alpha/block_3": 0.05722574144601822, "attnres/block_norm/3": 829.21240234375, "attnres/final_alpha/block_4": 0.07061878591775894, "attnres/block_norm/4": 1213.385986328125, "attnres/final_alpha/block_5": 0.4111202359199524, "attnres/block_norm/5": 824.364013671875, "attnres/final_alpha/block_6": 0.1600591540336609, "attnres/block_norm/6": 1057.355224609375, "geo/tier1_time_s": 1.362354040145874, "geo/step": 825.0, "geo/rankme_slope": 0.31713768894577915} {"step": 830, "timestamp": 1778195422.2548552, "train/loss": 5.79870491027832, "train/z_loss": 0.0010024503106251359, "train/perplexity": 329.87206922368205, "train/grad_norm": 0.462890625, "optim/muon_lr": 0.00332, "optim/adamw_lr": 9.96e-05, "perf/tokens_per_sec": 1700327.6541214446, "perf/iters_per_sec": 0.8107794066054557, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.2333811044692993, "data/tokens_consumed": 1742733312, "data/tokens_consumed_B": 1.742733312, "train/loss_slope": -0.006995722592546835} {"step": 840, "timestamp": 1778195432.6027262, "train/loss": 5.794624280929566, "train/z_loss": 0.0010041099623776973, "train/perplexity": 328.5287262830477, "train/grad_norm": 0.474609375, "optim/muon_lr": 0.00336, "optim/adamw_lr": 0.0001008, "perf/tokens_per_sec": 2027999.1468955365, "perf/iters_per_sec": 0.9670253500440295, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0340990543365478, "data/tokens_consumed": 1763704832, "data/tokens_consumed_B": 1.763704832, "train/loss_slope": -0.006971585949626965} {"step": 850, "timestamp": 1778195442.9427533, "grad/layer_0/attn": 0.02203211560845375, "grad/layer_0/mlp": 0.03154486045241356, "grad/layer_0/attn_mlp_ratio": 0.6984375655059927, "grad/layer_4/attn": 0.011745225638151169, "grad/layer_4/mlp": 0.03306403011083603, "grad/layer_4/attn_mlp_ratio": 0.35522667875805175, "grad/layer_8/attn": 0.03577983379364014, "grad/layer_8/mlp": 0.0372784286737442, "grad/layer_8/attn_mlp_ratio": 0.9597999425029536, "grad/layer_12/attn": 0.004397443495690823, "grad/layer_12/mlp": 0.0034704268909990788, "grad/layer_12/attn_mlp_ratio": 1.267118860905606, "grad/layer_16/attn": 0.009063204750418663, "grad/layer_16/mlp": 0.005550582893192768, "grad/layer_16/attn_mlp_ratio": 1.632838345365482, "grad/layer_20/attn": 0.024102432653307915, "grad/layer_20/mlp": 0.01707388088107109, "grad/layer_20/attn_mlp_ratio": 1.411655187243545, "grad/layer_24/attn": 0.01733773574233055, "grad/layer_24/mlp": 0.01834193989634514, "grad/layer_24/attn_mlp_ratio": 0.9452509246996398, "grad/layer_27/attn": 0.009253659285604954, "grad/layer_27/mlp": 0.010353868827223778, "grad/layer_27/attn_mlp_ratio": 0.8937392727924143} {"step": 850, "timestamp": 1778195442.9586823, "train/loss": 5.740339231491089, "train/z_loss": 0.0010096356854774058, "train/perplexity": 311.1699517257147, "train/grad_norm": 0.3671875, "optim/muon_lr": 0.0034000000000000002, "optim/adamw_lr": 0.000102, "perf/tokens_per_sec": 2026482.6462505537, "perf/iters_per_sec": 0.9663022261860627, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0348729133605956, "data/tokens_consumed": 1784676352, "data/tokens_consumed_B": 1.784676352, "train/loss_slope": -0.006947857112354578} {"step": 860, "timestamp": 1778195453.3128157, "train/loss": 5.673747777938843, "train/z_loss": 0.0010166491614654661, "train/perplexity": 291.1235588574761, "train/grad_norm": 0.51953125, "optim/muon_lr": 0.00344, "optim/adamw_lr": 0.00010319999999999999, "perf/tokens_per_sec": 2026834.447168317, "perf/iters_per_sec": 0.9664699779359421, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0346932888031006, "data/tokens_consumed": 1805647872, "data/tokens_consumed_B": 1.805647872, "train/loss_slope": -0.00692550129095076} {"step": 870, "timestamp": 1778195463.6602023, "train/loss": 5.689625549316406, "train/z_loss": 0.0010193878784775733, "train/perplexity": 295.78284381348766, "train/grad_norm": 0.48828125, "optim/muon_lr": 0.00348, "optim/adamw_lr": 0.00010439999999999999, "perf/tokens_per_sec": 2028185.537033686, "perf/iters_per_sec": 0.9671142277878217, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.034004020690918, "data/tokens_consumed": 1826619392, "data/tokens_consumed_B": 1.826619392, "train/loss_slope": -0.006898113557928014} {"step": 880, "timestamp": 1778195474.0100071, "train/loss": 5.677104902267456, "train/z_loss": 0.0010249008075334132, "train/perplexity": 292.102539199462, "train/grad_norm": 0.70703125, "optim/muon_lr": 0.0035199999999999997, "optim/adamw_lr": 0.00010559999999999998, "perf/tokens_per_sec": 2027224.1669778954, "perf/iters_per_sec": 0.966655810822437, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0344943761825562, "data/tokens_consumed": 1847590912, "data/tokens_consumed_B": 1.847590912, "train/loss_slope": -0.00686830190854045} {"step": 890, "timestamp": 1778195484.3851585, "train/loss": 5.629491376876831, "train/z_loss": 0.0010281812283210456, "train/perplexity": 278.52041967494426, "train/grad_norm": 0.57421875, "optim/muon_lr": 0.00356, "optim/adamw_lr": 0.00010679999999999998, "perf/tokens_per_sec": 2022586.0927586434, "perf/iters_per_sec": 0.9644442046921937, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0368666172027587, "data/tokens_consumed": 1868562432, "data/tokens_consumed_B": 1.868562432, "train/loss_slope": -0.006838890469721395} {"step": 900, "timestamp": 1778195494.7238572, "grad/layer_0/attn": 0.023919494822621346, "grad/layer_0/mlp": 0.04057984799146652, "grad/layer_0/attn_mlp_ratio": 0.5894426900935428, "grad/layer_4/attn": 0.018127813935279846, "grad/layer_4/mlp": 0.050984568893909454, "grad/layer_4/attn_mlp_ratio": 0.3555549118684394, "grad/layer_8/attn": 0.06986858695745468, "grad/layer_8/mlp": 0.04848341643810272, "grad/layer_8/attn_mlp_ratio": 1.441082166776459, "grad/layer_12/attn": 0.010870488360524178, "grad/layer_12/mlp": 0.006198775488883257, "grad/layer_12/attn_mlp_ratio": 1.7536508951895378, "grad/layer_16/attn": 0.014131862670183182, "grad/layer_16/mlp": 0.009721917100250721, "grad/layer_16/attn_mlp_ratio": 1.4536086225686773, "grad/layer_20/attn": 0.034781500697135925, "grad/layer_20/mlp": 0.029076090082526207, "grad/layer_20/attn_mlp_ratio": 1.1962234426566225, "grad/layer_24/attn": 0.03033963032066822, "grad/layer_24/mlp": 0.029252631589770317, "grad/layer_24/attn_mlp_ratio": 1.0371590030745175, "grad/layer_27/attn": 0.023699328303337097, "grad/layer_27/mlp": 0.018955957144498825, "grad/layer_27/attn_mlp_ratio": 1.2502311541251674} {"step": 900, "timestamp": 1778195495.3256521, "eos/sharpness": 131.268835067749, "eos/L0_probe": 5.2981696128845215, "eos/L_plus": 5.617563724517822, "eos/L_minus": 6.291463851928711, "eos/grad_norm": 0.5431841611862183, "eos/embed_grad_frac": 0.19050024449825287, "eos/time_s": 0.5990011692047119} {"step": 900, "timestamp": 1778195495.3447652, "train/loss": 5.608985376358032, "train/z_loss": 0.0010308771394193172, "train/perplexity": 272.86723995136003, "train/grad_norm": 0.54296875, "optim/muon_lr": 0.0036, "optim/adamw_lr": 0.00010799999999999998, "perf/tokens_per_sec": 1915050.5997582872, "perf/iters_per_sec": 0.9131672858039318, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.095089602470398, "data/tokens_consumed": 1889533952, "data/tokens_consumed_B": 1.889533952, "train/loss_slope": -0.006807944443522412} {"step": 900, "timestamp": 1778195496.703341, "geo/rankme_last": 278.3182067871094, "geo/layer_0/stable_rank_q_proj": 53.8431396484375, "geo/layer_0/stable_rank_k_proj": 46.397544860839844, "geo/layer_0/stable_rank_o_proj": 68.11227416992188, "geo/layer_0/stable_rank_gate_proj": 162.35610961914062, "geo/layer_0/stable_rank_down_proj": 55.36397171020508, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.04064986854791641, "geo/layer_0/attn_entropy_mean": 6.974413871765137, "geo/layer_0/attn_entropy_std": 0.014509739354252815, "geo/layer_7/stable_rank_q_proj": 27.304298400878906, "geo/layer_7/stable_rank_k_proj": 28.887399673461914, "geo/layer_7/stable_rank_o_proj": 117.72611236572266, "geo/layer_7/stable_rank_gate_proj": 173.05514526367188, "geo/layer_7/stable_rank_down_proj": 201.24668884277344, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.8083398938179016, "geo/layer_7/attn_entropy_mean": 6.518953323364258, "geo/layer_7/attn_entropy_std": 0.5511161684989929, "geo/layer_14/stable_rank_q_proj": 33.99129104614258, "geo/layer_14/stable_rank_k_proj": 22.383441925048828, "geo/layer_14/stable_rank_o_proj": 92.4376449584961, "geo/layer_14/stable_rank_gate_proj": 193.69577026367188, "geo/layer_14/stable_rank_down_proj": 165.4730987548828, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.8051648139953613, "geo/layer_14/attn_entropy_mean": 6.677340507507324, "geo/layer_14/attn_entropy_std": 0.42525598406791687, "geo/layer_21/stable_rank_q_proj": 64.095703125, "geo/layer_21/stable_rank_k_proj": 36.445404052734375, "geo/layer_21/stable_rank_o_proj": 65.63188934326172, "geo/layer_21/stable_rank_gate_proj": 150.13494873046875, "geo/layer_21/stable_rank_down_proj": 155.64010620117188, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.7430872321128845, "geo/layer_21/attn_entropy_mean": 6.367825508117676, "geo/layer_21/attn_entropy_std": 0.5107284188270569, "geo/layer_27/stable_rank_q_proj": 59.6074104309082, "geo/layer_27/stable_rank_k_proj": 20.475378036499023, "geo/layer_27/stable_rank_o_proj": 75.16582489013672, "geo/layer_27/stable_rank_gate_proj": 82.72101593017578, "geo/layer_27/stable_rank_down_proj": 61.02186965942383, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.2765231132507324, "geo/layer_27/attn_entropy_mean": 6.319273948669434, "geo/layer_27/attn_entropy_std": 0.25348156690597534, "attnres/final_alpha/block_0": 0.1744382679462433, "attnres/block_norm/0": 0.5193371176719666, "attnres/final_alpha/block_1": 0.0103888725861907, "attnres/block_norm/1": 2880.44140625, "attnres/final_alpha/block_2": 0.12358546257019043, "attnres/block_norm/2": 1335.787109375, "attnres/final_alpha/block_3": 0.05557618662714958, "attnres/block_norm/3": 862.949951171875, "attnres/final_alpha/block_4": 0.0834004282951355, "attnres/block_norm/4": 1120.188232421875, "attnres/final_alpha/block_5": 0.388110876083374, "attnres/block_norm/5": 847.042724609375, "attnres/final_alpha/block_6": 0.16449987888336182, "attnres/block_norm/6": 1067.807861328125, "geo/tier1_time_s": 1.354616403579712, "geo/step": 900.0, "geo/rankme_slope": 0.3246209359081674} {"step": 910, "timestamp": 1778195507.048934, "train/loss": 5.571196746826172, "train/z_loss": 0.0010343157569877802, "train/perplexity": 262.7483544200621, "train/grad_norm": 0.51171875, "optim/muon_lr": 0.00364, "optim/adamw_lr": 0.00010919999999999998, "perf/tokens_per_sec": 1792383.725777808, "perf/iters_per_sec": 0.8546751622094193, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.170035171508789, "data/tokens_consumed": 1910505472, "data/tokens_consumed_B": 1.910505472, "train/loss_slope": -0.006776849053293816} {"step": 920, "timestamp": 1778195517.3993251, "train/loss": 5.54195384979248, "train/z_loss": 0.0010388672933913767, "train/perplexity": 255.17608845202855, "train/grad_norm": 0.490234375, "optim/muon_lr": 0.00368, "optim/adamw_lr": 0.00011039999999999999, "perf/tokens_per_sec": 2027423.5458294754, "perf/iters_per_sec": 0.9667508820674302, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0343926429748536, "data/tokens_consumed": 1931476992, "data/tokens_consumed_B": 1.931476992, "train/loss_slope": -0.006745072960629465} {"step": 930, "timestamp": 1778195527.7443764, "train/loss": 5.519565725326538, "train/z_loss": 0.0010461926227435469, "train/perplexity": 249.52665055192298, "train/grad_norm": 0.49609375, "optim/muon_lr": 0.00372, "optim/adamw_lr": 0.00011159999999999999, "perf/tokens_per_sec": 2028140.0819898546, "perf/iters_per_sec": 0.9670925531338952, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0340271949768067, "data/tokens_consumed": 1952448512, "data/tokens_consumed_B": 1.952448512, "train/loss_slope": -0.006712254706332696} {"step": 940, "timestamp": 1778195538.1021593, "train/loss": 5.523992586135864, "train/z_loss": 0.001048858172725886, "train/perplexity": 250.63371891296072, "train/grad_norm": 0.54296875, "optim/muon_lr": 0.00376, "optim/adamw_lr": 0.00011279999999999999, "perf/tokens_per_sec": 2025991.8513442588, "perf/iters_per_sec": 0.9660681969376844, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.035123610496521, "data/tokens_consumed": 1973420032, "data/tokens_consumed_B": 1.973420032, "train/loss_slope": -0.006676758804086341} {"step": 950, "timestamp": 1778195548.4461122, "grad/layer_0/attn": 0.027295399457216263, "grad/layer_0/mlp": 0.048263438045978546, "grad/layer_0/attn_mlp_ratio": 0.565550248920478, "grad/layer_4/attn": 0.023873552680015564, "grad/layer_4/mlp": 0.06406373530626297, "grad/layer_4/attn_mlp_ratio": 0.3726531481285066, "grad/layer_8/attn": 0.09034299105405807, "grad/layer_8/mlp": 0.05507299676537514, "grad/layer_8/attn_mlp_ratio": 1.6404226426046824, "grad/layer_12/attn": 0.010676679201424122, "grad/layer_12/mlp": 0.008783970028162003, "grad/layer_12/attn_mlp_ratio": 1.2154730771674607, "grad/layer_16/attn": 0.01795991137623787, "grad/layer_16/mlp": 0.014317899011075497, "grad/layer_16/attn_mlp_ratio": 1.2543677837724896, "grad/layer_20/attn": 0.09221263974905014, "grad/layer_20/mlp": 0.04622888192534447, "grad/layer_20/attn_mlp_ratio": 1.994697593995364, "grad/layer_24/attn": 0.04597951099276543, "grad/layer_24/mlp": 0.03829871863126755, "grad/layer_24/attn_mlp_ratio": 1.2005495879742625, "grad/layer_27/attn": 0.0338035523891449, "grad/layer_27/mlp": 0.024972334504127502, "grad/layer_27/attn_mlp_ratio": 1.3536400550855083} {"step": 950, "timestamp": 1778195548.461783, "train/loss": 5.525188350677491, "train/z_loss": 0.001052258617710322, "train/perplexity": 250.93359708308233, "train/grad_norm": 0.625, "optim/muon_lr": 0.0038, "optim/adamw_lr": 0.00011399999999999999, "perf/tokens_per_sec": 2025934.5024347347, "perf/iters_per_sec": 0.9660408508466409, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0351529121398926, "data/tokens_consumed": 1994391552, "data/tokens_consumed_B": 1.994391552, "train/loss_slope": -0.006639056571592019} {"step": 960, "timestamp": 1778195558.8362377, "train/loss": 5.3947522163391115, "train/z_loss": 0.0010596287436783313, "train/perplexity": 220.24756658634448, "train/grad_norm": 0.54296875, "optim/muon_lr": 0.00384, "optim/adamw_lr": 0.0001152, "perf/tokens_per_sec": 2022625.9040745548, "perf/iters_per_sec": 0.9644631882069372, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0368462085723877, "data/tokens_consumed": 2015363072, "data/tokens_consumed_B": 2.015363072, "train/loss_slope": -0.006607681272307271} {"step": 970, "timestamp": 1778195569.2010417, "train/loss": 5.450464344024658, "train/z_loss": 0.0010577118722721935, "train/perplexity": 232.86627086819541, "train/grad_norm": 0.5859375, "optim/muon_lr": 0.00388, "optim/adamw_lr": 0.0001164, "perf/tokens_per_sec": 2024909.3029804572, "perf/iters_per_sec": 0.9655519976522718, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0356770038604737, "data/tokens_consumed": 2036334592, "data/tokens_consumed_B": 2.036334592, "train/loss_slope": -0.006570656319336535} {"step": 975, "timestamp": 1778195574.9833598, "eos/sharpness": 71.37503623962401, "eos/L0_probe": 5.079431056976318, "eos/L_plus": 5.362480640411377, "eos/L_minus": 5.5101318359375, "eos/grad_norm": 0.6351354122161865, "eos/embed_grad_frac": 0.20374280214309692, "eos/time_s": 0.6158058643341064} {"step": 975, "timestamp": 1778195576.3647892, "geo/rankme_last": 291.5182800292969, "geo/layer_0/stable_rank_q_proj": 53.928192138671875, "geo/layer_0/stable_rank_k_proj": 46.58782196044922, "geo/layer_0/stable_rank_o_proj": 68.15638732910156, "geo/layer_0/stable_rank_gate_proj": 162.59169006347656, "geo/layer_0/stable_rank_down_proj": 55.1675910949707, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0465792752802372, "geo/layer_0/attn_entropy_mean": 6.969626426696777, "geo/layer_0/attn_entropy_std": 0.019129367545247078, "geo/layer_7/stable_rank_q_proj": 27.298250198364258, "geo/layer_7/stable_rank_k_proj": 28.899869918823242, "geo/layer_7/stable_rank_o_proj": 117.90785217285156, "geo/layer_7/stable_rank_gate_proj": 173.1630096435547, "geo/layer_7/stable_rank_down_proj": 201.220703125, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.7963980436325073, "geo/layer_7/attn_entropy_mean": 6.4148030281066895, "geo/layer_7/attn_entropy_std": 0.7772610783576965, "geo/layer_14/stable_rank_q_proj": 34.015533447265625, "geo/layer_14/stable_rank_k_proj": 22.41394805908203, "geo/layer_14/stable_rank_o_proj": 92.39257049560547, "geo/layer_14/stable_rank_gate_proj": 193.53794860839844, "geo/layer_14/stable_rank_down_proj": 165.71041870117188, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.7972845435142517, "geo/layer_14/attn_entropy_mean": 6.712738037109375, "geo/layer_14/attn_entropy_std": 0.36860647797584534, "geo/layer_21/stable_rank_q_proj": 64.0089340209961, "geo/layer_21/stable_rank_k_proj": 36.410888671875, "geo/layer_21/stable_rank_o_proj": 65.64147186279297, "geo/layer_21/stable_rank_gate_proj": 149.99014282226562, "geo/layer_21/stable_rank_down_proj": 155.51480102539062, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.7143044471740723, "geo/layer_21/attn_entropy_mean": 6.229790210723877, "geo/layer_21/attn_entropy_std": 0.6357182264328003, "geo/layer_27/stable_rank_q_proj": 59.71835708618164, "geo/layer_27/stable_rank_k_proj": 20.510730743408203, "geo/layer_27/stable_rank_o_proj": 75.31448364257812, "geo/layer_27/stable_rank_gate_proj": 82.90149688720703, "geo/layer_27/stable_rank_down_proj": 61.092342376708984, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.2478836476802826, "geo/layer_27/attn_entropy_mean": 6.30677604675293, "geo/layer_27/attn_entropy_std": 0.27738988399505615, "attnres/final_alpha/block_0": 0.18058687448501587, "attnres/block_norm/0": 0.5278541445732117, "attnres/final_alpha/block_1": 0.01073392666876316, "attnres/block_norm/1": 2845.94384765625, "attnres/final_alpha/block_2": 0.12007743120193481, "attnres/block_norm/2": 1266.83642578125, "attnres/final_alpha/block_3": 0.05382803827524185, "attnres/block_norm/3": 893.2042236328125, "attnres/final_alpha/block_4": 0.08833390474319458, "attnres/block_norm/4": 1084.9796142578125, "attnres/final_alpha/block_5": 0.3725299537181854, "attnres/block_norm/5": 873.387451171875, "attnres/final_alpha/block_6": 0.17390988767147064, "attnres/block_norm/6": 1063.4189453125, "geo/tier1_time_s": 1.3613710403442383, "geo/step": 975.0, "geo/rankme_slope": 0.32507840658893516} {"step": 980, "timestamp": 1778195581.542397, "train/loss": 5.393144321441651, "train/z_loss": 0.0010654839454218746, "train/perplexity": 219.89371620120133, "train/grad_norm": 0.6328125, "optim/muon_lr": 0.00392, "optim/adamw_lr": 0.0001176, "perf/tokens_per_sec": 1700256.6618550583, "perf/iters_per_sec": 0.8107455548548976, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.2334326028823852, "data/tokens_consumed": 2057306112, "data/tokens_consumed_B": 2.057306112, "train/loss_slope": -0.0065353221489901} {"step": 990, "timestamp": 1778195591.890626, "train/loss": 5.354728317260742, "train/z_loss": 0.0010695718578062952, "train/perplexity": 211.60647871490784, "train/grad_norm": 0.609375, "optim/muon_lr": 0.00396, "optim/adamw_lr": 0.0001188, "perf/tokens_per_sec": 2027546.6410799564, "perf/iters_per_sec": 0.9668095784568579, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0343298435211181, "data/tokens_consumed": 2078277632, "data/tokens_consumed_B": 2.078277632, "train/loss_slope": -0.006500465816940733} {"step": 1000, "timestamp": 1778195602.2289906, "grad/layer_0/attn": 0.02392621710896492, "grad/layer_0/mlp": 0.04328641667962074, "grad/layer_0/attn_mlp_ratio": 0.5527419197291791, "grad/layer_4/attn": 0.01844082772731781, "grad/layer_4/mlp": 0.05137455090880394, "grad/layer_4/attn_mlp_ratio": 0.3589486888976927, "grad/layer_8/attn": 0.07492772489786148, "grad/layer_8/mlp": 0.05079341307282448, "grad/layer_8/attn_mlp_ratio": 1.4751464848979543, "grad/layer_12/attn": 0.007939974777400494, "grad/layer_12/mlp": 0.007113012485206127, "grad/layer_12/attn_mlp_ratio": 1.1162604708326131, "grad/layer_16/attn": 0.014754366129636765, "grad/layer_16/mlp": 0.011959332972764969, "grad/layer_16/attn_mlp_ratio": 1.2337114486122085, "grad/layer_20/attn": 0.07101410627365112, "grad/layer_20/mlp": 0.03429451212286949, "grad/layer_20/attn_mlp_ratio": 2.0707134077940013, "grad/layer_24/attn": 0.021153580397367477, "grad/layer_24/mlp": 0.028159979730844498, "grad/layer_24/attn_mlp_ratio": 0.7511930237321159, "grad/layer_27/attn": 0.015755459666252136, "grad/layer_27/mlp": 0.015868522226810455, "grad/layer_27/attn_mlp_ratio": 0.9928750353542815} {"step": 1000, "timestamp": 1778195602.2450213, "train/loss": 5.273845911026001, "train/z_loss": 0.001074628415517509, "train/perplexity": 195.16510859031476, "train/grad_norm": 0.498046875, "optim/muon_lr": 0.004, "optim/adamw_lr": 0.00011999999999999999, "perf/tokens_per_sec": 2026374.6649745372, "perf/iters_per_sec": 0.9662507367012678, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.034928059577942, "data/tokens_consumed": 2099249152, "data/tokens_consumed_B": 2.099249152, "train/loss_slope": -0.006491773271231618} {"step": 1000, "timestamp": 1778195609.2242842, "geo/ww_alpha_mean": 8.024458629247453, "geo/ww_alpha_std": 6.668408906321332, "geo/ww_alpha_min": 1.8898676061568809, "geo/ww_alpha_max": 48.40077361800202, "geo/ww_alpha_healthy_frac": 0.29441624365482233, "geo/ww_alpha_by_type/q_proj": 4.427637976769004, "geo/ww_alpha_by_type/k_proj": 4.121146343659692, "geo/ww_alpha_by_type/v_proj": 6.108936626634432, "geo/ww_alpha_by_type/o_proj": 5.233694079629932, "geo/ww_alpha_by_type/gate_proj": 14.21760865164134, "geo/ww_alpha_by_type/up_proj": 11.745615886008471, "geo/ww_alpha_by_type/down_proj": 10.354034917368258, "geo/twonn_id/layer_0": 0.6779094338417053, "geo/twonn_id/layer_7": 2.127223491668701, "geo/twonn_id/layer_14": 1.9810363054275513, "geo/twonn_id/layer_21": 4.125124454498291, "geo/twonn_id/layer_27": 5.894241809844971, "geo/tier2_time_s": 6.971423149108887} {"step": 1000, "timestamp": 1778195609.991476, "eoc/jacobian_sigma/layer_0/attn": 1077.47607421875, "eoc/jacobian_sigma/layer_0/mlp": 1250.7191162109375, "eoc/jacobian_sigma/layer_0": 1250.7191162109375, "eoc/jacobian_sigma/layer_7/attn": 1.0138472318649292, "eoc/jacobian_sigma/layer_7/mlp": 1.2672176361083984, "eoc/jacobian_sigma/layer_7": 1.2672176361083984, "eoc/jacobian_sigma/layer_14/attn": 1.1975528001785278, "eoc/jacobian_sigma/layer_14/mlp": 1.919917106628418, "eoc/jacobian_sigma/layer_14": 1.919917106628418, "eoc/jacobian_sigma/layer_21/attn": 1.0101299285888672, "eoc/jacobian_sigma/layer_21/mlp": 3.3286523818969727, "eoc/jacobian_sigma/layer_21": 3.3286523818969727, "eoc/jacobian_sigma/layer_27/attn": 1.9352701902389526, "eoc/jacobian_sigma/layer_27/mlp": 10.106485366821289, "eoc/jacobian_sigma/layer_27": 10.106485366821289, "eoc/layer0_sigma": 1250.7191162109375, "eoc/sigma_max": 10.106485366821289, "eoc/sigma_min": 1.2672176361083984, "eoc/sigma_mean": 4.1555681228637695, "eoc/time_s": 0.7605774402618408} {"step": 1010, "timestamp": 1778195620.366289, "train/loss": 5.286013889312744, "train/z_loss": 0.001077764411456883, "train/perplexity": 197.55438021711316, "train/grad_norm": 0.6484375, "optim/muon_lr": 0.00404, "optim/adamw_lr": 0.00012119999999999999, "perf/tokens_per_sec": 1157709.95313338, "perf/iters_per_sec": 0.552039124075594, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.811465811729431, "data/tokens_consumed": 2120220672, "data/tokens_consumed_B": 2.120220672, "train/loss_slope": -0.006488172027623371} {"step": 1020, "timestamp": 1778195630.7213495, "train/loss": 5.2795093059539795, "train/z_loss": 0.0010826739482581616, "train/perplexity": 196.27354146031186, "train/grad_norm": 0.7265625, "optim/muon_lr": 0.004079999999999999, "optim/adamw_lr": 0.0001224, "perf/tokens_per_sec": 2026222.6797036761, "perf/iters_per_sec": 0.9661782644766217, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0350056886672974, "data/tokens_consumed": 2141192192, "data/tokens_consumed_B": 2.141192192, "train/loss_slope": -0.006478470854554633} {"step": 1030, "timestamp": 1778195641.0813587, "train/loss": 5.29634575843811, "train/z_loss": 0.0010814758483320475, "train/perplexity": 199.60606684652413, "train/grad_norm": 0.640625, "optim/muon_lr": 0.0041199999999999995, "optim/adamw_lr": 0.0001236, "perf/tokens_per_sec": 2025659.5620812608, "perf/iters_per_sec": 0.9659097490698151, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.035293412208557, "data/tokens_consumed": 2162163712, "data/tokens_consumed_B": 2.162163712, "train/loss_slope": -0.0064613940336809405} {"step": 1040, "timestamp": 1778195651.4414194, "train/loss": 5.242304801940918, "train/z_loss": 0.0010899137938395142, "train/perplexity": 189.10545112745478, "train/grad_norm": 0.6171875, "optim/muon_lr": 0.00416, "optim/adamw_lr": 0.00012479999999999997, "perf/tokens_per_sec": 2025285.2723687468, "perf/iters_per_sec": 0.9657312738269552, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.035484743118286, "data/tokens_consumed": 2183135232, "data/tokens_consumed_B": 2.183135232, "train/loss_slope": -0.006441045025837804} {"step": 1050, "timestamp": 1778195661.7857544, "grad/layer_0/attn": 0.026849552989006042, "grad/layer_0/mlp": 0.047869857400655746, "grad/layer_0/attn_mlp_ratio": 0.5608864197817643, "grad/layer_4/attn": 0.024045223370194435, "grad/layer_4/mlp": 0.059837061911821365, "grad/layer_4/attn_mlp_ratio": 0.40184498639729466, "grad/layer_8/attn": 0.06981369853019714, "grad/layer_8/mlp": 0.06019037961959839, "grad/layer_8/attn_mlp_ratio": 1.1598813440857116, "grad/layer_12/attn": 0.016516728326678276, "grad/layer_12/mlp": 0.011991934850811958, "grad/layer_12/attn_mlp_ratio": 1.3773197064882303, "grad/layer_16/attn": 0.05003301054239273, "grad/layer_16/mlp": 0.01839972659945488, "grad/layer_16/attn_mlp_ratio": 2.719225745014736, "grad/layer_20/attn": 0.15864010155200958, "grad/layer_20/mlp": 0.05273871868848801, "grad/layer_20/attn_mlp_ratio": 3.0080385947228985, "grad/layer_24/attn": 0.03108826093375683, "grad/layer_24/mlp": 0.038219600915908813, "grad/layer_24/attn_mlp_ratio": 0.8134114461534127, "grad/layer_27/attn": 0.028938904404640198, "grad/layer_27/mlp": 0.02158423326909542, "grad/layer_27/attn_mlp_ratio": 1.340742750033239} {"step": 1050, "timestamp": 1778195662.422214, "eos/sharpness": 90.87276458740233, "eos/L0_probe": 4.872642517089844, "eos/L_plus": 5.368581295013428, "eos/L_minus": 5.285431385040283, "eos/grad_norm": 0.7126367688179016, "eos/embed_grad_frac": 0.09916304796934128, "eos/time_s": 0.633486270904541} {"step": 1050, "timestamp": 1778195662.4438727, "train/loss": 5.1778162002563475, "train/z_loss": 0.0010955387959256768, "train/perplexity": 177.29521069102432, "train/grad_norm": 0.7109375, "optim/muon_lr": 0.0042, "optim/adamw_lr": 0.00012599999999999997, "perf/tokens_per_sec": 1906993.515208351, "perf/iters_per_sec": 0.9093253685037379, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0997163772583007, "data/tokens_consumed": 2204106752, "data/tokens_consumed_B": 2.204106752, "train/loss_slope": -0.006418205769141444} {"step": 1050, "timestamp": 1778195663.8105228, "geo/rankme_last": 301.30218505859375, "geo/layer_0/stable_rank_q_proj": 54.01841735839844, "geo/layer_0/stable_rank_k_proj": 46.81117630004883, "geo/layer_0/stable_rank_o_proj": 68.19337463378906, "geo/layer_0/stable_rank_gate_proj": 162.94970703125, "geo/layer_0/stable_rank_down_proj": 54.906646728515625, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.04192690551280975, "geo/layer_0/attn_entropy_mean": 6.96727991104126, "geo/layer_0/attn_entropy_std": 0.023860188201069832, "geo/layer_7/stable_rank_q_proj": 27.292604446411133, "geo/layer_7/stable_rank_k_proj": 28.91925621032715, "geo/layer_7/stable_rank_o_proj": 117.8810043334961, "geo/layer_7/stable_rank_gate_proj": 173.2486114501953, "geo/layer_7/stable_rank_down_proj": 201.264404296875, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.7835333347320557, "geo/layer_7/attn_entropy_mean": 6.300737380981445, "geo/layer_7/attn_entropy_std": 1.0085567235946655, "geo/layer_14/stable_rank_q_proj": 34.05148696899414, "geo/layer_14/stable_rank_k_proj": 22.424325942993164, "geo/layer_14/stable_rank_o_proj": 92.3821792602539, "geo/layer_14/stable_rank_gate_proj": 193.4896697998047, "geo/layer_14/stable_rank_down_proj": 165.74365234375, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.7844750881195068, "geo/layer_14/attn_entropy_mean": 6.750101089477539, "geo/layer_14/attn_entropy_std": 0.31375786662101746, "geo/layer_21/stable_rank_q_proj": 63.87082290649414, "geo/layer_21/stable_rank_k_proj": 36.39036560058594, "geo/layer_21/stable_rank_o_proj": 65.76714324951172, "geo/layer_21/stable_rank_gate_proj": 149.95993041992188, "geo/layer_21/stable_rank_down_proj": 155.5552215576172, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.6593571901321411, "geo/layer_21/attn_entropy_mean": 6.152685642242432, "geo/layer_21/attn_entropy_std": 0.7073678374290466, "geo/layer_27/stable_rank_q_proj": 59.90553665161133, "geo/layer_27/stable_rank_k_proj": 20.592506408691406, "geo/layer_27/stable_rank_o_proj": 75.5416259765625, "geo/layer_27/stable_rank_gate_proj": 83.24207305908203, "geo/layer_27/stable_rank_down_proj": 61.1883430480957, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.2233593463897705, "geo/layer_27/attn_entropy_mean": 6.305874824523926, "geo/layer_27/attn_entropy_std": 0.2859591841697693, "attnres/final_alpha/block_0": 0.18423214554786682, "attnres/block_norm/0": 0.5361162424087524, "attnres/final_alpha/block_1": 0.010970408096909523, "attnres/block_norm/1": 2797.24755859375, "attnres/final_alpha/block_2": 0.11605144292116165, "attnres/block_norm/2": 1229.5712890625, "attnres/final_alpha/block_3": 0.05164142698049545, "attnres/block_norm/3": 930.712646484375, "attnres/final_alpha/block_4": 0.09029698371887207, "attnres/block_norm/4": 1070.40673828125, "attnres/final_alpha/block_5": 0.36217546463012695, "attnres/block_norm/5": 905.0208740234375, "attnres/final_alpha/block_6": 0.18463215231895447, "attnres/block_norm/6": 1076.2757568359375, "geo/tier1_time_s": 1.3623883724212646, "geo/step": 1050.0, "geo/rankme_slope": 0.3205101040431431} {"step": 1060, "timestamp": 1778195674.1645422, "train/loss": 5.163954162597657, "train/z_loss": 0.0010990932583808898, "train/perplexity": 174.85449354332405, "train/grad_norm": 0.6796875, "optim/muon_lr": 0.00424, "optim/adamw_lr": 0.00012719999999999997, "perf/tokens_per_sec": 1789913.9825209707, "perf/iters_per_sec": 0.8534974968533376, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.1716495990753173, "data/tokens_consumed": 2225078272, "data/tokens_consumed_B": 2.225078272, "train/loss_slope": -0.006389883998232205} {"step": 1070, "timestamp": 1778195684.5250359, "train/loss": 5.1361658573150635, "train/z_loss": 0.001103822363074869, "train/perplexity": 170.0624729212221, "train/grad_norm": 0.734375, "optim/muon_lr": 0.00428, "optim/adamw_lr": 0.00012839999999999998, "perf/tokens_per_sec": 2025178.9107171467, "perf/iters_per_sec": 0.9656805566392644, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0355391263961793, "data/tokens_consumed": 2246049792, "data/tokens_consumed_B": 2.246049792, "train/loss_slope": -0.006356923735910731} {"step": 1080, "timestamp": 1778195694.8802924, "train/loss": 5.060565757751465, "train/z_loss": 0.0011088685831055045, "train/perplexity": 157.67969960522188, "train/grad_norm": 0.59765625, "optim/muon_lr": 0.00432, "optim/adamw_lr": 0.00012959999999999998, "perf/tokens_per_sec": 2026516.0280051602, "perf/iters_per_sec": 0.966318143847065, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0348558664321899, "data/tokens_consumed": 2267021312, "data/tokens_consumed_B": 2.267021312, "train/loss_slope": -0.006322223828484838} {"step": 1090, "timestamp": 1778195705.2308753, "train/loss": 5.0500095844268795, "train/z_loss": 0.0011124355602078139, "train/perplexity": 156.02395987946366, "train/grad_norm": 0.51953125, "optim/muon_lr": 0.00436, "optim/adamw_lr": 0.00013079999999999998, "perf/tokens_per_sec": 2028099.3986476806, "perf/iters_per_sec": 0.9670731538046268, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0340479373931886, "data/tokens_consumed": 2287992832, "data/tokens_consumed_B": 2.287992832, "train/loss_slope": -0.0062820852952356855} {"step": 1100, "timestamp": 1778195715.5683892, "grad/layer_0/attn": 0.021507805213332176, "grad/layer_0/mlp": 0.03841741010546684, "grad/layer_0/attn_mlp_ratio": 0.559845265422696, "grad/layer_4/attn": 0.019344832748174667, "grad/layer_4/mlp": 0.04817404970526695, "grad/layer_4/attn_mlp_ratio": 0.4015612726430914, "grad/layer_8/attn": 0.055076081305742264, "grad/layer_8/mlp": 0.04609832540154457, "grad/layer_8/attn_mlp_ratio": 1.1947523192333067, "grad/layer_12/attn": 0.009711475111544132, "grad/layer_12/mlp": 0.00906711257994175, "grad/layer_12/attn_mlp_ratio": 1.071065889919714, "grad/layer_16/attn": 0.022432293742895126, "grad/layer_16/mlp": 0.015214027836918831, "grad/layer_16/attn_mlp_ratio": 1.4744480446535944, "grad/layer_20/attn": 0.05833390727639198, "grad/layer_20/mlp": 0.038088470697402954, "grad/layer_20/attn_mlp_ratio": 1.5315371306628949, "grad/layer_24/attn": 0.020463138818740845, "grad/layer_24/mlp": 0.026671016588807106, "grad/layer_24/attn_mlp_ratio": 0.7672425486250214, "grad/layer_27/attn": 0.008204704150557518, "grad/layer_27/mlp": 0.014346002601087093, "grad/layer_27/attn_mlp_ratio": 0.5719156981572151} {"step": 1100, "timestamp": 1778195715.5840847, "train/loss": 5.126272392272949, "train/z_loss": 0.0011095887282863259, "train/perplexity": 168.38826132341296, "train/grad_norm": 0.46875, "optim/muon_lr": 0.0044, "optim/adamw_lr": 0.00013199999999999998, "perf/tokens_per_sec": 2026560.0562528048, "perf/iters_per_sec": 0.9663391381515526, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0348333835601806, "data/tokens_consumed": 2308964352, "data/tokens_consumed_B": 2.308964352, "train/loss_slope": -0.006231284412420658} {"step": 1110, "timestamp": 1778195725.9476357, "train/loss": 5.040364933013916, "train/z_loss": 0.001114246028009802, "train/perplexity": 154.5263965211427, "train/grad_norm": 0.69921875, "optim/muon_lr": 0.00444, "optim/adamw_lr": 0.00013319999999999999, "perf/tokens_per_sec": 2025093.8201144484, "perf/iters_per_sec": 0.965639982278084, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0355826377868653, "data/tokens_consumed": 2329935872, "data/tokens_consumed_B": 2.329935872, "train/loss_slope": -0.00617940580392077} {"step": 1120, "timestamp": 1778195736.3005404, "train/loss": 5.053483247756958, "train/z_loss": 0.001121272600721568, "train/perplexity": 156.56687699723392, "train/grad_norm": 0.66796875, "optim/muon_lr": 0.0044800000000000005, "optim/adamw_lr": 0.0001344, "perf/tokens_per_sec": 2026926.1297183516, "perf/iters_per_sec": 0.9665136955825575, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.034646487236023, "data/tokens_consumed": 2350907392, "data/tokens_consumed_B": 2.350907392, "train/loss_slope": -0.00612088938237238} {"step": 1125, "timestamp": 1778195742.0934567, "eos/sharpness": 63.95378112792967, "eos/L0_probe": 4.691252708435059, "eos/L_plus": 5.07049560546875, "eos/L_minus": 4.951547622680664, "eos/grad_norm": 0.5484917163848877, "eos/embed_grad_frac": 0.20569291710853577, "eos/time_s": 0.6172127723693848} {"step": 1125, "timestamp": 1778195743.4750183, "geo/rankme_last": 311.03125, "geo/layer_0/stable_rank_q_proj": 54.03766632080078, "geo/layer_0/stable_rank_k_proj": 47.126441955566406, "geo/layer_0/stable_rank_o_proj": 68.206787109375, "geo/layer_0/stable_rank_gate_proj": 163.30250549316406, "geo/layer_0/stable_rank_down_proj": 54.73198318481445, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.03745657950639725, "geo/layer_0/attn_entropy_mean": 6.967205047607422, "geo/layer_0/attn_entropy_std": 0.02891295962035656, "geo/layer_7/stable_rank_q_proj": 27.31348991394043, "geo/layer_7/stable_rank_k_proj": 28.95361328125, "geo/layer_7/stable_rank_o_proj": 117.91693115234375, "geo/layer_7/stable_rank_gate_proj": 173.26498413085938, "geo/layer_7/stable_rank_down_proj": 201.59567260742188, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.7692222595214844, "geo/layer_7/attn_entropy_mean": 6.2107038497924805, "geo/layer_7/attn_entropy_std": 1.1863843202590942, "geo/layer_14/stable_rank_q_proj": 34.095672607421875, "geo/layer_14/stable_rank_k_proj": 22.43973159790039, "geo/layer_14/stable_rank_o_proj": 92.3389663696289, "geo/layer_14/stable_rank_gate_proj": 193.60118103027344, "geo/layer_14/stable_rank_down_proj": 165.88624572753906, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.7732134461402893, "geo/layer_14/attn_entropy_mean": 6.795938491821289, "geo/layer_14/attn_entropy_std": 0.2561168372631073, "geo/layer_21/stable_rank_q_proj": 63.65910720825195, "geo/layer_21/stable_rank_k_proj": 36.404327392578125, "geo/layer_21/stable_rank_o_proj": 65.87138366699219, "geo/layer_21/stable_rank_gate_proj": 149.93820190429688, "geo/layer_21/stable_rank_down_proj": 155.46377563476562, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.5864349603652954, "geo/layer_21/attn_entropy_mean": 6.121809005737305, "geo/layer_21/attn_entropy_std": 0.6969889402389526, "geo/layer_27/stable_rank_q_proj": 60.0861930847168, "geo/layer_27/stable_rank_k_proj": 20.66331672668457, "geo/layer_27/stable_rank_o_proj": 75.95890808105469, "geo/layer_27/stable_rank_gate_proj": 83.64283752441406, "geo/layer_27/stable_rank_down_proj": 61.36103820800781, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.21896332502365112, "geo/layer_27/attn_entropy_mean": 6.284600257873535, "geo/layer_27/attn_entropy_std": 0.2628549039363861, "attnres/final_alpha/block_0": 0.1879570186138153, "attnres/block_norm/0": 0.5440632104873657, "attnres/final_alpha/block_1": 0.011316689662635326, "attnres/block_norm/1": 2745.61865234375, "attnres/final_alpha/block_2": 0.11118259280920029, "attnres/block_norm/2": 1188.4326171875, "attnres/final_alpha/block_3": 0.04959029331803322, "attnres/block_norm/3": 965.409912109375, "attnres/final_alpha/block_4": 0.09259702265262604, "attnres/block_norm/4": 1044.0400390625, "attnres/final_alpha/block_5": 0.3514616787433624, "attnres/block_norm/5": 931.2422485351562, "attnres/final_alpha/block_6": 0.19589468836784363, "attnres/block_norm/6": 1076.6478271484375, "geo/tier1_time_s": 1.3616361618041992, "geo/step": 1125.0, "geo/rankme_slope": 0.31324478605681777} {"step": 1130, "timestamp": 1778195748.656668, "train/loss": 5.018268489837647, "train/z_loss": 0.0011236301972530783, "train/perplexity": 151.1493604270841, "train/grad_norm": 0.578125, "optim/muon_lr": 0.004520000000000001, "optim/adamw_lr": 0.0001356, "perf/tokens_per_sec": 1697951.4232369415, "perf/iters_per_sec": 0.8096463314232547, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.235107183456421, "data/tokens_consumed": 2371878912, "data/tokens_consumed_B": 2.371878912, "train/loss_slope": -0.006058413693370527} {"step": 1140, "timestamp": 1778195759.0115116, "train/loss": 4.942009544372558, "train/z_loss": 0.0011291906936094164, "train/perplexity": 140.0514064824974, "train/grad_norm": 0.52734375, "optim/muon_lr": 0.004560000000000001, "optim/adamw_lr": 0.0001368, "perf/tokens_per_sec": 2026243.9170612495, "perf/iters_per_sec": 0.9661883912378547, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0349948406219482, "data/tokens_consumed": 2392850432, "data/tokens_consumed_B": 2.392850432, "train/loss_slope": -0.005994900057346586} {"step": 1150, "timestamp": 1778195769.8375466, "grad/layer_0/attn": 0.025538096204400063, "grad/layer_0/mlp": 0.04967823997139931, "grad/layer_0/attn_mlp_ratio": 0.5140700670493925, "grad/layer_4/attn": 0.021397270262241364, "grad/layer_4/mlp": 0.056025516241788864, "grad/layer_4/attn_mlp_ratio": 0.38192009033357815, "grad/layer_8/attn": 0.051482029259204865, "grad/layer_8/mlp": 0.051594674587249756, "grad/layer_8/attn_mlp_ratio": 0.9978167237466326, "grad/layer_12/attn": 0.009465872310101986, "grad/layer_12/mlp": 0.009487775154411793, "grad/layer_12/attn_mlp_ratio": 0.997691456245275, "grad/layer_16/attn": 0.025052692741155624, "grad/layer_16/mlp": 0.01635124348104, "grad/layer_16/attn_mlp_ratio": 1.5321582494315809, "grad/layer_20/attn": 0.11590884625911713, "grad/layer_20/mlp": 0.05102253332734108, "grad/layer_20/attn_mlp_ratio": 2.2717187578342766, "grad/layer_24/attn": 0.02577069029211998, "grad/layer_24/mlp": 0.03294257074594498, "grad/layer_24/attn_mlp_ratio": 0.7822914129147934, "grad/layer_27/attn": 0.01076612900942564, "grad/layer_27/mlp": 0.015599406324326992, "grad/layer_27/attn_mlp_ratio": 0.6901627354638351} {"step": 1150, "timestamp": 1778195769.8537052, "train/loss": 4.96505651473999, "train/z_loss": 0.0011306321364827454, "train/perplexity": 143.316649547663, "train/grad_norm": 0.53515625, "optim/muon_lr": 0.0046, "optim/adamw_lr": 0.000138, "perf/tokens_per_sec": 1935424.8291142024, "perf/iters_per_sec": 0.9228824754305851, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0835615873336792, "data/tokens_consumed": 2413821952, "data/tokens_consumed_B": 2.413821952, "train/loss_slope": -0.005924851359056346} {"step": 1160, "timestamp": 1778195780.2074676, "train/loss": 4.8729820728302, "train/z_loss": 0.0011374892201274633, "train/perplexity": 130.71012339457764, "train/grad_norm": 0.78125, "optim/muon_lr": 0.00464, "optim/adamw_lr": 0.0001392, "perf/tokens_per_sec": 2026561.0834457884, "perf/iters_per_sec": 0.9663396279553358, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0348328590393066, "data/tokens_consumed": 2434793472, "data/tokens_consumed_B": 2.434793472, "train/loss_slope": -0.005855346474503026} {"step": 1170, "timestamp": 1778195790.5613034, "train/loss": 4.9126146793365475, "train/z_loss": 0.0011390014784410595, "train/perplexity": 135.99453203959163, "train/grad_norm": 0.609375, "optim/muon_lr": 0.00468, "optim/adamw_lr": 0.0001404, "perf/tokens_per_sec": 2026676.2757374048, "perf/iters_per_sec": 0.9663945559203171, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0347740411758424, "data/tokens_consumed": 2455764992, "data/tokens_consumed_B": 2.455764992, "train/loss_slope": -0.0057785798108009045} {"step": 1180, "timestamp": 1778195800.9149075, "train/loss": 4.904095077514649, "train/z_loss": 0.00114281001733616, "train/perplexity": 134.8408342777236, "train/grad_norm": 0.5703125, "optim/muon_lr": 0.00472, "optim/adamw_lr": 0.00014159999999999997, "perf/tokens_per_sec": 2026630.5147672317, "perf/iters_per_sec": 0.9663727353893431, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0347974061965943, "data/tokens_consumed": 2476736512, "data/tokens_consumed_B": 2.476736512, "train/loss_slope": -0.005697347829036444} {"step": 1190, "timestamp": 1778195811.268411, "train/loss": 4.819867181777954, "train/z_loss": 0.0011494366684928537, "train/perplexity": 123.9486270502366, "train/grad_norm": 0.63671875, "optim/muon_lr": 0.0047599999999999995, "optim/adamw_lr": 0.00014279999999999997, "perf/tokens_per_sec": 2026600.257628027, "perf/iters_per_sec": 0.9663583076610693, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.03481285572052, "data/tokens_consumed": 2497708032, "data/tokens_consumed_B": 2.497708032, "train/loss_slope": -0.005616112275871828} {"step": 1200, "timestamp": 1778195821.6116369, "grad/layer_0/attn": 0.026298383250832558, "grad/layer_0/mlp": 0.04973533749580383, "grad/layer_0/attn_mlp_ratio": 0.5287665575844277, "grad/layer_4/attn": 0.023545853793621063, "grad/layer_4/mlp": 0.056171663105487823, "grad/layer_4/attn_mlp_ratio": 0.41917672452541327, "grad/layer_8/attn": 0.052952948957681656, "grad/layer_8/mlp": 0.052821025252342224, "grad/layer_8/attn_mlp_ratio": 1.0024975585850413, "grad/layer_12/attn": 0.010184431448578835, "grad/layer_12/mlp": 0.010442125611007214, "grad/layer_12/attn_mlp_ratio": 0.975321666338805, "grad/layer_16/attn": 0.022696970030665398, "grad/layer_16/mlp": 0.017777878791093826, "grad/layer_16/attn_mlp_ratio": 1.2766973028506727, "grad/layer_20/attn": 0.16864013671875, "grad/layer_20/mlp": 0.05540729686617851, "grad/layer_20/attn_mlp_ratio": 3.0436448979218493, "grad/layer_24/attn": 0.03132582828402519, "grad/layer_24/mlp": 0.04196902737021446, "grad/layer_24/attn_mlp_ratio": 0.7464034830508575, "grad/layer_27/attn": 0.0350072868168354, "grad/layer_27/mlp": 0.031085774302482605, "grad/layer_27/attn_mlp_ratio": 1.1261513502471925} {"step": 1200, "timestamp": 1778195822.220776, "eos/sharpness": 135.15543937683103, "eos/L0_probe": 4.524650573730469, "eos/L_plus": 5.474932670593262, "eos/L_minus": 4.925922870635986, "eos/grad_norm": 0.6929410696029663, "eos/embed_grad_frac": 0.12995891273021698, "eos/time_s": 0.6064140796661377} {"step": 1200, "timestamp": 1778195822.2414837, "train/loss": 4.833052158355713, "train/z_loss": 0.0011542334570549429, "train/perplexity": 125.5937081403936, "train/grad_norm": 0.69140625, "optim/muon_lr": 0.0048, "optim/adamw_lr": 0.00014399999999999998, "perf/tokens_per_sec": 1912165.2865080687, "perf/iters_per_sec": 0.9117914612331718, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0967420101165772, "data/tokens_consumed": 2518679552, "data/tokens_consumed_B": 2.518679552, "train/loss_slope": -0.005529243755455029} {"step": 1200, "timestamp": 1778195823.6065657, "geo/rankme_last": 313.83148193359375, "geo/layer_0/stable_rank_q_proj": 54.104576110839844, "geo/layer_0/stable_rank_k_proj": 47.405643463134766, "geo/layer_0/stable_rank_o_proj": 68.1410903930664, "geo/layer_0/stable_rank_gate_proj": 163.55941772460938, "geo/layer_0/stable_rank_down_proj": 54.61717224121094, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.04412205144762993, "geo/layer_0/attn_entropy_mean": 6.966131210327148, "geo/layer_0/attn_entropy_std": 0.03281906619668007, "geo/layer_7/stable_rank_q_proj": 27.336105346679688, "geo/layer_7/stable_rank_k_proj": 29.01624298095703, "geo/layer_7/stable_rank_o_proj": 117.87066650390625, "geo/layer_7/stable_rank_gate_proj": 173.2447052001953, "geo/layer_7/stable_rank_down_proj": 201.8058319091797, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.7568780183792114, "geo/layer_7/attn_entropy_mean": 6.1495866775512695, "geo/layer_7/attn_entropy_std": 1.3205223083496094, "geo/layer_14/stable_rank_q_proj": 34.15427017211914, "geo/layer_14/stable_rank_k_proj": 22.48581886291504, "geo/layer_14/stable_rank_o_proj": 92.29810333251953, "geo/layer_14/stable_rank_gate_proj": 193.59788513183594, "geo/layer_14/stable_rank_down_proj": 166.18812561035156, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.7637748718261719, "geo/layer_14/attn_entropy_mean": 6.8364715576171875, "geo/layer_14/attn_entropy_std": 0.20464187860488892, "geo/layer_21/stable_rank_q_proj": 63.41574478149414, "geo/layer_21/stable_rank_k_proj": 36.4702262878418, "geo/layer_21/stable_rank_o_proj": 66.05139923095703, "geo/layer_21/stable_rank_gate_proj": 149.80679321289062, "geo/layer_21/stable_rank_down_proj": 155.37005615234375, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.5411937236785889, "geo/layer_21/attn_entropy_mean": 6.130191802978516, "geo/layer_21/attn_entropy_std": 0.6413360834121704, "geo/layer_27/stable_rank_q_proj": 60.26944351196289, "geo/layer_27/stable_rank_k_proj": 20.795047760009766, "geo/layer_27/stable_rank_o_proj": 76.33775329589844, "geo/layer_27/stable_rank_gate_proj": 84.03611755371094, "geo/layer_27/stable_rank_down_proj": 61.57078170776367, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.1969309151172638, "geo/layer_27/attn_entropy_mean": 6.286921501159668, "geo/layer_27/attn_entropy_std": 0.24935287237167358, "attnres/final_alpha/block_0": 0.1892295479774475, "attnres/block_norm/0": 0.552323579788208, "attnres/final_alpha/block_1": 0.011611510999500751, "attnres/block_norm/1": 2704.013427734375, "attnres/final_alpha/block_2": 0.10594821721315384, "attnres/block_norm/2": 1183.0009765625, "attnres/final_alpha/block_3": 0.04753613471984863, "attnres/block_norm/3": 1015.1410522460938, "attnres/final_alpha/block_4": 0.08746184408664703, "attnres/block_norm/4": 1043.7646484375, "attnres/final_alpha/block_5": 0.3569662272930145, "attnres/block_norm/5": 950.55078125, "attnres/final_alpha/block_6": 0.20124652981758118, "attnres/block_norm/6": 1087.510009765625, "geo/tier1_time_s": 1.360625982284546, "geo/step": 1200.0, "geo/rankme_slope": 0.30284881566864214} {"step": 1210, "timestamp": 1778195833.9689543, "train/loss": 4.780751514434814, "train/z_loss": 0.0011581810307689012, "train/perplexity": 119.1938923250655, "train/grad_norm": 0.68359375, "optim/muon_lr": 0.00484, "optim/adamw_lr": 0.00014519999999999998, "perf/tokens_per_sec": 1788884.389831939, "perf/iters_per_sec": 0.853006548801393, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.1723239421844482, "data/tokens_consumed": 2539651072, "data/tokens_consumed_B": 2.539651072, "train/loss_slope": -0.005440617435871929} {"step": 1220, "timestamp": 1778195844.3225057, "train/loss": 4.842910242080689, "train/z_loss": 0.0011574685922823846, "train/perplexity": 126.83794424601261, "train/grad_norm": 0.58203125, "optim/muon_lr": 0.00488, "optim/adamw_lr": 0.00014639999999999998, "perf/tokens_per_sec": 2026903.897222588, "perf/iters_per_sec": 0.9665030943024578, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0346578359603882, "data/tokens_consumed": 2560622592, "data/tokens_consumed_B": 2.560622592, "train/loss_slope": -0.005343546521610016} {"step": 1230, "timestamp": 1778195854.6695106, "train/loss": 4.752173852920532, "train/z_loss": 0.0011666920618154109, "train/perplexity": 115.83582106433684, "train/grad_norm": 0.58203125, "optim/muon_lr": 0.00492, "optim/adamw_lr": 0.00014759999999999998, "perf/tokens_per_sec": 2028082.097031312, "perf/iters_per_sec": 0.9670649037510453, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0340567588806153, "data/tokens_consumed": 2581594112, "data/tokens_consumed_B": 2.581594112, "train/loss_slope": -0.00524723403285725} {"step": 1240, "timestamp": 1778195865.0197318, "train/loss": 4.819992542266846, "train/z_loss": 0.0011656072689220308, "train/perplexity": 123.96416628470375, "train/grad_norm": 0.57421875, "optim/muon_lr": 0.00496, "optim/adamw_lr": 0.00014879999999999998, "perf/tokens_per_sec": 2027267.8054926714, "perf/iters_per_sec": 0.966676619287811, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.034472107887268, "data/tokens_consumed": 2602565632, "data/tokens_consumed_B": 2.602565632, "train/loss_slope": -0.005141974323689312} {"step": 1250, "timestamp": 1778195875.3544083, "grad/layer_0/attn": 0.018173230811953545, "grad/layer_0/mlp": 0.0358586385846138, "grad/layer_0/attn_mlp_ratio": 0.5068020281470222, "grad/layer_4/attn": 0.019790301099419594, "grad/layer_4/mlp": 0.048975542187690735, "grad/layer_4/attn_mlp_ratio": 0.40408538987006964, "grad/layer_8/attn": 0.03898593410849571, "grad/layer_8/mlp": 0.04165098816156387, "grad/layer_8/attn_mlp_ratio": 0.9360146237987944, "grad/layer_12/attn": 0.013286137022078037, "grad/layer_12/mlp": 0.011079586111009121, "grad/layer_12/attn_mlp_ratio": 1.1991546226587775, "grad/layer_16/attn": 0.015409787185490131, "grad/layer_16/mlp": 0.01723155565559864, "grad/layer_16/attn_mlp_ratio": 0.8942771856501346, "grad/layer_20/attn": 0.08179974555969238, "grad/layer_20/mlp": 0.05249979346990585, "grad/layer_20/attn_mlp_ratio": 1.5580965180514912, "grad/layer_24/attn": 0.02305592969059944, "grad/layer_24/mlp": 0.029560908675193787, "grad/layer_24/attn_mlp_ratio": 0.7799465796513997, "grad/layer_27/attn": 0.023994287475943565, "grad/layer_27/mlp": 0.02324226126074791, "grad/layer_27/attn_mlp_ratio": 1.0323559787717427} {"step": 1250, "timestamp": 1778195875.3703024, "train/loss": 4.730479621887207, "train/z_loss": 0.0011725353775545954, "train/perplexity": 113.34991441326373, "train/grad_norm": 0.5078125, "optim/muon_lr": 0.005, "optim/adamw_lr": 0.00015, "perf/tokens_per_sec": 2026963.2161012364, "perf/iters_per_sec": 0.9665313797479803, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0346275568008423, "data/tokens_consumed": 2623537152, "data/tokens_consumed_B": 2.623537152, "train/loss_slope": -0.005038204944466385} {"step": 1260, "timestamp": 1778195885.7192323, "train/loss": 4.687790393829346, "train/z_loss": 0.0011764394934289158, "train/perplexity": 108.61292268418099, "train/grad_norm": 0.44921875, "optim/muon_lr": 0.00504, "optim/adamw_lr": 0.0001512, "perf/tokens_per_sec": 2027805.732042692, "perf/iters_per_sec": 0.9669331226552448, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0341976881027222, "data/tokens_consumed": 2644508672, "data/tokens_consumed_B": 2.644508672, "train/loss_slope": -0.004933884528045929} {"step": 1270, "timestamp": 1778195896.0653148, "train/loss": 4.677069902420044, "train/z_loss": 0.0011788474279455841, "train/perplexity": 107.45475791951027, "train/grad_norm": 0.48828125, "optim/muon_lr": 0.00508, "optim/adamw_lr": 0.0001524, "perf/tokens_per_sec": 2028039.218351781, "perf/iters_per_sec": 0.9670444576033501, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0340786218643188, "data/tokens_consumed": 2665480192, "data/tokens_consumed_B": 2.665480192, "train/loss_slope": -0.004828154394628763} {"step": 1275, "timestamp": 1778195901.8344553, "eos/sharpness": 78.9884567260742, "eos/L0_probe": 4.339085102081299, "eos/L_plus": 4.726132392883301, "eos/L_minus": 4.741922378540039, "eos/grad_norm": 0.5930253863334656, "eos/embed_grad_frac": 0.16047675907611847, "eos/time_s": 0.6060144901275635} {"step": 1275, "timestamp": 1778195903.2115996, "geo/rankme_last": 318.6685485839844, "geo/layer_0/stable_rank_q_proj": 54.184322357177734, "geo/layer_0/stable_rank_k_proj": 47.77470397949219, "geo/layer_0/stable_rank_o_proj": 68.13177490234375, "geo/layer_0/stable_rank_gate_proj": 163.9484405517578, "geo/layer_0/stable_rank_down_proj": 54.53704833984375, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.040367402136325836, "geo/layer_0/attn_entropy_mean": 6.966324806213379, "geo/layer_0/attn_entropy_std": 0.03701717406511307, "geo/layer_7/stable_rank_q_proj": 27.373027801513672, "geo/layer_7/stable_rank_k_proj": 29.124408721923828, "geo/layer_7/stable_rank_o_proj": 117.69453430175781, "geo/layer_7/stable_rank_gate_proj": 173.22950744628906, "geo/layer_7/stable_rank_down_proj": 201.98052978515625, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.7479032874107361, "geo/layer_7/attn_entropy_mean": 6.090548992156982, "geo/layer_7/attn_entropy_std": 1.4162020683288574, "geo/layer_14/stable_rank_q_proj": 34.216365814208984, "geo/layer_14/stable_rank_k_proj": 22.521217346191406, "geo/layer_14/stable_rank_o_proj": 92.24563598632812, "geo/layer_14/stable_rank_gate_proj": 193.6806640625, "geo/layer_14/stable_rank_down_proj": 166.06236267089844, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.7518702149391174, "geo/layer_14/attn_entropy_mean": 6.86083459854126, "geo/layer_14/attn_entropy_std": 0.17134353518486023, "geo/layer_21/stable_rank_q_proj": 63.11319351196289, "geo/layer_21/stable_rank_k_proj": 36.589210510253906, "geo/layer_21/stable_rank_o_proj": 66.19296264648438, "geo/layer_21/stable_rank_gate_proj": 149.58607482910156, "geo/layer_21/stable_rank_down_proj": 155.21383666992188, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.49105602502822876, "geo/layer_21/attn_entropy_mean": 6.158131122589111, "geo/layer_21/attn_entropy_std": 0.5969282984733582, "geo/layer_27/stable_rank_q_proj": 60.453826904296875, "geo/layer_27/stable_rank_k_proj": 20.906192779541016, "geo/layer_27/stable_rank_o_proj": 76.82572174072266, "geo/layer_27/stable_rank_gate_proj": 84.61940002441406, "geo/layer_27/stable_rank_down_proj": 61.837772369384766, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.18297512829303741, "geo/layer_27/attn_entropy_mean": 6.270552635192871, "geo/layer_27/attn_entropy_std": 0.24192456901073456, "attnres/final_alpha/block_0": 0.1938251554965973, "attnres/block_norm/0": 0.5608463883399963, "attnres/final_alpha/block_1": 0.0120257418602705, "attnres/block_norm/1": 2672.30078125, "attnres/final_alpha/block_2": 0.10231590270996094, "attnres/block_norm/2": 1173.876220703125, "attnres/final_alpha/block_3": 0.0460059680044651, "attnres/block_norm/3": 1062.4727783203125, "attnres/final_alpha/block_4": 0.08422467112541199, "attnres/block_norm/4": 1042.0977783203125, "attnres/final_alpha/block_5": 0.3494192361831665, "attnres/block_norm/5": 969.2911376953125, "attnres/final_alpha/block_6": 0.21218332648277283, "attnres/block_norm/6": 1105.498779296875, "geo/tier1_time_s": 1.3576171398162842, "geo/step": 1275.0, "geo/rankme_slope": 0.2913717383144236} {"step": 1280, "timestamp": 1778195908.969373, "train/loss": 4.744905471801758, "train/z_loss": 0.0011810099240392447, "train/perplexity": 114.99693453628593, "train/grad_norm": 0.62890625, "optim/muon_lr": 0.00512, "optim/adamw_lr": 0.0001536, "perf/tokens_per_sec": 1625850.7803073328, "perf/iters_per_sec": 0.7752660657440819, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.2898797512054443, "data/tokens_consumed": 2686451712, "data/tokens_consumed_B": 2.686451712, "train/loss_slope": -0.004718444869656338} {"step": 1290, "timestamp": 1778195919.318823, "train/loss": 4.632099866867065, "train/z_loss": 0.0011890135589055716, "train/perplexity": 102.72955617026224, "train/grad_norm": 0.498046875, "optim/muon_lr": 0.0051600000000000005, "optim/adamw_lr": 0.0001548, "perf/tokens_per_sec": 2027692.5147489673, "perf/iters_per_sec": 0.9668791364426457, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0342554330825806, "data/tokens_consumed": 2707423232, "data/tokens_consumed_B": 2.707423232, "train/loss_slope": -0.00461592017879652} {"step": 1300, "timestamp": 1778195929.6548347, "grad/layer_0/attn": 0.01987694948911667, "grad/layer_0/mlp": 0.038843169808387756, "grad/layer_0/attn_mlp_ratio": 0.5117231558597503, "grad/layer_4/attn": 0.022408703342080116, "grad/layer_4/mlp": 0.055886976420879364, "grad/layer_4/attn_mlp_ratio": 0.4009646743675287, "grad/layer_8/attn": 0.06009107083082199, "grad/layer_8/mlp": 0.04356762021780014, "grad/layer_8/attn_mlp_ratio": 1.379259881363568, "grad/layer_12/attn": 0.013069596141576767, "grad/layer_12/mlp": 0.011979231610894203, "grad/layer_12/attn_mlp_ratio": 1.0910212321622397, "grad/layer_16/attn": 0.018241554498672485, "grad/layer_16/mlp": 0.019274797290563583, "grad/layer_16/attn_mlp_ratio": 0.9463940984200983, "grad/layer_20/attn": 0.07934465259313583, "grad/layer_20/mlp": 0.05846548452973366, "grad/layer_20/attn_mlp_ratio": 1.3571195568741374, "grad/layer_24/attn": 0.03202914074063301, "grad/layer_24/mlp": 0.04063837230205536, "grad/layer_24/attn_mlp_ratio": 0.7881501853409139, "grad/layer_27/attn": 0.032118916511535645, "grad/layer_27/mlp": 0.03418543562293053, "grad/layer_27/attn_mlp_ratio": 0.9395497185367532} {"step": 1300, "timestamp": 1778195929.6704857, "train/loss": 4.68438458442688, "train/z_loss": 0.0011899130186066031, "train/perplexity": 108.243636986187, "train/grad_norm": 0.6640625, "optim/muon_lr": 0.005200000000000001, "optim/adamw_lr": 0.000156, "perf/tokens_per_sec": 2027076.6789663474, "perf/iters_per_sec": 0.9665854830581414, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0345696449279784, "data/tokens_consumed": 2728394752, "data/tokens_consumed_B": 2.728394752, "train/loss_slope": -0.004511391202844803} {"step": 1310, "timestamp": 1778195940.0187333, "train/loss": 4.657499694824219, "train/z_loss": 0.001193732989486307, "train/perplexity": 105.37228963202679, "train/grad_norm": 0.8359375, "optim/muon_lr": 0.005240000000000001, "optim/adamw_lr": 0.0001572, "perf/tokens_per_sec": 2027692.3745206872, "perf/iters_per_sec": 0.9668790695765911, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0342555046081543, "data/tokens_consumed": 2749366272, "data/tokens_consumed_B": 2.749366272, "train/loss_slope": -0.0044104298451123495} {"step": 1320, "timestamp": 1778195950.9782343, "train/loss": 4.524956941604614, "train/z_loss": 0.001201244129333645, "train/perplexity": 92.2919517997493, "train/grad_norm": 0.796875, "optim/muon_lr": 0.00528, "optim/adamw_lr": 0.0001584, "perf/tokens_per_sec": 1914597.2461582923, "perf/iters_per_sec": 0.9129511099616491, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0953489065170288, "data/tokens_consumed": 2770337792, "data/tokens_consumed_B": 2.770337792, "train/loss_slope": -0.004319365630305783} {"step": 1330, "timestamp": 1778195961.78904, "train/loss": 4.6006043434143065, "train/z_loss": 0.001202101039234549, "train/perplexity": 99.5444565039553, "train/grad_norm": 0.6640625, "optim/muon_lr": 0.00532, "optim/adamw_lr": 0.0001596, "perf/tokens_per_sec": 1940856.835588195, "perf/iters_per_sec": 0.9254726579609847, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0805289506912232, "data/tokens_consumed": 2791309312, "data/tokens_consumed_B": 2.791309312, "train/loss_slope": -0.004225336409001389} {"step": 1340, "timestamp": 1778195972.139559, "train/loss": 4.563143873214722, "train/z_loss": 0.0012057062820531427, "train/perplexity": 95.88445503811973, "train/grad_norm": 0.6875, "optim/muon_lr": 0.00536, "optim/adamw_lr": 0.0001608, "perf/tokens_per_sec": 2027293.3633966974, "perf/iters_per_sec": 0.9666888062461364, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0344590663909912, "data/tokens_consumed": 2812280832, "data/tokens_consumed_B": 2.812280832, "train/loss_slope": -0.004135793580159103} {"step": 1350, "timestamp": 1778195982.4858963, "grad/layer_0/attn": 0.016047989949584007, "grad/layer_0/mlp": 0.03238977491855621, "grad/layer_0/attn_mlp_ratio": 0.4954646934222315, "grad/layer_4/attn": 0.019325383007526398, "grad/layer_4/mlp": 0.047248370945453644, "grad/layer_4/attn_mlp_ratio": 0.40901691592573824, "grad/layer_8/attn": 0.05282578617334366, "grad/layer_8/mlp": 0.037830088287591934, "grad/layer_8/attn_mlp_ratio": 1.3963960546988896, "grad/layer_12/attn": 0.010076096281409264, "grad/layer_12/mlp": 0.011571289971470833, "grad/layer_12/attn_mlp_ratio": 0.8707841752452486, "grad/layer_16/attn": 0.020612435415387154, "grad/layer_16/mlp": 0.018097659572958946, "grad/layer_16/attn_mlp_ratio": 1.1389558532911146, "grad/layer_20/attn": 0.1399180144071579, "grad/layer_20/mlp": 0.05186790972948074, "grad/layer_20/attn_mlp_ratio": 2.697583435830513, "grad/layer_24/attn": 0.02645723707973957, "grad/layer_24/mlp": 0.03362463787198067, "grad/layer_24/attn_mlp_ratio": 0.7868408011347608, "grad/layer_27/attn": 0.019503476098179817, "grad/layer_27/mlp": 0.01976720243692398, "grad/layer_27/attn_mlp_ratio": 0.9866583833371698} {"step": 1350, "timestamp": 1778195983.0948737, "eos/sharpness": 37.207365036010735, "eos/L0_probe": 4.16569185256958, "eos/L_plus": 4.348186016082764, "eos/L_minus": 4.355271339416504, "eos/grad_norm": 0.4576887786388397, "eos/embed_grad_frac": 0.19703254103660583, "eos/time_s": 0.6060895919799805} {"step": 1350, "timestamp": 1778195983.114792, "train/loss": 4.534424686431885, "train/z_loss": 0.001212532294448465, "train/perplexity": 93.16989797606898, "train/grad_norm": 0.45703125, "optim/muon_lr": 0.0054, "optim/adamw_lr": 0.000162, "perf/tokens_per_sec": 1911723.3526074274, "perf/iters_per_sec": 0.9115807307278764, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0969955444335937, "data/tokens_consumed": 2833252352, "data/tokens_consumed_B": 2.833252352, "train/loss_slope": -0.004049498850813579} {"step": 1350, "timestamp": 1778195984.4751048, "geo/rankme_last": 320.85504150390625, "geo/layer_0/stable_rank_q_proj": 54.281341552734375, "geo/layer_0/stable_rank_k_proj": 48.14823532104492, "geo/layer_0/stable_rank_o_proj": 67.97002410888672, "geo/layer_0/stable_rank_gate_proj": 164.19451904296875, "geo/layer_0/stable_rank_down_proj": 54.34535217285156, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.04236399009823799, "geo/layer_0/attn_entropy_mean": 6.966695308685303, "geo/layer_0/attn_entropy_std": 0.037894826382398605, "geo/layer_7/stable_rank_q_proj": 27.42177391052246, "geo/layer_7/stable_rank_k_proj": 29.22797393798828, "geo/layer_7/stable_rank_o_proj": 117.56221771240234, "geo/layer_7/stable_rank_gate_proj": 173.32369995117188, "geo/layer_7/stable_rank_down_proj": 201.995849609375, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.7408983707427979, "geo/layer_7/attn_entropy_mean": 6.040384292602539, "geo/layer_7/attn_entropy_std": 1.4788317680358887, "geo/layer_14/stable_rank_q_proj": 34.29314422607422, "geo/layer_14/stable_rank_k_proj": 22.56656837463379, "geo/layer_14/stable_rank_o_proj": 92.28045654296875, "geo/layer_14/stable_rank_gate_proj": 194.06198120117188, "geo/layer_14/stable_rank_down_proj": 166.6597137451172, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.7422471642494202, "geo/layer_14/attn_entropy_mean": 6.882762908935547, "geo/layer_14/attn_entropy_std": 0.14608481526374817, "geo/layer_21/stable_rank_q_proj": 62.78950119018555, "geo/layer_21/stable_rank_k_proj": 36.71900177001953, "geo/layer_21/stable_rank_o_proj": 66.41063690185547, "geo/layer_21/stable_rank_gate_proj": 149.4381103515625, "geo/layer_21/stable_rank_down_proj": 155.23573303222656, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.4602486789226532, "geo/layer_21/attn_entropy_mean": 6.175830841064453, "geo/layer_21/attn_entropy_std": 0.5545228123664856, "geo/layer_27/stable_rank_q_proj": 60.631507873535156, "geo/layer_27/stable_rank_k_proj": 21.066179275512695, "geo/layer_27/stable_rank_o_proj": 77.41387939453125, "geo/layer_27/stable_rank_gate_proj": 85.30464935302734, "geo/layer_27/stable_rank_down_proj": 62.209529876708984, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.1614498347043991, "geo/layer_27/attn_entropy_mean": 6.212257385253906, "geo/layer_27/attn_entropy_std": 0.27124059200286865, "attnres/final_alpha/block_0": 0.19440937042236328, "attnres/block_norm/0": 0.5694681406021118, "attnres/final_alpha/block_1": 0.01232148613780737, "attnres/block_norm/1": 2658.4140625, "attnres/final_alpha/block_2": 0.09606742858886719, "attnres/block_norm/2": 1193.266845703125, "attnres/final_alpha/block_3": 0.04369630664587021, "attnres/block_norm/3": 1110.95263671875, "attnres/final_alpha/block_4": 0.08025382459163666, "attnres/block_norm/4": 1026.511962890625, "attnres/final_alpha/block_5": 0.35746705532073975, "attnres/block_norm/5": 964.881591796875, "attnres/final_alpha/block_6": 0.21578457951545715, "attnres/block_norm/6": 1133.112548828125, "geo/tier1_time_s": 1.3563978672027588, "geo/step": 1350.0, "geo/rankme_slope": 0.2790162750601071} {"step": 1360, "timestamp": 1778195994.8264334, "train/loss": 4.52633318901062, "train/z_loss": 0.0012140435166656971, "train/perplexity": 92.4190558022239, "train/grad_norm": 0.7421875, "optim/muon_lr": 0.00544, "optim/adamw_lr": 0.0001632, "perf/tokens_per_sec": 1791225.4021357265, "perf/iters_per_sec": 0.8541228304556496, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.170791792869568, "data/tokens_consumed": 2854223872, "data/tokens_consumed_B": 2.854223872, "train/loss_slope": -0.003966378766868768} {"step": 1370, "timestamp": 1778196005.1814516, "train/loss": 4.4552844047546385, "train/z_loss": 0.0012197431758977472, "train/perplexity": 86.08062910430627, "train/grad_norm": 0.50390625, "optim/muon_lr": 0.0054800000000000005, "optim/adamw_lr": 0.0001644, "perf/tokens_per_sec": 2026299.0429173938, "perf/iters_per_sec": 0.9662146772944421, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0349666833877564, "data/tokens_consumed": 2875195392, "data/tokens_consumed_B": 2.875195392, "train/loss_slope": -0.0038886564296821986} {"step": 1380, "timestamp": 1778196015.9926612, "train/loss": 4.396322298049927, "train/z_loss": 0.0012262473814189435, "train/perplexity": 81.1518668033208, "train/grad_norm": 0.6328125, "optim/muon_lr": 0.005520000000000001, "optim/adamw_lr": 0.0001656, "perf/tokens_per_sec": 1940861.9745996457, "perf/iters_per_sec": 0.9254751084326008, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.080526089668274, "data/tokens_consumed": 2896166912, "data/tokens_consumed_B": 2.896166912, "train/loss_slope": -0.0038162878846821754} {"step": 1390, "timestamp": 1778196026.9282653, "train/loss": 4.459986543655395, "train/z_loss": 0.0012279643211513757, "train/perplexity": 86.48634529843322, "train/grad_norm": 0.80078125, "optim/muon_lr": 0.005560000000000001, "optim/adamw_lr": 0.0001668, "perf/tokens_per_sec": 1918630.389450803, "perf/iters_per_sec": 0.9148742625478758, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0930463790893554, "data/tokens_consumed": 2917138432, "data/tokens_consumed_B": 2.917138432, "train/loss_slope": -0.0037425552696928478} {"step": 1400, "timestamp": 1778196037.2632167, "grad/layer_0/attn": 0.022977126762270927, "grad/layer_0/mlp": 0.046238526701927185, "grad/layer_0/attn_mlp_ratio": 0.4969260128182384, "grad/layer_4/attn": 0.026609202846884727, "grad/layer_4/mlp": 0.06414264440536499, "grad/layer_4/attn_mlp_ratio": 0.41484418130997225, "grad/layer_8/attn": 0.049394309520721436, "grad/layer_8/mlp": 0.04841691255569458, "grad/layer_8/attn_mlp_ratio": 1.0201870960252541, "grad/layer_12/attn": 0.026030879467725754, "grad/layer_12/mlp": 0.015791960060596466, "grad/layer_12/attn_mlp_ratio": 1.6483627873300408, "grad/layer_16/attn": 0.02947639673948288, "grad/layer_16/mlp": 0.02668278105556965, "grad/layer_16/attn_mlp_ratio": 1.104697316506308, "grad/layer_20/attn": 0.10418759286403656, "grad/layer_20/mlp": 0.08643729984760284, "grad/layer_20/attn_mlp_ratio": 1.2053545509542027, "grad/layer_24/attn": 0.0253862626850605, "grad/layer_24/mlp": 0.03941035270690918, "grad/layer_24/attn_mlp_ratio": 0.644152129503645, "grad/layer_27/attn": 0.02138856239616871, "grad/layer_27/mlp": 0.020868636667728424, "grad/layer_27/attn_mlp_ratio": 1.0249142114181748} {"step": 1400, "timestamp": 1778196037.2789881, "train/loss": 4.401293659210205, "train/z_loss": 0.0012350499629974366, "train/perplexity": 81.55630651701533, "train/grad_norm": 0.50390625, "optim/muon_lr": 0.005600000000000001, "optim/adamw_lr": 0.000168, "perf/tokens_per_sec": 2027089.8057906725, "perf/iters_per_sec": 0.9665917424157489, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0345629453659058, "data/tokens_consumed": 2938109952, "data/tokens_consumed_B": 2.938109952, "train/loss_slope": -0.003674110099599056} {"step": 1410, "timestamp": 1778196047.6292632, "train/loss": 4.439439916610718, "train/z_loss": 0.001234510529320687, "train/perplexity": 84.72747393135705, "train/grad_norm": 0.64453125, "optim/muon_lr": 0.005639999999999999, "optim/adamw_lr": 0.00016919999999999997, "perf/tokens_per_sec": 2027166.0475051983, "perf/iters_per_sec": 0.9666280972982398, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0345240354537963, "data/tokens_consumed": 2959081472, "data/tokens_consumed_B": 2.959081472, "train/loss_slope": -0.0036046701861043038} {"step": 1420, "timestamp": 1778196057.9790735, "train/loss": 4.2712929248809814, "train/z_loss": 0.0012478810851462186, "train/perplexity": 71.61416752695258, "train/grad_norm": 0.546875, "optim/muon_lr": 0.005679999999999999, "optim/adamw_lr": 0.00017039999999999997, "perf/tokens_per_sec": 2027288.784419524, "perf/iters_per_sec": 0.9666866228196735, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0344614028930663, "data/tokens_consumed": 2980052992, "data/tokens_consumed_B": 2.980052992, "train/loss_slope": -0.0035476654664196123} {"step": 1425, "timestamp": 1778196063.7699447, "eos/sharpness": 158.83803367614743, "eos/L0_probe": 3.9924681186676025, "eos/L_plus": 4.493451118469238, "eos/L_minus": 5.079865455627441, "eos/grad_norm": 1.0015588998794556, "eos/embed_grad_frac": 0.03814644366502762, "eos/time_s": 0.6162641048431396} {"step": 1425, "timestamp": 1778196065.1549823, "geo/rankme_last": 322.439697265625, "geo/layer_0/stable_rank_q_proj": 54.410194396972656, "geo/layer_0/stable_rank_k_proj": 48.544189453125, "geo/layer_0/stable_rank_o_proj": 67.778076171875, "geo/layer_0/stable_rank_gate_proj": 164.39923095703125, "geo/layer_0/stable_rank_down_proj": 54.2076416015625, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.03633624687790871, "geo/layer_0/attn_entropy_mean": 6.96513557434082, "geo/layer_0/attn_entropy_std": 0.03914573788642883, "geo/layer_7/stable_rank_q_proj": 27.48247718811035, "geo/layer_7/stable_rank_k_proj": 29.342851638793945, "geo/layer_7/stable_rank_o_proj": 117.41551971435547, "geo/layer_7/stable_rank_gate_proj": 173.462890625, "geo/layer_7/stable_rank_down_proj": 202.27342224121094, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.7338422536849976, "geo/layer_7/attn_entropy_mean": 6.0001020431518555, "geo/layer_7/attn_entropy_std": 1.5200281143188477, "geo/layer_14/stable_rank_q_proj": 34.42241668701172, "geo/layer_14/stable_rank_k_proj": 22.604175567626953, "geo/layer_14/stable_rank_o_proj": 92.25314331054688, "geo/layer_14/stable_rank_gate_proj": 194.2415313720703, "geo/layer_14/stable_rank_down_proj": 167.3221893310547, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.7347854971885681, "geo/layer_14/attn_entropy_mean": 6.894530296325684, "geo/layer_14/attn_entropy_std": 0.13166898488998413, "geo/layer_21/stable_rank_q_proj": 62.411346435546875, "geo/layer_21/stable_rank_k_proj": 36.80461883544922, "geo/layer_21/stable_rank_o_proj": 66.7799072265625, "geo/layer_21/stable_rank_gate_proj": 149.1969757080078, "geo/layer_21/stable_rank_down_proj": 155.43905639648438, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.4248940646648407, "geo/layer_21/attn_entropy_mean": 6.239591121673584, "geo/layer_21/attn_entropy_std": 0.515974223613739, "geo/layer_27/stable_rank_q_proj": 60.782066345214844, "geo/layer_27/stable_rank_k_proj": 21.236989974975586, "geo/layer_27/stable_rank_o_proj": 78.01045227050781, "geo/layer_27/stable_rank_gate_proj": 85.89664459228516, "geo/layer_27/stable_rank_down_proj": 62.64702224731445, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.16275663673877716, "geo/layer_27/attn_entropy_mean": 6.21586799621582, "geo/layer_27/attn_entropy_std": 0.24597331881523132, "attnres/final_alpha/block_0": 0.19760248064994812, "attnres/block_norm/0": 0.5782129764556885, "attnres/final_alpha/block_1": 0.012834151275455952, "attnres/block_norm/1": 2660.470703125, "attnres/final_alpha/block_2": 0.09024201333522797, "attnres/block_norm/2": 1227.8240966796875, "attnres/final_alpha/block_3": 0.04230614751577377, "attnres/block_norm/3": 1162.204345703125, "attnres/final_alpha/block_4": 0.07372095435857773, "attnres/block_norm/4": 1023.3306884765625, "attnres/final_alpha/block_5": 0.3620545268058777, "attnres/block_norm/5": 956.9068603515625, "attnres/final_alpha/block_6": 0.22123971581459045, "attnres/block_norm/6": 1146.0517578125, "geo/tier1_time_s": 1.3631207942962646, "geo/step": 1425.0, "geo/rankme_slope": 0.2663303299523834} {"step": 1430, "timestamp": 1778196070.3518178, "train/loss": 4.320718288421631, "train/z_loss": 0.0012488035252317786, "train/perplexity": 75.24265481415334, "train/grad_norm": 0.81640625, "optim/muon_lr": 0.005719999999999999, "optim/adamw_lr": 0.00017159999999999997, "perf/tokens_per_sec": 1695627.519490691, "perf/iters_per_sec": 0.808538207764955, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.2367999315261842, "data/tokens_consumed": 3001024512, "data/tokens_consumed_B": 3.001024512, "train/loss_slope": -0.0034868798515155014} {"step": 1440, "timestamp": 1778196080.7088206, "train/loss": 4.3018981456756595, "train/z_loss": 0.0012513396213762462, "train/perplexity": 73.8398194969423, "train/grad_norm": 0.578125, "optim/muon_lr": 0.0057599999999999995, "optim/adamw_lr": 0.00017279999999999997, "perf/tokens_per_sec": 2025813.5624084466, "perf/iters_per_sec": 0.9659831821481927, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0352147102355957, "data/tokens_consumed": 3021996032, "data/tokens_consumed_B": 3.021996032, "train/loss_slope": -0.0034292306967551307} {"step": 1450, "timestamp": 1778196091.0521688, "grad/layer_0/attn": 0.0247550867497921, "grad/layer_0/mlp": 0.051520559936761856, "grad/layer_0/attn_mlp_ratio": 0.4804894731759207, "grad/layer_4/attn": 0.02399292401969433, "grad/layer_4/mlp": 0.056983329355716705, "grad/layer_4/attn_mlp_ratio": 0.42105163473011664, "grad/layer_8/attn": 0.06662306189537048, "grad/layer_8/mlp": 0.048222508281469345, "grad/layer_8/attn_mlp_ratio": 1.3815760343352854, "grad/layer_12/attn": 0.01875615119934082, "grad/layer_12/mlp": 0.016773400828242302, "grad/layer_12/attn_mlp_ratio": 1.118208005614416, "grad/layer_16/attn": 0.020742664113640785, "grad/layer_16/mlp": 0.027844633907079697, "grad/layer_16/attn_mlp_ratio": 0.7449429613033094, "grad/layer_20/attn": 0.13516733050346375, "grad/layer_20/mlp": 0.10674361139535904, "grad/layer_20/attn_mlp_ratio": 1.266280282350579, "grad/layer_24/attn": 0.035011425614356995, "grad/layer_24/mlp": 0.04953495413064957, "grad/layer_24/attn_mlp_ratio": 0.7068024218077061, "grad/layer_27/attn": 0.026230262592434883, "grad/layer_27/mlp": 0.026727667078375816, "grad/layer_27/attn_mlp_ratio": 0.9813898989903854} {"step": 1450, "timestamp": 1778196091.0680668, "train/loss": 4.2818316459655765, "train/z_loss": 0.0012531578657217323, "train/perplexity": 72.37288017253614, "train/grad_norm": 0.59375, "optim/muon_lr": 0.0058, "optim/adamw_lr": 0.00017399999999999997, "perf/tokens_per_sec": 2025445.4654248764, "perf/iters_per_sec": 0.965807659828604, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0354028463363647, "data/tokens_consumed": 3042967552, "data/tokens_consumed_B": 3.042967552, "train/loss_slope": -0.0033729310507535433} {"step": 1460, "timestamp": 1778196101.4213214, "train/loss": 4.170336031913758, "train/z_loss": 0.0012633219244889914, "train/perplexity": 64.7372022187855, "train/grad_norm": 0.5625, "optim/muon_lr": 0.00584, "optim/adamw_lr": 0.00017519999999999998, "perf/tokens_per_sec": 2027166.8884384378, "perf/iters_per_sec": 0.966628498286456, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.034523606300354, "data/tokens_consumed": 3063939072, "data/tokens_consumed_B": 3.063939072, "train/loss_slope": -0.0033263033744847493} {"step": 1470, "timestamp": 1778196111.7755332, "train/loss": 4.225319242477417, "train/z_loss": 0.001263567979913205, "train/perplexity": 68.39633507905363, "train/grad_norm": 0.66796875, "optim/muon_lr": 0.00588, "optim/adamw_lr": 0.00017639999999999998, "perf/tokens_per_sec": 2026640.9742353663, "perf/iters_per_sec": 0.9663777228524048, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0347920656204224, "data/tokens_consumed": 3084910592, "data/tokens_consumed_B": 3.084910592, "train/loss_slope": -0.003277657785123796} {"step": 1480, "timestamp": 1778196122.1257267, "train/loss": 4.259469223022461, "train/z_loss": 0.0012632807483896613, "train/perplexity": 70.7724091179576, "train/grad_norm": 1.140625, "optim/muon_lr": 0.00592, "optim/adamw_lr": 0.00017759999999999998, "perf/tokens_per_sec": 2027312.0065887405, "perf/iters_per_sec": 0.96669769601285, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0344495534896851, "data/tokens_consumed": 3105882112, "data/tokens_consumed_B": 3.105882112, "train/loss_slope": -0.003227562917705917} {"step": 1490, "timestamp": 1778196132.4838917, "train/loss": 4.152785515785217, "train/z_loss": 0.0012755479547195137, "train/perplexity": 63.61094303125165, "train/grad_norm": 1.0859375, "optim/muon_lr": 0.00596, "optim/adamw_lr": 0.00017879999999999998, "perf/tokens_per_sec": 2025880.0496404457, "perf/iters_per_sec": 0.966014885730956, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0351807355880738, "data/tokens_consumed": 3126853632, "data/tokens_consumed_B": 3.126853632, "train/loss_slope": -0.003183757530845324} {"step": 1500, "timestamp": 1778196142.8456283, "grad/layer_0/attn": 0.023066725581884384, "grad/layer_0/mlp": 0.04983845725655556, "grad/layer_0/attn_mlp_ratio": 0.4628298467759511, "grad/layer_4/attn": 0.028928790241479874, "grad/layer_4/mlp": 0.07004497200250626, "grad/layer_4/attn_mlp_ratio": 0.4130030946281837, "grad/layer_8/attn": 0.07357173413038254, "grad/layer_8/mlp": 0.050771769136190414, "grad/layer_8/attn_mlp_ratio": 1.4490677641767145, "grad/layer_12/attn": 0.018566137179732323, "grad/layer_12/mlp": 0.020120766013860703, "grad/layer_12/attn_mlp_ratio": 0.9227351023648431, "grad/layer_16/attn": 0.027668427675962448, "grad/layer_16/mlp": 0.032670170068740845, "grad/layer_16/attn_mlp_ratio": 0.8469018536804526, "grad/layer_20/attn": 0.21541728079319, "grad/layer_20/mlp": 0.1256152242422104, "grad/layer_20/attn_mlp_ratio": 1.7148978710282292, "grad/layer_24/attn": 0.03190506994724274, "grad/layer_24/mlp": 0.04782714694738388, "grad/layer_24/attn_mlp_ratio": 0.667091221553177, "grad/layer_27/attn": 0.02250734530389309, "grad/layer_27/mlp": 0.02457360178232193, "grad/layer_27/attn_mlp_ratio": 0.9159155996616315} {"step": 1500, "timestamp": 1778196143.4484012, "eos/sharpness": 48.131275177001946, "eos/L0_probe": 3.824759006500244, "eos/L_plus": 4.079836845397949, "eos/L_minus": 4.050993919372559, "eos/grad_norm": 0.6506308317184448, "eos/embed_grad_frac": 0.09108171612024307, "eos/time_s": 0.5993599891662598} {"step": 1500, "timestamp": 1778196143.4685183, "train/loss": 4.182541036605835, "train/z_loss": 0.001275492797140032, "train/perplexity": 65.53216144796771, "train/grad_norm": 0.65234375, "optim/muon_lr": 0.006, "optim/adamw_lr": 0.00017999999999999998, "perf/tokens_per_sec": 1910055.578904341, "perf/iters_per_sec": 0.9107854742547707, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0979533910751342, "data/tokens_consumed": 3147825152, "data/tokens_consumed_B": 3.147825152, "train/loss_slope": -0.0031399848026947472} {"step": 1500, "timestamp": 1778196144.83723, "geo/rankme_last": 323.119384765625, "geo/layer_0/stable_rank_q_proj": 54.42998123168945, "geo/layer_0/stable_rank_k_proj": 48.9703483581543, "geo/layer_0/stable_rank_o_proj": 67.6221923828125, "geo/layer_0/stable_rank_gate_proj": 164.4864959716797, "geo/layer_0/stable_rank_down_proj": 53.960899353027344, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.04809339717030525, "geo/layer_0/attn_entropy_mean": 6.961977958679199, "geo/layer_0/attn_entropy_std": 0.03842197731137276, "geo/layer_7/stable_rank_q_proj": 27.545799255371094, "geo/layer_7/stable_rank_k_proj": 29.486663818359375, "geo/layer_7/stable_rank_o_proj": 117.25174713134766, "geo/layer_7/stable_rank_gate_proj": 173.77197265625, "geo/layer_7/stable_rank_down_proj": 202.3026580810547, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.7317420244216919, "geo/layer_7/attn_entropy_mean": 5.973209381103516, "geo/layer_7/attn_entropy_std": 1.5357030630111694, "geo/layer_14/stable_rank_q_proj": 34.52186965942383, "geo/layer_14/stable_rank_k_proj": 22.694902420043945, "geo/layer_14/stable_rank_o_proj": 92.37053680419922, "geo/layer_14/stable_rank_gate_proj": 194.27537536621094, "geo/layer_14/stable_rank_down_proj": 168.3765411376953, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.733108401298523, "geo/layer_14/attn_entropy_mean": 6.894008636474609, "geo/layer_14/attn_entropy_std": 0.13142143189907074, "geo/layer_21/stable_rank_q_proj": 61.824222564697266, "geo/layer_21/stable_rank_k_proj": 36.867679595947266, "geo/layer_21/stable_rank_o_proj": 67.0426025390625, "geo/layer_21/stable_rank_gate_proj": 148.85733032226562, "geo/layer_21/stable_rank_down_proj": 155.81300354003906, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.4185284972190857, "geo/layer_21/attn_entropy_mean": 6.280249118804932, "geo/layer_21/attn_entropy_std": 0.4961497485637665, "geo/layer_27/stable_rank_q_proj": 61.04747772216797, "geo/layer_27/stable_rank_k_proj": 21.412010192871094, "geo/layer_27/stable_rank_o_proj": 78.5949478149414, "geo/layer_27/stable_rank_gate_proj": 86.62728118896484, "geo/layer_27/stable_rank_down_proj": 63.13056564331055, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.160909503698349, "geo/layer_27/attn_entropy_mean": 6.154914855957031, "geo/layer_27/attn_entropy_std": 0.2600218653678894, "attnres/final_alpha/block_0": 0.20126885175704956, "attnres/block_norm/0": 0.5876054763793945, "attnres/final_alpha/block_1": 0.013364076614379883, "attnres/block_norm/1": 2688.0732421875, "attnres/final_alpha/block_2": 0.08412333577871323, "attnres/block_norm/2": 1278.988037109375, "attnres/final_alpha/block_3": 0.041394028812646866, "attnres/block_norm/3": 1206.789794921875, "attnres/final_alpha/block_4": 0.06633549928665161, "attnres/block_norm/4": 1032.8486328125, "attnres/final_alpha/block_5": 0.3653460443019867, "attnres/block_norm/5": 974.258544921875, "attnres/final_alpha/block_6": 0.22816815972328186, "attnres/block_norm/6": 1171.2535400390625, "geo/tier1_time_s": 1.3651399612426758, "geo/step": 1500.0, "geo/rankme_slope": 0.25359829487325825} {"step": 1500, "timestamp": 1778196151.9432187, "geo/ww_alpha_mean": 7.541960304311452, "geo/ww_alpha_std": 5.548625826966039, "geo/ww_alpha_min": 1.9015921860217337, "geo/ww_alpha_max": 30.581689456800802, "geo/ww_alpha_healthy_frac": 0.27918781725888325, "geo/ww_alpha_by_type/q_proj": 4.318342380519124, "geo/ww_alpha_by_type/k_proj": 4.301214388834775, "geo/ww_alpha_by_type/v_proj": 5.70780665680192, "geo/ww_alpha_by_type/o_proj": 6.102015615501351, "geo/ww_alpha_by_type/gate_proj": 11.462616017431342, "geo/ww_alpha_by_type/up_proj": 11.910487503835895, "geo/ww_alpha_by_type/down_proj": 9.181875572586767, "geo/twonn_id/layer_0": 0.6607282757759094, "geo/twonn_id/layer_7": 2.058701276779175, "geo/twonn_id/layer_14": 2.0181503295898438, "geo/twonn_id/layer_21": 4.730832576751709, "geo/twonn_id/layer_27": 5.56469202041626, "geo/tier2_time_s": 7.099287271499634} {"step": 1500, "timestamp": 1778196153.127684, "eoc/jacobian_sigma/layer_0/attn": 1504.1888427734375, "eoc/jacobian_sigma/layer_0/mlp": 1161.3721923828125, "eoc/jacobian_sigma/layer_0": 1504.1888427734375, "eoc/jacobian_sigma/layer_7/attn": 1.0601177215576172, "eoc/jacobian_sigma/layer_7/mlp": 1.3343024253845215, "eoc/jacobian_sigma/layer_7": 1.3343024253845215, "eoc/jacobian_sigma/layer_14/attn": 1.295304298400879, "eoc/jacobian_sigma/layer_14/mlp": 2.2831835746765137, "eoc/jacobian_sigma/layer_14": 2.2831835746765137, "eoc/jacobian_sigma/layer_21/attn": 1.0331149101257324, "eoc/jacobian_sigma/layer_21/mlp": 3.740492820739746, "eoc/jacobian_sigma/layer_21": 3.740492820739746, "eoc/jacobian_sigma/layer_27/attn": 1.7878960371017456, "eoc/jacobian_sigma/layer_27/mlp": 12.391576766967773, "eoc/jacobian_sigma/layer_27": 12.391576766967773, "eoc/layer0_sigma": 1504.1888427734375, "eoc/sigma_max": 12.391576766967773, "eoc/sigma_min": 1.3343024253845215, "eoc/sigma_mean": 4.937388896942139, "eoc/time_s": 1.1780707836151123} {"step": 1510, "timestamp": 1778196163.5262437, "train/loss": 4.095396542549134, "train/z_loss": 0.0012810352258384229, "train/perplexity": 60.063152031145165, "train/grad_norm": 0.58984375, "optim/muon_lr": 0.00604, "optim/adamw_lr": 0.00018119999999999999, "perf/tokens_per_sec": 1045742.440356367, "perf/iters_per_sec": 0.498648853471931, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 2.0054192304611207, "data/tokens_consumed": 3168796672, "data/tokens_consumed_B": 3.168796672, "train/loss_slope": -0.0031020196220376203} {"step": 1520, "timestamp": 1778196173.8997414, "train/loss": 4.081232118606567, "train/z_loss": 0.0012842770665884018, "train/perplexity": 59.2183889970725, "train/grad_norm": 0.82421875, "optim/muon_lr": 0.00608, "optim/adamw_lr": 0.0001824, "perf/tokens_per_sec": 2022877.5980905304, "perf/iters_per_sec": 0.9645832052662517, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0367172002792358, "data/tokens_consumed": 3189768192, "data/tokens_consumed_B": 3.189768192, "train/loss_slope": -0.0030671083447026876} {"step": 1530, "timestamp": 1778196184.253174, "train/loss": 4.048073720932007, "train/z_loss": 0.0012892630882561206, "train/perplexity": 57.28699994513837, "train/grad_norm": 0.80859375, "optim/muon_lr": 0.0061200000000000004, "optim/adamw_lr": 0.0001836, "perf/tokens_per_sec": 2026645.9238427419, "perf/iters_per_sec": 0.9663800830091199, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.034789538383484, "data/tokens_consumed": 3210739712, "data/tokens_consumed_B": 3.210739712, "train/loss_slope": -0.0030344555583402192} {"step": 1540, "timestamp": 1778196194.607557, "train/loss": 4.063197040557862, "train/z_loss": 0.0012941460125148296, "train/perplexity": 58.15995389848216, "train/grad_norm": 1.171875, "optim/muon_lr": 0.00616, "optim/adamw_lr": 0.0001848, "perf/tokens_per_sec": 2026488.8089532445, "perf/iters_per_sec": 0.9663051647917006, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0348697662353517, "data/tokens_consumed": 3231711232, "data/tokens_consumed_B": 3.231711232, "train/loss_slope": -0.0030019031262133095} {"step": 1550, "timestamp": 1778196204.956249, "grad/layer_0/attn": 0.02420351654291153, "grad/layer_0/mlp": 0.05176675692200661, "grad/layer_0/attn_mlp_ratio": 0.4675494069026258, "grad/layer_4/attn": 0.0251142717897892, "grad/layer_4/mlp": 0.062442198395729065, "grad/layer_4/attn_mlp_ratio": 0.4022003131665355, "grad/layer_8/attn": 0.04543813318014145, "grad/layer_8/mlp": 0.04581967741250992, "grad/layer_8/attn_mlp_ratio": 0.9916729153699456, "grad/layer_12/attn": 0.019412565976381302, "grad/layer_12/mlp": 0.020188936963677406, "grad/layer_12/attn_mlp_ratio": 0.9615447269538078, "grad/layer_16/attn": 0.02286473661661148, "grad/layer_16/mlp": 0.03013671189546585, "grad/layer_16/attn_mlp_ratio": 0.7587004388551591, "grad/layer_20/attn": 0.21961145102977753, "grad/layer_20/mlp": 0.12608852982521057, "grad/layer_20/attn_mlp_ratio": 1.7417242564414073, "grad/layer_24/attn": 0.03623318299651146, "grad/layer_24/mlp": 0.05591462180018425, "grad/layer_24/attn_mlp_ratio": 0.6480090853729276, "grad/layer_27/attn": 0.021828651428222656, "grad/layer_27/mlp": 0.029663613066077232, "grad/layer_27/attn_mlp_ratio": 0.7358729803416363} {"step": 1550, "timestamp": 1778196204.9723694, "train/loss": 4.016871595382691, "train/z_loss": 0.001296351698692888, "train/perplexity": 55.52712251712912, "train/grad_norm": 0.75390625, "optim/muon_lr": 0.0062, "optim/adamw_lr": 0.000186, "perf/tokens_per_sec": 2024312.2536768748, "perf/iters_per_sec": 0.9652673023590445, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0359824657440186, "data/tokens_consumed": 3252682752, "data/tokens_consumed_B": 3.252682752, "train/loss_slope": -0.002971721888990542} {"step": 1560, "timestamp": 1778196215.3304434, "train/loss": 4.064361667633056, "train/z_loss": 0.001297138398513198, "train/perplexity": 58.22772801360865, "train/grad_norm": 0.96875, "optim/muon_lr": 0.00624, "optim/adamw_lr": 0.0001872, "perf/tokens_per_sec": 2025658.4425039075, "perf/iters_per_sec": 0.9659092152137315, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.035293984413147, "data/tokens_consumed": 3273654272, "data/tokens_consumed_B": 3.273654272, "train/loss_slope": -0.002932864554036389} {"step": 1570, "timestamp": 1778196225.6884794, "train/loss": 3.9636550426483153, "train/z_loss": 0.0013054265640676022, "train/perplexity": 52.64941053749741, "train/grad_norm": 0.7265625, "optim/muon_lr": 0.00628, "optim/adamw_lr": 0.00018839999999999997, "perf/tokens_per_sec": 2025547.9836230788, "perf/iters_per_sec": 0.9658565443148989, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0353504419326782, "data/tokens_consumed": 3294625792, "data/tokens_consumed_B": 3.294625792, "train/loss_slope": -0.0029033629574648366} {"step": 1575, "timestamp": 1778196231.4855921, "eos/sharpness": 59.47475433349608, "eos/L0_probe": 3.6754539012908936, "eos/L_plus": 3.9249463081359863, "eos/L_minus": 4.020709037780762, "eos/grad_norm": 0.6188288331031799, "eos/embed_grad_frac": 0.08717963844537735, "eos/time_s": 0.6318154335021973} {"step": 1575, "timestamp": 1778196232.868388, "geo/rankme_last": 324.9422607421875, "geo/layer_0/stable_rank_q_proj": 54.37993621826172, "geo/layer_0/stable_rank_k_proj": 49.35973358154297, "geo/layer_0/stable_rank_o_proj": 67.53215026855469, "geo/layer_0/stable_rank_gate_proj": 164.5016632080078, "geo/layer_0/stable_rank_down_proj": 53.68840408325195, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.04441887512803078, "geo/layer_0/attn_entropy_mean": 6.956466197967529, "geo/layer_0/attn_entropy_std": 0.039052870124578476, "geo/layer_7/stable_rank_q_proj": 27.625600814819336, "geo/layer_7/stable_rank_k_proj": 29.63729476928711, "geo/layer_7/stable_rank_o_proj": 116.92584228515625, "geo/layer_7/stable_rank_gate_proj": 174.21669006347656, "geo/layer_7/stable_rank_down_proj": 202.7411651611328, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.7218358516693115, "geo/layer_7/attn_entropy_mean": 5.940582275390625, "geo/layer_7/attn_entropy_std": 1.5584897994995117, "geo/layer_14/stable_rank_q_proj": 34.609561920166016, "geo/layer_14/stable_rank_k_proj": 22.750974655151367, "geo/layer_14/stable_rank_o_proj": 92.4462890625, "geo/layer_14/stable_rank_gate_proj": 194.3665008544922, "geo/layer_14/stable_rank_down_proj": 169.31529235839844, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.724136471748352, "geo/layer_14/attn_entropy_mean": 6.90020227432251, "geo/layer_14/attn_entropy_std": 0.13006730377674103, "geo/layer_21/stable_rank_q_proj": 61.075218200683594, "geo/layer_21/stable_rank_k_proj": 36.95530700683594, "geo/layer_21/stable_rank_o_proj": 67.31134796142578, "geo/layer_21/stable_rank_gate_proj": 148.4784698486328, "geo/layer_21/stable_rank_down_proj": 156.1351776123047, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.4141334593296051, "geo/layer_21/attn_entropy_mean": 6.303954124450684, "geo/layer_21/attn_entropy_std": 0.48575934767723083, "geo/layer_27/stable_rank_q_proj": 61.300872802734375, "geo/layer_27/stable_rank_k_proj": 21.632774353027344, "geo/layer_27/stable_rank_o_proj": 79.00360107421875, "geo/layer_27/stable_rank_gate_proj": 87.32170867919922, "geo/layer_27/stable_rank_down_proj": 63.73085403442383, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.16214239597320557, "geo/layer_27/attn_entropy_mean": 6.093061447143555, "geo/layer_27/attn_entropy_std": 0.2670348882675171, "attnres/final_alpha/block_0": 0.2051081508398056, "attnres/block_norm/0": 0.5972201824188232, "attnres/final_alpha/block_1": 0.013902625069022179, "attnres/block_norm/1": 2707.3310546875, "attnres/final_alpha/block_2": 0.07960784435272217, "attnres/block_norm/2": 1322.19189453125, "attnres/final_alpha/block_3": 0.04139178991317749, "attnres/block_norm/3": 1243.8172607421875, "attnres/final_alpha/block_4": 0.06100102886557579, "attnres/block_norm/4": 1049.456787109375, "attnres/final_alpha/block_5": 0.3663003742694855, "attnres/block_norm/5": 983.8919067382812, "attnres/final_alpha/block_6": 0.23268818855285645, "attnres/block_norm/6": 1203.736328125, "geo/tier1_time_s": 1.3631870746612549, "geo/step": 1575.0, "geo/rankme_slope": 0.24131779483139637} {"step": 1580, "timestamp": 1778196238.0519855, "train/loss": 3.922366714477539, "train/z_loss": 0.0013128216261975468, "train/perplexity": 50.5198695067685, "train/grad_norm": 0.67578125, "optim/muon_lr": 0.00632, "optim/adamw_lr": 0.00018959999999999997, "perf/tokens_per_sec": 1696928.7943542951, "perf/iters_per_sec": 0.8091587039729572, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.2358515024185182, "data/tokens_consumed": 3315597312, "data/tokens_consumed_B": 3.315597312, "train/loss_slope": -0.002870586374075678} {"step": 1590, "timestamp": 1778196248.40774, "train/loss": 3.9913347959518433, "train/z_loss": 0.001306101179216057, "train/perplexity": 54.12708978652255, "train/grad_norm": 0.90625, "optim/muon_lr": 0.00636, "optim/adamw_lr": 0.00019079999999999998, "perf/tokens_per_sec": 2026022.976969273, "perf/iters_per_sec": 0.9660830387922635, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.035107707977295, "data/tokens_consumed": 3336568832, "data/tokens_consumed_B": 3.336568832, "train/loss_slope": -0.0028324157742312317} {"step": 1600, "timestamp": 1778196258.7531316, "grad/layer_0/attn": 0.023690305650234222, "grad/layer_0/mlp": 0.05186886340379715, "grad/layer_0/attn_mlp_ratio": 0.4567346197685618, "grad/layer_4/attn": 0.026316020637750626, "grad/layer_4/mlp": 0.06268877536058426, "grad/layer_4/attn_mlp_ratio": 0.419788398232422, "grad/layer_8/attn": 0.059305839240550995, "grad/layer_8/mlp": 0.04911953955888748, "grad/layer_8/attn_mlp_ratio": 1.2073777493112243, "grad/layer_12/attn": 0.02219650335609913, "grad/layer_12/mlp": 0.022426679730415344, "grad/layer_12/attn_mlp_ratio": 0.9897364890364178, "grad/layer_16/attn": 0.029817162081599236, "grad/layer_16/mlp": 0.034199461340904236, "grad/layer_16/attn_mlp_ratio": 0.8718605739778242, "grad/layer_20/attn": 0.1578257828950882, "grad/layer_20/mlp": 0.12634457647800446, "grad/layer_20/attn_mlp_ratio": 1.2491694314844406, "grad/layer_24/attn": 0.040433429181575775, "grad/layer_24/mlp": 0.04885781928896904, "grad/layer_24/attn_mlp_ratio": 0.8275733482838308, "grad/layer_27/attn": 0.020982690155506134, "grad/layer_27/mlp": 0.024657707661390305, "grad/layer_27/attn_mlp_ratio": 0.8509586681192397} {"step": 1600, "timestamp": 1778196258.769191, "train/loss": 3.9637980222702027, "train/z_loss": 0.001312992279417813, "train/perplexity": 52.6569388684948, "train/grad_norm": 0.6796875, "optim/muon_lr": 0.0064, "optim/adamw_lr": 0.00019199999999999998, "perf/tokens_per_sec": 2025024.4940440252, "perf/iters_per_sec": 0.965606925031674, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0356180906295775, "data/tokens_consumed": 3357540352, "data/tokens_consumed_B": 3.357540352, "train/loss_slope": -0.0027937738887404595} {"step": 1610, "timestamp": 1778196269.1293006, "train/loss": 3.9113182544708254, "train/z_loss": 0.0013166176388040185, "train/perplexity": 49.96477486606535, "train/grad_norm": 0.69921875, "optim/muon_lr": 0.00644, "optim/adamw_lr": 0.00019319999999999998, "perf/tokens_per_sec": 2025364.8760562795, "perf/iters_per_sec": 0.9657692318231008, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0354440450668334, "data/tokens_consumed": 3378511872, "data/tokens_consumed_B": 3.378511872, "train/loss_slope": -0.002758937804290492} {"step": 1620, "timestamp": 1778196279.5013978, "train/loss": 3.8161619901657104, "train/z_loss": 0.0013271170435473322, "train/perplexity": 45.429514373813106, "train/grad_norm": 0.98828125, "optim/muon_lr": 0.0064800000000000005, "optim/adamw_lr": 0.00019439999999999998, "perf/tokens_per_sec": 2022917.2813143984, "perf/iters_per_sec": 0.9646021277019493, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0366968631744384, "data/tokens_consumed": 3399483392, "data/tokens_consumed_B": 3.399483392, "train/loss_slope": -0.0027270982435004974} {"step": 1630, "timestamp": 1778196289.880771, "train/loss": 3.921712350845337, "train/z_loss": 0.0013223304762504996, "train/perplexity": 50.486821955196845, "train/grad_norm": 1.0078125, "optim/muon_lr": 0.006520000000000001, "optim/adamw_lr": 0.00019559999999999998, "perf/tokens_per_sec": 2022033.5935416617, "perf/iters_per_sec": 0.9641807525356587, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.037149930000305, "data/tokens_consumed": 3420454912, "data/tokens_consumed_B": 3.420454912, "train/loss_slope": -0.0026938887095401282} {"step": 1640, "timestamp": 1778196300.2327876, "train/loss": 3.9295387506484984, "train/z_loss": 0.0013235131511464715, "train/perplexity": 50.8835022731864, "train/grad_norm": 0.69921875, "optim/muon_lr": 0.006560000000000001, "optim/adamw_lr": 0.00019679999999999999, "perf/tokens_per_sec": 2026851.7275036816, "perf/iters_per_sec": 0.9664782178419502, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0346844673156739, "data/tokens_consumed": 3441426432, "data/tokens_consumed_B": 3.441426432, "train/loss_slope": -0.0026556140585629532} {"step": 1650, "timestamp": 1778196310.5811315, "grad/layer_0/attn": 0.02179046720266342, "grad/layer_0/mlp": 0.04771172255277634, "grad/layer_0/attn_mlp_ratio": 0.4567109714575656, "grad/layer_4/attn": 0.024492178112268448, "grad/layer_4/mlp": 0.06016174703836441, "grad/layer_4/attn_mlp_ratio": 0.40710549937885837, "grad/layer_8/attn": 0.049991898238658905, "grad/layer_8/mlp": 0.040903110057115555, "grad/layer_8/attn_mlp_ratio": 1.2222028605314321, "grad/layer_12/attn": 0.021455058827996254, "grad/layer_12/mlp": 0.018642382696270943, "grad/layer_12/attn_mlp_ratio": 1.150875350134315, "grad/layer_16/attn": 0.02378646470606327, "grad/layer_16/mlp": 0.028693702071905136, "grad/layer_16/attn_mlp_ratio": 0.8289785878293987, "grad/layer_20/attn": 0.21811164915561676, "grad/layer_20/mlp": 0.13107101619243622, "grad/layer_20/attn_mlp_ratio": 1.6640723122874224, "grad/layer_24/attn": 0.042854804545640945, "grad/layer_24/mlp": 0.06211062893271446, "grad/layer_24/attn_mlp_ratio": 0.6899753747956533, "grad/layer_27/attn": 0.035688478499650955, "grad/layer_27/mlp": 0.053930796682834625, "grad/layer_27/attn_mlp_ratio": 0.6617458044122587} {"step": 1650, "timestamp": 1778196311.189976, "eos/sharpness": 131.63094520568845, "eos/L0_probe": 3.5540590286254883, "eos/L_plus": 3.9476490020751953, "eos/L_minus": 4.476778507232666, "eos/grad_norm": 0.8563780784606934, "eos/embed_grad_frac": 0.04359174892306328, "eos/time_s": 0.6059703826904297} {"step": 1650, "timestamp": 1778196311.2107813, "train/loss": 3.866551470756531, "train/z_loss": 0.0013278704253025353, "train/perplexity": 47.777340106917016, "train/grad_norm": 0.85546875, "optim/muon_lr": 0.006600000000000001, "optim/adamw_lr": 0.000198, "perf/tokens_per_sec": 1911307.2471497736, "perf/iters_per_sec": 0.911382316183936, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0972343683242798, "data/tokens_consumed": 3462397952, "data/tokens_consumed_B": 3.462397952, "train/loss_slope": -0.002623675982357681} {"step": 1650, "timestamp": 1778196312.5772135, "geo/rankme_last": 328.5090026855469, "geo/layer_0/stable_rank_q_proj": 54.268375396728516, "geo/layer_0/stable_rank_k_proj": 49.76457595825195, "geo/layer_0/stable_rank_o_proj": 67.26522827148438, "geo/layer_0/stable_rank_gate_proj": 164.6460418701172, "geo/layer_0/stable_rank_down_proj": 53.48787307739258, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.04030299931764603, "geo/layer_0/attn_entropy_mean": 6.950342178344727, "geo/layer_0/attn_entropy_std": 0.03959991782903671, "geo/layer_7/stable_rank_q_proj": 27.685121536254883, "geo/layer_7/stable_rank_k_proj": 29.728271484375, "geo/layer_7/stable_rank_o_proj": 116.08402252197266, "geo/layer_7/stable_rank_gate_proj": 174.63873291015625, "geo/layer_7/stable_rank_down_proj": 202.61886596679688, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.7179229855537415, "geo/layer_7/attn_entropy_mean": 5.909057140350342, "geo/layer_7/attn_entropy_std": 1.5665556192398071, "geo/layer_14/stable_rank_q_proj": 34.762611389160156, "geo/layer_14/stable_rank_k_proj": 22.84731674194336, "geo/layer_14/stable_rank_o_proj": 92.58180236816406, "geo/layer_14/stable_rank_gate_proj": 194.0063018798828, "geo/layer_14/stable_rank_down_proj": 169.20777893066406, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.7159680724143982, "geo/layer_14/attn_entropy_mean": 6.900769233703613, "geo/layer_14/attn_entropy_std": 0.13065709173679352, "geo/layer_21/stable_rank_q_proj": 60.26573944091797, "geo/layer_21/stable_rank_k_proj": 37.01942443847656, "geo/layer_21/stable_rank_o_proj": 67.70191192626953, "geo/layer_21/stable_rank_gate_proj": 148.21868896484375, "geo/layer_21/stable_rank_down_proj": 156.74798583984375, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.4059147238731384, "geo/layer_21/attn_entropy_mean": 6.335949897766113, "geo/layer_21/attn_entropy_std": 0.46465957164764404, "geo/layer_27/stable_rank_q_proj": 61.56055450439453, "geo/layer_27/stable_rank_k_proj": 21.91468620300293, "geo/layer_27/stable_rank_o_proj": 79.5218276977539, "geo/layer_27/stable_rank_gate_proj": 87.94644927978516, "geo/layer_27/stable_rank_down_proj": 64.43901062011719, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.15844452381134033, "geo/layer_27/attn_entropy_mean": 6.073949337005615, "geo/layer_27/attn_entropy_std": 0.2460266351699829, "attnres/final_alpha/block_0": 0.20709404349327087, "attnres/block_norm/0": 0.6071971654891968, "attnres/final_alpha/block_1": 0.01428491622209549, "attnres/block_norm/1": 2721.835693359375, "attnres/final_alpha/block_2": 0.07479177415370941, "attnres/block_norm/2": 1364.635498046875, "attnres/final_alpha/block_3": 0.04000788927078247, "attnres/block_norm/3": 1280.08251953125, "attnres/final_alpha/block_4": 0.05605168640613556, "attnres/block_norm/4": 1079.86083984375, "attnres/final_alpha/block_5": 0.37377113103866577, "attnres/block_norm/5": 991.848876953125, "attnres/final_alpha/block_6": 0.23399850726127625, "attnres/block_norm/6": 1227.6612548828125, "geo/tier1_time_s": 1.3620412349700928, "geo/step": 1650.0, "geo/rankme_slope": 0.22986783571708186} {"step": 1660, "timestamp": 1778196322.9370658, "train/loss": 3.7907240629196166, "train/z_loss": 0.001332836749497801, "train/perplexity": 44.28845629826697, "train/grad_norm": 0.91796875, "optim/muon_lr": 0.00664, "optim/adamw_lr": 0.0001992, "perf/tokens_per_sec": 1789049.4656996003, "perf/iters_per_sec": 0.8530852631090166, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.1722157716751098, "data/tokens_consumed": 3483369472, "data/tokens_consumed_B": 3.483369472, "train/loss_slope": -0.002599286027516898} {"step": 1670, "timestamp": 1778196333.2947617, "train/loss": 3.7578566789627077, "train/z_loss": 0.0013382640900090337, "train/perplexity": 42.856472292181195, "train/grad_norm": 0.51953125, "optim/muon_lr": 0.00668, "optim/adamw_lr": 0.0002004, "perf/tokens_per_sec": 2025808.7568337792, "perf/iters_per_sec": 0.9659808906716247, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0352171659469604, "data/tokens_consumed": 3504340992, "data/tokens_consumed_B": 3.504340992, "train/loss_slope": -0.002574171369582942} {"step": 1680, "timestamp": 1778196343.6597059, "train/loss": 3.7916285514831545, "train/z_loss": 0.0013415988069027662, "train/perplexity": 44.32853282213203, "train/grad_norm": 1.03125, "optim/muon_lr": 0.00672, "optim/adamw_lr": 0.0002016, "perf/tokens_per_sec": 2024672.948274964, "perf/iters_per_sec": 0.9654392949461765, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.035797905921936, "data/tokens_consumed": 3525312512, "data/tokens_consumed_B": 3.525312512, "train/loss_slope": -0.002548389044016859} {"step": 1690, "timestamp": 1778196354.0226843, "train/loss": 3.7812206745147705, "train/z_loss": 0.0013448725221678615, "train/perplexity": 43.86955951831011, "train/grad_norm": 0.78515625, "optim/muon_lr": 0.00676, "optim/adamw_lr": 0.0002028, "perf/tokens_per_sec": 2024785.6421767874, "perf/iters_per_sec": 0.9654930315860688, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0357402563095093, "data/tokens_consumed": 3546284032, "data/tokens_consumed_B": 3.546284032, "train/loss_slope": -0.0025241910676644295} {"step": 1700, "timestamp": 1778196364.3637064, "grad/layer_0/attn": 0.022692348808050156, "grad/layer_0/mlp": 0.04825684800744057, "grad/layer_0/attn_mlp_ratio": 0.4702410061578658, "grad/layer_4/attn": 0.022930437698960304, "grad/layer_4/mlp": 0.05203156918287277, "grad/layer_4/attn_mlp_ratio": 0.440702404617043, "grad/layer_8/attn": 0.05137951672077179, "grad/layer_8/mlp": 0.038463130593299866, "grad/layer_8/attn_mlp_ratio": 1.3358121347548526, "grad/layer_12/attn": 0.015245093032717705, "grad/layer_12/mlp": 0.017152410000562668, "grad/layer_12/attn_mlp_ratio": 0.8888018035563179, "grad/layer_16/attn": 0.02163471095263958, "grad/layer_16/mlp": 0.026076916605234146, "grad/layer_16/attn_mlp_ratio": 0.8296498852679569, "grad/layer_20/attn": 0.1117301881313324, "grad/layer_20/mlp": 0.11952607333660126, "grad/layer_20/attn_mlp_ratio": 0.9347766969906869, "grad/layer_24/attn": 0.04822917655110359, "grad/layer_24/mlp": 0.05969328060746193, "grad/layer_24/attn_mlp_ratio": 0.8079498392366752, "grad/layer_27/attn": 0.031862951815128326, "grad/layer_27/mlp": 0.0444023534655571, "grad/layer_27/attn_mlp_ratio": 0.7175960113935135} {"step": 1700, "timestamp": 1778196364.3794847, "train/loss": 3.7821913242340086, "train/z_loss": 0.0013441077549941837, "train/perplexity": 43.912162166719135, "train/grad_norm": 0.8203125, "optim/muon_lr": 0.0068000000000000005, "optim/adamw_lr": 0.000204, "perf/tokens_per_sec": 2025898.8534878811, "perf/iters_per_sec": 0.9660238521041303, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.035171127319336, "data/tokens_consumed": 3567255552, "data/tokens_consumed_B": 3.567255552, "train/loss_slope": -0.002498739462788671} {"step": 1710, "timestamp": 1778196374.7395222, "train/loss": 3.790706181526184, "train/z_loss": 0.0013471120619215071, "train/perplexity": 44.28766436603582, "train/grad_norm": 0.6875, "optim/muon_lr": 0.006840000000000001, "optim/adamw_lr": 0.0002052, "perf/tokens_per_sec": 2025142.5423517039, "perf/iters_per_sec": 0.9656632148512382, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0355577230453492, "data/tokens_consumed": 3588227072, "data/tokens_consumed_B": 3.588227072, "train/loss_slope": -0.002471528718867103} {"step": 1720, "timestamp": 1778196385.1017482, "train/loss": 3.820698547363281, "train/z_loss": 0.0013487186282873155, "train/perplexity": 45.6360761496238, "train/grad_norm": 0.6796875, "optim/muon_lr": 0.00688, "optim/adamw_lr": 0.00020639999999999998, "perf/tokens_per_sec": 2024918.905619795, "perf/iters_per_sec": 0.9655565765475249, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.035672092437744, "data/tokens_consumed": 3609198592, "data/tokens_consumed_B": 3.609198592, "train/loss_slope": -0.002442853440889324} {"step": 1725, "timestamp": 1778196390.8855748, "eos/sharpness": 102.22740173339842, "eos/L0_probe": 3.4466512203216553, "eos/L_plus": 4.216890811920166, "eos/L_minus": 3.698685646057129, "eos/grad_norm": 0.5975536704063416, "eos/embed_grad_frac": 0.09788260608911514, "eos/time_s": 0.6008706092834473} {"step": 1725, "timestamp": 1778196392.2722602, "geo/rankme_last": 330.71630859375, "geo/layer_0/stable_rank_q_proj": 54.09551239013672, "geo/layer_0/stable_rank_k_proj": 50.08453369140625, "geo/layer_0/stable_rank_o_proj": 67.03119659423828, "geo/layer_0/stable_rank_gate_proj": 164.67649841308594, "geo/layer_0/stable_rank_down_proj": 53.20491409301758, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.047771625220775604, "geo/layer_0/attn_entropy_mean": 6.942760467529297, "geo/layer_0/attn_entropy_std": 0.04063328728079796, "geo/layer_7/stable_rank_q_proj": 27.761154174804688, "geo/layer_7/stable_rank_k_proj": 29.90276527404785, "geo/layer_7/stable_rank_o_proj": 115.61599731445312, "geo/layer_7/stable_rank_gate_proj": 174.989990234375, "geo/layer_7/stable_rank_down_proj": 202.5531463623047, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.7077309489250183, "geo/layer_7/attn_entropy_mean": 5.881063461303711, "geo/layer_7/attn_entropy_std": 1.5674841403961182, "geo/layer_14/stable_rank_q_proj": 34.92720413208008, "geo/layer_14/stable_rank_k_proj": 22.96470069885254, "geo/layer_14/stable_rank_o_proj": 92.52176666259766, "geo/layer_14/stable_rank_gate_proj": 194.20712280273438, "geo/layer_14/stable_rank_down_proj": 168.70188903808594, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.7021661996841431, "geo/layer_14/attn_entropy_mean": 6.894597053527832, "geo/layer_14/attn_entropy_std": 0.1387673020362854, "geo/layer_21/stable_rank_q_proj": 59.35881805419922, "geo/layer_21/stable_rank_k_proj": 37.164485931396484, "geo/layer_21/stable_rank_o_proj": 68.05359649658203, "geo/layer_21/stable_rank_gate_proj": 147.77182006835938, "geo/layer_21/stable_rank_down_proj": 157.114501953125, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.3937281668186188, "geo/layer_21/attn_entropy_mean": 6.308934211730957, "geo/layer_21/attn_entropy_std": 0.49978265166282654, "geo/layer_27/stable_rank_q_proj": 61.76812744140625, "geo/layer_27/stable_rank_k_proj": 22.174272537231445, "geo/layer_27/stable_rank_o_proj": 80.14102172851562, "geo/layer_27/stable_rank_gate_proj": 88.53309631347656, "geo/layer_27/stable_rank_down_proj": 65.22186279296875, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.15472722053527832, "geo/layer_27/attn_entropy_mean": 5.995334148406982, "geo/layer_27/attn_entropy_std": 0.24554719030857086, "attnres/final_alpha/block_0": 0.20722073316574097, "attnres/block_norm/0": 0.6173866987228394, "attnres/final_alpha/block_1": 0.014373606070876122, "attnres/block_norm/1": 2741.39453125, "attnres/final_alpha/block_2": 0.07125228643417358, "attnres/block_norm/2": 1388.1575927734375, "attnres/final_alpha/block_3": 0.03980372101068497, "attnres/block_norm/3": 1303.42529296875, "attnres/final_alpha/block_4": 0.05242859944701195, "attnres/block_norm/4": 1110.750732421875, "attnres/final_alpha/block_5": 0.3768075704574585, "attnres/block_norm/5": 1020.642333984375, "attnres/final_alpha/block_6": 0.23811347782611847, "attnres/block_norm/6": 1250.7525634765625, "geo/tier1_time_s": 1.3681590557098389, "geo/step": 1725.0, "geo/rankme_slope": 0.21904699629631597} {"step": 1730, "timestamp": 1778196397.4627, "train/loss": 3.6881653308868407, "train/z_loss": 0.0013596659293398262, "train/perplexity": 39.971445267928395, "train/grad_norm": 0.70703125, "optim/muon_lr": 0.00692, "optim/adamw_lr": 0.00020759999999999998, "perf/tokens_per_sec": 1697643.4141208895, "perf/iters_per_sec": 0.8094994612316558, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.2353312730789185, "data/tokens_consumed": 3630170112, "data/tokens_consumed_B": 3.630170112, "train/loss_slope": -0.002422177448538807} {"step": 1740, "timestamp": 1778196407.8344727, "train/loss": 3.7274497985839843, "train/z_loss": 0.0013578983722254633, "train/perplexity": 41.57295345884252, "train/grad_norm": 0.90234375, "optim/muon_lr": 0.00696, "optim/adamw_lr": 0.00020879999999999998, "perf/tokens_per_sec": 2023642.6932209216, "perf/iters_per_sec": 0.9649480310539825, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0363252401351928, "data/tokens_consumed": 3651141632, "data/tokens_consumed_B": 3.651141632, "train/loss_slope": -0.00240377292900589} {"step": 1750, "timestamp": 1778196418.2090263, "grad/layer_0/attn": 0.015312016010284424, "grad/layer_0/mlp": 0.036213964223861694, "grad/layer_0/attn_mlp_ratio": 0.4228207625475348, "grad/layer_4/attn": 0.019454367458820343, "grad/layer_4/mlp": 0.04842350631952286, "grad/layer_4/attn_mlp_ratio": 0.40175462078840585, "grad/layer_8/attn": 0.035248033702373505, "grad/layer_8/mlp": 0.03261126950383186, "grad/layer_8/attn_mlp_ratio": 1.0808543834868611, "grad/layer_12/attn": 0.017446214333176613, "grad/layer_12/mlp": 0.016152989119291306, "grad/layer_12/attn_mlp_ratio": 1.0800610398687585, "grad/layer_16/attn": 0.021979035809636116, "grad/layer_16/mlp": 0.024714622646570206, "grad/layer_16/attn_mlp_ratio": 0.8893130206766469, "grad/layer_20/attn": 0.12010031193494797, "grad/layer_20/mlp": 0.1127644032239914, "grad/layer_20/attn_mlp_ratio": 1.065055180488822, "grad/layer_24/attn": 0.02927611768245697, "grad/layer_24/mlp": 0.04658512771129608, "grad/layer_24/attn_mlp_ratio": 0.6284434337294663, "grad/layer_27/attn": 0.01639466919004917, "grad/layer_27/mlp": 0.025437941774725914, "grad/layer_27/attn_mlp_ratio": 0.644496684157386} {"step": 1750, "timestamp": 1778196418.2242694, "train/loss": 3.8078760623931887, "train/z_loss": 0.0013557438855059446, "train/perplexity": 45.05464391850441, "train/grad_norm": 0.486328125, "optim/muon_lr": 0.006999999999999999, "optim/adamw_lr": 0.00020999999999999998, "perf/tokens_per_sec": 2019937.9319338177, "perf/iters_per_sec": 0.963181463210019, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0382259607315063, "data/tokens_consumed": 3672113152, "data/tokens_consumed_B": 3.672113152, "train/loss_slope": -0.002377425919314458} {"step": 1760, "timestamp": 1778196428.6049151, "train/loss": 3.6653128147125242, "train/z_loss": 0.0013665422447957098, "train/perplexity": 39.0683554081857, "train/grad_norm": 0.83984375, "optim/muon_lr": 0.007039999999999999, "optim/adamw_lr": 0.00021119999999999996, "perf/tokens_per_sec": 2021404.6522995555, "perf/iters_per_sec": 0.963880849981096, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.037472629547119, "data/tokens_consumed": 3693084672, "data/tokens_consumed_B": 3.693084672, "train/loss_slope": -0.0023566476391987724} {"step": 1770, "timestamp": 1778196438.9752278, "train/loss": 3.662385439872742, "train/z_loss": 0.0013703269534744323, "train/perplexity": 38.95415492289939, "train/grad_norm": 0.60546875, "optim/muon_lr": 0.0070799999999999995, "optim/adamw_lr": 0.00021239999999999996, "perf/tokens_per_sec": 2023255.7917805666, "perf/iters_per_sec": 0.964763542070659, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.036523413658142, "data/tokens_consumed": 3714056192, "data/tokens_consumed_B": 3.714056192, "train/loss_slope": -0.0023395779562182446} {"step": 1780, "timestamp": 1778196449.3513598, "train/loss": 3.6356701374053957, "train/z_loss": 0.0013750849175266922, "train/perplexity": 37.92726085896427, "train/grad_norm": 0.74609375, "optim/muon_lr": 0.00712, "optim/adamw_lr": 0.00021359999999999996, "perf/tokens_per_sec": 2022505.5912532294, "perf/iters_per_sec": 0.964405818583121, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0369078874588014, "data/tokens_consumed": 3735027712, "data/tokens_consumed_B": 3.735027712, "train/loss_slope": -0.0023205617190766472} {"step": 1790, "timestamp": 1778196459.7275715, "train/loss": 3.661728549003601, "train/z_loss": 0.001377110870089382, "train/perplexity": 38.92857469684374, "train/grad_norm": 0.45703125, "optim/muon_lr": 0.00716, "optim/adamw_lr": 0.00021479999999999996, "perf/tokens_per_sec": 2022086.5381835373, "perf/iters_per_sec": 0.9642059985082327, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0371227741241456, "data/tokens_consumed": 3755999232, "data/tokens_consumed_B": 3.755999232, "train/loss_slope": -0.002302100200993572} {"step": 1800, "timestamp": 1778196470.1038077, "grad/layer_0/attn": 0.020645175129175186, "grad/layer_0/mlp": 0.043090641498565674, "grad/layer_0/attn_mlp_ratio": 0.47911041384592395, "grad/layer_4/attn": 0.023066148161888123, "grad/layer_4/mlp": 0.05073276534676552, "grad/layer_4/attn_mlp_ratio": 0.4546597836479405, "grad/layer_8/attn": 0.04032804071903229, "grad/layer_8/mlp": 0.03752799704670906, "grad/layer_8/attn_mlp_ratio": 1.07461212388652, "grad/layer_12/attn": 0.02076723240315914, "grad/layer_12/mlp": 0.016117369756102562, "grad/layer_12/attn_mlp_ratio": 1.2885000833616775, "grad/layer_16/attn": 0.01684255339205265, "grad/layer_16/mlp": 0.022436633706092834, "grad/layer_16/attn_mlp_ratio": 0.7506720276140055, "grad/layer_20/attn": 0.1893346905708313, "grad/layer_20/mlp": 0.1018555536866188, "grad/layer_20/attn_mlp_ratio": 1.8588548540757628, "grad/layer_24/attn": 0.053271159529685974, "grad/layer_24/mlp": 0.06475003063678741, "grad/layer_24/attn_mlp_ratio": 0.82272021995227, "grad/layer_27/attn": 0.04301336407661438, "grad/layer_27/mlp": 0.05993150919675827, "grad/layer_27/attn_mlp_ratio": 0.7177086741404824} {"step": 1800, "timestamp": 1778196470.7133088, "eos/sharpness": 148.76732826232907, "eos/L0_probe": 3.355376958847046, "eos/L_plus": 3.8325157165527344, "eos/L_minus": 4.365911483764648, "eos/grad_norm": 1.002853274345398, "eos/embed_grad_frac": 0.02258225344121456, "eos/time_s": 0.6064300537109375} {"step": 1800, "timestamp": 1778196470.7405307, "train/loss": 3.5786852836608887, "train/z_loss": 0.001383274677209556, "train/perplexity": 35.82640830651889, "train/grad_norm": 1.0859375, "optim/muon_lr": 0.0072, "optim/adamw_lr": 0.00021599999999999996, "perf/tokens_per_sec": 1905099.3382509472, "perf/iters_per_sec": 0.9084221545462356, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.1008097887039185, "data/tokens_consumed": 3776970752, "data/tokens_consumed_B": 3.776970752, "train/loss_slope": -0.0022893246199896557} {"step": 1800, "timestamp": 1778196472.1062512, "geo/rankme_last": 338.2553405761719, "geo/layer_0/stable_rank_q_proj": 53.8943977355957, "geo/layer_0/stable_rank_k_proj": 50.42884826660156, "geo/layer_0/stable_rank_o_proj": 66.93534851074219, "geo/layer_0/stable_rank_gate_proj": 164.78578186035156, "geo/layer_0/stable_rank_down_proj": 52.916324615478516, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.04514562711119652, "geo/layer_0/attn_entropy_mean": 6.932414531707764, "geo/layer_0/attn_entropy_std": 0.04263002797961235, "geo/layer_7/stable_rank_q_proj": 27.868350982666016, "geo/layer_7/stable_rank_k_proj": 30.062158584594727, "geo/layer_7/stable_rank_o_proj": 115.06652069091797, "geo/layer_7/stable_rank_gate_proj": 175.3647918701172, "geo/layer_7/stable_rank_down_proj": 203.42970275878906, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.6953248977661133, "geo/layer_7/attn_entropy_mean": 5.863413333892822, "geo/layer_7/attn_entropy_std": 1.5674782991409302, "geo/layer_14/stable_rank_q_proj": 35.08330535888672, "geo/layer_14/stable_rank_k_proj": 23.09691047668457, "geo/layer_14/stable_rank_o_proj": 92.50326538085938, "geo/layer_14/stable_rank_gate_proj": 194.27841186523438, "geo/layer_14/stable_rank_down_proj": 167.59347534179688, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.6890738606452942, "geo/layer_14/attn_entropy_mean": 6.888846397399902, "geo/layer_14/attn_entropy_std": 0.14822018146514893, "geo/layer_21/stable_rank_q_proj": 58.42908477783203, "geo/layer_21/stable_rank_k_proj": 37.31815719604492, "geo/layer_21/stable_rank_o_proj": 68.524169921875, "geo/layer_21/stable_rank_gate_proj": 147.15782165527344, "geo/layer_21/stable_rank_down_proj": 157.24620056152344, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.3937793970108032, "geo/layer_21/attn_entropy_mean": 6.278382301330566, "geo/layer_21/attn_entropy_std": 0.5117478966712952, "geo/layer_27/stable_rank_q_proj": 61.92304992675781, "geo/layer_27/stable_rank_k_proj": 22.521272659301758, "geo/layer_27/stable_rank_o_proj": 81.05070495605469, "geo/layer_27/stable_rank_gate_proj": 89.15512084960938, "geo/layer_27/stable_rank_down_proj": 65.99932098388672, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.1333385705947876, "geo/layer_27/attn_entropy_mean": 5.93836784362793, "geo/layer_27/attn_entropy_std": 0.24969804286956787, "attnres/final_alpha/block_0": 0.2118983268737793, "attnres/block_norm/0": 0.6280279755592346, "attnres/final_alpha/block_1": 0.014967490918934345, "attnres/block_norm/1": 2752.875244140625, "attnres/final_alpha/block_2": 0.06998094916343689, "attnres/block_norm/2": 1415.170654296875, "attnres/final_alpha/block_3": 0.0405503548681736, "attnres/block_norm/3": 1331.71923828125, "attnres/final_alpha/block_4": 0.0508820116519928, "attnres/block_norm/4": 1156.742431640625, "attnres/final_alpha/block_5": 0.3623257875442505, "attnres/block_norm/5": 1069.22216796875, "attnres/final_alpha/block_6": 0.24939508736133575, "attnres/block_norm/6": 1270.11572265625, "geo/tier1_time_s": 1.3621220588684082, "geo/step": 1800.0, "geo/rankme_slope": 0.20952929757925182} {"step": 1810, "timestamp": 1778196482.491478, "train/loss": 3.632843852043152, "train/z_loss": 0.0013784970971755684, "train/perplexity": 37.82021893353531, "train/grad_norm": 0.78125, "optim/muon_lr": 0.00724, "optim/adamw_lr": 0.00021719999999999997, "perf/tokens_per_sec": 1785245.6652909636, "perf/iters_per_sec": 0.8512714697317904, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.174713397026062, "data/tokens_consumed": 3797942272, "data/tokens_consumed_B": 3.797942272, "train/loss_slope": -0.0022689958894857227} {"step": 1820, "timestamp": 1778196492.8601854, "train/loss": 3.5992074489593504, "train/z_loss": 0.0013844323810189962, "train/perplexity": 36.56923996620307, "train/grad_norm": 0.59375, "optim/muon_lr": 0.00728, "optim/adamw_lr": 0.00021839999999999997, "perf/tokens_per_sec": 2023591.0171195692, "perf/iters_per_sec": 0.9649233899686667, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.036351704597473, "data/tokens_consumed": 3818913792, "data/tokens_consumed_B": 3.818913792, "train/loss_slope": -0.0022532592452970883} {"step": 1830, "timestamp": 1778196503.2485406, "train/loss": 3.564742112159729, "train/z_loss": 0.0013909876346588136, "train/perplexity": 35.330340964009174, "train/grad_norm": 0.6484375, "optim/muon_lr": 0.00732, "optim/adamw_lr": 0.00021959999999999997, "perf/tokens_per_sec": 2019996.0551722238, "perf/iters_per_sec": 0.9632091785298461, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.038196086883545, "data/tokens_consumed": 3839885312, "data/tokens_consumed_B": 3.839885312, "train/loss_slope": -0.0022412110428104808} {"step": 1840, "timestamp": 1778196513.6107855, "train/loss": 3.5499752283096315, "train/z_loss": 0.0013909154222346842, "train/perplexity": 34.812455113561796, "train/grad_norm": 0.87890625, "optim/muon_lr": 0.00736, "optim/adamw_lr": 0.00022079999999999997, "perf/tokens_per_sec": 2025140.1178404507, "perf/iters_per_sec": 0.9656620587541822, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0355589628219604, "data/tokens_consumed": 3860856832, "data/tokens_consumed_B": 3.860856832, "train/loss_slope": -0.0022276063683581647} {"step": 1850, "timestamp": 1778196523.9715483, "grad/layer_0/attn": 0.015753287822008133, "grad/layer_0/mlp": 0.03429088369011879, "grad/layer_0/attn_mlp_ratio": 0.4594016275120788, "grad/layer_4/attn": 0.017413629218935966, "grad/layer_4/mlp": 0.04330745339393616, "grad/layer_4/attn_mlp_ratio": 0.40209312287027443, "grad/layer_8/attn": 0.04230920597910881, "grad/layer_8/mlp": 0.02767164818942547, "grad/layer_8/attn_mlp_ratio": 1.5289731040444374, "grad/layer_12/attn": 0.014645928516983986, "grad/layer_12/mlp": 0.015716753900051117, "grad/layer_12/attn_mlp_ratio": 0.9318672619636568, "grad/layer_16/attn": 0.017912713810801506, "grad/layer_16/mlp": 0.023156490176916122, "grad/layer_16/attn_mlp_ratio": 0.7735504645390087, "grad/layer_20/attn": 0.0949791669845581, "grad/layer_20/mlp": 0.10380709916353226, "grad/layer_20/attn_mlp_ratio": 0.9149582991760234, "grad/layer_24/attn": 0.04065577685832977, "grad/layer_24/mlp": 0.0550546795129776, "grad/layer_24/attn_mlp_ratio": 0.7384617827972304, "grad/layer_27/attn": 0.03740236535668373, "grad/layer_27/mlp": 0.052084311842918396, "grad/layer_27/attn_mlp_ratio": 0.7181119220250948} {"step": 1850, "timestamp": 1778196523.9870532, "train/loss": 3.5503689765930178, "train/z_loss": 0.0013903450104407965, "train/perplexity": 34.826165156979116, "train/grad_norm": 0.75390625, "optim/muon_lr": 0.0074, "optim/adamw_lr": 0.00022199999999999998, "perf/tokens_per_sec": 2022023.5069488967, "perf/iters_per_sec": 0.9641759428734287, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0371551036834716, "data/tokens_consumed": 3881828352, "data/tokens_consumed_B": 3.881828352, "train/loss_slope": -0.0022145744576670193} {"step": 1860, "timestamp": 1778196534.3628352, "train/loss": 3.5083740234375, "train/z_loss": 0.0013984123594127595, "train/perplexity": 33.39392587577893, "train/grad_norm": 0.8203125, "optim/muon_lr": 0.00744, "optim/adamw_lr": 0.00022319999999999998, "perf/tokens_per_sec": 2022402.8227919997, "perf/iters_per_sec": 0.9643568147621153, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0369605779647828, "data/tokens_consumed": 3902799872, "data/tokens_consumed_B": 3.902799872, "train/loss_slope": -0.002205444915209523} {"step": 1870, "timestamp": 1778196544.723997, "train/loss": 3.525292897224426, "train/z_loss": 0.0014032157021574676, "train/perplexity": 33.963720038867216, "train/grad_norm": 0.4609375, "optim/muon_lr": 0.0074800000000000005, "optim/adamw_lr": 0.00022439999999999998, "perf/tokens_per_sec": 2025026.5453189171, "perf/iters_per_sec": 0.9656079031557642, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0356170415878296, "data/tokens_consumed": 3923771392, "data/tokens_consumed_B": 3.923771392, "train/loss_slope": -0.002191749293590286} {"step": 1875, "timestamp": 1778196550.4963362, "eos/sharpness": 70.76690196990965, "eos/L0_probe": 3.2791635990142822, "eos/L_plus": 3.750279664993286, "eos/L_minus": 3.515716552734375, "eos/grad_norm": 0.6842663884162903, "eos/embed_grad_frac": 0.06265764683485031, "eos/time_s": 0.5990557670593262} {"step": 1875, "timestamp": 1778196551.8779905, "geo/rankme_last": 338.54302978515625, "geo/layer_0/stable_rank_q_proj": 53.79307556152344, "geo/layer_0/stable_rank_k_proj": 50.555206298828125, "geo/layer_0/stable_rank_o_proj": 66.83646392822266, "geo/layer_0/stable_rank_gate_proj": 164.61587524414062, "geo/layer_0/stable_rank_down_proj": 52.61637496948242, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.03998185321688652, "geo/layer_0/attn_entropy_mean": 6.9230146408081055, "geo/layer_0/attn_entropy_std": 0.044334474951028824, "geo/layer_7/stable_rank_q_proj": 27.974010467529297, "geo/layer_7/stable_rank_k_proj": 30.204912185668945, "geo/layer_7/stable_rank_o_proj": 114.81290435791016, "geo/layer_7/stable_rank_gate_proj": 175.82235717773438, "geo/layer_7/stable_rank_down_proj": 204.03062438964844, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.6872706413269043, "geo/layer_7/attn_entropy_mean": 5.844377517700195, "geo/layer_7/attn_entropy_std": 1.5596868991851807, "geo/layer_14/stable_rank_q_proj": 35.242923736572266, "geo/layer_14/stable_rank_k_proj": 23.21474838256836, "geo/layer_14/stable_rank_o_proj": 92.66454315185547, "geo/layer_14/stable_rank_gate_proj": 194.48873901367188, "geo/layer_14/stable_rank_down_proj": 166.94541931152344, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.6778057217597961, "geo/layer_14/attn_entropy_mean": 6.907205104827881, "geo/layer_14/attn_entropy_std": 0.12243663519620895, "geo/layer_21/stable_rank_q_proj": 57.50840377807617, "geo/layer_21/stable_rank_k_proj": 37.57235336303711, "geo/layer_21/stable_rank_o_proj": 69.03618621826172, "geo/layer_21/stable_rank_gate_proj": 146.7196502685547, "geo/layer_21/stable_rank_down_proj": 157.60826110839844, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.3887242376804352, "geo/layer_21/attn_entropy_mean": 6.239591598510742, "geo/layer_21/attn_entropy_std": 0.5394818782806396, "geo/layer_27/stable_rank_q_proj": 61.944026947021484, "geo/layer_27/stable_rank_k_proj": 22.86415672302246, "geo/layer_27/stable_rank_o_proj": 81.8360366821289, "geo/layer_27/stable_rank_gate_proj": 89.80616760253906, "geo/layer_27/stable_rank_down_proj": 66.85550689697266, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.15048471093177795, "geo/layer_27/attn_entropy_mean": 5.885192394256592, "geo/layer_27/attn_entropy_std": 0.2526438236236572, "attnres/final_alpha/block_0": 0.21201682090759277, "attnres/block_norm/0": 0.6383979320526123, "attnres/final_alpha/block_1": 0.015130408108234406, "attnres/block_norm/1": 2781.709228515625, "attnres/final_alpha/block_2": 0.06634427607059479, "attnres/block_norm/2": 1450.443115234375, "attnres/final_alpha/block_3": 0.03944941610097885, "attnres/block_norm/3": 1361.501708984375, "attnres/final_alpha/block_4": 0.0488172210752964, "attnres/block_norm/4": 1186.338623046875, "attnres/final_alpha/block_5": 0.36756473779678345, "attnres/block_norm/5": 1103.41064453125, "attnres/final_alpha/block_6": 0.2506771385669708, "attnres/block_norm/6": 1303.4375, "geo/tier1_time_s": 1.3634648323059082, "geo/step": 1875.0, "geo/rankme_slope": 0.20028723156703504} {"step": 1880, "timestamp": 1778196557.0761428, "train/loss": 3.54536771774292, "train/z_loss": 0.0013997718575410544, "train/perplexity": 34.652425311383155, "train/grad_norm": 0.51953125, "optim/muon_lr": 0.00752, "optim/adamw_lr": 0.00022559999999999998, "perf/tokens_per_sec": 1698612.123862197, "perf/iters_per_sec": 0.8099613780318246, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.2346267700195312, "data/tokens_consumed": 3944742912, "data/tokens_consumed_B": 3.944742912, "train/loss_slope": -0.0021750224765413628} {"step": 1890, "timestamp": 1778196567.4505048, "train/loss": 3.473217749595642, "train/z_loss": 0.0014108938514254988, "train/perplexity": 32.240316983858875, "train/grad_norm": 0.55859375, "optim/muon_lr": 0.00756, "optim/adamw_lr": 0.00022679999999999998, "perf/tokens_per_sec": 2022985.811813482, "perf/iters_per_sec": 0.9646348055903826, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0366617441177368, "data/tokens_consumed": 3965714432, "data/tokens_consumed_B": 3.965714432, "train/loss_slope": -0.002162909123964078} {"step": 1900, "timestamp": 1778196577.7988372, "grad/layer_0/attn": 0.015313572250306606, "grad/layer_0/mlp": 0.03463832288980484, "grad/layer_0/attn_mlp_ratio": 0.44209912399090096, "grad/layer_4/attn": 0.017750831320881844, "grad/layer_4/mlp": 0.04106713831424713, "grad/layer_4/attn_mlp_ratio": 0.4322393038888649, "grad/layer_8/attn": 0.038721077144145966, "grad/layer_8/mlp": 0.025994574651122093, "grad/layer_8/attn_mlp_ratio": 1.489583019336545, "grad/layer_12/attn": 0.012717357836663723, "grad/layer_12/mlp": 0.013886570930480957, "grad/layer_12/attn_mlp_ratio": 0.915802598694032, "grad/layer_16/attn": 0.021653225645422935, "grad/layer_16/mlp": 0.018820835277438164, "grad/layer_16/attn_mlp_ratio": 1.150492271526914, "grad/layer_20/attn": 0.2090250849723816, "grad/layer_20/mlp": 0.10087843239307404, "grad/layer_20/attn_mlp_ratio": 2.072049295440158, "grad/layer_24/attn": 0.037571318447589874, "grad/layer_24/mlp": 0.04803437739610672, "grad/layer_24/attn_mlp_ratio": 0.782175608513613, "grad/layer_27/attn": 0.018657049164175987, "grad/layer_27/mlp": 0.03089170716702938, "grad/layer_27/attn_mlp_ratio": 0.6039500828783456} {"step": 1900, "timestamp": 1778196577.8139923, "train/loss": 3.62757523059845, "train/z_loss": 0.0013981049181893469, "train/perplexity": 37.62148251022602, "train/grad_norm": 0.5703125, "optim/muon_lr": 0.0076, "optim/adamw_lr": 0.00022799999999999999, "perf/tokens_per_sec": 2024571.357412174, "perf/iters_per_sec": 0.9653908526478644, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0358498811721801, "data/tokens_consumed": 3986685952, "data/tokens_consumed_B": 3.986685952, "train/loss_slope": -0.0021402810208188713} {"step": 1910, "timestamp": 1778196588.197999, "train/loss": 3.452840209007263, "train/z_loss": 0.0014145633089356125, "train/perplexity": 31.58998718056798, "train/grad_norm": 0.90234375, "optim/muon_lr": 0.00764, "optim/adamw_lr": 0.0002292, "perf/tokens_per_sec": 2020693.8414511369, "perf/iters_per_sec": 0.9635419089561161, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0378375768661499, "data/tokens_consumed": 4007657472, "data/tokens_consumed_B": 4.007657472, "train/loss_slope": -0.0021279455059897315} {"step": 1920, "timestamp": 1778196598.5786073, "train/loss": 3.4553876876831056, "train/z_loss": 0.0014180778292939068, "train/perplexity": 31.670564590320584, "train/grad_norm": 0.7578125, "optim/muon_lr": 0.00768, "optim/adamw_lr": 0.0002304, "perf/tokens_per_sec": 2021336.4146839483, "perf/iters_per_sec": 0.9638483117503873, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0375076532363892, "data/tokens_consumed": 4028628992, "data/tokens_consumed_B": 4.028628992, "train/loss_slope": -0.002114688670510041} {"step": 1930, "timestamp": 1778196608.935735, "train/loss": 3.5327611446380613, "train/z_loss": 0.0014137527556158601, "train/perplexity": 34.21831902396308, "train/grad_norm": 0.7109375, "optim/muon_lr": 0.00772, "optim/adamw_lr": 0.0002316, "perf/tokens_per_sec": 2026417.426611164, "perf/iters_per_sec": 0.966271127038557, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0349062204360961, "data/tokens_consumed": 4049600512, "data/tokens_consumed_B": 4.049600512, "train/loss_slope": -0.0020956881182636546} {"step": 1940, "timestamp": 1778196619.293816, "train/loss": 3.4437735795974733, "train/z_loss": 0.0014218418975360692, "train/perplexity": 31.304866966560915, "train/grad_norm": 1.171875, "optim/muon_lr": 0.00776, "optim/adamw_lr": 0.0002328, "perf/tokens_per_sec": 2025690.257642461, "perf/iters_per_sec": 0.9659243858539872, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0352777242660522, "data/tokens_consumed": 4070572032, "data/tokens_consumed_B": 4.070572032, "train/loss_slope": -0.0020793212574879544} {"step": 1950, "timestamp": 1778196629.6407924, "grad/layer_0/attn": 0.014499410055577755, "grad/layer_0/mlp": 0.031188590452075005, "grad/layer_0/attn_mlp_ratio": 0.4648946874136028, "grad/layer_4/attn": 0.015516228973865509, "grad/layer_4/mlp": 0.03804770112037659, "grad/layer_4/attn_mlp_ratio": 0.4078098932703912, "grad/layer_8/attn": 0.02870984375476837, "grad/layer_8/mlp": 0.025688817724585533, "grad/layer_8/attn_mlp_ratio": 1.1176008156860981, "grad/layer_12/attn": 0.01225853618234396, "grad/layer_12/mlp": 0.01346651278436184, "grad/layer_12/attn_mlp_ratio": 0.9102977353981031, "grad/layer_16/attn": 0.016276951879262924, "grad/layer_16/mlp": 0.018084730952978134, "grad/layer_16/attn_mlp_ratio": 0.9000383711309042, "grad/layer_20/attn": 0.07323948293924332, "grad/layer_20/mlp": 0.0870867446064949, "grad/layer_20/attn_mlp_ratio": 0.840994610443639, "grad/layer_24/attn": 0.027297213673591614, "grad/layer_24/mlp": 0.04068658500909805, "grad/layer_24/attn_mlp_ratio": 0.6709143468392877, "grad/layer_27/attn": 0.014347662217915058, "grad/layer_27/mlp": 0.026315858587622643, "grad/layer_27/attn_mlp_ratio": 0.5452097303084894} {"step": 1950, "timestamp": 1778196630.2527413, "eos/sharpness": 64.1118288040161, "eos/L0_probe": 3.2061684131622314, "eos/L_plus": 3.4980483055114746, "eos/L_minus": 3.5554068088531494, "eos/grad_norm": 0.47107794880867004, "eos/embed_grad_frac": 0.0844585970044136, "eos/time_s": 0.6090948581695557} {"step": 1950, "timestamp": 1778196630.2724526, "train/loss": 3.457537221908569, "train/z_loss": 0.001423451316077262, "train/perplexity": 31.738714772180778, "train/grad_norm": 0.470703125, "optim/muon_lr": 0.0078000000000000005, "optim/adamw_lr": 0.000234, "perf/tokens_per_sec": 1911640.549094033, "perf/iters_per_sec": 0.9115412469358601, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0970430612564086, "data/tokens_consumed": 4091543552, "data/tokens_consumed_B": 4.091543552, "train/loss_slope": -0.002059567771557391} {"step": 1950, "timestamp": 1778196631.6379797, "geo/rankme_last": 339.7481384277344, "geo/layer_0/stable_rank_q_proj": 53.67568588256836, "geo/layer_0/stable_rank_k_proj": 50.854591369628906, "geo/layer_0/stable_rank_o_proj": 66.59138488769531, "geo/layer_0/stable_rank_gate_proj": 164.7765655517578, "geo/layer_0/stable_rank_down_proj": 52.416927337646484, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.044082872569561005, "geo/layer_0/attn_entropy_mean": 6.916173458099365, "geo/layer_0/attn_entropy_std": 0.04223713278770447, "geo/layer_7/stable_rank_q_proj": 28.08991050720215, "geo/layer_7/stable_rank_k_proj": 30.338111877441406, "geo/layer_7/stable_rank_o_proj": 114.4891357421875, "geo/layer_7/stable_rank_gate_proj": 176.1451416015625, "geo/layer_7/stable_rank_down_proj": 204.2093048095703, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.6867880821228027, "geo/layer_7/attn_entropy_mean": 5.817826747894287, "geo/layer_7/attn_entropy_std": 1.5641943216323853, "geo/layer_14/stable_rank_q_proj": 35.35768508911133, "geo/layer_14/stable_rank_k_proj": 23.378076553344727, "geo/layer_14/stable_rank_o_proj": 92.68547821044922, "geo/layer_14/stable_rank_gate_proj": 194.60757446289062, "geo/layer_14/stable_rank_down_proj": 166.09231567382812, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.6680564880371094, "geo/layer_14/attn_entropy_mean": 6.878613471984863, "geo/layer_14/attn_entropy_std": 0.13935743272304535, "geo/layer_21/stable_rank_q_proj": 56.612220764160156, "geo/layer_21/stable_rank_k_proj": 37.76058578491211, "geo/layer_21/stable_rank_o_proj": 69.5431900024414, "geo/layer_21/stable_rank_gate_proj": 146.21273803710938, "geo/layer_21/stable_rank_down_proj": 158.12744140625, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.37903863191604614, "geo/layer_21/attn_entropy_mean": 6.231312274932861, "geo/layer_21/attn_entropy_std": 0.534551739692688, "geo/layer_27/stable_rank_q_proj": 62.165653228759766, "geo/layer_27/stable_rank_k_proj": 23.16790771484375, "geo/layer_27/stable_rank_o_proj": 82.82612609863281, "geo/layer_27/stable_rank_gate_proj": 90.4673080444336, "geo/layer_27/stable_rank_down_proj": 67.70649719238281, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.14716698229312897, "geo/layer_27/attn_entropy_mean": 5.833150863647461, "geo/layer_27/attn_entropy_std": 0.25392213463783264, "attnres/final_alpha/block_0": 0.21128877997398376, "attnres/block_norm/0": 0.6491937637329102, "attnres/final_alpha/block_1": 0.015039913356304169, "attnres/block_norm/1": 2808.4873046875, "attnres/final_alpha/block_2": 0.06338755786418915, "attnres/block_norm/2": 1495.7420654296875, "attnres/final_alpha/block_3": 0.037767261266708374, "attnres/block_norm/3": 1395.5299072265625, "attnres/final_alpha/block_4": 0.04641745239496231, "attnres/block_norm/4": 1202.085693359375, "attnres/final_alpha/block_5": 0.3757334351539612, "attnres/block_norm/5": 1120.114501953125, "attnres/final_alpha/block_6": 0.25036561489105225, "attnres/block_norm/6": 1350.7880859375, "geo/tier1_time_s": 1.3611841201782227, "geo/step": 1950.0, "geo/rankme_slope": 0.19149020206263553} {"step": 1960, "timestamp": 1778196641.9964833, "train/loss": 3.456609535217285, "train/z_loss": 0.0014244751422666014, "train/perplexity": 31.70928484187612, "train/grad_norm": 0.64453125, "optim/muon_lr": 0.00784, "optim/adamw_lr": 0.0002352, "perf/tokens_per_sec": 1789357.213216684, "perf/iters_per_sec": 0.8532320085605068, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.1720141649246216, "data/tokens_consumed": 4112515072, "data/tokens_consumed_B": 4.112515072, "train/loss_slope": -0.0020452931864927597} {"step": 1970, "timestamp": 1778196652.3555255, "train/loss": 3.5309706687927247, "train/z_loss": 0.0014214757829904556, "train/perplexity": 34.15710676617003, "train/grad_norm": 0.640625, "optim/muon_lr": 0.00788, "optim/adamw_lr": 0.0002364, "perf/tokens_per_sec": 2025545.2316341684, "perf/iters_per_sec": 0.9658552320643274, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.035351848602295, "data/tokens_consumed": 4133486592, "data/tokens_consumed_B": 4.133486592, "train/loss_slope": -0.0020208986119397082} {"step": 1980, "timestamp": 1778196662.7194233, "train/loss": 3.4957095623016357, "train/z_loss": 0.001423167437314987, "train/perplexity": 32.97367653200977, "train/grad_norm": 0.84765625, "optim/muon_lr": 0.00792, "optim/adamw_lr": 0.0002376, "perf/tokens_per_sec": 2024549.968708651, "perf/iters_per_sec": 0.9653806537192587, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.035860824584961, "data/tokens_consumed": 4154458112, "data/tokens_consumed_B": 4.154458112, "train/loss_slope": -0.0019997690745741977} {"step": 1990, "timestamp": 1778196673.0727117, "train/loss": 3.3954811334609984, "train/z_loss": 0.0014351540827192365, "train/perplexity": 29.82900175379468, "train/grad_norm": 0.9453125, "optim/muon_lr": 0.00796, "optim/adamw_lr": 0.0002388, "perf/tokens_per_sec": 2026465.232149342, "perf/iters_per_sec": 0.9662939224955283, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0348818063735963, "data/tokens_consumed": 4175429632, "data/tokens_consumed_B": 4.175429632, "train/loss_slope": -0.0019846447945499504} {"step": 2000, "timestamp": 1778196683.4174485, "grad/layer_0/attn": 0.015329672954976559, "grad/layer_0/mlp": 0.033955320715904236, "grad/layer_0/attn_mlp_ratio": 0.45146600257701985, "grad/layer_4/attn": 0.017848698422312737, "grad/layer_4/mlp": 0.04121740534901619, "grad/layer_4/attn_mlp_ratio": 0.4330378932849293, "grad/layer_8/attn": 0.03010965697467327, "grad/layer_8/mlp": 0.024339810013771057, "grad/layer_8/attn_mlp_ratio": 1.237053897870704, "grad/layer_12/attn": 0.02981884405016899, "grad/layer_12/mlp": 0.016208725050091743, "grad/layer_12/attn_mlp_ratio": 1.839678554238438, "grad/layer_16/attn": 0.020889822393655777, "grad/layer_16/mlp": 0.020521795377135277, "grad/layer_16/attn_mlp_ratio": 1.0179334657598815, "grad/layer_20/attn": 0.14554905891418457, "grad/layer_20/mlp": 0.09252172708511353, "grad/layer_20/attn_mlp_ratio": 1.5731338285868381, "grad/layer_24/attn": 0.026924243196845055, "grad/layer_24/mlp": 0.04885488376021385, "grad/layer_24/attn_mlp_ratio": 0.5511064824937892, "grad/layer_27/attn": 0.021582327783107758, "grad/layer_27/mlp": 0.031395796686410904, "grad/layer_27/attn_mlp_ratio": 0.6874272989449745} {"step": 2000, "timestamp": 1778196683.4333746, "train/loss": 3.4276185989379884, "train/z_loss": 0.0014350720448419452, "train/perplexity": 30.80320056220122, "train/grad_norm": 0.53515625, "optim/muon_lr": 0.008, "optim/adamw_lr": 0.00023999999999999998, "perf/tokens_per_sec": 2025023.5150279282, "perf/iters_per_sec": 0.9656064582004205, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0356185913085938, "data/tokens_consumed": 4196401152, "data/tokens_consumed_B": 4.196401152, "train/loss_slope": -0.0019701619904212253} {"step": 2000, "timestamp": 1778196690.4685998, "geo/ww_alpha_mean": 7.70510095737598, "geo/ww_alpha_std": 5.419427897222632, "geo/ww_alpha_min": 1.971544635817694, "geo/ww_alpha_max": 32.785753060193656, "geo/ww_alpha_healthy_frac": 0.2436548223350254, "geo/ww_alpha_by_type/q_proj": 4.424589184168333, "geo/ww_alpha_by_type/k_proj": 4.358296941295864, "geo/ww_alpha_by_type/v_proj": 6.177036456853407, "geo/ww_alpha_by_type/o_proj": 5.859218253300413, "geo/ww_alpha_by_type/gate_proj": 11.904273701043937, "geo/ww_alpha_by_type/up_proj": 11.735033647179083, "geo/ww_alpha_by_type/down_proj": 9.671844255443062, "geo/twonn_id/layer_0": 0.6384816765785217, "geo/twonn_id/layer_7": 2.139411211013794, "geo/twonn_id/layer_14": 2.055067777633667, "geo/twonn_id/layer_21": 5.628379821777344, "geo/twonn_id/layer_27": 5.914292335510254, "geo/tier2_time_s": 7.024859189987183} {"step": 2000, "timestamp": 1778196691.2323804, "eoc/jacobian_sigma/layer_0/attn": 883.5003662109375, "eoc/jacobian_sigma/layer_0/mlp": 1029.07666015625, "eoc/jacobian_sigma/layer_0": 1029.07666015625, "eoc/jacobian_sigma/layer_7/attn": 1.1495214700698853, "eoc/jacobian_sigma/layer_7/mlp": 1.3820159435272217, "eoc/jacobian_sigma/layer_7": 1.3820159435272217, "eoc/jacobian_sigma/layer_14/attn": 1.2861113548278809, "eoc/jacobian_sigma/layer_14/mlp": 2.421828508377075, "eoc/jacobian_sigma/layer_14": 2.421828508377075, "eoc/jacobian_sigma/layer_21/attn": 1.0762790441513062, "eoc/jacobian_sigma/layer_21/mlp": 3.406959295272827, "eoc/jacobian_sigma/layer_21": 3.406959295272827, "eoc/jacobian_sigma/layer_27/attn": 1.605080246925354, "eoc/jacobian_sigma/layer_27/mlp": 5.7240777015686035, "eoc/jacobian_sigma/layer_27": 5.7240777015686035, "eoc/layer0_sigma": 1029.07666015625, "eoc/sigma_max": 5.7240777015686035, "eoc/sigma_min": 1.3820159435272217, "eoc/sigma_mean": 3.233720362186432, "eoc/time_s": 0.7577171325683594} {"step": 2010, "timestamp": 1778196701.6047833, "train/loss": 3.358496832847595, "train/z_loss": 0.0014433547272346913, "train/perplexity": 28.745948421631855, "train/grad_norm": 0.57421875, "optim/muon_lr": 0.00804, "optim/adamw_lr": 0.00024119999999999998, "perf/tokens_per_sec": 1154667.2080708651, "perf/iters_per_sec": 0.5505882301668478, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.816239333152771, "data/tokens_consumed": 4217372672, "data/tokens_consumed_B": 4.217372672, "train/loss_slope": -0.0019568322822491827} {"step": 2020, "timestamp": 1778196711.96071, "train/loss": 3.418573188781738, "train/z_loss": 0.0014395259087905288, "train/perplexity": 30.525829338427208, "train/grad_norm": 0.7890625, "optim/muon_lr": 0.00808, "optim/adamw_lr": 0.00024239999999999998, "perf/tokens_per_sec": 2026350.810740928, "perf/iters_per_sec": 0.9662393621163025, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.034940242767334, "data/tokens_consumed": 4238344192, "data/tokens_consumed_B": 4.238344192, "train/loss_slope": -0.0019380146470257301} {"step": 2025, "timestamp": 1778196717.7502468, "eos/sharpness": 123.464560508728, "eos/L0_probe": 3.1458728313446045, "eos/L_plus": 4.099544525146484, "eos/L_minus": 3.426846742630005, "eos/grad_norm": 0.6767691373825073, "eos/embed_grad_frac": 0.040594786405563354, "eos/time_s": 0.6237392425537109} {"step": 2025, "timestamp": 1778196719.1272397, "geo/rankme_last": 342.881103515625, "geo/layer_0/stable_rank_q_proj": 53.439453125, "geo/layer_0/stable_rank_k_proj": 50.9531135559082, "geo/layer_0/stable_rank_o_proj": 66.31192779541016, "geo/layer_0/stable_rank_gate_proj": 164.63771057128906, "geo/layer_0/stable_rank_down_proj": 52.143531799316406, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.03574838861823082, "geo/layer_0/attn_entropy_mean": 6.907573223114014, "geo/layer_0/attn_entropy_std": 0.04279442876577377, "geo/layer_7/stable_rank_q_proj": 28.2087459564209, "geo/layer_7/stable_rank_k_proj": 30.524127960205078, "geo/layer_7/stable_rank_o_proj": 114.01681518554688, "geo/layer_7/stable_rank_gate_proj": 176.874267578125, "geo/layer_7/stable_rank_down_proj": 204.63885498046875, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.6731009483337402, "geo/layer_7/attn_entropy_mean": 5.795403480529785, "geo/layer_7/attn_entropy_std": 1.5564526319503784, "geo/layer_14/stable_rank_q_proj": 35.53940963745117, "geo/layer_14/stable_rank_k_proj": 23.573442459106445, "geo/layer_14/stable_rank_o_proj": 92.67674255371094, "geo/layer_14/stable_rank_gate_proj": 195.79624938964844, "geo/layer_14/stable_rank_down_proj": 165.3590087890625, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.6591684818267822, "geo/layer_14/attn_entropy_mean": 6.8868794441223145, "geo/layer_14/attn_entropy_std": 0.12428542971611023, "geo/layer_21/stable_rank_q_proj": 55.71235275268555, "geo/layer_21/stable_rank_k_proj": 37.8110466003418, "geo/layer_21/stable_rank_o_proj": 70.07615661621094, "geo/layer_21/stable_rank_gate_proj": 145.46649169921875, "geo/layer_21/stable_rank_down_proj": 158.84765625, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.3692363202571869, "geo/layer_21/attn_entropy_mean": 6.185616493225098, "geo/layer_21/attn_entropy_std": 0.5595421195030212, "geo/layer_27/stable_rank_q_proj": 62.4853515625, "geo/layer_27/stable_rank_k_proj": 23.447114944458008, "geo/layer_27/stable_rank_o_proj": 83.84542083740234, "geo/layer_27/stable_rank_gate_proj": 90.90619659423828, "geo/layer_27/stable_rank_down_proj": 68.60521697998047, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.14375412464141846, "geo/layer_27/attn_entropy_mean": 5.724119663238525, "geo/layer_27/attn_entropy_std": 0.26557454466819763, "attnres/final_alpha/block_0": 0.2148367315530777, "attnres/block_norm/0": 0.6606084704399109, "attnres/final_alpha/block_1": 0.015448924154043198, "attnres/block_norm/1": 2831.45458984375, "attnres/final_alpha/block_2": 0.06254151463508606, "attnres/block_norm/2": 1525.6724853515625, "attnres/final_alpha/block_3": 0.037720806896686554, "attnres/block_norm/3": 1425.1644287109375, "attnres/final_alpha/block_4": 0.045885827392339706, "attnres/block_norm/4": 1240.402587890625, "attnres/final_alpha/block_5": 0.3761281371116638, "attnres/block_norm/5": 1163.54931640625, "attnres/final_alpha/block_6": 0.24743807315826416, "attnres/block_norm/6": 1387.686279296875, "geo/tier1_time_s": 1.3575422763824463, "geo/step": 2025.0, "geo/rankme_slope": 0.1833419592261771} {"step": 2030, "timestamp": 1778196724.3073175, "train/loss": 3.338368320465088, "train/z_loss": 0.0014499713666737079, "train/perplexity": 28.17311967843983, "train/grad_norm": 0.72265625, "optim/muon_lr": 0.00812, "optim/adamw_lr": 0.00024359999999999999, "perf/tokens_per_sec": 1699268.414768565, "perf/iters_per_sec": 0.8102743219225716, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.2341499328613281, "data/tokens_consumed": 4259315712, "data/tokens_consumed_B": 4.259315712, "train/loss_slope": -0.0019207079197099321} {"step": 2040, "timestamp": 1778196734.6634092, "train/loss": 3.3719602108001707, "train/z_loss": 0.0014463492436334491, "train/perplexity": 29.135583003159542, "train/grad_norm": 0.578125, "optim/muon_lr": 0.008159999999999999, "optim/adamw_lr": 0.0002448, "perf/tokens_per_sec": 2025946.7745483431, "perf/iters_per_sec": 0.9660467026464191, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0351466417312623, "data/tokens_consumed": 4280287232, "data/tokens_consumed_B": 4.280287232, "train/loss_slope": -0.001902331036130766} {"step": 2050, "timestamp": 1778196745.0102434, "grad/layer_0/attn": 0.011257059872150421, "grad/layer_0/mlp": 0.02758595161139965, "grad/layer_0/attn_mlp_ratio": 0.408072195221691, "grad/layer_4/attn": 0.014880389906466007, "grad/layer_4/mlp": 0.035605598241090775, "grad/layer_4/attn_mlp_ratio": 0.41792275933454087, "grad/layer_8/attn": 0.024382712319493294, "grad/layer_8/mlp": 0.02465631440281868, "grad/layer_8/attn_mlp_ratio": 0.9889033625323805, "grad/layer_12/attn": 0.024580994620919228, "grad/layer_12/mlp": 0.013753268867731094, "grad/layer_12/attn_mlp_ratio": 1.7872837853017287, "grad/layer_16/attn": 0.016591621562838554, "grad/layer_16/mlp": 0.017234567552804947, "grad/layer_16/attn_mlp_ratio": 0.962694388224949, "grad/layer_20/attn": 0.11264043301343918, "grad/layer_20/mlp": 0.0866355374455452, "grad/layer_20/attn_mlp_ratio": 1.300164299831613, "grad/layer_24/attn": 0.03576856851577759, "grad/layer_24/mlp": 0.0539318285882473, "grad/layer_24/attn_mlp_ratio": 0.6632181660766158, "grad/layer_27/attn": 0.03611051291227341, "grad/layer_27/mlp": 0.04608331248164177, "grad/layer_27/attn_mlp_ratio": 0.7835919531240202} {"step": 2050, "timestamp": 1778196745.0260592, "train/loss": 3.4027121305465697, "train/z_loss": 0.0014466776978224517, "train/perplexity": 30.045476900879443, "train/grad_norm": 0.66015625, "optim/muon_lr": 0.008199999999999999, "optim/adamw_lr": 0.00024599999999999996, "perf/tokens_per_sec": 2024749.5675775926, "perf/iters_per_sec": 0.9654758298767054, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0357587099075318, "data/tokens_consumed": 4301258752, "data/tokens_consumed_B": 4.301258752, "train/loss_slope": -0.0018837910679629212} {"step": 2060, "timestamp": 1778196755.3836715, "train/loss": 3.3805765390396116, "train/z_loss": 0.0014496950898319483, "train/perplexity": 29.387709391779964, "train/grad_norm": 0.490234375, "optim/muon_lr": 0.008239999999999999, "optim/adamw_lr": 0.0002472, "perf/tokens_per_sec": 2025834.0446765383, "perf/iters_per_sec": 0.9659929488547031, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.035204243659973, "data/tokens_consumed": 4322230272, "data/tokens_consumed_B": 4.322230272, "train/loss_slope": -0.0018652758710061756} {"step": 2070, "timestamp": 1778196765.7480302, "train/loss": 3.384850573539734, "train/z_loss": 0.0014537572511471807, "train/perplexity": 29.51358227651216, "train/grad_norm": 0.61328125, "optim/muon_lr": 0.00828, "optim/adamw_lr": 0.00024839999999999997, "perf/tokens_per_sec": 2024366.94834927, "perf/iters_per_sec": 0.9652933828111983, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.035954475402832, "data/tokens_consumed": 4343201792, "data/tokens_consumed_B": 4.343201792, "train/loss_slope": -0.0018460506435489282} {"step": 2080, "timestamp": 1778196776.106977, "train/loss": 3.3030505895614626, "train/z_loss": 0.001462639681994915, "train/perplexity": 27.195474738734084, "train/grad_norm": 0.439453125, "optim/muon_lr": 0.00832, "optim/adamw_lr": 0.00024959999999999994, "perf/tokens_per_sec": 2025659.3288358767, "perf/iters_per_sec": 0.9659096378497489, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0352935314178466, "data/tokens_consumed": 4364173312, "data/tokens_consumed_B": 4.364173312, "train/loss_slope": -0.0018341648567770823} {"step": 2090, "timestamp": 1778196786.465371, "train/loss": 3.3460941076278687, "train/z_loss": 0.0014598696958273648, "train/perplexity": 28.39162216983209, "train/grad_norm": 0.85546875, "optim/muon_lr": 0.00836, "optim/adamw_lr": 0.00025079999999999997, "perf/tokens_per_sec": 2025584.4598435236, "perf/iters_per_sec": 0.9658739375321977, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0353317975997924, "data/tokens_consumed": 4385144832, "data/tokens_consumed_B": 4.385144832, "train/loss_slope": -0.0018182525682835617} {"step": 2100, "timestamp": 1778196796.8174922, "grad/layer_0/attn": 0.013064048252999783, "grad/layer_0/mlp": 0.029346751049160957, "grad/layer_0/attn_mlp_ratio": 0.44516165304292266, "grad/layer_4/attn": 0.014660223387181759, "grad/layer_4/mlp": 0.035110797733068466, "grad/layer_4/attn_mlp_ratio": 0.41754173336882405, "grad/layer_8/attn": 0.021527154371142387, "grad/layer_8/mlp": 0.02173464186489582, "grad/layer_8/attn_mlp_ratio": 0.9904535996457382, "grad/layer_12/attn": 0.014291668310761452, "grad/layer_12/mlp": 0.014277053065598011, "grad/layer_12/attn_mlp_ratio": 1.001023680797005, "grad/layer_16/attn": 0.020173516124486923, "grad/layer_16/mlp": 0.017261236906051636, "grad/layer_16/attn_mlp_ratio": 1.168717868679647, "grad/layer_20/attn": 0.1572936773300171, "grad/layer_20/mlp": 0.08401533961296082, "grad/layer_20/attn_mlp_ratio": 1.8722018844107802, "grad/layer_24/attn": 0.02505689300596714, "grad/layer_24/mlp": 0.046686090528964996, "grad/layer_24/attn_mlp_ratio": 0.5367100279418413, "grad/layer_27/attn": 0.02193315513432026, "grad/layer_27/mlp": 0.03614211082458496, "grad/layer_27/attn_mlp_ratio": 0.6068587189078837} {"step": 2100, "timestamp": 1778196797.4339807, "eos/sharpness": 67.96460151672362, "eos/L0_probe": 3.0853395462036133, "eos/L_plus": 3.5383825302124023, "eos/L_minus": 3.3119425773620605, "eos/grad_norm": 0.5598859190940857, "eos/embed_grad_frac": 0.0749882161617279, "eos/time_s": 0.6135282516479492} {"step": 2100, "timestamp": 1778196797.455036, "train/loss": 3.3316717863082888, "train/z_loss": 0.0014647614443674684, "train/perplexity": 27.985087704147, "train/grad_norm": 0.5625, "optim/muon_lr": 0.0084, "optim/adamw_lr": 0.00025199999999999995, "perf/tokens_per_sec": 1909146.3491289504, "perf/iters_per_sec": 0.9103519197125198, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0984762907028198, "data/tokens_consumed": 4406116352, "data/tokens_consumed_B": 4.406116352, "train/loss_slope": -0.0017965301599368081} {"step": 2100, "timestamp": 1778196798.821771, "geo/rankme_last": 347.0990905761719, "geo/layer_0/stable_rank_q_proj": 53.12222671508789, "geo/layer_0/stable_rank_k_proj": 50.927242279052734, "geo/layer_0/stable_rank_o_proj": 66.1969223022461, "geo/layer_0/stable_rank_gate_proj": 164.4451446533203, "geo/layer_0/stable_rank_down_proj": 51.925472259521484, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.040316447615623474, "geo/layer_0/attn_entropy_mean": 6.895259857177734, "geo/layer_0/attn_entropy_std": 0.04480862617492676, "geo/layer_7/stable_rank_q_proj": 28.314489364624023, "geo/layer_7/stable_rank_k_proj": 30.704845428466797, "geo/layer_7/stable_rank_o_proj": 113.24534606933594, "geo/layer_7/stable_rank_gate_proj": 177.53924560546875, "geo/layer_7/stable_rank_down_proj": 204.94607543945312, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.6689304113388062, "geo/layer_7/attn_entropy_mean": 5.7732110023498535, "geo/layer_7/attn_entropy_std": 1.5482101440429688, "geo/layer_14/stable_rank_q_proj": 35.739505767822266, "geo/layer_14/stable_rank_k_proj": 23.69500160217285, "geo/layer_14/stable_rank_o_proj": 92.92717742919922, "geo/layer_14/stable_rank_gate_proj": 196.0163116455078, "geo/layer_14/stable_rank_down_proj": 164.77084350585938, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.644911527633667, "geo/layer_14/attn_entropy_mean": 6.861284255981445, "geo/layer_14/attn_entropy_std": 0.12262191623449326, "geo/layer_21/stable_rank_q_proj": 54.826622009277344, "geo/layer_21/stable_rank_k_proj": 37.95846176147461, "geo/layer_21/stable_rank_o_proj": 70.7205581665039, "geo/layer_21/stable_rank_gate_proj": 144.82786560058594, "geo/layer_21/stable_rank_down_proj": 159.37228393554688, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.35791561007499695, "geo/layer_21/attn_entropy_mean": 6.139610767364502, "geo/layer_21/attn_entropy_std": 0.5840889811515808, "geo/layer_27/stable_rank_q_proj": 62.523372650146484, "geo/layer_27/stable_rank_k_proj": 23.782148361206055, "geo/layer_27/stable_rank_o_proj": 84.54561614990234, "geo/layer_27/stable_rank_gate_proj": 91.29917907714844, "geo/layer_27/stable_rank_down_proj": 69.5029525756836, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.14778904616832733, "geo/layer_27/attn_entropy_mean": 5.676934719085693, "geo/layer_27/attn_entropy_std": 0.25322264432907104, "attnres/final_alpha/block_0": 0.21270287036895752, "attnres/block_norm/0": 0.6719383001327515, "attnres/final_alpha/block_1": 0.015516575425863266, "attnres/block_norm/1": 2863.4736328125, "attnres/final_alpha/block_2": 0.05965729430317879, "attnres/block_norm/2": 1542.699462890625, "attnres/final_alpha/block_3": 0.03672732785344124, "attnres/block_norm/3": 1442.23974609375, "attnres/final_alpha/block_4": 0.04552178457379341, "attnres/block_norm/4": 1275.806640625, "attnres/final_alpha/block_5": 0.3699738383293152, "attnres/block_norm/5": 1211.30419921875, "attnres/final_alpha/block_6": 0.2599002718925476, "attnres/block_norm/6": 1400.840576171875, "geo/tier1_time_s": 1.3631117343902588, "geo/step": 2100.0, "geo/rankme_slope": 0.17589060375295054} {"step": 2110, "timestamp": 1778196809.1766996, "train/loss": 3.3171828269958494, "train/z_loss": 0.0014655184815637768, "train/perplexity": 27.582536225321576, "train/grad_norm": 0.376953125, "optim/muon_lr": 0.00844, "optim/adamw_lr": 0.0002532, "perf/tokens_per_sec": 1789663.2095784952, "perf/iters_per_sec": 0.8533779189960934, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.1718137741088868, "data/tokens_consumed": 4427087872, "data/tokens_consumed_B": 4.427087872, "train/loss_slope": -0.001778721258427837} {"step": 2120, "timestamp": 1778196819.5362556, "train/loss": 3.340674066543579, "train/z_loss": 0.0014644785318523646, "train/perplexity": 28.23815468695215, "train/grad_norm": 0.61328125, "optim/muon_lr": 0.00848, "optim/adamw_lr": 0.00025439999999999995, "perf/tokens_per_sec": 2025500.687769032, "perf/iters_per_sec": 0.9658339918942604, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0353746175765992, "data/tokens_consumed": 4448059392, "data/tokens_consumed_B": 4.448059392, "train/loss_slope": -0.001756653763029692} {"step": 2130, "timestamp": 1778196829.8911164, "train/loss": 3.3041664361953735, "train/z_loss": 0.001471190224401653, "train/perplexity": 27.225837654707263, "train/grad_norm": 0.71875, "optim/muon_lr": 0.00852, "optim/adamw_lr": 0.0002556, "perf/tokens_per_sec": 2026288.3535880935, "perf/iters_per_sec": 0.9662095802250354, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0349721431732177, "data/tokens_consumed": 4469030912, "data/tokens_consumed_B": 4.469030912, "train/loss_slope": -0.001736833688928814} {"step": 2140, "timestamp": 1778196840.244795, "train/loss": 3.280208969116211, "train/z_loss": 0.0014761092956177891, "train/perplexity": 26.58132679590473, "train/grad_norm": 0.48828125, "optim/muon_lr": 0.00856, "optim/adamw_lr": 0.00025679999999999995, "perf/tokens_per_sec": 2026610.3432407097, "perf/iters_per_sec": 0.9663631168559598, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0348077058792113, "data/tokens_consumed": 4490002432, "data/tokens_consumed_B": 4.490002432, "train/loss_slope": -0.0017210014567874481} {"step": 2150, "timestamp": 1778196850.5876482, "grad/layer_0/attn": 0.012637991458177567, "grad/layer_0/mlp": 0.025960994884371758, "grad/layer_0/attn_mlp_ratio": 0.48680689880281947, "grad/layer_4/attn": 0.013003520667552948, "grad/layer_4/mlp": 0.03102237731218338, "grad/layer_4/attn_mlp_ratio": 0.419165832933426, "grad/layer_8/attn": 0.0362275131046772, "grad/layer_8/mlp": 0.01841503195464611, "grad/layer_8/attn_mlp_ratio": 1.9672793942021405, "grad/layer_12/attn": 0.017714660614728928, "grad/layer_12/mlp": 0.012957132421433926, "grad/layer_12/attn_mlp_ratio": 1.3671744566496493, "grad/layer_16/attn": 0.015056833624839783, "grad/layer_16/mlp": 0.0156893078237772, "grad/layer_16/attn_mlp_ratio": 0.9596875590682429, "grad/layer_20/attn": 0.11684411764144897, "grad/layer_20/mlp": 0.07650698721408844, "grad/layer_20/attn_mlp_ratio": 1.5272345931197402, "grad/layer_24/attn": 0.023473089560866356, "grad/layer_24/mlp": 0.04264127090573311, "grad/layer_24/attn_mlp_ratio": 0.5504781871466825, "grad/layer_27/attn": 0.027748655527830124, "grad/layer_27/mlp": 0.03837451711297035, "grad/layer_27/attn_mlp_ratio": 0.7231010978934543} {"step": 2150, "timestamp": 1778196850.6032944, "train/loss": 3.306913471221924, "train/z_loss": 0.0014739608042873443, "train/perplexity": 27.300730804327987, "train/grad_norm": 0.58984375, "optim/muon_lr": 0.0086, "optim/adamw_lr": 0.000258, "perf/tokens_per_sec": 2025585.2061737692, "perf/iters_per_sec": 0.9658742934101912, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.035331416130066, "data/tokens_consumed": 4510973952, "data/tokens_consumed_B": 4.510973952, "train/loss_slope": -0.0017001916723712015} {"step": 2160, "timestamp": 1778196860.968448, "train/loss": 3.2671653032302856, "train/z_loss": 0.0014812264824286103, "train/perplexity": 26.236860287265472, "train/grad_norm": 0.6171875, "optim/muon_lr": 0.00864, "optim/adamw_lr": 0.00025919999999999996, "perf/tokens_per_sec": 2024130.6270732249, "perf/iters_per_sec": 0.9651806960455059, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0360754251480102, "data/tokens_consumed": 4531945472, "data/tokens_consumed_B": 4.531945472, "train/loss_slope": -0.001685333463818756} {"step": 2170, "timestamp": 1778196871.3247542, "train/loss": 3.301709771156311, "train/z_loss": 0.001480054622516036, "train/perplexity": 27.159034980675962, "train/grad_norm": 0.70703125, "optim/muon_lr": 0.00868, "optim/adamw_lr": 0.0002604, "perf/tokens_per_sec": 2026241.4899121376, "perf/iters_per_sec": 0.9661872338829697, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0349960803985596, "data/tokens_consumed": 4552916992, "data/tokens_consumed_B": 4.552916992, "train/loss_slope": -0.0016640939596641398} {"step": 2175, "timestamp": 1778196877.1094663, "eos/sharpness": 122.08709716796872, "eos/L0_probe": 3.0360944271087646, "eos/L_plus": 3.4073328971862793, "eos/L_minus": 3.8857269287109375, "eos/grad_norm": 0.7582345604896545, "eos/embed_grad_frac": 0.022732852026820183, "eos/time_s": 0.6146376132965088} {"step": 2175, "timestamp": 1778196878.4880543, "geo/rankme_last": 352.04083251953125, "geo/layer_0/stable_rank_q_proj": 52.95050048828125, "geo/layer_0/stable_rank_k_proj": 50.805973052978516, "geo/layer_0/stable_rank_o_proj": 65.81161499023438, "geo/layer_0/stable_rank_gate_proj": 164.82681274414062, "geo/layer_0/stable_rank_down_proj": 51.69320297241211, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.04030930995941162, "geo/layer_0/attn_entropy_mean": 6.890527725219727, "geo/layer_0/attn_entropy_std": 0.04407139867544174, "geo/layer_7/stable_rank_q_proj": 28.472755432128906, "geo/layer_7/stable_rank_k_proj": 30.833356857299805, "geo/layer_7/stable_rank_o_proj": 112.40316009521484, "geo/layer_7/stable_rank_gate_proj": 177.7792510986328, "geo/layer_7/stable_rank_down_proj": 205.3557586669922, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.6596912741661072, "geo/layer_7/attn_entropy_mean": 5.751711845397949, "geo/layer_7/attn_entropy_std": 1.532950520515442, "geo/layer_14/stable_rank_q_proj": 35.99701690673828, "geo/layer_14/stable_rank_k_proj": 23.8359375, "geo/layer_14/stable_rank_o_proj": 92.86748504638672, "geo/layer_14/stable_rank_gate_proj": 195.82652282714844, "geo/layer_14/stable_rank_down_proj": 164.27903747558594, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.635209858417511, "geo/layer_14/attn_entropy_mean": 6.8444414138793945, "geo/layer_14/attn_entropy_std": 0.15108008682727814, "geo/layer_21/stable_rank_q_proj": 53.928550720214844, "geo/layer_21/stable_rank_k_proj": 38.03341293334961, "geo/layer_21/stable_rank_o_proj": 71.56906127929688, "geo/layer_21/stable_rank_gate_proj": 143.92984008789062, "geo/layer_21/stable_rank_down_proj": 159.66519165039062, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.3534092307090759, "geo/layer_21/attn_entropy_mean": 6.13655948638916, "geo/layer_21/attn_entropy_std": 0.5766887068748474, "geo/layer_27/stable_rank_q_proj": 62.74090576171875, "geo/layer_27/stable_rank_k_proj": 24.12828826904297, "geo/layer_27/stable_rank_o_proj": 85.38548278808594, "geo/layer_27/stable_rank_gate_proj": 91.47430419921875, "geo/layer_27/stable_rank_down_proj": 70.50711822509766, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.12988191843032837, "geo/layer_27/attn_entropy_mean": 5.677599906921387, "geo/layer_27/attn_entropy_std": 0.23145069181919098, "attnres/final_alpha/block_0": 0.2165970355272293, "attnres/block_norm/0": 0.683922529220581, "attnres/final_alpha/block_1": 0.015874970704317093, "attnres/block_norm/1": 2886.799072265625, "attnres/final_alpha/block_2": 0.0592690110206604, "attnres/block_norm/2": 1587.517822265625, "attnres/final_alpha/block_3": 0.037318289279937744, "attnres/block_norm/3": 1470.939453125, "attnres/final_alpha/block_4": 0.04562965780496597, "attnres/block_norm/4": 1310.8350830078125, "attnres/final_alpha/block_5": 0.3600408136844635, "attnres/block_norm/5": 1252.1268310546875, "attnres/final_alpha/block_6": 0.2652702331542969, "attnres/block_norm/6": 1437.2877197265625, "geo/tier1_time_s": 1.3587570190429688, "geo/step": 2175.0, "geo/rankme_slope": 0.16912369084455456} {"step": 2180, "timestamp": 1778196883.667713, "train/loss": 3.25804123878479, "train/z_loss": 0.0014849374420009553, "train/perplexity": 25.99856225959935, "train/grad_norm": 0.73828125, "optim/muon_lr": 0.00872, "optim/adamw_lr": 0.00026159999999999996, "perf/tokens_per_sec": 1699802.6159306846, "perf/iters_per_sec": 0.8105290488866256, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.2337620735168457, "data/tokens_consumed": 4573888512, "data/tokens_consumed_B": 4.573888512, "train/loss_slope": -0.0016440316860360353} {"step": 2190, "timestamp": 1778196894.5039763, "train/loss": 3.237373924255371, "train/z_loss": 0.001487765135243535, "train/perplexity": 25.46675622674977, "train/grad_norm": 0.56640625, "optim/muon_lr": 0.00876, "optim/adamw_lr": 0.0002628, "perf/tokens_per_sec": 1936259.3555894997, "perf/iters_per_sec": 0.9232804086635111, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0830945730209351, "data/tokens_consumed": 4594860032, "data/tokens_consumed_B": 4.594860032, "train/loss_slope": -0.0016283264324681525} {"step": 2200, "timestamp": 1778196904.8475764, "grad/layer_0/attn": 0.010850177146494389, "grad/layer_0/mlp": 0.024558622390031815, "grad/layer_0/attn_mlp_ratio": 0.4418072369856414, "grad/layer_4/attn": 0.011789441108703613, "grad/layer_4/mlp": 0.028604978695511818, "grad/layer_4/attn_mlp_ratio": 0.412146472576774, "grad/layer_8/attn": 0.01966806873679161, "grad/layer_8/mlp": 0.01833159290254116, "grad/layer_8/attn_mlp_ratio": 1.0729055971330579, "grad/layer_12/attn": 0.011559088714420795, "grad/layer_12/mlp": 0.01191024947911501, "grad/layer_12/attn_mlp_ratio": 0.970516078411154, "grad/layer_16/attn": 0.01369305420666933, "grad/layer_16/mlp": 0.013838463462889194, "grad/layer_16/attn_mlp_ratio": 0.9894923771299431, "grad/layer_20/attn": 0.07585348188877106, "grad/layer_20/mlp": 0.0719112753868103, "grad/layer_20/attn_mlp_ratio": 1.054820421071294, "grad/layer_24/attn": 0.02199198305606842, "grad/layer_24/mlp": 0.03580317273736, "grad/layer_24/attn_mlp_ratio": 0.6142467639940603, "grad/layer_27/attn": 0.013638394884765148, "grad/layer_27/mlp": 0.023014824837446213, "grad/layer_27/attn_mlp_ratio": 0.5925917282375166} {"step": 2200, "timestamp": 1778196904.8634431, "train/loss": 3.263990569114685, "train/z_loss": 0.0014924579998478293, "train/perplexity": 26.153697311842375, "train/grad_norm": 0.376953125, "optim/muon_lr": 0.0088, "optim/adamw_lr": 0.00026399999999999997, "perf/tokens_per_sec": 2025497.7027004715, "perf/iters_per_sec": 0.965832568502651, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0353761434555053, "data/tokens_consumed": 4615831552, "data/tokens_consumed_B": 4.615831552, "train/loss_slope": -0.0016083417207601728} {"step": 2210, "timestamp": 1778196915.222937, "train/loss": 3.184644651412964, "train/z_loss": 0.0014966965187340974, "train/perplexity": 24.158702119710263, "train/grad_norm": 1.0390625, "optim/muon_lr": 0.00884, "optim/adamw_lr": 0.0002652, "perf/tokens_per_sec": 2025392.3447444732, "perf/iters_per_sec": 0.9657823299143187, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0354300022125245, "data/tokens_consumed": 4636803072, "data/tokens_consumed_B": 4.636803072, "train/loss_slope": -0.0015943573017074583} {"step": 2220, "timestamp": 1778196925.5848265, "train/loss": 3.268227767944336, "train/z_loss": 0.0014940332039259374, "train/perplexity": 26.2647508392682, "train/grad_norm": 0.49609375, "optim/muon_lr": 0.00888, "optim/adamw_lr": 0.00026639999999999997, "perf/tokens_per_sec": 2024920.1176098736, "perf/iters_per_sec": 0.9655571544694298, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0356714725494385, "data/tokens_consumed": 4657774592, "data/tokens_consumed_B": 4.657774592, "train/loss_slope": -0.0015697248335969083} {"step": 2230, "timestamp": 1778196935.9488294, "train/loss": 3.2053645849227905, "train/z_loss": 0.0014984847046434879, "train/perplexity": 24.6644906788565, "train/grad_norm": 0.5078125, "optim/muon_lr": 0.00892, "optim/adamw_lr": 0.0002676, "perf/tokens_per_sec": 2024745.186512255, "perf/iters_per_sec": 0.96547374082196, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0357609510421752, "data/tokens_consumed": 4678746112, "data/tokens_consumed_B": 4.678746112, "train/loss_slope": -0.0015524361791533464} {"step": 2240, "timestamp": 1778196946.3046536, "train/loss": 3.262488865852356, "train/z_loss": 0.0014980746316723525, "train/perplexity": 26.114451694278454, "train/grad_norm": 0.640625, "optim/muon_lr": 0.008960000000000001, "optim/adamw_lr": 0.0002688, "perf/tokens_per_sec": 2026067.2638022932, "perf/iters_per_sec": 0.9661041563998667, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0350850820541382, "data/tokens_consumed": 4699717632, "data/tokens_consumed_B": 4.699717632, "train/loss_slope": -0.0015257874227163854} {"step": 2250, "timestamp": 1778196956.6567829, "grad/layer_0/attn": 0.010538535192608833, "grad/layer_0/mlp": 0.02388426847755909, "grad/layer_0/attn_mlp_ratio": 0.44123332302964097, "grad/layer_4/attn": 0.01191103458404541, "grad/layer_4/mlp": 0.026972303166985512, "grad/layer_4/attn_mlp_ratio": 0.4416024269838567, "grad/layer_8/attn": 0.02596193552017212, "grad/layer_8/mlp": 0.018574023619294167, "grad/layer_8/attn_mlp_ratio": 1.3977550536453551, "grad/layer_12/attn": 0.013346307910978794, "grad/layer_12/mlp": 0.012464994564652443, "grad/layer_12/attn_mlp_ratio": 1.0707030584477932, "grad/layer_16/attn": 0.014980534091591835, "grad/layer_16/mlp": 0.014605107717216015, "grad/layer_16/attn_mlp_ratio": 1.0257051354275715, "grad/layer_20/attn": 0.13067267835140228, "grad/layer_20/mlp": 0.07671751081943512, "grad/layer_20/attn_mlp_ratio": 1.7032966370432454, "grad/layer_24/attn": 0.022239435464143753, "grad/layer_24/mlp": 0.03893253207206726, "grad/layer_24/attn_mlp_ratio": 0.571230131291069, "grad/layer_27/attn": 0.019826162606477737, "grad/layer_27/mlp": 0.029210954904556274, "grad/layer_27/attn_mlp_ratio": 0.6787235338038515} {"step": 2250, "timestamp": 1778196957.2724144, "eos/sharpness": 55.46989440917967, "eos/L0_probe": 2.9896013736724854, "eos/L_plus": 3.367332935333252, "eos/L_minus": 3.1665687561035156, "eos/grad_norm": 0.48794326186180115, "eos/embed_grad_frac": 0.06683554500341415, "eos/time_s": 0.6126763820648193} {"step": 2250, "timestamp": 1778196957.2942445, "train/loss": 3.1990513801574707, "train/z_loss": 0.0015023311600089074, "train/perplexity": 24.50926918684921, "train/grad_norm": 0.48828125, "optim/muon_lr": 0.009000000000000001, "optim/adamw_lr": 0.00027, "perf/tokens_per_sec": 1909171.750366843, "perf/iters_per_sec": 0.9103640319666114, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.098461675643921, "data/tokens_consumed": 4720689152, "data/tokens_consumed_B": 4.720689152, "train/loss_slope": -0.0015064630639327742} {"step": 2250, "timestamp": 1778196958.656752, "geo/rankme_last": 352.582275390625, "geo/layer_0/stable_rank_q_proj": 52.64081954956055, "geo/layer_0/stable_rank_k_proj": 50.619205474853516, "geo/layer_0/stable_rank_o_proj": 65.62440490722656, "geo/layer_0/stable_rank_gate_proj": 164.92103576660156, "geo/layer_0/stable_rank_down_proj": 51.53902816772461, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.042234987020492554, "geo/layer_0/attn_entropy_mean": 6.882354736328125, "geo/layer_0/attn_entropy_std": 0.04550422355532646, "geo/layer_7/stable_rank_q_proj": 28.63007926940918, "geo/layer_7/stable_rank_k_proj": 30.98168182373047, "geo/layer_7/stable_rank_o_proj": 112.3258285522461, "geo/layer_7/stable_rank_gate_proj": 177.83673095703125, "geo/layer_7/stable_rank_down_proj": 206.17526245117188, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.6527404189109802, "geo/layer_7/attn_entropy_mean": 5.72668981552124, "geo/layer_7/attn_entropy_std": 1.5441598892211914, "geo/layer_14/stable_rank_q_proj": 36.21525192260742, "geo/layer_14/stable_rank_k_proj": 23.996246337890625, "geo/layer_14/stable_rank_o_proj": 92.94100189208984, "geo/layer_14/stable_rank_gate_proj": 195.58792114257812, "geo/layer_14/stable_rank_down_proj": 163.656494140625, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.628156840801239, "geo/layer_14/attn_entropy_mean": 6.850743293762207, "geo/layer_14/attn_entropy_std": 0.154410257935524, "geo/layer_21/stable_rank_q_proj": 53.16642761230469, "geo/layer_21/stable_rank_k_proj": 38.09972381591797, "geo/layer_21/stable_rank_o_proj": 72.23694610595703, "geo/layer_21/stable_rank_gate_proj": 143.39129638671875, "geo/layer_21/stable_rank_down_proj": 160.04373168945312, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.34074699878692627, "geo/layer_21/attn_entropy_mean": 6.102022647857666, "geo/layer_21/attn_entropy_std": 0.6030858755111694, "geo/layer_27/stable_rank_q_proj": 62.878761291503906, "geo/layer_27/stable_rank_k_proj": 24.45815658569336, "geo/layer_27/stable_rank_o_proj": 86.45161437988281, "geo/layer_27/stable_rank_gate_proj": 91.65724182128906, "geo/layer_27/stable_rank_down_proj": 71.4822769165039, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.1402559131383896, "geo/layer_27/attn_entropy_mean": 5.6199750900268555, "geo/layer_27/attn_entropy_std": 0.2504580616950989, "attnres/final_alpha/block_0": 0.21691717207431793, "attnres/block_norm/0": 0.696248471736908, "attnres/final_alpha/block_1": 0.015976373106241226, "attnres/block_norm/1": 2917.494140625, "attnres/final_alpha/block_2": 0.05701538175344467, "attnres/block_norm/2": 1633.978759765625, "attnres/final_alpha/block_3": 0.03587457165122032, "attnres/block_norm/3": 1505.5452880859375, "attnres/final_alpha/block_4": 0.0464361347258091, "attnres/block_norm/4": 1326.3526611328125, "attnres/final_alpha/block_5": 0.3650040626525879, "attnres/block_norm/5": 1294.277099609375, "attnres/final_alpha/block_6": 0.26277634501457214, "attnres/block_norm/6": 1476.8983154296875, "geo/tier1_time_s": 1.3582618236541748, "geo/step": 2250.0, "geo/rankme_slope": 0.16260538439596853} {"step": 2260, "timestamp": 1778196969.0110254, "train/loss": 3.1979532718658445, "train/z_loss": 0.001509729225654155, "train/perplexity": 24.48237012687683, "train/grad_norm": 0.51171875, "optim/muon_lr": 0.009040000000000001, "optim/adamw_lr": 0.0002712, "perf/tokens_per_sec": 1790471.0984173652, "perf/iters_per_sec": 0.8537631504141642, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.1712850332260132, "data/tokens_consumed": 4741660672, "data/tokens_consumed_B": 4.741660672, "train/loss_slope": -0.0014879532675681586} {"step": 2270, "timestamp": 1778196979.366663, "train/loss": 3.207216167449951, "train/z_loss": 0.0015089716878719627, "train/perplexity": 24.710201324295614, "train/grad_norm": 0.345703125, "optim/muon_lr": 0.009080000000000001, "optim/adamw_lr": 0.0002724, "perf/tokens_per_sec": 2026052.0035263435, "perf/iters_per_sec": 0.9660968797332494, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0350928783416748, "data/tokens_consumed": 4762632192, "data/tokens_consumed_B": 4.762632192, "train/loss_slope": -0.0014677549435861328} {"step": 2280, "timestamp": 1778196989.7253845, "train/loss": 3.187231206893921, "train/z_loss": 0.001510410022456199, "train/perplexity": 24.221270826923636, "train/grad_norm": 0.75390625, "optim/muon_lr": 0.009120000000000001, "optim/adamw_lr": 0.0002736, "perf/tokens_per_sec": 2025638.1970269722, "perf/iters_per_sec": 0.9658995614180432, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.03530433177948, "data/tokens_consumed": 4783603712, "data/tokens_consumed_B": 4.783603712, "train/loss_slope": -0.001442868597472426} {"step": 2290, "timestamp": 1778197000.088039, "train/loss": 3.1561428785324095, "train/z_loss": 0.0015150133869610727, "train/perplexity": 23.479856386721824, "train/grad_norm": 0.5078125, "optim/muon_lr": 0.00916, "optim/adamw_lr": 0.0002748, "perf/tokens_per_sec": 2024888.839407272, "perf/iters_per_sec": 0.9655422398601875, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.035687470436096, "data/tokens_consumed": 4804575232, "data/tokens_consumed_B": 4.804575232, "train/loss_slope": -0.0014247963900184118} {"step": 2300, "timestamp": 1778197010.4290004, "grad/layer_0/attn": 0.011592133902013302, "grad/layer_0/mlp": 0.024699585512280464, "grad/layer_0/attn_mlp_ratio": 0.46932503581152274, "grad/layer_4/attn": 0.011961393989622593, "grad/layer_4/mlp": 0.02853614278137684, "grad/layer_4/attn_mlp_ratio": 0.41916645985918427, "grad/layer_8/attn": 0.040785305202007294, "grad/layer_8/mlp": 0.020467843860387802, "grad/layer_8/attn_mlp_ratio": 1.9926527327910377, "grad/layer_12/attn": 0.017234444618225098, "grad/layer_12/mlp": 0.013313974253833294, "grad/layer_12/attn_mlp_ratio": 1.2944628072881215, "grad/layer_16/attn": 0.016786513850092888, "grad/layer_16/mlp": 0.01505914144217968, "grad/layer_16/attn_mlp_ratio": 1.1147058949592146, "grad/layer_20/attn": 0.13038793206214905, "grad/layer_20/mlp": 0.07463204115629196, "grad/layer_20/attn_mlp_ratio": 1.7470771248824246, "grad/layer_24/attn": 0.039730969816446304, "grad/layer_24/mlp": 0.04000289738178253, "grad/layer_24/attn_mlp_ratio": 0.9932023007718362, "grad/layer_27/attn": 0.01721779815852642, "grad/layer_27/mlp": 0.02806890942156315, "grad/layer_27/attn_mlp_ratio": 0.6134117232199324} {"step": 2300, "timestamp": 1778197010.444678, "train/loss": 3.1741441965103148, "train/z_loss": 0.0015162472729571164, "train/perplexity": 23.906351971932533, "train/grad_norm": 0.53125, "optim/muon_lr": 0.0092, "optim/adamw_lr": 0.000276, "perf/tokens_per_sec": 2025873.2374146087, "perf/iters_per_sec": 0.9660116374085468, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0351842164993286, "data/tokens_consumed": 4825546752, "data/tokens_consumed_B": 4.825546752, "train/loss_slope": -0.0014007147010582331} {"step": 2310, "timestamp": 1778197020.8010578, "train/loss": 3.181517767906189, "train/z_loss": 0.0015186440665274858, "train/perplexity": 24.08327865415605, "train/grad_norm": 0.53125, "optim/muon_lr": 0.00924, "optim/adamw_lr": 0.0002772, "perf/tokens_per_sec": 2026124.6202284244, "perf/iters_per_sec": 0.9661315060751078, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0350557804107665, "data/tokens_consumed": 4846518272, "data/tokens_consumed_B": 4.846518272, "train/loss_slope": -0.0013760118956899201} {"step": 2320, "timestamp": 1778197031.1603518, "train/loss": 3.171510362625122, "train/z_loss": 0.0015161199029535056, "train/perplexity": 23.843469459435365, "train/grad_norm": 0.41796875, "optim/muon_lr": 0.00928, "optim/adamw_lr": 0.0002784, "perf/tokens_per_sec": 2025592.1564005893, "perf/iters_per_sec": 0.9658776075365969, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0353278636932373, "data/tokens_consumed": 4867489792, "data/tokens_consumed_B": 4.867489792, "train/loss_slope": -0.00135816512833191} {"step": 2325, "timestamp": 1778197036.9354846, "eos/sharpness": 32.97417163848876, "eos/L0_probe": 2.94486665725708, "eos/L_plus": 3.1399192810058594, "eos/L_minus": 3.0795557498931885, "eos/grad_norm": 0.40018758177757263, "eos/embed_grad_frac": 0.09480365365743637, "eos/time_s": 0.6044924259185791} {"step": 2325, "timestamp": 1778197038.3103652, "geo/rankme_last": 355.46588134765625, "geo/layer_0/stable_rank_q_proj": 52.31919479370117, "geo/layer_0/stable_rank_k_proj": 50.66093444824219, "geo/layer_0/stable_rank_o_proj": 65.6156234741211, "geo/layer_0/stable_rank_gate_proj": 165.18060302734375, "geo/layer_0/stable_rank_down_proj": 51.50226974487305, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.036635372787714005, "geo/layer_0/attn_entropy_mean": 6.8737993240356445, "geo/layer_0/attn_entropy_std": 0.04744420200586319, "geo/layer_7/stable_rank_q_proj": 28.80088233947754, "geo/layer_7/stable_rank_k_proj": 31.128433227539062, "geo/layer_7/stable_rank_o_proj": 112.23283386230469, "geo/layer_7/stable_rank_gate_proj": 177.82131958007812, "geo/layer_7/stable_rank_down_proj": 206.4691162109375, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.6496535539627075, "geo/layer_7/attn_entropy_mean": 5.72193717956543, "geo/layer_7/attn_entropy_std": 1.5150583982467651, "geo/layer_14/stable_rank_q_proj": 36.39665603637695, "geo/layer_14/stable_rank_k_proj": 24.138206481933594, "geo/layer_14/stable_rank_o_proj": 93.02912902832031, "geo/layer_14/stable_rank_gate_proj": 195.65985107421875, "geo/layer_14/stable_rank_down_proj": 162.8070068359375, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.62211012840271, "geo/layer_14/attn_entropy_mean": 6.839158535003662, "geo/layer_14/attn_entropy_std": 0.14389964938163757, "geo/layer_21/stable_rank_q_proj": 52.3704948425293, "geo/layer_21/stable_rank_k_proj": 38.2870979309082, "geo/layer_21/stable_rank_o_proj": 73.0993423461914, "geo/layer_21/stable_rank_gate_proj": 142.5979766845703, "geo/layer_21/stable_rank_down_proj": 160.63815307617188, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.33436480164527893, "geo/layer_21/attn_entropy_mean": 6.111814498901367, "geo/layer_21/attn_entropy_std": 0.5839070081710815, "geo/layer_27/stable_rank_q_proj": 62.78282165527344, "geo/layer_27/stable_rank_k_proj": 24.837535858154297, "geo/layer_27/stable_rank_o_proj": 87.7602767944336, "geo/layer_27/stable_rank_gate_proj": 91.68618774414062, "geo/layer_27/stable_rank_down_proj": 72.43328094482422, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.13699840009212494, "geo/layer_27/attn_entropy_mean": 5.550398826599121, "geo/layer_27/attn_entropy_std": 0.24586515128612518, "attnres/final_alpha/block_0": 0.217405766248703, "attnres/block_norm/0": 0.7087816596031189, "attnres/final_alpha/block_1": 0.016125839203596115, "attnres/block_norm/1": 2947.4521484375, "attnres/final_alpha/block_2": 0.05570826306939125, "attnres/block_norm/2": 1666.8543701171875, "attnres/final_alpha/block_3": 0.03563792258501053, "attnres/block_norm/3": 1537.541748046875, "attnres/final_alpha/block_4": 0.0450972281396389, "attnres/block_norm/4": 1361.6529541015625, "attnres/final_alpha/block_5": 0.3621521592140198, "attnres/block_norm/5": 1337.2706298828125, "attnres/final_alpha/block_6": 0.26787278056144714, "attnres/block_norm/6": 1509.555908203125, "geo/tier1_time_s": 1.3550286293029785, "geo/step": 2325.0, "geo/rankme_slope": 0.15652974950952614} {"step": 2330, "timestamp": 1778197043.4928799, "train/loss": 3.1812402486801146, "train/z_loss": 0.001520527305547148, "train/perplexity": 24.07659600862682, "train/grad_norm": 0.46484375, "optim/muon_lr": 0.00932, "optim/adamw_lr": 0.00027959999999999997, "perf/tokens_per_sec": 1701191.0130115948, "perf/iters_per_sec": 0.8111910882051443, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.2327551603317262, "data/tokens_consumed": 4888461312, "data/tokens_consumed_B": 4.888461312, "train/loss_slope": -0.0013335313585975527} {"step": 2340, "timestamp": 1778197053.8497431, "train/loss": 3.161559247970581, "train/z_loss": 0.0015263939741998911, "train/perplexity": 23.607377000889695, "train/grad_norm": 0.5, "optim/muon_lr": 0.00936, "optim/adamw_lr": 0.0002808, "perf/tokens_per_sec": 2025911.1718302749, "perf/iters_per_sec": 0.9660297259475111, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0351648330688477, "data/tokens_consumed": 4909432832, "data/tokens_consumed_B": 4.909432832, "train/loss_slope": -0.001310633681495019} {"step": 2350, "timestamp": 1778197064.1993294, "grad/layer_0/attn": 0.012002084404230118, "grad/layer_0/mlp": 0.02610975317656994, "grad/layer_0/attn_mlp_ratio": 0.4596782005979507, "grad/layer_4/attn": 0.012457366101443768, "grad/layer_4/mlp": 0.02603716216981411, "grad/layer_4/attn_mlp_ratio": 0.4784456144779677, "grad/layer_8/attn": 0.036877501755952835, "grad/layer_8/mlp": 0.0188063383102417, "grad/layer_8/attn_mlp_ratio": 1.960908123182012, "grad/layer_12/attn": 0.01697145774960518, "grad/layer_12/mlp": 0.013895238749682903, "grad/layer_12/attn_mlp_ratio": 1.2213865434916562, "grad/layer_16/attn": 0.018623938784003258, "grad/layer_16/mlp": 0.016855940222740173, "grad/layer_16/attn_mlp_ratio": 1.1048887470773667, "grad/layer_20/attn": 0.2306799590587616, "grad/layer_20/mlp": 0.08345343172550201, "grad/layer_20/attn_mlp_ratio": 2.7641758285160125, "grad/layer_24/attn": 0.03765816241502762, "grad/layer_24/mlp": 0.04314504563808441, "grad/layer_24/attn_mlp_ratio": 0.872827036588039, "grad/layer_27/attn": 0.022271133959293365, "grad/layer_27/mlp": 0.03776370733976364, "grad/layer_27/attn_mlp_ratio": 0.5897496689067867} {"step": 2350, "timestamp": 1778197064.2149498, "train/loss": 3.1457701683044434, "train/z_loss": 0.0015284228255040943, "train/perplexity": 23.23756541962386, "train/grad_norm": 0.76171875, "optim/muon_lr": 0.0094, "optim/adamw_lr": 0.00028199999999999997, "perf/tokens_per_sec": 2024835.0487524066, "perf/iters_per_sec": 0.9655165904771836, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0357149839401245, "data/tokens_consumed": 4930404352, "data/tokens_consumed_B": 4.930404352, "train/loss_slope": -0.0012887324565386152} {"step": 2360, "timestamp": 1778197075.145622, "train/loss": 3.1193999290466308, "train/z_loss": 0.001536628243047744, "train/perplexity": 22.632794285049613, "train/grad_norm": 0.43359375, "optim/muon_lr": 0.00944, "optim/adamw_lr": 0.00028319999999999994, "perf/tokens_per_sec": 1919554.7095257354, "perf/iters_per_sec": 0.9153150127056767, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.092520046234131, "data/tokens_consumed": 4951375872, "data/tokens_consumed_B": 4.951375872, "train/loss_slope": -0.0012672216221551586} {"step": 2370, "timestamp": 1778197085.9933555, "train/loss": 3.1538448095321656, "train/z_loss": 0.0015297275967895985, "train/perplexity": 23.425960009146028, "train/grad_norm": 0.4375, "optim/muon_lr": 0.00948, "optim/adamw_lr": 0.0002844, "perf/tokens_per_sec": 1934302.6255720987, "perf/iters_per_sec": 0.9223473670826429, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0841902256011964, "data/tokens_consumed": 4972347392, "data/tokens_consumed_B": 4.972347392, "train/loss_slope": -0.0012462820554115807} {"step": 2380, "timestamp": 1778197096.8547056, "train/loss": 3.1389427185058594, "train/z_loss": 0.0015360592980869115, "train/perplexity": 23.07945247656088, "train/grad_norm": 0.56640625, "optim/muon_lr": 0.009519999999999999, "optim/adamw_lr": 0.00028559999999999995, "perf/tokens_per_sec": 1931981.4048114966, "perf/iters_per_sec": 0.9212405227715953, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0854928493499756, "data/tokens_consumed": 4993318912, "data/tokens_consumed_B": 4.993318912, "train/loss_slope": -0.0012282393387120086} {"step": 2390, "timestamp": 1778197107.2211976, "train/loss": 3.040133571624756, "train/z_loss": 0.0015475541935302318, "train/perplexity": 20.908035768895072, "train/grad_norm": 0.4609375, "optim/muon_lr": 0.009559999999999999, "optim/adamw_lr": 0.0002868, "perf/tokens_per_sec": 2024156.7114750913, "perf/iters_per_sec": 0.9651931340575653, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0360620737075805, "data/tokens_consumed": 5014290432, "data/tokens_consumed_B": 5.014290432, "train/loss_slope": -0.0012106990267985083} {"step": 2400, "timestamp": 1778197117.570721, "grad/layer_0/attn": 0.009267257526516914, "grad/layer_0/mlp": 0.020246349275112152, "grad/layer_0/attn_mlp_ratio": 0.45772486460737954, "grad/layer_4/attn": 0.008991037495434284, "grad/layer_4/mlp": 0.02090136520564556, "grad/layer_4/attn_mlp_ratio": 0.4301650807952608, "grad/layer_8/attn": 0.018240733072161674, "grad/layer_8/mlp": 0.014993149787187576, "grad/layer_8/attn_mlp_ratio": 1.21660446333224, "grad/layer_12/attn": 0.008489416912198067, "grad/layer_12/mlp": 0.009369014762341976, "grad/layer_12/attn_mlp_ratio": 0.9061162819071422, "grad/layer_16/attn": 0.013653220608830452, "grad/layer_16/mlp": 0.01136002503335476, "grad/layer_16/attn_mlp_ratio": 1.201865352281882, "grad/layer_20/attn": 0.055666714906692505, "grad/layer_20/mlp": 0.060688961297273636, "grad/layer_20/attn_mlp_ratio": 0.9172461288683917, "grad/layer_24/attn": 0.01860971376299858, "grad/layer_24/mlp": 0.034106578677892685, "grad/layer_24/attn_mlp_ratio": 0.545634139506865, "grad/layer_27/attn": 0.015842735767364502, "grad/layer_27/mlp": 0.023989582434296608, "grad/layer_27/attn_mlp_ratio": 0.660400644517886} {"step": 2400, "timestamp": 1778197118.167312, "eos/sharpness": 40.11056423187255, "eos/L0_probe": 2.904979944229126, "eos/L_plus": 3.1730270385742188, "eos/L_minus": 3.038038492202759, "eos/grad_norm": 0.31911084055900574, "eos/embed_grad_frac": 0.10694701969623566, "eos/time_s": 0.5938873291015625} {"step": 2400, "timestamp": 1778197118.1877708, "train/loss": 3.1707310676574707, "train/z_loss": 0.0015350496396422386, "train/perplexity": 23.824895601871138, "train/grad_norm": 0.318359375, "optim/muon_lr": 0.0096, "optim/adamw_lr": 0.00028799999999999995, "perf/tokens_per_sec": 1913318.4594207958, "perf/iters_per_sec": 0.9123413369277934, "perf/gpu_mem_gb": 78.330013696, "perf/step_time_s": 1.0960809946060182, "data/tokens_consumed": 5035261952, "data/tokens_consumed_B": 5.035261952, "train/loss_slope": -0.0011872535983399521} {"step": 2400, "timestamp": 1778197119.5484781, "geo/rankme_last": 356.6485900878906, "geo/layer_0/stable_rank_q_proj": 51.9532470703125, "geo/layer_0/stable_rank_k_proj": 50.2096061706543, "geo/layer_0/stable_rank_o_proj": 65.23081970214844, "geo/layer_0/stable_rank_gate_proj": 164.93894958496094, "geo/layer_0/stable_rank_down_proj": 51.180702209472656, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.041976407170295715, "geo/layer_0/attn_entropy_mean": 6.866523265838623, "geo/layer_0/attn_entropy_std": 0.0455050952732563, "geo/layer_7/stable_rank_q_proj": 29.044370651245117, "geo/layer_7/stable_rank_k_proj": 31.2786808013916, "geo/layer_7/stable_rank_o_proj": 112.6767578125, "geo/layer_7/stable_rank_gate_proj": 178.31741333007812, "geo/layer_7/stable_rank_down_proj": 207.26956176757812, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.6504559516906738, "geo/layer_7/attn_entropy_mean": 5.685025215148926, "geo/layer_7/attn_entropy_std": 1.507705569267273, "geo/layer_14/stable_rank_q_proj": 36.61936569213867, "geo/layer_14/stable_rank_k_proj": 24.27240562438965, "geo/layer_14/stable_rank_o_proj": 92.99231719970703, "geo/layer_14/stable_rank_gate_proj": 195.33505249023438, "geo/layer_14/stable_rank_down_proj": 162.72811889648438, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.613847017288208, "geo/layer_14/attn_entropy_mean": 6.794305801391602, "geo/layer_14/attn_entropy_std": 0.17018979787826538, "geo/layer_21/stable_rank_q_proj": 51.554344177246094, "geo/layer_21/stable_rank_k_proj": 38.31288528442383, "geo/layer_21/stable_rank_o_proj": 73.76914978027344, "geo/layer_21/stable_rank_gate_proj": 142.31175231933594, "geo/layer_21/stable_rank_down_proj": 161.20205688476562, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.32950687408447266, "geo/layer_21/attn_entropy_mean": 6.070545673370361, "geo/layer_21/attn_entropy_std": 0.6014280915260315, "geo/layer_27/stable_rank_q_proj": 62.59389877319336, "geo/layer_27/stable_rank_k_proj": 25.194120407104492, "geo/layer_27/stable_rank_o_proj": 88.5457992553711, "geo/layer_27/stable_rank_gate_proj": 91.77896881103516, "geo/layer_27/stable_rank_down_proj": 73.5173568725586, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.1350453495979309, "geo/layer_27/attn_entropy_mean": 5.481790542602539, "geo/layer_27/attn_entropy_std": 0.2501436769962311, "attnres/final_alpha/block_0": 0.217654287815094, "attnres/block_norm/0": 0.7213816046714783, "attnres/final_alpha/block_1": 0.01607271283864975, "attnres/block_norm/1": 2996.0732421875, "attnres/final_alpha/block_2": 0.05464315414428711, "attnres/block_norm/2": 1708.14794921875, "attnres/final_alpha/block_3": 0.03541585057973862, "attnres/block_norm/3": 1567.7427978515625, "attnres/final_alpha/block_4": 0.04454074054956436, "attnres/block_norm/4": 1385.205810546875, "attnres/final_alpha/block_5": 0.36591070890426636, "attnres/block_norm/5": 1374.635986328125, "attnres/final_alpha/block_6": 0.2657625377178192, "attnres/block_norm/6": 1548.8284912109375, "geo/tier1_time_s": 1.3562750816345215, "geo/step": 2400.0, "geo/rankme_slope": 0.1507410665841876} {"step": 2410, "timestamp": 1778197130.273342, "train/loss": 3.1206450700759887, "train/z_loss": 0.0015393149224109948, "train/perplexity": 22.660992857779682, "train/grad_norm": 0.6015625, "optim/muon_lr": 0.00964, "optim/adamw_lr": 0.0002892, "perf/tokens_per_sec": 1735855.4824830925, "perf/iters_per_sec": 0.8277203953185522, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2081374406814576, "data/tokens_consumed": 5056233472, "data/tokens_consumed_B": 5.056233472, "train/loss_slope": -0.001162994858498263} {"step": 2420, "timestamp": 1778197141.1133635, "train/loss": 3.1547348260879517, "train/z_loss": 0.0015397640992887319, "train/perplexity": 23.446818782339182, "train/grad_norm": 0.466796875, "optim/muon_lr": 0.00968, "optim/adamw_lr": 0.00029039999999999996, "perf/tokens_per_sec": 1936076.1395067654, "perf/iters_per_sec": 0.923193044427283, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0831970691680908, "data/tokens_consumed": 5077204992, "data/tokens_consumed_B": 5.077204992, "train/loss_slope": -0.0011453189993205102} {"step": 2430, "timestamp": 1778197151.4721751, "train/loss": 3.115573859214783, "train/z_loss": 0.0015446988050825893, "train/perplexity": 22.546365081145023, "train/grad_norm": 0.38671875, "optim/muon_lr": 0.00972, "optim/adamw_lr": 0.0002916, "perf/tokens_per_sec": 2025753.3777302415, "perf/iters_per_sec": 0.96595448385727, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352454662322998, "data/tokens_consumed": 5098176512, "data/tokens_consumed_B": 5.098176512, "train/loss_slope": -0.0011256340539172861} {"step": 2440, "timestamp": 1778197162.4467604, "train/loss": 3.069662666320801, "train/z_loss": 0.0015529431402683258, "train/perplexity": 21.534637091248058, "train/grad_norm": 0.64453125, "optim/muon_lr": 0.00976, "optim/adamw_lr": 0.00029279999999999996, "perf/tokens_per_sec": 1911953.8525404013, "perf/iters_per_sec": 0.9116906416608817, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.096863293647766, "data/tokens_consumed": 5119148032, "data/tokens_consumed_B": 5.119148032, "train/loss_slope": -0.0011083708028767584} {"step": 2450, "timestamp": 1778197172.7923908, "grad/layer_0/attn": 0.010102816857397556, "grad/layer_0/mlp": 0.02081507071852684, "grad/layer_0/attn_mlp_ratio": 0.4853606766692024, "grad/layer_4/attn": 0.010355613194406033, "grad/layer_4/mlp": 0.023578431457281113, "grad/layer_4/attn_mlp_ratio": 0.4391985603134055, "grad/layer_8/attn": 0.02776178903877735, "grad/layer_8/mlp": 0.0166018046438694, "grad/layer_8/attn_mlp_ratio": 1.6722151276371944, "grad/layer_12/attn": 0.01304218452423811, "grad/layer_12/mlp": 0.011190240271389484, "grad/layer_12/attn_mlp_ratio": 1.1654963692811784, "grad/layer_16/attn": 0.012849241495132446, "grad/layer_16/mlp": 0.012856794521212578, "grad/layer_16/attn_mlp_ratio": 0.9994125187262719, "grad/layer_20/attn": 0.04595869034528732, "grad/layer_20/mlp": 0.055032096803188324, "grad/layer_20/attn_mlp_ratio": 0.835125189326098, "grad/layer_24/attn": 0.01738734543323517, "grad/layer_24/mlp": 0.03611176833510399, "grad/layer_24/attn_mlp_ratio": 0.48148695527004587, "grad/layer_27/attn": 0.02246658131480217, "grad/layer_27/mlp": 0.02801644057035446, "grad/layer_27/attn_mlp_ratio": 0.8019070509044047} {"step": 2450, "timestamp": 1778197172.8084745, "train/loss": 3.065091347694397, "train/z_loss": 0.001558214460965246, "train/perplexity": 21.4364200652964, "train/grad_norm": 0.37109375, "optim/muon_lr": 0.0098, "optim/adamw_lr": 0.000294, "perf/tokens_per_sec": 2025090.2767704618, "perf/iters_per_sec": 0.9656382926800069, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0355844497680664, "data/tokens_consumed": 5140119552, "data/tokens_consumed_B": 5.140119552, "train/loss_slope": -0.0010911164363582965} {"step": 2460, "timestamp": 1778197183.1677456, "train/loss": 3.077402853965759, "train/z_loss": 0.001557654223870486, "train/perplexity": 21.701965966250253, "train/grad_norm": 0.482421875, "optim/muon_lr": 0.00984, "optim/adamw_lr": 0.00029519999999999997, "perf/tokens_per_sec": 2025812.6292853684, "perf/iters_per_sec": 0.9659827372004358, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035215187072754, "data/tokens_consumed": 5161091072, "data/tokens_consumed_B": 5.161091072, "train/loss_slope": -0.0010784277687908256} {"step": 2470, "timestamp": 1778197193.5230927, "train/loss": 3.1049237728118895, "train/z_loss": 0.0015535286045633256, "train/perplexity": 22.307518471057552, "train/grad_norm": 0.478515625, "optim/muon_lr": 0.00988, "optim/adamw_lr": 0.0002964, "perf/tokens_per_sec": 2026336.0596759333, "perf/iters_per_sec": 0.96623232826039, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349477767944335, "data/tokens_consumed": 5182062592, "data/tokens_consumed_B": 5.182062592, "train/loss_slope": -0.0010594602283829151} {"step": 2475, "timestamp": 1778197199.3113022, "eos/sharpness": 45.12732028961181, "eos/L0_probe": 2.866990089416504, "eos/L_plus": 3.1297647953033447, "eos/L_minus": 3.0554885864257812, "eos/grad_norm": 0.49540963768959045, "eos/embed_grad_frac": 0.046537358313798904, "eos/time_s": 0.6201159954071045} {"step": 2475, "timestamp": 1778197200.6903346, "geo/rankme_last": 360.6830139160156, "geo/layer_0/stable_rank_q_proj": 51.58478927612305, "geo/layer_0/stable_rank_k_proj": 49.778564453125, "geo/layer_0/stable_rank_o_proj": 64.99787902832031, "geo/layer_0/stable_rank_gate_proj": 165.235595703125, "geo/layer_0/stable_rank_down_proj": 50.98461151123047, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.03018755279481411, "geo/layer_0/attn_entropy_mean": 6.854849815368652, "geo/layer_0/attn_entropy_std": 0.04881688207387924, "geo/layer_7/stable_rank_q_proj": 29.227277755737305, "geo/layer_7/stable_rank_k_proj": 31.381755828857422, "geo/layer_7/stable_rank_o_proj": 112.25833129882812, "geo/layer_7/stable_rank_gate_proj": 178.27859497070312, "geo/layer_7/stable_rank_down_proj": 207.22994995117188, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.6364891529083252, "geo/layer_7/attn_entropy_mean": 5.679328918457031, "geo/layer_7/attn_entropy_std": 1.491926908493042, "geo/layer_14/stable_rank_q_proj": 36.82258224487305, "geo/layer_14/stable_rank_k_proj": 24.530153274536133, "geo/layer_14/stable_rank_o_proj": 92.8456039428711, "geo/layer_14/stable_rank_gate_proj": 195.39894104003906, "geo/layer_14/stable_rank_down_proj": 162.11517333984375, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.6007408499717712, "geo/layer_14/attn_entropy_mean": 6.808500289916992, "geo/layer_14/attn_entropy_std": 0.1401064693927765, "geo/layer_21/stable_rank_q_proj": 51.03407287597656, "geo/layer_21/stable_rank_k_proj": 38.30875015258789, "geo/layer_21/stable_rank_o_proj": 74.56736755371094, "geo/layer_21/stable_rank_gate_proj": 141.6825714111328, "geo/layer_21/stable_rank_down_proj": 162.04248046875, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.3183232843875885, "geo/layer_21/attn_entropy_mean": 6.041565895080566, "geo/layer_21/attn_entropy_std": 0.6026209592819214, "geo/layer_27/stable_rank_q_proj": 62.44765853881836, "geo/layer_27/stable_rank_k_proj": 25.570207595825195, "geo/layer_27/stable_rank_o_proj": 89.4410171508789, "geo/layer_27/stable_rank_gate_proj": 91.68328857421875, "geo/layer_27/stable_rank_down_proj": 74.62859344482422, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.1337794065475464, "geo/layer_27/attn_entropy_mean": 5.414370536804199, "geo/layer_27/attn_entropy_std": 0.22757773101329803, "attnres/final_alpha/block_0": 0.21856673061847687, "attnres/block_norm/0": 0.7342403531074524, "attnres/final_alpha/block_1": 0.016393039375543594, "attnres/block_norm/1": 3026.419189453125, "attnres/final_alpha/block_2": 0.053727518767118454, "attnres/block_norm/2": 1745.505859375, "attnres/final_alpha/block_3": 0.03490334749221802, "attnres/block_norm/3": 1592.71337890625, "attnres/final_alpha/block_4": 0.04469684883952141, "attnres/block_norm/4": 1407.212890625, "attnres/final_alpha/block_5": 0.36129534244537354, "attnres/block_norm/5": 1425.73974609375, "attnres/final_alpha/block_6": 0.270417183637619, "attnres/block_norm/6": 1582.3355712890625, "geo/tier1_time_s": 1.3593883514404297, "geo/step": 2475.0, "geo/rankme_slope": 0.1454267254453717} {"step": 2480, "timestamp": 1778197205.8863573, "train/loss": 3.0816012382507325, "train/z_loss": 0.0015587154892273247, "train/perplexity": 21.79327069116243, "train/grad_norm": 0.53125, "optim/muon_lr": 0.00992, "optim/adamw_lr": 0.00029759999999999997, "perf/tokens_per_sec": 1697040.6309420755, "perf/iters_per_sec": 0.8092120318136575, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.235770058631897, "data/tokens_consumed": 5203034112, "data/tokens_consumed_B": 5.203034112, "train/loss_slope": -0.0010384638802625379} {"step": 2490, "timestamp": 1778197216.2432988, "train/loss": 3.082480311393738, "train/z_loss": 0.001560879300814122, "train/perplexity": 21.81243699318169, "train/grad_norm": 0.3828125, "optim/muon_lr": 0.00996, "optim/adamw_lr": 0.0002988, "perf/tokens_per_sec": 2025856.76694828, "perf/iters_per_sec": 0.9660037836781883, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351926326751708, "data/tokens_consumed": 5224005632, "data/tokens_consumed_B": 5.224005632, "train/loss_slope": -0.0010224674062617291} {"step": 2500, "timestamp": 1778197226.593861, "grad/layer_0/attn": 0.00907981488853693, "grad/layer_0/mlp": 0.0195450559258461, "grad/layer_0/attn_mlp_ratio": 0.464558140766131, "grad/layer_4/attn": 0.00829903781414032, "grad/layer_4/mlp": 0.019884474575519562, "grad/layer_4/attn_mlp_ratio": 0.4173626887089726, "grad/layer_8/attn": 0.021991943940520287, "grad/layer_8/mlp": 0.017140381038188934, "grad/layer_8/attn_mlp_ratio": 1.283048711882026, "grad/layer_12/attn": 0.011412844061851501, "grad/layer_12/mlp": 0.009826279245316982, "grad/layer_12/attn_mlp_ratio": 1.1614613894821386, "grad/layer_16/attn": 0.014411171898245811, "grad/layer_16/mlp": 0.010378922335803509, "grad/layer_16/attn_mlp_ratio": 1.3885036705286966, "grad/layer_20/attn": 0.047078751027584076, "grad/layer_20/mlp": 0.06004399061203003, "grad/layer_20/attn_mlp_ratio": 0.7840709864434723, "grad/layer_24/attn": 0.024022940546274185, "grad/layer_24/mlp": 0.037825241684913635, "grad/layer_24/attn_mlp_ratio": 0.6351034233403258, "grad/layer_27/attn": 0.02093384973704815, "grad/layer_27/mlp": 0.039593033492565155, "grad/layer_27/attn_mlp_ratio": 0.5287255821938115} {"step": 2500, "timestamp": 1778197226.6098802, "train/loss": 3.01148841381073, "train/z_loss": 0.0015689327847212553, "train/perplexity": 20.317618454732997, "train/grad_norm": 0.5234375, "optim/muon_lr": 0.01, "optim/adamw_lr": 0.0003, "perf/tokens_per_sec": 2023986.010577539, "perf/iters_per_sec": 0.9651117375266738, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0361494541168212, "data/tokens_consumed": 5244977152, "data/tokens_consumed_B": 5.244977152, "train/loss_slope": -0.0010076004131184375} {"step": 2500, "timestamp": 1778197233.6528559, "geo/ww_alpha_mean": 8.192985643488527, "geo/ww_alpha_std": 6.247818721156024, "geo/ww_alpha_min": 1.9954902666337544, "geo/ww_alpha_max": 41.616687312915914, "geo/ww_alpha_healthy_frac": 0.2233502538071066, "geo/ww_alpha_by_type/q_proj": 4.570577133420333, "geo/ww_alpha_by_type/k_proj": 4.511521979080284, "geo/ww_alpha_by_type/v_proj": 6.876958199127849, "geo/ww_alpha_by_type/o_proj": 5.714382550988785, "geo/ww_alpha_by_type/gate_proj": 12.686456563954591, "geo/ww_alpha_by_type/up_proj": 13.838693893061828, "geo/ww_alpha_by_type/down_proj": 9.35349880976822, "geo/twonn_id/layer_0": 0.6699590086936951, "geo/twonn_id/layer_7": 2.1382768154144287, "geo/twonn_id/layer_14": 2.3132073879241943, "geo/twonn_id/layer_21": 6.1184163093566895, "geo/twonn_id/layer_27": 4.7072224617004395, "geo/tier2_time_s": 7.035772323608398} {"step": 2500, "timestamp": 1778197234.4246695, "eoc/jacobian_sigma/layer_0/attn": 843.4803466796875, "eoc/jacobian_sigma/layer_0/mlp": 863.593017578125, "eoc/jacobian_sigma/layer_0": 863.593017578125, "eoc/jacobian_sigma/layer_7/attn": 1.2049319744110107, "eoc/jacobian_sigma/layer_7/mlp": 1.416872501373291, "eoc/jacobian_sigma/layer_7": 1.416872501373291, "eoc/jacobian_sigma/layer_14/attn": 1.2873296737670898, "eoc/jacobian_sigma/layer_14/mlp": 2.8381714820861816, "eoc/jacobian_sigma/layer_14": 2.8381714820861816, "eoc/jacobian_sigma/layer_21/attn": 1.1644651889801025, "eoc/jacobian_sigma/layer_21/mlp": 3.2056713104248047, "eoc/jacobian_sigma/layer_21": 3.2056713104248047, "eoc/jacobian_sigma/layer_27/attn": 1.4303717613220215, "eoc/jacobian_sigma/layer_27/mlp": 4.6511430740356445, "eoc/jacobian_sigma/layer_27": 4.6511430740356445, "eoc/layer0_sigma": 863.593017578125, "eoc/sigma_max": 4.6511430740356445, "eoc/sigma_min": 1.416872501373291, "eoc/sigma_mean": 3.0279645919799805, "eoc/time_s": 0.7648131847381592} {"step": 2510, "timestamp": 1778197244.7944345, "train/loss": 3.037356376647949, "train/z_loss": 0.001567750610411167, "train/perplexity": 20.85005063226692, "train/grad_norm": 0.48046875, "optim/muon_lr": 0.01004, "optim/adamw_lr": 0.00030119999999999995, "perf/tokens_per_sec": 1153641.0876926226, "perf/iters_per_sec": 0.5500989378417123, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.8178548097610474, "data/tokens_consumed": 5265948672, "data/tokens_consumed_B": 5.265948672, "train/loss_slope": -0.0009950727901121106} {"step": 2520, "timestamp": 1778197255.1511698, "train/loss": 3.030234384536743, "train/z_loss": 0.0015703229000791907, "train/perplexity": 20.702084269218982, "train/grad_norm": 0.41015625, "optim/muon_lr": 0.01008, "optim/adamw_lr": 0.0003024, "perf/tokens_per_sec": 2026005.6174322718, "perf/iters_per_sec": 0.9660747611199721, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351165771484374, "data/tokens_consumed": 5286920192, "data/tokens_consumed_B": 5.286920192, "train/loss_slope": -0.0009825569305053675} {"step": 2530, "timestamp": 1778197265.5093174, "train/loss": 3.063194513320923, "train/z_loss": 0.0015674690017476677, "train/perplexity": 21.39579726641295, "train/grad_norm": 0.396484375, "optim/muon_lr": 0.01012, "optim/adamw_lr": 0.00030359999999999995, "perf/tokens_per_sec": 2026353.611600314, "perf/iters_per_sec": 0.9662406976701327, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349388122558594, "data/tokens_consumed": 5307891712, "data/tokens_consumed_B": 5.307891712, "train/loss_slope": -0.0009688313198919379} {"step": 2540, "timestamp": 1778197275.8618374, "train/loss": 3.0680308818817137, "train/z_loss": 0.0015705411205999553, "train/perplexity": 21.499525860312257, "train/grad_norm": 0.490234375, "optim/muon_lr": 0.01016, "optim/adamw_lr": 0.0003048, "perf/tokens_per_sec": 2026775.135826139, "perf/iters_per_sec": 0.9664416960840888, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347235679626465, "data/tokens_consumed": 5328863232, "data/tokens_consumed_B": 5.328863232, "train/loss_slope": -0.0009527198621828278} {"step": 2550, "timestamp": 1778197286.2086635, "grad/layer_0/attn": 0.00835094042122364, "grad/layer_0/mlp": 0.01821180246770382, "grad/layer_0/attn_mlp_ratio": 0.45854551685251127, "grad/layer_4/attn": 0.00809984840452671, "grad/layer_4/mlp": 0.018758296966552734, "grad/layer_4/attn_mlp_ratio": 0.431800838625659, "grad/layer_8/attn": 0.04067584127187729, "grad/layer_8/mlp": 0.01654607616364956, "grad/layer_8/attn_mlp_ratio": 2.458337591567793, "grad/layer_12/attn": 0.009094326756894588, "grad/layer_12/mlp": 0.009599721990525723, "grad/layer_12/attn_mlp_ratio": 0.9473531286775557, "grad/layer_16/attn": 0.011552215553820133, "grad/layer_16/mlp": 0.011150378733873367, "grad/layer_16/attn_mlp_ratio": 1.0360379432783073, "grad/layer_20/attn": 0.06641249358654022, "grad/layer_20/mlp": 0.05409546568989754, "grad/layer_20/attn_mlp_ratio": 1.227690576590671, "grad/layer_24/attn": 0.02568492293357849, "grad/layer_24/mlp": 0.030189653858542442, "grad/layer_24/attn_mlp_ratio": 0.8507856025395317, "grad/layer_27/attn": 0.011778665706515312, "grad/layer_27/mlp": 0.019648149609565735, "grad/layer_27/attn_mlp_ratio": 0.599479639590737} {"step": 2550, "timestamp": 1778197286.8356516, "eos/sharpness": 10.747885704040526, "eos/L0_probe": 2.8307127952575684, "eos/L_plus": 2.897700071334839, "eos/L_minus": 2.871204376220703, "eos/grad_norm": 0.27258002758026123, "eos/embed_grad_frac": 0.11953660100698471, "eos/time_s": 0.6241970062255859} {"step": 2550, "timestamp": 1778197286.8554876, "train/loss": 3.070310425758362, "train/z_loss": 0.0015694017871282994, "train/perplexity": 21.54859087451786, "train/grad_norm": 0.2734375, "optim/muon_lr": 0.0102, "optim/adamw_lr": 0.00030599999999999996, "perf/tokens_per_sec": 1908405.245569953, "perf/iters_per_sec": 0.9099985339975133, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0989028692245484, "data/tokens_consumed": 5349834752, "data/tokens_consumed_B": 5.349834752, "train/loss_slope": -0.0009380862699411955} {"step": 2550, "timestamp": 1778197288.2182727, "geo/rankme_last": 362.2131652832031, "geo/layer_0/stable_rank_q_proj": 51.17829513549805, "geo/layer_0/stable_rank_k_proj": 49.25174331665039, "geo/layer_0/stable_rank_o_proj": 64.48189544677734, "geo/layer_0/stable_rank_gate_proj": 165.84886169433594, "geo/layer_0/stable_rank_down_proj": 50.814334869384766, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.036768440157175064, "geo/layer_0/attn_entropy_mean": 6.847367763519287, "geo/layer_0/attn_entropy_std": 0.04602918028831482, "geo/layer_7/stable_rank_q_proj": 29.3842716217041, "geo/layer_7/stable_rank_k_proj": 31.503602981567383, "geo/layer_7/stable_rank_o_proj": 112.24362182617188, "geo/layer_7/stable_rank_gate_proj": 179.39056396484375, "geo/layer_7/stable_rank_down_proj": 207.2084197998047, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.6324806809425354, "geo/layer_7/attn_entropy_mean": 5.644388675689697, "geo/layer_7/attn_entropy_std": 1.4816197156906128, "geo/layer_14/stable_rank_q_proj": 37.10962677001953, "geo/layer_14/stable_rank_k_proj": 24.76169776916504, "geo/layer_14/stable_rank_o_proj": 92.70311737060547, "geo/layer_14/stable_rank_gate_proj": 195.30726623535156, "geo/layer_14/stable_rank_down_proj": 162.1455078125, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.5927744507789612, "geo/layer_14/attn_entropy_mean": 6.771368026733398, "geo/layer_14/attn_entropy_std": 0.17824818193912506, "geo/layer_21/stable_rank_q_proj": 50.56085968017578, "geo/layer_21/stable_rank_k_proj": 38.12665939331055, "geo/layer_21/stable_rank_o_proj": 75.59324645996094, "geo/layer_21/stable_rank_gate_proj": 141.49234008789062, "geo/layer_21/stable_rank_down_proj": 162.22918701171875, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.3089785873889923, "geo/layer_21/attn_entropy_mean": 6.052611351013184, "geo/layer_21/attn_entropy_std": 0.5962793827056885, "geo/layer_27/stable_rank_q_proj": 62.6422119140625, "geo/layer_27/stable_rank_k_proj": 26.04049301147461, "geo/layer_27/stable_rank_o_proj": 90.49653625488281, "geo/layer_27/stable_rank_gate_proj": 91.58238220214844, "geo/layer_27/stable_rank_down_proj": 75.82404327392578, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.11834462732076645, "geo/layer_27/attn_entropy_mean": 5.3965888023376465, "geo/layer_27/attn_entropy_std": 0.24896240234375, "attnres/final_alpha/block_0": 0.22081385552883148, "attnres/block_norm/0": 0.7471523284912109, "attnres/final_alpha/block_1": 0.01659569889307022, "attnres/block_norm/1": 3074.05712890625, "attnres/final_alpha/block_2": 0.053152114152908325, "attnres/block_norm/2": 1788.3331298828125, "attnres/final_alpha/block_3": 0.034449364989995956, "attnres/block_norm/3": 1646.66796875, "attnres/final_alpha/block_4": 0.04559971019625664, "attnres/block_norm/4": 1422.884033203125, "attnres/final_alpha/block_5": 0.35835108160972595, "attnres/block_norm/5": 1468.64990234375, "attnres/final_alpha/block_6": 0.2710381746292114, "attnres/block_norm/6": 1626.944580078125, "geo/tier1_time_s": 1.3585999011993408, "geo/step": 2550.0, "geo/rankme_slope": 0.14037744569466784} {"step": 2560, "timestamp": 1778197298.5747292, "train/loss": 3.069785308837891, "train/z_loss": 0.0015712006599642337, "train/perplexity": 21.537278315305418, "train/grad_norm": 0.625, "optim/muon_lr": 0.01024, "optim/adamw_lr": 0.0003072, "perf/tokens_per_sec": 1790084.1664838817, "perf/iters_per_sec": 0.8535786468905838, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1715382099151612, "data/tokens_consumed": 5370806272, "data/tokens_consumed_B": 5.370806272, "train/loss_slope": -0.0009194696995649043} {"step": 2570, "timestamp": 1778197308.928224, "train/loss": 3.020755958557129, "train/z_loss": 0.0015813708654604853, "train/perplexity": 20.506788108077373, "train/grad_norm": 0.390625, "optim/muon_lr": 0.010280000000000001, "optim/adamw_lr": 0.00030839999999999996, "perf/tokens_per_sec": 2026736.3751710185, "perf/iters_per_sec": 0.9664232135634511, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034743356704712, "data/tokens_consumed": 5391777792, "data/tokens_consumed_B": 5.391777792, "train/loss_slope": -0.0009086755846843611} {"step": 2580, "timestamp": 1778197319.280747, "train/loss": 3.0235312461853026, "train/z_loss": 0.0015788154560141265, "train/perplexity": 20.56377939042775, "train/grad_norm": 0.29296875, "optim/muon_lr": 0.010320000000000001, "optim/adamw_lr": 0.0003096, "perf/tokens_per_sec": 2026615.2926984676, "perf/iters_per_sec": 0.9663654769413317, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034805178642273, "data/tokens_consumed": 5412749312, "data/tokens_consumed_B": 5.412749312, "train/loss_slope": -0.0008990873320911727} {"step": 2590, "timestamp": 1778197329.6641378, "train/loss": 2.9409430027008057, "train/z_loss": 0.0015947582200169564, "train/perplexity": 18.933692419556735, "train/grad_norm": 0.671875, "optim/muon_lr": 0.010360000000000001, "optim/adamw_lr": 0.00031079999999999997, "perf/tokens_per_sec": 2022275.469494242, "perf/iters_per_sec": 0.9642960879775248, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0370258808135986, "data/tokens_consumed": 5433720832, "data/tokens_consumed_B": 5.433720832, "train/loss_slope": -0.0008891467176254826} {"step": 2600, "timestamp": 1778197340.010248, "grad/layer_0/attn": 0.007691826205700636, "grad/layer_0/mlp": 0.016577649861574173, "grad/layer_0/attn_mlp_ratio": 0.46398773188779757, "grad/layer_4/attn": 0.007461846340447664, "grad/layer_4/mlp": 0.017196571454405785, "grad/layer_4/attn_mlp_ratio": 0.4339147670708775, "grad/layer_8/attn": 0.02216987870633602, "grad/layer_8/mlp": 0.013733677566051483, "grad/layer_8/attn_mlp_ratio": 1.614271082037853, "grad/layer_12/attn": 0.008213932625949383, "grad/layer_12/mlp": 0.009495378471910954, "grad/layer_12/attn_mlp_ratio": 0.8650453021691711, "grad/layer_16/attn": 0.008403398096561432, "grad/layer_16/mlp": 0.009536433033645153, "grad/layer_16/attn_mlp_ratio": 0.8811888028568774, "grad/layer_20/attn": 0.08031444251537323, "grad/layer_20/mlp": 0.04543457180261612, "grad/layer_20/attn_mlp_ratio": 1.7676944923684585, "grad/layer_24/attn": 0.012779354117810726, "grad/layer_24/mlp": 0.02728050947189331, "grad/layer_24/attn_mlp_ratio": 0.4684426470895946, "grad/layer_27/attn": 0.010707118548452854, "grad/layer_27/mlp": 0.01927899569272995, "grad/layer_27/attn_mlp_ratio": 0.5553773995059678} {"step": 2600, "timestamp": 1778197340.026052, "train/loss": 2.9709487199783324, "train/z_loss": 0.0015945207444019616, "train/perplexity": 19.51042074438476, "train/grad_norm": 0.2451171875, "optim/muon_lr": 0.010400000000000001, "optim/adamw_lr": 0.000312, "perf/tokens_per_sec": 2025025.7061605046, "perf/iters_per_sec": 0.9656075030138515, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035617470741272, "data/tokens_consumed": 5454692352, "data/tokens_consumed_B": 5.454692352, "train/loss_slope": -0.0008778318844648442} {"step": 2610, "timestamp": 1778197350.3864596, "train/loss": 2.9542200326919557, "train/z_loss": 0.0015963656012900173, "train/perplexity": 19.186751843009766, "train/grad_norm": 0.400390625, "optim/muon_lr": 0.010440000000000001, "optim/adamw_lr": 0.00031319999999999997, "perf/tokens_per_sec": 2025719.6478813249, "perf/iters_per_sec": 0.965938400211966, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352627038955688, "data/tokens_consumed": 5475663872, "data/tokens_consumed_B": 5.475663872, "train/loss_slope": -0.000869499888378616} {"step": 2620, "timestamp": 1778197360.734788, "train/loss": 3.025429701805115, "train/z_loss": 0.0015902662649750709, "train/perplexity": 20.60285589374334, "train/grad_norm": 0.392578125, "optim/muon_lr": 0.010480000000000001, "optim/adamw_lr": 0.0003144, "perf/tokens_per_sec": 2027451.0236555578, "perf/iters_per_sec": 0.9667639845159329, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343786239624024, "data/tokens_consumed": 5496635392, "data/tokens_consumed_B": 5.496635392, "train/loss_slope": -0.0008615560285114434} {"step": 2625, "timestamp": 1778197366.5050104, "eos/sharpness": 24.18856620788574, "eos/L0_probe": 2.799013614654541, "eos/L_plus": 2.9241294860839844, "eos/L_minus": 2.915783405303955, "eos/grad_norm": 0.2857022285461426, "eos/embed_grad_frac": 0.09859665483236313, "eos/time_s": 0.6031351089477539} {"step": 2625, "timestamp": 1778197367.8821583, "geo/rankme_last": 363.7771911621094, "geo/layer_0/stable_rank_q_proj": 50.80949401855469, "geo/layer_0/stable_rank_k_proj": 48.753597259521484, "geo/layer_0/stable_rank_o_proj": 64.26385498046875, "geo/layer_0/stable_rank_gate_proj": 165.7347869873047, "geo/layer_0/stable_rank_down_proj": 50.63334274291992, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.03058493323624134, "geo/layer_0/attn_entropy_mean": 6.83509635925293, "geo/layer_0/attn_entropy_std": 0.04335051774978638, "geo/layer_7/stable_rank_q_proj": 29.567241668701172, "geo/layer_7/stable_rank_k_proj": 31.589696884155273, "geo/layer_7/stable_rank_o_proj": 111.73902130126953, "geo/layer_7/stable_rank_gate_proj": 179.72378540039062, "geo/layer_7/stable_rank_down_proj": 207.70559692382812, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.6250200271606445, "geo/layer_7/attn_entropy_mean": 5.616710186004639, "geo/layer_7/attn_entropy_std": 1.4713817834854126, "geo/layer_14/stable_rank_q_proj": 37.394046783447266, "geo/layer_14/stable_rank_k_proj": 25.028480529785156, "geo/layer_14/stable_rank_o_proj": 92.81415557861328, "geo/layer_14/stable_rank_gate_proj": 194.6523895263672, "geo/layer_14/stable_rank_down_proj": 161.01658630371094, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.5789563059806824, "geo/layer_14/attn_entropy_mean": 6.783664226531982, "geo/layer_14/attn_entropy_std": 0.14594675600528717, "geo/layer_21/stable_rank_q_proj": 49.96323013305664, "geo/layer_21/stable_rank_k_proj": 38.027469635009766, "geo/layer_21/stable_rank_o_proj": 76.75260925292969, "geo/layer_21/stable_rank_gate_proj": 140.9569091796875, "geo/layer_21/stable_rank_down_proj": 162.34906005859375, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.30915316939353943, "geo/layer_21/attn_entropy_mean": 6.066874980926514, "geo/layer_21/attn_entropy_std": 0.5760408639907837, "geo/layer_27/stable_rank_q_proj": 62.34858703613281, "geo/layer_27/stable_rank_k_proj": 26.454086303710938, "geo/layer_27/stable_rank_o_proj": 91.28846740722656, "geo/layer_27/stable_rank_gate_proj": 91.41585540771484, "geo/layer_27/stable_rank_down_proj": 76.97161102294922, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.1257214993238449, "geo/layer_27/attn_entropy_mean": 5.359131813049316, "geo/layer_27/attn_entropy_std": 0.23522616922855377, "attnres/final_alpha/block_0": 0.22270137071609497, "attnres/block_norm/0": 0.7603608965873718, "attnres/final_alpha/block_1": 0.016856756061315536, "attnres/block_norm/1": 3115.1962890625, "attnres/final_alpha/block_2": 0.05226874351501465, "attnres/block_norm/2": 1827.014404296875, "attnres/final_alpha/block_3": 0.03377002105116844, "attnres/block_norm/3": 1695.733154296875, "attnres/final_alpha/block_4": 0.04490265995264053, "attnres/block_norm/4": 1459.9495849609375, "attnres/final_alpha/block_5": 0.3588765263557434, "attnres/block_norm/5": 1511.294189453125, "attnres/final_alpha/block_6": 0.27062392234802246, "attnres/block_norm/6": 1670.2879638671875, "geo/tier1_time_s": 1.3576734066009521, "geo/step": 2625.0, "geo/rankme_slope": 0.13558582821407833} {"step": 2630, "timestamp": 1778197373.0587215, "train/loss": 2.944842004776001, "train/z_loss": 0.0016004947363398969, "train/perplexity": 19.007659029870567, "train/grad_norm": 0.333984375, "optim/muon_lr": 0.01052, "optim/adamw_lr": 0.0003156, "perf/tokens_per_sec": 1702499.5592260545, "perf/iters_per_sec": 0.8118150516634247, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2318076610565185, "data/tokens_consumed": 5517606912, "data/tokens_consumed_B": 5.517606912, "train/loss_slope": -0.0008510535913868563} {"step": 2640, "timestamp": 1778197383.4077826, "train/loss": 3.0207318305969237, "train/z_loss": 0.0015967399114742876, "train/perplexity": 20.506293327079014, "train/grad_norm": 0.5078125, "optim/muon_lr": 0.01056, "optim/adamw_lr": 0.0003168, "perf/tokens_per_sec": 2027675.7809778315, "perf/iters_per_sec": 0.9668711571587713, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342639684677124, "data/tokens_consumed": 5538578432, "data/tokens_consumed_B": 5.538578432, "train/loss_slope": -0.0008343961588656119} {"step": 2650, "timestamp": 1778197393.7469854, "grad/layer_0/attn": 0.009021277539432049, "grad/layer_0/mlp": 0.0188848115503788, "grad/layer_0/attn_mlp_ratio": 0.47770015960159684, "grad/layer_4/attn": 0.009378798305988312, "grad/layer_4/mlp": 0.018221568316221237, "grad/layer_4/attn_mlp_ratio": 0.5147086184765028, "grad/layer_8/attn": 0.017407251521945, "grad/layer_8/mlp": 0.015839112922549248, "grad/layer_8/attn_mlp_ratio": 1.0990041864821143, "grad/layer_12/attn": 0.010194361209869385, "grad/layer_12/mlp": 0.010129979811608791, "grad/layer_12/attn_mlp_ratio": 1.0063555208225847, "grad/layer_16/attn": 0.012085763737559319, "grad/layer_16/mlp": 0.011121653020381927, "grad/layer_16/attn_mlp_ratio": 1.0866877079101243, "grad/layer_20/attn": 0.041559141129255295, "grad/layer_20/mlp": 0.047838348895311356, "grad/layer_20/attn_mlp_ratio": 0.8687411251029694, "grad/layer_24/attn": 0.01569552533328533, "grad/layer_24/mlp": 0.03546919301152229, "grad/layer_24/attn_mlp_ratio": 0.44251148550054226, "grad/layer_27/attn": 0.0278889462351799, "grad/layer_27/mlp": 0.03278340399265289, "grad/layer_27/attn_mlp_ratio": 0.8507031837316158} {"step": 2650, "timestamp": 1778197393.7626033, "train/loss": 2.9588476419448853, "train/z_loss": 0.0016031580395065247, "train/perplexity": 19.275746390551568, "train/grad_norm": 0.421875, "optim/muon_lr": 0.0106, "optim/adamw_lr": 0.000318, "perf/tokens_per_sec": 2026267.6287469142, "perf/iters_per_sec": 0.9661996978506633, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03498272895813, "data/tokens_consumed": 5559549952, "data/tokens_consumed_B": 5.559549952, "train/loss_slope": -0.0008241417483098865} {"step": 2660, "timestamp": 1778197404.107888, "train/loss": 2.99115195274353, "train/z_loss": 0.001600467402022332, "train/perplexity": 19.908603057237485, "train/grad_norm": 0.482421875, "optim/muon_lr": 0.01064, "optim/adamw_lr": 0.0003192, "perf/tokens_per_sec": 2028023.5542918022, "perf/iters_per_sec": 0.967036988397504, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340866088867187, "data/tokens_consumed": 5580521472, "data/tokens_consumed_B": 5.580521472, "train/loss_slope": -0.0008154745167309623} {"step": 2670, "timestamp": 1778197414.4536722, "train/loss": 2.9699583768844606, "train/z_loss": 0.00160273399669677, "train/perplexity": 19.491108298494172, "train/grad_norm": 0.357421875, "optim/muon_lr": 0.01068, "optim/adamw_lr": 0.0003204, "perf/tokens_per_sec": 2027978.012996352, "perf/iters_per_sec": 0.9670152726156006, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341098308563232, "data/tokens_consumed": 5601492992, "data/tokens_consumed_B": 5.601492992, "train/loss_slope": -0.0008090986896340925} {"step": 2680, "timestamp": 1778197424.8079908, "train/loss": 3.0150986671447755, "train/z_loss": 0.001598338084295392, "train/perplexity": 20.391102773186145, "train/grad_norm": 0.40625, "optim/muon_lr": 0.01072, "optim/adamw_lr": 0.0003216, "perf/tokens_per_sec": 2026450.5261039361, "perf/iters_per_sec": 0.9662869101066285, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034889316558838, "data/tokens_consumed": 5622464512, "data/tokens_consumed_B": 5.622464512, "train/loss_slope": -0.0007970489084631196} {"step": 2690, "timestamp": 1778197435.1667879, "train/loss": 2.9957248687744142, "train/z_loss": 0.001602642482612282, "train/perplexity": 19.99985190495677, "train/grad_norm": 0.337890625, "optim/muon_lr": 0.01076, "optim/adamw_lr": 0.0003228, "perf/tokens_per_sec": 2025701.873682565, "perf/iters_per_sec": 0.96592992481354, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352717876434325, "data/tokens_consumed": 5643436032, "data/tokens_consumed_B": 5.643436032, "train/loss_slope": -0.0007858488973754323} {"step": 2700, "timestamp": 1778197445.5070796, "grad/layer_0/attn": 0.007367696147412062, "grad/layer_0/mlp": 0.017077291384339333, "grad/layer_0/attn_mlp_ratio": 0.4314323588239142, "grad/layer_4/attn": 0.0074377357959747314, "grad/layer_4/mlp": 0.017045194283127785, "grad/layer_4/attn_mlp_ratio": 0.43635382670302586, "grad/layer_8/attn": 0.0195685476064682, "grad/layer_8/mlp": 0.015283704735338688, "grad/layer_8/attn_mlp_ratio": 1.280353671920056, "grad/layer_12/attn": 0.013794579543173313, "grad/layer_12/mlp": 0.010152786038815975, "grad/layer_12/attn_mlp_ratio": 1.3586989181653388, "grad/layer_16/attn": 0.009889014065265656, "grad/layer_16/mlp": 0.01022174209356308, "grad/layer_16/attn_mlp_ratio": 0.9674489806143857, "grad/layer_20/attn": 0.09760934114456177, "grad/layer_20/mlp": 0.04853718727827072, "grad/layer_20/attn_mlp_ratio": 2.0110217838510316, "grad/layer_24/attn": 0.047073207795619965, "grad/layer_24/mlp": 0.03501247242093086, "grad/layer_24/attn_mlp_ratio": 1.3444696819820152, "grad/layer_27/attn": 0.02798617258667946, "grad/layer_27/mlp": 0.03504226356744766, "grad/layer_27/attn_mlp_ratio": 0.7986405459495779} {"step": 2700, "timestamp": 1778197446.1147125, "eos/sharpness": 87.38446235656737, "eos/L0_probe": 2.7709405422210693, "eos/L_plus": 3.446380376815796, "eos/L_minus": 2.9693453311920166, "eos/grad_norm": 0.49176618456840515, "eos/embed_grad_frac": 0.04381680116057396, "eos/time_s": 0.6049537658691406} {"step": 2700, "timestamp": 1778197446.1335812, "train/loss": 2.954541730880737, "train/z_loss": 0.001605679525528103, "train/perplexity": 19.192925179248565, "train/grad_norm": 0.4921875, "optim/muon_lr": 0.0108, "optim/adamw_lr": 0.000324, "perf/tokens_per_sec": 1913155.1213981528, "perf/iters_per_sec": 0.9122634512892498, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0961745738983155, "data/tokens_consumed": 5664407552, "data/tokens_consumed_B": 5.664407552, "train/loss_slope": -0.0007760938928394106} {"step": 2700, "timestamp": 1778197447.4967253, "geo/rankme_last": 365.8366394042969, "geo/layer_0/stable_rank_q_proj": 50.521873474121094, "geo/layer_0/stable_rank_k_proj": 48.2588996887207, "geo/layer_0/stable_rank_o_proj": 63.85299301147461, "geo/layer_0/stable_rank_gate_proj": 166.4049072265625, "geo/layer_0/stable_rank_down_proj": 50.53205871582031, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.03649662807583809, "geo/layer_0/attn_entropy_mean": 6.827932834625244, "geo/layer_0/attn_entropy_std": 0.049391333013772964, "geo/layer_7/stable_rank_q_proj": 29.75893783569336, "geo/layer_7/stable_rank_k_proj": 31.707006454467773, "geo/layer_7/stable_rank_o_proj": 110.8281021118164, "geo/layer_7/stable_rank_gate_proj": 179.5436248779297, "geo/layer_7/stable_rank_down_proj": 207.89381408691406, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.6174900531768799, "geo/layer_7/attn_entropy_mean": 5.595965385437012, "geo/layer_7/attn_entropy_std": 1.4682766199111938, "geo/layer_14/stable_rank_q_proj": 37.82910919189453, "geo/layer_14/stable_rank_k_proj": 25.30350112915039, "geo/layer_14/stable_rank_o_proj": 93.09485626220703, "geo/layer_14/stable_rank_gate_proj": 194.6992645263672, "geo/layer_14/stable_rank_down_proj": 160.35235595703125, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.5714733600616455, "geo/layer_14/attn_entropy_mean": 6.764698028564453, "geo/layer_14/attn_entropy_std": 0.16495785117149353, "geo/layer_21/stable_rank_q_proj": 49.34804916381836, "geo/layer_21/stable_rank_k_proj": 37.802696228027344, "geo/layer_21/stable_rank_o_proj": 78.0188217163086, "geo/layer_21/stable_rank_gate_proj": 140.5836944580078, "geo/layer_21/stable_rank_down_proj": 162.71353149414062, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.30190446972846985, "geo/layer_21/attn_entropy_mean": 6.027431488037109, "geo/layer_21/attn_entropy_std": 0.5830499529838562, "geo/layer_27/stable_rank_q_proj": 62.24538040161133, "geo/layer_27/stable_rank_k_proj": 26.780155181884766, "geo/layer_27/stable_rank_o_proj": 91.83119201660156, "geo/layer_27/stable_rank_gate_proj": 91.33970642089844, "geo/layer_27/stable_rank_down_proj": 78.17671203613281, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.11316860467195511, "geo/layer_27/attn_entropy_mean": 5.349078178405762, "geo/layer_27/attn_entropy_std": 0.20569632947444916, "attnres/final_alpha/block_0": 0.22322124242782593, "attnres/block_norm/0": 0.7732534408569336, "attnres/final_alpha/block_1": 0.016954947263002396, "attnres/block_norm/1": 3163.19873046875, "attnres/final_alpha/block_2": 0.05205143988132477, "attnres/block_norm/2": 1876.282470703125, "attnres/final_alpha/block_3": 0.03350621089339256, "attnres/block_norm/3": 1736.199951171875, "attnres/final_alpha/block_4": 0.04549165442585945, "attnres/block_norm/4": 1475.7451171875, "attnres/final_alpha/block_5": 0.3606044352054596, "attnres/block_norm/5": 1557.0455322265625, "attnres/final_alpha/block_6": 0.2681700885295868, "attnres/block_norm/6": 1719.985107421875, "geo/tier1_time_s": 1.3591322898864746, "geo/step": 2700.0, "geo/rankme_slope": 0.13106891220283298} {"step": 2710, "timestamp": 1778197457.848187, "train/loss": 2.9492873430252073, "train/z_loss": 0.0016120549757033587, "train/perplexity": 19.092342587660358, "train/grad_norm": 0.384765625, "optim/muon_lr": 0.01084, "optim/adamw_lr": 0.0003252, "perf/tokens_per_sec": 1790795.4863570759, "perf/iters_per_sec": 0.8539178306374912, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1710728645324706, "data/tokens_consumed": 5685379072, "data/tokens_consumed_B": 5.685379072, "train/loss_slope": -0.0007651416993591831} {"step": 2720, "timestamp": 1778197468.195447, "train/loss": 2.9911282539367674, "train/z_loss": 0.0016092660371214151, "train/perplexity": 19.90813125269134, "train/grad_norm": 0.271484375, "optim/muon_lr": 0.01088, "optim/adamw_lr": 0.0003264, "perf/tokens_per_sec": 2027686.6251779078, "perf/iters_per_sec": 0.9668763280763186, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342584371566772, "data/tokens_consumed": 5706350592, "data/tokens_consumed_B": 5.706350592, "train/loss_slope": -0.0007488763844827399} {"step": 2730, "timestamp": 1778197478.5617802, "train/loss": 3.0092824697494507, "train/z_loss": 0.0016096381354145707, "train/perplexity": 20.27284832332119, "train/grad_norm": 0.3984375, "optim/muon_lr": 0.010920000000000001, "optim/adamw_lr": 0.0003276, "perf/tokens_per_sec": 2024067.9806757397, "perf/iters_per_sec": 0.9651508239153574, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0361074924468994, "data/tokens_consumed": 5727322112, "data/tokens_consumed_B": 5.727322112, "train/loss_slope": -0.0007385693335797813} {"step": 2740, "timestamp": 1778197488.9391148, "train/loss": 2.9871238708496093, "train/z_loss": 0.0016037458553910255, "train/perplexity": 19.82857087004457, "train/grad_norm": 0.435546875, "optim/muon_lr": 0.010960000000000001, "optim/adamw_lr": 0.0003288, "perf/tokens_per_sec": 2022161.6136973286, "perf/iters_per_sec": 0.9642417973028796, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0370842695236206, "data/tokens_consumed": 5748293632, "data/tokens_consumed_B": 5.748293632, "train/loss_slope": -0.0007263830173491287} {"step": 2750, "timestamp": 1778197499.3095708, "grad/layer_0/attn": 0.00861442182213068, "grad/layer_0/mlp": 0.01817380078136921, "grad/layer_0/attn_mlp_ratio": 0.4740022122153718, "grad/layer_4/attn": 0.007076525595039129, "grad/layer_4/mlp": 0.016048695892095566, "grad/layer_4/attn_mlp_ratio": 0.44094084644163717, "grad/layer_8/attn": 0.02416938543319702, "grad/layer_8/mlp": 0.014131560921669006, "grad/layer_8/attn_mlp_ratio": 1.7103124981122928, "grad/layer_12/attn": 0.012889334000647068, "grad/layer_12/mlp": 0.01013938244432211, "grad/layer_12/attn_mlp_ratio": 1.2712148835793642, "grad/layer_16/attn": 0.010675023309886456, "grad/layer_16/mlp": 0.011344630271196365, "grad/layer_16/attn_mlp_ratio": 0.9409758591156906, "grad/layer_20/attn": 0.11856961995363235, "grad/layer_20/mlp": 0.048910245299339294, "grad/layer_20/attn_mlp_ratio": 2.424228686352779, "grad/layer_24/attn": 0.04344402626156807, "grad/layer_24/mlp": 0.03668228164315224, "grad/layer_24/attn_mlp_ratio": 1.1843327131545762, "grad/layer_27/attn": 0.02793198637664318, "grad/layer_27/mlp": 0.03433159366250038, "grad/layer_27/attn_mlp_ratio": 0.8135942237308148} {"step": 2750, "timestamp": 1778197499.3260658, "train/loss": 2.914379525184631, "train/z_loss": 0.0016211453708820044, "train/perplexity": 18.43736892983806, "train/grad_norm": 0.51953125, "optim/muon_lr": 0.011000000000000001, "optim/adamw_lr": 0.00033, "perf/tokens_per_sec": 2020004.7298769434, "perf/iters_per_sec": 0.9632133149513928, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0381916284561157, "data/tokens_consumed": 5769265152, "data/tokens_consumed_B": 5.769265152, "train/loss_slope": -0.0007127553484155865} {"step": 2760, "timestamp": 1778197509.7015178, "train/loss": 2.996914863586426, "train/z_loss": 0.001612042635679245, "train/perplexity": 20.02366579135509, "train/grad_norm": 0.357421875, "optim/muon_lr": 0.011040000000000001, "optim/adamw_lr": 0.0003312, "perf/tokens_per_sec": 2022101.599325679, "perf/iters_per_sec": 0.9642131802204509, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0371150493621826, "data/tokens_consumed": 5790236672, "data/tokens_consumed_B": 5.790236672, "train/loss_slope": -0.0007017924838262101} {"step": 2770, "timestamp": 1778197520.0740256, "train/loss": 2.9531127214431763, "train/z_loss": 0.0016230224748142064, "train/perplexity": 19.16551789533067, "train/grad_norm": 0.361328125, "optim/muon_lr": 0.011080000000000001, "optim/adamw_lr": 0.0003324, "perf/tokens_per_sec": 2022786.0953480138, "perf/iters_per_sec": 0.9645395733585423, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036764097213745, "data/tokens_consumed": 5811208192, "data/tokens_consumed_B": 5.811208192, "train/loss_slope": -0.0006928069855907176} {"step": 2775, "timestamp": 1778197525.8516583, "eos/sharpness": 87.26756572723387, "eos/L0_probe": 2.7448272705078125, "eos/L_plus": 3.4021286964416504, "eos/L_minus": 2.9602015018463135, "eos/grad_norm": 0.5884100794792175, "eos/embed_grad_frac": 0.0341145321726799, "eos/time_s": 0.6028306484222412} {"step": 2775, "timestamp": 1778197527.2281826, "geo/rankme_last": 365.7635498046875, "geo/layer_0/stable_rank_q_proj": 49.92041015625, "geo/layer_0/stable_rank_k_proj": 47.49867630004883, "geo/layer_0/stable_rank_o_proj": 63.34928894042969, "geo/layer_0/stable_rank_gate_proj": 166.16360473632812, "geo/layer_0/stable_rank_down_proj": 50.52412796020508, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.028921380639076233, "geo/layer_0/attn_entropy_mean": 6.81790828704834, "geo/layer_0/attn_entropy_std": 0.04474807530641556, "geo/layer_7/stable_rank_q_proj": 30.023387908935547, "geo/layer_7/stable_rank_k_proj": 31.869192123413086, "geo/layer_7/stable_rank_o_proj": 110.42516326904297, "geo/layer_7/stable_rank_gate_proj": 179.65887451171875, "geo/layer_7/stable_rank_down_proj": 208.87533569335938, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.6188016533851624, "geo/layer_7/attn_entropy_mean": 5.588834762573242, "geo/layer_7/attn_entropy_std": 1.4666908979415894, "geo/layer_14/stable_rank_q_proj": 38.127288818359375, "geo/layer_14/stable_rank_k_proj": 25.52727508544922, "geo/layer_14/stable_rank_o_proj": 93.10347747802734, "geo/layer_14/stable_rank_gate_proj": 194.80825805664062, "geo/layer_14/stable_rank_down_proj": 160.06912231445312, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.5696324110031128, "geo/layer_14/attn_entropy_mean": 6.728579521179199, "geo/layer_14/attn_entropy_std": 0.15294525027275085, "geo/layer_21/stable_rank_q_proj": 48.81121826171875, "geo/layer_21/stable_rank_k_proj": 37.70085906982422, "geo/layer_21/stable_rank_o_proj": 78.93939971923828, "geo/layer_21/stable_rank_gate_proj": 140.43055725097656, "geo/layer_21/stable_rank_down_proj": 163.08724975585938, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.28969311714172363, "geo/layer_21/attn_entropy_mean": 6.064884185791016, "geo/layer_21/attn_entropy_std": 0.5467575192451477, "geo/layer_27/stable_rank_q_proj": 61.866844177246094, "geo/layer_27/stable_rank_k_proj": 27.137195587158203, "geo/layer_27/stable_rank_o_proj": 93.15581512451172, "geo/layer_27/stable_rank_gate_proj": 90.88613891601562, "geo/layer_27/stable_rank_down_proj": 79.55394744873047, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.11271873116493225, "geo/layer_27/attn_entropy_mean": 5.330917835235596, "geo/layer_27/attn_entropy_std": 0.22547949850559235, "attnres/final_alpha/block_0": 0.22298870980739594, "attnres/block_norm/0": 0.7860974073410034, "attnres/final_alpha/block_1": 0.01705116406083107, "attnres/block_norm/1": 3235.72802734375, "attnres/final_alpha/block_2": 0.04981372505426407, "attnres/block_norm/2": 1939.74755859375, "attnres/final_alpha/block_3": 0.03254055604338646, "attnres/block_norm/3": 1796.339599609375, "attnres/final_alpha/block_4": 0.04469394311308861, "attnres/block_norm/4": 1501.2412109375, "attnres/final_alpha/block_5": 0.3662990927696228, "attnres/block_norm/5": 1593.5947265625, "attnres/final_alpha/block_6": 0.26661282777786255, "attnres/block_norm/6": 1777.2913818359375, "geo/tier1_time_s": 1.356881856918335, "geo/step": 2775.0, "geo/rankme_slope": 0.12669403041109553} {"step": 2780, "timestamp": 1778197532.4175096, "train/loss": 2.865747857093811, "train/z_loss": 0.001635185827035457, "train/perplexity": 17.562182297245933, "train/grad_norm": 0.35546875, "optim/muon_lr": 0.011120000000000001, "optim/adamw_lr": 0.0003336, "perf/tokens_per_sec": 1699748.057374895, "perf/iters_per_sec": 0.8105030333399272, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2338016748428344, "data/tokens_consumed": 5832179712, "data/tokens_consumed_B": 5.832179712, "train/loss_slope": -0.0006897793761538629} {"step": 2790, "timestamp": 1778197542.790787, "train/loss": 2.900516629219055, "train/z_loss": 0.0016324645839631557, "train/perplexity": 18.18353708978195, "train/grad_norm": 0.384765625, "optim/muon_lr": 0.011160000000000002, "optim/adamw_lr": 0.0003348, "perf/tokens_per_sec": 2022611.672279301, "perf/iters_per_sec": 0.9644564019581323, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0368535041809082, "data/tokens_consumed": 5853151232, "data/tokens_consumed_B": 5.853151232, "train/loss_slope": -0.0006821829982967016} {"step": 2800, "timestamp": 1778197553.1563742, "grad/layer_0/attn": 0.007183649577200413, "grad/layer_0/mlp": 0.015201170928776264, "grad/layer_0/attn_mlp_ratio": 0.4725721172139668, "grad/layer_4/attn": 0.006941772531718016, "grad/layer_4/mlp": 0.015317162498831749, "grad/layer_4/attn_mlp_ratio": 0.4532022485840471, "grad/layer_8/attn": 0.017679577693343163, "grad/layer_8/mlp": 0.011806266382336617, "grad/layer_8/attn_mlp_ratio": 1.4974740507334492, "grad/layer_12/attn": 0.009466606192290783, "grad/layer_12/mlp": 0.009628836065530777, "grad/layer_12/attn_mlp_ratio": 0.98315165296708, "grad/layer_16/attn": 0.009625833481550217, "grad/layer_16/mlp": 0.00931097473949194, "grad/layer_16/attn_mlp_ratio": 1.0338158621933788, "grad/layer_20/attn": 0.04994424059987068, "grad/layer_20/mlp": 0.039904553443193436, "grad/layer_20/attn_mlp_ratio": 1.2515925167740594, "grad/layer_24/attn": 0.020185178145766258, "grad/layer_24/mlp": 0.03373141214251518, "grad/layer_24/attn_mlp_ratio": 0.5984089252072522, "grad/layer_27/attn": 0.021842606365680695, "grad/layer_27/mlp": 0.031863849610090256, "grad/layer_27/attn_mlp_ratio": 0.6854980350589541} {"step": 2800, "timestamp": 1778197553.1728258, "train/loss": 2.8617496252059937, "train/z_loss": 0.0016401086701080202, "train/perplexity": 17.49210480634883, "train/grad_norm": 0.375, "optim/muon_lr": 0.011200000000000002, "optim/adamw_lr": 0.000336, "perf/tokens_per_sec": 2021106.6517712346, "perf/iters_per_sec": 0.9637387522560285, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0376255989074707, "data/tokens_consumed": 5874122752, "data/tokens_consumed_B": 5.874122752, "train/loss_slope": -0.0006810089902909282} {"step": 2810, "timestamp": 1778197563.5418324, "train/loss": 2.947140669822693, "train/z_loss": 0.0016268071834929288, "train/perplexity": 19.051401526715477, "train/grad_norm": 0.361328125, "optim/muon_lr": 0.011240000000000002, "optim/adamw_lr": 0.0003372, "perf/tokens_per_sec": 2023493.2121924898, "perf/iters_per_sec": 0.9648767529451798, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0364017963409424, "data/tokens_consumed": 5895094272, "data/tokens_consumed_B": 5.895094272, "train/loss_slope": -0.0006706195006526486} {"step": 2820, "timestamp": 1778197573.916426, "train/loss": 2.9109472036361694, "train/z_loss": 0.001636017276905477, "train/perplexity": 18.374194430781113, "train/grad_norm": 0.451171875, "optim/muon_lr": 0.011279999999999998, "optim/adamw_lr": 0.00033839999999999993, "perf/tokens_per_sec": 2022709.0664119006, "perf/iters_per_sec": 0.9645028430995467, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0368035793304444, "data/tokens_consumed": 5916065792, "data/tokens_consumed_B": 5.916065792, "train/loss_slope": -0.0006635957616891297} {"step": 2830, "timestamp": 1778197584.3085694, "train/loss": 2.953459692001343, "train/z_loss": 0.0016298089292831718, "train/perplexity": 19.172168919560434, "train/grad_norm": 0.380859375, "optim/muon_lr": 0.011319999999999998, "optim/adamw_lr": 0.00033959999999999996, "perf/tokens_per_sec": 2019731.2573936342, "perf/iters_per_sec": 0.9630829131096049, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.038332200050354, "data/tokens_consumed": 5937037312, "data/tokens_consumed_B": 5.937037312, "train/loss_slope": -0.0006553093417642927} {"step": 2840, "timestamp": 1778197594.684905, "train/loss": 2.938247323036194, "train/z_loss": 0.0016328622819855808, "train/perplexity": 18.882721980778502, "train/grad_norm": 0.29296875, "optim/muon_lr": 0.011359999999999999, "optim/adamw_lr": 0.00034079999999999994, "perf/tokens_per_sec": 2022154.4545533983, "perf/iters_per_sec": 0.9642383835570327, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0370879411697387, "data/tokens_consumed": 5958008832, "data/tokens_consumed_B": 5.958008832, "train/loss_slope": -0.0006480879773186593} {"step": 2850, "timestamp": 1778197605.055129, "grad/layer_0/attn": 0.009277239441871643, "grad/layer_0/mlp": 0.01675952970981598, "grad/layer_0/attn_mlp_ratio": 0.5535501023685048, "grad/layer_4/attn": 0.006668792571872473, "grad/layer_4/mlp": 0.014149870723485947, "grad/layer_4/attn_mlp_ratio": 0.47129706377273883, "grad/layer_8/attn": 0.02330598793923855, "grad/layer_8/mlp": 0.012766520492732525, "grad/layer_8/attn_mlp_ratio": 1.8255551910131038, "grad/layer_12/attn": 0.01139588002115488, "grad/layer_12/mlp": 0.00933592114597559, "grad/layer_12/attn_mlp_ratio": 1.2206486881053407, "grad/layer_16/attn": 0.012688132002949715, "grad/layer_16/mlp": 0.009569583460688591, "grad/layer_16/attn_mlp_ratio": 1.3258813116039843, "grad/layer_20/attn": 0.08344775438308716, "grad/layer_20/mlp": 0.04018359258770943, "grad/layer_20/attn_mlp_ratio": 2.0766623589784325, "grad/layer_24/attn": 0.03355889022350311, "grad/layer_24/mlp": 0.030460860580205917, "grad/layer_24/attn_mlp_ratio": 1.1017052530400218, "grad/layer_27/attn": 0.020333578810095787, "grad/layer_27/mlp": 0.029751699417829514, "grad/layer_27/attn_mlp_ratio": 0.6834425978895874} {"step": 2850, "timestamp": 1778197605.6580074, "eos/sharpness": 71.17357254028319, "eos/L0_probe": 2.7164251804351807, "eos/L_plus": 2.9992501735687256, "eos/L_minus": 3.1453359127044678, "eos/grad_norm": 0.4913831949234009, "eos/embed_grad_frac": 0.037352703511714935, "eos/time_s": 0.6001064777374268} {"step": 2850, "timestamp": 1778197605.6777465, "train/loss": 2.944259595870972, "train/z_loss": 0.001632767985574901, "train/perplexity": 18.996592023062302, "train/grad_norm": 0.490234375, "optim/muon_lr": 0.011399999999999999, "optim/adamw_lr": 0.00034199999999999996, "perf/tokens_per_sec": 1908974.8980936995, "perf/iters_per_sec": 0.9102701654880998, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.098574948310852, "data/tokens_consumed": 5978980352, "data/tokens_consumed_B": 5.978980352, "train/loss_slope": -0.000639751437716823} {"step": 2850, "timestamp": 1778197607.039548, "geo/rankme_last": 371.66912841796875, "geo/layer_0/stable_rank_q_proj": 49.540462493896484, "geo/layer_0/stable_rank_k_proj": 46.441612243652344, "geo/layer_0/stable_rank_o_proj": 62.90562438964844, "geo/layer_0/stable_rank_gate_proj": 166.239013671875, "geo/layer_0/stable_rank_down_proj": 50.37028503417969, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.023866737261414528, "geo/layer_0/attn_entropy_mean": 6.812644004821777, "geo/layer_0/attn_entropy_std": 0.04623611643910408, "geo/layer_7/stable_rank_q_proj": 30.288591384887695, "geo/layer_7/stable_rank_k_proj": 31.976821899414062, "geo/layer_7/stable_rank_o_proj": 109.34326934814453, "geo/layer_7/stable_rank_gate_proj": 180.5234375, "geo/layer_7/stable_rank_down_proj": 209.28009033203125, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.6097837090492249, "geo/layer_7/attn_entropy_mean": 5.5641865730285645, "geo/layer_7/attn_entropy_std": 1.456069827079773, "geo/layer_14/stable_rank_q_proj": 38.49005126953125, "geo/layer_14/stable_rank_k_proj": 25.73869514465332, "geo/layer_14/stable_rank_o_proj": 93.27265167236328, "geo/layer_14/stable_rank_gate_proj": 194.76766967773438, "geo/layer_14/stable_rank_down_proj": 159.8778839111328, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.55836021900177, "geo/layer_14/attn_entropy_mean": 6.71846866607666, "geo/layer_14/attn_entropy_std": 0.19306723773479462, "geo/layer_21/stable_rank_q_proj": 48.387699127197266, "geo/layer_21/stable_rank_k_proj": 37.2467041015625, "geo/layer_21/stable_rank_o_proj": 80.09754180908203, "geo/layer_21/stable_rank_gate_proj": 139.91883850097656, "geo/layer_21/stable_rank_down_proj": 163.5765380859375, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.27775558829307556, "geo/layer_21/attn_entropy_mean": 6.014854431152344, "geo/layer_21/attn_entropy_std": 0.5662744045257568, "geo/layer_27/stable_rank_q_proj": 61.534305572509766, "geo/layer_27/stable_rank_k_proj": 27.5447998046875, "geo/layer_27/stable_rank_o_proj": 94.03739929199219, "geo/layer_27/stable_rank_gate_proj": 90.65953826904297, "geo/layer_27/stable_rank_down_proj": 81.0357894897461, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.11568614840507507, "geo/layer_27/attn_entropy_mean": 5.263055801391602, "geo/layer_27/attn_entropy_std": 0.22537679970264435, "attnres/final_alpha/block_0": 0.2239116132259369, "attnres/block_norm/0": 0.7990583777427673, "attnres/final_alpha/block_1": 0.01728629879653454, "attnres/block_norm/1": 3283.534912109375, "attnres/final_alpha/block_2": 0.05006818100810051, "attnres/block_norm/2": 1969.868408203125, "attnres/final_alpha/block_3": 0.03326365351676941, "attnres/block_norm/3": 1825.764404296875, "attnres/final_alpha/block_4": 0.04699081555008888, "attnres/block_norm/4": 1524.7520751953125, "attnres/final_alpha/block_5": 0.35135728120803833, "attnres/block_norm/5": 1649.027587890625, "attnres/final_alpha/block_6": 0.2771221399307251, "attnres/block_norm/6": 1778.751708984375, "geo/tier1_time_s": 1.3577768802642822, "geo/step": 2850.0, "geo/rankme_slope": 0.12277411987263503} {"step": 2860, "timestamp": 1778197617.4177692, "train/loss": 2.9070096969604493, "train/z_loss": 0.0016387230134569108, "train/perplexity": 18.30198816716167, "train/grad_norm": 0.322265625, "optim/muon_lr": 0.011439999999999999, "optim/adamw_lr": 0.00034319999999999994, "perf/tokens_per_sec": 1786908.2850597256, "perf/iters_per_sec": 0.8520642686174992, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1736203908920289, "data/tokens_consumed": 5999951872, "data/tokens_consumed_B": 5.999951872, "train/loss_slope": -0.000635445508075626} {"step": 2870, "timestamp": 1778197627.802514, "train/loss": 2.8955246686935423, "train/z_loss": 0.0016456177341751753, "train/perplexity": 18.092991777757124, "train/grad_norm": 0.46875, "optim/muon_lr": 0.011479999999999999, "optim/adamw_lr": 0.00034439999999999997, "perf/tokens_per_sec": 2020626.7658377467, "perf/iters_per_sec": 0.9635099248112424, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03787202835083, "data/tokens_consumed": 6020923392, "data/tokens_consumed_B": 6.020923392, "train/loss_slope": -0.0006300747616885483} {"step": 2880, "timestamp": 1778197638.1962464, "train/loss": 2.86484215259552, "train/z_loss": 0.0016490226378664374, "train/perplexity": 17.54628335070004, "train/grad_norm": 0.357421875, "optim/muon_lr": 0.011519999999999999, "optim/adamw_lr": 0.00034559999999999994, "perf/tokens_per_sec": 2019087.7575662252, "perf/iters_per_sec": 0.962776068480599, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.038663125038147, "data/tokens_consumed": 6041894912, "data/tokens_consumed_B": 6.041894912, "train/loss_slope": -0.0006245542857870839} {"step": 2890, "timestamp": 1778197648.5884936, "train/loss": 2.8767206192016603, "train/z_loss": 0.0016509848297573625, "train/perplexity": 17.755949079946614, "train/grad_norm": 0.337890625, "optim/muon_lr": 0.011559999999999999, "optim/adamw_lr": 0.0003467999999999999, "perf/tokens_per_sec": 2019053.9249049802, "perf/iters_per_sec": 0.9627599358105565, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0386805295944215, "data/tokens_consumed": 6062866432, "data/tokens_consumed_B": 6.062866432, "train/loss_slope": -0.0006218841713969142} {"step": 2900, "timestamp": 1778197658.9590135, "grad/layer_0/attn": 0.006783934310078621, "grad/layer_0/mlp": 0.014531932771205902, "grad/layer_0/attn_mlp_ratio": 0.46682945553103633, "grad/layer_4/attn": 0.006147841922938824, "grad/layer_4/mlp": 0.01322286855429411, "grad/layer_4/attn_mlp_ratio": 0.46494010366973737, "grad/layer_8/attn": 0.029296936467289925, "grad/layer_8/mlp": 0.012557022273540497, "grad/layer_8/attn_mlp_ratio": 2.333111751797377, "grad/layer_12/attn": 0.010574862360954285, "grad/layer_12/mlp": 0.009650164283812046, "grad/layer_12/attn_mlp_ratio": 1.0958219922857895, "grad/layer_16/attn": 0.012307649478316307, "grad/layer_16/mlp": 0.01001938059926033, "grad/layer_16/attn_mlp_ratio": 1.228384253252789, "grad/layer_20/attn": 0.0769481286406517, "grad/layer_20/mlp": 0.048128388822078705, "grad/layer_20/attn_mlp_ratio": 1.59880956674517, "grad/layer_24/attn": 0.017308536916971207, "grad/layer_24/mlp": 0.03103414736688137, "grad/layer_24/attn_mlp_ratio": 0.5577255484605244, "grad/layer_27/attn": 0.013474147766828537, "grad/layer_27/mlp": 0.022090544924139977, "grad/layer_27/attn_mlp_ratio": 0.6099508976398879} {"step": 2900, "timestamp": 1778197658.9755857, "train/loss": 2.9223305225372314, "train/z_loss": 0.0016431753640063107, "train/perplexity": 18.584548739153878, "train/grad_norm": 0.357421875, "optim/muon_lr": 0.0116, "optim/adamw_lr": 0.00034799999999999995, "perf/tokens_per_sec": 2020029.4555136622, "perf/iters_per_sec": 0.9632251050537406, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0381789207458496, "data/tokens_consumed": 6083837952, "data/tokens_consumed_B": 6.083837952, "train/loss_slope": -0.0006064336908067485} {"step": 2910, "timestamp": 1778197669.3530228, "train/loss": 2.8532065391540526, "train/z_loss": 0.001655678206589073, "train/perplexity": 17.343304760561207, "train/grad_norm": 0.466796875, "optim/muon_lr": 0.01164, "optim/adamw_lr": 0.0003491999999999999, "perf/tokens_per_sec": 2021801.209120875, "perf/iters_per_sec": 0.9640699430088401, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0372691392898559, "data/tokens_consumed": 6104809472, "data/tokens_consumed_B": 6.104809472, "train/loss_slope": -0.0006048332084881233} {"step": 2920, "timestamp": 1778197679.7368798, "train/loss": 2.88999605178833, "train/z_loss": 0.0016503159538842737, "train/perplexity": 17.99323856029561, "train/grad_norm": 0.296875, "optim/muon_lr": 0.01168, "optim/adamw_lr": 0.00035039999999999995, "perf/tokens_per_sec": 2020717.8876342494, "perf/iters_per_sec": 0.9635533750697372, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0378252267837524, "data/tokens_consumed": 6125780992, "data/tokens_consumed_B": 6.125780992, "train/loss_slope": -0.0006001731855533805} {"step": 2925, "timestamp": 1778197685.5406914, "eos/sharpness": 23.666334152221676, "eos/L0_probe": 2.6911137104034424, "eos/L_plus": 2.8419697284698486, "eos/L_minus": 2.776921033859253, "eos/grad_norm": 0.3020392060279846, "eos/embed_grad_frac": 0.13453462719917297, "eos/time_s": 0.6186521053314209} {"step": 2925, "timestamp": 1778197686.9195304, "geo/rankme_last": 372.6623840332031, "geo/layer_0/stable_rank_q_proj": 49.013065338134766, "geo/layer_0/stable_rank_k_proj": 45.669803619384766, "geo/layer_0/stable_rank_o_proj": 62.57184600830078, "geo/layer_0/stable_rank_gate_proj": 166.4995574951172, "geo/layer_0/stable_rank_down_proj": 50.31322479248047, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.029318509623408318, "geo/layer_0/attn_entropy_mean": 6.802864074707031, "geo/layer_0/attn_entropy_std": 0.04748833179473877, "geo/layer_7/stable_rank_q_proj": 30.554765701293945, "geo/layer_7/stable_rank_k_proj": 32.061973571777344, "geo/layer_7/stable_rank_o_proj": 109.1343002319336, "geo/layer_7/stable_rank_gate_proj": 180.7721405029297, "geo/layer_7/stable_rank_down_proj": 209.0514373779297, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.6062303781509399, "geo/layer_7/attn_entropy_mean": 5.540932655334473, "geo/layer_7/attn_entropy_std": 1.4336411952972412, "geo/layer_14/stable_rank_q_proj": 38.91472625732422, "geo/layer_14/stable_rank_k_proj": 26.058731079101562, "geo/layer_14/stable_rank_o_proj": 93.62390899658203, "geo/layer_14/stable_rank_gate_proj": 194.42066955566406, "geo/layer_14/stable_rank_down_proj": 159.828125, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.5461086630821228, "geo/layer_14/attn_entropy_mean": 6.699801445007324, "geo/layer_14/attn_entropy_std": 0.1772267073392868, "geo/layer_21/stable_rank_q_proj": 48.03582000732422, "geo/layer_21/stable_rank_k_proj": 37.032012939453125, "geo/layer_21/stable_rank_o_proj": 81.19930267333984, "geo/layer_21/stable_rank_gate_proj": 140.0259552001953, "geo/layer_21/stable_rank_down_proj": 163.77621459960938, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.2796362638473511, "geo/layer_21/attn_entropy_mean": 6.013848304748535, "geo/layer_21/attn_entropy_std": 0.5509239435195923, "geo/layer_27/stable_rank_q_proj": 61.2794075012207, "geo/layer_27/stable_rank_k_proj": 28.00897979736328, "geo/layer_27/stable_rank_o_proj": 94.83364868164062, "geo/layer_27/stable_rank_gate_proj": 90.51282501220703, "geo/layer_27/stable_rank_down_proj": 82.38957214355469, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.11626137048006058, "geo/layer_27/attn_entropy_mean": 5.212652206420898, "geo/layer_27/attn_entropy_std": 0.22334839403629303, "attnres/final_alpha/block_0": 0.2229290008544922, "attnres/block_norm/0": 0.8118168115615845, "attnres/final_alpha/block_1": 0.017275363206863403, "attnres/block_norm/1": 3344.92138671875, "attnres/final_alpha/block_2": 0.049641385674476624, "attnres/block_norm/2": 2022.25537109375, "attnres/final_alpha/block_3": 0.03266046196222305, "attnres/block_norm/3": 1875.269287109375, "attnres/final_alpha/block_4": 0.046437740325927734, "attnres/block_norm/4": 1553.56103515625, "attnres/final_alpha/block_5": 0.3540455102920532, "attnres/block_norm/5": 1693.7724609375, "attnres/final_alpha/block_6": 0.2770105302333832, "attnres/block_norm/6": 1826.625732421875, "geo/tier1_time_s": 1.3587713241577148, "geo/step": 2925.0, "geo/rankme_slope": 0.11901279152684692} {"step": 2930, "timestamp": 1778197692.1138225, "train/loss": 2.829888701438904, "train/z_loss": 0.0016579098417423665, "train/perplexity": 16.943574924085084, "train/grad_norm": 0.396484375, "optim/muon_lr": 0.01172, "optim/adamw_lr": 0.0003515999999999999, "perf/tokens_per_sec": 1695544.237865455, "perf/iters_per_sec": 0.808498495991447, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2368606805801392, "data/tokens_consumed": 6146752512, "data/tokens_consumed_B": 6.146752512, "train/loss_slope": -0.0005937160580643941} {"step": 2940, "timestamp": 1778197702.4926863, "train/loss": 2.921015214920044, "train/z_loss": 0.0016481437138281762, "train/perplexity": 18.56012040954111, "train/grad_norm": 0.291015625, "optim/muon_lr": 0.01176, "optim/adamw_lr": 0.00035279999999999996, "perf/tokens_per_sec": 2021675.3720157074, "perf/iters_per_sec": 0.9640099392012155, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373337030410767, "data/tokens_consumed": 6167724032, "data/tokens_consumed_B": 6.167724032, "train/loss_slope": -0.0005863951288040714} {"step": 2950, "timestamp": 1778197712.8656917, "grad/layer_0/attn": 0.006264282390475273, "grad/layer_0/mlp": 0.013589802198112011, "grad/layer_0/attn_mlp_ratio": 0.46095463738611936, "grad/layer_4/attn": 0.005737371742725372, "grad/layer_4/mlp": 0.0126601941883564, "grad/layer_4/attn_mlp_ratio": 0.4531819664096342, "grad/layer_8/attn": 0.03653007745742798, "grad/layer_8/mlp": 0.01200926210731268, "grad/layer_8/attn_mlp_ratio": 3.041825286751095, "grad/layer_12/attn": 0.009164256975054741, "grad/layer_12/mlp": 0.008496030233800411, "grad/layer_12/attn_mlp_ratio": 1.0786516308205578, "grad/layer_16/attn": 0.011335236020386219, "grad/layer_16/mlp": 0.009456939063966274, "grad/layer_16/attn_mlp_ratio": 1.1986157279700824, "grad/layer_20/attn": 0.03583502396941185, "grad/layer_20/mlp": 0.03749431297183037, "grad/layer_20/attn_mlp_ratio": 0.9557455793565358, "grad/layer_24/attn": 0.012219075113534927, "grad/layer_24/mlp": 0.029214555397629738, "grad/layer_24/attn_mlp_ratio": 0.41825298743724837, "grad/layer_27/attn": 0.01624392345547676, "grad/layer_27/mlp": 0.025127561762928963, "grad/layer_27/attn_mlp_ratio": 0.6464584007030799} {"step": 2950, "timestamp": 1778197712.882485, "train/loss": 2.8810795307159425, "train/z_loss": 0.001661412010435015, "train/perplexity": 17.833514618685385, "train/grad_norm": 0.2890625, "optim/muon_lr": 0.0118, "optim/adamw_lr": 0.00035399999999999993, "perf/tokens_per_sec": 2019909.12668421, "perf/iters_per_sec": 0.9631677277966547, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0382407665252686, "data/tokens_consumed": 6188695552, "data/tokens_consumed_B": 6.188695552, "train/loss_slope": -0.000579985083512204} {"step": 2960, "timestamp": 1778197723.2645202, "train/loss": 2.852779912948608, "train/z_loss": 0.0016632927465252579, "train/perplexity": 17.335907230363663, "train/grad_norm": 0.427734375, "optim/muon_lr": 0.01184, "optim/adamw_lr": 0.00035519999999999996, "perf/tokens_per_sec": 2021229.817146355, "perf/iters_per_sec": 0.9637974820834899, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037562370300293, "data/tokens_consumed": 6209667072, "data/tokens_consumed_B": 6.209667072, "train/loss_slope": -0.0005746206086568206} {"step": 2970, "timestamp": 1778197733.6515334, "train/loss": 2.8947147130966187, "train/z_loss": 0.0016565272002480925, "train/perplexity": 18.078343190954893, "train/grad_norm": 0.2578125, "optim/muon_lr": 0.01188, "optim/adamw_lr": 0.00035639999999999994, "perf/tokens_per_sec": 2020237.2582813175, "perf/iters_per_sec": 0.9633241931349361, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03807213306427, "data/tokens_consumed": 6230638592, "data/tokens_consumed_B": 6.230638592, "train/loss_slope": -0.0005615335541732883} {"step": 2980, "timestamp": 1778197744.0090017, "train/loss": 2.905295991897583, "train/z_loss": 0.0016572939581237732, "train/perplexity": 18.270650816536552, "train/grad_norm": 0.408203125, "optim/muon_lr": 0.01192, "optim/adamw_lr": 0.00035759999999999996, "perf/tokens_per_sec": 2025923.1636938509, "perf/iters_per_sec": 0.9660354441136603, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351587057113647, "data/tokens_consumed": 6251610112, "data/tokens_consumed_B": 6.251610112, "train/loss_slope": -0.0005491913621217468} {"step": 2990, "timestamp": 1778197754.3542147, "train/loss": 2.813068723678589, "train/z_loss": 0.001671292050741613, "train/perplexity": 16.66096775652534, "train/grad_norm": 0.32421875, "optim/muon_lr": 0.01196, "optim/adamw_lr": 0.00035879999999999994, "perf/tokens_per_sec": 2028300.3995977016, "perf/iters_per_sec": 0.9671689985264309, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0339454650878905, "data/tokens_consumed": 6272581632, "data/tokens_consumed_B": 6.272581632, "train/loss_slope": -0.0005476939007787421} {"step": 3000, "timestamp": 1778197764.6982026, "grad/layer_0/attn": 0.005751334130764008, "grad/layer_0/mlp": 0.01219857856631279, "grad/layer_0/attn_mlp_ratio": 0.4714757586182324, "grad/layer_4/attn": 0.006724733859300613, "grad/layer_4/mlp": 0.012289915233850479, "grad/layer_4/attn_mlp_ratio": 0.5471749541494789, "grad/layer_8/attn": 0.019140655174851418, "grad/layer_8/mlp": 0.011163264513015747, "grad/layer_8/attn_mlp_ratio": 1.7146108990854232, "grad/layer_12/attn": 0.011601520702242851, "grad/layer_12/mlp": 0.007932082749903202, "grad/layer_12/attn_mlp_ratio": 1.4626071010320358, "grad/layer_16/attn": 0.008548328652977943, "grad/layer_16/mlp": 0.008465465158224106, "grad/layer_16/attn_mlp_ratio": 1.0097884040895846, "grad/layer_20/attn": 0.04779598489403725, "grad/layer_20/mlp": 0.03424215689301491, "grad/layer_20/attn_mlp_ratio": 1.3958228421120549, "grad/layer_24/attn": 0.017524098977446556, "grad/layer_24/mlp": 0.02543850801885128, "grad/layer_24/attn_mlp_ratio": 0.6888807667325534, "grad/layer_27/attn": 0.0104503920301795, "grad/layer_27/mlp": 0.021600941196084023, "grad/layer_27/attn_mlp_ratio": 0.48379336284173985} {"step": 3000, "timestamp": 1778197765.3103483, "eos/sharpness": 54.556798934936516, "eos/L0_probe": 2.6799416542053223, "eos/L_plus": 3.099458694458008, "eos/L_minus": 2.805992603302002, "eos/grad_norm": 0.27013975381851196, "eos/embed_grad_frac": 0.108253613114357, "eos/time_s": 0.6093308925628662} {"step": 3000, "timestamp": 1778197765.3383226, "train/loss": 2.864803647994995, "train/z_loss": 0.0016638527507893742, "train/perplexity": 17.545607751075856, "train/grad_norm": 0.26953125, "optim/muon_lr": 0.012, "optim/adamw_lr": 0.00035999999999999997, "perf/tokens_per_sec": 1910485.8286733697, "perf/iters_per_sec": 0.910990633331952, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.097706127166748, "data/tokens_consumed": 6293553152, "data/tokens_consumed_B": 6.293553152, "train/loss_slope": -0.0005404763876265174} {"step": 3000, "timestamp": 1778197766.7080395, "geo/rankme_last": 372.0063781738281, "geo/layer_0/stable_rank_q_proj": 48.6847038269043, "geo/layer_0/stable_rank_k_proj": 44.63742446899414, "geo/layer_0/stable_rank_o_proj": 62.33878707885742, "geo/layer_0/stable_rank_gate_proj": 166.402587890625, "geo/layer_0/stable_rank_down_proj": 50.063934326171875, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.034176528453826904, "geo/layer_0/attn_entropy_mean": 6.792083740234375, "geo/layer_0/attn_entropy_std": 0.05793407931923866, "geo/layer_7/stable_rank_q_proj": 30.900171279907227, "geo/layer_7/stable_rank_k_proj": 32.15375518798828, "geo/layer_7/stable_rank_o_proj": 108.28004455566406, "geo/layer_7/stable_rank_gate_proj": 180.86553955078125, "geo/layer_7/stable_rank_down_proj": 209.30458068847656, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.6145028471946716, "geo/layer_7/attn_entropy_mean": 5.520233154296875, "geo/layer_7/attn_entropy_std": 1.4074375629425049, "geo/layer_14/stable_rank_q_proj": 39.357810974121094, "geo/layer_14/stable_rank_k_proj": 26.448022842407227, "geo/layer_14/stable_rank_o_proj": 94.05769348144531, "geo/layer_14/stable_rank_gate_proj": 195.12196350097656, "geo/layer_14/stable_rank_down_proj": 159.785888671875, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.5382974743843079, "geo/layer_14/attn_entropy_mean": 6.687903881072998, "geo/layer_14/attn_entropy_std": 0.226758673787117, "geo/layer_21/stable_rank_q_proj": 47.706119537353516, "geo/layer_21/stable_rank_k_proj": 36.72116470336914, "geo/layer_21/stable_rank_o_proj": 82.5349349975586, "geo/layer_21/stable_rank_gate_proj": 140.3673858642578, "geo/layer_21/stable_rank_down_proj": 163.66123962402344, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.27240586280822754, "geo/layer_21/attn_entropy_mean": 6.003241539001465, "geo/layer_21/attn_entropy_std": 0.5359789133071899, "geo/layer_27/stable_rank_q_proj": 60.86237335205078, "geo/layer_27/stable_rank_k_proj": 28.3614444732666, "geo/layer_27/stable_rank_o_proj": 95.06014251708984, "geo/layer_27/stable_rank_gate_proj": 90.16815948486328, "geo/layer_27/stable_rank_down_proj": 83.98502349853516, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.1098574548959732, "geo/layer_27/attn_entropy_mean": 5.168832778930664, "geo/layer_27/attn_entropy_std": 0.23362098634243011, "attnres/final_alpha/block_0": 0.22347339987754822, "attnres/block_norm/0": 0.8243753910064697, "attnres/final_alpha/block_1": 0.017420928925275803, "attnres/block_norm/1": 3406.06396484375, "attnres/final_alpha/block_2": 0.04886195808649063, "attnres/block_norm/2": 2090.059814453125, "attnres/final_alpha/block_3": 0.03210582584142685, "attnres/block_norm/3": 1954.498779296875, "attnres/final_alpha/block_4": 0.04617498070001602, "attnres/block_norm/4": 1577.5196533203125, "attnres/final_alpha/block_5": 0.36102285981178284, "attnres/block_norm/5": 1749.8055419921875, "attnres/final_alpha/block_6": 0.27094006538391113, "attnres/block_norm/6": 1888.0745849609375, "geo/tier1_time_s": 1.3657517433166504, "geo/step": 3000.0, "geo/rankme_slope": 0.11533053494939682} {"step": 3000, "timestamp": 1778197773.8094716, "geo/ww_alpha_mean": 8.619996159982934, "geo/ww_alpha_std": 6.253669565772713, "geo/ww_alpha_min": 2.2455210206261254, "geo/ww_alpha_max": 47.52552874069063, "geo/ww_alpha_healthy_frac": 0.16243654822335024, "geo/ww_alpha_by_type/q_proj": 4.84123683837227, "geo/ww_alpha_by_type/k_proj": 4.7043863786699225, "geo/ww_alpha_by_type/v_proj": 7.302661618940715, "geo/ww_alpha_by_type/o_proj": 5.657859441168172, "geo/ww_alpha_by_type/gate_proj": 14.965096606346489, "geo/ww_alpha_by_type/up_proj": 12.498940597693053, "geo/ww_alpha_by_type/down_proj": 10.583070107060578, "geo/twonn_id/layer_0": 0.7027014493942261, "geo/twonn_id/layer_7": 2.3058536052703857, "geo/twonn_id/layer_14": 2.5391008853912354, "geo/twonn_id/layer_21": 6.070949554443359, "geo/twonn_id/layer_27": 5.87410831451416, "geo/tier2_time_s": 7.095422983169556} {"step": 3000, "timestamp": 1778197774.957817, "eoc/jacobian_sigma/layer_0/attn": 807.36181640625, "eoc/jacobian_sigma/layer_0/mlp": 783.6634521484375, "eoc/jacobian_sigma/layer_0": 807.36181640625, "eoc/jacobian_sigma/layer_7/attn": 1.209536075592041, "eoc/jacobian_sigma/layer_7/mlp": 1.4499400854110718, "eoc/jacobian_sigma/layer_7": 1.4499400854110718, "eoc/jacobian_sigma/layer_14/attn": 1.276224970817566, "eoc/jacobian_sigma/layer_14/mlp": 3.7624459266662598, "eoc/jacobian_sigma/layer_14": 3.7624459266662598, "eoc/jacobian_sigma/layer_21/attn": 1.1746293306350708, "eoc/jacobian_sigma/layer_21/mlp": 3.0637476444244385, "eoc/jacobian_sigma/layer_21": 3.0637476444244385, "eoc/jacobian_sigma/layer_27/attn": 1.5158003568649292, "eoc/jacobian_sigma/layer_27/mlp": 4.01469612121582, "eoc/jacobian_sigma/layer_27": 4.01469612121582, "eoc/layer0_sigma": 807.36181640625, "eoc/sigma_max": 4.01469612121582, "eoc/sigma_min": 1.4499400854110718, "eoc/sigma_mean": 3.0727074444293976, "eoc/time_s": 1.1412923336029053} {"step": 3010, "timestamp": 1778197785.3362703, "train/loss": 2.8405357122421266, "train/z_loss": 0.001666569255758077, "train/perplexity": 17.124937118722418, "train/grad_norm": 0.2470703125, "optim/muon_lr": 0.01204, "optim/adamw_lr": 0.00036119999999999994, "perf/tokens_per_sec": 1048889.3060853789, "perf/iters_per_sec": 0.5001493959833998, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.9994025945663452, "data/tokens_consumed": 6314524672, "data/tokens_consumed_B": 6.314524672, "train/loss_slope": -0.0005382142864831602} {"step": 3020, "timestamp": 1778197795.705379, "train/loss": 2.853472280502319, "train/z_loss": 0.0016672872705385088, "train/perplexity": 17.347914206184697, "train/grad_norm": 0.296875, "optim/muon_lr": 0.01208, "optim/adamw_lr": 0.00036239999999999997, "perf/tokens_per_sec": 2024393.9240672968, "perf/iters_per_sec": 0.9653062458359226, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035940670967102, "data/tokens_consumed": 6335496192, "data/tokens_consumed_B": 6.335496192, "train/loss_slope": -0.0005309210695592341} {"step": 3030, "timestamp": 1778197806.059143, "train/loss": 2.9120415925979612, "train/z_loss": 0.0016599122667685152, "train/perplexity": 18.394313953631766, "train/grad_norm": 0.404296875, "optim/muon_lr": 0.01212, "optim/adamw_lr": 0.00036359999999999995, "perf/tokens_per_sec": 2026685.1480014033, "perf/iters_per_sec": 0.966398786545469, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347695112228394, "data/tokens_consumed": 6356467712, "data/tokens_consumed_B": 6.356467712, "train/loss_slope": -0.000524331199761593} {"step": 3040, "timestamp": 1778197816.4161801, "train/loss": 2.858631467819214, "train/z_loss": 0.0016735140117816627, "train/perplexity": 17.437646619260068, "train/grad_norm": 0.54296875, "optim/muon_lr": 0.01216, "optim/adamw_lr": 0.0003648, "perf/tokens_per_sec": 2025925.8233871085, "perf/iters_per_sec": 0.9660367123542349, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351573467254638, "data/tokens_consumed": 6377439232, "data/tokens_consumed_B": 6.377439232, "train/loss_slope": -0.000518366693284395} {"step": 3050, "timestamp": 1778197826.7721827, "grad/layer_0/attn": 0.006149766966700554, "grad/layer_0/mlp": 0.012449176982045174, "grad/layer_0/attn_mlp_ratio": 0.49398983773554445, "grad/layer_4/attn": 0.005499872844666243, "grad/layer_4/mlp": 0.011982576921582222, "grad/layer_4/attn_mlp_ratio": 0.4589891502270536, "grad/layer_8/attn": 0.0212149266153574, "grad/layer_8/mlp": 0.011587603017687798, "grad/layer_8/attn_mlp_ratio": 1.8308295857125152, "grad/layer_12/attn": 0.010559448972344398, "grad/layer_12/mlp": 0.00777440657839179, "grad/layer_12/attn_mlp_ratio": 1.3582321338673167, "grad/layer_16/attn": 0.008951141498982906, "grad/layer_16/mlp": 0.008363638073205948, "grad/layer_16/attn_mlp_ratio": 1.0702449476663285, "grad/layer_20/attn": 0.10106134414672852, "grad/layer_20/mlp": 0.03149118646979332, "grad/layer_20/attn_mlp_ratio": 3.209194544725972, "grad/layer_24/attn": 0.03229471296072006, "grad/layer_24/mlp": 0.026708953082561493, "grad/layer_24/attn_mlp_ratio": 1.2091343580550944, "grad/layer_27/attn": 0.01632513850927353, "grad/layer_27/mlp": 0.025440098717808723, "grad/layer_27/attn_mlp_ratio": 0.6417089267690074} {"step": 3050, "timestamp": 1778197826.786568, "train/loss": 2.842809557914734, "train/z_loss": 0.0016778134740889072, "train/perplexity": 17.163920887623885, "train/grad_norm": 0.373046875, "optim/muon_lr": 0.0122, "optim/adamw_lr": 0.00036599999999999995, "perf/tokens_per_sec": 2024322.0835917888, "perf/iters_per_sec": 0.9652719896277374, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0359774351119995, "data/tokens_consumed": 6398410752, "data/tokens_consumed_B": 6.398410752, "train/loss_slope": -0.0005108622934379772} {"step": 3060, "timestamp": 1778197837.1506047, "train/loss": 2.8428086042404175, "train/z_loss": 0.0016777754528447986, "train/perplexity": 17.163904518841168, "train/grad_norm": 0.28125, "optim/muon_lr": 0.012240000000000001, "optim/adamw_lr": 0.0003672, "perf/tokens_per_sec": 2024738.3819164508, "perf/iters_per_sec": 0.965470496137834, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0357644319534303, "data/tokens_consumed": 6419382272, "data/tokens_consumed_B": 6.419382272, "train/loss_slope": -0.0005040275509637145} {"step": 3070, "timestamp": 1778197847.5095985, "train/loss": 2.8052694320678713, "train/z_loss": 0.0016831681481562556, "train/perplexity": 16.531529430234063, "train/grad_norm": 0.322265625, "optim/muon_lr": 0.01228, "optim/adamw_lr": 0.00036839999999999996, "perf/tokens_per_sec": 2025488.5609827375, "perf/iters_per_sec": 0.9658282093919456, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0353808164596559, "data/tokens_consumed": 6440353792, "data/tokens_consumed_B": 6.440353792, "train/loss_slope": -0.0004985184398671724} {"step": 3075, "timestamp": 1778197853.2997181, "eos/sharpness": 30.23948669433593, "eos/L0_probe": 2.6515309810638428, "eos/L_plus": 2.807518720626831, "eos/L_minus": 2.797938108444214, "eos/grad_norm": 0.30752310156822205, "eos/embed_grad_frac": 0.09931707382202148, "eos/time_s": 0.6115903854370117} {"step": 3075, "timestamp": 1778197854.6810029, "geo/rankme_last": 377.22015380859375, "geo/layer_0/stable_rank_q_proj": 47.892059326171875, "geo/layer_0/stable_rank_k_proj": 43.354618072509766, "geo/layer_0/stable_rank_o_proj": 62.177120208740234, "geo/layer_0/stable_rank_gate_proj": 166.6519012451172, "geo/layer_0/stable_rank_down_proj": 49.82499313354492, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.03160430118441582, "geo/layer_0/attn_entropy_mean": 6.782103538513184, "geo/layer_0/attn_entropy_std": 0.06249108165502548, "geo/layer_7/stable_rank_q_proj": 31.152679443359375, "geo/layer_7/stable_rank_k_proj": 32.20343780517578, "geo/layer_7/stable_rank_o_proj": 107.98517608642578, "geo/layer_7/stable_rank_gate_proj": 180.2708282470703, "geo/layer_7/stable_rank_down_proj": 209.5447998046875, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.6014770865440369, "geo/layer_7/attn_entropy_mean": 5.50760555267334, "geo/layer_7/attn_entropy_std": 1.4083523750305176, "geo/layer_14/stable_rank_q_proj": 39.76985549926758, "geo/layer_14/stable_rank_k_proj": 26.90919303894043, "geo/layer_14/stable_rank_o_proj": 94.38935852050781, "geo/layer_14/stable_rank_gate_proj": 195.29478454589844, "geo/layer_14/stable_rank_down_proj": 159.14918518066406, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.5356383323669434, "geo/layer_14/attn_entropy_mean": 6.650361061096191, "geo/layer_14/attn_entropy_std": 0.2057507336139679, "geo/layer_21/stable_rank_q_proj": 47.391456604003906, "geo/layer_21/stable_rank_k_proj": 36.440528869628906, "geo/layer_21/stable_rank_o_proj": 83.97920989990234, "geo/layer_21/stable_rank_gate_proj": 140.10336303710938, "geo/layer_21/stable_rank_down_proj": 163.99058532714844, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.25812652707099915, "geo/layer_21/attn_entropy_mean": 5.988125801086426, "geo/layer_21/attn_entropy_std": 0.5189806818962097, "geo/layer_27/stable_rank_q_proj": 60.073143005371094, "geo/layer_27/stable_rank_k_proj": 28.845006942749023, "geo/layer_27/stable_rank_o_proj": 95.5891342163086, "geo/layer_27/stable_rank_gate_proj": 89.89407348632812, "geo/layer_27/stable_rank_down_proj": 85.5558090209961, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10226240009069443, "geo/layer_27/attn_entropy_mean": 5.149053573608398, "geo/layer_27/attn_entropy_std": 0.2235722541809082, "attnres/final_alpha/block_0": 0.22560912370681763, "attnres/block_norm/0": 0.8367433547973633, "attnres/final_alpha/block_1": 0.01775265485048294, "attnres/block_norm/1": 3469.35595703125, "attnres/final_alpha/block_2": 0.048380762338638306, "attnres/block_norm/2": 2144.725830078125, "attnres/final_alpha/block_3": 0.032254770398139954, "attnres/block_norm/3": 1997.107666015625, "attnres/final_alpha/block_4": 0.047056861221790314, "attnres/block_norm/4": 1596.0091552734375, "attnres/final_alpha/block_5": 0.35497087240219116, "attnres/block_norm/5": 1788.5693359375, "attnres/final_alpha/block_6": 0.2739749550819397, "attnres/block_norm/6": 1922.1673583984375, "geo/tier1_time_s": 1.3604161739349365, "geo/step": 3075.0, "geo/rankme_slope": 0.11199764379782579} {"step": 3080, "timestamp": 1778197859.8616726, "train/loss": 2.768252420425415, "train/z_loss": 0.0016892251092940568, "train/perplexity": 15.93076938187916, "train/grad_norm": 0.3125, "optim/muon_lr": 0.01232, "optim/adamw_lr": 0.0003696, "perf/tokens_per_sec": 1698753.8397703501, "perf/iters_per_sec": 0.8100289534427405, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2345237731933594, "data/tokens_consumed": 6461325312, "data/tokens_consumed_B": 6.461325312, "train/loss_slope": -0.0004994703670253347} {"step": 3090, "timestamp": 1778197870.213734, "train/loss": 2.867677855491638, "train/z_loss": 0.0016710588475689291, "train/perplexity": 17.59611001062677, "train/grad_norm": 0.412109375, "optim/muon_lr": 0.01236, "optim/adamw_lr": 0.00037079999999999996, "perf/tokens_per_sec": 2026754.3543266933, "perf/iters_per_sec": 0.9664317866929499, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347341775894165, "data/tokens_consumed": 6482296832, "data/tokens_consumed_B": 6.482296832, "train/loss_slope": -0.0004912653126446221} {"step": 3100, "timestamp": 1778197880.5576384, "grad/layer_0/attn": 0.006090095732361078, "grad/layer_0/mlp": 0.012026392854750156, "grad/layer_0/attn_mlp_ratio": 0.5063942077458584, "grad/layer_4/attn": 0.005312064662575722, "grad/layer_4/mlp": 0.011388287879526615, "grad/layer_4/attn_mlp_ratio": 0.46644980106979456, "grad/layer_8/attn": 0.01629577949643135, "grad/layer_8/mlp": 0.011385016143321991, "grad/layer_8/attn_mlp_ratio": 1.4313356387163543, "grad/layer_12/attn": 0.009805927984416485, "grad/layer_12/mlp": 0.009681756608188152, "grad/layer_12/attn_mlp_ratio": 1.0128252836722613, "grad/layer_16/attn": 0.010206007398664951, "grad/layer_16/mlp": 0.008806436322629452, "grad/layer_16/attn_mlp_ratio": 1.1589259160991718, "grad/layer_20/attn": 0.02752802148461342, "grad/layer_20/mlp": 0.029942665249109268, "grad/layer_20/attn_mlp_ratio": 0.9193577513443478, "grad/layer_24/attn": 0.008966876193881035, "grad/layer_24/mlp": 0.02169402688741684, "grad/layer_24/attn_mlp_ratio": 0.41333387291728185, "grad/layer_27/attn": 0.009749727323651314, "grad/layer_27/mlp": 0.01886056922376156, "grad/layer_27/attn_mlp_ratio": 0.5169370635788859} {"step": 3100, "timestamp": 1778197880.571839, "train/loss": 2.826680397987366, "train/z_loss": 0.0016815269598737359, "train/perplexity": 16.88930190288899, "train/grad_norm": 0.23828125, "optim/muon_lr": 0.0124, "optim/adamw_lr": 0.000372, "perf/tokens_per_sec": 2025578.0227679838, "perf/iters_per_sec": 0.9658708680953902, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035335087776184, "data/tokens_consumed": 6503268352, "data/tokens_consumed_B": 6.503268352, "train/loss_slope": -0.00048579567395063474} {"step": 3110, "timestamp": 1778197890.9295251, "train/loss": 2.830402398109436, "train/z_loss": 0.001682110014371574, "train/perplexity": 16.95228101806479, "train/grad_norm": 0.3125, "optim/muon_lr": 0.01244, "optim/adamw_lr": 0.00037319999999999996, "perf/tokens_per_sec": 2026130.1273629067, "perf/iters_per_sec": 0.9661341320814641, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350529670715332, "data/tokens_consumed": 6524239872, "data/tokens_consumed_B": 6.524239872, "train/loss_slope": -0.00048037699481370106} {"step": 3120, "timestamp": 1778197901.2835462, "train/loss": 2.746737337112427, "train/z_loss": 0.001702324440702796, "train/perplexity": 15.591678416868902, "train/grad_norm": 0.26953125, "optim/muon_lr": 0.01248, "optim/adamw_lr": 0.0003744, "perf/tokens_per_sec": 2026893.7152760152, "perf/iters_per_sec": 0.9664982391719891, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346630334854126, "data/tokens_consumed": 6545211392, "data/tokens_consumed_B": 6.545211392, "train/loss_slope": -0.00047792061089825565} {"step": 3130, "timestamp": 1778197911.6453984, "train/loss": 2.828831148147583, "train/z_loss": 0.0016866301302798092, "train/perplexity": 16.925665662325912, "train/grad_norm": 0.287109375, "optim/muon_lr": 0.01252, "optim/adamw_lr": 0.00037559999999999997, "perf/tokens_per_sec": 2024937.4119340528, "perf/iters_per_sec": 0.9655654010458244, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035662627220154, "data/tokens_consumed": 6566182912, "data/tokens_consumed_B": 6.566182912, "train/loss_slope": -0.00047208715522392995} {"step": 3140, "timestamp": 1778197922.0057569, "train/loss": 2.80321090221405, "train/z_loss": 0.0016944466275162994, "train/perplexity": 16.497533785901823, "train/grad_norm": 0.341796875, "optim/muon_lr": 0.01256, "optim/adamw_lr": 0.00037679999999999994, "perf/tokens_per_sec": 2025718.9014520054, "perf/iters_per_sec": 0.9659380442867305, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352630853652953, "data/tokens_consumed": 6587154432, "data/tokens_consumed_B": 6.587154432, "train/loss_slope": -0.00046865720266770785} {"step": 3150, "timestamp": 1778197932.3566365, "grad/layer_0/attn": 0.006506788544356823, "grad/layer_0/mlp": 0.01285086665302515, "grad/layer_0/attn_mlp_ratio": 0.5063307144496768, "grad/layer_4/attn": 0.006170566193759441, "grad/layer_4/mlp": 0.012348256073892117, "grad/layer_4/attn_mlp_ratio": 0.49971154686650027, "grad/layer_8/attn": 0.016192389652132988, "grad/layer_8/mlp": 0.012103255838155746, "grad/layer_8/attn_mlp_ratio": 1.3378540233199705, "grad/layer_12/attn": 0.010876395739614964, "grad/layer_12/mlp": 0.007998107932507992, "grad/layer_12/attn_mlp_ratio": 1.35987107143443, "grad/layer_16/attn": 0.009938359260559082, "grad/layer_16/mlp": 0.008959322236478329, "grad/layer_16/attn_mlp_ratio": 1.109275778603763, "grad/layer_20/attn": 0.03881562873721123, "grad/layer_20/mlp": 0.028760390356183052, "grad/layer_20/attn_mlp_ratio": 1.349621062911072, "grad/layer_24/attn": 0.01273665763437748, "grad/layer_24/mlp": 0.02105305902659893, "grad/layer_24/attn_mlp_ratio": 0.6049789514097591, "grad/layer_27/attn": 0.00857630092650652, "grad/layer_27/mlp": 0.017866360023617744, "grad/layer_27/attn_mlp_ratio": 0.48002507881666445} {"step": 3150, "timestamp": 1778197932.97337, "eos/sharpness": 35.58125495910644, "eos/L0_probe": 2.6298365592956543, "eos/L_plus": 2.8592724800109863, "eos/L_minus": 2.7562131881713867, "eos/grad_norm": 0.3270460069179535, "eos/embed_grad_frac": 0.06374736875295639, "eos/time_s": 0.6138670444488525} {"step": 3150, "timestamp": 1778197932.9918895, "train/loss": 2.813050079345703, "train/z_loss": 0.001690105942543596, "train/perplexity": 16.66065712679204, "train/grad_norm": 0.326171875, "optim/muon_lr": 0.0126, "optim/adamw_lr": 0.00037799999999999997, "perf/tokens_per_sec": 1910146.084935324, "perf/iters_per_sec": 0.910828630893385, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0979013681411742, "data/tokens_consumed": 6608125952, "data/tokens_consumed_B": 6.608125952, "train/loss_slope": -0.00046245183494045966} {"step": 3150, "timestamp": 1778197934.3549433, "geo/rankme_last": 374.8536376953125, "geo/layer_0/stable_rank_q_proj": 47.06559753417969, "geo/layer_0/stable_rank_k_proj": 42.134910583496094, "geo/layer_0/stable_rank_o_proj": 61.711181640625, "geo/layer_0/stable_rank_gate_proj": 166.2554931640625, "geo/layer_0/stable_rank_down_proj": 49.69175338745117, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.023033298552036285, "geo/layer_0/attn_entropy_mean": 6.768354415893555, "geo/layer_0/attn_entropy_std": 0.06904388964176178, "geo/layer_7/stable_rank_q_proj": 31.525856018066406, "geo/layer_7/stable_rank_k_proj": 32.38496398925781, "geo/layer_7/stable_rank_o_proj": 108.05189514160156, "geo/layer_7/stable_rank_gate_proj": 179.8494873046875, "geo/layer_7/stable_rank_down_proj": 210.78988647460938, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.609129786491394, "geo/layer_7/attn_entropy_mean": 5.4886322021484375, "geo/layer_7/attn_entropy_std": 1.3940436840057373, "geo/layer_14/stable_rank_q_proj": 40.3449821472168, "geo/layer_14/stable_rank_k_proj": 27.30951690673828, "geo/layer_14/stable_rank_o_proj": 94.6618423461914, "geo/layer_14/stable_rank_gate_proj": 196.0545654296875, "geo/layer_14/stable_rank_down_proj": 159.4952850341797, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.5244033336639404, "geo/layer_14/attn_entropy_mean": 6.605608940124512, "geo/layer_14/attn_entropy_std": 0.264921098947525, "geo/layer_21/stable_rank_q_proj": 47.27402877807617, "geo/layer_21/stable_rank_k_proj": 36.14836120605469, "geo/layer_21/stable_rank_o_proj": 85.30461883544922, "geo/layer_21/stable_rank_gate_proj": 140.03883361816406, "geo/layer_21/stable_rank_down_proj": 164.06336975097656, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.25686728954315186, "geo/layer_21/attn_entropy_mean": 5.978802680969238, "geo/layer_21/attn_entropy_std": 0.5032169818878174, "geo/layer_27/stable_rank_q_proj": 59.69425582885742, "geo/layer_27/stable_rank_k_proj": 29.314777374267578, "geo/layer_27/stable_rank_o_proj": 96.54464721679688, "geo/layer_27/stable_rank_gate_proj": 89.60659790039062, "geo/layer_27/stable_rank_down_proj": 87.30835723876953, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.11104802042245865, "geo/layer_27/attn_entropy_mean": 5.073139190673828, "geo/layer_27/attn_entropy_std": 0.24328596889972687, "attnres/final_alpha/block_0": 0.22599391639232635, "attnres/block_norm/0": 0.8488540649414062, "attnres/final_alpha/block_1": 0.017844321206212044, "attnres/block_norm/1": 3548.66748046875, "attnres/final_alpha/block_2": 0.0479867197573185, "attnres/block_norm/2": 2210.9765625, "attnres/final_alpha/block_3": 0.03120383247733116, "attnres/block_norm/3": 2083.32763671875, "attnres/final_alpha/block_4": 0.04583181440830231, "attnres/block_norm/4": 1624.330322265625, "attnres/final_alpha/block_5": 0.36054548621177673, "attnres/block_norm/5": 1848.885498046875, "attnres/final_alpha/block_6": 0.27059391140937805, "attnres/block_norm/6": 1998.547607421875, "geo/tier1_time_s": 1.3588230609893799, "geo/step": 3150.0, "geo/rankme_slope": 0.10865343204284379} {"step": 3160, "timestamp": 1778197944.7236965, "train/loss": 2.851268196105957, "train/z_loss": 0.0016873094951733946, "train/perplexity": 17.30972004621166, "train/grad_norm": 0.3359375, "optim/muon_lr": 0.01264, "optim/adamw_lr": 0.00037919999999999995, "perf/tokens_per_sec": 1788200.870042819, "perf/iters_per_sec": 0.8526806211675735, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1727720499038696, "data/tokens_consumed": 6629097472, "data/tokens_consumed_B": 6.629097472, "train/loss_slope": -0.0004557923685778307} {"step": 3170, "timestamp": 1778197955.0782137, "train/loss": 2.802725648880005, "train/z_loss": 0.0016920373658649624, "train/perplexity": 16.489530244658276, "train/grad_norm": 0.271484375, "optim/muon_lr": 0.01268, "optim/adamw_lr": 0.0003804, "perf/tokens_per_sec": 2026631.4019500706, "perf/iters_per_sec": 0.9663731584310868, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034796953201294, "data/tokens_consumed": 6650068992, "data/tokens_consumed_B": 6.650068992, "train/loss_slope": -0.0004494238873340688} {"step": 3180, "timestamp": 1778197965.4382834, "train/loss": 2.8509397745132445, "train/z_loss": 0.001685456291306764, "train/perplexity": 17.304036093802118, "train/grad_norm": 0.2392578125, "optim/muon_lr": 0.01272, "optim/adamw_lr": 0.00038159999999999995, "perf/tokens_per_sec": 2025280.3294052766, "perf/iters_per_sec": 0.965728916838301, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354872703552247, "data/tokens_consumed": 6671040512, "data/tokens_consumed_B": 6.671040512, "train/loss_slope": -0.00044223893749581564} {"step": 3190, "timestamp": 1778197975.8038888, "train/loss": 2.795916128158569, "train/z_loss": 0.0016991225886158646, "train/perplexity": 16.377625886705452, "train/grad_norm": 0.3125, "optim/muon_lr": 0.01276, "optim/adamw_lr": 0.0003828, "perf/tokens_per_sec": 2024229.285544599, "perf/iters_per_sec": 0.9652277400706286, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0360249280929565, "data/tokens_consumed": 6692012032, "data/tokens_consumed_B": 6.692012032, "train/loss_slope": -0.000439086713026924} {"step": 3200, "timestamp": 1778197986.1490948, "grad/layer_0/attn": 0.005606567487120628, "grad/layer_0/mlp": 0.011299649253487587, "grad/layer_0/attn_mlp_ratio": 0.49617181133060434, "grad/layer_4/attn": 0.005193747114390135, "grad/layer_4/mlp": 0.011342966929078102, "grad/layer_4/attn_mlp_ratio": 0.4578825893680003, "grad/layer_8/attn": 0.018412644043564796, "grad/layer_8/mlp": 0.011487765237689018, "grad/layer_8/attn_mlp_ratio": 1.6028046797889108, "grad/layer_12/attn": 0.009673700667917728, "grad/layer_12/mlp": 0.008443908765912056, "grad/layer_12/attn_mlp_ratio": 1.145642476906676, "grad/layer_16/attn": 0.010160746052861214, "grad/layer_16/mlp": 0.00900710467249155, "grad/layer_16/attn_mlp_ratio": 1.1280812546882968, "grad/layer_20/attn": 0.04108983650803566, "grad/layer_20/mlp": 0.03180772811174393, "grad/layer_20/attn_mlp_ratio": 1.2918192784627929, "grad/layer_24/attn": 0.028123214840888977, "grad/layer_24/mlp": 0.027316778898239136, "grad/layer_24/attn_mlp_ratio": 1.0295216300099592, "grad/layer_27/attn": 0.018595319241285324, "grad/layer_27/mlp": 0.024843823164701462, "grad/layer_27/attn_mlp_ratio": 0.7484886300775564} {"step": 3200, "timestamp": 1778197986.1633563, "train/loss": 2.7905940294265745, "train/z_loss": 0.0017027585650794207, "train/perplexity": 16.29069407976375, "train/grad_norm": 0.330078125, "optim/muon_lr": 0.0128, "optim/adamw_lr": 0.00038399999999999996, "perf/tokens_per_sec": 2025867.9183112178, "perf/iters_per_sec": 0.9660091010624017, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351869344711304, "data/tokens_consumed": 6712983552, "data/tokens_consumed_B": 6.712983552, "train/loss_slope": -0.0004341077205121177} {"step": 3210, "timestamp": 1778197996.5256436, "train/loss": 2.874648380279541, "train/z_loss": 0.001687250949908048, "train/perplexity": 17.719192608412676, "train/grad_norm": 0.3515625, "optim/muon_lr": 0.01284, "optim/adamw_lr": 0.0003852, "perf/tokens_per_sec": 2024795.1038154599, "perf/iters_per_sec": 0.9654975432469654, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0357354164123536, "data/tokens_consumed": 6733955072, "data/tokens_consumed_B": 6.733955072, "train/loss_slope": -0.0004283761110600501} {"step": 3220, "timestamp": 1778198006.882328, "train/loss": 2.843874764442444, "train/z_loss": 0.0016995231271721423, "train/perplexity": 17.182213749302626, "train/grad_norm": 0.26953125, "optim/muon_lr": 0.01288, "optim/adamw_lr": 0.00038639999999999996, "perf/tokens_per_sec": 2025899.553389685, "perf/iters_per_sec": 0.9660241858433175, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351707696914674, "data/tokens_consumed": 6754926592, "data/tokens_consumed_B": 6.754926592, "train/loss_slope": -0.0004190349610474887} {"step": 3225, "timestamp": 1778198012.6495967, "eos/sharpness": 26.934766769409176, "eos/L0_probe": 2.6168365478515625, "eos/L_plus": 2.8096048831939697, "eos/L_minus": 2.693415880203247, "eos/grad_norm": 0.22131116688251495, "eos/embed_grad_frac": 0.1333981305360794, "eos/time_s": 0.5999917984008789} {"step": 3225, "timestamp": 1778198014.0272276, "geo/rankme_last": 378.97760009765625, "geo/layer_0/stable_rank_q_proj": 46.23395538330078, "geo/layer_0/stable_rank_k_proj": 40.72126007080078, "geo/layer_0/stable_rank_o_proj": 61.820594787597656, "geo/layer_0/stable_rank_gate_proj": 168.3717041015625, "geo/layer_0/stable_rank_down_proj": 49.33892822265625, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.02296753041446209, "geo/layer_0/attn_entropy_mean": 6.757667541503906, "geo/layer_0/attn_entropy_std": 0.07905037701129913, "geo/layer_7/stable_rank_q_proj": 31.88426971435547, "geo/layer_7/stable_rank_k_proj": 32.52254867553711, "geo/layer_7/stable_rank_o_proj": 107.50337982177734, "geo/layer_7/stable_rank_gate_proj": 179.7283172607422, "geo/layer_7/stable_rank_down_proj": 210.5260772705078, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.6026721000671387, "geo/layer_7/attn_entropy_mean": 5.4854936599731445, "geo/layer_7/attn_entropy_std": 1.3911908864974976, "geo/layer_14/stable_rank_q_proj": 41.00923538208008, "geo/layer_14/stable_rank_k_proj": 27.718509674072266, "geo/layer_14/stable_rank_o_proj": 94.45878601074219, "geo/layer_14/stable_rank_gate_proj": 196.0343475341797, "geo/layer_14/stable_rank_down_proj": 159.65151977539062, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.5164852142333984, "geo/layer_14/attn_entropy_mean": 6.5791521072387695, "geo/layer_14/attn_entropy_std": 0.30145055055618286, "geo/layer_21/stable_rank_q_proj": 47.063331604003906, "geo/layer_21/stable_rank_k_proj": 35.72793197631836, "geo/layer_21/stable_rank_o_proj": 86.4238510131836, "geo/layer_21/stable_rank_gate_proj": 140.0380859375, "geo/layer_21/stable_rank_down_proj": 163.31509399414062, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.2556367814540863, "geo/layer_21/attn_entropy_mean": 5.974379062652588, "geo/layer_21/attn_entropy_std": 0.4900170564651489, "geo/layer_27/stable_rank_q_proj": 58.65890121459961, "geo/layer_27/stable_rank_k_proj": 29.865564346313477, "geo/layer_27/stable_rank_o_proj": 96.9476318359375, "geo/layer_27/stable_rank_gate_proj": 89.6202621459961, "geo/layer_27/stable_rank_down_proj": 89.37748718261719, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10370096564292908, "geo/layer_27/attn_entropy_mean": 5.104666709899902, "geo/layer_27/attn_entropy_std": 0.24531406164169312, "attnres/final_alpha/block_0": 0.22583764791488647, "attnres/block_norm/0": 0.8608064651489258, "attnres/final_alpha/block_1": 0.017803173512220383, "attnres/block_norm/1": 3619.456787109375, "attnres/final_alpha/block_2": 0.047894664108753204, "attnres/block_norm/2": 2275.87548828125, "attnres/final_alpha/block_3": 0.031814686954021454, "attnres/block_norm/3": 2135.8251953125, "attnres/final_alpha/block_4": 0.04693431407213211, "attnres/block_norm/4": 1653.63623046875, "attnres/final_alpha/block_5": 0.3568355441093445, "attnres/block_norm/5": 1904.90673828125, "attnres/final_alpha/block_6": 0.2728799879550934, "attnres/block_norm/6": 2041.5712890625, "geo/tier1_time_s": 1.3599207401275635, "geo/step": 3225.0, "geo/rankme_slope": 0.10558235313266774} {"step": 3230, "timestamp": 1778198019.6984282, "train/loss": 2.8682963371276857, "train/z_loss": 0.0016961937071755528, "train/perplexity": 17.606996247656046, "train/grad_norm": 0.2080078125, "optim/muon_lr": 0.012920000000000001, "optim/adamw_lr": 0.0003876, "perf/tokens_per_sec": 1637259.2298969456, "perf/iters_per_sec": 0.7807060384258965, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2808918476104736, "data/tokens_consumed": 6775898112, "data/tokens_consumed_B": 6.775898112, "train/loss_slope": -0.00041154363988721255} {"step": 3240, "timestamp": 1778198030.0544806, "train/loss": 2.8240501165390013, "train/z_loss": 0.0016980996006168425, "train/perplexity": 16.844936657536866, "train/grad_norm": 0.3203125, "optim/muon_lr": 0.012960000000000001, "optim/adamw_lr": 0.00038879999999999996, "perf/tokens_per_sec": 2026023.070300921, "perf/iters_per_sec": 0.9660830832962614, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351076602935791, "data/tokens_consumed": 6796869632, "data/tokens_consumed_B": 6.796869632, "train/loss_slope": -0.00040281420709705547} {"step": 3250, "timestamp": 1778198040.401823, "grad/layer_0/attn": 0.004979455843567848, "grad/layer_0/mlp": 0.011004132218658924, "grad/layer_0/attn_mlp_ratio": 0.45250781246282723, "grad/layer_4/attn": 0.00484026363119483, "grad/layer_4/mlp": 0.010443062521517277, "grad/layer_4/attn_mlp_ratio": 0.4634908174563437, "grad/layer_8/attn": 0.01489404495805502, "grad/layer_8/mlp": 0.010888254269957542, "grad/layer_8/attn_mlp_ratio": 1.3679001658107937, "grad/layer_12/attn": 0.012348033487796783, "grad/layer_12/mlp": 0.007674199994653463, "grad/layer_12/attn_mlp_ratio": 1.60903199493059, "grad/layer_16/attn": 0.00787443108856678, "grad/layer_16/mlp": 0.0076641542837023735, "grad/layer_16/attn_mlp_ratio": 1.0274363868911038, "grad/layer_20/attn": 0.055101264268159866, "grad/layer_20/mlp": 0.029386309906840324, "grad/layer_20/attn_mlp_ratio": 1.8750657791105383, "grad/layer_24/attn": 0.015902498736977577, "grad/layer_24/mlp": 0.029948821291327477, "grad/layer_24/attn_mlp_ratio": 0.5309891340693157, "grad/layer_27/attn": 0.0177689790725708, "grad/layer_27/mlp": 0.029164200648665428, "grad/layer_27/attn_mlp_ratio": 0.609273651134908} {"step": 3250, "timestamp": 1778198040.4163573, "train/loss": 2.8236740589141847, "train/z_loss": 0.0017004420165903865, "train/perplexity": 16.83860318161784, "train/grad_norm": 0.291015625, "optim/muon_lr": 0.013000000000000001, "optim/adamw_lr": 0.00039, "perf/tokens_per_sec": 2025004.4010937854, "perf/iters_per_sec": 0.9655973439663817, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035628366470337, "data/tokens_consumed": 6817841152, "data/tokens_consumed_B": 6.817841152, "train/loss_slope": -0.0003974256313494508} {"step": 3260, "timestamp": 1778198050.7704659, "train/loss": 2.8321470737457277, "train/z_loss": 0.0017009251634590328, "train/perplexity": 16.981883065213015, "train/grad_norm": 0.5703125, "optim/muon_lr": 0.013040000000000001, "optim/adamw_lr": 0.00039119999999999997, "perf/tokens_per_sec": 2026646.4841771757, "perf/iters_per_sec": 0.966380350197399, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034789252281189, "data/tokens_consumed": 6838812672, "data/tokens_consumed_B": 6.838812672, "train/loss_slope": -0.00039114976236850977} {"step": 3270, "timestamp": 1778198061.1259081, "train/loss": 2.814953112602234, "train/z_loss": 0.0016997254570014775, "train/perplexity": 16.69239309910761, "train/grad_norm": 0.24609375, "optim/muon_lr": 0.013080000000000001, "optim/adamw_lr": 0.0003924, "perf/tokens_per_sec": 2026453.513981598, "perf/iters_per_sec": 0.9662883348377218, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348877906799316, "data/tokens_consumed": 6859784192, "data/tokens_consumed_B": 6.859784192, "train/loss_slope": -0.0003848949178717997} {"step": 3280, "timestamp": 1778198071.489273, "train/loss": 2.7813159465789794, "train/z_loss": 0.0017112801200710238, "train/perplexity": 16.140246681403134, "train/grad_norm": 0.439453125, "optim/muon_lr": 0.013120000000000001, "optim/adamw_lr": 0.00039359999999999997, "perf/tokens_per_sec": 2024622.8971904558, "perf/iters_per_sec": 0.9654154287292747, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0358235120773316, "data/tokens_consumed": 6880755712, "data/tokens_consumed_B": 6.880755712, "train/loss_slope": -0.0003813787678501966} {"step": 3290, "timestamp": 1778198081.8496456, "train/loss": 2.820165920257568, "train/z_loss": 0.001709692517761141, "train/perplexity": 16.779634522466772, "train/grad_norm": 0.326171875, "optim/muon_lr": 0.013160000000000002, "optim/adamw_lr": 0.0003948, "perf/tokens_per_sec": 2025314.6508570379, "perf/iters_per_sec": 0.9657452825818242, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354697227478027, "data/tokens_consumed": 6901727232, "data/tokens_consumed_B": 6.901727232, "train/loss_slope": -0.00037695169268590065} {"step": 3300, "timestamp": 1778198092.1914752, "grad/layer_0/attn": 0.007044460624456406, "grad/layer_0/mlp": 0.012905359268188477, "grad/layer_0/attn_mlp_ratio": 0.5458554406335169, "grad/layer_4/attn": 0.005459459964185953, "grad/layer_4/mlp": 0.010575628839433193, "grad/layer_4/attn_mlp_ratio": 0.5162302871491022, "grad/layer_8/attn": 0.011869038455188274, "grad/layer_8/mlp": 0.01037763524800539, "grad/layer_8/attn_mlp_ratio": 1.1437131925693975, "grad/layer_12/attn": 0.008688580244779587, "grad/layer_12/mlp": 0.008005281910300255, "grad/layer_12/attn_mlp_ratio": 1.0853559229518892, "grad/layer_16/attn": 0.012635646387934685, "grad/layer_16/mlp": 0.008401918224990368, "grad/layer_16/attn_mlp_ratio": 1.5039001688878202, "grad/layer_20/attn": 0.04702219367027283, "grad/layer_20/mlp": 0.028788885101675987, "grad/layer_20/attn_mlp_ratio": 1.6333454158042688, "grad/layer_24/attn": 0.012941060587763786, "grad/layer_24/mlp": 0.02690291777253151, "grad/layer_24/attn_mlp_ratio": 0.4810281415971203, "grad/layer_27/attn": 0.018282625824213028, "grad/layer_27/mlp": 0.027335606515407562, "grad/layer_27/attn_mlp_ratio": 0.6688209294725557} {"step": 3300, "timestamp": 1778198092.8011727, "eos/sharpness": 49.74899291992187, "eos/L0_probe": 2.5941367149353027, "eos/L_plus": 2.76438570022583, "eos/L_minus": 2.921377658843994, "eos/grad_norm": 0.3448084890842438, "eos/embed_grad_frac": 0.06242929399013519, "eos/time_s": 0.6067938804626465} {"step": 3300, "timestamp": 1778198092.8223124, "train/loss": 2.8182267427444456, "train/z_loss": 0.0017068967688828706, "train/perplexity": 16.747127361287063, "train/grad_norm": 0.34375, "optim/muon_lr": 0.013200000000000002, "optim/adamw_lr": 0.000396, "perf/tokens_per_sec": 1911927.9616394273, "perf/iters_per_sec": 0.9116782959172379, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0968781471252442, "data/tokens_consumed": 6922698752, "data/tokens_consumed_B": 6.922698752, "train/loss_slope": -0.0003711456145461958} {"step": 3300, "timestamp": 1778198094.1994202, "geo/rankme_last": 383.34814453125, "geo/layer_0/stable_rank_q_proj": 45.33855438232422, "geo/layer_0/stable_rank_k_proj": 39.18600082397461, "geo/layer_0/stable_rank_o_proj": 61.51233673095703, "geo/layer_0/stable_rank_gate_proj": 168.4901580810547, "geo/layer_0/stable_rank_down_proj": 49.13676834106445, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.03216860070824623, "geo/layer_0/attn_entropy_mean": 6.753813743591309, "geo/layer_0/attn_entropy_std": 0.08341943472623825, "geo/layer_7/stable_rank_q_proj": 32.11933898925781, "geo/layer_7/stable_rank_k_proj": 32.699581146240234, "geo/layer_7/stable_rank_o_proj": 106.38956451416016, "geo/layer_7/stable_rank_gate_proj": 179.03851318359375, "geo/layer_7/stable_rank_down_proj": 210.45375061035156, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.594662070274353, "geo/layer_7/attn_entropy_mean": 5.462726593017578, "geo/layer_7/attn_entropy_std": 1.379822850227356, "geo/layer_14/stable_rank_q_proj": 41.578189849853516, "geo/layer_14/stable_rank_k_proj": 28.160438537597656, "geo/layer_14/stable_rank_o_proj": 94.79246520996094, "geo/layer_14/stable_rank_gate_proj": 196.11253356933594, "geo/layer_14/stable_rank_down_proj": 158.94622802734375, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.5090885162353516, "geo/layer_14/attn_entropy_mean": 6.590467929840088, "geo/layer_14/attn_entropy_std": 0.28856292366981506, "geo/layer_21/stable_rank_q_proj": 46.900169372558594, "geo/layer_21/stable_rank_k_proj": 35.50132369995117, "geo/layer_21/stable_rank_o_proj": 87.96307373046875, "geo/layer_21/stable_rank_gate_proj": 140.12026977539062, "geo/layer_21/stable_rank_down_proj": 162.58538818359375, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.24939286708831787, "geo/layer_21/attn_entropy_mean": 5.984894752502441, "geo/layer_21/attn_entropy_std": 0.4676046669483185, "geo/layer_27/stable_rank_q_proj": 57.98106384277344, "geo/layer_27/stable_rank_k_proj": 30.283964157104492, "geo/layer_27/stable_rank_o_proj": 97.82408142089844, "geo/layer_27/stable_rank_gate_proj": 89.28007507324219, "geo/layer_27/stable_rank_down_proj": 91.1788101196289, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.1015203520655632, "geo/layer_27/attn_entropy_mean": 5.037035942077637, "geo/layer_27/attn_entropy_std": 0.2709381878376007, "attnres/final_alpha/block_0": 0.22586211562156677, "attnres/block_norm/0": 0.8727498054504395, "attnres/final_alpha/block_1": 0.018099501729011536, "attnres/block_norm/1": 3685.88623046875, "attnres/final_alpha/block_2": 0.047039687633514404, "attnres/block_norm/2": 2324.931884765625, "attnres/final_alpha/block_3": 0.031821396201848984, "attnres/block_norm/3": 2175.899169921875, "attnres/final_alpha/block_4": 0.047425854951143265, "attnres/block_norm/4": 1690.7691650390625, "attnres/final_alpha/block_5": 0.354159951210022, "attnres/block_norm/5": 1936.55078125, "attnres/final_alpha/block_6": 0.2755914330482483, "attnres/block_norm/6": 2066.74658203125, "geo/tier1_time_s": 1.3736577033996582, "geo/step": 3300.0, "geo/rankme_slope": 0.10276582217813324} {"step": 3310, "timestamp": 1778198104.5602152, "train/loss": 2.7176003217697144, "train/z_loss": 0.0017313659889623522, "train/perplexity": 15.143938028799417, "train/grad_norm": 0.3125, "optim/muon_lr": 0.013240000000000002, "optim/adamw_lr": 0.0003972, "perf/tokens_per_sec": 1787153.3482262557, "perf/iters_per_sec": 0.8521811238414077, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1734594583511353, "data/tokens_consumed": 6943670272, "data/tokens_consumed_B": 6.943670272, "train/loss_slope": -0.0003704433168193223} {"step": 3320, "timestamp": 1778198114.9134572, "train/loss": 2.8145488500595093, "train/z_loss": 0.0017095149960368872, "train/perplexity": 16.685646353649325, "train/grad_norm": 0.25, "optim/muon_lr": 0.01328, "optim/adamw_lr": 0.0003984, "perf/tokens_per_sec": 2026616.7868791453, "perf/iters_per_sec": 0.9663661894221999, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348044157028198, "data/tokens_consumed": 6964641792, "data/tokens_consumed_B": 6.964641792, "train/loss_slope": -0.0003640314533801327} {"step": 3330, "timestamp": 1778198125.270639, "train/loss": 2.79472758769989, "train/z_loss": 0.0017127380007877946, "train/perplexity": 16.358171978890347, "train/grad_norm": 0.2138671875, "optim/muon_lr": 0.01332, "optim/adamw_lr": 0.0003996, "perf/tokens_per_sec": 2025821.6339589523, "perf/iters_per_sec": 0.9659870309633981, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352105855941773, "data/tokens_consumed": 6985613312, "data/tokens_consumed_B": 6.985613312, "train/loss_slope": -0.00035777900395649466} {"step": 3340, "timestamp": 1778198135.6341648, "train/loss": 2.7825839281082154, "train/z_loss": 0.0017185279983095824, "train/perplexity": 16.160725196518182, "train/grad_norm": 0.23046875, "optim/muon_lr": 0.01336, "optim/adamw_lr": 0.0004008, "perf/tokens_per_sec": 2025427.1830149603, "perf/iters_per_sec": 0.9657989420962144, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354121923446655, "data/tokens_consumed": 7006584832, "data/tokens_consumed_B": 7.006584832, "train/loss_slope": -0.0003529768863956098} {"step": 3350, "timestamp": 1778198145.9876134, "grad/layer_0/attn": 0.005499693099409342, "grad/layer_0/mlp": 0.009803901426494122, "grad/layer_0/attn_mlp_ratio": 0.5609698429290563, "grad/layer_4/attn": 0.004638070240616798, "grad/layer_4/mlp": 0.009295012801885605, "grad/layer_4/attn_mlp_ratio": 0.4989848093353276, "grad/layer_8/attn": 0.015018945559859276, "grad/layer_8/mlp": 0.010177971795201302, "grad/layer_8/attn_mlp_ratio": 1.4756324456879657, "grad/layer_12/attn": 0.006429244764149189, "grad/layer_12/mlp": 0.007546457927674055, "grad/layer_12/attn_mlp_ratio": 0.8519552802880679, "grad/layer_16/attn": 0.009325443767011166, "grad/layer_16/mlp": 0.007592049892991781, "grad/layer_16/attn_mlp_ratio": 1.2283169599277508, "grad/layer_20/attn": 0.03251989185810089, "grad/layer_20/mlp": 0.02447698824107647, "grad/layer_20/attn_mlp_ratio": 1.328590405198138, "grad/layer_24/attn": 0.01491471379995346, "grad/layer_24/mlp": 0.02278943359851837, "grad/layer_24/attn_mlp_ratio": 0.6544574120296426, "grad/layer_27/attn": 0.01242951862514019, "grad/layer_27/mlp": 0.019911538809537888, "grad/layer_27/attn_mlp_ratio": 0.6242369653902686} {"step": 3350, "timestamp": 1778198146.0019119, "train/loss": 2.8240318775177, "train/z_loss": 0.0017117966781370342, "train/perplexity": 16.844629425180166, "train/grad_norm": 0.240234375, "optim/muon_lr": 0.0134, "optim/adamw_lr": 0.000402, "perf/tokens_per_sec": 2023812.4049295646, "perf/iters_per_sec": 0.9650289559028457, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0362383365631103, "data/tokens_consumed": 7027556352, "data/tokens_consumed_B": 7.027556352, "train/loss_slope": -0.00034621461246809806} {"step": 3360, "timestamp": 1778198156.3589435, "train/loss": 2.7403388261795043, "train/z_loss": 0.0017258613370358944, "train/perplexity": 15.492233381410147, "train/grad_norm": 0.265625, "optim/muon_lr": 0.01344, "optim/adamw_lr": 0.0004032, "perf/tokens_per_sec": 2026093.0714587362, "perf/iters_per_sec": 0.9661164624494248, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035071897506714, "data/tokens_consumed": 7048527872, "data/tokens_consumed_B": 7.048527872, "train/loss_slope": -0.0003456362747337738} {"step": 3370, "timestamp": 1778198166.7131917, "train/loss": 2.7637290239334105, "train/z_loss": 0.001725614967290312, "train/perplexity": 15.858870930736579, "train/grad_norm": 0.255859375, "optim/muon_lr": 0.01348, "optim/adamw_lr": 0.0004044, "perf/tokens_per_sec": 2026834.3537618993, "perf/iters_per_sec": 0.9664699333962914, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346933364868165, "data/tokens_consumed": 7069499392, "data/tokens_consumed_B": 7.069499392, "train/loss_slope": -0.00034112593297160076} {"step": 3375, "timestamp": 1778198172.480154, "eos/sharpness": 7.962393760681151, "eos/L0_probe": 2.5799219608306885, "eos/L_plus": 2.6307966709136963, "eos/L_minus": 2.608671188354492, "eos/grad_norm": 0.1892421990633011, "eos/embed_grad_frac": 0.1373588740825653, "eos/time_s": 0.5982613563537598} {"step": 3375, "timestamp": 1778198173.8535635, "geo/rankme_last": 384.1016540527344, "geo/layer_0/stable_rank_q_proj": 44.22699737548828, "geo/layer_0/stable_rank_k_proj": 37.863189697265625, "geo/layer_0/stable_rank_o_proj": 61.16925811767578, "geo/layer_0/stable_rank_gate_proj": 168.60403442382812, "geo/layer_0/stable_rank_down_proj": 49.230987548828125, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.031247176229953766, "geo/layer_0/attn_entropy_mean": 6.737332820892334, "geo/layer_0/attn_entropy_std": 0.09903384745121002, "geo/layer_7/stable_rank_q_proj": 32.49687957763672, "geo/layer_7/stable_rank_k_proj": 32.8942985534668, "geo/layer_7/stable_rank_o_proj": 106.24617767333984, "geo/layer_7/stable_rank_gate_proj": 179.12022399902344, "geo/layer_7/stable_rank_down_proj": 212.01483154296875, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5964071154594421, "geo/layer_7/attn_entropy_mean": 5.436002254486084, "geo/layer_7/attn_entropy_std": 1.370032548904419, "geo/layer_14/stable_rank_q_proj": 42.29792022705078, "geo/layer_14/stable_rank_k_proj": 28.575790405273438, "geo/layer_14/stable_rank_o_proj": 95.2928695678711, "geo/layer_14/stable_rank_gate_proj": 195.52833557128906, "geo/layer_14/stable_rank_down_proj": 158.45925903320312, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.5014980435371399, "geo/layer_14/attn_entropy_mean": 6.57470703125, "geo/layer_14/attn_entropy_std": 0.24325546622276306, "geo/layer_21/stable_rank_q_proj": 46.66537094116211, "geo/layer_21/stable_rank_k_proj": 35.36226272583008, "geo/layer_21/stable_rank_o_proj": 89.29405212402344, "geo/layer_21/stable_rank_gate_proj": 139.5961456298828, "geo/layer_21/stable_rank_down_proj": 161.96592712402344, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.24818001687526703, "geo/layer_21/attn_entropy_mean": 5.963669776916504, "geo/layer_21/attn_entropy_std": 0.4524208903312683, "geo/layer_27/stable_rank_q_proj": 57.26713943481445, "geo/layer_27/stable_rank_k_proj": 30.835315704345703, "geo/layer_27/stable_rank_o_proj": 98.38768005371094, "geo/layer_27/stable_rank_gate_proj": 88.9693832397461, "geo/layer_27/stable_rank_down_proj": 92.74687194824219, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10773731768131256, "geo/layer_27/attn_entropy_mean": 5.021413803100586, "geo/layer_27/attn_entropy_std": 0.287231981754303, "attnres/final_alpha/block_0": 0.22549402713775635, "attnres/block_norm/0": 0.8842859268188477, "attnres/final_alpha/block_1": 0.017807066440582275, "attnres/block_norm/1": 3758.449462890625, "attnres/final_alpha/block_2": 0.04652968794107437, "attnres/block_norm/2": 2393.267578125, "attnres/final_alpha/block_3": 0.03118770569562912, "attnres/block_norm/3": 2245.8955078125, "attnres/final_alpha/block_4": 0.04740873724222183, "attnres/block_norm/4": 1713.875244140625, "attnres/final_alpha/block_5": 0.3534337282180786, "attnres/block_norm/5": 1990.522216796875, "attnres/final_alpha/block_6": 0.27813899517059326, "attnres/block_norm/6": 2111.10986328125, "geo/tier1_time_s": 1.3551642894744873, "geo/step": 3375.0, "geo/rankme_slope": 0.10004371730425286} {"step": 3380, "timestamp": 1778198179.0362377, "train/loss": 2.7911058187484743, "train/z_loss": 0.001726066949777305, "train/perplexity": 16.299033616901106, "train/grad_norm": 0.2451171875, "optim/muon_lr": 0.01352, "optim/adamw_lr": 0.0004056, "perf/tokens_per_sec": 1702520.0557311994, "perf/iters_per_sec": 0.8118248251586911, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2317928314208983, "data/tokens_consumed": 7090470912, "data/tokens_consumed_B": 7.090470912, "train/loss_slope": -0.000335424218233591} {"step": 3390, "timestamp": 1778198189.3892376, "train/loss": 2.763067865371704, "train/z_loss": 0.0017252215649932623, "train/perplexity": 15.848389167877178, "train/grad_norm": 0.439453125, "optim/muon_lr": 0.013560000000000001, "optim/adamw_lr": 0.00040679999999999997, "perf/tokens_per_sec": 2026530.8750689414, "perf/iters_per_sec": 0.9663252234787661, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348482847213745, "data/tokens_consumed": 7111442432, "data/tokens_consumed_B": 7.111442432, "train/loss_slope": -0.00033695911160825867} {"step": 3400, "timestamp": 1778198199.732674, "grad/layer_0/attn": 0.004212062805891037, "grad/layer_0/mlp": 0.008399303071200848, "grad/layer_0/attn_mlp_ratio": 0.5014776488046256, "grad/layer_4/attn": 0.003862930228933692, "grad/layer_4/mlp": 0.008084903471171856, "grad/layer_4/attn_mlp_ratio": 0.4777954610006294, "grad/layer_8/attn": 0.01405195239931345, "grad/layer_8/mlp": 0.008957820013165474, "grad/layer_8/attn_mlp_ratio": 1.5686799044625863, "grad/layer_12/attn": 0.007296369876712561, "grad/layer_12/mlp": 0.007206455804407597, "grad/layer_12/attn_mlp_ratio": 1.0124768642863646, "grad/layer_16/attn": 0.008474491536617279, "grad/layer_16/mlp": 0.006961547303944826, "grad/layer_16/attn_mlp_ratio": 1.2173287122651968, "grad/layer_20/attn": 0.038178253918886185, "grad/layer_20/mlp": 0.020807810127735138, "grad/layer_20/attn_mlp_ratio": 1.8348040231546152, "grad/layer_24/attn": 0.018501700833439827, "grad/layer_24/mlp": 0.02060200460255146, "grad/layer_24/attn_mlp_ratio": 0.8980534224976892, "grad/layer_27/attn": 0.01337527111172676, "grad/layer_27/mlp": 0.017429934814572334, "grad/layer_27/attn_mlp_ratio": 0.7673735545933873} {"step": 3400, "timestamp": 1778198199.7469244, "train/loss": 2.8493887901306154, "train/z_loss": 0.0017099411692470313, "train/perplexity": 17.277218606192317, "train/grad_norm": 0.1962890625, "optim/muon_lr": 0.013600000000000001, "optim/adamw_lr": 0.000408, "perf/tokens_per_sec": 2025856.0204179068, "perf/iters_per_sec": 0.9660034277047667, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351930141448975, "data/tokens_consumed": 7132413952, "data/tokens_consumed_B": 7.132413952, "train/loss_slope": -0.00032511851739640214} {"step": 3410, "timestamp": 1778198210.8502922, "train/loss": 2.776805114746094, "train/z_loss": 0.0017292297445237637, "train/perplexity": 16.067604703928374, "train/grad_norm": 0.28515625, "optim/muon_lr": 0.013640000000000001, "optim/adamw_lr": 0.00040919999999999997, "perf/tokens_per_sec": 1889677.5996639177, "perf/iters_per_sec": 0.901068496543845, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1097935438156128, "data/tokens_consumed": 7153385472, "data/tokens_consumed_B": 7.153385472, "train/loss_slope": -0.0003202396903995133} {"step": 3420, "timestamp": 1778198221.7717197, "train/loss": 2.8032557010650634, "train/z_loss": 0.0017177645931951702, "train/perplexity": 16.498272873014987, "train/grad_norm": 0.2197265625, "optim/muon_lr": 0.013680000000000001, "optim/adamw_lr": 0.0004104, "perf/tokens_per_sec": 1921360.6125956308, "perf/iters_per_sec": 0.916176134393516, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0914931774139405, "data/tokens_consumed": 7174356992, "data/tokens_consumed_B": 7.174356992, "train/loss_slope": -0.0003113108464128579} {"step": 3430, "timestamp": 1778198232.139715, "train/loss": 2.798661804199219, "train/z_loss": 0.0017240761313587426, "train/perplexity": 16.422655331550462, "train/grad_norm": 0.251953125, "optim/muon_lr": 0.013720000000000001, "optim/adamw_lr": 0.0004116, "perf/tokens_per_sec": 2024445.501405152, "perf/iters_per_sec": 0.9653308398271332, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0359142780303956, "data/tokens_consumed": 7195328512, "data/tokens_consumed_B": 7.195328512, "train/loss_slope": -0.0003046064819904766} {"step": 3440, "timestamp": 1778198242.4992394, "train/loss": 2.7649550676345824, "train/z_loss": 0.001733394945040345, "train/perplexity": 15.87832652381547, "train/grad_norm": 0.3359375, "optim/muon_lr": 0.01376, "optim/adamw_lr": 0.00041279999999999995, "perf/tokens_per_sec": 2025390.246091753, "perf/iters_per_sec": 0.9657813291987195, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354310750961304, "data/tokens_consumed": 7216300032, "data/tokens_consumed_B": 7.216300032, "train/loss_slope": -0.0003023066619978343} {"step": 3450, "timestamp": 1778198253.4107053, "grad/layer_0/attn": 0.004568568430840969, "grad/layer_0/mlp": 0.009232774376869202, "grad/layer_0/attn_mlp_ratio": 0.494820754290768, "grad/layer_4/attn": 0.004507408943027258, "grad/layer_4/mlp": 0.008754798211157322, "grad/layer_4/attn_mlp_ratio": 0.5148501179385149, "grad/layer_8/attn": 0.014813757501542568, "grad/layer_8/mlp": 0.009674797765910625, "grad/layer_8/attn_mlp_ratio": 1.5311697160866984, "grad/layer_12/attn": 0.008353018201887608, "grad/layer_12/mlp": 0.008085672743618488, "grad/layer_12/attn_mlp_ratio": 1.0330640830317688, "grad/layer_16/attn": 0.009753018617630005, "grad/layer_16/mlp": 0.007762791123241186, "grad/layer_16/attn_mlp_ratio": 1.2563803839565124, "grad/layer_20/attn": 0.04092995077371597, "grad/layer_20/mlp": 0.023730028420686722, "grad/layer_20/attn_mlp_ratio": 1.7248167543513555, "grad/layer_24/attn": 0.010991825722157955, "grad/layer_24/mlp": 0.020076686516404152, "grad/layer_24/attn_mlp_ratio": 0.5474920205795717, "grad/layer_27/attn": 0.015144653618335724, "grad/layer_27/mlp": 0.017262481153011322, "grad/layer_27/attn_mlp_ratio": 0.8773161515059628} {"step": 3450, "timestamp": 1778198254.00965, "eos/sharpness": 22.4799633026123, "eos/L0_probe": 2.5658209323883057, "eos/L_plus": 2.695944309234619, "eos/L_minus": 2.6604971885681152, "eos/grad_norm": 0.2480173259973526, "eos/embed_grad_frac": 0.11197008192539215, "eos/time_s": 0.5960073471069336} {"step": 3450, "timestamp": 1778198254.029744, "train/loss": 2.7706329822540283, "train/z_loss": 0.0017331233015283943, "train/perplexity": 15.968738739640713, "train/grad_norm": 0.248046875, "optim/muon_lr": 0.0138, "optim/adamw_lr": 0.0004139999999999999, "perf/tokens_per_sec": 1819570.5197460514, "perf/iters_per_sec": 0.8676388357858903, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1525532960891725, "data/tokens_consumed": 7237271552, "data/tokens_consumed_B": 7.237271552, "train/loss_slope": -0.00029958090407334036} {"step": 3450, "timestamp": 1778198255.3950315, "geo/rankme_last": 386.1490783691406, "geo/layer_0/stable_rank_q_proj": 43.044708251953125, "geo/layer_0/stable_rank_k_proj": 36.30978012084961, "geo/layer_0/stable_rank_o_proj": 61.17948532104492, "geo/layer_0/stable_rank_gate_proj": 168.87210083007812, "geo/layer_0/stable_rank_down_proj": 49.10590744018555, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.03440604358911514, "geo/layer_0/attn_entropy_mean": 6.727709770202637, "geo/layer_0/attn_entropy_std": 0.1058749258518219, "geo/layer_7/stable_rank_q_proj": 32.81614685058594, "geo/layer_7/stable_rank_k_proj": 33.060794830322266, "geo/layer_7/stable_rank_o_proj": 106.60189056396484, "geo/layer_7/stable_rank_gate_proj": 179.03785705566406, "geo/layer_7/stable_rank_down_proj": 212.15957641601562, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5916846990585327, "geo/layer_7/attn_entropy_mean": 5.437244415283203, "geo/layer_7/attn_entropy_std": 1.355073094367981, "geo/layer_14/stable_rank_q_proj": 42.954341888427734, "geo/layer_14/stable_rank_k_proj": 29.08668327331543, "geo/layer_14/stable_rank_o_proj": 95.63282775878906, "geo/layer_14/stable_rank_gate_proj": 194.92190551757812, "geo/layer_14/stable_rank_down_proj": 158.1479034423828, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.5030022859573364, "geo/layer_14/attn_entropy_mean": 6.563957214355469, "geo/layer_14/attn_entropy_std": 0.29368168115615845, "geo/layer_21/stable_rank_q_proj": 46.863014221191406, "geo/layer_21/stable_rank_k_proj": 35.087303161621094, "geo/layer_21/stable_rank_o_proj": 90.56147003173828, "geo/layer_21/stable_rank_gate_proj": 139.443115234375, "geo/layer_21/stable_rank_down_proj": 160.60877990722656, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.24186263978481293, "geo/layer_21/attn_entropy_mean": 5.90994930267334, "geo/layer_21/attn_entropy_std": 0.4499056041240692, "geo/layer_27/stable_rank_q_proj": 56.5128059387207, "geo/layer_27/stable_rank_k_proj": 31.423038482666016, "geo/layer_27/stable_rank_o_proj": 98.5599365234375, "geo/layer_27/stable_rank_gate_proj": 88.84394073486328, "geo/layer_27/stable_rank_down_proj": 94.86923217773438, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09379376471042633, "geo/layer_27/attn_entropy_mean": 4.971170425415039, "geo/layer_27/attn_entropy_std": 0.33267906308174133, "attnres/final_alpha/block_0": 0.22721290588378906, "attnres/block_norm/0": 0.8957862257957458, "attnres/final_alpha/block_1": 0.017856549471616745, "attnres/block_norm/1": 3867.27294921875, "attnres/final_alpha/block_2": 0.04608745127916336, "attnres/block_norm/2": 2456.32080078125, "attnres/final_alpha/block_3": 0.03092052973806858, "attnres/block_norm/3": 2319.05224609375, "attnres/final_alpha/block_4": 0.04829191416501999, "attnres/block_norm/4": 1740.8922119140625, "attnres/final_alpha/block_5": 0.3511844873428345, "attnres/block_norm/5": 2042.1552734375, "attnres/final_alpha/block_6": 0.27844613790512085, "attnres/block_norm/6": 2171.431640625, "geo/tier1_time_s": 1.361663818359375, "geo/step": 3450.0, "geo/rankme_slope": 0.09746116588191416} {"step": 3460, "timestamp": 1778198265.7530427, "train/loss": 2.8316274881362915, "train/z_loss": 0.0017150128493085503, "train/perplexity": 16.97306181504698, "train/grad_norm": 0.306640625, "optim/muon_lr": 0.01384, "optim/adamw_lr": 0.00041519999999999995, "perf/tokens_per_sec": 1789379.0900302043, "perf/iters_per_sec": 0.8532424402380964, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1719998359680175, "data/tokens_consumed": 7258243072, "data/tokens_consumed_B": 7.258243072, "train/loss_slope": -0.0002921321728835405} {"step": 3470, "timestamp": 1778198276.7054908, "train/loss": 2.698914647102356, "train/z_loss": 0.00174603417981416, "train/perplexity": 14.863590725853602, "train/grad_norm": 0.2431640625, "optim/muon_lr": 0.01388, "optim/adamw_lr": 0.00041639999999999993, "perf/tokens_per_sec": 1915699.4905194894, "perf/iters_per_sec": 0.9134767010304877, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0947186708450318, "data/tokens_consumed": 7279214592, "data/tokens_consumed_B": 7.279214592, "train/loss_slope": -0.00029060447843375954} {"step": 3480, "timestamp": 1778198287.0634499, "train/loss": 2.7974472045898438, "train/z_loss": 0.001729962951503694, "train/perplexity": 16.40272048967618, "train/grad_norm": 0.318359375, "optim/muon_lr": 0.01392, "optim/adamw_lr": 0.00041759999999999996, "perf/tokens_per_sec": 2025793.5471069517, "perf/iters_per_sec": 0.9659736381087073, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035224938392639, "data/tokens_consumed": 7300186112, "data/tokens_consumed_B": 7.300186112, "train/loss_slope": -0.0002841495919935774} {"step": 3490, "timestamp": 1778198297.7891963, "train/loss": 2.6898218631744384, "train/z_loss": 0.0017531894613057376, "train/perplexity": 14.72905190018316, "train/grad_norm": 0.228515625, "optim/muon_lr": 0.01396, "optim/adamw_lr": 0.00041879999999999993, "perf/tokens_per_sec": 1956239.8536883283, "perf/iters_per_sec": 0.9328078525964395, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0720321416854859, "data/tokens_consumed": 7321157632, "data/tokens_consumed_B": 7.321157632, "train/loss_slope": -0.0002836939940894648} {"step": 3500, "timestamp": 1778198308.1321328, "grad/layer_0/attn": 0.004843616858124733, "grad/layer_0/mlp": 0.009112094528973103, "grad/layer_0/attn_mlp_ratio": 0.5315591041738984, "grad/layer_4/attn": 0.0041421400383114815, "grad/layer_4/mlp": 0.008822489529848099, "grad/layer_4/attn_mlp_ratio": 0.4694978642193994, "grad/layer_8/attn": 0.014896749518811703, "grad/layer_8/mlp": 0.010166224092245102, "grad/layer_8/attn_mlp_ratio": 1.4653178247018293, "grad/layer_12/attn": 0.008067021146416664, "grad/layer_12/mlp": 0.008478903211653233, "grad/layer_12/attn_mlp_ratio": 0.9514227076194529, "grad/layer_16/attn": 0.01381636131554842, "grad/layer_16/mlp": 0.007882614620029926, "grad/layer_16/attn_mlp_ratio": 1.7527637473439737, "grad/layer_20/attn": 0.033436186611652374, "grad/layer_20/mlp": 0.0218565184623003, "grad/layer_20/attn_mlp_ratio": 1.529803866811868, "grad/layer_24/attn": 0.01650257222354412, "grad/layer_24/mlp": 0.022860605269670486, "grad/layer_24/attn_mlp_ratio": 0.7218781811193126, "grad/layer_27/attn": 0.021353019401431084, "grad/layer_27/mlp": 0.0206390842795372, "grad/layer_27/attn_mlp_ratio": 1.0345914096170719} {"step": 3500, "timestamp": 1778198308.146253, "train/loss": 2.695493149757385, "train/z_loss": 0.0017492546467110515, "train/perplexity": 14.812821891891376, "train/grad_norm": 0.244140625, "optim/muon_lr": 0.013999999999999999, "optim/adamw_lr": 0.00041999999999999996, "perf/tokens_per_sec": 2025836.6108213263, "perf/iters_per_sec": 0.9659941724878913, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035202932357788, "data/tokens_consumed": 7342129152, "data/tokens_consumed_B": 7.342129152, "train/loss_slope": -0.00028673279005737954} {"step": 3500, "timestamp": 1778198315.497597, "geo/ww_alpha_mean": 8.551433534025117, "geo/ww_alpha_std": 4.944722111105785, "geo/ww_alpha_min": 2.4321246121736206, "geo/ww_alpha_max": 31.131518850050323, "geo/ww_alpha_healthy_frac": 0.12690355329949238, "geo/ww_alpha_by_type/q_proj": 5.00474830114524, "geo/ww_alpha_by_type/k_proj": 5.235543472635222, "geo/ww_alpha_by_type/v_proj": 7.900943292114659, "geo/ww_alpha_by_type/o_proj": 7.065485703546703, "geo/ww_alpha_by_type/gate_proj": 12.108703199531293, "geo/ww_alpha_by_type/up_proj": 12.689239263333276, "geo/ww_alpha_by_type/down_proj": 10.062156439959484, "geo/twonn_id/layer_0": 0.6808432340621948, "geo/twonn_id/layer_7": 2.384877920150757, "geo/twonn_id/layer_14": 2.986492872238159, "geo/twonn_id/layer_21": 6.660088539123535, "geo/twonn_id/layer_27": 5.8418378829956055, "geo/tier2_time_s": 7.344826936721802} {"step": 3500, "timestamp": 1778198316.5408285, "eoc/jacobian_sigma/layer_0/attn": 675.7977294921875, "eoc/jacobian_sigma/layer_0/mlp": 774.5328369140625, "eoc/jacobian_sigma/layer_0": 774.5328369140625, "eoc/jacobian_sigma/layer_7/attn": 1.19467031955719, "eoc/jacobian_sigma/layer_7/mlp": 1.489977478981018, "eoc/jacobian_sigma/layer_7": 1.489977478981018, "eoc/jacobian_sigma/layer_14/attn": 1.231967806816101, "eoc/jacobian_sigma/layer_14/mlp": 4.934130668640137, "eoc/jacobian_sigma/layer_14": 4.934130668640137, "eoc/jacobian_sigma/layer_21/attn": 1.1163127422332764, "eoc/jacobian_sigma/layer_21/mlp": 2.8915693759918213, "eoc/jacobian_sigma/layer_21": 2.8915693759918213, "eoc/jacobian_sigma/layer_27/attn": 1.4475690126419067, "eoc/jacobian_sigma/layer_27/mlp": 3.085324764251709, "eoc/jacobian_sigma/layer_27": 3.085324764251709, "eoc/layer0_sigma": 774.5328369140625, "eoc/sigma_max": 4.934130668640137, "eoc/sigma_min": 1.489977478981018, "eoc/sigma_mean": 3.1002505719661713, "eoc/time_s": 1.0370192527770996} {"step": 3510, "timestamp": 1778198326.9122322, "train/loss": 2.746278929710388, "train/z_loss": 0.001738932146690786, "train/perplexity": 15.584532714019073, "train/grad_norm": 0.2041015625, "optim/muon_lr": 0.014039999999999999, "optim/adamw_lr": 0.00042119999999999994, "perf/tokens_per_sec": 1117845.6660603303, "perf/iters_per_sec": 0.5330303507138874, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.8760657787322998, "data/tokens_consumed": 7363100672, "data/tokens_consumed_B": 7.363100672, "train/loss_slope": -0.00028480762143482725} {"step": 3520, "timestamp": 1778198337.2724814, "train/loss": 2.717027473449707, "train/z_loss": 0.0017465553362853825, "train/perplexity": 15.135265333647908, "train/grad_norm": 0.41015625, "optim/muon_lr": 0.014079999999999999, "optim/adamw_lr": 0.0004223999999999999, "perf/tokens_per_sec": 2025219.8500040788, "perf/iters_per_sec": 0.965700078012504, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035518193244934, "data/tokens_consumed": 7384072192, "data/tokens_consumed_B": 7.384072192, "train/loss_slope": -0.0002847024711016024} {"step": 3525, "timestamp": 1778198343.0527966, "eos/sharpness": 20.552325248718258, "eos/L0_probe": 2.551239013671875, "eos/L_plus": 2.667599678039551, "eos/L_minus": 2.640401601791382, "eos/grad_norm": 0.27614903450012207, "eos/embed_grad_frac": 0.0857645645737648, "eos/time_s": 0.6093101501464844} {"step": 3525, "timestamp": 1778198344.434301, "geo/rankme_last": 387.23345947265625, "geo/layer_0/stable_rank_q_proj": 41.82020950317383, "geo/layer_0/stable_rank_k_proj": 34.89434814453125, "geo/layer_0/stable_rank_o_proj": 61.068817138671875, "geo/layer_0/stable_rank_gate_proj": 170.35658264160156, "geo/layer_0/stable_rank_down_proj": 48.9909553527832, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.03366214409470558, "geo/layer_0/attn_entropy_mean": 6.719094276428223, "geo/layer_0/attn_entropy_std": 0.10973202437162399, "geo/layer_7/stable_rank_q_proj": 33.2056999206543, "geo/layer_7/stable_rank_k_proj": 33.2070198059082, "geo/layer_7/stable_rank_o_proj": 105.3037338256836, "geo/layer_7/stable_rank_gate_proj": 179.15704345703125, "geo/layer_7/stable_rank_down_proj": 211.76651000976562, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5835009217262268, "geo/layer_7/attn_entropy_mean": 5.418050765991211, "geo/layer_7/attn_entropy_std": 1.3309438228607178, "geo/layer_14/stable_rank_q_proj": 43.721317291259766, "geo/layer_14/stable_rank_k_proj": 29.737245559692383, "geo/layer_14/stable_rank_o_proj": 95.76329040527344, "geo/layer_14/stable_rank_gate_proj": 194.10340881347656, "geo/layer_14/stable_rank_down_proj": 158.40310668945312, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4979512691497803, "geo/layer_14/attn_entropy_mean": 6.525416851043701, "geo/layer_14/attn_entropy_std": 0.30202341079711914, "geo/layer_21/stable_rank_q_proj": 46.69746398925781, "geo/layer_21/stable_rank_k_proj": 34.86882400512695, "geo/layer_21/stable_rank_o_proj": 92.17813110351562, "geo/layer_21/stable_rank_gate_proj": 139.4712677001953, "geo/layer_21/stable_rank_down_proj": 159.96632385253906, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.23507463932037354, "geo/layer_21/attn_entropy_mean": 5.916107177734375, "geo/layer_21/attn_entropy_std": 0.44040295481681824, "geo/layer_27/stable_rank_q_proj": 55.64617919921875, "geo/layer_27/stable_rank_k_proj": 31.708820343017578, "geo/layer_27/stable_rank_o_proj": 98.65560913085938, "geo/layer_27/stable_rank_gate_proj": 88.60222625732422, "geo/layer_27/stable_rank_down_proj": 96.97052001953125, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10412460565567017, "geo/layer_27/attn_entropy_mean": 4.9407501220703125, "geo/layer_27/attn_entropy_std": 0.3158603012561798, "attnres/final_alpha/block_0": 0.2280254364013672, "attnres/block_norm/0": 0.907222330570221, "attnres/final_alpha/block_1": 0.018108021467924118, "attnres/block_norm/1": 3941.35498046875, "attnres/final_alpha/block_2": 0.04589962214231491, "attnres/block_norm/2": 2537.20068359375, "attnres/final_alpha/block_3": 0.03088904544711113, "attnres/block_norm/3": 2418.762451171875, "attnres/final_alpha/block_4": 0.04782154783606529, "attnres/block_norm/4": 1771.45751953125, "attnres/final_alpha/block_5": 0.351768434047699, "attnres/block_norm/5": 2094.06884765625, "attnres/final_alpha/block_6": 0.2774878740310669, "attnres/block_norm/6": 2225.48583984375, "geo/tier1_time_s": 1.3615164756774902, "geo/step": 3525.0, "geo/rankme_slope": 0.09497651539243653} {"step": 3530, "timestamp": 1778198349.6217616, "train/loss": 2.7746235132217407, "train/z_loss": 0.0017437508678995073, "train/perplexity": 16.03258980118363, "train/grad_norm": 0.259765625, "optim/muon_lr": 0.014119999999999999, "optim/adamw_lr": 0.00042359999999999994, "perf/tokens_per_sec": 1698874.0872019602, "perf/iters_per_sec": 0.8100862918863106, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2344363927841187, "data/tokens_consumed": 7405043712, "data/tokens_consumed_B": 7.405043712, "train/loss_slope": -0.00027880230440665687} {"step": 3540, "timestamp": 1778198359.9831388, "train/loss": 2.7346997261047363, "train/z_loss": 0.001746666571125388, "train/perplexity": 15.4051169868785, "train/grad_norm": 0.3046875, "optim/muon_lr": 0.014159999999999999, "optim/adamw_lr": 0.0004247999999999999, "perf/tokens_per_sec": 2024839.1971500805, "perf/iters_per_sec": 0.9655185685873415, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0357128620147704, "data/tokens_consumed": 7426015232, "data/tokens_consumed_B": 7.426015232, "train/loss_slope": -0.0002746344147592632} {"step": 3550, "timestamp": 1778198370.3261185, "grad/layer_0/attn": 0.004449959844350815, "grad/layer_0/mlp": 0.00836483296006918, "grad/layer_0/attn_mlp_ratio": 0.5319842981198734, "grad/layer_4/attn": 0.0037331164348870516, "grad/layer_4/mlp": 0.008146174252033234, "grad/layer_4/attn_mlp_ratio": 0.4582662085983084, "grad/layer_8/attn": 0.012525527738034725, "grad/layer_8/mlp": 0.009024628438055515, "grad/layer_8/attn_mlp_ratio": 1.387927235477498, "grad/layer_12/attn": 0.006247223820537329, "grad/layer_12/mlp": 0.006768692750483751, "grad/layer_12/attn_mlp_ratio": 0.9229586802850489, "grad/layer_16/attn": 0.008069094270467758, "grad/layer_16/mlp": 0.007402591872960329, "grad/layer_16/attn_mlp_ratio": 1.090036341316931, "grad/layer_20/attn": 0.027552476152777672, "grad/layer_20/mlp": 0.021448522806167603, "grad/layer_20/attn_mlp_ratio": 1.2845861821493942, "grad/layer_24/attn": 0.017196062952280045, "grad/layer_24/mlp": 0.027827680110931396, "grad/layer_24/attn_mlp_ratio": 0.6179481301328527, "grad/layer_27/attn": 0.014070644043385983, "grad/layer_27/mlp": 0.029635116457939148, "grad/layer_27/attn_mlp_ratio": 0.47479631186456406} {"step": 3550, "timestamp": 1778198370.340187, "train/loss": 2.7436899185180663, "train/z_loss": 0.0017481197486631573, "train/perplexity": 15.544236370746038, "train/grad_norm": 0.3125, "optim/muon_lr": 0.014199999999999999, "optim/adamw_lr": 0.00042599999999999995, "perf/tokens_per_sec": 2025709.1046181787, "perf/iters_per_sec": 0.9659333727923292, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352680921554565, "data/tokens_consumed": 7446986752, "data/tokens_consumed_B": 7.446986752, "train/loss_slope": -0.00026939426271042026} {"step": 3560, "timestamp": 1778198380.699544, "train/loss": 2.7348761796951293, "train/z_loss": 0.0017533379490487278, "train/perplexity": 15.407835514921222, "train/grad_norm": 0.28515625, "optim/muon_lr": 0.01424, "optim/adamw_lr": 0.0004271999999999999, "perf/tokens_per_sec": 2025739.4751114883, "perf/iters_per_sec": 0.9659478545720521, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035252571105957, "data/tokens_consumed": 7467958272, "data/tokens_consumed_B": 7.467958272, "train/loss_slope": -0.0002643175405673426} {"step": 3570, "timestamp": 1778198391.0541906, "train/loss": 2.7856715440750124, "train/z_loss": 0.0017439075978472828, "train/perplexity": 16.210700422119526, "train/grad_norm": 0.21875, "optim/muon_lr": 0.01428, "optim/adamw_lr": 0.00042839999999999995, "perf/tokens_per_sec": 2026525.3190667706, "perf/iters_per_sec": 0.9663225741704801, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348511219024659, "data/tokens_consumed": 7488929792, "data/tokens_consumed_B": 7.488929792, "train/loss_slope": -0.0002587928166519655} {"step": 3580, "timestamp": 1778198401.4153204, "train/loss": 2.706889772415161, "train/z_loss": 0.00175874704727903, "train/perplexity": 14.982603665249362, "train/grad_norm": 0.287109375, "optim/muon_lr": 0.01432, "optim/adamw_lr": 0.0004295999999999999, "perf/tokens_per_sec": 2025367.021292033, "perf/iters_per_sec": 0.9657702547512211, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354429483413696, "data/tokens_consumed": 7509901312, "data/tokens_consumed_B": 7.509901312, "train/loss_slope": -0.0002574978692231864} {"step": 3590, "timestamp": 1778198411.7705224, "train/loss": 2.659900188446045, "train/z_loss": 0.0017653310089372099, "train/perplexity": 14.294862235056794, "train/grad_norm": 0.326171875, "optim/muon_lr": 0.01436, "optim/adamw_lr": 0.00043079999999999995, "perf/tokens_per_sec": 2026617.1604246588, "perf/iters_per_sec": 0.966366367542581, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348042249679565, "data/tokens_consumed": 7530872832, "data/tokens_consumed_B": 7.530872832, "train/loss_slope": -0.0002636197225488846} {"step": 3600, "timestamp": 1778198422.111374, "grad/layer_0/attn": 0.004812384024262428, "grad/layer_0/mlp": 0.007827834226191044, "grad/layer_0/attn_mlp_ratio": 0.6147784717620732, "grad/layer_4/attn": 0.0033212220296263695, "grad/layer_4/mlp": 0.007655971217900515, "grad/layer_4/attn_mlp_ratio": 0.4338080554012764, "grad/layer_8/attn": 0.010881238617002964, "grad/layer_8/mlp": 0.008743172511458397, "grad/layer_8/attn_mlp_ratio": 1.244541209531025, "grad/layer_12/attn": 0.008764845319092274, "grad/layer_12/mlp": 0.007144772447645664, "grad/layer_12/attn_mlp_ratio": 1.226749383642794, "grad/layer_16/attn": 0.010932606644928455, "grad/layer_16/mlp": 0.0070241596549749374, "grad/layer_16/attn_mlp_ratio": 1.556429099891317, "grad/layer_20/attn": 0.024493901059031487, "grad/layer_20/mlp": 0.01989373192191124, "grad/layer_20/attn_mlp_ratio": 1.2312371068461943, "grad/layer_24/attn": 0.010517324320971966, "grad/layer_24/mlp": 0.024161970242857933, "grad/layer_24/attn_mlp_ratio": 0.43528421613516277, "grad/layer_27/attn": 0.015564617700874805, "grad/layer_27/mlp": 0.023669838905334473, "grad/layer_27/attn_mlp_ratio": 0.657571760305045} {"step": 3600, "timestamp": 1778198422.7082965, "eos/sharpness": 11.993408203124998, "eos/L0_probe": 2.5375406742095947, "eos/L_plus": 2.627742052078247, "eos/L_minus": 2.5672733783721924, "eos/grad_norm": 0.2575955092906952, "eos/embed_grad_frac": 0.11160305142402649, "eos/time_s": 0.5940656661987305} {"step": 3600, "timestamp": 1778198422.7259295, "train/loss": 2.713806128501892, "train/z_loss": 0.0017638473538681865, "train/perplexity": 15.08658786867854, "train/grad_norm": 0.2578125, "optim/muon_lr": 0.0144, "optim/adamw_lr": 0.00043199999999999993, "perf/tokens_per_sec": 1915099.257571159, "perf/iters_per_sec": 0.9131904876571459, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0950617790222168, "data/tokens_consumed": 7551844352, "data/tokens_consumed_B": 7.551844352, "train/loss_slope": -0.0002643834293764918} {"step": 3600, "timestamp": 1778198424.0857604, "geo/rankme_last": 389.6121826171875, "geo/layer_0/stable_rank_q_proj": 40.733829498291016, "geo/layer_0/stable_rank_k_proj": 33.441741943359375, "geo/layer_0/stable_rank_o_proj": 61.151222229003906, "geo/layer_0/stable_rank_gate_proj": 171.86376953125, "geo/layer_0/stable_rank_down_proj": 48.95210647583008, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.03888837248086929, "geo/layer_0/attn_entropy_mean": 6.710434913635254, "geo/layer_0/attn_entropy_std": 0.12450890988111496, "geo/layer_7/stable_rank_q_proj": 33.67641067504883, "geo/layer_7/stable_rank_k_proj": 33.71312713623047, "geo/layer_7/stable_rank_o_proj": 104.44952392578125, "geo/layer_7/stable_rank_gate_proj": 178.44683837890625, "geo/layer_7/stable_rank_down_proj": 213.68795776367188, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.584601640701294, "geo/layer_7/attn_entropy_mean": 5.4090776443481445, "geo/layer_7/attn_entropy_std": 1.3005012273788452, "geo/layer_14/stable_rank_q_proj": 44.41642379760742, "geo/layer_14/stable_rank_k_proj": 30.390113830566406, "geo/layer_14/stable_rank_o_proj": 96.17125701904297, "geo/layer_14/stable_rank_gate_proj": 193.57676696777344, "geo/layer_14/stable_rank_down_proj": 158.50823974609375, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4928000569343567, "geo/layer_14/attn_entropy_mean": 6.518734931945801, "geo/layer_14/attn_entropy_std": 0.2995679974555969, "geo/layer_21/stable_rank_q_proj": 46.665035247802734, "geo/layer_21/stable_rank_k_proj": 34.68966293334961, "geo/layer_21/stable_rank_o_proj": 93.27345275878906, "geo/layer_21/stable_rank_gate_proj": 139.3963623046875, "geo/layer_21/stable_rank_down_proj": 158.97689819335938, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.23356124758720398, "geo/layer_21/attn_entropy_mean": 5.937426567077637, "geo/layer_21/attn_entropy_std": 0.41494491696357727, "geo/layer_27/stable_rank_q_proj": 54.85574722290039, "geo/layer_27/stable_rank_k_proj": 32.228477478027344, "geo/layer_27/stable_rank_o_proj": 99.2975082397461, "geo/layer_27/stable_rank_gate_proj": 88.50870513916016, "geo/layer_27/stable_rank_down_proj": 99.26765441894531, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09658657759428024, "geo/layer_27/attn_entropy_mean": 4.96147346496582, "geo/layer_27/attn_entropy_std": 0.32626157999038696, "attnres/final_alpha/block_0": 0.22893905639648438, "attnres/block_norm/0": 0.918270468711853, "attnres/final_alpha/block_1": 0.01823412999510765, "attnres/block_norm/1": 4033.04345703125, "attnres/final_alpha/block_2": 0.04554041475057602, "attnres/block_norm/2": 2620.71435546875, "attnres/final_alpha/block_3": 0.030802860856056213, "attnres/block_norm/3": 2472.892822265625, "attnres/final_alpha/block_4": 0.04788478836417198, "attnres/block_norm/4": 1807.8709716796875, "attnres/final_alpha/block_5": 0.3563131093978882, "attnres/block_norm/5": 2141.4853515625, "attnres/final_alpha/block_6": 0.2722856402397156, "attnres/block_norm/6": 2296.2353515625, "geo/tier1_time_s": 1.3559565544128418, "geo/step": 3600.0, "geo/rankme_slope": 0.09262901514792929} {"step": 3610, "timestamp": 1778198434.4480119, "train/loss": 2.7243584871292112, "train/z_loss": 0.0017606088891625403, "train/perplexity": 15.246629879594664, "train/grad_norm": 0.25390625, "optim/muon_lr": 0.01444, "optim/adamw_lr": 0.00043319999999999996, "perf/tokens_per_sec": 1789626.9796817838, "perf/iters_per_sec": 0.8533606432351035, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1718374967575074, "data/tokens_consumed": 7572815872, "data/tokens_consumed_B": 7.572815872, "train/loss_slope": -0.0002652255212799265} {"step": 3620, "timestamp": 1778198444.8081472, "train/loss": 2.7115121126174926, "train/z_loss": 0.0017624387401156127, "train/perplexity": 15.052018662780185, "train/grad_norm": 0.228515625, "optim/muon_lr": 0.01448, "optim/adamw_lr": 0.00043439999999999993, "perf/tokens_per_sec": 2025426.576716204, "perf/iters_per_sec": 0.9657986529904384, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354125022888183, "data/tokens_consumed": 7593787392, "data/tokens_consumed_B": 7.593787392, "train/loss_slope": -0.0002622391651816244} {"step": 3630, "timestamp": 1778198455.166257, "train/loss": 2.6879997730255125, "train/z_loss": 0.0017636445700190961, "train/perplexity": 14.702238675287642, "train/grad_norm": 0.24609375, "optim/muon_lr": 0.01452, "optim/adamw_lr": 0.00043559999999999996, "perf/tokens_per_sec": 2025696.1822727758, "perf/iters_per_sec": 0.9659272109378699, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352746963500976, "data/tokens_consumed": 7614758912, "data/tokens_consumed_B": 7.614758912, "train/loss_slope": -0.0002651569458016969} {"step": 3640, "timestamp": 1778198465.5253603, "train/loss": 2.6825090885162353, "train/z_loss": 0.0017691148910671472, "train/perplexity": 14.62173453481063, "train/grad_norm": 0.2060546875, "optim/muon_lr": 0.01456, "optim/adamw_lr": 0.00043679999999999994, "perf/tokens_per_sec": 2025590.9902518198, "perf/iters_per_sec": 0.9658770514735316, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035328459739685, "data/tokens_consumed": 7635730432, "data/tokens_consumed_B": 7.635730432, "train/loss_slope": -0.00026349328081421125} {"step": 3650, "timestamp": 1778198475.8682559, "grad/layer_0/attn": 0.005321466829627752, "grad/layer_0/mlp": 0.009309940040111542, "grad/layer_0/attn_mlp_ratio": 0.5715898007443039, "grad/layer_4/attn": 0.0037834797985851765, "grad/layer_4/mlp": 0.00821447093039751, "grad/layer_4/attn_mlp_ratio": 0.4605871497488364, "grad/layer_8/attn": 0.011578571982681751, "grad/layer_8/mlp": 0.009489824064075947, "grad/layer_8/attn_mlp_ratio": 1.2201039537184293, "grad/layer_12/attn": 0.010169295594096184, "grad/layer_12/mlp": 0.009099959395825863, "grad/layer_12/attn_mlp_ratio": 1.1175099843862848, "grad/layer_16/attn": 0.013778503984212875, "grad/layer_16/mlp": 0.008194312453269958, "grad/layer_16/attn_mlp_ratio": 1.6814716176178237, "grad/layer_20/attn": 0.04519910365343094, "grad/layer_20/mlp": 0.023585254326462746, "grad/layer_20/attn_mlp_ratio": 1.9164136555897133, "grad/layer_24/attn": 0.03345213457942009, "grad/layer_24/mlp": 0.022290924564003944, "grad/layer_24/attn_mlp_ratio": 1.5007064571636906, "grad/layer_27/attn": 0.015703991055488586, "grad/layer_27/mlp": 0.02356121502816677, "grad/layer_27/attn_mlp_ratio": 0.6665187245251585} {"step": 3650, "timestamp": 1778198475.8821685, "train/loss": 2.7210838556289674, "train/z_loss": 0.0017608775524422526, "train/perplexity": 15.19678444237973, "train/grad_norm": 0.388671875, "optim/muon_lr": 0.0146, "optim/adamw_lr": 0.00043799999999999997, "perf/tokens_per_sec": 2026013.9705056853, "perf/iters_per_sec": 0.9660787441757609, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351123094558716, "data/tokens_consumed": 7656701952, "data/tokens_consumed_B": 7.656701952, "train/loss_slope": -0.0002628826945099143} {"step": 3660, "timestamp": 1778198486.23568, "train/loss": 2.717200255393982, "train/z_loss": 0.0017644604784436523, "train/perplexity": 15.13788066015347, "train/grad_norm": 0.275390625, "optim/muon_lr": 0.01464, "optim/adamw_lr": 0.00043919999999999994, "perf/tokens_per_sec": 2026524.898866165, "perf/iters_per_sec": 0.9663223738032174, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034851336479187, "data/tokens_consumed": 7677673472, "data/tokens_consumed_B": 7.677673472, "train/loss_slope": -0.0002602596350342814} {"step": 3670, "timestamp": 1778198496.5897582, "train/loss": 2.730182909965515, "train/z_loss": 0.0017625302192755044, "train/perplexity": 15.335691814247852, "train/grad_norm": 0.333984375, "optim/muon_lr": 0.01468, "optim/adamw_lr": 0.00044039999999999997, "perf/tokens_per_sec": 2026520.9303134836, "perf/iters_per_sec": 0.9663204814498346, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348533630371093, "data/tokens_consumed": 7698644992, "data/tokens_consumed_B": 7.698644992, "train/loss_slope": -0.0002578210129810818} {"step": 3675, "timestamp": 1778198502.3511732, "eos/sharpness": 12.095451354980467, "eos/L0_probe": 2.527949094772339, "eos/L_plus": 2.604682683944702, "eos/L_minus": 2.5721700191497803, "eos/grad_norm": 0.20939046144485474, "eos/embed_grad_frac": 0.12809154391288757, "eos/time_s": 0.5910422801971436} {"step": 3675, "timestamp": 1778198503.7323625, "geo/rankme_last": 390.7754211425781, "geo/layer_0/stable_rank_q_proj": 39.62908172607422, "geo/layer_0/stable_rank_k_proj": 31.98309326171875, "geo/layer_0/stable_rank_o_proj": 60.99058151245117, "geo/layer_0/stable_rank_gate_proj": 171.58596801757812, "geo/layer_0/stable_rank_down_proj": 48.656986236572266, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.02884146384894848, "geo/layer_0/attn_entropy_mean": 6.699832916259766, "geo/layer_0/attn_entropy_std": 0.12765134871006012, "geo/layer_7/stable_rank_q_proj": 34.0849723815918, "geo/layer_7/stable_rank_k_proj": 33.99083709716797, "geo/layer_7/stable_rank_o_proj": 103.81929016113281, "geo/layer_7/stable_rank_gate_proj": 178.43247985839844, "geo/layer_7/stable_rank_down_proj": 214.38536071777344, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5932235717773438, "geo/layer_7/attn_entropy_mean": 5.378262519836426, "geo/layer_7/attn_entropy_std": 1.2906752824783325, "geo/layer_14/stable_rank_q_proj": 45.199180603027344, "geo/layer_14/stable_rank_k_proj": 31.1601505279541, "geo/layer_14/stable_rank_o_proj": 96.5537338256836, "geo/layer_14/stable_rank_gate_proj": 193.3160858154297, "geo/layer_14/stable_rank_down_proj": 158.00328063964844, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.47876906394958496, "geo/layer_14/attn_entropy_mean": 6.491976737976074, "geo/layer_14/attn_entropy_std": 0.3079507648944855, "geo/layer_21/stable_rank_q_proj": 46.76012420654297, "geo/layer_21/stable_rank_k_proj": 34.50439453125, "geo/layer_21/stable_rank_o_proj": 93.86380767822266, "geo/layer_21/stable_rank_gate_proj": 139.3856201171875, "geo/layer_21/stable_rank_down_proj": 157.2467803955078, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.2274816781282425, "geo/layer_21/attn_entropy_mean": 5.900712966918945, "geo/layer_21/attn_entropy_std": 0.4090120792388916, "geo/layer_27/stable_rank_q_proj": 53.9168701171875, "geo/layer_27/stable_rank_k_proj": 32.85249710083008, "geo/layer_27/stable_rank_o_proj": 99.7420654296875, "geo/layer_27/stable_rank_gate_proj": 88.30977630615234, "geo/layer_27/stable_rank_down_proj": 101.49408721923828, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09273359179496765, "geo/layer_27/attn_entropy_mean": 4.879979133605957, "geo/layer_27/attn_entropy_std": 0.333310067653656, "attnres/final_alpha/block_0": 0.22934702038764954, "attnres/block_norm/0": 0.9291238784790039, "attnres/final_alpha/block_1": 0.018050700426101685, "attnres/block_norm/1": 4148.78515625, "attnres/final_alpha/block_2": 0.044266920536756516, "attnres/block_norm/2": 2717.130859375, "attnres/final_alpha/block_3": 0.030129622668027878, "attnres/block_norm/3": 2561.69775390625, "attnres/final_alpha/block_4": 0.04807806760072708, "attnres/block_norm/4": 1837.566650390625, "attnres/final_alpha/block_5": 0.3570573329925537, "attnres/block_norm/5": 2185.56787109375, "attnres/final_alpha/block_6": 0.2730703353881836, "attnres/block_norm/6": 2348.4521484375, "geo/tier1_time_s": 1.363311529159546, "geo/step": 3675.0, "geo/rankme_slope": 0.0903707110065706} {"step": 3680, "timestamp": 1778198508.9195085, "train/loss": 2.727125954627991, "train/z_loss": 0.0017640413949266075, "train/perplexity": 15.288882872176679, "train/grad_norm": 0.265625, "optim/muon_lr": 0.01472, "optim/adamw_lr": 0.00044159999999999995, "perf/tokens_per_sec": 1701752.7620848077, "perf/iters_per_sec": 0.8114589510368384, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2323482275009154, "data/tokens_consumed": 7719616512, "data/tokens_consumed_B": 7.719616512, "train/loss_slope": -0.00025254045773153366} {"step": 3690, "timestamp": 1778198519.276609, "train/loss": 2.7165584564208984, "train/z_loss": 0.0017681797384284438, "train/perplexity": 15.128168300915595, "train/grad_norm": 0.3125, "optim/muon_lr": 0.01476, "optim/adamw_lr": 0.0004428, "perf/tokens_per_sec": 2025719.8344887407, "perf/iters_per_sec": 0.9659384891933158, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352626085281371, "data/tokens_consumed": 7740588032, "data/tokens_consumed_B": 7.740588032, "train/loss_slope": -0.00024871624243570596} {"step": 3700, "timestamp": 1778198529.6462584, "grad/layer_0/attn": 0.004545771516859531, "grad/layer_0/mlp": 0.008529603481292725, "grad/layer_0/attn_mlp_ratio": 0.5329405374511657, "grad/layer_4/attn": 0.0034517920576035976, "grad/layer_4/mlp": 0.007552676368504763, "grad/layer_4/attn_mlp_ratio": 0.4570289846252298, "grad/layer_8/attn": 0.013805086724460125, "grad/layer_8/mlp": 0.009059430100023746, "grad/layer_8/attn_mlp_ratio": 1.5238360934028654, "grad/layer_12/attn": 0.008168580010533333, "grad/layer_12/mlp": 0.007490938529372215, "grad/layer_12/attn_mlp_ratio": 1.0904614781522923, "grad/layer_16/attn": 0.008142060600221157, "grad/layer_16/mlp": 0.006409610155969858, "grad/layer_16/attn_mlp_ratio": 1.2702894989032616, "grad/layer_20/attn": 0.021816499531269073, "grad/layer_20/mlp": 0.01896406151354313, "grad/layer_20/attn_mlp_ratio": 1.1504128164026257, "grad/layer_24/attn": 0.00965151097625494, "grad/layer_24/mlp": 0.02007514238357544, "grad/layer_24/attn_mlp_ratio": 0.480769239080189, "grad/layer_27/attn": 0.009911701083183289, "grad/layer_27/mlp": 0.020258238539099693, "grad/layer_27/attn_mlp_ratio": 0.4892676633818042} {"step": 3700, "timestamp": 1778198529.6636164, "train/loss": 2.6545308113098143, "train/z_loss": 0.001782741560600698, "train/perplexity": 14.218313422238767, "train/grad_norm": 0.2041015625, "optim/muon_lr": 0.0148, "optim/adamw_lr": 0.00044399999999999995, "perf/tokens_per_sec": 2020183.6216810262, "perf/iters_per_sec": 0.9632986172108775, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0380996942520142, "data/tokens_consumed": 7761559552, "data/tokens_consumed_B": 7.761559552, "train/loss_slope": -0.00025073775229352463} {"step": 3710, "timestamp": 1778198540.0511532, "train/loss": 2.7292688369750975, "train/z_loss": 0.0017702304874546825, "train/perplexity": 15.321680277330392, "train/grad_norm": 0.25390625, "optim/muon_lr": 0.01484, "optim/adamw_lr": 0.0004452, "perf/tokens_per_sec": 2020076.2178473454, "perf/iters_per_sec": 0.963247403072045, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0381548881530762, "data/tokens_consumed": 7782531072, "data/tokens_consumed_B": 7.782531072, "train/loss_slope": -0.0002482777780789783} {"step": 3720, "timestamp": 1778198550.4356425, "train/loss": 2.6866171598434447, "train/z_loss": 0.0017704758094623685, "train/perplexity": 14.681925212356061, "train/grad_norm": 0.33984375, "optim/muon_lr": 0.01488, "optim/adamw_lr": 0.00044639999999999995, "perf/tokens_per_sec": 2020921.281814814, "perf/iters_per_sec": 0.9636503609727927, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0377207756042481, "data/tokens_consumed": 7803502592, "data/tokens_consumed_B": 7.803502592, "train/loss_slope": -0.00024555170546771175} {"step": 3730, "timestamp": 1778198560.8170905, "train/loss": 2.7149996519088746, "train/z_loss": 0.0017714345827698707, "train/perplexity": 15.104604814115598, "train/grad_norm": 0.333984375, "optim/muon_lr": 0.014920000000000001, "optim/adamw_lr": 0.0004476, "perf/tokens_per_sec": 2021275.8919137134, "perf/iters_per_sec": 0.963819452244622, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037538719177246, "data/tokens_consumed": 7824474112, "data/tokens_consumed_B": 7.824474112, "train/loss_slope": -0.00023967383885719815} {"step": 3740, "timestamp": 1778198571.1949933, "train/loss": 2.684251618385315, "train/z_loss": 0.0017767165787518024, "train/perplexity": 14.647235555667931, "train/grad_norm": 0.220703125, "optim/muon_lr": 0.014960000000000001, "optim/adamw_lr": 0.00044879999999999996, "perf/tokens_per_sec": 2021707.8985697166, "perf/iters_per_sec": 0.9640254490707953, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373170137405396, "data/tokens_consumed": 7845445632, "data/tokens_consumed_B": 7.845445632, "train/loss_slope": -0.00023661235878855987} {"step": 3750, "timestamp": 1778198581.568464, "grad/layer_0/attn": 0.0037887864746153355, "grad/layer_0/mlp": 0.0071599241346120834, "grad/layer_0/attn_mlp_ratio": 0.5291657216566353, "grad/layer_4/attn": 0.0030476185493171215, "grad/layer_4/mlp": 0.006907328497618437, "grad/layer_4/attn_mlp_ratio": 0.4412152261538424, "grad/layer_8/attn": 0.012677222490310669, "grad/layer_8/mlp": 0.00814181286841631, "grad/layer_8/attn_mlp_ratio": 1.5570515485295602, "grad/layer_12/attn": 0.006373976357281208, "grad/layer_12/mlp": 0.0067828684113919735, "grad/layer_12/attn_mlp_ratio": 0.9397169275176097, "grad/layer_16/attn": 0.009648116305470467, "grad/layer_16/mlp": 0.0066545093432068825, "grad/layer_16/attn_mlp_ratio": 1.44986138915462, "grad/layer_20/attn": 0.01653445139527321, "grad/layer_20/mlp": 0.01866469718515873, "grad/layer_20/attn_mlp_ratio": 0.8858676432122266, "grad/layer_24/attn": 0.015833653509616852, "grad/layer_24/mlp": 0.017115909606218338, "grad/layer_24/attn_mlp_ratio": 0.9250839588073058, "grad/layer_27/attn": 0.012432725168764591, "grad/layer_27/mlp": 0.015849469229578972, "grad/layer_27/attn_mlp_ratio": 0.7844253274500551} {"step": 3750, "timestamp": 1778198582.1748447, "eos/sharpness": 13.500785827636717, "eos/L0_probe": 2.5144495964050293, "eos/L_plus": 2.5842857360839844, "eos/L_minus": 2.5796213150024414, "eos/grad_norm": 0.18210488557815552, "eos/embed_grad_frac": 0.2219349890947342, "eos/time_s": 0.603593111038208} {"step": 3750, "timestamp": 1778198582.1944678, "train/loss": 2.708707332611084, "train/z_loss": 0.001771467598155141, "train/perplexity": 15.009860212006284, "train/grad_norm": 0.1826171875, "optim/muon_lr": 0.015, "optim/adamw_lr": 0.00045, "perf/tokens_per_sec": 1907703.6910765502, "perf/iters_per_sec": 0.9096640067465545, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0993069887161255, "data/tokens_consumed": 7866417152, "data/tokens_consumed_B": 7.866417152, "train/loss_slope": -0.00023614332918429787} {"step": 3750, "timestamp": 1778198583.5626442, "geo/rankme_last": 392.3088073730469, "geo/layer_0/stable_rank_q_proj": 38.53692626953125, "geo/layer_0/stable_rank_k_proj": 30.581588745117188, "geo/layer_0/stable_rank_o_proj": 61.355064392089844, "geo/layer_0/stable_rank_gate_proj": 170.7592010498047, "geo/layer_0/stable_rank_down_proj": 48.50099563598633, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.025459207594394684, "geo/layer_0/attn_entropy_mean": 6.686655044555664, "geo/layer_0/attn_entropy_std": 0.13483837246894836, "geo/layer_7/stable_rank_q_proj": 34.594486236572266, "geo/layer_7/stable_rank_k_proj": 34.30470275878906, "geo/layer_7/stable_rank_o_proj": 104.05241394042969, "geo/layer_7/stable_rank_gate_proj": 176.90802001953125, "geo/layer_7/stable_rank_down_proj": 213.64366149902344, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5784206390380859, "geo/layer_7/attn_entropy_mean": 5.359014511108398, "geo/layer_7/attn_entropy_std": 1.2535934448242188, "geo/layer_14/stable_rank_q_proj": 46.218902587890625, "geo/layer_14/stable_rank_k_proj": 32.04590606689453, "geo/layer_14/stable_rank_o_proj": 97.31354522705078, "geo/layer_14/stable_rank_gate_proj": 192.88015747070312, "geo/layer_14/stable_rank_down_proj": 157.35638427734375, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.46755659580230713, "geo/layer_14/attn_entropy_mean": 6.514923095703125, "geo/layer_14/attn_entropy_std": 0.3045603930950165, "geo/layer_21/stable_rank_q_proj": 46.90694046020508, "geo/layer_21/stable_rank_k_proj": 34.1180305480957, "geo/layer_21/stable_rank_o_proj": 95.70283508300781, "geo/layer_21/stable_rank_gate_proj": 139.70152282714844, "geo/layer_21/stable_rank_down_proj": 155.14268493652344, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.22673362493515015, "geo/layer_21/attn_entropy_mean": 5.907301902770996, "geo/layer_21/attn_entropy_std": 0.416713148355484, "geo/layer_27/stable_rank_q_proj": 53.21588134765625, "geo/layer_27/stable_rank_k_proj": 33.287662506103516, "geo/layer_27/stable_rank_o_proj": 99.96775817871094, "geo/layer_27/stable_rank_gate_proj": 88.18698120117188, "geo/layer_27/stable_rank_down_proj": 103.72176361083984, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08905337005853653, "geo/layer_27/attn_entropy_mean": 4.901127815246582, "geo/layer_27/attn_entropy_std": 0.3662967085838318, "attnres/final_alpha/block_0": 0.23061831295490265, "attnres/block_norm/0": 0.9396262764930725, "attnres/final_alpha/block_1": 0.01807108148932457, "attnres/block_norm/1": 4242.14404296875, "attnres/final_alpha/block_2": 0.04406898096203804, "attnres/block_norm/2": 2806.5146484375, "attnres/final_alpha/block_3": 0.030111605301499367, "attnres/block_norm/3": 2644.2099609375, "attnres/final_alpha/block_4": 0.04787159711122513, "attnres/block_norm/4": 1878.781005859375, "attnres/final_alpha/block_5": 0.3556526303291321, "attnres/block_norm/5": 2246.984375, "attnres/final_alpha/block_6": 0.2736057639122009, "attnres/block_norm/6": 2413.527099609375, "geo/tier1_time_s": 1.3642914295196533, "geo/step": 3750.0, "geo/rankme_slope": 0.08501255055977469} {"step": 3760, "timestamp": 1778198593.9402764, "train/loss": 2.704982852935791, "train/z_loss": 0.0017761646653525532, "train/perplexity": 14.954060270095994, "train/grad_norm": 0.337890625, "optim/muon_lr": 0.01504, "optim/adamw_lr": 0.00045119999999999996, "perf/tokens_per_sec": 1786051.6007121226, "perf/iters_per_sec": 0.8516557696877111, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1741833209991455, "data/tokens_consumed": 7887388672, "data/tokens_consumed_B": 7.887388672, "train/loss_slope": -0.00023064658276282957} {"step": 3770, "timestamp": 1778198604.3155255, "train/loss": 2.710889744758606, "train/z_loss": 0.001774423266761005, "train/perplexity": 15.042653684686075, "train/grad_norm": 0.2158203125, "optim/muon_lr": 0.01508, "optim/adamw_lr": 0.00045239999999999994, "perf/tokens_per_sec": 2022242.5992320112, "perf/iters_per_sec": 0.9642804142150933, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0370427370071411, "data/tokens_consumed": 7908360192, "data/tokens_consumed_B": 7.908360192, "train/loss_slope": -0.00022710325370992304} {"step": 3780, "timestamp": 1778198614.6873338, "train/loss": 2.7203645944595336, "train/z_loss": 0.0017753560212440788, "train/perplexity": 15.18585791541441, "train/grad_norm": 0.265625, "optim/muon_lr": 0.01512, "optim/adamw_lr": 0.00045359999999999997, "perf/tokens_per_sec": 2023005.2133479766, "perf/iters_per_sec": 0.9646440569629557, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0366518020629882, "data/tokens_consumed": 7929331712, "data/tokens_consumed_B": 7.929331712, "train/loss_slope": -0.00022800120592045785} {"step": 3790, "timestamp": 1778198625.0610507, "train/loss": 2.6709174633026125, "train/z_loss": 0.001781988178845495, "train/perplexity": 14.453223413829114, "train/grad_norm": 0.185546875, "optim/muon_lr": 0.01516, "optim/adamw_lr": 0.00045479999999999994, "perf/tokens_per_sec": 2022585.906728136, "perf/iters_per_sec": 0.9644441159859352, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0368667125701905, "data/tokens_consumed": 7950303232, "data/tokens_consumed_B": 7.950303232, "train/loss_slope": -0.00022955493579352904} {"step": 3800, "timestamp": 1778198635.4257836, "grad/layer_0/attn": 0.003951975144445896, "grad/layer_0/mlp": 0.007088636979460716, "grad/layer_0/attn_mlp_ratio": 0.5575084603917333, "grad/layer_4/attn": 0.0034922086633741856, "grad/layer_4/mlp": 0.00664816377684474, "grad/layer_4/attn_mlp_ratio": 0.5252891968468758, "grad/layer_8/attn": 0.010869454592466354, "grad/layer_8/mlp": 0.00828487053513527, "grad/layer_8/attn_mlp_ratio": 1.3119643107485752, "grad/layer_12/attn": 0.007382058072835207, "grad/layer_12/mlp": 0.006806171499192715, "grad/layer_12/attn_mlp_ratio": 1.0846123940969692, "grad/layer_16/attn": 0.007705152500420809, "grad/layer_16/mlp": 0.006670892704278231, "grad/layer_16/attn_mlp_ratio": 1.1550406709397696, "grad/layer_20/attn": 0.02214626409113407, "grad/layer_20/mlp": 0.01695224642753601, "grad/layer_20/attn_mlp_ratio": 1.3063911060496478, "grad/layer_24/attn": 0.014064548537135124, "grad/layer_24/mlp": 0.015722524374723434, "grad/layer_24/attn_mlp_ratio": 0.894547727354231, "grad/layer_27/attn": 0.009862188249826431, "grad/layer_27/mlp": 0.014964167028665543, "grad/layer_27/attn_mlp_ratio": 0.6590536021837328} {"step": 3800, "timestamp": 1778198635.4423833, "train/loss": 2.69957640171051, "train/z_loss": 0.0017806834657676518, "train/perplexity": 14.873430030753765, "train/grad_norm": 0.171875, "optim/muon_lr": 0.0152, "optim/adamw_lr": 0.00045599999999999997, "perf/tokens_per_sec": 2021222.4788188648, "perf/iters_per_sec": 0.9637939828962635, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0375661373138427, "data/tokens_consumed": 7971274752, "data/tokens_consumed_B": 7.971274752, "train/loss_slope": -0.00023148012331502778} {"step": 3810, "timestamp": 1778198645.825685, "train/loss": 2.629124474525452, "train/z_loss": 0.0017922647763043641, "train/perplexity": 13.861628379034542, "train/grad_norm": 0.3359375, "optim/muon_lr": 0.01524, "optim/adamw_lr": 0.00045719999999999995, "perf/tokens_per_sec": 2020912.738528363, "perf/iters_per_sec": 0.9636462872163596, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0377251625061035, "data/tokens_consumed": 7992246272, "data/tokens_consumed_B": 7.992246272, "train/loss_slope": -0.0002322207316814369} {"step": 3820, "timestamp": 1778198656.1961753, "train/loss": 2.6472443342208862, "train/z_loss": 0.001791861024685204, "train/perplexity": 14.115088536833019, "train/grad_norm": 0.2099609375, "optim/muon_lr": 0.01528, "optim/adamw_lr": 0.0004584, "perf/tokens_per_sec": 2023205.8572726627, "perf/iters_per_sec": 0.9647397314418138, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0365489959716796, "data/tokens_consumed": 8013217792, "data/tokens_consumed_B": 8.013217792, "train/loss_slope": -0.00023369679854433353} {"step": 3825, "timestamp": 1778198661.9813414, "eos/sharpness": 44.764924049377434, "eos/L0_probe": 2.504718780517578, "eos/L_plus": 2.8416907787323, "eos/L_minus": 2.615396022796631, "eos/grad_norm": 0.32326844334602356, "eos/embed_grad_frac": 0.053895656019449234, "eos/time_s": 0.6017076969146729} {"step": 3825, "timestamp": 1778198663.3556914, "geo/rankme_last": 393.41778564453125, "geo/layer_0/stable_rank_q_proj": 37.4247932434082, "geo/layer_0/stable_rank_k_proj": 29.23746681213379, "geo/layer_0/stable_rank_o_proj": 61.559059143066406, "geo/layer_0/stable_rank_gate_proj": 170.01510620117188, "geo/layer_0/stable_rank_down_proj": 48.28466033935547, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.03026575595140457, "geo/layer_0/attn_entropy_mean": 6.676509857177734, "geo/layer_0/attn_entropy_std": 0.144694522023201, "geo/layer_7/stable_rank_q_proj": 35.06022644042969, "geo/layer_7/stable_rank_k_proj": 34.62836456298828, "geo/layer_7/stable_rank_o_proj": 104.20215606689453, "geo/layer_7/stable_rank_gate_proj": 175.6628875732422, "geo/layer_7/stable_rank_down_proj": 214.3642578125, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5805994868278503, "geo/layer_7/attn_entropy_mean": 5.336469650268555, "geo/layer_7/attn_entropy_std": 1.2440131902694702, "geo/layer_14/stable_rank_q_proj": 47.094730377197266, "geo/layer_14/stable_rank_k_proj": 32.8337516784668, "geo/layer_14/stable_rank_o_proj": 97.73348236083984, "geo/layer_14/stable_rank_gate_proj": 192.326416015625, "geo/layer_14/stable_rank_down_proj": 156.65341186523438, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.47012174129486084, "geo/layer_14/attn_entropy_mean": 6.51341438293457, "geo/layer_14/attn_entropy_std": 0.2748638987541199, "geo/layer_21/stable_rank_q_proj": 47.07247543334961, "geo/layer_21/stable_rank_k_proj": 33.86711120605469, "geo/layer_21/stable_rank_o_proj": 97.06110382080078, "geo/layer_21/stable_rank_gate_proj": 139.42001342773438, "geo/layer_21/stable_rank_down_proj": 152.87454223632812, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.21808357536792755, "geo/layer_21/attn_entropy_mean": 5.917120933532715, "geo/layer_21/attn_entropy_std": 0.3683573007583618, "geo/layer_27/stable_rank_q_proj": 52.56947708129883, "geo/layer_27/stable_rank_k_proj": 33.83217239379883, "geo/layer_27/stable_rank_o_proj": 99.8839340209961, "geo/layer_27/stable_rank_gate_proj": 87.72624969482422, "geo/layer_27/stable_rank_down_proj": 105.6689224243164, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08841311186552048, "geo/layer_27/attn_entropy_mean": 4.8403472900390625, "geo/layer_27/attn_entropy_std": 0.37689855694770813, "attnres/final_alpha/block_0": 0.23029561340808868, "attnres/block_norm/0": 0.9496557116508484, "attnres/final_alpha/block_1": 0.017755817621946335, "attnres/block_norm/1": 4352.42529296875, "attnres/final_alpha/block_2": 0.04369035363197327, "attnres/block_norm/2": 2891.72265625, "attnres/final_alpha/block_3": 0.030335906893014908, "attnres/block_norm/3": 2726.403076171875, "attnres/final_alpha/block_4": 0.04759952798485756, "attnres/block_norm/4": 1923.3651123046875, "attnres/final_alpha/block_5": 0.3624154031276703, "attnres/block_norm/5": 2294.830322265625, "attnres/final_alpha/block_6": 0.26790738105773926, "attnres/block_norm/6": 2494.92919921875, "geo/tier1_time_s": 1.354966640472412, "geo/step": 3825.0, "geo/rankme_slope": 0.0792032100779765} {"step": 3830, "timestamp": 1778198668.5473092, "train/loss": 2.649277091026306, "train/z_loss": 0.0017949719564057886, "train/perplexity": 14.143810261365788, "train/grad_norm": 0.2080078125, "optim/muon_lr": 0.01532, "optim/adamw_lr": 0.00045959999999999995, "perf/tokens_per_sec": 1698778.6096771997, "perf/iters_per_sec": 0.8100407646547316, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2345057725906372, "data/tokens_consumed": 8034189312, "data/tokens_consumed_B": 8.034189312, "train/loss_slope": -0.00023215911804002362} {"step": 3840, "timestamp": 1778198678.9215693, "train/loss": 2.6911988496780395, "train/z_loss": 0.001778352668043226, "train/perplexity": 14.749347576089582, "train/grad_norm": 0.2490234375, "optim/muon_lr": 0.01536, "optim/adamw_lr": 0.0004608, "perf/tokens_per_sec": 2022546.329515714, "perf/iters_per_sec": 0.9644252441004343, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036887001991272, "data/tokens_consumed": 8055160832, "data/tokens_consumed_B": 8.055160832, "train/loss_slope": -0.00022868794216038122} {"step": 3850, "timestamp": 1778198689.2839336, "grad/layer_0/attn": 0.003950972575694323, "grad/layer_0/mlp": 0.007243114523589611, "grad/layer_0/attn_mlp_ratio": 0.5454797805942028, "grad/layer_4/attn": 0.0027954508550465107, "grad/layer_4/mlp": 0.006856218911707401, "grad/layer_4/attn_mlp_ratio": 0.40772484809384185, "grad/layer_8/attn": 0.010705494321882725, "grad/layer_8/mlp": 0.008333677425980568, "grad/layer_8/attn_mlp_ratio": 1.2846062603823971, "grad/layer_12/attn": 0.00730191869661212, "grad/layer_12/mlp": 0.007005464285612106, "grad/layer_12/attn_mlp_ratio": 1.04231758163083, "grad/layer_16/attn": 0.007333708927035332, "grad/layer_16/mlp": 0.0064538028091192245, "grad/layer_16/attn_mlp_ratio": 1.1363391523271962, "grad/layer_20/attn": 0.0220914538949728, "grad/layer_20/mlp": 0.017975453287363052, "grad/layer_20/attn_mlp_ratio": 1.2289789536270246, "grad/layer_24/attn": 0.015714256092905998, "grad/layer_24/mlp": 0.017707480117678642, "grad/layer_24/attn_mlp_ratio": 0.8874360383143239, "grad/layer_27/attn": 0.007304770406335592, "grad/layer_27/mlp": 0.018383679911494255, "grad/layer_27/attn_mlp_ratio": 0.39735082430548946} {"step": 3850, "timestamp": 1778198689.3003635, "train/loss": 2.6447376012802124, "train/z_loss": 0.001792825770098716, "train/perplexity": 14.079750089966339, "train/grad_norm": 0.2353515625, "optim/muon_lr": 0.0154, "optim/adamw_lr": 0.00046199999999999995, "perf/tokens_per_sec": 2022119.4033725772, "perf/iters_per_sec": 0.9642216698515783, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037105917930603, "data/tokens_consumed": 8076132352, "data/tokens_consumed_B": 8.076132352, "train/loss_slope": -0.00022731597245436976} {"step": 3860, "timestamp": 1778198699.6714947, "train/loss": 2.6805979013442993, "train/z_loss": 0.0017893102602101862, "train/perplexity": 14.593816350272236, "train/grad_norm": 0.25390625, "optim/muon_lr": 0.01544, "optim/adamw_lr": 0.0004632, "perf/tokens_per_sec": 2023074.4942862894, "perf/iters_per_sec": 0.9646770926886985, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0366163015365601, "data/tokens_consumed": 8097103872, "data/tokens_consumed_B": 8.097103872, "train/loss_slope": -0.00022571179518426392} {"step": 3870, "timestamp": 1778198710.0453103, "train/loss": 2.7109378814697265, "train/z_loss": 0.0017818572116084397, "train/perplexity": 15.043377805989252, "train/grad_norm": 0.25, "optim/muon_lr": 0.01548, "optim/adamw_lr": 0.00046439999999999996, "perf/tokens_per_sec": 2022671.1587936664, "perf/iters_per_sec": 0.9644847673385937, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0368230104446412, "data/tokens_consumed": 8118075392, "data/tokens_consumed_B": 8.118075392, "train/loss_slope": -0.00022272958384953641} {"step": 3880, "timestamp": 1778198720.4177842, "train/loss": 2.681528115272522, "train/z_loss": 0.001785913249477744, "train/perplexity": 14.60739803746513, "train/grad_norm": 0.169921875, "optim/muon_lr": 0.01552, "optim/adamw_lr": 0.0004656, "perf/tokens_per_sec": 2022674.13553682, "perf/iters_per_sec": 0.9644861867603397, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0368214845657349, "data/tokens_consumed": 8139046912, "data/tokens_consumed_B": 8.139046912, "train/loss_slope": -0.00022313250747129005} {"step": 3890, "timestamp": 1778198730.7941263, "train/loss": 2.659555220603943, "train/z_loss": 0.0017949469387531281, "train/perplexity": 14.289931817745023, "train/grad_norm": 0.265625, "optim/muon_lr": 0.015560000000000001, "optim/adamw_lr": 0.00046679999999999996, "perf/tokens_per_sec": 2022422.4921462964, "perf/iters_per_sec": 0.9643661938411219, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0369504928588866, "data/tokens_consumed": 8160018432, "data/tokens_consumed_B": 8.160018432, "train/loss_slope": -0.00022390084565669397} {"step": 3900, "timestamp": 1778198741.1778188, "grad/layer_0/attn": 0.0049975416623055935, "grad/layer_0/mlp": 0.008079511113464832, "grad/layer_0/attn_mlp_ratio": 0.6185450493560785, "grad/layer_4/attn": 0.0033175561111420393, "grad/layer_4/mlp": 0.006430183071643114, "grad/layer_4/attn_mlp_ratio": 0.5159349310253474, "grad/layer_8/attn": 0.019175712019205093, "grad/layer_8/mlp": 0.007972503080964088, "grad/layer_8/attn_mlp_ratio": 2.4052310276891276, "grad/layer_12/attn": 0.009926553815603256, "grad/layer_12/mlp": 0.006719596218317747, "grad/layer_12/attn_mlp_ratio": 1.4772544875267704, "grad/layer_16/attn": 0.00800139456987381, "grad/layer_16/mlp": 0.006191687658429146, "grad/layer_16/attn_mlp_ratio": 1.292280050618023, "grad/layer_20/attn": 0.025115078315138817, "grad/layer_20/mlp": 0.017993208020925522, "grad/layer_20/attn_mlp_ratio": 1.3958088044305332, "grad/layer_24/attn": 0.041741497814655304, "grad/layer_24/mlp": 0.02461315505206585, "grad/layer_24/attn_mlp_ratio": 1.6959019498624428, "grad/layer_27/attn": 0.0191927682608366, "grad/layer_27/mlp": 0.02901553176343441, "grad/layer_27/attn_mlp_ratio": 0.6614653266109338} {"step": 3900, "timestamp": 1778198741.7933667, "eos/sharpness": 76.58386230468749, "eos/L0_probe": 2.491468906402588, "eos/L_plus": 2.6651995182037354, "eos/L_minus": 3.0835769176483154, "eos/grad_norm": 0.4819833040237427, "eos/embed_grad_frac": 0.023487431928515434, "eos/time_s": 0.6125974655151367} {"step": 3900, "timestamp": 1778198741.8143563, "train/loss": 2.679494857788086, "train/z_loss": 0.001791046478319913, "train/perplexity": 14.577727610108361, "train/grad_norm": 0.482421875, "optim/muon_lr": 0.015600000000000001, "optim/adamw_lr": 0.000468, "perf/tokens_per_sec": 1905327.4593595634, "perf/iters_per_sec": 0.9085309311673944, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.100677990913391, "data/tokens_consumed": 8180989952, "data/tokens_consumed_B": 8.180989952, "train/loss_slope": -0.00022045978983350605} {"step": 3900, "timestamp": 1778198743.1836498, "geo/rankme_last": 398.23394775390625, "geo/layer_0/stable_rank_q_proj": 36.47226333618164, "geo/layer_0/stable_rank_k_proj": 28.071069717407227, "geo/layer_0/stable_rank_o_proj": 61.264869689941406, "geo/layer_0/stable_rank_gate_proj": 171.12152099609375, "geo/layer_0/stable_rank_down_proj": 48.17106628417969, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0385952927172184, "geo/layer_0/attn_entropy_mean": 6.672294616699219, "geo/layer_0/attn_entropy_std": 0.14963233470916748, "geo/layer_7/stable_rank_q_proj": 35.58510971069336, "geo/layer_7/stable_rank_k_proj": 34.9193229675293, "geo/layer_7/stable_rank_o_proj": 103.67127227783203, "geo/layer_7/stable_rank_gate_proj": 175.58226013183594, "geo/layer_7/stable_rank_down_proj": 216.1477813720703, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5807577967643738, "geo/layer_7/attn_entropy_mean": 5.30214786529541, "geo/layer_7/attn_entropy_std": 1.2350624799728394, "geo/layer_14/stable_rank_q_proj": 48.07120895385742, "geo/layer_14/stable_rank_k_proj": 33.554691314697266, "geo/layer_14/stable_rank_o_proj": 97.46784210205078, "geo/layer_14/stable_rank_gate_proj": 192.59364318847656, "geo/layer_14/stable_rank_down_proj": 156.7537841796875, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.46285712718963623, "geo/layer_14/attn_entropy_mean": 6.499147415161133, "geo/layer_14/attn_entropy_std": 0.33619940280914307, "geo/layer_21/stable_rank_q_proj": 47.16446304321289, "geo/layer_21/stable_rank_k_proj": 33.713600158691406, "geo/layer_21/stable_rank_o_proj": 97.76067352294922, "geo/layer_21/stable_rank_gate_proj": 139.0287628173828, "geo/layer_21/stable_rank_down_proj": 150.3363800048828, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.213816836476326, "geo/layer_21/attn_entropy_mean": 5.869710922241211, "geo/layer_21/attn_entropy_std": 0.39609211683273315, "geo/layer_27/stable_rank_q_proj": 51.83778762817383, "geo/layer_27/stable_rank_k_proj": 34.3963508605957, "geo/layer_27/stable_rank_o_proj": 99.49618530273438, "geo/layer_27/stable_rank_gate_proj": 87.40018463134766, "geo/layer_27/stable_rank_down_proj": 108.36148071289062, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08375448733568192, "geo/layer_27/attn_entropy_mean": 4.871406078338623, "geo/layer_27/attn_entropy_std": 0.389047771692276, "attnres/final_alpha/block_0": 0.2324783205986023, "attnres/block_norm/0": 0.9596295952796936, "attnres/final_alpha/block_1": 0.01775851473212242, "attnres/block_norm/1": 4456.89990234375, "attnres/final_alpha/block_2": 0.04308560490608215, "attnres/block_norm/2": 2952.896728515625, "attnres/final_alpha/block_3": 0.030052337795495987, "attnres/block_norm/3": 2798.353759765625, "attnres/final_alpha/block_4": 0.048555128276348114, "attnres/block_norm/4": 1960.0439453125, "attnres/final_alpha/block_5": 0.34971603751182556, "attnres/block_norm/5": 2329.6318359375, "attnres/final_alpha/block_6": 0.27835404872894287, "attnres/block_norm/6": 2521.32861328125, "geo/tier1_time_s": 1.3652758598327637, "geo/step": 3900.0, "geo/rankme_slope": 0.07303253938294067} {"step": 3910, "timestamp": 1778198753.558971, "train/loss": 2.6770521879196165, "train/z_loss": 0.0017867880873382092, "train/perplexity": 14.542162488733247, "train/grad_norm": 0.28125, "optim/muon_lr": 0.01564, "optim/adamw_lr": 0.00046919999999999997, "perf/tokens_per_sec": 1786216.4806107804, "perf/iters_per_sec": 0.8517343905500319, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.174074935913086, "data/tokens_consumed": 8201961472, "data/tokens_consumed_B": 8.201961472, "train/loss_slope": -0.00022106174349677556} {"step": 3920, "timestamp": 1778198763.9382408, "train/loss": 2.6872340202331544, "train/z_loss": 0.0017864308785647154, "train/perplexity": 14.690984704397865, "train/grad_norm": 0.2138671875, "optim/muon_lr": 0.01568, "optim/adamw_lr": 0.0004704, "perf/tokens_per_sec": 2021556.9377367508, "perf/iters_per_sec": 0.963953465336204, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373944759368896, "data/tokens_consumed": 8222932992, "data/tokens_consumed_B": 8.222932992, "train/loss_slope": -0.00021861776203999035} {"step": 3930, "timestamp": 1778198774.316778, "train/loss": 2.6973769426345826, "train/z_loss": 0.0017902954481542111, "train/perplexity": 14.84075247972401, "train/grad_norm": 0.2353515625, "optim/muon_lr": 0.01572, "optim/adamw_lr": 0.00047159999999999997, "perf/tokens_per_sec": 2021765.798425508, "perf/iters_per_sec": 0.9640530578734913, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0372873067855835, "data/tokens_consumed": 8243904512, "data/tokens_consumed_B": 8.243904512, "train/loss_slope": -0.00021897076168397937} {"step": 3940, "timestamp": 1778198784.6942685, "train/loss": 2.60322163105011, "train/z_loss": 0.0018098026630468667, "train/perplexity": 13.507183176037385, "train/grad_norm": 0.1748046875, "optim/muon_lr": 0.01576, "optim/adamw_lr": 0.0004728, "perf/tokens_per_sec": 2022187.879821882, "perf/iters_per_sec": 0.9642543219670686, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0370707988739014, "data/tokens_consumed": 8264876032, "data/tokens_consumed_B": 8.264876032, "train/loss_slope": -0.00021923529716214724} {"step": 3950, "timestamp": 1778198795.0561886, "grad/layer_0/attn": 0.0037950067780911922, "grad/layer_0/mlp": 0.00693289190530777, "grad/layer_0/attn_mlp_ratio": 0.5473915900010795, "grad/layer_4/attn": 0.003448493778705597, "grad/layer_4/mlp": 0.006600537337362766, "grad/layer_4/attn_mlp_ratio": 0.522456513796161, "grad/layer_8/attn": 0.011683138087391853, "grad/layer_8/mlp": 0.007954616099596024, "grad/layer_8/attn_mlp_ratio": 1.4687242972181591, "grad/layer_12/attn": 0.0063394890166819096, "grad/layer_12/mlp": 0.006742590107023716, "grad/layer_12/attn_mlp_ratio": 0.9402156770669678, "grad/layer_16/attn": 0.009252321906387806, "grad/layer_16/mlp": 0.00686495378613472, "grad/layer_16/attn_mlp_ratio": 1.3477616980173603, "grad/layer_20/attn": 0.02577199600636959, "grad/layer_20/mlp": 0.016725344583392143, "grad/layer_20/attn_mlp_ratio": 1.5408947614670416, "grad/layer_24/attn": 0.011039620265364647, "grad/layer_24/mlp": 0.016263823956251144, "grad/layer_24/attn_mlp_ratio": 0.6787837981511777, "grad/layer_27/attn": 0.0078077795915305614, "grad/layer_27/mlp": 0.0174142736941576, "grad/layer_27/attn_mlp_ratio": 0.4483551644944293} {"step": 3950, "timestamp": 1778198795.0727966, "train/loss": 2.63736207485199, "train/z_loss": 0.0018002611119300126, "train/perplexity": 13.976286539172103, "train/grad_norm": 0.2333984375, "optim/muon_lr": 0.0158, "optim/adamw_lr": 0.000474, "perf/tokens_per_sec": 2021743.5395813251, "perf/iters_per_sec": 0.9640424440294862, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0372987270355225, "data/tokens_consumed": 8285847552, "data/tokens_consumed_B": 8.285847552, "train/loss_slope": -0.00021951064152149618} {"step": 3960, "timestamp": 1778198805.4478793, "train/loss": 2.680654191970825, "train/z_loss": 0.0017945939907804132, "train/perplexity": 14.594637868459666, "train/grad_norm": 0.21875, "optim/muon_lr": 0.01584, "optim/adamw_lr": 0.0004752, "perf/tokens_per_sec": 2022478.0613019126, "perf/iters_per_sec": 0.9643926912793697, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036922001838684, "data/tokens_consumed": 8306819072, "data/tokens_consumed_B": 8.306819072, "train/loss_slope": -0.0002186368150965716} {"step": 3970, "timestamp": 1778198815.8229918, "train/loss": 2.6418267726898192, "train/z_loss": 0.0018067044205963612, "train/perplexity": 14.03882594134539, "train/grad_norm": 0.306640625, "optim/muon_lr": 0.015880000000000002, "optim/adamw_lr": 0.0004764, "perf/tokens_per_sec": 2022552.0962516754, "perf/iters_per_sec": 0.9644279938944222, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0368840456008912, "data/tokens_consumed": 8327790592, "data/tokens_consumed_B": 8.327790592, "train/loss_slope": -0.00021732149347327613} {"step": 3975, "timestamp": 1778198821.6067522, "eos/sharpness": 30.824351310729973, "eos/L0_probe": 2.4820995330810547, "eos/L_plus": 2.7087321281433105, "eos/L_minus": 2.5637104511260986, "eos/grad_norm": 0.2649548053741455, "eos/embed_grad_frac": 0.15739142894744873, "eos/time_s": 0.6078228950500488} {"step": 3975, "timestamp": 1778198822.985143, "geo/rankme_last": 395.8310852050781, "geo/layer_0/stable_rank_q_proj": 35.426753997802734, "geo/layer_0/stable_rank_k_proj": 26.910907745361328, "geo/layer_0/stable_rank_o_proj": 61.06259536743164, "geo/layer_0/stable_rank_gate_proj": 169.58004760742188, "geo/layer_0/stable_rank_down_proj": 48.10475540161133, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.035940881818532944, "geo/layer_0/attn_entropy_mean": 6.664518356323242, "geo/layer_0/attn_entropy_std": 0.1567896604537964, "geo/layer_7/stable_rank_q_proj": 36.23194885253906, "geo/layer_7/stable_rank_k_proj": 35.31101989746094, "geo/layer_7/stable_rank_o_proj": 103.31275939941406, "geo/layer_7/stable_rank_gate_proj": 175.12242126464844, "geo/layer_7/stable_rank_down_proj": 215.6182098388672, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5906977653503418, "geo/layer_7/attn_entropy_mean": 5.275185585021973, "geo/layer_7/attn_entropy_std": 1.2187713384628296, "geo/layer_14/stable_rank_q_proj": 49.10186004638672, "geo/layer_14/stable_rank_k_proj": 34.1438102722168, "geo/layer_14/stable_rank_o_proj": 97.88483428955078, "geo/layer_14/stable_rank_gate_proj": 191.74514770507812, "geo/layer_14/stable_rank_down_proj": 156.5245819091797, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4586797058582306, "geo/layer_14/attn_entropy_mean": 6.419702529907227, "geo/layer_14/attn_entropy_std": 0.3591216802597046, "geo/layer_21/stable_rank_q_proj": 47.263126373291016, "geo/layer_21/stable_rank_k_proj": 33.53164291381836, "geo/layer_21/stable_rank_o_proj": 98.5987319946289, "geo/layer_21/stable_rank_gate_proj": 139.23326110839844, "geo/layer_21/stable_rank_down_proj": 147.96365356445312, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.21079730987548828, "geo/layer_21/attn_entropy_mean": 5.881190776824951, "geo/layer_21/attn_entropy_std": 0.36464667320251465, "geo/layer_27/stable_rank_q_proj": 50.96986389160156, "geo/layer_27/stable_rank_k_proj": 34.79014205932617, "geo/layer_27/stable_rank_o_proj": 100.13348388671875, "geo/layer_27/stable_rank_gate_proj": 87.2752914428711, "geo/layer_27/stable_rank_down_proj": 110.55992126464844, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09016730636358261, "geo/layer_27/attn_entropy_mean": 4.827971458435059, "geo/layer_27/attn_entropy_std": 0.43489235639572144, "attnres/final_alpha/block_0": 0.2325536012649536, "attnres/block_norm/0": 0.9698001742362976, "attnres/final_alpha/block_1": 0.01770957186818123, "attnres/block_norm/1": 4594.716796875, "attnres/final_alpha/block_2": 0.04264523833990097, "attnres/block_norm/2": 3073.03466796875, "attnres/final_alpha/block_3": 0.029605770483613014, "attnres/block_norm/3": 2923.884033203125, "attnres/final_alpha/block_4": 0.047652870416641235, "attnres/block_norm/4": 2006.9752197265625, "attnres/final_alpha/block_5": 0.36237984895706177, "attnres/block_norm/5": 2392.65185546875, "attnres/final_alpha/block_6": 0.267453134059906, "attnres/block_norm/6": 2642.80517578125, "geo/tier1_time_s": 1.3569371700286865, "geo/step": 3975.0, "geo/rankme_slope": 0.06636394649656738} {"step": 3980, "timestamp": 1778198828.189758, "train/loss": 2.65862455368042, "train/z_loss": 0.0018006780999712646, "train/perplexity": 14.276638837490834, "train/grad_norm": 0.337890625, "optim/muon_lr": 0.01592, "optim/adamw_lr": 0.0004776, "perf/tokens_per_sec": 1696573.346194185, "perf/iters_per_sec": 0.8089892130824018, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2361104249954225, "data/tokens_consumed": 8348762112, "data/tokens_consumed_B": 8.348762112, "train/loss_slope": -0.00021406349836796423} {"step": 3990, "timestamp": 1778198838.550221, "train/loss": 2.662833070755005, "train/z_loss": 0.0017993443529121579, "train/perplexity": 14.336848924527644, "train/grad_norm": 0.365234375, "optim/muon_lr": 0.015960000000000002, "optim/adamw_lr": 0.0004788, "perf/tokens_per_sec": 2025887.6084652003, "perf/iters_per_sec": 0.9660184900594713, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351768732070923, "data/tokens_consumed": 8369733632, "data/tokens_consumed_B": 8.369733632, "train/loss_slope": -0.00021584898842514865} {"step": 4000, "timestamp": 1778198848.9048226, "grad/layer_0/attn": 0.003981611225754023, "grad/layer_0/mlp": 0.006820539943873882, "grad/layer_0/attn_mlp_ratio": 0.5837677368862093, "grad/layer_4/attn": 0.0027971393428742886, "grad/layer_4/mlp": 0.006327805109322071, "grad/layer_4/attn_mlp_ratio": 0.4420394197270115, "grad/layer_8/attn": 0.013059426099061966, "grad/layer_8/mlp": 0.007954957894980907, "grad/layer_8/attn_mlp_ratio": 1.64167128315469, "grad/layer_12/attn": 0.007643390446901321, "grad/layer_12/mlp": 0.007275102660059929, "grad/layer_12/attn_mlp_ratio": 1.0506230219679205, "grad/layer_16/attn": 0.009262022562325, "grad/layer_16/mlp": 0.007447161711752415, "grad/layer_16/attn_mlp_ratio": 1.2436983103695347, "grad/layer_20/attn": 0.023239849135279655, "grad/layer_20/mlp": 0.017287684604525566, "grad/layer_20/attn_mlp_ratio": 1.344300843779036, "grad/layer_24/attn": 0.03093045763671398, "grad/layer_24/mlp": 0.022834865376353264, "grad/layer_24/attn_mlp_ratio": 1.3545276922583205, "grad/layer_27/attn": 0.0235182773321867, "grad/layer_27/mlp": 0.027048539370298386, "grad/layer_27/attn_mlp_ratio": 0.8694841863092749} {"step": 4000, "timestamp": 1778198848.9193308, "train/loss": 2.6218302965164186, "train/z_loss": 0.0018169962102547288, "train/perplexity": 13.760887053385604, "train/grad_norm": 0.408203125, "optim/muon_lr": 0.016, "optim/adamw_lr": 0.00047999999999999996, "perf/tokens_per_sec": 2024240.8382514508, "perf/iters_per_sec": 0.9652332488305334, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0360190153121949, "data/tokens_consumed": 8390705152, "data/tokens_consumed_B": 8.390705152, "train/loss_slope": -0.00021675453608078243} {"step": 4000, "timestamp": 1778198856.007187, "geo/ww_alpha_mean": 8.654620974574136, "geo/ww_alpha_std": 5.432731968914562, "geo/ww_alpha_min": 2.6696315642159476, "geo/ww_alpha_max": 36.095981041532895, "geo/ww_alpha_healthy_frac": 0.116751269035533, "geo/ww_alpha_by_type/q_proj": 5.107708763080152, "geo/ww_alpha_by_type/k_proj": 5.331398701980108, "geo/ww_alpha_by_type/v_proj": 6.534985776131494, "geo/ww_alpha_by_type/o_proj": 7.574384443162367, "geo/ww_alpha_by_type/gate_proj": 13.34410645793177, "geo/ww_alpha_by_type/up_proj": 12.68189646060336, "geo/ww_alpha_by_type/down_proj": 10.214037619065651, "geo/twonn_id/layer_0": 0.6485440731048584, "geo/twonn_id/layer_7": 2.560999631881714, "geo/twonn_id/layer_14": 3.105855941772461, "geo/twonn_id/layer_21": 6.512927055358887, "geo/twonn_id/layer_27": 5.867128849029541, "geo/tier2_time_s": 7.077374458312988} {"step": 4000, "timestamp": 1778198856.8285978, "eoc/jacobian_sigma/layer_0/attn": 575.4388427734375, "eoc/jacobian_sigma/layer_0/mlp": 664.1405639648438, "eoc/jacobian_sigma/layer_0": 664.1405639648438, "eoc/jacobian_sigma/layer_7/attn": 1.1830765008926392, "eoc/jacobian_sigma/layer_7/mlp": 1.562404990196228, "eoc/jacobian_sigma/layer_7": 1.562404990196228, "eoc/jacobian_sigma/layer_14/attn": 1.2089805603027344, "eoc/jacobian_sigma/layer_14/mlp": 4.445191860198975, "eoc/jacobian_sigma/layer_14": 4.445191860198975, "eoc/jacobian_sigma/layer_21/attn": 1.095291256904602, "eoc/jacobian_sigma/layer_21/mlp": 2.7452266216278076, "eoc/jacobian_sigma/layer_21": 2.7452266216278076, "eoc/jacobian_sigma/layer_27/attn": 1.4565013647079468, "eoc/jacobian_sigma/layer_27/mlp": 2.0861222743988037, "eoc/jacobian_sigma/layer_27": 2.0861222743988037, "eoc/layer0_sigma": 664.1405639648438, "eoc/sigma_max": 4.445191860198975, "eoc/sigma_min": 1.562404990196228, "eoc/sigma_mean": 2.7097364366054535, "eoc/time_s": 0.8157014846801758} {"step": 4010, "timestamp": 1778198867.2398143, "train/loss": 2.700570511817932, "train/z_loss": 0.001792065380141139, "train/perplexity": 14.88822320968543, "train/grad_norm": 0.2060546875, "optim/muon_lr": 0.016040000000000002, "optim/adamw_lr": 0.0004812, "perf/tokens_per_sec": 1145278.6818195311, "perf/iters_per_sec": 0.5461114319894462, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.8311281204223633, "data/tokens_consumed": 8411676672, "data/tokens_consumed_B": 8.411676672, "train/loss_slope": -0.00021416163398738092} {"step": 4020, "timestamp": 1778198877.6185944, "train/loss": 2.669665479660034, "train/z_loss": 0.0018036193680018187, "train/perplexity": 14.435139537253454, "train/grad_norm": 0.32421875, "optim/muon_lr": 0.01608, "optim/adamw_lr": 0.00048239999999999996, "perf/tokens_per_sec": 2021770.073666721, "perf/iters_per_sec": 0.9640550964673619, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0372851133346557, "data/tokens_consumed": 8432648192, "data/tokens_consumed_B": 8.432648192, "train/loss_slope": -0.00021245266492038453} {"step": 4030, "timestamp": 1778198888.0018544, "train/loss": 2.6884326457977297, "train/z_loss": 0.0017923468025401235, "train/perplexity": 14.708604251743779, "train/grad_norm": 0.2099609375, "optim/muon_lr": 0.016120000000000002, "optim/adamw_lr": 0.0004836, "perf/tokens_per_sec": 2020839.1949627877, "perf/iters_per_sec": 0.9636112189115466, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0377629280090332, "data/tokens_consumed": 8453619712, "data/tokens_consumed_B": 8.453619712, "train/loss_slope": -0.00020585856919813682} {"step": 4040, "timestamp": 1778198898.3750136, "train/loss": 2.63315863609314, "train/z_loss": 0.001813125645276159, "train/perplexity": 13.917661374593212, "train/grad_norm": 0.26953125, "optim/muon_lr": 0.01616, "optim/adamw_lr": 0.00048479999999999997, "perf/tokens_per_sec": 2022756.1389535065, "perf/iters_per_sec": 0.9645252890365155, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0367794513702393, "data/tokens_consumed": 8474591232, "data/tokens_consumed_B": 8.474591232, "train/loss_slope": -0.000205516697700196} {"step": 4050, "timestamp": 1778198908.71939, "grad/layer_0/attn": 0.004461344331502914, "grad/layer_0/mlp": 0.007752796169370413, "grad/layer_0/attn_mlp_ratio": 0.575449705692474, "grad/layer_4/attn": 0.0036625638604164124, "grad/layer_4/mlp": 0.006567277014255524, "grad/layer_4/attn_mlp_ratio": 0.5576989971180173, "grad/layer_8/attn": 0.014095315709710121, "grad/layer_8/mlp": 0.0080852210521698, "grad/layer_8/attn_mlp_ratio": 1.7433432486787843, "grad/layer_12/attn": 0.010085996240377426, "grad/layer_12/mlp": 0.008737966418266296, "grad/layer_12/attn_mlp_ratio": 1.1542727039858918, "grad/layer_16/attn": 0.009798688814043999, "grad/layer_16/mlp": 0.007168212439864874, "grad/layer_16/attn_mlp_ratio": 1.3669640457157415, "grad/layer_20/attn": 0.027579743415117264, "grad/layer_20/mlp": 0.018911197781562805, "grad/layer_20/attn_mlp_ratio": 1.4583816206590343, "grad/layer_24/attn": 0.03706744685769081, "grad/layer_24/mlp": 0.01950174942612648, "grad/layer_24/attn_mlp_ratio": 1.9007241790297622, "grad/layer_27/attn": 0.013390461914241314, "grad/layer_27/mlp": 0.02017028257250786, "grad/layer_27/attn_mlp_ratio": 0.6638708109179126} {"step": 4050, "timestamp": 1778198909.352364, "eos/sharpness": 60.90471744537352, "eos/L0_probe": 2.4680428504943848, "eos/L_plus": 2.6397860050201416, "eos/L_minus": 2.9053468704223633, "eos/grad_norm": 0.36735260486602783, "eos/embed_grad_frac": 0.034464165568351746, "eos/time_s": 0.6301383972167969} {"step": 4050, "timestamp": 1778198909.372515, "train/loss": 2.658572220802307, "train/z_loss": 0.001810632599517703, "train/perplexity": 14.275891719440281, "train/grad_norm": 0.3671875, "optim/muon_lr": 0.016200000000000003, "optim/adamw_lr": 0.000486, "perf/tokens_per_sec": 1907981.3952975047, "perf/iters_per_sec": 0.9097964264380954, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0991469860076903, "data/tokens_consumed": 8495562752, "data/tokens_consumed_B": 8.495562752, "train/loss_slope": -0.00020435341747179777} {"step": 4050, "timestamp": 1778198910.738744, "geo/rankme_last": 400.3486328125, "geo/layer_0/stable_rank_q_proj": 34.44306182861328, "geo/layer_0/stable_rank_k_proj": 25.79520034790039, "geo/layer_0/stable_rank_o_proj": 61.014923095703125, "geo/layer_0/stable_rank_gate_proj": 168.9506378173828, "geo/layer_0/stable_rank_down_proj": 47.90312194824219, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.032002076506614685, "geo/layer_0/attn_entropy_mean": 6.664260387420654, "geo/layer_0/attn_entropy_std": 0.15735158324241638, "geo/layer_7/stable_rank_q_proj": 36.873905181884766, "geo/layer_7/stable_rank_k_proj": 35.7648811340332, "geo/layer_7/stable_rank_o_proj": 102.9461669921875, "geo/layer_7/stable_rank_gate_proj": 174.26089477539062, "geo/layer_7/stable_rank_down_proj": 215.5396270751953, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5828883647918701, "geo/layer_7/attn_entropy_mean": 5.2634687423706055, "geo/layer_7/attn_entropy_std": 1.2122975587844849, "geo/layer_14/stable_rank_q_proj": 49.966739654541016, "geo/layer_14/stable_rank_k_proj": 35.08799743652344, "geo/layer_14/stable_rank_o_proj": 98.2766342163086, "geo/layer_14/stable_rank_gate_proj": 191.0447540283203, "geo/layer_14/stable_rank_down_proj": 155.90567016601562, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.453912615776062, "geo/layer_14/attn_entropy_mean": 6.484301567077637, "geo/layer_14/attn_entropy_std": 0.34423279762268066, "geo/layer_21/stable_rank_q_proj": 47.572784423828125, "geo/layer_21/stable_rank_k_proj": 33.430267333984375, "geo/layer_21/stable_rank_o_proj": 99.25121307373047, "geo/layer_21/stable_rank_gate_proj": 138.76934814453125, "geo/layer_21/stable_rank_down_proj": 146.83204650878906, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.20965902507305145, "geo/layer_21/attn_entropy_mean": 5.863026142120361, "geo/layer_21/attn_entropy_std": 0.3899977505207062, "geo/layer_27/stable_rank_q_proj": 50.358402252197266, "geo/layer_27/stable_rank_k_proj": 35.148983001708984, "geo/layer_27/stable_rank_o_proj": 101.00736236572266, "geo/layer_27/stable_rank_gate_proj": 87.17587280273438, "geo/layer_27/stable_rank_down_proj": 113.14736938476562, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08916059881448746, "geo/layer_27/attn_entropy_mean": 4.824103355407715, "geo/layer_27/attn_entropy_std": 0.39418748021125793, "attnres/final_alpha/block_0": 0.23533031344413757, "attnres/block_norm/0": 0.9795401096343994, "attnres/final_alpha/block_1": 0.017935557290911674, "attnres/block_norm/1": 4705.474609375, "attnres/final_alpha/block_2": 0.04209423065185547, "attnres/block_norm/2": 3152.68896484375, "attnres/final_alpha/block_3": 0.030080672353506088, "attnres/block_norm/3": 3000.468994140625, "attnres/final_alpha/block_4": 0.048936307430267334, "attnres/block_norm/4": 2047.515869140625, "attnres/final_alpha/block_5": 0.3531060516834259, "attnres/block_norm/5": 2429.54443359375, "attnres/final_alpha/block_6": 0.2725168466567993, "attnres/block_norm/6": 2655.381103515625, "geo/tier1_time_s": 1.361644983291626, "geo/step": 4050.0, "geo/rankme_slope": 0.05967193901242138} {"step": 4060, "timestamp": 1778198921.0926013, "train/loss": 2.6264626502990724, "train/z_loss": 0.0018112467019818722, "train/perplexity": 13.824780224210471, "train/grad_norm": 0.2236328125, "optim/muon_lr": 0.01624, "optim/adamw_lr": 0.00048719999999999997, "perf/tokens_per_sec": 1790041.2896065086, "perf/iters_per_sec": 0.8535582016022246, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1715662717819213, "data/tokens_consumed": 8516534272, "data/tokens_consumed_B": 8.516534272, "train/loss_slope": -0.0002048765873644325} {"step": 4070, "timestamp": 1778198931.460823, "train/loss": 2.634840154647827, "train/z_loss": 0.0018149092211388052, "train/perplexity": 13.941083867591798, "train/grad_norm": 0.248046875, "optim/muon_lr": 0.01628, "optim/adamw_lr": 0.0004883999999999999, "perf/tokens_per_sec": 2023612.8976691451, "perf/iters_per_sec": 0.9649338234277464, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0363404989242553, "data/tokens_consumed": 8537505792, "data/tokens_consumed_B": 8.537505792, "train/loss_slope": -0.00020691754396634878} {"step": 4080, "timestamp": 1778198941.8213217, "train/loss": 2.680145812034607, "train/z_loss": 0.00180396978976205, "train/perplexity": 14.587220133064656, "train/grad_norm": 0.171875, "optim/muon_lr": 0.016319999999999998, "optim/adamw_lr": 0.0004896, "perf/tokens_per_sec": 2025318.8012201244, "perf/iters_per_sec": 0.9657472616291639, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354676008224488, "data/tokens_consumed": 8558477312, "data/tokens_consumed_B": 8.558477312, "train/loss_slope": -0.00020830599504156644} {"step": 4090, "timestamp": 1778198952.2029574, "train/loss": 2.5897735357284546, "train/z_loss": 0.0018206766922958194, "train/perplexity": 13.326753227969764, "train/grad_norm": 0.2734375, "optim/muon_lr": 0.01636, "optim/adamw_lr": 0.0004907999999999999, "perf/tokens_per_sec": 2021434.150584818, "perf/iters_per_sec": 0.9638949158596125, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037457489967346, "data/tokens_consumed": 8579448832, "data/tokens_consumed_B": 8.579448832, "train/loss_slope": -0.00020893157374705056} {"step": 4100, "timestamp": 1778198962.5533524, "grad/layer_0/attn": 0.0038641297724097967, "grad/layer_0/mlp": 0.006215197499841452, "grad/layer_0/attn_mlp_ratio": 0.6217227546407165, "grad/layer_4/attn": 0.00253703142516315, "grad/layer_4/mlp": 0.005755771417170763, "grad/layer_4/attn_mlp_ratio": 0.4407804266716658, "grad/layer_8/attn": 0.021032042801380157, "grad/layer_8/mlp": 0.00730658695101738, "grad/layer_8/attn_mlp_ratio": 2.87850437619184, "grad/layer_12/attn": 0.005612981040030718, "grad/layer_12/mlp": 0.005979518871754408, "grad/layer_12/attn_mlp_ratio": 0.9387011006311522, "grad/layer_16/attn": 0.008205619640648365, "grad/layer_16/mlp": 0.005290060769766569, "grad/layer_16/attn_mlp_ratio": 1.5511389835879996, "grad/layer_20/attn": 0.02835007570683956, "grad/layer_20/mlp": 0.014246027916669846, "grad/layer_20/attn_mlp_ratio": 1.9900336903497597, "grad/layer_24/attn": 0.011553095653653145, "grad/layer_24/mlp": 0.014978908002376556, "grad/layer_24/attn_mlp_ratio": 0.7712909095036192, "grad/layer_27/attn": 0.009002446196973324, "grad/layer_27/mlp": 0.014868131838738918, "grad/layer_27/attn_mlp_ratio": 0.6054860310673899} {"step": 4100, "timestamp": 1778198962.568479, "train/loss": 2.6297953367233275, "train/z_loss": 0.0018195230513811111, "train/perplexity": 13.87093074146781, "train/grad_norm": 0.16796875, "optim/muon_lr": 0.016399999999999998, "optim/adamw_lr": 0.0004919999999999999, "perf/tokens_per_sec": 2024303.1226566052, "perf/iters_per_sec": 0.9652629483492876, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035987138748169, "data/tokens_consumed": 8600420352, "data/tokens_consumed_B": 8.600420352, "train/loss_slope": -0.00020933079557879488} {"step": 4110, "timestamp": 1778198972.9410183, "train/loss": 2.5841267108917236, "train/z_loss": 0.001832794724032283, "train/perplexity": 13.251711460110672, "train/grad_norm": 0.318359375, "optim/muon_lr": 0.01644, "optim/adamw_lr": 0.0004932, "perf/tokens_per_sec": 2023101.6216952216, "perf/iters_per_sec": 0.9646900280452831, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0366024017333983, "data/tokens_consumed": 8621391872, "data/tokens_consumed_B": 8.621391872, "train/loss_slope": -0.0002119811436166428} {"step": 4120, "timestamp": 1778198983.3069851, "train/loss": 2.6060776710510254, "train/z_loss": 0.0018202055827714503, "train/perplexity": 13.545815372777328, "train/grad_norm": 0.1884765625, "optim/muon_lr": 0.016479999999999998, "optim/adamw_lr": 0.0004944, "perf/tokens_per_sec": 2024344.0731309806, "perf/iters_per_sec": 0.9652824750571158, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0359661817550658, "data/tokens_consumed": 8642363392, "data/tokens_consumed_B": 8.642363392, "train/loss_slope": -0.00021810252359597985} {"step": 4125, "timestamp": 1778198989.0832677, "eos/sharpness": 22.66755104064941, "eos/L0_probe": 2.4616785049438477, "eos/L_plus": 2.6301612854003906, "eos/L_minus": 2.519871234893799, "eos/grad_norm": 0.22876907885074615, "eos/embed_grad_frac": 0.24979457259178162, "eos/time_s": 0.5971238613128662} {"step": 4125, "timestamp": 1778198990.4633656, "geo/rankme_last": 399.248291015625, "geo/layer_0/stable_rank_q_proj": 33.338783264160156, "geo/layer_0/stable_rank_k_proj": 24.751611709594727, "geo/layer_0/stable_rank_o_proj": 60.40487289428711, "geo/layer_0/stable_rank_gate_proj": 168.60768127441406, "geo/layer_0/stable_rank_down_proj": 47.68111801147461, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.03402366489171982, "geo/layer_0/attn_entropy_mean": 6.657085418701172, "geo/layer_0/attn_entropy_std": 0.16413794457912445, "geo/layer_7/stable_rank_q_proj": 37.54161071777344, "geo/layer_7/stable_rank_k_proj": 36.05340576171875, "geo/layer_7/stable_rank_o_proj": 102.89286804199219, "geo/layer_7/stable_rank_gate_proj": 172.80819702148438, "geo/layer_7/stable_rank_down_proj": 214.5894775390625, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5772750377655029, "geo/layer_7/attn_entropy_mean": 5.243526458740234, "geo/layer_7/attn_entropy_std": 1.2202444076538086, "geo/layer_14/stable_rank_q_proj": 50.79265213012695, "geo/layer_14/stable_rank_k_proj": 35.73906326293945, "geo/layer_14/stable_rank_o_proj": 99.34033966064453, "geo/layer_14/stable_rank_gate_proj": 191.54010009765625, "geo/layer_14/stable_rank_down_proj": 156.22596740722656, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4460504651069641, "geo/layer_14/attn_entropy_mean": 6.449049472808838, "geo/layer_14/attn_entropy_std": 0.35907626152038574, "geo/layer_21/stable_rank_q_proj": 47.82102584838867, "geo/layer_21/stable_rank_k_proj": 33.27696990966797, "geo/layer_21/stable_rank_o_proj": 100.0903549194336, "geo/layer_21/stable_rank_gate_proj": 138.18914794921875, "geo/layer_21/stable_rank_down_proj": 144.4302978515625, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.20241087675094604, "geo/layer_21/attn_entropy_mean": 5.874676704406738, "geo/layer_21/attn_entropy_std": 0.3576146960258484, "geo/layer_27/stable_rank_q_proj": 49.395503997802734, "geo/layer_27/stable_rank_k_proj": 35.709659576416016, "geo/layer_27/stable_rank_o_proj": 100.54169464111328, "geo/layer_27/stable_rank_gate_proj": 87.47588348388672, "geo/layer_27/stable_rank_down_proj": 115.296630859375, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0844796895980835, "geo/layer_27/attn_entropy_mean": 4.778196334838867, "geo/layer_27/attn_entropy_std": 0.39604032039642334, "attnres/final_alpha/block_0": 0.23378083109855652, "attnres/block_norm/0": 0.9895409345626831, "attnres/final_alpha/block_1": 0.017521999776363373, "attnres/block_norm/1": 4844.85009765625, "attnres/final_alpha/block_2": 0.04130955785512924, "attnres/block_norm/2": 3283.684326171875, "attnres/final_alpha/block_3": 0.02967064082622528, "attnres/block_norm/3": 3109.77587890625, "attnres/final_alpha/block_4": 0.04747213423252106, "attnres/block_norm/4": 2077.02783203125, "attnres/final_alpha/block_5": 0.36442288756370544, "attnres/block_norm/5": 2467.65966796875, "attnres/final_alpha/block_6": 0.2658219337463379, "attnres/block_norm/6": 2763.9609375, "geo/tier1_time_s": 1.359839916229248, "geo/step": 4125.0, "geo/rankme_slope": 0.05287974494485294} {"step": 4130, "timestamp": 1778198995.6465712, "train/loss": 2.6830284118652346, "train/z_loss": 0.001811010402161628, "train/perplexity": 14.629329915015864, "train/grad_norm": 0.375, "optim/muon_lr": 0.01652, "optim/adamw_lr": 0.0004955999999999999, "perf/tokens_per_sec": 1700426.264326921, "perf/iters_per_sec": 0.8108264276156049, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2333095788955688, "data/tokens_consumed": 8663334912, "data/tokens_consumed_B": 8.663334912, "train/loss_slope": -0.00021450838147073824} {"step": 4140, "timestamp": 1778199006.0101612, "train/loss": 2.585161542892456, "train/z_loss": 0.0018296099151484668, "train/perplexity": 13.265431853120537, "train/grad_norm": 0.228515625, "optim/muon_lr": 0.01656, "optim/adamw_lr": 0.0004967999999999999, "perf/tokens_per_sec": 2024626.5787036028, "perf/iters_per_sec": 0.9654171842115415, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0358216285705566, "data/tokens_consumed": 8684306432, "data/tokens_consumed_B": 8.684306432, "train/loss_slope": -0.00021810587414599302} {"step": 4150, "timestamp": 1778199016.373141, "grad/layer_0/attn": 0.004652634263038635, "grad/layer_0/mlp": 0.007183631416410208, "grad/layer_0/attn_mlp_ratio": 0.6476716201840544, "grad/layer_4/attn": 0.0029040705412626266, "grad/layer_4/mlp": 0.006195182912051678, "grad/layer_4/attn_mlp_ratio": 0.46876267183927417, "grad/layer_8/attn": 0.012242713011801243, "grad/layer_8/mlp": 0.007777733728289604, "grad/layer_8/attn_mlp_ratio": 1.5740719960448337, "grad/layer_12/attn": 0.009316753596067429, "grad/layer_12/mlp": 0.007891683839261532, "grad/layer_12/attn_mlp_ratio": 1.1805786531460165, "grad/layer_16/attn": 0.009145503863692284, "grad/layer_16/mlp": 0.007206101901829243, "grad/layer_16/attn_mlp_ratio": 1.2691332791807182, "grad/layer_20/attn": 0.030927643179893494, "grad/layer_20/mlp": 0.016817154362797737, "grad/layer_20/attn_mlp_ratio": 1.8390532862328426, "grad/layer_24/attn": 0.039146337658166885, "grad/layer_24/mlp": 0.0240657739341259, "grad/layer_24/attn_mlp_ratio": 1.6266394591196753, "grad/layer_27/attn": 0.020512809976935387, "grad/layer_27/mlp": 0.02468830533325672, "grad/layer_27/attn_mlp_ratio": 0.8308715246735129} {"step": 4150, "timestamp": 1778199016.3888206, "train/loss": 2.588419556617737, "train/z_loss": 0.0018296080525033176, "train/perplexity": 13.308721292672029, "train/grad_norm": 0.34375, "optim/muon_lr": 0.0166, "optim/adamw_lr": 0.000498, "perf/tokens_per_sec": 2021845.5439448287, "perf/iters_per_sec": 0.9640910835003036, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0372463941574097, "data/tokens_consumed": 8705277952, "data/tokens_consumed_B": 8.705277952, "train/loss_slope": -0.00022065182228614853} {"step": 4160, "timestamp": 1778199026.7580442, "train/loss": 2.639109396934509, "train/z_loss": 0.0018231720430776476, "train/perplexity": 14.000728961446878, "train/grad_norm": 0.2158203125, "optim/muon_lr": 0.01664, "optim/adamw_lr": 0.0004991999999999999, "perf/tokens_per_sec": 2023482.692092407, "perf/iters_per_sec": 0.9648717365705524, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03640718460083, "data/tokens_consumed": 8726249472, "data/tokens_consumed_B": 8.726249472, "train/loss_slope": -0.00021760065969746994} {"step": 4170, "timestamp": 1778199037.126739, "train/loss": 2.630569052696228, "train/z_loss": 0.0018237680662423373, "train/perplexity": 13.881667055034534, "train/grad_norm": 0.412109375, "optim/muon_lr": 0.01668, "optim/adamw_lr": 0.0005003999999999999, "perf/tokens_per_sec": 2024072.1259309317, "perf/iters_per_sec": 0.9651528005270632, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0361053705215455, "data/tokens_consumed": 8747220992, "data/tokens_consumed_B": 8.747220992, "train/loss_slope": -0.00021774420082980428} {"step": 4180, "timestamp": 1778199047.488074, "train/loss": 2.7132065057754517, "train/z_loss": 0.001802621560636908, "train/perplexity": 15.077544319357855, "train/grad_norm": 0.21875, "optim/muon_lr": 0.01672, "optim/adamw_lr": 0.0005015999999999999, "perf/tokens_per_sec": 2025225.8651412767, "perf/iters_per_sec": 0.9657029462534317, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0355151176452637, "data/tokens_consumed": 8768192512, "data/tokens_consumed_B": 8.768192512, "train/loss_slope": -0.00020984990953719539} {"step": 4190, "timestamp": 1778199057.86607, "train/loss": 2.6180315732955934, "train/z_loss": 0.0018255960196256638, "train/perplexity": 13.708712413446179, "train/grad_norm": 0.201171875, "optim/muon_lr": 0.01676, "optim/adamw_lr": 0.0005028, "perf/tokens_per_sec": 2022024.3901027688, "perf/iters_per_sec": 0.9641763639940113, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0371546506881715, "data/tokens_consumed": 8789164032, "data/tokens_consumed_B": 8.789164032, "train/loss_slope": -0.00021077904462313594} {"step": 4200, "timestamp": 1778199068.2135177, "grad/layer_0/attn": 0.004454414825886488, "grad/layer_0/mlp": 0.006382083520293236, "grad/layer_0/attn_mlp_ratio": 0.6979561990887582, "grad/layer_4/attn": 0.002782404189929366, "grad/layer_4/mlp": 0.005731179844588041, "grad/layer_4/attn_mlp_ratio": 0.4854854003592739, "grad/layer_8/attn": 0.008135896176099777, "grad/layer_8/mlp": 0.007246689405292273, "grad/layer_8/attn_mlp_ratio": 1.122705225628629, "grad/layer_12/attn": 0.006489118095487356, "grad/layer_12/mlp": 0.006153258960694075, "grad/layer_12/attn_mlp_ratio": 1.0545823004493162, "grad/layer_16/attn": 0.006945734843611717, "grad/layer_16/mlp": 0.005723100155591965, "grad/layer_16/attn_mlp_ratio": 1.2136315167334581, "grad/layer_20/attn": 0.03259582445025444, "grad/layer_20/mlp": 0.012933522462844849, "grad/layer_20/attn_mlp_ratio": 2.5202588306371414, "grad/layer_24/attn": 0.0259908027946949, "grad/layer_24/mlp": 0.014990641735494137, "grad/layer_24/attn_mlp_ratio": 1.7338018665188237, "grad/layer_27/attn": 0.012694669887423515, "grad/layer_27/mlp": 0.017695024609565735, "grad/layer_27/attn_mlp_ratio": 0.7174146459688704} {"step": 4200, "timestamp": 1778199068.8179598, "eos/sharpness": 51.19445323944091, "eos/L0_probe": 2.453787326812744, "eos/L_plus": 2.8539507389068604, "eos/L_minus": 2.565568447113037, "eos/grad_norm": 0.28244486451148987, "eos/embed_grad_frac": 0.07938095927238464, "eos/time_s": 0.6016113758087158} {"step": 4200, "timestamp": 1778199068.8381863, "train/loss": 2.641241693496704, "train/z_loss": 0.0018134048557840287, "train/perplexity": 14.030614518791825, "train/grad_norm": 0.283203125, "optim/muon_lr": 0.0168, "optim/adamw_lr": 0.0005039999999999999, "perf/tokens_per_sec": 1912527.0396975405, "perf/iters_per_sec": 0.9119639585960104, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0965345621109008, "data/tokens_consumed": 8810135552, "data/tokens_consumed_B": 8.810135552, "train/loss_slope": -0.00021043842931141402} {"step": 4200, "timestamp": 1778199070.206276, "geo/rankme_last": 401.0963439941406, "geo/layer_0/stable_rank_q_proj": 32.307979583740234, "geo/layer_0/stable_rank_k_proj": 23.89599609375, "geo/layer_0/stable_rank_o_proj": 60.440765380859375, "geo/layer_0/stable_rank_gate_proj": 168.82955932617188, "geo/layer_0/stable_rank_down_proj": 47.586883544921875, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.023589661344885826, "geo/layer_0/attn_entropy_mean": 6.653700828552246, "geo/layer_0/attn_entropy_std": 0.16361428797245026, "geo/layer_7/stable_rank_q_proj": 38.1551628112793, "geo/layer_7/stable_rank_k_proj": 36.357513427734375, "geo/layer_7/stable_rank_o_proj": 102.31110382080078, "geo/layer_7/stable_rank_gate_proj": 171.01153564453125, "geo/layer_7/stable_rank_down_proj": 214.24493408203125, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5824779272079468, "geo/layer_7/attn_entropy_mean": 5.198454856872559, "geo/layer_7/attn_entropy_std": 1.188292384147644, "geo/layer_14/stable_rank_q_proj": 52.138423919677734, "geo/layer_14/stable_rank_k_proj": 36.63402557373047, "geo/layer_14/stable_rank_o_proj": 100.08438110351562, "geo/layer_14/stable_rank_gate_proj": 189.8845977783203, "geo/layer_14/stable_rank_down_proj": 157.0040740966797, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.43837520480155945, "geo/layer_14/attn_entropy_mean": 6.384025573730469, "geo/layer_14/attn_entropy_std": 0.4008389413356781, "geo/layer_21/stable_rank_q_proj": 48.07550048828125, "geo/layer_21/stable_rank_k_proj": 33.13559341430664, "geo/layer_21/stable_rank_o_proj": 100.82159423828125, "geo/layer_21/stable_rank_gate_proj": 138.062744140625, "geo/layer_21/stable_rank_down_proj": 142.0078582763672, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.2074110358953476, "geo/layer_21/attn_entropy_mean": 5.872023582458496, "geo/layer_21/attn_entropy_std": 0.37007901072502136, "geo/layer_27/stable_rank_q_proj": 48.96393585205078, "geo/layer_27/stable_rank_k_proj": 36.09679412841797, "geo/layer_27/stable_rank_o_proj": 101.14059448242188, "geo/layer_27/stable_rank_gate_proj": 87.52227783203125, "geo/layer_27/stable_rank_down_proj": 118.0916976928711, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09004935622215271, "geo/layer_27/attn_entropy_mean": 4.818027496337891, "geo/layer_27/attn_entropy_std": 0.41101622581481934, "attnres/final_alpha/block_0": 0.23541809618473053, "attnres/block_norm/0": 0.9989215135574341, "attnres/final_alpha/block_1": 0.017409905791282654, "attnres/block_norm/1": 4999.6552734375, "attnres/final_alpha/block_2": 0.04119928926229477, "attnres/block_norm/2": 3377.53564453125, "attnres/final_alpha/block_3": 0.029481327161192894, "attnres/block_norm/3": 3230.40234375, "attnres/final_alpha/block_4": 0.04759606346487999, "attnres/block_norm/4": 2128.28173828125, "attnres/final_alpha/block_5": 0.36488497257232666, "attnres/block_norm/5": 2519.51513671875, "attnres/final_alpha/block_6": 0.26401036977767944, "attnres/block_norm/6": 2829.82568359375, "geo/tier1_time_s": 1.3622746467590332, "geo/step": 4200.0, "geo/rankme_slope": 0.04651926969518276} {"step": 4210, "timestamp": 1778199080.5763621, "train/loss": 2.6059531450271605, "train/z_loss": 0.0018307823687791825, "train/perplexity": 13.544128671270244, "train/grad_norm": 0.1953125, "optim/muon_lr": 0.01684, "optim/adamw_lr": 0.0005051999999999999, "perf/tokens_per_sec": 1787489.53899027, "perf/iters_per_sec": 0.8523414320899344, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.173238754272461, "data/tokens_consumed": 8831107072, "data/tokens_consumed_B": 8.831107072, "train/loss_slope": -0.0002069207196045379} {"step": 4220, "timestamp": 1778199090.9555526, "train/loss": 2.6786508560180664, "train/z_loss": 0.0018140088533982635, "train/perplexity": 14.565429172884015, "train/grad_norm": 0.2001953125, "optim/muon_lr": 0.01688, "optim/adamw_lr": 0.0005064, "perf/tokens_per_sec": 2022080.634648905, "perf/iters_per_sec": 0.9642031834835553, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0371258020401002, "data/tokens_consumed": 8852078592, "data/tokens_consumed_B": 8.852078592, "train/loss_slope": -0.00020062693509474691} {"step": 4230, "timestamp": 1778199101.3069386, "train/loss": 2.595909857749939, "train/z_loss": 0.0018341640592552722, "train/perplexity": 13.408781897046472, "train/grad_norm": 0.2421875, "optim/muon_lr": 0.01692, "optim/adamw_lr": 0.0005076, "perf/tokens_per_sec": 2027189.9676563372, "perf/iters_per_sec": 0.9666395033151327, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345118284225463, "data/tokens_consumed": 8873050112, "data/tokens_consumed_B": 8.873050112, "train/loss_slope": -0.0001975700735795472} {"step": 4240, "timestamp": 1778199111.667853, "train/loss": 2.6036099672317503, "train/z_loss": 0.0018351860227994621, "train/perplexity": 13.512429522583846, "train/grad_norm": 0.1875, "optim/muon_lr": 0.01696, "optim/adamw_lr": 0.0005087999999999999, "perf/tokens_per_sec": 2025311.2000062997, "perf/iters_per_sec": 0.9657436370879648, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354714870452881, "data/tokens_consumed": 8894021632, "data/tokens_consumed_B": 8.894021632, "train/loss_slope": -0.00019641047247959044} {"step": 4250, "timestamp": 1778199122.017921, "grad/layer_0/attn": 0.0037164059467613697, "grad/layer_0/mlp": 0.005792967043817043, "grad/layer_0/attn_mlp_ratio": 0.6415375496696833, "grad/layer_4/attn": 0.002907011192291975, "grad/layer_4/mlp": 0.005783853121101856, "grad/layer_4/attn_mlp_ratio": 0.5026080505788101, "grad/layer_8/attn": 0.00834477785974741, "grad/layer_8/mlp": 0.006957867182791233, "grad/layer_8/attn_mlp_ratio": 1.1993298406806923, "grad/layer_12/attn": 0.0054068611934781075, "grad/layer_12/mlp": 0.00641247583553195, "grad/layer_12/attn_mlp_ratio": 0.8431783990826905, "grad/layer_16/attn": 0.009058493189513683, "grad/layer_16/mlp": 0.005844533443450928, "grad/layer_16/attn_mlp_ratio": 1.5499086663065091, "grad/layer_20/attn": 0.024604981765151024, "grad/layer_20/mlp": 0.014741995371878147, "grad/layer_20/attn_mlp_ratio": 1.6690401114345426, "grad/layer_24/attn": 0.017111578956246376, "grad/layer_24/mlp": 0.018046751618385315, "grad/layer_24/attn_mlp_ratio": 0.9481805492350059, "grad/layer_27/attn": 0.011455190367996693, "grad/layer_27/mlp": 0.01789044961333275, "grad/layer_27/attn_mlp_ratio": 0.6402963900599872} {"step": 4250, "timestamp": 1778199122.0335975, "train/loss": 2.64390287399292, "train/z_loss": 0.0018244955455884338, "train/perplexity": 14.068002242174675, "train/grad_norm": 0.2421875, "optim/muon_lr": 0.017, "optim/adamw_lr": 0.0005099999999999999, "perf/tokens_per_sec": 2024333.0783016689, "perf/iters_per_sec": 0.9652772323139519, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0359718084335328, "data/tokens_consumed": 8914993152, "data/tokens_consumed_B": 8.914993152, "train/loss_slope": -0.00019261547009078649} {"step": 4260, "timestamp": 1778199132.3922145, "train/loss": 2.638399863243103, "train/z_loss": 0.0018251146771945058, "train/perplexity": 13.990798495961014, "train/grad_norm": 0.16015625, "optim/muon_lr": 0.01704, "optim/adamw_lr": 0.0005112, "perf/tokens_per_sec": 2025655.5502681397, "perf/iters_per_sec": 0.9659078360882472, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352954626083375, "data/tokens_consumed": 8935964672, "data/tokens_consumed_B": 8.935964672, "train/loss_slope": -0.0001884181161846729} {"step": 4270, "timestamp": 1778199142.7534964, "train/loss": 2.6238553285598756, "train/z_loss": 0.0018299203831702471, "train/perplexity": 13.788781524681841, "train/grad_norm": 0.2041015625, "optim/muon_lr": 0.01708, "optim/adamw_lr": 0.0005124, "perf/tokens_per_sec": 2025402.5582497981, "perf/iters_per_sec": 0.9657872000931731, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035424780845642, "data/tokens_consumed": 8956936192, "data/tokens_consumed_B": 8.956936192, "train/loss_slope": -0.00018589433238844658} {"step": 4275, "timestamp": 1778199148.5376973, "eos/sharpness": 28.44209671020507, "eos/L0_probe": 2.4417827129364014, "eos/L_plus": 2.5674216747283936, "eos/L_minus": 2.60056471824646, "eos/grad_norm": 0.2726576030254364, "eos/embed_grad_frac": 0.07154285907745361, "eos/time_s": 0.6157639026641846} {"step": 4275, "timestamp": 1778199149.9187455, "geo/rankme_last": 404.3280029296875, "geo/layer_0/stable_rank_q_proj": 31.35260581970215, "geo/layer_0/stable_rank_k_proj": 22.991722106933594, "geo/layer_0/stable_rank_o_proj": 60.2960205078125, "geo/layer_0/stable_rank_gate_proj": 169.79127502441406, "geo/layer_0/stable_rank_down_proj": 47.61964416503906, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.032380539923906326, "geo/layer_0/attn_entropy_mean": 6.640910625457764, "geo/layer_0/attn_entropy_std": 0.1818041056394577, "geo/layer_7/stable_rank_q_proj": 38.759132385253906, "geo/layer_7/stable_rank_k_proj": 36.9683952331543, "geo/layer_7/stable_rank_o_proj": 102.29154968261719, "geo/layer_7/stable_rank_gate_proj": 171.36331176757812, "geo/layer_7/stable_rank_down_proj": 213.73770141601562, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5806578993797302, "geo/layer_7/attn_entropy_mean": 5.225567817687988, "geo/layer_7/attn_entropy_std": 1.1847559213638306, "geo/layer_14/stable_rank_q_proj": 53.52330780029297, "geo/layer_14/stable_rank_k_proj": 37.627445220947266, "geo/layer_14/stable_rank_o_proj": 100.21808624267578, "geo/layer_14/stable_rank_gate_proj": 188.93621826171875, "geo/layer_14/stable_rank_down_proj": 157.20767211914062, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.44313156604766846, "geo/layer_14/attn_entropy_mean": 6.437626838684082, "geo/layer_14/attn_entropy_std": 0.3532702326774597, "geo/layer_21/stable_rank_q_proj": 48.20906448364258, "geo/layer_21/stable_rank_k_proj": 33.07796096801758, "geo/layer_21/stable_rank_o_proj": 101.28165435791016, "geo/layer_21/stable_rank_gate_proj": 138.20045471191406, "geo/layer_21/stable_rank_down_proj": 138.72262573242188, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1998296082019806, "geo/layer_21/attn_entropy_mean": 5.866789817810059, "geo/layer_21/attn_entropy_std": 0.35439082980155945, "geo/layer_27/stable_rank_q_proj": 48.677642822265625, "geo/layer_27/stable_rank_k_proj": 36.50553512573242, "geo/layer_27/stable_rank_o_proj": 100.72687530517578, "geo/layer_27/stable_rank_gate_proj": 87.32435607910156, "geo/layer_27/stable_rank_down_proj": 121.41218566894531, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08546698838472366, "geo/layer_27/attn_entropy_mean": 4.752438545227051, "geo/layer_27/attn_entropy_std": 0.4188225269317627, "attnres/final_alpha/block_0": 0.2355133593082428, "attnres/block_norm/0": 1.0086748600006104, "attnres/final_alpha/block_1": 0.01758299395442009, "attnres/block_norm/1": 5118.0185546875, "attnres/final_alpha/block_2": 0.04080881550908089, "attnres/block_norm/2": 3467.6064453125, "attnres/final_alpha/block_3": 0.02960546314716339, "attnres/block_norm/3": 3304.491943359375, "attnres/final_alpha/block_4": 0.048209238797426224, "attnres/block_norm/4": 2178.59716796875, "attnres/final_alpha/block_5": 0.3589768409729004, "attnres/block_norm/5": 2556.912841796875, "attnres/final_alpha/block_6": 0.2693032920360565, "attnres/block_norm/6": 2896.5185546875, "geo/tier1_time_s": 1.361205816268921, "geo/step": 4275.0, "geo/rankme_slope": 0.040451879599496046} {"step": 4280, "timestamp": 1778199155.5879216, "train/loss": 2.628002405166626, "train/z_loss": 0.0018279360374435782, "train/perplexity": 13.846083393473966, "train/grad_norm": 0.2353515625, "optim/muon_lr": 0.01712, "optim/adamw_lr": 0.0005135999999999999, "perf/tokens_per_sec": 1634686.8522850436, "perf/iters_per_sec": 0.7794794331956117, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.282907485961914, "data/tokens_consumed": 8977907712, "data/tokens_consumed_B": 8.977907712, "train/loss_slope": -0.00018493346345342383} {"step": 4290, "timestamp": 1778199165.9479916, "train/loss": 2.6291439056396486, "train/z_loss": 0.0018357818014919757, "train/perplexity": 13.8618977285354, "train/grad_norm": 0.2451171875, "optim/muon_lr": 0.01716, "optim/adamw_lr": 0.0005147999999999999, "perf/tokens_per_sec": 2025442.4338799973, "perf/iters_per_sec": 0.9658062142753588, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354043960571289, "data/tokens_consumed": 8998879232, "data/tokens_consumed_B": 8.998879232, "train/loss_slope": -0.00018136624410064445} {"step": 4300, "timestamp": 1778199176.2931952, "grad/layer_0/attn": 0.004307418130338192, "grad/layer_0/mlp": 0.006540548987686634, "grad/layer_0/attn_mlp_ratio": 0.6585713328636922, "grad/layer_4/attn": 0.0024791955947875977, "grad/layer_4/mlp": 0.005556649528443813, "grad/layer_4/attn_mlp_ratio": 0.4461673419351289, "grad/layer_8/attn": 0.010611326433718204, "grad/layer_8/mlp": 0.0070319646038115025, "grad/layer_8/attn_mlp_ratio": 1.5090130398360218, "grad/layer_12/attn": 0.007281693629920483, "grad/layer_12/mlp": 0.0067802597768604755, "grad/layer_12/attn_mlp_ratio": 1.0739549460001216, "grad/layer_16/attn": 0.007413169369101524, "grad/layer_16/mlp": 0.005852180067449808, "grad/layer_16/attn_mlp_ratio": 1.2667363541426897, "grad/layer_20/attn": 0.013640646822750568, "grad/layer_20/mlp": 0.013843591324985027, "grad/layer_20/attn_mlp_ratio": 0.9853401768367577, "grad/layer_24/attn": 0.014271657913923264, "grad/layer_24/mlp": 0.014949389733374119, "grad/layer_24/attn_mlp_ratio": 0.9546649109425298, "grad/layer_27/attn": 0.006599048618227243, "grad/layer_27/mlp": 0.01477499958127737, "grad/layer_27/attn_mlp_ratio": 0.4466361259275996} {"step": 4300, "timestamp": 1778199176.308914, "train/loss": 2.658543920516968, "train/z_loss": 0.0018261449877172709, "train/perplexity": 14.275487713347921, "train/grad_norm": 0.1787109375, "optim/muon_lr": 0.0172, "optim/adamw_lr": 0.000516, "perf/tokens_per_sec": 2025284.246468684, "perf/iters_per_sec": 0.9657307846396846, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354852676391602, "data/tokens_consumed": 9019850752, "data/tokens_consumed_B": 9.019850752, "train/loss_slope": -0.000175940765778009} {"step": 4310, "timestamp": 1778199186.6618283, "train/loss": 2.6468000411987305, "train/z_loss": 0.0018258891999721528, "train/perplexity": 14.10881869441575, "train/grad_norm": 0.1435546875, "optim/muon_lr": 0.017240000000000002, "optim/adamw_lr": 0.0005172, "perf/tokens_per_sec": 2026711.9987473043, "perf/iters_per_sec": 0.96641158997884, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034755802154541, "data/tokens_consumed": 9040822272, "data/tokens_consumed_B": 9.040822272, "train/loss_slope": -0.0001771198760701341} {"step": 4320, "timestamp": 1778199197.0180466, "train/loss": 2.6152316331863403, "train/z_loss": 0.0018314109998755156, "train/perplexity": 13.670382525450066, "train/grad_norm": 0.2294921875, "optim/muon_lr": 0.01728, "optim/adamw_lr": 0.0005183999999999999, "perf/tokens_per_sec": 2026186.1338206695, "perf/iters_per_sec": 0.9661608380416248, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035024356842041, "data/tokens_consumed": 9061793792, "data/tokens_consumed_B": 9.061793792, "train/loss_slope": -0.00017421370011852405} {"step": 4330, "timestamp": 1778199207.3660634, "train/loss": 2.586571455001831, "train/z_loss": 0.0018417934770695865, "train/perplexity": 13.284148137183678, "train/grad_norm": 0.2734375, "optim/muon_lr": 0.01732, "optim/adamw_lr": 0.0005195999999999999, "perf/tokens_per_sec": 2027618.5236548383, "perf/iters_per_sec": 0.9668438547395889, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342931747436523, "data/tokens_consumed": 9082765312, "data/tokens_consumed_B": 9.082765312, "train/loss_slope": -0.00017397219305670796} {"step": 4340, "timestamp": 1778199217.7137213, "train/loss": 2.649322748184204, "train/z_loss": 0.0018244530190713703, "train/perplexity": 14.144456042286318, "train/grad_norm": 0.23828125, "optim/muon_lr": 0.01736, "optim/adamw_lr": 0.0005208, "perf/tokens_per_sec": 2028105.5711876664, "perf/iters_per_sec": 0.9670760971010525, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340447902679444, "data/tokens_consumed": 9103736832, "data/tokens_consumed_B": 9.103736832, "train/loss_slope": -0.00017048905342862008} {"step": 4350, "timestamp": 1778199228.05215, "grad/layer_0/attn": 0.003944457042962313, "grad/layer_0/mlp": 0.006305020768195391, "grad/layer_0/attn_mlp_ratio": 0.6256057078033569, "grad/layer_4/attn": 0.002911362098529935, "grad/layer_4/mlp": 0.005789577029645443, "grad/layer_4/attn_mlp_ratio": 0.5028626501273036, "grad/layer_8/attn": 0.008986043743789196, "grad/layer_8/mlp": 0.007423970382660627, "grad/layer_8/attn_mlp_ratio": 1.210409411618343, "grad/layer_12/attn": 0.007198183797299862, "grad/layer_12/mlp": 0.006934200879186392, "grad/layer_12/attn_mlp_ratio": 1.0380696808335723, "grad/layer_16/attn": 0.008983397856354713, "grad/layer_16/mlp": 0.00629356037825346, "grad/layer_16/attn_mlp_ratio": 1.4273951743842965, "grad/layer_20/attn": 0.017622560262680054, "grad/layer_20/mlp": 0.01367590669542551, "grad/layer_20/attn_mlp_ratio": 1.2885844080609463, "grad/layer_24/attn": 0.01790565624833107, "grad/layer_24/mlp": 0.022992197424173355, "grad/layer_24/attn_mlp_ratio": 0.7787709821780002, "grad/layer_27/attn": 0.012839156202971935, "grad/layer_27/mlp": 0.02515411004424095, "grad/layer_27/attn_mlp_ratio": 0.5104198132769752} {"step": 4350, "timestamp": 1778199228.6618779, "eos/sharpness": 22.28913307189941, "eos/L0_probe": 2.4298019409179688, "eos/L_plus": 2.5396831035614014, "eos/L_minus": 2.5428121089935303, "eos/grad_norm": 0.27434468269348145, "eos/embed_grad_frac": 0.1014789268374443, "eos/time_s": 0.6069662570953369} {"step": 4350, "timestamp": 1778199228.6821966, "train/loss": 2.539015007019043, "train/z_loss": 0.001847518456634134, "train/perplexity": 12.667187732886923, "train/grad_norm": 0.2734375, "optim/muon_lr": 0.0174, "optim/adamw_lr": 0.000522, "perf/tokens_per_sec": 1913106.229452133, "perf/iters_per_sec": 0.9122401377926507, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.09620258808136, "data/tokens_consumed": 9124708352, "data/tokens_consumed_B": 9.124708352, "train/loss_slope": -0.00017088692257649676} {"step": 4350, "timestamp": 1778199230.0420053, "geo/rankme_last": 405.27166748046875, "geo/layer_0/stable_rank_q_proj": 30.346216201782227, "geo/layer_0/stable_rank_k_proj": 22.05344009399414, "geo/layer_0/stable_rank_o_proj": 60.14372634887695, "geo/layer_0/stable_rank_gate_proj": 169.49266052246094, "geo/layer_0/stable_rank_down_proj": 47.4571418762207, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.03572811931371689, "geo/layer_0/attn_entropy_mean": 6.644411087036133, "geo/layer_0/attn_entropy_std": 0.17794549465179443, "geo/layer_7/stable_rank_q_proj": 39.298606872558594, "geo/layer_7/stable_rank_k_proj": 37.476158142089844, "geo/layer_7/stable_rank_o_proj": 102.67594909667969, "geo/layer_7/stable_rank_gate_proj": 170.52305603027344, "geo/layer_7/stable_rank_down_proj": 213.88150024414062, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5882048606872559, "geo/layer_7/attn_entropy_mean": 5.1521077156066895, "geo/layer_7/attn_entropy_std": 1.1742674112319946, "geo/layer_14/stable_rank_q_proj": 54.899967193603516, "geo/layer_14/stable_rank_k_proj": 38.58694839477539, "geo/layer_14/stable_rank_o_proj": 99.98678588867188, "geo/layer_14/stable_rank_gate_proj": 187.79653930664062, "geo/layer_14/stable_rank_down_proj": 155.62632751464844, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.43600034713745117, "geo/layer_14/attn_entropy_mean": 6.375698566436768, "geo/layer_14/attn_entropy_std": 0.36303120851516724, "geo/layer_21/stable_rank_q_proj": 48.46299362182617, "geo/layer_21/stable_rank_k_proj": 32.96006393432617, "geo/layer_21/stable_rank_o_proj": 101.52494812011719, "geo/layer_21/stable_rank_gate_proj": 138.2720489501953, "geo/layer_21/stable_rank_down_proj": 136.0591278076172, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.19800613820552826, "geo/layer_21/attn_entropy_mean": 5.862694263458252, "geo/layer_21/attn_entropy_std": 0.3726502060890198, "geo/layer_27/stable_rank_q_proj": 48.138023376464844, "geo/layer_27/stable_rank_k_proj": 37.026283264160156, "geo/layer_27/stable_rank_o_proj": 101.08721160888672, "geo/layer_27/stable_rank_gate_proj": 87.26700592041016, "geo/layer_27/stable_rank_down_proj": 124.4168472290039, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08210019767284393, "geo/layer_27/attn_entropy_mean": 4.7400712966918945, "geo/layer_27/attn_entropy_std": 0.42094317078590393, "attnres/final_alpha/block_0": 0.2346404790878296, "attnres/block_norm/0": 1.0177334547042847, "attnres/final_alpha/block_1": 0.01732664927840233, "attnres/block_norm/1": 5261.9482421875, "attnres/final_alpha/block_2": 0.03993227332830429, "attnres/block_norm/2": 3564.560546875, "attnres/final_alpha/block_3": 0.029573265463113785, "attnres/block_norm/3": 3399.82177734375, "attnres/final_alpha/block_4": 0.04733984172344208, "attnres/block_norm/4": 2231.020263671875, "attnres/final_alpha/block_5": 0.36270827054977417, "attnres/block_norm/5": 2608.31201171875, "attnres/final_alpha/block_6": 0.26847922801971436, "attnres/block_norm/6": 2960.25439453125, "geo/tier1_time_s": 1.355867624282837, "geo/step": 4350.0, "geo/rankme_slope": 0.03653747315332383} {"step": 4360, "timestamp": 1778199240.394281, "train/loss": 2.6245027780532837, "train/z_loss": 0.001834443083498627, "train/perplexity": 13.797711954984036, "train/grad_norm": 0.185546875, "optim/muon_lr": 0.01744, "optim/adamw_lr": 0.0005231999999999999, "perf/tokens_per_sec": 1791178.2760083268, "perf/iters_per_sec": 0.8541003589669832, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1708225965499879, "data/tokens_consumed": 9145679872, "data/tokens_consumed_B": 9.145679872, "train/loss_slope": -0.00017093656196846027} {"step": 4370, "timestamp": 1778199250.7612798, "train/loss": 2.6026228427886964, "train/z_loss": 0.0018438146216794848, "train/perplexity": 13.499097654306148, "train/grad_norm": 0.158203125, "optim/muon_lr": 0.01748, "optim/adamw_lr": 0.0005244, "perf/tokens_per_sec": 2024346.8684456106, "perf/iters_per_sec": 0.9652838079670003, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0359647512435912, "data/tokens_consumed": 9166651392, "data/tokens_consumed_B": 9.166651392, "train/loss_slope": -0.00017072939459282532} {"step": 4380, "timestamp": 1778199261.1259923, "train/loss": 2.571000647544861, "train/z_loss": 0.0018539783777669072, "train/perplexity": 13.078905269641048, "train/grad_norm": 0.322265625, "optim/muon_lr": 0.01752, "optim/adamw_lr": 0.0005256, "perf/tokens_per_sec": 2024822.6036613728, "perf/iters_per_sec": 0.9655106561953415, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0357213497161866, "data/tokens_consumed": 9187622912, "data/tokens_consumed_B": 9.187622912, "train/loss_slope": -0.0001705482270314891} {"step": 4390, "timestamp": 1778199271.4809566, "train/loss": 2.640546131134033, "train/z_loss": 0.0018301001633517443, "train/perplexity": 14.020858744675381, "train/grad_norm": 0.1787109375, "optim/muon_lr": 0.01756, "optim/adamw_lr": 0.0005267999999999999, "perf/tokens_per_sec": 2026532.1823679972, "perf/iters_per_sec": 0.9663258468475329, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034847617149353, "data/tokens_consumed": 9208594432, "data/tokens_consumed_B": 9.208594432, "train/loss_slope": -0.00016767076188438541} {"step": 4400, "timestamp": 1778199281.8267908, "grad/layer_0/attn": 0.004071362316608429, "grad/layer_0/mlp": 0.0069357422180473804, "grad/layer_0/attn_mlp_ratio": 0.5870117616703269, "grad/layer_4/attn": 0.0031010573729872704, "grad/layer_4/mlp": 0.005506301298737526, "grad/layer_4/attn_mlp_ratio": 0.5631833690938667, "grad/layer_8/attn": 0.011237015016376972, "grad/layer_8/mlp": 0.007220194675028324, "grad/layer_8/attn_mlp_ratio": 1.5563312855826512, "grad/layer_12/attn": 0.006394532043486834, "grad/layer_12/mlp": 0.006856152322143316, "grad/layer_12/attn_mlp_ratio": 0.9326706364978721, "grad/layer_16/attn": 0.008851210586726665, "grad/layer_16/mlp": 0.007084638345986605, "grad/layer_16/attn_mlp_ratio": 1.2493524763766612, "grad/layer_20/attn": 0.019564006477594376, "grad/layer_20/mlp": 0.01662418246269226, "grad/layer_20/attn_mlp_ratio": 1.1768402087630836, "grad/layer_24/attn": 0.020423412322998047, "grad/layer_24/mlp": 0.022169847041368484, "grad/layer_24/attn_mlp_ratio": 0.9212247695153646, "grad/layer_27/attn": 0.01738128997385502, "grad/layer_27/mlp": 0.02362903580069542, "grad/layer_27/attn_mlp_ratio": 0.7355903155296943} {"step": 4400, "timestamp": 1778199281.8425643, "train/loss": 2.6188417434692384, "train/z_loss": 0.0018387975520454346, "train/perplexity": 13.719823303610793, "train/grad_norm": 0.291015625, "optim/muon_lr": 0.0176, "optim/adamw_lr": 0.0005279999999999999, "perf/tokens_per_sec": 2025221.1556122173, "perf/iters_per_sec": 0.9657007005749785, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0355175256729126, "data/tokens_consumed": 9229565952, "data/tokens_consumed_B": 9.229565952, "train/loss_slope": -0.00016070405431885344} {"step": 4410, "timestamp": 1778199292.1975243, "train/loss": 2.6091241121292112, "train/z_loss": 0.0018379740649834275, "train/perplexity": 13.58714482306983, "train/grad_norm": 0.1904296875, "optim/muon_lr": 0.01764, "optim/adamw_lr": 0.0005292, "perf/tokens_per_sec": 2026446.8379545978, "perf/iters_per_sec": 0.966285151459979, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348912000656127, "data/tokens_consumed": 9250537472, "data/tokens_consumed_B": 9.250537472, "train/loss_slope": -0.00015843695828838094} {"step": 4420, "timestamp": 1778199302.5707233, "train/loss": 2.5708245038986206, "train/z_loss": 0.001849249447695911, "train/perplexity": 13.076601706462986, "train/grad_norm": 0.244140625, "optim/muon_lr": 0.01768, "optim/adamw_lr": 0.0005304, "perf/tokens_per_sec": 2022752.3712089402, "perf/iters_per_sec": 0.9645234924359036, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03678138256073, "data/tokens_consumed": 9271508992, "data/tokens_consumed_B": 9.271508992, "train/loss_slope": -0.0001566407833448444} {"step": 4425, "timestamp": 1778199308.3409495, "eos/sharpness": 37.00358867645263, "eos/L0_probe": 2.4261605739593506, "eos/L_plus": 2.5499978065490723, "eos/L_minus": 2.6723592281341553, "eos/grad_norm": 0.22598925232887268, "eos/embed_grad_frac": 0.09453023225069046, "eos/time_s": 0.6044847965240479} {"step": 4425, "timestamp": 1778199309.7208445, "geo/rankme_last": 407.58233642578125, "geo/layer_0/stable_rank_q_proj": 29.43364906311035, "geo/layer_0/stable_rank_k_proj": 21.17983055114746, "geo/layer_0/stable_rank_o_proj": 60.09498596191406, "geo/layer_0/stable_rank_gate_proj": 168.7700958251953, "geo/layer_0/stable_rank_down_proj": 47.3768424987793, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.02355576679110527, "geo/layer_0/attn_entropy_mean": 6.634401321411133, "geo/layer_0/attn_entropy_std": 0.1897326558828354, "geo/layer_7/stable_rank_q_proj": 39.890869140625, "geo/layer_7/stable_rank_k_proj": 38.233489990234375, "geo/layer_7/stable_rank_o_proj": 102.75141143798828, "geo/layer_7/stable_rank_gate_proj": 169.761962890625, "geo/layer_7/stable_rank_down_proj": 214.5880126953125, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5736767053604126, "geo/layer_7/attn_entropy_mean": 5.156886577606201, "geo/layer_7/attn_entropy_std": 1.1617696285247803, "geo/layer_14/stable_rank_q_proj": 56.0710334777832, "geo/layer_14/stable_rank_k_proj": 39.742767333984375, "geo/layer_14/stable_rank_o_proj": 101.07027435302734, "geo/layer_14/stable_rank_gate_proj": 186.02935791015625, "geo/layer_14/stable_rank_down_proj": 155.1376190185547, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4421064257621765, "geo/layer_14/attn_entropy_mean": 6.366761684417725, "geo/layer_14/attn_entropy_std": 0.3573290705680847, "geo/layer_21/stable_rank_q_proj": 48.44464874267578, "geo/layer_21/stable_rank_k_proj": 32.819236755371094, "geo/layer_21/stable_rank_o_proj": 101.76172637939453, "geo/layer_21/stable_rank_gate_proj": 138.08038330078125, "geo/layer_21/stable_rank_down_proj": 133.7745361328125, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1952689290046692, "geo/layer_21/attn_entropy_mean": 5.881237030029297, "geo/layer_21/attn_entropy_std": 0.3527214825153351, "geo/layer_27/stable_rank_q_proj": 47.68557357788086, "geo/layer_27/stable_rank_k_proj": 37.47101593017578, "geo/layer_27/stable_rank_o_proj": 100.97252655029297, "geo/layer_27/stable_rank_gate_proj": 87.46715545654297, "geo/layer_27/stable_rank_down_proj": 126.60559844970703, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07502976804971695, "geo/layer_27/attn_entropy_mean": 4.7550740242004395, "geo/layer_27/attn_entropy_std": 0.45118892192840576, "attnres/final_alpha/block_0": 0.237306147813797, "attnres/block_norm/0": 1.027044653892517, "attnres/final_alpha/block_1": 0.017316080629825592, "attnres/block_norm/1": 5408.87060546875, "attnres/final_alpha/block_2": 0.03988522291183472, "attnres/block_norm/2": 3672.252197265625, "attnres/final_alpha/block_3": 0.029461778700351715, "attnres/block_norm/3": 3516.437744140625, "attnres/final_alpha/block_4": 0.04779960960149765, "attnres/block_norm/4": 2273.82958984375, "attnres/final_alpha/block_5": 0.3633902072906494, "attnres/block_norm/5": 2640.116943359375, "attnres/final_alpha/block_6": 0.2648409605026245, "attnres/block_norm/6": 3048.33935546875, "geo/tier1_time_s": 1.3587532043457031, "geo/step": 4425.0, "geo/rankme_slope": 0.033992853547669066} {"step": 4430, "timestamp": 1778199314.9009755, "train/loss": 2.6084590673446657, "train/z_loss": 0.0018397437292151154, "train/perplexity": 13.57811176729463, "train/grad_norm": 0.2373046875, "optim/muon_lr": 0.01772, "optim/adamw_lr": 0.0005315999999999999, "perf/tokens_per_sec": 1701619.9270137355, "perf/iters_per_sec": 0.8113956103390386, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.232444429397583, "data/tokens_consumed": 9292480512, "data/tokens_consumed_B": 9.292480512, "train/loss_slope": -0.0001526083644216853} {"step": 4440, "timestamp": 1778199325.2545667, "train/loss": 2.6337785482406617, "train/z_loss": 0.0018375736312009395, "train/perplexity": 13.926291776713592, "train/grad_norm": 0.1572265625, "optim/muon_lr": 0.01776, "optim/adamw_lr": 0.0005327999999999999, "perf/tokens_per_sec": 2026550.111210945, "perf/iters_per_sec": 0.966334395986054, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348384618759154, "data/tokens_consumed": 9313452032, "data/tokens_consumed_B": 9.313452032, "train/loss_slope": -0.00014888638432877856} {"step": 4450, "timestamp": 1778199335.6023142, "grad/layer_0/attn": 0.004496717359870672, "grad/layer_0/mlp": 0.007078318856656551, "grad/layer_0/attn_mlp_ratio": 0.6352804087249976, "grad/layer_4/attn": 0.0025965615641325712, "grad/layer_4/mlp": 0.005770950112491846, "grad/layer_4/attn_mlp_ratio": 0.4499365734453978, "grad/layer_8/attn": 0.00989942904561758, "grad/layer_8/mlp": 0.007341756951063871, "grad/layer_8/attn_mlp_ratio": 1.3483732813227707, "grad/layer_12/attn": 0.007128507364541292, "grad/layer_12/mlp": 0.0069360011257231236, "grad/layer_12/attn_mlp_ratio": 1.0277546287195327, "grad/layer_16/attn": 0.008585433475673199, "grad/layer_16/mlp": 0.007308692671358585, "grad/layer_16/attn_mlp_ratio": 1.1746879701001969, "grad/layer_20/attn": 0.017843401059508324, "grad/layer_20/mlp": 0.015505800023674965, "grad/layer_20/attn_mlp_ratio": 1.1507565502707728, "grad/layer_24/attn": 0.023629609495401382, "grad/layer_24/mlp": 0.020550552755594254, "grad/layer_24/attn_mlp_ratio": 1.149828409067299, "grad/layer_27/attn": 0.014244754798710346, "grad/layer_27/mlp": 0.023100079968571663, "grad/layer_27/attn_mlp_ratio": 0.6166539144637317} {"step": 4450, "timestamp": 1778199335.6178632, "train/loss": 2.634483218193054, "train/z_loss": 0.001836405647918582, "train/perplexity": 13.936108674507121, "train/grad_norm": 0.322265625, "optim/muon_lr": 0.0178, "optim/adamw_lr": 0.000534, "perf/tokens_per_sec": 2025624.6225213823, "perf/iters_per_sec": 0.9658930885893737, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0353112697601319, "data/tokens_consumed": 9334423552, "data/tokens_consumed_B": 9.334423552, "train/loss_slope": -0.00014462099905097008} {"step": 4460, "timestamp": 1778199346.3786886, "train/loss": 2.621058940887451, "train/z_loss": 0.0018395818769931793, "train/perplexity": 13.7502766084367, "train/grad_norm": 0.2001953125, "optim/muon_lr": 0.01784, "optim/adamw_lr": 0.0005352, "perf/tokens_per_sec": 1950092.5734821963, "perf/iters_per_sec": 0.9298766009722692, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0754115104675293, "data/tokens_consumed": 9355395072, "data/tokens_consumed_B": 9.355395072, "train/loss_slope": -0.00013729306281668046} {"step": 4470, "timestamp": 1778199357.2418184, "train/loss": 2.601606249809265, "train/z_loss": 0.0018404926406219602, "train/perplexity": 13.48538153943642, "train/grad_norm": 0.291015625, "optim/muon_lr": 0.01788, "optim/adamw_lr": 0.0005363999999999999, "perf/tokens_per_sec": 1931519.78854869, "perf/iters_per_sec": 0.9210204069846583, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.085752272605896, "data/tokens_consumed": 9376366592, "data/tokens_consumed_B": 9.376366592, "train/loss_slope": -0.00013891122695242997} {"step": 4480, "timestamp": 1778199367.5970728, "train/loss": 2.6619101762771606, "train/z_loss": 0.0018351087695918978, "train/perplexity": 14.323623629540068, "train/grad_norm": 0.2890625, "optim/muon_lr": 0.017920000000000002, "optim/adamw_lr": 0.0005376, "perf/tokens_per_sec": 2026280.1382850157, "perf/iters_per_sec": 0.966205662863262, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03497633934021, "data/tokens_consumed": 9397338112, "data/tokens_consumed_B": 9.397338112, "train/loss_slope": -0.0001308585277091551} {"step": 4490, "timestamp": 1778199377.9519224, "train/loss": 2.623462128639221, "train/z_loss": 0.001840326259844005, "train/perplexity": 13.783360842656124, "train/grad_norm": 0.271484375, "optim/muon_lr": 0.01796, "optim/adamw_lr": 0.0005388, "perf/tokens_per_sec": 2026265.995049983, "perf/iters_per_sec": 0.9661989188432613, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349835634231568, "data/tokens_consumed": 9418309632, "data/tokens_consumed_B": 9.418309632, "train/loss_slope": -0.00013144995816434204} {"step": 4500, "timestamp": 1778199388.3027217, "grad/layer_0/attn": 0.004537506960332394, "grad/layer_0/mlp": 0.006537331733852625, "grad/layer_0/attn_mlp_ratio": 0.6940915767554543, "grad/layer_4/attn": 0.0031346529722213745, "grad/layer_4/mlp": 0.005866645835340023, "grad/layer_4/attn_mlp_ratio": 0.5343177356824235, "grad/layer_8/attn": 0.010944614186882973, "grad/layer_8/mlp": 0.007786301430314779, "grad/layer_8/attn_mlp_ratio": 1.4056242420450562, "grad/layer_12/attn": 0.006660963874310255, "grad/layer_12/mlp": 0.007461320608854294, "grad/layer_12/attn_mlp_ratio": 0.8927325515448946, "grad/layer_16/attn": 0.008872426114976406, "grad/layer_16/mlp": 0.006343170069158077, "grad/layer_16/attn_mlp_ratio": 1.3987368899727997, "grad/layer_20/attn": 0.012409145943820477, "grad/layer_20/mlp": 0.013284489512443542, "grad/layer_20/attn_mlp_ratio": 0.9341078434956859, "grad/layer_24/attn": 0.020934129133820534, "grad/layer_24/mlp": 0.014920495450496674, "grad/layer_24/attn_mlp_ratio": 1.403045164483406, "grad/layer_27/attn": 0.005428843200206757, "grad/layer_27/mlp": 0.01563902571797371, "grad/layer_27/attn_mlp_ratio": 0.3471343588401436} {"step": 4500, "timestamp": 1778199388.9045603, "eos/sharpness": 35.57887077331542, "eos/L0_probe": 2.4180753231048584, "eos/L_plus": 2.685408353805542, "eos/L_minus": 2.506531000137329, "eos/grad_norm": 0.22423680126667023, "eos/embed_grad_frac": 0.12495703250169754, "eos/time_s": 0.5989954471588135} {"step": 4500, "timestamp": 1778199388.9254496, "train/loss": 2.657594609260559, "train/z_loss": 0.0018309500184841454, "train/perplexity": 14.261942262612594, "train/grad_norm": 0.224609375, "optim/muon_lr": 0.018000000000000002, "optim/adamw_lr": 0.00054, "perf/tokens_per_sec": 1911991.796652476, "perf/iters_per_sec": 0.9117087348234539, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.096841526031494, "data/tokens_consumed": 9439281152, "data/tokens_consumed_B": 9.439281152, "train/loss_slope": -0.0001295903625196427} {"step": 4500, "timestamp": 1778199390.2869368, "geo/rankme_last": 405.7949523925781, "geo/layer_0/stable_rank_q_proj": 28.638151168823242, "geo/layer_0/stable_rank_k_proj": 20.3581600189209, "geo/layer_0/stable_rank_o_proj": 60.33381271362305, "geo/layer_0/stable_rank_gate_proj": 168.53562927246094, "geo/layer_0/stable_rank_down_proj": 47.45553970336914, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.02687077783048153, "geo/layer_0/attn_entropy_mean": 6.628337860107422, "geo/layer_0/attn_entropy_std": 0.1842549443244934, "geo/layer_7/stable_rank_q_proj": 40.49543762207031, "geo/layer_7/stable_rank_k_proj": 38.88825607299805, "geo/layer_7/stable_rank_o_proj": 102.87397766113281, "geo/layer_7/stable_rank_gate_proj": 168.8582000732422, "geo/layer_7/stable_rank_down_proj": 214.1038818359375, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.574036717414856, "geo/layer_7/attn_entropy_mean": 5.13946533203125, "geo/layer_7/attn_entropy_std": 1.114275574684143, "geo/layer_14/stable_rank_q_proj": 56.98950958251953, "geo/layer_14/stable_rank_k_proj": 40.83149337768555, "geo/layer_14/stable_rank_o_proj": 99.02611541748047, "geo/layer_14/stable_rank_gate_proj": 183.58155822753906, "geo/layer_14/stable_rank_down_proj": 153.9037628173828, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4246891140937805, "geo/layer_14/attn_entropy_mean": 6.35297966003418, "geo/layer_14/attn_entropy_std": 0.38618040084838867, "geo/layer_21/stable_rank_q_proj": 48.649749755859375, "geo/layer_21/stable_rank_k_proj": 32.67603302001953, "geo/layer_21/stable_rank_o_proj": 101.7677001953125, "geo/layer_21/stable_rank_gate_proj": 137.61407470703125, "geo/layer_21/stable_rank_down_proj": 131.94154357910156, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.19629667699337006, "geo/layer_21/attn_entropy_mean": 5.887605667114258, "geo/layer_21/attn_entropy_std": 0.34092408418655396, "geo/layer_27/stable_rank_q_proj": 47.277618408203125, "geo/layer_27/stable_rank_k_proj": 37.97955322265625, "geo/layer_27/stable_rank_o_proj": 101.35233306884766, "geo/layer_27/stable_rank_gate_proj": 87.22144317626953, "geo/layer_27/stable_rank_down_proj": 128.84152221679688, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08045054972171783, "geo/layer_27/attn_entropy_mean": 4.733633995056152, "geo/layer_27/attn_entropy_std": 0.42172032594680786, "attnres/final_alpha/block_0": 0.23553362488746643, "attnres/block_norm/0": 1.0362299680709839, "attnres/final_alpha/block_1": 0.01700339838862419, "attnres/block_norm/1": 5551.74951171875, "attnres/final_alpha/block_2": 0.03897826001048088, "attnres/block_norm/2": 3817.259765625, "attnres/final_alpha/block_3": 0.029296085238456726, "attnres/block_norm/3": 3646.28369140625, "attnres/final_alpha/block_4": 0.04697650671005249, "attnres/block_norm/4": 2326.93408203125, "attnres/final_alpha/block_5": 0.3748067617416382, "attnres/block_norm/5": 2702.689208984375, "attnres/final_alpha/block_6": 0.2574053704738617, "attnres/block_norm/6": 3170.5498046875, "geo/tier1_time_s": 1.3571076393127441, "geo/step": 4500.0, "geo/rankme_slope": 0.03205025838460384} {"step": 4500, "timestamp": 1778199397.268333, "geo/ww_alpha_mean": 8.982573513673028, "geo/ww_alpha_std": 6.26507541734306, "geo/ww_alpha_min": 2.857297461119833, "geo/ww_alpha_max": 51.98132430586397, "geo/ww_alpha_healthy_frac": 0.1116751269035533, "geo/ww_alpha_by_type/q_proj": 5.038012453482703, "geo/ww_alpha_by_type/k_proj": 5.427610266937348, "geo/ww_alpha_by_type/v_proj": 7.966085163949231, "geo/ww_alpha_by_type/o_proj": 7.738197625062669, "geo/ww_alpha_by_type/gate_proj": 13.39305356090442, "geo/ww_alpha_by_type/up_proj": 12.915892762075048, "geo/ww_alpha_by_type/down_proj": 10.612111121687049, "geo/twonn_id/layer_0": 0.7145214080810547, "geo/twonn_id/layer_7": 2.454113721847534, "geo/twonn_id/layer_14": 3.531858205795288, "geo/twonn_id/layer_21": 6.367410659790039, "geo/twonn_id/layer_27": 5.020122051239014, "geo/tier2_time_s": 6.973851919174194} {"step": 4500, "timestamp": 1778199398.029855, "eoc/jacobian_sigma/layer_0/attn": 554.0719604492188, "eoc/jacobian_sigma/layer_0/mlp": 862.2036743164062, "eoc/jacobian_sigma/layer_0": 862.2036743164062, "eoc/jacobian_sigma/layer_7/attn": 1.167536973953247, "eoc/jacobian_sigma/layer_7/mlp": 1.5954183340072632, "eoc/jacobian_sigma/layer_7": 1.5954183340072632, "eoc/jacobian_sigma/layer_14/attn": 1.1740659475326538, "eoc/jacobian_sigma/layer_14/mlp": 5.256354808807373, "eoc/jacobian_sigma/layer_14": 5.256354808807373, "eoc/jacobian_sigma/layer_21/attn": 1.0832834243774414, "eoc/jacobian_sigma/layer_21/mlp": 2.6111974716186523, "eoc/jacobian_sigma/layer_21": 2.6111974716186523, "eoc/jacobian_sigma/layer_27/attn": 1.4725496768951416, "eoc/jacobian_sigma/layer_27/mlp": 2.3913817405700684, "eoc/jacobian_sigma/layer_27": 2.3913817405700684, "eoc/layer0_sigma": 862.2036743164062, "eoc/sigma_max": 5.256354808807373, "eoc/sigma_min": 1.5954183340072632, "eoc/sigma_mean": 2.9635880887508392, "eoc/time_s": 0.7552032470703125} {"step": 4510, "timestamp": 1778199408.3981261, "train/loss": 2.620447206497192, "train/z_loss": 0.0018412001198157668, "train/perplexity": 13.741867663642337, "train/grad_norm": 0.2197265625, "optim/muon_lr": 0.01804, "optim/adamw_lr": 0.0005411999999999999, "perf/tokens_per_sec": 1077207.7847557827, "perf/iters_per_sec": 0.513652698877231, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.9468407392501832, "data/tokens_consumed": 9460252672, "data/tokens_consumed_B": 9.460252672, "train/loss_slope": -0.00012681413442209392} {"step": 4520, "timestamp": 1778199419.2948778, "train/loss": 2.535418653488159, "train/z_loss": 0.0018593226093798875, "train/perplexity": 12.621713866619464, "train/grad_norm": 0.220703125, "optim/muon_lr": 0.018080000000000002, "optim/adamw_lr": 0.0005424, "perf/tokens_per_sec": 1925539.2524901878, "perf/iters_per_sec": 0.9181686651659907, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0891245126724243, "data/tokens_consumed": 9481224192, "data/tokens_consumed_B": 9.481224192, "train/loss_slope": -0.00013071090985517617} {"step": 4530, "timestamp": 1778199430.1003098, "train/loss": 2.5654896974563597, "train/z_loss": 0.0018504774896427989, "train/perplexity": 13.007026318062076, "train/grad_norm": 0.255859375, "optim/muon_lr": 0.01812, "optim/adamw_lr": 0.0005436, "perf/tokens_per_sec": 1942409.2828621713, "perf/iters_per_sec": 0.9262129225073678, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0796653509140015, "data/tokens_consumed": 9502195712, "data/tokens_consumed_B": 9.502195712, "train/loss_slope": -0.00012911266518039744} {"step": 4540, "timestamp": 1778199440.461923, "train/loss": 2.5668370723724365, "train/z_loss": 0.0018532112590037286, "train/perplexity": 13.024563470962637, "train/grad_norm": 0.59375, "optim/muon_lr": 0.018160000000000003, "optim/adamw_lr": 0.0005448, "perf/tokens_per_sec": 2025799.8455857537, "perf/iters_per_sec": 0.9659766414574402, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352217197418212, "data/tokens_consumed": 9523167232, "data/tokens_consumed_B": 9.523167232, "train/loss_slope": -0.00012960301621554673} {"step": 4550, "timestamp": 1778199450.8076406, "grad/layer_0/attn": 0.003957732580602169, "grad/layer_0/mlp": 0.005820315331220627, "grad/layer_0/attn_mlp_ratio": 0.6799859264280734, "grad/layer_4/attn": 0.0026635455433279276, "grad/layer_4/mlp": 0.005232403054833412, "grad/layer_4/attn_mlp_ratio": 0.509048225931805, "grad/layer_8/attn": 0.015117786824703217, "grad/layer_8/mlp": 0.006898573134094477, "grad/layer_8/attn_mlp_ratio": 2.191436737960152, "grad/layer_12/attn": 0.006156053394079208, "grad/layer_12/mlp": 0.00651147123426199, "grad/layer_12/attn_mlp_ratio": 0.9454166467242746, "grad/layer_16/attn": 0.008187945932149887, "grad/layer_16/mlp": 0.006258285138756037, "grad/layer_16/attn_mlp_ratio": 1.3083369676798897, "grad/layer_20/attn": 0.017700202763080597, "grad/layer_20/mlp": 0.013940931297838688, "grad/layer_20/attn_mlp_ratio": 1.269657116727848, "grad/layer_24/attn": 0.01981167495250702, "grad/layer_24/mlp": 0.02125781588256359, "grad/layer_24/attn_mlp_ratio": 0.9319713261586821, "grad/layer_27/attn": 0.014315730892121792, "grad/layer_27/mlp": 0.021784506738185883, "grad/layer_27/attn_mlp_ratio": 0.6571519382310672} {"step": 4550, "timestamp": 1778199450.8235543, "train/loss": 2.617405128479004, "train/z_loss": 0.0018481729784980416, "train/perplexity": 13.700127350927374, "train/grad_norm": 0.30078125, "optim/muon_lr": 0.0182, "optim/adamw_lr": 0.0005459999999999999, "perf/tokens_per_sec": 2025412.5386695773, "perf/iters_per_sec": 0.9657919591281783, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354196786880494, "data/tokens_consumed": 9544138752, "data/tokens_consumed_B": 9.544138752, "train/loss_slope": -0.000126343008837398} {"step": 4560, "timestamp": 1778199461.7076447, "train/loss": 2.589898109436035, "train/z_loss": 0.0018503526225686073, "train/perplexity": 13.32841349444001, "train/grad_norm": 0.2421875, "optim/muon_lr": 0.018240000000000003, "optim/adamw_lr": 0.0005472, "perf/tokens_per_sec": 1928105.8708773693, "perf/iters_per_sec": 0.9193925241839263, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0876747131347657, "data/tokens_consumed": 9565110272, "data/tokens_consumed_B": 9.565110272, "train/loss_slope": -0.00012509969088396724} {"step": 4570, "timestamp": 1778199472.06265, "train/loss": 2.609128499031067, "train/z_loss": 0.0018483318039216102, "train/perplexity": 13.587204428671408, "train/grad_norm": 0.19921875, "optim/muon_lr": 0.01828, "optim/adamw_lr": 0.0005484, "perf/tokens_per_sec": 2026539.0924053118, "perf/iters_per_sec": 0.966329141810089, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348440885543824, "data/tokens_consumed": 9586081792, "data/tokens_consumed_B": 9.586081792, "train/loss_slope": -0.00011946147553312002} {"step": 4575, "timestamp": 1778199477.84666, "eos/sharpness": 12.933206558227537, "eos/L0_probe": 2.4128549098968506, "eos/L_plus": 2.506802797317505, "eos/L_minus": 2.4482390880584717, "eos/grad_norm": 0.19816431403160095, "eos/embed_grad_frac": 0.10805309563875198, "eos/time_s": 0.6140913963317871} {"step": 4575, "timestamp": 1778199479.2256389, "geo/rankme_last": 407.0561828613281, "geo/layer_0/stable_rank_q_proj": 27.87005615234375, "geo/layer_0/stable_rank_k_proj": 19.698726654052734, "geo/layer_0/stable_rank_o_proj": 59.899620056152344, "geo/layer_0/stable_rank_gate_proj": 168.5089874267578, "geo/layer_0/stable_rank_down_proj": 47.429176330566406, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.03222980722784996, "geo/layer_0/attn_entropy_mean": 6.623019218444824, "geo/layer_0/attn_entropy_std": 0.1911441832780838, "geo/layer_7/stable_rank_q_proj": 41.202938079833984, "geo/layer_7/stable_rank_k_proj": 39.234073638916016, "geo/layer_7/stable_rank_o_proj": 102.57421112060547, "geo/layer_7/stable_rank_gate_proj": 167.6138153076172, "geo/layer_7/stable_rank_down_proj": 214.55894470214844, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5747737884521484, "geo/layer_7/attn_entropy_mean": 5.118899345397949, "geo/layer_7/attn_entropy_std": 1.137342095375061, "geo/layer_14/stable_rank_q_proj": 58.51030349731445, "geo/layer_14/stable_rank_k_proj": 42.06413650512695, "geo/layer_14/stable_rank_o_proj": 96.30795288085938, "geo/layer_14/stable_rank_gate_proj": 181.05618286132812, "geo/layer_14/stable_rank_down_proj": 153.8634796142578, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4249960780143738, "geo/layer_14/attn_entropy_mean": 6.348320007324219, "geo/layer_14/attn_entropy_std": 0.4196030795574188, "geo/layer_21/stable_rank_q_proj": 48.76510238647461, "geo/layer_21/stable_rank_k_proj": 32.82204818725586, "geo/layer_21/stable_rank_o_proj": 102.1988296508789, "geo/layer_21/stable_rank_gate_proj": 137.84396362304688, "geo/layer_21/stable_rank_down_proj": 129.33883666992188, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.19271689653396606, "geo/layer_21/attn_entropy_mean": 5.88150691986084, "geo/layer_21/attn_entropy_std": 0.34103208780288696, "geo/layer_27/stable_rank_q_proj": 46.701168060302734, "geo/layer_27/stable_rank_k_proj": 38.421424865722656, "geo/layer_27/stable_rank_o_proj": 101.78761291503906, "geo/layer_27/stable_rank_gate_proj": 87.25943756103516, "geo/layer_27/stable_rank_down_proj": 131.28466796875, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08079877495765686, "geo/layer_27/attn_entropy_mean": 4.743438720703125, "geo/layer_27/attn_entropy_std": 0.44107744097709656, "attnres/final_alpha/block_0": 0.23662933707237244, "attnres/block_norm/0": 1.0453941822052002, "attnres/final_alpha/block_1": 0.0170290507376194, "attnres/block_norm/1": 5729.39501953125, "attnres/final_alpha/block_2": 0.038224875926971436, "attnres/block_norm/2": 3953.088623046875, "attnres/final_alpha/block_3": 0.028461169451475143, "attnres/block_norm/3": 3789.32763671875, "attnres/final_alpha/block_4": 0.04641826078295708, "attnres/block_norm/4": 2372.39013671875, "attnres/final_alpha/block_5": 0.3765695095062256, "attnres/block_norm/5": 2741.806640625, "attnres/final_alpha/block_6": 0.2566677927970886, "attnres/block_norm/6": 3260.35498046875, "geo/tier1_time_s": 1.3589739799499512, "geo/step": 4575.0, "geo/rankme_slope": 0.03046696065144808} {"step": 4580, "timestamp": 1778199484.4666548, "train/loss": 2.6206028699874877, "train/z_loss": 0.0018394694081507624, "train/perplexity": 13.744006937225114, "train/grad_norm": 0.2001953125, "optim/muon_lr": 0.01832, "optim/adamw_lr": 0.0005496, "perf/tokens_per_sec": 1691719.0807869635, "perf/iters_per_sec": 0.8066745189604585, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2396573543548584, "data/tokens_consumed": 9607053312, "data/tokens_consumed_B": 9.607053312, "train/loss_slope": -0.00011770439436941436} {"step": 4590, "timestamp": 1778199494.816044, "train/loss": 2.5737987756729126, "train/z_loss": 0.001854108413681388, "train/perplexity": 13.115552970930898, "train/grad_norm": 0.32421875, "optim/muon_lr": 0.01836, "optim/adamw_lr": 0.0005507999999999999, "perf/tokens_per_sec": 2027434.2471391778, "perf/iters_per_sec": 0.9667559848495377, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034387183189392, "data/tokens_consumed": 9628024832, "data/tokens_consumed_B": 9.628024832, "train/loss_slope": -0.00012147205338762789} {"step": 4600, "timestamp": 1778199505.1695611, "grad/layer_0/attn": 0.003754333360120654, "grad/layer_0/mlp": 0.0058141122572124004, "grad/layer_0/attn_mlp_ratio": 0.645727693147073, "grad/layer_4/attn": 0.002421637298539281, "grad/layer_4/mlp": 0.005047959741204977, "grad/layer_4/attn_mlp_ratio": 0.479725943691585, "grad/layer_8/attn": 0.00814894586801529, "grad/layer_8/mlp": 0.00663464842364192, "grad/layer_8/attn_mlp_ratio": 1.22824077853971, "grad/layer_12/attn": 0.005644797347486019, "grad/layer_12/mlp": 0.0062002157792449, "grad/layer_12/attn_mlp_ratio": 0.9104194849701713, "grad/layer_16/attn": 0.008622836321592331, "grad/layer_16/mlp": 0.005663715768605471, "grad/layer_16/attn_mlp_ratio": 1.5224697922064832, "grad/layer_20/attn": 0.017702078446745872, "grad/layer_20/mlp": 0.013356035575270653, "grad/layer_20/attn_mlp_ratio": 1.3253991586382274, "grad/layer_24/attn": 0.019088642671704292, "grad/layer_24/mlp": 0.01629417948424816, "grad/layer_24/attn_mlp_ratio": 1.1715006928092029, "grad/layer_27/attn": 0.009043014608323574, "grad/layer_27/mlp": 0.016275348141789436, "grad/layer_27/attn_mlp_ratio": 0.5556264894599463} {"step": 4600, "timestamp": 1778199505.1853864, "train/loss": 2.586419606208801, "train/z_loss": 0.0018557764124125242, "train/perplexity": 13.28213110846818, "train/grad_norm": 0.216796875, "optim/muon_lr": 0.0184, "optim/adamw_lr": 0.000552, "perf/tokens_per_sec": 2023493.677686171, "perf/iters_per_sec": 0.9648769749098639, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0364015579223633, "data/tokens_consumed": 9648996352, "data/tokens_consumed_B": 9.648996352, "train/loss_slope": -0.00012111960139819676} {"step": 4610, "timestamp": 1778199515.5445015, "train/loss": 2.5557459592819214, "train/z_loss": 0.0018670055083930492, "train/perplexity": 12.880904705013831, "train/grad_norm": 0.2451171875, "optim/muon_lr": 0.01844, "optim/adamw_lr": 0.0005532, "perf/tokens_per_sec": 2025554.6070846731, "perf/iters_per_sec": 0.965859702627503, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035347056388855, "data/tokens_consumed": 9669967872, "data/tokens_consumed_B": 9.669967872, "train/loss_slope": -0.00012179693025473966} {"step": 4620, "timestamp": 1778199525.8977706, "train/loss": 2.5930306673049928, "train/z_loss": 0.0018544026883319021, "train/perplexity": 13.370230984670817, "train/grad_norm": 0.1904296875, "optim/muon_lr": 0.01848, "optim/adamw_lr": 0.0005544, "perf/tokens_per_sec": 2026881.6652145367, "perf/iters_per_sec": 0.9664924932549175, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346691846847533, "data/tokens_consumed": 9690939392, "data/tokens_consumed_B": 9.690939392, "train/loss_slope": -0.00012083553884467886} {"step": 4630, "timestamp": 1778199536.245989, "train/loss": 2.6216442346572877, "train/z_loss": 0.0018429696443490683, "train/perplexity": 13.758326915336568, "train/grad_norm": 0.171875, "optim/muon_lr": 0.018520000000000002, "optim/adamw_lr": 0.0005556, "perf/tokens_per_sec": 2027657.598506096, "perf/iters_per_sec": 0.9668624870806198, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342732429504395, "data/tokens_consumed": 9711910912, "data/tokens_consumed_B": 9.711910912, "train/loss_slope": -0.00011945712987703495} {"step": 4640, "timestamp": 1778199546.6019316, "train/loss": 2.669452738761902, "train/z_loss": 0.0018393533769994973, "train/perplexity": 14.432068919338224, "train/grad_norm": 0.294921875, "optim/muon_lr": 0.01856, "optim/adamw_lr": 0.0005568, "perf/tokens_per_sec": 2026161.3039090033, "perf/iters_per_sec": 0.9661489982171074, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350370407104492, "data/tokens_consumed": 9732882432, "data/tokens_consumed_B": 9.732882432, "train/loss_slope": -0.00011549174590329668} {"step": 4650, "timestamp": 1778199556.9462678, "grad/layer_0/attn": 0.003529550274834037, "grad/layer_0/mlp": 0.005468288436532021, "grad/layer_0/attn_mlp_ratio": 0.6454579437888389, "grad/layer_4/attn": 0.002559639746323228, "grad/layer_4/mlp": 0.005287761799991131, "grad/layer_4/attn_mlp_ratio": 0.4840686465719119, "grad/layer_8/attn": 0.011294550262391567, "grad/layer_8/mlp": 0.007145911920815706, "grad/layer_8/attn_mlp_ratio": 1.5805610577755609, "grad/layer_12/attn": 0.0067934817634522915, "grad/layer_12/mlp": 0.006709324661642313, "grad/layer_12/attn_mlp_ratio": 1.0125432893472546, "grad/layer_16/attn": 0.007664275821298361, "grad/layer_16/mlp": 0.005804470740258694, "grad/layer_16/attn_mlp_ratio": 1.3204090488559987, "grad/layer_20/attn": 0.010458783246576786, "grad/layer_20/mlp": 0.011109404265880585, "grad/layer_20/attn_mlp_ratio": 0.9414351032804245, "grad/layer_24/attn": 0.01094126608222723, "grad/layer_24/mlp": 0.014637180604040623, "grad/layer_24/attn_mlp_ratio": 0.7474981899490297, "grad/layer_27/attn": 0.009743698872625828, "grad/layer_27/mlp": 0.01478410605341196, "grad/layer_27/attn_mlp_ratio": 0.659065808342909} {"step": 4650, "timestamp": 1778199557.5508828, "eos/sharpness": 20.070862770080563, "eos/L0_probe": 2.4057931900024414, "eos/L_plus": 2.5096077919006348, "eos/L_minus": 2.5026872158050537, "eos/grad_norm": 0.17362521588802338, "eos/embed_grad_frac": 0.14282719790935516, "eos/time_s": 0.601806640625} {"step": 4650, "timestamp": 1778199557.571447, "train/loss": 2.599418354034424, "train/z_loss": 0.0018584183533675968, "train/perplexity": 13.45590918312283, "train/grad_norm": 0.173828125, "optim/muon_lr": 0.018600000000000002, "optim/adamw_lr": 0.000558, "perf/tokens_per_sec": 1912826.2827658828, "perf/iters_per_sec": 0.9121066488103308, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0963630199432373, "data/tokens_consumed": 9753853952, "data/tokens_consumed_B": 9.753853952, "train/loss_slope": -0.00011333328656809771} {"step": 4650, "timestamp": 1778199558.931225, "geo/rankme_last": 409.9548645019531, "geo/layer_0/stable_rank_q_proj": 27.08810043334961, "geo/layer_0/stable_rank_k_proj": 18.931116104125977, "geo/layer_0/stable_rank_o_proj": 59.86365509033203, "geo/layer_0/stable_rank_gate_proj": 168.3433380126953, "geo/layer_0/stable_rank_down_proj": 47.173255920410156, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.024403469637036324, "geo/layer_0/attn_entropy_mean": 6.618779182434082, "geo/layer_0/attn_entropy_std": 0.19168290495872498, "geo/layer_7/stable_rank_q_proj": 41.785133361816406, "geo/layer_7/stable_rank_k_proj": 39.28793716430664, "geo/layer_7/stable_rank_o_proj": 102.62126159667969, "geo/layer_7/stable_rank_gate_proj": 166.3279266357422, "geo/layer_7/stable_rank_down_proj": 213.572021484375, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5692216157913208, "geo/layer_7/attn_entropy_mean": 5.056687831878662, "geo/layer_7/attn_entropy_std": 1.127943992614746, "geo/layer_14/stable_rank_q_proj": 60.0794563293457, "geo/layer_14/stable_rank_k_proj": 43.302703857421875, "geo/layer_14/stable_rank_o_proj": 93.54778289794922, "geo/layer_14/stable_rank_gate_proj": 179.740234375, "geo/layer_14/stable_rank_down_proj": 154.0023956298828, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.42148858308792114, "geo/layer_14/attn_entropy_mean": 6.285759925842285, "geo/layer_14/attn_entropy_std": 0.42802125215530396, "geo/layer_21/stable_rank_q_proj": 48.94982147216797, "geo/layer_21/stable_rank_k_proj": 32.71670150756836, "geo/layer_21/stable_rank_o_proj": 102.6304931640625, "geo/layer_21/stable_rank_gate_proj": 136.58590698242188, "geo/layer_21/stable_rank_down_proj": 127.07825469970703, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1895875334739685, "geo/layer_21/attn_entropy_mean": 5.898465156555176, "geo/layer_21/attn_entropy_std": 0.3442060351371765, "geo/layer_27/stable_rank_q_proj": 46.31951904296875, "geo/layer_27/stable_rank_k_proj": 38.60505676269531, "geo/layer_27/stable_rank_o_proj": 102.09347534179688, "geo/layer_27/stable_rank_gate_proj": 87.72669982910156, "geo/layer_27/stable_rank_down_proj": 133.53225708007812, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07808198779821396, "geo/layer_27/attn_entropy_mean": 4.7905073165893555, "geo/layer_27/attn_entropy_std": 0.42988505959510803, "attnres/final_alpha/block_0": 0.2368251383304596, "attnres/block_norm/0": 1.0544257164001465, "attnres/final_alpha/block_1": 0.017344657331705093, "attnres/block_norm/1": 5906.822265625, "attnres/final_alpha/block_2": 0.0380987748503685, "attnres/block_norm/2": 4078.490234375, "attnres/final_alpha/block_3": 0.028803061693906784, "attnres/block_norm/3": 3887.1318359375, "attnres/final_alpha/block_4": 0.047017499804496765, "attnres/block_norm/4": 2439.101806640625, "attnres/final_alpha/block_5": 0.3771282434463501, "attnres/block_norm/5": 2781.12939453125, "attnres/final_alpha/block_6": 0.25478261709213257, "attnres/block_norm/6": 3362.061279296875, "geo/tier1_time_s": 1.3559484481811523, "geo/step": 4650.0, "geo/rankme_slope": 0.02925699404761905} {"step": 4660, "timestamp": 1778199569.2843068, "train/loss": 2.641233515739441, "train/z_loss": 0.0018356023239903152, "train/perplexity": 14.030499780301193, "train/grad_norm": 0.1845703125, "optim/muon_lr": 0.01864, "optim/adamw_lr": 0.0005591999999999999, "perf/tokens_per_sec": 1791040.4499658514, "perf/iters_per_sec": 0.8540346383885629, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1709126949310302, "data/tokens_consumed": 9774825472, "data/tokens_consumed_B": 9.774825472, "train/loss_slope": -0.00010878011472869696} {"step": 4670, "timestamp": 1778199579.644196, "train/loss": 2.538584566116333, "train/z_loss": 0.0018653562758117915, "train/perplexity": 12.661736430480314, "train/grad_norm": 0.203125, "optim/muon_lr": 0.018680000000000002, "optim/adamw_lr": 0.0005604, "perf/tokens_per_sec": 2026061.3370011505, "perf/iters_per_sec": 0.9661013302808525, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350881099700928, "data/tokens_consumed": 9795796992, "data/tokens_consumed_B": 9.795796992, "train/loss_slope": -0.00010944690353835338} {"step": 4680, "timestamp": 1778199589.9971707, "train/loss": 2.556530809402466, "train/z_loss": 0.0018688899232074617, "train/perplexity": 12.891018252914776, "train/grad_norm": 0.134765625, "optim/muon_lr": 0.01872, "optim/adamw_lr": 0.0005616, "perf/tokens_per_sec": 2027072.054253793, "perf/iters_per_sec": 0.9665832778233495, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345720052719116, "data/tokens_consumed": 9816768512, "data/tokens_consumed_B": 9.816768512, "train/loss_slope": -0.00010900290789920358} {"step": 4690, "timestamp": 1778199600.3508315, "train/loss": 2.550245213508606, "train/z_loss": 0.0018677816493436693, "train/perplexity": 12.810244642591561, "train/grad_norm": 0.32421875, "optim/muon_lr": 0.01876, "optim/adamw_lr": 0.0005627999999999999, "perf/tokens_per_sec": 2026716.9020038133, "perf/iters_per_sec": 0.9664139280337397, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347532987594605, "data/tokens_consumed": 9837740032, "data/tokens_consumed_B": 9.837740032, "train/loss_slope": -0.00010936803378538077} {"step": 4700, "timestamp": 1778199610.6897407, "grad/layer_0/attn": 0.003479286329820752, "grad/layer_0/mlp": 0.005308416672050953, "grad/layer_0/attn_mlp_ratio": 0.6554282527587788, "grad/layer_4/attn": 0.002481478499248624, "grad/layer_4/mlp": 0.004888670053333044, "grad/layer_4/attn_mlp_ratio": 0.5075978581939667, "grad/layer_8/attn": 0.00890498235821724, "grad/layer_8/mlp": 0.006303959060460329, "grad/layer_8/attn_mlp_ratio": 1.4126015304907176, "grad/layer_12/attn": 0.0053105284459888935, "grad/layer_12/mlp": 0.006148683372884989, "grad/layer_12/attn_mlp_ratio": 0.8636854489920865, "grad/layer_16/attn": 0.006439204327762127, "grad/layer_16/mlp": 0.005536656826734543, "grad/layer_16/attn_mlp_ratio": 1.1630130623895991, "grad/layer_20/attn": 0.011650613509118557, "grad/layer_20/mlp": 0.012200804427266121, "grad/layer_20/attn_mlp_ratio": 0.9549053493220052, "grad/layer_24/attn": 0.010515368543565273, "grad/layer_24/mlp": 0.012906813994050026, "grad/layer_24/attn_mlp_ratio": 0.8147144963072493, "grad/layer_27/attn": 0.007796190213412046, "grad/layer_27/mlp": 0.012518293224275112, "grad/layer_27/attn_mlp_ratio": 0.6227837942009156} {"step": 4700, "timestamp": 1778199610.705337, "train/loss": 2.5692289590835573, "train/z_loss": 0.001863127073738724, "train/perplexity": 13.0557540385286, "train/grad_norm": 0.185546875, "optim/muon_lr": 0.0188, "optim/adamw_lr": 0.0005639999999999999, "perf/tokens_per_sec": 2026403.188092874, "perf/iters_per_sec": 0.9662643375839586, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349134922027587, "data/tokens_consumed": 9858711552, "data/tokens_consumed_B": 9.858711552, "train/loss_slope": -0.00011216506777745338} {"step": 4710, "timestamp": 1778199621.053903, "train/loss": 2.597460412979126, "train/z_loss": 0.001858183927834034, "train/perplexity": 13.429589081109237, "train/grad_norm": 0.2470703125, "optim/muon_lr": 0.01884, "optim/adamw_lr": 0.0005652, "perf/tokens_per_sec": 2027837.4276707019, "perf/iters_per_sec": 0.9669482363084325, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341815233230591, "data/tokens_consumed": 9879683072, "data/tokens_consumed_B": 9.879683072, "train/loss_slope": -0.00010865303592355695} {"step": 4720, "timestamp": 1778199631.3996398, "train/loss": 2.561163377761841, "train/z_loss": 0.001866551220882684, "train/perplexity": 12.950875315111784, "train/grad_norm": 0.32421875, "optim/muon_lr": 0.01888, "optim/adamw_lr": 0.0005663999999999999, "perf/tokens_per_sec": 2028015.3249124708, "perf/iters_per_sec": 0.9670330643236498, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034090805053711, "data/tokens_consumed": 9900654592, "data/tokens_consumed_B": 9.900654592, "train/loss_slope": -0.00010972402778932219} {"step": 4725, "timestamp": 1778199637.1691496, "eos/sharpness": 24.064111709594723, "eos/L0_probe": 2.3961567878723145, "eos/L_plus": 2.539367437362671, "eos/L_minus": 2.4935872554779053, "eos/grad_norm": 0.2745658755302429, "eos/embed_grad_frac": 0.09873919934034348, "eos/time_s": 0.6021230220794678} {"step": 4725, "timestamp": 1778199638.5454102, "geo/rankme_last": 410.2648010253906, "geo/layer_0/stable_rank_q_proj": 26.284183502197266, "geo/layer_0/stable_rank_k_proj": 18.30127716064453, "geo/layer_0/stable_rank_o_proj": 60.2452278137207, "geo/layer_0/stable_rank_gate_proj": 168.1129608154297, "geo/layer_0/stable_rank_down_proj": 47.181941986083984, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.03645601496100426, "geo/layer_0/attn_entropy_mean": 6.616110324859619, "geo/layer_0/attn_entropy_std": 0.19059360027313232, "geo/layer_7/stable_rank_q_proj": 42.4615592956543, "geo/layer_7/stable_rank_k_proj": 39.33042526245117, "geo/layer_7/stable_rank_o_proj": 102.85103607177734, "geo/layer_7/stable_rank_gate_proj": 165.79214477539062, "geo/layer_7/stable_rank_down_proj": 212.82814025878906, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5818940997123718, "geo/layer_7/attn_entropy_mean": 5.034316062927246, "geo/layer_7/attn_entropy_std": 1.1283152103424072, "geo/layer_14/stable_rank_q_proj": 61.616249084472656, "geo/layer_14/stable_rank_k_proj": 44.50148391723633, "geo/layer_14/stable_rank_o_proj": 90.6947250366211, "geo/layer_14/stable_rank_gate_proj": 177.41481018066406, "geo/layer_14/stable_rank_down_proj": 154.24053955078125, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.42809244990348816, "geo/layer_14/attn_entropy_mean": 6.3096113204956055, "geo/layer_14/attn_entropy_std": 0.40538477897644043, "geo/layer_21/stable_rank_q_proj": 49.3928108215332, "geo/layer_21/stable_rank_k_proj": 32.545291900634766, "geo/layer_21/stable_rank_o_proj": 102.55821990966797, "geo/layer_21/stable_rank_gate_proj": 136.83096313476562, "geo/layer_21/stable_rank_down_proj": 124.47237396240234, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.18550026416778564, "geo/layer_21/attn_entropy_mean": 5.884881973266602, "geo/layer_21/attn_entropy_std": 0.3407948613166809, "geo/layer_27/stable_rank_q_proj": 46.124027252197266, "geo/layer_27/stable_rank_k_proj": 38.730064392089844, "geo/layer_27/stable_rank_o_proj": 102.5882339477539, "geo/layer_27/stable_rank_gate_proj": 87.58328247070312, "geo/layer_27/stable_rank_down_proj": 135.9867401123047, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07460804283618927, "geo/layer_27/attn_entropy_mean": 4.6914873123168945, "geo/layer_27/attn_entropy_std": 0.42598405480384827, "attnres/final_alpha/block_0": 0.23766925930976868, "attnres/block_norm/0": 1.0635764598846436, "attnres/final_alpha/block_1": 0.017024129629135132, "attnres/block_norm/1": 6087.12890625, "attnres/final_alpha/block_2": 0.03693100064992905, "attnres/block_norm/2": 4226.2333984375, "attnres/final_alpha/block_3": 0.02846645563840866, "attnres/block_norm/3": 4073.513671875, "attnres/final_alpha/block_4": 0.046033475548028946, "attnres/block_norm/4": 2494.37060546875, "attnres/final_alpha/block_5": 0.38212788105010986, "attnres/block_norm/5": 2807.23095703125, "attnres/final_alpha/block_6": 0.2517477869987488, "attnres/block_norm/6": 3467.532470703125, "geo/tier1_time_s": 1.3560304641723633, "geo/step": 4725.0, "geo/rankme_slope": 0.0283192113954957} {"step": 4730, "timestamp": 1778199643.7302797, "train/loss": 2.5359185934066772, "train/z_loss": 0.0018702735309489072, "train/perplexity": 12.628025542819483, "train/grad_norm": 0.271484375, "optim/muon_lr": 0.01892, "optim/adamw_lr": 0.0005675999999999999, "perf/tokens_per_sec": 1701695.4775034504, "perf/iters_per_sec": 0.8114316356198551, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.232389712333679, "data/tokens_consumed": 9921626112, "data/tokens_consumed_B": 9.921626112, "train/loss_slope": -0.00011042399918607429} {"step": 4740, "timestamp": 1778199654.0820646, "train/loss": 2.6062710523605346, "train/z_loss": 0.0018588868668302893, "train/perplexity": 13.54843513359071, "train/grad_norm": 0.345703125, "optim/muon_lr": 0.01896, "optim/adamw_lr": 0.0005688, "perf/tokens_per_sec": 2026951.3987563462, "perf/iters_per_sec": 0.9665257447988254, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346335887908935, "data/tokens_consumed": 9942597632, "data/tokens_consumed_B": 9.942597632, "train/loss_slope": -0.00010859321500673473} {"step": 4750, "timestamp": 1778199664.426367, "grad/layer_0/attn": 0.003168749623000622, "grad/layer_0/mlp": 0.004874464590102434, "grad/layer_0/attn_mlp_ratio": 0.6500713051496189, "grad/layer_4/attn": 0.002291465410962701, "grad/layer_4/mlp": 0.0049036964774131775, "grad/layer_4/attn_mlp_ratio": 0.4672934743795886, "grad/layer_8/attn": 0.010086330585181713, "grad/layer_8/mlp": 0.006510531529784203, "grad/layer_8/attn_mlp_ratio": 1.5492330210084602, "grad/layer_12/attn": 0.005347030237317085, "grad/layer_12/mlp": 0.006363155785948038, "grad/layer_12/attn_mlp_ratio": 0.8403110552619187, "grad/layer_16/attn": 0.006628443021327257, "grad/layer_16/mlp": 0.005628500133752823, "grad/layer_16/attn_mlp_ratio": 1.1776570571282927, "grad/layer_20/attn": 0.012102828361093998, "grad/layer_20/mlp": 0.01155891828238964, "grad/layer_20/attn_mlp_ratio": 1.0470554389875286, "grad/layer_24/attn": 0.019099215045571327, "grad/layer_24/mlp": 0.017038919031620026, "grad/layer_24/attn_mlp_ratio": 1.1209170545406193, "grad/layer_27/attn": 0.012018661014735699, "grad/layer_27/mlp": 0.018939834088087082, "grad/layer_27/attn_mlp_ratio": 0.6345705509024617} {"step": 4750, "timestamp": 1778199664.442233, "train/loss": 2.5208747148513795, "train/z_loss": 0.0018781504477374256, "train/perplexity": 12.439472897769294, "train/grad_norm": 0.2431640625, "optim/muon_lr": 0.019, "optim/adamw_lr": 0.00057, "perf/tokens_per_sec": 2025281.355301371, "perf/iters_per_sec": 0.9657294060236793, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354867458343506, "data/tokens_consumed": 9963569152, "data/tokens_consumed_B": 9.963569152, "train/loss_slope": -0.0001102597300440493} {"step": 4760, "timestamp": 1778199674.792533, "train/loss": 2.562544655799866, "train/z_loss": 0.0018727546674199402, "train/perplexity": 12.968776435123539, "train/grad_norm": 0.294921875, "optim/muon_lr": 0.019039999999999998, "optim/adamw_lr": 0.0005711999999999999, "perf/tokens_per_sec": 2027626.6095949754, "perf/iters_per_sec": 0.9668477104163052, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034289050102234, "data/tokens_consumed": 9984540672, "data/tokens_consumed_B": 9.984540672, "train/loss_slope": -0.00010945110740226698} {"step": 4770, "timestamp": 1778199685.1422358, "train/loss": 2.5678035974502564, "train/z_loss": 0.0018671610509045423, "train/perplexity": 13.03715812372839, "train/grad_norm": 0.1650390625, "optim/muon_lr": 0.01908, "optim/adamw_lr": 0.0005723999999999999, "perf/tokens_per_sec": 2027425.6954284708, "perf/iters_per_sec": 0.9667519070761065, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343915462493896, "data/tokens_consumed": 10005512192, "data/tokens_consumed_B": 10.005512192, "train/loss_slope": -0.0001078011360105508} {"step": 4780, "timestamp": 1778199695.4996114, "train/loss": 2.511804223060608, "train/z_loss": 0.0018842132412828505, "train/perplexity": 12.327150938767419, "train/grad_norm": 0.287109375, "optim/muon_lr": 0.019119999999999998, "optim/adamw_lr": 0.0005736, "perf/tokens_per_sec": 2025768.6335079914, "perf/iters_per_sec": 0.9659617583789785, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352376699447632, "data/tokens_consumed": 10026483712, "data/tokens_consumed_B": 10.026483712, "train/loss_slope": -0.0001087319062630979} {"step": 4790, "timestamp": 1778199705.8494382, "train/loss": 2.5225744962692263, "train/z_loss": 0.0018806690815836192, "train/perplexity": 12.46063526325124, "train/grad_norm": 0.25390625, "optim/muon_lr": 0.01916, "optim/adamw_lr": 0.0005747999999999999, "perf/tokens_per_sec": 2028010.6024010829, "perf/iters_per_sec": 0.96703081245474, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340932130813598, "data/tokens_consumed": 10047455232, "data/tokens_consumed_B": 10.047455232, "train/loss_slope": -0.00011176935665749799} {"step": 4800, "timestamp": 1778199716.1878748, "grad/layer_0/attn": 0.003992744255810976, "grad/layer_0/mlp": 0.005085291340947151, "grad/layer_0/attn_mlp_ratio": 0.7851554433362654, "grad/layer_4/attn": 0.0025117511395365, "grad/layer_4/mlp": 0.004711721558123827, "grad/layer_4/attn_mlp_ratio": 0.5330856365858974, "grad/layer_8/attn": 0.011569909751415253, "grad/layer_8/mlp": 0.006071037612855434, "grad/layer_8/attn_mlp_ratio": 1.9057548805727145, "grad/layer_12/attn": 0.007796441670507193, "grad/layer_12/mlp": 0.006103166379034519, "grad/layer_12/attn_mlp_ratio": 1.2774420781883273, "grad/layer_16/attn": 0.0068025230430066586, "grad/layer_16/mlp": 0.005436317529529333, "grad/layer_16/attn_mlp_ratio": 1.2513108148898964, "grad/layer_20/attn": 0.014747527427971363, "grad/layer_20/mlp": 0.01076752133667469, "grad/layer_20/attn_mlp_ratio": 1.369630654064972, "grad/layer_24/attn": 0.034088052809238434, "grad/layer_24/mlp": 0.016912491992115974, "grad/layer_24/attn_mlp_ratio": 2.0155546931566115, "grad/layer_27/attn": 0.0209446232765913, "grad/layer_27/mlp": 0.017353413626551628, "grad/layer_27/attn_mlp_ratio": 1.2069454233402466} {"step": 4800, "timestamp": 1778199716.7871923, "eos/sharpness": 44.41020488739013, "eos/L0_probe": 2.3893868923187256, "eos/L_plus": 2.7223188877105713, "eos/L_minus": 2.5005569458007812, "eos/grad_norm": 0.26042231917381287, "eos/embed_grad_frac": 0.08612912893295288, "eos/time_s": 0.5966641902923584} {"step": 4800, "timestamp": 1778199716.8073728, "train/loss": 2.5623128414154053, "train/z_loss": 0.0018708692397922278, "train/perplexity": 12.965770434627562, "train/grad_norm": 0.259765625, "optim/muon_lr": 0.0192, "optim/adamw_lr": 0.0005759999999999999, "perf/tokens_per_sec": 1914599.2048376596, "perf/iters_per_sec": 0.9129520439327524, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.095347785949707, "data/tokens_consumed": 10068426752, "data/tokens_consumed_B": 10.068426752, "train/loss_slope": -0.0001105311785594071} {"step": 4800, "timestamp": 1778199718.166369, "geo/rankme_last": 412.0001525878906, "geo/layer_0/stable_rank_q_proj": 25.5601806640625, "geo/layer_0/stable_rank_k_proj": 17.672008514404297, "geo/layer_0/stable_rank_o_proj": 59.698909759521484, "geo/layer_0/stable_rank_gate_proj": 169.83334350585938, "geo/layer_0/stable_rank_down_proj": 47.00139617919922, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.03480597957968712, "geo/layer_0/attn_entropy_mean": 6.609493732452393, "geo/layer_0/attn_entropy_std": 0.19256551563739777, "geo/layer_7/stable_rank_q_proj": 43.37303924560547, "geo/layer_7/stable_rank_k_proj": 39.733463287353516, "geo/layer_7/stable_rank_o_proj": 102.4012451171875, "geo/layer_7/stable_rank_gate_proj": 164.0820770263672, "geo/layer_7/stable_rank_down_proj": 211.1699981689453, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5716508030891418, "geo/layer_7/attn_entropy_mean": 4.982586860656738, "geo/layer_7/attn_entropy_std": 1.0855404138565063, "geo/layer_14/stable_rank_q_proj": 63.161903381347656, "geo/layer_14/stable_rank_k_proj": 45.88456726074219, "geo/layer_14/stable_rank_o_proj": 87.79389190673828, "geo/layer_14/stable_rank_gate_proj": 174.72361755371094, "geo/layer_14/stable_rank_down_proj": 153.89569091796875, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4286644756793976, "geo/layer_14/attn_entropy_mean": 6.226611137390137, "geo/layer_14/attn_entropy_std": 0.45014235377311707, "geo/layer_21/stable_rank_q_proj": 49.54732894897461, "geo/layer_21/stable_rank_k_proj": 32.48577880859375, "geo/layer_21/stable_rank_o_proj": 102.75629425048828, "geo/layer_21/stable_rank_gate_proj": 136.72576904296875, "geo/layer_21/stable_rank_down_proj": 122.69139099121094, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1843753159046173, "geo/layer_21/attn_entropy_mean": 5.82907772064209, "geo/layer_21/attn_entropy_std": 0.3376697599887848, "geo/layer_27/stable_rank_q_proj": 45.85404968261719, "geo/layer_27/stable_rank_k_proj": 38.955135345458984, "geo/layer_27/stable_rank_o_proj": 102.30528259277344, "geo/layer_27/stable_rank_gate_proj": 87.91838836669922, "geo/layer_27/stable_rank_down_proj": 138.38937377929688, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07625386863946915, "geo/layer_27/attn_entropy_mean": 4.701783180236816, "geo/layer_27/attn_entropy_std": 0.45178914070129395, "attnres/final_alpha/block_0": 0.23788584768772125, "attnres/block_norm/0": 1.0727450847625732, "attnres/final_alpha/block_1": 0.01669900491833687, "attnres/block_norm/1": 6257.4990234375, "attnres/final_alpha/block_2": 0.036909084767103195, "attnres/block_norm/2": 4367.28466796875, "attnres/final_alpha/block_3": 0.02865227870643139, "attnres/block_norm/3": 4177.7060546875, "attnres/final_alpha/block_4": 0.046774063259363174, "attnres/block_norm/4": 2521.02587890625, "attnres/final_alpha/block_5": 0.38554221391677856, "attnres/block_norm/5": 2826.315673828125, "attnres/final_alpha/block_6": 0.2475375086069107, "attnres/block_norm/6": 3537.61376953125, "geo/tier1_time_s": 1.3549914360046387, "geo/step": 4800.0, "geo/rankme_slope": 0.027603289225846588} {"step": 4810, "timestamp": 1778199728.9225316, "train/loss": 2.5523932695388796, "train/z_loss": 0.0018714090925641357, "train/perplexity": 12.837791341179333, "train/grad_norm": 0.13671875, "optim/muon_lr": 0.01924, "optim/adamw_lr": 0.0005771999999999999, "perf/tokens_per_sec": 1731589.6031561843, "perf/iters_per_sec": 0.8256862655430719, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2111137628555297, "data/tokens_consumed": 10089398272, "data/tokens_consumed_B": 10.089398272, "train/loss_slope": -0.00011398736298209436} {"step": 4820, "timestamp": 1778199739.2784443, "train/loss": 2.557852339744568, "train/z_loss": 0.0018734166398644448, "train/perplexity": 12.908065386347648, "train/grad_norm": 0.275390625, "optim/muon_lr": 0.01928, "optim/adamw_lr": 0.0005784, "perf/tokens_per_sec": 2026669.5048513522, "perf/iters_per_sec": 0.9663913273102532, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347774982452393, "data/tokens_consumed": 10110369792, "data/tokens_consumed_B": 10.110369792, "train/loss_slope": -0.0001159289862349195} {"step": 4830, "timestamp": 1778199749.639269, "train/loss": 2.4965751647949217, "train/z_loss": 0.0018870956380851566, "train/perplexity": 12.140842292264162, "train/grad_norm": 0.26953125, "optim/muon_lr": 0.01932, "optim/adamw_lr": 0.0005795999999999999, "perf/tokens_per_sec": 2025489.7736548325, "perf/iters_per_sec": 0.9658287876390612, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03538019657135, "data/tokens_consumed": 10131341312, "data/tokens_consumed_B": 10.131341312, "train/loss_slope": -0.00012128035919894475} {"step": 4840, "timestamp": 1778199759.9979773, "train/loss": 2.539619517326355, "train/z_loss": 0.0018775235046632587, "train/perplexity": 12.674847493404942, "train/grad_norm": 0.2431640625, "optim/muon_lr": 0.01936, "optim/adamw_lr": 0.0005807999999999999, "perf/tokens_per_sec": 2026052.0968606658, "perf/iters_per_sec": 0.9660969242385224, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035092830657959, "data/tokens_consumed": 10152312832, "data/tokens_consumed_B": 10.152312832, "train/loss_slope": -0.00012135066863047688} {"step": 4850, "timestamp": 1778199770.3485174, "grad/layer_0/attn": 0.004073374904692173, "grad/layer_0/mlp": 0.00533172907307744, "grad/layer_0/attn_mlp_ratio": 0.7639875868528122, "grad/layer_4/attn": 0.002252759877592325, "grad/layer_4/mlp": 0.004641992971301079, "grad/layer_4/attn_mlp_ratio": 0.4853001378911821, "grad/layer_8/attn": 0.008881193585693836, "grad/layer_8/mlp": 0.00644374405965209, "grad/layer_8/attn_mlp_ratio": 1.3782660151692536, "grad/layer_12/attn": 0.006002054084092379, "grad/layer_12/mlp": 0.006495047360658646, "grad/layer_12/attn_mlp_ratio": 0.924097032461673, "grad/layer_16/attn": 0.0071817487478256226, "grad/layer_16/mlp": 0.006125809624791145, "grad/layer_16/attn_mlp_ratio": 1.1723754198177416, "grad/layer_20/attn": 0.013821989297866821, "grad/layer_20/mlp": 0.011735393665730953, "grad/layer_20/attn_mlp_ratio": 1.1778036232775613, "grad/layer_24/attn": 0.02741537243127823, "grad/layer_24/mlp": 0.015635931864380836, "grad/layer_24/attn_mlp_ratio": 1.7533571068057434, "grad/layer_27/attn": 0.0134242819622159, "grad/layer_27/mlp": 0.017245644703507423, "grad/layer_27/attn_mlp_ratio": 0.7784157748329402} {"step": 4850, "timestamp": 1778199770.364435, "train/loss": 2.5582927942276, "train/z_loss": 0.00186937905382365, "train/perplexity": 12.913752053881502, "train/grad_norm": 0.306640625, "optim/muon_lr": 0.0194, "optim/adamw_lr": 0.0005819999999999999, "perf/tokens_per_sec": 2024199.0534801725, "perf/iters_per_sec": 0.9652133242989409, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0360404014587403, "data/tokens_consumed": 10173284352, "data/tokens_consumed_B": 10.173284352, "train/loss_slope": -0.00012294559432978817} {"step": 4860, "timestamp": 1778199780.7354999, "train/loss": 2.5799936771392824, "train/z_loss": 0.001857859850861132, "train/perplexity": 13.197054716255703, "train/grad_norm": 0.2490234375, "optim/muon_lr": 0.01944, "optim/adamw_lr": 0.0005832, "perf/tokens_per_sec": 2023574.4906351545, "perf/iters_per_sec": 0.9649155095268033, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0363601684570312, "data/tokens_consumed": 10194255872, "data/tokens_consumed_B": 10.194255872, "train/loss_slope": -0.00012097426300609639} {"step": 4870, "timestamp": 1778199791.099179, "train/loss": 2.533505988121033, "train/z_loss": 0.001877789048012346, "train/perplexity": 12.597595823858125, "train/grad_norm": 0.259765625, "optim/muon_lr": 0.01948, "optim/adamw_lr": 0.0005843999999999999, "perf/tokens_per_sec": 2025179.9831374928, "perf/iters_per_sec": 0.9656810680091347, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0355385780334472, "data/tokens_consumed": 10215227392, "data/tokens_consumed_B": 10.215227392, "train/loss_slope": -0.00011980505275754926} {"step": 4875, "timestamp": 1778199796.878076, "eos/sharpness": 33.650422096252434, "eos/L0_probe": 2.3858020305633545, "eos/L_plus": 2.636861801147461, "eos/L_minus": 2.4712464809417725, "eos/grad_norm": 0.1960785835981369, "eos/embed_grad_frac": 0.10885605216026306, "eos/time_s": 0.605790376663208} {"step": 4875, "timestamp": 1778199798.2559552, "geo/rankme_last": 412.5809020996094, "geo/layer_0/stable_rank_q_proj": 25.010435104370117, "geo/layer_0/stable_rank_k_proj": 17.160539627075195, "geo/layer_0/stable_rank_o_proj": 59.32289123535156, "geo/layer_0/stable_rank_gate_proj": 169.37591552734375, "geo/layer_0/stable_rank_down_proj": 47.2874870300293, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.02364201657474041, "geo/layer_0/attn_entropy_mean": 6.606865882873535, "geo/layer_0/attn_entropy_std": 0.20120900869369507, "geo/layer_7/stable_rank_q_proj": 44.04127883911133, "geo/layer_7/stable_rank_k_proj": 40.05552673339844, "geo/layer_7/stable_rank_o_proj": 102.96722412109375, "geo/layer_7/stable_rank_gate_proj": 163.04238891601562, "geo/layer_7/stable_rank_down_proj": 210.9038848876953, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5852206945419312, "geo/layer_7/attn_entropy_mean": 5.01458740234375, "geo/layer_7/attn_entropy_std": 1.046722412109375, "geo/layer_14/stable_rank_q_proj": 64.53067016601562, "geo/layer_14/stable_rank_k_proj": 47.39707946777344, "geo/layer_14/stable_rank_o_proj": 84.54531860351562, "geo/layer_14/stable_rank_gate_proj": 173.6485595703125, "geo/layer_14/stable_rank_down_proj": 153.05166625976562, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4247373640537262, "geo/layer_14/attn_entropy_mean": 6.2368483543396, "geo/layer_14/attn_entropy_std": 0.41978323459625244, "geo/layer_21/stable_rank_q_proj": 49.888671875, "geo/layer_21/stable_rank_k_proj": 32.57636642456055, "geo/layer_21/stable_rank_o_proj": 102.14144134521484, "geo/layer_21/stable_rank_gate_proj": 136.75213623046875, "geo/layer_21/stable_rank_down_proj": 120.2887191772461, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.18756498396396637, "geo/layer_21/attn_entropy_mean": 5.880060195922852, "geo/layer_21/attn_entropy_std": 0.34083762764930725, "geo/layer_27/stable_rank_q_proj": 45.755184173583984, "geo/layer_27/stable_rank_k_proj": 39.07693862915039, "geo/layer_27/stable_rank_o_proj": 102.67938995361328, "geo/layer_27/stable_rank_gate_proj": 88.04669952392578, "geo/layer_27/stable_rank_down_proj": 140.59701538085938, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07126099616289139, "geo/layer_27/attn_entropy_mean": 4.719352722167969, "geo/layer_27/attn_entropy_std": 0.45363885164260864, "attnres/final_alpha/block_0": 0.23841948807239532, "attnres/block_norm/0": 1.0816454887390137, "attnres/final_alpha/block_1": 0.016592254862189293, "attnres/block_norm/1": 6461.3466796875, "attnres/final_alpha/block_2": 0.03573711961507797, "attnres/block_norm/2": 4545.60888671875, "attnres/final_alpha/block_3": 0.02817033976316452, "attnres/block_norm/3": 4322.44189453125, "attnres/final_alpha/block_4": 0.0458877868950367, "attnres/block_norm/4": 2590.88818359375, "attnres/final_alpha/block_5": 0.38926011323928833, "attnres/block_norm/5": 2868.189453125, "attnres/final_alpha/block_6": 0.2459329068660736, "attnres/block_norm/6": 3642.609375, "geo/tier1_time_s": 1.3578732013702393, "geo/step": 4875.0, "geo/rankme_slope": 0.027081520303433873} {"step": 4880, "timestamp": 1778199803.4355643, "train/loss": 2.5680540800094604, "train/z_loss": 0.0018684047390706837, "train/perplexity": 13.040424113479627, "train/grad_norm": 0.2041015625, "optim/muon_lr": 0.01952, "optim/adamw_lr": 0.0005855999999999999, "perf/tokens_per_sec": 1700804.277436529, "perf/iters_per_sec": 0.8110066783125539, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2330354690551757, "data/tokens_consumed": 10236198912, "data/tokens_consumed_B": 10.236198912, "train/loss_slope": -0.00011815295112122292} {"step": 4890, "timestamp": 1778199813.790081, "train/loss": 2.563628578186035, "train/z_loss": 0.001871662214398384, "train/perplexity": 12.982841203404346, "train/grad_norm": 0.2294921875, "optim/muon_lr": 0.01956, "optim/adamw_lr": 0.0005868, "perf/tokens_per_sec": 2026316.9209560005, "perf/iters_per_sec": 0.9662232022075655, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349575519561767, "data/tokens_consumed": 10257170432, "data/tokens_consumed_B": 10.257170432, "train/loss_slope": -0.00011795925870873065} {"step": 4900, "timestamp": 1778199824.1447616, "grad/layer_0/attn": 0.00344442599453032, "grad/layer_0/mlp": 0.005058211740106344, "grad/layer_0/attn_mlp_ratio": 0.6809572440639229, "grad/layer_4/attn": 0.002398717449977994, "grad/layer_4/mlp": 0.00456387409940362, "grad/layer_4/attn_mlp_ratio": 0.5255879862533118, "grad/layer_8/attn": 0.007011002860963345, "grad/layer_8/mlp": 0.005899913143366575, "grad/layer_8/attn_mlp_ratio": 1.188323043367798, "grad/layer_12/attn": 0.005890164989978075, "grad/layer_12/mlp": 0.0063627539202570915, "grad/layer_12/attn_mlp_ratio": 0.9257257111033312, "grad/layer_16/attn": 0.006947451736778021, "grad/layer_16/mlp": 0.005816045217216015, "grad/layer_16/attn_mlp_ratio": 1.194531912640526, "grad/layer_20/attn": 0.01323905773460865, "grad/layer_20/mlp": 0.011505866423249245, "grad/layer_20/attn_mlp_ratio": 1.1506354352240438, "grad/layer_24/attn": 0.01961439847946167, "grad/layer_24/mlp": 0.016351576894521713, "grad/layer_24/attn_mlp_ratio": 1.1995417008422553, "grad/layer_27/attn": 0.00880387332290411, "grad/layer_27/mlp": 0.018074482679367065, "grad/layer_27/attn_mlp_ratio": 0.48708853417117837} {"step": 4900, "timestamp": 1778199824.1605563, "train/loss": 2.567875123023987, "train/z_loss": 0.0018699938314966857, "train/perplexity": 13.038090647292247, "train/grad_norm": 0.259765625, "optim/muon_lr": 0.0196, "optim/adamw_lr": 0.000588, "perf/tokens_per_sec": 2023355.2958471393, "perf/iters_per_sec": 0.9648109893069932, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0364724397659302, "data/tokens_consumed": 10278141952, "data/tokens_consumed_B": 10.278141952, "train/loss_slope": -0.00011618970996297015} {"step": 4910, "timestamp": 1778199834.5364091, "train/loss": 2.58792507648468, "train/z_loss": 0.0018636823631823062, "train/perplexity": 13.302142021189992, "train/grad_norm": 0.1728515625, "optim/muon_lr": 0.01964, "optim/adamw_lr": 0.0005891999999999999, "perf/tokens_per_sec": 2022285.3261319033, "perf/iters_per_sec": 0.9643007879886166, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0370208263397216, "data/tokens_consumed": 10299113472, "data/tokens_consumed_B": 10.299113472, "train/loss_slope": -0.00011324315839367444} {"step": 4920, "timestamp": 1778199844.9137042, "train/loss": 2.5553723096847536, "train/z_loss": 0.001879059395287186, "train/perplexity": 12.876092659225117, "train/grad_norm": 0.298828125, "optim/muon_lr": 0.01968, "optim/adamw_lr": 0.0005903999999999999, "perf/tokens_per_sec": 2022060.2282080948, "perf/iters_per_sec": 0.9641934529343104, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0371362686157226, "data/tokens_consumed": 10320084992, "data/tokens_consumed_B": 10.320084992, "train/loss_slope": -0.00011150639057874749} {"step": 4930, "timestamp": 1778199855.2981682, "train/loss": 2.5409056901931764, "train/z_loss": 0.0018786674365401268, "train/perplexity": 12.69116002646202, "train/grad_norm": 0.25, "optim/muon_lr": 0.01972, "optim/adamw_lr": 0.0005916, "perf/tokens_per_sec": 2020537.92651724, "perf/iters_per_sec": 0.9634675629221153, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0379176616668702, "data/tokens_consumed": 10341056512, "data/tokens_consumed_B": 10.341056512, "train/loss_slope": -0.00010985605345450464} {"step": 4940, "timestamp": 1778199865.6598916, "train/loss": 2.5094430208206178, "train/z_loss": 0.0018833004869520664, "train/perplexity": 12.298078378962067, "train/grad_norm": 0.310546875, "optim/muon_lr": 0.01976, "optim/adamw_lr": 0.0005928, "perf/tokens_per_sec": 2025243.1646884952, "perf/iters_per_sec": 0.9657111953203655, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035506272315979, "data/tokens_consumed": 10362028032, "data/tokens_consumed_B": 10.362028032, "train/loss_slope": -0.00011559338403685183} {"step": 4950, "timestamp": 1778199876.0117798, "grad/layer_0/attn": 0.0036977468989789486, "grad/layer_0/mlp": 0.005012730602174997, "grad/layer_0/attn_mlp_ratio": 0.7376711654137965, "grad/layer_4/attn": 0.0018951462116092443, "grad/layer_4/mlp": 0.004289370961487293, "grad/layer_4/attn_mlp_ratio": 0.4418237975784084, "grad/layer_8/attn": 0.006361905951052904, "grad/layer_8/mlp": 0.005726204253733158, "grad/layer_8/attn_mlp_ratio": 1.1110162261158747, "grad/layer_12/attn": 0.006053621880710125, "grad/layer_12/mlp": 0.005500230938196182, "grad/layer_12/attn_mlp_ratio": 1.1006122904057916, "grad/layer_16/attn": 0.006417708471417427, "grad/layer_16/mlp": 0.005180130712687969, "grad/layer_16/attn_mlp_ratio": 1.2389085726750748, "grad/layer_20/attn": 0.009829577058553696, "grad/layer_20/mlp": 0.01050606183707714, "grad/layer_20/attn_mlp_ratio": 0.9356100427948126, "grad/layer_24/attn": 0.018789144232869148, "grad/layer_24/mlp": 0.015103213489055634, "grad/layer_24/attn_mlp_ratio": 1.2440494284265755, "grad/layer_27/attn": 0.00742550753057003, "grad/layer_27/mlp": 0.015566638670861721, "grad/layer_27/attn_mlp_ratio": 0.4770141865480558} {"step": 4950, "timestamp": 1778199876.6223211, "eos/sharpness": 27.13825702667236, "eos/L0_probe": 2.3815364837646484, "eos/L_plus": 2.490959882736206, "eos/L_minus": 2.5434956550598145, "eos/grad_norm": 0.19682809710502625, "eos/embed_grad_frac": 0.1259419023990631, "eos/time_s": 0.6076679229736328} {"step": 4950, "timestamp": 1778199876.6425714, "train/loss": 2.6128717422485352, "train/z_loss": 0.001860188110731542, "train/perplexity": 13.638159949448664, "train/grad_norm": 0.1962890625, "optim/muon_lr": 0.0198, "optim/adamw_lr": 0.0005939999999999999, "perf/tokens_per_sec": 1910423.172981255, "perf/iters_per_sec": 0.9109607567697787, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0977421283721924, "data/tokens_consumed": 10382999552, "data/tokens_consumed_B": 10.382999552, "train/loss_slope": -0.00011300477074532399} {"step": 4950, "timestamp": 1778199878.0039873, "geo/rankme_last": 415.5811462402344, "geo/layer_0/stable_rank_q_proj": 24.359813690185547, "geo/layer_0/stable_rank_k_proj": 16.699560165405273, "geo/layer_0/stable_rank_o_proj": 58.82632064819336, "geo/layer_0/stable_rank_gate_proj": 169.08824157714844, "geo/layer_0/stable_rank_down_proj": 47.11570739746094, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.036881912499666214, "geo/layer_0/attn_entropy_mean": 6.600354194641113, "geo/layer_0/attn_entropy_std": 0.20021770894527435, "geo/layer_7/stable_rank_q_proj": 44.681617736816406, "geo/layer_7/stable_rank_k_proj": 40.45893859863281, "geo/layer_7/stable_rank_o_proj": 104.11524963378906, "geo/layer_7/stable_rank_gate_proj": 163.7119140625, "geo/layer_7/stable_rank_down_proj": 210.48101806640625, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5755966305732727, "geo/layer_7/attn_entropy_mean": 5.003184795379639, "geo/layer_7/attn_entropy_std": 1.0866392850875854, "geo/layer_14/stable_rank_q_proj": 65.98033905029297, "geo/layer_14/stable_rank_k_proj": 48.19976806640625, "geo/layer_14/stable_rank_o_proj": 81.78422546386719, "geo/layer_14/stable_rank_gate_proj": 172.5274200439453, "geo/layer_14/stable_rank_down_proj": 151.9005584716797, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4130587577819824, "geo/layer_14/attn_entropy_mean": 6.2394633293151855, "geo/layer_14/attn_entropy_std": 0.437835156917572, "geo/layer_21/stable_rank_q_proj": 50.07047653198242, "geo/layer_21/stable_rank_k_proj": 32.42537307739258, "geo/layer_21/stable_rank_o_proj": 102.294677734375, "geo/layer_21/stable_rank_gate_proj": 135.94529724121094, "geo/layer_21/stable_rank_down_proj": 118.38972473144531, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.18576094508171082, "geo/layer_21/attn_entropy_mean": 5.841306209564209, "geo/layer_21/attn_entropy_std": 0.329958438873291, "geo/layer_27/stable_rank_q_proj": 45.42723083496094, "geo/layer_27/stable_rank_k_proj": 38.84978485107422, "geo/layer_27/stable_rank_o_proj": 103.1238021850586, "geo/layer_27/stable_rank_gate_proj": 88.59334564208984, "geo/layer_27/stable_rank_down_proj": 142.38380432128906, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0726885125041008, "geo/layer_27/attn_entropy_mean": 4.6995954513549805, "geo/layer_27/attn_entropy_std": 0.4761528968811035, "attnres/final_alpha/block_0": 0.23908905684947968, "attnres/block_norm/0": 1.0905660390853882, "attnres/final_alpha/block_1": 0.016746461391448975, "attnres/block_norm/1": 6648.02734375, "attnres/final_alpha/block_2": 0.03653820604085922, "attnres/block_norm/2": 4608.7939453125, "attnres/final_alpha/block_3": 0.02791537716984749, "attnres/block_norm/3": 4417.9814453125, "attnres/final_alpha/block_4": 0.045346274971961975, "attnres/block_norm/4": 2663.047607421875, "attnres/final_alpha/block_5": 0.3834405839443207, "attnres/block_norm/5": 2922.449951171875, "attnres/final_alpha/block_6": 0.2509240508079529, "attnres/block_norm/6": 3742.7041015625, "geo/tier1_time_s": 1.3571751117706299, "geo/step": 4950.0, "geo/rankme_slope": 0.02661527726324905} {"step": 4960, "timestamp": 1778199888.358912, "train/loss": 2.526969861984253, "train/z_loss": 0.0018808877211995423, "train/perplexity": 12.515524853818299, "train/grad_norm": 0.158203125, "optim/muon_lr": 0.01984, "optim/adamw_lr": 0.0005951999999999999, "perf/tokens_per_sec": 1790540.8215189627, "perf/iters_per_sec": 0.853796396979791, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.171239423751831, "data/tokens_consumed": 10403971072, "data/tokens_consumed_B": 10.403971072, "train/loss_slope": -0.00011286608343756733} {"step": 4970, "timestamp": 1778199898.7205431, "train/loss": 2.543818402290344, "train/z_loss": 0.0018786390661261975, "train/perplexity": 12.72817960931909, "train/grad_norm": 0.271484375, "optim/muon_lr": 0.019880000000000002, "optim/adamw_lr": 0.0005964, "perf/tokens_per_sec": 2025250.2990891903, "perf/iters_per_sec": 0.9657145972677185, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0355026245117187, "data/tokens_consumed": 10424942592, "data/tokens_consumed_B": 10.424942592, "train/loss_slope": -0.0001138952300314641} {"step": 4980, "timestamp": 1778199909.0840847, "train/loss": 2.528873062133789, "train/z_loss": 0.0018794795963913203, "train/perplexity": 12.539367083662592, "train/grad_norm": 0.1552734375, "optim/muon_lr": 0.01992, "optim/adamw_lr": 0.0005976, "perf/tokens_per_sec": 2024871.406074389, "perf/iters_per_sec": 0.9655339269992775, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0356963872909546, "data/tokens_consumed": 10445914112, "data/tokens_consumed_B": 10.445914112, "train/loss_slope": -0.00011467654947543547} {"step": 4990, "timestamp": 1778199919.4564507, "train/loss": 2.5278873205184937, "train/z_loss": 0.0018787505803629756, "train/perplexity": 12.527012597864545, "train/grad_norm": 0.302734375, "optim/muon_lr": 0.019960000000000002, "optim/adamw_lr": 0.0005987999999999999, "perf/tokens_per_sec": 2023445.779509175, "perf/iters_per_sec": 0.9648541352792621, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0364260911941527, "data/tokens_consumed": 10466885632, "data/tokens_consumed_B": 10.466885632, "train/loss_slope": -0.00011510564882477013} {"step": 5000, "timestamp": 1778199929.804294, "grad/layer_0/attn": 0.0046770586632192135, "grad/layer_0/mlp": 0.0057547427713871, "grad/layer_0/attn_mlp_ratio": 0.8127311276536452, "grad/layer_4/attn": 0.0025089529808610678, "grad/layer_4/mlp": 0.004699816927313805, "grad/layer_4/attn_mlp_ratio": 0.5338405657666779, "grad/layer_8/attn": 0.008651846088469028, "grad/layer_8/mlp": 0.006393250077962875, "grad/layer_8/attn_mlp_ratio": 1.3532781992939036, "grad/layer_12/attn": 0.0067754872143268585, "grad/layer_12/mlp": 0.006627051625400782, "grad/layer_12/attn_mlp_ratio": 1.0223984201537375, "grad/layer_16/attn": 0.010353709571063519, "grad/layer_16/mlp": 0.0064459596760571, "grad/layer_16/attn_mlp_ratio": 1.6062324201155243, "grad/layer_20/attn": 0.011192385107278824, "grad/layer_20/mlp": 0.01213809009641409, "grad/layer_20/attn_mlp_ratio": 0.9220878182784759, "grad/layer_24/attn": 0.01332283578813076, "grad/layer_24/mlp": 0.014589766971766949, "grad/layer_24/attn_mlp_ratio": 0.9131630219040398, "grad/layer_27/attn": 0.011091244406998158, "grad/layer_27/mlp": 0.016011586412787437, "grad/layer_27/attn_mlp_ratio": 0.6927011510158774} {"step": 5000, "timestamp": 1778199929.8205795, "train/loss": 2.566976523399353, "train/z_loss": 0.0018739531515166163, "train/perplexity": 13.026379886361354, "train/grad_norm": 0.2392578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024670.89771637, "perf/iters_per_sec": 0.9654383171636438, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0357989549636841, "data/tokens_consumed": 10487857152, "data/tokens_consumed_B": 10.487857152, "train/loss_slope": -0.00011553568283502234} {"step": 5000, "timestamp": 1778199936.8179705, "geo/ww_alpha_mean": 8.516515732443231, "geo/ww_alpha_std": 4.836939821032228, "geo/ww_alpha_min": 2.3514051735427732, "geo/ww_alpha_max": 27.31720656870591, "geo/ww_alpha_healthy_frac": 0.12690355329949238, "geo/ww_alpha_by_type/q_proj": 4.878421336000981, "geo/ww_alpha_by_type/k_proj": 5.531398218490245, "geo/ww_alpha_by_type/v_proj": 7.557318674999495, "geo/ww_alpha_by_type/o_proj": 7.6341917300572915, "geo/ww_alpha_by_type/gate_proj": 11.967908984128977, "geo/ww_alpha_by_type/up_proj": 12.284235168937085, "geo/ww_alpha_by_type/down_proj": 9.953354343229776, "geo/twonn_id/layer_0": 0.7306721210479736, "geo/twonn_id/layer_7": 3.0129828453063965, "geo/twonn_id/layer_14": 3.4009532928466797, "geo/twonn_id/layer_21": 6.173281669616699, "geo/twonn_id/layer_27": 5.420497894287109, "geo/tier2_time_s": 6.990743398666382} {"step": 5000, "timestamp": 1778199937.542007, "eoc/jacobian_sigma/layer_0/attn": 577.04443359375, "eoc/jacobian_sigma/layer_0/mlp": 1001.342529296875, "eoc/jacobian_sigma/layer_0": 1001.342529296875, "eoc/jacobian_sigma/layer_7/attn": 1.1589399576187134, "eoc/jacobian_sigma/layer_7/mlp": 1.630664587020874, "eoc/jacobian_sigma/layer_7": 1.630664587020874, "eoc/jacobian_sigma/layer_14/attn": 1.1426219940185547, "eoc/jacobian_sigma/layer_14/mlp": 4.8383097648620605, "eoc/jacobian_sigma/layer_14": 4.8383097648620605, "eoc/jacobian_sigma/layer_21/attn": 1.0755236148834229, "eoc/jacobian_sigma/layer_21/mlp": 2.500581979751587, "eoc/jacobian_sigma/layer_21": 2.500581979751587, "eoc/jacobian_sigma/layer_27/attn": 1.5034675598144531, "eoc/jacobian_sigma/layer_27/mlp": 3.161097764968872, "eoc/jacobian_sigma/layer_27": 3.161097764968872, "eoc/layer0_sigma": 1001.342529296875, "eoc/sigma_max": 4.8383097648620605, "eoc/sigma_min": 1.630664587020874, "eoc/sigma_mean": 3.0326635241508484, "eoc/time_s": 0.7175595760345459} {"step": 5010, "timestamp": 1778199947.9219978, "train/loss": 2.510001230239868, "train/z_loss": 0.0018873691791668535, "train/perplexity": 12.304945198535256, "train/grad_norm": 0.208984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1158898.1159103513, "perf/iters_per_sec": 0.5526056842376477, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.8096086025238036, "data/tokens_consumed": 10508828672, "data/tokens_consumed_B": 10.508828672, "train/loss_slope": -0.0001145124223544867} {"step": 5020, "timestamp": 1778199958.2703135, "train/loss": 2.5030952215194704, "train/z_loss": 0.0018898952170275153, "train/perplexity": 12.220259894986663, "train/grad_norm": 0.255859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027414.90074911, "perf/iters_per_sec": 0.9667467597718763, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034397053718567, "data/tokens_consumed": 10529800192, "data/tokens_consumed_B": 10.529800192, "train/loss_slope": -0.00011554374606123634} {"step": 5025, "timestamp": 1778199964.0713317, "eos/sharpness": 39.97609615325927, "eos/L0_probe": 2.3763298988342285, "eos/L_plus": 2.6760623455047607, "eos/L_minus": 2.476358413696289, "eos/grad_norm": 0.2579808235168457, "eos/embed_grad_frac": 0.0758451521396637, "eos/time_s": 0.6333410739898682} {"step": 5025, "timestamp": 1778199965.4506848, "geo/rankme_last": 415.181640625, "geo/layer_0/stable_rank_q_proj": 23.806184768676758, "geo/layer_0/stable_rank_k_proj": 16.256065368652344, "geo/layer_0/stable_rank_o_proj": 58.519081115722656, "geo/layer_0/stable_rank_gate_proj": 168.74325561523438, "geo/layer_0/stable_rank_down_proj": 46.67254638671875, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.026107672601938248, "geo/layer_0/attn_entropy_mean": 6.596796035766602, "geo/layer_0/attn_entropy_std": 0.19915662705898285, "geo/layer_7/stable_rank_q_proj": 45.4753303527832, "geo/layer_7/stable_rank_k_proj": 40.668724060058594, "geo/layer_7/stable_rank_o_proj": 103.76872253417969, "geo/layer_7/stable_rank_gate_proj": 162.67132568359375, "geo/layer_7/stable_rank_down_proj": 210.05783081054688, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5850077867507935, "geo/layer_7/attn_entropy_mean": 4.938270092010498, "geo/layer_7/attn_entropy_std": 1.066670298576355, "geo/layer_14/stable_rank_q_proj": 67.0126953125, "geo/layer_14/stable_rank_k_proj": 49.477935791015625, "geo/layer_14/stable_rank_o_proj": 79.21280670166016, "geo/layer_14/stable_rank_gate_proj": 170.6114959716797, "geo/layer_14/stable_rank_down_proj": 152.83621215820312, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4231983423233032, "geo/layer_14/attn_entropy_mean": 6.235194206237793, "geo/layer_14/attn_entropy_std": 0.4395800232887268, "geo/layer_21/stable_rank_q_proj": 50.2349967956543, "geo/layer_21/stable_rank_k_proj": 32.3299560546875, "geo/layer_21/stable_rank_o_proj": 101.85253143310547, "geo/layer_21/stable_rank_gate_proj": 135.9681854248047, "geo/layer_21/stable_rank_down_proj": 116.63896179199219, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.18006391823291779, "geo/layer_21/attn_entropy_mean": 5.8919243812561035, "geo/layer_21/attn_entropy_std": 0.3234383165836334, "geo/layer_27/stable_rank_q_proj": 45.148681640625, "geo/layer_27/stable_rank_k_proj": 38.852638244628906, "geo/layer_27/stable_rank_o_proj": 103.38331604003906, "geo/layer_27/stable_rank_gate_proj": 88.7979507446289, "geo/layer_27/stable_rank_down_proj": 144.58790588378906, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07070421427488327, "geo/layer_27/attn_entropy_mean": 4.728843688964844, "geo/layer_27/attn_entropy_std": 0.44013702869415283, "attnres/final_alpha/block_0": 0.2385987490415573, "attnres/block_norm/0": 1.0993123054504395, "attnres/final_alpha/block_1": 0.01639966107904911, "attnres/block_norm/1": 6875.5986328125, "attnres/final_alpha/block_2": 0.03421249985694885, "attnres/block_norm/2": 4832.638671875, "attnres/final_alpha/block_3": 0.027341853827238083, "attnres/block_norm/3": 4637.86376953125, "attnres/final_alpha/block_4": 0.044787533581256866, "attnres/block_norm/4": 2718.1962890625, "attnres/final_alpha/block_5": 0.39654481410980225, "attnres/block_norm/5": 2942.39404296875, "attnres/final_alpha/block_6": 0.24211485683918, "attnres/block_norm/6": 3901.28955078125, "geo/tier1_time_s": 1.3583807945251465, "geo/step": 5025.0, "geo/rankme_slope": 0.026164154060061524} {"step": 5030, "timestamp": 1778199970.637744, "train/loss": 2.5260159730911256, "train/z_loss": 0.0018864649930037559, "train/perplexity": 12.503592125821733, "train/grad_norm": 0.220703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1696683.9250172395, "perf/iters_per_sec": 0.8090419411741445, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.236029863357544, "data/tokens_consumed": 10550771712, "data/tokens_consumed_B": 10.550771712, "train/loss_slope": -0.00011387613265320516} {"step": 5040, "timestamp": 1778199980.9951332, "train/loss": 2.5217004776000977, "train/z_loss": 0.0018858717987313867, "train/perplexity": 12.449749193409092, "train/grad_norm": 0.2578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025904.0794330256, "perf/iters_per_sec": 0.9660263440289619, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03516845703125, "data/tokens_consumed": 10571743232, "data/tokens_consumed_B": 10.571743232, "train/loss_slope": -0.00011561990579684642} {"step": 5050, "timestamp": 1778199991.3443482, "grad/layer_0/attn": 0.003527725813910365, "grad/layer_0/mlp": 0.004786036908626556, "grad/layer_0/attn_mlp_ratio": 0.7370870320375381, "grad/layer_4/attn": 0.002255921484902501, "grad/layer_4/mlp": 0.0043981848284602165, "grad/layer_4/attn_mlp_ratio": 0.5129210166459027, "grad/layer_8/attn": 0.016590846702456474, "grad/layer_8/mlp": 0.005929201375693083, "grad/layer_8/attn_mlp_ratio": 2.798158701550468, "grad/layer_12/attn": 0.005889437161386013, "grad/layer_12/mlp": 0.006390117574483156, "grad/layer_12/attn_mlp_ratio": 0.9216476849719926, "grad/layer_16/attn": 0.006480402313172817, "grad/layer_16/mlp": 0.005262352526187897, "grad/layer_16/attn_mlp_ratio": 1.2314648548870222, "grad/layer_20/attn": 0.019350891932845116, "grad/layer_20/mlp": 0.012116163037717342, "grad/layer_20/attn_mlp_ratio": 1.5971138480800269, "grad/layer_24/attn": 0.023611456155776978, "grad/layer_24/mlp": 0.015199381858110428, "grad/layer_24/attn_mlp_ratio": 1.553448437630574, "grad/layer_27/attn": 0.010782847180962563, "grad/layer_27/mlp": 0.0171054657548666, "grad/layer_27/attn_mlp_ratio": 0.6303743652731201} {"step": 5050, "timestamp": 1778199991.3603272, "train/loss": 2.4883296251297, "train/z_loss": 0.0018873073975555598, "train/perplexity": 12.041146084942584, "train/grad_norm": 0.28125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024426.2120118, "perf/iters_per_sec": 0.9653216419276237, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0359241485595703, "data/tokens_consumed": 10592714752, "data/tokens_consumed_B": 10.592714752, "train/loss_slope": -0.0001176721253983079} {"step": 5060, "timestamp": 1778200001.72762, "train/loss": 2.5799055814743044, "train/z_loss": 0.0018661128589883447, "train/perplexity": 13.195892164153376, "train/grad_norm": 0.1953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024394.6695204899, "perf/iters_per_sec": 0.9653066012957048, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0359402894973755, "data/tokens_consumed": 10613686272, "data/tokens_consumed_B": 10.613686272, "train/loss_slope": -0.00011602591220730953} {"step": 5070, "timestamp": 1778200012.1003084, "train/loss": 2.5078109979629515, "train/z_loss": 0.0018926291493698954, "train/perplexity": 12.278024002993451, "train/grad_norm": 0.16796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023354.0857295445, "perf/iters_per_sec": 0.9648104122779582, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0364730596542358, "data/tokens_consumed": 10634657792, "data/tokens_consumed_B": 10.634657792, "train/loss_slope": -0.00011809894395525515} {"step": 5080, "timestamp": 1778200022.4559464, "train/loss": 2.524803376197815, "train/z_loss": 0.001885277882684022, "train/perplexity": 12.488439497726308, "train/grad_norm": 0.21875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026164.0575722116, "perf/iters_per_sec": 0.9661503112660463, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350356340408324, "data/tokens_consumed": 10655629312, "data/tokens_consumed_B": 10.655629312, "train/loss_slope": -0.00011626427984080293} {"step": 5090, "timestamp": 1778200032.8113651, "train/loss": 2.548215651512146, "train/z_loss": 0.001881396968383342, "train/perplexity": 12.784271822539296, "train/grad_norm": 0.1708984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026297.4558482724, "perf/iters_per_sec": 0.9662139205209124, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349674940109252, "data/tokens_consumed": 10676600832, "data/tokens_consumed_B": 10.676600832, "train/loss_slope": -0.00011832946557881337} {"step": 5100, "timestamp": 1778200043.16653, "grad/layer_0/attn": 0.0036637629382312298, "grad/layer_0/mlp": 0.004894421901553869, "grad/layer_0/attn_mlp_ratio": 0.7485588568104808, "grad/layer_4/attn": 0.0018587062368169427, "grad/layer_4/mlp": 0.0042290883138775826, "grad/layer_4/attn_mlp_ratio": 0.43950517343588125, "grad/layer_8/attn": 0.010876945219933987, "grad/layer_8/mlp": 0.0058310008607804775, "grad/layer_8/attn_mlp_ratio": 1.8653650193324804, "grad/layer_12/attn": 0.00572981545701623, "grad/layer_12/mlp": 0.00593729829415679, "grad/layer_12/attn_mlp_ratio": 0.965054318754679, "grad/layer_16/attn": 0.006089639849960804, "grad/layer_16/mlp": 0.005185485817492008, "grad/layer_16/attn_mlp_ratio": 1.1743624313815697, "grad/layer_20/attn": 0.016115501523017883, "grad/layer_20/mlp": 0.009748061187565327, "grad/layer_20/attn_mlp_ratio": 1.6532006773054344, "grad/layer_24/attn": 0.013461071997880936, "grad/layer_24/mlp": 0.012945222668349743, "grad/layer_24/attn_mlp_ratio": 1.039848617421433, "grad/layer_27/attn": 0.0073538534343242645, "grad/layer_27/mlp": 0.01372586004436016, "grad/layer_27/attn_mlp_ratio": 0.535766309504902} {"step": 5100, "timestamp": 1778200043.7970479, "eos/sharpness": 18.888330459594723, "eos/L0_probe": 2.3688607215881348, "eos/L_plus": 2.4888923168182373, "eos/L_minus": 2.4377124309539795, "eos/grad_norm": 0.17702504992485046, "eos/embed_grad_frac": 0.12225471436977386, "eos/time_s": 0.6276862621307373} {"step": 5100, "timestamp": 1778200043.8172212, "train/loss": 2.5394128561019897, "train/z_loss": 0.0018857710994780064, "train/perplexity": 12.672228364548818, "train/grad_norm": 0.1767578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1906243.9605002515, "perf/iters_per_sec": 0.90896795296681, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1001487970352173, "data/tokens_consumed": 10697572352, "data/tokens_consumed_B": 10.697572352, "train/loss_slope": -0.00011844215151285783} {"step": 5100, "timestamp": 1778200045.1787186, "geo/rankme_last": 416.8781433105469, "geo/layer_0/stable_rank_q_proj": 23.2810001373291, "geo/layer_0/stable_rank_k_proj": 15.771825790405273, "geo/layer_0/stable_rank_o_proj": 58.69416046142578, "geo/layer_0/stable_rank_gate_proj": 169.386962890625, "geo/layer_0/stable_rank_down_proj": 46.89540100097656, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.03066098876297474, "geo/layer_0/attn_entropy_mean": 6.591982841491699, "geo/layer_0/attn_entropy_std": 0.20495976507663727, "geo/layer_7/stable_rank_q_proj": 45.68324279785156, "geo/layer_7/stable_rank_k_proj": 40.80123519897461, "geo/layer_7/stable_rank_o_proj": 103.92366027832031, "geo/layer_7/stable_rank_gate_proj": 161.24844360351562, "geo/layer_7/stable_rank_down_proj": 209.28054809570312, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5746432542800903, "geo/layer_7/attn_entropy_mean": 4.942914009094238, "geo/layer_7/attn_entropy_std": 1.096272349357605, "geo/layer_14/stable_rank_q_proj": 67.92981719970703, "geo/layer_14/stable_rank_k_proj": 50.90332794189453, "geo/layer_14/stable_rank_o_proj": 76.32304382324219, "geo/layer_14/stable_rank_gate_proj": 168.5155487060547, "geo/layer_14/stable_rank_down_proj": 153.41123962402344, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.40734338760375977, "geo/layer_14/attn_entropy_mean": 6.150188446044922, "geo/layer_14/attn_entropy_std": 0.4930223524570465, "geo/layer_21/stable_rank_q_proj": 50.322608947753906, "geo/layer_21/stable_rank_k_proj": 32.093223571777344, "geo/layer_21/stable_rank_o_proj": 102.18419647216797, "geo/layer_21/stable_rank_gate_proj": 136.36636352539062, "geo/layer_21/stable_rank_down_proj": 114.99834442138672, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.17836251854896545, "geo/layer_21/attn_entropy_mean": 5.853570938110352, "geo/layer_21/attn_entropy_std": 0.3198992609977722, "geo/layer_27/stable_rank_q_proj": 45.05302429199219, "geo/layer_27/stable_rank_k_proj": 38.89259719848633, "geo/layer_27/stable_rank_o_proj": 103.43548583984375, "geo/layer_27/stable_rank_gate_proj": 88.55980682373047, "geo/layer_27/stable_rank_down_proj": 145.76243591308594, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0673113539814949, "geo/layer_27/attn_entropy_mean": 4.686187744140625, "geo/layer_27/attn_entropy_std": 0.45975375175476074, "attnres/final_alpha/block_0": 0.24151712656021118, "attnres/block_norm/0": 1.1080892086029053, "attnres/final_alpha/block_1": 0.016212061047554016, "attnres/block_norm/1": 7072.75048828125, "attnres/final_alpha/block_2": 0.03482348471879959, "attnres/block_norm/2": 4991.2880859375, "attnres/final_alpha/block_3": 0.027755368500947952, "attnres/block_norm/3": 4756.9072265625, "attnres/final_alpha/block_4": 0.045530229806900024, "attnres/block_norm/4": 2768.31591796875, "attnres/final_alpha/block_5": 0.39338409900665283, "attnres/block_norm/5": 2999.14990234375, "attnres/final_alpha/block_6": 0.24077767133712769, "attnres/block_norm/6": 3985.5859375, "geo/tier1_time_s": 1.3572306632995605, "geo/step": 5100.0, "geo/rankme_slope": 0.025714063965429922} {"step": 5110, "timestamp": 1778200055.5456755, "train/loss": 2.5047561168670653, "train/z_loss": 0.0018943059956654906, "train/perplexity": 12.240573332369793, "train/grad_norm": 0.2001953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1788653.5457440664, "perf/iters_per_sec": 0.8528964737625438, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.172475242614746, "data/tokens_consumed": 10718543872, "data/tokens_consumed_B": 10.718543872, "train/loss_slope": -0.00012327297932029376} {"step": 5120, "timestamp": 1778200065.9139144, "train/loss": 2.587313103675842, "train/z_loss": 0.001866640953812748, "train/perplexity": 13.294003962363064, "train/grad_norm": 0.26953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024050.5148993372, "perf/iters_per_sec": 0.9651424955841719, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0361164331436157, "data/tokens_consumed": 10739515392, "data/tokens_consumed_B": 10.739515392, "train/loss_slope": -0.00012177381621371363} {"step": 5130, "timestamp": 1778200076.282541, "train/loss": 2.543332743644714, "train/z_loss": 0.001879687246400863, "train/perplexity": 12.721999559667438, "train/grad_norm": 0.296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024027.5071099657, "perf/iters_per_sec": 0.9651315246152714, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0361282110214234, "data/tokens_consumed": 10760486912, "data/tokens_consumed_B": 10.760486912, "train/loss_slope": -0.00011820114676815732} {"step": 5140, "timestamp": 1778200086.634552, "train/loss": 2.5373714685440065, "train/z_loss": 0.0018814943265169858, "train/perplexity": 12.646385821530787, "train/grad_norm": 0.1875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027082.3314214603, "perf/iters_per_sec": 0.9665881783587743, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345667600631714, "data/tokens_consumed": 10781458432, "data/tokens_consumed_B": 10.781458432, "train/loss_slope": -0.00012074628630236677} {"step": 5150, "timestamp": 1778200096.9845219, "grad/layer_0/attn": 0.0035501655656844378, "grad/layer_0/mlp": 0.004718063864856958, "grad/layer_0/attn_mlp_ratio": 0.7524623642511535, "grad/layer_4/attn": 0.0026055972557514906, "grad/layer_4/mlp": 0.004374878481030464, "grad/layer_4/attn_mlp_ratio": 0.5955816161503081, "grad/layer_8/attn": 0.00944546889513731, "grad/layer_8/mlp": 0.005752527620643377, "grad/layer_8/attn_mlp_ratio": 1.6419684274169661, "grad/layer_12/attn": 0.0066404337994754314, "grad/layer_12/mlp": 0.006095657590776682, "grad/layer_12/attn_mlp_ratio": 1.0893711780310513, "grad/layer_16/attn": 0.006615353282541037, "grad/layer_16/mlp": 0.005412193015217781, "grad/layer_16/attn_mlp_ratio": 1.2223054761184076, "grad/layer_20/attn": 0.008820874616503716, "grad/layer_20/mlp": 0.009321875870227814, "grad/layer_20/attn_mlp_ratio": 0.9462553079096745, "grad/layer_24/attn": 0.022592764347791672, "grad/layer_24/mlp": 0.01646680012345314, "grad/layer_24/attn_mlp_ratio": 1.3720190954654028, "grad/layer_27/attn": 0.017518412321805954, "grad/layer_27/mlp": 0.016526296734809875, "grad/layer_27/attn_mlp_ratio": 1.0600325346272585} {"step": 5150, "timestamp": 1778200097.0003633, "train/loss": 2.505456829071045, "train/z_loss": 0.0018885160447098316, "train/perplexity": 12.249153457235526, "train/grad_norm": 0.1845703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024205.8078692309, "perf/iters_per_sec": 0.9652165450426249, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0360369443893434, "data/tokens_consumed": 10802429952, "data/tokens_consumed_B": 10.802429952, "train/loss_slope": -0.00012493253581129747} {"step": 5160, "timestamp": 1778200107.3645475, "train/loss": 2.5092761278152467, "train/z_loss": 0.001888027391396463, "train/perplexity": 12.296026086962465, "train/grad_norm": 0.1630859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024886.4155034493, "perf/iters_per_sec": 0.9655410840527769, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0356887102127075, "data/tokens_consumed": 10823401472, "data/tokens_consumed_B": 10.823401472, "train/loss_slope": -0.00012572021952199033} {"step": 5170, "timestamp": 1778200117.7196085, "train/loss": 2.5415774822235107, "train/z_loss": 0.001878093439154327, "train/perplexity": 12.699688711053911, "train/grad_norm": 0.212890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026275.8906260277, "perf/iters_per_sec": 0.9662036374216212, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349785089492798, "data/tokens_consumed": 10844372992, "data/tokens_consumed_B": 10.844372992, "train/loss_slope": -0.0001249507920076065} {"step": 5175, "timestamp": 1778200123.4954755, "eos/sharpness": 31.48756027221679, "eos/L0_probe": 2.3621058464050293, "eos/L_plus": 2.580225706100464, "eos/L_minus": 2.4588615894317627, "eos/grad_norm": 0.2741621434688568, "eos/embed_grad_frac": 0.09400126338005066, "eos/time_s": 0.606766939163208} {"step": 5175, "timestamp": 1778200124.8728805, "geo/rankme_last": 418.53350830078125, "geo/layer_0/stable_rank_q_proj": 22.74178123474121, "geo/layer_0/stable_rank_k_proj": 15.3911771774292, "geo/layer_0/stable_rank_o_proj": 58.50660705566406, "geo/layer_0/stable_rank_gate_proj": 170.12716674804688, "geo/layer_0/stable_rank_down_proj": 47.00634765625, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.03801824152469635, "geo/layer_0/attn_entropy_mean": 6.595419883728027, "geo/layer_0/attn_entropy_std": 0.20288404822349548, "geo/layer_7/stable_rank_q_proj": 45.73728561401367, "geo/layer_7/stable_rank_k_proj": 41.295963287353516, "geo/layer_7/stable_rank_o_proj": 104.07203674316406, "geo/layer_7/stable_rank_gate_proj": 160.41624450683594, "geo/layer_7/stable_rank_down_proj": 207.7369384765625, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5805754065513611, "geo/layer_7/attn_entropy_mean": 4.955963134765625, "geo/layer_7/attn_entropy_std": 1.0755784511566162, "geo/layer_14/stable_rank_q_proj": 68.51679992675781, "geo/layer_14/stable_rank_k_proj": 52.08900451660156, "geo/layer_14/stable_rank_o_proj": 72.82585906982422, "geo/layer_14/stable_rank_gate_proj": 166.16201782226562, "geo/layer_14/stable_rank_down_proj": 152.7138671875, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4155037999153137, "geo/layer_14/attn_entropy_mean": 6.161072731018066, "geo/layer_14/attn_entropy_std": 0.4968024492263794, "geo/layer_21/stable_rank_q_proj": 50.32453155517578, "geo/layer_21/stable_rank_k_proj": 32.27751922607422, "geo/layer_21/stable_rank_o_proj": 101.28973388671875, "geo/layer_21/stable_rank_gate_proj": 136.36390686035156, "geo/layer_21/stable_rank_down_proj": 112.74881744384766, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.17872729897499084, "geo/layer_21/attn_entropy_mean": 5.863875389099121, "geo/layer_21/attn_entropy_std": 0.31331756711006165, "geo/layer_27/stable_rank_q_proj": 44.915287017822266, "geo/layer_27/stable_rank_k_proj": 38.56423568725586, "geo/layer_27/stable_rank_o_proj": 103.80633544921875, "geo/layer_27/stable_rank_gate_proj": 88.67806243896484, "geo/layer_27/stable_rank_down_proj": 147.74876403808594, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.06801347434520721, "geo/layer_27/attn_entropy_mean": 4.659521102905273, "geo/layer_27/attn_entropy_std": 0.4894808232784271, "attnres/final_alpha/block_0": 0.23866084218025208, "attnres/block_norm/0": 1.116303563117981, "attnres/final_alpha/block_1": 0.01569119282066822, "attnres/block_norm/1": 7249.48291015625, "attnres/final_alpha/block_2": 0.034036971628665924, "attnres/block_norm/2": 5109.8115234375, "attnres/final_alpha/block_3": 0.026853840798139572, "attnres/block_norm/3": 4918.6591796875, "attnres/final_alpha/block_4": 0.04440182447433472, "attnres/block_norm/4": 2823.316162109375, "attnres/final_alpha/block_5": 0.4029397666454315, "attnres/block_norm/5": 3005.544921875, "attnres/final_alpha/block_6": 0.23741558194160461, "attnres/block_norm/6": 4091.15380859375, "geo/tier1_time_s": 1.357177972793579, "geo/step": 5175.0, "geo/rankme_slope": 0.025244691997892907} {"step": 5180, "timestamp": 1778200130.0527136, "train/loss": 2.5831672668457033, "train/z_loss": 0.0018701176741160452, "train/perplexity": 13.239003281818546, "train/grad_norm": 0.2041015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1701203.3511819835, "perf/iters_per_sec": 0.8111969715032499, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2327462196350099, "data/tokens_consumed": 10865344512, "data/tokens_consumed_B": 10.865344512, "train/loss_slope": -0.00011659555316436333} {"step": 5190, "timestamp": 1778200140.4171653, "train/loss": 2.4992125272750854, "train/z_loss": 0.0018873375141993166, "train/perplexity": 12.1729043552599, "train/grad_norm": 0.181640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024549.6425227711, "perf/iters_per_sec": 0.9653804981817108, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0358609914779664, "data/tokens_consumed": 10886316032, "data/tokens_consumed_B": 10.886316032, "train/loss_slope": -0.00011883983962570905} {"step": 5200, "timestamp": 1778200150.7595315, "grad/layer_0/attn": 0.00256398250348866, "grad/layer_0/mlp": 0.0038495396729558706, "grad/layer_0/attn_mlp_ratio": 0.6660491005967462, "grad/layer_4/attn": 0.00248626247048378, "grad/layer_4/mlp": 0.004061962012201548, "grad/layer_4/attn_mlp_ratio": 0.612084111522214, "grad/layer_8/attn": 0.006526823155581951, "grad/layer_8/mlp": 0.005382472649216652, "grad/layer_8/attn_mlp_ratio": 1.2126068184055079, "grad/layer_12/attn": 0.0071034603752195835, "grad/layer_12/mlp": 0.006129682995378971, "grad/layer_12/attn_mlp_ratio": 1.1588625814236173, "grad/layer_16/attn": 0.006585678085684776, "grad/layer_16/mlp": 0.00523952953517437, "grad/layer_16/attn_mlp_ratio": 1.2569216216420174, "grad/layer_20/attn": 0.012353249825537205, "grad/layer_20/mlp": 0.009511987678706646, "grad/layer_20/attn_mlp_ratio": 1.298703290303942, "grad/layer_24/attn": 0.014124113135039806, "grad/layer_24/mlp": 0.014718903228640556, "grad/layer_24/attn_mlp_ratio": 0.9595900468723518, "grad/layer_27/attn": 0.00783263985067606, "grad/layer_27/mlp": 0.01580243743956089, "grad/layer_27/attn_mlp_ratio": 0.4956602315982771} {"step": 5200, "timestamp": 1778200150.7754521, "train/loss": 2.5056459426879885, "train/z_loss": 0.001894881424959749, "train/perplexity": 12.251470158003247, "train/grad_norm": 0.2216796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025622.336797788, "perf/iters_per_sec": 0.9658919986714306, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0353124380111693, "data/tokens_consumed": 10907287552, "data/tokens_consumed_B": 10.907287552, "train/loss_slope": -0.00011915267192670038} {"step": 5210, "timestamp": 1778200161.132145, "train/loss": 2.547579288482666, "train/z_loss": 0.0018718965002335608, "train/perplexity": 12.776138972589582, "train/grad_norm": 0.181640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027035.290875336, "perf/iters_per_sec": 0.9665657476784401, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034590768814087, "data/tokens_consumed": 10928259072, "data/tokens_consumed_B": 10.928259072, "train/loss_slope": -0.00011895038311166014} {"step": 5220, "timestamp": 1778200171.4973826, "train/loss": 2.532498097419739, "train/z_loss": 0.001882525603286922, "train/perplexity": 12.584905220613406, "train/grad_norm": 0.251953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024105.242006192, "perf/iters_per_sec": 0.9651685915022812, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0360884189605712, "data/tokens_consumed": 10949230592, "data/tokens_consumed_B": 10.949230592, "train/loss_slope": -0.00011516802912533834} {"step": 5230, "timestamp": 1778200181.8574216, "train/loss": 2.4588871002197266, "train/z_loss": 0.0019100447185337543, "train/perplexity": 11.691792503574968, "train/grad_norm": 0.2275390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025779.1773908115, "perf/iters_per_sec": 0.9659667860940988, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352322816848756, "data/tokens_consumed": 10970202112, "data/tokens_consumed_B": 10.970202112, "train/loss_slope": -0.00012059781077099102} {"step": 5240, "timestamp": 1778200192.2162786, "train/loss": 2.5272241830825806, "train/z_loss": 0.0018851172528229654, "train/perplexity": 12.518708220626781, "train/grad_norm": 0.201171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025798.9124753117, "perf/iters_per_sec": 0.9659761965157088, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352221965789794, "data/tokens_consumed": 10991173632, "data/tokens_consumed_B": 10.991173632, "train/loss_slope": -0.0001213368466763821} {"step": 5250, "timestamp": 1778200202.5739002, "grad/layer_0/attn": 0.0027917460538446903, "grad/layer_0/mlp": 0.004033708479255438, "grad/layer_0/attn_mlp_ratio": 0.6921040523854614, "grad/layer_4/attn": 0.0020344089716672897, "grad/layer_4/mlp": 0.004122612066566944, "grad/layer_4/attn_mlp_ratio": 0.4934757113865066, "grad/layer_8/attn": 0.007499731611460447, "grad/layer_8/mlp": 0.005357402842491865, "grad/layer_8/attn_mlp_ratio": 1.3998819375665872, "grad/layer_12/attn": 0.005991888232529163, "grad/layer_12/mlp": 0.006152571644634008, "grad/layer_12/attn_mlp_ratio": 0.9738835207821859, "grad/layer_16/attn": 0.0059540714137256145, "grad/layer_16/mlp": 0.005466714035719633, "grad/layer_16/attn_mlp_ratio": 1.0891499474650739, "grad/layer_20/attn": 0.016863679513335228, "grad/layer_20/mlp": 0.009083060547709465, "grad/layer_20/attn_mlp_ratio": 1.8566076091970005, "grad/layer_24/attn": 0.02068316377699375, "grad/layer_24/mlp": 0.012181016616523266, "grad/layer_24/attn_mlp_ratio": 1.6979833669333628, "grad/layer_27/attn": 0.006407656706869602, "grad/layer_27/mlp": 0.012402987107634544, "grad/layer_27/attn_mlp_ratio": 0.5166220523814963} {"step": 5250, "timestamp": 1778200203.199293, "eos/sharpness": 36.43293380737304, "eos/L0_probe": 2.356334686279297, "eos/L_plus": 2.6245956420898438, "eos/L_minus": 2.4524030685424805, "eos/grad_norm": 0.20484067499637604, "eos/embed_grad_frac": 0.08190643787384033, "eos/time_s": 0.6224215030670166} {"step": 5250, "timestamp": 1778200203.22971, "train/loss": 2.5092742919921873, "train/z_loss": 0.0018905897974036633, "train/perplexity": 12.296003513654957, "train/grad_norm": 0.205078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1905255.608522372, "perf/iters_per_sec": 0.9084966700183735, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1007194995880127, "data/tokens_consumed": 11012145152, "data/tokens_consumed_B": 11.012145152, "train/loss_slope": -0.0001206085462882073} {"step": 5250, "timestamp": 1778200204.5901394, "geo/rankme_last": 418.47564697265625, "geo/layer_0/stable_rank_q_proj": 22.22037696838379, "geo/layer_0/stable_rank_k_proj": 15.060199737548828, "geo/layer_0/stable_rank_o_proj": 58.182342529296875, "geo/layer_0/stable_rank_gate_proj": 170.6573944091797, "geo/layer_0/stable_rank_down_proj": 47.21432876586914, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.03392130881547928, "geo/layer_0/attn_entropy_mean": 6.585689544677734, "geo/layer_0/attn_entropy_std": 0.20671924948692322, "geo/layer_7/stable_rank_q_proj": 46.10124969482422, "geo/layer_7/stable_rank_k_proj": 41.312660217285156, "geo/layer_7/stable_rank_o_proj": 103.2417221069336, "geo/layer_7/stable_rank_gate_proj": 158.94374084472656, "geo/layer_7/stable_rank_down_proj": 207.8370819091797, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5755062699317932, "geo/layer_7/attn_entropy_mean": 4.9241132736206055, "geo/layer_7/attn_entropy_std": 1.0496667623519897, "geo/layer_14/stable_rank_q_proj": 69.67768096923828, "geo/layer_14/stable_rank_k_proj": 53.599910736083984, "geo/layer_14/stable_rank_o_proj": 70.48705291748047, "geo/layer_14/stable_rank_gate_proj": 164.396728515625, "geo/layer_14/stable_rank_down_proj": 151.81886291503906, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.41318804025650024, "geo/layer_14/attn_entropy_mean": 6.16340446472168, "geo/layer_14/attn_entropy_std": 0.48489120602607727, "geo/layer_21/stable_rank_q_proj": 49.95884323120117, "geo/layer_21/stable_rank_k_proj": 32.21639633178711, "geo/layer_21/stable_rank_o_proj": 100.99666595458984, "geo/layer_21/stable_rank_gate_proj": 135.90838623046875, "geo/layer_21/stable_rank_down_proj": 111.37606048583984, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.17936816811561584, "geo/layer_21/attn_entropy_mean": 5.861927509307861, "geo/layer_21/attn_entropy_std": 0.3415025770664215, "geo/layer_27/stable_rank_q_proj": 44.69767761230469, "geo/layer_27/stable_rank_k_proj": 38.38093948364258, "geo/layer_27/stable_rank_o_proj": 104.19952392578125, "geo/layer_27/stable_rank_gate_proj": 89.14851379394531, "geo/layer_27/stable_rank_down_proj": 149.73263549804688, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0722513496875763, "geo/layer_27/attn_entropy_mean": 4.671152114868164, "geo/layer_27/attn_entropy_std": 0.45708850026130676, "attnres/final_alpha/block_0": 0.2406853437423706, "attnres/block_norm/0": 1.1244943141937256, "attnres/final_alpha/block_1": 0.016040662303566933, "attnres/block_norm/1": 7480.9267578125, "attnres/final_alpha/block_2": 0.03362900763750076, "attnres/block_norm/2": 5295.12890625, "attnres/final_alpha/block_3": 0.027432821691036224, "attnres/block_norm/3": 5075.7783203125, "attnres/final_alpha/block_4": 0.04405216500163078, "attnres/block_norm/4": 2894.8046875, "attnres/final_alpha/block_5": 0.40240490436553955, "attnres/block_norm/5": 3068.3701171875, "attnres/final_alpha/block_6": 0.2357551008462906, "attnres/block_norm/6": 4221.70458984375, "geo/tier1_time_s": 1.3563241958618164, "geo/step": 5250.0, "geo/rankme_slope": 0.024672649352709835} {"step": 5260, "timestamp": 1778200214.9627721, "train/loss": 2.540132761001587, "train/z_loss": 0.0018832580884918571, "train/perplexity": 12.681354448399368, "train/grad_norm": 0.3125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1788006.6559262727, "perf/iters_per_sec": 0.8525880126601566, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1728994369506835, "data/tokens_consumed": 11033116672, "data/tokens_consumed_B": 11.033116672, "train/loss_slope": -0.00011821901487080928} {"step": 5270, "timestamp": 1778200225.322656, "train/loss": 2.5275044202804566, "train/z_loss": 0.0018855670001357793, "train/perplexity": 12.522216919950628, "train/grad_norm": 0.1728515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025228.523003979, "perf/iters_per_sec": 0.9657042136211295, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0355137586593628, "data/tokens_consumed": 11054088192, "data/tokens_consumed_B": 11.054088192, "train/loss_slope": -0.0001173432365466695} {"step": 5280, "timestamp": 1778200235.6743171, "train/loss": 2.5522698879241945, "train/z_loss": 0.0018816802999936043, "train/perplexity": 12.836207491465645, "train/grad_norm": 0.271484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027017.2133117395, "perf/iters_per_sec": 0.9665571276243875, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345999956130982, "data/tokens_consumed": 11075059712, "data/tokens_consumed_B": 11.075059712, "train/loss_slope": -0.00011462927169353439} {"step": 5290, "timestamp": 1778200246.0265846, "train/loss": 2.548437786102295, "train/z_loss": 0.001881937391590327, "train/perplexity": 12.78711196695643, "train/grad_norm": 0.228515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026989.8873742034, "perf/iters_per_sec": 0.9665440976019876, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346139430999757, "data/tokens_consumed": 11096031232, "data/tokens_consumed_B": 11.096031232, "train/loss_slope": -0.00011198288649722971} {"step": 5300, "timestamp": 1778200256.368772, "grad/layer_0/attn": 0.003839969402179122, "grad/layer_0/mlp": 0.004939681850373745, "grad/layer_0/attn_mlp_ratio": 0.7773717904831066, "grad/layer_4/attn": 0.0021259638015180826, "grad/layer_4/mlp": 0.00423173513263464, "grad/layer_4/attn_mlp_ratio": 0.5023858262971895, "grad/layer_8/attn": 0.010377106256783009, "grad/layer_8/mlp": 0.0058837407268583775, "grad/layer_8/attn_mlp_ratio": 1.7636919371793376, "grad/layer_12/attn": 0.007747497875243425, "grad/layer_12/mlp": 0.006890827789902687, "grad/layer_12/attn_mlp_ratio": 1.1243203282723169, "grad/layer_16/attn": 0.007291501387953758, "grad/layer_16/mlp": 0.005846409127116203, "grad/layer_16/attn_mlp_ratio": 1.247176019450551, "grad/layer_20/attn": 0.011431711725890636, "grad/layer_20/mlp": 0.010778266936540604, "grad/layer_20/attn_mlp_ratio": 1.0606261365704448, "grad/layer_24/attn": 0.013280076906085014, "grad/layer_24/mlp": 0.012519820593297482, "grad/layer_24/attn_mlp_ratio": 1.0607242093486642, "grad/layer_27/attn": 0.010852986015379429, "grad/layer_27/mlp": 0.013340312987565994, "grad/layer_27/attn_mlp_ratio": 0.813548073732624} {"step": 5300, "timestamp": 1778200256.3845508, "train/loss": 2.538142514228821, "train/z_loss": 0.0018902107956819236, "train/perplexity": 12.656140522923913, "train/grad_norm": 0.20703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025830.6387125875, "perf/iters_per_sec": 0.9659913247645319, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352059841156005, "data/tokens_consumed": 11117002752, "data/tokens_consumed_B": 11.117002752, "train/loss_slope": -0.00010806942549761869} {"step": 5310, "timestamp": 1778200266.743159, "train/loss": 2.4923424243927004, "train/z_loss": 0.0019001606618985535, "train/perplexity": 12.089561863510061, "train/grad_norm": 0.228515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025782.8631102531, "perf/iters_per_sec": 0.9659685435820833, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352303981781006, "data/tokens_consumed": 11137974272, "data/tokens_consumed_B": 11.137974272, "train/loss_slope": -0.0001074440160385667} {"step": 5320, "timestamp": 1778200277.5429912, "train/loss": 2.54471378326416, "train/z_loss": 0.0018836312345229089, "train/perplexity": 12.739581282831619, "train/grad_norm": 0.1845703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1943002.0805780243, "perf/iters_per_sec": 0.9264955904855844, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0793359518051147, "data/tokens_consumed": 11158945792, "data/tokens_consumed_B": 11.158945792, "train/loss_slope": -0.00010543530591786745} {"step": 5325, "timestamp": 1778200283.3300676, "eos/sharpness": 16.075134277343746, "eos/L0_probe": 2.3432958126068115, "eos/L_plus": 2.4450089931488037, "eos/L_minus": 2.402333974838257, "eos/grad_norm": 0.16155599057674408, "eos/embed_grad_frac": 0.18847785890102386, "eos/time_s": 0.6152153015136719} {"step": 5325, "timestamp": 1778200284.7079468, "geo/rankme_last": 419.31097412109375, "geo/layer_0/stable_rank_q_proj": 21.80389404296875, "geo/layer_0/stable_rank_k_proj": 14.776325225830078, "geo/layer_0/stable_rank_o_proj": 58.141780853271484, "geo/layer_0/stable_rank_gate_proj": 172.29074096679688, "geo/layer_0/stable_rank_down_proj": 47.193359375, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.038065697997808456, "geo/layer_0/attn_entropy_mean": 6.582890033721924, "geo/layer_0/attn_entropy_std": 0.20852167904376984, "geo/layer_7/stable_rank_q_proj": 46.23615264892578, "geo/layer_7/stable_rank_k_proj": 41.40930938720703, "geo/layer_7/stable_rank_o_proj": 103.45894622802734, "geo/layer_7/stable_rank_gate_proj": 158.30712890625, "geo/layer_7/stable_rank_down_proj": 207.3125762939453, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5820783376693726, "geo/layer_7/attn_entropy_mean": 4.921706199645996, "geo/layer_7/attn_entropy_std": 1.0894054174423218, "geo/layer_14/stable_rank_q_proj": 70.22351837158203, "geo/layer_14/stable_rank_k_proj": 54.660030364990234, "geo/layer_14/stable_rank_o_proj": 68.49818420410156, "geo/layer_14/stable_rank_gate_proj": 163.05691528320312, "geo/layer_14/stable_rank_down_proj": 151.0631561279297, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4082658588886261, "geo/layer_14/attn_entropy_mean": 6.138854026794434, "geo/layer_14/attn_entropy_std": 0.49876073002815247, "geo/layer_21/stable_rank_q_proj": 50.07158279418945, "geo/layer_21/stable_rank_k_proj": 32.35681915283203, "geo/layer_21/stable_rank_o_proj": 101.06974029541016, "geo/layer_21/stable_rank_gate_proj": 134.78065490722656, "geo/layer_21/stable_rank_down_proj": 109.42366027832031, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.175504669547081, "geo/layer_21/attn_entropy_mean": 5.854792594909668, "geo/layer_21/attn_entropy_std": 0.31630516052246094, "geo/layer_27/stable_rank_q_proj": 44.83512878417969, "geo/layer_27/stable_rank_k_proj": 38.440216064453125, "geo/layer_27/stable_rank_o_proj": 104.25067138671875, "geo/layer_27/stable_rank_gate_proj": 89.24638366699219, "geo/layer_27/stable_rank_down_proj": 151.25222778320312, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07419751584529877, "geo/layer_27/attn_entropy_mean": 4.615023612976074, "geo/layer_27/attn_entropy_std": 0.4666668176651001, "attnres/final_alpha/block_0": 0.2402036488056183, "attnres/block_norm/0": 1.1322764158248901, "attnres/final_alpha/block_1": 0.015584252774715424, "attnres/block_norm/1": 7686.0029296875, "attnres/final_alpha/block_2": 0.03264999762177467, "attnres/block_norm/2": 5492.96337890625, "attnres/final_alpha/block_3": 0.026913080364465714, "attnres/block_norm/3": 5249.72216796875, "attnres/final_alpha/block_4": 0.04424678534269333, "attnres/block_norm/4": 2918.92724609375, "attnres/final_alpha/block_5": 0.40593093633651733, "attnres/block_norm/5": 3061.072265625, "attnres/final_alpha/block_6": 0.23447129130363464, "attnres/block_norm/6": 4338.5634765625, "geo/tier1_time_s": 1.355522871017456, "geo/step": 5325.0, "geo/rankme_slope": 0.024064230672737844} {"step": 5330, "timestamp": 1778200289.8913758, "train/loss": 2.4166521787643434, "train/z_loss": 0.0019272523000836372, "train/perplexity": 11.208273139257928, "train/grad_norm": 0.2333984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1699258.3368706163, "perf/iters_per_sec": 0.8102695164063531, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2341572523117066, "data/tokens_consumed": 11179917312, "data/tokens_consumed_B": 11.179917312, "train/loss_slope": -0.00011268656647960019} {"step": 5340, "timestamp": 1778200300.2493768, "train/loss": 2.5462180376052856, "train/z_loss": 0.0018792638671584426, "train/perplexity": 12.758759273952341, "train/grad_norm": 0.2119140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025886.5352952871, "perf/iters_per_sec": 0.9660179783321796, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351774215698242, "data/tokens_consumed": 11200888832, "data/tokens_consumed_B": 11.200888832, "train/loss_slope": -0.00010823381312645269} {"step": 5350, "timestamp": 1778200310.590996, "grad/layer_0/attn": 0.00424576411023736, "grad/layer_0/mlp": 0.004876859951764345, "grad/layer_0/attn_mlp_ratio": 0.8705937970685323, "grad/layer_4/attn": 0.00207403302192688, "grad/layer_4/mlp": 0.004058906342834234, "grad/layer_4/attn_mlp_ratio": 0.5109832047468021, "grad/layer_8/attn": 0.011710071004927158, "grad/layer_8/mlp": 0.005592391826212406, "grad/layer_8/attn_mlp_ratio": 2.0939288875731767, "grad/layer_12/attn": 0.004844681825488806, "grad/layer_12/mlp": 0.005975733976811171, "grad/layer_12/attn_mlp_ratio": 0.8107258059371465, "grad/layer_16/attn": 0.006986911408603191, "grad/layer_16/mlp": 0.0053655169904232025, "grad/layer_16/attn_mlp_ratio": 1.302187895566297, "grad/layer_20/attn": 0.013360724784433842, "grad/layer_20/mlp": 0.009650299325585365, "grad/layer_20/attn_mlp_ratio": 1.3844881070748134, "grad/layer_24/attn": 0.011940671131014824, "grad/layer_24/mlp": 0.011178525164723396, "grad/layer_24/attn_mlp_ratio": 1.0681794644859433, "grad/layer_27/attn": 0.00717990193516016, "grad/layer_27/mlp": 0.011563788168132305, "grad/layer_27/attn_mlp_ratio": 0.620895312909409} {"step": 5350, "timestamp": 1778200310.6070333, "train/loss": 2.4967988252639772, "train/z_loss": 0.0019044530228711665, "train/perplexity": 12.1435580224354, "train/grad_norm": 0.2041015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025948.221081035, "perf/iters_per_sec": 0.9660473924069571, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035145902633667, "data/tokens_consumed": 11221860352, "data/tokens_consumed_B": 11.221860352, "train/loss_slope": -0.0001132784341857342} {"step": 5360, "timestamp": 1778200320.9807856, "train/loss": 2.467725896835327, "train/z_loss": 0.0019073877134360372, "train/perplexity": 11.795591934845946, "train/grad_norm": 0.2041015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022773.1172358706, "perf/iters_per_sec": 0.9645333849124291, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036770749092102, "data/tokens_consumed": 11242831872, "data/tokens_consumed_B": 11.242831872, "train/loss_slope": -0.00011481841838721068} {"step": 5370, "timestamp": 1778200331.3413463, "train/loss": 2.5040772914886475, "train/z_loss": 0.0018961093621328474, "train/perplexity": 12.232266940159283, "train/grad_norm": 0.2421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025212.3427899496, "perf/iters_per_sec": 0.9656964982938526, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0355220317840577, "data/tokens_consumed": 11263803392, "data/tokens_consumed_B": 11.263803392, "train/loss_slope": -0.00011533681940753428} {"step": 5380, "timestamp": 1778200341.7379668, "train/loss": 2.525020408630371, "train/z_loss": 0.0018922363175079226, "train/perplexity": 12.491150188272568, "train/grad_norm": 0.2412109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2018397.2397833087, "perf/iters_per_sec": 0.9624468039433044, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03901846408844, "data/tokens_consumed": 11284774912, "data/tokens_consumed_B": 11.284774912, "train/loss_slope": -0.00011640930504831791} {"step": 5390, "timestamp": 1778200352.091729, "train/loss": 2.5095054149627685, "train/z_loss": 0.001889756228774786, "train/perplexity": 12.29884573095151, "train/grad_norm": 0.255859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026802.5960790126, "perf/iters_per_sec": 0.9664547901530326, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347095489501954, "data/tokens_consumed": 11305746432, "data/tokens_consumed_B": 11.305746432, "train/loss_slope": -0.00011413341388307529} {"step": 5400, "timestamp": 1778200362.430062, "grad/layer_0/attn": 0.0029845081735402346, "grad/layer_0/mlp": 0.004190254490822554, "grad/layer_0/attn_mlp_ratio": 0.7122498427844622, "grad/layer_4/attn": 0.001961178844794631, "grad/layer_4/mlp": 0.003944477532058954, "grad/layer_4/attn_mlp_ratio": 0.4971960872220556, "grad/layer_8/attn": 0.006326131988316774, "grad/layer_8/mlp": 0.00531186955049634, "grad/layer_8/attn_mlp_ratio": 1.1909426255830022, "grad/layer_12/attn": 0.005393948405981064, "grad/layer_12/mlp": 0.005914668552577496, "grad/layer_12/attn_mlp_ratio": 0.9119612141976015, "grad/layer_16/attn": 0.005884724203497171, "grad/layer_16/mlp": 0.005203728098422289, "grad/layer_16/attn_mlp_ratio": 1.1308669436811387, "grad/layer_20/attn": 0.013406246900558472, "grad/layer_20/mlp": 0.009225544519722462, "grad/layer_20/attn_mlp_ratio": 1.4531659054467596, "grad/layer_24/attn": 0.012500791810452938, "grad/layer_24/mlp": 0.014604681171476841, "grad/layer_24/attn_mlp_ratio": 0.8559441714669371, "grad/layer_27/attn": 0.012347075156867504, "grad/layer_27/mlp": 0.015151023864746094, "grad/layer_27/attn_mlp_ratio": 0.8149333791298258} {"step": 5400, "timestamp": 1778200363.0417593, "eos/sharpness": 17.990016937255856, "eos/L0_probe": 2.33437442779541, "eos/L_plus": 2.432047128677368, "eos/L_minus": 2.4166018962860107, "eos/grad_norm": 0.20262503623962402, "eos/embed_grad_frac": 0.10581189393997192, "eos/time_s": 0.6089742183685303} {"step": 5400, "timestamp": 1778200363.0624063, "train/loss": 2.5261566162109377, "train/z_loss": 0.001892726740334183, "train/perplexity": 12.505350793696536, "train/grad_norm": 0.203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1912725.3323521912, "perf/iters_per_sec": 0.9120585119019466, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0964208841323853, "data/tokens_consumed": 11326717952, "data/tokens_consumed_B": 11.326717952, "train/loss_slope": -0.00011202649529403009} {"step": 5400, "timestamp": 1778200364.4270794, "geo/rankme_last": 421.8448791503906, "geo/layer_0/stable_rank_q_proj": 21.444555282592773, "geo/layer_0/stable_rank_k_proj": 14.46254825592041, "geo/layer_0/stable_rank_o_proj": 58.18708419799805, "geo/layer_0/stable_rank_gate_proj": 170.78109741210938, "geo/layer_0/stable_rank_down_proj": 47.199195861816406, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.03171628713607788, "geo/layer_0/attn_entropy_mean": 6.571208953857422, "geo/layer_0/attn_entropy_std": 0.2230723649263382, "geo/layer_7/stable_rank_q_proj": 46.08454513549805, "geo/layer_7/stable_rank_k_proj": 41.71702194213867, "geo/layer_7/stable_rank_o_proj": 103.68982696533203, "geo/layer_7/stable_rank_gate_proj": 157.7008056640625, "geo/layer_7/stable_rank_down_proj": 204.75025939941406, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5748453140258789, "geo/layer_7/attn_entropy_mean": 4.898198127746582, "geo/layer_7/attn_entropy_std": 1.0711253881454468, "geo/layer_14/stable_rank_q_proj": 71.38794708251953, "geo/layer_14/stable_rank_k_proj": 55.658172607421875, "geo/layer_14/stable_rank_o_proj": 66.81249237060547, "geo/layer_14/stable_rank_gate_proj": 162.2021026611328, "geo/layer_14/stable_rank_down_proj": 150.7092742919922, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4080636203289032, "geo/layer_14/attn_entropy_mean": 6.151358604431152, "geo/layer_14/attn_entropy_std": 0.5060430765151978, "geo/layer_21/stable_rank_q_proj": 50.306888580322266, "geo/layer_21/stable_rank_k_proj": 32.27601623535156, "geo/layer_21/stable_rank_o_proj": 100.79610443115234, "geo/layer_21/stable_rank_gate_proj": 133.87705993652344, "geo/layer_21/stable_rank_down_proj": 108.17588806152344, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.17533788084983826, "geo/layer_21/attn_entropy_mean": 5.86680793762207, "geo/layer_21/attn_entropy_std": 0.31388089060783386, "geo/layer_27/stable_rank_q_proj": 44.711090087890625, "geo/layer_27/stable_rank_k_proj": 38.094425201416016, "geo/layer_27/stable_rank_o_proj": 104.30559539794922, "geo/layer_27/stable_rank_gate_proj": 89.54835510253906, "geo/layer_27/stable_rank_down_proj": 153.2714385986328, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.06998717039823532, "geo/layer_27/attn_entropy_mean": 4.662001609802246, "geo/layer_27/attn_entropy_std": 0.47463732957839966, "attnres/final_alpha/block_0": 0.24098607897758484, "attnres/block_norm/0": 1.1399834156036377, "attnres/final_alpha/block_1": 0.015625115483999252, "attnres/block_norm/1": 7841.06787109375, "attnres/final_alpha/block_2": 0.03290846198797226, "attnres/block_norm/2": 5559.91162109375, "attnres/final_alpha/block_3": 0.027086755260825157, "attnres/block_norm/3": 5379.8876953125, "attnres/final_alpha/block_4": 0.04423149675130844, "attnres/block_norm/4": 3011.88671875, "attnres/final_alpha/block_5": 0.399894118309021, "attnres/block_norm/5": 3092.134521484375, "attnres/final_alpha/block_6": 0.2392679750919342, "attnres/block_norm/6": 4420.916015625, "geo/tier1_time_s": 1.3602442741394043, "geo/step": 5400.0, "geo/rankme_slope": 0.023530931806316276} {"step": 5410, "timestamp": 1778200374.7798326, "train/loss": 2.497674751281738, "train/z_loss": 0.0018946913420222699, "train/perplexity": 12.154199540766438, "train/grad_norm": 0.1904296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1790398.6839870235, "perf/iters_per_sec": 0.8537286205229871, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1713324069976807, "data/tokens_consumed": 11347689472, "data/tokens_consumed_B": 11.347689472, "train/loss_slope": -0.00011208928275888043} {"step": 5420, "timestamp": 1778200385.1318712, "train/loss": 2.535246801376343, "train/z_loss": 0.0018873939639888703, "train/perplexity": 12.619544984805545, "train/grad_norm": 0.263671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026828.0955515373, "perf/iters_per_sec": 0.966466949249047, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346965312957763, "data/tokens_consumed": 11368660992, "data/tokens_consumed_B": 11.368660992, "train/loss_slope": -0.00011210750301714739} {"step": 5430, "timestamp": 1778200395.4846942, "train/loss": 2.5215995073318482, "train/z_loss": 0.0018948281067423522, "train/perplexity": 12.448492202353824, "train/grad_norm": 0.1826171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026682.2995292323, "perf/iters_per_sec": 0.9663974282880937, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347709655761719, "data/tokens_consumed": 11389632512, "data/tokens_consumed_B": 11.389632512, "train/loss_slope": -0.00011061287346881963} {"step": 5440, "timestamp": 1778200405.845946, "train/loss": 2.5598473072052004, "train/z_loss": 0.0018783694365993142, "train/perplexity": 12.933842260236343, "train/grad_norm": 0.2333984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025147.9042722064, "perf/iters_per_sec": 0.9656657716141731, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0355549812316895, "data/tokens_consumed": 11410604032, "data/tokens_consumed_B": 11.410604032, "train/loss_slope": -0.0001052073415034603} {"step": 5450, "timestamp": 1778200416.187532, "grad/layer_0/attn": 0.0032298045698553324, "grad/layer_0/mlp": 0.004066404886543751, "grad/layer_0/attn_mlp_ratio": 0.7942653475350254, "grad/layer_4/attn": 0.0022665224969387054, "grad/layer_4/mlp": 0.0038686394691467285, "grad/layer_4/attn_mlp_ratio": 0.5858706804879763, "grad/layer_8/attn": 0.006575123872607946, "grad/layer_8/mlp": 0.005271519999951124, "grad/layer_8/attn_mlp_ratio": 1.2472918148730778, "grad/layer_12/attn": 0.005290533881634474, "grad/layer_12/mlp": 0.0060517131350934505, "grad/layer_12/attn_mlp_ratio": 0.8742208488920209, "grad/layer_16/attn": 0.0057218037545681, "grad/layer_16/mlp": 0.004941877443343401, "grad/layer_16/attn_mlp_ratio": 1.1578198173435605, "grad/layer_20/attn": 0.010465036146342754, "grad/layer_20/mlp": 0.008562227711081505, "grad/layer_20/attn_mlp_ratio": 1.2222328554256143, "grad/layer_24/attn": 0.01199700403958559, "grad/layer_24/mlp": 0.011635332368314266, "grad/layer_24/attn_mlp_ratio": 1.0310839051876035, "grad/layer_27/attn": 0.007126024924218655, "grad/layer_27/mlp": 0.011204331181943417, "grad/layer_27/attn_mlp_ratio": 0.6360062680137596} {"step": 5450, "timestamp": 1778200416.203398, "train/loss": 2.462029814720154, "train/z_loss": 0.0019010406220331788, "train/perplexity": 11.728594267841187, "train/grad_norm": 0.16796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025628.6342119991, "perf/iters_per_sec": 0.965895001512527, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0353092193603515, "data/tokens_consumed": 11431575552, "data/tokens_consumed_B": 11.431575552, "train/loss_slope": -0.0001054813160540068} {"step": 5460, "timestamp": 1778200426.5521395, "train/loss": 2.5348129987716677, "train/z_loss": 0.0018885963596403598, "train/perplexity": 12.61407178055228, "train/grad_norm": 0.2470703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027720.2803305169, "perf/iters_per_sec": 0.9668923761036476, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034241271018982, "data/tokens_consumed": 11452547072, "data/tokens_consumed_B": 11.452547072, "train/loss_slope": -0.00010203816487987294} {"step": 5470, "timestamp": 1778200436.9125345, "train/loss": 2.590215635299683, "train/z_loss": 0.001868364668916911, "train/perplexity": 13.332646282420152, "train/grad_norm": 0.228515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025341.4653006333, "perf/iters_per_sec": 0.9657580687049071, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354560136795044, "data/tokens_consumed": 11473518592, "data/tokens_consumed_B": 11.473518592, "train/loss_slope": -9.637921346475005e-05} {"step": 5475, "timestamp": 1778200442.685816, "eos/sharpness": 4.546546936035155, "eos/L0_probe": 2.3336637020111084, "eos/L_plus": 2.360870122909546, "eos/L_minus": 2.3519227504730225, "eos/grad_norm": 0.211417555809021, "eos/embed_grad_frac": 0.38858890533447266, "eos/time_s": 0.60750412940979} {"step": 5475, "timestamp": 1778200444.0670445, "geo/rankme_last": 421.59051513671875, "geo/layer_0/stable_rank_q_proj": 21.046443939208984, "geo/layer_0/stable_rank_k_proj": 14.16572380065918, "geo/layer_0/stable_rank_o_proj": 58.416542053222656, "geo/layer_0/stable_rank_gate_proj": 170.49063110351562, "geo/layer_0/stable_rank_down_proj": 47.12619400024414, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.029494941234588623, "geo/layer_0/attn_entropy_mean": 6.571104049682617, "geo/layer_0/attn_entropy_std": 0.21867597103118896, "geo/layer_7/stable_rank_q_proj": 45.716766357421875, "geo/layer_7/stable_rank_k_proj": 41.78364562988281, "geo/layer_7/stable_rank_o_proj": 104.38642883300781, "geo/layer_7/stable_rank_gate_proj": 157.82229614257812, "geo/layer_7/stable_rank_down_proj": 204.71444702148438, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5790290832519531, "geo/layer_7/attn_entropy_mean": 4.8869757652282715, "geo/layer_7/attn_entropy_std": 1.0620959997177124, "geo/layer_14/stable_rank_q_proj": 72.14411926269531, "geo/layer_14/stable_rank_k_proj": 56.949527740478516, "geo/layer_14/stable_rank_o_proj": 65.11622619628906, "geo/layer_14/stable_rank_gate_proj": 159.26246643066406, "geo/layer_14/stable_rank_down_proj": 150.4978790283203, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.40242406725883484, "geo/layer_14/attn_entropy_mean": 6.141883850097656, "geo/layer_14/attn_entropy_std": 0.47615429759025574, "geo/layer_21/stable_rank_q_proj": 50.41439437866211, "geo/layer_21/stable_rank_k_proj": 32.425148010253906, "geo/layer_21/stable_rank_o_proj": 100.27664947509766, "geo/layer_21/stable_rank_gate_proj": 132.9937286376953, "geo/layer_21/stable_rank_down_proj": 106.2995376586914, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.17298361659049988, "geo/layer_21/attn_entropy_mean": 5.835288047790527, "geo/layer_21/attn_entropy_std": 0.3092852532863617, "geo/layer_27/stable_rank_q_proj": 44.488494873046875, "geo/layer_27/stable_rank_k_proj": 37.93699645996094, "geo/layer_27/stable_rank_o_proj": 104.8974380493164, "geo/layer_27/stable_rank_gate_proj": 89.99508666992188, "geo/layer_27/stable_rank_down_proj": 154.20729064941406, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07241470366716385, "geo/layer_27/attn_entropy_mean": 4.649405002593994, "geo/layer_27/attn_entropy_std": 0.4669618010520935, "attnres/final_alpha/block_0": 0.2401822954416275, "attnres/block_norm/0": 1.1471084356307983, "attnres/final_alpha/block_1": 0.015493386425077915, "attnres/block_norm/1": 8025.14453125, "attnres/final_alpha/block_2": 0.03204556554555893, "attnres/block_norm/2": 5723.1396484375, "attnres/final_alpha/block_3": 0.027169587090611458, "attnres/block_norm/3": 5502.193359375, "attnres/final_alpha/block_4": 0.043241340667009354, "attnres/block_norm/4": 3054.263427734375, "attnres/final_alpha/block_5": 0.40846940875053406, "attnres/block_norm/5": 3115.14501953125, "attnres/final_alpha/block_6": 0.2333984375, "attnres/block_norm/6": 4517.7880859375, "geo/tier1_time_s": 1.3594796657562256, "geo/step": 5475.0, "geo/rankme_slope": 0.022942210458402112} {"step": 5480, "timestamp": 1778200449.245639, "train/loss": 2.4712283611297607, "train/z_loss": 0.0019026989117264747, "train/perplexity": 11.836978008744724, "train/grad_norm": 0.140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1701410.198466743, "perf/iters_per_sec": 0.8112956039746967, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2325963497161865, "data/tokens_consumed": 11494490112, "data/tokens_consumed_B": 11.494490112, "train/loss_slope": -9.412035946369595e-05} {"step": 5490, "timestamp": 1778200459.5988476, "train/loss": 2.556670093536377, "train/z_loss": 0.0018787600216455757, "train/perplexity": 12.892813892276497, "train/grad_norm": 0.203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026745.5281558658, "perf/iters_per_sec": 0.9664275780467347, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347386837005614, "data/tokens_consumed": 11515461632, "data/tokens_consumed_B": 11.515461632, "train/loss_slope": -8.888710262608747e-05} {"step": 5500, "timestamp": 1778200470.366318, "grad/layer_0/attn": 0.003174066776409745, "grad/layer_0/mlp": 0.004388609901070595, "grad/layer_0/attn_mlp_ratio": 0.7232510466036938, "grad/layer_4/attn": 0.001952267368324101, "grad/layer_4/mlp": 0.003931912127882242, "grad/layer_4/attn_mlp_ratio": 0.4965185525963806, "grad/layer_8/attn": 0.00830287765711546, "grad/layer_8/mlp": 0.005560766905546188, "grad/layer_8/attn_mlp_ratio": 1.4931173431352815, "grad/layer_12/attn": 0.005567905027419329, "grad/layer_12/mlp": 0.00597540196031332, "grad/layer_12/attn_mlp_ratio": 0.9318042486880583, "grad/layer_16/attn": 0.0056620207615196705, "grad/layer_16/mlp": 0.005147312302142382, "grad/layer_16/attn_mlp_ratio": 1.0999955547992504, "grad/layer_20/attn": 0.009259031154215336, "grad/layer_20/mlp": 0.009519034996628761, "grad/layer_20/attn_mlp_ratio": 0.9726858930790677, "grad/layer_24/attn": 0.015527520328760147, "grad/layer_24/mlp": 0.016319025307893753, "grad/layer_24/attn_mlp_ratio": 0.9514980178442063, "grad/layer_27/attn": 0.01232998725026846, "grad/layer_27/mlp": 0.015256324782967567, "grad/layer_27/attn_mlp_ratio": 0.8081885608003718} {"step": 5500, "timestamp": 1778200470.3823493, "train/loss": 2.5271826505661013, "train/z_loss": 0.0018866188009269535, "train/perplexity": 12.51818829796823, "train/grad_norm": 0.205078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1945858.072128099, "perf/iters_per_sec": 0.9278574333801741, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0777517795562743, "data/tokens_consumed": 11536433152, "data/tokens_consumed_B": 11.536433152, "train/loss_slope": -8.325678142670068e-05} {"step": 5500, "timestamp": 1778200477.472854, "geo/ww_alpha_mean": 8.615113452564339, "geo/ww_alpha_std": 5.203010762218662, "geo/ww_alpha_min": 2.134708385368647, "geo/ww_alpha_max": 32.387257528475814, "geo/ww_alpha_healthy_frac": 0.14213197969543148, "geo/ww_alpha_by_type/q_proj": 4.834798547675815, "geo/ww_alpha_by_type/k_proj": 5.685763814607379, "geo/ww_alpha_by_type/v_proj": 8.268815984495761, "geo/ww_alpha_by_type/o_proj": 7.283316321881365, "geo/ww_alpha_by_type/gate_proj": 12.55842941813751, "geo/ww_alpha_by_type/up_proj": 12.332629586339134, "geo/ww_alpha_by_type/down_proj": 9.53488207993155, "geo/twonn_id/layer_0": 0.6628124713897705, "geo/twonn_id/layer_7": 2.7994678020477295, "geo/twonn_id/layer_14": 3.2403159141540527, "geo/twonn_id/layer_21": 6.674387454986572, "geo/twonn_id/layer_27": 6.5633368492126465, "geo/tier2_time_s": 7.082808494567871} {"step": 5500, "timestamp": 1778200478.181246, "eoc/jacobian_sigma/layer_0/attn": 615.7294921875, "eoc/jacobian_sigma/layer_0/mlp": 1202.3345947265625, "eoc/jacobian_sigma/layer_0": 1202.3345947265625, "eoc/jacobian_sigma/layer_7/attn": 1.154754400253296, "eoc/jacobian_sigma/layer_7/mlp": 1.6240601539611816, "eoc/jacobian_sigma/layer_7": 1.6240601539611816, "eoc/jacobian_sigma/layer_14/attn": 1.1309850215911865, "eoc/jacobian_sigma/layer_14/mlp": 5.600870132446289, "eoc/jacobian_sigma/layer_14": 5.600870132446289, "eoc/jacobian_sigma/layer_21/attn": 1.0710656642913818, "eoc/jacobian_sigma/layer_21/mlp": 2.5908615589141846, "eoc/jacobian_sigma/layer_21": 2.5908615589141846, "eoc/jacobian_sigma/layer_27/attn": 1.5297178030014038, "eoc/jacobian_sigma/layer_27/mlp": 6.447983741760254, "eoc/jacobian_sigma/layer_27": 6.447983741760254, "eoc/layer0_sigma": 1202.3345947265625, "eoc/sigma_max": 6.447983741760254, "eoc/sigma_min": 1.6240601539611816, "eoc/sigma_mean": 4.065943896770477, "eoc/time_s": 0.7015233039855957} {"step": 5510, "timestamp": 1778200489.015287, "train/loss": 2.4405483484268187, "train/z_loss": 0.0019124885671772062, "train/perplexity": 11.479333691843644, "train/grad_norm": 0.25, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1125893.436107868, "perf/iters_per_sec": 0.5368678265132275, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.8626558542251588, "data/tokens_consumed": 11557404672, "data/tokens_consumed_B": 11.557404672, "train/loss_slope": -8.486790017540878e-05} {"step": 5520, "timestamp": 1778200499.3720074, "train/loss": 2.472423267364502, "train/z_loss": 0.0019051468814723193, "train/perplexity": 11.851130541358682, "train/grad_norm": 0.1484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026026.2435820724, "perf/iters_per_sec": 0.9660845964346277, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351060390472413, "data/tokens_consumed": 11578376192, "data/tokens_consumed_B": 11.578376192, "train/loss_slope": -8.952280480046907e-05} {"step": 5530, "timestamp": 1778200509.7213073, "train/loss": 2.5182432174682616, "train/z_loss": 0.001899324101395905, "train/perplexity": 12.406781489933595, "train/grad_norm": 0.2138671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027653.812479736, "perf/iters_per_sec": 0.9668606817625695, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342751741409302, "data/tokens_consumed": 11599347712, "data/tokens_consumed_B": 11.599347712, "train/loss_slope": -8.955764265486762e-05} {"step": 5540, "timestamp": 1778200520.0755386, "train/loss": 2.5344074964523315, "train/z_loss": 0.0018828142783604562, "train/perplexity": 12.608957782128181, "train/grad_norm": 0.171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026480.545337749, "perf/iters_per_sec": 0.9663012243927712, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348739862442016, "data/tokens_consumed": 11620319232, "data/tokens_consumed_B": 11.620319232, "train/loss_slope": -8.849386578023956e-05} {"step": 5550, "timestamp": 1778200530.4200697, "grad/layer_0/attn": 0.003864990547299385, "grad/layer_0/mlp": 0.004796245135366917, "grad/layer_0/attn_mlp_ratio": 0.8058367238604535, "grad/layer_4/attn": 0.002032097429037094, "grad/layer_4/mlp": 0.004167033359408379, "grad/layer_4/attn_mlp_ratio": 0.4876604540933071, "grad/layer_8/attn": 0.010761049576103687, "grad/layer_8/mlp": 0.005555554758757353, "grad/layer_8/attn_mlp_ratio": 1.9369891666429664, "grad/layer_12/attn": 0.0058146589435637, "grad/layer_12/mlp": 0.0061310953460633755, "grad/layer_12/attn_mlp_ratio": 0.9483882602573327, "grad/layer_16/attn": 0.00671837292611599, "grad/layer_16/mlp": 0.005606349091976881, "grad/layer_16/attn_mlp_ratio": 1.1983507798141615, "grad/layer_20/attn": 0.00993959978222847, "grad/layer_20/mlp": 0.009836257435381413, "grad/layer_20/attn_mlp_ratio": 1.0105062567216578, "grad/layer_24/attn": 0.021685484796762466, "grad/layer_24/mlp": 0.01473135594278574, "grad/layer_24/attn_mlp_ratio": 1.4720630425182284, "grad/layer_27/attn": 0.008338509127497673, "grad/layer_27/mlp": 0.016839463263750076, "grad/layer_27/attn_mlp_ratio": 0.49517665423043045} {"step": 5550, "timestamp": 1778200531.040703, "eos/sharpness": 34.83064174652099, "eos/L0_probe": 2.325746774673462, "eos/L_plus": 2.5716359615325928, "eos/L_minus": 2.428164005279541, "eos/grad_norm": 0.2749462127685547, "eos/embed_grad_frac": 0.054131463170051575, "eos/time_s": 0.6177451610565186} {"step": 5550, "timestamp": 1778200531.0607388, "train/loss": 2.5175935506820677, "train/z_loss": 0.0018863339675590396, "train/perplexity": 12.398723833755222, "train/grad_norm": 0.275390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1909775.3217838597, "perf/iters_per_sec": 0.9106518372458743, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.098114514350891, "data/tokens_consumed": 11641290752, "data/tokens_consumed_B": 11.641290752, "train/loss_slope": -8.532528711302377e-05} {"step": 5550, "timestamp": 1778200532.4273512, "geo/rankme_last": 421.6164245605469, "geo/layer_0/stable_rank_q_proj": 20.596031188964844, "geo/layer_0/stable_rank_k_proj": 13.909475326538086, "geo/layer_0/stable_rank_o_proj": 58.25601577758789, "geo/layer_0/stable_rank_gate_proj": 171.46986389160156, "geo/layer_0/stable_rank_down_proj": 47.270938873291016, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.03158793970942497, "geo/layer_0/attn_entropy_mean": 6.563490867614746, "geo/layer_0/attn_entropy_std": 0.2216355949640274, "geo/layer_7/stable_rank_q_proj": 45.24100875854492, "geo/layer_7/stable_rank_k_proj": 42.11590576171875, "geo/layer_7/stable_rank_o_proj": 104.21141815185547, "geo/layer_7/stable_rank_gate_proj": 156.54611206054688, "geo/layer_7/stable_rank_down_proj": 203.49044799804688, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5854237675666809, "geo/layer_7/attn_entropy_mean": 4.864616394042969, "geo/layer_7/attn_entropy_std": 1.0512242317199707, "geo/layer_14/stable_rank_q_proj": 73.10714721679688, "geo/layer_14/stable_rank_k_proj": 57.70695114135742, "geo/layer_14/stable_rank_o_proj": 63.92457580566406, "geo/layer_14/stable_rank_gate_proj": 158.03045654296875, "geo/layer_14/stable_rank_down_proj": 149.7054901123047, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.40944069623947144, "geo/layer_14/attn_entropy_mean": 6.116576194763184, "geo/layer_14/attn_entropy_std": 0.504058837890625, "geo/layer_21/stable_rank_q_proj": 50.735939025878906, "geo/layer_21/stable_rank_k_proj": 32.490455627441406, "geo/layer_21/stable_rank_o_proj": 100.16119384765625, "geo/layer_21/stable_rank_gate_proj": 132.14205932617188, "geo/layer_21/stable_rank_down_proj": 104.40091705322266, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.17728373408317566, "geo/layer_21/attn_entropy_mean": 5.84539794921875, "geo/layer_21/attn_entropy_std": 0.31502556800842285, "geo/layer_27/stable_rank_q_proj": 44.417240142822266, "geo/layer_27/stable_rank_k_proj": 37.43449020385742, "geo/layer_27/stable_rank_o_proj": 105.60284423828125, "geo/layer_27/stable_rank_gate_proj": 90.2713394165039, "geo/layer_27/stable_rank_down_proj": 155.66586303710938, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.06847769767045975, "geo/layer_27/attn_entropy_mean": 4.653423309326172, "geo/layer_27/attn_entropy_std": 0.5124155282974243, "attnres/final_alpha/block_0": 0.24095377326011658, "attnres/block_norm/0": 1.1544923782348633, "attnres/final_alpha/block_1": 0.014974097721278667, "attnres/block_norm/1": 8240.025390625, "attnres/final_alpha/block_2": 0.03150761127471924, "attnres/block_norm/2": 5918.82666015625, "attnres/final_alpha/block_3": 0.026514489203691483, "attnres/block_norm/3": 5706.9404296875, "attnres/final_alpha/block_4": 0.042266372591257095, "attnres/block_norm/4": 3086.44091796875, "attnres/final_alpha/block_5": 0.41750913858413696, "attnres/block_norm/5": 3137.166015625, "attnres/final_alpha/block_6": 0.2262745499610901, "attnres/block_norm/6": 4661.2958984375, "geo/tier1_time_s": 1.3626322746276855, "geo/step": 5550.0, "geo/rankme_slope": 0.022484108897465237} {"step": 5560, "timestamp": 1778200543.2500196, "train/loss": 2.5316705465316773, "train/z_loss": 0.0018835385562852025, "train/perplexity": 12.574494879259651, "train/grad_norm": 0.1669921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1721027.826668895, "perf/iters_per_sec": 0.8206500180572963, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2185462474823, "data/tokens_consumed": 11662262272, "data/tokens_consumed_B": 11.662262272, "train/loss_slope": -8.286775745788998e-05} {"step": 5570, "timestamp": 1778200554.0281694, "train/loss": 2.506504535675049, "train/z_loss": 0.0018881231895647942, "train/perplexity": 12.261993701435902, "train/grad_norm": 0.1689453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1946854.967679694, "perf/iters_per_sec": 0.9283327902220221, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0771999120712281, "data/tokens_consumed": 11683233792, "data/tokens_consumed_B": 11.683233792, "train/loss_slope": -8.06698801279283e-05} {"step": 5580, "timestamp": 1778200564.3767052, "train/loss": 2.4594276428222654, "train/z_loss": 0.00190534433349967, "train/perplexity": 11.69811412392183, "train/grad_norm": 0.18359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027627.5443899014, "perf/iters_per_sec": 0.9668481561612613, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342885732650757, "data/tokens_consumed": 11704205312, "data/tokens_consumed_B": 11.704205312, "train/loss_slope": -8.045007240439626e-05} {"step": 5590, "timestamp": 1778200574.7423794, "train/loss": 2.4710317134857176, "train/z_loss": 0.0019048528047278523, "train/perplexity": 11.834650523761432, "train/grad_norm": 0.2265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026797.0385868258, "perf/iters_per_sec": 0.9664521401342515, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347123861312866, "data/tokens_consumed": 11725176832, "data/tokens_consumed_B": 11.725176832, "train/loss_slope": -8.218409610946301e-05} {"step": 5600, "timestamp": 1778200585.0967314, "grad/layer_0/attn": 0.0033599233720451593, "grad/layer_0/mlp": 0.004334066063165665, "grad/layer_0/attn_mlp_ratio": 0.7752358283314769, "grad/layer_4/attn": 0.0016517172334715724, "grad/layer_4/mlp": 0.0037517542950809, "grad/layer_4/attn_mlp_ratio": 0.4402519620253436, "grad/layer_8/attn": 0.009760742075741291, "grad/layer_8/mlp": 0.005205569323152304, "grad/layer_8/attn_mlp_ratio": 1.8750575167299453, "grad/layer_12/attn": 0.0048495992086827755, "grad/layer_12/mlp": 0.005598120391368866, "grad/layer_12/attn_mlp_ratio": 0.8662906088141272, "grad/layer_16/attn": 0.006806282792240381, "grad/layer_16/mlp": 0.005358608439564705, "grad/layer_16/attn_mlp_ratio": 1.270158613376389, "grad/layer_20/attn": 0.011023912578821182, "grad/layer_20/mlp": 0.009161309339106083, "grad/layer_20/attn_mlp_ratio": 1.20331189030297, "grad/layer_24/attn": 0.015481294132769108, "grad/layer_24/mlp": 0.017422731965780258, "grad/layer_24/attn_mlp_ratio": 0.8885686856871143, "grad/layer_27/attn": 0.01859043724834919, "grad/layer_27/mlp": 0.016393227502703667, "grad/layer_27/attn_mlp_ratio": 1.1340315463736468} {"step": 5600, "timestamp": 1778200585.1125271, "train/loss": 2.4644597291946413, "train/z_loss": 0.0018990454729646445, "train/perplexity": 11.75712840253151, "train/grad_norm": 0.22265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023706.1048001337, "perf/iters_per_sec": 0.9649782680512112, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0362927675247193, "data/tokens_consumed": 11746148352, "data/tokens_consumed_B": 11.746148352, "train/loss_slope": -8.342030309464821e-05} {"step": 5610, "timestamp": 1778200595.478333, "train/loss": 2.47798171043396, "train/z_loss": 0.0019013666780665517, "train/perplexity": 11.91718779344214, "train/grad_norm": 0.2236328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024169.1483819499, "perf/iters_per_sec": 0.9651990644368886, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0360557079315185, "data/tokens_consumed": 11767119872, "data/tokens_consumed_B": 11.767119872, "train/loss_slope": -8.556586648120997e-05} {"step": 5620, "timestamp": 1778200605.8376358, "train/loss": 2.4824156045913695, "train/z_loss": 0.0019005723646841944, "train/perplexity": 11.970144658581306, "train/grad_norm": 0.10205078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025138.7657116966, "perf/iters_per_sec": 0.9656614140089496, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0355596542358398, "data/tokens_consumed": 11788091392, "data/tokens_consumed_B": 11.788091392, "train/loss_slope": -8.509502450947008e-05} {"step": 5625, "timestamp": 1778200611.6218753, "eos/sharpness": 33.159375190734856, "eos/L0_probe": 2.3212451934814453, "eos/L_plus": 2.419072151184082, "eos/L_minus": 2.5550119876861572, "eos/grad_norm": 0.15437057614326477, "eos/embed_grad_frac": 0.15903830528259277, "eos/time_s": 0.6153631210327148} {"step": 5625, "timestamp": 1778200613.001038, "geo/rankme_last": 424.9694519042969, "geo/layer_0/stable_rank_q_proj": 20.214937210083008, "geo/layer_0/stable_rank_k_proj": 13.669977188110352, "geo/layer_0/stable_rank_o_proj": 57.83710861206055, "geo/layer_0/stable_rank_gate_proj": 172.11952209472656, "geo/layer_0/stable_rank_down_proj": 47.254554748535156, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.028665045276284218, "geo/layer_0/attn_entropy_mean": 6.562778472900391, "geo/layer_0/attn_entropy_std": 0.22971242666244507, "geo/layer_7/stable_rank_q_proj": 45.188411712646484, "geo/layer_7/stable_rank_k_proj": 42.643043518066406, "geo/layer_7/stable_rank_o_proj": 104.094482421875, "geo/layer_7/stable_rank_gate_proj": 155.6094207763672, "geo/layer_7/stable_rank_down_proj": 201.1773681640625, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5866538882255554, "geo/layer_7/attn_entropy_mean": 4.832617282867432, "geo/layer_7/attn_entropy_std": 1.0524425506591797, "geo/layer_14/stable_rank_q_proj": 74.2269287109375, "geo/layer_14/stable_rank_k_proj": 58.59059143066406, "geo/layer_14/stable_rank_o_proj": 62.256778717041016, "geo/layer_14/stable_rank_gate_proj": 157.34825134277344, "geo/layer_14/stable_rank_down_proj": 150.9340057373047, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.41367107629776, "geo/layer_14/attn_entropy_mean": 6.115149974822998, "geo/layer_14/attn_entropy_std": 0.4992852807044983, "geo/layer_21/stable_rank_q_proj": 50.91326904296875, "geo/layer_21/stable_rank_k_proj": 32.48372268676758, "geo/layer_21/stable_rank_o_proj": 99.27806854248047, "geo/layer_21/stable_rank_gate_proj": 132.06663513183594, "geo/layer_21/stable_rank_down_proj": 103.67656707763672, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.17458122968673706, "geo/layer_21/attn_entropy_mean": 5.846193313598633, "geo/layer_21/attn_entropy_std": 0.32515737414360046, "geo/layer_27/stable_rank_q_proj": 44.245460510253906, "geo/layer_27/stable_rank_k_proj": 37.210723876953125, "geo/layer_27/stable_rank_o_proj": 105.69998168945312, "geo/layer_27/stable_rank_gate_proj": 90.56831359863281, "geo/layer_27/stable_rank_down_proj": 157.68960571289062, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07056339830160141, "geo/layer_27/attn_entropy_mean": 4.631867408752441, "geo/layer_27/attn_entropy_std": 0.5049506425857544, "attnres/final_alpha/block_0": 0.24130350351333618, "attnres/block_norm/0": 1.1614859104156494, "attnres/final_alpha/block_1": 0.014926889911293983, "attnres/block_norm/1": 8450.1875, "attnres/final_alpha/block_2": 0.031532615423202515, "attnres/block_norm/2": 5997.6123046875, "attnres/final_alpha/block_3": 0.026293637230992317, "attnres/block_norm/3": 5834.2294921875, "attnres/final_alpha/block_4": 0.043092064559459686, "attnres/block_norm/4": 3153.145263671875, "attnres/final_alpha/block_5": 0.4091433882713318, "attnres/block_norm/5": 3154.4638671875, "attnres/final_alpha/block_6": 0.23370790481567383, "attnres/block_norm/6": 4707.09814453125, "geo/tier1_time_s": 1.3582870960235596, "geo/step": 5625.0, "geo/rankme_slope": 0.022033849399134654} {"step": 5630, "timestamp": 1778200618.7153966, "train/loss": 2.5632894039154053, "train/z_loss": 0.0018783098552376033, "train/perplexity": 12.978438504391798, "train/grad_norm": 0.1953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1629463.059662448, "perf/iters_per_sec": 0.7769885347664108, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2870202779769897, "data/tokens_consumed": 11809062912, "data/tokens_consumed_B": 11.809062912, "train/loss_slope": -7.795289147197048e-05} {"step": 5640, "timestamp": 1778200629.1069746, "train/loss": 2.5197732925415037, "train/z_loss": 0.0018908785190433265, "train/perplexity": 12.4257793273829, "train/grad_norm": 0.1357421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019613.0971506971, "perf/iters_per_sec": 0.9630265699151502, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0383929491043091, "data/tokens_consumed": 11830034432, "data/tokens_consumed_B": 11.830034432, "train/loss_slope": -7.04283560642137e-05} {"step": 5650, "timestamp": 1778200639.4757304, "grad/layer_0/attn": 0.003791453782469034, "grad/layer_0/mlp": 0.004650234244763851, "grad/layer_0/attn_mlp_ratio": 0.8153253151076567, "grad/layer_4/attn": 0.0019237162778154016, "grad/layer_4/mlp": 0.003931890241801739, "grad/layer_4/attn_mlp_ratio": 0.4892599006039124, "grad/layer_8/attn": 0.007070143707096577, "grad/layer_8/mlp": 0.0053604780696332455, "grad/layer_8/attn_mlp_ratio": 1.3189389982312536, "grad/layer_12/attn": 0.005602972116321325, "grad/layer_12/mlp": 0.006360063329339027, "grad/layer_12/attn_mlp_ratio": 0.8809616725636362, "grad/layer_16/attn": 0.007214798126369715, "grad/layer_16/mlp": 0.005796744953840971, "grad/layer_16/attn_mlp_ratio": 1.2446291943767875, "grad/layer_20/attn": 0.01189045887440443, "grad/layer_20/mlp": 0.011064951308071613, "grad/layer_20/attn_mlp_ratio": 1.074605611528545, "grad/layer_24/attn": 0.024168474599719048, "grad/layer_24/mlp": 0.01666141115128994, "grad/layer_24/attn_mlp_ratio": 1.4505658755555837, "grad/layer_27/attn": 0.0097727682441473, "grad/layer_27/mlp": 0.017640013247728348, "grad/layer_27/attn_mlp_ratio": 0.5540113860177901} {"step": 5650, "timestamp": 1778200639.4907343, "train/loss": 2.5197196006774902, "train/z_loss": 0.0018902084324508906, "train/perplexity": 12.42511218203929, "train/grad_norm": 0.3125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020954.5270299579, "perf/iters_per_sec": 0.9636662135267057, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0377037048339843, "data/tokens_consumed": 11851005952, "data/tokens_consumed_B": 11.851005952, "train/loss_slope": -6.697188513864823e-05} {"step": 5660, "timestamp": 1778200649.8686275, "train/loss": 2.544925665855408, "train/z_loss": 0.0018799525452777744, "train/perplexity": 12.742280864312297, "train/grad_norm": 0.220703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022471.4579453052, "perf/iters_per_sec": 0.9643895425535703, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0369253873825073, "data/tokens_consumed": 11871977472, "data/tokens_consumed_B": 11.871977472, "train/loss_slope": -5.938812390913171e-05} {"step": 5670, "timestamp": 1778200660.2366161, "train/loss": 2.5436211824417114, "train/z_loss": 0.0018845279002562166, "train/perplexity": 12.725669607182427, "train/grad_norm": 0.29296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024478.8626728759, "perf/iters_per_sec": 0.9653467477192287, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0358972072601318, "data/tokens_consumed": 11892948992, "data/tokens_consumed_B": 11.892948992, "train/loss_slope": -5.7987424227843086e-05} {"step": 5680, "timestamp": 1778200670.5927875, "train/loss": 2.4682132720947267, "train/z_loss": 0.0018986721755936743, "train/perplexity": 11.801342215683448, "train/grad_norm": 0.23046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026332.8854242044, "perf/iters_per_sec": 0.9662308146592161, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349493980407716, "data/tokens_consumed": 11913920512, "data/tokens_consumed_B": 11.913920512, "train/loss_slope": -5.9984795824743975e-05} {"step": 5690, "timestamp": 1778200680.953137, "train/loss": 2.4794071197509764, "train/z_loss": 0.0018994013546034694, "train/perplexity": 11.934186776331645, "train/grad_norm": 0.2119140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025392.7178387456, "perf/iters_per_sec": 0.9657825078195312, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354298114776612, "data/tokens_consumed": 11934892032, "data/tokens_consumed_B": 11.934892032, "train/loss_slope": -6.159213993451355e-05} {"step": 5700, "timestamp": 1778200691.2947578, "grad/layer_0/attn": 0.003141621593385935, "grad/layer_0/mlp": 0.003965800628066063, "grad/layer_0/attn_mlp_ratio": 0.7921783792999499, "grad/layer_4/attn": 0.001839454285800457, "grad/layer_4/mlp": 0.0038389256224036217, "grad/layer_4/attn_mlp_ratio": 0.4791586029043409, "grad/layer_8/attn": 0.007777653634548187, "grad/layer_8/mlp": 0.005470735486596823, "grad/layer_8/attn_mlp_ratio": 1.4216833388188714, "grad/layer_12/attn": 0.005463401786983013, "grad/layer_12/mlp": 0.006482451222836971, "grad/layer_12/attn_mlp_ratio": 0.8427987369123842, "grad/layer_16/attn": 0.007850978523492813, "grad/layer_16/mlp": 0.005348011385649443, "grad/layer_16/attn_mlp_ratio": 1.4680182614715196, "grad/layer_20/attn": 0.007913840934634209, "grad/layer_20/mlp": 0.00959191843867302, "grad/layer_20/attn_mlp_ratio": 0.8250529758699388, "grad/layer_24/attn": 0.0191982239484787, "grad/layer_24/mlp": 0.015013137832283974, "grad/layer_24/attn_mlp_ratio": 1.2787615776975707, "grad/layer_27/attn": 0.012504295445978642, "grad/layer_27/mlp": 0.014384936541318893, "grad/layer_27/attn_mlp_ratio": 0.8692631575492414} {"step": 5700, "timestamp": 1778200691.908973, "eos/sharpness": 22.35751152038574, "eos/L0_probe": 2.319868564605713, "eos/L_plus": 2.4442853927612305, "eos/L_minus": 2.4190268516540527, "eos/grad_norm": 0.17803144454956055, "eos/embed_grad_frac": 0.16462093591690063, "eos/time_s": 0.6113801002502441} {"step": 5700, "timestamp": 1778200691.9299383, "train/loss": 2.514160466194153, "train/z_loss": 0.0018867048551328481, "train/perplexity": 12.356230949857228, "train/grad_norm": 0.177734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1911813.310363416, "perf/iters_per_sec": 0.9116236259286004, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0969439268112182, "data/tokens_consumed": 11955863552, "data/tokens_consumed_B": 11.955863552, "train/loss_slope": -5.9899384561258926e-05} {"step": 5700, "timestamp": 1778200693.2906392, "geo/rankme_last": 423.19476318359375, "geo/layer_0/stable_rank_q_proj": 19.86823844909668, "geo/layer_0/stable_rank_k_proj": 13.462615013122559, "geo/layer_0/stable_rank_o_proj": 58.05196762084961, "geo/layer_0/stable_rank_gate_proj": 173.1961669921875, "geo/layer_0/stable_rank_down_proj": 47.153358459472656, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.034966371953487396, "geo/layer_0/attn_entropy_mean": 6.558533668518066, "geo/layer_0/attn_entropy_std": 0.2182314097881317, "geo/layer_7/stable_rank_q_proj": 45.04136276245117, "geo/layer_7/stable_rank_k_proj": 42.95013427734375, "geo/layer_7/stable_rank_o_proj": 104.52325439453125, "geo/layer_7/stable_rank_gate_proj": 154.620361328125, "geo/layer_7/stable_rank_down_proj": 199.59396362304688, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5821228623390198, "geo/layer_7/attn_entropy_mean": 4.80672550201416, "geo/layer_7/attn_entropy_std": 1.035657525062561, "geo/layer_14/stable_rank_q_proj": 75.02705383300781, "geo/layer_14/stable_rank_k_proj": 59.82405090332031, "geo/layer_14/stable_rank_o_proj": 61.64952087402344, "geo/layer_14/stable_rank_gate_proj": 155.94944763183594, "geo/layer_14/stable_rank_down_proj": 151.00198364257812, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4087251126766205, "geo/layer_14/attn_entropy_mean": 6.0724992752075195, "geo/layer_14/attn_entropy_std": 0.5200333595275879, "geo/layer_21/stable_rank_q_proj": 51.05611801147461, "geo/layer_21/stable_rank_k_proj": 32.60822296142578, "geo/layer_21/stable_rank_o_proj": 98.92850494384766, "geo/layer_21/stable_rank_gate_proj": 131.74395751953125, "geo/layer_21/stable_rank_down_proj": 101.91690826416016, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.17147473990917206, "geo/layer_21/attn_entropy_mean": 5.849279880523682, "geo/layer_21/attn_entropy_std": 0.3106687068939209, "geo/layer_27/stable_rank_q_proj": 44.34889602661133, "geo/layer_27/stable_rank_k_proj": 37.17239761352539, "geo/layer_27/stable_rank_o_proj": 105.61749267578125, "geo/layer_27/stable_rank_gate_proj": 90.67485809326172, "geo/layer_27/stable_rank_down_proj": 158.66798400878906, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.06499908864498138, "geo/layer_27/attn_entropy_mean": 4.6523871421813965, "geo/layer_27/attn_entropy_std": 0.4673408269882202, "attnres/final_alpha/block_0": 0.24265272915363312, "attnres/block_norm/0": 1.1682724952697754, "attnres/final_alpha/block_1": 0.014887799508869648, "attnres/block_norm/1": 8593.94921875, "attnres/final_alpha/block_2": 0.03101998381316662, "attnres/block_norm/2": 6153.77392578125, "attnres/final_alpha/block_3": 0.026319850236177444, "attnres/block_norm/3": 6011.587890625, "attnres/final_alpha/block_4": 0.04180053249001503, "attnres/block_norm/4": 3196.61279296875, "attnres/final_alpha/block_5": 0.4162226915359497, "attnres/block_norm/5": 3200.26220703125, "attnres/final_alpha/block_6": 0.22709640860557556, "attnres/block_norm/6": 4857.9140625, "geo/tier1_time_s": 1.356153964996338, "geo/step": 5700.0, "geo/rankme_slope": 0.021456593789859695} {"step": 5710, "timestamp": 1778200703.646743, "train/loss": 2.5202139616012573, "train/z_loss": 0.0018883734359405936, "train/perplexity": 12.43125619052924, "train/grad_norm": 0.2119140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1790506.5606650223, "perf/iters_per_sec": 0.8537800601315605, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1712618350982666, "data/tokens_consumed": 11976835072, "data/tokens_consumed_B": 11.976835072, "train/loss_slope": -5.606992959332409e-05} {"step": 5720, "timestamp": 1778200714.0087316, "train/loss": 2.4937393426895142, "train/z_loss": 0.0018971810583025216, "train/perplexity": 12.106461794841124, "train/grad_norm": 0.146484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025061.510870996, "perf/iters_per_sec": 0.9656245760302524, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035599160194397, "data/tokens_consumed": 11997806592, "data/tokens_consumed_B": 11.997806592, "train/loss_slope": -5.592033956775505e-05} {"step": 5730, "timestamp": 1778200724.3683271, "train/loss": 2.4793909072875975, "train/z_loss": 0.001900485239457339, "train/perplexity": 11.933993295333982, "train/grad_norm": 0.287109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025747.5927247596, "perf/iters_per_sec": 0.9659517253516958, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352484226226806, "data/tokens_consumed": 12018778112, "data/tokens_consumed_B": 12.018778112, "train/loss_slope": -5.80722018401258e-05} {"step": 5740, "timestamp": 1778200734.7301433, "train/loss": 2.515174961090088, "train/z_loss": 0.001890936226118356, "train/perplexity": 12.368772643755415, "train/grad_norm": 0.1943359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025126.1770134033, "perf/iters_per_sec": 0.9656554112498299, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0355660915374756, "data/tokens_consumed": 12039749632, "data/tokens_consumed_B": 12.039749632, "train/loss_slope": -5.376665333960467e-05} {"step": 5750, "timestamp": 1778200745.0870316, "grad/layer_0/attn": 0.003799855476245284, "grad/layer_0/mlp": 0.004268485587090254, "grad/layer_0/attn_mlp_ratio": 0.8902116007411458, "grad/layer_4/attn": 0.001873954781331122, "grad/layer_4/mlp": 0.0038084560073912144, "grad/layer_4/attn_mlp_ratio": 0.49205103813439444, "grad/layer_8/attn": 0.009840276092290878, "grad/layer_8/mlp": 0.005330757237970829, "grad/layer_8/attn_mlp_ratio": 1.8459433563405483, "grad/layer_12/attn": 0.006094323471188545, "grad/layer_12/mlp": 0.006109985988587141, "grad/layer_12/attn_mlp_ratio": 0.9974365543273738, "grad/layer_16/attn": 0.010131724178791046, "grad/layer_16/mlp": 0.005437059793621302, "grad/layer_16/attn_mlp_ratio": 1.86345642259293, "grad/layer_20/attn": 0.010581430047750473, "grad/layer_20/mlp": 0.008662492968142033, "grad/layer_20/attn_mlp_ratio": 1.2215224837138048, "grad/layer_24/attn": 0.025190148502588272, "grad/layer_24/mlp": 0.012130643241107464, "grad/layer_24/attn_mlp_ratio": 2.076571521744909, "grad/layer_27/attn": 0.007324270438402891, "grad/layer_27/mlp": 0.011631971225142479, "grad/layer_27/attn_mlp_ratio": 0.6296671676426417} {"step": 5750, "timestamp": 1778200745.1029038, "train/loss": 2.469983673095703, "train/z_loss": 0.0018967588432133198, "train/perplexity": 11.822253829263794, "train/grad_norm": 0.2353515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022939.2403442124, "perf/iters_per_sec": 0.9646125985833227, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0366856098175048, "data/tokens_consumed": 12060721152, "data/tokens_consumed_B": 12.060721152, "train/loss_slope": -5.7211945173513706e-05} {"step": 5760, "timestamp": 1778200755.4660356, "train/loss": 2.482843351364136, "train/z_loss": 0.0018961999914608896, "train/perplexity": 11.97526594455725, "train/grad_norm": 0.19140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024864.041280639, "perf/iters_per_sec": 0.965530415191955, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0357001543045044, "data/tokens_consumed": 12081692672, "data/tokens_consumed_B": 12.081692672, "train/loss_slope": -5.730676939993199e-05} {"step": 5770, "timestamp": 1778200765.8226304, "train/loss": 2.4703020334243773, "train/z_loss": 0.0019096892443485557, "train/perplexity": 11.826018165054341, "train/grad_norm": 0.2392578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025912.6649728655, "perf/iters_per_sec": 0.9660304379333808, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351640701293945, "data/tokens_consumed": 12102664192, "data/tokens_consumed_B": 12.102664192, "train/loss_slope": -5.773224754802758e-05} {"step": 5775, "timestamp": 1778200771.603448, "eos/sharpness": 37.0246410369873, "eos/L0_probe": 2.314370632171631, "eos/L_plus": 2.579040288925171, "eos/L_minus": 2.419947385787964, "eos/grad_norm": 0.18307442963123322, "eos/embed_grad_frac": 0.13750013709068298, "eos/time_s": 0.6129429340362549} {"step": 5775, "timestamp": 1778200772.9833918, "geo/rankme_last": 424.8314514160156, "geo/layer_0/stable_rank_q_proj": 19.510276794433594, "geo/layer_0/stable_rank_k_proj": 13.23707389831543, "geo/layer_0/stable_rank_o_proj": 57.95915603637695, "geo/layer_0/stable_rank_gate_proj": 174.7062530517578, "geo/layer_0/stable_rank_down_proj": 47.15902328491211, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0327177532017231, "geo/layer_0/attn_entropy_mean": 6.552969932556152, "geo/layer_0/attn_entropy_std": 0.2186005711555481, "geo/layer_7/stable_rank_q_proj": 44.96385955810547, "geo/layer_7/stable_rank_k_proj": 42.93289566040039, "geo/layer_7/stable_rank_o_proj": 104.47505187988281, "geo/layer_7/stable_rank_gate_proj": 153.22906494140625, "geo/layer_7/stable_rank_down_proj": 200.009033203125, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5809600949287415, "geo/layer_7/attn_entropy_mean": 4.8741655349731445, "geo/layer_7/attn_entropy_std": 1.0653995275497437, "geo/layer_14/stable_rank_q_proj": 75.50715637207031, "geo/layer_14/stable_rank_k_proj": 60.59552764892578, "geo/layer_14/stable_rank_o_proj": 60.58441162109375, "geo/layer_14/stable_rank_gate_proj": 153.9710235595703, "geo/layer_14/stable_rank_down_proj": 150.11668395996094, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3981415033340454, "geo/layer_14/attn_entropy_mean": 6.076457977294922, "geo/layer_14/attn_entropy_std": 0.5140131115913391, "geo/layer_21/stable_rank_q_proj": 50.86625289916992, "geo/layer_21/stable_rank_k_proj": 32.425472259521484, "geo/layer_21/stable_rank_o_proj": 98.31843566894531, "geo/layer_21/stable_rank_gate_proj": 130.94647216796875, "geo/layer_21/stable_rank_down_proj": 100.4080810546875, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.16944096982479095, "geo/layer_21/attn_entropy_mean": 5.864501953125, "geo/layer_21/attn_entropy_std": 0.3164580166339874, "geo/layer_27/stable_rank_q_proj": 44.300994873046875, "geo/layer_27/stable_rank_k_proj": 36.764461517333984, "geo/layer_27/stable_rank_o_proj": 105.57119750976562, "geo/layer_27/stable_rank_gate_proj": 90.63745880126953, "geo/layer_27/stable_rank_down_proj": 159.20863342285156, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.06751561909914017, "geo/layer_27/attn_entropy_mean": 4.6486616134643555, "geo/layer_27/attn_entropy_std": 0.48301956057548523, "attnres/final_alpha/block_0": 0.24162206053733826, "attnres/block_norm/0": 1.1748971939086914, "attnres/final_alpha/block_1": 0.014724655076861382, "attnres/block_norm/1": 8756.40625, "attnres/final_alpha/block_2": 0.030756603926420212, "attnres/block_norm/2": 6325.69921875, "attnres/final_alpha/block_3": 0.026295926421880722, "attnres/block_norm/3": 6115.787109375, "attnres/final_alpha/block_4": 0.04182820022106171, "attnres/block_norm/4": 3253.30322265625, "attnres/final_alpha/block_5": 0.4199283719062805, "attnres/block_norm/5": 3216.31201171875, "attnres/final_alpha/block_6": 0.22484418749809265, "attnres/block_norm/6": 4959.990234375, "geo/tier1_time_s": 1.3591139316558838, "geo/step": 5775.0, "geo/rankme_slope": 0.020926131741759203} {"step": 5780, "timestamp": 1778200778.1728976, "train/loss": 2.4485735416412355, "train/z_loss": 0.0019068773020990194, "train/perplexity": 11.571828209465616, "train/grad_norm": 0.2734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1698949.656576295, "perf/iters_per_sec": 0.8101223261720156, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2343814849853516, "data/tokens_consumed": 12123635712, "data/tokens_consumed_B": 12.123635712, "train/loss_slope": -6.272541508816262e-05} {"step": 5790, "timestamp": 1778200788.532435, "train/loss": 2.539623475074768, "train/z_loss": 0.0018771714298054577, "train/perplexity": 12.674897657361761, "train/grad_norm": 0.181640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025378.680261459, "perf/iters_per_sec": 0.9657758141810698, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035436987876892, "data/tokens_consumed": 12144607232, "data/tokens_consumed_B": 12.144607232, "train/loss_slope": -6.158104745468294e-05} {"step": 5800, "timestamp": 1778200798.878013, "grad/layer_0/attn": 0.003185632172971964, "grad/layer_0/mlp": 0.00400714622810483, "grad/layer_0/attn_mlp_ratio": 0.794987732449142, "grad/layer_4/attn": 0.0019298333209007978, "grad/layer_4/mlp": 0.003693696577101946, "grad/layer_4/attn_mlp_ratio": 0.5224666478068617, "grad/layer_8/attn": 0.007665723096579313, "grad/layer_8/mlp": 0.00530658895149827, "grad/layer_8/attn_mlp_ratio": 1.4445669378552994, "grad/layer_12/attn": 0.0063460590317845345, "grad/layer_12/mlp": 0.005939273629337549, "grad/layer_12/attn_mlp_ratio": 1.0684907483616448, "grad/layer_16/attn": 0.0068735689856112, "grad/layer_16/mlp": 0.005246310494840145, "grad/layer_16/attn_mlp_ratio": 1.3101719506221188, "grad/layer_20/attn": 0.009049121290445328, "grad/layer_20/mlp": 0.009116013534367085, "grad/layer_20/attn_mlp_ratio": 0.9926621057619337, "grad/layer_24/attn": 0.015687786042690277, "grad/layer_24/mlp": 0.014691587537527084, "grad/layer_24/attn_mlp_ratio": 1.0678074031031595, "grad/layer_27/attn": 0.010669171810150146, "grad/layer_27/mlp": 0.015538185834884644, "grad/layer_27/attn_mlp_ratio": 0.6866420478465819} {"step": 5800, "timestamp": 1778200798.894053, "train/loss": 2.524203634262085, "train/z_loss": 0.0018848325475119055, "train/perplexity": 12.480951902384591, "train/grad_norm": 0.2197265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025130.5597276427, "perf/iters_per_sec": 0.965657501090833, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035563850402832, "data/tokens_consumed": 12165578752, "data/tokens_consumed_B": 12.165578752, "train/loss_slope": -5.896478623482622e-05} {"step": 5810, "timestamp": 1778200809.2599957, "train/loss": 2.4594374895095825, "train/z_loss": 0.001897788024507463, "train/perplexity": 11.698229312160917, "train/grad_norm": 0.193359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024663.9538101605, "perf/iters_per_sec": 0.9654350060511401, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0358025074005126, "data/tokens_consumed": 12186550272, "data/tokens_consumed_B": 12.186550272, "train/loss_slope": -6.07514693148316e-05} {"step": 5820, "timestamp": 1778200819.6391652, "train/loss": 2.5020824670791626, "train/z_loss": 0.0018849833519198, "train/perplexity": 12.207890037387688, "train/grad_norm": 0.26953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022322.3823698945, "perf/iters_per_sec": 0.964318457779834, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0370018243789674, "data/tokens_consumed": 12207521792, "data/tokens_consumed_B": 12.207521792, "train/loss_slope": -5.956237655912048e-05} {"step": 5830, "timestamp": 1778200830.0162108, "train/loss": 2.5062545776367187, "train/z_loss": 0.0018874966655857861, "train/perplexity": 12.258929100571041, "train/grad_norm": 0.203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022666.2750934095, "perf/iters_per_sec": 0.9644824386088416, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0368255138397218, "data/tokens_consumed": 12228493312, "data/tokens_consumed_B": 12.228493312, "train/loss_slope": -6.177227328998926e-05} {"step": 5840, "timestamp": 1778200840.4001756, "train/loss": 2.461740732192993, "train/z_loss": 0.0018977694329805673, "train/perplexity": 11.725204226194709, "train/grad_norm": 0.1416015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021224.4295083145, "perf/iters_per_sec": 0.9637949130574772, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0375651359558105, "data/tokens_consumed": 12249464832, "data/tokens_consumed_B": 12.249464832, "train/loss_slope": -6.402942469768832e-05} {"step": 5850, "timestamp": 1778200850.7456481, "grad/layer_0/attn": 0.003391990903764963, "grad/layer_0/mlp": 0.0043619065545499325, "grad/layer_0/attn_mlp_ratio": 0.7776394985955869, "grad/layer_4/attn": 0.0016427953960373998, "grad/layer_4/mlp": 0.003720602486282587, "grad/layer_4/attn_mlp_ratio": 0.44154014247428314, "grad/layer_8/attn": 0.007892854511737823, "grad/layer_8/mlp": 0.005262529943138361, "grad/layer_8/attn_mlp_ratio": 1.4998212736151608, "grad/layer_12/attn": 0.005183502100408077, "grad/layer_12/mlp": 0.005689835641533136, "grad/layer_12/attn_mlp_ratio": 0.9110108509057546, "grad/layer_16/attn": 0.005637279711663723, "grad/layer_16/mlp": 0.004932190291583538, "grad/layer_16/attn_mlp_ratio": 1.1429566306449512, "grad/layer_20/attn": 0.01068900153040886, "grad/layer_20/mlp": 0.008666545152664185, "grad/layer_20/attn_mlp_ratio": 1.2333636090024398, "grad/layer_24/attn": 0.021578950807452202, "grad/layer_24/mlp": 0.013714874163269997, "grad/layer_24/attn_mlp_ratio": 1.5733976406362764, "grad/layer_27/attn": 0.006339044310152531, "grad/layer_27/mlp": 0.01402056124061346, "grad/layer_27/attn_mlp_ratio": 0.4521248583528661} {"step": 5850, "timestamp": 1778200851.3579452, "eos/sharpness": 37.683343887329094, "eos/L0_probe": 2.3084940910339355, "eos/L_plus": 2.5732760429382324, "eos/L_minus": 2.4205455780029297, "eos/grad_norm": 0.22648543119430542, "eos/embed_grad_frac": 0.0810481458902359, "eos/time_s": 0.609550952911377} {"step": 5850, "timestamp": 1778200851.3786538, "train/loss": 2.4612802505493163, "train/z_loss": 0.0019074456533417106, "train/perplexity": 11.719806227815152, "train/grad_norm": 0.2265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1911472.305144942, "perf/iters_per_sec": 0.9114610219692907, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0971396207809447, "data/tokens_consumed": 12270436352, "data/tokens_consumed_B": 12.270436352, "train/loss_slope": -6.508875381041681e-05} {"step": 5850, "timestamp": 1778200852.7428305, "geo/rankme_last": 424.92608642578125, "geo/layer_0/stable_rank_q_proj": 19.22400665283203, "geo/layer_0/stable_rank_k_proj": 13.052571296691895, "geo/layer_0/stable_rank_o_proj": 58.08163070678711, "geo/layer_0/stable_rank_gate_proj": 174.5732421875, "geo/layer_0/stable_rank_down_proj": 47.293678283691406, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0291682630777359, "geo/layer_0/attn_entropy_mean": 6.544225215911865, "geo/layer_0/attn_entropy_std": 0.23113900423049927, "geo/layer_7/stable_rank_q_proj": 44.649349212646484, "geo/layer_7/stable_rank_k_proj": 43.23286437988281, "geo/layer_7/stable_rank_o_proj": 103.72673797607422, "geo/layer_7/stable_rank_gate_proj": 152.60818481445312, "geo/layer_7/stable_rank_down_proj": 199.10752868652344, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5850154161453247, "geo/layer_7/attn_entropy_mean": 4.843162536621094, "geo/layer_7/attn_entropy_std": 1.0217045545578003, "geo/layer_14/stable_rank_q_proj": 76.29444122314453, "geo/layer_14/stable_rank_k_proj": 60.85032653808594, "geo/layer_14/stable_rank_o_proj": 59.231109619140625, "geo/layer_14/stable_rank_gate_proj": 151.82818603515625, "geo/layer_14/stable_rank_down_proj": 149.7482147216797, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.40032318234443665, "geo/layer_14/attn_entropy_mean": 6.073952674865723, "geo/layer_14/attn_entropy_std": 0.5432485342025757, "geo/layer_21/stable_rank_q_proj": 50.83680725097656, "geo/layer_21/stable_rank_k_proj": 32.45863723754883, "geo/layer_21/stable_rank_o_proj": 97.88107299804688, "geo/layer_21/stable_rank_gate_proj": 130.7002410888672, "geo/layer_21/stable_rank_down_proj": 99.22551727294922, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.16717711091041565, "geo/layer_21/attn_entropy_mean": 5.857215881347656, "geo/layer_21/attn_entropy_std": 0.293128103017807, "geo/layer_27/stable_rank_q_proj": 44.254642486572266, "geo/layer_27/stable_rank_k_proj": 36.59286880493164, "geo/layer_27/stable_rank_o_proj": 106.29705047607422, "geo/layer_27/stable_rank_gate_proj": 91.10150909423828, "geo/layer_27/stable_rank_down_proj": 160.75022888183594, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.06718903034925461, "geo/layer_27/attn_entropy_mean": 4.636866569519043, "geo/layer_27/attn_entropy_std": 0.5073091387748718, "attnres/final_alpha/block_0": 0.24122363328933716, "attnres/block_norm/0": 1.1814322471618652, "attnres/final_alpha/block_1": 0.014379587955772877, "attnres/block_norm/1": 8965.5556640625, "attnres/final_alpha/block_2": 0.030125953257083893, "attnres/block_norm/2": 6472.0048828125, "attnres/final_alpha/block_3": 0.0258878655731678, "attnres/block_norm/3": 6304.529296875, "attnres/final_alpha/block_4": 0.041055381298065186, "attnres/block_norm/4": 3299.752197265625, "attnres/final_alpha/block_5": 0.4278984069824219, "attnres/block_norm/5": 3205.545166015625, "attnres/final_alpha/block_6": 0.2194291353225708, "attnres/block_norm/6": 5054.4248046875, "geo/tier1_time_s": 1.360269546508789, "geo/step": 5850.0, "geo/rankme_slope": 0.020431430404192926} {"step": 5860, "timestamp": 1778200863.0990517, "train/loss": 2.4841496467590334, "train/z_loss": 0.0018980039982125162, "train/perplexity": 11.990919401106469, "train/grad_norm": 0.203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1789916.2407435488, "perf/iters_per_sec": 0.8534985736577744, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.171648120880127, "data/tokens_consumed": 12291407872, "data/tokens_consumed_B": 12.291407872, "train/loss_slope": -6.335787321045494e-05} {"step": 5870, "timestamp": 1778200873.4539971, "train/loss": 2.4485064268112184, "train/z_loss": 0.0019082598621025682, "train/perplexity": 11.571051594243842, "train/grad_norm": 0.1669921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026487.6417718271, "perf/iters_per_sec": 0.9663046082362304, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348703622817994, "data/tokens_consumed": 12312379392, "data/tokens_consumed_B": 12.312379392, "train/loss_slope": -6.644682297647948e-05} {"step": 5880, "timestamp": 1778200883.8155725, "train/loss": 2.461438035964966, "train/z_loss": 0.0019033923046663404, "train/perplexity": 11.721655588209352, "train/grad_norm": 0.265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025201.8980628767, "perf/iters_per_sec": 0.965691517859877, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0355273723602294, "data/tokens_consumed": 12333350912, "data/tokens_consumed_B": 12.333350912, "train/loss_slope": -6.657172491197791e-05} {"step": 5890, "timestamp": 1778200894.1712067, "train/loss": 2.4821112394332885, "train/z_loss": 0.0018912648665718734, "train/perplexity": 11.966501917999823, "train/grad_norm": 0.19921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026259.6936722118, "perf/iters_per_sec": 0.9661959141121921, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349867820739747, "data/tokens_consumed": 12354322432, "data/tokens_consumed_B": 12.354322432, "train/loss_slope": -6.56087759340128e-05} {"step": 5900, "timestamp": 1778200904.5092738, "grad/layer_0/attn": 0.0033563270699232817, "grad/layer_0/mlp": 0.004043045919388533, "grad/layer_0/attn_mlp_ratio": 0.8301481244160783, "grad/layer_4/attn": 0.0021583728957921267, "grad/layer_4/mlp": 0.00386191182769835, "grad/layer_4/attn_mlp_ratio": 0.5588871357505268, "grad/layer_8/attn": 0.009228800423443317, "grad/layer_8/mlp": 0.005678280256688595, "grad/layer_8/attn_mlp_ratio": 1.6252808673971995, "grad/layer_12/attn": 0.004501303192228079, "grad/layer_12/mlp": 0.005874491296708584, "grad/layer_12/attn_mlp_ratio": 0.7662456012362385, "grad/layer_16/attn": 0.005043730605393648, "grad/layer_16/mlp": 0.0048522986471652985, "grad/layer_16/attn_mlp_ratio": 1.0394517873286726, "grad/layer_20/attn": 0.008350507356226444, "grad/layer_20/mlp": 0.007934244349598885, "grad/layer_20/attn_mlp_ratio": 1.0524640889591703, "grad/layer_24/attn": 0.009263833053410053, "grad/layer_24/mlp": 0.012235979549586773, "grad/layer_24/attn_mlp_ratio": 0.7570977820090529, "grad/layer_27/attn": 0.008198203518986702, "grad/layer_27/mlp": 0.012839912436902523, "grad/layer_27/attn_mlp_ratio": 0.6384937199085019} {"step": 5900, "timestamp": 1778200904.5248845, "train/loss": 2.539250898361206, "train/z_loss": 0.0018834349466487765, "train/perplexity": 12.670176165261465, "train/grad_norm": 0.1689453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026535.1237970397, "perf/iters_per_sec": 0.9663272494301985, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348461151123047, "data/tokens_consumed": 12375293952, "data/tokens_consumed_B": 12.375293952, "train/loss_slope": -6.089619482883351e-05} {"step": 5910, "timestamp": 1778200914.8806417, "train/loss": 2.4510351181030274, "train/z_loss": 0.00190333416685462, "train/perplexity": 11.60034823711923, "train/grad_norm": 0.25390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026322.1957378436, "perf/iters_per_sec": 0.9662257174195498, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349548578262329, "data/tokens_consumed": 12396265472, "data/tokens_consumed_B": 12.396265472, "train/loss_slope": -6.0174653925696866e-05} {"step": 5920, "timestamp": 1778200925.2332006, "train/loss": 2.500848889350891, "train/z_loss": 0.0018856323440559208, "train/perplexity": 12.192839940768874, "train/grad_norm": 0.2001953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026916.6014461606, "perf/iters_per_sec": 0.9665091521483233, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346513509750366, "data/tokens_consumed": 12417236992, "data/tokens_consumed_B": 12.417236992, "train/loss_slope": -5.8302489666119045e-05} {"step": 5925, "timestamp": 1778200931.0132437, "eos/sharpness": 15.581083297729489, "eos/L0_probe": 2.302415132522583, "eos/L_plus": 2.4187700748443604, "eos/L_minus": 2.3418710231781006, "eos/grad_norm": 0.18453355133533478, "eos/embed_grad_frac": 0.11934646219015121, "eos/time_s": 0.6062638759613037} {"step": 5925, "timestamp": 1778200932.388134, "geo/rankme_last": 426.1921081542969, "geo/layer_0/stable_rank_q_proj": 18.871936798095703, "geo/layer_0/stable_rank_k_proj": 12.917709350585938, "geo/layer_0/stable_rank_o_proj": 58.2296257019043, "geo/layer_0/stable_rank_gate_proj": 175.10292053222656, "geo/layer_0/stable_rank_down_proj": 47.24311447143555, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.034690774977207184, "geo/layer_0/attn_entropy_mean": 6.538046360015869, "geo/layer_0/attn_entropy_std": 0.2309790849685669, "geo/layer_7/stable_rank_q_proj": 44.4727668762207, "geo/layer_7/stable_rank_k_proj": 43.35514450073242, "geo/layer_7/stable_rank_o_proj": 103.646484375, "geo/layer_7/stable_rank_gate_proj": 151.66427612304688, "geo/layer_7/stable_rank_down_proj": 199.34423828125, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5803454518318176, "geo/layer_7/attn_entropy_mean": 4.802443981170654, "geo/layer_7/attn_entropy_std": 1.0316650867462158, "geo/layer_14/stable_rank_q_proj": 76.9237289428711, "geo/layer_14/stable_rank_k_proj": 60.634254455566406, "geo/layer_14/stable_rank_o_proj": 58.164588928222656, "geo/layer_14/stable_rank_gate_proj": 150.72694396972656, "geo/layer_14/stable_rank_down_proj": 148.567626953125, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.40554502606391907, "geo/layer_14/attn_entropy_mean": 6.066612243652344, "geo/layer_14/attn_entropy_std": 0.5251346826553345, "geo/layer_21/stable_rank_q_proj": 51.045936584472656, "geo/layer_21/stable_rank_k_proj": 32.386783599853516, "geo/layer_21/stable_rank_o_proj": 97.73363494873047, "geo/layer_21/stable_rank_gate_proj": 130.2049102783203, "geo/layer_21/stable_rank_down_proj": 98.15509796142578, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1757880598306656, "geo/layer_21/attn_entropy_mean": 5.869108200073242, "geo/layer_21/attn_entropy_std": 0.3121468722820282, "geo/layer_27/stable_rank_q_proj": 44.03431701660156, "geo/layer_27/stable_rank_k_proj": 36.47434997558594, "geo/layer_27/stable_rank_o_proj": 106.48204040527344, "geo/layer_27/stable_rank_gate_proj": 91.54953002929688, "geo/layer_27/stable_rank_down_proj": 161.9378204345703, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0680026113986969, "geo/layer_27/attn_entropy_mean": 4.634464263916016, "geo/layer_27/attn_entropy_std": 0.5015119910240173, "attnres/final_alpha/block_0": 0.2434452623128891, "attnres/block_norm/0": 1.1875321865081787, "attnres/final_alpha/block_1": 0.014383779838681221, "attnres/block_norm/1": 9182.61328125, "attnres/final_alpha/block_2": 0.029979191720485687, "attnres/block_norm/2": 6585.05810546875, "attnres/final_alpha/block_3": 0.02588840015232563, "attnres/block_norm/3": 6466.220703125, "attnres/final_alpha/block_4": 0.041078925132751465, "attnres/block_norm/4": 3359.7529296875, "attnres/final_alpha/block_5": 0.4209420084953308, "attnres/block_norm/5": 3259.82470703125, "attnres/final_alpha/block_6": 0.22428244352340698, "attnres/block_norm/6": 5148.220703125, "geo/tier1_time_s": 1.3548078536987305, "geo/step": 5925.0, "geo/rankme_slope": 0.020038151979541816} {"step": 5930, "timestamp": 1778200937.5685184, "train/loss": 2.4156523942947388, "train/z_loss": 0.00191405100049451, "train/perplexity": 11.197072881696947, "train/grad_norm": 0.2197265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1701176.700958153, "perf/iters_per_sec": 0.8111842636862531, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.232765531539917, "data/tokens_consumed": 12438208512, "data/tokens_consumed_B": 12.438208512, "train/loss_slope": -6.230283350047502e-05} {"step": 5940, "timestamp": 1778200947.9305096, "train/loss": 2.500381660461426, "train/z_loss": 0.0018923204974271358, "train/perplexity": 12.187144424362437, "train/grad_norm": 0.27734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024844.1379604999, "perf/iters_per_sec": 0.9655209245493411, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035710334777832, "data/tokens_consumed": 12459180032, "data/tokens_consumed_B": 12.459180032, "train/loss_slope": -6.302626504219943e-05} {"step": 5950, "timestamp": 1778200958.2751718, "grad/layer_0/attn": 0.0029076903592795134, "grad/layer_0/mlp": 0.0038830058183521032, "grad/layer_0/attn_mlp_ratio": 0.7488246014606897, "grad/layer_4/attn": 0.002050272189080715, "grad/layer_4/mlp": 0.0036451811902225018, "grad/layer_4/attn_mlp_ratio": 0.5624609658181271, "grad/layer_8/attn": 0.009361770004034042, "grad/layer_8/mlp": 0.005065112840384245, "grad/layer_8/attn_mlp_ratio": 1.8482845524316875, "grad/layer_12/attn": 0.005430538207292557, "grad/layer_12/mlp": 0.00593456020578742, "grad/layer_12/attn_mlp_ratio": 0.9150700182449343, "grad/layer_16/attn": 0.005534553434699774, "grad/layer_16/mlp": 0.004956676624715328, "grad/layer_16/attn_mlp_ratio": 1.1165855152713504, "grad/layer_20/attn": 0.008840693160891533, "grad/layer_20/mlp": 0.00873394962400198, "grad/layer_20/attn_mlp_ratio": 1.0122216683473926, "grad/layer_24/attn": 0.018148282542824745, "grad/layer_24/mlp": 0.015855450183153152, "grad/layer_24/attn_mlp_ratio": 1.1446084607327607, "grad/layer_27/attn": 0.01043624896556139, "grad/layer_27/mlp": 0.01686003990471363, "grad/layer_27/attn_mlp_ratio": 0.6189931318456946} {"step": 5950, "timestamp": 1778200958.2909548, "train/loss": 2.451439619064331, "train/z_loss": 0.0019090036978013814, "train/perplexity": 11.605041538291026, "train/grad_norm": 0.2275390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025193.7848224428, "perf/iters_per_sec": 0.9656876491653646, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0355315208435059, "data/tokens_consumed": 12480151552, "data/tokens_consumed_B": 12.480151552, "train/loss_slope": -6.03778625085409e-05} {"step": 5960, "timestamp": 1778200968.6509478, "train/loss": 2.5400543212890625, "train/z_loss": 0.0018754506250843405, "train/perplexity": 12.680359765613842, "train/grad_norm": 0.162109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025899.0401283149, "perf/iters_per_sec": 0.9660239411012244, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351710319519043, "data/tokens_consumed": 12501123072, "data/tokens_consumed_B": 12.501123072, "train/loss_slope": -5.747765687861816e-05} {"step": 5970, "timestamp": 1778200978.9958987, "train/loss": 2.5067281484603883, "train/z_loss": 0.0018866869271732868, "train/perplexity": 12.264735946590406, "train/grad_norm": 0.1416015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028045.7645977922, "perf/iters_per_sec": 0.967047579096695, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340752840042113, "data/tokens_consumed": 12522094592, "data/tokens_consumed_B": 12.522094592, "train/loss_slope": -5.555180315852631e-05} {"step": 5980, "timestamp": 1778200989.3635528, "train/loss": 2.518374967575073, "train/z_loss": 0.0018875403911806643, "train/perplexity": 12.408416192403827, "train/grad_norm": 0.232421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024800.9766011816, "perf/iters_per_sec": 0.9655003436094196, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0357324123382567, "data/tokens_consumed": 12543066112, "data/tokens_consumed_B": 12.543066112, "train/loss_slope": -5.3795324629432564e-05} {"step": 5990, "timestamp": 1778200999.730422, "train/loss": 2.47834107875824, "train/z_loss": 0.0018949717516079545, "train/perplexity": 11.921471222867929, "train/grad_norm": 0.14453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024491.9092777597, "perf/iters_per_sec": 0.9653529688252257, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035890531539917, "data/tokens_consumed": 12564037632, "data/tokens_consumed_B": 12.564037632, "train/loss_slope": -5.4464239863851504e-05} {"step": 6000, "timestamp": 1778201010.0800822, "grad/layer_0/attn": 0.0028144274838268757, "grad/layer_0/mlp": 0.0037413868121802807, "grad/layer_0/attn_mlp_ratio": 0.7522417621829915, "grad/layer_4/attn": 0.001925680204294622, "grad/layer_4/mlp": 0.0036345410626381636, "grad/layer_4/attn_mlp_ratio": 0.5298275953206841, "grad/layer_8/attn": 0.006862752605229616, "grad/layer_8/mlp": 0.0051229423843324184, "grad/layer_8/attn_mlp_ratio": 1.3396114881668277, "grad/layer_12/attn": 0.006281021051108837, "grad/layer_12/mlp": 0.005653094500303268, "grad/layer_12/attn_mlp_ratio": 1.111076586401346, "grad/layer_16/attn": 0.0055614071898162365, "grad/layer_16/mlp": 0.004863259382545948, "grad/layer_16/attn_mlp_ratio": 1.1435555124656445, "grad/layer_20/attn": 0.014068832620978355, "grad/layer_20/mlp": 0.00877869501709938, "grad/layer_20/attn_mlp_ratio": 1.6026109157811737, "grad/layer_24/attn": 0.006803757045418024, "grad/layer_24/mlp": 0.010187610983848572, "grad/layer_24/attn_mlp_ratio": 0.667846170159037, "grad/layer_27/attn": 0.005367111414670944, "grad/layer_27/mlp": 0.010028030723333359, "grad/layer_27/attn_mlp_ratio": 0.5352109012452052} {"step": 6000, "timestamp": 1778201010.7033076, "eos/sharpness": 18.36750507354736, "eos/L0_probe": 2.2967171669006348, "eos/L_plus": 2.4234790802001953, "eos/L_minus": 2.353630304336548, "eos/grad_norm": 0.1506965160369873, "eos/embed_grad_frac": 0.1594107747077942, "eos/time_s": 0.620323657989502} {"step": 6000, "timestamp": 1778201010.723914, "train/loss": 2.478044629096985, "train/z_loss": 0.001891468372195959, "train/perplexity": 11.917937630554249, "train/grad_norm": 0.150390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1908872.8210456949, "perf/iters_per_sec": 0.9102214913586115, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0986336946487427, "data/tokens_consumed": 12585009152, "data/tokens_consumed_B": 12.585009152, "train/loss_slope": -5.27222619341402e-05} {"step": 6000, "timestamp": 1778201012.0863452, "geo/rankme_last": 425.7606506347656, "geo/layer_0/stable_rank_q_proj": 18.630741119384766, "geo/layer_0/stable_rank_k_proj": 12.781400680541992, "geo/layer_0/stable_rank_o_proj": 58.13859558105469, "geo/layer_0/stable_rank_gate_proj": 175.7635955810547, "geo/layer_0/stable_rank_down_proj": 47.050209045410156, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.035378437489271164, "geo/layer_0/attn_entropy_mean": 6.524503231048584, "geo/layer_0/attn_entropy_std": 0.23436173796653748, "geo/layer_7/stable_rank_q_proj": 44.57304000854492, "geo/layer_7/stable_rank_k_proj": 43.562503814697266, "geo/layer_7/stable_rank_o_proj": 103.3447494506836, "geo/layer_7/stable_rank_gate_proj": 149.61940002441406, "geo/layer_7/stable_rank_down_proj": 198.15411376953125, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5881390571594238, "geo/layer_7/attn_entropy_mean": 4.854208946228027, "geo/layer_7/attn_entropy_std": 1.0608078241348267, "geo/layer_14/stable_rank_q_proj": 77.51081848144531, "geo/layer_14/stable_rank_k_proj": 60.19074249267578, "geo/layer_14/stable_rank_o_proj": 57.16960525512695, "geo/layer_14/stable_rank_gate_proj": 149.2086944580078, "geo/layer_14/stable_rank_down_proj": 148.4043731689453, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3971768915653229, "geo/layer_14/attn_entropy_mean": 6.019007205963135, "geo/layer_14/attn_entropy_std": 0.5244266986846924, "geo/layer_21/stable_rank_q_proj": 51.22064971923828, "geo/layer_21/stable_rank_k_proj": 32.39242935180664, "geo/layer_21/stable_rank_o_proj": 97.34396362304688, "geo/layer_21/stable_rank_gate_proj": 129.03700256347656, "geo/layer_21/stable_rank_down_proj": 97.24092864990234, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.16959287226200104, "geo/layer_21/attn_entropy_mean": 5.869864463806152, "geo/layer_21/attn_entropy_std": 0.3215181231498718, "geo/layer_27/stable_rank_q_proj": 44.10905456542969, "geo/layer_27/stable_rank_k_proj": 36.493900299072266, "geo/layer_27/stable_rank_o_proj": 107.02324676513672, "geo/layer_27/stable_rank_gate_proj": 91.35863494873047, "geo/layer_27/stable_rank_down_proj": 163.08099365234375, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.06420484185218811, "geo/layer_27/attn_entropy_mean": 4.630996227264404, "geo/layer_27/attn_entropy_std": 0.4734264314174652, "attnres/final_alpha/block_0": 0.24435260891914368, "attnres/block_norm/0": 1.1937952041625977, "attnres/final_alpha/block_1": 0.014301442541182041, "attnres/block_norm/1": 9341.80859375, "attnres/final_alpha/block_2": 0.029958147555589676, "attnres/block_norm/2": 6739.18359375, "attnres/final_alpha/block_3": 0.02620314620435238, "attnres/block_norm/3": 6657.494140625, "attnres/final_alpha/block_4": 0.040901269763708115, "attnres/block_norm/4": 3414.439697265625, "attnres/final_alpha/block_5": 0.42463597655296326, "attnres/block_norm/5": 3272.623046875, "attnres/final_alpha/block_6": 0.21964740753173828, "attnres/block_norm/6": 5273.892578125, "geo/tier1_time_s": 1.3582854270935059, "geo/step": 6000.0, "geo/rankme_slope": 0.01955406578647084} {"step": 6000, "timestamp": 1778201019.0958736, "geo/ww_alpha_mean": 8.422442270400799, "geo/ww_alpha_std": 5.027350369969395, "geo/ww_alpha_min": 1.7867747988561147, "geo/ww_alpha_max": 40.67185678152024, "geo/ww_alpha_healthy_frac": 0.14213197969543148, "geo/ww_alpha_by_type/q_proj": 4.796335300962568, "geo/ww_alpha_by_type/k_proj": 5.219367185725946, "geo/ww_alpha_by_type/v_proj": 7.483711665675051, "geo/ww_alpha_by_type/o_proj": 7.662067456448781, "geo/ww_alpha_by_type/gate_proj": 11.67902620404928, "geo/ww_alpha_by_type/up_proj": 12.293285804151443, "geo/ww_alpha_by_type/down_proj": 10.049176523757867, "geo/twonn_id/layer_0": 0.7749471068382263, "geo/twonn_id/layer_7": 2.5532116889953613, "geo/twonn_id/layer_14": 3.3286826610565186, "geo/twonn_id/layer_21": 7.113701343536377, "geo/twonn_id/layer_27": 5.162723541259766, "geo/tier2_time_s": 7.002684593200684} {"step": 6000, "timestamp": 1778201019.8108296, "eoc/jacobian_sigma/layer_0/attn": 505.4422607421875, "eoc/jacobian_sigma/layer_0/mlp": 1375.10400390625, "eoc/jacobian_sigma/layer_0": 1375.10400390625, "eoc/jacobian_sigma/layer_7/attn": 1.1494898796081543, "eoc/jacobian_sigma/layer_7/mlp": 1.6796578168869019, "eoc/jacobian_sigma/layer_7": 1.6796578168869019, "eoc/jacobian_sigma/layer_14/attn": 1.1670867204666138, "eoc/jacobian_sigma/layer_14/mlp": 4.671422004699707, "eoc/jacobian_sigma/layer_14": 4.671422004699707, "eoc/jacobian_sigma/layer_21/attn": 1.0737329721450806, "eoc/jacobian_sigma/layer_21/mlp": 2.555936813354492, "eoc/jacobian_sigma/layer_21": 2.555936813354492, "eoc/jacobian_sigma/layer_27/attn": 1.7129722833633423, "eoc/jacobian_sigma/layer_27/mlp": 8.227668762207031, "eoc/jacobian_sigma/layer_27": 8.227668762207031, "eoc/layer0_sigma": 1375.10400390625, "eoc/sigma_max": 8.227668762207031, "eoc/sigma_min": 1.6796578168869019, "eoc/sigma_mean": 4.283671349287033, "eoc/time_s": 0.7058382034301758} {"step": 6010, "timestamp": 1778201030.32973, "train/loss": 2.4309306859970095, "train/z_loss": 0.0019085042062215508, "train/perplexity": 11.369458553419426, "train/grad_norm": 0.232421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1069862.6576766395, "perf/iters_per_sec": 0.5101502693541715, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.9602067470550537, "data/tokens_consumed": 12605980672, "data/tokens_consumed_B": 12.605980672, "train/loss_slope": -5.71254512335923e-05} {"step": 6020, "timestamp": 1778201040.6874409, "train/loss": 2.4574507236480714, "train/z_loss": 0.0019000968779437244, "train/perplexity": 11.675010742092006, "train/grad_norm": 0.134765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025913.5048665402, "perf/iters_per_sec": 0.9660308384258939, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351636409759521, "data/tokens_consumed": 12626952192, "data/tokens_consumed_B": 12.626952192, "train/loss_slope": -6.0276844592341916e-05} {"step": 6030, "timestamp": 1778201051.041128, "train/loss": 2.479180836677551, "train/z_loss": 0.0018948352662846446, "train/perplexity": 11.931486577385247, "train/grad_norm": 0.1943359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026567.9936801975, "perf/iters_per_sec": 0.966342923011874, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034829330444336, "data/tokens_consumed": 12647923712, "data/tokens_consumed_B": 12.647923712, "train/loss_slope": -6.0693424813138656e-05} {"step": 6040, "timestamp": 1778201061.4032388, "train/loss": 2.4526710748672484, "train/z_loss": 0.0019033398595638574, "train/perplexity": 11.619341437075992, "train/grad_norm": 0.1826171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025429.7481290994, "perf/iters_per_sec": 0.965800165237951, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354108810424805, "data/tokens_consumed": 12668895232, "data/tokens_consumed_B": 12.668895232, "train/loss_slope": -6.289017976600059e-05} {"step": 6050, "timestamp": 1778201071.7481527, "grad/layer_0/attn": 0.003065055003389716, "grad/layer_0/mlp": 0.0037296288646757603, "grad/layer_0/attn_mlp_ratio": 0.8218123122754599, "grad/layer_4/attn": 0.0017411864828318357, "grad/layer_4/mlp": 0.0035465159453451633, "grad/layer_4/attn_mlp_ratio": 0.49095688855465913, "grad/layer_8/attn": 0.006271849852055311, "grad/layer_8/mlp": 0.005013397894799709, "grad/layer_8/attn_mlp_ratio": 1.2510177445638602, "grad/layer_12/attn": 0.0052789063192903996, "grad/layer_12/mlp": 0.006225822493433952, "grad/layer_12/attn_mlp_ratio": 0.8479050342452393, "grad/layer_16/attn": 0.007389552891254425, "grad/layer_16/mlp": 0.004847606178373098, "grad/layer_16/attn_mlp_ratio": 1.524371507690684, "grad/layer_20/attn": 0.006635115947574377, "grad/layer_20/mlp": 0.007848668843507767, "grad/layer_20/attn_mlp_ratio": 0.8453810442677402, "grad/layer_24/attn": 0.019758453592658043, "grad/layer_24/mlp": 0.011441653594374657, "grad/layer_24/attn_mlp_ratio": 1.7268879237598653, "grad/layer_27/attn": 0.0134333036839962, "grad/layer_27/mlp": 0.010384481400251389, "grad/layer_27/attn_mlp_ratio": 1.2935940695422306} {"step": 6050, "timestamp": 1778201071.764159, "train/loss": 2.471389889717102, "train/z_loss": 0.0019019422587007284, "train/perplexity": 11.838890173511345, "train/grad_norm": 0.1513671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025229.1291841688, "perf/iters_per_sec": 0.9657045026703686, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03551344871521, "data/tokens_consumed": 12689866752, "data/tokens_consumed_B": 12.689866752, "train/loss_slope": -6.591455825556158e-05} {"step": 6060, "timestamp": 1778201082.1359255, "train/loss": 2.405686068534851, "train/z_loss": 0.0019166972720995545, "train/perplexity": 11.086033452468293, "train/grad_norm": 0.34765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023304.658317119, "perf/iters_per_sec": 0.9647868434510799, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0364983797073364, "data/tokens_consumed": 12710838272, "data/tokens_consumed_B": 12.710838272, "train/loss_slope": -6.727174636733235e-05} {"step": 6070, "timestamp": 1778201092.5113738, "train/loss": 2.4811293125152587, "train/z_loss": 0.0018877128255553543, "train/perplexity": 11.95475745469786, "train/grad_norm": 0.2119140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025815.9418761875, "perf/iters_per_sec": 0.9659843167668283, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352134943008422, "data/tokens_consumed": 12731809792, "data/tokens_consumed_B": 12.731809792, "train/loss_slope": -6.83074419826779e-05} {"step": 6075, "timestamp": 1778201098.321444, "eos/sharpness": 4.98218536376953, "eos/L0_probe": 2.293978452682495, "eos/L_plus": 2.3237650394439697, "eos/L_minus": 2.314013719558716, "eos/grad_norm": 0.11990532279014587, "eos/embed_grad_frac": 0.29046592116355896, "eos/time_s": 0.63840651512146} {"step": 6075, "timestamp": 1778201099.6994758, "geo/rankme_last": 428.83343505859375, "geo/layer_0/stable_rank_q_proj": 18.386072158813477, "geo/layer_0/stable_rank_k_proj": 12.6660737991333, "geo/layer_0/stable_rank_o_proj": 57.892494201660156, "geo/layer_0/stable_rank_gate_proj": 175.0330810546875, "geo/layer_0/stable_rank_down_proj": 47.04632568359375, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.03647460788488388, "geo/layer_0/attn_entropy_mean": 6.525459289550781, "geo/layer_0/attn_entropy_std": 0.23123480379581451, "geo/layer_7/stable_rank_q_proj": 44.50223159790039, "geo/layer_7/stable_rank_k_proj": 43.85354232788086, "geo/layer_7/stable_rank_o_proj": 104.53907775878906, "geo/layer_7/stable_rank_gate_proj": 149.26255798339844, "geo/layer_7/stable_rank_down_proj": 196.3819580078125, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5880265235900879, "geo/layer_7/attn_entropy_mean": 4.807862281799316, "geo/layer_7/attn_entropy_std": 1.016744613647461, "geo/layer_14/stable_rank_q_proj": 77.85301208496094, "geo/layer_14/stable_rank_k_proj": 60.65781021118164, "geo/layer_14/stable_rank_o_proj": 56.45226287841797, "geo/layer_14/stable_rank_gate_proj": 149.86550903320312, "geo/layer_14/stable_rank_down_proj": 148.68069458007812, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.40147116780281067, "geo/layer_14/attn_entropy_mean": 5.997435569763184, "geo/layer_14/attn_entropy_std": 0.5480337738990784, "geo/layer_21/stable_rank_q_proj": 51.003360748291016, "geo/layer_21/stable_rank_k_proj": 32.432586669921875, "geo/layer_21/stable_rank_o_proj": 96.61538696289062, "geo/layer_21/stable_rank_gate_proj": 128.730712890625, "geo/layer_21/stable_rank_down_proj": 96.31452178955078, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.17104974389076233, "geo/layer_21/attn_entropy_mean": 5.839961528778076, "geo/layer_21/attn_entropy_std": 0.31217724084854126, "geo/layer_27/stable_rank_q_proj": 43.90296173095703, "geo/layer_27/stable_rank_k_proj": 36.461849212646484, "geo/layer_27/stable_rank_o_proj": 107.47830963134766, "geo/layer_27/stable_rank_gate_proj": 91.66456604003906, "geo/layer_27/stable_rank_down_proj": 163.9608612060547, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.06342486292123795, "geo/layer_27/attn_entropy_mean": 4.6693830490112305, "geo/layer_27/attn_entropy_std": 0.476493775844574, "attnres/final_alpha/block_0": 0.2443855106830597, "attnres/block_norm/0": 1.199756383895874, "attnres/final_alpha/block_1": 0.01424599252641201, "attnres/block_norm/1": 9539.4775390625, "attnres/final_alpha/block_2": 0.02915586717426777, "attnres/block_norm/2": 6855.537109375, "attnres/final_alpha/block_3": 0.02578563056886196, "attnres/block_norm/3": 6751.43017578125, "attnres/final_alpha/block_4": 0.0411919504404068, "attnres/block_norm/4": 3422.693359375, "attnres/final_alpha/block_5": 0.42613863945007324, "attnres/block_norm/5": 3240.484375, "attnres/final_alpha/block_6": 0.21909643709659576, "attnres/block_norm/6": 5348.3935546875, "geo/tier1_time_s": 1.3581326007843018, "geo/step": 6075.0, "geo/rankme_slope": 0.019166833432591786} {"step": 6080, "timestamp": 1778201104.8927917, "train/loss": 2.482273983955383, "train/z_loss": 0.0018846600665710866, "train/perplexity": 11.968449559115275, "train/grad_norm": 0.1806640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1694690.3869835436, "perf/iters_per_sec": 0.8080913481633871, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2374838590621948, "data/tokens_consumed": 12752781312, "data/tokens_consumed_B": 12.752781312, "train/loss_slope": -6.821327498464875e-05} {"step": 6090, "timestamp": 1778201115.2520409, "train/loss": 2.469041633605957, "train/z_loss": 0.0019037044839933515, "train/perplexity": 11.811122043412984, "train/grad_norm": 0.2041015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025517.1058033882, "perf/iters_per_sec": 0.9658418206231061, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0353662252426148, "data/tokens_consumed": 12773752832, "data/tokens_consumed_B": 12.773752832, "train/loss_slope": -6.743522205404291e-05} {"step": 6100, "timestamp": 1778201125.6038632, "grad/layer_0/attn": 0.0032084721606224775, "grad/layer_0/mlp": 0.004087639972567558, "grad/layer_0/attn_mlp_ratio": 0.784920419523911, "grad/layer_4/attn": 0.0021260264329612255, "grad/layer_4/mlp": 0.0036559980362653732, "grad/layer_4/attn_mlp_ratio": 0.5815173732919284, "grad/layer_8/attn": 0.007642372976988554, "grad/layer_8/mlp": 0.005096130538731813, "grad/layer_8/attn_mlp_ratio": 1.4996422813231454, "grad/layer_12/attn": 0.005165054462850094, "grad/layer_12/mlp": 0.005392143502831459, "grad/layer_12/attn_mlp_ratio": 0.9578851832020723, "grad/layer_16/attn": 0.005026879254728556, "grad/layer_16/mlp": 0.004546928685158491, "grad/layer_16/attn_mlp_ratio": 1.1055548684062655, "grad/layer_20/attn": 0.009471862576901913, "grad/layer_20/mlp": 0.008739222772419453, "grad/layer_20/attn_mlp_ratio": 1.0838335073012766, "grad/layer_24/attn": 0.021150188520550728, "grad/layer_24/mlp": 0.017968641594052315, "grad/layer_24/attn_mlp_ratio": 1.1770610645295194, "grad/layer_27/attn": 0.0177314430475235, "grad/layer_27/mlp": 0.017408067360520363, "grad/layer_27/attn_mlp_ratio": 1.0185761910525977} {"step": 6100, "timestamp": 1778201125.6202695, "train/loss": 2.4219810962677, "train/z_loss": 0.001909815100952983, "train/perplexity": 11.26816052792873, "train/grad_norm": 0.2431640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023972.039063996, "perf/iters_per_sec": 0.9651050753898601, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0361566066741943, "data/tokens_consumed": 12794724352, "data/tokens_consumed_B": 12.794724352, "train/loss_slope": -6.989132903768897e-05} {"step": 6110, "timestamp": 1778201135.9985857, "train/loss": 2.4963908433914184, "train/z_loss": 0.0018855778616853058, "train/perplexity": 12.138604681399261, "train/grad_norm": 0.2255859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021731.7365517418, "perf/iters_per_sec": 0.9640368159064016, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373047828674316, "data/tokens_consumed": 12815695872, "data/tokens_consumed_B": 12.815695872, "train/loss_slope": -6.988653123754307e-05} {"step": 6120, "timestamp": 1778201146.3701813, "train/loss": 2.467762494087219, "train/z_loss": 0.001894309080671519, "train/perplexity": 11.796023628994563, "train/grad_norm": 0.25, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023189.0578665284, "perf/iters_per_sec": 0.9647317208607332, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0365576028823853, "data/tokens_consumed": 12836667392, "data/tokens_consumed_B": 12.836667392, "train/loss_slope": -6.656893437737794e-05} {"step": 6130, "timestamp": 1778201156.7296402, "train/loss": 2.4712204456329347, "train/z_loss": 0.0018936522072181105, "train/perplexity": 11.83688431355369, "train/grad_norm": 0.20703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025503.2530693556, "perf/iters_per_sec": 0.9658352151247767, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0353733062744142, "data/tokens_consumed": 12857638912, "data/tokens_consumed_B": 12.857638912, "train/loss_slope": -6.556791593961954e-05} {"step": 6140, "timestamp": 1778201167.0909102, "train/loss": 2.467585587501526, "train/z_loss": 0.0018900671042501927, "train/perplexity": 11.79393701930254, "train/grad_norm": 0.318359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025344.4499087026, "perf/iters_per_sec": 0.9657594918769372, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354544878005982, "data/tokens_consumed": 12878610432, "data/tokens_consumed_B": 12.878610432, "train/loss_slope": -6.505757565617103e-05} {"step": 6150, "timestamp": 1778201177.4426358, "grad/layer_0/attn": 0.002833711914718151, "grad/layer_0/mlp": 0.003524159546941519, "grad/layer_0/attn_mlp_ratio": 0.8040815963537349, "grad/layer_4/attn": 0.001966433832421899, "grad/layer_4/mlp": 0.0036818638909608126, "grad/layer_4/attn_mlp_ratio": 0.5340864945716645, "grad/layer_8/attn": 0.009875653311610222, "grad/layer_8/mlp": 0.004986247979104519, "grad/layer_8/attn_mlp_ratio": 1.9805780127537884, "grad/layer_12/attn": 0.004005627706646919, "grad/layer_12/mlp": 0.005531498696655035, "grad/layer_12/attn_mlp_ratio": 0.7241487079540126, "grad/layer_16/attn": 0.005206483416259289, "grad/layer_16/mlp": 0.0048097604885697365, "grad/layer_16/attn_mlp_ratio": 1.0824828638315915, "grad/layer_20/attn": 0.006310775876045227, "grad/layer_20/mlp": 0.007971332408487797, "grad/layer_20/attn_mlp_ratio": 0.7916839335613682, "grad/layer_24/attn": 0.012246337719261646, "grad/layer_24/mlp": 0.013742840848863125, "grad/layer_24/attn_mlp_ratio": 0.8911067052896893, "grad/layer_27/attn": 0.0070792995393276215, "grad/layer_27/mlp": 0.013187958858907223, "grad/layer_27/attn_mlp_ratio": 0.5368002403849021} {"step": 6150, "timestamp": 1778201178.0536683, "eos/sharpness": 17.973136901855465, "eos/L0_probe": 2.287976026535034, "eos/L_plus": 2.423032283782959, "eos/L_minus": 2.332651138305664, "eos/grad_norm": 0.18311119079589844, "eos/embed_grad_frac": 0.10960207879543304, "eos/time_s": 0.6081459522247314} {"step": 6150, "timestamp": 1778201178.07368, "train/loss": 2.4576356410980225, "train/z_loss": 0.0018937897169962526, "train/perplexity": 11.677169854929252, "train/grad_norm": 0.18359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1910528.4036939447, "perf/iters_per_sec": 0.9110109346837734, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0976816654205321, "data/tokens_consumed": 12899581952, "data/tokens_consumed_B": 12.899581952, "train/loss_slope": -6.698879043940778e-05} {"step": 6150, "timestamp": 1778201179.4350781, "geo/rankme_last": 429.08502197265625, "geo/layer_0/stable_rank_q_proj": 18.276227951049805, "geo/layer_0/stable_rank_k_proj": 12.570230484008789, "geo/layer_0/stable_rank_o_proj": 57.651180267333984, "geo/layer_0/stable_rank_gate_proj": 174.36473083496094, "geo/layer_0/stable_rank_down_proj": 47.016456604003906, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.03152179718017578, "geo/layer_0/attn_entropy_mean": 6.533206462860107, "geo/layer_0/attn_entropy_std": 0.23540222644805908, "geo/layer_7/stable_rank_q_proj": 44.320892333984375, "geo/layer_7/stable_rank_k_proj": 43.81467819213867, "geo/layer_7/stable_rank_o_proj": 104.15962982177734, "geo/layer_7/stable_rank_gate_proj": 148.74468994140625, "geo/layer_7/stable_rank_down_proj": 195.38375854492188, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5810450911521912, "geo/layer_7/attn_entropy_mean": 4.784346580505371, "geo/layer_7/attn_entropy_std": 1.011613368988037, "geo/layer_14/stable_rank_q_proj": 78.14277648925781, "geo/layer_14/stable_rank_k_proj": 59.92319107055664, "geo/layer_14/stable_rank_o_proj": 55.731136322021484, "geo/layer_14/stable_rank_gate_proj": 147.76361083984375, "geo/layer_14/stable_rank_down_proj": 148.18809509277344, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.40974852442741394, "geo/layer_14/attn_entropy_mean": 5.968402862548828, "geo/layer_14/attn_entropy_std": 0.6022130250930786, "geo/layer_21/stable_rank_q_proj": 51.066688537597656, "geo/layer_21/stable_rank_k_proj": 32.37945556640625, "geo/layer_21/stable_rank_o_proj": 96.29700469970703, "geo/layer_21/stable_rank_gate_proj": 127.30914306640625, "geo/layer_21/stable_rank_down_proj": 95.70773315429688, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1658090204000473, "geo/layer_21/attn_entropy_mean": 5.826190948486328, "geo/layer_21/attn_entropy_std": 0.3103092610836029, "geo/layer_27/stable_rank_q_proj": 43.79655456542969, "geo/layer_27/stable_rank_k_proj": 36.21529006958008, "geo/layer_27/stable_rank_o_proj": 107.7715835571289, "geo/layer_27/stable_rank_gate_proj": 91.99217224121094, "geo/layer_27/stable_rank_down_proj": 163.92153930664062, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0622531995177269, "geo/layer_27/attn_entropy_mean": 4.622359752655029, "geo/layer_27/attn_entropy_std": 0.5060513615608215, "attnres/final_alpha/block_0": 0.24491173028945923, "attnres/block_norm/0": 1.2056881189346313, "attnres/final_alpha/block_1": 0.013957403600215912, "attnres/block_norm/1": 9736.3525390625, "attnres/final_alpha/block_2": 0.028716543689370155, "attnres/block_norm/2": 7006.2001953125, "attnres/final_alpha/block_3": 0.025644510984420776, "attnres/block_norm/3": 6856.4326171875, "attnres/final_alpha/block_4": 0.04062210023403168, "attnres/block_norm/4": 3486.8857421875, "attnres/final_alpha/block_5": 0.42866024374961853, "attnres/block_norm/5": 3278.32373046875, "attnres/final_alpha/block_6": 0.21748745441436768, "attnres/block_norm/6": 5449.63330078125, "geo/tier1_time_s": 1.3578894138336182, "geo/step": 6150.0, "geo/rankme_slope": 0.01873216501444328} {"step": 6160, "timestamp": 1778201189.7980063, "train/loss": 2.474880743026733, "train/z_loss": 0.0018976835301145912, "train/perplexity": 11.88029022116065, "train/grad_norm": 0.2216796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1789275.971563018, "perf/iters_per_sec": 0.853193269521245, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1720673799514771, "data/tokens_consumed": 12920553472, "data/tokens_consumed_B": 12.920553472, "train/loss_slope": -6.76066799060812e-05} {"step": 6170, "timestamp": 1778201200.1664999, "train/loss": 2.513819360733032, "train/z_loss": 0.0018694195430725812, "train/perplexity": 12.352016890761506, "train/grad_norm": 0.1162109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024043.109501139, "perf/iters_per_sec": 0.9651389644151397, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0361202239990235, "data/tokens_consumed": 12941524992, "data/tokens_consumed_B": 12.941524992, "train/loss_slope": -6.391244772994918e-05} {"step": 6180, "timestamp": 1778201210.5185978, "train/loss": 2.4720298290252685, "train/z_loss": 0.0018889864557422697, "train/perplexity": 11.846468769360499, "train/grad_norm": 0.1943359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026805.9586100923, "perf/iters_per_sec": 0.966456393532797, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347078323364258, "data/tokens_consumed": 12962496512, "data/tokens_consumed_B": 12.962496512, "train/loss_slope": -6.0146855907400215e-05} {"step": 6190, "timestamp": 1778201220.8793173, "train/loss": 2.422166085243225, "train/z_loss": 0.0018957926542498172, "train/perplexity": 11.27024520621615, "train/grad_norm": 0.25390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025370.4723329975, "perf/iters_per_sec": 0.9657719003357875, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354411840438842, "data/tokens_consumed": 12983468032, "data/tokens_consumed_B": 12.983468032, "train/loss_slope": -6.429825445713677e-05} {"step": 6200, "timestamp": 1778201231.2169933, "grad/layer_0/attn": 0.0035670073702931404, "grad/layer_0/mlp": 0.003957955166697502, "grad/layer_0/attn_mlp_ratio": 0.9012247814689006, "grad/layer_4/attn": 0.0018187417881563306, "grad/layer_4/mlp": 0.003499226411804557, "grad/layer_4/attn_mlp_ratio": 0.5197553750867081, "grad/layer_8/attn": 0.00646187923848629, "grad/layer_8/mlp": 0.0049613420851528645, "grad/layer_8/attn_mlp_ratio": 1.3024457893317416, "grad/layer_12/attn": 0.005535711534321308, "grad/layer_12/mlp": 0.005766754969954491, "grad/layer_12/attn_mlp_ratio": 0.9599352611944718, "grad/layer_16/attn": 0.00551037210971117, "grad/layer_16/mlp": 0.004956124350428581, "grad/layer_16/attn_mlp_ratio": 1.1118308599443383, "grad/layer_20/attn": 0.007644303143024445, "grad/layer_20/mlp": 0.00905850250273943, "grad/layer_20/attn_mlp_ratio": 0.8438815418248807, "grad/layer_24/attn": 0.021475212648510933, "grad/layer_24/mlp": 0.013498254120349884, "grad/layer_24/attn_mlp_ratio": 1.5909622309628038, "grad/layer_27/attn": 0.012342339381575584, "grad/layer_27/mlp": 0.012457313947379589, "grad/layer_27/attn_mlp_ratio": 0.990770509167007} {"step": 6200, "timestamp": 1778201231.232706, "train/loss": 2.4706386804580687, "train/z_loss": 0.0018890010309405626, "train/perplexity": 11.830000029193759, "train/grad_norm": 0.2421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026561.643733309, "perf/iters_per_sec": 0.966339895121245, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348325729370118, "data/tokens_consumed": 13004439552, "data/tokens_consumed_B": 13.004439552, "train/loss_slope": -6.508772395374616e-05} {"step": 6210, "timestamp": 1778201241.5866144, "train/loss": 2.4884761333465577, "train/z_loss": 0.001876246917527169, "train/perplexity": 12.042910341020265, "train/grad_norm": 0.271484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026492.0770683652, "perf/iters_per_sec": 0.9663067231504274, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034868097305298, "data/tokens_consumed": 13025411072, "data/tokens_consumed_B": 13.025411072, "train/loss_slope": -6.223411497109802e-05} {"step": 6220, "timestamp": 1778201251.940622, "train/loss": 2.4606340646743776, "train/z_loss": 0.001886360626667738, "train/perplexity": 11.712235500885555, "train/grad_norm": 0.2119140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026426.1098713577, "perf/iters_per_sec": 0.9662752675396717, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034901785850525, "data/tokens_consumed": 13046382592, "data/tokens_consumed_B": 13.046382592, "train/loss_slope": -6.187757096632522e-05} {"step": 6225, "timestamp": 1778201257.731189, "eos/sharpness": 18.469047546386715, "eos/L0_probe": 2.2847204208374023, "eos/L_plus": 2.4264142513275146, "eos/L_minus": 2.3277170658111572, "eos/grad_norm": 0.18616683781147003, "eos/embed_grad_frac": 0.11856106668710709, "eos/time_s": 0.6185739040374756} {"step": 6225, "timestamp": 1778201259.118396, "geo/rankme_last": 429.8361511230469, "geo/layer_0/stable_rank_q_proj": 18.01414680480957, "geo/layer_0/stable_rank_k_proj": 12.4537353515625, "geo/layer_0/stable_rank_o_proj": 57.83405685424805, "geo/layer_0/stable_rank_gate_proj": 175.71835327148438, "geo/layer_0/stable_rank_down_proj": 47.0456428527832, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.028121765702962875, "geo/layer_0/attn_entropy_mean": 6.52606201171875, "geo/layer_0/attn_entropy_std": 0.23763485252857208, "geo/layer_7/stable_rank_q_proj": 43.95650100708008, "geo/layer_7/stable_rank_k_proj": 44.16048812866211, "geo/layer_7/stable_rank_o_proj": 103.8387222290039, "geo/layer_7/stable_rank_gate_proj": 148.49583435058594, "geo/layer_7/stable_rank_down_proj": 195.72572326660156, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5850031971931458, "geo/layer_7/attn_entropy_mean": 4.807857513427734, "geo/layer_7/attn_entropy_std": 1.043798804283142, "geo/layer_14/stable_rank_q_proj": 77.7433853149414, "geo/layer_14/stable_rank_k_proj": 59.915443420410156, "geo/layer_14/stable_rank_o_proj": 54.74643325805664, "geo/layer_14/stable_rank_gate_proj": 147.04515075683594, "geo/layer_14/stable_rank_down_proj": 148.55548095703125, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4151792824268341, "geo/layer_14/attn_entropy_mean": 5.972108840942383, "geo/layer_14/attn_entropy_std": 0.5889073610305786, "geo/layer_21/stable_rank_q_proj": 51.35954666137695, "geo/layer_21/stable_rank_k_proj": 32.470314025878906, "geo/layer_21/stable_rank_o_proj": 96.20315551757812, "geo/layer_21/stable_rank_gate_proj": 127.06934356689453, "geo/layer_21/stable_rank_down_proj": 95.14691162109375, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.16846884787082672, "geo/layer_21/attn_entropy_mean": 5.841891288757324, "geo/layer_21/attn_entropy_std": 0.30289581418037415, "geo/layer_27/stable_rank_q_proj": 43.712890625, "geo/layer_27/stable_rank_k_proj": 36.106502532958984, "geo/layer_27/stable_rank_o_proj": 107.96568298339844, "geo/layer_27/stable_rank_gate_proj": 92.04057312011719, "geo/layer_27/stable_rank_down_proj": 164.01109313964844, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.060904111713171005, "geo/layer_27/attn_entropy_mean": 4.61455774307251, "geo/layer_27/attn_entropy_std": 0.4942159056663513, "attnres/final_alpha/block_0": 0.24191085994243622, "attnres/block_norm/0": 1.2111749649047852, "attnres/final_alpha/block_1": 0.01358412578701973, "attnres/block_norm/1": 9884.6689453125, "attnres/final_alpha/block_2": 0.028239671140909195, "attnres/block_norm/2": 7113.32080078125, "attnres/final_alpha/block_3": 0.025591745972633362, "attnres/block_norm/3": 6998.43603515625, "attnres/final_alpha/block_4": 0.03999089449644089, "attnres/block_norm/4": 3547.146484375, "attnres/final_alpha/block_5": 0.433813214302063, "attnres/block_norm/5": 3272.347412109375, "attnres/final_alpha/block_6": 0.21686948835849762, "attnres/block_norm/6": 5535.8388671875, "geo/tier1_time_s": 1.357128620147705, "geo/step": 6225.0, "geo/rankme_slope": 0.018360041985544216} {"step": 6230, "timestamp": 1778201264.2959945, "train/loss": 2.4842576026916503, "train/z_loss": 0.0018822949728928506, "train/perplexity": 11.992213961869851, "train/grad_norm": 0.240234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1697982.364683761, "perf/iters_per_sec": 0.8096610854548268, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2350846767425536, "data/tokens_consumed": 13067354112, "data/tokens_consumed_B": 13.067354112, "train/loss_slope": -6.44926755592125e-05} {"step": 6240, "timestamp": 1778201274.6517956, "train/loss": 2.4465812921524046, "train/z_loss": 0.0019004054134711622, "train/perplexity": 11.548797190016453, "train/grad_norm": 0.130859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026074.9173605812, "perf/iters_per_sec": 0.9661078059008509, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350811719894408, "data/tokens_consumed": 13088325632, "data/tokens_consumed_B": 13.088325632, "train/loss_slope": -6.523478307990109e-05} {"step": 6250, "timestamp": 1778201285.0127404, "grad/layer_0/attn": 0.0030509736388921738, "grad/layer_0/mlp": 0.0034522386267781258, "grad/layer_0/attn_mlp_ratio": 0.8837667033935277, "grad/layer_4/attn": 0.0032351366244256496, "grad/layer_4/mlp": 0.003487106179818511, "grad/layer_4/attn_mlp_ratio": 0.9277424789570866, "grad/layer_8/attn": 0.006588484160602093, "grad/layer_8/mlp": 0.004992144647985697, "grad/layer_8/attn_mlp_ratio": 1.319770257715486, "grad/layer_12/attn": 0.004410071298480034, "grad/layer_12/mlp": 0.0055791293270885944, "grad/layer_12/attn_mlp_ratio": 0.7904586828668322, "grad/layer_16/attn": 0.005613598972558975, "grad/layer_16/mlp": 0.0047262925654649734, "grad/layer_16/attn_mlp_ratio": 1.1877383331712719, "grad/layer_20/attn": 0.010636171326041222, "grad/layer_20/mlp": 0.007079018745571375, "grad/layer_20/attn_mlp_ratio": 1.502492302686155, "grad/layer_24/attn": 0.018485117703676224, "grad/layer_24/mlp": 0.011044683866202831, "grad/layer_24/attn_mlp_ratio": 1.6736665132512087, "grad/layer_27/attn": 0.007446072064340115, "grad/layer_27/mlp": 0.010331911034882069, "grad/layer_27/attn_mlp_ratio": 0.7206868087745226} {"step": 6250, "timestamp": 1778201285.028475, "train/loss": 2.419311547279358, "train/z_loss": 0.0019088288536295295, "train/perplexity": 11.238119736913003, "train/grad_norm": 0.181640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022395.15045256, "perf/iters_per_sec": 0.9643531563055802, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036964511871338, "data/tokens_consumed": 13109297152, "data/tokens_consumed_B": 13.109297152, "train/loss_slope": -6.85879664845987e-05} {"step": 6260, "timestamp": 1778201295.38664, "train/loss": 2.463351678848267, "train/z_loss": 0.0018897807109169661, "train/perplexity": 11.744108127225163, "train/grad_norm": 0.1923828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025896.5204853606, "perf/iters_per_sec": 0.9660227396418384, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351723194122315, "data/tokens_consumed": 13130268672, "data/tokens_consumed_B": 13.130268672, "train/loss_slope": -6.73467281163008e-05} {"step": 6270, "timestamp": 1778201305.7425115, "train/loss": 2.4068740367889405, "train/z_loss": 0.0019143731566146015, "train/perplexity": 11.099211134057525, "train/grad_norm": 0.1884765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026362.4343579893, "perf/iters_per_sec": 0.9662449046888301, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349343061447143, "data/tokens_consumed": 13151240192, "data/tokens_consumed_B": 13.151240192, "train/loss_slope": -7.013380466693528e-05} {"step": 6280, "timestamp": 1778201316.1044936, "train/loss": 2.447545576095581, "train/z_loss": 0.001895213674288243, "train/perplexity": 11.559938880723173, "train/grad_norm": 0.18359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025039.4591221928, "perf/iters_per_sec": 0.9656140609370197, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0356104373931885, "data/tokens_consumed": 13172211712, "data/tokens_consumed_B": 13.172211712, "train/loss_slope": -6.885904179941317e-05} {"step": 6290, "timestamp": 1778201326.4724195, "train/loss": 2.472346067428589, "train/z_loss": 0.0018827636959031224, "train/perplexity": 11.850215670154842, "train/grad_norm": 0.1708984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024483.5221553165, "perf/iters_per_sec": 0.9653489695335944, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0358948230743408, "data/tokens_consumed": 13193183232, "data/tokens_consumed_B": 13.193183232, "train/loss_slope": -6.621754926995697e-05} {"step": 6300, "timestamp": 1778201336.827744, "grad/layer_0/attn": 0.0026674799155443907, "grad/layer_0/mlp": 0.003339584916830063, "grad/layer_0/attn_mlp_ratio": 0.7987459226524992, "grad/layer_4/attn": 0.001566744758747518, "grad/layer_4/mlp": 0.003333216765895486, "grad/layer_4/attn_mlp_ratio": 0.47003985092539247, "grad/layer_8/attn": 0.00989613402634859, "grad/layer_8/mlp": 0.004620285239070654, "grad/layer_8/attn_mlp_ratio": 2.141888065367657, "grad/layer_12/attn": 0.004423616919666529, "grad/layer_12/mlp": 0.00548156863078475, "grad/layer_12/attn_mlp_ratio": 0.8069983497284795, "grad/layer_16/attn": 0.0057174526154994965, "grad/layer_16/mlp": 0.004750018008053303, "grad/layer_16/attn_mlp_ratio": 1.2036696461863967, "grad/layer_20/attn": 0.0055441707372665405, "grad/layer_20/mlp": 0.007065554149448872, "grad/layer_20/attn_mlp_ratio": 0.7846759845767228, "grad/layer_24/attn": 0.013775386847555637, "grad/layer_24/mlp": 0.010971860028803349, "grad/layer_24/attn_mlp_ratio": 1.2555197282721884, "grad/layer_27/attn": 0.015093446709215641, "grad/layer_27/mlp": 0.010275520384311676, "grad/layer_27/attn_mlp_ratio": 1.4688741784184864} {"step": 6300, "timestamp": 1778201337.4360883, "eos/sharpness": 19.26920413970947, "eos/L0_probe": 2.2764339447021484, "eos/L_plus": 2.380558967590332, "eos/L_minus": 2.3650009632110596, "eos/grad_norm": 0.13332821428775787, "eos/embed_grad_frac": 0.20563888549804688, "eos/time_s": 0.6055598258972168} {"step": 6300, "timestamp": 1778201337.456346, "train/loss": 2.5053977489471437, "train/z_loss": 0.0018751759431324898, "train/perplexity": 12.248429797108763, "train/grad_norm": 0.1337890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1910639.912764875, "perf/iters_per_sec": 0.9110641063522696, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0976176023483277, "data/tokens_consumed": 13214154752, "data/tokens_consumed_B": 13.214154752, "train/loss_slope": -6.21452271884675e-05} {"step": 6300, "timestamp": 1778201338.8177352, "geo/rankme_last": 429.1752624511719, "geo/layer_0/stable_rank_q_proj": 17.807985305786133, "geo/layer_0/stable_rank_k_proj": 12.395716667175293, "geo/layer_0/stable_rank_o_proj": 57.71443557739258, "geo/layer_0/stable_rank_gate_proj": 174.74496459960938, "geo/layer_0/stable_rank_down_proj": 46.98195266723633, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.028091825544834137, "geo/layer_0/attn_entropy_mean": 6.522821426391602, "geo/layer_0/attn_entropy_std": 0.2337205410003662, "geo/layer_7/stable_rank_q_proj": 43.7100715637207, "geo/layer_7/stable_rank_k_proj": 44.17564010620117, "geo/layer_7/stable_rank_o_proj": 103.19633483886719, "geo/layer_7/stable_rank_gate_proj": 148.0948944091797, "geo/layer_7/stable_rank_down_proj": 194.23265075683594, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5917779207229614, "geo/layer_7/attn_entropy_mean": 4.7931227684021, "geo/layer_7/attn_entropy_std": 0.9886938333511353, "geo/layer_14/stable_rank_q_proj": 77.20169067382812, "geo/layer_14/stable_rank_k_proj": 58.7733039855957, "geo/layer_14/stable_rank_o_proj": 54.37863540649414, "geo/layer_14/stable_rank_gate_proj": 146.16844177246094, "geo/layer_14/stable_rank_down_proj": 147.89450073242188, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.41378429532051086, "geo/layer_14/attn_entropy_mean": 5.987097263336182, "geo/layer_14/attn_entropy_std": 0.5982412099838257, "geo/layer_21/stable_rank_q_proj": 51.423797607421875, "geo/layer_21/stable_rank_k_proj": 32.55713653564453, "geo/layer_21/stable_rank_o_proj": 96.37722778320312, "geo/layer_21/stable_rank_gate_proj": 127.00713348388672, "geo/layer_21/stable_rank_down_proj": 94.06742095947266, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.169502854347229, "geo/layer_21/attn_entropy_mean": 5.866430759429932, "geo/layer_21/attn_entropy_std": 0.30265626311302185, "geo/layer_27/stable_rank_q_proj": 43.55341339111328, "geo/layer_27/stable_rank_k_proj": 36.115230560302734, "geo/layer_27/stable_rank_o_proj": 108.1443099975586, "geo/layer_27/stable_rank_gate_proj": 92.37037658691406, "geo/layer_27/stable_rank_down_proj": 164.0661163330078, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.06334024667739868, "geo/layer_27/attn_entropy_mean": 4.628918647766113, "geo/layer_27/attn_entropy_std": 0.4709116518497467, "attnres/final_alpha/block_0": 0.24291874468326569, "attnres/block_norm/0": 1.2168242931365967, "attnres/final_alpha/block_1": 0.013536926358938217, "attnres/block_norm/1": 10091.13671875, "attnres/final_alpha/block_2": 0.02806287258863449, "attnres/block_norm/2": 7252.2470703125, "attnres/final_alpha/block_3": 0.025199342519044876, "attnres/block_norm/3": 7277.41162109375, "attnres/final_alpha/block_4": 0.03954527527093887, "attnres/block_norm/4": 3598.8056640625, "attnres/final_alpha/block_5": 0.44002610445022583, "attnres/block_norm/5": 3292.17724609375, "attnres/final_alpha/block_6": 0.21071073412895203, "attnres/block_norm/6": 5663.8994140625, "geo/tier1_time_s": 1.3574886322021484, "geo/step": 6300.0, "geo/rankme_slope": 0.017928597884466286} {"step": 6310, "timestamp": 1778201349.178268, "train/loss": 2.465666484832764, "train/z_loss": 0.001875871093943715, "train/perplexity": 11.771324947677025, "train/grad_norm": 0.26171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1789636.301002493, "perf/iters_per_sec": 0.8533650879871811, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1718313932418822, "data/tokens_consumed": 13235126272, "data/tokens_consumed_B": 13.235126272, "train/loss_slope": -6.316964359495187e-05} {"step": 6320, "timestamp": 1778201359.554597, "train/loss": 2.4197221755981446, "train/z_loss": 0.0019013323937542737, "train/perplexity": 11.242735374717837, "train/grad_norm": 0.154296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022249.9914609352, "perf/iters_per_sec": 0.9642839391045261, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0370389461517333, "data/tokens_consumed": 13256097792, "data/tokens_consumed_B": 13.256097792, "train/loss_slope": -6.371738882777289e-05} {"step": 6330, "timestamp": 1778201369.926475, "train/loss": 2.488281226158142, "train/z_loss": 0.00188420289196074, "train/perplexity": 12.040563319958425, "train/grad_norm": 0.1337890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024169.9402501085, "perf/iters_per_sec": 0.9651994420290511, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0360553026199342, "data/tokens_consumed": 13277069312, "data/tokens_consumed_B": 13.277069312, "train/loss_slope": -6.780362360977465e-05} {"step": 6340, "timestamp": 1778201380.2873085, "train/loss": 2.4757996082305906, "train/z_loss": 0.0018864817800931632, "train/perplexity": 11.891211623336396, "train/grad_norm": 0.16796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025187.5367390597, "perf/iters_per_sec": 0.96568466984704, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035534715652466, "data/tokens_consumed": 13298040832, "data/tokens_consumed_B": 13.298040832, "train/loss_slope": -6.486482780472571e-05} {"step": 6350, "timestamp": 1778201390.6389394, "grad/layer_0/attn": 0.002752270782366395, "grad/layer_0/mlp": 0.0034104727674275637, "grad/layer_0/attn_mlp_ratio": 0.8070056233704523, "grad/layer_4/attn": 0.0021484605967998505, "grad/layer_4/mlp": 0.0033221079502254725, "grad/layer_4/attn_mlp_ratio": 0.6467160502663464, "grad/layer_8/attn": 0.006934595759958029, "grad/layer_8/mlp": 0.004722146317362785, "grad/layer_8/attn_mlp_ratio": 1.468526205468836, "grad/layer_12/attn": 0.0052971383556723595, "grad/layer_12/mlp": 0.0056945257820189, "grad/layer_12/attn_mlp_ratio": 0.9302158714211247, "grad/layer_16/attn": 0.005782195366919041, "grad/layer_16/mlp": 0.00499170646071434, "grad/layer_16/attn_mlp_ratio": 1.1583604317661609, "grad/layer_20/attn": 0.009760713204741478, "grad/layer_20/mlp": 0.00792737677693367, "grad/layer_20/attn_mlp_ratio": 1.2312664524809305, "grad/layer_24/attn": 0.015796231105923653, "grad/layer_24/mlp": 0.013173909857869148, "grad/layer_24/attn_mlp_ratio": 1.1990541271680788, "grad/layer_27/attn": 0.008326786570250988, "grad/layer_27/mlp": 0.013308407738804817, "grad/layer_27/attn_mlp_ratio": 0.6256786439901281} {"step": 6350, "timestamp": 1778201390.655029, "train/loss": 2.392636513710022, "train/z_loss": 0.0019079707912169398, "train/perplexity": 10.94230548376854, "train/grad_norm": 0.1943359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024457.7088923203, "perf/iters_per_sec": 0.9653366608106233, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035908031463623, "data/tokens_consumed": 13319012352, "data/tokens_consumed_B": 13.319012352, "train/loss_slope": -6.977700706433866e-05} {"step": 6360, "timestamp": 1778201401.4121416, "train/loss": 2.456974816322327, "train/z_loss": 0.0018854449852369725, "train/perplexity": 11.669455840865634, "train/grad_norm": 0.1982421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1950986.3604978325, "perf/iters_per_sec": 0.930302791832844, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.074918842315674, "data/tokens_consumed": 13339983872, "data/tokens_consumed_B": 13.339983872, "train/loss_slope": -7.250409732879264e-05} {"step": 6370, "timestamp": 1778201411.7702372, "train/loss": 2.462335300445557, "train/z_loss": 0.0018868825864046812, "train/perplexity": 11.732177733289959, "train/grad_norm": 0.162109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025866.4252345848, "perf/iters_per_sec": 0.966008389107983, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351876974105836, "data/tokens_consumed": 13360955392, "data/tokens_consumed_B": 13.360955392, "train/loss_slope": -7.269672557751838e-05} {"step": 6375, "timestamp": 1778201417.5582657, "eos/sharpness": 51.92070007324218, "eos/L0_probe": 2.270059585571289, "eos/L_plus": 2.4097368717193604, "eos/L_minus": 2.6495893001556396, "eos/grad_norm": 0.20086562633514404, "eos/embed_grad_frac": 0.07402563095092773, "eos/time_s": 0.6133697032928467} {"step": 6375, "timestamp": 1778201418.939792, "geo/rankme_last": 430.3129577636719, "geo/layer_0/stable_rank_q_proj": 17.613636016845703, "geo/layer_0/stable_rank_k_proj": 12.313497543334961, "geo/layer_0/stable_rank_o_proj": 57.7575798034668, "geo/layer_0/stable_rank_gate_proj": 174.2682647705078, "geo/layer_0/stable_rank_down_proj": 46.9172248840332, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0289300549775362, "geo/layer_0/attn_entropy_mean": 6.517631530761719, "geo/layer_0/attn_entropy_std": 0.2391231805086136, "geo/layer_7/stable_rank_q_proj": 43.942665100097656, "geo/layer_7/stable_rank_k_proj": 44.20790481567383, "geo/layer_7/stable_rank_o_proj": 103.09224700927734, "geo/layer_7/stable_rank_gate_proj": 147.53738403320312, "geo/layer_7/stable_rank_down_proj": 192.5394744873047, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5826542377471924, "geo/layer_7/attn_entropy_mean": 4.785942077636719, "geo/layer_7/attn_entropy_std": 0.9980489015579224, "geo/layer_14/stable_rank_q_proj": 77.70671844482422, "geo/layer_14/stable_rank_k_proj": 58.33525466918945, "geo/layer_14/stable_rank_o_proj": 53.71549606323242, "geo/layer_14/stable_rank_gate_proj": 144.82225036621094, "geo/layer_14/stable_rank_down_proj": 147.45556640625, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.398258239030838, "geo/layer_14/attn_entropy_mean": 5.982126712799072, "geo/layer_14/attn_entropy_std": 0.5664991736412048, "geo/layer_21/stable_rank_q_proj": 51.251136779785156, "geo/layer_21/stable_rank_k_proj": 32.53060531616211, "geo/layer_21/stable_rank_o_proj": 96.11419677734375, "geo/layer_21/stable_rank_gate_proj": 126.5314712524414, "geo/layer_21/stable_rank_down_proj": 93.17457580566406, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.17362797260284424, "geo/layer_21/attn_entropy_mean": 5.832150459289551, "geo/layer_21/attn_entropy_std": 0.3108161389827728, "geo/layer_27/stable_rank_q_proj": 43.59209442138672, "geo/layer_27/stable_rank_k_proj": 35.85895538330078, "geo/layer_27/stable_rank_o_proj": 108.10850524902344, "geo/layer_27/stable_rank_gate_proj": 92.42716979980469, "geo/layer_27/stable_rank_down_proj": 164.7654266357422, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.06024036183953285, "geo/layer_27/attn_entropy_mean": 4.605003356933594, "geo/layer_27/attn_entropy_std": 0.5144144296646118, "attnres/final_alpha/block_0": 0.24512839317321777, "attnres/block_norm/0": 1.2225449085235596, "attnres/final_alpha/block_1": 0.013753044418990612, "attnres/block_norm/1": 10248.87890625, "attnres/final_alpha/block_2": 0.02821267396211624, "attnres/block_norm/2": 7404.50048828125, "attnres/final_alpha/block_3": 0.025327224284410477, "attnres/block_norm/3": 7283.4892578125, "attnres/final_alpha/block_4": 0.039849959313869476, "attnres/block_norm/4": 3644.40185546875, "attnres/final_alpha/block_5": 0.4301478862762451, "attnres/block_norm/5": 3328.43017578125, "attnres/final_alpha/block_6": 0.21758082509040833, "attnres/block_norm/6": 5726.10400390625, "geo/tier1_time_s": 1.3613989353179932, "geo/step": 6375.0, "geo/rankme_slope": 0.017498170635441675} {"step": 6380, "timestamp": 1778201424.130849, "train/loss": 2.499159049987793, "train/z_loss": 0.001870612835045904, "train/perplexity": 12.172253398762363, "train/grad_norm": 0.232421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1697360.4418138508, "perf/iters_per_sec": 0.8093645295209173, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2355372190475464, "data/tokens_consumed": 13381926912, "data/tokens_consumed_B": 13.381926912, "train/loss_slope": -6.938242912292481e-05} {"step": 6390, "timestamp": 1778201434.4923294, "train/loss": 2.442169213294983, "train/z_loss": 0.001892156177200377, "train/perplexity": 11.497955227954643, "train/grad_norm": 0.2236328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025118.6238695509, "perf/iters_per_sec": 0.9656518096301798, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035569953918457, "data/tokens_consumed": 13402898432, "data/tokens_consumed_B": 13.402898432, "train/loss_slope": -7.036293343384633e-05} {"step": 6400, "timestamp": 1778201444.8362632, "grad/layer_0/attn": 0.0040870485827326775, "grad/layer_0/mlp": 0.004256878048181534, "grad/layer_0/attn_mlp_ratio": 0.960104668365618, "grad/layer_4/attn": 0.002984802471473813, "grad/layer_4/mlp": 0.0034763282164931297, "grad/layer_4/attn_mlp_ratio": 0.8586077607551289, "grad/layer_8/attn": 0.012166734784841537, "grad/layer_8/mlp": 0.004937372636049986, "grad/layer_8/attn_mlp_ratio": 2.4642123321998195, "grad/layer_12/attn": 0.0058632055297493935, "grad/layer_12/mlp": 0.006118043791502714, "grad/layer_12/attn_mlp_ratio": 0.9583464312658393, "grad/layer_16/attn": 0.007647119462490082, "grad/layer_16/mlp": 0.005005183629691601, "grad/layer_16/attn_mlp_ratio": 1.5278399106762195, "grad/layer_20/attn": 0.007962199859321117, "grad/layer_20/mlp": 0.008275073952972889, "grad/layer_20/attn_mlp_ratio": 0.9621907681249852, "grad/layer_24/attn": 0.03470023348927498, "grad/layer_24/mlp": 0.013104693032801151, "grad/layer_24/attn_mlp_ratio": 2.6479241549288948, "grad/layer_27/attn": 0.009519321843981743, "grad/layer_27/mlp": 0.0119959507137537, "grad/layer_27/attn_mlp_ratio": 0.7935445878177133} {"step": 6400, "timestamp": 1778201444.8518696, "train/loss": 2.4845999240875245, "train/z_loss": 0.001880697498563677, "train/perplexity": 11.996319856020513, "train/grad_norm": 0.2734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025340.2528061178, "perf/iters_per_sec": 0.965757490542468, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354566335678101, "data/tokens_consumed": 13423869952, "data/tokens_consumed_B": 13.423869952, "train/loss_slope": -6.77328262248985e-05} {"step": 6410, "timestamp": 1778201455.2100096, "train/loss": 2.4008246183395388, "train/z_loss": 0.001902132120449096, "train/perplexity": 11.032270042930094, "train/grad_norm": 0.2060546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025698.748068312, "perf/iters_per_sec": 0.9659284344045219, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352733850479126, "data/tokens_consumed": 13444841472, "data/tokens_consumed_B": 13.444841472, "train/loss_slope": -7.175577046668747e-05} {"step": 6420, "timestamp": 1778201465.565953, "train/loss": 2.448254942893982, "train/z_loss": 0.0018903467571362853, "train/perplexity": 11.568142026732428, "train/grad_norm": 0.171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026053.2168732055, "perf/iters_per_sec": 0.966097458302119, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035092258453369, "data/tokens_consumed": 13465812992, "data/tokens_consumed_B": 13.465812992, "train/loss_slope": -7.056774593780178e-05} {"step": 6430, "timestamp": 1778201475.9298882, "train/loss": 2.394146513938904, "train/z_loss": 0.0019059349899180234, "train/perplexity": 10.958840848614019, "train/grad_norm": 0.2080078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024463.3467509768, "perf/iters_per_sec": 0.9653393491511234, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0359051465988158, "data/tokens_consumed": 13486784512, "data/tokens_consumed_B": 13.486784512, "train/loss_slope": -7.331679158478284e-05} {"step": 6440, "timestamp": 1778201486.2816632, "train/loss": 2.390199875831604, "train/z_loss": 0.0019053073250688613, "train/perplexity": 10.915675504733011, "train/grad_norm": 0.119140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026958.91886897, "perf/iters_per_sec": 0.9665293306679582, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03462975025177, "data/tokens_consumed": 13507756032, "data/tokens_consumed_B": 13.507756032, "train/loss_slope": -7.382928361557922e-05} {"step": 6450, "timestamp": 1778201496.6344686, "grad/layer_0/attn": 0.002847767900675535, "grad/layer_0/mlp": 0.003624380100518465, "grad/layer_0/attn_mlp_ratio": 0.7857254877035704, "grad/layer_4/attn": 0.0024908825289458036, "grad/layer_4/mlp": 0.0033801349345594645, "grad/layer_4/attn_mlp_ratio": 0.7369180531186803, "grad/layer_8/attn": 0.011738748289644718, "grad/layer_8/mlp": 0.004768961574882269, "grad/layer_8/attn_mlp_ratio": 2.4614893324623988, "grad/layer_12/attn": 0.005467789713293314, "grad/layer_12/mlp": 0.006180352997034788, "grad/layer_12/attn_mlp_ratio": 0.8847050690221328, "grad/layer_16/attn": 0.008380746468901634, "grad/layer_16/mlp": 0.004779673647135496, "grad/layer_16/attn_mlp_ratio": 1.7534139174089645, "grad/layer_20/attn": 0.008155993185937405, "grad/layer_20/mlp": 0.008314032107591629, "grad/layer_20/attn_mlp_ratio": 0.9809912906628006, "grad/layer_24/attn": 0.024632476270198822, "grad/layer_24/mlp": 0.02064461074769497, "grad/layer_24/attn_mlp_ratio": 1.193167381643772, "grad/layer_27/attn": 0.01840277388691902, "grad/layer_27/mlp": 0.019482959061861038, "grad/layer_27/attn_mlp_ratio": 0.9445574326790902} {"step": 6450, "timestamp": 1778201497.264573, "eos/sharpness": 26.53536796569824, "eos/L0_probe": 2.267547369003296, "eos/L_plus": 2.360320568084717, "eos/L_minus": 2.4401278495788574, "eos/grad_norm": 0.2588956654071808, "eos/embed_grad_frac": 0.08979484438896179, "eos/time_s": 0.6270034313201904} {"step": 6450, "timestamp": 1778201497.2845836, "train/loss": 2.473389506340027, "train/z_loss": 0.0018843241385184228, "train/perplexity": 11.86258709958698, "train/grad_norm": 0.259765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1907278.084537147, "perf/iters_per_sec": 0.9094610617337928, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.099552297592163, "data/tokens_consumed": 13528727552, "data/tokens_consumed_B": 13.528727552, "train/loss_slope": -7.512455299885605e-05} {"step": 6450, "timestamp": 1778201498.6430557, "geo/rankme_last": 430.0456848144531, "geo/layer_0/stable_rank_q_proj": 17.458328247070312, "geo/layer_0/stable_rank_k_proj": 12.27004623413086, "geo/layer_0/stable_rank_o_proj": 57.951072692871094, "geo/layer_0/stable_rank_gate_proj": 175.14151000976562, "geo/layer_0/stable_rank_down_proj": 47.029212951660156, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0266795065253973, "geo/layer_0/attn_entropy_mean": 6.515514850616455, "geo/layer_0/attn_entropy_std": 0.23553311824798584, "geo/layer_7/stable_rank_q_proj": 43.8486328125, "geo/layer_7/stable_rank_k_proj": 44.054405212402344, "geo/layer_7/stable_rank_o_proj": 103.10800170898438, "geo/layer_7/stable_rank_gate_proj": 146.0157470703125, "geo/layer_7/stable_rank_down_proj": 192.33409118652344, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5886307954788208, "geo/layer_7/attn_entropy_mean": 4.794297695159912, "geo/layer_7/attn_entropy_std": 1.0248271226882935, "geo/layer_14/stable_rank_q_proj": 77.36781311035156, "geo/layer_14/stable_rank_k_proj": 58.01154708862305, "geo/layer_14/stable_rank_o_proj": 53.08068084716797, "geo/layer_14/stable_rank_gate_proj": 143.2826385498047, "geo/layer_14/stable_rank_down_proj": 147.1410369873047, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39979347586631775, "geo/layer_14/attn_entropy_mean": 5.958785057067871, "geo/layer_14/attn_entropy_std": 0.5949745774269104, "geo/layer_21/stable_rank_q_proj": 51.46815872192383, "geo/layer_21/stable_rank_k_proj": 32.51020431518555, "geo/layer_21/stable_rank_o_proj": 95.67243194580078, "geo/layer_21/stable_rank_gate_proj": 125.54105377197266, "geo/layer_21/stable_rank_down_proj": 92.35710144042969, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1683899313211441, "geo/layer_21/attn_entropy_mean": 5.838316440582275, "geo/layer_21/attn_entropy_std": 0.3151053488254547, "geo/layer_27/stable_rank_q_proj": 43.4459228515625, "geo/layer_27/stable_rank_k_proj": 35.82307434082031, "geo/layer_27/stable_rank_o_proj": 108.22927856445312, "geo/layer_27/stable_rank_gate_proj": 92.67295837402344, "geo/layer_27/stable_rank_down_proj": 164.9565887451172, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.06550946086645126, "geo/layer_27/attn_entropy_mean": 4.6177825927734375, "geo/layer_27/attn_entropy_std": 0.5125752091407776, "attnres/final_alpha/block_0": 0.24394750595092773, "attnres/block_norm/0": 1.2281904220581055, "attnres/final_alpha/block_1": 0.013247855938971043, "attnres/block_norm/1": 10427.947265625, "attnres/final_alpha/block_2": 0.027959361672401428, "attnres/block_norm/2": 7555.326171875, "attnres/final_alpha/block_3": 0.02490163967013359, "attnres/block_norm/3": 7499.19140625, "attnres/final_alpha/block_4": 0.03917898237705231, "attnres/block_norm/4": 3700.93115234375, "attnres/final_alpha/block_5": 0.4378373920917511, "attnres/block_norm/5": 3336.11328125, "attnres/final_alpha/block_6": 0.21292725205421448, "attnres/block_norm/6": 5866.693359375, "geo/tier1_time_s": 1.3543932437896729, "geo/step": 6450.0, "geo/rankme_slope": 0.017041405781062426} {"step": 6460, "timestamp": 1778201509.0044422, "train/loss": 2.419888162612915, "train/z_loss": 0.0018991601187735795, "train/perplexity": 11.244601677687283, "train/grad_norm": 0.205078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1790003.2959253192, "perf/iters_per_sec": 0.8535400848032566, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1715911388397218, "data/tokens_consumed": 13549699072, "data/tokens_consumed_B": 13.549699072, "train/loss_slope": -7.520065095880311e-05} {"step": 6470, "timestamp": 1778201519.3646286, "train/loss": 2.431708312034607, "train/z_loss": 0.0018932115403003992, "train/perplexity": 11.37830317888378, "train/grad_norm": 0.173828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025289.1894512745, "perf/iters_per_sec": 0.9657331416374562, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354827404022218, "data/tokens_consumed": 13570670592, "data/tokens_consumed_B": 13.570670592, "train/loss_slope": -7.107890268148971e-05} {"step": 6480, "timestamp": 1778201529.7186892, "train/loss": 2.427246260643005, "train/z_loss": 0.0018967873649671674, "train/perplexity": 11.327645707519421, "train/grad_norm": 0.1630859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026627.4796739202, "perf/iters_per_sec": 0.9663712881440736, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347989559173585, "data/tokens_consumed": 13591642112, "data/tokens_consumed_B": 13.591642112, "train/loss_slope": -7.424334890306654e-05} {"step": 6490, "timestamp": 1778201540.0941505, "train/loss": 2.3836484432220457, "train/z_loss": 0.001905574311967939, "train/perplexity": 10.84439593886267, "train/grad_norm": 0.271484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022434.5822465967, "perf/iters_per_sec": 0.9643719588501914, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03694429397583, "data/tokens_consumed": 13612613632, "data/tokens_consumed_B": 13.612613632, "train/loss_slope": -7.476669386489732e-05} {"step": 6500, "timestamp": 1778201550.4631524, "grad/layer_0/attn": 0.003048391779884696, "grad/layer_0/mlp": 0.004054263699799776, "grad/layer_0/attn_mlp_ratio": 0.7518977378914624, "grad/layer_4/attn": 0.002530762692913413, "grad/layer_4/mlp": 0.0034869126975536346, "grad/layer_4/attn_mlp_ratio": 0.7257889255759301, "grad/layer_8/attn": 0.007156610023230314, "grad/layer_8/mlp": 0.004755201283842325, "grad/layer_8/attn_mlp_ratio": 1.5050067169705417, "grad/layer_12/attn": 0.00512332608923316, "grad/layer_12/mlp": 0.005967879667878151, "grad/layer_12/attn_mlp_ratio": 0.8584834628889872, "grad/layer_16/attn": 0.005393967032432556, "grad/layer_16/mlp": 0.005121816415339708, "grad/layer_16/attn_mlp_ratio": 1.0531355460075082, "grad/layer_20/attn": 0.0070251282304525375, "grad/layer_20/mlp": 0.008359924890100956, "grad/layer_20/attn_mlp_ratio": 0.8403338832311342, "grad/layer_24/attn": 0.014262273907661438, "grad/layer_24/mlp": 0.013585378415882587, "grad/layer_24/attn_mlp_ratio": 1.0498252876051628, "grad/layer_27/attn": 0.0072877174243330956, "grad/layer_27/mlp": 0.014555549249053001, "grad/layer_27/attn_mlp_ratio": 0.5006830899726393} {"step": 6500, "timestamp": 1778201550.4800394, "train/loss": 2.455102968215942, "train/z_loss": 0.0018938121036626398, "train/perplexity": 11.64763282310659, "train/grad_norm": 0.21875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020287.2783694682, "perf/iters_per_sec": 0.9633480445716229, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0380464315414428, "data/tokens_consumed": 13633585152, "data/tokens_consumed_B": 13.633585152, "train/loss_slope": -7.262468657048655e-05} {"step": 6500, "timestamp": 1778201557.538525, "geo/ww_alpha_mean": 8.66484005029532, "geo/ww_alpha_std": 4.898648484525028, "geo/ww_alpha_min": 1.9681853981018116, "geo/ww_alpha_max": 23.914596791539854, "geo/ww_alpha_healthy_frac": 0.14720812182741116, "geo/ww_alpha_by_type/q_proj": 4.768977952985482, "geo/ww_alpha_by_type/k_proj": 5.622192584746933, "geo/ww_alpha_by_type/v_proj": 7.09646949428367, "geo/ww_alpha_by_type/o_proj": 8.153900035043177, "geo/ww_alpha_by_type/gate_proj": 11.99583870322627, "geo/ww_alpha_by_type/up_proj": 13.561777217203941, "geo/ww_alpha_by_type/down_proj": 9.688415981244884, "geo/twonn_id/layer_0": 0.7503364086151123, "geo/twonn_id/layer_7": 2.7258481979370117, "geo/twonn_id/layer_14": 3.6762757301330566, "geo/twonn_id/layer_21": 7.302998065948486, "geo/twonn_id/layer_27": 5.817110061645508, "geo/tier2_time_s": 7.050932884216309} {"step": 6500, "timestamp": 1778201558.277577, "eoc/jacobian_sigma/layer_0/attn": 437.30462646484375, "eoc/jacobian_sigma/layer_0/mlp": 1508.2064208984375, "eoc/jacobian_sigma/layer_0": 1508.2064208984375, "eoc/jacobian_sigma/layer_7/attn": 1.1512764692306519, "eoc/jacobian_sigma/layer_7/mlp": 1.6799758672714233, "eoc/jacobian_sigma/layer_7": 1.6799758672714233, "eoc/jacobian_sigma/layer_14/attn": 1.193871259689331, "eoc/jacobian_sigma/layer_14/mlp": 5.420879364013672, "eoc/jacobian_sigma/layer_14": 5.420879364013672, "eoc/jacobian_sigma/layer_21/attn": 1.0749175548553467, "eoc/jacobian_sigma/layer_21/mlp": 2.6029579639434814, "eoc/jacobian_sigma/layer_21": 2.6029579639434814, "eoc/jacobian_sigma/layer_27/attn": 2.0407326221466064, "eoc/jacobian_sigma/layer_27/mlp": 7.754856586456299, "eoc/jacobian_sigma/layer_27": 7.754856586456299, "eoc/layer0_sigma": 1508.2064208984375, "eoc/sigma_max": 7.754856586456299, "eoc/sigma_min": 1.6799758672714233, "eoc/sigma_mean": 4.364667445421219, "eoc/time_s": 0.7332620620727539} {"step": 6510, "timestamp": 1778201568.6740944, "train/loss": 2.4228787660598754, "train/z_loss": 0.0018924877047538757, "train/perplexity": 11.278280156610979, "train/grad_norm": 0.11474609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1153053.614722981, "perf/iters_per_sec": 0.5498188089003472, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.8187809944152833, "data/tokens_consumed": 13654556672, "data/tokens_consumed_B": 13.654556672, "train/loss_slope": -7.756104781182005e-05} {"step": 6520, "timestamp": 1778201579.0606678, "train/loss": 2.4752875566482544, "train/z_loss": 0.0018828548723831772, "train/perplexity": 11.885124268261677, "train/grad_norm": 0.2412109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020174.9454397766, "perf/iters_per_sec": 0.9632944800566562, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0381041526794434, "data/tokens_consumed": 13675528192, "data/tokens_consumed_B": 13.675528192, "train/loss_slope": -7.743099666450102e-05} {"step": 6525, "timestamp": 1778201584.864351, "eos/sharpness": 20.77994346618652, "eos/L0_probe": 2.26560115814209, "eos/L_plus": 2.353409767150879, "eos/L_minus": 2.385591983795166, "eos/grad_norm": 0.22759374976158142, "eos/embed_grad_frac": 0.09071435034275055, "eos/time_s": 0.6205358505249023} {"step": 6525, "timestamp": 1778201586.2412677, "geo/rankme_last": 431.3890686035156, "geo/layer_0/stable_rank_q_proj": 17.271947860717773, "geo/layer_0/stable_rank_k_proj": 12.215204238891602, "geo/layer_0/stable_rank_o_proj": 57.57918930053711, "geo/layer_0/stable_rank_gate_proj": 174.29721069335938, "geo/layer_0/stable_rank_down_proj": 47.10711669921875, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.03575428947806358, "geo/layer_0/attn_entropy_mean": 6.500944137573242, "geo/layer_0/attn_entropy_std": 0.24223671853542328, "geo/layer_7/stable_rank_q_proj": 43.903717041015625, "geo/layer_7/stable_rank_k_proj": 44.32946014404297, "geo/layer_7/stable_rank_o_proj": 103.74771881103516, "geo/layer_7/stable_rank_gate_proj": 146.45175170898438, "geo/layer_7/stable_rank_down_proj": 191.45166015625, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5801520943641663, "geo/layer_7/attn_entropy_mean": 4.7665863037109375, "geo/layer_7/attn_entropy_std": 1.0416560173034668, "geo/layer_14/stable_rank_q_proj": 76.89132690429688, "geo/layer_14/stable_rank_k_proj": 57.46459197998047, "geo/layer_14/stable_rank_o_proj": 52.8061408996582, "geo/layer_14/stable_rank_gate_proj": 141.7296905517578, "geo/layer_14/stable_rank_down_proj": 147.32984924316406, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.40566304326057434, "geo/layer_14/attn_entropy_mean": 5.947625160217285, "geo/layer_14/attn_entropy_std": 0.5730879306793213, "geo/layer_21/stable_rank_q_proj": 51.18709945678711, "geo/layer_21/stable_rank_k_proj": 32.694305419921875, "geo/layer_21/stable_rank_o_proj": 95.31204986572266, "geo/layer_21/stable_rank_gate_proj": 125.10247802734375, "geo/layer_21/stable_rank_down_proj": 91.46419525146484, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.166166752576828, "geo/layer_21/attn_entropy_mean": 5.85343074798584, "geo/layer_21/attn_entropy_std": 0.2923721969127655, "geo/layer_27/stable_rank_q_proj": 43.424373626708984, "geo/layer_27/stable_rank_k_proj": 35.71046829223633, "geo/layer_27/stable_rank_o_proj": 108.87220001220703, "geo/layer_27/stable_rank_gate_proj": 92.98363494873047, "geo/layer_27/stable_rank_down_proj": 166.3553924560547, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.06799450516700745, "geo/layer_27/attn_entropy_mean": 4.607669353485107, "geo/layer_27/attn_entropy_std": 0.5092813372612, "attnres/final_alpha/block_0": 0.2458229660987854, "attnres/block_norm/0": 1.2334790229797363, "attnres/final_alpha/block_1": 0.01322589349001646, "attnres/block_norm/1": 10597.322265625, "attnres/final_alpha/block_2": 0.027587514370679855, "attnres/block_norm/2": 7673.3955078125, "attnres/final_alpha/block_3": 0.02486109547317028, "attnres/block_norm/3": 7630.1884765625, "attnres/final_alpha/block_4": 0.03916240110993385, "attnres/block_norm/4": 3735.369384765625, "attnres/final_alpha/block_5": 0.43765705823898315, "attnres/block_norm/5": 3348.08740234375, "attnres/final_alpha/block_6": 0.21168306469917297, "attnres/block_norm/6": 5929.5693359375, "geo/tier1_time_s": 1.3571038246154785, "geo/step": 6525.0, "geo/rankme_slope": 0.01654217923497524} {"step": 6530, "timestamp": 1778201591.4357245, "train/loss": 2.442844796180725, "train/z_loss": 0.001889217272400856, "train/perplexity": 11.505725674222392, "train/grad_norm": 0.203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1695795.1207747704, "perf/iters_per_sec": 0.8086181262849667, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2366776943206788, "data/tokens_consumed": 13696499712, "data/tokens_consumed_B": 13.696499712, "train/loss_slope": -7.645470903615781e-05} {"step": 6540, "timestamp": 1778201602.2412965, "train/loss": 2.4378244638442994, "train/z_loss": 0.0018951280042529106, "train/perplexity": 11.448107858872133, "train/grad_norm": 0.2294921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1941945.628836784, "perf/iters_per_sec": 0.9259918350395127, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0799231290817262, "data/tokens_consumed": 13717471232, "data/tokens_consumed_B": 13.717471232, "train/loss_slope": -7.470651854633246e-05} {"step": 6550, "timestamp": 1778201613.1387887, "grad/layer_0/attn": 0.003287364961579442, "grad/layer_0/mlp": 0.003651717910543084, "grad/layer_0/attn_mlp_ratio": 0.9002242101083113, "grad/layer_4/attn": 0.0020328043028712273, "grad/layer_4/mlp": 0.003283090889453888, "grad/layer_4/attn_mlp_ratio": 0.6191739154964345, "grad/layer_8/attn": 0.007042432203888893, "grad/layer_8/mlp": 0.004747807513922453, "grad/layer_8/attn_mlp_ratio": 1.4833019314509905, "grad/layer_12/attn": 0.004186215344816446, "grad/layer_12/mlp": 0.005284090060740709, "grad/layer_12/attn_mlp_ratio": 0.7922301129376706, "grad/layer_16/attn": 0.004644152708351612, "grad/layer_16/mlp": 0.00451801810413599, "grad/layer_16/attn_mlp_ratio": 1.0279181044689358, "grad/layer_20/attn": 0.005887299310415983, "grad/layer_20/mlp": 0.006921569351106882, "grad/layer_20/attn_mlp_ratio": 0.8505728869735606, "grad/layer_24/attn": 0.012030479498207569, "grad/layer_24/mlp": 0.011201524175703526, "grad/layer_24/attn_mlp_ratio": 1.0740037875293522, "grad/layer_27/attn": 0.01100768893957138, "grad/layer_27/mlp": 0.010036463849246502, "grad/layer_27/attn_mlp_ratio": 1.0967696387130246} {"step": 6550, "timestamp": 1778201613.1562688, "train/loss": 2.4210000038146973, "train/z_loss": 0.0018980108899995685, "train/perplexity": 11.257110841943634, "train/grad_norm": 0.1513671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1922444.2737923348, "perf/iters_per_sec": 0.9166928643190073, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0908779144287108, "data/tokens_consumed": 13738442752, "data/tokens_consumed_B": 13.738442752, "train/loss_slope": -7.486091671567975e-05} {"step": 6560, "timestamp": 1778201623.5565689, "train/loss": 2.4090763092041017, "train/z_loss": 0.0019019204541109503, "train/perplexity": 11.123681555945529, "train/grad_norm": 0.2412109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2017725.3374528505, "perf/iters_per_sec": 0.9621264159454587, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0393644571304321, "data/tokens_consumed": 13759414272, "data/tokens_consumed_B": 13.759414272, "train/loss_slope": -7.475457806648736e-05} {"step": 6570, "timestamp": 1778201633.941531, "train/loss": 2.405517840385437, "train/z_loss": 0.0018969459692016245, "train/perplexity": 11.084168626438755, "train/grad_norm": 0.2470703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020649.3714466288, "perf/iters_per_sec": 0.9635207040055412, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0378604173660277, "data/tokens_consumed": 13780385792, "data/tokens_consumed_B": 13.780385792, "train/loss_slope": -7.623771858615916e-05} {"step": 6580, "timestamp": 1778201644.330023, "train/loss": 2.420853066444397, "train/z_loss": 0.0018868385348469019, "train/perplexity": 11.255456873197222, "train/grad_norm": 0.279296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019909.12668421, "perf/iters_per_sec": 0.9631677277966547, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0382407665252686, "data/tokens_consumed": 13801357312, "data/tokens_consumed_B": 13.801357312, "train/loss_slope": -7.954180444499376e-05} {"step": 6590, "timestamp": 1778201654.7095683, "train/loss": 2.4138614892959596, "train/z_loss": 0.0018914018874056637, "train/perplexity": 11.177037933600678, "train/grad_norm": 0.1484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021633.228451608, "perf/iters_per_sec": 0.9639898435838737, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037355327606201, "data/tokens_consumed": 13822328832, "data/tokens_consumed_B": 13.822328832, "train/loss_slope": -8.251166038959549e-05} {"step": 6600, "timestamp": 1778201665.505407, "grad/layer_0/attn": 0.003726281924173236, "grad/layer_0/mlp": 0.0039659827016294, "grad/layer_0/attn_mlp_ratio": 0.9395607874654212, "grad/layer_4/attn": 0.0016969057032838464, "grad/layer_4/mlp": 0.003374457126483321, "grad/layer_4/attn_mlp_ratio": 0.502867747134632, "grad/layer_8/attn": 0.007988549768924713, "grad/layer_8/mlp": 0.004890765994787216, "grad/layer_8/attn_mlp_ratio": 1.63339436278485, "grad/layer_12/attn": 0.004165482707321644, "grad/layer_12/mlp": 0.005415063351392746, "grad/layer_12/attn_mlp_ratio": 0.7692398703565123, "grad/layer_16/attn": 0.006383396219462156, "grad/layer_16/mlp": 0.005006971303373575, "grad/layer_16/attn_mlp_ratio": 1.2749016731274279, "grad/layer_20/attn": 0.008932742290198803, "grad/layer_20/mlp": 0.007845352403819561, "grad/layer_20/attn_mlp_ratio": 1.1386030501306141, "grad/layer_24/attn": 0.0233412217348814, "grad/layer_24/mlp": 0.018522586673498154, "grad/layer_24/attn_mlp_ratio": 1.2601491368515374, "grad/layer_27/attn": 0.014540396630764008, "grad/layer_27/mlp": 0.0178645271807909, "grad/layer_27/attn_mlp_ratio": 0.8139256305090583} {"step": 6600, "timestamp": 1778201666.1122382, "eos/sharpness": 25.62751770019531, "eos/L0_probe": 2.2624621391296387, "eos/L_plus": 2.4686360359191895, "eos/L_minus": 2.312563419342041, "eos/grad_norm": 0.24946992099285126, "eos/embed_grad_frac": 0.06757491081953049, "eos/time_s": 0.6041336059570312} {"step": 6600, "timestamp": 1778201666.1322134, "train/loss": 2.4830960035324097, "train/z_loss": 0.0018764996551908554, "train/perplexity": 11.978291903705273, "train/grad_norm": 0.2490234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1836855.9285992666, "perf/iters_per_sec": 0.8758811610218366, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1417073965072633, "data/tokens_consumed": 13843300352, "data/tokens_consumed_B": 13.843300352, "train/loss_slope": -8.169826584251444e-05} {"step": 6600, "timestamp": 1778201667.4908774, "geo/rankme_last": 431.4407043457031, "geo/layer_0/stable_rank_q_proj": 17.106325149536133, "geo/layer_0/stable_rank_k_proj": 12.155363082885742, "geo/layer_0/stable_rank_o_proj": 57.70978927612305, "geo/layer_0/stable_rank_gate_proj": 176.05377197265625, "geo/layer_0/stable_rank_down_proj": 47.203147888183594, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.04553095996379852, "geo/layer_0/attn_entropy_mean": 6.495584964752197, "geo/layer_0/attn_entropy_std": 0.24364565312862396, "geo/layer_7/stable_rank_q_proj": 43.92417526245117, "geo/layer_7/stable_rank_k_proj": 43.84980773925781, "geo/layer_7/stable_rank_o_proj": 103.76219940185547, "geo/layer_7/stable_rank_gate_proj": 146.2924346923828, "geo/layer_7/stable_rank_down_proj": 191.4407196044922, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5874574184417725, "geo/layer_7/attn_entropy_mean": 4.771385192871094, "geo/layer_7/attn_entropy_std": 0.9762566089630127, "geo/layer_14/stable_rank_q_proj": 76.57778930664062, "geo/layer_14/stable_rank_k_proj": 56.93288040161133, "geo/layer_14/stable_rank_o_proj": 52.448707580566406, "geo/layer_14/stable_rank_gate_proj": 141.1199188232422, "geo/layer_14/stable_rank_down_proj": 147.7274932861328, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.40048110485076904, "geo/layer_14/attn_entropy_mean": 5.928411483764648, "geo/layer_14/attn_entropy_std": 0.600366473197937, "geo/layer_21/stable_rank_q_proj": 51.20687484741211, "geo/layer_21/stable_rank_k_proj": 32.61027908325195, "geo/layer_21/stable_rank_o_proj": 95.00093078613281, "geo/layer_21/stable_rank_gate_proj": 124.93783569335938, "geo/layer_21/stable_rank_down_proj": 90.608642578125, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.16504117846488953, "geo/layer_21/attn_entropy_mean": 5.865507125854492, "geo/layer_21/attn_entropy_std": 0.301640123128891, "geo/layer_27/stable_rank_q_proj": 43.25124740600586, "geo/layer_27/stable_rank_k_proj": 35.52985763549805, "geo/layer_27/stable_rank_o_proj": 108.60684967041016, "geo/layer_27/stable_rank_gate_proj": 92.84235382080078, "geo/layer_27/stable_rank_down_proj": 166.33343505859375, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0631081685423851, "geo/layer_27/attn_entropy_mean": 4.63076114654541, "geo/layer_27/attn_entropy_std": 0.4813522398471832, "attnres/final_alpha/block_0": 0.2451668679714203, "attnres/block_norm/0": 1.238553524017334, "attnres/final_alpha/block_1": 0.013204781338572502, "attnres/block_norm/1": 10759.9619140625, "attnres/final_alpha/block_2": 0.027316128835082054, "attnres/block_norm/2": 7810.744140625, "attnres/final_alpha/block_3": 0.024874836206436157, "attnres/block_norm/3": 7819.16650390625, "attnres/final_alpha/block_4": 0.039010852575302124, "attnres/block_norm/4": 3789.993408203125, "attnres/final_alpha/block_5": 0.4415205121040344, "attnres/block_norm/5": 3362.65966796875, "attnres/final_alpha/block_6": 0.20890605449676514, "attnres/block_norm/6": 6066.66796875, "geo/tier1_time_s": 1.3545384407043457, "geo/step": 6600.0, "geo/rankme_slope": 0.016153373575992898} {"step": 6610, "timestamp": 1778201678.345647, "train/loss": 2.4377081632614135, "train/z_loss": 0.0018906585872173309, "train/perplexity": 11.44677651467476, "train/grad_norm": 0.1298828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1717685.652439865, "perf/iters_per_sec": 0.8190563451957059, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2209172248840332, "data/tokens_consumed": 13864271872, "data/tokens_consumed_B": 13.864271872, "train/loss_slope": -8.278403038954257e-05} {"step": 6620, "timestamp": 1778201688.7293735, "train/loss": 2.4229010105133058, "train/z_loss": 0.0018866560189053417, "train/perplexity": 11.278531038579052, "train/grad_norm": 0.267578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021268.1352282101, "perf/iters_per_sec": 0.9638157535687495, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037542700767517, "data/tokens_consumed": 13885243392, "data/tokens_consumed_B": 13.885243392, "train/loss_slope": -8.44323738251511e-05} {"step": 6630, "timestamp": 1778201699.110226, "train/loss": 2.3984130382537843, "train/z_loss": 0.0018985393806360662, "train/perplexity": 11.005696894709924, "train/grad_norm": 0.1640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021295.1213189368, "perf/iters_per_sec": 0.9638286215395626, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0375288486480714, "data/tokens_consumed": 13906214912, "data/tokens_consumed_B": 13.906214912, "train/loss_slope": -8.256258124744362e-05} {"step": 6640, "timestamp": 1778201709.4947393, "train/loss": 2.449151849746704, "train/z_loss": 0.0018859682255424558, "train/perplexity": 11.578522226929959, "train/grad_norm": 0.1552734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020829.3059900897, "perf/iters_per_sec": 0.9636065034819077, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037768006324768, "data/tokens_consumed": 13927186432, "data/tokens_consumed_B": 13.927186432, "train/loss_slope": -8.011807050570477e-05} {"step": 6650, "timestamp": 1778201719.8575113, "grad/layer_0/attn": 0.00259896507486701, "grad/layer_0/mlp": 0.003148090559989214, "grad/layer_0/attn_mlp_ratio": 0.8255686876806511, "grad/layer_4/attn": 0.001653185929171741, "grad/layer_4/mlp": 0.0031198945362120867, "grad/layer_4/attn_mlp_ratio": 0.5298851794491689, "grad/layer_8/attn": 0.00571431266143918, "grad/layer_8/mlp": 0.004630095791071653, "grad/layer_8/attn_mlp_ratio": 1.2341672388380236, "grad/layer_12/attn": 0.004415557719767094, "grad/layer_12/mlp": 0.005548755172640085, "grad/layer_12/attn_mlp_ratio": 0.7957744580193354, "grad/layer_16/attn": 0.005442171823233366, "grad/layer_16/mlp": 0.004762580618262291, "grad/layer_16/attn_mlp_ratio": 1.1426938765289916, "grad/layer_20/attn": 0.007206643000245094, "grad/layer_20/mlp": 0.007712456863373518, "grad/layer_20/attn_mlp_ratio": 0.9344159759295209, "grad/layer_24/attn": 0.022511687129735947, "grad/layer_24/mlp": 0.015805846080183983, "grad/layer_24/attn_mlp_ratio": 1.4242633309919952, "grad/layer_27/attn": 0.012708433903753757, "grad/layer_27/mlp": 0.014603565447032452, "grad/layer_27/attn_mlp_ratio": 0.8702281550916312} {"step": 6650, "timestamp": 1778201719.874057, "train/loss": 2.433857870101929, "train/z_loss": 0.0018849036772735418, "train/perplexity": 11.402787808413043, "train/grad_norm": 0.2041015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021611.7623942727, "perf/iters_per_sec": 0.9639796077700962, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373663425445556, "data/tokens_consumed": 13948157952, "data/tokens_consumed_B": 13.948157952, "train/loss_slope": -7.850061285578021e-05} {"step": 6660, "timestamp": 1778201730.2548223, "train/loss": 2.50508828163147, "train/z_loss": 0.0018645109492354095, "train/perplexity": 12.244639894873915, "train/grad_norm": 0.19140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021234.4616849602, "perf/iters_per_sec": 0.9637996967720796, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037559986114502, "data/tokens_consumed": 13969129472, "data/tokens_consumed_B": 13.969129472, "train/loss_slope": -7.102096091509938e-05} {"step": 6670, "timestamp": 1778201740.6384318, "train/loss": 2.4503054141998293, "train/z_loss": 0.0018856527400203048, "train/perplexity": 11.591886505386949, "train/grad_norm": 0.140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020852.7982617726, "perf/iters_per_sec": 0.9636177054699767, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0377559423446656, "data/tokens_consumed": 13990100992, "data/tokens_consumed_B": 13.990100992, "train/loss_slope": -6.682698670143197e-05} {"step": 6675, "timestamp": 1778201746.4276173, "eos/sharpness": 54.10523414611816, "eos/L0_probe": 2.2563345432281494, "eos/L_plus": 2.4124937057495117, "eos/L_minus": 2.6412277221679688, "eos/grad_norm": 0.25532183051109314, "eos/embed_grad_frac": 0.08197804540395737, "eos/time_s": 0.6099569797515869} {"step": 6675, "timestamp": 1778201747.8032072, "geo/rankme_last": 431.9219665527344, "geo/layer_0/stable_rank_q_proj": 17.02503204345703, "geo/layer_0/stable_rank_k_proj": 12.14611530303955, "geo/layer_0/stable_rank_o_proj": 57.50520706176758, "geo/layer_0/stable_rank_gate_proj": 175.2905731201172, "geo/layer_0/stable_rank_down_proj": 47.346778869628906, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.03818957135081291, "geo/layer_0/attn_entropy_mean": 6.495635986328125, "geo/layer_0/attn_entropy_std": 0.24397218227386475, "geo/layer_7/stable_rank_q_proj": 43.87076187133789, "geo/layer_7/stable_rank_k_proj": 44.1025276184082, "geo/layer_7/stable_rank_o_proj": 103.52935028076172, "geo/layer_7/stable_rank_gate_proj": 145.4457550048828, "geo/layer_7/stable_rank_down_proj": 189.36798095703125, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5889065265655518, "geo/layer_7/attn_entropy_mean": 4.7926130294799805, "geo/layer_7/attn_entropy_std": 0.9894776940345764, "geo/layer_14/stable_rank_q_proj": 76.30470275878906, "geo/layer_14/stable_rank_k_proj": 56.832176208496094, "geo/layer_14/stable_rank_o_proj": 51.94850540161133, "geo/layer_14/stable_rank_gate_proj": 140.4334259033203, "geo/layer_14/stable_rank_down_proj": 146.8257293701172, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4121217429637909, "geo/layer_14/attn_entropy_mean": 5.966268539428711, "geo/layer_14/attn_entropy_std": 0.5616563558578491, "geo/layer_21/stable_rank_q_proj": 51.54258728027344, "geo/layer_21/stable_rank_k_proj": 32.57453918457031, "geo/layer_21/stable_rank_o_proj": 94.96786499023438, "geo/layer_21/stable_rank_gate_proj": 124.21868896484375, "geo/layer_21/stable_rank_down_proj": 89.59269714355469, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1701427847146988, "geo/layer_21/attn_entropy_mean": 5.829648017883301, "geo/layer_21/attn_entropy_std": 0.3117504417896271, "geo/layer_27/stable_rank_q_proj": 43.20795822143555, "geo/layer_27/stable_rank_k_proj": 35.52912521362305, "geo/layer_27/stable_rank_o_proj": 109.74990844726562, "geo/layer_27/stable_rank_gate_proj": 92.71513366699219, "geo/layer_27/stable_rank_down_proj": 166.9974822998047, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.06252268701791763, "geo/layer_27/attn_entropy_mean": 4.584926605224609, "geo/layer_27/attn_entropy_std": 0.525919497013092, "attnres/final_alpha/block_0": 0.24608205258846283, "attnres/block_norm/0": 1.2432835102081299, "attnres/final_alpha/block_1": 0.012914445251226425, "attnres/block_norm/1": 10927.37890625, "attnres/final_alpha/block_2": 0.02674940600991249, "attnres/block_norm/2": 7933.07958984375, "attnres/final_alpha/block_3": 0.024743974208831787, "attnres/block_norm/3": 7916.30810546875, "attnres/final_alpha/block_4": 0.03848031535744667, "attnres/block_norm/4": 3815.36767578125, "attnres/final_alpha/block_5": 0.4418588876724243, "attnres/block_norm/5": 3375.21240234375, "attnres/final_alpha/block_6": 0.20917092263698578, "attnres/block_norm/6": 6121.2998046875, "geo/tier1_time_s": 1.35595703125, "geo/step": 6675.0, "geo/rankme_slope": 0.015735560943127253} {"step": 6680, "timestamp": 1778201752.9963067, "train/loss": 2.4403369665145873, "train/z_loss": 0.0018836289877071978, "train/perplexity": 11.47690742478024, "train/grad_norm": 0.26171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1697751.6748295631, "perf/iters_per_sec": 0.8095510839603248, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2352524995803833, "data/tokens_consumed": 14011072512, "data/tokens_consumed_B": 14.011072512, "train/loss_slope": -6.768338376253242e-05} {"step": 6690, "timestamp": 1778201763.378348, "train/loss": 2.4094426155090334, "train/z_loss": 0.0018853871268220245, "train/perplexity": 11.12775697701417, "train/grad_norm": 0.17578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020936.557762482, "perf/iters_per_sec": 0.9636576451122675, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0377129316329956, "data/tokens_consumed": 14032044032, "data/tokens_consumed_B": 14.032044032, "train/loss_slope": -6.966321888250377e-05} {"step": 6700, "timestamp": 1778201773.751249, "grad/layer_0/attn": 0.0028846454806625843, "grad/layer_0/mlp": 0.0035251504741609097, "grad/layer_0/attn_mlp_ratio": 0.8183041887080853, "grad/layer_4/attn": 0.0020630541257560253, "grad/layer_4/mlp": 0.0033729670103639364, "grad/layer_4/attn_mlp_ratio": 0.611643712568969, "grad/layer_8/attn": 0.006776639726012945, "grad/layer_8/mlp": 0.004714287351816893, "grad/layer_8/attn_mlp_ratio": 1.4374685029868555, "grad/layer_12/attn": 0.005582714453339577, "grad/layer_12/mlp": 0.005690158810466528, "grad/layer_12/attn_mlp_ratio": 0.9811174944641148, "grad/layer_16/attn": 0.004848957993090153, "grad/layer_16/mlp": 0.0045591238886117935, "grad/layer_16/attn_mlp_ratio": 1.0635723014338574, "grad/layer_20/attn": 0.010127932764589787, "grad/layer_20/mlp": 0.007206037174910307, "grad/layer_20/attn_mlp_ratio": 1.4054788198019486, "grad/layer_24/attn": 0.020920583978295326, "grad/layer_24/mlp": 0.016193578019738197, "grad/layer_24/attn_mlp_ratio": 1.2919062003224244, "grad/layer_27/attn": 0.012347021140158176, "grad/layer_27/mlp": 0.014972050674259663, "grad/layer_27/attn_mlp_ratio": 0.8246713377024806} {"step": 6700, "timestamp": 1778201773.7683005, "train/loss": 2.4745697736740113, "train/z_loss": 0.0018757629208266736, "train/perplexity": 11.876596389364435, "train/grad_norm": 0.2109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019598.4903885946, "perf/iters_per_sec": 0.9630196048682187, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0384004592895508, "data/tokens_consumed": 14053015552, "data/tokens_consumed_B": 14.053015552, "train/loss_slope": -6.558388469099745e-05} {"step": 6710, "timestamp": 1778201784.1515405, "train/loss": 2.3529474258422853, "train/z_loss": 0.0019094654358923434, "train/perplexity": 10.516520754430932, "train/grad_norm": 0.279296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020831.5809095097, "perf/iters_per_sec": 0.9636075882480191, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0377668380737304, "data/tokens_consumed": 14073987072, "data/tokens_consumed_B": 14.073987072, "train/loss_slope": -6.831524837778881e-05} {"step": 6720, "timestamp": 1778201794.5257947, "train/loss": 2.4347208023071287, "train/z_loss": 0.0018868491402827202, "train/perplexity": 11.412631888017776, "train/grad_norm": 0.173828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022488.059423915, "perf/iters_per_sec": 0.9643974587554527, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0369168758392333, "data/tokens_consumed": 14094958592, "data/tokens_consumed_B": 14.094958592, "train/loss_slope": -6.759257016151899e-05} {"step": 6730, "timestamp": 1778201805.3625157, "train/loss": 2.434279537200928, "train/z_loss": 0.001889391802251339, "train/perplexity": 11.407597002736969, "train/grad_norm": 0.2392578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1936208.2953779106, "perf/iters_per_sec": 0.9232560612573197, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0831231355667115, "data/tokens_consumed": 14115930112, "data/tokens_consumed_B": 14.115930112, "train/loss_slope": -6.769487852811311e-05} {"step": 6740, "timestamp": 1778201815.7157724, "train/loss": 2.4279561281204223, "train/z_loss": 0.0018896354711614549, "train/perplexity": 11.335689689545719, "train/grad_norm": 0.2177734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026872.277475235, "perf/iters_per_sec": 0.9664880168319869, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346739768981934, "data/tokens_consumed": 14136901632, "data/tokens_consumed_B": 14.136901632, "train/loss_slope": -6.594996536740159e-05} {"step": 6750, "timestamp": 1778201826.0546978, "grad/layer_0/attn": 0.003357934532687068, "grad/layer_0/mlp": 0.0037625280674546957, "grad/layer_0/attn_mlp_ratio": 0.8924676130620623, "grad/layer_4/attn": 0.0021256061736494303, "grad/layer_4/mlp": 0.003459952073171735, "grad/layer_4/attn_mlp_ratio": 0.6143455363722237, "grad/layer_8/attn": 0.010089937597513199, "grad/layer_8/mlp": 0.004996154922991991, "grad/layer_8/attn_mlp_ratio": 2.0195405368888557, "grad/layer_12/attn": 0.0046410816721618176, "grad/layer_12/mlp": 0.006279435940086842, "grad/layer_12/attn_mlp_ratio": 0.7390921163196741, "grad/layer_16/attn": 0.005201349034905434, "grad/layer_16/mlp": 0.00500171072781086, "grad/layer_16/attn_mlp_ratio": 1.039913984228063, "grad/layer_20/attn": 0.005615903530269861, "grad/layer_20/mlp": 0.007871553301811218, "grad/layer_20/attn_mlp_ratio": 0.7134428547455018, "grad/layer_24/attn": 0.014481883496046066, "grad/layer_24/mlp": 0.010463450103998184, "grad/layer_24/attn_mlp_ratio": 1.3840447666595097, "grad/layer_27/attn": 0.006185546983033419, "grad/layer_27/mlp": 0.010113135911524296, "grad/layer_27/attn_mlp_ratio": 0.6116349049379696} {"step": 6750, "timestamp": 1778201826.6649776, "eos/sharpness": 23.649644851684567, "eos/L0_probe": 2.2558352947235107, "eos/L_plus": 2.3658246994018555, "eos/L_minus": 2.3823423385620117, "eos/grad_norm": 0.16550347208976746, "eos/embed_grad_frac": 0.13266827166080475, "eos/time_s": 0.6075711250305176} {"step": 6750, "timestamp": 1778201826.6851404, "train/loss": 2.5072437047958376, "train/z_loss": 0.0018561136675998569, "train/perplexity": 12.271060739163632, "train/grad_norm": 0.1650390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1912727.9110929281, "perf/iters_per_sec": 0.9120597415413514, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0964194059371948, "data/tokens_consumed": 14157873152, "data/tokens_consumed_B": 14.157873152, "train/loss_slope": -6.212909200904676e-05} {"step": 6750, "timestamp": 1778201828.048913, "geo/rankme_last": 432.9426574707031, "geo/layer_0/stable_rank_q_proj": 16.906896591186523, "geo/layer_0/stable_rank_k_proj": 12.126119613647461, "geo/layer_0/stable_rank_o_proj": 57.503013610839844, "geo/layer_0/stable_rank_gate_proj": 174.73733520507812, "geo/layer_0/stable_rank_down_proj": 47.59269332885742, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.03424735739827156, "geo/layer_0/attn_entropy_mean": 6.494050979614258, "geo/layer_0/attn_entropy_std": 0.24161656200885773, "geo/layer_7/stable_rank_q_proj": 43.96643829345703, "geo/layer_7/stable_rank_k_proj": 44.10952377319336, "geo/layer_7/stable_rank_o_proj": 103.59902954101562, "geo/layer_7/stable_rank_gate_proj": 145.50119018554688, "geo/layer_7/stable_rank_down_proj": 189.35923767089844, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5977361798286438, "geo/layer_7/attn_entropy_mean": 4.723160743713379, "geo/layer_7/attn_entropy_std": 0.9994862079620361, "geo/layer_14/stable_rank_q_proj": 76.17830657958984, "geo/layer_14/stable_rank_k_proj": 56.75522994995117, "geo/layer_14/stable_rank_o_proj": 51.54967498779297, "geo/layer_14/stable_rank_gate_proj": 139.52354431152344, "geo/layer_14/stable_rank_down_proj": 145.7404327392578, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4023013412952423, "geo/layer_14/attn_entropy_mean": 5.938840866088867, "geo/layer_14/attn_entropy_std": 0.5745088458061218, "geo/layer_21/stable_rank_q_proj": 51.5904655456543, "geo/layer_21/stable_rank_k_proj": 32.53416442871094, "geo/layer_21/stable_rank_o_proj": 94.89704895019531, "geo/layer_21/stable_rank_gate_proj": 124.105224609375, "geo/layer_21/stable_rank_down_proj": 88.75507354736328, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.16437943279743195, "geo/layer_21/attn_entropy_mean": 5.832549571990967, "geo/layer_21/attn_entropy_std": 0.3087918162345886, "geo/layer_27/stable_rank_q_proj": 43.28859329223633, "geo/layer_27/stable_rank_k_proj": 35.52064514160156, "geo/layer_27/stable_rank_o_proj": 110.06517028808594, "geo/layer_27/stable_rank_gate_proj": 93.01799011230469, "geo/layer_27/stable_rank_down_proj": 167.11679077148438, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.05737787485122681, "geo/layer_27/attn_entropy_mean": 4.589448928833008, "geo/layer_27/attn_entropy_std": 0.523247241973877, "attnres/final_alpha/block_0": 0.24777603149414062, "attnres/block_norm/0": 1.248476505279541, "attnres/final_alpha/block_1": 0.012898346409201622, "attnres/block_norm/1": 11104.2890625, "attnres/final_alpha/block_2": 0.026716448366642, "attnres/block_norm/2": 8050.1376953125, "attnres/final_alpha/block_3": 0.024915415793657303, "attnres/block_norm/3": 8020.5234375, "attnres/final_alpha/block_4": 0.038476791232824326, "attnres/block_norm/4": 3884.5595703125, "attnres/final_alpha/block_5": 0.44009220600128174, "attnres/block_norm/5": 3395.4736328125, "attnres/final_alpha/block_6": 0.20912477374076843, "attnres/block_norm/6": 6228.0947265625, "geo/tier1_time_s": 1.3597776889801025, "geo/step": 6750.0, "geo/rankme_slope": 0.015252466767957183} {"step": 6760, "timestamp": 1778201838.3977787, "train/loss": 2.4720773935317992, "train/z_loss": 0.0018688401323743165, "train/perplexity": 11.847032254202478, "train/grad_norm": 0.25, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1791079.7641501136, "perf/iters_per_sec": 0.8540533848524635, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1708869934082031, "data/tokens_consumed": 14178844672, "data/tokens_consumed_B": 14.178844672, "train/loss_slope": -5.966264851773565e-05} {"step": 6770, "timestamp": 1778201848.7552288, "train/loss": 2.4374398708343508, "train/z_loss": 0.0018769094021990895, "train/perplexity": 11.443705843158968, "train/grad_norm": 0.12109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025895.4006460593, "perf/iters_per_sec": 0.9660222056608483, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351728916168212, "data/tokens_consumed": 14199816192, "data/tokens_consumed_B": 14.199816192, "train/loss_slope": -6.0001039075808484e-05} {"step": 6780, "timestamp": 1778201859.115354, "train/loss": 2.412999701499939, "train/z_loss": 0.0018984115682542323, "train/perplexity": 11.167409847993062, "train/grad_norm": 0.2138671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025185.2520017775, "perf/iters_per_sec": 0.9656835803994072, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0355358839035034, "data/tokens_consumed": 14220787712, "data/tokens_consumed_B": 14.220787712, "train/loss_slope": -6.306876063239558e-05} {"step": 6790, "timestamp": 1778201869.465847, "train/loss": 2.4201558351516725, "train/z_loss": 0.0018863979494199157, "train/perplexity": 11.247611951631528, "train/grad_norm": 0.11865234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027594.6401273103, "perf/iters_per_sec": 0.9668324661861946, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343053579330443, "data/tokens_consumed": 14241759232, "data/tokens_consumed_B": 14.241759232, "train/loss_slope": -6.0150494677076894e-05} {"step": 6800, "timestamp": 1778201879.8050518, "grad/layer_0/attn": 0.0034680154640227556, "grad/layer_0/mlp": 0.003986972849816084, "grad/layer_0/attn_mlp_ratio": 0.8698367176488451, "grad/layer_4/attn": 0.001850281492806971, "grad/layer_4/mlp": 0.0035775487776845694, "grad/layer_4/attn_mlp_ratio": 0.5171925125463259, "grad/layer_8/attn": 0.006166399922221899, "grad/layer_8/mlp": 0.004837563261389732, "grad/layer_8/attn_mlp_ratio": 1.274691298399949, "grad/layer_12/attn": 0.006269198376685381, "grad/layer_12/mlp": 0.006321114022284746, "grad/layer_12/attn_mlp_ratio": 0.9917869311334946, "grad/layer_16/attn": 0.0075457338243722916, "grad/layer_16/mlp": 0.005160994827747345, "grad/layer_16/attn_mlp_ratio": 1.4620696067348842, "grad/layer_20/attn": 0.007247614674270153, "grad/layer_20/mlp": 0.009397613815963268, "grad/layer_20/attn_mlp_ratio": 0.7712186028369376, "grad/layer_24/attn": 0.01692427694797516, "grad/layer_24/mlp": 0.012830017134547234, "grad/layer_24/attn_mlp_ratio": 1.3191156830564001, "grad/layer_27/attn": 0.005724353715777397, "grad/layer_27/mlp": 0.013372172601521015, "grad/layer_27/attn_mlp_ratio": 0.4280795532296918} {"step": 6800, "timestamp": 1778201879.8208103, "train/loss": 2.4324111461639406, "train/z_loss": 0.0018811932066455483, "train/perplexity": 11.386303049653597, "train/grad_norm": 0.232421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026172.7386611484, "perf/iters_per_sec": 0.966154450731825, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350311994552612, "data/tokens_consumed": 14262730752, "data/tokens_consumed_B": 14.262730752, "train/loss_slope": -5.72953507451727e-05} {"step": 6810, "timestamp": 1778201890.1778288, "train/loss": 2.441774320602417, "train/z_loss": 0.001879604917485267, "train/perplexity": 11.493415665834624, "train/grad_norm": 0.1943359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025967.8194694738, "perf/iters_per_sec": 0.9660567376468057, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351358890533446, "data/tokens_consumed": 14283702272, "data/tokens_consumed_B": 14.283702272, "train/loss_slope": -5.769903752097765e-05} {"step": 6820, "timestamp": 1778201900.5300663, "train/loss": 2.4125839471817017, "train/z_loss": 0.0018887359532527626, "train/perplexity": 11.162767914144116, "train/grad_norm": 0.1708984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027182.1655137555, "perf/iters_per_sec": 0.9666357829636362, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345158100128173, "data/tokens_consumed": 14304673792, "data/tokens_consumed_B": 14.304673792, "train/loss_slope": -5.723106386852513e-05} {"step": 6825, "timestamp": 1778201906.3068569, "eos/sharpness": 6.959152221679686, "eos/L0_probe": 2.252556800842285, "eos/L_plus": 2.2944681644439697, "eos/L_minus": 2.2802369594573975, "eos/grad_norm": 0.12078003585338593, "eos/embed_grad_frac": 0.21156533062458038, "eos/time_s": 0.6076247692108154} {"step": 6825, "timestamp": 1778201907.6850057, "geo/rankme_last": 432.9248962402344, "geo/layer_0/stable_rank_q_proj": 16.717737197875977, "geo/layer_0/stable_rank_k_proj": 12.025976181030273, "geo/layer_0/stable_rank_o_proj": 57.28009033203125, "geo/layer_0/stable_rank_gate_proj": 173.1996307373047, "geo/layer_0/stable_rank_down_proj": 47.41874313354492, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.03240695223212242, "geo/layer_0/attn_entropy_mean": 6.48869514465332, "geo/layer_0/attn_entropy_std": 0.2416492998600006, "geo/layer_7/stable_rank_q_proj": 43.988399505615234, "geo/layer_7/stable_rank_k_proj": 44.10856628417969, "geo/layer_7/stable_rank_o_proj": 103.13699340820312, "geo/layer_7/stable_rank_gate_proj": 144.93092346191406, "geo/layer_7/stable_rank_down_proj": 188.50735473632812, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5894335508346558, "geo/layer_7/attn_entropy_mean": 4.72679328918457, "geo/layer_7/attn_entropy_std": 0.9613199830055237, "geo/layer_14/stable_rank_q_proj": 75.59386444091797, "geo/layer_14/stable_rank_k_proj": 56.04519271850586, "geo/layer_14/stable_rank_o_proj": 50.898502349853516, "geo/layer_14/stable_rank_gate_proj": 138.46392822265625, "geo/layer_14/stable_rank_down_proj": 145.47377014160156, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39769336581230164, "geo/layer_14/attn_entropy_mean": 5.96834659576416, "geo/layer_14/attn_entropy_std": 0.5729801058769226, "geo/layer_21/stable_rank_q_proj": 51.591373443603516, "geo/layer_21/stable_rank_k_proj": 32.4228515625, "geo/layer_21/stable_rank_o_proj": 95.12860870361328, "geo/layer_21/stable_rank_gate_proj": 123.14822387695312, "geo/layer_21/stable_rank_down_proj": 88.09907531738281, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1658172905445099, "geo/layer_21/attn_entropy_mean": 5.8401055335998535, "geo/layer_21/attn_entropy_std": 0.3060348331928253, "geo/layer_27/stable_rank_q_proj": 43.45957565307617, "geo/layer_27/stable_rank_k_proj": 35.34908676147461, "geo/layer_27/stable_rank_o_proj": 110.35320281982422, "geo/layer_27/stable_rank_gate_proj": 93.06056213378906, "geo/layer_27/stable_rank_down_proj": 166.8353271484375, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.061357393860816956, "geo/layer_27/attn_entropy_mean": 4.583505630493164, "geo/layer_27/attn_entropy_std": 0.5178429484367371, "attnres/final_alpha/block_0": 0.24556149542331696, "attnres/block_norm/0": 1.2534198760986328, "attnres/final_alpha/block_1": 0.012810519896447659, "attnres/block_norm/1": 11278.62890625, "attnres/final_alpha/block_2": 0.026535402983427048, "attnres/block_norm/2": 8166.6220703125, "attnres/final_alpha/block_3": 0.02440469339489937, "attnres/block_norm/3": 8201.673828125, "attnres/final_alpha/block_4": 0.037653449922800064, "attnres/block_norm/4": 3914.5498046875, "attnres/final_alpha/block_5": 0.44926589727401733, "attnres/block_norm/5": 3384.50537109375, "attnres/final_alpha/block_6": 0.2037685364484787, "attnres/block_norm/6": 6358.40283203125, "geo/tier1_time_s": 1.3586807250976562, "geo/step": 6825.0, "geo/rankme_slope": 0.014861031463366597} {"step": 6830, "timestamp": 1778201912.863578, "train/loss": 2.4307150840759277, "train/z_loss": 0.0018812958151102066, "train/perplexity": 11.367007540544686, "train/grad_norm": 0.2197265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1700969.1867948791, "perf/iters_per_sec": 0.8110853132223507, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2329159259796143, "data/tokens_consumed": 14325645312, "data/tokens_consumed_B": 14.325645312, "train/loss_slope": -5.53257288247516e-05} {"step": 6840, "timestamp": 1778201923.2126882, "train/loss": 2.426444435119629, "train/z_loss": 0.0018816876923665403, "train/perplexity": 11.318566552506912, "train/grad_norm": 0.2236328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027389.1061366848, "perf/iters_per_sec": 0.9667344599421905, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344102144241334, "data/tokens_consumed": 14346616832, "data/tokens_consumed_B": 14.346616832, "train/loss_slope": -5.628124805602654e-05} {"step": 6850, "timestamp": 1778201933.5522373, "grad/layer_0/attn": 0.0032036113552749157, "grad/layer_0/mlp": 0.003932046703994274, "grad/layer_0/attn_mlp_ratio": 0.8147439526967495, "grad/layer_4/attn": 0.0017727658851072192, "grad/layer_4/mlp": 0.003493324853479862, "grad/layer_4/attn_mlp_ratio": 0.5074723676482731, "grad/layer_8/attn": 0.006782554090023041, "grad/layer_8/mlp": 0.005236178170889616, "grad/layer_8/attn_mlp_ratio": 1.2953252809841211, "grad/layer_12/attn": 0.006378826219588518, "grad/layer_12/mlp": 0.006401782855391502, "grad/layer_12/attn_mlp_ratio": 0.9964140090404581, "grad/layer_16/attn": 0.005657911766320467, "grad/layer_16/mlp": 0.005107137840241194, "grad/layer_16/attn_mlp_ratio": 1.1078439299121932, "grad/layer_20/attn": 0.006120510399341583, "grad/layer_20/mlp": 0.008123822510242462, "grad/layer_20/attn_mlp_ratio": 0.7534027628354273, "grad/layer_24/attn": 0.017456576228141785, "grad/layer_24/mlp": 0.011851739138364792, "grad/layer_24/attn_mlp_ratio": 1.4729126145160027, "grad/layer_27/attn": 0.005544903688132763, "grad/layer_27/mlp": 0.00981487799435854, "grad/layer_27/attn_mlp_ratio": 0.5649488088211609} {"step": 6850, "timestamp": 1778201933.5679853, "train/loss": 2.399094295501709, "train/z_loss": 0.0018875406007282437, "train/perplexity": 11.013197160002875, "train/grad_norm": 0.1708984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026342.7349730455, "perf/iters_per_sec": 0.9662355112900951, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349443674087524, "data/tokens_consumed": 14367588352, "data/tokens_consumed_B": 14.367588352, "train/loss_slope": -5.884707624262021e-05} {"step": 6860, "timestamp": 1778201943.918556, "train/loss": 2.394807720184326, "train/z_loss": 0.0018986735143698753, "train/perplexity": 10.966089298721867, "train/grad_norm": 0.1572265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027245.9392782652, "perf/iters_per_sec": 0.9666661926642729, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03448326587677, "data/tokens_consumed": 14388559872, "data/tokens_consumed_B": 14.388559872, "train/loss_slope": -6.020689784604215e-05} {"step": 6870, "timestamp": 1778201954.2763348, "train/loss": 2.4392035961151124, "train/z_loss": 0.0018724987749010324, "train/perplexity": 11.463907206050623, "train/grad_norm": 0.24609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025641.5090473553, "perf/iters_per_sec": 0.9659011407124306, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0353026390075684, "data/tokens_consumed": 14409531392, "data/tokens_consumed_B": 14.409531392, "train/loss_slope": -6.098231479565794e-05} {"step": 6880, "timestamp": 1778201964.6379464, "train/loss": 2.379522919654846, "train/z_loss": 0.0019029564457014204, "train/perplexity": 10.799749286576562, "train/grad_norm": 0.197265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024944.8238777053, "perf/iters_per_sec": 0.9655689353359724, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035658836364746, "data/tokens_consumed": 14430502912, "data/tokens_consumed_B": 14.430502912, "train/loss_slope": -6.450822006429301e-05} {"step": 6890, "timestamp": 1778201974.9985926, "train/loss": 2.4212410688400268, "train/z_loss": 0.001881141366902739, "train/perplexity": 11.259824864768731, "train/grad_norm": 0.28125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025227.2640155135, "perf/iters_per_sec": 0.9657036132886474, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0355144023895264, "data/tokens_consumed": 14451474432, "data/tokens_consumed_B": 14.451474432, "train/loss_slope": -6.420459004804739e-05} {"step": 6900, "timestamp": 1778201985.356523, "grad/layer_0/attn": 0.0032803805079311132, "grad/layer_0/mlp": 0.003404551651328802, "grad/layer_0/attn_mlp_ratio": 0.9635278731335999, "grad/layer_4/attn": 0.001611029845662415, "grad/layer_4/mlp": 0.0031579870264977217, "grad/layer_4/attn_mlp_ratio": 0.510144525968693, "grad/layer_8/attn": 0.006373721174895763, "grad/layer_8/mlp": 0.004562697373330593, "grad/layer_8/attn_mlp_ratio": 1.3969195223112598, "grad/layer_12/attn": 0.00655019236728549, "grad/layer_12/mlp": 0.006009816657751799, "grad/layer_12/attn_mlp_ratio": 1.0899154884941684, "grad/layer_16/attn": 0.0055800918489694595, "grad/layer_16/mlp": 0.004862389527261257, "grad/layer_16/attn_mlp_ratio": 1.1476027790295467, "grad/layer_20/attn": 0.010425085201859474, "grad/layer_20/mlp": 0.006897450890392065, "grad/layer_20/attn_mlp_ratio": 1.5114402721210045, "grad/layer_24/attn": 0.006498041097074747, "grad/layer_24/mlp": 0.009615045972168446, "grad/layer_24/attn_mlp_ratio": 0.6758200687029332, "grad/layer_27/attn": 0.005664585158228874, "grad/layer_27/mlp": 0.008448300883173943, "grad/layer_27/attn_mlp_ratio": 0.6704999229443582} {"step": 6900, "timestamp": 1778201985.94577, "eos/sharpness": 6.721997261047362, "eos/L0_probe": 2.240837574005127, "eos/L_plus": 2.286245584487915, "eos/L_minus": 2.2626495361328125, "eos/grad_norm": 0.11406548321247101, "eos/embed_grad_frac": 0.2239822894334793, "eos/time_s": 0.5863306522369385} {"step": 6900, "timestamp": 1778201985.964418, "train/loss": 2.440667152404785, "train/z_loss": 0.001880738651379943, "train/perplexity": 11.480697563365716, "train/grad_norm": 0.1142578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1913284.5826953158, "perf/iters_per_sec": 0.9123251832462863, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.096100401878357, "data/tokens_consumed": 14472445952, "data/tokens_consumed_B": 14.472445952, "train/loss_slope": -5.921087412372059e-05} {"step": 6900, "timestamp": 1778201987.3293476, "geo/rankme_last": 432.5895690917969, "geo/layer_0/stable_rank_q_proj": 16.575536727905273, "geo/layer_0/stable_rank_k_proj": 11.99752140045166, "geo/layer_0/stable_rank_o_proj": 57.28559112548828, "geo/layer_0/stable_rank_gate_proj": 172.142333984375, "geo/layer_0/stable_rank_down_proj": 47.294273376464844, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.035607751458883286, "geo/layer_0/attn_entropy_mean": 6.480177879333496, "geo/layer_0/attn_entropy_std": 0.24535289406776428, "geo/layer_7/stable_rank_q_proj": 43.8548698425293, "geo/layer_7/stable_rank_k_proj": 44.38711166381836, "geo/layer_7/stable_rank_o_proj": 102.5082015991211, "geo/layer_7/stable_rank_gate_proj": 144.0066680908203, "geo/layer_7/stable_rank_down_proj": 187.29544067382812, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5838818550109863, "geo/layer_7/attn_entropy_mean": 4.748424053192139, "geo/layer_7/attn_entropy_std": 0.9882857203483582, "geo/layer_14/stable_rank_q_proj": 75.12300109863281, "geo/layer_14/stable_rank_k_proj": 55.26348876953125, "geo/layer_14/stable_rank_o_proj": 50.77627944946289, "geo/layer_14/stable_rank_gate_proj": 137.00743103027344, "geo/layer_14/stable_rank_down_proj": 145.51644897460938, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4004647731781006, "geo/layer_14/attn_entropy_mean": 5.923015117645264, "geo/layer_14/attn_entropy_std": 0.5807769894599915, "geo/layer_21/stable_rank_q_proj": 51.91131591796875, "geo/layer_21/stable_rank_k_proj": 32.56494903564453, "geo/layer_21/stable_rank_o_proj": 95.29102325439453, "geo/layer_21/stable_rank_gate_proj": 123.39827728271484, "geo/layer_21/stable_rank_down_proj": 87.71551513671875, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.16571591794490814, "geo/layer_21/attn_entropy_mean": 5.8627777099609375, "geo/layer_21/attn_entropy_std": 0.3001073896884918, "geo/layer_27/stable_rank_q_proj": 43.567813873291016, "geo/layer_27/stable_rank_k_proj": 35.21068572998047, "geo/layer_27/stable_rank_o_proj": 109.68028259277344, "geo/layer_27/stable_rank_gate_proj": 93.05941009521484, "geo/layer_27/stable_rank_down_proj": 166.96389770507812, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.06346631795167923, "geo/layer_27/attn_entropy_mean": 4.610163688659668, "geo/layer_27/attn_entropy_std": 0.5193805694580078, "attnres/final_alpha/block_0": 0.2466009259223938, "attnres/block_norm/0": 1.2582558393478394, "attnres/final_alpha/block_1": 0.01266766618937254, "attnres/block_norm/1": 11441.341796875, "attnres/final_alpha/block_2": 0.026355769485235214, "attnres/block_norm/2": 8301.06640625, "attnres/final_alpha/block_3": 0.024330316111445427, "attnres/block_norm/3": 8340.7451171875, "attnres/final_alpha/block_4": 0.03737608715891838, "attnres/block_norm/4": 3958.05517578125, "attnres/final_alpha/block_5": 0.45131832361221313, "attnres/block_norm/5": 3410.955810546875, "attnres/final_alpha/block_6": 0.20135092735290527, "attnres/block_norm/6": 6483.4580078125, "geo/tier1_time_s": 1.361140489578247, "geo/step": 6900.0, "geo/rankme_slope": 0.014310471493284814} {"step": 6910, "timestamp": 1778201997.6889129, "train/loss": 2.40910325050354, "train/z_loss": 0.00188123359112069, "train/perplexity": 11.123981246418193, "train/grad_norm": 0.162109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1789248.3467137697, "perf/iters_per_sec": 0.8531800969666337, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1720854759216308, "data/tokens_consumed": 14493417472, "data/tokens_consumed_B": 14.493417472, "train/loss_slope": -6.132034007901362e-05} {"step": 6920, "timestamp": 1778202008.0534298, "train/loss": 2.476411747932434, "train/z_loss": 0.0018630022532306612, "train/perplexity": 11.898492934436495, "train/grad_norm": 0.171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024430.68488187, "perf/iters_per_sec": 0.9653237747582769, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035921859741211, "data/tokens_consumed": 14514388992, "data/tokens_consumed_B": 14.514388992, "train/loss_slope": -5.636194172185918e-05} {"step": 6930, "timestamp": 1778202018.4369655, "train/loss": 2.3962550163269043, "train/z_loss": 0.0018856434151530265, "train/perplexity": 10.98197196815377, "train/grad_norm": 0.232421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021293.6814232643, "perf/iters_per_sec": 0.963827934943802, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0375295877456665, "data/tokens_consumed": 14535360512, "data/tokens_consumed_B": 14.535360512, "train/loss_slope": -6.129942584102154e-05} {"step": 6940, "timestamp": 1778202028.8050325, "train/loss": 2.3862155199050905, "train/z_loss": 0.0018874842557124794, "train/perplexity": 10.872270097062533, "train/grad_norm": 0.234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023809.3782712126, "perf/iters_per_sec": 0.9650275126796782, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0362398862838744, "data/tokens_consumed": 14556332032, "data/tokens_consumed_B": 14.556332032, "train/loss_slope": -6.167492946632759e-05} {"step": 6950, "timestamp": 1778202039.1726904, "grad/layer_0/attn": 0.0026454131584614515, "grad/layer_0/mlp": 0.003537200391292572, "grad/layer_0/attn_mlp_ratio": 0.7478832949881103, "grad/layer_4/attn": 0.0016717389225959778, "grad/layer_4/mlp": 0.003233600640669465, "grad/layer_4/attn_mlp_ratio": 0.5169898997022963, "grad/layer_8/attn": 0.0048487125895917416, "grad/layer_8/mlp": 0.004511911887675524, "grad/layer_8/attn_mlp_ratio": 1.0746469795590434, "grad/layer_12/attn": 0.005630418658256531, "grad/layer_12/mlp": 0.006309148855507374, "grad/layer_12/attn_mlp_ratio": 0.8924212596600106, "grad/layer_16/attn": 0.006426176521927118, "grad/layer_16/mlp": 0.004938803147524595, "grad/layer_16/attn_mlp_ratio": 1.3011606658248671, "grad/layer_20/attn": 0.009571406058967113, "grad/layer_20/mlp": 0.007246680557727814, "grad/layer_20/attn_mlp_ratio": 1.3207986540375873, "grad/layer_24/attn": 0.009420273825526237, "grad/layer_24/mlp": 0.009817753918468952, "grad/layer_24/attn_mlp_ratio": 0.959514142216745, "grad/layer_27/attn": 0.005679842084646225, "grad/layer_27/mlp": 0.010651052929461002, "grad/layer_27/attn_mlp_ratio": 0.5332657784104239} {"step": 6950, "timestamp": 1778202039.188312, "train/loss": 2.3910223245620728, "train/z_loss": 0.0018940614187158643, "train/perplexity": 10.92465678100746, "train/grad_norm": 0.134765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021131.6830359523, "perf/iters_per_sec": 0.9637506880931627, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037612748146057, "data/tokens_consumed": 14577303552, "data/tokens_consumed_B": 14.577303552, "train/loss_slope": -6.459405159685577e-05} {"step": 6960, "timestamp": 1778202049.5580223, "train/loss": 2.4223519802093505, "train/z_loss": 0.0018768760142847895, "train/perplexity": 11.272340482811634, "train/grad_norm": 0.1376953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023456.7181544614, "perf/iters_per_sec": 0.964859351231795, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0364204883575439, "data/tokens_consumed": 14598275072, "data/tokens_consumed_B": 14.598275072, "train/loss_slope": -6.020891004734621e-05} {"step": 6970, "timestamp": 1778202059.9222813, "train/loss": 2.455190920829773, "train/z_loss": 0.001867785840295255, "train/perplexity": 11.64865730791072, "train/grad_norm": 0.1826171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024427.5631892167, "perf/iters_per_sec": 0.9653222862192233, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035923457145691, "data/tokens_consumed": 14619246592, "data/tokens_consumed_B": 14.619246592, "train/loss_slope": -5.575145145740153e-05} {"step": 6975, "timestamp": 1778202065.685383, "eos/sharpness": 16.304945945739743, "eos/L0_probe": 2.23710298538208, "eos/L_plus": 2.3073935508728027, "eos/L_minus": 2.329861879348755, "eos/grad_norm": 0.11171641200780869, "eos/embed_grad_frac": 0.2300671488046646, "eos/time_s": 0.5863659381866455} {"step": 6975, "timestamp": 1778202067.0631163, "geo/rankme_last": 434.00299072265625, "geo/layer_0/stable_rank_q_proj": 16.503589630126953, "geo/layer_0/stable_rank_k_proj": 11.920279502868652, "geo/layer_0/stable_rank_o_proj": 57.39726638793945, "geo/layer_0/stable_rank_gate_proj": 173.01304626464844, "geo/layer_0/stable_rank_down_proj": 47.358848571777344, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.024881994351744652, "geo/layer_0/attn_entropy_mean": 6.477787971496582, "geo/layer_0/attn_entropy_std": 0.24388116598129272, "geo/layer_7/stable_rank_q_proj": 43.85105895996094, "geo/layer_7/stable_rank_k_proj": 44.333839416503906, "geo/layer_7/stable_rank_o_proj": 102.55669403076172, "geo/layer_7/stable_rank_gate_proj": 143.50857543945312, "geo/layer_7/stable_rank_down_proj": 187.33367919921875, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5901185870170593, "geo/layer_7/attn_entropy_mean": 4.74626350402832, "geo/layer_7/attn_entropy_std": 0.9892065525054932, "geo/layer_14/stable_rank_q_proj": 74.93592071533203, "geo/layer_14/stable_rank_k_proj": 54.60783386230469, "geo/layer_14/stable_rank_o_proj": 50.58269500732422, "geo/layer_14/stable_rank_gate_proj": 136.21658325195312, "geo/layer_14/stable_rank_down_proj": 144.9747772216797, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.40275493264198303, "geo/layer_14/attn_entropy_mean": 5.9117560386657715, "geo/layer_14/attn_entropy_std": 0.6010825634002686, "geo/layer_21/stable_rank_q_proj": 51.66877746582031, "geo/layer_21/stable_rank_k_proj": 32.6217155456543, "geo/layer_21/stable_rank_o_proj": 94.78400421142578, "geo/layer_21/stable_rank_gate_proj": 123.10265350341797, "geo/layer_21/stable_rank_down_proj": 87.06006622314453, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1629897952079773, "geo/layer_21/attn_entropy_mean": 5.834979057312012, "geo/layer_21/attn_entropy_std": 0.30165112018585205, "geo/layer_27/stable_rank_q_proj": 43.692527770996094, "geo/layer_27/stable_rank_k_proj": 35.15837097167969, "geo/layer_27/stable_rank_o_proj": 109.7790298461914, "geo/layer_27/stable_rank_gate_proj": 93.60552978515625, "geo/layer_27/stable_rank_down_proj": 166.534423828125, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0601000115275383, "geo/layer_27/attn_entropy_mean": 4.593571662902832, "geo/layer_27/attn_entropy_std": 0.48910942673683167, "attnres/final_alpha/block_0": 0.24621716141700745, "attnres/block_norm/0": 1.2629543542861938, "attnres/final_alpha/block_1": 0.012428265996277332, "attnres/block_norm/1": 11557.81640625, "attnres/final_alpha/block_2": 0.02607889473438263, "attnres/block_norm/2": 8415.474609375, "attnres/final_alpha/block_3": 0.024260234087705612, "attnres/block_norm/3": 8518.63671875, "attnres/final_alpha/block_4": 0.03781745210289955, "attnres/block_norm/4": 4026.0439453125, "attnres/final_alpha/block_5": 0.4502768814563751, "attnres/block_norm/5": 3422.159912109375, "attnres/final_alpha/block_6": 0.20292112231254578, "attnres/block_norm/6": 6500.93212890625, "geo/tier1_time_s": 1.3595566749572754, "geo/step": 6975.0, "geo/rankme_slope": 0.01386498259460034} {"step": 6980, "timestamp": 1778202072.2560008, "train/loss": 2.4053641319274903, "train/z_loss": 0.0018736059078946709, "train/perplexity": 11.08246502690375, "train/grad_norm": 0.1611328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1701090.7678989852, "perf/iters_per_sec": 0.8111432876105238, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2328278064727782, "data/tokens_consumed": 14640218112, "data/tokens_consumed_B": 14.640218112, "train/loss_slope": -5.3486281445603594e-05} {"step": 6990, "timestamp": 1778202082.627783, "train/loss": 2.413712430000305, "train/z_loss": 0.0018776927259750663, "train/perplexity": 11.1753720163621, "train/grad_norm": 0.2578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023813.9415441928, "perf/iters_per_sec": 0.9650296886177983, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0362375497817993, "data/tokens_consumed": 14661189632, "data/tokens_consumed_B": 14.661189632, "train/loss_slope": -5.3015842660926185e-05} {"step": 7000, "timestamp": 1778202092.9811108, "grad/layer_0/attn": 0.0029904055409133434, "grad/layer_0/mlp": 0.0033358472865074873, "grad/layer_0/attn_mlp_ratio": 0.8964455487408226, "grad/layer_4/attn": 0.0014749269466847181, "grad/layer_4/mlp": 0.003223277162760496, "grad/layer_4/attn_mlp_ratio": 0.45758612320602954, "grad/layer_8/attn": 0.009556573815643787, "grad/layer_8/mlp": 0.004733770154416561, "grad/layer_8/attn_mlp_ratio": 2.018808117425555, "grad/layer_12/attn": 0.0046080173924565315, "grad/layer_12/mlp": 0.006273042410612106, "grad/layer_12/attn_mlp_ratio": 0.7345745520871488, "grad/layer_16/attn": 0.005482570268213749, "grad/layer_16/mlp": 0.005469752475619316, "grad/layer_16/attn_mlp_ratio": 1.002343377038948, "grad/layer_20/attn": 0.005960866808891296, "grad/layer_20/mlp": 0.008203842677175999, "grad/layer_20/attn_mlp_ratio": 0.7265944717364757, "grad/layer_24/attn": 0.020793776959180832, "grad/layer_24/mlp": 0.013336565345525742, "grad/layer_24/attn_mlp_ratio": 1.5591553195696946, "grad/layer_27/attn": 0.007405532989650965, "grad/layer_27/mlp": 0.013912559486925602, "grad/layer_27/attn_mlp_ratio": 0.5322911965538212} {"step": 7000, "timestamp": 1778202092.9962304, "train/loss": 2.3778456687927245, "train/z_loss": 0.0018916301778517663, "train/perplexity": 10.781650580053407, "train/grad_norm": 0.248046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023676.7264011558, "perf/iters_per_sec": 0.96496425933893, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0363078117370605, "data/tokens_consumed": 14682161152, "data/tokens_consumed_B": 14.682161152, "train/loss_slope": -5.461650707326129e-05} {"step": 7000, "timestamp": 1778202100.2314107, "geo/ww_alpha_mean": 8.569737145969322, "geo/ww_alpha_std": 4.830628280358607, "geo/ww_alpha_min": 1.9588806024983918, "geo/ww_alpha_max": 28.308875013436055, "geo/ww_alpha_healthy_frac": 0.13705583756345177, "geo/ww_alpha_by_type/q_proj": 4.729211745627184, "geo/ww_alpha_by_type/k_proj": 5.578064723617919, "geo/ww_alpha_by_type/v_proj": 7.6885161114142875, "geo/ww_alpha_by_type/o_proj": 8.659795055402041, "geo/ww_alpha_by_type/gate_proj": 11.937940726104802, "geo/ww_alpha_by_type/up_proj": 12.304708621238477, "geo/ww_alpha_by_type/down_proj": 9.319005048448629, "geo/twonn_id/layer_0": 0.7434776425361633, "geo/twonn_id/layer_7": 2.7016890048980713, "geo/twonn_id/layer_14": 3.2328479290008545, "geo/twonn_id/layer_21": 6.953843593597412, "geo/twonn_id/layer_27": 5.66039514541626, "geo/tier2_time_s": 7.228171110153198} {"step": 7000, "timestamp": 1778202101.1872385, "eoc/jacobian_sigma/layer_0/attn": 489.62750244140625, "eoc/jacobian_sigma/layer_0/mlp": 1630.3709716796875, "eoc/jacobian_sigma/layer_0": 1630.3709716796875, "eoc/jacobian_sigma/layer_7/attn": 1.169508457183838, "eoc/jacobian_sigma/layer_7/mlp": 1.609704613685608, "eoc/jacobian_sigma/layer_7": 1.609704613685608, "eoc/jacobian_sigma/layer_14/attn": 1.266050100326538, "eoc/jacobian_sigma/layer_14/mlp": 4.125993728637695, "eoc/jacobian_sigma/layer_14": 4.125993728637695, "eoc/jacobian_sigma/layer_21/attn": 1.0745809078216553, "eoc/jacobian_sigma/layer_21/mlp": 2.7767884731292725, "eoc/jacobian_sigma/layer_21": 2.7767884731292725, "eoc/jacobian_sigma/layer_27/attn": 2.125865936279297, "eoc/jacobian_sigma/layer_27/mlp": 10.237960815429688, "eoc/jacobian_sigma/layer_27": 10.237960815429688, "eoc/layer0_sigma": 1630.3709716796875, "eoc/sigma_max": 10.237960815429688, "eoc/sigma_min": 1.609704613685608, "eoc/sigma_mean": 4.687611907720566, "eoc/time_s": 0.9468717575073242} {"step": 7010, "timestamp": 1778202111.57441, "train/loss": 2.412800359725952, "train/z_loss": 0.0018827883061021567, "train/perplexity": 11.165183938568859, "train/grad_norm": 0.1455078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1129191.059752318, "perf/iters_per_sec": 0.5384402560006705, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.8572162628173827, "data/tokens_consumed": 14703132672, "data/tokens_consumed_B": 14.703132672, "train/loss_slope": -5.6875794905998115e-05} {"step": 7020, "timestamp": 1778202121.936947, "train/loss": 2.409080219268799, "train/z_loss": 0.0018738126615062355, "train/perplexity": 11.123725050345119, "train/grad_norm": 0.234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024912.3795443103, "perf/iters_per_sec": 0.9655534646722366, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0356754302978515, "data/tokens_consumed": 14724104192, "data/tokens_consumed_B": 14.724104192, "train/loss_slope": -5.772704760519685e-05} {"step": 7030, "timestamp": 1778202132.3001533, "train/loss": 2.3679459571838377, "train/z_loss": 0.0018919784924946725, "train/perplexity": 10.675441933535705, "train/grad_norm": 0.1513671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024621.7787587335, "perf/iters_per_sec": 0.9654148954194706, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0358240842819213, "data/tokens_consumed": 14745075712, "data/tokens_consumed_B": 14.745075712, "train/loss_slope": -5.964689287665791e-05} {"step": 7040, "timestamp": 1778202142.6732798, "train/loss": 2.4789194583892824, "train/z_loss": 0.0018650355632416904, "train/perplexity": 11.928368353382925, "train/grad_norm": 0.201171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023170.3508229072, "perf/iters_per_sec": 0.9647228006472145, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0365671873092652, "data/tokens_consumed": 14766047232, "data/tokens_consumed_B": 14.766047232, "train/loss_slope": -5.6447409954008034e-05} {"step": 7050, "timestamp": 1778202153.0288475, "grad/layer_0/attn": 0.003911449573934078, "grad/layer_0/mlp": 0.003927482757717371, "grad/layer_0/attn_mlp_ratio": 0.9959176693154019, "grad/layer_4/attn": 0.0016997810453176498, "grad/layer_4/mlp": 0.00325380708090961, "grad/layer_4/attn_mlp_ratio": 0.5223975948207453, "grad/layer_8/attn": 0.007783540990203619, "grad/layer_8/mlp": 0.004634154960513115, "grad/layer_8/attn_mlp_ratio": 1.6796030535373994, "grad/layer_12/attn": 0.0049781822599470615, "grad/layer_12/mlp": 0.005810852162539959, "grad/layer_12/attn_mlp_ratio": 0.8567043240867164, "grad/layer_16/attn": 0.006072366610169411, "grad/layer_16/mlp": 0.0050266701728105545, "grad/layer_16/attn_mlp_ratio": 1.2080296260956416, "grad/layer_20/attn": 0.006184062454849482, "grad/layer_20/mlp": 0.008050834760069847, "grad/layer_20/attn_mlp_ratio": 0.7681268542124624, "grad/layer_24/attn": 0.014214912429451942, "grad/layer_24/mlp": 0.01061047799885273, "grad/layer_24/attn_mlp_ratio": 1.3397051760550682, "grad/layer_27/attn": 0.00610349141061306, "grad/layer_27/mlp": 0.009787756949663162, "grad/layer_27/attn_mlp_ratio": 0.6235842777506525} {"step": 7050, "timestamp": 1778202153.6664312, "eos/sharpness": 38.02602291107177, "eos/L0_probe": 2.2378034591674805, "eos/L_plus": 2.384193181991577, "eos/L_minus": 2.4716739654541016, "eos/grad_norm": 0.19873274862766266, "eos/embed_grad_frac": 0.06911254674196243, "eos/time_s": 0.6344308853149414} {"step": 7050, "timestamp": 1778202153.6872246, "train/loss": 2.4302602291107176, "train/z_loss": 0.0018796640215441585, "train/perplexity": 11.361838376424393, "train/grad_norm": 0.1982421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1904959.1005985355, "perf/iters_per_sec": 0.9083552840225866, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.100890827178955, "data/tokens_consumed": 14787018752, "data/tokens_consumed_B": 14.787018752, "train/loss_slope": -5.503560189831692e-05} {"step": 7050, "timestamp": 1778202155.0539575, "geo/rankme_last": 434.6551818847656, "geo/layer_0/stable_rank_q_proj": 16.364669799804688, "geo/layer_0/stable_rank_k_proj": 11.939850807189941, "geo/layer_0/stable_rank_o_proj": 57.10993957519531, "geo/layer_0/stable_rank_gate_proj": 171.5576171875, "geo/layer_0/stable_rank_down_proj": 47.40293884277344, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.027632076293230057, "geo/layer_0/attn_entropy_mean": 6.466723918914795, "geo/layer_0/attn_entropy_std": 0.24726173281669617, "geo/layer_7/stable_rank_q_proj": 43.8479118347168, "geo/layer_7/stable_rank_k_proj": 44.48561477661133, "geo/layer_7/stable_rank_o_proj": 102.67605590820312, "geo/layer_7/stable_rank_gate_proj": 143.042724609375, "geo/layer_7/stable_rank_down_proj": 186.63023376464844, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5948134064674377, "geo/layer_7/attn_entropy_mean": 4.733607292175293, "geo/layer_7/attn_entropy_std": 0.9775070548057556, "geo/layer_14/stable_rank_q_proj": 74.94158935546875, "geo/layer_14/stable_rank_k_proj": 54.12472152709961, "geo/layer_14/stable_rank_o_proj": 50.244693756103516, "geo/layer_14/stable_rank_gate_proj": 134.86749267578125, "geo/layer_14/stable_rank_down_proj": 145.2205352783203, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39417216181755066, "geo/layer_14/attn_entropy_mean": 5.870980739593506, "geo/layer_14/attn_entropy_std": 0.6002644300460815, "geo/layer_21/stable_rank_q_proj": 51.60482406616211, "geo/layer_21/stable_rank_k_proj": 32.713050842285156, "geo/layer_21/stable_rank_o_proj": 94.78790283203125, "geo/layer_21/stable_rank_gate_proj": 122.11861419677734, "geo/layer_21/stable_rank_down_proj": 86.5319595336914, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.165300190448761, "geo/layer_21/attn_entropy_mean": 5.843375205993652, "geo/layer_21/attn_entropy_std": 0.3079102635383606, "geo/layer_27/stable_rank_q_proj": 43.527645111083984, "geo/layer_27/stable_rank_k_proj": 35.03350830078125, "geo/layer_27/stable_rank_o_proj": 109.64697265625, "geo/layer_27/stable_rank_gate_proj": 93.42411804199219, "geo/layer_27/stable_rank_down_proj": 166.73980712890625, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.06006469577550888, "geo/layer_27/attn_entropy_mean": 4.5675129890441895, "geo/layer_27/attn_entropy_std": 0.49157440662384033, "attnres/final_alpha/block_0": 0.24788838624954224, "attnres/block_norm/0": 1.2676373720169067, "attnres/final_alpha/block_1": 0.012516507878899574, "attnres/block_norm/1": 11702.0703125, "attnres/final_alpha/block_2": 0.02600220963358879, "attnres/block_norm/2": 8566.486328125, "attnres/final_alpha/block_3": 0.024455858394503593, "attnres/block_norm/3": 8588.76953125, "attnres/final_alpha/block_4": 0.037011899054050446, "attnres/block_norm/4": 4039.30078125, "attnres/final_alpha/block_5": 0.447948694229126, "attnres/block_norm/5": 3436.05615234375, "attnres/final_alpha/block_6": 0.20417645573616028, "attnres/block_norm/6": 6579.89453125, "geo/tier1_time_s": 1.3625309467315674, "geo/step": 7050.0, "geo/rankme_slope": 0.013512205565820078} {"step": 7060, "timestamp": 1778202165.4114876, "train/loss": 2.4439244270324707, "train/z_loss": 0.001871384517289698, "train/perplexity": 11.518154318598723, "train/grad_norm": 0.11279296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1789367.878551459, "perf/iters_per_sec": 0.8532370941884322, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.172007179260254, "data/tokens_consumed": 14807990272, "data/tokens_consumed_B": 14.807990272, "train/loss_slope": -5.6744748552461414e-05} {"step": 7070, "timestamp": 1778202175.7702508, "train/loss": 2.431158971786499, "train/z_loss": 0.0018788670538924634, "train/perplexity": 11.372054335520174, "train/grad_norm": 0.208984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025749.6921180908, "perf/iters_per_sec": 0.9659527264204458, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352473497390746, "data/tokens_consumed": 14828961792, "data/tokens_consumed_B": 14.828961792, "train/loss_slope": -5.4685811884868866e-05} {"step": 7080, "timestamp": 1778202186.1247673, "train/loss": 2.402864909172058, "train/z_loss": 0.0018860898562707007, "train/perplexity": 11.054802060478652, "train/grad_norm": 0.1630859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026319.768401195, "perf/iters_per_sec": 0.9662245599752403, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349560976028442, "data/tokens_consumed": 14849933312, "data/tokens_consumed_B": 14.849933312, "train/loss_slope": -5.417836618752509e-05} {"step": 7090, "timestamp": 1778202196.4808044, "train/loss": 2.426634359359741, "train/z_loss": 0.0018867641338147223, "train/perplexity": 11.320716426808715, "train/grad_norm": 0.296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025986.998259526, "perf/iters_per_sec": 0.9660658828065519, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351260900497437, "data/tokens_consumed": 14870904832, "data/tokens_consumed_B": 14.870904832, "train/loss_slope": -5.296553415183437e-05} {"step": 7100, "timestamp": 1778202206.8266199, "grad/layer_0/attn": 0.0028765518218278885, "grad/layer_0/mlp": 0.0033111211378127337, "grad/layer_0/attn_mlp_ratio": 0.8687546046269469, "grad/layer_4/attn": 0.0018610991537570953, "grad/layer_4/mlp": 0.0032422582153230906, "grad/layer_4/attn_mlp_ratio": 0.5740132255845991, "grad/layer_8/attn": 0.009167999029159546, "grad/layer_8/mlp": 0.004596992861479521, "grad/layer_8/attn_mlp_ratio": 1.994346979858953, "grad/layer_12/attn": 0.004811422433704138, "grad/layer_12/mlp": 0.006245230324566364, "grad/layer_12/attn_mlp_ratio": 0.7704155181813356, "grad/layer_16/attn": 0.0054618059657514095, "grad/layer_16/mlp": 0.0048292214050889015, "grad/layer_16/attn_mlp_ratio": 1.130990980636508, "grad/layer_20/attn": 0.007170937489718199, "grad/layer_20/mlp": 0.008208099752664566, "grad/layer_20/attn_mlp_ratio": 0.873641600179891, "grad/layer_24/attn": 0.010212577879428864, "grad/layer_24/mlp": 0.011662334203720093, "grad/layer_24/attn_mlp_ratio": 0.8756890013152187, "grad/layer_27/attn": 0.008288154378533363, "grad/layer_27/mlp": 0.01085344236344099, "grad/layer_27/attn_mlp_ratio": 0.7636429092844406} {"step": 7100, "timestamp": 1778202206.8428597, "train/loss": 2.381036114692688, "train/z_loss": 0.0018939069123007357, "train/perplexity": 10.81610378425866, "train/grad_norm": 0.16015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024824.095199795, "perf/iters_per_sec": 0.9655113674162841, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0357205867767334, "data/tokens_consumed": 14891876352, "data/tokens_consumed_B": 14.891876352, "train/loss_slope": -5.726276868962682e-05} {"step": 7110, "timestamp": 1778202217.1952038, "train/loss": 2.386431908607483, "train/z_loss": 0.0018813597504049539, "train/perplexity": 10.874622988041228, "train/grad_norm": 0.19140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026769.7652854463, "perf/iters_per_sec": 0.9664391352107269, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347263097763062, "data/tokens_consumed": 14912847872, "data/tokens_consumed_B": 14.912847872, "train/loss_slope": -5.6680640479018284e-05} {"step": 7120, "timestamp": 1778202227.5550423, "train/loss": 2.3450774192810058, "train/z_loss": 0.0018972798250615596, "train/perplexity": 10.434080495290015, "train/grad_norm": 0.17578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025771.7126836635, "perf/iters_per_sec": 0.9659632266443555, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352360963821412, "data/tokens_consumed": 14933819392, "data/tokens_consumed_B": 14.933819392, "train/loss_slope": -6.0158302159485355e-05} {"step": 7125, "timestamp": 1778202233.3378768, "eos/sharpness": 49.17459487915038, "eos/L0_probe": 2.2313764095306396, "eos/L_plus": 2.3759617805480957, "eos/L_minus": 2.5785369873046875, "eos/grad_norm": 0.17197856307029724, "eos/embed_grad_frac": 0.09558156132698059, "eos/time_s": 0.6121664047241211} {"step": 7125, "timestamp": 1778202234.7203898, "geo/rankme_last": 434.7828674316406, "geo/layer_0/stable_rank_q_proj": 16.26134490966797, "geo/layer_0/stable_rank_k_proj": 11.908807754516602, "geo/layer_0/stable_rank_o_proj": 57.099273681640625, "geo/layer_0/stable_rank_gate_proj": 169.94480895996094, "geo/layer_0/stable_rank_down_proj": 47.50648880004883, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.03280659765005112, "geo/layer_0/attn_entropy_mean": 6.465743064880371, "geo/layer_0/attn_entropy_std": 0.25036782026290894, "geo/layer_7/stable_rank_q_proj": 43.85581588745117, "geo/layer_7/stable_rank_k_proj": 44.408958435058594, "geo/layer_7/stable_rank_o_proj": 102.96884155273438, "geo/layer_7/stable_rank_gate_proj": 142.4807586669922, "geo/layer_7/stable_rank_down_proj": 185.92050170898438, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5831961035728455, "geo/layer_7/attn_entropy_mean": 4.759796142578125, "geo/layer_7/attn_entropy_std": 0.9770799875259399, "geo/layer_14/stable_rank_q_proj": 74.99132537841797, "geo/layer_14/stable_rank_k_proj": 53.39324188232422, "geo/layer_14/stable_rank_o_proj": 50.243446350097656, "geo/layer_14/stable_rank_gate_proj": 133.81239318847656, "geo/layer_14/stable_rank_down_proj": 145.11256408691406, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4007038474082947, "geo/layer_14/attn_entropy_mean": 5.86834716796875, "geo/layer_14/attn_entropy_std": 0.6284235119819641, "geo/layer_21/stable_rank_q_proj": 51.827667236328125, "geo/layer_21/stable_rank_k_proj": 32.8444938659668, "geo/layer_21/stable_rank_o_proj": 94.50775146484375, "geo/layer_21/stable_rank_gate_proj": 121.74593353271484, "geo/layer_21/stable_rank_down_proj": 86.04882049560547, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.16295167803764343, "geo/layer_21/attn_entropy_mean": 5.817193031311035, "geo/layer_21/attn_entropy_std": 0.2995738387107849, "geo/layer_27/stable_rank_q_proj": 43.408748626708984, "geo/layer_27/stable_rank_k_proj": 35.10963439941406, "geo/layer_27/stable_rank_o_proj": 109.77487182617188, "geo/layer_27/stable_rank_gate_proj": 93.4544448852539, "geo/layer_27/stable_rank_down_proj": 166.4727783203125, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.061485495418310165, "geo/layer_27/attn_entropy_mean": 4.606857776641846, "geo/layer_27/attn_entropy_std": 0.5024070739746094, "attnres/final_alpha/block_0": 0.2479744851589203, "attnres/block_norm/0": 1.272188663482666, "attnres/final_alpha/block_1": 0.01242641918361187, "attnres/block_norm/1": 11878.87890625, "attnres/final_alpha/block_2": 0.025621412321925163, "attnres/block_norm/2": 8696.990234375, "attnres/final_alpha/block_3": 0.023946234956383705, "attnres/block_norm/3": 8756.859375, "attnres/final_alpha/block_4": 0.037025175988674164, "attnres/block_norm/4": 4085.349365234375, "attnres/final_alpha/block_5": 0.450376033782959, "attnres/block_norm/5": 3460.222900390625, "attnres/final_alpha/block_6": 0.20263025164604187, "attnres/block_norm/6": 6740.71337890625, "geo/tier1_time_s": 1.3628566265106201, "geo/step": 7125.0, "geo/rankme_slope": 0.013122339463129002} {"step": 7130, "timestamp": 1778202239.902294, "train/loss": 2.442850399017334, "train/z_loss": 0.0018708035349845886, "train/perplexity": 11.505790139104004, "train/grad_norm": 0.201171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1699351.5045347728, "perf/iters_per_sec": 0.810313942210566, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2340895891189576, "data/tokens_consumed": 14954790912, "data/tokens_consumed_B": 14.954790912, "train/loss_slope": -5.7470858379630684e-05} {"step": 7140, "timestamp": 1778202250.274571, "train/loss": 2.4667335033416746, "train/z_loss": 0.0018575012451037764, "train/perplexity": 11.783891872649054, "train/grad_norm": 0.177734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023040.5278874775, "perf/iters_per_sec": 0.9646608962476146, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0366337060928346, "data/tokens_consumed": 14975762432, "data/tokens_consumed_B": 14.975762432, "train/loss_slope": -5.355086329460432e-05} {"step": 7150, "timestamp": 1778202260.6315372, "grad/layer_0/attn": 0.002849772572517395, "grad/layer_0/mlp": 0.003308553248643875, "grad/layer_0/attn_mlp_ratio": 0.8613349316810843, "grad/layer_4/attn": 0.0015471694059669971, "grad/layer_4/mlp": 0.0031000080052763224, "grad/layer_4/attn_mlp_ratio": 0.49908560023880605, "grad/layer_8/attn": 0.006549332290887833, "grad/layer_8/mlp": 0.004333635792136192, "grad/layer_8/attn_mlp_ratio": 1.51127885542305, "grad/layer_12/attn": 0.006514651235193014, "grad/layer_12/mlp": 0.005855434574186802, "grad/layer_12/attn_mlp_ratio": 1.1125820024792206, "grad/layer_16/attn": 0.005353071261197329, "grad/layer_16/mlp": 0.004558959975838661, "grad/layer_16/attn_mlp_ratio": 1.1741869137146554, "grad/layer_20/attn": 0.005298054777085781, "grad/layer_20/mlp": 0.0075254845432937145, "grad/layer_20/attn_mlp_ratio": 0.7040150938062318, "grad/layer_24/attn": 0.014134018681943417, "grad/layer_24/mlp": 0.012115120887756348, "grad/layer_24/attn_mlp_ratio": 1.1666428008624417, "grad/layer_27/attn": 0.006996569223701954, "grad/layer_27/mlp": 0.011184671893715858, "grad/layer_27/attn_mlp_ratio": 0.6255497906092371} {"step": 7150, "timestamp": 1778202260.6465065, "train/loss": 2.4507893323898315, "train/z_loss": 0.00187162779038772, "train/perplexity": 11.597497387617839, "train/grad_norm": 0.1533203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023011.773663255, "perf/iters_per_sec": 0.96464718516505, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036648440361023, "data/tokens_consumed": 14996733952, "data/tokens_consumed_B": 14.996733952, "train/loss_slope": -5.11800511334703e-05} {"step": 7160, "timestamp": 1778202271.0290082, "train/loss": 2.346441650390625, "train/z_loss": 0.0019005454261787236, "train/perplexity": 10.448324706490796, "train/grad_norm": 0.1494140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021306.0367238193, "perf/iters_per_sec": 0.9638338264102074, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0375232458114625, "data/tokens_consumed": 15017705472, "data/tokens_consumed_B": 15.017705472, "train/loss_slope": -5.395473708557078e-05} {"step": 7170, "timestamp": 1778202281.3820748, "train/loss": 2.4403389930725097, "train/z_loss": 0.0018705019261687993, "train/perplexity": 11.476930683421473, "train/grad_norm": 0.1455078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026898.1523502483, "perf/iters_per_sec": 0.9665003549338571, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034660768508911, "data/tokens_consumed": 15038676992, "data/tokens_consumed_B": 15.038676992, "train/loss_slope": -4.863730452635112e-05} {"step": 7180, "timestamp": 1778202291.7376366, "train/loss": 2.3804428815841674, "train/z_loss": 0.0018783458741381764, "train/perplexity": 10.809689216243823, "train/grad_norm": 0.271484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026215.8651733918, "perf/iters_per_sec": 0.9661750150553664, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350091695785522, "data/tokens_consumed": 15059648512, "data/tokens_consumed_B": 15.059648512, "train/loss_slope": -4.93225704396841e-05} {"step": 7190, "timestamp": 1778202302.097439, "train/loss": 2.331599712371826, "train/z_loss": 0.001904125150758773, "train/perplexity": 10.29439644132182, "train/grad_norm": 0.1669921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025594.3021178336, "perf/iters_per_sec": 0.96587863069431, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0353267669677735, "data/tokens_consumed": 15080620032, "data/tokens_consumed_B": 15.080620032, "train/loss_slope": -5.582154051090845e-05} {"step": 7200, "timestamp": 1778202312.4366724, "grad/layer_0/attn": 0.002928389934822917, "grad/layer_0/mlp": 0.003611172316595912, "grad/layer_0/attn_mlp_ratio": 0.8109249841865421, "grad/layer_4/attn": 0.0017156255198642612, "grad/layer_4/mlp": 0.0031943656504154205, "grad/layer_4/attn_mlp_ratio": 0.5370786108763994, "grad/layer_8/attn": 0.010458560660481453, "grad/layer_8/mlp": 0.005011547822505236, "grad/layer_8/attn_mlp_ratio": 2.086892278035585, "grad/layer_12/attn": 0.004462745971977711, "grad/layer_12/mlp": 0.006252163555473089, "grad/layer_12/attn_mlp_ratio": 0.7137922514345993, "grad/layer_16/attn": 0.007340710610151291, "grad/layer_16/mlp": 0.005089291371405125, "grad/layer_16/attn_mlp_ratio": 1.4423836110382104, "grad/layer_20/attn": 0.009500278159976006, "grad/layer_20/mlp": 0.00796047504991293, "grad/layer_20/attn_mlp_ratio": 1.1934310428793837, "grad/layer_24/attn": 0.014232179149985313, "grad/layer_24/mlp": 0.01312513742595911, "grad/layer_24/attn_mlp_ratio": 1.0843451447146117, "grad/layer_27/attn": 0.0063142310827970505, "grad/layer_27/mlp": 0.01351257599890232, "grad/layer_27/attn_mlp_ratio": 0.4672855151069218} {"step": 7200, "timestamp": 1778202313.0372784, "eos/sharpness": 26.383996009826657, "eos/L0_probe": 2.232781171798706, "eos/L_plus": 2.395381450653076, "eos/L_minus": 2.3340208530426025, "eos/grad_norm": 0.19146908819675446, "eos/embed_grad_frac": 0.11789258569478989, "eos/time_s": 0.5978569984436035} {"step": 7200, "timestamp": 1778202313.0588713, "train/loss": 2.3926565170288088, "train/z_loss": 0.0018820397206582129, "train/perplexity": 10.942524368382594, "train/grad_norm": 0.19140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1914100.1212637932, "perf/iters_per_sec": 0.9127140623396841, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0956333875656128, "data/tokens_consumed": 15101591552, "data/tokens_consumed_B": 15.101591552, "train/loss_slope": -5.564695011199582e-05} {"step": 7200, "timestamp": 1778202314.422809, "geo/rankme_last": 433.8891906738281, "geo/layer_0/stable_rank_q_proj": 16.189590454101562, "geo/layer_0/stable_rank_k_proj": 11.859807014465332, "geo/layer_0/stable_rank_o_proj": 57.32847595214844, "geo/layer_0/stable_rank_gate_proj": 169.72177124023438, "geo/layer_0/stable_rank_down_proj": 47.57345962524414, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.03287447988986969, "geo/layer_0/attn_entropy_mean": 6.470911026000977, "geo/layer_0/attn_entropy_std": 0.24208739399909973, "geo/layer_7/stable_rank_q_proj": 43.78570556640625, "geo/layer_7/stable_rank_k_proj": 44.4294548034668, "geo/layer_7/stable_rank_o_proj": 103.15205383300781, "geo/layer_7/stable_rank_gate_proj": 141.781005859375, "geo/layer_7/stable_rank_down_proj": 184.7528076171875, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5868412852287292, "geo/layer_7/attn_entropy_mean": 4.730857849121094, "geo/layer_7/attn_entropy_std": 0.9810232520103455, "geo/layer_14/stable_rank_q_proj": 74.68669128417969, "geo/layer_14/stable_rank_k_proj": 53.126277923583984, "geo/layer_14/stable_rank_o_proj": 49.83085632324219, "geo/layer_14/stable_rank_gate_proj": 132.7838897705078, "geo/layer_14/stable_rank_down_proj": 144.2527618408203, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39997556805610657, "geo/layer_14/attn_entropy_mean": 5.8650970458984375, "geo/layer_14/attn_entropy_std": 0.6138696074485779, "geo/layer_21/stable_rank_q_proj": 51.714691162109375, "geo/layer_21/stable_rank_k_proj": 32.68947982788086, "geo/layer_21/stable_rank_o_proj": 94.22438049316406, "geo/layer_21/stable_rank_gate_proj": 121.04713439941406, "geo/layer_21/stable_rank_down_proj": 85.61064147949219, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.16221682727336884, "geo/layer_21/attn_entropy_mean": 5.840107440948486, "geo/layer_21/attn_entropy_std": 0.30531013011932373, "geo/layer_27/stable_rank_q_proj": 43.24933624267578, "geo/layer_27/stable_rank_k_proj": 35.06147384643555, "geo/layer_27/stable_rank_o_proj": 109.93791198730469, "geo/layer_27/stable_rank_gate_proj": 93.41303253173828, "geo/layer_27/stable_rank_down_proj": 165.27511596679688, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.05912064015865326, "geo/layer_27/attn_entropy_mean": 4.583824157714844, "geo/layer_27/attn_entropy_std": 0.5299577713012695, "attnres/final_alpha/block_0": 0.24651242792606354, "attnres/block_norm/0": 1.2767772674560547, "attnres/final_alpha/block_1": 0.012275326065719128, "attnres/block_norm/1": 12013.21875, "attnres/final_alpha/block_2": 0.02520255371928215, "attnres/block_norm/2": 8820.296875, "attnres/final_alpha/block_3": 0.024107038974761963, "attnres/block_norm/3": 8951.8701171875, "attnres/final_alpha/block_4": 0.035823240876197815, "attnres/block_norm/4": 4121.0341796875, "attnres/final_alpha/block_5": 0.4613342881202698, "attnres/block_norm/5": 3445.14794921875, "attnres/final_alpha/block_6": 0.19474509358406067, "attnres/block_norm/6": 6866.5244140625, "geo/tier1_time_s": 1.3594970703125, "geo/step": 7200.0, "geo/rankme_slope": 0.012706393201811975} {"step": 7210, "timestamp": 1778202325.1508565, "train/loss": 2.43894100189209, "train/z_loss": 0.0018663449911400676, "train/perplexity": 11.460897245461474, "train/grad_norm": 0.1728515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1734951.046082562, "perf/iters_per_sec": 0.8272891264355479, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2087672472000122, "data/tokens_consumed": 15122563072, "data/tokens_consumed_B": 15.122563072, "train/loss_slope": -5.154814064914035e-05} {"step": 7220, "timestamp": 1778202335.5075185, "train/loss": 2.3667235374450684, "train/z_loss": 0.0018869857303798199, "train/perplexity": 10.662400035556878, "train/grad_norm": 0.2412109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026636.0713462357, "perf/iters_per_sec": 0.9663753849726847, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034794569015503, "data/tokens_consumed": 15143534592, "data/tokens_consumed_B": 15.143534592, "train/loss_slope": -5.3367427587866866e-05} {"step": 7230, "timestamp": 1778202345.8552804, "train/loss": 2.434353733062744, "train/z_loss": 0.0018655340769328176, "train/perplexity": 11.408443430628175, "train/grad_norm": 0.1689453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027795.1670535367, "perf/iters_per_sec": 0.9669280848758396, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342030763626098, "data/tokens_consumed": 15164506112, "data/tokens_consumed_B": 15.164506112, "train/loss_slope": -4.962464566349524e-05} {"step": 7240, "timestamp": 1778202356.208944, "train/loss": 2.4066619873046875, "train/z_loss": 0.0018822043319232761, "train/perplexity": 11.09685780158122, "train/grad_norm": 0.2255859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026937.432980942, "perf/iters_per_sec": 0.9665190853981696, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346407175064087, "data/tokens_consumed": 15185477632, "data/tokens_consumed_B": 15.185477632, "train/loss_slope": -4.975044006990886e-05} {"step": 7250, "timestamp": 1778202366.546992, "grad/layer_0/attn": 0.002582400105893612, "grad/layer_0/mlp": 0.0029515386559069157, "grad/layer_0/attn_mlp_ratio": 0.8749334904464505, "grad/layer_4/attn": 0.0016880882903933525, "grad/layer_4/mlp": 0.003214363008737564, "grad/layer_4/attn_mlp_ratio": 0.52517037848171, "grad/layer_8/attn": 0.00647442601621151, "grad/layer_8/mlp": 0.004664368461817503, "grad/layer_8/attn_mlp_ratio": 1.3880605553367134, "grad/layer_12/attn": 0.004339589271694422, "grad/layer_12/mlp": 0.006011538673192263, "grad/layer_12/attn_mlp_ratio": 0.7218766168566193, "grad/layer_16/attn": 0.004816576838493347, "grad/layer_16/mlp": 0.004694052040576935, "grad/layer_16/attn_mlp_ratio": 1.026102116944392, "grad/layer_20/attn": 0.012212935835123062, "grad/layer_20/mlp": 0.00734794232994318, "grad/layer_20/attn_mlp_ratio": 1.6620892109000236, "grad/layer_24/attn": 0.016124187037348747, "grad/layer_24/mlp": 0.014233794994652271, "grad/layer_24/attn_mlp_ratio": 1.1328101135449609, "grad/layer_27/attn": 0.013246702961623669, "grad/layer_27/mlp": 0.012340632267296314, "grad/layer_27/attn_mlp_ratio": 1.0734217313472942} {"step": 7250, "timestamp": 1778202366.5626667, "train/loss": 2.414778709411621, "train/z_loss": 0.0018763213069178165, "train/perplexity": 11.187294440645116, "train/grad_norm": 0.18359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026415.699304838, "perf/iters_per_sec": 0.9662703033947172, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349071025848389, "data/tokens_consumed": 15206449152, "data/tokens_consumed_B": 15.206449152, "train/loss_slope": -5.099885681889425e-05} {"step": 7260, "timestamp": 1778202376.9137003, "train/loss": 2.445366144180298, "train/z_loss": 0.001862247788812965, "train/perplexity": 11.534772215466178, "train/grad_norm": 0.119140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027051.7337971814, "perf/iters_per_sec": 0.9665735882745654, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345823764801025, "data/tokens_consumed": 15227420672, "data/tokens_consumed_B": 15.227420672, "train/loss_slope": -4.775565940268656e-05} {"step": 7270, "timestamp": 1778202387.2865868, "train/loss": 2.4556458950042725, "train/z_loss": 0.0018626655684784055, "train/perplexity": 11.653958351981048, "train/grad_norm": 0.142578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023159.8340796474, "perf/iters_per_sec": 0.964717785873245, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0365725755691528, "data/tokens_consumed": 15248392192, "data/tokens_consumed_B": 15.248392192, "train/loss_slope": -4.730308631716144e-05} {"step": 7275, "timestamp": 1778202393.0781307, "eos/sharpness": 45.99442481994628, "eos/L0_probe": 2.229947090148926, "eos/L_plus": 2.565993070602417, "eos/L_minus": 2.3538453578948975, "eos/grad_norm": 0.24418972432613373, "eos/embed_grad_frac": 0.08470481634140015, "eos/time_s": 0.6119074821472168} {"step": 7275, "timestamp": 1778202394.4634552, "geo/rankme_last": 434.7120056152344, "geo/layer_0/stable_rank_q_proj": 16.159343719482422, "geo/layer_0/stable_rank_k_proj": 11.893476486206055, "geo/layer_0/stable_rank_o_proj": 57.10182571411133, "geo/layer_0/stable_rank_gate_proj": 168.78253173828125, "geo/layer_0/stable_rank_down_proj": 47.64216232299805, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.03010142594575882, "geo/layer_0/attn_entropy_mean": 6.466554164886475, "geo/layer_0/attn_entropy_std": 0.2496982365846634, "geo/layer_7/stable_rank_q_proj": 43.76668930053711, "geo/layer_7/stable_rank_k_proj": 44.24696731567383, "geo/layer_7/stable_rank_o_proj": 102.76519012451172, "geo/layer_7/stable_rank_gate_proj": 141.15835571289062, "geo/layer_7/stable_rank_down_proj": 184.12020874023438, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5869395136833191, "geo/layer_7/attn_entropy_mean": 4.750767707824707, "geo/layer_7/attn_entropy_std": 0.9740684628486633, "geo/layer_14/stable_rank_q_proj": 74.31338500976562, "geo/layer_14/stable_rank_k_proj": 53.20009994506836, "geo/layer_14/stable_rank_o_proj": 49.449703216552734, "geo/layer_14/stable_rank_gate_proj": 131.87330627441406, "geo/layer_14/stable_rank_down_proj": 143.82431030273438, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39776596426963806, "geo/layer_14/attn_entropy_mean": 5.865835666656494, "geo/layer_14/attn_entropy_std": 0.6073830723762512, "geo/layer_21/stable_rank_q_proj": 51.788978576660156, "geo/layer_21/stable_rank_k_proj": 32.69602584838867, "geo/layer_21/stable_rank_o_proj": 94.05890655517578, "geo/layer_21/stable_rank_gate_proj": 120.47647094726562, "geo/layer_21/stable_rank_down_proj": 85.20746612548828, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.16541117429733276, "geo/layer_21/attn_entropy_mean": 5.8097686767578125, "geo/layer_21/attn_entropy_std": 0.3062339127063751, "geo/layer_27/stable_rank_q_proj": 43.174190521240234, "geo/layer_27/stable_rank_k_proj": 35.14945983886719, "geo/layer_27/stable_rank_o_proj": 109.75235748291016, "geo/layer_27/stable_rank_gate_proj": 93.61324310302734, "geo/layer_27/stable_rank_down_proj": 164.8817901611328, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.05865463986992836, "geo/layer_27/attn_entropy_mean": 4.56320858001709, "geo/layer_27/attn_entropy_std": 0.5025887489318848, "attnres/final_alpha/block_0": 0.24460723996162415, "attnres/block_norm/0": 1.2811217308044434, "attnres/final_alpha/block_1": 0.011791334487497807, "attnres/block_norm/1": 12259.6044921875, "attnres/final_alpha/block_2": 0.024687770754098892, "attnres/block_norm/2": 8965.4580078125, "attnres/final_alpha/block_3": 0.02364572137594223, "attnres/block_norm/3": 9072.19921875, "attnres/final_alpha/block_4": 0.0356701985001564, "attnres/block_norm/4": 4159.95361328125, "attnres/final_alpha/block_5": 0.4668046832084656, "attnres/block_norm/5": 3441.7431640625, "attnres/final_alpha/block_6": 0.19279301166534424, "attnres/block_norm/6": 6904.6806640625, "geo/tier1_time_s": 1.3658647537231445, "geo/step": 7275.0, "geo/rankme_slope": 0.01229053730867347} {"step": 7280, "timestamp": 1778202399.656694, "train/loss": 2.392666459083557, "train/z_loss": 0.0018767059198580682, "train/perplexity": 10.942633160099756, "train/grad_norm": 0.23046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1696292.431514412, "perf/iters_per_sec": 0.8088552625248012, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.236315131187439, "data/tokens_consumed": 15269363712, "data/tokens_consumed_B": 15.269363712, "train/loss_slope": -4.818545616749641e-05} {"step": 7290, "timestamp": 1778202410.0276825, "train/loss": 2.4235814571380616, "train/z_loss": 0.0018737731035798789, "train/perplexity": 11.286208088571625, "train/grad_norm": 0.1572265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023423.995665741, "perf/iters_per_sec": 0.9648437479332642, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0364372491836549, "data/tokens_consumed": 15290335232, "data/tokens_consumed_B": 15.290335232, "train/loss_slope": -4.5662369872584564e-05} {"step": 7300, "timestamp": 1778202420.389888, "grad/layer_0/attn": 0.0026109160389751196, "grad/layer_0/mlp": 0.0031544591765850782, "grad/layer_0/attn_mlp_ratio": 0.827690520006211, "grad/layer_4/attn": 0.0017250668024644256, "grad/layer_4/mlp": 0.003198793390765786, "grad/layer_4/attn_mlp_ratio": 0.5392867052669434, "grad/layer_8/attn": 0.005624772049486637, "grad/layer_8/mlp": 0.004583221860229969, "grad/layer_8/attn_mlp_ratio": 1.227252814350804, "grad/layer_12/attn": 0.0049231527373194695, "grad/layer_12/mlp": 0.0058457073755562305, "grad/layer_12/attn_mlp_ratio": 0.8421825344332718, "grad/layer_16/attn": 0.005442862398922443, "grad/layer_16/mlp": 0.004977285396307707, "grad/layer_16/attn_mlp_ratio": 1.0935403249341664, "grad/layer_20/attn": 0.0053962054662406445, "grad/layer_20/mlp": 0.007246495224535465, "grad/layer_20/attn_mlp_ratio": 0.7446641755180553, "grad/layer_24/attn": 0.011103770695626736, "grad/layer_24/mlp": 0.011395531706511974, "grad/layer_24/attn_mlp_ratio": 0.9743968850388789, "grad/layer_27/attn": 0.011382265016436577, "grad/layer_27/mlp": 0.010477133095264435, "grad/layer_27/attn_mlp_ratio": 1.0863911724994824} {"step": 7300, "timestamp": 1778202420.4052804, "train/loss": 2.4493601083755494, "train/z_loss": 0.0018513705814257263, "train/perplexity": 11.580933805200269, "train/grad_norm": 0.1396484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022098.8102083553, "perf/iters_per_sec": 0.9642118502656724, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0371164798736572, "data/tokens_consumed": 15311306752, "data/tokens_consumed_B": 15.311306752, "train/loss_slope": -3.954622295573069e-05} {"step": 7310, "timestamp": 1778202430.78878, "train/loss": 2.393518829345703, "train/z_loss": 0.0018771056900732219, "train/perplexity": 10.951964311427998, "train/grad_norm": 0.1328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020655.0809514658, "perf/iters_per_sec": 0.9635234265096024, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0378574848175048, "data/tokens_consumed": 15332278272, "data/tokens_consumed_B": 15.332278272, "train/loss_slope": -3.9088083250616384e-05} {"step": 7320, "timestamp": 1778202441.1574135, "train/loss": 2.4118438243865965, "train/z_loss": 0.0018658222164958716, "train/perplexity": 11.154509151780532, "train/grad_norm": 0.2294921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023555.8230154586, "perf/iters_per_sec": 0.9649066081120771, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0363697290420533, "data/tokens_consumed": 15353249792, "data/tokens_consumed_B": 15.353249792, "train/loss_slope": -4.023924772352425e-05} {"step": 7330, "timestamp": 1778202451.5137289, "train/loss": 2.414200186729431, "train/z_loss": 0.0018796649761497974, "train/perplexity": 11.180824208827257, "train/grad_norm": 0.142578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026208.863991385, "perf/iters_per_sec": 0.9661716766316343, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350127458572387, "data/tokens_consumed": 15374221312, "data/tokens_consumed_B": 15.374221312, "train/loss_slope": -3.7085881334791864e-05} {"step": 7340, "timestamp": 1778202461.8718126, "train/loss": 2.420901894569397, "train/z_loss": 0.001873349107336253, "train/perplexity": 11.25600646947013, "train/grad_norm": 0.201171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026069.7372020015, "perf/iters_per_sec": 0.9661053358087547, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035083818435669, "data/tokens_consumed": 15395192832, "data/tokens_consumed_B": 15.395192832, "train/loss_slope": -3.4201949242890366e-05} {"step": 7350, "timestamp": 1778202472.2152948, "grad/layer_0/attn": 0.0028726323507726192, "grad/layer_0/mlp": 0.003103163791820407, "grad/layer_0/attn_mlp_ratio": 0.9257108070716332, "grad/layer_4/attn": 0.0015123706543818116, "grad/layer_4/mlp": 0.003073986852541566, "grad/layer_4/attn_mlp_ratio": 0.4919899393624253, "grad/layer_8/attn": 0.005956220906227827, "grad/layer_8/mlp": 0.004619289189577103, "grad/layer_8/attn_mlp_ratio": 1.289423660836172, "grad/layer_12/attn": 0.005087840836495161, "grad/layer_12/mlp": 0.0059689790941774845, "grad/layer_12/attn_mlp_ratio": 0.8523803938633526, "grad/layer_16/attn": 0.005726144183427095, "grad/layer_16/mlp": 0.0055210161954164505, "grad/layer_16/attn_mlp_ratio": 1.0371540087974278, "grad/layer_20/attn": 0.008916153572499752, "grad/layer_20/mlp": 0.008858238346874714, "grad/layer_20/attn_mlp_ratio": 1.006537995784644, "grad/layer_24/attn": 0.010796014219522476, "grad/layer_24/mlp": 0.01159298699349165, "grad/layer_24/attn_mlp_ratio": 0.9312538806830384, "grad/layer_27/attn": 0.022369075566530228, "grad/layer_27/mlp": 0.010259198024868965, "grad/layer_27/attn_mlp_ratio": 2.1803921996891873} {"step": 7350, "timestamp": 1778202472.8282192, "eos/sharpness": 40.01243114471435, "eos/L0_probe": 2.2236709594726562, "eos/L_plus": 2.360536575317383, "eos/L_minus": 2.4869296550750732, "eos/grad_norm": 0.17737211287021637, "eos/embed_grad_frac": 0.10087977349758148, "eos/time_s": 0.609856367111206} {"step": 7350, "timestamp": 1778202472.8485303, "train/loss": 2.352794146537781, "train/z_loss": 0.0018999922671355308, "train/perplexity": 10.51490891297801, "train/grad_norm": 0.177734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1911951.4421219446, "perf/iters_per_sec": 0.9116894922837947, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.096864676475525, "data/tokens_consumed": 15416164352, "data/tokens_consumed_B": 15.416164352, "train/loss_slope": -4.033832568647344e-05} {"step": 7350, "timestamp": 1778202474.2108154, "geo/rankme_last": 434.390380859375, "geo/layer_0/stable_rank_q_proj": 16.081771850585938, "geo/layer_0/stable_rank_k_proj": 11.86109733581543, "geo/layer_0/stable_rank_o_proj": 56.72730255126953, "geo/layer_0/stable_rank_gate_proj": 168.05209350585938, "geo/layer_0/stable_rank_down_proj": 47.57645797729492, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.03047221153974533, "geo/layer_0/attn_entropy_mean": 6.465060234069824, "geo/layer_0/attn_entropy_std": 0.25109028816223145, "geo/layer_7/stable_rank_q_proj": 43.806434631347656, "geo/layer_7/stable_rank_k_proj": 44.31287384033203, "geo/layer_7/stable_rank_o_proj": 103.25659942626953, "geo/layer_7/stable_rank_gate_proj": 140.71009826660156, "geo/layer_7/stable_rank_down_proj": 183.4127655029297, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.583793044090271, "geo/layer_7/attn_entropy_mean": 4.739833831787109, "geo/layer_7/attn_entropy_std": 1.0043171644210815, "geo/layer_14/stable_rank_q_proj": 74.10663604736328, "geo/layer_14/stable_rank_k_proj": 52.63599395751953, "geo/layer_14/stable_rank_o_proj": 49.07143783569336, "geo/layer_14/stable_rank_gate_proj": 130.7029266357422, "geo/layer_14/stable_rank_down_proj": 143.428466796875, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3910847306251526, "geo/layer_14/attn_entropy_mean": 5.877850532531738, "geo/layer_14/attn_entropy_std": 0.5925772190093994, "geo/layer_21/stable_rank_q_proj": 51.99001693725586, "geo/layer_21/stable_rank_k_proj": 32.680728912353516, "geo/layer_21/stable_rank_o_proj": 94.00618743896484, "geo/layer_21/stable_rank_gate_proj": 120.26899719238281, "geo/layer_21/stable_rank_down_proj": 84.60100555419922, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1666022688150406, "geo/layer_21/attn_entropy_mean": 5.8424248695373535, "geo/layer_21/attn_entropy_std": 0.31285980343818665, "geo/layer_27/stable_rank_q_proj": 42.968772888183594, "geo/layer_27/stable_rank_k_proj": 35.144161224365234, "geo/layer_27/stable_rank_o_proj": 110.4163818359375, "geo/layer_27/stable_rank_gate_proj": 93.82968139648438, "geo/layer_27/stable_rank_down_proj": 164.4132080078125, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.06439986824989319, "geo/layer_27/attn_entropy_mean": 4.564311504364014, "geo/layer_27/attn_entropy_std": 0.5031044483184814, "attnres/final_alpha/block_0": 0.2474973499774933, "attnres/block_norm/0": 1.285262107849121, "attnres/final_alpha/block_1": 0.01225312240421772, "attnres/block_norm/1": 12373.7685546875, "attnres/final_alpha/block_2": 0.025112401694059372, "attnres/block_norm/2": 9096.392578125, "attnres/final_alpha/block_3": 0.023816706612706184, "attnres/block_norm/3": 9210.0966796875, "attnres/final_alpha/block_4": 0.036044083535671234, "attnres/block_norm/4": 4223.64990234375, "attnres/final_alpha/block_5": 0.45778703689575195, "attnres/block_norm/5": 3482.5859375, "attnres/final_alpha/block_6": 0.19748930633068085, "attnres/block_norm/6": 7037.4052734375, "geo/tier1_time_s": 1.3578438758850098, "geo/step": 7350.0, "geo/rankme_slope": 0.011881466961784715} {"step": 7360, "timestamp": 1778202484.5683353, "train/loss": 2.429665184020996, "train/z_loss": 0.0018682215246371924, "train/perplexity": 11.355079581381624, "train/grad_norm": 0.1806640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1790010.0348594459, "perf/iters_per_sec": 0.8535432981774549, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1715867280960084, "data/tokens_consumed": 15437135872, "data/tokens_consumed_B": 15.437135872, "train/loss_slope": -3.796099915434358e-05} {"step": 7370, "timestamp": 1778202494.9325037, "train/loss": 2.427278923988342, "train/z_loss": 0.0018634436069987713, "train/perplexity": 11.328015712365785, "train/grad_norm": 0.1689453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024470.4290642017, "perf/iters_per_sec": 0.9653427262612351, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0359015226364137, "data/tokens_consumed": 15458107392, "data/tokens_consumed_B": 15.458107392, "train/loss_slope": -3.536777798206004e-05} {"step": 7380, "timestamp": 1778202505.318509, "train/loss": 2.3804673194885253, "train/z_loss": 0.0018761996179819107, "train/perplexity": 10.809953385622888, "train/grad_norm": 0.2080078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020788.4975913449, "perf/iters_per_sec": 0.9635870445210194, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0377889633178712, "data/tokens_consumed": 15479078912, "data/tokens_consumed_B": 15.479078912, "train/loss_slope": -3.328162995037149e-05} {"step": 7390, "timestamp": 1778202515.6711862, "train/loss": 2.439518857002258, "train/z_loss": 0.001853538816794753, "train/perplexity": 11.467521897362015, "train/grad_norm": 0.12060546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026726.00811456, "perf/iters_per_sec": 0.9664182701657105, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034748649597168, "data/tokens_consumed": 15500050432, "data/tokens_consumed_B": 15.500050432, "train/loss_slope": -3.0998954964657e-05} {"step": 7400, "timestamp": 1778202526.0118322, "grad/layer_0/attn": 0.0029318181332200766, "grad/layer_0/mlp": 0.00313829374499619, "grad/layer_0/attn_mlp_ratio": 0.934207654867843, "grad/layer_4/attn": 0.0014707444934174418, "grad/layer_4/mlp": 0.0030066967010498047, "grad/layer_4/attn_mlp_ratio": 0.4891562371383517, "grad/layer_8/attn": 0.005979564506560564, "grad/layer_8/mlp": 0.004408766981214285, "grad/layer_8/attn_mlp_ratio": 1.3562895014434835, "grad/layer_12/attn": 0.007332667242735624, "grad/layer_12/mlp": 0.005855378694832325, "grad/layer_12/attn_mlp_ratio": 1.252295965754954, "grad/layer_16/attn": 0.008468500338494778, "grad/layer_16/mlp": 0.00480778468772769, "grad/layer_16/attn_mlp_ratio": 1.7614141881124545, "grad/layer_20/attn": 0.006237805355340242, "grad/layer_20/mlp": 0.007180056534707546, "grad/layer_20/attn_mlp_ratio": 0.868768266421107, "grad/layer_24/attn": 0.0053512840531766415, "grad/layer_24/mlp": 0.009277486242353916, "grad/layer_24/attn_mlp_ratio": 0.5768032261871155, "grad/layer_27/attn": 0.003925778903067112, "grad/layer_27/mlp": 0.008075730875134468, "grad/layer_27/attn_mlp_ratio": 0.4861205648324292} {"step": 7400, "timestamp": 1778202526.0275946, "train/loss": 2.353833293914795, "train/z_loss": 0.0018886393751017748, "train/perplexity": 10.525841132104146, "train/grad_norm": 0.1064453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025896.0538855013, "perf/iters_per_sec": 0.9660225171496875, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351725578308106, "data/tokens_consumed": 15521021952, "data/tokens_consumed_B": 15.521021952, "train/loss_slope": -3.123177247400802e-05} {"step": 7410, "timestamp": 1778202536.9658942, "train/loss": 2.3702791213989256, "train/z_loss": 0.0018863038043491543, "train/perplexity": 10.700378571959282, "train/grad_norm": 0.34375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1918302.636390344, "perf/iters_per_sec": 0.9147179777099342, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0932331323623656, "data/tokens_consumed": 15541993472, "data/tokens_consumed_B": 15.541993472, "train/loss_slope": -3.540796574049806e-05} {"step": 7420, "timestamp": 1778202547.3196235, "train/loss": 2.4007317304611204, "train/z_loss": 0.0018771765986457466, "train/perplexity": 11.031245326364278, "train/grad_norm": 0.162109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026650.7800847953, "perf/iters_per_sec": 0.9663823986457802, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347870588302612, "data/tokens_consumed": 15562964992, "data/tokens_consumed_B": 15.562964992, "train/loss_slope": -3.486386973543858e-05} {"step": 7425, "timestamp": 1778202553.0918894, "eos/sharpness": 24.511122703552243, "eos/L0_probe": 2.225067138671875, "eos/L_plus": 2.320225954055786, "eos/L_minus": 2.3750195503234863, "eos/grad_norm": 0.20468097925186157, "eos/embed_grad_frac": 0.08232242614030838, "eos/time_s": 0.6083149909973145} {"step": 7425, "timestamp": 1778202554.470391, "geo/rankme_last": 434.79241943359375, "geo/layer_0/stable_rank_q_proj": 15.97114086151123, "geo/layer_0/stable_rank_k_proj": 11.912981986999512, "geo/layer_0/stable_rank_o_proj": 56.74920654296875, "geo/layer_0/stable_rank_gate_proj": 168.21328735351562, "geo/layer_0/stable_rank_down_proj": 47.60408401489258, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.02858462929725647, "geo/layer_0/attn_entropy_mean": 6.462705612182617, "geo/layer_0/attn_entropy_std": 0.2506597936153412, "geo/layer_7/stable_rank_q_proj": 43.520469665527344, "geo/layer_7/stable_rank_k_proj": 44.35246658325195, "geo/layer_7/stable_rank_o_proj": 103.35140228271484, "geo/layer_7/stable_rank_gate_proj": 140.38427734375, "geo/layer_7/stable_rank_down_proj": 183.06407165527344, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5918592214584351, "geo/layer_7/attn_entropy_mean": 4.721136569976807, "geo/layer_7/attn_entropy_std": 0.9575565457344055, "geo/layer_14/stable_rank_q_proj": 74.06290435791016, "geo/layer_14/stable_rank_k_proj": 51.90518569946289, "geo/layer_14/stable_rank_o_proj": 49.4370231628418, "geo/layer_14/stable_rank_gate_proj": 129.58126831054688, "geo/layer_14/stable_rank_down_proj": 142.1808624267578, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39802345633506775, "geo/layer_14/attn_entropy_mean": 5.879514694213867, "geo/layer_14/attn_entropy_std": 0.5909362435340881, "geo/layer_21/stable_rank_q_proj": 52.163665771484375, "geo/layer_21/stable_rank_k_proj": 32.544761657714844, "geo/layer_21/stable_rank_o_proj": 94.19920349121094, "geo/layer_21/stable_rank_gate_proj": 120.14616394042969, "geo/layer_21/stable_rank_down_proj": 83.74341583251953, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1644899994134903, "geo/layer_21/attn_entropy_mean": 5.821348190307617, "geo/layer_21/attn_entropy_std": 0.3117097020149231, "geo/layer_27/stable_rank_q_proj": 43.08913040161133, "geo/layer_27/stable_rank_k_proj": 35.03298568725586, "geo/layer_27/stable_rank_o_proj": 110.44371032714844, "geo/layer_27/stable_rank_gate_proj": 93.70172119140625, "geo/layer_27/stable_rank_down_proj": 163.38278198242188, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.06544143706560135, "geo/layer_27/attn_entropy_mean": 4.579992294311523, "geo/layer_27/attn_entropy_std": 0.4940338134765625, "attnres/final_alpha/block_0": 0.246232271194458, "attnres/block_norm/0": 1.2898329496383667, "attnres/final_alpha/block_1": 0.011882059276103973, "attnres/block_norm/1": 12548.416015625, "attnres/final_alpha/block_2": 0.0245550274848938, "attnres/block_norm/2": 9230.6123046875, "attnres/final_alpha/block_3": 0.023520374670624733, "attnres/block_norm/3": 9353.9013671875, "attnres/final_alpha/block_4": 0.036186911165714264, "attnres/block_norm/4": 4244.197265625, "attnres/final_alpha/block_5": 0.4614836573600769, "attnres/block_norm/5": 3494.58837890625, "attnres/final_alpha/block_6": 0.19613967835903168, "attnres/block_norm/6": 7126.19921875, "geo/tier1_time_s": 1.3565247058868408, "geo/step": 7425.0, "geo/rankme_slope": 0.01146565385919993} {"step": 7430, "timestamp": 1778202559.650667, "train/loss": 2.387809324264526, "train/z_loss": 0.0018769875983707606, "train/perplexity": 10.889612184816675, "train/grad_norm": 0.326171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1701414.0818588932, "perf/iters_per_sec": 0.8112974557203737, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2325935363769531, "data/tokens_consumed": 15583936512, "data/tokens_consumed_B": 15.583936512, "train/loss_slope": -3.83097065915012e-05} {"step": 7440, "timestamp": 1778202570.0051944, "train/loss": 2.407903718948364, "train/z_loss": 0.0018676004256121814, "train/perplexity": 11.110645679709483, "train/grad_norm": 0.1943359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026358.7465292984, "perf/iters_per_sec": 0.9662431461950771, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349361896514893, "data/tokens_consumed": 15604908032, "data/tokens_consumed_B": 15.604908032, "train/loss_slope": -4.0793401830875297e-05} {"step": 7450, "timestamp": 1778202580.3469384, "grad/layer_0/attn": 0.0025417464785277843, "grad/layer_0/mlp": 0.003207690315321088, "grad/layer_0/attn_mlp_ratio": 0.7923914559795063, "grad/layer_4/attn": 0.0018302601529285312, "grad/layer_4/mlp": 0.003220522776246071, "grad/layer_4/attn_mlp_ratio": 0.5683114895497753, "grad/layer_8/attn": 0.016579890623688698, "grad/layer_8/mlp": 0.004773817025125027, "grad/layer_8/attn_mlp_ratio": 3.4730887650528652, "grad/layer_12/attn": 0.005230004899203777, "grad/layer_12/mlp": 0.006418599281460047, "grad/layer_12/attn_mlp_ratio": 0.8148202728324979, "grad/layer_16/attn": 0.006383880972862244, "grad/layer_16/mlp": 0.005275073926895857, "grad/layer_16/attn_mlp_ratio": 1.2101974190907934, "grad/layer_20/attn": 0.007240002509206533, "grad/layer_20/mlp": 0.008237895555794239, "grad/layer_20/attn_mlp_ratio": 0.8788655272799161, "grad/layer_24/attn": 0.011787449941039085, "grad/layer_24/mlp": 0.011993657797574997, "grad/layer_24/attn_mlp_ratio": 0.9828069169308552, "grad/layer_27/attn": 0.010450324043631554, "grad/layer_27/mlp": 0.011586852371692657, "grad/layer_27/attn_mlp_ratio": 0.9019122379578313} {"step": 7450, "timestamp": 1778202580.3626478, "train/loss": 2.3856258630752563, "train/z_loss": 0.001881048467475921, "train/perplexity": 10.865861078490482, "train/grad_norm": 0.19921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025844.542582682, "perf/iters_per_sec": 0.9659979546464358, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351988792419433, "data/tokens_consumed": 15625879552, "data/tokens_consumed_B": 15.625879552, "train/loss_slope": -3.957998500083475e-05} {"step": 7460, "timestamp": 1778202590.7118466, "train/loss": 2.41215603351593, "train/z_loss": 0.0018665088689886033, "train/perplexity": 11.157992235067859, "train/grad_norm": 0.2294921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027490.325718316, "perf/iters_per_sec": 0.9667827251998501, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343585729598999, "data/tokens_consumed": 15646851072, "data/tokens_consumed_B": 15.646851072, "train/loss_slope": -3.992769721746811e-05} {"step": 7470, "timestamp": 1778202601.0656416, "train/loss": 2.4049060583114623, "train/z_loss": 0.0018710028030909597, "train/perplexity": 11.077389604621638, "train/grad_norm": 0.208984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026511.3591621409, "perf/iters_per_sec": 0.9663159175692276, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348582506179809, "data/tokens_consumed": 15667822592, "data/tokens_consumed_B": 15.667822592, "train/loss_slope": -3.998044885531805e-05} {"step": 7480, "timestamp": 1778202611.4172037, "train/loss": 2.4598119497299193, "train/z_loss": 0.0018473941017873585, "train/perplexity": 11.70261065395461, "train/grad_norm": 0.197265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026992.3163165948, "perf/iters_per_sec": 0.9665452558119749, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346127033233643, "data/tokens_consumed": 15688794112, "data/tokens_consumed_B": 15.688794112, "train/loss_slope": -3.700972580530616e-05} {"step": 7490, "timestamp": 1778202621.7715986, "train/loss": 2.4546446561813355, "train/z_loss": 0.0018612306681461631, "train/perplexity": 11.642295795914396, "train/grad_norm": 0.1669921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026365.515339086, "perf/iters_per_sec": 0.9662463738151007, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349327325820923, "data/tokens_consumed": 15709765632, "data/tokens_consumed_B": 15.709765632, "train/loss_slope": -3.702734538895795e-05} {"step": 7500, "timestamp": 1778202632.1129396, "grad/layer_0/attn": 0.0034858889412134886, "grad/layer_0/mlp": 0.0037419013679027557, "grad/layer_0/attn_mlp_ratio": 0.9315822372969228, "grad/layer_4/attn": 0.0017939218087121844, "grad/layer_4/mlp": 0.0030765049159526825, "grad/layer_4/attn_mlp_ratio": 0.5831038140390198, "grad/layer_8/attn": 0.009964300319552422, "grad/layer_8/mlp": 0.004564139526337385, "grad/layer_8/attn_mlp_ratio": 2.183171667679356, "grad/layer_12/attn": 0.005125666037201881, "grad/layer_12/mlp": 0.006581324152648449, "grad/layer_12/attn_mlp_ratio": 0.7788198606289938, "grad/layer_16/attn": 0.00630741473287344, "grad/layer_16/mlp": 0.005207869224250317, "grad/layer_16/attn_mlp_ratio": 1.2111315281094166, "grad/layer_20/attn": 0.0062993550673127174, "grad/layer_20/mlp": 0.0084628164768219, "grad/layer_20/attn_mlp_ratio": 0.7443568001421057, "grad/layer_24/attn": 0.01614508405327797, "grad/layer_24/mlp": 0.01332105789333582, "grad/layer_24/attn_mlp_ratio": 1.211997129759133, "grad/layer_27/attn": 0.013616078533232212, "grad/layer_27/mlp": 0.011532904580235481, "grad/layer_27/attn_mlp_ratio": 1.1806287237045123} {"step": 7500, "timestamp": 1778202632.7224615, "eos/sharpness": 30.6974172592163, "eos/L0_probe": 2.219423294067383, "eos/L_plus": 2.379096746444702, "eos/L_minus": 2.3667240142822266, "eos/grad_norm": 0.19073447585105896, "eos/embed_grad_frac": 0.11712738871574402, "eos/time_s": 0.6066832542419434} {"step": 7500, "timestamp": 1778202632.7422945, "train/loss": 2.4222195863723757, "train/z_loss": 0.0018726781825535, "train/perplexity": 11.270848193190583, "train/grad_norm": 0.1904296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1912487.9932003466, "perf/iters_per_sec": 0.9119453397752507, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0965569496154786, "data/tokens_consumed": 15730737152, "data/tokens_consumed_B": 15.730737152, "train/loss_slope": -3.472583344703042e-05} {"step": 7500, "timestamp": 1778202634.1050048, "geo/rankme_last": 436.19659423828125, "geo/layer_0/stable_rank_q_proj": 15.923824310302734, "geo/layer_0/stable_rank_k_proj": 11.934786796569824, "geo/layer_0/stable_rank_o_proj": 56.916725158691406, "geo/layer_0/stable_rank_gate_proj": 168.79066467285156, "geo/layer_0/stable_rank_down_proj": 47.6211051940918, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.043679703027009964, "geo/layer_0/attn_entropy_mean": 6.449627876281738, "geo/layer_0/attn_entropy_std": 0.2566668391227722, "geo/layer_7/stable_rank_q_proj": 43.67721176147461, "geo/layer_7/stable_rank_k_proj": 44.599483489990234, "geo/layer_7/stable_rank_o_proj": 103.7923355102539, "geo/layer_7/stable_rank_gate_proj": 140.13796997070312, "geo/layer_7/stable_rank_down_proj": 182.99566650390625, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5803424715995789, "geo/layer_7/attn_entropy_mean": 4.710572242736816, "geo/layer_7/attn_entropy_std": 0.9442690014839172, "geo/layer_14/stable_rank_q_proj": 73.94650268554688, "geo/layer_14/stable_rank_k_proj": 51.64487075805664, "geo/layer_14/stable_rank_o_proj": 49.17456817626953, "geo/layer_14/stable_rank_gate_proj": 129.2227783203125, "geo/layer_14/stable_rank_down_proj": 143.4506072998047, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3887709975242615, "geo/layer_14/attn_entropy_mean": 5.895216941833496, "geo/layer_14/attn_entropy_std": 0.5847601890563965, "geo/layer_21/stable_rank_q_proj": 52.20399475097656, "geo/layer_21/stable_rank_k_proj": 32.48540496826172, "geo/layer_21/stable_rank_o_proj": 94.02928161621094, "geo/layer_21/stable_rank_gate_proj": 120.13618469238281, "geo/layer_21/stable_rank_down_proj": 82.97296905517578, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15994997322559357, "geo/layer_21/attn_entropy_mean": 5.852749824523926, "geo/layer_21/attn_entropy_std": 0.2958805561065674, "geo/layer_27/stable_rank_q_proj": 43.24427795410156, "geo/layer_27/stable_rank_k_proj": 34.97241973876953, "geo/layer_27/stable_rank_o_proj": 110.97093963623047, "geo/layer_27/stable_rank_gate_proj": 93.75851440429688, "geo/layer_27/stable_rank_down_proj": 162.96910095214844, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.06211328133940697, "geo/layer_27/attn_entropy_mean": 4.612205505371094, "geo/layer_27/attn_entropy_std": 0.5156468749046326, "attnres/final_alpha/block_0": 0.24788916110992432, "attnres/block_norm/0": 1.2941361665725708, "attnres/final_alpha/block_1": 0.011878062039613724, "attnres/block_norm/1": 12703.59765625, "attnres/final_alpha/block_2": 0.02442038245499134, "attnres/block_norm/2": 9315.9404296875, "attnres/final_alpha/block_3": 0.02329944260418415, "attnres/block_norm/3": 9494.5947265625, "attnres/final_alpha/block_4": 0.03589516133069992, "attnres/block_norm/4": 4270.30810546875, "attnres/final_alpha/block_5": 0.4629885256290436, "attnres/block_norm/5": 3475.44677734375, "attnres/final_alpha/block_6": 0.19362926483154297, "attnres/block_norm/6": 7193.5537109375, "geo/tier1_time_s": 1.3587157726287842, "geo/step": 7500.0, "geo/rankme_slope": 0.011087598750437674} {"step": 7500, "timestamp": 1778202641.0417166, "geo/ww_alpha_mean": 8.411190647698199, "geo/ww_alpha_std": 4.845534872239736, "geo/ww_alpha_min": 2.1906743529322714, "geo/ww_alpha_max": 28.549203800902795, "geo/ww_alpha_healthy_frac": 0.1319796954314721, "geo/ww_alpha_by_type/q_proj": 4.61027313689801, "geo/ww_alpha_by_type/k_proj": 5.502066473757128, "geo/ww_alpha_by_type/v_proj": 6.986190679226451, "geo/ww_alpha_by_type/o_proj": 8.228062143512915, "geo/ww_alpha_by_type/gate_proj": 10.7473744416791, "geo/ww_alpha_by_type/up_proj": 13.243777071716547, "geo/ww_alpha_by_type/down_proj": 9.782751883338875, "geo/twonn_id/layer_0": 0.7080284953117371, "geo/twonn_id/layer_7": 2.7894182205200195, "geo/twonn_id/layer_14": 3.3992130756378174, "geo/twonn_id/layer_21": 6.637546539306641, "geo/twonn_id/layer_27": 5.26580810546875, "geo/tier2_time_s": 6.930364608764648} {"step": 7500, "timestamp": 1778202641.6928048, "eoc/jacobian_sigma/layer_0/attn": 580.8904418945312, "eoc/jacobian_sigma/layer_0/mlp": 1981.7103271484375, "eoc/jacobian_sigma/layer_0": 1981.7103271484375, "eoc/jacobian_sigma/layer_7/attn": 1.1591789722442627, "eoc/jacobian_sigma/layer_7/mlp": 1.6446837186813354, "eoc/jacobian_sigma/layer_7": 1.6446837186813354, "eoc/jacobian_sigma/layer_14/attn": 1.2883293628692627, "eoc/jacobian_sigma/layer_14/mlp": 4.519261837005615, "eoc/jacobian_sigma/layer_14": 4.519261837005615, "eoc/jacobian_sigma/layer_21/attn": 1.0786677598953247, "eoc/jacobian_sigma/layer_21/mlp": 2.959324359893799, "eoc/jacobian_sigma/layer_21": 2.959324359893799, "eoc/jacobian_sigma/layer_27/attn": 1.853668212890625, "eoc/jacobian_sigma/layer_27/mlp": 12.407509803771973, "eoc/jacobian_sigma/layer_27": 12.407509803771973, "eoc/layer0_sigma": 1981.7103271484375, "eoc/sigma_max": 12.407509803771973, "eoc/sigma_min": 1.6446837186813354, "eoc/sigma_mean": 5.3826949298381805, "eoc/time_s": 0.6448264122009277} {"step": 7510, "timestamp": 1778202652.0616927, "train/loss": 2.360303354263306, "train/z_loss": 0.0018829867476597428, "train/perplexity": 10.594164750068469, "train/grad_norm": 0.150390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1085733.58712536, "perf/iters_per_sec": 0.5177181182505417, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.931553030014038, "data/tokens_consumed": 15751708672, "data/tokens_consumed_B": 15.751708672, "train/loss_slope": -3.8016031455822074e-05} {"step": 7520, "timestamp": 1778202662.4539216, "train/loss": 2.3853113651275635, "train/z_loss": 0.0018742564832791686, "train/perplexity": 10.862444324790557, "train/grad_norm": 0.2119140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019186.342210084, "perf/iters_per_sec": 0.9628230773020191, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0386124134063721, "data/tokens_consumed": 15772680192, "data/tokens_consumed_B": 15.772680192, "train/loss_slope": -3.656921671419014e-05} {"step": 7530, "timestamp": 1778202672.8086784, "train/loss": 2.353470873832703, "train/z_loss": 0.001887969230301678, "train/perplexity": 10.52202704708921, "train/grad_norm": 0.1689453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026684.5876455896, "perf/iters_per_sec": 0.9663985193469952, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347697973251342, "data/tokens_consumed": 15793651712, "data/tokens_consumed_B": 15.793651712, "train/loss_slope": -3.887216198598172e-05} {"step": 7540, "timestamp": 1778202683.1580393, "train/loss": 2.3962975025177, "train/z_loss": 0.0018674725666642188, "train/perplexity": 10.982438560221912, "train/grad_norm": 0.220703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027652.5037331716, "perf/iters_per_sec": 0.9668600577035769, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342758417129516, "data/tokens_consumed": 15814623232, "data/tokens_consumed_B": 15.814623232, "train/loss_slope": -3.882795423135159e-05} {"step": 7550, "timestamp": 1778202693.5122063, "grad/layer_0/attn": 0.003707123687490821, "grad/layer_0/mlp": 0.003625726792961359, "grad/layer_0/attn_mlp_ratio": 1.0224497864655713, "grad/layer_4/attn": 0.0016008928650990129, "grad/layer_4/mlp": 0.0030466935131698847, "grad/layer_4/attn_mlp_ratio": 0.5254525293186239, "grad/layer_8/attn": 0.00759361544623971, "grad/layer_8/mlp": 0.004377440083771944, "grad/layer_8/attn_mlp_ratio": 1.734715981817588, "grad/layer_12/attn": 0.005576212890446186, "grad/layer_12/mlp": 0.005754152778536081, "grad/layer_12/attn_mlp_ratio": 0.969076249476506, "grad/layer_16/attn": 0.005255072843283415, "grad/layer_16/mlp": 0.004757983610033989, "grad/layer_16/attn_mlp_ratio": 1.1044747446699172, "grad/layer_20/attn": 0.005743175745010376, "grad/layer_20/mlp": 0.007765426766127348, "grad/layer_20/attn_mlp_ratio": 0.7395827485108392, "grad/layer_24/attn": 0.024217037484049797, "grad/layer_24/mlp": 0.012733560055494308, "grad/layer_24/attn_mlp_ratio": 1.9018277047680627, "grad/layer_27/attn": 0.01888708584010601, "grad/layer_27/mlp": 0.011352733708918095, "grad/layer_27/attn_mlp_ratio": 1.66365971033949} {"step": 7550, "timestamp": 1778202693.5286667, "train/loss": 2.3462188720703123, "train/z_loss": 0.0018943443661555647, "train/perplexity": 10.445997305519468, "train/grad_norm": 0.2353515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023442.2884770643, "perf/iters_per_sec": 0.9648524706254312, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036427879333496, "data/tokens_consumed": 15835594752, "data/tokens_consumed_B": 15.835594752, "train/loss_slope": -4.2728541529003485e-05} {"step": 7560, "timestamp": 1778202703.885564, "train/loss": 2.3967194318771363, "train/z_loss": 0.0018695476581342518, "train/perplexity": 10.98707335119711, "train/grad_norm": 0.208984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026066.843791622, "perf/iters_per_sec": 0.9661039561231718, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350852966308595, "data/tokens_consumed": 15856566272, "data/tokens_consumed_B": 15.856566272, "train/loss_slope": -4.426199739629583e-05} {"step": 7570, "timestamp": 1778202714.2515213, "train/loss": 2.3567607164382935, "train/z_loss": 0.0018787699285894633, "train/perplexity": 10.556699862746834, "train/grad_norm": 0.171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024600.1559883142, "perf/iters_per_sec": 0.9654045848790713, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0358351469039917, "data/tokens_consumed": 15877537792, "data/tokens_consumed_B": 15.877537792, "train/loss_slope": -4.837007338028583e-05} {"step": 7575, "timestamp": 1778202720.0632124, "eos/sharpness": 23.33743572235107, "eos/L0_probe": 2.216188669204712, "eos/L_plus": 2.3017578125, "eos/L_minus": 2.3639938831329346, "eos/grad_norm": 0.22357866168022156, "eos/embed_grad_frac": 0.06807264685630798, "eos/time_s": 0.6354317665100098} {"step": 7575, "timestamp": 1778202721.4435637, "geo/rankme_last": 435.5602111816406, "geo/layer_0/stable_rank_q_proj": 15.84363842010498, "geo/layer_0/stable_rank_k_proj": 11.952611923217773, "geo/layer_0/stable_rank_o_proj": 56.965946197509766, "geo/layer_0/stable_rank_gate_proj": 169.09754943847656, "geo/layer_0/stable_rank_down_proj": 47.7179069519043, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.03003012202680111, "geo/layer_0/attn_entropy_mean": 6.45108699798584, "geo/layer_0/attn_entropy_std": 0.25372084975242615, "geo/layer_7/stable_rank_q_proj": 43.57332992553711, "geo/layer_7/stable_rank_k_proj": 44.36396408081055, "geo/layer_7/stable_rank_o_proj": 103.95389556884766, "geo/layer_7/stable_rank_gate_proj": 139.58853149414062, "geo/layer_7/stable_rank_down_proj": 182.5678253173828, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5913089513778687, "geo/layer_7/attn_entropy_mean": 4.694245338439941, "geo/layer_7/attn_entropy_std": 0.9729536771774292, "geo/layer_14/stable_rank_q_proj": 73.11028289794922, "geo/layer_14/stable_rank_k_proj": 51.17250442504883, "geo/layer_14/stable_rank_o_proj": 48.96966552734375, "geo/layer_14/stable_rank_gate_proj": 128.57830810546875, "geo/layer_14/stable_rank_down_proj": 143.07687377929688, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.40369781851768494, "geo/layer_14/attn_entropy_mean": 5.863658428192139, "geo/layer_14/attn_entropy_std": 0.6102649569511414, "geo/layer_21/stable_rank_q_proj": 52.34552764892578, "geo/layer_21/stable_rank_k_proj": 32.41645050048828, "geo/layer_21/stable_rank_o_proj": 94.12026977539062, "geo/layer_21/stable_rank_gate_proj": 119.95588684082031, "geo/layer_21/stable_rank_down_proj": 82.54246520996094, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.16493278741836548, "geo/layer_21/attn_entropy_mean": 5.837956428527832, "geo/layer_21/attn_entropy_std": 0.29576078057289124, "geo/layer_27/stable_rank_q_proj": 43.3402214050293, "geo/layer_27/stable_rank_k_proj": 34.82003402709961, "geo/layer_27/stable_rank_o_proj": 110.81964111328125, "geo/layer_27/stable_rank_gate_proj": 93.89676666259766, "geo/layer_27/stable_rank_down_proj": 163.30422973632812, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.06188993155956268, "geo/layer_27/attn_entropy_mean": 4.5952348709106445, "geo/layer_27/attn_entropy_std": 0.5088750123977661, "attnres/final_alpha/block_0": 0.24617788195610046, "attnres/block_norm/0": 1.2982017993927002, "attnres/final_alpha/block_1": 0.01154588907957077, "attnres/block_norm/1": 12874.646484375, "attnres/final_alpha/block_2": 0.023870252072811127, "attnres/block_norm/2": 9436.6982421875, "attnres/final_alpha/block_3": 0.023026563227176666, "attnres/block_norm/3": 9668.765625, "attnres/final_alpha/block_4": 0.03501633554697037, "attnres/block_norm/4": 4313.87841796875, "attnres/final_alpha/block_5": 0.46524524688720703, "attnres/block_norm/5": 3485.337646484375, "attnres/final_alpha/block_6": 0.19511783123016357, "attnres/block_norm/6": 7262.857421875, "geo/tier1_time_s": 1.3589835166931152, "geo/step": 7575.0, "geo/rankme_slope": 0.010669591371704932} {"step": 7580, "timestamp": 1778202727.1031744, "train/loss": 2.4298879623413088, "train/z_loss": 0.0018541360506787897, "train/perplexity": 11.357609528736026, "train/grad_norm": 0.1689453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1632911.859689281, "perf/iters_per_sec": 0.7786330507704168, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.284302020072937, "data/tokens_consumed": 15898509312, "data/tokens_consumed_B": 15.898509312, "train/loss_slope": -4.714603449824058e-05} {"step": 7590, "timestamp": 1778202737.4603648, "train/loss": 2.389427995681763, "train/z_loss": 0.0018693808233365417, "train/perplexity": 10.907253162426388, "train/grad_norm": 0.2421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026173.7654615338, "perf/iters_per_sec": 0.9661549403484029, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350306749343872, "data/tokens_consumed": 15919480832, "data/tokens_consumed_B": 15.919480832, "train/loss_slope": -4.876013289977232e-05} {"step": 7600, "timestamp": 1778202748.3621576, "grad/layer_0/attn": 0.0027963113971054554, "grad/layer_0/mlp": 0.0030113921966403723, "grad/layer_0/attn_mlp_ratio": 0.9285775885875548, "grad/layer_4/attn": 0.0014286799123510718, "grad/layer_4/mlp": 0.003127645468339324, "grad/layer_4/attn_mlp_ratio": 0.4567908610916081, "grad/layer_8/attn": 0.0049987430684268475, "grad/layer_8/mlp": 0.0043947468511760235, "grad/layer_8/attn_mlp_ratio": 1.137435926109283, "grad/layer_12/attn": 0.004955898504704237, "grad/layer_12/mlp": 0.005852987524122, "grad/layer_12/attn_mlp_ratio": 0.8467297084790378, "grad/layer_16/attn": 0.005661153234541416, "grad/layer_16/mlp": 0.00485919788479805, "grad/layer_16/attn_mlp_ratio": 1.1650386035416287, "grad/layer_20/attn": 0.0053193224593997, "grad/layer_20/mlp": 0.006876160390675068, "grad/layer_20/attn_mlp_ratio": 0.7735890496758134, "grad/layer_24/attn": 0.012970504350960255, "grad/layer_24/mlp": 0.014565245248377323, "grad/layer_24/attn_mlp_ratio": 0.8905105297388805, "grad/layer_27/attn": 0.010423709638416767, "grad/layer_27/mlp": 0.013481269590556622, "grad/layer_27/attn_mlp_ratio": 0.7731994001810069} {"step": 7600, "timestamp": 1778202748.3783658, "train/loss": 2.3661090612411497, "train/z_loss": 0.001872960792388767, "train/perplexity": 10.655850257005987, "train/grad_norm": 0.1826171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1921887.845092059, "perf/iters_per_sec": 0.9164275384388252, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0911937475204467, "data/tokens_consumed": 15940452352, "data/tokens_consumed_B": 15.940452352, "train/loss_slope": -4.7534160244904906e-05} {"step": 7610, "timestamp": 1778202758.7356591, "train/loss": 2.3554197788238525, "train/z_loss": 0.0018885653698816895, "train/perplexity": 10.542553473648871, "train/grad_norm": 0.1943359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025930.069578538, "perf/iters_per_sec": 0.9660387370960893, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035155177116394, "data/tokens_consumed": 15961423872, "data/tokens_consumed_B": 15.961423872, "train/loss_slope": -4.9553574293968214e-05} {"step": 7620, "timestamp": 1778202769.0980685, "train/loss": 2.3710259437561034, "train/z_loss": 0.0018743198597803712, "train/perplexity": 10.708372838684085, "train/grad_norm": 0.220703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025081.2785973803, "perf/iters_per_sec": 0.9656340020167257, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035589051246643, "data/tokens_consumed": 15982395392, "data/tokens_consumed_B": 15.982395392, "train/loss_slope": -5.1444536696101377e-05} {"step": 7630, "timestamp": 1778202779.4752293, "train/loss": 2.3999406337738036, "train/z_loss": 0.0018649265286512673, "train/perplexity": 11.022521995683562, "train/grad_norm": 0.2353515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022498.0576447698, "perf/iters_per_sec": 0.9644022262786721, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0369117498397826, "data/tokens_consumed": 16003366912, "data/tokens_consumed_B": 16.003366912, "train/loss_slope": -5.303965796588827e-05} {"step": 7640, "timestamp": 1778202790.3170445, "train/loss": 2.400075173377991, "train/z_loss": 0.0018745064502581955, "train/perplexity": 11.024005061193291, "train/grad_norm": 0.2373046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1935634.9266990551, "perf/iters_per_sec": 0.9229826577658916, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0834439754486085, "data/tokens_consumed": 16024338432, "data/tokens_consumed_B": 16.024338432, "train/loss_slope": -5.155354059747569e-05} {"step": 7650, "timestamp": 1778202800.6626356, "grad/layer_0/attn": 0.0027911770157516003, "grad/layer_0/mlp": 0.003328440710902214, "grad/layer_0/attn_mlp_ratio": 0.8385839419493892, "grad/layer_4/attn": 0.001754009979777038, "grad/layer_4/mlp": 0.0031316429376602173, "grad/layer_4/attn_mlp_ratio": 0.5600925644091074, "grad/layer_8/attn": 0.006900953594595194, "grad/layer_8/mlp": 0.004509756341576576, "grad/layer_8/attn_mlp_ratio": 1.530227559735504, "grad/layer_12/attn": 0.00462839612737298, "grad/layer_12/mlp": 0.0060696108266711235, "grad/layer_12/attn_mlp_ratio": 0.7625523585103046, "grad/layer_16/attn": 0.006142332684248686, "grad/layer_16/mlp": 0.005714487284421921, "grad/layer_16/attn_mlp_ratio": 1.074870285127079, "grad/layer_20/attn": 0.006599584594368935, "grad/layer_20/mlp": 0.008587539196014404, "grad/layer_20/attn_mlp_ratio": 0.7685070620208857, "grad/layer_24/attn": 0.016839144751429558, "grad/layer_24/mlp": 0.015901600942015648, "grad/layer_24/attn_mlp_ratio": 1.0589590763179573, "grad/layer_27/attn": 0.006231415085494518, "grad/layer_27/mlp": 0.014986866153776646, "grad/layer_27/attn_mlp_ratio": 0.4157917325727932} {"step": 7650, "timestamp": 1778202801.273199, "eos/sharpness": 25.01106262207031, "eos/L0_probe": 2.2183496952056885, "eos/L_plus": 2.3384387493133545, "eos/L_minus": 2.3483712673187256, "eos/grad_norm": 0.2233658879995346, "eos/embed_grad_frac": 0.06500646471977234, "eos/time_s": 0.6077988147735596} {"step": 7650, "timestamp": 1778202801.2942657, "train/loss": 2.3811227083206177, "train/z_loss": 0.0018690597848035396, "train/perplexity": 10.817040430478613, "train/grad_norm": 0.2236328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1911637.9732844247, "perf/iters_per_sec": 0.9115400186941265, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.097044539451599, "data/tokens_consumed": 16045309952, "data/tokens_consumed_B": 16.045309952, "train/loss_slope": -5.206132219819893e-05} {"step": 7650, "timestamp": 1778202802.6540158, "geo/rankme_last": 435.3384094238281, "geo/layer_0/stable_rank_q_proj": 15.773160934448242, "geo/layer_0/stable_rank_k_proj": 11.941279411315918, "geo/layer_0/stable_rank_o_proj": 57.203086853027344, "geo/layer_0/stable_rank_gate_proj": 170.05564880371094, "geo/layer_0/stable_rank_down_proj": 47.698299407958984, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.03434951230883598, "geo/layer_0/attn_entropy_mean": 6.454505920410156, "geo/layer_0/attn_entropy_std": 0.2533739507198334, "geo/layer_7/stable_rank_q_proj": 43.62744903564453, "geo/layer_7/stable_rank_k_proj": 44.57442092895508, "geo/layer_7/stable_rank_o_proj": 104.72264862060547, "geo/layer_7/stable_rank_gate_proj": 139.31976318359375, "geo/layer_7/stable_rank_down_proj": 182.4484405517578, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.583873450756073, "geo/layer_7/attn_entropy_mean": 4.705556869506836, "geo/layer_7/attn_entropy_std": 0.9490700364112854, "geo/layer_14/stable_rank_q_proj": 72.6458511352539, "geo/layer_14/stable_rank_k_proj": 50.76264953613281, "geo/layer_14/stable_rank_o_proj": 49.12004852294922, "geo/layer_14/stable_rank_gate_proj": 127.7658462524414, "geo/layer_14/stable_rank_down_proj": 142.82147216796875, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39875712990760803, "geo/layer_14/attn_entropy_mean": 5.869828701019287, "geo/layer_14/attn_entropy_std": 0.5911270976066589, "geo/layer_21/stable_rank_q_proj": 52.04612350463867, "geo/layer_21/stable_rank_k_proj": 32.47582244873047, "geo/layer_21/stable_rank_o_proj": 93.92984008789062, "geo/layer_21/stable_rank_gate_proj": 119.38697052001953, "geo/layer_21/stable_rank_down_proj": 81.92452239990234, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.16480185091495514, "geo/layer_21/attn_entropy_mean": 5.805071830749512, "geo/layer_21/attn_entropy_std": 0.3086543679237366, "geo/layer_27/stable_rank_q_proj": 43.16248321533203, "geo/layer_27/stable_rank_k_proj": 34.59703063964844, "geo/layer_27/stable_rank_o_proj": 110.81771850585938, "geo/layer_27/stable_rank_gate_proj": 93.90695190429688, "geo/layer_27/stable_rank_down_proj": 163.00137329101562, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.06596948951482773, "geo/layer_27/attn_entropy_mean": 4.525786876678467, "geo/layer_27/attn_entropy_std": 0.5117238163948059, "attnres/final_alpha/block_0": 0.2477235496044159, "attnres/block_norm/0": 1.3025672435760498, "attnres/final_alpha/block_1": 0.011616782285273075, "attnres/block_norm/1": 12993.0478515625, "attnres/final_alpha/block_2": 0.024324636906385422, "attnres/block_norm/2": 9585.2265625, "attnres/final_alpha/block_3": 0.023365961387753487, "attnres/block_norm/3": 9808.240234375, "attnres/final_alpha/block_4": 0.03523864969611168, "attnres/block_norm/4": 4356.99462890625, "attnres/final_alpha/block_5": 0.46293359994888306, "attnres/block_norm/5": 3525.531494140625, "attnres/final_alpha/block_6": 0.19479680061340332, "attnres/block_norm/6": 7376.2294921875, "geo/tier1_time_s": 1.3559815883636475, "geo/step": 7650.0, "geo/rankme_slope": 0.010347924052433474} {"step": 7660, "timestamp": 1778202813.4728103, "train/loss": 2.4410080909729004, "train/z_loss": 0.0018506761407479644, "train/perplexity": 11.484612443282789, "train/grad_norm": 0.2275390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1722532.722626873, "perf/iters_per_sec": 0.8213676083692899, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2174816608428956, "data/tokens_consumed": 16066281472, "data/tokens_consumed_B": 16.066281472, "train/loss_slope": -4.463127315348426e-05} {"step": 7670, "timestamp": 1778202823.8318434, "train/loss": 2.391859793663025, "train/z_loss": 0.0018665959127247334, "train/perplexity": 10.933809675598312, "train/grad_norm": 0.17578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025657.369576772, "perf/iters_per_sec": 0.9659087036022053, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035294532775879, "data/tokens_consumed": 16087252992, "data/tokens_consumed_B": 16.087252992, "train/loss_slope": -4.336419480361283e-05} {"step": 7680, "timestamp": 1778202834.183354, "train/loss": 2.382232642173767, "train/z_loss": 0.0018553199362941087, "train/perplexity": 10.829053295352713, "train/grad_norm": 0.111328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026952.7533066736, "perf/iters_per_sec": 0.9665263906987541, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346328973770142, "data/tokens_consumed": 16108224512, "data/tokens_consumed_B": 16.108224512, "train/loss_slope": -4.320303302894232e-05} {"step": 7690, "timestamp": 1778202844.538949, "train/loss": 2.4220634937286376, "train/z_loss": 0.0018538821954280139, "train/perplexity": 11.269089033998412, "train/grad_norm": 0.1962890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026280.0916073476, "perf/iters_per_sec": 0.9662056406056154, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349763631820679, "data/tokens_consumed": 16129196032, "data/tokens_consumed_B": 16.129196032, "train/loss_slope": -4.247833479999466e-05} {"step": 7700, "timestamp": 1778202854.875378, "grad/layer_0/attn": 0.00240634405054152, "grad/layer_0/mlp": 0.002977416617795825, "grad/layer_0/attn_mlp_ratio": 0.8081986092705663, "grad/layer_4/attn": 0.0017772761639207602, "grad/layer_4/mlp": 0.003087261924520135, "grad/layer_4/attn_mlp_ratio": 0.5756803762703002, "grad/layer_8/attn": 0.004841042682528496, "grad/layer_8/mlp": 0.004306048154830933, "grad/layer_8/attn_mlp_ratio": 1.1242425528086817, "grad/layer_12/attn": 0.005089844111353159, "grad/layer_12/mlp": 0.005888592451810837, "grad/layer_12/attn_mlp_ratio": 0.8643566466129415, "grad/layer_16/attn": 0.005107999313622713, "grad/layer_16/mlp": 0.004839508328586817, "grad/layer_16/attn_mlp_ratio": 1.055478958038368, "grad/layer_20/attn": 0.00496875774115324, "grad/layer_20/mlp": 0.007029774598777294, "grad/layer_20/attn_mlp_ratio": 0.7068160722160084, "grad/layer_24/attn": 0.01093470212072134, "grad/layer_24/mlp": 0.010769756510853767, "grad/layer_24/attn_mlp_ratio": 1.01531562093997, "grad/layer_27/attn": 0.008012961596250534, "grad/layer_27/mlp": 0.010014520958065987, "grad/layer_27/attn_mlp_ratio": 0.8001342799910198} {"step": 7700, "timestamp": 1778202854.8913245, "train/loss": 2.400871729850769, "train/z_loss": 0.0018574553076177835, "train/perplexity": 11.032789802087342, "train/grad_norm": 0.1640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026794.9370220918, "perf/iters_per_sec": 0.966451138030096, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347134590148925, "data/tokens_consumed": 16150167552, "data/tokens_consumed_B": 16.150167552, "train/loss_slope": -3.9080599310732225e-05} {"step": 7710, "timestamp": 1778202865.2377048, "train/loss": 2.372559380531311, "train/z_loss": 0.00186695713782683, "train/perplexity": 10.724806047821046, "train/grad_norm": 0.1298828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028009.1996791647, "perf/iters_per_sec": 0.9670301435848068, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340939283370971, "data/tokens_consumed": 16171139072, "data/tokens_consumed_B": 16.171139072, "train/loss_slope": -4.464739044972311e-05} {"step": 7720, "timestamp": 1778202875.58632, "train/loss": 2.4288898229599, "train/z_loss": 0.0018472045194357634, "train/perplexity": 11.346278707197058, "train/grad_norm": 0.2197265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027583.0491073532, "perf/iters_per_sec": 0.9668269391571775, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343112707138062, "data/tokens_consumed": 16192110592, "data/tokens_consumed_B": 16.192110592, "train/loss_slope": -4.1935393161470425e-05} {"step": 7725, "timestamp": 1778202881.3572526, "eos/sharpness": 46.88653945922851, "eos/L0_probe": 2.2135426998138428, "eos/L_plus": 2.551142454147339, "eos/L_minus": 2.344808340072632, "eos/grad_norm": 0.1805519312620163, "eos/embed_grad_frac": 0.07775725424289703, "eos/time_s": 0.6066012382507324} {"step": 7725, "timestamp": 1778202882.7354028, "geo/rankme_last": 435.9366760253906, "geo/layer_0/stable_rank_q_proj": 15.703812599182129, "geo/layer_0/stable_rank_k_proj": 11.930301666259766, "geo/layer_0/stable_rank_o_proj": 57.270103454589844, "geo/layer_0/stable_rank_gate_proj": 170.3739471435547, "geo/layer_0/stable_rank_down_proj": 47.81415557861328, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.027124637737870216, "geo/layer_0/attn_entropy_mean": 6.449002265930176, "geo/layer_0/attn_entropy_std": 0.25669682025909424, "geo/layer_7/stable_rank_q_proj": 43.545005798339844, "geo/layer_7/stable_rank_k_proj": 44.547767639160156, "geo/layer_7/stable_rank_o_proj": 104.77650451660156, "geo/layer_7/stable_rank_gate_proj": 139.22732543945312, "geo/layer_7/stable_rank_down_proj": 182.35560607910156, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5886887311935425, "geo/layer_7/attn_entropy_mean": 4.728157043457031, "geo/layer_7/attn_entropy_std": 0.9764970541000366, "geo/layer_14/stable_rank_q_proj": 72.44293975830078, "geo/layer_14/stable_rank_k_proj": 50.20014190673828, "geo/layer_14/stable_rank_o_proj": 48.898921966552734, "geo/layer_14/stable_rank_gate_proj": 126.91022491455078, "geo/layer_14/stable_rank_down_proj": 142.3174591064453, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3982853889465332, "geo/layer_14/attn_entropy_mean": 5.859078407287598, "geo/layer_14/attn_entropy_std": 0.596199631690979, "geo/layer_21/stable_rank_q_proj": 52.074310302734375, "geo/layer_21/stable_rank_k_proj": 32.50638961791992, "geo/layer_21/stable_rank_o_proj": 93.53934478759766, "geo/layer_21/stable_rank_gate_proj": 119.21119689941406, "geo/layer_21/stable_rank_down_proj": 81.47154998779297, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.16589051485061646, "geo/layer_21/attn_entropy_mean": 5.8261542320251465, "geo/layer_21/attn_entropy_std": 0.30354246497154236, "geo/layer_27/stable_rank_q_proj": 43.30970001220703, "geo/layer_27/stable_rank_k_proj": 34.54594421386719, "geo/layer_27/stable_rank_o_proj": 110.3548355102539, "geo/layer_27/stable_rank_gate_proj": 94.1523208618164, "geo/layer_27/stable_rank_down_proj": 162.03968811035156, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.06412269920110703, "geo/layer_27/attn_entropy_mean": 4.561400890350342, "geo/layer_27/attn_entropy_std": 0.5019554495811462, "attnres/final_alpha/block_0": 0.24506595730781555, "attnres/block_norm/0": 1.3061012029647827, "attnres/final_alpha/block_1": 0.011296248994767666, "attnres/block_norm/1": 13145.966796875, "attnres/final_alpha/block_2": 0.0234358087182045, "attnres/block_norm/2": 9630.34765625, "attnres/final_alpha/block_3": 0.022692576050758362, "attnres/block_norm/3": 9997.36328125, "attnres/final_alpha/block_4": 0.03412748500704765, "attnres/block_norm/4": 4425.4716796875, "attnres/final_alpha/block_5": 0.473141074180603, "attnres/block_norm/5": 3532.3955078125, "attnres/final_alpha/block_6": 0.19024087488651276, "attnres/block_norm/6": 7489.70166015625, "geo/tier1_time_s": 1.357330083847046, "geo/step": 7725.0, "geo/rankme_slope": 0.009919052347501501} {"step": 7730, "timestamp": 1778202887.9121926, "train/loss": 2.3760337829589844, "train/z_loss": 0.0018704038462601601, "train/perplexity": 10.762133147122722, "train/grad_norm": 0.142578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1702235.9166834385, "perf/iters_per_sec": 0.8116893371026223, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2319984436035156, "data/tokens_consumed": 16213082112, "data/tokens_consumed_B": 16.213082112, "train/loss_slope": -4.238310411031494e-05} {"step": 7740, "timestamp": 1778202898.2648437, "train/loss": 2.315558385848999, "train/z_loss": 0.001877058856189251, "train/perplexity": 10.130578111166113, "train/grad_norm": 0.1923828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026885.4950635054, "perf/iters_per_sec": 0.966494319469216, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346672296524049, "data/tokens_consumed": 16234053632, "data/tokens_consumed_B": 16.234053632, "train/loss_slope": -4.673674792596756e-05} {"step": 7750, "timestamp": 1778202908.6081717, "grad/layer_0/attn": 0.0024669182021170855, "grad/layer_0/mlp": 0.0028806361369788647, "grad/layer_0/attn_mlp_ratio": 0.8563796325440675, "grad/layer_4/attn": 0.0015879556303843856, "grad/layer_4/mlp": 0.0030704978853464127, "grad/layer_4/attn_mlp_ratio": 0.5171655014797976, "grad/layer_8/attn": 0.007368528284132481, "grad/layer_8/mlp": 0.004326313268393278, "grad/layer_8/attn_mlp_ratio": 1.7031887560352643, "grad/layer_12/attn": 0.005973074119538069, "grad/layer_12/mlp": 0.005852429196238518, "grad/layer_12/attn_mlp_ratio": 1.0206144862573723, "grad/layer_16/attn": 0.005058628972619772, "grad/layer_16/mlp": 0.004617783706635237, "grad/layer_16/attn_mlp_ratio": 1.0954668266087058, "grad/layer_20/attn": 0.004663768690079451, "grad/layer_20/mlp": 0.00688594114035368, "grad/layer_20/attn_mlp_ratio": 0.6772884820376288, "grad/layer_24/attn": 0.014183446764945984, "grad/layer_24/mlp": 0.014830171130597591, "grad/layer_24/attn_mlp_ratio": 0.9563913015166483, "grad/layer_27/attn": 0.01474788784980774, "grad/layer_27/mlp": 0.012168891727924347, "grad/layer_27/attn_mlp_ratio": 1.2119335152577564} {"step": 7750, "timestamp": 1778202908.6238213, "train/loss": 2.4061504364013673, "train/z_loss": 0.0018596364185214042, "train/perplexity": 11.091182645638137, "train/grad_norm": 0.205078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025528.953073048, "perf/iters_per_sec": 0.9658474698415032, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0353601694107055, "data/tokens_consumed": 16255025152, "data/tokens_consumed_B": 16.255025152, "train/loss_slope": -4.076848731111058e-05} {"step": 7760, "timestamp": 1778202918.9731896, "train/loss": 2.3942312717437746, "train/z_loss": 0.0018615688080899418, "train/perplexity": 10.959769735272914, "train/grad_norm": 0.2421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027591.9760505103, "perf/iters_per_sec": 0.966831195855384, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343067169189453, "data/tokens_consumed": 16275996672, "data/tokens_consumed_B": 16.275996672, "train/loss_slope": -3.751826342111446e-05} {"step": 7770, "timestamp": 1778202929.3301528, "train/loss": 2.4090592861175537, "train/z_loss": 0.0018581520649604498, "train/perplexity": 11.123492198163401, "train/grad_norm": 0.21484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026268.3755806743, "perf/iters_per_sec": 0.9662000539687511, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349823474884032, "data/tokens_consumed": 16296968192, "data/tokens_consumed_B": 16.296968192, "train/loss_slope": -3.539298650133263e-05} {"step": 7780, "timestamp": 1778202939.678987, "train/loss": 2.36579749584198, "train/z_loss": 0.0018677656073123217, "train/perplexity": 10.652530779911126, "train/grad_norm": 0.181640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027697.65646596, "perf/iters_per_sec": 0.9668815882043648, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342528104782105, "data/tokens_consumed": 16317939712, "data/tokens_consumed_B": 16.317939712, "train/loss_slope": -3.7284879200886984e-05} {"step": 7790, "timestamp": 1778202950.036873, "train/loss": 2.417768120765686, "train/z_loss": 0.0018582502612844109, "train/perplexity": 11.22078790359338, "train/grad_norm": 0.1904296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026524.3852878835, "perf/iters_per_sec": 0.9663221289100091, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034851598739624, "data/tokens_consumed": 16338911232, "data/tokens_consumed_B": 16.338911232, "train/loss_slope": -3.55990546621648e-05} {"step": 7800, "timestamp": 1778202960.3847132, "grad/layer_0/attn": 0.0025965941604226828, "grad/layer_0/mlp": 0.0032167979516088963, "grad/layer_0/attn_mlp_ratio": 0.8071983751432529, "grad/layer_4/attn": 0.0016428149538114667, "grad/layer_4/mlp": 0.003093245904892683, "grad/layer_4/attn_mlp_ratio": 0.5310974139182514, "grad/layer_8/attn": 0.006650495808571577, "grad/layer_8/mlp": 0.004692974034696817, "grad/layer_8/attn_mlp_ratio": 1.4171175075102396, "grad/layer_12/attn": 0.004741039127111435, "grad/layer_12/mlp": 0.006113186478614807, "grad/layer_12/attn_mlp_ratio": 0.7755430111844732, "grad/layer_16/attn": 0.005026771686971188, "grad/layer_16/mlp": 0.004571947734802961, "grad/layer_16/attn_mlp_ratio": 1.0994814176806609, "grad/layer_20/attn": 0.004905858542770147, "grad/layer_20/mlp": 0.006878619082272053, "grad/layer_20/attn_mlp_ratio": 0.7132039749218549, "grad/layer_24/attn": 0.004561444278806448, "grad/layer_24/mlp": 0.00999427679926157, "grad/layer_24/attn_mlp_ratio": 0.45640563342241114, "grad/layer_27/attn": 0.004188987892121077, "grad/layer_27/mlp": 0.008967644535005093, "grad/layer_27/attn_mlp_ratio": 0.4671224231800414} {"step": 7800, "timestamp": 1778202961.006633, "eos/sharpness": 17.976427078247067, "eos/L0_probe": 2.2102901935577393, "eos/L_plus": 2.3229618072509766, "eos/L_minus": 2.2773828506469727, "eos/grad_norm": 0.12140610069036484, "eos/embed_grad_frac": 0.23806972801685333, "eos/time_s": 0.6191530227661133} {"step": 7800, "timestamp": 1778202961.0272784, "train/loss": 2.37811758518219, "train/z_loss": 0.0018660035566426814, "train/perplexity": 10.784582686177401, "train/grad_norm": 0.12109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1909360.6026204808, "perf/iters_per_sec": 0.9104540837385563, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0983530282974243, "data/tokens_consumed": 16359882752, "data/tokens_consumed_B": 16.359882752, "train/loss_slope": -3.552309581906521e-05} {"step": 7800, "timestamp": 1778202962.3916478, "geo/rankme_last": 435.077392578125, "geo/layer_0/stable_rank_q_proj": 15.69571304321289, "geo/layer_0/stable_rank_k_proj": 11.961281776428223, "geo/layer_0/stable_rank_o_proj": 56.9130859375, "geo/layer_0/stable_rank_gate_proj": 169.75686645507812, "geo/layer_0/stable_rank_down_proj": 47.801578521728516, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.024379247799515724, "geo/layer_0/attn_entropy_mean": 6.450022220611572, "geo/layer_0/attn_entropy_std": 0.24590130150318146, "geo/layer_7/stable_rank_q_proj": 43.31802749633789, "geo/layer_7/stable_rank_k_proj": 44.433563232421875, "geo/layer_7/stable_rank_o_proj": 104.00396728515625, "geo/layer_7/stable_rank_gate_proj": 139.25868225097656, "geo/layer_7/stable_rank_down_proj": 182.26377868652344, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5893743634223938, "geo/layer_7/attn_entropy_mean": 4.712587356567383, "geo/layer_7/attn_entropy_std": 0.9794947504997253, "geo/layer_14/stable_rank_q_proj": 71.97926330566406, "geo/layer_14/stable_rank_k_proj": 49.62044906616211, "geo/layer_14/stable_rank_o_proj": 48.72071838378906, "geo/layer_14/stable_rank_gate_proj": 126.07096099853516, "geo/layer_14/stable_rank_down_proj": 142.22164916992188, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4093591570854187, "geo/layer_14/attn_entropy_mean": 5.850314617156982, "geo/layer_14/attn_entropy_std": 0.6092836856842041, "geo/layer_21/stable_rank_q_proj": 52.039520263671875, "geo/layer_21/stable_rank_k_proj": 32.697021484375, "geo/layer_21/stable_rank_o_proj": 93.34988403320312, "geo/layer_21/stable_rank_gate_proj": 118.67884826660156, "geo/layer_21/stable_rank_down_proj": 81.12041473388672, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1622104048728943, "geo/layer_21/attn_entropy_mean": 5.862915992736816, "geo/layer_21/attn_entropy_std": 0.3017323911190033, "geo/layer_27/stable_rank_q_proj": 43.29764175415039, "geo/layer_27/stable_rank_k_proj": 34.32935333251953, "geo/layer_27/stable_rank_o_proj": 111.0755615234375, "geo/layer_27/stable_rank_gate_proj": 93.70207977294922, "geo/layer_27/stable_rank_down_proj": 161.74070739746094, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.06222693994641304, "geo/layer_27/attn_entropy_mean": 4.565284729003906, "geo/layer_27/attn_entropy_std": 0.4905562996864319, "attnres/final_alpha/block_0": 0.2461424171924591, "attnres/block_norm/0": 1.3100090026855469, "attnres/final_alpha/block_1": 0.011168131604790688, "attnres/block_norm/1": 13282.609375, "attnres/final_alpha/block_2": 0.023556608706712723, "attnres/block_norm/2": 9755.677734375, "attnres/final_alpha/block_3": 0.02300066314637661, "attnres/block_norm/3": 10149.1171875, "attnres/final_alpha/block_4": 0.03441938757896423, "attnres/block_norm/4": 4462.01220703125, "attnres/final_alpha/block_5": 0.47419482469558716, "attnres/block_norm/5": 3543.72314453125, "attnres/final_alpha/block_6": 0.1875179409980774, "attnres/block_norm/6": 7637.2734375, "geo/tier1_time_s": 1.3608927726745605, "geo/step": 7800.0, "geo/rankme_slope": 0.009559378790578731} {"step": 7810, "timestamp": 1778202973.3120687, "train/loss": 2.434493827819824, "train/z_loss": 0.0018498084275051952, "train/perplexity": 11.410041805698619, "train/grad_norm": 0.1767578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1707559.5658379649, "perf/iters_per_sec": 0.8142278508367371, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2281574487686158, "data/tokens_consumed": 16380854272, "data/tokens_consumed_B": 16.380854272, "train/loss_slope": -3.1465429348377615e-05} {"step": 7820, "timestamp": 1778202983.6739285, "train/loss": 2.399606776237488, "train/z_loss": 0.0018518956727348267, "train/perplexity": 11.018842657867602, "train/grad_norm": 0.11767578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025170.844287467, "perf/iters_per_sec": 0.9656767102658591, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0355432510375977, "data/tokens_consumed": 16401825792, "data/tokens_consumed_B": 16.401825792, "train/loss_slope": -3.124063706705596e-05} {"step": 7830, "timestamp": 1778202994.032559, "train/loss": 2.4068813562393188, "train/z_loss": 0.0018624237272888422, "train/perplexity": 11.099292374479974, "train/grad_norm": 0.22265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025578.8157388482, "perf/iters_per_sec": 0.9658712462133637, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0353346824645997, "data/tokens_consumed": 16422797312, "data/tokens_consumed_B": 16.422797312, "train/loss_slope": -2.946926057428126e-05} {"step": 7840, "timestamp": 1778203004.3900244, "train/loss": 2.446508455276489, "train/z_loss": 0.0018390049459412693, "train/perplexity": 11.547956042342205, "train/grad_norm": 0.1220703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025795.6932508852, "perf/iters_per_sec": 0.9659746614698816, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352238416671753, "data/tokens_consumed": 16443768832, "data/tokens_consumed_B": 16.443768832, "train/loss_slope": -2.5574022865924868e-05} {"step": 7850, "timestamp": 1778203014.735708, "grad/layer_0/attn": 0.0025505146477371454, "grad/layer_0/mlp": 0.0027736122719943523, "grad/layer_0/attn_mlp_ratio": 0.919564202081782, "grad/layer_4/attn": 0.0019161667441949248, "grad/layer_4/mlp": 0.0030298044439405203, "grad/layer_4/attn_mlp_ratio": 0.6324390621260294, "grad/layer_8/attn": 0.007025725208222866, "grad/layer_8/mlp": 0.004438676871359348, "grad/layer_8/attn_mlp_ratio": 1.582842196797942, "grad/layer_12/attn": 0.004524463787674904, "grad/layer_12/mlp": 0.005829830188304186, "grad/layer_12/attn_mlp_ratio": 0.7760884217765122, "grad/layer_16/attn": 0.00492291571572423, "grad/layer_16/mlp": 0.004830833990126848, "grad/layer_16/attn_mlp_ratio": 1.0190612270840715, "grad/layer_20/attn": 0.005969291087239981, "grad/layer_20/mlp": 0.00772869260981679, "grad/layer_20/attn_mlp_ratio": 0.7723545638783049, "grad/layer_24/attn": 0.014654413796961308, "grad/layer_24/mlp": 0.01499346736818552, "grad/layer_24/attn_mlp_ratio": 0.9773865737232801, "grad/layer_27/attn": 0.0139224324375391, "grad/layer_27/mlp": 0.013337364420294762, "grad/layer_27/attn_mlp_ratio": 1.0438668311384962} {"step": 7850, "timestamp": 1778203014.751909, "train/loss": 2.4138972759246826, "train/z_loss": 0.001845351606607437, "train/perplexity": 11.177437929264636, "train/grad_norm": 0.1904296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025444.9523935665, "perf/iters_per_sec": 0.965807415196212, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354031085968018, "data/tokens_consumed": 16464740352, "data/tokens_consumed_B": 16.464740352, "train/loss_slope": -2.529774639222341e-05} {"step": 7860, "timestamp": 1778203025.1122618, "train/loss": 2.397905969619751, "train/z_loss": 0.001850433717481792, "train/perplexity": 11.000117665664506, "train/grad_norm": 0.1826171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025318.8944869058, "perf/iters_per_sec": 0.9657473061022309, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035467553138733, "data/tokens_consumed": 16485711872, "data/tokens_consumed_B": 16.485711872, "train/loss_slope": -2.6249006307414767e-05} {"step": 7870, "timestamp": 1778203035.4727826, "train/loss": 2.3617820024490355, "train/z_loss": 0.0018655704334378242, "train/perplexity": 10.609841379809257, "train/grad_norm": 0.2041015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025686.199057444, "perf/iters_per_sec": 0.9659224505698414, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352797985076905, "data/tokens_consumed": 16506683392, "data/tokens_consumed_B": 16.506683392, "train/loss_slope": -2.6659303594200642e-05} {"step": 7875, "timestamp": 1778203041.246523, "eos/sharpness": 15.193414688110348, "eos/L0_probe": 2.2113616466522217, "eos/L_plus": 2.3095736503601074, "eos/L_minus": 2.2650837898254395, "eos/grad_norm": 0.13868707418441772, "eos/embed_grad_frac": 0.15591715276241302, "eos/time_s": 0.6098287105560303} {"step": 7875, "timestamp": 1778203042.6215155, "geo/rankme_last": 436.3802795410156, "geo/layer_0/stable_rank_q_proj": 15.672717094421387, "geo/layer_0/stable_rank_k_proj": 11.99058723449707, "geo/layer_0/stable_rank_o_proj": 57.01535415649414, "geo/layer_0/stable_rank_gate_proj": 170.51625061035156, "geo/layer_0/stable_rank_down_proj": 47.838226318359375, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.033206477761268616, "geo/layer_0/attn_entropy_mean": 6.4439377784729, "geo/layer_0/attn_entropy_std": 0.2536696195602417, "geo/layer_7/stable_rank_q_proj": 43.273887634277344, "geo/layer_7/stable_rank_k_proj": 44.53052520751953, "geo/layer_7/stable_rank_o_proj": 104.0860366821289, "geo/layer_7/stable_rank_gate_proj": 138.88026428222656, "geo/layer_7/stable_rank_down_proj": 181.5343475341797, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5969515442848206, "geo/layer_7/attn_entropy_mean": 4.674922466278076, "geo/layer_7/attn_entropy_std": 0.9369862675666809, "geo/layer_14/stable_rank_q_proj": 71.67545318603516, "geo/layer_14/stable_rank_k_proj": 49.19167709350586, "geo/layer_14/stable_rank_o_proj": 48.68106460571289, "geo/layer_14/stable_rank_gate_proj": 125.78117370605469, "geo/layer_14/stable_rank_down_proj": 142.37326049804688, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38922804594039917, "geo/layer_14/attn_entropy_mean": 5.815666198730469, "geo/layer_14/attn_entropy_std": 0.587096095085144, "geo/layer_21/stable_rank_q_proj": 51.94789505004883, "geo/layer_21/stable_rank_k_proj": 32.615760803222656, "geo/layer_21/stable_rank_o_proj": 93.4348373413086, "geo/layer_21/stable_rank_gate_proj": 117.90830993652344, "geo/layer_21/stable_rank_down_proj": 80.71015930175781, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1621868759393692, "geo/layer_21/attn_entropy_mean": 5.824207305908203, "geo/layer_21/attn_entropy_std": 0.30275315046310425, "geo/layer_27/stable_rank_q_proj": 43.28299331665039, "geo/layer_27/stable_rank_k_proj": 34.316898345947266, "geo/layer_27/stable_rank_o_proj": 111.33824157714844, "geo/layer_27/stable_rank_gate_proj": 94.17915344238281, "geo/layer_27/stable_rank_down_proj": 161.3340606689453, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.06354784220457077, "geo/layer_27/attn_entropy_mean": 4.548034191131592, "geo/layer_27/attn_entropy_std": 0.5075741410255432, "attnres/final_alpha/block_0": 0.24683699011802673, "attnres/block_norm/0": 1.3137227296829224, "attnres/final_alpha/block_1": 0.011356398463249207, "attnres/block_norm/1": 13462.705078125, "attnres/final_alpha/block_2": 0.023688053712248802, "attnres/block_norm/2": 9843.806640625, "attnres/final_alpha/block_3": 0.02320254035294056, "attnres/block_norm/3": 10227.150390625, "attnres/final_alpha/block_4": 0.0337165892124176, "attnres/block_norm/4": 4498.6435546875, "attnres/final_alpha/block_5": 0.47255271673202515, "attnres/block_norm/5": 3537.8974609375, "attnres/final_alpha/block_6": 0.18864670395851135, "attnres/block_norm/6": 7634.8486328125, "geo/tier1_time_s": 1.354057788848877, "geo/step": 7875.0, "geo/rankme_slope": 0.009160179989964736} {"step": 7880, "timestamp": 1778203047.8043206, "train/loss": 2.3851764678955076, "train/z_loss": 0.001862209988757968, "train/perplexity": 10.860979109946717, "train/grad_norm": 0.26171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1701353.4963785016, "perf/iters_per_sec": 0.8112685663120754, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2326374292373656, "data/tokens_consumed": 16527654912, "data/tokens_consumed_B": 16.527654912, "train/loss_slope": -2.9203926147085514e-05} {"step": 7890, "timestamp": 1778203058.1643567, "train/loss": 2.410243821144104, "train/z_loss": 0.0018622120027430356, "train/perplexity": 11.136676171187043, "train/grad_norm": 0.244140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025111.7235157741, "perf/iters_per_sec": 0.9656485192850943, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0355734825134277, "data/tokens_consumed": 16548626432, "data/tokens_consumed_B": 16.548626432, "train/loss_slope": -2.7737811260526664e-05} {"step": 7900, "timestamp": 1778203068.5043209, "grad/layer_0/attn": 0.003794243559241295, "grad/layer_0/mlp": 0.0038403922226279974, "grad/layer_0/attn_mlp_ratio": 0.9879833205803508, "grad/layer_4/attn": 0.0020322271157056093, "grad/layer_4/mlp": 0.003219697391614318, "grad/layer_4/attn_mlp_ratio": 0.6311857312677767, "grad/layer_8/attn": 0.0103691341355443, "grad/layer_8/mlp": 0.004589821211993694, "grad/layer_8/attn_mlp_ratio": 2.259158566467205, "grad/layer_12/attn": 0.0041462210938334465, "grad/layer_12/mlp": 0.0055238232016563416, "grad/layer_12/attn_mlp_ratio": 0.7506071189116758, "grad/layer_16/attn": 0.006019929423928261, "grad/layer_16/mlp": 0.00518816290423274, "grad/layer_16/attn_mlp_ratio": 1.1603200244512224, "grad/layer_20/attn": 0.006756015587598085, "grad/layer_20/mlp": 0.008716699667274952, "grad/layer_20/attn_mlp_ratio": 0.7750657666290344, "grad/layer_24/attn": 0.02243579365313053, "grad/layer_24/mlp": 0.014003206975758076, "grad/layer_24/attn_mlp_ratio": 1.6021896649640133, "grad/layer_27/attn": 0.013498576357960701, "grad/layer_27/mlp": 0.013000953011214733, "grad/layer_27/attn_mlp_ratio": 1.0382759050424322} {"step": 7900, "timestamp": 1778203068.52101, "train/loss": 2.340709185600281, "train/z_loss": 0.0018711376236751676, "train/perplexity": 10.388601397423924, "train/grad_norm": 0.296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025856.953580959, "perf/iters_per_sec": 0.9660038726715846, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351925373077393, "data/tokens_consumed": 16569597952, "data/tokens_consumed_B": 16.569597952, "train/loss_slope": -2.921193038741279e-05} {"step": 7910, "timestamp": 1778203078.8799458, "train/loss": 2.3610380411148073, "train/z_loss": 0.001863708149176091, "train/perplexity": 10.601951003491761, "train/grad_norm": 0.2060546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025516.732663288, "perf/iters_per_sec": 0.9658416426960411, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035366415977478, "data/tokens_consumed": 16590569472, "data/tokens_consumed_B": 16.590569472, "train/loss_slope": -3.127139692652732e-05} {"step": 7920, "timestamp": 1778203089.239367, "train/loss": 2.3860655307769774, "train/z_loss": 0.0018536189338192344, "train/perplexity": 10.870639497039258, "train/grad_norm": 0.1279296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025834.3246192902, "perf/iters_per_sec": 0.9659930823418094, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352041006088257, "data/tokens_consumed": 16611540992, "data/tokens_consumed_B": 16.611540992, "train/loss_slope": -2.7707095007405678e-05} {"step": 7930, "timestamp": 1778203099.6002347, "train/loss": 2.389477038383484, "train/z_loss": 0.0018615378765389323, "train/perplexity": 10.907788096707032, "train/grad_norm": 0.166015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025512.3949197112, "perf/iters_per_sec": 0.9658395742987209, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0353686332702636, "data/tokens_consumed": 16632512512, "data/tokens_consumed_B": 16.632512512, "train/loss_slope": -2.8689686700527326e-05} {"step": 7940, "timestamp": 1778203109.9570227, "train/loss": 2.418750357627869, "train/z_loss": 0.0018467735149897634, "train/perplexity": 11.231814789715484, "train/grad_norm": 0.1435546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026388.4362652977, "perf/iters_per_sec": 0.9662573033644188, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349210262298585, "data/tokens_consumed": 16653484032, "data/tokens_consumed_B": 16.653484032, "train/loss_slope": -2.8533589271250074e-05} {"step": 7950, "timestamp": 1778203120.3017402, "grad/layer_0/attn": 0.003112168749794364, "grad/layer_0/mlp": 0.0036162612959742546, "grad/layer_0/attn_mlp_ratio": 0.8606039246109078, "grad/layer_4/attn": 0.0017268514493480325, "grad/layer_4/mlp": 0.003126170951873064, "grad/layer_4/attn_mlp_ratio": 0.5523854647407662, "grad/layer_8/attn": 0.009752605110406876, "grad/layer_8/mlp": 0.0046633766032755375, "grad/layer_8/attn_mlp_ratio": 2.0913183152364003, "grad/layer_12/attn": 0.005640882067382336, "grad/layer_12/mlp": 0.006236530840396881, "grad/layer_12/attn_mlp_ratio": 0.9044903522956563, "grad/layer_16/attn": 0.006192096974700689, "grad/layer_16/mlp": 0.005133489146828651, "grad/layer_16/attn_mlp_ratio": 1.2062160213009157, "grad/layer_20/attn": 0.00620003929361701, "grad/layer_20/mlp": 0.008009248413145542, "grad/layer_20/attn_mlp_ratio": 0.774109990898761, "grad/layer_24/attn": 0.015534751117229462, "grad/layer_24/mlp": 0.014210499823093414, "grad/layer_24/attn_mlp_ratio": 1.0931882200698664, "grad/layer_27/attn": 0.016709618270397186, "grad/layer_27/mlp": 0.013618268072605133, "grad/layer_27/attn_mlp_ratio": 1.2270002366388038} {"step": 7950, "timestamp": 1778203120.924823, "eos/sharpness": 22.46162891387939, "eos/L0_probe": 2.2034637928009033, "eos/L_plus": 2.272759437561035, "eos/L_minus": 2.3587844371795654, "eos/grad_norm": 0.17121335864067078, "eos/embed_grad_frac": 0.11703193932771683, "eos/time_s": 0.6199951171875} {"step": 7950, "timestamp": 1778203120.9465003, "train/loss": 2.439184236526489, "train/z_loss": 0.0018399989930912852, "train/perplexity": 11.463685271671384, "train/grad_norm": 0.1708984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1909486.1518147872, "perf/iters_per_sec": 0.910513950259584, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0982808113098144, "data/tokens_consumed": 16674455552, "data/tokens_consumed_B": 16.674455552, "train/loss_slope": -2.691132226149094e-05} {"step": 7950, "timestamp": 1778203122.3117986, "geo/rankme_last": 435.92816162109375, "geo/layer_0/stable_rank_q_proj": 15.617687225341797, "geo/layer_0/stable_rank_k_proj": 12.019399642944336, "geo/layer_0/stable_rank_o_proj": 56.81481170654297, "geo/layer_0/stable_rank_gate_proj": 170.35601806640625, "geo/layer_0/stable_rank_down_proj": 47.92981719970703, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0320366770029068, "geo/layer_0/attn_entropy_mean": 6.445343971252441, "geo/layer_0/attn_entropy_std": 0.2593454420566559, "geo/layer_7/stable_rank_q_proj": 43.37179946899414, "geo/layer_7/stable_rank_k_proj": 44.80596160888672, "geo/layer_7/stable_rank_o_proj": 104.31156158447266, "geo/layer_7/stable_rank_gate_proj": 137.85682678222656, "geo/layer_7/stable_rank_down_proj": 181.0436553955078, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.6075016260147095, "geo/layer_7/attn_entropy_mean": 4.682683944702148, "geo/layer_7/attn_entropy_std": 0.9694734215736389, "geo/layer_14/stable_rank_q_proj": 70.75411987304688, "geo/layer_14/stable_rank_k_proj": 49.04816818237305, "geo/layer_14/stable_rank_o_proj": 48.64215087890625, "geo/layer_14/stable_rank_gate_proj": 125.13115692138672, "geo/layer_14/stable_rank_down_proj": 142.87155151367188, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4127558767795563, "geo/layer_14/attn_entropy_mean": 5.827859401702881, "geo/layer_14/attn_entropy_std": 0.6083599328994751, "geo/layer_21/stable_rank_q_proj": 52.02110290527344, "geo/layer_21/stable_rank_k_proj": 32.660545349121094, "geo/layer_21/stable_rank_o_proj": 93.71455383300781, "geo/layer_21/stable_rank_gate_proj": 117.84468078613281, "geo/layer_21/stable_rank_down_proj": 80.22286987304688, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.16111716628074646, "geo/layer_21/attn_entropy_mean": 5.826035499572754, "geo/layer_21/attn_entropy_std": 0.30041810870170593, "geo/layer_27/stable_rank_q_proj": 43.40995788574219, "geo/layer_27/stable_rank_k_proj": 34.28904342651367, "geo/layer_27/stable_rank_o_proj": 111.3260269165039, "geo/layer_27/stable_rank_gate_proj": 94.10618591308594, "geo/layer_27/stable_rank_down_proj": 161.12100219726562, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.06248081102967262, "geo/layer_27/attn_entropy_mean": 4.5609965324401855, "geo/layer_27/attn_entropy_std": 0.5053826570510864, "attnres/final_alpha/block_0": 0.24665996432304382, "attnres/block_norm/0": 1.3175914287567139, "attnres/final_alpha/block_1": 0.01114130299538374, "attnres/block_norm/1": 13609.646484375, "attnres/final_alpha/block_2": 0.023139983415603638, "attnres/block_norm/2": 10050.587890625, "attnres/final_alpha/block_3": 0.022839538753032684, "attnres/block_norm/3": 10394.7373046875, "attnres/final_alpha/block_4": 0.03398614004254341, "attnres/block_norm/4": 4540.0595703125, "attnres/final_alpha/block_5": 0.47406160831451416, "attnres/block_norm/5": 3568.28173828125, "attnres/final_alpha/block_6": 0.18817147612571716, "attnres/block_norm/6": 7803.16357421875, "geo/tier1_time_s": 1.3613076210021973, "geo/step": 7950.0, "geo/rankme_slope": 0.00875959372030062} {"step": 7960, "timestamp": 1778203132.6676817, "train/loss": 2.3711589336395265, "train/z_loss": 0.0018548413296230136, "train/perplexity": 10.709797038639545, "train/grad_norm": 0.1953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1789770.1236336362, "perf/iters_per_sec": 0.8534288995903188, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1717437744140624, "data/tokens_consumed": 16695427072, "data/tokens_consumed_B": 16.695427072, "train/loss_slope": -2.7489195419366416e-05} {"step": 7970, "timestamp": 1778203143.0259569, "train/loss": 2.3801467180252076, "train/z_loss": 0.0018506813095882535, "train/perplexity": 10.806488254241838, "train/grad_norm": 0.20703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025830.6387125875, "perf/iters_per_sec": 0.9659913247645319, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352059841156005, "data/tokens_consumed": 16716398592, "data/tokens_consumed_B": 16.716398592, "train/loss_slope": -2.5481464166810008e-05} {"step": 7980, "timestamp": 1778203153.3876183, "train/loss": 2.419628071784973, "train/z_loss": 0.0018435952020809054, "train/perplexity": 11.241677440227212, "train/grad_norm": 0.1630859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025122.4004344344, "perf/iters_per_sec": 0.9656536104366467, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0355680227279662, "data/tokens_consumed": 16737370112, "data/tokens_consumed_B": 16.737370112, "train/loss_slope": -2.405804929667462e-05} {"step": 7990, "timestamp": 1778203163.7418344, "train/loss": 2.384373331069946, "train/z_loss": 0.001860779954586178, "train/perplexity": 10.852259759546252, "train/grad_norm": 0.1728515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026523.4515098569, "perf/iters_per_sec": 0.9663216836499485, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348520755767823, "data/tokens_consumed": 16758341632, "data/tokens_consumed_B": 16.758341632, "train/loss_slope": -2.4240136439829563e-05} {"step": 8000, "timestamp": 1778203174.084322, "grad/layer_0/attn": 0.003096414264291525, "grad/layer_0/mlp": 0.0033072978258132935, "grad/layer_0/attn_mlp_ratio": 0.936236871835515, "grad/layer_4/attn": 0.0019278890686109662, "grad/layer_4/mlp": 0.0031450639944523573, "grad/layer_4/attn_mlp_ratio": 0.6129888010904481, "grad/layer_8/attn": 0.005925105884671211, "grad/layer_8/mlp": 0.004404573701322079, "grad/layer_8/attn_mlp_ratio": 1.3452166207074858, "grad/layer_12/attn": 0.004795370157808065, "grad/layer_12/mlp": 0.005838342942297459, "grad/layer_12/attn_mlp_ratio": 0.8213580673603292, "grad/layer_16/attn": 0.006194394547492266, "grad/layer_16/mlp": 0.0051157549023628235, "grad/layer_16/attn_mlp_ratio": 1.2108465993057231, "grad/layer_20/attn": 0.007458460982888937, "grad/layer_20/mlp": 0.008449511602520943, "grad/layer_20/attn_mlp_ratio": 0.8827091133164154, "grad/layer_24/attn": 0.025073833763599396, "grad/layer_24/mlp": 0.019049694761633873, "grad/layer_24/attn_mlp_ratio": 1.3162328292249001, "grad/layer_27/attn": 0.007354542147368193, "grad/layer_27/mlp": 0.01786205917596817, "grad/layer_27/attn_mlp_ratio": 0.41174099994523483} {"step": 8000, "timestamp": 1778203174.1005993, "train/loss": 2.3705375671386717, "train/z_loss": 0.0018592883832752704, "train/perplexity": 10.703144396607279, "train/grad_norm": 0.275390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025568.2272967487, "perf/iters_per_sec": 0.9658661972507232, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0353400945663451, "data/tokens_consumed": 16779313152, "data/tokens_consumed_B": 16.779313152, "train/loss_slope": -2.738268281688855e-05} {"step": 8000, "timestamp": 1778203181.2358735, "geo/ww_alpha_mean": 8.36927966522882, "geo/ww_alpha_std": 4.689086988893536, "geo/ww_alpha_min": 2.225555877127917, "geo/ww_alpha_max": 22.952555760597104, "geo/ww_alpha_healthy_frac": 0.13705583756345177, "geo/ww_alpha_by_type/q_proj": 4.493951860225157, "geo/ww_alpha_by_type/k_proj": 5.378028808787694, "geo/ww_alpha_by_type/v_proj": 7.525008498166889, "geo/ww_alpha_by_type/o_proj": 8.471597966705893, "geo/ww_alpha_by_type/gate_proj": 10.841243184197, "geo/ww_alpha_by_type/up_proj": 12.659179815550747, "geo/ww_alpha_by_type/down_proj": 9.435366229686261, "geo/twonn_id/layer_0": 0.7144250273704529, "geo/twonn_id/layer_7": 2.73948335647583, "geo/twonn_id/layer_14": 4.037630081176758, "geo/twonn_id/layer_21": 7.399240016937256, "geo/twonn_id/layer_27": 5.346835613250732, "geo/tier2_time_s": 7.127718925476074} {"step": 8000, "timestamp": 1778203181.8826997, "eoc/jacobian_sigma/layer_0/attn": 516.0515747070312, "eoc/jacobian_sigma/layer_0/mlp": 2250.6572265625, "eoc/jacobian_sigma/layer_0": 2250.6572265625, "eoc/jacobian_sigma/layer_7/attn": 1.1494463682174683, "eoc/jacobian_sigma/layer_7/mlp": 1.6593834161758423, "eoc/jacobian_sigma/layer_7": 1.6593834161758423, "eoc/jacobian_sigma/layer_14/attn": 1.2522343397140503, "eoc/jacobian_sigma/layer_14/mlp": 4.606714725494385, "eoc/jacobian_sigma/layer_14": 4.606714725494385, "eoc/jacobian_sigma/layer_21/attn": 1.0818349123001099, "eoc/jacobian_sigma/layer_21/mlp": 2.987978935241699, "eoc/jacobian_sigma/layer_21": 2.987978935241699, "eoc/jacobian_sigma/layer_27/attn": 1.9437841176986694, "eoc/jacobian_sigma/layer_27/mlp": 14.939826011657715, "eoc/jacobian_sigma/layer_27": 14.939826011657715, "eoc/layer0_sigma": 2250.6572265625, "eoc/sigma_max": 14.939826011657715, "eoc/sigma_min": 1.6593834161758423, "eoc/sigma_mean": 6.04847577214241, "eoc/time_s": 0.6393859386444092} {"step": 8010, "timestamp": 1778203192.2630858, "train/loss": 2.36683030128479, "train/z_loss": 0.0018577449722215534, "train/perplexity": 10.663538455095264, "train/grad_norm": 0.22265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1155066.8036389472, "perf/iters_per_sec": 0.5507787721819626, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.8156110048294067, "data/tokens_consumed": 16800284672, "data/tokens_consumed_B": 16.800284672, "train/loss_slope": -2.8618226092819535e-05} {"step": 8020, "timestamp": 1778203202.6207075, "train/loss": 2.3678639411926268, "train/z_loss": 0.001854036212898791, "train/perplexity": 10.674566412487765, "train/grad_norm": 0.20703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025942.2949763357, "perf/iters_per_sec": 0.9660445666200331, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351489305496215, "data/tokens_consumed": 16821256192, "data/tokens_consumed_B": 16.821256192, "train/loss_slope": -2.99626584887111e-05} {"step": 8025, "timestamp": 1778203208.5047724, "eos/sharpness": 52.90827751159667, "eos/L0_probe": 2.2008750438690186, "eos/L_plus": 2.3549771308898926, "eos/L_minus": 2.5758557319641113, "eos/grad_norm": 0.18881534039974213, "eos/embed_grad_frac": 0.07309535145759583, "eos/time_s": 0.6867599487304688} {"step": 8025, "timestamp": 1778203209.885059, "geo/rankme_last": 437.0120849609375, "geo/layer_0/stable_rank_q_proj": 15.554676055908203, "geo/layer_0/stable_rank_k_proj": 12.051294326782227, "geo/layer_0/stable_rank_o_proj": 56.64949035644531, "geo/layer_0/stable_rank_gate_proj": 170.0380859375, "geo/layer_0/stable_rank_down_proj": 47.94145584106445, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.041501257568597794, "geo/layer_0/attn_entropy_mean": 6.441802501678467, "geo/layer_0/attn_entropy_std": 0.25491613149642944, "geo/layer_7/stable_rank_q_proj": 43.50019836425781, "geo/layer_7/stable_rank_k_proj": 44.8233757019043, "geo/layer_7/stable_rank_o_proj": 104.38021850585938, "geo/layer_7/stable_rank_gate_proj": 137.007080078125, "geo/layer_7/stable_rank_down_proj": 180.73992919921875, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5868091583251953, "geo/layer_7/attn_entropy_mean": 4.731780052185059, "geo/layer_7/attn_entropy_std": 0.9625751972198486, "geo/layer_14/stable_rank_q_proj": 70.29467010498047, "geo/layer_14/stable_rank_k_proj": 48.975128173828125, "geo/layer_14/stable_rank_o_proj": 48.600791931152344, "geo/layer_14/stable_rank_gate_proj": 124.51123046875, "geo/layer_14/stable_rank_down_proj": 142.3268585205078, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.404559463262558, "geo/layer_14/attn_entropy_mean": 5.824407577514648, "geo/layer_14/attn_entropy_std": 0.6021766066551208, "geo/layer_21/stable_rank_q_proj": 52.22916793823242, "geo/layer_21/stable_rank_k_proj": 32.59215545654297, "geo/layer_21/stable_rank_o_proj": 93.85342407226562, "geo/layer_21/stable_rank_gate_proj": 117.59444427490234, "geo/layer_21/stable_rank_down_proj": 80.00592803955078, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.16524985432624817, "geo/layer_21/attn_entropy_mean": 5.836339473724365, "geo/layer_21/attn_entropy_std": 0.3077504634857178, "geo/layer_27/stable_rank_q_proj": 43.44240188598633, "geo/layer_27/stable_rank_k_proj": 34.30220031738281, "geo/layer_27/stable_rank_o_proj": 111.46826934814453, "geo/layer_27/stable_rank_gate_proj": 94.02576446533203, "geo/layer_27/stable_rank_down_proj": 160.981689453125, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.054995663464069366, "geo/layer_27/attn_entropy_mean": 4.560997486114502, "geo/layer_27/attn_entropy_std": 0.48830410838127136, "attnres/final_alpha/block_0": 0.24940113723278046, "attnres/block_norm/0": 1.321608066558838, "attnres/final_alpha/block_1": 0.011235794052481651, "attnres/block_norm/1": 13760.548828125, "attnres/final_alpha/block_2": 0.023355290293693542, "attnres/block_norm/2": 10083.6708984375, "attnres/final_alpha/block_3": 0.02278003841638565, "attnres/block_norm/3": 10481.1474609375, "attnres/final_alpha/block_4": 0.03421223163604736, "attnres/block_norm/4": 4596.1298828125, "attnres/final_alpha/block_5": 0.46871763467788696, "attnres/block_norm/5": 3582.171630859375, "attnres/final_alpha/block_6": 0.19029784202575684, "attnres/block_norm/6": 7891.6201171875, "geo/tier1_time_s": 1.3568711280822754, "geo/step": 8025.0, "geo/rankme_slope": 0.008453933878238796} {"step": 8030, "timestamp": 1778203215.064758, "train/loss": 2.3895264863967896, "train/z_loss": 0.0018515127128921449, "train/perplexity": 10.908327478493543, "train/grad_norm": 0.2060546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1686738.9267379905, "perf/iters_per_sec": 0.8042997964563324, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2433174848556519, "data/tokens_consumed": 16842227712, "data/tokens_consumed_B": 16.842227712, "train/loss_slope": -3.2463728111855365e-05} {"step": 8040, "timestamp": 1778203225.424271, "train/loss": 2.3674377679824827, "train/z_loss": 0.0018599926843307912, "train/perplexity": 10.670018167491785, "train/grad_norm": 0.10888671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025951.954078171, "perf/iters_per_sec": 0.966049172438703, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035143995285034, "data/tokens_consumed": 16863199232, "data/tokens_consumed_B": 16.863199232, "train/loss_slope": -2.9577231314172977e-05} {"step": 8050, "timestamp": 1778203235.7690008, "grad/layer_0/attn": 0.002791374921798706, "grad/layer_0/mlp": 0.0030376636423170567, "grad/layer_0/attn_mlp_ratio": 0.9189216314210975, "grad/layer_4/attn": 0.0016869904939085245, "grad/layer_4/mlp": 0.002955107484012842, "grad/layer_4/attn_mlp_ratio": 0.5708727841365769, "grad/layer_8/attn": 0.011856663040816784, "grad/layer_8/mlp": 0.004504625219851732, "grad/layer_8/attn_mlp_ratio": 2.632108599257938, "grad/layer_12/attn": 0.0045212688855826855, "grad/layer_12/mlp": 0.005441776942461729, "grad/layer_12/attn_mlp_ratio": 0.8308441985593318, "grad/layer_16/attn": 0.004655183292925358, "grad/layer_16/mlp": 0.004662511870265007, "grad/layer_16/attn_mlp_ratio": 0.9984281697534747, "grad/layer_20/attn": 0.006918774917721748, "grad/layer_20/mlp": 0.006662626285105944, "grad/layer_20/attn_mlp_ratio": 1.0384455795372851, "grad/layer_24/attn": 0.007038084790110588, "grad/layer_24/mlp": 0.01123152207583189, "grad/layer_24/attn_mlp_ratio": 0.6266367710384987, "grad/layer_27/attn": 0.012422965839505196, "grad/layer_27/mlp": 0.010181259363889694, "grad/layer_27/attn_mlp_ratio": 1.2201796726197045} {"step": 8050, "timestamp": 1778203235.784697, "train/loss": 2.3882514953613283, "train/z_loss": 0.0018492628820240497, "train/perplexity": 10.894428321279888, "train/grad_norm": 0.140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025360.1658806775, "perf/iters_per_sec": 0.9657669858363521, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354464530944825, "data/tokens_consumed": 16884170752, "data/tokens_consumed_B": 16.884170752, "train/loss_slope": -2.8269528255353905e-05} {"step": 8060, "timestamp": 1778203246.136093, "train/loss": 2.3924927711486816, "train/z_loss": 0.001848345878534019, "train/perplexity": 10.940732721790457, "train/grad_norm": 0.28515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027195.4805837127, "perf/iters_per_sec": 0.9666421320837558, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034509015083313, "data/tokens_consumed": 16905142272, "data/tokens_consumed_B": 16.905142272, "train/loss_slope": -2.5831319489637858e-05} {"step": 8070, "timestamp": 1778203256.4963787, "train/loss": 2.4207396268844605, "train/z_loss": 0.0018411740427836775, "train/perplexity": 11.25418013154052, "train/grad_norm": 0.2138671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025695.8090666938, "perf/iters_per_sec": 0.9659270329793423, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352748870849608, "data/tokens_consumed": 16926113792, "data/tokens_consumed_B": 16.926113792, "train/loss_slope": -2.242701948493797e-05} {"step": 8080, "timestamp": 1778203266.847337, "train/loss": 2.3548577547073366, "train/z_loss": 0.0018560696858912706, "train/perplexity": 10.536629969079156, "train/grad_norm": 0.11572265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027206.9737321143, "perf/iters_per_sec": 0.9666476124439785, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345031499862671, "data/tokens_consumed": 16947085312, "data/tokens_consumed_B": 16.947085312, "train/loss_slope": -2.4638781274291325e-05} {"step": 8090, "timestamp": 1778203277.2128534, "train/loss": 2.362760376930237, "train/z_loss": 0.001858273078687489, "train/perplexity": 10.620226857479594, "train/grad_norm": 0.2021484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024183.5418469633, "perf/iters_per_sec": 0.9652059277758424, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0360483407974244, "data/tokens_consumed": 16968056832, "data/tokens_consumed_B": 16.968056832, "train/loss_slope": -2.4882893271893977e-05} {"step": 8100, "timestamp": 1778203287.5652502, "grad/layer_0/attn": 0.002973171416670084, "grad/layer_0/mlp": 0.0032558110542595387, "grad/layer_0/attn_mlp_ratio": 0.9131891488178974, "grad/layer_4/attn": 0.0015242011286318302, "grad/layer_4/mlp": 0.002999580930918455, "grad/layer_4/attn_mlp_ratio": 0.508138007582055, "grad/layer_8/attn": 0.009871117770671844, "grad/layer_8/mlp": 0.0044495598413050175, "grad/layer_8/attn_mlp_ratio": 2.218448093942686, "grad/layer_12/attn": 0.004279009532183409, "grad/layer_12/mlp": 0.00561893917620182, "grad/layer_12/attn_mlp_ratio": 0.7615333289516967, "grad/layer_16/attn": 0.006598968058824539, "grad/layer_16/mlp": 0.004570472054183483, "grad/layer_16/attn_mlp_ratio": 1.4438263348315812, "grad/layer_20/attn": 0.006200115196406841, "grad/layer_20/mlp": 0.007089925929903984, "grad/layer_20/attn_mlp_ratio": 0.8744964574039156, "grad/layer_24/attn": 0.011089488863945007, "grad/layer_24/mlp": 0.01111422386020422, "grad/layer_24/attn_mlp_ratio": 0.9977744648346318, "grad/layer_27/attn": 0.0047528394497931, "grad/layer_27/mlp": 0.011207391507923603, "grad/layer_27/attn_mlp_ratio": 0.42408078668660537} {"step": 8100, "timestamp": 1778203288.172384, "eos/sharpness": 31.948971748352044, "eos/L0_probe": 2.2005481719970703, "eos/L_plus": 2.400113105773926, "eos/L_minus": 2.3204729557037354, "eos/grad_norm": 0.1755947768688202, "eos/embed_grad_frac": 0.09270665049552917, "eos/time_s": 0.6043593883514404} {"step": 8100, "timestamp": 1778203288.1923256, "train/loss": 2.3551920890808105, "train/z_loss": 0.0018589210929349064, "train/perplexity": 10.540153315613502, "train/grad_norm": 0.17578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1911208.2841720253, "perf/iters_per_sec": 0.9113351269588591, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0972911834716796, "data/tokens_consumed": 16989028352, "data/tokens_consumed_B": 16.989028352, "train/loss_slope": -2.8263480063139926e-05} {"step": 8100, "timestamp": 1778203289.5562406, "geo/rankme_last": 436.8499755859375, "geo/layer_0/stable_rank_q_proj": 15.536434173583984, "geo/layer_0/stable_rank_k_proj": 12.103309631347656, "geo/layer_0/stable_rank_o_proj": 56.372528076171875, "geo/layer_0/stable_rank_gate_proj": 169.63491821289062, "geo/layer_0/stable_rank_down_proj": 47.919334411621094, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.03188558295369148, "geo/layer_0/attn_entropy_mean": 6.4434709548950195, "geo/layer_0/attn_entropy_std": 0.25480136275291443, "geo/layer_7/stable_rank_q_proj": 43.34955978393555, "geo/layer_7/stable_rank_k_proj": 44.99197006225586, "geo/layer_7/stable_rank_o_proj": 104.51020050048828, "geo/layer_7/stable_rank_gate_proj": 136.85968017578125, "geo/layer_7/stable_rank_down_proj": 180.67922973632812, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5891316533088684, "geo/layer_7/attn_entropy_mean": 4.645052433013916, "geo/layer_7/attn_entropy_std": 0.9343135952949524, "geo/layer_14/stable_rank_q_proj": 69.89441680908203, "geo/layer_14/stable_rank_k_proj": 48.303157806396484, "geo/layer_14/stable_rank_o_proj": 48.449764251708984, "geo/layer_14/stable_rank_gate_proj": 124.022216796875, "geo/layer_14/stable_rank_down_proj": 142.56724548339844, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3880177438259125, "geo/layer_14/attn_entropy_mean": 5.845467567443848, "geo/layer_14/attn_entropy_std": 0.5972059965133667, "geo/layer_21/stable_rank_q_proj": 52.120548248291016, "geo/layer_21/stable_rank_k_proj": 32.7231559753418, "geo/layer_21/stable_rank_o_proj": 93.54261016845703, "geo/layer_21/stable_rank_gate_proj": 117.09719848632812, "geo/layer_21/stable_rank_down_proj": 79.67847442626953, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1629771888256073, "geo/layer_21/attn_entropy_mean": 5.831872940063477, "geo/layer_21/attn_entropy_std": 0.2919415235519409, "geo/layer_27/stable_rank_q_proj": 43.2589225769043, "geo/layer_27/stable_rank_k_proj": 34.23164749145508, "geo/layer_27/stable_rank_o_proj": 112.0904769897461, "geo/layer_27/stable_rank_gate_proj": 94.2839584350586, "geo/layer_27/stable_rank_down_proj": 160.9400634765625, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.05825459212064743, "geo/layer_27/attn_entropy_mean": 4.588868141174316, "geo/layer_27/attn_entropy_std": 0.4767221212387085, "attnres/final_alpha/block_0": 0.24708156287670135, "attnres/block_norm/0": 1.3254154920578003, "attnres/final_alpha/block_1": 0.01087590679526329, "attnres/block_norm/1": 13860.8359375, "attnres/final_alpha/block_2": 0.02280266582965851, "attnres/block_norm/2": 10189.529296875, "attnres/final_alpha/block_3": 0.022266538813710213, "attnres/block_norm/3": 10681.166015625, "attnres/final_alpha/block_4": 0.033621981739997864, "attnres/block_norm/4": 4598.94091796875, "attnres/final_alpha/block_5": 0.47916144132614136, "attnres/block_norm/5": 3567.207763671875, "attnres/final_alpha/block_6": 0.18418988585472107, "attnres/block_norm/6": 7977.74365234375, "geo/tier1_time_s": 1.3599531650543213, "geo/step": 8100.0, "geo/rankme_slope": 0.008132149422268907} {"step": 8110, "timestamp": 1778203299.9285884, "train/loss": 2.3882484436035156, "train/z_loss": 0.0018466291250661016, "train/perplexity": 10.894395074173875, "train/grad_norm": 0.2421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1787486.5604029745, "perf/iters_per_sec": 0.8523400117888329, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1732407093048096, "data/tokens_consumed": 17009999872, "data/tokens_consumed_B": 17.009999872, "train/loss_slope": -2.9322289276008537e-05} {"step": 8120, "timestamp": 1778203310.2952392, "train/loss": 2.3760571002960207, "train/z_loss": 0.0018502809456549586, "train/perplexity": 10.762384094334243, "train/grad_norm": 0.21484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024271.956682327, "perf/iters_per_sec": 0.9652480872546801, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036003088951111, "data/tokens_consumed": 17030971392, "data/tokens_consumed_B": 17.030971392, "train/loss_slope": -3.361384945161837e-05} {"step": 8130, "timestamp": 1778203320.6627233, "train/loss": 2.388679361343384, "train/z_loss": 0.0018518348690122366, "train/perplexity": 10.899090673912422, "train/grad_norm": 0.205078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024188.9452717889, "perf/iters_per_sec": 0.9652085043295807, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0360455751419066, "data/tokens_consumed": 17051942912, "data/tokens_consumed_B": 17.051942912, "train/loss_slope": -3.1267116536902345e-05} {"step": 8140, "timestamp": 1778203331.0219235, "train/loss": 2.3178179025650025, "train/z_loss": 0.0018684938200749456, "train/perplexity": 10.153494201646144, "train/grad_norm": 0.17578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025718.481585755, "perf/iters_per_sec": 0.9659378440789008, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352632999420166, "data/tokens_consumed": 17072914432, "data/tokens_consumed_B": 17.072914432, "train/loss_slope": -3.161750261825799e-05} {"step": 8150, "timestamp": 1778203341.372495, "grad/layer_0/attn": 0.0026409511920064688, "grad/layer_0/mlp": 0.002876885235309601, "grad/layer_0/attn_mlp_ratio": 0.9179897299320963, "grad/layer_4/attn": 0.0017555932281538844, "grad/layer_4/mlp": 0.00288602733053267, "grad/layer_4/attn_mlp_ratio": 0.6083078800917208, "grad/layer_8/attn": 0.005989166907966137, "grad/layer_8/mlp": 0.00427722604945302, "grad/layer_8/attn_mlp_ratio": 1.4002455560438496, "grad/layer_12/attn": 0.004372198600322008, "grad/layer_12/mlp": 0.005539588164538145, "grad/layer_12/attn_mlp_ratio": 0.7892641820170608, "grad/layer_16/attn": 0.006952675990760326, "grad/layer_16/mlp": 0.00434070872142911, "grad/layer_16/attn_mlp_ratio": 1.6017374757865626, "grad/layer_20/attn": 0.008506597951054573, "grad/layer_20/mlp": 0.006227046716958284, "grad/layer_20/attn_mlp_ratio": 1.3660725864286625, "grad/layer_24/attn": 0.009835581295192242, "grad/layer_24/mlp": 0.009047010913491249, "grad/layer_24/attn_mlp_ratio": 1.0871636257019082, "grad/layer_27/attn": 0.00790915172547102, "grad/layer_27/mlp": 0.00737653486430645, "grad/layer_27/attn_mlp_ratio": 1.0722041939395386} {"step": 8150, "timestamp": 1778203341.388189, "train/loss": 2.3603026628494264, "train/z_loss": 0.0018554104608483613, "train/perplexity": 10.594157425118452, "train/grad_norm": 0.12451171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024359.9133492904, "perf/iters_per_sec": 0.9652900282618, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0359580755233764, "data/tokens_consumed": 17093885952, "data/tokens_consumed_B": 17.093885952, "train/loss_slope": -3.0231638352433812e-05} {"step": 8160, "timestamp": 1778203351.7515779, "train/loss": 2.330140995979309, "train/z_loss": 0.001866942981723696, "train/perplexity": 10.279390783642032, "train/grad_norm": 0.2890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024822.8833246597, "perf/iters_per_sec": 0.9655107895491885, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035721206665039, "data/tokens_consumed": 17114857472, "data/tokens_consumed_B": 17.114857472, "train/loss_slope": -3.6853063353324724e-05} {"step": 8170, "timestamp": 1778203362.1179771, "train/loss": 2.3843515634536745, "train/z_loss": 0.001848097937181592, "train/perplexity": 10.852023534291165, "train/grad_norm": 0.12060546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024359.6338138774, "perf/iters_per_sec": 0.965289894968928, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035958218574524, "data/tokens_consumed": 17135828992, "data/tokens_consumed_B": 17.135828992, "train/loss_slope": -3.454374782036914e-05} {"step": 8175, "timestamp": 1778203367.9045587, "eos/sharpness": 36.53974533081054, "eos/L0_probe": 2.200730085372925, "eos/L_plus": 2.3621761798858643, "eos/L_minus": 2.404681444168091, "eos/grad_norm": 0.1878649890422821, "eos/embed_grad_frac": 0.08758444339036942, "eos/time_s": 0.6092782020568848} {"step": 8175, "timestamp": 1778203369.2824948, "geo/rankme_last": 437.3172302246094, "geo/layer_0/stable_rank_q_proj": 15.484111785888672, "geo/layer_0/stable_rank_k_proj": 12.116189956665039, "geo/layer_0/stable_rank_o_proj": 56.369014739990234, "geo/layer_0/stable_rank_gate_proj": 169.02008056640625, "geo/layer_0/stable_rank_down_proj": 47.880916595458984, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.030192842707037926, "geo/layer_0/attn_entropy_mean": 6.435366630554199, "geo/layer_0/attn_entropy_std": 0.25485295057296753, "geo/layer_7/stable_rank_q_proj": 43.42219924926758, "geo/layer_7/stable_rank_k_proj": 44.94322967529297, "geo/layer_7/stable_rank_o_proj": 104.65428924560547, "geo/layer_7/stable_rank_gate_proj": 136.4321746826172, "geo/layer_7/stable_rank_down_proj": 179.78944396972656, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5895001292228699, "geo/layer_7/attn_entropy_mean": 4.750977039337158, "geo/layer_7/attn_entropy_std": 0.9804158210754395, "geo/layer_14/stable_rank_q_proj": 69.77658081054688, "geo/layer_14/stable_rank_k_proj": 47.85811996459961, "geo/layer_14/stable_rank_o_proj": 48.29460144042969, "geo/layer_14/stable_rank_gate_proj": 123.27392578125, "geo/layer_14/stable_rank_down_proj": 141.7970733642578, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4000087380409241, "geo/layer_14/attn_entropy_mean": 5.785196304321289, "geo/layer_14/attn_entropy_std": 0.6274096369743347, "geo/layer_21/stable_rank_q_proj": 51.91873550415039, "geo/layer_21/stable_rank_k_proj": 32.913787841796875, "geo/layer_21/stable_rank_o_proj": 93.25634765625, "geo/layer_21/stable_rank_gate_proj": 116.73214721679688, "geo/layer_21/stable_rank_down_proj": 79.37290954589844, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.16457588970661163, "geo/layer_21/attn_entropy_mean": 5.813902378082275, "geo/layer_21/attn_entropy_std": 0.30059105157852173, "geo/layer_27/stable_rank_q_proj": 43.187782287597656, "geo/layer_27/stable_rank_k_proj": 34.32283401489258, "geo/layer_27/stable_rank_o_proj": 112.04181671142578, "geo/layer_27/stable_rank_gate_proj": 94.82911682128906, "geo/layer_27/stable_rank_down_proj": 160.39898681640625, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.06440724432468414, "geo/layer_27/attn_entropy_mean": 4.585111141204834, "geo/layer_27/attn_entropy_std": 0.5001084804534912, "attnres/final_alpha/block_0": 0.24871550500392914, "attnres/block_norm/0": 1.329176902770996, "attnres/final_alpha/block_1": 0.011003212071955204, "attnres/block_norm/1": 14040.05859375, "attnres/final_alpha/block_2": 0.02338092215359211, "attnres/block_norm/2": 10285.4306640625, "attnres/final_alpha/block_3": 0.02259916067123413, "attnres/block_norm/3": 10830.373046875, "attnres/final_alpha/block_4": 0.03387653827667236, "attnres/block_norm/4": 4647.380859375, "attnres/final_alpha/block_5": 0.4718186557292938, "attnres/block_norm/5": 3611.4443359375, "attnres/final_alpha/block_6": 0.18860603868961334, "attnres/block_norm/6": 8002.53271484375, "geo/tier1_time_s": 1.3568954467773438, "geo/step": 8175.0, "geo/rankme_slope": 0.007860037940957633} {"step": 8180, "timestamp": 1778203374.4657316, "train/loss": 2.348793458938599, "train/z_loss": 0.0018557855393737555, "train/perplexity": 10.472926083372558, "train/grad_norm": 0.19921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1699435.0292646147, "perf/iters_per_sec": 0.8103537699053834, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.234028935432434, "data/tokens_consumed": 17156800512, "data/tokens_consumed_B": 17.156800512, "train/loss_slope": -3.790967066009726e-05} {"step": 8190, "timestamp": 1778203384.82809, "train/loss": 2.3619949340820314, "train/z_loss": 0.0018543266691267491, "train/perplexity": 10.61210079120163, "train/grad_norm": 0.2373046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024686.3236113356, "perf/iters_per_sec": 0.965445672803562, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0357910633087157, "data/tokens_consumed": 17177772032, "data/tokens_consumed_B": 17.177772032, "train/loss_slope": -4.341355641969067e-05} {"step": 8200, "timestamp": 1778203395.183155, "grad/layer_0/attn": 0.0027493282686918974, "grad/layer_0/mlp": 0.002994424430653453, "grad/layer_0/attn_mlp_ratio": 0.9181491270016847, "grad/layer_4/attn": 0.002461069030687213, "grad/layer_4/mlp": 0.0029208739288151264, "grad/layer_4/attn_mlp_ratio": 0.8425796547225843, "grad/layer_8/attn": 0.013054969720542431, "grad/layer_8/mlp": 0.0045163375325500965, "grad/layer_8/attn_mlp_ratio": 2.890609777810411, "grad/layer_12/attn": 0.005293526686728001, "grad/layer_12/mlp": 0.006197401788085699, "grad/layer_12/attn_mlp_ratio": 0.8541525597855179, "grad/layer_16/attn": 0.0051827579736709595, "grad/layer_16/mlp": 0.004822423215955496, "grad/layer_16/attn_mlp_ratio": 1.0747206609845426, "grad/layer_20/attn": 0.01004922203719616, "grad/layer_20/mlp": 0.007235174532979727, "grad/layer_20/attn_mlp_ratio": 1.3889398040773342, "grad/layer_24/attn": 0.010314296931028366, "grad/layer_24/mlp": 0.01401685830205679, "grad/layer_24/attn_mlp_ratio": 0.7358494061347498, "grad/layer_27/attn": 0.006953293923288584, "grad/layer_27/mlp": 0.012457993812859058, "grad/layer_27/attn_mlp_ratio": 0.5581391331481901} {"step": 8200, "timestamp": 1778203395.1987736, "train/loss": 2.324763774871826, "train/z_loss": 0.0018623154261149465, "train/perplexity": 10.224264572290666, "train/grad_norm": 0.1640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023177.8894406618, "perf/iters_per_sec": 0.9647263953402814, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0365633249282837, "data/tokens_consumed": 17198743552, "data/tokens_consumed_B": 17.198743552, "train/loss_slope": -4.746525972196363e-05} {"step": 8210, "timestamp": 1778203405.566233, "train/loss": 2.3254387617111205, "train/z_loss": 0.0018699327600188553, "train/perplexity": 10.231168145967004, "train/grad_norm": 0.24609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023994.020998969, "perf/iters_per_sec": 0.9651155571932645, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0361453533172607, "data/tokens_consumed": 17219715072, "data/tokens_consumed_B": 17.219715072, "train/loss_slope": -4.859026506288317e-05} {"step": 8220, "timestamp": 1778203415.9268541, "train/loss": 2.332435965538025, "train/z_loss": 0.001865013677161187, "train/perplexity": 10.30300876347892, "train/grad_norm": 0.1689453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025174.3412866632, "perf/iters_per_sec": 0.9656783777650181, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0355414628982544, "data/tokens_consumed": 17240686592, "data/tokens_consumed_B": 17.240686592, "train/loss_slope": -5.354019464618116e-05} {"step": 8230, "timestamp": 1778203426.2899551, "train/loss": 2.352419710159302, "train/z_loss": 0.0018558899872004985, "train/perplexity": 10.510972485581398, "train/grad_norm": 0.1708984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024791.2818620997, "perf/iters_per_sec": 0.9654957207975863, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0357373714447022, "data/tokens_consumed": 17261658112, "data/tokens_consumed_B": 17.261658112, "train/loss_slope": -5.3163022148047555e-05} {"step": 8240, "timestamp": 1778203436.657322, "train/loss": 2.4159480571746825, "train/z_loss": 0.0018385181087069214, "train/perplexity": 11.200383929965003, "train/grad_norm": 0.1962890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024260.030943051, "perf/iters_per_sec": 0.9652424006190543, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0360091924667358, "data/tokens_consumed": 17282629632, "data/tokens_consumed_B": 17.282629632, "train/loss_slope": -5.059184540222967e-05} {"step": 8250, "timestamp": 1778203447.0169864, "grad/layer_0/attn": 0.002771832048892975, "grad/layer_0/mlp": 0.002847884315997362, "grad/layer_0/attn_mlp_ratio": 0.9732951356181522, "grad/layer_4/attn": 0.001759068458341062, "grad/layer_4/mlp": 0.0029899717774242163, "grad/layer_4/attn_mlp_ratio": 0.5883227436428112, "grad/layer_8/attn": 0.004945638123899698, "grad/layer_8/mlp": 0.004360598046332598, "grad/layer_8/attn_mlp_ratio": 1.1341650750503434, "grad/layer_12/attn": 0.004052013158798218, "grad/layer_12/mlp": 0.005988930352032185, "grad/layer_12/attn_mlp_ratio": 0.6765837725537913, "grad/layer_16/attn": 0.006352544762194157, "grad/layer_16/mlp": 0.004687420558184385, "grad/layer_16/attn_mlp_ratio": 1.3552324882774096, "grad/layer_20/attn": 0.006127749104052782, "grad/layer_20/mlp": 0.006670676171779633, "grad/layer_20/attn_mlp_ratio": 0.9186098761794649, "grad/layer_24/attn": 0.014270017854869366, "grad/layer_24/mlp": 0.010228936560451984, "grad/layer_24/attn_mlp_ratio": 1.3950636638548526, "grad/layer_27/attn": 0.012452695518732071, "grad/layer_27/mlp": 0.009210371412336826, "grad/layer_27/attn_mlp_ratio": 1.352029665910036} {"step": 8250, "timestamp": 1778203447.6279323, "eos/sharpness": 55.03766536712645, "eos/L0_probe": 2.1954457759857178, "eos/L_plus": 2.3542399406433105, "eos/L_minus": 2.5870282649993896, "eos/grad_norm": 0.18513432145118713, "eos/embed_grad_frac": 0.08497647196054459, "eos/time_s": 0.6081640720367432} {"step": 8250, "timestamp": 1778203447.6481047, "train/loss": 2.3098127841949463, "train/z_loss": 0.0018598061171360314, "train/perplexity": 10.0725387400334, "train/grad_norm": 0.185546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1909038.1631722478, "perf/iters_per_sec": 0.9103003326283683, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0985385417938232, "data/tokens_consumed": 17303601152, "data/tokens_consumed_B": 17.303601152, "train/loss_slope": -5.3844956224328724e-05} {"step": 8250, "timestamp": 1778203449.0075881, "geo/rankme_last": 437.0758361816406, "geo/layer_0/stable_rank_q_proj": 15.466024398803711, "geo/layer_0/stable_rank_k_proj": 12.132919311523438, "geo/layer_0/stable_rank_o_proj": 56.0579948425293, "geo/layer_0/stable_rank_gate_proj": 168.89117431640625, "geo/layer_0/stable_rank_down_proj": 47.821842193603516, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.028217019513249397, "geo/layer_0/attn_entropy_mean": 6.436962604522705, "geo/layer_0/attn_entropy_std": 0.2563152611255646, "geo/layer_7/stable_rank_q_proj": 43.34691619873047, "geo/layer_7/stable_rank_k_proj": 44.971038818359375, "geo/layer_7/stable_rank_o_proj": 104.57238006591797, "geo/layer_7/stable_rank_gate_proj": 135.9330596923828, "geo/layer_7/stable_rank_down_proj": 178.9707489013672, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.598513126373291, "geo/layer_7/attn_entropy_mean": 4.672609806060791, "geo/layer_7/attn_entropy_std": 0.9287192225456238, "geo/layer_14/stable_rank_q_proj": 69.5289306640625, "geo/layer_14/stable_rank_k_proj": 47.57294464111328, "geo/layer_14/stable_rank_o_proj": 48.2704963684082, "geo/layer_14/stable_rank_gate_proj": 122.04849243164062, "geo/layer_14/stable_rank_down_proj": 141.93350219726562, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38546022772789, "geo/layer_14/attn_entropy_mean": 5.828886032104492, "geo/layer_14/attn_entropy_std": 0.5982108116149902, "geo/layer_21/stable_rank_q_proj": 51.76774215698242, "geo/layer_21/stable_rank_k_proj": 32.9222297668457, "geo/layer_21/stable_rank_o_proj": 93.16356658935547, "geo/layer_21/stable_rank_gate_proj": 116.3368911743164, "geo/layer_21/stable_rank_down_proj": 79.08274841308594, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1640225201845169, "geo/layer_21/attn_entropy_mean": 5.824512481689453, "geo/layer_21/attn_entropy_std": 0.2949845790863037, "geo/layer_27/stable_rank_q_proj": 43.28828048706055, "geo/layer_27/stable_rank_k_proj": 34.40237808227539, "geo/layer_27/stable_rank_o_proj": 112.12738800048828, "geo/layer_27/stable_rank_gate_proj": 94.6481704711914, "geo/layer_27/stable_rank_down_proj": 160.7732696533203, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0625300407409668, "geo/layer_27/attn_entropy_mean": 4.531277179718018, "geo/layer_27/attn_entropy_std": 0.4806440472602844, "attnres/final_alpha/block_0": 0.24839958548545837, "attnres/block_norm/0": 1.3327233791351318, "attnres/final_alpha/block_1": 0.010930853895843029, "attnres/block_norm/1": 14203.39453125, "attnres/final_alpha/block_2": 0.02293677069246769, "attnres/block_norm/2": 10380.3251953125, "attnres/final_alpha/block_3": 0.02245161309838295, "attnres/block_norm/3": 10872.546875, "attnres/final_alpha/block_4": 0.033828139305114746, "attnres/block_norm/4": 4696.7509765625, "attnres/final_alpha/block_5": 0.476157546043396, "attnres/block_norm/5": 3597.39111328125, "attnres/final_alpha/block_6": 0.18529552221298218, "attnres/block_norm/6": 8084.2158203125, "geo/tier1_time_s": 1.3552100658416748, "geo/step": 8250.0, "geo/rankme_slope": 0.007483913858512155} {"step": 8260, "timestamp": 1778203459.3758671, "train/loss": 2.3617852449417116, "train/z_loss": 0.0018473603762686253, "train/perplexity": 10.609875782198001, "train/grad_norm": 0.15234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1788832.4028814703, "perf/iters_per_sec": 0.8529817594916679, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.172358012199402, "data/tokens_consumed": 17324572672, "data/tokens_consumed_B": 17.324572672, "train/loss_slope": -5.203083849797325e-05} {"step": 8270, "timestamp": 1778203469.7451496, "train/loss": 2.3804251670837404, "train/z_loss": 0.001850237580947578, "train/perplexity": 10.809497729695636, "train/grad_norm": 0.1640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023596.7898143576, "perf/iters_per_sec": 0.9649261426040447, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0363487482070923, "data/tokens_consumed": 17345544192, "data/tokens_consumed_B": 17.345544192, "train/loss_slope": -4.838607635292023e-05} {"step": 8280, "timestamp": 1778203480.1257446, "train/loss": 2.349354958534241, "train/z_loss": 0.0018606155063025653, "train/perplexity": 10.478808278404081, "train/grad_norm": 0.34375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021400.1927911877, "perf/iters_per_sec": 0.9638787235217989, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0374749183654786, "data/tokens_consumed": 17366515712, "data/tokens_consumed_B": 17.366515712, "train/loss_slope": -5.0313730780655476e-05} {"step": 8290, "timestamp": 1778203490.5002215, "train/loss": 2.3345170974731446, "train/z_loss": 0.0018586397171020507, "train/perplexity": 10.324473011263176, "train/grad_norm": 0.1982421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022929.098190689, "perf/iters_per_sec": 0.9646077624276586, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0366908073425294, "data/tokens_consumed": 17387487232, "data/tokens_consumed_B": 17.387487232, "train/loss_slope": -5.119722705684254e-05} {"step": 8300, "timestamp": 1778203500.8591166, "grad/layer_0/attn": 0.003219913924112916, "grad/layer_0/mlp": 0.0031916042789816856, "grad/layer_0/attn_mlp_ratio": 1.0088700044772663, "grad/layer_4/attn": 0.0018881091382354498, "grad/layer_4/mlp": 0.003106525866314769, "grad/layer_4/attn_mlp_ratio": 0.6077879788255204, "grad/layer_8/attn": 0.00810293760150671, "grad/layer_8/mlp": 0.00452461140230298, "grad/layer_8/attn_mlp_ratio": 1.7908581979651512, "grad/layer_12/attn": 0.006251566577702761, "grad/layer_12/mlp": 0.006499327253550291, "grad/layer_12/attn_mlp_ratio": 0.9618790126470258, "grad/layer_16/attn": 0.005869594402611256, "grad/layer_16/mlp": 0.0048471856862306595, "grad/layer_16/attn_mlp_ratio": 1.2109282914809951, "grad/layer_20/attn": 0.0058194296434521675, "grad/layer_20/mlp": 0.007567563094198704, "grad/layer_20/attn_mlp_ratio": 0.7689965044379602, "grad/layer_24/attn": 0.01536734402179718, "grad/layer_24/mlp": 0.011415669694542885, "grad/layer_24/attn_mlp_ratio": 1.346162275046125, "grad/layer_27/attn": 0.005617138929665089, "grad/layer_27/mlp": 0.01023226510733366, "grad/layer_27/attn_mlp_ratio": 0.5489633835564757} {"step": 8300, "timestamp": 1778203500.8749685, "train/loss": 2.3928972244262696, "train/z_loss": 0.0018460455117747187, "train/perplexity": 10.945158631975607, "train/grad_norm": 0.1845703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022478.8053447967, "perf/iters_per_sec": 0.9643930460666641, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0369216203689575, "data/tokens_consumed": 17408458752, "data/tokens_consumed_B": 17.408458752, "train/loss_slope": -4.694336655974698e-05} {"step": 8310, "timestamp": 1778203511.2449825, "train/loss": 2.3874123811721804, "train/z_loss": 0.0018420217442326249, "train/perplexity": 10.885290486272526, "train/grad_norm": 0.2490234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023530.638568077, "perf/iters_per_sec": 0.9648945992317567, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0363826274871826, "data/tokens_consumed": 17429430272, "data/tokens_consumed_B": 17.429430272, "train/loss_slope": -4.633189603941156e-05} {"step": 8320, "timestamp": 1778203521.6108687, "train/loss": 2.3489513635635375, "train/z_loss": 0.0018532116431742907, "train/perplexity": 10.474579937409928, "train/grad_norm": 0.1669921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024258.6799892592, "perf/iters_per_sec": 0.9652417564340874, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0360098838806153, "data/tokens_consumed": 17450401792, "data/tokens_consumed_B": 17.450401792, "train/loss_slope": -4.68873042156128e-05} {"step": 8325, "timestamp": 1778203527.3952188, "eos/sharpness": 45.353913307189934, "eos/L0_probe": 2.197697162628174, "eos/L_plus": 2.5335044860839844, "eos/L_minus": 2.3154289722442627, "eos/grad_norm": 0.16908183693885803, "eos/embed_grad_frac": 0.11385595053434372, "eos/time_s": 0.6148414611816406} {"step": 8325, "timestamp": 1778203528.7734616, "geo/rankme_last": 436.7953796386719, "geo/layer_0/stable_rank_q_proj": 15.401566505432129, "geo/layer_0/stable_rank_k_proj": 12.10285758972168, "geo/layer_0/stable_rank_o_proj": 56.24689483642578, "geo/layer_0/stable_rank_gate_proj": 167.98153686523438, "geo/layer_0/stable_rank_down_proj": 47.660160064697266, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.029646871611475945, "geo/layer_0/attn_entropy_mean": 6.436079502105713, "geo/layer_0/attn_entropy_std": 0.25044646859169006, "geo/layer_7/stable_rank_q_proj": 43.53565216064453, "geo/layer_7/stable_rank_k_proj": 44.42416000366211, "geo/layer_7/stable_rank_o_proj": 104.39117431640625, "geo/layer_7/stable_rank_gate_proj": 135.44140625, "geo/layer_7/stable_rank_down_proj": 177.98536682128906, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5990599989891052, "geo/layer_7/attn_entropy_mean": 4.690141677856445, "geo/layer_7/attn_entropy_std": 0.9588232636451721, "geo/layer_14/stable_rank_q_proj": 69.27360534667969, "geo/layer_14/stable_rank_k_proj": 47.237083435058594, "geo/layer_14/stable_rank_o_proj": 48.24531555175781, "geo/layer_14/stable_rank_gate_proj": 121.54899597167969, "geo/layer_14/stable_rank_down_proj": 142.21392822265625, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4172464907169342, "geo/layer_14/attn_entropy_mean": 5.747274875640869, "geo/layer_14/attn_entropy_std": 0.6229047775268555, "geo/layer_21/stable_rank_q_proj": 52.073795318603516, "geo/layer_21/stable_rank_k_proj": 32.63751220703125, "geo/layer_21/stable_rank_o_proj": 92.80570983886719, "geo/layer_21/stable_rank_gate_proj": 116.06257629394531, "geo/layer_21/stable_rank_down_proj": 78.56759643554688, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1624298095703125, "geo/layer_21/attn_entropy_mean": 5.829784393310547, "geo/layer_21/attn_entropy_std": 0.29456669092178345, "geo/layer_27/stable_rank_q_proj": 43.18923568725586, "geo/layer_27/stable_rank_k_proj": 34.323829650878906, "geo/layer_27/stable_rank_o_proj": 112.4654769897461, "geo/layer_27/stable_rank_gate_proj": 94.90607452392578, "geo/layer_27/stable_rank_down_proj": 160.19192504882812, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.05827870965003967, "geo/layer_27/attn_entropy_mean": 4.54176139831543, "geo/layer_27/attn_entropy_std": 0.5091943144798279, "attnres/final_alpha/block_0": 0.24803224205970764, "attnres/block_norm/0": 1.3364136219024658, "attnres/final_alpha/block_1": 0.010733300819993019, "attnres/block_norm/1": 14318.8828125, "attnres/final_alpha/block_2": 0.02261747047305107, "attnres/block_norm/2": 10542.92578125, "attnres/final_alpha/block_3": 0.02220156602561474, "attnres/block_norm/3": 11069.4599609375, "attnres/final_alpha/block_4": 0.032994575798511505, "attnres/block_norm/4": 4727.83984375, "attnres/final_alpha/block_5": 0.4811288118362427, "attnres/block_norm/5": 3603.8515625, "attnres/final_alpha/block_6": 0.18229201436042786, "attnres/block_norm/6": 8206.5693359375, "geo/tier1_time_s": 1.3587265014648438, "geo/step": 8325.0, "geo/rankme_slope": 0.007100118621667417} {"step": 8330, "timestamp": 1778203533.9925125, "train/loss": 2.3572486639022827, "train/z_loss": 0.001855028688441962, "train/perplexity": 10.56185223461412, "train/grad_norm": 0.2333984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1694524.7014834255, "perf/iters_per_sec": 0.8080123431603553, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2376048564910889, "data/tokens_consumed": 17471373312, "data/tokens_consumed_B": 17.471373312, "train/loss_slope": -4.6731515121002044e-05} {"step": 8340, "timestamp": 1778203544.357364, "train/loss": 2.3504770040512084, "train/z_loss": 0.0018453798373229801, "train/perplexity": 10.490572577061053, "train/grad_norm": 0.1591796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024085.9125197756, "perf/iters_per_sec": 0.9651593744849088, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036098313331604, "data/tokens_consumed": 17492344832, "data/tokens_consumed_B": 17.492344832, "train/loss_slope": -4.65034900754555e-05} {"step": 8350, "timestamp": 1778203554.7152846, "grad/layer_0/attn": 0.003047223901376128, "grad/layer_0/mlp": 0.0035304792691022158, "grad/layer_0/attn_mlp_ratio": 0.8631190223187801, "grad/layer_4/attn": 0.001711834454908967, "grad/layer_4/mlp": 0.0031754120718687773, "grad/layer_4/attn_mlp_ratio": 0.5390904746395574, "grad/layer_8/attn": 0.00555190397426486, "grad/layer_8/mlp": 0.004566300194710493, "grad/layer_8/attn_mlp_ratio": 1.2158429397856445, "grad/layer_12/attn": 0.004038300830870867, "grad/layer_12/mlp": 0.005673505365848541, "grad/layer_12/attn_mlp_ratio": 0.7117823108092997, "grad/layer_16/attn": 0.005007690750062466, "grad/layer_16/mlp": 0.0060196821577847, "grad/layer_16/attn_mlp_ratio": 0.8318862251552366, "grad/layer_20/attn": 0.010614399798214436, "grad/layer_20/mlp": 0.008413762785494328, "grad/layer_20/attn_mlp_ratio": 1.2615520478374898, "grad/layer_24/attn": 0.02447870932519436, "grad/layer_24/mlp": 0.01685814931988716, "grad/layer_24/attn_mlp_ratio": 1.4520401211011573, "grad/layer_27/attn": 0.010995042510330677, "grad/layer_27/mlp": 0.017322754487395287, "grad/layer_27/attn_mlp_ratio": 0.6347167510143623} {"step": 8350, "timestamp": 1778203554.7312589, "train/loss": 2.389419770240784, "train/z_loss": 0.0018421390908770264, "train/perplexity": 10.907163445828237, "train/grad_norm": 0.255859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022878.6680749704, "perf/iters_per_sec": 0.9645837154745914, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0367166519165039, "data/tokens_consumed": 17513316352, "data/tokens_consumed_B": 17.513316352, "train/loss_slope": -4.8005257359575266e-05} {"step": 8360, "timestamp": 1778203565.09615, "train/loss": 2.3490990161895753, "train/z_loss": 0.001851166842970997, "train/perplexity": 10.476126650829668, "train/grad_norm": 0.2236328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024537.1543424195, "perf/iters_per_sec": 0.9653745433532808, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0358673810958863, "data/tokens_consumed": 17534287872, "data/tokens_consumed_B": 17.534287872, "train/loss_slope": -4.7287421350968286e-05} {"step": 8370, "timestamp": 1778203575.4659429, "train/loss": 2.341457796096802, "train/z_loss": 0.0018603624543175102, "train/perplexity": 10.396381325178664, "train/grad_norm": 0.16796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023453.3667271687, "perf/iters_per_sec": 0.9648577531467288, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0364222049713134, "data/tokens_consumed": 17555259392, "data/tokens_consumed_B": 17.555259392, "train/loss_slope": -4.7071451959114865e-05} {"step": 8380, "timestamp": 1778203585.8427606, "train/loss": 2.3383693218231203, "train/z_loss": 0.0018569803098216653, "train/perplexity": 10.364321901752401, "train/grad_norm": 0.1328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022091.326114893, "perf/iters_per_sec": 0.964208281571814, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0371203184127809, "data/tokens_consumed": 17576230912, "data/tokens_consumed_B": 17.576230912, "train/loss_slope": -4.977302757283773e-05} {"step": 8390, "timestamp": 1778203596.2071958, "train/loss": 2.359810996055603, "train/z_loss": 0.0018452471704222262, "train/perplexity": 10.588949909990019, "train/grad_norm": 0.15234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024429.5200700525, "perf/iters_per_sec": 0.96532321933272, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0359224557876587, "data/tokens_consumed": 17597202432, "data/tokens_consumed_B": 17.597202432, "train/loss_slope": -4.7571436649966777e-05} {"step": 8400, "timestamp": 1778203606.5591698, "grad/layer_0/attn": 0.002965804422274232, "grad/layer_0/mlp": 0.0029212944209575653, "grad/layer_0/attn_mlp_ratio": 1.015236362166618, "grad/layer_4/attn": 0.0015041687292978168, "grad/layer_4/mlp": 0.0028728605248034, "grad/layer_4/attn_mlp_ratio": 0.523578734140906, "grad/layer_8/attn": 0.006941395346075296, "grad/layer_8/mlp": 0.004308934789150953, "grad/layer_8/attn_mlp_ratio": 1.6109306649196207, "grad/layer_12/attn": 0.004167217295616865, "grad/layer_12/mlp": 0.005557535216212273, "grad/layer_12/attn_mlp_ratio": 0.749831905424044, "grad/layer_16/attn": 0.004235649947077036, "grad/layer_16/mlp": 0.004289702977985144, "grad/layer_16/attn_mlp_ratio": 0.9873993304605372, "grad/layer_20/attn": 0.00465946551412344, "grad/layer_20/mlp": 0.006536640226840973, "grad/layer_20/attn_mlp_ratio": 0.7128226858361145, "grad/layer_24/attn": 0.015544026158750057, "grad/layer_24/mlp": 0.011731153354048729, "grad/layer_24/attn_mlp_ratio": 1.3250211259818963, "grad/layer_27/attn": 0.006081066094338894, "grad/layer_27/mlp": 0.012355647049844265, "grad/layer_27/attn_mlp_ratio": 0.49216896699866847} {"step": 8400, "timestamp": 1778203607.1797614, "eos/sharpness": 34.23430919647216, "eos/L0_probe": 2.190659999847412, "eos/L_plus": 2.401418685913086, "eos/L_minus": 2.32224440574646, "eos/grad_norm": 0.19005820155143738, "eos/embed_grad_frac": 0.08524643629789352, "eos/time_s": 0.6178765296936035} {"step": 8400, "timestamp": 1778203607.2020252, "train/loss": 2.416246771812439, "train/z_loss": 0.001829164766240865, "train/perplexity": 11.203730148350699, "train/grad_norm": 0.1904296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1908349.516246975, "perf/iters_per_sec": 0.9099719601855158, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0989349603652954, "data/tokens_consumed": 17618173952, "data/tokens_consumed_B": 17.618173952, "train/loss_slope": -4.711463081561289e-05} {"step": 8400, "timestamp": 1778203608.5696778, "geo/rankme_last": 437.63140869140625, "geo/layer_0/stable_rank_q_proj": 15.416836738586426, "geo/layer_0/stable_rank_k_proj": 12.165972709655762, "geo/layer_0/stable_rank_o_proj": 56.38777542114258, "geo/layer_0/stable_rank_gate_proj": 167.8431854248047, "geo/layer_0/stable_rank_down_proj": 47.6580696105957, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.03630755841732025, "geo/layer_0/attn_entropy_mean": 6.434661865234375, "geo/layer_0/attn_entropy_std": 0.25159990787506104, "geo/layer_7/stable_rank_q_proj": 43.59793472290039, "geo/layer_7/stable_rank_k_proj": 44.3858642578125, "geo/layer_7/stable_rank_o_proj": 104.51713562011719, "geo/layer_7/stable_rank_gate_proj": 134.96640014648438, "geo/layer_7/stable_rank_down_proj": 177.6863250732422, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5864254832267761, "geo/layer_7/attn_entropy_mean": 4.722028732299805, "geo/layer_7/attn_entropy_std": 0.9425202012062073, "geo/layer_14/stable_rank_q_proj": 68.98856353759766, "geo/layer_14/stable_rank_k_proj": 47.03422164916992, "geo/layer_14/stable_rank_o_proj": 48.24097442626953, "geo/layer_14/stable_rank_gate_proj": 120.8980712890625, "geo/layer_14/stable_rank_down_proj": 141.94483947753906, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.40426158905029297, "geo/layer_14/attn_entropy_mean": 5.752945423126221, "geo/layer_14/attn_entropy_std": 0.6206616759300232, "geo/layer_21/stable_rank_q_proj": 52.03077697753906, "geo/layer_21/stable_rank_k_proj": 32.70962905883789, "geo/layer_21/stable_rank_o_proj": 93.0065689086914, "geo/layer_21/stable_rank_gate_proj": 116.05303192138672, "geo/layer_21/stable_rank_down_proj": 78.18551635742188, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.16357778012752533, "geo/layer_21/attn_entropy_mean": 5.830265998840332, "geo/layer_21/attn_entropy_std": 0.30668407678604126, "geo/layer_27/stable_rank_q_proj": 43.20304489135742, "geo/layer_27/stable_rank_k_proj": 34.284603118896484, "geo/layer_27/stable_rank_o_proj": 112.78668975830078, "geo/layer_27/stable_rank_gate_proj": 94.91062927246094, "geo/layer_27/stable_rank_down_proj": 159.84715270996094, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.06224934756755829, "geo/layer_27/attn_entropy_mean": 4.542940139770508, "geo/layer_27/attn_entropy_std": 0.5398361682891846, "attnres/final_alpha/block_0": 0.2484956830739975, "attnres/block_norm/0": 1.3395881652832031, "attnres/final_alpha/block_1": 0.010841002687811852, "attnres/block_norm/1": 14446.1806640625, "attnres/final_alpha/block_2": 0.022488363087177277, "attnres/block_norm/2": 10687.486328125, "attnres/final_alpha/block_3": 0.022403448820114136, "attnres/block_norm/3": 11208.20703125, "attnres/final_alpha/block_4": 0.03346845507621765, "attnres/block_norm/4": 4761.35888671875, "attnres/final_alpha/block_5": 0.4794958233833313, "attnres/block_norm/5": 3606.737548828125, "attnres/final_alpha/block_6": 0.18280722200870514, "attnres/block_norm/6": 8279.1708984375, "geo/tier1_time_s": 1.3635737895965576, "geo/step": 8400.0, "geo/rankme_slope": 0.006799121171906263} {"step": 8410, "timestamp": 1778203618.9826295, "train/loss": 2.3870773792266844, "train/z_loss": 0.0018366475007496775, "train/perplexity": 10.881644503522091, "train/grad_norm": 0.138671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1780760.680079807, "perf/iters_per_sec": 0.8491328621291194, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1776720046997071, "data/tokens_consumed": 17639145472, "data/tokens_consumed_B": 17.639145472, "train/loss_slope": -4.746884703099666e-05} {"step": 8420, "timestamp": 1778203629.339576, "train/loss": 2.3544274091720583, "train/z_loss": 0.0018545352155342697, "train/perplexity": 10.532096552952742, "train/grad_norm": 0.1630859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026267.3486843961, "perf/iters_per_sec": 0.966199564306448, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349828720092773, "data/tokens_consumed": 17660116992, "data/tokens_consumed_B": 17.660116992, "train/loss_slope": -4.793721268279316e-05} {"step": 8430, "timestamp": 1778203639.6993341, "train/loss": 2.3827483654022217, "train/z_loss": 0.0018430918105877935, "train/perplexity": 10.834639530030975, "train/grad_norm": 0.2275390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025498.4023251259, "perf/iters_per_sec": 0.965832902109683, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0353757858276367, "data/tokens_consumed": 17681088512, "data/tokens_consumed_B": 17.681088512, "train/loss_slope": -4.745075068171941e-05} {"step": 8440, "timestamp": 1778203650.083183, "train/loss": 2.3747188806533814, "train/z_loss": 0.0018454344710335136, "train/perplexity": 10.747991293050195, "train/grad_norm": 0.17578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021293.0775966574, "perf/iters_per_sec": 0.9638276470168388, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0375298976898193, "data/tokens_consumed": 17702060032, "data/tokens_consumed_B": 17.702060032, "train/loss_slope": -4.621737191457482e-05} {"step": 8450, "timestamp": 1778203660.433291, "grad/layer_0/attn": 0.002635517856106162, "grad/layer_0/mlp": 0.0029364151414483786, "grad/layer_0/attn_mlp_ratio": 0.897529007105344, "grad/layer_4/attn": 0.0016383853508159518, "grad/layer_4/mlp": 0.002901341300457716, "grad/layer_4/attn_mlp_ratio": 0.5646992630917134, "grad/layer_8/attn": 0.009612694382667542, "grad/layer_8/mlp": 0.004290251526981592, "grad/layer_8/attn_mlp_ratio": 2.240589881071977, "grad/layer_12/attn": 0.004283982329070568, "grad/layer_12/mlp": 0.00554296001791954, "grad/layer_12/attn_mlp_ratio": 0.7728690515418125, "grad/layer_16/attn": 0.005272889044135809, "grad/layer_16/mlp": 0.004885852336883545, "grad/layer_16/attn_mlp_ratio": 1.0792157790788985, "grad/layer_20/attn": 0.008120394311845303, "grad/layer_20/mlp": 0.0073579298332333565, "grad/layer_20/attn_mlp_ratio": 1.103624849044586, "grad/layer_24/attn": 0.011732248589396477, "grad/layer_24/mlp": 0.013675871305167675, "grad/layer_24/attn_mlp_ratio": 0.8578794170997568, "grad/layer_27/attn": 0.011190276592969894, "grad/layer_27/mlp": 0.012090090662240982, "grad/layer_27/attn_mlp_ratio": 0.9255742419997929} {"step": 8450, "timestamp": 1778203660.4492218, "train/loss": 2.3171042919158937, "train/z_loss": 0.001864777400624007, "train/perplexity": 10.146251144726829, "train/grad_norm": 0.1689453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024059.5504900296, "perf/iters_per_sec": 0.9651468040895603, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0361118078231812, "data/tokens_consumed": 17723031552, "data/tokens_consumed_B": 17.723031552, "train/loss_slope": -4.971698927800641e-05} {"step": 8460, "timestamp": 1778203671.3465338, "train/loss": 2.3364171147346497, "train/z_loss": 0.0018507441855035722, "train/perplexity": 10.34410833602144, "train/grad_norm": 0.1669921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1925600.838071467, "perf/iters_per_sec": 0.9181980314595541, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0890896797180176, "data/tokens_consumed": 17744003072, "data/tokens_consumed_B": 17.744003072, "train/loss_slope": -5.037918700994469e-05} {"step": 8470, "timestamp": 1778203681.7102306, "train/loss": 2.327913594245911, "train/z_loss": 0.0018519530538469553, "train/perplexity": 10.256519931535882, "train/grad_norm": 0.185546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025167.673685301, "perf/iters_per_sec": 0.9656751984049325, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0355448722839355, "data/tokens_consumed": 17764974592, "data/tokens_consumed_B": 17.764974592, "train/loss_slope": -5.1895041019395004e-05} {"step": 8475, "timestamp": 1778203687.5055192, "eos/sharpness": 24.508738517761227, "eos/L0_probe": 2.186241626739502, "eos/L_plus": 2.286104440689087, "eos/L_minus": 2.3314661979675293, "eos/grad_norm": 0.13446684181690216, "eos/embed_grad_frac": 0.1839006543159485, "eos/time_s": 0.6237330436706543} {"step": 8475, "timestamp": 1778203688.8854933, "geo/rankme_last": 438.7358703613281, "geo/layer_0/stable_rank_q_proj": 15.385493278503418, "geo/layer_0/stable_rank_k_proj": 12.155217170715332, "geo/layer_0/stable_rank_o_proj": 56.01168441772461, "geo/layer_0/stable_rank_gate_proj": 166.44461059570312, "geo/layer_0/stable_rank_down_proj": 47.54403305053711, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.03622034192085266, "geo/layer_0/attn_entropy_mean": 6.428693771362305, "geo/layer_0/attn_entropy_std": 0.25651705265045166, "geo/layer_7/stable_rank_q_proj": 43.62860870361328, "geo/layer_7/stable_rank_k_proj": 44.501197814941406, "geo/layer_7/stable_rank_o_proj": 104.71492004394531, "geo/layer_7/stable_rank_gate_proj": 134.1261444091797, "geo/layer_7/stable_rank_down_proj": 177.12571716308594, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5776582956314087, "geo/layer_7/attn_entropy_mean": 4.716583251953125, "geo/layer_7/attn_entropy_std": 0.9514551758766174, "geo/layer_14/stable_rank_q_proj": 68.82017517089844, "geo/layer_14/stable_rank_k_proj": 46.83985137939453, "geo/layer_14/stable_rank_o_proj": 48.157527923583984, "geo/layer_14/stable_rank_gate_proj": 120.23361206054688, "geo/layer_14/stable_rank_down_proj": 141.2259979248047, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3998432755470276, "geo/layer_14/attn_entropy_mean": 5.734564781188965, "geo/layer_14/attn_entropy_std": 0.6366676092147827, "geo/layer_21/stable_rank_q_proj": 52.0278434753418, "geo/layer_21/stable_rank_k_proj": 32.615962982177734, "geo/layer_21/stable_rank_o_proj": 93.08242797851562, "geo/layer_21/stable_rank_gate_proj": 115.80818176269531, "geo/layer_21/stable_rank_down_proj": 77.66935729980469, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1606203317642212, "geo/layer_21/attn_entropy_mean": 5.819343566894531, "geo/layer_21/attn_entropy_std": 0.3183450698852539, "geo/layer_27/stable_rank_q_proj": 42.825748443603516, "geo/layer_27/stable_rank_k_proj": 34.30677795410156, "geo/layer_27/stable_rank_o_proj": 113.34638214111328, "geo/layer_27/stable_rank_gate_proj": 94.83590698242188, "geo/layer_27/stable_rank_down_proj": 158.81472778320312, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.06723731756210327, "geo/layer_27/attn_entropy_mean": 4.557522773742676, "geo/layer_27/attn_entropy_std": 0.512448787689209, "attnres/final_alpha/block_0": 0.2505697011947632, "attnres/block_norm/0": 1.343198299407959, "attnres/final_alpha/block_1": 0.010899328626692295, "attnres/block_norm/1": 14570.125, "attnres/final_alpha/block_2": 0.022592617198824883, "attnres/block_norm/2": 10668.208984375, "attnres/final_alpha/block_3": 0.022477036342024803, "attnres/block_norm/3": 11296.1806640625, "attnres/final_alpha/block_4": 0.03392307460308075, "attnres/block_norm/4": 4792.50634765625, "attnres/final_alpha/block_5": 0.4751982092857361, "attnres/block_norm/5": 3645.105224609375, "attnres/final_alpha/block_6": 0.1843400001525879, "attnres/block_norm/6": 8350.0048828125, "geo/tier1_time_s": 1.3601279258728027, "geo/step": 8475.0, "geo/rankme_slope": 0.006507453723676971} {"step": 8480, "timestamp": 1778203694.091755, "train/loss": 2.3051296949386595, "train/z_loss": 0.0018571573426015675, "train/perplexity": 10.02547842191939, "train/grad_norm": 0.1875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1694511.6112614172, "perf/iters_per_sec": 0.8080061012560926, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2376144170761108, "data/tokens_consumed": 17785946112, "data/tokens_consumed_B": 17.785946112, "train/loss_slope": -5.134436401775779e-05} {"step": 8490, "timestamp": 1778203704.4620328, "train/loss": 2.370268726348877, "train/z_loss": 0.0018443179549649357, "train/perplexity": 10.70026734156661, "train/grad_norm": 0.177734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023043.971004723, "perf/iters_per_sec": 0.9646625380538574, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0366319417953491, "data/tokens_consumed": 17806917632, "data/tokens_consumed_B": 17.806917632, "train/loss_slope": -4.705157362230894e-05} {"step": 8500, "timestamp": 1778203714.8095715, "grad/layer_0/attn": 0.0024814754724502563, "grad/layer_0/mlp": 0.0028388546779751778, "grad/layer_0/attn_mlp_ratio": 0.8741114521610636, "grad/layer_4/attn": 0.001780279097147286, "grad/layer_4/mlp": 0.0028758335392922163, "grad/layer_4/attn_mlp_ratio": 0.6190480119654759, "grad/layer_8/attn": 0.008960619568824768, "grad/layer_8/mlp": 0.004278813488781452, "grad/layer_8/attn_mlp_ratio": 2.094183208242225, "grad/layer_12/attn": 0.005411941558122635, "grad/layer_12/mlp": 0.005386304575949907, "grad/layer_12/attn_mlp_ratio": 1.0047596420394111, "grad/layer_16/attn": 0.0069229188375175, "grad/layer_16/mlp": 0.004534842912107706, "grad/layer_16/attn_mlp_ratio": 1.526606063106, "grad/layer_20/attn": 0.005966451950371265, "grad/layer_20/mlp": 0.006604472175240517, "grad/layer_20/attn_mlp_ratio": 0.9033957145582815, "grad/layer_24/attn": 0.010120430029928684, "grad/layer_24/mlp": 0.00984685868024826, "grad/layer_24/attn_mlp_ratio": 1.0277825909547091, "grad/layer_27/attn": 0.009263127110898495, "grad/layer_27/mlp": 0.00776897557079792, "grad/layer_27/attn_mlp_ratio": 1.1923228368080494} {"step": 8500, "timestamp": 1778203714.825448, "train/loss": 2.332855725288391, "train/z_loss": 0.0018454551347531379, "train/perplexity": 10.307334459678561, "train/grad_norm": 0.1484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024661.8566666432, "perf/iters_per_sec": 0.9654340060551849, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0358035802841186, "data/tokens_consumed": 17827889152, "data/tokens_consumed_B": 17.827889152, "train/loss_slope": -4.684523224937923e-05} {"step": 8500, "timestamp": 1778203721.809425, "geo/ww_alpha_mean": 8.402709228903598, "geo/ww_alpha_std": 5.029018536359205, "geo/ww_alpha_min": 2.257995615658287, "geo/ww_alpha_max": 36.27778657557962, "geo/ww_alpha_healthy_frac": 0.116751269035533, "geo/ww_alpha_by_type/q_proj": 4.5407468494764265, "geo/ww_alpha_by_type/k_proj": 5.51783880366414, "geo/ww_alpha_by_type/v_proj": 8.93638612498211, "geo/ww_alpha_by_type/o_proj": 8.948686758752592, "geo/ww_alpha_by_type/gate_proj": 9.89320124825249, "geo/ww_alpha_by_type/up_proj": 11.7166597174767, "geo/ww_alpha_by_type/down_proj": 9.484899157336626, "geo/twonn_id/layer_0": 0.6758924126625061, "geo/twonn_id/layer_7": 2.8420941829681396, "geo/twonn_id/layer_14": 3.6641998291015625, "geo/twonn_id/layer_21": 8.473710060119629, "geo/twonn_id/layer_27": 5.913145065307617, "geo/tier2_time_s": 6.9772021770477295} {"step": 8500, "timestamp": 1778203722.4386582, "eoc/jacobian_sigma/layer_0/attn": 579.7523803710938, "eoc/jacobian_sigma/layer_0/mlp": 2434.693115234375, "eoc/jacobian_sigma/layer_0": 2434.693115234375, "eoc/jacobian_sigma/layer_7/attn": 1.163734793663025, "eoc/jacobian_sigma/layer_7/mlp": 1.6789854764938354, "eoc/jacobian_sigma/layer_7": 1.6789854764938354, "eoc/jacobian_sigma/layer_14/attn": 1.261862874031067, "eoc/jacobian_sigma/layer_14/mlp": 6.104692459106445, "eoc/jacobian_sigma/layer_14": 6.104692459106445, "eoc/jacobian_sigma/layer_21/attn": 1.0803157091140747, "eoc/jacobian_sigma/layer_21/mlp": 2.8950233459472656, "eoc/jacobian_sigma/layer_21": 2.8950233459472656, "eoc/jacobian_sigma/layer_27/attn": 1.8511968851089478, "eoc/jacobian_sigma/layer_27/mlp": 16.021638870239258, "eoc/jacobian_sigma/layer_27": 16.021638870239258, "eoc/layer0_sigma": 2434.693115234375, "eoc/sigma_max": 16.021638870239258, "eoc/sigma_min": 1.6789854764938354, "eoc/sigma_mean": 6.675085037946701, "eoc/time_s": 0.623136043548584} {"step": 8510, "timestamp": 1778203732.802197, "train/loss": 2.325529360771179, "train/z_loss": 0.0018528023152612149, "train/perplexity": 10.232095122175282, "train/grad_norm": 0.1591796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1166970.610324216, "perf/iters_per_sec": 0.5564549495335656, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.7970906734466552, "data/tokens_consumed": 17848860672, "data/tokens_consumed_B": 17.848860672, "train/loss_slope": -5.071937200939405e-05} {"step": 8520, "timestamp": 1778203743.164524, "train/loss": 2.4026044607162476, "train/z_loss": 0.001834137667901814, "train/perplexity": 11.051923229262554, "train/grad_norm": 0.1337890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025373.457026559, "perf/iters_per_sec": 0.9657733235485835, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035439658164978, "data/tokens_consumed": 17869832192, "data/tokens_consumed_B": 17.869832192, "train/loss_slope": -4.845742297322762e-05} {"step": 8530, "timestamp": 1778203753.5237653, "train/loss": 2.3214842319488525, "train/z_loss": 0.0018526708474382759, "train/perplexity": 10.190788580749546, "train/grad_norm": 0.15625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025230.5280629147, "perf/iters_per_sec": 0.9657051697077345, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0355127334594727, "data/tokens_consumed": 17890803712, "data/tokens_consumed_B": 17.890803712, "train/loss_slope": -5.2964977963422066e-05} {"step": 8540, "timestamp": 1778203763.891832, "train/loss": 2.350792932510376, "train/z_loss": 0.0018514340394176544, "train/perplexity": 10.49388737108242, "train/grad_norm": 0.240234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024143.4362937047, "perf/iters_per_sec": 0.9651868039577983, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036068868637085, "data/tokens_consumed": 17911775232, "data/tokens_consumed_B": 17.911775232, "train/loss_slope": -5.309748096649187e-05} {"step": 8550, "timestamp": 1778203774.2441554, "grad/layer_0/attn": 0.0024817679077386856, "grad/layer_0/mlp": 0.0027861734852194786, "grad/layer_0/attn_mlp_ratio": 0.8907441808020681, "grad/layer_4/attn": 0.001829397166147828, "grad/layer_4/mlp": 0.00288840988650918, "grad/layer_4/attn_mlp_ratio": 0.6333578594078905, "grad/layer_8/attn": 0.009380931034684181, "grad/layer_8/mlp": 0.004278514068573713, "grad/layer_8/attn_mlp_ratio": 2.192567481390721, "grad/layer_12/attn": 0.004833092913031578, "grad/layer_12/mlp": 0.005825646687299013, "grad/layer_12/attn_mlp_ratio": 0.8296234031161337, "grad/layer_16/attn": 0.006133607588708401, "grad/layer_16/mlp": 0.0044702086597681046, "grad/layer_16/attn_mlp_ratio": 1.3721076393368676, "grad/layer_20/attn": 0.004629150964319706, "grad/layer_20/mlp": 0.006001099478453398, "grad/layer_20/attn_mlp_ratio": 0.7713837945533192, "grad/layer_24/attn": 0.009140042588114738, "grad/layer_24/mlp": 0.01188434474170208, "grad/layer_24/attn_mlp_ratio": 0.7690825796338723, "grad/layer_27/attn": 0.007426399737596512, "grad/layer_27/mlp": 0.009554463438689709, "grad/layer_27/attn_mlp_ratio": 0.7772701949747524} {"step": 8550, "timestamp": 1778203774.8713691, "eos/sharpness": 13.918876647949215, "eos/L0_probe": 2.187366485595703, "eos/L_plus": 2.2474303245544434, "eos/L_minus": 2.266491413116455, "eos/grad_norm": 0.12179148197174072, "eos/embed_grad_frac": 0.21677112579345703, "eos/time_s": 0.6243665218353271} {"step": 8550, "timestamp": 1778203774.8921769, "train/loss": 2.37061448097229, "train/z_loss": 0.0018431287142448127, "train/perplexity": 10.703967648133904, "train/grad_norm": 0.12158203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1907341.5680658193, "perf/iters_per_sec": 0.9094913330391976, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.099515700340271, "data/tokens_consumed": 17932746752, "data/tokens_consumed_B": 17.932746752, "train/loss_slope": -5.50329238060105e-05} {"step": 8550, "timestamp": 1778203776.2532237, "geo/rankme_last": 437.3226623535156, "geo/layer_0/stable_rank_q_proj": 15.361181259155273, "geo/layer_0/stable_rank_k_proj": 12.204649925231934, "geo/layer_0/stable_rank_o_proj": 56.038021087646484, "geo/layer_0/stable_rank_gate_proj": 166.64414978027344, "geo/layer_0/stable_rank_down_proj": 47.57754898071289, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.03315845504403114, "geo/layer_0/attn_entropy_mean": 6.421274185180664, "geo/layer_0/attn_entropy_std": 0.26070722937583923, "geo/layer_7/stable_rank_q_proj": 43.49687957763672, "geo/layer_7/stable_rank_k_proj": 44.73458480834961, "geo/layer_7/stable_rank_o_proj": 105.10477447509766, "geo/layer_7/stable_rank_gate_proj": 133.64305114746094, "geo/layer_7/stable_rank_down_proj": 176.70941162109375, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5865552425384521, "geo/layer_7/attn_entropy_mean": 4.714263439178467, "geo/layer_7/attn_entropy_std": 0.9134592413902283, "geo/layer_14/stable_rank_q_proj": 68.8330307006836, "geo/layer_14/stable_rank_k_proj": 46.442508697509766, "geo/layer_14/stable_rank_o_proj": 48.3139533996582, "geo/layer_14/stable_rank_gate_proj": 119.56256103515625, "geo/layer_14/stable_rank_down_proj": 141.47994995117188, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3962935507297516, "geo/layer_14/attn_entropy_mean": 5.791489124298096, "geo/layer_14/attn_entropy_std": 0.6143523454666138, "geo/layer_21/stable_rank_q_proj": 51.9580078125, "geo/layer_21/stable_rank_k_proj": 32.50218963623047, "geo/layer_21/stable_rank_o_proj": 93.13027954101562, "geo/layer_21/stable_rank_gate_proj": 116.0733413696289, "geo/layer_21/stable_rank_down_proj": 77.50626373291016, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1654333919286728, "geo/layer_21/attn_entropy_mean": 5.832653522491455, "geo/layer_21/attn_entropy_std": 0.29515084624290466, "geo/layer_27/stable_rank_q_proj": 42.89523696899414, "geo/layer_27/stable_rank_k_proj": 34.177223205566406, "geo/layer_27/stable_rank_o_proj": 113.44017791748047, "geo/layer_27/stable_rank_gate_proj": 94.68743896484375, "geo/layer_27/stable_rank_down_proj": 158.441650390625, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0616416409611702, "geo/layer_27/attn_entropy_mean": 4.540411949157715, "geo/layer_27/attn_entropy_std": 0.5160000324249268, "attnres/final_alpha/block_0": 0.2472468763589859, "attnres/block_norm/0": 1.346513271331787, "attnres/final_alpha/block_1": 0.010696720331907272, "attnres/block_norm/1": 14721.990234375, "attnres/final_alpha/block_2": 0.022444499656558037, "attnres/block_norm/2": 10805.859375, "attnres/final_alpha/block_3": 0.022233635187149048, "attnres/block_norm/3": 11511.3447265625, "attnres/final_alpha/block_4": 0.03299208730459213, "attnres/block_norm/4": 4813.560546875, "attnres/final_alpha/block_5": 0.4852181077003479, "attnres/block_norm/5": 3646.49365234375, "attnres/final_alpha/block_6": 0.17916806042194366, "attnres/block_norm/6": 8556.1875, "geo/tier1_time_s": 1.3566176891326904, "geo/step": 8550.0, "geo/rankme_slope": 0.006191657326993298} {"step": 8560, "timestamp": 1778203786.613185, "train/loss": 2.361422228813171, "train/z_loss": 0.0018402832793071866, "train/perplexity": 10.606024925171146, "train/grad_norm": 0.27734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1789856.509032087, "perf/iters_per_sec": 0.8534700913582263, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1716872215270997, "data/tokens_consumed": 17953718272, "data/tokens_consumed_B": 17.953718272, "train/loss_slope": -5.448307870137523e-05} {"step": 8570, "timestamp": 1778203796.9668305, "train/loss": 2.4069286584854126, "train/z_loss": 0.0018257965217344462, "train/perplexity": 11.099817408356882, "train/grad_norm": 0.19140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026502.34835594, "perf/iters_per_sec": 0.9663116208820057, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348628520965577, "data/tokens_consumed": 17974689792, "data/tokens_consumed_B": 17.974689792, "train/loss_slope": -5.3609260788845144e-05} {"step": 8580, "timestamp": 1778203807.322414, "train/loss": 2.326728177070618, "train/z_loss": 0.0018517127842642366, "train/perplexity": 10.244368880105737, "train/grad_norm": 0.24609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026238.829389984, "perf/iters_per_sec": 0.9661859652471466, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349974393844605, "data/tokens_consumed": 17995661312, "data/tokens_consumed_B": 17.995661312, "train/loss_slope": -5.312808116491654e-05} {"step": 8590, "timestamp": 1778203817.6963732, "train/loss": 2.3376860857009887, "train/z_loss": 0.0018425044836476445, "train/perplexity": 10.357243041189717, "train/grad_norm": 0.1484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022885.4136551095, "perf/iters_per_sec": 0.9645869320178554, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036713194847107, "data/tokens_consumed": 18016632832, "data/tokens_consumed_B": 18.016632832, "train/loss_slope": -5.432425174060754e-05} {"step": 8600, "timestamp": 1778203828.0466464, "grad/layer_0/attn": 0.0030636603478342295, "grad/layer_0/mlp": 0.0030606668442487717, "grad/layer_0/attn_mlp_ratio": 1.0009780233001446, "grad/layer_4/attn": 0.0016160949598997831, "grad/layer_4/mlp": 0.002923912601545453, "grad/layer_4/attn_mlp_ratio": 0.552716556498279, "grad/layer_8/attn": 0.00514311995357275, "grad/layer_8/mlp": 0.004448302555829287, "grad/layer_8/attn_mlp_ratio": 1.156198296631849, "grad/layer_12/attn": 0.004493466578423977, "grad/layer_12/mlp": 0.00604392122477293, "grad/layer_12/attn_mlp_ratio": 0.7434687410648577, "grad/layer_16/attn": 0.006006675306707621, "grad/layer_16/mlp": 0.005160880740731955, "grad/layer_16/attn_mlp_ratio": 1.1638856799987092, "grad/layer_20/attn": 0.018481293693184853, "grad/layer_20/mlp": 0.00739213265478611, "grad/layer_20/attn_mlp_ratio": 2.5001301121410404, "grad/layer_24/attn": 0.012562932446599007, "grad/layer_24/mlp": 0.011364641599357128, "grad/layer_24/attn_mlp_ratio": 1.1054402575058448, "grad/layer_27/attn": 0.006258436944335699, "grad/layer_27/mlp": 0.011421244591474533, "grad/layer_27/attn_mlp_ratio": 0.5479645269317583} {"step": 8600, "timestamp": 1778203828.0628383, "train/loss": 2.389088582992554, "train/z_loss": 0.001829726411961019, "train/perplexity": 10.903551730490664, "train/grad_norm": 0.18359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024630.4932385974, "perf/iters_per_sec": 0.9654190508072841, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0358196258544923, "data/tokens_consumed": 18037604352, "data/tokens_consumed_B": 18.037604352, "train/loss_slope": -5.381798086815897e-05} {"step": 8610, "timestamp": 1778203838.426624, "train/loss": 2.3699800252914427, "train/z_loss": 0.0018356487271375954, "train/perplexity": 10.69717860895191, "train/grad_norm": 0.201171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025003.0025274055, "perf/iters_per_sec": 0.9655966770779636, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0356290817260743, "data/tokens_consumed": 18058575872, "data/tokens_consumed_B": 18.058575872, "train/loss_slope": -5.5122285321278856e-05} {"step": 8620, "timestamp": 1778203848.7811968, "train/loss": 2.3744356632232666, "train/z_loss": 0.0018324490287341178, "train/perplexity": 10.744947705596134, "train/grad_norm": 0.1904296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026614.358836663, "perf/iters_per_sec": 0.9663650316413226, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348056554794312, "data/tokens_consumed": 18079547392, "data/tokens_consumed_B": 18.079547392, "train/loss_slope": -5.523354427756542e-05} {"step": 8625, "timestamp": 1778203854.6200159, "eos/sharpness": 44.22252178192138, "eos/L0_probe": 2.1865038871765137, "eos/L_plus": 2.359861373901367, "eos/L_minus": 2.455371618270874, "eos/grad_norm": 0.2608165740966797, "eos/embed_grad_frac": 0.05151195079088211, "eos/time_s": 0.6220245361328125} {"step": 8625, "timestamp": 1778203855.997597, "geo/rankme_last": 438.5702209472656, "geo/layer_0/stable_rank_q_proj": 15.338473320007324, "geo/layer_0/stable_rank_k_proj": 12.191000938415527, "geo/layer_0/stable_rank_o_proj": 56.12321472167969, "geo/layer_0/stable_rank_gate_proj": 166.50021362304688, "geo/layer_0/stable_rank_down_proj": 47.68033218383789, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.03268040344119072, "geo/layer_0/attn_entropy_mean": 6.4230241775512695, "geo/layer_0/attn_entropy_std": 0.26055672764778137, "geo/layer_7/stable_rank_q_proj": 43.652095794677734, "geo/layer_7/stable_rank_k_proj": 44.26491928100586, "geo/layer_7/stable_rank_o_proj": 105.02976989746094, "geo/layer_7/stable_rank_gate_proj": 133.08599853515625, "geo/layer_7/stable_rank_down_proj": 175.64466857910156, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5822415947914124, "geo/layer_7/attn_entropy_mean": 4.7041120529174805, "geo/layer_7/attn_entropy_std": 0.9384149312973022, "geo/layer_14/stable_rank_q_proj": 68.9200439453125, "geo/layer_14/stable_rank_k_proj": 46.07294845581055, "geo/layer_14/stable_rank_o_proj": 48.22602081298828, "geo/layer_14/stable_rank_gate_proj": 118.60140228271484, "geo/layer_14/stable_rank_down_proj": 140.64346313476562, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.40602993965148926, "geo/layer_14/attn_entropy_mean": 5.773713111877441, "geo/layer_14/attn_entropy_std": 0.5934751629829407, "geo/layer_21/stable_rank_q_proj": 52.06077575683594, "geo/layer_21/stable_rank_k_proj": 32.47453689575195, "geo/layer_21/stable_rank_o_proj": 93.09541320800781, "geo/layer_21/stable_rank_gate_proj": 115.63117980957031, "geo/layer_21/stable_rank_down_proj": 77.21007537841797, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.16126634180545807, "geo/layer_21/attn_entropy_mean": 5.818858623504639, "geo/layer_21/attn_entropy_std": 0.30535003542900085, "geo/layer_27/stable_rank_q_proj": 42.93164825439453, "geo/layer_27/stable_rank_k_proj": 33.940025329589844, "geo/layer_27/stable_rank_o_proj": 113.3498306274414, "geo/layer_27/stable_rank_gate_proj": 94.72948455810547, "geo/layer_27/stable_rank_down_proj": 157.96754455566406, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.05670351907610893, "geo/layer_27/attn_entropy_mean": 4.564912796020508, "geo/layer_27/attn_entropy_std": 0.5235041379928589, "attnres/final_alpha/block_0": 0.24831649661064148, "attnres/block_norm/0": 1.3501617908477783, "attnres/final_alpha/block_1": 0.010712860152125359, "attnres/block_norm/1": 14780.03125, "attnres/final_alpha/block_2": 0.022049464285373688, "attnres/block_norm/2": 10974.44921875, "attnres/final_alpha/block_3": 0.022260818630456924, "attnres/block_norm/3": 11620.9443359375, "attnres/final_alpha/block_4": 0.032794784754514694, "attnres/block_norm/4": 4865.17041015625, "attnres/final_alpha/block_5": 0.481058806180954, "attnres/block_norm/5": 3667.380615234375, "attnres/final_alpha/block_6": 0.18280676007270813, "attnres/block_norm/6": 8559.4404296875, "geo/tier1_time_s": 1.3579182624816895, "geo/step": 8625.0, "geo/rankme_slope": 0.005901537489995999} {"step": 8630, "timestamp": 1778203861.6148632, "train/loss": 2.3122504711151124, "train/z_loss": 0.0018493648851290346, "train/perplexity": 10.09712238741703, "train/grad_norm": 0.1875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1640483.5850908775, "perf/iters_per_sec": 0.7822435307936084, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2783742666244506, "data/tokens_consumed": 18100518912, "data/tokens_consumed_B": 18.100518912, "train/loss_slope": -5.729065967900883e-05} {"step": 8640, "timestamp": 1778203872.4679563, "train/loss": 2.360610508918762, "train/z_loss": 0.0018371175974607468, "train/perplexity": 10.59741929689114, "train/grad_norm": 0.119140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1933596.5267310957, "perf/iters_per_sec": 0.9220106729178885, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0845861434936523, "data/tokens_consumed": 18121490432, "data/tokens_consumed_B": 18.121490432, "train/loss_slope": -5.636150891309453e-05} {"step": 8650, "timestamp": 1778203882.8157725, "grad/layer_0/attn": 0.00275261583738029, "grad/layer_0/mlp": 0.0029052800964564085, "grad/layer_0/attn_mlp_ratio": 0.9474527932753872, "grad/layer_4/attn": 0.001537975505925715, "grad/layer_4/mlp": 0.002978644100949168, "grad/layer_4/attn_mlp_ratio": 0.5163340775765118, "grad/layer_8/attn": 0.004671633709222078, "grad/layer_8/mlp": 0.004397604148834944, "grad/layer_8/attn_mlp_ratio": 1.0623133517436758, "grad/layer_12/attn": 0.0058310506865382195, "grad/layer_12/mlp": 0.006077995058149099, "grad/layer_12/attn_mlp_ratio": 0.9593707357137677, "grad/layer_16/attn": 0.005180157721042633, "grad/layer_16/mlp": 0.0048238965682685375, "grad/layer_16/attn_mlp_ratio": 1.0738533756573958, "grad/layer_20/attn": 0.008263979107141495, "grad/layer_20/mlp": 0.006928465329110622, "grad/layer_20/attn_mlp_ratio": 1.1927575004444104, "grad/layer_24/attn": 0.010563998483121395, "grad/layer_24/mlp": 0.01486629992723465, "grad/layer_24/attn_mlp_ratio": 0.7106003823256926, "grad/layer_27/attn": 0.01583383046090603, "grad/layer_27/mlp": 0.013142252340912819, "grad/layer_27/attn_mlp_ratio": 1.2048034027724293} {"step": 8650, "timestamp": 1778203882.841687, "train/loss": 2.3938187837600706, "train/z_loss": 0.0018330593942664565, "train/perplexity": 10.955249894207101, "train/grad_norm": 0.16796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023218.4221078116, "perf/iters_per_sec": 0.9647457228220995, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0365425586700439, "data/tokens_consumed": 18142461952, "data/tokens_consumed_B": 18.142461952, "train/loss_slope": -5.4560861257043235e-05} {"step": 8660, "timestamp": 1778203893.1974888, "train/loss": 2.3343852758407593, "train/z_loss": 0.0018443243112415074, "train/perplexity": 10.32311211207726, "train/grad_norm": 0.09228515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026360.9405523452, "perf/iters_per_sec": 0.9662441923867918, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349350690841674, "data/tokens_consumed": 18163433472, "data/tokens_consumed_B": 18.163433472, "train/loss_slope": -5.2676736725510494e-05} {"step": 8670, "timestamp": 1778203903.550062, "train/loss": 2.4247456550598145, "train/z_loss": 0.0018221465754322708, "train/perplexity": 11.299355119961211, "train/grad_norm": 0.1796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026836.4087050736, "perf/iters_per_sec": 0.9664709132695549, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346922874450684, "data/tokens_consumed": 18184404992, "data/tokens_consumed_B": 18.184404992, "train/loss_slope": -4.8275393385304896e-05} {"step": 8680, "timestamp": 1778203913.9075956, "train/loss": 2.3541500091552736, "train/z_loss": 0.0018361748661845922, "train/perplexity": 10.529175354381174, "train/grad_norm": 0.234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026137.1746718856, "perf/iters_per_sec": 0.9661374925002506, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350493669509888, "data/tokens_consumed": 18205376512, "data/tokens_consumed_B": 18.205376512, "train/loss_slope": -4.8690781498899566e-05} {"step": 8690, "timestamp": 1778203924.8316114, "train/loss": 2.4445900201797484, "train/z_loss": 0.0018048429978080095, "train/perplexity": 11.525823275101796, "train/grad_norm": 0.162109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1920790.5079118218, "perf/iters_per_sec": 0.9159042872962102, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0918171405792236, "data/tokens_consumed": 18226348032, "data/tokens_consumed_B": 18.226348032, "train/loss_slope": -4.128580215704755e-05} {"step": 8700, "timestamp": 1778203935.1813877, "grad/layer_0/attn": 0.003020135685801506, "grad/layer_0/mlp": 0.0030587955843657255, "grad/layer_0/attn_mlp_ratio": 0.9873610392607063, "grad/layer_4/attn": 0.0013738047564402223, "grad/layer_4/mlp": 0.002856131410226226, "grad/layer_4/attn_mlp_ratio": 0.4810019256891316, "grad/layer_8/attn": 0.0071802036836743355, "grad/layer_8/mlp": 0.004248857498168945, "grad/layer_8/attn_mlp_ratio": 1.6899139398714282, "grad/layer_12/attn": 0.004442314151674509, "grad/layer_12/mlp": 0.005842472892254591, "grad/layer_12/attn_mlp_ratio": 0.7603482562202201, "grad/layer_16/attn": 0.006672247312963009, "grad/layer_16/mlp": 0.004657890181988478, "grad/layer_16/attn_mlp_ratio": 1.4324612451185932, "grad/layer_20/attn": 0.007429691031575203, "grad/layer_20/mlp": 0.006597880739718676, "grad/layer_20/attn_mlp_ratio": 1.1260723271704305, "grad/layer_24/attn": 0.010311070829629898, "grad/layer_24/mlp": 0.011764739640057087, "grad/layer_24/attn_mlp_ratio": 0.8764384982119345, "grad/layer_27/attn": 0.006412473972886801, "grad/layer_27/mlp": 0.010709784924983978, "grad/layer_27/attn_mlp_ratio": 0.598749083938442} {"step": 8700, "timestamp": 1778203935.8012621, "eos/sharpness": 29.272437095642083, "eos/L0_probe": 2.1824066638946533, "eos/L_plus": 2.3293967247009277, "eos/L_minus": 2.3281409740448, "eos/grad_norm": 0.1610240489244461, "eos/embed_grad_frac": 0.11353158205747604, "eos/time_s": 0.6168806552886963} {"step": 8700, "timestamp": 1778203935.821401, "train/loss": 2.3459859132766723, "train/z_loss": 0.0018318233080208302, "train/perplexity": 10.443564102017888, "train/grad_norm": 0.1611328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1909375.1503938294, "perf/iters_per_sec": 0.910461020657458, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.098344659805298, "data/tokens_consumed": 18247319552, "data/tokens_consumed_B": 18.247319552, "train/loss_slope": -4.104987639333624e-05} {"step": 8700, "timestamp": 1778203937.1868546, "geo/rankme_last": 439.22320556640625, "geo/layer_0/stable_rank_q_proj": 15.282270431518555, "geo/layer_0/stable_rank_k_proj": 12.193493843078613, "geo/layer_0/stable_rank_o_proj": 56.117225646972656, "geo/layer_0/stable_rank_gate_proj": 167.3254852294922, "geo/layer_0/stable_rank_down_proj": 47.79164505004883, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.030007893219590187, "geo/layer_0/attn_entropy_mean": 6.424142837524414, "geo/layer_0/attn_entropy_std": 0.25801360607147217, "geo/layer_7/stable_rank_q_proj": 43.662445068359375, "geo/layer_7/stable_rank_k_proj": 44.292236328125, "geo/layer_7/stable_rank_o_proj": 104.84542083740234, "geo/layer_7/stable_rank_gate_proj": 132.53428649902344, "geo/layer_7/stable_rank_down_proj": 176.42767333984375, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5990334153175354, "geo/layer_7/attn_entropy_mean": 4.688467979431152, "geo/layer_7/attn_entropy_std": 0.9522576332092285, "geo/layer_14/stable_rank_q_proj": 68.39569854736328, "geo/layer_14/stable_rank_k_proj": 45.679630279541016, "geo/layer_14/stable_rank_o_proj": 48.33279800415039, "geo/layer_14/stable_rank_gate_proj": 118.25477600097656, "geo/layer_14/stable_rank_down_proj": 141.008544921875, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3851327896118164, "geo/layer_14/attn_entropy_mean": 5.776352882385254, "geo/layer_14/attn_entropy_std": 0.5832370519638062, "geo/layer_21/stable_rank_q_proj": 52.12443161010742, "geo/layer_21/stable_rank_k_proj": 32.51676559448242, "geo/layer_21/stable_rank_o_proj": 93.14840698242188, "geo/layer_21/stable_rank_gate_proj": 115.64529418945312, "geo/layer_21/stable_rank_down_proj": 76.80187225341797, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15763522684574127, "geo/layer_21/attn_entropy_mean": 5.807963848114014, "geo/layer_21/attn_entropy_std": 0.3041418492794037, "geo/layer_27/stable_rank_q_proj": 42.8964958190918, "geo/layer_27/stable_rank_k_proj": 34.09647750854492, "geo/layer_27/stable_rank_o_proj": 113.35011291503906, "geo/layer_27/stable_rank_gate_proj": 94.76956176757812, "geo/layer_27/stable_rank_down_proj": 157.34271240234375, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.06427537649869919, "geo/layer_27/attn_entropy_mean": 4.561382293701172, "geo/layer_27/attn_entropy_std": 0.5096314549446106, "attnres/final_alpha/block_0": 0.24865834414958954, "attnres/block_norm/0": 1.3535159826278687, "attnres/final_alpha/block_1": 0.010566668584942818, "attnres/block_norm/1": 14963.701171875, "attnres/final_alpha/block_2": 0.022136140614748, "attnres/block_norm/2": 10962.775390625, "attnres/final_alpha/block_3": 0.022308947518467903, "attnres/block_norm/3": 11688.158203125, "attnres/final_alpha/block_4": 0.03290719911456108, "attnres/block_norm/4": 4902.44140625, "attnres/final_alpha/block_5": 0.4822831451892853, "attnres/block_norm/5": 3668.492919921875, "attnres/final_alpha/block_6": 0.18113954365253448, "attnres/block_norm/6": 8614.1826171875, "geo/tier1_time_s": 1.3610451221466064, "geo/step": 8700.0, "geo/rankme_slope": 0.005696590960602991} {"step": 8710, "timestamp": 1778203948.01136, "train/loss": 2.338704538345337, "train/z_loss": 0.0018466649809852243, "train/perplexity": 10.367796776080546, "train/grad_norm": 0.15234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1720973.6143272396, "perf/iters_per_sec": 0.8206241675983618, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2185846328735352, "data/tokens_consumed": 18268291072, "data/tokens_consumed_B": 18.268291072, "train/loss_slope": -4.289653794814831e-05} {"step": 8720, "timestamp": 1778203958.367341, "train/loss": 2.339238739013672, "train/z_loss": 0.001841088943183422, "train/perplexity": 10.37333673964186, "train/grad_norm": 0.15625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026599.2303953308, "perf/iters_per_sec": 0.9663578178383497, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348133802413941, "data/tokens_consumed": 18289262592, "data/tokens_consumed_B": 18.289262592, "train/loss_slope": -4.125686872838819e-05} {"step": 8730, "timestamp": 1778203968.7326689, "train/loss": 2.3443761587142946, "train/z_loss": 0.0018452872522175313, "train/perplexity": 10.426766051054182, "train/grad_norm": 0.1728515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024274.0996034965, "perf/iters_per_sec": 0.9652491090791209, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036001992225647, "data/tokens_consumed": 18310234112, "data/tokens_consumed_B": 18.310234112, "train/loss_slope": -4.240781056283652e-05} {"step": 8740, "timestamp": 1778203979.0875032, "train/loss": 2.344731855392456, "train/z_loss": 0.0018373114056885243, "train/perplexity": 10.430475476778602, "train/grad_norm": 0.1650390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026425.549658777, "perf/iters_per_sec": 0.9662750004094968, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03490207195282, "data/tokens_consumed": 18331205632, "data/tokens_consumed_B": 18.331205632, "train/loss_slope": -4.716480453558737e-05} {"step": 8750, "timestamp": 1778203989.4308846, "grad/layer_0/attn": 0.0029276045970618725, "grad/layer_0/mlp": 0.0029347511008381844, "grad/layer_0/attn_mlp_ratio": 0.9975648348744961, "grad/layer_4/attn": 0.0019013434648513794, "grad/layer_4/mlp": 0.002947130473330617, "grad/layer_4/attn_mlp_ratio": 0.6451507381644884, "grad/layer_8/attn": 0.005384515970945358, "grad/layer_8/mlp": 0.0043851397931575775, "grad/layer_8/attn_mlp_ratio": 1.2279006148349279, "grad/layer_12/attn": 0.003924638964235783, "grad/layer_12/mlp": 0.005781590007245541, "grad/layer_12/attn_mlp_ratio": 0.6788165351461686, "grad/layer_16/attn": 0.004604775924235582, "grad/layer_16/mlp": 0.004644880536943674, "grad/layer_16/attn_mlp_ratio": 0.9913658249064329, "grad/layer_20/attn": 0.006282923277467489, "grad/layer_20/mlp": 0.0065486072562634945, "grad/layer_20/attn_mlp_ratio": 0.9594289191056953, "grad/layer_24/attn": 0.010436009615659714, "grad/layer_24/mlp": 0.011151192709803581, "grad/layer_24/attn_mlp_ratio": 0.9358648705710556, "grad/layer_27/attn": 0.010050620883703232, "grad/layer_27/mlp": 0.009382769465446472, "grad/layer_27/attn_mlp_ratio": 1.0711784845187104} {"step": 8750, "timestamp": 1778203989.4464414, "train/loss": 2.3485756158828734, "train/z_loss": 0.0018316203262656928, "train/perplexity": 10.470644877633607, "train/grad_norm": 0.1533203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025965.3463185988, "perf/iters_per_sec": 0.9660555583565706, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035137152671814, "data/tokens_consumed": 18352177152, "data/tokens_consumed_B": 18.352177152, "train/loss_slope": -4.6238040659401166e-05} {"step": 8760, "timestamp": 1778203999.8029904, "train/loss": 2.369384741783142, "train/z_loss": 0.0018288098042830824, "train/perplexity": 10.690812649903863, "train/grad_norm": 0.173828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026034.1768284396, "perf/iters_per_sec": 0.9660883793012808, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351019859313966, "data/tokens_consumed": 18373148672, "data/tokens_consumed_B": 18.373148672, "train/loss_slope": -4.472836800748936e-05} {"step": 8770, "timestamp": 1778204010.155655, "train/loss": 2.325562834739685, "train/z_loss": 0.0018381392466835678, "train/perplexity": 10.23243763673778, "train/grad_norm": 0.1552734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026914.7798749125, "perf/iters_per_sec": 0.9665082835554659, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346522808074952, "data/tokens_consumed": 18394120192, "data/tokens_consumed_B": 18.394120192, "train/loss_slope": -4.489349060648021e-05} {"step": 8775, "timestamp": 1778204015.9251325, "eos/sharpness": 43.41895580291747, "eos/L0_probe": 2.1844003200531006, "eos/L_plus": 2.354304790496826, "eos/L_minus": 2.44868540763855, "eos/grad_norm": 0.21400082111358643, "eos/embed_grad_frac": 0.06350386142730713, "eos/time_s": 0.6046197414398193} {"step": 8775, "timestamp": 1778204017.2995102, "geo/rankme_last": 438.04937744140625, "geo/layer_0/stable_rank_q_proj": 15.283923149108887, "geo/layer_0/stable_rank_k_proj": 12.249441146850586, "geo/layer_0/stable_rank_o_proj": 56.19567108154297, "geo/layer_0/stable_rank_gate_proj": 167.66258239746094, "geo/layer_0/stable_rank_down_proj": 47.630279541015625, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.032814521342515945, "geo/layer_0/attn_entropy_mean": 6.423975467681885, "geo/layer_0/attn_entropy_std": 0.25423043966293335, "geo/layer_7/stable_rank_q_proj": 43.68964385986328, "geo/layer_7/stable_rank_k_proj": 44.18527603149414, "geo/layer_7/stable_rank_o_proj": 104.56599426269531, "geo/layer_7/stable_rank_gate_proj": 132.01861572265625, "geo/layer_7/stable_rank_down_proj": 175.51158142089844, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5881627202033997, "geo/layer_7/attn_entropy_mean": 4.7056732177734375, "geo/layer_7/attn_entropy_std": 0.9294925928115845, "geo/layer_14/stable_rank_q_proj": 68.04953002929688, "geo/layer_14/stable_rank_k_proj": 45.43960952758789, "geo/layer_14/stable_rank_o_proj": 48.31016159057617, "geo/layer_14/stable_rank_gate_proj": 117.78743743896484, "geo/layer_14/stable_rank_down_proj": 141.7757568359375, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.40530768036842346, "geo/layer_14/attn_entropy_mean": 5.762189865112305, "geo/layer_14/attn_entropy_std": 0.6148539781570435, "geo/layer_21/stable_rank_q_proj": 52.0225715637207, "geo/layer_21/stable_rank_k_proj": 32.55194091796875, "geo/layer_21/stable_rank_o_proj": 93.07186889648438, "geo/layer_21/stable_rank_gate_proj": 115.05128479003906, "geo/layer_21/stable_rank_down_proj": 76.35675048828125, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.16522906720638275, "geo/layer_21/attn_entropy_mean": 5.818626880645752, "geo/layer_21/attn_entropy_std": 0.31103649735450745, "geo/layer_27/stable_rank_q_proj": 42.92755126953125, "geo/layer_27/stable_rank_k_proj": 33.94611358642578, "geo/layer_27/stable_rank_o_proj": 113.47666931152344, "geo/layer_27/stable_rank_gate_proj": 94.89849853515625, "geo/layer_27/stable_rank_down_proj": 157.5629119873047, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.06327921152114868, "geo/layer_27/attn_entropy_mean": 4.561953067779541, "geo/layer_27/attn_entropy_std": 0.5298455953598022, "attnres/final_alpha/block_0": 0.24995297193527222, "attnres/block_norm/0": 1.3569061756134033, "attnres/final_alpha/block_1": 0.010662822052836418, "attnres/block_norm/1": 15106.83203125, "attnres/final_alpha/block_2": 0.02163248136639595, "attnres/block_norm/2": 11161.453125, "attnres/final_alpha/block_3": 0.021915454417467117, "attnres/block_norm/3": 11832.564453125, "attnres/final_alpha/block_4": 0.03290238976478577, "attnres/block_norm/4": 4957.044921875, "attnres/final_alpha/block_5": 0.47995004057884216, "attnres/block_norm/5": 3691.607421875, "attnres/final_alpha/block_6": 0.182983860373497, "attnres/block_norm/6": 8726.419921875, "geo/tier1_time_s": 1.3547661304473877, "geo/step": 8775.0, "geo/rankme_slope": 0.0054114994239883455} {"step": 8780, "timestamp": 1778204022.4758546, "train/loss": 2.2858208894729612, "train/z_loss": 0.0018486298155039548, "train/perplexity": 9.833755337875473, "train/grad_norm": 0.330078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1702990.4905768856, "perf/iters_per_sec": 0.8120491459736279, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2314525604248048, "data/tokens_consumed": 18415091712, "data/tokens_consumed_B": 18.415091712, "train/loss_slope": -4.9941241766216864e-05} {"step": 8790, "timestamp": 1778204032.8226085, "train/loss": 2.3869072914123537, "train/z_loss": 0.0018205856322310865, "train/perplexity": 10.87979382578549, "train/grad_norm": 0.11083984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027833.6409727614, "perf/iters_per_sec": 0.9669464306701476, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341834545135498, "data/tokens_consumed": 18436063232, "data/tokens_consumed_B": 18.436063232, "train/loss_slope": -4.573814376781838e-05} {"step": 8800, "timestamp": 1778204043.174032, "grad/layer_0/attn": 0.0026915566995739937, "grad/layer_0/mlp": 0.0028413652908056974, "grad/layer_0/attn_mlp_ratio": 0.9472758091175228, "grad/layer_4/attn": 0.001942130969837308, "grad/layer_4/mlp": 0.0027970285154879093, "grad/layer_4/attn_mlp_ratio": 0.6943550591807319, "grad/layer_8/attn": 0.012247548438608646, "grad/layer_8/mlp": 0.0041008563712239265, "grad/layer_8/attn_mlp_ratio": 2.9865830527234447, "grad/layer_12/attn": 0.004298804327845573, "grad/layer_12/mlp": 0.006121065001934767, "grad/layer_12/attn_mlp_ratio": 0.702296782709727, "grad/layer_16/attn": 0.004980006720870733, "grad/layer_16/mlp": 0.00493953051045537, "grad/layer_16/attn_mlp_ratio": 1.0081943232277353, "grad/layer_20/attn": 0.0055011361837387085, "grad/layer_20/mlp": 0.007181049790233374, "grad/layer_20/attn_mlp_ratio": 0.7660629389611343, "grad/layer_24/attn": 0.021797824651002884, "grad/layer_24/mlp": 0.01575392112135887, "grad/layer_24/attn_mlp_ratio": 1.3836443857196519, "grad/layer_27/attn": 0.009573242627084255, "grad/layer_27/mlp": 0.014255153946578503, "grad/layer_27/attn_mlp_ratio": 0.6715636039992152} {"step": 8800, "timestamp": 1778204043.1898577, "train/loss": 2.3545938968658446, "train/z_loss": 0.0018190722330473363, "train/perplexity": 10.533850163391808, "train/grad_norm": 0.24609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024362.2428140675, "perf/iters_per_sec": 0.9652911390371645, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0359568834304809, "data/tokens_consumed": 18457034752, "data/tokens_consumed_B": 18.457034752, "train/loss_slope": -4.582067942998447e-05} {"step": 8810, "timestamp": 1778204053.5445657, "train/loss": 2.3142934322357176, "train/z_loss": 0.0018474776181392372, "train/perplexity": 10.117771501371092, "train/grad_norm": 0.2041015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026813.851261594, "perf/iters_per_sec": 0.9664601570423098, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034703803062439, "data/tokens_consumed": 18478006272, "data/tokens_consumed_B": 18.478006272, "train/loss_slope": -4.4852328872737904e-05} {"step": 8820, "timestamp": 1778204063.9034393, "train/loss": 2.360802149772644, "train/z_loss": 0.001834363688249141, "train/perplexity": 10.599450389988133, "train/grad_norm": 0.1904296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025685.1727511655, "perf/iters_per_sec": 0.9659219611888721, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352803230285645, "data/tokens_consumed": 18498977792, "data/tokens_consumed_B": 18.498977792, "train/loss_slope": -4.3091196066761674e-05} {"step": 8830, "timestamp": 1778204074.2640793, "train/loss": 2.3454179048538206, "train/z_loss": 0.001826538552995771, "train/perplexity": 10.437633754046594, "train/grad_norm": 0.2138671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025300.8008920809, "perf/iters_per_sec": 0.9657386784038929, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035476803779602, "data/tokens_consumed": 18519949312, "data/tokens_consumed_B": 18.519949312, "train/loss_slope": -4.175652495526425e-05} {"step": 8840, "timestamp": 1778204084.6157072, "train/loss": 2.3557746410369873, "train/z_loss": 0.0018291750107891857, "train/perplexity": 10.546295291382217, "train/grad_norm": 0.1318359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027079.8088342918, "perf/iters_per_sec": 0.9665869754954776, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345680475234986, "data/tokens_consumed": 18540920832, "data/tokens_consumed_B": 18.540920832, "train/loss_slope": -3.733119630541777e-05} {"step": 8850, "timestamp": 1778204094.9649258, "grad/layer_0/attn": 0.002902396721765399, "grad/layer_0/mlp": 0.003062222618609667, "grad/layer_0/attn_mlp_ratio": 0.9478071938161202, "grad/layer_4/attn": 0.002722993725910783, "grad/layer_4/mlp": 0.0029930868186056614, "grad/layer_4/attn_mlp_ratio": 0.9097609925672648, "grad/layer_8/attn": 0.006638184655457735, "grad/layer_8/mlp": 0.004479498136788607, "grad/layer_8/attn_mlp_ratio": 1.481903620575306, "grad/layer_12/attn": 0.004745800979435444, "grad/layer_12/mlp": 0.006847005803138018, "grad/layer_12/attn_mlp_ratio": 0.6931206203956125, "grad/layer_16/attn": 0.005415685009211302, "grad/layer_16/mlp": 0.0050821853801608086, "grad/layer_16/attn_mlp_ratio": 1.0656212824880884, "grad/layer_20/attn": 0.010499924421310425, "grad/layer_20/mlp": 0.0072241779416799545, "grad/layer_20/attn_mlp_ratio": 1.453442088599011, "grad/layer_24/attn": 0.015107613056898117, "grad/layer_24/mlp": 0.013516106642782688, "grad/layer_24/attn_mlp_ratio": 1.11774887135789, "grad/layer_27/attn": 0.007648991886526346, "grad/layer_27/mlp": 0.0114655252546072, "grad/layer_27/attn_mlp_ratio": 0.6671296473521596} {"step": 8850, "timestamp": 1778204095.56996, "eos/sharpness": 22.71153926849365, "eos/L0_probe": 2.1811749935150146, "eos/L_plus": 2.266263246536255, "eos/L_minus": 2.323202133178711, "eos/grad_norm": 0.180587038397789, "eos/embed_grad_frac": 0.09472007304430008, "eos/time_s": 0.6023561954498291} {"step": 8850, "timestamp": 1778204095.5901213, "train/loss": 2.3621639728546144, "train/z_loss": 0.0018223467282950877, "train/perplexity": 10.613894799318095, "train/grad_norm": 0.1806640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1912575.6941135845, "perf/iters_per_sec": 0.9119871588294909, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.096506667137146, "data/tokens_consumed": 18561892352, "data/tokens_consumed_B": 18.561892352, "train/loss_slope": -3.4393847035174274e-05} {"step": 8850, "timestamp": 1778204096.9475718, "geo/rankme_last": 438.0498046875, "geo/layer_0/stable_rank_q_proj": 15.304779052734375, "geo/layer_0/stable_rank_k_proj": 12.278852462768555, "geo/layer_0/stable_rank_o_proj": 56.066558837890625, "geo/layer_0/stable_rank_gate_proj": 167.18067932128906, "geo/layer_0/stable_rank_down_proj": 47.72160720825195, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.03069021739065647, "geo/layer_0/attn_entropy_mean": 6.4202446937561035, "geo/layer_0/attn_entropy_std": 0.2597837746143341, "geo/layer_7/stable_rank_q_proj": 43.74629211425781, "geo/layer_7/stable_rank_k_proj": 44.12384033203125, "geo/layer_7/stable_rank_o_proj": 104.41299438476562, "geo/layer_7/stable_rank_gate_proj": 131.97515869140625, "geo/layer_7/stable_rank_down_proj": 174.64028930664062, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5959084630012512, "geo/layer_7/attn_entropy_mean": 4.683465003967285, "geo/layer_7/attn_entropy_std": 0.9328188300132751, "geo/layer_14/stable_rank_q_proj": 67.86107635498047, "geo/layer_14/stable_rank_k_proj": 45.249271392822266, "geo/layer_14/stable_rank_o_proj": 48.196903228759766, "geo/layer_14/stable_rank_gate_proj": 117.3838882446289, "geo/layer_14/stable_rank_down_proj": 141.82456970214844, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4004114866256714, "geo/layer_14/attn_entropy_mean": 5.742166042327881, "geo/layer_14/attn_entropy_std": 0.6150370836257935, "geo/layer_21/stable_rank_q_proj": 52.254581451416016, "geo/layer_21/stable_rank_k_proj": 32.57746505737305, "geo/layer_21/stable_rank_o_proj": 92.89303588867188, "geo/layer_21/stable_rank_gate_proj": 114.8692626953125, "geo/layer_21/stable_rank_down_proj": 75.84444427490234, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.16083726286888123, "geo/layer_21/attn_entropy_mean": 5.832915306091309, "geo/layer_21/attn_entropy_std": 0.31775909662246704, "geo/layer_27/stable_rank_q_proj": 42.93203353881836, "geo/layer_27/stable_rank_k_proj": 33.883670806884766, "geo/layer_27/stable_rank_o_proj": 112.78185272216797, "geo/layer_27/stable_rank_gate_proj": 94.80891418457031, "geo/layer_27/stable_rank_down_proj": 157.5263214111328, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.06327594816684723, "geo/layer_27/attn_entropy_mean": 4.567557334899902, "geo/layer_27/attn_entropy_std": 0.48483529686927795, "attnres/final_alpha/block_0": 0.2499505579471588, "attnres/block_norm/0": 1.3602863550186157, "attnres/final_alpha/block_1": 0.010587372817099094, "attnres/block_norm/1": 15278.2900390625, "attnres/final_alpha/block_2": 0.02179999276995659, "attnres/block_norm/2": 11277.71484375, "attnres/final_alpha/block_3": 0.022304769605398178, "attnres/block_norm/3": 12077.505859375, "attnres/final_alpha/block_4": 0.03287193924188614, "attnres/block_norm/4": 4979.13037109375, "attnres/final_alpha/block_5": 0.48273172974586487, "attnres/block_norm/5": 3694.84228515625, "attnres/final_alpha/block_6": 0.1797536462545395, "attnres/block_norm/6": 8848.212890625, "geo/tier1_time_s": 1.3535430431365967, "geo/step": 8850.0, "geo/rankme_slope": 0.0051525348811399556} {"step": 8860, "timestamp": 1778204107.3029013, "train/loss": 2.327656292915344, "train/z_loss": 0.0018449203576892613, "train/perplexity": 10.25388125479259, "train/grad_norm": 0.193359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1791069.5889777, "perf/iters_per_sec": 0.8540485329521657, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1708936452865601, "data/tokens_consumed": 18582863872, "data/tokens_consumed_B": 18.582863872, "train/loss_slope": -3.4413542856227334e-05} {"step": 8870, "timestamp": 1778204117.6594176, "train/loss": 2.3549425840377807, "train/z_loss": 0.0018323517171666027, "train/perplexity": 10.537523822256517, "train/grad_norm": 0.20703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025846.8754654897, "perf/iters_per_sec": 0.9659990670516442, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351976871490478, "data/tokens_consumed": 18603835392, "data/tokens_consumed_B": 18.603835392, "train/loss_slope": -3.491729418865887e-05} {"step": 8880, "timestamp": 1778204128.0221872, "train/loss": 2.350136923789978, "train/z_loss": 0.0018254646915011109, "train/perplexity": 10.487005546971229, "train/grad_norm": 0.271484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024563.0162300293, "perf/iters_per_sec": 0.9653868752622744, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0358541488647461, "data/tokens_consumed": 18624806912, "data/tokens_consumed_B": 18.624806912, "train/loss_slope": -3.4280475774685535e-05} {"step": 8890, "timestamp": 1778204138.382164, "train/loss": 2.323893117904663, "train/z_loss": 0.001835382787976414, "train/perplexity": 10.215366619201193, "train/grad_norm": 0.1923828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025282.0081448816, "perf/iters_per_sec": 0.9657297173237236, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354864120483398, "data/tokens_consumed": 18645778432, "data/tokens_consumed_B": 18.645778432, "train/loss_slope": -3.364141014578684e-05} {"step": 8900, "timestamp": 1778204148.728652, "grad/layer_0/attn": 0.0026461679954081774, "grad/layer_0/mlp": 0.0028383592143654823, "grad/layer_0/attn_mlp_ratio": 0.9322878826565094, "grad/layer_4/attn": 0.0014628057833760977, "grad/layer_4/mlp": 0.0027621048502624035, "grad/layer_4/attn_mlp_ratio": 0.5295981904080542, "grad/layer_8/attn": 0.0064549692906439304, "grad/layer_8/mlp": 0.004213134292513132, "grad/layer_8/attn_mlp_ratio": 1.5321061920347492, "grad/layer_12/attn": 0.004144784063100815, "grad/layer_12/mlp": 0.005922187585383654, "grad/layer_12/attn_mlp_ratio": 0.6998738107085686, "grad/layer_16/attn": 0.005023205652832985, "grad/layer_16/mlp": 0.0049466583877801895, "grad/layer_16/attn_mlp_ratio": 1.0154745198686934, "grad/layer_20/attn": 0.0073438724502921104, "grad/layer_20/mlp": 0.007291000336408615, "grad/layer_20/attn_mlp_ratio": 1.0072516816237553, "grad/layer_24/attn": 0.018856026232242584, "grad/layer_24/mlp": 0.014565565623342991, "grad/layer_24/attn_mlp_ratio": 1.294561885915879, "grad/layer_27/attn": 0.007342573255300522, "grad/layer_27/mlp": 0.014141758903861046, "grad/layer_27/attn_mlp_ratio": 0.5192121611813509} {"step": 8900, "timestamp": 1778204148.7441776, "train/loss": 2.3609848022460938, "train/z_loss": 0.0018325167358852923, "train/perplexity": 10.601386582638876, "train/grad_norm": 0.23828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024884.4111258257, "perf/iters_per_sec": 0.965540128291047, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0356897354125976, "data/tokens_consumed": 18666749952, "data/tokens_consumed_B": 18.666749952, "train/loss_slope": -3.490946724934875e-05} {"step": 8910, "timestamp": 1778204159.1038127, "train/loss": 2.398265027999878, "train/z_loss": 0.001814317109528929, "train/perplexity": 11.004068059263268, "train/grad_norm": 0.1953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025954.0072324595, "perf/iters_per_sec": 0.9660501514589593, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351429462432862, "data/tokens_consumed": 18687721472, "data/tokens_consumed_B": 18.687721472, "train/loss_slope": -3.275513880037531e-05} {"step": 8920, "timestamp": 1778204170.0735717, "train/loss": 2.2832117319107055, "train/z_loss": 0.0018489356152713299, "train/perplexity": 9.808130964321226, "train/grad_norm": 0.1396484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1912964.4361376213, "perf/iters_per_sec": 0.9121725254715067, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0962838411331177, "data/tokens_consumed": 18708692992, "data/tokens_consumed_B": 18.708692992, "train/loss_slope": -3.596351892069022e-05} {"step": 8925, "timestamp": 1778204175.8434496, "eos/sharpness": 12.820458412170408, "eos/L0_probe": 2.1741745471954346, "eos/L_plus": 2.257779598236084, "eos/L_minus": 2.2187740802764893, "eos/grad_norm": 0.12240660935640335, "eos/embed_grad_frac": 0.1921588033437729, "eos/time_s": 0.6003544330596924} {"step": 8925, "timestamp": 1778204177.220357, "geo/rankme_last": 438.17974853515625, "geo/layer_0/stable_rank_q_proj": 15.285266876220703, "geo/layer_0/stable_rank_k_proj": 12.279590606689453, "geo/layer_0/stable_rank_o_proj": 55.76753234863281, "geo/layer_0/stable_rank_gate_proj": 166.72828674316406, "geo/layer_0/stable_rank_down_proj": 47.86738204956055, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.033236853778362274, "geo/layer_0/attn_entropy_mean": 6.413069725036621, "geo/layer_0/attn_entropy_std": 0.26192426681518555, "geo/layer_7/stable_rank_q_proj": 43.688514709472656, "geo/layer_7/stable_rank_k_proj": 44.05145263671875, "geo/layer_7/stable_rank_o_proj": 104.39433288574219, "geo/layer_7/stable_rank_gate_proj": 131.44009399414062, "geo/layer_7/stable_rank_down_proj": 173.82138061523438, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5979813933372498, "geo/layer_7/attn_entropy_mean": 4.6888861656188965, "geo/layer_7/attn_entropy_std": 0.9231153130531311, "geo/layer_14/stable_rank_q_proj": 67.79598999023438, "geo/layer_14/stable_rank_k_proj": 44.78329849243164, "geo/layer_14/stable_rank_o_proj": 47.89451217651367, "geo/layer_14/stable_rank_gate_proj": 117.24576568603516, "geo/layer_14/stable_rank_down_proj": 141.4773406982422, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4067953824996948, "geo/layer_14/attn_entropy_mean": 5.7308244705200195, "geo/layer_14/attn_entropy_std": 0.6347660422325134, "geo/layer_21/stable_rank_q_proj": 52.165122985839844, "geo/layer_21/stable_rank_k_proj": 32.53794479370117, "geo/layer_21/stable_rank_o_proj": 92.77710723876953, "geo/layer_21/stable_rank_gate_proj": 114.0197982788086, "geo/layer_21/stable_rank_down_proj": 75.67182159423828, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15859252214431763, "geo/layer_21/attn_entropy_mean": 5.806113243103027, "geo/layer_21/attn_entropy_std": 0.3061898648738861, "geo/layer_27/stable_rank_q_proj": 43.21836471557617, "geo/layer_27/stable_rank_k_proj": 33.914588928222656, "geo/layer_27/stable_rank_o_proj": 112.74364471435547, "geo/layer_27/stable_rank_gate_proj": 94.99632263183594, "geo/layer_27/stable_rank_down_proj": 156.90798950195312, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.06157517060637474, "geo/layer_27/attn_entropy_mean": 4.5610198974609375, "geo/layer_27/attn_entropy_std": 0.48605096340179443, "attnres/final_alpha/block_0": 0.24999688565731049, "attnres/block_norm/0": 1.3634566068649292, "attnres/final_alpha/block_1": 0.010505655780434608, "attnres/block_norm/1": 15411.46484375, "attnres/final_alpha/block_2": 0.02142433077096939, "attnres/block_norm/2": 11348.54296875, "attnres/final_alpha/block_3": 0.021821431815624237, "attnres/block_norm/3": 12102.2724609375, "attnres/final_alpha/block_4": 0.03241218626499176, "attnres/block_norm/4": 5031.81298828125, "attnres/final_alpha/block_5": 0.48475050926208496, "attnres/block_norm/5": 3732.083740234375, "attnres/final_alpha/block_6": 0.1790890097618103, "attnres/block_norm/6": 8957.3408203125, "geo/tier1_time_s": 1.3575403690338135, "geo/step": 8925.0, "geo/rankme_slope": 0.0049245891716061425} {"step": 8930, "timestamp": 1778204182.4006774, "train/loss": 2.351183795928955, "train/z_loss": 0.0018258221331052481, "train/perplexity": 10.497989849476603, "train/grad_norm": 0.2177734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1702227.8459368497, "perf/iters_per_sec": 0.8116854886707543, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2320042848587036, "data/tokens_consumed": 18729664512, "data/tokens_consumed_B": 18.729664512, "train/loss_slope": -3.480375970479367e-05} {"step": 8940, "timestamp": 1778204192.7584236, "train/loss": 2.385933589935303, "train/z_loss": 0.001817787706386298, "train/perplexity": 10.86920531033046, "train/grad_norm": 0.146484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026054.1968851935, "perf/iters_per_sec": 0.9660979256082504, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035091757774353, "data/tokens_consumed": 18750636032, "data/tokens_consumed_B": 18.750636032, "train/loss_slope": -2.9759559243640347e-05} {"step": 8950, "timestamp": 1778204203.1025865, "grad/layer_0/attn": 0.0027720099315047264, "grad/layer_0/mlp": 0.0029762117192149162, "grad/layer_0/attn_mlp_ratio": 0.9313886577588905, "grad/layer_4/attn": 0.00182457419577986, "grad/layer_4/mlp": 0.0029027601704001427, "grad/layer_4/attn_mlp_ratio": 0.6285652364700252, "grad/layer_8/attn": 0.010267568752169609, "grad/layer_8/mlp": 0.004266886040568352, "grad/layer_8/attn_mlp_ratio": 2.4063376462166306, "grad/layer_12/attn": 0.004569375421851873, "grad/layer_12/mlp": 0.006276910658925772, "grad/layer_12/attn_mlp_ratio": 0.7279656501973074, "grad/layer_16/attn": 0.006054925266653299, "grad/layer_16/mlp": 0.004827747121453285, "grad/layer_16/attn_mlp_ratio": 1.2541926884130856, "grad/layer_20/attn": 0.0057389503344893456, "grad/layer_20/mlp": 0.008046085014939308, "grad/layer_20/attn_mlp_ratio": 0.71325995841552, "grad/layer_24/attn": 0.016446610912680626, "grad/layer_24/mlp": 0.01507518906146288, "grad/layer_24/attn_mlp_ratio": 1.0909721089751596, "grad/layer_27/attn": 0.008661187253892422, "grad/layer_27/mlp": 0.014103688299655914, "grad/layer_27/attn_mlp_ratio": 0.6141079559091598} {"step": 8950, "timestamp": 1778204203.1186557, "train/loss": 2.355385160446167, "train/z_loss": 0.0018255775910802185, "train/perplexity": 10.542188513868146, "train/grad_norm": 0.1953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025519.7644305818, "perf/iters_per_sec": 0.9658430883553418, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0353648662567139, "data/tokens_consumed": 18771607552, "data/tokens_consumed_B": 18.771607552, "train/loss_slope": -2.5252316007376724e-05} {"step": 8960, "timestamp": 1778204213.4752328, "train/loss": 2.3420939445495605, "train/z_loss": 0.0018304744618944824, "train/perplexity": 10.402997071148139, "train/grad_norm": 0.314453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026057.7436031692, "perf/iters_per_sec": 0.9660996168151709, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035089945793152, "data/tokens_consumed": 18792579072, "data/tokens_consumed_B": 18.792579072, "train/loss_slope": -2.555682663441141e-05} {"step": 8970, "timestamp": 1778204223.8339784, "train/loss": 2.33898184299469, "train/z_loss": 0.0018275798647664487, "train/perplexity": 10.370672212997686, "train/grad_norm": 0.1533203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025688.9514294164, "perf/iters_per_sec": 0.965923763003071, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352783918380737, "data/tokens_consumed": 18813550592, "data/tokens_consumed_B": 18.813550592, "train/loss_slope": -2.546661890128915e-05} {"step": 8980, "timestamp": 1778204234.1895332, "train/loss": 2.3224268674850466, "train/z_loss": 0.0018317130859941244, "train/perplexity": 10.200399309203043, "train/grad_norm": 0.24609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026229.0742018616, "perf/iters_per_sec": 0.9661813136109646, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350024223327636, "data/tokens_consumed": 18834522112, "data/tokens_consumed_B": 18.834522112, "train/loss_slope": -2.391766295073957e-05} {"step": 8990, "timestamp": 1778204244.5424592, "train/loss": 2.3993215799331664, "train/z_loss": 0.0018111582729034125, "train/perplexity": 11.015700572740506, "train/grad_norm": 0.1953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026723.7666027842, "perf/iters_per_sec": 0.9664172013296052, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347497940063477, "data/tokens_consumed": 18855493632, "data/tokens_consumed_B": 18.855493632, "train/loss_slope": -1.98207021224546e-05} {"step": 9000, "timestamp": 1778204254.8829799, "grad/layer_0/attn": 0.002467166632413864, "grad/layer_0/mlp": 0.002623031847178936, "grad/layer_0/attn_mlp_ratio": 0.940578186654301, "grad/layer_4/attn": 0.0022089146077632904, "grad/layer_4/mlp": 0.002847721567377448, "grad/layer_4/attn_mlp_ratio": 0.7756778455801636, "grad/layer_8/attn": 0.00696408748626709, "grad/layer_8/mlp": 0.004296667408198118, "grad/layer_8/attn_mlp_ratio": 1.620811355074482, "grad/layer_12/attn": 0.0035440735518932343, "grad/layer_12/mlp": 0.005421413574367762, "grad/layer_12/attn_mlp_ratio": 0.6537176029657136, "grad/layer_16/attn": 0.004320904612541199, "grad/layer_16/mlp": 0.004265219904482365, "grad/layer_16/attn_mlp_ratio": 1.0130555066327915, "grad/layer_20/attn": 0.005745889153331518, "grad/layer_20/mlp": 0.006153488531708717, "grad/layer_20/attn_mlp_ratio": 0.9337612364672533, "grad/layer_24/attn": 0.012888459488749504, "grad/layer_24/mlp": 0.012438378296792507, "grad/layer_24/attn_mlp_ratio": 1.0361848689273725, "grad/layer_27/attn": 0.009939386509358883, "grad/layer_27/mlp": 0.011456056497991085, "grad/layer_27/attn_mlp_ratio": 0.8676097594613698} {"step": 9000, "timestamp": 1778204255.4851222, "eos/sharpness": 22.925019264221188, "eos/L0_probe": 2.1715664863586426, "eos/L_plus": 2.269585371017456, "eos/L_minus": 2.302797794342041, "eos/grad_norm": 0.16538207232952118, "eos/embed_grad_frac": 0.12235366553068161, "eos/time_s": 0.5994575023651123} {"step": 9000, "timestamp": 1778204255.504168, "train/loss": 2.4318958282470704, "train/z_loss": 0.0017919691861607134, "train/perplexity": 11.380436995256474, "train/grad_norm": 0.1650390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1914248.4151106942, "perf/iters_per_sec": 0.9127847743562194, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0955485105514526, "data/tokens_consumed": 18876465152, "data/tokens_consumed_B": 18.876465152, "train/loss_slope": -1.4645108273892813e-05} {"step": 9000, "timestamp": 1778204256.864715, "geo/rankme_last": 438.73486328125, "geo/layer_0/stable_rank_q_proj": 15.259065628051758, "geo/layer_0/stable_rank_k_proj": 12.312257766723633, "geo/layer_0/stable_rank_o_proj": 55.864524841308594, "geo/layer_0/stable_rank_gate_proj": 167.7470245361328, "geo/layer_0/stable_rank_down_proj": 47.91908645629883, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.025926627218723297, "geo/layer_0/attn_entropy_mean": 6.409931182861328, "geo/layer_0/attn_entropy_std": 0.26167187094688416, "geo/layer_7/stable_rank_q_proj": 43.737281799316406, "geo/layer_7/stable_rank_k_proj": 44.081356048583984, "geo/layer_7/stable_rank_o_proj": 105.180419921875, "geo/layer_7/stable_rank_gate_proj": 130.78564453125, "geo/layer_7/stable_rank_down_proj": 173.53541564941406, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5847722291946411, "geo/layer_7/attn_entropy_mean": 4.668666839599609, "geo/layer_7/attn_entropy_std": 0.954805850982666, "geo/layer_14/stable_rank_q_proj": 67.15922546386719, "geo/layer_14/stable_rank_k_proj": 44.47834014892578, "geo/layer_14/stable_rank_o_proj": 47.84116744995117, "geo/layer_14/stable_rank_gate_proj": 116.98222351074219, "geo/layer_14/stable_rank_down_proj": 141.62057495117188, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38856664299964905, "geo/layer_14/attn_entropy_mean": 5.779136657714844, "geo/layer_14/attn_entropy_std": 0.6106052398681641, "geo/layer_21/stable_rank_q_proj": 52.34943771362305, "geo/layer_21/stable_rank_k_proj": 32.45665740966797, "geo/layer_21/stable_rank_o_proj": 92.71852111816406, "geo/layer_21/stable_rank_gate_proj": 113.99137115478516, "geo/layer_21/stable_rank_down_proj": 75.54280090332031, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.16406728327274323, "geo/layer_21/attn_entropy_mean": 5.812220573425293, "geo/layer_21/attn_entropy_std": 0.3070856034755707, "geo/layer_27/stable_rank_q_proj": 43.17866516113281, "geo/layer_27/stable_rank_k_proj": 33.754764556884766, "geo/layer_27/stable_rank_o_proj": 113.01195526123047, "geo/layer_27/stable_rank_gate_proj": 95.03430938720703, "geo/layer_27/stable_rank_down_proj": 156.5866241455078, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.061641719192266464, "geo/layer_27/attn_entropy_mean": 4.532896995544434, "geo/layer_27/attn_entropy_std": 0.49641555547714233, "attnres/final_alpha/block_0": 0.24953262507915497, "attnres/block_norm/0": 1.3667230606079102, "attnres/final_alpha/block_1": 0.010524546727538109, "attnres/block_norm/1": 15537.271484375, "attnres/final_alpha/block_2": 0.02105754241347313, "attnres/block_norm/2": 11438.8701171875, "attnres/final_alpha/block_3": 0.021652545779943466, "attnres/block_norm/3": 12327.62109375, "attnres/final_alpha/block_4": 0.032232657074928284, "attnres/block_norm/4": 5040.56298828125, "attnres/final_alpha/block_5": 0.4859321713447571, "attnres/block_norm/5": 3723.87744140625, "attnres/final_alpha/block_6": 0.17906790971755981, "attnres/block_norm/6": 8986.583984375, "geo/tier1_time_s": 1.3567969799041748, "geo/step": 9000.0, "geo/rankme_slope": 0.004687012226765706} {"step": 9000, "timestamp": 1778204264.1836753, "geo/ww_alpha_mean": 8.353719706438213, "geo/ww_alpha_std": 5.097140441636373, "geo/ww_alpha_min": 1.3651707475408696, "geo/ww_alpha_max": 39.13463585473272, "geo/ww_alpha_healthy_frac": 0.12690355329949238, "geo/ww_alpha_by_type/q_proj": 4.446985470870922, "geo/ww_alpha_by_type/k_proj": 4.678819826624801, "geo/ww_alpha_by_type/v_proj": 9.184442560284458, "geo/ww_alpha_by_type/o_proj": 7.826835658255901, "geo/ww_alpha_by_type/gate_proj": 9.9937320167994, "geo/ww_alpha_by_type/up_proj": 13.213291957558114, "geo/ww_alpha_by_type/down_proj": 9.288960071475412, "geo/twonn_id/layer_0": 0.6885560154914856, "geo/twonn_id/layer_7": 3.0670554637908936, "geo/twonn_id/layer_14": 3.5980513095855713, "geo/twonn_id/layer_21": 7.204709053039551, "geo/twonn_id/layer_27": 5.808625221252441, "geo/tier2_time_s": 7.311379432678223} {"step": 9000, "timestamp": 1778204264.8498132, "eoc/jacobian_sigma/layer_0/attn": 610.0594482421875, "eoc/jacobian_sigma/layer_0/mlp": 2460.54345703125, "eoc/jacobian_sigma/layer_0": 2460.54345703125, "eoc/jacobian_sigma/layer_7/attn": 1.1702290773391724, "eoc/jacobian_sigma/layer_7/mlp": 1.5541290044784546, "eoc/jacobian_sigma/layer_7": 1.5541290044784546, "eoc/jacobian_sigma/layer_14/attn": 1.296289324760437, "eoc/jacobian_sigma/layer_14/mlp": 5.995290279388428, "eoc/jacobian_sigma/layer_14": 5.995290279388428, "eoc/jacobian_sigma/layer_21/attn": 1.0857319831848145, "eoc/jacobian_sigma/layer_21/mlp": 2.856549024581909, "eoc/jacobian_sigma/layer_21": 2.856549024581909, "eoc/jacobian_sigma/layer_27/attn": 1.8897902965545654, "eoc/jacobian_sigma/layer_27/mlp": 16.257741928100586, "eoc/jacobian_sigma/layer_27": 16.257741928100586, "eoc/layer0_sigma": 2460.54345703125, "eoc/sigma_max": 16.257741928100586, "eoc/sigma_min": 1.5541290044784546, "eoc/sigma_mean": 6.665927559137344, "eoc/time_s": 0.6592872142791748} {"step": 9010, "timestamp": 1778204275.2190518, "train/loss": 2.3453937292099, "train/z_loss": 0.0018258479773066937, "train/perplexity": 10.437381420579756, "train/grad_norm": 0.1962890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1063932.4113249353, "perf/iters_per_sec": 0.5073225075363804, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.9711327314376832, "data/tokens_consumed": 18897436672, "data/tokens_consumed_B": 18.897436672, "train/loss_slope": -1.4906573038075512e-05} {"step": 9020, "timestamp": 1778204285.5668635, "train/loss": 2.3584889650344847, "train/z_loss": 0.0018220151541754603, "train/perplexity": 10.57496023915491, "train/grad_norm": 0.1943359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027926.6295346683, "perf/iters_per_sec": 0.9669907710717527, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341360330581666, "data/tokens_consumed": 18918408192, "data/tokens_consumed_B": 18.918408192, "train/loss_slope": -1.4301731698762771e-05} {"step": 9030, "timestamp": 1778204295.920964, "train/loss": 2.333313989639282, "train/z_loss": 0.0018351454869844019, "train/perplexity": 10.312059026081638, "train/grad_norm": 0.13671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026607.0747441235, "perf/iters_per_sec": 0.966361558315336, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034809374809265, "data/tokens_consumed": 18939379712, "data/tokens_consumed_B": 18.939379712, "train/loss_slope": -1.3868300828209907e-05} {"step": 9040, "timestamp": 1778204306.2699254, "train/loss": 2.347792625427246, "train/z_loss": 0.0018255965202115475, "train/perplexity": 10.462449671432964, "train/grad_norm": 0.216796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027516.2631103725, "perf/iters_per_sec": 0.9667950931121695, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343453407287597, "data/tokens_consumed": 18960351232, "data/tokens_consumed_B": 18.960351232, "train/loss_slope": -1.3846001444798678e-05} {"step": 9050, "timestamp": 1778204316.6096592, "grad/layer_0/attn": 0.0025402759201824665, "grad/layer_0/mlp": 0.002788579324260354, "grad/layer_0/attn_mlp_ratio": 0.9109569905315686, "grad/layer_4/attn": 0.0015633763978257775, "grad/layer_4/mlp": 0.002778653521090746, "grad/layer_4/attn_mlp_ratio": 0.5626381014025355, "grad/layer_8/attn": 0.006983937695622444, "grad/layer_8/mlp": 0.0041535645723342896, "grad/layer_8/attn_mlp_ratio": 1.6814322748217847, "grad/layer_12/attn": 0.0040273950435221195, "grad/layer_12/mlp": 0.0057474044151604176, "grad/layer_12/attn_mlp_ratio": 0.7007328321677581, "grad/layer_16/attn": 0.005655817221850157, "grad/layer_16/mlp": 0.00454695662483573, "grad/layer_16/attn_mlp_ratio": 1.2438687157407438, "grad/layer_20/attn": 0.004870416596531868, "grad/layer_20/mlp": 0.006563038565218449, "grad/layer_20/attn_mlp_ratio": 0.7420978063626504, "grad/layer_24/attn": 0.013499530032277107, "grad/layer_24/mlp": 0.011957848444581032, "grad/layer_24/attn_mlp_ratio": 1.1289263266672434, "grad/layer_27/attn": 0.006564840208739042, "grad/layer_27/mlp": 0.01060764491558075, "grad/layer_27/attn_mlp_ratio": 0.6188781957820474} {"step": 9050, "timestamp": 1778204316.6256158, "train/loss": 2.3629031658172606, "train/z_loss": 0.0018142487853765488, "train/perplexity": 10.621743416123795, "train/grad_norm": 0.16015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026146.088887904, "perf/iters_per_sec": 0.9661417431296845, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350448131561278, "data/tokens_consumed": 18981322752, "data/tokens_consumed_B": 18.981322752, "train/loss_slope": -1.164103164495462e-05} {"step": 9060, "timestamp": 1778204326.9776638, "train/loss": 2.3521260261535644, "train/z_loss": 0.0018242342048324645, "train/perplexity": 10.507886034320505, "train/grad_norm": 0.193359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026935.1442936272, "perf/iters_per_sec": 0.9665179940670143, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346418857574462, "data/tokens_consumed": 19002294272, "data/tokens_consumed_B": 19.002294272, "train/loss_slope": -9.788819908773989e-06} {"step": 9070, "timestamp": 1778204337.3346634, "train/loss": 2.3732462882995606, "train/z_loss": 0.0018202190403826535, "train/perplexity": 10.732175531196182, "train/grad_norm": 0.1728515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026007.0640489932, "perf/iters_per_sec": 0.9660754509205786, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351158380508423, "data/tokens_consumed": 19023265792, "data/tokens_consumed_B": 19.023265792, "train/loss_slope": -4.921563545076088e-06} {"step": 9075, "timestamp": 1778204343.1313493, "eos/sharpness": 25.195121765136715, "eos/L0_probe": 2.172196865081787, "eos/L_plus": 2.27612566947937, "eos/L_minus": 2.3202192783355713, "eos/grad_norm": 0.19377832114696503, "eos/embed_grad_frac": 0.08914165198802948, "eos/time_s": 0.6298024654388428} {"step": 9075, "timestamp": 1778204344.5085876, "geo/rankme_last": 438.82269287109375, "geo/layer_0/stable_rank_q_proj": 15.201410293579102, "geo/layer_0/stable_rank_k_proj": 12.30717945098877, "geo/layer_0/stable_rank_o_proj": 55.917640686035156, "geo/layer_0/stable_rank_gate_proj": 167.61244201660156, "geo/layer_0/stable_rank_down_proj": 47.89352798461914, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.03439122810959816, "geo/layer_0/attn_entropy_mean": 6.407736301422119, "geo/layer_0/attn_entropy_std": 0.26423609256744385, "geo/layer_7/stable_rank_q_proj": 43.6371955871582, "geo/layer_7/stable_rank_k_proj": 44.29194259643555, "geo/layer_7/stable_rank_o_proj": 105.51354217529297, "geo/layer_7/stable_rank_gate_proj": 130.71347045898438, "geo/layer_7/stable_rank_down_proj": 173.69277954101562, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5906904935836792, "geo/layer_7/attn_entropy_mean": 4.72601318359375, "geo/layer_7/attn_entropy_std": 0.9576837420463562, "geo/layer_14/stable_rank_q_proj": 66.64543151855469, "geo/layer_14/stable_rank_k_proj": 44.343414306640625, "geo/layer_14/stable_rank_o_proj": 47.73019790649414, "geo/layer_14/stable_rank_gate_proj": 116.97001647949219, "geo/layer_14/stable_rank_down_proj": 141.60948181152344, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3930063247680664, "geo/layer_14/attn_entropy_mean": 5.7809648513793945, "geo/layer_14/attn_entropy_std": 0.6090806126594543, "geo/layer_21/stable_rank_q_proj": 52.41584014892578, "geo/layer_21/stable_rank_k_proj": 32.40629577636719, "geo/layer_21/stable_rank_o_proj": 92.593994140625, "geo/layer_21/stable_rank_gate_proj": 113.88162994384766, "geo/layer_21/stable_rank_down_proj": 75.29350280761719, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.16092771291732788, "geo/layer_21/attn_entropy_mean": 5.810318946838379, "geo/layer_21/attn_entropy_std": 0.3049662113189697, "geo/layer_27/stable_rank_q_proj": 43.20461654663086, "geo/layer_27/stable_rank_k_proj": 33.61153793334961, "geo/layer_27/stable_rank_o_proj": 112.78154754638672, "geo/layer_27/stable_rank_gate_proj": 95.15230560302734, "geo/layer_27/stable_rank_down_proj": 156.56539916992188, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.06454890966415405, "geo/layer_27/attn_entropy_mean": 4.547194480895996, "geo/layer_27/attn_entropy_std": 0.5173198580741882, "attnres/final_alpha/block_0": 0.24947145581245422, "attnres/block_norm/0": 1.3698103427886963, "attnres/final_alpha/block_1": 0.010331660509109497, "attnres/block_norm/1": 15683.515625, "attnres/final_alpha/block_2": 0.021210214123129845, "attnres/block_norm/2": 11521.21875, "attnres/final_alpha/block_3": 0.021222993731498718, "attnres/block_norm/3": 12401.7685546875, "attnres/final_alpha/block_4": 0.03171679005026817, "attnres/block_norm/4": 5066.7373046875, "attnres/final_alpha/block_5": 0.4886954426765442, "attnres/block_norm/5": 3723.4677734375, "attnres/final_alpha/block_6": 0.1773514449596405, "attnres/block_norm/6": 9107.30859375, "geo/tier1_time_s": 1.3575949668884277, "geo/step": 9075.0, "geo/rankme_slope": 0.004453524534813925} {"step": 9080, "timestamp": 1778204349.6897774, "train/loss": 2.377497148513794, "train/z_loss": 0.0018171939300373197, "train/perplexity": 10.777893610913912, "train/grad_norm": 0.162109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1698231.7729410133, "perf/iters_per_sec": 0.8097800125794474, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2349032878875732, "data/tokens_consumed": 19044237312, "data/tokens_consumed_B": 19.044237312, "train/loss_slope": -3.737623863952805e-06} {"step": 9090, "timestamp": 1778204360.0450444, "train/loss": 2.3607781648635866, "train/z_loss": 0.0018080253386870027, "train/perplexity": 10.59919616618325, "train/grad_norm": 0.140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026286.719857743, "perf/iters_per_sec": 0.9662088012016978, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349729776382446, "data/tokens_consumed": 19065208832, "data/tokens_consumed_B": 19.065208832, "train/loss_slope": -3.095114320525602e-06} {"step": 9100, "timestamp": 1778204370.4230163, "grad/layer_0/attn": 0.002662002807483077, "grad/layer_0/mlp": 0.0028101850766688585, "grad/layer_0/attn_mlp_ratio": 0.9472695356818321, "grad/layer_4/attn": 0.0014602452283725142, "grad/layer_4/mlp": 0.0028537060134112835, "grad/layer_4/attn_mlp_ratio": 0.511701335154991, "grad/layer_8/attn": 0.008486787788569927, "grad/layer_8/mlp": 0.0044345189817249775, "grad/layer_8/attn_mlp_ratio": 1.913801165845624, "grad/layer_12/attn": 0.005618528928607702, "grad/layer_12/mlp": 0.005873765330761671, "grad/layer_12/attn_mlp_ratio": 0.9565463576707947, "grad/layer_16/attn": 0.006526521872729063, "grad/layer_16/mlp": 0.0048864795826375484, "grad/layer_16/attn_mlp_ratio": 1.3356285703834705, "grad/layer_20/attn": 0.005611482076346874, "grad/layer_20/mlp": 0.006903867237269878, "grad/layer_20/attn_mlp_ratio": 0.8128027092950956, "grad/layer_24/attn": 0.011465947143733501, "grad/layer_24/mlp": 0.012712259776890278, "grad/layer_24/attn_mlp_ratio": 0.9019597817204431, "grad/layer_27/attn": 0.011334842070937157, "grad/layer_27/mlp": 0.011501607485115528, "grad/layer_27/attn_mlp_ratio": 0.985500677801407} {"step": 9100, "timestamp": 1778204370.438853, "train/loss": 2.3256710052490233, "train/z_loss": 0.0018227956257760524, "train/perplexity": 10.233544544595032, "train/grad_norm": 0.16015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019126.6897027395, "perf/iters_per_sec": 0.9627946327699373, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0386430978775025, "data/tokens_consumed": 19086180352, "data/tokens_consumed_B": 19.086180352, "train/loss_slope": -4.994483835304551e-06} {"step": 9110, "timestamp": 1778204380.794947, "train/loss": 2.355351996421814, "train/z_loss": 0.001812874898314476, "train/perplexity": 10.541838898268901, "train/grad_norm": 0.197265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026350.1105272912, "perf/iters_per_sec": 0.9662390282284218, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349406003952026, "data/tokens_consumed": 19107151872, "data/tokens_consumed_B": 19.107151872, "train/loss_slope": -3.091781882598895e-06} {"step": 9120, "timestamp": 1778204391.147886, "train/loss": 2.326295495033264, "train/z_loss": 0.001819723763037473, "train/perplexity": 10.239937284512335, "train/grad_norm": 0.1669921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027230.894849111, "perf/iters_per_sec": 0.9666590189214282, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034490942955017, "data/tokens_consumed": 19128123392, "data/tokens_consumed_B": 19.128123392, "train/loss_slope": -3.614598332029473e-06} {"step": 9130, "timestamp": 1778204401.505428, "train/loss": 2.3654642343521117, "train/z_loss": 0.0018097967724315823, "train/perplexity": 10.648981293119025, "train/grad_norm": 0.2080078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026253.765745197, "perf/iters_per_sec": 0.9661930874563203, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349898099899293, "data/tokens_consumed": 19149094912, "data/tokens_consumed_B": 19.149094912, "train/loss_slope": -9.858535640132447e-07} {"step": 9140, "timestamp": 1778204411.8608031, "train/loss": 2.34712700843811, "train/z_loss": 0.0018193963449448347, "train/perplexity": 10.45548800434265, "train/grad_norm": 0.2421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026197.2421360721, "perf/iters_per_sec": 0.9661661348991738, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350186824798584, "data/tokens_consumed": 19170066432, "data/tokens_consumed_B": 19.170066432, "train/loss_slope": -3.7132218761771087e-06} {"step": 9150, "timestamp": 1778204422.202454, "grad/layer_0/attn": 0.002442573895677924, "grad/layer_0/mlp": 0.0027069703210145235, "grad/layer_0/attn_mlp_ratio": 0.9023275159255311, "grad/layer_4/attn": 0.0015274855541065335, "grad/layer_4/mlp": 0.0028490282129496336, "grad/layer_4/attn_mlp_ratio": 0.5361426375314288, "grad/layer_8/attn": 0.0057224310003221035, "grad/layer_8/mlp": 0.004336235579103231, "grad/layer_8/attn_mlp_ratio": 1.3196771171592658, "grad/layer_12/attn": 0.005212715361267328, "grad/layer_12/mlp": 0.006715796422213316, "grad/layer_12/attn_mlp_ratio": 0.7761872093690804, "grad/layer_16/attn": 0.005329912528395653, "grad/layer_16/mlp": 0.005045730620622635, "grad/layer_16/attn_mlp_ratio": 1.0563212393819443, "grad/layer_20/attn": 0.0050563812255859375, "grad/layer_20/mlp": 0.007138540502637625, "grad/layer_20/attn_mlp_ratio": 0.7083214212884986, "grad/layer_24/attn": 0.01978207193315029, "grad/layer_24/mlp": 0.011442816816270351, "grad/layer_24/attn_mlp_ratio": 1.7287764086326063, "grad/layer_27/attn": 0.010890238918364048, "grad/layer_27/mlp": 0.010773401707410812, "grad/layer_27/attn_mlp_ratio": 1.0108449599339056} {"step": 9150, "timestamp": 1778204422.8150897, "eos/sharpness": 35.88199615478515, "eos/L0_probe": 2.1695761680603027, "eos/L_plus": 2.322706460952759, "eos/L_minus": 2.3752658367156982, "eos/grad_norm": 0.1875060349702835, "eos/embed_grad_frac": 0.07795869559049606, "eos/time_s": 0.6096737384796143} {"step": 9150, "timestamp": 1778204422.834604, "train/loss": 2.356602907180786, "train/z_loss": 0.0018139812280423939, "train/perplexity": 10.55503404922362, "train/grad_norm": 0.1875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1912187.608883161, "perf/iters_per_sec": 0.9118021053710752, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0967292070388794, "data/tokens_consumed": 19191037952, "data/tokens_consumed_B": 19.191037952, "train/loss_slope": -3.3380059984186744e-06} {"step": 9150, "timestamp": 1778204424.1941822, "geo/rankme_last": 438.16387939453125, "geo/layer_0/stable_rank_q_proj": 15.245870590209961, "geo/layer_0/stable_rank_k_proj": 12.348970413208008, "geo/layer_0/stable_rank_o_proj": 55.6666259765625, "geo/layer_0/stable_rank_gate_proj": 167.0072021484375, "geo/layer_0/stable_rank_down_proj": 47.92296600341797, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.03123047761619091, "geo/layer_0/attn_entropy_mean": 6.409246444702148, "geo/layer_0/attn_entropy_std": 0.2640385031700134, "geo/layer_7/stable_rank_q_proj": 43.74080276489258, "geo/layer_7/stable_rank_k_proj": 44.482330322265625, "geo/layer_7/stable_rank_o_proj": 105.4464340209961, "geo/layer_7/stable_rank_gate_proj": 130.69297790527344, "geo/layer_7/stable_rank_down_proj": 172.85494995117188, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5991225242614746, "geo/layer_7/attn_entropy_mean": 4.653748035430908, "geo/layer_7/attn_entropy_std": 0.9329396486282349, "geo/layer_14/stable_rank_q_proj": 66.45265197753906, "geo/layer_14/stable_rank_k_proj": 44.063228607177734, "geo/layer_14/stable_rank_o_proj": 47.8823356628418, "geo/layer_14/stable_rank_gate_proj": 116.2774887084961, "geo/layer_14/stable_rank_down_proj": 141.36065673828125, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3863561153411865, "geo/layer_14/attn_entropy_mean": 5.7277960777282715, "geo/layer_14/attn_entropy_std": 0.6012580394744873, "geo/layer_21/stable_rank_q_proj": 52.30013656616211, "geo/layer_21/stable_rank_k_proj": 32.487308502197266, "geo/layer_21/stable_rank_o_proj": 92.56644439697266, "geo/layer_21/stable_rank_gate_proj": 113.36971282958984, "geo/layer_21/stable_rank_down_proj": 75.12261199951172, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1637008935213089, "geo/layer_21/attn_entropy_mean": 5.803558826446533, "geo/layer_21/attn_entropy_std": 0.3181363642215729, "geo/layer_27/stable_rank_q_proj": 43.2083854675293, "geo/layer_27/stable_rank_k_proj": 33.471405029296875, "geo/layer_27/stable_rank_o_proj": 112.7352066040039, "geo/layer_27/stable_rank_gate_proj": 95.25585174560547, "geo/layer_27/stable_rank_down_proj": 155.70553588867188, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.06450527906417847, "geo/layer_27/attn_entropy_mean": 4.537110328674316, "geo/layer_27/attn_entropy_std": 0.5051129460334778, "attnres/final_alpha/block_0": 0.24840758740901947, "attnres/block_norm/0": 1.3729406595230103, "attnres/final_alpha/block_1": 0.010244494304060936, "attnres/block_norm/1": 15875.4892578125, "attnres/final_alpha/block_2": 0.02104145660996437, "attnres/block_norm/2": 11671.580078125, "attnres/final_alpha/block_3": 0.021746058017015457, "attnres/block_norm/3": 12580.91015625, "attnres/final_alpha/block_4": 0.03137067332863808, "attnres/block_norm/4": 5128.1787109375, "attnres/final_alpha/block_5": 0.48869436979293823, "attnres/block_norm/5": 3754.007080078125, "attnres/final_alpha/block_6": 0.1784953624010086, "attnres/block_norm/6": 9166.30078125, "geo/tier1_time_s": 1.355548620223999, "geo/step": 9150.0, "geo/rankme_slope": 0.004257122946834984} {"step": 9160, "timestamp": 1778204434.5530386, "train/loss": 2.345294404029846, "train/z_loss": 0.0018210251233540475, "train/perplexity": 10.436344777274108, "train/grad_norm": 0.169921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1790216.416519607, "perf/iters_per_sec": 0.8536417086217913, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1714516639709474, "data/tokens_consumed": 19212009472, "data/tokens_consumed_B": 19.212009472, "train/loss_slope": -5.458122046545421e-06} {"step": 9170, "timestamp": 1778204444.905493, "train/loss": 2.36566698551178, "train/z_loss": 0.0018098114058375358, "train/perplexity": 10.651140605319616, "train/grad_norm": 0.2490234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026788.2120442297, "perf/iters_per_sec": 0.9664479313107632, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347168922424317, "data/tokens_consumed": 19232980992, "data/tokens_consumed_B": 19.232980992, "train/loss_slope": -3.1006826998199036e-06} {"step": 9180, "timestamp": 1778204455.2622488, "train/loss": 2.3725290060043336, "train/z_loss": 0.0018189370399340986, "train/perplexity": 10.724480291857784, "train/grad_norm": 0.09716796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026082.710982404, "perf/iters_per_sec": 0.9661115221893329, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03507719039917, "data/tokens_consumed": 19253952512, "data/tokens_consumed_B": 19.253952512, "train/loss_slope": -2.4682114727319703e-06} {"step": 9190, "timestamp": 1778204465.618411, "train/loss": 2.3428160905838014, "train/z_loss": 0.0018109985161572696, "train/perplexity": 10.41051226743529, "train/grad_norm": 0.1142578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026165.0843637984, "perf/iters_per_sec": 0.9661508008784286, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350351095199586, "data/tokens_consumed": 19274924032, "data/tokens_consumed_B": 19.274924032, "train/loss_slope": -2.82926003638959e-06} {"step": 9200, "timestamp": 1778204475.961284, "grad/layer_0/attn": 0.0031714518554508686, "grad/layer_0/mlp": 0.0032651929650455713, "grad/layer_0/attn_mlp_ratio": 0.9712907605378016, "grad/layer_4/attn": 0.00154023221693933, "grad/layer_4/mlp": 0.002891920041292906, "grad/layer_4/attn_mlp_ratio": 0.5325984611216582, "grad/layer_8/attn": 0.0057143764570355415, "grad/layer_8/mlp": 0.00449605006724596, "grad/layer_8/attn_mlp_ratio": 1.2709770230468453, "grad/layer_12/attn": 0.005839442368596792, "grad/layer_12/mlp": 0.006235728971660137, "grad/layer_12/attn_mlp_ratio": 0.9364490184693285, "grad/layer_16/attn": 0.006030583754181862, "grad/layer_16/mlp": 0.00532185286283493, "grad/layer_16/attn_mlp_ratio": 1.1331736889944801, "grad/layer_20/attn": 0.008602781221270561, "grad/layer_20/mlp": 0.00814446248114109, "grad/layer_20/attn_mlp_ratio": 1.0562736504174912, "grad/layer_24/attn": 0.013690337538719177, "grad/layer_24/mlp": 0.01454556081444025, "grad/layer_24/attn_mlp_ratio": 0.9412038228878447, "grad/layer_27/attn": 0.006221345625817776, "grad/layer_27/mlp": 0.012560417875647545, "grad/layer_27/attn_mlp_ratio": 0.4953135825479596} {"step": 9200, "timestamp": 1778204475.976968, "train/loss": 2.370062804222107, "train/z_loss": 0.0018124989466741682, "train/perplexity": 10.69806414660971, "train/grad_norm": 0.1875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025523.4958489412, "perf/iters_per_sec": 0.9658448676342684, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035362958908081, "data/tokens_consumed": 19295895552, "data/tokens_consumed_B": 19.295895552, "train/loss_slope": -3.8051089259050008e-06} {"step": 9210, "timestamp": 1778204486.3356056, "train/loss": 2.3339836835861205, "train/z_loss": 0.0018212418537586928, "train/perplexity": 10.318967262534724, "train/grad_norm": 0.1572265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025670.6179740762, "perf/iters_per_sec": 0.9659150209303266, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352877616882323, "data/tokens_consumed": 19316867072, "data/tokens_consumed_B": 19.316867072, "train/loss_slope": -6.937727890964665e-06} {"step": 9220, "timestamp": 1778204496.694246, "train/loss": 2.3501172780990602, "train/z_loss": 0.0018157344427891075, "train/perplexity": 10.486799524525336, "train/grad_norm": 0.1513671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025694.8760520904, "perf/iters_per_sec": 0.9659265880833103, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035275363922119, "data/tokens_consumed": 19337838592, "data/tokens_consumed_B": 19.337838592, "train/loss_slope": -8.698097471356836e-06} {"step": 9225, "timestamp": 1778204502.470617, "eos/sharpness": 26.372718811035153, "eos/L0_probe": 2.1685521602630615, "eos/L_plus": 2.267256259918213, "eos/L_minus": 2.3335752487182617, "eos/grad_norm": 0.24225787818431854, "eos/embed_grad_frac": 0.07252493500709534, "eos/time_s": 0.6119911670684814} {"step": 9225, "timestamp": 1778204503.8494453, "geo/rankme_last": 438.8689880371094, "geo/layer_0/stable_rank_q_proj": 15.204113960266113, "geo/layer_0/stable_rank_k_proj": 12.368390083312988, "geo/layer_0/stable_rank_o_proj": 55.51939010620117, "geo/layer_0/stable_rank_gate_proj": 165.64874267578125, "geo/layer_0/stable_rank_down_proj": 47.99266815185547, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.037039462476968765, "geo/layer_0/attn_entropy_mean": 6.403750896453857, "geo/layer_0/attn_entropy_std": 0.2671699821949005, "geo/layer_7/stable_rank_q_proj": 43.7077751159668, "geo/layer_7/stable_rank_k_proj": 44.5202522277832, "geo/layer_7/stable_rank_o_proj": 104.97090148925781, "geo/layer_7/stable_rank_gate_proj": 129.94149780273438, "geo/layer_7/stable_rank_down_proj": 172.28981018066406, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5815060138702393, "geo/layer_7/attn_entropy_mean": 4.687377452850342, "geo/layer_7/attn_entropy_std": 0.9601020216941833, "geo/layer_14/stable_rank_q_proj": 66.27059936523438, "geo/layer_14/stable_rank_k_proj": 44.01479721069336, "geo/layer_14/stable_rank_o_proj": 48.04364013671875, "geo/layer_14/stable_rank_gate_proj": 116.15089416503906, "geo/layer_14/stable_rank_down_proj": 141.32276916503906, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39279669523239136, "geo/layer_14/attn_entropy_mean": 5.702178478240967, "geo/layer_14/attn_entropy_std": 0.6128596067428589, "geo/layer_21/stable_rank_q_proj": 52.38930130004883, "geo/layer_21/stable_rank_k_proj": 32.55315017700195, "geo/layer_21/stable_rank_o_proj": 92.52619171142578, "geo/layer_21/stable_rank_gate_proj": 112.50991821289062, "geo/layer_21/stable_rank_down_proj": 75.0999526977539, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.16027402877807617, "geo/layer_21/attn_entropy_mean": 5.7872161865234375, "geo/layer_21/attn_entropy_std": 0.29914799332618713, "geo/layer_27/stable_rank_q_proj": 43.26408767700195, "geo/layer_27/stable_rank_k_proj": 33.574119567871094, "geo/layer_27/stable_rank_o_proj": 112.66844177246094, "geo/layer_27/stable_rank_gate_proj": 95.0467758178711, "geo/layer_27/stable_rank_down_proj": 154.92376708984375, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.06341110169887543, "geo/layer_27/attn_entropy_mean": 4.530197620391846, "geo/layer_27/attn_entropy_std": 0.5148696899414062, "attnres/final_alpha/block_0": 0.24787123501300812, "attnres/block_norm/0": 1.3758494853973389, "attnres/final_alpha/block_1": 0.010272890329360962, "attnres/block_norm/1": 15929.32421875, "attnres/final_alpha/block_2": 0.021346330642700195, "attnres/block_norm/2": 11721.865234375, "attnres/final_alpha/block_3": 0.021715335547924042, "attnres/block_norm/3": 12731.400390625, "attnres/final_alpha/block_4": 0.031226417049765587, "attnres/block_norm/4": 5171.701171875, "attnres/final_alpha/block_5": 0.49152788519859314, "attnres/block_norm/5": 3754.824951171875, "attnres/final_alpha/block_6": 0.1760398894548416, "attnres/block_norm/6": 9322.81640625, "geo/tier1_time_s": 1.3575072288513184, "geo/step": 9225.0, "geo/rankme_slope": 0.004053639971613646} {"step": 9230, "timestamp": 1778204509.0287552, "train/loss": 2.366754102706909, "train/z_loss": 0.0018120001535862685, "train/perplexity": 10.66272593958646, "train/grad_norm": 0.1865234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1700949.2867812905, "perf/iters_per_sec": 0.8110758241564229, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.23293035030365, "data/tokens_consumed": 19358810112, "data/tokens_consumed_B": 19.358810112, "train/loss_slope": -8.28022449681116e-06} {"step": 9240, "timestamp": 1778204519.379651, "train/loss": 2.3222792863845827, "train/z_loss": 0.0018170782248489558, "train/perplexity": 10.198894034125628, "train/grad_norm": 0.19140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027012.682293365, "perf/iters_per_sec": 0.966554967066462, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346023082733153, "data/tokens_consumed": 19379781632, "data/tokens_consumed_B": 19.379781632, "train/loss_slope": -6.671419960580553e-06} {"step": 9250, "timestamp": 1778204529.7249882, "grad/layer_0/attn": 0.002959966193884611, "grad/layer_0/mlp": 0.003160062013193965, "grad/layer_0/attn_mlp_ratio": 0.9366797511751719, "grad/layer_4/attn": 0.0018685676623135805, "grad/layer_4/mlp": 0.0027501536533236504, "grad/layer_4/attn_mlp_ratio": 0.6794411621733351, "grad/layer_8/attn": 0.005053598899394274, "grad/layer_8/mlp": 0.004242125432938337, "grad/layer_8/attn_mlp_ratio": 1.1912893336501205, "grad/layer_12/attn": 0.004418021067976952, "grad/layer_12/mlp": 0.005706404335796833, "grad/layer_12/attn_mlp_ratio": 0.7742215115813162, "grad/layer_16/attn": 0.0058800033293664455, "grad/layer_16/mlp": 0.004958281293511391, "grad/layer_16/attn_mlp_ratio": 1.185895446971053, "grad/layer_20/attn": 0.007186598144471645, "grad/layer_20/mlp": 0.007369770668447018, "grad/layer_20/attn_mlp_ratio": 0.9751454109319644, "grad/layer_24/attn": 0.012890989892184734, "grad/layer_24/mlp": 0.012200091034173965, "grad/layer_24/attn_mlp_ratio": 1.0566306227070286, "grad/layer_27/attn": 0.005636157467961311, "grad/layer_27/mlp": 0.01146349310874939, "grad/layer_27/attn_mlp_ratio": 0.49166143036221926} {"step": 9250, "timestamp": 1778204529.740879, "train/loss": 2.26573703289032, "train/z_loss": 0.001835872430820018, "train/perplexity": 9.638225673252483, "train/grad_norm": 0.1943359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025345.009523695, "perf/iters_per_sec": 0.9657597587221599, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354542016983033, "data/tokens_consumed": 19400753152, "data/tokens_consumed_B": 19.400753152, "train/loss_slope": -1.47415881968103e-05} {"step": 9260, "timestamp": 1778204540.0971394, "train/loss": 2.2833072900772096, "train/z_loss": 0.001831843890249729, "train/perplexity": 9.80906825611524, "train/grad_norm": 0.1376953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026275.8439485552, "perf/iters_per_sec": 0.9662036151640678, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349785327911376, "data/tokens_consumed": 19421724672, "data/tokens_consumed_B": 19.421724672, "train/loss_slope": -1.85652364813241e-05} {"step": 9270, "timestamp": 1778204550.4572237, "train/loss": 2.371471571922302, "train/z_loss": 0.001811276248190552, "train/perplexity": 10.713145854651348, "train/grad_norm": 0.1767578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025490.2400675635, "perf/iters_per_sec": 0.9658290100419824, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035379958152771, "data/tokens_consumed": 19442696192, "data/tokens_consumed_B": 19.442696192, "train/loss_slope": -1.592752746086456e-05} {"step": 9280, "timestamp": 1778204560.809993, "train/loss": 2.3297538280487062, "train/z_loss": 0.0018291283398866654, "train/perplexity": 10.275411703520286, "train/grad_norm": 0.1953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026785.6434880742, "perf/iters_per_sec": 0.9664467065277453, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347182035446167, "data/tokens_consumed": 19463667712, "data/tokens_consumed_B": 19.463667712, "train/loss_slope": -1.7640397808339354e-05} {"step": 9290, "timestamp": 1778204571.1641707, "train/loss": 2.3734295129776, "train/z_loss": 0.0018181600142270326, "train/perplexity": 10.734142110760002, "train/grad_norm": 0.1787109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026518.6425666476, "perf/iters_per_sec": 0.9663193905671347, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034854531288147, "data/tokens_consumed": 19484639232, "data/tokens_consumed_B": 19.484639232, "train/loss_slope": -1.7634413609302983e-05} {"step": 9300, "timestamp": 1778204581.5109155, "grad/layer_0/attn": 0.0026443174574524164, "grad/layer_0/mlp": 0.0029742131009697914, "grad/layer_0/attn_mlp_ratio": 0.8890813397607787, "grad/layer_4/attn": 0.0027985922060906887, "grad/layer_4/mlp": 0.0029518799856305122, "grad/layer_4/attn_mlp_ratio": 0.9480711021135259, "grad/layer_8/attn": 0.009200328961014748, "grad/layer_8/mlp": 0.00454360619187355, "grad/layer_8/attn_mlp_ratio": 2.0248957259941207, "grad/layer_12/attn": 0.005267948843538761, "grad/layer_12/mlp": 0.006853281520307064, "grad/layer_12/attn_mlp_ratio": 0.7686753785119849, "grad/layer_16/attn": 0.005716471932828426, "grad/layer_16/mlp": 0.005134430713951588, "grad/layer_16/attn_mlp_ratio": 1.1133603976696467, "grad/layer_20/attn": 0.005536897573620081, "grad/layer_20/mlp": 0.007388805504888296, "grad/layer_20/attn_mlp_ratio": 0.7493630052950605, "grad/layer_24/attn": 0.005947122350335121, "grad/layer_24/mlp": 0.009438209235668182, "grad/layer_24/attn_mlp_ratio": 0.6301112995936842, "grad/layer_27/attn": 0.006918805651366711, "grad/layer_27/mlp": 0.008486464619636536, "grad/layer_27/attn_mlp_ratio": 0.8152753684767613} {"step": 9300, "timestamp": 1778204582.1233282, "eos/sharpness": 8.148860931396483, "eos/L0_probe": 2.161874532699585, "eos/L_plus": 2.2065136432647705, "eos/L_minus": 2.1987240314483643, "eos/grad_norm": 0.12148481607437134, "eos/embed_grad_frac": 0.22630099952220917, "eos/time_s": 0.6096911430358887} {"step": 9300, "timestamp": 1778204582.1439042, "train/loss": 2.3584075450897215, "train/z_loss": 0.0018165373359806836, "train/perplexity": 10.574099261527216, "train/grad_norm": 0.12158203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1911322.0737912015, "perf/iters_per_sec": 0.9113893860775001, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0972258567810058, "data/tokens_consumed": 19505610752, "data/tokens_consumed_B": 19.505610752, "train/loss_slope": -1.50293335984714e-05} {"step": 9300, "timestamp": 1778204583.5050123, "geo/rankme_last": 439.7076721191406, "geo/layer_0/stable_rank_q_proj": 15.143289566040039, "geo/layer_0/stable_rank_k_proj": 12.380374908447266, "geo/layer_0/stable_rank_o_proj": 55.69161605834961, "geo/layer_0/stable_rank_gate_proj": 165.49163818359375, "geo/layer_0/stable_rank_down_proj": 47.90290451049805, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.028399642556905746, "geo/layer_0/attn_entropy_mean": 6.403632164001465, "geo/layer_0/attn_entropy_std": 0.2638040781021118, "geo/layer_7/stable_rank_q_proj": 43.838340759277344, "geo/layer_7/stable_rank_k_proj": 44.44708251953125, "geo/layer_7/stable_rank_o_proj": 105.54417419433594, "geo/layer_7/stable_rank_gate_proj": 129.1373748779297, "geo/layer_7/stable_rank_down_proj": 172.27386474609375, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.6018886566162109, "geo/layer_7/attn_entropy_mean": 4.689870834350586, "geo/layer_7/attn_entropy_std": 0.9540032744407654, "geo/layer_14/stable_rank_q_proj": 66.2852783203125, "geo/layer_14/stable_rank_k_proj": 43.59458541870117, "geo/layer_14/stable_rank_o_proj": 48.209449768066406, "geo/layer_14/stable_rank_gate_proj": 115.62505340576172, "geo/layer_14/stable_rank_down_proj": 141.50466918945312, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3855712115764618, "geo/layer_14/attn_entropy_mean": 5.729457855224609, "geo/layer_14/attn_entropy_std": 0.5840480327606201, "geo/layer_21/stable_rank_q_proj": 52.6083869934082, "geo/layer_21/stable_rank_k_proj": 32.47234344482422, "geo/layer_21/stable_rank_o_proj": 92.61176300048828, "geo/layer_21/stable_rank_gate_proj": 112.2754898071289, "geo/layer_21/stable_rank_down_proj": 74.83655548095703, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15688443183898926, "geo/layer_21/attn_entropy_mean": 5.818273544311523, "geo/layer_21/attn_entropy_std": 0.31314617395401, "geo/layer_27/stable_rank_q_proj": 43.21419906616211, "geo/layer_27/stable_rank_k_proj": 33.478919982910156, "geo/layer_27/stable_rank_o_proj": 112.39291381835938, "geo/layer_27/stable_rank_gate_proj": 94.85987091064453, "geo/layer_27/stable_rank_down_proj": 155.09942626953125, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.056741002947092056, "geo/layer_27/attn_entropy_mean": 4.522947311401367, "geo/layer_27/attn_entropy_std": 0.49592316150665283, "attnres/final_alpha/block_0": 0.24818867444992065, "attnres/block_norm/0": 1.37887442111969, "attnres/final_alpha/block_1": 0.010277906432747841, "attnres/block_norm/1": 16031.1220703125, "attnres/final_alpha/block_2": 0.021030496805906296, "attnres/block_norm/2": 11840.0009765625, "attnres/final_alpha/block_3": 0.0212867334485054, "attnres/block_norm/3": 12720.6171875, "attnres/final_alpha/block_4": 0.031244264915585518, "attnres/block_norm/4": 5226.55419921875, "attnres/final_alpha/block_5": 0.49135640263557434, "attnres/block_norm/5": 3769.90185546875, "attnres/final_alpha/block_6": 0.17661553621292114, "attnres/block_norm/6": 9380.400390625, "geo/tier1_time_s": 1.3571152687072754, "geo/step": 9300.0, "geo/rankme_slope": 0.003855189380439676} {"step": 9310, "timestamp": 1778204593.8621573, "train/loss": 2.380342149734497, "train/z_loss": 0.0018101957975886762, "train/perplexity": 10.808600391095316, "train/grad_norm": 0.244140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1790253.9455709127, "perf/iters_per_sec": 0.853659603867966, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1714271068573, "data/tokens_consumed": 19526582272, "data/tokens_consumed_B": 19.526582272, "train/loss_slope": -1.1412230755450887e-05} {"step": 9320, "timestamp": 1778204604.2282376, "train/loss": 2.352838563919067, "train/z_loss": 0.0018130958313122391, "train/perplexity": 10.515375968068994, "train/grad_norm": 0.1953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024147.4421195684, "perf/iters_per_sec": 0.9651887140844194, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0360668182373047, "data/tokens_consumed": 19547553792, "data/tokens_consumed_B": 19.547553792, "train/loss_slope": -1.1751489935427736e-05} {"step": 9330, "timestamp": 1778204614.58308, "train/loss": 2.339998745918274, "train/z_loss": 0.0018111809738911688, "train/perplexity": 10.38122354382094, "train/grad_norm": 0.181640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026592.086487658, "perf/iters_per_sec": 0.9663544113577166, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348170280456543, "data/tokens_consumed": 19568525312, "data/tokens_consumed_B": 19.568525312, "train/loss_slope": -1.2355309000538904e-05} {"step": 9340, "timestamp": 1778204624.9372542, "train/loss": 2.3313358545303347, "train/z_loss": 0.001815814198926091, "train/perplexity": 10.291680542418723, "train/grad_norm": 0.2431640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026447.258122832, "perf/iters_per_sec": 0.9662853518118057, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348909854888917, "data/tokens_consumed": 19589496832, "data/tokens_consumed_B": 19.589496832, "train/loss_slope": -1.3863456932374624e-05} {"step": 9350, "timestamp": 1778204635.2805629, "grad/layer_0/attn": 0.0027750979643315077, "grad/layer_0/mlp": 0.002880865940824151, "grad/layer_0/attn_mlp_ratio": 0.9632860136522052, "grad/layer_4/attn": 0.001468187547288835, "grad/layer_4/mlp": 0.0027917707338929176, "grad/layer_4/attn_mlp_ratio": 0.5258983042105775, "grad/layer_8/attn": 0.0046208640560507774, "grad/layer_8/mlp": 0.0043138060718774796, "grad/layer_8/attn_mlp_ratio": 1.0711802691032033, "grad/layer_12/attn": 0.004647628404200077, "grad/layer_12/mlp": 0.005971862003207207, "grad/layer_12/attn_mlp_ratio": 0.7782544747146881, "grad/layer_16/attn": 0.005348819773644209, "grad/layer_16/mlp": 0.004529035184532404, "grad/layer_16/attn_mlp_ratio": 1.1810064257859814, "grad/layer_20/attn": 0.004778421483933926, "grad/layer_20/mlp": 0.006604847498238087, "grad/layer_20/attn_mlp_ratio": 0.7234718762032633, "grad/layer_24/attn": 0.008525257930159569, "grad/layer_24/mlp": 0.008839559741318226, "grad/layer_24/attn_mlp_ratio": 0.9644437147549435, "grad/layer_27/attn": 0.006802704185247421, "grad/layer_27/mlp": 0.008334876969456673, "grad/layer_27/attn_mlp_ratio": 0.8161733074835704} {"step": 9350, "timestamp": 1778204635.2963896, "train/loss": 2.3898999452590943, "train/z_loss": 0.0018003799486905336, "train/perplexity": 10.912402050858534, "train/grad_norm": 0.1171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025624.9957012343, "perf/iters_per_sec": 0.9658932665353939, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0353110790252686, "data/tokens_consumed": 19610468352, "data/tokens_consumed_B": 19.610468352, "train/loss_slope": -9.50941064498678e-06} {"step": 9360, "timestamp": 1778204645.6545212, "train/loss": 2.3605181694030763, "train/z_loss": 0.0018106038100086153, "train/perplexity": 10.59644078150426, "train/grad_norm": 0.1943359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026004.2174825731, "perf/iters_per_sec": 0.9660740935719362, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351172924041747, "data/tokens_consumed": 19631439872, "data/tokens_consumed_B": 19.631439872, "train/loss_slope": -9.345074720007851e-06} {"step": 9370, "timestamp": 1778204656.0138307, "train/loss": 2.394230103492737, "train/z_loss": 0.001796021347399801, "train/perplexity": 10.959756931518026, "train/grad_norm": 0.33984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025437.2103162652, "perf/iters_per_sec": 0.9658037234860731, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354070663452148, "data/tokens_consumed": 19652411392, "data/tokens_consumed_B": 19.652411392, "train/loss_slope": -7.654858243526204e-06} {"step": 9375, "timestamp": 1778204661.7969885, "eos/sharpness": 34.27541255950927, "eos/L0_probe": 2.1665666103363037, "eos/L_plus": 2.4126205444335938, "eos/L_minus": 2.2632668018341064, "eos/grad_norm": 0.17793813347816467, "eos/embed_grad_frac": 0.10620832443237305, "eos/time_s": 0.6135637760162354} {"step": 9375, "timestamp": 1778204663.175298, "geo/rankme_last": 439.56597900390625, "geo/layer_0/stable_rank_q_proj": 15.126483917236328, "geo/layer_0/stable_rank_k_proj": 12.4058837890625, "geo/layer_0/stable_rank_o_proj": 55.694244384765625, "geo/layer_0/stable_rank_gate_proj": 166.27792358398438, "geo/layer_0/stable_rank_down_proj": 47.957664489746094, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0374503992497921, "geo/layer_0/attn_entropy_mean": 6.397799491882324, "geo/layer_0/attn_entropy_std": 0.26277458667755127, "geo/layer_7/stable_rank_q_proj": 43.75977325439453, "geo/layer_7/stable_rank_k_proj": 44.27840805053711, "geo/layer_7/stable_rank_o_proj": 105.78208923339844, "geo/layer_7/stable_rank_gate_proj": 129.08702087402344, "geo/layer_7/stable_rank_down_proj": 171.2930908203125, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5821303725242615, "geo/layer_7/attn_entropy_mean": 4.6829657554626465, "geo/layer_7/attn_entropy_std": 0.895172119140625, "geo/layer_14/stable_rank_q_proj": 65.85472869873047, "geo/layer_14/stable_rank_k_proj": 43.58165740966797, "geo/layer_14/stable_rank_o_proj": 48.2957878112793, "geo/layer_14/stable_rank_gate_proj": 116.01101684570312, "geo/layer_14/stable_rank_down_proj": 141.23155212402344, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3922305405139923, "geo/layer_14/attn_entropy_mean": 5.745944023132324, "geo/layer_14/attn_entropy_std": 0.60167396068573, "geo/layer_21/stable_rank_q_proj": 52.4422492980957, "geo/layer_21/stable_rank_k_proj": 32.410179138183594, "geo/layer_21/stable_rank_o_proj": 92.39201354980469, "geo/layer_21/stable_rank_gate_proj": 111.55484771728516, "geo/layer_21/stable_rank_down_proj": 74.54572296142578, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.16067679226398468, "geo/layer_21/attn_entropy_mean": 5.78704309463501, "geo/layer_21/attn_entropy_std": 0.3110875189304352, "geo/layer_27/stable_rank_q_proj": 43.25302505493164, "geo/layer_27/stable_rank_k_proj": 33.35243225097656, "geo/layer_27/stable_rank_o_proj": 112.52859497070312, "geo/layer_27/stable_rank_gate_proj": 94.96841430664062, "geo/layer_27/stable_rank_down_proj": 155.36068725585938, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.06256145238876343, "geo/layer_27/attn_entropy_mean": 4.5317792892456055, "geo/layer_27/attn_entropy_std": 0.50898677110672, "attnres/final_alpha/block_0": 0.24868518114089966, "attnres/block_norm/0": 1.3818097114562988, "attnres/final_alpha/block_1": 0.010316136293113232, "attnres/block_norm/1": 16178.57421875, "attnres/final_alpha/block_2": 0.02080090343952179, "attnres/block_norm/2": 11941.7060546875, "attnres/final_alpha/block_3": 0.021832019090652466, "attnres/block_norm/3": 12874.458984375, "attnres/final_alpha/block_4": 0.03110259398818016, "attnres/block_norm/4": 5226.318359375, "attnres/final_alpha/block_5": 0.49163568019866943, "attnres/block_norm/5": 3774.09423828125, "attnres/final_alpha/block_6": 0.1756274700164795, "attnres/block_norm/6": 9415.9208984375, "geo/tier1_time_s": 1.3567097187042236, "geo/step": 9375.0, "geo/rankme_slope": 0.003738614058904812} {"step": 9380, "timestamp": 1778204668.355798, "train/loss": 2.380289649963379, "train/z_loss": 0.0017958503914996982, "train/perplexity": 10.80803295694389, "train/grad_norm": 0.1826171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1699903.859086495, "perf/iters_per_sec": 0.8105773253853297, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2336885929107666, "data/tokens_consumed": 19673382912, "data/tokens_consumed_B": 19.673382912, "train/loss_slope": -7.043300882460187e-06} {"step": 9390, "timestamp": 1778204678.71427, "train/loss": 2.358781576156616, "train/z_loss": 0.0018092641839757562, "train/perplexity": 10.578055042901903, "train/grad_norm": 0.2470703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025488.8408300153, "perf/iters_per_sec": 0.9658283428335263, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0353806734085083, "data/tokens_consumed": 19694354432, "data/tokens_consumed_B": 19.694354432, "train/loss_slope": -6.460264892932898e-06} {"step": 9400, "timestamp": 1778204689.058186, "grad/layer_0/attn": 0.0025860911700874567, "grad/layer_0/mlp": 0.002831171965226531, "grad/layer_0/attn_mlp_ratio": 0.9134348285823932, "grad/layer_4/attn": 0.0016053932486101985, "grad/layer_4/mlp": 0.002815110841766, "grad/layer_4/attn_mlp_ratio": 0.5702770803068561, "grad/layer_8/attn": 0.00550506729632616, "grad/layer_8/mlp": 0.004112895578145981, "grad/layer_8/attn_mlp_ratio": 1.3384894067645658, "grad/layer_12/attn": 0.004663947504013777, "grad/layer_12/mlp": 0.0057919761165976524, "grad/layer_12/attn_mlp_ratio": 0.805242861779825, "grad/layer_16/attn": 0.00601979112252593, "grad/layer_16/mlp": 0.004517500288784504, "grad/layer_16/attn_mlp_ratio": 1.332549110005863, "grad/layer_20/attn": 0.007142184302210808, "grad/layer_20/mlp": 0.0071961525827646255, "grad/layer_20/attn_mlp_ratio": 0.9925003841729101, "grad/layer_24/attn": 0.007364348974078894, "grad/layer_24/mlp": 0.010759891010820866, "grad/layer_24/attn_mlp_ratio": 0.6844259758979171, "grad/layer_27/attn": 0.005858030170202255, "grad/layer_27/mlp": 0.009831346571445465, "grad/layer_27/attn_mlp_ratio": 0.5958522637815773} {"step": 9400, "timestamp": 1778204689.0741205, "train/loss": 2.3174242258071898, "train/z_loss": 0.0018218812183476985, "train/perplexity": 10.14949779366645, "train/grad_norm": 0.11376953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025315.8633209, "perf/iters_per_sec": 0.9657458607296466, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354691028594971, "data/tokens_consumed": 19715325952, "data/tokens_consumed_B": 19.715325952, "train/loss_slope": -4.912515728100959e-06} {"step": 9410, "timestamp": 1778204699.4304876, "train/loss": 2.3293983936309814, "train/z_loss": 0.0018026305129751562, "train/perplexity": 10.271760117532676, "train/grad_norm": 0.1630859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026135.07447534, "perf/iters_per_sec": 0.9661364910484982, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350504398345948, "data/tokens_consumed": 19736297472, "data/tokens_consumed_B": 19.736297472, "train/loss_slope": -4.302672920661961e-06} {"step": 9420, "timestamp": 1778204709.7950702, "train/loss": 2.3227251291275026, "train/z_loss": 0.0018186562578193844, "train/perplexity": 10.203442150813624, "train/grad_norm": 0.17578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024866.1854555695, "perf/iters_per_sec": 0.9655314376142357, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0356990575790406, "data/tokens_consumed": 19757268992, "data/tokens_consumed_B": 19.757268992, "train/loss_slope": -5.998826026916474e-06} {"step": 9430, "timestamp": 1778204720.1492712, "train/loss": 2.3572256088256838, "train/z_loss": 0.0018052503233775496, "train/perplexity": 10.561608733108809, "train/grad_norm": 0.1640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026706.7219358522, "perf/iters_per_sec": 0.9664090737990628, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034758496284485, "data/tokens_consumed": 19778240512, "data/tokens_consumed_B": 19.778240512, "train/loss_slope": -3.890977555339875e-06} {"step": 9440, "timestamp": 1778204730.5187368, "train/loss": 2.3102620601654054, "train/z_loss": 0.0018304469413124025, "train/perplexity": 10.077065106368492, "train/grad_norm": 0.1640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023693.627024095, "perf/iters_per_sec": 0.9649723181839442, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0362991571426392, "data/tokens_consumed": 19799212032, "data/tokens_consumed_B": 19.799212032, "train/loss_slope": -5.029047933956705e-06} {"step": 9450, "timestamp": 1778204740.8584363, "grad/layer_0/attn": 0.002833136124536395, "grad/layer_0/mlp": 0.00274076615460217, "grad/layer_0/attn_mlp_ratio": 1.0337022063735357, "grad/layer_4/attn": 0.0015366212464869022, "grad/layer_4/mlp": 0.002765175187960267, "grad/layer_4/attn_mlp_ratio": 0.5557048239138546, "grad/layer_8/attn": 0.006880700588226318, "grad/layer_8/mlp": 0.004262118600308895, "grad/layer_8/attn_mlp_ratio": 1.6143850211697865, "grad/layer_12/attn": 0.0037186264526098967, "grad/layer_12/mlp": 0.005865912418812513, "grad/layer_12/attn_mlp_ratio": 0.6339382731474301, "grad/layer_16/attn": 0.005326802842319012, "grad/layer_16/mlp": 0.004570632241666317, "grad/layer_16/attn_mlp_ratio": 1.1654411127666895, "grad/layer_20/attn": 0.008124101907014847, "grad/layer_20/mlp": 0.006318313535302877, "grad/layer_20/attn_mlp_ratio": 1.2858022529338093, "grad/layer_24/attn": 0.014578561298549175, "grad/layer_24/mlp": 0.012520665302872658, "grad/layer_24/attn_mlp_ratio": 1.1643599464932886, "grad/layer_27/attn": 0.011438784189522266, "grad/layer_27/mlp": 0.010211179964244366, "grad/layer_27/attn_mlp_ratio": 1.1202215725855722} {"step": 9450, "timestamp": 1778204741.4766464, "eos/sharpness": 42.818212509155266, "eos/L0_probe": 2.1602957248687744, "eos/L_plus": 2.416559934616089, "eos/L_minus": 2.3322136402130127, "eos/grad_norm": 0.18387192487716675, "eos/embed_grad_frac": 0.0859663262963295, "eos/time_s": 0.6154336929321289} {"step": 9450, "timestamp": 1778204741.4972277, "train/loss": 2.283707928657532, "train/z_loss": 0.0018219811259768902, "train/perplexity": 9.812998934633807, "train/grad_norm": 0.18359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1911180.4617646493, "perf/iters_per_sec": 0.9113218602011915, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0973071575164794, "data/tokens_consumed": 19820183552, "data/tokens_consumed_B": 19.820183552, "train/loss_slope": -1.1159028808096953e-05} {"step": 9450, "timestamp": 1778204742.8625953, "geo/rankme_last": 439.30279541015625, "geo/layer_0/stable_rank_q_proj": 15.152231216430664, "geo/layer_0/stable_rank_k_proj": 12.43426513671875, "geo/layer_0/stable_rank_o_proj": 55.850589752197266, "geo/layer_0/stable_rank_gate_proj": 167.38278198242188, "geo/layer_0/stable_rank_down_proj": 47.79597473144531, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0299790371209383, "geo/layer_0/attn_entropy_mean": 6.393896102905273, "geo/layer_0/attn_entropy_std": 0.2634684145450592, "geo/layer_7/stable_rank_q_proj": 43.7012939453125, "geo/layer_7/stable_rank_k_proj": 44.27030563354492, "geo/layer_7/stable_rank_o_proj": 105.8313980102539, "geo/layer_7/stable_rank_gate_proj": 128.7808074951172, "geo/layer_7/stable_rank_down_proj": 170.9217987060547, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5924519300460815, "geo/layer_7/attn_entropy_mean": 4.685626029968262, "geo/layer_7/attn_entropy_std": 0.9263136982917786, "geo/layer_14/stable_rank_q_proj": 65.46619415283203, "geo/layer_14/stable_rank_k_proj": 43.45729064941406, "geo/layer_14/stable_rank_o_proj": 48.22365951538086, "geo/layer_14/stable_rank_gate_proj": 115.38391876220703, "geo/layer_14/stable_rank_down_proj": 141.4967498779297, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39825010299682617, "geo/layer_14/attn_entropy_mean": 5.707108497619629, "geo/layer_14/attn_entropy_std": 0.6009805202484131, "geo/layer_21/stable_rank_q_proj": 52.430702209472656, "geo/layer_21/stable_rank_k_proj": 32.381282806396484, "geo/layer_21/stable_rank_o_proj": 92.48299407958984, "geo/layer_21/stable_rank_gate_proj": 111.25567626953125, "geo/layer_21/stable_rank_down_proj": 74.38896179199219, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.16429755091667175, "geo/layer_21/attn_entropy_mean": 5.802196502685547, "geo/layer_21/attn_entropy_std": 0.30973342061042786, "geo/layer_27/stable_rank_q_proj": 43.25403594970703, "geo/layer_27/stable_rank_k_proj": 33.33562469482422, "geo/layer_27/stable_rank_o_proj": 112.59902954101562, "geo/layer_27/stable_rank_gate_proj": 95.10452270507812, "geo/layer_27/stable_rank_down_proj": 155.23910522460938, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.06424395740032196, "geo/layer_27/attn_entropy_mean": 4.531989097595215, "geo/layer_27/attn_entropy_std": 0.5177326202392578, "attnres/final_alpha/block_0": 0.24665695428848267, "attnres/block_norm/0": 1.3848992586135864, "attnres/final_alpha/block_1": 0.009898161515593529, "attnres/block_norm/1": 16283.9140625, "attnres/final_alpha/block_2": 0.02090434916317463, "attnres/block_norm/2": 12032.2998046875, "attnres/final_alpha/block_3": 0.021154366433620453, "attnres/block_norm/3": 13067.490234375, "attnres/final_alpha/block_4": 0.03098812885582447, "attnres/block_norm/4": 5273.0244140625, "attnres/final_alpha/block_5": 0.4991705119609833, "attnres/block_norm/5": 3768.10595703125, "attnres/final_alpha/block_6": 0.17122754454612732, "attnres/block_norm/6": 9524.84375, "geo/tier1_time_s": 1.3616304397583008, "geo/step": 9450.0, "geo/rankme_slope": 0.003537141946622399} {"step": 9460, "timestamp": 1778204753.2248454, "train/loss": 2.325323534011841, "train/z_loss": 0.0018151055788621306, "train/perplexity": 10.229989299919765, "train/grad_norm": 0.14453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1788763.4311576884, "perf/iters_per_sec": 0.8529488712109033, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1724032163619995, "data/tokens_consumed": 19841155072, "data/tokens_consumed_B": 19.841155072, "train/loss_slope": -1.3606241748671239e-05} {"step": 9470, "timestamp": 1778204763.586412, "train/loss": 2.339122462272644, "train/z_loss": 0.0018083591712638736, "train/perplexity": 10.372130631974683, "train/grad_norm": 0.2236328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025165.5754981502, "perf/iters_per_sec": 0.9656741979113341, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0355459451675415, "data/tokens_consumed": 19862126592, "data/tokens_consumed_B": 19.862126592, "train/loss_slope": -1.5735767626597775e-05} {"step": 9480, "timestamp": 1778204773.9406464, "train/loss": 2.3148327350616453, "train/z_loss": 0.0018234735704027115, "train/perplexity": 10.123229515762878, "train/grad_norm": 0.1630859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026183.0533850281, "perf/iters_per_sec": 0.9661593691754475, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350259304046632, "data/tokens_consumed": 19883098112, "data/tokens_consumed_B": 19.883098112, "train/loss_slope": -2.070254196535522e-05} {"step": 9490, "timestamp": 1778204784.293585, "train/loss": 2.3330987691879272, "train/z_loss": 0.0018059341586194933, "train/perplexity": 10.30983989889299, "train/grad_norm": 0.1787109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026903.2900394197, "perf/iters_per_sec": 0.9665028047749613, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034658145904541, "data/tokens_consumed": 19904069632, "data/tokens_consumed_B": 19.904069632, "train/loss_slope": -2.0648030093078826e-05} {"step": 9500, "timestamp": 1778204795.192977, "grad/layer_0/attn": 0.0026196211110800505, "grad/layer_0/mlp": 0.0027763659600168467, "grad/layer_0/attn_mlp_ratio": 0.9435431259609026, "grad/layer_4/attn": 0.0014668735675513744, "grad/layer_4/mlp": 0.002784630749374628, "grad/layer_4/attn_mlp_ratio": 0.5267748749823714, "grad/layer_8/attn": 0.004706276580691338, "grad/layer_8/mlp": 0.004152392968535423, "grad/layer_8/attn_mlp_ratio": 1.1333889887142292, "grad/layer_12/attn": 0.0040412708185613155, "grad/layer_12/mlp": 0.005695686209946871, "grad/layer_12/attn_mlp_ratio": 0.7095318454430495, "grad/layer_16/attn": 0.004929042421281338, "grad/layer_16/mlp": 0.004524200223386288, "grad/layer_16/attn_mlp_ratio": 1.0894836808623083, "grad/layer_20/attn": 0.006020639557391405, "grad/layer_20/mlp": 0.0058659096248447895, "grad/layer_20/attn_mlp_ratio": 1.0263778066497111, "grad/layer_24/attn": 0.009042437188327312, "grad/layer_24/mlp": 0.009646459482610226, "grad/layer_24/attn_mlp_ratio": 0.9373840330630949, "grad/layer_27/attn": 0.004185211379081011, "grad/layer_27/mlp": 0.009367687627673149, "grad/layer_27/attn_mlp_ratio": 0.44677101764584337} {"step": 9500, "timestamp": 1778204795.2095668, "train/loss": 2.350735807418823, "train/z_loss": 0.0018103322363458574, "train/perplexity": 10.493287923927504, "train/grad_norm": 0.1259765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1922634.1641631995, "perf/iters_per_sec": 0.9167834111038206, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.090770173072815, "data/tokens_consumed": 19925041152, "data/tokens_consumed_B": 19.925041152, "train/loss_slope": -2.1768619575695005e-05} {"step": 9500, "timestamp": 1778204802.2938673, "geo/ww_alpha_mean": 8.83776932015715, "geo/ww_alpha_std": 5.638724311837487, "geo/ww_alpha_min": 2.735724744726341, "geo/ww_alpha_max": 38.93656257164557, "geo/ww_alpha_healthy_frac": 0.1319796954314721, "geo/ww_alpha_by_type/q_proj": 4.424697222724114, "geo/ww_alpha_by_type/k_proj": 5.403355183500387, "geo/ww_alpha_by_type/v_proj": 8.716174579680333, "geo/ww_alpha_by_type/o_proj": 10.12031098529027, "geo/ww_alpha_by_type/gate_proj": 10.745978167609513, "geo/ww_alpha_by_type/up_proj": 13.290852327199179, "geo/ww_alpha_by_type/down_proj": 9.33684483311548, "geo/twonn_id/layer_0": 0.6691839694976807, "geo/twonn_id/layer_7": 2.838104248046875, "geo/twonn_id/layer_14": 3.7273173332214355, "geo/twonn_id/layer_21": 7.554349899291992, "geo/twonn_id/layer_27": 5.9935688972473145, "geo/tier2_time_s": 7.075140953063965} {"step": 9500, "timestamp": 1778204802.9377942, "eoc/jacobian_sigma/layer_0/attn": 539.138427734375, "eoc/jacobian_sigma/layer_0/mlp": 2752.44921875, "eoc/jacobian_sigma/layer_0": 2752.44921875, "eoc/jacobian_sigma/layer_7/attn": 1.1423566341400146, "eoc/jacobian_sigma/layer_7/mlp": 1.583417534828186, "eoc/jacobian_sigma/layer_7": 1.583417534828186, "eoc/jacobian_sigma/layer_14/attn": 1.306686282157898, "eoc/jacobian_sigma/layer_14/mlp": 7.524385452270508, "eoc/jacobian_sigma/layer_14": 7.524385452270508, "eoc/jacobian_sigma/layer_21/attn": 1.0877740383148193, "eoc/jacobian_sigma/layer_21/mlp": 3.0834085941314697, "eoc/jacobian_sigma/layer_21": 3.0834085941314697, "eoc/jacobian_sigma/layer_27/attn": 2.159355878829956, "eoc/jacobian_sigma/layer_27/mlp": 18.167097091674805, "eoc/jacobian_sigma/layer_27": 18.167097091674805, "eoc/layer0_sigma": 2752.44921875, "eoc/sigma_max": 18.167097091674805, "eoc/sigma_min": 1.583417534828186, "eoc/sigma_mean": 7.589577168226242, "eoc/time_s": 0.6358621120452881} {"step": 9510, "timestamp": 1778204813.3114862, "train/loss": 2.3425644636154175, "train/z_loss": 0.0018126029288396239, "train/perplexity": 10.407893031343152, "train/grad_norm": 0.1259765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1158939.342737613, "perf/iters_per_sec": 0.5526253427208009, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.8095442295074462, "data/tokens_consumed": 19946012672, "data/tokens_consumed_B": 19.946012672, "train/loss_slope": -2.3840115759203132e-05} {"step": 9520, "timestamp": 1778204823.6763265, "train/loss": 2.337845706939697, "train/z_loss": 0.0018175981123931706, "train/perplexity": 10.358896409106364, "train/grad_norm": 0.16796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025079.6468129, "perf/iters_per_sec": 0.9656332239212513, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0355898857116699, "data/tokens_consumed": 19966984192, "data/tokens_consumed_B": 19.966984192, "train/loss_slope": -2.1541160110807793e-05} {"step": 9525, "timestamp": 1778204829.4656117, "eos/sharpness": 24.377322196960446, "eos/L0_probe": 2.1566803455352783, "eos/L_plus": 2.256218194961548, "eos/L_minus": 2.3009157180786133, "eos/grad_norm": 0.20153820514678955, "eos/embed_grad_frac": 0.08203984797000885, "eos/time_s": 0.6205618381500244} {"step": 9525, "timestamp": 1778204830.8470309, "geo/rankme_last": 440.8182067871094, "geo/layer_0/stable_rank_q_proj": 15.133035659790039, "geo/layer_0/stable_rank_k_proj": 12.437996864318848, "geo/layer_0/stable_rank_o_proj": 55.6331672668457, "geo/layer_0/stable_rank_gate_proj": 166.4769744873047, "geo/layer_0/stable_rank_down_proj": 47.78163146972656, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.03274340555071831, "geo/layer_0/attn_entropy_mean": 6.395322322845459, "geo/layer_0/attn_entropy_std": 0.2667628824710846, "geo/layer_7/stable_rank_q_proj": 43.898231506347656, "geo/layer_7/stable_rank_k_proj": 44.25844192504883, "geo/layer_7/stable_rank_o_proj": 106.11711883544922, "geo/layer_7/stable_rank_gate_proj": 128.4774169921875, "geo/layer_7/stable_rank_down_proj": 170.76365661621094, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5927151441574097, "geo/layer_7/attn_entropy_mean": 4.687387466430664, "geo/layer_7/attn_entropy_std": 0.9369177222251892, "geo/layer_14/stable_rank_q_proj": 65.52500915527344, "geo/layer_14/stable_rank_k_proj": 43.26816177368164, "geo/layer_14/stable_rank_o_proj": 48.339111328125, "geo/layer_14/stable_rank_gate_proj": 115.1988754272461, "geo/layer_14/stable_rank_down_proj": 141.268310546875, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.383992999792099, "geo/layer_14/attn_entropy_mean": 5.737934112548828, "geo/layer_14/attn_entropy_std": 0.5749703049659729, "geo/layer_21/stable_rank_q_proj": 52.51984405517578, "geo/layer_21/stable_rank_k_proj": 32.53361129760742, "geo/layer_21/stable_rank_o_proj": 93.05076599121094, "geo/layer_21/stable_rank_gate_proj": 110.92324829101562, "geo/layer_21/stable_rank_down_proj": 74.19161987304688, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.16191278398036957, "geo/layer_21/attn_entropy_mean": 5.790489196777344, "geo/layer_21/attn_entropy_std": 0.3064767122268677, "geo/layer_27/stable_rank_q_proj": 43.07394790649414, "geo/layer_27/stable_rank_k_proj": 33.14889144897461, "geo/layer_27/stable_rank_o_proj": 112.89684295654297, "geo/layer_27/stable_rank_gate_proj": 95.16567993164062, "geo/layer_27/stable_rank_down_proj": 154.673583984375, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.06041202321648598, "geo/layer_27/attn_entropy_mean": 4.5369486808776855, "geo/layer_27/attn_entropy_std": 0.5105897188186646, "attnres/final_alpha/block_0": 0.2492855191230774, "attnres/block_norm/0": 1.3879129886627197, "attnres/final_alpha/block_1": 0.010150309652090073, "attnres/block_norm/1": 16429.380859375, "attnres/final_alpha/block_2": 0.020855043083429337, "attnres/block_norm/2": 12089.39453125, "attnres/final_alpha/block_3": 0.02121710404753685, "attnres/block_norm/3": 13095.2421875, "attnres/final_alpha/block_4": 0.03105207160115242, "attnres/block_norm/4": 5326.1552734375, "attnres/final_alpha/block_5": 0.4906580150127411, "attnres/block_norm/5": 3804.375, "attnres/final_alpha/block_6": 0.17678195238113403, "attnres/block_norm/6": 9594.9619140625, "geo/tier1_time_s": 1.3601586818695068, "geo/step": 9525.0, "geo/rankme_slope": 0.0034160285012442477} {"step": 9530, "timestamp": 1778204836.0261123, "train/loss": 2.3301929235458374, "train/z_loss": 0.001816508884076029, "train/perplexity": 10.279924581250107, "train/grad_norm": 0.14453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1698818.9320374548, "perf/iters_per_sec": 0.8100599918544077, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2344764709472655, "data/tokens_consumed": 19987955712, "data/tokens_consumed_B": 19.987955712, "train/loss_slope": -2.4535484482782043e-05} {"step": 9540, "timestamp": 1778204846.3800108, "train/loss": 2.3162025451660155, "train/z_loss": 0.001813833275809884, "train/perplexity": 10.137105919691802, "train/grad_norm": 0.107421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026333.0254645122, "perf/iters_per_sec": 0.9662308814356385, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349493265151977, "data/tokens_consumed": 20008927232, "data/tokens_consumed_B": 20.008927232, "train/loss_slope": -2.6595087034223726e-05} {"step": 9550, "timestamp": 1778204856.7269678, "grad/layer_0/attn": 0.0032849826384335756, "grad/layer_0/mlp": 0.0030739407520741224, "grad/layer_0/attn_mlp_ratio": 1.0686551226960188, "grad/layer_4/attn": 0.0017311552073806524, "grad/layer_4/mlp": 0.0028356832917779684, "grad/layer_4/attn_mlp_ratio": 0.6104895957003229, "grad/layer_8/attn": 0.007111868355423212, "grad/layer_8/mlp": 0.004133671056479216, "grad/layer_8/attn_mlp_ratio": 1.7204726951431295, "grad/layer_12/attn": 0.003995514940470457, "grad/layer_12/mlp": 0.005595171824097633, "grad/layer_12/attn_mlp_ratio": 0.7141004771028261, "grad/layer_16/attn": 0.005224439315497875, "grad/layer_16/mlp": 0.004761177580803633, "grad/layer_16/attn_mlp_ratio": 1.0972997997033478, "grad/layer_20/attn": 0.007024100981652737, "grad/layer_20/mlp": 0.007107687648385763, "grad/layer_20/attn_mlp_ratio": 0.9882399495177585, "grad/layer_24/attn": 0.014419197104871273, "grad/layer_24/mlp": 0.01439549122005701, "grad/layer_24/attn_mlp_ratio": 1.001646750658745, "grad/layer_27/attn": 0.01012813113629818, "grad/layer_27/mlp": 0.012631766498088837, "grad/layer_27/attn_mlp_ratio": 0.8017984703604758} {"step": 9550, "timestamp": 1778204856.7431638, "train/loss": 2.331298089027405, "train/z_loss": 0.001801891636569053, "train/perplexity": 10.291291879266122, "train/grad_norm": 0.1943359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024961.885539112, "perf/iters_per_sec": 0.9655770709701118, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035650110244751, "data/tokens_consumed": 20029898752, "data/tokens_consumed_B": 20.029898752, "train/loss_slope": -2.6515106020337105e-05} {"step": 9560, "timestamp": 1778204867.0971315, "train/loss": 2.3569602966308594, "train/z_loss": 0.001803339587058872, "train/perplexity": 10.55880698120086, "train/grad_norm": 0.1357421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026721.2449079629, "perf/iters_per_sec": 0.9664159988918127, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347510814666747, "data/tokens_consumed": 20050870272, "data/tokens_consumed_B": 20.050870272, "train/loss_slope": -2.5420559228736267e-05} {"step": 9570, "timestamp": 1778204877.452883, "train/loss": 2.393545079231262, "train/z_loss": 0.0017814574530348182, "train/perplexity": 10.952251803011112, "train/grad_norm": 0.19140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026128.773911827, "perf/iters_per_sec": 0.9661334867056975, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350536584854126, "data/tokens_consumed": 20071841792, "data/tokens_consumed_B": 20.071841792, "train/loss_slope": -1.938933872654384e-05} {"step": 9580, "timestamp": 1778204887.8119597, "train/loss": 2.3583139181137085, "train/z_loss": 0.0017964109196327628, "train/perplexity": 10.573109286934185, "train/grad_norm": 0.10400390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025412.0256549476, "perf/iters_per_sec": 0.9657917145037401, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354199409484863, "data/tokens_consumed": 20092813312, "data/tokens_consumed_B": 20.092813312, "train/loss_slope": -2.0295631824725722e-05} {"step": 9590, "timestamp": 1778204898.1682518, "train/loss": 2.290435290336609, "train/z_loss": 0.0018276918795891105, "train/perplexity": 9.8792370817963, "train/grad_norm": 0.1748046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026083.551017078, "perf/iters_per_sec": 0.9661119227490798, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350767612457275, "data/tokens_consumed": 20113784832, "data/tokens_consumed_B": 20.113784832, "train/loss_slope": -2.4608109657592013e-05} {"step": 9600, "timestamp": 1778204908.5222116, "grad/layer_0/attn": 0.002249818528071046, "grad/layer_0/mlp": 0.0024061149451881647, "grad/layer_0/attn_mlp_ratio": 0.93504196009676, "grad/layer_4/attn": 0.001413031597621739, "grad/layer_4/mlp": 0.002670337911695242, "grad/layer_4/attn_mlp_ratio": 0.5291583280592586, "grad/layer_8/attn": 0.008175019174814224, "grad/layer_8/mlp": 0.004201875999569893, "grad/layer_8/attn_mlp_ratio": 1.9455640721179346, "grad/layer_12/attn": 0.00380913564004004, "grad/layer_12/mlp": 0.005715228617191315, "grad/layer_12/attn_mlp_ratio": 0.6664887493622473, "grad/layer_16/attn": 0.004571862518787384, "grad/layer_16/mlp": 0.004595770500600338, "grad/layer_16/attn_mlp_ratio": 0.9947978078344837, "grad/layer_20/attn": 0.00615974934771657, "grad/layer_20/mlp": 0.006625667214393616, "grad/layer_20/attn_mlp_ratio": 0.9296798428642977, "grad/layer_24/attn": 0.011781254783272743, "grad/layer_24/mlp": 0.009771924465894699, "grad/layer_24/attn_mlp_ratio": 1.2056227720372372, "grad/layer_27/attn": 0.0070298356004059315, "grad/layer_27/mlp": 0.008412276394665241, "grad/layer_27/attn_mlp_ratio": 0.8356638782455613} {"step": 9600, "timestamp": 1778204909.1206372, "eos/sharpness": 34.977936744689934, "eos/L0_probe": 2.159512519836426, "eos/L_plus": 2.284052848815918, "eos/L_minus": 2.384751558303833, "eos/grad_norm": 0.1401362121105194, "eos/embed_grad_frac": 0.14835086464881897, "eos/time_s": 0.5957024097442627} {"step": 9600, "timestamp": 1778204909.1397474, "train/loss": 2.3152019500732424, "train/z_loss": 0.0018256338662467897, "train/perplexity": 10.126967854148797, "train/grad_norm": 0.1396484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1912744.4235169743, "perf/iters_per_sec": 0.9120676152787086, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0964099407196044, "data/tokens_consumed": 20134756352, "data/tokens_consumed_B": 20.134756352, "train/loss_slope": -2.427729128217535e-05} {"step": 9600, "timestamp": 1778204910.5035942, "geo/rankme_last": 439.80999755859375, "geo/layer_0/stable_rank_q_proj": 15.127607345581055, "geo/layer_0/stable_rank_k_proj": 12.484109878540039, "geo/layer_0/stable_rank_o_proj": 55.65669250488281, "geo/layer_0/stable_rank_gate_proj": 165.6133270263672, "geo/layer_0/stable_rank_down_proj": 47.67050552368164, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.027449743822216988, "geo/layer_0/attn_entropy_mean": 6.392553329467773, "geo/layer_0/attn_entropy_std": 0.27121487259864807, "geo/layer_7/stable_rank_q_proj": 43.742279052734375, "geo/layer_7/stable_rank_k_proj": 44.34504318237305, "geo/layer_7/stable_rank_o_proj": 105.82170104980469, "geo/layer_7/stable_rank_gate_proj": 128.25880432128906, "geo/layer_7/stable_rank_down_proj": 170.84649658203125, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5972245931625366, "geo/layer_7/attn_entropy_mean": 4.686465263366699, "geo/layer_7/attn_entropy_std": 0.923869252204895, "geo/layer_14/stable_rank_q_proj": 65.37161254882812, "geo/layer_14/stable_rank_k_proj": 42.989803314208984, "geo/layer_14/stable_rank_o_proj": 48.345977783203125, "geo/layer_14/stable_rank_gate_proj": 114.85124206542969, "geo/layer_14/stable_rank_down_proj": 140.89671325683594, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38789862394332886, "geo/layer_14/attn_entropy_mean": 5.7436113357543945, "geo/layer_14/attn_entropy_std": 0.5446716547012329, "geo/layer_21/stable_rank_q_proj": 52.49327850341797, "geo/layer_21/stable_rank_k_proj": 32.5346565246582, "geo/layer_21/stable_rank_o_proj": 92.52044677734375, "geo/layer_21/stable_rank_gate_proj": 110.78679656982422, "geo/layer_21/stable_rank_down_proj": 74.02720642089844, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15893946588039398, "geo/layer_21/attn_entropy_mean": 5.781580924987793, "geo/layer_21/attn_entropy_std": 0.29947271943092346, "geo/layer_27/stable_rank_q_proj": 43.08436584472656, "geo/layer_27/stable_rank_k_proj": 33.0901985168457, "geo/layer_27/stable_rank_o_proj": 113.17761993408203, "geo/layer_27/stable_rank_gate_proj": 94.98179626464844, "geo/layer_27/stable_rank_down_proj": 154.18731689453125, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0619325116276741, "geo/layer_27/attn_entropy_mean": 4.538733005523682, "geo/layer_27/attn_entropy_std": 0.533187985420227, "attnres/final_alpha/block_0": 0.24979649484157562, "attnres/block_norm/0": 1.390621304512024, "attnres/final_alpha/block_1": 0.010156385600566864, "attnres/block_norm/1": 16594.5078125, "attnres/final_alpha/block_2": 0.020451677963137627, "attnres/block_norm/2": 12261.8359375, "attnres/final_alpha/block_3": 0.02141474187374115, "attnres/block_norm/3": 13260.373046875, "attnres/final_alpha/block_4": 0.03078840672969818, "attnres/block_norm/4": 5326.453125, "attnres/final_alpha/block_5": 0.4929894208908081, "attnres/block_norm/5": 3816.044677734375, "attnres/final_alpha/block_6": 0.1744028925895691, "attnres/block_norm/6": 9638.546875, "geo/tier1_time_s": 1.3602964878082275, "geo/step": 9600.0, "geo/rankme_slope": 0.0032459038498211783} {"step": 9610, "timestamp": 1778204921.2347777, "train/loss": 2.357766532897949, "train/z_loss": 0.0018036472611129284, "train/perplexity": 10.567323306950332, "train/grad_norm": 0.2099609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1734444.287877952, "perf/iters_per_sec": 0.8270474852933655, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2091204166412353, "data/tokens_consumed": 20155727872, "data/tokens_consumed_B": 20.155727872, "train/loss_slope": -2.2487305407405335e-05} {"step": 9620, "timestamp": 1778204931.6164427, "train/loss": 2.3883636713027956, "train/z_loss": 0.001793095504399389, "train/perplexity": 10.895650482580841, "train/grad_norm": 0.193359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021356.4349091274, "perf/iters_per_sec": 0.9638578581376683, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03749737739563, "data/tokens_consumed": 20176699392, "data/tokens_consumed_B": 20.176699392, "train/loss_slope": -1.8594971427036158e-05} {"step": 9630, "timestamp": 1778204941.979517, "train/loss": 2.331901288032532, "train/z_loss": 0.0018076024716719985, "train/perplexity": 10.297501448904, "train/grad_norm": 0.169921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025107.1543884755, "perf/iters_per_sec": 0.9656463405554178, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0355758190155029, "data/tokens_consumed": 20197670912, "data/tokens_consumed_B": 20.197670912, "train/loss_slope": -2.1842353235949153e-05} {"step": 9640, "timestamp": 1778204952.3410068, "train/loss": 2.3039227962493896, "train/z_loss": 0.0018157909042201936, "train/perplexity": 10.01338598379436, "train/grad_norm": 0.20703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025260.1381163995, "perf/iters_per_sec": 0.9657192888814924, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354975938796998, "data/tokens_consumed": 20218642432, "data/tokens_consumed_B": 20.218642432, "train/loss_slope": -2.384449562223827e-05} {"step": 9650, "timestamp": 1778204962.6920674, "grad/layer_0/attn": 0.003009253880009055, "grad/layer_0/mlp": 0.0029595408122986555, "grad/layer_0/attn_mlp_ratio": 1.0167975267730927, "grad/layer_4/attn": 0.0017253593541681767, "grad/layer_4/mlp": 0.002876708749681711, "grad/layer_4/attn_mlp_ratio": 0.5997685008543969, "grad/layer_8/attn": 0.006632627919316292, "grad/layer_8/mlp": 0.004409512504935265, "grad/layer_8/attn_mlp_ratio": 1.5041635014021386, "grad/layer_12/attn": 0.005050415173172951, "grad/layer_12/mlp": 0.006443789228796959, "grad/layer_12/attn_mlp_ratio": 0.7837647873748617, "grad/layer_16/attn": 0.0061761909164488316, "grad/layer_16/mlp": 0.004816827829927206, "grad/layer_16/attn_mlp_ratio": 1.2822112407370485, "grad/layer_20/attn": 0.005488733295351267, "grad/layer_20/mlp": 0.006613383535295725, "grad/layer_20/attn_mlp_ratio": 0.8299432783632599, "grad/layer_24/attn": 0.008413993753492832, "grad/layer_24/mlp": 0.010035611689090729, "grad/layer_24/attn_mlp_ratio": 0.8384136343973881, "grad/layer_27/attn": 0.006473853252828121, "grad/layer_27/mlp": 0.008343047462403774, "grad/layer_27/attn_mlp_ratio": 0.7759578504623668} {"step": 9650, "timestamp": 1778204962.7083213, "train/loss": 2.2876155376434326, "train/z_loss": 0.0018201762111857533, "train/perplexity": 9.85141931447214, "train/grad_norm": 0.126953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024308.7130674005, "perf/iters_per_sec": 0.9652656140648844, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0359842777252197, "data/tokens_consumed": 20239613952, "data/tokens_consumed_B": 20.239613952, "train/loss_slope": -2.473473126846068e-05} {"step": 9660, "timestamp": 1778204973.0693324, "train/loss": 2.3484060764312744, "train/z_loss": 0.0017998208524659276, "train/perplexity": 10.468869840716808, "train/grad_norm": 0.2265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025367.5342838645, "perf/iters_per_sec": 0.9657704993647883, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354426860809327, "data/tokens_consumed": 20260585472, "data/tokens_consumed_B": 20.260585472, "train/loss_slope": -2.5488221920756193e-05} {"step": 9670, "timestamp": 1778204983.8462892, "train/loss": 2.3528400182724, "train/z_loss": 0.001805863343179226, "train/perplexity": 10.5153912611522, "train/grad_norm": 0.2001953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1946840.3602493044, "perf/iters_per_sec": 0.9283258248564265, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0772079944610595, "data/tokens_consumed": 20281556992, "data/tokens_consumed_B": 20.281556992, "train/loss_slope": -2.0518750111476557e-05} {"step": 9675, "timestamp": 1778204989.6113656, "eos/sharpness": 32.104635238647454, "eos/L0_probe": 2.1575767993927, "eos/L_plus": 2.3205745220184326, "eos/L_minus": 2.3156254291534424, "eos/grad_norm": 0.2032880187034607, "eos/embed_grad_frac": 0.0703205093741417, "eos/time_s": 0.5858392715454102} {"step": 9675, "timestamp": 1778204990.9946096, "geo/rankme_last": 439.66363525390625, "geo/layer_0/stable_rank_q_proj": 15.117546081542969, "geo/layer_0/stable_rank_k_proj": 12.490253448486328, "geo/layer_0/stable_rank_o_proj": 55.57322311401367, "geo/layer_0/stable_rank_gate_proj": 165.208251953125, "geo/layer_0/stable_rank_down_proj": 47.669471740722656, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.032570913434028625, "geo/layer_0/attn_entropy_mean": 6.38825798034668, "geo/layer_0/attn_entropy_std": 0.2716725468635559, "geo/layer_7/stable_rank_q_proj": 43.5724983215332, "geo/layer_7/stable_rank_k_proj": 44.1754035949707, "geo/layer_7/stable_rank_o_proj": 105.86627960205078, "geo/layer_7/stable_rank_gate_proj": 127.7381820678711, "geo/layer_7/stable_rank_down_proj": 171.0298309326172, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5836009383201599, "geo/layer_7/attn_entropy_mean": 4.677796840667725, "geo/layer_7/attn_entropy_std": 0.9346208572387695, "geo/layer_14/stable_rank_q_proj": 65.5418930053711, "geo/layer_14/stable_rank_k_proj": 42.91492462158203, "geo/layer_14/stable_rank_o_proj": 48.444374084472656, "geo/layer_14/stable_rank_gate_proj": 114.44874572753906, "geo/layer_14/stable_rank_down_proj": 140.19293212890625, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38178324699401855, "geo/layer_14/attn_entropy_mean": 5.75970458984375, "geo/layer_14/attn_entropy_std": 0.5726572275161743, "geo/layer_21/stable_rank_q_proj": 52.4859619140625, "geo/layer_21/stable_rank_k_proj": 32.3125114440918, "geo/layer_21/stable_rank_o_proj": 92.48155212402344, "geo/layer_21/stable_rank_gate_proj": 109.99220275878906, "geo/layer_21/stable_rank_down_proj": 73.83745574951172, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.16205956041812897, "geo/layer_21/attn_entropy_mean": 5.820911407470703, "geo/layer_21/attn_entropy_std": 0.30042073130607605, "geo/layer_27/stable_rank_q_proj": 43.05339431762695, "geo/layer_27/stable_rank_k_proj": 33.09309005737305, "geo/layer_27/stable_rank_o_proj": 112.86332702636719, "geo/layer_27/stable_rank_gate_proj": 94.98332977294922, "geo/layer_27/stable_rank_down_proj": 153.74139404296875, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.06659699976444244, "geo/layer_27/attn_entropy_mean": 4.5476274490356445, "geo/layer_27/attn_entropy_std": 0.4991410970687866, "attnres/final_alpha/block_0": 0.24916347861289978, "attnres/block_norm/0": 1.393648386001587, "attnres/final_alpha/block_1": 0.010049611330032349, "attnres/block_norm/1": 16721.470703125, "attnres/final_alpha/block_2": 0.020215589553117752, "attnres/block_norm/2": 12334.572265625, "attnres/final_alpha/block_3": 0.020738273859024048, "attnres/block_norm/3": 13465.033203125, "attnres/final_alpha/block_4": 0.030789192765951157, "attnres/block_norm/4": 5379.72412109375, "attnres/final_alpha/block_5": 0.4957590401172638, "attnres/block_norm/5": 3816.86474609375, "attnres/final_alpha/block_6": 0.17328479886054993, "attnres/block_norm/6": 9799.7314453125, "geo/tier1_time_s": 1.3631062507629395, "geo/step": 9675.0, "geo/rankme_slope": 0.0030934678949704883} {"step": 9680, "timestamp": 1778204996.7244925, "train/loss": 2.3224539518356324, "train/z_loss": 0.0018192246207036078, "train/perplexity": 10.200675584135395, "train/grad_norm": 0.1845703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1629226.891896197, "perf/iters_per_sec": 0.7768759211998926, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2872068405151367, "data/tokens_consumed": 20302528512, "data/tokens_consumed_B": 20.302528512, "train/loss_slope": -2.1546619760357018e-05} {"step": 9690, "timestamp": 1778205007.1001675, "train/loss": 2.3436011552810667, "train/z_loss": 0.0018020557588897645, "train/perplexity": 10.418688402074304, "train/grad_norm": 0.2294921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022573.6287902908, "perf/iters_per_sec": 0.9644382614089445, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0368730068206786, "data/tokens_consumed": 20323500032, "data/tokens_consumed_B": 20.323500032, "train/loss_slope": -1.5798968035574572e-05} {"step": 9700, "timestamp": 1778205017.4539487, "grad/layer_0/attn": 0.002409532433375716, "grad/layer_0/mlp": 0.002632722957059741, "grad/layer_0/attn_mlp_ratio": 0.9152244201738078, "grad/layer_4/attn": 0.0014642503811046481, "grad/layer_4/mlp": 0.0026817121542990208, "grad/layer_4/attn_mlp_ratio": 0.546013234178024, "grad/layer_8/attn": 0.007341523189097643, "grad/layer_8/mlp": 0.004092462360858917, "grad/layer_8/attn_mlp_ratio": 1.79391338572152, "grad/layer_12/attn": 0.004137290641665459, "grad/layer_12/mlp": 0.005378806497901678, "grad/layer_12/attn_mlp_ratio": 0.7691837522619709, "grad/layer_16/attn": 0.004586152732372284, "grad/layer_16/mlp": 0.004409561865031719, "grad/layer_16/attn_mlp_ratio": 1.0400472356984545, "grad/layer_20/attn": 0.004422565922141075, "grad/layer_20/mlp": 0.006088919937610626, "grad/layer_20/attn_mlp_ratio": 0.726330103667538, "grad/layer_24/attn": 0.005387856159359217, "grad/layer_24/mlp": 0.009196926839649677, "grad/layer_24/attn_mlp_ratio": 0.5858322236019032, "grad/layer_27/attn": 0.007248067297041416, "grad/layer_27/mlp": 0.0078026349656283855, "grad/layer_27/attn_mlp_ratio": 0.9289255791251969} {"step": 9700, "timestamp": 1778205017.4690356, "train/loss": 2.2765350103378297, "train/z_loss": 0.0018333026790060103, "train/perplexity": 9.742862935413457, "train/grad_norm": 0.1064453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023603.3539669982, "perf/iters_per_sec": 0.9649292726359359, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0363453865051269, "data/tokens_consumed": 20344471552, "data/tokens_consumed_B": 20.344471552, "train/loss_slope": -1.9890251454382765e-05} {"step": 9710, "timestamp": 1778205027.8404434, "train/loss": 2.3288623094558716, "train/z_loss": 0.001814115117304027, "train/perplexity": 10.266255065200701, "train/grad_norm": 0.232421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023274.4073256848, "perf/iters_per_sec": 0.9647724186542915, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036513876914978, "data/tokens_consumed": 20365443072, "data/tokens_consumed_B": 20.365443072, "train/loss_slope": -2.1230928482252476e-05} {"step": 9720, "timestamp": 1778205038.2193725, "train/loss": 2.3805697202682494, "train/z_loss": 0.0017879694816656411, "train/perplexity": 10.811060389956445, "train/grad_norm": 0.189453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022284.2567750576, "perf/iters_per_sec": 0.9643002780795372, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0370213747024537, "data/tokens_consumed": 20386414592, "data/tokens_consumed_B": 20.386414592, "train/loss_slope": -1.9455690481195592e-05} {"step": 9730, "timestamp": 1778205049.0707312, "train/loss": 2.365681791305542, "train/z_loss": 0.0017844837973825633, "train/perplexity": 10.651298305078184, "train/grad_norm": 0.2158203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1933595.804143237, "perf/iters_per_sec": 0.9220103283611474, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0845865488052369, "data/tokens_consumed": 20407386112, "data/tokens_consumed_B": 20.407386112, "train/loss_slope": -1.830312727165045e-05} {"step": 9740, "timestamp": 1778205059.4474258, "train/loss": 2.305346369743347, "train/z_loss": 0.0018105759751051663, "train/perplexity": 10.027650925853294, "train/grad_norm": 0.203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022358.045050653, "perf/iters_per_sec": 0.9643354630711808, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0369835376739502, "data/tokens_consumed": 20428357632, "data/tokens_consumed_B": 20.428357632, "train/loss_slope": -2.0738858472753663e-05} {"step": 9750, "timestamp": 1778205070.228326, "grad/layer_0/attn": 0.0029933294281363487, "grad/layer_0/mlp": 0.003046691883355379, "grad/layer_0/attn_mlp_ratio": 0.9824850836544818, "grad/layer_4/attn": 0.0017080882098525763, "grad/layer_4/mlp": 0.0028220955282449722, "grad/layer_4/attn_mlp_ratio": 0.6052552552639104, "grad/layer_8/attn": 0.005805226508527994, "grad/layer_8/mlp": 0.004410303197801113, "grad/layer_8/attn_mlp_ratio": 1.316287365411436, "grad/layer_12/attn": 0.004170913249254227, "grad/layer_12/mlp": 0.006189381703734398, "grad/layer_12/attn_mlp_ratio": 0.6738820421027644, "grad/layer_16/attn": 0.005947267636656761, "grad/layer_16/mlp": 0.005549329798668623, "grad/layer_16/attn_mlp_ratio": 1.071709151420897, "grad/layer_20/attn": 0.005948004312813282, "grad/layer_20/mlp": 0.008732634596526623, "grad/layer_20/attn_mlp_ratio": 0.6811236836895378, "grad/layer_24/attn": 0.024036487564444542, "grad/layer_24/mlp": 0.01446488592773676, "grad/layer_24/attn_mlp_ratio": 1.6617128899843394, "grad/layer_27/attn": 0.007059827446937561, "grad/layer_27/mlp": 0.013044260442256927, "grad/layer_27/attn_mlp_ratio": 0.541220977921073} {"step": 9750, "timestamp": 1778205070.844107, "eos/sharpness": 47.455215454101555, "eos/L0_probe": 2.160346031188965, "eos/L_plus": 2.4775304794311523, "eos/L_minus": 2.317713737487793, "eos/grad_norm": 0.25828129053115845, "eos/embed_grad_frac": 0.03781178966164589, "eos/time_s": 0.612962007522583} {"step": 9750, "timestamp": 1778205070.8650248, "train/loss": 2.306743788719177, "train/z_loss": 0.001803489204030484, "train/perplexity": 10.04167355099939, "train/grad_norm": 0.2578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1838015.2750509784, "perf/iters_per_sec": 0.8764339804892437, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.140987253189087, "data/tokens_consumed": 20449329152, "data/tokens_consumed_B": 20.449329152, "train/loss_slope": -2.2811352194446407e-05} {"step": 9750, "timestamp": 1778205072.2302165, "geo/rankme_last": 437.7962951660156, "geo/layer_0/stable_rank_q_proj": 15.128082275390625, "geo/layer_0/stable_rank_k_proj": 12.502568244934082, "geo/layer_0/stable_rank_o_proj": 55.4917106628418, "geo/layer_0/stable_rank_gate_proj": 166.43643188476562, "geo/layer_0/stable_rank_down_proj": 47.79289245605469, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.03494764491915703, "geo/layer_0/attn_entropy_mean": 6.3929524421691895, "geo/layer_0/attn_entropy_std": 0.2705547511577606, "geo/layer_7/stable_rank_q_proj": 43.797027587890625, "geo/layer_7/stable_rank_k_proj": 44.37813949584961, "geo/layer_7/stable_rank_o_proj": 105.69573974609375, "geo/layer_7/stable_rank_gate_proj": 126.97724151611328, "geo/layer_7/stable_rank_down_proj": 170.44541931152344, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5891036987304688, "geo/layer_7/attn_entropy_mean": 4.699226379394531, "geo/layer_7/attn_entropy_std": 0.9330900311470032, "geo/layer_14/stable_rank_q_proj": 65.00637817382812, "geo/layer_14/stable_rank_k_proj": 42.65707778930664, "geo/layer_14/stable_rank_o_proj": 48.569801330566406, "geo/layer_14/stable_rank_gate_proj": 113.75443267822266, "geo/layer_14/stable_rank_down_proj": 140.35272216796875, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3982542157173157, "geo/layer_14/attn_entropy_mean": 5.733869552612305, "geo/layer_14/attn_entropy_std": 0.5887684226036072, "geo/layer_21/stable_rank_q_proj": 52.49711608886719, "geo/layer_21/stable_rank_k_proj": 32.338871002197266, "geo/layer_21/stable_rank_o_proj": 92.4613265991211, "geo/layer_21/stable_rank_gate_proj": 110.05892181396484, "geo/layer_21/stable_rank_down_proj": 73.50560760498047, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.16150623559951782, "geo/layer_21/attn_entropy_mean": 5.844564437866211, "geo/layer_21/attn_entropy_std": 0.2818138301372528, "geo/layer_27/stable_rank_q_proj": 42.990264892578125, "geo/layer_27/stable_rank_k_proj": 32.991973876953125, "geo/layer_27/stable_rank_o_proj": 112.64671325683594, "geo/layer_27/stable_rank_gate_proj": 94.65631866455078, "geo/layer_27/stable_rank_down_proj": 153.14662170410156, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.06863313913345337, "geo/layer_27/attn_entropy_mean": 4.519349098205566, "geo/layer_27/attn_entropy_std": 0.5013014078140259, "attnres/final_alpha/block_0": 0.2463729977607727, "attnres/block_norm/0": 1.3962430953979492, "attnres/final_alpha/block_1": 0.009766405448317528, "attnres/block_norm/1": 16830.15625, "attnres/final_alpha/block_2": 0.01980222389101982, "attnres/block_norm/2": 12474.23828125, "attnres/final_alpha/block_3": 0.020874537527561188, "attnres/block_norm/3": 13661.51171875, "attnres/final_alpha/block_4": 0.030001789331436157, "attnres/block_norm/4": 5444.1923828125, "attnres/final_alpha/block_5": 0.504743218421936, "attnres/block_norm/5": 3833.7138671875, "attnres/final_alpha/block_6": 0.1684388518333435, "attnres/block_norm/6": 10026.4140625, "geo/tier1_time_s": 1.361048698425293, "geo/step": 9750.0, "geo/rankme_slope": 0.002851109858005702} {"step": 9760, "timestamp": 1778205082.5874684, "train/loss": 2.2977433443069457, "train/z_loss": 0.001803321996703744, "train/perplexity": 9.951699536838102, "train/grad_norm": 0.2119140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1789620.0251659094, "perf/iters_per_sec": 0.8533573270635173, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1718420505523681, "data/tokens_consumed": 20470300672, "data/tokens_consumed_B": 20.470300672, "train/loss_slope": -2.410716342382374e-05} {"step": 9770, "timestamp": 1778205092.9568005, "train/loss": 2.3307044506073, "train/z_loss": 0.0017986558261327446, "train/perplexity": 10.285184386014757, "train/grad_norm": 0.27734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023725.3339129894, "perf/iters_per_sec": 0.9649874372067401, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0362829208374023, "data/tokens_consumed": 20491272192, "data/tokens_consumed_B": 20.491272192, "train/loss_slope": -2.601478400975776e-05} {"step": 9780, "timestamp": 1778205103.3123045, "train/loss": 2.334860062599182, "train/z_loss": 0.0018005278310738505, "train/perplexity": 10.328014552728648, "train/grad_norm": 0.2578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026464.7186013025, "perf/iters_per_sec": 0.9662936776167405, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348820686340332, "data/tokens_consumed": 20512243712, "data/tokens_consumed_B": 20.512243712, "train/loss_slope": -3.0090309856104606e-05} {"step": 9790, "timestamp": 1778205113.669835, "train/loss": 2.3376702308654784, "train/z_loss": 0.0017955893301405013, "train/perplexity": 10.357078830106733, "train/grad_norm": 0.1767578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025993.157947967, "perf/iters_per_sec": 0.9660688199748836, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351229429244995, "data/tokens_consumed": 20533215232, "data/tokens_consumed_B": 20.533215232, "train/loss_slope": -2.7931299304017898e-05} {"step": 9800, "timestamp": 1778205124.027528, "grad/layer_0/attn": 0.003281132085248828, "grad/layer_0/mlp": 0.0031066627707332373, "grad/layer_0/attn_mlp_ratio": 1.0561596870259733, "grad/layer_4/attn": 0.001789634465239942, "grad/layer_4/mlp": 0.0028331750072538853, "grad/layer_4/attn_mlp_ratio": 0.6316709689626573, "grad/layer_8/attn": 0.006200109608471394, "grad/layer_8/mlp": 0.004293675534427166, "grad/layer_8/attn_mlp_ratio": 1.4440097800491098, "grad/layer_12/attn": 0.006441901437938213, "grad/layer_12/mlp": 0.006096973549574614, "grad/layer_12/attn_mlp_ratio": 1.0565736065445626, "grad/layer_16/attn": 0.005630464758723974, "grad/layer_16/mlp": 0.004770426079630852, "grad/layer_16/attn_mlp_ratio": 1.180285481151639, "grad/layer_20/attn": 0.005201445892453194, "grad/layer_20/mlp": 0.006400812417268753, "grad/layer_20/attn_mlp_ratio": 0.8126227535051546, "grad/layer_24/attn": 0.018664609640836716, "grad/layer_24/mlp": 0.00943082571029663, "grad/layer_24/attn_mlp_ratio": 1.9791066038414777, "grad/layer_27/attn": 0.008222752250730991, "grad/layer_27/mlp": 0.008837590925395489, "grad/layer_27/attn_mlp_ratio": 0.9304291437680573} {"step": 9800, "timestamp": 1778205124.0433855, "train/loss": 2.3439987659454347, "train/z_loss": 0.001799499907065183, "train/perplexity": 10.422831807368174, "train/grad_norm": 0.169921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023166.3488634704, "perf/iters_per_sec": 0.9647208923642494, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0365692377090454, "data/tokens_consumed": 20554186752, "data/tokens_consumed_B": 20.554186752, "train/loss_slope": -2.7295633339502745e-05} {"step": 9810, "timestamp": 1778205134.4064662, "train/loss": 2.3564462661743164, "train/z_loss": 0.0017967205960303545, "train/perplexity": 10.553380827551361, "train/grad_norm": 0.216796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024887.0680923692, "perf/iters_per_sec": 0.965541395231423, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0356883764266969, "data/tokens_consumed": 20575158272, "data/tokens_consumed_B": 20.575158272, "train/loss_slope": -2.835024888902463e-05} {"step": 9820, "timestamp": 1778205144.7866318, "train/loss": 2.3513605117797853, "train/z_loss": 0.001797181984875351, "train/perplexity": 10.499845174612712, "train/grad_norm": 0.140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022048.7003780336, "perf/iters_per_sec": 0.9641879560365837, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0371421813964843, "data/tokens_consumed": 20596129792, "data/tokens_consumed_B": 20.596129792, "train/loss_slope": -2.693886676780315e-05} {"step": 9825, "timestamp": 1778205150.5577695, "eos/sharpness": 29.483246803283684, "eos/L0_probe": 2.1597087383270264, "eos/L_plus": 2.3380324840545654, "eos/L_minus": 2.276217460632324, "eos/grad_norm": 0.2254422903060913, "eos/embed_grad_frac": 0.06354183703660965, "eos/time_s": 0.5881466865539551} {"step": 9825, "timestamp": 1778205151.942121, "geo/rankme_last": 439.05255126953125, "geo/layer_0/stable_rank_q_proj": 15.165632247924805, "geo/layer_0/stable_rank_k_proj": 12.58044719696045, "geo/layer_0/stable_rank_o_proj": 55.269248962402344, "geo/layer_0/stable_rank_gate_proj": 166.38397216796875, "geo/layer_0/stable_rank_down_proj": 47.71376419067383, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0374961793422699, "geo/layer_0/attn_entropy_mean": 6.391002178192139, "geo/layer_0/attn_entropy_std": 0.27500009536743164, "geo/layer_7/stable_rank_q_proj": 43.858985900878906, "geo/layer_7/stable_rank_k_proj": 44.316429138183594, "geo/layer_7/stable_rank_o_proj": 105.70622253417969, "geo/layer_7/stable_rank_gate_proj": 126.5768814086914, "geo/layer_7/stable_rank_down_proj": 170.30908203125, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5965723395347595, "geo/layer_7/attn_entropy_mean": 4.670269966125488, "geo/layer_7/attn_entropy_std": 0.9317701458930969, "geo/layer_14/stable_rank_q_proj": 64.61932373046875, "geo/layer_14/stable_rank_k_proj": 42.62017822265625, "geo/layer_14/stable_rank_o_proj": 48.74270248413086, "geo/layer_14/stable_rank_gate_proj": 113.51441192626953, "geo/layer_14/stable_rank_down_proj": 140.0613555908203, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39782071113586426, "geo/layer_14/attn_entropy_mean": 5.723147392272949, "geo/layer_14/attn_entropy_std": 0.5835323333740234, "geo/layer_21/stable_rank_q_proj": 52.42469024658203, "geo/layer_21/stable_rank_k_proj": 32.4281005859375, "geo/layer_21/stable_rank_o_proj": 92.60453796386719, "geo/layer_21/stable_rank_gate_proj": 109.68777465820312, "geo/layer_21/stable_rank_down_proj": 73.24337768554688, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.16131389141082764, "geo/layer_21/attn_entropy_mean": 5.802156448364258, "geo/layer_21/attn_entropy_std": 0.2980831563472748, "geo/layer_27/stable_rank_q_proj": 42.88677978515625, "geo/layer_27/stable_rank_k_proj": 32.98961639404297, "geo/layer_27/stable_rank_o_proj": 112.78833770751953, "geo/layer_27/stable_rank_gate_proj": 94.41918182373047, "geo/layer_27/stable_rank_down_proj": 153.07814025878906, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.05858226865530014, "geo/layer_27/attn_entropy_mean": 4.518274784088135, "geo/layer_27/attn_entropy_std": 0.5035162568092346, "attnres/final_alpha/block_0": 0.24767166376113892, "attnres/block_norm/0": 1.3990836143493652, "attnres/final_alpha/block_1": 0.009820636361837387, "attnres/block_norm/1": 16949.7421875, "attnres/final_alpha/block_2": 0.02013235352933407, "attnres/block_norm/2": 12576.052734375, "attnres/final_alpha/block_3": 0.020635876804590225, "attnres/block_norm/3": 13783.03515625, "attnres/final_alpha/block_4": 0.030301295220851898, "attnres/block_norm/4": 5435.06005859375, "attnres/final_alpha/block_5": 0.5027201175689697, "attnres/block_norm/5": 3801.457275390625, "attnres/final_alpha/block_6": 0.16871805489063263, "attnres/block_norm/6": 10007.66015625, "geo/tier1_time_s": 1.3644981384277344, "geo/step": 9825.0, "geo/rankme_slope": 0.0027330878054346737} {"step": 9830, "timestamp": 1778205157.1272337, "train/loss": 2.316923189163208, "train/z_loss": 0.001814276387449354, "train/perplexity": 10.144413797094456, "train/grad_norm": 0.2890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1700522.518917388, "perf/iters_per_sec": 0.8108723253809872, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.233239769935608, "data/tokens_consumed": 20617101312, "data/tokens_consumed_B": 20.617101312, "train/loss_slope": -2.8494313569387936e-05} {"step": 9840, "timestamp": 1778205167.4992387, "train/loss": 2.356974458694458, "train/z_loss": 0.0017870220239274205, "train/perplexity": 10.558956516755718, "train/grad_norm": 0.1630859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023103.669076364, "perf/iters_per_sec": 0.9646910043126888, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0366013526916504, "data/tokens_consumed": 20638072832, "data/tokens_consumed_B": 20.638072832, "train/loss_slope": -2.700859897314803e-05} {"step": 9850, "timestamp": 1778205177.8655891, "grad/layer_0/attn": 0.0030828602612018585, "grad/layer_0/mlp": 0.0028623025864362717, "grad/layer_0/attn_mlp_ratio": 1.077055992649119, "grad/layer_4/attn": 0.0015109906671568751, "grad/layer_4/mlp": 0.0027922403533011675, "grad/layer_4/attn_mlp_ratio": 0.541139164920587, "grad/layer_8/attn": 0.0055487253703176975, "grad/layer_8/mlp": 0.004057127051055431, "grad/layer_8/attn_mlp_ratio": 1.3676488716589124, "grad/layer_12/attn": 0.004360510502010584, "grad/layer_12/mlp": 0.0056299930438399315, "grad/layer_12/attn_mlp_ratio": 0.7745143538552344, "grad/layer_16/attn": 0.004147735890001059, "grad/layer_16/mlp": 0.0045412261970341206, "grad/layer_16/attn_mlp_ratio": 0.9133515087565554, "grad/layer_20/attn": 0.005099502857774496, "grad/layer_20/mlp": 0.0070494478568434715, "grad/layer_20/attn_mlp_ratio": 0.7233903830475116, "grad/layer_24/attn": 0.016756778582930565, "grad/layer_24/mlp": 0.01063323300331831, "grad/layer_24/attn_mlp_ratio": 1.5758874483529646, "grad/layer_27/attn": 0.013292111456394196, "grad/layer_27/mlp": 0.00931645929813385, "grad/layer_27/attn_mlp_ratio": 1.4267342225584856} {"step": 9850, "timestamp": 1778205177.8805938, "train/loss": 2.356435680389404, "train/z_loss": 0.0017891471041366458, "train/perplexity": 10.553269112323122, "train/grad_norm": 0.1943359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020948.397941577, "perf/iters_per_sec": 0.9636632909496198, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0377068519592285, "data/tokens_consumed": 20659044352, "data/tokens_consumed_B": 20.659044352, "train/loss_slope": -2.516909870270167e-05} {"step": 9860, "timestamp": 1778205188.2539363, "train/loss": 2.326589322090149, "train/z_loss": 0.0018023954471573234, "train/perplexity": 10.242946497219735, "train/grad_norm": 0.189453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022948.2660158956, "perf/iters_per_sec": 0.964616902358959, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0366809844970704, "data/tokens_consumed": 20680015872, "data/tokens_consumed_B": 20.680015872, "train/loss_slope": -2.7187149323681738e-05} {"step": 9870, "timestamp": 1778205198.6283479, "train/loss": 2.3581182241439818, "train/z_loss": 0.0017852107179351152, "train/perplexity": 10.571040395646849, "train/grad_norm": 0.1708984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023036.1542149028, "perf/iters_per_sec": 0.9646588107180132, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036635947227478, "data/tokens_consumed": 20700987392, "data/tokens_consumed_B": 20.700987392, "train/loss_slope": -2.5677200755735357e-05} {"step": 9880, "timestamp": 1778205208.996998, "train/loss": 2.382700061798096, "train/z_loss": 0.0017803263384848834, "train/perplexity": 10.834116190531963, "train/grad_norm": 0.1865234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023648.5127670488, "perf/iters_per_sec": 0.9649508060298199, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036322259902954, "data/tokens_consumed": 20721958912, "data/tokens_consumed_B": 20.721958912, "train/loss_slope": -2.300200828588871e-05} {"step": 9890, "timestamp": 1778205219.3819659, "train/loss": 2.342637467384338, "train/z_loss": 0.001797428959980607, "train/perplexity": 10.408652874496333, "train/grad_norm": 0.177734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020592.0460929018, "perf/iters_per_sec": 0.9634933691467771, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0378898620605468, "data/tokens_consumed": 20742930432, "data/tokens_consumed_B": 20.742930432, "train/loss_slope": -2.433638528342583e-05} {"step": 9900, "timestamp": 1778205229.7358007, "grad/layer_0/attn": 0.002448681276291609, "grad/layer_0/mlp": 0.0027332862373441458, "grad/layer_0/attn_mlp_ratio": 0.89587440687642, "grad/layer_4/attn": 0.0015449358616024256, "grad/layer_4/mlp": 0.0028440188616514206, "grad/layer_4/attn_mlp_ratio": 0.5432227711679307, "grad/layer_8/attn": 0.0051035573706030846, "grad/layer_8/mlp": 0.004017663188278675, "grad/layer_8/attn_mlp_ratio": 1.270280012138511, "grad/layer_12/attn": 0.004209601320326328, "grad/layer_12/mlp": 0.006086934823542833, "grad/layer_12/attn_mlp_ratio": 0.6915798136833335, "grad/layer_16/attn": 0.004427335225045681, "grad/layer_16/mlp": 0.004829417448490858, "grad/layer_16/attn_mlp_ratio": 0.9167431021633606, "grad/layer_20/attn": 0.005403981078416109, "grad/layer_20/mlp": 0.006742881610989571, "grad/layer_20/attn_mlp_ratio": 0.80143495170747, "grad/layer_24/attn": 0.01678730547428131, "grad/layer_24/mlp": 0.013276119716465473, "grad/layer_24/attn_mlp_ratio": 1.264473784988077, "grad/layer_27/attn": 0.010633681900799274, "grad/layer_27/mlp": 0.011213351972401142, "grad/layer_27/attn_mlp_ratio": 0.9483053624055398} {"step": 9900, "timestamp": 1778205230.326376, "eos/sharpness": 27.97217369079589, "eos/L0_probe": 2.1550378799438477, "eos/L_plus": 2.3240628242492676, "eos/L_minus": 2.2657346725463867, "eos/grad_norm": 0.18045620620250702, "eos/embed_grad_frac": 0.22420643270015717, "eos/time_s": 0.5876436233520508} {"step": 9900, "timestamp": 1778205230.346556, "train/loss": 2.360650134086609, "train/z_loss": 0.0017872155527584254, "train/perplexity": 10.597839229729423, "train/grad_norm": 0.1806640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1913770.6246164911, "perf/iters_per_sec": 0.9125569460947471, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.095822024345398, "data/tokens_consumed": 20763901952, "data/tokens_consumed_B": 20.763901952, "train/loss_slope": -2.2375217472652125e-05} {"step": 9900, "timestamp": 1778205231.710293, "geo/rankme_last": 439.2034912109375, "geo/layer_0/stable_rank_q_proj": 15.161091804504395, "geo/layer_0/stable_rank_k_proj": 12.634074211120605, "geo/layer_0/stable_rank_o_proj": 55.48040008544922, "geo/layer_0/stable_rank_gate_proj": 166.7962646484375, "geo/layer_0/stable_rank_down_proj": 47.69675064086914, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.030345311388373375, "geo/layer_0/attn_entropy_mean": 6.391179084777832, "geo/layer_0/attn_entropy_std": 0.26556622982025146, "geo/layer_7/stable_rank_q_proj": 43.712284088134766, "geo/layer_7/stable_rank_k_proj": 44.55376434326172, "geo/layer_7/stable_rank_o_proj": 106.30425262451172, "geo/layer_7/stable_rank_gate_proj": 126.37214660644531, "geo/layer_7/stable_rank_down_proj": 169.51458740234375, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5916389226913452, "geo/layer_7/attn_entropy_mean": 4.655191898345947, "geo/layer_7/attn_entropy_std": 0.9237766861915588, "geo/layer_14/stable_rank_q_proj": 64.46236419677734, "geo/layer_14/stable_rank_k_proj": 42.24198913574219, "geo/layer_14/stable_rank_o_proj": 48.843772888183594, "geo/layer_14/stable_rank_gate_proj": 112.82185363769531, "geo/layer_14/stable_rank_down_proj": 140.0624237060547, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3856162428855896, "geo/layer_14/attn_entropy_mean": 5.67694091796875, "geo/layer_14/attn_entropy_std": 0.6122758984565735, "geo/layer_21/stable_rank_q_proj": 52.42364501953125, "geo/layer_21/stable_rank_k_proj": 32.40414810180664, "geo/layer_21/stable_rank_o_proj": 92.52081298828125, "geo/layer_21/stable_rank_gate_proj": 109.24665069580078, "geo/layer_21/stable_rank_down_proj": 73.0343246459961, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1626017987728119, "geo/layer_21/attn_entropy_mean": 5.796268463134766, "geo/layer_21/attn_entropy_std": 0.29043838381767273, "geo/layer_27/stable_rank_q_proj": 42.842247009277344, "geo/layer_27/stable_rank_k_proj": 32.85293197631836, "geo/layer_27/stable_rank_o_proj": 112.65709686279297, "geo/layer_27/stable_rank_gate_proj": 94.4615707397461, "geo/layer_27/stable_rank_down_proj": 153.13021850585938, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.06361109018325806, "geo/layer_27/attn_entropy_mean": 4.5173258781433105, "geo/layer_27/attn_entropy_std": 0.515755295753479, "attnres/final_alpha/block_0": 0.24695053696632385, "attnres/block_norm/0": 1.4015717506408691, "attnres/final_alpha/block_1": 0.009663484059274197, "attnres/block_norm/1": 17046.76953125, "attnres/final_alpha/block_2": 0.01986699365079403, "attnres/block_norm/2": 12514.248046875, "attnres/final_alpha/block_3": 0.02061915583908558, "attnres/block_norm/3": 13938.76953125, "attnres/final_alpha/block_4": 0.02992134913802147, "attnres/block_norm/4": 5489.478515625, "attnres/final_alpha/block_5": 0.504920244216919, "attnres/block_norm/5": 3845.68212890625, "attnres/final_alpha/block_6": 0.16805824637413025, "attnres/block_norm/6": 10088.9248046875, "geo/tier1_time_s": 1.3595261573791504, "geo/step": 9900.0, "geo/rankme_slope": 0.0026149305034513805} {"step": 9910, "timestamp": 1778205242.0772688, "train/loss": 2.335974335670471, "train/z_loss": 0.001796346320770681, "train/perplexity": 10.3395291952613, "train/grad_norm": 0.158203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1788381.745566554, "perf/iters_per_sec": 0.8527668693382997, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1726534366607666, "data/tokens_consumed": 20784873472, "data/tokens_consumed_B": 20.784873472, "train/loss_slope": -1.9620129413301454e-05} {"step": 9920, "timestamp": 1778205252.452242, "train/loss": 2.324050283432007, "train/z_loss": 0.001791757286991924, "train/perplexity": 10.216972248854422, "train/grad_norm": 0.1787109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022883.134109202, "perf/iters_per_sec": 0.9645858450456629, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0367143630981446, "data/tokens_consumed": 20805844992, "data/tokens_consumed_B": 20.805844992, "train/loss_slope": -2.4471571622150943e-05} {"step": 9930, "timestamp": 1778205262.818963, "train/loss": 2.351352858543396, "train/z_loss": 0.001793728640768677, "train/perplexity": 10.499764817123037, "train/grad_norm": 0.169921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024079.158930828, "perf/iters_per_sec": 0.9651561541227475, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036101770401001, "data/tokens_consumed": 20826816512, "data/tokens_consumed_B": 20.826816512, "train/loss_slope": -2.3630570807401212e-05} {"step": 9940, "timestamp": 1778205273.1907234, "train/loss": 2.3456372737884523, "train/z_loss": 0.0017921318416483701, "train/perplexity": 10.439923697805373, "train/grad_norm": 0.25390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023176.6795352623, "perf/iters_per_sec": 0.964725818412429, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0365639448165893, "data/tokens_consumed": 20847788032, "data/tokens_consumed_B": 20.847788032, "train/loss_slope": -2.1023264454893993e-05} {"step": 9950, "timestamp": 1778205283.5389419, "grad/layer_0/attn": 0.0024531716480851173, "grad/layer_0/mlp": 0.0026462723035365343, "grad/layer_0/attn_mlp_ratio": 0.9270291466617899, "grad/layer_4/attn": 0.0016483435174450278, "grad/layer_4/mlp": 0.002702750265598297, "grad/layer_4/attn_mlp_ratio": 0.6098763461197944, "grad/layer_8/attn": 0.007052634842693806, "grad/layer_8/mlp": 0.004182273056358099, "grad/layer_8/attn_mlp_ratio": 1.6863161680322198, "grad/layer_12/attn": 0.004303189925849438, "grad/layer_12/mlp": 0.005551705602556467, "grad/layer_12/attn_mlp_ratio": 0.775111318286899, "grad/layer_16/attn": 0.0043449667282402515, "grad/layer_16/mlp": 0.004791957791894674, "grad/layer_16/attn_mlp_ratio": 0.9067205568708183, "grad/layer_20/attn": 0.004393107257783413, "grad/layer_20/mlp": 0.005806266330182552, "grad/layer_20/attn_mlp_ratio": 0.7566148247946127, "grad/layer_24/attn": 0.011960742995142937, "grad/layer_24/mlp": 0.0123742101714015, "grad/layer_24/attn_mlp_ratio": 0.96658637058931, "grad/layer_27/attn": 0.009988709352910519, "grad/layer_27/mlp": 0.01059428509324789, "grad/layer_27/attn_mlp_ratio": 0.9428393865852011} {"step": 9950, "timestamp": 1778205283.5551643, "train/loss": 2.361058306694031, "train/z_loss": 0.0017783482791855932, "train/perplexity": 10.60216586034684, "train/grad_norm": 0.1552734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024937.3653182266, "perf/iters_per_sec": 0.9655653788176664, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0356626510620117, "data/tokens_consumed": 20868759552, "data/tokens_consumed_B": 20.868759552, "train/loss_slope": -1.9302916691319704e-05} {"step": 9960, "timestamp": 1778205293.915144, "train/loss": 2.2839073657989504, "train/z_loss": 0.001808502699714154, "train/perplexity": 9.814956206259914, "train/grad_norm": 0.2392578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025551.5285689668, "perf/iters_per_sec": 0.9658582346768221, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0353486299514771, "data/tokens_consumed": 20889731072, "data/tokens_consumed_B": 20.889731072, "train/loss_slope": -2.297812983183541e-05} {"step": 9970, "timestamp": 1778205304.2720807, "train/loss": 2.3032306432724, "train/z_loss": 0.001797970070037991, "train/perplexity": 10.006457586907796, "train/grad_norm": 0.1455078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026396.6524465624, "perf/iters_per_sec": 0.9662612211449444, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349168300628662, "data/tokens_consumed": 20910702592, "data/tokens_consumed_B": 20.910702592, "train/loss_slope": -2.5624206836539047e-05} {"step": 9975, "timestamp": 1778205310.0655184, "eos/sharpness": 19.320750236511227, "eos/L0_probe": 2.150123357772827, "eos/L_plus": 2.278296709060669, "eos/L_minus": 2.2151575088500977, "eos/grad_norm": 0.14355537295341492, "eos/embed_grad_frac": 0.12592874467372894, "eos/time_s": 0.617445707321167} {"step": 9975, "timestamp": 1778205311.4499643, "geo/rankme_last": 439.8309631347656, "geo/layer_0/stable_rank_q_proj": 15.134322166442871, "geo/layer_0/stable_rank_k_proj": 12.629603385925293, "geo/layer_0/stable_rank_o_proj": 55.688907623291016, "geo/layer_0/stable_rank_gate_proj": 166.6365203857422, "geo/layer_0/stable_rank_down_proj": 47.64870071411133, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.03899787366390228, "geo/layer_0/attn_entropy_mean": 6.390310764312744, "geo/layer_0/attn_entropy_std": 0.26448854804039, "geo/layer_7/stable_rank_q_proj": 43.6121711730957, "geo/layer_7/stable_rank_k_proj": 44.268798828125, "geo/layer_7/stable_rank_o_proj": 106.8751220703125, "geo/layer_7/stable_rank_gate_proj": 126.0212173461914, "geo/layer_7/stable_rank_down_proj": 169.21636962890625, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5868059992790222, "geo/layer_7/attn_entropy_mean": 4.6781463623046875, "geo/layer_7/attn_entropy_std": 0.906231164932251, "geo/layer_14/stable_rank_q_proj": 64.5762710571289, "geo/layer_14/stable_rank_k_proj": 42.220359802246094, "geo/layer_14/stable_rank_o_proj": 48.69704055786133, "geo/layer_14/stable_rank_gate_proj": 112.31757354736328, "geo/layer_14/stable_rank_down_proj": 139.8846893310547, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.40525826811790466, "geo/layer_14/attn_entropy_mean": 5.715850353240967, "geo/layer_14/attn_entropy_std": 0.601982057094574, "geo/layer_21/stable_rank_q_proj": 52.192893981933594, "geo/layer_21/stable_rank_k_proj": 32.38221740722656, "geo/layer_21/stable_rank_o_proj": 92.035400390625, "geo/layer_21/stable_rank_gate_proj": 109.04547882080078, "geo/layer_21/stable_rank_down_proj": 72.87174987792969, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.16145487129688263, "geo/layer_21/attn_entropy_mean": 5.794254302978516, "geo/layer_21/attn_entropy_std": 0.3068973422050476, "geo/layer_27/stable_rank_q_proj": 42.766143798828125, "geo/layer_27/stable_rank_k_proj": 32.94950866699219, "geo/layer_27/stable_rank_o_proj": 113.03736114501953, "geo/layer_27/stable_rank_gate_proj": 94.34941864013672, "geo/layer_27/stable_rank_down_proj": 152.8801727294922, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.06060677021741867, "geo/layer_27/attn_entropy_mean": 4.512343406677246, "geo/layer_27/attn_entropy_std": 0.5002667903900146, "attnres/final_alpha/block_0": 0.24813184142112732, "attnres/block_norm/0": 1.404384732246399, "attnres/final_alpha/block_1": 0.00978398509323597, "attnres/block_norm/1": 17139.77734375, "attnres/final_alpha/block_2": 0.020117375999689102, "attnres/block_norm/2": 12651.966796875, "attnres/final_alpha/block_3": 0.020798301324248314, "attnres/block_norm/3": 13915.919921875, "attnres/final_alpha/block_4": 0.03025413677096367, "attnres/block_norm/4": 5510.75830078125, "attnres/final_alpha/block_5": 0.5013791918754578, "attnres/block_norm/5": 3867.12158203125, "attnres/final_alpha/block_6": 0.16953513026237488, "attnres/block_norm/6": 10104.9365234375, "geo/tier1_time_s": 1.363173246383667, "geo/step": 9975.0, "geo/rankme_slope": 0.002528028437937675} {"step": 9980, "timestamp": 1778205316.643459, "train/loss": 2.3101393699646, "train/z_loss": 0.0018125648377463222, "train/perplexity": 10.075828825068411, "train/grad_norm": 0.1376953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1696338.4590281895, "perf/iters_per_sec": 0.8088772101536701, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2362815856933593, "data/tokens_consumed": 20931674112, "data/tokens_consumed_B": 20.931674112, "train/loss_slope": -2.88202905526148e-05} {"step": 9990, "timestamp": 1778205327.6160421, "train/loss": 2.3537541151046755, "train/z_loss": 0.00179100027307868, "train/perplexity": 10.525007741521671, "train/grad_norm": 0.15625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1912327.2498681322, "perf/iters_per_sec": 0.9118686913815175, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0966491222381591, "data/tokens_consumed": 20952645632, "data/tokens_consumed_B": 20.952645632, "train/loss_slope": -2.4750367235286114e-05} {"step": 10000, "timestamp": 1778205337.9648335, "grad/layer_0/attn": 0.002523440169170499, "grad/layer_0/mlp": 0.002652581315487623, "grad/layer_0/attn_mlp_ratio": 0.9513148793235496, "grad/layer_4/attn": 0.001637284061871469, "grad/layer_4/mlp": 0.002806892851367593, "grad/layer_4/attn_mlp_ratio": 0.5833083378095131, "grad/layer_8/attn": 0.015209984965622425, "grad/layer_8/mlp": 0.0040419758297502995, "grad/layer_8/attn_mlp_ratio": 3.76300730879465, "grad/layer_12/attn": 0.0037368666380643845, "grad/layer_12/mlp": 0.005505216773599386, "grad/layer_12/attn_mlp_ratio": 0.6787864536245182, "grad/layer_16/attn": 0.004523031413555145, "grad/layer_16/mlp": 0.0045377761125564575, "grad/layer_16/attn_mlp_ratio": 0.9967506553186753, "grad/layer_20/attn": 0.004241283982992172, "grad/layer_20/mlp": 0.007296502124518156, "grad/layer_20/attn_mlp_ratio": 0.5812763228852791, "grad/layer_24/attn": 0.015070085413753986, "grad/layer_24/mlp": 0.013270840980112553, "grad/layer_24/attn_mlp_ratio": 1.135578771743243, "grad/layer_27/attn": 0.008501245640218258, "grad/layer_27/mlp": 0.012875346466898918, "grad/layer_27/attn_mlp_ratio": 0.6602731504000066} {"step": 10000, "timestamp": 1778205337.981065, "train/loss": 2.3688421726226805, "train/z_loss": 0.0017777831293642522, "train/perplexity": 10.685013717962827, "train/grad_norm": 0.18359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024518.6087469282, "perf/iters_per_sec": 0.9653657001242295, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0358768701553345, "data/tokens_consumed": 20973617152, "data/tokens_consumed_B": 20.973617152, "train/loss_slope": -1.7755240375416907e-05} {"step": 10000, "timestamp": 1778205347.2041523, "geo/ww_alpha_mean": 8.170943146229419, "geo/ww_alpha_std": 4.402418924069644, "geo/ww_alpha_min": 1.3768100771884835, "geo/ww_alpha_max": 30.894931947684416, "geo/ww_alpha_healthy_frac": 0.1319796954314721, "geo/ww_alpha_by_type/q_proj": 4.390799117794324, "geo/ww_alpha_by_type/k_proj": 4.738341196949171, "geo/ww_alpha_by_type/v_proj": 7.6292952498156135, "geo/ww_alpha_by_type/o_proj": 9.366524479203807, "geo/ww_alpha_by_type/gate_proj": 9.754327432026376, "geo/ww_alpha_by_type/up_proj": 12.124821190815796, "geo/ww_alpha_by_type/down_proj": 9.340220650808764, "geo/twonn_id/layer_0": 0.6538161635398865, "geo/twonn_id/layer_7": 2.887840986251831, "geo/twonn_id/layer_14": 3.8719751834869385, "geo/twonn_id/layer_21": 6.883591651916504, "geo/twonn_id/layer_27": 5.435900688171387, "geo/tier2_time_s": 9.214973211288452} {"step": 10000, "timestamp": 1778205347.8110495, "eoc/jacobian_sigma/layer_0/attn": 576.5986328125, "eoc/jacobian_sigma/layer_0/mlp": 3047.169189453125, "eoc/jacobian_sigma/layer_0": 3047.169189453125, "eoc/jacobian_sigma/layer_7/attn": 1.1627893447875977, "eoc/jacobian_sigma/layer_7/mlp": 1.5715184211730957, "eoc/jacobian_sigma/layer_7": 1.5715184211730957, "eoc/jacobian_sigma/layer_14/attn": 1.3717644214630127, "eoc/jacobian_sigma/layer_14/mlp": 6.86469841003418, "eoc/jacobian_sigma/layer_14": 6.86469841003418, "eoc/jacobian_sigma/layer_21/attn": 1.0845426321029663, "eoc/jacobian_sigma/layer_21/mlp": 3.142688035964966, "eoc/jacobian_sigma/layer_21": 3.142688035964966, "eoc/jacobian_sigma/layer_27/attn": 2.201244831085205, "eoc/jacobian_sigma/layer_27/mlp": 15.792438507080078, "eoc/jacobian_sigma/layer_27": 15.792438507080078, "eoc/layer0_sigma": 3047.169189453125, "eoc/sigma_max": 15.792438507080078, "eoc/sigma_min": 1.5715184211730957, "eoc/sigma_mean": 6.84283584356308, "eoc/time_s": 0.6007874011993408} {"step": 10010, "timestamp": 1778205358.190232, "train/loss": 2.2898606777191164, "train/z_loss": 0.0018102951580658555, "train/perplexity": 9.873561978167125, "train/grad_norm": 0.2314453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1038041.8393005258, "perf/iters_per_sec": 0.4949769207480077, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 2.0202962160110474, "data/tokens_consumed": 20994588672, "data/tokens_consumed_B": 20.994588672, "train/loss_slope": -2.061896301267237e-05} {"step": 10020, "timestamp": 1778205368.5782511, "train/loss": 2.3503262996673584, "train/z_loss": 0.0017869144328869879, "train/perplexity": 10.488991720908562, "train/grad_norm": 0.15625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019845.5817764276, "perf/iters_per_sec": 0.9631374272234094, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0382734298706056, "data/tokens_consumed": 21015560192, "data/tokens_consumed_B": 21.015560192, "train/loss_slope": -1.9030371522030706e-05} {"step": 10030, "timestamp": 1778205378.9457939, "train/loss": 2.3024566173553467, "train/z_loss": 0.001797412778250873, "train/perplexity": 9.998715326139424, "train/grad_norm": 0.1904296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023984.7531334213, "perf/iters_per_sec": 0.9651111379305941, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0361500978469849, "data/tokens_consumed": 21036531712, "data/tokens_consumed_B": 21.036531712, "train/loss_slope": -2.1801483441572283e-05} {"step": 10040, "timestamp": 1778205389.3086271, "train/loss": 2.336187791824341, "train/z_loss": 0.0017889707698486746, "train/perplexity": 10.341736466965628, "train/grad_norm": 0.16015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024748.2625774243, "perf/iters_per_sec": 0.9654752076041337, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0357593774795533, "data/tokens_consumed": 21057503232, "data/tokens_consumed_B": 21.057503232, "train/loss_slope": -2.165423758638678e-05} {"step": 10050, "timestamp": 1778205399.652927, "grad/layer_0/attn": 0.0026568316388875246, "grad/layer_0/mlp": 0.0027987665962427855, "grad/layer_0/attn_mlp_ratio": 0.9492865705648841, "grad/layer_4/attn": 0.001570473425090313, "grad/layer_4/mlp": 0.002741230884566903, "grad/layer_4/attn_mlp_ratio": 0.5729080963742418, "grad/layer_8/attn": 0.0060989647172391415, "grad/layer_8/mlp": 0.004189938772469759, "grad/layer_8/attn_mlp_ratio": 1.4556214071075715, "grad/layer_12/attn": 0.00513458251953125, "grad/layer_12/mlp": 0.0061071887612342834, "grad/layer_12/attn_mlp_ratio": 0.8407440208904124, "grad/layer_16/attn": 0.00620898324996233, "grad/layer_16/mlp": 0.004608374089002609, "grad/layer_16/attn_mlp_ratio": 1.3473261925603617, "grad/layer_20/attn": 0.004563210066407919, "grad/layer_20/mlp": 0.00608663447201252, "grad/layer_20/attn_mlp_ratio": 0.749709878656164, "grad/layer_24/attn": 0.007766436319798231, "grad/layer_24/mlp": 0.009360418654978275, "grad/layer_24/attn_mlp_ratio": 0.8297103498353324, "grad/layer_27/attn": 0.004323352128267288, "grad/layer_27/mlp": 0.00824545044451952, "grad/layer_27/attn_mlp_ratio": 0.5243318245527382} {"step": 10050, "timestamp": 1778205400.280811, "eos/sharpness": 9.835720062255858, "eos/L0_probe": 2.148791790008545, "eos/L_plus": 2.2146599292755127, "eos/L_minus": 2.1812808513641357, "eos/grad_norm": 0.11336540430784225, "eos/embed_grad_frac": 0.21941682696342468, "eos/time_s": 0.6251733303070068} {"step": 10050, "timestamp": 1778205400.3022566, "train/loss": 2.3528871059417726, "train/z_loss": 0.0017830171971581876, "train/perplexity": 10.515886418077029, "train/grad_norm": 0.11328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1908502.3035131844, "perf/iters_per_sec": 0.9100448148313448, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.098846983909607, "data/tokens_consumed": 21078474752, "data/tokens_consumed_B": 21.078474752, "train/loss_slope": -1.9585235749069256e-05} {"step": 10050, "timestamp": 1778205401.6651602, "geo/rankme_last": 440.08587646484375, "geo/layer_0/stable_rank_q_proj": 15.17603874206543, "geo/layer_0/stable_rank_k_proj": 12.677380561828613, "geo/layer_0/stable_rank_o_proj": 55.64158248901367, "geo/layer_0/stable_rank_gate_proj": 166.43716430664062, "geo/layer_0/stable_rank_down_proj": 47.737213134765625, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.03514590486884117, "geo/layer_0/attn_entropy_mean": 6.3910675048828125, "geo/layer_0/attn_entropy_std": 0.2690483331680298, "geo/layer_7/stable_rank_q_proj": 43.76463317871094, "geo/layer_7/stable_rank_k_proj": 43.84767532348633, "geo/layer_7/stable_rank_o_proj": 106.73499298095703, "geo/layer_7/stable_rank_gate_proj": 126.21522521972656, "geo/layer_7/stable_rank_down_proj": 168.39984130859375, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5854824185371399, "geo/layer_7/attn_entropy_mean": 4.672956466674805, "geo/layer_7/attn_entropy_std": 0.8992172479629517, "geo/layer_14/stable_rank_q_proj": 64.19293212890625, "geo/layer_14/stable_rank_k_proj": 42.061683654785156, "geo/layer_14/stable_rank_o_proj": 48.51797866821289, "geo/layer_14/stable_rank_gate_proj": 112.11563110351562, "geo/layer_14/stable_rank_down_proj": 138.875244140625, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3999364376068115, "geo/layer_14/attn_entropy_mean": 5.698853015899658, "geo/layer_14/attn_entropy_std": 0.5849946141242981, "geo/layer_21/stable_rank_q_proj": 52.26237869262695, "geo/layer_21/stable_rank_k_proj": 32.285369873046875, "geo/layer_21/stable_rank_o_proj": 91.86038208007812, "geo/layer_21/stable_rank_gate_proj": 108.85828399658203, "geo/layer_21/stable_rank_down_proj": 72.73420715332031, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.16251614689826965, "geo/layer_21/attn_entropy_mean": 5.803009986877441, "geo/layer_21/attn_entropy_std": 0.3024279475212097, "geo/layer_27/stable_rank_q_proj": 42.895545959472656, "geo/layer_27/stable_rank_k_proj": 33.05852508544922, "geo/layer_27/stable_rank_o_proj": 113.75918579101562, "geo/layer_27/stable_rank_gate_proj": 94.6434555053711, "geo/layer_27/stable_rank_down_proj": 152.5725555419922, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.06027558445930481, "geo/layer_27/attn_entropy_mean": 4.502047538757324, "geo/layer_27/attn_entropy_std": 0.5168250203132629, "attnres/final_alpha/block_0": 0.2494649887084961, "attnres/block_norm/0": 1.4069714546203613, "attnres/final_alpha/block_1": 0.009885480627417564, "attnres/block_norm/1": 17270.4296875, "attnres/final_alpha/block_2": 0.0201849564909935, "attnres/block_norm/2": 12737.62890625, "attnres/final_alpha/block_3": 0.020777598023414612, "attnres/block_norm/3": 14108.21484375, "attnres/final_alpha/block_4": 0.03065187856554985, "attnres/block_norm/4": 5560.4853515625, "attnres/final_alpha/block_5": 0.49785614013671875, "attnres/block_norm/5": 3896.10595703125, "attnres/final_alpha/block_6": 0.17117898166179657, "attnres/block_norm/6": 10229.4150390625, "geo/tier1_time_s": 1.3585736751556396, "geo/step": 10050.0, "geo/rankme_slope": 0.0024147451363357845} {"step": 10060, "timestamp": 1778205412.0229876, "train/loss": 2.359330916404724, "train/z_loss": 0.0017877600854262709, "train/perplexity": 10.583867590670712, "train/grad_norm": 0.240234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1789906.260899724, "perf/iters_per_sec": 0.8534938148974056, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1716546535491943, "data/tokens_consumed": 21099446272, "data/tokens_consumed_B": 21.099446272, "train/loss_slope": -1.7774572795910248e-05} {"step": 10070, "timestamp": 1778205422.3793118, "train/loss": 2.2681103944778442, "train/z_loss": 0.0018238721298985184, "train/perplexity": 9.661127834641968, "train/grad_norm": 0.1787109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026136.7079611658, "perf/iters_per_sec": 0.9661372699552373, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035049605369568, "data/tokens_consumed": 21120417792, "data/tokens_consumed_B": 21.120417792, "train/loss_slope": -2.0111581594637102e-05} {"step": 10080, "timestamp": 1778205432.7440536, "train/loss": 2.2786131143569945, "train/z_loss": 0.0017941525322385133, "train/perplexity": 9.76313066997416, "train/grad_norm": 0.302734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024616.3730228292, "perf/iters_per_sec": 0.9654123177637239, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035826849937439, "data/tokens_consumed": 21141389312, "data/tokens_consumed_B": 21.141389312, "train/loss_slope": -2.1440862858220304e-05} {"step": 10090, "timestamp": 1778205443.1028368, "train/loss": 2.3702567338943483, "train/z_loss": 0.0017809994635172189, "train/perplexity": 10.700139019866521, "train/grad_norm": 0.1953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025544.998415113, "perf/iters_per_sec": 0.9658551208568158, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0353519678115846, "data/tokens_consumed": 21162360832, "data/tokens_consumed_B": 21.162360832, "train/loss_slope": -1.822056773186012e-05} {"step": 10100, "timestamp": 1778205453.4552085, "grad/layer_0/attn": 0.0026843762025237083, "grad/layer_0/mlp": 0.0026437975466251373, "grad/layer_0/attn_mlp_ratio": 1.0153485861334246, "grad/layer_4/attn": 0.001759710256010294, "grad/layer_4/mlp": 0.0027803019620478153, "grad/layer_4/attn_mlp_ratio": 0.6329205304815656, "grad/layer_8/attn": 0.005389252677559853, "grad/layer_8/mlp": 0.004227053374052048, "grad/layer_8/attn_mlp_ratio": 1.274943104136727, "grad/layer_12/attn": 0.004564865492284298, "grad/layer_12/mlp": 0.005646028555929661, "grad/layer_12/attn_mlp_ratio": 0.8085090902771302, "grad/layer_16/attn": 0.005460516549646854, "grad/layer_16/mlp": 0.0043981666676700115, "grad/layer_16/attn_mlp_ratio": 1.2415437699602367, "grad/layer_20/attn": 0.00524703785777092, "grad/layer_20/mlp": 0.006102882791310549, "grad/layer_20/attn_mlp_ratio": 0.8597638118276522, "grad/layer_24/attn": 0.01807871274650097, "grad/layer_24/mlp": 0.014591685496270657, "grad/layer_24/attn_mlp_ratio": 1.23897356664, "grad/layer_27/attn": 0.013673070818185806, "grad/layer_27/mlp": 0.012501940131187439, "grad/layer_27/attn_mlp_ratio": 1.09367590672661} {"step": 10100, "timestamp": 1778205453.4730074, "train/loss": 2.3248353958129884, "train/z_loss": 0.0017884759465232492, "train/perplexity": 10.224996869965636, "train/grad_norm": 0.189453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023453.8787500644, "perf/iters_per_sec": 0.9648579972982714, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0364219427108765, "data/tokens_consumed": 21183332352, "data/tokens_consumed_B": 21.183332352, "train/loss_slope": -1.9837651980949704e-05} {"step": 10110, "timestamp": 1778205463.832073, "train/loss": 2.3401132345199587, "train/z_loss": 0.00178109792759642, "train/perplexity": 10.382412143627516, "train/grad_norm": 0.1240234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025467.1995354462, "perf/iters_per_sec": 0.9658180234601241, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0353917360305787, "data/tokens_consumed": 21204303872, "data/tokens_consumed_B": 21.204303872, "train/loss_slope": -1.8747291069934877e-05} {"step": 10120, "timestamp": 1778205474.189919, "train/loss": 2.335242223739624, "train/z_loss": 0.0017893810756504536, "train/perplexity": 10.331962272833184, "train/grad_norm": 0.1328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025907.3923229286, "perf/iters_per_sec": 0.9660279237379688, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351667642593383, "data/tokens_consumed": 21225275392, "data/tokens_consumed_B": 21.225275392, "train/loss_slope": -1.968900887700763e-05} {"step": 10125, "timestamp": 1778205479.9827435, "eos/sharpness": 21.91383838653564, "eos/L0_probe": 2.149390459060669, "eos/L_plus": 2.2337350845336914, "eos/L_minus": 2.284184217453003, "eos/grad_norm": 0.11673004925251007, "eos/embed_grad_frac": 0.19343659281730652, "eos/time_s": 0.6219589710235596} {"step": 10125, "timestamp": 1778205481.362122, "geo/rankme_last": 440.3602294921875, "geo/layer_0/stable_rank_q_proj": 15.158390045166016, "geo/layer_0/stable_rank_k_proj": 12.664407730102539, "geo/layer_0/stable_rank_o_proj": 55.69871139526367, "geo/layer_0/stable_rank_gate_proj": 166.78671264648438, "geo/layer_0/stable_rank_down_proj": 47.78562927246094, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.04025137796998024, "geo/layer_0/attn_entropy_mean": 6.389104843139648, "geo/layer_0/attn_entropy_std": 0.2638865113258362, "geo/layer_7/stable_rank_q_proj": 43.801475524902344, "geo/layer_7/stable_rank_k_proj": 43.95221710205078, "geo/layer_7/stable_rank_o_proj": 106.36787414550781, "geo/layer_7/stable_rank_gate_proj": 125.75106811523438, "geo/layer_7/stable_rank_down_proj": 168.23605346679688, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5855616331100464, "geo/layer_7/attn_entropy_mean": 4.669294357299805, "geo/layer_7/attn_entropy_std": 0.9452714920043945, "geo/layer_14/stable_rank_q_proj": 63.976627349853516, "geo/layer_14/stable_rank_k_proj": 41.934139251708984, "geo/layer_14/stable_rank_o_proj": 48.565879821777344, "geo/layer_14/stable_rank_gate_proj": 111.6824722290039, "geo/layer_14/stable_rank_down_proj": 138.8759765625, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38793718814849854, "geo/layer_14/attn_entropy_mean": 5.726232528686523, "geo/layer_14/attn_entropy_std": 0.569486677646637, "geo/layer_21/stable_rank_q_proj": 52.27391052246094, "geo/layer_21/stable_rank_k_proj": 32.23225402832031, "geo/layer_21/stable_rank_o_proj": 91.8554458618164, "geo/layer_21/stable_rank_gate_proj": 108.96721649169922, "geo/layer_21/stable_rank_down_proj": 72.77984619140625, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.16243301331996918, "geo/layer_21/attn_entropy_mean": 5.78175163269043, "geo/layer_21/attn_entropy_std": 0.3054386079311371, "geo/layer_27/stable_rank_q_proj": 42.849178314208984, "geo/layer_27/stable_rank_k_proj": 32.97990798950195, "geo/layer_27/stable_rank_o_proj": 113.71590423583984, "geo/layer_27/stable_rank_gate_proj": 94.60591888427734, "geo/layer_27/stable_rank_down_proj": 152.35597229003906, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.05985276401042938, "geo/layer_27/attn_entropy_mean": 4.516027927398682, "geo/layer_27/attn_entropy_std": 0.4931815564632416, "attnres/final_alpha/block_0": 0.24777555465698242, "attnres/block_norm/0": 1.40970778465271, "attnres/final_alpha/block_1": 0.009591775014996529, "attnres/block_norm/1": 17424.57421875, "attnres/final_alpha/block_2": 0.01983984373509884, "attnres/block_norm/2": 12755.1826171875, "attnres/final_alpha/block_3": 0.020770877599716187, "attnres/block_norm/3": 14183.5859375, "attnres/final_alpha/block_4": 0.029779430478811264, "attnres/block_norm/4": 5606.9345703125, "attnres/final_alpha/block_5": 0.500744640827179, "attnres/block_norm/5": 3906.5341796875, "attnres/final_alpha/block_6": 0.1714978814125061, "attnres/block_norm/6": 10266.3095703125, "geo/tier1_time_s": 1.3579974174499512, "geo/step": 10125.0, "geo/rankme_slope": 0.002333247048819528} {"step": 10130, "timestamp": 1778205486.54531, "train/loss": 2.302513074874878, "train/z_loss": 0.0017957941396161913, "train/perplexity": 9.99927984474075, "train/grad_norm": 0.28515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1698099.0935764136, "perf/iters_per_sec": 0.8097167461282795, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2349997758865356, "data/tokens_consumed": 21246246912, "data/tokens_consumed_B": 21.246246912, "train/loss_slope": -2.0211906716375e-05} {"step": 10140, "timestamp": 1778205496.90458, "train/loss": 2.374867343902588, "train/z_loss": 0.0017778867390006781, "train/perplexity": 10.749587093215911, "train/grad_norm": 0.244140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025372.431037155, "perf/iters_per_sec": 0.9657728343187117, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354401826858521, "data/tokens_consumed": 21267218432, "data/tokens_consumed_B": 21.267218432, "train/loss_slope": -1.747232922697935e-05} {"step": 10150, "timestamp": 1778205507.251728, "grad/layer_0/attn": 0.0031782444566488266, "grad/layer_0/mlp": 0.0031898037996143103, "grad/layer_0/attn_mlp_ratio": 0.9963761273955175, "grad/layer_4/attn": 0.001540540368296206, "grad/layer_4/mlp": 0.002818911802023649, "grad/layer_4/attn_mlp_ratio": 0.5465017786438371, "grad/layer_8/attn": 0.004730185493826866, "grad/layer_8/mlp": 0.004237903282046318, "grad/layer_8/attn_mlp_ratio": 1.1161617119130367, "grad/layer_12/attn": 0.0042319390922784805, "grad/layer_12/mlp": 0.006128164939582348, "grad/layer_12/attn_mlp_ratio": 0.6905719844266628, "grad/layer_16/attn": 0.006692622322589159, "grad/layer_16/mlp": 0.004963247571140528, "grad/layer_16/attn_mlp_ratio": 1.3484360979010404, "grad/layer_20/attn": 0.010365169495344162, "grad/layer_20/mlp": 0.007844771258533001, "grad/layer_20/attn_mlp_ratio": 1.3212838235329378, "grad/layer_24/attn": 0.026435142382979393, "grad/layer_24/mlp": 0.017184190452098846, "grad/layer_24/attn_mlp_ratio": 1.5383408548010222, "grad/layer_27/attn": 0.010137238539755344, "grad/layer_27/mlp": 0.015384841710329056, "grad/layer_27/attn_mlp_ratio": 0.6589108074513589} {"step": 10150, "timestamp": 1778205507.26779, "train/loss": 2.286045694351196, "train/z_loss": 0.0018039131653495132, "train/perplexity": 9.835966262550812, "train/grad_norm": 0.267578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024897.7426422702, "perf/iters_per_sec": 0.9655464852534629, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0356829166412354, "data/tokens_consumed": 21288189952, "data/tokens_consumed_B": 21.288189952, "train/loss_slope": -1.9468280212535068e-05} {"step": 10160, "timestamp": 1778205517.632165, "train/loss": 2.398461604118347, "train/z_loss": 0.0017739702481776476, "train/perplexity": 11.006231408874198, "train/grad_norm": 0.115234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024739.0810166514, "perf/iters_per_sec": 0.9654708294947869, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0357640743255616, "data/tokens_consumed": 21309161472, "data/tokens_consumed_B": 21.309161472, "train/loss_slope": -1.538674504485816e-05} {"step": 10170, "timestamp": 1778205527.9926822, "train/loss": 2.353512001037598, "train/z_loss": 0.0017849322874099016, "train/perplexity": 10.522459797550333, "train/grad_norm": 0.1630859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025256.6408208953, "perf/iters_per_sec": 0.9657176212410428, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035499382019043, "data/tokens_consumed": 21330132992, "data/tokens_consumed_B": 21.330132992, "train/loss_slope": -1.2804588440335092e-05} {"step": 10180, "timestamp": 1778205538.3536155, "train/loss": 2.330726885795593, "train/z_loss": 0.001790321385487914, "train/perplexity": 10.285415138651569, "train/grad_norm": 0.19140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025195.4167908705, "perf/iters_per_sec": 0.965688427348552, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035530686378479, "data/tokens_consumed": 21351104512, "data/tokens_consumed_B": 21.351104512, "train/loss_slope": -1.1145535568342573e-05} {"step": 10190, "timestamp": 1778205548.7473059, "train/loss": 2.3032997131347654, "train/z_loss": 0.0017804320435971023, "train/perplexity": 10.007148755425273, "train/grad_norm": 0.130859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2018676.6053709358, "perf/iters_per_sec": 0.9625800158362082, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0388746738433838, "data/tokens_consumed": 21372076032, "data/tokens_consumed_B": 21.372076032, "train/loss_slope": -1.2866434877855135e-05} {"step": 10200, "timestamp": 1778205559.1021261, "grad/layer_0/attn": 0.0033504334278404713, "grad/layer_0/mlp": 0.0030807796865701675, "grad/layer_0/attn_mlp_ratio": 1.087527723482797, "grad/layer_4/attn": 0.0025027557276189327, "grad/layer_4/mlp": 0.0027560745365917683, "grad/layer_4/attn_mlp_ratio": 0.9080870649838075, "grad/layer_8/attn": 0.005728320684283972, "grad/layer_8/mlp": 0.004322287160903215, "grad/layer_8/attn_mlp_ratio": 1.3252984678040447, "grad/layer_12/attn": 0.0048371595330536366, "grad/layer_12/mlp": 0.005731647834181786, "grad/layer_12/attn_mlp_ratio": 0.8439387046448379, "grad/layer_16/attn": 0.006053300108760595, "grad/layer_16/mlp": 0.004584670532494783, "grad/layer_16/attn_mlp_ratio": 1.320334783889731, "grad/layer_20/attn": 0.004375656135380268, "grad/layer_20/mlp": 0.0065819695591926575, "grad/layer_20/attn_mlp_ratio": 0.6647943339071826, "grad/layer_24/attn": 0.01561031024903059, "grad/layer_24/mlp": 0.011281613260507584, "grad/layer_24/attn_mlp_ratio": 1.3836948448947952, "grad/layer_27/attn": 0.009612941183149815, "grad/layer_27/mlp": 0.009685590863227844, "grad/layer_27/attn_mlp_ratio": 0.992499189739289} {"step": 10200, "timestamp": 1778205559.7220113, "eos/sharpness": 46.25177383422851, "eos/L0_probe": 2.150383710861206, "eos/L_plus": 2.3268978595733643, "eos/L_minus": 2.436387300491333, "eos/grad_norm": 0.17267458140850067, "eos/embed_grad_frac": 0.1039346307516098, "eos/time_s": 0.6170909404754639} {"step": 10200, "timestamp": 1778205559.7431495, "train/loss": 2.31633403301239, "train/z_loss": 0.0017906028311699628, "train/perplexity": 10.138438913551978, "train/grad_norm": 0.1728515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1908397.0888460968, "perf/iters_per_sec": 0.9099946445684894, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0989075660705567, "data/tokens_consumed": 21393047552, "data/tokens_consumed_B": 21.393047552, "train/loss_slope": -1.2114277781576051e-05} {"step": 10200, "timestamp": 1778205561.1081424, "geo/rankme_last": 441.0997619628906, "geo/layer_0/stable_rank_q_proj": 15.104571342468262, "geo/layer_0/stable_rank_k_proj": 12.640510559082031, "geo/layer_0/stable_rank_o_proj": 55.61042022705078, "geo/layer_0/stable_rank_gate_proj": 166.44293212890625, "geo/layer_0/stable_rank_down_proj": 47.872318267822266, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.04268305003643036, "geo/layer_0/attn_entropy_mean": 6.389218330383301, "geo/layer_0/attn_entropy_std": 0.2621811330318451, "geo/layer_7/stable_rank_q_proj": 43.67660903930664, "geo/layer_7/stable_rank_k_proj": 43.936790466308594, "geo/layer_7/stable_rank_o_proj": 106.28752136230469, "geo/layer_7/stable_rank_gate_proj": 125.74996185302734, "geo/layer_7/stable_rank_down_proj": 168.4786834716797, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.596951425075531, "geo/layer_7/attn_entropy_mean": 4.650653839111328, "geo/layer_7/attn_entropy_std": 0.9279953837394714, "geo/layer_14/stable_rank_q_proj": 63.54243087768555, "geo/layer_14/stable_rank_k_proj": 41.71487045288086, "geo/layer_14/stable_rank_o_proj": 48.79494094848633, "geo/layer_14/stable_rank_gate_proj": 111.38407897949219, "geo/layer_14/stable_rank_down_proj": 139.09677124023438, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37837961316108704, "geo/layer_14/attn_entropy_mean": 5.701376914978027, "geo/layer_14/attn_entropy_std": 0.6006280183792114, "geo/layer_21/stable_rank_q_proj": 52.026397705078125, "geo/layer_21/stable_rank_k_proj": 32.24235916137695, "geo/layer_21/stable_rank_o_proj": 92.08740997314453, "geo/layer_21/stable_rank_gate_proj": 108.37279510498047, "geo/layer_21/stable_rank_down_proj": 72.76638793945312, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15822073817253113, "geo/layer_21/attn_entropy_mean": 5.788641452789307, "geo/layer_21/attn_entropy_std": 0.29937419295310974, "geo/layer_27/stable_rank_q_proj": 42.918033599853516, "geo/layer_27/stable_rank_k_proj": 32.837608337402344, "geo/layer_27/stable_rank_o_proj": 113.63693237304688, "geo/layer_27/stable_rank_gate_proj": 94.83463287353516, "geo/layer_27/stable_rank_down_proj": 152.41058349609375, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.058789175003767014, "geo/layer_27/attn_entropy_mean": 4.492868900299072, "geo/layer_27/attn_entropy_std": 0.5132275223731995, "attnres/final_alpha/block_0": 0.25047290325164795, "attnres/block_norm/0": 1.4122653007507324, "attnres/final_alpha/block_1": 0.00977712869644165, "attnres/block_norm/1": 17511.24609375, "attnres/final_alpha/block_2": 0.01990155503153801, "attnres/block_norm/2": 12871.498046875, "attnres/final_alpha/block_3": 0.02071276679635048, "attnres/block_norm/3": 14375.412109375, "attnres/final_alpha/block_4": 0.030620452016592026, "attnres/block_norm/4": 5638.9736328125, "attnres/final_alpha/block_5": 0.4966094195842743, "attnres/block_norm/5": 3918.16552734375, "attnres/final_alpha/block_6": 0.1719057708978653, "attnres/block_norm/6": 10305.8603515625, "geo/tier1_time_s": 1.3612122535705566, "geo/step": 10200.0, "geo/rankme_slope": 0.0022533570459433772} {"step": 10210, "timestamp": 1778205571.4685876, "train/loss": 2.307068967819214, "train/z_loss": 0.0017846945323981346, "train/perplexity": 10.044939424335665, "train/grad_norm": 0.271484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1789094.66039362, "perf/iters_per_sec": 0.8531068136184787, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1721861600875854, "data/tokens_consumed": 21414019072, "data/tokens_consumed_B": 21.414019072, "train/loss_slope": -1.403465299609184e-05} {"step": 10220, "timestamp": 1778205581.8296735, "train/loss": 2.273339867591858, "train/z_loss": 0.0017956021707504987, "train/perplexity": 9.711782776793331, "train/grad_norm": 0.177734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025328.6875467803, "perf/iters_per_sec": 0.9657519757970716, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354625463485718, "data/tokens_consumed": 21434990592, "data/tokens_consumed_B": 21.434990592, "train/loss_slope": -1.6948642639150988e-05} {"step": 10230, "timestamp": 1778205592.1859097, "train/loss": 2.3120002269744875, "train/z_loss": 0.0017816868145018815, "train/perplexity": 10.094595957827693, "train/grad_norm": 0.2890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026346.9829122887, "perf/iters_per_sec": 0.9662375368653721, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349421977996827, "data/tokens_consumed": 21455962112, "data/tokens_consumed_B": 21.455962112, "train/loss_slope": -1.6465542762085648e-05} {"step": 10240, "timestamp": 1778205602.548907, "train/loss": 2.319369649887085, "train/z_loss": 0.0017894123564474284, "train/perplexity": 10.16926208980846, "train/grad_norm": 0.2099609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025135.9682096615, "perf/iters_per_sec": 0.9656600800560291, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0355610847473145, "data/tokens_consumed": 21476933632, "data/tokens_consumed_B": 21.476933632, "train/loss_slope": -1.8174387576735248e-05} {"step": 10250, "timestamp": 1778205612.9012969, "grad/layer_0/attn": 0.002991169923916459, "grad/layer_0/mlp": 0.002931805793195963, "grad/layer_0/attn_mlp_ratio": 1.0202482813948448, "grad/layer_4/attn": 0.0014103134162724018, "grad/layer_4/mlp": 0.002692185575142503, "grad/layer_4/attn_mlp_ratio": 0.5238544389022317, "grad/layer_8/attn": 0.006336895748972893, "grad/layer_8/mlp": 0.004180974792689085, "grad/layer_8/attn_mlp_ratio": 1.5156502757415942, "grad/layer_12/attn": 0.004929016809910536, "grad/layer_12/mlp": 0.005558242090046406, "grad/layer_12/attn_mlp_ratio": 0.8867941772557744, "grad/layer_16/attn": 0.004493573680520058, "grad/layer_16/mlp": 0.004477947484701872, "grad/layer_16/attn_mlp_ratio": 1.0034895664861219, "grad/layer_20/attn": 0.007697992958128452, "grad/layer_20/mlp": 0.006462577730417252, "grad/layer_20/attn_mlp_ratio": 1.1911644486348008, "grad/layer_24/attn": 0.019056303426623344, "grad/layer_24/mlp": 0.01164021622389555, "grad/layer_24/attn_mlp_ratio": 1.6371090447437582, "grad/layer_27/attn": 0.008472256362438202, "grad/layer_27/mlp": 0.009750418365001678, "grad/layer_27/attn_mlp_ratio": 0.8689120772455733} {"step": 10250, "timestamp": 1778205612.9176073, "train/loss": 2.316627311706543, "train/z_loss": 0.0017786189448088408, "train/perplexity": 10.141412737735612, "train/grad_norm": 0.201171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023926.2138534766, "perf/iters_per_sec": 0.965083224226702, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036180067062378, "data/tokens_consumed": 21497905152, "data/tokens_consumed_B": 21.497905152, "train/loss_slope": -2.3469454899515204e-05} {"step": 10260, "timestamp": 1778205623.2934916, "train/loss": 2.299583005905151, "train/z_loss": 0.0017967360909096897, "train/perplexity": 9.970024146685342, "train/grad_norm": 0.177734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022427.8396728318, "perf/iters_per_sec": 0.9643687437404784, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036947751045227, "data/tokens_consumed": 21518876672, "data/tokens_consumed_B": 21.518876672, "train/loss_slope": -2.877326561029057e-05} {"step": 10270, "timestamp": 1778205633.6527069, "train/loss": 2.1990166664123536, "train/z_loss": 0.0018193281372077762, "train/perplexity": 9.016143262446898, "train/grad_norm": 0.11376953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025502.6000832021, "perf/iters_per_sec": 0.9658349037567149, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0353736400604248, "data/tokens_consumed": 21539848192, "data/tokens_consumed_B": 21.539848192, "train/loss_slope": -3.472755731421836e-05} {"step": 10275, "timestamp": 1778205639.4597676, "eos/sharpness": 42.59297847747802, "eos/L0_probe": 2.148003578186035, "eos/L_plus": 2.339261531829834, "eos/L_minus": 2.3826754093170166, "eos/grad_norm": 0.3043590784072876, "eos/embed_grad_frac": 0.0365595705807209, "eos/time_s": 0.6350526809692383} {"step": 10275, "timestamp": 1778205640.8401513, "geo/rankme_last": 439.46661376953125, "geo/layer_0/stable_rank_q_proj": 15.117472648620605, "geo/layer_0/stable_rank_k_proj": 12.646445274353027, "geo/layer_0/stable_rank_o_proj": 55.669620513916016, "geo/layer_0/stable_rank_gate_proj": 166.40687561035156, "geo/layer_0/stable_rank_down_proj": 47.81834411621094, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.035763174295425415, "geo/layer_0/attn_entropy_mean": 6.385040760040283, "geo/layer_0/attn_entropy_std": 0.2629578709602356, "geo/layer_7/stable_rank_q_proj": 43.71269226074219, "geo/layer_7/stable_rank_k_proj": 43.81499099731445, "geo/layer_7/stable_rank_o_proj": 106.85983276367188, "geo/layer_7/stable_rank_gate_proj": 126.04866027832031, "geo/layer_7/stable_rank_down_proj": 167.915283203125, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5908994078636169, "geo/layer_7/attn_entropy_mean": 4.678725242614746, "geo/layer_7/attn_entropy_std": 0.9463830590248108, "geo/layer_14/stable_rank_q_proj": 63.3502082824707, "geo/layer_14/stable_rank_k_proj": 41.74117660522461, "geo/layer_14/stable_rank_o_proj": 48.75123596191406, "geo/layer_14/stable_rank_gate_proj": 111.1241455078125, "geo/layer_14/stable_rank_down_proj": 138.9709014892578, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39217278361320496, "geo/layer_14/attn_entropy_mean": 5.696590900421143, "geo/layer_14/attn_entropy_std": 0.5834665894508362, "geo/layer_21/stable_rank_q_proj": 52.2725944519043, "geo/layer_21/stable_rank_k_proj": 32.211181640625, "geo/layer_21/stable_rank_o_proj": 92.02415466308594, "geo/layer_21/stable_rank_gate_proj": 108.01300048828125, "geo/layer_21/stable_rank_down_proj": 72.6677017211914, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1583556830883026, "geo/layer_21/attn_entropy_mean": 5.79811954498291, "geo/layer_21/attn_entropy_std": 0.3088359832763672, "geo/layer_27/stable_rank_q_proj": 42.8796501159668, "geo/layer_27/stable_rank_k_proj": 32.77471160888672, "geo/layer_27/stable_rank_o_proj": 114.12922668457031, "geo/layer_27/stable_rank_gate_proj": 94.72993469238281, "geo/layer_27/stable_rank_down_proj": 152.3627471923828, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.06772948801517487, "geo/layer_27/attn_entropy_mean": 4.491547584533691, "geo/layer_27/attn_entropy_std": 0.513099193572998, "attnres/final_alpha/block_0": 0.24895191192626953, "attnres/block_norm/0": 1.4149974584579468, "attnres/final_alpha/block_1": 0.009751670993864536, "attnres/block_norm/1": 17644.955078125, "attnres/final_alpha/block_2": 0.020035305991768837, "attnres/block_norm/2": 13061.080078125, "attnres/final_alpha/block_3": 0.020698562264442444, "attnres/block_norm/3": 14497.560546875, "attnres/final_alpha/block_4": 0.03031253069639206, "attnres/block_norm/4": 5679.0107421875, "attnres/final_alpha/block_5": 0.4989984631538391, "attnres/block_norm/5": 3964.90673828125, "attnres/final_alpha/block_6": 0.1712515503168106, "attnres/block_norm/6": 10473.9580078125, "geo/tier1_time_s": 1.3604297637939453, "geo/step": 10275.0, "geo/rankme_slope": 0.00215194173372474} {"step": 10280, "timestamp": 1778205646.0229194, "train/loss": 2.346065855026245, "train/z_loss": 0.0017729258863255382, "train/perplexity": 10.444399012175625, "train/grad_norm": 0.2197265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1696264.3975008172, "perf/iters_per_sec": 0.8088418948654257, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.236335563659668, "data/tokens_consumed": 21560819712, "data/tokens_consumed_B": 21.560819712, "train/loss_slope": -3.4267635185225654e-05} {"step": 10290, "timestamp": 1778205656.385155, "train/loss": 2.323049807548523, "train/z_loss": 0.0017825974500738084, "train/perplexity": 10.206755526161752, "train/grad_norm": 0.12109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025138.9988372151, "perf/iters_per_sec": 0.9656615251718593, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0355595350265503, "data/tokens_consumed": 21581791232, "data/tokens_consumed_B": 21.581791232, "train/loss_slope": -3.254756818760488e-05} {"step": 10300, "timestamp": 1778205666.7642522, "grad/layer_0/attn": 0.0029777728486806154, "grad/layer_0/mlp": 0.0027966683264821768, "grad/layer_0/attn_mlp_ratio": 1.06475720199203, "grad/layer_4/attn": 0.0014585773460566998, "grad/layer_4/mlp": 0.002658451208844781, "grad/layer_4/attn_mlp_ratio": 0.5486567841976081, "grad/layer_8/attn": 0.006441168487071991, "grad/layer_8/mlp": 0.004154185764491558, "grad/layer_8/attn_mlp_ratio": 1.5505248674905747, "grad/layer_12/attn": 0.00456711370497942, "grad/layer_12/mlp": 0.005660027265548706, "grad/layer_12/attn_mlp_ratio": 0.806906647267891, "grad/layer_16/attn": 0.005840777885168791, "grad/layer_16/mlp": 0.004874907899647951, "grad/layer_16/attn_mlp_ratio": 1.1981308951041931, "grad/layer_20/attn": 0.004690969828516245, "grad/layer_20/mlp": 0.006512506864964962, "grad/layer_20/attn_mlp_ratio": 0.7203016985243996, "grad/layer_24/attn": 0.011093356646597385, "grad/layer_24/mlp": 0.010677864775061607, "grad/layer_24/attn_mlp_ratio": 1.0389115030389802, "grad/layer_27/attn": 0.013152165338397026, "grad/layer_27/mlp": 0.009820971637964249, "grad/layer_27/attn_mlp_ratio": 1.3391918528342377} {"step": 10300, "timestamp": 1778205666.7812283, "train/loss": 2.333626699447632, "train/z_loss": 0.00178868017392233, "train/perplexity": 10.315284212330782, "train/grad_norm": 0.12353515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019016.6639435326, "perf/iters_per_sec": 0.9627421683995879, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.038699698448181, "data/tokens_consumed": 21602762752, "data/tokens_consumed_B": 21.602762752, "train/loss_slope": -3.104913158170674e-05} {"step": 10310, "timestamp": 1778205677.1432266, "train/loss": 2.294271206855774, "train/z_loss": 0.0017876439727842809, "train/perplexity": 9.917205786348855, "train/grad_norm": 0.16796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025091.4889656934, "perf/iters_per_sec": 0.9656388706997363, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0355838298797608, "data/tokens_consumed": 21623734272, "data/tokens_consumed_B": 21.623734272, "train/loss_slope": -3.052953506352984e-05} {"step": 10320, "timestamp": 1778205687.5121799, "train/loss": 2.2994637489318848, "train/z_loss": 0.0017840677290223538, "train/perplexity": 9.968835222677363, "train/grad_norm": 0.2275390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023711.971270404, "perf/iters_per_sec": 0.9649810654022236, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0362897634506225, "data/tokens_consumed": 21644705792, "data/tokens_consumed_B": 21.644705792, "train/loss_slope": -3.126505922706071e-05} {"step": 10330, "timestamp": 1778205697.874608, "train/loss": 2.3320315361022947, "train/z_loss": 0.0017785801202990115, "train/perplexity": 10.298842765941183, "train/grad_norm": 0.1533203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025097.3168476438, "perf/iters_per_sec": 0.9656416496504039, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035580849647522, "data/tokens_consumed": 21665677312, "data/tokens_consumed_B": 21.665677312, "train/loss_slope": -3.077997796499012e-05} {"step": 10340, "timestamp": 1778205708.237191, "train/loss": 2.3339130163192747, "train/z_loss": 0.0017847441136837005, "train/perplexity": 10.318238075086756, "train/grad_norm": 0.1796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024863.80821842, "perf/iters_per_sec": 0.9655303040592289, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035700273513794, "data/tokens_consumed": 21686648832, "data/tokens_consumed_B": 21.686648832, "train/loss_slope": -3.069858771346192e-05} {"step": 10350, "timestamp": 1778205718.5795074, "grad/layer_0/attn": 0.002520472276955843, "grad/layer_0/mlp": 0.002742522396147251, "grad/layer_0/attn_mlp_ratio": 0.9190343125705082, "grad/layer_4/attn": 0.0022355648688971996, "grad/layer_4/mlp": 0.002850578399375081, "grad/layer_4/attn_mlp_ratio": 0.7842495372035158, "grad/layer_8/attn": 0.005658476147800684, "grad/layer_8/mlp": 0.004468301311135292, "grad/layer_8/attn_mlp_ratio": 1.2663595463142663, "grad/layer_12/attn": 0.005305025260895491, "grad/layer_12/mlp": 0.006491964217275381, "grad/layer_12/attn_mlp_ratio": 0.8171679635974898, "grad/layer_16/attn": 0.004876431077718735, "grad/layer_16/mlp": 0.004704887047410011, "grad/layer_16/attn_mlp_ratio": 1.036460795962589, "grad/layer_20/attn": 0.006216820329427719, "grad/layer_20/mlp": 0.00644452590495348, "grad/layer_20/attn_mlp_ratio": 0.9646668078690755, "grad/layer_24/attn": 0.010191005654633045, "grad/layer_24/mlp": 0.011944749392569065, "grad/layer_24/attn_mlp_ratio": 0.8531786841551563, "grad/layer_27/attn": 0.008391088806092739, "grad/layer_27/mlp": 0.009387594647705555, "grad/layer_27/attn_mlp_ratio": 0.8938486408505891} {"step": 10350, "timestamp": 1778205719.1965847, "eos/sharpness": 20.670795440673825, "eos/L0_probe": 2.1429686546325684, "eos/L_plus": 2.2190239429473877, "eos/L_minus": 2.2736213207244873, "eos/grad_norm": 0.14644598960876465, "eos/embed_grad_frac": 0.13238966464996338, "eos/time_s": 0.6142253875732422} {"step": 10350, "timestamp": 1778205719.2162287, "train/loss": 2.251954507827759, "train/z_loss": 0.001807427394669503, "train/perplexity": 9.506297824184594, "train/grad_norm": 0.146484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1911056.3919941054, "perf/iters_per_sec": 0.9112626991243865, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.097378396987915, "data/tokens_consumed": 21707620352, "data/tokens_consumed_B": 21.707620352, "train/loss_slope": -3.193977381994944e-05} {"step": 10350, "timestamp": 1778205720.581287, "geo/rankme_last": 439.0799560546875, "geo/layer_0/stable_rank_q_proj": 15.133123397827148, "geo/layer_0/stable_rank_k_proj": 12.696049690246582, "geo/layer_0/stable_rank_o_proj": 55.736083984375, "geo/layer_0/stable_rank_gate_proj": 166.3268280029297, "geo/layer_0/stable_rank_down_proj": 47.87033462524414, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.03155086189508438, "geo/layer_0/attn_entropy_mean": 6.375030040740967, "geo/layer_0/attn_entropy_std": 0.26864126324653625, "geo/layer_7/stable_rank_q_proj": 43.627933502197266, "geo/layer_7/stable_rank_k_proj": 43.81354904174805, "geo/layer_7/stable_rank_o_proj": 106.72692108154297, "geo/layer_7/stable_rank_gate_proj": 125.43646240234375, "geo/layer_7/stable_rank_down_proj": 168.09991455078125, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5905439257621765, "geo/layer_7/attn_entropy_mean": 4.699148654937744, "geo/layer_7/attn_entropy_std": 0.9530050158500671, "geo/layer_14/stable_rank_q_proj": 63.2518424987793, "geo/layer_14/stable_rank_k_proj": 41.49274826049805, "geo/layer_14/stable_rank_o_proj": 48.79541015625, "geo/layer_14/stable_rank_gate_proj": 110.77908325195312, "geo/layer_14/stable_rank_down_proj": 139.22726440429688, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3904965817928314, "geo/layer_14/attn_entropy_mean": 5.69654655456543, "geo/layer_14/attn_entropy_std": 0.5793548822402954, "geo/layer_21/stable_rank_q_proj": 52.11310958862305, "geo/layer_21/stable_rank_k_proj": 32.14828109741211, "geo/layer_21/stable_rank_o_proj": 91.86864471435547, "geo/layer_21/stable_rank_gate_proj": 107.47071075439453, "geo/layer_21/stable_rank_down_proj": 72.34002685546875, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1620197296142578, "geo/layer_21/attn_entropy_mean": 5.802371978759766, "geo/layer_21/attn_entropy_std": 0.3028201758861542, "geo/layer_27/stable_rank_q_proj": 42.92400360107422, "geo/layer_27/stable_rank_k_proj": 32.505767822265625, "geo/layer_27/stable_rank_o_proj": 114.07648468017578, "geo/layer_27/stable_rank_gate_proj": 95.02326202392578, "geo/layer_27/stable_rank_down_proj": 151.93983459472656, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.06682390719652176, "geo/layer_27/attn_entropy_mean": 4.49710750579834, "geo/layer_27/attn_entropy_std": 0.5241488814353943, "attnres/final_alpha/block_0": 0.2483222782611847, "attnres/block_norm/0": 1.4174237251281738, "attnres/final_alpha/block_1": 0.009474823251366615, "attnres/block_norm/1": 17805.51171875, "attnres/final_alpha/block_2": 0.019381865859031677, "attnres/block_norm/2": 13106.859375, "attnres/final_alpha/block_3": 0.020535893738269806, "attnres/block_norm/3": 14833.52734375, "attnres/final_alpha/block_4": 0.029166188091039658, "attnres/block_norm/4": 5707.1064453125, "attnres/final_alpha/block_5": 0.5066835880279541, "attnres/block_norm/5": 3909.60595703125, "attnres/final_alpha/block_6": 0.1664353609085083, "attnres/block_norm/6": 10633.244140625, "geo/tier1_time_s": 1.3608512878417969, "geo/step": 10350.0, "geo/rankme_slope": 0.0020297386532738097} {"step": 10360, "timestamp": 1778205730.9405353, "train/loss": 2.2753450155258177, "train/z_loss": 0.0017985433456487953, "train/perplexity": 9.731275874704595, "train/grad_norm": 0.13671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1789333.3713751838, "perf/iters_per_sec": 0.8532206398845595, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1720297813415528, "data/tokens_consumed": 21728591872, "data/tokens_consumed_B": 21.728591872, "train/loss_slope": -3.3406587423879116e-05} {"step": 10370, "timestamp": 1778205741.3074038, "train/loss": 2.3063111782073973, "train/z_loss": 0.0017966473358683288, "train/perplexity": 10.037330356988795, "train/grad_norm": 0.19921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024327.6275195547, "perf/iters_per_sec": 0.9652746331784986, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0359745979309083, "data/tokens_consumed": 21749563392, "data/tokens_consumed_B": 21.749563392, "train/loss_slope": -3.088846148008205e-05} {"step": 10380, "timestamp": 1778205751.6655884, "train/loss": 2.320543360710144, "train/z_loss": 0.0017830848461017012, "train/perplexity": 10.181204870100007, "train/grad_norm": 0.28125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026075.0573652347, "perf/iters_per_sec": 0.966107872660272, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350811004638671, "data/tokens_consumed": 21770534912, "data/tokens_consumed_B": 21.770534912, "train/loss_slope": -2.8264222057810642e-05} {"step": 10390, "timestamp": 1778205762.0244195, "train/loss": 2.326593255996704, "train/z_loss": 0.0017832724028266967, "train/perplexity": 10.242986792093362, "train/grad_norm": 0.1318359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025690.5841959852, "perf/iters_per_sec": 0.9659245415668417, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352775573730468, "data/tokens_consumed": 21791506432, "data/tokens_consumed_B": 21.791506432, "train/loss_slope": -2.651239982282225e-05} {"step": 10400, "timestamp": 1778205772.372306, "grad/layer_0/attn": 0.003811346599832177, "grad/layer_0/mlp": 0.003304978832602501, "grad/layer_0/attn_mlp_ratio": 1.1532135839761428, "grad/layer_4/attn": 0.0018021056894212961, "grad/layer_4/mlp": 0.002959024626761675, "grad/layer_4/attn_mlp_ratio": 0.6090201521882859, "grad/layer_8/attn": 0.0055643185041844845, "grad/layer_8/mlp": 0.004157771356403828, "grad/layer_8/attn_mlp_ratio": 1.338293497497147, "grad/layer_12/attn": 0.003905101679265499, "grad/layer_12/mlp": 0.006234796717762947, "grad/layer_12/attn_mlp_ratio": 0.6263398460940792, "grad/layer_16/attn": 0.006074751727283001, "grad/layer_16/mlp": 0.0055102696642279625, "grad/layer_16/attn_mlp_ratio": 1.1024418018006288, "grad/layer_20/attn": 0.004767874255776405, "grad/layer_20/mlp": 0.007342141587287188, "grad/layer_20/attn_mlp_ratio": 0.6493846698752642, "grad/layer_24/attn": 0.020742250606417656, "grad/layer_24/mlp": 0.01566637121140957, "grad/layer_24/attn_mlp_ratio": 1.3239984035940349, "grad/layer_27/attn": 0.006802903022617102, "grad/layer_27/mlp": 0.013959969393908978, "grad/layer_27/attn_mlp_ratio": 0.4873150350067275} {"step": 10400, "timestamp": 1778205772.3882468, "train/loss": 2.299988603591919, "train/z_loss": 0.0017889330396428705, "train/perplexity": 9.974068785608903, "train/grad_norm": 0.255859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024731.4841536859, "perf/iters_per_sec": 0.9654672070282392, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035767960548401, "data/tokens_consumed": 21812477952, "data/tokens_consumed_B": 21.812477952, "train/loss_slope": -2.880892822272493e-05} {"step": 10410, "timestamp": 1778205782.779758, "train/loss": 2.3499581098556517, "train/z_loss": 0.0017735188477672637, "train/perplexity": 10.485130491898058, "train/grad_norm": 0.138671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020339.110702801, "perf/iters_per_sec": 0.9633727601541524, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0380198001861571, "data/tokens_consumed": 21833449472, "data/tokens_consumed_B": 21.833449472, "train/loss_slope": -2.7390339122031726e-05} {"step": 10420, "timestamp": 1778205793.1484072, "train/loss": 2.3040070056915285, "train/z_loss": 0.0017855065409094096, "train/perplexity": 10.014229240946584, "train/grad_norm": 0.228515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023677.9369047084, "perf/iters_per_sec": 0.964964836552004, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036307191848755, "data/tokens_consumed": 21854420992, "data/tokens_consumed_B": 21.854420992, "train/loss_slope": -2.9130632985364272e-05} {"step": 10425, "timestamp": 1778205798.9299304, "eos/sharpness": 33.24112892150878, "eos/L0_probe": 2.1434218883514404, "eos/L_plus": 2.303847551345825, "eos/L_minus": 2.3154075145721436, "eos/grad_norm": 0.21425116062164307, "eos/embed_grad_frac": 0.07206739485263824, "eos/time_s": 0.6152462959289551} {"step": 10425, "timestamp": 1778205800.3108382, "geo/rankme_last": 440.765869140625, "geo/layer_0/stable_rank_q_proj": 15.176342964172363, "geo/layer_0/stable_rank_k_proj": 12.769394874572754, "geo/layer_0/stable_rank_o_proj": 55.66060256958008, "geo/layer_0/stable_rank_gate_proj": 167.61038208007812, "geo/layer_0/stable_rank_down_proj": 47.938758850097656, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.030520111322402954, "geo/layer_0/attn_entropy_mean": 6.373805999755859, "geo/layer_0/attn_entropy_std": 0.26579540967941284, "geo/layer_7/stable_rank_q_proj": 43.60585021972656, "geo/layer_7/stable_rank_k_proj": 43.928367614746094, "geo/layer_7/stable_rank_o_proj": 107.79075622558594, "geo/layer_7/stable_rank_gate_proj": 125.15078735351562, "geo/layer_7/stable_rank_down_proj": 168.1862030029297, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5956072211265564, "geo/layer_7/attn_entropy_mean": 4.661510467529297, "geo/layer_7/attn_entropy_std": 0.9351533651351929, "geo/layer_14/stable_rank_q_proj": 62.86528396606445, "geo/layer_14/stable_rank_k_proj": 41.3882942199707, "geo/layer_14/stable_rank_o_proj": 48.987552642822266, "geo/layer_14/stable_rank_gate_proj": 110.59584045410156, "geo/layer_14/stable_rank_down_proj": 139.01022338867188, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39441341161727905, "geo/layer_14/attn_entropy_mean": 5.686636924743652, "geo/layer_14/attn_entropy_std": 0.5796781778335571, "geo/layer_21/stable_rank_q_proj": 52.03016662597656, "geo/layer_21/stable_rank_k_proj": 32.15257263183594, "geo/layer_21/stable_rank_o_proj": 91.73980712890625, "geo/layer_21/stable_rank_gate_proj": 107.48502349853516, "geo/layer_21/stable_rank_down_proj": 71.94539642333984, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.16069750487804413, "geo/layer_21/attn_entropy_mean": 5.784976005554199, "geo/layer_21/attn_entropy_std": 0.3008154034614563, "geo/layer_27/stable_rank_q_proj": 42.826900482177734, "geo/layer_27/stable_rank_k_proj": 32.41231155395508, "geo/layer_27/stable_rank_o_proj": 114.50213623046875, "geo/layer_27/stable_rank_gate_proj": 94.87036895751953, "geo/layer_27/stable_rank_down_proj": 151.4590606689453, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.06180195510387421, "geo/layer_27/attn_entropy_mean": 4.516371726989746, "geo/layer_27/attn_entropy_std": 0.527710497379303, "attnres/final_alpha/block_0": 0.24849142134189606, "attnres/block_norm/0": 1.4200563430786133, "attnres/final_alpha/block_1": 0.009647094644606113, "attnres/block_norm/1": 17810.833984375, "attnres/final_alpha/block_2": 0.019493943080306053, "attnres/block_norm/2": 13234.8056640625, "attnres/final_alpha/block_3": 0.020983410999178886, "attnres/block_norm/3": 14768.228515625, "attnres/final_alpha/block_4": 0.02942705899477005, "attnres/block_norm/4": 5701.8359375, "attnres/final_alpha/block_5": 0.503676176071167, "attnres/block_norm/5": 3949.867431640625, "attnres/final_alpha/block_6": 0.16828086972236633, "attnres/block_norm/6": 10551.4365234375, "geo/tier1_time_s": 1.3602421283721924, "geo/step": 10425.0, "geo/rankme_slope": 0.001966359493015956} {"step": 10430, "timestamp": 1778205805.5048106, "train/loss": 2.297153663635254, "train/z_loss": 0.0017825099173933267, "train/perplexity": 9.945832941849579, "train/grad_norm": 0.232421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1697968.5654458134, "perf/iters_per_sec": 0.8096545054654185, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.235094714164734, "data/tokens_consumed": 21875392512, "data/tokens_consumed_B": 21.875392512, "train/loss_slope": -2.9164653921713867e-05} {"step": 10440, "timestamp": 1778205815.8630826, "train/loss": 2.302989625930786, "train/z_loss": 0.0017855941550806165, "train/perplexity": 10.004046147712234, "train/grad_norm": 0.205078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025606.803343478, "perf/iters_per_sec": 0.9658845917432203, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0353203773498536, "data/tokens_consumed": 21896364032, "data/tokens_consumed_B": 21.896364032, "train/loss_slope": -3.1626166161423554e-05} {"step": 10450, "timestamp": 1778205826.2157545, "grad/layer_0/attn": 0.0025441166944801807, "grad/layer_0/mlp": 0.0027628126554191113, "grad/layer_0/attn_mlp_ratio": 0.9208429668243099, "grad/layer_4/attn": 0.002869978314265609, "grad/layer_4/mlp": 0.002635018201544881, "grad/layer_4/attn_mlp_ratio": 1.0891682659596609, "grad/layer_8/attn": 0.004855284933000803, "grad/layer_8/mlp": 0.004202207550406456, "grad/layer_8/attn_mlp_ratio": 1.1554129012475571, "grad/layer_12/attn": 0.005124861840158701, "grad/layer_12/mlp": 0.005636863876134157, "grad/layer_12/attn_mlp_ratio": 0.9091689744256355, "grad/layer_16/attn": 0.004348449409008026, "grad/layer_16/mlp": 0.004802635870873928, "grad/layer_16/attn_mlp_ratio": 0.9054297338752381, "grad/layer_20/attn": 0.005614948458969593, "grad/layer_20/mlp": 0.006913555320352316, "grad/layer_20/attn_mlp_ratio": 0.8121651042877522, "grad/layer_24/attn": 0.013179855421185493, "grad/layer_24/mlp": 0.013133346103131771, "grad/layer_24/attn_mlp_ratio": 1.0035413075490718, "grad/layer_27/attn": 0.01112345326691866, "grad/layer_27/mlp": 0.01195207517594099, "grad/layer_27/attn_mlp_ratio": 0.9306712859573173} {"step": 10450, "timestamp": 1778205826.231712, "train/loss": 2.310959005355835, "train/z_loss": 0.0017812066595070063, "train/perplexity": 10.084090716376192, "train/grad_norm": 0.17578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023960.4428542447, "perf/iters_per_sec": 0.9650995458861564, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036162543296814, "data/tokens_consumed": 21917335552, "data/tokens_consumed_B": 21.917335552, "train/loss_slope": -3.521486343008383e-05} {"step": 10460, "timestamp": 1778205836.5913382, "train/loss": 2.363159918785095, "train/z_loss": 0.0017642279504798352, "train/perplexity": 10.62447093040317, "train/grad_norm": 0.171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025433.898963924, "perf/iters_per_sec": 0.9658021445102329, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354087591171264, "data/tokens_consumed": 21938307072, "data/tokens_consumed_B": 21.938307072, "train/loss_slope": -3.3213062998842886e-05} {"step": 10470, "timestamp": 1778205846.965636, "train/loss": 2.3297354698181154, "train/z_loss": 0.0017793260514736176, "train/perplexity": 10.27522306687434, "train/grad_norm": 0.2216796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023194.4559832318, "perf/iters_per_sec": 0.9647342948833617, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0365548372268676, "data/tokens_consumed": 21959278592, "data/tokens_consumed_B": 21.959278592, "train/loss_slope": -3.240598291739877e-05} {"step": 10480, "timestamp": 1778205857.3156366, "train/loss": 2.3295206785202027, "train/z_loss": 0.001776060718111694, "train/perplexity": 10.273016275383752, "train/grad_norm": 0.19140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027345.0885855847, "perf/iters_per_sec": 0.9667134707382129, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344326734542846, "data/tokens_consumed": 21980250112, "data/tokens_consumed_B": 21.980250112, "train/loss_slope": -3.3072501876995845e-05} {"step": 10490, "timestamp": 1778205867.6687977, "train/loss": 2.3324939489364622, "train/z_loss": 0.0017743694130331277, "train/perplexity": 10.303606184261232, "train/grad_norm": 0.220703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026562.297402475, "perf/iters_per_sec": 0.9663402068149924, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034832239151001, "data/tokens_consumed": 22001221632, "data/tokens_consumed_B": 22.001221632, "train/loss_slope": -3.247298583446448e-05} {"step": 10500, "timestamp": 1778205878.006816, "grad/layer_0/attn": 0.0038763268385082483, "grad/layer_0/mlp": 0.003291771514341235, "grad/layer_0/attn_mlp_ratio": 1.177580735437503, "grad/layer_4/attn": 0.0021151036489754915, "grad/layer_4/mlp": 0.0028657421935349703, "grad/layer_4/attn_mlp_ratio": 0.7380648475430267, "grad/layer_8/attn": 0.0057162512093782425, "grad/layer_8/mlp": 0.004371400456875563, "grad/layer_8/attn_mlp_ratio": 1.3076475456790229, "grad/layer_12/attn": 0.00515223853290081, "grad/layer_12/mlp": 0.006150900386273861, "grad/layer_12/attn_mlp_ratio": 0.8376397154202657, "grad/layer_16/attn": 0.006931648589670658, "grad/layer_16/mlp": 0.004973256960511208, "grad/layer_16/attn_mlp_ratio": 1.3937844968259383, "grad/layer_20/attn": 0.005670872516930103, "grad/layer_20/mlp": 0.007777051534503698, "grad/layer_20/attn_mlp_ratio": 0.7291802579490007, "grad/layer_24/attn": 0.02827291563153267, "grad/layer_24/mlp": 0.014922156929969788, "grad/layer_24/attn_mlp_ratio": 1.8946936139828245, "grad/layer_27/attn": 0.007744294125586748, "grad/layer_27/mlp": 0.014561529271304607, "grad/layer_27/attn_mlp_ratio": 0.5318324695239698} {"step": 10500, "timestamp": 1778205878.6177368, "eos/sharpness": 49.09424781799316, "eos/L0_probe": 2.1409831047058105, "eos/L_plus": 2.3354403972625732, "eos/L_minus": 2.4374682903289795, "eos/grad_norm": 0.2841203510761261, "eos/embed_grad_frac": 0.033499132841825485, "eos/time_s": 0.6081376075744629} {"step": 10500, "timestamp": 1778205878.637489, "train/loss": 2.2949108839035035, "train/z_loss": 0.0017997321090660989, "train/perplexity": 9.923551624695175, "train/grad_norm": 0.283203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1912780.818405367, "perf/iters_per_sec": 0.9120849697138629, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.096389079093933, "data/tokens_consumed": 22022193152, "data/tokens_consumed_B": 22.022193152, "train/loss_slope": -3.30364898558032e-05} {"step": 10500, "timestamp": 1778205879.999177, "geo/rankme_last": 440.75054931640625, "geo/layer_0/stable_rank_q_proj": 15.121604919433594, "geo/layer_0/stable_rank_k_proj": 12.801274299621582, "geo/layer_0/stable_rank_o_proj": 55.324615478515625, "geo/layer_0/stable_rank_gate_proj": 167.32901000976562, "geo/layer_0/stable_rank_down_proj": 47.997467041015625, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.029717475175857544, "geo/layer_0/attn_entropy_mean": 6.36591911315918, "geo/layer_0/attn_entropy_std": 0.2671225965023041, "geo/layer_7/stable_rank_q_proj": 43.524574279785156, "geo/layer_7/stable_rank_k_proj": 43.92802810668945, "geo/layer_7/stable_rank_o_proj": 107.72647094726562, "geo/layer_7/stable_rank_gate_proj": 125.12654876708984, "geo/layer_7/stable_rank_down_proj": 167.62965393066406, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5908791422843933, "geo/layer_7/attn_entropy_mean": 4.689199447631836, "geo/layer_7/attn_entropy_std": 0.9352661371231079, "geo/layer_14/stable_rank_q_proj": 62.86688995361328, "geo/layer_14/stable_rank_k_proj": 41.36165237426758, "geo/layer_14/stable_rank_o_proj": 48.845733642578125, "geo/layer_14/stable_rank_gate_proj": 109.80870819091797, "geo/layer_14/stable_rank_down_proj": 138.94923400878906, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38898393511772156, "geo/layer_14/attn_entropy_mean": 5.6723504066467285, "geo/layer_14/attn_entropy_std": 0.595038115978241, "geo/layer_21/stable_rank_q_proj": 52.03430938720703, "geo/layer_21/stable_rank_k_proj": 31.968128204345703, "geo/layer_21/stable_rank_o_proj": 91.60015869140625, "geo/layer_21/stable_rank_gate_proj": 107.47413635253906, "geo/layer_21/stable_rank_down_proj": 71.61674499511719, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15718120336532593, "geo/layer_21/attn_entropy_mean": 5.795145034790039, "geo/layer_21/attn_entropy_std": 0.3068010210990906, "geo/layer_27/stable_rank_q_proj": 42.67318344116211, "geo/layer_27/stable_rank_k_proj": 32.40629196166992, "geo/layer_27/stable_rank_o_proj": 114.40400695800781, "geo/layer_27/stable_rank_gate_proj": 94.99488830566406, "geo/layer_27/stable_rank_down_proj": 151.27098083496094, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.06260159611701965, "geo/layer_27/attn_entropy_mean": 4.503895282745361, "geo/layer_27/attn_entropy_std": 0.5136870741844177, "attnres/final_alpha/block_0": 0.25049883127212524, "attnres/block_norm/0": 1.4225714206695557, "attnres/final_alpha/block_1": 0.009839648380875587, "attnres/block_norm/1": 17932.1875, "attnres/final_alpha/block_2": 0.019538726657629013, "attnres/block_norm/2": 13195.142578125, "attnres/final_alpha/block_3": 0.020932506769895554, "attnres/block_norm/3": 14917.837890625, "attnres/final_alpha/block_4": 0.029888074845075607, "attnres/block_norm/4": 5760.52783203125, "attnres/final_alpha/block_5": 0.49906986951828003, "attnres/block_norm/5": 3996.2001953125, "attnres/final_alpha/block_6": 0.1702323704957962, "attnres/block_norm/6": 10635.625, "geo/tier1_time_s": 1.3574471473693848, "geo/step": 10500.0, "geo/rankme_slope": 0.001924503668654962} {"step": 10500, "timestamp": 1778205886.9173467, "geo/ww_alpha_mean": 7.999605731005049, "geo/ww_alpha_std": 4.401014285772304, "geo/ww_alpha_min": 1.3655244670942261, "geo/ww_alpha_max": 26.693903739500314, "geo/ww_alpha_healthy_frac": 0.13705583756345177, "geo/ww_alpha_by_type/q_proj": 4.387810440940583, "geo/ww_alpha_by_type/k_proj": 4.684289435839655, "geo/ww_alpha_by_type/v_proj": 7.341527324951806, "geo/ww_alpha_by_type/o_proj": 8.412378660718918, "geo/ww_alpha_by_type/gate_proj": 10.094157417362556, "geo/ww_alpha_by_type/up_proj": 12.008567071439321, "geo/ww_alpha_by_type/down_proj": 9.208410102630321, "geo/twonn_id/layer_0": 0.6762331128120422, "geo/twonn_id/layer_7": 2.909165859222412, "geo/twonn_id/layer_14": 3.7366299629211426, "geo/twonn_id/layer_21": 6.693827152252197, "geo/twonn_id/layer_27": 5.694864273071289, "geo/tier2_time_s": 6.910863876342773} {"step": 10500, "timestamp": 1778205887.5893214, "eoc/jacobian_sigma/layer_0/attn": 619.8123168945312, "eoc/jacobian_sigma/layer_0/mlp": 3047.160400390625, "eoc/jacobian_sigma/layer_0": 3047.160400390625, "eoc/jacobian_sigma/layer_7/attn": 1.1479967832565308, "eoc/jacobian_sigma/layer_7/mlp": 1.6129504442214966, "eoc/jacobian_sigma/layer_7": 1.6129504442214966, "eoc/jacobian_sigma/layer_14/attn": 1.361029863357544, "eoc/jacobian_sigma/layer_14/mlp": 9.51267147064209, "eoc/jacobian_sigma/layer_14": 9.51267147064209, "eoc/jacobian_sigma/layer_21/attn": 1.0860095024108887, "eoc/jacobian_sigma/layer_21/mlp": 3.2784533500671387, "eoc/jacobian_sigma/layer_21": 3.2784533500671387, "eoc/jacobian_sigma/layer_27/attn": 2.087894916534424, "eoc/jacobian_sigma/layer_27/mlp": 19.07801628112793, "eoc/jacobian_sigma/layer_27": 19.07801628112793, "eoc/layer0_sigma": 3047.160400390625, "eoc/sigma_max": 19.07801628112793, "eoc/sigma_min": 1.6129504442214966, "eoc/sigma_mean": 8.370522886514664, "eoc/time_s": 0.6657357215881348} {"step": 10510, "timestamp": 1778205897.9550323, "train/loss": 2.3453332424163817, "train/z_loss": 0.0017687746323645115, "train/perplexity": 10.436750115937887, "train/grad_norm": 0.224609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1085841.8292572517, "perf/iters_per_sec": 0.5177697321211108, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.9313604831695557, "data/tokens_consumed": 22043164672, "data/tokens_consumed_B": 22.043164672, "train/loss_slope": -3.10328425926165e-05} {"step": 10520, "timestamp": 1778205908.3061953, "train/loss": 2.350857734680176, "train/z_loss": 0.0017694906564429402, "train/perplexity": 10.494567419787781, "train/grad_norm": 0.1865234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027230.053862767, "perf/iters_per_sec": 0.9666586179078899, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344913721084594, "data/tokens_consumed": 22064136192, "data/tokens_consumed_B": 22.064136192, "train/loss_slope": -2.899031579011631e-05} {"step": 10530, "timestamp": 1778205918.6571808, "train/loss": 2.3487760543823244, "train/z_loss": 0.0017594536533579231, "train/perplexity": 10.472743808327396, "train/grad_norm": 0.224609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027058.4605239148, "perf/iters_per_sec": 0.966576795827825, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345789432525634, "data/tokens_consumed": 22085107712, "data/tokens_consumed_B": 22.085107712, "train/loss_slope": -2.755087421278749e-05} {"step": 10540, "timestamp": 1778205929.5003965, "train/loss": 2.344449210166931, "train/z_loss": 0.0017813678830862045, "train/perplexity": 10.427527769282484, "train/grad_norm": 0.2265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1935008.9848952543, "perf/iters_per_sec": 0.9226841854549667, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0837944507598878, "data/tokens_consumed": 22106079232, "data/tokens_consumed_B": 22.106079232, "train/loss_slope": -2.7238676593069784e-05} {"step": 10550, "timestamp": 1778205939.8679843, "grad/layer_0/attn": 0.002543471520766616, "grad/layer_0/mlp": 0.0027101743035018444, "grad/layer_0/attn_mlp_ratio": 0.9384899796412253, "grad/layer_4/attn": 0.0014327969402074814, "grad/layer_4/mlp": 0.0028034893330186605, "grad/layer_4/attn_mlp_ratio": 0.5110762763477641, "grad/layer_8/attn": 0.005441857967525721, "grad/layer_8/mlp": 0.004153409041464329, "grad/layer_8/attn_mlp_ratio": 1.3102147614590978, "grad/layer_12/attn": 0.004565366078168154, "grad/layer_12/mlp": 0.005763700697571039, "grad/layer_12/attn_mlp_ratio": 0.7920893603797248, "grad/layer_16/attn": 0.005155487451702356, "grad/layer_16/mlp": 0.0044595180079340935, "grad/layer_16/attn_mlp_ratio": 1.1560638003756591, "grad/layer_20/attn": 0.006921886932104826, "grad/layer_20/mlp": 0.006154554896056652, "grad/layer_20/attn_mlp_ratio": 1.124677078446747, "grad/layer_24/attn": 0.011189454235136509, "grad/layer_24/mlp": 0.01338434498757124, "grad/layer_24/attn_mlp_ratio": 0.8360105901279462, "grad/layer_27/attn": 0.010470419190824032, "grad/layer_27/mlp": 0.010741890408098698, "grad/layer_27/attn_mlp_ratio": 0.9747277895757741} {"step": 10550, "timestamp": 1778205939.884022, "train/loss": 2.3084496736526487, "train/z_loss": 0.0017836636281572283, "train/perplexity": 10.058818109781521, "train/grad_norm": 0.1669921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020898.6701216714, "perf/iters_per_sec": 0.9636395788772923, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0377323865890502, "data/tokens_consumed": 22127050752, "data/tokens_consumed_B": 22.127050752, "train/loss_slope": -2.818408324272829e-05} {"step": 10560, "timestamp": 1778205950.249614, "train/loss": 2.2637062549591063, "train/z_loss": 0.0017939444980584085, "train/perplexity": 9.618672438118272, "train/grad_norm": 0.2197265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024827.497780045, "perf/iters_per_sec": 0.9655129898929811, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035718846321106, "data/tokens_consumed": 22148022272, "data/tokens_consumed_B": 22.148022272, "train/loss_slope": -3.020480861543647e-05} {"step": 10570, "timestamp": 1778205960.6060805, "train/loss": 2.3159979820251464, "train/z_loss": 0.0017751710838638246, "train/perplexity": 10.135032453550151, "train/grad_norm": 0.224609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026359.9135622424, "perf/iters_per_sec": 0.9662437026797497, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349355936050415, "data/tokens_consumed": 22168993792, "data/tokens_consumed_B": 22.168993792, "train/loss_slope": -2.6789919163348885e-05} {"step": 10575, "timestamp": 1778205966.401731, "eos/sharpness": 53.721785545349114, "eos/L0_probe": 2.137861490249634, "eos/L_plus": 2.5266261100769043, "eos/L_minus": 2.2863147258758545, "eos/grad_norm": 0.22148758172988892, "eos/embed_grad_frac": 0.05401499196887016, "eos/time_s": 0.628638505935669} {"step": 10575, "timestamp": 1778205967.7829437, "geo/rankme_last": 439.3363037109375, "geo/layer_0/stable_rank_q_proj": 15.116547584533691, "geo/layer_0/stable_rank_k_proj": 12.83078384399414, "geo/layer_0/stable_rank_o_proj": 55.181678771972656, "geo/layer_0/stable_rank_gate_proj": 166.1334228515625, "geo/layer_0/stable_rank_down_proj": 48.027286529541016, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.03297805041074753, "geo/layer_0/attn_entropy_mean": 6.364682674407959, "geo/layer_0/attn_entropy_std": 0.26581287384033203, "geo/layer_7/stable_rank_q_proj": 43.415008544921875, "geo/layer_7/stable_rank_k_proj": 43.85044860839844, "geo/layer_7/stable_rank_o_proj": 107.88432312011719, "geo/layer_7/stable_rank_gate_proj": 124.89903259277344, "geo/layer_7/stable_rank_down_proj": 167.1679229736328, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.587041974067688, "geo/layer_7/attn_entropy_mean": 4.678464412689209, "geo/layer_7/attn_entropy_std": 0.9185258150100708, "geo/layer_14/stable_rank_q_proj": 62.88151550292969, "geo/layer_14/stable_rank_k_proj": 41.1247444152832, "geo/layer_14/stable_rank_o_proj": 49.01805114746094, "geo/layer_14/stable_rank_gate_proj": 109.6160659790039, "geo/layer_14/stable_rank_down_proj": 138.90611267089844, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38544875383377075, "geo/layer_14/attn_entropy_mean": 5.653900146484375, "geo/layer_14/attn_entropy_std": 0.6053673028945923, "geo/layer_21/stable_rank_q_proj": 52.011802673339844, "geo/layer_21/stable_rank_k_proj": 31.915096282958984, "geo/layer_21/stable_rank_o_proj": 91.48892211914062, "geo/layer_21/stable_rank_gate_proj": 107.34518432617188, "geo/layer_21/stable_rank_down_proj": 71.37813568115234, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1624816358089447, "geo/layer_21/attn_entropy_mean": 5.8223772048950195, "geo/layer_21/attn_entropy_std": 0.29652413725852966, "geo/layer_27/stable_rank_q_proj": 42.56938552856445, "geo/layer_27/stable_rank_k_proj": 32.57345962524414, "geo/layer_27/stable_rank_o_proj": 114.29737091064453, "geo/layer_27/stable_rank_gate_proj": 94.9466323852539, "geo/layer_27/stable_rank_down_proj": 150.77297973632812, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.06397368758916855, "geo/layer_27/attn_entropy_mean": 4.5155029296875, "geo/layer_27/attn_entropy_std": 0.5169546604156494, "attnres/final_alpha/block_0": 0.24704711139202118, "attnres/block_norm/0": 1.4250085353851318, "attnres/final_alpha/block_1": 0.009478505700826645, "attnres/block_norm/1": 18096.587890625, "attnres/final_alpha/block_2": 0.019154466688632965, "attnres/block_norm/2": 13439.072265625, "attnres/final_alpha/block_3": 0.02046920917928219, "attnres/block_norm/3": 15161.294921875, "attnres/final_alpha/block_4": 0.029177535325288773, "attnres/block_norm/4": 5822.3271484375, "attnres/final_alpha/block_5": 0.5108828544616699, "attnres/block_norm/5": 3934.349609375, "attnres/final_alpha/block_6": 0.16379031538963318, "attnres/block_norm/6": 10887.748046875, "geo/tier1_time_s": 1.3599271774291992, "geo/step": 10575.0, "geo/rankme_slope": 0.0018277012953618948} {"step": 10580, "timestamp": 1778205972.9650078, "train/loss": 2.306206560134888, "train/z_loss": 0.0017736529116518795, "train/perplexity": 10.036280325760785, "train/grad_norm": 0.1845703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1697832.7811075442, "perf/iters_per_sec": 0.8095897584474298, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2351934909820557, "data/tokens_consumed": 22189965312, "data/tokens_consumed_B": 22.189965312, "train/loss_slope": -2.5998854401088044e-05} {"step": 10590, "timestamp": 1778205983.3188477, "train/loss": 2.321790170669556, "train/z_loss": 0.0017778276349417865, "train/perplexity": 10.193906814540806, "train/grad_norm": 0.1884765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026515.3743658422, "perf/iters_per_sec": 0.9663178321675502, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348562002182007, "data/tokens_consumed": 22210936832, "data/tokens_consumed_B": 22.210936832, "train/loss_slope": -2.8333351742995988e-05} {"step": 10600, "timestamp": 1778205993.6715086, "grad/layer_0/attn": 0.0028140100184828043, "grad/layer_0/mlp": 0.002613872056826949, "grad/layer_0/attn_mlp_ratio": 1.0765675785378914, "grad/layer_4/attn": 0.0016801855526864529, "grad/layer_4/mlp": 0.0026395495515316725, "grad/layer_4/attn_mlp_ratio": 0.6365425070566406, "grad/layer_8/attn": 0.0042928424663841724, "grad/layer_8/mlp": 0.003987313713878393, "grad/layer_8/attn_mlp_ratio": 1.0766251834611926, "grad/layer_12/attn": 0.00378908752463758, "grad/layer_12/mlp": 0.00553674902766943, "grad/layer_12/attn_mlp_ratio": 0.6843523947476579, "grad/layer_16/attn": 0.004467500373721123, "grad/layer_16/mlp": 0.004306393209844828, "grad/layer_16/attn_mlp_ratio": 1.0374111355569844, "grad/layer_20/attn": 0.008205875754356384, "grad/layer_20/mlp": 0.005548753309994936, "grad/layer_20/attn_mlp_ratio": 1.4788683417747832, "grad/layer_24/attn": 0.01834196411073208, "grad/layer_24/mlp": 0.010193600319325924, "grad/layer_24/attn_mlp_ratio": 1.7993607122325268, "grad/layer_27/attn": 0.007537256460636854, "grad/layer_27/mlp": 0.008053135126829147, "grad/layer_27/attn_mlp_ratio": 0.9359406303679547} {"step": 10600, "timestamp": 1778205993.6874623, "train/loss": 2.291176748275757, "train/z_loss": 0.0017846529255621136, "train/perplexity": 9.88656483683869, "train/grad_norm": 0.1484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024008.9242773254, "perf/iters_per_sec": 0.9651226636301639, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0361377239227294, "data/tokens_consumed": 22231908352, "data/tokens_consumed_B": 22.231908352, "train/loss_slope": -3.1023088175364184e-05} {"step": 10610, "timestamp": 1778206004.0448704, "train/loss": 2.3472747087478636, "train/z_loss": 0.0017629824578762054, "train/perplexity": 10.457032397210359, "train/grad_norm": 0.1640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026103.8987364166, "perf/iters_per_sec": 0.966121625297745, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350663661956787, "data/tokens_consumed": 22252879872, "data/tokens_consumed_B": 22.252879872, "train/loss_slope": -2.7771767688663123e-05} {"step": 10620, "timestamp": 1778206014.403228, "train/loss": 2.3278281927108764, "train/z_loss": 0.0017659288598224521, "train/perplexity": 10.255644046391117, "train/grad_norm": 0.1748046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025650.8853891434, "perf/iters_per_sec": 0.9659056117006032, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352978467941285, "data/tokens_consumed": 22273851392, "data/tokens_consumed_B": 22.273851392, "train/loss_slope": -2.3808722305755687e-05} {"step": 10630, "timestamp": 1778206024.763792, "train/loss": 2.3121806383132935, "train/z_loss": 0.0017853785888291896, "train/perplexity": 10.096417301689751, "train/grad_norm": 0.2080078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025414.9171953467, "perf/iters_per_sec": 0.9657930932976468, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354184627532959, "data/tokens_consumed": 22294822912, "data/tokens_consumed_B": 22.294822912, "train/loss_slope": -2.412454739297937e-05} {"step": 10640, "timestamp": 1778206035.1242478, "train/loss": 2.3684283256530763, "train/z_loss": 0.0017653704504482447, "train/perplexity": 10.680592672296743, "train/grad_norm": 0.333984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025456.8920986785, "perf/iters_per_sec": 0.9658131084912674, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0353970050811767, "data/tokens_consumed": 22315794432, "data/tokens_consumed_B": 22.315794432, "train/loss_slope": -2.2770924729840255e-05} {"step": 10650, "timestamp": 1778206045.469604, "grad/layer_0/attn": 0.0024635898880660534, "grad/layer_0/mlp": 0.002690982073545456, "grad/layer_0/attn_mlp_ratio": 0.9154984051121358, "grad/layer_4/attn": 0.0013858038000762463, "grad/layer_4/mlp": 0.0027044741436839104, "grad/layer_4/attn_mlp_ratio": 0.5124115355554535, "grad/layer_8/attn": 0.00975560862571001, "grad/layer_8/mlp": 0.0042113373056054115, "grad/layer_8/attn_mlp_ratio": 2.316510810253535, "grad/layer_12/attn": 0.00530641432851553, "grad/layer_12/mlp": 0.005871532950550318, "grad/layer_12/attn_mlp_ratio": 0.9037527819107105, "grad/layer_16/attn": 0.008006013929843903, "grad/layer_16/mlp": 0.005406845826655626, "grad/layer_16/attn_mlp_ratio": 1.4807179709661114, "grad/layer_20/attn": 0.005137884523719549, "grad/layer_20/mlp": 0.0066733225248754025, "grad/layer_20/attn_mlp_ratio": 0.7699140012454408, "grad/layer_24/attn": 0.005872831214219332, "grad/layer_24/mlp": 0.009494735859334469, "grad/layer_24/attn_mlp_ratio": 0.6185354958128806, "grad/layer_27/attn": 0.011112921871244907, "grad/layer_27/mlp": 0.008390645496547222, "grad/layer_27/attn_mlp_ratio": 1.3244418136094214} {"step": 10650, "timestamp": 1778206046.0862892, "eos/sharpness": 11.802482604980467, "eos/L0_probe": 2.1375646591186523, "eos/L_plus": 2.2068467140197754, "eos/L_minus": 2.186307430267334, "eos/grad_norm": 0.11238029599189758, "eos/embed_grad_frac": 0.21469250321388245, "eos/time_s": 0.6136329174041748} {"step": 10650, "timestamp": 1778206046.1075356, "train/loss": 2.346300792694092, "train/z_loss": 0.0017645229701884091, "train/perplexity": 10.446853083187179, "train/grad_norm": 0.1123046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1910312.76788601, "perf/iters_per_sec": 0.910908111517911, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0978055715560913, "data/tokens_consumed": 22336765952, "data/tokens_consumed_B": 22.336765952, "train/loss_slope": -2.3797542055269078e-05} {"step": 10650, "timestamp": 1778206047.4708, "geo/rankme_last": 438.68609619140625, "geo/layer_0/stable_rank_q_proj": 15.137941360473633, "geo/layer_0/stable_rank_k_proj": 12.879937171936035, "geo/layer_0/stable_rank_o_proj": 55.1544189453125, "geo/layer_0/stable_rank_gate_proj": 165.31927490234375, "geo/layer_0/stable_rank_down_proj": 48.009071350097656, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.03683708235621452, "geo/layer_0/attn_entropy_mean": 6.366219520568848, "geo/layer_0/attn_entropy_std": 0.26977407932281494, "geo/layer_7/stable_rank_q_proj": 43.35921859741211, "geo/layer_7/stable_rank_k_proj": 43.610877990722656, "geo/layer_7/stable_rank_o_proj": 107.75202178955078, "geo/layer_7/stable_rank_gate_proj": 124.10528564453125, "geo/layer_7/stable_rank_down_proj": 166.84580993652344, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.580413818359375, "geo/layer_7/attn_entropy_mean": 4.687034606933594, "geo/layer_7/attn_entropy_std": 0.9326372742652893, "geo/layer_14/stable_rank_q_proj": 63.00801467895508, "geo/layer_14/stable_rank_k_proj": 41.191009521484375, "geo/layer_14/stable_rank_o_proj": 48.930477142333984, "geo/layer_14/stable_rank_gate_proj": 109.8193588256836, "geo/layer_14/stable_rank_down_proj": 138.77078247070312, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3952310085296631, "geo/layer_14/attn_entropy_mean": 5.662686347961426, "geo/layer_14/attn_entropy_std": 0.5862671136856079, "geo/layer_21/stable_rank_q_proj": 51.97134017944336, "geo/layer_21/stable_rank_k_proj": 32.00402069091797, "geo/layer_21/stable_rank_o_proj": 91.29537963867188, "geo/layer_21/stable_rank_gate_proj": 107.178466796875, "geo/layer_21/stable_rank_down_proj": 71.25918579101562, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.16070561110973358, "geo/layer_21/attn_entropy_mean": 5.819387912750244, "geo/layer_21/attn_entropy_std": 0.29206976294517517, "geo/layer_27/stable_rank_q_proj": 42.452484130859375, "geo/layer_27/stable_rank_k_proj": 32.52860641479492, "geo/layer_27/stable_rank_o_proj": 114.25211334228516, "geo/layer_27/stable_rank_gate_proj": 95.04230499267578, "geo/layer_27/stable_rank_down_proj": 150.2143096923828, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.06836209446191788, "geo/layer_27/attn_entropy_mean": 4.517065048217773, "geo/layer_27/attn_entropy_std": 0.4960428774356842, "attnres/final_alpha/block_0": 0.24720458686351776, "attnres/block_norm/0": 1.4275919198989868, "attnres/final_alpha/block_1": 0.009246058762073517, "attnres/block_norm/1": 18269.83984375, "attnres/final_alpha/block_2": 0.018996931612491608, "attnres/block_norm/2": 13480.369140625, "attnres/final_alpha/block_3": 0.020337730646133423, "attnres/block_norm/3": 15224.0263671875, "attnres/final_alpha/block_4": 0.028694093227386475, "attnres/block_norm/4": 5821.931640625, "attnres/final_alpha/block_5": 0.5105628371238708, "attnres/block_norm/5": 3957.82861328125, "attnres/final_alpha/block_6": 0.16495776176452637, "attnres/block_norm/6": 10947.26953125, "geo/tier1_time_s": 1.3595972061157227, "geo/step": 10650.0, "geo/rankme_slope": 0.0016913409113645458} {"step": 10660, "timestamp": 1778206057.82523, "train/loss": 2.304579567909241, "train/z_loss": 0.0017733517568558454, "train/perplexity": 10.019964652032622, "train/grad_norm": 0.28515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1790256.13177774, "perf/iters_per_sec": 0.8536606463326168, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1714256763458253, "data/tokens_consumed": 22357737472, "data/tokens_consumed_B": 22.357737472, "train/loss_slope": -2.368880225272284e-05} {"step": 10670, "timestamp": 1778206068.1880279, "train/loss": 2.3560676097869875, "train/z_loss": 0.0017581266234628855, "train/perplexity": 10.549385478972962, "train/grad_norm": 0.1328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024749.3345417248, "perf/iters_per_sec": 0.9654757187565445, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0357588291168214, "data/tokens_consumed": 22378708992, "data/tokens_consumed_B": 22.378708992, "train/loss_slope": -2.0200046075202294e-05} {"step": 10680, "timestamp": 1778206078.5706542, "train/loss": 2.29320809841156, "train/z_loss": 0.0017834196682088078, "train/perplexity": 9.906668323359776, "train/grad_norm": 0.220703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021656.2283046301, "perf/iters_per_sec": 0.9640008107684279, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373435258865356, "data/tokens_consumed": 22399680512, "data/tokens_consumed_B": 22.399680512, "train/loss_slope": -2.2290971577435734e-05} {"step": 10690, "timestamp": 1778206088.923379, "train/loss": 2.332736015319824, "train/z_loss": 0.001768502127379179, "train/perplexity": 10.306100642845946, "train/grad_norm": 0.208984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026753.09344044, "perf/iters_per_sec": 0.9664311854555321, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347348213195802, "data/tokens_consumed": 22420652032, "data/tokens_consumed_B": 22.420652032, "train/loss_slope": -2.0716956768384957e-05} {"step": 10700, "timestamp": 1778206099.2790735, "grad/layer_0/attn": 0.002539865206927061, "grad/layer_0/mlp": 0.0025426228530704975, "grad/layer_0/attn_mlp_ratio": 0.9989153932005113, "grad/layer_4/attn": 0.001540526282042265, "grad/layer_4/mlp": 0.0027735941112041473, "grad/layer_4/attn_mlp_ratio": 0.5554259796978197, "grad/layer_8/attn": 0.005680732894688845, "grad/layer_8/mlp": 0.004236025735735893, "grad/layer_8/attn_mlp_ratio": 1.3410524663860965, "grad/layer_12/attn": 0.005028143059462309, "grad/layer_12/mlp": 0.005699045956134796, "grad/layer_12/attn_mlp_ratio": 0.8822780180991366, "grad/layer_16/attn": 0.004860800225287676, "grad/layer_16/mlp": 0.004857216961681843, "grad/layer_16/attn_mlp_ratio": 1.0007376988840175, "grad/layer_20/attn": 0.004057437181472778, "grad/layer_20/mlp": 0.006246914155781269, "grad/layer_20/attn_mlp_ratio": 0.6495106248205317, "grad/layer_24/attn": 0.006553281098604202, "grad/layer_24/mlp": 0.010161255486309528, "grad/layer_24/attn_mlp_ratio": 0.6449282810515636, "grad/layer_27/attn": 0.006688767112791538, "grad/layer_27/mlp": 0.009896919131278992, "grad/layer_27/attn_mlp_ratio": 0.6758433565519905} {"step": 10700, "timestamp": 1778206099.2964594, "train/loss": 2.373439598083496, "train/z_loss": 0.001758001488633454, "train/perplexity": 10.734250366265771, "train/grad_norm": 0.10986328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023348.8263891214, "perf/iters_per_sec": 0.9648079044290168, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0364757537841798, "data/tokens_consumed": 22441623552, "data/tokens_consumed_B": 22.441623552, "train/loss_slope": -2.0776482712854107e-05} {"step": 10710, "timestamp": 1778206109.6783683, "train/loss": 2.336886763572693, "train/z_loss": 0.001766952022444457, "train/perplexity": 10.348967575460806, "train/grad_norm": 0.19140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020984.755123272, "perf/iters_per_sec": 0.9636806274048195, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037688183784485, "data/tokens_consumed": 22462595072, "data/tokens_consumed_B": 22.462595072, "train/loss_slope": -1.9952409846125294e-05} {"step": 10720, "timestamp": 1778206120.4300442, "train/loss": 2.311324214935303, "train/z_loss": 0.0017763122566975653, "train/perplexity": 10.087774195486023, "train/grad_norm": 0.1845703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1951733.5413551012, "perf/iters_per_sec": 0.9306590754294878, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0745073318481446, "data/tokens_consumed": 22483566592, "data/tokens_consumed_B": 22.483566592, "train/loss_slope": -1.752275205967557e-05} {"step": 10725, "timestamp": 1778206126.7149239, "eos/sharpness": 23.49019050598144, "eos/L0_probe": 2.138715982437134, "eos/L_plus": 2.2408220767974854, "eos/L_minus": 2.2715117931365967, "eos/grad_norm": 0.1602756381034851, "eos/embed_grad_frac": 0.09977451711893082, "eos/time_s": 0.6047372817993164} {"step": 10725, "timestamp": 1778206128.0914323, "geo/rankme_last": 440.1914367675781, "geo/layer_0/stable_rank_q_proj": 15.148838996887207, "geo/layer_0/stable_rank_k_proj": 12.878511428833008, "geo/layer_0/stable_rank_o_proj": 55.018306732177734, "geo/layer_0/stable_rank_gate_proj": 165.61180114746094, "geo/layer_0/stable_rank_down_proj": 48.011417388916016, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.03532781824469566, "geo/layer_0/attn_entropy_mean": 6.366744041442871, "geo/layer_0/attn_entropy_std": 0.2699297070503235, "geo/layer_7/stable_rank_q_proj": 43.523834228515625, "geo/layer_7/stable_rank_k_proj": 43.527530670166016, "geo/layer_7/stable_rank_o_proj": 108.17007446289062, "geo/layer_7/stable_rank_gate_proj": 123.89728546142578, "geo/layer_7/stable_rank_down_proj": 166.65838623046875, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5895859599113464, "geo/layer_7/attn_entropy_mean": 4.67917013168335, "geo/layer_7/attn_entropy_std": 0.929337739944458, "geo/layer_14/stable_rank_q_proj": 63.04369354248047, "geo/layer_14/stable_rank_k_proj": 41.30067443847656, "geo/layer_14/stable_rank_o_proj": 48.87447738647461, "geo/layer_14/stable_rank_gate_proj": 109.2235336303711, "geo/layer_14/stable_rank_down_proj": 139.0258026123047, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.376812607049942, "geo/layer_14/attn_entropy_mean": 5.675370216369629, "geo/layer_14/attn_entropy_std": 0.5923952460289001, "geo/layer_21/stable_rank_q_proj": 52.07331085205078, "geo/layer_21/stable_rank_k_proj": 32.05290222167969, "geo/layer_21/stable_rank_o_proj": 91.22490692138672, "geo/layer_21/stable_rank_gate_proj": 106.86457061767578, "geo/layer_21/stable_rank_down_proj": 71.25383758544922, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1607792228460312, "geo/layer_21/attn_entropy_mean": 5.803847312927246, "geo/layer_21/attn_entropy_std": 0.31299498677253723, "geo/layer_27/stable_rank_q_proj": 42.45579147338867, "geo/layer_27/stable_rank_k_proj": 32.52860641479492, "geo/layer_27/stable_rank_o_proj": 114.23068237304688, "geo/layer_27/stable_rank_gate_proj": 95.22953796386719, "geo/layer_27/stable_rank_down_proj": 150.77996826171875, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.06474693864583969, "geo/layer_27/attn_entropy_mean": 4.505349636077881, "geo/layer_27/attn_entropy_std": 0.5148293375968933, "attnres/final_alpha/block_0": 0.2479732185602188, "attnres/block_norm/0": 1.4300235509872437, "attnres/final_alpha/block_1": 0.009459054097533226, "attnres/block_norm/1": 18270.474609375, "attnres/final_alpha/block_2": 0.0191043633967638, "attnres/block_norm/2": 13529.080078125, "attnres/final_alpha/block_3": 0.020270824432373047, "attnres/block_norm/3": 15273.330078125, "attnres/final_alpha/block_4": 0.028743697330355644, "attnres/block_norm/4": 5855.7666015625, "attnres/final_alpha/block_5": 0.5082457065582275, "attnres/block_norm/5": 4005.1025390625, "attnres/final_alpha/block_6": 0.16620314121246338, "attnres/block_norm/6": 10948.578125, "geo/tier1_time_s": 1.3568296432495117, "geo/step": 10725.0, "geo/rankme_slope": 0.0016405527640743797} {"step": 10730, "timestamp": 1778206133.2737253, "train/loss": 2.3255935192108153, "train/z_loss": 0.0017684040241874755, "train/perplexity": 10.232751618492195, "train/grad_norm": 0.1376953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1633687.646684274, "perf/iters_per_sec": 0.7790029748364802, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2836921453475951, "data/tokens_consumed": 22504538112, "data/tokens_consumed_B": 22.504538112, "train/loss_slope": -1.5064608639437569e-05} {"step": 10740, "timestamp": 1778206143.6270728, "train/loss": 2.3570720911026, "train/z_loss": 0.0017580606741830706, "train/perplexity": 10.559987463434, "train/grad_norm": 0.1875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026767.150075935, "perf/iters_per_sec": 0.9664378881816554, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034727644920349, "data/tokens_consumed": 22525509632, "data/tokens_consumed_B": 22.525509632, "train/loss_slope": -1.4345032525713523e-05} {"step": 10750, "timestamp": 1778206153.965709, "grad/layer_0/attn": 0.003047916805371642, "grad/layer_0/mlp": 0.002948263194411993, "grad/layer_0/attn_mlp_ratio": 1.033800750139422, "grad/layer_4/attn": 0.0014089741744101048, "grad/layer_4/mlp": 0.0027358972001820803, "grad/layer_4/attn_mlp_ratio": 0.5149952720507215, "grad/layer_8/attn": 0.006361219100654125, "grad/layer_8/mlp": 0.003965144045650959, "grad/layer_8/attn_mlp_ratio": 1.6042844514571368, "grad/layer_12/attn": 0.005166091490536928, "grad/layer_12/mlp": 0.0055071027018129826, "grad/layer_12/attn_mlp_ratio": 0.9380779107366962, "grad/layer_16/attn": 0.007006836589425802, "grad/layer_16/mlp": 0.0048875343054533005, "grad/layer_16/attn_mlp_ratio": 1.4336137627200898, "grad/layer_20/attn": 0.004064024426043034, "grad/layer_20/mlp": 0.006073087453842163, "grad/layer_20/attn_mlp_ratio": 0.6691858778607452, "grad/layer_24/attn": 0.014959371648728848, "grad/layer_24/mlp": 0.010009890422224998, "grad/layer_24/attn_mlp_ratio": 1.49445906681142, "grad/layer_27/attn": 0.004520590882748365, "grad/layer_27/mlp": 0.008307513780891895, "grad/layer_27/attn_mlp_ratio": 0.5441568858700522} {"step": 10750, "timestamp": 1778206153.9818227, "train/loss": 2.326359438896179, "train/z_loss": 0.0017584218876436353, "train/perplexity": 10.240592086593375, "train/grad_norm": 0.1650390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026582.6547389852, "perf/iters_per_sec": 0.9663499139494826, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348218441009522, "data/tokens_consumed": 22546481152, "data/tokens_consumed_B": 22.546481152, "train/loss_slope": -1.542735541864737e-05} {"step": 10760, "timestamp": 1778206164.336944, "train/loss": 2.3144449949264527, "train/z_loss": 0.0017661540885455906, "train/perplexity": 10.119305094258879, "train/grad_norm": 0.375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026446.5111572025, "perf/iters_per_sec": 0.9662849956308377, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348913669586182, "data/tokens_consumed": 22567452672, "data/tokens_consumed_B": 22.567452672, "train/loss_slope": -1.7786489676113035e-05} {"step": 10770, "timestamp": 1778206174.689224, "train/loss": 2.3515648365020754, "train/z_loss": 0.0017578276223503054, "train/perplexity": 10.50199077175391, "train/grad_norm": 0.14453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027022.2114689068, "perf/iters_per_sec": 0.9665595109314474, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345974445343018, "data/tokens_consumed": 22588424192, "data/tokens_consumed_B": 22.588424192, "train/loss_slope": -1.5962886009136224e-05} {"step": 10780, "timestamp": 1778206185.488517, "train/loss": 2.3479872226715086, "train/z_loss": 0.0017588030779734254, "train/perplexity": 10.464485833416589, "train/grad_norm": 0.1611328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1943031.0088891846, "perf/iters_per_sec": 0.9265093845792697, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0793198823928833, "data/tokens_consumed": 22609395712, "data/tokens_consumed_B": 22.609395712, "train/loss_slope": -1.4124993551181632e-05} {"step": 10790, "timestamp": 1778206196.323379, "train/loss": 2.3569101810455324, "train/z_loss": 0.001751103310380131, "train/perplexity": 10.558277833668022, "train/grad_norm": 0.1611328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1936597.87873177, "perf/iters_per_sec": 0.9234418290766573, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0829052448272705, "data/tokens_consumed": 22630367232, "data/tokens_consumed_B": 22.630367232, "train/loss_slope": -1.1602465297379687e-05} {"step": 10800, "timestamp": 1778206206.6753244, "grad/layer_0/attn": 0.0029074992053210735, "grad/layer_0/mlp": 0.002763408236205578, "grad/layer_0/attn_mlp_ratio": 1.052142445699265, "grad/layer_4/attn": 0.0034584319218993187, "grad/layer_4/mlp": 0.003072926076129079, "grad/layer_4/attn_mlp_ratio": 1.1254523290422342, "grad/layer_8/attn": 0.004481440410017967, "grad/layer_8/mlp": 0.004284495487809181, "grad/layer_8/attn_mlp_ratio": 1.0459668631166665, "grad/layer_12/attn": 0.005227971822023392, "grad/layer_12/mlp": 0.007479493971914053, "grad/layer_12/attn_mlp_ratio": 0.6989739909888742, "grad/layer_16/attn": 0.005392962601035833, "grad/layer_16/mlp": 0.005192113574594259, "grad/layer_16/attn_mlp_ratio": 1.0386834609235072, "grad/layer_20/attn": 0.010938718914985657, "grad/layer_20/mlp": 0.006906306836754084, "grad/layer_20/attn_mlp_ratio": 1.5838738438877986, "grad/layer_24/attn": 0.008599905297160149, "grad/layer_24/mlp": 0.010849152691662312, "grad/layer_24/attn_mlp_ratio": 0.7926798951314675, "grad/layer_27/attn": 0.005153970327228308, "grad/layer_27/mlp": 0.009184121154248714, "grad/layer_27/attn_mlp_ratio": 0.5611827397034858} {"step": 10800, "timestamp": 1778206207.2983825, "eos/sharpness": 13.286304473876951, "eos/L0_probe": 2.1402018070220947, "eos/L_plus": 2.191986322402954, "eos/L_minus": 2.221280336380005, "eos/grad_norm": 0.13307633996009827, "eos/embed_grad_frac": 0.19342635571956635, "eos/time_s": 0.6202147006988525} {"step": 10800, "timestamp": 1778206207.3265545, "train/loss": 2.3080039978027345, "train/z_loss": 0.001774039212614298, "train/perplexity": 10.054336136299169, "train/grad_norm": 0.1328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1907069.9213001586, "perf/iters_per_sec": 0.9093618017674249, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0996723175048828, "data/tokens_consumed": 22651338752, "data/tokens_consumed_B": 22.651338752, "train/loss_slope": -1.1624797528619159e-05} {"step": 10800, "timestamp": 1778206208.6876237, "geo/rankme_last": 439.5686950683594, "geo/layer_0/stable_rank_q_proj": 15.175235748291016, "geo/layer_0/stable_rank_k_proj": 12.897367477416992, "geo/layer_0/stable_rank_o_proj": 55.013004302978516, "geo/layer_0/stable_rank_gate_proj": 166.25494384765625, "geo/layer_0/stable_rank_down_proj": 48.0054817199707, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.03805994987487793, "geo/layer_0/attn_entropy_mean": 6.36610221862793, "geo/layer_0/attn_entropy_std": 0.27419376373291016, "geo/layer_7/stable_rank_q_proj": 43.652103424072266, "geo/layer_7/stable_rank_k_proj": 43.71999740600586, "geo/layer_7/stable_rank_o_proj": 108.51834106445312, "geo/layer_7/stable_rank_gate_proj": 123.73953247070312, "geo/layer_7/stable_rank_down_proj": 166.24412536621094, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5993820428848267, "geo/layer_7/attn_entropy_mean": 4.691438674926758, "geo/layer_7/attn_entropy_std": 0.9351417422294617, "geo/layer_14/stable_rank_q_proj": 63.04011535644531, "geo/layer_14/stable_rank_k_proj": 40.926761627197266, "geo/layer_14/stable_rank_o_proj": 48.949066162109375, "geo/layer_14/stable_rank_gate_proj": 109.13465881347656, "geo/layer_14/stable_rank_down_proj": 139.49607849121094, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3959629535675049, "geo/layer_14/attn_entropy_mean": 5.623497009277344, "geo/layer_14/attn_entropy_std": 0.5749169588088989, "geo/layer_21/stable_rank_q_proj": 52.011356353759766, "geo/layer_21/stable_rank_k_proj": 32.18043518066406, "geo/layer_21/stable_rank_o_proj": 91.49203491210938, "geo/layer_21/stable_rank_gate_proj": 107.00444030761719, "geo/layer_21/stable_rank_down_proj": 71.08793640136719, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.16053712368011475, "geo/layer_21/attn_entropy_mean": 5.808553218841553, "geo/layer_21/attn_entropy_std": 0.29734447598457336, "geo/layer_27/stable_rank_q_proj": 42.50319290161133, "geo/layer_27/stable_rank_k_proj": 32.4457893371582, "geo/layer_27/stable_rank_o_proj": 113.65660858154297, "geo/layer_27/stable_rank_gate_proj": 95.25067138671875, "geo/layer_27/stable_rank_down_proj": 150.54371643066406, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0690966472029686, "geo/layer_27/attn_entropy_mean": 4.481773376464844, "geo/layer_27/attn_entropy_std": 0.5034539699554443, "attnres/final_alpha/block_0": 0.24944071471691132, "attnres/block_norm/0": 1.4323313236236572, "attnres/final_alpha/block_1": 0.009475143626332283, "attnres/block_norm/1": 18407.71875, "attnres/final_alpha/block_2": 0.019303325563669205, "attnres/block_norm/2": 13608.51171875, "attnres/final_alpha/block_3": 0.020360630005598068, "attnres/block_norm/3": 15324.5986328125, "attnres/final_alpha/block_4": 0.029023367911577225, "attnres/block_norm/4": 5908.826171875, "attnres/final_alpha/block_5": 0.5065287947654724, "attnres/block_norm/5": 4018.385009765625, "attnres/final_alpha/block_6": 0.16586799919605255, "attnres/block_norm/6": 11049.9482421875, "geo/tier1_time_s": 1.3570358753204346, "geo/step": 10800.0, "geo/rankme_slope": 0.0015835992795555723} {"step": 10810, "timestamp": 1778206219.0344503, "train/loss": 2.3250842094421387, "train/z_loss": 0.0017619692371226847, "train/perplexity": 10.227541305076842, "train/grad_norm": 0.25, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1791797.642553758, "perf/iters_per_sec": 0.8543956959503928, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.170417881011963, "data/tokens_consumed": 22672310272, "data/tokens_consumed_B": 22.672310272, "train/loss_slope": -9.834871727033265e-06} {"step": 10820, "timestamp": 1778206229.379834, "train/loss": 2.30530104637146, "train/z_loss": 0.0017701247823424637, "train/perplexity": 10.027196449200511, "train/grad_norm": 0.29296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027900.0738617212, "perf/iters_per_sec": 0.9669781083401304, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341495752334595, "data/tokens_consumed": 22693281792, "data/tokens_consumed_B": 22.693281792, "train/loss_slope": -9.490772642747382e-06} {"step": 10830, "timestamp": 1778206239.7343981, "train/loss": 2.304394030570984, "train/z_loss": 0.0017755964305251836, "train/perplexity": 10.01810574691514, "train/grad_norm": 0.224609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026703.639917096, "perf/iters_per_sec": 0.9664076041779975, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034760069847107, "data/tokens_consumed": 22714253312, "data/tokens_consumed_B": 22.714253312, "train/loss_slope": -1.1232389260654612e-05} {"step": 10840, "timestamp": 1778206250.0841913, "train/loss": 2.3190792322158815, "train/z_loss": 0.0017581265652552246, "train/perplexity": 10.166309185203072, "train/grad_norm": 0.150390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027199.0312985757, "perf/iters_per_sec": 0.9666438251965407, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345072031021119, "data/tokens_consumed": 22735224832, "data/tokens_consumed_B": 22.735224832, "train/loss_slope": -9.65923149951258e-06} {"step": 10850, "timestamp": 1778206260.4242551, "grad/layer_0/attn": 0.003391055390238762, "grad/layer_0/mlp": 0.0031772188376635313, "grad/layer_0/attn_mlp_ratio": 1.0673030271978934, "grad/layer_4/attn": 0.0024049440398812294, "grad/layer_4/mlp": 0.0027034145314246416, "grad/layer_4/attn_mlp_ratio": 0.8895949633201014, "grad/layer_8/attn": 0.008571119979023933, "grad/layer_8/mlp": 0.004351944662630558, "grad/layer_8/attn_mlp_ratio": 1.9694919045440897, "grad/layer_12/attn": 0.0048524485900998116, "grad/layer_12/mlp": 0.006591178942471743, "grad/layer_12/attn_mlp_ratio": 0.7362034256438749, "grad/layer_16/attn": 0.006107744760811329, "grad/layer_16/mlp": 0.005364800803363323, "grad/layer_16/attn_mlp_ratio": 1.1384848889699217, "grad/layer_20/attn": 0.004694988951086998, "grad/layer_20/mlp": 0.00683075375854969, "grad/layer_20/attn_mlp_ratio": 0.6873310103555453, "grad/layer_24/attn": 0.018206315115094185, "grad/layer_24/mlp": 0.011568103916943073, "grad/layer_24/attn_mlp_ratio": 1.5738374316507306, "grad/layer_27/attn": 0.007189778611063957, "grad/layer_27/mlp": 0.010939644649624825, "grad/layer_27/attn_mlp_ratio": 0.657222311657838} {"step": 10850, "timestamp": 1778206260.4399507, "train/loss": 2.3026246070861816, "train/z_loss": 0.0017785211792215704, "train/perplexity": 10.000395148728279, "train/grad_norm": 0.1962890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026515.2809888313, "perf/iters_per_sec": 0.9663177876419217, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348562479019165, "data/tokens_consumed": 22756196352, "data/tokens_consumed_B": 22.756196352, "train/loss_slope": -9.050750632276088e-06} {"step": 10860, "timestamp": 1778206270.7937121, "train/loss": 2.2827265739440916, "train/z_loss": 0.0017789907637052239, "train/perplexity": 9.803373625570005, "train/grad_norm": 0.19921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026914.8265818267, "perf/iters_per_sec": 0.9665083058270582, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346522569656371, "data/tokens_consumed": 22777167872, "data/tokens_consumed_B": 22.777167872, "train/loss_slope": -1.1368621593833616e-05} {"step": 10870, "timestamp": 1778206281.1521237, "train/loss": 2.294410967826843, "train/z_loss": 0.001768711896147579, "train/perplexity": 9.91859192152139, "train/grad_norm": 0.17578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026026.0569182, "perf/iters_per_sec": 0.9660845074263573, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351061344146728, "data/tokens_consumed": 22798139392, "data/tokens_consumed_B": 22.798139392, "train/loss_slope": -1.1028887035966225e-05} {"step": 10875, "timestamp": 1778206286.9227881, "eos/sharpness": 28.526949882507317, "eos/L0_probe": 2.135601282119751, "eos/L_plus": 2.241690158843994, "eos/L_minus": 2.314781904220581, "eos/grad_norm": 0.16880857944488525, "eos/embed_grad_frac": 0.12225902825593948, "eos/time_s": 0.6040003299713135} {"step": 10875, "timestamp": 1778206288.2972414, "geo/rankme_last": 439.67120361328125, "geo/layer_0/stable_rank_q_proj": 15.177047729492188, "geo/layer_0/stable_rank_k_proj": 12.911261558532715, "geo/layer_0/stable_rank_o_proj": 55.104087829589844, "geo/layer_0/stable_rank_gate_proj": 165.5686798095703, "geo/layer_0/stable_rank_down_proj": 47.924171447753906, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.036847103387117386, "geo/layer_0/attn_entropy_mean": 6.366535663604736, "geo/layer_0/attn_entropy_std": 0.27336207032203674, "geo/layer_7/stable_rank_q_proj": 43.562255859375, "geo/layer_7/stable_rank_k_proj": 43.702545166015625, "geo/layer_7/stable_rank_o_proj": 108.27027893066406, "geo/layer_7/stable_rank_gate_proj": 123.36217498779297, "geo/layer_7/stable_rank_down_proj": 165.6096954345703, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5894346833229065, "geo/layer_7/attn_entropy_mean": 4.680944442749023, "geo/layer_7/attn_entropy_std": 0.9191476106643677, "geo/layer_14/stable_rank_q_proj": 63.08992385864258, "geo/layer_14/stable_rank_k_proj": 40.78263854980469, "geo/layer_14/stable_rank_o_proj": 48.87162780761719, "geo/layer_14/stable_rank_gate_proj": 108.64842987060547, "geo/layer_14/stable_rank_down_proj": 139.18638610839844, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37707215547561646, "geo/layer_14/attn_entropy_mean": 5.673288345336914, "geo/layer_14/attn_entropy_std": 0.5704687833786011, "geo/layer_21/stable_rank_q_proj": 52.229434967041016, "geo/layer_21/stable_rank_k_proj": 32.208946228027344, "geo/layer_21/stable_rank_o_proj": 91.64794158935547, "geo/layer_21/stable_rank_gate_proj": 106.66180419921875, "geo/layer_21/stable_rank_down_proj": 70.91168975830078, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.16116182506084442, "geo/layer_21/attn_entropy_mean": 5.809725284576416, "geo/layer_21/attn_entropy_std": 0.29033374786376953, "geo/layer_27/stable_rank_q_proj": 42.459983825683594, "geo/layer_27/stable_rank_k_proj": 32.663246154785156, "geo/layer_27/stable_rank_o_proj": 113.39374542236328, "geo/layer_27/stable_rank_gate_proj": 95.2581558227539, "geo/layer_27/stable_rank_down_proj": 150.10452270507812, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.06931616365909576, "geo/layer_27/attn_entropy_mean": 4.502099990844727, "geo/layer_27/attn_entropy_std": 0.49499478936195374, "attnres/final_alpha/block_0": 0.24639439582824707, "attnres/block_norm/0": 1.4346892833709717, "attnres/final_alpha/block_1": 0.00931390468031168, "attnres/block_norm/1": 18413.5390625, "attnres/final_alpha/block_2": 0.018744098022580147, "attnres/block_norm/2": 13773.12109375, "attnres/final_alpha/block_3": 0.02029922604560852, "attnres/block_norm/3": 15427.193359375, "attnres/final_alpha/block_4": 0.02854962646961212, "attnres/block_norm/4": 5939.63671875, "attnres/final_alpha/block_5": 0.5142244696617126, "attnres/block_norm/5": 3993.9462890625, "attnres/final_alpha/block_6": 0.16247430443763733, "attnres/block_norm/6": 11161.90625, "geo/tier1_time_s": 1.3553383350372314, "geo/step": 10875.0, "geo/rankme_slope": 0.0015277392597664065} {"step": 10880, "timestamp": 1778206293.4772332, "train/loss": 2.3423729658126833, "train/z_loss": 0.0017523352871648967, "train/perplexity": 10.405900133520424, "train/grad_norm": 0.171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1702527.6349497277, "perf/iters_per_sec": 0.8118284392117155, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.231787347793579, "data/tokens_consumed": 22819110912, "data/tokens_consumed_B": 22.819110912, "train/loss_slope": -6.273660179090156e-06} {"step": 10890, "timestamp": 1778206303.8594913, "train/loss": 2.3150338172912597, "train/z_loss": 0.0017742188181728124, "train/perplexity": 10.125265322000173, "train/grad_norm": 0.1884765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022674.7867005526, "perf/iters_per_sec": 0.9644864972594035, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036821150779724, "data/tokens_consumed": 22840082432, "data/tokens_consumed_B": 22.840082432, "train/loss_slope": -5.522179825328175e-06} {"step": 10900, "timestamp": 1778206314.2022266, "grad/layer_0/attn": 0.003042382886633277, "grad/layer_0/mlp": 0.002872319659218192, "grad/layer_0/attn_mlp_ratio": 1.0592075888728258, "grad/layer_4/attn": 0.0016085897805169225, "grad/layer_4/mlp": 0.0027400346007198095, "grad/layer_4/attn_mlp_ratio": 0.5870691272976741, "grad/layer_8/attn": 0.010094299912452698, "grad/layer_8/mlp": 0.004305860958993435, "grad/layer_8/attn_mlp_ratio": 2.344316217860593, "grad/layer_12/attn": 0.004441958852112293, "grad/layer_12/mlp": 0.005914340727031231, "grad/layer_12/attn_mlp_ratio": 0.7510488458511756, "grad/layer_16/attn": 0.005389348603785038, "grad/layer_16/mlp": 0.004187413025647402, "grad/layer_16/attn_mlp_ratio": 1.2870353227810092, "grad/layer_20/attn": 0.008970336057245731, "grad/layer_20/mlp": 0.0058967843651771545, "grad/layer_20/attn_mlp_ratio": 1.5212250185196892, "grad/layer_24/attn": 0.01065564714372158, "grad/layer_24/mlp": 0.01313394121825695, "grad/layer_24/attn_mlp_ratio": 0.8113061331338222, "grad/layer_27/attn": 0.011508798226714134, "grad/layer_27/mlp": 0.01083292718976736, "grad/layer_27/attn_mlp_ratio": 1.0623904249394522} {"step": 10900, "timestamp": 1778206314.2185853, "train/loss": 2.3575469255447388, "train/z_loss": 0.0017527201562188566, "train/perplexity": 10.565002899846931, "train/grad_norm": 0.1611328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025442.3872409163, "perf/iters_per_sec": 0.965806192036112, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035404419898987, "data/tokens_consumed": 22861053952, "data/tokens_consumed_B": 22.861053952, "train/loss_slope": -1.120363825475616e-06} {"step": 10910, "timestamp": 1778206324.5719464, "train/loss": 2.28982527256012, "train/z_loss": 0.0017673107096925378, "train/perplexity": 9.873212409323731, "train/grad_norm": 0.2353515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026741.7455347579, "perf/iters_per_sec": 0.966425774352435, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347406148910523, "data/tokens_consumed": 22882025472, "data/tokens_consumed_B": 22.882025472, "train/loss_slope": -2.233395031397714e-06} {"step": 10920, "timestamp": 1778206334.9469001, "train/loss": 2.25774347782135, "train/z_loss": 0.001784190523903817, "train/perplexity": 9.561489093207552, "train/grad_norm": 0.16015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022322.1033969065, "perf/iters_per_sec": 0.9643183247551472, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0370019674301147, "data/tokens_consumed": 22902996992, "data/tokens_consumed_B": 22.902996992, "train/loss_slope": -5.919560848182332e-06} {"step": 10930, "timestamp": 1778206345.2940607, "train/loss": 2.3091037273406982, "train/z_loss": 0.0017617168836295606, "train/perplexity": 10.065399268844711, "train/grad_norm": 0.251953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027929.2944909518, "perf/iters_per_sec": 0.9669920418219337, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341346740722657, "data/tokens_consumed": 22923968512, "data/tokens_consumed_B": 22.923968512, "train/loss_slope": -4.820345086876905e-06} {"step": 10940, "timestamp": 1778206355.643423, "train/loss": 2.309095025062561, "train/z_loss": 0.0017575280857272447, "train/perplexity": 10.065311677321835, "train/grad_norm": 0.197265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027427.564648694, "perf/iters_per_sec": 0.9667527983897657, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343905925750732, "data/tokens_consumed": 22944940032, "data/tokens_consumed_B": 22.944940032, "train/loss_slope": -4.017341340800689e-06} {"step": 10950, "timestamp": 1778206365.98401, "grad/layer_0/attn": 0.002815152984112501, "grad/layer_0/mlp": 0.0028061524499207735, "grad/layer_0/attn_mlp_ratio": 1.0032073930520926, "grad/layer_4/attn": 0.0014349654084071517, "grad/layer_4/mlp": 0.002778035821393132, "grad/layer_4/attn_mlp_ratio": 0.5165395441278325, "grad/layer_8/attn": 0.004871957935392857, "grad/layer_8/mlp": 0.00401565944775939, "grad/layer_8/attn_mlp_ratio": 1.2132397872502052, "grad/layer_12/attn": 0.004218738060444593, "grad/layer_12/mlp": 0.005633353255689144, "grad/layer_12/attn_mlp_ratio": 0.7488857513587488, "grad/layer_16/attn": 0.004838663153350353, "grad/layer_16/mlp": 0.00473319785669446, "grad/layer_16/attn_mlp_ratio": 1.0222820168564317, "grad/layer_20/attn": 0.007476918864995241, "grad/layer_20/mlp": 0.006690767128020525, "grad/layer_20/attn_mlp_ratio": 1.1174979804531775, "grad/layer_24/attn": 0.010273746214807034, "grad/layer_24/mlp": 0.0127568943426013, "grad/layer_24/attn_mlp_ratio": 0.8053485322021745, "grad/layer_27/attn": 0.00596644589677453, "grad/layer_27/mlp": 0.011540789157152176, "grad/layer_27/attn_mlp_ratio": 0.516987682889794} {"step": 10950, "timestamp": 1778206366.5945842, "eos/sharpness": 19.660258293151852, "eos/L0_probe": 2.1328916549682617, "eos/L_plus": 2.217538833618164, "eos/L_minus": 2.244847059249878, "eos/grad_norm": 0.14708837866783142, "eos/embed_grad_frac": 0.15027162432670593, "eos/time_s": 0.6076576709747314} {"step": 10950, "timestamp": 1778206366.6138732, "train/loss": 2.2355833530426024, "train/z_loss": 0.0017889226088300346, "train/perplexity": 9.351935741439519, "train/grad_norm": 0.1474609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1912634.9976295624, "perf/iters_per_sec": 0.9120154369495213, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0964726686477662, "data/tokens_consumed": 22965911552, "data/tokens_consumed_B": 22.965911552, "train/loss_slope": -6.6029044768490985e-06} {"step": 10950, "timestamp": 1778206367.975222, "geo/rankme_last": 440.96539306640625, "geo/layer_0/stable_rank_q_proj": 15.187018394470215, "geo/layer_0/stable_rank_k_proj": 12.913538932800293, "geo/layer_0/stable_rank_o_proj": 55.27513122558594, "geo/layer_0/stable_rank_gate_proj": 165.06405639648438, "geo/layer_0/stable_rank_down_proj": 47.9270133972168, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0369097925722599, "geo/layer_0/attn_entropy_mean": 6.3698577880859375, "geo/layer_0/attn_entropy_std": 0.267829954624176, "geo/layer_7/stable_rank_q_proj": 43.415828704833984, "geo/layer_7/stable_rank_k_proj": 43.79187774658203, "geo/layer_7/stable_rank_o_proj": 108.23995208740234, "geo/layer_7/stable_rank_gate_proj": 122.99016571044922, "geo/layer_7/stable_rank_down_proj": 165.10443115234375, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5977318286895752, "geo/layer_7/attn_entropy_mean": 4.654312610626221, "geo/layer_7/attn_entropy_std": 0.9212583303451538, "geo/layer_14/stable_rank_q_proj": 62.856388092041016, "geo/layer_14/stable_rank_k_proj": 40.597957611083984, "geo/layer_14/stable_rank_o_proj": 49.02568054199219, "geo/layer_14/stable_rank_gate_proj": 108.05513763427734, "geo/layer_14/stable_rank_down_proj": 139.02500915527344, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39653894305229187, "geo/layer_14/attn_entropy_mean": 5.694265842437744, "geo/layer_14/attn_entropy_std": 0.5652389526367188, "geo/layer_21/stable_rank_q_proj": 52.1511116027832, "geo/layer_21/stable_rank_k_proj": 32.1253776550293, "geo/layer_21/stable_rank_o_proj": 91.41057586669922, "geo/layer_21/stable_rank_gate_proj": 106.13921356201172, "geo/layer_21/stable_rank_down_proj": 70.83477020263672, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.16039365530014038, "geo/layer_21/attn_entropy_mean": 5.792967796325684, "geo/layer_21/attn_entropy_std": 0.2951616644859314, "geo/layer_27/stable_rank_q_proj": 42.50053405761719, "geo/layer_27/stable_rank_k_proj": 32.49365997314453, "geo/layer_27/stable_rank_o_proj": 113.80596160888672, "geo/layer_27/stable_rank_gate_proj": 95.12755584716797, "geo/layer_27/stable_rank_down_proj": 149.701416015625, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.06549695879220963, "geo/layer_27/attn_entropy_mean": 4.497754096984863, "geo/layer_27/attn_entropy_std": 0.47158247232437134, "attnres/final_alpha/block_0": 0.2480461299419403, "attnres/block_norm/0": 1.4370005130767822, "attnres/final_alpha/block_1": 0.009216509759426117, "attnres/block_norm/1": 18658.14453125, "attnres/final_alpha/block_2": 0.01857680454850197, "attnres/block_norm/2": 13914.08203125, "attnres/final_alpha/block_3": 0.02032475918531418, "attnres/block_norm/3": 15583.974609375, "attnres/final_alpha/block_4": 0.028625287115573883, "attnres/block_norm/4": 5970.65673828125, "attnres/final_alpha/block_5": 0.5115652084350586, "attnres/block_norm/5": 4003.513916015625, "attnres/final_alpha/block_6": 0.16364529728889465, "attnres/block_norm/6": 11208.970703125, "geo/tier1_time_s": 1.3576877117156982, "geo/step": 10950.0, "geo/rankme_slope": 0.0014770404060061525} {"step": 10960, "timestamp": 1778206378.3250594, "train/loss": 2.331566309928894, "train/z_loss": 0.0017696524038910866, "train/perplexity": 10.294052589074953, "train/grad_norm": 0.275390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1791276.6527901893, "perf/iters_per_sec": 0.8541472686720797, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1707582950592041, "data/tokens_consumed": 22986883072, "data/tokens_consumed_B": 22.986883072, "train/loss_slope": -8.011739374888144e-06} {"step": 10970, "timestamp": 1778206388.6838229, "train/loss": 2.2713287830352784, "train/z_loss": 0.0017729552462697029, "train/perplexity": 9.692271186738939, "train/grad_norm": 0.140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026048.316827487, "perf/iters_per_sec": 0.9660951217782435, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350947618484496, "data/tokens_consumed": 23007854592, "data/tokens_consumed_B": 23.007854592, "train/loss_slope": -1.1885129944039965e-05} {"step": 10980, "timestamp": 1778206399.0416822, "train/loss": 2.2858724355697633, "train/z_loss": 0.0017747409408912062, "train/perplexity": 9.834262242644416, "train/grad_norm": 0.205078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025916.957770074, "perf/iters_per_sec": 0.9660324848986025, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351618766784667, "data/tokens_consumed": 23028826112, "data/tokens_consumed_B": 23.028826112, "train/loss_slope": -1.4437544385198726e-05} {"step": 10990, "timestamp": 1778206409.3989472, "train/loss": 2.2858419895172117, "train/z_loss": 0.0017680718447081745, "train/perplexity": 9.833962832737319, "train/grad_norm": 0.1708984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025898.386886947, "perf/iters_per_sec": 0.9660236296114669, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351713657379151, "data/tokens_consumed": 23049797632, "data/tokens_consumed_B": 23.049797632, "train/loss_slope": -1.4319326381872212e-05} {"step": 11000, "timestamp": 1778206419.747406, "grad/layer_0/attn": 0.003085255855694413, "grad/layer_0/mlp": 0.002989103551954031, "grad/layer_0/attn_mlp_ratio": 1.0321675709296751, "grad/layer_4/attn": 0.0019503901712596416, "grad/layer_4/mlp": 0.002817087108269334, "grad/layer_4/attn_mlp_ratio": 0.6923428446000633, "grad/layer_8/attn": 0.008086948655545712, "grad/layer_8/mlp": 0.004536449443548918, "grad/layer_8/attn_mlp_ratio": 1.7826603333538231, "grad/layer_12/attn": 0.0045992424711585045, "grad/layer_12/mlp": 0.006248882040381432, "grad/layer_12/attn_mlp_ratio": 0.7360104364006723, "grad/layer_16/attn": 0.00558557640761137, "grad/layer_16/mlp": 0.004503494128584862, "grad/layer_16/attn_mlp_ratio": 1.2402761331763787, "grad/layer_20/attn": 0.005566637963056564, "grad/layer_20/mlp": 0.006518094800412655, "grad/layer_20/attn_mlp_ratio": 0.8540283699619264, "grad/layer_24/attn": 0.01000482402741909, "grad/layer_24/mlp": 0.01148242224007845, "grad/layer_24/attn_mlp_ratio": 0.8713164984794259, "grad/layer_27/attn": 0.007663289085030556, "grad/layer_27/mlp": 0.010304944589734077, "grad/layer_27/attn_mlp_ratio": 0.7436516464435589} {"step": 11000, "timestamp": 1778206419.7635293, "train/loss": 2.3137738704681396, "train/z_loss": 0.0017756435205228626, "train/perplexity": 10.112516059507445, "train/grad_norm": 0.1552734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024483.8949148385, "perf/iters_per_sec": 0.9653491472791855, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0358946323394775, "data/tokens_consumed": 23070769152, "data/tokens_consumed_B": 23.070769152, "train/loss_slope": -1.1545858288755537e-05} {"step": 11000, "timestamp": 1778206426.7648501, "geo/ww_alpha_mean": 8.648158937855348, "geo/ww_alpha_std": 5.793778670675092, "geo/ww_alpha_min": 2.762639812095335, "geo/ww_alpha_max": 43.21693927038919, "geo/ww_alpha_healthy_frac": 0.13705583756345177, "geo/ww_alpha_by_type/q_proj": 4.34596182925347, "geo/ww_alpha_by_type/k_proj": 5.279911487983926, "geo/ww_alpha_by_type/v_proj": 7.815651005555503, "geo/ww_alpha_by_type/o_proj": 10.405930207647588, "geo/ww_alpha_by_type/gate_proj": 9.959781513210194, "geo/ww_alpha_by_type/up_proj": 13.634398628460307, "geo/ww_alpha_by_type/down_proj": 9.259225072452507, "geo/twonn_id/layer_0": 0.7338136434555054, "geo/twonn_id/layer_7": 2.8462586402893066, "geo/twonn_id/layer_14": 4.346555709838867, "geo/twonn_id/layer_21": 6.064110279083252, "geo/twonn_id/layer_27": 5.088542461395264, "geo/tier2_time_s": 6.995450735092163} {"step": 11000, "timestamp": 1778206427.383901, "eoc/jacobian_sigma/layer_0/attn": 524.1353149414062, "eoc/jacobian_sigma/layer_0/mlp": 3305.293701171875, "eoc/jacobian_sigma/layer_0": 3305.293701171875, "eoc/jacobian_sigma/layer_7/attn": 1.1659897565841675, "eoc/jacobian_sigma/layer_7/mlp": 1.6367377042770386, "eoc/jacobian_sigma/layer_7": 1.6367377042770386, "eoc/jacobian_sigma/layer_14/attn": 1.3865282535552979, "eoc/jacobian_sigma/layer_14/mlp": 8.682275772094727, "eoc/jacobian_sigma/layer_14": 8.682275772094727, "eoc/jacobian_sigma/layer_21/attn": 1.0872656106948853, "eoc/jacobian_sigma/layer_21/mlp": 3.2538602352142334, "eoc/jacobian_sigma/layer_21": 3.2538602352142334, "eoc/jacobian_sigma/layer_27/attn": 1.9360790252685547, "eoc/jacobian_sigma/layer_27/mlp": 21.884803771972656, "eoc/jacobian_sigma/layer_27": 21.884803771972656, "eoc/layer0_sigma": 3305.293701171875, "eoc/sigma_max": 21.884803771972656, "eoc/sigma_min": 1.6367377042770386, "eoc/sigma_mean": 8.864419370889664, "eoc/time_s": 0.6129989624023438} {"step": 11010, "timestamp": 1778206437.7534835, "train/loss": 2.3139418601989745, "train/z_loss": 0.0017649695277214051, "train/perplexity": 10.114215001056715, "train/grad_norm": 0.15234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1166038.403910212, "perf/iters_per_sec": 0.5560104388762531, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.7985273838043212, "data/tokens_consumed": 23091740672, "data/tokens_consumed_B": 23.091740672, "train/loss_slope": -1.3483079303108555e-05} {"step": 11020, "timestamp": 1778206448.1191487, "train/loss": 2.3094430923461915, "train/z_loss": 0.0017517280532047153, "train/perplexity": 10.068815692797452, "train/grad_norm": 0.171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024367.2744762828, "perf/iters_per_sec": 0.9652935383206762, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0359543085098266, "data/tokens_consumed": 23112712192, "data/tokens_consumed_B": 23.112712192, "train/loss_slope": -1.2051871013898895e-05} {"step": 11025, "timestamp": 1778206453.905043, "eos/sharpness": 24.412584304809567, "eos/L0_probe": 2.131300210952759, "eos/L_plus": 2.2700581550598145, "eos/L_minus": 2.236668109893799, "eos/grad_norm": 0.12185923755168915, "eos/embed_grad_frac": 0.2048489898443222, "eos/time_s": 0.6200556755065918} {"step": 11025, "timestamp": 1778206455.2835205, "geo/rankme_last": 439.2052001953125, "geo/layer_0/stable_rank_q_proj": 15.22030258178711, "geo/layer_0/stable_rank_k_proj": 12.984230041503906, "geo/layer_0/stable_rank_o_proj": 55.103057861328125, "geo/layer_0/stable_rank_gate_proj": 165.38851928710938, "geo/layer_0/stable_rank_down_proj": 47.96736145019531, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.03463710844516754, "geo/layer_0/attn_entropy_mean": 6.370349884033203, "geo/layer_0/attn_entropy_std": 0.26711177825927734, "geo/layer_7/stable_rank_q_proj": 43.3090705871582, "geo/layer_7/stable_rank_k_proj": 43.45359420776367, "geo/layer_7/stable_rank_o_proj": 108.29552459716797, "geo/layer_7/stable_rank_gate_proj": 123.27505493164062, "geo/layer_7/stable_rank_down_proj": 164.9402618408203, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5866190195083618, "geo/layer_7/attn_entropy_mean": 4.639904975891113, "geo/layer_7/attn_entropy_std": 0.9048168659210205, "geo/layer_14/stable_rank_q_proj": 62.821067810058594, "geo/layer_14/stable_rank_k_proj": 40.463191986083984, "geo/layer_14/stable_rank_o_proj": 48.93210983276367, "geo/layer_14/stable_rank_gate_proj": 107.83096313476562, "geo/layer_14/stable_rank_down_proj": 139.1120147705078, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39885902404785156, "geo/layer_14/attn_entropy_mean": 5.67915678024292, "geo/layer_14/attn_entropy_std": 0.5718020796775818, "geo/layer_21/stable_rank_q_proj": 52.25345993041992, "geo/layer_21/stable_rank_k_proj": 32.17997360229492, "geo/layer_21/stable_rank_o_proj": 91.22036743164062, "geo/layer_21/stable_rank_gate_proj": 105.78479766845703, "geo/layer_21/stable_rank_down_proj": 70.70616912841797, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.16032741963863373, "geo/layer_21/attn_entropy_mean": 5.793829441070557, "geo/layer_21/attn_entropy_std": 0.2948930859565735, "geo/layer_27/stable_rank_q_proj": 42.60497283935547, "geo/layer_27/stable_rank_k_proj": 32.670982360839844, "geo/layer_27/stable_rank_o_proj": 114.34424591064453, "geo/layer_27/stable_rank_gate_proj": 95.04856872558594, "geo/layer_27/stable_rank_down_proj": 149.4930419921875, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.06105675920844078, "geo/layer_27/attn_entropy_mean": 4.477081298828125, "geo/layer_27/attn_entropy_std": 0.5002272725105286, "attnres/final_alpha/block_0": 0.24788951873779297, "attnres/block_norm/0": 1.439441204071045, "attnres/final_alpha/block_1": 0.009137803688645363, "attnres/block_norm/1": 18767.6875, "attnres/final_alpha/block_2": 0.018609346821904182, "attnres/block_norm/2": 14003.0625, "attnres/final_alpha/block_3": 0.02021968737244606, "attnres/block_norm/3": 15816.0625, "attnres/final_alpha/block_4": 0.028828147798776627, "attnres/block_norm/4": 6004.4306640625, "attnres/final_alpha/block_5": 0.5112856030464172, "attnres/block_norm/5": 4053.700927734375, "attnres/final_alpha/block_6": 0.16402985155582428, "attnres/block_norm/6": 11349.970703125, "geo/tier1_time_s": 1.357908010482788, "geo/step": 11025.0, "geo/rankme_slope": 0.0013889260782437975} {"step": 11030, "timestamp": 1778206460.4618008, "train/loss": 2.3160803079605103, "train/z_loss": 0.0017541215638630092, "train/perplexity": 10.135866863923168, "train/grad_norm": 0.2314453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1700002.0258738499, "perf/iters_per_sec": 0.810624134957242, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.233617353439331, "data/tokens_consumed": 23133683712, "data/tokens_consumed_B": 23.133683712, "train/loss_slope": -1.3078500800328765e-05} {"step": 11040, "timestamp": 1778206470.838391, "train/loss": 2.28197181224823, "train/z_loss": 0.0017688175314106048, "train/perplexity": 9.795977206285311, "train/grad_norm": 0.130859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022069.0601023098, "perf/iters_per_sec": 0.9641976643096494, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0371317386627197, "data/tokens_consumed": 23154655232, "data/tokens_consumed_B": 23.154655232, "train/loss_slope": -1.410341431634619e-05} {"step": 11050, "timestamp": 1778206481.1870904, "grad/layer_0/attn": 0.003288427134975791, "grad/layer_0/mlp": 0.0030739521607756615, "grad/layer_0/attn_mlp_ratio": 1.0697716997550282, "grad/layer_4/attn": 0.0014732892159372568, "grad/layer_4/mlp": 0.0027241355273872614, "grad/layer_4/attn_mlp_ratio": 0.5408281442103892, "grad/layer_8/attn": 0.008217660710215569, "grad/layer_8/mlp": 0.004172865767031908, "grad/layer_8/attn_mlp_ratio": 1.9693086171640255, "grad/layer_12/attn": 0.004425628576427698, "grad/layer_12/mlp": 0.006220516283065081, "grad/layer_12/attn_mlp_ratio": 0.7114567833108135, "grad/layer_16/attn": 0.005031449720263481, "grad/layer_16/mlp": 0.005435554776340723, "grad/layer_16/attn_mlp_ratio": 0.9256552154709751, "grad/layer_20/attn": 0.005815858952701092, "grad/layer_20/mlp": 0.006921688094735146, "grad/layer_20/attn_mlp_ratio": 0.8402370619821936, "grad/layer_24/attn": 0.022876081988215446, "grad/layer_24/mlp": 0.011481215246021748, "grad/layer_24/attn_mlp_ratio": 1.9924791321105246, "grad/layer_27/attn": 0.00843562837690115, "grad/layer_27/mlp": 0.011304661631584167, "grad/layer_27/attn_mlp_ratio": 0.74620794298805} {"step": 11050, "timestamp": 1778206481.20302, "train/loss": 2.2838828325271607, "train/z_loss": 0.001778574683703482, "train/perplexity": 9.814715416225399, "train/grad_norm": 0.21875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024568.328483336, "perf/iters_per_sec": 0.9653894083420448, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0358514308929443, "data/tokens_consumed": 23175626752, "data/tokens_consumed_B": 23.175626752, "train/loss_slope": -1.3937656549659168e-05} {"step": 11060, "timestamp": 1778206491.5569098, "train/loss": 2.2784707307815553, "train/z_loss": 0.0017568220617249608, "train/perplexity": 9.761740659481568, "train/grad_norm": 0.208984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026993.7176321608, "perf/iters_per_sec": 0.9665459240113071, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034611988067627, "data/tokens_consumed": 23196598272, "data/tokens_consumed_B": 23.196598272, "train/loss_slope": -1.3620062400870509e-05} {"step": 11070, "timestamp": 1778206501.9142013, "train/loss": 2.3149824857711794, "train/z_loss": 0.0017533145379275083, "train/perplexity": 10.124745590079408, "train/grad_norm": 0.1923828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026122.193365105, "perf/iters_per_sec": 0.9661303488564992, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350570201873779, "data/tokens_consumed": 23217569792, "data/tokens_consumed_B": 23.217569792, "train/loss_slope": -1.6564927657659848e-05} {"step": 11080, "timestamp": 1778206512.2673724, "train/loss": 2.326636242866516, "train/z_loss": 0.001761673903092742, "train/perplexity": 10.243427115497072, "train/grad_norm": 0.1005859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026950.6981275768, "perf/iters_per_sec": 0.966525410712994, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346339464187622, "data/tokens_consumed": 23238541312, "data/tokens_consumed_B": 23.238541312, "train/loss_slope": -1.823721416998725e-05} {"step": 11090, "timestamp": 1778206522.6330245, "train/loss": 2.282826781272888, "train/z_loss": 0.0017759565031155943, "train/perplexity": 9.804356044676194, "train/grad_norm": 0.216796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024455.6121760374, "perf/iters_per_sec": 0.9653356610183894, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035909104347229, "data/tokens_consumed": 23259512832, "data/tokens_consumed_B": 23.259512832, "train/loss_slope": -1.7015517712926984e-05} {"step": 11100, "timestamp": 1778206532.9766288, "grad/layer_0/attn": 0.0028041426558047533, "grad/layer_0/mlp": 0.002646723994985223, "grad/layer_0/attn_mlp_ratio": 1.0594767551018232, "grad/layer_4/attn": 0.002673494163900614, "grad/layer_4/mlp": 0.0026835408061742783, "grad/layer_4/attn_mlp_ratio": 0.9962561620541915, "grad/layer_8/attn": 0.00902534369379282, "grad/layer_8/mlp": 0.004049662500619888, "grad/layer_8/attn_mlp_ratio": 2.2286655911559885, "grad/layer_12/attn": 0.0037860251031816006, "grad/layer_12/mlp": 0.006114404182881117, "grad/layer_12/attn_mlp_ratio": 0.6191977056181212, "grad/layer_16/attn": 0.004538127686828375, "grad/layer_16/mlp": 0.004494611173868179, "grad/layer_16/attn_mlp_ratio": 1.009681908024661, "grad/layer_20/attn": 0.00522663863375783, "grad/layer_20/mlp": 0.006126983556896448, "grad/layer_20/attn_mlp_ratio": 0.8530524849490659, "grad/layer_24/attn": 0.006729819346219301, "grad/layer_24/mlp": 0.010308201424777508, "grad/layer_24/attn_mlp_ratio": 0.652860669248951, "grad/layer_27/attn": 0.007336240261793137, "grad/layer_27/mlp": 0.008962029591202736, "grad/layer_27/attn_mlp_ratio": 0.8185913810344214} {"step": 11100, "timestamp": 1778206533.5940695, "eos/sharpness": 15.648841857910153, "eos/L0_probe": 2.132134437561035, "eos/L_plus": 2.1918201446533203, "eos/L_minus": 2.2289371490478516, "eos/grad_norm": 0.1312893182039261, "eos/embed_grad_frac": 0.18763099610805511, "eos/time_s": 0.6147022247314453} {"step": 11100, "timestamp": 1778206533.615458, "train/loss": 2.321962904930115, "train/z_loss": 0.001756917405873537, "train/perplexity": 10.195667803583815, "train/grad_norm": 0.130859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1910614.762950409, "perf/iters_per_sec": 0.9110521139862103, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0976320505142212, "data/tokens_consumed": 23280484352, "data/tokens_consumed_B": 23.280484352, "train/loss_slope": -1.6116784951581683e-05} {"step": 11100, "timestamp": 1778206534.9812782, "geo/rankme_last": 439.6900634765625, "geo/layer_0/stable_rank_q_proj": 15.176285743713379, "geo/layer_0/stable_rank_k_proj": 12.96842098236084, "geo/layer_0/stable_rank_o_proj": 55.20967483520508, "geo/layer_0/stable_rank_gate_proj": 165.0840301513672, "geo/layer_0/stable_rank_down_proj": 47.941444396972656, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.03798925131559372, "geo/layer_0/attn_entropy_mean": 6.36501932144165, "geo/layer_0/attn_entropy_std": 0.27269259095191956, "geo/layer_7/stable_rank_q_proj": 43.49631118774414, "geo/layer_7/stable_rank_k_proj": 43.30849838256836, "geo/layer_7/stable_rank_o_proj": 108.17376708984375, "geo/layer_7/stable_rank_gate_proj": 123.09413146972656, "geo/layer_7/stable_rank_down_proj": 165.229248046875, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5896620750427246, "geo/layer_7/attn_entropy_mean": 4.680704116821289, "geo/layer_7/attn_entropy_std": 0.923582136631012, "geo/layer_14/stable_rank_q_proj": 62.60502624511719, "geo/layer_14/stable_rank_k_proj": 40.30541229248047, "geo/layer_14/stable_rank_o_proj": 49.18489456176758, "geo/layer_14/stable_rank_gate_proj": 107.20698547363281, "geo/layer_14/stable_rank_down_proj": 139.62196350097656, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3887876570224762, "geo/layer_14/attn_entropy_mean": 5.649648666381836, "geo/layer_14/attn_entropy_std": 0.5986644625663757, "geo/layer_21/stable_rank_q_proj": 51.98218536376953, "geo/layer_21/stable_rank_k_proj": 32.16672897338867, "geo/layer_21/stable_rank_o_proj": 91.025146484375, "geo/layer_21/stable_rank_gate_proj": 105.73941040039062, "geo/layer_21/stable_rank_down_proj": 70.30526733398438, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15851391851902008, "geo/layer_21/attn_entropy_mean": 5.8063435554504395, "geo/layer_21/attn_entropy_std": 0.29371434450149536, "geo/layer_27/stable_rank_q_proj": 42.61968231201172, "geo/layer_27/stable_rank_k_proj": 32.668575286865234, "geo/layer_27/stable_rank_o_proj": 113.95098114013672, "geo/layer_27/stable_rank_gate_proj": 94.94158172607422, "geo/layer_27/stable_rank_down_proj": 149.34674072265625, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.06671597808599472, "geo/layer_27/attn_entropy_mean": 4.508720397949219, "geo/layer_27/attn_entropy_std": 0.49845090508461, "attnres/final_alpha/block_0": 0.2466556429862976, "attnres/block_norm/0": 1.441831350326538, "attnres/final_alpha/block_1": 0.009068604558706284, "attnres/block_norm/1": 18894.56640625, "attnres/final_alpha/block_2": 0.018551666289567947, "attnres/block_norm/2": 14091.615234375, "attnres/final_alpha/block_3": 0.01995611935853958, "attnres/block_norm/3": 16070.7490234375, "attnres/final_alpha/block_4": 0.028167445212602615, "attnres/block_norm/4": 6042.75537109375, "attnres/final_alpha/block_5": 0.5150462985038757, "attnres/block_norm/5": 4025.69140625, "attnres/final_alpha/block_6": 0.16255420446395874, "attnres/block_norm/6": 11425.13671875, "geo/tier1_time_s": 1.361816167831421, "geo/step": 11100.0, "geo/rankme_slope": 0.0012997675241971788} {"step": 11110, "timestamp": 1778206545.9028668, "train/loss": 2.305364894866943, "train/z_loss": 0.001757826143875718, "train/perplexity": 10.02783669104673, "train/grad_norm": 0.08935546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1707296.342227698, "perf/iters_per_sec": 0.8141023360384455, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2283468008041383, "data/tokens_consumed": 23301455872, "data/tokens_consumed_B": 23.301455872, "train/loss_slope": -1.5274695637631672e-05} {"step": 11120, "timestamp": 1778206556.2573247, "train/loss": 2.299493408203125, "train/z_loss": 0.0017658884171396494, "train/perplexity": 9.96913089544988, "train/grad_norm": 0.1767578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026777.6109544742, "perf/iters_per_sec": 0.9664428763172503, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347223043441773, "data/tokens_consumed": 23322427392, "data/tokens_consumed_B": 23.322427392, "train/loss_slope": -1.5034918108395579e-05} {"step": 11130, "timestamp": 1778206566.615711, "train/loss": 2.316017198562622, "train/z_loss": 0.0017583820153959095, "train/perplexity": 10.13522721565243, "train/grad_norm": 0.2099609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025681.020886379, "perf/iters_per_sec": 0.9659199814254661, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352824449539184, "data/tokens_consumed": 23343398912, "data/tokens_consumed_B": 23.343398912, "train/loss_slope": -1.5754211181425417e-05} {"step": 11140, "timestamp": 1778206576.969691, "train/loss": 2.2554789066314695, "train/z_loss": 0.0017796148895286024, "train/perplexity": 9.53986091898778, "train/grad_norm": 0.1552734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026613.5650548055, "perf/iters_per_sec": 0.9663646531366374, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348060607910157, "data/tokens_consumed": 23364370432, "data/tokens_consumed_B": 23.364370432, "train/loss_slope": -1.570093778386473e-05} {"step": 11150, "timestamp": 1778206587.3165574, "grad/layer_0/attn": 0.003522322978824377, "grad/layer_0/mlp": 0.00303106801584363, "grad/layer_0/attn_mlp_ratio": 1.1620731848330692, "grad/layer_4/attn": 0.0027676543686538935, "grad/layer_4/mlp": 0.002752156462520361, "grad/layer_4/attn_mlp_ratio": 1.0056311498933548, "grad/layer_8/attn": 0.00585418613627553, "grad/layer_8/mlp": 0.004363299813121557, "grad/layer_8/attn_mlp_ratio": 1.3416877713747106, "grad/layer_12/attn": 0.004908958449959755, "grad/layer_12/mlp": 0.0067317187786102295, "grad/layer_12/attn_mlp_ratio": 0.7292280825270017, "grad/layer_16/attn": 0.005042664706707001, "grad/layer_16/mlp": 0.005441227927803993, "grad/layer_16/attn_mlp_ratio": 0.9267512188314139, "grad/layer_20/attn": 0.007876727730035782, "grad/layer_20/mlp": 0.006826319266110659, "grad/layer_20/attn_mlp_ratio": 1.153876241000368, "grad/layer_24/attn": 0.007041328586637974, "grad/layer_24/mlp": 0.009662439115345478, "grad/layer_24/attn_mlp_ratio": 0.7287319929997834, "grad/layer_27/attn": 0.0076585025526583195, "grad/layer_27/mlp": 0.008632115088403225, "grad/layer_27/attn_mlp_ratio": 0.8872104212588705} {"step": 11150, "timestamp": 1778206587.3324068, "train/loss": 2.307329773902893, "train/z_loss": 0.0017623460618779064, "train/perplexity": 10.047559547304875, "train/grad_norm": 0.1279296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024907.205328645, "perf/iters_per_sec": 0.9655509974139428, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0356780767440796, "data/tokens_consumed": 23385341952, "data/tokens_consumed_B": 23.385341952, "train/loss_slope": -1.7807264668498693e-05} {"step": 11160, "timestamp": 1778206597.6899784, "train/loss": 2.2749207973480225, "train/z_loss": 0.0017673226189799606, "train/perplexity": 9.727148566086973, "train/grad_norm": 0.224609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025995.911154155, "perf/iters_per_sec": 0.9660701328058982, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035121536254883, "data/tokens_consumed": 23406313472, "data/tokens_consumed_B": 23.406313472, "train/loss_slope": -1.505133529843919e-05} {"step": 11170, "timestamp": 1778206608.0426404, "train/loss": 2.2673816442489625, "train/z_loss": 0.001766825478989631, "train/perplexity": 9.654089850299043, "train/grad_norm": 0.11669921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027144.7908133608, "perf/iters_per_sec": 0.9666179613177113, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345348834991455, "data/tokens_consumed": 23427284992, "data/tokens_consumed_B": 23.427284992, "train/loss_slope": -1.5319230983538384e-05} {"step": 11175, "timestamp": 1778206613.8217554, "eos/sharpness": 19.99576091766357, "eos/L0_probe": 2.1312761306762695, "eos/L_plus": 2.2644381523132324, "eos/L_minus": 2.1980717182159424, "eos/grad_norm": 0.1388663351535797, "eos/embed_grad_frac": 0.14399181306362152, "eos/time_s": 0.6138298511505127} {"step": 11175, "timestamp": 1778206615.2032874, "geo/rankme_last": 440.67218017578125, "geo/layer_0/stable_rank_q_proj": 15.14646053314209, "geo/layer_0/stable_rank_k_proj": 12.986554145812988, "geo/layer_0/stable_rank_o_proj": 55.35390090942383, "geo/layer_0/stable_rank_gate_proj": 164.2964324951172, "geo/layer_0/stable_rank_down_proj": 48.03812026977539, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.03720993921160698, "geo/layer_0/attn_entropy_mean": 6.360243797302246, "geo/layer_0/attn_entropy_std": 0.27484938502311707, "geo/layer_7/stable_rank_q_proj": 43.48759460449219, "geo/layer_7/stable_rank_k_proj": 43.05670166015625, "geo/layer_7/stable_rank_o_proj": 107.83647918701172, "geo/layer_7/stable_rank_gate_proj": 122.57677459716797, "geo/layer_7/stable_rank_down_proj": 165.72943115234375, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5885222554206848, "geo/layer_7/attn_entropy_mean": 4.66940975189209, "geo/layer_7/attn_entropy_std": 0.8969852328300476, "geo/layer_14/stable_rank_q_proj": 62.68463897705078, "geo/layer_14/stable_rank_k_proj": 40.00373840332031, "geo/layer_14/stable_rank_o_proj": 49.319313049316406, "geo/layer_14/stable_rank_gate_proj": 106.91149139404297, "geo/layer_14/stable_rank_down_proj": 139.87405395507812, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38140398263931274, "geo/layer_14/attn_entropy_mean": 5.671438217163086, "geo/layer_14/attn_entropy_std": 0.587117075920105, "geo/layer_21/stable_rank_q_proj": 51.92266082763672, "geo/layer_21/stable_rank_k_proj": 32.19304656982422, "geo/layer_21/stable_rank_o_proj": 90.89348602294922, "geo/layer_21/stable_rank_gate_proj": 105.39820861816406, "geo/layer_21/stable_rank_down_proj": 69.94538116455078, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.16177980601787567, "geo/layer_21/attn_entropy_mean": 5.80546760559082, "geo/layer_21/attn_entropy_std": 0.2943381369113922, "geo/layer_27/stable_rank_q_proj": 42.631248474121094, "geo/layer_27/stable_rank_k_proj": 32.508846282958984, "geo/layer_27/stable_rank_o_proj": 114.27934265136719, "geo/layer_27/stable_rank_gate_proj": 95.02009582519531, "geo/layer_27/stable_rank_down_proj": 149.37966918945312, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.06229417398571968, "geo/layer_27/attn_entropy_mean": 4.504558086395264, "geo/layer_27/attn_entropy_std": 0.5090982913970947, "attnres/final_alpha/block_0": 0.2469208985567093, "attnres/block_norm/0": 1.4442204236984253, "attnres/final_alpha/block_1": 0.009183399379253387, "attnres/block_norm/1": 18989.62890625, "attnres/final_alpha/block_2": 0.018610727041959763, "attnres/block_norm/2": 14134.4365234375, "attnres/final_alpha/block_3": 0.0198674313724041, "attnres/block_norm/3": 16094.236328125, "attnres/final_alpha/block_4": 0.028232712298631668, "attnres/block_norm/4": 6050.17822265625, "attnres/final_alpha/block_5": 0.5161164999008179, "attnres/block_norm/5": 4006.52392578125, "attnres/final_alpha/block_6": 0.161068394780159, "attnres/block_norm/6": 11433.05859375, "geo/tier1_time_s": 1.3605883121490479, "geo/step": 11175.0, "geo/rankme_slope": 0.0012477619563450381} {"step": 11180, "timestamp": 1778206620.3922617, "train/loss": 2.3053579092025758, "train/z_loss": 0.0017641499056480826, "train/perplexity": 10.02776664018995, "train/grad_norm": 0.244140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1698898.0731053064, "perf/iters_per_sec": 0.8100977292562992, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2344189643859864, "data/tokens_consumed": 23448256512, "data/tokens_consumed_B": 23.448256512, "train/loss_slope": -1.4608660239269134e-05} {"step": 11190, "timestamp": 1778206630.7858944, "train/loss": 2.351011776924133, "train/z_loss": 0.0017413062276318669, "train/perplexity": 10.496184151021744, "train/grad_norm": 0.1865234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019753.1009306978, "perf/iters_per_sec": 0.9630933289197434, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0383209705352783, "data/tokens_consumed": 23469228032, "data/tokens_consumed_B": 23.469228032, "train/loss_slope": -1.2817785613285739e-05} {"step": 11200, "timestamp": 1778206641.1372411, "grad/layer_0/attn": 0.00280884001404047, "grad/layer_0/mlp": 0.002780144102871418, "grad/layer_0/attn_mlp_ratio": 1.0103216988310946, "grad/layer_4/attn": 0.001598067581653595, "grad/layer_4/mlp": 0.00282050552777946, "grad/layer_4/attn_mlp_ratio": 0.5665890420192952, "grad/layer_8/attn": 0.007862512953579426, "grad/layer_8/mlp": 0.004252917133271694, "grad/layer_8/attn_mlp_ratio": 1.8487340623675703, "grad/layer_12/attn": 0.0042629241943359375, "grad/layer_12/mlp": 0.005889631807804108, "grad/layer_12/attn_mlp_ratio": 0.7238014634984766, "grad/layer_16/attn": 0.004868074785917997, "grad/layer_16/mlp": 0.004672233946621418, "grad/layer_16/attn_mlp_ratio": 1.0419158666587338, "grad/layer_20/attn": 0.0042381384409964085, "grad/layer_20/mlp": 0.0065813432447612286, "grad/layer_20/attn_mlp_ratio": 0.6439625193494852, "grad/layer_24/attn": 0.010186905041337013, "grad/layer_24/mlp": 0.010766370221972466, "grad/layer_24/attn_mlp_ratio": 0.9461782138913748, "grad/layer_27/attn": 0.008117267861962318, "grad/layer_27/mlp": 0.008902243338525295, "grad/layer_27/attn_mlp_ratio": 0.9118227240151712} {"step": 11200, "timestamp": 1778206641.1530938, "train/loss": 2.264595651626587, "train/z_loss": 0.0017646244494244456, "train/perplexity": 9.62723105877049, "train/grad_norm": 0.150390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024388.286595299, "perf/iters_per_sec": 0.9653035576797957, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0359435558319092, "data/tokens_consumed": 23490199552, "data/tokens_consumed_B": 23.490199552, "train/loss_slope": -1.542784361520258e-05} {"step": 11210, "timestamp": 1778206651.5107372, "train/loss": 2.3300546407699585, "train/z_loss": 0.0017424469231627881, "train/perplexity": 10.278503143025663, "train/grad_norm": 0.12890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026184.0335226255, "perf/iters_per_sec": 0.9661598365414741, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035025429725647, "data/tokens_consumed": 23511171072, "data/tokens_consumed_B": 23.511171072, "train/loss_slope": -1.4648675632448195e-05} {"step": 11220, "timestamp": 1778206661.8855734, "train/loss": 2.3064782857894897, "train/z_loss": 0.001757895911578089, "train/perplexity": 10.039007811149164, "train/grad_norm": 0.224609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022874.620748481, "perf/iters_per_sec": 0.9645817855589299, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036718726158142, "data/tokens_consumed": 23532142592, "data/tokens_consumed_B": 23.532142592, "train/loss_slope": -1.7341856651752524e-05} {"step": 11230, "timestamp": 1778206672.2450788, "train/loss": 2.269026756286621, "train/z_loss": 0.0017699189251288772, "train/perplexity": 9.669984980774776, "train/grad_norm": 0.1337890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026010.8905935055, "perf/iters_per_sec": 0.9660772755591895, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351138830184936, "data/tokens_consumed": 23553114112, "data/tokens_consumed_B": 23.553114112, "train/loss_slope": -1.9956599002910238e-05} {"step": 11240, "timestamp": 1778206682.6043184, "train/loss": 2.2507096767425536, "train/z_loss": 0.0017625227337703109, "train/perplexity": 9.494471451593126, "train/grad_norm": 0.1279296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026114.2594294397, "perf/iters_per_sec": 0.9661265656611632, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350610733032226, "data/tokens_consumed": 23574085632, "data/tokens_consumed_B": 23.574085632, "train/loss_slope": -2.3161279679489248e-05} {"step": 11250, "timestamp": 1778206692.946041, "grad/layer_0/attn": 0.0030385537538677454, "grad/layer_0/mlp": 0.0028501800261437893, "grad/layer_0/attn_mlp_ratio": 1.0660918325814095, "grad/layer_4/attn": 0.001521286554634571, "grad/layer_4/mlp": 0.0027399135287851095, "grad/layer_4/attn_mlp_ratio": 0.5552315732336119, "grad/layer_8/attn": 0.0053041549399495125, "grad/layer_8/mlp": 0.004098630975931883, "grad/layer_8/attn_mlp_ratio": 1.294128415484074, "grad/layer_12/attn": 0.0042888508178293705, "grad/layer_12/mlp": 0.005503866355866194, "grad/layer_12/attn_mlp_ratio": 0.7792432560310723, "grad/layer_16/attn": 0.005935745779424906, "grad/layer_16/mlp": 0.004864297341555357, "grad/layer_16/attn_mlp_ratio": 1.2202678497240402, "grad/layer_20/attn": 0.005235475953668356, "grad/layer_20/mlp": 0.007263831328600645, "grad/layer_20/attn_mlp_ratio": 0.7207595612770098, "grad/layer_24/attn": 0.015282778069376945, "grad/layer_24/mlp": 0.014044023118913174, "grad/layer_24/attn_mlp_ratio": 1.088205126918014, "grad/layer_27/attn": 0.008094662800431252, "grad/layer_27/mlp": 0.013325324282050133, "grad/layer_27/attn_mlp_ratio": 0.6074645966093825} {"step": 11250, "timestamp": 1778206693.5612013, "eos/sharpness": 32.318305969238274, "eos/L0_probe": 2.1308789253234863, "eos/L_plus": 2.318328857421875, "eos/L_minus": 2.2666120529174805, "eos/grad_norm": 0.22641544044017792, "eos/embed_grad_frac": 0.05508213862776756, "eos/time_s": 0.6124045848846436} {"step": 11250, "timestamp": 1778206693.5829043, "train/loss": 2.2820330381393434, "train/z_loss": 0.001763435418251902, "train/perplexity": 9.796576992080116, "train/grad_norm": 0.2265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1911321.9491966176, "perf/iters_per_sec": 0.9113893266661728, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0972259283065795, "data/tokens_consumed": 23595057152, "data/tokens_consumed_B": 23.595057152, "train/loss_slope": -2.458896871590235e-05} {"step": 11250, "timestamp": 1778206694.9456162, "geo/rankme_last": 439.5954895019531, "geo/layer_0/stable_rank_q_proj": 15.163275718688965, "geo/layer_0/stable_rank_k_proj": 13.054150581359863, "geo/layer_0/stable_rank_o_proj": 55.28437805175781, "geo/layer_0/stable_rank_gate_proj": 164.1881561279297, "geo/layer_0/stable_rank_down_proj": 48.07670211791992, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.03592840954661369, "geo/layer_0/attn_entropy_mean": 6.3561787605285645, "geo/layer_0/attn_entropy_std": 0.2788030207157135, "geo/layer_7/stable_rank_q_proj": 43.57255172729492, "geo/layer_7/stable_rank_k_proj": 43.2938232421875, "geo/layer_7/stable_rank_o_proj": 107.71345520019531, "geo/layer_7/stable_rank_gate_proj": 122.23169708251953, "geo/layer_7/stable_rank_down_proj": 165.20851135253906, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5875281095504761, "geo/layer_7/attn_entropy_mean": 4.651845932006836, "geo/layer_7/attn_entropy_std": 0.9165303111076355, "geo/layer_14/stable_rank_q_proj": 62.608219146728516, "geo/layer_14/stable_rank_k_proj": 39.949954986572266, "geo/layer_14/stable_rank_o_proj": 49.1422233581543, "geo/layer_14/stable_rank_gate_proj": 106.7278823852539, "geo/layer_14/stable_rank_down_proj": 139.4127960205078, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38906195759773254, "geo/layer_14/attn_entropy_mean": 5.658433437347412, "geo/layer_14/attn_entropy_std": 0.5751819610595703, "geo/layer_21/stable_rank_q_proj": 51.768287658691406, "geo/layer_21/stable_rank_k_proj": 32.14558410644531, "geo/layer_21/stable_rank_o_proj": 90.91206359863281, "geo/layer_21/stable_rank_gate_proj": 105.24019622802734, "geo/layer_21/stable_rank_down_proj": 69.84880828857422, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.16307704150676727, "geo/layer_21/attn_entropy_mean": 5.772018909454346, "geo/layer_21/attn_entropy_std": 0.3028928339481354, "geo/layer_27/stable_rank_q_proj": 42.48012924194336, "geo/layer_27/stable_rank_k_proj": 32.56519317626953, "geo/layer_27/stable_rank_o_proj": 114.25826263427734, "geo/layer_27/stable_rank_gate_proj": 94.81483459472656, "geo/layer_27/stable_rank_down_proj": 149.63699340820312, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.061957698315382004, "geo/layer_27/attn_entropy_mean": 4.476828575134277, "geo/layer_27/attn_entropy_std": 0.5043537020683289, "attnres/final_alpha/block_0": 0.24736559391021729, "attnres/block_norm/0": 1.4462977647781372, "attnres/final_alpha/block_1": 0.009004727005958557, "attnres/block_norm/1": 19133.95703125, "attnres/final_alpha/block_2": 0.01843748241662979, "attnres/block_norm/2": 14188.7763671875, "attnres/final_alpha/block_3": 0.019853930920362473, "attnres/block_norm/3": 16180.923828125, "attnres/final_alpha/block_4": 0.02784242294728756, "attnres/block_norm/4": 6100.052734375, "attnres/final_alpha/block_5": 0.51734459400177, "attnres/block_norm/5": 4051.082763671875, "attnres/final_alpha/block_6": 0.16015125811100006, "attnres/block_norm/6": 11579.478515625, "geo/tier1_time_s": 1.3582048416137695, "geo/step": 11250.0, "geo/rankme_slope": 0.0012002993580244598} {"step": 11260, "timestamp": 1778206705.3120563, "train/loss": 2.23887152671814, "train/z_loss": 0.0017656550160609185, "train/perplexity": 9.382737142786098, "train/grad_norm": 0.12109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1788647.28984735, "perf/iters_per_sec": 0.8528934907185316, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1724793434143066, "data/tokens_consumed": 23616028672, "data/tokens_consumed_B": 23.616028672, "train/loss_slope": -2.9572178886132023e-05} {"step": 11270, "timestamp": 1778206715.6806576, "train/loss": 2.283816766738892, "train/z_loss": 0.0017551534459926189, "train/perplexity": 9.814067020733406, "train/grad_norm": 0.169921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023694.8375478664, "perf/iters_per_sec": 0.9649728954066593, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0362985372543334, "data/tokens_consumed": 23637000192, "data/tokens_consumed_B": 23.637000192, "train/loss_slope": -3.790744340280279e-05} {"step": 11280, "timestamp": 1778206726.0343502, "train/loss": 2.342199206352234, "train/z_loss": 0.0017467807279899716, "train/perplexity": 10.404092167007924, "train/grad_norm": 0.16796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026784.0089557283, "perf/iters_per_sec": 0.9664459271219865, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347190380096436, "data/tokens_consumed": 23657971712, "data/tokens_consumed_B": 23.657971712, "train/loss_slope": -3.396414244028313e-05} {"step": 11290, "timestamp": 1778206736.3864284, "train/loss": 2.3362420558929444, "train/z_loss": 0.001737404044251889, "train/perplexity": 10.342297666889108, "train/grad_norm": 0.1728515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027103.446656574, "perf/iters_per_sec": 0.9665982468874807, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034555983543396, "data/tokens_consumed": 23678943232, "data/tokens_consumed_B": 23.678943232, "train/loss_slope": -3.1765003129951646e-05} {"step": 11300, "timestamp": 1778206746.7260377, "grad/layer_0/attn": 0.002586252521723509, "grad/layer_0/mlp": 0.0027122197207063437, "grad/layer_0/attn_mlp_ratio": 0.9535556454454237, "grad/layer_4/attn": 0.0019039403414353728, "grad/layer_4/mlp": 0.0026995374355465174, "grad/layer_4/attn_mlp_ratio": 0.7052838926538293, "grad/layer_8/attn": 0.006106157321482897, "grad/layer_8/mlp": 0.004093084950000048, "grad/layer_8/attn_mlp_ratio": 1.49182273197153, "grad/layer_12/attn": 0.004251639824360609, "grad/layer_12/mlp": 0.006016070954501629, "grad/layer_12/attn_mlp_ratio": 0.7067136983329422, "grad/layer_16/attn": 0.004924899898469448, "grad/layer_16/mlp": 0.004798390436917543, "grad/layer_16/attn_mlp_ratio": 1.0263649572869435, "grad/layer_20/attn": 0.005616527516394854, "grad/layer_20/mlp": 0.007156567182391882, "grad/layer_20/attn_mlp_ratio": 0.7848074774918747, "grad/layer_24/attn": 0.010060603730380535, "grad/layer_24/mlp": 0.012978865765035152, "grad/layer_24/attn_mlp_ratio": 0.7751527625756295, "grad/layer_27/attn": 0.009474438615143299, "grad/layer_27/mlp": 0.012735268101096153, "grad/layer_27/attn_mlp_ratio": 0.7439528139916058} {"step": 11300, "timestamp": 1778206746.7417192, "train/loss": 2.2923724174499513, "train/z_loss": 0.0017614868702366948, "train/perplexity": 9.898392967509238, "train/grad_norm": 0.177734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026351.6509979307, "perf/iters_per_sec": 0.9662397627820638, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349398136138916, "data/tokens_consumed": 23699914752, "data/tokens_consumed_B": 23.699914752, "train/loss_slope": -3.154678948462774e-05} {"step": 11310, "timestamp": 1778206757.0962095, "train/loss": 2.3040834188461305, "train/z_loss": 0.001752572041004896, "train/perplexity": 10.014994489030931, "train/grad_norm": 0.10986328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026588.7713580695, "perf/iters_per_sec": 0.9663528305807445, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348187208175659, "data/tokens_consumed": 23720886272, "data/tokens_consumed_B": 23.720886272, "train/loss_slope": -3.296854406586283e-05} {"step": 11320, "timestamp": 1778206767.4422643, "train/loss": 2.3034605264663695, "train/z_loss": 0.0017606725217774511, "train/perplexity": 10.0087581677605, "train/grad_norm": 0.2138671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028098.9777940498, "perf/iters_per_sec": 0.9670729531259774, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340481519699096, "data/tokens_consumed": 23741857792, "data/tokens_consumed_B": 23.741857792, "train/loss_slope": -3.412437846701383e-05} {"step": 11325, "timestamp": 1778206773.2212994, "eos/sharpness": 44.27947998046874, "eos/L0_probe": 2.136143922805786, "eos/L_plus": 2.4216480255126953, "eos/L_minus": 2.2934346199035645, "eos/grad_norm": 0.23477095365524292, "eos/embed_grad_frac": 0.04842919483780861, "eos/time_s": 0.6109604835510254} {"step": 11325, "timestamp": 1778206774.5977392, "geo/rankme_last": 440.1796875, "geo/layer_0/stable_rank_q_proj": 15.2027006149292, "geo/layer_0/stable_rank_k_proj": 13.055537223815918, "geo/layer_0/stable_rank_o_proj": 55.34243392944336, "geo/layer_0/stable_rank_gate_proj": 163.8472900390625, "geo/layer_0/stable_rank_down_proj": 48.026710510253906, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.032164424657821655, "geo/layer_0/attn_entropy_mean": 6.354617595672607, "geo/layer_0/attn_entropy_std": 0.27870383858680725, "geo/layer_7/stable_rank_q_proj": 43.61814880371094, "geo/layer_7/stable_rank_k_proj": 43.247459411621094, "geo/layer_7/stable_rank_o_proj": 107.31293487548828, "geo/layer_7/stable_rank_gate_proj": 121.9354248046875, "geo/layer_7/stable_rank_down_proj": 164.86318969726562, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5925554633140564, "geo/layer_7/attn_entropy_mean": 4.6360883712768555, "geo/layer_7/attn_entropy_std": 0.9007047414779663, "geo/layer_14/stable_rank_q_proj": 62.44737243652344, "geo/layer_14/stable_rank_k_proj": 39.79041290283203, "geo/layer_14/stable_rank_o_proj": 49.16547393798828, "geo/layer_14/stable_rank_gate_proj": 106.45460510253906, "geo/layer_14/stable_rank_down_proj": 139.417236328125, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4056793749332428, "geo/layer_14/attn_entropy_mean": 5.662384986877441, "geo/layer_14/attn_entropy_std": 0.5850006341934204, "geo/layer_21/stable_rank_q_proj": 51.87587356567383, "geo/layer_21/stable_rank_k_proj": 32.03765106201172, "geo/layer_21/stable_rank_o_proj": 90.83943176269531, "geo/layer_21/stable_rank_gate_proj": 104.99969482421875, "geo/layer_21/stable_rank_down_proj": 69.75698852539062, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.16046087443828583, "geo/layer_21/attn_entropy_mean": 5.8101701736450195, "geo/layer_21/attn_entropy_std": 0.28823861479759216, "geo/layer_27/stable_rank_q_proj": 42.351200103759766, "geo/layer_27/stable_rank_k_proj": 32.501007080078125, "geo/layer_27/stable_rank_o_proj": 114.44856262207031, "geo/layer_27/stable_rank_gate_proj": 94.86257934570312, "geo/layer_27/stable_rank_down_proj": 149.67633056640625, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.06626129895448685, "geo/layer_27/attn_entropy_mean": 4.507877826690674, "geo/layer_27/attn_entropy_std": 0.4985133409500122, "attnres/final_alpha/block_0": 0.24682097136974335, "attnres/block_norm/0": 1.4485347270965576, "attnres/final_alpha/block_1": 0.008915768004953861, "attnres/block_norm/1": 19175.681640625, "attnres/final_alpha/block_2": 0.018484804779291153, "attnres/block_norm/2": 14355.7919921875, "attnres/final_alpha/block_3": 0.01978813111782074, "attnres/block_norm/3": 16381.8017578125, "attnres/final_alpha/block_4": 0.02822694554924965, "attnres/block_norm/4": 6121.1787109375, "attnres/final_alpha/block_5": 0.5183255076408386, "attnres/block_norm/5": 4057.26171875, "attnres/final_alpha/block_6": 0.159437894821167, "attnres/block_norm/6": 11678.912109375, "geo/tier1_time_s": 1.3557403087615967, "geo/step": 11325.0, "geo/rankme_slope": 0.0011460323582558024} {"step": 11330, "timestamp": 1778206779.7738197, "train/loss": 2.27531578540802, "train/z_loss": 0.0017650130088441075, "train/perplexity": 9.730991432521614, "train/grad_norm": 0.1337890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1701281.102223535, "perf/iters_per_sec": 0.8112340460889507, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.232689881324768, "data/tokens_consumed": 23762829312, "data/tokens_consumed_B": 23.762829312, "train/loss_slope": -3.498316901315983e-05} {"step": 11340, "timestamp": 1778206790.131469, "train/loss": 2.35301673412323, "train/z_loss": 0.0017438312876038252, "train/perplexity": 10.517249661665302, "train/grad_norm": 0.21484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026140.6283378939, "perf/iters_per_sec": 0.9661391393365354, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350476026535034, "data/tokens_consumed": 23783800832, "data/tokens_consumed_B": 23.783800832, "train/loss_slope": -3.104396680436475e-05} {"step": 11350, "timestamp": 1778206800.46788, "grad/layer_0/attn": 0.0027106190100312233, "grad/layer_0/mlp": 0.0026794259902089834, "grad/layer_0/attn_mlp_ratio": 1.0116416421920438, "grad/layer_4/attn": 0.0016606338322162628, "grad/layer_4/mlp": 0.0025890045799314976, "grad/layer_4/attn_mlp_ratio": 0.6414178564791939, "grad/layer_8/attn": 0.004538801033049822, "grad/layer_8/mlp": 0.004085718188434839, "grad/layer_8/attn_mlp_ratio": 1.110894269410961, "grad/layer_12/attn": 0.005750083364546299, "grad/layer_12/mlp": 0.005871472414582968, "grad/layer_12/attn_mlp_ratio": 0.9793256036309164, "grad/layer_16/attn": 0.005016625393182039, "grad/layer_16/mlp": 0.004947995766997337, "grad/layer_16/attn_mlp_ratio": 1.01387016643293, "grad/layer_20/attn": 0.007384934462606907, "grad/layer_20/mlp": 0.006960178725421429, "grad/layer_20/attn_mlp_ratio": 1.0610265408173274, "grad/layer_24/attn": 0.014150144532322884, "grad/layer_24/mlp": 0.011818702332675457, "grad/layer_24/attn_mlp_ratio": 1.1972671799572203, "grad/layer_27/attn": 0.013400810770690441, "grad/layer_27/mlp": 0.009764089249074459, "grad/layer_27/attn_mlp_ratio": 1.3724588429704105} {"step": 11350, "timestamp": 1778206800.4837043, "train/loss": 2.340835785865784, "train/z_loss": 0.0017391051165759563, "train/perplexity": 10.38991668037487, "train/grad_norm": 0.1923828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026794.7502165486, "perf/iters_per_sec": 0.9664510489542716, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347135543823243, "data/tokens_consumed": 23804772352, "data/tokens_consumed_B": 23.804772352, "train/loss_slope": -3.281849437099011e-05} {"step": 11360, "timestamp": 1778206810.8336155, "train/loss": 2.3021994113922117, "train/z_loss": 0.0017586240079253911, "train/perplexity": 9.996143927637542, "train/grad_norm": 0.181640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027449.2011236227, "perf/iters_per_sec": 0.9667631154649843, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343795537948608, "data/tokens_consumed": 23825743872, "data/tokens_consumed_B": 23.825743872, "train/loss_slope": -3.557731377767293e-05} {"step": 11370, "timestamp": 1778206821.18441, "train/loss": 2.256978964805603, "train/z_loss": 0.001775869820266962, "train/perplexity": 9.554182003884224, "train/grad_norm": 0.1318359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027502.5699540344, "perf/iters_per_sec": 0.9667885637064144, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343523263931274, "data/tokens_consumed": 23846715392, "data/tokens_consumed_B": 23.846715392, "train/loss_slope": -3.917798771358917e-05} {"step": 11380, "timestamp": 1778206831.5358448, "train/loss": 2.3425175189971923, "train/z_loss": 0.0017394631286151707, "train/perplexity": 10.40740444824653, "train/grad_norm": 0.1962890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027445.6028009616, "perf/iters_per_sec": 0.9667613996510322, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03438138961792, "data/tokens_consumed": 23867686912, "data/tokens_consumed_B": 23.867686912, "train/loss_slope": -3.6775402288840335e-05} {"step": 11390, "timestamp": 1778206841.8905761, "train/loss": 2.2818876028060915, "train/z_loss": 0.001753197016660124, "train/perplexity": 9.795152327241357, "train/grad_norm": 0.103515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026870.9230324645, "perf/iters_per_sec": 0.9664873709833453, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346746683120727, "data/tokens_consumed": 23888658432, "data/tokens_consumed_B": 23.888658432, "train/loss_slope": -3.763430537027242e-05} {"step": 11400, "timestamp": 1778206852.254017, "grad/layer_0/attn": 0.002541797701269388, "grad/layer_0/mlp": 0.002732393564656377, "grad/layer_0/attn_mlp_ratio": 0.9302457893046836, "grad/layer_4/attn": 0.0017452072352170944, "grad/layer_4/mlp": 0.0026273750700056553, "grad/layer_4/attn_mlp_ratio": 0.6642398295989598, "grad/layer_8/attn": 0.007151734549552202, "grad/layer_8/mlp": 0.003914765547960997, "grad/layer_8/attn_mlp_ratio": 1.8268614759295183, "grad/layer_12/attn": 0.004590244498103857, "grad/layer_12/mlp": 0.006256826687604189, "grad/layer_12/attn_mlp_ratio": 0.7336377774110501, "grad/layer_16/attn": 0.0047103832475841045, "grad/layer_16/mlp": 0.004890948534011841, "grad/layer_16/attn_mlp_ratio": 0.9630817250518483, "grad/layer_20/attn": 0.004350757226347923, "grad/layer_20/mlp": 0.006667859852313995, "grad/layer_20/attn_mlp_ratio": 0.6524967916937201, "grad/layer_24/attn": 0.023265188559889793, "grad/layer_24/mlp": 0.018373724073171616, "grad/layer_24/attn_mlp_ratio": 1.2662206279258532, "grad/layer_27/attn": 0.009162690490484238, "grad/layer_27/mlp": 0.017462706193327904, "grad/layer_27/attn_mlp_ratio": 0.5247004866585364} {"step": 11400, "timestamp": 1778206852.8895566, "eos/sharpness": 29.083132743835442, "eos/L0_probe": 2.13242506980896, "eos/L_plus": 2.3041446208953857, "eos/L_minus": 2.2515368461608887, "eos/grad_norm": 0.248603954911232, "eos/embed_grad_frac": 0.048933181911706924, "eos/time_s": 0.6321194171905518} {"step": 11400, "timestamp": 1778206852.911887, "train/loss": 2.3676336765289308, "train/z_loss": 0.0017391745117492975, "train/perplexity": 10.672108720013417, "train/grad_norm": 0.2490234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1903809.4686525264, "perf/iters_per_sec": 0.9078070967924721, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1015556097030639, "data/tokens_consumed": 23909629952, "data/tokens_consumed_B": 23.909629952, "train/loss_slope": -3.495813331695563e-05} {"step": 11400, "timestamp": 1778206854.2757173, "geo/rankme_last": 440.8457336425781, "geo/layer_0/stable_rank_q_proj": 15.20423412322998, "geo/layer_0/stable_rank_k_proj": 13.126119613647461, "geo/layer_0/stable_rank_o_proj": 55.305084228515625, "geo/layer_0/stable_rank_gate_proj": 163.41854858398438, "geo/layer_0/stable_rank_down_proj": 48.00493621826172, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0335472971200943, "geo/layer_0/attn_entropy_mean": 6.362356185913086, "geo/layer_0/attn_entropy_std": 0.2726286053657532, "geo/layer_7/stable_rank_q_proj": 43.51017761230469, "geo/layer_7/stable_rank_k_proj": 43.15091323852539, "geo/layer_7/stable_rank_o_proj": 107.3763198852539, "geo/layer_7/stable_rank_gate_proj": 121.95663452148438, "geo/layer_7/stable_rank_down_proj": 163.9593048095703, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5901829600334167, "geo/layer_7/attn_entropy_mean": 4.672579765319824, "geo/layer_7/attn_entropy_std": 0.893450140953064, "geo/layer_14/stable_rank_q_proj": 62.464630126953125, "geo/layer_14/stable_rank_k_proj": 39.879844665527344, "geo/layer_14/stable_rank_o_proj": 49.48926544189453, "geo/layer_14/stable_rank_gate_proj": 105.93513488769531, "geo/layer_14/stable_rank_down_proj": 139.84930419921875, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4109432101249695, "geo/layer_14/attn_entropy_mean": 5.61818265914917, "geo/layer_14/attn_entropy_std": 0.5989986062049866, "geo/layer_21/stable_rank_q_proj": 51.97628402709961, "geo/layer_21/stable_rank_k_proj": 32.21318435668945, "geo/layer_21/stable_rank_o_proj": 90.4738540649414, "geo/layer_21/stable_rank_gate_proj": 104.58421325683594, "geo/layer_21/stable_rank_down_proj": 69.66506958007812, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.16077010333538055, "geo/layer_21/attn_entropy_mean": 5.799633979797363, "geo/layer_21/attn_entropy_std": 0.3014467656612396, "geo/layer_27/stable_rank_q_proj": 42.517948150634766, "geo/layer_27/stable_rank_k_proj": 32.653228759765625, "geo/layer_27/stable_rank_o_proj": 114.3163070678711, "geo/layer_27/stable_rank_gate_proj": 94.7239990234375, "geo/layer_27/stable_rank_down_proj": 149.697265625, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.06794343888759613, "geo/layer_27/attn_entropy_mean": 4.490139007568359, "geo/layer_27/attn_entropy_std": 0.5313307046890259, "attnres/final_alpha/block_0": 0.2470751255750656, "attnres/block_norm/0": 1.4505057334899902, "attnres/final_alpha/block_1": 0.009047526866197586, "attnres/block_norm/1": 19295.43359375, "attnres/final_alpha/block_2": 0.018465379253029823, "attnres/block_norm/2": 14311.5390625, "attnres/final_alpha/block_3": 0.020075831562280655, "attnres/block_norm/3": 16422.0234375, "attnres/final_alpha/block_4": 0.028028033673763275, "attnres/block_norm/4": 6134.8974609375, "attnres/final_alpha/block_5": 0.5163813829421997, "attnres/block_norm/5": 4050.809814453125, "attnres/final_alpha/block_6": 0.16092675924301147, "attnres/block_norm/6": 11667.4833984375, "geo/tier1_time_s": 1.36021089553833, "geo/step": 11400.0, "geo/rankme_slope": 0.0010995031215611244} {"step": 11410, "timestamp": 1778206864.6507487, "train/loss": 2.385547709465027, "train/z_loss": 0.0017334891716018319, "train/perplexity": 10.865011905402351, "train/grad_norm": 0.1474609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1787001.4374498923, "perf/iters_per_sec": 0.8521086871385061, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1735592126846313, "data/tokens_consumed": 23930601472, "data/tokens_consumed_B": 23.930601472, "train/loss_slope": -2.8270488567907365e-05} {"step": 11420, "timestamp": 1778206875.0069528, "train/loss": 2.302071213722229, "train/z_loss": 0.0017466870020143687, "train/perplexity": 9.994862527415224, "train/grad_norm": 0.197265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026280.3716733884, "perf/iters_per_sec": 0.9662057741515104, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349762201309205, "data/tokens_consumed": 23951572992, "data/tokens_consumed_B": 23.951572992, "train/loss_slope": -2.9369470760552586e-05} {"step": 11430, "timestamp": 1778206885.3606763, "train/loss": 2.268727684020996, "train/z_loss": 0.0017646802356466652, "train/perplexity": 9.667093388877039, "train/grad_norm": 0.203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027242.1080670408, "perf/iters_per_sec": 0.9666643658004002, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344852209091187, "data/tokens_consumed": 23972544512, "data/tokens_consumed_B": 23.972544512, "train/loss_slope": -3.286228757916071e-05} {"step": 11440, "timestamp": 1778206895.72578, "train/loss": 2.3375393390655517, "train/z_loss": 0.0017385705956257879, "train/perplexity": 10.355723262134978, "train/grad_norm": 0.21875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024480.0275414751, "perf/iters_per_sec": 0.9653473031718612, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035896611213684, "data/tokens_consumed": 23993516032, "data/tokens_consumed_B": 23.993516032, "train/loss_slope": -3.1879474034439554e-05} {"step": 11450, "timestamp": 1778206906.0640671, "grad/layer_0/attn": 0.003110291436314583, "grad/layer_0/mlp": 0.002919111168012023, "grad/layer_0/attn_mlp_ratio": 1.0654925937210875, "grad/layer_4/attn": 0.002759493887424469, "grad/layer_4/mlp": 0.0028671682812273502, "grad/layer_4/attn_mlp_ratio": 0.9624457026982178, "grad/layer_8/attn": 0.005261670798063278, "grad/layer_8/mlp": 0.0042551797814667225, "grad/layer_8/attn_mlp_ratio": 1.2365331066214826, "grad/layer_12/attn": 0.00432234862819314, "grad/layer_12/mlp": 0.0057542696595191956, "grad/layer_12/attn_mlp_ratio": 0.7511550220673525, "grad/layer_16/attn": 0.004688318353146315, "grad/layer_16/mlp": 0.00472145713865757, "grad/layer_16/attn_mlp_ratio": 0.9929812166379639, "grad/layer_20/attn": 0.005647499114274979, "grad/layer_20/mlp": 0.00660153292119503, "grad/layer_20/attn_mlp_ratio": 0.8554829758698459, "grad/layer_24/attn": 0.010029992088675499, "grad/layer_24/mlp": 0.009896882809698582, "grad/layer_24/attn_mlp_ratio": 1.0134496063246818, "grad/layer_27/attn": 0.008266272954642773, "grad/layer_27/mlp": 0.00842051301151514, "grad/layer_27/attn_mlp_ratio": 0.9816828078253993} {"step": 11450, "timestamp": 1778206906.0801353, "train/loss": 2.295981788635254, "train/z_loss": 0.00174848836613819, "train/perplexity": 9.934184495465459, "train/grad_norm": 0.1552734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026225.7135846447, "perf/iters_per_sec": 0.9661797111438011, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350041389465332, "data/tokens_consumed": 24014487552, "data/tokens_consumed_B": 24.014487552, "train/loss_slope": -3.292389697153003e-05} {"step": 11460, "timestamp": 1778206916.4263487, "train/loss": 2.2880602836608888, "train/z_loss": 0.0017504373681731522, "train/perplexity": 9.85580166842354, "train/grad_norm": 0.1982421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028141.2043120833, "perf/iters_per_sec": 0.9670930882988373, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340266227722168, "data/tokens_consumed": 24035459072, "data/tokens_consumed_B": 24.035459072, "train/loss_slope": -3.125723818204726e-05} {"step": 11470, "timestamp": 1778206926.775712, "train/loss": 2.307280921936035, "train/z_loss": 0.0017461203737184406, "train/perplexity": 10.047068716247994, "train/grad_norm": 0.177734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027518.6933144408, "perf/iters_per_sec": 0.9667962519237713, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343441009521483, "data/tokens_consumed": 24056430592, "data/tokens_consumed_B": 24.056430592, "train/loss_slope": -3.038435490182165e-05} {"step": 11475, "timestamp": 1778206932.5424337, "eos/sharpness": 20.319461822509762, "eos/L0_probe": 2.127833127975464, "eos/L_plus": 2.2423171997070312, "eos/L_minus": 2.216543674468994, "eos/grad_norm": 0.14717301726341248, "eos/embed_grad_frac": 0.11917679011821747, "eos/time_s": 0.6045293807983398} {"step": 11475, "timestamp": 1778206933.9192548, "geo/rankme_last": 440.056884765625, "geo/layer_0/stable_rank_q_proj": 15.239670753479004, "geo/layer_0/stable_rank_k_proj": 13.159128189086914, "geo/layer_0/stable_rank_o_proj": 55.15055465698242, "geo/layer_0/stable_rank_gate_proj": 162.38768005371094, "geo/layer_0/stable_rank_down_proj": 48.03408432006836, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.04200129583477974, "geo/layer_0/attn_entropy_mean": 6.354274272918701, "geo/layer_0/attn_entropy_std": 0.2756470739841461, "geo/layer_7/stable_rank_q_proj": 43.44315719604492, "geo/layer_7/stable_rank_k_proj": 43.26982116699219, "geo/layer_7/stable_rank_o_proj": 107.75531768798828, "geo/layer_7/stable_rank_gate_proj": 121.8826904296875, "geo/layer_7/stable_rank_down_proj": 163.7260284423828, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5946062803268433, "geo/layer_7/attn_entropy_mean": 4.659685134887695, "geo/layer_7/attn_entropy_std": 0.9172204732894897, "geo/layer_14/stable_rank_q_proj": 62.28815841674805, "geo/layer_14/stable_rank_k_proj": 39.88928985595703, "geo/layer_14/stable_rank_o_proj": 49.440773010253906, "geo/layer_14/stable_rank_gate_proj": 105.6436996459961, "geo/layer_14/stable_rank_down_proj": 140.27622985839844, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39738133549690247, "geo/layer_14/attn_entropy_mean": 5.670605659484863, "geo/layer_14/attn_entropy_std": 0.5793708562850952, "geo/layer_21/stable_rank_q_proj": 51.808773040771484, "geo/layer_21/stable_rank_k_proj": 32.093223571777344, "geo/layer_21/stable_rank_o_proj": 90.62278747558594, "geo/layer_21/stable_rank_gate_proj": 104.57752990722656, "geo/layer_21/stable_rank_down_proj": 69.50469207763672, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15912696719169617, "geo/layer_21/attn_entropy_mean": 5.798638343811035, "geo/layer_21/attn_entropy_std": 0.30454131960868835, "geo/layer_27/stable_rank_q_proj": 42.64253616333008, "geo/layer_27/stable_rank_k_proj": 32.78527069091797, "geo/layer_27/stable_rank_o_proj": 114.58061218261719, "geo/layer_27/stable_rank_gate_proj": 94.58493041992188, "geo/layer_27/stable_rank_down_proj": 149.09127807617188, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0653679370880127, "geo/layer_27/attn_entropy_mean": 4.486200332641602, "geo/layer_27/attn_entropy_std": 0.48567789793014526, "attnres/final_alpha/block_0": 0.24819403886795044, "attnres/block_norm/0": 1.452488660812378, "attnres/final_alpha/block_1": 0.008976208046078682, "attnres/block_norm/1": 19354.140625, "attnres/final_alpha/block_2": 0.018311314284801483, "attnres/block_norm/2": 14377.5458984375, "attnres/final_alpha/block_3": 0.019908351823687553, "attnres/block_norm/3": 16707.78515625, "attnres/final_alpha/block_4": 0.027911439538002014, "attnres/block_norm/4": 6186.0498046875, "attnres/final_alpha/block_5": 0.5175095200538635, "attnres/block_norm/5": 4058.366943359375, "attnres/final_alpha/block_6": 0.1591891646385193, "attnres/block_norm/6": 11845.953125, "geo/tier1_time_s": 1.3558175563812256, "geo/step": 11475.0, "geo/rankme_slope": 0.0010407087248962084} {"step": 11480, "timestamp": 1778206939.1003637, "train/loss": 2.2122534275054933, "train/z_loss": 0.001773569080978632, "train/perplexity": 9.136281160997324, "train/grad_norm": 0.201171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1702239.0791192977, "perf/iters_per_sec": 0.8116908450695504, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2319961547851563, "data/tokens_consumed": 24077402112, "data/tokens_consumed_B": 24.077402112, "train/loss_slope": -3.514273881268435e-05} {"step": 11490, "timestamp": 1778206949.4507232, "train/loss": 2.317567539215088, "train/z_loss": 0.001747330918442458, "train/perplexity": 10.150952457017608, "train/grad_norm": 0.162109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027493.5035973454, "perf/iters_per_sec": 0.9667842405306556, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034356951713562, "data/tokens_consumed": 24098373632, "data/tokens_consumed_B": 24.098373632, "train/loss_slope": -3.332390587786956e-05} {"step": 11500, "timestamp": 1778206959.7904878, "grad/layer_0/attn": 0.002679675118997693, "grad/layer_0/mlp": 0.002748028142377734, "grad/layer_0/attn_mlp_ratio": 0.9751264843912599, "grad/layer_4/attn": 0.0016954028978943825, "grad/layer_4/mlp": 0.0028634066693484783, "grad/layer_4/attn_mlp_ratio": 0.5920929279217091, "grad/layer_8/attn": 0.006224106065928936, "grad/layer_8/mlp": 0.004226790275424719, "grad/layer_8/attn_mlp_ratio": 1.4725372003582085, "grad/layer_12/attn": 0.005408892873674631, "grad/layer_12/mlp": 0.005887314677238464, "grad/layer_12/attn_mlp_ratio": 0.9187368228698238, "grad/layer_16/attn": 0.004836622159928083, "grad/layer_16/mlp": 0.004564980044960976, "grad/layer_16/attn_mlp_ratio": 1.059505629015048, "grad/layer_20/attn": 0.005946009419858456, "grad/layer_20/mlp": 0.006586495786905289, "grad/layer_20/attn_mlp_ratio": 0.9027576304541246, "grad/layer_24/attn": 0.015860367566347122, "grad/layer_24/mlp": 0.012985864654183388, "grad/layer_24/attn_mlp_ratio": 1.2213562875154476, "grad/layer_27/attn": 0.01105860248208046, "grad/layer_27/mlp": 0.011738934554159641, "grad/layer_27/attn_mlp_ratio": 0.9420448113800421} {"step": 11500, "timestamp": 1778206959.8061945, "train/loss": 2.261595439910889, "train/z_loss": 0.0017529243952594698, "train/perplexity": 9.598390612713693, "train/grad_norm": 0.1845703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026344.975642075, "perf/iters_per_sec": 0.9662365797243476, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349432229995728, "data/tokens_consumed": 24119345152, "data/tokens_consumed_B": 24.119345152, "train/loss_slope": -3.7089996183379904e-05} {"step": 11500, "timestamp": 1778206966.7541845, "geo/ww_alpha_mean": 8.539718719980485, "geo/ww_alpha_std": 5.557571970598105, "geo/ww_alpha_min": 1.3574195254716381, "geo/ww_alpha_max": 40.78082808026387, "geo/ww_alpha_healthy_frac": 0.15228426395939088, "geo/ww_alpha_by_type/q_proj": 4.329001879007251, "geo/ww_alpha_by_type/k_proj": 4.670800582435901, "geo/ww_alpha_by_type/v_proj": 9.190603920007893, "geo/ww_alpha_by_type/o_proj": 8.713318535746085, "geo/ww_alpha_by_type/gate_proj": 10.189442407276072, "geo/ww_alpha_by_type/up_proj": 13.206726546083285, "geo/ww_alpha_by_type/down_proj": 9.639021073320512, "geo/twonn_id/layer_0": 0.7633002400398254, "geo/twonn_id/layer_7": 3.161187171936035, "geo/twonn_id/layer_14": 4.144181728363037, "geo/twonn_id/layer_21": 8.091673851013184, "geo/twonn_id/layer_27": 6.078675270080566, "geo/tier2_time_s": 6.941492557525635} {"step": 11500, "timestamp": 1778206967.3659086, "eoc/jacobian_sigma/layer_0/attn": 553.0230102539062, "eoc/jacobian_sigma/layer_0/mlp": 3231.53857421875, "eoc/jacobian_sigma/layer_0": 3231.53857421875, "eoc/jacobian_sigma/layer_7/attn": 1.1694976091384888, "eoc/jacobian_sigma/layer_7/mlp": 1.7444487810134888, "eoc/jacobian_sigma/layer_7": 1.7444487810134888, "eoc/jacobian_sigma/layer_14/attn": 1.3850324153900146, "eoc/jacobian_sigma/layer_14/mlp": 8.903196334838867, "eoc/jacobian_sigma/layer_14": 8.903196334838867, "eoc/jacobian_sigma/layer_21/attn": 1.0849199295043945, "eoc/jacobian_sigma/layer_21/mlp": 3.44781756401062, "eoc/jacobian_sigma/layer_21": 3.44781756401062, "eoc/jacobian_sigma/layer_27/attn": 2.186992883682251, "eoc/jacobian_sigma/layer_27/mlp": 22.660486221313477, "eoc/jacobian_sigma/layer_27": 22.660486221313477, "eoc/layer0_sigma": 3231.53857421875, "eoc/sigma_max": 22.660486221313477, "eoc/sigma_min": 1.7444487810134888, "eoc/sigma_mean": 9.188987225294113, "eoc/time_s": 0.6050777435302734} {"step": 11510, "timestamp": 1778206977.838055, "train/loss": 2.2902783155441284, "train/z_loss": 0.0017425113590434193, "train/perplexity": 9.877686412316713, "train/grad_norm": 0.177734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1163385.1979776039, "perf/iters_per_sec": 0.5547452916992206, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.8026290893554688, "data/tokens_consumed": 24140316672, "data/tokens_consumed_B": 24.140316672, "train/loss_slope": -3.6056270235978954e-05} {"step": 11520, "timestamp": 1778206988.196573, "train/loss": 2.2491113185882567, "train/z_loss": 0.00176633131923154, "train/perplexity": 9.479308007263413, "train/grad_norm": 0.19140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025867.7783351901, "perf/iters_per_sec": 0.9660090343166304, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035187005996704, "data/tokens_consumed": 24161288192, "data/tokens_consumed_B": 24.161288192, "train/loss_slope": -3.706721823171848e-05} {"step": 11530, "timestamp": 1778206998.5516818, "train/loss": 2.3075857639312742, "train/z_loss": 0.001744115527253598, "train/perplexity": 10.050131951599427, "train/grad_norm": 0.11376953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026418.4069755238, "perf/iters_per_sec": 0.9662715945127124, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349057197570801, "data/tokens_consumed": 24182259712, "data/tokens_consumed_B": 24.182259712, "train/loss_slope": -3.4608491333332836e-05} {"step": 11540, "timestamp": 1778207008.902584, "train/loss": 2.371481108665466, "train/z_loss": 0.0017275699530728162, "train/perplexity": 10.713248023659022, "train/grad_norm": 0.1640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027488.2227156125, "perf/iters_per_sec": 0.9667817224100173, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343596458435058, "data/tokens_consumed": 24203231232, "data/tokens_consumed_B": 24.203231232, "train/loss_slope": -2.856680107231152e-05} {"step": 11550, "timestamp": 1778207019.2529676, "grad/layer_0/attn": 0.002311072777956724, "grad/layer_0/mlp": 0.002395001472905278, "grad/layer_0/attn_mlp_ratio": 0.9649566848314244, "grad/layer_4/attn": 0.0015531547833234072, "grad/layer_4/mlp": 0.002540640067309141, "grad/layer_4/attn_mlp_ratio": 0.6113241864425036, "grad/layer_8/attn": 0.004100992809981108, "grad/layer_8/mlp": 0.004017614759504795, "grad/layer_8/attn_mlp_ratio": 1.0207530968975433, "grad/layer_12/attn": 0.004075980745255947, "grad/layer_12/mlp": 0.005949077662080526, "grad/layer_12/attn_mlp_ratio": 0.6851449767956109, "grad/layer_16/attn": 0.004821382462978363, "grad/layer_16/mlp": 0.0044132862240076065, "grad/layer_16/attn_mlp_ratio": 1.0924698986219807, "grad/layer_20/attn": 0.006677975878119469, "grad/layer_20/mlp": 0.005580682307481766, "grad/layer_20/attn_mlp_ratio": 1.1966235292599006, "grad/layer_24/attn": 0.008149352855980396, "grad/layer_24/mlp": 0.007951237261295319, "grad/layer_24/attn_mlp_ratio": 1.0249163099632086, "grad/layer_27/attn": 0.007823866792023182, "grad/layer_27/mlp": 0.006904184352606535, "grad/layer_27/attn_mlp_ratio": 1.1332065134889957} {"step": 11550, "timestamp": 1778207019.880022, "eos/sharpness": 12.717700004577635, "eos/L0_probe": 2.1195573806762695, "eos/L_plus": 2.1881301403045654, "eos/L_minus": 2.17816162109375, "eos/grad_norm": 0.10411322861909866, "eos/embed_grad_frac": 0.2256469577550888, "eos/time_s": 0.6242167949676514} {"step": 11550, "timestamp": 1778207019.8996532, "train/loss": 2.320076751708984, "train/z_loss": 0.0017362989950925113, "train/perplexity": 10.176455336438718, "train/grad_norm": 0.10400390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1907900.5295491538, "perf/iters_per_sec": 0.9097578666444558, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0991935729980469, "data/tokens_consumed": 24224202752, "data/tokens_consumed_B": 24.224202752, "train/loss_slope": -2.779306660104795e-05} {"step": 11550, "timestamp": 1778207021.262499, "geo/rankme_last": 440.9973449707031, "geo/layer_0/stable_rank_q_proj": 15.231690406799316, "geo/layer_0/stable_rank_k_proj": 13.180646896362305, "geo/layer_0/stable_rank_o_proj": 55.01789093017578, "geo/layer_0/stable_rank_gate_proj": 161.5246124267578, "geo/layer_0/stable_rank_down_proj": 48.19152069091797, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.03315092623233795, "geo/layer_0/attn_entropy_mean": 6.3468475341796875, "geo/layer_0/attn_entropy_std": 0.28121417760849, "geo/layer_7/stable_rank_q_proj": 43.44486999511719, "geo/layer_7/stable_rank_k_proj": 43.34651565551758, "geo/layer_7/stable_rank_o_proj": 108.303955078125, "geo/layer_7/stable_rank_gate_proj": 121.46341705322266, "geo/layer_7/stable_rank_down_proj": 163.25991821289062, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5937073230743408, "geo/layer_7/attn_entropy_mean": 4.682263374328613, "geo/layer_7/attn_entropy_std": 0.9250319004058838, "geo/layer_14/stable_rank_q_proj": 62.39342498779297, "geo/layer_14/stable_rank_k_proj": 39.72770309448242, "geo/layer_14/stable_rank_o_proj": 49.48835372924805, "geo/layer_14/stable_rank_gate_proj": 105.50968933105469, "geo/layer_14/stable_rank_down_proj": 139.9001922607422, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.36862412095069885, "geo/layer_14/attn_entropy_mean": 5.645282745361328, "geo/layer_14/attn_entropy_std": 0.5821470618247986, "geo/layer_21/stable_rank_q_proj": 51.76042175292969, "geo/layer_21/stable_rank_k_proj": 32.10544967651367, "geo/layer_21/stable_rank_o_proj": 90.56230926513672, "geo/layer_21/stable_rank_gate_proj": 104.26219177246094, "geo/layer_21/stable_rank_down_proj": 69.31724548339844, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15860843658447266, "geo/layer_21/attn_entropy_mean": 5.789193630218506, "geo/layer_21/attn_entropy_std": 0.3013251721858978, "geo/layer_27/stable_rank_q_proj": 42.56694030761719, "geo/layer_27/stable_rank_k_proj": 32.573909759521484, "geo/layer_27/stable_rank_o_proj": 114.52619171142578, "geo/layer_27/stable_rank_gate_proj": 94.54254150390625, "geo/layer_27/stable_rank_down_proj": 148.99191284179688, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.06708525121212006, "geo/layer_27/attn_entropy_mean": 4.495325088500977, "geo/layer_27/attn_entropy_std": 0.4981871247291565, "attnres/final_alpha/block_0": 0.2484777569770813, "attnres/block_norm/0": 1.4546699523925781, "attnres/final_alpha/block_1": 0.008995888754725456, "attnres/block_norm/1": 19432.56640625, "attnres/final_alpha/block_2": 0.018439514562487602, "attnres/block_norm/2": 14461.6572265625, "attnres/final_alpha/block_3": 0.02004080079495907, "attnres/block_norm/3": 16681.05859375, "attnres/final_alpha/block_4": 0.02809157222509384, "attnres/block_norm/4": 6249.876953125, "attnres/final_alpha/block_5": 0.5150039792060852, "attnres/block_norm/5": 4105.11376953125, "attnres/final_alpha/block_6": 0.16095048189163208, "attnres/block_norm/6": 11803.1513671875, "geo/tier1_time_s": 1.3588378429412842, "geo/step": 11550.0, "geo/rankme_slope": 0.00097808474952481} {"step": 11560, "timestamp": 1778207031.6174371, "train/loss": 2.3513784646987914, "train/z_loss": 0.0017254042788408697, "train/perplexity": 10.500033679174807, "train/grad_norm": 0.2451171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1790299.8570356066, "perf/iters_per_sec": 0.8536814961603196, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.171397066116333, "data/tokens_consumed": 24245174272, "data/tokens_consumed_B": 24.245174272, "train/loss_slope": -2.788550066153928e-05} {"step": 11570, "timestamp": 1778207041.978608, "train/loss": 2.3155869960784914, "train/z_loss": 0.0017408692045137286, "train/perplexity": 10.13086795347697, "train/grad_norm": 0.15234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025500.454560234, "perf/iters_per_sec": 0.9658338806916399, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0353747367858888, "data/tokens_consumed": 24266145792, "data/tokens_consumed_B": 24.266145792, "train/loss_slope": -2.704018217430725e-05} {"step": 11580, "timestamp": 1778207052.332803, "train/loss": 2.3456171274185182, "train/z_loss": 0.0017281949636526405, "train/perplexity": 10.439713373359117, "train/grad_norm": 0.1962890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026818.1011763255, "perf/iters_per_sec": 0.9664621835595729, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347016334533692, "data/tokens_consumed": 24287117312, "data/tokens_consumed_B": 24.287117312, "train/loss_slope": -2.5003821764699078e-05} {"step": 11590, "timestamp": 1778207063.2627897, "train/loss": 2.238093686103821, "train/z_loss": 0.0017594175529666244, "train/perplexity": 9.375441706474126, "train/grad_norm": 0.1611328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1919914.9480142035, "perf/iters_per_sec": 0.9154867878027932, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0923150539398194, "data/tokens_consumed": 24308088832, "data/tokens_consumed_B": 24.308088832, "train/loss_slope": -2.845782868825671e-05} {"step": 11600, "timestamp": 1778207073.6104205, "grad/layer_0/attn": 0.0025875952560454607, "grad/layer_0/mlp": 0.0027126220520585775, "grad/layer_0/attn_mlp_ratio": 0.9539092107176683, "grad/layer_4/attn": 0.00180473190266639, "grad/layer_4/mlp": 0.0026786027010530233, "grad/layer_4/attn_mlp_ratio": 0.6737586856688518, "grad/layer_8/attn": 0.0052832672372460365, "grad/layer_8/mlp": 0.0039334986358881, "grad/layer_8/attn_mlp_ratio": 1.3431470535488017, "grad/layer_12/attn": 0.0056917620822787285, "grad/layer_12/mlp": 0.0056571029126644135, "grad/layer_12/attn_mlp_ratio": 1.0061266463659448, "grad/layer_16/attn": 0.004118375014513731, "grad/layer_16/mlp": 0.004369805566966534, "grad/layer_16/attn_mlp_ratio": 0.9424618228784205, "grad/layer_20/attn": 0.009099832735955715, "grad/layer_20/mlp": 0.005852723494172096, "grad/layer_20/attn_mlp_ratio": 1.5548030911654462, "grad/layer_24/attn": 0.01123401615768671, "grad/layer_24/mlp": 0.010023637674748898, "grad/layer_24/attn_mlp_ratio": 1.1207524064753163, "grad/layer_27/attn": 0.008511602878570557, "grad/layer_27/mlp": 0.008322508074343204, "grad/layer_27/attn_mlp_ratio": 1.0227208793630624} {"step": 11600, "timestamp": 1778207073.62631, "train/loss": 2.3647910594940185, "train/z_loss": 0.001721618021838367, "train/perplexity": 10.641815078976409, "train/grad_norm": 0.1552734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025223.6269465173, "perf/iters_per_sec": 0.9657018789990031, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0355162620544434, "data/tokens_consumed": 24329060352, "data/tokens_consumed_B": 24.329060352, "train/loss_slope": -2.6140172081668926e-05} {"step": 11610, "timestamp": 1778207083.996739, "train/loss": 2.2285497903823854, "train/z_loss": 0.0017584409331902861, "train/perplexity": 9.286389098838027, "train/grad_norm": 0.11181640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023836.8979132797, "perf/iters_per_sec": 0.9650406350675963, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0362257957458496, "data/tokens_consumed": 24350031872, "data/tokens_consumed_B": 24.350031872, "train/loss_slope": -2.8604525830199904e-05} {"step": 11620, "timestamp": 1778207094.3694832, "train/loss": 2.286145067214966, "train/z_loss": 0.0017448273836635053, "train/perplexity": 9.836943739252789, "train/grad_norm": 0.2431640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023181.3795604995, "perf/iters_per_sec": 0.9647280595591066, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0365615367889405, "data/tokens_consumed": 24371003392, "data/tokens_consumed_B": 24.371003392, "train/loss_slope": -2.8683470563776963e-05} {"step": 11625, "timestamp": 1778207100.1528091, "eos/sharpness": 41.08493328094482, "eos/L0_probe": 2.1238608360290527, "eos/L_plus": 2.3935368061065674, "eos/L_minus": 2.2650341987609863, "eos/grad_norm": 0.17069752514362335, "eos/embed_grad_frac": 0.09406062960624695, "eos/time_s": 0.6121771335601807} {"step": 11625, "timestamp": 1778207101.530721, "geo/rankme_last": 439.4620056152344, "geo/layer_0/stable_rank_q_proj": 15.248997688293457, "geo/layer_0/stable_rank_k_proj": 13.176873207092285, "geo/layer_0/stable_rank_o_proj": 54.89130401611328, "geo/layer_0/stable_rank_gate_proj": 162.2127227783203, "geo/layer_0/stable_rank_down_proj": 48.18343734741211, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.04093160480260849, "geo/layer_0/attn_entropy_mean": 6.346648693084717, "geo/layer_0/attn_entropy_std": 0.28090351819992065, "geo/layer_7/stable_rank_q_proj": 43.47698974609375, "geo/layer_7/stable_rank_k_proj": 43.817710876464844, "geo/layer_7/stable_rank_o_proj": 108.23904418945312, "geo/layer_7/stable_rank_gate_proj": 120.99073791503906, "geo/layer_7/stable_rank_down_proj": 163.19595336914062, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5907801389694214, "geo/layer_7/attn_entropy_mean": 4.671993255615234, "geo/layer_7/attn_entropy_std": 0.890506386756897, "geo/layer_14/stable_rank_q_proj": 62.271148681640625, "geo/layer_14/stable_rank_k_proj": 39.611324310302734, "geo/layer_14/stable_rank_o_proj": 49.40927505493164, "geo/layer_14/stable_rank_gate_proj": 105.2808837890625, "geo/layer_14/stable_rank_down_proj": 140.09861755371094, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3933056592941284, "geo/layer_14/attn_entropy_mean": 5.637417793273926, "geo/layer_14/attn_entropy_std": 0.5821493864059448, "geo/layer_21/stable_rank_q_proj": 51.622379302978516, "geo/layer_21/stable_rank_k_proj": 32.21751022338867, "geo/layer_21/stable_rank_o_proj": 90.39603424072266, "geo/layer_21/stable_rank_gate_proj": 104.02720642089844, "geo/layer_21/stable_rank_down_proj": 69.19309997558594, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15986880660057068, "geo/layer_21/attn_entropy_mean": 5.801083087921143, "geo/layer_21/attn_entropy_std": 0.2967498004436493, "geo/layer_27/stable_rank_q_proj": 42.65399932861328, "geo/layer_27/stable_rank_k_proj": 32.61639404296875, "geo/layer_27/stable_rank_o_proj": 114.6911849975586, "geo/layer_27/stable_rank_gate_proj": 94.58467864990234, "geo/layer_27/stable_rank_down_proj": 148.8768310546875, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.06648574769496918, "geo/layer_27/attn_entropy_mean": 4.4814863204956055, "geo/layer_27/attn_entropy_std": 0.48399031162261963, "attnres/final_alpha/block_0": 0.24690662324428558, "attnres/block_norm/0": 1.4566633701324463, "attnres/final_alpha/block_1": 0.008889454416930676, "attnres/block_norm/1": 19584.4765625, "attnres/final_alpha/block_2": 0.01803591661155224, "attnres/block_norm/2": 14498.9521484375, "attnres/final_alpha/block_3": 0.019465826451778412, "attnres/block_norm/3": 16944.55859375, "attnres/final_alpha/block_4": 0.027473393827676773, "attnres/block_norm/4": 6278.427734375, "attnres/final_alpha/block_5": 0.5192164182662964, "attnres/block_norm/5": 4101.23095703125, "attnres/final_alpha/block_6": 0.1600123643875122, "attnres/block_norm/6": 12029.7607421875, "geo/tier1_time_s": 1.3582241535186768, "geo/step": 11625.0, "geo/rankme_slope": 0.0009022559414390756} {"step": 11630, "timestamp": 1778207106.7121303, "train/loss": 2.3447631359100343, "train/z_loss": 0.0017241448280401529, "train/perplexity": 10.430801752553114, "train/grad_norm": 0.1123046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1699988.7194703682, "perf/iters_per_sec": 0.8106177899696199, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2336270093917847, "data/tokens_consumed": 24391974912, "data/tokens_consumed_B": 24.391974912, "train/loss_slope": -2.617846566780244e-05} {"step": 11640, "timestamp": 1778207117.078669, "train/loss": 2.270808219909668, "train/z_loss": 0.0017452784348279237, "train/perplexity": 9.687227060762606, "train/grad_norm": 0.25390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023935.2483344127, "perf/iters_per_sec": 0.9650875322029174, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0361754417419433, "data/tokens_consumed": 24412946432, "data/tokens_consumed_B": 24.412946432, "train/loss_slope": -2.4696974277925533e-05} {"step": 11650, "timestamp": 1778207127.4284034, "grad/layer_0/attn": 0.003936950583010912, "grad/layer_0/mlp": 0.0030674892477691174, "grad/layer_0/attn_mlp_ratio": 1.2834439297642957, "grad/layer_4/attn": 0.002067031804472208, "grad/layer_4/mlp": 0.0026947103906422853, "grad/layer_4/attn_mlp_ratio": 0.7670700847642932, "grad/layer_8/attn": 0.006433801259845495, "grad/layer_8/mlp": 0.004261666908860207, "grad/layer_8/attn_mlp_ratio": 1.5096912185934095, "grad/layer_12/attn": 0.005066266283392906, "grad/layer_12/mlp": 0.005925298668444157, "grad/layer_12/attn_mlp_ratio": 0.8550229248141673, "grad/layer_16/attn": 0.0050657582469284534, "grad/layer_16/mlp": 0.005587306339293718, "grad/layer_16/attn_mlp_ratio": 0.9066548079952471, "grad/layer_20/attn": 0.01263918075710535, "grad/layer_20/mlp": 0.007718265056610107, "grad/layer_20/attn_mlp_ratio": 1.6375675751798775, "grad/layer_24/attn": 0.020216546952724457, "grad/layer_24/mlp": 0.012469924986362457, "grad/layer_24/attn_mlp_ratio": 1.6212244109496676, "grad/layer_27/attn": 0.006025139708071947, "grad/layer_27/mlp": 0.011246887966990471, "grad/layer_27/attn_mlp_ratio": 0.5357161618559791} {"step": 11650, "timestamp": 1778207127.444437, "train/loss": 2.329022693634033, "train/z_loss": 0.001727901946287602, "train/perplexity": 10.267901742129062, "train/grad_norm": 0.2041015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024030.3015504032, "perf/iters_per_sec": 0.9651328571083084, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0361267805099488, "data/tokens_consumed": 24433917952, "data/tokens_consumed_B": 24.433917952, "train/loss_slope": -2.0981104081363036e-05} {"step": 11660, "timestamp": 1778207137.7978747, "train/loss": 2.283563232421875, "train/z_loss": 0.0017477608053013682, "train/perplexity": 9.81157913334989, "train/grad_norm": 0.1708984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026528.774056135, "perf/iters_per_sec": 0.9663242216377902, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348493576049804, "data/tokens_consumed": 24454889472, "data/tokens_consumed_B": 24.454889472, "train/loss_slope": -2.247361922242638e-05} {"step": 11670, "timestamp": 1778207148.1539674, "train/loss": 2.337967777252197, "train/z_loss": 0.0017253360245376825, "train/perplexity": 10.360161000011113, "train/grad_norm": 0.1689453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026097.6450360534, "perf/iters_per_sec": 0.9661186433010356, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350695610046388, "data/tokens_consumed": 24475860992, "data/tokens_consumed_B": 24.475860992, "train/loss_slope": -1.758847169392536e-05} {"step": 11680, "timestamp": 1778207158.5094388, "train/loss": 2.2999650716781614, "train/z_loss": 0.0017358390614390373, "train/perplexity": 9.973834079443982, "train/grad_norm": 0.2119140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026325.9767930538, "perf/iters_per_sec": 0.9662275203671712, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349529266357422, "data/tokens_consumed": 24496832512, "data/tokens_consumed_B": 24.496832512, "train/loss_slope": -1.8748856020016668e-05} {"step": 11690, "timestamp": 1778207168.8685162, "train/loss": 2.273625373840332, "train/z_loss": 0.001749486301559955, "train/perplexity": 9.714555947319848, "train/grad_norm": 0.146484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025351.3984834433, "perf/iters_per_sec": 0.9657628052155701, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354509353637695, "data/tokens_consumed": 24517804032, "data/tokens_consumed_B": 24.517804032, "train/loss_slope": -1.908645272219177e-05} {"step": 11700, "timestamp": 1778207179.2119884, "grad/layer_0/attn": 0.002910173498094082, "grad/layer_0/mlp": 0.0027983072213828564, "grad/layer_0/attn_mlp_ratio": 1.039976372808096, "grad/layer_4/attn": 0.0015204864321276546, "grad/layer_4/mlp": 0.0026578372344374657, "grad/layer_4/attn_mlp_ratio": 0.5720765572921969, "grad/layer_8/attn": 0.008667469024658203, "grad/layer_8/mlp": 0.003917009104043245, "grad/layer_8/attn_mlp_ratio": 2.212777293377662, "grad/layer_12/attn": 0.005309042055159807, "grad/layer_12/mlp": 0.005808173678815365, "grad/layer_12/attn_mlp_ratio": 0.9140639136046373, "grad/layer_16/attn": 0.0043371254578232765, "grad/layer_16/mlp": 0.004478528629988432, "grad/layer_16/attn_mlp_ratio": 0.9684264005680454, "grad/layer_20/attn": 0.0040015182457864285, "grad/layer_20/mlp": 0.006553824990987778, "grad/layer_20/attn_mlp_ratio": 0.6105622579535959, "grad/layer_24/attn": 0.014211886562407017, "grad/layer_24/mlp": 0.012886904180049896, "grad/layer_24/attn_mlp_ratio": 1.1028161809511003, "grad/layer_27/attn": 0.007123829796910286, "grad/layer_27/mlp": 0.010908950120210648, "grad/layer_27/attn_mlp_ratio": 0.6530261531226171} {"step": 11700, "timestamp": 1778207179.8176565, "eos/sharpness": 20.422482490539547, "eos/L0_probe": 2.1220922470092773, "eos/L_plus": 2.2525932788848877, "eos/L_minus": 2.1958160400390625, "eos/grad_norm": 0.15252624452114105, "eos/embed_grad_frac": 0.12374763935804367, "eos/time_s": 0.6028025150299072} {"step": 11700, "timestamp": 1778207179.837263, "train/loss": 2.2770976305007933, "train/z_loss": 0.0017529229633510112, "train/perplexity": 9.748346008845111, "train/grad_norm": 0.15234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1912915.803542058, "perf/iters_per_sec": 0.9121493356428423, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0963117122650146, "data/tokens_consumed": 24538775552, "data/tokens_consumed_B": 24.538775552, "train/loss_slope": -1.667995308384752e-05} {"step": 11700, "timestamp": 1778207181.1967597, "geo/rankme_last": 441.2396545410156, "geo/layer_0/stable_rank_q_proj": 15.26810073852539, "geo/layer_0/stable_rank_k_proj": 13.213390350341797, "geo/layer_0/stable_rank_o_proj": 54.98440170288086, "geo/layer_0/stable_rank_gate_proj": 162.8053741455078, "geo/layer_0/stable_rank_down_proj": 48.306331634521484, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.042285218834877014, "geo/layer_0/attn_entropy_mean": 6.352212905883789, "geo/layer_0/attn_entropy_std": 0.2816554307937622, "geo/layer_7/stable_rank_q_proj": 43.5163459777832, "geo/layer_7/stable_rank_k_proj": 43.656070709228516, "geo/layer_7/stable_rank_o_proj": 108.15117645263672, "geo/layer_7/stable_rank_gate_proj": 120.75424194335938, "geo/layer_7/stable_rank_down_proj": 163.2245635986328, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5843712687492371, "geo/layer_7/attn_entropy_mean": 4.6543989181518555, "geo/layer_7/attn_entropy_std": 0.8852595090866089, "geo/layer_14/stable_rank_q_proj": 62.065616607666016, "geo/layer_14/stable_rank_k_proj": 39.60791778564453, "geo/layer_14/stable_rank_o_proj": 49.297523498535156, "geo/layer_14/stable_rank_gate_proj": 105.21517181396484, "geo/layer_14/stable_rank_down_proj": 140.36346435546875, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3916149437427521, "geo/layer_14/attn_entropy_mean": 5.650734901428223, "geo/layer_14/attn_entropy_std": 0.5818524956703186, "geo/layer_21/stable_rank_q_proj": 51.507652282714844, "geo/layer_21/stable_rank_k_proj": 32.1667366027832, "geo/layer_21/stable_rank_o_proj": 90.3649673461914, "geo/layer_21/stable_rank_gate_proj": 103.75184631347656, "geo/layer_21/stable_rank_down_proj": 68.96285247802734, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15667517483234406, "geo/layer_21/attn_entropy_mean": 5.781035900115967, "geo/layer_21/attn_entropy_std": 0.3061767518520355, "geo/layer_27/stable_rank_q_proj": 42.62043380737305, "geo/layer_27/stable_rank_k_proj": 32.56108856201172, "geo/layer_27/stable_rank_o_proj": 115.14285278320312, "geo/layer_27/stable_rank_gate_proj": 94.6644515991211, "geo/layer_27/stable_rank_down_proj": 148.6494140625, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.06728819012641907, "geo/layer_27/attn_entropy_mean": 4.489907264709473, "geo/layer_27/attn_entropy_std": 0.5201196670532227, "attnres/final_alpha/block_0": 0.24800564348697662, "attnres/block_norm/0": 1.4587626457214355, "attnres/final_alpha/block_1": 0.009017398580908775, "attnres/block_norm/1": 19641.76171875, "attnres/final_alpha/block_2": 0.018155274912714958, "attnres/block_norm/2": 14498.623046875, "attnres/final_alpha/block_3": 0.019562281668186188, "attnres/block_norm/3": 16809.013671875, "attnres/final_alpha/block_4": 0.027304673567414284, "attnres/block_norm/4": 6288.8173828125, "attnres/final_alpha/block_5": 0.5163853764533997, "attnres/block_norm/5": 4107.4580078125, "attnres/final_alpha/block_6": 0.16156932711601257, "attnres/block_norm/6": 12038.1953125, "geo/tier1_time_s": 1.355525016784668, "geo/step": 11700.0, "geo/rankme_slope": 0.0008634872894470288} {"step": 11710, "timestamp": 1778207191.564678, "train/loss": 2.3118669986724854, "train/z_loss": 0.0017447808058932424, "train/perplexity": 10.093251161533287, "train/grad_norm": 0.1611328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1788884.7536420992, "perf/iters_per_sec": 0.853006722279596, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1723237037658691, "data/tokens_consumed": 24559747072, "data/tokens_consumed_B": 24.559747072, "train/loss_slope": -1.4307647805796219e-05} {"step": 11720, "timestamp": 1778207201.926806, "train/loss": 2.347538638114929, "train/z_loss": 0.0017246698611415924, "train/perplexity": 10.45979267939608, "train/grad_norm": 0.2041015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025084.4489290316, "perf/iters_per_sec": 0.9656355137486609, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0355874300003052, "data/tokens_consumed": 24580718592, "data/tokens_consumed_B": 24.580718592, "train/loss_slope": -1.133545388554513e-05} {"step": 11730, "timestamp": 1778207212.2776916, "train/loss": 2.316267657279968, "train/z_loss": 0.0017323373816907407, "train/perplexity": 10.137765989576634, "train/grad_norm": 0.10986328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027266.1701825396, "perf/iters_per_sec": 0.966675839511175, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034472942352295, "data/tokens_consumed": 24601690112, "data/tokens_consumed_B": 24.601690112, "train/loss_slope": -9.399597329346626e-06} {"step": 11740, "timestamp": 1778207222.6358793, "train/loss": 2.2772284507751466, "train/z_loss": 0.0017476132838055492, "train/perplexity": 9.749621373564441, "train/grad_norm": 0.158203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026512.4329941303, "perf/iters_per_sec": 0.9663164296122219, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034857702255249, "data/tokens_consumed": 24622661632, "data/tokens_consumed_B": 24.622661632, "train/loss_slope": -7.863917166214642e-06} {"step": 11750, "timestamp": 1778207232.9809988, "grad/layer_0/attn": 0.002569264266639948, "grad/layer_0/mlp": 0.0027369007002562284, "grad/layer_0/attn_mlp_ratio": 0.9387495032335164, "grad/layer_4/attn": 0.0021899330895394087, "grad/layer_4/mlp": 0.002705648308619857, "grad/layer_4/attn_mlp_ratio": 0.8093930765588597, "grad/layer_8/attn": 0.005190485622733831, "grad/layer_8/mlp": 0.003943758551031351, "grad/layer_8/attn_mlp_ratio": 1.3161265893835676, "grad/layer_12/attn": 0.004226702265441418, "grad/layer_12/mlp": 0.005934192333370447, "grad/layer_12/attn_mlp_ratio": 0.7122624203544365, "grad/layer_16/attn": 0.004284212831407785, "grad/layer_16/mlp": 0.004255803767591715, "grad/layer_16/attn_mlp_ratio": 1.00667534611555, "grad/layer_20/attn": 0.004348756745457649, "grad/layer_20/mlp": 0.005675273947417736, "grad/layer_20/attn_mlp_ratio": 0.7662637449968333, "grad/layer_24/attn": 0.009659096598625183, "grad/layer_24/mlp": 0.012223189696669579, "grad/layer_24/attn_mlp_ratio": 0.7902271632284539, "grad/layer_27/attn": 0.008165659382939339, "grad/layer_27/mlp": 0.010126566514372826, "grad/layer_27/attn_mlp_ratio": 0.8063601113678219} {"step": 11750, "timestamp": 1778207232.996783, "train/loss": 2.29256329536438, "train/z_loss": 0.0017396426759660245, "train/perplexity": 9.900282532447441, "train/grad_norm": 0.134765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025542.4796527359, "perf/iters_per_sec": 0.9658539198173217, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0353532552719116, "data/tokens_consumed": 24643633152, "data/tokens_consumed_B": 24.643633152, "train/loss_slope": -7.182807046802742e-06} {"step": 11760, "timestamp": 1778207243.745095, "train/loss": 2.3096922636032104, "train/z_loss": 0.0017378606367856263, "train/perplexity": 10.071324864854114, "train/grad_norm": 0.177734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1952195.9458024113, "perf/iters_per_sec": 0.9308795670520836, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0742528200149537, "data/tokens_consumed": 24664604672, "data/tokens_consumed_B": 24.664604672, "train/loss_slope": -6.165662542893118e-06} {"step": 11770, "timestamp": 1778207254.0991607, "train/loss": 2.345909261703491, "train/z_loss": 0.0017253599711693824, "train/perplexity": 10.442763617079457, "train/grad_norm": 0.138671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026663.387744922, "perf/iters_per_sec": 0.9663884104466066, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347806215286255, "data/tokens_consumed": 24685576192, "data/tokens_consumed_B": 24.685576192, "train/loss_slope": -7.416219851508156e-07} {"step": 11775, "timestamp": 1778207260.3968248, "eos/sharpness": 30.604386329650872, "eos/L0_probe": 2.1231822967529297, "eos/L_plus": 2.2989537715911865, "eos/L_minus": 2.2534546852111816, "eos/grad_norm": 0.2640906572341919, "eos/embed_grad_frac": 0.044390931725502014, "eos/time_s": 0.610485315322876} {"step": 11775, "timestamp": 1778207261.7731876, "geo/rankme_last": 440.1255187988281, "geo/layer_0/stable_rank_q_proj": 15.237295150756836, "geo/layer_0/stable_rank_k_proj": 13.209320068359375, "geo/layer_0/stable_rank_o_proj": 54.98845291137695, "geo/layer_0/stable_rank_gate_proj": 162.74029541015625, "geo/layer_0/stable_rank_down_proj": 48.40821075439453, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.036367326974868774, "geo/layer_0/attn_entropy_mean": 6.352455139160156, "geo/layer_0/attn_entropy_std": 0.2766502797603607, "geo/layer_7/stable_rank_q_proj": 43.78893280029297, "geo/layer_7/stable_rank_k_proj": 43.404029846191406, "geo/layer_7/stable_rank_o_proj": 108.43746948242188, "geo/layer_7/stable_rank_gate_proj": 120.83927154541016, "geo/layer_7/stable_rank_down_proj": 162.83175659179688, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5954294800758362, "geo/layer_7/attn_entropy_mean": 4.6493096351623535, "geo/layer_7/attn_entropy_std": 0.8735068440437317, "geo/layer_14/stable_rank_q_proj": 62.04124069213867, "geo/layer_14/stable_rank_k_proj": 39.604976654052734, "geo/layer_14/stable_rank_o_proj": 49.52180480957031, "geo/layer_14/stable_rank_gate_proj": 105.07991790771484, "geo/layer_14/stable_rank_down_proj": 140.03248596191406, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39810964465141296, "geo/layer_14/attn_entropy_mean": 5.626242637634277, "geo/layer_14/attn_entropy_std": 0.5889368653297424, "geo/layer_21/stable_rank_q_proj": 51.59128952026367, "geo/layer_21/stable_rank_k_proj": 32.230751037597656, "geo/layer_21/stable_rank_o_proj": 90.33838653564453, "geo/layer_21/stable_rank_gate_proj": 103.5261459350586, "geo/layer_21/stable_rank_down_proj": 68.86507415771484, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15601883828639984, "geo/layer_21/attn_entropy_mean": 5.802548885345459, "geo/layer_21/attn_entropy_std": 0.3004366457462311, "geo/layer_27/stable_rank_q_proj": 42.611080169677734, "geo/layer_27/stable_rank_k_proj": 32.53093338012695, "geo/layer_27/stable_rank_o_proj": 115.86585998535156, "geo/layer_27/stable_rank_gate_proj": 94.71297454833984, "geo/layer_27/stable_rank_down_proj": 148.59475708007812, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07074213027954102, "geo/layer_27/attn_entropy_mean": 4.500393867492676, "geo/layer_27/attn_entropy_std": 0.5143259763717651, "attnres/final_alpha/block_0": 0.2472735196352005, "attnres/block_norm/0": 1.4609184265136719, "attnres/final_alpha/block_1": 0.008846675977110863, "attnres/block_norm/1": 19768.609375, "attnres/final_alpha/block_2": 0.01795811951160431, "attnres/block_norm/2": 14539.00390625, "attnres/final_alpha/block_3": 0.019414003938436508, "attnres/block_norm/3": 17101.90625, "attnres/final_alpha/block_4": 0.027264975011348724, "attnres/block_norm/4": 6314.13720703125, "attnres/final_alpha/block_5": 0.5206581354141235, "attnres/block_norm/5": 4123.412109375, "attnres/final_alpha/block_6": 0.1585845947265625, "attnres/block_norm/6": 12096.212890625, "geo/tier1_time_s": 1.356231451034546, "geo/step": 11775.0, "geo/rankme_slope": 0.0008183573429371749} {"step": 11780, "timestamp": 1778207266.954277, "train/loss": 2.290262818336487, "train/z_loss": 0.0017399514792487026, "train/perplexity": 9.877533336945485, "train/grad_norm": 0.1484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1631943.3128347516, "perf/iters_per_sec": 0.7781712116407163, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2850642442703246, "data/tokens_consumed": 24706547712, "data/tokens_consumed_B": 24.706547712, "train/loss_slope": 1.1666515515152773e-06} {"step": 11790, "timestamp": 1778207277.3171108, "train/loss": 2.2908090114593507, "train/z_loss": 0.0017474962281994522, "train/perplexity": 9.882929851360354, "train/grad_norm": 0.1826171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024704.0799929197, "perf/iters_per_sec": 0.9654541397060965, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035781979560852, "data/tokens_consumed": 24727519232, "data/tokens_consumed_B": 24.727519232, "train/loss_slope": 3.717433773215857e-06} {"step": 11800, "timestamp": 1778207287.661088, "grad/layer_0/attn": 0.002663316670805216, "grad/layer_0/mlp": 0.0026485431008040905, "grad/layer_0/attn_mlp_ratio": 1.00557796074334, "grad/layer_4/attn": 0.0016698050312697887, "grad/layer_4/mlp": 0.002830671379342675, "grad/layer_4/attn_mlp_ratio": 0.5898971475338928, "grad/layer_8/attn": 0.005136690568178892, "grad/layer_8/mlp": 0.004128394182771444, "grad/layer_8/attn_mlp_ratio": 1.2442344932060525, "grad/layer_12/attn": 0.0052949208766222, "grad/layer_12/mlp": 0.005832582246512175, "grad/layer_12/attn_mlp_ratio": 0.9078175946865987, "grad/layer_16/attn": 0.004254633095115423, "grad/layer_16/mlp": 0.004809126257896423, "grad/layer_16/attn_mlp_ratio": 0.8846997933688017, "grad/layer_20/attn": 0.007908793166279793, "grad/layer_20/mlp": 0.0063361008651554585, "grad/layer_20/attn_mlp_ratio": 1.2482113542339597, "grad/layer_24/attn": 0.015460592694580555, "grad/layer_24/mlp": 0.011242790147662163, "grad/layer_24/attn_mlp_ratio": 1.3751561982396183, "grad/layer_27/attn": 0.008647591806948185, "grad/layer_27/mlp": 0.01004121359437704, "grad/layer_27/attn_mlp_ratio": 0.8612098168760947} {"step": 11800, "timestamp": 1778207287.6770303, "train/loss": 2.2903245210647585, "train/z_loss": 0.001736520673148334, "train/perplexity": 9.87814282650436, "train/grad_norm": 0.1689453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025703.3665166565, "perf/iters_per_sec": 0.9659306366523058, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352710247039796, "data/tokens_consumed": 24748490752, "data/tokens_consumed_B": 24.748490752, "train/loss_slope": 3.3547526181298424e-06} {"step": 11810, "timestamp": 1778207298.031915, "train/loss": 2.2709845542907714, "train/z_loss": 0.0017433183267712594, "train/perplexity": 9.688935402566244, "train/grad_norm": 0.2158203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026475.829971537, "perf/iters_per_sec": 0.9662989759309468, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348763942718506, "data/tokens_consumed": 24769462272, "data/tokens_consumed_B": 24.769462272, "train/loss_slope": 2.8995443813942984e-06} {"step": 11820, "timestamp": 1778207308.884765, "train/loss": 2.3012394428253176, "train/z_loss": 0.0017323773121461273, "train/perplexity": 9.986552548124854, "train/grad_norm": 0.1796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1933272.0541090663, "perf/iters_per_sec": 0.9218559523148853, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0847681760787964, "data/tokens_consumed": 24790433792, "data/tokens_consumed_B": 24.790433792, "train/loss_slope": 3.1076027543702827e-06} {"step": 11830, "timestamp": 1778207319.2437415, "train/loss": 2.260983395576477, "train/z_loss": 0.0017531814286485315, "train/perplexity": 9.592517769523239, "train/grad_norm": 0.095703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025659.8886248886, "perf/iters_per_sec": 0.9659099047779506, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352932453155517, "data/tokens_consumed": 24811405312, "data/tokens_consumed_B": 24.811405312, "train/loss_slope": 8.741165330522112e-07} {"step": 11840, "timestamp": 1778207330.050369, "train/loss": 2.2942097902297975, "train/z_loss": 0.0017490882892161608, "train/perplexity": 9.91659672373382, "train/grad_norm": 0.146484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1941642.5208610683, "perf/iters_per_sec": 0.9258473018937436, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0800917148590088, "data/tokens_consumed": 24832376832, "data/tokens_consumed_B": 24.832376832, "train/loss_slope": 1.5565857313575604e-06} {"step": 11850, "timestamp": 1778207340.3926616, "grad/layer_0/attn": 0.002688743406906724, "grad/layer_0/mlp": 0.0026898954529315233, "grad/layer_0/attn_mlp_ratio": 0.9995716762966712, "grad/layer_4/attn": 0.0015420133713632822, "grad/layer_4/mlp": 0.002634448232129216, "grad/layer_4/attn_mlp_ratio": 0.5853268604880911, "grad/layer_8/attn": 0.007693937513977289, "grad/layer_8/mlp": 0.003970426507294178, "grad/layer_8/attn_mlp_ratio": 1.9378112920769142, "grad/layer_12/attn": 0.004545197356492281, "grad/layer_12/mlp": 0.005832554306834936, "grad/layer_12/attn_mlp_ratio": 0.7792807472427428, "grad/layer_16/attn": 0.005010165739804506, "grad/layer_16/mlp": 0.0046545774675905704, "grad/layer_16/attn_mlp_ratio": 1.0763953693005068, "grad/layer_20/attn": 0.007558460347354412, "grad/layer_20/mlp": 0.006440815515816212, "grad/layer_20/attn_mlp_ratio": 1.1735253418516884, "grad/layer_24/attn": 0.01179492473602295, "grad/layer_24/mlp": 0.011097094975411892, "grad/layer_24/attn_mlp_ratio": 1.062883994042482, "grad/layer_27/attn": 0.004575169645249844, "grad/layer_27/mlp": 0.008771686814725399, "grad/layer_27/attn_mlp_ratio": 0.5215837831112411} {"step": 11850, "timestamp": 1778207341.0060692, "eos/sharpness": 30.528569221496575, "eos/L0_probe": 2.116683006286621, "eos/L_plus": 2.304401159286499, "eos/L_minus": 2.234250545501709, "eos/grad_norm": 0.1458997279405594, "eos/embed_grad_frac": 0.13529276847839355, "eos/time_s": 0.6105339527130127} {"step": 11850, "timestamp": 1778207341.0255275, "train/loss": 2.274165415763855, "train/z_loss": 0.0017489045625552536, "train/perplexity": 9.719803631657042, "train/grad_norm": 0.1455078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1911755.096548697, "perf/iters_per_sec": 0.9115958674186215, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0969773292541505, "data/tokens_consumed": 24853348352, "data/tokens_consumed_B": 24.853348352, "train/loss_slope": 8.089631804823693e-08} {"step": 11850, "timestamp": 1778207342.3876753, "geo/rankme_last": 440.2672424316406, "geo/layer_0/stable_rank_q_proj": 15.243589401245117, "geo/layer_0/stable_rank_k_proj": 13.242968559265137, "geo/layer_0/stable_rank_o_proj": 55.042503356933594, "geo/layer_0/stable_rank_gate_proj": 162.33021545410156, "geo/layer_0/stable_rank_down_proj": 48.32744598388672, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.03392567113041878, "geo/layer_0/attn_entropy_mean": 6.350374221801758, "geo/layer_0/attn_entropy_std": 0.2771173417568207, "geo/layer_7/stable_rank_q_proj": 43.69639587402344, "geo/layer_7/stable_rank_k_proj": 43.52690124511719, "geo/layer_7/stable_rank_o_proj": 108.72535705566406, "geo/layer_7/stable_rank_gate_proj": 120.97753143310547, "geo/layer_7/stable_rank_down_proj": 162.99136352539062, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5904467105865479, "geo/layer_7/attn_entropy_mean": 4.654703617095947, "geo/layer_7/attn_entropy_std": 0.8814968466758728, "geo/layer_14/stable_rank_q_proj": 61.9984130859375, "geo/layer_14/stable_rank_k_proj": 39.50080871582031, "geo/layer_14/stable_rank_o_proj": 49.60664749145508, "geo/layer_14/stable_rank_gate_proj": 104.71434020996094, "geo/layer_14/stable_rank_down_proj": 139.57168579101562, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3984230160713196, "geo/layer_14/attn_entropy_mean": 5.643715858459473, "geo/layer_14/attn_entropy_std": 0.5744817852973938, "geo/layer_21/stable_rank_q_proj": 51.53899383544922, "geo/layer_21/stable_rank_k_proj": 32.28982925415039, "geo/layer_21/stable_rank_o_proj": 90.38790130615234, "geo/layer_21/stable_rank_gate_proj": 103.46663665771484, "geo/layer_21/stable_rank_down_proj": 68.67475128173828, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15632563829421997, "geo/layer_21/attn_entropy_mean": 5.790205955505371, "geo/layer_21/attn_entropy_std": 0.29672837257385254, "geo/layer_27/stable_rank_q_proj": 42.54324722290039, "geo/layer_27/stable_rank_k_proj": 32.60375213623047, "geo/layer_27/stable_rank_o_proj": 116.12568664550781, "geo/layer_27/stable_rank_gate_proj": 94.65603637695312, "geo/layer_27/stable_rank_down_proj": 148.46580505371094, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.06833281368017197, "geo/layer_27/attn_entropy_mean": 4.479546546936035, "geo/layer_27/attn_entropy_std": 0.49996399879455566, "attnres/final_alpha/block_0": 0.24715015292167664, "attnres/block_norm/0": 1.4630745649337769, "attnres/final_alpha/block_1": 0.008980147540569305, "attnres/block_norm/1": 19901.5, "attnres/final_alpha/block_2": 0.017917580902576447, "attnres/block_norm/2": 14719.54296875, "attnres/final_alpha/block_3": 0.019488399848341942, "attnres/block_norm/3": 17074.78515625, "attnres/final_alpha/block_4": 0.027119385078549385, "attnres/block_norm/4": 6327.7080078125, "attnres/final_alpha/block_5": 0.5203136205673218, "attnres/block_norm/5": 4135.99462890625, "attnres/final_alpha/block_6": 0.15903069078922272, "attnres/block_norm/6": 12155.4375, "geo/tier1_time_s": 1.3583393096923828, "geo/step": 11850.0, "geo/rankme_slope": 0.0007683934706695178} {"step": 11860, "timestamp": 1778207352.7448869, "train/loss": 2.3355754137039186, "train/z_loss": 0.0017130960826762021, "train/perplexity": 10.33540535254191, "train/grad_norm": 0.212890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1790030.9076638725, "perf/iters_per_sec": 0.8535532511062014, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1715730667114257, "data/tokens_consumed": 24874319872, "data/tokens_consumed_B": 24.874319872, "train/loss_slope": 1.0815386367280408e-06} {"step": 11870, "timestamp": 1778207363.0985603, "train/loss": 2.2810155868530275, "train/z_loss": 0.0017393344780430198, "train/perplexity": 9.78661452124185, "train/grad_norm": 0.1416015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026559.2625134853, "perf/iters_per_sec": 0.9663387596671511, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348337888717651, "data/tokens_consumed": 24895291392, "data/tokens_consumed_B": 24.895291392, "train/loss_slope": -5.14276743722048e-07} {"step": 11880, "timestamp": 1778207373.4511409, "train/loss": 2.2765233755111693, "train/z_loss": 0.00174170162063092, "train/perplexity": 9.742749579551468, "train/grad_norm": 0.1435546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026508.9780604846, "perf/iters_per_sec": 0.9663147821714805, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348594665527344, "data/tokens_consumed": 24916262912, "data/tokens_consumed_B": 24.916262912, "train/loss_slope": 5.459076572101141e-07} {"step": 11890, "timestamp": 1778207383.8378956, "train/loss": 2.332642650604248, "train/z_loss": 0.0017225576215423644, "train/perplexity": 10.305138461608315, "train/grad_norm": 0.263671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023220.8420199112, "perf/iters_per_sec": 0.9647468767261081, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0365413188934327, "data/tokens_consumed": 24937234432, "data/tokens_consumed_B": 24.937234432, "train/loss_slope": 3.362019699398829e-06} {"step": 11900, "timestamp": 1778207394.1744924, "grad/layer_0/attn": 0.0026140070986002684, "grad/layer_0/mlp": 0.0026632952503859997, "grad/layer_0/attn_mlp_ratio": 0.9814935088672806, "grad/layer_4/attn": 0.0017087676096707582, "grad/layer_4/mlp": 0.002642948180437088, "grad/layer_4/attn_mlp_ratio": 0.6465384216251724, "grad/layer_8/attn": 0.004680501762777567, "grad/layer_8/mlp": 0.004153383895754814, "grad/layer_8/attn_mlp_ratio": 1.1269128420491632, "grad/layer_12/attn": 0.0048132226802408695, "grad/layer_12/mlp": 0.005939079448580742, "grad/layer_12/attn_mlp_ratio": 0.8104324316368321, "grad/layer_16/attn": 0.004574622958898544, "grad/layer_16/mlp": 0.004482021555304527, "grad/layer_16/attn_mlp_ratio": 1.0206606104824198, "grad/layer_20/attn": 0.004415025468915701, "grad/layer_20/mlp": 0.006032215431332588, "grad/layer_20/attn_mlp_ratio": 0.7319077784908606, "grad/layer_24/attn": 0.01273657288402319, "grad/layer_24/mlp": 0.012542876414954662, "grad/layer_24/attn_mlp_ratio": 1.0154427390589063, "grad/layer_27/attn": 0.007973234169185162, "grad/layer_27/mlp": 0.011473922990262508, "grad/layer_27/attn_mlp_ratio": 0.694900436970137} {"step": 11900, "timestamp": 1778207394.1907172, "train/loss": 2.2920143604278564, "train/z_loss": 0.0017372434376738966, "train/perplexity": 9.894849412834942, "train/grad_norm": 0.15625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026596.4288569074, "perf/iters_per_sec": 0.9663564819607293, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348148107528687, "data/tokens_consumed": 24958205952, "data/tokens_consumed_B": 24.958205952, "train/loss_slope": 6.319989251046071e-06} {"step": 11910, "timestamp": 1778207404.5449018, "train/loss": 2.3289653778076174, "train/z_loss": 0.001734066754579544, "train/perplexity": 10.267313245720397, "train/grad_norm": 0.1943359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026181.653190106, "perf/iters_per_sec": 0.9661587015104799, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350266456604005, "data/tokens_consumed": 24979177472, "data/tokens_consumed_B": 24.979177472, "train/loss_slope": 7.447373083751566e-06} {"step": 11920, "timestamp": 1778207414.902726, "train/loss": 2.312743616104126, "train/z_loss": 0.001734498713631183, "train/perplexity": 10.102102960697286, "train/grad_norm": 0.1767578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025656.3432997058, "perf/iters_per_sec": 0.9659082142351655, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035295057296753, "data/tokens_consumed": 25000148992, "data/tokens_consumed_B": 25.000148992, "train/loss_slope": 5.619763884500045e-06} {"step": 11925, "timestamp": 1778207420.6880417, "eos/sharpness": 33.49261283874511, "eos/L0_probe": 2.1206438541412354, "eos/L_plus": 2.3400092124938965, "eos/L_minus": 2.2362046241760254, "eos/grad_norm": 0.17647846043109894, "eos/embed_grad_frac": 0.10561441630125046, "eos/time_s": 0.6197483539581299} {"step": 11925, "timestamp": 1778207422.068624, "geo/rankme_last": 440.643798828125, "geo/layer_0/stable_rank_q_proj": 15.279086112976074, "geo/layer_0/stable_rank_k_proj": 13.25367546081543, "geo/layer_0/stable_rank_o_proj": 55.228050231933594, "geo/layer_0/stable_rank_gate_proj": 162.2677001953125, "geo/layer_0/stable_rank_down_proj": 48.240177154541016, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.04077720269560814, "geo/layer_0/attn_entropy_mean": 6.343629837036133, "geo/layer_0/attn_entropy_std": 0.2773643136024475, "geo/layer_7/stable_rank_q_proj": 43.78374481201172, "geo/layer_7/stable_rank_k_proj": 43.540313720703125, "geo/layer_7/stable_rank_o_proj": 108.27263641357422, "geo/layer_7/stable_rank_gate_proj": 120.65336608886719, "geo/layer_7/stable_rank_down_proj": 162.63661193847656, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5749196410179138, "geo/layer_7/attn_entropy_mean": 4.660585403442383, "geo/layer_7/attn_entropy_std": 0.8960450887680054, "geo/layer_14/stable_rank_q_proj": 61.79765701293945, "geo/layer_14/stable_rank_k_proj": 39.34713363647461, "geo/layer_14/stable_rank_o_proj": 49.69768524169922, "geo/layer_14/stable_rank_gate_proj": 104.39694213867188, "geo/layer_14/stable_rank_down_proj": 139.6088104248047, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4075876772403717, "geo/layer_14/attn_entropy_mean": 5.620485305786133, "geo/layer_14/attn_entropy_std": 0.5681572556495667, "geo/layer_21/stable_rank_q_proj": 51.74299240112305, "geo/layer_21/stable_rank_k_proj": 32.378273010253906, "geo/layer_21/stable_rank_o_proj": 90.41765594482422, "geo/layer_21/stable_rank_gate_proj": 103.64531707763672, "geo/layer_21/stable_rank_down_proj": 68.44526672363281, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.16107945144176483, "geo/layer_21/attn_entropy_mean": 5.799924850463867, "geo/layer_21/attn_entropy_std": 0.3034042716026306, "geo/layer_27/stable_rank_q_proj": 42.49329376220703, "geo/layer_27/stable_rank_k_proj": 32.455753326416016, "geo/layer_27/stable_rank_o_proj": 116.0076675415039, "geo/layer_27/stable_rank_gate_proj": 94.72168731689453, "geo/layer_27/stable_rank_down_proj": 148.07992553710938, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.06674976646900177, "geo/layer_27/attn_entropy_mean": 4.488275051116943, "geo/layer_27/attn_entropy_std": 0.5269991159439087, "attnres/final_alpha/block_0": 0.24840694665908813, "attnres/block_norm/0": 1.4649574756622314, "attnres/final_alpha/block_1": 0.008853007107973099, "attnres/block_norm/1": 20028.20703125, "attnres/final_alpha/block_2": 0.01785469427704811, "attnres/block_norm/2": 14756.7255859375, "attnres/final_alpha/block_3": 0.019379325211048126, "attnres/block_norm/3": 17430.140625, "attnres/final_alpha/block_4": 0.027569863945245743, "attnres/block_norm/4": 6385.28515625, "attnres/final_alpha/block_5": 0.516470730304718, "attnres/block_norm/5": 4155.79443359375, "attnres/final_alpha/block_6": 0.16146546602249146, "attnres/block_norm/6": 12263.662109375, "geo/tier1_time_s": 1.3598501682281494, "geo/step": 11925.0, "geo/rankme_slope": 0.0007411245748299319} {"step": 11930, "timestamp": 1778207427.249255, "train/loss": 2.2838042259216307, "train/z_loss": 0.0017325684428215026, "train/perplexity": 9.813943945084047, "train/grad_norm": 0.11474609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1699180.9018969852, "perf/iters_per_sec": 0.8102325925335814, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2342134952545165, "data/tokens_consumed": 25021120512, "data/tokens_consumed_B": 25.021120512, "train/loss_slope": 5.11971861973491e-06} {"step": 11940, "timestamp": 1778207437.60986, "train/loss": 2.3175766706466674, "train/z_loss": 0.0017343134386464953, "train/perplexity": 10.151045150168644, "train/grad_norm": 0.337890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025135.1289605582, "perf/iters_per_sec": 0.9656596798708716, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0355615139007568, "data/tokens_consumed": 25042092032, "data/tokens_consumed_B": 25.042092032, "train/loss_slope": 6.65579222717194e-06} {"step": 11950, "timestamp": 1778207447.9593732, "grad/layer_0/attn": 0.0028007717337459326, "grad/layer_0/mlp": 0.0027099931612610817, "grad/layer_0/attn_mlp_ratio": 1.033497674618795, "grad/layer_4/attn": 0.0018179957987740636, "grad/layer_4/mlp": 0.0026550383772701025, "grad/layer_4/attn_mlp_ratio": 0.684734256899854, "grad/layer_8/attn": 0.009175348095595837, "grad/layer_8/mlp": 0.004136127885431051, "grad/layer_8/attn_mlp_ratio": 2.2183424033092676, "grad/layer_12/attn": 0.00449941772967577, "grad/layer_12/mlp": 0.005627757403999567, "grad/layer_12/attn_mlp_ratio": 0.7995045498101327, "grad/layer_16/attn": 0.004324931185692549, "grad/layer_16/mlp": 0.0043323147110641, "grad/layer_16/attn_mlp_ratio": 0.9982956858645881, "grad/layer_20/attn": 0.00459512323141098, "grad/layer_20/mlp": 0.006219455506652594, "grad/layer_20/attn_mlp_ratio": 0.7388304575236202, "grad/layer_24/attn": 0.014020489528775215, "grad/layer_24/mlp": 0.01031060703098774, "grad/layer_24/attn_mlp_ratio": 1.3598122157751225, "grad/layer_27/attn": 0.009518945589661598, "grad/layer_27/mlp": 0.009894768707454205, "grad/layer_27/attn_mlp_ratio": 0.9620179889893455} {"step": 11950, "timestamp": 1778207447.975473, "train/loss": 2.3746191024780274, "train/z_loss": 0.0017095415270887316, "train/perplexity": 10.74691893159028, "train/grad_norm": 0.1630859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024191.6004135236, "perf/iters_per_sec": 0.9652097703998201, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036044216156006, "data/tokens_consumed": 25063063552, "data/tokens_consumed_B": 25.063063552, "train/loss_slope": 7.115093309028776e-06} {"step": 11960, "timestamp": 1778207458.3270774, "train/loss": 2.2884036779403685, "train/z_loss": 0.0017233548918738962, "train/perplexity": 9.859186675498933, "train/grad_norm": 0.185546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026784.2424601878, "perf/iters_per_sec": 0.9664460384655894, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034718918800354, "data/tokens_consumed": 25084035072, "data/tokens_consumed_B": 25.084035072, "train/loss_slope": 8.102975319428295e-06} {"step": 11970, "timestamp": 1778207468.6839523, "train/loss": 2.3159674167633058, "train/z_loss": 0.0017204564297571777, "train/perplexity": 10.134722678363648, "train/grad_norm": 0.19140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025804.8843969947, "perf/iters_per_sec": 0.9659790441498731, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035219144821167, "data/tokens_consumed": 25105006592, "data/tokens_consumed_B": 25.105006592, "train/loss_slope": 7.129348293639789e-06} {"step": 11980, "timestamp": 1778207479.040016, "train/loss": 2.254947280883789, "train/z_loss": 0.001748969138134271, "train/perplexity": 9.53479063116115, "train/grad_norm": 0.1875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026077.624120646, "perf/iters_per_sec": 0.9661090965846281, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035079789161682, "data/tokens_consumed": 25125978112, "data/tokens_consumed_B": 25.125978112, "train/loss_slope": 3.358624470044365e-06} {"step": 11990, "timestamp": 1778207489.394494, "train/loss": 2.3102374792099, "train/z_loss": 0.0017253321479074658, "train/perplexity": 10.076817405523862, "train/grad_norm": 0.205078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026317.527787912, "perf/iters_per_sec": 0.9662234915675697, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349572420120239, "data/tokens_consumed": 25146949632, "data/tokens_consumed_B": 25.146949632, "train/loss_slope": 2.907735565350366e-06} {"step": 12000, "timestamp": 1778207499.7627022, "grad/layer_0/attn": 0.0026981388218700886, "grad/layer_0/mlp": 0.002673120005056262, "grad/layer_0/attn_mlp_ratio": 1.0093593687640534, "grad/layer_4/attn": 0.0018586451187729836, "grad/layer_4/mlp": 0.0027208616957068443, "grad/layer_4/attn_mlp_ratio": 0.6831089773488961, "grad/layer_8/attn": 0.004488688427954912, "grad/layer_8/mlp": 0.004171354230493307, "grad/layer_8/attn_mlp_ratio": 1.076074596478616, "grad/layer_12/attn": 0.0048778061755001545, "grad/layer_12/mlp": 0.0060252780094742775, "grad/layer_12/attn_mlp_ratio": 0.8095570174313095, "grad/layer_16/attn": 0.004649195820093155, "grad/layer_16/mlp": 0.004275240935385227, "grad/layer_16/attn_mlp_ratio": 1.0874698716663662, "grad/layer_20/attn": 0.008509738370776176, "grad/layer_20/mlp": 0.006355419754981995, "grad/layer_20/attn_mlp_ratio": 1.338973437625119, "grad/layer_24/attn": 0.014336449094116688, "grad/layer_24/mlp": 0.013178561814129353, "grad/layer_24/attn_mlp_ratio": 1.08786142126372, "grad/layer_27/attn": 0.007435870356857777, "grad/layer_27/mlp": 0.011537962593138218, "grad/layer_27/attn_mlp_ratio": 0.6444699601325622} {"step": 12000, "timestamp": 1778207500.36316, "eos/sharpness": 23.338508605957028, "eos/L0_probe": 2.114112377166748, "eos/L_plus": 2.2998194694519043, "eos/L_minus": 2.161790370941162, "eos/grad_norm": 0.15967194736003876, "eos/embed_grad_frac": 0.10822159796953201, "eos/time_s": 0.5977311134338379} {"step": 12000, "timestamp": 1778207500.3836975, "train/loss": 2.2888151884078978, "train/z_loss": 0.0017337183351628482, "train/perplexity": 9.863244668913344, "train/grad_norm": 0.16015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1909091.902423169, "perf/iters_per_sec": 0.9103259575000615, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0985076189041139, "data/tokens_consumed": 25167921152, "data/tokens_consumed_B": 25.167921152, "train/loss_slope": 2.847799080254395e-06} {"step": 12000, "timestamp": 1778207501.742585, "geo/rankme_last": 440.9199523925781, "geo/layer_0/stable_rank_q_proj": 15.30225944519043, "geo/layer_0/stable_rank_k_proj": 13.324670791625977, "geo/layer_0/stable_rank_o_proj": 55.40815734863281, "geo/layer_0/stable_rank_gate_proj": 161.8520965576172, "geo/layer_0/stable_rank_down_proj": 48.24215316772461, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0299382284283638, "geo/layer_0/attn_entropy_mean": 6.347990036010742, "geo/layer_0/attn_entropy_std": 0.2786512076854706, "geo/layer_7/stable_rank_q_proj": 43.768985748291016, "geo/layer_7/stable_rank_k_proj": 43.60796356201172, "geo/layer_7/stable_rank_o_proj": 108.39494323730469, "geo/layer_7/stable_rank_gate_proj": 120.4620361328125, "geo/layer_7/stable_rank_down_proj": 162.4722137451172, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5862748622894287, "geo/layer_7/attn_entropy_mean": 4.6766839027404785, "geo/layer_7/attn_entropy_std": 0.8663526773452759, "geo/layer_14/stable_rank_q_proj": 61.35499572753906, "geo/layer_14/stable_rank_k_proj": 39.04551696777344, "geo/layer_14/stable_rank_o_proj": 49.621734619140625, "geo/layer_14/stable_rank_gate_proj": 104.16020965576172, "geo/layer_14/stable_rank_down_proj": 139.6021270751953, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.388982355594635, "geo/layer_14/attn_entropy_mean": 5.6258039474487305, "geo/layer_14/attn_entropy_std": 0.5737409591674805, "geo/layer_21/stable_rank_q_proj": 51.67193603515625, "geo/layer_21/stable_rank_k_proj": 32.36129379272461, "geo/layer_21/stable_rank_o_proj": 90.3892593383789, "geo/layer_21/stable_rank_gate_proj": 103.34860229492188, "geo/layer_21/stable_rank_down_proj": 68.40564727783203, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15824595093727112, "geo/layer_21/attn_entropy_mean": 5.783816337585449, "geo/layer_21/attn_entropy_std": 0.2943025827407837, "geo/layer_27/stable_rank_q_proj": 42.33250427246094, "geo/layer_27/stable_rank_k_proj": 32.547672271728516, "geo/layer_27/stable_rank_o_proj": 116.45520782470703, "geo/layer_27/stable_rank_gate_proj": 94.43476104736328, "geo/layer_27/stable_rank_down_proj": 148.09486389160156, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0648425966501236, "geo/layer_27/attn_entropy_mean": 4.496200084686279, "geo/layer_27/attn_entropy_std": 0.4836372137069702, "attnres/final_alpha/block_0": 0.24791866540908813, "attnres/block_norm/0": 1.4670162200927734, "attnres/final_alpha/block_1": 0.008819688111543655, "attnres/block_norm/1": 20090.609375, "attnres/final_alpha/block_2": 0.018081292510032654, "attnres/block_norm/2": 14827.765625, "attnres/final_alpha/block_3": 0.01950417086482048, "attnres/block_norm/3": 17384.08203125, "attnres/final_alpha/block_4": 0.02749660052359104, "attnres/block_norm/4": 6411.13232421875, "attnres/final_alpha/block_5": 0.5195369720458984, "attnres/block_norm/5": 4156.56396484375, "attnres/final_alpha/block_6": 0.15864259004592896, "attnres/block_norm/6": 12307.650390625, "geo/tier1_time_s": 1.3548450469970703, "geo/step": 12000.0, "geo/rankme_slope": 0.0007103773540666267} {"step": 12000, "timestamp": 1778207508.63605, "geo/ww_alpha_mean": 8.138230571984536, "geo/ww_alpha_std": 4.80508989256988, "geo/ww_alpha_min": 1.3605852464971764, "geo/ww_alpha_max": 28.772497994968425, "geo/ww_alpha_healthy_frac": 0.15228426395939088, "geo/ww_alpha_by_type/q_proj": 4.2795387765252055, "geo/ww_alpha_by_type/k_proj": 4.786884468241631, "geo/ww_alpha_by_type/v_proj": 8.444257005794906, "geo/ww_alpha_by_type/o_proj": 8.367781657194907, "geo/ww_alpha_by_type/gate_proj": 9.75802802923141, "geo/ww_alpha_by_type/up_proj": 12.001981141483592, "geo/ww_alpha_by_type/down_proj": 9.473800520637726, "geo/twonn_id/layer_0": 0.7396162152290344, "geo/twonn_id/layer_7": 3.0331900119781494, "geo/twonn_id/layer_14": 3.4118902683258057, "geo/twonn_id/layer_21": 6.390496730804443, "geo/twonn_id/layer_27": 5.057332515716553, "geo/tier2_time_s": 6.886709213256836} {"step": 12000, "timestamp": 1778207509.237362, "eoc/jacobian_sigma/layer_0/attn": 677.1063232421875, "eoc/jacobian_sigma/layer_0/mlp": 3640.58349609375, "eoc/jacobian_sigma/layer_0": 3640.58349609375, "eoc/jacobian_sigma/layer_7/attn": 1.161449909210205, "eoc/jacobian_sigma/layer_7/mlp": 1.7051838636398315, "eoc/jacobian_sigma/layer_7": 1.7051838636398315, "eoc/jacobian_sigma/layer_14/attn": 1.3197269439697266, "eoc/jacobian_sigma/layer_14/mlp": 9.304166793823242, "eoc/jacobian_sigma/layer_14": 9.304166793823242, "eoc/jacobian_sigma/layer_21/attn": 1.0836848020553589, "eoc/jacobian_sigma/layer_21/mlp": 3.3127987384796143, "eoc/jacobian_sigma/layer_21": 3.3127987384796143, "eoc/jacobian_sigma/layer_27/attn": 2.2662312984466553, "eoc/jacobian_sigma/layer_27/mlp": 24.185096740722656, "eoc/jacobian_sigma/layer_27": 24.185096740722656, "eoc/layer0_sigma": 3640.58349609375, "eoc/sigma_max": 24.185096740722656, "eoc/sigma_min": 1.7051838636398315, "eoc/sigma_mean": 9.626811534166336, "eoc/time_s": 0.5952432155609131} {"step": 12010, "timestamp": 1778207519.9634993, "train/loss": 2.276941013336182, "train/z_loss": 0.0017477862653322517, "train/perplexity": 9.746819370085593, "train/grad_norm": 0.1376953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1071294.8855244734, "perf/iters_per_sec": 0.510833208811032, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.957586121559143, "data/tokens_consumed": 25188892672, "data/tokens_consumed_B": 25.188892672, "train/loss_slope": 2.1226006658664516e-06} {"step": 12020, "timestamp": 1778207530.3146973, "train/loss": 2.2740819692611693, "train/z_loss": 0.0017355137970298529, "train/perplexity": 9.718992581877295, "train/grad_norm": 0.1240234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027013.102696279, "perf/iters_per_sec": 0.9665551675301929, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346020936965943, "data/tokens_consumed": 25209864192, "data/tokens_consumed_B": 25.209864192, "train/loss_slope": 9.993109086451832e-07} {"step": 12030, "timestamp": 1778207540.6669364, "train/loss": 2.262356734275818, "train/z_loss": 0.0017398569965735079, "train/perplexity": 9.605700595570866, "train/grad_norm": 0.1279296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027045.941373824, "perf/iters_per_sec": 0.9665708262318725, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345853328704835, "data/tokens_consumed": 25230835712, "data/tokens_consumed_B": 25.230835712, "train/loss_slope": -3.758343783291838e-07} {"step": 12040, "timestamp": 1778207551.0224602, "train/loss": 2.3469876289367675, "train/z_loss": 0.0017003321903757752, "train/perplexity": 10.454030825191122, "train/grad_norm": 0.18359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026064.930411878, "perf/iters_per_sec": 0.9661030437526121, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350862741470337, "data/tokens_consumed": 25251807232, "data/tokens_consumed_B": 25.251807232, "train/loss_slope": 1.2738914129221141e-06} {"step": 12050, "timestamp": 1778207561.366393, "grad/layer_0/attn": 0.003950682934373617, "grad/layer_0/mlp": 0.003200751496478915, "grad/layer_0/attn_mlp_ratio": 1.2342985124867818, "grad/layer_4/attn": 0.0016631807666271925, "grad/layer_4/mlp": 0.0027771703898906708, "grad/layer_4/attn_mlp_ratio": 0.5988760044374039, "grad/layer_8/attn": 0.006306367926299572, "grad/layer_8/mlp": 0.004211110062897205, "grad/layer_8/attn_mlp_ratio": 1.4975547260347255, "grad/layer_12/attn": 0.004218291956931353, "grad/layer_12/mlp": 0.006153948605060577, "grad/layer_12/attn_mlp_ratio": 0.6854610200865867, "grad/layer_16/attn": 0.006039502564817667, "grad/layer_16/mlp": 0.005140057764947414, "grad/layer_16/attn_mlp_ratio": 1.1749872712531135, "grad/layer_20/attn": 0.005103560164570808, "grad/layer_20/mlp": 0.007301631849259138, "grad/layer_20/attn_mlp_ratio": 0.6989615746228681, "grad/layer_24/attn": 0.021714061498641968, "grad/layer_24/mlp": 0.014379768632352352, "grad/layer_24/attn_mlp_ratio": 1.510042470278993, "grad/layer_27/attn": 0.00835379958152771, "grad/layer_27/mlp": 0.014708324335515499, "grad/layer_27/attn_mlp_ratio": 0.5679640545156989} {"step": 12050, "timestamp": 1778207561.38209, "train/loss": 2.227208948135376, "train/z_loss": 0.0017561143497005105, "train/perplexity": 9.273945860086506, "train/grad_norm": 0.25, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025199.9863164607, "perf/iters_per_sec": 0.9656906062681487, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0355283498764039, "data/tokens_consumed": 25272778752, "data/tokens_consumed_B": 25.272778752, "train/loss_slope": -4.154155363809077e-06} {"step": 12060, "timestamp": 1778207571.7346125, "train/loss": 2.2987911701202393, "train/z_loss": 0.0017181661794893443, "train/perplexity": 9.962132649586792, "train/grad_norm": 0.171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026761.6861778435, "perf/iters_per_sec": 0.9664352827920167, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347304344177246, "data/tokens_consumed": 25293750272, "data/tokens_consumed_B": 25.293750272, "train/loss_slope": -5.589783617300621e-06} {"step": 12070, "timestamp": 1778207582.0859554, "train/loss": 2.3256941556930544, "train/z_loss": 0.0017197606270201505, "train/perplexity": 10.233781458437571, "train/grad_norm": 0.1650390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027005.628892701, "perf/iters_per_sec": 0.9665516037429337, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03460590839386, "data/tokens_consumed": 25314721792, "data/tokens_consumed_B": 25.314721792, "train/loss_slope": -3.2387680382188895e-06} {"step": 12075, "timestamp": 1778207587.878506, "eos/sharpness": 24.291753768920895, "eos/L0_probe": 2.1162657737731934, "eos/L_plus": 2.2631378173828125, "eos/L_minus": 2.212311267852783, "eos/grad_norm": 0.17905914783477783, "eos/embed_grad_frac": 0.20095354318618774, "eos/time_s": 0.6245806217193604} {"step": 12075, "timestamp": 1778207589.2597506, "geo/rankme_last": 440.94012451171875, "geo/layer_0/stable_rank_q_proj": 15.319416999816895, "geo/layer_0/stable_rank_k_proj": 13.35435962677002, "geo/layer_0/stable_rank_o_proj": 55.148841857910156, "geo/layer_0/stable_rank_gate_proj": 161.73974609375, "geo/layer_0/stable_rank_down_proj": 48.29678726196289, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.035671453922986984, "geo/layer_0/attn_entropy_mean": 6.341493129730225, "geo/layer_0/attn_entropy_std": 0.28332093358039856, "geo/layer_7/stable_rank_q_proj": 43.641021728515625, "geo/layer_7/stable_rank_k_proj": 43.30515670776367, "geo/layer_7/stable_rank_o_proj": 108.36528778076172, "geo/layer_7/stable_rank_gate_proj": 120.13096618652344, "geo/layer_7/stable_rank_down_proj": 162.32025146484375, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5663788914680481, "geo/layer_7/attn_entropy_mean": 4.663317680358887, "geo/layer_7/attn_entropy_std": 0.8989472985267639, "geo/layer_14/stable_rank_q_proj": 61.146907806396484, "geo/layer_14/stable_rank_k_proj": 38.94583511352539, "geo/layer_14/stable_rank_o_proj": 49.60763931274414, "geo/layer_14/stable_rank_gate_proj": 103.6226577758789, "geo/layer_14/stable_rank_down_proj": 139.34193420410156, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.407465398311615, "geo/layer_14/attn_entropy_mean": 5.641071796417236, "geo/layer_14/attn_entropy_std": 0.5663772821426392, "geo/layer_21/stable_rank_q_proj": 51.5768928527832, "geo/layer_21/stable_rank_k_proj": 32.534976959228516, "geo/layer_21/stable_rank_o_proj": 90.33203125, "geo/layer_21/stable_rank_gate_proj": 102.94723510742188, "geo/layer_21/stable_rank_down_proj": 68.37130737304688, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15456147491931915, "geo/layer_21/attn_entropy_mean": 5.804668426513672, "geo/layer_21/attn_entropy_std": 0.2998073399066925, "geo/layer_27/stable_rank_q_proj": 42.25299072265625, "geo/layer_27/stable_rank_k_proj": 32.58128356933594, "geo/layer_27/stable_rank_o_proj": 116.91690826416016, "geo/layer_27/stable_rank_gate_proj": 94.50912475585938, "geo/layer_27/stable_rank_down_proj": 147.64288330078125, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.05949261412024498, "geo/layer_27/attn_entropy_mean": 4.512325286865234, "geo/layer_27/attn_entropy_std": 0.5089192390441895, "attnres/final_alpha/block_0": 0.2484457790851593, "attnres/block_norm/0": 1.469132661819458, "attnres/final_alpha/block_1": 0.00895918719470501, "attnres/block_norm/1": 20191.15234375, "attnres/final_alpha/block_2": 0.017951510846614838, "attnres/block_norm/2": 14949.931640625, "attnres/final_alpha/block_3": 0.019487150013446808, "attnres/block_norm/3": 17505.859375, "attnres/final_alpha/block_4": 0.02732601948082447, "attnres/block_norm/4": 6462.27001953125, "attnres/final_alpha/block_5": 0.5182502269744873, "attnres/block_norm/5": 4166.265625, "attnres/final_alpha/block_6": 0.15958011150360107, "attnres/block_norm/6": 12406.232421875, "geo/tier1_time_s": 1.3606901168823242, "geo/step": 12075.0, "geo/rankme_slope": 0.0006661827817064325} {"step": 12080, "timestamp": 1778207594.4589071, "train/loss": 2.3002540111541747, "train/z_loss": 0.0017312648589722813, "train/perplexity": 9.976716330214702, "train/grad_norm": 0.1396484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1695603.9200205428, "perf/iters_per_sec": 0.8085269546606745, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2368171453475951, "data/tokens_consumed": 25335693312, "data/tokens_consumed_B": 25.335693312, "train/loss_slope": -1.7056151549450452e-06} {"step": 12090, "timestamp": 1778207604.8278031, "train/loss": 2.336265969276428, "train/z_loss": 0.0017186981742270291, "train/perplexity": 10.342544989176462, "train/grad_norm": 0.12109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023731.1539344394, "perf/iters_per_sec": 0.964990212409229, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0362799406051635, "data/tokens_consumed": 25356664832, "data/tokens_consumed_B": 25.356664832, "train/loss_slope": -6.565950670079188e-07} {"step": 12100, "timestamp": 1778207615.189084, "grad/layer_0/attn": 0.0027651989366859198, "grad/layer_0/mlp": 0.0027676215395331383, "grad/layer_0/attn_mlp_ratio": 0.9991246264255879, "grad/layer_4/attn": 0.00198356038890779, "grad/layer_4/mlp": 0.002659446792677045, "grad/layer_4/attn_mlp_ratio": 0.7458544836408085, "grad/layer_8/attn": 0.004613404627889395, "grad/layer_8/mlp": 0.003989549819380045, "grad/layer_8/attn_mlp_ratio": 1.1563722026584629, "grad/layer_12/attn": 0.004481450188905001, "grad/layer_12/mlp": 0.006111096125096083, "grad/layer_12/attn_mlp_ratio": 0.7333299990435906, "grad/layer_16/attn": 0.007797074504196644, "grad/layer_16/mlp": 0.004890464246273041, "grad/layer_16/attn_mlp_ratio": 1.5943423675378992, "grad/layer_20/attn": 0.004780577961355448, "grad/layer_20/mlp": 0.007007991895079613, "grad/layer_20/attn_mlp_ratio": 0.6821608764267916, "grad/layer_24/attn": 0.01469753123819828, "grad/layer_24/mlp": 0.011684302240610123, "grad/layer_24/attn_mlp_ratio": 1.2578869332330898, "grad/layer_27/attn": 0.005716829095035791, "grad/layer_27/mlp": 0.010341973975300789, "grad/layer_27/attn_mlp_ratio": 0.5527792908211793} {"step": 12100, "timestamp": 1778207615.2065113, "train/loss": 2.2952840089797975, "train/z_loss": 0.0017264231690205633, "train/perplexity": 9.927255041528124, "train/grad_norm": 0.1474609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021635.551645324, "perf/iters_per_sec": 0.9639909513689633, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373541355133056, "data/tokens_consumed": 25377636352, "data/tokens_consumed_B": 25.377636352, "train/loss_slope": 2.656059654274928e-07} {"step": 12110, "timestamp": 1778207625.5813642, "train/loss": 2.274281358718872, "train/z_loss": 0.00174047282198444, "train/perplexity": 9.720930639745346, "train/grad_norm": 0.12158203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022901.091487563, "perf/iters_per_sec": 0.9645944077909293, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0367051601409911, "data/tokens_consumed": 25398607872, "data/tokens_consumed_B": 25.398607872, "train/loss_slope": -1.0337973227559453e-06} {"step": 12120, "timestamp": 1778207635.9459193, "train/loss": 2.3109504699707033, "train/z_loss": 0.0017218546126969159, "train/perplexity": 10.084004645145551, "train/grad_norm": 0.103515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024392.8524792935, "perf/iters_per_sec": 0.9653057348629444, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035941219329834, "data/tokens_consumed": 25419579392, "data/tokens_consumed_B": 25.419579392, "train/loss_slope": -4.7338126909612264e-07} {"step": 12130, "timestamp": 1778207646.3208048, "train/loss": 2.329633021354675, "train/z_loss": 0.0017143039614893496, "train/perplexity": 10.274170439980558, "train/grad_norm": 0.171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022290.4869569277, "perf/iters_per_sec": 0.9643032488617552, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0370181798934937, "data/tokens_consumed": 25440550912, "data/tokens_consumed_B": 25.440550912, "train/loss_slope": 2.1845813130888975e-06} {"step": 12140, "timestamp": 1778207656.723763, "train/loss": 2.2580556154251097, "train/z_loss": 0.001732313574757427, "train/perplexity": 9.56447405933733, "train/grad_norm": 0.298828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2017559.838150107, "perf/iters_per_sec": 0.9620474997282539, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0394497156143188, "data/tokens_consumed": 25461522432, "data/tokens_consumed_B": 25.461522432, "train/loss_slope": -3.0949072404341415e-06} {"step": 12150, "timestamp": 1778207667.0729332, "grad/layer_0/attn": 0.0031957151368260384, "grad/layer_0/mlp": 0.0028593146707862616, "grad/layer_0/attn_mlp_ratio": 1.1176506936126072, "grad/layer_4/attn": 0.0021366651635617018, "grad/layer_4/mlp": 0.0028196300845593214, "grad/layer_4/attn_mlp_ratio": 0.7577820578253031, "grad/layer_8/attn": 0.004534479230642319, "grad/layer_8/mlp": 0.004396888893097639, "grad/layer_8/attn_mlp_ratio": 1.0312926338965285, "grad/layer_12/attn": 0.004825643263757229, "grad/layer_12/mlp": 0.006340291816741228, "grad/layer_12/attn_mlp_ratio": 0.7611074264601849, "grad/layer_16/attn": 0.004855872131884098, "grad/layer_16/mlp": 0.0048090387135744095, "grad/layer_16/attn_mlp_ratio": 1.0097386026865687, "grad/layer_20/attn": 0.01100866962224245, "grad/layer_20/mlp": 0.006357727572321892, "grad/layer_20/attn_mlp_ratio": 1.7315415490613488, "grad/layer_24/attn": 0.014376317150890827, "grad/layer_24/mlp": 0.01744203083217144, "grad/layer_24/attn_mlp_ratio": 0.8242341277112419, "grad/layer_27/attn": 0.0059365942142903805, "grad/layer_27/mlp": 0.015233006328344345, "grad/layer_27/attn_mlp_ratio": 0.38971914324437273} {"step": 12150, "timestamp": 1778207667.6658902, "eos/sharpness": 26.66044235229492, "eos/L0_probe": 2.1172568798065186, "eos/L_plus": 2.2518184185028076, "eos/L_minus": 2.2492997646331787, "eos/grad_norm": 0.21454988420009613, "eos/embed_grad_frac": 0.08547957241535187, "eos/time_s": 0.5900897979736328} {"step": 12150, "timestamp": 1778207667.684394, "train/loss": 2.3605276584625243, "train/z_loss": 0.0017021200270392, "train/perplexity": 10.596541332237837, "train/grad_norm": 0.21484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1914233.751324645, "perf/iters_per_sec": 0.9127777821181512, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.095556902885437, "data/tokens_consumed": 25482493952, "data/tokens_consumed_B": 25.482493952, "train/loss_slope": 8.524367518157232e-07} {"step": 12150, "timestamp": 1778207669.0532908, "geo/rankme_last": 439.4676513671875, "geo/layer_0/stable_rank_q_proj": 15.3095703125, "geo/layer_0/stable_rank_k_proj": 13.346512794494629, "geo/layer_0/stable_rank_o_proj": 54.98914337158203, "geo/layer_0/stable_rank_gate_proj": 161.7829132080078, "geo/layer_0/stable_rank_down_proj": 48.20375442504883, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.03365461528301239, "geo/layer_0/attn_entropy_mean": 6.340187072753906, "geo/layer_0/attn_entropy_std": 0.2834816873073578, "geo/layer_7/stable_rank_q_proj": 43.61612319946289, "geo/layer_7/stable_rank_k_proj": 43.38957595825195, "geo/layer_7/stable_rank_o_proj": 108.00675201416016, "geo/layer_7/stable_rank_gate_proj": 120.2022933959961, "geo/layer_7/stable_rank_down_proj": 162.02117919921875, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.590151846408844, "geo/layer_7/attn_entropy_mean": 4.682248115539551, "geo/layer_7/attn_entropy_std": 0.9087083339691162, "geo/layer_14/stable_rank_q_proj": 61.0273323059082, "geo/layer_14/stable_rank_k_proj": 39.00483322143555, "geo/layer_14/stable_rank_o_proj": 49.525020599365234, "geo/layer_14/stable_rank_gate_proj": 103.5783462524414, "geo/layer_14/stable_rank_down_proj": 139.03126525878906, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39898616075515747, "geo/layer_14/attn_entropy_mean": 5.609455108642578, "geo/layer_14/attn_entropy_std": 0.5691221952438354, "geo/layer_21/stable_rank_q_proj": 51.40218734741211, "geo/layer_21/stable_rank_k_proj": 32.5850715637207, "geo/layer_21/stable_rank_o_proj": 90.53623962402344, "geo/layer_21/stable_rank_gate_proj": 102.63948059082031, "geo/layer_21/stable_rank_down_proj": 68.30464172363281, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1632428765296936, "geo/layer_21/attn_entropy_mean": 5.793145179748535, "geo/layer_21/attn_entropy_std": 0.2863328754901886, "geo/layer_27/stable_rank_q_proj": 42.384429931640625, "geo/layer_27/stable_rank_k_proj": 32.57492446899414, "geo/layer_27/stable_rank_o_proj": 116.85319519042969, "geo/layer_27/stable_rank_gate_proj": 94.65399169921875, "geo/layer_27/stable_rank_down_proj": 147.2984619140625, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0665314570069313, "geo/layer_27/attn_entropy_mean": 4.473832130432129, "geo/layer_27/attn_entropy_std": 0.5241144299507141, "attnres/final_alpha/block_0": 0.24726811051368713, "attnres/block_norm/0": 1.4714140892028809, "attnres/final_alpha/block_1": 0.008658386766910553, "attnres/block_norm/1": 20343.15625, "attnres/final_alpha/block_2": 0.01766074448823929, "attnres/block_norm/2": 15131.275390625, "attnres/final_alpha/block_3": 0.0192868672311306, "attnres/block_norm/3": 17626.150390625, "attnres/final_alpha/block_4": 0.02708069607615471, "attnres/block_norm/4": 6443.38134765625, "attnres/final_alpha/block_5": 0.5222039222717285, "attnres/block_norm/5": 4180.630859375, "attnres/final_alpha/block_6": 0.15784123539924622, "attnres/block_norm/6": 12475.9453125, "geo/tier1_time_s": 1.3651113510131836, "geo/step": 12150.0, "geo/rankme_slope": 0.000597784484887705} {"step": 12160, "timestamp": 1778207679.4242246, "train/loss": 2.3548874139785765, "train/z_loss": 0.0017137814313173295, "train/perplexity": 10.5369424824798, "train/grad_norm": 0.1630859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1786936.745242905, "perf/iters_per_sec": 0.8520778394903684, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1736016988754272, "data/tokens_consumed": 25503465472, "data/tokens_consumed_B": 25.503465472, "train/loss_slope": 2.4366924721951895e-06} {"step": 12170, "timestamp": 1778207689.7932398, "train/loss": 2.2899382591247557, "train/z_loss": 0.0017233065911568702, "train/perplexity": 9.87432801269869, "train/grad_norm": 0.1513671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023614.9926402022, "perf/iters_per_sec": 0.9649348223877917, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0363394260406493, "data/tokens_consumed": 25524436992, "data/tokens_consumed_B": 25.524436992, "train/loss_slope": -3.903053488561481e-07} {"step": 12180, "timestamp": 1778207700.742703, "train/loss": 2.2990753412246705, "train/z_loss": 0.0017296172445639968, "train/perplexity": 9.964964002099547, "train/grad_norm": 0.1669921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1916162.925237934, "perf/iters_per_sec": 0.9136976839246436, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0944539070129395, "data/tokens_consumed": 25545408512, "data/tokens_consumed_B": 25.545408512, "train/loss_slope": -3.999850430218458e-07} {"step": 12190, "timestamp": 1778207711.117335, "train/loss": 2.3399752616882323, "train/z_loss": 0.0017153195338323712, "train/perplexity": 10.380979751641771, "train/grad_norm": 0.23046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022734.6025286904, "perf/iters_per_sec": 0.9645150196689083, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0367904901504517, "data/tokens_consumed": 25566380032, "data/tokens_consumed_B": 25.566380032, "train/loss_slope": 4.794474417763395e-06} {"step": 12200, "timestamp": 1778207721.467618, "grad/layer_0/attn": 0.0026258958969265223, "grad/layer_0/mlp": 0.002637078519910574, "grad/layer_0/attn_mlp_ratio": 0.9957594275348413, "grad/layer_4/attn": 0.0025566648691892624, "grad/layer_4/mlp": 0.002676387783139944, "grad/layer_4/attn_mlp_ratio": 0.9552669421704965, "grad/layer_8/attn": 0.004448581952601671, "grad/layer_8/mlp": 0.004263920709490776, "grad/layer_8/attn_mlp_ratio": 1.043307826613447, "grad/layer_12/attn": 0.004337474703788757, "grad/layer_12/mlp": 0.006032568868249655, "grad/layer_12/attn_mlp_ratio": 0.7190095507598104, "grad/layer_16/attn": 0.004851158708333969, "grad/layer_16/mlp": 0.004764859564602375, "grad/layer_16/attn_mlp_ratio": 1.0181115604248958, "grad/layer_20/attn": 0.006516916211694479, "grad/layer_20/mlp": 0.007288414519280195, "grad/layer_20/attn_mlp_ratio": 0.8941472943176346, "grad/layer_24/attn": 0.01740090176463127, "grad/layer_24/mlp": 0.012831642292439938, "grad/layer_24/attn_mlp_ratio": 1.3560931042532343, "grad/layer_27/attn": 0.016522305086255074, "grad/layer_27/mlp": 0.009674640372395515, "grad/layer_27/attn_mlp_ratio": 1.7077952543454076} {"step": 12200, "timestamp": 1778207721.483407, "train/loss": 2.3082084178924562, "train/z_loss": 0.001719340996351093, "train/perplexity": 10.056391654681713, "train/grad_norm": 0.244140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024376.685615347, "perf/iters_per_sec": 0.9652980259014831, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0359494924545287, "data/tokens_consumed": 25587351552, "data/tokens_consumed_B": 25.587351552, "train/loss_slope": 2.8776988588293315e-06} {"step": 12210, "timestamp": 1778207731.8430622, "train/loss": 2.317829966545105, "train/z_loss": 0.0017307725036516785, "train/perplexity": 10.153616693937032, "train/grad_norm": 0.2021484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025371.5449562334, "perf/iters_per_sec": 0.9657724118024031, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354406356811523, "data/tokens_consumed": 25608323072, "data/tokens_consumed_B": 25.608323072, "train/loss_slope": 5.447371357237618e-06} {"step": 12220, "timestamp": 1778207742.2099268, "train/loss": 2.3151717662811278, "train/z_loss": 0.0017202078248374163, "train/perplexity": 10.126662188469433, "train/grad_norm": 0.212890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023899.2505987133, "perf/iters_per_sec": 0.9650703671449248, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0361938714981078, "data/tokens_consumed": 25629294592, "data/tokens_consumed_B": 25.629294592, "train/loss_slope": 6.444932046038109e-06} {"step": 12225, "timestamp": 1778207747.9719105, "eos/sharpness": 16.935110092163082, "eos/L0_probe": 2.1171185970306396, "eos/L_plus": 2.1800899505615234, "eos/L_minus": 2.2234983444213867, "eos/grad_norm": 0.1462068259716034, "eos/embed_grad_frac": 0.14982770383358002, "eos/time_s": 0.5900118350982666} {"step": 12225, "timestamp": 1778207749.3499234, "geo/rankme_last": 440.73626708984375, "geo/layer_0/stable_rank_q_proj": 15.326372146606445, "geo/layer_0/stable_rank_k_proj": 13.366029739379883, "geo/layer_0/stable_rank_o_proj": 55.11091995239258, "geo/layer_0/stable_rank_gate_proj": 162.1186065673828, "geo/layer_0/stable_rank_down_proj": 48.16990280151367, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.03076198324561119, "geo/layer_0/attn_entropy_mean": 6.342951774597168, "geo/layer_0/attn_entropy_std": 0.2777714729309082, "geo/layer_7/stable_rank_q_proj": 43.65312957763672, "geo/layer_7/stable_rank_k_proj": 43.60952377319336, "geo/layer_7/stable_rank_o_proj": 108.27787780761719, "geo/layer_7/stable_rank_gate_proj": 119.87108612060547, "geo/layer_7/stable_rank_down_proj": 161.4597930908203, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5861294269561768, "geo/layer_7/attn_entropy_mean": 4.617230415344238, "geo/layer_7/attn_entropy_std": 0.8669112920761108, "geo/layer_14/stable_rank_q_proj": 60.75544738769531, "geo/layer_14/stable_rank_k_proj": 39.048736572265625, "geo/layer_14/stable_rank_o_proj": 49.5877799987793, "geo/layer_14/stable_rank_gate_proj": 103.71778869628906, "geo/layer_14/stable_rank_down_proj": 139.37149047851562, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4065553545951843, "geo/layer_14/attn_entropy_mean": 5.612573623657227, "geo/layer_14/attn_entropy_std": 0.5859724879264832, "geo/layer_21/stable_rank_q_proj": 51.2787971496582, "geo/layer_21/stable_rank_k_proj": 32.49687957763672, "geo/layer_21/stable_rank_o_proj": 90.19041442871094, "geo/layer_21/stable_rank_gate_proj": 102.55645751953125, "geo/layer_21/stable_rank_down_proj": 68.17828369140625, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1576448529958725, "geo/layer_21/attn_entropy_mean": 5.801325798034668, "geo/layer_21/attn_entropy_std": 0.28870415687561035, "geo/layer_27/stable_rank_q_proj": 42.419002532958984, "geo/layer_27/stable_rank_k_proj": 32.615013122558594, "geo/layer_27/stable_rank_o_proj": 116.66437530517578, "geo/layer_27/stable_rank_gate_proj": 94.82262420654297, "geo/layer_27/stable_rank_down_proj": 147.0039825439453, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.05914987996220589, "geo/layer_27/attn_entropy_mean": 4.484747886657715, "geo/layer_27/attn_entropy_std": 0.48493629693984985, "attnres/final_alpha/block_0": 0.24663320183753967, "attnres/block_norm/0": 1.4732943773269653, "attnres/final_alpha/block_1": 0.008691986091434956, "attnres/block_norm/1": 20469.244140625, "attnres/final_alpha/block_2": 0.01762247458100319, "attnres/block_norm/2": 15151.08984375, "attnres/final_alpha/block_3": 0.01927378959953785, "attnres/block_norm/3": 17769.95703125, "attnres/final_alpha/block_4": 0.027240164577960968, "attnres/block_norm/4": 6518.61474609375, "attnres/final_alpha/block_5": 0.5225457549095154, "attnres/block_norm/5": 4171.15625, "attnres/final_alpha/block_6": 0.1579926311969757, "attnres/block_norm/6": 12579.546875, "geo/tier1_time_s": 1.3592920303344727, "geo/step": 12225.0, "geo/rankme_slope": 0.0006028986203856542} {"step": 12230, "timestamp": 1778207754.5336635, "train/loss": 2.3101627588272096, "train/z_loss": 0.0017212499864399434, "train/perplexity": 10.076064490000437, "train/grad_norm": 0.1455078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1702581.81185075, "perf/iters_per_sec": 0.8118542727712392, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2317481517791748, "data/tokens_consumed": 25650266112, "data/tokens_consumed_B": 25.650266112, "train/loss_slope": 4.86470504884346e-06} {"step": 12240, "timestamp": 1778207764.898068, "train/loss": 2.2675280570983887, "train/z_loss": 0.001734115951694548, "train/perplexity": 9.655503436583714, "train/grad_norm": 0.111328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024808.9468648653, "perf/iters_per_sec": 0.9655041441273047, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0357283353805542, "data/tokens_consumed": 25671237632, "data/tokens_consumed_B": 25.671237632, "train/loss_slope": -4.077706793354433e-07} {"step": 12250, "timestamp": 1778207775.255625, "grad/layer_0/attn": 0.0027410590555518866, "grad/layer_0/mlp": 0.0027095074765384197, "grad/layer_0/attn_mlp_ratio": 1.0116447280999215, "grad/layer_4/attn": 0.0016165298875421286, "grad/layer_4/mlp": 0.002717620460316539, "grad/layer_4/attn_mlp_ratio": 0.5948328148333706, "grad/layer_8/attn": 0.004507539793848991, "grad/layer_8/mlp": 0.0041505927219986916, "grad/layer_8/attn_mlp_ratio": 1.0859990336701866, "grad/layer_12/attn": 0.0051547810435295105, "grad/layer_12/mlp": 0.0058100782334804535, "grad/layer_12/attn_mlp_ratio": 0.8872136910487409, "grad/layer_16/attn": 0.003982367925345898, "grad/layer_16/mlp": 0.004479512572288513, "grad/layer_16/attn_mlp_ratio": 0.8890181179711606, "grad/layer_20/attn": 0.0043616327457129955, "grad/layer_20/mlp": 0.006185540929436684, "grad/layer_20/attn_mlp_ratio": 0.7051335889546608, "grad/layer_24/attn": 0.010560927912592888, "grad/layer_24/mlp": 0.009940086863934994, "grad/layer_24/attn_mlp_ratio": 1.0624583015128997, "grad/layer_27/attn": 0.012004883028566837, "grad/layer_27/mlp": 0.008762417361140251, "grad/layer_27/attn_mlp_ratio": 1.3700423521027547} {"step": 12250, "timestamp": 1778207775.2706547, "train/loss": 2.3067692279815675, "train/z_loss": 0.0017175411572679876, "train/perplexity": 10.041929007016984, "train/grad_norm": 0.2060546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023028.988877299, "perf/iters_per_sec": 0.9646553940187926, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0366396188735962, "data/tokens_consumed": 25692209152, "data/tokens_consumed_B": 25.692209152, "train/loss_slope": -1.4708862768219959e-06} {"step": 12260, "timestamp": 1778207785.6280496, "train/loss": 2.266137981414795, "train/z_loss": 0.0017292233533225954, "train/perplexity": 9.642090880437507, "train/grad_norm": 0.2373046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026018.5903915127, "perf/iters_per_sec": 0.9660809471089901, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351099491119384, "data/tokens_consumed": 25713180672, "data/tokens_consumed_B": 25.713180672, "train/loss_slope": -7.593274867609781e-06} {"step": 12270, "timestamp": 1778207795.987442, "train/loss": 2.2334773778915404, "train/z_loss": 0.0017420291900634766, "train/perplexity": 9.332261521134736, "train/grad_norm": 0.1640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025475.967935277, "perf/iters_per_sec": 0.9658222045589814, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0353872537612916, "data/tokens_consumed": 25734152192, "data/tokens_consumed_B": 25.734152192, "train/loss_slope": -1.2964666408352292e-05} {"step": 12280, "timestamp": 1778207806.3620458, "train/loss": 2.263862943649292, "train/z_loss": 0.0017349985311739147, "train/perplexity": 9.62017969338577, "train/grad_norm": 0.11962890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022547.7246907537, "perf/iters_per_sec": 0.9644259093717354, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0368862867355346, "data/tokens_consumed": 25755123712, "data/tokens_consumed_B": 25.755123712, "train/loss_slope": -1.293223185328942e-05} {"step": 12290, "timestamp": 1778207816.7234013, "train/loss": 2.2431764125823976, "train/z_loss": 0.0017409392399713396, "train/perplexity": 9.423215820811848, "train/grad_norm": 0.169921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025060.6250621027, "perf/iters_per_sec": 0.9656241536436571, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0355996131896972, "data/tokens_consumed": 25776095232, "data/tokens_consumed_B": 25.776095232, "train/loss_slope": -1.4395726639123579e-05} {"step": 12300, "timestamp": 1778207827.0735178, "grad/layer_0/attn": 0.0025075944140553474, "grad/layer_0/mlp": 0.0025445176288485527, "grad/layer_0/attn_mlp_ratio": 0.9854890715145798, "grad/layer_4/attn": 0.0019486927194520831, "grad/layer_4/mlp": 0.0025880909524858, "grad/layer_4/attn_mlp_ratio": 0.752945966711801, "grad/layer_8/attn": 0.006587150041013956, "grad/layer_8/mlp": 0.003929396625608206, "grad/layer_8/attn_mlp_ratio": 1.6763769354427744, "grad/layer_12/attn": 0.0039810012094676495, "grad/layer_12/mlp": 0.006295308470726013, "grad/layer_12/attn_mlp_ratio": 0.6323758660504447, "grad/layer_16/attn": 0.004353239666670561, "grad/layer_16/mlp": 0.004598537459969521, "grad/layer_16/attn_mlp_ratio": 0.9466574122533458, "grad/layer_20/attn": 0.003759868908673525, "grad/layer_20/mlp": 0.0058431257493793964, "grad/layer_20/attn_mlp_ratio": 0.6434687538131434, "grad/layer_24/attn": 0.012729658745229244, "grad/layer_24/mlp": 0.010131729766726494, "grad/layer_24/attn_mlp_ratio": 1.2564151346982297, "grad/layer_27/attn": 0.006193351000547409, "grad/layer_27/mlp": 0.009202663786709309, "grad/layer_27/attn_mlp_ratio": 0.6729954583576593} {"step": 12300, "timestamp": 1778207827.6660676, "eos/sharpness": 20.30975818634033, "eos/L0_probe": 2.1173477172851562, "eos/L_plus": 2.2409629821777344, "eos/L_minus": 2.1968300342559814, "eos/grad_norm": 0.12958580255508423, "eos/embed_grad_frac": 0.1436329185962677, "eos/time_s": 0.5896611213684082} {"step": 12300, "timestamp": 1778207827.6856906, "train/loss": 2.2930429458618162, "train/z_loss": 0.0017266400274820626, "train/perplexity": 9.905032346923264, "train/grad_norm": 0.1298828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1913914.7446180203, "perf/iters_per_sec": 0.9126256678667165, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0957395076751708, "data/tokens_consumed": 25797066752, "data/tokens_consumed_B": 25.797066752, "train/loss_slope": -1.544392913660416e-05} {"step": 12300, "timestamp": 1778207829.048566, "geo/rankme_last": 440.0747375488281, "geo/layer_0/stable_rank_q_proj": 15.312868118286133, "geo/layer_0/stable_rank_k_proj": 13.345660209655762, "geo/layer_0/stable_rank_o_proj": 55.1690788269043, "geo/layer_0/stable_rank_gate_proj": 162.40826416015625, "geo/layer_0/stable_rank_down_proj": 48.26225662231445, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.04262596368789673, "geo/layer_0/attn_entropy_mean": 6.34281063079834, "geo/layer_0/attn_entropy_std": 0.2850079834461212, "geo/layer_7/stable_rank_q_proj": 43.82379913330078, "geo/layer_7/stable_rank_k_proj": 43.51774978637695, "geo/layer_7/stable_rank_o_proj": 108.17929077148438, "geo/layer_7/stable_rank_gate_proj": 119.4041748046875, "geo/layer_7/stable_rank_down_proj": 161.36007690429688, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5810292363166809, "geo/layer_7/attn_entropy_mean": 4.63938570022583, "geo/layer_7/attn_entropy_std": 0.8503777384757996, "geo/layer_14/stable_rank_q_proj": 61.021881103515625, "geo/layer_14/stable_rank_k_proj": 38.977210998535156, "geo/layer_14/stable_rank_o_proj": 49.59626388549805, "geo/layer_14/stable_rank_gate_proj": 103.65376281738281, "geo/layer_14/stable_rank_down_proj": 139.24090576171875, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3948167860507965, "geo/layer_14/attn_entropy_mean": 5.642904281616211, "geo/layer_14/attn_entropy_std": 0.5503130555152893, "geo/layer_21/stable_rank_q_proj": 51.303653717041016, "geo/layer_21/stable_rank_k_proj": 32.45341491699219, "geo/layer_21/stable_rank_o_proj": 90.3285140991211, "geo/layer_21/stable_rank_gate_proj": 102.54513549804688, "geo/layer_21/stable_rank_down_proj": 67.8774642944336, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15888121724128723, "geo/layer_21/attn_entropy_mean": 5.797685623168945, "geo/layer_21/attn_entropy_std": 0.2961413562297821, "geo/layer_27/stable_rank_q_proj": 42.46004867553711, "geo/layer_27/stable_rank_k_proj": 32.66938781738281, "geo/layer_27/stable_rank_o_proj": 116.27279663085938, "geo/layer_27/stable_rank_gate_proj": 94.64270782470703, "geo/layer_27/stable_rank_down_proj": 146.818359375, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0608319453895092, "geo/layer_27/attn_entropy_mean": 4.519474029541016, "geo/layer_27/attn_entropy_std": 0.4868786036968231, "attnres/final_alpha/block_0": 0.24634847044944763, "attnres/block_norm/0": 1.475224494934082, "attnres/final_alpha/block_1": 0.008720656856894493, "attnres/block_norm/1": 20512.76171875, "attnres/final_alpha/block_2": 0.01765449345111847, "attnres/block_norm/2": 15273.16015625, "attnres/final_alpha/block_3": 0.019169975072145462, "attnres/block_norm/3": 17912.52734375, "attnres/final_alpha/block_4": 0.026799026876688004, "attnres/block_norm/4": 6524.7705078125, "attnres/final_alpha/block_5": 0.5238631367683411, "attnres/block_norm/5": 4182.5009765625, "attnres/final_alpha/block_6": 0.15744420886039734, "attnres/block_norm/6": 12620.0146484375, "geo/tier1_time_s": 1.358595371246338, "geo/step": 12300.0, "geo/rankme_slope": 0.000538551787902661} {"step": 12310, "timestamp": 1778207839.417008, "train/loss": 2.2963053703308107, "train/z_loss": 0.0017255808576010167, "train/perplexity": 9.937399535864536, "train/grad_norm": 0.11962890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1788299.5744615793, "perf/iters_per_sec": 0.8527276871021172, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1727073192596436, "data/tokens_consumed": 25818038272, "data/tokens_consumed_B": 25.818038272, "train/loss_slope": -1.558937129407825e-05} {"step": 12320, "timestamp": 1778207849.7762504, "train/loss": 2.3411078453063965, "train/z_loss": 0.0017152974382042886, "train/perplexity": 10.392743739841617, "train/grad_norm": 0.2041015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025695.4825114848, "perf/iters_per_sec": 0.9659268772656845, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352750539779663, "data/tokens_consumed": 25839009792, "data/tokens_consumed_B": 25.839009792, "train/loss_slope": -1.3101696746327162e-05} {"step": 12330, "timestamp": 1778207860.1397972, "train/loss": 2.27206494808197, "train/z_loss": 0.0017333454568870365, "train/perplexity": 9.699408924964018, "train/grad_norm": 0.2451171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024693.5472985485, "perf/iters_per_sec": 0.9654491173260443, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0357873678207397, "data/tokens_consumed": 25859981312, "data/tokens_consumed_B": 25.859981312, "train/loss_slope": -1.6466503632594414e-05} {"step": 12340, "timestamp": 1778207870.4991684, "train/loss": 2.303858160972595, "train/z_loss": 0.0017229224205948413, "train/perplexity": 10.012738786735753, "train/grad_norm": 0.1416015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026071.27724638, "perf/iters_per_sec": 0.9661060701591396, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035083031654358, "data/tokens_consumed": 25880952832, "data/tokens_consumed_B": 25.880952832, "train/loss_slope": -1.3229555000673726e-05} {"step": 12350, "timestamp": 1778207880.8435597, "grad/layer_0/attn": 0.0027344452682882547, "grad/layer_0/mlp": 0.0027702234219759703, "grad/layer_0/attn_mlp_ratio": 0.9870847051135443, "grad/layer_4/attn": 0.0016132643213495612, "grad/layer_4/mlp": 0.0025431152898818254, "grad/layer_4/attn_mlp_ratio": 0.6343653645320935, "grad/layer_8/attn": 0.00888972170650959, "grad/layer_8/mlp": 0.003913450986146927, "grad/layer_8/attn_mlp_ratio": 2.271581146875189, "grad/layer_12/attn": 0.005073902662843466, "grad/layer_12/mlp": 0.006562710274010897, "grad/layer_12/attn_mlp_ratio": 0.7731413354666261, "grad/layer_16/attn": 0.004689100198447704, "grad/layer_16/mlp": 0.004711803048849106, "grad/layer_16/attn_mlp_ratio": 0.9951816852945254, "grad/layer_20/attn": 0.012020708061754704, "grad/layer_20/mlp": 0.007006471510976553, "grad/layer_20/attn_mlp_ratio": 1.715657855934604, "grad/layer_24/attn": 0.017379900440573692, "grad/layer_24/mlp": 0.014588028192520142, "grad/layer_24/attn_mlp_ratio": 1.1913810483549072, "grad/layer_27/attn": 0.005605271086096764, "grad/layer_27/mlp": 0.013297184370458126, "grad/layer_27/attn_mlp_ratio": 0.42153819092679295} {"step": 12350, "timestamp": 1778207880.8592033, "train/loss": 2.304908609390259, "train/z_loss": 0.001716960722114891, "train/perplexity": 10.023262178523222, "train/grad_norm": 0.185546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025366.1818514143, "perf/iters_per_sec": 0.9657698544747421, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035443377494812, "data/tokens_consumed": 25901924352, "data/tokens_consumed_B": 25.901924352, "train/loss_slope": -1.0609446593386759e-05} {"step": 12360, "timestamp": 1778207891.2231622, "train/loss": 2.3219006776809694, "train/z_loss": 0.0017025540233589708, "train/perplexity": 10.195033374962776, "train/grad_norm": 0.283203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024843.2523417643, "perf/iters_per_sec": 0.96552050225342, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0357107877731324, "data/tokens_consumed": 25922895872, "data/tokens_consumed_B": 25.922895872, "train/loss_slope": -9.278389916132425e-06} {"step": 12370, "timestamp": 1778207901.5838933, "train/loss": 2.2997648239135744, "train/z_loss": 0.0017223203554749488, "train/perplexity": 9.971837041423084, "train/grad_norm": 0.1689453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025393.9770329292, "perf/iters_per_sec": 0.9657831082501074, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354291677474976, "data/tokens_consumed": 25943867392, "data/tokens_consumed_B": 25.943867392, "train/loss_slope": -1.2026611463178779e-05} {"step": 12375, "timestamp": 1778207907.3701308, "eos/sharpness": 5.433034896850585, "eos/L0_probe": 2.1178295612335205, "eos/L_plus": 2.145336389541626, "eos/L_minus": 2.144653081893921, "eos/grad_norm": 0.09509726613759995, "eos/embed_grad_frac": 0.34557658433914185, "eos/time_s": 0.6198196411132812} {"step": 12375, "timestamp": 1778207908.755362, "geo/rankme_last": 440.97125244140625, "geo/layer_0/stable_rank_q_proj": 15.317806243896484, "geo/layer_0/stable_rank_k_proj": 13.341643333435059, "geo/layer_0/stable_rank_o_proj": 54.860355377197266, "geo/layer_0/stable_rank_gate_proj": 162.46058654785156, "geo/layer_0/stable_rank_down_proj": 48.1615104675293, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.035389244556427, "geo/layer_0/attn_entropy_mean": 6.33681058883667, "geo/layer_0/attn_entropy_std": 0.28216129541397095, "geo/layer_7/stable_rank_q_proj": 43.66248321533203, "geo/layer_7/stable_rank_k_proj": 43.46970748901367, "geo/layer_7/stable_rank_o_proj": 108.19075012207031, "geo/layer_7/stable_rank_gate_proj": 119.37699127197266, "geo/layer_7/stable_rank_down_proj": 161.27777099609375, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5885575413703918, "geo/layer_7/attn_entropy_mean": 4.655427932739258, "geo/layer_7/attn_entropy_std": 0.9088239073753357, "geo/layer_14/stable_rank_q_proj": 61.130531311035156, "geo/layer_14/stable_rank_k_proj": 38.937889099121094, "geo/layer_14/stable_rank_o_proj": 49.58782958984375, "geo/layer_14/stable_rank_gate_proj": 102.85501861572266, "geo/layer_14/stable_rank_down_proj": 139.0352325439453, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4009668827056885, "geo/layer_14/attn_entropy_mean": 5.637595176696777, "geo/layer_14/attn_entropy_std": 0.5402678847312927, "geo/layer_21/stable_rank_q_proj": 51.116764068603516, "geo/layer_21/stable_rank_k_proj": 32.55353546142578, "geo/layer_21/stable_rank_o_proj": 90.38643646240234, "geo/layer_21/stable_rank_gate_proj": 102.25711822509766, "geo/layer_21/stable_rank_down_proj": 67.94733428955078, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15858933329582214, "geo/layer_21/attn_entropy_mean": 5.790749549865723, "geo/layer_21/attn_entropy_std": 0.29645559191703796, "geo/layer_27/stable_rank_q_proj": 42.39990997314453, "geo/layer_27/stable_rank_k_proj": 32.633209228515625, "geo/layer_27/stable_rank_o_proj": 116.31293487548828, "geo/layer_27/stable_rank_gate_proj": 94.72976684570312, "geo/layer_27/stable_rank_down_proj": 146.4851531982422, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0698338970541954, "geo/layer_27/attn_entropy_mean": 4.507057189941406, "geo/layer_27/attn_entropy_std": 0.5238522291183472, "attnres/final_alpha/block_0": 0.24727773666381836, "attnres/block_norm/0": 1.4773201942443848, "attnres/final_alpha/block_1": 0.008716465905308723, "attnres/block_norm/1": 20651.3125, "attnres/final_alpha/block_2": 0.017745403572916985, "attnres/block_norm/2": 15332.2578125, "attnres/final_alpha/block_3": 0.019273962825536728, "attnres/block_norm/3": 17918.828125, "attnres/final_alpha/block_4": 0.026749156415462494, "attnres/block_norm/4": 6552.0791015625, "attnres/final_alpha/block_5": 0.5221163034439087, "attnres/block_norm/5": 4202.8759765625, "attnres/final_alpha/block_6": 0.1581210196018219, "attnres/block_norm/6": 12665.5693359375, "geo/tier1_time_s": 1.3649988174438477, "geo/step": 12375.0, "geo/rankme_slope": 0.0005395434541003902} {"step": 12380, "timestamp": 1778207913.9347334, "train/loss": 2.307941031455994, "train/z_loss": 0.001713816192932427, "train/perplexity": 10.053703071414864, "train/grad_norm": 0.248046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1698792.4220846551, "perf/iters_per_sec": 0.8100473509238506, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2344957351684571, "data/tokens_consumed": 25964838912, "data/tokens_consumed_B": 25.964838912, "train/loss_slope": -9.156311076931348e-06} {"step": 12390, "timestamp": 1778207924.291364, "train/loss": 2.2834553718566895, "train/z_loss": 0.0017264424823224545, "train/perplexity": 9.810520907950623, "train/grad_norm": 0.2275390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025832.7382780525, "perf/iters_per_sec": 0.9659923259153617, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352049112319945, "data/tokens_consumed": 25985810432, "data/tokens_consumed_B": 25.985810432, "train/loss_slope": -1.1373648770821885e-05} {"step": 12400, "timestamp": 1778207934.6364868, "grad/layer_0/attn": 0.003751455806195736, "grad/layer_0/mlp": 0.0030320165678858757, "grad/layer_0/attn_mlp_ratio": 1.2372807332921099, "grad/layer_4/attn": 0.002135573886334896, "grad/layer_4/mlp": 0.0028534461744129658, "grad/layer_4/attn_mlp_ratio": 0.7484191678969823, "grad/layer_8/attn": 0.005272062961012125, "grad/layer_8/mlp": 0.004145361948758364, "grad/layer_8/attn_mlp_ratio": 1.2717979513010778, "grad/layer_12/attn": 0.005730161443352699, "grad/layer_12/mlp": 0.006074759177863598, "grad/layer_12/attn_mlp_ratio": 0.9432738288467476, "grad/layer_16/attn": 0.006052883341908455, "grad/layer_16/mlp": 0.004866149742156267, "grad/layer_16/attn_mlp_ratio": 1.2438752480392852, "grad/layer_20/attn": 0.006087257526814938, "grad/layer_20/mlp": 0.007010618224740028, "grad/layer_20/attn_mlp_ratio": 0.8682911042715579, "grad/layer_24/attn": 0.005244512110948563, "grad/layer_24/mlp": 0.009295318275690079, "grad/layer_24/attn_mlp_ratio": 0.5642100570395162, "grad/layer_27/attn": 0.006369947921484709, "grad/layer_27/mlp": 0.007817151956260204, "grad/layer_27/attn_mlp_ratio": 0.8148681099766338} {"step": 12400, "timestamp": 1778207934.6526327, "train/loss": 2.267501401901245, "train/z_loss": 0.0017198066227138043, "train/perplexity": 9.655246070666175, "train/grad_norm": 0.12060546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025271.7958553145, "perf/iters_per_sec": 0.9657248477245877, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354916334152222, "data/tokens_consumed": 26006781952, "data/tokens_consumed_B": 26.006781952, "train/loss_slope": -9.343896826358644e-06} {"step": 12410, "timestamp": 1778207945.005742, "train/loss": 2.251604509353638, "train/z_loss": 0.0017313271993771196, "train/perplexity": 9.502971216639331, "train/grad_norm": 0.197265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026714.8006052615, "perf/iters_per_sec": 0.966412926008826, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347543716430665, "data/tokens_consumed": 26027753472, "data/tokens_consumed_B": 26.027753472, "train/loss_slope": -7.052645023756795e-06} {"step": 12420, "timestamp": 1778207955.3643026, "train/loss": 2.263525056838989, "train/z_loss": 0.001718469022307545, "train/perplexity": 9.616929710648702, "train/grad_norm": 0.224609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026624.9582186975, "perf/iters_per_sec": 0.9663700858205306, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348002433776855, "data/tokens_consumed": 26048724992, "data/tokens_consumed_B": 26.048724992, "train/loss_slope": -8.951675530635922e-06} {"step": 12430, "timestamp": 1778207965.7236476, "train/loss": 2.2885813236236574, "train/z_loss": 0.0017290490563027562, "train/perplexity": 9.86093827302984, "train/grad_norm": 0.21484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025698.1416069623, "perf/iters_per_sec": 0.9659281452212154, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352736949920653, "data/tokens_consumed": 26069696512, "data/tokens_consumed_B": 26.069696512, "train/loss_slope": -1.1336774909027738e-05} {"step": 12440, "timestamp": 1778207976.0774946, "train/loss": 2.2828646898269653, "train/z_loss": 0.001725547481328249, "train/perplexity": 9.804727720682312, "train/grad_norm": 0.2470703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026423.495548631, "perf/iters_per_sec": 0.9662740209334522, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349031209945678, "data/tokens_consumed": 26090668032, "data/tokens_consumed_B": 26.090668032, "train/loss_slope": -9.914899709308371e-06} {"step": 12450, "timestamp": 1778207986.4239404, "grad/layer_0/attn": 0.0025356709957122803, "grad/layer_0/mlp": 0.002562381559982896, "grad/layer_0/attn_mlp_ratio": 0.9895758447354818, "grad/layer_4/attn": 0.0016527946572750807, "grad/layer_4/mlp": 0.0026233734097331762, "grad/layer_4/attn_mlp_ratio": 0.6300264339572396, "grad/layer_8/attn": 0.006264605093747377, "grad/layer_8/mlp": 0.003960812464356422, "grad/layer_8/attn_mlp_ratio": 1.581646440460959, "grad/layer_12/attn": 0.0043748002499341965, "grad/layer_12/mlp": 0.006142733618617058, "grad/layer_12/attn_mlp_ratio": 0.7121910944430643, "grad/layer_16/attn": 0.004685745574533939, "grad/layer_16/mlp": 0.004530256614089012, "grad/layer_16/attn_mlp_ratio": 1.0343223067163854, "grad/layer_20/attn": 0.011056004092097282, "grad/layer_20/mlp": 0.006334888748824596, "grad/layer_20/attn_mlp_ratio": 1.7452562082677525, "grad/layer_24/attn": 0.017192065715789795, "grad/layer_24/mlp": 0.013429933227598667, "grad/layer_24/attn_mlp_ratio": 1.2801303845984033, "grad/layer_27/attn": 0.004649114329367876, "grad/layer_27/mlp": 0.01171344704926014, "grad/layer_27/attn_mlp_ratio": 0.39690402578557155} {"step": 12450, "timestamp": 1778207987.0457714, "eos/sharpness": 23.737478256225582, "eos/L0_probe": 2.1172492504119873, "eos/L_plus": 2.2579143047332764, "eos/L_minus": 2.213958978652954, "eos/grad_norm": 0.16867877542972565, "eos/embed_grad_frac": 0.10408657044172287, "eos/time_s": 0.6190390586853027} {"step": 12450, "timestamp": 1778207987.0657368, "train/loss": 2.289993715286255, "train/z_loss": 0.0017160693183541297, "train/perplexity": 9.874875620211625, "train/grad_norm": 0.1689453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1909227.030437567, "perf/iters_per_sec": 0.9103903915584407, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0984298706054687, "data/tokens_consumed": 26111639552, "data/tokens_consumed_B": 26.111639552, "train/loss_slope": -1.0522541325978578e-05} {"step": 12450, "timestamp": 1778207988.4323637, "geo/rankme_last": 440.7679748535156, "geo/layer_0/stable_rank_q_proj": 15.323726654052734, "geo/layer_0/stable_rank_k_proj": 13.333992004394531, "geo/layer_0/stable_rank_o_proj": 54.770999908447266, "geo/layer_0/stable_rank_gate_proj": 163.2581024169922, "geo/layer_0/stable_rank_down_proj": 48.20990753173828, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.03243859112262726, "geo/layer_0/attn_entropy_mean": 6.335941791534424, "geo/layer_0/attn_entropy_std": 0.2862159311771393, "geo/layer_7/stable_rank_q_proj": 43.75664138793945, "geo/layer_7/stable_rank_k_proj": 43.40407943725586, "geo/layer_7/stable_rank_o_proj": 108.24710845947266, "geo/layer_7/stable_rank_gate_proj": 118.97769165039062, "geo/layer_7/stable_rank_down_proj": 160.7320098876953, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5886562466621399, "geo/layer_7/attn_entropy_mean": 4.655293941497803, "geo/layer_7/attn_entropy_std": 0.9058617353439331, "geo/layer_14/stable_rank_q_proj": 60.93714141845703, "geo/layer_14/stable_rank_k_proj": 38.964195251464844, "geo/layer_14/stable_rank_o_proj": 49.66035842895508, "geo/layer_14/stable_rank_gate_proj": 102.2576675415039, "geo/layer_14/stable_rank_down_proj": 138.90252685546875, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39146551489830017, "geo/layer_14/attn_entropy_mean": 5.6144185066223145, "geo/layer_14/attn_entropy_std": 0.5425666570663452, "geo/layer_21/stable_rank_q_proj": 51.26971435546875, "geo/layer_21/stable_rank_k_proj": 32.59668731689453, "geo/layer_21/stable_rank_o_proj": 90.54540252685547, "geo/layer_21/stable_rank_gate_proj": 101.95171356201172, "geo/layer_21/stable_rank_down_proj": 67.84629821777344, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.16136543452739716, "geo/layer_21/attn_entropy_mean": 5.784976959228516, "geo/layer_21/attn_entropy_std": 0.29354196786880493, "geo/layer_27/stable_rank_q_proj": 42.325950622558594, "geo/layer_27/stable_rank_k_proj": 32.67097473144531, "geo/layer_27/stable_rank_o_proj": 116.09606170654297, "geo/layer_27/stable_rank_gate_proj": 94.64720153808594, "geo/layer_27/stable_rank_down_proj": 146.59405517578125, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07021409273147583, "geo/layer_27/attn_entropy_mean": 4.466429710388184, "geo/layer_27/attn_entropy_std": 0.5147169828414917, "attnres/final_alpha/block_0": 0.24578392505645752, "attnres/block_norm/0": 1.4792015552520752, "attnres/final_alpha/block_1": 0.00868351012468338, "attnres/block_norm/1": 20768.3046875, "attnres/final_alpha/block_2": 0.017662493512034416, "attnres/block_norm/2": 15348.15625, "attnres/final_alpha/block_3": 0.019134001806378365, "attnres/block_norm/3": 18051.0, "attnres/final_alpha/block_4": 0.02647416666150093, "attnres/block_norm/4": 6586.162109375, "attnres/final_alpha/block_5": 0.5259405970573425, "attnres/block_norm/5": 4190.93359375, "attnres/final_alpha/block_6": 0.15632131695747375, "attnres/block_norm/6": 12811.9296875, "geo/tier1_time_s": 1.362353801727295, "geo/step": 12450.0, "geo/rankme_slope": 0.000552405180822329} {"step": 12460, "timestamp": 1778207998.7864065, "train/loss": 2.2774876594543456, "train/z_loss": 0.0017339823767542838, "train/perplexity": 9.75214888760601, "train/grad_norm": 0.1572265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1789875.0472890527, "perf/iters_per_sec": 0.8534789310879959, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1716750860214233, "data/tokens_consumed": 26132611072, "data/tokens_consumed_B": 26.132611072, "train/loss_slope": -1.2346021756373302e-05} {"step": 12470, "timestamp": 1778208009.1750126, "train/loss": 2.286462736129761, "train/z_loss": 0.0017174137756228447, "train/perplexity": 9.840069126888302, "train/grad_norm": 0.15625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019764.4634669323, "perf/iters_per_sec": 0.9630987469992315, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0383151292800903, "data/tokens_consumed": 26153582592, "data/tokens_consumed_B": 26.153582592, "train/loss_slope": -1.2458753728880855e-05} {"step": 12480, "timestamp": 1778208019.5488064, "train/loss": 2.2904448986053465, "train/z_loss": 0.001723341690376401, "train/perplexity": 9.879332004617124, "train/grad_norm": 0.1796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023356.4128800503, "perf/iters_per_sec": 0.964811521949792, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0364718675613402, "data/tokens_consumed": 26174554112, "data/tokens_consumed_B": 26.174554112, "train/loss_slope": -1.806917935922771e-05} {"step": 12490, "timestamp": 1778208029.9006104, "train/loss": 2.312579703330994, "train/z_loss": 0.0017095471965149045, "train/perplexity": 10.100447232687722, "train/grad_norm": 0.13671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027014.1303485811, "perf/iters_per_sec": 0.9665556575529962, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346015691757202, "data/tokens_consumed": 26195525632, "data/tokens_consumed_B": 26.195525632, "train/loss_slope": -1.607583180727126e-05} {"step": 12500, "timestamp": 1778208040.2442362, "grad/layer_0/attn": 0.0028256885707378387, "grad/layer_0/mlp": 0.0025989902205765247, "grad/layer_0/attn_mlp_ratio": 1.0872255076775454, "grad/layer_4/attn": 0.0016377551946789026, "grad/layer_4/mlp": 0.002643425250425935, "grad/layer_4/attn_mlp_ratio": 0.6195579513584567, "grad/layer_8/attn": 0.005404818803071976, "grad/layer_8/mlp": 0.003915637731552124, "grad/layer_8/attn_mlp_ratio": 1.380316320248023, "grad/layer_12/attn": 0.003779933089390397, "grad/layer_12/mlp": 0.005763415712863207, "grad/layer_12/attn_mlp_ratio": 0.6558494497228657, "grad/layer_16/attn": 0.004305060487240553, "grad/layer_16/mlp": 0.004248298704624176, "grad/layer_16/attn_mlp_ratio": 1.0133610382006542, "grad/layer_20/attn": 0.005424723494797945, "grad/layer_20/mlp": 0.005786463618278503, "grad/layer_20/attn_mlp_ratio": 0.9374850960634421, "grad/layer_24/attn": 0.010049953125417233, "grad/layer_24/mlp": 0.009857610799372196, "grad/layer_24/attn_mlp_ratio": 1.0195120529719108, "grad/layer_27/attn": 0.01512941438704729, "grad/layer_27/mlp": 0.007874513976275921, "grad/layer_27/attn_mlp_ratio": 1.9213140316338626} {"step": 12500, "timestamp": 1778208040.2605066, "train/loss": 2.260296416282654, "train/z_loss": 0.0017181791481561959, "train/perplexity": 9.585930171470757, "train/grad_norm": 0.1650390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025184.1329486629, "perf/iters_per_sec": 0.9656830467933001, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0355364561080933, "data/tokens_consumed": 26216497152, "data/tokens_consumed_B": 26.216497152, "train/loss_slope": -2.0574684540788305e-05} {"step": 12500, "timestamp": 1778208047.3675792, "geo/ww_alpha_mean": 8.218238534137619, "geo/ww_alpha_std": 4.895626036874492, "geo/ww_alpha_min": 1.3759399279979099, "geo/ww_alpha_max": 28.78654394217415, "geo/ww_alpha_healthy_frac": 0.15736040609137056, "geo/ww_alpha_by_type/q_proj": 4.248730969245799, "geo/ww_alpha_by_type/k_proj": 4.691963913891058, "geo/ww_alpha_by_type/v_proj": 8.04486009602914, "geo/ww_alpha_by_type/o_proj": 8.904480045518484, "geo/ww_alpha_by_type/gate_proj": 10.050570729120114, "geo/ww_alpha_by_type/up_proj": 12.709168551341522, "geo/ww_alpha_by_type/down_proj": 9.021717961494355, "geo/twonn_id/layer_0": 0.7088174819946289, "geo/twonn_id/layer_7": 2.970287561416626, "geo/twonn_id/layer_14": 3.8854880332946777, "geo/twonn_id/layer_21": 6.608175754547119, "geo/twonn_id/layer_27": 4.74265718460083, "geo/tier2_time_s": 7.098602056503296} {"step": 12500, "timestamp": 1778208048.004843, "eoc/jacobian_sigma/layer_0/attn": 636.0939331054688, "eoc/jacobian_sigma/layer_0/mlp": 3690.65087890625, "eoc/jacobian_sigma/layer_0": 3690.65087890625, "eoc/jacobian_sigma/layer_7/attn": 1.170521855354309, "eoc/jacobian_sigma/layer_7/mlp": 1.711233139038086, "eoc/jacobian_sigma/layer_7": 1.711233139038086, "eoc/jacobian_sigma/layer_14/attn": 1.3537546396255493, "eoc/jacobian_sigma/layer_14/mlp": 9.906221389770508, "eoc/jacobian_sigma/layer_14": 9.906221389770508, "eoc/jacobian_sigma/layer_21/attn": 1.0875171422958374, "eoc/jacobian_sigma/layer_21/mlp": 3.6121342182159424, "eoc/jacobian_sigma/layer_21": 3.6121342182159424, "eoc/jacobian_sigma/layer_27/attn": 2.733004570007324, "eoc/jacobian_sigma/layer_27/mlp": 22.003768920898438, "eoc/jacobian_sigma/layer_27": 22.003768920898438, "eoc/layer0_sigma": 3690.65087890625, "eoc/sigma_max": 22.003768920898438, "eoc/sigma_min": 1.711233139038086, "eoc/sigma_mean": 9.308339416980743, "eoc/time_s": 0.6305458545684814} {"step": 12510, "timestamp": 1778208058.377616, "train/loss": 2.331247925758362, "train/z_loss": 0.0017106315004639327, "train/perplexity": 10.290775647370829, "train/grad_norm": 0.201171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1157973.558515296, "perf/iters_per_sec": 0.5521648209167939, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.8110534429550171, "data/tokens_consumed": 26237468672, "data/tokens_consumed_B": 26.237468672, "train/loss_slope": -1.9118681034096087e-05} {"step": 12520, "timestamp": 1778208068.7298412, "train/loss": 2.2882453203201294, "train/z_loss": 0.0017221271409653127, "train/perplexity": 9.857625521773064, "train/grad_norm": 0.1748046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026825.6202998955, "perf/iters_per_sec": 0.9664657689570882, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346977949142455, "data/tokens_consumed": 26258440192, "data/tokens_consumed_B": 26.258440192, "train/loss_slope": -2.2761425713036333e-05} {"step": 12525, "timestamp": 1778208074.5364285, "eos/sharpness": 33.3613634109497, "eos/L0_probe": 2.1136386394500732, "eos/L_plus": 2.2619669437408447, "eos/L_minus": 2.298923969268799, "eos/grad_norm": 0.15805985033512115, "eos/embed_grad_frac": 0.11266905069351196, "eos/time_s": 0.640221118927002} {"step": 12525, "timestamp": 1778208075.9187498, "geo/rankme_last": 440.314453125, "geo/layer_0/stable_rank_q_proj": 15.350038528442383, "geo/layer_0/stable_rank_k_proj": 13.366515159606934, "geo/layer_0/stable_rank_o_proj": 54.89393997192383, "geo/layer_0/stable_rank_gate_proj": 163.36907958984375, "geo/layer_0/stable_rank_down_proj": 48.266056060791016, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.03173185884952545, "geo/layer_0/attn_entropy_mean": 6.333900451660156, "geo/layer_0/attn_entropy_std": 0.2887859046459198, "geo/layer_7/stable_rank_q_proj": 43.852840423583984, "geo/layer_7/stable_rank_k_proj": 43.54376220703125, "geo/layer_7/stable_rank_o_proj": 108.30117797851562, "geo/layer_7/stable_rank_gate_proj": 118.73090362548828, "geo/layer_7/stable_rank_down_proj": 160.28981018066406, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5995375514030457, "geo/layer_7/attn_entropy_mean": 4.649919033050537, "geo/layer_7/attn_entropy_std": 0.8891788721084595, "geo/layer_14/stable_rank_q_proj": 60.785552978515625, "geo/layer_14/stable_rank_k_proj": 39.02424240112305, "geo/layer_14/stable_rank_o_proj": 49.71424102783203, "geo/layer_14/stable_rank_gate_proj": 102.16128540039062, "geo/layer_14/stable_rank_down_proj": 138.77171325683594, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4164695739746094, "geo/layer_14/attn_entropy_mean": 5.553534030914307, "geo/layer_14/attn_entropy_std": 0.5610225796699524, "geo/layer_21/stable_rank_q_proj": 51.348609924316406, "geo/layer_21/stable_rank_k_proj": 32.43717956542969, "geo/layer_21/stable_rank_o_proj": 90.58731079101562, "geo/layer_21/stable_rank_gate_proj": 101.7010726928711, "geo/layer_21/stable_rank_down_proj": 67.54022216796875, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15902292728424072, "geo/layer_21/attn_entropy_mean": 5.77935266494751, "geo/layer_21/attn_entropy_std": 0.3020758032798767, "geo/layer_27/stable_rank_q_proj": 42.428184509277344, "geo/layer_27/stable_rank_k_proj": 32.454185485839844, "geo/layer_27/stable_rank_o_proj": 115.9894027709961, "geo/layer_27/stable_rank_gate_proj": 94.7199935913086, "geo/layer_27/stable_rank_down_proj": 146.5885009765625, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.06987753510475159, "geo/layer_27/attn_entropy_mean": 4.475252628326416, "geo/layer_27/attn_entropy_std": 0.48349952697753906, "attnres/final_alpha/block_0": 0.24812564253807068, "attnres/block_norm/0": 1.481088399887085, "attnres/final_alpha/block_1": 0.008851482532918453, "attnres/block_norm/1": 20790.6796875, "attnres/final_alpha/block_2": 0.017473623156547546, "attnres/block_norm/2": 15478.029296875, "attnres/final_alpha/block_3": 0.01937473751604557, "attnres/block_norm/3": 18232.990234375, "attnres/final_alpha/block_4": 0.026661137118935585, "attnres/block_norm/4": 6610.49609375, "attnres/final_alpha/block_5": 0.5213907957077026, "attnres/block_norm/5": 4205.9521484375, "attnres/final_alpha/block_6": 0.15812258422374725, "attnres/block_norm/6": 12793.208984375, "geo/tier1_time_s": 1.3621904850006104, "geo/step": 12525.0, "geo/rankme_slope": 0.0005107315973264306} {"step": 12530, "timestamp": 1778208081.098297, "train/loss": 2.2117116451263428, "train/z_loss": 0.001733974926173687, "train/perplexity": 9.131332625489048, "train/grad_norm": 0.1064453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1696312.9424007179, "perf/iters_per_sec": 0.8088650428775396, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2363001823425293, "data/tokens_consumed": 26279411712, "data/tokens_consumed_B": 26.279411712, "train/loss_slope": -2.7453785074247127e-05} {"step": 12540, "timestamp": 1778208091.45439, "train/loss": 2.267911434173584, "train/z_loss": 0.0017131694708950817, "train/perplexity": 9.659205844914661, "train/grad_norm": 0.10107421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026188.5141637165, "perf/iters_per_sec": 0.9661619730776388, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350231409072876, "data/tokens_consumed": 26300383232, "data/tokens_consumed_B": 26.300383232, "train/loss_slope": -2.4820037521902485e-05} {"step": 12550, "timestamp": 1778208101.7972865, "grad/layer_0/attn": 0.002600262872874737, "grad/layer_0/mlp": 0.0025880620814859867, "grad/layer_0/attn_mlp_ratio": 1.0047142187989258, "grad/layer_4/attn": 0.00227586948312819, "grad/layer_4/mlp": 0.0026129591278731823, "grad/layer_4/attn_mlp_ratio": 0.8709931095942257, "grad/layer_8/attn": 0.005526562221348286, "grad/layer_8/mlp": 0.004142143297940493, "grad/layer_8/attn_mlp_ratio": 1.3342276426490058, "grad/layer_12/attn": 0.004453253000974655, "grad/layer_12/mlp": 0.0064564053900539875, "grad/layer_12/attn_mlp_ratio": 0.6897418397643761, "grad/layer_16/attn": 0.005022075958549976, "grad/layer_16/mlp": 0.004522130824625492, "grad/layer_16/attn_mlp_ratio": 1.110555187865528, "grad/layer_20/attn": 0.004246017895638943, "grad/layer_20/mlp": 0.0058685061521828175, "grad/layer_20/attn_mlp_ratio": 0.7235261773913281, "grad/layer_24/attn": 0.004184485878795385, "grad/layer_24/mlp": 0.008638598024845123, "grad/layer_24/attn_mlp_ratio": 0.4843940901430007, "grad/layer_27/attn": 0.004754438064992428, "grad/layer_27/mlp": 0.007573423907160759, "grad/layer_27/attn_mlp_ratio": 0.6277792000681663} {"step": 12550, "timestamp": 1778208101.8135352, "train/loss": 2.305047082901001, "train/z_loss": 0.001716035441495478, "train/perplexity": 10.024650230928197, "train/grad_norm": 0.09716796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025344.2167358806, "perf/iters_per_sec": 0.9657593806914714, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354546070098878, "data/tokens_consumed": 26321354752, "data/tokens_consumed_B": 26.321354752, "train/loss_slope": -2.297133136146103e-05} {"step": 12560, "timestamp": 1778208112.1696916, "train/loss": 2.2767486333847047, "train/z_loss": 0.0017174983979202807, "train/perplexity": 9.744944457801669, "train/grad_norm": 0.255859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026169.704938817, "perf/iters_per_sec": 0.9661530041402898, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350327491760254, "data/tokens_consumed": 26342326272, "data/tokens_consumed_B": 26.342326272, "train/loss_slope": -2.088861029104466e-05} {"step": 12570, "timestamp": 1778208122.520985, "train/loss": 2.2980762720108032, "train/z_loss": 0.001710799860302359, "train/perplexity": 9.95501328490304, "train/grad_norm": 0.1630859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027041.9707801626, "perf/iters_per_sec": 0.9665689329052747, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345873594284059, "data/tokens_consumed": 26363297792, "data/tokens_consumed_B": 26.363297792, "train/loss_slope": -1.961851594972423e-05} {"step": 12580, "timestamp": 1778208132.874253, "train/loss": 2.32687246799469, "train/z_loss": 0.0016884513897821307, "train/perplexity": 10.245847156206331, "train/grad_norm": 0.220703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026608.148677553, "perf/iters_per_sec": 0.9663620704067006, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348088264465332, "data/tokens_consumed": 26384269312, "data/tokens_consumed_B": 26.384269312, "train/loss_slope": -1.4796733570070238e-05} {"step": 12590, "timestamp": 1778208143.2330704, "train/loss": 2.2578346490859986, "train/z_loss": 0.0017283266643062234, "train/perplexity": 9.562360865999812, "train/grad_norm": 0.12158203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025753.6576506996, "perf/iters_per_sec": 0.9659546173337458, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352453231811523, "data/tokens_consumed": 26405240832, "data/tokens_consumed_B": 26.405240832, "train/loss_slope": -2.0570284119247458e-05} {"step": 12600, "timestamp": 1778208153.5766826, "grad/layer_0/attn": 0.002675095573067665, "grad/layer_0/mlp": 0.002573430072516203, "grad/layer_0/attn_mlp_ratio": 1.0395057933326621, "grad/layer_4/attn": 0.0016194003401324153, "grad/layer_4/mlp": 0.0026351273991167545, "grad/layer_4/attn_mlp_ratio": 0.61454344834366, "grad/layer_8/attn": 0.008279679343104362, "grad/layer_8/mlp": 0.003804567037150264, "grad/layer_8/attn_mlp_ratio": 2.1762474007243027, "grad/layer_12/attn": 0.004300366621464491, "grad/layer_12/mlp": 0.005800546146929264, "grad/layer_12/attn_mlp_ratio": 0.7413726980870209, "grad/layer_16/attn": 0.005376914981752634, "grad/layer_16/mlp": 0.0042930529452860355, "grad/layer_16/attn_mlp_ratio": 1.2524687966892765, "grad/layer_20/attn": 0.007018641568720341, "grad/layer_20/mlp": 0.006770044565200806, "grad/layer_20/attn_mlp_ratio": 1.0367201275343672, "grad/layer_24/attn": 0.019591880962252617, "grad/layer_24/mlp": 0.01241121906787157, "grad/layer_24/attn_mlp_ratio": 1.5785621619646635, "grad/layer_27/attn": 0.013544282875955105, "grad/layer_27/mlp": 0.01055251993238926, "grad/layer_27/attn_mlp_ratio": 1.2835116952522347} {"step": 12600, "timestamp": 1778208154.2096558, "eos/sharpness": 43.901252746582024, "eos/L0_probe": 2.1183626651763916, "eos/L_plus": 2.296928644180298, "eos/L_minus": 2.3788092136383057, "eos/grad_norm": 0.20090115070343018, "eos/embed_grad_frac": 0.06352243572473526, "eos/time_s": 0.6300160884857178} {"step": 12600, "timestamp": 1778208154.2291791, "train/loss": 2.292119765281677, "train/z_loss": 0.001716317841783166, "train/perplexity": 9.895892432959606, "train/grad_norm": 0.201171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1908077.9962251866, "perf/iters_per_sec": 0.9098424893499311, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0990913391113282, "data/tokens_consumed": 26426212352, "data/tokens_consumed_B": 26.426212352, "train/loss_slope": -1.665215792685986e-05} {"step": 12600, "timestamp": 1778208155.5971684, "geo/rankme_last": 440.71185302734375, "geo/layer_0/stable_rank_q_proj": 15.380483627319336, "geo/layer_0/stable_rank_k_proj": 13.415680885314941, "geo/layer_0/stable_rank_o_proj": 54.949832916259766, "geo/layer_0/stable_rank_gate_proj": 162.4810028076172, "geo/layer_0/stable_rank_down_proj": 48.28126525878906, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.033721137791872025, "geo/layer_0/attn_entropy_mean": 6.333693981170654, "geo/layer_0/attn_entropy_std": 0.2877586781978607, "geo/layer_7/stable_rank_q_proj": 43.893035888671875, "geo/layer_7/stable_rank_k_proj": 43.675140380859375, "geo/layer_7/stable_rank_o_proj": 108.50495147705078, "geo/layer_7/stable_rank_gate_proj": 118.52263641357422, "geo/layer_7/stable_rank_down_proj": 160.4937744140625, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5908570885658264, "geo/layer_7/attn_entropy_mean": 4.66058874130249, "geo/layer_7/attn_entropy_std": 0.8974076509475708, "geo/layer_14/stable_rank_q_proj": 60.79473114013672, "geo/layer_14/stable_rank_k_proj": 38.90363693237305, "geo/layer_14/stable_rank_o_proj": 49.7757682800293, "geo/layer_14/stable_rank_gate_proj": 102.1716079711914, "geo/layer_14/stable_rank_down_proj": 138.83551025390625, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39579296112060547, "geo/layer_14/attn_entropy_mean": 5.586330413818359, "geo/layer_14/attn_entropy_std": 0.5979729294776917, "geo/layer_21/stable_rank_q_proj": 51.44906234741211, "geo/layer_21/stable_rank_k_proj": 32.5517578125, "geo/layer_21/stable_rank_o_proj": 90.72236633300781, "geo/layer_21/stable_rank_gate_proj": 101.53009796142578, "geo/layer_21/stable_rank_down_proj": 67.40936279296875, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1570809930562973, "geo/layer_21/attn_entropy_mean": 5.778547286987305, "geo/layer_21/attn_entropy_std": 0.2984786629676819, "geo/layer_27/stable_rank_q_proj": 42.289737701416016, "geo/layer_27/stable_rank_k_proj": 32.44755172729492, "geo/layer_27/stable_rank_o_proj": 116.11143493652344, "geo/layer_27/stable_rank_gate_proj": 94.81548309326172, "geo/layer_27/stable_rank_down_proj": 146.91978454589844, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.06671728193759918, "geo/layer_27/attn_entropy_mean": 4.467344760894775, "geo/layer_27/attn_entropy_std": 0.5106096863746643, "attnres/final_alpha/block_0": 0.24829570949077606, "attnres/block_norm/0": 1.4829069375991821, "attnres/final_alpha/block_1": 0.008749593049287796, "attnres/block_norm/1": 20853.36328125, "attnres/final_alpha/block_2": 0.01762278378009796, "attnres/block_norm/2": 15564.2578125, "attnres/final_alpha/block_3": 0.01923040673136711, "attnres/block_norm/3": 18500.736328125, "attnres/final_alpha/block_4": 0.02676461637020111, "attnres/block_norm/4": 6644.5517578125, "attnres/final_alpha/block_5": 0.5205957889556885, "attnres/block_norm/5": 4240.3056640625, "attnres/final_alpha/block_6": 0.1587410867214203, "attnres/block_norm/6": 12885.0703125, "geo/tier1_time_s": 1.3637583255767822, "geo/step": 12600.0, "geo/rankme_slope": 0.0004786389555822329} {"step": 12610, "timestamp": 1778208165.9545844, "train/loss": 2.2979984045028687, "train/z_loss": 0.0017198804533109068, "train/perplexity": 9.954238143006666, "train/grad_norm": 0.111328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1789154.8143616924, "perf/iters_per_sec": 0.8531354972656691, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.17214674949646, "data/tokens_consumed": 26447183872, "data/tokens_consumed_B": 26.447183872, "train/loss_slope": -2.0554637980468232e-05} {"step": 12620, "timestamp": 1778208176.3142111, "train/loss": 2.2902501583099366, "train/z_loss": 0.0017191952443681656, "train/perplexity": 9.877408287902753, "train/grad_norm": 0.189453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025443.1334664717, "perf/iters_per_sec": 0.9658065478641852, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354040384292602, "data/tokens_consumed": 26468155392, "data/tokens_consumed_B": 26.468155392, "train/loss_slope": -2.151013372039561e-05} {"step": 12630, "timestamp": 1778208186.6673625, "train/loss": 2.32952196598053, "train/z_loss": 0.001698625274002552, "train/perplexity": 10.273029501493163, "train/grad_norm": 0.208984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026796.29136331, "perf/iters_per_sec": 0.9664517838303137, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347127676010133, "data/tokens_consumed": 26489126912, "data/tokens_consumed_B": 26.489126912, "train/loss_slope": -1.6584967164853016e-05} {"step": 12640, "timestamp": 1778208197.5634701, "train/loss": 2.2840699911117555, "train/z_loss": 0.0017128334729932249, "train/perplexity": 9.816552496378199, "train/grad_norm": 0.2490234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1925887.1510393869, "perf/iters_per_sec": 0.9183345561215338, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0889277696609496, "data/tokens_consumed": 26510098432, "data/tokens_consumed_B": 26.510098432, "train/loss_slope": -1.8823742816443137e-05} {"step": 12650, "timestamp": 1778208207.9169912, "grad/layer_0/attn": 0.002892854157835245, "grad/layer_0/mlp": 0.0029711038805544376, "grad/layer_0/attn_mlp_ratio": 0.9736630480685532, "grad/layer_4/attn": 0.0016140709631145, "grad/layer_4/mlp": 0.0027739889919757843, "grad/layer_4/attn_mlp_ratio": 0.5818591600750931, "grad/layer_8/attn": 0.003919829148799181, "grad/layer_8/mlp": 0.004147408530116081, "grad/layer_8/attn_mlp_ratio": 0.9451273068044587, "grad/layer_12/attn": 0.004093294031918049, "grad/layer_12/mlp": 0.005881673656404018, "grad/layer_12/attn_mlp_ratio": 0.6959403396798801, "grad/layer_16/attn": 0.004457515198737383, "grad/layer_16/mlp": 0.004679882898926735, "grad/layer_16/attn_mlp_ratio": 0.9524843248772782, "grad/layer_20/attn": 0.008254513144493103, "grad/layer_20/mlp": 0.006860631518065929, "grad/layer_20/attn_mlp_ratio": 1.2031710203994486, "grad/layer_24/attn": 0.015795784071087837, "grad/layer_24/mlp": 0.01619734615087509, "grad/layer_24/attn_mlp_ratio": 0.9752081499297726, "grad/layer_27/attn": 0.007016547489911318, "grad/layer_27/mlp": 0.01413493137806654, "grad/layer_27/attn_mlp_ratio": 0.49639770102876246} {"step": 12650, "timestamp": 1778208207.9321997, "train/loss": 2.3145214080810548, "train/z_loss": 0.0017036073259077966, "train/perplexity": 10.120078371827425, "train/grad_norm": 0.2255859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023622.1621295311, "perf/iters_per_sec": 0.9649382410667091, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0363357543945313, "data/tokens_consumed": 26531069952, "data/tokens_consumed_B": 26.531069952, "train/loss_slope": -1.5741289194875543e-05} {"step": 12660, "timestamp": 1778208218.3025126, "train/loss": 2.3148396968841554, "train/z_loss": 0.0016925409203395248, "train/perplexity": 10.123299992135316, "train/grad_norm": 0.12890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023542.276404854, "perf/iters_per_sec": 0.964900148584773, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036376667022705, "data/tokens_consumed": 26552041472, "data/tokens_consumed_B": 26.552041472, "train/loss_slope": -1.537764290593023e-05} {"step": 12670, "timestamp": 1778208228.7028184, "train/loss": 2.2980469703674316, "train/z_loss": 0.0017061574384570123, "train/perplexity": 9.954721590927583, "train/grad_norm": 0.2138671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2018008.5449250042, "perf/iters_per_sec": 0.9622614597916623, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0392185926437378, "data/tokens_consumed": 26573012992, "data/tokens_consumed_B": 26.573012992, "train/loss_slope": -1.2751874700523889e-05} {"step": 12675, "timestamp": 1778208234.4814305, "eos/sharpness": 35.21995544433593, "eos/L0_probe": 2.110234498977661, "eos/L_plus": 2.324345350265503, "eos/L_minus": 2.2483232021331787, "eos/grad_norm": 0.20500415563583374, "eos/embed_grad_frac": 0.058888260275125504, "eos/time_s": 0.5928258895874023} {"step": 12675, "timestamp": 1778208235.8681073, "geo/rankme_last": 439.8938903808594, "geo/layer_0/stable_rank_q_proj": 15.379642486572266, "geo/layer_0/stable_rank_k_proj": 13.449210166931152, "geo/layer_0/stable_rank_o_proj": 54.86611557006836, "geo/layer_0/stable_rank_gate_proj": 162.1732940673828, "geo/layer_0/stable_rank_down_proj": 48.249305725097656, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.03402324020862579, "geo/layer_0/attn_entropy_mean": 6.335519313812256, "geo/layer_0/attn_entropy_std": 0.2847798466682434, "geo/layer_7/stable_rank_q_proj": 43.842994689941406, "geo/layer_7/stable_rank_k_proj": 43.62057113647461, "geo/layer_7/stable_rank_o_proj": 108.7786865234375, "geo/layer_7/stable_rank_gate_proj": 118.40776062011719, "geo/layer_7/stable_rank_down_proj": 160.51290893554688, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5792957544326782, "geo/layer_7/attn_entropy_mean": 4.647854804992676, "geo/layer_7/attn_entropy_std": 0.8827945590019226, "geo/layer_14/stable_rank_q_proj": 60.74574279785156, "geo/layer_14/stable_rank_k_proj": 39.04404067993164, "geo/layer_14/stable_rank_o_proj": 49.953365325927734, "geo/layer_14/stable_rank_gate_proj": 102.01290130615234, "geo/layer_14/stable_rank_down_proj": 138.7325439453125, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.396175742149353, "geo/layer_14/attn_entropy_mean": 5.5853095054626465, "geo/layer_14/attn_entropy_std": 0.5565088391304016, "geo/layer_21/stable_rank_q_proj": 51.4388542175293, "geo/layer_21/stable_rank_k_proj": 32.652679443359375, "geo/layer_21/stable_rank_o_proj": 90.2176513671875, "geo/layer_21/stable_rank_gate_proj": 101.32656860351562, "geo/layer_21/stable_rank_down_proj": 67.34349822998047, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1587260514497757, "geo/layer_21/attn_entropy_mean": 5.81216287612915, "geo/layer_21/attn_entropy_std": 0.2957116961479187, "geo/layer_27/stable_rank_q_proj": 42.34731674194336, "geo/layer_27/stable_rank_k_proj": 32.34298324584961, "geo/layer_27/stable_rank_o_proj": 116.06075286865234, "geo/layer_27/stable_rank_gate_proj": 94.83208465576172, "geo/layer_27/stable_rank_down_proj": 146.8464813232422, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.06541682779788971, "geo/layer_27/attn_entropy_mean": 4.465785026550293, "geo/layer_27/attn_entropy_std": 0.5113482475280762, "attnres/final_alpha/block_0": 0.24611224234104156, "attnres/block_norm/0": 1.4844238758087158, "attnres/final_alpha/block_1": 0.008433937095105648, "attnres/block_norm/1": 21112.3984375, "attnres/final_alpha/block_2": 0.01706886477768421, "attnres/block_norm/2": 15621.994140625, "attnres/final_alpha/block_3": 0.018939193338155746, "attnres/block_norm/3": 18446.404296875, "attnres/final_alpha/block_4": 0.02636556513607502, "attnres/block_norm/4": 6703.119140625, "attnres/final_alpha/block_5": 0.5297011137008667, "attnres/block_norm/5": 4242.4677734375, "attnres/final_alpha/block_6": 0.15337909758090973, "attnres/block_norm/6": 13030.6787109375, "geo/tier1_time_s": 1.3677244186401367, "geo/step": 12675.0, "geo/rankme_slope": 0.00042171901963910564} {"step": 12680, "timestamp": 1778208241.0516243, "train/loss": 2.23146595954895, "train/z_loss": 0.0017267219722270966, "train/perplexity": 9.31350930473161, "train/grad_norm": 0.1552734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1699269.3339313297, "perf/iters_per_sec": 0.8102747602135323, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2341492652893067, "data/tokens_consumed": 26593984512, "data/tokens_consumed_B": 26.593984512, "train/loss_slope": -1.6336698586469323e-05} {"step": 12690, "timestamp": 1778208251.4382367, "train/loss": 2.335155558586121, "train/z_loss": 0.0017098347656428815, "train/perplexity": 10.331066890536599, "train/grad_norm": 0.2041015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020011.8274180407, "perf/iters_per_sec": 0.9632166993227199, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0381879806518555, "data/tokens_consumed": 26614956032, "data/tokens_consumed_B": 26.614956032, "train/loss_slope": -1.527588248002499e-05} {"step": 12700, "timestamp": 1778208261.791909, "grad/layer_0/attn": 0.002852571662515402, "grad/layer_0/mlp": 0.0029618353582918644, "grad/layer_0/attn_mlp_ratio": 0.9631094308528952, "grad/layer_4/attn": 0.002216035732999444, "grad/layer_4/mlp": 0.002854322548955679, "grad/layer_4/attn_mlp_ratio": 0.7763788490450555, "grad/layer_8/attn": 0.005462058819830418, "grad/layer_8/mlp": 0.004367230925709009, "grad/layer_8/attn_mlp_ratio": 1.2506915223115924, "grad/layer_12/attn": 0.005749653093516827, "grad/layer_12/mlp": 0.0061085419729352, "grad/layer_12/attn_mlp_ratio": 0.94124801382502, "grad/layer_16/attn": 0.0045822081156075, "grad/layer_16/mlp": 0.0046337987296283245, "grad/layer_16/attn_mlp_ratio": 0.9888664320749199, "grad/layer_20/attn": 0.004924277309328318, "grad/layer_20/mlp": 0.007650814950466156, "grad/layer_20/attn_mlp_ratio": 0.6436278065600719, "grad/layer_24/attn": 0.013019141741096973, "grad/layer_24/mlp": 0.013918781653046608, "grad/layer_24/attn_mlp_ratio": 0.9353650320903468, "grad/layer_27/attn": 0.014104419387876987, "grad/layer_27/mlp": 0.012720728293061256, "grad/layer_27/attn_mlp_ratio": 1.1087745097655328} {"step": 12700, "timestamp": 1778208261.8085938, "train/loss": 2.328617286682129, "train/z_loss": 0.0016983105335384608, "train/perplexity": 10.263739907056772, "train/grad_norm": 0.203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023344.0325008736, "perf/iters_per_sec": 0.9648056185249679, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0364782094955445, "data/tokens_consumed": 26635927552, "data/tokens_consumed_B": 26.635927552, "train/loss_slope": -1.446688237434884e-05} {"step": 12710, "timestamp": 1778208272.1745477, "train/loss": 2.224298334121704, "train/z_loss": 0.001733471651095897, "train/perplexity": 9.246992228142062, "train/grad_norm": 0.17578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025171.5903129487, "perf/iters_per_sec": 0.9656770659985298, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035542869567871, "data/tokens_consumed": 26656899072, "data/tokens_consumed_B": 26.656899072, "train/loss_slope": -1.7809643103058428e-05} {"step": 12720, "timestamp": 1778208282.5253778, "train/loss": 2.305113649368286, "train/z_loss": 0.001703787932638079, "train/perplexity": 10.025317558690421, "train/grad_norm": 0.2177734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027229.6333698565, "perf/iters_per_sec": 0.9666584174012454, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344915866851807, "data/tokens_consumed": 26677870592, "data/tokens_consumed_B": 26.677870592, "train/loss_slope": -1.4084483566421481e-05} {"step": 12730, "timestamp": 1778208292.874018, "train/loss": 2.2500698804855346, "train/z_loss": 0.0017257019644603133, "train/perplexity": 9.488398867111579, "train/grad_norm": 0.11572265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027518.6465796921, "perf/iters_per_sec": 0.9667962296389065, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343441247940064, "data/tokens_consumed": 26698842112, "data/tokens_consumed_B": 26.698842112, "train/loss_slope": -1.547354677579725e-05} {"step": 12740, "timestamp": 1778208303.235208, "train/loss": 2.3362804651260376, "train/z_loss": 0.0016882217722013592, "train/perplexity": 10.342694914239848, "train/grad_norm": 0.10986328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025874.683842291, "perf/iters_per_sec": 0.9660123271190123, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351834774017334, "data/tokens_consumed": 26719813632, "data/tokens_consumed_B": 26.719813632, "train/loss_slope": -1.4027756354202901e-05} {"step": 12750, "timestamp": 1778208313.5835543, "grad/layer_0/attn": 0.003704474773257971, "grad/layer_0/mlp": 0.0030532467644661665, "grad/layer_0/attn_mlp_ratio": 1.213290289878236, "grad/layer_4/attn": 0.002222945913672447, "grad/layer_4/mlp": 0.002733345841988921, "grad/layer_4/attn_mlp_ratio": 0.8132691436982612, "grad/layer_8/attn": 0.004647958092391491, "grad/layer_8/mlp": 0.004109164234250784, "grad/layer_8/attn_mlp_ratio": 1.1311200317908299, "grad/layer_12/attn": 0.005711224861443043, "grad/layer_12/mlp": 0.007304945960640907, "grad/layer_12/attn_mlp_ratio": 0.7818298470696665, "grad/layer_16/attn": 0.00506229605525732, "grad/layer_16/mlp": 0.005088158883154392, "grad/layer_16/attn_mlp_ratio": 0.9949170362045099, "grad/layer_20/attn": 0.005607536993920803, "grad/layer_20/mlp": 0.00674157589673996, "grad/layer_20/attn_mlp_ratio": 0.8317842885153937, "grad/layer_24/attn": 0.007637257222086191, "grad/layer_24/mlp": 0.009840564802289009, "grad/layer_24/attn_mlp_ratio": 0.776099471719321, "grad/layer_27/attn": 0.008104763925075531, "grad/layer_27/mlp": 0.007922369986772537, "grad/layer_27/attn_mlp_ratio": 1.0230226354367773} {"step": 12750, "timestamp": 1778208314.200261, "eos/sharpness": 17.298984527587887, "eos/L0_probe": 2.1184163093566895, "eos/L_plus": 2.2275469303131104, "eos/L_minus": 2.1822755336761475, "eos/grad_norm": 0.11098163574934006, "eos/embed_grad_frac": 0.22079260647296906, "eos/time_s": 0.6138138771057129} {"step": 12750, "timestamp": 1778208314.2199252, "train/loss": 2.2919004917144776, "train/z_loss": 0.0017151182400994003, "train/perplexity": 9.893722763209515, "train/grad_norm": 0.11083984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1910039.5691060578, "perf/iters_per_sec": 0.910777840188054, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0979625940322877, "data/tokens_consumed": 26740785152, "data/tokens_consumed_B": 26.740785152, "train/loss_slope": -1.4359884980273509e-05} {"step": 12750, "timestamp": 1778208315.5837948, "geo/rankme_last": 441.182861328125, "geo/layer_0/stable_rank_q_proj": 15.433120727539062, "geo/layer_0/stable_rank_k_proj": 13.514634132385254, "geo/layer_0/stable_rank_o_proj": 55.0984001159668, "geo/layer_0/stable_rank_gate_proj": 161.48782348632812, "geo/layer_0/stable_rank_down_proj": 48.2829704284668, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.03897768631577492, "geo/layer_0/attn_entropy_mean": 6.3328447341918945, "geo/layer_0/attn_entropy_std": 0.2836696207523346, "geo/layer_7/stable_rank_q_proj": 43.792572021484375, "geo/layer_7/stable_rank_k_proj": 43.55413055419922, "geo/layer_7/stable_rank_o_proj": 108.79739379882812, "geo/layer_7/stable_rank_gate_proj": 117.92069244384766, "geo/layer_7/stable_rank_down_proj": 159.84466552734375, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5821159482002258, "geo/layer_7/attn_entropy_mean": 4.650299072265625, "geo/layer_7/attn_entropy_std": 0.8751604557037354, "geo/layer_14/stable_rank_q_proj": 60.571842193603516, "geo/layer_14/stable_rank_k_proj": 38.79785919189453, "geo/layer_14/stable_rank_o_proj": 50.09298324584961, "geo/layer_14/stable_rank_gate_proj": 101.80225372314453, "geo/layer_14/stable_rank_down_proj": 138.6807403564453, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39700353145599365, "geo/layer_14/attn_entropy_mean": 5.614799499511719, "geo/layer_14/attn_entropy_std": 0.5644559264183044, "geo/layer_21/stable_rank_q_proj": 51.39390563964844, "geo/layer_21/stable_rank_k_proj": 32.49103546142578, "geo/layer_21/stable_rank_o_proj": 90.17787170410156, "geo/layer_21/stable_rank_gate_proj": 101.1600570678711, "geo/layer_21/stable_rank_down_proj": 67.5208969116211, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.16066426038742065, "geo/layer_21/attn_entropy_mean": 5.780732154846191, "geo/layer_21/attn_entropy_std": 0.279680460691452, "geo/layer_27/stable_rank_q_proj": 42.316650390625, "geo/layer_27/stable_rank_k_proj": 32.40553283691406, "geo/layer_27/stable_rank_o_proj": 116.29769134521484, "geo/layer_27/stable_rank_gate_proj": 94.43022918701172, "geo/layer_27/stable_rank_down_proj": 146.669189453125, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.06468752771615982, "geo/layer_27/attn_entropy_mean": 4.498752593994141, "geo/layer_27/attn_entropy_std": 0.5107212662696838, "attnres/final_alpha/block_0": 0.2466265857219696, "attnres/block_norm/0": 1.4864356517791748, "attnres/final_alpha/block_1": 0.008435871452093124, "attnres/block_norm/1": 21133.720703125, "attnres/final_alpha/block_2": 0.017733164131641388, "attnres/block_norm/2": 15621.3369140625, "attnres/final_alpha/block_3": 0.019231177866458893, "attnres/block_norm/3": 18582.607421875, "attnres/final_alpha/block_4": 0.026574378833174706, "attnres/block_norm/4": 6709.8837890625, "attnres/final_alpha/block_5": 0.5280894637107849, "attnres/block_norm/5": 4204.80078125, "attnres/final_alpha/block_6": 0.15330934524536133, "attnres/block_norm/6": 13104.2666015625, "geo/tier1_time_s": 1.359992504119873, "geo/step": 12750.0, "geo/rankme_slope": 0.00042116860416041416} {"step": 12760, "timestamp": 1778208325.9440274, "train/loss": 2.2953907251358032, "train/z_loss": 0.0017142716562375427, "train/perplexity": 9.928314496555323, "train/grad_norm": 0.197265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1789322.378862264, "perf/iters_per_sec": 0.8532153982459374, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1720369815826417, "data/tokens_consumed": 26761756672, "data/tokens_consumed_B": 26.761756672, "train/loss_slope": -1.3445758275931326e-05} {"step": 12770, "timestamp": 1778208336.309446, "train/loss": 2.298418641090393, "train/z_loss": 0.0017107567400671542, "train/perplexity": 9.958422157151627, "train/grad_norm": 0.11767578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024546.3340719305, "perf/iters_per_sec": 0.9653789205894139, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035862684249878, "data/tokens_consumed": 26782728192, "data/tokens_consumed_B": 26.782728192, "train/loss_slope": -1.013962223191464e-05} {"step": 12780, "timestamp": 1778208346.667886, "train/loss": 2.2537349462509155, "train/z_loss": 0.0017196337110362947, "train/perplexity": 9.523238278335432, "train/grad_norm": 0.2255859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025567.6209135482, "perf/iters_per_sec": 0.9658659081046811, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0353404045104981, "data/tokens_consumed": 26803699712, "data/tokens_consumed_B": 26.803699712, "train/loss_slope": -1.280348038408728e-05} {"step": 12790, "timestamp": 1778208357.0382817, "train/loss": 2.3121065855026246, "train/z_loss": 0.001703670376446098, "train/perplexity": 10.095669661293654, "train/grad_norm": 0.1728515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023425.3920518914, "perf/iters_per_sec": 0.964844413782068, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0364365339279176, "data/tokens_consumed": 26824671232, "data/tokens_consumed_B": 26.824671232, "train/loss_slope": -1.1922775958702414e-05} {"step": 12800, "timestamp": 1778208367.3785584, "grad/layer_0/attn": 0.0027294412720948458, "grad/layer_0/mlp": 0.002896231133490801, "grad/layer_0/attn_mlp_ratio": 0.9424113794964759, "grad/layer_4/attn": 0.0017342673381790519, "grad/layer_4/mlp": 0.002691845875233412, "grad/layer_4/attn_mlp_ratio": 0.6442669283961064, "grad/layer_8/attn": 0.010733513161540031, "grad/layer_8/mlp": 0.003991009201854467, "grad/layer_8/attn_mlp_ratio": 2.689423238515777, "grad/layer_12/attn": 0.00460848119109869, "grad/layer_12/mlp": 0.005368884652853012, "grad/layer_12/attn_mlp_ratio": 0.8583684327829453, "grad/layer_16/attn": 0.004025437869131565, "grad/layer_16/mlp": 0.004352088086307049, "grad/layer_16/attn_mlp_ratio": 0.9249440031561811, "grad/layer_20/attn": 0.007781385909765959, "grad/layer_20/mlp": 0.005538450554013252, "grad/layer_20/attn_mlp_ratio": 1.4049752170541485, "grad/layer_24/attn": 0.0069367969408631325, "grad/layer_24/mlp": 0.009492002427577972, "grad/layer_24/attn_mlp_ratio": 0.7308043714389066, "grad/layer_27/attn": 0.0070891487412154675, "grad/layer_27/mlp": 0.007684335112571716, "grad/layer_27/attn_mlp_ratio": 0.922545483130081} {"step": 12800, "timestamp": 1778208367.394617, "train/loss": 2.2916406989097595, "train/z_loss": 0.0017120234086178243, "train/perplexity": 9.891152779069905, "train/grad_norm": 0.12890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026114.306099468, "perf/iters_per_sec": 0.9661265879151668, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350610494613648, "data/tokens_consumed": 26845642752, "data/tokens_consumed_B": 26.845642752, "train/loss_slope": -1.2312789475492165e-05} {"step": 12810, "timestamp": 1778208378.1555383, "train/loss": 2.341935634613037, "train/z_loss": 0.0016932424739934504, "train/perplexity": 10.401350303695418, "train/grad_norm": 0.154296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1949918.3579431444, "perf/iters_per_sec": 0.9297935285297129, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0755075931549072, "data/tokens_consumed": 26866614272, "data/tokens_consumed_B": 26.866614272, "train/loss_slope": -1.088868379950557e-05} {"step": 12820, "timestamp": 1778208389.0070364, "train/loss": 2.281265377998352, "train/z_loss": 0.0017113757668994368, "train/perplexity": 9.789059436238306, "train/grad_norm": 0.13671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1933804.9095982148, "perf/iters_per_sec": 0.9221100376120638, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0844692707061767, "data/tokens_consumed": 26887585792, "data/tokens_consumed_B": 26.887585792, "train/loss_slope": -1.1320271987010837e-05} {"step": 12825, "timestamp": 1778208394.8000872, "eos/sharpness": 38.88373374938964, "eos/L0_probe": 2.1138412952423096, "eos/L_plus": 2.289625883102417, "eos/L_minus": 2.3268940448760986, "eos/grad_norm": 0.17430149018764496, "eos/embed_grad_frac": 0.09054649621248245, "eos/time_s": 0.6231122016906738} {"step": 12825, "timestamp": 1778208396.1799376, "geo/rankme_last": 440.6916809082031, "geo/layer_0/stable_rank_q_proj": 15.434425354003906, "geo/layer_0/stable_rank_k_proj": 13.554790496826172, "geo/layer_0/stable_rank_o_proj": 55.051597595214844, "geo/layer_0/stable_rank_gate_proj": 161.5263214111328, "geo/layer_0/stable_rank_down_proj": 48.24413299560547, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.03851288557052612, "geo/layer_0/attn_entropy_mean": 6.3261237144470215, "geo/layer_0/attn_entropy_std": 0.28958550095558167, "geo/layer_7/stable_rank_q_proj": 43.766151428222656, "geo/layer_7/stable_rank_k_proj": 43.71734619140625, "geo/layer_7/stable_rank_o_proj": 109.33483123779297, "geo/layer_7/stable_rank_gate_proj": 117.61874389648438, "geo/layer_7/stable_rank_down_proj": 159.64205932617188, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5747249722480774, "geo/layer_7/attn_entropy_mean": 4.6101789474487305, "geo/layer_7/attn_entropy_std": 0.8717107772827148, "geo/layer_14/stable_rank_q_proj": 60.382904052734375, "geo/layer_14/stable_rank_k_proj": 38.615909576416016, "geo/layer_14/stable_rank_o_proj": 49.990570068359375, "geo/layer_14/stable_rank_gate_proj": 101.9451904296875, "geo/layer_14/stable_rank_down_proj": 139.1142578125, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39998653531074524, "geo/layer_14/attn_entropy_mean": 5.607961654663086, "geo/layer_14/attn_entropy_std": 0.5566954016685486, "geo/layer_21/stable_rank_q_proj": 51.237796783447266, "geo/layer_21/stable_rank_k_proj": 32.56525421142578, "geo/layer_21/stable_rank_o_proj": 89.9208755493164, "geo/layer_21/stable_rank_gate_proj": 100.59223175048828, "geo/layer_21/stable_rank_down_proj": 67.41609191894531, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1576201319694519, "geo/layer_21/attn_entropy_mean": 5.781706809997559, "geo/layer_21/attn_entropy_std": 0.3028064966201782, "geo/layer_27/stable_rank_q_proj": 42.25052261352539, "geo/layer_27/stable_rank_k_proj": 32.2496452331543, "geo/layer_27/stable_rank_o_proj": 116.30316925048828, "geo/layer_27/stable_rank_gate_proj": 94.28679656982422, "geo/layer_27/stable_rank_down_proj": 146.483642578125, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.06848953664302826, "geo/layer_27/attn_entropy_mean": 4.466487884521484, "geo/layer_27/attn_entropy_std": 0.5092528462409973, "attnres/final_alpha/block_0": 0.24703191220760345, "attnres/block_norm/0": 1.488290548324585, "attnres/final_alpha/block_1": 0.008442609570920467, "attnres/block_norm/1": 21239.27734375, "attnres/final_alpha/block_2": 0.01738385483622551, "attnres/block_norm/2": 15811.509765625, "attnres/final_alpha/block_3": 0.018939681351184845, "attnres/block_norm/3": 18698.474609375, "attnres/final_alpha/block_4": 0.02645535208284855, "attnres/block_norm/4": 6742.76953125, "attnres/final_alpha/block_5": 0.5256600975990295, "attnres/block_norm/5": 4278.88427734375, "attnres/final_alpha/block_6": 0.15608645975589752, "attnres/block_norm/6": 13138.478515625, "geo/tier1_time_s": 1.3601956367492676, "geo/step": 12825.0, "geo/rankme_slope": 0.00040494182047819127} {"step": 12830, "timestamp": 1778208401.3612552, "train/loss": 2.2857980966567992, "train/z_loss": 0.0017149137449450791, "train/perplexity": 9.833531201452235, "train/grad_norm": 0.140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1698333.6160399464, "perf/iters_per_sec": 0.8098285751533253, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2348292350769043, "data/tokens_consumed": 26908557312, "data/tokens_consumed_B": 26.908557312, "train/loss_slope": -1.3898378923566632e-05} {"step": 12840, "timestamp": 1778208411.7253134, "train/loss": 2.2784154653549193, "train/z_loss": 0.0017162410076707602, "train/perplexity": 9.761201187626522, "train/grad_norm": 0.1337890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024602.43940562, "perf/iters_per_sec": 0.9654056736972905, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035833978652954, "data/tokens_consumed": 26929528832, "data/tokens_consumed_B": 26.929528832, "train/loss_slope": -1.4931117747948029e-05} {"step": 12850, "timestamp": 1778208422.0727284, "grad/layer_0/attn": 0.002865677932277322, "grad/layer_0/mlp": 0.0030179088935256004, "grad/layer_0/attn_mlp_ratio": 0.9495574380888012, "grad/layer_4/attn": 0.0017489725723862648, "grad/layer_4/mlp": 0.002540821908041835, "grad/layer_4/attn_mlp_ratio": 0.6883491117640964, "grad/layer_8/attn": 0.006680873688310385, "grad/layer_8/mlp": 0.003955847583711147, "grad/layer_8/attn_mlp_ratio": 1.6888601944458024, "grad/layer_12/attn": 0.005218569654971361, "grad/layer_12/mlp": 0.006505357101559639, "grad/layer_12/attn_mlp_ratio": 0.8021957124383924, "grad/layer_16/attn": 0.005651312880218029, "grad/layer_16/mlp": 0.0054809884168207645, "grad/layer_16/attn_mlp_ratio": 1.0310754826204345, "grad/layer_20/attn": 0.007491896394640207, "grad/layer_20/mlp": 0.0077648889273405075, "grad/layer_20/attn_mlp_ratio": 0.964842687160231, "grad/layer_24/attn": 0.020807547494769096, "grad/layer_24/mlp": 0.015898631885647774, "grad/layer_24/attn_mlp_ratio": 1.3087633900547395, "grad/layer_27/attn": 0.007215249817818403, "grad/layer_27/mlp": 0.016171829774975777, "grad/layer_27/attn_mlp_ratio": 0.4461616201505589} {"step": 12850, "timestamp": 1778208422.0885367, "train/loss": 2.2805662751197815, "train/z_loss": 0.001707576646003872, "train/perplexity": 9.782218268226691, "train/grad_norm": 0.255859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024685.9973815167, "perf/iters_per_sec": 0.9654455172450622, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0357912302017211, "data/tokens_consumed": 26950500352, "data/tokens_consumed_B": 26.950500352, "train/loss_slope": -1.7031941183544007e-05} {"step": 12860, "timestamp": 1778208432.444778, "train/loss": 2.304620933532715, "train/z_loss": 0.0016938388347625733, "train/perplexity": 10.020379142690414, "train/grad_norm": 0.1767578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026238.549335427, "perf/iters_per_sec": 0.9661858317067274, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034997582435608, "data/tokens_consumed": 26971471872, "data/tokens_consumed_B": 26.971471872, "train/loss_slope": -1.3989638752884359e-05} {"step": 12870, "timestamp": 1778208443.3567896, "train/loss": 2.3178942918777468, "train/z_loss": 0.0017032216070219874, "train/perplexity": 10.154269849715392, "train/grad_norm": 0.2138671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1922780.1266485762, "perf/iters_per_sec": 0.9168530114405519, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.090687370300293, "data/tokens_consumed": 26992443392, "data/tokens_consumed_B": 26.992443392, "train/loss_slope": -1.3428327066562812e-05} {"step": 12880, "timestamp": 1778208454.278868, "train/loss": 2.239535331726074, "train/z_loss": 0.0017250054865144194, "train/perplexity": 9.38896751833808, "train/grad_norm": 0.189453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1921037.7998208492, "perf/iters_per_sec": 0.9160222052673574, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0916765928268433, "data/tokens_consumed": 27013414912, "data/tokens_consumed_B": 27.013414912, "train/loss_slope": -1.7838517216780544e-05} {"step": 12890, "timestamp": 1778208464.6337607, "train/loss": 2.2752296686172486, "train/z_loss": 0.0017097356379963458, "train/perplexity": 9.730153466850398, "train/grad_norm": 0.16015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026276.3107233758, "perf/iters_per_sec": 0.9662038377396468, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349782943725585, "data/tokens_consumed": 27034386432, "data/tokens_consumed_B": 27.034386432, "train/loss_slope": -1.668269343108623e-05} {"step": 12900, "timestamp": 1778208474.9834993, "grad/layer_0/attn": 0.002831608522683382, "grad/layer_0/mlp": 0.002676710719242692, "grad/layer_0/attn_mlp_ratio": 1.0578686731219304, "grad/layer_4/attn": 0.001739355269819498, "grad/layer_4/mlp": 0.002591318916529417, "grad/layer_4/attn_mlp_ratio": 0.6712239051712885, "grad/layer_8/attn": 0.00402095727622509, "grad/layer_8/mlp": 0.0038386210799217224, "grad/layer_8/attn_mlp_ratio": 1.0475004142782034, "grad/layer_12/attn": 0.005768026690930128, "grad/layer_12/mlp": 0.005486676003783941, "grad/layer_12/attn_mlp_ratio": 1.0512788766503185, "grad/layer_16/attn": 0.005346281919628382, "grad/layer_16/mlp": 0.004277824889868498, "grad/layer_16/attn_mlp_ratio": 1.2497663958414365, "grad/layer_20/attn": 0.00981102790683508, "grad/layer_20/mlp": 0.005811269860714674, "grad/layer_20/attn_mlp_ratio": 1.6882760520780407, "grad/layer_24/attn": 0.018206525593996048, "grad/layer_24/mlp": 0.012757859192788601, "grad/layer_24/attn_mlp_ratio": 1.4270831160747568, "grad/layer_27/attn": 0.005459131207317114, "grad/layer_27/mlp": 0.011878791265189648, "grad/layer_27/attn_mlp_ratio": 0.459569584100525} {"step": 12900, "timestamp": 1778208475.6001368, "eos/sharpness": 37.961339950561516, "eos/L0_probe": 2.108933687210083, "eos/L_plus": 2.289151668548584, "eos/L_minus": 2.3083291053771973, "eos/grad_norm": 0.19296909868717194, "eos/embed_grad_frac": 0.07310029864311218, "eos/time_s": 0.6136879920959473} {"step": 12900, "timestamp": 1778208475.6194468, "train/loss": 2.2408061265945434, "train/z_loss": 0.00172080131014809, "train/perplexity": 9.400906554506841, "train/grad_norm": 0.193359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1909925.973608559, "perf/iters_per_sec": 0.910723673633842, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0980278968811035, "data/tokens_consumed": 27055357952, "data/tokens_consumed_B": 27.055357952, "train/loss_slope": -1.99652506859496e-05} {"step": 12900, "timestamp": 1778208476.9810243, "geo/rankme_last": 440.37176513671875, "geo/layer_0/stable_rank_q_proj": 15.43701171875, "geo/layer_0/stable_rank_k_proj": 13.52484130859375, "geo/layer_0/stable_rank_o_proj": 55.07160949707031, "geo/layer_0/stable_rank_gate_proj": 161.56105041503906, "geo/layer_0/stable_rank_down_proj": 48.291465759277344, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0384509414434433, "geo/layer_0/attn_entropy_mean": 6.32387638092041, "geo/layer_0/attn_entropy_std": 0.2871115207672119, "geo/layer_7/stable_rank_q_proj": 43.72599411010742, "geo/layer_7/stable_rank_k_proj": 43.49961853027344, "geo/layer_7/stable_rank_o_proj": 109.3875961303711, "geo/layer_7/stable_rank_gate_proj": 117.42259216308594, "geo/layer_7/stable_rank_down_proj": 159.49989318847656, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5776074528694153, "geo/layer_7/attn_entropy_mean": 4.6596269607543945, "geo/layer_7/attn_entropy_std": 0.8563783764839172, "geo/layer_14/stable_rank_q_proj": 60.26869583129883, "geo/layer_14/stable_rank_k_proj": 38.30582046508789, "geo/layer_14/stable_rank_o_proj": 50.09629440307617, "geo/layer_14/stable_rank_gate_proj": 101.91893005371094, "geo/layer_14/stable_rank_down_proj": 138.89004516601562, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3951595425605774, "geo/layer_14/attn_entropy_mean": 5.602184295654297, "geo/layer_14/attn_entropy_std": 0.5615103840827942, "geo/layer_21/stable_rank_q_proj": 51.30314636230469, "geo/layer_21/stable_rank_k_proj": 32.44843292236328, "geo/layer_21/stable_rank_o_proj": 89.86795043945312, "geo/layer_21/stable_rank_gate_proj": 100.41312408447266, "geo/layer_21/stable_rank_down_proj": 67.2960433959961, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.16022826731204987, "geo/layer_21/attn_entropy_mean": 5.7740068435668945, "geo/layer_21/attn_entropy_std": 0.30090388655662537, "geo/layer_27/stable_rank_q_proj": 42.2768669128418, "geo/layer_27/stable_rank_k_proj": 32.32838439941406, "geo/layer_27/stable_rank_o_proj": 115.92000579833984, "geo/layer_27/stable_rank_gate_proj": 94.41813659667969, "geo/layer_27/stable_rank_down_proj": 146.46022033691406, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07772260904312134, "geo/layer_27/attn_entropy_mean": 4.431097507476807, "geo/layer_27/attn_entropy_std": 0.5268621444702148, "attnres/final_alpha/block_0": 0.24732953310012817, "attnres/block_norm/0": 1.4902539253234863, "attnres/final_alpha/block_1": 0.008447547443211079, "attnres/block_norm/1": 21305.0546875, "attnres/final_alpha/block_2": 0.017728902399539948, "attnres/block_norm/2": 15758.080078125, "attnres/final_alpha/block_3": 0.01928602159023285, "attnres/block_norm/3": 18896.50390625, "attnres/final_alpha/block_4": 0.02668655663728714, "attnres/block_norm/4": 6793.759765625, "attnres/final_alpha/block_5": 0.5262374877929688, "attnres/block_norm/5": 4281.841796875, "attnres/final_alpha/block_6": 0.15428400039672852, "attnres/block_norm/6": 13272.443359375, "geo/tier1_time_s": 1.356982946395874, "geo/step": 12900.0, "geo/rankme_slope": 0.00035477292479491794} {"step": 12910, "timestamp": 1778208487.402161, "train/loss": 2.32476966381073, "train/z_loss": 0.0016862764721736311, "train/perplexity": 10.224324782537353, "train/grad_norm": 0.12060546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1780462.5134035265, "perf/iters_per_sec": 0.8489906851785309, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1778692245483398, "data/tokens_consumed": 27076329472, "data/tokens_consumed_B": 27.076329472, "train/loss_slope": -1.5958963411905125e-05} {"step": 12920, "timestamp": 1778208497.772487, "train/loss": 2.2728315353393556, "train/z_loss": 0.001704168354626745, "train/perplexity": 9.706847218936492, "train/grad_norm": 0.1552734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025612.1677226904, "perf/iters_per_sec": 0.9658871496785595, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035317635536194, "data/tokens_consumed": 27097300992, "data/tokens_consumed_B": 27.097300992, "train/loss_slope": -1.6016211225004288e-05} {"step": 12930, "timestamp": 1778208508.1285753, "train/loss": 2.3120558500289916, "train/z_loss": 0.0017018694547004999, "train/perplexity": 10.095157465705098, "train/grad_norm": 0.099609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026284.0125389625, "perf/iters_per_sec": 0.9662075102515042, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349743604660033, "data/tokens_consumed": 27118272512, "data/tokens_consumed_B": 27.118272512, "train/loss_slope": -1.5449304878264593e-05} {"step": 12940, "timestamp": 1778208518.4877021, "train/loss": 2.3113802433013917, "train/z_loss": 0.0017004356370307506, "train/perplexity": 10.088339412825624, "train/grad_norm": 0.10791015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026099.8384937255, "perf/iters_per_sec": 0.9661196892231586, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035068440437317, "data/tokens_consumed": 27139244032, "data/tokens_consumed_B": 27.139244032, "train/loss_slope": -1.2909624092292017e-05} {"step": 12950, "timestamp": 1778208528.8420658, "grad/layer_0/attn": 0.0025191837921738625, "grad/layer_0/mlp": 0.0025174920447170734, "grad/layer_0/attn_mlp_ratio": 1.0006719573923355, "grad/layer_4/attn": 0.0015683737583458424, "grad/layer_4/mlp": 0.00264241942204535, "grad/layer_4/attn_mlp_ratio": 0.5935370009421711, "grad/layer_8/attn": 0.007485478185117245, "grad/layer_8/mlp": 0.004055913537740707, "grad/layer_8/attn_mlp_ratio": 1.8455713936963736, "grad/layer_12/attn": 0.0036950032226741314, "grad/layer_12/mlp": 0.005769874434918165, "grad/layer_12/attn_mlp_ratio": 0.6403957660279589, "grad/layer_16/attn": 0.006343379151076078, "grad/layer_16/mlp": 0.004309454467147589, "grad/layer_16/attn_mlp_ratio": 1.4719679839378692, "grad/layer_20/attn": 0.00954477209597826, "grad/layer_20/mlp": 0.00554695026949048, "grad/layer_20/attn_mlp_ratio": 1.7207242647197154, "grad/layer_24/attn": 0.013236070051789284, "grad/layer_24/mlp": 0.01417575217783451, "grad/layer_24/attn_mlp_ratio": 0.9337120028885851, "grad/layer_27/attn": 0.008467000909149647, "grad/layer_27/mlp": 0.012559610418975353, "grad/layer_27/attn_mlp_ratio": 0.6741451812026737} {"step": 12950, "timestamp": 1778208528.8581839, "train/loss": 2.2884533166885377, "train/z_loss": 0.0017104045837186277, "train/perplexity": 9.859676085330216, "train/grad_norm": 0.1591796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023303.3086382223, "perf/iters_per_sec": 0.9647861998740302, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0364990711212159, "data/tokens_consumed": 27160215552, "data/tokens_consumed_B": 27.160215552, "train/loss_slope": -8.26738540953854e-06} {"step": 12960, "timestamp": 1778208539.2204175, "train/loss": 2.232314610481262, "train/z_loss": 0.0017231983481906354, "train/perplexity": 9.321416577868954, "train/grad_norm": 0.1650390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025102.4920349345, "perf/iters_per_sec": 0.9656441173720047, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035578203201294, "data/tokens_consumed": 27181187072, "data/tokens_consumed_B": 27.181187072, "train/loss_slope": -1.2081887333592659e-05} {"step": 12970, "timestamp": 1778208549.5785308, "train/loss": 2.3143709659576417, "train/z_loss": 0.0016916057444177567, "train/perplexity": 10.118556000265338, "train/grad_norm": 0.1650390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026273.0433041484, "perf/iters_per_sec": 0.9662022797127477, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349799633026122, "data/tokens_consumed": 27202158592, "data/tokens_consumed_B": 27.202158592, "train/loss_slope": -9.28391104090723e-06} {"step": 12975, "timestamp": 1778208555.3698125, "eos/sharpness": 50.28986930847167, "eos/L0_probe": 2.1082632541656494, "eos/L_plus": 2.322122573852539, "eos/L_minus": 2.3973026275634766, "eos/grad_norm": 0.26094505190849304, "eos/embed_grad_frac": 0.04601169377565384, "eos/time_s": 0.621596097946167} {"step": 12975, "timestamp": 1778208556.7477412, "geo/rankme_last": 441.04486083984375, "geo/layer_0/stable_rank_q_proj": 15.498455047607422, "geo/layer_0/stable_rank_k_proj": 13.565790176391602, "geo/layer_0/stable_rank_o_proj": 54.918758392333984, "geo/layer_0/stable_rank_gate_proj": 161.21096801757812, "geo/layer_0/stable_rank_down_proj": 48.324928283691406, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.036041948944330215, "geo/layer_0/attn_entropy_mean": 6.3317365646362305, "geo/layer_0/attn_entropy_std": 0.28207385540008545, "geo/layer_7/stable_rank_q_proj": 43.815303802490234, "geo/layer_7/stable_rank_k_proj": 43.387142181396484, "geo/layer_7/stable_rank_o_proj": 109.5739517211914, "geo/layer_7/stable_rank_gate_proj": 117.24723815917969, "geo/layer_7/stable_rank_down_proj": 159.55938720703125, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5780479311943054, "geo/layer_7/attn_entropy_mean": 4.691079616546631, "geo/layer_7/attn_entropy_std": 0.9004307985305786, "geo/layer_14/stable_rank_q_proj": 60.18523406982422, "geo/layer_14/stable_rank_k_proj": 38.199432373046875, "geo/layer_14/stable_rank_o_proj": 50.0020866394043, "geo/layer_14/stable_rank_gate_proj": 101.59912872314453, "geo/layer_14/stable_rank_down_proj": 139.03965759277344, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39494210481643677, "geo/layer_14/attn_entropy_mean": 5.603597164154053, "geo/layer_14/attn_entropy_std": 0.5543129444122314, "geo/layer_21/stable_rank_q_proj": 51.25090789794922, "geo/layer_21/stable_rank_k_proj": 32.41656494140625, "geo/layer_21/stable_rank_o_proj": 89.61553192138672, "geo/layer_21/stable_rank_gate_proj": 100.30207061767578, "geo/layer_21/stable_rank_down_proj": 67.13253021240234, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1532900184392929, "geo/layer_21/attn_entropy_mean": 5.798708915710449, "geo/layer_21/attn_entropy_std": 0.290285587310791, "geo/layer_27/stable_rank_q_proj": 42.31911087036133, "geo/layer_27/stable_rank_k_proj": 32.407169342041016, "geo/layer_27/stable_rank_o_proj": 116.71540069580078, "geo/layer_27/stable_rank_gate_proj": 94.32904052734375, "geo/layer_27/stable_rank_down_proj": 146.24362182617188, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0660228431224823, "geo/layer_27/attn_entropy_mean": 4.4747467041015625, "geo/layer_27/attn_entropy_std": 0.4923490881919861, "attnres/final_alpha/block_0": 0.24816571176052094, "attnres/block_norm/0": 1.4920274019241333, "attnres/final_alpha/block_1": 0.0086154043674469, "attnres/block_norm/1": 21356.013671875, "attnres/final_alpha/block_2": 0.017496202141046524, "attnres/block_norm/2": 15844.234375, "attnres/final_alpha/block_3": 0.019373316317796707, "attnres/block_norm/3": 18994.806640625, "attnres/final_alpha/block_4": 0.02697616070508957, "attnres/block_norm/4": 6815.3466796875, "attnres/final_alpha/block_5": 0.524031400680542, "attnres/block_norm/5": 4295.78515625, "attnres/final_alpha/block_6": 0.15534177422523499, "attnres/block_norm/6": 13308.9912109375, "geo/tier1_time_s": 1.3581304550170898, "geo/step": 12975.0, "geo/rankme_slope": 0.00034591748808898557} {"step": 12980, "timestamp": 1778208561.9288888, "train/loss": 2.2619059324264525, "train/z_loss": 0.0017011703224852682, "train/perplexity": 9.6013713038776, "train/grad_norm": 0.146484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1699028.2847374403, "perf/iters_per_sec": 0.8101598190009309, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2343243598937987, "data/tokens_consumed": 27223130112, "data/tokens_consumed_B": 27.223130112, "train/loss_slope": -1.3298943534184942e-05} {"step": 12990, "timestamp": 1778208572.285451, "train/loss": 2.2719204425811768, "train/z_loss": 0.0017074577161110937, "train/perplexity": 9.698007408285791, "train/grad_norm": 0.2119140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025913.7381704622, "perf/iters_per_sec": 0.966030949673873, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351635217666626, "data/tokens_consumed": 27244101632, "data/tokens_consumed_B": 27.244101632, "train/loss_slope": -1.3376484738432006e-05} {"step": 13000, "timestamp": 1778208582.6362653, "grad/layer_0/attn": 0.003128842916339636, "grad/layer_0/mlp": 0.0027478497941046953, "grad/layer_0/attn_mlp_ratio": 1.1386513226404151, "grad/layer_4/attn": 0.0020519469399005175, "grad/layer_4/mlp": 0.002593190409243107, "grad/layer_4/attn_mlp_ratio": 0.7912827586660549, "grad/layer_8/attn": 0.008786123245954514, "grad/layer_8/mlp": 0.0038163703866302967, "grad/layer_8/attn_mlp_ratio": 2.3022196814314824, "grad/layer_12/attn": 0.004727592691779137, "grad/layer_12/mlp": 0.005949040874838829, "grad/layer_12/attn_mlp_ratio": 0.7946814808931816, "grad/layer_16/attn": 0.00482306070625782, "grad/layer_16/mlp": 0.004986281972378492, "grad/layer_16/attn_mlp_ratio": 0.9672659180224006, "grad/layer_20/attn": 0.00572764128446579, "grad/layer_20/mlp": 0.00652043242007494, "grad/layer_20/attn_mlp_ratio": 0.8784143178894461, "grad/layer_24/attn": 0.018971014767885208, "grad/layer_24/mlp": 0.012226266786456108, "grad/layer_24/attn_mlp_ratio": 1.5516604491024755, "grad/layer_27/attn": 0.004816242028027773, "grad/layer_27/mlp": 0.009862291626632214, "grad/layer_27/attn_mlp_ratio": 0.4883491749713663} {"step": 13000, "timestamp": 1778208582.6522653, "train/loss": 2.298982834815979, "train/z_loss": 0.0017033024108968676, "train/perplexity": 9.964042221702929, "train/grad_norm": 0.19140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023974.2279217085, "perf/iters_per_sec": 0.9651061191185515, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0361554861068725, "data/tokens_consumed": 27265073152, "data/tokens_consumed_B": 27.265073152, "train/loss_slope": -1.3098694689453361e-05} {"step": 13000, "timestamp": 1778208589.670736, "geo/ww_alpha_mean": 8.430508249267827, "geo/ww_alpha_std": 5.610475693529495, "geo/ww_alpha_min": 1.3716474472285198, "geo/ww_alpha_max": 42.31219866164196, "geo/ww_alpha_healthy_frac": 0.16243654822335024, "geo/ww_alpha_by_type/q_proj": 4.232889488062497, "geo/ww_alpha_by_type/k_proj": 4.634877616380476, "geo/ww_alpha_by_type/v_proj": 9.66750844728004, "geo/ww_alpha_by_type/o_proj": 8.619681602352605, "geo/ww_alpha_by_type/gate_proj": 9.773218961747977, "geo/ww_alpha_by_type/up_proj": 13.123227050576208, "geo/ww_alpha_by_type/down_proj": 9.116283777806611, "geo/twonn_id/layer_0": 0.7398863434791565, "geo/twonn_id/layer_7": 3.171959400177002, "geo/twonn_id/layer_14": 3.8192527294158936, "geo/twonn_id/layer_21": 6.842705726623535, "geo/twonn_id/layer_27": 5.910987854003906, "geo/tier2_time_s": 7.011160373687744} {"step": 13000, "timestamp": 1778208590.285588, "eoc/jacobian_sigma/layer_0/attn": 700.941650390625, "eoc/jacobian_sigma/layer_0/mlp": 3879.398193359375, "eoc/jacobian_sigma/layer_0": 3879.398193359375, "eoc/jacobian_sigma/layer_7/attn": 1.1532386541366577, "eoc/jacobian_sigma/layer_7/mlp": 1.6543771028518677, "eoc/jacobian_sigma/layer_7": 1.6543771028518677, "eoc/jacobian_sigma/layer_14/attn": 1.3259131908416748, "eoc/jacobian_sigma/layer_14/mlp": 8.723443031311035, "eoc/jacobian_sigma/layer_14": 8.723443031311035, "eoc/jacobian_sigma/layer_21/attn": 1.085693120956421, "eoc/jacobian_sigma/layer_21/mlp": 3.6395998001098633, "eoc/jacobian_sigma/layer_21": 3.6395998001098633, "eoc/jacobian_sigma/layer_27/attn": 2.4631259441375732, "eoc/jacobian_sigma/layer_27/mlp": 24.938413619995117, "eoc/jacobian_sigma/layer_27": 24.938413619995117, "eoc/layer0_sigma": 3879.398193359375, "eoc/sigma_max": 24.938413619995117, "eoc/sigma_min": 1.6543771028518677, "eoc/sigma_mean": 9.73895838856697, "eoc/time_s": 0.6090426445007324} {"step": 13010, "timestamp": 1778208600.6676226, "train/loss": 2.2636232137680055, "train/z_loss": 0.001714262249879539, "train/perplexity": 9.617873725265696, "train/grad_norm": 0.12255859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1164476.6444024865, "perf/iters_per_sec": 0.5552657339107926, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.8009395122528076, "data/tokens_consumed": 27286044672, "data/tokens_consumed_B": 27.286044672, "train/loss_slope": -1.5653325557851772e-05} {"step": 13020, "timestamp": 1778208611.0287282, "train/loss": 2.294236254692078, "train/z_loss": 0.0017012707190588116, "train/perplexity": 9.916859164606427, "train/grad_norm": 0.158203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025620.3776102413, "perf/iters_per_sec": 0.9658910644580084, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0353134393692016, "data/tokens_consumed": 27307016192, "data/tokens_consumed_B": 27.307016192, "train/loss_slope": -1.6546652376419247e-05} {"step": 13030, "timestamp": 1778208621.385239, "train/loss": 2.3181833267211913, "train/z_loss": 0.0016906945500522852, "train/perplexity": 10.15720521170221, "train/grad_norm": 0.2001953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026421.3947584708, "perf/iters_per_sec": 0.9662730191986422, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349041938781738, "data/tokens_consumed": 27327987712, "data/tokens_consumed_B": 27.327987712, "train/loss_slope": -1.675218872003927e-05} {"step": 13040, "timestamp": 1778208631.746944, "train/loss": 2.326510262489319, "train/z_loss": 0.0016926359967328608, "train/perplexity": 10.242136725968859, "train/grad_norm": 0.134765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024979.1805767599, "perf/iters_per_sec": 0.9655853178867149, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0356412649154663, "data/tokens_consumed": 27348959232, "data/tokens_consumed_B": 27.348959232, "train/loss_slope": -1.1400909091916233e-05} {"step": 13050, "timestamp": 1778208642.102064, "grad/layer_0/attn": 0.0032544101122766733, "grad/layer_0/mlp": 0.002924023661762476, "grad/layer_0/attn_mlp_ratio": 1.112990309735053, "grad/layer_4/attn": 0.0017368614207953215, "grad/layer_4/mlp": 0.002673650160431862, "grad/layer_4/attn_mlp_ratio": 0.649621772338606, "grad/layer_8/attn": 0.005654795095324516, "grad/layer_8/mlp": 0.004021755885332823, "grad/layer_8/attn_mlp_ratio": 1.406051266150239, "grad/layer_12/attn": 0.004339948762208223, "grad/layer_12/mlp": 0.006669501308351755, "grad/layer_12/attn_mlp_ratio": 0.6507156227261015, "grad/layer_16/attn": 0.005540082231163979, "grad/layer_16/mlp": 0.004855297971516848, "grad/layer_16/attn_mlp_ratio": 1.1410385417250393, "grad/layer_20/attn": 0.005650260951370001, "grad/layer_20/mlp": 0.007040813565254211, "grad/layer_20/attn_mlp_ratio": 0.8025011341023746, "grad/layer_24/attn": 0.018066735938191414, "grad/layer_24/mlp": 0.011296681128442287, "grad/layer_24/attn_mlp_ratio": 1.5992958969846631, "grad/layer_27/attn": 0.005883314181119204, "grad/layer_27/mlp": 0.009530202485620975, "grad/layer_27/attn_mlp_ratio": 0.6173335905782169} {"step": 13050, "timestamp": 1778208642.7332585, "eos/sharpness": 51.82557106018066, "eos/L0_probe": 2.1135566234588623, "eos/L_plus": 2.4838550090789795, "eos/L_minus": 2.2615139484405518, "eos/grad_norm": 0.20585864782333374, "eos/embed_grad_frac": 0.06604266911745071, "eos/time_s": 0.6283459663391113} {"step": 13050, "timestamp": 1778208642.7536783, "train/loss": 2.2927010297775268, "train/z_loss": 0.001701782876625657, "train/perplexity": 9.901646235964332, "train/grad_norm": 0.2060546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1906801.9480667734, "perf/iters_per_sec": 0.909234022172343, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0998268604278565, "data/tokens_consumed": 27369930752, "data/tokens_consumed_B": 27.369930752, "train/loss_slope": -1.5292837424497116e-05} {"step": 13050, "timestamp": 1778208644.1167383, "geo/rankme_last": 440.36358642578125, "geo/layer_0/stable_rank_q_proj": 15.507851600646973, "geo/layer_0/stable_rank_k_proj": 13.574134826660156, "geo/layer_0/stable_rank_o_proj": 54.94588851928711, "geo/layer_0/stable_rank_gate_proj": 161.00677490234375, "geo/layer_0/stable_rank_down_proj": 48.499305725097656, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.04083782806992531, "geo/layer_0/attn_entropy_mean": 6.3275885581970215, "geo/layer_0/attn_entropy_std": 0.28803837299346924, "geo/layer_7/stable_rank_q_proj": 43.777469635009766, "geo/layer_7/stable_rank_k_proj": 43.54507827758789, "geo/layer_7/stable_rank_o_proj": 109.95425415039062, "geo/layer_7/stable_rank_gate_proj": 116.82034301757812, "geo/layer_7/stable_rank_down_proj": 159.44883728027344, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5908936262130737, "geo/layer_7/attn_entropy_mean": 4.6684770584106445, "geo/layer_7/attn_entropy_std": 0.878804087638855, "geo/layer_14/stable_rank_q_proj": 60.09449768066406, "geo/layer_14/stable_rank_k_proj": 37.936126708984375, "geo/layer_14/stable_rank_o_proj": 50.20030975341797, "geo/layer_14/stable_rank_gate_proj": 101.20869445800781, "geo/layer_14/stable_rank_down_proj": 138.7769775390625, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3856240212917328, "geo/layer_14/attn_entropy_mean": 5.611786842346191, "geo/layer_14/attn_entropy_std": 0.5729880928993225, "geo/layer_21/stable_rank_q_proj": 51.38770294189453, "geo/layer_21/stable_rank_k_proj": 32.25471115112305, "geo/layer_21/stable_rank_o_proj": 89.50782012939453, "geo/layer_21/stable_rank_gate_proj": 99.97950744628906, "geo/layer_21/stable_rank_down_proj": 66.9504165649414, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1607898324728012, "geo/layer_21/attn_entropy_mean": 5.798346519470215, "geo/layer_21/attn_entropy_std": 0.2957422733306885, "geo/layer_27/stable_rank_q_proj": 42.24650573730469, "geo/layer_27/stable_rank_k_proj": 32.32525634765625, "geo/layer_27/stable_rank_o_proj": 116.8983154296875, "geo/layer_27/stable_rank_gate_proj": 94.46630859375, "geo/layer_27/stable_rank_down_proj": 145.83177185058594, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07284626364707947, "geo/layer_27/attn_entropy_mean": 4.479841232299805, "geo/layer_27/attn_entropy_std": 0.5089347958564758, "attnres/final_alpha/block_0": 0.24538010358810425, "attnres/block_norm/0": 1.493811845779419, "attnres/final_alpha/block_1": 0.008341051638126373, "attnres/block_norm/1": 21437.6875, "attnres/final_alpha/block_2": 0.017106883227825165, "attnres/block_norm/2": 15954.025390625, "attnres/final_alpha/block_3": 0.01913333311676979, "attnres/block_norm/3": 19126.44140625, "attnres/final_alpha/block_4": 0.026376258581876755, "attnres/block_norm/4": 6841.8876953125, "attnres/final_alpha/block_5": 0.5319722294807434, "attnres/block_norm/5": 4258.880859375, "attnres/final_alpha/block_6": 0.15169017016887665, "attnres/block_norm/6": 13420.86328125, "geo/tier1_time_s": 1.3588685989379883, "geo/step": 13050.0, "geo/rankme_slope": 0.0003402881074304722} {"step": 13060, "timestamp": 1778208654.480688, "train/loss": 2.263780403137207, "train/z_loss": 0.0017168943886645138, "train/perplexity": 9.61938567159746, "train/grad_norm": 0.1298828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1788892.8666471143, "perf/iters_per_sec": 0.8530105908618518, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1723183870315552, "data/tokens_consumed": 27390902272, "data/tokens_consumed_B": 27.390902272, "train/loss_slope": -1.6643104659091072e-05} {"step": 13070, "timestamp": 1778208664.8397782, "train/loss": 2.243932819366455, "train/z_loss": 0.0017081709927879273, "train/perplexity": 9.4303463016184, "train/grad_norm": 0.2265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025367.5342838645, "perf/iters_per_sec": 0.9657704993647883, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354426860809327, "data/tokens_consumed": 27411873792, "data/tokens_consumed_B": 27.411873792, "train/loss_slope": -1.7499935179904066e-05} {"step": 13080, "timestamp": 1778208675.2357552, "train/loss": 2.2775532960891725, "train/z_loss": 0.001705913885962218, "train/perplexity": 9.75278900684873, "train/grad_norm": 0.10009765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2018459.9061843709, "perf/iters_per_sec": 0.9624766856119017, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0389862060546875, "data/tokens_consumed": 27432845312, "data/tokens_consumed_B": 27.432845312, "train/loss_slope": -1.780321318360013e-05} {"step": 13090, "timestamp": 1778208685.6401846, "train/loss": 2.2710943937301638, "train/z_loss": 0.0017088909400627017, "train/perplexity": 9.689999688248372, "train/grad_norm": 0.1474609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2017445.2633755424, "perf/iters_per_sec": 0.9619928662183487, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0395087480545044, "data/tokens_consumed": 27453816832, "data/tokens_consumed_B": 27.453816832, "train/loss_slope": -1.6280401858201846e-05} {"step": 13100, "timestamp": 1778208696.009036, "grad/layer_0/attn": 0.003045526100322604, "grad/layer_0/mlp": 0.0027305609546601772, "grad/layer_0/attn_mlp_ratio": 1.115348105886475, "grad/layer_4/attn": 0.0017090673791244626, "grad/layer_4/mlp": 0.002480069175362587, "grad/layer_4/attn_mlp_ratio": 0.6891208225925852, "grad/layer_8/attn": 0.004855267703533173, "grad/layer_8/mlp": 0.003946807701140642, "grad/layer_8/attn_mlp_ratio": 1.230175865703413, "grad/layer_12/attn": 0.004157781135290861, "grad/layer_12/mlp": 0.006262599490582943, "grad/layer_12/attn_mlp_ratio": 0.6639065894525507, "grad/layer_16/attn": 0.004242422990500927, "grad/layer_16/mlp": 0.004721945151686668, "grad/layer_16/attn_mlp_ratio": 0.8984481531177312, "grad/layer_20/attn": 0.004616580903530121, "grad/layer_20/mlp": 0.0059190308675169945, "grad/layer_20/attn_mlp_ratio": 0.7799555246231048, "grad/layer_24/attn": 0.005874585825949907, "grad/layer_24/mlp": 0.008846481330692768, "grad/layer_24/attn_mlp_ratio": 0.6640590241412928, "grad/layer_27/attn": 0.013318002223968506, "grad/layer_27/mlp": 0.0076358201913535595, "grad/layer_27/attn_mlp_ratio": 1.7441482009534954} {"step": 13100, "timestamp": 1778208696.023779, "train/loss": 2.272550368309021, "train/z_loss": 0.0017097714822739362, "train/perplexity": 9.704118357180986, "train/grad_norm": 0.158203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020765.42462998, "perf/iters_per_sec": 0.9635760424756908, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0378008127212524, "data/tokens_consumed": 27474788352, "data/tokens_consumed_B": 27.474788352, "train/loss_slope": -1.707663845093155e-05} {"step": 13110, "timestamp": 1778208706.4023848, "train/loss": 2.2931078672409058, "train/z_loss": 0.0017072115093469619, "train/perplexity": 9.905675416157397, "train/grad_norm": 0.1240234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022525.727655029, "perf/iters_per_sec": 0.9644154203677315, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036897563934326, "data/tokens_consumed": 27495759872, "data/tokens_consumed_B": 27.495759872, "train/loss_slope": -1.789724228083816e-05} {"step": 13120, "timestamp": 1778208716.7842836, "train/loss": 2.311757493019104, "train/z_loss": 0.0016883595497347414, "train/perplexity": 10.09214595398444, "train/grad_norm": 0.154296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021731.2254000306, "perf/iters_per_sec": 0.9640365721702722, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373050451278687, "data/tokens_consumed": 27516731392, "data/tokens_consumed_B": 27.516731392, "train/loss_slope": -1.5410171240398248e-05} {"step": 13125, "timestamp": 1778208722.5670927, "eos/sharpness": 37.20080852508544, "eos/L0_probe": 2.1096606254577637, "eos/L_plus": 2.3429083824157715, "eos/L_minus": 2.2484209537506104, "eos/grad_norm": 0.17577894032001495, "eos/embed_grad_frac": 0.10215503722429276, "eos/time_s": 0.5977888107299805} {"step": 13125, "timestamp": 1778208723.9488118, "geo/rankme_last": 440.43792724609375, "geo/layer_0/stable_rank_q_proj": 15.473536491394043, "geo/layer_0/stable_rank_k_proj": 13.552824020385742, "geo/layer_0/stable_rank_o_proj": 54.83998107910156, "geo/layer_0/stable_rank_gate_proj": 161.59149169921875, "geo/layer_0/stable_rank_down_proj": 48.457584381103516, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.039249882102012634, "geo/layer_0/attn_entropy_mean": 6.32232141494751, "geo/layer_0/attn_entropy_std": 0.2884521484375, "geo/layer_7/stable_rank_q_proj": 43.7132453918457, "geo/layer_7/stable_rank_k_proj": 43.80928039550781, "geo/layer_7/stable_rank_o_proj": 109.94539642333984, "geo/layer_7/stable_rank_gate_proj": 116.57170104980469, "geo/layer_7/stable_rank_down_proj": 159.18003845214844, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5874596238136292, "geo/layer_7/attn_entropy_mean": 4.6739606857299805, "geo/layer_7/attn_entropy_std": 0.8826343417167664, "geo/layer_14/stable_rank_q_proj": 59.89624786376953, "geo/layer_14/stable_rank_k_proj": 37.79804611206055, "geo/layer_14/stable_rank_o_proj": 50.37763595581055, "geo/layer_14/stable_rank_gate_proj": 101.31334686279297, "geo/layer_14/stable_rank_down_proj": 138.41001892089844, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3962065577507019, "geo/layer_14/attn_entropy_mean": 5.573570251464844, "geo/layer_14/attn_entropy_std": 0.5902663469314575, "geo/layer_21/stable_rank_q_proj": 50.98380661010742, "geo/layer_21/stable_rank_k_proj": 32.2999153137207, "geo/layer_21/stable_rank_o_proj": 89.5186538696289, "geo/layer_21/stable_rank_gate_proj": 99.62527465820312, "geo/layer_21/stable_rank_down_proj": 66.91370391845703, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15912355482578278, "geo/layer_21/attn_entropy_mean": 5.786724090576172, "geo/layer_21/attn_entropy_std": 0.29666900634765625, "geo/layer_27/stable_rank_q_proj": 42.090335845947266, "geo/layer_27/stable_rank_k_proj": 32.500091552734375, "geo/layer_27/stable_rank_o_proj": 116.92675018310547, "geo/layer_27/stable_rank_gate_proj": 94.36663818359375, "geo/layer_27/stable_rank_down_proj": 146.02708435058594, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.06839875876903534, "geo/layer_27/attn_entropy_mean": 4.456281661987305, "geo/layer_27/attn_entropy_std": 0.5185536742210388, "attnres/final_alpha/block_0": 0.24540117383003235, "attnres/block_norm/0": 1.495625376701355, "attnres/final_alpha/block_1": 0.008428364060819149, "attnres/block_norm/1": 21480.8203125, "attnres/final_alpha/block_2": 0.01715496927499771, "attnres/block_norm/2": 16004.40234375, "attnres/final_alpha/block_3": 0.019009485840797424, "attnres/block_norm/3": 19207.138671875, "attnres/final_alpha/block_4": 0.02606542408466339, "attnres/block_norm/4": 6871.7998046875, "attnres/final_alpha/block_5": 0.5328067541122437, "attnres/block_norm/5": 4289.861328125, "attnres/final_alpha/block_6": 0.15113386511802673, "attnres/block_norm/6": 13497.3505859375, "geo/tier1_time_s": 1.3586337566375732, "geo/step": 13125.0, "geo/rankme_slope": 0.0003315243675595238} {"step": 13130, "timestamp": 1778208729.1986341, "train/loss": 2.2328883171081544, "train/z_loss": 0.0017108823056332767, "train/perplexity": 9.326765870647337, "train/grad_norm": 0.146484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1690312.5729250417, "perf/iters_per_sec": 0.8060038437485894, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2406888723373413, "data/tokens_consumed": 27537702912, "data/tokens_consumed_B": 27.537702912, "train/loss_slope": -1.64770904618843e-05} {"step": 13140, "timestamp": 1778208739.5944843, "train/loss": 2.261526322364807, "train/z_loss": 0.0017066613538190723, "train/perplexity": 9.597727218434565, "train/grad_norm": 0.1337890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019040.7629157274, "perf/iters_per_sec": 0.9627536596850049, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0386873006820678, "data/tokens_consumed": 27558674432, "data/tokens_consumed_B": 27.558674432, "train/loss_slope": -2.0064661390531738e-05} {"step": 13150, "timestamp": 1778208749.981251, "grad/layer_0/attn": 0.002967836568132043, "grad/layer_0/mlp": 0.002748386701568961, "grad/layer_0/attn_mlp_ratio": 1.0798467546263157, "grad/layer_4/attn": 0.0015461317962035537, "grad/layer_4/mlp": 0.0025928958784788847, "grad/layer_4/attn_mlp_ratio": 0.5962953427505363, "grad/layer_8/attn": 0.006627853959798813, "grad/layer_8/mlp": 0.0040601338259875774, "grad/layer_8/attn_mlp_ratio": 1.6324224965526648, "grad/layer_12/attn": 0.0036618427839130163, "grad/layer_12/mlp": 0.006130670662969351, "grad/layer_12/attn_mlp_ratio": 0.5972988805778614, "grad/layer_16/attn": 0.00506784999743104, "grad/layer_16/mlp": 0.004544283263385296, "grad/layer_16/attn_mlp_ratio": 1.1152143456247212, "grad/layer_20/attn": 0.004414600320160389, "grad/layer_20/mlp": 0.006313315127044916, "grad/layer_20/attn_mlp_ratio": 0.6992523201200487, "grad/layer_24/attn": 0.02128707803785801, "grad/layer_24/mlp": 0.013140032067894936, "grad/layer_24/attn_mlp_ratio": 1.620017193707392, "grad/layer_27/attn": 0.0051238276064395905, "grad/layer_27/mlp": 0.012038825079798698, "grad/layer_27/attn_mlp_ratio": 0.4256086063146293} {"step": 13150, "timestamp": 1778208749.998787, "train/loss": 2.227193260192871, "train/z_loss": 0.0017203133436851203, "train/perplexity": 9.273800372098265, "train/grad_norm": 0.1943359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2017058.9702160635, "perf/iters_per_sec": 0.9618086672859495, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0397078275680542, "data/tokens_consumed": 27579645952, "data/tokens_consumed_B": 27.579645952, "train/loss_slope": -1.9485558585079084e-05} {"step": 13160, "timestamp": 1778208760.356075, "train/loss": 2.2926333427429197, "train/z_loss": 0.0016906912787817419, "train/perplexity": 9.900976045574748, "train/grad_norm": 0.1533203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026100.165179317, "perf/iters_per_sec": 0.9661198449989876, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350682735443115, "data/tokens_consumed": 27600617472, "data/tokens_consumed_B": 27.600617472, "train/loss_slope": -1.5200741842563489e-05} {"step": 13170, "timestamp": 1778208770.7062538, "train/loss": 2.2542171716690063, "train/z_loss": 0.0017049095476977527, "train/perplexity": 9.52783173334715, "train/grad_norm": 0.16015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027768.0071221024, "perf/iters_per_sec": 0.9669151340113175, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342169284820557, "data/tokens_consumed": 27621588992, "data/tokens_consumed_B": 27.621588992, "train/loss_slope": -1.705967393538064e-05} {"step": 13180, "timestamp": 1778208781.0604925, "train/loss": 2.283306050300598, "train/z_loss": 0.0016910399775952102, "train/perplexity": 9.809056095069371, "train/grad_norm": 0.166015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026406.7360328122, "perf/iters_per_sec": 0.9662660293735562, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349116802215577, "data/tokens_consumed": 27642560512, "data/tokens_consumed_B": 27.642560512, "train/loss_slope": -1.6593921688845568e-05} {"step": 13190, "timestamp": 1778208791.416312, "train/loss": 2.27049515247345, "train/z_loss": 0.0016995535581372679, "train/perplexity": 9.684194780101562, "train/grad_norm": 0.138671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026497.4461375186, "perf/iters_per_sec": 0.9663092833221048, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348653554916383, "data/tokens_consumed": 27663532032, "data/tokens_consumed_B": 27.663532032, "train/loss_slope": -1.4391504777575892e-05} {"step": 13200, "timestamp": 1778208801.7854936, "grad/layer_0/attn": 0.0024762703105807304, "grad/layer_0/mlp": 0.002632367890328169, "grad/layer_0/attn_mlp_ratio": 0.9407006617915992, "grad/layer_4/attn": 0.001717610633932054, "grad/layer_4/mlp": 0.002526362892240286, "grad/layer_4/attn_mlp_ratio": 0.679874839525313, "grad/layer_8/attn": 0.004866401664912701, "grad/layer_8/mlp": 0.003980955109000206, "grad/layer_8/attn_mlp_ratio": 1.2224206024500512, "grad/layer_12/attn": 0.003518556011840701, "grad/layer_12/mlp": 0.0059560309164226055, "grad/layer_12/attn_mlp_ratio": 0.5907551525737462, "grad/layer_16/attn": 0.004180627875030041, "grad/layer_16/mlp": 0.00455898093059659, "grad/layer_16/attn_mlp_ratio": 0.9170092718027745, "grad/layer_20/attn": 0.0038983426056802273, "grad/layer_20/mlp": 0.006418210919946432, "grad/layer_20/attn_mlp_ratio": 0.6073877274469489, "grad/layer_24/attn": 0.015504804439842701, "grad/layer_24/mlp": 0.011670473031699657, "grad/layer_24/attn_mlp_ratio": 1.3285497738500531, "grad/layer_27/attn": 0.005377907305955887, "grad/layer_27/mlp": 0.01119072176516056, "grad/layer_27/attn_mlp_ratio": 0.4805684003905611} {"step": 13200, "timestamp": 1778208802.3745325, "eos/sharpness": 37.56930828094482, "eos/L0_probe": 2.108020305633545, "eos/L_plus": 2.3306238651275635, "eos/L_minus": 2.2611098289489746, "eos/grad_norm": 0.17977945506572723, "eos/embed_grad_frac": 0.07680046558380127, "eos/time_s": 0.5863978862762451} {"step": 13200, "timestamp": 1778208802.3920894, "train/loss": 2.296037459373474, "train/z_loss": 0.0016922261798754334, "train/perplexity": 9.93473755424439, "train/grad_norm": 0.1796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1912261.7711300354, "perf/iters_per_sec": 0.9118374686861207, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0966866731643676, "data/tokens_consumed": 27684503552, "data/tokens_consumed_B": 27.684503552, "train/loss_slope": -1.2513601890813518e-05} {"step": 13200, "timestamp": 1778208803.7560189, "geo/rankme_last": 441.22662353515625, "geo/layer_0/stable_rank_q_proj": 15.49267864227295, "geo/layer_0/stable_rank_k_proj": 13.577880859375, "geo/layer_0/stable_rank_o_proj": 54.8955192565918, "geo/layer_0/stable_rank_gate_proj": 161.968017578125, "geo/layer_0/stable_rank_down_proj": 48.333370208740234, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.039268672466278076, "geo/layer_0/attn_entropy_mean": 6.326725959777832, "geo/layer_0/attn_entropy_std": 0.2824062407016754, "geo/layer_7/stable_rank_q_proj": 43.524532318115234, "geo/layer_7/stable_rank_k_proj": 43.73162078857422, "geo/layer_7/stable_rank_o_proj": 110.32930755615234, "geo/layer_7/stable_rank_gate_proj": 116.52506256103516, "geo/layer_7/stable_rank_down_proj": 158.9972381591797, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5712040066719055, "geo/layer_7/attn_entropy_mean": 4.651055335998535, "geo/layer_7/attn_entropy_std": 0.8767073750495911, "geo/layer_14/stable_rank_q_proj": 59.907840728759766, "geo/layer_14/stable_rank_k_proj": 37.85020446777344, "geo/layer_14/stable_rank_o_proj": 50.401912689208984, "geo/layer_14/stable_rank_gate_proj": 100.79165649414062, "geo/layer_14/stable_rank_down_proj": 138.235595703125, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3903678357601166, "geo/layer_14/attn_entropy_mean": 5.5724029541015625, "geo/layer_14/attn_entropy_std": 0.5730627179145813, "geo/layer_21/stable_rank_q_proj": 51.106544494628906, "geo/layer_21/stable_rank_k_proj": 32.29307174682617, "geo/layer_21/stable_rank_o_proj": 89.19661712646484, "geo/layer_21/stable_rank_gate_proj": 99.63721466064453, "geo/layer_21/stable_rank_down_proj": 66.79634094238281, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1570899486541748, "geo/layer_21/attn_entropy_mean": 5.796968936920166, "geo/layer_21/attn_entropy_std": 0.28847557306289673, "geo/layer_27/stable_rank_q_proj": 41.93260955810547, "geo/layer_27/stable_rank_k_proj": 32.44321823120117, "geo/layer_27/stable_rank_o_proj": 116.65270233154297, "geo/layer_27/stable_rank_gate_proj": 94.26903533935547, "geo/layer_27/stable_rank_down_proj": 145.8187713623047, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.06400999426841736, "geo/layer_27/attn_entropy_mean": 4.4814863204956055, "geo/layer_27/attn_entropy_std": 0.49790605902671814, "attnres/final_alpha/block_0": 0.24516387283802032, "attnres/block_norm/0": 1.49752938747406, "attnres/final_alpha/block_1": 0.008370732888579369, "attnres/block_norm/1": 21733.84375, "attnres/final_alpha/block_2": 0.017264550551772118, "attnres/block_norm/2": 16045.8740234375, "attnres/final_alpha/block_3": 0.01883733458817005, "attnres/block_norm/3": 19269.625, "attnres/final_alpha/block_4": 0.025972677394747734, "attnres/block_norm/4": 6887.4384765625, "attnres/final_alpha/block_5": 0.5331801176071167, "attnres/block_norm/5": 4299.8193359375, "attnres/final_alpha/block_6": 0.15121068060398102, "attnres/block_norm/6": 13529.494140625, "geo/tier1_time_s": 1.3598532676696777, "geo/step": 13200.0, "geo/rankme_slope": 0.00033779375031262504} {"step": 13210, "timestamp": 1778208814.102546, "train/loss": 2.3775925397872926, "train/z_loss": 0.0016881714574992657, "train/perplexity": 10.778921776949343, "train/grad_norm": 0.10693359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1791421.1553008652, "perf/iters_per_sec": 0.8542161728386236, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1706638574600219, "data/tokens_consumed": 27705475072, "data/tokens_consumed_B": 27.705475072, "train/loss_slope": -5.1931119701936464e-06} {"step": 13220, "timestamp": 1778208824.4582107, "train/loss": 2.2771067142486574, "train/z_loss": 0.0016983829438686371, "train/perplexity": 9.748434560764538, "train/grad_norm": 0.1865234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026390.723718162, "perf/iters_per_sec": 0.9662583941069421, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349198579788208, "data/tokens_consumed": 27726446592, "data/tokens_consumed_B": 27.726446592, "train/loss_slope": -4.07490233848524e-06} {"step": 13230, "timestamp": 1778208834.8005385, "train/loss": 2.300403332710266, "train/z_loss": 0.00168967607896775, "train/perplexity": 9.978206180252407, "train/grad_norm": 0.13671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028744.0240404087, "perf/iters_per_sec": 0.9673805351450008, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0337193727493286, "data/tokens_consumed": 27747418112, "data/tokens_consumed_B": 27.747418112, "train/loss_slope": -1.8306287530303728e-06} {"step": 13240, "timestamp": 1778208845.1410496, "train/loss": 2.22250657081604, "train/z_loss": 0.0017130644060671329, "train/perplexity": 9.230438641263344, "train/grad_norm": 0.13671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029078.309047319, "perf/iters_per_sec": 0.9675399346577258, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0335490703582764, "data/tokens_consumed": 27768389632, "data/tokens_consumed_B": 27.768389632, "train/loss_slope": -6.786094411443151e-06} {"step": 13250, "timestamp": 1778208855.4802718, "grad/layer_0/attn": 0.0028454004786908627, "grad/layer_0/mlp": 0.002639626618474722, "grad/layer_0/attn_mlp_ratio": 1.0779556286409484, "grad/layer_4/attn": 0.0017218298744410276, "grad/layer_4/mlp": 0.0026426021941006184, "grad/layer_4/attn_mlp_ratio": 0.6515660257636438, "grad/layer_8/attn": 0.006309002172201872, "grad/layer_8/mlp": 0.003997119143605232, "grad/layer_8/attn_mlp_ratio": 1.5783872803633, "grad/layer_12/attn": 0.0034987200051546097, "grad/layer_12/mlp": 0.005936556030064821, "grad/layer_12/attn_mlp_ratio": 0.5893517939527015, "grad/layer_16/attn": 0.005251410882920027, "grad/layer_16/mlp": 0.004647011402994394, "grad/layer_16/attn_mlp_ratio": 1.130061950467774, "grad/layer_20/attn": 0.0037910945247858763, "grad/layer_20/mlp": 0.005843355320394039, "grad/layer_20/attn_mlp_ratio": 0.6487872552736539, "grad/layer_24/attn": 0.01333648432046175, "grad/layer_24/mlp": 0.009939144365489483, "grad/layer_24/attn_mlp_ratio": 1.3418141135555932, "grad/layer_27/attn": 0.007306005340069532, "grad/layer_27/mlp": 0.007978066802024841, "grad/layer_27/attn_mlp_ratio": 0.9157613529431875} {"step": 13250, "timestamp": 1778208855.4946296, "train/loss": 2.311593270301819, "train/z_loss": 0.0016960552660748363, "train/perplexity": 10.090488730433243, "train/grad_norm": 0.1474609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026503.5155542982, "perf/iters_per_sec": 0.9663121774455539, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03486225605011, "data/tokens_consumed": 27789361152, "data/tokens_consumed_B": 27.789361152, "train/loss_slope": -4.016996925026716e-06} {"step": 13260, "timestamp": 1778208865.8331966, "train/loss": 2.328944706916809, "train/z_loss": 0.001693158596754074, "train/perplexity": 10.267101013402922, "train/grad_norm": 0.1552734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029556.9265680336, "perf/iters_per_sec": 0.9677681572761696, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0333053350448609, "data/tokens_consumed": 27810332672, "data/tokens_consumed_B": 27.810332672, "train/loss_slope": -2.685410247491483e-06} {"step": 13270, "timestamp": 1778208876.1708992, "train/loss": 2.245952081680298, "train/z_loss": 0.0017039983067661524, "train/perplexity": 9.449407883201571, "train/grad_norm": 0.173828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029550.230077404, "perf/iters_per_sec": 0.9677649641406079, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033308744430542, "data/tokens_consumed": 27831304192, "data/tokens_consumed_B": 27.831304192, "train/loss_slope": -8.338884659225562e-06} {"step": 13275, "timestamp": 1778208881.9196596, "eos/sharpness": 46.79183959960937, "eos/L0_probe": 2.1064724922180176, "eos/L_plus": 2.3021976947784424, "eos/L_minus": 2.3786656856536865, "eos/grad_norm": 0.21455538272857666, "eos/embed_grad_frac": 0.06376368552446365, "eos/time_s": 0.5850579738616943} {"step": 13275, "timestamp": 1778208883.2958658, "geo/rankme_last": 440.8287353515625, "geo/layer_0/stable_rank_q_proj": 15.478780746459961, "geo/layer_0/stable_rank_k_proj": 13.665205955505371, "geo/layer_0/stable_rank_o_proj": 54.87441635131836, "geo/layer_0/stable_rank_gate_proj": 161.2183837890625, "geo/layer_0/stable_rank_down_proj": 48.44039535522461, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.03987020626664162, "geo/layer_0/attn_entropy_mean": 6.315601348876953, "geo/layer_0/attn_entropy_std": 0.28833281993865967, "geo/layer_7/stable_rank_q_proj": 43.39140701293945, "geo/layer_7/stable_rank_k_proj": 43.381263732910156, "geo/layer_7/stable_rank_o_proj": 110.27934265136719, "geo/layer_7/stable_rank_gate_proj": 116.5404281616211, "geo/layer_7/stable_rank_down_proj": 159.33978271484375, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5652289986610413, "geo/layer_7/attn_entropy_mean": 4.687305450439453, "geo/layer_7/attn_entropy_std": 0.890978991985321, "geo/layer_14/stable_rank_q_proj": 59.901954650878906, "geo/layer_14/stable_rank_k_proj": 37.64787292480469, "geo/layer_14/stable_rank_o_proj": 50.356773376464844, "geo/layer_14/stable_rank_gate_proj": 100.50762176513672, "geo/layer_14/stable_rank_down_proj": 138.30532836914062, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4004371464252472, "geo/layer_14/attn_entropy_mean": 5.583547115325928, "geo/layer_14/attn_entropy_std": 0.5505026578903198, "geo/layer_21/stable_rank_q_proj": 51.15605163574219, "geo/layer_21/stable_rank_k_proj": 32.41020202636719, "geo/layer_21/stable_rank_o_proj": 88.94792175292969, "geo/layer_21/stable_rank_gate_proj": 99.36188507080078, "geo/layer_21/stable_rank_down_proj": 66.5919418334961, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15804003179073334, "geo/layer_21/attn_entropy_mean": 5.779682159423828, "geo/layer_21/attn_entropy_std": 0.2982725203037262, "geo/layer_27/stable_rank_q_proj": 41.880096435546875, "geo/layer_27/stable_rank_k_proj": 32.47543716430664, "geo/layer_27/stable_rank_o_proj": 116.89212036132812, "geo/layer_27/stable_rank_gate_proj": 94.20764923095703, "geo/layer_27/stable_rank_down_proj": 146.08236694335938, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.06443387269973755, "geo/layer_27/attn_entropy_mean": 4.480902671813965, "geo/layer_27/attn_entropy_std": 0.5179327130317688, "attnres/final_alpha/block_0": 0.24671712517738342, "attnres/block_norm/0": 1.499490737915039, "attnres/final_alpha/block_1": 0.00824907049536705, "attnres/block_norm/1": 21781.919921875, "attnres/final_alpha/block_2": 0.01774117909371853, "attnres/block_norm/2": 16058.369140625, "attnres/final_alpha/block_3": 0.019111592322587967, "attnres/block_norm/3": 19530.359375, "attnres/final_alpha/block_4": 0.026165157556533813, "attnres/block_norm/4": 6928.53076171875, "attnres/final_alpha/block_5": 0.5262903571128845, "attnres/block_norm/5": 4306.41015625, "attnres/final_alpha/block_6": 0.15572549402713776, "attnres/block_norm/6": 13575.4384765625, "geo/tier1_time_s": 1.3567917346954346, "geo/step": 13275.0, "geo/rankme_slope": 0.0003785997015993898} {"step": 13280, "timestamp": 1778208888.4716864, "train/loss": 2.285737729072571, "train/z_loss": 0.0016844471683725715, "train/perplexity": 9.832937592846708, "train/grad_norm": 0.11669921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1705486.6671282575, "perf/iters_per_sec": 0.8132394157067573, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2296501874923706, "data/tokens_consumed": 27852275712, "data/tokens_consumed_B": 27.852275712, "train/loss_slope": -9.802276967275024e-06} {"step": 13290, "timestamp": 1778208899.3486972, "train/loss": 2.3134404182434083, "train/z_loss": 0.0016952420584857464, "train/perplexity": 10.109144580674569, "train/grad_norm": 0.138671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1928901.9061800893, "perf/iters_per_sec": 0.9197721033955046, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0872258424758912, "data/tokens_consumed": 27873247232, "data/tokens_consumed_B": 27.873247232, "train/loss_slope": -1.0899946491460096e-05} {"step": 13300, "timestamp": 1778208909.6862001, "grad/layer_0/attn": 0.0034708259627223015, "grad/layer_0/mlp": 0.002981513040140271, "grad/layer_0/attn_mlp_ratio": 1.1641156015696814, "grad/layer_4/attn": 0.0018024715827777982, "grad/layer_4/mlp": 0.0026899559888988733, "grad/layer_4/attn_mlp_ratio": 0.6700747236047393, "grad/layer_8/attn": 0.00728475209325552, "grad/layer_8/mlp": 0.00412420928478241, "grad/layer_8/attn_mlp_ratio": 1.766339051585243, "grad/layer_12/attn": 0.004043842200189829, "grad/layer_12/mlp": 0.006149190943688154, "grad/layer_12/attn_mlp_ratio": 0.6576218191075128, "grad/layer_16/attn": 0.005770471412688494, "grad/layer_16/mlp": 0.005269207525998354, "grad/layer_16/attn_mlp_ratio": 1.0951307715066867, "grad/layer_20/attn": 0.006250158883631229, "grad/layer_20/mlp": 0.007378111127763987, "grad/layer_20/attn_mlp_ratio": 0.8471218026792211, "grad/layer_24/attn": 0.024373989552259445, "grad/layer_24/mlp": 0.014275957830250263, "grad/layer_24/attn_mlp_ratio": 1.7073452913875433, "grad/layer_27/attn": 0.011402532458305359, "grad/layer_27/mlp": 0.011693036183714867, "grad/layer_27/attn_mlp_ratio": 0.9751558262233311} {"step": 13300, "timestamp": 1778208909.7003741, "train/loss": 2.3195043325424196, "train/z_loss": 0.0016925879404880106, "train/perplexity": 10.170631805265895, "train/grad_norm": 0.251953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026954.6216549238, "perf/iters_per_sec": 0.9665272815966243, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346319437026978, "data/tokens_consumed": 27894218752, "data/tokens_consumed_B": 27.894218752, "train/loss_slope": -8.699494584201196e-06} {"step": 13310, "timestamp": 1778208920.052813, "train/loss": 2.3081967353820803, "train/z_loss": 0.0016878394642844796, "train/perplexity": 10.056274171468113, "train/grad_norm": 0.1533203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027306.0257779974, "perf/iters_per_sec": 0.9666948441400516, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344526052474976, "data/tokens_consumed": 27915190272, "data/tokens_consumed_B": 27.915190272, "train/loss_slope": -7.0048152667210905e-06} {"step": 13320, "timestamp": 1778208930.3948095, "train/loss": 2.324435901641846, "train/z_loss": 0.001680697244592011, "train/perplexity": 10.22091285913971, "train/grad_norm": 0.2255859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028723.9975809637, "perf/iters_per_sec": 0.9673709857849901, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033729577064514, "data/tokens_consumed": 27936161792, "data/tokens_consumed_B": 27.936161792, "train/loss_slope": -1.6444025546124705e-06} {"step": 13330, "timestamp": 1778208940.7419312, "train/loss": 2.2016414165496827, "train/z_loss": 0.0017103282385505735, "train/perplexity": 9.039839470421537, "train/grad_norm": 0.1015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027716.8212772098, "perf/iters_per_sec": 0.9668907266984986, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342430353164673, "data/tokens_consumed": 27957133312, "data/tokens_consumed_B": 27.957133312, "train/loss_slope": -7.743121409537822e-06} {"step": 13340, "timestamp": 1778208951.0832062, "train/loss": 2.275105595588684, "train/z_loss": 0.0016875065863132476, "train/perplexity": 9.728946292131829, "train/grad_norm": 0.1708984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029224.1691349226, "perf/iters_per_sec": 0.967609486167394, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0334747791290284, "data/tokens_consumed": 27978104832, "data/tokens_consumed_B": 27.978104832, "train/loss_slope": -7.46625353186261e-06} {"step": 13350, "timestamp": 1778208961.4163892, "grad/layer_0/attn": 0.002276225481182337, "grad/layer_0/mlp": 0.002380579710006714, "grad/layer_0/attn_mlp_ratio": 0.9561643224958365, "grad/layer_4/attn": 0.0013468300458043814, "grad/layer_4/mlp": 0.002545208903029561, "grad/layer_4/attn_mlp_ratio": 0.5291628483952595, "grad/layer_8/attn": 0.007645844016224146, "grad/layer_8/mlp": 0.003969098441302776, "grad/layer_8/attn_mlp_ratio": 1.9263427039315968, "grad/layer_12/attn": 0.004046415910124779, "grad/layer_12/mlp": 0.0057183983735740185, "grad/layer_12/attn_mlp_ratio": 0.7076134915788328, "grad/layer_16/attn": 0.004958850331604481, "grad/layer_16/mlp": 0.004361441358923912, "grad/layer_16/attn_mlp_ratio": 1.136975098326315, "grad/layer_20/attn": 0.0043449667282402515, "grad/layer_20/mlp": 0.005390638951212168, "grad/layer_20/attn_mlp_ratio": 0.8060207123797724, "grad/layer_24/attn": 0.01109217293560505, "grad/layer_24/mlp": 0.009971082210540771, "grad/layer_24/attn_mlp_ratio": 1.112434196223527, "grad/layer_27/attn": 0.01113938633352518, "grad/layer_27/mlp": 0.008237982168793678, "grad/layer_27/attn_mlp_ratio": 1.3521983866998986} {"step": 13350, "timestamp": 1778208962.0026894, "eos/sharpness": 48.0586290359497, "eos/L0_probe": 2.1035749912261963, "eos/L_plus": 2.425583839416504, "eos/L_minus": 2.2621524333953857, "eos/grad_norm": 0.15342961251735687, "eos/embed_grad_frac": 0.10072597116231918, "eos/time_s": 0.5835130214691162} {"step": 13350, "timestamp": 1778208962.020974, "train/loss": 2.283529758453369, "train/z_loss": 0.001694106706418097, "train/perplexity": 9.811250706355894, "train/grad_norm": 0.1533203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1918230.3056265095, "perf/iters_per_sec": 0.9146834877140567, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0932743549346924, "data/tokens_consumed": 27999076352, "data/tokens_consumed_B": 27.999076352, "train/loss_slope": -6.590770265915162e-06} {"step": 13350, "timestamp": 1778208963.382542, "geo/rankme_last": 440.6826477050781, "geo/layer_0/stable_rank_q_proj": 15.52187442779541, "geo/layer_0/stable_rank_k_proj": 13.735998153686523, "geo/layer_0/stable_rank_o_proj": 54.873878479003906, "geo/layer_0/stable_rank_gate_proj": 161.42745971679688, "geo/layer_0/stable_rank_down_proj": 48.55727767944336, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.03976379707455635, "geo/layer_0/attn_entropy_mean": 6.319263935089111, "geo/layer_0/attn_entropy_std": 0.28724995255470276, "geo/layer_7/stable_rank_q_proj": 43.342525482177734, "geo/layer_7/stable_rank_k_proj": 43.479454040527344, "geo/layer_7/stable_rank_o_proj": 110.28422546386719, "geo/layer_7/stable_rank_gate_proj": 116.10259246826172, "geo/layer_7/stable_rank_down_proj": 159.1955108642578, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.578267514705658, "geo/layer_7/attn_entropy_mean": 4.645408630371094, "geo/layer_7/attn_entropy_std": 0.8821427226066589, "geo/layer_14/stable_rank_q_proj": 59.899166107177734, "geo/layer_14/stable_rank_k_proj": 37.60445022583008, "geo/layer_14/stable_rank_o_proj": 50.33856201171875, "geo/layer_14/stable_rank_gate_proj": 100.53054809570312, "geo/layer_14/stable_rank_down_proj": 138.88560485839844, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38486337661743164, "geo/layer_14/attn_entropy_mean": 5.573215484619141, "geo/layer_14/attn_entropy_std": 0.5702874660491943, "geo/layer_21/stable_rank_q_proj": 51.136962890625, "geo/layer_21/stable_rank_k_proj": 32.34907913208008, "geo/layer_21/stable_rank_o_proj": 88.79060363769531, "geo/layer_21/stable_rank_gate_proj": 99.21315002441406, "geo/layer_21/stable_rank_down_proj": 66.58818817138672, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1584298014640808, "geo/layer_21/attn_entropy_mean": 5.791664123535156, "geo/layer_21/attn_entropy_std": 0.29962968826293945, "geo/layer_27/stable_rank_q_proj": 42.06110763549805, "geo/layer_27/stable_rank_k_proj": 32.464210510253906, "geo/layer_27/stable_rank_o_proj": 116.97737884521484, "geo/layer_27/stable_rank_gate_proj": 94.21087646484375, "geo/layer_27/stable_rank_down_proj": 145.637451171875, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07072708755731583, "geo/layer_27/attn_entropy_mean": 4.479743957519531, "geo/layer_27/attn_entropy_std": 0.504266619682312, "attnres/final_alpha/block_0": 0.246060311794281, "attnres/block_norm/0": 1.50099515914917, "attnres/final_alpha/block_1": 0.00825981143862009, "attnres/block_norm/1": 21854.50390625, "attnres/final_alpha/block_2": 0.01722678542137146, "attnres/block_norm/2": 16206.6142578125, "attnres/final_alpha/block_3": 0.01883401721715927, "attnres/block_norm/3": 19570.359375, "attnres/final_alpha/block_4": 0.026036452502012253, "attnres/block_norm/4": 6975.3740234375, "attnres/final_alpha/block_5": 0.5323351621627808, "attnres/block_norm/5": 4326.85205078125, "attnres/final_alpha/block_6": 0.15124750137329102, "attnres/block_norm/6": 13800.6748046875, "geo/tier1_time_s": 1.3576462268829346, "geo/step": 13350.0, "geo/rankme_slope": 0.0003818879114145658} {"step": 13360, "timestamp": 1778208973.7282548, "train/loss": 2.250886821746826, "train/z_loss": 0.001702010037843138, "train/perplexity": 9.496153498757712, "train/grad_norm": 0.228515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1791914.8869410632, "perf/iters_per_sec": 0.8544516024308506, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1703413009643555, "data/tokens_consumed": 28020047872, "data/tokens_consumed_B": 28.020047872, "train/loss_slope": -6.598991851280179e-06} {"step": 13370, "timestamp": 1778208984.0687745, "train/loss": 2.2795987367630004, "train/z_loss": 0.001687401009257883, "train/perplexity": 9.77275817407774, "train/grad_norm": 0.1220703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029275.5247408543, "perf/iters_per_sec": 0.9676339744285842, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033448624610901, "data/tokens_consumed": 28041019392, "data/tokens_consumed_B": 28.041019392, "train/loss_slope": -6.157896864210364e-06} {"step": 13380, "timestamp": 1778208994.4081876, "train/loss": 2.299353313446045, "train/z_loss": 0.0016922869137488306, "train/perplexity": 9.967734370303992, "train/grad_norm": 0.1845703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029301.8355477736, "perf/iters_per_sec": 0.9676465203989857, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0334352254867554, "data/tokens_consumed": 28061990912, "data/tokens_consumed_B": 28.061990912, "train/loss_slope": -4.023533213650037e-06} {"step": 13390, "timestamp": 1778209004.7468626, "train/loss": 2.320172882080078, "train/z_loss": 0.0016890380415134133, "train/perplexity": 10.177433649888695, "train/grad_norm": 0.1376953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029375.762267096, "perf/iters_per_sec": 0.9676817714057426, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0333975791931151, "data/tokens_consumed": 28082962432, "data/tokens_consumed_B": 28.082962432, "train/loss_slope": -2.126036583036047e-06} {"step": 13400, "timestamp": 1778209015.0973027, "grad/layer_0/attn": 0.003589147701859474, "grad/layer_0/mlp": 0.002854034071788192, "grad/layer_0/attn_mlp_ratio": 1.2575699819356745, "grad/layer_4/attn": 0.00255836034193635, "grad/layer_4/mlp": 0.002741156378760934, "grad/layer_4/attn_mlp_ratio": 0.9333142276842173, "grad/layer_8/attn": 0.004032251890748739, "grad/layer_8/mlp": 0.0039921835996210575, "grad/layer_8/attn_mlp_ratio": 1.0100366601695923, "grad/layer_12/attn": 0.004941476974636316, "grad/layer_12/mlp": 0.00595026696100831, "grad/layer_12/attn_mlp_ratio": 0.8304630571991423, "grad/layer_16/attn": 0.004408753011375666, "grad/layer_16/mlp": 0.004468427039682865, "grad/layer_16/attn_mlp_ratio": 0.9866453840598072, "grad/layer_20/attn": 0.009791644290089607, "grad/layer_20/mlp": 0.006616575643420219, "grad/layer_20/attn_mlp_ratio": 1.479865820296367, "grad/layer_24/attn": 0.01438117865473032, "grad/layer_24/mlp": 0.011216404847800732, "grad/layer_24/attn_mlp_ratio": 1.2821557996218855, "grad/layer_27/attn": 0.008100060746073723, "grad/layer_27/mlp": 0.011234450154006481, "grad/layer_27/attn_mlp_ratio": 0.7210019683148308} {"step": 13400, "timestamp": 1778209015.1119654, "train/loss": 2.274075508117676, "train/z_loss": 0.0016883646487258375, "train/perplexity": 9.718929786274476, "train/grad_norm": 0.1904296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024386.655927435, "perf/iters_per_sec": 0.9653027801167655, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035944390296936, "data/tokens_consumed": 28103933952, "data/tokens_consumed_B": 28.103933952, "train/loss_slope": -3.977970500411562e-06} {"step": 13410, "timestamp": 1778209025.4596753, "train/loss": 2.28505654335022, "train/z_loss": 0.001690372230950743, "train/perplexity": 9.826241816942076, "train/grad_norm": 0.203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027816.0166511536, "perf/iters_per_sec": 0.9669380267387169, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034192442893982, "data/tokens_consumed": 28124905472, "data/tokens_consumed_B": 28.124905472, "train/loss_slope": -6.148903440720957e-06} {"step": 13420, "timestamp": 1778209035.8057353, "train/loss": 2.3269957304000854, "train/z_loss": 0.0016736378544010221, "train/perplexity": 10.247110161811076, "train/grad_norm": 0.2099609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028332.8122756097, "perf/iters_per_sec": 0.9671844540956543, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0339289426803588, "data/tokens_consumed": 28145876992, "data/tokens_consumed_B": 28.145876992, "train/loss_slope": -5.1460885586697176e-06} {"step": 13425, "timestamp": 1778209041.5488346, "eos/sharpness": 34.857296943664544, "eos/L0_probe": 2.1030843257904053, "eos/L_plus": 2.3030998706817627, "eos/L_minus": 2.2516417503356934, "eos/grad_norm": 0.19109593331813812, "eos/embed_grad_frac": 0.08912449330091476, "eos/time_s": 0.582129955291748} {"step": 13425, "timestamp": 1778209042.9238288, "geo/rankme_last": 440.1947937011719, "geo/layer_0/stable_rank_q_proj": 15.542007446289062, "geo/layer_0/stable_rank_k_proj": 13.79200267791748, "geo/layer_0/stable_rank_o_proj": 54.87096405029297, "geo/layer_0/stable_rank_gate_proj": 161.60244750976562, "geo/layer_0/stable_rank_down_proj": 48.524017333984375, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.03985432907938957, "geo/layer_0/attn_entropy_mean": 6.318361759185791, "geo/layer_0/attn_entropy_std": 0.28952279686927795, "geo/layer_7/stable_rank_q_proj": 43.21730422973633, "geo/layer_7/stable_rank_k_proj": 43.54237365722656, "geo/layer_7/stable_rank_o_proj": 110.47245025634766, "geo/layer_7/stable_rank_gate_proj": 115.76210021972656, "geo/layer_7/stable_rank_down_proj": 158.99176025390625, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5930407047271729, "geo/layer_7/attn_entropy_mean": 4.676910400390625, "geo/layer_7/attn_entropy_std": 0.8754857182502747, "geo/layer_14/stable_rank_q_proj": 59.97381591796875, "geo/layer_14/stable_rank_k_proj": 37.661739349365234, "geo/layer_14/stable_rank_o_proj": 50.496639251708984, "geo/layer_14/stable_rank_gate_proj": 100.46177673339844, "geo/layer_14/stable_rank_down_proj": 138.4326934814453, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.393646776676178, "geo/layer_14/attn_entropy_mean": 5.602197647094727, "geo/layer_14/attn_entropy_std": 0.5608323812484741, "geo/layer_21/stable_rank_q_proj": 51.060882568359375, "geo/layer_21/stable_rank_k_proj": 32.27838134765625, "geo/layer_21/stable_rank_o_proj": 88.75289916992188, "geo/layer_21/stable_rank_gate_proj": 98.95752716064453, "geo/layer_21/stable_rank_down_proj": 66.62223815917969, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.16238436102867126, "geo/layer_21/attn_entropy_mean": 5.799330234527588, "geo/layer_21/attn_entropy_std": 0.29621461033821106, "geo/layer_27/stable_rank_q_proj": 42.146915435791016, "geo/layer_27/stable_rank_k_proj": 32.46552276611328, "geo/layer_27/stable_rank_o_proj": 117.01444244384766, "geo/layer_27/stable_rank_gate_proj": 93.95036315917969, "geo/layer_27/stable_rank_down_proj": 145.24732971191406, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0672236829996109, "geo/layer_27/attn_entropy_mean": 4.478761672973633, "geo/layer_27/attn_entropy_std": 0.494582861661911, "attnres/final_alpha/block_0": 0.24404862523078918, "attnres/block_norm/0": 1.5026345252990723, "attnres/final_alpha/block_1": 0.008199821226298809, "attnres/block_norm/1": 21963.82421875, "attnres/final_alpha/block_2": 0.01696023717522621, "attnres/block_norm/2": 16270.943359375, "attnres/final_alpha/block_3": 0.018676087260246277, "attnres/block_norm/3": 19802.7109375, "attnres/final_alpha/block_4": 0.02567920833826065, "attnres/block_norm/4": 6969.85986328125, "attnres/final_alpha/block_5": 0.5369980931282043, "attnres/block_norm/5": 4284.1455078125, "attnres/final_alpha/block_6": 0.14943790435791016, "attnres/block_norm/6": 13783.3232421875, "geo/tier1_time_s": 1.3575308322906494, "geo/step": 13425.0, "geo/rankme_slope": 0.0003639742811186975} {"step": 13430, "timestamp": 1778209048.0942347, "train/loss": 2.2864057064056396, "train/z_loss": 0.001693644211627543, "train/perplexity": 9.839507966462225, "train/grad_norm": 0.1787109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1707390.8574084188, "perf/iters_per_sec": 0.8141474043886274, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2282788038253785, "data/tokens_consumed": 28166848512, "data/tokens_consumed_B": 28.166848512, "train/loss_slope": -5.112173032946611e-06} {"step": 13440, "timestamp": 1778209058.4327803, "train/loss": 2.219961977005005, "train/z_loss": 0.0017159600276499987, "train/perplexity": 9.206980782242477, "train/grad_norm": 0.134765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029382.2703236414, "perf/iters_per_sec": 0.9676848746889312, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0333942651748658, "data/tokens_consumed": 28187820032, "data/tokens_consumed_B": 28.187820032, "train/loss_slope": -9.369261401428538e-06} {"step": 13450, "timestamp": 1778209068.769503, "grad/layer_0/attn": 0.0025427478831261396, "grad/layer_0/mlp": 0.002524120733141899, "grad/layer_0/attn_mlp_ratio": 1.0073796189705604, "grad/layer_4/attn": 0.0015146322548389435, "grad/layer_4/mlp": 0.0024738325737416744, "grad/layer_4/attn_mlp_ratio": 0.6122613994535292, "grad/layer_8/attn": 0.00516075175255537, "grad/layer_8/mlp": 0.0037313890643417835, "grad/layer_8/attn_mlp_ratio": 1.3830644634638976, "grad/layer_12/attn": 0.004548436962068081, "grad/layer_12/mlp": 0.005958415102213621, "grad/layer_12/attn_mlp_ratio": 0.7633635467998745, "grad/layer_16/attn": 0.004272341728210449, "grad/layer_16/mlp": 0.0045606959611177444, "grad/layer_16/attn_mlp_ratio": 0.936774051801947, "grad/layer_20/attn": 0.004170202650129795, "grad/layer_20/mlp": 0.006608493626117706, "grad/layer_20/attn_mlp_ratio": 0.6310367873466478, "grad/layer_24/attn": 0.01702660508453846, "grad/layer_24/mlp": 0.011873089708387852, "grad/layer_24/attn_mlp_ratio": 1.434050054309356, "grad/layer_27/attn": 0.006949169561266899, "grad/layer_27/mlp": 0.0108773959800601, "grad/layer_27/attn_mlp_ratio": 0.6388633373391422} {"step": 13450, "timestamp": 1778209068.7834458, "train/loss": 2.2497115254402162, "train/z_loss": 0.0017079159617424012, "train/perplexity": 9.484999260674998, "train/grad_norm": 0.2099609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028503.9665199837, "perf/iters_per_sec": 0.967266066799156, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0338417053222657, "data/tokens_consumed": 28208791552, "data/tokens_consumed_B": 28.208791552, "train/loss_slope": -1.1351496909353548e-05} {"step": 13460, "timestamp": 1778209079.124144, "train/loss": 2.33213791847229, "train/z_loss": 0.0016780740465037526, "train/perplexity": 10.299938439521977, "train/grad_norm": 0.11083984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029133.355191885, "perf/iters_per_sec": 0.9675661827048707, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0335210323333741, "data/tokens_consumed": 28229763072, "data/tokens_consumed_B": 28.229763072, "train/loss_slope": -9.146714324962605e-06} {"step": 13470, "timestamp": 1778209089.4679859, "train/loss": 2.2601436138153077, "train/z_loss": 0.0017112054047174752, "train/perplexity": 9.584465529592043, "train/grad_norm": 0.279296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028511.311035181, "perf/iters_per_sec": 0.9672695689369111, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0338379621505738, "data/tokens_consumed": 28250734592, "data/tokens_consumed_B": 28.250734592, "train/loss_slope": -1.0740463954709199e-05} {"step": 13480, "timestamp": 1778209099.8159215, "train/loss": 2.232487940788269, "train/z_loss": 0.0017039498314261436, "train/perplexity": 9.323032401897727, "train/grad_norm": 0.1259765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027666.2923979133, "perf/iters_per_sec": 0.9668666326512877, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342688083648681, "data/tokens_consumed": 28271706112, "data/tokens_consumed_B": 28.271706112, "train/loss_slope": -1.3704195567662384e-05} {"step": 13490, "timestamp": 1778209110.1576154, "train/loss": 2.27755720615387, "train/z_loss": 0.0016897623776458205, "train/perplexity": 9.75282714095928, "train/grad_norm": 0.2001953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028917.2132586262, "perf/iters_per_sec": 0.967463118199647, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0336311340332032, "data/tokens_consumed": 28292677632, "data/tokens_consumed_B": 28.292677632, "train/loss_slope": -1.2579486458071901e-05} {"step": 13500, "timestamp": 1778209120.4945912, "grad/layer_0/attn": 0.005767180118709803, "grad/layer_0/mlp": 0.005361471325159073, "grad/layer_0/attn_mlp_ratio": 1.0756711472240463, "grad/layer_4/attn": 0.00643355306237936, "grad/layer_4/mlp": 0.0056857638992369175, "grad/layer_4/attn_mlp_ratio": 1.1315195395452227, "grad/layer_8/attn": 0.010407669469714165, "grad/layer_8/mlp": 0.006527759600430727, "grad/layer_8/attn_mlp_ratio": 1.5943708021340655, "grad/layer_12/attn": 0.006135070230811834, "grad/layer_12/mlp": 0.010328669100999832, "grad/layer_12/attn_mlp_ratio": 0.5939845793703945, "grad/layer_16/attn": 0.007204350549727678, "grad/layer_16/mlp": 0.006631465628743172, "grad/layer_16/attn_mlp_ratio": 1.0863888685274232, "grad/layer_20/attn": 0.02631799317896366, "grad/layer_20/mlp": 0.010999016463756561, "grad/layer_20/attn_mlp_ratio": 2.3927587549677365, "grad/layer_24/attn": 0.016481606289744377, "grad/layer_24/mlp": 0.014619331806898117, "grad/layer_24/attn_mlp_ratio": 1.127384369867651, "grad/layer_27/attn": 0.011310867965221405, "grad/layer_27/mlp": 0.013544362969696522, "grad/layer_27/attn_mlp_ratio": 0.8350978120579013} {"step": 13500, "timestamp": 1778209121.0786004, "eos/sharpness": 15.718817710876461, "eos/L0_probe": 2.100060224533081, "eos/L_plus": 2.195518732070923, "eos/L_minus": 2.161789894104004, "eos/grad_norm": 0.23896189033985138, "eos/embed_grad_frac": 0.19817465543746948, "eos/time_s": 0.5811042785644531} {"step": 13500, "timestamp": 1778209121.0987084, "train/loss": 2.2509283781051637, "train/z_loss": 0.001702604244928807, "train/perplexity": 9.496548132515048, "train/grad_norm": 0.2392578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1917729.2020168123, "perf/iters_per_sec": 0.9144445428928434, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.093560028076172, "data/tokens_consumed": 28313649152, "data/tokens_consumed_B": 28.313649152, "train/loss_slope": -1.6163340789435507e-05} {"step": 13500, "timestamp": 1778209122.4590833, "geo/rankme_last": 441.8536376953125, "geo/layer_0/stable_rank_q_proj": 15.562315940856934, "geo/layer_0/stable_rank_k_proj": 13.806230545043945, "geo/layer_0/stable_rank_o_proj": 54.78963851928711, "geo/layer_0/stable_rank_gate_proj": 161.1667022705078, "geo/layer_0/stable_rank_down_proj": 48.616172790527344, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.04458088055253029, "geo/layer_0/attn_entropy_mean": 6.313943862915039, "geo/layer_0/attn_entropy_std": 0.2896259129047394, "geo/layer_7/stable_rank_q_proj": 43.17682647705078, "geo/layer_7/stable_rank_k_proj": 43.51909637451172, "geo/layer_7/stable_rank_o_proj": 110.74652099609375, "geo/layer_7/stable_rank_gate_proj": 115.85274505615234, "geo/layer_7/stable_rank_down_proj": 159.5637969970703, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5829843282699585, "geo/layer_7/attn_entropy_mean": 4.646997451782227, "geo/layer_7/attn_entropy_std": 0.8806950449943542, "geo/layer_14/stable_rank_q_proj": 59.6960563659668, "geo/layer_14/stable_rank_k_proj": 37.75358581542969, "geo/layer_14/stable_rank_o_proj": 50.54371643066406, "geo/layer_14/stable_rank_gate_proj": 100.20073699951172, "geo/layer_14/stable_rank_down_proj": 137.9618682861328, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.41191986203193665, "geo/layer_14/attn_entropy_mean": 5.510091781616211, "geo/layer_14/attn_entropy_std": 0.5953454971313477, "geo/layer_21/stable_rank_q_proj": 51.01326370239258, "geo/layer_21/stable_rank_k_proj": 32.36366653442383, "geo/layer_21/stable_rank_o_proj": 88.76453399658203, "geo/layer_21/stable_rank_gate_proj": 99.10301971435547, "geo/layer_21/stable_rank_down_proj": 66.4861068725586, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15670692920684814, "geo/layer_21/attn_entropy_mean": 5.7810163497924805, "geo/layer_21/attn_entropy_std": 0.2973540723323822, "geo/layer_27/stable_rank_q_proj": 42.15980911254883, "geo/layer_27/stable_rank_k_proj": 32.61946487426758, "geo/layer_27/stable_rank_o_proj": 117.16776275634766, "geo/layer_27/stable_rank_gate_proj": 93.7511978149414, "geo/layer_27/stable_rank_down_proj": 144.85989379882812, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07263875007629395, "geo/layer_27/attn_entropy_mean": 4.447918891906738, "geo/layer_27/attn_entropy_std": 0.5123263597488403, "attnres/final_alpha/block_0": 0.2463379055261612, "attnres/block_norm/0": 1.5044442415237427, "attnres/final_alpha/block_1": 0.008177035488188267, "attnres/block_norm/1": 21918.455078125, "attnres/final_alpha/block_2": 0.017138060182332993, "attnres/block_norm/2": 16336.158203125, "attnres/final_alpha/block_3": 0.019148295745253563, "attnres/block_norm/3": 19845.056640625, "attnres/final_alpha/block_4": 0.025907333940267563, "attnres/block_norm/4": 6968.515625, "attnres/final_alpha/block_5": 0.5334509611129761, "attnres/block_norm/5": 4277.50341796875, "attnres/final_alpha/block_6": 0.14984041452407837, "attnres/block_norm/6": 13716.1083984375, "geo/tier1_time_s": 1.3564379215240479, "geo/step": 13500.0, "geo/rankme_slope": 0.0003364482902536014} {"step": 13500, "timestamp": 1778209129.497785, "geo/ww_alpha_mean": 8.347036550210337, "geo/ww_alpha_std": 5.252734403768603, "geo/ww_alpha_min": 2.7253166699764027, "geo/ww_alpha_max": 46.774113687866034, "geo/ww_alpha_healthy_frac": 0.16751269035532995, "geo/ww_alpha_by_type/q_proj": 4.241056430126359, "geo/ww_alpha_by_type/k_proj": 5.198022963632465, "geo/ww_alpha_by_type/v_proj": 9.191528273236925, "geo/ww_alpha_by_type/o_proj": 9.184453402637832, "geo/ww_alpha_by_type/gate_proj": 9.665599633636267, "geo/ww_alpha_by_type/up_proj": 11.41969427973068, "geo/ww_alpha_by_type/down_proj": 9.680298656924972, "geo/twonn_id/layer_0": 0.7100786566734314, "geo/twonn_id/layer_7": 2.9999442100524902, "geo/twonn_id/layer_14": 3.7853870391845703, "geo/twonn_id/layer_21": 7.349976539611816, "geo/twonn_id/layer_27": 5.577704429626465, "geo/tier2_time_s": 7.032484292984009} {"step": 13500, "timestamp": 1778209130.1450849, "eoc/jacobian_sigma/layer_0/attn": 678.9404907226562, "eoc/jacobian_sigma/layer_0/mlp": 3927.498046875, "eoc/jacobian_sigma/layer_0": 3927.498046875, "eoc/jacobian_sigma/layer_7/attn": 1.1708649396896362, "eoc/jacobian_sigma/layer_7/mlp": 1.5858166217803955, "eoc/jacobian_sigma/layer_7": 1.5858166217803955, "eoc/jacobian_sigma/layer_14/attn": 1.340842604637146, "eoc/jacobian_sigma/layer_14/mlp": 7.369242191314697, "eoc/jacobian_sigma/layer_14": 7.369242191314697, "eoc/jacobian_sigma/layer_21/attn": 1.0851494073867798, "eoc/jacobian_sigma/layer_21/mlp": 3.8294169902801514, "eoc/jacobian_sigma/layer_21": 3.8294169902801514, "eoc/jacobian_sigma/layer_27/attn": 2.362846851348877, "eoc/jacobian_sigma/layer_27/mlp": 22.21099281311035, "eoc/jacobian_sigma/layer_27": 22.21099281311035, "eoc/layer0_sigma": 3927.498046875, "eoc/sigma_max": 22.21099281311035, "eoc/sigma_min": 1.5858166217803955, "eoc/sigma_mean": 8.748867154121399, "eoc/time_s": 0.6401023864746094} {"step": 13510, "timestamp": 1778209140.5028028, "train/loss": 2.2851653337478637, "train/z_loss": 0.001695299125276506, "train/perplexity": 9.827310875847303, "train/grad_norm": 0.185546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1080989.016108606, "perf/iters_per_sec": 0.5154557304900198, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.940030813217163, "data/tokens_consumed": 28334620672, "data/tokens_consumed_B": 28.334620672, "train/loss_slope": -1.3401982314301198e-05} {"step": 13520, "timestamp": 1778209150.8529632, "train/loss": 2.296351599693298, "train/z_loss": 0.0016866293735802174, "train/perplexity": 9.937858946128902, "train/grad_norm": 0.1630859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027010.3934361138, "perf/iters_per_sec": 0.9665538756542749, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034603476524353, "data/tokens_consumed": 28355592192, "data/tokens_consumed_B": 28.355592192, "train/loss_slope": -1.2527007059474401e-05} {"step": 13530, "timestamp": 1778209161.2008398, "train/loss": 2.209303784370422, "train/z_loss": 0.0017038532998412848, "train/perplexity": 9.109372097567771, "train/grad_norm": 0.16015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028021.1228773212, "perf/iters_per_sec": 0.9670358290087324, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03408784866333, "data/tokens_consumed": 28376563712, "data/tokens_consumed_B": 28.376563712, "train/loss_slope": -2.1471322215858827e-05} {"step": 13540, "timestamp": 1778209171.5415866, "train/loss": 2.26731014251709, "train/z_loss": 0.0016895800712518395, "train/perplexity": 9.65339959083276, "train/grad_norm": 0.107421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029200.9031389556, "perf/iters_per_sec": 0.9675983920759943, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0334866285324096, "data/tokens_consumed": 28397535232, "data/tokens_consumed_B": 28.397535232, "train/loss_slope": -2.3560777563180365e-05} {"step": 13550, "timestamp": 1778209181.8732042, "grad/layer_0/attn": 0.0031401885207742453, "grad/layer_0/mlp": 0.0028502927161753178, "grad/layer_0/attn_mlp_ratio": 1.1017073414190215, "grad/layer_4/attn": 0.00165856524836272, "grad/layer_4/mlp": 0.0026385916862636805, "grad/layer_4/attn_mlp_ratio": 0.6285797056585606, "grad/layer_8/attn": 0.00453169597312808, "grad/layer_8/mlp": 0.003763567190617323, "grad/layer_8/attn_mlp_ratio": 1.2040959077377793, "grad/layer_12/attn": 0.005069813225418329, "grad/layer_12/mlp": 0.006125296000391245, "grad/layer_12/attn_mlp_ratio": 0.8276845955405326, "grad/layer_16/attn": 0.00587082002311945, "grad/layer_16/mlp": 0.005160699598491192, "grad/layer_16/attn_mlp_ratio": 1.1376015591133646, "grad/layer_20/attn": 0.0051747700199484825, "grad/layer_20/mlp": 0.007329469081014395, "grad/layer_20/attn_mlp_ratio": 0.7060224815942671, "grad/layer_24/attn": 0.012137353420257568, "grad/layer_24/mlp": 0.01312460470944643, "grad/layer_24/attn_mlp_ratio": 0.924778581639404, "grad/layer_27/attn": 0.006086922250688076, "grad/layer_27/mlp": 0.011549536138772964, "grad/layer_27/attn_mlp_ratio": 0.5270274169324358} {"step": 13550, "timestamp": 1778209181.8871994, "train/loss": 2.2185245513916017, "train/z_loss": 0.001707148109562695, "train/perplexity": 9.193755939384987, "train/grad_norm": 0.15625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028004.8980107156, "perf/iters_per_sec": 0.967028092389448, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034096121788025, "data/tokens_consumed": 28418506752, "data/tokens_consumed_B": 28.418506752, "train/loss_slope": -2.629701983917952e-05} {"step": 13560, "timestamp": 1778209192.2475674, "train/loss": 2.2744758129119873, "train/z_loss": 0.0016890946077182888, "train/perplexity": 9.72282109926716, "train/grad_norm": 0.11328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025658.8623452699, "perf/iters_per_sec": 0.9659094154096937, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352937698364257, "data/tokens_consumed": 28439478272, "data/tokens_consumed_B": 28.439478272, "train/loss_slope": -2.7320644922024715e-05} {"step": 13570, "timestamp": 1778209202.596419, "train/loss": 2.2406787872314453, "train/z_loss": 0.0017075726645998657, "train/perplexity": 9.399709525269733, "train/grad_norm": 0.2392578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027606.7454065604, "perf/iters_per_sec": 0.9668382384331514, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342991828918457, "data/tokens_consumed": 28460449792, "data/tokens_consumed_B": 28.460449792, "train/loss_slope": -2.9056702288213607e-05} {"step": 13575, "timestamp": 1778209208.3732946, "eos/sharpness": 19.857263565063473, "eos/L0_probe": 2.096073865890503, "eos/L_plus": 2.2315962314605713, "eos/L_minus": 2.1591241359710693, "eos/grad_norm": 0.13234001398086548, "eos/embed_grad_frac": 0.13913510739803314, "eos/time_s": 0.614262580871582} {"step": 13575, "timestamp": 1778209209.748369, "geo/rankme_last": 440.7648010253906, "geo/layer_0/stable_rank_q_proj": 15.526914596557617, "geo/layer_0/stable_rank_k_proj": 13.788677215576172, "geo/layer_0/stable_rank_o_proj": 54.698368072509766, "geo/layer_0/stable_rank_gate_proj": 160.98284912109375, "geo/layer_0/stable_rank_down_proj": 48.54018783569336, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.03965877369046211, "geo/layer_0/attn_entropy_mean": 6.310184478759766, "geo/layer_0/attn_entropy_std": 0.291207879781723, "geo/layer_7/stable_rank_q_proj": 43.17171096801758, "geo/layer_7/stable_rank_k_proj": 43.46914291381836, "geo/layer_7/stable_rank_o_proj": 110.53215789794922, "geo/layer_7/stable_rank_gate_proj": 115.70480346679688, "geo/layer_7/stable_rank_down_proj": 159.46923828125, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5850710868835449, "geo/layer_7/attn_entropy_mean": 4.627209663391113, "geo/layer_7/attn_entropy_std": 0.8687922954559326, "geo/layer_14/stable_rank_q_proj": 59.597354888916016, "geo/layer_14/stable_rank_k_proj": 37.776634216308594, "geo/layer_14/stable_rank_o_proj": 50.341552734375, "geo/layer_14/stable_rank_gate_proj": 99.82352447509766, "geo/layer_14/stable_rank_down_proj": 137.8448944091797, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3850727379322052, "geo/layer_14/attn_entropy_mean": 5.559545516967773, "geo/layer_14/attn_entropy_std": 0.5634886026382446, "geo/layer_21/stable_rank_q_proj": 50.85200119018555, "geo/layer_21/stable_rank_k_proj": 32.18398666381836, "geo/layer_21/stable_rank_o_proj": 88.66555786132812, "geo/layer_21/stable_rank_gate_proj": 98.86662292480469, "geo/layer_21/stable_rank_down_proj": 66.17607116699219, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15545107424259186, "geo/layer_21/attn_entropy_mean": 5.824741363525391, "geo/layer_21/attn_entropy_std": 0.29209786653518677, "geo/layer_27/stable_rank_q_proj": 42.015010833740234, "geo/layer_27/stable_rank_k_proj": 32.440391540527344, "geo/layer_27/stable_rank_o_proj": 117.00749206542969, "geo/layer_27/stable_rank_gate_proj": 93.79996490478516, "geo/layer_27/stable_rank_down_proj": 144.78717041015625, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07149051129817963, "geo/layer_27/attn_entropy_mean": 4.496469974517822, "geo/layer_27/attn_entropy_std": 0.47714436054229736, "attnres/final_alpha/block_0": 0.2457353174686432, "attnres/block_norm/0": 1.506065845489502, "attnres/final_alpha/block_1": 0.00810844823718071, "attnres/block_norm/1": 22100.513671875, "attnres/final_alpha/block_2": 0.016809608787298203, "attnres/block_norm/2": 16378.3505859375, "attnres/final_alpha/block_3": 0.01853560283780098, "attnres/block_norm/3": 20028.8046875, "attnres/final_alpha/block_4": 0.025671660900115967, "attnres/block_norm/4": 7056.5869140625, "attnres/final_alpha/block_5": 0.5337603092193604, "attnres/block_norm/5": 4326.2900390625, "attnres/final_alpha/block_6": 0.1513790488243103, "attnres/block_norm/6": 13905.384765625, "geo/tier1_time_s": 1.3566114902496338, "geo/step": 13575.0, "geo/rankme_slope": 0.00031058784842061824} {"step": 13580, "timestamp": 1778209214.9268374, "train/loss": 2.230370044708252, "train/z_loss": 0.0016988276387564839, "train/perplexity": 9.303308082521978, "train/grad_norm": 0.1806640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1701509.7571263653, "perf/iters_per_sec": 0.8113430772430255, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2325242280960083, "data/tokens_consumed": 28481421312, "data/tokens_consumed_B": 28.481421312, "train/loss_slope": -2.95910523311891e-05} {"step": 13590, "timestamp": 1778209225.285045, "train/loss": 2.2745797395706178, "train/z_loss": 0.0017001904197968543, "train/perplexity": 9.723831612085169, "train/grad_norm": 0.10693359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025798.5392313756, "perf/iters_per_sec": 0.965976018539131, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352223873138429, "data/tokens_consumed": 28502392832, "data/tokens_consumed_B": 28.502392832, "train/loss_slope": -3.1567379614987114e-05} {"step": 13600, "timestamp": 1778209235.6192935, "grad/layer_0/attn": 0.003135759849101305, "grad/layer_0/mlp": 0.002871088683605194, "grad/layer_0/attn_mlp_ratio": 1.0921849115246678, "grad/layer_4/attn": 0.002272404031828046, "grad/layer_4/mlp": 0.0027072392404079437, "grad/layer_4/attn_mlp_ratio": 0.8393805445681894, "grad/layer_8/attn": 0.0038508190773427486, "grad/layer_8/mlp": 0.004041759762912989, "grad/layer_8/attn_mlp_ratio": 0.9527580083808779, "grad/layer_12/attn": 0.004320577252656221, "grad/layer_12/mlp": 0.005864460486918688, "grad/layer_12/attn_mlp_ratio": 0.7367390723528325, "grad/layer_16/attn": 0.004247457720339298, "grad/layer_16/mlp": 0.004656550008803606, "grad/layer_16/attn_mlp_ratio": 0.912146894394869, "grad/layer_20/attn": 0.009578143246471882, "grad/layer_20/mlp": 0.006477161776274443, "grad/layer_20/attn_mlp_ratio": 1.4787561943690488, "grad/layer_24/attn": 0.01909405179321766, "grad/layer_24/mlp": 0.01396229863166809, "grad/layer_24/attn_mlp_ratio": 1.3675435657246156, "grad/layer_27/attn": 0.007944786921143532, "grad/layer_27/mlp": 0.012533127330243587, "grad/layer_27/attn_mlp_ratio": 0.6339029875314306} {"step": 13600, "timestamp": 1778209235.6336706, "train/loss": 2.29533965587616, "train/z_loss": 0.001696178177371621, "train/perplexity": 9.927807477831124, "train/grad_norm": 0.20703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027594.967295119, "perf/iters_per_sec": 0.9668326221919628, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343051910400392, "data/tokens_consumed": 28523364352, "data/tokens_consumed_B": 28.523364352, "train/loss_slope": -3.025265483215269e-05} {"step": 13610, "timestamp": 1778209245.9855735, "train/loss": 2.3042243242263796, "train/z_loss": 0.0016837210045196117, "train/perplexity": 10.016405755062754, "train/grad_norm": 0.1806640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027007.7308943868, "perf/iters_per_sec": 0.9665526060554441, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034604835510254, "data/tokens_consumed": 28544335872, "data/tokens_consumed_B": 28.544335872, "train/loss_slope": -2.8057711059325382e-05} {"step": 13620, "timestamp": 1778209256.3400552, "train/loss": 2.2916170358657837, "train/z_loss": 0.0016895802458748222, "train/perplexity": 9.890918727055926, "train/grad_norm": 0.126953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026514.0670884717, "perf/iters_per_sec": 0.9663172088091239, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348568677902221, "data/tokens_consumed": 28565307392, "data/tokens_consumed_B": 28.565307392, "train/loss_slope": -2.708877763911263e-05} {"step": 13630, "timestamp": 1778209266.6931858, "train/loss": 2.2486024618148805, "train/z_loss": 0.0017036890843883158, "train/perplexity": 9.474485624232189, "train/grad_norm": 0.220703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026699.2970883949, "perf/iters_per_sec": 0.9664055333559012, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347622871398925, "data/tokens_consumed": 28586278912, "data/tokens_consumed_B": 28.586278912, "train/loss_slope": -2.629669631096179e-05} {"step": 13640, "timestamp": 1778209277.0362353, "train/loss": 2.2854847431182863, "train/z_loss": 0.001688878214918077, "train/perplexity": 9.830450312383137, "train/grad_norm": 0.1435546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028507.8492825383, "perf/iters_per_sec": 0.9672679182446186, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033839726448059, "data/tokens_consumed": 28607250432, "data/tokens_consumed_B": 28.607250432, "train/loss_slope": -2.5971140397979127e-05} {"step": 13650, "timestamp": 1778209287.3842378, "grad/layer_0/attn": 0.0032361233606934547, "grad/layer_0/mlp": 0.0027826933655887842, "grad/layer_0/attn_mlp_ratio": 1.162946404522041, "grad/layer_4/attn": 0.0014741546474397182, "grad/layer_4/mlp": 0.0027424537111073732, "grad/layer_4/attn_mlp_ratio": 0.5375312581269949, "grad/layer_8/attn": 0.013173316605389118, "grad/layer_8/mlp": 0.0042166998609900475, "grad/layer_8/attn_mlp_ratio": 3.1240820374367186, "grad/layer_12/attn": 0.004893025849014521, "grad/layer_12/mlp": 0.0060900007374584675, "grad/layer_12/attn_mlp_ratio": 0.8034524098779141, "grad/layer_16/attn": 0.004025096073746681, "grad/layer_16/mlp": 0.004432810470461845, "grad/layer_16/attn_mlp_ratio": 0.9080234785054926, "grad/layer_20/attn": 0.004031195305287838, "grad/layer_20/mlp": 0.0062930891290307045, "grad/layer_20/attn_mlp_ratio": 0.6405749479431336, "grad/layer_24/attn": 0.013790329918265343, "grad/layer_24/mlp": 0.009448511525988579, "grad/layer_24/attn_mlp_ratio": 1.4595240461295922, "grad/layer_27/attn": 0.006409896537661552, "grad/layer_27/mlp": 0.00802608858793974, "grad/layer_27/attn_mlp_ratio": 0.7986326574354045} {"step": 13650, "timestamp": 1778209287.970916, "eos/sharpness": 32.96115398406982, "eos/L0_probe": 2.0980639457702637, "eos/L_plus": 2.308589458465576, "eos/L_minus": 2.2171499729156494, "eos/grad_norm": 0.14211943745613098, "eos/embed_grad_frac": 0.13038457930088043, "eos/time_s": 0.5837538242340088} {"step": 13650, "timestamp": 1778209287.988923, "train/loss": 2.2431787967681887, "train/z_loss": 0.001691105542704463, "train/perplexity": 9.423238287535895, "train/grad_norm": 0.1416015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1916101.399250707, "perf/iters_per_sec": 0.9136683460477385, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.094489049911499, "data/tokens_consumed": 28628221952, "data/tokens_consumed_B": 28.628221952, "train/loss_slope": -2.6314966470459102e-05} {"step": 13650, "timestamp": 1778209289.347912, "geo/rankme_last": 440.40728759765625, "geo/layer_0/stable_rank_q_proj": 15.536815643310547, "geo/layer_0/stable_rank_k_proj": 13.804173469543457, "geo/layer_0/stable_rank_o_proj": 54.68734359741211, "geo/layer_0/stable_rank_gate_proj": 160.74673461914062, "geo/layer_0/stable_rank_down_proj": 48.55087661743164, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.03986095264554024, "geo/layer_0/attn_entropy_mean": 6.309562683105469, "geo/layer_0/attn_entropy_std": 0.28728851675987244, "geo/layer_7/stable_rank_q_proj": 43.07278060913086, "geo/layer_7/stable_rank_k_proj": 43.466793060302734, "geo/layer_7/stable_rank_o_proj": 110.643798828125, "geo/layer_7/stable_rank_gate_proj": 115.6454849243164, "geo/layer_7/stable_rank_down_proj": 159.5907745361328, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5696315169334412, "geo/layer_7/attn_entropy_mean": 4.698905944824219, "geo/layer_7/attn_entropy_std": 0.8995437622070312, "geo/layer_14/stable_rank_q_proj": 59.41532516479492, "geo/layer_14/stable_rank_k_proj": 37.77104568481445, "geo/layer_14/stable_rank_o_proj": 50.471988677978516, "geo/layer_14/stable_rank_gate_proj": 99.48946380615234, "geo/layer_14/stable_rank_down_proj": 138.4097137451172, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38445520401000977, "geo/layer_14/attn_entropy_mean": 5.5402913093566895, "geo/layer_14/attn_entropy_std": 0.5965225696563721, "geo/layer_21/stable_rank_q_proj": 50.7848014831543, "geo/layer_21/stable_rank_k_proj": 32.275604248046875, "geo/layer_21/stable_rank_o_proj": 88.55946350097656, "geo/layer_21/stable_rank_gate_proj": 98.64151763916016, "geo/layer_21/stable_rank_down_proj": 65.9872817993164, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15569202601909637, "geo/layer_21/attn_entropy_mean": 5.796696186065674, "geo/layer_21/attn_entropy_std": 0.28883495926856995, "geo/layer_27/stable_rank_q_proj": 42.06498336791992, "geo/layer_27/stable_rank_k_proj": 32.48463439941406, "geo/layer_27/stable_rank_o_proj": 117.14079284667969, "geo/layer_27/stable_rank_gate_proj": 93.89860534667969, "geo/layer_27/stable_rank_down_proj": 144.9234619140625, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07042647898197174, "geo/layer_27/attn_entropy_mean": 4.461409568786621, "geo/layer_27/attn_entropy_std": 0.5043047666549683, "attnres/final_alpha/block_0": 0.24426651000976562, "attnres/block_norm/0": 1.507759690284729, "attnres/final_alpha/block_1": 0.008137580007314682, "attnres/block_norm/1": 22141.08203125, "attnres/final_alpha/block_2": 0.017096206545829773, "attnres/block_norm/2": 16539.341796875, "attnres/final_alpha/block_3": 0.01890142634510994, "attnres/block_norm/3": 20164.310546875, "attnres/final_alpha/block_4": 0.0255088172852993, "attnres/block_norm/4": 7081.564453125, "attnres/final_alpha/block_5": 0.5372132062911987, "attnres/block_norm/5": 4334.951171875, "attnres/final_alpha/block_6": 0.14887627959251404, "attnres/block_norm/6": 14119.4794921875, "geo/tier1_time_s": 1.3554637432098389, "geo/step": 13650.0, "geo/rankme_slope": 0.00027624739348864545} {"step": 13660, "timestamp": 1778209299.6939542, "train/loss": 2.287836956977844, "train/z_loss": 0.0016805318067781628, "train/perplexity": 9.853600850687995, "train/grad_norm": 0.1171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1792214.6745270898, "perf/iters_per_sec": 0.8545945522914361, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.170145535469055, "data/tokens_consumed": 28649193472, "data/tokens_consumed_B": 28.649193472, "train/loss_slope": -2.3900922604448366e-05} {"step": 13670, "timestamp": 1778209310.0554225, "train/loss": 2.3603601932525633, "train/z_loss": 0.0016634643776342272, "train/perplexity": 10.594766928798343, "train/grad_norm": 0.14453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024905.527210324, "perf/iters_per_sec": 0.9655501972247715, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0356789350509643, "data/tokens_consumed": 28670164992, "data/tokens_consumed_B": 28.670164992, "train/loss_slope": -1.8163902145085857e-05} {"step": 13680, "timestamp": 1778209320.8103535, "train/loss": 2.2932315826416017, "train/z_loss": 0.0016761331469751894, "train/perplexity": 9.906900976569457, "train/grad_norm": 0.11181640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1951229.2397278494, "perf/iters_per_sec": 0.9304186056746718, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.074785041809082, "data/tokens_consumed": 28691136512, "data/tokens_consumed_B": 28.691136512, "train/loss_slope": -2.0524716062514285e-05} {"step": 13690, "timestamp": 1778209331.152501, "train/loss": 2.2776952266693113, "train/z_loss": 0.0016850598389282823, "train/perplexity": 9.754173324086594, "train/grad_norm": 0.15625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028699.4797437214, "perf/iters_per_sec": 0.9673592947691543, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033742070198059, "data/tokens_consumed": 28712108032, "data/tokens_consumed_B": 28.712108032, "train/loss_slope": -1.7598389856743076e-05} {"step": 13700, "timestamp": 1778209341.4863791, "grad/layer_0/attn": 0.0036274369340389967, "grad/layer_0/mlp": 0.0030122576281428337, "grad/layer_0/attn_mlp_ratio": 1.2042252892734524, "grad/layer_4/attn": 0.0021389354951679707, "grad/layer_4/mlp": 0.002572567667812109, "grad/layer_4/attn_mlp_ratio": 0.831439902936773, "grad/layer_8/attn": 0.005291968118399382, "grad/layer_8/mlp": 0.004193868488073349, "grad/layer_8/attn_mlp_ratio": 1.2618345108496822, "grad/layer_12/attn": 0.004738987889140844, "grad/layer_12/mlp": 0.006357942707836628, "grad/layer_12/attn_mlp_ratio": 0.7453649761208447, "grad/layer_16/attn": 0.006390085443854332, "grad/layer_16/mlp": 0.005034600850194693, "grad/layer_16/attn_mlp_ratio": 1.2692337500169149, "grad/layer_20/attn": 0.0056785461492836475, "grad/layer_20/mlp": 0.006947238929569721, "grad/layer_20/attn_mlp_ratio": 0.8173817145363645, "grad/layer_24/attn": 0.02221244014799595, "grad/layer_24/mlp": 0.013864925131201744, "grad/layer_24/attn_mlp_ratio": 1.6020598580660852, "grad/layer_27/attn": 0.009476246312260628, "grad/layer_27/mlp": 0.013365072198212147, "grad/layer_27/attn_mlp_ratio": 0.7090306809285477} {"step": 13700, "timestamp": 1778209341.5008361, "train/loss": 2.2753324270248414, "train/z_loss": 0.0016808569664135576, "train/perplexity": 9.7311533732998, "train/grad_norm": 0.26953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027780.3949523924, "perf/iters_per_sec": 0.9669210409891092, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342106103897095, "data/tokens_consumed": 28733079552, "data/tokens_consumed_B": 28.733079552, "train/loss_slope": -1.5139727595329545e-05} {"step": 13710, "timestamp": 1778209351.847356, "train/loss": 2.313474178314209, "train/z_loss": 0.0016834412701427937, "train/perplexity": 10.109485871872321, "train/grad_norm": 0.1767578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028132.2257690334, "perf/iters_per_sec": 0.9670888069958846, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340312004089356, "data/tokens_consumed": 28754051072, "data/tokens_consumed_B": 28.754051072, "train/loss_slope": -1.6673631257492938e-05} {"step": 13720, "timestamp": 1778209362.191846, "train/loss": 2.2246177434921264, "train/z_loss": 0.001704191614408046, "train/perplexity": 9.249946275858097, "train/grad_norm": 0.1328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028208.5926855353, "perf/iters_per_sec": 0.9671252215793301, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0339922666549684, "data/tokens_consumed": 28775022592, "data/tokens_consumed_B": 28.775022592, "train/loss_slope": -1.8695258829567164e-05} {"step": 13725, "timestamp": 1778209367.959543, "eos/sharpness": 26.795625686645504, "eos/L0_probe": 2.097165584564209, "eos/L_plus": 2.2384696006774902, "eos/L_minus": 2.223817825317383, "eos/grad_norm": 0.1644500344991684, "eos/embed_grad_frac": 0.10524848848581314, "eos/time_s": 0.5836570262908936} {"step": 13725, "timestamp": 1778209369.3326283, "geo/rankme_last": 441.018798828125, "geo/layer_0/stable_rank_q_proj": 15.535449981689453, "geo/layer_0/stable_rank_k_proj": 13.833531379699707, "geo/layer_0/stable_rank_o_proj": 54.63385009765625, "geo/layer_0/stable_rank_gate_proj": 159.61236572265625, "geo/layer_0/stable_rank_down_proj": 48.5530891418457, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0453525111079216, "geo/layer_0/attn_entropy_mean": 6.310320854187012, "geo/layer_0/attn_entropy_std": 0.28616002202033997, "geo/layer_7/stable_rank_q_proj": 43.069339752197266, "geo/layer_7/stable_rank_k_proj": 43.423274993896484, "geo/layer_7/stable_rank_o_proj": 110.35332489013672, "geo/layer_7/stable_rank_gate_proj": 115.4197998046875, "geo/layer_7/stable_rank_down_proj": 159.22012329101562, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5916282534599304, "geo/layer_7/attn_entropy_mean": 4.644161224365234, "geo/layer_7/attn_entropy_std": 0.9000115990638733, "geo/layer_14/stable_rank_q_proj": 59.403106689453125, "geo/layer_14/stable_rank_k_proj": 37.63526153564453, "geo/layer_14/stable_rank_o_proj": 50.741207122802734, "geo/layer_14/stable_rank_gate_proj": 99.3383560180664, "geo/layer_14/stable_rank_down_proj": 138.7719268798828, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38163071870803833, "geo/layer_14/attn_entropy_mean": 5.626298904418945, "geo/layer_14/attn_entropy_std": 0.550333559513092, "geo/layer_21/stable_rank_q_proj": 50.70331954956055, "geo/layer_21/stable_rank_k_proj": 32.229068756103516, "geo/layer_21/stable_rank_o_proj": 88.51802062988281, "geo/layer_21/stable_rank_gate_proj": 98.5240707397461, "geo/layer_21/stable_rank_down_proj": 66.00543975830078, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15625208616256714, "geo/layer_21/attn_entropy_mean": 5.807346343994141, "geo/layer_21/attn_entropy_std": 0.28940361738204956, "geo/layer_27/stable_rank_q_proj": 42.008338928222656, "geo/layer_27/stable_rank_k_proj": 32.505802154541016, "geo/layer_27/stable_rank_o_proj": 117.17814636230469, "geo/layer_27/stable_rank_gate_proj": 93.58891296386719, "geo/layer_27/stable_rank_down_proj": 145.41139221191406, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.06861072778701782, "geo/layer_27/attn_entropy_mean": 4.458600044250488, "geo/layer_27/attn_entropy_std": 0.5140997171401978, "attnres/final_alpha/block_0": 0.2449110448360443, "attnres/block_norm/0": 1.509432077407837, "attnres/final_alpha/block_1": 0.008140970952808857, "attnres/block_norm/1": 22362.158203125, "attnres/final_alpha/block_2": 0.016768276691436768, "attnres/block_norm/2": 16502.353515625, "attnres/final_alpha/block_3": 0.019019456580281258, "attnres/block_norm/3": 19970.158203125, "attnres/final_alpha/block_4": 0.025268513709306717, "attnres/block_norm/4": 7120.8505859375, "attnres/final_alpha/block_5": 0.5355843305587769, "attnres/block_norm/5": 4334.0205078125, "attnres/final_alpha/block_6": 0.15030738711357117, "attnres/block_norm/6": 14052.88671875, "geo/tier1_time_s": 1.3557283878326416, "geo/step": 13725.0, "geo/rankme_slope": 0.00028003900779061624} {"step": 13730, "timestamp": 1778209374.5129929, "train/loss": 2.2586414098739622, "train/z_loss": 0.0016844849335029722, "train/perplexity": 9.570078516517171, "train/grad_norm": 0.197265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1703041.4655635918, "perf/iters_per_sec": 0.8120734527414283, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2314157009124755, "data/tokens_consumed": 28795994112, "data/tokens_consumed_B": 28.795994112, "train/loss_slope": -2.193505974313785e-05} {"step": 13740, "timestamp": 1778209384.8585138, "train/loss": 2.258500337600708, "train/z_loss": 0.0016919665969908238, "train/perplexity": 9.568728539010062, "train/grad_norm": 0.1650390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028663.1719741349, "perf/iters_per_sec": 0.9673419818754839, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0337605714797973, "data/tokens_consumed": 28816965632, "data/tokens_consumed_B": 28.816965632, "train/loss_slope": -1.996864413413921e-05} {"step": 13750, "timestamp": 1778209395.1932108, "grad/layer_0/attn": 0.0027670611161738634, "grad/layer_0/mlp": 0.0027878470718860626, "grad/layer_0/attn_mlp_ratio": 0.9925440476358189, "grad/layer_4/attn": 0.0017352940049022436, "grad/layer_4/mlp": 0.002778551308438182, "grad/layer_4/attn_mlp_ratio": 0.6245319052339035, "grad/layer_8/attn": 0.006478574592620134, "grad/layer_8/mlp": 0.004100820980966091, "grad/layer_8/attn_mlp_ratio": 1.579823763267887, "grad/layer_12/attn": 0.003971846308559179, "grad/layer_12/mlp": 0.006229354999959469, "grad/layer_12/attn_mlp_ratio": 0.6376015245277994, "grad/layer_16/attn": 0.004256054293364286, "grad/layer_16/mlp": 0.004339650738984346, "grad/layer_16/attn_mlp_ratio": 0.9807365733507665, "grad/layer_20/attn": 0.004108329303562641, "grad/layer_20/mlp": 0.005937857087701559, "grad/layer_20/attn_mlp_ratio": 0.6918875233428954, "grad/layer_24/attn": 0.010123137384653091, "grad/layer_24/mlp": 0.009868993423879147, "grad/layer_24/attn_mlp_ratio": 1.0257517506885596, "grad/layer_27/attn": 0.007006141357123852, "grad/layer_27/mlp": 0.007700120564550161, "grad/layer_27/attn_mlp_ratio": 0.909874229553148} {"step": 13750, "timestamp": 1778209395.2073317, "train/loss": 2.2584519147872926, "train/z_loss": 0.0016922594048082828, "train/perplexity": 9.568265205471441, "train/grad_norm": 0.1259765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027418.0316615917, "perf/iters_per_sec": 0.9667482527072867, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034395456314087, "data/tokens_consumed": 28837937152, "data/tokens_consumed_B": 28.837937152, "train/loss_slope": -2.060145499861493e-05} {"step": 13760, "timestamp": 1778209405.5557914, "train/loss": 2.2814145565032957, "train/z_loss": 0.0016890706378035248, "train/perplexity": 9.790519862419197, "train/grad_norm": 0.10498046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027655.4484153173, "perf/iters_per_sec": 0.966861461837443, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342743396759033, "data/tokens_consumed": 28858908672, "data/tokens_consumed_B": 28.858908672, "train/loss_slope": -1.9618476928621664e-05} {"step": 13770, "timestamp": 1778209415.9041467, "train/loss": 2.2568788290023805, "train/z_loss": 0.001697404566220939, "train/perplexity": 9.553225336094277, "train/grad_norm": 0.12353515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027719.579170109, "perf/iters_per_sec": 0.9668920417643113, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342416286468505, "data/tokens_consumed": 28879880192, "data/tokens_consumed_B": 28.879880192, "train/loss_slope": -1.9892783691458894e-05} {"step": 13780, "timestamp": 1778209426.2461998, "train/loss": 2.238177752494812, "train/z_loss": 0.0016952646081335842, "train/perplexity": 9.37622989915213, "train/grad_norm": 0.259765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028726.805004721, "perf/iters_per_sec": 0.9673723244689565, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0337281465530395, "data/tokens_consumed": 28900851712, "data/tokens_consumed_B": 28.900851712, "train/loss_slope": -2.393629545211219e-05} {"step": 13790, "timestamp": 1778209436.6042902, "train/loss": 2.23562273979187, "train/z_loss": 0.0017055067233741283, "train/perplexity": 9.352304091041734, "train/grad_norm": 0.1337890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025821.8205851582, "perf/iters_per_sec": 0.9659871199537078, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352104902267456, "data/tokens_consumed": 28921823232, "data/tokens_consumed_B": 28.921823232, "train/loss_slope": -2.4575244539892057e-05} {"step": 13800, "timestamp": 1778209446.9447145, "grad/layer_0/attn": 0.002879345091059804, "grad/layer_0/mlp": 0.0028928767424076796, "grad/layer_0/attn_mlp_ratio": 0.9953223894119829, "grad/layer_4/attn": 0.0017449139850214124, "grad/layer_4/mlp": 0.0026319678872823715, "grad/layer_4/attn_mlp_ratio": 0.6629693041301451, "grad/layer_8/attn": 0.006380083970725536, "grad/layer_8/mlp": 0.004004000686109066, "grad/layer_8/attn_mlp_ratio": 1.5934272522772048, "grad/layer_12/attn": 0.003821260528638959, "grad/layer_12/mlp": 0.006132568698376417, "grad/layer_12/attn_mlp_ratio": 0.6231092800215513, "grad/layer_16/attn": 0.003930509556084871, "grad/layer_16/mlp": 0.004349380265921354, "grad/layer_16/attn_mlp_ratio": 0.9036941415567018, "grad/layer_20/attn": 0.005476965103298426, "grad/layer_20/mlp": 0.006450970657169819, "grad/layer_20/attn_mlp_ratio": 0.8490140956244684, "grad/layer_24/attn": 0.01039957907050848, "grad/layer_24/mlp": 0.012139801867306232, "grad/layer_24/attn_mlp_ratio": 0.8566514592672635, "grad/layer_27/attn": 0.005833619739860296, "grad/layer_27/mlp": 0.010619373992085457, "grad/layer_27/attn_mlp_ratio": 0.5493374363944906} {"step": 13800, "timestamp": 1778209447.540355, "eos/sharpness": 31.14707469940185, "eos/L0_probe": 2.100503921508789, "eos/L_plus": 2.265491008758545, "eos/L_minus": 2.2469875812530518, "eos/grad_norm": 0.15938346087932587, "eos/embed_grad_frac": 0.12615729868412018, "eos/time_s": 0.5927653312683105} {"step": 13800, "timestamp": 1778209447.5580487, "train/loss": 2.1823419094085694, "train/z_loss": 0.0017120818607509135, "train/perplexity": 8.867047784486628, "train/grad_norm": 0.1591796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1915523.1062121722, "perf/iters_per_sec": 0.913392594438635, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.094819474220276, "data/tokens_consumed": 28942794752, "data/tokens_consumed_B": 28.942794752, "train/loss_slope": -2.9527958441596095e-05} {"step": 13800, "timestamp": 1778209448.9188898, "geo/rankme_last": 441.3358154296875, "geo/layer_0/stable_rank_q_proj": 15.558697700500488, "geo/layer_0/stable_rank_k_proj": 13.85201644897461, "geo/layer_0/stable_rank_o_proj": 54.56315231323242, "geo/layer_0/stable_rank_gate_proj": 159.68328857421875, "geo/layer_0/stable_rank_down_proj": 48.50263214111328, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.03697618097066879, "geo/layer_0/attn_entropy_mean": 6.309409141540527, "geo/layer_0/attn_entropy_std": 0.2854669392108917, "geo/layer_7/stable_rank_q_proj": 43.18730926513672, "geo/layer_7/stable_rank_k_proj": 43.221038818359375, "geo/layer_7/stable_rank_o_proj": 110.5761947631836, "geo/layer_7/stable_rank_gate_proj": 115.26898956298828, "geo/layer_7/stable_rank_down_proj": 159.110107421875, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.593454122543335, "geo/layer_7/attn_entropy_mean": 4.653789043426514, "geo/layer_7/attn_entropy_std": 0.8853647112846375, "geo/layer_14/stable_rank_q_proj": 59.21158981323242, "geo/layer_14/stable_rank_k_proj": 37.470123291015625, "geo/layer_14/stable_rank_o_proj": 50.75409698486328, "geo/layer_14/stable_rank_gate_proj": 99.29095458984375, "geo/layer_14/stable_rank_down_proj": 139.19569396972656, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39372026920318604, "geo/layer_14/attn_entropy_mean": 5.563930988311768, "geo/layer_14/attn_entropy_std": 0.5965588688850403, "geo/layer_21/stable_rank_q_proj": 50.773921966552734, "geo/layer_21/stable_rank_k_proj": 32.2315559387207, "geo/layer_21/stable_rank_o_proj": 88.46520233154297, "geo/layer_21/stable_rank_gate_proj": 98.18241119384766, "geo/layer_21/stable_rank_down_proj": 65.8871078491211, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15135245025157928, "geo/layer_21/attn_entropy_mean": 5.799790382385254, "geo/layer_21/attn_entropy_std": 0.29125767946243286, "geo/layer_27/stable_rank_q_proj": 41.995201110839844, "geo/layer_27/stable_rank_k_proj": 32.482452392578125, "geo/layer_27/stable_rank_o_proj": 116.98532104492188, "geo/layer_27/stable_rank_gate_proj": 93.53123474121094, "geo/layer_27/stable_rank_down_proj": 145.5184783935547, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.06683751195669174, "geo/layer_27/attn_entropy_mean": 4.47432804107666, "geo/layer_27/attn_entropy_std": 0.5098513960838318, "attnres/final_alpha/block_0": 0.24642547965049744, "attnres/block_norm/0": 1.511136770248413, "attnres/final_alpha/block_1": 0.008271312341094017, "attnres/block_norm/1": 22286.896484375, "attnres/final_alpha/block_2": 0.0174008309841156, "attnres/block_norm/2": 16506.525390625, "attnres/final_alpha/block_3": 0.01915040798485279, "attnres/block_norm/3": 20333.26171875, "attnres/final_alpha/block_4": 0.02585470676422119, "attnres/block_norm/4": 7122.3154296875, "attnres/final_alpha/block_5": 0.530613899230957, "attnres/block_norm/5": 4398.09033203125, "attnres/final_alpha/block_6": 0.15228337049484253, "attnres/block_norm/6": 14083.564453125, "geo/tier1_time_s": 1.3568382263183594, "geo/step": 13800.0, "geo/rankme_slope": 0.00030057890343637456} {"step": 13810, "timestamp": 1778209459.2727294, "train/loss": 2.2943358421325684, "train/z_loss": 0.0016712927026674151, "train/perplexity": 9.917846808405978, "train/grad_norm": 0.2001953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1790793.3717501545, "perf/iters_per_sec": 0.8539168223143361, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1710742473602296, "data/tokens_consumed": 28963766272, "data/tokens_consumed_B": 28.963766272, "train/loss_slope": -2.4648217843024522e-05} {"step": 13820, "timestamp": 1778209469.6286013, "train/loss": 2.255053448677063, "train/z_loss": 0.0016877296729944646, "train/perplexity": 9.535802972579862, "train/grad_norm": 0.142578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026214.278234563, "perf/iters_per_sec": 0.9661742583439651, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350099802017212, "data/tokens_consumed": 28984737792, "data/tokens_consumed_B": 28.984737792, "train/loss_slope": -2.572194456231512e-05} {"step": 13830, "timestamp": 1778209479.9739127, "train/loss": 2.285434532165527, "train/z_loss": 0.0016902307746931911, "train/perplexity": 9.829956728498663, "train/grad_norm": 0.20703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028290.6713192954, "perf/iters_per_sec": 0.9671643597218015, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033950424194336, "data/tokens_consumed": 29005709312, "data/tokens_consumed_B": 29.005709312, "train/loss_slope": -2.4684686758051126e-05} {"step": 13840, "timestamp": 1778209490.3218315, "train/loss": 2.3232311487197874, "train/z_loss": 0.0016685151611454784, "train/perplexity": 10.208606598996463, "train/grad_norm": 0.1376953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027622.122591328, "perf/iters_per_sec": 0.9668455708462372, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342913389205932, "data/tokens_consumed": 29026680832, "data/tokens_consumed_B": 29.026680832, "train/loss_slope": -2.1849081275200614e-05} {"step": 13850, "timestamp": 1778209501.0460873, "grad/layer_0/attn": 0.0029491307213902473, "grad/layer_0/mlp": 0.0026646004989743233, "grad/layer_0/attn_mlp_ratio": 1.106781527605093, "grad/layer_4/attn": 0.0016281172866001725, "grad/layer_4/mlp": 0.0025916642043739557, "grad/layer_4/attn_mlp_ratio": 0.628213030465557, "grad/layer_8/attn": 0.009808588773012161, "grad/layer_8/mlp": 0.003909289371222258, "grad/layer_8/attn_mlp_ratio": 2.5090464252434757, "grad/layer_12/attn": 0.003784094238653779, "grad/layer_12/mlp": 0.0060787508264184, "grad/layer_12/attn_mlp_ratio": 0.6225118094916529, "grad/layer_16/attn": 0.004427706822752953, "grad/layer_16/mlp": 0.004599396139383316, "grad/layer_16/attn_mlp_ratio": 0.9626713142998562, "grad/layer_20/attn": 0.0037343893200159073, "grad/layer_20/mlp": 0.0061495485715568066, "grad/layer_20/attn_mlp_ratio": 0.6072623406150743, "grad/layer_24/attn": 0.013171486556529999, "grad/layer_24/mlp": 0.012542039155960083, "grad/layer_24/attn_mlp_ratio": 1.0501869981208038, "grad/layer_27/attn": 0.006308420095592737, "grad/layer_27/mlp": 0.013452385552227497, "grad/layer_27/attn_mlp_ratio": 0.46894433884655734} {"step": 13850, "timestamp": 1778209501.060762, "train/loss": 2.2918860912323, "train/z_loss": 0.0016724823974072934, "train/perplexity": 9.893580289857036, "train/grad_norm": 0.1845703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1953915.7586455303, "perf/iters_per_sec": 0.9316996377208377, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0733072757720947, "data/tokens_consumed": 29047652352, "data/tokens_consumed_B": 29.047652352, "train/loss_slope": -2.0798990506865513e-05} {"step": 13860, "timestamp": 1778209511.9583304, "train/loss": 2.2747488975524903, "train/z_loss": 0.0016848543658852578, "train/perplexity": 9.725476614945498, "train/grad_norm": 0.138671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1925487.4073137494, "perf/iters_per_sec": 0.9181439434593913, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0891538381576538, "data/tokens_consumed": 29068623872, "data/tokens_consumed_B": 29.068623872, "train/loss_slope": -1.9322677900438535e-05} {"step": 13870, "timestamp": 1778209522.3186193, "train/loss": 2.272910404205322, "train/z_loss": 0.0016939187771640718, "train/perplexity": 9.707612817159294, "train/grad_norm": 0.1533203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025623.3630403962, "perf/iters_per_sec": 0.9658924880220395, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0353119134902955, "data/tokens_consumed": 29089595392, "data/tokens_consumed_B": 29.089595392, "train/loss_slope": -1.711528673209197e-05} {"step": 13875, "timestamp": 1778209528.0917702, "eos/sharpness": 18.485760688781735, "eos/L0_probe": 2.0992422103881836, "eos/L_plus": 2.2314887046813965, "eos/L_minus": 2.151853322982788, "eos/grad_norm": 0.14512178301811218, "eos/embed_grad_frac": 0.1558036357164383, "eos/time_s": 0.5926611423492432} {"step": 13875, "timestamp": 1778209529.4731617, "geo/rankme_last": 442.2387390136719, "geo/layer_0/stable_rank_q_proj": 15.585762977600098, "geo/layer_0/stable_rank_k_proj": 13.868672370910645, "geo/layer_0/stable_rank_o_proj": 54.52030563354492, "geo/layer_0/stable_rank_gate_proj": 159.17657470703125, "geo/layer_0/stable_rank_down_proj": 48.5223388671875, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.036723580211400986, "geo/layer_0/attn_entropy_mean": 6.3077006340026855, "geo/layer_0/attn_entropy_std": 0.28723496198654175, "geo/layer_7/stable_rank_q_proj": 43.170841217041016, "geo/layer_7/stable_rank_k_proj": 43.15627670288086, "geo/layer_7/stable_rank_o_proj": 110.62419891357422, "geo/layer_7/stable_rank_gate_proj": 115.19270324707031, "geo/layer_7/stable_rank_down_proj": 158.1197052001953, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5694912075996399, "geo/layer_7/attn_entropy_mean": 4.652063369750977, "geo/layer_7/attn_entropy_std": 0.8876270651817322, "geo/layer_14/stable_rank_q_proj": 59.35738754272461, "geo/layer_14/stable_rank_k_proj": 37.6593017578125, "geo/layer_14/stable_rank_o_proj": 50.54021453857422, "geo/layer_14/stable_rank_gate_proj": 99.07463073730469, "geo/layer_14/stable_rank_down_proj": 138.8302764892578, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3820653259754181, "geo/layer_14/attn_entropy_mean": 5.557127952575684, "geo/layer_14/attn_entropy_std": 0.5803000330924988, "geo/layer_21/stable_rank_q_proj": 50.71248245239258, "geo/layer_21/stable_rank_k_proj": 32.08034896850586, "geo/layer_21/stable_rank_o_proj": 88.45076751708984, "geo/layer_21/stable_rank_gate_proj": 97.9766845703125, "geo/layer_21/stable_rank_down_proj": 65.78763580322266, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15918144583702087, "geo/layer_21/attn_entropy_mean": 5.780916213989258, "geo/layer_21/attn_entropy_std": 0.2956952452659607, "geo/layer_27/stable_rank_q_proj": 41.98224639892578, "geo/layer_27/stable_rank_k_proj": 32.571617126464844, "geo/layer_27/stable_rank_o_proj": 116.97516632080078, "geo/layer_27/stable_rank_gate_proj": 93.55967712402344, "geo/layer_27/stable_rank_down_proj": 145.3650360107422, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.06801388412714005, "geo/layer_27/attn_entropy_mean": 4.4960527420043945, "geo/layer_27/attn_entropy_std": 0.4866141080856323, "attnres/final_alpha/block_0": 0.24465756118297577, "attnres/block_norm/0": 1.51277756690979, "attnres/final_alpha/block_1": 0.008090009912848473, "attnres/block_norm/1": 22490.34375, "attnres/final_alpha/block_2": 0.0168924443423748, "attnres/block_norm/2": 16558.05078125, "attnres/final_alpha/block_3": 0.018699632957577705, "attnres/block_norm/3": 20399.9140625, "attnres/final_alpha/block_4": 0.02539980597794056, "attnres/block_norm/4": 7159.50244140625, "attnres/final_alpha/block_5": 0.5351431965827942, "attnres/block_norm/5": 4365.869140625, "attnres/final_alpha/block_6": 0.15111736953258514, "attnres/block_norm/6": 14159.83203125, "geo/tier1_time_s": 1.362210988998413, "geo/step": 13875.0, "geo/rankme_slope": 0.00035680371758078233} {"step": 13880, "timestamp": 1778209534.6442971, "train/loss": 2.272266387939453, "train/z_loss": 0.001692525716498494, "train/perplexity": 9.701362969320133, "train/grad_norm": 0.138671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1702588.4688611166, "perf/iters_per_sec": 0.8118574470811446, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.231743335723877, "data/tokens_consumed": 29110566912, "data/tokens_consumed_B": 29.110566912, "train/loss_slope": -1.964119577994408e-05} {"step": 13890, "timestamp": 1778209544.984412, "train/loss": 2.286002826690674, "train/z_loss": 0.0016756547964178025, "train/perplexity": 9.835544626725495, "train/grad_norm": 0.1806640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029162.5177525044, "perf/iters_per_sec": 0.9675800884974024, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033506178855896, "data/tokens_consumed": 29131538432, "data/tokens_consumed_B": 29.131538432, "train/loss_slope": -1.9227066814023184e-05} {"step": 13900, "timestamp": 1778209555.313522, "grad/layer_0/attn": 0.004466493614017963, "grad/layer_0/mlp": 0.003505701432004571, "grad/layer_0/attn_mlp_ratio": 1.2740655681158368, "grad/layer_4/attn": 0.0017804221715778112, "grad/layer_4/mlp": 0.0027481671422719955, "grad/layer_4/attn_mlp_ratio": 0.6478580139489184, "grad/layer_8/attn": 0.00812823697924614, "grad/layer_8/mlp": 0.004362919833511114, "grad/layer_8/attn_mlp_ratio": 1.8630268497054083, "grad/layer_12/attn": 0.006209731567651033, "grad/layer_12/mlp": 0.0072454726323485374, "grad/layer_12/attn_mlp_ratio": 0.8570498843956342, "grad/layer_16/attn": 0.005064132157713175, "grad/layer_16/mlp": 0.005906058009713888, "grad/layer_16/attn_mlp_ratio": 0.8574470592126465, "grad/layer_20/attn": 0.004822938237339258, "grad/layer_20/mlp": 0.0075905583798885345, "grad/layer_20/attn_mlp_ratio": 0.635386480470154, "grad/layer_24/attn": 0.01876726746559143, "grad/layer_24/mlp": 0.013865628279745579, "grad/layer_24/attn_mlp_ratio": 1.3535100575034877, "grad/layer_27/attn": 0.016341088339686394, "grad/layer_27/mlp": 0.011218965984880924, "grad/layer_27/attn_mlp_ratio": 1.4565592066196031} {"step": 13900, "timestamp": 1778209555.3276813, "train/loss": 2.332649517059326, "train/z_loss": 0.0016756143653765322, "train/perplexity": 10.305209221621569, "train/grad_norm": 0.267578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028911.316570988, "perf/iters_per_sec": 0.9674603064398708, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0336341381072998, "data/tokens_consumed": 29152509952, "data/tokens_consumed_B": 29.152509952, "train/loss_slope": -1.814105168785238e-05} {"step": 13910, "timestamp": 1778209566.1976304, "train/loss": 2.3540110111236574, "train/z_loss": 0.0016535975853912532, "train/perplexity": 10.52771192144188, "train/grad_norm": 0.1796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1930192.424326768, "perf/iters_per_sec": 0.9203874704011764, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0864989280700683, "data/tokens_consumed": 29173481472, "data/tokens_consumed_B": 29.173481472, "train/loss_slope": -1.080756077993896e-05} {"step": 13920, "timestamp": 1778209576.8829393, "train/loss": 2.2857459783554077, "train/z_loss": 0.001673876284621656, "train/perplexity": 9.833018707864598, "train/grad_norm": 0.162109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1963840.7563930869, "perf/iters_per_sec": 0.936432245441955, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0678829193115233, "data/tokens_consumed": 29194452992, "data/tokens_consumed_B": 29.194452992, "train/loss_slope": -1.071227684129729e-05} {"step": 13930, "timestamp": 1778209587.2360067, "train/loss": 2.2702390909194947, "train/z_loss": 0.0016783548635430634, "train/perplexity": 9.681715347594583, "train/grad_norm": 0.1298828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026911.370275991, "perf/iters_per_sec": 0.9665066577320056, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346540212631226, "data/tokens_consumed": 29215424512, "data/tokens_consumed_B": 29.215424512, "train/loss_slope": -9.176461824668371e-06} {"step": 13940, "timestamp": 1778209597.5832186, "train/loss": 2.288164520263672, "train/z_loss": 0.0016793687595054508, "train/perplexity": 9.856829057251991, "train/grad_norm": 0.2177734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028258.0261439602, "perf/iters_per_sec": 0.9671487932891656, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0339670658111573, "data/tokens_consumed": 29236396032, "data/tokens_consumed_B": 29.236396032, "train/loss_slope": -6.566530550130701e-06} {"step": 13950, "timestamp": 1778209607.9174428, "grad/layer_0/attn": 0.0026984356809407473, "grad/layer_0/mlp": 0.002621107269078493, "grad/layer_0/attn_mlp_ratio": 1.0295021534693714, "grad/layer_4/attn": 0.0019608582369983196, "grad/layer_4/mlp": 0.002646641805768013, "grad/layer_4/attn_mlp_ratio": 0.740885358432844, "grad/layer_8/attn": 0.004024325404316187, "grad/layer_8/mlp": 0.003915902692824602, "grad/layer_8/attn_mlp_ratio": 1.027687768881867, "grad/layer_12/attn": 0.004784835036844015, "grad/layer_12/mlp": 0.006158033385872841, "grad/layer_12/attn_mlp_ratio": 0.7770069857237569, "grad/layer_16/attn": 0.004327800124883652, "grad/layer_16/mlp": 0.004621499218046665, "grad/layer_16/attn_mlp_ratio": 0.9364493700093959, "grad/layer_20/attn": 0.004590325988829136, "grad/layer_20/mlp": 0.005779115483164787, "grad/layer_20/attn_mlp_ratio": 0.7942955842934292, "grad/layer_24/attn": 0.007692571263760328, "grad/layer_24/mlp": 0.00964164175093174, "grad/layer_24/attn_mlp_ratio": 0.7978486841446969, "grad/layer_27/attn": 0.00873664952814579, "grad/layer_27/mlp": 0.008216043934226036, "grad/layer_27/attn_mlp_ratio": 1.063364496557107} {"step": 13950, "timestamp": 1778209608.514089, "eos/sharpness": 51.269912719726555, "eos/L0_probe": 2.0971603393554688, "eos/L_plus": 2.2665328979492188, "eos/L_minus": 2.4404869079589844, "eos/grad_norm": 0.1639481484889984, "eos/embed_grad_frac": 0.10615600645542145, "eos/time_s": 0.5937991142272949} {"step": 13950, "timestamp": 1778209608.5330217, "train/loss": 2.2240175008773804, "train/z_loss": 0.0016926983604207634, "train/perplexity": 9.244395729922969, "train/grad_norm": 0.1640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1916183.1704056787, "perf/iters_per_sec": 0.9137073375728982, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0944423437118531, "data/tokens_consumed": 29257367552, "data/tokens_consumed_B": 29.257367552, "train/loss_slope": -9.128962348539727e-06} {"step": 13950, "timestamp": 1778209609.8973997, "geo/rankme_last": 440.8209533691406, "geo/layer_0/stable_rank_q_proj": 15.60422134399414, "geo/layer_0/stable_rank_k_proj": 13.8844633102417, "geo/layer_0/stable_rank_o_proj": 54.69601058959961, "geo/layer_0/stable_rank_gate_proj": 159.37698364257812, "geo/layer_0/stable_rank_down_proj": 48.583839416503906, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.04256972298026085, "geo/layer_0/attn_entropy_mean": 6.305718421936035, "geo/layer_0/attn_entropy_std": 0.29200515151023865, "geo/layer_7/stable_rank_q_proj": 43.169734954833984, "geo/layer_7/stable_rank_k_proj": 43.231597900390625, "geo/layer_7/stable_rank_o_proj": 110.81175994873047, "geo/layer_7/stable_rank_gate_proj": 114.68484497070312, "geo/layer_7/stable_rank_down_proj": 157.9522247314453, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.57414710521698, "geo/layer_7/attn_entropy_mean": 4.65303897857666, "geo/layer_7/attn_entropy_std": 0.8492146730422974, "geo/layer_14/stable_rank_q_proj": 59.230228424072266, "geo/layer_14/stable_rank_k_proj": 37.55903244018555, "geo/layer_14/stable_rank_o_proj": 50.66776657104492, "geo/layer_14/stable_rank_gate_proj": 98.7925796508789, "geo/layer_14/stable_rank_down_proj": 138.89308166503906, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38618004322052, "geo/layer_14/attn_entropy_mean": 5.584115028381348, "geo/layer_14/attn_entropy_std": 0.5491439700126648, "geo/layer_21/stable_rank_q_proj": 50.57240295410156, "geo/layer_21/stable_rank_k_proj": 32.07693099975586, "geo/layer_21/stable_rank_o_proj": 88.71179962158203, "geo/layer_21/stable_rank_gate_proj": 97.78675842285156, "geo/layer_21/stable_rank_down_proj": 65.69233703613281, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15405315160751343, "geo/layer_21/attn_entropy_mean": 5.777942657470703, "geo/layer_21/attn_entropy_std": 0.2849614918231964, "geo/layer_27/stable_rank_q_proj": 42.072975158691406, "geo/layer_27/stable_rank_k_proj": 32.60446548461914, "geo/layer_27/stable_rank_o_proj": 116.87158203125, "geo/layer_27/stable_rank_gate_proj": 93.52987670898438, "geo/layer_27/stable_rank_down_proj": 145.7146759033203, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.06965602934360504, "geo/layer_27/attn_entropy_mean": 4.495703220367432, "geo/layer_27/attn_entropy_std": 0.49577078223228455, "attnres/final_alpha/block_0": 0.2452135682106018, "attnres/block_norm/0": 1.514318585395813, "attnres/final_alpha/block_1": 0.00806360598653555, "attnres/block_norm/1": 22542.294921875, "attnres/final_alpha/block_2": 0.017069987952709198, "attnres/block_norm/2": 16673.28515625, "attnres/final_alpha/block_3": 0.01910894364118576, "attnres/block_norm/3": 20496.04296875, "attnres/final_alpha/block_4": 0.025560801848769188, "attnres/block_norm/4": 7215.6533203125, "attnres/final_alpha/block_5": 0.5321825742721558, "attnres/block_norm/5": 4392.9912109375, "attnres/final_alpha/block_6": 0.15280048549175262, "attnres/block_norm/6": 14223.818359375, "geo/tier1_time_s": 1.360971450805664, "geo/step": 13950.0, "geo/rankme_slope": 0.0003902915267669568} {"step": 13960, "timestamp": 1778209620.239476, "train/loss": 2.275937354564667, "train/z_loss": 0.0016829836880788208, "train/perplexity": 9.737041796824789, "train/grad_norm": 0.1884765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1791959.3503755638, "perf/iters_per_sec": 0.8544728042486018, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.170312261581421, "data/tokens_consumed": 29278339072, "data/tokens_consumed_B": 29.278339072, "train/loss_slope": -1.1932061521848936e-05} {"step": 13970, "timestamp": 1778209630.5811732, "train/loss": 2.2833334684371946, "train/z_loss": 0.0016751023358665407, "train/perplexity": 9.809325044796305, "train/grad_norm": 0.1875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029072.1305848663, "perf/iters_per_sec": 0.9675369885372478, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0335522174835206, "data/tokens_consumed": 29299310592, "data/tokens_consumed_B": 29.299310592, "train/loss_slope": -9.375027668858847e-06} {"step": 13980, "timestamp": 1778209640.932421, "train/loss": 2.2629251003265383, "train/z_loss": 0.001684100995771587, "train/perplexity": 9.611161701489362, "train/grad_norm": 0.1572265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027300.3253506038, "perf/iters_per_sec": 0.9666921259644526, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344555139541627, "data/tokens_consumed": 29320282112, "data/tokens_consumed_B": 29.320282112, "train/loss_slope": -1.1172822399465631e-05} {"step": 13990, "timestamp": 1778209651.2895703, "train/loss": 2.2831351280212404, "train/z_loss": 0.0016703176195733249, "train/perplexity": 9.807379652118062, "train/grad_norm": 0.2177734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026009.8639581932, "perf/iters_per_sec": 0.9660767860213247, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351144075393677, "data/tokens_consumed": 29341253632, "data/tokens_consumed_B": 29.341253632, "train/loss_slope": -1.1164304539851902e-05} {"step": 14000, "timestamp": 1778209661.6232731, "grad/layer_0/attn": 0.0029716903809458017, "grad/layer_0/mlp": 0.0028190126176923513, "grad/layer_0/attn_mlp_ratio": 1.0541599767518723, "grad/layer_4/attn": 0.0018514911644160748, "grad/layer_4/mlp": 0.002645049709826708, "grad/layer_4/attn_mlp_ratio": 0.699983477640966, "grad/layer_8/attn": 0.005397839471697807, "grad/layer_8/mlp": 0.003887459170073271, "grad/layer_8/attn_mlp_ratio": 1.3885263090090378, "grad/layer_12/attn": 0.005031230393797159, "grad/layer_12/mlp": 0.006178695242851973, "grad/layer_12/attn_mlp_ratio": 0.8142868542009769, "grad/layer_16/attn": 0.005265893414616585, "grad/layer_16/mlp": 0.004634039010852575, "grad/layer_16/attn_mlp_ratio": 1.1363506627046487, "grad/layer_20/attn": 0.004353060387074947, "grad/layer_20/mlp": 0.006374270189553499, "grad/layer_20/attn_mlp_ratio": 0.6829111709004527, "grad/layer_24/attn": 0.012081097811460495, "grad/layer_24/mlp": 0.011166488751769066, "grad/layer_24/attn_mlp_ratio": 1.0819065842300584, "grad/layer_27/attn": 0.012292422354221344, "grad/layer_27/mlp": 0.007974106818437576, "grad/layer_27/attn_mlp_ratio": 1.5415422040302773} {"step": 14000, "timestamp": 1778209661.6374192, "train/loss": 2.266887092590332, "train/z_loss": 0.0016883912379853428, "train/perplexity": 9.649316584561541, "train/grad_norm": 0.1552734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027773.4764473813, "perf/iters_per_sec": 0.9669177419888407, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342141389846802, "data/tokens_consumed": 29362225152, "data/tokens_consumed_B": 29.362225152, "train/loss_slope": -1.04943304923144e-05} {"step": 14000, "timestamp": 1778209668.827614, "geo/ww_alpha_mean": 8.115225824067325, "geo/ww_alpha_std": 5.2811728940441185, "geo/ww_alpha_min": 1.3784462112726215, "geo/ww_alpha_max": 44.65160475691088, "geo/ww_alpha_healthy_frac": 0.15228426395939088, "geo/ww_alpha_by_type/q_proj": 4.257951893147531, "geo/ww_alpha_by_type/k_proj": 4.682151751987787, "geo/ww_alpha_by_type/v_proj": 7.3963695778310194, "geo/ww_alpha_by_type/o_proj": 9.865911838628818, "geo/ww_alpha_by_type/gate_proj": 10.357952570863054, "geo/ww_alpha_by_type/up_proj": 10.943897962638959, "geo/ww_alpha_by_type/down_proj": 9.443855024827922, "geo/twonn_id/layer_0": 0.6744548082351685, "geo/twonn_id/layer_7": 2.9192051887512207, "geo/twonn_id/layer_14": 3.9688658714294434, "geo/twonn_id/layer_21": 6.892454147338867, "geo/twonn_id/layer_27": 6.124117851257324, "geo/tier2_time_s": 7.183164834976196} {"step": 14000, "timestamp": 1778209669.4580042, "eoc/jacobian_sigma/layer_0/attn": 721.0940551757812, "eoc/jacobian_sigma/layer_0/mlp": 4222.39990234375, "eoc/jacobian_sigma/layer_0": 4222.39990234375, "eoc/jacobian_sigma/layer_7/attn": 1.150426983833313, "eoc/jacobian_sigma/layer_7/mlp": 1.6829473972320557, "eoc/jacobian_sigma/layer_7": 1.6829473972320557, "eoc/jacobian_sigma/layer_14/attn": 1.320157527923584, "eoc/jacobian_sigma/layer_14/mlp": 7.565035343170166, "eoc/jacobian_sigma/layer_14": 7.565035343170166, "eoc/jacobian_sigma/layer_21/attn": 1.0854321718215942, "eoc/jacobian_sigma/layer_21/mlp": 4.023275375366211, "eoc/jacobian_sigma/layer_21": 4.023275375366211, "eoc/jacobian_sigma/layer_27/attn": 2.5215492248535156, "eoc/jacobian_sigma/layer_27/mlp": 24.490619659423828, "eoc/jacobian_sigma/layer_27": 24.490619659423828, "eoc/layer0_sigma": 4222.39990234375, "eoc/sigma_max": 24.490619659423828, "eoc/sigma_min": 1.6829473972320557, "eoc/sigma_mean": 9.440469443798065, "eoc/time_s": 0.6234142780303955} {"step": 14010, "timestamp": 1778209679.8272662, "train/loss": 2.2657647371292113, "train/z_loss": 0.0016842743963934482, "train/perplexity": 9.638492696657847, "train/grad_norm": 0.171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1153301.1160939096, "perf/iters_per_sec": 0.5499368267507122, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.818390679359436, "data/tokens_consumed": 29383196672, "data/tokens_consumed_B": 29.383196672, "train/loss_slope": -1.1995519620321439e-05} {"step": 14020, "timestamp": 1778209690.1728935, "train/loss": 2.2732330799102782, "train/z_loss": 0.0016834811540320516, "train/perplexity": 9.710745733399307, "train/grad_norm": 0.1865234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028504.10686054, "perf/iters_per_sec": 0.9672661337187481, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0338416337966918, "data/tokens_consumed": 29404168192, "data/tokens_consumed_B": 29.404168192, "train/loss_slope": -1.1200279092202156e-05} {"step": 14025, "timestamp": 1778209695.9421499, "eos/sharpness": 59.704256057739244, "eos/L0_probe": 2.097027063369751, "eos/L_plus": 2.309445858001709, "eos/L_minus": 2.4816508293151855, "eos/grad_norm": 0.2655409574508667, "eos/embed_grad_frac": 0.034492675215005875, "eos/time_s": 0.6076028347015381} {"step": 14025, "timestamp": 1778209697.3211114, "geo/rankme_last": 441.36505126953125, "geo/layer_0/stable_rank_q_proj": 15.630422592163086, "geo/layer_0/stable_rank_k_proj": 13.921184539794922, "geo/layer_0/stable_rank_o_proj": 54.68905258178711, "geo/layer_0/stable_rank_gate_proj": 159.3994903564453, "geo/layer_0/stable_rank_down_proj": 48.64189147949219, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.043755900114774704, "geo/layer_0/attn_entropy_mean": 6.312645435333252, "geo/layer_0/attn_entropy_std": 0.2831392288208008, "geo/layer_7/stable_rank_q_proj": 43.12228012084961, "geo/layer_7/stable_rank_k_proj": 43.08895492553711, "geo/layer_7/stable_rank_o_proj": 110.82304382324219, "geo/layer_7/stable_rank_gate_proj": 114.46478271484375, "geo/layer_7/stable_rank_down_proj": 158.05325317382812, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.585969865322113, "geo/layer_7/attn_entropy_mean": 4.647155284881592, "geo/layer_7/attn_entropy_std": 0.9050701856613159, "geo/layer_14/stable_rank_q_proj": 59.28146743774414, "geo/layer_14/stable_rank_k_proj": 37.48382568359375, "geo/layer_14/stable_rank_o_proj": 50.693321228027344, "geo/layer_14/stable_rank_gate_proj": 98.31848907470703, "geo/layer_14/stable_rank_down_proj": 138.55828857421875, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39714595675468445, "geo/layer_14/attn_entropy_mean": 5.557395935058594, "geo/layer_14/attn_entropy_std": 0.5777034759521484, "geo/layer_21/stable_rank_q_proj": 50.279544830322266, "geo/layer_21/stable_rank_k_proj": 32.1684684753418, "geo/layer_21/stable_rank_o_proj": 88.40901947021484, "geo/layer_21/stable_rank_gate_proj": 97.66692352294922, "geo/layer_21/stable_rank_down_proj": 65.8037338256836, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15812692046165466, "geo/layer_21/attn_entropy_mean": 5.772012710571289, "geo/layer_21/attn_entropy_std": 0.29501432180404663, "geo/layer_27/stable_rank_q_proj": 42.0095329284668, "geo/layer_27/stable_rank_k_proj": 32.611083984375, "geo/layer_27/stable_rank_o_proj": 117.04966735839844, "geo/layer_27/stable_rank_gate_proj": 93.48388671875, "geo/layer_27/stable_rank_down_proj": 145.33621215820312, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07100091874599457, "geo/layer_27/attn_entropy_mean": 4.448247909545898, "geo/layer_27/attn_entropy_std": 0.5069920420646667, "attnres/final_alpha/block_0": 0.24476179480552673, "attnres/block_norm/0": 1.5158441066741943, "attnres/final_alpha/block_1": 0.008113774470984936, "attnres/block_norm/1": 22696.330078125, "attnres/final_alpha/block_2": 0.01670830324292183, "attnres/block_norm/2": 16666.65234375, "attnres/final_alpha/block_3": 0.01899610087275505, "attnres/block_norm/3": 20626.005859375, "attnres/final_alpha/block_4": 0.0252310112118721, "attnres/block_norm/4": 7210.9365234375, "attnres/final_alpha/block_5": 0.53435218334198, "attnres/block_norm/5": 4432.830078125, "attnres/final_alpha/block_6": 0.1518368124961853, "attnres/block_norm/6": 14402.0361328125, "geo/tier1_time_s": 1.3598964214324951, "geo/step": 14025.0, "geo/rankme_slope": 0.00038787880386529613} {"step": 14030, "timestamp": 1778209702.4930303, "train/loss": 2.272343564033508, "train/z_loss": 0.0016782371210865676, "train/perplexity": 9.702111711513243, "train/grad_norm": 0.1982421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1702884.8904013697, "perf/iters_per_sec": 0.8119987918860291, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.231528925895691, "data/tokens_consumed": 29425139712, "data/tokens_consumed_B": 29.425139712, "train/loss_slope": -8.981337062310403e-06} {"step": 14040, "timestamp": 1778209712.8553097, "train/loss": 2.2490617513656614, "train/z_loss": 0.0016823433456011117, "train/perplexity": 9.478838155938076, "train/grad_norm": 0.185546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024781.633824508, "perf/iters_per_sec": 0.9654911202547588, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0357423067092895, "data/tokens_consumed": 29446111232, "data/tokens_consumed_B": 29.446111232, "train/loss_slope": -7.5857970223139755e-06} {"step": 14050, "timestamp": 1778209723.186494, "grad/layer_0/attn": 0.0028008741792291403, "grad/layer_0/mlp": 0.0027583653572946787, "grad/layer_0/attn_mlp_ratio": 1.0154108375386026, "grad/layer_4/attn": 0.0017449635779485106, "grad/layer_4/mlp": 0.002599473809823394, "grad/layer_4/attn_mlp_ratio": 0.671275665185292, "grad/layer_8/attn": 0.004501278046518564, "grad/layer_8/mlp": 0.003962913993746042, "grad/layer_8/attn_mlp_ratio": 1.13585052313451, "grad/layer_12/attn": 0.004455683287233114, "grad/layer_12/mlp": 0.005991244222968817, "grad/layer_12/attn_mlp_ratio": 0.7436991461274957, "grad/layer_16/attn": 0.0042238314636051655, "grad/layer_16/mlp": 0.004471042193472385, "grad/layer_16/attn_mlp_ratio": 0.9447084564088912, "grad/layer_20/attn": 0.005323959980159998, "grad/layer_20/mlp": 0.006309017539024353, "grad/layer_20/attn_mlp_ratio": 0.8438651284201057, "grad/layer_24/attn": 0.021848341450095177, "grad/layer_24/mlp": 0.014822477474808693, "grad/layer_24/attn_mlp_ratio": 1.4740006412441589, "grad/layer_27/attn": 0.006251992657780647, "grad/layer_27/mlp": 0.01362529769539833, "grad/layer_27/attn_mlp_ratio": 0.45885181752813736} {"step": 14050, "timestamp": 1778209723.200681, "train/loss": 2.2754729986190796, "train/z_loss": 0.0016787969158031046, "train/perplexity": 9.732521393193379, "train/grad_norm": 0.197265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028179.410772152, "perf/iters_per_sec": 0.9671113065586815, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340071439743042, "data/tokens_consumed": 29467082752, "data/tokens_consumed_B": 29.467082752, "train/loss_slope": -6.57736889564206e-06} {"step": 14060, "timestamp": 1778209733.5476646, "train/loss": 2.2988540649414064, "train/z_loss": 0.0016730496427044272, "train/perplexity": 9.96275923584254, "train/grad_norm": 0.10546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028309.9408844758, "perf/iters_per_sec": 0.9671735481665019, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033940601348877, "data/tokens_consumed": 29488054272, "data/tokens_consumed_B": 29.488054272, "train/loss_slope": -5.912056077968084e-06} {"step": 14070, "timestamp": 1778209743.9012969, "train/loss": 2.2695847511291505, "train/z_loss": 0.0016794599359855056, "train/perplexity": 9.675382288216202, "train/grad_norm": 0.169921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026703.2663397018, "perf/iters_per_sec": 0.9664074260424146, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347602605819701, "data/tokens_consumed": 29509025792, "data/tokens_consumed_B": 29.509025792, "train/loss_slope": -8.230490846173286e-06} {"step": 14080, "timestamp": 1778209754.260242, "train/loss": 2.2407964706420898, "train/z_loss": 0.0017095597344450653, "train/perplexity": 9.400815780238386, "train/grad_norm": 0.2060546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025627.234783235, "perf/iters_per_sec": 0.9658943342128921, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0353099346160888, "data/tokens_consumed": 29529997312, "data/tokens_consumed_B": 29.529997312, "train/loss_slope": -1.0252301241591187e-05} {"step": 14090, "timestamp": 1778209764.6225846, "train/loss": 2.3260236263275145, "train/z_loss": 0.0016570911859162152, "train/perplexity": 10.237153744411703, "train/grad_norm": 0.1298828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025024.6805233893, "perf/iters_per_sec": 0.965607013951964, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035617995262146, "data/tokens_consumed": 29550968832, "data/tokens_consumed_B": 29.550968832, "train/loss_slope": -7.558448315382125e-06} {"step": 14100, "timestamp": 1778209774.968939, "grad/layer_0/attn": 0.003162694862112403, "grad/layer_0/mlp": 0.0029425041284412146, "grad/layer_0/attn_mlp_ratio": 1.0748310339006153, "grad/layer_4/attn": 0.001894178451038897, "grad/layer_4/mlp": 0.0026442324742674828, "grad/layer_4/attn_mlp_ratio": 0.7163433615757612, "grad/layer_8/attn": 0.0053632743656635284, "grad/layer_8/mlp": 0.004147551022469997, "grad/layer_8/attn_mlp_ratio": 1.2931183262834696, "grad/layer_12/attn": 0.006653573364019394, "grad/layer_12/mlp": 0.006211759056895971, "grad/layer_12/attn_mlp_ratio": 1.0711254567287825, "grad/layer_16/attn": 0.004553627222776413, "grad/layer_16/mlp": 0.004608285613358021, "grad/layer_16/attn_mlp_ratio": 0.9881390838195712, "grad/layer_20/attn": 0.004426289349794388, "grad/layer_20/mlp": 0.00706335436552763, "grad/layer_20/attn_mlp_ratio": 0.6266554186678137, "grad/layer_24/attn": 0.013899988494813442, "grad/layer_24/mlp": 0.015687095001339912, "grad/layer_24/attn_mlp_ratio": 0.8860779134070638, "grad/layer_27/attn": 0.007985534146428108, "grad/layer_27/mlp": 0.01340898685157299, "grad/layer_27/attn_mlp_ratio": 0.5955359771225175} {"step": 14100, "timestamp": 1778209775.569817, "eos/sharpness": 22.544980049133297, "eos/L0_probe": 2.094820261001587, "eos/L_plus": 2.2152950763702393, "eos/L_minus": 2.1997952461242676, "eos/grad_norm": 0.19981759786605835, "eos/embed_grad_frac": 0.17776349186897278, "eos/time_s": 0.5978434085845947} {"step": 14100, "timestamp": 1778209775.5893352, "train/loss": 2.2609492540359497, "train/z_loss": 0.00168707788689062, "train/perplexity": 9.592190271779724, "train/grad_norm": 0.2001953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1913270.8908457004, "perf/iters_per_sec": 0.9123186544636251, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0961082458496094, "data/tokens_consumed": 29571940352, "data/tokens_consumed_B": 29.571940352, "train/loss_slope": -8.708080471438314e-06} {"step": 14100, "timestamp": 1778209776.9591966, "geo/rankme_last": 439.7659606933594, "geo/layer_0/stable_rank_q_proj": 15.658187866210938, "geo/layer_0/stable_rank_k_proj": 13.964446067810059, "geo/layer_0/stable_rank_o_proj": 54.55971908569336, "geo/layer_0/stable_rank_gate_proj": 159.141845703125, "geo/layer_0/stable_rank_down_proj": 48.68779754638672, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.03699927777051926, "geo/layer_0/attn_entropy_mean": 6.314637184143066, "geo/layer_0/attn_entropy_std": 0.2839110791683197, "geo/layer_7/stable_rank_q_proj": 43.19234085083008, "geo/layer_7/stable_rank_k_proj": 42.94142532348633, "geo/layer_7/stable_rank_o_proj": 111.26573944091797, "geo/layer_7/stable_rank_gate_proj": 114.30509185791016, "geo/layer_7/stable_rank_down_proj": 158.0856475830078, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5888662934303284, "geo/layer_7/attn_entropy_mean": 4.67805290222168, "geo/layer_7/attn_entropy_std": 0.86848384141922, "geo/layer_14/stable_rank_q_proj": 59.38559341430664, "geo/layer_14/stable_rank_k_proj": 37.3822135925293, "geo/layer_14/stable_rank_o_proj": 50.75233840942383, "geo/layer_14/stable_rank_gate_proj": 98.29328918457031, "geo/layer_14/stable_rank_down_proj": 138.34385681152344, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3901354968547821, "geo/layer_14/attn_entropy_mean": 5.556258201599121, "geo/layer_14/attn_entropy_std": 0.5527291297912598, "geo/layer_21/stable_rank_q_proj": 50.42290496826172, "geo/layer_21/stable_rank_k_proj": 32.13311004638672, "geo/layer_21/stable_rank_o_proj": 88.4533920288086, "geo/layer_21/stable_rank_gate_proj": 97.18103790283203, "geo/layer_21/stable_rank_down_proj": 65.82890319824219, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1589726060628891, "geo/layer_21/attn_entropy_mean": 5.781417369842529, "geo/layer_21/attn_entropy_std": 0.2961771488189697, "geo/layer_27/stable_rank_q_proj": 41.95130920410156, "geo/layer_27/stable_rank_k_proj": 32.69087600708008, "geo/layer_27/stable_rank_o_proj": 117.08711242675781, "geo/layer_27/stable_rank_gate_proj": 93.3817367553711, "geo/layer_27/stable_rank_down_proj": 145.43673706054688, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.06884774565696716, "geo/layer_27/attn_entropy_mean": 4.456970691680908, "geo/layer_27/attn_entropy_std": 0.5045049786567688, "attnres/final_alpha/block_0": 0.2447294294834137, "attnres/block_norm/0": 1.5173969268798828, "attnres/final_alpha/block_1": 0.00797298178076744, "attnres/block_norm/1": 22771.140625, "attnres/final_alpha/block_2": 0.016692645847797394, "attnres/block_norm/2": 16840.515625, "attnres/final_alpha/block_3": 0.018650781363248825, "attnres/block_norm/3": 20710.140625, "attnres/final_alpha/block_4": 0.024919575080275536, "attnres/block_norm/4": 7226.5498046875, "attnres/final_alpha/block_5": 0.5362293124198914, "attnres/block_norm/5": 4425.90234375, "attnres/final_alpha/block_6": 0.15080526471138, "attnres/block_norm/6": 14482.2734375, "geo/tier1_time_s": 1.3664875030517578, "geo/step": 14100.0, "geo/rankme_slope": 0.0003202420030512205} {"step": 14110, "timestamp": 1778209787.3126893, "train/loss": 2.24367094039917, "train/z_loss": 0.0016903519979678094, "train/perplexity": 9.427877015608937, "train/grad_norm": 0.30859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1789359.3608336, "perf/iters_per_sec": 0.8532330326240539, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1720127582550048, "data/tokens_consumed": 29592911872, "data/tokens_consumed_B": 29.592911872, "train/loss_slope": -9.624315345391103e-06} {"step": 14120, "timestamp": 1778209797.6650665, "train/loss": 2.2821306943893434, "train/z_loss": 0.001681653398554772, "train/perplexity": 9.797533735767237, "train/grad_norm": 0.1669921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026864.4777778757, "perf/iters_per_sec": 0.9664842976464633, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346779584884644, "data/tokens_consumed": 29613883392, "data/tokens_consumed_B": 29.613883392, "train/loss_slope": -7.066201763113082e-06} {"step": 14130, "timestamp": 1778209808.018824, "train/loss": 2.302201986312866, "train/z_loss": 0.0016722209635190665, "train/perplexity": 9.996169666948145, "train/grad_norm": 0.1708984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026248.3046147847, "perf/iters_per_sec": 0.9661904833864139, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349925994873046, "data/tokens_consumed": 29634854912, "data/tokens_consumed_B": 29.634854912, "train/loss_slope": -8.060128546462189e-06} {"step": 14140, "timestamp": 1778209818.3611717, "train/loss": 2.2789355993270872, "train/z_loss": 0.0016622119350358845, "train/perplexity": 9.766279640596835, "train/grad_norm": 0.12890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028690.5430458395, "perf/iters_per_sec": 0.9673550334195326, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0337466239929198, "data/tokens_consumed": 29655826432, "data/tokens_consumed_B": 29.655826432, "train/loss_slope": -8.783764972223303e-06} {"step": 14150, "timestamp": 1778209828.6979806, "grad/layer_0/attn": 0.0033378517255187035, "grad/layer_0/mlp": 0.0028987880796194077, "grad/layer_0/attn_mlp_ratio": 1.1514645150639955, "grad/layer_4/attn": 0.0014610178768634796, "grad/layer_4/mlp": 0.002583838766440749, "grad/layer_4/attn_mlp_ratio": 0.5654446551754363, "grad/layer_8/attn": 0.008104526437819004, "grad/layer_8/mlp": 0.003966833930462599, "grad/layer_8/attn_mlp_ratio": 2.043071723087412, "grad/layer_12/attn": 0.005112520419061184, "grad/layer_12/mlp": 0.005868402309715748, "grad/layer_12/attn_mlp_ratio": 0.8711945879166155, "grad/layer_16/attn": 0.004433592315763235, "grad/layer_16/mlp": 0.004639068618416786, "grad/layer_16/attn_mlp_ratio": 0.9557074026867023, "grad/layer_20/attn": 0.003966886550188065, "grad/layer_20/mlp": 0.005973764229565859, "grad/layer_20/attn_mlp_ratio": 0.6640513972998257, "grad/layer_24/attn": 0.011096235364675522, "grad/layer_24/mlp": 0.013505440205335617, "grad/layer_24/attn_mlp_ratio": 0.8216122624518738, "grad/layer_27/attn": 0.005184306763112545, "grad/layer_27/mlp": 0.01226571761071682, "grad/layer_27/attn_mlp_ratio": 0.42266640121538956} {"step": 14150, "timestamp": 1778209828.7123287, "train/loss": 2.264878988265991, "train/z_loss": 0.001663328381255269, "train/perplexity": 9.629959192537328, "train/grad_norm": 0.1552734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027490.1387845657, "perf/iters_per_sec": 0.9667826360628918, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343586683273316, "data/tokens_consumed": 29676797952, "data/tokens_consumed_B": 29.676797952, "train/loss_slope": -1.24441324680946e-05} {"step": 14160, "timestamp": 1778209839.0628216, "train/loss": 2.258124756813049, "train/z_loss": 0.0016754808253608643, "train/perplexity": 9.565135383210867, "train/grad_norm": 0.1357421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027145.9587518878, "perf/iters_per_sec": 0.9666185182341994, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345342874526977, "data/tokens_consumed": 29697769472, "data/tokens_consumed_B": 29.697769472, "train/loss_slope": -1.2584903223751424e-05} {"step": 14170, "timestamp": 1778209849.4103625, "train/loss": 2.297896933555603, "train/z_loss": 0.0016591656487435103, "train/perplexity": 9.953228128277432, "train/grad_norm": 0.1181640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027659.701860183, "perf/iters_per_sec": 0.9668634900380053, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342721700668336, "data/tokens_consumed": 29718740992, "data/tokens_consumed_B": 29.718740992, "train/loss_slope": -1.2649808758341746e-05} {"step": 14175, "timestamp": 1778209855.1680682, "eos/sharpness": 49.51474666595458, "eos/L0_probe": 2.0953104496002197, "eos/L_plus": 2.4181199073791504, "eos/L_minus": 2.267648458480835, "eos/grad_norm": 0.18200860917568207, "eos/embed_grad_frac": 0.07608252763748169, "eos/time_s": 0.5911481380462646} {"step": 14175, "timestamp": 1778209856.5515668, "geo/rankme_last": 440.3316650390625, "geo/layer_0/stable_rank_q_proj": 15.6604585647583, "geo/layer_0/stable_rank_k_proj": 13.91838264465332, "geo/layer_0/stable_rank_o_proj": 54.39912033081055, "geo/layer_0/stable_rank_gate_proj": 158.3227996826172, "geo/layer_0/stable_rank_down_proj": 48.76451873779297, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.03759782761335373, "geo/layer_0/attn_entropy_mean": 6.315721035003662, "geo/layer_0/attn_entropy_std": 0.2846139073371887, "geo/layer_7/stable_rank_q_proj": 43.04572677612305, "geo/layer_7/stable_rank_k_proj": 43.0070686340332, "geo/layer_7/stable_rank_o_proj": 110.78294372558594, "geo/layer_7/stable_rank_gate_proj": 114.05398559570312, "geo/layer_7/stable_rank_down_proj": 158.16262817382812, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5764548182487488, "geo/layer_7/attn_entropy_mean": 4.623487949371338, "geo/layer_7/attn_entropy_std": 0.8802646398544312, "geo/layer_14/stable_rank_q_proj": 59.31405258178711, "geo/layer_14/stable_rank_k_proj": 37.31760025024414, "geo/layer_14/stable_rank_o_proj": 50.64278030395508, "geo/layer_14/stable_rank_gate_proj": 98.30472564697266, "geo/layer_14/stable_rank_down_proj": 138.04193115234375, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39755570888519287, "geo/layer_14/attn_entropy_mean": 5.550942420959473, "geo/layer_14/attn_entropy_std": 0.5679486393928528, "geo/layer_21/stable_rank_q_proj": 50.53186798095703, "geo/layer_21/stable_rank_k_proj": 32.28622817993164, "geo/layer_21/stable_rank_o_proj": 88.56282043457031, "geo/layer_21/stable_rank_gate_proj": 97.0147933959961, "geo/layer_21/stable_rank_down_proj": 65.69049072265625, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15889941155910492, "geo/layer_21/attn_entropy_mean": 5.7791643142700195, "geo/layer_21/attn_entropy_std": 0.292095810174942, "geo/layer_27/stable_rank_q_proj": 41.946861267089844, "geo/layer_27/stable_rank_k_proj": 32.56675338745117, "geo/layer_27/stable_rank_o_proj": 116.61592102050781, "geo/layer_27/stable_rank_gate_proj": 93.1017074584961, "geo/layer_27/stable_rank_down_proj": 145.4291534423828, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07231699675321579, "geo/layer_27/attn_entropy_mean": 4.452165603637695, "geo/layer_27/attn_entropy_std": 0.5018419027328491, "attnres/final_alpha/block_0": 0.24435578286647797, "attnres/block_norm/0": 1.5189471244812012, "attnres/final_alpha/block_1": 0.00794968567788601, "attnres/block_norm/1": 22917.7421875, "attnres/final_alpha/block_2": 0.01660015992820263, "attnres/block_norm/2": 16883.572265625, "attnres/final_alpha/block_3": 0.018797971308231354, "attnres/block_norm/3": 20835.67578125, "attnres/final_alpha/block_4": 0.02493152767419815, "attnres/block_norm/4": 7265.6865234375, "attnres/final_alpha/block_5": 0.5385947227478027, "attnres/block_norm/5": 4384.98046875, "attnres/final_alpha/block_6": 0.14877013862133026, "attnres/block_norm/6": 14608.9296875, "geo/tier1_time_s": 1.3654494285583496, "geo/step": 14175.0, "geo/rankme_slope": 0.00032452457545518207} {"step": 14180, "timestamp": 1778209861.7271974, "train/loss": 2.2848217248916627, "train/z_loss": 0.0016722737927921116, "train/perplexity": 9.823934704872062, "train/grad_norm": 0.23046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1703686.5000151172, "perf/iters_per_sec": 0.8123810291362368, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2309494733810424, "data/tokens_consumed": 29739712512, "data/tokens_consumed_B": 29.739712512, "train/loss_slope": -1.178091797712801e-05} {"step": 14190, "timestamp": 1778209872.0839396, "train/loss": 2.290003848075867, "train/z_loss": 0.001662550657056272, "train/perplexity": 9.874975680755673, "train/grad_norm": 0.12890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025793.4537964491, "perf/iters_per_sec": 0.9659735936147924, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035224986076355, "data/tokens_consumed": 29760684032, "data/tokens_consumed_B": 29.760684032, "train/loss_slope": -1.1382415335420955e-05} {"step": 14200, "timestamp": 1778209882.4127896, "grad/layer_0/attn": 0.0034874998964369297, "grad/layer_0/mlp": 0.00315538770519197, "grad/layer_0/attn_mlp_ratio": 1.1052523847301725, "grad/layer_4/attn": 0.0018103424226865172, "grad/layer_4/mlp": 0.0025832352694123983, "grad/layer_4/attn_mlp_ratio": 0.7008042875699364, "grad/layer_8/attn": 0.004423980135470629, "grad/layer_8/mlp": 0.0039939917623996735, "grad/layer_8/attn_mlp_ratio": 1.1076587754519385, "grad/layer_12/attn": 0.0041924575343728065, "grad/layer_12/mlp": 0.006439643912017345, "grad/layer_12/attn_mlp_ratio": 0.6510387106102526, "grad/layer_16/attn": 0.004612519405782223, "grad/layer_16/mlp": 0.005354953929781914, "grad/layer_16/attn_mlp_ratio": 0.8613555560196047, "grad/layer_20/attn": 0.007026041857898235, "grad/layer_20/mlp": 0.008320008404552937, "grad/layer_20/attn_mlp_ratio": 0.844475321636197, "grad/layer_24/attn": 0.02681504376232624, "grad/layer_24/mlp": 0.01590132899582386, "grad/layer_24/attn_mlp_ratio": 1.6863397770547766, "grad/layer_27/attn": 0.011752795428037643, "grad/layer_27/mlp": 0.01542168203741312, "grad/layer_27/attn_mlp_ratio": 0.7620955563287919} {"step": 14200, "timestamp": 1778209882.4270797, "train/loss": 2.3054397821426393, "train/z_loss": 0.001663458882831037, "train/perplexity": 10.028587676536926, "train/grad_norm": 0.318359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028688.8118621595, "perf/iters_per_sec": 0.9673542079268262, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0337475061416626, "data/tokens_consumed": 29781655552, "data/tokens_consumed_B": 29.781655552, "train/loss_slope": -8.542320687528428e-06} {"step": 14210, "timestamp": 1778209892.7749455, "train/loss": 2.2341763734817506, "train/z_loss": 0.0016861776355654, "train/perplexity": 9.338787011163381, "train/grad_norm": 0.1533203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027815.5959151245, "perf/iters_per_sec": 0.9669378261161444, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341926574707032, "data/tokens_consumed": 29802627072, "data/tokens_consumed_B": 29.802627072, "train/loss_slope": -5.004247630497374e-06} {"step": 14220, "timestamp": 1778209903.1366959, "train/loss": 2.2511950969696044, "train/z_loss": 0.0016750273760408163, "train/perplexity": 9.499081378866338, "train/grad_norm": 0.1181640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024988.9237334526, "perf/iters_per_sec": 0.9655899637858641, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0356362819671632, "data/tokens_consumed": 29823598592, "data/tokens_consumed_B": 29.823598592, "train/loss_slope": -6.3730947183769344e-06} {"step": 14230, "timestamp": 1778209913.485561, "train/loss": 2.22832350730896, "train/z_loss": 0.0016968531184829772, "train/perplexity": 9.284287983904054, "train/grad_norm": 0.126953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027724.4405589101, "perf/iters_per_sec": 0.9668943598551322, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034239149093628, "data/tokens_consumed": 29844570112, "data/tokens_consumed_B": 29.844570112, "train/loss_slope": -7.657636782088013e-06} {"step": 14240, "timestamp": 1778209923.8397844, "train/loss": 2.221753787994385, "train/z_loss": 0.0016915714601054787, "train/perplexity": 9.223492740323326, "train/grad_norm": 0.1259765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026586.0632320882, "perf/iters_per_sec": 0.9663515392456475, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348201036453246, "data/tokens_consumed": 29865541632, "data/tokens_consumed_B": 29.865541632, "train/loss_slope": -1.3966970632572062e-05} {"step": 14250, "timestamp": 1778209934.1839035, "grad/layer_0/attn": 0.0029129423201084137, "grad/layer_0/mlp": 0.002698533469811082, "grad/layer_0/attn_mlp_ratio": 1.0794538013890047, "grad/layer_4/attn": 0.002269473159685731, "grad/layer_4/mlp": 0.002538271015509963, "grad/layer_4/attn_mlp_ratio": 0.8941019522375852, "grad/layer_8/attn": 0.006854536943137646, "grad/layer_8/mlp": 0.00418785959482193, "grad/layer_8/attn_mlp_ratio": 1.6367637510905448, "grad/layer_12/attn": 0.006637211889028549, "grad/layer_12/mlp": 0.006187785416841507, "grad/layer_12/attn_mlp_ratio": 1.0726312136973435, "grad/layer_16/attn": 0.004946925211697817, "grad/layer_16/mlp": 0.004916576202958822, "grad/layer_16/attn_mlp_ratio": 1.0061727728542993, "grad/layer_20/attn": 0.005251992493867874, "grad/layer_20/mlp": 0.0066322507336735725, "grad/layer_20/attn_mlp_ratio": 0.7918868911293605, "grad/layer_24/attn": 0.01998344622552395, "grad/layer_24/mlp": 0.013247997500002384, "grad/layer_24/attn_mlp_ratio": 1.5084125789334648, "grad/layer_27/attn": 0.006951100192964077, "grad/layer_27/mlp": 0.012257474474608898, "grad/layer_27/attn_mlp_ratio": 0.5670907290611996} {"step": 14250, "timestamp": 1778209934.7800431, "eos/sharpness": 44.885706901550286, "eos/L0_probe": 2.0968778133392334, "eos/L_plus": 2.364703416824341, "eos/L_minus": 2.277909278869629, "eos/grad_norm": 0.22309793531894684, "eos/embed_grad_frac": 0.05366234853863716, "eos/time_s": 0.5934634208679199} {"step": 14250, "timestamp": 1778209934.8003876, "train/loss": 2.2637553453445434, "train/z_loss": 0.001678186806384474, "train/perplexity": 9.619144634045698, "train/grad_norm": 0.2236328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1914233.751324645, "perf/iters_per_sec": 0.9127777821181512, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.095556902885437, "data/tokens_consumed": 29886513152, "data/tokens_consumed_B": 29.886513152, "train/loss_slope": -1.2381065124296627e-05} {"step": 14250, "timestamp": 1778209936.1763873, "geo/rankme_last": 440.2749938964844, "geo/layer_0/stable_rank_q_proj": 15.67051887512207, "geo/layer_0/stable_rank_k_proj": 13.933341026306152, "geo/layer_0/stable_rank_o_proj": 54.42804718017578, "geo/layer_0/stable_rank_gate_proj": 158.3813018798828, "geo/layer_0/stable_rank_down_proj": 48.62864303588867, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.04138293117284775, "geo/layer_0/attn_entropy_mean": 6.309382438659668, "geo/layer_0/attn_entropy_std": 0.2871178388595581, "geo/layer_7/stable_rank_q_proj": 43.07936477661133, "geo/layer_7/stable_rank_k_proj": 43.10026550292969, "geo/layer_7/stable_rank_o_proj": 110.44445037841797, "geo/layer_7/stable_rank_gate_proj": 113.77839660644531, "geo/layer_7/stable_rank_down_proj": 157.53359985351562, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5797784328460693, "geo/layer_7/attn_entropy_mean": 4.662990093231201, "geo/layer_7/attn_entropy_std": 0.8969203233718872, "geo/layer_14/stable_rank_q_proj": 59.41398239135742, "geo/layer_14/stable_rank_k_proj": 37.40727615356445, "geo/layer_14/stable_rank_o_proj": 50.65699005126953, "geo/layer_14/stable_rank_gate_proj": 98.22563171386719, "geo/layer_14/stable_rank_down_proj": 138.08807373046875, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3926392197608948, "geo/layer_14/attn_entropy_mean": 5.5454301834106445, "geo/layer_14/attn_entropy_std": 0.5536578297615051, "geo/layer_21/stable_rank_q_proj": 50.51514434814453, "geo/layer_21/stable_rank_k_proj": 32.33155059814453, "geo/layer_21/stable_rank_o_proj": 88.4317855834961, "geo/layer_21/stable_rank_gate_proj": 96.81365203857422, "geo/layer_21/stable_rank_down_proj": 65.73503112792969, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15562842786312103, "geo/layer_21/attn_entropy_mean": 5.791776657104492, "geo/layer_21/attn_entropy_std": 0.28743618726730347, "geo/layer_27/stable_rank_q_proj": 41.885066986083984, "geo/layer_27/stable_rank_k_proj": 32.61025619506836, "geo/layer_27/stable_rank_o_proj": 116.52947998046875, "geo/layer_27/stable_rank_gate_proj": 93.1740493774414, "geo/layer_27/stable_rank_down_proj": 145.35304260253906, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07626314461231232, "geo/layer_27/attn_entropy_mean": 4.4880242347717285, "geo/layer_27/attn_entropy_std": 0.47818365693092346, "attnres/final_alpha/block_0": 0.24356234073638916, "attnres/block_norm/0": 1.52045738697052, "attnres/final_alpha/block_1": 0.007899104617536068, "attnres/block_norm/1": 22977.8671875, "attnres/final_alpha/block_2": 0.016569174826145172, "attnres/block_norm/2": 16977.125, "attnres/final_alpha/block_3": 0.018608588725328445, "attnres/block_norm/3": 21004.33203125, "attnres/final_alpha/block_4": 0.02495904453098774, "attnres/block_norm/4": 7265.6015625, "attnres/final_alpha/block_5": 0.5425620675086975, "attnres/block_norm/5": 4397.2998046875, "attnres/final_alpha/block_6": 0.14583972096443176, "attnres/block_norm/6": 14685.34765625, "geo/tier1_time_s": 1.361760139465332, "geo/step": 14250.0, "geo/rankme_slope": 0.00032708499415391154} {"step": 14260, "timestamp": 1778209946.524746, "train/loss": 2.2687116861343384, "train/z_loss": 0.0016709352494217455, "train/perplexity": 9.66693873704975, "train/grad_norm": 0.1728515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1790859.5102690333, "perf/iters_per_sec": 0.853948359617726, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1710309982299805, "data/tokens_consumed": 29907484672, "data/tokens_consumed_B": 29.907484672, "train/loss_slope": -9.39171005933457e-06} {"step": 14270, "timestamp": 1778209956.8684225, "train/loss": 2.2217055320739747, "train/z_loss": 0.0016753964708186686, "train/perplexity": 9.22304766293064, "train/grad_norm": 0.185546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028467.7593052438, "perf/iters_per_sec": 0.967248801853773, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033860158920288, "data/tokens_consumed": 29928456192, "data/tokens_consumed_B": 29.928456192, "train/loss_slope": -1.4152369011353417e-05} {"step": 14280, "timestamp": 1778209967.2181773, "train/loss": 2.2623178482055666, "train/z_loss": 0.001678136456757784, "train/perplexity": 9.605327074885116, "train/grad_norm": 0.154296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027695.3193186398, "perf/iters_per_sec": 0.9668804737656783, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034254002571106, "data/tokens_consumed": 29949427712, "data/tokens_consumed_B": 29.949427712, "train/loss_slope": -1.4060064997359778e-05} {"step": 14290, "timestamp": 1778209977.5657077, "train/loss": 2.2504720211029055, "train/z_loss": 0.001684276817832142, "train/perplexity": 9.492215305010776, "train/grad_norm": 0.1845703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028176.5580981125, "perf/iters_per_sec": 0.9671099462976992, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340085983276368, "data/tokens_consumed": 29970399232, "data/tokens_consumed_B": 29.970399232, "train/loss_slope": -1.296441595796852e-05} {"step": 14300, "timestamp": 1778209987.9014213, "grad/layer_0/attn": 0.0031663181725889444, "grad/layer_0/mlp": 0.003206903114914894, "grad/layer_0/attn_mlp_ratio": 0.9873444754624355, "grad/layer_4/attn": 0.004605626687407494, "grad/layer_4/mlp": 0.0025963569059967995, "grad/layer_4/attn_mlp_ratio": 1.7738803549626991, "grad/layer_8/attn": 0.005278958007693291, "grad/layer_8/mlp": 0.0039201038889586926, "grad/layer_8/attn_mlp_ratio": 1.3466372378288758, "grad/layer_12/attn": 0.004017290659248829, "grad/layer_12/mlp": 0.006266134325414896, "grad/layer_12/attn_mlp_ratio": 0.6411114710458569, "grad/layer_16/attn": 0.005618934985250235, "grad/layer_16/mlp": 0.00460062688216567, "grad/layer_16/attn_mlp_ratio": 1.2213411361173239, "grad/layer_20/attn": 0.003969875629991293, "grad/layer_20/mlp": 0.006062476430088282, "grad/layer_20/attn_mlp_ratio": 0.6548273812341644, "grad/layer_24/attn": 0.011871332302689552, "grad/layer_24/mlp": 0.010097737424075603, "grad/layer_24/attn_mlp_ratio": 1.1756427887321534, "grad/layer_27/attn": 0.0046759843826293945, "grad/layer_27/mlp": 0.009878003038465977, "grad/layer_27/attn_mlp_ratio": 0.473373445734252} {"step": 14300, "timestamp": 1778209987.9156852, "train/loss": 2.2209403276443482, "train/z_loss": 0.0016856551985256374, "train/perplexity": 9.215992845537308, "train/grad_norm": 0.13671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027209.6367972915, "perf/iters_per_sec": 0.9666488822924096, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345017910003662, "data/tokens_consumed": 29991370752, "data/tokens_consumed_B": 29.991370752, "train/loss_slope": -1.318004535477049e-05} {"step": 14310, "timestamp": 1778209998.2622294, "train/loss": 2.2801626443862917, "train/z_loss": 0.0016663167625665665, "train/perplexity": 9.778270661033318, "train/grad_norm": 0.1611328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028160.0033940654, "perf/iters_per_sec": 0.9671020523996665, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340170383453369, "data/tokens_consumed": 30012342272, "data/tokens_consumed_B": 30.012342272, "train/loss_slope": -1.0444537605425286e-05} {"step": 14320, "timestamp": 1778210008.6098824, "train/loss": 2.292404890060425, "train/z_loss": 0.0016570308711379767, "train/perplexity": 9.898714399387224, "train/grad_norm": 0.1962890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027890.022159928, "perf/iters_per_sec": 0.9669733153152122, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341547012329102, "data/tokens_consumed": 30033313792, "data/tokens_consumed_B": 30.033313792, "train/loss_slope": -5.96393156867109e-06} {"step": 14325, "timestamp": 1778210014.3704138, "eos/sharpness": 27.40552425384521, "eos/L0_probe": 2.0939877033233643, "eos/L_plus": 2.2240800857543945, "eos/L_minus": 2.237950563430786, "eos/grad_norm": 0.1298343688249588, "eos/embed_grad_frac": 0.14780783653259277, "eos/time_s": 0.5972468852996826} {"step": 14325, "timestamp": 1778210015.7537448, "geo/rankme_last": 440.8985290527344, "geo/layer_0/stable_rank_q_proj": 15.713908195495605, "geo/layer_0/stable_rank_k_proj": 13.943994522094727, "geo/layer_0/stable_rank_o_proj": 54.35365676879883, "geo/layer_0/stable_rank_gate_proj": 157.74606323242188, "geo/layer_0/stable_rank_down_proj": 48.67922592163086, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.038787003606557846, "geo/layer_0/attn_entropy_mean": 6.310204029083252, "geo/layer_0/attn_entropy_std": 0.2898084819316864, "geo/layer_7/stable_rank_q_proj": 43.13872528076172, "geo/layer_7/stable_rank_k_proj": 43.17236328125, "geo/layer_7/stable_rank_o_proj": 110.37553405761719, "geo/layer_7/stable_rank_gate_proj": 113.87088012695312, "geo/layer_7/stable_rank_down_proj": 157.33108520507812, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5720319747924805, "geo/layer_7/attn_entropy_mean": 4.668896675109863, "geo/layer_7/attn_entropy_std": 0.874314546585083, "geo/layer_14/stable_rank_q_proj": 59.36564636230469, "geo/layer_14/stable_rank_k_proj": 37.385414123535156, "geo/layer_14/stable_rank_o_proj": 50.700347900390625, "geo/layer_14/stable_rank_gate_proj": 97.71540069580078, "geo/layer_14/stable_rank_down_proj": 137.6809539794922, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.387684166431427, "geo/layer_14/attn_entropy_mean": 5.568476676940918, "geo/layer_14/attn_entropy_std": 0.5462676286697388, "geo/layer_21/stable_rank_q_proj": 50.29314422607422, "geo/layer_21/stable_rank_k_proj": 32.34265899658203, "geo/layer_21/stable_rank_o_proj": 88.3709487915039, "geo/layer_21/stable_rank_gate_proj": 96.83580780029297, "geo/layer_21/stable_rank_down_proj": 65.66238403320312, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15572644770145416, "geo/layer_21/attn_entropy_mean": 5.782081604003906, "geo/layer_21/attn_entropy_std": 0.28710708022117615, "geo/layer_27/stable_rank_q_proj": 41.85997009277344, "geo/layer_27/stable_rank_k_proj": 32.61220169067383, "geo/layer_27/stable_rank_o_proj": 116.4924087524414, "geo/layer_27/stable_rank_gate_proj": 93.1981201171875, "geo/layer_27/stable_rank_down_proj": 144.892822265625, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.06156417354941368, "geo/layer_27/attn_entropy_mean": 4.468874931335449, "geo/layer_27/attn_entropy_std": 0.5031289458274841, "attnres/final_alpha/block_0": 0.24716559052467346, "attnres/block_norm/0": 1.5220844745635986, "attnres/final_alpha/block_1": 0.008033525198698044, "attnres/block_norm/1": 22965.939453125, "attnres/final_alpha/block_2": 0.016995998099446297, "attnres/block_norm/2": 17051.11328125, "attnres/final_alpha/block_3": 0.019084066152572632, "attnres/block_norm/3": 21108.775390625, "attnres/final_alpha/block_4": 0.025387471541762352, "attnres/block_norm/4": 7317.0234375, "attnres/final_alpha/block_5": 0.5326720476150513, "attnres/block_norm/5": 4443.7822265625, "attnres/final_alpha/block_6": 0.15066127479076385, "attnres/block_norm/6": 14555.3818359375, "geo/tier1_time_s": 1.3658685684204102, "geo/step": 14325.0, "geo/rankme_slope": 0.000303636806285014} {"step": 14330, "timestamp": 1778210020.927371, "train/loss": 2.2513570547103883, "train/z_loss": 0.0016804770450107754, "train/perplexity": 9.500619953214631, "train/grad_norm": 0.2451171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1703351.2058495204, "perf/iters_per_sec": 0.8122211484191515, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2311917781829833, "data/tokens_consumed": 30054285312, "data/tokens_consumed_B": 30.054285312, "train/loss_slope": -1.1325459747818035e-05} {"step": 14340, "timestamp": 1778210031.2788012, "train/loss": 2.200187087059021, "train/z_loss": 0.0016931432182900608, "train/perplexity": 9.026702120622016, "train/grad_norm": 0.1455078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027570.7104252798, "perf/iters_per_sec": 0.9668210556150817, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343175649642944, "data/tokens_consumed": 30075256832, "data/tokens_consumed_B": 30.075256832, "train/loss_slope": -1.5334078230515916e-05} {"step": 14350, "timestamp": 1778210041.6209948, "grad/layer_0/attn": 0.0026261722669005394, "grad/layer_0/mlp": 0.0026893599424511194, "grad/layer_0/attn_mlp_ratio": 0.9765045309838879, "grad/layer_4/attn": 0.0014842667151242495, "grad/layer_4/mlp": 0.002570153446868062, "grad/layer_4/attn_mlp_ratio": 0.5775011835121475, "grad/layer_8/attn": 0.004126284271478653, "grad/layer_8/mlp": 0.003835097188130021, "grad/layer_8/attn_mlp_ratio": 1.0759268830675774, "grad/layer_12/attn": 0.003846196923404932, "grad/layer_12/mlp": 0.0055968924425542355, "grad/layer_12/attn_mlp_ratio": 0.6872022098265363, "grad/layer_16/attn": 0.004529488738626242, "grad/layer_16/mlp": 0.004423512611538172, "grad/layer_16/attn_mlp_ratio": 1.023957436996088, "grad/layer_20/attn": 0.005287201143801212, "grad/layer_20/mlp": 0.006171736866235733, "grad/layer_20/attn_mlp_ratio": 0.85667959809797, "grad/layer_24/attn": 0.020727839320898056, "grad/layer_24/mlp": 0.013802282512187958, "grad/layer_24/attn_mlp_ratio": 1.501768939479225, "grad/layer_27/attn": 0.006519907619804144, "grad/layer_27/mlp": 0.013668949715793133, "grad/layer_27/attn_mlp_ratio": 0.47698672594957} {"step": 14350, "timestamp": 1778210041.6353703, "train/loss": 2.2975380420684814, "train/z_loss": 0.0016695840749889612, "train/perplexity": 9.949656640359452, "train/grad_norm": 0.2255859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026609.8296191194, "perf/iters_per_sec": 0.9663628719421002, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348079681396485, "data/tokens_consumed": 30096228352, "data/tokens_consumed_B": 30.096228352, "train/loss_slope": -1.295900513665868e-05} {"step": 14360, "timestamp": 1778210051.9854965, "train/loss": 2.2884652614593506, "train/z_loss": 0.0016678091487847269, "train/perplexity": 9.859793857604725, "train/grad_norm": 0.11376953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027674.4722029078, "perf/iters_per_sec": 0.9668705330862559, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342646360397338, "data/tokens_consumed": 30117199872, "data/tokens_consumed_B": 30.117199872, "train/loss_slope": -1.3118080530587241e-05} {"step": 14370, "timestamp": 1778210062.330508, "train/loss": 2.3050485372543337, "train/z_loss": 0.0016571283689700068, "train/perplexity": 10.024664810322273, "train/grad_norm": 0.28515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028704.6733582462, "perf/iters_per_sec": 0.9673617712775451, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0337394237518311, "data/tokens_consumed": 30138171392, "data/tokens_consumed_B": 30.138171392, "train/loss_slope": -1.0596993403716592e-05} {"step": 14380, "timestamp": 1778210072.7263877, "train/loss": 2.2254729747772215, "train/z_loss": 0.0016817740164697171, "train/perplexity": 9.257860503063625, "train/grad_norm": 0.244140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2018877.5036324952, "perf/iters_per_sec": 0.9626758115923382, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0387712955474853, "data/tokens_consumed": 30159142912, "data/tokens_consumed_B": 30.159142912, "train/loss_slope": -1.1636463157271538e-05} {"step": 14390, "timestamp": 1778210083.0994055, "train/loss": 2.2933863162994386, "train/z_loss": 0.001667522347997874, "train/perplexity": 9.908434026199519, "train/grad_norm": 0.11865234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022674.1820485012, "perf/iters_per_sec": 0.9644862089388376, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036821460723877, "data/tokens_consumed": 30180114432, "data/tokens_consumed_B": 30.180114432, "train/loss_slope": -7.291019667457461e-06} {"step": 14400, "timestamp": 1778210094.0365977, "grad/layer_0/attn": 0.003229070222005248, "grad/layer_0/mlp": 0.0028026180807501078, "grad/layer_0/attn_mlp_ratio": 1.1521620191377628, "grad/layer_4/attn": 0.0017807924887165427, "grad/layer_4/mlp": 0.0025410132948309183, "grad/layer_4/attn_mlp_ratio": 0.7008197958889699, "grad/layer_8/attn": 0.006322240922600031, "grad/layer_8/mlp": 0.0039777676574885845, "grad/layer_8/attn_mlp_ratio": 1.5893941798632458, "grad/layer_12/attn": 0.004617604892700911, "grad/layer_12/mlp": 0.005958315450698137, "grad/layer_12/attn_mlp_ratio": 0.7749849522756251, "grad/layer_16/attn": 0.005825414787977934, "grad/layer_16/mlp": 0.004564389120787382, "grad/layer_16/attn_mlp_ratio": 1.2762747667196135, "grad/layer_20/attn": 0.004047110676765442, "grad/layer_20/mlp": 0.0060534970834851265, "grad/layer_20/attn_mlp_ratio": 0.6685574559787661, "grad/layer_24/attn": 0.006879844237118959, "grad/layer_24/mlp": 0.010556507855653763, "grad/layer_24/attn_mlp_ratio": 0.6517159145827491, "grad/layer_27/attn": 0.008854038082063198, "grad/layer_27/mlp": 0.009052278473973274, "grad/layer_27/attn_mlp_ratio": 0.9781004870442179} {"step": 14400, "timestamp": 1778210094.6445506, "eos/sharpness": 16.355419158935543, "eos/L0_probe": 2.095679759979248, "eos/L_plus": 2.157370090484619, "eos/L_minus": 2.1975436210632324, "eos/grad_norm": 0.11231207102537155, "eos/embed_grad_frac": 0.22522591054439545, "eos/time_s": 0.6049425601959229} {"step": 14400, "timestamp": 1778210094.6644905, "train/loss": 2.308282494544983, "train/z_loss": 0.00166333531960845, "train/perplexity": 10.057136626104143, "train/grad_norm": 0.1123046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1814203.4462051156, "perf/iters_per_sec": 0.8650796156907633, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1559629678726195, "data/tokens_consumed": 30201085952, "data/tokens_consumed_B": 30.201085952, "train/loss_slope": -4.8222878442095245e-06} {"step": 14400, "timestamp": 1778210096.0264816, "geo/rankme_last": 440.51983642578125, "geo/layer_0/stable_rank_q_proj": 15.712206840515137, "geo/layer_0/stable_rank_k_proj": 13.927372932434082, "geo/layer_0/stable_rank_o_proj": 54.2950439453125, "geo/layer_0/stable_rank_gate_proj": 157.88284301757812, "geo/layer_0/stable_rank_down_proj": 48.719703674316406, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.03916293382644653, "geo/layer_0/attn_entropy_mean": 6.307563304901123, "geo/layer_0/attn_entropy_std": 0.2901807129383087, "geo/layer_7/stable_rank_q_proj": 43.119991302490234, "geo/layer_7/stable_rank_k_proj": 43.312313079833984, "geo/layer_7/stable_rank_o_proj": 110.20160675048828, "geo/layer_7/stable_rank_gate_proj": 113.44180297851562, "geo/layer_7/stable_rank_down_proj": 157.67254638671875, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5842333436012268, "geo/layer_7/attn_entropy_mean": 4.647702217102051, "geo/layer_7/attn_entropy_std": 0.853022575378418, "geo/layer_14/stable_rank_q_proj": 59.275753021240234, "geo/layer_14/stable_rank_k_proj": 37.232093811035156, "geo/layer_14/stable_rank_o_proj": 50.730255126953125, "geo/layer_14/stable_rank_gate_proj": 97.53730773925781, "geo/layer_14/stable_rank_down_proj": 137.44403076171875, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39765194058418274, "geo/layer_14/attn_entropy_mean": 5.563101291656494, "geo/layer_14/attn_entropy_std": 0.5767230987548828, "geo/layer_21/stable_rank_q_proj": 50.42193603515625, "geo/layer_21/stable_rank_k_proj": 32.23891830444336, "geo/layer_21/stable_rank_o_proj": 88.07022857666016, "geo/layer_21/stable_rank_gate_proj": 96.55603790283203, "geo/layer_21/stable_rank_down_proj": 65.62596893310547, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15753966569900513, "geo/layer_21/attn_entropy_mean": 5.791543006896973, "geo/layer_21/attn_entropy_std": 0.2845650911331177, "geo/layer_27/stable_rank_q_proj": 41.96805191040039, "geo/layer_27/stable_rank_k_proj": 32.549293518066406, "geo/layer_27/stable_rank_o_proj": 116.68505859375, "geo/layer_27/stable_rank_gate_proj": 93.03437805175781, "geo/layer_27/stable_rank_down_proj": 145.19790649414062, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0692332312464714, "geo/layer_27/attn_entropy_mean": 4.478625297546387, "geo/layer_27/attn_entropy_std": 0.5065748691558838, "attnres/final_alpha/block_0": 0.24339449405670166, "attnres/block_norm/0": 1.5237319469451904, "attnres/final_alpha/block_1": 0.00788640696555376, "attnres/block_norm/1": 23081.388671875, "attnres/final_alpha/block_2": 0.016456928104162216, "attnres/block_norm/2": 17065.990234375, "attnres/final_alpha/block_3": 0.018529381603002548, "attnres/block_norm/3": 21350.27734375, "attnres/final_alpha/block_4": 0.024783829227089882, "attnres/block_norm/4": 7330.6396484375, "attnres/final_alpha/block_5": 0.541161298751831, "attnres/block_norm/5": 4409.5400390625, "attnres/final_alpha/block_6": 0.14778763055801392, "attnres/block_norm/6": 14697.671875, "geo/tier1_time_s": 1.3583152294158936, "geo/step": 14400.0, "geo/rankme_slope": 0.00024507635085284113} {"step": 14410, "timestamp": 1778210106.7540457, "train/loss": 2.308278965950012, "train/z_loss": 0.0016631160164251924, "train/perplexity": 10.057101138605033, "train/grad_norm": 0.1435546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1735224.0991213596, "perf/iters_per_sec": 0.8274193282706068, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.208577036857605, "data/tokens_consumed": 30222057472, "data/tokens_consumed_B": 30.222057472, "train/loss_slope": -1.729300842605617e-06} {"step": 14420, "timestamp": 1778210117.133938, "train/loss": 2.240680432319641, "train/z_loss": 0.0016742165200412274, "train/perplexity": 9.399724988633638, "train/grad_norm": 0.119140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021547.366957344, "perf/iters_per_sec": 0.9639489016329499, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373993873596192, "data/tokens_consumed": 30243028992, "data/tokens_consumed_B": 30.243028992, "train/loss_slope": -1.3816909606914285e-07} {"step": 14430, "timestamp": 1778210127.519646, "train/loss": 2.2790870666503906, "train/z_loss": 0.0016574505483731628, "train/perplexity": 9.767759024868989, "train/grad_norm": 0.1357421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020402.2690026, "perf/iters_per_sec": 0.9634028763783455, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0379873514175415, "data/tokens_consumed": 30264000512, "data/tokens_consumed_B": 30.264000512, "train/loss_slope": 1.3781321502969422e-06} {"step": 14440, "timestamp": 1778210137.9000356, "train/loss": 2.26836302280426, "train/z_loss": 0.001669848070014268, "train/perplexity": 9.66356881751585, "train/grad_norm": 0.150390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021475.0315215546, "perf/iters_per_sec": 0.9639144094093106, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0374365091323852, "data/tokens_consumed": 30284972032, "data/tokens_consumed_B": 30.284972032, "train/loss_slope": -1.760747959904936e-06} {"step": 14450, "timestamp": 1778210148.273618, "grad/layer_0/attn": 0.0032330325338989496, "grad/layer_0/mlp": 0.002869731280952692, "grad/layer_0/attn_mlp_ratio": 1.1265976165426497, "grad/layer_4/attn": 0.0017296185251325369, "grad/layer_4/mlp": 0.002785696880891919, "grad/layer_4/attn_mlp_ratio": 0.6208925583064497, "grad/layer_8/attn": 0.004555354826152325, "grad/layer_8/mlp": 0.0040707578882575035, "grad/layer_8/attn_mlp_ratio": 1.1190433917449005, "grad/layer_12/attn": 0.006298464722931385, "grad/layer_12/mlp": 0.007882479578256607, "grad/layer_12/attn_mlp_ratio": 0.7990461098561864, "grad/layer_16/attn": 0.004749175161123276, "grad/layer_16/mlp": 0.005159074440598488, "grad/layer_16/attn_mlp_ratio": 0.9205478858175868, "grad/layer_20/attn": 0.007159317843616009, "grad/layer_20/mlp": 0.007115254644304514, "grad/layer_20/attn_mlp_ratio": 1.0061927648264126, "grad/layer_24/attn": 0.018888968974351883, "grad/layer_24/mlp": 0.012432407587766647, "grad/layer_24/attn_mlp_ratio": 1.5193331371314682, "grad/layer_27/attn": 0.006432550493627787, "grad/layer_27/mlp": 0.010672361589968204, "grad/layer_27/attn_mlp_ratio": 0.6027298062503119} {"step": 14450, "timestamp": 1778210148.2905185, "train/loss": 2.3440719127655028, "train/z_loss": 0.001642544486094266, "train/perplexity": 10.423594232255125, "train/grad_norm": 0.22265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019391.5144137482, "perf/iters_per_sec": 0.9629209110325566, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0385068893432616, "data/tokens_consumed": 30305943552, "data/tokens_consumed_B": 30.305943552, "train/loss_slope": 1.3428456211748196e-06} {"step": 14460, "timestamp": 1778210158.6696618, "train/loss": 2.264582109451294, "train/z_loss": 0.0016621555085293948, "train/perplexity": 9.627100686002677, "train/grad_norm": 0.1787109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021875.1015743588, "perf/iters_per_sec": 0.9641051776763719, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0372312307357787, "data/tokens_consumed": 30326915072, "data/tokens_consumed_B": 30.326915072, "train/loss_slope": 4.6065678512088136e-06} {"step": 14470, "timestamp": 1778210169.050535, "train/loss": 2.268331289291382, "train/z_loss": 0.001660776580683887, "train/perplexity": 9.663262163395961, "train/grad_norm": 0.1005859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021184.5804567488, "perf/iters_per_sec": 0.9637759115489716, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0375855922698975, "data/tokens_consumed": 30347886592, "data/tokens_consumed_B": 30.347886592, "train/loss_slope": 3.8107975231002625e-06} {"step": 14475, "timestamp": 1778210174.8453546, "eos/sharpness": 25.35741329193115, "eos/L0_probe": 2.0939409732818604, "eos/L_plus": 2.1887121200561523, "eos/L_minus": 2.25274395942688, "eos/grad_norm": 0.160742849111557, "eos/embed_grad_frac": 0.09869793802499771, "eos/time_s": 0.6081278324127197} {"step": 14475, "timestamp": 1778210176.2260015, "geo/rankme_last": 441.2244873046875, "geo/layer_0/stable_rank_q_proj": 15.719130516052246, "geo/layer_0/stable_rank_k_proj": 13.99002742767334, "geo/layer_0/stable_rank_o_proj": 54.43700408935547, "geo/layer_0/stable_rank_gate_proj": 158.47003173828125, "geo/layer_0/stable_rank_down_proj": 48.70164489746094, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.04254767298698425, "geo/layer_0/attn_entropy_mean": 6.307301044464111, "geo/layer_0/attn_entropy_std": 0.29174304008483887, "geo/layer_7/stable_rank_q_proj": 43.202388763427734, "geo/layer_7/stable_rank_k_proj": 43.19264602661133, "geo/layer_7/stable_rank_o_proj": 110.35069274902344, "geo/layer_7/stable_rank_gate_proj": 113.29080963134766, "geo/layer_7/stable_rank_down_proj": 158.24160766601562, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5724666118621826, "geo/layer_7/attn_entropy_mean": 4.684130668640137, "geo/layer_7/attn_entropy_std": 0.8615522980690002, "geo/layer_14/stable_rank_q_proj": 59.29619216918945, "geo/layer_14/stable_rank_k_proj": 37.17192459106445, "geo/layer_14/stable_rank_o_proj": 50.954734802246094, "geo/layer_14/stable_rank_gate_proj": 97.52435302734375, "geo/layer_14/stable_rank_down_proj": 137.4817352294922, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38876768946647644, "geo/layer_14/attn_entropy_mean": 5.520387649536133, "geo/layer_14/attn_entropy_std": 0.6047926545143127, "geo/layer_21/stable_rank_q_proj": 50.26897048950195, "geo/layer_21/stable_rank_k_proj": 32.3040885925293, "geo/layer_21/stable_rank_o_proj": 88.07442474365234, "geo/layer_21/stable_rank_gate_proj": 96.01654815673828, "geo/layer_21/stable_rank_down_proj": 65.5887451171875, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15667815506458282, "geo/layer_21/attn_entropy_mean": 5.781891822814941, "geo/layer_21/attn_entropy_std": 0.2892858684062958, "geo/layer_27/stable_rank_q_proj": 41.938053131103516, "geo/layer_27/stable_rank_k_proj": 32.52549743652344, "geo/layer_27/stable_rank_o_proj": 116.1478500366211, "geo/layer_27/stable_rank_gate_proj": 93.09683990478516, "geo/layer_27/stable_rank_down_proj": 144.96607971191406, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.06789783388376236, "geo/layer_27/attn_entropy_mean": 4.496951580047607, "geo/layer_27/attn_entropy_std": 0.473928838968277, "attnres/final_alpha/block_0": 0.24440887570381165, "attnres/block_norm/0": 1.5251504182815552, "attnres/final_alpha/block_1": 0.007854699157178402, "attnres/block_norm/1": 23264.6796875, "attnres/final_alpha/block_2": 0.016502689570188522, "attnres/block_norm/2": 17151.564453125, "attnres/final_alpha/block_3": 0.018703829497098923, "attnres/block_norm/3": 21205.544921875, "attnres/final_alpha/block_4": 0.024823643267154694, "attnres/block_norm/4": 7364.0634765625, "attnres/final_alpha/block_5": 0.5395987629890442, "attnres/block_norm/5": 4431.6748046875, "attnres/final_alpha/block_6": 0.14810749888420105, "attnres/block_norm/6": 14752.41015625, "geo/tier1_time_s": 1.3589236736297607, "geo/step": 14475.0, "geo/rankme_slope": 0.000255428440907613} {"step": 14480, "timestamp": 1778210181.4261246, "train/loss": 2.281437397003174, "train/z_loss": 0.001667329203337431, "train/perplexity": 9.790743485340743, "train/grad_norm": 0.138671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1695379.1696245542, "perf/iters_per_sec": 0.8084197853205463, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2369811058044433, "data/tokens_consumed": 30368858112, "data/tokens_consumed_B": 30.368858112, "train/loss_slope": 2.1076802623690564e-06} {"step": 14490, "timestamp": 1778210191.8155098, "train/loss": 2.228210616111755, "train/z_loss": 0.0016796181327663362, "train/perplexity": 9.283239928677576, "train/grad_norm": 0.2373046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019592.6013728445, "perf/iters_per_sec": 0.9630167967666838, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0384034872055055, "data/tokens_consumed": 30389829632, "data/tokens_consumed_B": 30.389829632, "train/loss_slope": -8.469857577263304e-08} {"step": 14500, "timestamp": 1778210202.1727836, "grad/layer_0/attn": 0.002502279356122017, "grad/layer_0/mlp": 0.0025328986812382936, "grad/layer_0/attn_mlp_ratio": 0.9879113112047425, "grad/layer_4/attn": 0.0028489066753536463, "grad/layer_4/mlp": 0.002490589627996087, "grad/layer_4/attn_mlp_ratio": 1.1438683149335314, "grad/layer_8/attn": 0.010195157490670681, "grad/layer_8/mlp": 0.003985511139035225, "grad/layer_8/attn_mlp_ratio": 2.5580551350141545, "grad/layer_12/attn": 0.0036728791892528534, "grad/layer_12/mlp": 0.005892944987863302, "grad/layer_12/attn_mlp_ratio": 0.6232671668394227, "grad/layer_16/attn": 0.003752605989575386, "grad/layer_16/mlp": 0.004325718153268099, "grad/layer_16/attn_mlp_ratio": 0.8675104964916003, "grad/layer_20/attn": 0.0037321699783205986, "grad/layer_20/mlp": 0.005377007648348808, "grad/layer_20/attn_mlp_ratio": 0.6940979356904754, "grad/layer_24/attn": 0.006914111319929361, "grad/layer_24/mlp": 0.008700895123183727, "grad/layer_24/attn_mlp_ratio": 0.7946436708611959, "grad/layer_27/attn": 0.007829263806343079, "grad/layer_27/mlp": 0.006972664967179298, "grad/layer_27/attn_mlp_ratio": 1.122850980351234} {"step": 14500, "timestamp": 1778210202.187746, "train/loss": 2.250775599479675, "train/z_loss": 0.0016816737246699632, "train/perplexity": 9.495097373769763, "train/grad_norm": 0.12158203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022907.9302621135, "perf/iters_per_sec": 0.9645976687727515, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0367016553878785, "data/tokens_consumed": 30410801152, "data/tokens_consumed_B": 30.410801152, "train/loss_slope": -2.491229888808417e-06} {"step": 14500, "timestamp": 1778210209.2690418, "geo/ww_alpha_mean": 8.045415905796414, "geo/ww_alpha_std": 4.523363610553689, "geo/ww_alpha_min": 1.3569286606271804, "geo/ww_alpha_max": 24.499055396324554, "geo/ww_alpha_healthy_frac": 0.15736040609137056, "geo/ww_alpha_by_type/q_proj": 4.239191843565462, "geo/ww_alpha_by_type/k_proj": 4.7041257949145034, "geo/ww_alpha_by_type/v_proj": 8.183735852066627, "geo/ww_alpha_by_type/o_proj": 7.499237250884453, "geo/ww_alpha_by_type/gate_proj": 9.780204547711833, "geo/ww_alpha_by_type/up_proj": 12.247603691145153, "geo/ww_alpha_by_type/down_proj": 9.798232005338075, "geo/twonn_id/layer_0": 0.699842095375061, "geo/twonn_id/layer_7": 3.1995675563812256, "geo/twonn_id/layer_14": 3.776632070541382, "geo/twonn_id/layer_21": 6.964071750640869, "geo/twonn_id/layer_27": 5.693794250488281, "geo/tier2_time_s": 7.071894407272339} {"step": 14500, "timestamp": 1778210209.8937163, "eoc/jacobian_sigma/layer_0/attn": 672.5679321289062, "eoc/jacobian_sigma/layer_0/mlp": 4211.08251953125, "eoc/jacobian_sigma/layer_0": 4211.08251953125, "eoc/jacobian_sigma/layer_7/attn": 1.1523690223693848, "eoc/jacobian_sigma/layer_7/mlp": 1.6016618013381958, "eoc/jacobian_sigma/layer_7": 1.6016618013381958, "eoc/jacobian_sigma/layer_14/attn": 1.3443398475646973, "eoc/jacobian_sigma/layer_14/mlp": 7.651709079742432, "eoc/jacobian_sigma/layer_14": 7.651709079742432, "eoc/jacobian_sigma/layer_21/attn": 1.079938530921936, "eoc/jacobian_sigma/layer_21/mlp": 4.126755237579346, "eoc/jacobian_sigma/layer_21": 4.126755237579346, "eoc/jacobian_sigma/layer_27/attn": 2.4993605613708496, "eoc/jacobian_sigma/layer_27/mlp": 22.32461929321289, "eoc/jacobian_sigma/layer_27": 22.32461929321289, "eoc/layer0_sigma": 4211.08251953125, "eoc/sigma_max": 22.32461929321289, "eoc/sigma_min": 1.6016618013381958, "eoc/sigma_mean": 8.926186352968216, "eoc/time_s": 0.6169052124023438} {"step": 14510, "timestamp": 1778210220.2926493, "train/loss": 2.2742213726043703, "train/z_loss": 0.001670656236819923, "train/perplexity": 9.720347536376153, "train/grad_norm": 0.19921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1158858.7546621307, "perf/iters_per_sec": 0.5525869153319029, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.8096700668334962, "data/tokens_consumed": 30431772672, "data/tokens_consumed_B": 30.431772672, "train/loss_slope": -1.4297926350824283e-06} {"step": 14520, "timestamp": 1778210230.669658, "train/loss": 2.240306043624878, "train/z_loss": 0.0016721287043765186, "train/perplexity": 9.396206496546936, "train/grad_norm": 0.1494140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022062.6918339853, "perf/iters_per_sec": 0.9641946276826788, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0371350049972534, "data/tokens_consumed": 30452744192, "data/tokens_consumed_B": 30.452744192, "train/loss_slope": -1.6920378213167678e-06} {"step": 14530, "timestamp": 1778210241.0500643, "train/loss": 2.2434680700302123, "train/z_loss": 0.0016792157432064414, "train/perplexity": 9.42596457271585, "train/grad_norm": 0.11962890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021351.2788503421, "perf/iters_per_sec": 0.9638553995372496, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037500023841858, "data/tokens_consumed": 30473715712, "data/tokens_consumed_B": 30.473715712, "train/loss_slope": -6.974803632420905e-06} {"step": 14540, "timestamp": 1778210251.4288979, "train/loss": 2.2709234237670897, "train/z_loss": 0.0016679779277183116, "train/perplexity": 9.688343130974287, "train/grad_norm": 0.1865234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021646.2384051909, "perf/iters_per_sec": 0.9639960472131686, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373486518859862, "data/tokens_consumed": 30494687232, "data/tokens_consumed_B": 30.494687232, "train/loss_slope": -7.152022715985461e-06} {"step": 14550, "timestamp": 1778210261.801229, "grad/layer_0/attn": 0.003225741907954216, "grad/layer_0/mlp": 0.002994874957948923, "grad/layer_0/attn_mlp_ratio": 1.0770873059937949, "grad/layer_4/attn": 0.0021968772634863853, "grad/layer_4/mlp": 0.002592384582385421, "grad/layer_4/attn_mlp_ratio": 0.847434903628923, "grad/layer_8/attn": 0.006039135158061981, "grad/layer_8/mlp": 0.004153026267886162, "grad/layer_8/attn_mlp_ratio": 1.4541528570009599, "grad/layer_12/attn": 0.005088218487799168, "grad/layer_12/mlp": 0.006291329860687256, "grad/layer_12/attn_mlp_ratio": 0.8087667503682064, "grad/layer_16/attn": 0.0046149506233632565, "grad/layer_16/mlp": 0.00497671402990818, "grad/layer_16/attn_mlp_ratio": 0.9273087629504655, "grad/layer_20/attn": 0.00511770136654377, "grad/layer_20/mlp": 0.006780519615858793, "grad/layer_20/attn_mlp_ratio": 0.754765354427641, "grad/layer_24/attn": 0.013251281343400478, "grad/layer_24/mlp": 0.012705816887319088, "grad/layer_24/attn_mlp_ratio": 1.0429302859175278, "grad/layer_27/attn": 0.00845599826425314, "grad/layer_27/mlp": 0.011826003901660442, "grad/layer_27/attn_mlp_ratio": 0.7150342806467735} {"step": 14550, "timestamp": 1778210262.4399114, "eos/sharpness": 42.51317977905273, "eos/L0_probe": 2.09505558013916, "eos/L_plus": 2.3554069995880127, "eos/L_minus": 2.259835958480835, "eos/grad_norm": 0.1939522624015808, "eos/embed_grad_frac": 0.06942711770534515, "eos/time_s": 0.6358740329742432} {"step": 14550, "timestamp": 1778210262.4605331, "train/loss": 2.260486674308777, "train/z_loss": 0.0016695699538104235, "train/perplexity": 9.587754145131047, "train/grad_norm": 0.1943359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1901902.6094770571, "perf/iters_per_sec": 0.9068978354821478, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1026600360870362, "data/tokens_consumed": 30515658752, "data/tokens_consumed_B": 30.515658752, "train/loss_slope": -1.0910285581456565e-05} {"step": 14550, "timestamp": 1778210263.8239713, "geo/rankme_last": 441.2594299316406, "geo/layer_0/stable_rank_q_proj": 15.724372863769531, "geo/layer_0/stable_rank_k_proj": 14.008398056030273, "geo/layer_0/stable_rank_o_proj": 54.43282699584961, "geo/layer_0/stable_rank_gate_proj": 158.06793212890625, "geo/layer_0/stable_rank_down_proj": 48.769134521484375, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.03915945813059807, "geo/layer_0/attn_entropy_mean": 6.300906181335449, "geo/layer_0/attn_entropy_std": 0.2974088788032532, "geo/layer_7/stable_rank_q_proj": 43.26686477661133, "geo/layer_7/stable_rank_k_proj": 43.1989860534668, "geo/layer_7/stable_rank_o_proj": 109.6570053100586, "geo/layer_7/stable_rank_gate_proj": 113.37516021728516, "geo/layer_7/stable_rank_down_proj": 158.23216247558594, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.573777973651886, "geo/layer_7/attn_entropy_mean": 4.683685302734375, "geo/layer_7/attn_entropy_std": 0.8568931221961975, "geo/layer_14/stable_rank_q_proj": 59.32439041137695, "geo/layer_14/stable_rank_k_proj": 36.994903564453125, "geo/layer_14/stable_rank_o_proj": 50.89801025390625, "geo/layer_14/stable_rank_gate_proj": 97.44384002685547, "geo/layer_14/stable_rank_down_proj": 136.9884490966797, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37458157539367676, "geo/layer_14/attn_entropy_mean": 5.5335307121276855, "geo/layer_14/attn_entropy_std": 0.560264527797699, "geo/layer_21/stable_rank_q_proj": 50.15287399291992, "geo/layer_21/stable_rank_k_proj": 32.21590042114258, "geo/layer_21/stable_rank_o_proj": 87.94580841064453, "geo/layer_21/stable_rank_gate_proj": 96.07321166992188, "geo/layer_21/stable_rank_down_proj": 65.4962158203125, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.160356804728508, "geo/layer_21/attn_entropy_mean": 5.768019676208496, "geo/layer_21/attn_entropy_std": 0.28653430938720703, "geo/layer_27/stable_rank_q_proj": 41.92768859863281, "geo/layer_27/stable_rank_k_proj": 32.406307220458984, "geo/layer_27/stable_rank_o_proj": 116.41983032226562, "geo/layer_27/stable_rank_gate_proj": 93.18099212646484, "geo/layer_27/stable_rank_down_proj": 144.7328643798828, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.06809283792972565, "geo/layer_27/attn_entropy_mean": 4.484984397888184, "geo/layer_27/attn_entropy_std": 0.5075494050979614, "attnres/final_alpha/block_0": 0.24321646988391876, "attnres/block_norm/0": 1.5265685319900513, "attnres/final_alpha/block_1": 0.007708207704126835, "attnres/block_norm/1": 23271.25390625, "attnres/final_alpha/block_2": 0.016566049307584763, "attnres/block_norm/2": 17125.548828125, "attnres/final_alpha/block_3": 0.01845257543027401, "attnres/block_norm/3": 21497.884765625, "attnres/final_alpha/block_4": 0.024335630238056183, "attnres/block_norm/4": 7396.8701171875, "attnres/final_alpha/block_5": 0.5436007976531982, "attnres/block_norm/5": 4423.8798828125, "attnres/final_alpha/block_6": 0.14612025022506714, "attnres/block_norm/6": 14880.474609375, "geo/tier1_time_s": 1.3591620922088623, "geo/step": 14550.0, "geo/rankme_slope": 0.00024521951358668465} {"step": 14560, "timestamp": 1778210274.2165868, "train/loss": 2.3068787097930907, "train/z_loss": 0.0016609886894002556, "train/perplexity": 10.043028475780678, "train/grad_norm": 0.185546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1784533.4245515931, "perf/iters_per_sec": 0.8509318468816724, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1751822471618651, "data/tokens_consumed": 30536630272, "data/tokens_consumed_B": 30.536630272, "train/loss_slope": -8.571959979677947e-06} {"step": 14570, "timestamp": 1778210284.5794628, "train/loss": 2.287038540840149, "train/z_loss": 0.0016533018555492164, "train/perplexity": 9.845736716597944, "train/grad_norm": 0.1923828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024591.861168787, "perf/iters_per_sec": 0.9654006296009002, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0358393907546997, "data/tokens_consumed": 30557601792, "data/tokens_consumed_B": 30.557601792, "train/loss_slope": -9.499450227787225e-06} {"step": 14580, "timestamp": 1778210294.9405715, "train/loss": 2.2972675800323485, "train/z_loss": 0.0016538777388632297, "train/perplexity": 9.946965999840135, "train/grad_norm": 0.1435546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025339.0869473757, "perf/iters_per_sec": 0.9657569346176985, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035457229614258, "data/tokens_consumed": 30578573312, "data/tokens_consumed_B": 30.578573312, "train/loss_slope": -1.0499684323500573e-05} {"step": 14590, "timestamp": 1778210305.3321557, "train/loss": 2.2114790439605714, "train/z_loss": 0.001689591642934829, "train/perplexity": 9.129208913873786, "train/grad_norm": 0.10888671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019886.862319896, "perf/iters_per_sec": 0.963157111320446, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0382522106170655, "data/tokens_consumed": 30599544832, "data/tokens_consumed_B": 30.599544832, "train/loss_slope": -1.3997176721437256e-05} {"step": 14600, "timestamp": 1778210315.697752, "grad/layer_0/attn": 0.002917023841291666, "grad/layer_0/mlp": 0.002672029659152031, "grad/layer_0/attn_mlp_ratio": 1.091688380827534, "grad/layer_4/attn": 0.0018414953956380486, "grad/layer_4/mlp": 0.0025350956711918116, "grad/layer_4/attn_mlp_ratio": 0.7264007208580984, "grad/layer_8/attn": 0.0063974009826779366, "grad/layer_8/mlp": 0.003933181054890156, "grad/layer_8/attn_mlp_ratio": 1.6265208061225447, "grad/layer_12/attn": 0.0035810673143714666, "grad/layer_12/mlp": 0.005510651972144842, "grad/layer_12/attn_mlp_ratio": 0.649844567845789, "grad/layer_16/attn": 0.0034916745498776436, "grad/layer_16/mlp": 0.004086309112608433, "grad/layer_16/attn_mlp_ratio": 0.8544812367855019, "grad/layer_20/attn": 0.0036440761759877205, "grad/layer_20/mlp": 0.005233676638454199, "grad/layer_20/attn_mlp_ratio": 0.6962745996926097, "grad/layer_24/attn": 0.0054611279629170895, "grad/layer_24/mlp": 0.008000195026397705, "grad/layer_24/attn_mlp_ratio": 0.6826243456109431, "grad/layer_27/attn": 0.005083149764686823, "grad/layer_27/mlp": 0.006831464823335409, "grad/layer_27/attn_mlp_ratio": 0.7440790257626054} {"step": 14600, "timestamp": 1778210315.7145002, "train/loss": 2.289853310585022, "train/z_loss": 0.0016688815900124609, "train/perplexity": 9.873489238579984, "train/grad_norm": 0.099609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021093.5094444798, "perf/iters_per_sec": 0.9637324855062865, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0376323461532593, "data/tokens_consumed": 30620516352, "data/tokens_consumed_B": 30.620516352, "train/loss_slope": -1.1504866938815607e-05} {"step": 14610, "timestamp": 1778210326.0918956, "train/loss": 2.2984920740127563, "train/z_loss": 0.0016548316576518118, "train/perplexity": 9.95915346004328, "train/grad_norm": 0.1357421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021693.1220911606, "perf/iters_per_sec": 0.9640184030967525, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037324595451355, "data/tokens_consumed": 30641487872, "data/tokens_consumed_B": 30.641487872, "train/loss_slope": -7.954314267925035e-06} {"step": 14620, "timestamp": 1778210336.468331, "train/loss": 2.279002332687378, "train/z_loss": 0.001663792331237346, "train/perplexity": 9.766931399001564, "train/grad_norm": 0.224609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022185.6018477743, "perf/iters_per_sec": 0.9642532357443687, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0370719671249389, "data/tokens_consumed": 30662459392, "data/tokens_consumed_B": 30.662459392, "train/loss_slope": -6.31876670666867e-06} {"step": 14625, "timestamp": 1778210342.2577946, "eos/sharpness": 19.947290420532223, "eos/L0_probe": 2.096043348312378, "eos/L_plus": 2.217414379119873, "eos/L_minus": 2.174145221710205, "eos/grad_norm": 0.1361682265996933, "eos/embed_grad_frac": 0.1339685469865799, "eos/time_s": 0.6113028526306152} {"step": 14625, "timestamp": 1778210343.6337261, "geo/rankme_last": 441.4256896972656, "geo/layer_0/stable_rank_q_proj": 15.773947715759277, "geo/layer_0/stable_rank_k_proj": 14.020726203918457, "geo/layer_0/stable_rank_o_proj": 54.43569564819336, "geo/layer_0/stable_rank_gate_proj": 157.670166015625, "geo/layer_0/stable_rank_down_proj": 48.71223831176758, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.03907536342740059, "geo/layer_0/attn_entropy_mean": 6.311810493469238, "geo/layer_0/attn_entropy_std": 0.29703769087791443, "geo/layer_7/stable_rank_q_proj": 43.288230895996094, "geo/layer_7/stable_rank_k_proj": 42.990943908691406, "geo/layer_7/stable_rank_o_proj": 109.5250473022461, "geo/layer_7/stable_rank_gate_proj": 113.20659637451172, "geo/layer_7/stable_rank_down_proj": 158.0314178466797, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5829962491989136, "geo/layer_7/attn_entropy_mean": 4.623181343078613, "geo/layer_7/attn_entropy_std": 0.8556503653526306, "geo/layer_14/stable_rank_q_proj": 59.45073699951172, "geo/layer_14/stable_rank_k_proj": 37.02060317993164, "geo/layer_14/stable_rank_o_proj": 50.703697204589844, "geo/layer_14/stable_rank_gate_proj": 97.58811950683594, "geo/layer_14/stable_rank_down_proj": 137.06161499023438, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3854004144668579, "geo/layer_14/attn_entropy_mean": 5.5293426513671875, "geo/layer_14/attn_entropy_std": 0.5744439959526062, "geo/layer_21/stable_rank_q_proj": 50.09677505493164, "geo/layer_21/stable_rank_k_proj": 32.231502532958984, "geo/layer_21/stable_rank_o_proj": 87.94961547851562, "geo/layer_21/stable_rank_gate_proj": 96.0342788696289, "geo/layer_21/stable_rank_down_proj": 65.39165496826172, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.16095124185085297, "geo/layer_21/attn_entropy_mean": 5.773184299468994, "geo/layer_21/attn_entropy_std": 0.2976480722427368, "geo/layer_27/stable_rank_q_proj": 41.950721740722656, "geo/layer_27/stable_rank_k_proj": 32.226158142089844, "geo/layer_27/stable_rank_o_proj": 116.6594467163086, "geo/layer_27/stable_rank_gate_proj": 93.16309356689453, "geo/layer_27/stable_rank_down_proj": 144.59600830078125, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.06312321126461029, "geo/layer_27/attn_entropy_mean": 4.486108303070068, "geo/layer_27/attn_entropy_std": 0.48148974776268005, "attnres/final_alpha/block_0": 0.24601642787456512, "attnres/block_norm/0": 1.5281994342803955, "attnres/final_alpha/block_1": 0.007870031520724297, "attnres/block_norm/1": 23381.57421875, "attnres/final_alpha/block_2": 0.016387486830353737, "attnres/block_norm/2": 17226.923828125, "attnres/final_alpha/block_3": 0.018598919734358788, "attnres/block_norm/3": 21594.625, "attnres/final_alpha/block_4": 0.024584447965025902, "attnres/block_norm/4": 7418.615234375, "attnres/final_alpha/block_5": 0.5388847589492798, "attnres/block_norm/5": 4426.4521484375, "attnres/final_alpha/block_6": 0.14765791594982147, "attnres/block_norm/6": 14958.51953125, "geo/tier1_time_s": 1.3566958904266357, "geo/step": 14625.0, "geo/rankme_slope": 0.0002414087900785314} {"step": 14630, "timestamp": 1778210348.8259914, "train/loss": 2.2811601161956787, "train/z_loss": 0.0016642252332530916, "train/perplexity": 9.788029076425289, "train/grad_norm": 0.2001953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1697801.7140296735, "perf/iters_per_sec": 0.8095749445103042, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2352160930633544, "data/tokens_consumed": 30683430912, "data/tokens_consumed_B": 30.683430912, "train/loss_slope": -7.146838710646401e-06} {"step": 14640, "timestamp": 1778210359.2063003, "train/loss": 2.270513451099396, "train/z_loss": 0.0016713523073121905, "train/perplexity": 9.684371989180768, "train/grad_norm": 0.150390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021366.84000876, "perf/iters_per_sec": 0.963862819675808, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037492036819458, "data/tokens_consumed": 30704402432, "data/tokens_consumed_B": 30.704402432, "train/loss_slope": -6.4111691854609876e-06} {"step": 14650, "timestamp": 1778210369.5724356, "grad/layer_0/attn": 0.0026427966076880693, "grad/layer_0/mlp": 0.0026529976166784763, "grad/layer_0/attn_mlp_ratio": 0.9961548745683887, "grad/layer_4/attn": 0.0017762535717338324, "grad/layer_4/mlp": 0.0025486196391284466, "grad/layer_4/attn_mlp_ratio": 0.6969472708946605, "grad/layer_8/attn": 0.008766497485339642, "grad/layer_8/mlp": 0.003947106190025806, "grad/layer_8/attn_mlp_ratio": 2.220993518084948, "grad/layer_12/attn": 0.005141480825841427, "grad/layer_12/mlp": 0.005898524075746536, "grad/layer_12/attn_mlp_ratio": 0.8716554637483883, "grad/layer_16/attn": 0.005769933108240366, "grad/layer_16/mlp": 0.00464354082942009, "grad/layer_16/attn_mlp_ratio": 1.242571820931693, "grad/layer_20/attn": 0.006700917612761259, "grad/layer_20/mlp": 0.006588711403310299, "grad/layer_20/attn_mlp_ratio": 1.0170300535081231, "grad/layer_24/attn": 0.00870551262050867, "grad/layer_24/mlp": 0.009457654319703579, "grad/layer_24/attn_mlp_ratio": 0.9204726916614827, "grad/layer_27/attn": 0.004996065050363541, "grad/layer_27/mlp": 0.008676739409565926, "grad/layer_27/attn_mlp_ratio": 0.575799820296032} {"step": 14650, "timestamp": 1778210369.5892763, "train/loss": 2.26598265171051, "train/z_loss": 0.0016626884462311864, "train/perplexity": 9.640593293625233, "train/grad_norm": 0.130859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020884.9269168111, "perf/iters_per_sec": 0.9636330256065422, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0377394437789917, "data/tokens_consumed": 30725373952, "data/tokens_consumed_B": 30.725373952, "train/loss_slope": -8.490685460948698e-06} {"step": 14660, "timestamp": 1778210379.9659848, "train/loss": 2.2082797288894653, "train/z_loss": 0.0016738627105951308, "train/perplexity": 9.100048369965167, "train/grad_norm": 0.2353515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022007.1919819205, "perf/iters_per_sec": 0.9641681632909396, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0371634721755982, "data/tokens_consumed": 30746345472, "data/tokens_consumed_B": 30.746345472, "train/loss_slope": -1.1318910340092509e-05} {"step": 14670, "timestamp": 1778210390.3454947, "train/loss": 2.261350464820862, "train/z_loss": 0.0016682818648405373, "train/perplexity": 9.596039534098836, "train/grad_norm": 0.2177734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021399.5889009424, "perf/iters_per_sec": 0.9638784355644905, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0374752283096313, "data/tokens_consumed": 30767316992, "data/tokens_consumed_B": 30.767316992, "train/loss_slope": -6.503592358671258e-06} {"step": 14680, "timestamp": 1778210400.7330012, "train/loss": 2.245773935317993, "train/z_loss": 0.0016724918386898935, "train/perplexity": 9.447724655496142, "train/grad_norm": 0.1728515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020086.8417348743, "perf/iters_per_sec": 0.9632524689363834, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0381494283676147, "data/tokens_consumed": 30788288512, "data/tokens_consumed_B": 30.788288512, "train/loss_slope": -6.563189850173946e-06} {"step": 14690, "timestamp": 1778210411.120897, "train/loss": 2.3132802724838255, "train/z_loss": 0.0016612999723292887, "train/perplexity": 10.107525773662964, "train/grad_norm": 0.201171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019745.9124533907, "perf/iters_per_sec": 0.9630899011866525, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0383246660232544, "data/tokens_consumed": 30809260032, "data/tokens_consumed_B": 30.809260032, "train/loss_slope": -3.4971523456590518e-06} {"step": 14700, "timestamp": 1778210421.49113, "grad/layer_0/attn": 0.0029760280158370733, "grad/layer_0/mlp": 0.002791468519717455, "grad/layer_0/attn_mlp_ratio": 1.0661155188405087, "grad/layer_4/attn": 0.001982402754947543, "grad/layer_4/mlp": 0.0026319907046854496, "grad/layer_4/attn_mlp_ratio": 0.7531951674825327, "grad/layer_8/attn": 0.013853028416633606, "grad/layer_8/mlp": 0.004243920557200909, "grad/layer_8/attn_mlp_ratio": 3.26420532700779, "grad/layer_12/attn": 0.003904651151970029, "grad/layer_12/mlp": 0.005882288329303265, "grad/layer_12/attn_mlp_ratio": 0.6637979757195485, "grad/layer_16/attn": 0.005262564867734909, "grad/layer_16/mlp": 0.0045692999847233295, "grad/layer_16/attn_mlp_ratio": 1.1517223141744164, "grad/layer_20/attn": 0.009960095398128033, "grad/layer_20/mlp": 0.006120041012763977, "grad/layer_20/attn_mlp_ratio": 1.6274556354458514, "grad/layer_24/attn": 0.021388201043009758, "grad/layer_24/mlp": 0.016449032351374626, "grad/layer_24/attn_mlp_ratio": 1.3002710710332612, "grad/layer_27/attn": 0.008432739414274693, "grad/layer_27/mlp": 0.015707770362496376, "grad/layer_27/attn_mlp_ratio": 0.5368514541518522} {"step": 14700, "timestamp": 1778210422.118768, "eos/sharpness": 31.059312820434563, "eos/L0_probe": 2.093451499938965, "eos/L_plus": 2.2697925567626953, "eos/L_minus": 2.22770357131958, "eos/grad_norm": 0.2228945791721344, "eos/embed_grad_frac": 0.05831480398774147, "eos/time_s": 0.6246497631072998} {"step": 14700, "timestamp": 1778210422.1397877, "train/loss": 2.303782749176025, "train/z_loss": 0.0016533904243260622, "train/perplexity": 10.011983736585462, "train/grad_norm": 0.22265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1903979.498498056, "perf/iters_per_sec": 0.9078881733408242, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1014572381973267, "data/tokens_consumed": 30830231552, "data/tokens_consumed_B": 30.830231552, "train/loss_slope": -1.1812304506207362e-06} {"step": 14700, "timestamp": 1778210423.500009, "geo/rankme_last": 441.2451171875, "geo/layer_0/stable_rank_q_proj": 15.790745735168457, "geo/layer_0/stable_rank_k_proj": 14.052302360534668, "geo/layer_0/stable_rank_o_proj": 54.2549934387207, "geo/layer_0/stable_rank_gate_proj": 156.94383239746094, "geo/layer_0/stable_rank_down_proj": 48.70765686035156, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.03838440775871277, "geo/layer_0/attn_entropy_mean": 6.313079833984375, "geo/layer_0/attn_entropy_std": 0.2944963276386261, "geo/layer_7/stable_rank_q_proj": 43.32027816772461, "geo/layer_7/stable_rank_k_proj": 42.72378921508789, "geo/layer_7/stable_rank_o_proj": 109.92049407958984, "geo/layer_7/stable_rank_gate_proj": 112.90947723388672, "geo/layer_7/stable_rank_down_proj": 158.0494384765625, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5736485719680786, "geo/layer_7/attn_entropy_mean": 4.682663917541504, "geo/layer_7/attn_entropy_std": 0.8867632150650024, "geo/layer_14/stable_rank_q_proj": 59.564456939697266, "geo/layer_14/stable_rank_k_proj": 36.897789001464844, "geo/layer_14/stable_rank_o_proj": 50.79835891723633, "geo/layer_14/stable_rank_gate_proj": 97.0325698852539, "geo/layer_14/stable_rank_down_proj": 136.9348602294922, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.389376163482666, "geo/layer_14/attn_entropy_mean": 5.556268692016602, "geo/layer_14/attn_entropy_std": 0.5491852164268494, "geo/layer_21/stable_rank_q_proj": 50.05000305175781, "geo/layer_21/stable_rank_k_proj": 32.26126480102539, "geo/layer_21/stable_rank_o_proj": 87.91291809082031, "geo/layer_21/stable_rank_gate_proj": 95.97439575195312, "geo/layer_21/stable_rank_down_proj": 65.36382293701172, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1572854071855545, "geo/layer_21/attn_entropy_mean": 5.781004905700684, "geo/layer_21/attn_entropy_std": 0.30349141359329224, "geo/layer_27/stable_rank_q_proj": 41.99917221069336, "geo/layer_27/stable_rank_k_proj": 32.292850494384766, "geo/layer_27/stable_rank_o_proj": 116.97218322753906, "geo/layer_27/stable_rank_gate_proj": 92.99930572509766, "geo/layer_27/stable_rank_down_proj": 144.79629516601562, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.06751462817192078, "geo/layer_27/attn_entropy_mean": 4.451564311981201, "geo/layer_27/attn_entropy_std": 0.4873256981372833, "attnres/final_alpha/block_0": 0.24394506216049194, "attnres/block_norm/0": 1.5293359756469727, "attnres/final_alpha/block_1": 0.0078107439912855625, "attnres/block_norm/1": 23474.388671875, "attnres/final_alpha/block_2": 0.01657630130648613, "attnres/block_norm/2": 17226.4296875, "attnres/final_alpha/block_3": 0.018586130812764168, "attnres/block_norm/3": 21588.953125, "attnres/final_alpha/block_4": 0.024587590247392654, "attnres/block_norm/4": 7459.52099609375, "attnres/final_alpha/block_5": 0.5404296517372131, "attnres/block_norm/5": 4482.029296875, "attnres/final_alpha/block_6": 0.1480644941329956, "attnres/block_norm/6": 14966.787109375, "geo/tier1_time_s": 1.356379747390747, "geo/step": 14700.0, "geo/rankme_slope": 0.00027194565326130453} {"step": 14710, "timestamp": 1778210433.8849604, "train/loss": 2.190246343612671, "train/z_loss": 0.0016896068584173918, "train/perplexity": 8.93741451851716, "train/grad_norm": 0.1337890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1786096.75287105, "perf/iters_per_sec": 0.8516772999148607, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1741536378860473, "data/tokens_consumed": 30851203072, "data/tokens_consumed_B": 30.851203072, "train/loss_slope": -3.3325680304389e-06} {"step": 14720, "timestamp": 1778210444.767391, "train/loss": 2.2685293436050413, "train/z_loss": 0.001660074875690043, "train/perplexity": 9.665176203687155, "train/grad_norm": 0.1484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1927975.1990418267, "perf/iters_per_sec": 0.9193302149972089, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0877484321594237, "data/tokens_consumed": 30872174592, "data/tokens_consumed_B": 30.872174592, "train/loss_slope": -6.07078064679027e-06} {"step": 14730, "timestamp": 1778210455.1590667, "train/loss": 2.2514468908309935, "train/z_loss": 0.001661754201631993, "train/perplexity": 9.501473490393233, "train/grad_norm": 0.10205078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019049.012311455, "perf/iters_per_sec": 0.9627575933034206, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0386830568313599, "data/tokens_consumed": 30893146112, "data/tokens_consumed_B": 30.893146112, "train/loss_slope": -7.81445124111413e-06} {"step": 14740, "timestamp": 1778210465.5418007, "train/loss": 2.2750006198883055, "train/z_loss": 0.0016635506879538297, "train/perplexity": 9.727925042784985, "train/grad_norm": 0.1591796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020878.4732465951, "perf/iters_per_sec": 0.9636299482567764, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0377427577972411, "data/tokens_consumed": 30914117632, "data/tokens_consumed_B": 30.914117632, "train/loss_slope": -8.158805954753228e-06} {"step": 14750, "timestamp": 1778210475.9071634, "grad/layer_0/attn": 0.0022942651994526386, "grad/layer_0/mlp": 0.002379540354013443, "grad/layer_0/attn_mlp_ratio": 0.9641631414935706, "grad/layer_4/attn": 0.001461030449718237, "grad/layer_4/mlp": 0.0024525397457182407, "grad/layer_4/attn_mlp_ratio": 0.5957213915480201, "grad/layer_8/attn": 0.004439660347998142, "grad/layer_8/mlp": 0.003689225297421217, "grad/layer_8/attn_mlp_ratio": 1.203412605557113, "grad/layer_12/attn": 0.005052669905126095, "grad/layer_12/mlp": 0.005904044024646282, "grad/layer_12/attn_mlp_ratio": 0.8557981272588823, "grad/layer_16/attn": 0.005438910331577063, "grad/layer_16/mlp": 0.004021893255412579, "grad/layer_16/attn_mlp_ratio": 1.3523258453030569, "grad/layer_20/attn": 0.004131547175347805, "grad/layer_20/mlp": 0.005402452778071165, "grad/layer_20/attn_mlp_ratio": 0.7647539494732047, "grad/layer_24/attn": 0.005214205011725426, "grad/layer_24/mlp": 0.007954185828566551, "grad/layer_24/attn_mlp_ratio": 0.6555296869537841, "grad/layer_27/attn": 0.006916484795510769, "grad/layer_27/mlp": 0.007451064884662628, "grad/layer_27/attn_mlp_ratio": 0.9282545259969889} {"step": 14750, "timestamp": 1778210475.9243953, "train/loss": 2.2396910429000854, "train/z_loss": 0.001663818338420242, "train/perplexity": 9.390429599321331, "train/grad_norm": 0.095703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020898.1129610823, "perf/iters_per_sec": 0.9636393132024204, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0377326726913452, "data/tokens_consumed": 30935089152, "data/tokens_consumed_B": 30.935089152, "train/loss_slope": -1.0623496336297484e-05} {"step": 14760, "timestamp": 1778210486.306389, "train/loss": 2.2760785818099976, "train/z_loss": 0.0016634912928566337, "train/perplexity": 9.738417029523301, "train/grad_norm": 0.20703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020813.5674021498, "perf/iters_per_sec": 0.9635989987383603, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0377760887145997, "data/tokens_consumed": 30956060672, "data/tokens_consumed_B": 30.956060672, "train/loss_slope": -9.512360189685701e-06} {"step": 14770, "timestamp": 1778210496.6865537, "train/loss": 2.311911177635193, "train/z_loss": 0.0016453303978778422, "train/perplexity": 10.093697080750003, "train/grad_norm": 0.1005859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021451.8035137453, "perf/iters_per_sec": 0.9639033334320761, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0374484300613402, "data/tokens_consumed": 30977032192, "data/tokens_consumed_B": 30.977032192, "train/loss_slope": -7.753164962072714e-06} {"step": 14775, "timestamp": 1778210502.476698, "eos/sharpness": 28.22201251983642, "eos/L0_probe": 2.086826801300049, "eos/L_plus": 2.2076711654663086, "eos/L_minus": 2.2482025623321533, "eos/grad_norm": 0.19234207272529602, "eos/embed_grad_frac": 0.08731752634048462, "eos/time_s": 0.6106791496276855} {"step": 14775, "timestamp": 1778210503.8551097, "geo/rankme_last": 441.2409362792969, "geo/layer_0/stable_rank_q_proj": 15.826188087463379, "geo/layer_0/stable_rank_k_proj": 14.16958999633789, "geo/layer_0/stable_rank_o_proj": 54.37502670288086, "geo/layer_0/stable_rank_gate_proj": 156.68896484375, "geo/layer_0/stable_rank_down_proj": 48.63859939575195, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.03913813829421997, "geo/layer_0/attn_entropy_mean": 6.314328193664551, "geo/layer_0/attn_entropy_std": 0.2957175672054291, "geo/layer_7/stable_rank_q_proj": 43.3746337890625, "geo/layer_7/stable_rank_k_proj": 42.68257141113281, "geo/layer_7/stable_rank_o_proj": 110.28629302978516, "geo/layer_7/stable_rank_gate_proj": 112.87982940673828, "geo/layer_7/stable_rank_down_proj": 157.87808227539062, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5781120657920837, "geo/layer_7/attn_entropy_mean": 4.666450500488281, "geo/layer_7/attn_entropy_std": 0.8836731910705566, "geo/layer_14/stable_rank_q_proj": 59.59054183959961, "geo/layer_14/stable_rank_k_proj": 36.93248748779297, "geo/layer_14/stable_rank_o_proj": 50.92561340332031, "geo/layer_14/stable_rank_gate_proj": 96.78211212158203, "geo/layer_14/stable_rank_down_proj": 136.68719482421875, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3741895854473114, "geo/layer_14/attn_entropy_mean": 5.561986923217773, "geo/layer_14/attn_entropy_std": 0.5642601251602173, "geo/layer_21/stable_rank_q_proj": 50.04484558105469, "geo/layer_21/stable_rank_k_proj": 32.164642333984375, "geo/layer_21/stable_rank_o_proj": 87.75141143798828, "geo/layer_21/stable_rank_gate_proj": 96.10408020019531, "geo/layer_21/stable_rank_down_proj": 65.37742614746094, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15775121748447418, "geo/layer_21/attn_entropy_mean": 5.769865036010742, "geo/layer_21/attn_entropy_std": 0.2914820611476898, "geo/layer_27/stable_rank_q_proj": 42.13230514526367, "geo/layer_27/stable_rank_k_proj": 32.40129470825195, "geo/layer_27/stable_rank_o_proj": 116.83440399169922, "geo/layer_27/stable_rank_gate_proj": 93.28334045410156, "geo/layer_27/stable_rank_down_proj": 144.66357421875, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.06931041926145554, "geo/layer_27/attn_entropy_mean": 4.461390972137451, "geo/layer_27/attn_entropy_std": 0.49672460556030273, "attnres/final_alpha/block_0": 0.2453862428665161, "attnres/block_norm/0": 1.5306003093719482, "attnres/final_alpha/block_1": 0.007773824501782656, "attnres/block_norm/1": 23533.8125, "attnres/final_alpha/block_2": 0.016580089926719666, "attnres/block_norm/2": 17346.630859375, "attnres/final_alpha/block_3": 0.018614934757351875, "attnres/block_norm/3": 21780.904296875, "attnres/final_alpha/block_4": 0.024800734594464302, "attnres/block_norm/4": 7476.96484375, "attnres/final_alpha/block_5": 0.539474606513977, "attnres/block_norm/5": 4462.580078125, "attnres/final_alpha/block_6": 0.14736956357955933, "attnres/block_norm/6": 15023.146484375, "geo/tier1_time_s": 1.3586013317108154, "geo/step": 14775.0, "geo/rankme_slope": 0.0002445174749587335} {"step": 14780, "timestamp": 1778210509.0639892, "train/loss": 2.305173420906067, "train/z_loss": 0.001654309977311641, "train/perplexity": 10.025916805246409, "train/grad_norm": 0.2001953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1695056.315744259, "perf/iters_per_sec": 0.8082658365937514, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2372167110443115, "data/tokens_consumed": 30998003712, "data/tokens_consumed_B": 30.998003712, "train/loss_slope": -7.593676509565353e-06} {"step": 14790, "timestamp": 1778210519.4443161, "train/loss": 2.2447650909423826, "train/z_loss": 0.0016688830452039837, "train/perplexity": 9.438198177789832, "train/grad_norm": 0.1962890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021675.0002897107, "perf/iters_per_sec": 0.9640097619484476, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03733389377594, "data/tokens_consumed": 31018975232, "data/tokens_consumed_B": 31.018975232, "train/loss_slope": -1.1258053815368448e-05} {"step": 14800, "timestamp": 1778210529.8123949, "grad/layer_0/attn": 0.003524268278852105, "grad/layer_0/mlp": 0.0027574968989938498, "grad/layer_0/attn_mlp_ratio": 1.2780678565155406, "grad/layer_4/attn": 0.0014774341834709048, "grad/layer_4/mlp": 0.002436981536448002, "grad/layer_4/attn_mlp_ratio": 0.6062557720477222, "grad/layer_8/attn": 0.00429013604298234, "grad/layer_8/mlp": 0.0038391195703297853, "grad/layer_8/attn_mlp_ratio": 1.1174791127607158, "grad/layer_12/attn": 0.0037152066361159086, "grad/layer_12/mlp": 0.00603030389174819, "grad/layer_12/attn_mlp_ratio": 0.616089444445879, "grad/layer_16/attn": 0.004989983513951302, "grad/layer_16/mlp": 0.004502854775637388, "grad/layer_16/attn_mlp_ratio": 1.1081821759234378, "grad/layer_20/attn": 0.00809712614864111, "grad/layer_20/mlp": 0.005972987040877342, "grad/layer_20/attn_mlp_ratio": 1.3556242392063418, "grad/layer_24/attn": 0.008633473888039589, "grad/layer_24/mlp": 0.012660791166126728, "grad/layer_24/attn_mlp_ratio": 0.6819063442849728, "grad/layer_27/attn": 0.012993644922971725, "grad/layer_27/mlp": 0.009231598116457462, "grad/layer_27/attn_mlp_ratio": 1.4075184619503418} {"step": 14800, "timestamp": 1778210529.8289628, "train/loss": 2.284982204437256, "train/z_loss": 0.0016602070070803165, "train/perplexity": 9.825511371957463, "train/grad_norm": 0.16015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020529.757776511, "perf/iters_per_sec": 0.963463667763, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0379218578338623, "data/tokens_consumed": 31039946752, "data/tokens_consumed_B": 31.039946752, "train/loss_slope": -1.5773408819954594e-05} {"step": 14810, "timestamp": 1778210540.2054932, "train/loss": 2.2996166229248045, "train/z_loss": 0.0016470756148919464, "train/perplexity": 9.970359314816669, "train/grad_norm": 0.263671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021914.9778590468, "perf/iters_per_sec": 0.9641241921706423, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0372107744216919, "data/tokens_consumed": 31060918272, "data/tokens_consumed_B": 31.060918272, "train/loss_slope": -1.2755062027160698e-05} {"step": 14820, "timestamp": 1778210550.5836065, "train/loss": 2.253156542778015, "train/z_loss": 0.0016636269981972873, "train/perplexity": 9.51773159693733, "train/grad_norm": 0.259765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021875.9845985984, "perf/iters_per_sec": 0.964105598735141, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0372307777404786, "data/tokens_consumed": 31081889792, "data/tokens_consumed_B": 31.081889792, "train/loss_slope": -1.4883808684785456e-05} {"step": 14830, "timestamp": 1778210560.9593887, "train/loss": 2.249364805221558, "train/z_loss": 0.0016672058147378266, "train/perplexity": 9.481711189710643, "train/grad_norm": 0.1806640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022337.6330103443, "perf/iters_per_sec": 0.9643257298518869, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0369940042495727, "data/tokens_consumed": 31102861312, "data/tokens_consumed_B": 31.102861312, "train/loss_slope": -1.5394252776527393e-05} {"step": 14840, "timestamp": 1778210571.3367798, "train/loss": 2.232513403892517, "train/z_loss": 0.0016854575136676432, "train/perplexity": 9.323269798266097, "train/grad_norm": 0.177734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021972.3318236666, "perf/iters_per_sec": 0.9641515406721433, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0371813535690309, "data/tokens_consumed": 31123832832, "data/tokens_consumed_B": 31.123832832, "train/loss_slope": -1.4571778179824561e-05} {"step": 14850, "timestamp": 1778210581.7070634, "grad/layer_0/attn": 0.0028474910650402308, "grad/layer_0/mlp": 0.0027704241219908, "grad/layer_0/attn_mlp_ratio": 1.0278177047535526, "grad/layer_4/attn": 0.0027279662899672985, "grad/layer_4/mlp": 0.0025838350411504507, "grad/layer_4/attn_mlp_ratio": 1.0557818672412178, "grad/layer_8/attn": 0.00910815317183733, "grad/layer_8/mlp": 0.003865396836772561, "grad/layer_8/attn_mlp_ratio": 2.3563306229145624, "grad/layer_12/attn": 0.005367127712816, "grad/layer_12/mlp": 0.005976702552288771, "grad/layer_12/attn_mlp_ratio": 0.8980081535026117, "grad/layer_16/attn": 0.0046426900662481785, "grad/layer_16/mlp": 0.004601513966917992, "grad/layer_16/attn_mlp_ratio": 1.0089483588947856, "grad/layer_20/attn": 0.005176251288503408, "grad/layer_20/mlp": 0.006594066042453051, "grad/layer_20/attn_mlp_ratio": 0.7849862553210298, "grad/layer_24/attn": 0.013049964793026447, "grad/layer_24/mlp": 0.010929862037301064, "grad/layer_24/attn_mlp_ratio": 1.1939734123900767, "grad/layer_27/attn": 0.008717422373592854, "grad/layer_27/mlp": 0.010716699995100498, "grad/layer_27/attn_mlp_ratio": 0.813442785207577} {"step": 14850, "timestamp": 1778210582.3177898, "eos/sharpness": 38.95654678344726, "eos/L0_probe": 2.090919256210327, "eos/L_plus": 2.2649624347686768, "eos/L_minus": 2.30644154548645, "eos/grad_norm": 0.17452691495418549, "eos/embed_grad_frac": 0.08537398278713226, "eos/time_s": 0.607912540435791} {"step": 14850, "timestamp": 1778210582.3372364, "train/loss": 2.2407809257507325, "train/z_loss": 0.0016524865059182048, "train/perplexity": 9.400669646714231, "train/grad_norm": 0.1748046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1907358.608012439, "perf/iters_per_sec": 0.9094994583189197, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.099505877494812, "data/tokens_consumed": 31144804352, "data/tokens_consumed_B": 31.144804352, "train/loss_slope": -1.5048991943528806e-05} {"step": 14850, "timestamp": 1778210583.6990077, "geo/rankme_last": 440.33837890625, "geo/layer_0/stable_rank_q_proj": 15.808311462402344, "geo/layer_0/stable_rank_k_proj": 14.193689346313477, "geo/layer_0/stable_rank_o_proj": 54.367218017578125, "geo/layer_0/stable_rank_gate_proj": 157.31898498535156, "geo/layer_0/stable_rank_down_proj": 48.672733306884766, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.04563768208026886, "geo/layer_0/attn_entropy_mean": 6.30717658996582, "geo/layer_0/attn_entropy_std": 0.2993062734603882, "geo/layer_7/stable_rank_q_proj": 43.48038101196289, "geo/layer_7/stable_rank_k_proj": 42.92448043823242, "geo/layer_7/stable_rank_o_proj": 110.38456726074219, "geo/layer_7/stable_rank_gate_proj": 112.61307525634766, "geo/layer_7/stable_rank_down_proj": 157.61512756347656, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5866063237190247, "geo/layer_7/attn_entropy_mean": 4.657222270965576, "geo/layer_7/attn_entropy_std": 0.8999659419059753, "geo/layer_14/stable_rank_q_proj": 59.55430603027344, "geo/layer_14/stable_rank_k_proj": 36.890743255615234, "geo/layer_14/stable_rank_o_proj": 50.96809768676758, "geo/layer_14/stable_rank_gate_proj": 96.59632873535156, "geo/layer_14/stable_rank_down_proj": 136.74171447753906, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3825586140155792, "geo/layer_14/attn_entropy_mean": 5.5567626953125, "geo/layer_14/attn_entropy_std": 0.5166983604431152, "geo/layer_21/stable_rank_q_proj": 50.05446243286133, "geo/layer_21/stable_rank_k_proj": 32.19825744628906, "geo/layer_21/stable_rank_o_proj": 87.41305541992188, "geo/layer_21/stable_rank_gate_proj": 95.96409606933594, "geo/layer_21/stable_rank_down_proj": 65.2726058959961, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15869709849357605, "geo/layer_21/attn_entropy_mean": 5.78926420211792, "geo/layer_21/attn_entropy_std": 0.293081670999527, "geo/layer_27/stable_rank_q_proj": 42.103878021240234, "geo/layer_27/stable_rank_k_proj": 32.51539611816406, "geo/layer_27/stable_rank_o_proj": 116.5112533569336, "geo/layer_27/stable_rank_gate_proj": 93.37460327148438, "geo/layer_27/stable_rank_down_proj": 144.70777893066406, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07161020487546921, "geo/layer_27/attn_entropy_mean": 4.467121601104736, "geo/layer_27/attn_entropy_std": 0.5103049874305725, "attnres/final_alpha/block_0": 0.24586915969848633, "attnres/block_norm/0": 1.5322237014770508, "attnres/final_alpha/block_1": 0.007879162207245827, "attnres/block_norm/1": 23583.34375, "attnres/final_alpha/block_2": 0.01661059632897377, "attnres/block_norm/2": 17436.140625, "attnres/final_alpha/block_3": 0.01870063692331314, "attnres/block_norm/3": 21768.4296875, "attnres/final_alpha/block_4": 0.024501275271177292, "attnres/block_norm/4": 7510.6279296875, "attnres/final_alpha/block_5": 0.5361623764038086, "attnres/block_norm/5": 4514.7998046875, "attnres/final_alpha/block_6": 0.1502768099308014, "attnres/block_norm/6": 15143.5009765625, "geo/tier1_time_s": 1.3580734729766846, "geo/step": 14850.0, "geo/rankme_slope": 0.00020199925673394358} {"step": 14860, "timestamp": 1778210594.0780265, "train/loss": 2.2443125247955322, "train/z_loss": 0.001673414872493595, "train/perplexity": 9.43392773520906, "train/grad_norm": 0.1826171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1786759.5735678084, "perf/iters_per_sec": 0.8519933574523012, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1737180709838868, "data/tokens_consumed": 31165775872, "data/tokens_consumed_B": 31.165775872, "train/loss_slope": -1.6293693211140916e-05} {"step": 14870, "timestamp": 1778210604.4585757, "train/loss": 2.278515887260437, "train/z_loss": 0.0016535987029783427, "train/perplexity": 9.762181475270278, "train/grad_norm": 0.240234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021378.8710387477, "perf/iters_per_sec": 0.9638685565179576, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0374858617782592, "data/tokens_consumed": 31186747392, "data/tokens_consumed_B": 31.186747392, "train/loss_slope": -1.558140809922976e-05} {"step": 14880, "timestamp": 1778210614.8356998, "train/loss": 2.3019047737121583, "train/z_loss": 0.001658317761030048, "train/perplexity": 9.99319912082805, "train/grad_norm": 0.232421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022140.6012889654, "perf/iters_per_sec": 0.9642317778057887, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037095046043396, "data/tokens_consumed": 31207718912, "data/tokens_consumed_B": 31.207718912, "train/loss_slope": -1.3525442729438381e-05} {"step": 14890, "timestamp": 1778210625.5892239, "train/loss": 2.272555112838745, "train/z_loss": 0.0016603620373643936, "train/perplexity": 9.704164398768203, "train/grad_norm": 0.115234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1951193.01174348, "perf/iters_per_sec": 0.9304013308255578, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.074804997444153, "data/tokens_consumed": 31228690432, "data/tokens_consumed_B": 31.228690432, "train/loss_slope": -1.2416079750370076e-05} {"step": 14900, "timestamp": 1778210635.9649696, "grad/layer_0/attn": 0.0025886164512485266, "grad/layer_0/mlp": 0.0025940239429473877, "grad/layer_0/attn_mlp_ratio": 0.997915365621393, "grad/layer_4/attn": 0.001604396034963429, "grad/layer_4/mlp": 0.0025657829828560352, "grad/layer_4/attn_mlp_ratio": 0.6253046275359867, "grad/layer_8/attn": 0.005042123142629862, "grad/layer_8/mlp": 0.003840081160888076, "grad/layer_8/attn_mlp_ratio": 1.3130251158965829, "grad/layer_12/attn": 0.0043901922181248665, "grad/layer_12/mlp": 0.005745334085077047, "grad/layer_12/attn_mlp_ratio": 0.7641317418102447, "grad/layer_16/attn": 0.00597346993163228, "grad/layer_16/mlp": 0.004433851223438978, "grad/layer_16/attn_mlp_ratio": 1.3472418211351178, "grad/layer_20/attn": 0.007664641831070185, "grad/layer_20/mlp": 0.0056771449744701385, "grad/layer_20/attn_mlp_ratio": 1.3500873644285978, "grad/layer_24/attn": 0.01185862347483635, "grad/layer_24/mlp": 0.010196723975241184, "grad/layer_24/attn_mlp_ratio": 1.1629836589998985, "grad/layer_27/attn": 0.00914554949849844, "grad/layer_27/mlp": 0.008403380401432514, "grad/layer_27/attn_mlp_ratio": 1.0883179093151154} {"step": 14900, "timestamp": 1778210635.982003, "train/loss": 2.25164258480072, "train/z_loss": 0.0016583414748311044, "train/perplexity": 9.50333305340552, "train/grad_norm": 0.12890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019110.4677967338, "perf/iters_per_sec": 0.9627868975623769, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.038651442527771, "data/tokens_consumed": 31249661952, "data/tokens_consumed_B": 31.249661952, "train/loss_slope": -9.705834155536251e-06} {"step": 14910, "timestamp": 1778210646.863736, "train/loss": 2.2743627071380614, "train/z_loss": 0.001661602349486202, "train/perplexity": 9.721721454251258, "train/grad_norm": 0.1826171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1928265.17754726, "perf/iters_per_sec": 0.9194684875236797, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0875848531723022, "data/tokens_consumed": 31270633472, "data/tokens_consumed_B": 31.270633472, "train/loss_slope": -4.25402427842732e-06} {"step": 14920, "timestamp": 1778210657.2451665, "train/loss": 2.3309734582901003, "train/z_loss": 0.0016512532718479633, "train/perplexity": 10.287951551811357, "train/grad_norm": 0.1484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021187.1348359983, "perf/iters_per_sec": 0.9637771295719139, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0375842809677125, "data/tokens_consumed": 31291604992, "data/tokens_consumed_B": 31.291604992, "train/loss_slope": 5.191133289125279e-07} {"step": 14925, "timestamp": 1778210663.031557, "eos/sharpness": 23.511624336242672, "eos/L0_probe": 2.0899605751037598, "eos/L_plus": 2.2623138427734375, "eos/L_minus": 2.152723550796509, "eos/grad_norm": 0.1544630378484726, "eos/embed_grad_frac": 0.15299855172634125, "eos/time_s": 0.6076672077178955} {"step": 14925, "timestamp": 1778210664.412445, "geo/rankme_last": 441.39959716796875, "geo/layer_0/stable_rank_q_proj": 15.883740425109863, "geo/layer_0/stable_rank_k_proj": 14.207657814025879, "geo/layer_0/stable_rank_o_proj": 54.12504577636719, "geo/layer_0/stable_rank_gate_proj": 157.77426147460938, "geo/layer_0/stable_rank_down_proj": 48.656585693359375, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.043054938316345215, "geo/layer_0/attn_entropy_mean": 6.305434703826904, "geo/layer_0/attn_entropy_std": 0.3007179796695709, "geo/layer_7/stable_rank_q_proj": 43.43785858154297, "geo/layer_7/stable_rank_k_proj": 42.732444763183594, "geo/layer_7/stable_rank_o_proj": 109.89862823486328, "geo/layer_7/stable_rank_gate_proj": 112.21940612792969, "geo/layer_7/stable_rank_down_proj": 157.75022888183594, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.566811740398407, "geo/layer_7/attn_entropy_mean": 4.664142608642578, "geo/layer_7/attn_entropy_std": 0.8706892728805542, "geo/layer_14/stable_rank_q_proj": 59.41971206665039, "geo/layer_14/stable_rank_k_proj": 36.83803176879883, "geo/layer_14/stable_rank_o_proj": 51.216331481933594, "geo/layer_14/stable_rank_gate_proj": 96.4148941040039, "geo/layer_14/stable_rank_down_proj": 136.17408752441406, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38185498118400574, "geo/layer_14/attn_entropy_mean": 5.564443588256836, "geo/layer_14/attn_entropy_std": 0.5324665904045105, "geo/layer_21/stable_rank_q_proj": 50.180267333984375, "geo/layer_21/stable_rank_k_proj": 32.3022346496582, "geo/layer_21/stable_rank_o_proj": 87.4792709350586, "geo/layer_21/stable_rank_gate_proj": 96.05870056152344, "geo/layer_21/stable_rank_down_proj": 65.20735168457031, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15873581171035767, "geo/layer_21/attn_entropy_mean": 5.776864051818848, "geo/layer_21/attn_entropy_std": 0.3032514452934265, "geo/layer_27/stable_rank_q_proj": 42.11549377441406, "geo/layer_27/stable_rank_k_proj": 32.38496017456055, "geo/layer_27/stable_rank_o_proj": 116.30184936523438, "geo/layer_27/stable_rank_gate_proj": 93.41740417480469, "geo/layer_27/stable_rank_down_proj": 144.4989013671875, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.06642524898052216, "geo/layer_27/attn_entropy_mean": 4.446183681488037, "geo/layer_27/attn_entropy_std": 0.494213730096817, "attnres/final_alpha/block_0": 0.2463981658220291, "attnres/block_norm/0": 1.5335949659347534, "attnres/final_alpha/block_1": 0.007936190813779831, "attnres/block_norm/1": 23714.78515625, "attnres/final_alpha/block_2": 0.016653481870889664, "attnres/block_norm/2": 17419.421875, "attnres/final_alpha/block_3": 0.018447300419211388, "attnres/block_norm/3": 22083.71484375, "attnres/final_alpha/block_4": 0.024824772030115128, "attnres/block_norm/4": 7541.4580078125, "attnres/final_alpha/block_5": 0.5372507572174072, "attnres/block_norm/5": 4493.12109375, "attnres/final_alpha/block_6": 0.1484893560409546, "attnres/block_norm/6": 15182.5888671875, "geo/tier1_time_s": 1.3613471984863281, "geo/step": 14925.0, "geo/rankme_slope": 0.00022401310133428372} {"step": 14930, "timestamp": 1778210669.703353, "train/loss": 2.280966854095459, "train/z_loss": 0.0016533493413589894, "train/perplexity": 9.786137604149808, "train/grad_norm": 0.2001953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1684308.898795322, "perf/iters_per_sec": 0.8031410688377962, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2451112747192383, "data/tokens_consumed": 31312576512, "data/tokens_consumed_B": 31.312576512, "train/loss_slope": 1.3274718146405672e-06} {"step": 14940, "timestamp": 1778210680.0799797, "train/loss": 2.240574765205383, "train/z_loss": 0.0016790071269497276, "train/perplexity": 9.39873179929392, "train/grad_norm": 0.1533203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022202.7565136838, "perf/iters_per_sec": 0.9642614157265109, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0370631694793702, "data/tokens_consumed": 31333548032, "data/tokens_consumed_B": 31.333548032, "train/loss_slope": 8.098153212461542e-07} {"step": 14950, "timestamp": 1778210690.9438634, "grad/layer_0/attn": 0.0027994620613753796, "grad/layer_0/mlp": 0.002657060045748949, "grad/layer_0/attn_mlp_ratio": 1.0535937870485395, "grad/layer_4/attn": 0.0016535121249035, "grad/layer_4/mlp": 0.002774797845631838, "grad/layer_4/attn_mlp_ratio": 0.595903614353796, "grad/layer_8/attn": 0.007494654506444931, "grad/layer_8/mlp": 0.004011869430541992, "grad/layer_8/attn_mlp_ratio": 1.868120199171189, "grad/layer_12/attn": 0.0044410633854568005, "grad/layer_12/mlp": 0.005535698961466551, "grad/layer_12/attn_mlp_ratio": 0.8022588179279109, "grad/layer_16/attn": 0.003579107578843832, "grad/layer_16/mlp": 0.004280135966837406, "grad/layer_16/attn_mlp_ratio": 0.8362135041861966, "grad/layer_20/attn": 0.006082925014197826, "grad/layer_20/mlp": 0.005758287850767374, "grad/layer_20/attn_mlp_ratio": 1.0563773583756246, "grad/layer_24/attn": 0.015872467309236526, "grad/layer_24/mlp": 0.014321033842861652, "grad/layer_24/attn_mlp_ratio": 1.108332496980652, "grad/layer_27/attn": 0.009730236604809761, "grad/layer_27/mlp": 0.01231507956981659, "grad/layer_27/attn_mlp_ratio": 0.7901074833204612} {"step": 14950, "timestamp": 1778210690.9610085, "train/loss": 2.308990001678467, "train/z_loss": 0.0016539611387997866, "train/perplexity": 10.064254639735324, "train/grad_norm": 0.1796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1928255.666593302, "perf/iters_per_sec": 0.9194639523474226, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.087590217590332, "data/tokens_consumed": 31354519552, "data/tokens_consumed_B": 31.354519552, "train/loss_slope": 5.25845574288244e-07} {"step": 14960, "timestamp": 1778210701.3368561, "train/loss": 2.266901063919067, "train/z_loss": 0.0016604296513833105, "train/perplexity": 9.649451399277382, "train/grad_norm": 0.1123046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022322.0104059277, "perf/iters_per_sec": 0.9643182804135931, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0370020151138306, "data/tokens_consumed": 31375491072, "data/tokens_consumed_B": 31.375491072, "train/loss_slope": 7.862234916766879e-07} {"step": 14970, "timestamp": 1778210712.124073, "train/loss": 2.274387311935425, "train/z_loss": 0.001652507169637829, "train/perplexity": 9.721960658180434, "train/grad_norm": 0.1357421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1945333.4832070575, "perf/iters_per_sec": 0.9276072898898399, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0780424118041991, "data/tokens_consumed": 31396462592, "data/tokens_consumed_B": 31.396462592, "train/loss_slope": 1.9504229728430675e-06} {"step": 14980, "timestamp": 1778210722.504036, "train/loss": 2.309051775932312, "train/z_loss": 0.001659610739443451, "train/perplexity": 10.064876370759487, "train/grad_norm": 0.1259765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021489.4796116925, "perf/iters_per_sec": 0.9639212987955534, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0374290943145752, "data/tokens_consumed": 31417434112, "data/tokens_consumed_B": 31.417434112, "train/loss_slope": 3.947763209796455e-06} {"step": 14990, "timestamp": 1778210732.8811538, "train/loss": 2.2620848655700683, "train/z_loss": 0.0016616194625385106, "train/perplexity": 9.603089461141082, "train/grad_norm": 0.1669921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022085.2366138843, "perf/iters_per_sec": 0.9642053778714582, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037123441696167, "data/tokens_consumed": 31438405632, "data/tokens_consumed_B": 31.438405632, "train/loss_slope": 4.3244825767175895e-06} {"step": 15000, "timestamp": 1778210743.2438273, "grad/layer_0/attn": 0.002631179988384247, "grad/layer_0/mlp": 0.0026296614669263363, "grad/layer_0/attn_mlp_ratio": 1.0005774208654863, "grad/layer_4/attn": 0.0029383350629359484, "grad/layer_4/mlp": 0.002479276619851589, "grad/layer_4/attn_mlp_ratio": 1.1851581710942853, "grad/layer_8/attn": 0.0045769307762384415, "grad/layer_8/mlp": 0.0037150303833186626, "grad/layer_8/attn_mlp_ratio": 1.232003558729843, "grad/layer_12/attn": 0.0035392798017710447, "grad/layer_12/mlp": 0.005677368491888046, "grad/layer_12/attn_mlp_ratio": 0.6234014481335682, "grad/layer_16/attn": 0.005143357440829277, "grad/layer_16/mlp": 0.004572966601699591, "grad/layer_16/attn_mlp_ratio": 1.1247310064422062, "grad/layer_20/attn": 0.004244760610163212, "grad/layer_20/mlp": 0.006156055256724358, "grad/layer_20/attn_mlp_ratio": 0.6895260624202465, "grad/layer_24/attn": 0.010446205735206604, "grad/layer_24/mlp": 0.00963503960520029, "grad/layer_24/attn_mlp_ratio": 1.0841891735608007, "grad/layer_27/attn": 0.005662884563207626, "grad/layer_27/mlp": 0.009219346567988396, "grad/layer_27/attn_mlp_ratio": 0.614239247871046} {"step": 15000, "timestamp": 1778210743.8525724, "eos/sharpness": 53.311419486999505, "eos/L0_probe": 2.0855534076690674, "eos/L_plus": 2.4404964447021484, "eos/L_minus": 2.2637245655059814, "eos/grad_norm": 0.16994668543338776, "eos/embed_grad_frac": 0.07936190813779831, "eos/time_s": 0.6058378219604492} {"step": 15000, "timestamp": 1778210743.8726845, "train/loss": 2.3125017166137694, "train/z_loss": 0.001638400147203356, "train/perplexity": 10.099659562679843, "train/grad_norm": 0.169921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1909007.7108919567, "perf/iters_per_sec": 0.9102858118495735, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0985560655593871, "data/tokens_consumed": 31459377152, "data/tokens_consumed_B": 31.459377152, "train/loss_slope": 6.736795822850842e-06} {"step": 15000, "timestamp": 1778210745.2350328, "geo/rankme_last": 440.5652160644531, "geo/layer_0/stable_rank_q_proj": 15.902997016906738, "geo/layer_0/stable_rank_k_proj": 14.225092887878418, "geo/layer_0/stable_rank_o_proj": 53.95205307006836, "geo/layer_0/stable_rank_gate_proj": 157.06805419921875, "geo/layer_0/stable_rank_down_proj": 48.65999984741211, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.04212965816259384, "geo/layer_0/attn_entropy_mean": 6.303579330444336, "geo/layer_0/attn_entropy_std": 0.2993644177913666, "geo/layer_7/stable_rank_q_proj": 43.314273834228516, "geo/layer_7/stable_rank_k_proj": 42.6937141418457, "geo/layer_7/stable_rank_o_proj": 110.20720672607422, "geo/layer_7/stable_rank_gate_proj": 112.66613006591797, "geo/layer_7/stable_rank_down_proj": 157.88526916503906, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5806747078895569, "geo/layer_7/attn_entropy_mean": 4.675711631774902, "geo/layer_7/attn_entropy_std": 0.8523639440536499, "geo/layer_14/stable_rank_q_proj": 59.25539779663086, "geo/layer_14/stable_rank_k_proj": 36.781402587890625, "geo/layer_14/stable_rank_o_proj": 51.280818939208984, "geo/layer_14/stable_rank_gate_proj": 96.48016357421875, "geo/layer_14/stable_rank_down_proj": 136.65052795410156, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3996956944465637, "geo/layer_14/attn_entropy_mean": 5.571104049682617, "geo/layer_14/attn_entropy_std": 0.5346869826316833, "geo/layer_21/stable_rank_q_proj": 50.0081901550293, "geo/layer_21/stable_rank_k_proj": 32.284976959228516, "geo/layer_21/stable_rank_o_proj": 87.5487060546875, "geo/layer_21/stable_rank_gate_proj": 95.80108642578125, "geo/layer_21/stable_rank_down_proj": 65.00833892822266, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15576374530792236, "geo/layer_21/attn_entropy_mean": 5.762539863586426, "geo/layer_21/attn_entropy_std": 0.28982067108154297, "geo/layer_27/stable_rank_q_proj": 42.08524703979492, "geo/layer_27/stable_rank_k_proj": 32.38420867919922, "geo/layer_27/stable_rank_o_proj": 115.90220642089844, "geo/layer_27/stable_rank_gate_proj": 93.38447570800781, "geo/layer_27/stable_rank_down_proj": 144.4372100830078, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07492709159851074, "geo/layer_27/attn_entropy_mean": 4.448479652404785, "geo/layer_27/attn_entropy_std": 0.5045970678329468, "attnres/final_alpha/block_0": 0.2434728741645813, "attnres/block_norm/0": 1.5350909233093262, "attnres/final_alpha/block_1": 0.007786382921040058, "attnres/block_norm/1": 23762.099609375, "attnres/final_alpha/block_2": 0.016248561441898346, "attnres/block_norm/2": 17576.279296875, "attnres/final_alpha/block_3": 0.018266212195158005, "attnres/block_norm/3": 22160.421875, "attnres/final_alpha/block_4": 0.024337807670235634, "attnres/block_norm/4": 7500.26611328125, "attnres/final_alpha/block_5": 0.5450330972671509, "attnres/block_norm/5": 4487.9833984375, "attnres/final_alpha/block_6": 0.14485503733158112, "attnres/block_norm/6": 15268.78515625, "geo/tier1_time_s": 1.3587884902954102, "geo/step": 15000.0, "geo/rankme_slope": 0.00018376157103466387} {"step": 15000, "timestamp": 1778210752.1185193, "geo/ww_alpha_mean": 8.05113965672555, "geo/ww_alpha_std": 4.878418440877442, "geo/ww_alpha_min": 2.651101070369251, "geo/ww_alpha_max": 38.57070633774516, "geo/ww_alpha_healthy_frac": 0.16751269035532995, "geo/ww_alpha_by_type/q_proj": 4.23107323492832, "geo/ww_alpha_by_type/k_proj": 5.385069572460544, "geo/ww_alpha_by_type/v_proj": 8.312336337703908, "geo/ww_alpha_by_type/o_proj": 7.4691064844170825, "geo/ww_alpha_by_type/gate_proj": 9.642123241927207, "geo/ww_alpha_by_type/up_proj": 12.123671493996552, "geo/ww_alpha_by_type/down_proj": 9.327112815203051, "geo/twonn_id/layer_0": 0.7216001749038696, "geo/twonn_id/layer_7": 3.0063247680664062, "geo/twonn_id/layer_14": 3.950178861618042, "geo/twonn_id/layer_21": 7.62211799621582, "geo/twonn_id/layer_27": 5.163618564605713, "geo/tier2_time_s": 6.877178430557251} {"step": 15000, "timestamp": 1778210752.7857478, "eoc/jacobian_sigma/layer_0/attn": 702.7047729492188, "eoc/jacobian_sigma/layer_0/mlp": 4409.2939453125, "eoc/jacobian_sigma/layer_0": 4409.2939453125, "eoc/jacobian_sigma/layer_7/attn": 1.139574646949768, "eoc/jacobian_sigma/layer_7/mlp": 1.6739165782928467, "eoc/jacobian_sigma/layer_7": 1.6739165782928467, "eoc/jacobian_sigma/layer_14/attn": 1.4558515548706055, "eoc/jacobian_sigma/layer_14/mlp": 5.803526401519775, "eoc/jacobian_sigma/layer_14": 5.803526401519775, "eoc/jacobian_sigma/layer_21/attn": 1.0844355821609497, "eoc/jacobian_sigma/layer_21/mlp": 4.04144287109375, "eoc/jacobian_sigma/layer_21": 4.04144287109375, "eoc/jacobian_sigma/layer_27/attn": 2.211224317550659, "eoc/jacobian_sigma/layer_27/mlp": 18.713497161865234, "eoc/jacobian_sigma/layer_27": 18.713497161865234, "eoc/layer0_sigma": 4409.2939453125, "eoc/sigma_max": 18.713497161865234, "eoc/sigma_min": 1.6739165782928467, "eoc/sigma_mean": 7.558095753192902, "eoc/time_s": 0.6605565547943115} {"step": 15010, "timestamp": 1778210763.1807597, "train/loss": 2.2176296710968018, "train/z_loss": 0.001669625286012888, "train/perplexity": 9.185532308491048, "train/grad_norm": 0.1357421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1086358.2868083746, "perf/iters_per_sec": 0.5180159982721207, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.9304423093795777, "data/tokens_consumed": 31480348672, "data/tokens_consumed_B": 31.480348672, "train/loss_slope": 3.390381402737562e-06} {"step": 15020, "timestamp": 1778210773.5585284, "train/loss": 2.2844850778579713, "train/z_loss": 0.0016486189677380025, "train/perplexity": 9.820628063011306, "train/grad_norm": 0.23046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021961.4557003067, "perf/iters_per_sec": 0.9641463545323881, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0371869325637817, "data/tokens_consumed": 31501320192, "data/tokens_consumed_B": 31.501320192, "train/loss_slope": 4.525969996310664e-06} {"step": 15030, "timestamp": 1778210783.9340198, "train/loss": 2.2382218837738037, "train/z_loss": 0.0016630861908197403, "train/perplexity": 9.376643693300263, "train/grad_norm": 0.2080078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022209.2186448753, "perf/iters_per_sec": 0.9642644971107842, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0370598554611206, "data/tokens_consumed": 31522291712, "data/tokens_consumed_B": 31.522291712, "train/loss_slope": 2.8458362293786635e-06} {"step": 15040, "timestamp": 1778210794.309058, "train/loss": 2.2749302864074705, "train/z_loss": 0.001665823080111295, "train/perplexity": 9.727240868015905, "train/grad_norm": 0.10302734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022554.049508408, "perf/iters_per_sec": 0.9644289252798118, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036883044242859, "data/tokens_consumed": 31543263232, "data/tokens_consumed_B": 31.543263232, "train/loss_slope": 1.9763308103613987e-06} {"step": 15050, "timestamp": 1778210804.673565, "grad/layer_0/attn": 0.002972367685288191, "grad/layer_0/mlp": 0.0026908095460385084, "grad/layer_0/attn_mlp_ratio": 1.1046369220744399, "grad/layer_4/attn": 0.001648406032472849, "grad/layer_4/mlp": 0.0025191931053996086, "grad/layer_4/attn_mlp_ratio": 0.6543388688647124, "grad/layer_8/attn": 0.00545697845518589, "grad/layer_8/mlp": 0.00396737502887845, "grad/layer_8/attn_mlp_ratio": 1.375463191132254, "grad/layer_12/attn": 0.00429140729829669, "grad/layer_12/mlp": 0.005681018810719252, "grad/layer_12/attn_mlp_ratio": 0.7553939470610515, "grad/layer_16/attn": 0.004444153513759375, "grad/layer_16/mlp": 0.004693015478551388, "grad/layer_16/attn_mlp_ratio": 0.9469718221415259, "grad/layer_20/attn": 0.004580682143568993, "grad/layer_20/mlp": 0.006332769989967346, "grad/layer_20/attn_mlp_ratio": 0.7233299296347283, "grad/layer_24/attn": 0.016260212287306786, "grad/layer_24/mlp": 0.011876889504492283, "grad/layer_24/attn_mlp_ratio": 1.3690631831044862, "grad/layer_27/attn": 0.007070798892527819, "grad/layer_27/mlp": 0.010821943171322346, "grad/layer_27/attn_mlp_ratio": 0.6533760818415222} {"step": 15050, "timestamp": 1778210804.6904266, "train/loss": 2.264974355697632, "train/z_loss": 0.001666356879286468, "train/perplexity": 9.6308776208057, "train/grad_norm": 0.201171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021176.7315863636, "perf/iters_per_sec": 0.963772168915922, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0375896215438842, "data/tokens_consumed": 31564234752, "data/tokens_consumed_B": 31.564234752, "train/loss_slope": 2.085020268174369e-06} {"step": 15060, "timestamp": 1778210815.0684278, "train/loss": 2.265321135520935, "train/z_loss": 0.0016635090461932123, "train/perplexity": 9.634217993998837, "train/grad_norm": 0.1142578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021792.2401529117, "perf/iters_per_sec": 0.9640656662716445, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0372737407684327, "data/tokens_consumed": 31585206272, "data/tokens_consumed_B": 31.585206272, "train/loss_slope": 3.643944461603854e-06} {"step": 15070, "timestamp": 1778210825.454373, "train/loss": 2.2674844026565553, "train/z_loss": 0.0016593567677773535, "train/perplexity": 9.655081940170732, "train/grad_norm": 0.16796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020474.3884512964, "perf/iters_per_sec": 0.963437265611313, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037950301170349, "data/tokens_consumed": 31606177792, "data/tokens_consumed_B": 31.606177792, "train/loss_slope": 3.597725306836216e-06} {"step": 15075, "timestamp": 1778210831.2572649, "eos/sharpness": 42.33505725860595, "eos/L0_probe": 2.081888198852539, "eos/L_plus": 2.2829079627990723, "eos/L_minus": 2.3042190074920654, "eos/grad_norm": 0.20355834066867828, "eos/embed_grad_frac": 0.06530343741178513, "eos/time_s": 0.6258459091186523} {"step": 15075, "timestamp": 1778210832.6330867, "geo/rankme_last": 440.99652099609375, "geo/layer_0/stable_rank_q_proj": 15.906743049621582, "geo/layer_0/stable_rank_k_proj": 14.24450397491455, "geo/layer_0/stable_rank_o_proj": 53.88783645629883, "geo/layer_0/stable_rank_gate_proj": 157.10406494140625, "geo/layer_0/stable_rank_down_proj": 48.62346649169922, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.03892943263053894, "geo/layer_0/attn_entropy_mean": 6.309395790100098, "geo/layer_0/attn_entropy_std": 0.29759681224823, "geo/layer_7/stable_rank_q_proj": 43.171695709228516, "geo/layer_7/stable_rank_k_proj": 42.478309631347656, "geo/layer_7/stable_rank_o_proj": 110.470458984375, "geo/layer_7/stable_rank_gate_proj": 112.81928253173828, "geo/layer_7/stable_rank_down_proj": 157.2342529296875, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5712977647781372, "geo/layer_7/attn_entropy_mean": 4.681771278381348, "geo/layer_7/attn_entropy_std": 0.8527274131774902, "geo/layer_14/stable_rank_q_proj": 59.21451187133789, "geo/layer_14/stable_rank_k_proj": 36.76185989379883, "geo/layer_14/stable_rank_o_proj": 51.24834060668945, "geo/layer_14/stable_rank_gate_proj": 96.22351837158203, "geo/layer_14/stable_rank_down_proj": 136.97610473632812, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38602572679519653, "geo/layer_14/attn_entropy_mean": 5.578082084655762, "geo/layer_14/attn_entropy_std": 0.5401694774627686, "geo/layer_21/stable_rank_q_proj": 50.151519775390625, "geo/layer_21/stable_rank_k_proj": 32.141136169433594, "geo/layer_21/stable_rank_o_proj": 87.38745880126953, "geo/layer_21/stable_rank_gate_proj": 95.33720397949219, "geo/layer_21/stable_rank_down_proj": 64.99388122558594, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15705092251300812, "geo/layer_21/attn_entropy_mean": 5.7498579025268555, "geo/layer_21/attn_entropy_std": 0.31166645884513855, "geo/layer_27/stable_rank_q_proj": 42.15188980102539, "geo/layer_27/stable_rank_k_proj": 32.424583435058594, "geo/layer_27/stable_rank_o_proj": 115.87490844726562, "geo/layer_27/stable_rank_gate_proj": 93.33926391601562, "geo/layer_27/stable_rank_down_proj": 144.81358337402344, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.06745606660842896, "geo/layer_27/attn_entropy_mean": 4.459529876708984, "geo/layer_27/attn_entropy_std": 0.5130777359008789, "attnres/final_alpha/block_0": 0.24643650650978088, "attnres/block_norm/0": 1.5365302562713623, "attnres/final_alpha/block_1": 0.00790569931268692, "attnres/block_norm/1": 23888.486328125, "attnres/final_alpha/block_2": 0.016582217067480087, "attnres/block_norm/2": 17547.630859375, "attnres/final_alpha/block_3": 0.018525343388319016, "attnres/block_norm/3": 22366.5703125, "attnres/final_alpha/block_4": 0.02469688281416893, "attnres/block_norm/4": 7603.3154296875, "attnres/final_alpha/block_5": 0.53627610206604, "attnres/block_norm/5": 4568.2802734375, "attnres/final_alpha/block_6": 0.14957721531391144, "attnres/block_norm/6": 15237.837890625, "geo/tier1_time_s": 1.3557703495025635, "geo/step": 15075.0, "geo/rankme_slope": 0.00017487528214410765} {"step": 15080, "timestamp": 1778210837.826936, "train/loss": 2.248929238319397, "train/z_loss": 0.0016646926174871623, "train/perplexity": 9.477582169438122, "train/grad_norm": 0.1455078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1695742.8133263309, "perf/iters_per_sec": 0.8085931841498999, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.236715841293335, "data/tokens_consumed": 31627149312, "data/tokens_consumed_B": 31.627149312, "train/loss_slope": 7.069955600334116e-07} {"step": 15090, "timestamp": 1778210848.2097719, "train/loss": 2.243094968795776, "train/z_loss": 0.0016613745712675155, "train/perplexity": 9.422448389684922, "train/grad_norm": 0.09130859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020921.931850436, "perf/iters_per_sec": 0.9636506709339313, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0377204418182373, "data/tokens_consumed": 31648120832, "data/tokens_consumed_B": 31.648120832, "train/loss_slope": 2.6251974100111724e-06} {"step": 15100, "timestamp": 1778210858.5795443, "grad/layer_0/attn": 0.0024509418290108442, "grad/layer_0/mlp": 0.002559087472036481, "grad/layer_0/attn_mlp_ratio": 0.957740507121616, "grad/layer_4/attn": 0.0018252775771543384, "grad/layer_4/mlp": 0.0023671698290854692, "grad/layer_4/attn_mlp_ratio": 0.7710800795190546, "grad/layer_8/attn": 0.004656777251511812, "grad/layer_8/mlp": 0.003716961480677128, "grad/layer_8/attn_mlp_ratio": 1.252845139890705, "grad/layer_12/attn": 0.003799848724156618, "grad/layer_12/mlp": 0.0061590890400111675, "grad/layer_12/attn_mlp_ratio": 0.6169497855570456, "grad/layer_16/attn": 0.004208831116557121, "grad/layer_16/mlp": 0.0043773045763373375, "grad/layer_16/attn_mlp_ratio": 0.9615120325777313, "grad/layer_20/attn": 0.004891498945653439, "grad/layer_20/mlp": 0.005766657646745443, "grad/layer_20/attn_mlp_ratio": 0.8482381234457824, "grad/layer_24/attn": 0.008661182597279549, "grad/layer_24/mlp": 0.008920704014599323, "grad/layer_24/attn_mlp_ratio": 0.9709079559207607, "grad/layer_27/attn": 0.00838091317564249, "grad/layer_27/mlp": 0.007528706919401884, "grad/layer_27/attn_mlp_ratio": 1.1131942249903504} {"step": 15100, "timestamp": 1778210858.5967984, "train/loss": 2.2144243001937864, "train/z_loss": 0.001675055839587003, "train/perplexity": 9.156136408051445, "train/grad_norm": 0.11865234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020029.2699531587, "perf/iters_per_sec": 0.9632250165715974, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0381790161132813, "data/tokens_consumed": 31669092352, "data/tokens_consumed_B": 31.669092352, "train/loss_slope": -1.0041857853045228e-06} {"step": 15110, "timestamp": 1778210868.9752266, "train/loss": 2.236876606941223, "train/z_loss": 0.001665481098461896, "train/perplexity": 9.364037992751875, "train/grad_norm": 0.10595703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021752.601055819, "perf/iters_per_sec": 0.9640467648772331, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03729407787323, "data/tokens_consumed": 31690063872, "data/tokens_consumed_B": 31.690063872, "train/loss_slope": -4.291103575060182e-06} {"step": 15120, "timestamp": 1778210879.3607376, "train/loss": 2.277091407775879, "train/z_loss": 0.0016506880871020257, "train/perplexity": 9.748285347758266, "train/grad_norm": 0.1943359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020773.5024277398, "perf/iters_per_sec": 0.9635798942698192, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0377966642379761, "data/tokens_consumed": 31711035392, "data/tokens_consumed_B": 31.711035392, "train/loss_slope": -2.849975098656633e-06} {"step": 15130, "timestamp": 1778210889.7413647, "train/loss": 2.236574721336365, "train/z_loss": 0.0016577988513745367, "train/perplexity": 9.361211551131003, "train/grad_norm": 0.154296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021315.7910151626, "perf/iters_per_sec": 0.9638384776187718, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0375182390213014, "data/tokens_consumed": 31732006912, "data/tokens_consumed_B": 31.732006912, "train/loss_slope": -2.5932888064769885e-06} {"step": 15140, "timestamp": 1778210900.125071, "train/loss": 2.2407165288925173, "train/z_loss": 0.001665881252847612, "train/perplexity": 9.400064292615523, "train/grad_norm": 0.1376953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020683.2111780918, "perf/iters_per_sec": 0.9635368400469264, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0378430366516114, "data/tokens_consumed": 31752978432, "data/tokens_consumed_B": 31.752978432, "train/loss_slope": -3.4218780063775076e-06} {"step": 15150, "timestamp": 1778210910.496588, "grad/layer_0/attn": 0.0029533072374761105, "grad/layer_0/mlp": 0.003006510902196169, "grad/layer_0/attn_mlp_ratio": 0.9823038183857652, "grad/layer_4/attn": 0.0017056027427315712, "grad/layer_4/mlp": 0.0028058115858584642, "grad/layer_4/attn_mlp_ratio": 0.6078821152994542, "grad/layer_8/attn": 0.009386618621647358, "grad/layer_8/mlp": 0.004158355295658112, "grad/layer_8/attn_mlp_ratio": 2.257291099132187, "grad/layer_12/attn": 0.003902990370988846, "grad/layer_12/mlp": 0.005940398201346397, "grad/layer_12/attn_mlp_ratio": 0.6570250298038482, "grad/layer_16/attn": 0.005194922909140587, "grad/layer_16/mlp": 0.004969262983649969, "grad/layer_16/attn_mlp_ratio": 1.0454111246862923, "grad/layer_20/attn": 0.007676769979298115, "grad/layer_20/mlp": 0.007239439059048891, "grad/layer_20/attn_mlp_ratio": 1.0604094890006204, "grad/layer_24/attn": 0.018060743808746338, "grad/layer_24/mlp": 0.01365474984049797, "grad/layer_24/attn_mlp_ratio": 1.322671150145404, "grad/layer_27/attn": 0.008157984353601933, "grad/layer_27/mlp": 0.011570424772799015, "grad/layer_27/attn_mlp_ratio": 0.7050721510478487} {"step": 15150, "timestamp": 1778210911.1126742, "eos/sharpness": 33.47744941711425, "eos/L0_probe": 2.0888280868530273, "eos/L_plus": 2.272174596786499, "eos/L_minus": 2.2402560710906982, "eos/grad_norm": 0.18034668266773224, "eos/embed_grad_frac": 0.09547635912895203, "eos/time_s": 0.6133029460906982} {"step": 15150, "timestamp": 1778210911.1350703, "train/loss": 2.195388388633728, "train/z_loss": 0.0016763432999141515, "train/perplexity": 8.98348946454581, "train/grad_norm": 0.1806640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1905819.3766021545, "perf/iters_per_sec": 0.9087654955874226, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1003938913345337, "data/tokens_consumed": 31773949952, "data/tokens_consumed_B": 31.773949952, "train/loss_slope": -7.74927636434684e-06} {"step": 15150, "timestamp": 1778210912.4953353, "geo/rankme_last": 440.9624328613281, "geo/layer_0/stable_rank_q_proj": 15.931042671203613, "geo/layer_0/stable_rank_k_proj": 14.248930931091309, "geo/layer_0/stable_rank_o_proj": 53.91976547241211, "geo/layer_0/stable_rank_gate_proj": 157.38284301757812, "geo/layer_0/stable_rank_down_proj": 48.65538787841797, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.03907516226172447, "geo/layer_0/attn_entropy_mean": 6.305210590362549, "geo/layer_0/attn_entropy_std": 0.30288127064704895, "geo/layer_7/stable_rank_q_proj": 43.130855560302734, "geo/layer_7/stable_rank_k_proj": 42.47408676147461, "geo/layer_7/stable_rank_o_proj": 110.3336181640625, "geo/layer_7/stable_rank_gate_proj": 112.73391723632812, "geo/layer_7/stable_rank_down_proj": 156.85337829589844, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5732854008674622, "geo/layer_7/attn_entropy_mean": 4.657810211181641, "geo/layer_7/attn_entropy_std": 0.8657927513122559, "geo/layer_14/stable_rank_q_proj": 59.08435821533203, "geo/layer_14/stable_rank_k_proj": 36.83481216430664, "geo/layer_14/stable_rank_o_proj": 51.2517204284668, "geo/layer_14/stable_rank_gate_proj": 96.19098663330078, "geo/layer_14/stable_rank_down_proj": 137.1373748779297, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3844188153743744, "geo/layer_14/attn_entropy_mean": 5.539829254150391, "geo/layer_14/attn_entropy_std": 0.5606820583343506, "geo/layer_21/stable_rank_q_proj": 50.224327087402344, "geo/layer_21/stable_rank_k_proj": 32.178829193115234, "geo/layer_21/stable_rank_o_proj": 87.65125274658203, "geo/layer_21/stable_rank_gate_proj": 95.07527923583984, "geo/layer_21/stable_rank_down_proj": 64.96009063720703, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15252189338207245, "geo/layer_21/attn_entropy_mean": 5.775463581085205, "geo/layer_21/attn_entropy_std": 0.29378587007522583, "geo/layer_27/stable_rank_q_proj": 42.141517639160156, "geo/layer_27/stable_rank_k_proj": 32.451942443847656, "geo/layer_27/stable_rank_o_proj": 116.22509765625, "geo/layer_27/stable_rank_gate_proj": 93.5768814086914, "geo/layer_27/stable_rank_down_proj": 144.65411376953125, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.06819559633731842, "geo/layer_27/attn_entropy_mean": 4.459786415100098, "geo/layer_27/attn_entropy_std": 0.5042637586593628, "attnres/final_alpha/block_0": 0.24532654881477356, "attnres/block_norm/0": 1.5378272533416748, "attnres/final_alpha/block_1": 0.007693881168961525, "attnres/block_norm/1": 23976.751953125, "attnres/final_alpha/block_2": 0.015994451940059662, "attnres/block_norm/2": 17654.40234375, "attnres/final_alpha/block_3": 0.0182877816259861, "attnres/block_norm/3": 22405.5546875, "attnres/final_alpha/block_4": 0.02452034503221512, "attnres/block_norm/4": 7630.37548828125, "attnres/final_alpha/block_5": 0.5438927412033081, "attnres/block_norm/5": 4478.4365234375, "attnres/final_alpha/block_6": 0.1442842185497284, "attnres/block_norm/6": 15435.6484375, "geo/tier1_time_s": 1.3561556339263916, "geo/step": 15150.0, "geo/rankme_slope": 0.00018562204178546417} {"step": 15160, "timestamp": 1778210922.8793137, "train/loss": 2.27480731010437, "train/z_loss": 0.0016657266882248224, "train/perplexity": 9.726044721444937, "train/grad_norm": 0.130859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1786316.3445888013, "perf/iters_per_sec": 0.8517820094055182, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1740092992782594, "data/tokens_consumed": 31794921472, "data/tokens_consumed_B": 31.794921472, "train/loss_slope": -7.68466929290667e-06} {"step": 15170, "timestamp": 1778210933.2696567, "train/loss": 2.245089530944824, "train/z_loss": 0.0016586860758252442, "train/perplexity": 9.441260803621981, "train/grad_norm": 0.1572265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019548.5971375613, "perf/iters_per_sec": 0.9629958139121825, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.038426113128662, "data/tokens_consumed": 31815892992, "data/tokens_consumed_B": 31.815892992, "train/loss_slope": -6.995060956767869e-06} {"step": 15180, "timestamp": 1778210943.6475768, "train/loss": 2.252078723907471, "train/z_loss": 0.0016690041753463446, "train/perplexity": 9.507478732575276, "train/grad_norm": 0.091796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021920.4156561003, "perf/iters_per_sec": 0.9641267851143361, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0372079849243163, "data/tokens_consumed": 31836864512, "data/tokens_consumed_B": 31.836864512, "train/loss_slope": -6.619314706758331e-06} {"step": 15190, "timestamp": 1778210954.0195754, "train/loss": 2.275557804107666, "train/z_loss": 0.001649873168207705, "train/perplexity": 9.7333467994243, "train/grad_norm": 0.1552734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023508.4339468705, "perf/iters_per_sec": 0.9648840112432816, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0363940000534058, "data/tokens_consumed": 31857836032, "data/tokens_consumed_B": 31.857836032, "train/loss_slope": -4.4954080106688005e-06} {"step": 15200, "timestamp": 1778210964.3867538, "grad/layer_0/attn": 0.0033215051516890526, "grad/layer_0/mlp": 0.0030113658867776394, "grad/layer_0/attn_mlp_ratio": 1.1029895290951608, "grad/layer_4/attn": 0.001415494829416275, "grad/layer_4/mlp": 0.0026253885589540005, "grad/layer_4/attn_mlp_ratio": 0.5391562977118337, "grad/layer_8/attn": 0.0050094248726964, "grad/layer_8/mlp": 0.004029945004731417, "grad/layer_8/attn_mlp_ratio": 1.2430503995736841, "grad/layer_12/attn": 0.005221936851739883, "grad/layer_12/mlp": 0.0064727445133030415, "grad/layer_12/attn_mlp_ratio": 0.8067577455485502, "grad/layer_16/attn": 0.007279196288436651, "grad/layer_16/mlp": 0.005387279205024242, "grad/layer_16/attn_mlp_ratio": 1.3511822714756936, "grad/layer_20/attn": 0.008401465602219105, "grad/layer_20/mlp": 0.007608234882354736, "grad/layer_20/attn_mlp_ratio": 1.1042594795907401, "grad/layer_24/attn": 0.020285096019506454, "grad/layer_24/mlp": 0.013391640968620777, "grad/layer_24/attn_mlp_ratio": 1.5147580431376992, "grad/layer_27/attn": 0.006093991920351982, "grad/layer_27/mlp": 0.012158614583313465, "grad/layer_27/attn_mlp_ratio": 0.5012077509714493} {"step": 15200, "timestamp": 1778210964.4038486, "train/loss": 2.2867937326431274, "train/z_loss": 0.0016616768087260425, "train/perplexity": 9.843326694552616, "train/grad_norm": 0.251953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020958.6595423496, "perf/iters_per_sec": 0.9636681840621708, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0377015829086305, "data/tokens_consumed": 31878807552, "data/tokens_consumed_B": 31.878807552, "train/loss_slope": -7.51172269460092e-07} {"step": 15210, "timestamp": 1778210974.7803524, "train/loss": 2.2535870552062987, "train/z_loss": 0.0016601813840679824, "train/perplexity": 9.521829980818177, "train/grad_norm": 0.11669921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022212.3799902867, "perf/iters_per_sec": 0.9642660045577462, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0370582342147827, "data/tokens_consumed": 31899779072, "data/tokens_consumed_B": 31.899779072, "train/loss_slope": -3.2762274025368814e-06} {"step": 15220, "timestamp": 1778210985.1549382, "train/loss": 2.2530083894729613, "train/z_loss": 0.0016616279142908752, "train/perplexity": 9.516321617993729, "train/grad_norm": 0.12890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022500.4293302689, "perf/iters_per_sec": 0.9644033571864457, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0369105339050293, "data/tokens_consumed": 31920750592, "data/tokens_consumed_B": 31.920750592, "train/loss_slope": -4.827516078233684e-06} {"step": 15225, "timestamp": 1778210990.9374235, "eos/sharpness": 16.725158691406246, "eos/L0_probe": 2.0863163471221924, "eos/L_plus": 2.2060437202453613, "eos/L_minus": 2.133840560913086, "eos/grad_norm": 0.11652055382728577, "eos/embed_grad_frac": 0.19444049894809723, "eos/time_s": 0.6064577102661133} {"step": 15225, "timestamp": 1778210992.315642, "geo/rankme_last": 440.4276123046875, "geo/layer_0/stable_rank_q_proj": 15.945086479187012, "geo/layer_0/stable_rank_k_proj": 14.27929401397705, "geo/layer_0/stable_rank_o_proj": 54.05851745605469, "geo/layer_0/stable_rank_gate_proj": 157.3467254638672, "geo/layer_0/stable_rank_down_proj": 48.69912338256836, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.04027266800403595, "geo/layer_0/attn_entropy_mean": 6.305926322937012, "geo/layer_0/attn_entropy_std": 0.2985920310020447, "geo/layer_7/stable_rank_q_proj": 43.00141525268555, "geo/layer_7/stable_rank_k_proj": 42.582672119140625, "geo/layer_7/stable_rank_o_proj": 110.55115509033203, "geo/layer_7/stable_rank_gate_proj": 112.28446197509766, "geo/layer_7/stable_rank_down_proj": 157.00384521484375, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5665357708930969, "geo/layer_7/attn_entropy_mean": 4.681021213531494, "geo/layer_7/attn_entropy_std": 0.8750627636909485, "geo/layer_14/stable_rank_q_proj": 59.1301383972168, "geo/layer_14/stable_rank_k_proj": 36.84779739379883, "geo/layer_14/stable_rank_o_proj": 51.25330352783203, "geo/layer_14/stable_rank_gate_proj": 95.93727111816406, "geo/layer_14/stable_rank_down_proj": 137.150146484375, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37760281562805176, "geo/layer_14/attn_entropy_mean": 5.55432653427124, "geo/layer_14/attn_entropy_std": 0.5322307348251343, "geo/layer_21/stable_rank_q_proj": 49.97090148925781, "geo/layer_21/stable_rank_k_proj": 32.28540802001953, "geo/layer_21/stable_rank_o_proj": 87.61798095703125, "geo/layer_21/stable_rank_gate_proj": 94.97817993164062, "geo/layer_21/stable_rank_down_proj": 64.80231475830078, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15722160041332245, "geo/layer_21/attn_entropy_mean": 5.768874168395996, "geo/layer_21/attn_entropy_std": 0.305337518453598, "geo/layer_27/stable_rank_q_proj": 42.08865737915039, "geo/layer_27/stable_rank_k_proj": 32.378055572509766, "geo/layer_27/stable_rank_o_proj": 116.12252044677734, "geo/layer_27/stable_rank_gate_proj": 93.55218505859375, "geo/layer_27/stable_rank_down_proj": 144.51699829101562, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.06842868030071259, "geo/layer_27/attn_entropy_mean": 4.464353561401367, "geo/layer_27/attn_entropy_std": 0.5023024678230286, "attnres/final_alpha/block_0": 0.24581055343151093, "attnres/block_norm/0": 1.5390491485595703, "attnres/final_alpha/block_1": 0.007741338107734919, "attnres/block_norm/1": 24039.81640625, "attnres/final_alpha/block_2": 0.01618017815053463, "attnres/block_norm/2": 17619.060546875, "attnres/final_alpha/block_3": 0.018476665019989014, "attnres/block_norm/3": 22345.009765625, "attnres/final_alpha/block_4": 0.02448650263249874, "attnres/block_norm/4": 7628.3876953125, "attnres/final_alpha/block_5": 0.5400103330612183, "attnres/block_norm/5": 4557.80859375, "attnres/final_alpha/block_6": 0.14729446172714233, "attnres/block_norm/6": 15604.552734375, "geo/tier1_time_s": 1.3588001728057861, "geo/step": 15225.0, "geo/rankme_slope": 0.00015368235184698878} {"step": 15230, "timestamp": 1778210997.5047672, "train/loss": 2.2692892789840697, "train/z_loss": 0.0016625727759674192, "train/perplexity": 9.672523904564198, "train/grad_norm": 0.11865234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1698882.3886759987, "perf/iters_per_sec": 0.8100902503376001, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2344303607940674, "data/tokens_consumed": 31941722112, "data/tokens_consumed_B": 31.941722112, "train/loss_slope": -6.799956316566003e-06} {"step": 15240, "timestamp": 1778211007.8870673, "train/loss": 2.2481304168701173, "train/z_loss": 0.0016611713799647987, "train/perplexity": 9.470014296605866, "train/grad_norm": 0.28125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021352.6723771498, "perf/iters_per_sec": 0.9638560640226125, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0374993085861206, "data/tokens_consumed": 31962693632, "data/tokens_consumed_B": 31.962693632, "train/loss_slope": -1.047668731001598e-05} {"step": 15250, "timestamp": 1778211018.265279, "grad/layer_0/attn": 0.003378356574103236, "grad/layer_0/mlp": 0.002907720860093832, "grad/layer_0/attn_mlp_ratio": 1.1618572141097805, "grad/layer_4/attn": 0.0018875926034525037, "grad/layer_4/mlp": 0.0027783415280282497, "grad/layer_4/attn_mlp_ratio": 0.6793954294210038, "grad/layer_8/attn": 0.010379529558122158, "grad/layer_8/mlp": 0.003955532796680927, "grad/layer_8/attn_mlp_ratio": 2.624053402976773, "grad/layer_12/attn": 0.004508885554969311, "grad/layer_12/mlp": 0.006717436946928501, "grad/layer_12/attn_mlp_ratio": 0.6712211105917197, "grad/layer_16/attn": 0.006617855280637741, "grad/layer_16/mlp": 0.005086236167699099, "grad/layer_16/attn_mlp_ratio": 1.3011301348042792, "grad/layer_20/attn": 0.0045384857803583145, "grad/layer_20/mlp": 0.006657428108155727, "grad/layer_20/attn_mlp_ratio": 0.6817175699767084, "grad/layer_24/attn": 0.024308063089847565, "grad/layer_24/mlp": 0.015971537679433823, "grad/layer_24/attn_mlp_ratio": 1.5219613430804697, "grad/layer_27/attn": 0.0069190640933811665, "grad/layer_27/mlp": 0.015006150119006634, "grad/layer_27/attn_mlp_ratio": 0.46108188925215154} {"step": 15250, "timestamp": 1778211018.28247, "train/loss": 2.2913034915924073, "train/z_loss": 0.0016504876781255006, "train/perplexity": 9.887817972267982, "train/grad_norm": 0.2177734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2018416.3218230284, "perf/iters_per_sec": 0.962455902968897, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0390086412429809, "data/tokens_consumed": 31983665152, "data/tokens_consumed_B": 31.983665152, "train/loss_slope": -9.074787381100968e-06} {"step": 15260, "timestamp": 1778211028.6570258, "train/loss": 2.2690234422683715, "train/z_loss": 0.00165287897689268, "train/perplexity": 9.669952934321179, "train/grad_norm": 0.10302734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022845.7782010776, "perf/iters_per_sec": 0.9645680323605907, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0367335081100464, "data/tokens_consumed": 32004636672, "data/tokens_consumed_B": 32.004636672, "train/loss_slope": -8.729131530077332e-06} {"step": 15270, "timestamp": 1778211039.0438888, "train/loss": 2.256464409828186, "train/z_loss": 0.001660190336406231, "train/perplexity": 9.54926711657728, "train/grad_norm": 0.138671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020034.4656601448, "perf/iters_per_sec": 0.9632274940777515, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0381763458251954, "data/tokens_consumed": 32025608192, "data/tokens_consumed_B": 32.025608192, "train/loss_slope": -1.1978788780729771e-05} {"step": 15280, "timestamp": 1778211049.4281766, "train/loss": 2.2843093156814573, "train/z_loss": 0.0016434235265478491, "train/perplexity": 9.818902119730433, "train/grad_norm": 0.1552734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020641.2482229986, "perf/iters_per_sec": 0.96351683055067, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037864589691162, "data/tokens_consumed": 32046579712, "data/tokens_consumed_B": 32.046579712, "train/loss_slope": -1.1154655540379367e-05} {"step": 15290, "timestamp": 1778211059.809992, "train/loss": 2.272768521308899, "train/z_loss": 0.0016616545617580414, "train/perplexity": 9.706235570641613, "train/grad_norm": 0.251953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021389.6015376737, "perf/iters_per_sec": 0.9638736732185715, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037480354309082, "data/tokens_consumed": 32067551232, "data/tokens_consumed_B": 32.067551232, "train/loss_slope": -1.1760435360457314e-05} {"step": 15300, "timestamp": 1778211070.1905377, "grad/layer_0/attn": 0.002574395854026079, "grad/layer_0/mlp": 0.002635383512824774, "grad/layer_0/attn_mlp_ratio": 0.9768581095739136, "grad/layer_4/attn": 0.001800814876332879, "grad/layer_4/mlp": 0.0024911726359277964, "grad/layer_4/attn_mlp_ratio": 0.7228783658240363, "grad/layer_8/attn": 0.0043865772895514965, "grad/layer_8/mlp": 0.0037306235171854496, "grad/layer_8/attn_mlp_ratio": 1.1758294965335925, "grad/layer_12/attn": 0.005644341930747032, "grad/layer_12/mlp": 0.006201766896992922, "grad/layer_12/attn_mlp_ratio": 0.910118347478038, "grad/layer_16/attn": 0.004269983619451523, "grad/layer_16/mlp": 0.004512899089604616, "grad/layer_16/attn_mlp_ratio": 0.9461730563996101, "grad/layer_20/attn": 0.004448131192475557, "grad/layer_20/mlp": 0.00656748004257679, "grad/layer_20/attn_mlp_ratio": 0.6772964814371417, "grad/layer_24/attn": 0.01869671419262886, "grad/layer_24/mlp": 0.01381352823227644, "grad/layer_24/attn_mlp_ratio": 1.3535074995243943, "grad/layer_27/attn": 0.0058102174662053585, "grad/layer_27/mlp": 0.012010333128273487, "grad/layer_27/attn_mlp_ratio": 0.48376821490077765} {"step": 15300, "timestamp": 1778211070.81331, "eos/sharpness": 31.856155395507805, "eos/L0_probe": 2.0874037742614746, "eos/L_plus": 2.256927251815796, "eos/L_minus": 2.2364418506622314, "eos/grad_norm": 0.18900276720523834, "eos/embed_grad_frac": 0.08221789449453354, "eos/time_s": 0.6197216510772705} {"step": 15300, "timestamp": 1778211070.833335, "train/loss": 2.253144073486328, "train/z_loss": 0.0016484676161780953, "train/perplexity": 9.517612918305769, "train/grad_norm": 0.189453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1903698.467006101, "perf/iters_per_sec": 0.9077541670828347, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.101619839668274, "data/tokens_consumed": 32088522752, "data/tokens_consumed_B": 32.088522752, "train/loss_slope": -1.534858204171785e-05} {"step": 15300, "timestamp": 1778211072.1977375, "geo/rankme_last": 440.2707824707031, "geo/layer_0/stable_rank_q_proj": 15.975144386291504, "geo/layer_0/stable_rank_k_proj": 14.241525650024414, "geo/layer_0/stable_rank_o_proj": 54.02385711669922, "geo/layer_0/stable_rank_gate_proj": 157.78363037109375, "geo/layer_0/stable_rank_down_proj": 48.67154312133789, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.045009151101112366, "geo/layer_0/attn_entropy_mean": 6.304180145263672, "geo/layer_0/attn_entropy_std": 0.2986302971839905, "geo/layer_7/stable_rank_q_proj": 42.75520324707031, "geo/layer_7/stable_rank_k_proj": 42.66061019897461, "geo/layer_7/stable_rank_o_proj": 110.46731567382812, "geo/layer_7/stable_rank_gate_proj": 111.80339813232422, "geo/layer_7/stable_rank_down_proj": 156.3737030029297, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.572610080242157, "geo/layer_7/attn_entropy_mean": 4.6769866943359375, "geo/layer_7/attn_entropy_std": 0.8547680974006653, "geo/layer_14/stable_rank_q_proj": 59.21168518066406, "geo/layer_14/stable_rank_k_proj": 36.81588363647461, "geo/layer_14/stable_rank_o_proj": 51.30683517456055, "geo/layer_14/stable_rank_gate_proj": 95.57884979248047, "geo/layer_14/stable_rank_down_proj": 137.18812561035156, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37463679909706116, "geo/layer_14/attn_entropy_mean": 5.559625625610352, "geo/layer_14/attn_entropy_std": 0.5409427881240845, "geo/layer_21/stable_rank_q_proj": 49.94782257080078, "geo/layer_21/stable_rank_k_proj": 32.358001708984375, "geo/layer_21/stable_rank_o_proj": 87.58625030517578, "geo/layer_21/stable_rank_gate_proj": 94.99124145507812, "geo/layer_21/stable_rank_down_proj": 64.8047866821289, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15937326848506927, "geo/layer_21/attn_entropy_mean": 5.7719831466674805, "geo/layer_21/attn_entropy_std": 0.298647403717041, "geo/layer_27/stable_rank_q_proj": 41.99837875366211, "geo/layer_27/stable_rank_k_proj": 32.437591552734375, "geo/layer_27/stable_rank_o_proj": 116.31455993652344, "geo/layer_27/stable_rank_gate_proj": 93.63346099853516, "geo/layer_27/stable_rank_down_proj": 144.66506958007812, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.06466671824455261, "geo/layer_27/attn_entropy_mean": 4.48819637298584, "geo/layer_27/attn_entropy_std": 0.477460116147995, "attnres/final_alpha/block_0": 0.24528338015079498, "attnres/block_norm/0": 1.5403618812561035, "attnres/final_alpha/block_1": 0.007626762147992849, "attnres/block_norm/1": 24157.55078125, "attnres/final_alpha/block_2": 0.016088033095002174, "attnres/block_norm/2": 17744.23828125, "attnres/final_alpha/block_3": 0.017932787537574768, "attnres/block_norm/3": 22559.455078125, "attnres/final_alpha/block_4": 0.024084273725748062, "attnres/block_norm/4": 7688.58935546875, "attnres/final_alpha/block_5": 0.5441011190414429, "attnres/block_norm/5": 4520.712890625, "attnres/final_alpha/block_6": 0.14488361775875092, "attnres/block_norm/6": 15546.990234375, "geo/tier1_time_s": 1.3606243133544922, "geo/step": 15300.0, "geo/rankme_slope": 0.0001470566937712585} {"step": 15310, "timestamp": 1778211082.60087, "train/loss": 2.258698320388794, "train/z_loss": 0.001648768747691065, "train/perplexity": 9.570623170110638, "train/grad_norm": 0.189453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1782716.411185799, "perf/iters_per_sec": 0.85006542739191, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1763800382614136, "data/tokens_consumed": 32109494272, "data/tokens_consumed_B": 32.109494272, "train/loss_slope": -1.505619050359857e-05} {"step": 15320, "timestamp": 1778211092.9797645, "train/loss": 2.3227669477462767, "train/z_loss": 0.001627709879539907, "train/perplexity": 10.20386885359311, "train/grad_norm": 0.10205078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021658.5515512077, "perf/iters_per_sec": 0.9640019185787237, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03734233379364, "data/tokens_consumed": 32130465792, "data/tokens_consumed_B": 32.130465792, "train/loss_slope": -1.0190027846683471e-05} {"step": 15330, "timestamp": 1778211103.3619485, "train/loss": 2.267985796928406, "train/z_loss": 0.0016456445096991955, "train/perplexity": 9.659924156778173, "train/grad_norm": 0.1904296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020922.5354553163, "perf/iters_per_sec": 0.9636509587551672, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0377201318740845, "data/tokens_consumed": 32151437312, "data/tokens_consumed_B": 32.151437312, "train/loss_slope": -1.1102376690935255e-05} {"step": 15340, "timestamp": 1778211113.7390764, "train/loss": 2.2220314025878904, "train/z_loss": 0.0016523348283953964, "train/perplexity": 9.226053671970684, "train/grad_norm": 0.1328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022065.992172416, "perf/iters_per_sec": 0.9641962014066773, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0371333122253419, "data/tokens_consumed": 32172408832, "data/tokens_consumed_B": 32.172408832, "train/loss_slope": -1.786585620241485e-05} {"step": 15350, "timestamp": 1778211124.1078744, "grad/layer_0/attn": 0.002579434309154749, "grad/layer_0/mlp": 0.0025228355079889297, "grad/layer_0/attn_mlp_ratio": 1.0224345577597649, "grad/layer_4/attn": 0.0021020343992859125, "grad/layer_4/mlp": 0.0024889991618692875, "grad/layer_4/attn_mlp_ratio": 0.8445299408032942, "grad/layer_8/attn": 0.009436693042516708, "grad/layer_8/mlp": 0.003854048205539584, "grad/layer_8/attn_mlp_ratio": 2.448514469565147, "grad/layer_12/attn": 0.004812989383935928, "grad/layer_12/mlp": 0.006114532705396414, "grad/layer_12/attn_mlp_ratio": 0.787139350971868, "grad/layer_16/attn": 0.0043898653239011765, "grad/layer_16/mlp": 0.004498004913330078, "grad/layer_16/attn_mlp_ratio": 0.9759582994886792, "grad/layer_20/attn": 0.003716901643201709, "grad/layer_20/mlp": 0.006121216807514429, "grad/layer_20/attn_mlp_ratio": 0.6072161302826611, "grad/layer_24/attn": 0.009798221290111542, "grad/layer_24/mlp": 0.009333563037216663, "grad/layer_24/attn_mlp_ratio": 1.0497835763323977, "grad/layer_27/attn": 0.0055248187854886055, "grad/layer_27/mlp": 0.00854120496660471, "grad/layer_27/attn_mlp_ratio": 0.6468430089672143} {"step": 15350, "timestamp": 1778211124.1247134, "train/loss": 2.232990097999573, "train/z_loss": 0.0016617674264125526, "train/perplexity": 9.327715205502958, "train/grad_norm": 0.119140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020561.8297459437, "perf/iters_per_sec": 0.9634789608697623, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0379053831100464, "data/tokens_consumed": 32193380352, "data/tokens_consumed_B": 32.193380352, "train/loss_slope": -1.8104482026133074e-05} {"step": 15360, "timestamp": 1778211134.5028424, "train/loss": 2.2882375717163086, "train/z_loss": 0.0016440506791695953, "train/perplexity": 9.857549139234212, "train/grad_norm": 0.1826171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021759.7573546192, "perf/iters_per_sec": 0.9640501772664162, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0372904062271118, "data/tokens_consumed": 32214351872, "data/tokens_consumed_B": 32.214351872, "train/loss_slope": -1.5533479920314886e-05} {"step": 15370, "timestamp": 1778211144.8786957, "train/loss": 2.239059019088745, "train/z_loss": 0.0016700580366887153, "train/perplexity": 9.384496499343575, "train/grad_norm": 0.1337890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022194.7137749982, "perf/iters_per_sec": 0.9642575806498519, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0370672941207886, "data/tokens_consumed": 32235323392, "data/tokens_consumed_B": 32.235323392, "train/loss_slope": -1.4878655717496653e-05} {"step": 15375, "timestamp": 1778211150.65983, "eos/sharpness": 29.797434806823723, "eos/L0_probe": 2.0845067501068115, "eos/L_plus": 2.2421717643737793, "eos/L_minus": 2.224816083908081, "eos/grad_norm": 0.15021854639053345, "eos/embed_grad_frac": 0.1259368658065796, "eos/time_s": 0.6045520305633545} {"step": 15375, "timestamp": 1778211152.0362673, "geo/rankme_last": 441.2733154296875, "geo/layer_0/stable_rank_q_proj": 16.008712768554688, "geo/layer_0/stable_rank_k_proj": 14.296366691589355, "geo/layer_0/stable_rank_o_proj": 54.127342224121094, "geo/layer_0/stable_rank_gate_proj": 158.147705078125, "geo/layer_0/stable_rank_down_proj": 48.680580139160156, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.046939946711063385, "geo/layer_0/attn_entropy_mean": 6.305481910705566, "geo/layer_0/attn_entropy_std": 0.29708564281463623, "geo/layer_7/stable_rank_q_proj": 42.83975601196289, "geo/layer_7/stable_rank_k_proj": 42.50273513793945, "geo/layer_7/stable_rank_o_proj": 110.25654602050781, "geo/layer_7/stable_rank_gate_proj": 111.82585906982422, "geo/layer_7/stable_rank_down_proj": 156.35350036621094, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.555194079875946, "geo/layer_7/attn_entropy_mean": 4.660284996032715, "geo/layer_7/attn_entropy_std": 0.8537673950195312, "geo/layer_14/stable_rank_q_proj": 59.2318115234375, "geo/layer_14/stable_rank_k_proj": 36.75803756713867, "geo/layer_14/stable_rank_o_proj": 51.342323303222656, "geo/layer_14/stable_rank_gate_proj": 95.2939682006836, "geo/layer_14/stable_rank_down_proj": 137.1329803466797, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3747248649597168, "geo/layer_14/attn_entropy_mean": 5.572149276733398, "geo/layer_14/attn_entropy_std": 0.5467209219932556, "geo/layer_21/stable_rank_q_proj": 49.84526443481445, "geo/layer_21/stable_rank_k_proj": 32.155174255371094, "geo/layer_21/stable_rank_o_proj": 87.79300689697266, "geo/layer_21/stable_rank_gate_proj": 94.78907775878906, "geo/layer_21/stable_rank_down_proj": 64.77901458740234, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1579359620809555, "geo/layer_21/attn_entropy_mean": 5.753988265991211, "geo/layer_21/attn_entropy_std": 0.2993728518486023, "geo/layer_27/stable_rank_q_proj": 41.98354721069336, "geo/layer_27/stable_rank_k_proj": 32.67495346069336, "geo/layer_27/stable_rank_o_proj": 116.94677734375, "geo/layer_27/stable_rank_gate_proj": 93.36689758300781, "geo/layer_27/stable_rank_down_proj": 144.8233642578125, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07125667482614517, "geo/layer_27/attn_entropy_mean": 4.463210105895996, "geo/layer_27/attn_entropy_std": 0.522447943687439, "attnres/final_alpha/block_0": 0.2458116114139557, "attnres/block_norm/0": 1.5417888164520264, "attnres/final_alpha/block_1": 0.007663181982934475, "attnres/block_norm/1": 24301.13671875, "attnres/final_alpha/block_2": 0.016220517456531525, "attnres/block_norm/2": 17748.84375, "attnres/final_alpha/block_3": 0.018096886575222015, "attnres/block_norm/3": 22754.8203125, "attnres/final_alpha/block_4": 0.024152632802724838, "attnres/block_norm/4": 7674.080078125, "attnres/final_alpha/block_5": 0.5416674017906189, "attnres/block_norm/5": 4527.5732421875, "attnres/final_alpha/block_6": 0.1463877558708191, "attnres/block_norm/6": 15633.359375, "geo/tier1_time_s": 1.3567090034484863, "geo/step": 15375.0, "geo/rankme_slope": 0.00012267989226940777} {"step": 15380, "timestamp": 1778211157.2264798, "train/loss": 2.225690245628357, "train/z_loss": 0.0016556987189687788, "train/perplexity": 9.259872184826811, "train/grad_norm": 0.10498046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1699160.4857121354, "perf/iters_per_sec": 0.8102228573380162, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2342283248901367, "data/tokens_consumed": 32256294912, "data/tokens_consumed_B": 32.256294912, "train/loss_slope": -1.9761582097598048e-05} {"step": 15390, "timestamp": 1778211167.6031365, "train/loss": 2.2282549142837524, "train/z_loss": 0.0016485340544022619, "train/perplexity": 9.283651168345143, "train/grad_norm": 0.212890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021910.8414347214, "perf/iters_per_sec": 0.9641222197698218, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037212896347046, "data/tokens_consumed": 32277266432, "data/tokens_consumed_B": 32.277266432, "train/loss_slope": -2.037645258275447e-05} {"step": 15400, "timestamp": 1778211177.9679205, "grad/layer_0/attn": 0.002751912921667099, "grad/layer_0/mlp": 0.0030205347575247288, "grad/layer_0/attn_mlp_ratio": 0.9110680894185209, "grad/layer_4/attn": 0.0019218834349885583, "grad/layer_4/mlp": 0.0026053639594465494, "grad/layer_4/attn_mlp_ratio": 0.7376640619648445, "grad/layer_8/attn": 0.004749293439090252, "grad/layer_8/mlp": 0.003953429404646158, "grad/layer_8/attn_mlp_ratio": 1.2013097573913427, "grad/layer_12/attn": 0.004404361825436354, "grad/layer_12/mlp": 0.00638757087290287, "grad/layer_12/attn_mlp_ratio": 0.6895206087134504, "grad/layer_16/attn": 0.004785231314599514, "grad/layer_16/mlp": 0.0048027546145021915, "grad/layer_16/attn_mlp_ratio": 0.9963513856225544, "grad/layer_20/attn": 0.005413020495325327, "grad/layer_20/mlp": 0.007603876292705536, "grad/layer_20/attn_mlp_ratio": 0.7118764450876772, "grad/layer_24/attn": 0.02561732940375805, "grad/layer_24/mlp": 0.01711718551814556, "grad/layer_24/attn_mlp_ratio": 1.4965853601892163, "grad/layer_27/attn": 0.014572137966752052, "grad/layer_27/mlp": 0.016214709728956223, "grad/layer_27/attn_mlp_ratio": 0.8986986582226179} {"step": 15400, "timestamp": 1778211177.9849904, "train/loss": 2.2807111978530883, "train/z_loss": 0.0016515563591383397, "train/perplexity": 9.783636036766891, "train/grad_norm": 0.2890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021010.5263316552, "perf/iters_per_sec": 0.9636929160745884, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0376749515533448, "data/tokens_consumed": 32298237952, "data/tokens_consumed_B": 32.298237952, "train/loss_slope": -1.6894144021888926e-05} {"step": 15410, "timestamp": 1778211188.3601344, "train/loss": 2.258373737335205, "train/z_loss": 0.001661610696464777, "train/perplexity": 9.567517212115268, "train/grad_norm": 0.1416015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022418.4001450902, "perf/iters_per_sec": 0.9643642426228953, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036952590942383, "data/tokens_consumed": 32319209472, "data/tokens_consumed_B": 32.319209472, "train/loss_slope": -1.470593830289479e-05} {"step": 15420, "timestamp": 1778211198.7409108, "train/loss": 2.23979856967926, "train/z_loss": 0.0016553916851989926, "train/perplexity": 9.391439376259273, "train/grad_norm": 0.10595703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021648.8868805408, "perf/iters_per_sec": 0.963997310104628, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373472929000855, "data/tokens_consumed": 32340180992, "data/tokens_consumed_B": 32.340180992, "train/loss_slope": -1.765819645271337e-05} {"step": 15430, "timestamp": 1778211209.1245413, "train/loss": 2.252343225479126, "train/z_loss": 0.0016494367504492403, "train/perplexity": 9.50999380824859, "train/grad_norm": 0.263671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021341.8958198628, "perf/iters_per_sec": 0.9638509253596605, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0375048398971558, "data/tokens_consumed": 32361152512, "data/tokens_consumed_B": 32.361152512, "train/loss_slope": -1.7536494376385447e-05} {"step": 15440, "timestamp": 1778211219.5010903, "train/loss": 2.2070714712142943, "train/z_loss": 0.0016614592866972088, "train/perplexity": 9.089059806522625, "train/grad_norm": 0.1845703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022097.229711954, "perf/iters_per_sec": 0.9642110966262598, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0371172904968262, "data/tokens_consumed": 32382124032, "data/tokens_consumed_B": 32.382124032, "train/loss_slope": -2.0722049676796177e-05} {"step": 15450, "timestamp": 1778211229.863846, "grad/layer_0/attn": 0.0032635387033224106, "grad/layer_0/mlp": 0.0027399363461881876, "grad/layer_0/attn_mlp_ratio": 1.1911001468164202, "grad/layer_4/attn": 0.0028702844865620136, "grad/layer_4/mlp": 0.0025320681743323803, "grad/layer_4/attn_mlp_ratio": 1.1335731013488606, "grad/layer_8/attn": 0.004698830656707287, "grad/layer_8/mlp": 0.00395191041752696, "grad/layer_8/attn_mlp_ratio": 1.1890022903777024, "grad/layer_12/attn": 0.004138428717851639, "grad/layer_12/mlp": 0.006026744842529297, "grad/layer_12/attn_mlp_ratio": 0.6866772623224418, "grad/layer_16/attn": 0.004759775474667549, "grad/layer_16/mlp": 0.004655974451452494, "grad/layer_16/attn_mlp_ratio": 1.0222941345722505, "grad/layer_20/attn": 0.005384678021073341, "grad/layer_20/mlp": 0.006331610959023237, "grad/layer_20/attn_mlp_ratio": 0.8504435870866685, "grad/layer_24/attn": 0.011375236324965954, "grad/layer_24/mlp": 0.011354475282132626, "grad/layer_24/attn_mlp_ratio": 1.0018284369937511, "grad/layer_27/attn": 0.00548601383343339, "grad/layer_27/mlp": 0.00990341603755951, "grad/layer_27/attn_mlp_ratio": 0.5539516624599099} {"step": 15450, "timestamp": 1778211230.4676461, "eos/sharpness": 21.890306472778317, "eos/L0_probe": 2.083050489425659, "eos/L_plus": 2.1978230476379395, "eos/L_minus": 2.187180995941162, "eos/grad_norm": 0.14445197582244873, "eos/embed_grad_frac": 0.11943096667528152, "eos/time_s": 0.6010468006134033} {"step": 15450, "timestamp": 1778211230.4877524, "train/loss": 2.186512851715088, "train/z_loss": 0.0016674731392413377, "train/perplexity": 8.904108965520125, "train/grad_norm": 0.14453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1910009.4581399993, "perf/iters_per_sec": 0.9107634821605679, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0979799032211304, "data/tokens_consumed": 32403095552, "data/tokens_consumed_B": 32.403095552, "train/loss_slope": -2.046693431483427e-05} {"step": 15450, "timestamp": 1778211231.8471777, "geo/rankme_last": 440.2367858886719, "geo/layer_0/stable_rank_q_proj": 16.0234432220459, "geo/layer_0/stable_rank_k_proj": 14.30534553527832, "geo/layer_0/stable_rank_o_proj": 53.93507766723633, "geo/layer_0/stable_rank_gate_proj": 157.9208984375, "geo/layer_0/stable_rank_down_proj": 48.75379180908203, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.036845557391643524, "geo/layer_0/attn_entropy_mean": 6.300720691680908, "geo/layer_0/attn_entropy_std": 0.3009040951728821, "geo/layer_7/stable_rank_q_proj": 42.74397659301758, "geo/layer_7/stable_rank_k_proj": 42.316410064697266, "geo/layer_7/stable_rank_o_proj": 109.75636291503906, "geo/layer_7/stable_rank_gate_proj": 111.85323333740234, "geo/layer_7/stable_rank_down_proj": 156.37466430664062, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5837691426277161, "geo/layer_7/attn_entropy_mean": 4.669862270355225, "geo/layer_7/attn_entropy_std": 0.8974533081054688, "geo/layer_14/stable_rank_q_proj": 59.33473205566406, "geo/layer_14/stable_rank_k_proj": 36.8648567199707, "geo/layer_14/stable_rank_o_proj": 51.38379669189453, "geo/layer_14/stable_rank_gate_proj": 95.12970733642578, "geo/layer_14/stable_rank_down_proj": 137.25669860839844, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.386085569858551, "geo/layer_14/attn_entropy_mean": 5.567821025848389, "geo/layer_14/attn_entropy_std": 0.5459511280059814, "geo/layer_21/stable_rank_q_proj": 49.81998062133789, "geo/layer_21/stable_rank_k_proj": 32.2246208190918, "geo/layer_21/stable_rank_o_proj": 87.7669448852539, "geo/layer_21/stable_rank_gate_proj": 94.77125549316406, "geo/layer_21/stable_rank_down_proj": 64.71660614013672, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15698568522930145, "geo/layer_21/attn_entropy_mean": 5.783394813537598, "geo/layer_21/attn_entropy_std": 0.28924256563186646, "geo/layer_27/stable_rank_q_proj": 41.91874313354492, "geo/layer_27/stable_rank_k_proj": 32.46833038330078, "geo/layer_27/stable_rank_o_proj": 117.17466735839844, "geo/layer_27/stable_rank_gate_proj": 93.29502868652344, "geo/layer_27/stable_rank_down_proj": 144.1469268798828, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0747561976313591, "geo/layer_27/attn_entropy_mean": 4.480862617492676, "geo/layer_27/attn_entropy_std": 0.4802791476249695, "attnres/final_alpha/block_0": 0.24716660380363464, "attnres/block_norm/0": 1.5430660247802734, "attnres/final_alpha/block_1": 0.007780129089951515, "attnres/block_norm/1": 24317.787109375, "attnres/final_alpha/block_2": 0.016317768022418022, "attnres/block_norm/2": 17848.296875, "attnres/final_alpha/block_3": 0.018490266054868698, "attnres/block_norm/3": 22684.68359375, "attnres/final_alpha/block_4": 0.02447064220905304, "attnres/block_norm/4": 7734.5166015625, "attnres/final_alpha/block_5": 0.5385745763778687, "attnres/block_norm/5": 4601.1162109375, "attnres/final_alpha/block_6": 0.14720001816749573, "attnres/block_norm/6": 15676.322265625, "geo/tier1_time_s": 1.355391025543213, "geo/step": 15450.0, "geo/rankme_slope": 0.0001215107918167267} {"step": 15460, "timestamp": 1778211242.2213068, "train/loss": 2.275047540664673, "train/z_loss": 0.0016444897861219942, "train/perplexity": 9.728381495288906, "train/grad_norm": 0.10888671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1787907.1843306513, "perf/iters_per_sec": 0.8525405809071785, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1729646921157837, "data/tokens_consumed": 32424067072, "data/tokens_consumed_B": 32.424067072, "train/loss_slope": -1.9580806535605835e-05} {"step": 15470, "timestamp": 1778211252.5965521, "train/loss": 2.241092085838318, "train/z_loss": 0.0016515057301148771, "train/perplexity": 9.403595215041307, "train/grad_norm": 0.1474609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022400.0328436499, "perf/iters_per_sec": 0.9643554844110727, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0369620084762574, "data/tokens_consumed": 32445038592, "data/tokens_consumed_B": 32.445038592, "train/loss_slope": -2.049717124145813e-05} {"step": 15480, "timestamp": 1778211263.4834464, "train/loss": 2.3294870376586916, "train/z_loss": 0.0016201538499444723, "train/perplexity": 10.27267068807889, "train/grad_norm": 0.1630859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1927618.13942418, "perf/iters_per_sec": 0.9191599557038211, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0879499197006226, "data/tokens_consumed": 32466010112, "data/tokens_consumed_B": 32.466010112, "train/loss_slope": -1.5335350819189143e-05} {"step": 15490, "timestamp": 1778211273.860972, "train/loss": 2.212263512611389, "train/z_loss": 0.001662887865677476, "train/perplexity": 9.136373301824952, "train/grad_norm": 0.201171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022319.17418518, "perf/iters_per_sec": 0.9643169279981518, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037003469467163, "data/tokens_consumed": 32486981632, "data/tokens_consumed_B": 32.486981632, "train/loss_slope": -2.0420835005997432e-05} {"step": 15500, "timestamp": 1778211284.2307801, "grad/layer_0/attn": 0.002629324095323682, "grad/layer_0/mlp": 0.002546041738241911, "grad/layer_0/attn_mlp_ratio": 1.0327104825344422, "grad/layer_4/attn": 0.0020008645951747894, "grad/layer_4/mlp": 0.0025958437472581863, "grad/layer_4/attn_mlp_ratio": 0.770795437979895, "grad/layer_8/attn": 0.004023634362965822, "grad/layer_8/mlp": 0.003884859150275588, "grad/layer_8/attn_mlp_ratio": 1.0357220439016908, "grad/layer_12/attn": 0.006333815399557352, "grad/layer_12/mlp": 0.0061622196808457375, "grad/layer_12/attn_mlp_ratio": 1.0278463970475367, "grad/layer_16/attn": 0.004209104459732771, "grad/layer_16/mlp": 0.0046605453826487064, "grad/layer_16/attn_mlp_ratio": 0.9031355826058005, "grad/layer_20/attn": 0.008241429924964905, "grad/layer_20/mlp": 0.005634830333292484, "grad/layer_20/attn_mlp_ratio": 1.462587033013762, "grad/layer_24/attn": 0.008975003845989704, "grad/layer_24/mlp": 0.008861319161951542, "grad/layer_24/attn_mlp_ratio": 1.0128293068647576, "grad/layer_27/attn": 0.004470315761864185, "grad/layer_27/mlp": 0.007577074225991964, "grad/layer_27/attn_mlp_ratio": 0.5899791356842665} {"step": 15500, "timestamp": 1778211284.247542, "train/loss": 2.2823975801467897, "train/z_loss": 0.0016486478853039443, "train/perplexity": 9.80014890693986, "train/grad_norm": 0.140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020120.0595015064, "perf/iters_per_sec": 0.9632683084018261, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0381323575973511, "data/tokens_consumed": 32507953152, "data/tokens_consumed_B": 32.507953152, "train/loss_slope": -1.9953225755562797e-05} {"step": 15500, "timestamp": 1778211291.2552447, "geo/ww_alpha_mean": 8.155048790436759, "geo/ww_alpha_std": 5.084590831370382, "geo/ww_alpha_min": 1.373446629823507, "geo/ww_alpha_max": 34.54963200679647, "geo/ww_alpha_healthy_frac": 0.17258883248730963, "geo/ww_alpha_by_type/q_proj": 4.226418640500654, "geo/ww_alpha_by_type/k_proj": 4.750007498673699, "geo/ww_alpha_by_type/v_proj": 8.937948279248802, "geo/ww_alpha_by_type/o_proj": 7.281448527539361, "geo/ww_alpha_by_type/gate_proj": 10.134319432777795, "geo/ww_alpha_by_type/up_proj": 12.90159775227315, "geo/ww_alpha_by_type/down_proj": 8.995949828094687, "geo/twonn_id/layer_0": 0.7432773113250732, "geo/twonn_id/layer_7": 2.9989089965820312, "geo/twonn_id/layer_14": 4.560058116912842, "geo/twonn_id/layer_21": 7.620523452758789, "geo/twonn_id/layer_27": 6.5186896324157715, "geo/tier2_time_s": 7.000047445297241} {"step": 15500, "timestamp": 1778211291.902676, "eoc/jacobian_sigma/layer_0/attn": 770.4403686523438, "eoc/jacobian_sigma/layer_0/mlp": 4598.3076171875, "eoc/jacobian_sigma/layer_0": 4598.3076171875, "eoc/jacobian_sigma/layer_7/attn": 1.1486232280731201, "eoc/jacobian_sigma/layer_7/mlp": 1.6362086534500122, "eoc/jacobian_sigma/layer_7": 1.6362086534500122, "eoc/jacobian_sigma/layer_14/attn": 1.4899287223815918, "eoc/jacobian_sigma/layer_14/mlp": 5.4491095542907715, "eoc/jacobian_sigma/layer_14": 5.4491095542907715, "eoc/jacobian_sigma/layer_21/attn": 1.0810173749923706, "eoc/jacobian_sigma/layer_21/mlp": 4.0333733558654785, "eoc/jacobian_sigma/layer_21": 4.0333733558654785, "eoc/jacobian_sigma/layer_27/attn": 2.3764233589172363, "eoc/jacobian_sigma/layer_27/mlp": 19.25986671447754, "eoc/jacobian_sigma/layer_27": 19.25986671447754, "eoc/layer0_sigma": 4598.3076171875, "eoc/sigma_max": 19.25986671447754, "eoc/sigma_min": 1.6362086534500122, "eoc/sigma_mean": 7.59463956952095, "eoc/time_s": 0.6403164863586426} {"step": 15510, "timestamp": 1778211302.3013413, "train/loss": 2.256002354621887, "train/z_loss": 0.001657959900330752, "train/perplexity": 9.544855847193201, "train/grad_norm": 0.16015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1162035.5765673274, "perf/iters_per_sec": 0.5541017420612943, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.8047227144241333, "data/tokens_consumed": 32528924672, "data/tokens_consumed_B": 32.528924672, "train/loss_slope": -1.9670643929493833e-05} {"step": 15520, "timestamp": 1778211312.6822426, "train/loss": 2.268294620513916, "train/z_loss": 0.001650999253615737, "train/perplexity": 9.662907829882624, "train/grad_norm": 0.2294921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021216.2087711354, "perf/iters_per_sec": 0.9637909931045224, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0375693559646606, "data/tokens_consumed": 32549896192, "data/tokens_consumed_B": 32.549896192, "train/loss_slope": -2.0691437961602442e-05} {"step": 15525, "timestamp": 1778211318.4853523, "eos/sharpness": 66.23885631561278, "eos/L0_probe": 2.0844428539276123, "eos/L_plus": 2.3134799003601074, "eos/L_minus": 2.517794370651245, "eos/grad_norm": 0.28268763422966003, "eos/embed_grad_frac": 0.02980802394449711, "eos/time_s": 0.6252837181091309} {"step": 15525, "timestamp": 1778211319.8669767, "geo/rankme_last": 441.6046142578125, "geo/layer_0/stable_rank_q_proj": 16.047597885131836, "geo/layer_0/stable_rank_k_proj": 14.30302619934082, "geo/layer_0/stable_rank_o_proj": 53.9210090637207, "geo/layer_0/stable_rank_gate_proj": 158.19622802734375, "geo/layer_0/stable_rank_down_proj": 48.80157470703125, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.040318652987480164, "geo/layer_0/attn_entropy_mean": 6.305854797363281, "geo/layer_0/attn_entropy_std": 0.30060875415802, "geo/layer_7/stable_rank_q_proj": 42.746742248535156, "geo/layer_7/stable_rank_k_proj": 42.19526290893555, "geo/layer_7/stable_rank_o_proj": 109.70448303222656, "geo/layer_7/stable_rank_gate_proj": 111.8160400390625, "geo/layer_7/stable_rank_down_proj": 156.01795959472656, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5617473721504211, "geo/layer_7/attn_entropy_mean": 4.651726722717285, "geo/layer_7/attn_entropy_std": 0.8655564785003662, "geo/layer_14/stable_rank_q_proj": 59.37147903442383, "geo/layer_14/stable_rank_k_proj": 36.685997009277344, "geo/layer_14/stable_rank_o_proj": 51.46788024902344, "geo/layer_14/stable_rank_gate_proj": 95.15741729736328, "geo/layer_14/stable_rank_down_proj": 137.07945251464844, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.386589914560318, "geo/layer_14/attn_entropy_mean": 5.569306373596191, "geo/layer_14/attn_entropy_std": 0.5042518377304077, "geo/layer_21/stable_rank_q_proj": 49.766849517822266, "geo/layer_21/stable_rank_k_proj": 32.2299919128418, "geo/layer_21/stable_rank_o_proj": 87.77635955810547, "geo/layer_21/stable_rank_gate_proj": 94.82634735107422, "geo/layer_21/stable_rank_down_proj": 64.70330047607422, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15378287434577942, "geo/layer_21/attn_entropy_mean": 5.757558822631836, "geo/layer_21/attn_entropy_std": 0.29573556780815125, "geo/layer_27/stable_rank_q_proj": 42.01820373535156, "geo/layer_27/stable_rank_k_proj": 32.55686569213867, "geo/layer_27/stable_rank_o_proj": 117.01231384277344, "geo/layer_27/stable_rank_gate_proj": 93.10889434814453, "geo/layer_27/stable_rank_down_proj": 143.52308654785156, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.06829453259706497, "geo/layer_27/attn_entropy_mean": 4.455635070800781, "geo/layer_27/attn_entropy_std": 0.4940330684185028, "attnres/final_alpha/block_0": 0.24592483043670654, "attnres/block_norm/0": 1.5444746017456055, "attnres/final_alpha/block_1": 0.0077293370850384235, "attnres/block_norm/1": 24397.75390625, "attnres/final_alpha/block_2": 0.01585344225168228, "attnres/block_norm/2": 17849.58984375, "attnres/final_alpha/block_3": 0.01803462952375412, "attnres/block_norm/3": 22995.33203125, "attnres/final_alpha/block_4": 0.024302372708916664, "attnres/block_norm/4": 7732.96337890625, "attnres/final_alpha/block_5": 0.5399501323699951, "attnres/block_norm/5": 4565.7958984375, "attnres/final_alpha/block_6": 0.14820531010627747, "attnres/block_norm/6": 15575.03125, "geo/tier1_time_s": 1.3613364696502686, "geo/step": 15525.0, "geo/rankme_slope": 0.000128158196872499} {"step": 15530, "timestamp": 1778211325.061652, "train/loss": 2.2529937028884888, "train/z_loss": 0.001653527794405818, "train/perplexity": 9.516181856758728, "train/grad_norm": 0.1083984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1694956.008438541, "perf/iters_per_sec": 0.8082180063431459, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2372899293899535, "data/tokens_consumed": 32570867712, "data/tokens_consumed_B": 32.570867712, "train/loss_slope": -2.246314908447407e-05} {"step": 15540, "timestamp": 1778211335.439925, "train/loss": 2.2938836336135866, "train/z_loss": 0.0016415788093581795, "train/perplexity": 9.913362887499288, "train/grad_norm": 0.099609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021758.363266464, "perf/iters_per_sec": 0.964049512513382, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037291121482849, "data/tokens_consumed": 32591839232, "data/tokens_consumed_B": 32.591839232, "train/loss_slope": -2.0153226481877504e-05} {"step": 15550, "timestamp": 1778211345.8049488, "grad/layer_0/attn": 0.003004393307492137, "grad/layer_0/mlp": 0.0028641889803111553, "grad/layer_0/attn_mlp_ratio": 1.0489507582249948, "grad/layer_4/attn": 0.0026373493019491434, "grad/layer_4/mlp": 0.002528035081923008, "grad/layer_4/attn_mlp_ratio": 1.043240743169952, "grad/layer_8/attn": 0.007179577834904194, "grad/layer_8/mlp": 0.004047226160764694, "grad/layer_8/attn_mlp_ratio": 1.7739502000433416, "grad/layer_12/attn": 0.004058653023093939, "grad/layer_12/mlp": 0.006254942622035742, "grad/layer_12/attn_mlp_ratio": 0.6488713331931207, "grad/layer_16/attn": 0.004102480132132769, "grad/layer_16/mlp": 0.0049747140146791935, "grad/layer_16/attn_mlp_ratio": 0.824666510991522, "grad/layer_20/attn": 0.005416735541075468, "grad/layer_20/mlp": 0.007580084260553122, "grad/layer_20/attn_mlp_ratio": 0.7146009573804014, "grad/layer_24/attn": 0.02714640088379383, "grad/layer_24/mlp": 0.01677447184920311, "grad/layer_24/attn_mlp_ratio": 1.618316270461405, "grad/layer_27/attn": 0.009181232191622257, "grad/layer_27/mlp": 0.01595437526702881, "grad/layer_27/attn_mlp_ratio": 0.5754679816921019} {"step": 15550, "timestamp": 1778211345.8216703, "train/loss": 2.2420879364013673, "train/z_loss": 0.0016531281173229218, "train/perplexity": 9.412964455038054, "train/grad_norm": 0.244140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020702.1507817481, "perf/iters_per_sec": 0.9635458711537114, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037833309173584, "data/tokens_consumed": 32612810752, "data/tokens_consumed_B": 32.612810752, "train/loss_slope": -2.158036126841522e-05} {"step": 15560, "timestamp": 1778211356.199368, "train/loss": 2.210280203819275, "train/z_loss": 0.0016577816219069063, "train/perplexity": 9.118271009478988, "train/grad_norm": 0.158203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022111.8261550271, "perf/iters_per_sec": 0.9642180567526947, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0371098041534423, "data/tokens_consumed": 32633782272, "data/tokens_consumed_B": 32.633782272, "train/loss_slope": -2.2063345119397374e-05} {"step": 15570, "timestamp": 1778211366.5788581, "train/loss": 2.252501893043518, "train/z_loss": 0.0016466959146782755, "train/perplexity": 9.51150285551879, "train/grad_norm": 0.185546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021613.4350576883, "perf/iters_per_sec": 0.9639804053581659, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037365484237671, "data/tokens_consumed": 32654753792, "data/tokens_consumed_B": 32.654753792, "train/loss_slope": -2.1124614490391178e-05} {"step": 15580, "timestamp": 1778211376.955304, "train/loss": 2.233759951591492, "train/z_loss": 0.0016532577225007116, "train/perplexity": 9.334898945417498, "train/grad_norm": 0.166015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022028.7594016965, "perf/iters_per_sec": 0.9641784474380953, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0371524095535278, "data/tokens_consumed": 32675725312, "data/tokens_consumed_B": 32.675725312, "train/loss_slope": -2.0637876630508987e-05} {"step": 15590, "timestamp": 1778211387.3320007, "train/loss": 2.3001543283462524, "train/z_loss": 0.001638356107287109, "train/perplexity": 9.975721872683044, "train/grad_norm": 0.126953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022184.346639745, "perf/iters_per_sec": 0.964252637214539, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0370726108551025, "data/tokens_consumed": 32696696832, "data/tokens_consumed_B": 32.696696832, "train/loss_slope": -2.1330006810018628e-05} {"step": 15600, "timestamp": 1778211397.6978114, "grad/layer_0/attn": 0.0029241135343909264, "grad/layer_0/mlp": 0.0028096174355596304, "grad/layer_0/attn_mlp_ratio": 1.0407514536701843, "grad/layer_4/attn": 0.0016421611653640866, "grad/layer_4/mlp": 0.0025764312595129013, "grad/layer_4/attn_mlp_ratio": 0.6373781934072378, "grad/layer_8/attn": 0.004350669682025909, "grad/layer_8/mlp": 0.003912054467946291, "grad/layer_8/attn_mlp_ratio": 1.1121188640039543, "grad/layer_12/attn": 0.004349499940872192, "grad/layer_12/mlp": 0.007120421156287193, "grad/layer_12/attn_mlp_ratio": 0.6108486821663351, "grad/layer_16/attn": 0.005202800035476685, "grad/layer_16/mlp": 0.005324007011950016, "grad/layer_16/attn_mlp_ratio": 0.9772338627795453, "grad/layer_20/attn": 0.0038848365657031536, "grad/layer_20/mlp": 0.0067442310974001884, "grad/layer_20/attn_mlp_ratio": 0.5760236344211787, "grad/layer_24/attn": 0.009795097634196281, "grad/layer_24/mlp": 0.010364721529185772, "grad/layer_24/attn_mlp_ratio": 0.9450420363065517, "grad/layer_27/attn": 0.006148346699774265, "grad/layer_27/mlp": 0.009744159877300262, "grad/layer_27/attn_mlp_ratio": 0.6309776023892558} {"step": 15600, "timestamp": 1778211398.309057, "eos/sharpness": 52.36752033233642, "eos/L0_probe": 2.0876290798187256, "eos/L_plus": 2.2736313343048096, "eos/L_minus": 2.425302028656006, "eos/grad_norm": 0.18324466049671173, "eos/embed_grad_frac": 0.0797763541340828, "eos/time_s": 0.6084861755371094} {"step": 15600, "timestamp": 1778211398.3290462, "train/loss": 2.260989713668823, "train/z_loss": 0.001637495227623731, "train/perplexity": 9.592578376127799, "train/grad_norm": 0.18359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1907965.254708065, "perf/iters_per_sec": 0.9097887300052953, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0991562843322753, "data/tokens_consumed": 32717668352, "data/tokens_consumed_B": 32.717668352, "train/loss_slope": -1.970521319280901e-05} {"step": 15600, "timestamp": 1778211399.6905103, "geo/rankme_last": 441.8519287109375, "geo/layer_0/stable_rank_q_proj": 16.032243728637695, "geo/layer_0/stable_rank_k_proj": 14.265706062316895, "geo/layer_0/stable_rank_o_proj": 53.81068420410156, "geo/layer_0/stable_rank_gate_proj": 157.87625122070312, "geo/layer_0/stable_rank_down_proj": 48.7868537902832, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.04181632399559021, "geo/layer_0/attn_entropy_mean": 6.304076671600342, "geo/layer_0/attn_entropy_std": 0.3007137179374695, "geo/layer_7/stable_rank_q_proj": 42.62255096435547, "geo/layer_7/stable_rank_k_proj": 42.27585983276367, "geo/layer_7/stable_rank_o_proj": 109.8005142211914, "geo/layer_7/stable_rank_gate_proj": 111.54492950439453, "geo/layer_7/stable_rank_down_proj": 156.46298217773438, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5664848685264587, "geo/layer_7/attn_entropy_mean": 4.6357808113098145, "geo/layer_7/attn_entropy_std": 0.8619899749755859, "geo/layer_14/stable_rank_q_proj": 59.32378387451172, "geo/layer_14/stable_rank_k_proj": 36.63151931762695, "geo/layer_14/stable_rank_o_proj": 51.514320373535156, "geo/layer_14/stable_rank_gate_proj": 94.9521255493164, "geo/layer_14/stable_rank_down_proj": 136.8624267578125, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3807372450828552, "geo/layer_14/attn_entropy_mean": 5.539636135101318, "geo/layer_14/attn_entropy_std": 0.5298563838005066, "geo/layer_21/stable_rank_q_proj": 49.8776741027832, "geo/layer_21/stable_rank_k_proj": 32.281368255615234, "geo/layer_21/stable_rank_o_proj": 87.30389404296875, "geo/layer_21/stable_rank_gate_proj": 94.76852416992188, "geo/layer_21/stable_rank_down_proj": 64.46627807617188, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1564074456691742, "geo/layer_21/attn_entropy_mean": 5.756592750549316, "geo/layer_21/attn_entropy_std": 0.30076196789741516, "geo/layer_27/stable_rank_q_proj": 42.15563201904297, "geo/layer_27/stable_rank_k_proj": 32.393089294433594, "geo/layer_27/stable_rank_o_proj": 116.96424102783203, "geo/layer_27/stable_rank_gate_proj": 93.00041961669922, "geo/layer_27/stable_rank_down_proj": 143.42913818359375, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.06944464147090912, "geo/layer_27/attn_entropy_mean": 4.459715843200684, "geo/layer_27/attn_entropy_std": 0.5048666000366211, "attnres/final_alpha/block_0": 0.24650217592716217, "attnres/block_norm/0": 1.5458518266677856, "attnres/final_alpha/block_1": 0.007727185729891062, "attnres/block_norm/1": 24415.34375, "attnres/final_alpha/block_2": 0.0161404088139534, "attnres/block_norm/2": 17867.74609375, "attnres/final_alpha/block_3": 0.018116841092705727, "attnres/block_norm/3": 22873.259765625, "attnres/final_alpha/block_4": 0.024738214910030365, "attnres/block_norm/4": 7757.17333984375, "attnres/final_alpha/block_5": 0.5375505685806274, "attnres/block_norm/5": 4583.861328125, "attnres/final_alpha/block_6": 0.14922460913658142, "attnres/block_norm/6": 15640.91796875, "geo/tier1_time_s": 1.3577444553375244, "geo/step": 15600.0, "geo/rankme_slope": 0.0001452981974039616} {"step": 15610, "timestamp": 1778211410.0721278, "train/loss": 2.2899200916290283, "train/z_loss": 0.0016354103339836002, "train/perplexity": 9.87414862251625, "train/grad_norm": 0.1591796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1786444.7730903008, "perf/iters_per_sec": 0.8518432488872055, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1739248991012574, "data/tokens_consumed": 32738639872, "data/tokens_consumed_B": 32.738639872, "train/loss_slope": -1.58035820609916e-05} {"step": 15620, "timestamp": 1778211420.4255733, "train/loss": 2.2666577339172362, "train/z_loss": 0.001646034326404333, "train/perplexity": 9.647103683897106, "train/grad_norm": 0.2138671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026937.4796889003, "perf/iters_per_sec": 0.9665191076702596, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346406936645507, "data/tokens_consumed": 32759611392, "data/tokens_consumed_B": 32.759611392, "train/loss_slope": -1.4454782205839774e-05} {"step": 15630, "timestamp": 1778211430.7852428, "train/loss": 2.25305655002594, "train/z_loss": 0.00165200229967013, "train/perplexity": 9.516779940341612, "train/grad_norm": 0.109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025496.163527933, "perf/iters_per_sec": 0.9658318345679917, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0353769302368163, "data/tokens_consumed": 32780582912, "data/tokens_consumed_B": 32.780582912, "train/loss_slope": -1.3768383714840163e-05} {"step": 15640, "timestamp": 1778211441.1505313, "train/loss": 2.3001677989959717, "train/z_loss": 0.001634794706478715, "train/perplexity": 9.975856253043181, "train/grad_norm": 0.1455078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024433.8531767947, "perf/iters_per_sec": 0.9653252855190252, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0359202384948731, "data/tokens_consumed": 32801554432, "data/tokens_consumed_B": 32.801554432, "train/loss_slope": -1.0894821944123745e-05} {"step": 15650, "timestamp": 1778211451.4945598, "grad/layer_0/attn": 0.0040088254027068615, "grad/layer_0/mlp": 0.0032917552161961794, "grad/layer_0/attn_mlp_ratio": 1.217838210204305, "grad/layer_4/attn": 0.002003657165914774, "grad/layer_4/mlp": 0.002629833994433284, "grad/layer_4/attn_mlp_ratio": 0.7618948929729167, "grad/layer_8/attn": 0.005127020180225372, "grad/layer_8/mlp": 0.003839167533442378, "grad/layer_8/attn_mlp_ratio": 1.3354509804585546, "grad/layer_12/attn": 0.003948328085243702, "grad/layer_12/mlp": 0.006256532855331898, "grad/layer_12/attn_mlp_ratio": 0.6310728503202219, "grad/layer_16/attn": 0.005048563703894615, "grad/layer_16/mlp": 0.0050879307091236115, "grad/layer_16/attn_mlp_ratio": 0.9922626492564711, "grad/layer_20/attn": 0.00578802777454257, "grad/layer_20/mlp": 0.008265514858067036, "grad/layer_20/attn_mlp_ratio": 0.7002622103893879, "grad/layer_24/attn": 0.02845057100057602, "grad/layer_24/mlp": 0.016781341284513474, "grad/layer_24/attn_mlp_ratio": 1.6953693002653172, "grad/layer_27/attn": 0.011191067285835743, "grad/layer_27/mlp": 0.014527485705912113, "grad/layer_27/attn_mlp_ratio": 0.7703375129977013} {"step": 15650, "timestamp": 1778211451.5103753, "train/loss": 2.2098739385604858, "train/z_loss": 0.0016657926840707659, "train/perplexity": 9.114567325137694, "train/grad_norm": 0.3046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025771.5260666895, "perf/iters_per_sec": 0.965963137658448, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352361917495727, "data/tokens_consumed": 32822525952, "data/tokens_consumed_B": 32.822525952, "train/loss_slope": -1.369543456115633e-05} {"step": 15660, "timestamp": 1778211461.8679218, "train/loss": 2.270754098892212, "train/z_loss": 0.0016493917442858218, "train/perplexity": 9.686702792364846, "train/grad_norm": 0.1953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025802.1783656196, "perf/iters_per_sec": 0.9659777538135622, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352205276489257, "data/tokens_consumed": 32843497472, "data/tokens_consumed_B": 32.843497472, "train/loss_slope": -1.630921363830568e-05} {"step": 15670, "timestamp": 1778211472.2343268, "train/loss": 2.327417254447937, "train/z_loss": 0.0016306056524626911, "train/perplexity": 10.251430475659383, "train/grad_norm": 0.203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024004.9655726056, "perf/iters_per_sec": 0.9651207759726551, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0361397504806518, "data/tokens_consumed": 32864468992, "data/tokens_consumed_B": 32.864468992, "train/loss_slope": -1.2415433166050677e-05} {"step": 15675, "timestamp": 1778211478.0109742, "eos/sharpness": 13.219618797302244, "eos/L0_probe": 2.0815789699554443, "eos/L_plus": 2.143423080444336, "eos/L_minus": 2.151931047439575, "eos/grad_norm": 0.11095762252807617, "eos/embed_grad_frac": 0.197371706366539, "eos/time_s": 0.6082220077514648} {"step": 15675, "timestamp": 1778211479.3845854, "geo/rankme_last": 440.9481506347656, "geo/layer_0/stable_rank_q_proj": 16.04694366455078, "geo/layer_0/stable_rank_k_proj": 14.259113311767578, "geo/layer_0/stable_rank_o_proj": 53.71919631958008, "geo/layer_0/stable_rank_gate_proj": 156.8257598876953, "geo/layer_0/stable_rank_down_proj": 48.78278732299805, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.03780018538236618, "geo/layer_0/attn_entropy_mean": 6.303874969482422, "geo/layer_0/attn_entropy_std": 0.2979607880115509, "geo/layer_7/stable_rank_q_proj": 42.56349182128906, "geo/layer_7/stable_rank_k_proj": 42.594825744628906, "geo/layer_7/stable_rank_o_proj": 110.52311706542969, "geo/layer_7/stable_rank_gate_proj": 111.4740982055664, "geo/layer_7/stable_rank_down_proj": 155.79930114746094, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5720356702804565, "geo/layer_7/attn_entropy_mean": 4.662432670593262, "geo/layer_7/attn_entropy_std": 0.8775591850280762, "geo/layer_14/stable_rank_q_proj": 59.42756271362305, "geo/layer_14/stable_rank_k_proj": 36.61918258666992, "geo/layer_14/stable_rank_o_proj": 51.63405227661133, "geo/layer_14/stable_rank_gate_proj": 94.8758773803711, "geo/layer_14/stable_rank_down_proj": 137.30848693847656, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3780333697795868, "geo/layer_14/attn_entropy_mean": 5.539127349853516, "geo/layer_14/attn_entropy_std": 0.5085225105285645, "geo/layer_21/stable_rank_q_proj": 49.88138961791992, "geo/layer_21/stable_rank_k_proj": 32.301551818847656, "geo/layer_21/stable_rank_o_proj": 87.3125228881836, "geo/layer_21/stable_rank_gate_proj": 94.6797103881836, "geo/layer_21/stable_rank_down_proj": 64.55081939697266, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15426814556121826, "geo/layer_21/attn_entropy_mean": 5.754685401916504, "geo/layer_21/attn_entropy_std": 0.2972802221775055, "geo/layer_27/stable_rank_q_proj": 42.150211334228516, "geo/layer_27/stable_rank_k_proj": 32.30944061279297, "geo/layer_27/stable_rank_o_proj": 116.9581069946289, "geo/layer_27/stable_rank_gate_proj": 93.07589721679688, "geo/layer_27/stable_rank_down_proj": 143.43124389648438, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07132266461849213, "geo/layer_27/attn_entropy_mean": 4.4737043380737305, "geo/layer_27/attn_entropy_std": 0.5133402943611145, "attnres/final_alpha/block_0": 0.2450534999370575, "attnres/block_norm/0": 1.547212839126587, "attnres/final_alpha/block_1": 0.007637079805135727, "attnres/block_norm/1": 24533.9140625, "attnres/final_alpha/block_2": 0.015910472720861435, "attnres/block_norm/2": 17962.96875, "attnres/final_alpha/block_3": 0.01798681914806366, "attnres/block_norm/3": 22986.01171875, "attnres/final_alpha/block_4": 0.023861113935709, "attnres/block_norm/4": 7790.49609375, "attnres/final_alpha/block_5": 0.5438665151596069, "attnres/block_norm/5": 4572.36865234375, "attnres/final_alpha/block_6": 0.14568454027175903, "attnres/block_norm/6": 15876.2880859375, "geo/tier1_time_s": 1.3544583320617676, "geo/step": 15675.0, "geo/rankme_slope": 0.00014435088488520408} {"step": 15680, "timestamp": 1778211484.5635412, "train/loss": 2.245354652404785, "train/z_loss": 0.0016463579144328833, "train/perplexity": 9.443764216309658, "train/grad_norm": 0.119140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1701927.5368108077, "perf/iters_per_sec": 0.8115422901205099, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2322216749191284, "data/tokens_consumed": 32885440512, "data/tokens_consumed_B": 32.885440512, "train/loss_slope": -1.4419978923208181e-05} {"step": 15690, "timestamp": 1778211494.9222224, "train/loss": 2.238370490074158, "train/z_loss": 0.0016496607102453708, "train/perplexity": 9.378037225170507, "train/grad_norm": 0.189453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025283.640255499, "perf/iters_per_sec": 0.9657304955747122, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035485577583313, "data/tokens_consumed": 32906412032, "data/tokens_consumed_B": 32.906412032, "train/loss_slope": -1.2747629061497764e-05} {"step": 15700, "timestamp": 1778211505.2682488, "grad/layer_0/attn": 0.0024703284725546837, "grad/layer_0/mlp": 0.002505915006622672, "grad/layer_0/attn_mlp_ratio": 0.9857989466706436, "grad/layer_4/attn": 0.0018592316191643476, "grad/layer_4/mlp": 0.002406972460448742, "grad/layer_4/attn_mlp_ratio": 0.7724357351284983, "grad/layer_8/attn": 0.006831523962318897, "grad/layer_8/mlp": 0.003764241235330701, "grad/layer_8/attn_mlp_ratio": 1.8148474961472483, "grad/layer_12/attn": 0.0036733432207256556, "grad/layer_12/mlp": 0.005942224990576506, "grad/layer_12/attn_mlp_ratio": 0.6181763842219705, "grad/layer_16/attn": 0.004068335052579641, "grad/layer_16/mlp": 0.00441586272791028, "grad/layer_16/attn_mlp_ratio": 0.9213001424921754, "grad/layer_20/attn": 0.00590141536667943, "grad/layer_20/mlp": 0.0058538587763905525, "grad/layer_20/attn_mlp_ratio": 1.0081239557176003, "grad/layer_24/attn": 0.0076652816496789455, "grad/layer_24/mlp": 0.009519655257463455, "grad/layer_24/attn_mlp_ratio": 0.8052057938914069, "grad/layer_27/attn": 0.008569917641580105, "grad/layer_27/mlp": 0.008207333274185658, "grad/layer_27/attn_mlp_ratio": 1.0441780845085293} {"step": 15700, "timestamp": 1778211505.2855635, "train/loss": 2.2191027402877808, "train/z_loss": 0.0016576012945733964, "train/perplexity": 9.199073204026876, "train/grad_norm": 0.1044921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024621.3593471562, "perf/iters_per_sec": 0.9654146954284459, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0358242988586426, "data/tokens_consumed": 32927383552, "data/tokens_consumed_B": 32.927383552, "train/loss_slope": -1.2705604743213588e-05} {"step": 15710, "timestamp": 1778211515.6439397, "train/loss": 2.260021758079529, "train/z_loss": 0.0016540131880901753, "train/perplexity": 9.583297678649002, "train/grad_norm": 0.171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025438.7960381215, "perf/iters_per_sec": 0.9658044796171767, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354062557220458, "data/tokens_consumed": 32948355072, "data/tokens_consumed_B": 32.948355072, "train/loss_slope": -1.701211578810926e-05} {"step": 15720, "timestamp": 1778211525.9934218, "train/loss": 2.2449867963790893, "train/z_loss": 0.0016472113551571966, "train/perplexity": 9.440290909615003, "train/grad_norm": 0.1533203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027312.6140168055, "perf/iters_per_sec": 0.9666979856571224, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344492435455321, "data/tokens_consumed": 32969326592, "data/tokens_consumed_B": 32.969326592, "train/loss_slope": -1.7551107503900725e-05} {"step": 15730, "timestamp": 1778211536.346261, "train/loss": 2.2864047050476075, "train/z_loss": 0.0016218562377616763, "train/perplexity": 9.839498113596825, "train/grad_norm": 0.126953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026650.8734742939, "perf/iters_per_sec": 0.9663824431773633, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347870111465454, "data/tokens_consumed": 32990298112, "data/tokens_consumed_B": 32.990298112, "train/loss_slope": -1.6636675683864784e-05} {"step": 15740, "timestamp": 1778211546.6966693, "train/loss": 2.268110227584839, "train/z_loss": 0.0016411101329140364, "train/perplexity": 9.661126222267445, "train/grad_norm": 0.21875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027237.0153818554, "perf/iters_per_sec": 0.9666619374188687, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034487819671631, "data/tokens_consumed": 33011269632, "data/tokens_consumed_B": 33.011269632, "train/loss_slope": -1.542349934685242e-05} {"step": 15750, "timestamp": 1778211557.0336056, "grad/layer_0/attn": 0.0028964432422071695, "grad/layer_0/mlp": 0.0028483811765909195, "grad/layer_0/attn_mlp_ratio": 1.0168734312401368, "grad/layer_4/attn": 0.002049139467999339, "grad/layer_4/mlp": 0.002582683227956295, "grad/layer_4/attn_mlp_ratio": 0.7934149130163957, "grad/layer_8/attn": 0.00809948518872261, "grad/layer_8/mlp": 0.00391627149656415, "grad/layer_8/attn_mlp_ratio": 2.0681622785887748, "grad/layer_12/attn": 0.004016467835754156, "grad/layer_12/mlp": 0.006047418341040611, "grad/layer_12/attn_mlp_ratio": 0.6641623818349539, "grad/layer_16/attn": 0.004758601076900959, "grad/layer_16/mlp": 0.00469018192961812, "grad/layer_16/attn_mlp_ratio": 1.0145877168201103, "grad/layer_20/attn": 0.004194962792098522, "grad/layer_20/mlp": 0.006576825398951769, "grad/layer_20/attn_mlp_ratio": 0.6378400632291561, "grad/layer_24/attn": 0.017256328836083412, "grad/layer_24/mlp": 0.01074429601430893, "grad/layer_24/attn_mlp_ratio": 1.6060920745754534, "grad/layer_27/attn": 0.009084580466151237, "grad/layer_27/mlp": 0.009690431877970695, "grad/layer_27/attn_mlp_ratio": 0.9374794113206983} {"step": 15750, "timestamp": 1778211557.6427307, "eos/sharpness": 57.25443363189696, "eos/L0_probe": 2.0810039043426514, "eos/L_plus": 2.2800562381744385, "eos/L_minus": 2.454495906829834, "eos/grad_norm": 0.1951431781053543, "eos/embed_grad_frac": 0.062475647777318954, "eos/time_s": 0.6064021587371826} {"step": 15750, "timestamp": 1778211557.6618805, "train/loss": 2.2928570985794066, "train/z_loss": 0.0016269795945845545, "train/perplexity": 9.90319169462482, "train/grad_norm": 0.1953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1913394.87373696, "perf/iters_per_sec": 0.9123777741131592, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.096037220954895, "data/tokens_consumed": 33032241152, "data/tokens_consumed_B": 33.032241152, "train/loss_slope": -1.4871916926876425e-05} {"step": 15750, "timestamp": 1778211559.0236104, "geo/rankme_last": 441.46759033203125, "geo/layer_0/stable_rank_q_proj": 16.04823112487793, "geo/layer_0/stable_rank_k_proj": 14.293293952941895, "geo/layer_0/stable_rank_o_proj": 53.65420150756836, "geo/layer_0/stable_rank_gate_proj": 156.76010131835938, "geo/layer_0/stable_rank_down_proj": 48.989376068115234, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.03832583874464035, "geo/layer_0/attn_entropy_mean": 6.297137260437012, "geo/layer_0/attn_entropy_std": 0.30495205521583557, "geo/layer_7/stable_rank_q_proj": 42.422733306884766, "geo/layer_7/stable_rank_k_proj": 42.77070999145508, "geo/layer_7/stable_rank_o_proj": 110.85894775390625, "geo/layer_7/stable_rank_gate_proj": 111.10458374023438, "geo/layer_7/stable_rank_down_proj": 154.65379333496094, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.580383837223053, "geo/layer_7/attn_entropy_mean": 4.663592338562012, "geo/layer_7/attn_entropy_std": 0.866654634475708, "geo/layer_14/stable_rank_q_proj": 59.468196868896484, "geo/layer_14/stable_rank_k_proj": 36.483787536621094, "geo/layer_14/stable_rank_o_proj": 51.56641387939453, "geo/layer_14/stable_rank_gate_proj": 94.66488647460938, "geo/layer_14/stable_rank_down_proj": 137.09063720703125, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39085566997528076, "geo/layer_14/attn_entropy_mean": 5.5506205558776855, "geo/layer_14/attn_entropy_std": 0.5266320109367371, "geo/layer_21/stable_rank_q_proj": 49.8101692199707, "geo/layer_21/stable_rank_k_proj": 32.29707717895508, "geo/layer_21/stable_rank_o_proj": 87.30995178222656, "geo/layer_21/stable_rank_gate_proj": 94.73065185546875, "geo/layer_21/stable_rank_down_proj": 64.46627044677734, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15568502247333527, "geo/layer_21/attn_entropy_mean": 5.764935493469238, "geo/layer_21/attn_entropy_std": 0.30463823676109314, "geo/layer_27/stable_rank_q_proj": 42.234554290771484, "geo/layer_27/stable_rank_k_proj": 32.190486907958984, "geo/layer_27/stable_rank_o_proj": 117.07237243652344, "geo/layer_27/stable_rank_gate_proj": 93.26995849609375, "geo/layer_27/stable_rank_down_proj": 143.4457244873047, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07040295749902725, "geo/layer_27/attn_entropy_mean": 4.447493076324463, "geo/layer_27/attn_entropy_std": 0.5111412405967712, "attnres/final_alpha/block_0": 0.24575337767601013, "attnres/block_norm/0": 1.5484671592712402, "attnres/final_alpha/block_1": 0.007774571888148785, "attnres/block_norm/1": 24546.45703125, "attnres/final_alpha/block_2": 0.01576765440404415, "attnres/block_norm/2": 18045.259765625, "attnres/final_alpha/block_3": 0.018293723464012146, "attnres/block_norm/3": 23236.29296875, "attnres/final_alpha/block_4": 0.024366911500692368, "attnres/block_norm/4": 7818.71826171875, "attnres/final_alpha/block_5": 0.5388213396072388, "attnres/block_norm/5": 4647.21435546875, "attnres/final_alpha/block_6": 0.14922243356704712, "attnres/block_norm/6": 15880.80859375, "geo/tier1_time_s": 1.3576037883758545, "geo/step": 15750.0, "geo/rankme_slope": 0.00016832725277611044} {"step": 15760, "timestamp": 1778211569.8669703, "train/loss": 2.225931906700134, "train/z_loss": 0.0016589997336268424, "train/perplexity": 9.262110205873906, "train/grad_norm": 0.1337890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1718798.4799869861, "perf/iters_per_sec": 0.8195869827208453, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.220126748085022, "data/tokens_consumed": 33053212672, "data/tokens_consumed_B": 33.053212672, "train/loss_slope": -1.6154588738826914e-05} {"step": 15770, "timestamp": 1778211580.2213628, "train/loss": 2.2420957565307615, "train/z_loss": 0.0016474649542942642, "train/perplexity": 9.4130380659259, "train/grad_norm": 0.2099609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026303.010601074, "perf/iters_per_sec": 0.9662165692334528, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034964656829834, "data/tokens_consumed": 33074184192, "data/tokens_consumed_B": 33.074184192, "train/loss_slope": -1.424517734537891e-05} {"step": 15780, "timestamp": 1778211590.5827653, "train/loss": 2.2732774019241333, "train/z_loss": 0.0016427669092081488, "train/perplexity": 9.711176142744481, "train/grad_norm": 0.2255859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025047.6177470663, "perf/iters_per_sec": 0.9656179512725193, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0356062650680542, "data/tokens_consumed": 33095155712, "data/tokens_consumed_B": 33.095155712, "train/loss_slope": -1.0807952986727838e-05} {"step": 15790, "timestamp": 1778211600.9401376, "train/loss": 2.241033339500427, "train/z_loss": 0.001646722515579313, "train/perplexity": 9.403042804485624, "train/grad_norm": 0.181640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026135.1211463271, "perf/iters_per_sec": 0.966136513302959, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350504159927367, "data/tokens_consumed": 33116127232, "data/tokens_consumed_B": 33.116127232, "train/loss_slope": -1.2909049341137158e-05} {"step": 15800, "timestamp": 1778211611.2832475, "grad/layer_0/attn": 0.002791487844660878, "grad/layer_0/mlp": 0.002710496075451374, "grad/layer_0/attn_mlp_ratio": 1.029880753916215, "grad/layer_4/attn": 0.0034494546707719564, "grad/layer_4/mlp": 0.002559542888775468, "grad/layer_4/attn_mlp_ratio": 1.3476838192986313, "grad/layer_8/attn": 0.004937713034451008, "grad/layer_8/mlp": 0.003861493431031704, "grad/layer_8/attn_mlp_ratio": 1.2787055046889493, "grad/layer_12/attn": 0.004525585100054741, "grad/layer_12/mlp": 0.006086237728595734, "grad/layer_12/attn_mlp_ratio": 0.7435767755889554, "grad/layer_16/attn": 0.005036945920437574, "grad/layer_16/mlp": 0.004971290472894907, "grad/layer_16/attn_mlp_ratio": 1.0132069020267378, "grad/layer_20/attn": 0.004978008568286896, "grad/layer_20/mlp": 0.007143157534301281, "grad/layer_20/attn_mlp_ratio": 0.6968918821534348, "grad/layer_24/attn": 0.015599699690937996, "grad/layer_24/mlp": 0.015910688787698746, "grad/layer_24/attn_mlp_ratio": 0.9804540709107077, "grad/layer_27/attn": 0.006124837324023247, "grad/layer_27/mlp": 0.013383015058934689, "grad/layer_27/attn_mlp_ratio": 0.45765750477643447} {"step": 15800, "timestamp": 1778211611.299183, "train/loss": 2.2123054027557374, "train/z_loss": 0.00165753971086815, "train/perplexity": 9.13675603383768, "train/grad_norm": 0.197265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025423.964971865, "perf/iters_per_sec": 0.9657974076136899, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354138374328614, "data/tokens_consumed": 33137098752, "data/tokens_consumed_B": 33.137098752, "train/loss_slope": -1.4274876431735003e-05} {"step": 15810, "timestamp": 1778211621.685631, "train/loss": 2.250759983062744, "train/z_loss": 0.0016425895388238131, "train/perplexity": 9.494949095528163, "train/grad_norm": 0.11376953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020815.2851712161, "perf/iters_per_sec": 0.9635998178344803, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037775206565857, "data/tokens_consumed": 33158070272, "data/tokens_consumed_B": 33.158070272, "train/loss_slope": -1.2382117671148266e-05} {"step": 15820, "timestamp": 1778211632.0453594, "train/loss": 2.232792377471924, "train/z_loss": 0.0016573477536439896, "train/perplexity": 9.325871107044833, "train/grad_norm": 0.263671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025503.4396369054, "perf/iters_per_sec": 0.9658353040871169, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0353732109069824, "data/tokens_consumed": 33179041792, "data/tokens_consumed_B": 33.179041792, "train/loss_slope": -1.4313870022828439e-05} {"step": 15825, "timestamp": 1778211637.826419, "eos/sharpness": 46.91312313079833, "eos/L0_probe": 2.080254077911377, "eos/L_plus": 2.3675265312194824, "eos/L_minus": 2.262112855911255, "eos/grad_norm": 0.27046632766723633, "eos/embed_grad_frac": 0.0379527248442173, "eos/time_s": 0.6163575649261475} {"step": 15825, "timestamp": 1778211639.1995072, "geo/rankme_last": 440.2745361328125, "geo/layer_0/stable_rank_q_proj": 16.060131072998047, "geo/layer_0/stable_rank_k_proj": 14.265241622924805, "geo/layer_0/stable_rank_o_proj": 53.511634826660156, "geo/layer_0/stable_rank_gate_proj": 157.2041778564453, "geo/layer_0/stable_rank_down_proj": 48.95676040649414, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.03770577907562256, "geo/layer_0/attn_entropy_mean": 6.295735836029053, "geo/layer_0/attn_entropy_std": 0.3045055568218231, "geo/layer_7/stable_rank_q_proj": 42.53427505493164, "geo/layer_7/stable_rank_k_proj": 42.70429992675781, "geo/layer_7/stable_rank_o_proj": 110.63488006591797, "geo/layer_7/stable_rank_gate_proj": 111.01988983154297, "geo/layer_7/stable_rank_down_proj": 155.0140380859375, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.568654477596283, "geo/layer_7/attn_entropy_mean": 4.661952018737793, "geo/layer_7/attn_entropy_std": 0.8673452138900757, "geo/layer_14/stable_rank_q_proj": 59.61265563964844, "geo/layer_14/stable_rank_k_proj": 36.37205123901367, "geo/layer_14/stable_rank_o_proj": 51.62348937988281, "geo/layer_14/stable_rank_gate_proj": 94.60459899902344, "geo/layer_14/stable_rank_down_proj": 136.92153930664062, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3752204179763794, "geo/layer_14/attn_entropy_mean": 5.576112747192383, "geo/layer_14/attn_entropy_std": 0.5193005204200745, "geo/layer_21/stable_rank_q_proj": 49.80802917480469, "geo/layer_21/stable_rank_k_proj": 32.266700744628906, "geo/layer_21/stable_rank_o_proj": 87.34428405761719, "geo/layer_21/stable_rank_gate_proj": 94.58663177490234, "geo/layer_21/stable_rank_down_proj": 64.49800109863281, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15543173253536224, "geo/layer_21/attn_entropy_mean": 5.7761616706848145, "geo/layer_21/attn_entropy_std": 0.2926556169986725, "geo/layer_27/stable_rank_q_proj": 42.18824768066406, "geo/layer_27/stable_rank_k_proj": 32.13310623168945, "geo/layer_27/stable_rank_o_proj": 116.67395782470703, "geo/layer_27/stable_rank_gate_proj": 93.01741027832031, "geo/layer_27/stable_rank_down_proj": 142.92532348632812, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07345492392778397, "geo/layer_27/attn_entropy_mean": 4.457289218902588, "geo/layer_27/attn_entropy_std": 0.5286750793457031, "attnres/final_alpha/block_0": 0.24361950159072876, "attnres/block_norm/0": 1.5496973991394043, "attnres/final_alpha/block_1": 0.007475698366761208, "attnres/block_norm/1": 24705.2734375, "attnres/final_alpha/block_2": 0.015640776604413986, "attnres/block_norm/2": 18092.892578125, "attnres/final_alpha/block_3": 0.017850598320364952, "attnres/block_norm/3": 23389.0390625, "attnres/final_alpha/block_4": 0.02410261705517769, "attnres/block_norm/4": 7828.447265625, "attnres/final_alpha/block_5": 0.5473051071166992, "attnres/block_norm/5": 4589.892578125, "attnres/final_alpha/block_6": 0.1440056711435318, "attnres/block_norm/6": 16044.818359375, "geo/tier1_time_s": 1.3530077934265137, "geo/step": 15825.0, "geo/rankme_slope": 0.000154831893694978} {"step": 15830, "timestamp": 1778211644.3777304, "train/loss": 2.266247367858887, "train/z_loss": 0.0016434668912552297, "train/perplexity": 9.643145652160342, "train/grad_norm": 0.14453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1701187.7886658963, "perf/iters_per_sec": 0.8111895507173044, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2327574968338013, "data/tokens_consumed": 33200013312, "data/tokens_consumed_B": 33.200013312, "train/loss_slope": -1.446356003684325e-05} {"step": 15840, "timestamp": 1778211654.7304218, "train/loss": 2.249410843849182, "train/z_loss": 0.0016514399321749806, "train/perplexity": 9.482147724730009, "train/grad_norm": 0.1630859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026595.91524237, "perf/iters_per_sec": 0.9663562370502329, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348150730133057, "data/tokens_consumed": 33220984832, "data/tokens_consumed_B": 33.220984832, "train/loss_slope": -1.6654997747508833e-05} {"step": 15850, "timestamp": 1778211665.0681987, "grad/layer_0/attn": 0.0035353926941752434, "grad/layer_0/mlp": 0.0030158255249261856, "grad/layer_0/attn_mlp_ratio": 1.1722802090925846, "grad/layer_4/attn": 0.0019370578229427338, "grad/layer_4/mlp": 0.0025237814988940954, "grad/layer_4/attn_mlp_ratio": 0.7675219693302858, "grad/layer_8/attn": 0.009105097502470016, "grad/layer_8/mlp": 0.0037965562660247087, "grad/layer_8/attn_mlp_ratio": 2.3982516324402066, "grad/layer_12/attn": 0.004691665526479483, "grad/layer_12/mlp": 0.0064047095365822315, "grad/layer_12/attn_mlp_ratio": 0.7325336811026946, "grad/layer_16/attn": 0.00476357527077198, "grad/layer_16/mlp": 0.005080864764750004, "grad/layer_16/attn_mlp_ratio": 0.9375520502071777, "grad/layer_20/attn": 0.007924117147922516, "grad/layer_20/mlp": 0.006851773243397474, "grad/layer_20/attn_mlp_ratio": 1.1565060241752418, "grad/layer_24/attn": 0.018780002370476723, "grad/layer_24/mlp": 0.012498730793595314, "grad/layer_24/attn_mlp_ratio": 1.5025527415827555, "grad/layer_27/attn": 0.015207167714834213, "grad/layer_27/mlp": 0.011285295709967613, "grad/layer_27/attn_mlp_ratio": 1.3475205232460676} {"step": 15850, "timestamp": 1778211665.0839477, "train/loss": 2.2554617881774903, "train/z_loss": 0.0016460805432870983, "train/perplexity": 9.539697612715448, "train/grad_norm": 0.2294921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026558.0952509094, "perf/iters_per_sec": 0.9663382030729816, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034834384918213, "data/tokens_consumed": 33241956352, "data/tokens_consumed_B": 33.241956352, "train/loss_slope": -1.800619044391164e-05} {"step": 15860, "timestamp": 1778211675.4367638, "train/loss": 2.229615879058838, "train/z_loss": 0.0016379375825636088, "train/perplexity": 9.296294492177108, "train/grad_norm": 0.138671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026706.254962711, "perf/iters_per_sec": 0.9664088511289172, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034758734703064, "data/tokens_consumed": 33262927872, "data/tokens_consumed_B": 33.262927872, "train/loss_slope": -2.0696366173062913e-05} {"step": 15870, "timestamp": 1778211685.793556, "train/loss": 2.2533992528915405, "train/z_loss": 0.0016373404767364263, "train/perplexity": 9.520041927012622, "train/grad_norm": 0.18359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025842.2563627437, "perf/iters_per_sec": 0.9659968644918173, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035200047492981, "data/tokens_consumed": 33283899392, "data/tokens_consumed_B": 33.283899392, "train/loss_slope": -1.9883099412045367e-05} {"step": 15880, "timestamp": 1778211696.1450465, "train/loss": 2.2373889207839968, "train/z_loss": 0.0016462766798213124, "train/perplexity": 9.368836548118034, "train/grad_norm": 0.2412109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027004.7880932472, "perf/iters_per_sec": 0.9665512028185116, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346063375473022, "data/tokens_consumed": 33304870912, "data/tokens_consumed_B": 33.304870912, "train/loss_slope": -1.8573290243758223e-05} {"step": 15890, "timestamp": 1778211706.4979796, "train/loss": 2.2973942041397093, "train/z_loss": 0.001634428184479475, "train/perplexity": 9.948225605277337, "train/grad_norm": 0.1748046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026715.0807914834, "perf/iters_per_sec": 0.9664130596120278, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034754228591919, "data/tokens_consumed": 33325842432, "data/tokens_consumed_B": 33.325842432, "train/loss_slope": -1.5400151336582905e-05} {"step": 15900, "timestamp": 1778211716.8352761, "grad/layer_0/attn": 0.0025354379322379827, "grad/layer_0/mlp": 0.002646249020472169, "grad/layer_0/attn_mlp_ratio": 0.9581251865605102, "grad/layer_4/attn": 0.002131502842530608, "grad/layer_4/mlp": 0.0024671040009707212, "grad/layer_4/attn_mlp_ratio": 0.8639695591653123, "grad/layer_8/attn": 0.00831475667655468, "grad/layer_8/mlp": 0.003798488527536392, "grad/layer_8/attn_mlp_ratio": 2.188964478208121, "grad/layer_12/attn": 0.004083408508449793, "grad/layer_12/mlp": 0.0057401699014008045, "grad/layer_12/attn_mlp_ratio": 0.7113741417855738, "grad/layer_16/attn": 0.0041269175708293915, "grad/layer_16/mlp": 0.004460480064153671, "grad/layer_16/attn_mlp_ratio": 0.9252182318834345, "grad/layer_20/attn": 0.004337640013545752, "grad/layer_20/mlp": 0.006757606286555529, "grad/layer_20/attn_mlp_ratio": 0.6418900074108524, "grad/layer_24/attn": 0.011290766298770905, "grad/layer_24/mlp": 0.010414515621960163, "grad/layer_24/attn_mlp_ratio": 1.0841374289697474, "grad/layer_27/attn": 0.012668139301240444, "grad/layer_27/mlp": 0.008942333050072193, "grad/layer_27/attn_mlp_ratio": 1.416648103871882} {"step": 15900, "timestamp": 1778211717.443714, "eos/sharpness": 45.83220481872558, "eos/L0_probe": 2.0761091709136963, "eos/L_plus": 2.260589599609375, "eos/L_minus": 2.3499507904052734, "eos/grad_norm": 0.16204120218753815, "eos/embed_grad_frac": 0.08974432200193405, "eos/time_s": 0.605677604675293} {"step": 15900, "timestamp": 1778211717.4629838, "train/loss": 2.2430554151535036, "train/z_loss": 0.0016442179214209319, "train/perplexity": 9.422075704902554, "train/grad_norm": 0.162109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1913457.8910057966, "perf/iters_per_sec": 0.912407823088549, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0960011243820191, "data/tokens_consumed": 33346813952, "data/tokens_consumed_B": 33.346813952, "train/loss_slope": -1.6752295132123945e-05} {"step": 15900, "timestamp": 1778211718.8266127, "geo/rankme_last": 440.62908935546875, "geo/layer_0/stable_rank_q_proj": 16.10603904724121, "geo/layer_0/stable_rank_k_proj": 14.28164291381836, "geo/layer_0/stable_rank_o_proj": 53.42647933959961, "geo/layer_0/stable_rank_gate_proj": 157.03623962402344, "geo/layer_0/stable_rank_down_proj": 49.017093658447266, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.04131879284977913, "geo/layer_0/attn_entropy_mean": 6.2947998046875, "geo/layer_0/attn_entropy_std": 0.3040696978569031, "geo/layer_7/stable_rank_q_proj": 42.55429458618164, "geo/layer_7/stable_rank_k_proj": 42.61813735961914, "geo/layer_7/stable_rank_o_proj": 110.50491333007812, "geo/layer_7/stable_rank_gate_proj": 110.8917465209961, "geo/layer_7/stable_rank_down_proj": 155.03195190429688, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5842193365097046, "geo/layer_7/attn_entropy_mean": 4.629720687866211, "geo/layer_7/attn_entropy_std": 0.8748379349708557, "geo/layer_14/stable_rank_q_proj": 59.413902282714844, "geo/layer_14/stable_rank_k_proj": 36.28886032104492, "geo/layer_14/stable_rank_o_proj": 51.66439437866211, "geo/layer_14/stable_rank_gate_proj": 94.43869018554688, "geo/layer_14/stable_rank_down_proj": 136.5525665283203, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3799494504928589, "geo/layer_14/attn_entropy_mean": 5.540121555328369, "geo/layer_14/attn_entropy_std": 0.5473936200141907, "geo/layer_21/stable_rank_q_proj": 49.74024200439453, "geo/layer_21/stable_rank_k_proj": 32.195648193359375, "geo/layer_21/stable_rank_o_proj": 87.21947479248047, "geo/layer_21/stable_rank_gate_proj": 94.36199951171875, "geo/layer_21/stable_rank_down_proj": 64.41117095947266, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15473191440105438, "geo/layer_21/attn_entropy_mean": 5.757212162017822, "geo/layer_21/attn_entropy_std": 0.31172478199005127, "geo/layer_27/stable_rank_q_proj": 42.201873779296875, "geo/layer_27/stable_rank_k_proj": 32.08549118041992, "geo/layer_27/stable_rank_o_proj": 116.87117004394531, "geo/layer_27/stable_rank_gate_proj": 93.0223159790039, "geo/layer_27/stable_rank_down_proj": 142.8092041015625, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.06922082602977753, "geo/layer_27/attn_entropy_mean": 4.4398932456970215, "geo/layer_27/attn_entropy_std": 0.5251577496528625, "attnres/final_alpha/block_0": 0.24533331394195557, "attnres/block_norm/0": 1.551018238067627, "attnres/final_alpha/block_1": 0.0076737189665436745, "attnres/block_norm/1": 24709.3359375, "attnres/final_alpha/block_2": 0.015820806846022606, "attnres/block_norm/2": 18104.146484375, "attnres/final_alpha/block_3": 0.01819426566362381, "attnres/block_norm/3": 23474.83984375, "attnres/final_alpha/block_4": 0.024241600185632706, "attnres/block_norm/4": 7829.59228515625, "attnres/final_alpha/block_5": 0.5438679456710815, "attnres/block_norm/5": 4582.02294921875, "attnres/final_alpha/block_6": 0.1448683738708496, "attnres/block_norm/6": 15996.26171875, "geo/tier1_time_s": 1.3596529960632324, "geo/step": 15900.0, "geo/rankme_slope": 0.0001052313112745098} {"step": 15910, "timestamp": 1778211729.1756167, "train/loss": 2.300224471092224, "train/z_loss": 0.001631590654142201, "train/perplexity": 9.976421621749115, "train/grad_norm": 0.2001953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1791070.2089669767, "perf/iters_per_sec": 0.8540488285860904, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1708932399749756, "data/tokens_consumed": 33367785472, "data/tokens_consumed_B": 33.367785472, "train/loss_slope": -1.3320974641256819e-05} {"step": 15920, "timestamp": 1778211739.5269513, "train/loss": 2.290941023826599, "train/z_loss": 0.0016264563892036676, "train/perplexity": 9.88423460644539, "train/grad_norm": 0.1552734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026917.1152232585, "perf/iters_per_sec": 0.9665093971363347, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346510887145997, "data/tokens_consumed": 33388756992, "data/tokens_consumed_B": 33.388756992, "train/loss_slope": -7.041228672351448e-06} {"step": 15930, "timestamp": 1778211749.878544, "train/loss": 2.295253872871399, "train/z_loss": 0.0016271200613118709, "train/perplexity": 9.92695587720194, "train/grad_norm": 0.1982421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026756.4558072686, "perf/iters_per_sec": 0.9664327887569755, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347331047058106, "data/tokens_consumed": 33409728512, "data/tokens_consumed_B": 33.409728512, "train/loss_slope": -3.487933408094484e-06} {"step": 15940, "timestamp": 1778211760.6289513, "train/loss": 2.27878315448761, "train/z_loss": 0.0016259032301604748, "train/perplexity": 9.76479093514035, "train/grad_norm": 0.12109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1951753.895535763, "perf/iters_per_sec": 0.9306687810591521, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0744961261749268, "data/tokens_consumed": 33430700032, "data/tokens_consumed_B": 33.430700032, "train/loss_slope": -3.378248186108538e-06} {"step": 15950, "timestamp": 1778211770.96755, "grad/layer_0/attn": 0.0026535133365541697, "grad/layer_0/mlp": 0.002683429978787899, "grad/layer_0/attn_mlp_ratio": 0.9888513054727169, "grad/layer_4/attn": 0.002397692296653986, "grad/layer_4/mlp": 0.002540143672376871, "grad/layer_4/attn_mlp_ratio": 0.9439199161590802, "grad/layer_8/attn": 0.008450079709291458, "grad/layer_8/mlp": 0.0038555844221264124, "grad/layer_8/attn_mlp_ratio": 2.1916468594575416, "grad/layer_12/attn": 0.003716361476108432, "grad/layer_12/mlp": 0.005818309262394905, "grad/layer_12/attn_mlp_ratio": 0.6387356265598639, "grad/layer_16/attn": 0.004052037373185158, "grad/layer_16/mlp": 0.0042593711987137794, "grad/layer_16/attn_mlp_ratio": 0.95132288054079, "grad/layer_20/attn": 0.008702768012881279, "grad/layer_20/mlp": 0.0057536023668944836, "grad/layer_20/attn_mlp_ratio": 1.5125772180048813, "grad/layer_24/attn": 0.004085133783519268, "grad/layer_24/mlp": 0.008579869754612446, "grad/layer_24/attn_mlp_ratio": 0.4761300407515091, "grad/layer_27/attn": 0.004116285126656294, "grad/layer_27/mlp": 0.007706248667091131, "grad/layer_27/attn_mlp_ratio": 0.5341490070025423} {"step": 15950, "timestamp": 1778211770.9833088, "train/loss": 2.288125419616699, "train/z_loss": 0.0016382912406697869, "train/perplexity": 9.856443656393514, "train/grad_norm": 0.0966796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026632.802766661, "perf/iters_per_sec": 0.9663738263924889, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347962379455566, "data/tokens_consumed": 33451671552, "data/tokens_consumed_B": 33.451671552, "train/loss_slope": 1.3869463986117712e-06} {"step": 15960, "timestamp": 1778211781.841186, "train/loss": 2.275034284591675, "train/z_loss": 0.0016469595837406814, "train/perplexity": 9.728252536008402, "train/grad_norm": 0.171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1932495.6744955934, "perf/iters_per_sec": 0.92148574566631, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0852039813995362, "data/tokens_consumed": 33472643072, "data/tokens_consumed_B": 33.472643072, "train/loss_slope": 2.8486450835101653e-06} {"step": 15970, "timestamp": 1778211792.1988924, "train/loss": 2.2088504791259767, "train/z_loss": 0.0016597492853179575, "train/perplexity": 9.105243707203533, "train/grad_norm": 0.1591796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025988.0248716653, "perf/iters_per_sec": 0.966066372333367, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351255655288696, "data/tokens_consumed": 33493614592, "data/tokens_consumed_B": 33.493614592, "train/loss_slope": 8.225837127246676e-07} {"step": 15975, "timestamp": 1778211797.9687998, "eos/sharpness": 54.73749637603759, "eos/L0_probe": 2.071796178817749, "eos/L_plus": 2.270214796066284, "eos/L_minus": 2.42075252532959, "eos/grad_norm": 0.2038925290107727, "eos/embed_grad_frac": 0.060530561953783035, "eos/time_s": 0.6055808067321777} {"step": 15975, "timestamp": 1778211799.3451989, "geo/rankme_last": 440.6584777832031, "geo/layer_0/stable_rank_q_proj": 16.124755859375, "geo/layer_0/stable_rank_k_proj": 14.298669815063477, "geo/layer_0/stable_rank_o_proj": 53.31278991699219, "geo/layer_0/stable_rank_gate_proj": 157.05316162109375, "geo/layer_0/stable_rank_down_proj": 48.99143600463867, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.04452041909098625, "geo/layer_0/attn_entropy_mean": 6.295539855957031, "geo/layer_0/attn_entropy_std": 0.30312007665634155, "geo/layer_7/stable_rank_q_proj": 42.762123107910156, "geo/layer_7/stable_rank_k_proj": 42.553707122802734, "geo/layer_7/stable_rank_o_proj": 110.19285583496094, "geo/layer_7/stable_rank_gate_proj": 110.72379302978516, "geo/layer_7/stable_rank_down_proj": 154.69911193847656, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5765228271484375, "geo/layer_7/attn_entropy_mean": 4.665007591247559, "geo/layer_7/attn_entropy_std": 0.8813517093658447, "geo/layer_14/stable_rank_q_proj": 59.29966735839844, "geo/layer_14/stable_rank_k_proj": 36.328208923339844, "geo/layer_14/stable_rank_o_proj": 51.71958923339844, "geo/layer_14/stable_rank_gate_proj": 94.2990951538086, "geo/layer_14/stable_rank_down_proj": 136.53981018066406, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39777565002441406, "geo/layer_14/attn_entropy_mean": 5.552639007568359, "geo/layer_14/attn_entropy_std": 0.5318526029586792, "geo/layer_21/stable_rank_q_proj": 49.89952850341797, "geo/layer_21/stable_rank_k_proj": 32.17388916015625, "geo/layer_21/stable_rank_o_proj": 87.40028381347656, "geo/layer_21/stable_rank_gate_proj": 94.42937469482422, "geo/layer_21/stable_rank_down_proj": 64.313232421875, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15874816477298737, "geo/layer_21/attn_entropy_mean": 5.775102138519287, "geo/layer_21/attn_entropy_std": 0.2810612618923187, "geo/layer_27/stable_rank_q_proj": 42.170753479003906, "geo/layer_27/stable_rank_k_proj": 32.09322738647461, "geo/layer_27/stable_rank_o_proj": 116.66374969482422, "geo/layer_27/stable_rank_gate_proj": 93.06805419921875, "geo/layer_27/stable_rank_down_proj": 142.7618865966797, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07334290444850922, "geo/layer_27/attn_entropy_mean": 4.438264846801758, "geo/layer_27/attn_entropy_std": 0.5242279171943665, "attnres/final_alpha/block_0": 0.24595162272453308, "attnres/block_norm/0": 1.5520257949829102, "attnres/final_alpha/block_1": 0.007613996975123882, "attnres/block_norm/1": 24852.33203125, "attnres/final_alpha/block_2": 0.01585591584444046, "attnres/block_norm/2": 18243.2109375, "attnres/final_alpha/block_3": 0.018670029938220978, "attnres/block_norm/3": 23581.01171875, "attnres/final_alpha/block_4": 0.024123063310980797, "attnres/block_norm/4": 7892.68212890625, "attnres/final_alpha/block_5": 0.5426840782165527, "attnres/block_norm/5": 4595.43798828125, "attnres/final_alpha/block_6": 0.14510127902030945, "attnres/block_norm/6": 16096.9697265625, "geo/tier1_time_s": 1.3565373420715332, "geo/step": 15975.0, "geo/rankme_slope": 9.648967008678472e-05} {"step": 15980, "timestamp": 1778211804.5288153, "train/loss": 2.2430567741394043, "train/z_loss": 0.0016423950903117657, "train/perplexity": 9.422088509379293, "train/grad_norm": 0.177734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1701630.7571513245, "perf/iters_per_sec": 0.8114007745510695, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2324365854263306, "data/tokens_consumed": 33514586112, "data/tokens_consumed_B": 33.514586112, "train/loss_slope": 3.008108141899454e-06} {"step": 15990, "timestamp": 1778211814.8901527, "train/loss": 2.2776257514953615, "train/z_loss": 0.001646132383029908, "train/perplexity": 9.753495674738343, "train/grad_norm": 0.2275390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025518.8782207419, "perf/iters_per_sec": 0.9658426657775602, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0353653192520142, "data/tokens_consumed": 33535557632, "data/tokens_consumed_B": 33.535557632, "train/loss_slope": 4.479957692729788e-06} {"step": 16000, "timestamp": 1778211825.7891371, "grad/layer_0/attn": 0.00282518588937819, "grad/layer_0/mlp": 0.0027255478780716658, "grad/layer_0/attn_mlp_ratio": 1.0365570197656246, "grad/layer_4/attn": 0.0015461058355867863, "grad/layer_4/mlp": 0.0026417451445013285, "grad/layer_4/attn_mlp_ratio": 0.5852592481447386, "grad/layer_8/attn": 0.007271467708051205, "grad/layer_8/mlp": 0.0039496333338320255, "grad/layer_8/attn_mlp_ratio": 1.8410487529715536, "grad/layer_12/attn": 0.004414265975356102, "grad/layer_12/mlp": 0.006231037434190512, "grad/layer_12/attn_mlp_ratio": 0.708431934671305, "grad/layer_16/attn": 0.003864316502586007, "grad/layer_16/mlp": 0.004210447892546654, "grad/layer_16/attn_mlp_ratio": 0.9177922419245246, "grad/layer_20/attn": 0.004989085718989372, "grad/layer_20/mlp": 0.007169252727180719, "grad/layer_20/attn_mlp_ratio": 0.6959003733379718, "grad/layer_24/attn": 0.014188213273882866, "grad/layer_24/mlp": 0.012217389419674873, "grad/layer_24/attn_mlp_ratio": 1.161312999887102, "grad/layer_27/attn": 0.010686651803553104, "grad/layer_27/mlp": 0.011494938284158707, "grad/layer_27/attn_mlp_ratio": 0.9296832611369619} {"step": 16000, "timestamp": 1778211825.805419, "train/loss": 2.298789882659912, "train/z_loss": 0.0016296018147841096, "train/perplexity": 9.962119823744487, "train/grad_norm": 0.2119140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1922158.0594240811, "perf/iters_per_sec": 0.9165563866730123, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.091040349006653, "data/tokens_consumed": 33556529152, "data/tokens_consumed_B": 33.556529152, "train/loss_slope": 1.0245998163964397e-05} {"step": 16000, "timestamp": 1778211832.8055575, "geo/ww_alpha_mean": 8.070097533075138, "geo/ww_alpha_std": 4.835631823683465, "geo/ww_alpha_min": 1.365225486602667, "geo/ww_alpha_max": 30.313911108201893, "geo/ww_alpha_healthy_frac": 0.18274111675126903, "geo/ww_alpha_by_type/q_proj": 4.194358288759804, "geo/ww_alpha_by_type/k_proj": 4.792276893628087, "geo/ww_alpha_by_type/v_proj": 7.679453790204169, "geo/ww_alpha_by_type/o_proj": 8.724644499546281, "geo/ww_alpha_by_type/gate_proj": 9.484288609422572, "geo/ww_alpha_by_type/up_proj": 12.491453825488849, "geo/ww_alpha_by_type/down_proj": 9.26196614179553, "geo/twonn_id/layer_0": 0.7420014142990112, "geo/twonn_id/layer_7": 3.0592663288116455, "geo/twonn_id/layer_14": 3.9501051902770996, "geo/twonn_id/layer_21": 7.9597249031066895, "geo/twonn_id/layer_27": 6.929327964782715, "geo/tier2_time_s": 6.992931365966797} {"step": 16000, "timestamp": 1778211833.4365299, "eoc/jacobian_sigma/layer_0/attn": 788.1511840820312, "eoc/jacobian_sigma/layer_0/mlp": 4688.58154296875, "eoc/jacobian_sigma/layer_0": 4688.58154296875, "eoc/jacobian_sigma/layer_7/attn": 1.138405680656433, "eoc/jacobian_sigma/layer_7/mlp": 1.711683750152588, "eoc/jacobian_sigma/layer_7": 1.711683750152588, "eoc/jacobian_sigma/layer_14/attn": 1.5021216869354248, "eoc/jacobian_sigma/layer_14/mlp": 7.042079448699951, "eoc/jacobian_sigma/layer_14": 7.042079448699951, "eoc/jacobian_sigma/layer_21/attn": 1.0830700397491455, "eoc/jacobian_sigma/layer_21/mlp": 4.192724704742432, "eoc/jacobian_sigma/layer_21": 4.192724704742432, "eoc/jacobian_sigma/layer_27/attn": 2.7988979816436768, "eoc/jacobian_sigma/layer_27/mlp": 24.028730392456055, "eoc/jacobian_sigma/layer_27": 24.028730392456055, "eoc/layer0_sigma": 4688.58154296875, "eoc/sigma_max": 24.028730392456055, "eoc/sigma_min": 1.711683750152588, "eoc/sigma_mean": 9.243804574012756, "eoc/time_s": 0.6217355728149414} {"step": 16010, "timestamp": 1778211843.8071458, "train/loss": 2.226759505271912, "train/z_loss": 0.0016434165881946683, "train/perplexity": 9.2697786878266, "train/grad_norm": 0.16015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1165340.9824001456, "perf/iters_per_sec": 0.5556778823853233, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.7996037483215332, "data/tokens_consumed": 33577500672, "data/tokens_consumed_B": 33.577500672, "train/loss_slope": 5.9996412210267676e-06} {"step": 16020, "timestamp": 1778211854.587007, "train/loss": 2.2780869722366335, "train/z_loss": 0.001634880597703159, "train/perplexity": 9.757995226807045, "train/grad_norm": 0.1650390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1946502.4253367318, "perf/iters_per_sec": 0.9281646849330577, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0773950099945069, "data/tokens_consumed": 33598472192, "data/tokens_consumed_B": 33.598472192, "train/loss_slope": 8.843326668749414e-06} {"step": 16030, "timestamp": 1778211864.9442723, "train/loss": 2.2994776725769044, "train/z_loss": 0.0016272416105493903, "train/perplexity": 9.968974026166585, "train/grad_norm": 0.1494140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025981.585230907, "perf/iters_per_sec": 0.966063301673368, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351288557052611, "data/tokens_consumed": 33619443712, "data/tokens_consumed_B": 33.619443712, "train/loss_slope": 1.016159534025158e-05} {"step": 16040, "timestamp": 1778211875.2998273, "train/loss": 2.228658843040466, "train/z_loss": 0.0016465066932141782, "train/perplexity": 9.287401859474343, "train/grad_norm": 0.09521484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026124.5735579212, "perf/iters_per_sec": 0.9661314838208777, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350558042526246, "data/tokens_consumed": 33640415232, "data/tokens_consumed_B": 33.640415232, "train/loss_slope": 9.424042208145441e-06} {"step": 16050, "timestamp": 1778211885.643035, "grad/layer_0/attn": 0.002699443604797125, "grad/layer_0/mlp": 0.0026698254514485598, "grad/layer_0/attn_mlp_ratio": 1.011093628695139, "grad/layer_4/attn": 0.0018457785481587052, "grad/layer_4/mlp": 0.0024573709815740585, "grad/layer_4/attn_mlp_ratio": 0.7511191785395308, "grad/layer_8/attn": 0.010158245451748371, "grad/layer_8/mlp": 0.003902701660990715, "grad/layer_8/attn_mlp_ratio": 2.6028751551770286, "grad/layer_12/attn": 0.004087082110345364, "grad/layer_12/mlp": 0.005544718354940414, "grad/layer_12/attn_mlp_ratio": 0.7371126493724351, "grad/layer_16/attn": 0.005518505349755287, "grad/layer_16/mlp": 0.004489106126129627, "grad/layer_16/attn_mlp_ratio": 1.229310484486614, "grad/layer_20/attn": 0.00402910728007555, "grad/layer_20/mlp": 0.005874429829418659, "grad/layer_20/attn_mlp_ratio": 0.6858720469024772, "grad/layer_24/attn": 0.013119904324412346, "grad/layer_24/mlp": 0.010873840190470219, "grad/layer_24/attn_mlp_ratio": 1.206556651003102, "grad/layer_27/attn": 0.0039995708502829075, "grad/layer_27/mlp": 0.010240040719509125, "grad/layer_27/attn_mlp_ratio": 0.3905815338805099} {"step": 16050, "timestamp": 1778211886.2755694, "eos/sharpness": 22.863984107971188, "eos/L0_probe": 2.076662063598633, "eos/L_plus": 2.1947288513183594, "eos/L_minus": 2.187235116958618, "eos/grad_norm": 0.142086461186409, "eos/embed_grad_frac": 0.1213732361793518, "eos/time_s": 0.6297616958618164} {"step": 16050, "timestamp": 1778211886.2952626, "train/loss": 2.2478689432144163, "train/z_loss": 0.0016422782558947802, "train/perplexity": 9.467538461045185, "train/grad_norm": 0.1416015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1908126.5073211323, "perf/iters_per_sec": 0.9098656212430631, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0990633964538574, "data/tokens_consumed": 33661386752, "data/tokens_consumed_B": 33.661386752, "train/loss_slope": 9.279824690957617e-06} {"step": 16050, "timestamp": 1778211887.6579509, "geo/rankme_last": 441.2834167480469, "geo/layer_0/stable_rank_q_proj": 16.149734497070312, "geo/layer_0/stable_rank_k_proj": 14.267145156860352, "geo/layer_0/stable_rank_o_proj": 53.34673309326172, "geo/layer_0/stable_rank_gate_proj": 156.93052673339844, "geo/layer_0/stable_rank_down_proj": 49.014766693115234, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.04933519288897514, "geo/layer_0/attn_entropy_mean": 6.298001766204834, "geo/layer_0/attn_entropy_std": 0.30579307675361633, "geo/layer_7/stable_rank_q_proj": 42.704925537109375, "geo/layer_7/stable_rank_k_proj": 42.539676666259766, "geo/layer_7/stable_rank_o_proj": 110.04098510742188, "geo/layer_7/stable_rank_gate_proj": 110.41265869140625, "geo/layer_7/stable_rank_down_proj": 154.65875244140625, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5746563673019409, "geo/layer_7/attn_entropy_mean": 4.635920524597168, "geo/layer_7/attn_entropy_std": 0.8803018927574158, "geo/layer_14/stable_rank_q_proj": 59.29348373413086, "geo/layer_14/stable_rank_k_proj": 36.28802490234375, "geo/layer_14/stable_rank_o_proj": 51.784610748291016, "geo/layer_14/stable_rank_gate_proj": 94.07864379882812, "geo/layer_14/stable_rank_down_proj": 136.161865234375, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4053383767604828, "geo/layer_14/attn_entropy_mean": 5.521198272705078, "geo/layer_14/attn_entropy_std": 0.5393695831298828, "geo/layer_21/stable_rank_q_proj": 49.932373046875, "geo/layer_21/stable_rank_k_proj": 32.30968475341797, "geo/layer_21/stable_rank_o_proj": 87.25946044921875, "geo/layer_21/stable_rank_gate_proj": 94.36457824707031, "geo/layer_21/stable_rank_down_proj": 64.15953063964844, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15359415113925934, "geo/layer_21/attn_entropy_mean": 5.771234512329102, "geo/layer_21/attn_entropy_std": 0.29132819175720215, "geo/layer_27/stable_rank_q_proj": 42.0844841003418, "geo/layer_27/stable_rank_k_proj": 32.119388580322266, "geo/layer_27/stable_rank_o_proj": 116.4415054321289, "geo/layer_27/stable_rank_gate_proj": 93.25911712646484, "geo/layer_27/stable_rank_down_proj": 143.0703582763672, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07214536517858505, "geo/layer_27/attn_entropy_mean": 4.445575714111328, "geo/layer_27/attn_entropy_std": 0.5278415083885193, "attnres/final_alpha/block_0": 0.24645954370498657, "attnres/block_norm/0": 1.5533299446105957, "attnres/final_alpha/block_1": 0.007526875007897615, "attnres/block_norm/1": 24950.580078125, "attnres/final_alpha/block_2": 0.015698706731200218, "attnres/block_norm/2": 18304.265625, "attnres/final_alpha/block_3": 0.017974473536014557, "attnres/block_norm/3": 23793.9609375, "attnres/final_alpha/block_4": 0.024235667660832405, "attnres/block_norm/4": 7904.234375, "attnres/final_alpha/block_5": 0.5408382415771484, "attnres/block_norm/5": 4636.013671875, "attnres/final_alpha/block_6": 0.1472664773464203, "attnres/block_norm/6": 16007.580078125, "geo/tier1_time_s": 1.3585219383239746, "geo/step": 16050.0, "geo/rankme_slope": 8.585260275985394e-05} {"step": 16060, "timestamp": 1778211898.0262208, "train/loss": 2.269867181777954, "train/z_loss": 0.0016372567741200327, "train/perplexity": 9.678115298638073, "train/grad_norm": 0.13671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1788289.8671350868, "perf/iters_per_sec": 0.8527230582881388, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1727136850357056, "data/tokens_consumed": 33682358272, "data/tokens_consumed_B": 33.682358272, "train/loss_slope": 1.048397873387198e-05} {"step": 16070, "timestamp": 1778211908.3768566, "train/loss": 2.226320672035217, "train/z_loss": 0.001652731781359762, "train/perplexity": 9.265711693273035, "train/grad_norm": 0.1572265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027454.5285338734, "perf/iters_per_sec": 0.9667656557721488, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343768358230592, "data/tokens_consumed": 33703329792, "data/tokens_consumed_B": 33.703329792, "train/loss_slope": 9.226862703970647e-06} {"step": 16080, "timestamp": 1778211918.7353616, "train/loss": 2.268414115905762, "train/z_loss": 0.0016316871624439955, "train/perplexity": 9.664062571831924, "train/grad_norm": 0.1064453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025620.844083123, "perf/iters_per_sec": 0.9658912868896117, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0353132009506225, "data/tokens_consumed": 33724301312, "data/tokens_consumed_B": 33.724301312, "train/loss_slope": 9.395193302556335e-06} {"step": 16090, "timestamp": 1778211929.0910463, "train/loss": 2.3263484716415403, "train/z_loss": 0.001608422165736556, "train/perplexity": 10.240479776028181, "train/grad_norm": 0.216796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026456.455182277, "perf/iters_per_sec": 0.9662897373114953, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348862886428833, "data/tokens_consumed": 33745272832, "data/tokens_consumed_B": 33.745272832, "train/loss_slope": 1.262819251247334e-05} {"step": 16100, "timestamp": 1778211939.4542215, "grad/layer_0/attn": 0.002949514426290989, "grad/layer_0/mlp": 0.0026582391001284122, "grad/layer_0/attn_mlp_ratio": 1.10957449809201, "grad/layer_4/attn": 0.0030718196649104357, "grad/layer_4/mlp": 0.002562370151281357, "grad/layer_4/attn_mlp_ratio": 1.1988195942308961, "grad/layer_8/attn": 0.004685347434133291, "grad/layer_8/mlp": 0.003822332015261054, "grad/layer_8/attn_mlp_ratio": 1.2257824000762676, "grad/layer_12/attn": 0.0053436714224517345, "grad/layer_12/mlp": 0.005724948365241289, "grad/layer_12/attn_mlp_ratio": 0.9334007903993448, "grad/layer_16/attn": 0.004054979421198368, "grad/layer_16/mlp": 0.004511441569775343, "grad/layer_16/attn_mlp_ratio": 0.8988212012946839, "grad/layer_20/attn": 0.005278740543872118, "grad/layer_20/mlp": 0.006944242864847183, "grad/layer_20/attn_mlp_ratio": 0.7601606929069026, "grad/layer_24/attn": 0.020386265590786934, "grad/layer_24/mlp": 0.014878924936056137, "grad/layer_24/attn_mlp_ratio": 1.370143712760488, "grad/layer_27/attn": 0.006223650649189949, "grad/layer_27/mlp": 0.012781676836311817, "grad/layer_27/attn_mlp_ratio": 0.4869197273723144} {"step": 16100, "timestamp": 1778211939.4703085, "train/loss": 2.278037691116333, "train/z_loss": 0.0016252203029580414, "train/perplexity": 9.75751435371946, "train/grad_norm": 0.2060546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021686.8026568398, "perf/iters_per_sec": 0.9640153897556495, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373278379440307, "data/tokens_consumed": 33766244352, "data/tokens_consumed_B": 33.766244352, "train/loss_slope": 1.115371389071436e-05} {"step": 16110, "timestamp": 1778211949.821625, "train/loss": 2.2525219202041624, "train/z_loss": 0.001632672979030758, "train/perplexity": 9.511693345821932, "train/grad_norm": 0.1357421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026953.5006455604, "perf/iters_per_sec": 0.9665267470577051, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346325159072876, "data/tokens_consumed": 33787215872, "data/tokens_consumed_B": 33.787215872, "train/loss_slope": 9.44784905793896e-06} {"step": 16120, "timestamp": 1778211960.171714, "train/loss": 2.25888934135437, "train/z_loss": 0.00163278499385342, "train/perplexity": 9.572451534412155, "train/grad_norm": 0.1689453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027108.5386706963, "perf/iters_per_sec": 0.9666006749490244, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345533847808839, "data/tokens_consumed": 33808187392, "data/tokens_consumed_B": 33.808187392, "train/loss_slope": 1.0538731270855066e-05} {"step": 16125, "timestamp": 1778211965.9469469, "eos/sharpness": 54.18174266815185, "eos/L0_probe": 2.074918031692505, "eos/L_plus": 2.257514715194702, "eos/L_minus": 2.434138774871826, "eos/grad_norm": 0.16321058571338654, "eos/embed_grad_frac": 0.09022438526153564, "eos/time_s": 0.6108405590057373} {"step": 16125, "timestamp": 1778211967.3248174, "geo/rankme_last": 440.6706848144531, "geo/layer_0/stable_rank_q_proj": 16.18189239501953, "geo/layer_0/stable_rank_k_proj": 14.289528846740723, "geo/layer_0/stable_rank_o_proj": 53.402706146240234, "geo/layer_0/stable_rank_gate_proj": 157.28311157226562, "geo/layer_0/stable_rank_down_proj": 49.029991149902344, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.04360434040427208, "geo/layer_0/attn_entropy_mean": 6.290952205657959, "geo/layer_0/attn_entropy_std": 0.3039202392101288, "geo/layer_7/stable_rank_q_proj": 42.682132720947266, "geo/layer_7/stable_rank_k_proj": 42.494544982910156, "geo/layer_7/stable_rank_o_proj": 110.33206176757812, "geo/layer_7/stable_rank_gate_proj": 109.966064453125, "geo/layer_7/stable_rank_down_proj": 155.02389526367188, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5745611786842346, "geo/layer_7/attn_entropy_mean": 4.643828392028809, "geo/layer_7/attn_entropy_std": 0.8516659736633301, "geo/layer_14/stable_rank_q_proj": 59.20137405395508, "geo/layer_14/stable_rank_k_proj": 36.09844207763672, "geo/layer_14/stable_rank_o_proj": 51.773338317871094, "geo/layer_14/stable_rank_gate_proj": 94.25829315185547, "geo/layer_14/stable_rank_down_proj": 136.53433227539062, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3845391571521759, "geo/layer_14/attn_entropy_mean": 5.521071910858154, "geo/layer_14/attn_entropy_std": 0.5534881949424744, "geo/layer_21/stable_rank_q_proj": 49.89053726196289, "geo/layer_21/stable_rank_k_proj": 32.360965728759766, "geo/layer_21/stable_rank_o_proj": 87.44119262695312, "geo/layer_21/stable_rank_gate_proj": 94.36585235595703, "geo/layer_21/stable_rank_down_proj": 64.16850280761719, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15241549909114838, "geo/layer_21/attn_entropy_mean": 5.767932415008545, "geo/layer_21/attn_entropy_std": 0.2893354296684265, "geo/layer_27/stable_rank_q_proj": 42.02471160888672, "geo/layer_27/stable_rank_k_proj": 32.12080001831055, "geo/layer_27/stable_rank_o_proj": 116.60411834716797, "geo/layer_27/stable_rank_gate_proj": 93.20568084716797, "geo/layer_27/stable_rank_down_proj": 142.76602172851562, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.06603231281042099, "geo/layer_27/attn_entropy_mean": 4.436030387878418, "geo/layer_27/attn_entropy_std": 0.5215997099876404, "attnres/final_alpha/block_0": 0.24595484137535095, "attnres/block_norm/0": 1.5544897317886353, "attnres/final_alpha/block_1": 0.007541591301560402, "attnres/block_norm/1": 24982.8671875, "attnres/final_alpha/block_2": 0.015992797911167145, "attnres/block_norm/2": 18296.0859375, "attnres/final_alpha/block_3": 0.017985740676522255, "attnres/block_norm/3": 23801.15234375, "attnres/final_alpha/block_4": 0.02381553128361702, "attnres/block_norm/4": 7963.421875, "attnres/final_alpha/block_5": 0.5430148839950562, "attnres/block_norm/5": 4647.025390625, "attnres/final_alpha/block_6": 0.14569464325904846, "attnres/block_norm/6": 16270.41015625, "geo/tier1_time_s": 1.3583667278289795, "geo/step": 16125.0, "geo/rankme_slope": 8.3718800020008e-05} {"step": 16130, "timestamp": 1778211972.5141776, "train/loss": 2.2070334911346436, "train/z_loss": 0.0016443233587779104, "train/perplexity": 9.088714609862565, "train/grad_norm": 0.1396484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1699816.2150564017, "perf/iters_per_sec": 0.8105355334550866, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2337522029876709, "data/tokens_consumed": 33829158912, "data/tokens_consumed_B": 33.829158912, "train/loss_slope": 6.115355836425961e-06} {"step": 16140, "timestamp": 1778211982.8651946, "train/loss": 2.226313757896423, "train/z_loss": 0.0016430123359896243, "train/perplexity": 9.265647629077838, "train/grad_norm": 0.1337890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026954.1078588115, "perf/iters_per_sec": 0.9665270365995462, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346322059631348, "data/tokens_consumed": 33850130432, "data/tokens_consumed_B": 33.850130432, "train/loss_slope": 3.1238144451957746e-06} {"step": 16150, "timestamp": 1778211993.2104864, "grad/layer_0/attn": 0.002551760757341981, "grad/layer_0/mlp": 0.0024600159376859665, "grad/layer_0/attn_mlp_ratio": 1.0372943583499215, "grad/layer_4/attn": 0.0015503275208175182, "grad/layer_4/mlp": 0.002451480831950903, "grad/layer_4/attn_mlp_ratio": 0.6324044787016795, "grad/layer_8/attn": 0.007188151590526104, "grad/layer_8/mlp": 0.003787479130551219, "grad/layer_8/attn_mlp_ratio": 1.8978722134088009, "grad/layer_12/attn": 0.0039590224623680115, "grad/layer_12/mlp": 0.00601051514968276, "grad/layer_12/attn_mlp_ratio": 0.65868270820492, "grad/layer_16/attn": 0.005638659466058016, "grad/layer_16/mlp": 0.004224018659442663, "grad/layer_16/attn_mlp_ratio": 1.3349039829553238, "grad/layer_20/attn": 0.006411230657249689, "grad/layer_20/mlp": 0.005443649832159281, "grad/layer_20/attn_mlp_ratio": 1.1777448471427714, "grad/layer_24/attn": 0.006866539362818003, "grad/layer_24/mlp": 0.007515536155551672, "grad/layer_24/attn_mlp_ratio": 0.9136459634195419, "grad/layer_27/attn": 0.003829494584351778, "grad/layer_27/mlp": 0.006343743298202753, "grad/layer_27/attn_mlp_ratio": 0.6036647991526128} {"step": 16150, "timestamp": 1778211993.22653, "train/loss": 2.189392113685608, "train/z_loss": 0.001641098118852824, "train/perplexity": 8.929783171491437, "train/grad_norm": 0.091796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025071.3014419593, "perf/iters_per_sec": 0.9656292445382878, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0355941534042359, "data/tokens_consumed": 33871101952, "data/tokens_consumed_B": 33.871101952, "train/loss_slope": -4.790966910640607e-06} {"step": 16160, "timestamp": 1778212003.5840743, "train/loss": 2.21969211101532, "train/z_loss": 0.0016507552703842522, "train/perplexity": 9.204496466492905, "train/grad_norm": 0.1650390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025641.182509653, "perf/iters_per_sec": 0.9659009850071206, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0353028059005738, "data/tokens_consumed": 33892073472, "data/tokens_consumed_B": 33.892073472, "train/loss_slope": -6.085284207627561e-06} {"step": 16170, "timestamp": 1778212013.93544, "train/loss": 2.224807357788086, "train/z_loss": 0.001641710731200874, "train/perplexity": 9.251700364203716, "train/grad_norm": 0.20703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026848.1780053445, "perf/iters_per_sec": 0.9664765253092501, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034686279296875, "data/tokens_consumed": 33913044992, "data/tokens_consumed_B": 33.913044992, "train/loss_slope": -8.81065813013638e-06} {"step": 16180, "timestamp": 1778212024.2970552, "train/loss": 2.2455260515213014, "train/z_loss": 0.0016352211125195027, "train/perplexity": 9.44538300787868, "train/grad_norm": 0.1640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025325.749618874, "perf/iters_per_sec": 0.9657505748838777, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354640483856201, "data/tokens_consumed": 33934016512, "data/tokens_consumed_B": 33.934016512, "train/loss_slope": -9.857290069369409e-06} {"step": 16190, "timestamp": 1778212034.6739445, "train/loss": 2.2027610540390015, "train/z_loss": 0.0016486733453348278, "train/perplexity": 9.04996648182293, "train/grad_norm": 0.169921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021997.338055323, "perf/iters_per_sec": 0.9641634645725837, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037168526649475, "data/tokens_consumed": 33954988032, "data/tokens_consumed_B": 33.954988032, "train/loss_slope": -1.2013578357690742e-05} {"step": 16200, "timestamp": 1778212045.0171828, "grad/layer_0/attn": 0.0027363586705178022, "grad/layer_0/mlp": 0.0025670018512755632, "grad/layer_0/attn_mlp_ratio": 1.0659745190914578, "grad/layer_4/attn": 0.001971946796402335, "grad/layer_4/mlp": 0.0024224105291068554, "grad/layer_4/attn_mlp_ratio": 0.8140431571378108, "grad/layer_8/attn": 0.0039224326610565186, "grad/layer_8/mlp": 0.003948519472032785, "grad/layer_8/attn_mlp_ratio": 0.9933932425810831, "grad/layer_12/attn": 0.003878117073327303, "grad/layer_12/mlp": 0.005817691795527935, "grad/layer_12/attn_mlp_ratio": 0.6666075039670656, "grad/layer_16/attn": 0.0038217634428292513, "grad/layer_16/mlp": 0.004299772437661886, "grad/layer_16/attn_mlp_ratio": 0.8888292134884501, "grad/layer_20/attn": 0.005060362163931131, "grad/layer_20/mlp": 0.006204868201166391, "grad/layer_20/attn_mlp_ratio": 0.8155470701900127, "grad/layer_24/attn": 0.014791966415941715, "grad/layer_24/mlp": 0.014376788400113583, "grad/layer_24/attn_mlp_ratio": 1.0288783490015767, "grad/layer_27/attn": 0.006442583166062832, "grad/layer_27/mlp": 0.012728337198495865, "grad/layer_27/attn_mlp_ratio": 0.5061606253021097} {"step": 16200, "timestamp": 1778212045.6432188, "eos/sharpness": 19.677424430847164, "eos/L0_probe": 2.07633638381958, "eos/L_plus": 2.1589667797088623, "eos/L_minus": 2.1904802322387695, "eos/grad_norm": 0.16683976352214813, "eos/embed_grad_frac": 0.11793193221092224, "eos/time_s": 0.6231780052185059} {"step": 16200, "timestamp": 1778212045.6628966, "train/loss": 2.274593687057495, "train/z_loss": 0.0016273133805952966, "train/perplexity": 9.723967236044782, "train/grad_norm": 0.1669921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1909307.5527895924, "perf/iters_per_sec": 0.9104287876079523, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0983835458755493, "data/tokens_consumed": 33975959552, "data/tokens_consumed_B": 33.975959552, "train/loss_slope": -9.134251313848964e-06} {"step": 16200, "timestamp": 1778212047.0269234, "geo/rankme_last": 441.10125732421875, "geo/layer_0/stable_rank_q_proj": 16.19745445251465, "geo/layer_0/stable_rank_k_proj": 14.253981590270996, "geo/layer_0/stable_rank_o_proj": 53.47769546508789, "geo/layer_0/stable_rank_gate_proj": 157.32127380371094, "geo/layer_0/stable_rank_down_proj": 49.151771545410156, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.045274220407009125, "geo/layer_0/attn_entropy_mean": 6.290157318115234, "geo/layer_0/attn_entropy_std": 0.30584716796875, "geo/layer_7/stable_rank_q_proj": 42.749271392822266, "geo/layer_7/stable_rank_k_proj": 42.72774124145508, "geo/layer_7/stable_rank_o_proj": 110.64755249023438, "geo/layer_7/stable_rank_gate_proj": 110.14338684082031, "geo/layer_7/stable_rank_down_proj": 154.7578582763672, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.573294460773468, "geo/layer_7/attn_entropy_mean": 4.646929740905762, "geo/layer_7/attn_entropy_std": 0.8617212772369385, "geo/layer_14/stable_rank_q_proj": 58.93964767456055, "geo/layer_14/stable_rank_k_proj": 36.055660247802734, "geo/layer_14/stable_rank_o_proj": 52.048179626464844, "geo/layer_14/stable_rank_gate_proj": 94.01937866210938, "geo/layer_14/stable_rank_down_proj": 136.4869842529297, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3827972412109375, "geo/layer_14/attn_entropy_mean": 5.535246849060059, "geo/layer_14/attn_entropy_std": 0.5067691802978516, "geo/layer_21/stable_rank_q_proj": 49.855831146240234, "geo/layer_21/stable_rank_k_proj": 32.1291389465332, "geo/layer_21/stable_rank_o_proj": 87.20600891113281, "geo/layer_21/stable_rank_gate_proj": 94.00035858154297, "geo/layer_21/stable_rank_down_proj": 64.03126525878906, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15664316713809967, "geo/layer_21/attn_entropy_mean": 5.751163959503174, "geo/layer_21/attn_entropy_std": 0.3019903302192688, "geo/layer_27/stable_rank_q_proj": 41.89898681640625, "geo/layer_27/stable_rank_k_proj": 32.10929870605469, "geo/layer_27/stable_rank_o_proj": 116.67503356933594, "geo/layer_27/stable_rank_gate_proj": 93.16959381103516, "geo/layer_27/stable_rank_down_proj": 142.75816345214844, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.06674888730049133, "geo/layer_27/attn_entropy_mean": 4.417210578918457, "geo/layer_27/attn_entropy_std": 0.5303556323051453, "attnres/final_alpha/block_0": 0.24485018849372864, "attnres/block_norm/0": 1.555877923965454, "attnres/final_alpha/block_1": 0.007575029507279396, "attnres/block_norm/1": 25032.33984375, "attnres/final_alpha/block_2": 0.015826798975467682, "attnres/block_norm/2": 18341.6328125, "attnres/final_alpha/block_3": 0.01788708195090294, "attnres/block_norm/3": 23869.787109375, "attnres/final_alpha/block_4": 0.02369207888841629, "attnres/block_norm/4": 7966.015625, "attnres/final_alpha/block_5": 0.5434670448303223, "attnres/block_norm/5": 4646.65673828125, "attnres/final_alpha/block_6": 0.14670178294181824, "attnres/block_norm/6": 16216.8984375, "geo/tier1_time_s": 1.3597657680511475, "geo/step": 16200.0, "geo/rankme_slope": 8.884039944102641e-05} {"step": 16210, "timestamp": 1778212057.3846562, "train/loss": 2.28310387134552, "train/z_loss": 0.0016178046935237944, "train/perplexity": 9.807073110823364, "train/grad_norm": 0.271484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1789764.7339273053, "perf/iters_per_sec": 0.8534263295780684, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1717473030090333, "data/tokens_consumed": 33996931072, "data/tokens_consumed_B": 33.996931072, "train/loss_slope": -7.747253154156e-06} {"step": 16220, "timestamp": 1778212067.7380881, "train/loss": 2.2647334575653075, "train/z_loss": 0.0016271702945232392, "train/perplexity": 9.628557839800878, "train/grad_norm": 0.24609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026500.2007144743, "perf/iters_per_sec": 0.9663105968067524, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348639488220215, "data/tokens_consumed": 34017902592, "data/tokens_consumed_B": 34.017902592, "train/loss_slope": -7.522061076423151e-06} {"step": 16230, "timestamp": 1778212078.0934236, "train/loss": 2.222816801071167, "train/z_loss": 0.0016495313844643532, "train/perplexity": 9.233302646825578, "train/grad_norm": 0.1259765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026144.1753584822, "perf/iters_per_sec": 0.9661408306877528, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350457906723023, "data/tokens_consumed": 34038874112, "data/tokens_consumed_B": 34.038874112, "train/loss_slope": -8.814318316711691e-06} {"step": 16240, "timestamp": 1778212088.4503376, "train/loss": 2.2623472452163695, "train/z_loss": 0.0016267804661765695, "train/perplexity": 9.60560944693933, "train/grad_norm": 0.126953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026059.656969339, "perf/iters_per_sec": 0.9661005291792578, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350889682769775, "data/tokens_consumed": 34059845632, "data/tokens_consumed_B": 34.059845632, "train/loss_slope": -8.984815069336476e-06} {"step": 16250, "timestamp": 1778212098.7969737, "grad/layer_0/attn": 0.0025866602081805468, "grad/layer_0/mlp": 0.002666504355147481, "grad/layer_0/attn_mlp_ratio": 0.9700565859498936, "grad/layer_4/attn": 0.0016949123237282038, "grad/layer_4/mlp": 0.0025343068409711123, "grad/layer_4/attn_mlp_ratio": 0.6687873107740991, "grad/layer_8/attn": 0.0040337396785616875, "grad/layer_8/mlp": 0.003827641950920224, "grad/layer_8/attn_mlp_ratio": 1.053844540555174, "grad/layer_12/attn": 0.004080400802195072, "grad/layer_12/mlp": 0.0062946719117462635, "grad/layer_12/attn_mlp_ratio": 0.6482308839254503, "grad/layer_16/attn": 0.004603201057761908, "grad/layer_16/mlp": 0.004922880791127682, "grad/layer_16/attn_mlp_ratio": 0.9350624480998676, "grad/layer_20/attn": 0.004130575340241194, "grad/layer_20/mlp": 0.006470406893640757, "grad/layer_20/attn_mlp_ratio": 0.6383795245493529, "grad/layer_24/attn": 0.013567596673965454, "grad/layer_24/mlp": 0.012774033471941948, "grad/layer_24/attn_mlp_ratio": 1.062123142040785, "grad/layer_27/attn": 0.003959557972848415, "grad/layer_27/mlp": 0.01051725447177887, "grad/layer_27/attn_mlp_ratio": 0.3764820891065208} {"step": 16250, "timestamp": 1778212098.8130598, "train/loss": 2.23000590801239, "train/z_loss": 0.0016327542602084578, "train/perplexity": 9.299921023369912, "train/grad_norm": 0.154296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024797.993594653, "perf/iters_per_sec": 0.9654989212010636, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035733938217163, "data/tokens_consumed": 34080817152, "data/tokens_consumed_B": 34.080817152, "train/loss_slope": -8.477091295669746e-06} {"step": 16260, "timestamp": 1778212109.1649644, "train/loss": 2.2696871519088746, "train/z_loss": 0.0016288358834572136, "train/perplexity": 9.676373105636017, "train/grad_norm": 0.11572265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026926.4566704577, "perf/iters_per_sec": 0.9665138514854706, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346463203430176, "data/tokens_consumed": 34101788672, "data/tokens_consumed_B": 34.101788672, "train/loss_slope": -6.888807467286904e-06} {"step": 16270, "timestamp": 1778212119.5173116, "train/loss": 2.2533323049545286, "train/z_loss": 0.0016244613099843264, "train/perplexity": 9.519404601179406, "train/grad_norm": 0.2314453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026763.9744734587, "perf/iters_per_sec": 0.966436373936395, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034729266166687, "data/tokens_consumed": 34122760192, "data/tokens_consumed_B": 34.122760192, "train/loss_slope": -7.034048717943971e-06} {"step": 16275, "timestamp": 1778212125.3073127, "eos/sharpness": 20.22328376770019, "eos/L0_probe": 2.072650194168091, "eos/L_plus": 2.1514790058135986, "eos/L_minus": 2.196054220199585, "eos/grad_norm": 0.1587340384721756, "eos/embed_grad_frac": 0.11105751991271973, "eos/time_s": 0.6231944561004639} {"step": 16275, "timestamp": 1778212126.6888323, "geo/rankme_last": 440.46649169921875, "geo/layer_0/stable_rank_q_proj": 16.21011734008789, "geo/layer_0/stable_rank_k_proj": 14.311848640441895, "geo/layer_0/stable_rank_o_proj": 53.5073356628418, "geo/layer_0/stable_rank_gate_proj": 157.97222900390625, "geo/layer_0/stable_rank_down_proj": 49.07776641845703, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.039954010397195816, "geo/layer_0/attn_entropy_mean": 6.294347763061523, "geo/layer_0/attn_entropy_std": 0.30674445629119873, "geo/layer_7/stable_rank_q_proj": 42.84516525268555, "geo/layer_7/stable_rank_k_proj": 42.62236404418945, "geo/layer_7/stable_rank_o_proj": 111.39417266845703, "geo/layer_7/stable_rank_gate_proj": 110.01333618164062, "geo/layer_7/stable_rank_down_proj": 154.83804321289062, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5689573884010315, "geo/layer_7/attn_entropy_mean": 4.651516914367676, "geo/layer_7/attn_entropy_std": 0.8787432312965393, "geo/layer_14/stable_rank_q_proj": 58.72023391723633, "geo/layer_14/stable_rank_k_proj": 36.09487533569336, "geo/layer_14/stable_rank_o_proj": 52.0135612487793, "geo/layer_14/stable_rank_gate_proj": 94.05506134033203, "geo/layer_14/stable_rank_down_proj": 136.8306121826172, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38762232661247253, "geo/layer_14/attn_entropy_mean": 5.546160697937012, "geo/layer_14/attn_entropy_std": 0.5162338018417358, "geo/layer_21/stable_rank_q_proj": 49.74388885498047, "geo/layer_21/stable_rank_k_proj": 32.096900939941406, "geo/layer_21/stable_rank_o_proj": 87.11077880859375, "geo/layer_21/stable_rank_gate_proj": 93.77742004394531, "geo/layer_21/stable_rank_down_proj": 63.96107864379883, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15559804439544678, "geo/layer_21/attn_entropy_mean": 5.772918701171875, "geo/layer_21/attn_entropy_std": 0.29266074299812317, "geo/layer_27/stable_rank_q_proj": 41.89518356323242, "geo/layer_27/stable_rank_k_proj": 32.032711029052734, "geo/layer_27/stable_rank_o_proj": 116.50407409667969, "geo/layer_27/stable_rank_gate_proj": 93.07050323486328, "geo/layer_27/stable_rank_down_proj": 142.71905517578125, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08125032484531403, "geo/layer_27/attn_entropy_mean": 4.41571044921875, "geo/layer_27/attn_entropy_std": 0.5474652647972107, "attnres/final_alpha/block_0": 0.24504119157791138, "attnres/block_norm/0": 1.557229995727539, "attnres/final_alpha/block_1": 0.0074297902174293995, "attnres/block_norm/1": 25206.763671875, "attnres/final_alpha/block_2": 0.01546621322631836, "attnres/block_norm/2": 18522.11328125, "attnres/final_alpha/block_3": 0.01810522750020027, "attnres/block_norm/3": 23856.60546875, "attnres/final_alpha/block_4": 0.023768601939082146, "attnres/block_norm/4": 7981.8681640625, "attnres/final_alpha/block_5": 0.5451983213424683, "attnres/block_norm/5": 4647.45703125, "attnres/final_alpha/block_6": 0.1449907124042511, "attnres/block_norm/6": 16320.212890625, "geo/tier1_time_s": 1.3619163036346436, "geo/step": 16275.0, "geo/rankme_slope": 5.88121381365046e-05} {"step": 16280, "timestamp": 1778212131.870719, "train/loss": 2.2446533203125, "train/z_loss": 0.0016523781814612448, "train/perplexity": 9.437143323386515, "train/grad_norm": 0.126953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1698392.4453398949, "perf/iters_per_sec": 0.8098566271495318, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2347864627838134, "data/tokens_consumed": 34143731712, "data/tokens_consumed_B": 34.143731712, "train/loss_slope": -6.003544261210154e-06} {"step": 16290, "timestamp": 1778212142.2229319, "train/loss": 2.2367407083511353, "train/z_loss": 0.0016418110812082888, "train/perplexity": 9.362765519656739, "train/grad_norm": 0.0908203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026765.3287732978, "perf/iters_per_sec": 0.9664370197168817, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347285747528077, "data/tokens_consumed": 34164703232, "data/tokens_consumed_B": 34.164703232, "train/loss_slope": -6.094946090144272e-06} {"step": 16300, "timestamp": 1778212152.5666153, "grad/layer_0/attn": 0.0028993692249059677, "grad/layer_0/mlp": 0.0027040119748562574, "grad/layer_0/attn_mlp_ratio": 1.0722471441108838, "grad/layer_4/attn": 0.00179407955147326, "grad/layer_4/mlp": 0.002515937900170684, "grad/layer_4/attn_mlp_ratio": 0.7130857562275174, "grad/layer_8/attn": 0.004095188807696104, "grad/layer_8/mlp": 0.003914599306881428, "grad/layer_8/attn_mlp_ratio": 1.0461322812488094, "grad/layer_12/attn": 0.005026577971875668, "grad/layer_12/mlp": 0.006264479365199804, "grad/layer_12/attn_mlp_ratio": 0.8023935587624033, "grad/layer_16/attn": 0.006225225515663624, "grad/layer_16/mlp": 0.004677560646086931, "grad/layer_16/attn_mlp_ratio": 1.3308700524886632, "grad/layer_20/attn": 0.006455500144511461, "grad/layer_20/mlp": 0.00753456586971879, "grad/layer_20/attn_mlp_ratio": 0.8567846071632973, "grad/layer_24/attn": 0.024266008287668228, "grad/layer_24/mlp": 0.018255675211548805, "grad/layer_24/attn_mlp_ratio": 1.3292309308501558, "grad/layer_27/attn": 0.009987953118979931, "grad/layer_27/mlp": 0.016269098967313766, "grad/layer_27/attn_mlp_ratio": 0.613921709964058} {"step": 16300, "timestamp": 1778212152.5827947, "train/loss": 2.236667037010193, "train/z_loss": 0.001636668771971017, "train/perplexity": 9.3620757775734, "train/grad_norm": 0.2578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025168.979226167, "perf/iters_per_sec": 0.965675820935329, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035544204711914, "data/tokens_consumed": 34185674752, "data/tokens_consumed_B": 34.185674752, "train/loss_slope": -7.336847209157854e-06} {"step": 16310, "timestamp": 1778212162.9362483, "train/loss": 2.3168985486030578, "train/z_loss": 0.0016204755986109376, "train/perplexity": 10.144163836135704, "train/grad_norm": 0.2138671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026677.8633999927, "perf/iters_per_sec": 0.9663953129768337, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347732305526733, "data/tokens_consumed": 34206646272, "data/tokens_consumed_B": 34.206646272, "train/loss_slope": -3.456124471108747e-06} {"step": 16320, "timestamp": 1778212173.3002365, "train/loss": 2.2358258724212647, "train/z_loss": 0.001634719339199364, "train/perplexity": 9.354204042127142, "train/grad_norm": 0.1552734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025468.5987430944, "perf/iters_per_sec": 0.9658186906543228, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0353910207748414, "data/tokens_consumed": 34227617792, "data/tokens_consumed_B": 34.227617792, "train/loss_slope": -5.785004641249231e-07} {"step": 16330, "timestamp": 1778212183.6576767, "train/loss": 2.2937808513641356, "train/z_loss": 0.001627259945962578, "train/perplexity": 9.912344022123621, "train/grad_norm": 0.158203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026271.08285767, "perf/iters_per_sec": 0.9662013448990202, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349809646606445, "data/tokens_consumed": 34248589312, "data/tokens_consumed_B": 34.248589312, "train/loss_slope": 2.5262635568938745e-06} {"step": 16340, "timestamp": 1778212194.0068104, "train/loss": 2.2836318731307985, "train/z_loss": 0.0016145086265169084, "train/perplexity": 9.812252630211647, "train/grad_norm": 0.107421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027449.3880498244, "perf/iters_per_sec": 0.9667632045983431, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343794584274293, "data/tokens_consumed": 34269560832, "data/tokens_consumed_B": 34.269560832, "train/loss_slope": 2.212046010337585e-06} {"step": 16350, "timestamp": 1778212204.3464215, "grad/layer_0/attn": 0.0029529111925512552, "grad/layer_0/mlp": 0.0027211064007133245, "grad/layer_0/attn_mlp_ratio": 1.0851876586885387, "grad/layer_4/attn": 0.0016653648344799876, "grad/layer_4/mlp": 0.0025104109663516283, "grad/layer_4/attn_mlp_ratio": 0.6633833226764159, "grad/layer_8/attn": 0.004360074643045664, "grad/layer_8/mlp": 0.00399242527782917, "grad/layer_8/attn_mlp_ratio": 1.0920866967879055, "grad/layer_12/attn": 0.003970538731664419, "grad/layer_12/mlp": 0.005824396852403879, "grad/layer_12/attn_mlp_ratio": 0.6817081260276526, "grad/layer_16/attn": 0.005924746859818697, "grad/layer_16/mlp": 0.00480311457067728, "grad/layer_16/attn_mlp_ratio": 1.2335218428135628, "grad/layer_20/attn": 0.007481241598725319, "grad/layer_20/mlp": 0.006244662683457136, "grad/layer_20/attn_mlp_ratio": 1.198021712004054, "grad/layer_24/attn": 0.01447477750480175, "grad/layer_24/mlp": 0.010169480927288532, "grad/layer_24/attn_mlp_ratio": 1.4233545906581164, "grad/layer_27/attn": 0.0040453351102769375, "grad/layer_27/mlp": 0.009128031320869923, "grad/layer_27/attn_mlp_ratio": 0.4431771675355832} {"step": 16350, "timestamp": 1778212204.9572105, "eos/sharpness": 23.768210411071774, "eos/L0_probe": 2.078866481781006, "eos/L_plus": 2.214656114578247, "eos/L_minus": 2.1807589530944824, "eos/grad_norm": 0.13136105239391327, "eos/embed_grad_frac": 0.14923956990242004, "eos/time_s": 0.6080281734466553} {"step": 16350, "timestamp": 1778212204.9777358, "train/loss": 2.3272969484329225, "train/z_loss": 0.001609474513679743, "train/perplexity": 10.250197241094913, "train/grad_norm": 0.1318359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1912745.5465385357, "perf/iters_per_sec": 0.9120681507771186, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.096409296989441, "data/tokens_consumed": 34290532352, "data/tokens_consumed_B": 34.290532352, "train/loss_slope": 5.0820287340509525e-06} {"step": 16350, "timestamp": 1778212206.3333378, "geo/rankme_last": 440.77996826171875, "geo/layer_0/stable_rank_q_proj": 16.231164932250977, "geo/layer_0/stable_rank_k_proj": 14.342592239379883, "geo/layer_0/stable_rank_o_proj": 53.39813995361328, "geo/layer_0/stable_rank_gate_proj": 157.95452880859375, "geo/layer_0/stable_rank_down_proj": 49.07601547241211, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.04367111623287201, "geo/layer_0/attn_entropy_mean": 6.295283317565918, "geo/layer_0/attn_entropy_std": 0.30442407727241516, "geo/layer_7/stable_rank_q_proj": 42.80289840698242, "geo/layer_7/stable_rank_k_proj": 42.52925109863281, "geo/layer_7/stable_rank_o_proj": 111.50074005126953, "geo/layer_7/stable_rank_gate_proj": 110.03413391113281, "geo/layer_7/stable_rank_down_proj": 154.95619201660156, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5842203497886658, "geo/layer_7/attn_entropy_mean": 4.647983551025391, "geo/layer_7/attn_entropy_std": 0.8699934482574463, "geo/layer_14/stable_rank_q_proj": 58.55611801147461, "geo/layer_14/stable_rank_k_proj": 36.04634094238281, "geo/layer_14/stable_rank_o_proj": 51.9598503112793, "geo/layer_14/stable_rank_gate_proj": 93.84609985351562, "geo/layer_14/stable_rank_down_proj": 136.95262145996094, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3877171576023102, "geo/layer_14/attn_entropy_mean": 5.565980911254883, "geo/layer_14/attn_entropy_std": 0.48889970779418945, "geo/layer_21/stable_rank_q_proj": 49.86201477050781, "geo/layer_21/stable_rank_k_proj": 32.00634765625, "geo/layer_21/stable_rank_o_proj": 87.10555267333984, "geo/layer_21/stable_rank_gate_proj": 93.50936889648438, "geo/layer_21/stable_rank_down_proj": 63.829322814941406, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15913254022598267, "geo/layer_21/attn_entropy_mean": 5.772765159606934, "geo/layer_21/attn_entropy_std": 0.29157474637031555, "geo/layer_27/stable_rank_q_proj": 41.80264663696289, "geo/layer_27/stable_rank_k_proj": 31.983095169067383, "geo/layer_27/stable_rank_o_proj": 116.75422668457031, "geo/layer_27/stable_rank_gate_proj": 93.14340209960938, "geo/layer_27/stable_rank_down_proj": 142.4774169921875, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07274418324232101, "geo/layer_27/attn_entropy_mean": 4.443041801452637, "geo/layer_27/attn_entropy_std": 0.517869234085083, "attnres/final_alpha/block_0": 0.24368740618228912, "attnres/block_norm/0": 1.55838143825531, "attnres/final_alpha/block_1": 0.007488940842449665, "attnres/block_norm/1": 25169.1484375, "attnres/final_alpha/block_2": 0.015268657356500626, "attnres/block_norm/2": 18657.4453125, "attnres/final_alpha/block_3": 0.01771152764558792, "attnres/block_norm/3": 23993.26953125, "attnres/final_alpha/block_4": 0.023320231586694717, "attnres/block_norm/4": 8025.06787109375, "attnres/final_alpha/block_5": 0.5497385263442993, "attnres/block_norm/5": 4645.9443359375, "attnres/final_alpha/block_6": 0.14278465509414673, "attnres/block_norm/6": 16466.630859375, "geo/tier1_time_s": 1.3520424365997314, "geo/step": 16350.0, "geo/rankme_slope": 5.140007174744898e-05} {"step": 16360, "timestamp": 1778212216.6852634, "train/loss": 2.245648241043091, "train/z_loss": 0.0016238450771197677, "train/perplexity": 9.446537205225507, "train/grad_norm": 0.1591796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1791800.1975294196, "perf/iters_per_sec": 0.8543969142577265, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1704162120819093, "data/tokens_consumed": 34311503872, "data/tokens_consumed_B": 34.311503872, "train/loss_slope": 6.336745398916603e-06} {"step": 16370, "timestamp": 1778212227.0482304, "train/loss": 2.232901692390442, "train/z_loss": 0.0016315275453962386, "train/perplexity": 9.32689061960796, "train/grad_norm": 0.2373046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024709.3463811995, "perf/iters_per_sec": 0.9654566509157179, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0357792854309082, "data/tokens_consumed": 34332475392, "data/tokens_consumed_B": 34.332475392, "train/loss_slope": 3.904835318717902e-06} {"step": 16380, "timestamp": 1778212237.3966665, "train/loss": 2.2373347520828246, "train/z_loss": 0.0016470861854031682, "train/perplexity": 9.368329064155727, "train/grad_norm": 0.1630859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027727.8528938189, "perf/iters_per_sec": 0.9668959869832129, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342374086380004, "data/tokens_consumed": 34353446912, "data/tokens_consumed_B": 34.353446912, "train/loss_slope": 9.334361556768987e-07} {"step": 16390, "timestamp": 1778212247.7461755, "train/loss": 2.258555769920349, "train/z_loss": 0.0016284499899484218, "train/perplexity": 9.569258970530385, "train/grad_norm": 0.1435546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027547.0149683298, "perf/iters_per_sec": 0.9668097567407273, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343296527862549, "data/tokens_consumed": 34374418432, "data/tokens_consumed_B": 34.374418432, "train/loss_slope": -6.358488236251683e-07} {"step": 16400, "timestamp": 1778212258.087253, "grad/layer_0/attn": 0.002850067103281617, "grad/layer_0/mlp": 0.002797324676066637, "grad/layer_0/attn_mlp_ratio": 1.018854559780199, "grad/layer_4/attn": 0.002318435814231634, "grad/layer_4/mlp": 0.0025243146810680628, "grad/layer_4/attn_mlp_ratio": 0.9184416427061766, "grad/layer_8/attn": 0.004340009298175573, "grad/layer_8/mlp": 0.003910518251359463, "grad/layer_8/attn_mlp_ratio": 1.1098296717279952, "grad/layer_12/attn": 0.005130472127348185, "grad/layer_12/mlp": 0.00601333100348711, "grad/layer_12/attn_mlp_ratio": 0.8531830426521901, "grad/layer_16/attn": 0.004092457238584757, "grad/layer_16/mlp": 0.00459261005744338, "grad/layer_16/attn_mlp_ratio": 0.891096151923975, "grad/layer_20/attn": 0.009829993359744549, "grad/layer_20/mlp": 0.006144310347735882, "grad/layer_20/attn_mlp_ratio": 1.5998529767269174, "grad/layer_24/attn": 0.016380006447434425, "grad/layer_24/mlp": 0.014629545621573925, "grad/layer_24/attn_mlp_ratio": 1.1196524320833237, "grad/layer_27/attn": 0.008170303888618946, "grad/layer_27/mlp": 0.012546723708510399, "grad/layer_27/attn_mlp_ratio": 0.6511902241027301} {"step": 16400, "timestamp": 1778212258.1030025, "train/loss": 2.326334762573242, "train/z_loss": 0.0016147257178090514, "train/perplexity": 10.24033938955381, "train/grad_norm": 0.19140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025872.910802192, "perf/iters_per_sec": 0.9660114816676102, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035184383392334, "data/tokens_consumed": 34395389952, "data/tokens_consumed_B": 34.395389952, "train/loss_slope": 4.964145046077339e-06} {"step": 16410, "timestamp": 1778212268.452558, "train/loss": 2.2543120622634887, "train/z_loss": 0.0016284313867799937, "train/perplexity": 9.52873587786118, "train/grad_norm": 0.1611328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027151.7517468731, "perf/iters_per_sec": 0.9666212805494657, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034531331062317, "data/tokens_consumed": 34416361472, "data/tokens_consumed_B": 34.416361472, "train/loss_slope": 4.877023420783091e-06} {"step": 16420, "timestamp": 1778212278.8017933, "train/loss": 2.198436450958252, "train/z_loss": 0.001650055986829102, "train/perplexity": 9.01091347413835, "train/grad_norm": 0.2177734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027522.759245813, "perf/iters_per_sec": 0.9667981907109322, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343420267105103, "data/tokens_consumed": 34437332992, "data/tokens_consumed_B": 34.437332992, "train/loss_slope": 3.496653104450869e-07} {"step": 16425, "timestamp": 1778212284.5811808, "eos/sharpness": 22.256374359130856, "eos/L0_probe": 2.0754692554473877, "eos/L_plus": 2.1989381313323975, "eos/L_minus": 2.1745641231536865, "eos/grad_norm": 0.1621607542037964, "eos/embed_grad_frac": 0.10171495378017426, "eos/time_s": 0.61151123046875} {"step": 16425, "timestamp": 1778212285.9650102, "geo/rankme_last": 440.9340515136719, "geo/layer_0/stable_rank_q_proj": 16.208187103271484, "geo/layer_0/stable_rank_k_proj": 14.313703536987305, "geo/layer_0/stable_rank_o_proj": 53.4318733215332, "geo/layer_0/stable_rank_gate_proj": 158.0271453857422, "geo/layer_0/stable_rank_down_proj": 49.154808044433594, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.03969837725162506, "geo/layer_0/attn_entropy_mean": 6.29930305480957, "geo/layer_0/attn_entropy_std": 0.3024946451187134, "geo/layer_7/stable_rank_q_proj": 42.647216796875, "geo/layer_7/stable_rank_k_proj": 42.39760971069336, "geo/layer_7/stable_rank_o_proj": 111.35686492919922, "geo/layer_7/stable_rank_gate_proj": 110.25300598144531, "geo/layer_7/stable_rank_down_proj": 154.4502716064453, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5818638801574707, "geo/layer_7/attn_entropy_mean": 4.619914531707764, "geo/layer_7/attn_entropy_std": 0.8471385836601257, "geo/layer_14/stable_rank_q_proj": 58.44520568847656, "geo/layer_14/stable_rank_k_proj": 36.00910949707031, "geo/layer_14/stable_rank_o_proj": 51.90141296386719, "geo/layer_14/stable_rank_gate_proj": 93.88333129882812, "geo/layer_14/stable_rank_down_proj": 136.78076171875, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3882308304309845, "geo/layer_14/attn_entropy_mean": 5.545284271240234, "geo/layer_14/attn_entropy_std": 0.5199689865112305, "geo/layer_21/stable_rank_q_proj": 49.768470764160156, "geo/layer_21/stable_rank_k_proj": 32.0277214050293, "geo/layer_21/stable_rank_o_proj": 87.0805435180664, "geo/layer_21/stable_rank_gate_proj": 93.52460479736328, "geo/layer_21/stable_rank_down_proj": 63.793487548828125, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.158010795712471, "geo/layer_21/attn_entropy_mean": 5.773187637329102, "geo/layer_21/attn_entropy_std": 0.2983560562133789, "geo/layer_27/stable_rank_q_proj": 41.87137985229492, "geo/layer_27/stable_rank_k_proj": 32.13179397583008, "geo/layer_27/stable_rank_o_proj": 116.74015808105469, "geo/layer_27/stable_rank_gate_proj": 92.89675903320312, "geo/layer_27/stable_rank_down_proj": 142.80087280273438, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07180023193359375, "geo/layer_27/attn_entropy_mean": 4.432173728942871, "geo/layer_27/attn_entropy_std": 0.5094508528709412, "attnres/final_alpha/block_0": 0.24354928731918335, "attnres/block_norm/0": 1.5594863891601562, "attnres/final_alpha/block_1": 0.007421817630529404, "attnres/block_norm/1": 25246.4296875, "attnres/final_alpha/block_2": 0.015304548665881157, "attnres/block_norm/2": 18544.3515625, "attnres/final_alpha/block_3": 0.01787010207772255, "attnres/block_norm/3": 24104.8203125, "attnres/final_alpha/block_4": 0.023246806114912033, "attnres/block_norm/4": 8034.9384765625, "attnres/final_alpha/block_5": 0.5499908328056335, "attnres/block_norm/5": 4625.82861328125, "attnres/final_alpha/block_6": 0.14261659979820251, "attnres/block_norm/6": 16574.33984375, "geo/tier1_time_s": 1.3615951538085938, "geo/step": 16425.0, "geo/rankme_slope": 2.2025782969437776e-05} {"step": 16430, "timestamp": 1778212291.1418378, "train/loss": 2.2703896284103395, "train/z_loss": 0.0016239213524386286, "train/perplexity": 9.68317291843686, "train/grad_norm": 0.2060546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1700308.8537404805, "perf/iters_per_sec": 0.8107704418852236, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.233394742012024, "data/tokens_consumed": 34458304512, "data/tokens_consumed_B": 34.458304512, "train/loss_slope": 9.066750614842782e-07} {"step": 16440, "timestamp": 1778212301.493603, "train/loss": 2.2448280811309815, "train/z_loss": 0.001626088155899197, "train/perplexity": 9.43879271039775, "train/grad_norm": 0.1337890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026827.9554427923, "perf/iters_per_sec": 0.9664668824399911, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03469660282135, "data/tokens_consumed": 34479276032, "data/tokens_consumed_B": 34.479276032, "train/loss_slope": -2.820223478665762e-06} {"step": 16450, "timestamp": 1778212311.840743, "grad/layer_0/attn": 0.0026094475761055946, "grad/layer_0/mlp": 0.002727784914895892, "grad/layer_0/attn_mlp_ratio": 0.9566177546455896, "grad/layer_4/attn": 0.0016464786604046822, "grad/layer_4/mlp": 0.0024695959873497486, "grad/layer_4/attn_mlp_ratio": 0.6666995744116211, "grad/layer_8/attn": 0.00602198950946331, "grad/layer_8/mlp": 0.0037588998675346375, "grad/layer_8/attn_mlp_ratio": 1.602061656727985, "grad/layer_12/attn": 0.004540885798633099, "grad/layer_12/mlp": 0.005840400233864784, "grad/layer_12/attn_mlp_ratio": 0.7774956405476825, "grad/layer_16/attn": 0.00432319613173604, "grad/layer_16/mlp": 0.004777966067194939, "grad/layer_16/attn_mlp_ratio": 0.9048193269803152, "grad/layer_20/attn": 0.004933241289108992, "grad/layer_20/mlp": 0.006637142971158028, "grad/layer_20/attn_mlp_ratio": 0.7432778284600479, "grad/layer_24/attn": 0.0154212461784482, "grad/layer_24/mlp": 0.01270805113017559, "grad/layer_24/attn_mlp_ratio": 1.213502046783543, "grad/layer_27/attn": 0.005307692103087902, "grad/layer_27/mlp": 0.010655988939106464, "grad/layer_27/attn_mlp_ratio": 0.49809474123980213} {"step": 16450, "timestamp": 1778212311.856388, "train/loss": 2.2262611865997313, "train/z_loss": 0.0016382110654376448, "train/perplexity": 9.26516053477099, "train/grad_norm": 0.177734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024785.6421767874, "perf/iters_per_sec": 0.9654930315860688, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0357402563095093, "data/tokens_consumed": 34500247552, "data/tokens_consumed_B": 34.500247552, "train/loss_slope": -8.941395254847608e-06} {"step": 16460, "timestamp": 1778212322.2158911, "train/loss": 2.222509574890137, "train/z_loss": 0.0016480298363603652, "train/perplexity": 9.23046637022662, "train/grad_norm": 0.119140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025347.3878908625, "perf/iters_per_sec": 0.9657608928160012, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354529857635497, "data/tokens_consumed": 34521219072, "data/tokens_consumed_B": 34.521219072, "train/loss_slope": -9.967399101303104e-06} {"step": 16470, "timestamp": 1778212332.568555, "train/loss": 2.234167051315308, "train/z_loss": 0.0016326524782925844, "train/perplexity": 9.338699953842271, "train/grad_norm": 0.19921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026728.7633129107, "perf/iters_per_sec": 0.9664195839466623, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347472429275513, "data/tokens_consumed": 34542190592, "data/tokens_consumed_B": 34.542190592, "train/loss_slope": -1.2295734084764818e-05} {"step": 16480, "timestamp": 1778212342.9140093, "train/loss": 2.2446079015731812, "train/z_loss": 0.0016322953277267515, "train/perplexity": 9.436714709967609, "train/grad_norm": 0.126953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028299.9318898737, "perf/iters_per_sec": 0.9671687755059594, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0339457035064696, "data/tokens_consumed": 34563162112, "data/tokens_consumed_B": 34.563162112, "train/loss_slope": -8.63823985824559e-06} {"step": 16490, "timestamp": 1778212353.2581916, "train/loss": 2.2855273723602294, "train/z_loss": 0.001616730447858572, "train/perplexity": 9.830869385960245, "train/grad_norm": 0.240234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028453.1645144366, "perf/iters_per_sec": 0.9672418425152, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033867597579956, "data/tokens_consumed": 34584133632, "data/tokens_consumed_B": 34.584133632, "train/loss_slope": -9.552476894665923e-06} {"step": 16500, "timestamp": 1778212363.6027546, "grad/layer_0/attn": 0.003306058933958411, "grad/layer_0/mlp": 0.0029395578894764185, "grad/layer_0/attn_mlp_ratio": 1.1246789298915216, "grad/layer_4/attn": 0.0016784395556896925, "grad/layer_4/mlp": 0.0024684048257768154, "grad/layer_4/attn_mlp_ratio": 0.6799692944063794, "grad/layer_8/attn": 0.007804627530276775, "grad/layer_8/mlp": 0.004016463179141283, "grad/layer_8/attn_mlp_ratio": 1.9431591895308948, "grad/layer_12/attn": 0.0038954804185777903, "grad/layer_12/mlp": 0.005757743958383799, "grad/layer_12/attn_mlp_ratio": 0.6765636643583727, "grad/layer_16/attn": 0.0038452462758868933, "grad/layer_16/mlp": 0.004428260959684849, "grad/layer_16/attn_mlp_ratio": 0.8683422734251699, "grad/layer_20/attn": 0.007048220839351416, "grad/layer_20/mlp": 0.005363882053643465, "grad/layer_20/attn_mlp_ratio": 1.314014856676866, "grad/layer_24/attn": 0.004895164165645838, "grad/layer_24/mlp": 0.009693706408143044, "grad/layer_24/attn_mlp_ratio": 0.5049837398660392, "grad/layer_27/attn": 0.011551042087376118, "grad/layer_27/mlp": 0.00777410576120019, "grad/layer_27/attn_mlp_ratio": 1.4858354508685367} {"step": 16500, "timestamp": 1778212364.2076216, "eos/sharpness": 27.482604980468743, "eos/L0_probe": 2.0756490230560303, "eos/L_plus": 2.2395334243774414, "eos/L_minus": 2.1865906715393066, "eos/grad_norm": 0.1252884566783905, "eos/embed_grad_frac": 0.16507838666439056, "eos/time_s": 0.6021699905395508} {"step": 16500, "timestamp": 1778212364.2268825, "train/loss": 2.2612353563308716, "train/z_loss": 0.0016204233281314373, "train/perplexity": 9.59493501204933, "train/grad_norm": 0.125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1912787.2656442858, "perf/iters_per_sec": 0.9120880439969472, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.096385383605957, "data/tokens_consumed": 34605105152, "data/tokens_consumed_B": 34.605105152, "train/loss_slope": -7.747179868877214e-06} {"step": 16500, "timestamp": 1778212365.5856173, "geo/rankme_last": 440.60113525390625, "geo/layer_0/stable_rank_q_proj": 16.196041107177734, "geo/layer_0/stable_rank_k_proj": 14.32182502746582, "geo/layer_0/stable_rank_o_proj": 53.277183532714844, "geo/layer_0/stable_rank_gate_proj": 157.11607360839844, "geo/layer_0/stable_rank_down_proj": 49.18195343017578, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.04238932952284813, "geo/layer_0/attn_entropy_mean": 6.296065330505371, "geo/layer_0/attn_entropy_std": 0.3032495379447937, "geo/layer_7/stable_rank_q_proj": 42.663516998291016, "geo/layer_7/stable_rank_k_proj": 42.39223098754883, "geo/layer_7/stable_rank_o_proj": 111.0183334350586, "geo/layer_7/stable_rank_gate_proj": 110.05455780029297, "geo/layer_7/stable_rank_down_proj": 154.85806274414062, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5613758563995361, "geo/layer_7/attn_entropy_mean": 4.674128532409668, "geo/layer_7/attn_entropy_std": 0.8725017309188843, "geo/layer_14/stable_rank_q_proj": 58.576316833496094, "geo/layer_14/stable_rank_k_proj": 36.02622985839844, "geo/layer_14/stable_rank_o_proj": 51.89555358886719, "geo/layer_14/stable_rank_gate_proj": 93.61701202392578, "geo/layer_14/stable_rank_down_proj": 136.83375549316406, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3992476165294647, "geo/layer_14/attn_entropy_mean": 5.53061056137085, "geo/layer_14/attn_entropy_std": 0.5241835117340088, "geo/layer_21/stable_rank_q_proj": 49.811988830566406, "geo/layer_21/stable_rank_k_proj": 31.79746437072754, "geo/layer_21/stable_rank_o_proj": 87.05915832519531, "geo/layer_21/stable_rank_gate_proj": 93.33759307861328, "geo/layer_21/stable_rank_down_proj": 63.67512130737305, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15626856684684753, "geo/layer_21/attn_entropy_mean": 5.78096866607666, "geo/layer_21/attn_entropy_std": 0.28812137246131897, "geo/layer_27/stable_rank_q_proj": 41.99284362792969, "geo/layer_27/stable_rank_k_proj": 32.10870361328125, "geo/layer_27/stable_rank_o_proj": 116.45726013183594, "geo/layer_27/stable_rank_gate_proj": 92.89964294433594, "geo/layer_27/stable_rank_down_proj": 142.47769165039062, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.06989453732967377, "geo/layer_27/attn_entropy_mean": 4.4591145515441895, "geo/layer_27/attn_entropy_std": 0.5385412573814392, "attnres/final_alpha/block_0": 0.24293723702430725, "attnres/block_norm/0": 1.5607699155807495, "attnres/final_alpha/block_1": 0.00738926138728857, "attnres/block_norm/1": 25391.080078125, "attnres/final_alpha/block_2": 0.015308352187275887, "attnres/block_norm/2": 18544.765625, "attnres/final_alpha/block_3": 0.017504898831248283, "attnres/block_norm/3": 24194.412109375, "attnres/final_alpha/block_4": 0.023022953420877457, "attnres/block_norm/4": 8098.10595703125, "attnres/final_alpha/block_5": 0.5503273010253906, "attnres/block_norm/5": 4651.4580078125, "attnres/final_alpha/block_6": 0.14351001381874084, "attnres/block_norm/6": 16590.412109375, "geo/tier1_time_s": 1.3546512126922607, "geo/step": 16500.0, "geo/rankme_slope": 2.296397074454782e-05} {"step": 16500, "timestamp": 1778212372.4937727, "geo/ww_alpha_mean": 7.6600375719904115, "geo/ww_alpha_std": 4.06583579236053, "geo/ww_alpha_min": 1.3435631300672997, "geo/ww_alpha_max": 23.09986080799444, "geo/ww_alpha_healthy_frac": 0.16243654822335024, "geo/ww_alpha_by_type/q_proj": 4.161779923079431, "geo/ww_alpha_by_type/k_proj": 4.723832405026774, "geo/ww_alpha_by_type/v_proj": 7.973350826633545, "geo/ww_alpha_by_type/o_proj": 7.508626017109106, "geo/ww_alpha_by_type/gate_proj": 9.193413702960271, "geo/ww_alpha_by_type/up_proj": 11.004829257686216, "geo/ww_alpha_by_type/down_proj": 9.177887443232033, "geo/twonn_id/layer_0": 0.7399978637695312, "geo/twonn_id/layer_7": 3.1389551162719727, "geo/twonn_id/layer_14": 3.9110665321350098, "geo/twonn_id/layer_21": 7.145631790161133, "geo/twonn_id/layer_27": 6.382184028625488, "geo/tier2_time_s": 6.90172004699707} {"step": 16500, "timestamp": 1778212373.1189852, "eoc/jacobian_sigma/layer_0/attn": 838.128173828125, "eoc/jacobian_sigma/layer_0/mlp": 4682.2373046875, "eoc/jacobian_sigma/layer_0": 4682.2373046875, "eoc/jacobian_sigma/layer_7/attn": 1.136549711227417, "eoc/jacobian_sigma/layer_7/mlp": 1.588739275932312, "eoc/jacobian_sigma/layer_7": 1.588739275932312, "eoc/jacobian_sigma/layer_14/attn": 1.4483939409255981, "eoc/jacobian_sigma/layer_14/mlp": 7.762530326843262, "eoc/jacobian_sigma/layer_14": 7.762530326843262, "eoc/jacobian_sigma/layer_21/attn": 1.0807300806045532, "eoc/jacobian_sigma/layer_21/mlp": 4.060043811798096, "eoc/jacobian_sigma/layer_21": 4.060043811798096, "eoc/jacobian_sigma/layer_27/attn": 2.4561824798583984, "eoc/jacobian_sigma/layer_27/mlp": 25.618505477905273, "eoc/jacobian_sigma/layer_27": 25.618505477905273, "eoc/layer0_sigma": 4682.2373046875, "eoc/sigma_max": 25.618505477905273, "eoc/sigma_min": 1.588739275932312, "eoc/sigma_mean": 9.757454723119736, "eoc/time_s": 0.6194543838500977} {"step": 16510, "timestamp": 1778212383.479675, "train/loss": 2.2805664777755736, "train/z_loss": 0.0016329760430380702, "train/perplexity": 9.782220250650084, "train/grad_norm": 0.1142578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1089492.734303302, "perf/iters_per_sec": 0.5195106193081388, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.9248884677886964, "data/tokens_consumed": 34626076672, "data/tokens_consumed_B": 34.626076672, "train/loss_slope": -6.367812820500943e-06} {"step": 16520, "timestamp": 1778212393.831315, "train/loss": 2.2994879722595214, "train/z_loss": 0.0016139946528710425, "train/perplexity": 9.969076703963845, "train/grad_norm": 0.2255859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027110.2204424483, "perf/iters_per_sec": 0.9666014768802873, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034552526473999, "data/tokens_consumed": 34647048192, "data/tokens_consumed_B": 34.647048192, "train/loss_slope": -3.148890678996342e-06} {"step": 16530, "timestamp": 1778212404.1770594, "train/loss": 2.263585019111633, "train/z_loss": 0.0016185156651772558, "train/perplexity": 9.617506380899068, "train/grad_norm": 0.10498046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028106.2258532022, "perf/iters_per_sec": 0.9670764092699061, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340444564819335, "data/tokens_consumed": 34668019712, "data/tokens_consumed_B": 34.668019712, "train/loss_slope": -3.027581345952757e-06} {"step": 16540, "timestamp": 1778212414.5239484, "train/loss": 2.282181477546692, "train/z_loss": 0.0016151114366948604, "train/perplexity": 9.798031298098355, "train/grad_norm": 0.18359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028144.38423181, "perf/iters_per_sec": 0.9670946046027231, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034025001525879, "data/tokens_consumed": 34688991232, "data/tokens_consumed_B": 34.688991232, "train/loss_slope": 6.639348612938996e-07} {"step": 16550, "timestamp": 1778212424.861624, "grad/layer_0/attn": 0.002884208457544446, "grad/layer_0/mlp": 0.0027417161036282778, "grad/layer_0/attn_mlp_ratio": 1.0519719195325905, "grad/layer_4/attn": 0.00162217253819108, "grad/layer_4/mlp": 0.0024691156577318907, "grad/layer_4/attn_mlp_ratio": 0.6569852114512421, "grad/layer_8/attn": 0.004419430624693632, "grad/layer_8/mlp": 0.003871278138831258, "grad/layer_8/attn_mlp_ratio": 1.1415946754650892, "grad/layer_12/attn": 0.004534210544079542, "grad/layer_12/mlp": 0.007381387986242771, "grad/layer_12/attn_mlp_ratio": 0.6142761349359587, "grad/layer_16/attn": 0.004558468237519264, "grad/layer_16/mlp": 0.004610042553395033, "grad/layer_16/attn_mlp_ratio": 0.9888125946431783, "grad/layer_20/attn": 0.00480157183483243, "grad/layer_20/mlp": 0.006233127787709236, "grad/layer_20/attn_mlp_ratio": 0.7703310314393496, "grad/layer_24/attn": 0.010823491960763931, "grad/layer_24/mlp": 0.01055434811860323, "grad/layer_24/attn_mlp_ratio": 1.025500744961807, "grad/layer_27/attn": 0.004164512734860182, "grad/layer_27/mlp": 0.008562498725950718, "grad/layer_27/attn_mlp_ratio": 0.4863665174748546} {"step": 16550, "timestamp": 1778212424.877274, "train/loss": 2.210765504837036, "train/z_loss": 0.0016495422809384764, "train/perplexity": 9.122697189608102, "train/grad_norm": 0.125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026625.4718479563, "perf/iters_per_sec": 0.9663703307380468, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347999811172486, "data/tokens_consumed": 34709962752, "data/tokens_consumed_B": 34.709962752, "train/loss_slope": -3.0121711364137214e-06} {"step": 16560, "timestamp": 1778212435.2233458, "train/loss": 2.222054696083069, "train/z_loss": 0.0016516192816197872, "train/perplexity": 9.226268581510395, "train/grad_norm": 0.1982421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028268.4556487796, "perf/iters_per_sec": 0.9671537664646052, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0339617490768434, "data/tokens_consumed": 34730934272, "data/tokens_consumed_B": 34.730934272, "train/loss_slope": -7.907782801271306e-06} {"step": 16570, "timestamp": 1778212445.5748718, "train/loss": 2.2601248025894165, "train/z_loss": 0.001640548137947917, "train/perplexity": 9.5842852357417, "train/grad_norm": 0.166015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027220.6161749798, "perf/iters_per_sec": 0.9666541176676654, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344961881637573, "data/tokens_consumed": 34751905792, "data/tokens_consumed_B": 34.751905792, "train/loss_slope": -7.997044528385495e-06} {"step": 16575, "timestamp": 1778212452.049743, "eos/sharpness": 14.835572242736813, "eos/L0_probe": 2.0760951042175293, "eos/L_plus": 2.1332950592041016, "eos/L_minus": 2.167250871658325, "eos/grad_norm": 0.13625285029411316, "eos/embed_grad_frac": 0.20988619327545166, "eos/time_s": 0.7357654571533203} {"step": 16575, "timestamp": 1778212453.4285226, "geo/rankme_last": 440.8682556152344, "geo/layer_0/stable_rank_q_proj": 16.234378814697266, "geo/layer_0/stable_rank_k_proj": 14.363994598388672, "geo/layer_0/stable_rank_o_proj": 53.31234359741211, "geo/layer_0/stable_rank_gate_proj": 156.6227264404297, "geo/layer_0/stable_rank_down_proj": 49.284820556640625, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0493268184363842, "geo/layer_0/attn_entropy_mean": 6.296016216278076, "geo/layer_0/attn_entropy_std": 0.29980140924453735, "geo/layer_7/stable_rank_q_proj": 42.782936096191406, "geo/layer_7/stable_rank_k_proj": 42.31389236450195, "geo/layer_7/stable_rank_o_proj": 111.36398315429688, "geo/layer_7/stable_rank_gate_proj": 109.63212585449219, "geo/layer_7/stable_rank_down_proj": 154.83604431152344, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5565115213394165, "geo/layer_7/attn_entropy_mean": 4.6645002365112305, "geo/layer_7/attn_entropy_std": 0.8691747784614563, "geo/layer_14/stable_rank_q_proj": 58.44185256958008, "geo/layer_14/stable_rank_k_proj": 36.026084899902344, "geo/layer_14/stable_rank_o_proj": 51.88601303100586, "geo/layer_14/stable_rank_gate_proj": 93.36788940429688, "geo/layer_14/stable_rank_down_proj": 136.85203552246094, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37614014744758606, "geo/layer_14/attn_entropy_mean": 5.537961959838867, "geo/layer_14/attn_entropy_std": 0.5134679675102234, "geo/layer_21/stable_rank_q_proj": 49.65816879272461, "geo/layer_21/stable_rank_k_proj": 32.00569534301758, "geo/layer_21/stable_rank_o_proj": 86.92931365966797, "geo/layer_21/stable_rank_gate_proj": 93.37226104736328, "geo/layer_21/stable_rank_down_proj": 63.65660095214844, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15702132880687714, "geo/layer_21/attn_entropy_mean": 5.771223068237305, "geo/layer_21/attn_entropy_std": 0.2879864275455475, "geo/layer_27/stable_rank_q_proj": 41.9641227722168, "geo/layer_27/stable_rank_k_proj": 32.0103759765625, "geo/layer_27/stable_rank_o_proj": 116.52328491210938, "geo/layer_27/stable_rank_gate_proj": 92.99143981933594, "geo/layer_27/stable_rank_down_proj": 142.46847534179688, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07619043439626694, "geo/layer_27/attn_entropy_mean": 4.4585065841674805, "geo/layer_27/attn_entropy_std": 0.5049842000007629, "attnres/final_alpha/block_0": 0.24359087646007538, "attnres/block_norm/0": 1.562041997909546, "attnres/final_alpha/block_1": 0.007444190792739391, "attnres/block_norm/1": 25372.51953125, "attnres/final_alpha/block_2": 0.01553177647292614, "attnres/block_norm/2": 18585.830078125, "attnres/final_alpha/block_3": 0.01779414340853691, "attnres/block_norm/3": 24330.05859375, "attnres/final_alpha/block_4": 0.0229826420545578, "attnres/block_norm/4": 8102.52490234375, "attnres/final_alpha/block_5": 0.5491991639137268, "attnres/block_norm/5": 4616.189453125, "attnres/final_alpha/block_6": 0.14345718920230865, "attnres/block_norm/6": 16615.236328125, "geo/tier1_time_s": 1.3566629886627197, "geo/step": 16575.0, "geo/rankme_slope": 1.698876816351541e-05} {"step": 16580, "timestamp": 1778212458.6159935, "train/loss": 2.2510687589645384, "train/z_loss": 0.0016371961566619575, "train/perplexity": 9.497881359680584, "train/grad_norm": 0.12255859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1608992.7442818803, "perf/iters_per_sec": 0.7672275277528192, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.3033943176269531, "data/tokens_consumed": 34772877312, "data/tokens_consumed_B": 34.772877312, "train/loss_slope": -9.76931269090884e-06} {"step": 16590, "timestamp": 1778212468.9790518, "train/loss": 2.2188918590545654, "train/z_loss": 0.001631437405012548, "train/perplexity": 9.197133496656301, "train/grad_norm": 0.1533203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024678.074689621, "perf/iters_per_sec": 0.9654417394111734, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0357952833175659, "data/tokens_consumed": 34793848832, "data/tokens_consumed_B": 34.793848832, "train/loss_slope": -9.449950882596658e-06} {"step": 16600, "timestamp": 1778212479.3228714, "grad/layer_0/attn": 0.004414181225001812, "grad/layer_0/mlp": 0.003510375041514635, "grad/layer_0/attn_mlp_ratio": 1.2574670931315928, "grad/layer_4/attn": 0.002133288886398077, "grad/layer_4/mlp": 0.0026587576139718294, "grad/layer_4/attn_mlp_ratio": 0.8023630266073504, "grad/layer_8/attn": 0.0044831507839262486, "grad/layer_8/mlp": 0.0041150888428092, "grad/layer_8/attn_mlp_ratio": 1.0894420135827705, "grad/layer_12/attn": 0.0048307632096111774, "grad/layer_12/mlp": 0.0062461779452860355, "grad/layer_12/attn_mlp_ratio": 0.7733950544776633, "grad/layer_16/attn": 0.005524259060621262, "grad/layer_16/mlp": 0.004439326003193855, "grad/layer_16/attn_mlp_ratio": 1.244391363060005, "grad/layer_20/attn": 0.006847493350505829, "grad/layer_20/mlp": 0.006544209085404873, "grad/layer_20/attn_mlp_ratio": 1.0463438983242392, "grad/layer_24/attn": 0.012198632583022118, "grad/layer_24/mlp": 0.010426714085042477, "grad/layer_24/attn_mlp_ratio": 1.1699402483403187, "grad/layer_27/attn": 0.007830950431525707, "grad/layer_27/mlp": 0.008825860917568207, "grad/layer_27/attn_mlp_ratio": 0.8872732548062912} {"step": 16600, "timestamp": 1778212479.3390143, "train/loss": 2.2436779737472534, "train/z_loss": 0.0016379321343265474, "train/perplexity": 9.427943325382865, "train/grad_norm": 0.1650390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025281.1221430765, "perf/iters_per_sec": 0.9657292948451407, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354868650436402, "data/tokens_consumed": 34814820352, "data/tokens_consumed_B": 34.814820352, "train/loss_slope": -9.934234912424823e-06} {"step": 16610, "timestamp": 1778212489.7236075, "train/loss": 2.2490856409072877, "train/z_loss": 0.0016156972851604223, "train/perplexity": 9.479064603741627, "train/grad_norm": 0.0908203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020501.4461749252, "perf/iters_per_sec": 0.9634501677393557, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0379364013671875, "data/tokens_consumed": 34835791872, "data/tokens_consumed_B": 34.835791872, "train/loss_slope": -8.32313898265667e-06} {"step": 16620, "timestamp": 1778212500.081056, "train/loss": 2.2235669612884523, "train/z_loss": 0.0016279314761050046, "train/perplexity": 9.240231701771124, "train/grad_norm": 0.318359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026115.379510708, "perf/iters_per_sec": 0.9661270997575321, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035060501098633, "data/tokens_consumed": 34856763392, "data/tokens_consumed_B": 34.856763392, "train/loss_slope": -9.588837838194379e-06} {"step": 16630, "timestamp": 1778212510.4344032, "train/loss": 2.2804424285888674, "train/z_loss": 0.00160889676772058, "train/perplexity": 9.781006849446081, "train/grad_norm": 0.2265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026679.4043690008, "perf/iters_per_sec": 0.9663960477681164, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347724437713623, "data/tokens_consumed": 34877734912, "data/tokens_consumed_B": 34.877734912, "train/loss_slope": -8.24839613868995e-06} {"step": 16640, "timestamp": 1778212520.7908354, "train/loss": 2.238772654533386, "train/z_loss": 0.0016252680914476513, "train/perplexity": 9.381809496925763, "train/grad_norm": 0.125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025966.512898448, "perf/iters_per_sec": 0.9660561146251907, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351365566253663, "data/tokens_consumed": 34898706432, "data/tokens_consumed_B": 34.898706432, "train/loss_slope": -6.561025691897961e-06} {"step": 16650, "timestamp": 1778212531.1337907, "grad/layer_0/attn": 0.0027285360265523195, "grad/layer_0/mlp": 0.00269688805565238, "grad/layer_0/attn_mlp_ratio": 1.011734958616511, "grad/layer_4/attn": 0.0016281958669424057, "grad/layer_4/mlp": 0.002517854329198599, "grad/layer_4/attn_mlp_ratio": 0.6466600483573786, "grad/layer_8/attn": 0.0111404312774539, "grad/layer_8/mlp": 0.003752531250938773, "grad/layer_8/attn_mlp_ratio": 2.9687776691504233, "grad/layer_12/attn": 0.004305010661482811, "grad/layer_12/mlp": 0.005649037659168243, "grad/layer_12/attn_mlp_ratio": 0.7620785778066166, "grad/layer_16/attn": 0.004124227911233902, "grad/layer_16/mlp": 0.004346303176134825, "grad/layer_16/attn_mlp_ratio": 0.948904770147928, "grad/layer_20/attn": 0.005393956787884235, "grad/layer_20/mlp": 0.005722646601498127, "grad/layer_20/attn_mlp_ratio": 0.9425633049253515, "grad/layer_24/attn": 0.00681629404425621, "grad/layer_24/mlp": 0.010014542378485203, "grad/layer_24/attn_mlp_ratio": 0.6806395857723938, "grad/layer_27/attn": 0.008781869895756245, "grad/layer_27/mlp": 0.009072643704712391, "grad/layer_27/attn_mlp_ratio": 0.9679504767061267} {"step": 16650, "timestamp": 1778212531.7476993, "eos/sharpness": 13.303947448730467, "eos/L0_probe": 2.076096296310425, "eos/L_plus": 2.1585638523101807, "eos/L_minus": 2.1266682147979736, "eos/grad_norm": 0.12746474146842957, "eos/embed_grad_frac": 0.1831551343202591, "eos/time_s": 0.6111495494842529} {"step": 16650, "timestamp": 1778212531.768089, "train/loss": 2.250181794166565, "train/z_loss": 0.0016277901246212422, "train/perplexity": 9.489460808177641, "train/grad_norm": 0.126953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1911242.5026912687, "perf/iters_per_sec": 0.911351443620333, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0972715377807618, "data/tokens_consumed": 34919677952, "data/tokens_consumed_B": 34.919677952, "train/loss_slope": -9.59455821022701e-06} {"step": 16650, "timestamp": 1778212533.1286833, "geo/rankme_last": 441.2611389160156, "geo/layer_0/stable_rank_q_proj": 16.257360458374023, "geo/layer_0/stable_rank_k_proj": 14.37745475769043, "geo/layer_0/stable_rank_o_proj": 53.23939514160156, "geo/layer_0/stable_rank_gate_proj": 156.71304321289062, "geo/layer_0/stable_rank_down_proj": 49.1851806640625, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.045327406376600266, "geo/layer_0/attn_entropy_mean": 6.29215145111084, "geo/layer_0/attn_entropy_std": 0.305631548166275, "geo/layer_7/stable_rank_q_proj": 42.80766677856445, "geo/layer_7/stable_rank_k_proj": 42.24070358276367, "geo/layer_7/stable_rank_o_proj": 110.94894409179688, "geo/layer_7/stable_rank_gate_proj": 109.2674560546875, "geo/layer_7/stable_rank_down_proj": 154.9853515625, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5595111846923828, "geo/layer_7/attn_entropy_mean": 4.618696212768555, "geo/layer_7/attn_entropy_std": 0.8513602018356323, "geo/layer_14/stable_rank_q_proj": 58.44047546386719, "geo/layer_14/stable_rank_k_proj": 35.966251373291016, "geo/layer_14/stable_rank_o_proj": 52.03355026245117, "geo/layer_14/stable_rank_gate_proj": 93.08325958251953, "geo/layer_14/stable_rank_down_proj": 136.79745483398438, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38722026348114014, "geo/layer_14/attn_entropy_mean": 5.5153021812438965, "geo/layer_14/attn_entropy_std": 0.5466889142990112, "geo/layer_21/stable_rank_q_proj": 49.62781524658203, "geo/layer_21/stable_rank_k_proj": 32.09136199951172, "geo/layer_21/stable_rank_o_proj": 86.82218933105469, "geo/layer_21/stable_rank_gate_proj": 93.41478729248047, "geo/layer_21/stable_rank_down_proj": 63.55512619018555, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15783891081809998, "geo/layer_21/attn_entropy_mean": 5.771805763244629, "geo/layer_21/attn_entropy_std": 0.298738032579422, "geo/layer_27/stable_rank_q_proj": 42.01387405395508, "geo/layer_27/stable_rank_k_proj": 32.04595947265625, "geo/layer_27/stable_rank_o_proj": 116.62763977050781, "geo/layer_27/stable_rank_gate_proj": 92.99779510498047, "geo/layer_27/stable_rank_down_proj": 142.28050231933594, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.068919837474823, "geo/layer_27/attn_entropy_mean": 4.4415130615234375, "geo/layer_27/attn_entropy_std": 0.5355867743492126, "attnres/final_alpha/block_0": 0.24472032487392426, "attnres/block_norm/0": 1.563088059425354, "attnres/final_alpha/block_1": 0.007569919805973768, "attnres/block_norm/1": 25408.369140625, "attnres/final_alpha/block_2": 0.015431845560669899, "attnres/block_norm/2": 18595.2421875, "attnres/final_alpha/block_3": 0.01795036345720291, "attnres/block_norm/3": 24221.703125, "attnres/final_alpha/block_4": 0.02313574030995369, "attnres/block_norm/4": 8150.8134765625, "attnres/final_alpha/block_5": 0.5455975532531738, "attnres/block_norm/5": 4694.1044921875, "attnres/final_alpha/block_6": 0.14559420943260193, "attnres/block_norm/6": 16706.28515625, "geo/tier1_time_s": 1.3566620349884033, "geo/step": 16650.0, "geo/rankme_slope": 1.2666980854841933e-05} {"step": 16660, "timestamp": 1778212543.4811656, "train/loss": 2.196562886238098, "train/z_loss": 0.0016453085700049996, "train/perplexity": 8.994046749940406, "train/grad_norm": 0.1328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1791000.736368803, "perf/iters_per_sec": 0.8540157014698043, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1709386587142945, "data/tokens_consumed": 34940649472, "data/tokens_consumed_B": 34.940649472, "train/loss_slope": -1.2172039965055326e-05} {"step": 16670, "timestamp": 1778212553.8345876, "train/loss": 2.1937164783477785, "train/z_loss": 0.0016492398572154343, "train/perplexity": 8.968482424814106, "train/grad_norm": 0.1103515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026417.7533991784, "perf/iters_per_sec": 0.9662712828632252, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349060535430907, "data/tokens_consumed": 34961620992, "data/tokens_consumed_B": 34.961620992, "train/loss_slope": -1.1395446259160677e-05} {"step": 16680, "timestamp": 1778212564.1830025, "train/loss": 2.2394098043441772, "train/z_loss": 0.0016382521600462497, "train/perplexity": 9.387789019795253, "train/grad_norm": 0.12158203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027614.6442960468, "perf/iters_per_sec": 0.9668420049171671, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342951536178588, "data/tokens_consumed": 34982592512, "data/tokens_consumed_B": 34.982592512, "train/loss_slope": -1.2717431599479379e-05} {"step": 16690, "timestamp": 1778212574.557811, "train/loss": 2.2827116727828978, "train/z_loss": 0.0016216721967794, "train/perplexity": 9.803227545007756, "train/grad_norm": 0.185546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022621.53219457, "perf/iters_per_sec": 0.9644611035321092, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0368484497070312, "data/tokens_consumed": 35003564032, "data/tokens_consumed_B": 35.003564032, "train/loss_slope": -1.188317675008239e-05} {"step": 16700, "timestamp": 1778212584.933623, "grad/layer_0/attn": 0.0024349859450012445, "grad/layer_0/mlp": 0.002480490133166313, "grad/layer_0/attn_mlp_ratio": 0.9816551230250228, "grad/layer_4/attn": 0.0018507872009649873, "grad/layer_4/mlp": 0.0024804880376905203, "grad/layer_4/attn_mlp_ratio": 0.7461382994913971, "grad/layer_8/attn": 0.004750505555421114, "grad/layer_8/mlp": 0.0038629244081676006, "grad/layer_8/attn_mlp_ratio": 1.2297691931014574, "grad/layer_12/attn": 0.004527307115495205, "grad/layer_12/mlp": 0.006003085989505053, "grad/layer_12/attn_mlp_ratio": 0.754163283350227, "grad/layer_16/attn": 0.0037599809002131224, "grad/layer_16/mlp": 0.004375700373202562, "grad/layer_16/attn_mlp_ratio": 0.8592866269617407, "grad/layer_20/attn": 0.006371882278472185, "grad/layer_20/mlp": 0.005587180145084858, "grad/layer_20/attn_mlp_ratio": 1.1404468799941874, "grad/layer_24/attn": 0.006297673098742962, "grad/layer_24/mlp": 0.009064211510121822, "grad/layer_24/attn_mlp_ratio": 0.6947844301991447, "grad/layer_27/attn": 0.005068735685199499, "grad/layer_27/mlp": 0.007911425083875656, "grad/layer_27/attn_mlp_ratio": 0.6406855360940698} {"step": 16700, "timestamp": 1778212584.9493148, "train/loss": 2.239939022064209, "train/z_loss": 0.001616945816203952, "train/perplexity": 9.392758518953968, "train/grad_norm": 0.103515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019897.484215814, "perf/iters_per_sec": 0.9631621762351102, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.038246750831604, "data/tokens_consumed": 35024535552, "data/tokens_consumed_B": 35.024535552, "train/loss_slope": -1.4810828600350436e-05} {"step": 16710, "timestamp": 1778212595.328032, "train/loss": 2.250148558616638, "train/z_loss": 0.0016336393775418401, "train/perplexity": 9.489145425970152, "train/grad_norm": 0.1650390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022290.0685106742, "perf/iters_per_sec": 0.9643030493310328, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037018394470215, "data/tokens_consumed": 35045507072, "data/tokens_consumed_B": 35.045507072, "train/loss_slope": -1.467703889520041e-05} {"step": 16720, "timestamp": 1778212605.7030818, "train/loss": 2.2741124391555787, "train/z_loss": 0.0016143228276632726, "train/perplexity": 9.719288723066702, "train/grad_norm": 0.09716796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022504.847190637, "perf/iters_per_sec": 0.9644054637864289, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0369082689285278, "data/tokens_consumed": 35066478592, "data/tokens_consumed_B": 35.066478592, "train/loss_slope": -1.40190131021197e-05} {"step": 16725, "timestamp": 1778212611.4820888, "eos/sharpness": 48.69372844696044, "eos/L0_probe": 2.077302932739258, "eos/L_plus": 2.3917300701141357, "eos/L_minus": 2.2498130798339844, "eos/grad_norm": 0.1993410587310791, "eos/embed_grad_frac": 0.07006841152906418, "eos/time_s": 0.5986533164978027} {"step": 16725, "timestamp": 1778212612.863478, "geo/rankme_last": 441.4164123535156, "geo/layer_0/stable_rank_q_proj": 16.29435157775879, "geo/layer_0/stable_rank_k_proj": 14.40587043762207, "geo/layer_0/stable_rank_o_proj": 53.27131652832031, "geo/layer_0/stable_rank_gate_proj": 156.49566650390625, "geo/layer_0/stable_rank_down_proj": 49.2623176574707, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.04395711049437523, "geo/layer_0/attn_entropy_mean": 6.2902512550354, "geo/layer_0/attn_entropy_std": 0.3008938133716583, "geo/layer_7/stable_rank_q_proj": 42.847110748291016, "geo/layer_7/stable_rank_k_proj": 42.31437683105469, "geo/layer_7/stable_rank_o_proj": 110.8554458618164, "geo/layer_7/stable_rank_gate_proj": 109.08304595947266, "geo/layer_7/stable_rank_down_proj": 154.45639038085938, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5613262057304382, "geo/layer_7/attn_entropy_mean": 4.632441997528076, "geo/layer_7/attn_entropy_std": 0.8686280250549316, "geo/layer_14/stable_rank_q_proj": 58.382171630859375, "geo/layer_14/stable_rank_k_proj": 35.9444694519043, "geo/layer_14/stable_rank_o_proj": 52.058189392089844, "geo/layer_14/stable_rank_gate_proj": 93.2663345336914, "geo/layer_14/stable_rank_down_proj": 136.3605194091797, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3830364942550659, "geo/layer_14/attn_entropy_mean": 5.549764156341553, "geo/layer_14/attn_entropy_std": 0.5215232968330383, "geo/layer_21/stable_rank_q_proj": 49.417354583740234, "geo/layer_21/stable_rank_k_proj": 32.09865188598633, "geo/layer_21/stable_rank_o_proj": 86.66865539550781, "geo/layer_21/stable_rank_gate_proj": 93.36109924316406, "geo/layer_21/stable_rank_down_proj": 63.58827209472656, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15525604784488678, "geo/layer_21/attn_entropy_mean": 5.756450653076172, "geo/layer_21/attn_entropy_std": 0.295828640460968, "geo/layer_27/stable_rank_q_proj": 42.03154373168945, "geo/layer_27/stable_rank_k_proj": 32.00733184814453, "geo/layer_27/stable_rank_o_proj": 116.83111572265625, "geo/layer_27/stable_rank_gate_proj": 92.93583679199219, "geo/layer_27/stable_rank_down_proj": 142.5152587890625, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07212807238101959, "geo/layer_27/attn_entropy_mean": 4.443521499633789, "geo/layer_27/attn_entropy_std": 0.548191487789154, "attnres/final_alpha/block_0": 0.24234303832054138, "attnres/block_norm/0": 1.5644854307174683, "attnres/final_alpha/block_1": 0.0073385220021009445, "attnres/block_norm/1": 25610.529296875, "attnres/final_alpha/block_2": 0.015317107550799847, "attnres/block_norm/2": 18719.14453125, "attnres/final_alpha/block_3": 0.017502326518297195, "attnres/block_norm/3": 24544.7421875, "attnres/final_alpha/block_4": 0.022720038890838623, "attnres/block_norm/4": 8140.037109375, "attnres/final_alpha/block_5": 0.5518580675125122, "attnres/block_norm/5": 4679.74267578125, "attnres/final_alpha/block_6": 0.14292088150978088, "attnres/block_norm/6": 16687.845703125, "geo/tier1_time_s": 1.3615264892578125, "geo/step": 16725.0, "geo/rankme_slope": 3.4056298300570227e-05} {"step": 16730, "timestamp": 1778212618.0488694, "train/loss": 2.217743182182312, "train/z_loss": 0.0016363072558306158, "train/perplexity": 9.186575027413333, "train/grad_norm": 0.125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1699624.00799951, "perf/iters_per_sec": 0.8104438819882918, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2338917255401611, "data/tokens_consumed": 35087450112, "data/tokens_consumed_B": 35.087450112, "train/loss_slope": -1.4234434023942105e-05} {"step": 16740, "timestamp": 1778212628.434424, "train/loss": 2.2233670949935913, "train/z_loss": 0.001626574748661369, "train/perplexity": 9.238385075442562, "train/grad_norm": 0.12890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020395.0759036671, "perf/iters_per_sec": 0.9633994464414917, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0379910469055176, "data/tokens_consumed": 35108421632, "data/tokens_consumed_B": 35.108421632, "train/loss_slope": -1.5142115257611115e-05} {"step": 16750, "timestamp": 1778212638.7978547, "grad/layer_0/attn": 0.0029995848890393972, "grad/layer_0/mlp": 0.002743501216173172, "grad/layer_0/attn_mlp_ratio": 1.0933418808136124, "grad/layer_4/attn": 0.001916418201290071, "grad/layer_4/mlp": 0.0025675022043287754, "grad/layer_4/attn_mlp_ratio": 0.7464134299155311, "grad/layer_8/attn": 0.004418141208589077, "grad/layer_8/mlp": 0.004126377869397402, "grad/layer_8/attn_mlp_ratio": 1.070706862375548, "grad/layer_12/attn": 0.0044749523513019085, "grad/layer_12/mlp": 0.006050868891179562, "grad/layer_12/attn_mlp_ratio": 0.739555319711121, "grad/layer_16/attn": 0.003783288411796093, "grad/layer_16/mlp": 0.004238942172378302, "grad/layer_16/attn_mlp_ratio": 0.8925076513659246, "grad/layer_20/attn": 0.0042589143849909306, "grad/layer_20/mlp": 0.006007620599120855, "grad/layer_20/attn_mlp_ratio": 0.7089186548701673, "grad/layer_24/attn": 0.009255443699657917, "grad/layer_24/mlp": 0.009467204101383686, "grad/layer_24/attn_mlp_ratio": 0.9776322030008797, "grad/layer_27/attn": 0.00900005642324686, "grad/layer_27/mlp": 0.007979916408658028, "grad/layer_27/attn_mlp_ratio": 1.1278384195476236} {"step": 16750, "timestamp": 1778212638.8134682, "train/loss": 2.254506826400757, "train/z_loss": 0.0016246145241893828, "train/perplexity": 9.530591914622521, "train/grad_norm": 0.12890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021759.3391279709, "perf/iters_per_sec": 0.9640499778404097, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037290620803833, "data/tokens_consumed": 35129393152, "data/tokens_consumed_B": 35.129393152, "train/loss_slope": -1.264640396506158e-05} {"step": 16760, "timestamp": 1778212649.1835372, "train/loss": 2.2378626823425294, "train/z_loss": 0.0016302241710945964, "train/perplexity": 9.373276194306518, "train/grad_norm": 0.1181640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023825.9551571757, "perf/iters_per_sec": 0.9650354171548727, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0362313985824585, "data/tokens_consumed": 35150364672, "data/tokens_consumed_B": 35.150364672, "train/loss_slope": -1.5149501011674202e-05} {"step": 16770, "timestamp": 1778212659.5538833, "train/loss": 2.2613638401031495, "train/z_loss": 0.0016130987671203912, "train/perplexity": 9.596167884694808, "train/grad_norm": 0.130859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023298.096102147, "perf/iters_per_sec": 0.964783714343141, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0365017414093018, "data/tokens_consumed": 35171336192, "data/tokens_consumed_B": 35.171336192, "train/loss_slope": -1.5291180776612677e-05} {"step": 16780, "timestamp": 1778212669.9185677, "train/loss": 2.2493168592453, "train/z_loss": 0.001624987763352692, "train/perplexity": 9.481256590709243, "train/grad_norm": 0.1650390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024195.653011706, "perf/iters_per_sec": 0.9652117028292208, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0360421419143677, "data/tokens_consumed": 35192307712, "data/tokens_consumed_B": 35.192307712, "train/loss_slope": -1.4281850092911076e-05} {"step": 16790, "timestamp": 1778212680.2766752, "train/loss": 2.1768187046051026, "train/z_loss": 0.0016558060189709068, "train/perplexity": 8.818208263067723, "train/grad_norm": 0.265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026169.984974344, "perf/iters_per_sec": 0.9661531376716347, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035032606124878, "data/tokens_consumed": 35213279232, "data/tokens_consumed_B": 35.213279232, "train/loss_slope": -1.950477055161344e-05} {"step": 16800, "timestamp": 1778212690.618354, "grad/layer_0/attn": 0.0025560709182173014, "grad/layer_0/mlp": 0.0026886826381087303, "grad/layer_0/attn_mlp_ratio": 0.9506777731668294, "grad/layer_4/attn": 0.0018386102747172117, "grad/layer_4/mlp": 0.0026219587307423353, "grad/layer_4/attn_mlp_ratio": 0.7012353714938613, "grad/layer_8/attn": 0.006636525969952345, "grad/layer_8/mlp": 0.003903894452378154, "grad/layer_8/attn_mlp_ratio": 1.6999757244748173, "grad/layer_12/attn": 0.00450521195307374, "grad/layer_12/mlp": 0.006214349064975977, "grad/layer_12/attn_mlp_ratio": 0.7249692338604145, "grad/layer_16/attn": 0.004471469204872847, "grad/layer_16/mlp": 0.004596814978867769, "grad/layer_16/attn_mlp_ratio": 0.9727320173110388, "grad/layer_20/attn": 0.006860190071165562, "grad/layer_20/mlp": 0.006236425135284662, "grad/layer_20/attn_mlp_ratio": 1.1000196125741621, "grad/layer_24/attn": 0.008018753491342068, "grad/layer_24/mlp": 0.009174077771604061, "grad/layer_24/attn_mlp_ratio": 0.874066429734808, "grad/layer_27/attn": 0.009354074485599995, "grad/layer_27/mlp": 0.007842627353966236, "grad/layer_27/attn_mlp_ratio": 1.1927220233914564} {"step": 16800, "timestamp": 1778212691.220574, "eos/sharpness": 8.76953601837158, "eos/L0_probe": 2.0770375728607178, "eos/L_plus": 2.1356279850006104, "eos/L_minus": 2.106142520904541, "eos/grad_norm": 0.11600013077259064, "eos/embed_grad_frac": 0.21417248249053955, "eos/time_s": 0.5994954109191895} {"step": 16800, "timestamp": 1778212691.2402022, "train/loss": 2.1987570762634276, "train/z_loss": 0.001647986751049757, "train/perplexity": 9.013803064234004, "train/grad_norm": 0.1162109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1913668.450397688, "perf/iters_per_sec": 0.9125082256306114, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0958805322647094, "data/tokens_consumed": 35234250752, "data/tokens_consumed_B": 35.234250752, "train/loss_slope": -2.5088443213885465e-05} {"step": 16800, "timestamp": 1778212692.6046243, "geo/rankme_last": 440.3890075683594, "geo/layer_0/stable_rank_q_proj": 16.32178497314453, "geo/layer_0/stable_rank_k_proj": 14.414894104003906, "geo/layer_0/stable_rank_o_proj": 53.44501495361328, "geo/layer_0/stable_rank_gate_proj": 156.37425231933594, "geo/layer_0/stable_rank_down_proj": 49.21082305908203, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05008124187588692, "geo/layer_0/attn_entropy_mean": 6.293728828430176, "geo/layer_0/attn_entropy_std": 0.302844375371933, "geo/layer_7/stable_rank_q_proj": 42.83990478515625, "geo/layer_7/stable_rank_k_proj": 42.353694915771484, "geo/layer_7/stable_rank_o_proj": 110.37815856933594, "geo/layer_7/stable_rank_gate_proj": 108.84716033935547, "geo/layer_7/stable_rank_down_proj": 154.49769592285156, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5453171730041504, "geo/layer_7/attn_entropy_mean": 4.656902313232422, "geo/layer_7/attn_entropy_std": 0.8609524369239807, "geo/layer_14/stable_rank_q_proj": 58.40161895751953, "geo/layer_14/stable_rank_k_proj": 35.85898208618164, "geo/layer_14/stable_rank_o_proj": 51.99848556518555, "geo/layer_14/stable_rank_gate_proj": 93.30913543701172, "geo/layer_14/stable_rank_down_proj": 136.31973266601562, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3921380043029785, "geo/layer_14/attn_entropy_mean": 5.541245937347412, "geo/layer_14/attn_entropy_std": 0.4982442259788513, "geo/layer_21/stable_rank_q_proj": 49.2784423828125, "geo/layer_21/stable_rank_k_proj": 32.118408203125, "geo/layer_21/stable_rank_o_proj": 86.83970642089844, "geo/layer_21/stable_rank_gate_proj": 93.12169647216797, "geo/layer_21/stable_rank_down_proj": 63.42472839355469, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15763328969478607, "geo/layer_21/attn_entropy_mean": 5.774337291717529, "geo/layer_21/attn_entropy_std": 0.2966303825378418, "geo/layer_27/stable_rank_q_proj": 42.156925201416016, "geo/layer_27/stable_rank_k_proj": 32.03269577026367, "geo/layer_27/stable_rank_o_proj": 117.0309066772461, "geo/layer_27/stable_rank_gate_proj": 92.7987289428711, "geo/layer_27/stable_rank_down_proj": 142.0365753173828, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07295973598957062, "geo/layer_27/attn_entropy_mean": 4.459632873535156, "geo/layer_27/attn_entropy_std": 0.533916711807251, "attnres/final_alpha/block_0": 0.24454236030578613, "attnres/block_norm/0": 1.5656423568725586, "attnres/final_alpha/block_1": 0.007352132350206375, "attnres/block_norm/1": 25695.43359375, "attnres/final_alpha/block_2": 0.01541923452168703, "attnres/block_norm/2": 18783.576171875, "attnres/final_alpha/block_3": 0.01745467819273472, "attnres/block_norm/3": 24724.28125, "attnres/final_alpha/block_4": 0.02275310456752777, "attnres/block_norm/4": 8193.61328125, "attnres/final_alpha/block_5": 0.5507993698120117, "attnres/block_norm/5": 4662.69970703125, "attnres/final_alpha/block_6": 0.1416790634393692, "attnres/block_norm/6": 16885.91796875, "geo/tier1_time_s": 1.3602068424224854, "geo/step": 16800.0, "geo/rankme_slope": 4.918373599439756e-07} {"step": 16810, "timestamp": 1778212703.7736354, "train/loss": 2.241155433654785, "train/z_loss": 0.0016308011021465064, "train/perplexity": 9.404190931133579, "train/grad_norm": 0.1513671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1673805.8327002712, "perf/iters_per_sec": 0.7981328166486126, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2529242992401124, "data/tokens_consumed": 35255222272, "data/tokens_consumed_B": 35.255222272, "train/loss_slope": -2.58065613332612e-05} {"step": 16820, "timestamp": 1778212714.1203053, "train/loss": 2.2441238164901733, "train/z_loss": 0.0016226178500801324, "train/perplexity": 9.432147642657677, "train/grad_norm": 0.66015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028106.6934717007, "perf/iters_per_sec": 0.9670766322477821, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340442180633544, "data/tokens_consumed": 35276193792, "data/tokens_consumed_B": 35.276193792, "train/loss_slope": -2.742575905968973e-05} {"step": 16830, "timestamp": 1778212724.4809208, "train/loss": 2.262771964073181, "train/z_loss": 0.001636036194395274, "train/perplexity": 9.609689996884526, "train/grad_norm": 0.255859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025203.1103917423, "perf/iters_per_sec": 0.965692095943328, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0355267524719238, "data/tokens_consumed": 35297165312, "data/tokens_consumed_B": 35.297165312, "train/loss_slope": -2.592316995037596e-05} {"step": 16840, "timestamp": 1778212734.8491778, "train/loss": 2.248311161994934, "train/z_loss": 0.0016232346883043648, "train/perplexity": 9.471726110218246, "train/grad_norm": 0.16796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023765.4228995747, "perf/iters_per_sec": 0.9650065531251787, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0362623929977417, "data/tokens_consumed": 35318136832, "data/tokens_consumed_B": 35.318136832, "train/loss_slope": -2.6295862883159525e-05} {"step": 16850, "timestamp": 1778212745.2134042, "grad/layer_0/attn": 0.002420207019895315, "grad/layer_0/mlp": 0.002370474860072136, "grad/layer_0/attn_mlp_ratio": 1.0209797870303874, "grad/layer_4/attn": 0.001590888830833137, "grad/layer_4/mlp": 0.0024929873179644346, "grad/layer_4/attn_mlp_ratio": 0.6381455515455926, "grad/layer_8/attn": 0.004764643497765064, "grad/layer_8/mlp": 0.0037858246359974146, "grad/layer_8/attn_mlp_ratio": 1.25854835604527, "grad/layer_12/attn": 0.003809926565736532, "grad/layer_12/mlp": 0.005802633706480265, "grad/layer_12/attn_mlp_ratio": 0.6565857320656153, "grad/layer_16/attn": 0.0042242128401994705, "grad/layer_16/mlp": 0.0041580344550311565, "grad/layer_16/attn_mlp_ratio": 1.0159157612310459, "grad/layer_20/attn": 0.005912054795771837, "grad/layer_20/mlp": 0.005773377139121294, "grad/layer_20/attn_mlp_ratio": 1.0240201793346957, "grad/layer_24/attn": 0.011184635572135448, "grad/layer_24/mlp": 0.010010828264057636, "grad/layer_24/attn_mlp_ratio": 1.117253754174049, "grad/layer_27/attn": 0.006495227571576834, "grad/layer_27/mlp": 0.009420282207429409, "grad/layer_27/attn_mlp_ratio": 0.689493940797751} {"step": 16850, "timestamp": 1778212745.2295458, "train/loss": 2.2429731369018553, "train/z_loss": 0.0016297762049362063, "train/perplexity": 9.421300504878145, "train/grad_norm": 0.142578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021440.7471695475, "perf/iters_per_sec": 0.9638980613563287, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037454104423523, "data/tokens_consumed": 35339108352, "data/tokens_consumed_B": 35.339108352, "train/loss_slope": -2.6617622568626676e-05} {"step": 16860, "timestamp": 1778212755.6113217, "train/loss": 2.2660048961639405, "train/z_loss": 0.0016214303555898367, "train/perplexity": 9.64080774573897, "train/grad_norm": 0.138671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021098.896384599, "perf/iters_per_sec": 0.9637350541995043, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0376295804977418, "data/tokens_consumed": 35360079872, "data/tokens_consumed_B": 35.360079872, "train/loss_slope": -2.7122589785738685e-05} {"step": 16870, "timestamp": 1778212766.0003552, "train/loss": 2.2851593017578127, "train/z_loss": 0.0016140210092999042, "train/perplexity": 9.827251597784654, "train/grad_norm": 0.109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019630.8110313935, "perf/iters_per_sec": 0.9630350165516822, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0383838415145874, "data/tokens_consumed": 35381051392, "data/tokens_consumed_B": 35.381051392, "train/loss_slope": -2.509192611375014e-05} {"step": 16875, "timestamp": 1778212771.8087227, "eos/sharpness": 25.087070465087887, "eos/L0_probe": 2.0793726444244385, "eos/L_plus": 2.2096149921417236, "eos/L_minus": 2.2000010013580322, "eos/grad_norm": 0.175460547208786, "eos/embed_grad_frac": 0.092765673995018, "eos/time_s": 0.6272673606872559} {"step": 16875, "timestamp": 1778212773.1880527, "geo/rankme_last": 441.3537292480469, "geo/layer_0/stable_rank_q_proj": 16.3182315826416, "geo/layer_0/stable_rank_k_proj": 14.455588340759277, "geo/layer_0/stable_rank_o_proj": 53.37029266357422, "geo/layer_0/stable_rank_gate_proj": 156.36865234375, "geo/layer_0/stable_rank_down_proj": 49.161434173583984, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.04448878765106201, "geo/layer_0/attn_entropy_mean": 6.2874932289123535, "geo/layer_0/attn_entropy_std": 0.3056081533432007, "geo/layer_7/stable_rank_q_proj": 42.764915466308594, "geo/layer_7/stable_rank_k_proj": 42.15764617919922, "geo/layer_7/stable_rank_o_proj": 109.92147064208984, "geo/layer_7/stable_rank_gate_proj": 108.7046890258789, "geo/layer_7/stable_rank_down_proj": 154.40284729003906, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5684573650360107, "geo/layer_7/attn_entropy_mean": 4.643474578857422, "geo/layer_7/attn_entropy_std": 0.8653896450996399, "geo/layer_14/stable_rank_q_proj": 58.46839904785156, "geo/layer_14/stable_rank_k_proj": 35.65997314453125, "geo/layer_14/stable_rank_o_proj": 52.058570861816406, "geo/layer_14/stable_rank_gate_proj": 93.34308624267578, "geo/layer_14/stable_rank_down_proj": 136.2791748046875, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3802824020385742, "geo/layer_14/attn_entropy_mean": 5.532472610473633, "geo/layer_14/attn_entropy_std": 0.5164183974266052, "geo/layer_21/stable_rank_q_proj": 49.24427795410156, "geo/layer_21/stable_rank_k_proj": 32.084903717041016, "geo/layer_21/stable_rank_o_proj": 86.90982055664062, "geo/layer_21/stable_rank_gate_proj": 92.71797943115234, "geo/layer_21/stable_rank_down_proj": 63.478797912597656, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1554756462574005, "geo/layer_21/attn_entropy_mean": 5.755602836608887, "geo/layer_21/attn_entropy_std": 0.2932383418083191, "geo/layer_27/stable_rank_q_proj": 42.139549255371094, "geo/layer_27/stable_rank_k_proj": 32.06245803833008, "geo/layer_27/stable_rank_o_proj": 116.89484405517578, "geo/layer_27/stable_rank_gate_proj": 92.73881530761719, "geo/layer_27/stable_rank_down_proj": 141.7423858642578, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07096829265356064, "geo/layer_27/attn_entropy_mean": 4.407323837280273, "geo/layer_27/attn_entropy_std": 0.5279600024223328, "attnres/final_alpha/block_0": 0.24570471048355103, "attnres/block_norm/0": 1.5667073726654053, "attnres/final_alpha/block_1": 0.007309652399271727, "attnres/block_norm/1": 25668.064453125, "attnres/final_alpha/block_2": 0.015345985069870949, "attnres/block_norm/2": 18845.1328125, "attnres/final_alpha/block_3": 0.017820239067077637, "attnres/block_norm/3": 24717.85546875, "attnres/final_alpha/block_4": 0.023044388741254807, "attnres/block_norm/4": 8207.6484375, "attnres/final_alpha/block_5": 0.5461234450340271, "attnres/block_norm/5": 4728.1298828125, "attnres/final_alpha/block_6": 0.1446515917778015, "attnres/block_norm/6": 16895.705078125, "geo/tier1_time_s": 1.3590803146362305, "geo/step": 16875.0, "geo/rankme_slope": -4.119421205982393e-07} {"step": 16880, "timestamp": 1778212778.3822384, "train/loss": 2.2309937715530395, "train/z_loss": 0.0016230333596467972, "train/perplexity": 9.309112615551731, "train/grad_norm": 0.1552734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1694813.8151699454, "perf/iters_per_sec": 0.8081502033090331, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2373937368392944, "data/tokens_consumed": 35402022912, "data/tokens_consumed_B": 35.402022912, "train/loss_slope": -2.728745573961158e-05} {"step": 16890, "timestamp": 1778212788.7569485, "train/loss": 2.2230453729629516, "train/z_loss": 0.0016235513845458626, "train/perplexity": 9.235413361494814, "train/grad_norm": 0.1796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022324.009713858, "perf/iters_per_sec": 0.964319233757905, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0370009899139405, "data/tokens_consumed": 35422994432, "data/tokens_consumed_B": 35.422994432, "train/loss_slope": -2.6310808673621736e-05} {"step": 16900, "timestamp": 1778212799.1238549, "grad/layer_0/attn": 0.003356976667419076, "grad/layer_0/mlp": 0.002940758829936385, "grad/layer_0/attn_mlp_ratio": 1.1415341234691734, "grad/layer_4/attn": 0.001940934918820858, "grad/layer_4/mlp": 0.0025786159094423056, "grad/layer_4/attn_mlp_ratio": 0.7527041295460803, "grad/layer_8/attn": 0.00470992736518383, "grad/layer_8/mlp": 0.0039596171118319035, "grad/layer_8/attn_mlp_ratio": 1.1894905778038072, "grad/layer_12/attn": 0.004265619441866875, "grad/layer_12/mlp": 0.006060203071683645, "grad/layer_12/attn_mlp_ratio": 0.7038739991091422, "grad/layer_16/attn": 0.004253423772752285, "grad/layer_16/mlp": 0.004358375445008278, "grad/layer_16/attn_mlp_ratio": 0.9759195206626479, "grad/layer_20/attn": 0.006931636482477188, "grad/layer_20/mlp": 0.006241029128432274, "grad/layer_20/attn_mlp_ratio": 1.1106559877814244, "grad/layer_24/attn": 0.011600349098443985, "grad/layer_24/mlp": 0.01032975036650896, "grad/layer_24/attn_mlp_ratio": 1.1230038069220114, "grad/layer_27/attn": 0.008026372641324997, "grad/layer_27/mlp": 0.008425104431807995, "grad/layer_27/attn_mlp_ratio": 0.9526733598405062} {"step": 16900, "timestamp": 1778212799.139573, "train/loss": 2.23233482837677, "train/z_loss": 0.0016199904144741594, "train/perplexity": 9.32160503920045, "train/grad_norm": 0.1640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020812.3138968365, "perf/iters_per_sec": 0.9635984010204489, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037776732444763, "data/tokens_consumed": 35443965952, "data/tokens_consumed_B": 35.443965952, "train/loss_slope": -2.7986345189561216e-05} {"step": 16910, "timestamp": 1778212809.512848, "train/loss": 2.255783534049988, "train/z_loss": 0.0016150445793755353, "train/perplexity": 9.542767464876864, "train/grad_norm": 0.11669921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022779.6295293912, "perf/iters_per_sec": 0.9645364902159649, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0367674112319947, "data/tokens_consumed": 35464937472, "data/tokens_consumed_B": 35.464937472, "train/loss_slope": -2.479123202952543e-05} {"step": 16920, "timestamp": 1778212819.8993974, "train/loss": 2.234002423286438, "train/z_loss": 0.0016334188054315745, "train/perplexity": 9.337162668620255, "train/grad_norm": 0.25390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020038.5480200984, "perf/iters_per_sec": 0.9632294406986706, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0381742477416993, "data/tokens_consumed": 35485908992, "data/tokens_consumed_B": 35.485908992, "train/loss_slope": -2.3399344951298863e-05} {"step": 16930, "timestamp": 1778212830.2561524, "train/loss": 2.152574324607849, "train/z_loss": 0.0016468226676806808, "train/perplexity": 8.60698708039068, "train/grad_norm": 0.1376953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026270.756116959, "perf/iters_per_sec": 0.9662011890969081, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03498113155365, "data/tokens_consumed": 35506880512, "data/tokens_consumed_B": 35.506880512, "train/loss_slope": -2.6515052721302248e-05} {"step": 16940, "timestamp": 1778212840.608257, "train/loss": 2.2395658016204836, "train/z_loss": 0.001623375783674419, "train/perplexity": 9.389253603545447, "train/grad_norm": 0.10009765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027266.2636287617, "perf/iters_per_sec": 0.966675884069806, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034472894668579, "data/tokens_consumed": 35527852032, "data/tokens_consumed_B": 35.527852032, "train/loss_slope": -2.5289942751122466e-05} {"step": 16950, "timestamp": 1778212850.9560468, "grad/layer_0/attn": 0.0029366982635110617, "grad/layer_0/mlp": 0.002686366206035018, "grad/layer_0/attn_mlp_ratio": 1.093186084456786, "grad/layer_4/attn": 0.0018678369233384728, "grad/layer_4/mlp": 0.0026054850313812494, "grad/layer_4/attn_mlp_ratio": 0.7168864258105645, "grad/layer_8/attn": 0.005262672435492277, "grad/layer_8/mlp": 0.003750827629119158, "grad/layer_8/attn_mlp_ratio": 1.4030696197098234, "grad/layer_12/attn": 0.004460182506591082, "grad/layer_12/mlp": 0.006708729546517134, "grad/layer_12/attn_mlp_ratio": 0.6648326496368213, "grad/layer_16/attn": 0.006367442663758993, "grad/layer_16/mlp": 0.004792527761310339, "grad/layer_16/attn_mlp_ratio": 1.328618809952636, "grad/layer_20/attn": 0.003926553297787905, "grad/layer_20/mlp": 0.0062921312637627125, "grad/layer_20/attn_mlp_ratio": 0.6240418501752028, "grad/layer_24/attn": 0.006754502188414335, "grad/layer_24/mlp": 0.008376487530767918, "grad/layer_24/attn_mlp_ratio": 0.80636449143722, "grad/layer_27/attn": 0.007309342734515667, "grad/layer_27/mlp": 0.00697339279577136, "grad/layer_27/attn_mlp_ratio": 1.0481759516157516} {"step": 16950, "timestamp": 1778212851.5684383, "eos/sharpness": 5.733585357666015, "eos/L0_probe": 2.075359344482422, "eos/L_plus": 2.1101667881011963, "eos/L_minus": 2.0978877544403076, "eos/grad_norm": 0.10600850731134415, "eos/embed_grad_frac": 0.20614343881607056, "eos/time_s": 0.6095736026763916} {"step": 16950, "timestamp": 1778212851.5880938, "train/loss": 2.2851277351379395, "train/z_loss": 0.0016167516470886767, "train/perplexity": 9.826941389565208, "train/grad_norm": 0.10595703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1911189.2651850486, "perf/iters_per_sec": 0.9113260579991572, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0973021030426025, "data/tokens_consumed": 35548823552, "data/tokens_consumed_B": 35.548823552, "train/loss_slope": -2.0744919848449204e-05} {"step": 16950, "timestamp": 1778212852.9522166, "geo/rankme_last": 441.0444641113281, "geo/layer_0/stable_rank_q_proj": 16.359952926635742, "geo/layer_0/stable_rank_k_proj": 14.511672973632812, "geo/layer_0/stable_rank_o_proj": 53.317935943603516, "geo/layer_0/stable_rank_gate_proj": 156.0133514404297, "geo/layer_0/stable_rank_down_proj": 49.21710205078125, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.04177803546190262, "geo/layer_0/attn_entropy_mean": 6.285195827484131, "geo/layer_0/attn_entropy_std": 0.3091192841529846, "geo/layer_7/stable_rank_q_proj": 42.68098068237305, "geo/layer_7/stable_rank_k_proj": 42.16907501220703, "geo/layer_7/stable_rank_o_proj": 109.9217300415039, "geo/layer_7/stable_rank_gate_proj": 109.05693054199219, "geo/layer_7/stable_rank_down_proj": 154.0255584716797, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5866547226905823, "geo/layer_7/attn_entropy_mean": 4.643011093139648, "geo/layer_7/attn_entropy_std": 0.8803406357765198, "geo/layer_14/stable_rank_q_proj": 58.55332565307617, "geo/layer_14/stable_rank_k_proj": 35.56359100341797, "geo/layer_14/stable_rank_o_proj": 52.0895881652832, "geo/layer_14/stable_rank_gate_proj": 93.458251953125, "geo/layer_14/stable_rank_down_proj": 136.33743286132812, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3814641833305359, "geo/layer_14/attn_entropy_mean": 5.556501388549805, "geo/layer_14/attn_entropy_std": 0.5209494233131409, "geo/layer_21/stable_rank_q_proj": 49.16389465332031, "geo/layer_21/stable_rank_k_proj": 32.011962890625, "geo/layer_21/stable_rank_o_proj": 87.10055541992188, "geo/layer_21/stable_rank_gate_proj": 92.82299041748047, "geo/layer_21/stable_rank_down_proj": 63.38257598876953, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15748918056488037, "geo/layer_21/attn_entropy_mean": 5.774463653564453, "geo/layer_21/attn_entropy_std": 0.2817472219467163, "geo/layer_27/stable_rank_q_proj": 42.07158279418945, "geo/layer_27/stable_rank_k_proj": 32.05623245239258, "geo/layer_27/stable_rank_o_proj": 117.01185607910156, "geo/layer_27/stable_rank_gate_proj": 92.62001037597656, "geo/layer_27/stable_rank_down_proj": 141.21966552734375, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0748392716050148, "geo/layer_27/attn_entropy_mean": 4.445610046386719, "geo/layer_27/attn_entropy_std": 0.5212461948394775, "attnres/final_alpha/block_0": 0.24450771510601044, "attnres/block_norm/0": 1.5679196119308472, "attnres/final_alpha/block_1": 0.007338282186537981, "attnres/block_norm/1": 25879.1328125, "attnres/final_alpha/block_2": 0.015396641567349434, "attnres/block_norm/2": 18959.50390625, "attnres/final_alpha/block_3": 0.017570633441209793, "attnres/block_norm/3": 24757.796875, "attnres/final_alpha/block_4": 0.023133205249905586, "attnres/block_norm/4": 8217.0859375, "attnres/final_alpha/block_5": 0.548340916633606, "attnres/block_norm/5": 4717.7041015625, "attnres/final_alpha/block_6": 0.1437126100063324, "attnres/block_norm/6": 16986.59375, "geo/tier1_time_s": 1.3601324558258057, "geo/step": 16950.0, "geo/rankme_slope": 1.356249531062425e-05} {"step": 16960, "timestamp": 1778212863.3050146, "train/loss": 2.2366413831710816, "train/z_loss": 0.0016162905609235167, "train/perplexity": 9.361835607468311, "train/grad_norm": 0.2109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1790440.630434429, "perf/iters_per_sec": 0.8537486221477647, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.171304965019226, "data/tokens_consumed": 35569795072, "data/tokens_consumed_B": 35.569795072, "train/loss_slope": -1.9870078831937873e-05} {"step": 16970, "timestamp": 1778212873.6680198, "train/loss": 2.2470436811447145, "train/z_loss": 0.0016079590772278608, "train/perplexity": 9.459728483742113, "train/grad_norm": 0.1484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024749.2879345578, "perf/iters_per_sec": 0.9654756965325154, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0357588529586792, "data/tokens_consumed": 35590766592, "data/tokens_consumed_B": 35.590766592, "train/loss_slope": -2.2342343155843392e-05} {"step": 16980, "timestamp": 1778212884.0297577, "train/loss": 2.226103162765503, "train/z_loss": 0.0016138929175212979, "train/perplexity": 9.263696534255079, "train/grad_norm": 0.244140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024807.1290799053, "perf/iters_per_sec": 0.9655032773398902, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0357292652130128, "data/tokens_consumed": 35611738112, "data/tokens_consumed_B": 35.611738112, "train/loss_slope": -2.4031326307965668e-05} {"step": 16990, "timestamp": 1778212894.7747722, "train/loss": 2.2512940168380737, "train/z_loss": 0.0016170529183000327, "train/perplexity": 9.500021073223369, "train/grad_norm": 0.228515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1952988.455311041, "perf/iters_per_sec": 0.9312574650340276, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0738168954849243, "data/tokens_consumed": 35632709632, "data/tokens_consumed_B": 35.632709632, "train/loss_slope": -2.2108387174528995e-05} {"step": 17000, "timestamp": 1778212905.619485, "grad/layer_0/attn": 0.002688672160729766, "grad/layer_0/mlp": 0.0025334074161946774, "grad/layer_0/attn_mlp_ratio": 1.0612868808285154, "grad/layer_4/attn": 0.0017829978605732322, "grad/layer_4/mlp": 0.002501223934814334, "grad/layer_4/attn_mlp_ratio": 0.7128501228821689, "grad/layer_8/attn": 0.005075855180621147, "grad/layer_8/mlp": 0.003683180082589388, "grad/layer_8/attn_mlp_ratio": 1.378117531315741, "grad/layer_12/attn": 0.004540132824331522, "grad/layer_12/mlp": 0.006136729381978512, "grad/layer_12/attn_mlp_ratio": 0.7398293891989779, "grad/layer_16/attn": 0.004635143559426069, "grad/layer_16/mlp": 0.004555894993245602, "grad/layer_16/attn_mlp_ratio": 1.017394708297379, "grad/layer_20/attn": 0.004036500584334135, "grad/layer_20/mlp": 0.006785150151699781, "grad/layer_20/attn_mlp_ratio": 0.5949021664366138, "grad/layer_24/attn": 0.016151640564203262, "grad/layer_24/mlp": 0.01103372871875763, "grad/layer_24/attn_mlp_ratio": 1.4638424443371354, "grad/layer_27/attn": 0.010803068988025188, "grad/layer_27/mlp": 0.008867341093719006, "grad/layer_27/attn_mlp_ratio": 1.2182985578221932} {"step": 17000, "timestamp": 1778212905.63518, "train/loss": 2.237778091430664, "train/z_loss": 0.0016239299438893795, "train/perplexity": 9.372483333860941, "train/grad_norm": 0.17578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1931928.0240565105, "perf/iters_per_sec": 0.921215068844085, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0855228424072265, "data/tokens_consumed": 35653681152, "data/tokens_consumed_B": 35.653681152, "train/loss_slope": -1.967409844517237e-05} {"step": 17000, "timestamp": 1778212912.6335986, "geo/ww_alpha_mean": 7.909895315569453, "geo/ww_alpha_std": 4.408582880534472, "geo/ww_alpha_min": 1.356435457615729, "geo/ww_alpha_max": 23.74135495966045, "geo/ww_alpha_healthy_frac": 0.16751269035532995, "geo/ww_alpha_by_type/q_proj": 4.247220525708476, "geo/ww_alpha_by_type/k_proj": 4.6846231112024626, "geo/ww_alpha_by_type/v_proj": 8.185315602185133, "geo/ww_alpha_by_type/o_proj": 7.899099152646209, "geo/ww_alpha_by_type/gate_proj": 9.503194878410863, "geo/ww_alpha_by_type/up_proj": 12.012834534680294, "geo/ww_alpha_by_type/down_proj": 8.967308407213487, "geo/twonn_id/layer_0": 0.7216154932975769, "geo/twonn_id/layer_7": 3.31268310546875, "geo/twonn_id/layer_14": 3.928190231323242, "geo/twonn_id/layer_21": 7.5503950119018555, "geo/twonn_id/layer_27": 5.797934532165527, "geo/tier2_time_s": 6.993031740188599} {"step": 17000, "timestamp": 1778212913.2503788, "eoc/jacobian_sigma/layer_0/attn": 791.7455444335938, "eoc/jacobian_sigma/layer_0/mlp": 4651.376953125, "eoc/jacobian_sigma/layer_0": 4651.376953125, "eoc/jacobian_sigma/layer_7/attn": 1.121457815170288, "eoc/jacobian_sigma/layer_7/mlp": 1.5976227521896362, "eoc/jacobian_sigma/layer_7": 1.5976227521896362, "eoc/jacobian_sigma/layer_14/attn": 1.6059609651565552, "eoc/jacobian_sigma/layer_14/mlp": 7.2939276695251465, "eoc/jacobian_sigma/layer_14": 7.2939276695251465, "eoc/jacobian_sigma/layer_21/attn": 1.0802345275878906, "eoc/jacobian_sigma/layer_21/mlp": 3.833879232406616, "eoc/jacobian_sigma/layer_21": 3.833879232406616, "eoc/jacobian_sigma/layer_27/attn": 3.220059633255005, "eoc/jacobian_sigma/layer_27/mlp": 23.885129928588867, "eoc/jacobian_sigma/layer_27": 23.885129928588867, "eoc/layer0_sigma": 4651.376953125, "eoc/sigma_max": 23.885129928588867, "eoc/sigma_min": 1.5976227521896362, "eoc/sigma_mean": 9.152639895677567, "eoc/time_s": 0.6110851764678955} {"step": 17010, "timestamp": 1778212923.6198459, "train/loss": 2.2540189266204833, "train/z_loss": 0.0016118563944473863, "train/perplexity": 9.525943075097612, "train/grad_norm": 0.1728515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1166343.4114152386, "perf/iters_per_sec": 0.5561558777881807, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.7980570554733277, "data/tokens_consumed": 35674652672, "data/tokens_consumed_B": 35.674652672, "train/loss_slope": -2.0567263571163723e-05} {"step": 17020, "timestamp": 1778212933.9815285, "train/loss": 2.2648760080337524, "train/z_loss": 0.001607992232311517, "train/perplexity": 9.629930493065249, "train/grad_norm": 0.12255859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025132.9842160097, "perf/iters_per_sec": 0.9656586571769761, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0355626106262208, "data/tokens_consumed": 35695624192, "data/tokens_consumed_B": 35.695624192, "train/loss_slope": -1.773741256715004e-05} {"step": 17025, "timestamp": 1778212939.7720127, "eos/sharpness": 19.398283958435055, "eos/L0_probe": 2.073214292526245, "eos/L_plus": 2.1857497692108154, "eos/L_minus": 2.1546616554260254, "eos/grad_norm": 0.1071082279086113, "eos/embed_grad_frac": 0.19446790218353271, "eos/time_s": 0.6136429309844971} {"step": 17025, "timestamp": 1778212941.1578898, "geo/rankme_last": 441.24871826171875, "geo/layer_0/stable_rank_q_proj": 16.420225143432617, "geo/layer_0/stable_rank_k_proj": 14.488882064819336, "geo/layer_0/stable_rank_o_proj": 53.208824157714844, "geo/layer_0/stable_rank_gate_proj": 155.6998291015625, "geo/layer_0/stable_rank_down_proj": 49.32087326049805, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.04083840921521187, "geo/layer_0/attn_entropy_mean": 6.2901105880737305, "geo/layer_0/attn_entropy_std": 0.3094927668571472, "geo/layer_7/stable_rank_q_proj": 42.74653244018555, "geo/layer_7/stable_rank_k_proj": 42.185890197753906, "geo/layer_7/stable_rank_o_proj": 109.90786743164062, "geo/layer_7/stable_rank_gate_proj": 108.97821044921875, "geo/layer_7/stable_rank_down_proj": 154.3939666748047, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5744494199752808, "geo/layer_7/attn_entropy_mean": 4.663931369781494, "geo/layer_7/attn_entropy_std": 0.8727918863296509, "geo/layer_14/stable_rank_q_proj": 58.55630874633789, "geo/layer_14/stable_rank_k_proj": 35.51388931274414, "geo/layer_14/stable_rank_o_proj": 52.31996154785156, "geo/layer_14/stable_rank_gate_proj": 93.1392593383789, "geo/layer_14/stable_rank_down_proj": 136.5497283935547, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37829282879829407, "geo/layer_14/attn_entropy_mean": 5.540894031524658, "geo/layer_14/attn_entropy_std": 0.5206817388534546, "geo/layer_21/stable_rank_q_proj": 49.1704216003418, "geo/layer_21/stable_rank_k_proj": 31.944583892822266, "geo/layer_21/stable_rank_o_proj": 87.14300537109375, "geo/layer_21/stable_rank_gate_proj": 92.45606231689453, "geo/layer_21/stable_rank_down_proj": 63.24913787841797, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15651647746562958, "geo/layer_21/attn_entropy_mean": 5.780046463012695, "geo/layer_21/attn_entropy_std": 0.2905269265174866, "geo/layer_27/stable_rank_q_proj": 42.14363479614258, "geo/layer_27/stable_rank_k_proj": 32.10868453979492, "geo/layer_27/stable_rank_o_proj": 117.05757904052734, "geo/layer_27/stable_rank_gate_proj": 92.6836929321289, "geo/layer_27/stable_rank_down_proj": 140.68678283691406, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0757625624537468, "geo/layer_27/attn_entropy_mean": 4.445375919342041, "geo/layer_27/attn_entropy_std": 0.522039532661438, "attnres/final_alpha/block_0": 0.2429458647966385, "attnres/block_norm/0": 1.5691614151000977, "attnres/final_alpha/block_1": 0.007247837260365486, "attnres/block_norm/1": 25990.6484375, "attnres/final_alpha/block_2": 0.015138218179345131, "attnres/block_norm/2": 18904.236328125, "attnres/final_alpha/block_3": 0.017216186970472336, "attnres/block_norm/3": 25045.67578125, "attnres/final_alpha/block_4": 0.0227647852152586, "attnres/block_norm/4": 8241.31640625, "attnres/final_alpha/block_5": 0.5527575016021729, "attnres/block_norm/5": 4692.515625, "attnres/final_alpha/block_6": 0.14192958176136017, "attnres/block_norm/6": 17089.322265625, "geo/tier1_time_s": 1.3645739555358887, "geo/step": 17025.0, "geo/rankme_slope": 2.1185896233493396e-05} {"step": 17030, "timestamp": 1778212946.3414068, "train/loss": 2.282440519332886, "train/z_loss": 0.001618380902800709, "train/perplexity": 9.800569726392302, "train/grad_norm": 0.181640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1697445.638065885, "perf/iters_per_sec": 0.8094051542596269, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.235475206375122, "data/tokens_consumed": 35716595712, "data/tokens_consumed_B": 35.716595712, "train/loss_slope": -1.2551864424590106e-05} {"step": 17040, "timestamp": 1778212956.6965408, "train/loss": 2.220312976837158, "train/z_loss": 0.00164061269024387, "train/perplexity": 9.210212998172121, "train/grad_norm": 0.2138671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026592.086487658, "perf/iters_per_sec": 0.9663544113577166, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348170280456543, "data/tokens_consumed": 35737567232, "data/tokens_consumed_B": 35.737567232, "train/loss_slope": -1.532866502716625e-05} {"step": 17050, "timestamp": 1778212967.5530174, "grad/layer_0/attn": 0.002957196207717061, "grad/layer_0/mlp": 0.0028290797490626574, "grad/layer_0/attn_mlp_ratio": 1.045285522321631, "grad/layer_4/attn": 0.001640514936298132, "grad/layer_4/mlp": 0.002512428443878889, "grad/layer_4/attn_mlp_ratio": 0.6529598385175855, "grad/layer_8/attn": 0.00554779265075922, "grad/layer_8/mlp": 0.0037157535552978516, "grad/layer_8/attn_mlp_ratio": 1.4930464087276836, "grad/layer_12/attn": 0.004045833833515644, "grad/layer_12/mlp": 0.005478470120579004, "grad/layer_12/attn_mlp_ratio": 0.7384970020131005, "grad/layer_16/attn": 0.004120426252484322, "grad/layer_16/mlp": 0.004267371725291014, "grad/layer_16/attn_mlp_ratio": 0.9655653224460536, "grad/layer_20/attn": 0.010340763255953789, "grad/layer_20/mlp": 0.006294251419603825, "grad/layer_20/attn_mlp_ratio": 1.6428900598819194, "grad/layer_24/attn": 0.014929949305951595, "grad/layer_24/mlp": 0.01384790614247322, "grad/layer_24/attn_mlp_ratio": 1.0781376653287569, "grad/layer_27/attn": 0.004989125300198793, "grad/layer_27/mlp": 0.014091795310378075, "grad/layer_27/attn_mlp_ratio": 0.35404468734512645} {"step": 17050, "timestamp": 1778212967.5689201, "train/loss": 2.300135350227356, "train/z_loss": 0.0016009577782824635, "train/perplexity": 9.975532554043728, "train/grad_norm": 0.205078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1929832.2567014834, "perf/iters_per_sec": 0.920215729094259, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0867017030715942, "data/tokens_consumed": 35758538752, "data/tokens_consumed_B": 35.758538752, "train/loss_slope": -1.2189277921608468e-05} {"step": 17060, "timestamp": 1778212978.3572443, "train/loss": 2.226164126396179, "train/z_loss": 0.0016244687838479876, "train/perplexity": 9.2642613000442, "train/grad_norm": 0.1962890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1944980.1153996063, "perf/iters_per_sec": 0.9274387909887344, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.078238272666931, "data/tokens_consumed": 35779510272, "data/tokens_consumed_B": 35.779510272, "train/loss_slope": -1.2173720326038736e-05} {"step": 17070, "timestamp": 1778212988.7498968, "train/loss": 2.2218227863311766, "train/z_loss": 0.0016347477794624865, "train/perplexity": 9.22412916793779, "train/grad_norm": 0.1435546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019619.4036275756, "perf/iters_per_sec": 0.9630295770776632, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0383897066116332, "data/tokens_consumed": 35800481792, "data/tokens_consumed_B": 35.800481792, "train/loss_slope": -1.5002797617770615e-05} {"step": 17080, "timestamp": 1778212999.109512, "train/loss": 2.2065531253814696, "train/z_loss": 0.0016293598106130957, "train/perplexity": 9.084349751071887, "train/grad_norm": 0.2041015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025662.5476269198, "perf/iters_per_sec": 0.9659111726889228, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035291886329651, "data/tokens_consumed": 35821453312, "data/tokens_consumed_B": 35.821453312, "train/loss_slope": -1.618246766779106e-05} {"step": 17090, "timestamp": 1778213009.4662747, "train/loss": 2.252963399887085, "train/z_loss": 0.00161615377292037, "train/perplexity": 9.51589349225574, "train/grad_norm": 0.138671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025884.528937195, "perf/iters_per_sec": 0.9660170216260886, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351784467697143, "data/tokens_consumed": 35842424832, "data/tokens_consumed_B": 35.842424832, "train/loss_slope": -1.1019678019990833e-05} {"step": 17100, "timestamp": 1778213019.8240619, "grad/layer_0/attn": 0.002883834997192025, "grad/layer_0/mlp": 0.002652612281963229, "grad/layer_0/attn_mlp_ratio": 1.0871678865713752, "grad/layer_4/attn": 0.0015318350633606315, "grad/layer_4/mlp": 0.002595963655039668, "grad/layer_4/attn_mlp_ratio": 0.5900833786245309, "grad/layer_8/attn": 0.00473843514919281, "grad/layer_8/mlp": 0.0037796490360051394, "grad/layer_8/attn_mlp_ratio": 1.2536706394395776, "grad/layer_12/attn": 0.005481429398059845, "grad/layer_12/mlp": 0.006030299235135317, "grad/layer_12/attn_mlp_ratio": 0.9089813114454367, "grad/layer_16/attn": 0.005969658028334379, "grad/layer_16/mlp": 0.004493236541748047, "grad/layer_16/attn_mlp_ratio": 1.3285874981229446, "grad/layer_20/attn": 0.0069823917001485825, "grad/layer_20/mlp": 0.006095849443227053, "grad/layer_20/attn_mlp_ratio": 1.1454337333352569, "grad/layer_24/attn": 0.01182098500430584, "grad/layer_24/mlp": 0.013172116130590439, "grad/layer_24/attn_mlp_ratio": 0.8974248934163835, "grad/layer_27/attn": 0.015604582615196705, "grad/layer_27/mlp": 0.011923449113965034, "grad/layer_27/attn_mlp_ratio": 1.3087305808222203} {"step": 17100, "timestamp": 1778213020.450593, "eos/sharpness": 21.64511680603027, "eos/L0_probe": 2.069636821746826, "eos/L_plus": 2.1805739402770996, "eos/L_minus": 2.1751508712768555, "eos/grad_norm": 0.17455942928791046, "eos/embed_grad_frac": 0.086457259953022, "eos/time_s": 0.623668909072876} {"step": 17100, "timestamp": 1778213020.4705145, "train/loss": 2.2494240045547484, "train/z_loss": 0.0016096991021186113, "train/perplexity": 9.482272517305526, "train/grad_norm": 0.1748046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1906660.2197712634, "perf/iters_per_sec": 0.909166440854675, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0999086141586303, "data/tokens_consumed": 35863396352, "data/tokens_consumed_B": 35.863396352, "train/loss_slope": -8.907004680284436e-06} {"step": 17100, "timestamp": 1778213021.836764, "geo/rankme_last": 441.2994384765625, "geo/layer_0/stable_rank_q_proj": 16.395898818969727, "geo/layer_0/stable_rank_k_proj": 14.548223495483398, "geo/layer_0/stable_rank_o_proj": 53.118682861328125, "geo/layer_0/stable_rank_gate_proj": 155.95809936523438, "geo/layer_0/stable_rank_down_proj": 49.29548645019531, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.04733898118138313, "geo/layer_0/attn_entropy_mean": 6.286913871765137, "geo/layer_0/attn_entropy_std": 0.30738458037376404, "geo/layer_7/stable_rank_q_proj": 42.7864990234375, "geo/layer_7/stable_rank_k_proj": 42.32600402832031, "geo/layer_7/stable_rank_o_proj": 110.26764678955078, "geo/layer_7/stable_rank_gate_proj": 108.7475357055664, "geo/layer_7/stable_rank_down_proj": 154.19911193847656, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5752787590026855, "geo/layer_7/attn_entropy_mean": 4.643194198608398, "geo/layer_7/attn_entropy_std": 0.8604261875152588, "geo/layer_14/stable_rank_q_proj": 58.43609619140625, "geo/layer_14/stable_rank_k_proj": 35.478511810302734, "geo/layer_14/stable_rank_o_proj": 52.38941955566406, "geo/layer_14/stable_rank_gate_proj": 92.9161605834961, "geo/layer_14/stable_rank_down_proj": 136.3922576904297, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3757956027984619, "geo/layer_14/attn_entropy_mean": 5.5263447761535645, "geo/layer_14/attn_entropy_std": 0.4955044686794281, "geo/layer_21/stable_rank_q_proj": 49.242271423339844, "geo/layer_21/stable_rank_k_proj": 32.05290985107422, "geo/layer_21/stable_rank_o_proj": 87.15641021728516, "geo/layer_21/stable_rank_gate_proj": 92.17003631591797, "geo/layer_21/stable_rank_down_proj": 63.25457763671875, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15378203988075256, "geo/layer_21/attn_entropy_mean": 5.763881683349609, "geo/layer_21/attn_entropy_std": 0.2859863340854645, "geo/layer_27/stable_rank_q_proj": 42.0216064453125, "geo/layer_27/stable_rank_k_proj": 31.96422576904297, "geo/layer_27/stable_rank_o_proj": 116.67200469970703, "geo/layer_27/stable_rank_gate_proj": 92.64006805419922, "geo/layer_27/stable_rank_down_proj": 140.47927856445312, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07366736978292465, "geo/layer_27/attn_entropy_mean": 4.42378568649292, "geo/layer_27/attn_entropy_std": 0.5323438048362732, "attnres/final_alpha/block_0": 0.24547605216503143, "attnres/block_norm/0": 1.5705044269561768, "attnres/final_alpha/block_1": 0.007296714000403881, "attnres/block_norm/1": 25952.4765625, "attnres/final_alpha/block_2": 0.015263747423887253, "attnres/block_norm/2": 19113.568359375, "attnres/final_alpha/block_3": 0.017554346472024918, "attnres/block_norm/3": 25185.90625, "attnres/final_alpha/block_4": 0.022957032546401024, "attnres/block_norm/4": 8229.056640625, "attnres/final_alpha/block_5": 0.5472699403762817, "attnres/block_norm/5": 4733.7587890625, "attnres/final_alpha/block_6": 0.14418219029903412, "attnres/block_norm/6": 17004.34375, "geo/tier1_time_s": 1.3621666431427002, "geo/step": 17100.0, "geo/rankme_slope": 2.5092517475740297e-05} {"step": 17110, "timestamp": 1778213032.1892529, "train/loss": 2.2736450672149657, "train/z_loss": 0.0016078339074738323, "train/perplexity": 9.714747261593324, "train/grad_norm": 0.1884765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1790166.210210588, "perf/iters_per_sec": 0.8536177683880749, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1714845180511475, "data/tokens_consumed": 35884367872, "data/tokens_consumed_B": 35.884367872, "train/loss_slope": -6.867526831513858e-06} {"step": 17120, "timestamp": 1778213042.5489113, "train/loss": 2.238962936401367, "train/z_loss": 0.0016205927357077598, "train/perplexity": 9.383594855017128, "train/grad_norm": 0.1982421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025392.4846548093, "perf/iters_per_sec": 0.9657823966287657, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354299306869508, "data/tokens_consumed": 35905339392, "data/tokens_consumed_B": 35.905339392, "train/loss_slope": -6.527819584841635e-06} {"step": 17130, "timestamp": 1778213052.9037867, "train/loss": 2.2651848793029785, "train/z_loss": 0.0016169250942766667, "train/perplexity": 9.632905361321217, "train/grad_norm": 0.1328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026451.0863303267, "perf/iters_per_sec": 0.9662871772433885, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034889030456543, "data/tokens_consumed": 35926310912, "data/tokens_consumed_B": 35.926310912, "train/loss_slope": -7.74923787973966e-06} {"step": 17140, "timestamp": 1778213063.2553825, "train/loss": 2.2751209020614622, "train/z_loss": 0.0016098421765491366, "train/perplexity": 9.729095209123102, "train/grad_norm": 0.150390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026894.5092773456, "perf/iters_per_sec": 0.9664986177813271, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034662628173828, "data/tokens_consumed": 35947282432, "data/tokens_consumed_B": 35.947282432, "train/loss_slope": -7.2816850257070255e-06} {"step": 17150, "timestamp": 1778213073.600565, "grad/layer_0/attn": 0.0028123436495661736, "grad/layer_0/mlp": 0.0028641847893595695, "grad/layer_0/attn_mlp_ratio": 0.9819001769103712, "grad/layer_4/attn": 0.0018338612280786037, "grad/layer_4/mlp": 0.0024979703593999147, "grad/layer_4/attn_mlp_ratio": 0.7341404783942683, "grad/layer_8/attn": 0.004139681346714497, "grad/layer_8/mlp": 0.003796191420406103, "grad/layer_8/attn_mlp_ratio": 1.0904827442087663, "grad/layer_12/attn": 0.005841891746968031, "grad/layer_12/mlp": 0.005710764322429895, "grad/layer_12/attn_mlp_ratio": 1.0229614312268098, "grad/layer_16/attn": 0.005658772774040699, "grad/layer_16/mlp": 0.004584750160574913, "grad/layer_16/attn_mlp_ratio": 1.23425976387449, "grad/layer_20/attn": 0.004601464606821537, "grad/layer_20/mlp": 0.006250207778066397, "grad/layer_20/attn_mlp_ratio": 0.7362098503906201, "grad/layer_24/attn": 0.014069476164877415, "grad/layer_24/mlp": 0.01325121708214283, "grad/layer_24/attn_mlp_ratio": 1.0617497224207646, "grad/layer_27/attn": 0.00742808822542429, "grad/layer_27/mlp": 0.012076635845005512, "grad/layer_27/attn_mlp_ratio": 0.6150792537963601} {"step": 17150, "timestamp": 1778213073.6165895, "train/loss": 2.2751280784606935, "train/z_loss": 0.0016007060534320773, "train/perplexity": 9.72916502924501, "train/grad_norm": 0.1630859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025464.1679255068, "perf/iters_per_sec": 0.9658165778758558, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0353932857513428, "data/tokens_consumed": 35968253952, "data/tokens_consumed_B": 35.968253952, "train/loss_slope": -9.109955688085147e-06} {"step": 17160, "timestamp": 1778213083.9768145, "train/loss": 2.2082401752471923, "train/z_loss": 0.0016157870064489543, "train/perplexity": 9.09968843702565, "train/grad_norm": 0.1005859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025367.8607333472, "perf/iters_per_sec": 0.9657706550280319, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354425191879273, "data/tokens_consumed": 35989225472, "data/tokens_consumed_B": 35.989225472, "train/loss_slope": -1.317829512777253e-05} {"step": 17170, "timestamp": 1778213094.3488686, "train/loss": 2.258955144882202, "train/z_loss": 0.0016213448485359549, "train/perplexity": 9.57308145621843, "train/grad_norm": 0.1328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023399.5592200875, "perf/iters_per_sec": 0.9648320957279622, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0364497661590577, "data/tokens_consumed": 36010196992, "data/tokens_consumed_B": 36.010196992, "train/loss_slope": -1.3910105440876288e-05} {"step": 17175, "timestamp": 1778213100.1338458, "eos/sharpness": 50.48439502716064, "eos/L0_probe": 2.0746231079101562, "eos/L_plus": 2.3933422565460205, "eos/L_minus": 2.2607479095458984, "eos/grad_norm": 0.18251334130764008, "eos/embed_grad_frac": 0.07153137028217316, "eos/time_s": 0.618138313293457} {"step": 17175, "timestamp": 1778213101.5156813, "geo/rankme_last": 440.466064453125, "geo/layer_0/stable_rank_q_proj": 16.39478874206543, "geo/layer_0/stable_rank_k_proj": 14.549391746520996, "geo/layer_0/stable_rank_o_proj": 53.235294342041016, "geo/layer_0/stable_rank_gate_proj": 156.29434204101562, "geo/layer_0/stable_rank_down_proj": 49.40935516357422, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.04879535362124443, "geo/layer_0/attn_entropy_mean": 6.288678169250488, "geo/layer_0/attn_entropy_std": 0.3011628985404968, "geo/layer_7/stable_rank_q_proj": 42.92619705200195, "geo/layer_7/stable_rank_k_proj": 42.29122543334961, "geo/layer_7/stable_rank_o_proj": 110.13533020019531, "geo/layer_7/stable_rank_gate_proj": 108.60252380371094, "geo/layer_7/stable_rank_down_proj": 154.3802490234375, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5677635669708252, "geo/layer_7/attn_entropy_mean": 4.64758825302124, "geo/layer_7/attn_entropy_std": 0.8884382247924805, "geo/layer_14/stable_rank_q_proj": 58.65887451171875, "geo/layer_14/stable_rank_k_proj": 35.45905303955078, "geo/layer_14/stable_rank_o_proj": 52.31947708129883, "geo/layer_14/stable_rank_gate_proj": 92.56282806396484, "geo/layer_14/stable_rank_down_proj": 135.75111389160156, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37822040915489197, "geo/layer_14/attn_entropy_mean": 5.5452375411987305, "geo/layer_14/attn_entropy_std": 0.5161340832710266, "geo/layer_21/stable_rank_q_proj": 49.34251403808594, "geo/layer_21/stable_rank_k_proj": 31.90692901611328, "geo/layer_21/stable_rank_o_proj": 86.99391174316406, "geo/layer_21/stable_rank_gate_proj": 92.22502899169922, "geo/layer_21/stable_rank_down_proj": 63.14907455444336, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15581995248794556, "geo/layer_21/attn_entropy_mean": 5.775697708129883, "geo/layer_21/attn_entropy_std": 0.286100834608078, "geo/layer_27/stable_rank_q_proj": 42.064292907714844, "geo/layer_27/stable_rank_k_proj": 31.947710037231445, "geo/layer_27/stable_rank_o_proj": 116.69056701660156, "geo/layer_27/stable_rank_gate_proj": 92.5487060546875, "geo/layer_27/stable_rank_down_proj": 140.66592407226562, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07803347706794739, "geo/layer_27/attn_entropy_mean": 4.437814712524414, "geo/layer_27/attn_entropy_std": 0.533322811126709, "attnres/final_alpha/block_0": 0.24380770325660706, "attnres/block_norm/0": 1.5718106031417847, "attnres/final_alpha/block_1": 0.007250715047121048, "attnres/block_norm/1": 26150.3984375, "attnres/final_alpha/block_2": 0.015114804729819298, "attnres/block_norm/2": 19137.30859375, "attnres/final_alpha/block_3": 0.017484106123447418, "attnres/block_norm/3": 25118.794921875, "attnres/final_alpha/block_4": 0.02254653349518776, "attnres/block_norm/4": 8288.27734375, "attnres/final_alpha/block_5": 0.5523718595504761, "attnres/block_norm/5": 4704.04736328125, "attnres/final_alpha/block_6": 0.1414242386817932, "attnres/block_norm/6": 17249.48828125, "geo/tier1_time_s": 1.3605742454528809, "geo/step": 17175.0, "geo/rankme_slope": -1.3865663452881153e-05} {"step": 17180, "timestamp": 1778213106.6947277, "train/loss": 2.2741719484329224, "train/z_loss": 0.0016015303088352084, "train/perplexity": 9.71986712812497, "train/grad_norm": 0.142578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1699415.4277947156, "perf/iters_per_sec": 0.8103444231961802, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2340431690216065, "data/tokens_consumed": 36031168512, "data/tokens_consumed_B": 36.031168512, "train/loss_slope": -1.2523250260798032e-05} {"step": 17190, "timestamp": 1778213117.0538993, "train/loss": 2.249134159088135, "train/z_loss": 0.0016148656723089516, "train/perplexity": 9.479524521869438, "train/grad_norm": 0.185546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025386.7483468843, "perf/iters_per_sec": 0.9657796613439962, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354328632354737, "data/tokens_consumed": 36052140032, "data/tokens_consumed_B": 36.052140032, "train/loss_slope": -1.5249985050518455e-05} {"step": 17200, "timestamp": 1778213127.396896, "grad/layer_0/attn": 0.0029230902437120676, "grad/layer_0/mlp": 0.002553079277276993, "grad/layer_0/attn_mlp_ratio": 1.1449272865263251, "grad/layer_4/attn": 0.0015923273749649525, "grad/layer_4/mlp": 0.002413342706859112, "grad/layer_4/attn_mlp_ratio": 0.6598015708499015, "grad/layer_8/attn": 0.005097540095448494, "grad/layer_8/mlp": 0.003764913184568286, "grad/layer_8/attn_mlp_ratio": 1.353959496581876, "grad/layer_12/attn": 0.0049370708875358105, "grad/layer_12/mlp": 0.006257602944970131, "grad/layer_12/attn_mlp_ratio": 0.7889715681956263, "grad/layer_16/attn": 0.004285829607397318, "grad/layer_16/mlp": 0.004623278975486755, "grad/layer_16/attn_mlp_ratio": 0.9270107941615207, "grad/layer_20/attn": 0.006472716107964516, "grad/layer_20/mlp": 0.006217869929969311, "grad/layer_20/attn_mlp_ratio": 1.0409860734892946, "grad/layer_24/attn": 0.009610104374587536, "grad/layer_24/mlp": 0.010933129116892815, "grad/layer_24/attn_mlp_ratio": 0.8789893711069409, "grad/layer_27/attn": 0.005506386514753103, "grad/layer_27/mlp": 0.009084164164960384, "grad/layer_27/attn_mlp_ratio": 0.606152239671892} {"step": 17200, "timestamp": 1778213127.4125714, "train/loss": 2.217060422897339, "train/z_loss": 0.0016230504377745091, "train/perplexity": 9.180304948737055, "train/grad_norm": 0.1474609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025637.7305460982, "perf/iters_per_sec": 0.9658993389826289, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035304570198059, "data/tokens_consumed": 36073111552, "data/tokens_consumed_B": 36.073111552, "train/loss_slope": -1.5584250688195187e-05} {"step": 17210, "timestamp": 1778213137.763998, "train/loss": 2.256847620010376, "train/z_loss": 0.001604196964763105, "train/perplexity": 9.552927194213524, "train/grad_norm": 0.275390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027509.5333448953, "perf/iters_per_sec": 0.966791884109924, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343487739562989, "data/tokens_consumed": 36094083072, "data/tokens_consumed_B": 36.094083072, "train/loss_slope": -1.297010489136759e-05} {"step": 17220, "timestamp": 1778213148.1137142, "train/loss": 2.1974721908569337, "train/z_loss": 0.001625540805980563, "train/perplexity": 9.002228797615253, "train/grad_norm": 0.1474609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027569.9158940807, "perf/iters_per_sec": 0.9668206767530826, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034317970275879, "data/tokens_consumed": 36115054592, "data/tokens_consumed_B": 36.115054592, "train/loss_slope": -1.4965060069830686e-05} {"step": 17230, "timestamp": 1778213158.4638894, "train/loss": 2.2501965761184692, "train/z_loss": 0.0016211264533922075, "train/perplexity": 9.489601081967663, "train/grad_norm": 0.1689453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027071.8206830118, "perf/iters_per_sec": 0.9665831664481219, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345721244812012, "data/tokens_consumed": 36136026112, "data/tokens_consumed_B": 36.136026112, "train/loss_slope": -1.6287555381266734e-05} {"step": 17240, "timestamp": 1778213168.825659, "train/loss": 2.203306794166565, "train/z_loss": 0.0016349952784366906, "train/perplexity": 9.054906759616467, "train/grad_norm": 0.107421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025089.76391907, "perf/iters_per_sec": 0.9656380481334066, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0355847120285033, "data/tokens_consumed": 36156997632, "data/tokens_consumed_B": 36.156997632, "train/loss_slope": -1.8032656816115727e-05} {"step": 17250, "timestamp": 1778213179.1619885, "grad/layer_0/attn": 0.002988804830238223, "grad/layer_0/mlp": 0.0026927641592919827, "grad/layer_0/attn_mlp_ratio": 1.109939282625535, "grad/layer_4/attn": 0.001503417850472033, "grad/layer_4/mlp": 0.002594240242615342, "grad/layer_4/attn_mlp_ratio": 0.5795214212713947, "grad/layer_8/attn": 0.01075233705341816, "grad/layer_8/mlp": 0.00390898622572422, "grad/layer_8/attn_mlp_ratio": 2.7506714420204754, "grad/layer_12/attn": 0.005320283118635416, "grad/layer_12/mlp": 0.006260269321501255, "grad/layer_12/attn_mlp_ratio": 0.8498489059213008, "grad/layer_16/attn": 0.007378303445875645, "grad/layer_16/mlp": 0.004785177297890186, "grad/layer_16/attn_mlp_ratio": 1.5419080281388908, "grad/layer_20/attn": 0.004520465154200792, "grad/layer_20/mlp": 0.0070961518213152885, "grad/layer_20/attn_mlp_ratio": 0.637030492628308, "grad/layer_24/attn": 0.021621473133563995, "grad/layer_24/mlp": 0.012854893691837788, "grad/layer_24/attn_mlp_ratio": 1.6819643540962235, "grad/layer_27/attn": 0.004968990571796894, "grad/layer_27/mlp": 0.012706059031188488, "grad/layer_27/attn_mlp_ratio": 0.391072520637019} {"step": 17250, "timestamp": 1778213179.769237, "eos/sharpness": 46.49112224578857, "eos/L0_probe": 2.0746705532073975, "eos/L_plus": 2.3512823581695557, "eos/L_minus": 2.262969970703125, "eos/grad_norm": 0.21631017327308655, "eos/embed_grad_frac": 0.05419504642486572, "eos/time_s": 0.6044931411743164} {"step": 17250, "timestamp": 1778213179.788497, "train/loss": 2.2758481025695803, "train/z_loss": 0.0016138173639774322, "train/perplexity": 9.736172785199267, "train/grad_norm": 0.2158203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1913793.9005466048, "perf/iters_per_sec": 0.912568044923117, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0958086967468261, "data/tokens_consumed": 36177969152, "data/tokens_consumed_B": 36.177969152, "train/loss_slope": -1.7357599009202835e-05} {"step": 17250, "timestamp": 1778213181.1520472, "geo/rankme_last": 441.1222839355469, "geo/layer_0/stable_rank_q_proj": 16.445674896240234, "geo/layer_0/stable_rank_k_proj": 14.604930877685547, "geo/layer_0/stable_rank_o_proj": 53.24477005004883, "geo/layer_0/stable_rank_gate_proj": 156.48243713378906, "geo/layer_0/stable_rank_down_proj": 49.40005111694336, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.045523252338171005, "geo/layer_0/attn_entropy_mean": 6.285550117492676, "geo/layer_0/attn_entropy_std": 0.3088296055793762, "geo/layer_7/stable_rank_q_proj": 42.9233512878418, "geo/layer_7/stable_rank_k_proj": 42.299659729003906, "geo/layer_7/stable_rank_o_proj": 110.73783874511719, "geo/layer_7/stable_rank_gate_proj": 108.80956268310547, "geo/layer_7/stable_rank_down_proj": 153.90745544433594, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.566364586353302, "geo/layer_7/attn_entropy_mean": 4.674355983734131, "geo/layer_7/attn_entropy_std": 0.8784031867980957, "geo/layer_14/stable_rank_q_proj": 58.53586959838867, "geo/layer_14/stable_rank_k_proj": 35.48596954345703, "geo/layer_14/stable_rank_o_proj": 52.39048385620117, "geo/layer_14/stable_rank_gate_proj": 92.65538024902344, "geo/layer_14/stable_rank_down_proj": 135.79739379882812, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38955801725387573, "geo/layer_14/attn_entropy_mean": 5.528132438659668, "geo/layer_14/attn_entropy_std": 0.5181632041931152, "geo/layer_21/stable_rank_q_proj": 49.323768615722656, "geo/layer_21/stable_rank_k_proj": 32.011192321777344, "geo/layer_21/stable_rank_o_proj": 86.98138427734375, "geo/layer_21/stable_rank_gate_proj": 92.15390014648438, "geo/layer_21/stable_rank_down_proj": 63.041168212890625, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1557552069425583, "geo/layer_21/attn_entropy_mean": 5.785468578338623, "geo/layer_21/attn_entropy_std": 0.28676360845565796, "geo/layer_27/stable_rank_q_proj": 42.22038650512695, "geo/layer_27/stable_rank_k_proj": 31.95870590209961, "geo/layer_27/stable_rank_o_proj": 116.86421966552734, "geo/layer_27/stable_rank_gate_proj": 92.50723266601562, "geo/layer_27/stable_rank_down_proj": 141.13743591308594, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07810065895318985, "geo/layer_27/attn_entropy_mean": 4.431276321411133, "geo/layer_27/attn_entropy_std": 0.5366756916046143, "attnres/final_alpha/block_0": 0.24350149929523468, "attnres/block_norm/0": 1.572889804840088, "attnres/final_alpha/block_1": 0.007209259085357189, "attnres/block_norm/1": 26094.53515625, "attnres/final_alpha/block_2": 0.015152128413319588, "attnres/block_norm/2": 19073.0234375, "attnres/final_alpha/block_3": 0.01742636039853096, "attnres/block_norm/3": 25391.11328125, "attnres/final_alpha/block_4": 0.022640904411673546, "attnres/block_norm/4": 8304.4951171875, "attnres/final_alpha/block_5": 0.5543273687362671, "attnres/block_norm/5": 4720.6591796875, "attnres/final_alpha/block_6": 0.1397424340248108, "attnres/block_norm/6": 17173.64453125, "geo/tier1_time_s": 1.3590948581695557, "geo/step": 17250.0, "geo/rankme_slope": 2.15824024922469e-05} {"step": 17260, "timestamp": 1778213191.4988358, "train/loss": 2.2582313776016236, "train/z_loss": 0.0016083285445347428, "train/perplexity": 9.566155279858373, "train/grad_norm": 0.1826171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1791472.5631033224, "perf/iters_per_sec": 0.8542406859890568, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1706302642822266, "data/tokens_consumed": 36198940672, "data/tokens_consumed_B": 36.198940672, "train/loss_slope": -1.5379171581766173e-05} {"step": 17270, "timestamp": 1778213201.8464649, "train/loss": 2.198260688781738, "train/z_loss": 0.001627096754964441, "train/perplexity": 9.009329835549716, "train/grad_norm": 0.15625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027911.4814951692, "perf/iters_per_sec": 0.9669835479236456, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341437578201294, "data/tokens_consumed": 36219912192, "data/tokens_consumed_B": 36.219912192, "train/loss_slope": -1.7940813873467282e-05} {"step": 17280, "timestamp": 1778213212.1979077, "train/loss": 2.2013266324996947, "train/z_loss": 0.0016327284160070122, "train/perplexity": 9.036994320969123, "train/grad_norm": 0.1787109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027104.474400483, "perf/iters_per_sec": 0.9665987369539657, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345554590225219, "data/tokens_consumed": 36240883712, "data/tokens_consumed_B": 36.240883712, "train/loss_slope": -2.0780227415346372e-05} {"step": 17290, "timestamp": 1778213222.5559068, "train/loss": 2.228267765045166, "train/z_loss": 0.0016207603621296585, "train/perplexity": 9.28377047109792, "train/grad_norm": 0.1416015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025641.648992117, "perf/iters_per_sec": 0.9659012074432931, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0353025674819947, "data/tokens_consumed": 36261855232, "data/tokens_consumed_B": 36.261855232, "train/loss_slope": -2.244673265267735e-05} {"step": 17300, "timestamp": 1778213232.8988822, "grad/layer_0/attn": 0.003049321938306093, "grad/layer_0/mlp": 0.0028413187246769667, "grad/layer_0/attn_mlp_ratio": 1.0732065376903892, "grad/layer_4/attn": 0.003423272632062435, "grad/layer_4/mlp": 0.002521504182368517, "grad/layer_4/attn_mlp_ratio": 1.3576310998158856, "grad/layer_8/attn": 0.00856338907033205, "grad/layer_8/mlp": 0.003835967043414712, "grad/layer_8/attn_mlp_ratio": 2.2323937484795735, "grad/layer_12/attn": 0.005826225969940424, "grad/layer_12/mlp": 0.006527998484671116, "grad/layer_12/attn_mlp_ratio": 0.8924980442890144, "grad/layer_16/attn": 0.0050208792090415955, "grad/layer_16/mlp": 0.0048951623030006886, "grad/layer_16/attn_mlp_ratio": 1.0256818458083932, "grad/layer_20/attn": 0.008257771842181683, "grad/layer_20/mlp": 0.007771477103233337, "grad/layer_20/attn_mlp_ratio": 1.062574286230425, "grad/layer_24/attn": 0.021576305851340294, "grad/layer_24/mlp": 0.01608586683869362, "grad/layer_24/attn_mlp_ratio": 1.3413206719645145, "grad/layer_27/attn": 0.009910213761031628, "grad/layer_27/mlp": 0.016141705214977264, "grad/layer_27/attn_mlp_ratio": 0.6139508538689727} {"step": 17300, "timestamp": 1778213232.9149945, "train/loss": 2.252952527999878, "train/z_loss": 0.0016289640800096095, "train/perplexity": 9.515790037097398, "train/grad_norm": 0.251953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025776.6580459988, "perf/iters_per_sec": 0.9659655847768778, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352335691452026, "data/tokens_consumed": 36282826752, "data/tokens_consumed_B": 36.282826752, "train/loss_slope": -2.264111270927431e-05} {"step": 17310, "timestamp": 1778213243.2682197, "train/loss": 2.2642447471618654, "train/z_loss": 0.0016001087264157833, "train/perplexity": 9.623853413059202, "train/grad_norm": 0.1611328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026435.1199995836, "perf/iters_per_sec": 0.96627956390361, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348971843719483, "data/tokens_consumed": 36303798272, "data/tokens_consumed_B": 36.303798272, "train/loss_slope": -1.7321696554688114e-05} {"step": 17320, "timestamp": 1778213253.629397, "train/loss": 2.252885365486145, "train/z_loss": 0.0016042542876675725, "train/perplexity": 9.515150954179797, "train/grad_norm": 0.19921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025071.3946859476, "perf/iters_per_sec": 0.9656292890004862, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03559410572052, "data/tokens_consumed": 36324769792, "data/tokens_consumed_B": 36.324769792, "train/loss_slope": -1.7527399798466777e-05} {"step": 17325, "timestamp": 1778213259.42042, "eos/sharpness": 59.115266799926744, "eos/L0_probe": 2.068460702896118, "eos/L_plus": 2.479856491088867, "eos/L_minus": 2.2482175827026367, "eos/grad_norm": 0.19209350645542145, "eos/embed_grad_frac": 0.0781639963388443, "eos/time_s": 0.6230669021606445} {"step": 17325, "timestamp": 1778213260.7987654, "geo/rankme_last": 441.3810729980469, "geo/layer_0/stable_rank_q_proj": 16.467071533203125, "geo/layer_0/stable_rank_k_proj": 14.564533233642578, "geo/layer_0/stable_rank_o_proj": 53.28232192993164, "geo/layer_0/stable_rank_gate_proj": 156.31356811523438, "geo/layer_0/stable_rank_down_proj": 49.424293518066406, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.04264969378709793, "geo/layer_0/attn_entropy_mean": 6.284954071044922, "geo/layer_0/attn_entropy_std": 0.3034736216068268, "geo/layer_7/stable_rank_q_proj": 42.80687713623047, "geo/layer_7/stable_rank_k_proj": 42.44598388671875, "geo/layer_7/stable_rank_o_proj": 110.91507720947266, "geo/layer_7/stable_rank_gate_proj": 108.51820373535156, "geo/layer_7/stable_rank_down_proj": 154.32571411132812, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5686381459236145, "geo/layer_7/attn_entropy_mean": 4.630713939666748, "geo/layer_7/attn_entropy_std": 0.8791818618774414, "geo/layer_14/stable_rank_q_proj": 58.24980163574219, "geo/layer_14/stable_rank_k_proj": 35.42604446411133, "geo/layer_14/stable_rank_o_proj": 52.42734146118164, "geo/layer_14/stable_rank_gate_proj": 92.90572357177734, "geo/layer_14/stable_rank_down_proj": 135.89309692382812, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3814255893230438, "geo/layer_14/attn_entropy_mean": 5.512784004211426, "geo/layer_14/attn_entropy_std": 0.5254861116409302, "geo/layer_21/stable_rank_q_proj": 49.28485107421875, "geo/layer_21/stable_rank_k_proj": 31.913108825683594, "geo/layer_21/stable_rank_o_proj": 86.92284393310547, "geo/layer_21/stable_rank_gate_proj": 92.2839584350586, "geo/layer_21/stable_rank_down_proj": 63.06291580200195, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15215331315994263, "geo/layer_21/attn_entropy_mean": 5.749197959899902, "geo/layer_21/attn_entropy_std": 0.29606926441192627, "geo/layer_27/stable_rank_q_proj": 42.143436431884766, "geo/layer_27/stable_rank_k_proj": 32.005699157714844, "geo/layer_27/stable_rank_o_proj": 117.06802368164062, "geo/layer_27/stable_rank_gate_proj": 92.6351089477539, "geo/layer_27/stable_rank_down_proj": 141.05899047851562, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07924553006887436, "geo/layer_27/attn_entropy_mean": 4.415424346923828, "geo/layer_27/attn_entropy_std": 0.530680775642395, "attnres/final_alpha/block_0": 0.24151001870632172, "attnres/block_norm/0": 1.5737345218658447, "attnres/final_alpha/block_1": 0.00714211817830801, "attnres/block_norm/1": 26189.1875, "attnres/final_alpha/block_2": 0.014868713915348053, "attnres/block_norm/2": 19201.234375, "attnres/final_alpha/block_3": 0.01699705235660076, "attnres/block_norm/3": 25572.4765625, "attnres/final_alpha/block_4": 0.022352103143930435, "attnres/block_norm/4": 8330.8974609375, "attnres/final_alpha/block_5": 0.557414174079895, "attnres/block_norm/5": 4685.66259765625, "attnres/final_alpha/block_6": 0.1397157907485962, "attnres/block_norm/6": 17370.75, "geo/tier1_time_s": 1.358450174331665, "geo/step": 17325.0, "geo/rankme_slope": 3.0531997955432175e-05} {"step": 17330, "timestamp": 1778213265.9811976, "train/loss": 2.2124993801116943, "train/z_loss": 0.001621345372404903, "train/perplexity": 9.138528529521603, "train/grad_norm": 0.12109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1698570.8929103124, "perf/iters_per_sec": 0.8099417175818979, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2346567392349244, "data/tokens_consumed": 36345741312, "data/tokens_consumed_B": 36.345741312, "train/loss_slope": -1.6640320963019766e-05} {"step": 17340, "timestamp": 1778213276.3333583, "train/loss": 2.2167855978012083, "train/z_loss": 0.001631923601962626, "train/perplexity": 9.177782317204112, "train/grad_norm": 0.11474609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026719.236896204, "perf/iters_per_sec": 0.966415041397192, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034752106666565, "data/tokens_consumed": 36366712832, "data/tokens_consumed_B": 36.366712832, "train/loss_slope": -1.601615740378054e-05} {"step": 17350, "timestamp": 1778213286.689727, "grad/layer_0/attn": 0.0028495751321315765, "grad/layer_0/mlp": 0.0025958186015486717, "grad/layer_0/attn_mlp_ratio": 1.0977558372745806, "grad/layer_4/attn": 0.0017921545077115297, "grad/layer_4/mlp": 0.0024184335488826036, "grad/layer_4/attn_mlp_ratio": 0.7410393535251895, "grad/layer_8/attn": 0.009173480793833733, "grad/layer_8/mlp": 0.003810763591900468, "grad/layer_8/attn_mlp_ratio": 2.4072552211335942, "grad/layer_12/attn": 0.00489960890263319, "grad/layer_12/mlp": 0.006425895728170872, "grad/layer_12/attn_mlp_ratio": 0.7624787319385882, "grad/layer_16/attn": 0.0049595823511481285, "grad/layer_16/mlp": 0.004663233179599047, "grad/layer_16/attn_mlp_ratio": 1.0635501279435378, "grad/layer_20/attn": 0.004504713695496321, "grad/layer_20/mlp": 0.006078323815017939, "grad/layer_20/attn_mlp_ratio": 0.7411111613131309, "grad/layer_24/attn": 0.016522562131285667, "grad/layer_24/mlp": 0.014040719717741013, "grad/layer_24/attn_mlp_ratio": 1.1767603332137393, "grad/layer_27/attn": 0.006260009948164225, "grad/layer_27/mlp": 0.013286913745105267, "grad/layer_27/attn_mlp_ratio": 0.471141005438997} {"step": 17350, "timestamp": 1778213286.7057524, "train/loss": 2.216458058357239, "train/z_loss": 0.0016240219585597516, "train/perplexity": 9.17477672373914, "train/grad_norm": 0.181640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022895.0436302293, "perf/iters_per_sec": 0.9645915239478251, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0367082595825194, "data/tokens_consumed": 36387684352, "data/tokens_consumed_B": 36.387684352, "train/loss_slope": -1.2684859880412473e-05} {"step": 17360, "timestamp": 1778213297.0747862, "train/loss": 2.2004119396209716, "train/z_loss": 0.0016299412469379603, "train/perplexity": 9.028732025925954, "train/grad_norm": 0.1162109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023939.9053148523, "perf/iters_per_sec": 0.965089752824236, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0361730575561523, "data/tokens_consumed": 36408655872, "data/tokens_consumed_B": 36.408655872, "train/loss_slope": -1.512218367184313e-05} {"step": 17370, "timestamp": 1778213307.4265928, "train/loss": 2.238158941268921, "train/z_loss": 0.0016244924743659794, "train/perplexity": 9.376053522432425, "train/grad_norm": 0.2109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026892.5943340196, "perf/iters_per_sec": 0.9664977046651934, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346636056900025, "data/tokens_consumed": 36429627392, "data/tokens_consumed_B": 36.429627392, "train/loss_slope": -1.603534043532679e-05} {"step": 17380, "timestamp": 1778213317.785725, "train/loss": 2.1877864599227905, "train/z_loss": 0.0016423727152869104, "train/perplexity": 8.915456536426726, "train/grad_norm": 0.1142578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025493.5049626944, "perf/iters_per_sec": 0.9658305668652984, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0353782892227172, "data/tokens_consumed": 36450598912, "data/tokens_consumed_B": 36.450598912, "train/loss_slope": -1.9678560878434338e-05} {"step": 17390, "timestamp": 1778213328.1394112, "train/loss": 2.2642194986343385, "train/z_loss": 0.0016183809377253056, "train/perplexity": 9.623610427998909, "train/grad_norm": 0.197265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026652.741266068, "perf/iters_per_sec": 0.9663833338098851, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034786057472229, "data/tokens_consumed": 36471570432, "data/tokens_consumed_B": 36.471570432, "train/loss_slope": -1.7435618574255184e-05} {"step": 17400, "timestamp": 1778213338.4849567, "grad/layer_0/attn": 0.0027318757493048906, "grad/layer_0/mlp": 0.002756420522928238, "grad/layer_0/attn_mlp_ratio": 0.9910953816630234, "grad/layer_4/attn": 0.001569222891703248, "grad/layer_4/mlp": 0.0026293944101780653, "grad/layer_4/attn_mlp_ratio": 0.5968000943293134, "grad/layer_8/attn": 0.006226120516657829, "grad/layer_8/mlp": 0.0041336300782859325, "grad/layer_8/attn_mlp_ratio": 1.5062113077661867, "grad/layer_12/attn": 0.005391461309045553, "grad/layer_12/mlp": 0.0062340362928807735, "grad/layer_12/attn_mlp_ratio": 0.8648427710820817, "grad/layer_16/attn": 0.006668253801763058, "grad/layer_16/mlp": 0.005121632944792509, "grad/layer_16/attn_mlp_ratio": 1.3019780494705877, "grad/layer_20/attn": 0.0037932281848043203, "grad/layer_20/mlp": 0.006388742011040449, "grad/layer_20/attn_mlp_ratio": 0.5937363128571437, "grad/layer_24/attn": 0.01462414301931858, "grad/layer_24/mlp": 0.010726353153586388, "grad/layer_24/attn_mlp_ratio": 1.3633844302516287, "grad/layer_27/attn": 0.006485039833933115, "grad/layer_27/mlp": 0.009904583916068077, "grad/layer_27/attn_mlp_ratio": 0.6547513578977694} {"step": 17400, "timestamp": 1778213339.1001828, "eos/sharpness": 38.94968032836913, "eos/L0_probe": 2.0672786235809326, "eos/L_plus": 2.3003547191619873, "eos/L_minus": 2.2236993312835693, "eos/grad_norm": 0.1686965674161911, "eos/embed_grad_frac": 0.08791888505220413, "eos/time_s": 0.6124269962310791} {"step": 17400, "timestamp": 1778213339.1199882, "train/loss": 2.210760569572449, "train/z_loss": 0.0016228980850428343, "train/perplexity": 9.122652166794822, "train/grad_norm": 0.1689453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1910643.1914198345, "perf/iters_per_sec": 0.9110656697367833, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0976157188415527, "data/tokens_consumed": 36492541952, "data/tokens_consumed_B": 36.492541952, "train/loss_slope": -1.4267433651781924e-05} {"step": 17400, "timestamp": 1778213340.4792376, "geo/rankme_last": 440.87957763671875, "geo/layer_0/stable_rank_q_proj": 16.451704025268555, "geo/layer_0/stable_rank_k_proj": 14.557047843933105, "geo/layer_0/stable_rank_o_proj": 53.19318389892578, "geo/layer_0/stable_rank_gate_proj": 156.83900451660156, "geo/layer_0/stable_rank_down_proj": 49.4511833190918, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.046550899744033813, "geo/layer_0/attn_entropy_mean": 6.284356594085693, "geo/layer_0/attn_entropy_std": 0.3050265312194824, "geo/layer_7/stable_rank_q_proj": 42.944175720214844, "geo/layer_7/stable_rank_k_proj": 42.39833450317383, "geo/layer_7/stable_rank_o_proj": 110.81098937988281, "geo/layer_7/stable_rank_gate_proj": 107.89295196533203, "geo/layer_7/stable_rank_down_proj": 154.43649291992188, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5735142827033997, "geo/layer_7/attn_entropy_mean": 4.638092994689941, "geo/layer_7/attn_entropy_std": 0.8576143383979797, "geo/layer_14/stable_rank_q_proj": 58.216461181640625, "geo/layer_14/stable_rank_k_proj": 35.49650573730469, "geo/layer_14/stable_rank_o_proj": 52.37115478515625, "geo/layer_14/stable_rank_gate_proj": 92.79332733154297, "geo/layer_14/stable_rank_down_proj": 136.3530731201172, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39216241240501404, "geo/layer_14/attn_entropy_mean": 5.504453659057617, "geo/layer_14/attn_entropy_std": 0.5299627184867859, "geo/layer_21/stable_rank_q_proj": 49.19972229003906, "geo/layer_21/stable_rank_k_proj": 32.00570297241211, "geo/layer_21/stable_rank_o_proj": 86.60533905029297, "geo/layer_21/stable_rank_gate_proj": 91.69149780273438, "geo/layer_21/stable_rank_down_proj": 63.0499153137207, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15118685364723206, "geo/layer_21/attn_entropy_mean": 5.753201484680176, "geo/layer_21/attn_entropy_std": 0.2976279556751251, "geo/layer_27/stable_rank_q_proj": 42.18389892578125, "geo/layer_27/stable_rank_k_proj": 31.987905502319336, "geo/layer_27/stable_rank_o_proj": 116.7143325805664, "geo/layer_27/stable_rank_gate_proj": 92.63990020751953, "geo/layer_27/stable_rank_down_proj": 140.55918884277344, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07417359948158264, "geo/layer_27/attn_entropy_mean": 4.442303657531738, "geo/layer_27/attn_entropy_std": 0.5341803431510925, "attnres/final_alpha/block_0": 0.24364802241325378, "attnres/block_norm/0": 1.5749913454055786, "attnres/final_alpha/block_1": 0.007216856814920902, "attnres/block_norm/1": 26252.50390625, "attnres/final_alpha/block_2": 0.014996020123362541, "attnres/block_norm/2": 19129.634765625, "attnres/final_alpha/block_3": 0.017322354018688202, "attnres/block_norm/3": 25511.37109375, "attnres/final_alpha/block_4": 0.0225822851061821, "attnres/block_norm/4": 8323.40234375, "attnres/final_alpha/block_5": 0.5530943870544434, "attnres/block_norm/5": 4720.0185546875, "attnres/final_alpha/block_6": 0.14114007353782654, "attnres/block_norm/6": 17412.16015625, "geo/tier1_time_s": 1.3552608489990234, "geo/step": 17400.0, "geo/rankme_slope": 1.1285353985344139e-05} {"step": 17410, "timestamp": 1778213350.8327286, "train/loss": 2.2066895008087157, "train/z_loss": 0.0016257231240160764, "train/perplexity": 9.08558871763082, "train/grad_norm": 0.18359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1791089.647671998, "perf/iters_per_sec": 0.8540580976829519, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1708805322647096, "data/tokens_consumed": 36513513472, "data/tokens_consumed_B": 36.513513472, "train/loss_slope": -1.5567403636535232e-05} {"step": 17420, "timestamp": 1778213361.1792328, "train/loss": 2.3093581438064574, "train/z_loss": 0.0015969375846907497, "train/perplexity": 10.067960397936037, "train/grad_norm": 0.1767578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027938.785532502, "perf/iters_per_sec": 0.9669965675032148, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03412983417511, "data/tokens_consumed": 36534484992, "data/tokens_consumed_B": 36.534484992, "train/loss_slope": -1.4097494210633228e-05} {"step": 17430, "timestamp": 1778213371.5320673, "train/loss": 2.280260705947876, "train/z_loss": 0.0016124692861922084, "train/perplexity": 9.779229580539734, "train/grad_norm": 0.1279296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026533.816494189, "perf/iters_per_sec": 0.9663266260596223, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348467826843262, "data/tokens_consumed": 36555456512, "data/tokens_consumed_B": 36.555456512, "train/loss_slope": -1.0128466159489789e-05} {"step": 17440, "timestamp": 1778213381.8845682, "train/loss": 2.248309373855591, "train/z_loss": 0.0016135891200974584, "train/perplexity": 9.471709173467282, "train/grad_norm": 0.2275390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026795.5908417644, "perf/iters_per_sec": 0.9664514497956106, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034713125228882, "data/tokens_consumed": 36576428032, "data/tokens_consumed_B": 36.576428032, "train/loss_slope": -9.618568205811971e-06} {"step": 17450, "timestamp": 1778213392.2226815, "grad/layer_0/attn": 0.0027789415325969458, "grad/layer_0/mlp": 0.0026229177601635456, "grad/layer_0/attn_mlp_ratio": 1.0594847725897416, "grad/layer_4/attn": 0.0018065886106342077, "grad/layer_4/mlp": 0.0025206715799868107, "grad/layer_4/attn_mlp_ratio": 0.7167092108733735, "grad/layer_8/attn": 0.004065991844981909, "grad/layer_8/mlp": 0.003958500921726227, "grad/layer_8/attn_mlp_ratio": 1.027154426048072, "grad/layer_12/attn": 0.00379524496383965, "grad/layer_12/mlp": 0.005995149724185467, "grad/layer_12/attn_mlp_ratio": 0.633052563345286, "grad/layer_16/attn": 0.004163537640124559, "grad/layer_16/mlp": 0.004624540917575359, "grad/layer_16/attn_mlp_ratio": 0.900313701251913, "grad/layer_20/attn": 0.007100206799805164, "grad/layer_20/mlp": 0.007323958911001682, "grad/layer_20/attn_mlp_ratio": 0.9694492813435455, "grad/layer_24/attn": 0.011302337981760502, "grad/layer_24/mlp": 0.01575801707804203, "grad/layer_24/attn_mlp_ratio": 0.71724366422888, "grad/layer_27/attn": 0.021869953721761703, "grad/layer_27/mlp": 0.010317735373973846, "grad/layer_27/attn_mlp_ratio": 2.119646677987428} {"step": 17450, "timestamp": 1778213392.2384715, "train/loss": 2.2225363731384276, "train/z_loss": 0.0016314869397319853, "train/perplexity": 9.23071373387069, "train/grad_norm": 0.1845703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026597.1292407871, "perf/iters_per_sec": 0.9663568159297882, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034814453125, "data/tokens_consumed": 36597399552, "data/tokens_consumed_B": 36.597399552, "train/loss_slope": -1.176918390596233e-05} {"step": 17460, "timestamp": 1778213402.5897515, "train/loss": 2.248569369316101, "train/z_loss": 0.0016098356805741786, "train/perplexity": 9.474172095015994, "train/grad_norm": 0.181640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027148.8085275413, "perf/iters_per_sec": 0.9666198771131236, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345328330993653, "data/tokens_consumed": 36618371072, "data/tokens_consumed_B": 36.618371072, "train/loss_slope": -1.2596185165162641e-05} {"step": 17470, "timestamp": 1778213412.9426665, "train/loss": 2.2616154670715334, "train/z_loss": 0.0016038761124946177, "train/perplexity": 9.598582843149332, "train/grad_norm": 0.134765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026676.4625211095, "perf/iters_per_sec": 0.966394644985728, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347739458084106, "data/tokens_consumed": 36639342592, "data/tokens_consumed_B": 36.639342592, "train/loss_slope": -1.1972931874181055e-05} {"step": 17475, "timestamp": 1778213418.7151449, "eos/sharpness": 21.59042358398437, "eos/L0_probe": 2.0688588619232178, "eos/L_plus": 2.1685359477996826, "eos/L_minus": 2.1850860118865967, "eos/grad_norm": 0.15516887605190277, "eos/embed_grad_frac": 0.17360639572143555, "eos/time_s": 0.6100144386291504} {"step": 17475, "timestamp": 1778213420.0985308, "geo/rankme_last": 440.1398010253906, "geo/layer_0/stable_rank_q_proj": 16.462013244628906, "geo/layer_0/stable_rank_k_proj": 14.610427856445312, "geo/layer_0/stable_rank_o_proj": 53.056671142578125, "geo/layer_0/stable_rank_gate_proj": 156.64984130859375, "geo/layer_0/stable_rank_down_proj": 49.451873779296875, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.04433673992753029, "geo/layer_0/attn_entropy_mean": 6.283159255981445, "geo/layer_0/attn_entropy_std": 0.3014768958091736, "geo/layer_7/stable_rank_q_proj": 42.86663818359375, "geo/layer_7/stable_rank_k_proj": 42.3477783203125, "geo/layer_7/stable_rank_o_proj": 110.7245101928711, "geo/layer_7/stable_rank_gate_proj": 107.60273742675781, "geo/layer_7/stable_rank_down_proj": 154.42196655273438, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5704687237739563, "geo/layer_7/attn_entropy_mean": 4.683849334716797, "geo/layer_7/attn_entropy_std": 0.8806462287902832, "geo/layer_14/stable_rank_q_proj": 58.55251693725586, "geo/layer_14/stable_rank_k_proj": 35.57674026489258, "geo/layer_14/stable_rank_o_proj": 52.419891357421875, "geo/layer_14/stable_rank_gate_proj": 92.68427276611328, "geo/layer_14/stable_rank_down_proj": 136.47891235351562, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3710966110229492, "geo/layer_14/attn_entropy_mean": 5.500065326690674, "geo/layer_14/attn_entropy_std": 0.5179086923599243, "geo/layer_21/stable_rank_q_proj": 49.353668212890625, "geo/layer_21/stable_rank_k_proj": 31.85527992248535, "geo/layer_21/stable_rank_o_proj": 86.53717803955078, "geo/layer_21/stable_rank_gate_proj": 91.6614761352539, "geo/layer_21/stable_rank_down_proj": 63.04588317871094, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15504084527492523, "geo/layer_21/attn_entropy_mean": 5.792259693145752, "geo/layer_21/attn_entropy_std": 0.27930933237075806, "geo/layer_27/stable_rank_q_proj": 41.96294021606445, "geo/layer_27/stable_rank_k_proj": 32.0147590637207, "geo/layer_27/stable_rank_o_proj": 116.59957122802734, "geo/layer_27/stable_rank_gate_proj": 92.63724517822266, "geo/layer_27/stable_rank_down_proj": 140.2958984375, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07077521085739136, "geo/layer_27/attn_entropy_mean": 4.466240882873535, "geo/layer_27/attn_entropy_std": 0.5188825130462646, "attnres/final_alpha/block_0": 0.24427562952041626, "attnres/block_norm/0": 1.5761059522628784, "attnres/final_alpha/block_1": 0.0072785066440701485, "attnres/block_norm/1": 26279.908203125, "attnres/final_alpha/block_2": 0.014987042173743248, "attnres/block_norm/2": 19323.96875, "attnres/final_alpha/block_3": 0.017346128821372986, "attnres/block_norm/3": 25619.8828125, "attnres/final_alpha/block_4": 0.02245623990893364, "attnres/block_norm/4": 8378.7626953125, "attnres/final_alpha/block_5": 0.5528349876403809, "attnres/block_norm/5": 4712.44091796875, "attnres/final_alpha/block_6": 0.1408214271068573, "attnres/block_norm/6": 17457.943359375, "geo/tier1_time_s": 1.363358736038208, "geo/step": 17475.0, "geo/rankme_slope": -1.1807027498499402e-05} {"step": 17480, "timestamp": 1778213425.2764213, "train/loss": 2.2764525175094605, "train/z_loss": 0.0015996567206457257, "train/perplexity": 9.742059252243008, "train/grad_norm": 0.1474609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1701020.830324541, "perf/iters_per_sec": 0.8111099387762742, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2328784942626954, "data/tokens_consumed": 36660314112, "data/tokens_consumed_B": 36.660314112, "train/loss_slope": -9.8684322358322e-06} {"step": 17490, "timestamp": 1778213435.625889, "train/loss": 2.208892583847046, "train/z_loss": 0.001623540953733027, "train/perplexity": 9.105627089021128, "train/grad_norm": 0.142578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027183.9408463105, "perf/iters_per_sec": 0.9666366295081665, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345149040222168, "data/tokens_consumed": 36681285632, "data/tokens_consumed_B": 36.681285632, "train/loss_slope": -9.335643437543197e-06} {"step": 17500, "timestamp": 1778213445.9697409, "grad/layer_0/attn": 0.0027988110668957233, "grad/layer_0/mlp": 0.002725361380726099, "grad/layer_0/attn_mlp_ratio": 1.0269504015115292, "grad/layer_4/attn": 0.001780597842298448, "grad/layer_4/mlp": 0.002471015090122819, "grad/layer_4/attn_mlp_ratio": 0.7205936448371025, "grad/layer_8/attn": 0.004195519257336855, "grad/layer_8/mlp": 0.0036955017130821943, "grad/layer_8/attn_mlp_ratio": 1.1353043428323002, "grad/layer_12/attn": 0.0040539163164794445, "grad/layer_12/mlp": 0.00557335838675499, "grad/layer_12/attn_mlp_ratio": 0.7273740467464136, "grad/layer_16/attn": 0.004833494313061237, "grad/layer_16/mlp": 0.004615133628249168, "grad/layer_16/attn_mlp_ratio": 1.0473140319803709, "grad/layer_20/attn": 0.004402461461722851, "grad/layer_20/mlp": 0.007422956638038158, "grad/layer_20/attn_mlp_ratio": 0.5930873123863032, "grad/layer_24/attn": 0.027506375685334206, "grad/layer_24/mlp": 0.01587505452334881, "grad/layer_24/attn_mlp_ratio": 1.7326791206676047, "grad/layer_27/attn": 0.007076960988342762, "grad/layer_27/mlp": 0.015117719769477844, "grad/layer_27/attn_mlp_ratio": 0.4681235695226039} {"step": 17500, "timestamp": 1778213445.9858193, "train/loss": 2.276809072494507, "train/z_loss": 0.0016104707843624055, "train/perplexity": 9.745533451368715, "train/grad_norm": 0.26953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025244.143916987, "perf/iters_per_sec": 0.9657116622528968, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035505771636963, "data/tokens_consumed": 36702257152, "data/tokens_consumed_B": 36.702257152, "train/loss_slope": -6.1484842017145175e-06} {"step": 17500, "timestamp": 1778213453.0105972, "geo/ww_alpha_mean": 8.046137079573683, "geo/ww_alpha_std": 5.609523507075265, "geo/ww_alpha_min": 1.3654423472172947, "geo/ww_alpha_max": 48.16151166581915, "geo/ww_alpha_healthy_frac": 0.17766497461928935, "geo/ww_alpha_by_type/q_proj": 4.255287115968218, "geo/ww_alpha_by_type/k_proj": 4.646540476256485, "geo/ww_alpha_by_type/v_proj": 8.27469451242753, "geo/ww_alpha_by_type/o_proj": 6.7086478880056, "geo/ww_alpha_by_type/gate_proj": 9.038290923659906, "geo/ww_alpha_by_type/up_proj": 14.231089700045015, "geo/ww_alpha_by_type/down_proj": 9.299803202167109, "geo/twonn_id/layer_0": 0.6768156290054321, "geo/twonn_id/layer_7": 3.3568191528320312, "geo/twonn_id/layer_14": 4.260143756866455, "geo/twonn_id/layer_21": 8.150527954101562, "geo/twonn_id/layer_27": 6.381114482879639, "geo/tier2_time_s": 7.018441677093506} {"step": 17500, "timestamp": 1778213453.6310036, "eoc/jacobian_sigma/layer_0/attn": 806.2401123046875, "eoc/jacobian_sigma/layer_0/mlp": 5168.39892578125, "eoc/jacobian_sigma/layer_0": 5168.39892578125, "eoc/jacobian_sigma/layer_7/attn": 1.1453909873962402, "eoc/jacobian_sigma/layer_7/mlp": 1.6299220323562622, "eoc/jacobian_sigma/layer_7": 1.6299220323562622, "eoc/jacobian_sigma/layer_14/attn": 1.5679504871368408, "eoc/jacobian_sigma/layer_14/mlp": 6.866014003753662, "eoc/jacobian_sigma/layer_14": 6.866014003753662, "eoc/jacobian_sigma/layer_21/attn": 1.0770310163497925, "eoc/jacobian_sigma/layer_21/mlp": 4.002626419067383, "eoc/jacobian_sigma/layer_21": 4.002626419067383, "eoc/jacobian_sigma/layer_27/attn": 3.2042250633239746, "eoc/jacobian_sigma/layer_27/mlp": 21.015960693359375, "eoc/jacobian_sigma/layer_27": 21.015960693359375, "eoc/layer0_sigma": 5168.39892578125, "eoc/sigma_max": 21.015960693359375, "eoc/sigma_min": 1.6299220323562622, "eoc/sigma_mean": 8.37863078713417, "eoc/time_s": 0.6144332885742188} {"step": 17510, "timestamp": 1778213463.9976048, "train/loss": 2.2521411657333372, "train/z_loss": 0.0016117123188450933, "train/perplexity": 9.508072415441852, "train/grad_norm": 0.09619140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1164657.0241439613, "perf/iters_per_sec": 0.5553517456741148, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.8006605863571168, "data/tokens_consumed": 36723228672, "data/tokens_consumed_B": 36.723228672, "train/loss_slope": -3.2738523800881577e-06} {"step": 17520, "timestamp": 1778213474.3598409, "train/loss": 2.208351159095764, "train/z_loss": 0.0016163106425665318, "train/perplexity": 9.100698411513587, "train/grad_norm": 0.17578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024926.690350569, "perf/iters_per_sec": 0.9655602885964246, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0356681108474732, "data/tokens_consumed": 36744200192, "data/tokens_consumed_B": 36.744200192, "train/loss_slope": -1.8197360641062892e-06} {"step": 17530, "timestamp": 1778213484.7253096, "train/loss": 2.2601210832595826, "train/z_loss": 0.001612021611072123, "train/perplexity": 9.584249588689977, "train/grad_norm": 0.140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024710.558120433, "perf/iters_per_sec": 0.9654572287180104, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0357786655426025, "data/tokens_consumed": 36765171712, "data/tokens_consumed_B": 36.765171712, "train/loss_slope": 6.432598502007136e-07} {"step": 17540, "timestamp": 1778213495.074896, "train/loss": 2.272047424316406, "train/z_loss": 0.0016000862349756062, "train/perplexity": 9.699238956285159, "train/grad_norm": 0.1142578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027306.726652431, "perf/iters_per_sec": 0.9666951783430248, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034452247619629, "data/tokens_consumed": 36786143232, "data/tokens_consumed_B": 36.786143232, "train/loss_slope": 4.945966498543464e-06} {"step": 17550, "timestamp": 1778213505.4242508, "grad/layer_0/attn": 0.003050700295716524, "grad/layer_0/mlp": 0.003030827734619379, "grad/layer_0/attn_mlp_ratio": 1.0065567766239156, "grad/layer_4/attn": 0.0019293359946459532, "grad/layer_4/mlp": 0.0025416016578674316, "grad/layer_4/attn_mlp_ratio": 0.759102400159178, "grad/layer_8/attn": 0.0050397636368870735, "grad/layer_8/mlp": 0.004112220369279385, "grad/layer_8/attn_mlp_ratio": 1.2255577429607585, "grad/layer_12/attn": 0.004718853626400232, "grad/layer_12/mlp": 0.006308029871433973, "grad/layer_12/attn_mlp_ratio": 0.7480708949972726, "grad/layer_16/attn": 0.005563770420849323, "grad/layer_16/mlp": 0.004553603939712048, "grad/layer_16/attn_mlp_ratio": 1.2218388714362514, "grad/layer_20/attn": 0.004393707029521465, "grad/layer_20/mlp": 0.007238783407956362, "grad/layer_20/attn_mlp_ratio": 0.6069675967919488, "grad/layer_24/attn": 0.020765438675880432, "grad/layer_24/mlp": 0.016953427344560623, "grad/layer_24/attn_mlp_ratio": 1.2248519506622162, "grad/layer_27/attn": 0.010862288996577263, "grad/layer_27/mlp": 0.017066143453121185, "grad/layer_27/attn_mlp_ratio": 0.6364817548127728} {"step": 17550, "timestamp": 1778213506.0462089, "eos/sharpness": 38.754582405090325, "eos/L0_probe": 2.0698840618133545, "eos/L_plus": 2.285797595977783, "eos/L_minus": 2.241516351699829, "eos/grad_norm": 0.2468591332435608, "eos/embed_grad_frac": 0.054703764617443085, "eos/time_s": 0.6192188262939453} {"step": 17550, "timestamp": 1778213506.0666606, "train/loss": 2.243306851387024, "train/z_loss": 0.0016184181906282902, "train/perplexity": 9.42444505398736, "train/grad_norm": 0.2470703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1908857.9909542876, "perf/iters_per_sec": 0.910214419819969, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0986422300338745, "data/tokens_consumed": 36807114752, "data/tokens_consumed_B": 36.807114752, "train/loss_slope": 3.2252336886062823e-06} {"step": 17550, "timestamp": 1778213507.4308715, "geo/rankme_last": 440.8947448730469, "geo/layer_0/stable_rank_q_proj": 16.497652053833008, "geo/layer_0/stable_rank_k_proj": 14.62779426574707, "geo/layer_0/stable_rank_o_proj": 53.11887741088867, "geo/layer_0/stable_rank_gate_proj": 156.7106475830078, "geo/layer_0/stable_rank_down_proj": 49.34720993041992, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.04242716729640961, "geo/layer_0/attn_entropy_mean": 6.290743350982666, "geo/layer_0/attn_entropy_std": 0.30251380801200867, "geo/layer_7/stable_rank_q_proj": 42.806053161621094, "geo/layer_7/stable_rank_k_proj": 42.59123611450195, "geo/layer_7/stable_rank_o_proj": 110.8843994140625, "geo/layer_7/stable_rank_gate_proj": 107.59968566894531, "geo/layer_7/stable_rank_down_proj": 154.13494873046875, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.561539351940155, "geo/layer_7/attn_entropy_mean": 4.652621269226074, "geo/layer_7/attn_entropy_std": 0.8691471219062805, "geo/layer_14/stable_rank_q_proj": 58.48028564453125, "geo/layer_14/stable_rank_k_proj": 35.587158203125, "geo/layer_14/stable_rank_o_proj": 52.44788360595703, "geo/layer_14/stable_rank_gate_proj": 92.67508697509766, "geo/layer_14/stable_rank_down_proj": 136.34523010253906, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3830346465110779, "geo/layer_14/attn_entropy_mean": 5.49302864074707, "geo/layer_14/attn_entropy_std": 0.5306230783462524, "geo/layer_21/stable_rank_q_proj": 49.393436431884766, "geo/layer_21/stable_rank_k_proj": 31.8577880859375, "geo/layer_21/stable_rank_o_proj": 86.39971160888672, "geo/layer_21/stable_rank_gate_proj": 91.58564758300781, "geo/layer_21/stable_rank_down_proj": 63.00046920776367, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1553654670715332, "geo/layer_21/attn_entropy_mean": 5.76400089263916, "geo/layer_21/attn_entropy_std": 0.28819796442985535, "geo/layer_27/stable_rank_q_proj": 41.915950775146484, "geo/layer_27/stable_rank_k_proj": 31.85426139831543, "geo/layer_27/stable_rank_o_proj": 116.65106964111328, "geo/layer_27/stable_rank_gate_proj": 92.43441009521484, "geo/layer_27/stable_rank_down_proj": 140.46197509765625, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08117299526929855, "geo/layer_27/attn_entropy_mean": 4.42437744140625, "geo/layer_27/attn_entropy_std": 0.5085493326187134, "attnres/final_alpha/block_0": 0.24324923753738403, "attnres/block_norm/0": 1.5769989490509033, "attnres/final_alpha/block_1": 0.007220160216093063, "attnres/block_norm/1": 26358.953125, "attnres/final_alpha/block_2": 0.014807049185037613, "attnres/block_norm/2": 19321.41796875, "attnres/final_alpha/block_3": 0.016885966062545776, "attnres/block_norm/3": 25656.41796875, "attnres/final_alpha/block_4": 0.02235308289527893, "attnres/block_norm/4": 8383.4189453125, "attnres/final_alpha/block_5": 0.5545884370803833, "attnres/block_norm/5": 4737.9951171875, "attnres/final_alpha/block_6": 0.1408960372209549, "attnres/block_norm/6": 17430.05859375, "geo/tier1_time_s": 1.360159158706665, "geo/step": 17550.0, "geo/rankme_slope": 2.6223379976990605e-07} {"step": 17560, "timestamp": 1778213517.7864056, "train/loss": 2.239499592781067, "train/z_loss": 0.0016067784978076815, "train/perplexity": 9.388631972540328, "train/grad_norm": 0.1708984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1790012.4754589158, "perf/iters_per_sec": 0.8535444619459704, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1715851306915284, "data/tokens_consumed": 36828086272, "data/tokens_consumed_B": 36.828086272, "train/loss_slope": 1.923466987735755e-06} {"step": 17570, "timestamp": 1778213528.1302235, "train/loss": 2.23649377822876, "train/z_loss": 0.0016163570806384086, "train/perplexity": 9.360453856242609, "train/grad_norm": 0.236328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028400.6343615227, "perf/iters_per_sec": 0.9672167941863645, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0338943719863891, "data/tokens_consumed": 36849057792, "data/tokens_consumed_B": 36.849057792, "train/loss_slope": 2.729480258702449e-06} {"step": 17580, "timestamp": 1778213538.4824927, "train/loss": 2.2621651887893677, "train/z_loss": 0.0016207750886678696, "train/perplexity": 9.60386084318135, "train/grad_norm": 0.1982421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027079.855548812, "perf/iters_per_sec": 0.9665869977706967, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345680236816406, "data/tokens_consumed": 36870029312, "data/tokens_consumed_B": 36.870029312, "train/loss_slope": 4.540036756857136e-06} {"step": 17590, "timestamp": 1778213548.8312054, "train/loss": 2.2373883962631225, "train/z_loss": 0.0016095399390906096, "train/perplexity": 9.368831633968986, "train/grad_norm": 0.1943359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027609.082349702, "perf/iters_per_sec": 0.966839352774478, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342979907989502, "data/tokens_consumed": 36891000832, "data/tokens_consumed_B": 36.891000832, "train/loss_slope": 2.9152723965328533e-06} {"step": 17600, "timestamp": 1778213559.170408, "grad/layer_0/attn": 0.003262731246650219, "grad/layer_0/mlp": 0.0029405751265585423, "grad/layer_0/attn_mlp_ratio": 1.109555442480112, "grad/layer_4/attn": 0.0016884153010323644, "grad/layer_4/mlp": 0.0025564786046743393, "grad/layer_4/attn_mlp_ratio": 0.6604456739440924, "grad/layer_8/attn": 0.010270277038216591, "grad/layer_8/mlp": 0.0038275953847914934, "grad/layer_8/attn_mlp_ratio": 2.6832190285061066, "grad/layer_12/attn": 0.004456979688256979, "grad/layer_12/mlp": 0.006341691594570875, "grad/layer_12/attn_mlp_ratio": 0.7028061127715498, "grad/layer_16/attn": 0.004869958385825157, "grad/layer_16/mlp": 0.004907530732452869, "grad/layer_16/attn_mlp_ratio": 0.9923439204132453, "grad/layer_20/attn": 0.009657766669988632, "grad/layer_20/mlp": 0.007636731956154108, "grad/layer_20/attn_mlp_ratio": 1.2646465266783664, "grad/layer_24/attn": 0.0210284236818552, "grad/layer_24/mlp": 0.01481539849191904, "grad/layer_24/attn_mlp_ratio": 1.4193626686038008, "grad/layer_27/attn": 0.01368038821965456, "grad/layer_27/mlp": 0.014861593954265118, "grad/layer_27/attn_mlp_ratio": 0.9205195734523806} {"step": 17600, "timestamp": 1778213559.1862981, "train/loss": 2.2518381595611574, "train/z_loss": 0.0016093943035230041, "train/perplexity": 9.50519184725145, "train/grad_norm": 0.3046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026367.8494219119, "perf/iters_per_sec": 0.9662474867925224, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349315404891968, "data/tokens_consumed": 36911972352, "data/tokens_consumed_B": 36.911972352, "train/loss_slope": 3.628900521086958e-06} {"step": 17610, "timestamp": 1778213569.5382922, "train/loss": 2.2683804273605346, "train/z_loss": 0.0015981504460796715, "train/perplexity": 9.663737009106793, "train/grad_norm": 0.1982421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026927.8111874494, "perf/iters_per_sec": 0.9665144973695037, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346456289291381, "data/tokens_consumed": 36932943872, "data/tokens_consumed_B": 36.932943872, "train/loss_slope": 5.6431818251634105e-06} {"step": 17620, "timestamp": 1778213579.8867044, "train/loss": 2.3072546482086183, "train/z_loss": 0.001592656853608787, "train/perplexity": 10.046804745770965, "train/grad_norm": 0.1318359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027360.6487116693, "perf/iters_per_sec": 0.9667208903845164, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344247341156005, "data/tokens_consumed": 36953915392, "data/tokens_consumed_B": 36.953915392, "train/loss_slope": 8.397080080665856e-06} {"step": 17625, "timestamp": 1778213585.6640801, "eos/sharpness": 22.97291755676269, "eos/L0_probe": 2.0710556507110596, "eos/L_plus": 2.2196168899536133, "eos/L_minus": 2.152223587036133, "eos/grad_norm": 0.12765441834926605, "eos/embed_grad_frac": 0.1733347475528717, "eos/time_s": 0.607628583908081} {"step": 17625, "timestamp": 1778213587.0469155, "geo/rankme_last": 440.68707275390625, "geo/layer_0/stable_rank_q_proj": 16.518552780151367, "geo/layer_0/stable_rank_k_proj": 14.627286911010742, "geo/layer_0/stable_rank_o_proj": 53.169403076171875, "geo/layer_0/stable_rank_gate_proj": 156.37655639648438, "geo/layer_0/stable_rank_down_proj": 49.42206954956055, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.044477786868810654, "geo/layer_0/attn_entropy_mean": 6.287537574768066, "geo/layer_0/attn_entropy_std": 0.3082173466682434, "geo/layer_7/stable_rank_q_proj": 42.845703125, "geo/layer_7/stable_rank_k_proj": 42.50883102416992, "geo/layer_7/stable_rank_o_proj": 110.92250061035156, "geo/layer_7/stable_rank_gate_proj": 107.6726303100586, "geo/layer_7/stable_rank_down_proj": 154.29029846191406, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.575331449508667, "geo/layer_7/attn_entropy_mean": 4.634151935577393, "geo/layer_7/attn_entropy_std": 0.8668444752693176, "geo/layer_14/stable_rank_q_proj": 58.32062530517578, "geo/layer_14/stable_rank_k_proj": 35.49272918701172, "geo/layer_14/stable_rank_o_proj": 52.599456787109375, "geo/layer_14/stable_rank_gate_proj": 92.77657318115234, "geo/layer_14/stable_rank_down_proj": 137.1703338623047, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37485113739967346, "geo/layer_14/attn_entropy_mean": 5.5248188972473145, "geo/layer_14/attn_entropy_std": 0.5167708992958069, "geo/layer_21/stable_rank_q_proj": 49.428001403808594, "geo/layer_21/stable_rank_k_proj": 31.907506942749023, "geo/layer_21/stable_rank_o_proj": 86.26929473876953, "geo/layer_21/stable_rank_gate_proj": 91.32572937011719, "geo/layer_21/stable_rank_down_proj": 62.96357345581055, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15415821969509125, "geo/layer_21/attn_entropy_mean": 5.7599897384643555, "geo/layer_21/attn_entropy_std": 0.28430116176605225, "geo/layer_27/stable_rank_q_proj": 41.92161560058594, "geo/layer_27/stable_rank_k_proj": 31.773712158203125, "geo/layer_27/stable_rank_o_proj": 116.64234161376953, "geo/layer_27/stable_rank_gate_proj": 92.36666870117188, "geo/layer_27/stable_rank_down_proj": 140.77908325195312, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07074791938066483, "geo/layer_27/attn_entropy_mean": 4.418286323547363, "geo/layer_27/attn_entropy_std": 0.5265395641326904, "attnres/final_alpha/block_0": 0.24424755573272705, "attnres/block_norm/0": 1.5782277584075928, "attnres/final_alpha/block_1": 0.007169978227466345, "attnres/block_norm/1": 26592.328125, "attnres/final_alpha/block_2": 0.015022218227386475, "attnres/block_norm/2": 19378.1796875, "attnres/final_alpha/block_3": 0.017179427668452263, "attnres/block_norm/3": 25822.19140625, "attnres/final_alpha/block_4": 0.02274162694811821, "attnres/block_norm/4": 8423.37890625, "attnres/final_alpha/block_5": 0.5533154010772705, "attnres/block_norm/5": 4769.4775390625, "attnres/final_alpha/block_6": 0.14032375812530518, "attnres/block_norm/6": 17525.423828125, "geo/tier1_time_s": 1.3632755279541016, "geo/step": 17625.0, "geo/rankme_slope": 3.586430666016407e-05} {"step": 17630, "timestamp": 1778213592.2374284, "train/loss": 2.1966646194458006, "train/z_loss": 0.001631125609856099, "train/perplexity": 8.994961789710679, "train/grad_norm": 0.1552734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1698834.5168636995, "perf/iters_per_sec": 0.8100674232786653, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2344651460647582, "data/tokens_consumed": 36974886912, "data/tokens_consumed_B": 36.974886912, "train/loss_slope": 7.927836407565026e-06} {"step": 17640, "timestamp": 1778213602.605903, "train/loss": 2.245377612113953, "train/z_loss": 0.0016106369555927813, "train/perplexity": 9.443981044878663, "train/grad_norm": 0.1572265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024336.0599384932, "perf/iters_per_sec": 0.965278654069182, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0359702825546264, "data/tokens_consumed": 36995858432, "data/tokens_consumed_B": 36.995858432, "train/loss_slope": 7.92753645653416e-06} {"step": 17650, "timestamp": 1778213613.4941165, "grad/layer_0/attn": 0.0024209253024309874, "grad/layer_0/mlp": 0.00251556234434247, "grad/layer_0/attn_mlp_ratio": 0.9623793310619966, "grad/layer_4/attn": 0.0014227161882445216, "grad/layer_4/mlp": 0.0023575108498334885, "grad/layer_4/attn_mlp_ratio": 0.6034823245868735, "grad/layer_8/attn": 0.004949389956891537, "grad/layer_8/mlp": 0.003562372177839279, "grad/layer_8/attn_mlp_ratio": 1.389352254866956, "grad/layer_12/attn": 0.004569333977997303, "grad/layer_12/mlp": 0.006076595280319452, "grad/layer_12/attn_mlp_ratio": 0.7519562669576807, "grad/layer_16/attn": 0.004406313877552748, "grad/layer_16/mlp": 0.004591302014887333, "grad/layer_16/attn_mlp_ratio": 0.9597089817429442, "grad/layer_20/attn": 0.014275617897510529, "grad/layer_20/mlp": 0.0066264900378882885, "grad/layer_20/attn_mlp_ratio": 2.1543256838015674, "grad/layer_24/attn": 0.017886599525809288, "grad/layer_24/mlp": 0.01310049369931221, "grad/layer_24/attn_mlp_ratio": 1.3653378109112462, "grad/layer_27/attn": 0.011141971684992313, "grad/layer_27/mlp": 0.011394198052585125, "grad/layer_27/attn_mlp_ratio": 0.9778636053002479} {"step": 17650, "timestamp": 1778213613.50988, "train/loss": 2.2426300048828125, "train/z_loss": 0.0016123867593705655, "train/perplexity": 9.418068309580459, "train/grad_norm": 0.1904296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1924216.8513035588, "perf/iters_per_sec": 0.9175380951421541, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0898730039596558, "data/tokens_consumed": 37016829952, "data/tokens_consumed_B": 37.016829952, "train/loss_slope": 8.447548579854952e-06} {"step": 17660, "timestamp": 1778213623.88039, "train/loss": 2.203326392173767, "train/z_loss": 0.0016183764673769474, "train/perplexity": 9.055084219483279, "train/grad_norm": 0.1337890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023815.1056477374, "perf/iters_per_sec": 0.9650302437056243, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0362369537353515, "data/tokens_consumed": 37037801472, "data/tokens_consumed_B": 37.037801472, "train/loss_slope": 3.3921248913526684e-06} {"step": 17670, "timestamp": 1778213634.2557304, "train/loss": 2.2439713001251222, "train/z_loss": 0.0016067894524894656, "train/perplexity": 9.43070919548075, "train/grad_norm": 0.22265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022839.2654816227, "perf/iters_per_sec": 0.9645649268539537, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0367368459701538, "data/tokens_consumed": 37058772992, "data/tokens_consumed_B": 37.058772992, "train/loss_slope": 5.706236021246462e-07} {"step": 17680, "timestamp": 1778213644.6130462, "train/loss": 2.2179465770721434, "train/z_loss": 0.0016167021589353681, "train/perplexity": 9.188443719863766, "train/grad_norm": 0.234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025998.991020792, "perf/iters_per_sec": 0.966071601400753, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351199626922607, "data/tokens_consumed": 37079744512, "data/tokens_consumed_B": 37.079744512, "train/loss_slope": -1.0879201714498041e-06} {"step": 17690, "timestamp": 1778213654.9675999, "train/loss": 2.2131659269332884, "train/z_loss": 0.0016150967334397138, "train/perplexity": 9.144621817172164, "train/grad_norm": 0.15234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026510.7522141286, "perf/iters_per_sec": 0.9663156281538623, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348585605621339, "data/tokens_consumed": 37100716032, "data/tokens_consumed_B": 37.100716032, "train/loss_slope": -3.80348851649999e-07} {"step": 17700, "timestamp": 1778213665.3062253, "grad/layer_0/attn": 0.003480029758065939, "grad/layer_0/mlp": 0.0029368852265179157, "grad/layer_0/attn_mlp_ratio": 1.1849389305887528, "grad/layer_4/attn": 0.0017444671830162406, "grad/layer_4/mlp": 0.00252921343781054, "grad/layer_4/attn_mlp_ratio": 0.6897271254234905, "grad/layer_8/attn": 0.004975857678800821, "grad/layer_8/mlp": 0.0037323664873838425, "grad/layer_8/attn_mlp_ratio": 1.3331642437321756, "grad/layer_12/attn": 0.004259674344211817, "grad/layer_12/mlp": 0.006161815021187067, "grad/layer_12/attn_mlp_ratio": 0.6913018745994436, "grad/layer_16/attn": 0.004483031574636698, "grad/layer_16/mlp": 0.005300436168909073, "grad/layer_16/attn_mlp_ratio": 0.8457853933520435, "grad/layer_20/attn": 0.004932432901114225, "grad/layer_20/mlp": 0.008041217923164368, "grad/layer_20/attn_mlp_ratio": 0.6133937528997905, "grad/layer_24/attn": 0.03032325766980648, "grad/layer_24/mlp": 0.01640498638153076, "grad/layer_24/attn_mlp_ratio": 1.848417108050978, "grad/layer_27/attn": 0.01002698577940464, "grad/layer_27/mlp": 0.014954333193600178, "grad/layer_27/attn_mlp_ratio": 0.6705070418415622} {"step": 17700, "timestamp": 1778213665.9198642, "eos/sharpness": 57.11288452148436, "eos/L0_probe": 2.0726962089538574, "eos/L_plus": 2.31134295463562, "eos/L_minus": 2.4051783084869385, "eos/grad_norm": 0.2824665307998657, "eos/embed_grad_frac": 0.030598612502217293, "eos/time_s": 0.6107356548309326} {"step": 17700, "timestamp": 1778213665.9397295, "train/loss": 2.2142325401306153, "train/z_loss": 0.0016164757194928824, "train/perplexity": 9.15438079508904, "train/grad_norm": 0.28125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1912237.4931692474, "perf/iters_per_sec": 0.9118258920522916, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0967005968093873, "data/tokens_consumed": 37121687552, "data/tokens_consumed_B": 37.121687552, "train/loss_slope": -2.1182329920080093e-06} {"step": 17700, "timestamp": 1778213667.303119, "geo/rankme_last": 440.9725341796875, "geo/layer_0/stable_rank_q_proj": 16.56696319580078, "geo/layer_0/stable_rank_k_proj": 14.635871887207031, "geo/layer_0/stable_rank_o_proj": 53.186824798583984, "geo/layer_0/stable_rank_gate_proj": 155.96372985839844, "geo/layer_0/stable_rank_down_proj": 49.49212646484375, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.04162820428609848, "geo/layer_0/attn_entropy_mean": 6.2884297370910645, "geo/layer_0/attn_entropy_std": 0.30385148525238037, "geo/layer_7/stable_rank_q_proj": 42.889793395996094, "geo/layer_7/stable_rank_k_proj": 42.32963562011719, "geo/layer_7/stable_rank_o_proj": 111.48554992675781, "geo/layer_7/stable_rank_gate_proj": 107.494140625, "geo/layer_7/stable_rank_down_proj": 154.13705444335938, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5716373920440674, "geo/layer_7/attn_entropy_mean": 4.667392730712891, "geo/layer_7/attn_entropy_std": 0.8842920660972595, "geo/layer_14/stable_rank_q_proj": 58.57237243652344, "geo/layer_14/stable_rank_k_proj": 35.55258560180664, "geo/layer_14/stable_rank_o_proj": 52.6735954284668, "geo/layer_14/stable_rank_gate_proj": 92.61865234375, "geo/layer_14/stable_rank_down_proj": 136.78158569335938, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37605521082878113, "geo/layer_14/attn_entropy_mean": 5.582456588745117, "geo/layer_14/attn_entropy_std": 0.49587664008140564, "geo/layer_21/stable_rank_q_proj": 49.34431838989258, "geo/layer_21/stable_rank_k_proj": 31.919578552246094, "geo/layer_21/stable_rank_o_proj": 86.11531829833984, "geo/layer_21/stable_rank_gate_proj": 91.35938262939453, "geo/layer_21/stable_rank_down_proj": 62.92349624633789, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1526082307100296, "geo/layer_21/attn_entropy_mean": 5.770817756652832, "geo/layer_21/attn_entropy_std": 0.30237480998039246, "geo/layer_27/stable_rank_q_proj": 41.81671142578125, "geo/layer_27/stable_rank_k_proj": 31.73899269104004, "geo/layer_27/stable_rank_o_proj": 116.74348449707031, "geo/layer_27/stable_rank_gate_proj": 92.22015380859375, "geo/layer_27/stable_rank_down_proj": 140.75526428222656, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07269372791051865, "geo/layer_27/attn_entropy_mean": 4.433271408081055, "geo/layer_27/attn_entropy_std": 0.5291914343833923, "attnres/final_alpha/block_0": 0.2454300820827484, "attnres/block_norm/0": 1.5795636177062988, "attnres/final_alpha/block_1": 0.007339519448578358, "attnres/block_norm/1": 26506.96484375, "attnres/final_alpha/block_2": 0.01525631733238697, "attnres/block_norm/2": 19377.861328125, "attnres/final_alpha/block_3": 0.017188409343361855, "attnres/block_norm/3": 25895.962890625, "attnres/final_alpha/block_4": 0.02253604866564274, "attnres/block_norm/4": 8443.4677734375, "attnres/final_alpha/block_5": 0.5483866333961487, "attnres/block_norm/5": 4786.3740234375, "attnres/final_alpha/block_6": 0.1438629925251007, "attnres/block_norm/6": 17530.76953125, "geo/tier1_time_s": 1.3595223426818848, "geo/step": 17700.0, "geo/rankme_slope": 3.611391822353942e-05} {"step": 17710, "timestamp": 1778213677.6554978, "train/loss": 2.2578797340393066, "train/z_loss": 0.0016004771110601722, "train/perplexity": 9.562791994311597, "train/grad_norm": 0.1611328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1790609.0555508395, "perf/iters_per_sec": 0.8538289335016439, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1711947917938232, "data/tokens_consumed": 37142659072, "data/tokens_consumed_B": 37.142659072, "train/loss_slope": -6.136038694659388e-07} {"step": 17720, "timestamp": 1778213688.0076451, "train/loss": 2.1563623189926147, "train/z_loss": 0.0016335160355083645, "train/perplexity": 8.639652127569644, "train/grad_norm": 0.10791015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026899.6469480481, "perf/iters_per_sec": 0.9665010676136246, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034660005569458, "data/tokens_consumed": 37163630592, "data/tokens_consumed_B": 37.163630592, "train/loss_slope": -3.69663421649173e-06} {"step": 17730, "timestamp": 1778213698.3639102, "train/loss": 2.2310812950134276, "train/z_loss": 0.001614548813086003, "train/perplexity": 9.309927416957585, "train/grad_norm": 0.2333984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025996.8911107057, "perf/iters_per_sec": 0.966070600085595, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351210355758667, "data/tokens_consumed": 37184602112, "data/tokens_consumed_B": 37.184602112, "train/loss_slope": -5.615917841593453e-06} {"step": 17740, "timestamp": 1778213708.7257853, "train/loss": 2.230663847923279, "train/z_loss": 0.0016075557330623269, "train/perplexity": 9.306041825918644, "train/grad_norm": 0.134765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024876.9063983385, "perf/iters_per_sec": 0.9655365497581189, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0356935739517212, "data/tokens_consumed": 37205573632, "data/tokens_consumed_B": 37.205573632, "train/loss_slope": -7.235164439181057e-06} {"step": 17750, "timestamp": 1778213719.0685854, "grad/layer_0/attn": 0.0028207534924149513, "grad/layer_0/mlp": 0.002626507543027401, "grad/layer_0/attn_mlp_ratio": 1.073955942943176, "grad/layer_4/attn": 0.0024629367981106043, "grad/layer_4/mlp": 0.0025246813893318176, "grad/layer_4/attn_mlp_ratio": 0.9755435719388291, "grad/layer_8/attn": 0.004625103436410427, "grad/layer_8/mlp": 0.0037608081474900246, "grad/layer_8/attn_mlp_ratio": 1.229816340542551, "grad/layer_12/attn": 0.004665732849389315, "grad/layer_12/mlp": 0.005707825534045696, "grad/layer_12/attn_mlp_ratio": 0.8174273617539105, "grad/layer_16/attn": 0.0039489674381911755, "grad/layer_16/mlp": 0.004440570715814829, "grad/layer_16/attn_mlp_ratio": 0.8892927513118726, "grad/layer_20/attn": 0.004089461173862219, "grad/layer_20/mlp": 0.006000447552651167, "grad/layer_20/attn_mlp_ratio": 0.6815260144891656, "grad/layer_24/attn": 0.007825235836207867, "grad/layer_24/mlp": 0.009127422235906124, "grad/layer_24/attn_mlp_ratio": 0.8573325028934379, "grad/layer_27/attn": 0.00373059487901628, "grad/layer_27/mlp": 0.008112246170639992, "grad/layer_27/attn_mlp_ratio": 0.4598719953212128} {"step": 17750, "timestamp": 1778213719.084508, "train/loss": 2.252661347389221, "train/z_loss": 0.0015984912286512553, "train/perplexity": 9.513019626907953, "train/grad_norm": 0.109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025558.291986989, "perf/iters_per_sec": 0.9658614597258516, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0353451728820802, "data/tokens_consumed": 37226545152, "data/tokens_consumed_B": 37.226545152, "train/loss_slope": -5.669129420571469e-06} {"step": 17760, "timestamp": 1778213729.442856, "train/loss": 2.248965287208557, "train/z_loss": 0.001613275334239006, "train/perplexity": 9.477923831905493, "train/grad_norm": 0.1826171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025924.9834832202, "perf/iters_per_sec": 0.9660363118568517, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351577758789063, "data/tokens_consumed": 37247516672, "data/tokens_consumed_B": 37.247516672, "train/loss_slope": -5.329183540721948e-06} {"step": 17770, "timestamp": 1778213739.8269186, "train/loss": 2.22930223941803, "train/z_loss": 0.0016077333246357739, "train/perplexity": 9.293379262901365, "train/grad_norm": 0.11767578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021988.6927082539, "perf/iters_per_sec": 0.9641593421498555, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0371729612350464, "data/tokens_consumed": 37268488192, "data/tokens_consumed_B": 37.268488192, "train/loss_slope": -4.746351376546958e-06} {"step": 17775, "timestamp": 1778213745.5988555, "eos/sharpness": 13.248515129089354, "eos/L0_probe": 2.072385549545288, "eos/L_plus": 2.160144567489624, "eos/L_minus": 2.1171116828918457, "eos/grad_norm": 0.11958343535661697, "eos/embed_grad_frac": 0.17139464616775513, "eos/time_s": 0.6072525978088379} {"step": 17775, "timestamp": 1778213746.9751465, "geo/rankme_last": 441.5558166503906, "geo/layer_0/stable_rank_q_proj": 16.52754783630371, "geo/layer_0/stable_rank_k_proj": 14.667346000671387, "geo/layer_0/stable_rank_o_proj": 53.26924133300781, "geo/layer_0/stable_rank_gate_proj": 156.21237182617188, "geo/layer_0/stable_rank_down_proj": 49.53646469116211, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.03921326622366905, "geo/layer_0/attn_entropy_mean": 6.2840189933776855, "geo/layer_0/attn_entropy_std": 0.3050391674041748, "geo/layer_7/stable_rank_q_proj": 42.977840423583984, "geo/layer_7/stable_rank_k_proj": 42.43360900878906, "geo/layer_7/stable_rank_o_proj": 110.76970672607422, "geo/layer_7/stable_rank_gate_proj": 107.50886535644531, "geo/layer_7/stable_rank_down_proj": 153.57974243164062, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5469143986701965, "geo/layer_7/attn_entropy_mean": 4.662021160125732, "geo/layer_7/attn_entropy_std": 0.8722234964370728, "geo/layer_14/stable_rank_q_proj": 58.364524841308594, "geo/layer_14/stable_rank_k_proj": 35.52556610107422, "geo/layer_14/stable_rank_o_proj": 52.55549240112305, "geo/layer_14/stable_rank_gate_proj": 92.30988311767578, "geo/layer_14/stable_rank_down_proj": 136.9573516845703, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38382017612457275, "geo/layer_14/attn_entropy_mean": 5.5248613357543945, "geo/layer_14/attn_entropy_std": 0.5042516589164734, "geo/layer_21/stable_rank_q_proj": 49.409908294677734, "geo/layer_21/stable_rank_k_proj": 31.849475860595703, "geo/layer_21/stable_rank_o_proj": 86.10021209716797, "geo/layer_21/stable_rank_gate_proj": 91.3519515991211, "geo/layer_21/stable_rank_down_proj": 62.93574905395508, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1573832631111145, "geo/layer_21/attn_entropy_mean": 5.762489318847656, "geo/layer_21/attn_entropy_std": 0.2979811728000641, "geo/layer_27/stable_rank_q_proj": 41.7513313293457, "geo/layer_27/stable_rank_k_proj": 31.80666732788086, "geo/layer_27/stable_rank_o_proj": 116.76454162597656, "geo/layer_27/stable_rank_gate_proj": 92.08524322509766, "geo/layer_27/stable_rank_down_proj": 140.4320831298828, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07473638653755188, "geo/layer_27/attn_entropy_mean": 4.4152512550354, "geo/layer_27/attn_entropy_std": 0.5426539778709412, "attnres/final_alpha/block_0": 0.24540424346923828, "attnres/block_norm/0": 1.5804338455200195, "attnres/final_alpha/block_1": 0.007312930189073086, "attnres/block_norm/1": 26642.677734375, "attnres/final_alpha/block_2": 0.015041060745716095, "attnres/block_norm/2": 19333.056640625, "attnres/final_alpha/block_3": 0.01711626537144184, "attnres/block_norm/3": 25870.33203125, "attnres/final_alpha/block_4": 0.022444356232881546, "attnres/block_norm/4": 8457.3974609375, "attnres/final_alpha/block_5": 0.5507186651229858, "attnres/block_norm/5": 4823.107421875, "attnres/final_alpha/block_6": 0.14196249842643738, "attnres/block_norm/6": 17638.462890625, "geo/tier1_time_s": 1.355497121810913, "geo/step": 17775.0, "geo/rankme_slope": 7.223494866696679e-05} {"step": 17780, "timestamp": 1778213752.153225, "train/loss": 2.233264136314392, "train/z_loss": 0.0016065148054622113, "train/perplexity": 9.330271707132685, "train/grad_norm": 0.2412109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1702184.133467783, "perf/iters_per_sec": 0.8116646449412265, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2320359230041504, "data/tokens_consumed": 37289459712, "data/tokens_consumed_B": 37.289459712, "train/loss_slope": -4.619801284098769e-06} {"step": 17790, "timestamp": 1778213762.4982352, "train/loss": 2.251183843612671, "train/z_loss": 0.001605186820961535, "train/perplexity": 9.498974482914512, "train/grad_norm": 0.216796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027950.380620082, "perf/iters_per_sec": 0.9670020964718256, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034123921394348, "data/tokens_consumed": 37310431232, "data/tokens_consumed_B": 37.310431232, "train/loss_slope": -7.80327648434095e-06} {"step": 17800, "timestamp": 1778213772.8384638, "grad/layer_0/attn": 0.0029531968757510185, "grad/layer_0/mlp": 0.002601150656118989, "grad/layer_0/attn_mlp_ratio": 1.13534245133769, "grad/layer_4/attn": 0.0023939257953315973, "grad/layer_4/mlp": 0.002534727565944195, "grad/layer_4/attn_mlp_ratio": 0.9444508881548234, "grad/layer_8/attn": 0.008582638576626778, "grad/layer_8/mlp": 0.0037920107133686543, "grad/layer_8/attn_mlp_ratio": 2.2633476007950364, "grad/layer_12/attn": 0.005068968515843153, "grad/layer_12/mlp": 0.006108319852501154, "grad/layer_12/attn_mlp_ratio": 0.8298465953420759, "grad/layer_16/attn": 0.004527053330093622, "grad/layer_16/mlp": 0.004274762701243162, "grad/layer_16/attn_mlp_ratio": 1.0590186030385338, "grad/layer_20/attn": 0.003626573830842972, "grad/layer_20/mlp": 0.00584502425044775, "grad/layer_20/attn_mlp_ratio": 0.6204548712556114, "grad/layer_24/attn": 0.013742402195930481, "grad/layer_24/mlp": 0.011721886694431305, "grad/layer_24/attn_mlp_ratio": 1.1723711751301897, "grad/layer_27/attn": 0.006498160772025585, "grad/layer_27/mlp": 0.01149845402687788, "grad/layer_27/attn_mlp_ratio": 0.5651334257912111} {"step": 17800, "timestamp": 1778213772.8542073, "train/loss": 2.294942283630371, "train/z_loss": 0.0015946176717989147, "train/perplexity": 9.923863226397826, "train/grad_norm": 0.1787109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026168.6314700143, "perf/iters_per_sec": 0.9661524922704765, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350332975387573, "data/tokens_consumed": 37331402752, "data/tokens_consumed_B": 37.331402752, "train/loss_slope": -7.146889205598439e-06} {"step": 17810, "timestamp": 1778213783.1997674, "train/loss": 2.2482290744781492, "train/z_loss": 0.001605271629523486, "train/perplexity": 9.47094863165327, "train/grad_norm": 0.14453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028036.5998652093, "perf/iters_per_sec": 0.9670432090116545, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340799570083619, "data/tokens_consumed": 37352374272, "data/tokens_consumed_B": 37.352374272, "train/loss_slope": -6.811380429272218e-06} {"step": 17820, "timestamp": 1778213793.552818, "train/loss": 2.3151561975479127, "train/z_loss": 0.0015953452442772687, "train/perplexity": 10.126504530394733, "train/grad_norm": 0.2578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026753.4203367254, "perf/iters_per_sec": 0.9664313413318278, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347346544265748, "data/tokens_consumed": 37373345792, "data/tokens_consumed_B": 37.373345792, "train/loss_slope": -2.3285901836186196e-06} {"step": 17830, "timestamp": 1778213803.9038498, "train/loss": 2.1987053394317626, "train/z_loss": 0.001611868478357792, "train/perplexity": 9.013336730685623, "train/grad_norm": 0.1962890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027122.1797887178, "perf/iters_per_sec": 0.9666071795409764, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034546422958374, "data/tokens_consumed": 37394317312, "data/tokens_consumed_B": 37.394317312, "train/loss_slope": -3.7187293167412264e-06} {"step": 17840, "timestamp": 1778213814.2551527, "train/loss": 2.207477355003357, "train/z_loss": 0.0016146681155078113, "train/perplexity": 9.092749657330572, "train/grad_norm": 0.1591796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026914.8265818267, "perf/iters_per_sec": 0.9665083058270582, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346522569656371, "data/tokens_consumed": 37415288832, "data/tokens_consumed_B": 37.415288832, "train/loss_slope": -5.387283222760495e-06} {"step": 17850, "timestamp": 1778213825.080206, "grad/layer_0/attn": 0.0027490186039358377, "grad/layer_0/mlp": 0.002717413706704974, "grad/layer_0/attn_mlp_ratio": 1.0116304690705853, "grad/layer_4/attn": 0.004112070892006159, "grad/layer_4/mlp": 0.0024225704837590456, "grad/layer_4/attn_mlp_ratio": 1.6973998279239217, "grad/layer_8/attn": 0.004138784017413855, "grad/layer_8/mlp": 0.003681129077449441, "grad/layer_8/attn_mlp_ratio": 1.1243245802858541, "grad/layer_12/attn": 0.003578844480216503, "grad/layer_12/mlp": 0.006146176252514124, "grad/layer_12/attn_mlp_ratio": 0.5822879583910018, "grad/layer_16/attn": 0.003946768585592508, "grad/layer_16/mlp": 0.0045096431858837605, "grad/layer_16/attn_mlp_ratio": 0.8751842075728736, "grad/layer_20/attn": 0.005544624291360378, "grad/layer_20/mlp": 0.006659036967903376, "grad/layer_20/attn_mlp_ratio": 0.8326465575759477, "grad/layer_24/attn": 0.011107450351119041, "grad/layer_24/mlp": 0.01225780975073576, "grad/layer_24/attn_mlp_ratio": 0.9061529332217804, "grad/layer_27/attn": 0.008075560443103313, "grad/layer_27/mlp": 0.01077501941472292, "grad/layer_27/attn_mlp_ratio": 0.7494706095027409} {"step": 17850, "timestamp": 1778213825.6917927, "eos/sharpness": 27.316117286682125, "eos/L0_probe": 2.0703368186950684, "eos/L_plus": 2.2099363803863525, "eos/L_minus": 2.2038984298706055, "eos/grad_norm": 0.15706509351730347, "eos/embed_grad_frac": 0.1080513671040535, "eos/time_s": 0.6085946559906006} {"step": 17850, "timestamp": 1778213825.7114558, "train/loss": 2.232240843772888, "train/z_loss": 0.0016039102571085095, "train/perplexity": 9.320728993011368, "train/grad_norm": 0.1572265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1831233.3197265004, "perf/iters_per_sec": 0.8732000921852591, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1452128887176514, "data/tokens_consumed": 37436260352, "data/tokens_consumed_B": 37.436260352, "train/loss_slope": -5.8592499894063595e-06} {"step": 17850, "timestamp": 1778213827.074841, "geo/rankme_last": 441.065673828125, "geo/layer_0/stable_rank_q_proj": 16.532625198364258, "geo/layer_0/stable_rank_k_proj": 14.68604850769043, "geo/layer_0/stable_rank_o_proj": 53.07651138305664, "geo/layer_0/stable_rank_gate_proj": 155.85726928710938, "geo/layer_0/stable_rank_down_proj": 49.53059005737305, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.050313279032707214, "geo/layer_0/attn_entropy_mean": 6.281798362731934, "geo/layer_0/attn_entropy_std": 0.3009868264198303, "geo/layer_7/stable_rank_q_proj": 42.893917083740234, "geo/layer_7/stable_rank_k_proj": 42.67608642578125, "geo/layer_7/stable_rank_o_proj": 110.67267608642578, "geo/layer_7/stable_rank_gate_proj": 107.2544174194336, "geo/layer_7/stable_rank_down_proj": 153.85482788085938, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5712053179740906, "geo/layer_7/attn_entropy_mean": 4.6641998291015625, "geo/layer_7/attn_entropy_std": 0.9020339846611023, "geo/layer_14/stable_rank_q_proj": 58.137264251708984, "geo/layer_14/stable_rank_k_proj": 35.46364212036133, "geo/layer_14/stable_rank_o_proj": 52.70338821411133, "geo/layer_14/stable_rank_gate_proj": 92.14955139160156, "geo/layer_14/stable_rank_down_proj": 136.75640869140625, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3838464021682739, "geo/layer_14/attn_entropy_mean": 5.536846160888672, "geo/layer_14/attn_entropy_std": 0.5323493480682373, "geo/layer_21/stable_rank_q_proj": 49.48857498168945, "geo/layer_21/stable_rank_k_proj": 31.827411651611328, "geo/layer_21/stable_rank_o_proj": 85.69847106933594, "geo/layer_21/stable_rank_gate_proj": 91.21810150146484, "geo/layer_21/stable_rank_down_proj": 62.90957260131836, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.151174396276474, "geo/layer_21/attn_entropy_mean": 5.770060062408447, "geo/layer_21/attn_entropy_std": 0.2889084219932556, "geo/layer_27/stable_rank_q_proj": 41.79655456542969, "geo/layer_27/stable_rank_k_proj": 31.874296188354492, "geo/layer_27/stable_rank_o_proj": 116.48928833007812, "geo/layer_27/stable_rank_gate_proj": 92.1160888671875, "geo/layer_27/stable_rank_down_proj": 140.1559600830078, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07498271763324738, "geo/layer_27/attn_entropy_mean": 4.419462203979492, "geo/layer_27/attn_entropy_std": 0.518220841884613, "attnres/final_alpha/block_0": 0.2443200945854187, "attnres/block_norm/0": 1.581268072128296, "attnres/final_alpha/block_1": 0.007150947116315365, "attnres/block_norm/1": 26734.8203125, "attnres/final_alpha/block_2": 0.01497742161154747, "attnres/block_norm/2": 19474.662109375, "attnres/final_alpha/block_3": 0.017059385776519775, "attnres/block_norm/3": 26061.421875, "attnres/final_alpha/block_4": 0.022710217162966728, "attnres/block_norm/4": 8517.287109375, "attnres/final_alpha/block_5": 0.5547325015068054, "attnres/block_norm/5": 4793.802734375, "attnres/final_alpha/block_6": 0.1390494406223297, "attnres/block_norm/6": 17740.62109375, "geo/tier1_time_s": 1.3594565391540527, "geo/step": 17850.0, "geo/rankme_slope": 4.0519469506552624e-05} {"step": 17860, "timestamp": 1778213837.4280355, "train/loss": 2.21555700302124, "train/z_loss": 0.0016191114555113018, "train/perplexity": 9.166513465601115, "train/grad_norm": 0.1376953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1790445.0037653702, "perf/iters_per_sec": 0.8537507075144626, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1713021039962768, "data/tokens_consumed": 37457231872, "data/tokens_consumed_B": 37.457231872, "train/loss_slope": -5.91359177116449e-06} {"step": 17870, "timestamp": 1778213847.7891762, "train/loss": 2.2462605476379394, "train/z_loss": 0.0016018829308450222, "train/perplexity": 9.452323153461109, "train/grad_norm": 0.1708984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024918.439470151, "perf/iters_per_sec": 0.9655563542700534, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0356723308563232, "data/tokens_consumed": 37478203392, "data/tokens_consumed_B": 37.478203392, "train/loss_slope": -2.922544012976801e-06} {"step": 17880, "timestamp": 1778213858.138755, "train/loss": 2.2217055559158325, "train/z_loss": 0.001606762211304158, "train/perplexity": 9.223047882825234, "train/grad_norm": 0.30078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027390.2276279244, "perf/iters_per_sec": 0.9667349947108862, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344096422195435, "data/tokens_consumed": 37499174912, "data/tokens_consumed_B": 37.499174912, "train/loss_slope": -4.626284867408928e-06} {"step": 17890, "timestamp": 1778213868.4928021, "train/loss": 2.1907922506332396, "train/z_loss": 0.0016107998206280171, "train/perplexity": 8.942294847830368, "train/grad_norm": 0.1611328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026528.8674343894, "perf/iters_per_sec": 0.9663242661640117, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348493099212646, "data/tokens_consumed": 37520146432, "data/tokens_consumed_B": 37.520146432, "train/loss_slope": -8.637033852234794e-06} {"step": 17900, "timestamp": 1778213878.8321621, "grad/layer_0/attn": 0.0025692079216241837, "grad/layer_0/mlp": 0.002546407515183091, "grad/layer_0/attn_mlp_ratio": 1.0089539107192205, "grad/layer_4/attn": 0.0020031037274748087, "grad/layer_4/mlp": 0.0028899535536766052, "grad/layer_4/attn_mlp_ratio": 0.6931265921605545, "grad/layer_8/attn": 0.006232570856809616, "grad/layer_8/mlp": 0.00405197823420167, "grad/layer_8/attn_mlp_ratio": 1.53815503014963, "grad/layer_12/attn": 0.004326751455664635, "grad/layer_12/mlp": 0.005969775840640068, "grad/layer_12/attn_mlp_ratio": 0.7247761890374614, "grad/layer_16/attn": 0.0036348788999021053, "grad/layer_16/mlp": 0.004230264108628035, "grad/layer_16/attn_mlp_ratio": 0.8592557629115499, "grad/layer_20/attn": 0.005391737911850214, "grad/layer_20/mlp": 0.005734600126743317, "grad/layer_20/attn_mlp_ratio": 0.940211644868606, "grad/layer_24/attn": 0.010230502113699913, "grad/layer_24/mlp": 0.009000785648822784, "grad/layer_24/attn_mlp_ratio": 1.1366232237044376, "grad/layer_27/attn": 0.00797613337635994, "grad/layer_27/mlp": 0.009062976576387882, "grad/layer_27/attn_mlp_ratio": 0.8800787711547866} {"step": 17900, "timestamp": 1778213878.8480177, "train/loss": 2.2600892543792725, "train/z_loss": 0.0015961071942001582, "train/perplexity": 9.5839445376117, "train/grad_norm": 0.15625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026153.1363079185, "perf/iters_per_sec": 0.9661451036014168, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350412130355835, "data/tokens_consumed": 37541117952, "data/tokens_consumed_B": 37.541117952, "train/loss_slope": -7.92942423381292e-06} {"step": 17910, "timestamp": 1778213889.2028704, "train/loss": 2.210476207733154, "train/z_loss": 0.001613320759497583, "train/perplexity": 9.120058401446848, "train/grad_norm": 0.1845703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026150.6627045812, "perf/iters_per_sec": 0.9661439240954309, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350424766540527, "data/tokens_consumed": 37562089472, "data/tokens_consumed_B": 37.562089472, "train/loss_slope": -8.78129928204315e-06} {"step": 17920, "timestamp": 1778213899.5602636, "train/loss": 2.1984071254730226, "train/z_loss": 0.0016231564921326935, "train/perplexity": 9.010649228602944, "train/grad_norm": 0.158203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025850.2814840372, "perf/iters_per_sec": 0.9660006911678491, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351959466934204, "data/tokens_consumed": 37583060992, "data/tokens_consumed_B": 37.583060992, "train/loss_slope": -1.1615842604043477e-05} {"step": 17925, "timestamp": 1778213905.3341367, "eos/sharpness": 23.579549789428707, "eos/L0_probe": 2.0714948177337646, "eos/L_plus": 2.1916730403900146, "eos/L_minus": 2.1871120929718018, "eos/grad_norm": 0.13639678061008453, "eos/embed_grad_frac": 0.15192686021327972, "eos/time_s": 0.6074435710906982} {"step": 17925, "timestamp": 1778213906.7119045, "geo/rankme_last": 441.2794494628906, "geo/layer_0/stable_rank_q_proj": 16.546489715576172, "geo/layer_0/stable_rank_k_proj": 14.693647384643555, "geo/layer_0/stable_rank_o_proj": 53.06736755371094, "geo/layer_0/stable_rank_gate_proj": 156.37295532226562, "geo/layer_0/stable_rank_down_proj": 49.52157974243164, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.047435563057661057, "geo/layer_0/attn_entropy_mean": 6.276787757873535, "geo/layer_0/attn_entropy_std": 0.3041010797023773, "geo/layer_7/stable_rank_q_proj": 43.080657958984375, "geo/layer_7/stable_rank_k_proj": 42.73162078857422, "geo/layer_7/stable_rank_o_proj": 110.88016510009766, "geo/layer_7/stable_rank_gate_proj": 106.94651794433594, "geo/layer_7/stable_rank_down_proj": 153.95130920410156, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.558297872543335, "geo/layer_7/attn_entropy_mean": 4.673731803894043, "geo/layer_7/attn_entropy_std": 0.8458372950553894, "geo/layer_14/stable_rank_q_proj": 58.10765075683594, "geo/layer_14/stable_rank_k_proj": 35.38224792480469, "geo/layer_14/stable_rank_o_proj": 52.786014556884766, "geo/layer_14/stable_rank_gate_proj": 92.03252410888672, "geo/layer_14/stable_rank_down_proj": 137.00035095214844, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37032583355903625, "geo/layer_14/attn_entropy_mean": 5.545814037322998, "geo/layer_14/attn_entropy_std": 0.5094146132469177, "geo/layer_21/stable_rank_q_proj": 49.48320770263672, "geo/layer_21/stable_rank_k_proj": 31.79062843322754, "geo/layer_21/stable_rank_o_proj": 85.85067749023438, "geo/layer_21/stable_rank_gate_proj": 90.90570831298828, "geo/layer_21/stable_rank_down_proj": 62.80717468261719, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15060089528560638, "geo/layer_21/attn_entropy_mean": 5.764857769012451, "geo/layer_21/attn_entropy_std": 0.29327934980392456, "geo/layer_27/stable_rank_q_proj": 41.79637908935547, "geo/layer_27/stable_rank_k_proj": 31.811132431030273, "geo/layer_27/stable_rank_o_proj": 116.6061019897461, "geo/layer_27/stable_rank_gate_proj": 92.05004119873047, "geo/layer_27/stable_rank_down_proj": 140.01638793945312, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07421386986970901, "geo/layer_27/attn_entropy_mean": 4.4405293464660645, "geo/layer_27/attn_entropy_std": 0.532863438129425, "attnres/final_alpha/block_0": 0.24482829868793488, "attnres/block_norm/0": 1.582237958908081, "attnres/final_alpha/block_1": 0.007257716730237007, "attnres/block_norm/1": 26731.0625, "attnres/final_alpha/block_2": 0.015025326982140541, "attnres/block_norm/2": 19482.0, "attnres/final_alpha/block_3": 0.017097093164920807, "attnres/block_norm/3": 26194.87890625, "attnres/final_alpha/block_4": 0.022330285981297493, "attnres/block_norm/4": 8556.5625, "attnres/final_alpha/block_5": 0.5511798858642578, "attnres/block_norm/5": 4830.5146484375, "attnres/final_alpha/block_6": 0.1422814428806305, "attnres/block_norm/6": 17871.33984375, "geo/tier1_time_s": 1.357625961303711, "geo/step": 17925.0, "geo/rankme_slope": 3.2318357030312126e-05} {"step": 17930, "timestamp": 1778213911.891987, "train/loss": 2.209894633293152, "train/z_loss": 0.001612277131062001, "train/perplexity": 9.114755950623627, "train/grad_norm": 0.216796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1701386.2403038354, "perf/iters_per_sec": 0.81128417983238, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.232613706588745, "data/tokens_consumed": 37604032512, "data/tokens_consumed_B": 37.604032512, "train/loss_slope": -1.866027737321925e-05} {"step": 17940, "timestamp": 1778213922.2407825, "train/loss": 2.326102924346924, "train/z_loss": 0.001577215502038598, "train/perplexity": 10.237965562615388, "train/grad_norm": 0.10791015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027521.497403297, "perf/iters_per_sec": 0.9667975890175329, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343426704406737, "data/tokens_consumed": 37625004032, "data/tokens_consumed_B": 37.625004032, "train/loss_slope": -1.359782982902633e-05} {"step": 17950, "timestamp": 1778213932.5976279, "grad/layer_0/attn": 0.002803649753332138, "grad/layer_0/mlp": 0.0027773475740104914, "grad/layer_0/attn_mlp_ratio": 1.0094702148988306, "grad/layer_4/attn": 0.0019203482661396265, "grad/layer_4/mlp": 0.002498654183000326, "grad/layer_4/attn_mlp_ratio": 0.7685530083952697, "grad/layer_8/attn": 0.013430685736238956, "grad/layer_8/mlp": 0.0037817656993865967, "grad/layer_8/attn_mlp_ratio": 3.5514324388933383, "grad/layer_12/attn": 0.004408656619489193, "grad/layer_12/mlp": 0.006907472852617502, "grad/layer_12/attn_mlp_ratio": 0.6382444997947603, "grad/layer_16/attn": 0.004676400683820248, "grad/layer_16/mlp": 0.004849414341151714, "grad/layer_16/attn_mlp_ratio": 0.9643227528950121, "grad/layer_20/attn": 0.004942824598401785, "grad/layer_20/mlp": 0.0072173383086919785, "grad/layer_20/attn_mlp_ratio": 0.6848542105839248, "grad/layer_24/attn": 0.01837107166647911, "grad/layer_24/mlp": 0.0147011773660779, "grad/layer_24/attn_mlp_ratio": 1.2496326711836026, "grad/layer_27/attn": 0.004573909565806389, "grad/layer_27/mlp": 0.014777777716517448, "grad/layer_27/attn_mlp_ratio": 0.309512676574013} {"step": 17950, "timestamp": 1778213932.6149676, "train/loss": 2.2677911281585694, "train/z_loss": 0.0016045556752942503, "train/perplexity": 9.658043854249888, "train/grad_norm": 0.232421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022951.6622946518, "perf/iters_per_sec": 0.9646185218308696, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036679244041443, "data/tokens_consumed": 37645975552, "data/tokens_consumed_B": 37.645975552, "train/loss_slope": -9.34197501380847e-06} {"step": 17960, "timestamp": 1778213942.9798398, "train/loss": 2.2367733240127565, "train/z_loss": 0.0015986472601071, "train/perplexity": 9.36307089742879, "train/grad_norm": 0.271484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025031.0208421974, "perf/iters_per_sec": 0.9656100372515666, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0356147527694701, "data/tokens_consumed": 37666947072, "data/tokens_consumed_B": 37.666947072, "train/loss_slope": -9.846522805928972e-06} {"step": 17970, "timestamp": 1778213953.3314662, "train/loss": 2.229747986793518, "train/z_loss": 0.001621602161321789, "train/perplexity": 9.297522685708534, "train/grad_norm": 0.1484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026936.0784510907, "perf/iters_per_sec": 0.9665184395080045, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034641408920288, "data/tokens_consumed": 37687918592, "data/tokens_consumed_B": 37.687918592, "train/loss_slope": -1.0138133406960983e-05} {"step": 17980, "timestamp": 1778213963.6870782, "train/loss": 2.2170692920684814, "train/z_loss": 0.0016245008097030222, "train/perplexity": 9.180386370793856, "train/grad_norm": 0.2294921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026118.0863788857, "perf/iters_per_sec": 0.9661283904928616, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035059118270874, "data/tokens_consumed": 37708890112, "data/tokens_consumed_B": 37.708890112, "train/loss_slope": -1.2431299210739235e-05} {"step": 17990, "timestamp": 1778213974.0373063, "train/loss": 2.2007778167724608, "train/z_loss": 0.001619648770429194, "train/perplexity": 9.032036037075395, "train/grad_norm": 0.1650390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027510.4680318735, "perf/iters_per_sec": 0.9667923298034065, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343482971191407, "data/tokens_consumed": 37729861632, "data/tokens_consumed_B": 37.729861632, "train/loss_slope": -1.4154715270492499e-05} {"step": 18000, "timestamp": 1778213984.3717182, "grad/layer_0/attn": 0.002738713985309005, "grad/layer_0/mlp": 0.0028089401312172413, "grad/layer_0/attn_mlp_ratio": 0.9749990244976469, "grad/layer_4/attn": 0.0017350720008835196, "grad/layer_4/mlp": 0.002650858135893941, "grad/layer_4/attn_mlp_ratio": 0.6545321727845648, "grad/layer_8/attn": 0.007508511189371347, "grad/layer_8/mlp": 0.003864917904138565, "grad/layer_8/attn_mlp_ratio": 1.9427349251216228, "grad/layer_12/attn": 0.004389866720885038, "grad/layer_12/mlp": 0.006002791225910187, "grad/layer_12/attn_mlp_ratio": 0.7313042354040876, "grad/layer_16/attn": 0.003930149134248495, "grad/layer_16/mlp": 0.004661568906158209, "grad/layer_16/attn_mlp_ratio": 0.8430957750612584, "grad/layer_20/attn": 0.0067204018123447895, "grad/layer_20/mlp": 0.00675370404496789, "grad/layer_20/attn_mlp_ratio": 0.9950690270245384, "grad/layer_24/attn": 0.010201082564890385, "grad/layer_24/mlp": 0.011257139965891838, "grad/layer_24/attn_mlp_ratio": 0.9061877621829353, "grad/layer_27/attn": 0.0164474043995142, "grad/layer_27/mlp": 0.009077723138034344, "grad/layer_27/attn_mlp_ratio": 1.8118424596381129} {"step": 18000, "timestamp": 1778213984.9828281, "eos/sharpness": 38.40284347534179, "eos/L0_probe": 2.0696136951446533, "eos/L_plus": 2.23215389251709, "eos/L_minus": 2.2911019325256348, "eos/grad_norm": 0.15611128509044647, "eos/embed_grad_frac": 0.12062901258468628, "eos/time_s": 0.6083347797393799} {"step": 18000, "timestamp": 1778213985.0029528, "train/loss": 2.266342520713806, "train/z_loss": 0.0016040085931308568, "train/perplexity": 9.64406326865577, "train/grad_norm": 0.15625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1913280.171318115, "perf/iters_per_sec": 0.9123230797377181, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0961029291152955, "data/tokens_consumed": 37750833152, "data/tokens_consumed_B": 37.750833152, "train/loss_slope": -1.2741719914598576e-05} {"step": 18000, "timestamp": 1778213986.363512, "geo/rankme_last": 440.8985290527344, "geo/layer_0/stable_rank_q_proj": 16.56497573852539, "geo/layer_0/stable_rank_k_proj": 14.703543663024902, "geo/layer_0/stable_rank_o_proj": 52.96359634399414, "geo/layer_0/stable_rank_gate_proj": 156.24888610839844, "geo/layer_0/stable_rank_down_proj": 49.51718521118164, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.04589810222387314, "geo/layer_0/attn_entropy_mean": 6.275512218475342, "geo/layer_0/attn_entropy_std": 0.301096111536026, "geo/layer_7/stable_rank_q_proj": 43.04526138305664, "geo/layer_7/stable_rank_k_proj": 42.94902420043945, "geo/layer_7/stable_rank_o_proj": 111.1072998046875, "geo/layer_7/stable_rank_gate_proj": 106.76712036132812, "geo/layer_7/stable_rank_down_proj": 153.8955078125, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5648170709609985, "geo/layer_7/attn_entropy_mean": 4.6541266441345215, "geo/layer_7/attn_entropy_std": 0.8626282811164856, "geo/layer_14/stable_rank_q_proj": 58.17680358886719, "geo/layer_14/stable_rank_k_proj": 35.50054931640625, "geo/layer_14/stable_rank_o_proj": 52.78368377685547, "geo/layer_14/stable_rank_gate_proj": 91.87831115722656, "geo/layer_14/stable_rank_down_proj": 137.0589599609375, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38238802552223206, "geo/layer_14/attn_entropy_mean": 5.535650253295898, "geo/layer_14/attn_entropy_std": 0.5017645955085754, "geo/layer_21/stable_rank_q_proj": 49.55475997924805, "geo/layer_21/stable_rank_k_proj": 31.65357208251953, "geo/layer_21/stable_rank_o_proj": 86.05937194824219, "geo/layer_21/stable_rank_gate_proj": 90.97896575927734, "geo/layer_21/stable_rank_down_proj": 62.73318099975586, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15257146954536438, "geo/layer_21/attn_entropy_mean": 5.771668910980225, "geo/layer_21/attn_entropy_std": 0.29838278889656067, "geo/layer_27/stable_rank_q_proj": 41.81139373779297, "geo/layer_27/stable_rank_k_proj": 31.89035987854004, "geo/layer_27/stable_rank_o_proj": 116.55712127685547, "geo/layer_27/stable_rank_gate_proj": 92.08565521240234, "geo/layer_27/stable_rank_down_proj": 140.21389770507812, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07648701965808868, "geo/layer_27/attn_entropy_mean": 4.4373860359191895, "geo/layer_27/attn_entropy_std": 0.559806764125824, "attnres/final_alpha/block_0": 0.2439827024936676, "attnres/block_norm/0": 1.5832774639129639, "attnres/final_alpha/block_1": 0.007188897579908371, "attnres/block_norm/1": 26801.216796875, "attnres/final_alpha/block_2": 0.014864644035696983, "attnres/block_norm/2": 19486.57421875, "attnres/final_alpha/block_3": 0.016919206827878952, "attnres/block_norm/3": 26228.794921875, "attnres/final_alpha/block_4": 0.022374821826815605, "attnres/block_norm/4": 8558.9765625, "attnres/final_alpha/block_5": 0.553607702255249, "attnres/block_norm/5": 4787.54296875, "attnres/final_alpha/block_6": 0.14106205105781555, "attnres/block_norm/6": 17853.037109375, "geo/tier1_time_s": 1.3564729690551758, "geo/step": 18000.0, "geo/rankme_slope": 9.10266450330132e-06} {"step": 18000, "timestamp": 1778213993.1449146, "geo/ww_alpha_mean": 8.06714833502562, "geo/ww_alpha_std": 4.779687762080727, "geo/ww_alpha_min": 1.383585355741287, "geo/ww_alpha_max": 28.2929065332202, "geo/ww_alpha_healthy_frac": 0.16751269035532995, "geo/ww_alpha_by_type/q_proj": 4.231984180457613, "geo/ww_alpha_by_type/k_proj": 4.7077975892322375, "geo/ww_alpha_by_type/v_proj": 8.674423386058226, "geo/ww_alpha_by_type/o_proj": 8.372927256448905, "geo/ww_alpha_by_type/gate_proj": 9.187856319337754, "geo/ww_alpha_by_type/up_proj": 12.41337141966977, "geo/ww_alpha_by_type/down_proj": 9.012161650258486, "geo/twonn_id/layer_0": 0.7424075603485107, "geo/twonn_id/layer_7": 3.3666112422943115, "geo/twonn_id/layer_14": 4.360556602478027, "geo/twonn_id/layer_21": 6.254846572875977, "geo/twonn_id/layer_27": 5.313782215118408, "geo/tier2_time_s": 6.775165796279907} {"step": 18000, "timestamp": 1778213993.7606153, "eoc/jacobian_sigma/layer_0/attn": 807.1647338867188, "eoc/jacobian_sigma/layer_0/mlp": 5548.39990234375, "eoc/jacobian_sigma/layer_0": 5548.39990234375, "eoc/jacobian_sigma/layer_7/attn": 1.1448538303375244, "eoc/jacobian_sigma/layer_7/mlp": 1.6023215055465698, "eoc/jacobian_sigma/layer_7": 1.6023215055465698, "eoc/jacobian_sigma/layer_14/attn": 1.4857935905456543, "eoc/jacobian_sigma/layer_14/mlp": 5.461310386657715, "eoc/jacobian_sigma/layer_14": 5.461310386657715, "eoc/jacobian_sigma/layer_21/attn": 1.0764000415802002, "eoc/jacobian_sigma/layer_21/mlp": 3.7190022468566895, "eoc/jacobian_sigma/layer_21": 3.7190022468566895, "eoc/jacobian_sigma/layer_27/attn": 2.7238638401031494, "eoc/jacobian_sigma/layer_27/mlp": 22.13286018371582, "eoc/jacobian_sigma/layer_27": 22.13286018371582, "eoc/layer0_sigma": 5548.39990234375, "eoc/sigma_max": 22.13286018371582, "eoc/sigma_min": 1.6023215055465698, "eoc/sigma_mean": 8.228873580694199, "eoc/time_s": 0.6072986125946045} {"step": 18010, "timestamp": 1778214004.1284206, "train/loss": 2.24440450668335, "train/z_loss": 0.001598465978167951, "train/perplexity": 9.434795525601567, "train/grad_norm": 0.1796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1096742.3396590815, "perf/iters_per_sec": 0.5229675005240829, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.9121647119522094, "data/tokens_consumed": 37771804672, "data/tokens_consumed_B": 37.771804672, "train/loss_slope": -1.1681960618356444e-05} {"step": 18020, "timestamp": 1778214014.4784827, "train/loss": 2.1897470235824583, "train/z_loss": 0.0016163736581802368, "train/perplexity": 8.932953002384632, "train/grad_norm": 0.109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027208.6556672016, "perf/iters_per_sec": 0.9666484144531258, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345022916793822, "data/tokens_consumed": 37792776192, "data/tokens_consumed_B": 37.792776192, "train/loss_slope": -1.3199637124318818e-05} {"step": 18030, "timestamp": 1778214025.2210896, "train/loss": 2.2367749214172363, "train/z_loss": 0.0016029072110541164, "train/perplexity": 9.363085854052132, "train/grad_norm": 0.1474609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1953140.277556151, "perf/iters_per_sec": 0.9313298595219378, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.073733425140381, "data/tokens_consumed": 37813747712, "data/tokens_consumed_B": 37.813747712, "train/loss_slope": -1.0768897491212434e-05} {"step": 18040, "timestamp": 1778214036.0690656, "train/loss": 2.245426392555237, "train/z_loss": 0.0015899123973213136, "train/perplexity": 9.44444173767782, "train/grad_norm": 0.10986328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1934425.01005809, "perf/iters_per_sec": 0.9224057245531512, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0841216325759888, "data/tokens_consumed": 37834719232, "data/tokens_consumed_B": 37.834719232, "train/loss_slope": -1.1534710485513878e-05} {"step": 18050, "timestamp": 1778214046.416322, "grad/layer_0/attn": 0.002595016499981284, "grad/layer_0/mlp": 0.002526266733184457, "grad/layer_0/attn_mlp_ratio": 1.0272139371398727, "grad/layer_4/attn": 0.0015457094414159656, "grad/layer_4/mlp": 0.0024654448498040438, "grad/layer_4/attn_mlp_ratio": 0.6269494849352935, "grad/layer_8/attn": 0.011658638715744019, "grad/layer_8/mlp": 0.0038073223549872637, "grad/layer_8/attn_mlp_ratio": 3.062162150324885, "grad/layer_12/attn": 0.004453225061297417, "grad/layer_12/mlp": 0.006093435920774937, "grad/layer_12/attn_mlp_ratio": 0.7308233066064219, "grad/layer_16/attn": 0.005277122370898724, "grad/layer_16/mlp": 0.004655915778130293, "grad/layer_16/attn_mlp_ratio": 1.1334230490903743, "grad/layer_20/attn": 0.005075699649751186, "grad/layer_20/mlp": 0.006104583851993084, "grad/layer_20/attn_mlp_ratio": 0.8314570967762713, "grad/layer_24/attn": 0.01429414190351963, "grad/layer_24/mlp": 0.01208858285099268, "grad/layer_24/attn_mlp_ratio": 1.1824497512626853, "grad/layer_27/attn": 0.0059133246541023254, "grad/layer_27/mlp": 0.011696688830852509, "grad/layer_27/attn_mlp_ratio": 0.5055554344533069} {"step": 18050, "timestamp": 1778214046.4320047, "train/loss": 2.2128422021865846, "train/z_loss": 0.001613017835188657, "train/perplexity": 9.141661955906715, "train/grad_norm": 0.169921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025101.139956423, "perf/iters_per_sec": 0.9656434726507297, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0355788946151734, "data/tokens_consumed": 37855690752, "data/tokens_consumed_B": 37.855690752, "train/loss_slope": -9.428637470528042e-06} {"step": 18060, "timestamp": 1778214056.7931726, "train/loss": 2.284486937522888, "train/z_loss": 0.0015936214476823807, "train/perplexity": 9.820646326105756, "train/grad_norm": 0.11865234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025624.0627518618, "perf/iters_per_sec": 0.9658928216704663, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0353115558624268, "data/tokens_consumed": 37876662272, "data/tokens_consumed_B": 37.876662272, "train/loss_slope": -7.444783785019124e-06} {"step": 18070, "timestamp": 1778214067.1502767, "train/loss": 2.222374749183655, "train/z_loss": 0.0016061727539636195, "train/perplexity": 9.229221949968903, "train/grad_norm": 0.2216796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025768.9600867857, "perf/iters_per_sec": 0.9659619141038827, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352375030517578, "data/tokens_consumed": 37897633792, "data/tokens_consumed_B": 37.897633792, "train/loss_slope": -9.483868962038107e-06} {"step": 18075, "timestamp": 1778214072.93685, "eos/sharpness": 6.727218627929687, "eos/L0_probe": 2.0714683532714844, "eos/L_plus": 2.111710548400879, "eos/L_minus": 2.0984983444213867, "eos/grad_norm": 0.0896332636475563, "eos/embed_grad_frac": 0.2955135703086853, "eos/time_s": 0.6216979026794434} {"step": 18075, "timestamp": 1778214074.3149743, "geo/rankme_last": 440.95233154296875, "geo/layer_0/stable_rank_q_proj": 16.593433380126953, "geo/layer_0/stable_rank_k_proj": 14.738092422485352, "geo/layer_0/stable_rank_o_proj": 53.08821105957031, "geo/layer_0/stable_rank_gate_proj": 155.6927032470703, "geo/layer_0/stable_rank_down_proj": 49.562156677246094, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.04380541667342186, "geo/layer_0/attn_entropy_mean": 6.277563095092773, "geo/layer_0/attn_entropy_std": 0.3038478493690491, "geo/layer_7/stable_rank_q_proj": 43.0694465637207, "geo/layer_7/stable_rank_k_proj": 43.015804290771484, "geo/layer_7/stable_rank_o_proj": 111.23241424560547, "geo/layer_7/stable_rank_gate_proj": 106.52418518066406, "geo/layer_7/stable_rank_down_proj": 153.7698974609375, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5657859444618225, "geo/layer_7/attn_entropy_mean": 4.65401029586792, "geo/layer_7/attn_entropy_std": 0.8529022932052612, "geo/layer_14/stable_rank_q_proj": 58.26914596557617, "geo/layer_14/stable_rank_k_proj": 35.408119201660156, "geo/layer_14/stable_rank_o_proj": 52.83856964111328, "geo/layer_14/stable_rank_gate_proj": 91.87225341796875, "geo/layer_14/stable_rank_down_proj": 137.19781494140625, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38987597823143005, "geo/layer_14/attn_entropy_mean": 5.504917144775391, "geo/layer_14/attn_entropy_std": 0.4894481301307678, "geo/layer_21/stable_rank_q_proj": 49.4794807434082, "geo/layer_21/stable_rank_k_proj": 31.637046813964844, "geo/layer_21/stable_rank_o_proj": 86.07747650146484, "geo/layer_21/stable_rank_gate_proj": 90.76724243164062, "geo/layer_21/stable_rank_down_proj": 62.55268478393555, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15311503410339355, "geo/layer_21/attn_entropy_mean": 5.759438514709473, "geo/layer_21/attn_entropy_std": 0.29784145951271057, "geo/layer_27/stable_rank_q_proj": 41.80709457397461, "geo/layer_27/stable_rank_k_proj": 31.893537521362305, "geo/layer_27/stable_rank_o_proj": 116.8565444946289, "geo/layer_27/stable_rank_gate_proj": 92.15046691894531, "geo/layer_27/stable_rank_down_proj": 140.53919982910156, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07846636325120926, "geo/layer_27/attn_entropy_mean": 4.438030242919922, "geo/layer_27/attn_entropy_std": 0.5534651279449463, "attnres/final_alpha/block_0": 0.24324560165405273, "attnres/block_norm/0": 1.5843180418014526, "attnres/final_alpha/block_1": 0.0072130970656871796, "attnres/block_norm/1": 26895.52734375, "attnres/final_alpha/block_2": 0.01498144306242466, "attnres/block_norm/2": 19611.41015625, "attnres/final_alpha/block_3": 0.017084941267967224, "attnres/block_norm/3": 26293.78125, "attnres/final_alpha/block_4": 0.022583000361919403, "attnres/block_norm/4": 8581.630859375, "attnres/final_alpha/block_5": 0.5539956092834473, "attnres/block_norm/5": 4796.11572265625, "attnres/final_alpha/block_6": 0.14089637994766235, "attnres/block_norm/6": 17910.166015625, "geo/tier1_time_s": 1.358635425567627, "geo/step": 18075.0, "geo/rankme_slope": 7.136780493447377e-06} {"step": 18080, "timestamp": 1778214079.4934318, "train/loss": 2.2356730699539185, "train/z_loss": 0.0016024531098082662, "train/perplexity": 9.352774805867638, "train/grad_norm": 0.2060546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1699811.0907226119, "perf/iters_per_sec": 0.8105330899823245, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.233755922317505, "data/tokens_consumed": 37918605312, "data/tokens_consumed_B": 37.918605312, "train/loss_slope": -1.1659051303995147e-05} {"step": 18090, "timestamp": 1778214090.333397, "train/loss": 2.233488130569458, "train/z_loss": 0.001608552027028054, "train/perplexity": 9.332361868476614, "train/grad_norm": 0.1533203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1935526.4011080354, "perf/iters_per_sec": 0.9229309087314774, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0835047245025635, "data/tokens_consumed": 37939576832, "data/tokens_consumed_B": 37.939576832, "train/loss_slope": -1.1186235544025601e-05} {"step": 18100, "timestamp": 1778214101.0629632, "grad/layer_0/attn": 0.0030485601164400578, "grad/layer_0/mlp": 0.002889575669541955, "grad/layer_0/attn_mlp_ratio": 1.0550199612600242, "grad/layer_4/attn": 0.0015382919227704406, "grad/layer_4/mlp": 0.0025680738035589457, "grad/layer_4/attn_mlp_ratio": 0.5990060958287106, "grad/layer_8/attn": 0.004881301429122686, "grad/layer_8/mlp": 0.0037728988099843264, "grad/layer_8/attn_mlp_ratio": 1.2937800735146028, "grad/layer_12/attn": 0.00422645453363657, "grad/layer_12/mlp": 0.0057756914757192135, "grad/layer_12/attn_mlp_ratio": 0.7317659674565075, "grad/layer_16/attn": 0.008143574930727482, "grad/layer_16/mlp": 0.004361954052001238, "grad/layer_16/attn_mlp_ratio": 1.8669556457834982, "grad/layer_20/attn": 0.006301622837781906, "grad/layer_20/mlp": 0.006311668083071709, "grad/layer_20/attn_mlp_ratio": 0.9984084484484238, "grad/layer_24/attn": 0.008924581110477448, "grad/layer_24/mlp": 0.012950082309544086, "grad/layer_24/attn_mlp_ratio": 0.6891524569681593, "grad/layer_27/attn": 0.007285917643457651, "grad/layer_27/mlp": 0.011464029550552368, "grad/layer_27/attn_mlp_ratio": 0.6355459524746253} {"step": 18100, "timestamp": 1778214101.0791574, "train/loss": 2.27248477935791, "train/z_loss": 0.0015825599897652865, "train/perplexity": 9.703481895109153, "train/grad_norm": 0.150390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1952723.2891841829, "perf/iters_per_sec": 0.9311310239716448, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0739627122879027, "data/tokens_consumed": 37960548352, "data/tokens_consumed_B": 37.960548352, "train/loss_slope": -8.587923330335058e-06} {"step": 18110, "timestamp": 1778214111.430118, "train/loss": 2.243421792984009, "train/z_loss": 0.0016066312440671028, "train/perplexity": 9.425528377010808, "train/grad_norm": 0.1298828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027227.6710719145, "perf/iters_per_sec": 0.9666574817046711, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034492588043213, "data/tokens_consumed": 37981519872, "data/tokens_consumed_B": 37.981519872, "train/loss_slope": -6.275857664463666e-06} {"step": 18120, "timestamp": 1778214121.7836306, "train/loss": 2.2681459188461304, "train/z_loss": 0.0015888240304775536, "train/perplexity": 9.661471046201376, "train/grad_norm": 0.13671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026527.3733833511, "perf/iters_per_sec": 0.9663235537449604, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348500728607177, "data/tokens_consumed": 38002491392, "data/tokens_consumed_B": 38.002491392, "train/loss_slope": -4.560707812190545e-06} {"step": 18130, "timestamp": 1778214132.1512117, "train/loss": 2.2183435678482057, "train/z_loss": 0.0016180072212591766, "train/perplexity": 9.192092171419812, "train/grad_norm": 0.14453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023998.5385320398, "perf/iters_per_sec": 0.9651177113208961, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0361430406570435, "data/tokens_consumed": 38023462912, "data/tokens_consumed_B": 38.023462912, "train/loss_slope": -4.249927885282745e-06} {"step": 18140, "timestamp": 1778214142.5093503, "train/loss": 2.2472666263580323, "train/z_loss": 0.0016062970156781376, "train/perplexity": 9.461837720040181, "train/grad_norm": 0.1357421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025548.2634867767, "perf/iters_per_sec": 0.9658566777643093, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0353502988815309, "data/tokens_consumed": 38044434432, "data/tokens_consumed_B": 38.044434432, "train/loss_slope": -1.5625480699925666e-06} {"step": 18150, "timestamp": 1778214152.8505821, "grad/layer_0/attn": 0.0034264170099049807, "grad/layer_0/mlp": 0.002902461215853691, "grad/layer_0/attn_mlp_ratio": 1.1805211636032367, "grad/layer_4/attn": 0.0015913876704871655, "grad/layer_4/mlp": 0.002575697610154748, "grad/layer_4/attn_mlp_ratio": 0.6178472202747561, "grad/layer_8/attn": 0.006302499677985907, "grad/layer_8/mlp": 0.003926723264157772, "grad/layer_8/attn_mlp_ratio": 1.605027676640956, "grad/layer_12/attn": 0.006542054004967213, "grad/layer_12/mlp": 0.006614789832383394, "grad/layer_12/attn_mlp_ratio": 0.9890040457581132, "grad/layer_16/attn": 0.0039228773675858974, "grad/layer_16/mlp": 0.004760649986565113, "grad/layer_16/attn_mlp_ratio": 0.8240213618422685, "grad/layer_20/attn": 0.006076630670577288, "grad/layer_20/mlp": 0.00668462086468935, "grad/layer_20/attn_mlp_ratio": 0.9090464070702449, "grad/layer_24/attn": 0.020158177241683006, "grad/layer_24/mlp": 0.012263916432857513, "grad/layer_24/attn_mlp_ratio": 1.6436981764899632, "grad/layer_27/attn": 0.007589215412735939, "grad/layer_27/mlp": 0.011643076315522194, "grad/layer_27/attn_mlp_ratio": 0.6518221767073717} {"step": 18150, "timestamp": 1778214153.4717724, "eos/sharpness": 57.0974826812744, "eos/L0_probe": 2.071939706802368, "eos/L_plus": 2.438776969909668, "eos/L_minus": 2.2760772705078125, "eos/grad_norm": 0.21879814565181732, "eos/embed_grad_frac": 0.047137998044490814, "eos/time_s": 0.6183772087097168} {"step": 18150, "timestamp": 1778214153.49233, "train/loss": 2.2160999536514283, "train/z_loss": 0.0016089994576759636, "train/perplexity": 9.171491781231397, "train/grad_norm": 0.21875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1910431.471512306, "perf/iters_per_sec": 0.9109647138177424, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0977373600006104, "data/tokens_consumed": 38065405952, "data/tokens_consumed_B": 38.065405952, "train/loss_slope": -6.927903550471655e-07} {"step": 18150, "timestamp": 1778214154.8557742, "geo/rankme_last": 440.98663330078125, "geo/layer_0/stable_rank_q_proj": 16.626371383666992, "geo/layer_0/stable_rank_k_proj": 14.779333114624023, "geo/layer_0/stable_rank_o_proj": 53.119117736816406, "geo/layer_0/stable_rank_gate_proj": 155.9092254638672, "geo/layer_0/stable_rank_down_proj": 49.51537322998047, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.04389462620019913, "geo/layer_0/attn_entropy_mean": 6.276176452636719, "geo/layer_0/attn_entropy_std": 0.3054410517215729, "geo/layer_7/stable_rank_q_proj": 43.05017852783203, "geo/layer_7/stable_rank_k_proj": 42.86207580566406, "geo/layer_7/stable_rank_o_proj": 111.27359008789062, "geo/layer_7/stable_rank_gate_proj": 106.5182113647461, "geo/layer_7/stable_rank_down_proj": 153.52955627441406, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5509220957756042, "geo/layer_7/attn_entropy_mean": 4.6639838218688965, "geo/layer_7/attn_entropy_std": 0.8499964475631714, "geo/layer_14/stable_rank_q_proj": 58.25933074951172, "geo/layer_14/stable_rank_k_proj": 35.246864318847656, "geo/layer_14/stable_rank_o_proj": 52.66169738769531, "geo/layer_14/stable_rank_gate_proj": 91.75559997558594, "geo/layer_14/stable_rank_down_proj": 136.6323699951172, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.375636488199234, "geo/layer_14/attn_entropy_mean": 5.533641815185547, "geo/layer_14/attn_entropy_std": 0.515582799911499, "geo/layer_21/stable_rank_q_proj": 49.43800735473633, "geo/layer_21/stable_rank_k_proj": 31.707590103149414, "geo/layer_21/stable_rank_o_proj": 86.06765747070312, "geo/layer_21/stable_rank_gate_proj": 90.5235824584961, "geo/layer_21/stable_rank_down_proj": 62.358612060546875, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15542176365852356, "geo/layer_21/attn_entropy_mean": 5.770795822143555, "geo/layer_21/attn_entropy_std": 0.29947811365127563, "geo/layer_27/stable_rank_q_proj": 41.761348724365234, "geo/layer_27/stable_rank_k_proj": 31.964048385620117, "geo/layer_27/stable_rank_o_proj": 116.54362487792969, "geo/layer_27/stable_rank_gate_proj": 92.01717376708984, "geo/layer_27/stable_rank_down_proj": 140.78602600097656, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0718282088637352, "geo/layer_27/attn_entropy_mean": 4.433229446411133, "geo/layer_27/attn_entropy_std": 0.5568550825119019, "attnres/final_alpha/block_0": 0.24358496069908142, "attnres/block_norm/0": 1.5853530168533325, "attnres/final_alpha/block_1": 0.007298293523490429, "attnres/block_norm/1": 27068.265625, "attnres/final_alpha/block_2": 0.015001548454165459, "attnres/block_norm/2": 19619.115234375, "attnres/final_alpha/block_3": 0.0169142484664917, "attnres/block_norm/3": 26579.505859375, "attnres/final_alpha/block_4": 0.02223031036555767, "attnres/block_norm/4": 8583.7958984375, "attnres/final_alpha/block_5": 0.5560517311096191, "attnres/block_norm/5": 4785.72705078125, "attnres/final_alpha/block_6": 0.13891887664794922, "attnres/block_norm/6": 17990.619140625, "geo/tier1_time_s": 1.3591797351837158, "geo/step": 18150.0, "geo/rankme_slope": -6.187338216536615e-06} {"step": 18160, "timestamp": 1778214165.2135372, "train/loss": 2.2053718090057375, "train/z_loss": 0.001621985773090273, "train/perplexity": 9.073624596095925, "train/grad_norm": 0.2451171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1789802.7539002474, "perf/iters_per_sec": 0.8534444589139211, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.171722412109375, "data/tokens_consumed": 38086377472, "data/tokens_consumed_B": 38.086377472, "train/loss_slope": -4.4433196457234215e-06} {"step": 18170, "timestamp": 1778214175.564658, "train/loss": 2.2392295360565186, "train/z_loss": 0.001603199541568756, "train/perplexity": 9.386096851670462, "train/grad_norm": 0.2294921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027026.976090276, "perf/iters_per_sec": 0.966561782879961, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034595012664795, "data/tokens_consumed": 38107348992, "data/tokens_consumed_B": 38.107348992, "train/loss_slope": -3.105421907509048e-06} {"step": 18180, "timestamp": 1778214185.9166439, "train/loss": 2.266378116607666, "train/z_loss": 0.0015994818648323416, "train/perplexity": 9.644406563818174, "train/grad_norm": 0.09521484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026844.1614827458, "perf/iters_per_sec": 0.9664746100820283, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346883296966554, "data/tokens_consumed": 38128320512, "data/tokens_consumed_B": 38.128320512, "train/loss_slope": 7.911664102659159e-07} {"step": 18190, "timestamp": 1778214196.272872, "train/loss": 2.2497870683670045, "train/z_loss": 0.001589344150852412, "train/perplexity": 9.485715812344598, "train/grad_norm": 0.12255859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026062.6436944858, "perf/iters_per_sec": 0.9661019533607892, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350874423980714, "data/tokens_consumed": 38149292032, "data/tokens_consumed_B": 38.149292032, "train/loss_slope": 2.1940596569346476e-06} {"step": 18200, "timestamp": 1778214206.608111, "grad/layer_0/attn": 0.002924500033259392, "grad/layer_0/mlp": 0.002825991017743945, "grad/layer_0/attn_mlp_ratio": 1.034858182991774, "grad/layer_4/attn": 0.0017248457297682762, "grad/layer_4/mlp": 0.0024803425185382366, "grad/layer_4/attn_mlp_ratio": 0.6954062381852696, "grad/layer_8/attn": 0.007504342123866081, "grad/layer_8/mlp": 0.003846629522740841, "grad/layer_8/attn_mlp_ratio": 1.9508876236748285, "grad/layer_12/attn": 0.004400269128382206, "grad/layer_12/mlp": 0.0057816714979708195, "grad/layer_12/attn_mlp_ratio": 0.7610721317908404, "grad/layer_16/attn": 0.0043687717989087105, "grad/layer_16/mlp": 0.004699876066297293, "grad/layer_16/attn_mlp_ratio": 0.9295504060802869, "grad/layer_20/attn": 0.008167466148734093, "grad/layer_20/mlp": 0.006554236635565758, "grad/layer_20/attn_mlp_ratio": 1.2461353591966469, "grad/layer_24/attn": 0.018151884898543358, "grad/layer_24/mlp": 0.015683390200138092, "grad/layer_24/attn_mlp_ratio": 1.1573954706963794, "grad/layer_27/attn": 0.007236143574118614, "grad/layer_27/mlp": 0.014639732427895069, "grad/layer_27/attn_mlp_ratio": 0.4942811325501069} {"step": 18200, "timestamp": 1778214206.623795, "train/loss": 2.2662403345108033, "train/z_loss": 0.00159719797084108, "train/perplexity": 9.643077828798864, "train/grad_norm": 0.2255859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027003.3400513788, "perf/iters_per_sec": 0.9665505123383421, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346070766448974, "data/tokens_consumed": 38170263552, "data/tokens_consumed_B": 38.170263552, "train/loss_slope": 2.6297282857863735e-06} {"step": 18210, "timestamp": 1778214216.9699152, "train/loss": 2.208047127723694, "train/z_loss": 0.0016093345009721815, "train/perplexity": 9.097931934257986, "train/grad_norm": 0.0986328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027983.1094051886, "perf/iters_per_sec": 0.9670177027727073, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034107232093811, "data/tokens_consumed": 38191235072, "data/tokens_consumed_B": 38.191235072, "train/loss_slope": 1.9606982127274967e-06} {"step": 18220, "timestamp": 1778214227.3212147, "train/loss": 2.2093977689743043, "train/z_loss": 0.001610371272545308, "train/perplexity": 9.11022827852926, "train/grad_norm": 0.16015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027264.0209218073, "perf/iters_per_sec": 0.9666748146637951, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344740390777587, "data/tokens_consumed": 38212206592, "data/tokens_consumed_B": 38.212206592, "train/loss_slope": -2.168040156829359e-06} {"step": 18225, "timestamp": 1778214233.0979736, "eos/sharpness": 43.83769035339355, "eos/L0_probe": 2.0705673694610596, "eos/L_plus": 2.283123731613159, "eos/L_minus": 2.2963879108428955, "eos/grad_norm": 0.21452364325523376, "eos/embed_grad_frac": 0.05517439544200897, "eos/time_s": 0.6105842590332031} {"step": 18225, "timestamp": 1778214234.4758527, "geo/rankme_last": 440.6316223144531, "geo/layer_0/stable_rank_q_proj": 16.656635284423828, "geo/layer_0/stable_rank_k_proj": 14.747089385986328, "geo/layer_0/stable_rank_o_proj": 53.111873626708984, "geo/layer_0/stable_rank_gate_proj": 156.1256866455078, "geo/layer_0/stable_rank_down_proj": 49.591636657714844, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.04131518304347992, "geo/layer_0/attn_entropy_mean": 6.270754337310791, "geo/layer_0/attn_entropy_std": 0.31053414940834045, "geo/layer_7/stable_rank_q_proj": 43.09066390991211, "geo/layer_7/stable_rank_k_proj": 42.5885124206543, "geo/layer_7/stable_rank_o_proj": 110.9105224609375, "geo/layer_7/stable_rank_gate_proj": 106.33038330078125, "geo/layer_7/stable_rank_down_proj": 153.10276794433594, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5609064102172852, "geo/layer_7/attn_entropy_mean": 4.644590854644775, "geo/layer_7/attn_entropy_std": 0.8525364398956299, "geo/layer_14/stable_rank_q_proj": 58.34856033325195, "geo/layer_14/stable_rank_k_proj": 35.21426773071289, "geo/layer_14/stable_rank_o_proj": 52.677738189697266, "geo/layer_14/stable_rank_gate_proj": 91.54954528808594, "geo/layer_14/stable_rank_down_proj": 136.56773376464844, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3884104788303375, "geo/layer_14/attn_entropy_mean": 5.5405449867248535, "geo/layer_14/attn_entropy_std": 0.4948311448097229, "geo/layer_21/stable_rank_q_proj": 49.43901062011719, "geo/layer_21/stable_rank_k_proj": 31.739118576049805, "geo/layer_21/stable_rank_o_proj": 86.01689147949219, "geo/layer_21/stable_rank_gate_proj": 90.1318359375, "geo/layer_21/stable_rank_down_proj": 62.3349609375, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15335619449615479, "geo/layer_21/attn_entropy_mean": 5.76045036315918, "geo/layer_21/attn_entropy_std": 0.2932113707065582, "geo/layer_27/stable_rank_q_proj": 41.690040588378906, "geo/layer_27/stable_rank_k_proj": 31.868637084960938, "geo/layer_27/stable_rank_o_proj": 116.80278778076172, "geo/layer_27/stable_rank_gate_proj": 91.98008728027344, "geo/layer_27/stable_rank_down_proj": 140.77401733398438, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07255088537931442, "geo/layer_27/attn_entropy_mean": 4.440072536468506, "geo/layer_27/attn_entropy_std": 0.5289679765701294, "attnres/final_alpha/block_0": 0.24298129975795746, "attnres/block_norm/0": 1.5863162279129028, "attnres/final_alpha/block_1": 0.007225224748253822, "attnres/block_norm/1": 27027.71875, "attnres/final_alpha/block_2": 0.014816459268331528, "attnres/block_norm/2": 19711.9453125, "attnres/final_alpha/block_3": 0.016664575785398483, "attnres/block_norm/3": 26732.826171875, "attnres/final_alpha/block_4": 0.022248797118663788, "attnres/block_norm/4": 8609.103515625, "attnres/final_alpha/block_5": 0.5547943711280823, "attnres/block_norm/5": 4836.39453125, "attnres/final_alpha/block_6": 0.1412692666053772, "attnres/block_norm/6": 18126.224609375, "geo/tier1_time_s": 1.3571431636810303, "geo/step": 18225.0, "geo/rankme_slope": -8.237787302420968e-06} {"step": 18230, "timestamp": 1778214239.6566126, "train/loss": 2.2416996240615843, "train/z_loss": 0.0016007032245397568, "train/perplexity": 9.40930999436823, "train/grad_norm": 0.181640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1700866.5998112243, "perf/iters_per_sec": 0.811036395936596, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.232990288734436, "data/tokens_consumed": 38233178112, "data/tokens_consumed_B": 38.233178112, "train/loss_slope": -1.1967512783687696e-06} {"step": 18240, "timestamp": 1778214250.004768, "train/loss": 2.273527455329895, "train/z_loss": 0.001590925408527255, "train/perplexity": 9.71360475904216, "train/grad_norm": 0.1591796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027853.32260645, "perf/iters_per_sec": 0.9669558156044245, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341734170913697, "data/tokens_consumed": 38254149632, "data/tokens_consumed_B": 38.254149632, "train/loss_slope": -1.1663077509227538e-06} {"step": 18250, "timestamp": 1778214260.3417716, "grad/layer_0/attn": 0.00269975233823061, "grad/layer_0/mlp": 0.002866963157430291, "grad/layer_0/attn_mlp_ratio": 0.9416766438264175, "grad/layer_4/attn": 0.0016229655593633652, "grad/layer_4/mlp": 0.0024985659401863813, "grad/layer_4/attn_mlp_ratio": 0.6495588002317919, "grad/layer_8/attn": 0.00428893743082881, "grad/layer_8/mlp": 0.003685052040964365, "grad/layer_8/attn_mlp_ratio": 1.1638742863775084, "grad/layer_12/attn": 0.004629252012819052, "grad/layer_12/mlp": 0.005940814036875963, "grad/layer_12/attn_mlp_ratio": 0.779228554565316, "grad/layer_16/attn": 0.004379614256322384, "grad/layer_16/mlp": 0.004926938563585281, "grad/layer_16/attn_mlp_ratio": 0.8889118690865506, "grad/layer_20/attn": 0.006654698401689529, "grad/layer_20/mlp": 0.006683951709419489, "grad/layer_20/attn_mlp_ratio": 0.9956233365284392, "grad/layer_24/attn": 0.01581382565200329, "grad/layer_24/mlp": 0.0126094501465559, "grad/layer_24/attn_mlp_ratio": 1.254124909713857, "grad/layer_27/attn": 0.005585349630564451, "grad/layer_27/mlp": 0.010923553258180618, "grad/layer_27/attn_mlp_ratio": 0.5113125232625516} {"step": 18250, "timestamp": 1778214260.3575246, "train/loss": 2.238494896888733, "train/z_loss": 0.0015999712166376411, "train/perplexity": 9.379203989483868, "train/grad_norm": 0.173828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026661.4732382828, "perf/iters_per_sec": 0.9663874975387015, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347815990447997, "data/tokens_consumed": 38275121152, "data/tokens_consumed_B": 38.275121152, "train/loss_slope": 1.0951634037076668e-06} {"step": 18260, "timestamp": 1778214270.7055416, "train/loss": 2.3142978668212892, "train/z_loss": 0.0015814760699868202, "train/perplexity": 10.117816369594095, "train/grad_norm": 0.265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027612.9616872107, "perf/iters_per_sec": 0.9668412025867513, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342960119247437, "data/tokens_consumed": 38296092672, "data/tokens_consumed_B": 38.296092672, "train/loss_slope": 6.836929315566508e-06} {"step": 18270, "timestamp": 1778214281.0601926, "train/loss": 2.215805268287659, "train/z_loss": 0.001606021891348064, "train/perplexity": 9.168789475024136, "train/grad_norm": 0.1201171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026709.1501996561, "perf/iters_per_sec": 0.9664102316854745, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347572565078735, "data/tokens_consumed": 38317064192, "data/tokens_consumed_B": 38.317064192, "train/loss_slope": 3.0257760530616074e-06} {"step": 18280, "timestamp": 1778214291.4175177, "train/loss": 2.255296468734741, "train/z_loss": 0.001602849829941988, "train/perplexity": 9.538120645577376, "train/grad_norm": 0.109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025958.0668906881, "perf/iters_per_sec": 0.9660520872548524, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035140872001648, "data/tokens_consumed": 38338035712, "data/tokens_consumed_B": 38.338035712, "train/loss_slope": 1.725393868122221e-06} {"step": 18290, "timestamp": 1778214301.7860947, "train/loss": 2.1922733545303346, "train/z_loss": 0.0016121345805004239, "train/perplexity": 8.955549128638983, "train/grad_norm": 0.162109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024207.9506503504, "perf/iters_per_sec": 0.9652175668002846, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0360358476638794, "data/tokens_consumed": 38359007232, "data/tokens_consumed_B": 38.359007232, "train/loss_slope": -1.7509100639601258e-06} {"step": 18300, "timestamp": 1778214312.1259418, "grad/layer_0/attn": 0.002945161424577236, "grad/layer_0/mlp": 0.0027615611907094717, "grad/layer_0/attn_mlp_ratio": 1.0664841785280803, "grad/layer_4/attn": 0.0018697783816605806, "grad/layer_4/mlp": 0.002467300510033965, "grad/layer_4/attn_mlp_ratio": 0.757823499113406, "grad/layer_8/attn": 0.004056239500641823, "grad/layer_8/mlp": 0.003887250553816557, "grad/layer_8/attn_mlp_ratio": 1.0434725881798619, "grad/layer_12/attn": 0.006260787136852741, "grad/layer_12/mlp": 0.0063828835263848305, "grad/layer_12/attn_mlp_ratio": 0.9808712649832153, "grad/layer_16/attn": 0.005687130615115166, "grad/layer_16/mlp": 0.004531626123934984, "grad/layer_16/attn_mlp_ratio": 1.2549866944182377, "grad/layer_20/attn": 0.004141206853091717, "grad/layer_20/mlp": 0.006567871198058128, "grad/layer_20/attn_mlp_ratio": 0.6305249699877822, "grad/layer_24/attn": 0.012394820339977741, "grad/layer_24/mlp": 0.011580693535506725, "grad/layer_24/attn_mlp_ratio": 1.0703003403850424, "grad/layer_27/attn": 0.009582032449543476, "grad/layer_27/mlp": 0.009980963543057442, "grad/layer_27/attn_mlp_ratio": 0.9600307938410882} {"step": 18300, "timestamp": 1778214312.7422822, "eos/sharpness": 48.71065616607665, "eos/L0_probe": 2.0686910152435303, "eos/L_plus": 2.271545886993408, "eos/L_minus": 2.352942705154419, "eos/grad_norm": 0.18292607367038727, "eos/embed_grad_frac": 0.08177919685840607, "eos/time_s": 0.6135437488555908} {"step": 18300, "timestamp": 1778214312.7623193, "train/loss": 2.227357769012451, "train/z_loss": 0.0015898199053481221, "train/perplexity": 9.275326119546515, "train/grad_norm": 0.1826171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1911740.761784993, "perf/iters_per_sec": 0.9115890320706335, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0969855546951295, "data/tokens_consumed": 38379978752, "data/tokens_consumed_B": 38.379978752, "train/loss_slope": -1.6037475014819504e-06} {"step": 18300, "timestamp": 1778214314.129213, "geo/rankme_last": 440.1786193847656, "geo/layer_0/stable_rank_q_proj": 16.650781631469727, "geo/layer_0/stable_rank_k_proj": 14.729461669921875, "geo/layer_0/stable_rank_o_proj": 53.07394027709961, "geo/layer_0/stable_rank_gate_proj": 155.7086944580078, "geo/layer_0/stable_rank_down_proj": 49.638851165771484, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.04608443006873131, "geo/layer_0/attn_entropy_mean": 6.2709197998046875, "geo/layer_0/attn_entropy_std": 0.31036219000816345, "geo/layer_7/stable_rank_q_proj": 43.13177490234375, "geo/layer_7/stable_rank_k_proj": 42.36009979248047, "geo/layer_7/stable_rank_o_proj": 110.75567626953125, "geo/layer_7/stable_rank_gate_proj": 106.14765930175781, "geo/layer_7/stable_rank_down_proj": 153.2869110107422, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.556082546710968, "geo/layer_7/attn_entropy_mean": 4.671133041381836, "geo/layer_7/attn_entropy_std": 0.85511714220047, "geo/layer_14/stable_rank_q_proj": 58.299739837646484, "geo/layer_14/stable_rank_k_proj": 35.23319625854492, "geo/layer_14/stable_rank_o_proj": 52.7331428527832, "geo/layer_14/stable_rank_gate_proj": 91.5495834350586, "geo/layer_14/stable_rank_down_proj": 136.78387451171875, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37243375182151794, "geo/layer_14/attn_entropy_mean": 5.521603584289551, "geo/layer_14/attn_entropy_std": 0.4994356334209442, "geo/layer_21/stable_rank_q_proj": 49.34562301635742, "geo/layer_21/stable_rank_k_proj": 31.85651969909668, "geo/layer_21/stable_rank_o_proj": 85.87552642822266, "geo/layer_21/stable_rank_gate_proj": 90.1646728515625, "geo/layer_21/stable_rank_down_proj": 62.22959899902344, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15631450712680817, "geo/layer_21/attn_entropy_mean": 5.763177871704102, "geo/layer_21/attn_entropy_std": 0.29614847898483276, "geo/layer_27/stable_rank_q_proj": 41.59629440307617, "geo/layer_27/stable_rank_k_proj": 31.89042091369629, "geo/layer_27/stable_rank_o_proj": 117.32081604003906, "geo/layer_27/stable_rank_gate_proj": 91.80013275146484, "geo/layer_27/stable_rank_down_proj": 140.78619384765625, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08133132010698318, "geo/layer_27/attn_entropy_mean": 4.400930404663086, "geo/layer_27/attn_entropy_std": 0.5436501502990723, "attnres/final_alpha/block_0": 0.24374330043792725, "attnres/block_norm/0": 1.5874860286712646, "attnres/final_alpha/block_1": 0.007320183329284191, "attnres/block_norm/1": 26962.98046875, "attnres/final_alpha/block_2": 0.01514456421136856, "attnres/block_norm/2": 19643.34765625, "attnres/final_alpha/block_3": 0.01721198484301567, "attnres/block_norm/3": 26605.806640625, "attnres/final_alpha/block_4": 0.02240856923162937, "attnres/block_norm/4": 8682.216796875, "attnres/final_alpha/block_5": 0.5524662137031555, "attnres/block_norm/5": 4912.59375, "attnres/final_alpha/block_6": 0.14170518517494202, "attnres/block_norm/6": 18212.447265625, "geo/tier1_time_s": 1.3626489639282227, "geo/step": 18300.0, "geo/rankme_slope": -2.2599938412865147e-05} {"step": 18310, "timestamp": 1778214324.488604, "train/loss": 2.2760081768035887, "train/z_loss": 0.0015816587721928955, "train/perplexity": 9.737731420345368, "train/grad_norm": 0.162109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1789042.0790222676, "perf/iters_per_sec": 0.8530817408667887, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1722206115722655, "data/tokens_consumed": 38400950272, "data/tokens_consumed_B": 38.400950272, "train/loss_slope": 2.1486319974847917e-06} {"step": 18320, "timestamp": 1778214334.8455975, "train/loss": 2.2539145708084107, "train/z_loss": 0.0015829954762011766, "train/perplexity": 9.524949039439853, "train/grad_norm": 0.09375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025889.5215100392, "perf/iters_per_sec": 0.9660194022703358, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035175895690918, "data/tokens_consumed": 38421921792, "data/tokens_consumed_B": 38.421921792, "train/loss_slope": 3.885955149584548e-06} {"step": 18330, "timestamp": 1778214345.2038016, "train/loss": 2.271989440917969, "train/z_loss": 0.001589493127539754, "train/perplexity": 9.698676577752684, "train/grad_norm": 0.1640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025806.1907595564, "perf/iters_per_sec": 0.9659796670720846, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352184772491455, "data/tokens_consumed": 38442893312, "data/tokens_consumed_B": 38.442893312, "train/loss_slope": 4.248162314514913e-06} {"step": 18340, "timestamp": 1778214355.5601668, "train/loss": 2.2201056241989137, "train/z_loss": 0.001597031217534095, "train/perplexity": 9.208303434191563, "train/grad_norm": 0.1259765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025866.1452829612, "perf/iters_per_sec": 0.9660082556166464, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035187840461731, "data/tokens_consumed": 38463864832, "data/tokens_consumed_B": 38.463864832, "train/loss_slope": 1.7165381022126956e-06} {"step": 18350, "timestamp": 1778214365.9157882, "grad/layer_0/attn": 0.0027943397872149944, "grad/layer_0/mlp": 0.0027076799888163805, "grad/layer_0/attn_mlp_ratio": 1.0320051466776106, "grad/layer_4/attn": 0.001621126546524465, "grad/layer_4/mlp": 0.002487980527803302, "grad/layer_4/attn_mlp_ratio": 0.6515832673326708, "grad/layer_8/attn": 0.005723351612687111, "grad/layer_8/mlp": 0.003860386786982417, "grad/layer_8/attn_mlp_ratio": 1.4825849792379058, "grad/layer_12/attn": 0.005305085331201553, "grad/layer_12/mlp": 0.005768430884927511, "grad/layer_12/attn_mlp_ratio": 0.9196756180429919, "grad/layer_16/attn": 0.0038484700489789248, "grad/layer_16/mlp": 0.004311689175665379, "grad/layer_16/attn_mlp_ratio": 0.8925666491551691, "grad/layer_20/attn": 0.0045684524811804295, "grad/layer_20/mlp": 0.006520803086459637, "grad/layer_20/attn_mlp_ratio": 0.7005965907185732, "grad/layer_24/attn": 0.014784330502152443, "grad/layer_24/mlp": 0.015155107714235783, "grad/layer_24/attn_mlp_ratio": 0.9755344985579678, "grad/layer_27/attn": 0.008588905446231365, "grad/layer_27/mlp": 0.01390238106250763, "grad/layer_27/attn_mlp_ratio": 0.6178010332067567} {"step": 18350, "timestamp": 1778214365.9320092, "train/loss": 2.2386295557022096, "train/z_loss": 0.0015974053181707858, "train/perplexity": 9.3804670670048, "train/grad_norm": 0.1875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022936.728058176, "perf/iters_per_sec": 0.9646114006319886, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036686897277832, "data/tokens_consumed": 38484836352, "data/tokens_consumed_B": 38.484836352, "train/loss_slope": 2.6151015885135245e-07} {"step": 18360, "timestamp": 1778214376.2868664, "train/loss": 2.2526509523391725, "train/z_loss": 0.0015919325756840407, "train/perplexity": 9.51292073910679, "train/grad_norm": 0.1845703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026504.3092499496, "perf/iters_per_sec": 0.9663125559091328, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348618507385254, "data/tokens_consumed": 38505807872, "data/tokens_consumed_B": 38.505807872, "train/loss_slope": -1.3596640311308189e-06} {"step": 18370, "timestamp": 1778214386.641046, "train/loss": 2.2463820695877077, "train/z_loss": 0.001593152910936624, "train/perplexity": 9.453471887997374, "train/grad_norm": 0.19921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026419.9942341174, "perf/iters_per_sec": 0.9662723513765895, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034904909133911, "data/tokens_consumed": 38526779392, "data/tokens_consumed_B": 38.526779392, "train/loss_slope": -1.1282431124352406e-06} {"step": 18375, "timestamp": 1778214392.4329855, "eos/sharpness": 30.687737464904778, "eos/L0_probe": 2.0638725757598877, "eos/L_plus": 2.2280068397521973, "eos/L_minus": 2.206615686416626, "eos/grad_norm": 0.13691207766532898, "eos/embed_grad_frac": 0.1263689249753952, "eos/time_s": 0.6220738887786865} {"step": 18375, "timestamp": 1778214393.8105772, "geo/rankme_last": 441.29266357421875, "geo/layer_0/stable_rank_q_proj": 16.680831909179688, "geo/layer_0/stable_rank_k_proj": 14.809444427490234, "geo/layer_0/stable_rank_o_proj": 52.89350128173828, "geo/layer_0/stable_rank_gate_proj": 155.9485626220703, "geo/layer_0/stable_rank_down_proj": 49.68968963623047, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.046443596482276917, "geo/layer_0/attn_entropy_mean": 6.276759147644043, "geo/layer_0/attn_entropy_std": 0.30845823884010315, "geo/layer_7/stable_rank_q_proj": 43.02851486206055, "geo/layer_7/stable_rank_k_proj": 42.271324157714844, "geo/layer_7/stable_rank_o_proj": 110.63385772705078, "geo/layer_7/stable_rank_gate_proj": 105.81714630126953, "geo/layer_7/stable_rank_down_proj": 153.18153381347656, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5618448853492737, "geo/layer_7/attn_entropy_mean": 4.652037620544434, "geo/layer_7/attn_entropy_std": 0.8396841883659363, "geo/layer_14/stable_rank_q_proj": 58.35205078125, "geo/layer_14/stable_rank_k_proj": 35.169097900390625, "geo/layer_14/stable_rank_o_proj": 52.81597137451172, "geo/layer_14/stable_rank_gate_proj": 91.25027465820312, "geo/layer_14/stable_rank_down_proj": 136.75624084472656, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38386884331703186, "geo/layer_14/attn_entropy_mean": 5.509913444519043, "geo/layer_14/attn_entropy_std": 0.5137255191802979, "geo/layer_21/stable_rank_q_proj": 49.333946228027344, "geo/layer_21/stable_rank_k_proj": 31.922876358032227, "geo/layer_21/stable_rank_o_proj": 85.73404693603516, "geo/layer_21/stable_rank_gate_proj": 90.44482421875, "geo/layer_21/stable_rank_down_proj": 62.13138198852539, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15599973499774933, "geo/layer_21/attn_entropy_mean": 5.765762805938721, "geo/layer_21/attn_entropy_std": 0.2845434248447418, "geo/layer_27/stable_rank_q_proj": 41.619083404541016, "geo/layer_27/stable_rank_k_proj": 31.930967330932617, "geo/layer_27/stable_rank_o_proj": 117.09432220458984, "geo/layer_27/stable_rank_gate_proj": 91.88027954101562, "geo/layer_27/stable_rank_down_proj": 140.47972106933594, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.06766308844089508, "geo/layer_27/attn_entropy_mean": 4.428664207458496, "geo/layer_27/attn_entropy_std": 0.5610114932060242, "attnres/final_alpha/block_0": 0.24210035800933838, "attnres/block_norm/0": 1.5882657766342163, "attnres/final_alpha/block_1": 0.007091418839991093, "attnres/block_norm/1": 27181.130859375, "attnres/final_alpha/block_2": 0.014536550268530846, "attnres/block_norm/2": 19825.033203125, "attnres/final_alpha/block_3": 0.016810107976198196, "attnres/block_norm/3": 26743.140625, "attnres/final_alpha/block_4": 0.021847836673259735, "attnres/block_norm/4": 8629.9755859375, "attnres/final_alpha/block_5": 0.5587596297264099, "attnres/block_norm/5": 4794.6240234375, "attnres/final_alpha/block_6": 0.13885413110256195, "attnres/block_norm/6": 18181.3671875, "geo/tier1_time_s": 1.3573710918426514, "geo/step": 18375.0, "geo/rankme_slope": 4.801197822879151e-06} {"step": 18380, "timestamp": 1778214398.9920511, "train/loss": 2.293942666053772, "train/z_loss": 0.0015776425134390593, "train/perplexity": 9.913948114774508, "train/grad_norm": 0.11572265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1698621.1772270375, "perf/iters_per_sec": 0.8099656950125873, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2346201896667481, "data/tokens_consumed": 38547750912, "data/tokens_consumed_B": 38.547750912, "train/loss_slope": -1.134186623656476e-06} {"step": 18390, "timestamp": 1778214409.3532524, "train/loss": 2.2490314483642577, "train/z_loss": 0.0015925713116303087, "train/perplexity": 9.478550923044162, "train/grad_norm": 0.1123046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024950.6042990931, "perf/iters_per_sec": 0.9655716916556802, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0356558799743651, "data/tokens_consumed": 38568722432, "data/tokens_consumed_B": 38.568722432, "train/loss_slope": 6.967819241336648e-07} {"step": 18400, "timestamp": 1778214419.7020671, "grad/layer_0/attn": 0.003686450654640794, "grad/layer_0/mlp": 0.003343661315739155, "grad/layer_0/attn_mlp_ratio": 1.102519123882602, "grad/layer_4/attn": 0.0019256091909483075, "grad/layer_4/mlp": 0.0029568723402917385, "grad/layer_4/attn_mlp_ratio": 0.651231742265594, "grad/layer_8/attn": 0.009485711343586445, "grad/layer_8/mlp": 0.004203061107546091, "grad/layer_8/attn_mlp_ratio": 2.2568577698930463, "grad/layer_12/attn": 0.005752912722527981, "grad/layer_12/mlp": 0.006113622337579727, "grad/layer_12/attn_mlp_ratio": 0.9409990200188835, "grad/layer_16/attn": 0.009659155271947384, "grad/layer_16/mlp": 0.004537478554993868, "grad/layer_16/attn_mlp_ratio": 2.1287494677945626, "grad/layer_20/attn": 0.005761842709034681, "grad/layer_20/mlp": 0.006815964821726084, "grad/layer_20/attn_mlp_ratio": 0.8453451235743076, "grad/layer_24/attn": 0.023763302713632584, "grad/layer_24/mlp": 0.012372688390314579, "grad/layer_24/attn_mlp_ratio": 1.9206256370419938, "grad/layer_27/attn": 0.014831950888037682, "grad/layer_27/mlp": 0.010783021338284016, "grad/layer_27/attn_mlp_ratio": 1.3754911805496701} {"step": 18400, "timestamp": 1778214419.7181883, "train/loss": 2.2729705572128296, "train/z_loss": 0.00159389361506328, "train/perplexity": 9.708196776829254, "train/grad_norm": 0.1884765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024513.7161213434, "perf/iters_per_sec": 0.9653633671385495, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035879373550415, "data/tokens_consumed": 38589693952, "data/tokens_consumed_B": 38.589693952, "train/loss_slope": 7.281681384692148e-07} {"step": 18410, "timestamp": 1778214430.0743587, "train/loss": 2.237895131111145, "train/z_loss": 0.0016013044631108641, "train/perplexity": 9.373580350511638, "train/grad_norm": 0.119140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026080.9842466498, "perf/iters_per_sec": 0.966110698817563, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350780725479125, "data/tokens_consumed": 38610665472, "data/tokens_consumed_B": 38.610665472, "train/loss_slope": -1.6455252178430467e-06} {"step": 18420, "timestamp": 1778214440.4222476, "train/loss": 2.2495508909225466, "train/z_loss": 0.0015930527006275952, "train/perplexity": 9.483475764759953, "train/grad_norm": 0.1728515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027600.8563337354, "perf/iters_per_sec": 0.9668354303044011, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343021869659423, "data/tokens_consumed": 38631636992, "data/tokens_consumed_B": 38.631636992, "train/loss_slope": 2.858094299229514e-06} {"step": 18430, "timestamp": 1778214450.7693753, "train/loss": 2.195911002159119, "train/z_loss": 0.0016082716174423695, "train/perplexity": 8.988185584666248, "train/grad_norm": 0.15234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027655.4484153173, "perf/iters_per_sec": 0.966861461837443, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342743396759033, "data/tokens_consumed": 38652608512, "data/tokens_consumed_B": 38.652608512, "train/loss_slope": 2.483480569660455e-06} {"step": 18440, "timestamp": 1778214461.1272783, "train/loss": 2.301115536689758, "train/z_loss": 0.001568666030652821, "train/perplexity": 9.985315229648313, "train/grad_norm": 0.1640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025691.1906528098, "perf/iters_per_sec": 0.9659248307479905, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035277247428894, "data/tokens_consumed": 38673580032, "data/tokens_consumed_B": 38.673580032, "train/loss_slope": 6.523426566938093e-06} {"step": 18450, "timestamp": 1778214471.4718652, "grad/layer_0/attn": 0.0027849795296788216, "grad/layer_0/mlp": 0.0027404988650232553, "grad/layer_0/attn_mlp_ratio": 1.0162308270221108, "grad/layer_4/attn": 0.001648366334848106, "grad/layer_4/mlp": 0.002620427403599024, "grad/layer_4/attn_mlp_ratio": 0.6290448152387954, "grad/layer_8/attn": 0.004268697462975979, "grad/layer_8/mlp": 0.003884105710312724, "grad/layer_8/attn_mlp_ratio": 1.0990167805527136, "grad/layer_12/attn": 0.003605019999668002, "grad/layer_12/mlp": 0.005947163328528404, "grad/layer_12/attn_mlp_ratio": 0.6061746987437415, "grad/layer_16/attn": 0.0037317152600735426, "grad/layer_16/mlp": 0.0040518976747989655, "grad/layer_16/attn_mlp_ratio": 0.9209796168312, "grad/layer_20/attn": 0.004059705417603254, "grad/layer_20/mlp": 0.005624961107969284, "grad/layer_20/attn_mlp_ratio": 0.7217303848872024, "grad/layer_24/attn": 0.01275580283254385, "grad/layer_24/mlp": 0.012327397242188454, "grad/layer_24/attn_mlp_ratio": 1.0347523064653112, "grad/layer_27/attn": 0.007586734835058451, "grad/layer_27/mlp": 0.011196047998964787, "grad/layer_27/attn_mlp_ratio": 0.6776261380799122} {"step": 18450, "timestamp": 1778214472.0871696, "eos/sharpness": 20.94025611877441, "eos/L0_probe": 2.0641844272613525, "eos/L_plus": 2.174765110015869, "eos/L_minus": 2.16300630569458, "eos/grad_norm": 0.14329273998737335, "eos/embed_grad_frac": 0.1433248519897461, "eos/time_s": 0.6123065948486328} {"step": 18450, "timestamp": 1778214472.106412, "train/loss": 2.189305853843689, "train/z_loss": 0.0016068365657702088, "train/perplexity": 8.929012923027928, "train/grad_norm": 0.1435546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1911087.823139948, "perf/iters_per_sec": 0.9112776866626492, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0973603487014771, "data/tokens_consumed": 38694551552, "data/tokens_consumed_B": 38.694551552, "train/loss_slope": 2.2958394109828103e-06} {"step": 18450, "timestamp": 1778214473.4726777, "geo/rankme_last": 441.0860900878906, "geo/layer_0/stable_rank_q_proj": 16.69234848022461, "geo/layer_0/stable_rank_k_proj": 14.800654411315918, "geo/layer_0/stable_rank_o_proj": 52.9106559753418, "geo/layer_0/stable_rank_gate_proj": 155.92884826660156, "geo/layer_0/stable_rank_down_proj": 49.721214294433594, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.040878936648368835, "geo/layer_0/attn_entropy_mean": 6.276125907897949, "geo/layer_0/attn_entropy_std": 0.30872949957847595, "geo/layer_7/stable_rank_q_proj": 43.06507873535156, "geo/layer_7/stable_rank_k_proj": 42.3952751159668, "geo/layer_7/stable_rank_o_proj": 110.79339599609375, "geo/layer_7/stable_rank_gate_proj": 106.01102447509766, "geo/layer_7/stable_rank_down_proj": 153.58590698242188, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5662925243377686, "geo/layer_7/attn_entropy_mean": 4.645780086517334, "geo/layer_7/attn_entropy_std": 0.8372867703437805, "geo/layer_14/stable_rank_q_proj": 58.21388244628906, "geo/layer_14/stable_rank_k_proj": 35.184444427490234, "geo/layer_14/stable_rank_o_proj": 52.73080062866211, "geo/layer_14/stable_rank_gate_proj": 90.88262939453125, "geo/layer_14/stable_rank_down_proj": 136.92294311523438, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38820067048072815, "geo/layer_14/attn_entropy_mean": 5.507281303405762, "geo/layer_14/attn_entropy_std": 0.48402127623558044, "geo/layer_21/stable_rank_q_proj": 49.19533920288086, "geo/layer_21/stable_rank_k_proj": 32.005760192871094, "geo/layer_21/stable_rank_o_proj": 85.54468536376953, "geo/layer_21/stable_rank_gate_proj": 90.220947265625, "geo/layer_21/stable_rank_down_proj": 62.043067932128906, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1581908017396927, "geo/layer_21/attn_entropy_mean": 5.749026775360107, "geo/layer_21/attn_entropy_std": 0.2944386303424835, "geo/layer_27/stable_rank_q_proj": 41.54655838012695, "geo/layer_27/stable_rank_k_proj": 31.853288650512695, "geo/layer_27/stable_rank_o_proj": 117.08390045166016, "geo/layer_27/stable_rank_gate_proj": 91.8587646484375, "geo/layer_27/stable_rank_down_proj": 140.7168731689453, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07854289561510086, "geo/layer_27/attn_entropy_mean": 4.410580635070801, "geo/layer_27/attn_entropy_std": 0.5349738001823425, "attnres/final_alpha/block_0": 0.2450062334537506, "attnres/block_norm/0": 1.5893309116363525, "attnres/final_alpha/block_1": 0.007161230780184269, "attnres/block_norm/1": 27263.802734375, "attnres/final_alpha/block_2": 0.014567188918590546, "attnres/block_norm/2": 19881.3984375, "attnres/final_alpha/block_3": 0.01688981056213379, "attnres/block_norm/3": 26899.240234375, "attnres/final_alpha/block_4": 0.02191505767405033, "attnres/block_norm/4": 8691.5234375, "attnres/final_alpha/block_5": 0.5553923845291138, "attnres/block_norm/5": 4829.3056640625, "attnres/final_alpha/block_6": 0.1390681266784668, "attnres/block_norm/6": 18297.158203125, "geo/tier1_time_s": 1.3623290061950684, "geo/step": 18450.0, "geo/rankme_slope": 1.999569358993597e-05} {"step": 18460, "timestamp": 1778214483.8242486, "train/loss": 2.2048264026641844, "train/z_loss": 0.0015939356060698628, "train/perplexity": 9.068677133011855, "train/grad_norm": 0.09326171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1790285.7554066067, "perf/iters_per_sec": 0.8536747719796213, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1714062929153441, "data/tokens_consumed": 38715523072, "data/tokens_consumed_B": 38.715523072, "train/loss_slope": 6.07903011561301e-07} {"step": 18470, "timestamp": 1778214494.179996, "train/loss": 2.1966049432754517, "train/z_loss": 0.0015924498322419823, "train/perplexity": 8.99442502085495, "train/grad_norm": 0.123046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026124.1535234884, "perf/iters_per_sec": 0.9661312835328524, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350560188293456, "data/tokens_consumed": 38736494592, "data/tokens_consumed_B": 38.736494592, "train/loss_slope": -7.252675150021442e-07} {"step": 18480, "timestamp": 1778214504.5424685, "train/loss": 2.2365580558776856, "train/z_loss": 0.001593103923369199, "train/perplexity": 9.361055543546682, "train/grad_norm": 0.19140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024783.171911297, "perf/iters_per_sec": 0.9654918536716923, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0357415199279785, "data/tokens_consumed": 38757466112, "data/tokens_consumed_B": 38.757466112, "train/loss_slope": 1.2922498008850318e-06} {"step": 18490, "timestamp": 1778214514.9190876, "train/loss": 2.23584463596344, "train/z_loss": 0.0016012059175409376, "train/perplexity": 9.35437956177588, "train/grad_norm": 0.1826171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022058.4153551415, "perf/iters_per_sec": 0.9641925884986599, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0371371984481812, "data/tokens_consumed": 38778437632, "data/tokens_consumed_B": 38.778437632, "train/loss_slope": -7.792775565855247e-07} {"step": 18500, "timestamp": 1778214525.2718956, "grad/layer_0/attn": 0.0028521695639938116, "grad/layer_0/mlp": 0.002734174020588398, "grad/layer_0/attn_mlp_ratio": 1.0431557897197932, "grad/layer_4/attn": 0.0014112145872786641, "grad/layer_4/mlp": 0.00258237193338573, "grad/layer_4/attn_mlp_ratio": 0.5464799684298124, "grad/layer_8/attn": 0.004083844833076, "grad/layer_8/mlp": 0.003992879763245583, "grad/layer_8/attn_mlp_ratio": 1.022781794831282, "grad/layer_12/attn": 0.004341178108006716, "grad/layer_12/mlp": 0.005968298763036728, "grad/layer_12/attn_mlp_ratio": 0.7273727753301353, "grad/layer_16/attn": 0.004579740110784769, "grad/layer_16/mlp": 0.0043129571713507175, "grad/layer_16/attn_mlp_ratio": 1.0618561285561965, "grad/layer_20/attn": 0.003563057165592909, "grad/layer_20/mlp": 0.005961446091532707, "grad/layer_20/attn_mlp_ratio": 0.5976833558698674, "grad/layer_24/attn": 0.007204264402389526, "grad/layer_24/mlp": 0.0080660879611969, "grad/layer_24/attn_mlp_ratio": 0.8931546925512378, "grad/layer_27/attn": 0.008253215812146664, "grad/layer_27/mlp": 0.006945755798369646, "grad/layer_27/attn_mlp_ratio": 1.1882386788288821} {"step": 18500, "timestamp": 1778214525.2879658, "train/loss": 2.2713791847229006, "train/z_loss": 0.0015917306765913964, "train/perplexity": 9.692759705874634, "train/grad_norm": 0.10888671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024131.6983836342, "perf/iters_per_sec": 0.9651812068861171, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0360748767852783, "data/tokens_consumed": 38799409152, "data/tokens_consumed_B": 38.799409152, "train/loss_slope": 3.3439635085946166e-06} {"step": 18500, "timestamp": 1778214532.321534, "geo/ww_alpha_mean": 7.914563382446041, "geo/ww_alpha_std": 4.408257709381559, "geo/ww_alpha_min": 1.3531587108271048, "geo/ww_alpha_max": 26.660849672697523, "geo/ww_alpha_healthy_frac": 0.17766497461928935, "geo/ww_alpha_by_type/q_proj": 4.206526969822211, "geo/ww_alpha_by_type/k_proj": 4.70479023026776, "geo/ww_alpha_by_type/v_proj": 8.082352336381087, "geo/ww_alpha_by_type/o_proj": 8.30683621348587, "geo/ww_alpha_by_type/gate_proj": 8.558346310768458, "geo/ww_alpha_by_type/up_proj": 12.716524871860656, "geo/ww_alpha_by_type/down_proj": 8.952308625152776, "geo/twonn_id/layer_0": 0.7440706491470337, "geo/twonn_id/layer_7": 3.0619397163391113, "geo/twonn_id/layer_14": 3.744666337966919, "geo/twonn_id/layer_21": 7.567195415496826, "geo/twonn_id/layer_27": 6.183074474334717, "geo/tier2_time_s": 7.026644945144653} {"step": 18500, "timestamp": 1778214532.9237537, "eoc/jacobian_sigma/layer_0/attn": 804.4513549804688, "eoc/jacobian_sigma/layer_0/mlp": 5170.0830078125, "eoc/jacobian_sigma/layer_0": 5170.0830078125, "eoc/jacobian_sigma/layer_7/attn": 1.139816403388977, "eoc/jacobian_sigma/layer_7/mlp": 1.6359608173370361, "eoc/jacobian_sigma/layer_7": 1.6359608173370361, "eoc/jacobian_sigma/layer_14/attn": 1.5540121793746948, "eoc/jacobian_sigma/layer_14/mlp": 6.457337856292725, "eoc/jacobian_sigma/layer_14": 6.457337856292725, "eoc/jacobian_sigma/layer_21/attn": 1.0765618085861206, "eoc/jacobian_sigma/layer_21/mlp": 3.955357074737549, "eoc/jacobian_sigma/layer_21": 3.955357074737549, "eoc/jacobian_sigma/layer_27/attn": 3.5609264373779297, "eoc/jacobian_sigma/layer_27/mlp": 22.88365936279297, "eoc/jacobian_sigma/layer_27": 22.88365936279297, "eoc/layer0_sigma": 5170.0830078125, "eoc/sigma_max": 22.88365936279297, "eoc/sigma_min": 1.6359608173370361, "eoc/sigma_mean": 8.73307877779007, "eoc/time_s": 0.5966126918792725} {"step": 18510, "timestamp": 1778214543.2917194, "train/loss": 2.264612102508545, "train/z_loss": 0.001590268756262958, "train/perplexity": 9.627389436514944, "train/grad_norm": 0.1650390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1165214.9220179284, "perf/iters_per_sec": 0.5556177721109049, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.7997984409332275, "data/tokens_consumed": 38820380672, "data/tokens_consumed_B": 38.820380672, "train/loss_slope": 5.576691554539146e-06} {"step": 18520, "timestamp": 1778214553.6499798, "train/loss": 2.2613086700439453, "train/z_loss": 0.0015873226220719516, "train/perplexity": 9.595638478148306, "train/grad_norm": 0.11328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025898.573527295, "perf/iters_per_sec": 0.96602371860852, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351712703704834, "data/tokens_consumed": 38841352192, "data/tokens_consumed_B": 38.841352192, "train/loss_slope": 4.944269670726307e-06} {"step": 18525, "timestamp": 1778214559.439076, "eos/sharpness": 63.26663494110106, "eos/L0_probe": 2.063603639602661, "eos/L_plus": 2.4940671920776367, "eos/L_minus": 2.2658064365386963, "eos/grad_norm": 0.23700560629367828, "eos/embed_grad_frac": 0.05305667221546173, "eos/time_s": 0.6244258880615234} {"step": 18525, "timestamp": 1778214560.8206558, "geo/rankme_last": 440.6551208496094, "geo/layer_0/stable_rank_q_proj": 16.730600357055664, "geo/layer_0/stable_rank_k_proj": 14.868738174438477, "geo/layer_0/stable_rank_o_proj": 52.93574523925781, "geo/layer_0/stable_rank_gate_proj": 155.93783569335938, "geo/layer_0/stable_rank_down_proj": 49.73167037963867, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.046135276556015015, "geo/layer_0/attn_entropy_mean": 6.272994041442871, "geo/layer_0/attn_entropy_std": 0.30705010890960693, "geo/layer_7/stable_rank_q_proj": 42.981956481933594, "geo/layer_7/stable_rank_k_proj": 42.31852722167969, "geo/layer_7/stable_rank_o_proj": 110.77130889892578, "geo/layer_7/stable_rank_gate_proj": 105.97428131103516, "geo/layer_7/stable_rank_down_proj": 153.47494506835938, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5630733966827393, "geo/layer_7/attn_entropy_mean": 4.715083599090576, "geo/layer_7/attn_entropy_std": 0.8516319394111633, "geo/layer_14/stable_rank_q_proj": 58.23177719116211, "geo/layer_14/stable_rank_k_proj": 35.23793411254883, "geo/layer_14/stable_rank_o_proj": 52.645816802978516, "geo/layer_14/stable_rank_gate_proj": 90.88095092773438, "geo/layer_14/stable_rank_down_proj": 136.90585327148438, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38082945346832275, "geo/layer_14/attn_entropy_mean": 5.527512073516846, "geo/layer_14/attn_entropy_std": 0.5092629790306091, "geo/layer_21/stable_rank_q_proj": 49.22150802612305, "geo/layer_21/stable_rank_k_proj": 32.041969299316406, "geo/layer_21/stable_rank_o_proj": 85.49007415771484, "geo/layer_21/stable_rank_gate_proj": 90.07939147949219, "geo/layer_21/stable_rank_down_proj": 62.07756042480469, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15577957034111023, "geo/layer_21/attn_entropy_mean": 5.763489723205566, "geo/layer_21/attn_entropy_std": 0.2894165813922882, "geo/layer_27/stable_rank_q_proj": 41.52143096923828, "geo/layer_27/stable_rank_k_proj": 31.857669830322266, "geo/layer_27/stable_rank_o_proj": 117.09111785888672, "geo/layer_27/stable_rank_gate_proj": 91.76716613769531, "geo/layer_27/stable_rank_down_proj": 140.97604370117188, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07745949923992157, "geo/layer_27/attn_entropy_mean": 4.431796073913574, "geo/layer_27/attn_entropy_std": 0.5293664932250977, "attnres/final_alpha/block_0": 0.24256959557533264, "attnres/block_norm/0": 1.5901219844818115, "attnres/final_alpha/block_1": 0.007206480018794537, "attnres/block_norm/1": 27326.12109375, "attnres/final_alpha/block_2": 0.01462099701166153, "attnres/block_norm/2": 19810.7734375, "attnres/final_alpha/block_3": 0.016527947038412094, "attnres/block_norm/3": 26930.9375, "attnres/final_alpha/block_4": 0.02166653797030449, "attnres/block_norm/4": 8718.79296875, "attnres/final_alpha/block_5": 0.5591118335723877, "attnres/block_norm/5": 4821.404296875, "attnres/final_alpha/block_6": 0.13829663395881653, "attnres/block_norm/6": 18451.0, "geo/tier1_time_s": 1.3615400791168213, "geo/step": 18525.0, "geo/rankme_slope": 2.1736702493497397e-05} {"step": 18530, "timestamp": 1778214565.999886, "train/loss": 2.1928351640701296, "train/z_loss": 0.001602870540227741, "train/perplexity": 8.960581855158066, "train/grad_norm": 0.162109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1698966.9830250347, "perf/iters_per_sec": 0.8101305880665944, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.234368896484375, "data/tokens_consumed": 38862323712, "data/tokens_consumed_B": 38.862323712, "train/loss_slope": 3.31813055153486e-06} {"step": 18540, "timestamp": 1778214576.3492846, "train/loss": 2.1917890310287476, "train/z_loss": 0.0016140519292093813, "train/perplexity": 8.951212795905057, "train/grad_norm": 0.11474609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027337.4254280878, "perf/iters_per_sec": 0.9667098166599692, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034436583518982, "data/tokens_consumed": 38883295232, "data/tokens_consumed_B": 38.883295232, "train/loss_slope": 2.433404642077147e-06} {"step": 18550, "timestamp": 1778214586.6998553, "grad/layer_0/attn": 0.0023579213302582502, "grad/layer_0/mlp": 0.0024546440690755844, "grad/layer_0/attn_mlp_ratio": 0.9605959837128817, "grad/layer_4/attn": 0.0018653812585398555, "grad/layer_4/mlp": 0.0023618428967893124, "grad/layer_4/attn_mlp_ratio": 0.7897990091109585, "grad/layer_8/attn": 0.0039096917025744915, "grad/layer_8/mlp": 0.003819722682237625, "grad/layer_8/attn_mlp_ratio": 1.0235537826868581, "grad/layer_12/attn": 0.0037887736689299345, "grad/layer_12/mlp": 0.005857601296156645, "grad/layer_12/attn_mlp_ratio": 0.6468131599763458, "grad/layer_16/attn": 0.004685183055698872, "grad/layer_16/mlp": 0.004237302113324404, "grad/layer_16/attn_mlp_ratio": 1.105699527630124, "grad/layer_20/attn": 0.013853076845407486, "grad/layer_20/mlp": 0.005902805365622044, "grad/layer_20/attn_mlp_ratio": 2.3468631866809515, "grad/layer_24/attn": 0.010130494832992554, "grad/layer_24/mlp": 0.008927428163588047, "grad/layer_24/attn_mlp_ratio": 1.1347607097904564, "grad/layer_27/attn": 0.003976049367338419, "grad/layer_27/mlp": 0.007072627544403076, "grad/layer_27/attn_mlp_ratio": 0.5621742819282824} {"step": 18550, "timestamp": 1778214586.7158487, "train/loss": 2.2000035285949706, "train/z_loss": 0.0016044799820519983, "train/perplexity": 9.02504534510755, "train/grad_norm": 0.09619140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024200.3111905819, "perf/iters_per_sec": 0.9652139240219983, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0360397577285767, "data/tokens_consumed": 38904266752, "data/tokens_consumed_B": 38.904266752, "train/loss_slope": 3.911354861530677e-07} {"step": 18560, "timestamp": 1778214597.0688794, "train/loss": 2.2203787565231323, "train/z_loss": 0.0015960115473717452, "train/perplexity": 9.210818863017478, "train/grad_norm": 0.166015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026728.2029329783, "perf/iters_per_sec": 0.9664193167366878, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347475290298462, "data/tokens_consumed": 38925238272, "data/tokens_consumed_B": 38.925238272, "train/loss_slope": -6.194978514269132e-07} {"step": 18570, "timestamp": 1778214607.4212723, "train/loss": 2.223339056968689, "train/z_loss": 0.0015921174315735697, "train/perplexity": 9.238126053003016, "train/grad_norm": 0.1328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026688.1365709773, "perf/iters_per_sec": 0.9664002116064917, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034767985343933, "data/tokens_consumed": 38946209792, "data/tokens_consumed_B": 38.946209792, "train/loss_slope": -1.6134950372859263e-06} {"step": 18580, "timestamp": 1778214617.7757137, "train/loss": 2.2208306074142454, "train/z_loss": 0.0015989829902537168, "train/perplexity": 9.214981720153139, "train/grad_norm": 0.1103515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026429.377784252, "perf/iters_per_sec": 0.9662768258019695, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349001169204712, "data/tokens_consumed": 38967181312, "data/tokens_consumed_B": 38.967181312, "train/loss_slope": -1.1848787198436479e-06} {"step": 18590, "timestamp": 1778214628.1285546, "train/loss": 2.229993772506714, "train/z_loss": 0.001588773459661752, "train/perplexity": 9.299808164810347, "train/grad_norm": 0.16796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026780.4129938476, "perf/iters_per_sec": 0.9664442124337423, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347208738327027, "data/tokens_consumed": 38988152832, "data/tokens_consumed_B": 38.988152832, "train/loss_slope": -1.6639332924380504e-06} {"step": 18600, "timestamp": 1778214638.4726686, "grad/layer_0/attn": 0.0027504696045070887, "grad/layer_0/mlp": 0.0028291770722717047, "grad/layer_0/attn_mlp_ratio": 0.9721800498971871, "grad/layer_4/attn": 0.0025685650762170553, "grad/layer_4/mlp": 0.0024661971256136894, "grad/layer_4/attn_mlp_ratio": 1.041508379597618, "grad/layer_8/attn": 0.006004695314913988, "grad/layer_8/mlp": 0.0036762915551662445, "grad/layer_8/attn_mlp_ratio": 1.6333566207881436, "grad/layer_12/attn": 0.004243399947881699, "grad/layer_12/mlp": 0.005935536697506905, "grad/layer_12/attn_mlp_ratio": 0.714914268523118, "grad/layer_16/attn": 0.003933147061616182, "grad/layer_16/mlp": 0.004415890201926231, "grad/layer_16/attn_mlp_ratio": 0.8906804274328386, "grad/layer_20/attn": 0.004779478069394827, "grad/layer_20/mlp": 0.0061644744127988815, "grad/layer_20/attn_mlp_ratio": 0.7753261140866947, "grad/layer_24/attn": 0.006624720059335232, "grad/layer_24/mlp": 0.009150969795882702, "grad/layer_24/attn_mlp_ratio": 0.7239363843078419, "grad/layer_27/attn": 0.004859440494328737, "grad/layer_27/mlp": 0.008011401630938053, "grad/layer_27/attn_mlp_ratio": 0.6065655746063487} {"step": 18600, "timestamp": 1778214639.0925655, "eos/sharpness": 28.71038913726806, "eos/L0_probe": 2.064185857772827, "eos/L_plus": 2.242725133895874, "eos/L_minus": 2.172750473022461, "eos/grad_norm": 0.11762314289808273, "eos/embed_grad_frac": 0.20002776384353638, "eos/time_s": 0.616661787033081} {"step": 18600, "timestamp": 1778214639.1140583, "train/loss": 2.1969319105148317, "train/z_loss": 0.001612918870523572, "train/perplexity": 8.99736638401232, "train/grad_norm": 0.11767578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1909845.440554518, "perf/iters_per_sec": 0.9106852724812117, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.098074197769165, "data/tokens_consumed": 39009124352, "data/tokens_consumed_B": 39.009124352, "train/loss_slope": -3.2224412083159838e-06} {"step": 18600, "timestamp": 1778214640.4806614, "geo/rankme_last": 441.1081848144531, "geo/layer_0/stable_rank_q_proj": 16.775856018066406, "geo/layer_0/stable_rank_k_proj": 14.87932300567627, "geo/layer_0/stable_rank_o_proj": 52.99100112915039, "geo/layer_0/stable_rank_gate_proj": 155.70184326171875, "geo/layer_0/stable_rank_down_proj": 49.774864196777344, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.040757566690444946, "geo/layer_0/attn_entropy_mean": 6.276822090148926, "geo/layer_0/attn_entropy_std": 0.31033942103385925, "geo/layer_7/stable_rank_q_proj": 42.982177734375, "geo/layer_7/stable_rank_k_proj": 42.18392562866211, "geo/layer_7/stable_rank_o_proj": 110.9745101928711, "geo/layer_7/stable_rank_gate_proj": 105.71019744873047, "geo/layer_7/stable_rank_down_proj": 153.2119598388672, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5496196746826172, "geo/layer_7/attn_entropy_mean": 4.630615234375, "geo/layer_7/attn_entropy_std": 0.830253541469574, "geo/layer_14/stable_rank_q_proj": 58.04161834716797, "geo/layer_14/stable_rank_k_proj": 35.20765686035156, "geo/layer_14/stable_rank_o_proj": 52.77067184448242, "geo/layer_14/stable_rank_gate_proj": 90.71500396728516, "geo/layer_14/stable_rank_down_proj": 137.1470489501953, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3775540888309479, "geo/layer_14/attn_entropy_mean": 5.527444362640381, "geo/layer_14/attn_entropy_std": 0.5123024582862854, "geo/layer_21/stable_rank_q_proj": 49.16275405883789, "geo/layer_21/stable_rank_k_proj": 32.03550720214844, "geo/layer_21/stable_rank_o_proj": 85.3032455444336, "geo/layer_21/stable_rank_gate_proj": 90.24475860595703, "geo/layer_21/stable_rank_down_proj": 62.092472076416016, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15607567131519318, "geo/layer_21/attn_entropy_mean": 5.751272201538086, "geo/layer_21/attn_entropy_std": 0.2809033989906311, "geo/layer_27/stable_rank_q_proj": 41.553749084472656, "geo/layer_27/stable_rank_k_proj": 31.829267501831055, "geo/layer_27/stable_rank_o_proj": 117.00039672851562, "geo/layer_27/stable_rank_gate_proj": 91.73233795166016, "geo/layer_27/stable_rank_down_proj": 140.39817810058594, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07942867279052734, "geo/layer_27/attn_entropy_mean": 4.401412010192871, "geo/layer_27/attn_entropy_std": 0.5297921895980835, "attnres/final_alpha/block_0": 0.24205449223518372, "attnres/block_norm/0": 1.5911096334457397, "attnres/final_alpha/block_1": 0.0071350811049342155, "attnres/block_norm/1": 27325.68359375, "attnres/final_alpha/block_2": 0.014369780197739601, "attnres/block_norm/2": 19837.2421875, "attnres/final_alpha/block_3": 0.016338935121893883, "attnres/block_norm/3": 27190.15625, "attnres/final_alpha/block_4": 0.021521931514143944, "attnres/block_norm/4": 8700.466796875, "attnres/final_alpha/block_5": 0.5593327283859253, "attnres/block_norm/5": 4829.3974609375, "attnres/final_alpha/block_6": 0.1392471194267273, "attnres/block_norm/6": 18372.1015625, "geo/tier1_time_s": 1.362264633178711, "geo/step": 18600.0, "geo/rankme_slope": 8.970365489945978e-06} {"step": 18610, "timestamp": 1778214650.8360715, "train/loss": 2.2677380800247193, "train/z_loss": 0.0015803953632712365, "train/perplexity": 9.65753152663591, "train/grad_norm": 0.3125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1789684.8389799164, "perf/iters_per_sec": 0.8533882326984007, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.171799612045288, "data/tokens_consumed": 39030095872, "data/tokens_consumed_B": 39.030095872, "train/loss_slope": 4.938137484785221e-07} {"step": 18620, "timestamp": 1778214661.1942692, "train/loss": 2.1881659507751463, "train/z_loss": 0.001590439083520323, "train/perplexity": 8.91884051268028, "train/grad_norm": 0.1005859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025676.7290889837, "perf/iters_per_sec": 0.9659179349369925, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352846384048462, "data/tokens_consumed": 39051067392, "data/tokens_consumed_B": 39.051067392, "train/loss_slope": 1.8397958007547864e-06} {"step": 18630, "timestamp": 1778214671.5479062, "train/loss": 2.2043739795684814, "train/z_loss": 0.001596906699705869, "train/perplexity": 9.06457518200806, "train/grad_norm": 0.1416015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026494.0379424973, "perf/iters_per_sec": 0.9663076581680762, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348670959472657, "data/tokens_consumed": 39072038912, "data/tokens_consumed_B": 39.072038912, "train/loss_slope": -2.410874207957529e-06} {"step": 18640, "timestamp": 1778214681.9062352, "train/loss": 2.2657692432403564, "train/z_loss": 0.0015835345373488962, "train/perplexity": 9.638536128875064, "train/grad_norm": 0.15625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025864.04564825, "perf/iters_per_sec": 0.9660072544327974, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035188913345337, "data/tokens_consumed": 39093010432, "data/tokens_consumed_B": 39.093010432, "train/loss_slope": -7.125040163615211e-08} {"step": 18650, "timestamp": 1778214692.2589555, "grad/layer_0/attn": 0.002682484919205308, "grad/layer_0/mlp": 0.002607601461932063, "grad/layer_0/attn_mlp_ratio": 1.0287173310395477, "grad/layer_4/attn": 0.0015741775278002024, "grad/layer_4/mlp": 0.002513187238946557, "grad/layer_4/attn_mlp_ratio": 0.626366965727292, "grad/layer_8/attn": 0.00561388349160552, "grad/layer_8/mlp": 0.003931405954062939, "grad/layer_8/attn_mlp_ratio": 1.4279581947033464, "grad/layer_12/attn": 0.0036850234027951956, "grad/layer_12/mlp": 0.006136901676654816, "grad/layer_12/attn_mlp_ratio": 0.6004696729566816, "grad/layer_16/attn": 0.0052607180550694466, "grad/layer_16/mlp": 0.004375006537884474, "grad/layer_16/attn_mlp_ratio": 1.2024480167676401, "grad/layer_20/attn": 0.0038291371893137693, "grad/layer_20/mlp": 0.005798823665827513, "grad/layer_20/attn_mlp_ratio": 0.6603299813798252, "grad/layer_24/attn": 0.0051290113478899, "grad/layer_24/mlp": 0.008019170723855495, "grad/layer_24/attn_mlp_ratio": 0.6395937261533419, "grad/layer_27/attn": 0.00461649801582098, "grad/layer_27/mlp": 0.006328074727207422, "grad/layer_27/attn_mlp_ratio": 0.7295264581847936} {"step": 18650, "timestamp": 1778214692.2750437, "train/loss": 2.2068125724792482, "train/z_loss": 0.0015975795569829643, "train/perplexity": 9.086706965022946, "train/grad_norm": 0.09716796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023500.9859650435, "perf/iters_per_sec": 0.9648804597687929, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0363978147506714, "data/tokens_consumed": 39113981952, "data/tokens_consumed_B": 39.113981952, "train/loss_slope": -1.4249971311369752e-06} {"step": 18660, "timestamp": 1778214702.6296265, "train/loss": 2.2465355157852174, "train/z_loss": 0.001580231124535203, "train/perplexity": 9.454922598612018, "train/grad_norm": 0.16796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026393.9915169263, "perf/iters_per_sec": 0.9662599523148185, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034918189048767, "data/tokens_consumed": 39134953472, "data/tokens_consumed_B": 39.134953472, "train/loss_slope": -2.7580169668577365e-06} {"step": 18670, "timestamp": 1778214712.9814591, "train/loss": 2.3218393325805664, "train/z_loss": 0.0015725014032796024, "train/perplexity": 10.194407978799468, "train/grad_norm": 0.138671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026823.7054867353, "perf/iters_per_sec": 0.9664648559030224, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03469877243042, "data/tokens_consumed": 39155924992, "data/tokens_consumed_B": 39.155924992, "train/loss_slope": 2.7939288851523974e-06} {"step": 18675, "timestamp": 1778214718.7691383, "eos/sharpness": 52.95014381408691, "eos/L0_probe": 2.0632760524749756, "eos/L_plus": 2.387448787689209, "eos/L_minus": 2.2686047554016113, "eos/grad_norm": 0.2744598686695099, "eos/embed_grad_frac": 0.03372073918581009, "eos/time_s": 0.622065544128418} {"step": 18675, "timestamp": 1778214720.1469955, "geo/rankme_last": 440.00531005859375, "geo/layer_0/stable_rank_q_proj": 16.80507469177246, "geo/layer_0/stable_rank_k_proj": 14.896051406860352, "geo/layer_0/stable_rank_o_proj": 52.983123779296875, "geo/layer_0/stable_rank_gate_proj": 155.99441528320312, "geo/layer_0/stable_rank_down_proj": 49.739566802978516, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.04206104576587677, "geo/layer_0/attn_entropy_mean": 6.280474662780762, "geo/layer_0/attn_entropy_std": 0.3161952495574951, "geo/layer_7/stable_rank_q_proj": 42.970375061035156, "geo/layer_7/stable_rank_k_proj": 42.114967346191406, "geo/layer_7/stable_rank_o_proj": 110.72955322265625, "geo/layer_7/stable_rank_gate_proj": 105.89913177490234, "geo/layer_7/stable_rank_down_proj": 153.13970947265625, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5606998801231384, "geo/layer_7/attn_entropy_mean": 4.654870986938477, "geo/layer_7/attn_entropy_std": 0.8573461174964905, "geo/layer_14/stable_rank_q_proj": 57.95537185668945, "geo/layer_14/stable_rank_k_proj": 35.277950286865234, "geo/layer_14/stable_rank_o_proj": 52.650272369384766, "geo/layer_14/stable_rank_gate_proj": 90.78363800048828, "geo/layer_14/stable_rank_down_proj": 136.99771118164062, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.36225759983062744, "geo/layer_14/attn_entropy_mean": 5.572290420532227, "geo/layer_14/attn_entropy_std": 0.4854782819747925, "geo/layer_21/stable_rank_q_proj": 49.07762908935547, "geo/layer_21/stable_rank_k_proj": 31.98839569091797, "geo/layer_21/stable_rank_o_proj": 85.30804443359375, "geo/layer_21/stable_rank_gate_proj": 90.26702880859375, "geo/layer_21/stable_rank_down_proj": 61.989505767822266, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15017075836658478, "geo/layer_21/attn_entropy_mean": 5.777047157287598, "geo/layer_21/attn_entropy_std": 0.2882542312145233, "geo/layer_27/stable_rank_q_proj": 41.46710968017578, "geo/layer_27/stable_rank_k_proj": 31.862579345703125, "geo/layer_27/stable_rank_o_proj": 117.14478302001953, "geo/layer_27/stable_rank_gate_proj": 91.55638122558594, "geo/layer_27/stable_rank_down_proj": 140.5376739501953, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08061747997999191, "geo/layer_27/attn_entropy_mean": 4.415038108825684, "geo/layer_27/attn_entropy_std": 0.5416447520256042, "attnres/final_alpha/block_0": 0.24299198389053345, "attnres/block_norm/0": 1.5918302536010742, "attnres/final_alpha/block_1": 0.007066559977829456, "attnres/block_norm/1": 27496.498046875, "attnres/final_alpha/block_2": 0.014683997258543968, "attnres/block_norm/2": 19923.056640625, "attnres/final_alpha/block_3": 0.016792388632893562, "attnres/block_norm/3": 27248.56640625, "attnres/final_alpha/block_4": 0.0214258823543787, "attnres/block_norm/4": 8769.7822265625, "attnres/final_alpha/block_5": 0.5604711771011353, "attnres/block_norm/5": 4879.859375, "attnres/final_alpha/block_6": 0.13656803965568542, "attnres/block_norm/6": 18439.66015625, "geo/tier1_time_s": 1.357865810394287, "geo/step": 18675.0, "geo/rankme_slope": -4.729684842687075e-06} {"step": 18680, "timestamp": 1778214725.3262782, "train/loss": 2.2449542760848997, "train/z_loss": 0.001591631630435586, "train/perplexity": 9.439983913569217, "train/grad_norm": 0.1259765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1699460.1473870014, "perf/iters_per_sec": 0.8103657471594817, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2340106964111328, "data/tokens_consumed": 39176896512, "data/tokens_consumed_B": 39.176896512, "train/loss_slope": 2.1077387117603418e-06} {"step": 18690, "timestamp": 1778214735.6848016, "train/loss": 2.2005627393722533, "train/z_loss": 0.0015984019497409462, "train/perplexity": 9.030093659134247, "train/grad_norm": 0.2265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025855.273888084, "perf/iters_per_sec": 0.9660030717316075, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035193395614624, "data/tokens_consumed": 39197868032, "data/tokens_consumed_B": 39.197868032, "train/loss_slope": -1.5377213065773272e-06} {"step": 18700, "timestamp": 1778214746.0286083, "grad/layer_0/attn": 0.003018549643456936, "grad/layer_0/mlp": 0.0030167296063154936, "grad/layer_0/attn_mlp_ratio": 1.00060328147319, "grad/layer_4/attn": 0.0017261244356632233, "grad/layer_4/mlp": 0.00255763903260231, "grad/layer_4/attn_mlp_ratio": 0.6748897503405614, "grad/layer_8/attn": 0.005995063576847315, "grad/layer_8/mlp": 0.003965417854487896, "grad/layer_8/attn_mlp_ratio": 1.5118364938208722, "grad/layer_12/attn": 0.0050071366131305695, "grad/layer_12/mlp": 0.006624160334467888, "grad/layer_12/attn_mlp_ratio": 0.7558899973310789, "grad/layer_16/attn": 0.0041707404889166355, "grad/layer_16/mlp": 0.0045176115818321705, "grad/layer_16/attn_mlp_ratio": 0.9232180148837279, "grad/layer_20/attn": 0.008247884921729565, "grad/layer_20/mlp": 0.007168728392571211, "grad/layer_20/attn_mlp_ratio": 1.1505366579689353, "grad/layer_24/attn": 0.02149878442287445, "grad/layer_24/mlp": 0.013710197061300278, "grad/layer_24/attn_mlp_ratio": 1.5680871813834296, "grad/layer_27/attn": 0.008770158514380455, "grad/layer_27/mlp": 0.012982997111976147, "grad/layer_27/attn_mlp_ratio": 0.67551108354937} {"step": 18700, "timestamp": 1778214746.0443704, "train/loss": 2.229895257949829, "train/z_loss": 0.001600683748256415, "train/perplexity": 9.298892043456261, "train/grad_norm": 0.2138671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025525.0350630197, "perf/iters_per_sec": 0.9658456015887354, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03536217212677, "data/tokens_consumed": 39218839552, "data/tokens_consumed_B": 39.218839552, "train/loss_slope": -3.3608869071339654e-06} {"step": 18710, "timestamp": 1778214756.3998814, "train/loss": 2.199870228767395, "train/z_loss": 0.0015961229568347336, "train/perplexity": 9.023842388297934, "train/grad_norm": 0.1064453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026344.5555162546, "perf/iters_per_sec": 0.9662363793927453, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034943437576294, "data/tokens_consumed": 39239811072, "data/tokens_consumed_B": 39.239811072, "train/loss_slope": -4.34123026941972e-06} {"step": 18720, "timestamp": 1778214766.7591784, "train/loss": 2.268898439407349, "train/z_loss": 0.001571218017488718, "train/perplexity": 9.668744238085251, "train/grad_norm": 0.2373046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025708.6847559894, "perf/iters_per_sec": 0.965933172586436, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352683067321777, "data/tokens_consumed": 39260782592, "data/tokens_consumed_B": 39.260782592, "train/loss_slope": -7.303840091841475e-06} {"step": 18730, "timestamp": 1778214777.1175163, "train/loss": 2.207071900367737, "train/z_loss": 0.0015911784721538424, "train/perplexity": 9.089063707124769, "train/grad_norm": 0.1650390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026140.8150228783, "perf/iters_per_sec": 0.9661392283548729, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350475072860719, "data/tokens_consumed": 39281754112, "data/tokens_consumed_B": 39.281754112, "train/loss_slope": -9.545947680629237e-06} {"step": 18740, "timestamp": 1778214787.4728808, "train/loss": 2.245823073387146, "train/z_loss": 0.0015864921966567635, "train/perplexity": 9.448188909849788, "train/grad_norm": 0.1787109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026123.5001369393, "perf/iters_per_sec": 0.9661309719738671, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350563526153564, "data/tokens_consumed": 39302725632, "data/tokens_consumed_B": 39.302725632, "train/loss_slope": -9.482491055730909e-06} {"step": 18750, "timestamp": 1778214797.8210588, "grad/layer_0/attn": 0.002555828308686614, "grad/layer_0/mlp": 0.0024227795656770468, "grad/layer_0/attn_mlp_ratio": 1.0549157007112269, "grad/layer_4/attn": 0.0015331667382270098, "grad/layer_4/mlp": 0.002437946619465947, "grad/layer_4/attn_mlp_ratio": 0.6288762285021811, "grad/layer_8/attn": 0.0036930108908563852, "grad/layer_8/mlp": 0.003742686240002513, "grad/layer_8/attn_mlp_ratio": 0.9867273277444633, "grad/layer_12/attn": 0.0034602326340973377, "grad/layer_12/mlp": 0.005962786264717579, "grad/layer_12/attn_mlp_ratio": 0.5803046465947348, "grad/layer_16/attn": 0.0038913360331207514, "grad/layer_16/mlp": 0.004143184050917625, "grad/layer_16/attn_mlp_ratio": 0.939213873044698, "grad/layer_20/attn": 0.007246684283018112, "grad/layer_20/mlp": 0.005600443109869957, "grad/layer_20/attn_mlp_ratio": 1.2939483557742177, "grad/layer_24/attn": 0.008343786001205444, "grad/layer_24/mlp": 0.008978213183581829, "grad/layer_24/attn_mlp_ratio": 0.929337022597075, "grad/layer_27/attn": 0.0052152941934764385, "grad/layer_27/mlp": 0.008021984249353409, "grad/layer_27/attn_mlp_ratio": 0.6501252017397421} {"step": 18750, "timestamp": 1778214798.4319513, "eos/sharpness": 21.19679450988769, "eos/L0_probe": 2.0617964267730713, "eos/L_plus": 2.158411979675293, "eos/L_minus": 2.1771488189697266, "eos/grad_norm": 0.11044955253601074, "eos/embed_grad_frac": 0.21532396972179413, "eos/time_s": 0.6080083847045898} {"step": 18750, "timestamp": 1778214798.451624, "train/loss": 2.234937334060669, "train/z_loss": 0.001582149788737297, "train/perplexity": 9.34589616448345, "train/grad_norm": 0.1103515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1911389.9387218691, "perf/iters_per_sec": 0.9114217465981813, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0971868991851808, "data/tokens_consumed": 39323697152, "data/tokens_consumed_B": 39.323697152, "train/loss_slope": -8.750723118614635e-06} {"step": 18750, "timestamp": 1778214799.8173752, "geo/rankme_last": 440.71185302734375, "geo/layer_0/stable_rank_q_proj": 16.802370071411133, "geo/layer_0/stable_rank_k_proj": 14.880940437316895, "geo/layer_0/stable_rank_o_proj": 53.09962463378906, "geo/layer_0/stable_rank_gate_proj": 156.2710418701172, "geo/layer_0/stable_rank_down_proj": 49.744415283203125, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.043476544320583344, "geo/layer_0/attn_entropy_mean": 6.277107238769531, "geo/layer_0/attn_entropy_std": 0.31599190831184387, "geo/layer_7/stable_rank_q_proj": 42.9910774230957, "geo/layer_7/stable_rank_k_proj": 42.205345153808594, "geo/layer_7/stable_rank_o_proj": 110.45669555664062, "geo/layer_7/stable_rank_gate_proj": 105.92339324951172, "geo/layer_7/stable_rank_down_proj": 152.802001953125, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.538806676864624, "geo/layer_7/attn_entropy_mean": 4.616706371307373, "geo/layer_7/attn_entropy_std": 0.8291173577308655, "geo/layer_14/stable_rank_q_proj": 57.8830451965332, "geo/layer_14/stable_rank_k_proj": 35.2615852355957, "geo/layer_14/stable_rank_o_proj": 52.640716552734375, "geo/layer_14/stable_rank_gate_proj": 90.6059799194336, "geo/layer_14/stable_rank_down_proj": 137.1514892578125, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3876193165779114, "geo/layer_14/attn_entropy_mean": 5.51949405670166, "geo/layer_14/attn_entropy_std": 0.49767303466796875, "geo/layer_21/stable_rank_q_proj": 49.06016540527344, "geo/layer_21/stable_rank_k_proj": 31.91358757019043, "geo/layer_21/stable_rank_o_proj": 85.06546783447266, "geo/layer_21/stable_rank_gate_proj": 90.20074462890625, "geo/layer_21/stable_rank_down_proj": 61.97389221191406, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15665990114212036, "geo/layer_21/attn_entropy_mean": 5.760927677154541, "geo/layer_21/attn_entropy_std": 0.2902977168560028, "geo/layer_27/stable_rank_q_proj": 41.312191009521484, "geo/layer_27/stable_rank_k_proj": 31.665321350097656, "geo/layer_27/stable_rank_o_proj": 117.48405456542969, "geo/layer_27/stable_rank_gate_proj": 91.59307098388672, "geo/layer_27/stable_rank_down_proj": 140.33189392089844, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07367153465747833, "geo/layer_27/attn_entropy_mean": 4.3871989250183105, "geo/layer_27/attn_entropy_std": 0.5449698567390442, "attnres/final_alpha/block_0": 0.24475470185279846, "attnres/block_norm/0": 1.592686414718628, "attnres/final_alpha/block_1": 0.007149694487452507, "attnres/block_norm/1": 27510.013671875, "attnres/final_alpha/block_2": 0.014541303738951683, "attnres/block_norm/2": 19889.50390625, "attnres/final_alpha/block_3": 0.016614677384495735, "attnres/block_norm/3": 27371.685546875, "attnres/final_alpha/block_4": 0.02221979945898056, "attnres/block_norm/4": 8754.1572265625, "attnres/final_alpha/block_5": 0.5545154213905334, "attnres/block_norm/5": 4861.87890625, "attnres/final_alpha/block_6": 0.14020439982414246, "attnres/block_norm/6": 18424.220703125, "geo/tier1_time_s": 1.3615813255310059, "geo/step": 18750.0, "geo/rankme_slope": -2.1723376850740296e-05} {"step": 18760, "timestamp": 1778214810.7864926, "train/loss": 2.2119399309158325, "train/z_loss": 0.0015979093150235713, "train/perplexity": 9.133417416921626, "train/grad_norm": 0.2021484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1700759.7830421154, "perf/iters_per_sec": 0.8109854617319657, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2330677270889283, "data/tokens_consumed": 39344668672, "data/tokens_consumed_B": 39.344668672, "train/loss_slope": -9.587870250762549e-06} {"step": 18770, "timestamp": 1778214821.146152, "train/loss": 2.217204737663269, "train/z_loss": 0.0015866359346546233, "train/perplexity": 9.18162989789946, "train/grad_norm": 0.1689453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025403.4909951435, "perf/iters_per_sec": 0.9657876448608129, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354243040084838, "data/tokens_consumed": 39365640192, "data/tokens_consumed_B": 39.365640192, "train/loss_slope": -1.1259521581087233e-05} {"step": 18780, "timestamp": 1778214831.50884, "train/loss": 2.2720425844192507, "train/z_loss": 0.0015894999727606774, "train/perplexity": 9.699192013079724, "train/grad_norm": 0.2021484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024768.9563496795, "perf/iters_per_sec": 0.9654850751636884, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035748791694641, "data/tokens_consumed": 39386611712, "data/tokens_consumed_B": 39.386611712, "train/loss_slope": -9.418845627352947e-06} {"step": 18790, "timestamp": 1778214841.8605688, "train/loss": 2.2313937664031984, "train/z_loss": 0.0015995778259821237, "train/perplexity": 9.312836957466638, "train/grad_norm": 0.1826171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026927.4375274088, "perf/iters_per_sec": 0.9665143191945118, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346458196640014, "data/tokens_consumed": 39407583232, "data/tokens_consumed_B": 39.407583232, "train/loss_slope": -8.953446866941612e-06} {"step": 18800, "timestamp": 1778214852.2107894, "grad/layer_0/attn": 0.0033502280712127686, "grad/layer_0/mlp": 0.002962679136544466, "grad/layer_0/attn_mlp_ratio": 1.1308102577855583, "grad/layer_4/attn": 0.0016846108483150601, "grad/layer_4/mlp": 0.0026011017616838217, "grad/layer_4/attn_mlp_ratio": 0.6476527786668563, "grad/layer_8/attn": 0.009721067734062672, "grad/layer_8/mlp": 0.003765514586120844, "grad/layer_8/attn_mlp_ratio": 2.581603988929629, "grad/layer_12/attn": 0.005744763650000095, "grad/layer_12/mlp": 0.006375160533934832, "grad/layer_12/attn_mlp_ratio": 0.901116690208691, "grad/layer_16/attn": 0.0043768794275820255, "grad/layer_16/mlp": 0.004696236457675695, "grad/layer_16/attn_mlp_ratio": 0.9319972224202163, "grad/layer_20/attn": 0.007877440191805363, "grad/layer_20/mlp": 0.00691527035087347, "grad/layer_20/attn_mlp_ratio": 1.1391369647459502, "grad/layer_24/attn": 0.01846628449857235, "grad/layer_24/mlp": 0.013404689729213715, "grad/layer_24/attn_mlp_ratio": 1.3775987907103655, "grad/layer_27/attn": 0.008880515582859516, "grad/layer_27/mlp": 0.012345733121037483, "grad/layer_27/attn_mlp_ratio": 0.7193186037526603} {"step": 18800, "timestamp": 1778214852.226555, "train/loss": 2.240247058868408, "train/z_loss": 0.0015861601918004454, "train/perplexity": 9.395652279940323, "train/grad_norm": 0.205078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024174.085922934, "perf/iters_per_sec": 0.9652014188399001, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03605318069458, "data/tokens_consumed": 39428554752, "data/tokens_consumed_B": 39.428554752, "train/loss_slope": -5.286332804842672e-06} {"step": 18810, "timestamp": 1778214862.5786417, "train/loss": 2.232487440109253, "train/z_loss": 0.0015941411023959518, "train/perplexity": 9.323027734052205, "train/grad_norm": 0.1787109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026777.470852709, "perf/iters_per_sec": 0.9664428095115227, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034722375869751, "data/tokens_consumed": 39449526272, "data/tokens_consumed_B": 39.449526272, "train/loss_slope": -4.845648944968049e-06} {"step": 18820, "timestamp": 1778214872.926541, "train/loss": 2.2225843906402587, "train/z_loss": 0.0015906915650703013, "train/perplexity": 9.231156980326018, "train/grad_norm": 0.2216796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027658.8605180245, "perf/iters_per_sec": 0.9668630888548014, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034272599220276, "data/tokens_consumed": 39470497792, "data/tokens_consumed_B": 39.470497792, "train/loss_slope": -9.181839261654527e-07} {"step": 18825, "timestamp": 1778214878.7082121, "eos/sharpness": 15.450286865234371, "eos/L0_probe": 2.061187267303467, "eos/L_plus": 2.1480376720428467, "eos/L_minus": 2.1288397312164307, "eos/grad_norm": 0.11116960644721985, "eos/embed_grad_frac": 0.19847415387630463, "eos/time_s": 0.6128330230712891} {"step": 18825, "timestamp": 1778214880.087594, "geo/rankme_last": 441.91009521484375, "geo/layer_0/stable_rank_q_proj": 16.779951095581055, "geo/layer_0/stable_rank_k_proj": 14.919123649597168, "geo/layer_0/stable_rank_o_proj": 53.10285186767578, "geo/layer_0/stable_rank_gate_proj": 155.96510314941406, "geo/layer_0/stable_rank_down_proj": 49.819374084472656, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.04718794301152229, "geo/layer_0/attn_entropy_mean": 6.273496627807617, "geo/layer_0/attn_entropy_std": 0.31477272510528564, "geo/layer_7/stable_rank_q_proj": 42.870914459228516, "geo/layer_7/stable_rank_k_proj": 42.10953140258789, "geo/layer_7/stable_rank_o_proj": 110.5321273803711, "geo/layer_7/stable_rank_gate_proj": 106.22297668457031, "geo/layer_7/stable_rank_down_proj": 152.42189025878906, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5690776109695435, "geo/layer_7/attn_entropy_mean": 4.625225067138672, "geo/layer_7/attn_entropy_std": 0.8634788393974304, "geo/layer_14/stable_rank_q_proj": 58.03194046020508, "geo/layer_14/stable_rank_k_proj": 35.271427154541016, "geo/layer_14/stable_rank_o_proj": 52.741241455078125, "geo/layer_14/stable_rank_gate_proj": 90.5272216796875, "geo/layer_14/stable_rank_down_proj": 136.9221649169922, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3817816376686096, "geo/layer_14/attn_entropy_mean": 5.510500431060791, "geo/layer_14/attn_entropy_std": 0.49519142508506775, "geo/layer_21/stable_rank_q_proj": 49.02336120605469, "geo/layer_21/stable_rank_k_proj": 31.810218811035156, "geo/layer_21/stable_rank_o_proj": 84.91696166992188, "geo/layer_21/stable_rank_gate_proj": 90.0916519165039, "geo/layer_21/stable_rank_down_proj": 62.066680908203125, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15311561524868011, "geo/layer_21/attn_entropy_mean": 5.758148193359375, "geo/layer_21/attn_entropy_std": 0.29730692505836487, "geo/layer_27/stable_rank_q_proj": 41.34579849243164, "geo/layer_27/stable_rank_k_proj": 31.76247787475586, "geo/layer_27/stable_rank_o_proj": 117.18451690673828, "geo/layer_27/stable_rank_gate_proj": 91.50402069091797, "geo/layer_27/stable_rank_down_proj": 140.29812622070312, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0723513662815094, "geo/layer_27/attn_entropy_mean": 4.406108379364014, "geo/layer_27/attn_entropy_std": 0.5407087802886963, "attnres/final_alpha/block_0": 0.2437264621257782, "attnres/block_norm/0": 1.5936241149902344, "attnres/final_alpha/block_1": 0.007032879162579775, "attnres/block_norm/1": 27596.45703125, "attnres/final_alpha/block_2": 0.014429588802158833, "attnres/block_norm/2": 20021.8828125, "attnres/final_alpha/block_3": 0.016640890389680862, "attnres/block_norm/3": 27416.78515625, "attnres/final_alpha/block_4": 0.021731648594141006, "attnres/block_norm/4": 8778.955078125, "attnres/final_alpha/block_5": 0.5579710006713867, "attnres/block_norm/5": 4869.3134765625, "attnres/final_alpha/block_6": 0.13846753537654877, "attnres/block_norm/6": 18518.5546875, "geo/tier1_time_s": 1.3595349788665771, "geo/step": 18825.0, "geo/rankme_slope": 1.2770498824529813e-05} {"step": 18830, "timestamp": 1778214885.264444, "train/loss": 2.219364047050476, "train/z_loss": 0.0015908065252006054, "train/perplexity": 9.201477298154964, "train/grad_norm": 0.14453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1700593.401873328, "perf/iters_per_sec": 0.8109061250082626, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2331883668899537, "data/tokens_consumed": 39491469312, "data/tokens_consumed_B": 39.491469312, "train/loss_slope": -4.12855693394811e-06} {"step": 18840, "timestamp": 1778214895.6105382, "train/loss": 2.2540655851364138, "train/z_loss": 0.0015761416754685343, "train/perplexity": 9.526387551833565, "train/grad_norm": 0.1767578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027898.6712926982, "perf/iters_per_sec": 0.9669774395431033, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341502904891968, "data/tokens_consumed": 39512440832, "data/tokens_consumed_B": 39.512440832, "train/loss_slope": -4.770608052741017e-06} {"step": 18850, "timestamp": 1778214905.9549775, "grad/layer_0/attn": 0.0027144199702888727, "grad/layer_0/mlp": 0.0028008988592773676, "grad/layer_0/attn_mlp_ratio": 0.9691245595625462, "grad/layer_4/attn": 0.0019542891532182693, "grad/layer_4/mlp": 0.0024765324778854847, "grad/layer_4/attn_mlp_ratio": 0.7891231355764688, "grad/layer_8/attn": 0.005833127535879612, "grad/layer_8/mlp": 0.003749583614990115, "grad/layer_8/attn_mlp_ratio": 1.5556733705024077, "grad/layer_12/attn": 0.0035975645296275616, "grad/layer_12/mlp": 0.005764726083725691, "grad/layer_12/attn_mlp_ratio": 0.624065118614617, "grad/layer_16/attn": 0.004687493667006493, "grad/layer_16/mlp": 0.004661549814045429, "grad/layer_16/attn_mlp_ratio": 1.005565477885991, "grad/layer_20/attn": 0.011136007495224476, "grad/layer_20/mlp": 0.006684896536171436, "grad/layer_20/attn_mlp_ratio": 1.6658458763548323, "grad/layer_24/attn": 0.018396563827991486, "grad/layer_24/mlp": 0.015519637614488602, "grad/layer_24/attn_mlp_ratio": 1.1853732778064197, "grad/layer_27/attn": 0.0060747042298316956, "grad/layer_27/mlp": 0.014637229964137077, "grad/layer_27/attn_mlp_ratio": 0.4150173361499202} {"step": 18850, "timestamp": 1778214905.9706335, "train/loss": 2.2552277088165282, "train/z_loss": 0.0015780541463755072, "train/perplexity": 9.53746482772913, "train/grad_norm": 0.1982421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025605.8237642548, "perf/iters_per_sec": 0.9658841246434473, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0353208780288696, "data/tokens_consumed": 39533412352, "data/tokens_consumed_B": 39.533412352, "train/loss_slope": -3.898716087352786e-06} {"step": 18860, "timestamp": 1778214916.3230784, "train/loss": 2.2345879077911377, "train/z_loss": 0.001592706807423383, "train/perplexity": 9.34263103334579, "train/grad_norm": 0.134765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026856.5380278483, "perf/iters_per_sec": 0.9664805116786234, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346820116043092, "data/tokens_consumed": 39554383872, "data/tokens_consumed_B": 39.554383872, "train/loss_slope": -5.291679776040388e-06} {"step": 18870, "timestamp": 1778214926.6741853, "train/loss": 2.220731019973755, "train/z_loss": 0.0015957191702909768, "train/perplexity": 9.214064069403465, "train/grad_norm": 0.1494140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027029.8255314059, "perf/iters_per_sec": 0.9665631415993718, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345935583114625, "data/tokens_consumed": 39575355392, "data/tokens_consumed_B": 39.575355392, "train/loss_slope": -5.669843403026264e-06} {"step": 18880, "timestamp": 1778214937.029815, "train/loss": 2.2292179822921754, "train/z_loss": 0.0015978292329236865, "train/perplexity": 9.292596262462343, "train/grad_norm": 0.1015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026283.5457605936, "perf/iters_per_sec": 0.9662072876742333, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349745988845824, "data/tokens_consumed": 39596326912, "data/tokens_consumed_B": 39.596326912, "train/loss_slope": -7.001373870144628e-06} {"step": 18890, "timestamp": 1778214947.3813424, "train/loss": 2.2449265480041505, "train/z_loss": 0.0015829253941774368, "train/perplexity": 9.439722164561903, "train/grad_norm": 0.185546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027189.6873387971, "perf/iters_per_sec": 0.9666393696493135, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034511971473694, "data/tokens_consumed": 39617298432, "data/tokens_consumed_B": 39.617298432, "train/loss_slope": -9.28227168248294e-06} {"step": 18900, "timestamp": 1778214958.1599076, "grad/layer_0/attn": 0.002664410276338458, "grad/layer_0/mlp": 0.002615257166326046, "grad/layer_0/attn_mlp_ratio": 1.0187947131034887, "grad/layer_4/attn": 0.00170119886752218, "grad/layer_4/mlp": 0.002541028195992112, "grad/layer_4/attn_mlp_ratio": 0.6694922957786146, "grad/layer_8/attn": 0.008557507768273354, "grad/layer_8/mlp": 0.0038134579081088305, "grad/layer_8/attn_mlp_ratio": 2.24402832024816, "grad/layer_12/attn": 0.004359993152320385, "grad/layer_12/mlp": 0.006509528495371342, "grad/layer_12/attn_mlp_ratio": 0.669786312240889, "grad/layer_16/attn": 0.005971272476017475, "grad/layer_16/mlp": 0.004527853336185217, "grad/layer_16/attn_mlp_ratio": 1.3187866083069069, "grad/layer_20/attn": 0.003948777448385954, "grad/layer_20/mlp": 0.006103029940277338, "grad/layer_20/attn_mlp_ratio": 0.6470191728249319, "grad/layer_24/attn": 0.008020578883588314, "grad/layer_24/mlp": 0.009660685434937477, "grad/layer_24/attn_mlp_ratio": 0.8302287508047143, "grad/layer_27/attn": 0.011873329989612103, "grad/layer_27/mlp": 0.008837898261845112, "grad/layer_27/attn_mlp_ratio": 1.3434562724630923} {"step": 18900, "timestamp": 1778214958.7653577, "eos/sharpness": 57.241916656494126, "eos/L0_probe": 2.0608890056610107, "eos/L_plus": 2.4483795166015625, "eos/L_minus": 2.2458176612854004, "eos/grad_norm": 0.1752258688211441, "eos/embed_grad_frac": 0.07828289270401001, "eos/time_s": 0.6022963523864746} {"step": 18900, "timestamp": 1778214958.786114, "train/loss": 2.2544785499572755, "train/z_loss": 0.0015821476350538432, "train/perplexity": 9.530322427188995, "train/grad_norm": 0.1748046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1840022.4160322412, "perf/iters_per_sec": 0.8773910598908621, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.139742636680603, "data/tokens_consumed": 39638269952, "data/tokens_consumed_B": 39.638269952, "train/loss_slope": -6.860873078999537e-06} {"step": 18900, "timestamp": 1778214960.1581151, "geo/rankme_last": 440.6109924316406, "geo/layer_0/stable_rank_q_proj": 16.79633140563965, "geo/layer_0/stable_rank_k_proj": 14.951894760131836, "geo/layer_0/stable_rank_o_proj": 53.00883865356445, "geo/layer_0/stable_rank_gate_proj": 156.07965087890625, "geo/layer_0/stable_rank_down_proj": 49.890357971191406, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.044098179787397385, "geo/layer_0/attn_entropy_mean": 6.275210380554199, "geo/layer_0/attn_entropy_std": 0.31299296021461487, "geo/layer_7/stable_rank_q_proj": 42.83327865600586, "geo/layer_7/stable_rank_k_proj": 42.26725387573242, "geo/layer_7/stable_rank_o_proj": 110.66314697265625, "geo/layer_7/stable_rank_gate_proj": 105.82801818847656, "geo/layer_7/stable_rank_down_proj": 152.0648193359375, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5629763603210449, "geo/layer_7/attn_entropy_mean": 4.670758247375488, "geo/layer_7/attn_entropy_std": 0.8493112921714783, "geo/layer_14/stable_rank_q_proj": 57.996131896972656, "geo/layer_14/stable_rank_k_proj": 35.33633041381836, "geo/layer_14/stable_rank_o_proj": 52.897579193115234, "geo/layer_14/stable_rank_gate_proj": 90.27884674072266, "geo/layer_14/stable_rank_down_proj": 136.7633056640625, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38082313537597656, "geo/layer_14/attn_entropy_mean": 5.539854049682617, "geo/layer_14/attn_entropy_std": 0.5097361207008362, "geo/layer_21/stable_rank_q_proj": 49.013282775878906, "geo/layer_21/stable_rank_k_proj": 31.926179885864258, "geo/layer_21/stable_rank_o_proj": 84.9557876586914, "geo/layer_21/stable_rank_gate_proj": 89.83321380615234, "geo/layer_21/stable_rank_down_proj": 62.061519622802734, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15627902746200562, "geo/layer_21/attn_entropy_mean": 5.7656049728393555, "geo/layer_21/attn_entropy_std": 0.2898353338241577, "geo/layer_27/stable_rank_q_proj": 41.51657485961914, "geo/layer_27/stable_rank_k_proj": 31.697385787963867, "geo/layer_27/stable_rank_o_proj": 117.18230438232422, "geo/layer_27/stable_rank_gate_proj": 91.7337417602539, "geo/layer_27/stable_rank_down_proj": 139.75828552246094, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07938238233327866, "geo/layer_27/attn_entropy_mean": 4.379215240478516, "geo/layer_27/attn_entropy_std": 0.5517678260803223, "attnres/final_alpha/block_0": 0.24418529868125916, "attnres/block_norm/0": 1.5945909023284912, "attnres/final_alpha/block_1": 0.007095432374626398, "attnres/block_norm/1": 27713.28125, "attnres/final_alpha/block_2": 0.014661076478660107, "attnres/block_norm/2": 19911.2421875, "attnres/final_alpha/block_3": 0.016609933227300644, "attnres/block_norm/3": 27275.625, "attnres/final_alpha/block_4": 0.02155955322086811, "attnres/block_norm/4": 8889.193359375, "attnres/final_alpha/block_5": 0.5583295822143555, "attnres/block_norm/5": 4903.72607421875, "attnres/final_alpha/block_6": 0.13755910098552704, "attnres/block_norm/6": 18694.3203125, "geo/tier1_time_s": 1.3675639629364014, "geo/step": 18900.0, "geo/rankme_slope": 4.225283863545416e-06} {"step": 18910, "timestamp": 1778214970.509349, "train/loss": 2.230237674713135, "train/z_loss": 0.00159103226615116, "train/perplexity": 9.302076685178351, "train/grad_norm": 0.162109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1789545.7499384875, "perf/iters_per_sec": 0.8533219098751486, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1718906879425048, "data/tokens_consumed": 39659241472, "data/tokens_consumed_B": 39.659241472, "train/loss_slope": -8.879644263445254e-06} {"step": 18920, "timestamp": 1778214980.8623474, "train/loss": 2.2291779041290285, "train/z_loss": 0.0015787160024046897, "train/perplexity": 9.292223839736337, "train/grad_norm": 0.13671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026704.5738611836, "perf/iters_per_sec": 0.9664080495172422, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347595930099487, "data/tokens_consumed": 39680212992, "data/tokens_consumed_B": 39.680212992, "train/loss_slope": -1.1716547779159969e-05} {"step": 18930, "timestamp": 1778214991.214049, "train/loss": 2.2224722146987914, "train/z_loss": 0.001596954243723303, "train/perplexity": 9.230121524678609, "train/grad_norm": 0.27734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027130.8690897522, "perf/iters_per_sec": 0.9666113229225884, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345419883728026, "data/tokens_consumed": 39701184512, "data/tokens_consumed_B": 39.701184512, "train/loss_slope": -1.4292525114900139e-05} {"step": 18940, "timestamp": 1778215001.563779, "train/loss": 2.229771089553833, "train/z_loss": 0.0015939968405291438, "train/perplexity": 9.297737486627904, "train/grad_norm": 0.150390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027203.3762692127, "perf/iters_per_sec": 0.9666458970399917, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345049858093263, "data/tokens_consumed": 39722156032, "data/tokens_consumed_B": 39.722156032, "train/loss_slope": -9.407073801214063e-06} {"step": 18950, "timestamp": 1778215011.9035294, "grad/layer_0/attn": 0.003978648688644171, "grad/layer_0/mlp": 0.0030033153016120195, "grad/layer_0/attn_mlp_ratio": 1.3247522010204604, "grad/layer_4/attn": 0.0024552298709750175, "grad/layer_4/mlp": 0.0027181662153452635, "grad/layer_4/attn_mlp_ratio": 0.9032669771213585, "grad/layer_8/attn": 0.005497921258211136, "grad/layer_8/mlp": 0.003873545676469803, "grad/layer_8/attn_mlp_ratio": 1.4193510482330545, "grad/layer_12/attn": 0.006896629463881254, "grad/layer_12/mlp": 0.0059287757612764835, "grad/layer_12/attn_mlp_ratio": 1.1632467857195041, "grad/layer_16/attn": 0.0057531483471393585, "grad/layer_16/mlp": 0.004934502765536308, "grad/layer_16/attn_mlp_ratio": 1.1659023216544582, "grad/layer_20/attn": 0.007753455545753241, "grad/layer_20/mlp": 0.006151963025331497, "grad/layer_20/attn_mlp_ratio": 1.2603221748562492, "grad/layer_24/attn": 0.012149211019277573, "grad/layer_24/mlp": 0.014690721407532692, "grad/layer_24/attn_mlp_ratio": 0.8269989335137858, "grad/layer_27/attn": 0.008084286004304886, "grad/layer_27/mlp": 0.012949670664966106, "grad/layer_27/attn_mlp_ratio": 0.6242850610670367} {"step": 18950, "timestamp": 1778215011.9194868, "train/loss": 2.228179597854614, "train/z_loss": 0.0015891329501755535, "train/perplexity": 9.282951983220173, "train/grad_norm": 0.154296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026157.0567482214, "perf/iters_per_sec": 0.9661469730130298, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035039210319519, "data/tokens_consumed": 39743127552, "data/tokens_consumed_B": 39.743127552, "train/loss_slope": -8.034605039979424e-06} {"step": 18960, "timestamp": 1778215022.2769043, "train/loss": 2.280918502807617, "train/z_loss": 0.0015779771492816508, "train/perplexity": 9.78566444323271, "train/grad_norm": 0.2314453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026225.8069349725, "perf/iters_per_sec": 0.9661797556567061, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350040912628173, "data/tokens_consumed": 39764099072, "data/tokens_consumed_B": 39.764099072, "train/loss_slope": -5.361460351815231e-06} {"step": 18970, "timestamp": 1778215032.634788, "train/loss": 2.25100462436676, "train/z_loss": 0.0015871333773247898, "train/perplexity": 9.49727223641298, "train/grad_norm": 0.09033203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025851.774536874, "perf/iters_per_sec": 0.9660014031109209, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351951837539672, "data/tokens_consumed": 39785070592, "data/tokens_consumed_B": 39.785070592, "train/loss_slope": -4.944135275038764e-06} {"step": 18975, "timestamp": 1778215038.4189758, "eos/sharpness": 30.147385597228997, "eos/L0_probe": 2.0580594539642334, "eos/L_plus": 2.227550983428955, "eos/L_minus": 2.1900417804718018, "eos/grad_norm": 0.23948343098163605, "eos/embed_grad_frac": 0.07580612599849701, "eos/time_s": 0.6162006855010986} {"step": 18975, "timestamp": 1778215039.798913, "geo/rankme_last": 440.5494384765625, "geo/layer_0/stable_rank_q_proj": 16.813005447387695, "geo/layer_0/stable_rank_k_proj": 14.995532035827637, "geo/layer_0/stable_rank_o_proj": 52.79289627075195, "geo/layer_0/stable_rank_gate_proj": 155.97573852539062, "geo/layer_0/stable_rank_down_proj": 49.87713623046875, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.04643320292234421, "geo/layer_0/attn_entropy_mean": 6.278188705444336, "geo/layer_0/attn_entropy_std": 0.31571975350379944, "geo/layer_7/stable_rank_q_proj": 42.840301513671875, "geo/layer_7/stable_rank_k_proj": 42.367942810058594, "geo/layer_7/stable_rank_o_proj": 110.56462860107422, "geo/layer_7/stable_rank_gate_proj": 105.8797607421875, "geo/layer_7/stable_rank_down_proj": 151.8834686279297, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5567165613174438, "geo/layer_7/attn_entropy_mean": 4.657044410705566, "geo/layer_7/attn_entropy_std": 0.8759773969650269, "geo/layer_14/stable_rank_q_proj": 57.87453079223633, "geo/layer_14/stable_rank_k_proj": 35.31188201904297, "geo/layer_14/stable_rank_o_proj": 52.965492248535156, "geo/layer_14/stable_rank_gate_proj": 90.1868896484375, "geo/layer_14/stable_rank_down_proj": 136.59103393554688, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3683891296386719, "geo/layer_14/attn_entropy_mean": 5.548320770263672, "geo/layer_14/attn_entropy_std": 0.4783831536769867, "geo/layer_21/stable_rank_q_proj": 49.06642150878906, "geo/layer_21/stable_rank_k_proj": 32.0590934753418, "geo/layer_21/stable_rank_o_proj": 84.95975494384766, "geo/layer_21/stable_rank_gate_proj": 89.7622299194336, "geo/layer_21/stable_rank_down_proj": 61.97062683105469, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15181195735931396, "geo/layer_21/attn_entropy_mean": 5.753219127655029, "geo/layer_21/attn_entropy_std": 0.29054591059684753, "geo/layer_27/stable_rank_q_proj": 41.48590850830078, "geo/layer_27/stable_rank_k_proj": 31.621938705444336, "geo/layer_27/stable_rank_o_proj": 117.28319549560547, "geo/layer_27/stable_rank_gate_proj": 91.9256362915039, "geo/layer_27/stable_rank_down_proj": 139.8700714111328, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07653643190860748, "geo/layer_27/attn_entropy_mean": 4.394923686981201, "geo/layer_27/attn_entropy_std": 0.5475684404373169, "attnres/final_alpha/block_0": 0.24408486485481262, "attnres/block_norm/0": 1.5955654382705688, "attnres/final_alpha/block_1": 0.007223028223961592, "attnres/block_norm/1": 27647.69140625, "attnres/final_alpha/block_2": 0.014810158871114254, "attnres/block_norm/2": 20008.15625, "attnres/final_alpha/block_3": 0.016786005347967148, "attnres/block_norm/3": 27639.103515625, "attnres/final_alpha/block_4": 0.021934006363153458, "attnres/block_norm/4": 8820.82421875, "attnres/final_alpha/block_5": 0.5550514459609985, "attnres/block_norm/5": 4905.6474609375, "attnres/final_alpha/block_6": 0.14011046290397644, "attnres/block_norm/6": 18750.875, "geo/tier1_time_s": 1.3598155975341797, "geo/step": 18975.0, "geo/rankme_slope": -2.3264540191076433e-05} {"step": 18980, "timestamp": 1778215045.0070164, "train/loss": 2.2292314767837524, "train/z_loss": 0.0015910489135421812, "train/perplexity": 9.292721662170436, "train/grad_norm": 0.1025390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1695801.2998052258, "perf/iters_per_sec": 0.8086210726762895, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2366731882095336, "data/tokens_consumed": 39806042112, "data/tokens_consumed_B": 39.806042112, "train/loss_slope": -6.614180767174961e-06} {"step": 18990, "timestamp": 1778215055.360051, "train/loss": 2.237088513374329, "train/z_loss": 0.0015906201326288283, "train/perplexity": 9.366022502900192, "train/grad_norm": 0.1455078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026651.6672853772, "perf/iters_per_sec": 0.9663828216959844, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347866058349608, "data/tokens_consumed": 39827013632, "data/tokens_consumed_B": 39.827013632, "train/loss_slope": -8.819429830785244e-06} {"step": 19000, "timestamp": 1778215065.6999934, "grad/layer_0/attn": 0.0023625330068171024, "grad/layer_0/mlp": 0.002470223233103752, "grad/layer_0/attn_mlp_ratio": 0.9564046194352231, "grad/layer_4/attn": 0.001560668577440083, "grad/layer_4/mlp": 0.0025940011255443096, "grad/layer_4/attn_mlp_ratio": 0.6016452737460067, "grad/layer_8/attn": 0.006452180910855532, "grad/layer_8/mlp": 0.003948891069740057, "grad/layer_8/attn_mlp_ratio": 1.6339221906893573, "grad/layer_12/attn": 0.005773706827312708, "grad/layer_12/mlp": 0.005819512996822596, "grad/layer_12/attn_mlp_ratio": 0.9921288484538514, "grad/layer_16/attn": 0.005532481707632542, "grad/layer_16/mlp": 0.0046401661820709705, "grad/layer_16/attn_mlp_ratio": 1.1923024674803935, "grad/layer_20/attn": 0.004000541288405657, "grad/layer_20/mlp": 0.006053349003195763, "grad/layer_20/attn_mlp_ratio": 0.6608806497371248, "grad/layer_24/attn": 0.008431087248027325, "grad/layer_24/mlp": 0.00989533495157957, "grad/layer_24/attn_mlp_ratio": 0.8520264553024396, "grad/layer_27/attn": 0.005052516236901283, "grad/layer_27/mlp": 0.009166049771010876, "grad/layer_27/attn_mlp_ratio": 0.5512206793550937} {"step": 19000, "timestamp": 1778215065.7157612, "train/loss": 2.2306719541549684, "train/z_loss": 0.0015774478670209646, "train/perplexity": 9.306117263155553, "train/grad_norm": 0.11962890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026498.2865161349, "perf/iters_per_sec": 0.9663096840458559, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348649263381957, "data/tokens_consumed": 39847985152, "data/tokens_consumed_B": 39.847985152, "train/loss_slope": -7.475819403153116e-06} {"step": 19000, "timestamp": 1778215072.776292, "geo/ww_alpha_mean": 8.174470423193608, "geo/ww_alpha_std": 5.3520015274305655, "geo/ww_alpha_min": 1.3351964399929592, "geo/ww_alpha_max": 39.02537111983402, "geo/ww_alpha_healthy_frac": 0.16751269035532995, "geo/ww_alpha_by_type/q_proj": 4.218629375712802, "geo/ww_alpha_by_type/k_proj": 4.649644235800251, "geo/ww_alpha_by_type/v_proj": 7.9028375842928495, "geo/ww_alpha_by_type/o_proj": 8.588528345349776, "geo/ww_alpha_by_type/gate_proj": 8.825024408843408, "geo/ww_alpha_by_type/up_proj": 13.810990866021191, "geo/ww_alpha_by_type/down_proj": 9.358862646858743, "geo/twonn_id/layer_0": 0.7326406240463257, "geo/twonn_id/layer_7": 3.535463333129883, "geo/twonn_id/layer_14": 4.181174278259277, "geo/twonn_id/layer_21": 6.386610507965088, "geo/twonn_id/layer_27": 5.4793829917907715, "geo/tier2_time_s": 7.054251670837402} {"step": 19000, "timestamp": 1778215073.3678188, "eoc/jacobian_sigma/layer_0/attn": 725.9598388671875, "eoc/jacobian_sigma/layer_0/mlp": 5540.384765625, "eoc/jacobian_sigma/layer_0": 5540.384765625, "eoc/jacobian_sigma/layer_7/attn": 1.1612420082092285, "eoc/jacobian_sigma/layer_7/mlp": 1.6108577251434326, "eoc/jacobian_sigma/layer_7": 1.6108577251434326, "eoc/jacobian_sigma/layer_14/attn": 1.6368399858474731, "eoc/jacobian_sigma/layer_14/mlp": 7.409372329711914, "eoc/jacobian_sigma/layer_14": 7.409372329711914, "eoc/jacobian_sigma/layer_21/attn": 1.0866321325302124, "eoc/jacobian_sigma/layer_21/mlp": 4.120837211608887, "eoc/jacobian_sigma/layer_21": 4.120837211608887, "eoc/jacobian_sigma/layer_27/attn": 4.364462375640869, "eoc/jacobian_sigma/layer_27/mlp": 22.45462417602539, "eoc/jacobian_sigma/layer_27": 22.45462417602539, "eoc/layer0_sigma": 5540.384765625, "eoc/sigma_max": 22.45462417602539, "eoc/sigma_min": 1.6108577251434326, "eoc/sigma_mean": 8.898922860622406, "eoc/time_s": 0.5839278697967529} {"step": 19010, "timestamp": 1778215083.7394214, "train/loss": 2.229346013069153, "train/z_loss": 0.0015809074393473566, "train/perplexity": 9.293786076946775, "train/grad_norm": 0.2177734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1163874.9605837457, "perf/iters_per_sec": 0.5549788287085274, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.8018705368041992, "data/tokens_consumed": 39868956672, "data/tokens_consumed_B": 39.868956672, "train/loss_slope": -7.497745400035935e-06} {"step": 19020, "timestamp": 1778215094.0910258, "train/loss": 2.251136827468872, "train/z_loss": 0.00158998939441517, "train/perplexity": 9.49852788826294, "train/grad_norm": 0.1162109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026997.3610617004, "perf/iters_per_sec": 0.9665476613338949, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03461012840271, "data/tokens_consumed": 39889928192, "data/tokens_consumed_B": 39.889928192, "train/loss_slope": -9.519670305043245e-06} {"step": 19030, "timestamp": 1778215104.4415774, "train/loss": 2.223309803009033, "train/z_loss": 0.0015881602186709643, "train/perplexity": 9.237855805189094, "train/grad_norm": 0.1748046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027000.0235761853, "perf/iters_per_sec": 0.9665489309197356, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346087694168091, "data/tokens_consumed": 39910899712, "data/tokens_consumed_B": 39.910899712, "train/loss_slope": -1.0418185277847913e-05} {"step": 19040, "timestamp": 1778215114.7930412, "train/loss": 2.181289267539978, "train/z_loss": 0.001602945604827255, "train/perplexity": 8.857718869602488, "train/grad_norm": 0.1201171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026978.6302364357, "perf/iters_per_sec": 0.9665387297804049, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034619688987732, "data/tokens_consumed": 39931871232, "data/tokens_consumed_B": 39.931871232, "train/loss_slope": -1.3272478342270918e-05} {"step": 19050, "timestamp": 1778215125.1452386, "grad/layer_0/attn": 0.0033441209234297276, "grad/layer_0/mlp": 0.0029869298450648785, "grad/layer_0/attn_mlp_ratio": 1.1195846521124522, "grad/layer_4/attn": 0.001822741818614304, "grad/layer_4/mlp": 0.002514963736757636, "grad/layer_4/attn_mlp_ratio": 0.7247586593389094, "grad/layer_8/attn": 0.007969082333147526, "grad/layer_8/mlp": 0.003807129105553031, "grad/layer_8/attn_mlp_ratio": 2.0931998634361877, "grad/layer_12/attn": 0.005922388285398483, "grad/layer_12/mlp": 0.006314841564744711, "grad/layer_12/attn_mlp_ratio": 0.9378522217687159, "grad/layer_16/attn": 0.005674589890986681, "grad/layer_16/mlp": 0.0050552235916256905, "grad/layer_16/attn_mlp_ratio": 1.122520038111669, "grad/layer_20/attn": 0.006713492795825005, "grad/layer_20/mlp": 0.007140701171010733, "grad/layer_20/attn_mlp_ratio": 0.940172756292148, "grad/layer_24/attn": 0.02343534491956234, "grad/layer_24/mlp": 0.01791759580373764, "grad/layer_24/attn_mlp_ratio": 1.307951415216016, "grad/layer_27/attn": 0.008401221595704556, "grad/layer_27/mlp": 0.016319947317242622, "grad/layer_27/attn_mlp_ratio": 0.514782393650874} {"step": 19050, "timestamp": 1778215125.7751985, "eos/sharpness": 38.96582126617431, "eos/L0_probe": 2.0613484382629395, "eos/L_plus": 2.2770352363586426, "eos/L_minus": 2.2353198528289795, "eos/grad_norm": 0.22817987203598022, "eos/embed_grad_frac": 0.06438309699296951, "eos/time_s": 0.6271450519561768} {"step": 19050, "timestamp": 1778215125.7949238, "train/loss": 2.204661011695862, "train/z_loss": 0.001584491308312863, "train/perplexity": 9.067177379745662, "train/grad_norm": 0.228515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1907048.6692076079, "perf/iters_per_sec": 0.9093516679800071, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0996845722198487, "data/tokens_consumed": 39952842752, "data/tokens_consumed_B": 39.952842752, "train/loss_slope": -1.6636178130828875e-05} {"step": 19050, "timestamp": 1778215127.158712, "geo/rankme_last": 441.13360595703125, "geo/layer_0/stable_rank_q_proj": 16.823251724243164, "geo/layer_0/stable_rank_k_proj": 14.956148147583008, "geo/layer_0/stable_rank_o_proj": 52.79832458496094, "geo/layer_0/stable_rank_gate_proj": 155.57896423339844, "geo/layer_0/stable_rank_down_proj": 50.012733459472656, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.04699878767132759, "geo/layer_0/attn_entropy_mean": 6.279057502746582, "geo/layer_0/attn_entropy_std": 0.31441664695739746, "geo/layer_7/stable_rank_q_proj": 42.64970016479492, "geo/layer_7/stable_rank_k_proj": 42.2917594909668, "geo/layer_7/stable_rank_o_proj": 110.55079650878906, "geo/layer_7/stable_rank_gate_proj": 105.7436294555664, "geo/layer_7/stable_rank_down_proj": 151.8062286376953, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5459648370742798, "geo/layer_7/attn_entropy_mean": 4.645423412322998, "geo/layer_7/attn_entropy_std": 0.8433202505111694, "geo/layer_14/stable_rank_q_proj": 58.00247573852539, "geo/layer_14/stable_rank_k_proj": 35.1348876953125, "geo/layer_14/stable_rank_o_proj": 52.9769172668457, "geo/layer_14/stable_rank_gate_proj": 89.8845443725586, "geo/layer_14/stable_rank_down_proj": 136.6905059814453, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3646690547466278, "geo/layer_14/attn_entropy_mean": 5.517490386962891, "geo/layer_14/attn_entropy_std": 0.5076876878738403, "geo/layer_21/stable_rank_q_proj": 48.93033981323242, "geo/layer_21/stable_rank_k_proj": 32.071556091308594, "geo/layer_21/stable_rank_o_proj": 85.11919403076172, "geo/layer_21/stable_rank_gate_proj": 89.80317687988281, "geo/layer_21/stable_rank_down_proj": 61.87946319580078, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15251792967319489, "geo/layer_21/attn_entropy_mean": 5.7420854568481445, "geo/layer_21/attn_entropy_std": 0.29614385962486267, "geo/layer_27/stable_rank_q_proj": 41.465911865234375, "geo/layer_27/stable_rank_k_proj": 31.613637924194336, "geo/layer_27/stable_rank_o_proj": 117.48175048828125, "geo/layer_27/stable_rank_gate_proj": 91.70565795898438, "geo/layer_27/stable_rank_down_proj": 139.62010192871094, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0848194807767868, "geo/layer_27/attn_entropy_mean": 4.413104057312012, "geo/layer_27/attn_entropy_std": 0.5437846183776855, "attnres/final_alpha/block_0": 0.24476364254951477, "attnres/block_norm/0": 1.5966237783432007, "attnres/final_alpha/block_1": 0.007086741738021374, "attnres/block_norm/1": 27792.36328125, "attnres/final_alpha/block_2": 0.01474841684103012, "attnres/block_norm/2": 20023.10546875, "attnres/final_alpha/block_3": 0.016630448400974274, "attnres/block_norm/3": 27615.1328125, "attnres/final_alpha/block_4": 0.022166434675455093, "attnres/block_norm/4": 8893.255859375, "attnres/final_alpha/block_5": 0.5541958212852478, "attnres/block_norm/5": 4926.25, "attnres/final_alpha/block_6": 0.1404084712266922, "attnres/block_norm/6": 18728.14453125, "geo/tier1_time_s": 1.3598968982696533, "geo/step": 19050.0, "geo/rankme_slope": -3.770451539990996e-05} {"step": 19060, "timestamp": 1778215137.5115242, "train/loss": 2.232057285308838, "train/z_loss": 0.0015834805672056973, "train/perplexity": 9.319018251328947, "train/grad_norm": 0.1337890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1790466.1782781458, "perf/iters_per_sec": 0.8537608043089608, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.171288251876831, "data/tokens_consumed": 39973814272, "data/tokens_consumed_B": 39.973814272, "train/loss_slope": -1.4020452934308677e-05} {"step": 19070, "timestamp": 1778215148.2452507, "train/loss": 2.2870328187942506, "train/z_loss": 0.0015589587157592178, "train/perplexity": 9.845680379001731, "train/grad_norm": 0.162109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1955059.016710399, "perf/iters_per_sec": 0.9322447856475825, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0726796388626099, "data/tokens_consumed": 39994785792, "data/tokens_consumed_B": 39.994785792, "train/loss_slope": -1.1840307637445603e-05} {"step": 19080, "timestamp": 1778215159.110021, "train/loss": 2.2529567003250124, "train/z_loss": 0.001574945147149265, "train/perplexity": 9.515829740150169, "train/grad_norm": 0.119140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1931339.970229662, "perf/iters_per_sec": 0.9209346629284201, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.085853362083435, "data/tokens_consumed": 40015757312, "data/tokens_consumed_B": 40.015757312, "train/loss_slope": -1.0956124818757845e-05} {"step": 19090, "timestamp": 1778215169.46552, "train/loss": 2.282557725906372, "train/z_loss": 0.0015733139007352293, "train/perplexity": 9.801718484907855, "train/grad_norm": 0.1689453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026127.3737919233, "perf/iters_per_sec": 0.9661328190765015, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03505437374115, "data/tokens_consumed": 40036728832, "data/tokens_consumed_B": 40.036728832, "train/loss_slope": -8.466628232304169e-06} {"step": 19100, "timestamp": 1778215179.8070862, "grad/layer_0/attn": 0.0028259791433811188, "grad/layer_0/mlp": 0.0028397643472999334, "grad/layer_0/attn_mlp_ratio": 0.9951456171190812, "grad/layer_4/attn": 0.0022948947735130787, "grad/layer_4/mlp": 0.0025884928181767464, "grad/layer_4/attn_mlp_ratio": 0.8865756430693775, "grad/layer_8/attn": 0.004368404857814312, "grad/layer_8/mlp": 0.0036566904745996, "grad/layer_8/attn_mlp_ratio": 1.1946334448308205, "grad/layer_12/attn": 0.004510725382715464, "grad/layer_12/mlp": 0.006453886162489653, "grad/layer_12/attn_mlp_ratio": 0.6989161567553571, "grad/layer_16/attn": 0.0043473937548696995, "grad/layer_16/mlp": 0.004650745540857315, "grad/layer_16/attn_mlp_ratio": 0.9347734945289987, "grad/layer_20/attn": 0.0048329574055969715, "grad/layer_20/mlp": 0.006616510916501284, "grad/layer_20/attn_mlp_ratio": 0.7304389569584017, "grad/layer_24/attn": 0.0098279332742095, "grad/layer_24/mlp": 0.009793809615075588, "grad/layer_24/attn_mlp_ratio": 1.0034841966636727, "grad/layer_27/attn": 0.006980623118579388, "grad/layer_27/mlp": 0.00880434364080429, "grad/layer_27/attn_mlp_ratio": 0.7928612653123988} {"step": 19100, "timestamp": 1778215179.82292, "train/loss": 2.2436731338500975, "train/z_loss": 0.0015773338498547673, "train/perplexity": 9.427897695217201, "train/grad_norm": 0.11669921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025940.895114146, "perf/iters_per_sec": 0.9660438991137247, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351496458053588, "data/tokens_consumed": 40057700352, "data/tokens_consumed_B": 40.057700352, "train/loss_slope": -5.982563555008208e-06} {"step": 19110, "timestamp": 1778215190.17167, "train/loss": 2.2503615617752075, "train/z_loss": 0.001576281082816422, "train/perplexity": 9.491166859196197, "train/grad_norm": 0.224609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027525.5633459156, "perf/iters_per_sec": 0.9667995278100565, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343405961990357, "data/tokens_consumed": 40078671872, "data/tokens_consumed_B": 40.078671872, "train/loss_slope": -4.827982206942679e-06} {"step": 19120, "timestamp": 1778215200.5200033, "train/loss": 2.21010160446167, "train/z_loss": 0.0015958231291733683, "train/perplexity": 9.116642637551648, "train/grad_norm": 0.12109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027606.3714961575, "perf/iters_per_sec": 0.9668380601387775, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034299373626709, "data/tokens_consumed": 40099643392, "data/tokens_consumed_B": 40.099643392, "train/loss_slope": -4.574978176337902e-06} {"step": 19125, "timestamp": 1778215206.7588274, "eos/sharpness": 51.66571140289306, "eos/L0_probe": 2.0552291870117188, "eos/L_plus": 2.2908692359924316, "eos/L_minus": 2.3362462520599365, "eos/grad_norm": 0.2611818015575409, "eos/embed_grad_frac": 0.04209499806165695, "eos/time_s": 0.6112852096557617} {"step": 19125, "timestamp": 1778215208.1378005, "geo/rankme_last": 440.3915100097656, "geo/layer_0/stable_rank_q_proj": 16.81968116760254, "geo/layer_0/stable_rank_k_proj": 14.955904960632324, "geo/layer_0/stable_rank_o_proj": 52.76005172729492, "geo/layer_0/stable_rank_gate_proj": 155.08901977539062, "geo/layer_0/stable_rank_down_proj": 50.027740478515625, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.04469668120145798, "geo/layer_0/attn_entropy_mean": 6.276684761047363, "geo/layer_0/attn_entropy_std": 0.3184451162815094, "geo/layer_7/stable_rank_q_proj": 42.57621765136719, "geo/layer_7/stable_rank_k_proj": 42.40654754638672, "geo/layer_7/stable_rank_o_proj": 110.6502685546875, "geo/layer_7/stable_rank_gate_proj": 105.28752136230469, "geo/layer_7/stable_rank_down_proj": 151.86892700195312, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5575395822525024, "geo/layer_7/attn_entropy_mean": 4.667447090148926, "geo/layer_7/attn_entropy_std": 0.839491605758667, "geo/layer_14/stable_rank_q_proj": 57.93975067138672, "geo/layer_14/stable_rank_k_proj": 35.04043197631836, "geo/layer_14/stable_rank_o_proj": 52.97880172729492, "geo/layer_14/stable_rank_gate_proj": 89.63018798828125, "geo/layer_14/stable_rank_down_proj": 136.5859375, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3674674332141876, "geo/layer_14/attn_entropy_mean": 5.537442207336426, "geo/layer_14/attn_entropy_std": 0.4877263605594635, "geo/layer_21/stable_rank_q_proj": 48.861934661865234, "geo/layer_21/stable_rank_k_proj": 32.05909729003906, "geo/layer_21/stable_rank_o_proj": 84.72498321533203, "geo/layer_21/stable_rank_gate_proj": 89.82238006591797, "geo/layer_21/stable_rank_down_proj": 61.83173751831055, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1497640609741211, "geo/layer_21/attn_entropy_mean": 5.75808048248291, "geo/layer_21/attn_entropy_std": 0.28572142124176025, "geo/layer_27/stable_rank_q_proj": 41.47913360595703, "geo/layer_27/stable_rank_k_proj": 31.72068977355957, "geo/layer_27/stable_rank_o_proj": 117.76049041748047, "geo/layer_27/stable_rank_gate_proj": 91.7481460571289, "geo/layer_27/stable_rank_down_proj": 139.15130615234375, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07057441025972366, "geo/layer_27/attn_entropy_mean": 4.407520771026611, "geo/layer_27/attn_entropy_std": 0.5492885708808899, "attnres/final_alpha/block_0": 0.2443195879459381, "attnres/block_norm/0": 1.597748041152954, "attnres/final_alpha/block_1": 0.00710771419107914, "attnres/block_norm/1": 27822.69921875, "attnres/final_alpha/block_2": 0.014629506506025791, "attnres/block_norm/2": 20063.53515625, "attnres/final_alpha/block_3": 0.016613326966762543, "attnres/block_norm/3": 27689.14453125, "attnres/final_alpha/block_4": 0.021606219932436943, "attnres/block_norm/4": 8878.982421875, "attnres/final_alpha/block_5": 0.5558956861495972, "attnres/block_norm/5": 4901.89013671875, "attnres/final_alpha/block_6": 0.13982799649238586, "attnres/block_norm/6": 18845.0078125, "geo/tier1_time_s": 1.3551280498504639, "geo/step": 19125.0, "geo/rankme_slope": -4.379501800720288e-05} {"step": 19130, "timestamp": 1778215213.3168066, "train/loss": 2.292317533493042, "train/z_loss": 0.0015804973081685603, "train/perplexity": 9.897849719444075, "train/grad_norm": 0.1552734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1639495.1895990153, "perf/iters_per_sec": 0.7817722270960881, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2791449546813964, "data/tokens_consumed": 40120614912, "data/tokens_consumed_B": 40.120614912, "train/loss_slope": -2.386523709438878e-06} {"step": 19140, "timestamp": 1778215223.6746595, "train/loss": 2.2314355611801147, "train/z_loss": 0.0015908033936284482, "train/perplexity": 9.313226193543693, "train/grad_norm": 0.1279296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027680.1747345745, "perf/iters_per_sec": 0.9668732522652504, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342617273330688, "data/tokens_consumed": 40141586432, "data/tokens_consumed_B": 40.141586432, "train/loss_slope": -2.1506850725890518e-06} {"step": 19150, "timestamp": 1778215234.497397, "grad/layer_0/attn": 0.0022513875737786293, "grad/layer_0/mlp": 0.0024519162252545357, "grad/layer_0/attn_mlp_ratio": 0.9182154996846846, "grad/layer_4/attn": 0.0032189444173127413, "grad/layer_4/mlp": 0.002425439888611436, "grad/layer_4/attn_mlp_ratio": 1.3271589618490567, "grad/layer_8/attn": 0.004735460039228201, "grad/layer_8/mlp": 0.003822860773652792, "grad/layer_8/attn_mlp_ratio": 1.2387215218490044, "grad/layer_12/attn": 0.003942560404539108, "grad/layer_12/mlp": 0.006252387538552284, "grad/layer_12/attn_mlp_ratio": 0.6305687734761105, "grad/layer_16/attn": 0.003783315187320113, "grad/layer_16/mlp": 0.004261182155460119, "grad/layer_16/attn_mlp_ratio": 0.8878557546963205, "grad/layer_20/attn": 0.003977078013122082, "grad/layer_20/mlp": 0.006084016524255276, "grad/layer_20/attn_mlp_ratio": 0.6536928247806856, "grad/layer_24/attn": 0.022525489330291748, "grad/layer_24/mlp": 0.014754053205251694, "grad/layer_24/attn_mlp_ratio": 1.526732272430778, "grad/layer_27/attn": 0.004579085391014814, "grad/layer_27/mlp": 0.012195590883493423, "grad/layer_27/attn_mlp_ratio": 0.37547056122270317} {"step": 19150, "timestamp": 1778215234.513535, "train/loss": 2.2282833099365233, "train/z_loss": 0.0015873119002208114, "train/perplexity": 9.283914787422967, "train/grad_norm": 0.2216796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1935739.9712978308, "perf/iters_per_sec": 0.9230327469338564, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.083385181427002, "data/tokens_consumed": 40162557952, "data/tokens_consumed_B": 40.162557952, "train/loss_slope": -3.971998962667942e-06} {"step": 19160, "timestamp": 1778215244.8670444, "train/loss": 2.231531262397766, "train/z_loss": 0.0015834949328564108, "train/perplexity": 9.31411752328067, "train/grad_norm": 0.1591796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026706.3950546307, "perf/iters_per_sec": 0.9664089179299501, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347586631774903, "data/tokens_consumed": 40183529472, "data/tokens_consumed_B": 40.183529472, "train/loss_slope": -6.265177256060324e-06} {"step": 19170, "timestamp": 1778215255.2197654, "train/loss": 2.2353312253952025, "train/z_loss": 0.0015806702082045375, "train/perplexity": 9.349578157100979, "train/grad_norm": 0.12060546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026977.69603975, "perf/iters_per_sec": 0.9665382843207121, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346201658248901, "data/tokens_consumed": 40204500992, "data/tokens_consumed_B": 40.204500992, "train/loss_slope": -6.312026199263062e-06} {"step": 19180, "timestamp": 1778215265.577897, "train/loss": 2.231372332572937, "train/z_loss": 0.0015856129117310046, "train/perplexity": 9.312637349839227, "train/grad_norm": 0.205078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026744.454076832, "perf/iters_per_sec": 0.9664270658859405, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347392320632935, "data/tokens_consumed": 40225472512, "data/tokens_consumed_B": 40.225472512, "train/loss_slope": -4.944009935394535e-06} {"step": 19190, "timestamp": 1778215275.9379869, "train/loss": 2.2509791612625123, "train/z_loss": 0.0015699273091740907, "train/perplexity": 9.4970304094588, "train/grad_norm": 0.2041015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025386.3752548113, "perf/iters_per_sec": 0.9657794834398323, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035433053970337, "data/tokens_consumed": 40246444032, "data/tokens_consumed_B": 40.246444032, "train/loss_slope": -3.3747385043909535e-06} {"step": 19200, "timestamp": 1778215286.2885509, "grad/layer_0/attn": 0.0028523243963718414, "grad/layer_0/mlp": 0.0027513867244124413, "grad/layer_0/attn_mlp_ratio": 1.0366860708439116, "grad/layer_4/attn": 0.0017816360341385007, "grad/layer_4/mlp": 0.002539921784773469, "grad/layer_4/attn_mlp_ratio": 0.7014530820098043, "grad/layer_8/attn": 0.008905533701181412, "grad/layer_8/mlp": 0.004016040358692408, "grad/layer_8/attn_mlp_ratio": 2.2174910319705763, "grad/layer_12/attn": 0.004131430294364691, "grad/layer_12/mlp": 0.0064020357094705105, "grad/layer_12/attn_mlp_ratio": 0.6453307068750037, "grad/layer_16/attn": 0.004016605671495199, "grad/layer_16/mlp": 0.004626338370144367, "grad/layer_16/attn_mlp_ratio": 0.8682040229905313, "grad/layer_20/attn": 0.01284081395715475, "grad/layer_20/mlp": 0.006483528763055801, "grad/layer_20/attn_mlp_ratio": 1.980528540610622, "grad/layer_24/attn": 0.006780042313039303, "grad/layer_24/mlp": 0.010047413408756256, "grad/layer_24/attn_mlp_ratio": 0.6748047452342376, "grad/layer_27/attn": 0.01058700866997242, "grad/layer_27/mlp": 0.007857571355998516, "grad/layer_27/attn_mlp_ratio": 1.3473639698039572} {"step": 19200, "timestamp": 1778215286.9285913, "eos/sharpness": 32.48207569122314, "eos/L0_probe": 2.0559985637664795, "eos/L_plus": 2.18550705909729, "eos/L_minus": 2.2513108253479004, "eos/grad_norm": 0.12584415078163147, "eos/embed_grad_frac": 0.1666380912065506, "eos/time_s": 0.6371109485626221} {"step": 19200, "timestamp": 1778215286.949301, "train/loss": 2.2405446767807007, "train/z_loss": 0.0015793674625456332, "train/perplexity": 9.39844901051442, "train/grad_norm": 0.1259765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1905785.5997425006, "perf/iters_per_sec": 0.9087493895256522, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1004133939743042, "data/tokens_consumed": 40267415552, "data/tokens_consumed_B": 40.267415552, "train/loss_slope": -1.4296004481048519e-06} {"step": 19200, "timestamp": 1778215288.3137038, "geo/rankme_last": 440.7925720214844, "geo/layer_0/stable_rank_q_proj": 16.82024574279785, "geo/layer_0/stable_rank_k_proj": 14.966856002807617, "geo/layer_0/stable_rank_o_proj": 52.70427703857422, "geo/layer_0/stable_rank_gate_proj": 155.51170349121094, "geo/layer_0/stable_rank_down_proj": 49.924217224121094, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.04722615331411362, "geo/layer_0/attn_entropy_mean": 6.273172378540039, "geo/layer_0/attn_entropy_std": 0.3121481239795685, "geo/layer_7/stable_rank_q_proj": 42.52983093261719, "geo/layer_7/stable_rank_k_proj": 42.37097930908203, "geo/layer_7/stable_rank_o_proj": 110.6976318359375, "geo/layer_7/stable_rank_gate_proj": 104.73578643798828, "geo/layer_7/stable_rank_down_proj": 151.73521423339844, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5738715529441833, "geo/layer_7/attn_entropy_mean": 4.682193756103516, "geo/layer_7/attn_entropy_std": 0.8642327189445496, "geo/layer_14/stable_rank_q_proj": 57.84748840332031, "geo/layer_14/stable_rank_k_proj": 35.21565628051758, "geo/layer_14/stable_rank_o_proj": 52.926116943359375, "geo/layer_14/stable_rank_gate_proj": 89.79112243652344, "geo/layer_14/stable_rank_down_proj": 136.50439453125, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3703339695930481, "geo/layer_14/attn_entropy_mean": 5.530542373657227, "geo/layer_14/attn_entropy_std": 0.4850271940231323, "geo/layer_21/stable_rank_q_proj": 48.801025390625, "geo/layer_21/stable_rank_k_proj": 32.09134292602539, "geo/layer_21/stable_rank_o_proj": 84.80818176269531, "geo/layer_21/stable_rank_gate_proj": 89.57003784179688, "geo/layer_21/stable_rank_down_proj": 61.606956481933594, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15269510447978973, "geo/layer_21/attn_entropy_mean": 5.776472568511963, "geo/layer_21/attn_entropy_std": 0.3018561601638794, "geo/layer_27/stable_rank_q_proj": 41.517250061035156, "geo/layer_27/stable_rank_k_proj": 31.847427368164062, "geo/layer_27/stable_rank_o_proj": 117.78935241699219, "geo/layer_27/stable_rank_gate_proj": 91.9031753540039, "geo/layer_27/stable_rank_down_proj": 138.8897705078125, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07655709236860275, "geo/layer_27/attn_entropy_mean": 4.424660682678223, "geo/layer_27/attn_entropy_std": 0.5328711271286011, "attnres/final_alpha/block_0": 0.24384286999702454, "attnres/block_norm/0": 1.5985898971557617, "attnres/final_alpha/block_1": 0.007152692414820194, "attnres/block_norm/1": 28048.5234375, "attnres/final_alpha/block_2": 0.014416279271245003, "attnres/block_norm/2": 20239.5546875, "attnres/final_alpha/block_3": 0.0165305957198143, "attnres/block_norm/3": 27893.94921875, "attnres/final_alpha/block_4": 0.02180521935224533, "attnres/block_norm/4": 8916.5185546875, "attnres/final_alpha/block_5": 0.5580850839614868, "attnres/block_norm/5": 4907.96826171875, "attnres/final_alpha/block_6": 0.13816721737384796, "attnres/block_norm/6": 18932.453125, "geo/tier1_time_s": 1.3609464168548584, "geo/step": 19200.0, "geo/rankme_slope": -7.001988295318127e-05} {"step": 19210, "timestamp": 1778215299.0550656, "train/loss": 2.1677318096160887, "train/z_loss": 0.0015997325303032995, "train/perplexity": 8.738441097393851, "train/grad_norm": 0.193359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1732860.5478492188, "perf/iters_per_sec": 0.8262922991987318, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.210225486755371, "data/tokens_consumed": 40288387072, "data/tokens_consumed_B": 40.288387072, "train/loss_slope": -7.30600239741997e-06} {"step": 19220, "timestamp": 1778215309.4192617, "train/loss": 2.18345901966095, "train/z_loss": 0.0015991135500371457, "train/perplexity": 8.87695878928687, "train/grad_norm": 0.2275390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026434.0462495275, "perf/iters_per_sec": 0.9662790518996847, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348977327346802, "data/tokens_consumed": 40309358592, "data/tokens_consumed_B": 40.309358592, "train/loss_slope": -1.2117874375557108e-05} {"step": 19230, "timestamp": 1778215319.7794409, "train/loss": 2.2864471435546876, "train/z_loss": 0.0015744966454803943, "train/perplexity": 9.839915696067909, "train/grad_norm": 0.15234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025223.0207696212, "perf/iters_per_sec": 0.9657015899513346, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0355165719985961, "data/tokens_consumed": 40330330112, "data/tokens_consumed_B": 40.330330112, "train/loss_slope": -8.822822184524106e-06} {"step": 19240, "timestamp": 1778215330.131076, "train/loss": 2.1711697578430176, "train/z_loss": 0.0015974278561770917, "train/perplexity": 8.768535106652335, "train/grad_norm": 0.10498046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027079.6219762329, "perf/iters_per_sec": 0.9665868863946118, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345681428909301, "data/tokens_consumed": 40351301632, "data/tokens_consumed_B": 40.351301632, "train/loss_slope": -1.0500674443741415e-05} {"step": 19250, "timestamp": 1778215340.4731221, "grad/layer_0/attn": 0.00288089900277555, "grad/layer_0/mlp": 0.0025673627387732267, "grad/layer_0/attn_mlp_ratio": 1.1221238226506938, "grad/layer_4/attn": 0.002356582786887884, "grad/layer_4/mlp": 0.0024620394688099623, "grad/layer_4/attn_mlp_ratio": 0.9571669020847414, "grad/layer_8/attn": 0.005626891274005175, "grad/layer_8/mlp": 0.0037801936268806458, "grad/layer_8/attn_mlp_ratio": 1.4885192877795692, "grad/layer_12/attn": 0.005777633283287287, "grad/layer_12/mlp": 0.006959360092878342, "grad/layer_12/attn_mlp_ratio": 0.8301960414693954, "grad/layer_16/attn": 0.004523114301264286, "grad/layer_16/mlp": 0.004460127092897892, "grad/layer_16/attn_mlp_ratio": 1.0141222672902002, "grad/layer_20/attn": 0.006584631744772196, "grad/layer_20/mlp": 0.005645102821290493, "grad/layer_20/attn_mlp_ratio": 1.1664325410150225, "grad/layer_24/attn": 0.009120224043726921, "grad/layer_24/mlp": 0.007550704292953014, "grad/layer_24/attn_mlp_ratio": 1.207864004348882, "grad/layer_27/attn": 0.0040033250115811825, "grad/layer_27/mlp": 0.006543994415551424, "grad/layer_27/attn_mlp_ratio": 0.611755557262084} {"step": 19250, "timestamp": 1778215340.4889078, "train/loss": 2.2378660440444946, "train/z_loss": 0.001572917983867228, "train/perplexity": 9.373307704520485, "train/grad_norm": 0.10107421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025713.536507458, "perf/iters_per_sec": 0.9659354860818186, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035265827178955, "data/tokens_consumed": 40372273152, "data/tokens_consumed_B": 40.372273152, "train/loss_slope": -1.0216714925963472e-05} {"step": 19260, "timestamp": 1778215350.8411276, "train/loss": 2.201543116569519, "train/z_loss": 0.0015883286483585834, "train/perplexity": 9.03895089805495, "train/grad_norm": 0.10791015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026780.4596945695, "perf/iters_per_sec": 0.9664442347023818, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347208499908447, "data/tokens_consumed": 40393244672, "data/tokens_consumed_B": 40.393244672, "train/loss_slope": -7.495678991231384e-06} {"step": 19270, "timestamp": 1778215361.1944125, "train/loss": 2.267534875869751, "train/z_loss": 0.001587946736253798, "train/perplexity": 9.655569275478506, "train/grad_norm": 0.1484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026628.180079262, "perf/iters_per_sec": 0.9663716221233664, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347985982894898, "data/tokens_consumed": 40414216192, "data/tokens_consumed_B": 40.414216192, "train/loss_slope": -6.6882696875645075e-06} {"step": 19275, "timestamp": 1778215366.96976, "eos/sharpness": 54.03656959533691, "eos/L0_probe": 2.0525145530700684, "eos/L_plus": 2.421380043029785, "eos/L_minus": 2.2240147590637207, "eos/grad_norm": 0.16620881855487823, "eos/embed_grad_frac": 0.08927730470895767, "eos/time_s": 0.6096842288970947} {"step": 19275, "timestamp": 1778215368.3513381, "geo/rankme_last": 440.7198791503906, "geo/layer_0/stable_rank_q_proj": 16.861616134643555, "geo/layer_0/stable_rank_k_proj": 14.999956130981445, "geo/layer_0/stable_rank_o_proj": 52.68709182739258, "geo/layer_0/stable_rank_gate_proj": 155.02420043945312, "geo/layer_0/stable_rank_down_proj": 49.984092712402344, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.04451845586299896, "geo/layer_0/attn_entropy_mean": 6.278075218200684, "geo/layer_0/attn_entropy_std": 0.314521461725235, "geo/layer_7/stable_rank_q_proj": 42.56378936767578, "geo/layer_7/stable_rank_k_proj": 42.38605880737305, "geo/layer_7/stable_rank_o_proj": 110.68946075439453, "geo/layer_7/stable_rank_gate_proj": 105.07485961914062, "geo/layer_7/stable_rank_down_proj": 151.35691833496094, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5627142786979675, "geo/layer_7/attn_entropy_mean": 4.6359453201293945, "geo/layer_7/attn_entropy_std": 0.8646540641784668, "geo/layer_14/stable_rank_q_proj": 57.7700080871582, "geo/layer_14/stable_rank_k_proj": 35.07184982299805, "geo/layer_14/stable_rank_o_proj": 52.983760833740234, "geo/layer_14/stable_rank_gate_proj": 89.654296875, "geo/layer_14/stable_rank_down_proj": 136.73289489746094, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3790174722671509, "geo/layer_14/attn_entropy_mean": 5.4921183586120605, "geo/layer_14/attn_entropy_std": 0.503390371799469, "geo/layer_21/stable_rank_q_proj": 48.73574447631836, "geo/layer_21/stable_rank_k_proj": 32.243980407714844, "geo/layer_21/stable_rank_o_proj": 84.59552001953125, "geo/layer_21/stable_rank_gate_proj": 89.32361602783203, "geo/layer_21/stable_rank_down_proj": 61.71117401123047, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15487661957740784, "geo/layer_21/attn_entropy_mean": 5.747208595275879, "geo/layer_21/attn_entropy_std": 0.29840365052223206, "geo/layer_27/stable_rank_q_proj": 41.46453094482422, "geo/layer_27/stable_rank_k_proj": 31.772632598876953, "geo/layer_27/stable_rank_o_proj": 117.66976928710938, "geo/layer_27/stable_rank_gate_proj": 92.02912139892578, "geo/layer_27/stable_rank_down_proj": 138.8541717529297, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07768722623586655, "geo/layer_27/attn_entropy_mean": 4.410245418548584, "geo/layer_27/attn_entropy_std": 0.5490018725395203, "attnres/final_alpha/block_0": 0.24320682883262634, "attnres/block_norm/0": 1.5995304584503174, "attnres/final_alpha/block_1": 0.007049379870295525, "attnres/block_norm/1": 28049.52734375, "attnres/final_alpha/block_2": 0.014551861211657524, "attnres/block_norm/2": 20121.62109375, "attnres/final_alpha/block_3": 0.016564220190048218, "attnres/block_norm/3": 27808.94921875, "attnres/final_alpha/block_4": 0.021210815757513046, "attnres/block_norm/4": 8894.67578125, "attnres/final_alpha/block_5": 0.5594797134399414, "attnres/block_norm/5": 4930.1689453125, "attnres/final_alpha/block_6": 0.13793717324733734, "attnres/block_norm/6": 18922.6484375, "geo/tier1_time_s": 1.3624298572540283, "geo/step": 19275.0, "geo/rankme_slope": -5.457321600515206e-05} {"step": 19280, "timestamp": 1778215373.5314353, "train/loss": 2.200085711479187, "train/z_loss": 0.0015908117056824267, "train/perplexity": 9.025787079842729, "train/grad_norm": 0.1357421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1700710.2269855698, "perf/iters_per_sec": 0.8109618315627908, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2331036567687987, "data/tokens_consumed": 40435187712, "data/tokens_consumed_B": 40.435187712, "train/loss_slope": -7.556417086372677e-06} {"step": 19290, "timestamp": 1778215383.904066, "train/loss": 2.229406237602234, "train/z_loss": 0.0015764894429594277, "train/perplexity": 9.294345807728407, "train/grad_norm": 0.185546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022888.0188567229, "perf/iters_per_sec": 0.9645881742747893, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036711859703064, "data/tokens_consumed": 40456159232, "data/tokens_consumed_B": 40.456159232, "train/loss_slope": -1.0436074196523387e-05} {"step": 19300, "timestamp": 1778215394.2651894, "grad/layer_0/attn": 0.002960562240332365, "grad/layer_0/mlp": 0.0029118619859218597, "grad/layer_0/attn_mlp_ratio": 1.0167247462185653, "grad/layer_4/attn": 0.0027003029827028513, "grad/layer_4/mlp": 0.002489350736141205, "grad/layer_4/attn_mlp_ratio": 1.0847418304800491, "grad/layer_8/attn": 0.007078572642058134, "grad/layer_8/mlp": 0.003726973896846175, "grad/layer_8/attn_mlp_ratio": 1.8992814674983338, "grad/layer_12/attn": 0.004858509171754122, "grad/layer_12/mlp": 0.006547320634126663, "grad/layer_12/attn_mlp_ratio": 0.7420606640560722, "grad/layer_16/attn": 0.005244589410722256, "grad/layer_16/mlp": 0.004628968425095081, "grad/layer_16/attn_mlp_ratio": 1.1329931025215882, "grad/layer_20/attn": 0.005996232852339745, "grad/layer_20/mlp": 0.007047589868307114, "grad/layer_20/attn_mlp_ratio": 0.8508203342283954, "grad/layer_24/attn": 0.014661842957139015, "grad/layer_24/mlp": 0.013958096504211426, "grad/layer_24/attn_mlp_ratio": 1.0504185042475818, "grad/layer_27/attn": 0.011422676034271717, "grad/layer_27/mlp": 0.013999470509588718, "grad/layer_27/attn_mlp_ratio": 0.8159362845083538} {"step": 19300, "timestamp": 1778215394.2802517, "train/loss": 2.261739206314087, "train/z_loss": 0.0015767680015414953, "train/perplexity": 9.599770638006834, "train/grad_norm": 0.1845703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022551.9567334817, "perf/iters_per_sec": 0.9644279273669633, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0368841171264649, "data/tokens_consumed": 40477130752, "data/tokens_consumed_B": 40.477130752, "train/loss_slope": -9.313196653794745e-06} {"step": 19310, "timestamp": 1778215404.6437716, "train/loss": 2.235389804840088, "train/z_loss": 0.00158807021798566, "train/perplexity": 9.350125866241429, "train/grad_norm": 0.08984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024835.654697074, "perf/iters_per_sec": 0.9655168794141169, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0357146739959717, "data/tokens_consumed": 40498102272, "data/tokens_consumed_B": 40.498102272, "train/loss_slope": -6.848382377567326e-06} {"step": 19320, "timestamp": 1778215415.012304, "train/loss": 2.2191086053848266, "train/z_loss": 0.0015840315027162433, "train/perplexity": 9.19912715764217, "train/grad_norm": 0.1201171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023636.8737082656, "perf/iters_per_sec": 0.9649452560941055, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0363282203674316, "data/tokens_consumed": 40519073792, "data/tokens_consumed_B": 40.519073792, "train/loss_slope": -6.641027571881069e-06} {"step": 19330, "timestamp": 1778215425.3765395, "train/loss": 2.212051820755005, "train/z_loss": 0.0015880472026765346, "train/perplexity": 9.134439410701795, "train/grad_norm": 0.1328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024790.0234171995, "perf/iters_per_sec": 0.9654951207242963, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0357380151748656, "data/tokens_consumed": 40540045312, "data/tokens_consumed_B": 40.540045312, "train/loss_slope": -5.715669685750648e-06} {"step": 19340, "timestamp": 1778215435.7410252, "train/loss": 2.226754975318909, "train/z_loss": 0.0015724882367067039, "train/perplexity": 9.269736696259907, "train/grad_norm": 0.13671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024550.8074728244, "perf/iters_per_sec": 0.9653810536731836, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0358603954315186, "data/tokens_consumed": 40561016832, "data/tokens_consumed_B": 40.561016832, "train/loss_slope": -6.989398476171178e-06} {"step": 19350, "timestamp": 1778215446.1038609, "grad/layer_0/attn": 0.0029271533712744713, "grad/layer_0/mlp": 0.002786638680845499, "grad/layer_0/attn_mlp_ratio": 1.050424400677557, "grad/layer_4/attn": 0.0020442085806280375, "grad/layer_4/mlp": 0.002431961242109537, "grad/layer_4/attn_mlp_ratio": 0.8405596525045277, "grad/layer_8/attn": 0.0044727991335093975, "grad/layer_8/mlp": 0.0037593331653624773, "grad/layer_8/attn_mlp_ratio": 1.1897852139688199, "grad/layer_12/attn": 0.004601266235113144, "grad/layer_12/mlp": 0.005941866431385279, "grad/layer_12/attn_mlp_ratio": 0.7743806110098558, "grad/layer_16/attn": 0.004191474989056587, "grad/layer_16/mlp": 0.004421396646648645, "grad/layer_16/attn_mlp_ratio": 0.9479979357730484, "grad/layer_20/attn": 0.008208859711885452, "grad/layer_20/mlp": 0.006344402674585581, "grad/layer_20/attn_mlp_ratio": 1.2938743020491255, "grad/layer_24/attn": 0.02027241140604019, "grad/layer_24/mlp": 0.013509799726307392, "grad/layer_24/attn_mlp_ratio": 1.5005708201955803, "grad/layer_27/attn": 0.004557882901281118, "grad/layer_27/mlp": 0.013378087431192398, "grad/layer_27/attn_mlp_ratio": 0.340697643863814} {"step": 19350, "timestamp": 1778215446.692678, "eos/sharpness": 56.07707500457762, "eos/L0_probe": 2.0544323921203613, "eos/L_plus": 2.2905592918395996, "eos/L_minus": 2.3790762424468994, "eos/grad_norm": 0.22803732752799988, "eos/embed_grad_frac": 0.05606914684176445, "eos/time_s": 0.5859107971191406} {"step": 19350, "timestamp": 1778215446.7096353, "train/loss": 2.2555611371994018, "train/z_loss": 0.001571914739906788, "train/perplexity": 9.540645419423658, "train/grad_norm": 0.2275390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1912805.9420849108, "perf/iters_per_sec": 0.9120969496178202, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0963746786117554, "data/tokens_consumed": 40581988352, "data/tokens_consumed_B": 40.581988352, "train/loss_slope": -5.437187629170953e-06} {"step": 19350, "timestamp": 1778215448.074924, "geo/rankme_last": 441.7933654785156, "geo/layer_0/stable_rank_q_proj": 16.87220001220703, "geo/layer_0/stable_rank_k_proj": 15.014413833618164, "geo/layer_0/stable_rank_o_proj": 52.62189865112305, "geo/layer_0/stable_rank_gate_proj": 155.19114685058594, "geo/layer_0/stable_rank_down_proj": 49.9841194152832, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.052840035408735275, "geo/layer_0/attn_entropy_mean": 6.2788801193237305, "geo/layer_0/attn_entropy_std": 0.31696566939353943, "geo/layer_7/stable_rank_q_proj": 42.52102279663086, "geo/layer_7/stable_rank_k_proj": 42.23566436767578, "geo/layer_7/stable_rank_o_proj": 110.79344177246094, "geo/layer_7/stable_rank_gate_proj": 105.07130432128906, "geo/layer_7/stable_rank_down_proj": 151.20289611816406, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.546194314956665, "geo/layer_7/attn_entropy_mean": 4.64960241317749, "geo/layer_7/attn_entropy_std": 0.8262504935264587, "geo/layer_14/stable_rank_q_proj": 57.84217071533203, "geo/layer_14/stable_rank_k_proj": 35.18861770629883, "geo/layer_14/stable_rank_o_proj": 52.997406005859375, "geo/layer_14/stable_rank_gate_proj": 89.68482208251953, "geo/layer_14/stable_rank_down_proj": 137.1790008544922, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.36992117762565613, "geo/layer_14/attn_entropy_mean": 5.499655723571777, "geo/layer_14/attn_entropy_std": 0.4975341856479645, "geo/layer_21/stable_rank_q_proj": 48.58525085449219, "geo/layer_21/stable_rank_k_proj": 32.131805419921875, "geo/layer_21/stable_rank_o_proj": 84.5029067993164, "geo/layer_21/stable_rank_gate_proj": 89.20013427734375, "geo/layer_21/stable_rank_down_proj": 61.66950988769531, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15151140093803406, "geo/layer_21/attn_entropy_mean": 5.765302658081055, "geo/layer_21/attn_entropy_std": 0.28327566385269165, "geo/layer_27/stable_rank_q_proj": 41.4572639465332, "geo/layer_27/stable_rank_k_proj": 31.722339630126953, "geo/layer_27/stable_rank_o_proj": 117.90013122558594, "geo/layer_27/stable_rank_gate_proj": 92.02618408203125, "geo/layer_27/stable_rank_down_proj": 138.8424072265625, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07534525543451309, "geo/layer_27/attn_entropy_mean": 4.410975933074951, "geo/layer_27/attn_entropy_std": 0.5448713302612305, "attnres/final_alpha/block_0": 0.24422532320022583, "attnres/block_norm/0": 1.6004164218902588, "attnres/final_alpha/block_1": 0.007088129408657551, "attnres/block_norm/1": 28102.86328125, "attnres/final_alpha/block_2": 0.014637574553489685, "attnres/block_norm/2": 20259.3125, "attnres/final_alpha/block_3": 0.016693245619535446, "attnres/block_norm/3": 28209.21484375, "attnres/final_alpha/block_4": 0.021853169426321983, "attnres/block_norm/4": 8974.103515625, "attnres/final_alpha/block_5": 0.5560011863708496, "attnres/block_norm/5": 4935.87744140625, "attnres/final_alpha/block_6": 0.1395014226436615, "attnres/block_norm/6": 18917.921875, "geo/tier1_time_s": 1.3627820014953613, "geo/step": 19350.0, "geo/rankme_slope": 3.7599414765906326e-06} {"step": 19360, "timestamp": 1778215458.4410853, "train/loss": 2.253713512420654, "train/z_loss": 0.00156862479634583, "train/perplexity": 9.523034161050152, "train/grad_norm": 0.1484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1788032.0980094674, "perf/iters_per_sec": 0.8526001443908059, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1728827476501464, "data/tokens_consumed": 40602959872, "data/tokens_consumed_B": 40.602959872, "train/loss_slope": -3.1652749854453405e-06} {"step": 19370, "timestamp": 1778215468.8054783, "train/loss": 2.2732249975204466, "train/z_loss": 0.001572791999205947, "train/perplexity": 9.710667247683912, "train/grad_norm": 0.154296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024418.337944805, "perf/iters_per_sec": 0.9653178872798943, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0359281778335572, "data/tokens_consumed": 40623931392, "data/tokens_consumed_B": 40.623931392, "train/loss_slope": -1.1547170456488608e-07} {"step": 19380, "timestamp": 1778215479.187238, "train/loss": 2.2702488899230957, "train/z_loss": 0.001573680166620761, "train/perplexity": 9.681810219222962, "train/grad_norm": 0.11669921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021111.7137023974, "perf/iters_per_sec": 0.9637411659728992, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0376230001449585, "data/tokens_consumed": 40644902912, "data/tokens_consumed_B": 40.644902912, "train/loss_slope": 5.607778762075714e-06} {"step": 19390, "timestamp": 1778215489.5517082, "train/loss": 2.163499855995178, "train/z_loss": 0.0015980602242052556, "train/perplexity": 8.701538559939394, "train/grad_norm": 0.138671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024845.0701915848, "perf/iters_per_sec": 0.9655213690717624, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0357098579406738, "data/tokens_consumed": 40665874432, "data/tokens_consumed_B": 40.665874432, "train/loss_slope": 2.2960458544328264e-06} {"step": 19400, "timestamp": 1778215499.9176226, "grad/layer_0/attn": 0.0026046782732009888, "grad/layer_0/mlp": 0.0025216189678758383, "grad/layer_0/attn_mlp_ratio": 1.0329388393287802, "grad/layer_4/attn": 0.0019419235177338123, "grad/layer_4/mlp": 0.002436109585687518, "grad/layer_4/attn_mlp_ratio": 0.7971412490754756, "grad/layer_8/attn": 0.01028959359973669, "grad/layer_8/mlp": 0.0036645943764597178, "grad/layer_8/attn_mlp_ratio": 2.8078396302330386, "grad/layer_12/attn": 0.0052451035007834435, "grad/layer_12/mlp": 0.00609844783321023, "grad/layer_12/attn_mlp_ratio": 0.8600718671746396, "grad/layer_16/attn": 0.004640268627554178, "grad/layer_16/mlp": 0.004493130836635828, "grad/layer_16/attn_mlp_ratio": 1.0327472519704752, "grad/layer_20/attn": 0.01037411019206047, "grad/layer_20/mlp": 0.005803518928587437, "grad/layer_20/attn_mlp_ratio": 1.7875551266324539, "grad/layer_24/attn": 0.011471797712147236, "grad/layer_24/mlp": 0.009652340784668922, "grad/layer_24/attn_mlp_ratio": 1.1884990231093275, "grad/layer_27/attn": 0.004318166058510542, "grad/layer_27/mlp": 0.008254006505012512, "grad/layer_27/attn_mlp_ratio": 0.5231599955212292} {"step": 19400, "timestamp": 1778215499.9326248, "train/loss": 2.2299665927886965, "train/z_loss": 0.0015906592132523655, "train/perplexity": 9.299555402081834, "train/grad_norm": 0.14453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021270.8756073054, "perf/iters_per_sec": 0.9638170602833297, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0375412940979003, "data/tokens_consumed": 40686845952, "data/tokens_consumed_B": 40.686845952, "train/loss_slope": 4.4863352216187604e-06} {"step": 19410, "timestamp": 1778215510.302953, "train/loss": 2.2036510467529298, "train/z_loss": 0.0015786629403010011, "train/perplexity": 9.058024471297122, "train/grad_norm": 0.2080078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023358.3676906119, "perf/iters_per_sec": 0.964812454076105, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0364708662033082, "data/tokens_consumed": 40707817472, "data/tokens_consumed_B": 40.707817472, "train/loss_slope": 3.039151342502398e-06} {"step": 19420, "timestamp": 1778215520.6699986, "train/loss": 2.2135233879089355, "train/z_loss": 0.001578790112398565, "train/perplexity": 9.147891246920814, "train/grad_norm": 0.173828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024099.9787600613, "perf/iters_per_sec": 0.9651660817909533, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036091113090515, "data/tokens_consumed": 40728788992, "data/tokens_consumed_B": 40.728788992, "train/loss_slope": 2.925949872094441e-06} {"step": 19425, "timestamp": 1778215526.4645905, "eos/sharpness": 46.30343914031982, "eos/L0_probe": 2.0567235946655273, "eos/L_plus": 2.321967363357544, "eos/L_minus": 2.254514217376709, "eos/grad_norm": 0.1863660216331482, "eos/embed_grad_frac": 0.07676618546247482, "eos/time_s": 0.6158411502838135} {"step": 19425, "timestamp": 1778215527.8475251, "geo/rankme_last": 440.5631408691406, "geo/layer_0/stable_rank_q_proj": 16.880903244018555, "geo/layer_0/stable_rank_k_proj": 14.983253479003906, "geo/layer_0/stable_rank_o_proj": 52.41492462158203, "geo/layer_0/stable_rank_gate_proj": 154.95655822753906, "geo/layer_0/stable_rank_down_proj": 49.98569107055664, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.04923129826784134, "geo/layer_0/attn_entropy_mean": 6.274541854858398, "geo/layer_0/attn_entropy_std": 0.3141253888607025, "geo/layer_7/stable_rank_q_proj": 42.469139099121094, "geo/layer_7/stable_rank_k_proj": 42.13786315917969, "geo/layer_7/stable_rank_o_proj": 110.66636657714844, "geo/layer_7/stable_rank_gate_proj": 104.92313385009766, "geo/layer_7/stable_rank_down_proj": 151.6366424560547, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5576039552688599, "geo/layer_7/attn_entropy_mean": 4.691798210144043, "geo/layer_7/attn_entropy_std": 0.8553028106689453, "geo/layer_14/stable_rank_q_proj": 57.82353210449219, "geo/layer_14/stable_rank_k_proj": 35.20565414428711, "geo/layer_14/stable_rank_o_proj": 53.06153869628906, "geo/layer_14/stable_rank_gate_proj": 89.54063415527344, "geo/layer_14/stable_rank_down_proj": 137.1334228515625, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37320688366889954, "geo/layer_14/attn_entropy_mean": 5.530388832092285, "geo/layer_14/attn_entropy_std": 0.5020044445991516, "geo/layer_21/stable_rank_q_proj": 48.587459564208984, "geo/layer_21/stable_rank_k_proj": 32.08341598510742, "geo/layer_21/stable_rank_o_proj": 84.40641784667969, "geo/layer_21/stable_rank_gate_proj": 89.04296112060547, "geo/layer_21/stable_rank_down_proj": 61.60039138793945, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1549489051103592, "geo/layer_21/attn_entropy_mean": 5.756169319152832, "geo/layer_21/attn_entropy_std": 0.2999820113182068, "geo/layer_27/stable_rank_q_proj": 41.548439025878906, "geo/layer_27/stable_rank_k_proj": 31.61394691467285, "geo/layer_27/stable_rank_o_proj": 117.82351684570312, "geo/layer_27/stable_rank_gate_proj": 91.83203887939453, "geo/layer_27/stable_rank_down_proj": 138.66976928710938, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07484003156423569, "geo/layer_27/attn_entropy_mean": 4.4115118980407715, "geo/layer_27/attn_entropy_std": 0.5446707010269165, "attnres/final_alpha/block_0": 0.24354906380176544, "attnres/block_norm/0": 1.6013426780700684, "attnres/final_alpha/block_1": 0.0071173678152263165, "attnres/block_norm/1": 28178.443359375, "attnres/final_alpha/block_2": 0.01442820392549038, "attnres/block_norm/2": 20283.861328125, "attnres/final_alpha/block_3": 0.016489367932081223, "attnres/block_norm/3": 28187.44921875, "attnres/final_alpha/block_4": 0.021389909088611603, "attnres/block_norm/4": 8982.4013671875, "attnres/final_alpha/block_5": 0.5601806640625, "attnres/block_norm/5": 4925.8193359375, "attnres/final_alpha/block_6": 0.1368454098701477, "attnres/block_norm/6": 19077.6328125, "geo/tier1_time_s": 1.3635447025299072, "geo/step": 19425.0, "geo/rankme_slope": -5.938312825130052e-06} {"step": 19430, "timestamp": 1778215533.0355122, "train/loss": 2.2302241802215574, "train/z_loss": 0.0015880436054430901, "train/perplexity": 9.301951159229827, "train/grad_norm": 0.1328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1696837.2669456953, "perf/iters_per_sec": 0.8091150603035427, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2359181642532349, "data/tokens_consumed": 40749760512, "data/tokens_consumed_B": 40.749760512, "train/loss_slope": 5.972096557342428e-07} {"step": 19440, "timestamp": 1778215543.411849, "train/loss": 2.294408583641052, "train/z_loss": 0.0015641622245311736, "train/perplexity": 9.918568273783654, "train/grad_norm": 0.154296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022798.6549699425, "perf/iters_per_sec": 0.964545562252971, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0367576599121093, "data/tokens_consumed": 40770732032, "data/tokens_consumed_B": 40.770732032, "train/loss_slope": 8.416256760105924e-06} {"step": 19450, "timestamp": 1778215553.7778563, "grad/layer_0/attn": 0.002505362266674638, "grad/layer_0/mlp": 0.002475558780133724, "grad/layer_0/attn_mlp_ratio": 1.0120390537991582, "grad/layer_4/attn": 0.0015483064344152808, "grad/layer_4/mlp": 0.002414356218650937, "grad/layer_4/attn_mlp_ratio": 0.6412916032544954, "grad/layer_8/attn": 0.005577367730438709, "grad/layer_8/mlp": 0.003899714909493923, "grad/layer_8/attn_mlp_ratio": 1.4301988009022484, "grad/layer_12/attn": 0.005452231504023075, "grad/layer_12/mlp": 0.006394501309841871, "grad/layer_12/attn_mlp_ratio": 0.8526437253780984, "grad/layer_16/attn": 0.0037165582180023193, "grad/layer_16/mlp": 0.00437504705041647, "grad/layer_16/attn_mlp_ratio": 0.8494898661031649, "grad/layer_20/attn": 0.0145708117634058, "grad/layer_20/mlp": 0.005684401374310255, "grad/layer_20/attn_mlp_ratio": 2.5632974428805317, "grad/layer_24/attn": 0.008180726319551468, "grad/layer_24/mlp": 0.009373700246214867, "grad/layer_24/attn_mlp_ratio": 0.8727317939979673, "grad/layer_27/attn": 0.010499658063054085, "grad/layer_27/mlp": 0.008242498151957989, "grad/layer_27/attn_mlp_ratio": 1.2738441358552806} {"step": 19450, "timestamp": 1778215553.7938402, "train/loss": 2.246318793296814, "train/z_loss": 0.001585265889298171, "train/perplexity": 9.45287372628516, "train/grad_norm": 0.11865234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021207.291436925, "perf/iters_per_sec": 0.9637867409882188, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0375739336013794, "data/tokens_consumed": 40791703552, "data/tokens_consumed_B": 40.791703552, "train/loss_slope": 6.61018936976705e-06} {"step": 19460, "timestamp": 1778215564.1454139, "train/loss": 2.1982343196868896, "train/z_loss": 0.0015924487495794892, "train/perplexity": 9.009092270808958, "train/grad_norm": 0.1298828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026944.9062816354, "perf/iters_per_sec": 0.9665226489456346, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034636902809143, "data/tokens_consumed": 40812675072, "data/tokens_consumed_B": 40.812675072, "train/loss_slope": 2.819835549533941e-06} {"step": 19470, "timestamp": 1778215574.5132806, "train/loss": 2.1928553581237793, "train/z_loss": 0.001585439476184547, "train/perplexity": 8.960762807455856, "train/grad_norm": 0.1318359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024308.7130674005, "perf/iters_per_sec": 0.9652656140648844, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0359842777252197, "data/tokens_consumed": 40833646592, "data/tokens_consumed_B": 40.833646592, "train/loss_slope": -1.7804195170570033e-06} {"step": 19480, "timestamp": 1778215584.8622482, "train/loss": 2.1676883697509766, "train/z_loss": 0.0015811456134542823, "train/perplexity": 8.738061508935985, "train/grad_norm": 0.2177734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027383.8725273053, "perf/iters_per_sec": 0.9667319643627669, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344128847122191, "data/tokens_consumed": 40854618112, "data/tokens_consumed_B": 40.854618112, "train/loss_slope": -5.449842483905075e-06} {"step": 19490, "timestamp": 1778215595.2201684, "train/loss": 2.2705683946609496, "train/z_loss": 0.0015773564926348626, "train/perplexity": 9.6849040976871, "train/grad_norm": 0.1796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027560.7554617014, "perf/iters_per_sec": 0.966816308718539, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343226432800292, "data/tokens_consumed": 40875589632, "data/tokens_consumed_B": 40.875589632, "train/loss_slope": -2.9681664846553814e-06} {"step": 19500, "timestamp": 1778215605.5685003, "grad/layer_0/attn": 0.0025642698165029287, "grad/layer_0/mlp": 0.0025570003781467676, "grad/layer_0/attn_mlp_ratio": 1.002842916306934, "grad/layer_4/attn": 0.001866447739303112, "grad/layer_4/mlp": 0.0025255864020437002, "grad/layer_4/attn_mlp_ratio": 0.7390155664012243, "grad/layer_8/attn": 0.004404783248901367, "grad/layer_8/mlp": 0.0038006440736353397, "grad/layer_8/attn_mlp_ratio": 1.1589570208800064, "grad/layer_12/attn": 0.004365471191704273, "grad/layer_12/mlp": 0.006279891822487116, "grad/layer_12/attn_mlp_ratio": 0.6951506881945435, "grad/layer_16/attn": 0.004134046379476786, "grad/layer_16/mlp": 0.0042762793600559235, "grad/layer_16/attn_mlp_ratio": 0.96673905858873, "grad/layer_20/attn": 0.007312936708331108, "grad/layer_20/mlp": 0.005646334961056709, "grad/layer_20/attn_mlp_ratio": 1.2951652052619236, "grad/layer_24/attn": 0.008935680612921715, "grad/layer_24/mlp": 0.011725625954568386, "grad/layer_24/attn_mlp_ratio": 0.7620642660218822, "grad/layer_27/attn": 0.007939482107758522, "grad/layer_27/mlp": 0.008414508774876595, "grad/layer_27/attn_mlp_ratio": 0.943546703178795} {"step": 19500, "timestamp": 1778215606.1787224, "eos/sharpness": 16.01762771606445, "eos/L0_probe": 2.0550127029418945, "eos/L_plus": 2.1278223991394043, "eos/L_minus": 2.1423792839050293, "eos/grad_norm": 0.10833027958869934, "eos/embed_grad_frac": 0.23866140842437744, "eos/time_s": 0.6074662208557129} {"step": 19500, "timestamp": 1778215606.1985083, "train/loss": 2.2490255355834963, "train/z_loss": 0.0015751250786706806, "train/perplexity": 9.478494878616308, "train/grad_norm": 0.1083984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1911399.0348162781, "perf/iters_per_sec": 0.91142608395399, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0971816778182983, "data/tokens_consumed": 40896561152, "data/tokens_consumed_B": 40.896561152, "train/loss_slope": 3.4567204603303413e-07} {"step": 19500, "timestamp": 1778215607.56244, "geo/rankme_last": 441.18536376953125, "geo/layer_0/stable_rank_q_proj": 16.8962459564209, "geo/layer_0/stable_rank_k_proj": 15.02625846862793, "geo/layer_0/stable_rank_o_proj": 52.45093536376953, "geo/layer_0/stable_rank_gate_proj": 154.3859100341797, "geo/layer_0/stable_rank_down_proj": 50.01960754394531, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.04647423326969147, "geo/layer_0/attn_entropy_mean": 6.2711968421936035, "geo/layer_0/attn_entropy_std": 0.3155319392681122, "geo/layer_7/stable_rank_q_proj": 42.394622802734375, "geo/layer_7/stable_rank_k_proj": 42.1411247253418, "geo/layer_7/stable_rank_o_proj": 110.25531005859375, "geo/layer_7/stable_rank_gate_proj": 104.60435485839844, "geo/layer_7/stable_rank_down_proj": 151.26170349121094, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5624241232872009, "geo/layer_7/attn_entropy_mean": 4.683882713317871, "geo/layer_7/attn_entropy_std": 0.8712592124938965, "geo/layer_14/stable_rank_q_proj": 57.901363372802734, "geo/layer_14/stable_rank_k_proj": 35.17936706542969, "geo/layer_14/stable_rank_o_proj": 53.00554275512695, "geo/layer_14/stable_rank_gate_proj": 89.39187622070312, "geo/layer_14/stable_rank_down_proj": 136.94688415527344, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.382467120885849, "geo/layer_14/attn_entropy_mean": 5.495837211608887, "geo/layer_14/attn_entropy_std": 0.4700937271118164, "geo/layer_21/stable_rank_q_proj": 48.72321701049805, "geo/layer_21/stable_rank_k_proj": 32.05673599243164, "geo/layer_21/stable_rank_o_proj": 84.32183074951172, "geo/layer_21/stable_rank_gate_proj": 89.07807922363281, "geo/layer_21/stable_rank_down_proj": 61.54439163208008, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15523341298103333, "geo/layer_21/attn_entropy_mean": 5.766898155212402, "geo/layer_21/attn_entropy_std": 0.28489258885383606, "geo/layer_27/stable_rank_q_proj": 41.49665451049805, "geo/layer_27/stable_rank_k_proj": 31.56048583984375, "geo/layer_27/stable_rank_o_proj": 117.94258880615234, "geo/layer_27/stable_rank_gate_proj": 91.63996124267578, "geo/layer_27/stable_rank_down_proj": 138.4796905517578, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.072328120470047, "geo/layer_27/attn_entropy_mean": 4.411299705505371, "geo/layer_27/attn_entropy_std": 0.5509257316589355, "attnres/final_alpha/block_0": 0.24259229004383087, "attnres/block_norm/0": 1.6023850440979004, "attnres/final_alpha/block_1": 0.007006368599832058, "attnres/block_norm/1": 28211.48046875, "attnres/final_alpha/block_2": 0.014400525949895382, "attnres/block_norm/2": 20219.380859375, "attnres/final_alpha/block_3": 0.01626841351389885, "attnres/block_norm/3": 28319.84765625, "attnres/final_alpha/block_4": 0.021026667207479477, "attnres/block_norm/4": 9030.8876953125, "attnres/final_alpha/block_5": 0.5606266856193542, "attnres/block_norm/5": 4933.73046875, "attnres/final_alpha/block_6": 0.13807904720306396, "attnres/block_norm/6": 19033.59765625, "geo/tier1_time_s": 1.35982084274292, "geo/step": 19500.0, "geo/rankme_slope": 2.1338457257903163e-05} {"step": 19500, "timestamp": 1778215614.5234838, "geo/ww_alpha_mean": 7.838973327827022, "geo/ww_alpha_std": 4.6552831281033535, "geo/ww_alpha_min": 1.357336465594647, "geo/ww_alpha_max": 31.95913500570259, "geo/ww_alpha_healthy_frac": 0.17766497461928935, "geo/ww_alpha_by_type/q_proj": 4.197727075497024, "geo/ww_alpha_by_type/k_proj": 4.688856200558488, "geo/ww_alpha_by_type/v_proj": 8.922481228503743, "geo/ww_alpha_by_type/o_proj": 7.547644849884014, "geo/ww_alpha_by_type/gate_proj": 8.463107965273112, "geo/ww_alpha_by_type/up_proj": 11.944551766726834, "geo/ww_alpha_by_type/down_proj": 9.227826083393522, "geo/twonn_id/layer_0": 0.7002572417259216, "geo/twonn_id/layer_7": 3.3001859188079834, "geo/twonn_id/layer_14": 4.148796081542969, "geo/twonn_id/layer_21": 7.938876152038574, "geo/twonn_id/layer_27": 5.725849628448486, "geo/tier2_time_s": 6.954761028289795} {"step": 19500, "timestamp": 1778215615.1089842, "eoc/jacobian_sigma/layer_0/attn": 759.6027221679688, "eoc/jacobian_sigma/layer_0/mlp": 5398.45703125, "eoc/jacobian_sigma/layer_0": 5398.45703125, "eoc/jacobian_sigma/layer_7/attn": 1.1503106355667114, "eoc/jacobian_sigma/layer_7/mlp": 1.6244747638702393, "eoc/jacobian_sigma/layer_7": 1.6244747638702393, "eoc/jacobian_sigma/layer_14/attn": 1.5582044124603271, "eoc/jacobian_sigma/layer_14/mlp": 8.714990615844727, "eoc/jacobian_sigma/layer_14": 8.714990615844727, "eoc/jacobian_sigma/layer_21/attn": 1.07448148727417, "eoc/jacobian_sigma/layer_21/mlp": 4.202447891235352, "eoc/jacobian_sigma/layer_21": 4.202447891235352, "eoc/jacobian_sigma/layer_27/attn": 3.7435250282287598, "eoc/jacobian_sigma/layer_27/mlp": 24.677749633789062, "eoc/jacobian_sigma/layer_27": 24.677749633789062, "eoc/layer0_sigma": 5398.45703125, "eoc/sigma_max": 24.677749633789062, "eoc/sigma_min": 1.6244747638702393, "eoc/sigma_mean": 9.804915726184845, "eoc/time_s": 0.5793704986572266} {"step": 19510, "timestamp": 1778215625.4869425, "train/loss": 2.269943857192993, "train/z_loss": 0.0015694267232902348, "train/perplexity": 9.678857400595515, "train/grad_norm": 0.1708984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1087480.6257111402, "perf/iters_per_sec": 0.5185511711650563, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.9284499883651733, "data/tokens_consumed": 40917532672, "data/tokens_consumed_B": 40.917532672, "train/loss_slope": 4.518884014446661e-06} {"step": 19520, "timestamp": 1778215635.8772357, "train/loss": 2.2397950410842897, "train/z_loss": 0.0015803968301042915, "train/perplexity": 9.39140623773199, "train/grad_norm": 0.1083984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020101.1308078584, "perf/iters_per_sec": 0.9632592824973385, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0381420850753784, "data/tokens_consumed": 40938504192, "data/tokens_consumed_B": 40.938504192, "train/loss_slope": 6.694470430472001e-06} {"step": 19530, "timestamp": 1778215646.2544954, "train/loss": 2.2501970529556274, "train/z_loss": 0.001572134206071496, "train/perplexity": 9.489605606963154, "train/grad_norm": 0.1103515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022045.63250992, "perf/iters_per_sec": 0.9641864931630707, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0371437549591065, "data/tokens_consumed": 40959475712, "data/tokens_consumed_B": 40.959475712, "train/loss_slope": 5.363907629471453e-06} {"step": 19540, "timestamp": 1778215656.644523, "train/loss": 2.2553141593933104, "train/z_loss": 0.0015738495625555516, "train/perplexity": 9.538289382705639, "train/grad_norm": 0.216796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019348.8633472682, "perf/iters_per_sec": 0.962900573419222, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.038528823852539, "data/tokens_consumed": 40980447232, "data/tokens_consumed_B": 40.980447232, "train/loss_slope": 4.2050881962356985e-06} {"step": 19550, "timestamp": 1778215667.0080118, "grad/layer_0/attn": 0.0031712637282907963, "grad/layer_0/mlp": 0.002900687512010336, "grad/layer_0/attn_mlp_ratio": 1.0932799916682294, "grad/layer_4/attn": 0.0020079684909433126, "grad/layer_4/mlp": 0.0025131842121481895, "grad/layer_4/attn_mlp_ratio": 0.7989738282374385, "grad/layer_8/attn": 0.004822383169084787, "grad/layer_8/mlp": 0.003981207497417927, "grad/layer_8/attn_mlp_ratio": 1.2112865383388745, "grad/layer_12/attn": 0.004715371876955032, "grad/layer_12/mlp": 0.006052306853234768, "grad/layer_12/attn_mlp_ratio": 0.7791032268174723, "grad/layer_16/attn": 0.004084222950041294, "grad/layer_16/mlp": 0.004550515208393335, "grad/layer_16/attn_mlp_ratio": 0.8975297682238375, "grad/layer_20/attn": 0.006108712870627642, "grad/layer_20/mlp": 0.005513291340321302, "grad/layer_20/attn_mlp_ratio": 1.107997452474894, "grad/layer_24/attn": 0.005654426757246256, "grad/layer_24/mlp": 0.009545523673295975, "grad/layer_24/attn_mlp_ratio": 0.5923642213395105, "grad/layer_27/attn": 0.003673959756270051, "grad/layer_27/mlp": 0.0076705943793058395, "grad/layer_27/attn_mlp_ratio": 0.47896675625101365} {"step": 19550, "timestamp": 1778215667.024033, "train/loss": 2.1929067611694335, "train/z_loss": 0.001580563234165311, "train/perplexity": 8.961223429794138, "train/grad_norm": 0.0966796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021459.4686973095, "perf/iters_per_sec": 0.963906988476424, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037444496154785, "data/tokens_consumed": 41001418752, "data/tokens_consumed_B": 41.001418752, "train/loss_slope": -2.3949086898589967e-07} {"step": 19560, "timestamp": 1778215677.4100554, "train/loss": 2.229779052734375, "train/z_loss": 0.001585060334764421, "train/perplexity": 9.297811526484935, "train/grad_norm": 0.27734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020305.1432869502, "perf/iters_per_sec": 0.9633565632281066, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0380372524261474, "data/tokens_consumed": 41022390272, "data/tokens_consumed_B": 41.022390272, "train/loss_slope": -1.2502575006493826e-06} {"step": 19570, "timestamp": 1778215687.7948587, "train/loss": 2.2730634927749636, "train/z_loss": 0.0015806231182068585, "train/perplexity": 9.709099055480253, "train/grad_norm": 0.1865234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020654.8024382861, "perf/iters_per_sec": 0.9635232937041693, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0378576278686524, "data/tokens_consumed": 41043361792, "data/tokens_consumed_B": 41.043361792, "train/loss_slope": 4.784594059991827e-07} {"step": 19575, "timestamp": 1778215693.600899, "eos/sharpness": 48.399567604064934, "eos/L0_probe": 2.0573954582214355, "eos/L_plus": 2.262287139892578, "eos/L_minus": 2.3364994525909424, "eos/grad_norm": 0.16859222948551178, "eos/embed_grad_frac": 0.08290562778711319, "eos/time_s": 0.6274137496948242} {"step": 19575, "timestamp": 1778215694.9816282, "geo/rankme_last": 441.0423583984375, "geo/layer_0/stable_rank_q_proj": 16.94219970703125, "geo/layer_0/stable_rank_k_proj": 15.04159927368164, "geo/layer_0/stable_rank_o_proj": 52.48519515991211, "geo/layer_0/stable_rank_gate_proj": 153.80824279785156, "geo/layer_0/stable_rank_down_proj": 49.980628967285156, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.04613369703292847, "geo/layer_0/attn_entropy_mean": 6.278876304626465, "geo/layer_0/attn_entropy_std": 0.31330808997154236, "geo/layer_7/stable_rank_q_proj": 42.28749465942383, "geo/layer_7/stable_rank_k_proj": 42.187721252441406, "geo/layer_7/stable_rank_o_proj": 110.74164581298828, "geo/layer_7/stable_rank_gate_proj": 104.38074493408203, "geo/layer_7/stable_rank_down_proj": 151.81341552734375, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5558325052261353, "geo/layer_7/attn_entropy_mean": 4.676190376281738, "geo/layer_7/attn_entropy_std": 0.8616158366203308, "geo/layer_14/stable_rank_q_proj": 57.98944854736328, "geo/layer_14/stable_rank_k_proj": 35.23182678222656, "geo/layer_14/stable_rank_o_proj": 53.045318603515625, "geo/layer_14/stable_rank_gate_proj": 89.4070053100586, "geo/layer_14/stable_rank_down_proj": 137.2337646484375, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37279006838798523, "geo/layer_14/attn_entropy_mean": 5.520999908447266, "geo/layer_14/attn_entropy_std": 0.4985494911670685, "geo/layer_21/stable_rank_q_proj": 48.633419036865234, "geo/layer_21/stable_rank_k_proj": 32.08623123168945, "geo/layer_21/stable_rank_o_proj": 84.4030532836914, "geo/layer_21/stable_rank_gate_proj": 89.08312225341797, "geo/layer_21/stable_rank_down_proj": 61.52033233642578, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15752068161964417, "geo/layer_21/attn_entropy_mean": 5.749841690063477, "geo/layer_21/attn_entropy_std": 0.29228633642196655, "geo/layer_27/stable_rank_q_proj": 41.497596740722656, "geo/layer_27/stable_rank_k_proj": 31.441709518432617, "geo/layer_27/stable_rank_o_proj": 117.87859344482422, "geo/layer_27/stable_rank_gate_proj": 91.782958984375, "geo/layer_27/stable_rank_down_proj": 138.40280151367188, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07624508440494537, "geo/layer_27/attn_entropy_mean": 4.400813102722168, "geo/layer_27/attn_entropy_std": 0.5492873787879944, "attnres/final_alpha/block_0": 0.2448316216468811, "attnres/block_norm/0": 1.6032829284667969, "attnres/final_alpha/block_1": 0.007222817279398441, "attnres/block_norm/1": 28269.583984375, "attnres/final_alpha/block_2": 0.014498058706521988, "attnres/block_norm/2": 20317.98828125, "attnres/final_alpha/block_3": 0.0165092796087265, "attnres/block_norm/3": 28425.599609375, "attnres/final_alpha/block_4": 0.021835779771208763, "attnres/block_norm/4": 9018.5888671875, "attnres/final_alpha/block_5": 0.553645670413971, "attnres/block_norm/5": 4997.130859375, "attnres/final_alpha/block_6": 0.14145678281784058, "attnres/block_norm/6": 19099.2421875, "geo/tier1_time_s": 1.3612937927246094, "geo/step": 19575.0, "geo/rankme_slope": 5.533307072829132e-06} {"step": 19580, "timestamp": 1778215700.1823566, "train/loss": 2.2313833951950075, "train/z_loss": 0.0015635903808288276, "train/perplexity": 9.312740372596556, "train/grad_norm": 0.103515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1693988.9185115981, "perf/iters_per_sec": 0.8077568619306555, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2379962921142578, "data/tokens_consumed": 41064333312, "data/tokens_consumed_B": 41.064333312, "train/loss_slope": -4.805716243621554e-07} {"step": 19590, "timestamp": 1778215710.5594704, "train/loss": 2.2329450368881227, "train/z_loss": 0.0015718657989054919, "train/perplexity": 9.327294897758344, "train/grad_norm": 0.28125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021874.4044505039, "perf/iters_per_sec": 0.9641048452618141, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0372315883636474, "data/tokens_consumed": 41085304832, "data/tokens_consumed_B": 41.085304832, "train/loss_slope": -8.041531339336849e-07} {"step": 19600, "timestamp": 1778215720.929071, "grad/layer_0/attn": 0.0030075975228101015, "grad/layer_0/mlp": 0.0027184265200048685, "grad/layer_0/attn_mlp_ratio": 1.1063743640079258, "grad/layer_4/attn": 0.0017722012707963586, "grad/layer_4/mlp": 0.002440167823806405, "grad/layer_4/attn_mlp_ratio": 0.726262013981362, "grad/layer_8/attn": 0.00953377690166235, "grad/layer_8/mlp": 0.0036755562759935856, "grad/layer_8/attn_mlp_ratio": 2.5938323144574693, "grad/layer_12/attn": 0.004497138783335686, "grad/layer_12/mlp": 0.006107271648943424, "grad/layer_12/attn_mlp_ratio": 0.736358061046441, "grad/layer_16/attn": 0.004226305056363344, "grad/layer_16/mlp": 0.0046479357406497, "grad/layer_16/attn_mlp_ratio": 0.909286444834527, "grad/layer_20/attn": 0.005288544110953808, "grad/layer_20/mlp": 0.0070922463200986385, "grad/layer_20/attn_mlp_ratio": 0.7456796898605583, "grad/layer_24/attn": 0.009358702227473259, "grad/layer_24/mlp": 0.011613210663199425, "grad/layer_24/attn_mlp_ratio": 0.8058669060867839, "grad/layer_27/attn": 0.019561678171157837, "grad/layer_27/mlp": 0.008393910713493824, "grad/layer_27/attn_mlp_ratio": 2.3304605690724056} {"step": 19600, "timestamp": 1778215720.9449158, "train/loss": 2.227329397201538, "train/z_loss": 0.0015815472812391817, "train/perplexity": 9.27506296548079, "train/grad_norm": 0.1474609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020102.290644081, "perf/iters_per_sec": 0.9632598355503469, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0381414890289307, "data/tokens_consumed": 41106276352, "data/tokens_consumed_B": 41.106276352, "train/loss_slope": -3.468628084198177e-06} {"step": 19610, "timestamp": 1778215731.3276172, "train/loss": 2.2279537677764893, "train/z_loss": 0.0015858583385124803, "train/perplexity": 9.28085585014243, "train/grad_norm": 0.1240234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022065.7132701601, "perf/iters_per_sec": 0.9641960684157181, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0371334552764893, "data/tokens_consumed": 41127247872, "data/tokens_consumed_B": 41.127247872, "train/loss_slope": -1.8412093446664382e-06} {"step": 19620, "timestamp": 1778215741.7100298, "train/loss": 2.174933409690857, "train/z_loss": 0.0015972477849572897, "train/perplexity": 8.801599001474383, "train/grad_norm": 0.134765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020876.8017991586, "perf/iters_per_sec": 0.9636291512485307, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037743616104126, "data/tokens_consumed": 41148219392, "data/tokens_consumed_B": 41.148219392, "train/loss_slope": -8.138322186405636e-06} {"step": 19630, "timestamp": 1778215752.0948398, "train/loss": 2.2412942171096804, "train/z_loss": 0.0015861300984397531, "train/perplexity": 9.40549616781203, "train/grad_norm": 0.1142578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020429.7888797235, "perf/iters_per_sec": 0.9634159988783472, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0379732131958008, "data/tokens_consumed": 41169190912, "data/tokens_consumed_B": 41.169190912, "train/loss_slope": -9.495023432129875e-06} {"step": 19640, "timestamp": 1778215762.4751425, "train/loss": 2.2575451374053954, "train/z_loss": 0.0015767157543450593, "train/perplexity": 9.559592851540556, "train/grad_norm": 0.2412109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021431.7813994025, "perf/iters_per_sec": 0.9638937861439717, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0374587059020997, "data/tokens_consumed": 41190162432, "data/tokens_consumed_B": 41.190162432, "train/loss_slope": -6.209707117066367e-06} {"step": 19650, "timestamp": 1778215772.8493197, "grad/layer_0/attn": 0.003033342771232128, "grad/layer_0/mlp": 0.0028999438509345055, "grad/layer_0/attn_mlp_ratio": 1.0460004822695401, "grad/layer_4/attn": 0.0016770272050052881, "grad/layer_4/mlp": 0.0025946402456611395, "grad/layer_4/attn_mlp_ratio": 0.6463428381546905, "grad/layer_8/attn": 0.013124087825417519, "grad/layer_8/mlp": 0.003871731460094452, "grad/layer_8/attn_mlp_ratio": 3.3897204962983936, "grad/layer_12/attn": 0.0046527874656021595, "grad/layer_12/mlp": 0.006211129482835531, "grad/layer_12/attn_mlp_ratio": 0.7491048775507996, "grad/layer_16/attn": 0.005491995252668858, "grad/layer_16/mlp": 0.0050079976208508015, "grad/layer_16/attn_mlp_ratio": 1.0966449185475728, "grad/layer_20/attn": 0.006593470927327871, "grad/layer_20/mlp": 0.006239085923880339, "grad/layer_20/attn_mlp_ratio": 1.0568007721148758, "grad/layer_24/attn": 0.005896606482565403, "grad/layer_24/mlp": 0.008726410567760468, "grad/layer_24/attn_mlp_ratio": 0.675719572120332, "grad/layer_27/attn": 0.00617615133523941, "grad/layer_27/mlp": 0.0072555989027023315, "grad/layer_27/attn_mlp_ratio": 0.8512255615200229} {"step": 19650, "timestamp": 1778215773.4656277, "eos/sharpness": 38.18504810333251, "eos/L0_probe": 2.0566632747650146, "eos/L_plus": 2.2021443843841553, "eos/L_minus": 2.293032646179199, "eos/grad_norm": 0.13128530979156494, "eos/embed_grad_frac": 0.1293700635433197, "eos/time_s": 0.6135611534118652} {"step": 19650, "timestamp": 1778215773.485327, "train/loss": 2.2541526794433593, "train/z_loss": 0.0015628262073732913, "train/perplexity": 9.527217282086953, "train/grad_norm": 0.130859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1905567.3590699334, "perf/iters_per_sec": 0.9086453242635409, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1005394220352174, "data/tokens_consumed": 41211133952, "data/tokens_consumed_B": 41.211133952, "train/loss_slope": -6.689184605449357e-06} {"step": 19650, "timestamp": 1778215774.848535, "geo/rankme_last": 441.2682800292969, "geo/layer_0/stable_rank_q_proj": 16.95122528076172, "geo/layer_0/stable_rank_k_proj": 15.053082466125488, "geo/layer_0/stable_rank_o_proj": 52.58875274658203, "geo/layer_0/stable_rank_gate_proj": 153.9364776611328, "geo/layer_0/stable_rank_down_proj": 50.03373718261719, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.04661336913704872, "geo/layer_0/attn_entropy_mean": 6.277316093444824, "geo/layer_0/attn_entropy_std": 0.31463316082954407, "geo/layer_7/stable_rank_q_proj": 42.28310012817383, "geo/layer_7/stable_rank_k_proj": 42.18338394165039, "geo/layer_7/stable_rank_o_proj": 111.07678985595703, "geo/layer_7/stable_rank_gate_proj": 104.09729766845703, "geo/layer_7/stable_rank_down_proj": 152.2239227294922, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5528174042701721, "geo/layer_7/attn_entropy_mean": 4.648778438568115, "geo/layer_7/attn_entropy_std": 0.8360198736190796, "geo/layer_14/stable_rank_q_proj": 57.911293029785156, "geo/layer_14/stable_rank_k_proj": 35.20234680175781, "geo/layer_14/stable_rank_o_proj": 53.09159851074219, "geo/layer_14/stable_rank_gate_proj": 89.21578979492188, "geo/layer_14/stable_rank_down_proj": 137.18612670898438, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3634456694126129, "geo/layer_14/attn_entropy_mean": 5.497982501983643, "geo/layer_14/attn_entropy_std": 0.4947715401649475, "geo/layer_21/stable_rank_q_proj": 48.64366149902344, "geo/layer_21/stable_rank_k_proj": 32.083404541015625, "geo/layer_21/stable_rank_o_proj": 84.54243469238281, "geo/layer_21/stable_rank_gate_proj": 88.98625946044922, "geo/layer_21/stable_rank_down_proj": 61.462181091308594, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15983402729034424, "geo/layer_21/attn_entropy_mean": 5.745068073272705, "geo/layer_21/attn_entropy_std": 0.3065117597579956, "geo/layer_27/stable_rank_q_proj": 41.53892135620117, "geo/layer_27/stable_rank_k_proj": 31.59010887145996, "geo/layer_27/stable_rank_o_proj": 118.03948211669922, "geo/layer_27/stable_rank_gate_proj": 91.7436294555664, "geo/layer_27/stable_rank_down_proj": 138.5021514892578, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07360805571079254, "geo/layer_27/attn_entropy_mean": 4.390637397766113, "geo/layer_27/attn_entropy_std": 0.5586690902709961, "attnres/final_alpha/block_0": 0.24407070875167847, "attnres/block_norm/0": 1.6042909622192383, "attnres/final_alpha/block_1": 0.0072263143956661224, "attnres/block_norm/1": 28342.94921875, "attnres/final_alpha/block_2": 0.014474371448159218, "attnres/block_norm/2": 20365.384765625, "attnres/final_alpha/block_3": 0.016626324504613876, "attnres/block_norm/3": 28420.0859375, "attnres/final_alpha/block_4": 0.02149050310254097, "attnres/block_norm/4": 9077.3984375, "attnres/final_alpha/block_5": 0.5553798079490662, "attnres/block_norm/5": 4998.9208984375, "attnres/final_alpha/block_6": 0.14073196053504944, "attnres/block_norm/6": 19104.443359375, "geo/tier1_time_s": 1.3590867519378662, "geo/step": 19650.0, "geo/rankme_slope": 7.409936630902361e-06} {"step": 19660, "timestamp": 1778215785.2296855, "train/loss": 2.2883967638015745, "train/z_loss": 0.0015468403697013855, "train/perplexity": 9.859118507949523, "train/grad_norm": 0.10986328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1786246.4422591385, "perf/iters_per_sec": 0.85174867737729, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.174055242538452, "data/tokens_consumed": 41232105472, "data/tokens_consumed_B": 41.232105472, "train/loss_slope": -2.7837227530354843e-06} {"step": 19670, "timestamp": 1778215795.6112707, "train/loss": 2.274904751777649, "train/z_loss": 0.0015688676736317574, "train/perplexity": 9.726992489692291, "train/grad_norm": 0.212890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021086.4043001353, "perf/iters_per_sec": 0.9637290975094487, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0376359939575195, "data/tokens_consumed": 41253076992, "data/tokens_consumed_B": 41.253076992, "train/loss_slope": 4.8338625976855564e-06} {"step": 19680, "timestamp": 1778215805.9982123, "train/loss": 2.2739444017410277, "train/z_loss": 0.0015581597806885838, "train/perplexity": 9.71765565613042, "train/grad_norm": 0.2021484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019958.0636265823, "perf/iters_per_sec": 0.9631910627491866, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0382156133651734, "data/tokens_consumed": 41274048512, "data/tokens_consumed_B": 41.274048512, "train/loss_slope": 7.791024187181282e-06} {"step": 19690, "timestamp": 1778215816.3859968, "train/loss": 2.267636299133301, "train/z_loss": 0.001557455863803625, "train/perplexity": 9.656548624489403, "train/grad_norm": 0.201171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019766.1794529036, "perf/iters_per_sec": 0.9630995652451055, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0383142471313476, "data/tokens_consumed": 41295020032, "data/tokens_consumed_B": 41.295020032, "train/loss_slope": 7.648259213547987e-06} {"step": 19700, "timestamp": 1778215826.7594206, "grad/layer_0/attn": 0.0030076748225837946, "grad/layer_0/mlp": 0.002706493018195033, "grad/layer_0/attn_mlp_ratio": 1.1112811639401545, "grad/layer_4/attn": 0.0015998190501704812, "grad/layer_4/mlp": 0.0024765734560787678, "grad/layer_4/attn_mlp_ratio": 0.6459808335769853, "grad/layer_8/attn": 0.006014946382492781, "grad/layer_8/mlp": 0.00379168544895947, "grad/layer_8/attn_mlp_ratio": 1.586351585548392, "grad/layer_12/attn": 0.004776617046445608, "grad/layer_12/mlp": 0.005974076688289642, "grad/layer_12/attn_mlp_ratio": 0.7995573568469544, "grad/layer_16/attn": 0.006772870197892189, "grad/layer_16/mlp": 0.004908398725092411, "grad/layer_16/attn_mlp_ratio": 1.3798532758316915, "grad/layer_20/attn": 0.006437996868044138, "grad/layer_20/mlp": 0.007653709966689348, "grad/layer_20/attn_mlp_ratio": 0.8411602754674149, "grad/layer_24/attn": 0.022287650033831596, "grad/layer_24/mlp": 0.01425405964255333, "grad/layer_24/attn_mlp_ratio": 1.5636001557714259, "grad/layer_27/attn": 0.013538307510316372, "grad/layer_27/mlp": 0.011712679639458656, "grad/layer_27/attn_mlp_ratio": 1.1558676418605889} {"step": 19700, "timestamp": 1778215826.775221, "train/loss": 2.26417281627655, "train/z_loss": 0.001566306420136243, "train/perplexity": 9.62316118565962, "train/grad_norm": 0.2314453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019695.1307970677, "perf/iters_per_sec": 0.9630656866059626, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.038350772857666, "data/tokens_consumed": 41315991552, "data/tokens_consumed_B": 41.315991552, "train/loss_slope": 8.996974860373583e-06} {"step": 19710, "timestamp": 1778215837.1628766, "train/loss": 2.237605929374695, "train/z_loss": 0.0015584066393785178, "train/perplexity": 9.370869886751818, "train/grad_norm": 0.0966796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019959.2696854328, "perf/iters_per_sec": 0.9631916378428615, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0382149934768676, "data/tokens_consumed": 41336963072, "data/tokens_consumed_B": 41.336963072, "train/loss_slope": 6.906623680098696e-06} {"step": 19720, "timestamp": 1778215847.5519319, "train/loss": 2.1698043823242186, "train/z_loss": 0.0015844807494431734, "train/perplexity": 8.756570933135478, "train/grad_norm": 0.24609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019582.3536399165, "perf/iters_per_sec": 0.9630119102668364, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0384087562561035, "data/tokens_consumed": 41357934592, "data/tokens_consumed_B": 41.357934592, "train/loss_slope": 4.92669837166514e-06} {"step": 19725, "timestamp": 1778215853.3425593, "eos/sharpness": 15.472030639648434, "eos/L0_probe": 2.054439067840576, "eos/L_plus": 2.1476376056671143, "eos/L_minus": 2.1159608364105225, "eos/grad_norm": 0.12338821589946747, "eos/embed_grad_frac": 0.16970568895339966, "eos/time_s": 0.6129426956176758} {"step": 19725, "timestamp": 1778215854.7208467, "geo/rankme_last": 440.87139892578125, "geo/layer_0/stable_rank_q_proj": 16.97952651977539, "geo/layer_0/stable_rank_k_proj": 15.056265830993652, "geo/layer_0/stable_rank_o_proj": 52.72059631347656, "geo/layer_0/stable_rank_gate_proj": 153.8314971923828, "geo/layer_0/stable_rank_down_proj": 50.00583267211914, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.044836126267910004, "geo/layer_0/attn_entropy_mean": 6.272254943847656, "geo/layer_0/attn_entropy_std": 0.3135514557361603, "geo/layer_7/stable_rank_q_proj": 42.385154724121094, "geo/layer_7/stable_rank_k_proj": 42.29509735107422, "geo/layer_7/stable_rank_o_proj": 110.85908508300781, "geo/layer_7/stable_rank_gate_proj": 104.1151351928711, "geo/layer_7/stable_rank_down_proj": 151.70632934570312, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5644065141677856, "geo/layer_7/attn_entropy_mean": 4.655881881713867, "geo/layer_7/attn_entropy_std": 0.8669373989105225, "geo/layer_14/stable_rank_q_proj": 57.915557861328125, "geo/layer_14/stable_rank_k_proj": 35.112422943115234, "geo/layer_14/stable_rank_o_proj": 53.23295974731445, "geo/layer_14/stable_rank_gate_proj": 89.34845733642578, "geo/layer_14/stable_rank_down_proj": 136.8316650390625, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.36987075209617615, "geo/layer_14/attn_entropy_mean": 5.516120433807373, "geo/layer_14/attn_entropy_std": 0.47619909048080444, "geo/layer_21/stable_rank_q_proj": 48.61787033081055, "geo/layer_21/stable_rank_k_proj": 32.0813102722168, "geo/layer_21/stable_rank_o_proj": 84.52892303466797, "geo/layer_21/stable_rank_gate_proj": 88.71365356445312, "geo/layer_21/stable_rank_down_proj": 61.37964630126953, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15336260199546814, "geo/layer_21/attn_entropy_mean": 5.7614946365356445, "geo/layer_21/attn_entropy_std": 0.28586915135383606, "geo/layer_27/stable_rank_q_proj": 41.586753845214844, "geo/layer_27/stable_rank_k_proj": 31.624984741210938, "geo/layer_27/stable_rank_o_proj": 118.04987335205078, "geo/layer_27/stable_rank_gate_proj": 91.8875961303711, "geo/layer_27/stable_rank_down_proj": 138.33792114257812, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07354965806007385, "geo/layer_27/attn_entropy_mean": 4.395011901855469, "geo/layer_27/attn_entropy_std": 0.5545159578323364, "attnres/final_alpha/block_0": 0.24178333580493927, "attnres/block_norm/0": 1.6049950122833252, "attnres/final_alpha/block_1": 0.007037163712084293, "attnres/block_norm/1": 28267.791015625, "attnres/final_alpha/block_2": 0.014240957796573639, "attnres/block_norm/2": 20435.154296875, "attnres/final_alpha/block_3": 0.016250120475888252, "attnres/block_norm/3": 28767.34765625, "attnres/final_alpha/block_4": 0.020687416195869446, "attnres/block_norm/4": 9051.291015625, "attnres/final_alpha/block_5": 0.5628700852394104, "attnres/block_norm/5": 4932.5302734375, "attnres/final_alpha/block_6": 0.13713090121746063, "attnres/block_norm/6": 19181.19921875, "geo/tier1_time_s": 1.3590247631072998, "geo/step": 19725.0, "geo/rankme_slope": -3.0234750150060004e-06} {"step": 19730, "timestamp": 1778215859.911234, "train/loss": 2.2508732795715334, "train/z_loss": 0.0015652531059458853, "train/perplexity": 9.496024901053174, "train/grad_norm": 0.1669921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1697539.9502341975, "perf/iters_per_sec": 0.809450125805949, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2354065656661988, "data/tokens_consumed": 41378906112, "data/tokens_consumed_B": 41.378906112, "train/loss_slope": 4.134608946009634e-06} {"step": 19740, "timestamp": 1778215870.2908285, "train/loss": 2.2725802421569825, "train/z_loss": 0.0015707595623098314, "train/perplexity": 9.704408260867638, "train/grad_norm": 0.244140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021392.2028879037, "perf/iters_per_sec": 0.9638749136390227, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0374790191650392, "data/tokens_consumed": 41399877632, "data/tokens_consumed_B": 41.399877632, "train/loss_slope": 6.928031079017301e-06} {"step": 19750, "timestamp": 1778215880.6619797, "grad/layer_0/attn": 0.0034840304870158434, "grad/layer_0/mlp": 0.0030766883864998817, "grad/layer_0/attn_mlp_ratio": 1.1323962443072566, "grad/layer_4/attn": 0.0016163610853254795, "grad/layer_4/mlp": 0.0026663318276405334, "grad/layer_4/attn_mlp_ratio": 0.606211502990108, "grad/layer_8/attn": 0.004735433030873537, "grad/layer_8/mlp": 0.004090277012437582, "grad/layer_8/attn_mlp_ratio": 1.1577291466326782, "grad/layer_12/attn": 0.004751742351800203, "grad/layer_12/mlp": 0.006746124010533094, "grad/layer_12/attn_mlp_ratio": 0.704366281133347, "grad/layer_16/attn": 0.004397910553961992, "grad/layer_16/mlp": 0.00542100565508008, "grad/layer_16/attn_mlp_ratio": 0.8112720688113392, "grad/layer_20/attn": 0.004180623218417168, "grad/layer_20/mlp": 0.007554454728960991, "grad/layer_20/attn_mlp_ratio": 0.5533984004233108, "grad/layer_24/attn": 0.018309755250811577, "grad/layer_24/mlp": 0.0137368468567729, "grad/layer_24/attn_mlp_ratio": 1.332893589659163, "grad/layer_27/attn": 0.0061288801953196526, "grad/layer_27/mlp": 0.012821037322282791, "grad/layer_27/attn_mlp_ratio": 0.47803309462834426} {"step": 19750, "timestamp": 1778215880.6778042, "train/loss": 2.2732783794403075, "train/z_loss": 0.0015526957693509757, "train/perplexity": 9.71118563558087, "train/grad_norm": 0.216796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020503.6739443233, "perf/iters_per_sec": 0.9634512300225846, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0379352569580078, "data/tokens_consumed": 41420849152, "data/tokens_consumed_B": 41.420849152, "train/loss_slope": 9.071073122937257e-06} {"step": 19760, "timestamp": 1778215891.0547664, "train/loss": 2.235435366630554, "train/z_loss": 0.001572398527059704, "train/perplexity": 9.35055188442198, "train/grad_norm": 0.193359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022035.5922793632, "perf/iters_per_sec": 0.9641817056080643, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037148904800415, "data/tokens_consumed": 41441820672, "data/tokens_consumed_B": 41.441820672, "train/loss_slope": 7.526219531361668e-06} {"step": 19770, "timestamp": 1778215901.4286897, "train/loss": 2.2176271438598634, "train/z_loss": 0.0015789601020514965, "train/perplexity": 9.185509094503834, "train/grad_norm": 0.2177734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022298.2514910498, "perf/iters_per_sec": 0.9643069512801408, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0370141983032226, "data/tokens_consumed": 41462792192, "data/tokens_consumed_B": 41.462792192, "train/loss_slope": 5.214333570007044e-06} {"step": 19780, "timestamp": 1778215911.8120458, "train/loss": 2.206719899177551, "train/z_loss": 0.0015785109251737594, "train/perplexity": 9.085864908905606, "train/grad_norm": 0.1435546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020889.3377223264, "perf/iters_per_sec": 0.9636351288425095, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0377371788024903, "data/tokens_consumed": 41483763712, "data/tokens_consumed_B": 41.483763712, "train/loss_slope": 5.5774914048792805e-06} {"step": 19790, "timestamp": 1778215922.188578, "train/loss": 2.2218618154525758, "train/z_loss": 0.0015747279627248644, "train/perplexity": 9.224489184620408, "train/grad_norm": 0.1279296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022081.703790416, "perf/iters_per_sec": 0.9642036932899551, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037125253677368, "data/tokens_consumed": 41504735232, "data/tokens_consumed_B": 41.504735232, "train/loss_slope": 4.454999352016477e-06} {"step": 19800, "timestamp": 1778215932.553285, "grad/layer_0/attn": 0.0031048916280269623, "grad/layer_0/mlp": 0.0026987912133336067, "grad/layer_0/attn_mlp_ratio": 1.1504748858079477, "grad/layer_4/attn": 0.001979645574465394, "grad/layer_4/mlp": 0.002565400442108512, "grad/layer_4/attn_mlp_ratio": 0.7716711452934816, "grad/layer_8/attn": 0.005294636823236942, "grad/layer_8/mlp": 0.0038340629544109106, "grad/layer_8/attn_mlp_ratio": 1.380946726253161, "grad/layer_12/attn": 0.0038825985975563526, "grad/layer_12/mlp": 0.00631303247064352, "grad/layer_12/attn_mlp_ratio": 0.615013237158157, "grad/layer_16/attn": 0.003758494509384036, "grad/layer_16/mlp": 0.004458369221538305, "grad/layer_16/attn_mlp_ratio": 0.8430200008839138, "grad/layer_20/attn": 0.0038713044486939907, "grad/layer_20/mlp": 0.0062605468556284904, "grad/layer_20/attn_mlp_ratio": 0.6183652125176584, "grad/layer_24/attn": 0.010152324102818966, "grad/layer_24/mlp": 0.009075590409338474, "grad/layer_24/attn_mlp_ratio": 1.118640609927538, "grad/layer_27/attn": 0.005356217734515667, "grad/layer_27/mlp": 0.0076012625358998775, "grad/layer_27/attn_mlp_ratio": 0.7046484236998831} {"step": 19800, "timestamp": 1778215933.1710525, "eos/sharpness": 45.7148790359497, "eos/L0_probe": 2.053067922592163, "eos/L_plus": 2.3438777923583984, "eos/L_minus": 2.219406843185425, "eos/grad_norm": 0.13218222558498383, "eos/embed_grad_frac": 0.1390649527311325, "eos/time_s": 0.6149933338165283} {"step": 19800, "timestamp": 1778215933.1929216, "train/loss": 2.236117696762085, "train/z_loss": 0.0015710783191025257, "train/perplexity": 9.356934224903153, "train/grad_norm": 0.1318359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1906568.1428031314, "perf/iters_per_sec": 0.909122535134855, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0999617338180543, "data/tokens_consumed": 41525706752, "data/tokens_consumed_B": 41.525706752, "train/loss_slope": 4.727394002141273e-06} {"step": 19800, "timestamp": 1778215934.5559742, "geo/rankme_last": 439.7751770019531, "geo/layer_0/stable_rank_q_proj": 17.003894805908203, "geo/layer_0/stable_rank_k_proj": 15.0828857421875, "geo/layer_0/stable_rank_o_proj": 52.83686447143555, "geo/layer_0/stable_rank_gate_proj": 153.4131622314453, "geo/layer_0/stable_rank_down_proj": 50.01313781738281, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.051068928092718124, "geo/layer_0/attn_entropy_mean": 6.2787628173828125, "geo/layer_0/attn_entropy_std": 0.30967608094215393, "geo/layer_7/stable_rank_q_proj": 42.295265197753906, "geo/layer_7/stable_rank_k_proj": 42.32963943481445, "geo/layer_7/stable_rank_o_proj": 111.3137435913086, "geo/layer_7/stable_rank_gate_proj": 104.0475082397461, "geo/layer_7/stable_rank_down_proj": 151.3342742919922, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5575605630874634, "geo/layer_7/attn_entropy_mean": 4.623414039611816, "geo/layer_7/attn_entropy_std": 0.8535289168357849, "geo/layer_14/stable_rank_q_proj": 57.98170471191406, "geo/layer_14/stable_rank_k_proj": 35.112247467041016, "geo/layer_14/stable_rank_o_proj": 53.24399185180664, "geo/layer_14/stable_rank_gate_proj": 89.21092224121094, "geo/layer_14/stable_rank_down_proj": 137.12893676757812, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3614346385002136, "geo/layer_14/attn_entropy_mean": 5.493024826049805, "geo/layer_14/attn_entropy_std": 0.48331743478775024, "geo/layer_21/stable_rank_q_proj": 48.47416305541992, "geo/layer_21/stable_rank_k_proj": 31.96851921081543, "geo/layer_21/stable_rank_o_proj": 84.44876098632812, "geo/layer_21/stable_rank_gate_proj": 88.6104736328125, "geo/layer_21/stable_rank_down_proj": 61.27695846557617, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15666857361793518, "geo/layer_21/attn_entropy_mean": 5.7459492683410645, "geo/layer_21/attn_entropy_std": 0.3041372001171112, "geo/layer_27/stable_rank_q_proj": 41.63558578491211, "geo/layer_27/stable_rank_k_proj": 31.7666015625, "geo/layer_27/stable_rank_o_proj": 118.2098159790039, "geo/layer_27/stable_rank_gate_proj": 91.70429992675781, "geo/layer_27/stable_rank_down_proj": 138.69236755371094, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08665063232183456, "geo/layer_27/attn_entropy_mean": 4.397876739501953, "geo/layer_27/attn_entropy_std": 0.5486632585525513, "attnres/final_alpha/block_0": 0.24283656477928162, "attnres/block_norm/0": 1.6057391166687012, "attnres/final_alpha/block_1": 0.006924851797521114, "attnres/block_norm/1": 28415.759765625, "attnres/final_alpha/block_2": 0.014352218247950077, "attnres/block_norm/2": 20425.296875, "attnres/final_alpha/block_3": 0.016341764479875565, "attnres/block_norm/3": 28806.48046875, "attnres/final_alpha/block_4": 0.021155264228582382, "attnres/block_norm/4": 9088.076171875, "attnres/final_alpha/block_5": 0.5619120597839355, "attnres/block_norm/5": 4970.28125, "attnres/final_alpha/block_6": 0.13647723197937012, "attnres/block_norm/6": 19258.3671875, "geo/tier1_time_s": 1.3588964939117432, "geo/step": 19800.0, "geo/rankme_slope": -2.771464054371749e-05} {"step": 19810, "timestamp": 1778215944.9460113, "train/loss": 2.231998324394226, "train/z_loss": 0.0015654358197934926, "train/perplexity": 9.318468809687513, "train/grad_norm": 0.10205078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1784944.6909826891, "perf/iters_per_sec": 0.8511279539979406, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1749114751815797, "data/tokens_consumed": 41546678272, "data/tokens_consumed_B": 41.546678272, "train/loss_slope": 4.289749267399549e-06} {"step": 19820, "timestamp": 1778215955.3260946, "train/loss": 2.287850046157837, "train/z_loss": 0.0015598166384734214, "train/perplexity": 9.853729827087202, "train/grad_norm": 0.1953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021291.54480766, "perf/iters_per_sec": 0.9638269161260891, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0375306844711303, "data/tokens_consumed": 41567649792, "data/tokens_consumed_B": 41.567649792, "train/loss_slope": 6.570430779077927e-06} {"step": 19830, "timestamp": 1778215965.710501, "train/loss": 2.192851948738098, "train/z_loss": 0.0015802260488271714, "train/perplexity": 8.960732256811527, "train/grad_norm": 0.2080078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020446.9137685087, "perf/iters_per_sec": 0.96342416466165, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037964415550232, "data/tokens_consumed": 41588621312, "data/tokens_consumed_B": 41.588621312, "train/loss_slope": 2.9341620008329028e-06} {"step": 19840, "timestamp": 1778215976.0891304, "train/loss": 2.2655192852020263, "train/z_loss": 0.0015644215396605431, "train/perplexity": 9.63612720036998, "train/grad_norm": 0.16015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021916.232732694, "perf/iters_per_sec": 0.9641247905410261, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0372101306915282, "data/tokens_consumed": 41609592832, "data/tokens_consumed_B": 41.609592832, "train/loss_slope": 5.7497059157972195e-06} {"step": 19850, "timestamp": 1778215986.4656353, "grad/layer_0/attn": 0.003234551288187504, "grad/layer_0/mlp": 0.002783674281090498, "grad/layer_0/attn_mlp_ratio": 1.1619718563923327, "grad/layer_4/attn": 0.0016316813416779041, "grad/layer_4/mlp": 0.0025936607271432877, "grad/layer_4/attn_mlp_ratio": 0.6291035915729475, "grad/layer_8/attn": 0.01125466637313366, "grad/layer_8/mlp": 0.003927893005311489, "grad/layer_8/attn_mlp_ratio": 2.8653189054240173, "grad/layer_12/attn": 0.004894710145890713, "grad/layer_12/mlp": 0.006420904770493507, "grad/layer_12/attn_mlp_ratio": 0.7623084665813633, "grad/layer_16/attn": 0.00396551052108407, "grad/layer_16/mlp": 0.004475691355764866, "grad/layer_16/attn_mlp_ratio": 0.8860107003078458, "grad/layer_20/attn": 0.004430763423442841, "grad/layer_20/mlp": 0.007086408790200949, "grad/layer_20/attn_mlp_ratio": 0.6252480617608276, "grad/layer_24/attn": 0.012339042499661446, "grad/layer_24/mlp": 0.012574930675327778, "grad/layer_24/attn_mlp_ratio": 0.9812413857474946, "grad/layer_27/attn": 0.00913060363382101, "grad/layer_27/mlp": 0.010484600439667702, "grad/layer_27/attn_mlp_ratio": 0.8708585128518779} {"step": 19850, "timestamp": 1778215986.4814434, "train/loss": 2.258502697944641, "train/z_loss": 0.0015584281762130558, "train/perplexity": 9.568751124527072, "train/grad_norm": 0.1396484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019123.4453006857, "perf/iters_per_sec": 0.9627930857184819, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0386447668075562, "data/tokens_consumed": 41630564352, "data/tokens_consumed_B": 41.630564352, "train/loss_slope": 8.205108788504838e-06} {"step": 19860, "timestamp": 1778215997.4015338, "train/loss": 2.230057430267334, "train/z_loss": 0.0015635892981663347, "train/perplexity": 9.300400188615571, "train/grad_norm": 0.1865234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1921457.9434872342, "perf/iters_per_sec": 0.9162225453792735, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0914378881454467, "data/tokens_consumed": 41651535872, "data/tokens_consumed_B": 41.651535872, "train/loss_slope": 7.715866367558652e-06} {"step": 19870, "timestamp": 1778216007.7924829, "train/loss": 2.198580598831177, "train/z_loss": 0.0015825982089154422, "train/perplexity": 9.012212471770379, "train/grad_norm": 0.171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020918.635245528, "perf/iters_per_sec": 0.9636490989902153, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0377221345901488, "data/tokens_consumed": 41672507392, "data/tokens_consumed_B": 41.672507392, "train/loss_slope": 4.522338902095435e-06} {"step": 19875, "timestamp": 1778216013.5827127, "eos/sharpness": 16.718697547912594, "eos/L0_probe": 2.053942918777466, "eos/L_plus": 2.1410326957702637, "eos/L_minus": 2.134040117263794, "eos/grad_norm": 0.11033803224563599, "eos/embed_grad_frac": 0.22699688374996185, "eos/time_s": 0.6117968559265137} {"step": 19875, "timestamp": 1778216014.9633317, "geo/rankme_last": 440.60931396484375, "geo/layer_0/stable_rank_q_proj": 17.013473510742188, "geo/layer_0/stable_rank_k_proj": 15.109055519104004, "geo/layer_0/stable_rank_o_proj": 52.68053436279297, "geo/layer_0/stable_rank_gate_proj": 153.260009765625, "geo/layer_0/stable_rank_down_proj": 50.13473129272461, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.04679715260863304, "geo/layer_0/attn_entropy_mean": 6.280531883239746, "geo/layer_0/attn_entropy_std": 0.29925692081451416, "geo/layer_7/stable_rank_q_proj": 42.247962951660156, "geo/layer_7/stable_rank_k_proj": 42.01151657104492, "geo/layer_7/stable_rank_o_proj": 111.35884857177734, "geo/layer_7/stable_rank_gate_proj": 103.85411071777344, "geo/layer_7/stable_rank_down_proj": 151.07489013671875, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5528954863548279, "geo/layer_7/attn_entropy_mean": 4.651342868804932, "geo/layer_7/attn_entropy_std": 0.8457537293434143, "geo/layer_14/stable_rank_q_proj": 57.91326141357422, "geo/layer_14/stable_rank_k_proj": 34.921112060546875, "geo/layer_14/stable_rank_o_proj": 53.246055603027344, "geo/layer_14/stable_rank_gate_proj": 89.0333023071289, "geo/layer_14/stable_rank_down_proj": 137.35264587402344, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37977394461631775, "geo/layer_14/attn_entropy_mean": 5.473139762878418, "geo/layer_14/attn_entropy_std": 0.5144716501235962, "geo/layer_21/stable_rank_q_proj": 48.507530212402344, "geo/layer_21/stable_rank_k_proj": 32.00570297241211, "geo/layer_21/stable_rank_o_proj": 84.44483184814453, "geo/layer_21/stable_rank_gate_proj": 88.26127624511719, "geo/layer_21/stable_rank_down_proj": 61.411041259765625, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1534581035375595, "geo/layer_21/attn_entropy_mean": 5.761277675628662, "geo/layer_21/attn_entropy_std": 0.29712629318237305, "geo/layer_27/stable_rank_q_proj": 41.58402633666992, "geo/layer_27/stable_rank_k_proj": 31.683074951171875, "geo/layer_27/stable_rank_o_proj": 118.23918151855469, "geo/layer_27/stable_rank_gate_proj": 91.56051635742188, "geo/layer_27/stable_rank_down_proj": 138.16615295410156, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07110478729009628, "geo/layer_27/attn_entropy_mean": 4.367457389831543, "geo/layer_27/attn_entropy_std": 0.5454282760620117, "attnres/final_alpha/block_0": 0.24258387088775635, "attnres/block_norm/0": 1.6064646244049072, "attnres/final_alpha/block_1": 0.007017911411821842, "attnres/block_norm/1": 28414.625, "attnres/final_alpha/block_2": 0.014515606686472893, "attnres/block_norm/2": 20488.94140625, "attnres/final_alpha/block_3": 0.016516564413905144, "attnres/block_norm/3": 28869.1640625, "attnres/final_alpha/block_4": 0.021011075004935265, "attnres/block_norm/4": 9145.5625, "attnres/final_alpha/block_5": 0.5610613822937012, "attnres/block_norm/5": 4964.94384765625, "attnres/final_alpha/block_6": 0.13729363679885864, "attnres/block_norm/6": 19282.39453125, "geo/tier1_time_s": 1.3607001304626465, "geo/step": 19875.0, "geo/rankme_slope": -4.431305725415166e-05} {"step": 19880, "timestamp": 1778216020.1544192, "train/loss": 2.234564685821533, "train/z_loss": 0.0015677438350394368, "train/perplexity": 9.342414081570938, "train/grad_norm": 0.177734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1697331.0296382394, "perf/iters_per_sec": 0.8093505047026822, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2355586290359497, "data/tokens_consumed": 41693478912, "data/tokens_consumed_B": 41.693478912, "train/loss_slope": 4.0074244870317954e-06} {"step": 19890, "timestamp": 1778216030.5339935, "train/loss": 2.313636565208435, "train/z_loss": 0.0015607328969053924, "train/perplexity": 10.111127653183553, "train/grad_norm": 0.16015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021828.8135941587, "perf/iters_per_sec": 0.9640831058474344, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0372549772262574, "data/tokens_consumed": 41714450432, "data/tokens_consumed_B": 41.714450432, "train/loss_slope": 9.135467041157508e-06} {"step": 19900, "timestamp": 1778216040.9008453, "grad/layer_0/attn": 0.0027844442520290613, "grad/layer_0/mlp": 0.002902195556089282, "grad/layer_0/attn_mlp_ratio": 0.9594267864700442, "grad/layer_4/attn": 0.001903619966469705, "grad/layer_4/mlp": 0.0025157013442367315, "grad/layer_4/attn_mlp_ratio": 0.7566955016982418, "grad/layer_8/attn": 0.00512052234262228, "grad/layer_8/mlp": 0.0039282976649701595, "grad/layer_8/attn_mlp_ratio": 1.3034964885512383, "grad/layer_12/attn": 0.004767268430441618, "grad/layer_12/mlp": 0.006763475481420755, "grad/layer_12/attn_mlp_ratio": 0.7048548298950452, "grad/layer_16/attn": 0.004948518238961697, "grad/layer_16/mlp": 0.005452299490571022, "grad/layer_16/attn_mlp_ratio": 0.9076020414431109, "grad/layer_20/attn": 0.004569636192172766, "grad/layer_20/mlp": 0.007484289817512035, "grad/layer_20/attn_mlp_ratio": 0.6105637598939816, "grad/layer_24/attn": 0.021891720592975616, "grad/layer_24/mlp": 0.013794979080557823, "grad/layer_24/attn_mlp_ratio": 1.5869339349079314, "grad/layer_27/attn": 0.006468415725976229, "grad/layer_27/mlp": 0.011846715584397316, "grad/layer_27/attn_mlp_ratio": 0.5460091976796099} {"step": 19900, "timestamp": 1778216040.9165766, "train/loss": 2.1961422085762026, "train/z_loss": 0.001582678989507258, "train/perplexity": 8.990263951107933, "train/grad_norm": 0.2001953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020855.4910866993, "perf/iters_per_sec": 0.9636189895089623, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0377545595169066, "data/tokens_consumed": 41735421952, "data/tokens_consumed_B": 41.735421952, "train/loss_slope": 7.780095767660088e-06} {"step": 19910, "timestamp": 1778216051.2902899, "train/loss": 2.1853646993637086, "train/z_loss": 0.0015848988434299827, "train/perplexity": 8.893891558566763, "train/grad_norm": 0.138671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022744.3240977956, "perf/iters_per_sec": 0.9645196552742937, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0367855072021483, "data/tokens_consumed": 41756393472, "data/tokens_consumed_B": 41.756393472, "train/loss_slope": 4.385343078661344e-06} {"step": 19920, "timestamp": 1778216061.6695166, "train/loss": 2.2045239210128784, "train/z_loss": 0.0015804766095243394, "train/perplexity": 9.065934439405657, "train/grad_norm": 0.1630859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021887.9752146187, "perf/iters_per_sec": 0.9641113163064092, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0372246265411378, "data/tokens_consumed": 41777364992, "data/tokens_consumed_B": 41.777364992, "train/loss_slope": 2.1183864082476086e-06} {"step": 19930, "timestamp": 1778216072.0425293, "train/loss": 2.271953082084656, "train/z_loss": 0.0015680203214287758, "train/perplexity": 9.698323951598214, "train/grad_norm": 0.1376953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022833.8692601288, "perf/iters_per_sec": 0.9645623537350315, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0367396116256713, "data/tokens_consumed": 41798336512, "data/tokens_consumed_B": 41.798336512, "train/loss_slope": 3.4803047932700086e-06} {"step": 19940, "timestamp": 1778216082.9895883, "train/loss": 2.2443845510482787, "train/z_loss": 0.0015572098433040082, "train/perplexity": 9.434607250143673, "train/grad_norm": 0.251953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1916606.6637869934, "perf/iters_per_sec": 0.9139092749533622, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0942005157470702, "data/tokens_consumed": 41819308032, "data/tokens_consumed_B": 41.819308032, "train/loss_slope": 3.5874617339396665e-06} {"step": 19950, "timestamp": 1778216093.3520904, "grad/layer_0/attn": 0.002705757971853018, "grad/layer_0/mlp": 0.002563531044870615, "grad/layer_0/attn_mlp_ratio": 1.0554808266195577, "grad/layer_4/attn": 0.002688826760277152, "grad/layer_4/mlp": 0.002534092403948307, "grad/layer_4/attn_mlp_ratio": 1.0610610133954295, "grad/layer_8/attn": 0.0041381800547242165, "grad/layer_8/mlp": 0.0038447969127446413, "grad/layer_8/attn_mlp_ratio": 1.0763065100724631, "grad/layer_12/attn": 0.004655197728425264, "grad/layer_12/mlp": 0.0062121739611029625, "grad/layer_12/attn_mlp_ratio": 0.749366917706543, "grad/layer_16/attn": 0.004040354397147894, "grad/layer_16/mlp": 0.004672511015087366, "grad/layer_16/attn_mlp_ratio": 0.8647072842912538, "grad/layer_20/attn": 0.011051285080611706, "grad/layer_20/mlp": 0.0059583173133432865, "grad/layer_20/attn_mlp_ratio": 1.8547660881347192, "grad/layer_24/attn": 0.00911804661154747, "grad/layer_24/mlp": 0.009788424707949162, "grad/layer_24/attn_mlp_ratio": 0.9315131689158731, "grad/layer_27/attn": 0.005818455014377832, "grad/layer_27/mlp": 0.008367534726858139, "grad/layer_27/attn_mlp_ratio": 0.6953607167193063} {"step": 19950, "timestamp": 1778216093.9655504, "eos/sharpness": 39.9545669555664, "eos/L0_probe": 2.0508997440338135, "eos/L_plus": 2.289478063583374, "eos/L_minus": 2.211867094039917, "eos/grad_norm": 0.13605065643787384, "eos/embed_grad_frac": 0.13122379779815674, "eos/time_s": 0.6105923652648926} {"step": 19950, "timestamp": 1778216093.9854527, "train/loss": 2.2539003133773803, "train/z_loss": 0.001569613127503544, "train/perplexity": 9.52481323910394, "train/grad_norm": 0.1357421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1907907.6474463416, "perf/iters_per_sec": 0.9097612607223232, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0991894721984863, "data/tokens_consumed": 41840279552, "data/tokens_consumed_B": 41.840279552, "train/loss_slope": 4.1459195362781175e-06} {"step": 19950, "timestamp": 1778216095.3472717, "geo/rankme_last": 441.103759765625, "geo/layer_0/stable_rank_q_proj": 17.046655654907227, "geo/layer_0/stable_rank_k_proj": 15.131101608276367, "geo/layer_0/stable_rank_o_proj": 52.56561279296875, "geo/layer_0/stable_rank_gate_proj": 153.3348846435547, "geo/layer_0/stable_rank_down_proj": 50.20738220214844, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.04618226736783981, "geo/layer_0/attn_entropy_mean": 6.275501251220703, "geo/layer_0/attn_entropy_std": 0.3036597967147827, "geo/layer_7/stable_rank_q_proj": 42.26420211791992, "geo/layer_7/stable_rank_k_proj": 42.1704216003418, "geo/layer_7/stable_rank_o_proj": 111.29198455810547, "geo/layer_7/stable_rank_gate_proj": 103.79933166503906, "geo/layer_7/stable_rank_down_proj": 151.21307373046875, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5610199570655823, "geo/layer_7/attn_entropy_mean": 4.622270584106445, "geo/layer_7/attn_entropy_std": 0.8387449383735657, "geo/layer_14/stable_rank_q_proj": 57.85999298095703, "geo/layer_14/stable_rank_k_proj": 35.030860900878906, "geo/layer_14/stable_rank_o_proj": 53.1695442199707, "geo/layer_14/stable_rank_gate_proj": 89.30384063720703, "geo/layer_14/stable_rank_down_proj": 136.83567810058594, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38783007860183716, "geo/layer_14/attn_entropy_mean": 5.528740882873535, "geo/layer_14/attn_entropy_std": 0.4777885675430298, "geo/layer_21/stable_rank_q_proj": 48.36327362060547, "geo/layer_21/stable_rank_k_proj": 31.916105270385742, "geo/layer_21/stable_rank_o_proj": 84.46461486816406, "geo/layer_21/stable_rank_gate_proj": 88.21449279785156, "geo/layer_21/stable_rank_down_proj": 61.25663757324219, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1533288061618805, "geo/layer_21/attn_entropy_mean": 5.756006240844727, "geo/layer_21/attn_entropy_std": 0.28705668449401855, "geo/layer_27/stable_rank_q_proj": 41.602996826171875, "geo/layer_27/stable_rank_k_proj": 31.669532775878906, "geo/layer_27/stable_rank_o_proj": 118.30998992919922, "geo/layer_27/stable_rank_gate_proj": 91.6820068359375, "geo/layer_27/stable_rank_down_proj": 138.25753784179688, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08126536756753922, "geo/layer_27/attn_entropy_mean": 4.381769180297852, "geo/layer_27/attn_entropy_std": 0.5347000360488892, "attnres/final_alpha/block_0": 0.24287156760692596, "attnres/block_norm/0": 1.6073428392410278, "attnres/final_alpha/block_1": 0.007036205381155014, "attnres/block_norm/1": 28412.923828125, "attnres/final_alpha/block_2": 0.014101613312959671, "attnres/block_norm/2": 20654.515625, "attnres/final_alpha/block_3": 0.016162283718585968, "attnres/block_norm/3": 28829.25, "attnres/final_alpha/block_4": 0.021102173253893852, "attnres/block_norm/4": 9098.8193359375, "attnres/final_alpha/block_5": 0.5629309415817261, "attnres/block_norm/5": 4971.8857421875, "attnres/final_alpha/block_6": 0.1357952356338501, "attnres/block_norm/6": 19445.5546875, "geo/tier1_time_s": 1.3578636646270752, "geo/step": 19950.0, "geo/rankme_slope": -3.1261352197128854e-05} {"step": 19960, "timestamp": 1778216105.7241778, "train/loss": 2.2303300380706785, "train/z_loss": 0.0015719786169938743, "train/perplexity": 9.302935895892304, "train/grad_norm": 0.25, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1787112.0276808455, "perf/iters_per_sec": 0.8521614206699588, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.173486590385437, "data/tokens_consumed": 41861251072, "data/tokens_consumed_B": 41.861251072, "train/loss_slope": 6.469592288418122e-06} {"step": 19970, "timestamp": 1778216116.0970805, "train/loss": 2.26302855014801, "train/z_loss": 0.0015666843974031508, "train/perplexity": 9.612156025881971, "train/grad_norm": 0.228515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022798.0967612115, "perf/iters_per_sec": 0.964545296078306, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0367579460144043, "data/tokens_consumed": 41882222592, "data/tokens_consumed_B": 41.882222592, "train/loss_slope": 8.983500805696526e-06} {"step": 19980, "timestamp": 1778216126.4803524, "train/loss": 2.1949538946151734, "train/z_loss": 0.001581959903705865, "train/perplexity": 8.97958703995918, "train/grad_norm": 0.1943359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020749.2692281988, "perf/iters_per_sec": 0.9635683389798159, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0378091096878053, "data/tokens_consumed": 41903194112, "data/tokens_consumed_B": 41.903194112, "train/loss_slope": 6.11935552447682e-06} {"step": 19990, "timestamp": 1778216136.8568878, "train/loss": 2.2511304378509522, "train/z_loss": 0.0015552667318843305, "train/perplexity": 9.498467196492834, "train/grad_norm": 0.134765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022093.0460569349, "perf/iters_per_sec": 0.9642091017040896, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0371194362640381, "data/tokens_consumed": 41924165632, "data/tokens_consumed_B": 41.924165632, "train/loss_slope": 7.109751881617432e-06} {"step": 20000, "timestamp": 1778216147.219473, "grad/layer_0/attn": 0.003262358484789729, "grad/layer_0/mlp": 0.002900734543800354, "grad/layer_0/attn_mlp_ratio": 1.124666295058137, "grad/layer_4/attn": 0.002403303049504757, "grad/layer_4/mlp": 0.002563805552199483, "grad/layer_4/attn_mlp_ratio": 0.9373967357638707, "grad/layer_8/attn": 0.00454699108377099, "grad/layer_8/mlp": 0.003794766031205654, "grad/layer_8/attn_mlp_ratio": 1.1982269595956214, "grad/layer_12/attn": 0.004207076504826546, "grad/layer_12/mlp": 0.006298577878624201, "grad/layer_12/attn_mlp_ratio": 0.6679406874225122, "grad/layer_16/attn": 0.006161221768707037, "grad/layer_16/mlp": 0.00445488141849637, "grad/layer_16/attn_mlp_ratio": 1.3830270778529252, "grad/layer_20/attn": 0.0042253644205629826, "grad/layer_20/mlp": 0.005569320172071457, "grad/layer_20/attn_mlp_ratio": 0.7586858385128205, "grad/layer_24/attn": 0.00960011500865221, "grad/layer_24/mlp": 0.010476230643689632, "grad/layer_24/attn_mlp_ratio": 0.9163710921921846, "grad/layer_27/attn": 0.005590288899838924, "grad/layer_27/mlp": 0.009518132545053959, "grad/layer_27/attn_mlp_ratio": 0.5873304258628803} {"step": 20000, "timestamp": 1778216147.2352908, "train/loss": 2.2445355892181396, "train/z_loss": 0.0015615236363373697, "train/perplexity": 9.436032343575132, "train/grad_norm": 0.150390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021804.1368323078, "perf/iters_per_sec": 0.9640713390504397, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0372676372528076, "data/tokens_consumed": 41945137152, "data/tokens_consumed_B": 41.945137152, "train/loss_slope": 7.302640681148025e-06} {"step": 20000, "timestamp": 1778216154.229702, "geo/ww_alpha_mean": 7.964647388252781, "geo/ww_alpha_std": 4.6061365591864805, "geo/ww_alpha_min": 2.624574308719146, "geo/ww_alpha_max": 30.339977425932737, "geo/ww_alpha_healthy_frac": 0.16751269035532995, "geo/ww_alpha_by_type/q_proj": 4.197956634131823, "geo/ww_alpha_by_type/k_proj": 5.270183276319884, "geo/ww_alpha_by_type/v_proj": 7.826419834871323, "geo/ww_alpha_by_type/o_proj": 8.09222651482524, "geo/ww_alpha_by_type/gate_proj": 9.19881947137492, "geo/ww_alpha_by_type/up_proj": 12.400166250151745, "geo/ww_alpha_by_type/down_proj": 8.889852795490489, "geo/twonn_id/layer_0": 0.7488213181495667, "geo/twonn_id/layer_7": 2.9929466247558594, "geo/twonn_id/layer_14": 4.187983989715576, "geo/twonn_id/layer_21": 7.65036678314209, "geo/twonn_id/layer_27": 6.366107940673828, "geo/tier2_time_s": 6.985935211181641} {"step": 20000, "timestamp": 1778216154.8719373, "eoc/jacobian_sigma/layer_0/attn": 786.74365234375, "eoc/jacobian_sigma/layer_0/mlp": 5674.74169921875, "eoc/jacobian_sigma/layer_0": 5674.74169921875, "eoc/jacobian_sigma/layer_7/attn": 1.1593081951141357, "eoc/jacobian_sigma/layer_7/mlp": 1.6514077186584473, "eoc/jacobian_sigma/layer_7": 1.6514077186584473, "eoc/jacobian_sigma/layer_14/attn": 1.5807751417160034, "eoc/jacobian_sigma/layer_14/mlp": 8.512560844421387, "eoc/jacobian_sigma/layer_14": 8.512560844421387, "eoc/jacobian_sigma/layer_21/attn": 1.0778206586837769, "eoc/jacobian_sigma/layer_21/mlp": 4.151605129241943, "eoc/jacobian_sigma/layer_21": 4.151605129241943, "eoc/jacobian_sigma/layer_27/attn": 3.8389089107513428, "eoc/jacobian_sigma/layer_27/mlp": 23.167675018310547, "eoc/jacobian_sigma/layer_27": 23.167675018310547, "eoc/layer0_sigma": 5674.74169921875, "eoc/sigma_max": 23.167675018310547, "eoc/sigma_min": 1.6514077186584473, "eoc/sigma_mean": 9.370812177658081, "eoc/time_s": 0.6353435516357422} {"step": 20010, "timestamp": 1778216165.267412, "train/loss": 2.227425229549408, "train/z_loss": 0.0015707043814472855, "train/perplexity": 9.275951859133118, "train/grad_norm": 0.2119140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1163460.1842313635, "perf/iters_per_sec": 0.5547810479313676, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.8025129079818725, "data/tokens_consumed": 41966108672, "data/tokens_consumed_B": 41.966108672, "train/loss_slope": 6.382074378969575e-06} {"step": 20020, "timestamp": 1778216175.6460834, "train/loss": 2.240108275413513, "train/z_loss": 0.00157578659709543, "train/perplexity": 9.39434840933585, "train/grad_norm": 0.1318359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021773.4195202827, "perf/iters_per_sec": 0.964056691894666, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0372833967208863, "data/tokens_consumed": 41987080192, "data/tokens_consumed_B": 41.987080192, "train/loss_slope": 7.537916934851429e-06} {"step": 20025, "timestamp": 1778216181.4488807, "eos/sharpness": 72.85096645355223, "eos/L0_probe": 2.0534117221832275, "eos/L_plus": 2.5541484355926514, "eos/L_minus": 2.281184673309326, "eos/grad_norm": 0.27739468216896057, "eos/embed_grad_frac": 0.031947918236255646, "eos/time_s": 0.6227455139160156} {"step": 20025, "timestamp": 1778216182.8312123, "geo/rankme_last": 440.54925537109375, "geo/layer_0/stable_rank_q_proj": 17.05381202697754, "geo/layer_0/stable_rank_k_proj": 15.095488548278809, "geo/layer_0/stable_rank_o_proj": 52.438568115234375, "geo/layer_0/stable_rank_gate_proj": 153.1402130126953, "geo/layer_0/stable_rank_down_proj": 50.1067008972168, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05236507207155228, "geo/layer_0/attn_entropy_mean": 6.274075508117676, "geo/layer_0/attn_entropy_std": 0.3075132668018341, "geo/layer_7/stable_rank_q_proj": 42.22559356689453, "geo/layer_7/stable_rank_k_proj": 42.026065826416016, "geo/layer_7/stable_rank_o_proj": 111.05156707763672, "geo/layer_7/stable_rank_gate_proj": 103.8440933227539, "geo/layer_7/stable_rank_down_proj": 151.36190795898438, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5550441741943359, "geo/layer_7/attn_entropy_mean": 4.661471366882324, "geo/layer_7/attn_entropy_std": 0.858361542224884, "geo/layer_14/stable_rank_q_proj": 58.06120300292969, "geo/layer_14/stable_rank_k_proj": 35.17015838623047, "geo/layer_14/stable_rank_o_proj": 53.23626708984375, "geo/layer_14/stable_rank_gate_proj": 88.90249633789062, "geo/layer_14/stable_rank_down_proj": 136.8312225341797, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3820033371448517, "geo/layer_14/attn_entropy_mean": 5.513910293579102, "geo/layer_14/attn_entropy_std": 0.48657849431037903, "geo/layer_21/stable_rank_q_proj": 48.30864715576172, "geo/layer_21/stable_rank_k_proj": 31.75619125366211, "geo/layer_21/stable_rank_o_proj": 84.5625991821289, "geo/layer_21/stable_rank_gate_proj": 88.08894348144531, "geo/layer_21/stable_rank_down_proj": 61.19804000854492, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15537884831428528, "geo/layer_21/attn_entropy_mean": 5.751121520996094, "geo/layer_21/attn_entropy_std": 0.28309088945388794, "geo/layer_27/stable_rank_q_proj": 41.5482292175293, "geo/layer_27/stable_rank_k_proj": 31.730632781982422, "geo/layer_27/stable_rank_o_proj": 118.19207000732422, "geo/layer_27/stable_rank_gate_proj": 91.54313659667969, "geo/layer_27/stable_rank_down_proj": 138.65223693847656, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07469284534454346, "geo/layer_27/attn_entropy_mean": 4.39044713973999, "geo/layer_27/attn_entropy_std": 0.5341864824295044, "attnres/final_alpha/block_0": 0.24055251479148865, "attnres/block_norm/0": 1.6081597805023193, "attnres/final_alpha/block_1": 0.006857411935925484, "attnres/block_norm/1": 28711.779296875, "attnres/final_alpha/block_2": 0.014093160629272461, "attnres/block_norm/2": 20622.76953125, "attnres/final_alpha/block_3": 0.016357075423002243, "attnres/block_norm/3": 28824.224609375, "attnres/final_alpha/block_4": 0.020573526620864868, "attnres/block_norm/4": 9161.994140625, "attnres/final_alpha/block_5": 0.5668975710868835, "attnres/block_norm/5": 4958.779296875, "attnres/final_alpha/block_6": 0.1346687227487564, "attnres/block_norm/6": 19512.76171875, "geo/tier1_time_s": 1.3612418174743652, "geo/step": 20025.0, "geo/rankme_slope": -5.633610866221489e-05} {"step": 20030, "timestamp": 1778216188.020588, "train/loss": 2.2721640825271607, "train/z_loss": 0.00156421564752236, "train/perplexity": 9.700370518149189, "train/grad_norm": 0.09619140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1695565.7438752432, "perf/iters_per_sec": 0.8085087508560387, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2368449926376344, "data/tokens_consumed": 42008051712, "data/tokens_consumed_B": 42.008051712, "train/loss_slope": 8.924814118946968e-06} {"step": 20040, "timestamp": 1778216198.4046373, "train/loss": 2.229038190841675, "train/z_loss": 0.00157096975017339, "train/perplexity": 9.290925683283827, "train/grad_norm": 0.16796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020674.6234845992, "perf/iters_per_sec": 0.9635327451155659, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0378474473953248, "data/tokens_consumed": 42029023232, "data/tokens_consumed_B": 42.029023232, "train/loss_slope": 5.144447025650441e-06} {"step": 20050, "timestamp": 1778216208.7751021, "grad/layer_0/attn": 0.0025263067800551653, "grad/layer_0/mlp": 0.0025474405847489834, "grad/layer_0/attn_mlp_ratio": 0.9917038677994182, "grad/layer_4/attn": 0.002581625711172819, "grad/layer_4/mlp": 0.002571712713688612, "grad/layer_4/attn_mlp_ratio": 1.0038545896071456, "grad/layer_8/attn": 0.006578715983778238, "grad/layer_8/mlp": 0.00390348001383245, "grad/layer_8/attn_mlp_ratio": 1.685346355542012, "grad/layer_12/attn": 0.00466505903750658, "grad/layer_12/mlp": 0.006248080637305975, "grad/layer_12/attn_mlp_ratio": 0.7466387253372856, "grad/layer_16/attn": 0.004194389097392559, "grad/layer_16/mlp": 0.0047695995308458805, "grad/layer_16/attn_mlp_ratio": 0.8794006671475466, "grad/layer_20/attn": 0.005839209537953138, "grad/layer_20/mlp": 0.006371723487973213, "grad/layer_20/attn_mlp_ratio": 0.9164254314130625, "grad/layer_24/attn": 0.01841411553323269, "grad/layer_24/mlp": 0.012113534845411777, "grad/layer_24/attn_mlp_ratio": 1.5201273299836702, "grad/layer_27/attn": 0.007997444830834866, "grad/layer_27/mlp": 0.013079219497740269, "grad/layer_27/attn_mlp_ratio": 0.6114619279132377} {"step": 20050, "timestamp": 1778216208.7908888, "train/loss": 2.25342481136322, "train/z_loss": 0.0015543714282102883, "train/perplexity": 9.520285247844043, "train/grad_norm": 0.244140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020268.5321464252, "perf/iters_per_sec": 0.9633391056759001, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0380560636520386, "data/tokens_consumed": 42049994752, "data/tokens_consumed_B": 42.049994752, "train/loss_slope": 4.171954966721637e-06} {"step": 20060, "timestamp": 1778216219.173736, "train/loss": 2.207924795150757, "train/z_loss": 0.001565357600338757, "train/perplexity": 9.096819028909739, "train/grad_norm": 0.12109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021006.9043924038, "perf/iters_per_sec": 0.9636911889993686, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0376768112182617, "data/tokens_consumed": 42070966272, "data/tokens_consumed_B": 42.070966272, "train/loss_slope": 2.098349454772758e-06} {"step": 20070, "timestamp": 1778216229.5649521, "train/loss": 2.180046010017395, "train/z_loss": 0.0015785286435857416, "train/perplexity": 8.846713286789338, "train/grad_norm": 0.1474609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019324.3861575904, "perf/iters_per_sec": 0.9628889017856552, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0385414123535157, "data/tokens_consumed": 42091937792, "data/tokens_consumed_B": 42.091937792, "train/loss_slope": 1.7293908880023813e-06} {"step": 20080, "timestamp": 1778216239.9449522, "train/loss": 2.246349048614502, "train/z_loss": 0.0015570204122923315, "train/perplexity": 9.453159730309364, "train/grad_norm": 0.10546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021397.4056084498, "perf/iters_per_sec": 0.9638773944895028, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0374763488769532, "data/tokens_consumed": 42112909312, "data/tokens_consumed_B": 42.112909312, "train/loss_slope": 3.3624044024046834e-06} {"step": 20090, "timestamp": 1778216250.3241267, "train/loss": 2.1987616062164306, "train/z_loss": 0.0015723481425084173, "train/perplexity": 9.013843896430746, "train/grad_norm": 0.1953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021549.0859741531, "perf/iters_per_sec": 0.96394972132404, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373985052108765, "data/tokens_consumed": 42133880832, "data/tokens_consumed_B": 42.133880832, "train/loss_slope": 3.970372666119476e-06} {"step": 20100, "timestamp": 1778216260.7017622, "grad/layer_0/attn": 0.0028902103658765554, "grad/layer_0/mlp": 0.002671669702976942, "grad/layer_0/attn_mlp_ratio": 1.0817992413044837, "grad/layer_4/attn": 0.001932590501382947, "grad/layer_4/mlp": 0.002496609464287758, "grad/layer_4/attn_mlp_ratio": 0.7740859960753549, "grad/layer_8/attn": 0.006845372263342142, "grad/layer_8/mlp": 0.003867693478241563, "grad/layer_8/attn_mlp_ratio": 1.7698848486478014, "grad/layer_12/attn": 0.004508958663791418, "grad/layer_12/mlp": 0.006820579990744591, "grad/layer_12/attn_mlp_ratio": 0.6610814041917047, "grad/layer_16/attn": 0.004139619413763285, "grad/layer_16/mlp": 0.004500926937907934, "grad/layer_16/attn_mlp_ratio": 0.9197259539864509, "grad/layer_20/attn": 0.004112990573048592, "grad/layer_20/mlp": 0.005826323293149471, "grad/layer_20/attn_mlp_ratio": 0.7059324200720822, "grad/layer_24/attn": 0.009020182304084301, "grad/layer_24/mlp": 0.009107555262744427, "grad/layer_24/attn_mlp_ratio": 0.9904065300533296, "grad/layer_27/attn": 0.005585336126387119, "grad/layer_27/mlp": 0.008402407169342041, "grad/layer_27/attn_mlp_ratio": 0.6647304691795171} {"step": 20100, "timestamp": 1778216261.3239603, "eos/sharpness": 34.446859359741204, "eos/L0_probe": 2.0571987628936768, "eos/L_plus": 2.202803373336792, "eos/L_minus": 2.2560627460479736, "eos/grad_norm": 0.12785544991493225, "eos/embed_grad_frac": 0.17876514792442322, "eos/time_s": 0.6193387508392334} {"step": 20100, "timestamp": 1778216261.345848, "train/loss": 2.2084285497665403, "train/z_loss": 0.0015768749522976576, "train/perplexity": 9.101402747922368, "train/grad_norm": 0.1279296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1903653.9298952876, "perf/iters_per_sec": 0.9077329301334799, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1016456127166747, "data/tokens_consumed": 42154852352, "data/tokens_consumed_B": 42.154852352, "train/loss_slope": 2.8965382614616443e-06} {"step": 20100, "timestamp": 1778216262.7099707, "geo/rankme_last": 441.04339599609375, "geo/layer_0/stable_rank_q_proj": 17.06187629699707, "geo/layer_0/stable_rank_k_proj": 15.070817947387695, "geo/layer_0/stable_rank_o_proj": 52.51941680908203, "geo/layer_0/stable_rank_gate_proj": 153.14796447753906, "geo/layer_0/stable_rank_down_proj": 50.08290481567383, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05057084560394287, "geo/layer_0/attn_entropy_mean": 6.268377304077148, "geo/layer_0/attn_entropy_std": 0.3057689964771271, "geo/layer_7/stable_rank_q_proj": 42.178340911865234, "geo/layer_7/stable_rank_k_proj": 42.056209564208984, "geo/layer_7/stable_rank_o_proj": 110.72545623779297, "geo/layer_7/stable_rank_gate_proj": 103.97785949707031, "geo/layer_7/stable_rank_down_proj": 151.5740509033203, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5602971911430359, "geo/layer_7/attn_entropy_mean": 4.664778709411621, "geo/layer_7/attn_entropy_std": 0.885081946849823, "geo/layer_14/stable_rank_q_proj": 58.060482025146484, "geo/layer_14/stable_rank_k_proj": 35.08586120605469, "geo/layer_14/stable_rank_o_proj": 53.457332611083984, "geo/layer_14/stable_rank_gate_proj": 88.75275421142578, "geo/layer_14/stable_rank_down_proj": 136.67269897460938, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3724684715270996, "geo/layer_14/attn_entropy_mean": 5.4607930183410645, "geo/layer_14/attn_entropy_std": 0.5138970613479614, "geo/layer_21/stable_rank_q_proj": 48.230079650878906, "geo/layer_21/stable_rank_k_proj": 31.78647804260254, "geo/layer_21/stable_rank_o_proj": 84.45492553710938, "geo/layer_21/stable_rank_gate_proj": 87.92993927001953, "geo/layer_21/stable_rank_down_proj": 61.15951919555664, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15594959259033203, "geo/layer_21/attn_entropy_mean": 5.7303667068481445, "geo/layer_21/attn_entropy_std": 0.2991988956928253, "geo/layer_27/stable_rank_q_proj": 41.49949264526367, "geo/layer_27/stable_rank_k_proj": 31.685791015625, "geo/layer_27/stable_rank_o_proj": 117.98729705810547, "geo/layer_27/stable_rank_gate_proj": 91.51219177246094, "geo/layer_27/stable_rank_down_proj": 138.53466796875, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07887130975723267, "geo/layer_27/attn_entropy_mean": 4.377954483032227, "geo/layer_27/attn_entropy_std": 0.554236114025116, "attnres/final_alpha/block_0": 0.2427436113357544, "attnres/block_norm/0": 1.6090980768203735, "attnres/final_alpha/block_1": 0.006732206791639328, "attnres/block_norm/1": 28648.021484375, "attnres/final_alpha/block_2": 0.01464095525443554, "attnres/block_norm/2": 20624.3359375, "attnres/final_alpha/block_3": 0.01640957221388817, "attnres/block_norm/3": 29316.6171875, "attnres/final_alpha/block_4": 0.020904380828142166, "attnres/block_norm/4": 9171.20703125, "attnres/final_alpha/block_5": 0.5617193579673767, "attnres/block_norm/5": 4974.5673828125, "attnres/final_alpha/block_6": 0.13684991002082825, "attnres/block_norm/6": 19521.5859375, "geo/tier1_time_s": 1.3594942092895508, "geo/step": 20100.0, "geo/rankme_slope": -5.5778424651110446e-05} {"step": 20110, "timestamp": 1778216273.4223585, "train/loss": 2.183604431152344, "train/z_loss": 0.0015951611334457994, "train/perplexity": 8.87824969495745, "train/grad_norm": 0.1044921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1737214.0437789503, "perf/iters_per_sec": 0.8283682078261139, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.207192635536194, "data/tokens_consumed": 42175823872, "data/tokens_consumed_B": 42.175823872, "train/loss_slope": 7.956607399707968e-07} {"step": 20120, "timestamp": 1778216283.8071985, "train/loss": 2.2644443273544312, "train/z_loss": 0.0015598185826092958, "train/perplexity": 9.62577433525924, "train/grad_norm": 0.236328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020801.960930932, "perf/iters_per_sec": 0.9635934643416081, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0377820491790772, "data/tokens_consumed": 42196795392, "data/tokens_consumed_B": 42.196795392, "train/loss_slope": 1.1372724453536814e-06} {"step": 20130, "timestamp": 1778216294.6910615, "train/loss": 2.2335711240768434, "train/z_loss": 0.0015678832540288567, "train/perplexity": 9.333136426061452, "train/grad_norm": 0.1435546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1927794.688049912, "perf/iters_per_sec": 0.9192441406487999, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.087850284576416, "data/tokens_consumed": 42217766912, "data/tokens_consumed_B": 42.217766912, "train/loss_slope": 4.5623982592885344e-06} {"step": 20140, "timestamp": 1778216305.0440822, "train/loss": 2.2163301706314087, "train/z_loss": 0.0015745821059681476, "train/perplexity": 9.173603457433721, "train/grad_norm": 0.1328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026712.8393038781, "perf/iters_per_sec": 0.966411990787448, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347553730010985, "data/tokens_consumed": 42238738432, "data/tokens_consumed_B": 42.238738432, "train/loss_slope": 3.3439952548187704e-06} {"step": 20150, "timestamp": 1778216315.3833263, "grad/layer_0/attn": 0.0024966690689325333, "grad/layer_0/mlp": 0.002530359663069248, "grad/layer_0/attn_mlp_ratio": 0.9866854134228529, "grad/layer_4/attn": 0.0020900473464280367, "grad/layer_4/mlp": 0.002437193877995014, "grad/layer_4/attn_mlp_ratio": 0.8575629864913085, "grad/layer_8/attn": 0.007513033226132393, "grad/layer_8/mlp": 0.003689374541863799, "grad/layer_8/attn_mlp_ratio": 2.036397480722358, "grad/layer_12/attn": 0.0037686550058424473, "grad/layer_12/mlp": 0.005863322876393795, "grad/layer_12/attn_mlp_ratio": 0.6427507099669169, "grad/layer_16/attn": 0.003867675783112645, "grad/layer_16/mlp": 0.0044025941751897335, "grad/layer_16/attn_mlp_ratio": 0.8784992532490323, "grad/layer_20/attn": 0.00479890638962388, "grad/layer_20/mlp": 0.0055396463721990585, "grad/layer_20/attn_mlp_ratio": 0.8662838709486946, "grad/layer_24/attn": 0.007509702816605568, "grad/layer_24/mlp": 0.008799046277999878, "grad/layer_24/attn_mlp_ratio": 0.8534678070775902, "grad/layer_27/attn": 0.004006428178399801, "grad/layer_27/mlp": 0.007394838612526655, "grad/layer_27/attn_mlp_ratio": 0.5417870942354728} {"step": 20150, "timestamp": 1778216315.3990946, "train/loss": 2.1925416469573973, "train/z_loss": 0.0015790994628332555, "train/perplexity": 8.957952156993125, "train/grad_norm": 0.1083984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026412.758222464, "perf/iters_per_sec": 0.966268900977356, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349086046218872, "data/tokens_consumed": 42259709952, "data/tokens_consumed_B": 42.259709952, "train/loss_slope": 5.394953777222813e-07} {"step": 20160, "timestamp": 1778216325.7674518, "train/loss": 2.223146939277649, "train/z_loss": 0.001574197260197252, "train/perplexity": 9.236351416031212, "train/grad_norm": 0.244140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023762.1635730574, "perf/iters_per_sec": 0.9650049989571845, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0362640619277954, "data/tokens_consumed": 42280681472, "data/tokens_consumed_B": 42.280681472, "train/loss_slope": -2.071284332183888e-07} {"step": 20170, "timestamp": 1778216336.628775, "train/loss": 2.2682973861694338, "train/z_loss": 0.0015606721281073987, "train/perplexity": 9.662934554193939, "train/grad_norm": 0.3359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1932397.5193621065, "perf/iters_per_sec": 0.9214389416513951, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0852591037750243, "data/tokens_consumed": 42301652992, "data/tokens_consumed_B": 42.301652992, "train/loss_slope": 1.9688155033765025e-06} {"step": 20175, "timestamp": 1778216342.4121127, "eos/sharpness": 41.94445610046386, "eos/L0_probe": 2.0482168197631836, "eos/L_plus": 2.2468676567077637, "eos/L_minus": 2.269010543823242, "eos/grad_norm": 0.16143126785755157, "eos/embed_grad_frac": 0.08855819702148438, "eos/time_s": 0.6119201183319092} {"step": 20175, "timestamp": 1778216343.7917268, "geo/rankme_last": 441.193359375, "geo/layer_0/stable_rank_q_proj": 17.104310989379883, "geo/layer_0/stable_rank_k_proj": 15.13540267944336, "geo/layer_0/stable_rank_o_proj": 52.49110412597656, "geo/layer_0/stable_rank_gate_proj": 153.020751953125, "geo/layer_0/stable_rank_down_proj": 50.09986114501953, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.04427161440253258, "geo/layer_0/attn_entropy_mean": 6.270149230957031, "geo/layer_0/attn_entropy_std": 0.3115236163139343, "geo/layer_7/stable_rank_q_proj": 42.18634796142578, "geo/layer_7/stable_rank_k_proj": 42.14897537231445, "geo/layer_7/stable_rank_o_proj": 110.70172882080078, "geo/layer_7/stable_rank_gate_proj": 103.95075225830078, "geo/layer_7/stable_rank_down_proj": 151.7207794189453, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5446239113807678, "geo/layer_7/attn_entropy_mean": 4.661956787109375, "geo/layer_7/attn_entropy_std": 0.8504570722579956, "geo/layer_14/stable_rank_q_proj": 57.926937103271484, "geo/layer_14/stable_rank_k_proj": 35.11337661743164, "geo/layer_14/stable_rank_o_proj": 53.59745788574219, "geo/layer_14/stable_rank_gate_proj": 88.4832534790039, "geo/layer_14/stable_rank_down_proj": 136.86727905273438, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37750130891799927, "geo/layer_14/attn_entropy_mean": 5.503767013549805, "geo/layer_14/attn_entropy_std": 0.49083444476127625, "geo/layer_21/stable_rank_q_proj": 48.265865325927734, "geo/layer_21/stable_rank_k_proj": 31.83694076538086, "geo/layer_21/stable_rank_o_proj": 84.39518737792969, "geo/layer_21/stable_rank_gate_proj": 88.06426239013672, "geo/layer_21/stable_rank_down_proj": 61.039371490478516, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15405786037445068, "geo/layer_21/attn_entropy_mean": 5.765631675720215, "geo/layer_21/attn_entropy_std": 0.2963807284832001, "geo/layer_27/stable_rank_q_proj": 41.531620025634766, "geo/layer_27/stable_rank_k_proj": 31.768190383911133, "geo/layer_27/stable_rank_o_proj": 117.99302673339844, "geo/layer_27/stable_rank_gate_proj": 91.3750991821289, "geo/layer_27/stable_rank_down_proj": 138.3586883544922, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07342210412025452, "geo/layer_27/attn_entropy_mean": 4.404098033905029, "geo/layer_27/attn_entropy_std": 0.5496456027030945, "attnres/final_alpha/block_0": 0.24392777681350708, "attnres/block_norm/0": 1.6098475456237793, "attnres/final_alpha/block_1": 0.006967406254261732, "attnres/block_norm/1": 28743.359375, "attnres/final_alpha/block_2": 0.01444297842681408, "attnres/block_norm/2": 20659.80859375, "attnres/final_alpha/block_3": 0.016125444322824478, "attnres/block_norm/3": 29037.23828125, "attnres/final_alpha/block_4": 0.02110644429922104, "attnres/block_norm/4": 9188.89453125, "attnres/final_alpha/block_5": 0.5580740571022034, "attnres/block_norm/5": 4979.26025390625, "attnres/final_alpha/block_6": 0.13935591280460358, "attnres/block_norm/6": 19580.375, "geo/tier1_time_s": 1.3589861392974854, "geo/step": 20175.0, "geo/rankme_slope": -4.582203975340136e-05} {"step": 20180, "timestamp": 1778216348.9714804, "train/loss": 2.2511397123336794, "train/z_loss": 0.0015693176072090864, "train/perplexity": 9.498555290271295, "train/grad_norm": 0.16796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1700273.2590530282, "perf/iters_per_sec": 0.8107534690156117, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2334205627441406, "data/tokens_consumed": 42322624512, "data/tokens_consumed_B": 42.322624512, "train/loss_slope": 2.845995439769205e-06} {"step": 20190, "timestamp": 1778216359.323536, "train/loss": 2.16432638168335, "train/z_loss": 0.0015819123829714954, "train/perplexity": 8.70873357810987, "train/grad_norm": 0.10595703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026846.1697420552, "perf/iters_per_sec": 0.9664755676946903, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034687304496765, "data/tokens_consumed": 42343596032, "data/tokens_consumed_B": 42.343596032, "train/loss_slope": -2.6948276740666225e-07} {"step": 20200, "timestamp": 1778216370.0647166, "grad/layer_0/attn": 0.002882883884012699, "grad/layer_0/mlp": 0.002573731355369091, "grad/layer_0/attn_mlp_ratio": 1.1201183705466558, "grad/layer_4/attn": 0.0016118079656735063, "grad/layer_4/mlp": 0.0025352982338517904, "grad/layer_4/attn_mlp_ratio": 0.6357468642456537, "grad/layer_8/attn": 0.004521918948739767, "grad/layer_8/mlp": 0.0037147621624171734, "grad/layer_8/attn_mlp_ratio": 1.2172835377619513, "grad/layer_12/attn": 0.0046266536228358746, "grad/layer_12/mlp": 0.006081290077418089, "grad/layer_12/attn_mlp_ratio": 0.7608013246952469, "grad/layer_16/attn": 0.003972688224166632, "grad/layer_16/mlp": 0.0048520248383283615, "grad/layer_16/attn_mlp_ratio": 0.8187691272533562, "grad/layer_20/attn": 0.00585961202159524, "grad/layer_20/mlp": 0.0062994095496833324, "grad/layer_20/attn_mlp_ratio": 0.9301843105075408, "grad/layer_24/attn": 0.01358142402023077, "grad/layer_24/mlp": 0.009244422428309917, "grad/layer_24/attn_mlp_ratio": 1.4691479082267513, "grad/layer_27/attn": 0.008793521672487259, "grad/layer_27/mlp": 0.008005142211914062, "grad/layer_27/attn_mlp_ratio": 1.0984841155665466} {"step": 20200, "timestamp": 1778216370.080629, "train/loss": 2.2261773109436036, "train/z_loss": 0.0015761870774440468, "train/perplexity": 9.264383445941881, "train/grad_norm": 0.11865234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1950519.468268648, "perf/iters_per_sec": 0.9300801602690926, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0751761436462401, "data/tokens_consumed": 42364567552, "data/tokens_consumed_B": 42.364567552, "train/loss_slope": -2.3904760928970927e-07} {"step": 20210, "timestamp": 1778216380.4305108, "train/loss": 2.225286340713501, "train/z_loss": 0.0015649717533960938, "train/perplexity": 9.256132832162718, "train/grad_norm": 0.314453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027683.9608593823, "perf/iters_per_sec": 0.9668750576302444, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342597961425781, "data/tokens_consumed": 42385539072, "data/tokens_consumed_B": 42.385539072, "train/loss_slope": -4.657199852751978e-06} {"step": 20220, "timestamp": 1778216390.787605, "train/loss": 2.1684629678726197, "train/z_loss": 0.0015764499665237963, "train/perplexity": 8.744832617072875, "train/grad_norm": 0.1005859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025950.7875150882, "perf/iters_per_sec": 0.9660486161780778, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351445913314818, "data/tokens_consumed": 42406510592, "data/tokens_consumed_B": 42.406510592, "train/loss_slope": -1.1566906106961983e-05} {"step": 20230, "timestamp": 1778216401.1467364, "train/loss": 2.186722493171692, "train/z_loss": 0.0015785683877766131, "train/perplexity": 8.90597583157284, "train/grad_norm": 0.1630859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025529.606076193, "perf/iters_per_sec": 0.9658477812176671, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035359835624695, "data/tokens_consumed": 42427482112, "data/tokens_consumed_B": 42.427482112, "train/loss_slope": -1.1132186545719047e-05} {"step": 20240, "timestamp": 1778216411.5074625, "train/loss": 2.2536476612091065, "train/z_loss": 0.0015659742057323456, "train/perplexity": 9.522407078360342, "train/grad_norm": 0.09765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025828.1192397743, "perf/iters_per_sec": 0.9659901233862754, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352072715759277, "data/tokens_consumed": 42448453632, "data/tokens_consumed_B": 42.448453632, "train/loss_slope": -1.3588541104133259e-05} {"step": 20250, "timestamp": 1778216421.8486385, "grad/layer_0/attn": 0.003133450634777546, "grad/layer_0/mlp": 0.002998054726049304, "grad/layer_0/attn_mlp_ratio": 1.0451612183846084, "grad/layer_4/attn": 0.002261048648506403, "grad/layer_4/mlp": 0.00257245940156281, "grad/layer_4/attn_mlp_ratio": 0.8789443126831656, "grad/layer_8/attn": 0.009982169605791569, "grad/layer_8/mlp": 0.0038800062611699104, "grad/layer_8/attn_mlp_ratio": 2.57271990728946, "grad/layer_12/attn": 0.0041773212142288685, "grad/layer_12/mlp": 0.006370243616402149, "grad/layer_12/attn_mlp_ratio": 0.6557553211775983, "grad/layer_16/attn": 0.003973088692873716, "grad/layer_16/mlp": 0.004725501406937838, "grad/layer_16/attn_mlp_ratio": 0.8407760926624505, "grad/layer_20/attn": 0.009616773575544357, "grad/layer_20/mlp": 0.008120700716972351, "grad/layer_20/attn_mlp_ratio": 1.1842295132268879, "grad/layer_24/attn": 0.026166075840592384, "grad/layer_24/mlp": 0.014498957432806492, "grad/layer_24/attn_mlp_ratio": 1.804686701191237, "grad/layer_27/attn": 0.007136785890907049, "grad/layer_27/mlp": 0.012152563780546188, "grad/layer_27/attn_mlp_ratio": 0.5872658610197975} {"step": 20250, "timestamp": 1778216422.4630804, "eos/sharpness": 60.06510257720946, "eos/L0_probe": 2.050410032272339, "eos/L_plus": 2.4456946849823, "eos/L_minus": 2.2557764053344727, "eos/grad_norm": 0.26544129848480225, "eos/embed_grad_frac": 0.03613096848130226, "eos/time_s": 0.6114795207977295} {"step": 20250, "timestamp": 1778216422.4835176, "train/loss": 2.2382628202438353, "train/z_loss": 0.0015799726825207472, "train/perplexity": 9.377027547850583, "train/grad_norm": 0.265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1911978.2064195143, "perf/iters_per_sec": 0.9117022544953891, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0968493223190308, "data/tokens_consumed": 42469425152, "data/tokens_consumed_B": 42.469425152, "train/loss_slope": -1.3015630829631089e-05} {"step": 20250, "timestamp": 1778216423.8466883, "geo/rankme_last": 441.14581298828125, "geo/layer_0/stable_rank_q_proj": 17.1441650390625, "geo/layer_0/stable_rank_k_proj": 15.173677444458008, "geo/layer_0/stable_rank_o_proj": 52.330467224121094, "geo/layer_0/stable_rank_gate_proj": 153.15379333496094, "geo/layer_0/stable_rank_down_proj": 50.044063568115234, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.04746237397193909, "geo/layer_0/attn_entropy_mean": 6.270867347717285, "geo/layer_0/attn_entropy_std": 0.3134068250656128, "geo/layer_7/stable_rank_q_proj": 42.15433883666992, "geo/layer_7/stable_rank_k_proj": 42.36883544921875, "geo/layer_7/stable_rank_o_proj": 110.63021850585938, "geo/layer_7/stable_rank_gate_proj": 103.72041320800781, "geo/layer_7/stable_rank_down_proj": 151.3060760498047, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5535054802894592, "geo/layer_7/attn_entropy_mean": 4.678470134735107, "geo/layer_7/attn_entropy_std": 0.8720636367797852, "geo/layer_14/stable_rank_q_proj": 58.05598831176758, "geo/layer_14/stable_rank_k_proj": 34.950260162353516, "geo/layer_14/stable_rank_o_proj": 53.54974365234375, "geo/layer_14/stable_rank_gate_proj": 88.67565155029297, "geo/layer_14/stable_rank_down_proj": 136.88455200195312, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3690643608570099, "geo/layer_14/attn_entropy_mean": 5.490321636199951, "geo/layer_14/attn_entropy_std": 0.4837072193622589, "geo/layer_21/stable_rank_q_proj": 48.27384567260742, "geo/layer_21/stable_rank_k_proj": 31.907360076904297, "geo/layer_21/stable_rank_o_proj": 84.20173645019531, "geo/layer_21/stable_rank_gate_proj": 87.88884735107422, "geo/layer_21/stable_rank_down_proj": 61.103633880615234, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15215332806110382, "geo/layer_21/attn_entropy_mean": 5.765091896057129, "geo/layer_21/attn_entropy_std": 0.29705172777175903, "geo/layer_27/stable_rank_q_proj": 41.498512268066406, "geo/layer_27/stable_rank_k_proj": 31.804567337036133, "geo/layer_27/stable_rank_o_proj": 118.08855438232422, "geo/layer_27/stable_rank_gate_proj": 91.29725646972656, "geo/layer_27/stable_rank_down_proj": 138.46707153320312, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07717277109622955, "geo/layer_27/attn_entropy_mean": 4.421852111816406, "geo/layer_27/attn_entropy_std": 0.5327169895172119, "attnres/final_alpha/block_0": 0.24208149313926697, "attnres/block_norm/0": 1.6106442213058472, "attnres/final_alpha/block_1": 0.006814457476139069, "attnres/block_norm/1": 28893.60546875, "attnres/final_alpha/block_2": 0.014187687076628208, "attnres/block_norm/2": 20684.8984375, "attnres/final_alpha/block_3": 0.01640964485704899, "attnres/block_norm/3": 29287.201171875, "attnres/final_alpha/block_4": 0.02117101289331913, "attnres/block_norm/4": 9168.51171875, "attnres/final_alpha/block_5": 0.5645231604576111, "attnres/block_norm/5": 4976.5009765625, "attnres/final_alpha/block_6": 0.13481257855892181, "attnres/block_norm/6": 19648.287109375, "geo/tier1_time_s": 1.3595564365386963, "geo/step": 20250.0, "geo/rankme_slope": -4.8560088097739094e-05} {"step": 20260, "timestamp": 1778216434.198254, "train/loss": 2.24072527885437, "train/z_loss": 0.0015547028742730618, "train/perplexity": 9.400146543179343, "train/grad_norm": 0.0986328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1790762.8563407857, "perf/iters_per_sec": 0.8539022714332513, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1710942029953002, "data/tokens_consumed": 42490396672, "data/tokens_consumed_B": 42.490396672, "train/loss_slope": -1.449830160819119e-05} {"step": 20270, "timestamp": 1778216444.5564349, "train/loss": 2.210689735412598, "train/z_loss": 0.0015714227221906186, "train/perplexity": 9.122005994278789, "train/grad_norm": 0.1640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025841.0899259876, "perf/iters_per_sec": 0.9659963082914294, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352006435394288, "data/tokens_consumed": 42511368192, "data/tokens_consumed_B": 42.511368192, "train/loss_slope": -1.3812784829584616e-05} {"step": 20280, "timestamp": 1778216454.9050806, "train/loss": 2.221479558944702, "train/z_loss": 0.0015706335310824216, "train/perplexity": 9.220963737453266, "train/grad_norm": 0.10791015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027672.6492692225, "perf/iters_per_sec": 0.9668696638437378, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342655658721924, "data/tokens_consumed": 42532339712, "data/tokens_consumed_B": 42.532339712, "train/loss_slope": -1.6505895589444472e-05} {"step": 20290, "timestamp": 1778216465.2610178, "train/loss": 2.262009596824646, "train/z_loss": 0.00155752181308344, "train/perplexity": 9.602366675847076, "train/grad_norm": 0.25390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026296.4289225186, "perf/iters_per_sec": 0.9662134308445542, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349680185317993, "data/tokens_consumed": 42553311232, "data/tokens_consumed_B": 42.553311232, "train/loss_slope": -1.503995492799551e-05} {"step": 20300, "timestamp": 1778216475.6316252, "grad/layer_0/attn": 0.0027474842499941587, "grad/layer_0/mlp": 0.0027582875918596983, "grad/layer_0/attn_mlp_ratio": 0.9960832795297522, "grad/layer_4/attn": 0.0017447525169700384, "grad/layer_4/mlp": 0.0024306150153279305, "grad/layer_4/attn_mlp_ratio": 0.7178234455826801, "grad/layer_8/attn": 0.009369729086756706, "grad/layer_8/mlp": 0.0039941794238984585, "grad/layer_8/attn_mlp_ratio": 2.3458457564800503, "grad/layer_12/attn": 0.003987587988376617, "grad/layer_12/mlp": 0.006006660871207714, "grad/layer_12/attn_mlp_ratio": 0.6638610048895207, "grad/layer_16/attn": 0.005408621393144131, "grad/layer_16/mlp": 0.004686110652983189, "grad/layer_16/attn_mlp_ratio": 1.1541812983615445, "grad/layer_20/attn": 0.0037678841035813093, "grad/layer_20/mlp": 0.006156046409159899, "grad/layer_20/attn_mlp_ratio": 0.6120623192132939, "grad/layer_24/attn": 0.005987172946333885, "grad/layer_24/mlp": 0.009192654862999916, "grad/layer_24/attn_mlp_ratio": 0.6512996485163456, "grad/layer_27/attn": 0.005112998187541962, "grad/layer_27/mlp": 0.007673186715692282, "grad/layer_27/attn_mlp_ratio": 0.666346110208795} {"step": 20300, "timestamp": 1778216475.6474361, "train/loss": 2.2675843954086305, "train/z_loss": 0.001563016907311976, "train/perplexity": 9.656047426655462, "train/grad_norm": 0.12255859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020401.4336723534, "perf/iters_per_sec": 0.9634024780618445, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0379877805709838, "data/tokens_consumed": 42574282752, "data/tokens_consumed_B": 42.574282752, "train/loss_slope": -1.1322392195579367e-05} {"step": 20310, "timestamp": 1778216486.0041542, "train/loss": 2.2271704196929933, "train/z_loss": 0.001574570033699274, "train/perplexity": 9.273588556280998, "train/grad_norm": 0.21484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025858.4933018747, "perf/iters_per_sec": 0.9660046068677305, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351917505264283, "data/tokens_consumed": 42595254272, "data/tokens_consumed_B": 42.595254272, "train/loss_slope": -1.160960805476903e-05} {"step": 20320, "timestamp": 1778216496.3531723, "train/loss": 2.2707774877548217, "train/z_loss": 0.0015570823452435434, "train/perplexity": 9.686929355975122, "train/grad_norm": 0.1376953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027518.3194365127, "perf/iters_per_sec": 0.9667960736448825, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343442916870118, "data/tokens_consumed": 42616225792, "data/tokens_consumed_B": 42.616225792, "train/loss_slope": -1.0283180131520175e-05} {"step": 20325, "timestamp": 1778216502.1289737, "eos/sharpness": 11.957478523254393, "eos/L0_probe": 2.0553860664367676, "eos/L_plus": 2.1180217266082764, "eos/L_minus": 2.1123251914978027, "eos/grad_norm": 0.09480089694261551, "eos/embed_grad_frac": 0.25486451387405396, "eos/time_s": 0.6099493503570557} {"step": 20325, "timestamp": 1778216503.5084748, "geo/rankme_last": 439.72320556640625, "geo/layer_0/stable_rank_q_proj": 17.12643814086914, "geo/layer_0/stable_rank_k_proj": 15.133633613586426, "geo/layer_0/stable_rank_o_proj": 52.224952697753906, "geo/layer_0/stable_rank_gate_proj": 153.5307159423828, "geo/layer_0/stable_rank_down_proj": 49.944820404052734, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.04946574196219444, "geo/layer_0/attn_entropy_mean": 6.269720554351807, "geo/layer_0/attn_entropy_std": 0.3193628489971161, "geo/layer_7/stable_rank_q_proj": 42.31432342529297, "geo/layer_7/stable_rank_k_proj": 42.359493255615234, "geo/layer_7/stable_rank_o_proj": 110.5898208618164, "geo/layer_7/stable_rank_gate_proj": 103.90192413330078, "geo/layer_7/stable_rank_down_proj": 151.2146759033203, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.542331874370575, "geo/layer_7/attn_entropy_mean": 4.641562461853027, "geo/layer_7/attn_entropy_std": 0.8458633422851562, "geo/layer_14/stable_rank_q_proj": 58.09590530395508, "geo/layer_14/stable_rank_k_proj": 35.16268539428711, "geo/layer_14/stable_rank_o_proj": 53.52085876464844, "geo/layer_14/stable_rank_gate_proj": 88.34827423095703, "geo/layer_14/stable_rank_down_proj": 137.01475524902344, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37792202830314636, "geo/layer_14/attn_entropy_mean": 5.524686813354492, "geo/layer_14/attn_entropy_std": 0.4935440719127655, "geo/layer_21/stable_rank_q_proj": 48.315582275390625, "geo/layer_21/stable_rank_k_proj": 31.938610076904297, "geo/layer_21/stable_rank_o_proj": 84.05972290039062, "geo/layer_21/stable_rank_gate_proj": 87.5933837890625, "geo/layer_21/stable_rank_down_proj": 61.062252044677734, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1522836685180664, "geo/layer_21/attn_entropy_mean": 5.752852916717529, "geo/layer_21/attn_entropy_std": 0.2958228886127472, "geo/layer_27/stable_rank_q_proj": 41.468143463134766, "geo/layer_27/stable_rank_k_proj": 31.64080238342285, "geo/layer_27/stable_rank_o_proj": 118.41313171386719, "geo/layer_27/stable_rank_gate_proj": 91.4464340209961, "geo/layer_27/stable_rank_down_proj": 138.4144287109375, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07851060479879379, "geo/layer_27/attn_entropy_mean": 4.420591354370117, "geo/layer_27/attn_entropy_std": 0.5462493300437927, "attnres/final_alpha/block_0": 0.2430523931980133, "attnres/block_norm/0": 1.6116297245025635, "attnres/final_alpha/block_1": 0.006899565923959017, "attnres/block_norm/1": 28892.712890625, "attnres/final_alpha/block_2": 0.014391236938536167, "attnres/block_norm/2": 20753.51953125, "attnres/final_alpha/block_3": 0.01637583039700985, "attnres/block_norm/3": 29248.77734375, "attnres/final_alpha/block_4": 0.02091749757528305, "attnres/block_norm/4": 9236.9169921875, "attnres/final_alpha/block_5": 0.5624344944953918, "attnres/block_norm/5": 5029.85400390625, "attnres/final_alpha/block_6": 0.13592898845672607, "attnres/block_norm/6": 19877.265625, "geo/tier1_time_s": 1.3587114810943604, "geo/step": 20325.0, "geo/rankme_slope": -8.790412649434774e-05} {"step": 20330, "timestamp": 1778216508.6875877, "train/loss": 2.2628819227218626, "train/z_loss": 0.0015531048178672791, "train/perplexity": 9.610746723507882, "train/grad_norm": 0.2109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1700927.1835103699, "perf/iters_per_sec": 0.8110652844955301, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2329463720321656, "data/tokens_consumed": 42637197312, "data/tokens_consumed_B": 42.637197312, "train/loss_slope": -9.915488452264664e-06} {"step": 20340, "timestamp": 1778216519.0387394, "train/loss": 2.221298170089722, "train/z_loss": 0.001574918464757502, "train/perplexity": 9.219291309083633, "train/grad_norm": 0.169921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027207.6278176499, "perf/iters_per_sec": 0.9666479243362665, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345028162002563, "data/tokens_consumed": 42658168832, "data/tokens_consumed_B": 42.658168832, "train/loss_slope": -1.1188020657534426e-05} {"step": 20350, "timestamp": 1778216529.3873565, "grad/layer_0/attn": 0.0027969784568995237, "grad/layer_0/mlp": 0.0027887136675417423, "grad/layer_0/attn_mlp_ratio": 1.002963620524263, "grad/layer_4/attn": 0.0017878510989248753, "grad/layer_4/mlp": 0.0026042191311717033, "grad/layer_4/attn_mlp_ratio": 0.6865209647194243, "grad/layer_8/attn": 0.003881002776324749, "grad/layer_8/mlp": 0.0039042781572788954, "grad/layer_8/attn_mlp_ratio": 0.9940384676961093, "grad/layer_12/attn": 0.005781576037406921, "grad/layer_12/mlp": 0.006245515774935484, "grad/layer_12/attn_mlp_ratio": 0.9257163304330958, "grad/layer_16/attn": 0.00418090308085084, "grad/layer_16/mlp": 0.0051066386513412, "grad/layer_16/attn_mlp_ratio": 0.8187191780019243, "grad/layer_20/attn": 0.004408597946166992, "grad/layer_20/mlp": 0.0072233122773468494, "grad/layer_20/attn_mlp_ratio": 0.610329128225559, "grad/layer_24/attn": 0.016101527959108353, "grad/layer_24/mlp": 0.011933514848351479, "grad/layer_24/attn_mlp_ratio": 1.349269517723498, "grad/layer_27/attn": 0.004617185797542334, "grad/layer_27/mlp": 0.012102404609322548, "grad/layer_27/attn_mlp_ratio": 0.3815097832570159} {"step": 20350, "timestamp": 1778216529.4032652, "train/loss": 2.2155699253082277, "train/z_loss": 0.0015753257321193814, "train/perplexity": 9.166631918684134, "train/grad_norm": 0.2021484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025316.7027198086, "perf/iters_per_sec": 0.9657462609862368, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354686737060548, "data/tokens_consumed": 42679140352, "data/tokens_consumed_B": 42.679140352, "train/loss_slope": -1.1048467818087576e-05} {"step": 20360, "timestamp": 1778216539.7619202, "train/loss": 2.22083785533905, "train/z_loss": 0.001572496979497373, "train/perplexity": 9.215048509889767, "train/grad_norm": 0.1015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025571.1192831618, "perf/iters_per_sec": 0.965867576257306, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0353386163711549, "data/tokens_consumed": 42700111872, "data/tokens_consumed_B": 42.700111872, "train/loss_slope": -1.0659951645799277e-05} {"step": 20370, "timestamp": 1778216550.1254432, "train/loss": 2.207258200645447, "train/z_loss": 0.001572429051157087, "train/perplexity": 9.090757159957999, "train/grad_norm": 0.1259765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024972.8405825682, "perf/iters_per_sec": 0.9655822947419015, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0356445074081422, "data/tokens_consumed": 42721083392, "data/tokens_consumed_B": 42.721083392, "train/loss_slope": -9.856178651083432e-06} {"step": 20380, "timestamp": 1778216560.4752543, "train/loss": 2.2497886180877686, "train/z_loss": 0.0015597223304212094, "train/perplexity": 9.485730512566747, "train/grad_norm": 0.25, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027669.1903784196, "perf/iters_per_sec": 0.9668680145160768, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342673301696776, "data/tokens_consumed": 42742054912, "data/tokens_consumed_B": 42.742054912, "train/loss_slope": -6.627048288706369e-06} {"step": 20390, "timestamp": 1778216570.821915, "train/loss": 2.262933611869812, "train/z_loss": 0.00154421633342281, "train/perplexity": 9.611243507656242, "train/grad_norm": 0.1533203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027897.9232566792, "perf/iters_per_sec": 0.9669770828517338, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341506719589233, "data/tokens_consumed": 42763026432, "data/tokens_consumed_B": 42.763026432, "train/loss_slope": -9.062110844797192e-06} {"step": 20400, "timestamp": 1778216581.1670558, "grad/layer_0/attn": 0.0028809625655412674, "grad/layer_0/mlp": 0.002684044186025858, "grad/layer_0/attn_mlp_ratio": 1.0733662557434838, "grad/layer_4/attn": 0.002062634564936161, "grad/layer_4/mlp": 0.002474339911714196, "grad/layer_4/attn_mlp_ratio": 0.8336099950577083, "grad/layer_8/attn": 0.007077280897647142, "grad/layer_8/mlp": 0.003688153112307191, "grad/layer_8/attn_mlp_ratio": 1.9189226938920527, "grad/layer_12/attn": 0.0035304909106343985, "grad/layer_12/mlp": 0.005995600018650293, "grad/layer_12/attn_mlp_ratio": 0.5888469612328264, "grad/layer_16/attn": 0.004324435722082853, "grad/layer_16/mlp": 0.004436900373548269, "grad/layer_16/attn_mlp_ratio": 0.9746524060803474, "grad/layer_20/attn": 0.004142322111874819, "grad/layer_20/mlp": 0.00608051847666502, "grad/layer_20/attn_mlp_ratio": 0.6812448740427592, "grad/layer_24/attn": 0.018268505111336708, "grad/layer_24/mlp": 0.013868727721273899, "grad/layer_24/attn_mlp_ratio": 1.3172444759723225, "grad/layer_27/attn": 0.007714216597378254, "grad/layer_27/mlp": 0.013086868450045586, "grad/layer_27/attn_mlp_ratio": 0.5894623735142035} {"step": 20400, "timestamp": 1778216581.7763503, "eos/sharpness": 26.96766853332519, "eos/L0_probe": 2.048130750656128, "eos/L_plus": 2.1963818073272705, "eos/L_minus": 2.1695563793182373, "eos/grad_norm": 0.16016261279582977, "eos/embed_grad_frac": 0.09624744206666946, "eos/time_s": 0.6058876514434814} {"step": 20400, "timestamp": 1778216581.7987413, "train/loss": 2.239565634727478, "train/z_loss": 0.0015718664159066974, "train/perplexity": 9.389252036544823, "train/grad_norm": 0.16015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1911480.9866388545, "perf/iters_per_sec": 0.9114651616281769, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0971346378326416, "data/tokens_consumed": 42783997952, "data/tokens_consumed_B": 42.783997952, "train/loss_slope": -8.976415424707389e-06} {"step": 20400, "timestamp": 1778216583.1653817, "geo/rankme_last": 441.0271911621094, "geo/layer_0/stable_rank_q_proj": 17.128183364868164, "geo/layer_0/stable_rank_k_proj": 15.090800285339355, "geo/layer_0/stable_rank_o_proj": 52.29145812988281, "geo/layer_0/stable_rank_gate_proj": 153.27923583984375, "geo/layer_0/stable_rank_down_proj": 50.00807189941406, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.04942171648144722, "geo/layer_0/attn_entropy_mean": 6.2679290771484375, "geo/layer_0/attn_entropy_std": 0.32188868522644043, "geo/layer_7/stable_rank_q_proj": 42.224639892578125, "geo/layer_7/stable_rank_k_proj": 42.26884841918945, "geo/layer_7/stable_rank_o_proj": 111.08988189697266, "geo/layer_7/stable_rank_gate_proj": 103.87250518798828, "geo/layer_7/stable_rank_down_proj": 151.1108856201172, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5592630505561829, "geo/layer_7/attn_entropy_mean": 4.627047061920166, "geo/layer_7/attn_entropy_std": 0.8525096774101257, "geo/layer_14/stable_rank_q_proj": 58.035850524902344, "geo/layer_14/stable_rank_k_proj": 35.09996795654297, "geo/layer_14/stable_rank_o_proj": 53.56563186645508, "geo/layer_14/stable_rank_gate_proj": 88.16728973388672, "geo/layer_14/stable_rank_down_proj": 137.1444091796875, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38087472319602966, "geo/layer_14/attn_entropy_mean": 5.508410930633545, "geo/layer_14/attn_entropy_std": 0.4809440076351166, "geo/layer_21/stable_rank_q_proj": 48.24827575683594, "geo/layer_21/stable_rank_k_proj": 31.930776596069336, "geo/layer_21/stable_rank_o_proj": 84.0455551147461, "geo/layer_21/stable_rank_gate_proj": 87.49909210205078, "geo/layer_21/stable_rank_down_proj": 61.08600997924805, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15689633786678314, "geo/layer_21/attn_entropy_mean": 5.733626365661621, "geo/layer_21/attn_entropy_std": 0.29038968682289124, "geo/layer_27/stable_rank_q_proj": 41.467613220214844, "geo/layer_27/stable_rank_k_proj": 31.67874526977539, "geo/layer_27/stable_rank_o_proj": 118.62718963623047, "geo/layer_27/stable_rank_gate_proj": 91.32808685302734, "geo/layer_27/stable_rank_down_proj": 138.42898559570312, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08362414687871933, "geo/layer_27/attn_entropy_mean": 4.400388717651367, "geo/layer_27/attn_entropy_std": 0.5384172797203064, "attnres/final_alpha/block_0": 0.24311666190624237, "attnres/block_norm/0": 1.6124763488769531, "attnres/final_alpha/block_1": 0.006922760047018528, "attnres/block_norm/1": 28964.12890625, "attnres/final_alpha/block_2": 0.014132599346339703, "attnres/block_norm/2": 20764.45703125, "attnres/final_alpha/block_3": 0.01620502956211567, "attnres/block_norm/3": 29307.5078125, "attnres/final_alpha/block_4": 0.021084900945425034, "attnres/block_norm/4": 9247.91796875, "attnres/final_alpha/block_5": 0.5604597926139832, "attnres/block_norm/5": 5004.607421875, "attnres/final_alpha/block_6": 0.1380782425403595, "attnres/block_norm/6": 19640.701171875, "geo/tier1_time_s": 1.363107442855835, "geo/step": 20400.0, "geo/rankme_slope": -7.204371983168268e-05} {"step": 20410, "timestamp": 1778216593.5312934, "train/loss": 2.2529650926589966, "train/z_loss": 0.0015474617946892976, "train/perplexity": 9.51590960050659, "train/grad_norm": 0.23046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1788012.4711957409, "perf/iters_per_sec": 0.852590785596724, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.172895622253418, "data/tokens_consumed": 42804969472, "data/tokens_consumed_B": 42.804969472, "train/loss_slope": -9.701114182997214e-06} {"step": 20420, "timestamp": 1778216603.8947873, "train/loss": 2.203200030326843, "train/z_loss": 0.0015797327505424619, "train/perplexity": 9.053940074606906, "train/grad_norm": 0.126953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024930.3729686164, "perf/iters_per_sec": 0.9655620446055491, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0356662273406982, "data/tokens_consumed": 42825940992, "data/tokens_consumed_B": 42.825940992, "train/loss_slope": -1.284301234479064e-05} {"step": 20430, "timestamp": 1778216614.2554557, "train/loss": 2.2779045581817625, "train/z_loss": 0.0015485279494896532, "train/perplexity": 9.756215393668537, "train/grad_norm": 0.1953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025184.4593390268, "perf/iters_per_sec": 0.9656832024283537, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035536289215088, "data/tokens_consumed": 42846912512, "data/tokens_consumed_B": 42.846912512, "train/loss_slope": -1.0522459263634159e-05} {"step": 20440, "timestamp": 1778216624.6029902, "train/loss": 2.2123950719833374, "train/z_loss": 0.0015594798722304403, "train/perplexity": 9.137575356427465, "train/grad_norm": 0.1904296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027782.3583205224, "perf/iters_per_sec": 0.9669219771959888, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342096090316772, "data/tokens_consumed": 42867884032, "data/tokens_consumed_B": 42.867884032, "train/loss_slope": -8.260817160092756e-06} {"step": 20450, "timestamp": 1778216634.9410863, "grad/layer_0/attn": 0.0023678962606936693, "grad/layer_0/mlp": 0.002543332288041711, "grad/layer_0/attn_mlp_ratio": 0.9310211562700532, "grad/layer_4/attn": 0.003111450932919979, "grad/layer_4/mlp": 0.002500250469893217, "grad/layer_4/attn_mlp_ratio": 1.2444556439208674, "grad/layer_8/attn": 0.0074660321697592735, "grad/layer_8/mlp": 0.0037070803809911013, "grad/layer_8/attn_mlp_ratio": 2.01399246874813, "grad/layer_12/attn": 0.0038688387721776962, "grad/layer_12/mlp": 0.005888433661311865, "grad/layer_12/attn_mlp_ratio": 0.6570234002794948, "grad/layer_16/attn": 0.0035758099984377623, "grad/layer_16/mlp": 0.004273514728993177, "grad/layer_16/attn_mlp_ratio": 0.8367374729059285, "grad/layer_20/attn": 0.011472679674625397, "grad/layer_20/mlp": 0.005438320804387331, "grad/layer_20/attn_mlp_ratio": 2.1095996128823304, "grad/layer_24/attn": 0.00491460831835866, "grad/layer_24/mlp": 0.008465121500194073, "grad/layer_24/attn_mlp_ratio": 0.580571496840162, "grad/layer_27/attn": 0.005558057688176632, "grad/layer_27/mlp": 0.007204634603112936, "grad/layer_27/attn_mlp_ratio": 0.7714558637893382} {"step": 20450, "timestamp": 1778216634.9567578, "train/loss": 2.1518078088760375, "train/z_loss": 0.0015901519800536335, "train/perplexity": 8.600392217246135, "train/grad_norm": 0.10107421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026597.1292407871, "perf/iters_per_sec": 0.9663568159297882, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034814453125, "data/tokens_consumed": 42888855552, "data/tokens_consumed_B": 42.888855552, "train/loss_slope": -1.2414525101000984e-05} {"step": 20460, "timestamp": 1778216645.3037732, "train/loss": 2.215035557746887, "train/z_loss": 0.0015692361863330006, "train/perplexity": 9.161734876466824, "train/grad_norm": 0.17578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027550.7538596473, "perf/iters_per_sec": 0.966811539583038, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034327745437622, "data/tokens_consumed": 42909827072, "data/tokens_consumed_B": 42.909827072, "train/loss_slope": -1.56129151466477e-05} {"step": 20470, "timestamp": 1778216655.6577, "train/loss": 2.1618203639984133, "train/z_loss": 0.0015912975766696035, "train/perplexity": 8.686936660887023, "train/grad_norm": 0.255859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026551.7453660513, "perf/iters_per_sec": 0.9663351752119309, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348376274108886, "data/tokens_consumed": 42930798592, "data/tokens_consumed_B": 42.930798592, "train/loss_slope": -2.231876500333136e-05} {"step": 20475, "timestamp": 1778216661.4332426, "eos/sharpness": 18.208527565002438, "eos/L0_probe": 2.050405979156494, "eos/L_plus": 2.139899492263794, "eos/L_minus": 2.1429977416992188, "eos/grad_norm": 0.14644666016101837, "eos/embed_grad_frac": 0.11805225163698196, "eos/time_s": 0.6085007190704346} {"step": 20475, "timestamp": 1778216662.8092437, "geo/rankme_last": 440.4685974121094, "geo/layer_0/stable_rank_q_proj": 17.14682388305664, "geo/layer_0/stable_rank_k_proj": 15.123393058776855, "geo/layer_0/stable_rank_o_proj": 52.42178726196289, "geo/layer_0/stable_rank_gate_proj": 153.53697204589844, "geo/layer_0/stable_rank_down_proj": 50.037139892578125, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05135742574930191, "geo/layer_0/attn_entropy_mean": 6.270388603210449, "geo/layer_0/attn_entropy_std": 0.3211056888103485, "geo/layer_7/stable_rank_q_proj": 42.10881423950195, "geo/layer_7/stable_rank_k_proj": 42.236881256103516, "geo/layer_7/stable_rank_o_proj": 111.45452880859375, "geo/layer_7/stable_rank_gate_proj": 104.1087646484375, "geo/layer_7/stable_rank_down_proj": 151.0125274658203, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5589397549629211, "geo/layer_7/attn_entropy_mean": 4.644625663757324, "geo/layer_7/attn_entropy_std": 0.8480989336967468, "geo/layer_14/stable_rank_q_proj": 58.23179244995117, "geo/layer_14/stable_rank_k_proj": 35.07133865356445, "geo/layer_14/stable_rank_o_proj": 53.543643951416016, "geo/layer_14/stable_rank_gate_proj": 87.87215423583984, "geo/layer_14/stable_rank_down_proj": 137.12420654296875, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3823622465133667, "geo/layer_14/attn_entropy_mean": 5.526612758636475, "geo/layer_14/attn_entropy_std": 0.4837665557861328, "geo/layer_21/stable_rank_q_proj": 48.27528762817383, "geo/layer_21/stable_rank_k_proj": 31.969858169555664, "geo/layer_21/stable_rank_o_proj": 84.27020263671875, "geo/layer_21/stable_rank_gate_proj": 87.28504943847656, "geo/layer_21/stable_rank_down_proj": 61.14237594604492, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15419036149978638, "geo/layer_21/attn_entropy_mean": 5.735912322998047, "geo/layer_21/attn_entropy_std": 0.28941500186920166, "geo/layer_27/stable_rank_q_proj": 41.380218505859375, "geo/layer_27/stable_rank_k_proj": 31.72685432434082, "geo/layer_27/stable_rank_o_proj": 118.40889739990234, "geo/layer_27/stable_rank_gate_proj": 91.44113159179688, "geo/layer_27/stable_rank_down_proj": 138.4258270263672, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07878387719392776, "geo/layer_27/attn_entropy_mean": 4.389175891876221, "geo/layer_27/attn_entropy_std": 0.5478426814079285, "attnres/final_alpha/block_0": 0.24262821674346924, "attnres/block_norm/0": 1.6133219003677368, "attnres/final_alpha/block_1": 0.0068291109055280685, "attnres/block_norm/1": 29055.4921875, "attnres/final_alpha/block_2": 0.014130469411611557, "attnres/block_norm/2": 20878.484375, "attnres/final_alpha/block_3": 0.016298538073897362, "attnres/block_norm/3": 29487.3828125, "attnres/final_alpha/block_4": 0.021012216806411743, "attnres/block_norm/4": 9260.8505859375, "attnres/final_alpha/block_5": 0.5628830790519714, "attnres/block_norm/5": 4989.1806640625, "attnres/final_alpha/block_6": 0.1362183392047882, "attnres/block_norm/6": 19964.15234375, "geo/tier1_time_s": 1.3561804294586182, "geo/step": 20475.0, "geo/rankme_slope": -6.833809695753301e-05} {"step": 20480, "timestamp": 1778216667.9871416, "train/loss": 2.2104262351989745, "train/z_loss": 0.0015775499283336102, "train/perplexity": 9.119602660404023, "train/grad_norm": 0.1865234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1701744.7617650242, "perf/iters_per_sec": 0.8114551361870881, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2323540210723878, "data/tokens_consumed": 42951770112, "data/tokens_consumed_B": 42.951770112, "train/loss_slope": -2.7625163668023884e-05} {"step": 20490, "timestamp": 1778216678.3422673, "train/loss": 2.241891312599182, "train/z_loss": 0.001559830957558006, "train/perplexity": 9.411113824122076, "train/grad_norm": 0.283203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027064.9537260947, "perf/iters_per_sec": 0.9665798920279001, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034575629234314, "data/tokens_consumed": 42972741632, "data/tokens_consumed_B": 42.972741632, "train/loss_slope": -2.4878487356639713e-05} {"step": 20500, "timestamp": 1778216688.6805515, "grad/layer_0/attn": 0.005861579906195402, "grad/layer_0/mlp": 0.004474443383514881, "grad/layer_0/attn_mlp_ratio": 1.310013173211624, "grad/layer_4/attn": 0.0026389833074063063, "grad/layer_4/mlp": 0.0027841536793857813, "grad/layer_4/attn_mlp_ratio": 0.9478583140578171, "grad/layer_8/attn": 0.005626911763101816, "grad/layer_8/mlp": 0.004091208800673485, "grad/layer_8/attn_mlp_ratio": 1.3753665236149462, "grad/layer_12/attn": 0.005774045828729868, "grad/layer_12/mlp": 0.007182663772255182, "grad/layer_12/attn_mlp_ratio": 0.8038864036271487, "grad/layer_16/attn": 0.00371216400526464, "grad/layer_16/mlp": 0.00436401879414916, "grad/layer_16/attn_mlp_ratio": 0.8506296822503537, "grad/layer_20/attn": 0.004230624530464411, "grad/layer_20/mlp": 0.005902266129851341, "grad/layer_20/attn_mlp_ratio": 0.7167796852448939, "grad/layer_24/attn": 0.013370406813919544, "grad/layer_24/mlp": 0.012128996662795544, "grad/layer_24/attn_mlp_ratio": 1.10235059629432, "grad/layer_27/attn": 0.0051389699801802635, "grad/layer_27/mlp": 0.011605589650571346, "grad/layer_27/attn_mlp_ratio": 0.44280127857589235} {"step": 20500, "timestamp": 1778216688.6963966, "train/loss": 2.2259254693984984, "train/z_loss": 0.0015497097396291793, "train/perplexity": 9.262050583068634, "train/grad_norm": 0.171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026326.8170294613, "perf/iters_per_sec": 0.966227921023112, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349524974823, "data/tokens_consumed": 42993713152, "data/tokens_consumed_B": 42.993713152, "train/loss_slope": -2.4351488860777242e-05} {"step": 20500, "timestamp": 1778216695.6495912, "geo/ww_alpha_mean": 7.946128301368878, "geo/ww_alpha_std": 4.366404128118852, "geo/ww_alpha_min": 1.357725664748263, "geo/ww_alpha_max": 25.862278343612026, "geo/ww_alpha_healthy_frac": 0.16751269035532995, "geo/ww_alpha_by_type/q_proj": 4.193942606929963, "geo/ww_alpha_by_type/k_proj": 4.623494440767115, "geo/ww_alpha_by_type/v_proj": 8.38968085559163, "geo/ww_alpha_by_type/o_proj": 8.942317918111172, "geo/ww_alpha_by_type/gate_proj": 8.884050817515734, "geo/ww_alpha_by_type/up_proj": 11.661007325895536, "geo/ww_alpha_by_type/down_proj": 9.0580298138325, "geo/twonn_id/layer_0": 0.7659134864807129, "geo/twonn_id/layer_7": 3.133023500442505, "geo/twonn_id/layer_14": 4.238701820373535, "geo/twonn_id/layer_21": 7.5467352867126465, "geo/twonn_id/layer_27": 5.8080339431762695, "geo/tier2_time_s": 6.946029424667358} {"step": 20500, "timestamp": 1778216696.257607, "eoc/jacobian_sigma/layer_0/attn": 878.2415771484375, "eoc/jacobian_sigma/layer_0/mlp": 5163.22412109375, "eoc/jacobian_sigma/layer_0": 5163.22412109375, "eoc/jacobian_sigma/layer_7/attn": 1.1560817956924438, "eoc/jacobian_sigma/layer_7/mlp": 1.6415421962738037, "eoc/jacobian_sigma/layer_7": 1.6415421962738037, "eoc/jacobian_sigma/layer_14/attn": 1.5908325910568237, "eoc/jacobian_sigma/layer_14/mlp": 9.09342098236084, "eoc/jacobian_sigma/layer_14": 9.09342098236084, "eoc/jacobian_sigma/layer_21/attn": 1.0754497051239014, "eoc/jacobian_sigma/layer_21/mlp": 4.089704990386963, "eoc/jacobian_sigma/layer_21": 4.089704990386963, "eoc/jacobian_sigma/layer_27/attn": 3.3459784984588623, "eoc/jacobian_sigma/layer_27/mlp": 23.90303611755371, "eoc/jacobian_sigma/layer_27": 23.90303611755371, "eoc/layer0_sigma": 5163.22412109375, "eoc/sigma_max": 23.90303611755371, "eoc/sigma_min": 1.6415421962738037, "eoc/sigma_mean": 9.68192607164383, "eoc/time_s": 0.6022899150848389} {"step": 20510, "timestamp": 1778216706.627282, "train/loss": 2.2824583292007445, "train/z_loss": 0.0015422766213305295, "train/perplexity": 9.800744274798408, "train/grad_norm": 0.1591796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1169970.3733285968, "perf/iters_per_sec": 0.5578853479998573, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.7924829959869384, "data/tokens_consumed": 43014684672, "data/tokens_consumed_B": 43.014684672, "train/loss_slope": -1.9170602734940857e-05} {"step": 20520, "timestamp": 1778216716.984643, "train/loss": 2.179132175445557, "train/z_loss": 0.001569008210208267, "train/perplexity": 8.838632547132711, "train/grad_norm": 0.2314453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025825.179862746, "perf/iters_per_sec": 0.9659887217820864, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035208773612976, "data/tokens_consumed": 43035656192, "data/tokens_consumed_B": 43.035656192, "train/loss_slope": -2.197012380547898e-05} {"step": 20530, "timestamp": 1778216727.3452666, "train/loss": 2.2478429079055786, "train/z_loss": 0.0015568736009299756, "train/perplexity": 9.467291973966118, "train/grad_norm": 0.12158203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025196.116206716, "perf/iters_per_sec": 0.9656887608560162, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0355303287506104, "data/tokens_consumed": 43056627712, "data/tokens_consumed_B": 43.056627712, "train/loss_slope": -1.99845915043851e-05} {"step": 20540, "timestamp": 1778216737.702057, "train/loss": 2.2595335245132446, "train/z_loss": 0.0015566427377052606, "train/perplexity": 9.578619933055718, "train/grad_norm": 0.1357421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026821.4637588523, "perf/iters_per_sec": 0.9664637869638693, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346999168395996, "data/tokens_consumed": 43077599232, "data/tokens_consumed_B": 43.077599232, "train/loss_slope": -1.69916142081127e-05} {"step": 20550, "timestamp": 1778216748.040938, "grad/layer_0/attn": 0.0036760419607162476, "grad/layer_0/mlp": 0.003007342107594013, "grad/layer_0/attn_mlp_ratio": 1.2223557237462574, "grad/layer_4/attn": 0.0020296184811741114, "grad/layer_4/mlp": 0.002670549787580967, "grad/layer_4/attn_mlp_ratio": 0.7600002121707514, "grad/layer_8/attn": 0.004818939138203859, "grad/layer_8/mlp": 0.003913800697773695, "grad/layer_8/attn_mlp_ratio": 1.2312683724079772, "grad/layer_12/attn": 0.004374614451080561, "grad/layer_12/mlp": 0.005993671249598265, "grad/layer_12/attn_mlp_ratio": 0.7298722595748892, "grad/layer_16/attn": 0.004217767622321844, "grad/layer_16/mlp": 0.004544955212622881, "grad/layer_16/attn_mlp_ratio": 0.9280108014720573, "grad/layer_20/attn": 0.0038296638522297144, "grad/layer_20/mlp": 0.006883644964545965, "grad/layer_20/attn_mlp_ratio": 0.5563424343236839, "grad/layer_24/attn": 0.015319136902689934, "grad/layer_24/mlp": 0.011669295839965343, "grad/layer_24/attn_mlp_ratio": 1.312773022597234, "grad/layer_27/attn": 0.013055499643087387, "grad/layer_27/mlp": 0.010861698538064957, "grad/layer_27/attn_mlp_ratio": 1.2019758675069687} {"step": 20550, "timestamp": 1778216748.6721642, "eos/sharpness": 47.41952419281005, "eos/L0_probe": 2.046725273132324, "eos/L_plus": 2.325439453125, "eos/L_minus": 2.242206335067749, "eos/grad_norm": 0.17772281169891357, "eos/embed_grad_frac": 0.07234100252389908, "eos/time_s": 0.6283712387084961} {"step": 20550, "timestamp": 1778216748.6935527, "train/loss": 2.220918297767639, "train/z_loss": 0.001555079547688365, "train/perplexity": 9.215789820587485, "train/grad_norm": 0.177734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1908991.6772246072, "perf/iters_per_sec": 0.910278166401199, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0985652923583984, "data/tokens_consumed": 43098570752, "data/tokens_consumed_B": 43.098570752, "train/loss_slope": -2.0079941081457084e-05} {"step": 20550, "timestamp": 1778216750.0556319, "geo/rankme_last": 440.9683532714844, "geo/layer_0/stable_rank_q_proj": 17.131805419921875, "geo/layer_0/stable_rank_k_proj": 15.123026847839355, "geo/layer_0/stable_rank_o_proj": 52.48068618774414, "geo/layer_0/stable_rank_gate_proj": 153.9547119140625, "geo/layer_0/stable_rank_down_proj": 50.038089752197266, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.04824895039200783, "geo/layer_0/attn_entropy_mean": 6.264847755432129, "geo/layer_0/attn_entropy_std": 0.3189711272716522, "geo/layer_7/stable_rank_q_proj": 42.049015045166016, "geo/layer_7/stable_rank_k_proj": 42.20047378540039, "geo/layer_7/stable_rank_o_proj": 110.83025360107422, "geo/layer_7/stable_rank_gate_proj": 103.82855987548828, "geo/layer_7/stable_rank_down_proj": 150.65318298339844, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5555976033210754, "geo/layer_7/attn_entropy_mean": 4.621034622192383, "geo/layer_7/attn_entropy_std": 0.8431045413017273, "geo/layer_14/stable_rank_q_proj": 58.23301315307617, "geo/layer_14/stable_rank_k_proj": 35.042598724365234, "geo/layer_14/stable_rank_o_proj": 53.56186294555664, "geo/layer_14/stable_rank_gate_proj": 87.54829406738281, "geo/layer_14/stable_rank_down_proj": 136.9720458984375, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.390118807554245, "geo/layer_14/attn_entropy_mean": 5.48149299621582, "geo/layer_14/attn_entropy_std": 0.47881755232810974, "geo/layer_21/stable_rank_q_proj": 48.0318717956543, "geo/layer_21/stable_rank_k_proj": 31.964323043823242, "geo/layer_21/stable_rank_o_proj": 84.34456634521484, "geo/layer_21/stable_rank_gate_proj": 87.16468811035156, "geo/layer_21/stable_rank_down_proj": 61.125247955322266, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15104329586029053, "geo/layer_21/attn_entropy_mean": 5.745785713195801, "geo/layer_21/attn_entropy_std": 0.29420536756515503, "geo/layer_27/stable_rank_q_proj": 41.37913131713867, "geo/layer_27/stable_rank_k_proj": 31.591367721557617, "geo/layer_27/stable_rank_o_proj": 118.06562042236328, "geo/layer_27/stable_rank_gate_proj": 91.36563110351562, "geo/layer_27/stable_rank_down_proj": 138.3709259033203, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0766625851392746, "geo/layer_27/attn_entropy_mean": 4.402754783630371, "geo/layer_27/attn_entropy_std": 0.5484866499900818, "attnres/final_alpha/block_0": 0.2410910427570343, "attnres/block_norm/0": 1.6139376163482666, "attnres/final_alpha/block_1": 0.006665996741503477, "attnres/block_norm/1": 29185.83984375, "attnres/final_alpha/block_2": 0.013925358653068542, "attnres/block_norm/2": 20910.9921875, "attnres/final_alpha/block_3": 0.016087012365460396, "attnres/block_norm/3": 29396.89453125, "attnres/final_alpha/block_4": 0.020845724269747734, "attnres/block_norm/4": 9252.482421875, "attnres/final_alpha/block_5": 0.5688410997390747, "attnres/block_norm/5": 4978.78515625, "attnres/final_alpha/block_6": 0.13254371285438538, "attnres/block_norm/6": 19878.29296875, "geo/tier1_time_s": 1.3582682609558105, "geo/step": 20550.0, "geo/rankme_slope": -8.128808163890557e-05} {"step": 20560, "timestamp": 1778216760.4051747, "train/loss": 2.1919026613235473, "train/z_loss": 0.0015773550025187434, "train/perplexity": 8.95222998264439, "train/grad_norm": 0.19921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1791224.7090864757, "perf/iters_per_sec": 0.8541224999840143, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.170792245864868, "data/tokens_consumed": 43119542272, "data/tokens_consumed_B": 43.119542272, "train/loss_slope": -2.2690901998067256e-05} {"step": 20570, "timestamp": 1778216770.756998, "train/loss": 2.23621609210968, "train/z_loss": 0.0015531319775618612, "train/perplexity": 9.357854948995376, "train/grad_norm": 0.1875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026793.0222669062, "perf/iters_per_sec": 0.9664502250036746, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034714436531067, "data/tokens_consumed": 43140513792, "data/tokens_consumed_B": 43.140513792, "train/loss_slope": -2.0000626266163618e-05} {"step": 20580, "timestamp": 1778216781.1094115, "train/loss": 2.2136615753173827, "train/z_loss": 0.0015679928474128246, "train/perplexity": 9.149155457651974, "train/grad_norm": 0.19921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026801.1483260116, "perf/iters_per_sec": 0.9664540998106058, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347102880477905, "data/tokens_consumed": 43161485312, "data/tokens_consumed_B": 43.161485312, "train/loss_slope": -2.1132068036019615e-05} {"step": 20590, "timestamp": 1778216791.4655194, "train/loss": 2.2453076601028443, "train/z_loss": 0.0015581035520881414, "train/perplexity": 9.443320442517205, "train/grad_norm": 0.224609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026118.0863788857, "perf/iters_per_sec": 0.9661283904928616, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035059118270874, "data/tokens_consumed": 43182456832, "data/tokens_consumed_B": 43.182456832, "train/loss_slope": -2.02676311112938e-05} {"step": 20600, "timestamp": 1778216801.806218, "grad/layer_0/attn": 0.002959385747089982, "grad/layer_0/mlp": 0.0027955356054008007, "grad/layer_0/attn_mlp_ratio": 1.0586113213909711, "grad/layer_4/attn": 0.001687901560217142, "grad/layer_4/mlp": 0.002529025310650468, "grad/layer_4/attn_mlp_ratio": 0.6674118627313484, "grad/layer_8/attn": 0.005704266019165516, "grad/layer_8/mlp": 0.003821093589067459, "grad/layer_8/attn_mlp_ratio": 1.492835947855978, "grad/layer_12/attn": 0.003973869141191244, "grad/layer_12/mlp": 0.005967558361589909, "grad/layer_12/attn_mlp_ratio": 0.6659120588041135, "grad/layer_16/attn": 0.00456751836463809, "grad/layer_16/mlp": 0.0042366888374090195, "grad/layer_16/attn_mlp_ratio": 1.0780867871388717, "grad/layer_20/attn": 0.0038679030258208513, "grad/layer_20/mlp": 0.0056422376073896885, "grad/layer_20/attn_mlp_ratio": 0.685526421681069, "grad/layer_24/attn": 0.004042535088956356, "grad/layer_24/mlp": 0.007843476720154285, "grad/layer_24/attn_mlp_ratio": 0.5154009097813382, "grad/layer_27/attn": 0.004732043948024511, "grad/layer_27/mlp": 0.007096529472619295, "grad/layer_27/attn_mlp_ratio": 0.6668109953747345} {"step": 20600, "timestamp": 1778216801.8220346, "train/loss": 2.203090500831604, "train/z_loss": 0.0015564648900181055, "train/perplexity": 9.052948455427376, "train/grad_norm": 0.09423828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025908.3255333079, "perf/iters_per_sec": 0.966028368727354, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03516628742218, "data/tokens_consumed": 43203428352, "data/tokens_consumed_B": 43.203428352, "train/loss_slope": -2.2266322672992997e-05} {"step": 20610, "timestamp": 1778216812.1718917, "train/loss": 2.2799692153930664, "train/z_loss": 0.0015462082927115261, "train/perplexity": 9.776379442897976, "train/grad_norm": 0.1669921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027262.4790636543, "perf/iters_per_sec": 0.9666740794485351, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03447482585907, "data/tokens_consumed": 43224399872, "data/tokens_consumed_B": 43.224399872, "train/loss_slope": -1.9631031658997796e-05} {"step": 20620, "timestamp": 1778216822.5227041, "train/loss": 2.248477077484131, "train/z_loss": 0.0015591844799928366, "train/perplexity": 9.473297746665171, "train/grad_norm": 0.1796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027090.6933756778, "perf/iters_per_sec": 0.9665921656492604, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345624923706054, "data/tokens_consumed": 43245371392, "data/tokens_consumed_B": 43.245371392, "train/loss_slope": -2.214234053391146e-05} {"step": 20625, "timestamp": 1778216828.2991116, "eos/sharpness": 47.9743480682373, "eos/L0_probe": 2.052791118621826, "eos/L_plus": 2.2255818843841553, "eos/L_minus": 2.35974383354187, "eos/grad_norm": 0.1589605212211609, "eos/embed_grad_frac": 0.09864863008260727, "eos/time_s": 0.6118769645690918} {"step": 20625, "timestamp": 1778216829.6794446, "geo/rankme_last": 440.99041748046875, "geo/layer_0/stable_rank_q_proj": 17.115459442138672, "geo/layer_0/stable_rank_k_proj": 15.148231506347656, "geo/layer_0/stable_rank_o_proj": 52.424442291259766, "geo/layer_0/stable_rank_gate_proj": 153.19122314453125, "geo/layer_0/stable_rank_down_proj": 50.03791809082031, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.04621874541044235, "geo/layer_0/attn_entropy_mean": 6.256936073303223, "geo/layer_0/attn_entropy_std": 0.321149617433548, "geo/layer_7/stable_rank_q_proj": 42.21205520629883, "geo/layer_7/stable_rank_k_proj": 42.1643180847168, "geo/layer_7/stable_rank_o_proj": 110.40055084228516, "geo/layer_7/stable_rank_gate_proj": 103.94371032714844, "geo/layer_7/stable_rank_down_proj": 150.85760498046875, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5478374361991882, "geo/layer_7/attn_entropy_mean": 4.638487815856934, "geo/layer_7/attn_entropy_std": 0.8564514517784119, "geo/layer_14/stable_rank_q_proj": 58.44898986816406, "geo/layer_14/stable_rank_k_proj": 34.93794250488281, "geo/layer_14/stable_rank_o_proj": 53.7374153137207, "geo/layer_14/stable_rank_gate_proj": 87.27018737792969, "geo/layer_14/stable_rank_down_proj": 136.8004608154297, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38122034072875977, "geo/layer_14/attn_entropy_mean": 5.529131889343262, "geo/layer_14/attn_entropy_std": 0.4838973879814148, "geo/layer_21/stable_rank_q_proj": 48.061614990234375, "geo/layer_21/stable_rank_k_proj": 31.974403381347656, "geo/layer_21/stable_rank_o_proj": 84.1890869140625, "geo/layer_21/stable_rank_gate_proj": 87.26564025878906, "geo/layer_21/stable_rank_down_proj": 61.01388931274414, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15035554766654968, "geo/layer_21/attn_entropy_mean": 5.762572288513184, "geo/layer_21/attn_entropy_std": 0.2998143136501312, "geo/layer_27/stable_rank_q_proj": 41.2897834777832, "geo/layer_27/stable_rank_k_proj": 31.613073348999023, "geo/layer_27/stable_rank_o_proj": 117.68306732177734, "geo/layer_27/stable_rank_gate_proj": 91.49047088623047, "geo/layer_27/stable_rank_down_proj": 138.39772033691406, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07663750648498535, "geo/layer_27/attn_entropy_mean": 4.41440486907959, "geo/layer_27/attn_entropy_std": 0.5482606887817383, "attnres/final_alpha/block_0": 0.2427838295698166, "attnres/block_norm/0": 1.6146857738494873, "attnres/final_alpha/block_1": 0.006785999052226543, "attnres/block_norm/1": 29173.396484375, "attnres/final_alpha/block_2": 0.014175385236740112, "attnres/block_norm/2": 20927.5859375, "attnres/final_alpha/block_3": 0.01621520332992077, "attnres/block_norm/3": 29661.078125, "attnres/final_alpha/block_4": 0.021122638136148453, "attnres/block_norm/4": 9329.8681640625, "attnres/final_alpha/block_5": 0.5624188184738159, "attnres/block_norm/5": 5007.5263671875, "attnres/final_alpha/block_6": 0.13649815320968628, "attnres/block_norm/6": 19914.37109375, "geo/tier1_time_s": 1.3603222370147705, "geo/step": 20625.0, "geo/rankme_slope": -6.278659901460584e-05} {"step": 20630, "timestamp": 1778216834.8580816, "train/loss": 2.200841951370239, "train/z_loss": 0.0015815443242900074, "train/perplexity": 9.032615321649645, "train/grad_norm": 0.1494140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1700814.2750191465, "perf/iters_per_sec": 0.8110114455314381, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2330282211303711, "data/tokens_consumed": 43266342912, "data/tokens_consumed_B": 43.266342912, "train/loss_slope": -2.3549852992119867e-05} {"step": 20640, "timestamp": 1778216845.2063367, "train/loss": 2.161072325706482, "train/z_loss": 0.0015813009464181959, "train/perplexity": 8.680440929455917, "train/grad_norm": 0.2109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027616.280168426, "perf/iters_per_sec": 0.9668427849619036, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034294319152832, "data/tokens_consumed": 43287314432, "data/tokens_consumed_B": 43.287314432, "train/loss_slope": -2.6286465635966337e-05} {"step": 20650, "timestamp": 1778216855.5767944, "grad/layer_0/attn": 0.0033463004510849714, "grad/layer_0/mlp": 0.0027882263530045748, "grad/layer_0/attn_mlp_ratio": 1.2001537563346119, "grad/layer_4/attn": 0.0021781264804303646, "grad/layer_4/mlp": 0.0025647347792983055, "grad/layer_4/attn_mlp_ratio": 0.8492598973919229, "grad/layer_8/attn": 0.0041051763109862804, "grad/layer_8/mlp": 0.0038968068547546864, "grad/layer_8/attn_mlp_ratio": 1.0534718190177095, "grad/layer_12/attn": 0.004965102765709162, "grad/layer_12/mlp": 0.005995489656925201, "grad/layer_12/attn_mlp_ratio": 0.8281396461355185, "grad/layer_16/attn": 0.004837492946535349, "grad/layer_16/mlp": 0.0049574729055166245, "grad/layer_16/attn_mlp_ratio": 0.9757981417452475, "grad/layer_20/attn": 0.00618331553414464, "grad/layer_20/mlp": 0.0069371978752315044, "grad/layer_20/attn_mlp_ratio": 0.8913275296771812, "grad/layer_24/attn": 0.014684650115668774, "grad/layer_24/mlp": 0.01078416220843792, "grad/layer_24/attn_mlp_ratio": 1.361686674928749, "grad/layer_27/attn": 0.015887867659330368, "grad/layer_27/mlp": 0.01079286728054285, "grad/layer_27/attn_mlp_ratio": 1.4720710538862616} {"step": 20650, "timestamp": 1778216855.593397, "train/loss": 2.214580035209656, "train/z_loss": 0.0015629674773663283, "train/perplexity": 9.157562450139716, "train/grad_norm": 0.232421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019949.1573904268, "perf/iters_per_sec": 0.963186815924848, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.038220191001892, "data/tokens_consumed": 43308285952, "data/tokens_consumed_B": 43.308285952, "train/loss_slope": -2.593422702865987e-05} {"step": 20660, "timestamp": 1778216865.9486394, "train/loss": 2.258077025413513, "train/z_loss": 0.0015716525726020337, "train/perplexity": 9.564678836808161, "train/grad_norm": 0.1201171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026151.7828260988, "perf/iters_per_sec": 0.9661444582109923, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035041904449463, "data/tokens_consumed": 43329257472, "data/tokens_consumed_B": 43.329257472, "train/loss_slope": -2.087511783099695e-05} {"step": 20670, "timestamp": 1778216876.303444, "train/loss": 2.2203481674194334, "train/z_loss": 0.001575617433991283, "train/perplexity": 9.210537116633331, "train/grad_norm": 0.10546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026367.2892415433, "perf/iters_per_sec": 0.9662472196777073, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349318265914917, "data/tokens_consumed": 43350228992, "data/tokens_consumed_B": 43.350228992, "train/loss_slope": -1.8838637303633625e-05} {"step": 20680, "timestamp": 1778216886.6583493, "train/loss": 2.2454824686050414, "train/z_loss": 0.0015538890846073627, "train/perplexity": 9.444971359512488, "train/grad_norm": 0.171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026373.264514776, "perf/iters_per_sec": 0.9662500689100151, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349287748336793, "data/tokens_consumed": 43371200512, "data/tokens_consumed_B": 43.371200512, "train/loss_slope": -1.5301758557012635e-05} {"step": 20690, "timestamp": 1778216897.020877, "train/loss": 2.2010023832321166, "train/z_loss": 0.0015687292558141052, "train/perplexity": 9.034064557191991, "train/grad_norm": 0.1328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024883.0127251823, "perf/iters_per_sec": 0.9655394614816581, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0356904506683349, "data/tokens_consumed": 43392172032, "data/tokens_consumed_B": 43.392172032, "train/loss_slope": -1.4755412633090721e-05} {"step": 20700, "timestamp": 1778216907.3644953, "grad/layer_0/attn": 0.002462213160470128, "grad/layer_0/mlp": 0.002528406446799636, "grad/layer_0/attn_mlp_ratio": 0.9738201174912741, "grad/layer_4/attn": 0.00198170798830688, "grad/layer_4/mlp": 0.0023366811219602823, "grad/layer_4/attn_mlp_ratio": 0.8480865809519327, "grad/layer_8/attn": 0.008638213388621807, "grad/layer_8/mlp": 0.0036517218686640263, "grad/layer_8/attn_mlp_ratio": 2.365517819469174, "grad/layer_12/attn": 0.0043169716373085976, "grad/layer_12/mlp": 0.006061627063900232, "grad/layer_12/attn_mlp_ratio": 0.7121803305584583, "grad/layer_16/attn": 0.00437419768422842, "grad/layer_16/mlp": 0.004354360979050398, "grad/layer_16/attn_mlp_ratio": 1.0045555719468144, "grad/layer_20/attn": 0.0036077057011425495, "grad/layer_20/mlp": 0.00581341190263629, "grad/layer_20/attn_mlp_ratio": 0.6205831789500749, "grad/layer_24/attn": 0.015019504353404045, "grad/layer_24/mlp": 0.009223361499607563, "grad/layer_24/attn_mlp_ratio": 1.6284197676954462, "grad/layer_27/attn": 0.004591156728565693, "grad/layer_27/mlp": 0.008168299682438374, "grad/layer_27/attn_mlp_ratio": 0.5620700575212185} {"step": 20700, "timestamp": 1778216907.9857135, "eos/sharpness": 39.68307971954345, "eos/L0_probe": 2.049915075302124, "eos/L_plus": 2.284458875656128, "eos/L_minus": 2.2122020721435547, "eos/grad_norm": 0.1342913955450058, "eos/embed_grad_frac": 0.12171367555856705, "eos/time_s": 0.6184663772583008} {"step": 20700, "timestamp": 1778216908.0076523, "train/loss": 2.255820798873901, "train/z_loss": 0.001562908652704209, "train/perplexity": 9.543123081052034, "train/grad_norm": 0.134765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1909756.5386232885, "perf/iters_per_sec": 0.9106428807369654, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0981253147125245, "data/tokens_consumed": 43413143552, "data/tokens_consumed_B": 43.413143552, "train/loss_slope": -1.1082466522065755e-05} {"step": 20700, "timestamp": 1778216909.3742673, "geo/rankme_last": 440.76397705078125, "geo/layer_0/stable_rank_q_proj": 17.142526626586914, "geo/layer_0/stable_rank_k_proj": 15.15228271484375, "geo/layer_0/stable_rank_o_proj": 52.23185729980469, "geo/layer_0/stable_rank_gate_proj": 152.7083740234375, "geo/layer_0/stable_rank_down_proj": 50.00271224975586, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.050251998007297516, "geo/layer_0/attn_entropy_mean": 6.259310245513916, "geo/layer_0/attn_entropy_std": 0.3144388794898987, "geo/layer_7/stable_rank_q_proj": 42.36243438720703, "geo/layer_7/stable_rank_k_proj": 42.196067810058594, "geo/layer_7/stable_rank_o_proj": 110.81745910644531, "geo/layer_7/stable_rank_gate_proj": 104.10653686523438, "geo/layer_7/stable_rank_down_proj": 151.09573364257812, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.560905396938324, "geo/layer_7/attn_entropy_mean": 4.646151065826416, "geo/layer_7/attn_entropy_std": 0.8570095300674438, "geo/layer_14/stable_rank_q_proj": 58.37431716918945, "geo/layer_14/stable_rank_k_proj": 34.94015121459961, "geo/layer_14/stable_rank_o_proj": 53.64385223388672, "geo/layer_14/stable_rank_gate_proj": 87.14620208740234, "geo/layer_14/stable_rank_down_proj": 136.859619140625, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38378995656967163, "geo/layer_14/attn_entropy_mean": 5.497370719909668, "geo/layer_14/attn_entropy_std": 0.49702462553977966, "geo/layer_21/stable_rank_q_proj": 47.90060043334961, "geo/layer_21/stable_rank_k_proj": 32.05723190307617, "geo/layer_21/stable_rank_o_proj": 84.20513916015625, "geo/layer_21/stable_rank_gate_proj": 87.14240264892578, "geo/layer_21/stable_rank_down_proj": 60.94704818725586, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15655285120010376, "geo/layer_21/attn_entropy_mean": 5.74592399597168, "geo/layer_21/attn_entropy_std": 0.2955261170864105, "geo/layer_27/stable_rank_q_proj": 41.31230926513672, "geo/layer_27/stable_rank_k_proj": 31.521984100341797, "geo/layer_27/stable_rank_o_proj": 117.67826080322266, "geo/layer_27/stable_rank_gate_proj": 91.46898651123047, "geo/layer_27/stable_rank_down_proj": 138.4775390625, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08106739819049835, "geo/layer_27/attn_entropy_mean": 4.374842643737793, "geo/layer_27/attn_entropy_std": 0.5693808197975159, "attnres/final_alpha/block_0": 0.24279940128326416, "attnres/block_norm/0": 1.615500807762146, "attnres/final_alpha/block_1": 0.00682454276829958, "attnres/block_norm/1": 29233.34765625, "attnres/final_alpha/block_2": 0.014294466003775597, "attnres/block_norm/2": 20855.99609375, "attnres/final_alpha/block_3": 0.01634799689054489, "attnres/block_norm/3": 29858.7734375, "attnres/final_alpha/block_4": 0.02120123989880085, "attnres/block_norm/4": 9340.361328125, "attnres/final_alpha/block_5": 0.5631659030914307, "attnres/block_norm/5": 5030.1884765625, "attnres/final_alpha/block_6": 0.13536641001701355, "attnres/block_norm/6": 20080.52734375, "geo/tier1_time_s": 1.3616738319396973, "geo/step": 20700.0, "geo/rankme_slope": -6.1022358162014796e-05} {"step": 20710, "timestamp": 1778216919.7279415, "train/loss": 2.1963565826416014, "train/z_loss": 0.0015632624039426447, "train/perplexity": 8.992191437134265, "train/grad_norm": 0.126953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1789987.8876341493, "perf/iters_per_sec": 0.8535327375574824, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1716012239456177, "data/tokens_consumed": 43434115072, "data/tokens_consumed_B": 43.434115072, "train/loss_slope": -1.25421390424718e-05} {"step": 20720, "timestamp": 1778216930.0842972, "train/loss": 2.1806883811950684, "train/z_loss": 0.0015875144978053867, "train/perplexity": 8.85239798606992, "train/grad_norm": 0.10107421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026497.7262636465, "perf/iters_per_sec": 0.9663094168966515, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348652124404907, "data/tokens_consumed": 43455086592, "data/tokens_consumed_B": 43.455086592, "train/loss_slope": -1.8992276310455812e-05} {"step": 20730, "timestamp": 1778216940.4423714, "train/loss": 2.193217325210571, "train/z_loss": 0.0015830729273147881, "train/perplexity": 8.96400689575588, "train/grad_norm": 0.2412109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025717.1286845766, "perf/iters_per_sec": 0.9659371989653476, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035263991355896, "data/tokens_consumed": 43476058112, "data/tokens_consumed_B": 43.476058112, "train/loss_slope": -1.9797915479566812e-05} {"step": 20740, "timestamp": 1778216950.7979152, "train/loss": 2.193677806854248, "train/z_loss": 0.0015691444044932724, "train/perplexity": 8.968135606910062, "train/grad_norm": 0.12451171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026083.7843601669, "perf/iters_per_sec": 0.9661120340157351, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035076642036438, "data/tokens_consumed": 43497029632, "data/tokens_consumed_B": 43.497029632, "train/loss_slope": -1.9191431748842058e-05} {"step": 20750, "timestamp": 1778216961.1402726, "grad/layer_0/attn": 0.002851101104170084, "grad/layer_0/mlp": 0.002733855741098523, "grad/layer_0/attn_mlp_ratio": 1.0428864102155617, "grad/layer_4/attn": 0.0034475214779376984, "grad/layer_4/mlp": 0.0024629414547234774, "grad/layer_4/attn_mlp_ratio": 1.399757729259135, "grad/layer_8/attn": 0.007896666415035725, "grad/layer_8/mlp": 0.0036489558406174183, "grad/layer_8/attn_mlp_ratio": 2.1640892747254097, "grad/layer_12/attn": 0.00420211860910058, "grad/layer_12/mlp": 0.006583505310118198, "grad/layer_12/attn_mlp_ratio": 0.6382798140702275, "grad/layer_16/attn": 0.004455409944057465, "grad/layer_16/mlp": 0.0048973034135997295, "grad/layer_16/attn_mlp_ratio": 0.9097679838884534, "grad/layer_20/attn": 0.004231388680636883, "grad/layer_20/mlp": 0.007342091761529446, "grad/layer_20/attn_mlp_ratio": 0.5763192235183277, "grad/layer_24/attn": 0.02553032711148262, "grad/layer_24/mlp": 0.016493555158376694, "grad/layer_24/attn_mlp_ratio": 1.5478971459786608, "grad/layer_27/attn": 0.014077016152441502, "grad/layer_27/mlp": 0.016290683299303055, "grad/layer_27/attn_mlp_ratio": 0.8641145253024647} {"step": 20750, "timestamp": 1778216961.156756, "train/loss": 2.1960965394973755, "train/z_loss": 0.0015597564866766333, "train/perplexity": 8.989853383410068, "train/grad_norm": 0.30078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025802.271676926, "perf/iters_per_sec": 0.9659777983078603, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03522047996521, "data/tokens_consumed": 43518001152, "data/tokens_consumed_B": 43.518001152, "train/loss_slope": -1.8304257188299767e-05} {"step": 20760, "timestamp": 1778216971.5119917, "train/loss": 2.232390284538269, "train/z_loss": 0.0015474668238312007, "train/perplexity": 9.322121993968965, "train/grad_norm": 0.1337890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026412.4314360607, "perf/iters_per_sec": 0.9662687451534561, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349087715148926, "data/tokens_consumed": 43538972672, "data/tokens_consumed_B": 43.538972672, "train/loss_slope": -1.746190702310747e-05} {"step": 20770, "timestamp": 1778216981.9057326, "train/loss": 2.1995460033416747, "train/z_loss": 0.0015612418297678231, "train/perplexity": 9.02091710340945, "train/grad_norm": 0.1337890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2018772.97224088, "perf/iters_per_sec": 0.9626259671406173, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0388250827789307, "data/tokens_consumed": 43559944192, "data/tokens_consumed_B": 43.559944192, "train/loss_slope": -1.9646334040104213e-05} {"step": 20775, "timestamp": 1778216987.6844485, "eos/sharpness": 41.54162406921386, "eos/L0_probe": 2.0478341579437256, "eos/L_plus": 2.275766611099243, "eos/L_minus": 2.2353179454803467, "eos/grad_norm": 0.1685236096382141, "eos/embed_grad_frac": 0.09413856267929077, "eos/time_s": 0.6127772331237793} {"step": 20775, "timestamp": 1778216989.0641294, "geo/rankme_last": 440.97064208984375, "geo/layer_0/stable_rank_q_proj": 17.166601181030273, "geo/layer_0/stable_rank_k_proj": 15.213696479797363, "geo/layer_0/stable_rank_o_proj": 52.233680725097656, "geo/layer_0/stable_rank_gate_proj": 152.09364318847656, "geo/layer_0/stable_rank_down_proj": 50.12245559692383, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.052807994186878204, "geo/layer_0/attn_entropy_mean": 6.2614850997924805, "geo/layer_0/attn_entropy_std": 0.309459924697876, "geo/layer_7/stable_rank_q_proj": 42.30551528930664, "geo/layer_7/stable_rank_k_proj": 42.09138488769531, "geo/layer_7/stable_rank_o_proj": 110.73651885986328, "geo/layer_7/stable_rank_gate_proj": 103.7817611694336, "geo/layer_7/stable_rank_down_proj": 151.56849670410156, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.541323184967041, "geo/layer_7/attn_entropy_mean": 4.665590763092041, "geo/layer_7/attn_entropy_std": 0.8574604988098145, "geo/layer_14/stable_rank_q_proj": 58.34273910522461, "geo/layer_14/stable_rank_k_proj": 34.78334426879883, "geo/layer_14/stable_rank_o_proj": 53.43225860595703, "geo/layer_14/stable_rank_gate_proj": 87.25379943847656, "geo/layer_14/stable_rank_down_proj": 136.6510467529297, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3769833743572235, "geo/layer_14/attn_entropy_mean": 5.499990463256836, "geo/layer_14/attn_entropy_std": 0.4791330397129059, "geo/layer_21/stable_rank_q_proj": 47.8213005065918, "geo/layer_21/stable_rank_k_proj": 32.12857437133789, "geo/layer_21/stable_rank_o_proj": 84.0857925415039, "geo/layer_21/stable_rank_gate_proj": 87.18965911865234, "geo/layer_21/stable_rank_down_proj": 60.99525451660156, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1539178192615509, "geo/layer_21/attn_entropy_mean": 5.772987365722656, "geo/layer_21/attn_entropy_std": 0.2855702340602875, "geo/layer_27/stable_rank_q_proj": 41.384979248046875, "geo/layer_27/stable_rank_k_proj": 31.499557495117188, "geo/layer_27/stable_rank_o_proj": 117.39443969726562, "geo/layer_27/stable_rank_gate_proj": 91.38924407958984, "geo/layer_27/stable_rank_down_proj": 138.61318969726562, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07888511568307877, "geo/layer_27/attn_entropy_mean": 4.401190757751465, "geo/layer_27/attn_entropy_std": 0.5349633693695068, "attnres/final_alpha/block_0": 0.2416432499885559, "attnres/block_norm/0": 1.6163299083709717, "attnres/final_alpha/block_1": 0.006750999018549919, "attnres/block_norm/1": 29292.16796875, "attnres/final_alpha/block_2": 0.01407855935394764, "attnres/block_norm/2": 20978.3125, "attnres/final_alpha/block_3": 0.015792690217494965, "attnres/block_norm/3": 30051.84375, "attnres/final_alpha/block_4": 0.02099902182817459, "attnres/block_norm/4": 9332.978515625, "attnres/final_alpha/block_5": 0.5652529001235962, "attnres/block_norm/5": 5051.615234375, "attnres/final_alpha/block_6": 0.1354825496673584, "attnres/block_norm/6": 20003.1875, "geo/tier1_time_s": 1.3596446514129639, "geo/step": 20775.0, "geo/rankme_slope": -4.574581785839336e-05} {"step": 20780, "timestamp": 1778216994.241481, "train/loss": 2.20514178276062, "train/z_loss": 0.0015689941821619869, "train/perplexity": 9.07153766433422, "train/grad_norm": 0.09521484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1700952.345761357, "perf/iters_per_sec": 0.8110772827917848, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2329281330108643, "data/tokens_consumed": 43580915712, "data/tokens_consumed_B": 43.580915712, "train/loss_slope": -2.2137684111047286e-05} {"step": 20790, "timestamp": 1778217004.5876882, "train/loss": 2.262604069709778, "train/z_loss": 0.001556021743454039, "train/perplexity": 9.608076719533875, "train/grad_norm": 0.1259765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027915.7827673294, "perf/iters_per_sec": 0.9669855989300391, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341415643692016, "data/tokens_consumed": 43601887232, "data/tokens_consumed_B": 43.601887232, "train/loss_slope": -2.029584715254063e-05} {"step": 20800, "timestamp": 1778217014.9313755, "grad/layer_0/attn": 0.0035578319802880287, "grad/layer_0/mlp": 0.002923260210081935, "grad/layer_0/attn_mlp_ratio": 1.2170766893449552, "grad/layer_4/attn": 0.0022171293385326862, "grad/layer_4/mlp": 0.0025132049340754747, "grad/layer_4/attn_mlp_ratio": 0.8821919853221586, "grad/layer_8/attn": 0.005538599565625191, "grad/layer_8/mlp": 0.0038399468176066875, "grad/layer_8/attn_mlp_ratio": 1.4423635754520319, "grad/layer_12/attn": 0.004502454306930304, "grad/layer_12/mlp": 0.006142294965684414, "grad/layer_12/attn_mlp_ratio": 0.7330247503224776, "grad/layer_16/attn": 0.0042062741704285145, "grad/layer_16/mlp": 0.004580749198794365, "grad/layer_16/attn_mlp_ratio": 0.9182502459882687, "grad/layer_20/attn": 0.004066034220159054, "grad/layer_20/mlp": 0.005897109862416983, "grad/layer_20/attn_mlp_ratio": 0.6894960830088632, "grad/layer_24/attn": 0.01742379553616047, "grad/layer_24/mlp": 0.011068650521337986, "grad/layer_24/attn_mlp_ratio": 1.5741571517824517, "grad/layer_27/attn": 0.006530574522912502, "grad/layer_27/mlp": 0.011174509301781654, "grad/layer_27/attn_mlp_ratio": 0.5844171129223152} {"step": 20800, "timestamp": 1778217014.9472966, "train/loss": 2.234626793861389, "train/z_loss": 0.00155826632399112, "train/perplexity": 9.342994338616196, "train/grad_norm": 0.1875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025410.346699795, "perf/iters_per_sec": 0.9657909139155364, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035420799255371, "data/tokens_consumed": 43622858752, "data/tokens_consumed_B": 43.622858752, "train/loss_slope": -1.9300929369098157e-05} {"step": 20810, "timestamp": 1778217025.2994885, "train/loss": 2.1818952798843383, "train/z_loss": 0.0015638347598724068, "train/perplexity": 8.863088383411835, "train/grad_norm": 0.146484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026925.9428886243, "perf/iters_per_sec": 0.9665136064952012, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346465826034545, "data/tokens_consumed": 43643830272, "data/tokens_consumed_B": 43.643830272, "train/loss_slope": -2.16864464413895e-05} {"step": 20820, "timestamp": 1778217035.647933, "train/loss": 2.288261890411377, "train/z_loss": 0.001538766745943576, "train/perplexity": 9.857788864880746, "train/grad_norm": 0.1435546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027519.6747446582, "perf/iters_per_sec": 0.9667967199061671, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343436002731323, "data/tokens_consumed": 43664801792, "data/tokens_consumed_B": 43.664801792, "train/loss_slope": -1.43080724669834e-05} {"step": 20830, "timestamp": 1778217045.998791, "train/loss": 2.2551956176757812, "train/z_loss": 0.0015465375850908457, "train/perplexity": 9.537158764513958, "train/grad_norm": 0.1201171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027346.2100281264, "perf/iters_per_sec": 0.9667140054836876, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344321012496949, "data/tokens_consumed": 43685773312, "data/tokens_consumed_B": 43.685773312, "train/loss_slope": -1.4651986238586706e-05} {"step": 20840, "timestamp": 1778217056.3487496, "train/loss": 2.1698125839233398, "train/z_loss": 0.0015625570551492274, "train/perplexity": 8.756642751314459, "train/grad_norm": 0.1025390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027411.9100357695, "perf/iters_per_sec": 0.966745333688626, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034398579597473, "data/tokens_consumed": 43706744832, "data/tokens_consumed_B": 43.706744832, "train/loss_slope": -1.5738898327927973e-05} {"step": 20850, "timestamp": 1778217066.7144573, "grad/layer_0/attn": 0.0028496249578893185, "grad/layer_0/mlp": 0.0026995306834578514, "grad/layer_0/attn_mlp_ratio": 1.0556000973766302, "grad/layer_4/attn": 0.002505120588466525, "grad/layer_4/mlp": 0.0024309305008500814, "grad/layer_4/attn_mlp_ratio": 1.0305191713784425, "grad/layer_8/attn": 0.005200235638767481, "grad/layer_8/mlp": 0.0038227965123951435, "grad/layer_8/attn_mlp_ratio": 1.3603223414779877, "grad/layer_12/attn": 0.005124836228787899, "grad/layer_12/mlp": 0.005701364018023014, "grad/layer_12/attn_mlp_ratio": 0.8988789564566472, "grad/layer_16/attn": 0.004134598653763533, "grad/layer_16/mlp": 0.004631554242223501, "grad/layer_16/attn_mlp_ratio": 0.8927021790655724, "grad/layer_20/attn": 0.004646081477403641, "grad/layer_20/mlp": 0.006883460562676191, "grad/layer_20/attn_mlp_ratio": 0.6749630316907064, "grad/layer_24/attn": 0.020235033705830574, "grad/layer_24/mlp": 0.013125648722052574, "grad/layer_24/attn_mlp_ratio": 1.5416406442196922, "grad/layer_27/attn": 0.008226818405091763, "grad/layer_27/mlp": 0.012488698586821556, "grad/layer_27/attn_mlp_ratio": 0.658741043514241} {"step": 20850, "timestamp": 1778217067.3203223, "eos/sharpness": 64.84885215759276, "eos/L0_probe": 2.049185276031494, "eos/L_plus": 2.296491861343384, "eos/L_minus": 2.4503672122955322, "eos/grad_norm": 0.24185128509998322, "eos/embed_grad_frac": 0.04371428117156029, "eos/time_s": 0.6025562286376953} {"step": 20850, "timestamp": 1778217067.341308, "train/loss": 2.1906481742858888, "train/z_loss": 0.0015642113867215812, "train/perplexity": 8.941006567459352, "train/grad_norm": 0.2421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1908916.9398588145, "perf/iters_per_sec": 0.9102425288480828, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0986083030700684, "data/tokens_consumed": 43727716352, "data/tokens_consumed_B": 43.727716352, "train/loss_slope": -1.589844075903112e-05} {"step": 20850, "timestamp": 1778217068.7062364, "geo/rankme_last": 440.521728515625, "geo/layer_0/stable_rank_q_proj": 17.13896942138672, "geo/layer_0/stable_rank_k_proj": 15.140267372131348, "geo/layer_0/stable_rank_o_proj": 52.24513244628906, "geo/layer_0/stable_rank_gate_proj": 152.1903839111328, "geo/layer_0/stable_rank_down_proj": 50.17922592163086, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.04807355999946594, "geo/layer_0/attn_entropy_mean": 6.258152008056641, "geo/layer_0/attn_entropy_std": 0.3121219873428345, "geo/layer_7/stable_rank_q_proj": 42.16310119628906, "geo/layer_7/stable_rank_k_proj": 42.010135650634766, "geo/layer_7/stable_rank_o_proj": 110.91671752929688, "geo/layer_7/stable_rank_gate_proj": 103.90754699707031, "geo/layer_7/stable_rank_down_proj": 151.7227325439453, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5476017594337463, "geo/layer_7/attn_entropy_mean": 4.656993865966797, "geo/layer_7/attn_entropy_std": 0.866523265838623, "geo/layer_14/stable_rank_q_proj": 58.36749267578125, "geo/layer_14/stable_rank_k_proj": 34.832313537597656, "geo/layer_14/stable_rank_o_proj": 53.47890853881836, "geo/layer_14/stable_rank_gate_proj": 87.16199493408203, "geo/layer_14/stable_rank_down_proj": 136.5040283203125, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38141268491744995, "geo/layer_14/attn_entropy_mean": 5.514288902282715, "geo/layer_14/attn_entropy_std": 0.47092559933662415, "geo/layer_21/stable_rank_q_proj": 47.81629180908203, "geo/layer_21/stable_rank_k_proj": 32.07691955566406, "geo/layer_21/stable_rank_o_proj": 84.10993957519531, "geo/layer_21/stable_rank_gate_proj": 87.18182373046875, "geo/layer_21/stable_rank_down_proj": 60.92238998413086, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15178066492080688, "geo/layer_21/attn_entropy_mean": 5.739787578582764, "geo/layer_21/attn_entropy_std": 0.2909512221813202, "geo/layer_27/stable_rank_q_proj": 41.515228271484375, "geo/layer_27/stable_rank_k_proj": 31.513822555541992, "geo/layer_27/stable_rank_o_proj": 117.41547393798828, "geo/layer_27/stable_rank_gate_proj": 91.55339050292969, "geo/layer_27/stable_rank_down_proj": 138.63340759277344, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07768657058477402, "geo/layer_27/attn_entropy_mean": 4.373259544372559, "geo/layer_27/attn_entropy_std": 0.5593825578689575, "attnres/final_alpha/block_0": 0.24365732073783875, "attnres/block_norm/0": 1.6172810792922974, "attnres/final_alpha/block_1": 0.006808842997997999, "attnres/block_norm/1": 29281.24609375, "attnres/final_alpha/block_2": 0.014482932165265083, "attnres/block_norm/2": 21112.32421875, "attnres/final_alpha/block_3": 0.01651587150990963, "attnres/block_norm/3": 29803.53125, "attnres/final_alpha/block_4": 0.021301405504345894, "attnres/block_norm/4": 9357.22265625, "attnres/final_alpha/block_5": 0.5596587061882019, "attnres/block_norm/5": 5091.12841796875, "attnres/final_alpha/block_6": 0.1375749558210373, "attnres/block_norm/6": 20183.07421875, "geo/tier1_time_s": 1.361715316772461, "geo/step": 20850.0, "geo/rankme_slope": -4.2540590454931973e-05} {"step": 20860, "timestamp": 1778217079.0866761, "train/loss": 2.234309768676758, "train/z_loss": 0.0015463638585060835, "train/perplexity": 9.340032843570048, "train/grad_norm": 0.15234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1786031.2920516478, "perf/iters_per_sec": 0.8516460857637633, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1741966724395752, "data/tokens_consumed": 43748687872, "data/tokens_consumed_B": 43.748687872, "train/loss_slope": -1.5106747152567187e-05} {"step": 20870, "timestamp": 1778217089.4303634, "train/loss": 2.196571969985962, "train/z_loss": 0.0015630231355316937, "train/perplexity": 8.994128449964427, "train/grad_norm": 0.16796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028775.7490073899, "perf/iters_per_sec": 0.9673956627881002, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0337032079696655, "data/tokens_consumed": 43769659392, "data/tokens_consumed_B": 43.769659392, "train/loss_slope": -1.8469693043408712e-05} {"step": 20880, "timestamp": 1778217099.8203554, "train/loss": 2.2686036348342897, "train/z_loss": 0.001554839895106852, "train/perplexity": 9.665894268180846, "train/grad_norm": 0.30078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019832.5022259026, "perf/iters_per_sec": 0.9631311904077066, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.038280153274536, "data/tokens_consumed": 43790630912, "data/tokens_consumed_B": 43.790630912, "train/loss_slope": -1.5370265778714158e-05} {"step": 20890, "timestamp": 1778217110.1982071, "train/loss": 2.2189261436462404, "train/z_loss": 0.001563924946822226, "train/perplexity": 9.197448822028184, "train/grad_norm": 0.224609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021865.9460527003, "perf/iters_per_sec": 0.9641008119834424, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037235927581787, "data/tokens_consumed": 43811602432, "data/tokens_consumed_B": 43.811602432, "train/loss_slope": -1.0470592313938783e-05} {"step": 20900, "timestamp": 1778217120.564425, "grad/layer_0/attn": 0.002799514215439558, "grad/layer_0/mlp": 0.0026054352056235075, "grad/layer_0/attn_mlp_ratio": 1.074490013011321, "grad/layer_4/attn": 0.001838213880546391, "grad/layer_4/mlp": 0.002485301112756133, "grad/layer_4/attn_mlp_ratio": 0.7396342427676444, "grad/layer_8/attn": 0.008560690097510815, "grad/layer_8/mlp": 0.003883643075823784, "grad/layer_8/attn_mlp_ratio": 2.204293677339436, "grad/layer_12/attn": 0.004706572275608778, "grad/layer_12/mlp": 0.005732744932174683, "grad/layer_12/attn_mlp_ratio": 0.8209980121553336, "grad/layer_16/attn": 0.003659977810457349, "grad/layer_16/mlp": 0.004252301994711161, "grad/layer_16/attn_mlp_ratio": 0.860705031989491, "grad/layer_20/attn": 0.005207745358347893, "grad/layer_20/mlp": 0.006190802901983261, "grad/layer_20/attn_mlp_ratio": 0.8412067637557777, "grad/layer_24/attn": 0.010274679400026798, "grad/layer_24/mlp": 0.010091603733599186, "grad/layer_24/attn_mlp_ratio": 1.0181413746958712, "grad/layer_27/attn": 0.009512774646282196, "grad/layer_27/mlp": 0.00780748575925827, "grad/layer_27/attn_mlp_ratio": 1.218417147051475} {"step": 20900, "timestamp": 1778217120.5812125, "train/loss": 2.2432246923446657, "train/z_loss": 0.001569512952119112, "train/perplexity": 9.423670782414108, "train/grad_norm": 0.158203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020910.138412041, "perf/iters_per_sec": 0.9636450473842816, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0377264976501466, "data/tokens_consumed": 43832573952, "data/tokens_consumed_B": 43.832573952, "train/loss_slope": -1.113464693532421e-05} {"step": 20910, "timestamp": 1778217130.9573052, "train/loss": 2.2219621658325197, "train/z_loss": 0.0015593930147588253, "train/perplexity": 9.225414912062647, "train/grad_norm": 0.10107421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022206.4757208356, "perf/iters_per_sec": 0.96426318918268, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0370612621307373, "data/tokens_consumed": 43853545472, "data/tokens_consumed_B": 43.853545472, "train/loss_slope": -1.3771508951069837e-05} {"step": 20920, "timestamp": 1778217141.3395815, "train/loss": 2.2206417322158813, "train/z_loss": 0.0015599140082485975, "train/perplexity": 9.21324140300937, "train/grad_norm": 0.1845703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020847.8304828943, "perf/iters_per_sec": 0.9636153366484138, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0377584934234618, "data/tokens_consumed": 43874516992, "data/tokens_consumed_B": 43.874516992, "train/loss_slope": -1.536956897126711e-05} {"step": 20925, "timestamp": 1778217147.1230195, "eos/sharpness": 34.665155410766594, "eos/L0_probe": 2.0478506088256836, "eos/L_plus": 2.273162603378296, "eos/L_minus": 2.1691901683807373, "eos/grad_norm": 0.13140347599983215, "eos/embed_grad_frac": 0.14515253901481628, "eos/time_s": 0.6060600280761719} {"step": 20925, "timestamp": 1778217148.5002866, "geo/rankme_last": 440.2890625, "geo/layer_0/stable_rank_q_proj": 17.156320571899414, "geo/layer_0/stable_rank_k_proj": 15.148958206176758, "geo/layer_0/stable_rank_o_proj": 52.14860916137695, "geo/layer_0/stable_rank_gate_proj": 152.47457885742188, "geo/layer_0/stable_rank_down_proj": 50.09686279296875, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05255495756864548, "geo/layer_0/attn_entropy_mean": 6.250058650970459, "geo/layer_0/attn_entropy_std": 0.31730592250823975, "geo/layer_7/stable_rank_q_proj": 42.22587966918945, "geo/layer_7/stable_rank_k_proj": 41.934837341308594, "geo/layer_7/stable_rank_o_proj": 111.13282775878906, "geo/layer_7/stable_rank_gate_proj": 103.68533325195312, "geo/layer_7/stable_rank_down_proj": 151.64309692382812, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5500330328941345, "geo/layer_7/attn_entropy_mean": 4.649542808532715, "geo/layer_7/attn_entropy_std": 0.8588207960128784, "geo/layer_14/stable_rank_q_proj": 58.31321716308594, "geo/layer_14/stable_rank_k_proj": 34.922828674316406, "geo/layer_14/stable_rank_o_proj": 53.65654373168945, "geo/layer_14/stable_rank_gate_proj": 87.15443420410156, "geo/layer_14/stable_rank_down_proj": 136.23098754882812, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38371482491493225, "geo/layer_14/attn_entropy_mean": 5.481461048126221, "geo/layer_14/attn_entropy_std": 0.4796558618545532, "geo/layer_21/stable_rank_q_proj": 47.75959014892578, "geo/layer_21/stable_rank_k_proj": 32.064697265625, "geo/layer_21/stable_rank_o_proj": 84.05970764160156, "geo/layer_21/stable_rank_gate_proj": 87.1941146850586, "geo/layer_21/stable_rank_down_proj": 60.79157638549805, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15260429680347443, "geo/layer_21/attn_entropy_mean": 5.768171787261963, "geo/layer_21/attn_entropy_std": 0.2828766703605652, "geo/layer_27/stable_rank_q_proj": 41.49396514892578, "geo/layer_27/stable_rank_k_proj": 31.44076156616211, "geo/layer_27/stable_rank_o_proj": 117.31765747070312, "geo/layer_27/stable_rank_gate_proj": 91.46867370605469, "geo/layer_27/stable_rank_down_proj": 138.34564208984375, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0774918720126152, "geo/layer_27/attn_entropy_mean": 4.380316734313965, "geo/layer_27/attn_entropy_std": 0.5624903440475464, "attnres/final_alpha/block_0": 0.23966464400291443, "attnres/block_norm/0": 1.6181296110153198, "attnres/final_alpha/block_1": 0.0065928976982831955, "attnres/block_norm/1": 29471.09375, "attnres/final_alpha/block_2": 0.014046220108866692, "attnres/block_norm/2": 21028.78125, "attnres/final_alpha/block_3": 0.01590532436966896, "attnres/block_norm/3": 29872.91015625, "attnres/final_alpha/block_4": 0.020656242966651917, "attnres/block_norm/4": 9374.890625, "attnres/final_alpha/block_5": 0.5696851015090942, "attnres/block_norm/5": 5044.470703125, "attnres/final_alpha/block_6": 0.13344955444335938, "attnres/block_norm/6": 20393.13671875, "geo/tier1_time_s": 1.356278657913208, "geo/step": 20925.0, "geo/rankme_slope": -7.285103494522809e-05} {"step": 20930, "timestamp": 1778217153.6911492, "train/loss": 2.22538480758667, "train/z_loss": 0.0015596100478433073, "train/perplexity": 9.25704429949427, "train/grad_norm": 0.2158203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1698615.9616898564, "perf/iters_per_sec": 0.8099632080506594, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2346239805221557, "data/tokens_consumed": 43895488512, "data/tokens_consumed_B": 43.895488512, "train/loss_slope": -1.2618589622996578e-05} {"step": 20940, "timestamp": 1778217164.0665588, "train/loss": 2.271248984336853, "train/z_loss": 0.0015362172503955661, "train/perplexity": 9.691497786972008, "train/grad_norm": 0.09375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022290.5799450073, "perf/iters_per_sec": 0.9643032932019269, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0370181322097778, "data/tokens_consumed": 43916460032, "data/tokens_consumed_B": 43.916460032, "train/loss_slope": -8.757938259970583e-06} {"step": 20950, "timestamp": 1778217174.45715, "grad/layer_0/attn": 0.003239464946091175, "grad/layer_0/mlp": 0.0028994944877922535, "grad/layer_0/attn_mlp_ratio": 1.1172515926500775, "grad/layer_4/attn": 0.005681417416781187, "grad/layer_4/mlp": 0.0027007751632481813, "grad/layer_4/attn_mlp_ratio": 2.103624649593477, "grad/layer_8/attn": 0.009153629653155804, "grad/layer_8/mlp": 0.0038611358031630516, "grad/layer_8/attn_mlp_ratio": 2.3707090044815904, "grad/layer_12/attn": 0.00697350362315774, "grad/layer_12/mlp": 0.006436379626393318, "grad/layer_12/attn_mlp_ratio": 1.0834512442704192, "grad/layer_16/attn": 0.003933475352823734, "grad/layer_16/mlp": 0.004528301302343607, "grad/layer_16/attn_mlp_ratio": 0.8686425666780877, "grad/layer_20/attn": 0.005216836463660002, "grad/layer_20/mlp": 0.005920820403844118, "grad/layer_20/attn_mlp_ratio": 0.881100256336592, "grad/layer_24/attn": 0.012541061267256737, "grad/layer_24/mlp": 0.010500536300241947, "grad/layer_24/attn_mlp_ratio": 1.194325774344992, "grad/layer_27/attn": 0.004731609020382166, "grad/layer_27/mlp": 0.009958365000784397, "grad/layer_27/attn_mlp_ratio": 0.475139139055011} {"step": 20950, "timestamp": 1778217174.4749744, "train/loss": 2.2284183740615844, "train/z_loss": 0.001567591237835586, "train/perplexity": 9.285168795934746, "train/grad_norm": 0.1337890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2016216.809094206, "perf/iters_per_sec": 0.9614070935698538, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0401421070098877, "data/tokens_consumed": 43937431552, "data/tokens_consumed_B": 43.937431552, "train/loss_slope": -6.897207262134788e-06} {"step": 20960, "timestamp": 1778217185.3725321, "train/loss": 2.235582208633423, "train/z_loss": 0.0015405119862407445, "train/perplexity": 9.351925039004534, "train/grad_norm": 0.125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1925516.9966578097, "perf/iters_per_sec": 0.9181580527581261, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0891371011734008, "data/tokens_consumed": 43958403072, "data/tokens_consumed_B": 43.958403072, "train/loss_slope": -6.0088220817206945e-06} {"step": 20970, "timestamp": 1778217195.75571, "train/loss": 2.2281418800354005, "train/z_loss": 0.0015398705028928817, "train/perplexity": 9.282601857118538, "train/grad_norm": 0.1025390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020963.256401832, "perf/iters_per_sec": 0.963670376015583, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0376992225646973, "data/tokens_consumed": 43979374592, "data/tokens_consumed_B": 43.979374592, "train/loss_slope": -3.58701185031304e-06} {"step": 20980, "timestamp": 1778217206.2016873, "train/loss": 2.2545618534088137, "train/z_loss": 0.0015343440813012422, "train/perplexity": 9.53111636901004, "train/grad_norm": 0.1220703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019774.2956422775, "perf/iters_per_sec": 0.963103435345782, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0383100748062133, "data/tokens_consumed": 44000346112, "data/tokens_consumed_B": 44.000346112, "train/loss_slope": -3.679566758193339e-06} {"step": 20990, "timestamp": 1778217217.1423998, "train/loss": 2.196390080451965, "train/z_loss": 0.0015599340782500803, "train/perplexity": 8.992492660902922, "train/grad_norm": 0.1962890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1918058.2230071921, "perf/iters_per_sec": 0.9146014323268853, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0933724403381349, "data/tokens_consumed": 44021317632, "data/tokens_consumed_B": 44.021317632, "train/loss_slope": -3.894768274835488e-06} {"step": 21000, "timestamp": 1778217227.5103502, "grad/layer_0/attn": 0.0025498727336525917, "grad/layer_0/mlp": 0.002522652270272374, "grad/layer_0/attn_mlp_ratio": 1.010790374330205, "grad/layer_4/attn": 0.0019047590903937817, "grad/layer_4/mlp": 0.0025464470963925123, "grad/layer_4/attn_mlp_ratio": 0.7480065139745312, "grad/layer_8/attn": 0.0038690525107085705, "grad/layer_8/mlp": 0.003726961789652705, "grad/layer_8/attn_mlp_ratio": 1.038125053398146, "grad/layer_12/attn": 0.003654786851257086, "grad/layer_12/mlp": 0.006275242660194635, "grad/layer_12/attn_mlp_ratio": 0.5824136198268333, "grad/layer_16/attn": 0.005760319530963898, "grad/layer_16/mlp": 0.004371474497020245, "grad/layer_16/attn_mlp_ratio": 1.317706280368263, "grad/layer_20/attn": 0.007689510006457567, "grad/layer_20/mlp": 0.005453311838209629, "grad/layer_20/attn_mlp_ratio": 1.4100623792634357, "grad/layer_24/attn": 0.010039198212325573, "grad/layer_24/mlp": 0.009894236922264099, "grad/layer_24/attn_mlp_ratio": 1.0146510731181475, "grad/layer_27/attn": 0.004745862912386656, "grad/layer_27/mlp": 0.008312540128827095, "grad/layer_27/attn_mlp_ratio": 0.5709281136382904} {"step": 21000, "timestamp": 1778217228.151585, "eos/sharpness": 19.121694564819332, "eos/L0_probe": 2.046487808227539, "eos/L_plus": 2.152963399887085, "eos/L_minus": 2.1312291622161865, "eos/grad_norm": 0.1258520781993866, "eos/embed_grad_frac": 0.15774093568325043, "eos/time_s": 0.638355016708374} {"step": 21000, "timestamp": 1778217228.171825, "train/loss": 2.210122537612915, "train/z_loss": 0.0015657681389711798, "train/perplexity": 9.116833479608283, "train/grad_norm": 0.1259765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1902537.845785841, "perf/iters_per_sec": 0.9072007397584158, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1022918701171875, "data/tokens_consumed": 44042289152, "data/tokens_consumed_B": 44.042289152, "train/loss_slope": -3.6281730547131913e-06} {"step": 21000, "timestamp": 1778217229.5376468, "geo/rankme_last": 440.84051513671875, "geo/layer_0/stable_rank_q_proj": 17.179710388183594, "geo/layer_0/stable_rank_k_proj": 15.210846900939941, "geo/layer_0/stable_rank_o_proj": 52.09104537963867, "geo/layer_0/stable_rank_gate_proj": 152.67811584472656, "geo/layer_0/stable_rank_down_proj": 50.04110336303711, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.050621189177036285, "geo/layer_0/attn_entropy_mean": 6.246425151824951, "geo/layer_0/attn_entropy_std": 0.32155919075012207, "geo/layer_7/stable_rank_q_proj": 42.15140914916992, "geo/layer_7/stable_rank_k_proj": 41.82981872558594, "geo/layer_7/stable_rank_o_proj": 111.01411437988281, "geo/layer_7/stable_rank_gate_proj": 103.78081512451172, "geo/layer_7/stable_rank_down_proj": 151.69818115234375, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5558237433433533, "geo/layer_7/attn_entropy_mean": 4.644136428833008, "geo/layer_7/attn_entropy_std": 0.8228611350059509, "geo/layer_14/stable_rank_q_proj": 58.27976608276367, "geo/layer_14/stable_rank_k_proj": 34.91750717163086, "geo/layer_14/stable_rank_o_proj": 53.59950256347656, "geo/layer_14/stable_rank_gate_proj": 87.16688537597656, "geo/layer_14/stable_rank_down_proj": 135.76904296875, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39072760939598083, "geo/layer_14/attn_entropy_mean": 5.500429630279541, "geo/layer_14/attn_entropy_std": 0.473196804523468, "geo/layer_21/stable_rank_q_proj": 47.68916320800781, "geo/layer_21/stable_rank_k_proj": 32.04087829589844, "geo/layer_21/stable_rank_o_proj": 83.9824447631836, "geo/layer_21/stable_rank_gate_proj": 87.2134780883789, "geo/layer_21/stable_rank_down_proj": 60.96995544433594, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15883906185626984, "geo/layer_21/attn_entropy_mean": 5.758114814758301, "geo/layer_21/attn_entropy_std": 0.29254066944122314, "geo/layer_27/stable_rank_q_proj": 41.436737060546875, "geo/layer_27/stable_rank_k_proj": 31.446199417114258, "geo/layer_27/stable_rank_o_proj": 117.81417083740234, "geo/layer_27/stable_rank_gate_proj": 91.54558563232422, "geo/layer_27/stable_rank_down_proj": 138.00576782226562, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07656221091747284, "geo/layer_27/attn_entropy_mean": 4.393231391906738, "geo/layer_27/attn_entropy_std": 0.5433290600776672, "attnres/final_alpha/block_0": 0.2413111925125122, "attnres/block_norm/0": 1.6187350749969482, "attnres/final_alpha/block_1": 0.006707142572849989, "attnres/block_norm/1": 29563.724609375, "attnres/final_alpha/block_2": 0.01409147959202528, "attnres/block_norm/2": 21110.064453125, "attnres/final_alpha/block_3": 0.015899885445833206, "attnres/block_norm/3": 29982.865234375, "attnres/final_alpha/block_4": 0.021115142852067947, "attnres/block_norm/4": 9403.169921875, "attnres/final_alpha/block_5": 0.5657746195793152, "attnres/block_norm/5": 5081.08056640625, "attnres/final_alpha/block_6": 0.1351005733013153, "attnres/block_norm/6": 20292.23046875, "geo/tier1_time_s": 1.3625123500823975, "geo/step": 21000.0, "geo/rankme_slope": -6.420677646058424e-05} {"step": 21000, "timestamp": 1778217236.4609964, "geo/ww_alpha_mean": 7.818910200456582, "geo/ww_alpha_std": 4.576749255162601, "geo/ww_alpha_min": 1.3461249196119218, "geo/ww_alpha_max": 34.926502567728825, "geo/ww_alpha_healthy_frac": 0.16243654822335024, "geo/ww_alpha_by_type/q_proj": 4.105466426742653, "geo/ww_alpha_by_type/k_proj": 4.658042980960512, "geo/ww_alpha_by_type/v_proj": 7.732884805129864, "geo/ww_alpha_by_type/o_proj": 8.244315438876438, "geo/ww_alpha_by_type/gate_proj": 8.951597294257844, "geo/ww_alpha_by_type/up_proj": 11.817062764520111, "geo/ww_alpha_by_type/down_proj": 9.346126831248723, "geo/twonn_id/layer_0": 0.7598922848701477, "geo/twonn_id/layer_7": 2.821007013320923, "geo/twonn_id/layer_14": 4.3168768882751465, "geo/twonn_id/layer_21": 7.754636764526367, "geo/twonn_id/layer_27": 5.942540168762207, "geo/tier2_time_s": 6.917245388031006} {"step": 21000, "timestamp": 1778217237.092134, "eoc/jacobian_sigma/layer_0/attn": 910.3474731445312, "eoc/jacobian_sigma/layer_0/mlp": 6435.7822265625, "eoc/jacobian_sigma/layer_0": 6435.7822265625, "eoc/jacobian_sigma/layer_7/attn": 1.1566020250320435, "eoc/jacobian_sigma/layer_7/mlp": 1.6578896045684814, "eoc/jacobian_sigma/layer_7": 1.6578896045684814, "eoc/jacobian_sigma/layer_14/attn": 1.6020910739898682, "eoc/jacobian_sigma/layer_14/mlp": 7.0216546058654785, "eoc/jacobian_sigma/layer_14": 7.0216546058654785, "eoc/jacobian_sigma/layer_21/attn": 1.079001784324646, "eoc/jacobian_sigma/layer_21/mlp": 3.7964069843292236, "eoc/jacobian_sigma/layer_21": 3.7964069843292236, "eoc/jacobian_sigma/layer_27/attn": 3.400853395462036, "eoc/jacobian_sigma/layer_27/mlp": 20.386104583740234, "eoc/jacobian_sigma/layer_27": 20.386104583740234, "eoc/layer0_sigma": 6435.7822265625, "eoc/sigma_max": 20.386104583740234, "eoc/sigma_min": 1.6578896045684814, "eoc/sigma_mean": 8.215513944625854, "eoc/time_s": 0.6236984729766846} {"step": 21010, "timestamp": 1778217247.4859025, "train/loss": 2.1972490787506103, "train/z_loss": 0.0015653692185878753, "train/perplexity": 9.000220515430977, "train/grad_norm": 0.275390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1085988.0225538907, "perf/iters_per_sec": 0.5178394425172285, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.9311004877090454, "data/tokens_consumed": 44063260672, "data/tokens_consumed_B": 44.063260672, "train/loss_slope": -5.122029470174222e-06} {"step": 21020, "timestamp": 1778217257.8712199, "train/loss": 2.25420401096344, "train/z_loss": 0.0015554523444734514, "train/perplexity": 9.527706341184148, "train/grad_norm": 0.1171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020497.0370624745, "perf/iters_per_sec": 0.9634480653107045, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037938666343689, "data/tokens_consumed": 44084232192, "data/tokens_consumed_B": 44.084232192, "train/loss_slope": -2.427540117293012e-06} {"step": 21030, "timestamp": 1778217268.2580395, "train/loss": 2.1725626945495606, "train/z_loss": 0.001565174583811313, "train/perplexity": 8.780757631690637, "train/grad_norm": 0.11279296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020424.2198727522, "perf/iters_per_sec": 0.963413343368889, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03797607421875, "data/tokens_consumed": 44105203712, "data/tokens_consumed_B": 44.105203712, "train/loss_slope": -2.657170342926956e-06} {"step": 21040, "timestamp": 1778217278.6405876, "train/loss": 2.1806361436843873, "train/z_loss": 0.0015576989040710031, "train/perplexity": 8.851935570913383, "train/grad_norm": 0.1796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021173.9450198247, "perf/iters_per_sec": 0.9637708401774524, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0375910520553588, "data/tokens_consumed": 44126175232, "data/tokens_consumed_B": 44.126175232, "train/loss_slope": -4.901346515114168e-06} {"step": 21050, "timestamp": 1778217289.0084217, "grad/layer_0/attn": 0.002557269763201475, "grad/layer_0/mlp": 0.0025327554903924465, "grad/layer_0/attn_mlp_ratio": 1.0096788544864015, "grad/layer_4/attn": 0.0015570102259516716, "grad/layer_4/mlp": 0.002515883417800069, "grad/layer_4/attn_mlp_ratio": 0.6188721436965204, "grad/layer_8/attn": 0.011178609915077686, "grad/layer_8/mlp": 0.003926781937479973, "grad/layer_8/attn_mlp_ratio": 2.8467609886113268, "grad/layer_12/attn": 0.0038200970739126205, "grad/layer_12/mlp": 0.005943527445197105, "grad/layer_12/attn_mlp_ratio": 0.6427322906915096, "grad/layer_16/attn": 0.004882108885794878, "grad/layer_16/mlp": 0.004309903364628553, "grad/layer_16/attn_mlp_ratio": 1.132765252368742, "grad/layer_20/attn": 0.008241665549576283, "grad/layer_20/mlp": 0.006426825188100338, "grad/layer_20/attn_mlp_ratio": 1.2823851871057135, "grad/layer_24/attn": 0.006517478264868259, "grad/layer_24/mlp": 0.008435806259512901, "grad/layer_24/attn_mlp_ratio": 0.7725969500851121, "grad/layer_27/attn": 0.004133366513997316, "grad/layer_27/mlp": 0.0073911570943892, "grad/layer_27/attn_mlp_ratio": 0.5592313091561699} {"step": 21050, "timestamp": 1778217289.025501, "train/loss": 2.214912533760071, "train/z_loss": 0.0015603634528815746, "train/perplexity": 9.160607832644299, "train/grad_norm": 0.0908203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020780.7446461536, "perf/iters_per_sec": 0.9635833476286667, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037792944908142, "data/tokens_consumed": 44147146752, "data/tokens_consumed_B": 44.147146752, "train/loss_slope": -3.5732362279654682e-06} {"step": 21060, "timestamp": 1778217299.4035792, "train/loss": 2.203507924079895, "train/z_loss": 0.0015588193899020552, "train/perplexity": 9.056728155390687, "train/grad_norm": 0.17578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021740.9373267398, "perf/iters_per_sec": 0.9640412031778048, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373000621795654, "data/tokens_consumed": 44168118272, "data/tokens_consumed_B": 44.168118272, "train/loss_slope": -5.633984891065723e-06} {"step": 21070, "timestamp": 1778217309.7875147, "train/loss": 2.2381691694259644, "train/z_loss": 0.0015584685956127942, "train/perplexity": 9.376149422670741, "train/grad_norm": 0.16796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021401.8651045198, "perf/iters_per_sec": 0.9638795209429358, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0374740600585937, "data/tokens_consumed": 44189089792, "data/tokens_consumed_B": 44.189089792, "train/loss_slope": -7.319972238274571e-06} {"step": 21075, "timestamp": 1778217315.5808458, "eos/sharpness": 42.11077690124511, "eos/L0_probe": 2.0428032875061035, "eos/L_plus": 2.2478439807891846, "eos/L_minus": 2.2588703632354736, "eos/grad_norm": 0.17366988956928253, "eos/embed_grad_frac": 0.07470081746578217, "eos/time_s": 0.6165080070495605} {"step": 21075, "timestamp": 1778217316.9626849, "geo/rankme_last": 440.4845275878906, "geo/layer_0/stable_rank_q_proj": 17.18964958190918, "geo/layer_0/stable_rank_k_proj": 15.196109771728516, "geo/layer_0/stable_rank_o_proj": 51.929931640625, "geo/layer_0/stable_rank_gate_proj": 152.51663208007812, "geo/layer_0/stable_rank_down_proj": 50.02305221557617, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.04890517517924309, "geo/layer_0/attn_entropy_mean": 6.250471115112305, "geo/layer_0/attn_entropy_std": 0.3231405019760132, "geo/layer_7/stable_rank_q_proj": 42.14514923095703, "geo/layer_7/stable_rank_k_proj": 42.0045166015625, "geo/layer_7/stable_rank_o_proj": 111.25820922851562, "geo/layer_7/stable_rank_gate_proj": 103.71073913574219, "geo/layer_7/stable_rank_down_proj": 151.7374267578125, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5499638915061951, "geo/layer_7/attn_entropy_mean": 4.6543049812316895, "geo/layer_7/attn_entropy_std": 0.8348206877708435, "geo/layer_14/stable_rank_q_proj": 58.067745208740234, "geo/layer_14/stable_rank_k_proj": 34.97036361694336, "geo/layer_14/stable_rank_o_proj": 53.68886184692383, "geo/layer_14/stable_rank_gate_proj": 87.21705627441406, "geo/layer_14/stable_rank_down_proj": 135.73936462402344, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3796027898788452, "geo/layer_14/attn_entropy_mean": 5.483395576477051, "geo/layer_14/attn_entropy_std": 0.4536558985710144, "geo/layer_21/stable_rank_q_proj": 47.688114166259766, "geo/layer_21/stable_rank_k_proj": 32.08941650390625, "geo/layer_21/stable_rank_o_proj": 83.87589263916016, "geo/layer_21/stable_rank_gate_proj": 87.19356536865234, "geo/layer_21/stable_rank_down_proj": 60.92327880859375, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14870017766952515, "geo/layer_21/attn_entropy_mean": 5.765377044677734, "geo/layer_21/attn_entropy_std": 0.2946758568286896, "geo/layer_27/stable_rank_q_proj": 41.58073043823242, "geo/layer_27/stable_rank_k_proj": 31.500282287597656, "geo/layer_27/stable_rank_o_proj": 117.7559585571289, "geo/layer_27/stable_rank_gate_proj": 91.38752746582031, "geo/layer_27/stable_rank_down_proj": 137.92835998535156, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07671947032213211, "geo/layer_27/attn_entropy_mean": 4.382196426391602, "geo/layer_27/attn_entropy_std": 0.5634846687316895, "attnres/final_alpha/block_0": 0.24235811829566956, "attnres/block_norm/0": 1.6196660995483398, "attnres/final_alpha/block_1": 0.0067491689696908, "attnres/block_norm/1": 29530.31640625, "attnres/final_alpha/block_2": 0.014200484380126, "attnres/block_norm/2": 21072.85546875, "attnres/final_alpha/block_3": 0.016049765050411224, "attnres/block_norm/3": 30090.794921875, "attnres/final_alpha/block_4": 0.0213616993278265, "attnres/block_norm/4": 9410.6171875, "attnres/final_alpha/block_5": 0.5629897117614746, "attnres/block_norm/5": 5078.568359375, "attnres/final_alpha/block_6": 0.136291041970253, "attnres/block_norm/6": 20309.33984375, "geo/tier1_time_s": 1.3596208095550537, "geo/step": 21075.0, "geo/rankme_slope": -5.791969131402561e-05} {"step": 21080, "timestamp": 1778217322.1693003, "train/loss": 2.187160110473633, "train/z_loss": 0.0015530978445895017, "train/perplexity": 8.909874093598647, "train/grad_norm": 0.10107421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1694516.6057611646, "perf/iters_per_sec": 0.8080084828191588, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2376107692718505, "data/tokens_consumed": 44210061312, "data/tokens_consumed_B": 44.210061312, "train/loss_slope": -8.087589502549201e-06} {"step": 21090, "timestamp": 1778217332.5459542, "train/loss": 2.1736900329589846, "train/z_loss": 0.0015612230403348804, "train/perplexity": 8.790662098826838, "train/grad_norm": 0.203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022146.1332744812, "perf/iters_per_sec": 0.9642344156620413, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0370922088623047, "data/tokens_consumed": 44231032832, "data/tokens_consumed_B": 44.231032832, "train/loss_slope": -1.2468462980369393e-05} {"step": 21100, "timestamp": 1778217342.909655, "grad/layer_0/attn": 0.0025942493230104446, "grad/layer_0/mlp": 0.002628444228321314, "grad/layer_0/attn_mlp_ratio": 0.9869904015305098, "grad/layer_4/attn": 0.001612887135706842, "grad/layer_4/mlp": 0.002464421559125185, "grad/layer_4/attn_mlp_ratio": 0.654468820193449, "grad/layer_8/attn": 0.01270211674273014, "grad/layer_8/mlp": 0.0038726525381207466, "grad/layer_8/attn_mlp_ratio": 3.2799525105081466, "grad/layer_12/attn": 0.004624406807124615, "grad/layer_12/mlp": 0.00639164075255394, "grad/layer_12/attn_mlp_ratio": 0.7235085502773232, "grad/layer_16/attn": 0.00431786198168993, "grad/layer_16/mlp": 0.0048017483204603195, "grad/layer_16/attn_mlp_ratio": 0.8992270322392273, "grad/layer_20/attn": 0.007208127062767744, "grad/layer_20/mlp": 0.005769513547420502, "grad/layer_20/attn_mlp_ratio": 1.2493474326021283, "grad/layer_24/attn": 0.01284052338451147, "grad/layer_24/mlp": 0.011907137930393219, "grad/layer_24/attn_mlp_ratio": 1.078388723783647, "grad/layer_27/attn": 0.005593503825366497, "grad/layer_27/mlp": 0.011779100634157658, "grad/layer_27/attn_mlp_ratio": 0.47486679599794573} {"step": 21100, "timestamp": 1778217342.9263155, "train/loss": 2.2608192920684815, "train/z_loss": 0.0015387215302325786, "train/perplexity": 9.590943732862755, "train/grad_norm": 0.150390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021526.5996069005, "perf/iters_per_sec": 0.9639389989885809, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037410044670105, "data/tokens_consumed": 44252004352, "data/tokens_consumed_B": 44.252004352, "train/loss_slope": -1.1057376604054458e-05} {"step": 21110, "timestamp": 1778217353.3064392, "train/loss": 2.2322436571121216, "train/z_loss": 0.0015510942554101347, "train/perplexity": 9.320755215420816, "train/grad_norm": 0.142578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021436.1945923844, "perf/iters_per_sec": 0.9638958905183718, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0374564409255982, "data/tokens_consumed": 44272975872, "data/tokens_consumed_B": 44.272975872, "train/loss_slope": -1.2911219915898757e-05} {"step": 21120, "timestamp": 1778217363.6823835, "train/loss": 2.2044329404830934, "train/z_loss": 0.0015675225062295795, "train/perplexity": 9.065109653407667, "train/grad_norm": 0.2001953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022210.6133548678, "perf/iters_per_sec": 0.9642651621603335, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0370591402053833, "data/tokens_consumed": 44293947392, "data/tokens_consumed_B": 44.293947392, "train/loss_slope": -1.1576170267516551e-05} {"step": 21130, "timestamp": 1778217374.0730417, "train/loss": 2.2243330240249635, "train/z_loss": 0.0015551932039670647, "train/perplexity": 9.247313010971826, "train/grad_norm": 0.142578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019527.3607954006, "perf/iters_per_sec": 0.9629856876351359, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0384370326995849, "data/tokens_consumed": 44314918912, "data/tokens_consumed_B": 44.314918912, "train/loss_slope": -1.0858019801041667e-05} {"step": 21140, "timestamp": 1778217384.457769, "train/loss": 2.236531972885132, "train/z_loss": 0.0015449894010089338, "train/perplexity": 9.360811382388885, "train/grad_norm": 0.1884765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020738.9633309825, "perf/iters_per_sec": 0.9635634247450745, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0378144025802611, "data/tokens_consumed": 44335890432, "data/tokens_consumed_B": 44.335890432, "train/loss_slope": -1.044899874394481e-05} {"step": 21150, "timestamp": 1778217394.825548, "grad/layer_0/attn": 0.0032573190983384848, "grad/layer_0/mlp": 0.003029370680451393, "grad/layer_0/attn_mlp_ratio": 1.0752460937954667, "grad/layer_4/attn": 0.0021648304536938667, "grad/layer_4/mlp": 0.0026207061018794775, "grad/layer_4/attn_mlp_ratio": 0.8260485101845171, "grad/layer_8/attn": 0.004267929587513208, "grad/layer_8/mlp": 0.0038666222244501114, "grad/layer_8/attn_mlp_ratio": 1.1037875513534072, "grad/layer_12/attn": 0.004653424955904484, "grad/layer_12/mlp": 0.006369270849972963, "grad/layer_12/attn_mlp_ratio": 0.7306055893138346, "grad/layer_16/attn": 0.006243984680622816, "grad/layer_16/mlp": 0.005103738512843847, "grad/layer_16/attn_mlp_ratio": 1.2234138842670108, "grad/layer_20/attn": 0.010850435122847557, "grad/layer_20/mlp": 0.007458745501935482, "grad/layer_20/attn_mlp_ratio": 1.4547265320366953, "grad/layer_24/attn": 0.02046871930360794, "grad/layer_24/mlp": 0.013785816729068756, "grad/layer_24/attn_mlp_ratio": 1.4847665218101278, "grad/layer_27/attn": 0.006760628428310156, "grad/layer_27/mlp": 0.01280947495251894, "grad/layer_27/attn_mlp_ratio": 0.5277834103733003} {"step": 21150, "timestamp": 1778217395.4325218, "eos/sharpness": 47.29471206665038, "eos/L0_probe": 2.045248031616211, "eos/L_plus": 2.3259613513946533, "eos/L_minus": 2.2374818325042725, "eos/grad_norm": 0.2264382690191269, "eos/embed_grad_frac": 0.04865255951881409, "eos/time_s": 0.6040449142456055} {"step": 21150, "timestamp": 1778217395.4526536, "train/loss": 2.243339014053345, "train/z_loss": 0.0015522517845965923, "train/perplexity": 9.424748174143442, "train/grad_norm": 0.2265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1908456.5061716414, "perf/iters_per_sec": 0.9100229769571502, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0988733530044557, "data/tokens_consumed": 44356861952, "data/tokens_consumed_B": 44.356861952, "train/loss_slope": -1.1101572298266797e-05} {"step": 21150, "timestamp": 1778217396.817253, "geo/rankme_last": 440.5446472167969, "geo/layer_0/stable_rank_q_proj": 17.217304229736328, "geo/layer_0/stable_rank_k_proj": 15.256699562072754, "geo/layer_0/stable_rank_o_proj": 51.80958938598633, "geo/layer_0/stable_rank_gate_proj": 152.2097930908203, "geo/layer_0/stable_rank_down_proj": 49.94287109375, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05136920511722565, "geo/layer_0/attn_entropy_mean": 6.253328800201416, "geo/layer_0/attn_entropy_std": 0.3225403130054474, "geo/layer_7/stable_rank_q_proj": 42.255592346191406, "geo/layer_7/stable_rank_k_proj": 41.873111724853516, "geo/layer_7/stable_rank_o_proj": 110.59463500976562, "geo/layer_7/stable_rank_gate_proj": 103.6144027709961, "geo/layer_7/stable_rank_down_proj": 151.7188262939453, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5492066144943237, "geo/layer_7/attn_entropy_mean": 4.634708404541016, "geo/layer_7/attn_entropy_std": 0.846963107585907, "geo/layer_14/stable_rank_q_proj": 57.90327072143555, "geo/layer_14/stable_rank_k_proj": 35.09170150756836, "geo/layer_14/stable_rank_o_proj": 53.69654846191406, "geo/layer_14/stable_rank_gate_proj": 87.3962631225586, "geo/layer_14/stable_rank_down_proj": 136.18557739257812, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37954074144363403, "geo/layer_14/attn_entropy_mean": 5.502416133880615, "geo/layer_14/attn_entropy_std": 0.49357870221138, "geo/layer_21/stable_rank_q_proj": 47.63798141479492, "geo/layer_21/stable_rank_k_proj": 32.07816696166992, "geo/layer_21/stable_rank_o_proj": 83.64103698730469, "geo/layer_21/stable_rank_gate_proj": 86.77588653564453, "geo/layer_21/stable_rank_down_proj": 60.78227615356445, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15963615477085114, "geo/layer_21/attn_entropy_mean": 5.741029739379883, "geo/layer_21/attn_entropy_std": 0.2934316098690033, "geo/layer_27/stable_rank_q_proj": 41.62248611450195, "geo/layer_27/stable_rank_k_proj": 31.514598846435547, "geo/layer_27/stable_rank_o_proj": 118.10555267333984, "geo/layer_27/stable_rank_gate_proj": 91.37776947021484, "geo/layer_27/stable_rank_down_proj": 137.880859375, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.06962352246046066, "geo/layer_27/attn_entropy_mean": 4.39601993560791, "geo/layer_27/attn_entropy_std": 0.5737677216529846, "attnres/final_alpha/block_0": 0.24166613817214966, "attnres/block_norm/0": 1.6203033924102783, "attnres/final_alpha/block_1": 0.0067233312875032425, "attnres/block_norm/1": 29632.7890625, "attnres/final_alpha/block_2": 0.013939032331109047, "attnres/block_norm/2": 21141.173828125, "attnres/final_alpha/block_3": 0.015803616493940353, "attnres/block_norm/3": 30379.349609375, "attnres/final_alpha/block_4": 0.020925868302583694, "attnres/block_norm/4": 9422.8603515625, "attnres/final_alpha/block_5": 0.566988468170166, "attnres/block_norm/5": 5038.806640625, "attnres/final_alpha/block_6": 0.13395348191261292, "attnres/block_norm/6": 20330.05078125, "geo/tier1_time_s": 1.3604989051818848, "geo/step": 21150.0, "geo/rankme_slope": -6.497382155987394e-05} {"step": 21160, "timestamp": 1778217407.5194483, "train/loss": 2.176944446563721, "train/z_loss": 0.0015562148299068212, "train/perplexity": 8.819317151561668, "train/grad_norm": 0.103515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1738588.616236653, "perf/iters_per_sec": 0.8290236550505891, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2062381982803345, "data/tokens_consumed": 44377833472, "data/tokens_consumed_B": 44.377833472, "train/loss_slope": -1.3904474320227107e-05} {"step": 21170, "timestamp": 1778217418.395726, "train/loss": 2.1887016534805297, "train/z_loss": 0.0015611827373504638, "train/perplexity": 8.923619639653138, "train/grad_norm": 0.1611328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1929535.4999974445, "perf/iters_per_sec": 0.9200742244708273, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0868688344955444, "data/tokens_consumed": 44398804992, "data/tokens_consumed_B": 44.398804992, "train/loss_slope": -1.3217089149710845e-05} {"step": 21180, "timestamp": 1778217428.7725897, "train/loss": 2.217297840118408, "train/z_loss": 0.0015349818975664675, "train/perplexity": 9.182484769979862, "train/grad_norm": 0.22265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022491.5006608667, "perf/iters_per_sec": 0.9643990996651014, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036915111541748, "data/tokens_consumed": 44419776512, "data/tokens_consumed_B": 44.419776512, "train/loss_slope": -1.177525520324705e-05} {"step": 21190, "timestamp": 1778217439.148664, "train/loss": 2.265148949623108, "train/z_loss": 0.0015469920937903226, "train/perplexity": 9.63255926033305, "train/grad_norm": 0.2314453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022231.9526576505, "perf/iters_per_sec": 0.9642753375328305, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0370481967926026, "data/tokens_consumed": 44440748032, "data/tokens_consumed_B": 44.440748032, "train/loss_slope": -1.2711580758905927e-05} {"step": 21200, "timestamp": 1778217449.5224547, "grad/layer_0/attn": 0.0024324615951627493, "grad/layer_0/mlp": 0.0025045268703252077, "grad/layer_0/attn_mlp_ratio": 0.9712259536366259, "grad/layer_4/attn": 0.0015836757374927402, "grad/layer_4/mlp": 0.0024834368377923965, "grad/layer_4/attn_mlp_ratio": 0.6376951688978735, "grad/layer_8/attn": 0.004349122755229473, "grad/layer_8/mlp": 0.003731000469997525, "grad/layer_8/attn_mlp_ratio": 1.1656719621547411, "grad/layer_12/attn": 0.004040366504341364, "grad/layer_12/mlp": 0.005703650880604982, "grad/layer_12/attn_mlp_ratio": 0.7083824936133811, "grad/layer_16/attn": 0.004058616701513529, "grad/layer_16/mlp": 0.004516729153692722, "grad/layer_16/attn_mlp_ratio": 0.8985742721229846, "grad/layer_20/attn": 0.005188112147152424, "grad/layer_20/mlp": 0.005616859067231417, "grad/layer_20/attn_mlp_ratio": 0.9236678351167696, "grad/layer_24/attn": 0.007971142418682575, "grad/layer_24/mlp": 0.009617543779313564, "grad/layer_24/attn_mlp_ratio": 0.8288126905069553, "grad/layer_27/attn": 0.008080004714429379, "grad/layer_27/mlp": 0.0076066493056714535, "grad/layer_27/attn_mlp_ratio": 1.0622291476198438} {"step": 21200, "timestamp": 1778217449.53929, "train/loss": 2.233784008026123, "train/z_loss": 0.0015601003891788424, "train/perplexity": 9.335123512504897, "train/grad_norm": 0.13671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019414.4632873943, "perf/iters_per_sec": 0.96293185390825, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.038495087623596, "data/tokens_consumed": 44461719552, "data/tokens_consumed_B": 44.461719552, "train/loss_slope": -1.1883628185969198e-05} {"step": 21210, "timestamp": 1778217459.9171946, "train/loss": 2.242027235031128, "train/z_loss": 0.001551932515576482, "train/perplexity": 9.41239309253904, "train/grad_norm": 0.1796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021908.424655308, "perf/iters_per_sec": 0.9641210673595944, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0372141361236573, "data/tokens_consumed": 44482691072, "data/tokens_consumed_B": 44.482691072, "train/loss_slope": -1.0629106104427532e-05} {"step": 21220, "timestamp": 1778217470.80035, "train/loss": 2.1952332258224487, "train/z_loss": 0.001562991226091981, "train/perplexity": 8.98209566920079, "train/grad_norm": 0.1123046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1927986.7356546624, "perf/iters_per_sec": 0.9193357160828888, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0877419233322143, "data/tokens_consumed": 44503662592, "data/tokens_consumed_B": 44.503662592, "train/loss_slope": -1.561835801461916e-05} {"step": 21225, "timestamp": 1778217476.6084163, "eos/sharpness": 29.466152191162102, "eos/L0_probe": 2.0417885780334473, "eos/L_plus": 2.197516679763794, "eos/L_minus": 2.1807219982147217, "eos/grad_norm": 0.16847796738147736, "eos/embed_grad_frac": 0.10926032811403275, "eos/time_s": 0.6208925247192383} {"step": 21225, "timestamp": 1778217477.9938345, "geo/rankme_last": 441.3692932128906, "geo/layer_0/stable_rank_q_proj": 17.223390579223633, "geo/layer_0/stable_rank_k_proj": 15.27041244506836, "geo/layer_0/stable_rank_o_proj": 51.7823486328125, "geo/layer_0/stable_rank_gate_proj": 151.95774841308594, "geo/layer_0/stable_rank_down_proj": 50.021793365478516, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0455806739628315, "geo/layer_0/attn_entropy_mean": 6.253787517547607, "geo/layer_0/attn_entropy_std": 0.3206944167613983, "geo/layer_7/stable_rank_q_proj": 42.267398834228516, "geo/layer_7/stable_rank_k_proj": 41.886940002441406, "geo/layer_7/stable_rank_o_proj": 110.58796691894531, "geo/layer_7/stable_rank_gate_proj": 103.29602813720703, "geo/layer_7/stable_rank_down_proj": 151.50405883789062, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5465912818908691, "geo/layer_7/attn_entropy_mean": 4.6651740074157715, "geo/layer_7/attn_entropy_std": 0.8260194063186646, "geo/layer_14/stable_rank_q_proj": 57.792144775390625, "geo/layer_14/stable_rank_k_proj": 34.95712661743164, "geo/layer_14/stable_rank_o_proj": 53.72504425048828, "geo/layer_14/stable_rank_gate_proj": 87.20543670654297, "geo/layer_14/stable_rank_down_proj": 136.071533203125, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3637075424194336, "geo/layer_14/attn_entropy_mean": 5.511876583099365, "geo/layer_14/attn_entropy_std": 0.4647774398326874, "geo/layer_21/stable_rank_q_proj": 47.656005859375, "geo/layer_21/stable_rank_k_proj": 32.0816764831543, "geo/layer_21/stable_rank_o_proj": 83.51457977294922, "geo/layer_21/stable_rank_gate_proj": 86.43826293945312, "geo/layer_21/stable_rank_down_proj": 60.68159484863281, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15648643672466278, "geo/layer_21/attn_entropy_mean": 5.750767707824707, "geo/layer_21/attn_entropy_std": 0.3012438714504242, "geo/layer_27/stable_rank_q_proj": 41.564483642578125, "geo/layer_27/stable_rank_k_proj": 31.47346305847168, "geo/layer_27/stable_rank_o_proj": 118.21883392333984, "geo/layer_27/stable_rank_gate_proj": 91.43712615966797, "geo/layer_27/stable_rank_down_proj": 137.8583984375, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07931485772132874, "geo/layer_27/attn_entropy_mean": 4.401490211486816, "geo/layer_27/attn_entropy_std": 0.5584203600883484, "attnres/final_alpha/block_0": 0.24196164309978485, "attnres/block_norm/0": 1.6209511756896973, "attnres/final_alpha/block_1": 0.0066657522693276405, "attnres/block_norm/1": 29564.1484375, "attnres/final_alpha/block_2": 0.013960951939225197, "attnres/block_norm/2": 21247.7734375, "attnres/final_alpha/block_3": 0.01583060994744301, "attnres/block_norm/3": 30224.419921875, "attnres/final_alpha/block_4": 0.020763181149959564, "attnres/block_norm/4": 9429.16015625, "attnres/final_alpha/block_5": 0.568301796913147, "attnres/block_norm/5": 5059.908203125, "attnres/final_alpha/block_6": 0.1325160562992096, "attnres/block_norm/6": 20418.251953125, "geo/tier1_time_s": 1.3644068241119385, "geo/step": 21225.0, "geo/rankme_slope": -6.988379336109444e-05} {"step": 21230, "timestamp": 1778217483.188666, "train/loss": 2.226186752319336, "train/z_loss": 0.00154038897017017, "train/perplexity": 9.264470914879835, "train/grad_norm": 0.201171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1693932.5795017646, "perf/iters_per_sec": 0.8077299973973106, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2380374670028687, "data/tokens_consumed": 44524634112, "data/tokens_consumed_B": 44.524634112, "train/loss_slope": -1.769427619393861e-05} {"step": 21240, "timestamp": 1778217493.9856741, "train/loss": 2.185166764259338, "train/z_loss": 0.0015534463571384549, "train/perplexity": 8.892131319425165, "train/grad_norm": 0.1279296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1943217.30381097, "perf/iters_per_sec": 0.9265982169203615, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0792164087295533, "data/tokens_consumed": 44545605632, "data/tokens_consumed_B": 44.545605632, "train/loss_slope": -1.8198316404134868e-05} {"step": 21250, "timestamp": 1778217504.361915, "grad/layer_0/attn": 0.0029231933876872063, "grad/layer_0/mlp": 0.002870864700525999, "grad/layer_0/attn_mlp_ratio": 1.018227464822314, "grad/layer_4/attn": 0.002259514294564724, "grad/layer_4/mlp": 0.002490307902917266, "grad/layer_4/attn_mlp_ratio": 0.9073232274553271, "grad/layer_8/attn": 0.005522348452359438, "grad/layer_8/mlp": 0.003700018161907792, "grad/layer_8/attn_mlp_ratio": 1.49251924219207, "grad/layer_12/attn": 0.004013833124190569, "grad/layer_12/mlp": 0.006215798202902079, "grad/layer_12/attn_mlp_ratio": 0.6457470028132285, "grad/layer_16/attn": 0.0056615713983774185, "grad/layer_16/mlp": 0.004549301695078611, "grad/layer_16/attn_mlp_ratio": 1.2444923756216941, "grad/layer_20/attn": 0.0076558892615139484, "grad/layer_20/mlp": 0.005606918130069971, "grad/layer_20/attn_mlp_ratio": 1.3654362249221543, "grad/layer_24/attn": 0.006197003647685051, "grad/layer_24/mlp": 0.008905584923923016, "grad/layer_24/attn_mlp_ratio": 0.6958558737060037, "grad/layer_27/attn": 0.00481284037232399, "grad/layer_27/mlp": 0.007772454526275396, "grad/layer_27/attn_mlp_ratio": 0.6192175578682445} {"step": 21250, "timestamp": 1778217504.3794727, "train/loss": 2.2447343707084655, "train/z_loss": 0.0015501708490774035, "train/perplexity": 9.437908238587578, "train/grad_norm": 0.1142578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2018849.748009775, "perf/iters_per_sec": 0.9626625766800762, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0387855768203735, "data/tokens_consumed": 44566577152, "data/tokens_consumed_B": 44.566577152, "train/loss_slope": -1.6013916233370587e-05} {"step": 21260, "timestamp": 1778217514.765182, "train/loss": 2.247029161453247, "train/z_loss": 0.0015576062258332967, "train/perplexity": 9.459591132400316, "train/grad_norm": 0.1796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020470.0722776705, "perf/iters_per_sec": 0.963435207499347, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0379525184631349, "data/tokens_consumed": 44587548672, "data/tokens_consumed_B": 44.587548672, "train/loss_slope": -1.3551718581377294e-05} {"step": 21270, "timestamp": 1778217525.1464672, "train/loss": 2.2286808013916017, "train/z_loss": 0.0015581190236844122, "train/perplexity": 9.287605797744577, "train/grad_norm": 0.1455078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021255.4088192857, "perf/iters_per_sec": 0.9638096851440838, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0375492334365846, "data/tokens_consumed": 44608520192, "data/tokens_consumed_B": 44.608520192, "train/loss_slope": -1.400742389903276e-05} {"step": 21280, "timestamp": 1778217535.526216, "train/loss": 2.184613585472107, "train/z_loss": 0.0015623213723301888, "train/perplexity": 8.887213741281343, "train/grad_norm": 0.189453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021425.5565073243, "perf/iters_per_sec": 0.963890817884123, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0374619007110595, "data/tokens_consumed": 44629491712, "data/tokens_consumed_B": 44.629491712, "train/loss_slope": -1.644864636715055e-05} {"step": 21290, "timestamp": 1778217545.9216895, "train/loss": 2.2014628887176513, "train/z_loss": 0.0015555289341136814, "train/perplexity": 9.038225751530124, "train/grad_norm": 0.1572265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2018774.362214966, "perf/iters_per_sec": 0.9626266299319105, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0388243675231934, "data/tokens_consumed": 44650463232, "data/tokens_consumed_B": 44.650463232, "train/loss_slope": -1.538831060296805e-05} {"step": 21300, "timestamp": 1778217556.2895212, "grad/layer_0/attn": 0.0030701463110744953, "grad/layer_0/mlp": 0.0027651272248476744, "grad/layer_0/attn_mlp_ratio": 1.1103092011300506, "grad/layer_4/attn": 0.002310320269316435, "grad/layer_4/mlp": 0.0024751219898462296, "grad/layer_4/attn_mlp_ratio": 0.9334166903499967, "grad/layer_8/attn": 0.004785506054759026, "grad/layer_8/mlp": 0.0036302777007222176, "grad/layer_8/attn_mlp_ratio": 1.3182203449573429, "grad/layer_12/attn": 0.0047990866005420685, "grad/layer_12/mlp": 0.006683732848614454, "grad/layer_12/attn_mlp_ratio": 0.718024887804191, "grad/layer_16/attn": 0.00415887450799346, "grad/layer_16/mlp": 0.004364588297903538, "grad/layer_16/attn_mlp_ratio": 0.9528675166691802, "grad/layer_20/attn": 0.00499996030703187, "grad/layer_20/mlp": 0.006376517005264759, "grad/layer_20/attn_mlp_ratio": 0.7841208961713069, "grad/layer_24/attn": 0.0050400434993207455, "grad/layer_24/mlp": 0.008777664043009281, "grad/layer_24/attn_mlp_ratio": 0.5741896041140676, "grad/layer_27/attn": 0.006764590740203857, "grad/layer_27/mlp": 0.006709160748869181, "grad/layer_27/attn_mlp_ratio": 1.0082618217960924} {"step": 21300, "timestamp": 1778217556.8967648, "eos/sharpness": 24.179124832153317, "eos/L0_probe": 2.0437426567077637, "eos/L_plus": 2.1901984214782715, "eos/L_minus": 2.139078140258789, "eos/grad_norm": 0.10926490277051926, "eos/embed_grad_frac": 0.1866275668144226, "eos/time_s": 0.604464054107666} {"step": 21300, "timestamp": 1778217556.9167912, "train/loss": 2.1850003838539123, "train/z_loss": 0.0015603656182065606, "train/perplexity": 8.890651966082256, "train/grad_norm": 0.109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1908353.283877219, "perf/iters_per_sec": 0.9099737567316146, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0989327907562256, "data/tokens_consumed": 44671434752, "data/tokens_consumed_B": 44.671434752, "train/loss_slope": -1.4895415527842707e-05} {"step": 21300, "timestamp": 1778217558.2756565, "geo/rankme_last": 440.955078125, "geo/layer_0/stable_rank_q_proj": 17.264755249023438, "geo/layer_0/stable_rank_k_proj": 15.259706497192383, "geo/layer_0/stable_rank_o_proj": 51.99604034423828, "geo/layer_0/stable_rank_gate_proj": 151.99143981933594, "geo/layer_0/stable_rank_down_proj": 50.08671188354492, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.048567406833171844, "geo/layer_0/attn_entropy_mean": 6.256806373596191, "geo/layer_0/attn_entropy_std": 0.32114923000335693, "geo/layer_7/stable_rank_q_proj": 42.32892990112305, "geo/layer_7/stable_rank_k_proj": 41.9731330871582, "geo/layer_7/stable_rank_o_proj": 110.31804656982422, "geo/layer_7/stable_rank_gate_proj": 103.17002868652344, "geo/layer_7/stable_rank_down_proj": 151.45774841308594, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5483027100563049, "geo/layer_7/attn_entropy_mean": 4.627925872802734, "geo/layer_7/attn_entropy_std": 0.8303559422492981, "geo/layer_14/stable_rank_q_proj": 57.80682373046875, "geo/layer_14/stable_rank_k_proj": 35.06040954589844, "geo/layer_14/stable_rank_o_proj": 53.6033935546875, "geo/layer_14/stable_rank_gate_proj": 87.25943756103516, "geo/layer_14/stable_rank_down_proj": 135.96926879882812, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3939209282398224, "geo/layer_14/attn_entropy_mean": 5.518063545227051, "geo/layer_14/attn_entropy_std": 0.461418092250824, "geo/layer_21/stable_rank_q_proj": 47.618019104003906, "geo/layer_21/stable_rank_k_proj": 32.12363052368164, "geo/layer_21/stable_rank_o_proj": 83.35308074951172, "geo/layer_21/stable_rank_gate_proj": 86.5936050415039, "geo/layer_21/stable_rank_down_proj": 60.72219467163086, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1589730829000473, "geo/layer_21/attn_entropy_mean": 5.745573043823242, "geo/layer_21/attn_entropy_std": 0.29480668902397156, "geo/layer_27/stable_rank_q_proj": 41.50348663330078, "geo/layer_27/stable_rank_k_proj": 31.60698890686035, "geo/layer_27/stable_rank_o_proj": 118.42583465576172, "geo/layer_27/stable_rank_gate_proj": 91.39069366455078, "geo/layer_27/stable_rank_down_proj": 138.3065948486328, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07406385987997055, "geo/layer_27/attn_entropy_mean": 4.387754917144775, "geo/layer_27/attn_entropy_std": 0.5737357139587402, "attnres/final_alpha/block_0": 0.24290800094604492, "attnres/block_norm/0": 1.6215672492980957, "attnres/final_alpha/block_1": 0.006804525852203369, "attnres/block_norm/1": 29725.681640625, "attnres/final_alpha/block_2": 0.014267646707594395, "attnres/block_norm/2": 21242.90234375, "attnres/final_alpha/block_3": 0.016172409057617188, "attnres/block_norm/3": 30414.265625, "attnres/final_alpha/block_4": 0.020906716585159302, "attnres/block_norm/4": 9474.53125, "attnres/final_alpha/block_5": 0.564877986907959, "attnres/block_norm/5": 5079.3525390625, "attnres/final_alpha/block_6": 0.13406270742416382, "attnres/block_norm/6": 20478.74609375, "geo/tier1_time_s": 1.3549492359161377, "geo/step": 21300.0, "geo/rankme_slope": -6.471190038515406e-05} {"step": 21310, "timestamp": 1778217568.6635778, "train/loss": 2.252273440361023, "train/z_loss": 0.0015468112891539932, "train/perplexity": 9.509330175363642, "train/grad_norm": 0.1806640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1785868.0054487484, "perf/iters_per_sec": 0.8515682246440641, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1743040323257445, "data/tokens_consumed": 44692406272, "data/tokens_consumed_B": 44.692406272, "train/loss_slope": -1.2756322390890757e-05} {"step": 21320, "timestamp": 1778217579.0552967, "train/loss": 2.1835177659988405, "train/z_loss": 0.001567328895907849, "train/perplexity": 8.877480293425426, "train/grad_norm": 0.1328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019465.0453738875, "perf/iters_per_sec": 0.9629559733266294, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0384690761566162, "data/tokens_consumed": 44713377792, "data/tokens_consumed_B": 44.713377792, "train/loss_slope": -1.2088998788260714e-05} {"step": 21330, "timestamp": 1778217589.4434016, "train/loss": 2.2317378997802733, "train/z_loss": 0.0015480345813557506, "train/perplexity": 9.31604236701153, "train/grad_norm": 0.181640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020109.4352645827, "perf/iters_per_sec": 0.9632632423708833, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0381378173828124, "data/tokens_consumed": 44734349312, "data/tokens_consumed_B": 44.734349312, "train/loss_slope": -8.930909737358881e-06} {"step": 21340, "timestamp": 1778217599.8133314, "train/loss": 2.231270909309387, "train/z_loss": 0.0015536442282609642, "train/perplexity": 9.311692879663383, "train/grad_norm": 0.1533203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023445.732962001, "perf/iters_per_sec": 0.96485411308384, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0364261150360108, "data/tokens_consumed": 44755320832, "data/tokens_consumed_B": 44.755320832, "train/loss_slope": -8.283413561693026e-06} {"step": 21350, "timestamp": 1778217610.1602557, "grad/layer_0/attn": 0.0023958066012710333, "grad/layer_0/mlp": 0.0026070093736052513, "grad/layer_0/attn_mlp_ratio": 0.9189865343902475, "grad/layer_4/attn": 0.0016817040741443634, "grad/layer_4/mlp": 0.0026949874591082335, "grad/layer_4/attn_mlp_ratio": 0.6240118135093864, "grad/layer_8/attn": 0.0073652202263474464, "grad/layer_8/mlp": 0.0037971732672303915, "grad/layer_8/attn_mlp_ratio": 1.939658665550881, "grad/layer_12/attn": 0.003695625811815262, "grad/layer_12/mlp": 0.00656553590670228, "grad/layer_12/attn_mlp_ratio": 0.5628825747117472, "grad/layer_16/attn": 0.004634218756109476, "grad/layer_16/mlp": 0.00439048558473587, "grad/layer_16/attn_mlp_ratio": 1.055513920070615, "grad/layer_20/attn": 0.007746574468910694, "grad/layer_20/mlp": 0.005949567537754774, "grad/layer_20/attn_mlp_ratio": 1.3020399028246141, "grad/layer_24/attn": 0.00983990915119648, "grad/layer_24/mlp": 0.008528725244104862, "grad/layer_24/attn_mlp_ratio": 1.153737370379493, "grad/layer_27/attn": 0.008736730553209782, "grad/layer_27/mlp": 0.006675930228084326, "grad/layer_27/attn_mlp_ratio": 1.308691092304555} {"step": 21350, "timestamp": 1778217610.1752691, "train/loss": 2.208049249649048, "train/z_loss": 0.0015472424332983793, "train/perplexity": 9.097951239410909, "train/grad_norm": 0.1181640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025600.5060650043, "perf/iters_per_sec": 0.9658815889668485, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0353235960006715, "data/tokens_consumed": 44776292352, "data/tokens_consumed_B": 44.776292352, "train/loss_slope": -9.374556754610319e-06} {"step": 21360, "timestamp": 1778217620.5217867, "train/loss": 2.2404461145401, "train/z_loss": 0.0015445815399289132, "train/perplexity": 9.397522723970962, "train/grad_norm": 0.236328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027896.9882124315, "perf/iters_per_sec": 0.9669766369878919, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341511487960815, "data/tokens_consumed": 44797263872, "data/tokens_consumed_B": 44.797263872, "train/loss_slope": -8.212839516297143e-06} {"step": 21370, "timestamp": 1778217630.880754, "train/loss": 2.2889055252075194, "train/z_loss": 0.0015492719947360456, "train/perplexity": 9.864135723117506, "train/grad_norm": 0.09423828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025582.920539129, "perf/iters_per_sec": 0.9658732035346647, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0353325843811034, "data/tokens_consumed": 44818235392, "data/tokens_consumed_B": 44.818235392, "train/loss_slope": -5.018887046289706e-06} {"step": 21375, "timestamp": 1778217636.6718624, "eos/sharpness": 61.89675331115721, "eos/L0_probe": 2.049852132797241, "eos/L_plus": 2.449860095977783, "eos/L_minus": 2.2688117027282715, "eos/grad_norm": 0.2963324189186096, "eos/embed_grad_frac": 0.028790444135665894, "eos/time_s": 0.6135709285736084} {"step": 21375, "timestamp": 1778217638.0522768, "geo/rankme_last": 440.121337890625, "geo/layer_0/stable_rank_q_proj": 17.299095153808594, "geo/layer_0/stable_rank_k_proj": 15.285858154296875, "geo/layer_0/stable_rank_o_proj": 52.00808334350586, "geo/layer_0/stable_rank_gate_proj": 152.0033416748047, "geo/layer_0/stable_rank_down_proj": 50.000370025634766, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05119219049811363, "geo/layer_0/attn_entropy_mean": 6.254667282104492, "geo/layer_0/attn_entropy_std": 0.32286080718040466, "geo/layer_7/stable_rank_q_proj": 42.2807502746582, "geo/layer_7/stable_rank_k_proj": 41.98917770385742, "geo/layer_7/stable_rank_o_proj": 110.14249420166016, "geo/layer_7/stable_rank_gate_proj": 103.37646484375, "geo/layer_7/stable_rank_down_proj": 151.3352813720703, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5600932240486145, "geo/layer_7/attn_entropy_mean": 4.685462951660156, "geo/layer_7/attn_entropy_std": 0.8492530584335327, "geo/layer_14/stable_rank_q_proj": 57.930908203125, "geo/layer_14/stable_rank_k_proj": 35.01553726196289, "geo/layer_14/stable_rank_o_proj": 53.595069885253906, "geo/layer_14/stable_rank_gate_proj": 87.20536804199219, "geo/layer_14/stable_rank_down_proj": 135.6302947998047, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3811676502227783, "geo/layer_14/attn_entropy_mean": 5.516775131225586, "geo/layer_14/attn_entropy_std": 0.4652015268802643, "geo/layer_21/stable_rank_q_proj": 47.56266403198242, "geo/layer_21/stable_rank_k_proj": 31.974679946899414, "geo/layer_21/stable_rank_o_proj": 83.50508117675781, "geo/layer_21/stable_rank_gate_proj": 86.55394744873047, "geo/layer_21/stable_rank_down_proj": 60.69957733154297, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15394628047943115, "geo/layer_21/attn_entropy_mean": 5.7638421058654785, "geo/layer_21/attn_entropy_std": 0.2986781895160675, "geo/layer_27/stable_rank_q_proj": 41.55702590942383, "geo/layer_27/stable_rank_k_proj": 31.551855087280273, "geo/layer_27/stable_rank_o_proj": 118.7829818725586, "geo/layer_27/stable_rank_gate_proj": 91.36768341064453, "geo/layer_27/stable_rank_down_proj": 138.31837463378906, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07322173565626144, "geo/layer_27/attn_entropy_mean": 4.392116069793701, "geo/layer_27/attn_entropy_std": 0.5627526640892029, "attnres/final_alpha/block_0": 0.239282488822937, "attnres/block_norm/0": 1.6224031448364258, "attnres/final_alpha/block_1": 0.006690907292068005, "attnres/block_norm/1": 29787.37890625, "attnres/final_alpha/block_2": 0.013836795464158058, "attnres/block_norm/2": 21288.296875, "attnres/final_alpha/block_3": 0.01566907949745655, "attnres/block_norm/3": 30581.2734375, "attnres/final_alpha/block_4": 0.020402608439326286, "attnres/block_norm/4": 9506.693359375, "attnres/final_alpha/block_5": 0.5712857842445374, "attnres/block_norm/5": 5047.2998046875, "attnres/final_alpha/block_6": 0.1328323483467102, "attnres/block_norm/6": 20627.6171875, "geo/tier1_time_s": 1.3628809452056885, "geo/step": 21375.0, "geo/rankme_slope": -9.255495166816727e-05} {"step": 21380, "timestamp": 1778217643.248913, "train/loss": 2.2577107191085815, "train/z_loss": 0.0015478318557143212, "train/perplexity": 9.561175876263027, "train/grad_norm": 0.162109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1696604.8919236886, "perf/iters_per_sec": 0.8090042552584117, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.236087441444397, "data/tokens_consumed": 44839206912, "data/tokens_consumed_B": 44.839206912, "train/loss_slope": -1.1984769052618704e-06} {"step": 21390, "timestamp": 1778217653.6418936, "train/loss": 2.219122886657715, "train/z_loss": 0.001546011900063604, "train/perplexity": 9.19925853382555, "train/grad_norm": 0.13671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2018830.2407583378, "perf/iters_per_sec": 0.9626532748977364, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0387956142425536, "data/tokens_consumed": 44860178432, "data/tokens_consumed_B": 44.860178432, "train/loss_slope": 1.1167455713849033e-06} {"step": 21400, "timestamp": 1778217664.0096185, "grad/layer_0/attn": 0.002713481429964304, "grad/layer_0/mlp": 0.0026482564862817526, "grad/layer_0/attn_mlp_ratio": 1.0246293520123477, "grad/layer_4/attn": 0.002612916985526681, "grad/layer_4/mlp": 0.0024997929576784372, "grad/layer_4/attn_mlp_ratio": 1.0452533170698945, "grad/layer_8/attn": 0.004085694905370474, "grad/layer_8/mlp": 0.0036339035723358393, "grad/layer_8/attn_mlp_ratio": 1.1243266948637158, "grad/layer_12/attn": 0.004772497806698084, "grad/layer_12/mlp": 0.006552555598318577, "grad/layer_12/attn_mlp_ratio": 0.7283414329347455, "grad/layer_16/attn": 0.004169574938714504, "grad/layer_16/mlp": 0.004602054599672556, "grad/layer_16/attn_mlp_ratio": 0.9060246369977256, "grad/layer_20/attn": 0.004188845865428448, "grad/layer_20/mlp": 0.006642260123044252, "grad/layer_20/attn_mlp_ratio": 0.6306356157044136, "grad/layer_24/attn": 0.015345203690230846, "grad/layer_24/mlp": 0.01207730546593666, "grad/layer_24/attn_mlp_ratio": 1.2705817209354298, "grad/layer_27/attn": 0.006141620688140392, "grad/layer_27/mlp": 0.0118466317653656, "grad/layer_27/attn_mlp_ratio": 0.518427579917953} {"step": 21400, "timestamp": 1778217664.0238397, "train/loss": 2.2476608514785767, "train/z_loss": 0.00155427111312747, "train/perplexity": 9.465568549500963, "train/grad_norm": 0.1884765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021216.9518858714, "perf/iters_per_sec": 0.9637913474492413, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0375689744949341, "data/tokens_consumed": 44881149952, "data/tokens_consumed_B": 44.881149952, "train/loss_slope": 3.763629779993142e-06} {"step": 21410, "timestamp": 1778217674.4038584, "train/loss": 2.2336800336837768, "train/z_loss": 0.0015387646155431867, "train/perplexity": 9.334152949634657, "train/grad_norm": 0.2080078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021598.9851956214, "perf/iters_per_sec": 0.9639735151270015, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037372899055481, "data/tokens_consumed": 44902121472, "data/tokens_consumed_B": 44.902121472, "train/loss_slope": 6.382343485088622e-06} {"step": 21420, "timestamp": 1778217684.7796655, "train/loss": 2.179490089416504, "train/z_loss": 0.0015568396891467273, "train/perplexity": 8.841796583398013, "train/grad_norm": 0.1376953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022373.3427268318, "perf/iters_per_sec": 0.964342757571617, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0369756937026977, "data/tokens_consumed": 44923092992, "data/tokens_consumed_B": 44.923092992, "train/loss_slope": 2.7889325816890054e-06} {"step": 21430, "timestamp": 1778217695.159606, "train/loss": 2.2520558834075928, "train/z_loss": 0.0015486541553400457, "train/perplexity": 9.507261579488395, "train/grad_norm": 0.1611328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021552.8956975, "perf/iters_per_sec": 0.9639515379416943, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373965501785278, "data/tokens_consumed": 44944064512, "data/tokens_consumed_B": 44.944064512, "train/loss_slope": 8.062362842577022e-06} {"step": 21440, "timestamp": 1778217705.5420575, "train/loss": 2.2361687421798706, "train/z_loss": 0.0015435937675647438, "train/perplexity": 9.35741186571044, "train/grad_norm": 0.125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021000.1713347042, "perf/iters_per_sec": 0.963687978427269, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0376802682876587, "data/tokens_consumed": 44965036032, "data/tokens_consumed_B": 44.965036032, "train/loss_slope": 8.45275215654809e-06} {"step": 21450, "timestamp": 1778217715.9075544, "grad/layer_0/attn": 0.0028817132115364075, "grad/layer_0/mlp": 0.002934235380962491, "grad/layer_0/attn_mlp_ratio": 0.9821001859711493, "grad/layer_4/attn": 0.0031057142186909914, "grad/layer_4/mlp": 0.0024061938747763634, "grad/layer_4/attn_mlp_ratio": 1.2907164805695446, "grad/layer_8/attn": 0.006685602944344282, "grad/layer_8/mlp": 0.003809574758633971, "grad/layer_8/attn_mlp_ratio": 1.754947255910228, "grad/layer_12/attn": 0.004780040122568607, "grad/layer_12/mlp": 0.006010534707456827, "grad/layer_12/attn_mlp_ratio": 0.7952770054069672, "grad/layer_16/attn": 0.006165442522615194, "grad/layer_16/mlp": 0.004354259930551052, "grad/layer_16/attn_mlp_ratio": 1.4159564379151075, "grad/layer_20/attn": 0.006768909282982349, "grad/layer_20/mlp": 0.006758878473192453, "grad/layer_20/attn_mlp_ratio": 1.0014840790053072, "grad/layer_24/attn": 0.01141447201371193, "grad/layer_24/mlp": 0.010593470185995102, "grad/layer_24/attn_mlp_ratio": 1.0775007344667986, "grad/layer_27/attn": 0.007245275191962719, "grad/layer_27/mlp": 0.010789072141051292, "grad/layer_27/attn_mlp_ratio": 0.6715382963509322} {"step": 21450, "timestamp": 1778217716.5029466, "eos/sharpness": 40.840458869934075, "eos/L0_probe": 2.042118549346924, "eos/L_plus": 2.2185983657836914, "eos/L_minus": 2.274043321609497, "eos/grad_norm": 0.1671503335237503, "eos/embed_grad_frac": 0.08794158697128296, "eos/time_s": 0.592534065246582} {"step": 21450, "timestamp": 1778217716.5214765, "train/loss": 2.1788891792297362, "train/z_loss": 0.0015544798690825701, "train/perplexity": 8.836485053797674, "train/grad_norm": 0.1669921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1911156.5849713807, "perf/iters_per_sec": 0.9113104748589423, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0973208665847778, "data/tokens_consumed": 44986007552, "data/tokens_consumed_B": 44.986007552, "train/loss_slope": 1.7399085594518579e-06} {"step": 21450, "timestamp": 1778217717.8888845, "geo/rankme_last": 439.8037109375, "geo/layer_0/stable_rank_q_proj": 17.292871475219727, "geo/layer_0/stable_rank_k_proj": 15.252236366271973, "geo/layer_0/stable_rank_o_proj": 51.849693298339844, "geo/layer_0/stable_rank_gate_proj": 151.87229919433594, "geo/layer_0/stable_rank_down_proj": 49.99822998046875, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.048856716603040695, "geo/layer_0/attn_entropy_mean": 6.254507064819336, "geo/layer_0/attn_entropy_std": 0.3212236166000366, "geo/layer_7/stable_rank_q_proj": 42.19987106323242, "geo/layer_7/stable_rank_k_proj": 42.003570556640625, "geo/layer_7/stable_rank_o_proj": 109.82564544677734, "geo/layer_7/stable_rank_gate_proj": 103.0689468383789, "geo/layer_7/stable_rank_down_proj": 151.06008911132812, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5418424606323242, "geo/layer_7/attn_entropy_mean": 4.648682594299316, "geo/layer_7/attn_entropy_std": 0.8691472411155701, "geo/layer_14/stable_rank_q_proj": 57.92007827758789, "geo/layer_14/stable_rank_k_proj": 35.09695816040039, "geo/layer_14/stable_rank_o_proj": 53.5851936340332, "geo/layer_14/stable_rank_gate_proj": 87.15074920654297, "geo/layer_14/stable_rank_down_proj": 135.99208068847656, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3802982270717621, "geo/layer_14/attn_entropy_mean": 5.498069763183594, "geo/layer_14/attn_entropy_std": 0.4870937168598175, "geo/layer_21/stable_rank_q_proj": 47.46223449707031, "geo/layer_21/stable_rank_k_proj": 31.981943130493164, "geo/layer_21/stable_rank_o_proj": 83.44160461425781, "geo/layer_21/stable_rank_gate_proj": 86.63876342773438, "geo/layer_21/stable_rank_down_proj": 60.598201751708984, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15688732266426086, "geo/layer_21/attn_entropy_mean": 5.7625861167907715, "geo/layer_21/attn_entropy_std": 0.29780852794647217, "geo/layer_27/stable_rank_q_proj": 41.4739990234375, "geo/layer_27/stable_rank_k_proj": 31.611560821533203, "geo/layer_27/stable_rank_o_proj": 118.724853515625, "geo/layer_27/stable_rank_gate_proj": 91.30096435546875, "geo/layer_27/stable_rank_down_proj": 138.17730712890625, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07563374936580658, "geo/layer_27/attn_entropy_mean": 4.41780948638916, "geo/layer_27/attn_entropy_std": 0.5467572808265686, "attnres/final_alpha/block_0": 0.24335335195064545, "attnres/block_norm/0": 1.623033046722412, "attnres/final_alpha/block_1": 0.006758269388228655, "attnres/block_norm/1": 29934.93359375, "attnres/final_alpha/block_2": 0.0140951257199049, "attnres/block_norm/2": 21333.953125, "attnres/final_alpha/block_3": 0.016039341688156128, "attnres/block_norm/3": 30530.765625, "attnres/final_alpha/block_4": 0.020966771990060806, "attnres/block_norm/4": 9505.73046875, "attnres/final_alpha/block_5": 0.561918318271637, "attnres/block_norm/5": 5104.953125, "attnres/final_alpha/block_6": 0.13686884939670563, "attnres/block_norm/6": 20535.9765625, "geo/tier1_time_s": 1.363821029663086, "geo/step": 21450.0, "geo/rankme_slope": -0.00012031715029761904} {"step": 21460, "timestamp": 1778217728.2732584, "train/loss": 2.2388261795043944, "train/z_loss": 0.001556649897247553, "train/perplexity": 9.38231167144641, "train/grad_norm": 0.11865234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1785072.4151549097, "perf/iters_per_sec": 0.8511888576292561, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1748274087905883, "data/tokens_consumed": 45006979072, "data/tokens_consumed_B": 45.006979072, "train/loss_slope": 2.3871627255957e-06} {"step": 21470, "timestamp": 1778217738.6509862, "train/loss": 2.21525182723999, "train/z_loss": 0.0015629867906682194, "train/perplexity": 9.16371649449854, "train/grad_norm": 0.130859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022115.2196258046, "perf/iters_per_sec": 0.9642196748856566, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037108063697815, "data/tokens_consumed": 45027950592, "data/tokens_consumed_B": 45.027950592, "train/loss_slope": -1.6197545371754926e-06} {"step": 21480, "timestamp": 1778217749.027193, "train/loss": 2.2136704444885256, "train/z_loss": 0.001548346516210586, "train/perplexity": 9.149236603437387, "train/grad_norm": 0.1474609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022321.126992055, "perf/iters_per_sec": 0.9643178591690326, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0370024681091308, "data/tokens_consumed": 45048922112, "data/tokens_consumed_B": 45.048922112, "train/loss_slope": -2.8389291222517933e-06} {"step": 21490, "timestamp": 1778217759.4041855, "train/loss": 2.246431517601013, "train/z_loss": 0.0015406033024191856, "train/perplexity": 9.453939354958633, "train/grad_norm": 0.138671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022310.1541261773, "perf/iters_per_sec": 0.9643126268988501, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0370080947875977, "data/tokens_consumed": 45069893632, "data/tokens_consumed_B": 45.069893632, "train/loss_slope": -2.0882039681017901e-07} {"step": 21500, "timestamp": 1778217769.7753441, "grad/layer_0/attn": 0.002960466779768467, "grad/layer_0/mlp": 0.0029241538140922785, "grad/layer_0/attn_mlp_ratio": 1.0124182470358989, "grad/layer_4/attn": 0.002034506294876337, "grad/layer_4/mlp": 0.0025689832400530577, "grad/layer_4/attn_mlp_ratio": 0.7919499761467188, "grad/layer_8/attn": 0.00313375610858202, "grad/layer_8/mlp": 0.00367691065184772, "grad/layer_8/attn_mlp_ratio": 0.852279622780413, "grad/layer_12/attn": 0.0038606899324804544, "grad/layer_12/mlp": 0.006425731349736452, "grad/layer_12/attn_mlp_ratio": 0.6008171929810106, "grad/layer_16/attn": 0.004096794407814741, "grad/layer_16/mlp": 0.004753583110868931, "grad/layer_16/attn_mlp_ratio": 0.8618329007994515, "grad/layer_20/attn": 0.00501661142334342, "grad/layer_20/mlp": 0.007535235956311226, "grad/layer_20/attn_mlp_ratio": 0.6657537183777671, "grad/layer_24/attn": 0.02384466491639614, "grad/layer_24/mlp": 0.016085324808955193, "grad/layer_24/attn_mlp_ratio": 1.4823862776387615, "grad/layer_27/attn": 0.01538760494440794, "grad/layer_27/mlp": 0.014959597960114479, "grad/layer_27/attn_mlp_ratio": 1.0286108545546166} {"step": 21500, "timestamp": 1778217769.790154, "train/loss": 2.2626727342605593, "train/z_loss": 0.0015511866891756653, "train/perplexity": 9.60873647645639, "train/grad_norm": 0.28515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020657.216221727, "perf/iters_per_sec": 0.9635244446858058, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037856388092041, "data/tokens_consumed": 45090865152, "data/tokens_consumed_B": 45.090865152, "train/loss_slope": 2.4130374291549174e-06} {"step": 21500, "timestamp": 1778217777.018996, "geo/ww_alpha_mean": 7.877811314954686, "geo/ww_alpha_std": 4.821929448698509, "geo/ww_alpha_min": 1.3694644312303659, "geo/ww_alpha_max": 37.92317489432109, "geo/ww_alpha_healthy_frac": 0.18274111675126903, "geo/ww_alpha_by_type/q_proj": 4.096062311376381, "geo/ww_alpha_by_type/k_proj": 4.669293728030811, "geo/ww_alpha_by_type/v_proj": 7.8065250201256475, "geo/ww_alpha_by_type/o_proj": 8.02684942541374, "geo/ww_alpha_by_type/gate_proj": 8.786212697673454, "geo/ww_alpha_by_type/up_proj": 12.670657453063669, "geo/ww_alpha_by_type/down_proj": 9.215173057358616, "geo/twonn_id/layer_0": 0.6940918564796448, "geo/twonn_id/layer_7": 2.8301687240600586, "geo/twonn_id/layer_14": 4.167908668518066, "geo/twonn_id/layer_21": 6.488036155700684, "geo/twonn_id/layer_27": 6.382431507110596, "geo/tier2_time_s": 7.219946622848511} {"step": 21500, "timestamp": 1778217777.7354836, "eoc/jacobian_sigma/layer_0/attn": 880.5145263671875, "eoc/jacobian_sigma/layer_0/mlp": 5298.091796875, "eoc/jacobian_sigma/layer_0": 5298.091796875, "eoc/jacobian_sigma/layer_7/attn": 1.1431324481964111, "eoc/jacobian_sigma/layer_7/mlp": 1.7049248218536377, "eoc/jacobian_sigma/layer_7": 1.7049248218536377, "eoc/jacobian_sigma/layer_14/attn": 1.6210813522338867, "eoc/jacobian_sigma/layer_14/mlp": 6.472417831420898, "eoc/jacobian_sigma/layer_14": 6.472417831420898, "eoc/jacobian_sigma/layer_21/attn": 1.0740317106246948, "eoc/jacobian_sigma/layer_21/mlp": 3.9566490650177, "eoc/jacobian_sigma/layer_21": 3.9566490650177, "eoc/jacobian_sigma/layer_27/attn": 3.5389082431793213, "eoc/jacobian_sigma/layer_27/mlp": 21.21991729736328, "eoc/jacobian_sigma/layer_27": 21.21991729736328, "eoc/layer0_sigma": 5298.091796875, "eoc/sigma_max": 21.21991729736328, "eoc/sigma_min": 1.7049248218536377, "eoc/sigma_mean": 8.33847725391388, "eoc/time_s": 0.6370766162872314} {"step": 21510, "timestamp": 1778217788.1231382, "train/loss": 2.2466714859008787, "train/z_loss": 0.0015503791044466197, "train/perplexity": 9.456208272935983, "train/grad_norm": 0.171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1144386.379345111, "perf/iters_per_sec": 0.5456859490132862, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.832555890083313, "data/tokens_consumed": 45111836672, "data/tokens_consumed_B": 45.111836672, "train/loss_slope": 7.4664588450956284e-06} {"step": 21520, "timestamp": 1778217798.483716, "train/loss": 2.2501562356948854, "train/z_loss": 0.0015479588881134986, "train/perplexity": 9.489218275161722, "train/grad_norm": 0.1396484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025463.934725118, "perf/iters_per_sec": 0.9658164666772452, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0353934049606324, "data/tokens_consumed": 45132808192, "data/tokens_consumed_B": 45.132808192, "train/loss_slope": 6.507652470416737e-06} {"step": 21525, "timestamp": 1778217804.290929, "eos/sharpness": 9.248924255371092, "eos/L0_probe": 2.04304838180542, "eos/L_plus": 2.0936195850372314, "eos/L_minus": 2.0849664211273193, "eos/grad_norm": 0.11874988675117493, "eos/embed_grad_frac": 0.2571862041950226, "eos/time_s": 0.6394164562225342} {"step": 21525, "timestamp": 1778217805.670322, "geo/rankme_last": 439.9580993652344, "geo/layer_0/stable_rank_q_proj": 17.316476821899414, "geo/layer_0/stable_rank_k_proj": 15.23712158203125, "geo/layer_0/stable_rank_o_proj": 51.7220344543457, "geo/layer_0/stable_rank_gate_proj": 151.78749084472656, "geo/layer_0/stable_rank_down_proj": 49.959232330322266, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0511133186519146, "geo/layer_0/attn_entropy_mean": 6.250300407409668, "geo/layer_0/attn_entropy_std": 0.3241063058376312, "geo/layer_7/stable_rank_q_proj": 42.16794967651367, "geo/layer_7/stable_rank_k_proj": 41.945316314697266, "geo/layer_7/stable_rank_o_proj": 110.03854370117188, "geo/layer_7/stable_rank_gate_proj": 103.21257019042969, "geo/layer_7/stable_rank_down_proj": 150.7891845703125, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5531174540519714, "geo/layer_7/attn_entropy_mean": 4.66168212890625, "geo/layer_7/attn_entropy_std": 0.8343993425369263, "geo/layer_14/stable_rank_q_proj": 57.743717193603516, "geo/layer_14/stable_rank_k_proj": 35.15317916870117, "geo/layer_14/stable_rank_o_proj": 53.603492736816406, "geo/layer_14/stable_rank_gate_proj": 86.96923828125, "geo/layer_14/stable_rank_down_proj": 135.882568359375, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3748668134212494, "geo/layer_14/attn_entropy_mean": 5.482501029968262, "geo/layer_14/attn_entropy_std": 0.500089168548584, "geo/layer_21/stable_rank_q_proj": 47.335052490234375, "geo/layer_21/stable_rank_k_proj": 31.986143112182617, "geo/layer_21/stable_rank_o_proj": 83.41851806640625, "geo/layer_21/stable_rank_gate_proj": 86.61856079101562, "geo/layer_21/stable_rank_down_proj": 60.54747772216797, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15464653074741364, "geo/layer_21/attn_entropy_mean": 5.752216339111328, "geo/layer_21/attn_entropy_std": 0.29737144708633423, "geo/layer_27/stable_rank_q_proj": 41.47345733642578, "geo/layer_27/stable_rank_k_proj": 31.655309677124023, "geo/layer_27/stable_rank_o_proj": 118.85047149658203, "geo/layer_27/stable_rank_gate_proj": 91.14375305175781, "geo/layer_27/stable_rank_down_proj": 137.8839111328125, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08085017651319504, "geo/layer_27/attn_entropy_mean": 4.39328670501709, "geo/layer_27/attn_entropy_std": 0.5532315373420715, "attnres/final_alpha/block_0": 0.2416001558303833, "attnres/block_norm/0": 1.6237010955810547, "attnres/final_alpha/block_1": 0.006800292059779167, "attnres/block_norm/1": 29829.8046875, "attnres/final_alpha/block_2": 0.013885058462619781, "attnres/block_norm/2": 21441.091796875, "attnres/final_alpha/block_3": 0.015654228627681732, "attnres/block_norm/3": 30950.8046875, "attnres/final_alpha/block_4": 0.02052238956093788, "attnres/block_norm/4": 9496.4091796875, "attnres/final_alpha/block_5": 0.5664677023887634, "attnres/block_norm/5": 5089.681640625, "attnres/final_alpha/block_6": 0.13507016003131866, "attnres/block_norm/6": 20797.21484375, "geo/tier1_time_s": 1.3604211807250977, "geo/step": 21525.0, "geo/rankme_slope": -0.00012269313975590235} {"step": 21530, "timestamp": 1778217810.8560793, "train/loss": 2.1982110023498533, "train/z_loss": 0.00155590366339311, "train/perplexity": 9.008882205217184, "train/grad_norm": 0.1787109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1695884.999045159, "perf/iters_per_sec": 0.808660983584003, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2366121530532836, "data/tokens_consumed": 45153779712, "data/tokens_consumed_B": 45.153779712, "train/loss_slope": 6.542040069218416e-06} {"step": 21540, "timestamp": 1778217821.2123325, "train/loss": 2.187842869758606, "train/z_loss": 0.0015595320728607476, "train/perplexity": 8.915959470051233, "train/grad_norm": 0.2294921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026293.1147601407, "perf/iters_per_sec": 0.9662118505287841, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349697113037108, "data/tokens_consumed": 45174751232, "data/tokens_consumed_B": 45.174751232, "train/loss_slope": 6.72858546097072e-06} {"step": 21550, "timestamp": 1778217831.5755162, "grad/layer_0/attn": 0.0026595196686685085, "grad/layer_0/mlp": 0.00271544698625803, "grad/layer_0/attn_mlp_ratio": 0.9794039744421641, "grad/layer_4/attn": 0.001770959934219718, "grad/layer_4/mlp": 0.0026532637421041727, "grad/layer_4/attn_mlp_ratio": 0.6674646923975932, "grad/layer_8/attn": 0.00786867830902338, "grad/layer_8/mlp": 0.00395362451672554, "grad/layer_8/attn_mlp_ratio": 1.9902441612022226, "grad/layer_12/attn": 0.004246653523296118, "grad/layer_12/mlp": 0.0063369725830852985, "grad/layer_12/attn_mlp_ratio": 0.670139155662028, "grad/layer_16/attn": 0.0038585085421800613, "grad/layer_16/mlp": 0.004899629857391119, "grad/layer_16/attn_mlp_ratio": 0.7875101947973602, "grad/layer_20/attn": 0.006534246727824211, "grad/layer_20/mlp": 0.006862138397991657, "grad/layer_20/attn_mlp_ratio": 0.9522172613882086, "grad/layer_24/attn": 0.011159930378198624, "grad/layer_24/mlp": 0.01136383693665266, "grad/layer_24/attn_mlp_ratio": 0.9820565309238103, "grad/layer_27/attn": 0.007003091741353273, "grad/layer_27/mlp": 0.010895896703004837, "grad/layer_27/attn_mlp_ratio": 0.6427274292302385} {"step": 21550, "timestamp": 1778217831.5898526, "train/loss": 2.2211031198501585, "train/z_loss": 0.0015520644141361118, "train/perplexity": 9.217493259465897, "train/grad_norm": 0.1669921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022270.8201702046, "perf/iters_per_sec": 0.9642938710070632, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0370282649993896, "data/tokens_consumed": 45195722752, "data/tokens_consumed_B": 45.195722752, "train/loss_slope": 6.636707910788931e-06} {"step": 21560, "timestamp": 1778217841.9373481, "train/loss": 2.1935651302337646, "train/z_loss": 0.0015631959424354136, "train/perplexity": 8.967125164625518, "train/grad_norm": 0.263671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027947.4818357562, "perf/iters_per_sec": 0.9670007142237455, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341253995895385, "data/tokens_consumed": 45216694272, "data/tokens_consumed_B": 45.216694272, "train/loss_slope": 3.150164955841883e-06} {"step": 21570, "timestamp": 1778217852.2982357, "train/loss": 2.194607210159302, "train/z_loss": 0.0015583503874950111, "train/perplexity": 8.976474496278712, "train/grad_norm": 0.1572265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025303.4123187938, "perf/iters_per_sec": 0.9657399236291856, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354754686355592, "data/tokens_consumed": 45237665792, "data/tokens_consumed_B": 45.237665792, "train/loss_slope": 2.4091950487239163e-06} {"step": 21580, "timestamp": 1778217862.6491387, "train/loss": 2.165265989303589, "train/z_loss": 0.0015631291549652814, "train/perplexity": 8.716920216053888, "train/grad_norm": 0.15625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027142.0812011617, "perf/iters_per_sec": 0.9666166692739304, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345362663269042, "data/tokens_consumed": 45258637312, "data/tokens_consumed_B": 45.258637312, "train/loss_slope": -1.391822441254447e-06} {"step": 21590, "timestamp": 1778217872.9989192, "train/loss": 2.220952010154724, "train/z_loss": 0.001541818433906883, "train/perplexity": 9.216100512098258, "train/grad_norm": 0.251953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027434.1069466174, "perf/iters_per_sec": 0.9667559180005156, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343872547149657, "data/tokens_consumed": 45279608832, "data/tokens_consumed_B": 45.279608832, "train/loss_slope": 9.126555908917638e-08} {"step": 21600, "timestamp": 1778217883.3409755, "grad/layer_0/attn": 0.0025900080800056458, "grad/layer_0/mlp": 0.0025326302275061607, "grad/layer_0/attn_mlp_ratio": 1.0226553997542878, "grad/layer_4/attn": 0.0017328251851722598, "grad/layer_4/mlp": 0.0025645059067755938, "grad/layer_4/attn_mlp_ratio": 0.6756955065006756, "grad/layer_8/attn": 0.003846155945211649, "grad/layer_8/mlp": 0.0038864861708134413, "grad/layer_8/attn_mlp_ratio": 0.9896229337268825, "grad/layer_12/attn": 0.004351322539150715, "grad/layer_12/mlp": 0.00646787416189909, "grad/layer_12/attn_mlp_ratio": 0.672759296633742, "grad/layer_16/attn": 0.007078982423990965, "grad/layer_16/mlp": 0.004558179527521133, "grad/layer_16/attn_mlp_ratio": 1.5530284022265959, "grad/layer_20/attn": 0.0040161143988370895, "grad/layer_20/mlp": 0.005944706965237856, "grad/layer_20/attn_mlp_ratio": 0.6755781832079895, "grad/layer_24/attn": 0.006247739773243666, "grad/layer_24/mlp": 0.008731970563530922, "grad/layer_24/attn_mlp_ratio": 0.7155016907394516, "grad/layer_27/attn": 0.006604575552046299, "grad/layer_27/mlp": 0.0070089190267026424, "grad/layer_27/attn_mlp_ratio": 0.9423101383612957} {"step": 21600, "timestamp": 1778217883.9245768, "eos/sharpness": 13.952922821044918, "eos/L0_probe": 2.046125888824463, "eos/L_plus": 2.123865842819214, "eos/L_minus": 2.107915163040161, "eos/grad_norm": 0.10457999259233475, "eos/embed_grad_frac": 0.21074511110782623, "eos/time_s": 0.5806968212127686} {"step": 21600, "timestamp": 1778217883.9435253, "train/loss": 2.2263274431228637, "train/z_loss": 0.001551785529591143, "train/perplexity": 9.265774432431424, "train/grad_norm": 0.1044921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1917166.0971479425, "perf/iters_per_sec": 0.9141760335674012, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0938812255859376, "data/tokens_consumed": 45300580352, "data/tokens_consumed_B": 45.300580352, "train/loss_slope": -6.356997994950179e-07} {"step": 21600, "timestamp": 1778217885.3074389, "geo/rankme_last": 440.7179870605469, "geo/layer_0/stable_rank_q_proj": 17.34413719177246, "geo/layer_0/stable_rank_k_proj": 15.311838150024414, "geo/layer_0/stable_rank_o_proj": 51.89739990234375, "geo/layer_0/stable_rank_gate_proj": 151.73587036132812, "geo/layer_0/stable_rank_down_proj": 50.09493637084961, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.04598071426153183, "geo/layer_0/attn_entropy_mean": 6.2489166259765625, "geo/layer_0/attn_entropy_std": 0.3255254924297333, "geo/layer_7/stable_rank_q_proj": 42.14897155761719, "geo/layer_7/stable_rank_k_proj": 42.09410095214844, "geo/layer_7/stable_rank_o_proj": 109.98997497558594, "geo/layer_7/stable_rank_gate_proj": 102.88468170166016, "geo/layer_7/stable_rank_down_proj": 150.83929443359375, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.551700234413147, "geo/layer_7/attn_entropy_mean": 4.64744758605957, "geo/layer_7/attn_entropy_std": 0.8440642356872559, "geo/layer_14/stable_rank_q_proj": 57.88774490356445, "geo/layer_14/stable_rank_k_proj": 35.1495246887207, "geo/layer_14/stable_rank_o_proj": 53.51216125488281, "geo/layer_14/stable_rank_gate_proj": 86.6980209350586, "geo/layer_14/stable_rank_down_proj": 135.971923828125, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37246209383010864, "geo/layer_14/attn_entropy_mean": 5.4950361251831055, "geo/layer_14/attn_entropy_std": 0.4739511013031006, "geo/layer_21/stable_rank_q_proj": 47.194557189941406, "geo/layer_21/stable_rank_k_proj": 32.11750793457031, "geo/layer_21/stable_rank_o_proj": 83.44042205810547, "geo/layer_21/stable_rank_gate_proj": 86.30091094970703, "geo/layer_21/stable_rank_down_proj": 60.52739715576172, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1543651819229126, "geo/layer_21/attn_entropy_mean": 5.7692670822143555, "geo/layer_21/attn_entropy_std": 0.29387030005455017, "geo/layer_27/stable_rank_q_proj": 41.39393615722656, "geo/layer_27/stable_rank_k_proj": 31.70895767211914, "geo/layer_27/stable_rank_o_proj": 119.01173400878906, "geo/layer_27/stable_rank_gate_proj": 91.15149688720703, "geo/layer_27/stable_rank_down_proj": 138.07553100585938, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07950423657894135, "geo/layer_27/attn_entropy_mean": 4.402667045593262, "geo/layer_27/attn_entropy_std": 0.5564547181129456, "attnres/final_alpha/block_0": 0.24255329370498657, "attnres/block_norm/0": 1.6244838237762451, "attnres/final_alpha/block_1": 0.006737882271409035, "attnres/block_norm/1": 29958.634765625, "attnres/final_alpha/block_2": 0.013824926689267159, "attnres/block_norm/2": 21416.78125, "attnres/final_alpha/block_3": 0.015578129328787327, "attnres/block_norm/3": 30697.669921875, "attnres/final_alpha/block_4": 0.020803948864340782, "attnres/block_norm/4": 9546.357421875, "attnres/final_alpha/block_5": 0.5658012628555298, "attnres/block_norm/5": 5110.5361328125, "attnres/final_alpha/block_6": 0.13470056653022766, "attnres/block_norm/6": 20770.849609375, "geo/tier1_time_s": 1.359776258468628, "geo/step": 21600.0, "geo/rankme_slope": -0.00011518828234418767} {"step": 21610, "timestamp": 1778217896.085161, "train/loss": 2.251994490623474, "train/z_loss": 0.0015398477204144, "train/perplexity": 9.506677920147105, "train/grad_norm": 0.2236328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1727835.1399898008, "perf/iters_per_sec": 0.8238959979962353, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2137454271316528, "data/tokens_consumed": 45321551872, "data/tokens_consumed_B": 45.321551872, "train/loss_slope": 4.7935388555335104e-06} {"step": 21620, "timestamp": 1778217906.474742, "train/loss": 2.176878476142883, "train/z_loss": 0.001553645369131118, "train/perplexity": 8.818735356688517, "train/grad_norm": 0.11962890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019831.6209856817, "perf/iters_per_sec": 0.9631307701996239, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0382806062698364, "data/tokens_consumed": 45342523392, "data/tokens_consumed_B": 45.342523392, "train/loss_slope": 3.8853986249684e-06} {"step": 21630, "timestamp": 1778217916.8766575, "train/loss": 2.196469473838806, "train/z_loss": 0.001539742830209434, "train/perplexity": 8.993206633693395, "train/grad_norm": 0.1689453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019018.610377623, "perf/iters_per_sec": 0.9627430965316882, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0386986970901488, "data/tokens_consumed": 45363494912, "data/tokens_consumed_B": 45.363494912, "train/loss_slope": 1.3400296137706884e-06} {"step": 21640, "timestamp": 1778217927.2603545, "train/loss": 2.2488454818725585, "train/z_loss": 0.0015487184748053551, "train/perplexity": 9.476788394073356, "train/grad_norm": 0.1455078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020711.481452323, "perf/iters_per_sec": 0.9635503203641524, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037828516960144, "data/tokens_consumed": 45384466432, "data/tokens_consumed_B": 45.384466432, "train/loss_slope": -4.989262210904417e-07} {"step": 21650, "timestamp": 1778217937.6341321, "grad/layer_0/attn": 0.0035846740938723087, "grad/layer_0/mlp": 0.002998472424224019, "grad/layer_0/attn_mlp_ratio": 1.1955000637532918, "grad/layer_4/attn": 0.0018141320906579494, "grad/layer_4/mlp": 0.0027343870606273413, "grad/layer_4/attn_mlp_ratio": 0.6634510711503411, "grad/layer_8/attn": 0.008145873434841633, "grad/layer_8/mlp": 0.004205246921628714, "grad/layer_8/attn_mlp_ratio": 1.937073706477936, "grad/layer_12/attn": 0.004921485669910908, "grad/layer_12/mlp": 0.006419188342988491, "grad/layer_12/attn_mlp_ratio": 0.7666834699776599, "grad/layer_16/attn": 0.004982009995728731, "grad/layer_16/mlp": 0.005061813164502382, "grad/layer_16/attn_mlp_ratio": 0.9842342527067727, "grad/layer_20/attn": 0.005490146577358246, "grad/layer_20/mlp": 0.007213828153908253, "grad/layer_20/attn_mlp_ratio": 0.7610586756600196, "grad/layer_24/attn": 0.019333969801664352, "grad/layer_24/mlp": 0.01406911388039589, "grad/layer_24/attn_mlp_ratio": 1.3742137442773292, "grad/layer_27/attn": 0.00846158992499113, "grad/layer_27/mlp": 0.012934939004480839, "grad/layer_27/attn_mlp_ratio": 0.654165424100058} {"step": 21650, "timestamp": 1778217937.6493895, "train/loss": 2.242171573638916, "train/z_loss": 0.0015445457771420478, "train/perplexity": 9.41375176230586, "train/grad_norm": 0.23046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019904.9520785145, "perf/iters_per_sec": 0.9631657371895382, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0382429122924806, "data/tokens_consumed": 45405437952, "data/tokens_consumed_B": 45.405437952, "train/loss_slope": 4.032013117998661e-07} {"step": 21660, "timestamp": 1778217948.0324411, "train/loss": 2.212565851211548, "train/z_loss": 0.0015435686334967613, "train/perplexity": 9.139135997753321, "train/grad_norm": 0.1357421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021053.6655549533, "perf/iters_per_sec": 0.9637134864592329, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0376528024673461, "data/tokens_consumed": 45426409472, "data/tokens_consumed_B": 45.426409472, "train/loss_slope": 2.1496411239710952e-06} {"step": 21670, "timestamp": 1778217958.412408, "train/loss": 2.19919376373291, "train/z_loss": 0.0015596651355735958, "train/perplexity": 9.017740138657505, "train/grad_norm": 0.1220703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021546.0660817022, "perf/iters_per_sec": 0.9639482813271056, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0374000549316407, "data/tokens_consumed": 45447380992, "data/tokens_consumed_B": 45.447380992, "train/loss_slope": 8.697209185105886e-07} {"step": 21675, "timestamp": 1778217964.1793904, "eos/sharpness": 20.821762084960934, "eos/L0_probe": 2.048159122467041, "eos/L_plus": 2.1379005908966064, "eos/L_minus": 2.166635274887085, "eos/grad_norm": 0.09487984329462051, "eos/embed_grad_frac": 0.2501104176044464, "eos/time_s": 0.5856273174285889} {"step": 21675, "timestamp": 1778217965.5563293, "geo/rankme_last": 440.7513732910156, "geo/layer_0/stable_rank_q_proj": 17.371850967407227, "geo/layer_0/stable_rank_k_proj": 15.323799133300781, "geo/layer_0/stable_rank_o_proj": 51.94350051879883, "geo/layer_0/stable_rank_gate_proj": 151.2969970703125, "geo/layer_0/stable_rank_down_proj": 50.09761047363281, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.048829853534698486, "geo/layer_0/attn_entropy_mean": 6.256153106689453, "geo/layer_0/attn_entropy_std": 0.32393044233322144, "geo/layer_7/stable_rank_q_proj": 42.19581604003906, "geo/layer_7/stable_rank_k_proj": 41.97107696533203, "geo/layer_7/stable_rank_o_proj": 110.32984924316406, "geo/layer_7/stable_rank_gate_proj": 102.81551361083984, "geo/layer_7/stable_rank_down_proj": 150.53176879882812, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5380372405052185, "geo/layer_7/attn_entropy_mean": 4.631728172302246, "geo/layer_7/attn_entropy_std": 0.8346587419509888, "geo/layer_14/stable_rank_q_proj": 57.782772064208984, "geo/layer_14/stable_rank_k_proj": 35.0952262878418, "geo/layer_14/stable_rank_o_proj": 53.546573638916016, "geo/layer_14/stable_rank_gate_proj": 86.52469635009766, "geo/layer_14/stable_rank_down_proj": 136.06784057617188, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3718530535697937, "geo/layer_14/attn_entropy_mean": 5.548104286193848, "geo/layer_14/attn_entropy_std": 0.4498167037963867, "geo/layer_21/stable_rank_q_proj": 47.118446350097656, "geo/layer_21/stable_rank_k_proj": 32.11935043334961, "geo/layer_21/stable_rank_o_proj": 83.3414535522461, "geo/layer_21/stable_rank_gate_proj": 86.3567123413086, "geo/layer_21/stable_rank_down_proj": 60.45672607421875, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1518949419260025, "geo/layer_21/attn_entropy_mean": 5.748772621154785, "geo/layer_21/attn_entropy_std": 0.2850508689880371, "geo/layer_27/stable_rank_q_proj": 41.483795166015625, "geo/layer_27/stable_rank_k_proj": 31.757728576660156, "geo/layer_27/stable_rank_o_proj": 119.53022766113281, "geo/layer_27/stable_rank_gate_proj": 91.27691650390625, "geo/layer_27/stable_rank_down_proj": 137.9075927734375, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08042153716087341, "geo/layer_27/attn_entropy_mean": 4.393989086151123, "geo/layer_27/attn_entropy_std": 0.5503169894218445, "attnres/final_alpha/block_0": 0.24290412664413452, "attnres/block_norm/0": 1.625171184539795, "attnres/final_alpha/block_1": 0.006720046512782574, "attnres/block_norm/1": 30055.263671875, "attnres/final_alpha/block_2": 0.013890222646296024, "attnres/block_norm/2": 21557.87109375, "attnres/final_alpha/block_3": 0.01603761874139309, "attnres/block_norm/3": 30907.533203125, "attnres/final_alpha/block_4": 0.021030789241194725, "attnres/block_norm/4": 9555.76953125, "attnres/final_alpha/block_5": 0.5654699802398682, "attnres/block_norm/5": 5131.3017578125, "attnres/final_alpha/block_6": 0.1339472532272339, "attnres/block_norm/6": 20814.91796875, "geo/tier1_time_s": 1.355762243270874, "geo/step": 21675.0, "geo/rankme_slope": -9.921036774084634e-05} {"step": 21680, "timestamp": 1778217970.7465, "train/loss": 2.2418493032455444, "train/z_loss": 0.0015518645872361959, "train/perplexity": 9.410718477617499, "train/grad_norm": 0.16796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1701113.368899687, "perf/iters_per_sec": 0.811154064607471, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2328114271163941, "data/tokens_consumed": 45468352512, "data/tokens_consumed_B": 45.468352512, "train/loss_slope": 3.6724719634972838e-06} {"step": 21690, "timestamp": 1778217981.1270137, "train/loss": 2.2534008502960203, "train/z_loss": 0.0015408178092911839, "train/perplexity": 9.520057134382391, "train/grad_norm": 0.17578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021356.7136157572, "perf/iters_per_sec": 0.9638579910353456, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0374972343444824, "data/tokens_consumed": 45489324032, "data/tokens_consumed_B": 45.489324032, "train/loss_slope": 4.470051010914642e-06} {"step": 21700, "timestamp": 1778217991.4713573, "grad/layer_0/attn": 0.0026021883822977543, "grad/layer_0/mlp": 0.0026522548869252205, "grad/layer_0/attn_mlp_ratio": 0.9811230048112723, "grad/layer_4/attn": 0.002764677396044135, "grad/layer_4/mlp": 0.0025486440863460302, "grad/layer_4/attn_mlp_ratio": 1.0847639740594115, "grad/layer_8/attn": 0.005768324248492718, "grad/layer_8/mlp": 0.0036247032694518566, "grad/layer_8/attn_mlp_ratio": 1.5913920838617557, "grad/layer_12/attn": 0.005896182730793953, "grad/layer_12/mlp": 0.0064721484668552876, "grad/layer_12/attn_mlp_ratio": 0.9110085576510203, "grad/layer_16/attn": 0.003928475547581911, "grad/layer_16/mlp": 0.004647738300263882, "grad/layer_16/attn_mlp_ratio": 0.8452445489098239, "grad/layer_20/attn": 0.0035859481431543827, "grad/layer_20/mlp": 0.00597445759922266, "grad/layer_20/attn_mlp_ratio": 0.6002131613754589, "grad/layer_24/attn": 0.008546256460249424, "grad/layer_24/mlp": 0.011658255010843277, "grad/layer_24/attn_mlp_ratio": 0.7330648007779997, "grad/layer_27/attn": 0.010710041970014572, "grad/layer_27/mlp": 0.008566112257540226, "grad/layer_27/attn_mlp_ratio": 1.250280351574793} {"step": 21700, "timestamp": 1778217991.4857168, "train/loss": 2.196958541870117, "train/z_loss": 0.0015525478986091912, "train/perplexity": 8.997605999263751, "train/grad_norm": 0.150390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025954.8471604134, "perf/iters_per_sec": 0.966050551967818, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351425170898438, "data/tokens_consumed": 45510295552, "data/tokens_consumed_B": 45.510295552, "train/loss_slope": 5.174065408783862e-06} {"step": 21710, "timestamp": 1778218001.8328135, "train/loss": 2.2407178401947023, "train/z_loss": 0.00154198327800259, "train/perplexity": 9.400076618948452, "train/grad_norm": 0.310546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028010.3218565441, "perf/iters_per_sec": 0.9670306786806794, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340933561325074, "data/tokens_consumed": 45531267072, "data/tokens_consumed_B": 45.531267072, "train/loss_slope": 4.944391963076254e-06} {"step": 21720, "timestamp": 1778218012.1920002, "train/loss": 2.2224607944488524, "train/z_loss": 0.0015442136907950043, "train/perplexity": 9.230016114985734, "train/grad_norm": 0.271484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025797.0929124223, "perf/iters_per_sec": 0.9659753288805114, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035223126411438, "data/tokens_consumed": 45552238592, "data/tokens_consumed_B": 45.552238592, "train/loss_slope": 2.6273147429640956e-06} {"step": 21730, "timestamp": 1778218022.5454757, "train/loss": 2.1985599994659424, "train/z_loss": 0.0015547727816738188, "train/perplexity": 9.012026827826183, "train/grad_norm": 0.1005859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026710.1308463034, "perf/iters_per_sec": 0.966410699294235, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347567558288575, "data/tokens_consumed": 45573210112, "data/tokens_consumed_B": 45.573210112, "train/loss_slope": -4.0041365281555824e-07} {"step": 21740, "timestamp": 1778218032.895561, "train/loss": 2.2166236877441405, "train/z_loss": 0.0015465645585209132, "train/perplexity": 9.176296462236056, "train/grad_norm": 0.1845703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027190.6684505264, "perf/iters_per_sec": 0.9666398374798424, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345114707946776, "data/tokens_consumed": 45594181632, "data/tokens_consumed_B": 45.594181632, "train/loss_slope": -2.3335555277177818e-06} {"step": 21750, "timestamp": 1778218043.2488427, "grad/layer_0/attn": 0.003062380477786064, "grad/layer_0/mlp": 0.002740577096119523, "grad/layer_0/attn_mlp_ratio": 1.1174217176302106, "grad/layer_4/attn": 0.0019236757652834058, "grad/layer_4/mlp": 0.0024283023085445166, "grad/layer_4/attn_mlp_ratio": 0.7921895388789012, "grad/layer_8/attn": 0.003964735195040703, "grad/layer_8/mlp": 0.003849797183647752, "grad/layer_8/attn_mlp_ratio": 1.0298555749626497, "grad/layer_12/attn": 0.0050328499637544155, "grad/layer_12/mlp": 0.006474857684224844, "grad/layer_12/attn_mlp_ratio": 0.7772911979651979, "grad/layer_16/attn": 0.004963827319443226, "grad/layer_16/mlp": 0.005287536419928074, "grad/layer_16/attn_mlp_ratio": 0.9387788246445528, "grad/layer_20/attn": 0.004843893926590681, "grad/layer_20/mlp": 0.006389536429196596, "grad/layer_20/attn_mlp_ratio": 0.758097853335184, "grad/layer_24/attn": 0.008854296989738941, "grad/layer_24/mlp": 0.010299773886799812, "grad/layer_24/attn_mlp_ratio": 0.8596593479708008, "grad/layer_27/attn": 0.008606583811342716, "grad/layer_27/mlp": 0.009368712082505226, "grad/layer_27/attn_mlp_ratio": 0.91865174675921} {"step": 21750, "timestamp": 1778218043.8515918, "eos/sharpness": 62.09371089935301, "eos/L0_probe": 2.049852132797241, "eos/L_plus": 2.4664485454559326, "eos/L_minus": 2.25419282913208, "eos/grad_norm": 0.194923996925354, "eos/embed_grad_frac": 0.06007566303014755, "eos/time_s": 0.5999677181243896} {"step": 21750, "timestamp": 1778218043.8701775, "train/loss": 2.2109840631484987, "train/z_loss": 0.0015495568397454918, "train/perplexity": 9.124691248803016, "train/grad_norm": 0.1953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1911885.8644483346, "perf/iters_per_sec": 0.9116582224122689, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0969022989273072, "data/tokens_consumed": 45615153152, "data/tokens_consumed_B": 45.615153152, "train/loss_slope": -4.48267256001121e-06} {"step": 21750, "timestamp": 1778218045.23248, "geo/rankme_last": 440.47406005859375, "geo/layer_0/stable_rank_q_proj": 17.392892837524414, "geo/layer_0/stable_rank_k_proj": 15.302608489990234, "geo/layer_0/stable_rank_o_proj": 51.97552490234375, "geo/layer_0/stable_rank_gate_proj": 151.30435180664062, "geo/layer_0/stable_rank_down_proj": 50.111114501953125, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.048761844635009766, "geo/layer_0/attn_entropy_mean": 6.251881122589111, "geo/layer_0/attn_entropy_std": 0.3245700001716614, "geo/layer_7/stable_rank_q_proj": 42.19401168823242, "geo/layer_7/stable_rank_k_proj": 42.04264831542969, "geo/layer_7/stable_rank_o_proj": 110.1242904663086, "geo/layer_7/stable_rank_gate_proj": 102.63627624511719, "geo/layer_7/stable_rank_down_proj": 150.9216766357422, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5535336136817932, "geo/layer_7/attn_entropy_mean": 4.615259647369385, "geo/layer_7/attn_entropy_std": 0.8197893500328064, "geo/layer_14/stable_rank_q_proj": 57.68905258178711, "geo/layer_14/stable_rank_k_proj": 35.00911331176758, "geo/layer_14/stable_rank_o_proj": 53.54196548461914, "geo/layer_14/stable_rank_gate_proj": 86.26495361328125, "geo/layer_14/stable_rank_down_proj": 136.180419921875, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.36566513776779175, "geo/layer_14/attn_entropy_mean": 5.514213562011719, "geo/layer_14/attn_entropy_std": 0.45730850100517273, "geo/layer_21/stable_rank_q_proj": 47.19941329956055, "geo/layer_21/stable_rank_k_proj": 32.01944351196289, "geo/layer_21/stable_rank_o_proj": 83.31877899169922, "geo/layer_21/stable_rank_gate_proj": 86.22924041748047, "geo/layer_21/stable_rank_down_proj": 60.47404861450195, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15155133605003357, "geo/layer_21/attn_entropy_mean": 5.727704048156738, "geo/layer_21/attn_entropy_std": 0.2915765941143036, "geo/layer_27/stable_rank_q_proj": 41.406463623046875, "geo/layer_27/stable_rank_k_proj": 31.709936141967773, "geo/layer_27/stable_rank_o_proj": 119.58349609375, "geo/layer_27/stable_rank_gate_proj": 91.33534240722656, "geo/layer_27/stable_rank_down_proj": 137.98162841796875, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07576242834329605, "geo/layer_27/attn_entropy_mean": 4.384971618652344, "geo/layer_27/attn_entropy_std": 0.5473691821098328, "attnres/final_alpha/block_0": 0.24169835448265076, "attnres/block_norm/0": 1.6258244514465332, "attnres/final_alpha/block_1": 0.0066406638361513615, "attnres/block_norm/1": 30115.333984375, "attnres/final_alpha/block_2": 0.01384598109871149, "attnres/block_norm/2": 21494.140625, "attnres/final_alpha/block_3": 0.015880778431892395, "attnres/block_norm/3": 30984.46875, "attnres/final_alpha/block_4": 0.02054281160235405, "attnres/block_norm/4": 9625.2412109375, "attnres/final_alpha/block_5": 0.5689536929130554, "attnres/block_norm/5": 5103.3896484375, "attnres/final_alpha/block_6": 0.13243772089481354, "attnres/block_norm/6": 20914.31640625, "geo/tier1_time_s": 1.3583083152770996, "geo/step": 21750.0, "geo/rankme_slope": -0.0001036944856067427} {"step": 21760, "timestamp": 1778218055.5842571, "train/loss": 2.2014277458190916, "train/z_loss": 0.0015496659674681724, "train/perplexity": 9.037908127660524, "train/grad_norm": 0.2138671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1790866.6931916818, "perf/iters_per_sec": 0.8539517847021493, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1710263013839721, "data/tokens_consumed": 45636124672, "data/tokens_consumed_B": 45.636124672, "train/loss_slope": -5.017737515366926e-06} {"step": 21770, "timestamp": 1778218065.9330466, "train/loss": 2.159760618209839, "train/z_loss": 0.001548999932128936, "train/perplexity": 8.669062194432096, "train/grad_norm": 0.10986328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027624.7867473485, "perf/iters_per_sec": 0.966846841214823, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342899799346923, "data/tokens_consumed": 45657096192, "data/tokens_consumed_B": 45.657096192, "train/loss_slope": -9.981481112627053e-06} {"step": 21780, "timestamp": 1778218076.28677, "train/loss": 2.212842655181885, "train/z_loss": 0.0015452086459845304, "train/perplexity": 9.141666097037557, "train/grad_norm": 0.171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026648.7255180094, "perf/iters_per_sec": 0.9663814189519927, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347881078720094, "data/tokens_consumed": 45678067712, "data/tokens_consumed_B": 45.678067712, "train/loss_slope": -1.1404950991429498e-05} {"step": 21790, "timestamp": 1778218086.640528, "train/loss": 2.1875783681869505, "train/z_loss": 0.0015599498758092523, "train/perplexity": 8.913601496616176, "train/grad_norm": 0.154296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026568.1804439437, "perf/iters_per_sec": 0.966343012067768, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348292350769044, "data/tokens_consumed": 45699039232, "data/tokens_consumed_B": 45.699039232, "train/loss_slope": -1.0855948728303407e-05} {"step": 21800, "timestamp": 1778218096.9710977, "grad/layer_0/attn": 0.002702333265915513, "grad/layer_0/mlp": 0.0026761088520288467, "grad/layer_0/attn_mlp_ratio": 1.0097994193647402, "grad/layer_4/attn": 0.001893559587188065, "grad/layer_4/mlp": 0.002564291935414076, "grad/layer_4/attn_mlp_ratio": 0.7384336733246911, "grad/layer_8/attn": 0.004618830978870392, "grad/layer_8/mlp": 0.003928508143872023, "grad/layer_8/attn_mlp_ratio": 1.1757213405559697, "grad/layer_12/attn": 0.005277987103909254, "grad/layer_12/mlp": 0.006749241147190332, "grad/layer_12/attn_mlp_ratio": 0.7820119196519235, "grad/layer_16/attn": 0.004190600011497736, "grad/layer_16/mlp": 0.0046579595655202866, "grad/layer_16/attn_mlp_ratio": 0.8996642977649427, "grad/layer_20/attn": 0.004614680539816618, "grad/layer_20/mlp": 0.00583664933219552, "grad/layer_20/attn_mlp_ratio": 0.7906386349609411, "grad/layer_24/attn": 0.00487220985814929, "grad/layer_24/mlp": 0.008407914079725742, "grad/layer_24/attn_mlp_ratio": 0.5794790186961942, "grad/layer_27/attn": 0.004365942906588316, "grad/layer_27/mlp": 0.0068942042998969555, "grad/layer_27/attn_mlp_ratio": 0.6332772649812315} {"step": 21800, "timestamp": 1778218096.9857101, "train/loss": 2.192503881454468, "train/z_loss": 0.001555167557671666, "train/perplexity": 8.957613861812684, "train/grad_norm": 0.09423828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028368.5001595744, "perf/iters_per_sec": 0.9672014714048264, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0339107513427734, "data/tokens_consumed": 45720010752, "data/tokens_consumed_B": 45.720010752, "train/loss_slope": -1.1619894346459885e-05} {"step": 21810, "timestamp": 1778218107.3300512, "train/loss": 2.214498257637024, "train/z_loss": 0.0015577167621813714, "train/perplexity": 9.15681359753141, "train/grad_norm": 0.162109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028072.9319703549, "perf/iters_per_sec": 0.9670605335094237, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340614318847656, "data/tokens_consumed": 45740982272, "data/tokens_consumed_B": 45.740982272, "train/loss_slope": -1.4222540141511192e-05} {"step": 21820, "timestamp": 1778218117.673547, "train/loss": 2.256680655479431, "train/z_loss": 0.0015376533032394946, "train/perplexity": 9.551332327352194, "train/grad_norm": 0.294921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028909.491412713, "perf/iters_per_sec": 0.9674594361365857, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0336350679397583, "data/tokens_consumed": 45761953792, "data/tokens_consumed_B": 45.761953792, "train/loss_slope": -7.911967158210369e-06} {"step": 21825, "timestamp": 1778218123.4296784, "eos/sharpness": 24.97472763061523, "eos/L0_probe": 2.0490882396698, "eos/L_plus": 2.1801021099090576, "eos/L_minus": 2.1678216457366943, "eos/grad_norm": 0.12804833054542542, "eos/embed_grad_frac": 0.15162710845470428, "eos/time_s": 0.5885014533996582} {"step": 21825, "timestamp": 1778218124.8105931, "geo/rankme_last": 441.2905578613281, "geo/layer_0/stable_rank_q_proj": 17.42473602294922, "geo/layer_0/stable_rank_k_proj": 15.377994537353516, "geo/layer_0/stable_rank_o_proj": 52.00697326660156, "geo/layer_0/stable_rank_gate_proj": 151.5511016845703, "geo/layer_0/stable_rank_down_proj": 50.14282989501953, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.04739139601588249, "geo/layer_0/attn_entropy_mean": 6.258049011230469, "geo/layer_0/attn_entropy_std": 0.324363112449646, "geo/layer_7/stable_rank_q_proj": 42.21071243286133, "geo/layer_7/stable_rank_k_proj": 41.969444274902344, "geo/layer_7/stable_rank_o_proj": 110.4677963256836, "geo/layer_7/stable_rank_gate_proj": 102.6189956665039, "geo/layer_7/stable_rank_down_proj": 150.61656188964844, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5638976097106934, "geo/layer_7/attn_entropy_mean": 4.647504806518555, "geo/layer_7/attn_entropy_std": 0.8384663462638855, "geo/layer_14/stable_rank_q_proj": 57.80887222290039, "geo/layer_14/stable_rank_k_proj": 34.9909553527832, "geo/layer_14/stable_rank_o_proj": 53.63871765136719, "geo/layer_14/stable_rank_gate_proj": 86.23887634277344, "geo/layer_14/stable_rank_down_proj": 136.02093505859375, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3888944387435913, "geo/layer_14/attn_entropy_mean": 5.480294227600098, "geo/layer_14/attn_entropy_std": 0.4790986478328705, "geo/layer_21/stable_rank_q_proj": 47.0847282409668, "geo/layer_21/stable_rank_k_proj": 32.02730941772461, "geo/layer_21/stable_rank_o_proj": 83.2557144165039, "geo/layer_21/stable_rank_gate_proj": 86.03519439697266, "geo/layer_21/stable_rank_down_proj": 60.43812942504883, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15278427302837372, "geo/layer_21/attn_entropy_mean": 5.75023078918457, "geo/layer_21/attn_entropy_std": 0.2931431829929352, "geo/layer_27/stable_rank_q_proj": 41.41902542114258, "geo/layer_27/stable_rank_k_proj": 31.62151527404785, "geo/layer_27/stable_rank_o_proj": 119.55978393554688, "geo/layer_27/stable_rank_gate_proj": 91.24079895019531, "geo/layer_27/stable_rank_down_proj": 137.9644012451172, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07394948601722717, "geo/layer_27/attn_entropy_mean": 4.372100830078125, "geo/layer_27/attn_entropy_std": 0.5776100754737854, "attnres/final_alpha/block_0": 0.2417096495628357, "attnres/block_norm/0": 1.62662672996521, "attnres/final_alpha/block_1": 0.006677408702671528, "attnres/block_norm/1": 30124.732421875, "attnres/final_alpha/block_2": 0.013602161779999733, "attnres/block_norm/2": 21656.0859375, "attnres/final_alpha/block_3": 0.015750572085380554, "attnres/block_norm/3": 30857.83203125, "attnres/final_alpha/block_4": 0.020558299496769905, "attnres/block_norm/4": 9577.8662109375, "attnres/final_alpha/block_5": 0.5663060545921326, "attnres/block_norm/5": 5116.77783203125, "attnres/final_alpha/block_6": 0.13539589941501617, "attnres/block_norm/6": 20703.58984375, "geo/tier1_time_s": 1.3620848655700684, "geo/step": 21825.0, "geo/rankme_slope": -8.026263239670869e-05} {"step": 21830, "timestamp": 1778218130.0014174, "train/loss": 2.191891884803772, "train/z_loss": 0.0015483885770663619, "train/perplexity": 8.952133509280776, "train/grad_norm": 0.2099609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1702076.8217956412, "perf/iters_per_sec": 0.8116134747484404, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2321135997772217, "data/tokens_consumed": 45782925312, "data/tokens_consumed_B": 45.782925312, "train/loss_slope": -7.416347294214383e-06} {"step": 21840, "timestamp": 1778218140.379424, "train/loss": 2.23763792514801, "train/z_loss": 0.001551797625143081, "train/perplexity": 9.371169719777152, "train/grad_norm": 0.1533203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022207.8704270446, "perf/iters_per_sec": 0.9642638542304252, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037060546875, "data/tokens_consumed": 45803896832, "data/tokens_consumed_B": 45.803896832, "train/loss_slope": -9.30189811679086e-06} {"step": 21850, "timestamp": 1778218150.7445786, "grad/layer_0/attn": 0.002977065509185195, "grad/layer_0/mlp": 0.0027998483274132013, "grad/layer_0/attn_mlp_ratio": 1.0632952412840881, "grad/layer_4/attn": 0.0017907175933942199, "grad/layer_4/mlp": 0.0025293026119470596, "grad/layer_4/attn_mlp_ratio": 0.707988642457005, "grad/layer_8/attn": 0.00581386499106884, "grad/layer_8/mlp": 0.0038898829370737076, "grad/layer_8/attn_mlp_ratio": 1.4946117751248662, "grad/layer_12/attn": 0.00427996413782239, "grad/layer_12/mlp": 0.006332213990390301, "grad/layer_12/attn_mlp_ratio": 0.6759032585960125, "grad/layer_16/attn": 0.005378422327339649, "grad/layer_16/mlp": 0.00439992593601346, "grad/layer_16/attn_mlp_ratio": 1.2223892591187175, "grad/layer_20/attn": 0.00346461427398026, "grad/layer_20/mlp": 0.006200375966727734, "grad/layer_20/attn_mlp_ratio": 0.5587748608623864, "grad/layer_24/attn": 0.009262878447771072, "grad/layer_24/mlp": 0.01021087821573019, "grad/layer_24/attn_mlp_ratio": 0.9071578527677984, "grad/layer_27/attn": 0.0044116065837442875, "grad/layer_27/mlp": 0.008634692057967186, "grad/layer_27/attn_mlp_ratio": 0.5109164870080193} {"step": 21850, "timestamp": 1778218150.7590165, "train/loss": 2.2565571069717407, "train/z_loss": 0.0015336083131842315, "train/perplexity": 9.55015234739058, "train/grad_norm": 0.130859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021511.3611961117, "perf/iters_per_sec": 0.9639317327480849, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0374178647994996, "data/tokens_consumed": 45824868352, "data/tokens_consumed_B": 45.824868352, "train/loss_slope": -8.882172644907426e-06} {"step": 21860, "timestamp": 1778218161.1392446, "train/loss": 2.214138126373291, "train/z_loss": 0.0015525707742199302, "train/perplexity": 9.153516536401796, "train/grad_norm": 0.138671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021468.155975624, "perf/iters_per_sec": 0.963911130893528, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037440037727356, "data/tokens_consumed": 45845839872, "data/tokens_consumed_B": 45.845839872, "train/loss_slope": -8.415328007791726e-06} {"step": 21870, "timestamp": 1778218171.5208862, "train/loss": 2.2113961935043336, "train/z_loss": 0.0015447046724148094, "train/perplexity": 9.128452586081664, "train/grad_norm": 0.138671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021411.7596816865, "perf/iters_per_sec": 0.9638842390449937, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0374689817428588, "data/tokens_consumed": 45866811392, "data/tokens_consumed_B": 45.866811392, "train/loss_slope": -1.0374301394792595e-05} {"step": 21880, "timestamp": 1778218181.8991904, "train/loss": 2.206730771064758, "train/z_loss": 0.0015533563913777471, "train/perplexity": 9.08596368994104, "train/grad_norm": 0.2060546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021769.0977948515, "perf/iters_per_sec": 0.9640546311353929, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037285614013672, "data/tokens_consumed": 45887782912, "data/tokens_consumed_B": 45.887782912, "train/loss_slope": -8.262663992038649e-06} {"step": 21890, "timestamp": 1778218192.284869, "train/loss": 2.2819227457046507, "train/z_loss": 0.0015271008131094276, "train/perplexity": 9.795496563334657, "train/grad_norm": 0.162109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020297.1620380164, "perf/iters_per_sec": 0.9633527574720461, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.038041353225708, "data/tokens_consumed": 45908754432, "data/tokens_consumed_B": 45.908754432, "train/loss_slope": -4.620678790367543e-06} {"step": 21900, "timestamp": 1778218202.6465013, "grad/layer_0/attn": 0.0029854862950742245, "grad/layer_0/mlp": 0.0026927287690341473, "grad/layer_0/attn_mlp_ratio": 1.108721464461844, "grad/layer_4/attn": 0.0019660061225295067, "grad/layer_4/mlp": 0.0025064924266189337, "grad/layer_4/attn_mlp_ratio": 0.7843654436031765, "grad/layer_8/attn": 0.005135091487318277, "grad/layer_8/mlp": 0.0036623175255954266, "grad/layer_8/attn_mlp_ratio": 1.4021425808154486, "grad/layer_12/attn": 0.004262719303369522, "grad/layer_12/mlp": 0.006469267420470715, "grad/layer_12/attn_mlp_ratio": 0.6589183844818586, "grad/layer_16/attn": 0.0052837710827589035, "grad/layer_16/mlp": 0.0046255867928266525, "grad/layer_16/attn_mlp_ratio": 1.1422920388660218, "grad/layer_20/attn": 0.004211418330669403, "grad/layer_20/mlp": 0.006101147271692753, "grad/layer_20/attn_mlp_ratio": 0.6902666128356366, "grad/layer_24/attn": 0.008258762769401073, "grad/layer_24/mlp": 0.009116743691265583, "grad/layer_24/attn_mlp_ratio": 0.9058895323255097, "grad/layer_27/attn": 0.005952141247689724, "grad/layer_27/mlp": 0.00869060680270195, "grad/layer_27/attn_mlp_ratio": 0.6848936230033803} {"step": 21900, "timestamp": 1778218203.2474446, "eos/sharpness": 12.417292594909666, "eos/L0_probe": 2.043069362640381, "eos/L_plus": 2.109508991241455, "eos/L_minus": 2.1008026599884033, "eos/grad_norm": 0.11744179576635361, "eos/embed_grad_frac": 0.17628727853298187, "eos/time_s": 0.5980827808380127} {"step": 21900, "timestamp": 1778218203.2666621, "train/loss": 2.211545300483704, "train/z_loss": 0.0015433188993483782, "train/perplexity": 9.129813803554091, "train/grad_norm": 0.11767578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1910727.9008241047, "perf/iters_per_sec": 0.9111060623283885, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0975670576095582, "data/tokens_consumed": 45929725952, "data/tokens_consumed_B": 45.929725952, "train/loss_slope": -3.7624960864159736e-06} {"step": 21900, "timestamp": 1778218204.6300344, "geo/rankme_last": 440.9704284667969, "geo/layer_0/stable_rank_q_proj": 17.415603637695312, "geo/layer_0/stable_rank_k_proj": 15.379011154174805, "geo/layer_0/stable_rank_o_proj": 51.986087799072266, "geo/layer_0/stable_rank_gate_proj": 151.17164611816406, "geo/layer_0/stable_rank_down_proj": 50.15034484863281, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05298711359500885, "geo/layer_0/attn_entropy_mean": 6.259772300720215, "geo/layer_0/attn_entropy_std": 0.32631561160087585, "geo/layer_7/stable_rank_q_proj": 42.36427307128906, "geo/layer_7/stable_rank_k_proj": 41.78258514404297, "geo/layer_7/stable_rank_o_proj": 110.51640319824219, "geo/layer_7/stable_rank_gate_proj": 102.73796844482422, "geo/layer_7/stable_rank_down_proj": 150.4518280029297, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5564502477645874, "geo/layer_7/attn_entropy_mean": 4.660688400268555, "geo/layer_7/attn_entropy_std": 0.8439396023750305, "geo/layer_14/stable_rank_q_proj": 57.86051559448242, "geo/layer_14/stable_rank_k_proj": 34.95793151855469, "geo/layer_14/stable_rank_o_proj": 53.70872116088867, "geo/layer_14/stable_rank_gate_proj": 86.06839752197266, "geo/layer_14/stable_rank_down_proj": 136.27182006835938, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37709102034568787, "geo/layer_14/attn_entropy_mean": 5.49983024597168, "geo/layer_14/attn_entropy_std": 0.47029823064804077, "geo/layer_21/stable_rank_q_proj": 46.98335266113281, "geo/layer_21/stable_rank_k_proj": 32.00570297241211, "geo/layer_21/stable_rank_o_proj": 83.15345001220703, "geo/layer_21/stable_rank_gate_proj": 85.91866302490234, "geo/layer_21/stable_rank_down_proj": 60.51045227050781, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15482893586158752, "geo/layer_21/attn_entropy_mean": 5.736204147338867, "geo/layer_21/attn_entropy_std": 0.3044711947441101, "geo/layer_27/stable_rank_q_proj": 41.446624755859375, "geo/layer_27/stable_rank_k_proj": 31.64557647705078, "geo/layer_27/stable_rank_o_proj": 119.6402359008789, "geo/layer_27/stable_rank_gate_proj": 91.11564636230469, "geo/layer_27/stable_rank_down_proj": 137.83616638183594, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07643231749534607, "geo/layer_27/attn_entropy_mean": 4.37166690826416, "geo/layer_27/attn_entropy_std": 0.5934568047523499, "attnres/final_alpha/block_0": 0.24195906519889832, "attnres/block_norm/0": 1.6272625923156738, "attnres/final_alpha/block_1": 0.006543417926877737, "attnres/block_norm/1": 30177.68359375, "attnres/final_alpha/block_2": 0.01383407972753048, "attnres/block_norm/2": 21588.23046875, "attnres/final_alpha/block_3": 0.015754226595163345, "attnres/block_norm/3": 31141.0703125, "attnres/final_alpha/block_4": 0.020370302721858025, "attnres/block_norm/4": 9597.09765625, "attnres/final_alpha/block_5": 0.5675990581512451, "attnres/block_norm/5": 5099.23046875, "attnres/final_alpha/block_6": 0.13393986225128174, "attnres/block_norm/6": 20858.796875, "geo/tier1_time_s": 1.3593125343322754, "geo/step": 21900.0, "geo/rankme_slope": -6.618709983993597e-05} {"step": 21910, "timestamp": 1778218215.0060534, "train/loss": 2.2737934947013856, "train/z_loss": 0.001533064350951463, "train/perplexity": 9.716189304127294, "train/grad_norm": 0.25390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1787041.5910856905, "perf/iters_per_sec": 0.8521278338840916, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1735328435897827, "data/tokens_consumed": 45950697472, "data/tokens_consumed_B": 45.950697472, "train/loss_slope": -4.5701975297405035e-07} {"step": 21920, "timestamp": 1778218225.3778954, "train/loss": 2.255999970436096, "train/z_loss": 0.0015366123290732503, "train/perplexity": 9.544833090510641, "train/grad_norm": 0.1494140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022954.7329126585, "perf/iters_per_sec": 0.9646199860156338, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0366776704788208, "data/tokens_consumed": 45971668992, "data/tokens_consumed_B": 45.971668992, "train/loss_slope": 1.6491854401847207e-06} {"step": 21930, "timestamp": 1778218235.7492003, "train/loss": 2.2116731882095335, "train/z_loss": 0.0015495307859964668, "train/perplexity": 9.130981469342146, "train/grad_norm": 0.09130859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023138.707855029, "perf/iters_per_sec": 0.9647077121043344, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0365833997726441, "data/tokens_consumed": 45992640512, "data/tokens_consumed_B": 45.992640512, "train/loss_slope": 1.3671414257704572e-06} {"step": 21940, "timestamp": 1778218246.1266906, "train/loss": 2.2488014817237856, "train/z_loss": 0.0015329824876971544, "train/perplexity": 9.476371423147585, "train/grad_norm": 0.1025390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021804.4621341014, "perf/iters_per_sec": 0.9640714941664226, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0372674703598022, "data/tokens_consumed": 46013612032, "data/tokens_consumed_B": 46.013612032, "train/loss_slope": 6.0868412748027555e-06} {"step": 21950, "timestamp": 1778218256.4969518, "grad/layer_0/attn": 0.0031829094514250755, "grad/layer_0/mlp": 0.002771133789792657, "grad/layer_0/attn_mlp_ratio": 1.1485946107292668, "grad/layer_4/attn": 0.001821494079194963, "grad/layer_4/mlp": 0.002580694854259491, "grad/layer_4/attn_mlp_ratio": 0.7058153371395364, "grad/layer_8/attn": 0.004137792624533176, "grad/layer_8/mlp": 0.0037088519893586636, "grad/layer_8/attn_mlp_ratio": 1.115653179161611, "grad/layer_12/attn": 0.004075723700225353, "grad/layer_12/mlp": 0.0063979835249483585, "grad/layer_12/attn_mlp_ratio": 0.6370325307386592, "grad/layer_16/attn": 0.00456301448866725, "grad/layer_16/mlp": 0.0048159281723201275, "grad/layer_16/attn_mlp_ratio": 0.9474838973191282, "grad/layer_20/attn": 0.004837509710341692, "grad/layer_20/mlp": 0.005976784974336624, "grad/layer_20/attn_mlp_ratio": 0.8093832470424943, "grad/layer_24/attn": 0.012845141813158989, "grad/layer_24/mlp": 0.010759523138403893, "grad/layer_24/attn_mlp_ratio": 1.1938393113284902, "grad/layer_27/attn": 0.004518268164247274, "grad/layer_27/mlp": 0.009830842725932598, "grad/layer_27/attn_mlp_ratio": 0.4596013021720395} {"step": 21950, "timestamp": 1778218256.5111964, "train/loss": 2.234713625907898, "train/z_loss": 0.0015405793907120823, "train/perplexity": 9.343805645158334, "train/grad_norm": 0.16796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021006.4400421567, "perf/iters_per_sec": 0.9636909675799163, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0376770496368408, "data/tokens_consumed": 46034583552, "data/tokens_consumed_B": 46.034583552, "train/loss_slope": 7.400783933106677e-06} {"step": 21960, "timestamp": 1778218266.887316, "train/loss": 2.2098639726638796, "train/z_loss": 0.001552298804745078, "train/perplexity": 9.114476490754745, "train/grad_norm": 0.2490234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022190.9016321893, "perf/iters_per_sec": 0.9642557628785082, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0370692491531373, "data/tokens_consumed": 46055555072, "data/tokens_consumed_B": 46.055555072, "train/loss_slope": 7.665126301524747e-06} {"step": 21970, "timestamp": 1778218277.2675407, "train/loss": 2.2057777643203735, "train/z_loss": 0.0015534775913693011, "train/perplexity": 9.07730882999029, "train/grad_norm": 0.2099609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021479.3984466349, "perf/iters_per_sec": 0.963916491721456, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0374342679977417, "data/tokens_consumed": 46076526592, "data/tokens_consumed_B": 46.076526592, "train/loss_slope": 7.2666595835532115e-06} {"step": 21975, "timestamp": 1778218283.0726526, "eos/sharpness": 49.62213039398193, "eos/L0_probe": 2.0458645820617676, "eos/L_plus": 2.3678531646728516, "eos/L_minus": 2.220097303390503, "eos/grad_norm": 0.1660022735595703, "eos/embed_grad_frac": 0.08020015060901642, "eos/time_s": 0.5902144908905029} {"step": 21975, "timestamp": 1778218284.4479666, "geo/rankme_last": 440.51690673828125, "geo/layer_0/stable_rank_q_proj": 17.4388427734375, "geo/layer_0/stable_rank_k_proj": 15.369477272033691, "geo/layer_0/stable_rank_o_proj": 51.915523529052734, "geo/layer_0/stable_rank_gate_proj": 151.23013305664062, "geo/layer_0/stable_rank_down_proj": 50.07613754272461, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.04819357767701149, "geo/layer_0/attn_entropy_mean": 6.261995315551758, "geo/layer_0/attn_entropy_std": 0.3216654062271118, "geo/layer_7/stable_rank_q_proj": 42.34125518798828, "geo/layer_7/stable_rank_k_proj": 41.90625762939453, "geo/layer_7/stable_rank_o_proj": 110.72003173828125, "geo/layer_7/stable_rank_gate_proj": 102.67353057861328, "geo/layer_7/stable_rank_down_proj": 150.61204528808594, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5414031147956848, "geo/layer_7/attn_entropy_mean": 4.681099891662598, "geo/layer_7/attn_entropy_std": 0.8393105864524841, "geo/layer_14/stable_rank_q_proj": 57.819984436035156, "geo/layer_14/stable_rank_k_proj": 34.95785140991211, "geo/layer_14/stable_rank_o_proj": 53.78839874267578, "geo/layer_14/stable_rank_gate_proj": 85.95635223388672, "geo/layer_14/stable_rank_down_proj": 136.1909942626953, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3833983540534973, "geo/layer_14/attn_entropy_mean": 5.499538421630859, "geo/layer_14/attn_entropy_std": 0.4568956792354584, "geo/layer_21/stable_rank_q_proj": 47.08647918701172, "geo/layer_21/stable_rank_k_proj": 32.03288650512695, "geo/layer_21/stable_rank_o_proj": 83.06211853027344, "geo/layer_21/stable_rank_gate_proj": 86.11856842041016, "geo/layer_21/stable_rank_down_proj": 60.380619049072266, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1534375548362732, "geo/layer_21/attn_entropy_mean": 5.755593299865723, "geo/layer_21/attn_entropy_std": 0.2804795801639557, "geo/layer_27/stable_rank_q_proj": 41.378841400146484, "geo/layer_27/stable_rank_k_proj": 31.65528678894043, "geo/layer_27/stable_rank_o_proj": 119.8192138671875, "geo/layer_27/stable_rank_gate_proj": 91.15435791015625, "geo/layer_27/stable_rank_down_proj": 137.77601623535156, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07648924738168716, "geo/layer_27/attn_entropy_mean": 4.3794121742248535, "geo/layer_27/attn_entropy_std": 0.5652627348899841, "attnres/final_alpha/block_0": 0.2414989173412323, "attnres/block_norm/0": 1.6278764009475708, "attnres/final_alpha/block_1": 0.0065599288791418076, "attnres/block_norm/1": 30290.88671875, "attnres/final_alpha/block_2": 0.013888463377952576, "attnres/block_norm/2": 21656.0546875, "attnres/final_alpha/block_3": 0.015974603593349457, "attnres/block_norm/3": 31330.9453125, "attnres/final_alpha/block_4": 0.020630182698369026, "attnres/block_norm/4": 9628.759765625, "attnres/final_alpha/block_5": 0.5682224631309509, "attnres/block_norm/5": 5105.40283203125, "attnres/final_alpha/block_6": 0.1332254707813263, "attnres/block_norm/6": 21024.66796875, "geo/tier1_time_s": 1.356367826461792, "geo/step": 21975.0, "geo/rankme_slope": -7.791114492672068e-05} {"step": 21980, "timestamp": 1778218289.6397095, "train/loss": 2.211940789222717, "train/z_loss": 0.0015592703595757485, "train/perplexity": 9.13342525620004, "train/grad_norm": 0.2041015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1699583.6474128508, "perf/iters_per_sec": 0.8104246365608457, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2339210271835328, "data/tokens_consumed": 46097498112, "data/tokens_consumed_B": 46.097498112, "train/loss_slope": 8.862363287109914e-06} {"step": 21990, "timestamp": 1778218300.0179796, "train/loss": 2.2337869882583616, "train/z_loss": 0.0015427215141244233, "train/perplexity": 9.335151333382397, "train/grad_norm": 0.09130859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021542.2563840966, "perf/iters_per_sec": 0.9639464647217257, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0374020099639893, "data/tokens_consumed": 46118469632, "data/tokens_consumed_B": 46.118469632, "train/loss_slope": 8.281449387461872e-06} {"step": 22000, "timestamp": 1778218310.392948, "grad/layer_0/attn": 0.0034062776248902082, "grad/layer_0/mlp": 0.002954791532829404, "grad/layer_0/attn_mlp_ratio": 1.1527979120573308, "grad/layer_4/attn": 0.0025078037288039923, "grad/layer_4/mlp": 0.002477806294336915, "grad/layer_4/attn_mlp_ratio": 1.0121064077224262, "grad/layer_8/attn": 0.005090388469398022, "grad/layer_8/mlp": 0.003733118763193488, "grad/layer_8/attn_mlp_ratio": 1.3635752452423817, "grad/layer_12/attn": 0.006434645038098097, "grad/layer_12/mlp": 0.006585660856217146, "grad/layer_12/attn_mlp_ratio": 0.9770689807563684, "grad/layer_16/attn": 0.00472700409591198, "grad/layer_16/mlp": 0.005300447810441256, "grad/layer_16/attn_mlp_ratio": 0.891812196965532, "grad/layer_20/attn": 0.007495772559195757, "grad/layer_20/mlp": 0.006797682493925095, "grad/layer_20/attn_mlp_ratio": 1.1026952870518736, "grad/layer_24/attn": 0.00581766851246357, "grad/layer_24/mlp": 0.010956862941384315, "grad/layer_24/attn_mlp_ratio": 0.5309611419336088, "grad/layer_27/attn": 0.010854608379304409, "grad/layer_27/mlp": 0.00813290011137724, "grad/layer_27/attn_mlp_ratio": 1.3346540713876867} {"step": 22000, "timestamp": 1778218310.4072063, "train/loss": 2.2132096529006957, "train/z_loss": 0.0015517083462327718, "train/perplexity": 9.145021683349897, "train/grad_norm": 0.1962890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019780.3712324412, "perf/iters_per_sec": 0.9631063324129301, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0383069515228271, "data/tokens_consumed": 46139441152, "data/tokens_consumed_B": 46.139441152, "train/loss_slope": 7.265508879493538e-06} {"step": 22000, "timestamp": 1778218317.3981774, "geo/ww_alpha_mean": 8.398417735472878, "geo/ww_alpha_std": 5.329000587965348, "geo/ww_alpha_min": 1.3577004090131761, "geo/ww_alpha_max": 33.54926480648336, "geo/ww_alpha_healthy_frac": 0.16243654822335024, "geo/ww_alpha_by_type/q_proj": 4.195532416746681, "geo/ww_alpha_by_type/k_proj": 4.665489944743718, "geo/ww_alpha_by_type/v_proj": 8.293881534294153, "geo/ww_alpha_by_type/o_proj": 9.271581355846935, "geo/ww_alpha_by_type/gate_proj": 9.120961249019047, "geo/ww_alpha_by_type/up_proj": 13.923601403511446, "geo/ww_alpha_by_type/down_proj": 9.462138448917164, "geo/twonn_id/layer_0": 0.6799092292785645, "geo/twonn_id/layer_7": 3.2139155864715576, "geo/twonn_id/layer_14": 4.187002658843994, "geo/twonn_id/layer_21": 9.106536865234375, "geo/twonn_id/layer_27": 5.865412712097168, "geo/tier2_time_s": 6.980286359786987} {"step": 22000, "timestamp": 1778218318.00426, "eoc/jacobian_sigma/layer_0/attn": 904.5484619140625, "eoc/jacobian_sigma/layer_0/mlp": 6136.25732421875, "eoc/jacobian_sigma/layer_0": 6136.25732421875, "eoc/jacobian_sigma/layer_7/attn": 1.1608806848526, "eoc/jacobian_sigma/layer_7/mlp": 1.6398777961730957, "eoc/jacobian_sigma/layer_7": 1.6398777961730957, "eoc/jacobian_sigma/layer_14/attn": 1.6121233701705933, "eoc/jacobian_sigma/layer_14/mlp": 8.544095039367676, "eoc/jacobian_sigma/layer_14": 8.544095039367676, "eoc/jacobian_sigma/layer_21/attn": 1.0755350589752197, "eoc/jacobian_sigma/layer_21/mlp": 3.726013422012329, "eoc/jacobian_sigma/layer_21": 3.726013422012329, "eoc/jacobian_sigma/layer_27/attn": 3.678305149078369, "eoc/jacobian_sigma/layer_27/mlp": 21.93422508239746, "eoc/jacobian_sigma/layer_27": 21.93422508239746, "eoc/layer0_sigma": 6136.25732421875, "eoc/sigma_max": 21.93422508239746, "eoc/sigma_min": 1.6398777961730957, "eoc/sigma_mean": 8.96105283498764, "eoc/time_s": 0.5993537902832031} {"step": 22010, "timestamp": 1778218328.3966112, "train/loss": 2.2430910587310793, "train/z_loss": 0.0015423895558342338, "train/perplexity": 9.422411547374143, "train/grad_norm": 0.15234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1166402.5543638158, "perf/iters_per_sec": 0.556184079343708, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.7979658842086792, "data/tokens_consumed": 46160412672, "data/tokens_consumed_B": 46.160412672, "train/loss_slope": 7.2407868542972494e-06} {"step": 22020, "timestamp": 1778218338.7812817, "train/loss": 2.223216199874878, "train/z_loss": 0.0015476953354664146, "train/perplexity": 9.236991153400544, "train/grad_norm": 0.158203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020382.3604863328, "perf/iters_per_sec": 0.9633933832580246, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037997579574585, "data/tokens_consumed": 46181384192, "data/tokens_consumed_B": 46.181384192, "train/loss_slope": 9.432178340514657e-06} {"step": 22030, "timestamp": 1778218349.6709776, "train/loss": 2.2204225778579714, "train/z_loss": 0.0015361743047833444, "train/perplexity": 9.211222502238956, "train/grad_norm": 0.220703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1926852.033788338, "perf/iters_per_sec": 0.9187946480695429, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0883824825286865, "data/tokens_consumed": 46202355712, "data/tokens_consumed_B": 46.202355712, "train/loss_slope": 6.5468426048308145e-06} {"step": 22040, "timestamp": 1778218360.0550084, "train/loss": 2.1879850149154665, "train/z_loss": 0.001548772247042507, "train/perplexity": 8.91722692058751, "train/grad_norm": 0.16796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020671.8847220258, "perf/iters_per_sec": 0.9635314391718034, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0378488540649413, "data/tokens_consumed": 46223327232, "data/tokens_consumed_B": 46.223327232, "train/loss_slope": 2.1663852758032513e-06} {"step": 22050, "timestamp": 1778218370.431115, "grad/layer_0/attn": 0.003318044124171138, "grad/layer_0/mlp": 0.0028408255893737078, "grad/layer_0/attn_mlp_ratio": 1.1679858206656242, "grad/layer_4/attn": 0.0016468027606606483, "grad/layer_4/mlp": 0.0024230123963207006, "grad/layer_4/attn_mlp_ratio": 0.6796509564689772, "grad/layer_8/attn": 0.005671090446412563, "grad/layer_8/mlp": 0.0035604729782789946, "grad/layer_8/attn_mlp_ratio": 1.5927912728815705, "grad/layer_12/attn": 0.004654180724173784, "grad/layer_12/mlp": 0.006018561311066151, "grad/layer_12/attn_mlp_ratio": 0.7733045168595074, "grad/layer_16/attn": 0.0038157831877470016, "grad/layer_16/mlp": 0.004619870334863663, "grad/layer_16/attn_mlp_ratio": 0.825950260195903, "grad/layer_20/attn": 0.008333556354045868, "grad/layer_20/mlp": 0.007395215332508087, "grad/layer_20/attn_mlp_ratio": 1.1268848663168076, "grad/layer_24/attn": 0.01849837601184845, "grad/layer_24/mlp": 0.013500618748366833, "grad/layer_24/attn_mlp_ratio": 1.3701872647183277, "grad/layer_27/attn": 0.0045049479231238365, "grad/layer_27/mlp": 0.01299112755805254, "grad/layer_27/attn_mlp_ratio": 0.346771122700149} {"step": 22050, "timestamp": 1778218371.0343266, "eos/sharpness": 39.235520362854, "eos/L0_probe": 2.043959140777588, "eos/L_plus": 2.257495641708374, "eos/L_minus": 2.222777843475342, "eos/grad_norm": 0.20403853058815002, "eos/embed_grad_frac": 0.06996283680200577, "eos/time_s": 0.600412130355835} {"step": 22050, "timestamp": 1778218371.0540147, "train/loss": 2.1817538499832154, "train/z_loss": 0.0015548614086583258, "train/perplexity": 8.861834966335541, "train/grad_norm": 0.2041015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1907780.8161324542, "perf/iters_per_sec": 0.909700782839038, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.099262547492981, "data/tokens_consumed": 46244298752, "data/tokens_consumed_B": 46.244298752, "train/loss_slope": -5.157028201675948e-07} {"step": 22050, "timestamp": 1778218372.416604, "geo/rankme_last": 440.8919982910156, "geo/layer_0/stable_rank_q_proj": 17.46134376525879, "geo/layer_0/stable_rank_k_proj": 15.35925579071045, "geo/layer_0/stable_rank_o_proj": 51.83174514770508, "geo/layer_0/stable_rank_gate_proj": 151.02479553222656, "geo/layer_0/stable_rank_down_proj": 50.10066604614258, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.04786328971385956, "geo/layer_0/attn_entropy_mean": 6.265279769897461, "geo/layer_0/attn_entropy_std": 0.3223232328891754, "geo/layer_7/stable_rank_q_proj": 42.39207077026367, "geo/layer_7/stable_rank_k_proj": 41.67023468017578, "geo/layer_7/stable_rank_o_proj": 110.22774505615234, "geo/layer_7/stable_rank_gate_proj": 102.62901306152344, "geo/layer_7/stable_rank_down_proj": 150.6056671142578, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5473780632019043, "geo/layer_7/attn_entropy_mean": 4.656232833862305, "geo/layer_7/attn_entropy_std": 0.8256983160972595, "geo/layer_14/stable_rank_q_proj": 57.78778076171875, "geo/layer_14/stable_rank_k_proj": 35.04576110839844, "geo/layer_14/stable_rank_o_proj": 53.80829620361328, "geo/layer_14/stable_rank_gate_proj": 86.01164245605469, "geo/layer_14/stable_rank_down_proj": 135.8861541748047, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.370525598526001, "geo/layer_14/attn_entropy_mean": 5.490072250366211, "geo/layer_14/attn_entropy_std": 0.4668367803096771, "geo/layer_21/stable_rank_q_proj": 47.247737884521484, "geo/layer_21/stable_rank_k_proj": 32.02336883544922, "geo/layer_21/stable_rank_o_proj": 83.05681610107422, "geo/layer_21/stable_rank_gate_proj": 85.80685424804688, "geo/layer_21/stable_rank_down_proj": 60.422996520996094, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15005218982696533, "geo/layer_21/attn_entropy_mean": 5.725333213806152, "geo/layer_21/attn_entropy_std": 0.30680418014526367, "geo/layer_27/stable_rank_q_proj": 41.287445068359375, "geo/layer_27/stable_rank_k_proj": 31.661291122436523, "geo/layer_27/stable_rank_o_proj": 119.68630981445312, "geo/layer_27/stable_rank_gate_proj": 91.218017578125, "geo/layer_27/stable_rank_down_proj": 137.6644287109375, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07719994336366653, "geo/layer_27/attn_entropy_mean": 4.3886003494262695, "geo/layer_27/attn_entropy_std": 0.5541636943817139, "attnres/final_alpha/block_0": 0.24130336940288544, "attnres/block_norm/0": 1.6285924911499023, "attnres/final_alpha/block_1": 0.006624805275350809, "attnres/block_norm/1": 30214.7109375, "attnres/final_alpha/block_2": 0.013819633983075619, "attnres/block_norm/2": 21729.0234375, "attnres/final_alpha/block_3": 0.015943516045808792, "attnres/block_norm/3": 31207.3828125, "attnres/final_alpha/block_4": 0.020486418157815933, "attnres/block_norm/4": 9642.28125, "attnres/final_alpha/block_5": 0.5691727995872498, "attnres/block_norm/5": 5148.0791015625, "attnres/final_alpha/block_6": 0.1326494812965393, "attnres/block_norm/6": 20956.248046875, "geo/tier1_time_s": 1.3585200309753418, "geo/step": 22050.0, "geo/rankme_slope": -9.251264959108643e-05} {"step": 22060, "timestamp": 1778218383.3342466, "train/loss": 2.198574995994568, "train/z_loss": 0.0015572047093883156, "train/perplexity": 9.01216197795787, "train/grad_norm": 0.1484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1708305.132427303, "perf/iters_per_sec": 0.8145833646904482, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2276214361190796, "data/tokens_consumed": 46265270272, "data/tokens_consumed_B": 46.265270272, "train/loss_slope": -2.8499089809570142e-06} {"step": 22070, "timestamp": 1778218393.7201288, "train/loss": 2.2094594478607177, "train/z_loss": 0.0015345707535743714, "train/perplexity": 9.11079020459376, "train/grad_norm": 0.14453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020245.9814578935, "perf/iters_per_sec": 0.9633283526696651, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0380676507949829, "data/tokens_consumed": 46286241792, "data/tokens_consumed_B": 46.286241792, "train/loss_slope": -2.430912398948655e-06} {"step": 22080, "timestamp": 1778218404.1072657, "train/loss": 2.2549160957336425, "train/z_loss": 0.0015327902277931572, "train/perplexity": 9.53449329192001, "train/grad_norm": 0.126953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020148.9635587905, "perf/iters_per_sec": 0.963282090930362, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0381175041198731, "data/tokens_consumed": 46307213312, "data/tokens_consumed_B": 46.307213312, "train/loss_slope": -2.368523900729935e-06} {"step": 22090, "timestamp": 1778218414.49123, "train/loss": 2.227351713180542, "train/z_loss": 0.0015361853875219822, "train/perplexity": 9.275269949900709, "train/grad_norm": 0.1728515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020749.7798834972, "perf/iters_per_sec": 0.9635685824792372, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0378088474273681, "data/tokens_consumed": 46328184832, "data/tokens_consumed_B": 46.328184832, "train/loss_slope": -4.841307123037152e-06} {"step": 22100, "timestamp": 1778218424.8609102, "grad/layer_0/attn": 0.0024867039173841476, "grad/layer_0/mlp": 0.00243399222381413, "grad/layer_0/attn_mlp_ratio": 1.0216564337751968, "grad/layer_4/attn": 0.0015317958313971758, "grad/layer_4/mlp": 0.00241800956428051, "grad/layer_4/attn_mlp_ratio": 0.6334945033617013, "grad/layer_8/attn": 0.004170705564320087, "grad/layer_8/mlp": 0.003845141502097249, "grad/layer_8/attn_mlp_ratio": 1.0846689136351355, "grad/layer_12/attn": 0.004029196687042713, "grad/layer_12/mlp": 0.006084195338189602, "grad/layer_12/attn_mlp_ratio": 0.662239852084967, "grad/layer_16/attn": 0.006240245420485735, "grad/layer_16/mlp": 0.004501360934227705, "grad/layer_16/attn_mlp_ratio": 1.3863019146955338, "grad/layer_20/attn": 0.007069023791700602, "grad/layer_20/mlp": 0.0059629506431519985, "grad/layer_20/attn_mlp_ratio": 1.1854908913709954, "grad/layer_24/attn": 0.011955899186432362, "grad/layer_24/mlp": 0.013221913017332554, "grad/layer_24/attn_mlp_ratio": 0.9042488088020652, "grad/layer_27/attn": 0.0044003878720104694, "grad/layer_27/mlp": 0.010874452069401741, "grad/layer_27/attn_mlp_ratio": 0.40465375206598175} {"step": 22100, "timestamp": 1778218424.8750486, "train/loss": 2.2102880477905273, "train/z_loss": 0.0015481308219023049, "train/perplexity": 9.118342533215175, "train/grad_norm": 0.1279296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020724.340278845, "perf/iters_per_sec": 0.963556451930449, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0378219127655028, "data/tokens_consumed": 46349156352, "data/tokens_consumed_B": 46.349156352, "train/loss_slope": -3.1116127359806415e-06} {"step": 22110, "timestamp": 1778218435.260898, "train/loss": 2.1825743913650513, "train/z_loss": 0.0015509511111304163, "train/perplexity": 8.869109452744832, "train/grad_norm": 0.322265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020338.4610420573, "perf/iters_per_sec": 0.9633724503717696, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.038020133972168, "data/tokens_consumed": 46370127872, "data/tokens_consumed_B": 46.370127872, "train/loss_slope": -4.699487282712658e-06} {"step": 22120, "timestamp": 1778218445.6360252, "train/loss": 2.2212851524353026, "train/z_loss": 0.0015406174585223197, "train/perplexity": 9.219171296316526, "train/grad_norm": 0.11572265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022576.7912751797, "perf/iters_per_sec": 0.9644397693992518, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0368713855743408, "data/tokens_consumed": 46391099392, "data/tokens_consumed_B": 46.391099392, "train/loss_slope": -5.613601554666896e-06} {"step": 22125, "timestamp": 1778218451.394866, "eos/sharpness": 7.953310012817381, "eos/L0_probe": 2.047536849975586, "eos/L_plus": 2.0883991718292236, "eos/L_minus": 2.086207628250122, "eos/grad_norm": 0.10031937807798386, "eos/embed_grad_frac": 0.23523321747779846, "eos/time_s": 0.5821905136108398} {"step": 22125, "timestamp": 1778218452.7712805, "geo/rankme_last": 440.4862060546875, "geo/layer_0/stable_rank_q_proj": 17.48513412475586, "geo/layer_0/stable_rank_k_proj": 15.41097640991211, "geo/layer_0/stable_rank_o_proj": 51.69191360473633, "geo/layer_0/stable_rank_gate_proj": 150.78201293945312, "geo/layer_0/stable_rank_down_proj": 50.109981536865234, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05073678120970726, "geo/layer_0/attn_entropy_mean": 6.25961971282959, "geo/layer_0/attn_entropy_std": 0.32320845127105713, "geo/layer_7/stable_rank_q_proj": 42.463836669921875, "geo/layer_7/stable_rank_k_proj": 41.75849914550781, "geo/layer_7/stable_rank_o_proj": 110.32743835449219, "geo/layer_7/stable_rank_gate_proj": 102.84441375732422, "geo/layer_7/stable_rank_down_proj": 150.47662353515625, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5610106587409973, "geo/layer_7/attn_entropy_mean": 4.658038139343262, "geo/layer_7/attn_entropy_std": 0.8579501509666443, "geo/layer_14/stable_rank_q_proj": 57.766937255859375, "geo/layer_14/stable_rank_k_proj": 35.17057800292969, "geo/layer_14/stable_rank_o_proj": 53.840309143066406, "geo/layer_14/stable_rank_gate_proj": 85.82260131835938, "geo/layer_14/stable_rank_down_proj": 135.14381408691406, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37888064980506897, "geo/layer_14/attn_entropy_mean": 5.476954460144043, "geo/layer_14/attn_entropy_std": 0.463252454996109, "geo/layer_21/stable_rank_q_proj": 47.2179069519043, "geo/layer_21/stable_rank_k_proj": 31.920000076293945, "geo/layer_21/stable_rank_o_proj": 83.01618194580078, "geo/layer_21/stable_rank_gate_proj": 85.65125274658203, "geo/layer_21/stable_rank_down_proj": 60.47672653198242, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1482643336057663, "geo/layer_21/attn_entropy_mean": 5.721773147583008, "geo/layer_21/attn_entropy_std": 0.295204222202301, "geo/layer_27/stable_rank_q_proj": 41.24490737915039, "geo/layer_27/stable_rank_k_proj": 31.690692901611328, "geo/layer_27/stable_rank_o_proj": 119.54304504394531, "geo/layer_27/stable_rank_gate_proj": 91.19873809814453, "geo/layer_27/stable_rank_down_proj": 137.7836151123047, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0779053121805191, "geo/layer_27/attn_entropy_mean": 4.397200584411621, "geo/layer_27/attn_entropy_std": 0.5529976487159729, "attnres/final_alpha/block_0": 0.2413739711046219, "attnres/block_norm/0": 1.6293056011199951, "attnres/final_alpha/block_1": 0.006576795596629381, "attnres/block_norm/1": 30505.255859375, "attnres/final_alpha/block_2": 0.013507548719644547, "attnres/block_norm/2": 21737.98046875, "attnres/final_alpha/block_3": 0.015814222395420074, "attnres/block_norm/3": 31162.23828125, "attnres/final_alpha/block_4": 0.020550251007080078, "attnres/block_norm/4": 9679.140625, "attnres/final_alpha/block_5": 0.5680012702941895, "attnres/block_norm/5": 5173.3955078125, "attnres/final_alpha/block_6": 0.13417592644691467, "attnres/block_norm/6": 21032.478515625, "geo/tier1_time_s": 1.3572094440460205, "geo/step": 22125.0, "geo/rankme_slope": -8.438142835259103e-05} {"step": 22130, "timestamp": 1778218457.9585414, "train/loss": 2.1543102502822875, "train/z_loss": 0.0015538162202574312, "train/perplexity": 8.621941146061095, "train/grad_norm": 0.1826171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1702661.4687018767, "perf/iters_per_sec": 0.811892256117762, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.231690526008606, "data/tokens_consumed": 46412070912, "data/tokens_consumed_B": 46.412070912, "train/loss_slope": -9.320581892822171e-06} {"step": 22140, "timestamp": 1778218468.3426325, "train/loss": 2.200989770889282, "train/z_loss": 0.001546572335064411, "train/perplexity": 9.033950617191133, "train/grad_norm": 0.185546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020689.9421240182, "perf/iters_per_sec": 0.963540049612054, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0378395795822144, "data/tokens_consumed": 46433042432, "data/tokens_consumed_B": 46.433042432, "train/loss_slope": -9.431155436825139e-06} {"step": 22150, "timestamp": 1778218478.7058954, "grad/layer_0/attn": 0.0038597427774220705, "grad/layer_0/mlp": 0.0033123630564659834, "grad/layer_0/attn_mlp_ratio": 1.165253504854249, "grad/layer_4/attn": 0.0021942206658422947, "grad/layer_4/mlp": 0.0026109125465154648, "grad/layer_4/attn_mlp_ratio": 0.8404037066390218, "grad/layer_8/attn": 0.006180604454129934, "grad/layer_8/mlp": 0.0038986688014119864, "grad/layer_8/attn_mlp_ratio": 1.5853114512728936, "grad/layer_12/attn": 0.004421200603246689, "grad/layer_12/mlp": 0.006554619874805212, "grad/layer_12/attn_mlp_ratio": 0.6745166951312196, "grad/layer_16/attn": 0.003969744313508272, "grad/layer_16/mlp": 0.004641245119273663, "grad/layer_16/attn_mlp_ratio": 0.8553188047516098, "grad/layer_20/attn": 0.00719786249101162, "grad/layer_20/mlp": 0.006699468474835157, "grad/layer_20/attn_mlp_ratio": 1.0743930523159033, "grad/layer_24/attn": 0.008277662098407745, "grad/layer_24/mlp": 0.012089057825505733, "grad/layer_24/attn_mlp_ratio": 0.6847235036357441, "grad/layer_27/attn": 0.008080553263425827, "grad/layer_27/mlp": 0.010037623345851898, "grad/layer_27/attn_mlp_ratio": 0.8050265391022573} {"step": 22150, "timestamp": 1778218478.7199912, "train/loss": 2.266715931892395, "train/z_loss": 0.0015226862858980894, "train/perplexity": 9.647665142135361, "train/grad_norm": 0.1142578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021963.8261274777, "perf/iters_per_sec": 0.964147484840144, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0371857166290284, "data/tokens_consumed": 46454013952, "data/tokens_consumed_B": 46.454013952, "train/loss_slope": -5.182001707327127e-06} {"step": 22160, "timestamp": 1778218489.097863, "train/loss": 2.1858382701873778, "train/z_loss": 0.0015439700218848884, "train/perplexity": 8.898104443588757, "train/grad_norm": 0.1640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021834.9944970168, "perf/iters_per_sec": 0.9640860531315883, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0372518062591554, "data/tokens_consumed": 46474985472, "data/tokens_consumed_B": 46.474985472, "train/loss_slope": -9.78942983257068e-06} {"step": 22170, "timestamp": 1778218499.4713776, "train/loss": 2.1554760217666624, "train/z_loss": 0.0015494014020077883, "train/perplexity": 8.631998220175259, "train/grad_norm": 0.1767578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022763.7210005473, "perf/iters_per_sec": 0.9645289044382798, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0367755651474, "data/tokens_consumed": 46495956992, "data/tokens_consumed_B": 46.495956992, "train/loss_slope": -1.549867154455317e-05} {"step": 22180, "timestamp": 1778218509.8492944, "train/loss": 2.1999318599700928, "train/z_loss": 0.001541217532940209, "train/perplexity": 9.024398555695736, "train/grad_norm": 0.1025390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021875.333949084, "perf/iters_per_sec": 0.9641052884812755, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0372311115264892, "data/tokens_consumed": 46516928512, "data/tokens_consumed_B": 46.516928512, "train/loss_slope": -1.6793995407154892e-05} {"step": 22190, "timestamp": 1778218520.2251961, "train/loss": 2.2443106412887572, "train/z_loss": 0.0015407954808324576, "train/perplexity": 9.433909966358991, "train/grad_norm": 0.1103515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022271.0061427557, "perf/iters_per_sec": 0.964293959685686, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037028169631958, "data/tokens_consumed": 46537900032, "data/tokens_consumed_B": 46.537900032, "train/loss_slope": -1.2532047520090178e-05} {"step": 22200, "timestamp": 1778218530.6024637, "grad/layer_0/attn": 0.0036931047216057777, "grad/layer_0/mlp": 0.00321570853702724, "grad/layer_0/attn_mlp_ratio": 1.1484575060941673, "grad/layer_4/attn": 0.003448966657742858, "grad/layer_4/mlp": 0.0025539605412632227, "grad/layer_4/attn_mlp_ratio": 1.3504384531301756, "grad/layer_8/attn": 0.009366756305098534, "grad/layer_8/mlp": 0.003794516436755657, "grad/layer_8/attn_mlp_ratio": 2.4684979533933427, "grad/layer_12/attn": 0.004572573117911816, "grad/layer_12/mlp": 0.006072565447539091, "grad/layer_12/attn_mlp_ratio": 0.7529886803387166, "grad/layer_16/attn": 0.004413549788296223, "grad/layer_16/mlp": 0.005005277693271637, "grad/layer_16/attn_mlp_ratio": 0.881779188006139, "grad/layer_20/attn": 0.005391477607190609, "grad/layer_20/mlp": 0.006848610937595367, "grad/layer_20/attn_mlp_ratio": 0.7872366495328992, "grad/layer_24/attn": 0.013525511138141155, "grad/layer_24/mlp": 0.013282055966556072, "grad/layer_24/attn_mlp_ratio": 1.0183296223389764, "grad/layer_27/attn": 0.00476317061111331, "grad/layer_27/mlp": 0.012954674661159515, "grad/layer_27/attn_mlp_ratio": 0.3676796754013591} {"step": 22200, "timestamp": 1778218531.1902697, "eos/sharpness": 40.712189674377434, "eos/L0_probe": 2.048387050628662, "eos/L_plus": 2.2526018619537354, "eos/L_minus": 2.2512941360473633, "eos/grad_norm": 0.1967371702194214, "eos/embed_grad_frac": 0.06917893886566162, "eos/time_s": 0.5849463939666748} {"step": 22200, "timestamp": 1778218531.209175, "train/loss": 2.246274399757385, "train/z_loss": 0.0015303323976695537, "train/perplexity": 9.452454089077335, "train/grad_norm": 0.197265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1910514.8342562288, "perf/iters_per_sec": 0.9110044642716545, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0976894617080688, "data/tokens_consumed": 46558871552, "data/tokens_consumed_B": 46.558871552, "train/loss_slope": -1.0029337794581208e-05} {"step": 22200, "timestamp": 1778218532.5693605, "geo/rankme_last": 440.77197265625, "geo/layer_0/stable_rank_q_proj": 17.48634910583496, "geo/layer_0/stable_rank_k_proj": 15.438135147094727, "geo/layer_0/stable_rank_o_proj": 51.69113540649414, "geo/layer_0/stable_rank_gate_proj": 150.58973693847656, "geo/layer_0/stable_rank_down_proj": 50.214691162109375, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.04563978686928749, "geo/layer_0/attn_entropy_mean": 6.25873327255249, "geo/layer_0/attn_entropy_std": 0.32681727409362793, "geo/layer_7/stable_rank_q_proj": 42.485626220703125, "geo/layer_7/stable_rank_k_proj": 41.77043533325195, "geo/layer_7/stable_rank_o_proj": 110.2822036743164, "geo/layer_7/stable_rank_gate_proj": 102.73527526855469, "geo/layer_7/stable_rank_down_proj": 150.08883666992188, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5350984334945679, "geo/layer_7/attn_entropy_mean": 4.6672492027282715, "geo/layer_7/attn_entropy_std": 0.8713136315345764, "geo/layer_14/stable_rank_q_proj": 57.71181106567383, "geo/layer_14/stable_rank_k_proj": 35.1768798828125, "geo/layer_14/stable_rank_o_proj": 53.9116325378418, "geo/layer_14/stable_rank_gate_proj": 85.51122283935547, "geo/layer_14/stable_rank_down_proj": 135.75259399414062, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38637906312942505, "geo/layer_14/attn_entropy_mean": 5.517491340637207, "geo/layer_14/attn_entropy_std": 0.4506831765174866, "geo/layer_21/stable_rank_q_proj": 47.20124053955078, "geo/layer_21/stable_rank_k_proj": 31.872188568115234, "geo/layer_21/stable_rank_o_proj": 82.82025146484375, "geo/layer_21/stable_rank_gate_proj": 85.8377685546875, "geo/layer_21/stable_rank_down_proj": 60.32312774658203, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14852534234523773, "geo/layer_21/attn_entropy_mean": 5.737725257873535, "geo/layer_21/attn_entropy_std": 0.296238511800766, "geo/layer_27/stable_rank_q_proj": 41.34575653076172, "geo/layer_27/stable_rank_k_proj": 31.713722229003906, "geo/layer_27/stable_rank_o_proj": 119.83346557617188, "geo/layer_27/stable_rank_gate_proj": 91.28169250488281, "geo/layer_27/stable_rank_down_proj": 138.11761474609375, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08102767914533615, "geo/layer_27/attn_entropy_mean": 4.389156341552734, "geo/layer_27/attn_entropy_std": 0.5392098426818848, "attnres/final_alpha/block_0": 0.24269184470176697, "attnres/block_norm/0": 1.6301909685134888, "attnres/final_alpha/block_1": 0.006665486842393875, "attnres/block_norm/1": 30416.203125, "attnres/final_alpha/block_2": 0.014006730169057846, "attnres/block_norm/2": 21777.05078125, "attnres/final_alpha/block_3": 0.016028063371777534, "attnres/block_norm/3": 31380.76953125, "attnres/final_alpha/block_4": 0.020572740584611893, "attnres/block_norm/4": 9660.181640625, "attnres/final_alpha/block_5": 0.5644265413284302, "attnres/block_norm/5": 5161.9541015625, "attnres/final_alpha/block_6": 0.13560862839221954, "attnres/block_norm/6": 20981.58203125, "geo/tier1_time_s": 1.3562922477722168, "geo/step": 22200.0, "geo/rankme_slope": -7.299755839835935e-05} {"step": 22210, "timestamp": 1778218543.8601341, "train/loss": 2.2442948818206787, "train/z_loss": 0.0015364225138910115, "train/perplexity": 9.433761294127521, "train/grad_norm": 0.1064453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1658236.32374026, "perf/iters_per_sec": 0.7907086962415028, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.26468825340271, "data/tokens_consumed": 46579843072, "data/tokens_consumed_B": 46.579843072, "train/loss_slope": -7.15962364764465e-06} {"step": 22220, "timestamp": 1778218554.2324154, "train/loss": 2.212373423576355, "train/z_loss": 0.0015352475922554732, "train/perplexity": 9.137377544618479, "train/grad_norm": 0.1953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023254.2094750274, "perf/iters_per_sec": 0.9647627875685822, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036524224281311, "data/tokens_consumed": 46600814592, "data/tokens_consumed_B": 46.600814592, "train/loss_slope": -9.024955759240216e-06} {"step": 22230, "timestamp": 1778218564.5919216, "train/loss": 2.207382559776306, "train/z_loss": 0.001548781432211399, "train/perplexity": 9.091887748915337, "train/grad_norm": 0.12890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025398.7340028638, "perf/iters_per_sec": 0.9657853765501326, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354267358779907, "data/tokens_consumed": 46621786112, "data/tokens_consumed_B": 46.621786112, "train/loss_slope": -9.331373820746987e-06} {"step": 22240, "timestamp": 1778218574.9504337, "train/loss": 2.207343888282776, "train/z_loss": 0.0015421170857734979, "train/perplexity": 9.091536158835378, "train/grad_norm": 0.103515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025621.170614268, "perf/iters_per_sec": 0.9658914425917949, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035313034057617, "data/tokens_consumed": 46642757632, "data/tokens_consumed_B": 46.642757632, "train/loss_slope": -1.210358179334478e-05} {"step": 22250, "timestamp": 1778218585.3050702, "grad/layer_0/attn": 0.003178767394274473, "grad/layer_0/mlp": 0.002706171479076147, "grad/layer_0/attn_mlp_ratio": 1.1746363086702967, "grad/layer_4/attn": 0.0024270834401249886, "grad/layer_4/mlp": 0.0025173877365887165, "grad/layer_4/attn_mlp_ratio": 0.9641277378275973, "grad/layer_8/attn": 0.004324883222579956, "grad/layer_8/mlp": 0.003710737219080329, "grad/layer_8/attn_mlp_ratio": 1.165505087180851, "grad/layer_12/attn": 0.005463134031742811, "grad/layer_12/mlp": 0.00642018299549818, "grad/layer_12/attn_mlp_ratio": 0.8509311884848814, "grad/layer_16/attn": 0.004503333009779453, "grad/layer_16/mlp": 0.004763751290738583, "grad/layer_16/attn_mlp_ratio": 0.9453333392952832, "grad/layer_20/attn": 0.004082843195647001, "grad/layer_20/mlp": 0.006308723706752062, "grad/layer_20/attn_mlp_ratio": 0.6471741862081902, "grad/layer_24/attn": 0.01673596352338791, "grad/layer_24/mlp": 0.015146019868552685, "grad/layer_24/attn_mlp_ratio": 1.1049743469331472, "grad/layer_27/attn": 0.011572382412850857, "grad/layer_27/mlp": 0.013442965224385262, "grad/layer_27/attn_mlp_ratio": 0.8608504250069584} {"step": 22250, "timestamp": 1778218585.3194146, "train/loss": 2.156574749946594, "train/z_loss": 0.0015583684318698943, "train/perplexity": 8.641487652067362, "train/grad_norm": 0.1640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023624.024342863, "perf/iters_per_sec": 0.9649391290392222, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0363348007202149, "data/tokens_consumed": 46663729152, "data/tokens_consumed_B": 46.663729152, "train/loss_slope": -1.4308235425688766e-05} {"step": 22260, "timestamp": 1778218595.6815608, "train/loss": 2.1763790130615233, "train/z_loss": 0.0015515024773776532, "train/perplexity": 8.814331823746176, "train/grad_norm": 0.1279296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025219.4303446773, "perf/iters_per_sec": 0.9656998779033076, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0355184078216553, "data/tokens_consumed": 46684700672, "data/tokens_consumed_B": 46.684700672, "train/loss_slope": -1.5091517827834928e-05} {"step": 22270, "timestamp": 1778218606.5529885, "train/loss": 2.231763553619385, "train/z_loss": 0.0015555849764496087, "train/perplexity": 9.316281362329132, "train/grad_norm": 0.25390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1929790.5105130456, "perf/iters_per_sec": 0.9201958229603985, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0867252111434937, "data/tokens_consumed": 46705672192, "data/tokens_consumed_B": 46.705672192, "train/loss_slope": -1.3611862668753627e-05} {"step": 22275, "timestamp": 1778218612.694658, "eos/sharpness": 44.938969612121575, "eos/L0_probe": 2.046747922897339, "eos/L_plus": 2.3407297134399414, "eos/L_minus": 2.202155828475952, "eos/grad_norm": 0.1288156807422638, "eos/embed_grad_frac": 0.14342673122882843, "eos/time_s": 0.5954265594482422} {"step": 22275, "timestamp": 1778218614.0760012, "geo/rankme_last": 441.883544921875, "geo/layer_0/stable_rank_q_proj": 17.483882904052734, "geo/layer_0/stable_rank_k_proj": 15.412176132202148, "geo/layer_0/stable_rank_o_proj": 51.72223663330078, "geo/layer_0/stable_rank_gate_proj": 150.55198669433594, "geo/layer_0/stable_rank_down_proj": 50.254886627197266, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.047040365636348724, "geo/layer_0/attn_entropy_mean": 6.260107517242432, "geo/layer_0/attn_entropy_std": 0.32711222767829895, "geo/layer_7/stable_rank_q_proj": 42.48598861694336, "geo/layer_7/stable_rank_k_proj": 41.84359359741211, "geo/layer_7/stable_rank_o_proj": 110.03266906738281, "geo/layer_7/stable_rank_gate_proj": 102.58670806884766, "geo/layer_7/stable_rank_down_proj": 149.94473266601562, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.55319744348526, "geo/layer_7/attn_entropy_mean": 4.677883148193359, "geo/layer_7/attn_entropy_std": 0.858896017074585, "geo/layer_14/stable_rank_q_proj": 57.663978576660156, "geo/layer_14/stable_rank_k_proj": 35.17380905151367, "geo/layer_14/stable_rank_o_proj": 53.8431282043457, "geo/layer_14/stable_rank_gate_proj": 85.37149047851562, "geo/layer_14/stable_rank_down_proj": 136.0643768310547, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37732818722724915, "geo/layer_14/attn_entropy_mean": 5.487814426422119, "geo/layer_14/attn_entropy_std": 0.4565317928791046, "geo/layer_21/stable_rank_q_proj": 47.18345642089844, "geo/layer_21/stable_rank_k_proj": 31.887344360351562, "geo/layer_21/stable_rank_o_proj": 82.84687805175781, "geo/layer_21/stable_rank_gate_proj": 85.88851165771484, "geo/layer_21/stable_rank_down_proj": 60.261817932128906, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15431228280067444, "geo/layer_21/attn_entropy_mean": 5.727451801300049, "geo/layer_21/attn_entropy_std": 0.2889161705970764, "geo/layer_27/stable_rank_q_proj": 41.42139434814453, "geo/layer_27/stable_rank_k_proj": 31.587013244628906, "geo/layer_27/stable_rank_o_proj": 119.78093719482422, "geo/layer_27/stable_rank_gate_proj": 91.25234985351562, "geo/layer_27/stable_rank_down_proj": 138.15997314453125, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0770847350358963, "geo/layer_27/attn_entropy_mean": 4.371432304382324, "geo/layer_27/attn_entropy_std": 0.5744595527648926, "attnres/final_alpha/block_0": 0.24163652956485748, "attnres/block_norm/0": 1.630782127380371, "attnres/final_alpha/block_1": 0.00664774514734745, "attnres/block_norm/1": 30450.6953125, "attnres/final_alpha/block_2": 0.0136288246139884, "attnres/block_norm/2": 21616.828125, "attnres/final_alpha/block_3": 0.015771424397826195, "attnres/block_norm/3": 31470.958984375, "attnres/final_alpha/block_4": 0.02013549953699112, "attnres/block_norm/4": 9760.947265625, "attnres/final_alpha/block_5": 0.5687292814254761, "attnres/block_norm/5": 5160.771484375, "attnres/final_alpha/block_6": 0.1334507167339325, "attnres/block_norm/6": 21146.86328125, "geo/tier1_time_s": 1.362072229385376, "geo/step": 22275.0, "geo/rankme_slope": -4.0411066770458184e-05} {"step": 22280, "timestamp": 1778218619.2609367, "train/loss": 2.223317766189575, "train/z_loss": 0.00154073485173285, "train/perplexity": 9.237929368195589, "train/grad_norm": 0.125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1651107.3944667089, "perf/iters_per_sec": 0.7873093578656716, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.270148754119873, "data/tokens_consumed": 46726643712, "data/tokens_consumed_B": 46.726643712, "train/loss_slope": -1.530837748739553e-05} {"step": 22290, "timestamp": 1778218629.611048, "train/loss": 2.1978442430496217, "train/z_loss": 0.0015538507956080138, "train/perplexity": 9.005578719712773, "train/grad_norm": 0.1484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027366.536389231, "perf/iters_per_sec": 0.9667236978479533, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034421730041504, "data/tokens_consumed": 46747615232, "data/tokens_consumed_B": 46.747615232, "train/loss_slope": -1.754345067656869e-05} {"step": 22300, "timestamp": 1778218639.9469912, "grad/layer_0/attn": 0.0030673507135361433, "grad/layer_0/mlp": 0.002879516687244177, "grad/layer_0/attn_mlp_ratio": 1.0652310579066746, "grad/layer_4/attn": 0.0034170139115303755, "grad/layer_4/mlp": 0.002501945709809661, "grad/layer_4/attn_mlp_ratio": 1.3657425744925824, "grad/layer_8/attn": 0.005722488276660442, "grad/layer_8/mlp": 0.003730464493855834, "grad/layer_8/attn_mlp_ratio": 1.5339880952322988, "grad/layer_12/attn": 0.004459222313016653, "grad/layer_12/mlp": 0.005983363837003708, "grad/layer_12/attn_mlp_ratio": 0.7452701122589077, "grad/layer_16/attn": 0.003802285995334387, "grad/layer_16/mlp": 0.0042905183508992195, "grad/layer_16/attn_mlp_ratio": 0.8862066528434327, "grad/layer_20/attn": 0.009428943507373333, "grad/layer_20/mlp": 0.005301633384078741, "grad/layer_20/attn_mlp_ratio": 1.7784978036843282, "grad/layer_24/attn": 0.004635619465261698, "grad/layer_24/mlp": 0.009180357679724693, "grad/layer_24/attn_mlp_ratio": 0.5049497608360873, "grad/layer_27/attn": 0.008948411792516708, "grad/layer_27/mlp": 0.00725916400551796, "grad/layer_27/attn_mlp_ratio": 1.2327055377787492} {"step": 22300, "timestamp": 1778218639.9613278, "train/loss": 2.235059142112732, "train/z_loss": 0.0015414535417221486, "train/perplexity": 9.347034639226319, "train/grad_norm": 0.12109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027282.1496117483, "perf/iters_per_sec": 0.9666834590967885, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344647884368896, "data/tokens_consumed": 46768586752, "data/tokens_consumed_B": 46.768586752, "train/loss_slope": -1.85611225412016e-05} {"step": 22310, "timestamp": 1778218650.304835, "train/loss": 2.2684393405914305, "train/z_loss": 0.0015209897654131054, "train/perplexity": 9.664306347847155, "train/grad_norm": 0.138671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028785.107594244, "perf/iters_per_sec": 0.9674001253100605, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0336984395980835, "data/tokens_consumed": 46789558272, "data/tokens_consumed_B": 46.789558272, "train/loss_slope": -1.3578733895728476e-05} {"step": 22320, "timestamp": 1778218660.6575239, "train/loss": 2.22885000705719, "train/z_loss": 0.001551858230959624, "train/perplexity": 9.28917744622747, "train/grad_norm": 0.1435546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026839.6312379874, "perf/iters_per_sec": 0.9664724498929917, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346906423568725, "data/tokens_consumed": 46810529792, "data/tokens_consumed_B": 46.810529792, "train/loss_slope": -1.5134598434132375e-05} {"step": 22330, "timestamp": 1778218671.0338047, "train/loss": 2.173369312286377, "train/z_loss": 0.0015503312461078167, "train/perplexity": 8.787843203828949, "train/grad_norm": 0.275390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022431.048202327, "perf/iters_per_sec": 0.9643702736865649, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0369461059570313, "data/tokens_consumed": 46831501312, "data/tokens_consumed_B": 46.831501312, "train/loss_slope": -1.7118317609501465e-05} {"step": 22340, "timestamp": 1778218681.4262698, "train/loss": 2.2481220722198487, "train/z_loss": 0.0015534300822764636, "train/perplexity": 9.469935272978235, "train/grad_norm": 0.1435546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019426.6565374986, "perf/iters_per_sec": 0.962937668102979, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0384888172149658, "data/tokens_consumed": 46852472832, "data/tokens_consumed_B": 46.852472832, "train/loss_slope": -1.4619531911878043e-05} {"step": 22350, "timestamp": 1778218691.80068, "grad/layer_0/attn": 0.0027467410545796156, "grad/layer_0/mlp": 0.0026548022869974375, "grad/layer_0/attn_mlp_ratio": 1.0346310776397034, "grad/layer_4/attn": 0.001682019210420549, "grad/layer_4/mlp": 0.0024833830539137125, "grad/layer_4/attn_mlp_ratio": 0.6773095838110015, "grad/layer_8/attn": 0.004266486037522554, "grad/layer_8/mlp": 0.0036782582756131887, "grad/layer_8/attn_mlp_ratio": 1.1599201583579088, "grad/layer_12/attn": 0.004779123235493898, "grad/layer_12/mlp": 0.006297419313341379, "grad/layer_12/attn_mlp_ratio": 0.75890184880637, "grad/layer_16/attn": 0.005747613497078419, "grad/layer_16/mlp": 0.004281428176909685, "grad/layer_16/attn_mlp_ratio": 1.3424523606003322, "grad/layer_20/attn": 0.003924540244042873, "grad/layer_20/mlp": 0.006408714223653078, "grad/layer_20/attn_mlp_ratio": 0.6123755945179684, "grad/layer_24/attn": 0.01026518177241087, "grad/layer_24/mlp": 0.010455002076923847, "grad/layer_24/attn_mlp_ratio": 0.9818440588245934, "grad/layer_27/attn": 0.005785323679447174, "grad/layer_27/mlp": 0.00897813681513071, "grad/layer_27/attn_mlp_ratio": 0.6443790882379239} {"step": 22350, "timestamp": 1778218692.4298816, "eos/sharpness": 59.78007316589354, "eos/L0_probe": 2.042646646499634, "eos/L_plus": 2.4403469562530518, "eos/L_minus": 2.2427470684051514, "eos/grad_norm": 0.18019095063209534, "eos/embed_grad_frac": 0.07396285980939865, "eos/time_s": 0.6261112689971924} {"step": 22350, "timestamp": 1778218692.450617, "train/loss": 2.204167437553406, "train/z_loss": 0.0015440179384313523, "train/perplexity": 9.062703159716449, "train/grad_norm": 0.1796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1903254.5074659432, "perf/iters_per_sec": 0.9075424706773487, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1018768072128295, "data/tokens_consumed": 46873444352, "data/tokens_consumed_B": 46.873444352, "train/loss_slope": -1.6159509360665075e-05} {"step": 22350, "timestamp": 1778218693.8196313, "geo/rankme_last": 441.1875, "geo/layer_0/stable_rank_q_proj": 17.511804580688477, "geo/layer_0/stable_rank_k_proj": 15.424851417541504, "geo/layer_0/stable_rank_o_proj": 51.695030212402344, "geo/layer_0/stable_rank_gate_proj": 149.89285278320312, "geo/layer_0/stable_rank_down_proj": 50.21493911743164, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.044063933193683624, "geo/layer_0/attn_entropy_mean": 6.2628960609436035, "geo/layer_0/attn_entropy_std": 0.32623401284217834, "geo/layer_7/stable_rank_q_proj": 42.55560302734375, "geo/layer_7/stable_rank_k_proj": 41.730682373046875, "geo/layer_7/stable_rank_o_proj": 109.91665649414062, "geo/layer_7/stable_rank_gate_proj": 102.3475570678711, "geo/layer_7/stable_rank_down_proj": 149.67294311523438, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5599361658096313, "geo/layer_7/attn_entropy_mean": 4.683985710144043, "geo/layer_7/attn_entropy_std": 0.8437641263008118, "geo/layer_14/stable_rank_q_proj": 57.669281005859375, "geo/layer_14/stable_rank_k_proj": 35.170989990234375, "geo/layer_14/stable_rank_o_proj": 53.84757995605469, "geo/layer_14/stable_rank_gate_proj": 85.4088363647461, "geo/layer_14/stable_rank_down_proj": 135.97845458984375, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3859696090221405, "geo/layer_14/attn_entropy_mean": 5.5046491622924805, "geo/layer_14/attn_entropy_std": 0.46779680252075195, "geo/layer_21/stable_rank_q_proj": 47.313331604003906, "geo/layer_21/stable_rank_k_proj": 31.81293296813965, "geo/layer_21/stable_rank_o_proj": 82.94513702392578, "geo/layer_21/stable_rank_gate_proj": 85.78279876708984, "geo/layer_21/stable_rank_down_proj": 60.22600173950195, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15535306930541992, "geo/layer_21/attn_entropy_mean": 5.750063419342041, "geo/layer_21/attn_entropy_std": 0.288545161485672, "geo/layer_27/stable_rank_q_proj": 41.44273376464844, "geo/layer_27/stable_rank_k_proj": 31.53049659729004, "geo/layer_27/stable_rank_o_proj": 119.92472076416016, "geo/layer_27/stable_rank_gate_proj": 91.02449798583984, "geo/layer_27/stable_rank_down_proj": 137.94203186035156, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07378784567117691, "geo/layer_27/attn_entropy_mean": 4.384988307952881, "geo/layer_27/attn_entropy_std": 0.5480647087097168, "attnres/final_alpha/block_0": 0.2390711009502411, "attnres/block_norm/0": 1.6316087245941162, "attnres/final_alpha/block_1": 0.006532604806125164, "attnres/block_norm/1": 30668.703125, "attnres/final_alpha/block_2": 0.013458811677992344, "attnres/block_norm/2": 21787.8828125, "attnres/final_alpha/block_3": 0.015312975272536278, "attnres/block_norm/3": 31602.87890625, "attnres/final_alpha/block_4": 0.020019246265292168, "attnres/block_norm/4": 9716.8779296875, "attnres/final_alpha/block_5": 0.5729418992996216, "attnres/block_norm/5": 5161.2265625, "attnres/final_alpha/block_6": 0.13266338407993317, "attnres/block_norm/6": 21322.833984375, "geo/tier1_time_s": 1.3644330501556396, "geo/step": 22350.0, "geo/rankme_slope": -1.6440365208583436e-05} {"step": 22360, "timestamp": 1778218704.2050123, "train/loss": 2.2199263334274293, "train/z_loss": 0.001538824150338769, "train/perplexity": 9.206652618357229, "train/grad_norm": 0.12353515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1784795.7629055795, "perf/iters_per_sec": 0.851056939556875, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.175009512901306, "data/tokens_consumed": 46894415872, "data/tokens_consumed_B": 46.894415872, "train/loss_slope": -1.4795209779919642e-05} {"step": 22370, "timestamp": 1778218714.5884664, "train/loss": 2.1950575351715087, "train/z_loss": 0.001539672468788922, "train/perplexity": 8.980517737583838, "train/grad_norm": 0.158203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020921.6068325727, "perf/iters_per_sec": 0.9636505159533371, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0377206087112427, "data/tokens_consumed": 46915387392, "data/tokens_consumed_B": 46.915387392, "train/loss_slope": -1.1946704371212277e-05} {"step": 22380, "timestamp": 1778218724.9690053, "train/loss": 2.2101293325424196, "train/z_loss": 0.0015500779380090534, "train/perplexity": 9.116895428059548, "train/grad_norm": 0.130859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021419.5174695589, "perf/iters_per_sec": 0.9638879382465166, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0374650001525878, "data/tokens_consumed": 46936358912, "data/tokens_consumed_B": 46.936358912, "train/loss_slope": -9.980810121341112e-06} {"step": 22390, "timestamp": 1778218735.341238, "train/loss": 2.2261984825134276, "train/z_loss": 0.001535172329749912, "train/perplexity": 9.26457958955921, "train/grad_norm": 0.1279296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022891.926671725, "perf/iters_per_sec": 0.9645900376661897, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0367098569869995, "data/tokens_consumed": 46957330432, "data/tokens_consumed_B": 46.957330432, "train/loss_slope": -9.341866040852141e-06} {"step": 22400, "timestamp": 1778218745.6781416, "grad/layer_0/attn": 0.002635938348248601, "grad/layer_0/mlp": 0.002647132845595479, "grad/layer_0/attn_mlp_ratio": 0.9957710483088865, "grad/layer_4/attn": 0.0017648435896262527, "grad/layer_4/mlp": 0.0025343492161482573, "grad/layer_4/attn_mlp_ratio": 0.6963695092784166, "grad/layer_8/attn": 0.003621128387749195, "grad/layer_8/mlp": 0.003595180343836546, "grad/layer_8/attn_mlp_ratio": 1.0072174246377907, "grad/layer_12/attn": 0.0043268450535833836, "grad/layer_12/mlp": 0.006282268092036247, "grad/layer_12/attn_mlp_ratio": 0.6887393090075864, "grad/layer_16/attn": 0.005350612569600344, "grad/layer_16/mlp": 0.004553840495646, "grad/layer_16/attn_mlp_ratio": 1.1749670321609744, "grad/layer_20/attn": 0.003883623518049717, "grad/layer_20/mlp": 0.006608097814023495, "grad/layer_20/attn_mlp_ratio": 0.587706715090891, "grad/layer_24/attn": 0.011628672480583191, "grad/layer_24/mlp": 0.010634918697178364, "grad/layer_24/attn_mlp_ratio": 1.093442526676225, "grad/layer_27/attn": 0.010353267192840576, "grad/layer_27/mlp": 0.009450769983232021, "grad/layer_27/attn_mlp_ratio": 1.0954945577619972} {"step": 22400, "timestamp": 1778218745.692319, "train/loss": 2.244925594329834, "train/z_loss": 0.0015337804215960205, "train/perplexity": 9.439713162145615, "train/grad_norm": 0.1552734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027368.2653169152, "perf/iters_per_sec": 0.966724522264917, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344208478927612, "data/tokens_consumed": 46978301952, "data/tokens_consumed_B": 46.978301952, "train/loss_slope": -5.8693382069759045e-06} {"step": 22410, "timestamp": 1778218756.040875, "train/loss": 2.2217067956924437, "train/z_loss": 0.001536075142212212, "train/perplexity": 9.223059317351371, "train/grad_norm": 0.1005859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027548.0898981711, "perf/iters_per_sec": 0.9668102693072181, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034329104423523, "data/tokens_consumed": 46999273472, "data/tokens_consumed_B": 46.999273472, "train/loss_slope": -4.620184599369675e-06} {"step": 22420, "timestamp": 1778218766.3835645, "train/loss": 2.1776909112930296, "train/z_loss": 0.0015429978142492474, "train/perplexity": 8.825902918467447, "train/grad_norm": 0.1591796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028759.0441444628, "perf/iters_per_sec": 0.9673876972887339, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0337117195129395, "data/tokens_consumed": 47020244992, "data/tokens_consumed_B": 47.020244992, "train/loss_slope": -9.255705729569554e-06} {"step": 22425, "timestamp": 1778218772.1557279, "eos/sharpness": 28.63292694091796, "eos/L0_probe": 2.0450873374938965, "eos/L_plus": 2.191384792327881, "eos/L_minus": 2.185119152069092, "eos/grad_norm": 0.14185389876365662, "eos/embed_grad_frac": 0.12033572047948837, "eos/time_s": 0.6088912487030029} {"step": 22425, "timestamp": 1778218773.5375307, "geo/rankme_last": 441.2034606933594, "geo/layer_0/stable_rank_q_proj": 17.546138763427734, "geo/layer_0/stable_rank_k_proj": 15.468448638916016, "geo/layer_0/stable_rank_o_proj": 51.661319732666016, "geo/layer_0/stable_rank_gate_proj": 150.0177459716797, "geo/layer_0/stable_rank_down_proj": 50.25557327270508, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.044337447732686996, "geo/layer_0/attn_entropy_mean": 6.263361930847168, "geo/layer_0/attn_entropy_std": 0.325199156999588, "geo/layer_7/stable_rank_q_proj": 42.65481948852539, "geo/layer_7/stable_rank_k_proj": 41.61391067504883, "geo/layer_7/stable_rank_o_proj": 109.66287231445312, "geo/layer_7/stable_rank_gate_proj": 102.34004974365234, "geo/layer_7/stable_rank_down_proj": 149.62611389160156, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5528979897499084, "geo/layer_7/attn_entropy_mean": 4.68397331237793, "geo/layer_7/attn_entropy_std": 0.8514903783798218, "geo/layer_14/stable_rank_q_proj": 57.64702224731445, "geo/layer_14/stable_rank_k_proj": 35.134037017822266, "geo/layer_14/stable_rank_o_proj": 53.8150634765625, "geo/layer_14/stable_rank_gate_proj": 85.4656982421875, "geo/layer_14/stable_rank_down_proj": 136.48147583007812, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3824123740196228, "geo/layer_14/attn_entropy_mean": 5.500211715698242, "geo/layer_14/attn_entropy_std": 0.4730476438999176, "geo/layer_21/stable_rank_q_proj": 47.3101806640625, "geo/layer_21/stable_rank_k_proj": 31.8576717376709, "geo/layer_21/stable_rank_o_proj": 82.77135467529297, "geo/layer_21/stable_rank_gate_proj": 85.77136993408203, "geo/layer_21/stable_rank_down_proj": 60.31133270263672, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14936792850494385, "geo/layer_21/attn_entropy_mean": 5.744102954864502, "geo/layer_21/attn_entropy_std": 0.3019755780696869, "geo/layer_27/stable_rank_q_proj": 41.45078659057617, "geo/layer_27/stable_rank_k_proj": 31.531225204467773, "geo/layer_27/stable_rank_o_proj": 119.7210464477539, "geo/layer_27/stable_rank_gate_proj": 90.96665954589844, "geo/layer_27/stable_rank_down_proj": 137.92259216308594, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07390126585960388, "geo/layer_27/attn_entropy_mean": 4.386938095092773, "geo/layer_27/attn_entropy_std": 0.5782282948493958, "attnres/final_alpha/block_0": 0.24082261323928833, "attnres/block_norm/0": 1.6320911645889282, "attnres/final_alpha/block_1": 0.006525625474750996, "attnres/block_norm/1": 30610.171875, "attnres/final_alpha/block_2": 0.013758769258856773, "attnres/block_norm/2": 21765.6328125, "attnres/final_alpha/block_3": 0.015574363991618156, "attnres/block_norm/3": 31720.7734375, "attnres/final_alpha/block_4": 0.020434420555830002, "attnres/block_norm/4": 9747.498046875, "attnres/final_alpha/block_5": 0.5700783729553223, "attnres/block_norm/5": 5140.04931640625, "attnres/final_alpha/block_6": 0.13280579447746277, "attnres/block_norm/6": 21331.60546875, "geo/tier1_time_s": 1.3630115985870361, "geo/step": 22425.0, "geo/rankme_slope": -2.808273700105042e-05} {"step": 22430, "timestamp": 1778218778.7254312, "train/loss": 2.223047637939453, "train/z_loss": 0.0015375608811154962, "train/perplexity": 9.235434279512749, "train/grad_norm": 0.2099609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1699948.0458791722, "perf/iters_per_sec": 0.81059839528998, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2336565256118774, "data/tokens_consumed": 47041216512, "data/tokens_consumed_B": 47.041216512, "train/loss_slope": -6.796681712849036e-06} {"step": 22440, "timestamp": 1778218789.0952013, "train/loss": 2.2397342681884767, "train/z_loss": 0.0015389028587378562, "train/perplexity": 9.390835512121667, "train/grad_norm": 0.19921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023231.7782333246, "perf/iters_per_sec": 0.9647520915190337, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0365357160568238, "data/tokens_consumed": 47062188032, "data/tokens_consumed_B": 47.062188032, "train/loss_slope": -4.2744163990450045e-06} {"step": 22450, "timestamp": 1778218799.4540896, "grad/layer_0/attn": 0.002700716257095337, "grad/layer_0/mlp": 0.002647785237058997, "grad/layer_0/attn_mlp_ratio": 1.0199906387030349, "grad/layer_4/attn": 0.0018183034844696522, "grad/layer_4/mlp": 0.0025969126727432013, "grad/layer_4/attn_mlp_ratio": 0.7001788830007257, "grad/layer_8/attn": 0.003816446755081415, "grad/layer_8/mlp": 0.0039029093459248543, "grad/layer_8/attn_mlp_ratio": 0.9778466059637337, "grad/layer_12/attn": 0.00414960877969861, "grad/layer_12/mlp": 0.00634408462792635, "grad/layer_12/attn_mlp_ratio": 0.6540910088152254, "grad/layer_16/attn": 0.005135276820510626, "grad/layer_16/mlp": 0.004275578074157238, "grad/layer_16/attn_mlp_ratio": 1.2010719045086444, "grad/layer_20/attn": 0.003749299328774214, "grad/layer_20/mlp": 0.006670402362942696, "grad/layer_20/attn_mlp_ratio": 0.5620799269014697, "grad/layer_24/attn": 0.016750523820519447, "grad/layer_24/mlp": 0.013417933136224747, "grad/layer_24/attn_mlp_ratio": 1.2483683981447768, "grad/layer_27/attn": 0.006207705941051245, "grad/layer_27/mlp": 0.012350915931165218, "grad/layer_27/attn_mlp_ratio": 0.5026109743914754} {"step": 22450, "timestamp": 1778218799.468307, "train/loss": 2.1872488021850587, "train/z_loss": 0.0015483698691241444, "train/perplexity": 8.910664360625146, "train/grad_norm": 0.18359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022563.211262955, "perf/iters_per_sec": 0.9644332939448143, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0368783473968506, "data/tokens_consumed": 47083159552, "data/tokens_consumed_B": 47.083159552, "train/loss_slope": -8.34586728345229e-06} {"step": 22460, "timestamp": 1778218809.834778, "train/loss": 2.2150726318359375, "train/z_loss": 0.001539403002243489, "train/perplexity": 9.162074545737914, "train/grad_norm": 0.2333984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024071.567019129, "perf/iters_per_sec": 0.9651525340171475, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0361056566238402, "data/tokens_consumed": 47104131072, "data/tokens_consumed_B": 47.104131072, "train/loss_slope": -7.141904476130301e-06} {"step": 22470, "timestamp": 1778218820.203897, "train/loss": 2.2108626842498778, "train/z_loss": 0.0015433221124112606, "train/perplexity": 9.123583771042556, "train/grad_norm": 0.138671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023405.7031851648, "perf/iters_per_sec": 0.9648350253988098, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0364466190338135, "data/tokens_consumed": 47125102592, "data/tokens_consumed_B": 47.125102592, "train/loss_slope": -7.588279079182049e-06} {"step": 22480, "timestamp": 1778218830.5684793, "train/loss": 2.238313341140747, "train/z_loss": 0.0015465402160771192, "train/perplexity": 9.377501295659647, "train/grad_norm": 0.10986328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024266.692569151, "perf/iters_per_sec": 0.9652455771299129, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0360057830810547, "data/tokens_consumed": 47146074112, "data/tokens_consumed_B": 47.146074112, "train/loss_slope": -6.494495470245586e-06} {"step": 22490, "timestamp": 1778218840.9410455, "train/loss": 2.2131853342056274, "train/z_loss": 0.001542841037735343, "train/perplexity": 9.144799291060343, "train/grad_norm": 0.1513671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023365.535361654, "perf/iters_per_sec": 0.9648158718879957, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03646719455719, "data/tokens_consumed": 47167045632, "data/tokens_consumed_B": 47.167045632, "train/loss_slope": -4.937519599394964e-06} {"step": 22500, "timestamp": 1778218851.298107, "grad/layer_0/attn": 0.0031559655908495188, "grad/layer_0/mlp": 0.0029259559232741594, "grad/layer_0/attn_mlp_ratio": 1.0786100562502567, "grad/layer_4/attn": 0.002303904853761196, "grad/layer_4/mlp": 0.0024564170744270086, "grad/layer_4/attn_mlp_ratio": 0.9379126956717407, "grad/layer_8/attn": 0.0034771740902215242, "grad/layer_8/mlp": 0.003724395763128996, "grad/layer_8/attn_mlp_ratio": 0.9336209731744863, "grad/layer_12/attn": 0.0047989399172365665, "grad/layer_12/mlp": 0.00580980721861124, "grad/layer_12/attn_mlp_ratio": 0.8260067251909641, "grad/layer_16/attn": 0.00391609501093626, "grad/layer_16/mlp": 0.0048251403495669365, "grad/layer_16/attn_mlp_ratio": 0.8116022842998769, "grad/layer_20/attn": 0.006328613962978125, "grad/layer_20/mlp": 0.006206434685736895, "grad/layer_20/attn_mlp_ratio": 1.0196858875440071, "grad/layer_24/attn": 0.01425793208181858, "grad/layer_24/mlp": 0.011437037959694862, "grad/layer_24/attn_mlp_ratio": 1.2466455044916567, "grad/layer_27/attn": 0.005118106957525015, "grad/layer_27/mlp": 0.011056220158934593, "grad/layer_27/attn_mlp_ratio": 0.4629165155595597} {"step": 22500, "timestamp": 1778218851.8923867, "eos/sharpness": 44.24400329589843, "eos/L0_probe": 2.044917583465576, "eos/L_plus": 2.3079302310943604, "eos/L_minus": 2.2243449687957764, "eos/grad_norm": 0.17862609028816223, "eos/embed_grad_frac": 0.08045690506696701, "eos/time_s": 0.5915396213531494} {"step": 22500, "timestamp": 1778218851.9117796, "train/loss": 2.2272345304489134, "train/z_loss": 0.001537414058111608, "train/perplexity": 9.274183112111942, "train/grad_norm": 0.1787109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1912470.4040851118, "perf/iters_per_sec": 0.9119369526315269, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0965670347213745, "data/tokens_consumed": 47188017152, "data/tokens_consumed_B": 47.188017152, "train/loss_slope": -1.521722439921324e-06} {"step": 22500, "timestamp": 1778218853.2727928, "geo/rankme_last": 440.1910095214844, "geo/layer_0/stable_rank_q_proj": 17.53550910949707, "geo/layer_0/stable_rank_k_proj": 15.474181175231934, "geo/layer_0/stable_rank_o_proj": 51.442420959472656, "geo/layer_0/stable_rank_gate_proj": 149.96136474609375, "geo/layer_0/stable_rank_down_proj": 50.296329498291016, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.04966267943382263, "geo/layer_0/attn_entropy_mean": 6.261213779449463, "geo/layer_0/attn_entropy_std": 0.33109045028686523, "geo/layer_7/stable_rank_q_proj": 42.57417678833008, "geo/layer_7/stable_rank_k_proj": 41.569557189941406, "geo/layer_7/stable_rank_o_proj": 109.31707763671875, "geo/layer_7/stable_rank_gate_proj": 102.20919799804688, "geo/layer_7/stable_rank_down_proj": 149.27102661132812, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5600946545600891, "geo/layer_7/attn_entropy_mean": 4.676771640777588, "geo/layer_7/attn_entropy_std": 0.8454448580741882, "geo/layer_14/stable_rank_q_proj": 57.803932189941406, "geo/layer_14/stable_rank_k_proj": 35.12260437011719, "geo/layer_14/stable_rank_o_proj": 53.855438232421875, "geo/layer_14/stable_rank_gate_proj": 85.43169403076172, "geo/layer_14/stable_rank_down_proj": 136.50900268554688, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3757505714893341, "geo/layer_14/attn_entropy_mean": 5.497400283813477, "geo/layer_14/attn_entropy_std": 0.4597778022289276, "geo/layer_21/stable_rank_q_proj": 47.27640151977539, "geo/layer_21/stable_rank_k_proj": 31.887481689453125, "geo/layer_21/stable_rank_o_proj": 82.90618896484375, "geo/layer_21/stable_rank_gate_proj": 85.72386932373047, "geo/layer_21/stable_rank_down_proj": 60.15208053588867, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15438388288021088, "geo/layer_21/attn_entropy_mean": 5.762049674987793, "geo/layer_21/attn_entropy_std": 0.2894466519355774, "geo/layer_27/stable_rank_q_proj": 41.401912689208984, "geo/layer_27/stable_rank_k_proj": 31.460689544677734, "geo/layer_27/stable_rank_o_proj": 119.54924774169922, "geo/layer_27/stable_rank_gate_proj": 90.93450164794922, "geo/layer_27/stable_rank_down_proj": 137.88136291503906, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07503394782543182, "geo/layer_27/attn_entropy_mean": 4.384500026702881, "geo/layer_27/attn_entropy_std": 0.5497527122497559, "attnres/final_alpha/block_0": 0.2398199588060379, "attnres/block_norm/0": 1.6328591108322144, "attnres/final_alpha/block_1": 0.006598151288926601, "attnres/block_norm/1": 30671.51171875, "attnres/final_alpha/block_2": 0.01368049718439579, "attnres/block_norm/2": 21863.33984375, "attnres/final_alpha/block_3": 0.015641095116734505, "attnres/block_norm/3": 31815.55859375, "attnres/final_alpha/block_4": 0.020183976739645004, "attnres/block_norm/4": 9799.666015625, "attnres/final_alpha/block_5": 0.5733513832092285, "attnres/block_norm/5": 5165.74609375, "attnres/final_alpha/block_6": 0.1307249814271927, "attnres/block_norm/6": 21474.28515625, "geo/tier1_time_s": 1.357757568359375, "geo/step": 22500.0, "geo/rankme_slope": -4.995173850790317e-05} {"step": 22500, "timestamp": 1778218860.333141, "geo/ww_alpha_mean": 8.35770989376275, "geo/ww_alpha_std": 4.9444538058225005, "geo/ww_alpha_min": 1.3639339237804946, "geo/ww_alpha_max": 26.703265557774984, "geo/ww_alpha_healthy_frac": 0.15228426395939088, "geo/ww_alpha_by_type/q_proj": 4.21173292866165, "geo/ww_alpha_by_type/k_proj": 4.714585458101935, "geo/ww_alpha_by_type/v_proj": 8.959619650997928, "geo/ww_alpha_by_type/o_proj": 9.334377863196824, "geo/ww_alpha_by_type/gate_proj": 9.132129376343153, "geo/ww_alpha_by_type/up_proj": 12.711275516061722, "geo/ww_alpha_by_type/down_proj": 9.582031823754418, "geo/twonn_id/layer_0": 0.7387381196022034, "geo/twonn_id/layer_7": 3.410254955291748, "geo/twonn_id/layer_14": 3.7988743782043457, "geo/twonn_id/layer_21": 7.715319633483887, "geo/twonn_id/layer_27": 5.852889537811279, "geo/tier2_time_s": 7.053406715393066} {"step": 22500, "timestamp": 1778218860.975452, "eoc/jacobian_sigma/layer_0/attn": 859.74267578125, "eoc/jacobian_sigma/layer_0/mlp": 5670.76025390625, "eoc/jacobian_sigma/layer_0": 5670.76025390625, "eoc/jacobian_sigma/layer_7/attn": 1.146222472190857, "eoc/jacobian_sigma/layer_7/mlp": 1.738767147064209, "eoc/jacobian_sigma/layer_7": 1.738767147064209, "eoc/jacobian_sigma/layer_14/attn": 1.6321464776992798, "eoc/jacobian_sigma/layer_14/mlp": 6.307745933532715, "eoc/jacobian_sigma/layer_14": 6.307745933532715, "eoc/jacobian_sigma/layer_21/attn": 1.0751233100891113, "eoc/jacobian_sigma/layer_21/mlp": 3.729261636734009, "eoc/jacobian_sigma/layer_21": 3.729261636734009, "eoc/jacobian_sigma/layer_27/attn": 3.577416181564331, "eoc/jacobian_sigma/layer_27/mlp": 22.291685104370117, "eoc/jacobian_sigma/layer_27": 22.291685104370117, "eoc/layer0_sigma": 5670.76025390625, "eoc/sigma_max": 22.291685104370117, "eoc/sigma_min": 1.738767147064209, "eoc/sigma_mean": 8.516864955425262, "eoc/time_s": 0.6328434944152832} {"step": 22510, "timestamp": 1778218871.358464, "train/loss": 2.22710063457489, "train/z_loss": 0.001526461320463568, "train/perplexity": 9.272941420388848, "train/grad_norm": 0.13671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1078590.7159885599, "perf/iters_per_sec": 0.5143121318762588, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.9443445682525635, "data/tokens_consumed": 47208988672, "data/tokens_consumed_B": 47.208988672, "train/loss_slope": 9.588779789386046e-07} {"step": 22520, "timestamp": 1778218881.7280927, "train/loss": 2.1809219121932983, "train/z_loss": 0.0015558865270577371, "train/perplexity": 8.854465536817536, "train/grad_norm": 0.1650390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023330.767977021, "perf/iters_per_sec": 0.9647992935071091, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0364850044250489, "data/tokens_consumed": 47229960192, "data/tokens_consumed_B": 47.229960192, "train/loss_slope": 9.308722868288797e-07} {"step": 22530, "timestamp": 1778218892.0854807, "train/loss": 2.2387704372406008, "train/z_loss": 0.0015418639057315886, "train/perplexity": 9.381788694730314, "train/grad_norm": 0.1826171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026321.5889031363, "perf/iters_per_sec": 0.9662254280582124, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349551677703857, "data/tokens_consumed": 47250931712, "data/tokens_consumed_B": 47.250931712, "train/loss_slope": 1.274306173979805e-06} {"step": 22540, "timestamp": 1778218902.4375577, "train/loss": 2.25212676525116, "train/z_loss": 0.0015221500885672866, "train/perplexity": 9.50793549560035, "train/grad_norm": 0.12255859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027271.2162908649, "perf/iters_per_sec": 0.9666782456831288, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344703674316407, "data/tokens_consumed": 47271903232, "data/tokens_consumed_B": 47.271903232, "train/loss_slope": 1.7341374277961127e-06} {"step": 22550, "timestamp": 1778218912.783396, "grad/layer_0/attn": 0.0027940988074988127, "grad/layer_0/mlp": 0.002780729439109564, "grad/layer_0/attn_mlp_ratio": 1.0048078276586114, "grad/layer_4/attn": 0.0018465337343513966, "grad/layer_4/mlp": 0.002501090057194233, "grad/layer_4/attn_mlp_ratio": 0.7382915522017289, "grad/layer_8/attn": 0.0037081518676131964, "grad/layer_8/mlp": 0.003781223203986883, "grad/layer_8/attn_mlp_ratio": 0.9806751861767484, "grad/layer_12/attn": 0.004083871375769377, "grad/layer_12/mlp": 0.006208292208611965, "grad/layer_12/attn_mlp_ratio": 0.6578091321673671, "grad/layer_16/attn": 0.006581474095582962, "grad/layer_16/mlp": 0.004751398228108883, "grad/layer_16/attn_mlp_ratio": 1.3851657219828322, "grad/layer_20/attn": 0.004183626733720303, "grad/layer_20/mlp": 0.00618811696767807, "grad/layer_20/attn_mlp_ratio": 0.6760742707296745, "grad/layer_24/attn": 0.005851661786437035, "grad/layer_24/mlp": 0.008662715554237366, "grad/layer_24/attn_mlp_ratio": 0.6754996954765213, "grad/layer_27/attn": 0.006225433200597763, "grad/layer_27/mlp": 0.006791964638978243, "grad/layer_27/attn_mlp_ratio": 0.9165879741499209} {"step": 22550, "timestamp": 1778218912.7976635, "train/loss": 2.198595952987671, "train/z_loss": 0.0015293184667825698, "train/perplexity": 9.01235084775335, "train/grad_norm": 0.11376953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025241.2994939873, "perf/iters_per_sec": 0.9657103059263169, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0355072259902953, "data/tokens_consumed": 47292874752, "data/tokens_consumed_B": 47.292874752, "train/loss_slope": 9.525447717272246e-07} {"step": 22560, "timestamp": 1778218923.1551466, "train/loss": 2.1990299224853516, "train/z_loss": 0.001532131270505488, "train/perplexity": 9.016262781892323, "train/grad_norm": 0.287109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026326.2101919858, "perf/iters_per_sec": 0.9662276316604547, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349528074264527, "data/tokens_consumed": 47313846272, "data/tokens_consumed_B": 47.313846272, "train/loss_slope": -1.4452252844379342e-06} {"step": 22570, "timestamp": 1778218933.5059495, "train/loss": 2.201814818382263, "train/z_loss": 0.0015288154827430844, "train/perplexity": 9.041407131065625, "train/grad_norm": 0.1455078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027501.7754762378, "perf/iters_per_sec": 0.9667881848698796, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343527317047119, "data/tokens_consumed": 47334817792, "data/tokens_consumed_B": 47.334817792, "train/loss_slope": -3.6209580278096204e-06} {"step": 22575, "timestamp": 1778218939.305776, "eos/sharpness": 58.62193107604979, "eos/L0_probe": 2.044048547744751, "eos/L_plus": 2.2968215942382812, "eos/L_minus": 2.3774948120117188, "eos/grad_norm": 0.2724773585796356, "eos/embed_grad_frac": 0.03363452106714249, "eos/time_s": 0.6355259418487549} {"step": 22575, "timestamp": 1778218940.68751, "geo/rankme_last": 440.7833251953125, "geo/layer_0/stable_rank_q_proj": 17.5494384765625, "geo/layer_0/stable_rank_k_proj": 15.474672317504883, "geo/layer_0/stable_rank_o_proj": 51.289886474609375, "geo/layer_0/stable_rank_gate_proj": 150.017333984375, "geo/layer_0/stable_rank_down_proj": 50.3392448425293, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.049856871366500854, "geo/layer_0/attn_entropy_mean": 6.264552593231201, "geo/layer_0/attn_entropy_std": 0.3254424035549164, "geo/layer_7/stable_rank_q_proj": 42.596736907958984, "geo/layer_7/stable_rank_k_proj": 41.54521942138672, "geo/layer_7/stable_rank_o_proj": 109.1627426147461, "geo/layer_7/stable_rank_gate_proj": 102.08309173583984, "geo/layer_7/stable_rank_down_proj": 149.17489624023438, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5530903339385986, "geo/layer_7/attn_entropy_mean": 4.6747541427612305, "geo/layer_7/attn_entropy_std": 0.8694443702697754, "geo/layer_14/stable_rank_q_proj": 57.63911819458008, "geo/layer_14/stable_rank_k_proj": 35.1967887878418, "geo/layer_14/stable_rank_o_proj": 53.83967590332031, "geo/layer_14/stable_rank_gate_proj": 85.1961441040039, "geo/layer_14/stable_rank_down_proj": 136.56419372558594, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3819255828857422, "geo/layer_14/attn_entropy_mean": 5.522466659545898, "geo/layer_14/attn_entropy_std": 0.4668791890144348, "geo/layer_21/stable_rank_q_proj": 47.35256576538086, "geo/layer_21/stable_rank_k_proj": 31.85532569885254, "geo/layer_21/stable_rank_o_proj": 82.81353759765625, "geo/layer_21/stable_rank_gate_proj": 85.73783874511719, "geo/layer_21/stable_rank_down_proj": 59.95108413696289, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15361659228801727, "geo/layer_21/attn_entropy_mean": 5.748627185821533, "geo/layer_21/attn_entropy_std": 0.2971954643726349, "geo/layer_27/stable_rank_q_proj": 41.350067138671875, "geo/layer_27/stable_rank_k_proj": 31.41497230529785, "geo/layer_27/stable_rank_o_proj": 119.63339233398438, "geo/layer_27/stable_rank_gate_proj": 90.85955047607422, "geo/layer_27/stable_rank_down_proj": 137.51385498046875, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08176105469465256, "geo/layer_27/attn_entropy_mean": 4.406346321105957, "geo/layer_27/attn_entropy_std": 0.5580313801765442, "attnres/final_alpha/block_0": 0.2427465319633484, "attnres/block_norm/0": 1.633629322052002, "attnres/final_alpha/block_1": 0.00679149990901351, "attnres/block_norm/1": 30642.25, "attnres/final_alpha/block_2": 0.014125167392194271, "attnres/block_norm/2": 21666.796875, "attnres/final_alpha/block_3": 0.01578722894191742, "attnres/block_norm/3": 31647.072265625, "attnres/final_alpha/block_4": 0.020489491522312164, "attnres/block_norm/4": 9819.107421875, "attnres/final_alpha/block_5": 0.5627027750015259, "attnres/block_norm/5": 5239.7421875, "attnres/final_alpha/block_6": 0.13735729455947876, "attnres/block_norm/6": 21374.298828125, "geo/tier1_time_s": 1.3634791374206543, "geo/step": 22575.0, "geo/rankme_slope": -1.3445065526210488e-05} {"step": 22580, "timestamp": 1778218945.8782704, "train/loss": 2.214870810508728, "train/z_loss": 0.0015366543782874941, "train/perplexity": 9.160225630274663, "train/grad_norm": 0.16796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1695715.7781788942, "perf/iters_per_sec": 0.8085802927870246, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2367355585098267, "data/tokens_consumed": 47355789312, "data/tokens_consumed_B": 47.355789312, "train/loss_slope": -6.807993082824202e-06} {"step": 22590, "timestamp": 1778218956.2440867, "train/loss": 2.242887043952942, "train/z_loss": 0.0015163042698986828, "train/perplexity": 9.420489432249397, "train/grad_norm": 0.138671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024100.9103060556, "perf/iters_per_sec": 0.9651665259866979, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0360906362533568, "data/tokens_consumed": 47376760832, "data/tokens_consumed_B": 47.376760832, "train/loss_slope": -5.015318822188297e-06} {"step": 22600, "timestamp": 1778218966.6011004, "grad/layer_0/attn": 0.0025311692152172327, "grad/layer_0/mlp": 0.0026316875591874123, "grad/layer_0/attn_mlp_ratio": 0.9618045691633419, "grad/layer_4/attn": 0.0018194817239418626, "grad/layer_4/mlp": 0.0024425657466053963, "grad/layer_4/attn_mlp_ratio": 0.7449059055953489, "grad/layer_8/attn": 0.006104928907006979, "grad/layer_8/mlp": 0.0036463334690779448, "grad/layer_8/attn_mlp_ratio": 1.6742650641671117, "grad/layer_12/attn": 0.0035793110728263855, "grad/layer_12/mlp": 0.006772124674171209, "grad/layer_12/attn_mlp_ratio": 0.5285359015353392, "grad/layer_16/attn": 0.005029506981372833, "grad/layer_16/mlp": 0.004304877482354641, "grad/layer_16/attn_mlp_ratio": 1.1683275273583602, "grad/layer_20/attn": 0.004930454306304455, "grad/layer_20/mlp": 0.005881862249225378, "grad/layer_20/attn_mlp_ratio": 0.8382471424129434, "grad/layer_24/attn": 0.014436892233788967, "grad/layer_24/mlp": 0.012597437016665936, "grad/layer_24/attn_mlp_ratio": 1.1460182019634377, "grad/layer_27/attn": 0.006001325324177742, "grad/layer_27/mlp": 0.011965947225689888, "grad/layer_27/attn_mlp_ratio": 0.5015336572051757} {"step": 22600, "timestamp": 1778218966.615323, "train/loss": 2.1868940114974977, "train/z_loss": 0.0015344523941166698, "train/perplexity": 8.907503500645014, "train/grad_norm": 0.1689453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023067.9800940128, "perf/iters_per_sec": 0.9646739864797653, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0366196393966676, "data/tokens_consumed": 47397732352, "data/tokens_consumed_B": 47.397732352, "train/loss_slope": -6.249504178056029e-06} {"step": 22610, "timestamp": 1778218976.9796753, "train/loss": 2.2045310974121093, "train/z_loss": 0.0015431670239195228, "train/perplexity": 9.065999500404047, "train/grad_norm": 0.11669921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024448.9493064564, "perf/iters_per_sec": 0.9653324839145929, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0359125137329102, "data/tokens_consumed": 47418703872, "data/tokens_consumed_B": 47.418703872, "train/loss_slope": -4.83303836899191e-06} {"step": 22620, "timestamp": 1778218987.3483703, "train/loss": 2.204687476158142, "train/z_loss": 0.0015453060157597065, "train/perplexity": 9.067417340894623, "train/grad_norm": 0.171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023792.568841121, "perf/iters_per_sec": 0.9650194973188023, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0362484931945801, "data/tokens_consumed": 47439675392, "data/tokens_consumed_B": 47.439675392, "train/loss_slope": -7.902806691496771e-06} {"step": 22630, "timestamp": 1778218997.7177186, "train/loss": 2.1620859622955324, "train/z_loss": 0.0015509874559938908, "train/perplexity": 8.689244202897386, "train/grad_norm": 0.11572265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023542.881576028, "perf/iters_per_sec": 0.9649004371528759, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0363763570785522, "data/tokens_consumed": 47460646912, "data/tokens_consumed_B": 47.460646912, "train/loss_slope": -1.2349398959480584e-05} {"step": 22640, "timestamp": 1778219008.0823479, "train/loss": 2.232211780548096, "train/z_loss": 0.0015312498435378075, "train/perplexity": 9.320458106505855, "train/grad_norm": 0.109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024254.3942167507, "perf/iters_per_sec": 0.9652397128185037, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0360120773315429, "data/tokens_consumed": 47481618432, "data/tokens_consumed_B": 47.481618432, "train/loss_slope": -9.414533176759728e-06} {"step": 22650, "timestamp": 1778219018.444877, "grad/layer_0/attn": 0.0030412666965276003, "grad/layer_0/mlp": 0.002838511485606432, "grad/layer_0/attn_mlp_ratio": 1.0714300804510724, "grad/layer_4/attn": 0.001524405088275671, "grad/layer_4/mlp": 0.0024629782419651747, "grad/layer_4/attn_mlp_ratio": 0.618927524575539, "grad/layer_8/attn": 0.003573372960090637, "grad/layer_8/mlp": 0.003770421724766493, "grad/layer_8/attn_mlp_ratio": 0.947738244198165, "grad/layer_12/attn": 0.003987573087215424, "grad/layer_12/mlp": 0.005873870104551315, "grad/layer_12/attn_mlp_ratio": 0.6788663944473422, "grad/layer_16/attn": 0.004090167582035065, "grad/layer_16/mlp": 0.004893655423074961, "grad/layer_16/attn_mlp_ratio": 0.835810277766544, "grad/layer_20/attn": 0.007097064517438412, "grad/layer_20/mlp": 0.006166472565382719, "grad/layer_20/attn_mlp_ratio": 1.150911534446563, "grad/layer_24/attn": 0.009844784624874592, "grad/layer_24/mlp": 0.009544030763208866, "grad/layer_24/attn_mlp_ratio": 1.031512236912927, "grad/layer_27/attn": 0.004533611238002777, "grad/layer_27/mlp": 0.009118870832026005, "grad/layer_27/attn_mlp_ratio": 0.49716804545181914} {"step": 22650, "timestamp": 1778219019.0395112, "eos/sharpness": 45.34730911254882, "eos/L0_probe": 2.0429155826568604, "eos/L_plus": 2.324810266494751, "eos/L_minus": 2.214493989944458, "eos/grad_norm": 0.14394928514957428, "eos/embed_grad_frac": 0.132394477725029, "eos/time_s": 0.5918662548065186} {"step": 22650, "timestamp": 1778219019.057903, "train/loss": 2.2199039697647094, "train/z_loss": 0.0015262206783518196, "train/perplexity": 9.206446726185552, "train/grad_norm": 0.1435546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1911769.4315273743, "perf/iters_per_sec": 0.9116027028691169, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0969691038131715, "data/tokens_consumed": 47502589952, "data/tokens_consumed_B": 47.502589952, "train/loss_slope": -7.595341319333946e-06} {"step": 22650, "timestamp": 1778219020.4241502, "geo/rankme_last": 439.4953308105469, "geo/layer_0/stable_rank_q_proj": 17.566936492919922, "geo/layer_0/stable_rank_k_proj": 15.493834495544434, "geo/layer_0/stable_rank_o_proj": 51.37446594238281, "geo/layer_0/stable_rank_gate_proj": 150.0840606689453, "geo/layer_0/stable_rank_down_proj": 50.373504638671875, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.048198431730270386, "geo/layer_0/attn_entropy_mean": 6.259127616882324, "geo/layer_0/attn_entropy_std": 0.3291894793510437, "geo/layer_7/stable_rank_q_proj": 42.475440979003906, "geo/layer_7/stable_rank_k_proj": 41.581626892089844, "geo/layer_7/stable_rank_o_proj": 109.27062225341797, "geo/layer_7/stable_rank_gate_proj": 101.98451232910156, "geo/layer_7/stable_rank_down_proj": 149.0702362060547, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5435267686843872, "geo/layer_7/attn_entropy_mean": 4.659360408782959, "geo/layer_7/attn_entropy_std": 0.8345462083816528, "geo/layer_14/stable_rank_q_proj": 57.607940673828125, "geo/layer_14/stable_rank_k_proj": 35.126163482666016, "geo/layer_14/stable_rank_o_proj": 53.92877960205078, "geo/layer_14/stable_rank_gate_proj": 85.30463409423828, "geo/layer_14/stable_rank_down_proj": 136.6259002685547, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3715190887451172, "geo/layer_14/attn_entropy_mean": 5.473530292510986, "geo/layer_14/attn_entropy_std": 0.46048831939697266, "geo/layer_21/stable_rank_q_proj": 47.29084014892578, "geo/layer_21/stable_rank_k_proj": 31.769750595092773, "geo/layer_21/stable_rank_o_proj": 82.82939147949219, "geo/layer_21/stable_rank_gate_proj": 85.54469299316406, "geo/layer_21/stable_rank_down_proj": 59.94023895263672, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15779003500938416, "geo/layer_21/attn_entropy_mean": 5.76436710357666, "geo/layer_21/attn_entropy_std": 0.28867900371551514, "geo/layer_27/stable_rank_q_proj": 41.40073013305664, "geo/layer_27/stable_rank_k_proj": 31.490459442138672, "geo/layer_27/stable_rank_o_proj": 119.50907135009766, "geo/layer_27/stable_rank_gate_proj": 90.67459106445312, "geo/layer_27/stable_rank_down_proj": 137.1658477783203, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07928258925676346, "geo/layer_27/attn_entropy_mean": 4.4153571128845215, "geo/layer_27/attn_entropy_std": 0.5383852124214172, "attnres/final_alpha/block_0": 0.2410801351070404, "attnres/block_norm/0": 1.634139895439148, "attnres/final_alpha/block_1": 0.006631192751228809, "attnres/block_norm/1": 30767.13671875, "attnres/final_alpha/block_2": 0.013688328675925732, "attnres/block_norm/2": 21867.12109375, "attnres/final_alpha/block_3": 0.015856420621275902, "attnres/block_norm/3": 31860.689453125, "attnres/final_alpha/block_4": 0.02045353502035141, "attnres/block_norm/4": 9807.224609375, "attnres/final_alpha/block_5": 0.5712016820907593, "attnres/block_norm/5": 5206.318359375, "attnres/final_alpha/block_6": 0.1310887336730957, "attnres/block_norm/6": 21578.5859375, "geo/tier1_time_s": 1.3624520301818848, "geo/step": 22650.0, "geo/rankme_slope": -5.8322899472288916e-05} {"step": 22660, "timestamp": 1778219030.7853978, "train/loss": 2.203296184539795, "train/z_loss": 0.001533065887633711, "train/perplexity": 9.054810690944937, "train/grad_norm": 0.169921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1788818.178846981, "perf/iters_per_sec": 0.8529749769434838, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1723673343658447, "data/tokens_consumed": 47523561472, "data/tokens_consumed_B": 47.523561472, "train/loss_slope": -8.530312975068843e-06} {"step": 22670, "timestamp": 1778219041.1478684, "train/loss": 2.213227415084839, "train/z_loss": 0.0015370860579423606, "train/perplexity": 9.145184120351642, "train/grad_norm": 0.09033203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024755.9993887125, "perf/iters_per_sec": 0.965478896803242, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03575541973114, "data/tokens_consumed": 47544532992, "data/tokens_consumed_B": 47.544532992, "train/loss_slope": -9.674615370701487e-06} {"step": 22680, "timestamp": 1778219051.5100296, "train/loss": 2.199429440498352, "train/z_loss": 0.0015306798741221428, "train/perplexity": 9.01986566094325, "train/grad_norm": 0.146484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024990.92831799, "perf/iters_per_sec": 0.9655909196462583, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0356352567672729, "data/tokens_consumed": 47565504512, "data/tokens_consumed_B": 47.565504512, "train/loss_slope": -9.070257277879275e-06} {"step": 22690, "timestamp": 1778219061.8734274, "train/loss": 2.262778472900391, "train/z_loss": 0.0015280131134204566, "train/perplexity": 9.609752544899811, "train/grad_norm": 0.1748046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024826.6587865923, "perf/iters_per_sec": 0.9655125898297273, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0357192754745483, "data/tokens_consumed": 47586476032, "data/tokens_consumed_B": 47.586476032, "train/loss_slope": -3.951587642666239e-06} {"step": 22700, "timestamp": 1778219072.2253199, "grad/layer_0/attn": 0.0023726497311145067, "grad/layer_0/mlp": 0.002602118067443371, "grad/layer_0/attn_mlp_ratio": 0.9118147518433719, "grad/layer_4/attn": 0.0026672924868762493, "grad/layer_4/mlp": 0.00242306268773973, "grad/layer_4/attn_mlp_ratio": 1.1007937971612947, "grad/layer_8/attn": 0.007013573311269283, "grad/layer_8/mlp": 0.0035740870516747236, "grad/layer_8/attn_mlp_ratio": 1.9623397565957243, "grad/layer_12/attn": 0.006585342343896627, "grad/layer_12/mlp": 0.00624852254986763, "grad/layer_12/attn_mlp_ratio": 1.053903892632306, "grad/layer_16/attn": 0.0038804374635219574, "grad/layer_16/mlp": 0.004293115343898535, "grad/layer_16/attn_mlp_ratio": 0.9038744739643366, "grad/layer_20/attn": 0.003974526189267635, "grad/layer_20/mlp": 0.0058058928698301315, "grad/layer_20/attn_mlp_ratio": 0.6845675953588793, "grad/layer_24/attn": 0.010997864417731762, "grad/layer_24/mlp": 0.010464106686413288, "grad/layer_24/attn_mlp_ratio": 1.051008427399796, "grad/layer_27/attn": 0.0076983324252069, "grad/layer_27/mlp": 0.008303118869662285, "grad/layer_27/attn_mlp_ratio": 0.9271615224754531} {"step": 22700, "timestamp": 1778219072.2393975, "train/loss": 2.240063428878784, "train/z_loss": 0.0015241599758155644, "train/perplexity": 9.393927114810525, "train/grad_norm": 0.158203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024672.0628059707, "perf/iters_per_sec": 0.9654388727216581, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0357983589172364, "data/tokens_consumed": 47607447552, "data/tokens_consumed_B": 47.607447552, "train/loss_slope": -3.614326800950962e-06} {"step": 22710, "timestamp": 1778219082.6054015, "train/loss": 2.2040616273880005, "train/z_loss": 0.00154198034433648, "train/perplexity": 9.061744284326375, "train/grad_norm": 0.1650390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024216.7081507805, "perf/iters_per_sec": 0.9652217427019026, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0360313653945923, "data/tokens_consumed": 47628419072, "data/tokens_consumed_B": 47.628419072, "train/loss_slope": -2.8154391910519584e-06} {"step": 22720, "timestamp": 1778219092.9715555, "train/loss": 2.1911929845809937, "train/z_loss": 0.0015372984576970338, "train/perplexity": 8.945879047053907, "train/grad_norm": 0.1015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024192.8581146712, "perf/iters_per_sec": 0.9652103701184612, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0360435724258423, "data/tokens_consumed": 47649390592, "data/tokens_consumed_B": 47.649390592, "train/loss_slope": -3.84352117767928e-06} {"step": 22725, "timestamp": 1778219098.7394269, "eos/sharpness": 21.153020858764645, "eos/L0_probe": 2.04549503326416, "eos/L_plus": 2.1619808673858643, "eos/L_minus": 2.1405394077301025, "eos/grad_norm": 0.14278088510036469, "eos/embed_grad_frac": 0.12873847782611847, "eos/time_s": 0.5953774452209473} {"step": 22725, "timestamp": 1778219100.114256, "geo/rankme_last": 440.5727844238281, "geo/layer_0/stable_rank_q_proj": 17.570863723754883, "geo/layer_0/stable_rank_k_proj": 15.485071182250977, "geo/layer_0/stable_rank_o_proj": 51.3679313659668, "geo/layer_0/stable_rank_gate_proj": 149.84512329101562, "geo/layer_0/stable_rank_down_proj": 50.392494201660156, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05023306608200073, "geo/layer_0/attn_entropy_mean": 6.25702428817749, "geo/layer_0/attn_entropy_std": 0.32676833868026733, "geo/layer_7/stable_rank_q_proj": 42.46698760986328, "geo/layer_7/stable_rank_k_proj": 41.43833541870117, "geo/layer_7/stable_rank_o_proj": 109.18627166748047, "geo/layer_7/stable_rank_gate_proj": 102.1424789428711, "geo/layer_7/stable_rank_down_proj": 148.8748321533203, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5464178919792175, "geo/layer_7/attn_entropy_mean": 4.673653602600098, "geo/layer_7/attn_entropy_std": 0.8539458513259888, "geo/layer_14/stable_rank_q_proj": 57.77849578857422, "geo/layer_14/stable_rank_k_proj": 35.10380554199219, "geo/layer_14/stable_rank_o_proj": 54.052032470703125, "geo/layer_14/stable_rank_gate_proj": 85.30152130126953, "geo/layer_14/stable_rank_down_proj": 136.84561157226562, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3658904731273651, "geo/layer_14/attn_entropy_mean": 5.513245105743408, "geo/layer_14/attn_entropy_std": 0.4397547245025635, "geo/layer_21/stable_rank_q_proj": 47.18646240234375, "geo/layer_21/stable_rank_k_proj": 31.691272735595703, "geo/layer_21/stable_rank_o_proj": 82.73223876953125, "geo/layer_21/stable_rank_gate_proj": 85.38301086425781, "geo/layer_21/stable_rank_down_proj": 59.81570053100586, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1583276093006134, "geo/layer_21/attn_entropy_mean": 5.77005672454834, "geo/layer_21/attn_entropy_std": 0.27003082633018494, "geo/layer_27/stable_rank_q_proj": 41.40485382080078, "geo/layer_27/stable_rank_k_proj": 31.421546936035156, "geo/layer_27/stable_rank_o_proj": 119.68201446533203, "geo/layer_27/stable_rank_gate_proj": 90.59346771240234, "geo/layer_27/stable_rank_down_proj": 137.05194091796875, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08170185983181, "geo/layer_27/attn_entropy_mean": 4.389745235443115, "geo/layer_27/attn_entropy_std": 0.5694131851196289, "attnres/final_alpha/block_0": 0.24034883081912994, "attnres/block_norm/0": 1.6348627805709839, "attnres/final_alpha/block_1": 0.006593788042664528, "attnres/block_norm/1": 30751.04296875, "attnres/final_alpha/block_2": 0.01365805696696043, "attnres/block_norm/2": 21951.35546875, "attnres/final_alpha/block_3": 0.015652496367692947, "attnres/block_norm/3": 32119.330078125, "attnres/final_alpha/block_4": 0.020095879212021828, "attnres/block_norm/4": 9835.8232421875, "attnres/final_alpha/block_5": 0.5728127956390381, "attnres/block_norm/5": 5160.59765625, "attnres/final_alpha/block_6": 0.13083814084529877, "attnres/block_norm/6": 21610.33984375, "geo/tier1_time_s": 1.3576021194458008, "geo/step": 22725.0, "geo/rankme_slope": -6.997957386079432e-05} {"step": 22730, "timestamp": 1778219105.2984023, "train/loss": 2.2229602575302123, "train/z_loss": 0.001531085546594113, "train/perplexity": 9.234627318742683, "train/grad_norm": 0.1298828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1702276.6668869643, "perf/iters_per_sec": 0.8117087683138677, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2319689512252807, "data/tokens_consumed": 47670362112, "data/tokens_consumed_B": 47.670362112, "train/loss_slope": -4.395446339563453e-06} {"step": 22740, "timestamp": 1778219115.6699173, "train/loss": 2.1770547151565554, "train/z_loss": 0.0015484459698200225, "train/perplexity": 8.820289698873458, "train/grad_norm": 0.130859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023105.065020428, "perf/iters_per_sec": 0.9646916699506893, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0366006374359131, "data/tokens_consumed": 47691333632, "data/tokens_consumed_B": 47.691333632, "train/loss_slope": -6.608947676078628e-06} {"step": 22750, "timestamp": 1778219126.0229416, "grad/layer_0/attn": 0.0030395700596272945, "grad/layer_0/mlp": 0.002900263061746955, "grad/layer_0/attn_mlp_ratio": 1.0480325026079458, "grad/layer_4/attn": 0.001752058626152575, "grad/layer_4/mlp": 0.0025194897316396236, "grad/layer_4/attn_mlp_ratio": 0.6954021421917695, "grad/layer_8/attn": 0.004929002374410629, "grad/layer_8/mlp": 0.0037673180922865868, "grad/layer_8/attn_mlp_ratio": 1.308358392583493, "grad/layer_12/attn": 0.0048768725246191025, "grad/layer_12/mlp": 0.006299672182649374, "grad/layer_12/attn_mlp_ratio": 0.7741470200046815, "grad/layer_16/attn": 0.004485048819333315, "grad/layer_16/mlp": 0.004389944486320019, "grad/layer_16/attn_mlp_ratio": 1.0216641078590516, "grad/layer_20/attn": 0.006098720245063305, "grad/layer_20/mlp": 0.005902217235416174, "grad/layer_20/attn_mlp_ratio": 1.0332930657209145, "grad/layer_24/attn": 0.00537516176700592, "grad/layer_24/mlp": 0.009722653776407242, "grad/layer_24/attn_mlp_ratio": 0.5528492359528666, "grad/layer_27/attn": 0.0067445444874465466, "grad/layer_27/mlp": 0.00757216801866889, "grad/layer_27/attn_mlp_ratio": 0.890701894325105} {"step": 22750, "timestamp": 1778219126.0372996, "train/loss": 2.232652521133423, "train/z_loss": 0.0015255534672178328, "train/perplexity": 9.324566916060284, "train/grad_norm": 0.11279296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023911.1255187597, "perf/iters_per_sec": 0.9650760295480536, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0361877918243407, "data/tokens_consumed": 47712305152, "data/tokens_consumed_B": 47.712305152, "train/loss_slope": -5.813916967754176e-06} {"step": 22760, "timestamp": 1778219136.3970375, "train/loss": 2.141341197490692, "train/z_loss": 0.001552126370370388, "train/perplexity": 8.510844701164705, "train/grad_norm": 0.21484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025546.677593511, "perf/iters_per_sec": 0.9658559215514713, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0353511095046997, "data/tokens_consumed": 47733276672, "data/tokens_consumed_B": 47.733276672, "train/loss_slope": -1.1048496824608443e-05} {"step": 22770, "timestamp": 1778219146.7692702, "train/loss": 2.2273839473724366, "train/z_loss": 0.0015177275286987423, "train/perplexity": 9.275568935550904, "train/grad_norm": 0.1376953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022948.0333944804, "perf/iters_per_sec": 0.9646167914364244, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03668110370636, "data/tokens_consumed": 47754248192, "data/tokens_consumed_B": 47.754248192, "train/loss_slope": -1.3624795585504122e-05} {"step": 22780, "timestamp": 1778219157.134994, "train/loss": 2.1790739059448243, "train/z_loss": 0.001529800333082676, "train/perplexity": 8.8381175394318, "train/grad_norm": 0.20703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024364.0598003154, "perf/iters_per_sec": 0.9652920054437234, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0359559535980225, "data/tokens_consumed": 47775219712, "data/tokens_consumed_B": 47.775219712, "train/loss_slope": -1.5935060758330288e-05} {"step": 22790, "timestamp": 1778219167.4978445, "train/loss": 2.1829179883003236, "train/z_loss": 0.0015248584561049939, "train/perplexity": 8.872157375169806, "train/grad_norm": 0.146484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024878.677695463, "perf/iters_per_sec": 0.965537394378406, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0356926679611207, "data/tokens_consumed": 47796191232, "data/tokens_consumed_B": 47.796191232, "train/loss_slope": -1.950760696015125e-05} {"step": 22800, "timestamp": 1778219177.8526268, "grad/layer_0/attn": 0.003086875891312957, "grad/layer_0/mlp": 0.0026105770375579596, "grad/layer_0/attn_mlp_ratio": 1.1824495997082647, "grad/layer_4/attn": 0.001695265993475914, "grad/layer_4/mlp": 0.002397409174591303, "grad/layer_4/attn_mlp_ratio": 0.7071241491567657, "grad/layer_8/attn": 0.00498098973184824, "grad/layer_8/mlp": 0.0036979380529373884, "grad/layer_8/attn_mlp_ratio": 1.3469640447857905, "grad/layer_12/attn": 0.0038338080048561096, "grad/layer_12/mlp": 0.006166827864944935, "grad/layer_12/attn_mlp_ratio": 0.6216823343620456, "grad/layer_16/attn": 0.003660618793219328, "grad/layer_16/mlp": 0.004575860686600208, "grad/layer_16/attn_mlp_ratio": 0.799984738158765, "grad/layer_20/attn": 0.004963587503880262, "grad/layer_20/mlp": 0.006726688239723444, "grad/layer_20/attn_mlp_ratio": 0.7378946746452553, "grad/layer_24/attn": 0.007350509520620108, "grad/layer_24/mlp": 0.009239304810762405, "grad/layer_24/attn_mlp_ratio": 0.7955695359786066, "grad/layer_27/attn": 0.005143461748957634, "grad/layer_27/mlp": 0.007826000452041626, "grad/layer_27/attn_mlp_ratio": 0.6572273685331931} {"step": 22800, "timestamp": 1778219178.4471905, "eos/sharpness": 27.908253669738762, "eos/L0_probe": 2.040848970413208, "eos/L_plus": 2.202324151992798, "eos/L_minus": 2.158456325531006, "eos/grad_norm": 0.13401173055171967, "eos/embed_grad_frac": 0.1253128945827484, "eos/time_s": 0.5917773246765137} {"step": 22800, "timestamp": 1778219178.4648533, "train/loss": 2.270844554901123, "train/z_loss": 0.001527341117616743, "train/perplexity": 9.687579052469852, "train/grad_norm": 0.1337890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1912932.5272181148, "perf/iters_per_sec": 0.9121573101130079, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0963021278381349, "data/tokens_consumed": 47817162752, "data/tokens_consumed_B": 47.817162752, "train/loss_slope": -1.7552683799072914e-05} {"step": 22800, "timestamp": 1778219179.8275614, "geo/rankme_last": 440.4081115722656, "geo/layer_0/stable_rank_q_proj": 17.570314407348633, "geo/layer_0/stable_rank_k_proj": 15.54152774810791, "geo/layer_0/stable_rank_o_proj": 51.303062438964844, "geo/layer_0/stable_rank_gate_proj": 149.09048461914062, "geo/layer_0/stable_rank_down_proj": 50.40595245361328, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.050254855304956436, "geo/layer_0/attn_entropy_mean": 6.256410598754883, "geo/layer_0/attn_entropy_std": 0.33219242095947266, "geo/layer_7/stable_rank_q_proj": 42.43190383911133, "geo/layer_7/stable_rank_k_proj": 41.52803039550781, "geo/layer_7/stable_rank_o_proj": 109.12915802001953, "geo/layer_7/stable_rank_gate_proj": 102.0573501586914, "geo/layer_7/stable_rank_down_proj": 148.68051147460938, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.553798258304596, "geo/layer_7/attn_entropy_mean": 4.658469200134277, "geo/layer_7/attn_entropy_std": 0.8446125388145447, "geo/layer_14/stable_rank_q_proj": 57.68236541748047, "geo/layer_14/stable_rank_k_proj": 35.12235641479492, "geo/layer_14/stable_rank_o_proj": 53.99320983886719, "geo/layer_14/stable_rank_gate_proj": 85.13615417480469, "geo/layer_14/stable_rank_down_proj": 136.66920471191406, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3788701593875885, "geo/layer_14/attn_entropy_mean": 5.494139671325684, "geo/layer_14/attn_entropy_std": 0.46432632207870483, "geo/layer_21/stable_rank_q_proj": 47.13193130493164, "geo/layer_21/stable_rank_k_proj": 31.68862533569336, "geo/layer_21/stable_rank_o_proj": 82.76383972167969, "geo/layer_21/stable_rank_gate_proj": 85.55630493164062, "geo/layer_21/stable_rank_down_proj": 59.891658782958984, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15253110229969025, "geo/layer_21/attn_entropy_mean": 5.774277687072754, "geo/layer_21/attn_entropy_std": 0.2870468199253082, "geo/layer_27/stable_rank_q_proj": 41.42705154418945, "geo/layer_27/stable_rank_k_proj": 31.343441009521484, "geo/layer_27/stable_rank_o_proj": 119.39745330810547, "geo/layer_27/stable_rank_gate_proj": 90.53793334960938, "geo/layer_27/stable_rank_down_proj": 137.1335906982422, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07419254630804062, "geo/layer_27/attn_entropy_mean": 4.387564659118652, "geo/layer_27/attn_entropy_std": 0.5509734153747559, "attnres/final_alpha/block_0": 0.23988139629364014, "attnres/block_norm/0": 1.635648250579834, "attnres/final_alpha/block_1": 0.006564795970916748, "attnres/block_norm/1": 30969.4921875, "attnres/final_alpha/block_2": 0.013862418942153454, "attnres/block_norm/2": 21971.71484375, "attnres/final_alpha/block_3": 0.015728609636425972, "attnres/block_norm/3": 32213.12109375, "attnres/final_alpha/block_4": 0.020422786474227905, "attnres/block_norm/4": 9851.8046875, "attnres/final_alpha/block_5": 0.5734046697616577, "attnres/block_norm/5": 5157.0654296875, "attnres/final_alpha/block_6": 0.13013529777526855, "attnres/block_norm/6": 21791.0390625, "geo/tier1_time_s": 1.3588366508483887, "geo/step": 22800.0, "geo/rankme_slope": -6.775749362244898e-05} {"step": 22810, "timestamp": 1778219190.192414, "train/loss": 2.1590626716613768, "train/z_loss": 0.0015510369674302637, "train/perplexity": 8.663013763381409, "train/grad_norm": 0.248046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1788758.7750304362, "perf/iters_per_sec": 0.8529466509964162, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.172406268119812, "data/tokens_consumed": 47838134272, "data/tokens_consumed_B": 47.838134272, "train/loss_slope": -2.099929426727153e-05} {"step": 22820, "timestamp": 1778219200.5524411, "train/loss": 2.205825996398926, "train/z_loss": 0.001535265101119876, "train/perplexity": 9.077746658021416, "train/grad_norm": 0.1552734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025430.7275380301, "perf/iters_per_sec": 0.9658006322565222, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354103803634644, "data/tokens_consumed": 47859105792, "data/tokens_consumed_B": 47.859105792, "train/loss_slope": -1.9044847127878332e-05} {"step": 22830, "timestamp": 1778219210.9169917, "train/loss": 2.2384143590927126, "train/z_loss": 0.0015319687430746854, "train/perplexity": 9.37844863948365, "train/grad_norm": 0.1865234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024539.5774101426, "perf/iters_per_sec": 0.965375698762008, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035866141319275, "data/tokens_consumed": 47880077312, "data/tokens_consumed_B": 47.880077312, "train/loss_slope": -1.9020018118335514e-05} {"step": 22840, "timestamp": 1778219221.2789643, "train/loss": 2.2316179037094117, "train/z_loss": 0.0015349340043030678, "train/perplexity": 9.314924545599938, "train/grad_norm": 0.2470703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024972.514260412, "perf/iters_per_sec": 0.9655821391393719, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0356446743011474, "data/tokens_consumed": 47901048832, "data/tokens_consumed_B": 47.901048832, "train/loss_slope": -1.668228419235503e-05} {"step": 22850, "timestamp": 1778219231.6398242, "grad/layer_0/attn": 0.0030574481934309006, "grad/layer_0/mlp": 0.002930923830717802, "grad/layer_0/attn_mlp_ratio": 1.0431687296238057, "grad/layer_4/attn": 0.0022249938920140266, "grad/layer_4/mlp": 0.002502977615222335, "grad/layer_4/attn_mlp_ratio": 0.8889387542215431, "grad/layer_8/attn": 0.00607676524668932, "grad/layer_8/mlp": 0.003765659173950553, "grad/layer_8/attn_mlp_ratio": 1.6137320996421924, "grad/layer_12/attn": 0.004336876794695854, "grad/layer_12/mlp": 0.006594343576580286, "grad/layer_12/attn_mlp_ratio": 0.6576661768627875, "grad/layer_16/attn": 0.0038970387540757656, "grad/layer_16/mlp": 0.004182788077741861, "grad/layer_16/attn_mlp_ratio": 0.9316844622477725, "grad/layer_20/attn": 0.004528077784925699, "grad/layer_20/mlp": 0.00589043227955699, "grad/layer_20/attn_mlp_ratio": 0.768717386628628, "grad/layer_24/attn": 0.006621049251407385, "grad/layer_24/mlp": 0.00916445255279541, "grad/layer_24/attn_mlp_ratio": 0.7224707794619664, "grad/layer_27/attn": 0.006629651412367821, "grad/layer_27/mlp": 0.007722062058746815, "grad/layer_27/attn_mlp_ratio": 0.858533805618022} {"step": 22850, "timestamp": 1778219231.6539133, "train/loss": 2.2362711429595947, "train/z_loss": 0.0015230934834107757, "train/perplexity": 9.358370121043897, "train/grad_norm": 0.12890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022668.3215935037, "perf/iters_per_sec": 0.9644834144561308, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0368244647979736, "data/tokens_consumed": 47922020352, "data/tokens_consumed_B": 47.922020352, "train/loss_slope": -1.2914278383480988e-05} {"step": 22860, "timestamp": 1778219242.0173, "train/loss": 2.178177571296692, "train/z_loss": 0.0015479777357541025, "train/perplexity": 8.830199177738056, "train/grad_norm": 0.162109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024946.9682237245, "perf/iters_per_sec": 0.9655699578398345, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0356577396392823, "data/tokens_consumed": 47942991872, "data/tokens_consumed_B": 47.942991872, "train/loss_slope": -1.5143877559333317e-05} {"step": 22870, "timestamp": 1778219252.3900633, "train/loss": 2.231352424621582, "train/z_loss": 0.0015420781215652823, "train/perplexity": 9.31245195615329, "train/grad_norm": 0.17578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023693.347672661, "perf/iters_per_sec": 0.9649721849788003, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0362993001937866, "data/tokens_consumed": 47963963392, "data/tokens_consumed_B": 47.963963392, "train/loss_slope": -1.433759532531309e-05} {"step": 22875, "timestamp": 1778219258.1613514, "eos/sharpness": 31.024837493896477, "eos/L0_probe": 2.0411581993103027, "eos/L_plus": 2.2095131874084473, "eos/L_minus": 2.183051586151123, "eos/grad_norm": 0.13869984447956085, "eos/embed_grad_frac": 0.12577924132347107, "eos/time_s": 0.5962648391723633} {"step": 22875, "timestamp": 1778219259.536833, "geo/rankme_last": 440.55950927734375, "geo/layer_0/stable_rank_q_proj": 17.572275161743164, "geo/layer_0/stable_rank_k_proj": 15.489195823669434, "geo/layer_0/stable_rank_o_proj": 51.27452850341797, "geo/layer_0/stable_rank_gate_proj": 148.76333618164062, "geo/layer_0/stable_rank_down_proj": 50.33452224731445, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.04714149981737137, "geo/layer_0/attn_entropy_mean": 6.247225761413574, "geo/layer_0/attn_entropy_std": 0.3321263790130615, "geo/layer_7/stable_rank_q_proj": 42.26744079589844, "geo/layer_7/stable_rank_k_proj": 41.69264221191406, "geo/layer_7/stable_rank_o_proj": 108.82782745361328, "geo/layer_7/stable_rank_gate_proj": 101.99205017089844, "geo/layer_7/stable_rank_down_proj": 148.7556915283203, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5560682415962219, "geo/layer_7/attn_entropy_mean": 4.677138328552246, "geo/layer_7/attn_entropy_std": 0.8351063132286072, "geo/layer_14/stable_rank_q_proj": 57.535377502441406, "geo/layer_14/stable_rank_k_proj": 35.193817138671875, "geo/layer_14/stable_rank_o_proj": 54.10139083862305, "geo/layer_14/stable_rank_gate_proj": 85.0510025024414, "geo/layer_14/stable_rank_down_proj": 136.32815551757812, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39544886350631714, "geo/layer_14/attn_entropy_mean": 5.511476516723633, "geo/layer_14/attn_entropy_std": 0.4525754749774933, "geo/layer_21/stable_rank_q_proj": 47.078006744384766, "geo/layer_21/stable_rank_k_proj": 31.680343627929688, "geo/layer_21/stable_rank_o_proj": 82.75701904296875, "geo/layer_21/stable_rank_gate_proj": 85.5324478149414, "geo/layer_21/stable_rank_down_proj": 59.8731803894043, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14975963532924652, "geo/layer_21/attn_entropy_mean": 5.748263359069824, "geo/layer_21/attn_entropy_std": 0.28299352526664734, "geo/layer_27/stable_rank_q_proj": 41.3841438293457, "geo/layer_27/stable_rank_k_proj": 31.389699935913086, "geo/layer_27/stable_rank_o_proj": 119.23100280761719, "geo/layer_27/stable_rank_gate_proj": 90.57469177246094, "geo/layer_27/stable_rank_down_proj": 137.09095764160156, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08611869066953659, "geo/layer_27/attn_entropy_mean": 4.373621940612793, "geo/layer_27/attn_entropy_std": 0.5700395703315735, "attnres/final_alpha/block_0": 0.23898963630199432, "attnres/block_norm/0": 1.6363774538040161, "attnres/final_alpha/block_1": 0.00658476073294878, "attnres/block_norm/1": 31076.27734375, "attnres/final_alpha/block_2": 0.013585273176431656, "attnres/block_norm/2": 22042.46484375, "attnres/final_alpha/block_3": 0.015501980669796467, "attnres/block_norm/3": 32224.15234375, "attnres/final_alpha/block_4": 0.020131710916757584, "attnres/block_norm/4": 9880.45703125, "attnres/final_alpha/block_5": 0.5748395323753357, "attnres/block_norm/5": 5145.328125, "attnres/final_alpha/block_6": 0.13036713004112244, "attnres/block_norm/6": 21669.31640625, "geo/tier1_time_s": 1.3576490879058838, "geo/step": 22875.0, "geo/rankme_slope": -8.408838144632853e-05} {"step": 22880, "timestamp": 1778219264.7245924, "train/loss": 2.1730285763740538, "train/z_loss": 0.0015607584617100656, "train/perplexity": 8.784849380138128, "train/grad_norm": 0.23046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1701249.1849394776, "perf/iters_per_sec": 0.8112188267419231, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2327130079269408, "data/tokens_consumed": 47984934912, "data/tokens_consumed_B": 47.984934912, "train/loss_slope": -1.7302798902956184e-05} {"step": 22890, "timestamp": 1778219275.0898783, "train/loss": 2.2458672761917113, "train/z_loss": 0.001517193962354213, "train/perplexity": 9.448606555528155, "train/grad_norm": 0.2236328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024317.3782651552, "perf/iters_per_sec": 0.9652697459531571, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0359798431396485, "data/tokens_consumed": 48005906432, "data/tokens_consumed_B": 48.005906432, "train/loss_slope": -1.134341495587161e-05} {"step": 22900, "timestamp": 1778219285.4432075, "grad/layer_0/attn": 0.003764950670301914, "grad/layer_0/mlp": 0.0030080866999924183, "grad/layer_0/attn_mlp_ratio": 1.251609717615663, "grad/layer_4/attn": 0.0016897901659831405, "grad/layer_4/mlp": 0.0025153597816824913, "grad/layer_4/attn_mlp_ratio": 0.6717886288513364, "grad/layer_8/attn": 0.0033263068180531263, "grad/layer_8/mlp": 0.0037425279151648283, "grad/layer_8/attn_mlp_ratio": 0.8887860837847674, "grad/layer_12/attn": 0.005410706624388695, "grad/layer_12/mlp": 0.0057821813970804214, "grad/layer_12/attn_mlp_ratio": 0.9357552382471401, "grad/layer_16/attn": 0.004178180359303951, "grad/layer_16/mlp": 0.004647234920412302, "grad/layer_16/attn_mlp_ratio": 0.899068013765584, "grad/layer_20/attn": 0.004517609719187021, "grad/layer_20/mlp": 0.006703671533614397, "grad/layer_20/attn_mlp_ratio": 0.6739008063184736, "grad/layer_24/attn": 0.018106328323483467, "grad/layer_24/mlp": 0.014035186730325222, "grad/layer_24/attn_mlp_ratio": 1.2900667830343309, "grad/layer_27/attn": 0.005785204470157623, "grad/layer_27/mlp": 0.013446612283587456, "grad/layer_27/attn_mlp_ratio": 0.43023508859516735} {"step": 22900, "timestamp": 1778219285.4574797, "train/loss": 2.2029560804367065, "train/z_loss": 0.0015415738336741925, "train/perplexity": 9.051731636305496, "train/grad_norm": 0.2109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023944.7020271102, "perf/iters_per_sec": 0.9650920400748778, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0361706018447876, "data/tokens_consumed": 48026877952, "data/tokens_consumed_B": 48.026877952, "train/loss_slope": -1.2155239774484745e-05} {"step": 22910, "timestamp": 1778219295.8175058, "train/loss": 2.1712637186050414, "train/z_loss": 0.0015277839847840369, "train/perplexity": 8.769359043601055, "train/grad_norm": 0.126953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025265.7804452756, "perf/iters_per_sec": 0.9657219793535593, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354947090148925, "data/tokens_consumed": 48047849472, "data/tokens_consumed_B": 48.047849472, "train/loss_slope": -1.1066853231114678e-05} {"step": 22920, "timestamp": 1778219306.185612, "train/loss": 2.1918672800064085, "train/z_loss": 0.0015389973297715188, "train/perplexity": 8.95191324655958, "train/grad_norm": 0.158203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023999.9822839748, "perf/iters_per_sec": 0.9651183997554659, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0361423015594482, "data/tokens_consumed": 48068820992, "data/tokens_consumed_B": 48.068820992, "train/loss_slope": -9.709840117007267e-06} {"step": 22930, "timestamp": 1778219316.5431578, "train/loss": 2.2287365913391115, "train/z_loss": 0.0015233926475048064, "train/perplexity": 9.288123967238715, "train/grad_norm": 0.1435546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025938.0487336593, "perf/iters_per_sec": 0.9660425418537423, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351511001586915, "data/tokens_consumed": 48089792512, "data/tokens_consumed_B": 48.089792512, "train/loss_slope": -8.772075609011971e-06} {"step": 22940, "timestamp": 1778219326.9035137, "train/loss": 2.252575492858887, "train/z_loss": 0.001531555608380586, "train/perplexity": 9.512202926135055, "train/grad_norm": 0.279296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025141.842972862, "perf/iters_per_sec": 0.9656628813614188, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0355580806732179, "data/tokens_consumed": 48110764032, "data/tokens_consumed_B": 48.110764032, "train/loss_slope": -4.1884172581495565e-06} {"step": 22950, "timestamp": 1778219337.257668, "grad/layer_0/attn": 0.0030242439825087786, "grad/layer_0/mlp": 0.002735020825639367, "grad/layer_0/attn_mlp_ratio": 1.1057480234092907, "grad/layer_4/attn": 0.002077952027320862, "grad/layer_4/mlp": 0.0025459444150328636, "grad/layer_4/attn_mlp_ratio": 0.8161811913226393, "grad/layer_8/attn": 0.010544081218540668, "grad/layer_8/mlp": 0.0038140867836773396, "grad/layer_8/attn_mlp_ratio": 2.7645099705685325, "grad/layer_12/attn": 0.00605788454413414, "grad/layer_12/mlp": 0.006735668051987886, "grad/layer_12/attn_mlp_ratio": 0.8993739607475002, "grad/layer_16/attn": 0.008270402438938618, "grad/layer_16/mlp": 0.004479512572288513, "grad/layer_16/attn_mlp_ratio": 1.8462728077769672, "grad/layer_20/attn": 0.004485013894736767, "grad/layer_20/mlp": 0.006788595579564571, "grad/layer_20/attn_mlp_ratio": 0.6606688785779096, "grad/layer_24/attn": 0.006616900209337473, "grad/layer_24/mlp": 0.01122982893139124, "grad/layer_24/attn_mlp_ratio": 0.5892253738539526, "grad/layer_27/attn": 0.01473518367856741, "grad/layer_27/mlp": 0.009392487816512585, "grad/layer_27/attn_mlp_ratio": 1.5688264717021387} {"step": 22950, "timestamp": 1778219337.8555748, "eos/sharpness": 31.766223907470696, "eos/L0_probe": 2.038358211517334, "eos/L_plus": 2.1721158027648926, "eos/L_minus": 2.2222628593444824, "eos/grad_norm": 0.14581409096717834, "eos/embed_grad_frac": 0.1301979273557663, "eos/time_s": 0.5950813293457031} {"step": 22950, "timestamp": 1778219337.8746035, "train/loss": 2.256460189819336, "train/z_loss": 0.001517138013150543, "train/perplexity": 9.549226818670567, "train/grad_norm": 0.1455078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1912601.976845019, "perf/iters_per_sec": 0.911999691412458, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0964915990829467, "data/tokens_consumed": 48131735552, "data/tokens_consumed_B": 48.131735552, "train/loss_slope": -2.3232354010464415e-07} {"step": 22950, "timestamp": 1778219339.2348793, "geo/rankme_last": 440.3243103027344, "geo/layer_0/stable_rank_q_proj": 17.60076332092285, "geo/layer_0/stable_rank_k_proj": 15.531576156616211, "geo/layer_0/stable_rank_o_proj": 51.4391975402832, "geo/layer_0/stable_rank_gate_proj": 148.63735961914062, "geo/layer_0/stable_rank_down_proj": 50.388328552246094, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05388351529836655, "geo/layer_0/attn_entropy_mean": 6.245573043823242, "geo/layer_0/attn_entropy_std": 0.3281143307685852, "geo/layer_7/stable_rank_q_proj": 42.25913619995117, "geo/layer_7/stable_rank_k_proj": 41.69668197631836, "geo/layer_7/stable_rank_o_proj": 108.96138000488281, "geo/layer_7/stable_rank_gate_proj": 101.8232192993164, "geo/layer_7/stable_rank_down_proj": 149.1416015625, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5480577349662781, "geo/layer_7/attn_entropy_mean": 4.667394161224365, "geo/layer_7/attn_entropy_std": 0.8412806987762451, "geo/layer_14/stable_rank_q_proj": 57.440860748291016, "geo/layer_14/stable_rank_k_proj": 35.28325653076172, "geo/layer_14/stable_rank_o_proj": 54.282508850097656, "geo/layer_14/stable_rank_gate_proj": 85.04739379882812, "geo/layer_14/stable_rank_down_proj": 136.35400390625, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39123329520225525, "geo/layer_14/attn_entropy_mean": 5.523512363433838, "geo/layer_14/attn_entropy_std": 0.4467391073703766, "geo/layer_21/stable_rank_q_proj": 47.074668884277344, "geo/layer_21/stable_rank_k_proj": 31.67453956604004, "geo/layer_21/stable_rank_o_proj": 82.56856536865234, "geo/layer_21/stable_rank_gate_proj": 85.44059753417969, "geo/layer_21/stable_rank_down_proj": 59.81256103515625, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15441949665546417, "geo/layer_21/attn_entropy_mean": 5.765084266662598, "geo/layer_21/attn_entropy_std": 0.2882043123245239, "geo/layer_27/stable_rank_q_proj": 41.359580993652344, "geo/layer_27/stable_rank_k_proj": 31.35028076171875, "geo/layer_27/stable_rank_o_proj": 119.7112808227539, "geo/layer_27/stable_rank_gate_proj": 90.64276885986328, "geo/layer_27/stable_rank_down_proj": 137.1389923095703, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0857517346739769, "geo/layer_27/attn_entropy_mean": 4.377648830413818, "geo/layer_27/attn_entropy_std": 0.5629059672355652, "attnres/final_alpha/block_0": 0.24068963527679443, "attnres/block_norm/0": 1.6370656490325928, "attnres/final_alpha/block_1": 0.0067162420600652695, "attnres/block_norm/1": 30959.95703125, "attnres/final_alpha/block_2": 0.013717537745833397, "attnres/block_norm/2": 21965.93359375, "attnres/final_alpha/block_3": 0.015843261033296585, "attnres/block_norm/3": 32170.45703125, "attnres/final_alpha/block_4": 0.020269671455025673, "attnres/block_norm/4": 9870.177734375, "attnres/final_alpha/block_5": 0.569332480430603, "attnres/block_norm/5": 5217.4140625, "attnres/final_alpha/block_6": 0.13343118131160736, "attnres/block_norm/6": 21842.67578125, "geo/tier1_time_s": 1.3564367294311523, "geo/step": 22950.0, "geo/rankme_slope": -9.491728331957783e-05} {"step": 22960, "timestamp": 1778219349.5965977, "train/loss": 2.248641037940979, "train/z_loss": 0.0015186662087216974, "train/perplexity": 9.47485112023401, "train/grad_norm": 0.1357421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1789642.6366430908, "perf/iters_per_sec": 0.8533681090560392, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.171827244758606, "data/tokens_consumed": 48152707072, "data/tokens_consumed_B": 48.152707072, "train/loss_slope": 1.7271280288697044e-06} {"step": 22970, "timestamp": 1778219359.9650302, "train/loss": 2.2439329624176025, "train/z_loss": 0.0015217268839478493, "train/perplexity": 9.430347650640355, "train/grad_norm": 0.10546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023717.3256181958, "perf/iters_per_sec": 0.9649836185542087, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0362870216369628, "data/tokens_consumed": 48173678592, "data/tokens_consumed_B": 48.173678592, "train/loss_slope": 3.1127058204287687e-06} {"step": 22980, "timestamp": 1778219370.327613, "train/loss": 2.2918394088745115, "train/z_loss": 0.0015250098425894977, "train/perplexity": 9.893118444982225, "train/grad_norm": 0.0927734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024783.5447811733, "perf/iters_per_sec": 0.9654920314699046, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0357413291931152, "data/tokens_consumed": 48194650112, "data/tokens_consumed_B": 48.194650112, "train/loss_slope": 7.671936968229203e-06} {"step": 22990, "timestamp": 1778219380.6879094, "train/loss": 2.2529887914657594, "train/z_loss": 0.0015393036534078418, "train/perplexity": 9.516135118881634, "train/grad_norm": 0.2265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025546.2577986505, "perf/iters_per_sec": 0.9658557213776829, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035351324081421, "data/tokens_consumed": 48215621632, "data/tokens_consumed_B": 48.215621632, "train/loss_slope": 1.1151334776593715e-05} {"step": 23000, "timestamp": 1778219391.0437915, "grad/layer_0/attn": 0.0036047182511538267, "grad/layer_0/mlp": 0.003212550887838006, "grad/layer_0/attn_mlp_ratio": 1.122073475191671, "grad/layer_4/attn": 0.0018445741152390838, "grad/layer_4/mlp": 0.002576973522081971, "grad/layer_4/attn_mlp_ratio": 0.7157908406329856, "grad/layer_8/attn": 0.004999025259166956, "grad/layer_8/mlp": 0.0037860216107219458, "grad/layer_8/attn_mlp_ratio": 1.3203900138791613, "grad/layer_12/attn": 0.005289194639772177, "grad/layer_12/mlp": 0.006236369721591473, "grad/layer_12/attn_mlp_ratio": 0.8481207483013598, "grad/layer_16/attn": 0.0041093360632658005, "grad/layer_16/mlp": 0.004779128357768059, "grad/layer_16/attn_mlp_ratio": 0.859850514498398, "grad/layer_20/attn": 0.005162765737622976, "grad/layer_20/mlp": 0.006706054322421551, "grad/layer_20/attn_mlp_ratio": 0.7698663643947444, "grad/layer_24/attn": 0.015951864421367645, "grad/layer_24/mlp": 0.013297222554683685, "grad/layer_24/attn_mlp_ratio": 1.1996388144819787, "grad/layer_27/attn": 0.006317064166069031, "grad/layer_27/mlp": 0.012400771491229534, "grad/layer_27/attn_mlp_ratio": 0.5094089605308741} {"step": 23000, "timestamp": 1778219391.0580251, "train/loss": 2.1701616525650023, "train/z_loss": 0.001537193136755377, "train/perplexity": 8.75969995426098, "train/grad_norm": 0.197265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023313.5010854723, "perf/iters_per_sec": 0.9647910600116121, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0364938497543335, "data/tokens_consumed": 48236593152, "data/tokens_consumed_B": 48.236593152, "train/loss_slope": 8.440152789750302e-06} {"step": 23000, "timestamp": 1778219398.1925535, "geo/ww_alpha_mean": 7.9896131756571025, "geo/ww_alpha_std": 4.612472592392813, "geo/ww_alpha_min": 1.357207138523819, "geo/ww_alpha_max": 31.769713242049914, "geo/ww_alpha_healthy_frac": 0.15736040609137056, "geo/ww_alpha_by_type/q_proj": 4.181802605146782, "geo/ww_alpha_by_type/k_proj": 4.680810870467406, "geo/ww_alpha_by_type/v_proj": 8.449680604720347, "geo/ww_alpha_by_type/o_proj": 8.495227555003838, "geo/ww_alpha_by_type/gate_proj": 8.957921013785485, "geo/ww_alpha_by_type/up_proj": 11.97180400675985, "geo/ww_alpha_by_type/down_proj": 9.317947868977678, "geo/twonn_id/layer_0": 0.7410311698913574, "geo/twonn_id/layer_7": 3.202425241470337, "geo/twonn_id/layer_14": 4.112280368804932, "geo/twonn_id/layer_21": 7.500252723693848, "geo/twonn_id/layer_27": 5.6110920906066895, "geo/tier2_time_s": 7.1257405281066895} {"step": 23000, "timestamp": 1778219398.818662, "eoc/jacobian_sigma/layer_0/attn": 849.1570434570312, "eoc/jacobian_sigma/layer_0/mlp": 5196.1416015625, "eoc/jacobian_sigma/layer_0": 5196.1416015625, "eoc/jacobian_sigma/layer_7/attn": 1.154577612876892, "eoc/jacobian_sigma/layer_7/mlp": 1.6816498041152954, "eoc/jacobian_sigma/layer_7": 1.6816498041152954, "eoc/jacobian_sigma/layer_14/attn": 1.6449284553527832, "eoc/jacobian_sigma/layer_14/mlp": 7.196231365203857, "eoc/jacobian_sigma/layer_14": 7.196231365203857, "eoc/jacobian_sigma/layer_21/attn": 1.0756031274795532, "eoc/jacobian_sigma/layer_21/mlp": 3.805647850036621, "eoc/jacobian_sigma/layer_21": 3.805647850036621, "eoc/jacobian_sigma/layer_27/attn": 4.013000011444092, "eoc/jacobian_sigma/layer_27/mlp": 22.197498321533203, "eoc/jacobian_sigma/layer_27": 22.197498321533203, "eoc/layer0_sigma": 5196.1416015625, "eoc/sigma_max": 22.197498321533203, "eoc/sigma_min": 1.6816498041152954, "eoc/sigma_mean": 8.720256835222244, "eoc/time_s": 0.6194350719451904} {"step": 23010, "timestamp": 1778219409.2008753, "train/loss": 2.202564239501953, "train/z_loss": 0.0015327321481890977, "train/perplexity": 9.048185492127587, "train/grad_norm": 0.12451171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1156358.4095523134, "perf/iters_per_sec": 0.5513946578752105, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.8135830402374267, "data/tokens_consumed": 48257564672, "data/tokens_consumed_B": 48.257564672, "train/loss_slope": 9.516534017007257e-06} {"step": 23020, "timestamp": 1778219419.5585167, "train/loss": 2.210748720169067, "train/z_loss": 0.001523434859700501, "train/perplexity": 9.122544069449834, "train/grad_norm": 0.10986328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025830.0788291986, "perf/iters_per_sec": 0.9659910577913278, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352062702178955, "data/tokens_consumed": 48278536192, "data/tokens_consumed_B": 48.278536192, "train/loss_slope": 9.9232221629718e-06} {"step": 23025, "timestamp": 1778219425.3337433, "eos/sharpness": 25.268387794494625, "eos/L0_probe": 2.0375239849090576, "eos/L_plus": 2.174379348754883, "eos/L_minus": 2.1533524990081787, "eos/grad_norm": 0.14685600996017456, "eos/embed_grad_frac": 0.13457252085208893, "eos/time_s": 0.6051597595214844} {"step": 23025, "timestamp": 1778219426.7100255, "geo/rankme_last": 440.4005432128906, "geo/layer_0/stable_rank_q_proj": 17.61079216003418, "geo/layer_0/stable_rank_k_proj": 15.521522521972656, "geo/layer_0/stable_rank_o_proj": 51.382225036621094, "geo/layer_0/stable_rank_gate_proj": 148.85203552246094, "geo/layer_0/stable_rank_down_proj": 50.496700286865234, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05395049229264259, "geo/layer_0/attn_entropy_mean": 6.251170635223389, "geo/layer_0/attn_entropy_std": 0.3304413855075836, "geo/layer_7/stable_rank_q_proj": 42.272491455078125, "geo/layer_7/stable_rank_k_proj": 41.77775955200195, "geo/layer_7/stable_rank_o_proj": 108.95594787597656, "geo/layer_7/stable_rank_gate_proj": 101.726318359375, "geo/layer_7/stable_rank_down_proj": 149.1474151611328, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5443584322929382, "geo/layer_7/attn_entropy_mean": 4.673555374145508, "geo/layer_7/attn_entropy_std": 0.8221887946128845, "geo/layer_14/stable_rank_q_proj": 57.446231842041016, "geo/layer_14/stable_rank_k_proj": 35.29728698730469, "geo/layer_14/stable_rank_o_proj": 54.23515319824219, "geo/layer_14/stable_rank_gate_proj": 84.97891998291016, "geo/layer_14/stable_rank_down_proj": 136.30538940429688, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38473671674728394, "geo/layer_14/attn_entropy_mean": 5.505021095275879, "geo/layer_14/attn_entropy_std": 0.4761207401752472, "geo/layer_21/stable_rank_q_proj": 47.04276657104492, "geo/layer_21/stable_rank_k_proj": 31.663759231567383, "geo/layer_21/stable_rank_o_proj": 82.55672454833984, "geo/layer_21/stable_rank_gate_proj": 85.16355895996094, "geo/layer_21/stable_rank_down_proj": 59.8945426940918, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15120406448841095, "geo/layer_21/attn_entropy_mean": 5.761993408203125, "geo/layer_21/attn_entropy_std": 0.2864226698875427, "geo/layer_27/stable_rank_q_proj": 41.44853973388672, "geo/layer_27/stable_rank_k_proj": 31.26764488220215, "geo/layer_27/stable_rank_o_proj": 119.55735778808594, "geo/layer_27/stable_rank_gate_proj": 90.50764465332031, "geo/layer_27/stable_rank_down_proj": 136.9248809814453, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07801290601491928, "geo/layer_27/attn_entropy_mean": 4.36384391784668, "geo/layer_27/attn_entropy_std": 0.5638300180435181, "attnres/final_alpha/block_0": 0.24106672406196594, "attnres/block_norm/0": 1.6378769874572754, "attnres/final_alpha/block_1": 0.006552254781126976, "attnres/block_norm/1": 31036.625, "attnres/final_alpha/block_2": 0.013457249850034714, "attnres/block_norm/2": 22025.98828125, "attnres/final_alpha/block_3": 0.01557212695479393, "attnres/block_norm/3": 32505.818359375, "attnres/final_alpha/block_4": 0.020449936389923096, "attnres/block_norm/4": 9870.8125, "attnres/final_alpha/block_5": 0.5709502100944519, "attnres/block_norm/5": 5208.34423828125, "attnres/final_alpha/block_6": 0.13195151090621948, "attnres/block_norm/6": 21824.15625, "geo/tier1_time_s": 1.3576304912567139, "geo/step": 23025.0, "geo/rankme_slope": -0.0001051285944065126} {"step": 23030, "timestamp": 1778219431.8906167, "train/loss": 2.2258347272872925, "train/z_loss": 0.001518908271100372, "train/perplexity": 9.261210163175944, "train/grad_norm": 0.10205078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1701407.8289480377, "perf/iters_per_sec": 0.8112944741001309, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.232598066329956, "data/tokens_consumed": 48299507712, "data/tokens_consumed_B": 48.299507712, "train/loss_slope": 1.1071760798230299e-05} {"step": 23040, "timestamp": 1778219442.2498736, "train/loss": 2.213830852508545, "train/z_loss": 0.0015374547336250543, "train/perplexity": 9.150704332080208, "train/grad_norm": 0.09912109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025374.7628327582, "perf/iters_per_sec": 0.9657739462055007, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354389905929566, "data/tokens_consumed": 48320479232, "data/tokens_consumed_B": 48.320479232, "train/loss_slope": 9.534789827039558e-06} {"step": 23050, "timestamp": 1778219452.60747, "grad/layer_0/attn": 0.002934010699391365, "grad/layer_0/mlp": 0.002774605993181467, "grad/layer_0/attn_mlp_ratio": 1.0574512564510081, "grad/layer_4/attn": 0.0017816225299611688, "grad/layer_4/mlp": 0.0024520556908100843, "grad/layer_4/attn_mlp_ratio": 0.7265831946558504, "grad/layer_8/attn": 0.005250905174762011, "grad/layer_8/mlp": 0.0038030955474823713, "grad/layer_8/attn_mlp_ratio": 1.3806923783886638, "grad/layer_12/attn": 0.004201581701636314, "grad/layer_12/mlp": 0.006517660804092884, "grad/layer_12/attn_mlp_ratio": 0.6446456425798179, "grad/layer_16/attn": 0.004804644268006086, "grad/layer_16/mlp": 0.004554827697575092, "grad/layer_16/attn_mlp_ratio": 1.054846523630156, "grad/layer_20/attn": 0.003928763791918755, "grad/layer_20/mlp": 0.005499514285475016, "grad/layer_20/attn_mlp_ratio": 0.7143837649184386, "grad/layer_24/attn": 0.011271634139120579, "grad/layer_24/mlp": 0.010446115396916866, "grad/layer_24/attn_mlp_ratio": 1.079026375158054, "grad/layer_27/attn": 0.005110159050673246, "grad/layer_27/mlp": 0.009268263354897499, "grad/layer_27/attn_mlp_ratio": 0.5513610047384827} {"step": 23050, "timestamp": 1778219452.6219978, "train/loss": 2.2244435787200927, "train/z_loss": 0.0015132963075302542, "train/perplexity": 9.248335401356506, "train/grad_norm": 0.1259765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022952.9649799406, "perf/iters_per_sec": 0.9646191429996207, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0366785764694213, "data/tokens_consumed": 48341450752, "data/tokens_consumed_B": 48.341450752, "train/loss_slope": 8.219613372260295e-06} {"step": 23060, "timestamp": 1778219462.9854743, "train/loss": 2.192822813987732, "train/z_loss": 0.0015300039201974868, "train/perplexity": 8.960471191917176, "train/grad_norm": 0.1650390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024947.9937821217, "perf/iters_per_sec": 0.9655704468641861, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0356572151184082, "data/tokens_consumed": 48362422272, "data/tokens_consumed_B": 48.362422272, "train/loss_slope": 5.994206243115075e-06} {"step": 23070, "timestamp": 1778219473.8330932, "train/loss": 2.2218650341033936, "train/z_loss": 0.0015405080164782702, "train/perplexity": 9.224518875077848, "train/grad_norm": 0.16015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1934305.6881857438, "perf/iters_per_sec": 0.9223488274506301, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0841885089874268, "data/tokens_consumed": 48383393792, "data/tokens_consumed_B": 48.383393792, "train/loss_slope": 6.160646575083149e-06} {"step": 23080, "timestamp": 1778219484.2028863, "train/loss": 2.2164726734161375, "train/z_loss": 0.0015218056156300007, "train/perplexity": 9.17491081462121, "train/grad_norm": 0.1435546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023440.4265981973, "perf/iters_per_sec": 0.9648515828124034, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0364288330078124, "data/tokens_consumed": 48404365312, "data/tokens_consumed_B": 48.404365312, "train/loss_slope": 8.746808812026742e-06} {"step": 23090, "timestamp": 1778219494.5676212, "train/loss": 2.2417966604232786, "train/z_loss": 0.0015350087196566164, "train/perplexity": 9.410223083876865, "train/grad_norm": 0.291015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025292.3604339422, "perf/iters_per_sec": 0.9657346536798201, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354811191558837, "data/tokens_consumed": 48425336832, "data/tokens_consumed_B": 48.425336832, "train/loss_slope": 1.1212934397592482e-05} {"step": 23100, "timestamp": 1778219504.9242752, "grad/layer_0/attn": 0.0025524261873215437, "grad/layer_0/mlp": 0.0025985713582485914, "grad/layer_0/attn_mlp_ratio": 0.9822420619680979, "grad/layer_4/attn": 0.001705550355836749, "grad/layer_4/mlp": 0.0024870976340025663, "grad/layer_4/attn_mlp_ratio": 0.6857592818002982, "grad/layer_8/attn": 0.009743550792336464, "grad/layer_8/mlp": 0.0036557535640895367, "grad/layer_8/attn_mlp_ratio": 2.665264590458419, "grad/layer_12/attn": 0.003720748471096158, "grad/layer_12/mlp": 0.006018972024321556, "grad/layer_12/attn_mlp_ratio": 0.6181700785855612, "grad/layer_16/attn": 0.0055388216860592365, "grad/layer_16/mlp": 0.004482722841203213, "grad/layer_16/attn_mlp_ratio": 1.2355931336172554, "grad/layer_20/attn": 0.003971393685787916, "grad/layer_20/mlp": 0.006065526511520147, "grad/layer_20/attn_mlp_ratio": 0.6547483739079009, "grad/layer_24/attn": 0.011861910112202168, "grad/layer_24/mlp": 0.011223308742046356, "grad/layer_24/attn_mlp_ratio": 1.056899554235146, "grad/layer_27/attn": 0.011907304637134075, "grad/layer_27/mlp": 0.00813903845846653, "grad/layer_27/attn_mlp_ratio": 1.4629866355342003} {"step": 23100, "timestamp": 1778219505.5265472, "eos/sharpness": 59.59219932556151, "eos/L0_probe": 2.0388121604919434, "eos/L_plus": 2.4485294818878174, "eos/L_minus": 2.2250168323516846, "eos/grad_norm": 0.1807439923286438, "eos/embed_grad_frac": 0.07435396313667297, "eos/time_s": 0.5993800163269043} {"step": 23100, "timestamp": 1778219505.5451136, "train/loss": 2.188015651702881, "train/z_loss": 0.0015319601749069988, "train/perplexity": 8.917500119957957, "train/grad_norm": 0.1806640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1911485.8051096979, "perf/iters_per_sec": 0.9114674592541208, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.097131872177124, "data/tokens_consumed": 48446308352, "data/tokens_consumed_B": 48.446308352, "train/loss_slope": 9.432651364978624e-06} {"step": 23100, "timestamp": 1778219506.9167721, "geo/rankme_last": 440.562255859375, "geo/layer_0/stable_rank_q_proj": 17.636926651000977, "geo/layer_0/stable_rank_k_proj": 15.518486022949219, "geo/layer_0/stable_rank_o_proj": 51.35212326049805, "geo/layer_0/stable_rank_gate_proj": 148.6521453857422, "geo/layer_0/stable_rank_down_proj": 50.43574142456055, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.04827494919300079, "geo/layer_0/attn_entropy_mean": 6.253050327301025, "geo/layer_0/attn_entropy_std": 0.3258509635925293, "geo/layer_7/stable_rank_q_proj": 42.2710075378418, "geo/layer_7/stable_rank_k_proj": 41.8099479675293, "geo/layer_7/stable_rank_o_proj": 108.53712463378906, "geo/layer_7/stable_rank_gate_proj": 101.72563934326172, "geo/layer_7/stable_rank_down_proj": 149.37139892578125, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5365229249000549, "geo/layer_7/attn_entropy_mean": 4.661379814147949, "geo/layer_7/attn_entropy_std": 0.8297368288040161, "geo/layer_14/stable_rank_q_proj": 57.42409133911133, "geo/layer_14/stable_rank_k_proj": 35.34268569946289, "geo/layer_14/stable_rank_o_proj": 54.13202667236328, "geo/layer_14/stable_rank_gate_proj": 85.00829315185547, "geo/layer_14/stable_rank_down_proj": 136.20474243164062, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3787141442298889, "geo/layer_14/attn_entropy_mean": 5.495580673217773, "geo/layer_14/attn_entropy_std": 0.47636982798576355, "geo/layer_21/stable_rank_q_proj": 47.03050994873047, "geo/layer_21/stable_rank_k_proj": 31.779434204101562, "geo/layer_21/stable_rank_o_proj": 82.42239379882812, "geo/layer_21/stable_rank_gate_proj": 85.23480224609375, "geo/layer_21/stable_rank_down_proj": 59.88543701171875, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15076543390750885, "geo/layer_21/attn_entropy_mean": 5.755350112915039, "geo/layer_21/attn_entropy_std": 0.287241131067276, "geo/layer_27/stable_rank_q_proj": 41.418426513671875, "geo/layer_27/stable_rank_k_proj": 31.30674171447754, "geo/layer_27/stable_rank_o_proj": 119.75401306152344, "geo/layer_27/stable_rank_gate_proj": 90.4599838256836, "geo/layer_27/stable_rank_down_proj": 137.247802734375, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07900788635015488, "geo/layer_27/attn_entropy_mean": 4.382874965667725, "geo/layer_27/attn_entropy_std": 0.5537278652191162, "attnres/final_alpha/block_0": 0.24011650681495667, "attnres/block_norm/0": 1.6382501125335693, "attnres/final_alpha/block_1": 0.0065346406772732735, "attnres/block_norm/1": 31007.99609375, "attnres/final_alpha/block_2": 0.013426758348941803, "attnres/block_norm/2": 22155.0078125, "attnres/final_alpha/block_3": 0.01546545047312975, "attnres/block_norm/3": 32492.16796875, "attnres/final_alpha/block_4": 0.02002106048166752, "attnres/block_norm/4": 9946.2763671875, "attnres/final_alpha/block_5": 0.5730278491973877, "attnres/block_norm/5": 5180.61767578125, "attnres/final_alpha/block_6": 0.1314077377319336, "attnres/block_norm/6": 21734.58203125, "geo/tier1_time_s": 1.3677875995635986, "geo/step": 23100.0, "geo/rankme_slope": -7.480503920318127e-05} {"step": 23110, "timestamp": 1778219517.282217, "train/loss": 2.1985758543014526, "train/z_loss": 0.0015395680093206465, "train/perplexity": 9.01216971316186, "train/grad_norm": 0.11181640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1787351.8078738232, "perf/iters_per_sec": 0.8522757567757717, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1733291625976563, "data/tokens_consumed": 48467279872, "data/tokens_consumed_B": 48.467279872, "train/loss_slope": 6.6268211079188014e-06} {"step": 23120, "timestamp": 1778219527.643802, "train/loss": 2.2546621799468993, "train/z_loss": 0.001525550673250109, "train/perplexity": 9.532072640888355, "train/grad_norm": 0.10302734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024922.9145155924, "perf/iters_per_sec": 0.9655584881380045, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0356700420379639, "data/tokens_consumed": 48488251392, "data/tokens_consumed_B": 48.488251392, "train/loss_slope": 9.479754835930566e-06} {"step": 23130, "timestamp": 1778219538.0077806, "train/loss": 2.1948123931884767, "train/z_loss": 0.0015247942646965384, "train/perplexity": 8.978316505475219, "train/grad_norm": 0.302734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024398.722931836, "perf/iters_per_sec": 0.9653085341128521, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0359382152557373, "data/tokens_consumed": 48509222912, "data/tokens_consumed_B": 48.509222912, "train/loss_slope": 4.678114269575958e-06} {"step": 23140, "timestamp": 1778219548.3714068, "train/loss": 2.2518162727355957, "train/z_loss": 0.0015203052549622952, "train/perplexity": 9.504983811052194, "train/grad_norm": 0.1455078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024940.0224237405, "perf/iters_per_sec": 0.9655666458243086, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0356612920761108, "data/tokens_consumed": 48530194432, "data/tokens_consumed_B": 48.530194432, "train/loss_slope": 6.04329720796714e-06} {"step": 23150, "timestamp": 1778219558.7224107, "grad/layer_0/attn": 0.002845549490302801, "grad/layer_0/mlp": 0.0027641719207167625, "grad/layer_0/attn_mlp_ratio": 1.0294400887412705, "grad/layer_4/attn": 0.002578830812126398, "grad/layer_4/mlp": 0.0024711787700653076, "grad/layer_4/attn_mlp_ratio": 1.0435629906702162, "grad/layer_8/attn": 0.007306166924536228, "grad/layer_8/mlp": 0.0036986013874411583, "grad/layer_8/attn_mlp_ratio": 1.975386358693899, "grad/layer_12/attn": 0.004330046474933624, "grad/layer_12/mlp": 0.006476700771600008, "grad/layer_12/attn_mlp_ratio": 0.6685574277361874, "grad/layer_16/attn": 0.005077507812529802, "grad/layer_16/mlp": 0.004428832326084375, "grad/layer_16/attn_mlp_ratio": 1.1464664552727128, "grad/layer_20/attn": 0.006399421952664852, "grad/layer_20/mlp": 0.007101869210600853, "grad/layer_20/attn_mlp_ratio": 0.9010897937973221, "grad/layer_24/attn": 0.027308925986289978, "grad/layer_24/mlp": 0.016211334615945816, "grad/layer_24/attn_mlp_ratio": 1.6845575311839271, "grad/layer_27/attn": 0.0041045877151191235, "grad/layer_27/mlp": 0.014907303266227245, "grad/layer_27/attn_mlp_ratio": 0.27534072489717615} {"step": 23150, "timestamp": 1778219558.7368715, "train/loss": 2.2002620935440063, "train/z_loss": 0.0015376172494143247, "train/perplexity": 9.027379207211718, "train/grad_norm": 0.216796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024375.5208656688, "perf/iters_per_sec": 0.9652974705055565, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0359500885009765, "data/tokens_consumed": 48551165952, "data/tokens_consumed_B": 48.551165952, "train/loss_slope": 8.26826143507984e-06} {"step": 23160, "timestamp": 1778219569.6544678, "train/loss": 2.220012068748474, "train/z_loss": 0.0015240817097947, "train/perplexity": 9.207441987513137, "train/grad_norm": 0.1181640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1921760.6598272286, "perf/iters_per_sec": 0.9163668917785781, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0912659645080567, "data/tokens_consumed": 48572137472, "data/tokens_consumed_B": 48.572137472, "train/loss_slope": 6.844567630228764e-06} {"step": 23170, "timestamp": 1778219580.0160592, "train/loss": 2.208655261993408, "train/z_loss": 0.0015384695027023554, "train/perplexity": 9.103466381123571, "train/grad_norm": 0.2099609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025538.7481644326, "perf/iters_per_sec": 0.9658521405050433, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0353551626205444, "data/tokens_consumed": 48593108992, "data/tokens_consumed_B": 48.593108992, "train/loss_slope": 2.865063110486615e-06} {"step": 23175, "timestamp": 1778219585.7838516, "eos/sharpness": 52.603363990783684, "eos/L0_probe": 2.0381414890289307, "eos/L_plus": 2.274890661239624, "eos/L_minus": 2.327425956726074, "eos/grad_norm": 0.20897164940834045, "eos/embed_grad_frac": 0.05268459394574165, "eos/time_s": 0.5969154834747314} {"step": 23175, "timestamp": 1778219587.1626291, "geo/rankme_last": 440.0084228515625, "geo/layer_0/stable_rank_q_proj": 17.660600662231445, "geo/layer_0/stable_rank_k_proj": 15.53927993774414, "geo/layer_0/stable_rank_o_proj": 51.19568634033203, "geo/layer_0/stable_rank_gate_proj": 148.58070373535156, "geo/layer_0/stable_rank_down_proj": 50.492408752441406, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.051666244864463806, "geo/layer_0/attn_entropy_mean": 6.24857234954834, "geo/layer_0/attn_entropy_std": 0.3253268599510193, "geo/layer_7/stable_rank_q_proj": 42.26239013671875, "geo/layer_7/stable_rank_k_proj": 41.835628509521484, "geo/layer_7/stable_rank_o_proj": 108.5495376586914, "geo/layer_7/stable_rank_gate_proj": 101.76731872558594, "geo/layer_7/stable_rank_down_proj": 149.17056274414062, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5329505801200867, "geo/layer_7/attn_entropy_mean": 4.676205635070801, "geo/layer_7/attn_entropy_std": 0.8396069407463074, "geo/layer_14/stable_rank_q_proj": 57.41791534423828, "geo/layer_14/stable_rank_k_proj": 35.298431396484375, "geo/layer_14/stable_rank_o_proj": 54.022743225097656, "geo/layer_14/stable_rank_gate_proj": 84.98030090332031, "geo/layer_14/stable_rank_down_proj": 135.76577758789062, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.367899090051651, "geo/layer_14/attn_entropy_mean": 5.504615783691406, "geo/layer_14/attn_entropy_std": 0.4327283501625061, "geo/layer_21/stable_rank_q_proj": 46.91368103027344, "geo/layer_21/stable_rank_k_proj": 31.67539405822754, "geo/layer_21/stable_rank_o_proj": 82.3129653930664, "geo/layer_21/stable_rank_gate_proj": 85.08881378173828, "geo/layer_21/stable_rank_down_proj": 59.76133346557617, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15361887216567993, "geo/layer_21/attn_entropy_mean": 5.745950698852539, "geo/layer_21/attn_entropy_std": 0.287529855966568, "geo/layer_27/stable_rank_q_proj": 41.345157623291016, "geo/layer_27/stable_rank_k_proj": 31.389986038208008, "geo/layer_27/stable_rank_o_proj": 119.39032745361328, "geo/layer_27/stable_rank_gate_proj": 90.42752075195312, "geo/layer_27/stable_rank_down_proj": 136.82948303222656, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08277759701013565, "geo/layer_27/attn_entropy_mean": 4.369556427001953, "geo/layer_27/attn_entropy_std": 0.5683661699295044, "attnres/final_alpha/block_0": 0.24013081192970276, "attnres/block_norm/0": 1.6392297744750977, "attnres/final_alpha/block_1": 0.0065539018251001835, "attnres/block_norm/1": 31167.9375, "attnres/final_alpha/block_2": 0.013410108163952827, "attnres/block_norm/2": 22081.31640625, "attnres/final_alpha/block_3": 0.015609191730618477, "attnres/block_norm/3": 32450.5390625, "attnres/final_alpha/block_4": 0.020174257457256317, "attnres/block_norm/4": 9916.6875, "attnres/final_alpha/block_5": 0.5721536874771118, "attnres/block_norm/5": 5200.4599609375, "attnres/final_alpha/block_6": 0.13196808099746704, "attnres/block_norm/6": 21877.22265625, "geo/tier1_time_s": 1.3608157634735107, "geo/step": 23175.0, "geo/rankme_slope": -0.00010045078578306323} {"step": 23180, "timestamp": 1778219592.3458753, "train/loss": 2.18410210609436, "train/z_loss": 0.00152946631424129, "train/perplexity": 8.882669277025997, "train/grad_norm": 0.10302734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1701720.3332705481, "perf/iters_per_sec": 0.811443487773203, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.232371711730957, "data/tokens_consumed": 48614080512, "data/tokens_consumed_B": 48.614080512, "train/loss_slope": 5.7427024040130644e-08} {"step": 23190, "timestamp": 1778219602.7097566, "train/loss": 2.2467621326446534, "train/z_loss": 0.001528764807153493, "train/perplexity": 9.457065486275592, "train/grad_norm": 0.1748046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024514.648048203, "perf/iters_per_sec": 0.9653638115159049, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0358788967132568, "data/tokens_consumed": 48635052032, "data/tokens_consumed_B": 48.635052032, "train/loss_slope": 3.6807894670959616e-06} {"step": 23200, "timestamp": 1778219613.0659676, "grad/layer_0/attn": 0.002698668045923114, "grad/layer_0/mlp": 0.0026985560543835163, "grad/layer_0/attn_mlp_ratio": 1.0000414634838766, "grad/layer_4/attn": 0.0017436473863199353, "grad/layer_4/mlp": 0.0025211237370967865, "grad/layer_4/attn_mlp_ratio": 0.6916151284055295, "grad/layer_8/attn": 0.0033292134758085012, "grad/layer_8/mlp": 0.0036079641431570053, "grad/layer_8/attn_mlp_ratio": 0.9227401524621003, "grad/layer_12/attn": 0.0035902811214327812, "grad/layer_12/mlp": 0.005884507670998573, "grad/layer_12/attn_mlp_ratio": 0.6101242892612456, "grad/layer_16/attn": 0.003464559791609645, "grad/layer_16/mlp": 0.004186706617474556, "grad/layer_16/attn_mlp_ratio": 0.8275143269886089, "grad/layer_20/attn": 0.011510881595313549, "grad/layer_20/mlp": 0.006187541410326958, "grad/layer_20/attn_mlp_ratio": 1.8603320197693995, "grad/layer_24/attn": 0.023464210331439972, "grad/layer_24/mlp": 0.01421928871423006, "grad/layer_24/attn_mlp_ratio": 1.6501676446685567, "grad/layer_27/attn": 0.005658728536218405, "grad/layer_27/mlp": 0.013115817680954933, "grad/layer_27/attn_mlp_ratio": 0.43144305835319446} {"step": 23200, "timestamp": 1778219613.0805, "train/loss": 2.224853181838989, "train/z_loss": 0.001526727934833616, "train/perplexity": 9.252124324305857, "train/grad_norm": 0.1845703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023332.7692800055, "perf/iters_per_sec": 0.964800247802737, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0364839792251588, "data/tokens_consumed": 48656023552, "data/tokens_consumed_B": 48.656023552, "train/loss_slope": 6.118703661042197e-06} {"step": 23210, "timestamp": 1778219623.4408455, "train/loss": 2.208489990234375, "train/z_loss": 0.0015336800832301378, "train/perplexity": 9.101961959544088, "train/grad_norm": 0.1435546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025426.623354557, "perf/iters_per_sec": 0.9657986752293382, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354124784469605, "data/tokens_consumed": 48676995072, "data/tokens_consumed_B": 48.676995072, "train/loss_slope": 7.490284314858013e-06} {"step": 23220, "timestamp": 1778219633.7964873, "train/loss": 2.230194926261902, "train/z_loss": 0.001528344373218715, "train/perplexity": 9.301679044306134, "train/grad_norm": 0.1279296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026209.050688944, "perf/iters_per_sec": 0.9661717656559677, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350126504898072, "data/tokens_consumed": 48697966592, "data/tokens_consumed_B": 48.697966592, "train/loss_slope": 8.259603442853871e-06} {"step": 23230, "timestamp": 1778219644.1540835, "train/loss": 2.211377429962158, "train/z_loss": 0.0015433592605404555, "train/perplexity": 9.128281305583489, "train/grad_norm": 0.205078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026222.4463285934, "perf/iters_per_sec": 0.9661781531947105, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035005807876587, "data/tokens_consumed": 48718938112, "data/tokens_consumed_B": 48.718938112, "train/loss_slope": 7.587186967579014e-06} {"step": 23240, "timestamp": 1778219654.5186844, "train/loss": 2.207660365104675, "train/z_loss": 0.0015350180561654269, "train/perplexity": 9.094413874646266, "train/grad_norm": 0.10498046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024332.891949659, "perf/iters_per_sec": 0.9652771434543891, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0359719038009643, "data/tokens_consumed": 48739909632, "data/tokens_consumed_B": 48.739909632, "train/loss_slope": 6.686816707660263e-06} {"step": 23250, "timestamp": 1778219665.2027462, "grad/layer_0/attn": 0.002670934423804283, "grad/layer_0/mlp": 0.0024642159696668386, "grad/layer_0/attn_mlp_ratio": 1.0838880797353914, "grad/layer_4/attn": 0.00169920374173671, "grad/layer_4/mlp": 0.0024641742929816246, "grad/layer_4/attn_mlp_ratio": 0.689563103397358, "grad/layer_8/attn": 0.005440404172986746, "grad/layer_8/mlp": 0.0036585433408617973, "grad/layer_8/attn_mlp_ratio": 1.487041020812703, "grad/layer_12/attn": 0.003991241566836834, "grad/layer_12/mlp": 0.006571054924279451, "grad/layer_12/attn_mlp_ratio": 0.6073973741034825, "grad/layer_16/attn": 0.003794985357671976, "grad/layer_16/mlp": 0.00462020980194211, "grad/layer_16/attn_mlp_ratio": 0.8213880837051906, "grad/layer_20/attn": 0.006097203120589256, "grad/layer_20/mlp": 0.00696649169549346, "grad/layer_20/attn_mlp_ratio": 0.8752185891517829, "grad/layer_24/attn": 0.013976464979350567, "grad/layer_24/mlp": 0.011669236235320568, "grad/layer_24/attn_mlp_ratio": 1.1977189061675317, "grad/layer_27/attn": 0.004884510766714811, "grad/layer_27/mlp": 0.012055791914463043, "grad/layer_27/attn_mlp_ratio": 0.4051588448817781} {"step": 23250, "timestamp": 1778219665.8177433, "eos/sharpness": 58.04061889648436, "eos/L0_probe": 2.038140296936035, "eos/L_plus": 2.274087905883789, "eos/L_minus": 2.382598876953125, "eos/grad_norm": 0.19850707054138184, "eos/embed_grad_frac": 0.06284962594509125, "eos/time_s": 0.612217903137207} {"step": 23250, "timestamp": 1778219665.8367548, "train/loss": 2.19962682723999, "train/z_loss": 0.0015394005109556018, "train/perplexity": 9.021646238561503, "train/grad_norm": 0.1982421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1853607.173335009, "perf/iters_per_sec": 0.883868776957993, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1313896656036377, "data/tokens_consumed": 48760881152, "data/tokens_consumed_B": 48.760881152, "train/loss_slope": 2.231909289504497e-06} {"step": 23250, "timestamp": 1778219667.1986995, "geo/rankme_last": 440.7816467285156, "geo/layer_0/stable_rank_q_proj": 17.665563583374023, "geo/layer_0/stable_rank_k_proj": 15.560359954833984, "geo/layer_0/stable_rank_o_proj": 51.22270965576172, "geo/layer_0/stable_rank_gate_proj": 148.11257934570312, "geo/layer_0/stable_rank_down_proj": 50.56995391845703, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05051472410559654, "geo/layer_0/attn_entropy_mean": 6.252220153808594, "geo/layer_0/attn_entropy_std": 0.32306596636772156, "geo/layer_7/stable_rank_q_proj": 42.18461990356445, "geo/layer_7/stable_rank_k_proj": 41.81747817993164, "geo/layer_7/stable_rank_o_proj": 108.63648223876953, "geo/layer_7/stable_rank_gate_proj": 101.9676513671875, "geo/layer_7/stable_rank_down_proj": 149.4807586669922, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5344088077545166, "geo/layer_7/attn_entropy_mean": 4.677435874938965, "geo/layer_7/attn_entropy_std": 0.8431444764137268, "geo/layer_14/stable_rank_q_proj": 57.20509719848633, "geo/layer_14/stable_rank_k_proj": 35.274295806884766, "geo/layer_14/stable_rank_o_proj": 53.980892181396484, "geo/layer_14/stable_rank_gate_proj": 85.06546020507812, "geo/layer_14/stable_rank_down_proj": 135.5362091064453, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38869911432266235, "geo/layer_14/attn_entropy_mean": 5.467726707458496, "geo/layer_14/attn_entropy_std": 0.45542222261428833, "geo/layer_21/stable_rank_q_proj": 46.993309020996094, "geo/layer_21/stable_rank_k_proj": 31.7230224609375, "geo/layer_21/stable_rank_o_proj": 82.28487396240234, "geo/layer_21/stable_rank_gate_proj": 85.02079772949219, "geo/layer_21/stable_rank_down_proj": 59.750492095947266, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15050232410430908, "geo/layer_21/attn_entropy_mean": 5.738795280456543, "geo/layer_21/attn_entropy_std": 0.29264599084854126, "geo/layer_27/stable_rank_q_proj": 41.330177307128906, "geo/layer_27/stable_rank_k_proj": 31.34679412841797, "geo/layer_27/stable_rank_o_proj": 119.20854187011719, "geo/layer_27/stable_rank_gate_proj": 90.5340347290039, "geo/layer_27/stable_rank_down_proj": 136.68020629882812, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0742252767086029, "geo/layer_27/attn_entropy_mean": 4.366146087646484, "geo/layer_27/attn_entropy_std": 0.556430459022522, "attnres/final_alpha/block_0": 0.24300986528396606, "attnres/block_norm/0": 1.6398818492889404, "attnres/final_alpha/block_1": 0.006627620197832584, "attnres/block_norm/1": 31287.37109375, "attnres/final_alpha/block_2": 0.013759467750787735, "attnres/block_norm/2": 22196.642578125, "attnres/final_alpha/block_3": 0.015574812889099121, "attnres/block_norm/3": 32765.376953125, "attnres/final_alpha/block_4": 0.020216679200530052, "attnres/block_norm/4": 9971.3623046875, "attnres/final_alpha/block_5": 0.5659428834915161, "attnres/block_norm/5": 5236.79296875, "attnres/final_alpha/block_6": 0.13486868143081665, "attnres/block_norm/6": 21889.66796875, "geo/tier1_time_s": 1.358358383178711, "geo/step": 23250.0, "geo/rankme_slope": -8.081072272659063e-05} {"step": 23260, "timestamp": 1778219678.1070664, "train/loss": 2.198332500457764, "train/z_loss": 0.0015227270312607288, "train/perplexity": 9.009976833855813, "train/grad_norm": 0.15625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1709613.2194945624, "perf/iters_per_sec": 0.8152071092102825, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2266821384429931, "data/tokens_consumed": 48781852672, "data/tokens_consumed_B": 48.781852672, "train/loss_slope": -1.1512981175017662e-06} {"step": 23270, "timestamp": 1778219688.4506848, "train/loss": 2.2357731580734255, "train/z_loss": 0.0015365619095973671, "train/perplexity": 9.35371095435802, "train/grad_norm": 0.1806640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028441.1894583239, "perf/iters_per_sec": 0.9672361323634738, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033873701095581, "data/tokens_consumed": 48802824192, "data/tokens_consumed_B": 48.802824192, "train/loss_slope": 1.0199839597416251e-06} {"step": 23280, "timestamp": 1778219698.8067062, "train/loss": 2.238141345977783, "train/z_loss": 0.001536885614041239, "train/perplexity": 9.37588854949235, "train/grad_norm": 0.09521484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026039.9168042548, "perf/iters_per_sec": 0.9660911163350366, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350990533828734, "data/tokens_consumed": 48823795712, "data/tokens_consumed_B": 48.823795712, "train/loss_slope": 2.8152725555643883e-06} {"step": 23290, "timestamp": 1778219709.165161, "train/loss": 2.2657293558120726, "train/z_loss": 0.001516580837778747, "train/perplexity": 9.63815168012385, "train/grad_norm": 0.248046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025830.3587708545, "perf/iters_per_sec": 0.9659911912779114, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035206127166748, "data/tokens_consumed": 48844767232, "data/tokens_consumed_B": 48.844767232, "train/loss_slope": 4.687812862688053e-06} {"step": 23300, "timestamp": 1778219719.5156167, "grad/layer_0/attn": 0.0029250297229737043, "grad/layer_0/mlp": 0.0028976835310459137, "grad/layer_0/attn_mlp_ratio": 1.0094372248353145, "grad/layer_4/attn": 0.0018650242127478123, "grad/layer_4/mlp": 0.002655092626810074, "grad/layer_4/attn_mlp_ratio": 0.702432797888954, "grad/layer_8/attn": 0.004300747066736221, "grad/layer_8/mlp": 0.0037258530501276255, "grad/layer_8/attn_mlp_ratio": 1.1542985977826055, "grad/layer_12/attn": 0.004880172666162252, "grad/layer_12/mlp": 0.006874195300042629, "grad/layer_12/attn_mlp_ratio": 0.709926381512517, "grad/layer_16/attn": 0.005099487956613302, "grad/layer_16/mlp": 0.0056663136929273605, "grad/layer_16/attn_mlp_ratio": 0.8999656819180109, "grad/layer_20/attn": 0.004765134304761887, "grad/layer_20/mlp": 0.006817561574280262, "grad/layer_20/attn_mlp_ratio": 0.6989499372977727, "grad/layer_24/attn": 0.022775141522288322, "grad/layer_24/mlp": 0.013843020424246788, "grad/layer_24/attn_mlp_ratio": 1.645243643350557, "grad/layer_27/attn": 0.009663714095950127, "grad/layer_27/mlp": 0.01282388623803854, "grad/layer_27/attn_mlp_ratio": 0.7535714089484223} {"step": 23300, "timestamp": 1778219719.5302875, "train/loss": 2.186664414405823, "train/z_loss": 0.0015531728975474834, "train/perplexity": 8.905458598507957, "train/grad_norm": 0.244140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026701.5385460455, "perf/iters_per_sec": 0.9664066021661976, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347611427307128, "data/tokens_consumed": 48865738752, "data/tokens_consumed_B": 48.865738752, "train/loss_slope": 4.037404103283377e-06} {"step": 23310, "timestamp": 1778219730.3660884, "train/loss": 2.2184639453887938, "train/z_loss": 0.0015207575401291252, "train/perplexity": 9.193198759471104, "train/grad_norm": 0.171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1936392.5595514767, "perf/iters_per_sec": 0.9233439252622017, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.083020067214966, "data/tokens_consumed": 48886710272, "data/tokens_consumed_B": 48.886710272, "train/loss_slope": 7.357198205610765e-06} {"step": 23320, "timestamp": 1778219741.128365, "train/loss": 2.207638478279114, "train/z_loss": 0.0015276055433787405, "train/perplexity": 9.094214828974456, "train/grad_norm": 0.166015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1949942.3485574648, "perf/iters_per_sec": 0.9298049681460689, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.075494360923767, "data/tokens_consumed": 48907681792, "data/tokens_consumed_B": 48.907681792, "train/loss_slope": 7.694518176278188e-06} {"step": 23325, "timestamp": 1778219746.8885381, "eos/sharpness": 17.666006088256832, "eos/L0_probe": 2.037912607192993, "eos/L_plus": 2.1315460205078125, "eos/L_minus": 2.120939254760742, "eos/grad_norm": 0.12058073282241821, "eos/embed_grad_frac": 0.174102321267128, "eos/time_s": 0.5990476608276367} {"step": 23325, "timestamp": 1778219748.2707832, "geo/rankme_last": 440.01495361328125, "geo/layer_0/stable_rank_q_proj": 17.691316604614258, "geo/layer_0/stable_rank_k_proj": 15.541173934936523, "geo/layer_0/stable_rank_o_proj": 51.13136291503906, "geo/layer_0/stable_rank_gate_proj": 148.050537109375, "geo/layer_0/stable_rank_down_proj": 50.5548210144043, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.04493365436792374, "geo/layer_0/attn_entropy_mean": 6.25389289855957, "geo/layer_0/attn_entropy_std": 0.3275608420372009, "geo/layer_7/stable_rank_q_proj": 42.385353088378906, "geo/layer_7/stable_rank_k_proj": 41.8787956237793, "geo/layer_7/stable_rank_o_proj": 108.5702896118164, "geo/layer_7/stable_rank_gate_proj": 101.56755828857422, "geo/layer_7/stable_rank_down_proj": 149.7211151123047, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5515294671058655, "geo/layer_7/attn_entropy_mean": 4.640702247619629, "geo/layer_7/attn_entropy_std": 0.7970046401023865, "geo/layer_14/stable_rank_q_proj": 57.24871063232422, "geo/layer_14/stable_rank_k_proj": 35.22220993041992, "geo/layer_14/stable_rank_o_proj": 54.19252395629883, "geo/layer_14/stable_rank_gate_proj": 85.04053497314453, "geo/layer_14/stable_rank_down_proj": 135.55592346191406, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37961748242378235, "geo/layer_14/attn_entropy_mean": 5.526178359985352, "geo/layer_14/attn_entropy_std": 0.45172345638275146, "geo/layer_21/stable_rank_q_proj": 47.105167388916016, "geo/layer_21/stable_rank_k_proj": 31.692869186401367, "geo/layer_21/stable_rank_o_proj": 82.42711639404297, "geo/layer_21/stable_rank_gate_proj": 84.67120361328125, "geo/layer_21/stable_rank_down_proj": 59.802093505859375, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14971454441547394, "geo/layer_21/attn_entropy_mean": 5.753481864929199, "geo/layer_21/attn_entropy_std": 0.3045659065246582, "geo/layer_27/stable_rank_q_proj": 41.27032470703125, "geo/layer_27/stable_rank_k_proj": 31.354703903198242, "geo/layer_27/stable_rank_o_proj": 119.19277954101562, "geo/layer_27/stable_rank_gate_proj": 90.48733520507812, "geo/layer_27/stable_rank_down_proj": 136.58322143554688, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0730280727148056, "geo/layer_27/attn_entropy_mean": 4.375002861022949, "geo/layer_27/attn_entropy_std": 0.5532918572425842, "attnres/final_alpha/block_0": 0.2407144010066986, "attnres/block_norm/0": 1.6405646800994873, "attnres/final_alpha/block_1": 0.006556685082614422, "attnres/block_norm/1": 31216.48046875, "attnres/final_alpha/block_2": 0.013592645525932312, "attnres/block_norm/2": 22174.765625, "attnres/final_alpha/block_3": 0.015828682109713554, "attnres/block_norm/3": 32534.1953125, "attnres/final_alpha/block_4": 0.02016611211001873, "attnres/block_norm/4": 9985.43359375, "attnres/final_alpha/block_5": 0.5704740285873413, "attnres/block_norm/5": 5212.7216796875, "attnres/final_alpha/block_6": 0.13266751170158386, "attnres/block_norm/6": 22011.33984375, "geo/tier1_time_s": 1.3635389804840088, "geo/step": 23325.0, "geo/rankme_slope": -8.937637555022009e-05} {"step": 23330, "timestamp": 1778219753.4573019, "train/loss": 2.2244242429733276, "train/z_loss": 0.0015282013919204473, "train/perplexity": 9.248156579614019, "train/grad_norm": 0.25390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1701745.453147715, "perf/iters_per_sec": 0.8114554658640456, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2323535203933715, "data/tokens_consumed": 48928653312, "data/tokens_consumed_B": 48.928653312, "train/loss_slope": 5.692002317621395e-06} {"step": 23340, "timestamp": 1778219763.8360605, "train/loss": 2.20547833442688, "train/z_loss": 0.0015361792175099254, "train/perplexity": 9.074591219261366, "train/grad_norm": 0.11865234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021849.8195233096, "perf/iters_per_sec": 0.9640931222549961, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037244200706482, "data/tokens_consumed": 48949624832, "data/tokens_consumed_B": 48.949624832, "train/loss_slope": 7.0331851963233895e-06} {"step": 23350, "timestamp": 1778219774.2044837, "grad/layer_0/attn": 0.002769953804090619, "grad/layer_0/mlp": 0.002661082660779357, "grad/layer_0/attn_mlp_ratio": 1.0409123101753426, "grad/layer_4/attn": 0.002319399267435074, "grad/layer_4/mlp": 0.0024663559161126614, "grad/layer_4/attn_mlp_ratio": 0.9404154356802013, "grad/layer_8/attn": 0.005361633375287056, "grad/layer_8/mlp": 0.0036283605732023716, "grad/layer_8/attn_mlp_ratio": 1.4777013252530122, "grad/layer_12/attn": 0.003971174359321594, "grad/layer_12/mlp": 0.006583607755601406, "grad/layer_12/attn_mlp_ratio": 0.6031912055550022, "grad/layer_16/attn": 0.004271531943231821, "grad/layer_16/mlp": 0.004254710394889116, "grad/layer_16/attn_mlp_ratio": 1.0039536058594143, "grad/layer_20/attn": 0.003507819026708603, "grad/layer_20/mlp": 0.005378412548452616, "grad/layer_20/attn_mlp_ratio": 0.6522034020052018, "grad/layer_24/attn": 0.0050579095259308815, "grad/layer_24/mlp": 0.008326251991093159, "grad/layer_24/attn_mlp_ratio": 0.6074653362154959, "grad/layer_27/attn": 0.006853550672531128, "grad/layer_27/mlp": 0.007198667153716087, "grad/layer_27/attn_mlp_ratio": 0.9520582673123552} {"step": 23350, "timestamp": 1778219774.2188928, "train/loss": 2.176618552207947, "train/z_loss": 0.0015368421678431331, "train/perplexity": 8.816443454166508, "train/grad_norm": 0.087890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020762.128535653, "perf/iters_per_sec": 0.9635744707754388, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037802505493164, "data/tokens_consumed": 48970596352, "data/tokens_consumed_B": 48.970596352, "train/loss_slope": 4.047185917570154e-06} {"step": 23360, "timestamp": 1778219784.5995345, "train/loss": 2.2415765285491944, "train/z_loss": 0.0015382856247015298, "train/perplexity": 9.408151821817624, "train/grad_norm": 0.17578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021190.2929865415, "perf/iters_per_sec": 0.9637786354954441, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0375826597213744, "data/tokens_consumed": 48991567872, "data/tokens_consumed_B": 48.991567872, "train/loss_slope": 5.9082228894924564e-06} {"step": 23370, "timestamp": 1778219794.9795861, "train/loss": 2.2428962707519533, "train/z_loss": 0.0015257716411724686, "train/perplexity": 9.420576353612981, "train/grad_norm": 0.26171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022072.9647538203, "perf/iters_per_sec": 0.9641995261925794, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0371297359466554, "data/tokens_consumed": 49012539392, "data/tokens_consumed_B": 49.012539392, "train/loss_slope": 6.314477661583746e-06} {"step": 23380, "timestamp": 1778219805.3594964, "train/loss": 2.21921169757843, "train/z_loss": 0.0015284516499377786, "train/perplexity": 9.200075564725934, "train/grad_norm": 0.1396484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021744.5618974594, "perf/iters_per_sec": 0.9640429315078065, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0372982025146484, "data/tokens_consumed": 49033510912, "data/tokens_consumed_B": 49.033510912, "train/loss_slope": 6.169758127717834e-06} {"step": 23390, "timestamp": 1778219815.737559, "train/loss": 2.2269350051879884, "train/z_loss": 0.0015201195841655136, "train/perplexity": 9.271405675972327, "train/grad_norm": 0.10009765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021669.3779506786, "perf/iters_per_sec": 0.964007081008281, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037336778640747, "data/tokens_consumed": 49054482432, "data/tokens_consumed_B": 49.054482432, "train/loss_slope": 7.446836895889752e-06} {"step": 23400, "timestamp": 1778219826.1047053, "grad/layer_0/attn": 0.0024516212288290262, "grad/layer_0/mlp": 0.002512448700144887, "grad/layer_0/attn_mlp_ratio": 0.9757895280045695, "grad/layer_4/attn": 0.0017886016285046935, "grad/layer_4/mlp": 0.002397101139649749, "grad/layer_4/attn_mlp_ratio": 0.7461518933451614, "grad/layer_8/attn": 0.0044718775898218155, "grad/layer_8/mlp": 0.0038035509642213583, "grad/layer_8/attn_mlp_ratio": 1.1757112010108568, "grad/layer_12/attn": 0.005262371152639389, "grad/layer_12/mlp": 0.007208381313830614, "grad/layer_12/attn_mlp_ratio": 0.7300350592634508, "grad/layer_16/attn": 0.005348337814211845, "grad/layer_16/mlp": 0.004401118494570255, "grad/layer_16/attn_mlp_ratio": 1.215222380239922, "grad/layer_20/attn": 0.0039746868424117565, "grad/layer_20/mlp": 0.006417796481400728, "grad/layer_20/attn_mlp_ratio": 0.6193226587970562, "grad/layer_24/attn": 0.01064649410545826, "grad/layer_24/mlp": 0.009762054309248924, "grad/layer_24/attn_mlp_ratio": 1.0905997507421579, "grad/layer_27/attn": 0.00833460595458746, "grad/layer_27/mlp": 0.00878024660050869, "grad/layer_27/attn_mlp_ratio": 0.9492450769184638} {"step": 23400, "timestamp": 1778219826.69539, "eos/sharpness": 29.15320396423339, "eos/L0_probe": 2.0427796840667725, "eos/L_plus": 2.208862066268921, "eos/L_minus": 2.168229341506958, "eos/grad_norm": 0.1191234141588211, "eos/embed_grad_frac": 0.19029824435710907, "eos/time_s": 0.5879824161529541} {"step": 23400, "timestamp": 1778219826.7134435, "train/loss": 2.2275317668914796, "train/z_loss": 0.0015357860829681158, "train/perplexity": 9.276940147033221, "train/grad_norm": 0.119140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1911789.916377246, "perf/iters_per_sec": 0.9116124708067159, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0969573497772216, "data/tokens_consumed": 49075453952, "data/tokens_consumed_B": 49.075453952, "train/loss_slope": 9.893459412488652e-06} {"step": 23400, "timestamp": 1778219828.0728266, "geo/rankme_last": 440.6475524902344, "geo/layer_0/stable_rank_q_proj": 17.65789222717285, "geo/layer_0/stable_rank_k_proj": 15.545215606689453, "geo/layer_0/stable_rank_o_proj": 51.12849426269531, "geo/layer_0/stable_rank_gate_proj": 147.65782165527344, "geo/layer_0/stable_rank_down_proj": 50.58443069458008, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.048666659742593765, "geo/layer_0/attn_entropy_mean": 6.24664831161499, "geo/layer_0/attn_entropy_std": 0.3283170461654663, "geo/layer_7/stable_rank_q_proj": 42.479671478271484, "geo/layer_7/stable_rank_k_proj": 41.891212463378906, "geo/layer_7/stable_rank_o_proj": 108.61239624023438, "geo/layer_7/stable_rank_gate_proj": 101.62181091308594, "geo/layer_7/stable_rank_down_proj": 149.77728271484375, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5485637784004211, "geo/layer_7/attn_entropy_mean": 4.664155960083008, "geo/layer_7/attn_entropy_std": 0.8347061276435852, "geo/layer_14/stable_rank_q_proj": 57.26737976074219, "geo/layer_14/stable_rank_k_proj": 35.10884475708008, "geo/layer_14/stable_rank_o_proj": 54.27942657470703, "geo/layer_14/stable_rank_gate_proj": 85.01602172851562, "geo/layer_14/stable_rank_down_proj": 135.7224578857422, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3653963506221771, "geo/layer_14/attn_entropy_mean": 5.493497848510742, "geo/layer_14/attn_entropy_std": 0.47341227531433105, "geo/layer_21/stable_rank_q_proj": 47.04794692993164, "geo/layer_21/stable_rank_k_proj": 31.67804527282715, "geo/layer_21/stable_rank_o_proj": 82.35059356689453, "geo/layer_21/stable_rank_gate_proj": 84.59283447265625, "geo/layer_21/stable_rank_down_proj": 59.78512191772461, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15336523950099945, "geo/layer_21/attn_entropy_mean": 5.735437393188477, "geo/layer_21/attn_entropy_std": 0.29351869225502014, "geo/layer_27/stable_rank_q_proj": 41.436622619628906, "geo/layer_27/stable_rank_k_proj": 31.332670211791992, "geo/layer_27/stable_rank_o_proj": 119.25277709960938, "geo/layer_27/stable_rank_gate_proj": 90.35858917236328, "geo/layer_27/stable_rank_down_proj": 136.39968872070312, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0734911784529686, "geo/layer_27/attn_entropy_mean": 4.374028205871582, "geo/layer_27/attn_entropy_std": 0.575654149055481, "attnres/final_alpha/block_0": 0.2413237988948822, "attnres/block_norm/0": 1.6413601636886597, "attnres/final_alpha/block_1": 0.0066474424675107, "attnres/block_norm/1": 31290.943359375, "attnres/final_alpha/block_2": 0.013595685362815857, "attnres/block_norm/2": 22252.607421875, "attnres/final_alpha/block_3": 0.015687625855207443, "attnres/block_norm/3": 32623.07421875, "attnres/final_alpha/block_4": 0.019727708771824837, "attnres/block_norm/4": 9977.32421875, "attnres/final_alpha/block_5": 0.5710605382919312, "attnres/block_norm/5": 5216.2216796875, "attnres/final_alpha/block_6": 0.13195721805095673, "attnres/block_norm/6": 22123.046875, "geo/tier1_time_s": 1.3558070659637451, "geo/step": 23400.0, "geo/rankme_slope": -6.94031518857543e-05} {"step": 23410, "timestamp": 1778219838.4482098, "train/loss": 2.1835382699966432, "train/z_loss": 0.0015334433177486063, "train/perplexity": 8.877662319127978, "train/grad_norm": 0.1796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1787650.8334169295, "perf/iters_per_sec": 0.8524183432659767, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1731328964233398, "data/tokens_consumed": 49096425472, "data/tokens_consumed_B": 49.096425472, "train/loss_slope": 8.340281629004432e-06} {"step": 23420, "timestamp": 1778219848.821187, "train/loss": 2.149294447898865, "train/z_loss": 0.001530017622280866, "train/perplexity": 8.578803468582795, "train/grad_norm": 0.126953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022723.532147523, "perf/iters_per_sec": 0.9645097408998122, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0367961645126342, "data/tokens_consumed": 49117396992, "data/tokens_consumed_B": 49.117396992, "train/loss_slope": 2.1309948382419225e-06} {"step": 23430, "timestamp": 1778219859.1953125, "train/loss": 2.2089010000228884, "train/z_loss": 0.001534260215703398, "train/perplexity": 9.105703723902351, "train/grad_norm": 0.10400390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022756.929716494, "perf/iters_per_sec": 0.9645256661016912, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0367790460586548, "data/tokens_consumed": 49138368512, "data/tokens_consumed_B": 49.138368512, "train/loss_slope": 2.245663029990913e-06} {"step": 23440, "timestamp": 1778219869.5734398, "train/loss": 2.2012494564056397, "train/z_loss": 0.001526238035876304, "train/perplexity": 9.036296907957587, "train/grad_norm": 0.37890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021708.3632423566, "perf/iters_per_sec": 0.9640256706439765, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373167753219605, "data/tokens_consumed": 49159340032, "data/tokens_consumed_B": 49.159340032, "train/loss_slope": 2.9340726636102293e-06} {"step": 23450, "timestamp": 1778219879.9462867, "grad/layer_0/attn": 0.00312313181348145, "grad/layer_0/mlp": 0.0027300615329295397, "grad/layer_0/attn_mlp_ratio": 1.1439785006355772, "grad/layer_4/attn": 0.0025247277226299047, "grad/layer_4/mlp": 0.0024155350401997566, "grad/layer_4/attn_mlp_ratio": 1.0452043030187994, "grad/layer_8/attn": 0.004225214011967182, "grad/layer_8/mlp": 0.0036904574371874332, "grad/layer_8/attn_mlp_ratio": 1.1449024868573063, "grad/layer_12/attn": 0.003864373778924346, "grad/layer_12/mlp": 0.005903454497456551, "grad/layer_12/attn_mlp_ratio": 0.6545953246746873, "grad/layer_16/attn": 0.004208043683320284, "grad/layer_16/mlp": 0.004289035685360432, "grad/layer_16/attn_mlp_ratio": 0.9811164779000926, "grad/layer_20/attn": 0.0034915271680802107, "grad/layer_20/mlp": 0.005549953319132328, "grad/layer_20/attn_mlp_ratio": 0.6291092743308234, "grad/layer_24/attn": 0.006920820567756891, "grad/layer_24/mlp": 0.007631460204720497, "grad/layer_24/attn_mlp_ratio": 0.9068802419736057, "grad/layer_27/attn": 0.003958059009164572, "grad/layer_27/mlp": 0.006850066594779491, "grad/layer_27/attn_mlp_ratio": 0.5778132075970955} {"step": 23450, "timestamp": 1778219879.9605079, "train/loss": 2.237238812446594, "train/z_loss": 0.0015215169987641275, "train/perplexity": 9.367430313186834, "train/grad_norm": 0.08837890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019917.2904074371, "perf/iters_per_sec": 0.9631716205632387, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0382365703582763, "data/tokens_consumed": 49180311552, "data/tokens_consumed_B": 49.180311552, "train/loss_slope": 2.625712903455132e-06} {"step": 23460, "timestamp": 1778219890.334285, "train/loss": 2.1544959783554076, "train/z_loss": 0.0015443075681105256, "train/perplexity": 8.623542631292487, "train/grad_norm": 0.2041015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022792.9333450554, "perf/iters_per_sec": 0.9645428339696195, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0367605924606322, "data/tokens_consumed": 49201283072, "data/tokens_consumed_B": 49.201283072, "train/loss_slope": -9.717640489062473e-07} {"step": 23470, "timestamp": 1778219900.7103767, "train/loss": 2.258648705482483, "train/z_loss": 0.00151144735282287, "train/perplexity": 9.57014833631823, "train/grad_norm": 0.10791015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022339.3533714174, "perf/iters_per_sec": 0.9643265501839721, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0369931221008302, "data/tokens_consumed": 49222254592, "data/tokens_consumed_B": 49.222254592, "train/loss_slope": 1.435600634705668e-06} {"step": 23475, "timestamp": 1778219906.469952, "eos/sharpness": 40.02950191497802, "eos/L0_probe": 2.040518283843994, "eos/L_plus": 2.227229595184326, "eos/L_minus": 2.2541019916534424, "eos/grad_norm": 0.13922233879566193, "eos/embed_grad_frac": 0.11527936160564423, "eos/time_s": 0.5865347385406494} {"step": 23475, "timestamp": 1778219907.845344, "geo/rankme_last": 440.5960998535156, "geo/layer_0/stable_rank_q_proj": 17.657503128051758, "geo/layer_0/stable_rank_k_proj": 15.564247131347656, "geo/layer_0/stable_rank_o_proj": 51.08846664428711, "geo/layer_0/stable_rank_gate_proj": 147.8231964111328, "geo/layer_0/stable_rank_down_proj": 50.66216278076172, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.051045626401901245, "geo/layer_0/attn_entropy_mean": 6.247953414916992, "geo/layer_0/attn_entropy_std": 0.33240199089050293, "geo/layer_7/stable_rank_q_proj": 42.43703842163086, "geo/layer_7/stable_rank_k_proj": 41.822017669677734, "geo/layer_7/stable_rank_o_proj": 108.68655395507812, "geo/layer_7/stable_rank_gate_proj": 101.73734283447266, "geo/layer_7/stable_rank_down_proj": 149.61434936523438, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.552270770072937, "geo/layer_7/attn_entropy_mean": 4.668596267700195, "geo/layer_7/attn_entropy_std": 0.8208622932434082, "geo/layer_14/stable_rank_q_proj": 57.305118560791016, "geo/layer_14/stable_rank_k_proj": 35.073909759521484, "geo/layer_14/stable_rank_o_proj": 54.29817199707031, "geo/layer_14/stable_rank_gate_proj": 84.83641052246094, "geo/layer_14/stable_rank_down_proj": 135.83872985839844, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37066394090652466, "geo/layer_14/attn_entropy_mean": 5.514636039733887, "geo/layer_14/attn_entropy_std": 0.4769298732280731, "geo/layer_21/stable_rank_q_proj": 46.78888702392578, "geo/layer_21/stable_rank_k_proj": 31.687137603759766, "geo/layer_21/stable_rank_o_proj": 82.15562438964844, "geo/layer_21/stable_rank_gate_proj": 84.26422882080078, "geo/layer_21/stable_rank_down_proj": 59.67777633666992, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1508064717054367, "geo/layer_21/attn_entropy_mean": 5.7513532638549805, "geo/layer_21/attn_entropy_std": 0.2933303713798523, "geo/layer_27/stable_rank_q_proj": 41.4515495300293, "geo/layer_27/stable_rank_k_proj": 31.317625045776367, "geo/layer_27/stable_rank_o_proj": 119.42619323730469, "geo/layer_27/stable_rank_gate_proj": 90.40355682373047, "geo/layer_27/stable_rank_down_proj": 136.07862854003906, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07800403237342834, "geo/layer_27/attn_entropy_mean": 4.345171928405762, "geo/layer_27/attn_entropy_std": 0.56170254945755, "attnres/final_alpha/block_0": 0.24121282994747162, "attnres/block_norm/0": 1.6420749425888062, "attnres/final_alpha/block_1": 0.006562915630638599, "attnres/block_norm/1": 31335.0625, "attnres/final_alpha/block_2": 0.013497760519385338, "attnres/block_norm/2": 22220.958984375, "attnres/final_alpha/block_3": 0.015570039860904217, "attnres/block_norm/3": 32851.57421875, "attnres/final_alpha/block_4": 0.01976434886455536, "attnres/block_norm/4": 9994.767578125, "attnres/final_alpha/block_5": 0.5714542865753174, "attnres/block_norm/5": 5231.029296875, "attnres/final_alpha/block_6": 0.13193784654140472, "attnres/block_norm/6": 22164.78125, "geo/tier1_time_s": 1.356788158416748, "geo/step": 23475.0, "geo/rankme_slope": -6.320866237119848e-05} {"step": 23480, "timestamp": 1778219913.0341434, "train/loss": 2.2126878976821898, "train/z_loss": 0.0015275084530003368, "train/perplexity": 9.140251465114604, "train/grad_norm": 0.1650390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1702279.9283052795, "perf/iters_per_sec": 0.8117103234793088, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2319665908813477, "data/tokens_consumed": 49243226112, "data/tokens_consumed_B": 49.243226112, "train/loss_slope": 2.7189475153074086e-06} {"step": 23490, "timestamp": 1778219923.4180946, "train/loss": 2.2224406480789183, "train/z_loss": 0.0015206161886453628, "train/perplexity": 9.229830165539692, "train/grad_norm": 0.1474609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020590.1430397758, "perf/iters_per_sec": 0.9634924617003325, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0378908395767212, "data/tokens_consumed": 49264197632, "data/tokens_consumed_B": 49.264197632, "train/loss_slope": 3.0895098028975403e-06} {"step": 23500, "timestamp": 1778219933.7880104, "grad/layer_0/attn": 0.0029143723659217358, "grad/layer_0/mlp": 0.002745844889432192, "grad/layer_0/attn_mlp_ratio": 1.0613754152685777, "grad/layer_4/attn": 0.0016269494080916047, "grad/layer_4/mlp": 0.0025225963909178972, "grad/layer_4/attn_mlp_ratio": 0.6449503176386353, "grad/layer_8/attn": 0.003733623307198286, "grad/layer_8/mlp": 0.0037365283351391554, "grad/layer_8/attn_mlp_ratio": 0.999222506133354, "grad/layer_12/attn": 0.003787357360124588, "grad/layer_12/mlp": 0.006034224294126034, "grad/layer_12/attn_mlp_ratio": 0.6276460921492015, "grad/layer_16/attn": 0.0037574402522295713, "grad/layer_16/mlp": 0.004501123446971178, "grad/layer_16/attn_mlp_ratio": 0.8347782976892439, "grad/layer_20/attn": 0.006876435596495867, "grad/layer_20/mlp": 0.006269061006605625, "grad/layer_20/attn_mlp_ratio": 1.0968844424327369, "grad/layer_24/attn": 0.01766258105635643, "grad/layer_24/mlp": 0.011627553962171078, "grad/layer_24/attn_mlp_ratio": 1.5190280743410711, "grad/layer_27/attn": 0.0067777493968605995, "grad/layer_27/mlp": 0.009710719808936119, "grad/layer_27/attn_mlp_ratio": 0.6979656977464148} {"step": 23500, "timestamp": 1778219933.8022048, "train/loss": 2.2010034322738647, "train/z_loss": 0.0015309811336919666, "train/perplexity": 9.034074034307837, "train/grad_norm": 0.1962890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020515.0449271563, "perf/iters_per_sec": 0.9634566521297246, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0379294157028198, "data/tokens_consumed": 49285169152, "data/tokens_consumed_B": 49.285169152, "train/loss_slope": 3.026933073460976e-06} {"step": 23500, "timestamp": 1778219941.008894, "geo/ww_alpha_mean": 8.059448701354368, "geo/ww_alpha_std": 5.161040103042614, "geo/ww_alpha_min": 1.3388854319432206, "geo/ww_alpha_max": 48.18309826267975, "geo/ww_alpha_healthy_frac": 0.15736040609137056, "geo/ww_alpha_by_type/q_proj": 4.154219258453181, "geo/ww_alpha_by_type/k_proj": 4.693078549846908, "geo/ww_alpha_by_type/v_proj": 8.208646589460011, "geo/ww_alpha_by_type/o_proj": 8.169511243152872, "geo/ww_alpha_by_type/gate_proj": 8.911516673209126, "geo/ww_alpha_by_type/up_proj": 13.160744164205306, "geo/ww_alpha_by_type/down_proj": 9.24807848692046, "geo/twonn_id/layer_0": 0.7259756326675415, "geo/twonn_id/layer_7": 3.0444071292877197, "geo/twonn_id/layer_14": 4.181042194366455, "geo/twonn_id/layer_21": 7.796447277069092, "geo/twonn_id/layer_27": 5.7288594245910645, "geo/tier2_time_s": 7.200648069381714} {"step": 23500, "timestamp": 1778219941.6337025, "eoc/jacobian_sigma/layer_0/attn": 976.7730102539062, "eoc/jacobian_sigma/layer_0/mlp": 6007.359375, "eoc/jacobian_sigma/layer_0": 6007.359375, "eoc/jacobian_sigma/layer_7/attn": 1.1513738632202148, "eoc/jacobian_sigma/layer_7/mlp": 1.7637097835540771, "eoc/jacobian_sigma/layer_7": 1.7637097835540771, "eoc/jacobian_sigma/layer_14/attn": 1.6420620679855347, "eoc/jacobian_sigma/layer_14/mlp": 7.609112739562988, "eoc/jacobian_sigma/layer_14": 7.609112739562988, "eoc/jacobian_sigma/layer_21/attn": 1.075913906097412, "eoc/jacobian_sigma/layer_21/mlp": 3.996229887008667, "eoc/jacobian_sigma/layer_21": 3.996229887008667, "eoc/jacobian_sigma/layer_27/attn": 3.713935136795044, "eoc/jacobian_sigma/layer_27/mlp": 21.74879264831543, "eoc/jacobian_sigma/layer_27": 21.74879264831543, "eoc/layer0_sigma": 6007.359375, "eoc/sigma_max": 21.74879264831543, "eoc/sigma_min": 1.7637097835540771, "eoc/sigma_mean": 8.77946126461029, "eoc/time_s": 0.6194334030151367} {"step": 23510, "timestamp": 1778219952.0245204, "train/loss": 2.242026114463806, "train/z_loss": 0.0015163887408562004, "train/perplexity": 9.412382545324826, "train/grad_norm": 0.11669921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1151184.3203296948, "perf/iters_per_sec": 0.5489274598740076, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.82173433303833, "data/tokens_consumed": 49306140672, "data/tokens_consumed_B": 49.306140672, "train/loss_slope": 5.424712929609726e-06} {"step": 23520, "timestamp": 1778219962.415139, "train/loss": 2.1755061030387877, "train/z_loss": 0.0015399998752400278, "train/perplexity": 8.806641062313226, "train/grad_norm": 0.212890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019307.558433898, "perf/iters_per_sec": 0.9628808777017107, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.038550066947937, "data/tokens_consumed": 49327112192, "data/tokens_consumed_B": 49.327112192, "train/loss_slope": 1.0541861242550773e-06} {"step": 23530, "timestamp": 1778219972.7876203, "train/loss": 2.215902829170227, "train/z_loss": 0.0015312895644456147, "train/perplexity": 9.169684033853667, "train/grad_norm": 0.1630859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022738.4167216977, "perf/iters_per_sec": 0.9645168384178627, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036788535118103, "data/tokens_consumed": 49348083712, "data/tokens_consumed_B": 49.348083712, "train/loss_slope": 2.595935667594732e-06} {"step": 23540, "timestamp": 1778219983.1597087, "train/loss": 2.1713456392288206, "train/z_loss": 0.0015443074284121395, "train/perplexity": 8.770077464390388, "train/grad_norm": 0.271484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023082.0321897098, "perf/iters_per_sec": 0.9646806870411443, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0366124391555787, "data/tokens_consumed": 49369055232, "data/tokens_consumed_B": 49.369055232, "train/loss_slope": 2.327641731191205e-06} {"step": 23550, "timestamp": 1778219993.5205073, "grad/layer_0/attn": 0.003512832336127758, "grad/layer_0/mlp": 0.0030942391604185104, "grad/layer_0/attn_mlp_ratio": 1.1352814183000932, "grad/layer_4/attn": 0.001962015638127923, "grad/layer_4/mlp": 0.0026168732438236475, "grad/layer_4/attn_mlp_ratio": 0.7497556741745558, "grad/layer_8/attn": 0.006169595755636692, "grad/layer_8/mlp": 0.0038006198592483997, "grad/layer_8/attn_mlp_ratio": 1.6233129915090942, "grad/layer_12/attn": 0.004947044886648655, "grad/layer_12/mlp": 0.006541762966662645, "grad/layer_12/attn_mlp_ratio": 0.756225017053766, "grad/layer_16/attn": 0.004726017825305462, "grad/layer_16/mlp": 0.005429021082818508, "grad/layer_16/attn_mlp_ratio": 0.8705101096790936, "grad/layer_20/attn": 0.010142862796783447, "grad/layer_20/mlp": 0.007342717610299587, "grad/layer_20/attn_mlp_ratio": 1.381349957462768, "grad/layer_24/attn": 0.0237981379032135, "grad/layer_24/mlp": 0.01412271149456501, "grad/layer_24/attn_mlp_ratio": 1.68509692659673, "grad/layer_27/attn": 0.009666417725384235, "grad/layer_27/mlp": 0.0137609438970685, "grad/layer_27/attn_mlp_ratio": 0.7024530967819851} {"step": 23550, "timestamp": 1778219994.1292214, "eos/sharpness": 69.6777820587158, "eos/L0_probe": 2.0384747982025146, "eos/L_plus": 2.303922653198242, "eos/L_minus": 2.4698047637939453, "eos/grad_norm": 0.2779037356376648, "eos/embed_grad_frac": 0.03193844482302666, "eos/time_s": 0.6058924198150635} {"step": 23550, "timestamp": 1778219994.147648, "train/loss": 2.26220498085022, "train/z_loss": 0.0015040667960420251, "train/perplexity": 9.604243008199953, "train/grad_norm": 0.27734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1909381.4918003068, "perf/iters_per_sec": 0.9104640444757017, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0983410120010375, "data/tokens_consumed": 49390026752, "data/tokens_consumed_B": 49.390026752, "train/loss_slope": 4.309587839162614e-06} {"step": 23550, "timestamp": 1778219995.5130203, "geo/rankme_last": 441.0318298339844, "geo/layer_0/stable_rank_q_proj": 17.67450714111328, "geo/layer_0/stable_rank_k_proj": 15.576091766357422, "geo/layer_0/stable_rank_o_proj": 51.193359375, "geo/layer_0/stable_rank_gate_proj": 147.90609741210938, "geo/layer_0/stable_rank_down_proj": 50.66640090942383, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0509539395570755, "geo/layer_0/attn_entropy_mean": 6.24262809753418, "geo/layer_0/attn_entropy_std": 0.33197924494743347, "geo/layer_7/stable_rank_q_proj": 42.35834884643555, "geo/layer_7/stable_rank_k_proj": 41.80912399291992, "geo/layer_7/stable_rank_o_proj": 108.91201782226562, "geo/layer_7/stable_rank_gate_proj": 101.71765899658203, "geo/layer_7/stable_rank_down_proj": 149.71009826660156, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.552511990070343, "geo/layer_7/attn_entropy_mean": 4.657801628112793, "geo/layer_7/attn_entropy_std": 0.813814103603363, "geo/layer_14/stable_rank_q_proj": 57.45002365112305, "geo/layer_14/stable_rank_k_proj": 35.05329895019531, "geo/layer_14/stable_rank_o_proj": 54.29732894897461, "geo/layer_14/stable_rank_gate_proj": 84.7734375, "geo/layer_14/stable_rank_down_proj": 135.9432830810547, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37894776463508606, "geo/layer_14/attn_entropy_mean": 5.516385555267334, "geo/layer_14/attn_entropy_std": 0.44706133008003235, "geo/layer_21/stable_rank_q_proj": 46.73891067504883, "geo/layer_21/stable_rank_k_proj": 31.70606231689453, "geo/layer_21/stable_rank_o_proj": 82.10337829589844, "geo/layer_21/stable_rank_gate_proj": 84.12418365478516, "geo/layer_21/stable_rank_down_proj": 59.61309051513672, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15434108674526215, "geo/layer_21/attn_entropy_mean": 5.7315521240234375, "geo/layer_21/attn_entropy_std": 0.2919158935546875, "geo/layer_27/stable_rank_q_proj": 41.50550079345703, "geo/layer_27/stable_rank_k_proj": 31.309608459472656, "geo/layer_27/stable_rank_o_proj": 119.60630798339844, "geo/layer_27/stable_rank_gate_proj": 90.35782623291016, "geo/layer_27/stable_rank_down_proj": 136.274658203125, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07881376892328262, "geo/layer_27/attn_entropy_mean": 4.36884880065918, "geo/layer_27/attn_entropy_std": 0.5658217668533325, "attnres/final_alpha/block_0": 0.24270467460155487, "attnres/block_norm/0": 1.6425466537475586, "attnres/final_alpha/block_1": 0.006636437959969044, "attnres/block_norm/1": 31361.80078125, "attnres/final_alpha/block_2": 0.013704252429306507, "attnres/block_norm/2": 22158.30859375, "attnres/final_alpha/block_3": 0.01561378501355648, "attnres/block_norm/3": 32801.26953125, "attnres/final_alpha/block_4": 0.0202183797955513, "attnres/block_norm/4": 10032.96875, "attnres/final_alpha/block_5": 0.5672907829284668, "attnres/block_norm/5": 5250.12890625, "attnres/final_alpha/block_6": 0.13383173942565918, "attnres/block_norm/6": 22129.13671875, "geo/tier1_time_s": 1.361429214477539, "geo/step": 23550.0, "geo/rankme_slope": -7.878669045743297e-05} {"step": 23560, "timestamp": 1778220005.8881376, "train/loss": 2.19308660030365, "train/z_loss": 0.0015250002616085112, "train/perplexity": 8.962835153378416, "train/grad_norm": 0.11376953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1786850.2058660842, "perf/iters_per_sec": 0.8520365742998525, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.173658537864685, "data/tokens_consumed": 49410998272, "data/tokens_consumed_B": 49.410998272, "train/loss_slope": 2.135454252822724e-06} {"step": 23570, "timestamp": 1778220016.2591844, "train/loss": 2.2299979448318483, "train/z_loss": 0.001515740610193461, "train/perplexity": 9.299846966714643, "train/grad_norm": 0.1748046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023254.3025517554, "perf/iters_per_sec": 0.9647628319510247, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0365241765975952, "data/tokens_consumed": 49431969792, "data/tokens_consumed_B": 49.431969792, "train/loss_slope": 2.3299880761696582e-06} {"step": 23580, "timestamp": 1778220026.6385896, "train/loss": 2.2109689712524414, "train/z_loss": 0.0015171869425103068, "train/perplexity": 9.124553540950274, "train/grad_norm": 0.2138671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021823.7945429494, "perf/iters_per_sec": 0.9640807125773189, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0372575521469116, "data/tokens_consumed": 49452941312, "data/tokens_consumed_B": 49.452941312, "train/loss_slope": 2.151536941528302e-06} {"step": 23590, "timestamp": 1778220037.0175066, "train/loss": 2.183719301223755, "train/z_loss": 0.0015348662505857646, "train/perplexity": 8.879269598711, "train/grad_norm": 0.1953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022066.6894283926, "perf/iters_per_sec": 0.9641965338842357, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0371329545974732, "data/tokens_consumed": 49473912832, "data/tokens_consumed_B": 49.473912832, "train/loss_slope": 2.0569297454037115e-06} {"step": 23600, "timestamp": 1778220047.3767245, "grad/layer_0/attn": 0.0038531392347067595, "grad/layer_0/mlp": 0.003175540827214718, "grad/layer_0/attn_mlp_ratio": 1.2133804359707516, "grad/layer_4/attn": 0.002199147129431367, "grad/layer_4/mlp": 0.002543020062148571, "grad/layer_4/attn_mlp_ratio": 0.864777701004671, "grad/layer_8/attn": 0.005586476065218449, "grad/layer_8/mlp": 0.0039720358327031136, "grad/layer_8/attn_mlp_ratio": 1.4064515427020952, "grad/layer_12/attn": 0.0059236460365355015, "grad/layer_12/mlp": 0.006578708067536354, "grad/layer_12/attn_mlp_ratio": 0.9004269357571815, "grad/layer_16/attn": 0.004567835014313459, "grad/layer_16/mlp": 0.004919684957712889, "grad/layer_16/attn_mlp_ratio": 0.9284811854271419, "grad/layer_20/attn": 0.004005689173936844, "grad/layer_20/mlp": 0.006938754115253687, "grad/layer_20/attn_mlp_ratio": 0.5772922702941413, "grad/layer_24/attn": 0.011920827440917492, "grad/layer_24/mlp": 0.009886804036796093, "grad/layer_24/attn_mlp_ratio": 1.2057311216019035, "grad/layer_27/attn": 0.0042541963048279285, "grad/layer_27/mlp": 0.008225910365581512, "grad/layer_27/attn_mlp_ratio": 0.5171702661520748} {"step": 23600, "timestamp": 1778220047.3908477, "train/loss": 2.212845516204834, "train/z_loss": 0.0015295865014195442, "train/perplexity": 9.141692251591468, "train/grad_norm": 0.154296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022778.0944849984, "perf/iters_per_sec": 0.9645357582497589, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0367681980133057, "data/tokens_consumed": 49494884352, "data/tokens_consumed_B": 49.494884352, "train/loss_slope": 3.700840197773982e-07} {"step": 23610, "timestamp": 1778220057.77081, "train/loss": 2.253853440284729, "train/z_loss": 0.0015194592531770467, "train/perplexity": 9.524366792113755, "train/grad_norm": 0.1640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021260.425048924, "perf/iters_per_sec": 0.9638120770687695, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03754665851593, "data/tokens_consumed": 49515855872, "data/tokens_consumed_B": 49.515855872, "train/loss_slope": 2.157121973641444e-06} {"step": 23620, "timestamp": 1778220068.1470041, "train/loss": 2.173383688926697, "train/z_loss": 0.0015242519555613398, "train/perplexity": 8.787969544398052, "train/grad_norm": 0.16015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022292.1607436733, "perf/iters_per_sec": 0.9643040469854705, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037017321586609, "data/tokens_consumed": 49536827392, "data/tokens_consumed_B": 49.536827392, "train/loss_slope": -8.859365722491504e-07} {"step": 23625, "timestamp": 1778220073.9102638, "eos/sharpness": 40.515661239624016, "eos/L0_probe": 2.0401663780212402, "eos/L_plus": 2.244931936264038, "eos/L_minus": 2.2405574321746826, "eos/grad_norm": 0.17954479157924652, "eos/embed_grad_frac": 0.0901159793138504, "eos/time_s": 0.5865261554718018} {"step": 23625, "timestamp": 1778220075.28638, "geo/rankme_last": 440.6715087890625, "geo/layer_0/stable_rank_q_proj": 17.700780868530273, "geo/layer_0/stable_rank_k_proj": 15.595980644226074, "geo/layer_0/stable_rank_o_proj": 51.25446701049805, "geo/layer_0/stable_rank_gate_proj": 148.25759887695312, "geo/layer_0/stable_rank_down_proj": 50.58034133911133, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.04877721890807152, "geo/layer_0/attn_entropy_mean": 6.2426557540893555, "geo/layer_0/attn_entropy_std": 0.3275451958179474, "geo/layer_7/stable_rank_q_proj": 42.427886962890625, "geo/layer_7/stable_rank_k_proj": 41.73609161376953, "geo/layer_7/stable_rank_o_proj": 108.72476959228516, "geo/layer_7/stable_rank_gate_proj": 101.58887481689453, "geo/layer_7/stable_rank_down_proj": 149.0673370361328, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5448621511459351, "geo/layer_7/attn_entropy_mean": 4.683249473571777, "geo/layer_7/attn_entropy_std": 0.8272822499275208, "geo/layer_14/stable_rank_q_proj": 57.31450653076172, "geo/layer_14/stable_rank_k_proj": 34.95931625366211, "geo/layer_14/stable_rank_o_proj": 54.1103515625, "geo/layer_14/stable_rank_gate_proj": 84.72417449951172, "geo/layer_14/stable_rank_down_proj": 136.09063720703125, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3801449239253998, "geo/layer_14/attn_entropy_mean": 5.528592109680176, "geo/layer_14/attn_entropy_std": 0.45060259103775024, "geo/layer_21/stable_rank_q_proj": 46.6954460144043, "geo/layer_21/stable_rank_k_proj": 31.730009078979492, "geo/layer_21/stable_rank_o_proj": 82.08695983886719, "geo/layer_21/stable_rank_gate_proj": 84.05391693115234, "geo/layer_21/stable_rank_down_proj": 59.59916305541992, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15386702120304108, "geo/layer_21/attn_entropy_mean": 5.736136436462402, "geo/layer_21/attn_entropy_std": 0.2891284227371216, "geo/layer_27/stable_rank_q_proj": 41.35369110107422, "geo/layer_27/stable_rank_k_proj": 31.263072967529297, "geo/layer_27/stable_rank_o_proj": 119.4166259765625, "geo/layer_27/stable_rank_gate_proj": 90.33253479003906, "geo/layer_27/stable_rank_down_proj": 136.58555603027344, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07548369467258453, "geo/layer_27/attn_entropy_mean": 4.367249965667725, "geo/layer_27/attn_entropy_std": 0.5709080100059509, "attnres/final_alpha/block_0": 0.24173057079315186, "attnres/block_norm/0": 1.6433238983154297, "attnres/final_alpha/block_1": 0.006567173637449741, "attnres/block_norm/1": 31462.1796875, "attnres/final_alpha/block_2": 0.013403858989477158, "attnres/block_norm/2": 22121.369140625, "attnres/final_alpha/block_3": 0.015410698018968105, "attnres/block_norm/3": 33084.84375, "attnres/final_alpha/block_4": 0.01988491788506508, "attnres/block_norm/4": 10028.154296875, "attnres/final_alpha/block_5": 0.5708693265914917, "attnres/block_norm/5": 5236.056640625, "attnres/final_alpha/block_6": 0.13213342428207397, "attnres/block_norm/6": 22066.96875, "geo/tier1_time_s": 1.3582730293273926, "geo/step": 23625.0, "geo/rankme_slope": -8.004094215811324e-05} {"step": 23630, "timestamp": 1778220080.4597332, "train/loss": 2.268198013305664, "train/z_loss": 0.0014992620679549872, "train/perplexity": 9.661974368423863, "train/grad_norm": 0.1298828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1703984.525405325, "perf/iters_per_sec": 0.8125231387163758, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2307341814041137, "data/tokens_consumed": 49557798912, "data/tokens_consumed_B": 49.557798912, "train/loss_slope": -8.4080262617631e-07} {"step": 23640, "timestamp": 1778220090.7990358, "train/loss": 2.23800368309021, "train/z_loss": 0.0015142828691750766, "train/perplexity": 9.374597926438552, "train/grad_norm": 0.158203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029227.5865193072, "perf/iters_per_sec": 0.9676111157032524, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0334730386734008, "data/tokens_consumed": 49578770432, "data/tokens_consumed_B": 49.578770432, "train/loss_slope": 1.5333111279725643e-06} {"step": 23650, "timestamp": 1778220101.134942, "grad/layer_0/attn": 0.0031285579316318035, "grad/layer_0/mlp": 0.0029845135286450386, "grad/layer_0/attn_mlp_ratio": 1.048263911950089, "grad/layer_4/attn": 0.0018273195018991828, "grad/layer_4/mlp": 0.002544229384511709, "grad/layer_4/attn_mlp_ratio": 0.7182211797415293, "grad/layer_8/attn": 0.0038774623535573483, "grad/layer_8/mlp": 0.00382538465783, "grad/layer_8/attn_mlp_ratio": 1.0136136883017461, "grad/layer_12/attn": 0.004389588721096516, "grad/layer_12/mlp": 0.0069303689524531364, "grad/layer_12/attn_mlp_ratio": 0.6333845554072964, "grad/layer_16/attn": 0.004292046185582876, "grad/layer_16/mlp": 0.004583603236824274, "grad/layer_16/attn_mlp_ratio": 0.9363912778186864, "grad/layer_20/attn": 0.004227038938552141, "grad/layer_20/mlp": 0.006100909318774939, "grad/layer_20/attn_mlp_ratio": 0.6928539088851003, "grad/layer_24/attn": 0.014482651837170124, "grad/layer_24/mlp": 0.013766900636255741, "grad/layer_24/attn_mlp_ratio": 1.0519907214141104, "grad/layer_27/attn": 0.004578994121402502, "grad/layer_27/mlp": 0.012909401208162308, "grad/layer_27/attn_mlp_ratio": 0.3547022834054522} {"step": 23650, "timestamp": 1778220101.1492558, "train/loss": 2.198450493812561, "train/z_loss": 0.0015258138766512275, "train/perplexity": 9.011040013971943, "train/grad_norm": 0.1806640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027602.165013629, "perf/iters_per_sec": 0.9668360543316026, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034301519393921, "data/tokens_consumed": 49599741952, "data/tokens_consumed_B": 49.599741952, "train/loss_slope": 8.048515651736176e-07} {"step": 23660, "timestamp": 1778220111.4900298, "train/loss": 2.2275322914123534, "train/z_loss": 0.0015130354673601686, "train/perplexity": 9.27694501298325, "train/grad_norm": 0.142578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029125.4444652393, "perf/iters_per_sec": 0.9675624105764576, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0335250616073608, "data/tokens_consumed": 49620713472, "data/tokens_consumed_B": 49.620713472, "train/loss_slope": 8.232378508999637e-07} {"step": 23670, "timestamp": 1778220121.8333743, "train/loss": 2.2491968870162964, "train/z_loss": 0.0015196322929114103, "train/perplexity": 9.480119171453024, "train/grad_norm": 0.1630859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028593.6480939689, "perf/iters_per_sec": 0.9673088303060383, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033796000480652, "data/tokens_consumed": 49641684992, "data/tokens_consumed_B": 49.641684992, "train/loss_slope": 2.701436339503397e-06} {"step": 23680, "timestamp": 1778220132.1767237, "train/loss": 2.2260342597961427, "train/z_loss": 0.0015227502095513045, "train/perplexity": 9.263058260046362, "train/grad_norm": 0.1494140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028990.0822989442, "perf/iters_per_sec": 0.9674978648657533, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033594012260437, "data/tokens_consumed": 49662656512, "data/tokens_consumed_B": 49.662656512, "train/loss_slope": 2.3242286043484678e-06} {"step": 23690, "timestamp": 1778220142.5157917, "train/loss": 2.2249836683273316, "train/z_loss": 0.0015304244123399257, "train/perplexity": 9.25333168028875, "train/grad_norm": 0.125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029429.5603334422, "perf/iters_per_sec": 0.967707424322816, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0333701848983765, "data/tokens_consumed": 49683628032, "data/tokens_consumed_B": 49.683628032, "train/loss_slope": 5.692015815847037e-06} {"step": 23700, "timestamp": 1778220152.8495913, "grad/layer_0/attn": 0.0037225037813186646, "grad/layer_0/mlp": 0.003042136784642935, "grad/layer_0/attn_mlp_ratio": 1.2236476932087776, "grad/layer_4/attn": 0.002096485812216997, "grad/layer_4/mlp": 0.00258428486995399, "grad/layer_4/attn_mlp_ratio": 0.8112440526457585, "grad/layer_8/attn": 0.004971283487975597, "grad/layer_8/mlp": 0.003925143741071224, "grad/layer_8/attn_mlp_ratio": 1.2665226267526717, "grad/layer_12/attn": 0.005176362115889788, "grad/layer_12/mlp": 0.006580508314073086, "grad/layer_12/attn_mlp_ratio": 0.7866203931628766, "grad/layer_16/attn": 0.005457636434584856, "grad/layer_16/mlp": 0.0049476707354187965, "grad/layer_16/attn_mlp_ratio": 1.1030718526211118, "grad/layer_20/attn": 0.010880588553845882, "grad/layer_20/mlp": 0.006673374678939581, "grad/layer_20/attn_mlp_ratio": 1.6304476991437369, "grad/layer_24/attn": 0.02190137840807438, "grad/layer_24/mlp": 0.014248166233301163, "grad/layer_24/attn_mlp_ratio": 1.5371366318826545, "grad/layer_27/attn": 0.006059493403881788, "grad/layer_27/mlp": 0.01503808330744505, "grad/layer_27/attn_mlp_ratio": 0.4029431969290619} {"step": 23700, "timestamp": 1778220153.472006, "eos/sharpness": 66.9569969177246, "eos/L0_probe": 2.037619113922119, "eos/L_plus": 2.3068370819091797, "eos/L_minus": 2.4379711151123047, "eos/grad_norm": 0.272273987531662, "eos/embed_grad_frac": 0.031201539561152458, "eos/time_s": 0.6195604801177979} {"step": 23700, "timestamp": 1778220153.4913113, "train/loss": 2.2126423120498657, "train/z_loss": 0.001527089613955468, "train/perplexity": 9.139834810468772, "train/grad_norm": 0.271484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1911627.794749851, "perf/iters_per_sec": 0.9115351651906257, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.097050380706787, "data/tokens_consumed": 49704599552, "data/tokens_consumed_B": 49.704599552, "train/loss_slope": 6.9953421161513255e-06} {"step": 23700, "timestamp": 1778220154.8523579, "geo/rankme_last": 440.86614990234375, "geo/layer_0/stable_rank_q_proj": 17.71420669555664, "geo/layer_0/stable_rank_k_proj": 15.599727630615234, "geo/layer_0/stable_rank_o_proj": 51.176841735839844, "geo/layer_0/stable_rank_gate_proj": 148.585205078125, "geo/layer_0/stable_rank_down_proj": 50.59672927856445, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.048330627381801605, "geo/layer_0/attn_entropy_mean": 6.23809289932251, "geo/layer_0/attn_entropy_std": 0.3287367820739746, "geo/layer_7/stable_rank_q_proj": 42.473907470703125, "geo/layer_7/stable_rank_k_proj": 41.819244384765625, "geo/layer_7/stable_rank_o_proj": 108.96172332763672, "geo/layer_7/stable_rank_gate_proj": 101.45126342773438, "geo/layer_7/stable_rank_down_proj": 149.19979858398438, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5432002544403076, "geo/layer_7/attn_entropy_mean": 4.663283824920654, "geo/layer_7/attn_entropy_std": 0.83647221326828, "geo/layer_14/stable_rank_q_proj": 57.386844635009766, "geo/layer_14/stable_rank_k_proj": 34.92256164550781, "geo/layer_14/stable_rank_o_proj": 54.02385711669922, "geo/layer_14/stable_rank_gate_proj": 84.53308868408203, "geo/layer_14/stable_rank_down_proj": 136.24798583984375, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38087278604507446, "geo/layer_14/attn_entropy_mean": 5.489150047302246, "geo/layer_14/attn_entropy_std": 0.48780062794685364, "geo/layer_21/stable_rank_q_proj": 46.619937896728516, "geo/layer_21/stable_rank_k_proj": 31.89514923095703, "geo/layer_21/stable_rank_o_proj": 81.98570251464844, "geo/layer_21/stable_rank_gate_proj": 84.08148956298828, "geo/layer_21/stable_rank_down_proj": 59.586917877197266, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15200909972190857, "geo/layer_21/attn_entropy_mean": 5.744922637939453, "geo/layer_21/attn_entropy_std": 0.28606924414634705, "geo/layer_27/stable_rank_q_proj": 41.30817794799805, "geo/layer_27/stable_rank_k_proj": 31.28422737121582, "geo/layer_27/stable_rank_o_proj": 119.38768768310547, "geo/layer_27/stable_rank_gate_proj": 90.28337860107422, "geo/layer_27/stable_rank_down_proj": 136.4708251953125, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0784243494272232, "geo/layer_27/attn_entropy_mean": 4.372162818908691, "geo/layer_27/attn_entropy_std": 0.5668419599533081, "attnres/final_alpha/block_0": 0.24239271879196167, "attnres/block_norm/0": 1.6439378261566162, "attnres/final_alpha/block_1": 0.006679496727883816, "attnres/block_norm/1": 31514.142578125, "attnres/final_alpha/block_2": 0.013630308210849762, "attnres/block_norm/2": 22199.3671875, "attnres/final_alpha/block_3": 0.015642661601305008, "attnres/block_norm/3": 32752.39453125, "attnres/final_alpha/block_4": 0.01989673264324665, "attnres/block_norm/4": 10081.2646484375, "attnres/final_alpha/block_5": 0.56782066822052, "attnres/block_norm/5": 5305.4921875, "attnres/final_alpha/block_6": 0.13393741846084595, "attnres/block_norm/6": 22222.005859375, "geo/tier1_time_s": 1.3570687770843506, "geo/step": 23700.0, "geo/rankme_slope": -5.912327821753702e-05} {"step": 23710, "timestamp": 1778220165.203329, "train/loss": 2.23872971534729, "train/z_loss": 0.0015252503450028597, "train/perplexity": 9.381406658310699, "train/grad_norm": 0.11328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1791167.5890813402, "perf/iters_per_sec": 0.854095263043089, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1708295822143555, "data/tokens_consumed": 49725571072, "data/tokens_consumed_B": 49.725571072, "train/loss_slope": 7.699396414975676e-06} {"step": 23720, "timestamp": 1778220175.5430477, "train/loss": 2.2104694724082945, "train/z_loss": 0.0015250218915753067, "train/perplexity": 9.119996975097639, "train/grad_norm": 0.1689453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029165.93492924, "perf/iters_per_sec": 0.9675817179342461, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0335044384002685, "data/tokens_consumed": 49746542592, "data/tokens_consumed_B": 49.746542592, "train/loss_slope": 5.90310078857541e-06} {"step": 23730, "timestamp": 1778220185.895553, "train/loss": 2.237353301048279, "train/z_loss": 0.0015151681145653129, "train/perplexity": 9.368502838579564, "train/grad_norm": 0.12158203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027311.3991610394, "perf/iters_per_sec": 0.9666974063687512, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034449863433838, "data/tokens_consumed": 49767514112, "data/tokens_consumed_B": 49.767514112, "train/loss_slope": 7.606019460149907e-06} {"step": 23740, "timestamp": 1778220196.2493997, "train/loss": 2.2218770146369935, "train/z_loss": 0.0015162414289079606, "train/perplexity": 9.224629390398189, "train/grad_norm": 0.130859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026841.3125635758, "perf/iters_per_sec": 0.9664732516115073, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346897840499878, "data/tokens_consumed": 49788485632, "data/tokens_consumed_B": 49.788485632, "train/loss_slope": 5.5901273177473574e-06} {"step": 23750, "timestamp": 1778220206.587884, "grad/layer_0/attn": 0.0028631875757128, "grad/layer_0/mlp": 0.0027919569984078407, "grad/layer_0/attn_mlp_ratio": 1.0255127406311435, "grad/layer_4/attn": 0.0017697493312880397, "grad/layer_4/mlp": 0.0024714944884181023, "grad/layer_4/attn_mlp_ratio": 0.7160644168841901, "grad/layer_8/attn": 0.004031611140817404, "grad/layer_8/mlp": 0.003704134840518236, "grad/layer_8/attn_mlp_ratio": 1.0884082803563715, "grad/layer_12/attn": 0.004356780089437962, "grad/layer_12/mlp": 0.006207991857081652, "grad/layer_12/attn_mlp_ratio": 0.7018018257043722, "grad/layer_16/attn": 0.004792023450136185, "grad/layer_16/mlp": 0.004910353571176529, "grad/layer_16/attn_mlp_ratio": 0.9759018944531562, "grad/layer_20/attn": 0.003795072203502059, "grad/layer_20/mlp": 0.006214767228811979, "grad/layer_20/attn_mlp_ratio": 0.6106539477202807, "grad/layer_24/attn": 0.01685466058552265, "grad/layer_24/mlp": 0.01123122964054346, "grad/layer_24/attn_mlp_ratio": 1.500695914417924, "grad/layer_27/attn": 0.0038870868738740683, "grad/layer_27/mlp": 0.010711435228586197, "grad/layer_27/attn_mlp_ratio": 0.36289131704883526} {"step": 23750, "timestamp": 1778220206.6021857, "train/loss": 2.1804131746292112, "train/z_loss": 0.001526205870322883, "train/perplexity": 8.849962083224177, "train/grad_norm": 0.162109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026647.8850146157, "perf/iters_per_sec": 0.966381018168743, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347885370254517, "data/tokens_consumed": 49809457152, "data/tokens_consumed_B": 49.809457152, "train/loss_slope": 4.4268086190485635e-06} {"step": 23760, "timestamp": 1778220216.9500384, "train/loss": 2.2333661794662474, "train/z_loss": 0.0015277438797056675, "train/perplexity": 9.33122384604415, "train/grad_norm": 0.1337890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028146.067722761, "perf/iters_per_sec": 0.9670954073537641, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034024143218994, "data/tokens_consumed": 49830428672, "data/tokens_consumed_B": 49.830428672, "train/loss_slope": 9.378868504182804e-07} {"step": 23770, "timestamp": 1778220227.3052325, "train/loss": 2.259176993370056, "train/z_loss": 0.0015213213628157973, "train/perplexity": 9.575205465459003, "train/grad_norm": 0.11962890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026595.7284735118, "perf/iters_per_sec": 0.9663561479919013, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348151683807374, "data/tokens_consumed": 49851400192, "data/tokens_consumed_B": 49.851400192, "train/loss_slope": 4.086551850814194e-06} {"step": 23775, "timestamp": 1778220233.1074698, "eos/sharpness": 45.08283138275146, "eos/L0_probe": 2.041187286376953, "eos/L_plus": 2.292335033416748, "eos/L_minus": 2.240867853164673, "eos/grad_norm": 0.20220598578453064, "eos/embed_grad_frac": 0.06790454685688019, "eos/time_s": 0.6204237937927246} {"step": 23775, "timestamp": 1778220234.4971602, "geo/rankme_last": 440.4547424316406, "geo/layer_0/stable_rank_q_proj": 17.733135223388672, "geo/layer_0/stable_rank_k_proj": 15.580734252929688, "geo/layer_0/stable_rank_o_proj": 51.12080764770508, "geo/layer_0/stable_rank_gate_proj": 148.51751708984375, "geo/layer_0/stable_rank_down_proj": 50.624725341796875, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.050408441573381424, "geo/layer_0/attn_entropy_mean": 6.243166923522949, "geo/layer_0/attn_entropy_std": 0.326931893825531, "geo/layer_7/stable_rank_q_proj": 42.4908332824707, "geo/layer_7/stable_rank_k_proj": 41.804222106933594, "geo/layer_7/stable_rank_o_proj": 108.89578247070312, "geo/layer_7/stable_rank_gate_proj": 101.41219329833984, "geo/layer_7/stable_rank_down_proj": 149.31787109375, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5525128245353699, "geo/layer_7/attn_entropy_mean": 4.668448448181152, "geo/layer_7/attn_entropy_std": 0.8255916237831116, "geo/layer_14/stable_rank_q_proj": 57.63380813598633, "geo/layer_14/stable_rank_k_proj": 35.02145767211914, "geo/layer_14/stable_rank_o_proj": 54.07212829589844, "geo/layer_14/stable_rank_gate_proj": 84.47945404052734, "geo/layer_14/stable_rank_down_proj": 136.1959991455078, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37039005756378174, "geo/layer_14/attn_entropy_mean": 5.506274223327637, "geo/layer_14/attn_entropy_std": 0.4411337971687317, "geo/layer_21/stable_rank_q_proj": 46.44586944580078, "geo/layer_21/stable_rank_k_proj": 31.724632263183594, "geo/layer_21/stable_rank_o_proj": 82.09825897216797, "geo/layer_21/stable_rank_gate_proj": 83.94183349609375, "geo/layer_21/stable_rank_down_proj": 59.5586051940918, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15142254531383514, "geo/layer_21/attn_entropy_mean": 5.767154693603516, "geo/layer_21/attn_entropy_std": 0.2817533016204834, "geo/layer_27/stable_rank_q_proj": 41.34109115600586, "geo/layer_27/stable_rank_k_proj": 31.23600196838379, "geo/layer_27/stable_rank_o_proj": 119.57080841064453, "geo/layer_27/stable_rank_gate_proj": 90.29059600830078, "geo/layer_27/stable_rank_down_proj": 136.59686279296875, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08846619725227356, "geo/layer_27/attn_entropy_mean": 4.351568222045898, "geo/layer_27/attn_entropy_std": 0.5602450370788574, "attnres/final_alpha/block_0": 0.24035689234733582, "attnres/block_norm/0": 1.6445722579956055, "attnres/final_alpha/block_1": 0.006465274840593338, "attnres/block_norm/1": 31587.115234375, "attnres/final_alpha/block_2": 0.013271532021462917, "attnres/block_norm/2": 22216.064453125, "attnres/final_alpha/block_3": 0.015202931128442287, "attnres/block_norm/3": 33247.734375, "attnres/final_alpha/block_4": 0.01963638886809349, "attnres/block_norm/4": 10084.30859375, "attnres/final_alpha/block_5": 0.5778215527534485, "attnres/block_norm/5": 5183.0556640625, "attnres/final_alpha/block_6": 0.12724542617797852, "attnres/block_norm/6": 22374.828125, "geo/tier1_time_s": 1.3664133548736572, "geo/step": 23775.0, "geo/rankme_slope": -6.891451893257303e-05} {"step": 23780, "timestamp": 1778220239.6902304, "train/loss": 2.2072657346725464, "train/z_loss": 0.001523337501566857, "train/perplexity": 9.090825650226801, "train/grad_norm": 0.2060546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1694217.4773649757, "perf/iters_per_sec": 0.8078658472847823, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2378292798995971, "data/tokens_consumed": 49872371712, "data/tokens_consumed_B": 49.872371712, "train/loss_slope": 1.185342931475611e-06} {"step": 23790, "timestamp": 1778220250.0673792, "train/loss": 2.1925008058547975, "train/z_loss": 0.0015280009829439223, "train/perplexity": 8.957586311820808, "train/grad_norm": 0.10546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021937.7517342772, "perf/iters_per_sec": 0.9641350516005884, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0371990919113159, "data/tokens_consumed": 49893343232, "data/tokens_consumed_B": 49.893343232, "train/loss_slope": -2.393849361704653e-06} {"step": 23800, "timestamp": 1778220260.413727, "grad/layer_0/attn": 0.0029237919952720404, "grad/layer_0/mlp": 0.0027790958993136883, "grad/layer_0/attn_mlp_ratio": 1.0520658501880056, "grad/layer_4/attn": 0.0018856280948966742, "grad/layer_4/mlp": 0.0024176030419766903, "grad/layer_4/attn_mlp_ratio": 0.7799576622633521, "grad/layer_8/attn": 0.005337398499250412, "grad/layer_8/mlp": 0.0035221013240516186, "grad/layer_8/attn_mlp_ratio": 1.5154017038812535, "grad/layer_12/attn": 0.004533088766038418, "grad/layer_12/mlp": 0.005875210743397474, "grad/layer_12/attn_mlp_ratio": 0.7715618872014909, "grad/layer_16/attn": 0.005364725831896067, "grad/layer_16/mlp": 0.004198980517685413, "grad/layer_16/attn_mlp_ratio": 1.2776257669065505, "grad/layer_20/attn": 0.006838769651949406, "grad/layer_20/mlp": 0.0063031441532075405, "grad/layer_20/attn_mlp_ratio": 1.0849774933310938, "grad/layer_24/attn": 0.011388068087399006, "grad/layer_24/mlp": 0.013564836233854294, "grad/layer_24/attn_mlp_ratio": 0.8395286022712533, "grad/layer_27/attn": 0.009907439351081848, "grad/layer_27/mlp": 0.010934458114206791, "grad/layer_27/attn_mlp_ratio": 0.9060750113992324} {"step": 23800, "timestamp": 1778220260.4284782, "train/loss": 2.2137266397476196, "train/z_loss": 0.0015315836295485496, "train/perplexity": 9.14975076160532, "train/grad_norm": 0.134765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025175.0406879517, "perf/iters_per_sec": 0.965678711265541, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0355411052703858, "data/tokens_consumed": 49914314752, "data/tokens_consumed_B": 49.914314752, "train/loss_slope": 6.052812655838524e-07} {"step": 23810, "timestamp": 1778220270.773246, "train/loss": 2.251441144943237, "train/z_loss": 0.0015233993181027472, "train/perplexity": 9.501418896149884, "train/grad_norm": 0.125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028374.1598214183, "perf/iters_per_sec": 0.9672041701418964, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0339078664779664, "data/tokens_consumed": 49935286272, "data/tokens_consumed_B": 49.935286272, "train/loss_slope": -8.612336892105935e-07} {"step": 23820, "timestamp": 1778220281.1275983, "train/loss": 2.228775906562805, "train/z_loss": 0.0015237248968333005, "train/perplexity": 9.288489139088538, "train/grad_norm": 0.10302734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026687.2026420385, "perf/iters_per_sec": 0.9663997662744706, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347684621810913, "data/tokens_consumed": 49956257792, "data/tokens_consumed_B": 49.956257792, "train/loss_slope": -9.509228100620696e-07} {"step": 23830, "timestamp": 1778220291.4825912, "train/loss": 2.182461404800415, "train/z_loss": 0.001536037027835846, "train/perplexity": 8.868107419145616, "train/grad_norm": 0.126953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026972.4178446534, "perf/iters_per_sec": 0.9665357674811618, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034622859954834, "data/tokens_consumed": 49977229312, "data/tokens_consumed_B": 49.977229312, "train/loss_slope": -1.8444588332429332e-06} {"step": 23840, "timestamp": 1778220301.8319342, "train/loss": 2.254063105583191, "train/z_loss": 0.0015022901236079634, "train/perplexity": 9.526363930677896, "train/grad_norm": 0.1650390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027586.4609663999, "perf/iters_per_sec": 0.9668285660583495, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343095302581786, "data/tokens_consumed": 49998200832, "data/tokens_consumed_B": 49.998200832, "train/loss_slope": 1.17081540002147e-06} {"step": 23850, "timestamp": 1778220312.2107153, "grad/layer_0/attn": 0.0027530165389180183, "grad/layer_0/mlp": 0.0027068161871284246, "grad/layer_0/attn_mlp_ratio": 1.0170681150432288, "grad/layer_4/attn": 0.0020323789212852716, "grad/layer_4/mlp": 0.0025420684833079576, "grad/layer_4/attn_mlp_ratio": 0.7994980680814543, "grad/layer_8/attn": 0.005387832876294851, "grad/layer_8/mlp": 0.0036660090554505587, "grad/layer_8/attn_mlp_ratio": 1.4696725097601884, "grad/layer_12/attn": 0.0036973352544009686, "grad/layer_12/mlp": 0.005795619450509548, "grad/layer_12/attn_mlp_ratio": 0.6379534098431118, "grad/layer_16/attn": 0.003891024040058255, "grad/layer_16/mlp": 0.0042117624543607235, "grad/layer_16/attn_mlp_ratio": 0.9238469618923827, "grad/layer_20/attn": 0.003949490841478109, "grad/layer_20/mlp": 0.0063478704541921616, "grad/layer_20/attn_mlp_ratio": 0.622175705657679, "grad/layer_24/attn": 0.010813247412443161, "grad/layer_24/mlp": 0.009972854517400265, "grad/layer_24/attn_mlp_ratio": 1.0842680282911785, "grad/layer_27/attn": 0.008040917105972767, "grad/layer_27/mlp": 0.00922273937612772, "grad/layer_27/attn_mlp_ratio": 0.8718577735808325} {"step": 23850, "timestamp": 1778220312.8245986, "eos/sharpness": 42.50280857086181, "eos/L0_probe": 2.039612293243408, "eos/L_plus": 2.228883743286133, "eos/L_minus": 2.2753689289093018, "eos/grad_norm": 0.13667386770248413, "eos/embed_grad_frac": 0.14072109758853912, "eos/time_s": 0.6109654903411865} {"step": 23850, "timestamp": 1778220312.8449502, "train/loss": 2.195765233039856, "train/z_loss": 0.0015193056082352997, "train/perplexity": 8.986875480258503, "train/grad_norm": 0.13671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1905664.0872751991, "perf/iters_per_sec": 0.9086914478660579, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1004835605621337, "data/tokens_consumed": 50019172352, "data/tokens_consumed_B": 50.019172352, "train/loss_slope": 9.779272442854337e-07} {"step": 23850, "timestamp": 1778220314.2062938, "geo/rankme_last": 439.1823425292969, "geo/layer_0/stable_rank_q_proj": 17.74259376525879, "geo/layer_0/stable_rank_k_proj": 15.59409236907959, "geo/layer_0/stable_rank_o_proj": 50.962562561035156, "geo/layer_0/stable_rank_gate_proj": 148.25869750976562, "geo/layer_0/stable_rank_down_proj": 50.60002517700195, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.04768124967813492, "geo/layer_0/attn_entropy_mean": 6.243503570556641, "geo/layer_0/attn_entropy_std": 0.32740384340286255, "geo/layer_7/stable_rank_q_proj": 42.50355911254883, "geo/layer_7/stable_rank_k_proj": 41.87160873413086, "geo/layer_7/stable_rank_o_proj": 109.07611846923828, "geo/layer_7/stable_rank_gate_proj": 101.22522735595703, "geo/layer_7/stable_rank_down_proj": 149.15249633789062, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5596306324005127, "geo/layer_7/attn_entropy_mean": 4.660700798034668, "geo/layer_7/attn_entropy_std": 0.8302637338638306, "geo/layer_14/stable_rank_q_proj": 57.577877044677734, "geo/layer_14/stable_rank_k_proj": 35.05978012084961, "geo/layer_14/stable_rank_o_proj": 54.08030319213867, "geo/layer_14/stable_rank_gate_proj": 84.4415512084961, "geo/layer_14/stable_rank_down_proj": 136.04202270507812, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38305121660232544, "geo/layer_14/attn_entropy_mean": 5.510054588317871, "geo/layer_14/attn_entropy_std": 0.4611109495162964, "geo/layer_21/stable_rank_q_proj": 46.53078079223633, "geo/layer_21/stable_rank_k_proj": 31.660764694213867, "geo/layer_21/stable_rank_o_proj": 81.97412109375, "geo/layer_21/stable_rank_gate_proj": 83.85499572753906, "geo/layer_21/stable_rank_down_proj": 59.56535339355469, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14952823519706726, "geo/layer_21/attn_entropy_mean": 5.723682403564453, "geo/layer_21/attn_entropy_std": 0.30362480878829956, "geo/layer_27/stable_rank_q_proj": 41.365074157714844, "geo/layer_27/stable_rank_k_proj": 31.234277725219727, "geo/layer_27/stable_rank_o_proj": 119.45557403564453, "geo/layer_27/stable_rank_gate_proj": 90.24617004394531, "geo/layer_27/stable_rank_down_proj": 137.00535583496094, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07956350594758987, "geo/layer_27/attn_entropy_mean": 4.356382369995117, "geo/layer_27/attn_entropy_std": 0.5680580139160156, "attnres/final_alpha/block_0": 0.24110959470272064, "attnres/block_norm/0": 1.645202875137329, "attnres/final_alpha/block_1": 0.006514600478112698, "attnres/block_norm/1": 31637.978515625, "attnres/final_alpha/block_2": 0.013247692957520485, "attnres/block_norm/2": 22326.34765625, "attnres/final_alpha/block_3": 0.01531831081956625, "attnres/block_norm/3": 33451.3203125, "attnres/final_alpha/block_4": 0.019581621512770653, "attnres/block_norm/4": 10075.69921875, "attnres/final_alpha/block_5": 0.5741977691650391, "attnres/block_norm/5": 5296.564453125, "attnres/final_alpha/block_6": 0.13003040850162506, "attnres/block_norm/6": 22385.91015625, "geo/tier1_time_s": 1.3569884300231934, "geo/step": 23850.0, "geo/rankme_slope": -0.00010236795890231093} {"step": 23860, "timestamp": 1778220324.5833116, "train/loss": 2.192746031284332, "train/z_loss": 0.0015250037889927627, "train/perplexity": 8.959783209128252, "train/grad_norm": 0.1669921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1787204.3661225382, "perf/iters_per_sec": 0.8522054510700885, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1734259605407715, "data/tokens_consumed": 50040143872, "data/tokens_consumed_B": 50.040143872, "train/loss_slope": -2.866530003506153e-06} {"step": 23870, "timestamp": 1778220334.959435, "train/loss": 2.2536890506744385, "train/z_loss": 0.001513426541350782, "train/perplexity": 9.52280121385446, "train/grad_norm": 0.1513671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022195.2251611026, "perf/iters_per_sec": 0.9642578244977487, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0370670318603517, "data/tokens_consumed": 50061115392, "data/tokens_consumed_B": 50.061115392, "train/loss_slope": 1.1462462116500026e-07} {"step": 23880, "timestamp": 1778220345.3466895, "train/loss": 2.2325729608535765, "train/z_loss": 0.0015271283569745719, "train/perplexity": 9.323825080417713, "train/grad_norm": 0.1455078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020012.1985326728, "perf/iters_per_sec": 0.9632168762839665, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0381877899169922, "data/tokens_consumed": 50082086912, "data/tokens_consumed_B": 50.082086912, "train/loss_slope": -1.7202272452358007e-06} {"step": 23890, "timestamp": 1778220355.7235353, "train/loss": 2.185792589187622, "train/z_loss": 0.0015297549427486957, "train/perplexity": 8.897697978565779, "train/grad_norm": 0.216796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022168.3544960653, "perf/iters_per_sec": 0.9642450115661932, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0370808124542237, "data/tokens_consumed": 50103058432, "data/tokens_consumed_B": 50.103058432, "train/loss_slope": -1.9911048435928737e-06} {"step": 23900, "timestamp": 1778220366.091198, "grad/layer_0/attn": 0.002717751543968916, "grad/layer_0/mlp": 0.0025871936231851578, "grad/layer_0/attn_mlp_ratio": 1.0504631020142647, "grad/layer_4/attn": 0.001567474682815373, "grad/layer_4/mlp": 0.002309238538146019, "grad/layer_4/attn_mlp_ratio": 0.6787841918641314, "grad/layer_8/attn": 0.004745005164295435, "grad/layer_8/mlp": 0.0036487935576587915, "grad/layer_8/attn_mlp_ratio": 1.300431213569916, "grad/layer_12/attn": 0.0045630536042153835, "grad/layer_12/mlp": 0.006149177439510822, "grad/layer_12/attn_mlp_ratio": 0.7420591737506383, "grad/layer_16/attn": 0.0038519976660609245, "grad/layer_16/mlp": 0.004365419968962669, "grad/layer_16/attn_mlp_ratio": 0.882388774782046, "grad/layer_20/attn": 0.0051038796082139015, "grad/layer_20/mlp": 0.005778228864073753, "grad/layer_20/attn_mlp_ratio": 0.8832948019103031, "grad/layer_24/attn": 0.00630379980430007, "grad/layer_24/mlp": 0.008361946791410446, "grad/layer_24/attn_mlp_ratio": 0.7538674768163686, "grad/layer_27/attn": 0.003973069135099649, "grad/layer_27/mlp": 0.007040114142000675, "grad/layer_27/attn_mlp_ratio": 0.5643472532585742} {"step": 23900, "timestamp": 1778220366.1080673, "train/loss": 2.2319806575775147, "train/z_loss": 0.0015196503605693578, "train/perplexity": 9.318304183461237, "train/grad_norm": 0.09228515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020881.7232911922, "perf/iters_per_sec": 0.9636314979988061, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0377410888671874, "data/tokens_consumed": 50124029952, "data/tokens_consumed_B": 50.124029952, "train/loss_slope": -2.0467184915436593e-06} {"step": 23910, "timestamp": 1778220376.4965284, "train/loss": 2.2364949941635133, "train/z_loss": 0.0015127059305086732, "train/perplexity": 9.36046523795068, "train/grad_norm": 0.173828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019918.8674979385, "perf/iters_per_sec": 0.9631723725785916, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0382357597351075, "data/tokens_consumed": 50145001472, "data/tokens_consumed_B": 50.145001472, "train/loss_slope": -3.7897359062306834e-06} {"step": 23920, "timestamp": 1778220386.8858802, "train/loss": 2.2354299545288088, "train/z_loss": 0.001521391433198005, "train/perplexity": 9.350501278420749, "train/grad_norm": 0.1484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019693.1366836755, "perf/iters_per_sec": 0.9630647357385995, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.038351798057556, "data/tokens_consumed": 50165972992, "data/tokens_consumed_B": 50.165972992, "train/loss_slope": -4.425607670401423e-06} {"step": 23925, "timestamp": 1778220392.6888354, "eos/sharpness": 24.72455501556396, "eos/L0_probe": 2.0337460041046143, "eos/L_plus": 2.164421796798706, "eos/L_minus": 2.150315761566162, "eos/grad_norm": 0.1406468003988266, "eos/embed_grad_frac": 0.11959853023290634, "eos/time_s": 0.6147282123565674} {"step": 23925, "timestamp": 1778220394.0691714, "geo/rankme_last": 440.3022766113281, "geo/layer_0/stable_rank_q_proj": 17.736684799194336, "geo/layer_0/stable_rank_k_proj": 15.6004638671875, "geo/layer_0/stable_rank_o_proj": 50.90971374511719, "geo/layer_0/stable_rank_gate_proj": 148.33700561523438, "geo/layer_0/stable_rank_down_proj": 50.579654693603516, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.051477402448654175, "geo/layer_0/attn_entropy_mean": 6.248034477233887, "geo/layer_0/attn_entropy_std": 0.32419058680534363, "geo/layer_7/stable_rank_q_proj": 42.418819427490234, "geo/layer_7/stable_rank_k_proj": 41.87821960449219, "geo/layer_7/stable_rank_o_proj": 108.48770904541016, "geo/layer_7/stable_rank_gate_proj": 101.06038665771484, "geo/layer_7/stable_rank_down_proj": 149.02769470214844, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5462988018989563, "geo/layer_7/attn_entropy_mean": 4.657890796661377, "geo/layer_7/attn_entropy_std": 0.8311758637428284, "geo/layer_14/stable_rank_q_proj": 57.68301773071289, "geo/layer_14/stable_rank_k_proj": 35.04940414428711, "geo/layer_14/stable_rank_o_proj": 54.17069625854492, "geo/layer_14/stable_rank_gate_proj": 84.41805267333984, "geo/layer_14/stable_rank_down_proj": 136.029052734375, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.378640741109848, "geo/layer_14/attn_entropy_mean": 5.476420879364014, "geo/layer_14/attn_entropy_std": 0.4318969249725342, "geo/layer_21/stable_rank_q_proj": 46.49443817138672, "geo/layer_21/stable_rank_k_proj": 31.643184661865234, "geo/layer_21/stable_rank_o_proj": 81.89202117919922, "geo/layer_21/stable_rank_gate_proj": 83.6471176147461, "geo/layer_21/stable_rank_down_proj": 59.43488693237305, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15274687111377716, "geo/layer_21/attn_entropy_mean": 5.736676216125488, "geo/layer_21/attn_entropy_std": 0.29125046730041504, "geo/layer_27/stable_rank_q_proj": 41.39395523071289, "geo/layer_27/stable_rank_k_proj": 31.23467254638672, "geo/layer_27/stable_rank_o_proj": 119.54349517822266, "geo/layer_27/stable_rank_gate_proj": 90.2236557006836, "geo/layer_27/stable_rank_down_proj": 136.50509643554688, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07763446122407913, "geo/layer_27/attn_entropy_mean": 4.353587627410889, "geo/layer_27/attn_entropy_std": 0.545189619064331, "attnres/final_alpha/block_0": 0.23994868993759155, "attnres/block_norm/0": 1.6453653573989868, "attnres/final_alpha/block_1": 0.006369183771312237, "attnres/block_norm/1": 31816.609375, "attnres/final_alpha/block_2": 0.01336658000946045, "attnres/block_norm/2": 22292.2109375, "attnres/final_alpha/block_3": 0.015385749749839306, "attnres/block_norm/3": 33170.9140625, "attnres/final_alpha/block_4": 0.019366679713129997, "attnres/block_norm/4": 10159.0546875, "attnres/final_alpha/block_5": 0.5768343210220337, "attnres/block_norm/5": 5226.4345703125, "attnres/final_alpha/block_6": 0.1287287175655365, "attnres/block_norm/6": 22547.84765625, "geo/tier1_time_s": 1.360692024230957, "geo/step": 23925.0, "geo/rankme_slope": -9.34063469137655e-05} {"step": 23930, "timestamp": 1778220399.262394, "train/loss": 2.1876587152481077, "train/z_loss": 0.0015317594399675727, "train/perplexity": 8.914317707073073, "train/grad_norm": 0.2333984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1695602.5799041374, "perf/iters_per_sec": 0.8085263156433761, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2368181228637696, "data/tokens_consumed": 50186944512, "data/tokens_consumed_B": 50.186944512, "train/loss_slope": -5.717151557723198e-06} {"step": 23940, "timestamp": 1778220409.646556, "train/loss": 2.1580130457878113, "train/z_loss": 0.0015367638203315436, "train/perplexity": 8.65392561040694, "train/grad_norm": 0.1318359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020881.212569206, "perf/iters_per_sec": 0.9636312544675856, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0377413511276246, "data/tokens_consumed": 50207916032, "data/tokens_consumed_B": 50.207916032, "train/loss_slope": -7.275744032437734e-06} {"step": 23950, "timestamp": 1778220420.0138884, "grad/layer_0/attn": 0.0026472616009414196, "grad/layer_0/mlp": 0.0026160837151110172, "grad/layer_0/attn_mlp_ratio": 1.011917732012374, "grad/layer_4/attn": 0.0016184070846065879, "grad/layer_4/mlp": 0.0024645288940519094, "grad/layer_4/attn_mlp_ratio": 0.6566800749808911, "grad/layer_8/attn": 0.0059838853776454926, "grad/layer_8/mlp": 0.0038203150033950806, "grad/layer_8/attn_mlp_ratio": 1.5663329373871977, "grad/layer_12/attn": 0.008094253949820995, "grad/layer_12/mlp": 0.0062996577471494675, "grad/layer_12/attn_mlp_ratio": 1.2848719956248997, "grad/layer_16/attn": 0.004992783535271883, "grad/layer_16/mlp": 0.004443839192390442, "grad/layer_16/attn_mlp_ratio": 1.1235292742969902, "grad/layer_20/attn": 0.0035336557775735855, "grad/layer_20/mlp": 0.005688996985554695, "grad/layer_20/attn_mlp_ratio": 0.6211386162503268, "grad/layer_24/attn": 0.011557823047041893, "grad/layer_24/mlp": 0.009280920028686523, "grad/layer_24/attn_mlp_ratio": 1.2453315928576585, "grad/layer_27/attn": 0.012371735647320747, "grad/layer_27/mlp": 0.008453494869172573, "grad/layer_27/attn_mlp_ratio": 1.4635054131382172} {"step": 23950, "timestamp": 1778220420.0306408, "train/loss": 2.229907917976379, "train/z_loss": 0.0015253426507115364, "train/perplexity": 9.299009768421618, "train/grad_norm": 0.16796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020623.2845260876, "perf/iters_per_sec": 0.9635082647924841, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0378738164901733, "data/tokens_consumed": 50228887552, "data/tokens_consumed_B": 50.228887552, "train/loss_slope": -4.214431536365747e-06} {"step": 23960, "timestamp": 1778220430.4021997, "train/loss": 2.1928490161895753, "train/z_loss": 0.0015315504511818289, "train/perplexity": 8.960705979067916, "train/grad_norm": 0.2412109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022980.7404905157, "perf/iters_per_sec": 0.964632387395151, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0366643428802491, "data/tokens_consumed": 50249859072, "data/tokens_consumed_B": 50.249859072, "train/loss_slope": -3.7966600381465885e-06} {"step": 23970, "timestamp": 1778220440.7774477, "train/loss": 2.218593430519104, "train/z_loss": 0.0015226409770548344, "train/perplexity": 9.194389219082188, "train/grad_norm": 0.11474609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022642.4150683703, "perf/iters_per_sec": 0.9644710612623073, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0368377447128296, "data/tokens_consumed": 50270830592, "data/tokens_consumed_B": 50.270830592, "train/loss_slope": -2.067898180332375e-06} {"step": 23980, "timestamp": 1778220451.1502736, "train/loss": 2.2371934175491335, "train/z_loss": 0.0015233239508233964, "train/perplexity": 9.367005089299868, "train/grad_norm": 0.185546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022841.8705643723, "perf/iters_per_sec": 0.9645661690542089, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036735510826111, "data/tokens_consumed": 50291802112, "data/tokens_consumed_B": 50.291802112, "train/loss_slope": 3.699644878752587e-06} {"step": 23990, "timestamp": 1778220461.5241807, "train/loss": 2.2042084693908692, "train/z_loss": 0.0015127962455153466, "train/perplexity": 9.063075026708617, "train/grad_norm": 0.142578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022586.2322815468, "perf/iters_per_sec": 0.9644442712218985, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036866545677185, "data/tokens_consumed": 50312773632, "data/tokens_consumed_B": 50.312773632, "train/loss_slope": 5.218684941557097e-06} {"step": 24000, "timestamp": 1778220471.88724, "grad/layer_0/attn": 0.002596267033368349, "grad/layer_0/mlp": 0.0026574309449642897, "grad/layer_0/attn_mlp_ratio": 0.9769837822464505, "grad/layer_4/attn": 0.0018214152660220861, "grad/layer_4/mlp": 0.0025681122206151485, "grad/layer_4/attn_mlp_ratio": 0.7092428362268035, "grad/layer_8/attn": 0.00523043517023325, "grad/layer_8/mlp": 0.0036647252272814512, "grad/layer_8/attn_mlp_ratio": 1.427237979145702, "grad/layer_12/attn": 0.007267103064805269, "grad/layer_12/mlp": 0.006173261906951666, "grad/layer_12/attn_mlp_ratio": 1.17719012357192, "grad/layer_16/attn": 0.007043059449642897, "grad/layer_16/mlp": 0.004761419724673033, "grad/layer_16/attn_mlp_ratio": 1.4791931207466136, "grad/layer_20/attn": 0.005838603246957064, "grad/layer_20/mlp": 0.006437875330448151, "grad/layer_20/attn_mlp_ratio": 0.9069146040544978, "grad/layer_24/attn": 0.012000038288533688, "grad/layer_24/mlp": 0.01120953168720007, "grad/layer_24/attn_mlp_ratio": 1.0705209206183153, "grad/layer_27/attn": 0.004449551459401846, "grad/layer_27/mlp": 0.011002859100699425, "grad/layer_27/attn_mlp_ratio": 0.4043995636260618} {"step": 24000, "timestamp": 1778220472.4941237, "eos/sharpness": 47.28646278381347, "eos/L0_probe": 2.0373830795288086, "eos/L_plus": 2.329310417175293, "eos/L_minus": 2.218320369720459, "eos/grad_norm": 0.17468993365764618, "eos/embed_grad_frac": 0.08319924026727676, "eos/time_s": 0.6042177677154541} {"step": 24000, "timestamp": 1778220472.5137136, "train/loss": 2.1957093477249146, "train/z_loss": 0.001523239503148943, "train/perplexity": 8.986373259925449, "train/grad_norm": 0.1748046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1909469.612696753, "perf/iters_per_sec": 0.9105060637935415, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0982903242111206, "data/tokens_consumed": 50333745152, "data/tokens_consumed_B": 50.333745152, "train/loss_slope": 1.2715423282402743e-06} {"step": 24000, "timestamp": 1778220473.8788886, "geo/rankme_last": 441.2228088378906, "geo/layer_0/stable_rank_q_proj": 17.775861740112305, "geo/layer_0/stable_rank_k_proj": 15.607771873474121, "geo/layer_0/stable_rank_o_proj": 51.02968215942383, "geo/layer_0/stable_rank_gate_proj": 148.0744171142578, "geo/layer_0/stable_rank_down_proj": 50.51416778564453, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.04857851564884186, "geo/layer_0/attn_entropy_mean": 6.250118255615234, "geo/layer_0/attn_entropy_std": 0.3203141987323761, "geo/layer_7/stable_rank_q_proj": 42.549842834472656, "geo/layer_7/stable_rank_k_proj": 41.900089263916016, "geo/layer_7/stable_rank_o_proj": 108.69209289550781, "geo/layer_7/stable_rank_gate_proj": 101.05172729492188, "geo/layer_7/stable_rank_down_proj": 148.8954315185547, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5404233336448669, "geo/layer_7/attn_entropy_mean": 4.64527702331543, "geo/layer_7/attn_entropy_std": 0.8182384371757507, "geo/layer_14/stable_rank_q_proj": 57.64852523803711, "geo/layer_14/stable_rank_k_proj": 35.075706481933594, "geo/layer_14/stable_rank_o_proj": 54.145389556884766, "geo/layer_14/stable_rank_gate_proj": 84.36571502685547, "geo/layer_14/stable_rank_down_proj": 135.8884735107422, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38435330986976624, "geo/layer_14/attn_entropy_mean": 5.478832244873047, "geo/layer_14/attn_entropy_std": 0.447948157787323, "geo/layer_21/stable_rank_q_proj": 46.50336837768555, "geo/layer_21/stable_rank_k_proj": 31.536659240722656, "geo/layer_21/stable_rank_o_proj": 81.81928253173828, "geo/layer_21/stable_rank_gate_proj": 83.51516723632812, "geo/layer_21/stable_rank_down_proj": 59.400665283203125, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1523074060678482, "geo/layer_21/attn_entropy_mean": 5.730376243591309, "geo/layer_21/attn_entropy_std": 0.2916640043258667, "geo/layer_27/stable_rank_q_proj": 41.467201232910156, "geo/layer_27/stable_rank_k_proj": 31.273361206054688, "geo/layer_27/stable_rank_o_proj": 119.5654296875, "geo/layer_27/stable_rank_gate_proj": 90.44992065429688, "geo/layer_27/stable_rank_down_proj": 136.37937927246094, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07563700526952744, "geo/layer_27/attn_entropy_mean": 4.363203048706055, "geo/layer_27/attn_entropy_std": 0.586670994758606, "attnres/final_alpha/block_0": 0.24182796478271484, "attnres/block_norm/0": 1.6461063623428345, "attnres/final_alpha/block_1": 0.006560548674315214, "attnres/block_norm/1": 31765.4453125, "attnres/final_alpha/block_2": 0.013121810741722584, "attnres/block_norm/2": 22367.88671875, "attnres/final_alpha/block_3": 0.015143442898988724, "attnres/block_norm/3": 33535.0546875, "attnres/final_alpha/block_4": 0.019455477595329285, "attnres/block_norm/4": 10090.9130859375, "attnres/final_alpha/block_5": 0.5738312602043152, "attnres/block_norm/5": 5264.88525390625, "attnres/final_alpha/block_6": 0.13005948066711426, "attnres/block_norm/6": 22278.67578125, "geo/tier1_time_s": 1.3610785007476807, "geo/step": 24000.0, "geo/rankme_slope": -5.5976785245348136e-05} {"step": 24000, "timestamp": 1778220480.687969, "geo/ww_alpha_mean": 8.144293148771608, "geo/ww_alpha_std": 5.147517894151293, "geo/ww_alpha_min": 1.3618311161335346, "geo/ww_alpha_max": 36.8207819354572, "geo/ww_alpha_healthy_frac": 0.17258883248730963, "geo/ww_alpha_by_type/q_proj": 4.138931511695578, "geo/ww_alpha_by_type/k_proj": 4.660028936792502, "geo/ww_alpha_by_type/v_proj": 9.129591078528156, "geo/ww_alpha_by_type/o_proj": 7.799923794974279, "geo/ww_alpha_by_type/gate_proj": 8.541046680983609, "geo/ww_alpha_by_type/up_proj": 13.687609245915255, "geo/ww_alpha_by_type/down_proj": 9.18216040812801, "geo/twonn_id/layer_0": 0.7202767133712769, "geo/twonn_id/layer_7": 3.4298348426818848, "geo/twonn_id/layer_14": 3.834660530090332, "geo/twonn_id/layer_21": 8.125696182250977, "geo/twonn_id/layer_27": 5.210573673248291, "geo/tier2_time_s": 6.80312967300415} {"step": 24000, "timestamp": 1778220481.3849428, "eoc/jacobian_sigma/layer_0/attn": 972.4030151367188, "eoc/jacobian_sigma/layer_0/mlp": 6372.22314453125, "eoc/jacobian_sigma/layer_0": 6372.22314453125, "eoc/jacobian_sigma/layer_7/attn": 1.1553229093551636, "eoc/jacobian_sigma/layer_7/mlp": 1.671558141708374, "eoc/jacobian_sigma/layer_7": 1.671558141708374, "eoc/jacobian_sigma/layer_14/attn": 1.6516082286834717, "eoc/jacobian_sigma/layer_14/mlp": 6.385504722595215, "eoc/jacobian_sigma/layer_14": 6.385504722595215, "eoc/jacobian_sigma/layer_21/attn": 1.0791219472885132, "eoc/jacobian_sigma/layer_21/mlp": 3.7518489360809326, "eoc/jacobian_sigma/layer_21": 3.7518489360809326, "eoc/jacobian_sigma/layer_27/attn": 3.4607884883880615, "eoc/jacobian_sigma/layer_27/mlp": 25.318147659301758, "eoc/jacobian_sigma/layer_27": 25.318147659301758, "eoc/layer0_sigma": 6372.22314453125, "eoc/sigma_max": 25.318147659301758, "eoc/sigma_min": 1.671558141708374, "eoc/sigma_mean": 9.28176486492157, "eoc/time_s": 0.6901381015777588} {"step": 24010, "timestamp": 1778220492.1004188, "train/loss": 2.22973837852478, "train/z_loss": 0.0015092020737938582, "train/perplexity": 9.297433353041141, "train/grad_norm": 0.1015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1070917.0840999119, "perf/iters_per_sec": 0.510653059053379, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.9582767248153687, "data/tokens_consumed": 50354716672, "data/tokens_consumed_B": 50.354716672, "train/loss_slope": 1.279059144565763e-06} {"step": 24020, "timestamp": 1778220502.4486823, "train/loss": 2.1598955154418946, "train/z_loss": 0.0015319076366722584, "train/perplexity": 8.670231705806795, "train/grad_norm": 0.15625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027796.288994063, "perf/iters_per_sec": 0.9669286198587718, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03420250415802, "data/tokens_consumed": 50375688192, "data/tokens_consumed_B": 50.375688192, "train/loss_slope": -2.3990881110396244e-06} {"step": 24030, "timestamp": 1778220512.796493, "train/loss": 2.2236268520355225, "train/z_loss": 0.0015189357567578555, "train/perplexity": 9.240785122723077, "train/grad_norm": 0.1025390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027628.1987468626, "perf/iters_per_sec": 0.9668484681829751, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034288239479065, "data/tokens_consumed": 50396659712, "data/tokens_consumed_B": 50.396659712, "train/loss_slope": -1.3158819606058545e-06} {"step": 24040, "timestamp": 1778220523.1485825, "train/loss": 2.219339895248413, "train/z_loss": 0.001512894849292934, "train/perplexity": 9.201255068580203, "train/grad_norm": 0.177734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026703.920100232, "perf/iters_per_sec": 0.9664077377797279, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347599267959595, "data/tokens_consumed": 50417631232, "data/tokens_consumed_B": 50.417631232, "train/loss_slope": -1.2122043574711056e-06} {"step": 24050, "timestamp": 1778220533.4988196, "grad/layer_0/attn": 0.002977923257276416, "grad/layer_0/mlp": 0.002778148278594017, "grad/layer_0/attn_mlp_ratio": 1.0719093624450333, "grad/layer_4/attn": 0.0015610690461471677, "grad/layer_4/mlp": 0.0024903533048927784, "grad/layer_4/attn_mlp_ratio": 0.6268463917932882, "grad/layer_8/attn": 0.0042524999007582664, "grad/layer_8/mlp": 0.0036941831931471825, "grad/layer_8/attn_mlp_ratio": 1.1511339755790617, "grad/layer_12/attn": 0.004569590091705322, "grad/layer_12/mlp": 0.006847594864666462, "grad/layer_12/attn_mlp_ratio": 0.6673277428475798, "grad/layer_16/attn": 0.003898234572261572, "grad/layer_16/mlp": 0.004294532351195812, "grad/layer_16/attn_mlp_ratio": 0.9077203668995699, "grad/layer_20/attn": 0.003498463425785303, "grad/layer_20/mlp": 0.005895093083381653, "grad/layer_20/attn_mlp_ratio": 0.5934534564521421, "grad/layer_24/attn": 0.005165155045688152, "grad/layer_24/mlp": 0.009190583601593971, "grad/layer_24/attn_mlp_ratio": 0.5620051145165386, "grad/layer_27/attn": 0.00772200059145689, "grad/layer_27/mlp": 0.007991034537553787, "grad/layer_27/attn_mlp_ratio": 0.966333014647118} {"step": 24050, "timestamp": 1778220533.5149362, "train/loss": 2.2033066272735597, "train/z_loss": 0.001527008949778974, "train/perplexity": 9.054905248415992, "train/grad_norm": 0.10205078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024309.598218608, "perf/iters_per_sec": 0.9652660361378708, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0359838247299193, "data/tokens_consumed": 50438602752, "data/tokens_consumed_B": 50.438602752, "train/loss_slope": -1.424414103645608e-06} {"step": 24060, "timestamp": 1778220543.866222, "train/loss": 2.2365689992904665, "train/z_loss": 0.0015282864100299776, "train/perplexity": 9.361157986002093, "train/grad_norm": 0.10791015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026929.1189986763, "perf/iters_per_sec": 0.9665151209824926, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346449613571167, "data/tokens_consumed": 50459574272, "data/tokens_consumed_B": 50.459574272, "train/loss_slope": -1.5516844591220004e-06} {"step": 24070, "timestamp": 1778220554.2190168, "train/loss": 2.235807704925537, "train/z_loss": 0.0015161324408836663, "train/perplexity": 9.354034101208875, "train/grad_norm": 0.177734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026684.2140752191, "perf/iters_per_sec": 0.9663983412147613, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347699880599976, "data/tokens_consumed": 50480545792, "data/tokens_consumed_B": 50.480545792, "train/loss_slope": -1.654634238217009e-08} {"step": 24075, "timestamp": 1778220560.021915, "eos/sharpness": 15.284967422485348, "eos/L0_probe": 2.0406837463378906, "eos/L_plus": 2.103090763092041, "eos/L_minus": 2.1311264038085938, "eos/grad_norm": 0.15118584036827087, "eos/embed_grad_frac": 0.11153388768434525, "eos/time_s": 0.6330692768096924} {"step": 24075, "timestamp": 1778220561.4016745, "geo/rankme_last": 440.3885803222656, "geo/layer_0/stable_rank_q_proj": 17.796045303344727, "geo/layer_0/stable_rank_k_proj": 15.602264404296875, "geo/layer_0/stable_rank_o_proj": 51.210289001464844, "geo/layer_0/stable_rank_gate_proj": 148.32357788085938, "geo/layer_0/stable_rank_down_proj": 50.49943923950195, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05170942097902298, "geo/layer_0/attn_entropy_mean": 6.249770164489746, "geo/layer_0/attn_entropy_std": 0.31729578971862793, "geo/layer_7/stable_rank_q_proj": 42.56393051147461, "geo/layer_7/stable_rank_k_proj": 41.878787994384766, "geo/layer_7/stable_rank_o_proj": 108.74662780761719, "geo/layer_7/stable_rank_gate_proj": 101.20282745361328, "geo/layer_7/stable_rank_down_proj": 148.5896453857422, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5501081943511963, "geo/layer_7/attn_entropy_mean": 4.667509078979492, "geo/layer_7/attn_entropy_std": 0.8353026509284973, "geo/layer_14/stable_rank_q_proj": 57.658267974853516, "geo/layer_14/stable_rank_k_proj": 34.96079635620117, "geo/layer_14/stable_rank_o_proj": 54.063011169433594, "geo/layer_14/stable_rank_gate_proj": 84.47473907470703, "geo/layer_14/stable_rank_down_proj": 135.457275390625, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37905627489089966, "geo/layer_14/attn_entropy_mean": 5.484605312347412, "geo/layer_14/attn_entropy_std": 0.45177504420280457, "geo/layer_21/stable_rank_q_proj": 46.484375, "geo/layer_21/stable_rank_k_proj": 31.531585693359375, "geo/layer_21/stable_rank_o_proj": 81.80421447753906, "geo/layer_21/stable_rank_gate_proj": 83.54828643798828, "geo/layer_21/stable_rank_down_proj": 59.36398696899414, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1536456197500229, "geo/layer_21/attn_entropy_mean": 5.723591327667236, "geo/layer_21/attn_entropy_std": 0.28180161118507385, "geo/layer_27/stable_rank_q_proj": 41.413543701171875, "geo/layer_27/stable_rank_k_proj": 31.237770080566406, "geo/layer_27/stable_rank_o_proj": 119.33756256103516, "geo/layer_27/stable_rank_gate_proj": 90.40108489990234, "geo/layer_27/stable_rank_down_proj": 136.25831604003906, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07676780968904495, "geo/layer_27/attn_entropy_mean": 4.355188846588135, "geo/layer_27/attn_entropy_std": 0.5524592399597168, "attnres/final_alpha/block_0": 0.2401263415813446, "attnres/block_norm/0": 1.6464190483093262, "attnres/final_alpha/block_1": 0.006460517644882202, "attnres/block_norm/1": 31701.61328125, "attnres/final_alpha/block_2": 0.013218093663454056, "attnres/block_norm/2": 22392.13671875, "attnres/final_alpha/block_3": 0.01526271365582943, "attnres/block_norm/3": 33816.140625, "attnres/final_alpha/block_4": 0.019612571224570274, "attnres/block_norm/4": 10145.3486328125, "attnres/final_alpha/block_5": 0.5752149820327759, "attnres/block_norm/5": 5263.3818359375, "attnres/final_alpha/block_6": 0.13010476529598236, "attnres/block_norm/6": 22448.6015625, "geo/tier1_time_s": 1.3595671653747559, "geo/step": 24075.0, "geo/rankme_slope": -9.12701799469788e-05} {"step": 24080, "timestamp": 1778220566.58487, "train/loss": 2.2196136713027954, "train/z_loss": 0.0015361024765297771, "train/perplexity": 9.203774496752061, "train/grad_norm": 0.1318359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1696672.372308478, "perf/iters_per_sec": 0.8090364324133291, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2360382795333862, "data/tokens_consumed": 50501517312, "data/tokens_consumed_B": 50.501517312, "train/loss_slope": 2.1302735809089566e-07} {"step": 24090, "timestamp": 1778220576.933772, "train/loss": 2.2525145292282103, "train/z_loss": 0.0015137740992940962, "train/perplexity": 9.511623045384942, "train/grad_norm": 0.126953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027962.957699667, "perf/iters_per_sec": 0.9670080936907134, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341175079345704, "data/tokens_consumed": 50522488832, "data/tokens_consumed_B": 50.522488832, "train/loss_slope": 3.9281249868951964e-06} {"step": 24100, "timestamp": 1778220587.274195, "grad/layer_0/attn": 0.003185691311955452, "grad/layer_0/mlp": 0.0029007107950747013, "grad/layer_0/attn_mlp_ratio": 1.0982450258537093, "grad/layer_4/attn": 0.001717529259622097, "grad/layer_4/mlp": 0.0024130111560225487, "grad/layer_4/attn_mlp_ratio": 0.7117783869989736, "grad/layer_8/attn": 0.007128182332962751, "grad/layer_8/mlp": 0.0038035474717617035, "grad/layer_8/attn_mlp_ratio": 1.8740878609968756, "grad/layer_12/attn": 0.00421473104506731, "grad/layer_12/mlp": 0.00635383790358901, "grad/layer_12/attn_mlp_ratio": 0.6633362453821753, "grad/layer_16/attn": 0.004882806446403265, "grad/layer_16/mlp": 0.004704353865236044, "grad/layer_16/attn_mlp_ratio": 1.0379334723717515, "grad/layer_20/attn": 0.004107377957552671, "grad/layer_20/mlp": 0.006451806053519249, "grad/layer_20/attn_mlp_ratio": 0.636624514100169, "grad/layer_24/attn": 0.0167120061814785, "grad/layer_24/mlp": 0.011407465673983097, "grad/layer_24/attn_mlp_ratio": 1.4650060331185408, "grad/layer_27/attn": 0.010549962520599365, "grad/layer_27/mlp": 0.0094636594876647, "grad/layer_27/attn_mlp_ratio": 1.114786771742149} {"step": 24100, "timestamp": 1778220587.2900023, "train/loss": 2.211583685874939, "train/z_loss": 0.0015167606063187123, "train/perplexity": 9.130164261755041, "train/grad_norm": 0.2119140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025936.5555536516, "perf/iters_per_sec": 0.9660418298500307, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351518630981444, "data/tokens_consumed": 50543460352, "data/tokens_consumed_B": 50.543460352, "train/loss_slope": 1.9393695641880304e-06} {"step": 24110, "timestamp": 1778220597.6408274, "train/loss": 2.1827563285827636, "train/z_loss": 0.0015313527546823025, "train/perplexity": 8.87072322064002, "train/grad_norm": 0.138671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027149.4625755432, "perf/iters_per_sec": 0.9666201889875141, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345324993133544, "data/tokens_consumed": 50564431872, "data/tokens_consumed_B": 50.564431872, "train/loss_slope": -1.1501743240539476e-06} {"step": 24120, "timestamp": 1778220608.6039917, "train/loss": 2.223543119430542, "train/z_loss": 0.0015291152289137245, "train/perplexity": 9.240011400106045, "train/grad_norm": 0.1787109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1914130.1114242177, "perf/iters_per_sec": 0.912728362762555, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0956162214279175, "data/tokens_consumed": 50585403392, "data/tokens_consumed_B": 50.585403392, "train/loss_slope": 1.6014160424640944e-06} {"step": 24130, "timestamp": 1778220618.966981, "train/loss": 2.151990461349487, "train/z_loss": 0.0015414796536788345, "train/perplexity": 8.601963243628813, "train/grad_norm": 0.1982421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024624.0622248137, "perf/iters_per_sec": 0.9654159842609471, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0358229160308838, "data/tokens_consumed": 50606374912, "data/tokens_consumed_B": 50.606374912, "train/loss_slope": -3.487559742588475e-06} {"step": 24140, "timestamp": 1778220629.326595, "train/loss": 2.1694415807724, "train/z_loss": 0.0015163238276727498, "train/perplexity": 8.753394611834315, "train/grad_norm": 0.2314453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025394.67658593, "perf/iters_per_sec": 0.9657834418229723, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035428810119629, "data/tokens_consumed": 50627346432, "data/tokens_consumed_B": 50.627346432, "train/loss_slope": -4.033663327937855e-06} {"step": 24150, "timestamp": 1778220639.6675384, "grad/layer_0/attn": 0.0028322620783001184, "grad/layer_0/mlp": 0.002685996936634183, "grad/layer_0/attn_mlp_ratio": 1.0544546548901705, "grad/layer_4/attn": 0.002198223490267992, "grad/layer_4/mlp": 0.0026254914700984955, "grad/layer_4/attn_mlp_ratio": 0.8372616828419394, "grad/layer_8/attn": 0.006775679998099804, "grad/layer_8/mlp": 0.004097936674952507, "grad/layer_8/attn_mlp_ratio": 1.6534369294114675, "grad/layer_12/attn": 0.0045520830899477005, "grad/layer_12/mlp": 0.006348786875605583, "grad/layer_12/attn_mlp_ratio": 0.7170004455084898, "grad/layer_16/attn": 0.005820348393172026, "grad/layer_16/mlp": 0.004512665327638388, "grad/layer_16/attn_mlp_ratio": 1.28978061558133, "grad/layer_20/attn": 0.005717883352190256, "grad/layer_20/mlp": 0.005251405760645866, "grad/layer_20/attn_mlp_ratio": 1.0888290686195445, "grad/layer_24/attn": 0.006164956372231245, "grad/layer_24/mlp": 0.007636116351932287, "grad/layer_24/attn_mlp_ratio": 0.8073418485742486, "grad/layer_27/attn": 0.006283256225287914, "grad/layer_27/mlp": 0.006941250059753656, "grad/layer_27/attn_mlp_ratio": 0.9052052700418604} {"step": 24150, "timestamp": 1778220640.280872, "eos/sharpness": 30.16142845153808, "eos/L0_probe": 2.037405014038086, "eos/L_plus": 2.2307629585266113, "eos/L_minus": 2.1456613540649414, "eos/grad_norm": 0.11630397289991379, "eos/embed_grad_frac": 0.1845080703496933, "eos/time_s": 0.610623836517334} {"step": 24150, "timestamp": 1778220640.300832, "train/loss": 2.243410086631775, "train/z_loss": 0.0015157630201429128, "train/perplexity": 9.425418039101466, "train/grad_norm": 0.1162109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1911882.664638939, "perf/iters_per_sec": 0.91165669662425, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0969041347503663, "data/tokens_consumed": 50648317952, "data/tokens_consumed_B": 50.648317952, "train/loss_slope": -3.2112344406475756e-06} {"step": 24150, "timestamp": 1778220641.6631768, "geo/rankme_last": 440.4652099609375, "geo/layer_0/stable_rank_q_proj": 17.803171157836914, "geo/layer_0/stable_rank_k_proj": 15.634015083312988, "geo/layer_0/stable_rank_o_proj": 51.0662956237793, "geo/layer_0/stable_rank_gate_proj": 148.1854248046875, "geo/layer_0/stable_rank_down_proj": 50.628562927246094, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05098670348525047, "geo/layer_0/attn_entropy_mean": 6.243452548980713, "geo/layer_0/attn_entropy_std": 0.32157284021377563, "geo/layer_7/stable_rank_q_proj": 42.46746826171875, "geo/layer_7/stable_rank_k_proj": 41.76225280761719, "geo/layer_7/stable_rank_o_proj": 108.70378875732422, "geo/layer_7/stable_rank_gate_proj": 100.86748504638672, "geo/layer_7/stable_rank_down_proj": 148.24050903320312, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5387925505638123, "geo/layer_7/attn_entropy_mean": 4.648856163024902, "geo/layer_7/attn_entropy_std": 0.8402032256126404, "geo/layer_14/stable_rank_q_proj": 57.69091033935547, "geo/layer_14/stable_rank_k_proj": 35.00867462158203, "geo/layer_14/stable_rank_o_proj": 53.978172302246094, "geo/layer_14/stable_rank_gate_proj": 84.50348663330078, "geo/layer_14/stable_rank_down_proj": 135.75575256347656, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38808301091194153, "geo/layer_14/attn_entropy_mean": 5.483607292175293, "geo/layer_14/attn_entropy_std": 0.45050135254859924, "geo/layer_21/stable_rank_q_proj": 46.3572883605957, "geo/layer_21/stable_rank_k_proj": 31.649904251098633, "geo/layer_21/stable_rank_o_proj": 81.81387329101562, "geo/layer_21/stable_rank_gate_proj": 83.40868377685547, "geo/layer_21/stable_rank_down_proj": 59.36910629272461, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14842787384986877, "geo/layer_21/attn_entropy_mean": 5.746737957000732, "geo/layer_21/attn_entropy_std": 0.2854815125465393, "geo/layer_27/stable_rank_q_proj": 41.406593322753906, "geo/layer_27/stable_rank_k_proj": 31.26193618774414, "geo/layer_27/stable_rank_o_proj": 119.60543823242188, "geo/layer_27/stable_rank_gate_proj": 90.3588638305664, "geo/layer_27/stable_rank_down_proj": 135.8631591796875, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08794823288917542, "geo/layer_27/attn_entropy_mean": 4.364001274108887, "geo/layer_27/attn_entropy_std": 0.5599791407585144, "attnres/final_alpha/block_0": 0.24127697944641113, "attnres/block_norm/0": 1.6469595432281494, "attnres/final_alpha/block_1": 0.0064840116538107395, "attnres/block_norm/1": 31888.82421875, "attnres/final_alpha/block_2": 0.013236887753009796, "attnres/block_norm/2": 22478.5703125, "attnres/final_alpha/block_3": 0.015275958925485611, "attnres/block_norm/3": 33500.3203125, "attnres/final_alpha/block_4": 0.019469160586595535, "attnres/block_norm/4": 10170.923828125, "attnres/final_alpha/block_5": 0.5737978219985962, "attnres/block_norm/5": 5278.630859375, "attnres/final_alpha/block_6": 0.13045915961265564, "attnres/block_norm/6": 22535.5859375, "geo/tier1_time_s": 1.358290433883667, "geo/step": 24150.0, "geo/rankme_slope": -8.243240655637255e-05} {"step": 24160, "timestamp": 1778220652.014551, "train/loss": 2.2199815273284913, "train/z_loss": 0.0015142648597247898, "train/perplexity": 9.207160783454636, "train/grad_norm": 0.1435546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1790932.727733542, "perf/iters_per_sec": 0.8539832724254331, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.170983123779297, "data/tokens_consumed": 50669289472, "data/tokens_consumed_B": 50.669289472, "train/loss_slope": -2.6354157313047123e-06} {"step": 24170, "timestamp": 1778220662.3658319, "train/loss": 2.236304259300232, "train/z_loss": 0.0015050033223815262, "train/perplexity": 9.358680041148316, "train/grad_norm": 0.197265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027049.6317042327, "perf/iters_per_sec": 0.9665725859185375, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345834493637085, "data/tokens_consumed": 50690260992, "data/tokens_consumed_B": 50.690260992, "train/loss_slope": -1.778184441235397e-06} {"step": 24180, "timestamp": 1778220672.7184734, "train/loss": 2.2024935483932495, "train/z_loss": 0.0015222450834698974, "train/perplexity": 9.047545888470804, "train/grad_norm": 0.1474609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026803.9504304666, "perf/iters_per_sec": 0.9664554359581311, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034708857536316, "data/tokens_consumed": 50711232512, "data/tokens_consumed_B": 50.711232512, "train/loss_slope": -4.4507624042166425e-06} {"step": 24190, "timestamp": 1778220683.0739224, "train/loss": 2.181686985492706, "train/z_loss": 0.0015183379873633385, "train/perplexity": 8.86124244406511, "train/grad_norm": 0.1484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026063.8103863879, "perf/iters_per_sec": 0.9661025096828403, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350868463516236, "data/tokens_consumed": 50732204032, "data/tokens_consumed_B": 50.732204032, "train/loss_slope": -4.583868378102153e-06} {"step": 24200, "timestamp": 1778220693.4167364, "grad/layer_0/attn": 0.003037170972675085, "grad/layer_0/mlp": 0.0028715094085782766, "grad/layer_0/attn_mlp_ratio": 1.0576914210459396, "grad/layer_4/attn": 0.0020359710324555635, "grad/layer_4/mlp": 0.0024526179768145084, "grad/layer_4/attn_mlp_ratio": 0.8301215145164015, "grad/layer_8/attn": 0.004635788034647703, "grad/layer_8/mlp": 0.003709010547026992, "grad/layer_8/attn_mlp_ratio": 1.2498718595924179, "grad/layer_12/attn": 0.004823646973818541, "grad/layer_12/mlp": 0.006745733320713043, "grad/layer_12/attn_mlp_ratio": 0.7150663497919056, "grad/layer_16/attn": 0.005227524787187576, "grad/layer_16/mlp": 0.004716852679848671, "grad/layer_16/attn_mlp_ratio": 1.1082654115304589, "grad/layer_20/attn": 0.0045892237685620785, "grad/layer_20/mlp": 0.007476759608834982, "grad/layer_20/attn_mlp_ratio": 0.6137984832037842, "grad/layer_24/attn": 0.024357466027140617, "grad/layer_24/mlp": 0.013480191119015217, "grad/layer_24/attn_mlp_ratio": 1.8069080498488679, "grad/layer_27/attn": 0.014290401712059975, "grad/layer_27/mlp": 0.012581395916640759, "grad/layer_27/attn_mlp_ratio": 1.1358359353094682} {"step": 24200, "timestamp": 1778220693.432518, "train/loss": 2.179974913597107, "train/z_loss": 0.0015170460450462998, "train/perplexity": 8.84608433950155, "train/grad_norm": 0.259765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025656.5298954586, "perf/iters_per_sec": 0.965908303210954, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352949619293212, "data/tokens_consumed": 50753175552, "data/tokens_consumed_B": 50.753175552, "train/loss_slope": -6.0683988072726995e-06} {"step": 24210, "timestamp": 1778220703.7833252, "train/loss": 2.2273823261260985, "train/z_loss": 0.0015096073155291378, "train/perplexity": 9.275553897580924, "train/grad_norm": 0.1630859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027094.1502931074, "perf/iters_per_sec": 0.9665938140359437, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345607280731202, "data/tokens_consumed": 50774147072, "data/tokens_consumed_B": 50.774147072, "train/loss_slope": -5.674496578304684e-06} {"step": 24220, "timestamp": 1778220714.1621163, "train/loss": 2.211945366859436, "train/z_loss": 0.0015150072635151447, "train/perplexity": 9.133467065798557, "train/grad_norm": 0.1064453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021554.9864054392, "perf/iters_per_sec": 0.9639525348689266, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373954772949219, "data/tokens_consumed": 50795118592, "data/tokens_consumed_B": 50.795118592, "train/loss_slope": -4.90486383652708e-06} {"step": 24225, "timestamp": 1778220719.9466627, "eos/sharpness": 21.11895084381103, "eos/L0_probe": 2.037698984146118, "eos/L_plus": 2.158806324005127, "eos/L_minus": 2.1277811527252197, "eos/grad_norm": 0.12062405049800873, "eos/embed_grad_frac": 0.16694018244743347, "eos/time_s": 0.6075856685638428} {"step": 24225, "timestamp": 1778220721.3232071, "geo/rankme_last": 440.1120910644531, "geo/layer_0/stable_rank_q_proj": 17.780532836914062, "geo/layer_0/stable_rank_k_proj": 15.644976615905762, "geo/layer_0/stable_rank_o_proj": 51.073612213134766, "geo/layer_0/stable_rank_gate_proj": 148.74337768554688, "geo/layer_0/stable_rank_down_proj": 50.60945510864258, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05502117797732353, "geo/layer_0/attn_entropy_mean": 6.244683265686035, "geo/layer_0/attn_entropy_std": 0.3261226415634155, "geo/layer_7/stable_rank_q_proj": 42.37627410888672, "geo/layer_7/stable_rank_k_proj": 41.53312301635742, "geo/layer_7/stable_rank_o_proj": 108.6683578491211, "geo/layer_7/stable_rank_gate_proj": 100.84086608886719, "geo/layer_7/stable_rank_down_proj": 147.92782592773438, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5560310482978821, "geo/layer_7/attn_entropy_mean": 4.663080215454102, "geo/layer_7/attn_entropy_std": 0.836478590965271, "geo/layer_14/stable_rank_q_proj": 57.594139099121094, "geo/layer_14/stable_rank_k_proj": 35.11420440673828, "geo/layer_14/stable_rank_o_proj": 54.05048751831055, "geo/layer_14/stable_rank_gate_proj": 84.51666259765625, "geo/layer_14/stable_rank_down_proj": 135.4235382080078, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37543541193008423, "geo/layer_14/attn_entropy_mean": 5.5162482261657715, "geo/layer_14/attn_entropy_std": 0.42493587732315063, "geo/layer_21/stable_rank_q_proj": 46.16529083251953, "geo/layer_21/stable_rank_k_proj": 31.61960792541504, "geo/layer_21/stable_rank_o_proj": 81.62711334228516, "geo/layer_21/stable_rank_gate_proj": 83.40811920166016, "geo/layer_21/stable_rank_down_proj": 59.28931427001953, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14891284704208374, "geo/layer_21/attn_entropy_mean": 5.73658561706543, "geo/layer_21/attn_entropy_std": 0.2906355559825897, "geo/layer_27/stable_rank_q_proj": 41.41156005859375, "geo/layer_27/stable_rank_k_proj": 31.31519889831543, "geo/layer_27/stable_rank_o_proj": 119.55233001708984, "geo/layer_27/stable_rank_gate_proj": 90.36872863769531, "geo/layer_27/stable_rank_down_proj": 136.04037475585938, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07949192821979523, "geo/layer_27/attn_entropy_mean": 4.356164932250977, "geo/layer_27/attn_entropy_std": 0.568019449710846, "attnres/final_alpha/block_0": 0.24195563793182373, "attnres/block_norm/0": 1.6475310325622559, "attnres/final_alpha/block_1": 0.006399191915988922, "attnres/block_norm/1": 31904.41796875, "attnres/final_alpha/block_2": 0.013173675164580345, "attnres/block_norm/2": 22514.783203125, "attnres/final_alpha/block_3": 0.015216134488582611, "attnres/block_norm/3": 33750.7734375, "attnres/final_alpha/block_4": 0.019483258947730064, "attnres/block_norm/4": 10149.8369140625, "attnres/final_alpha/block_5": 0.5756626725196838, "attnres/block_norm/5": 5220.33740234375, "attnres/final_alpha/block_6": 0.1281094253063202, "attnres/block_norm/6": 22558.3125, "geo/tier1_time_s": 1.3563282489776611, "geo/step": 24225.0, "geo/rankme_slope": -0.00010219304518682473} {"step": 24230, "timestamp": 1778220726.5173807, "train/loss": 2.2210567712783815, "train/z_loss": 0.001522586343344301, "train/perplexity": 9.217066051718268, "train/grad_norm": 0.1337890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1698176.3644088856, "perf/iters_per_sec": 0.8097535917324474, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2349435806274414, "data/tokens_consumed": 50816090112, "data/tokens_consumed_B": 50.816090112, "train/loss_slope": -4.712512204856649e-06} {"step": 24240, "timestamp": 1778220737.4111805, "train/loss": 2.169839644432068, "train/z_loss": 0.0015405758982524275, "train/perplexity": 8.756879713728207, "train/grad_norm": 0.1181640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1925976.97076323, "perf/iters_per_sec": 0.9183773855034018, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.088876986503601, "data/tokens_consumed": 50837061632, "data/tokens_consumed_B": 50.837061632, "train/loss_slope": -7.799655197262596e-06} {"step": 24250, "timestamp": 1778220747.7894526, "grad/layer_0/attn": 0.002949673915281892, "grad/layer_0/mlp": 0.002655200194567442, "grad/layer_0/attn_mlp_ratio": 1.1109044848017473, "grad/layer_4/attn": 0.001846199156716466, "grad/layer_4/mlp": 0.002444856334477663, "grad/layer_4/attn_mlp_ratio": 0.7551360197192525, "grad/layer_8/attn": 0.00992075726389885, "grad/layer_8/mlp": 0.0034051216207444668, "grad/layer_8/attn_mlp_ratio": 2.91348094943574, "grad/layer_12/attn": 0.005819832440465689, "grad/layer_12/mlp": 0.005992877297103405, "grad/layer_12/attn_mlp_ratio": 0.9711248962441055, "grad/layer_16/attn": 0.005286352708935738, "grad/layer_16/mlp": 0.004404149483889341, "grad/layer_16/attn_mlp_ratio": 1.2003117987348912, "grad/layer_20/attn": 0.004188602324575186, "grad/layer_20/mlp": 0.006258866749703884, "grad/layer_20/attn_mlp_ratio": 0.6692269423774936, "grad/layer_24/attn": 0.014866377227008343, "grad/layer_24/mlp": 0.010942095890641212, "grad/layer_24/attn_mlp_ratio": 1.3586407247499541, "grad/layer_27/attn": 0.011030642315745354, "grad/layer_27/mlp": 0.01047714613378048, "grad/layer_27/attn_mlp_ratio": 1.0528288972602375} {"step": 24250, "timestamp": 1778220747.805571, "train/loss": 2.158050560951233, "train/z_loss": 0.0015213998733088375, "train/perplexity": 8.654250269930243, "train/grad_norm": 0.1923828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019077.4222631278, "perf/iters_per_sec": 0.9627711402240409, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.038668441772461, "data/tokens_consumed": 50858033152, "data/tokens_consumed_B": 50.858033152, "train/loss_slope": -1.2028631461550108e-05} {"step": 24260, "timestamp": 1778220758.1634107, "train/loss": 2.209696912765503, "train/z_loss": 0.0015074806753546, "train/perplexity": 9.112953954419368, "train/grad_norm": 0.2373046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026194.861772518, "perf/iters_per_sec": 0.9661649998533811, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350198984146117, "data/tokens_consumed": 50879004672, "data/tokens_consumed_B": 50.879004672, "train/loss_slope": -1.3218055156269429e-05} {"step": 24270, "timestamp": 1778220768.5171602, "train/loss": 2.2276726722717286, "train/z_loss": 0.0015138182905502617, "train/perplexity": 9.27824740991021, "train/grad_norm": 0.1591796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026557.6283462557, "perf/iters_per_sec": 0.9663379804354933, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034834623336792, "data/tokens_consumed": 50899976192, "data/tokens_consumed_B": 50.899976192, "train/loss_slope": -1.1084119843678398e-05} {"step": 24280, "timestamp": 1778220778.871863, "train/loss": 2.253322219848633, "train/z_loss": 0.0015018522390164435, "train/perplexity": 9.519308597460043, "train/grad_norm": 0.1484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026931.1274263463, "perf/iters_per_sec": 0.9665160786754352, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346439361572266, "data/tokens_consumed": 50920947712, "data/tokens_consumed_B": 50.920947712, "train/loss_slope": -7.2732009366937125e-06} {"step": 24290, "timestamp": 1778220789.6170456, "train/loss": 2.199899959564209, "train/z_loss": 0.001515571109484881, "train/perplexity": 9.024110678310679, "train/grad_norm": 0.1083984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1952635.2921439826, "perf/iters_per_sec": 0.9310890637130654, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0740111112594604, "data/tokens_consumed": 50941919232, "data/tokens_consumed_B": 50.941919232, "train/loss_slope": -4.982099925080427e-06} {"step": 24300, "timestamp": 1778220799.9993181, "grad/layer_0/attn": 0.0032813441939651966, "grad/layer_0/mlp": 0.0027091067750006914, "grad/layer_0/attn_mlp_ratio": 1.211227295698461, "grad/layer_4/attn": 0.004151038359850645, "grad/layer_4/mlp": 0.0025360879953950644, "grad/layer_4/attn_mlp_ratio": 1.636787920493751, "grad/layer_8/attn": 0.0041610123589634895, "grad/layer_8/mlp": 0.003740537678822875, "grad/layer_8/attn_mlp_ratio": 1.1124101947375427, "grad/layer_12/attn": 0.004305514972656965, "grad/layer_12/mlp": 0.005868144799023867, "grad/layer_12/attn_mlp_ratio": 0.7337097237277089, "grad/layer_16/attn": 0.003724593436345458, "grad/layer_16/mlp": 0.00440453365445137, "grad/layer_16/attn_mlp_ratio": 0.8456271750855033, "grad/layer_20/attn": 0.004772945772856474, "grad/layer_20/mlp": 0.005680721253156662, "grad/layer_20/attn_mlp_ratio": 0.8402006499058865, "grad/layer_24/attn": 0.009489789605140686, "grad/layer_24/mlp": 0.010263381525874138, "grad/layer_24/attn_mlp_ratio": 0.9246260103216649, "grad/layer_27/attn": 0.004725341219455004, "grad/layer_27/mlp": 0.010196516290307045, "grad/layer_27/attn_mlp_ratio": 0.4634270214037984} {"step": 24300, "timestamp": 1778220800.6219451, "eos/sharpness": 42.529654502868645, "eos/L0_probe": 2.0373473167419434, "eos/L_plus": 2.291088104248047, "eos/L_minus": 2.2089030742645264, "eos/grad_norm": 0.14937250316143036, "eos/embed_grad_frac": 0.10192801058292389, "eos/time_s": 0.6197202205657959} {"step": 24300, "timestamp": 1778220800.6426466, "train/loss": 2.201925754547119, "train/z_loss": 0.0015306040295399725, "train/perplexity": 9.04241020573525, "train/grad_norm": 0.1494140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1903181.2069221837, "perf/iters_per_sec": 0.9075075182543677, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1019192457199096, "data/tokens_consumed": 50962890752, "data/tokens_consumed_B": 50.962890752, "train/loss_slope": -7.283466111923405e-06} {"step": 24300, "timestamp": 1778220802.024787, "geo/rankme_last": 440.343017578125, "geo/layer_0/stable_rank_q_proj": 17.782939910888672, "geo/layer_0/stable_rank_k_proj": 15.689620971679688, "geo/layer_0/stable_rank_o_proj": 51.159732818603516, "geo/layer_0/stable_rank_gate_proj": 148.07748413085938, "geo/layer_0/stable_rank_down_proj": 50.576412200927734, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.04908452183008194, "geo/layer_0/attn_entropy_mean": 6.244916915893555, "geo/layer_0/attn_entropy_std": 0.33225366473197937, "geo/layer_7/stable_rank_q_proj": 42.28664016723633, "geo/layer_7/stable_rank_k_proj": 41.705562591552734, "geo/layer_7/stable_rank_o_proj": 108.36149597167969, "geo/layer_7/stable_rank_gate_proj": 100.70635223388672, "geo/layer_7/stable_rank_down_proj": 148.06182861328125, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5481296181678772, "geo/layer_7/attn_entropy_mean": 4.6501240730285645, "geo/layer_7/attn_entropy_std": 0.8346228003501892, "geo/layer_14/stable_rank_q_proj": 57.354557037353516, "geo/layer_14/stable_rank_k_proj": 35.05386734008789, "geo/layer_14/stable_rank_o_proj": 54.10364532470703, "geo/layer_14/stable_rank_gate_proj": 84.43999481201172, "geo/layer_14/stable_rank_down_proj": 135.5402374267578, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3735978603363037, "geo/layer_14/attn_entropy_mean": 5.4836015701293945, "geo/layer_14/attn_entropy_std": 0.4524637758731842, "geo/layer_21/stable_rank_q_proj": 46.101837158203125, "geo/layer_21/stable_rank_k_proj": 31.626598358154297, "geo/layer_21/stable_rank_o_proj": 81.62429809570312, "geo/layer_21/stable_rank_gate_proj": 83.59122467041016, "geo/layer_21/stable_rank_down_proj": 59.18539810180664, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14962288737297058, "geo/layer_21/attn_entropy_mean": 5.734999656677246, "geo/layer_21/attn_entropy_std": 0.29933565855026245, "geo/layer_27/stable_rank_q_proj": 41.476558685302734, "geo/layer_27/stable_rank_k_proj": 31.363636016845703, "geo/layer_27/stable_rank_o_proj": 119.86112976074219, "geo/layer_27/stable_rank_gate_proj": 90.52259826660156, "geo/layer_27/stable_rank_down_proj": 136.18116760253906, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07825132459402084, "geo/layer_27/attn_entropy_mean": 4.356699466705322, "geo/layer_27/attn_entropy_std": 0.5750143527984619, "attnres/final_alpha/block_0": 0.23944717645645142, "attnres/block_norm/0": 1.6477781534194946, "attnres/final_alpha/block_1": 0.006384094245731831, "attnres/block_norm/1": 31926.78125, "attnres/final_alpha/block_2": 0.013194035738706589, "attnres/block_norm/2": 22583.1171875, "attnres/final_alpha/block_3": 0.015345976687967777, "attnres/block_norm/3": 33783.7265625, "attnres/final_alpha/block_4": 0.019233424216508865, "attnres/block_norm/4": 10207.4052734375, "attnres/final_alpha/block_5": 0.5784436464309692, "attnres/block_norm/5": 5227.373046875, "attnres/final_alpha/block_6": 0.12795160710811615, "attnres/block_norm/6": 22607.712890625, "geo/tier1_time_s": 1.361753225326538, "geo/step": 24300.0, "geo/rankme_slope": -9.793386104441777e-05} {"step": 24310, "timestamp": 1778220812.9187443, "train/loss": 2.1904916048049925, "train/z_loss": 0.0015247246134094893, "train/perplexity": 8.939606788286603, "train/grad_norm": 0.1181640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1711133.3946868158, "perf/iters_per_sec": 0.8159319852289275, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.225592350959778, "data/tokens_consumed": 50983862272, "data/tokens_consumed_B": 50.983862272, "train/loss_slope": -8.355159856806032e-06} {"step": 24320, "timestamp": 1778220823.2927258, "train/loss": 2.1706278324127197, "train/z_loss": 0.0015280149527825415, "train/perplexity": 8.763784501844622, "train/grad_norm": 0.1591796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022570.6058360387, "perf/iters_per_sec": 0.9644368199520296, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0368745565414428, "data/tokens_consumed": 51004833792, "data/tokens_consumed_B": 51.004833792, "train/loss_slope": -1.1229398434418976e-05} {"step": 24330, "timestamp": 1778220833.668069, "train/loss": 2.1978426456451414, "train/z_loss": 0.0015298568410798906, "train/perplexity": 9.005564334172469, "train/grad_norm": 0.14453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022167.1922861803, "perf/iters_per_sec": 0.9642444573813345, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0370814085006714, "data/tokens_consumed": 51025805312, "data/tokens_consumed_B": 51.025805312, "train/loss_slope": -1.1425179144730314e-05} {"step": 24340, "timestamp": 1778220844.0437603, "train/loss": 2.2133544921875, "train/z_loss": 0.0015184623189270496, "train/perplexity": 9.146346337697002, "train/grad_norm": 0.1484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022556.5143377644, "perf/iters_per_sec": 0.9644301006020376, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0368817806243897, "data/tokens_consumed": 51046776832, "data/tokens_consumed_B": 51.046776832, "train/loss_slope": -1.1815799795540524e-05} {"step": 24350, "timestamp": 1778220854.9143345, "grad/layer_0/attn": 0.003012103959918022, "grad/layer_0/mlp": 0.002735226647928357, "grad/layer_0/attn_mlp_ratio": 1.101226420149397, "grad/layer_4/attn": 0.0022007108200341463, "grad/layer_4/mlp": 0.002564535243436694, "grad/layer_4/attn_mlp_ratio": 0.8581323808487692, "grad/layer_8/attn": 0.0032015275210142136, "grad/layer_8/mlp": 0.003760243533179164, "grad/layer_8/attn_mlp_ratio": 0.8514149170455282, "grad/layer_12/attn": 0.005107854027301073, "grad/layer_12/mlp": 0.006268410012125969, "grad/layer_12/attn_mlp_ratio": 0.8148563887707586, "grad/layer_16/attn": 0.00408568512648344, "grad/layer_16/mlp": 0.004793116357177496, "grad/layer_16/attn_mlp_ratio": 0.852406813601471, "grad/layer_20/attn": 0.004381406586617231, "grad/layer_20/mlp": 0.006840311922132969, "grad/layer_20/attn_mlp_ratio": 0.6405272994039543, "grad/layer_24/attn": 0.022715304046869278, "grad/layer_24/mlp": 0.015716243535280228, "grad/layer_24/attn_mlp_ratio": 1.4453392664311577, "grad/layer_27/attn": 0.005467891693115234, "grad/layer_27/mlp": 0.01511469203978777, "grad/layer_27/attn_mlp_ratio": 0.36176004397215666} {"step": 24350, "timestamp": 1778220854.930792, "train/loss": 2.2334125995635987, "train/z_loss": 0.0015124507830478252, "train/perplexity": 9.331657012417226, "train/grad_norm": 0.2353515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1927268.2639894017, "perf/iters_per_sec": 0.9189931220957764, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0881474256515502, "data/tokens_consumed": 51067748352, "data/tokens_consumed_B": 51.067748352, "train/loss_slope": -1.2773379755921499e-05} {"step": 24360, "timestamp": 1778220865.3084776, "train/loss": 2.167405390739441, "train/z_loss": 0.0015214377315714955, "train/perplexity": 8.735589170753853, "train/grad_norm": 0.1171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022170.7254083732, "perf/iters_per_sec": 0.9642461421052805, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0370795965194701, "data/tokens_consumed": 51088719872, "data/tokens_consumed_B": 51.088719872, "train/loss_slope": -1.3783492664299434e-05} {"step": 24370, "timestamp": 1778220876.0765398, "train/loss": 2.1976394653320312, "train/z_loss": 0.001519323221873492, "train/perplexity": 9.003734766663662, "train/grad_norm": 0.140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1948851.9126403357, "perf/iters_per_sec": 0.9292850077821425, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0760961294174194, "data/tokens_consumed": 51109691392, "data/tokens_consumed_B": 51.109691392, "train/loss_slope": -1.2828523290790414e-05} {"step": 24375, "timestamp": 1778220881.8733451, "eos/sharpness": 44.62592601776122, "eos/L0_probe": 2.036970853805542, "eos/L_plus": 2.28818678855896, "eos/L_minus": 2.2320141792297363, "eos/grad_norm": 0.20653359591960907, "eos/embed_grad_frac": 0.06643087416887283, "eos/time_s": 0.6165814399719238} {"step": 24375, "timestamp": 1778220883.2544796, "geo/rankme_last": 439.85699462890625, "geo/layer_0/stable_rank_q_proj": 17.831411361694336, "geo/layer_0/stable_rank_k_proj": 15.697032928466797, "geo/layer_0/stable_rank_o_proj": 51.22147750854492, "geo/layer_0/stable_rank_gate_proj": 148.02420043945312, "geo/layer_0/stable_rank_down_proj": 50.621795654296875, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.049439746886491776, "geo/layer_0/attn_entropy_mean": 6.247192859649658, "geo/layer_0/attn_entropy_std": 0.32939058542251587, "geo/layer_7/stable_rank_q_proj": 42.3036003112793, "geo/layer_7/stable_rank_k_proj": 41.74723815917969, "geo/layer_7/stable_rank_o_proj": 108.51318359375, "geo/layer_7/stable_rank_gate_proj": 100.63013458251953, "geo/layer_7/stable_rank_down_proj": 148.75865173339844, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5369554162025452, "geo/layer_7/attn_entropy_mean": 4.680918216705322, "geo/layer_7/attn_entropy_std": 0.8102635741233826, "geo/layer_14/stable_rank_q_proj": 57.185001373291016, "geo/layer_14/stable_rank_k_proj": 35.008583068847656, "geo/layer_14/stable_rank_o_proj": 54.22951889038086, "geo/layer_14/stable_rank_gate_proj": 84.4126968383789, "geo/layer_14/stable_rank_down_proj": 135.4891815185547, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3755571246147156, "geo/layer_14/attn_entropy_mean": 5.5238261222839355, "geo/layer_14/attn_entropy_std": 0.42921993136405945, "geo/layer_21/stable_rank_q_proj": 46.19367218017578, "geo/layer_21/stable_rank_k_proj": 31.611330032348633, "geo/layer_21/stable_rank_o_proj": 81.71822357177734, "geo/layer_21/stable_rank_gate_proj": 83.46334075927734, "geo/layer_21/stable_rank_down_proj": 59.208919525146484, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15474942326545715, "geo/layer_21/attn_entropy_mean": 5.756289482116699, "geo/layer_21/attn_entropy_std": 0.2882574498653412, "geo/layer_27/stable_rank_q_proj": 41.420833587646484, "geo/layer_27/stable_rank_k_proj": 31.38146209716797, "geo/layer_27/stable_rank_o_proj": 120.03727722167969, "geo/layer_27/stable_rank_gate_proj": 90.31397247314453, "geo/layer_27/stable_rank_down_proj": 136.25674438476562, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07795219868421555, "geo/layer_27/attn_entropy_mean": 4.367981433868408, "geo/layer_27/attn_entropy_std": 0.5666423439979553, "attnres/final_alpha/block_0": 0.23995567858219147, "attnres/block_norm/0": 1.6485264301300049, "attnres/final_alpha/block_1": 0.006380641832947731, "attnres/block_norm/1": 31999.701171875, "attnres/final_alpha/block_2": 0.013228567317128181, "attnres/block_norm/2": 22568.4453125, "attnres/final_alpha/block_3": 0.015119928866624832, "attnres/block_norm/3": 33986.25390625, "attnres/final_alpha/block_4": 0.019363926723599434, "attnres/block_norm/4": 10257.470703125, "attnres/final_alpha/block_5": 0.5785126686096191, "attnres/block_norm/5": 5272.90234375, "attnres/final_alpha/block_6": 0.12743856012821198, "attnres/block_norm/6": 22716.77734375, "geo/tier1_time_s": 1.360668659210205, "geo/step": 24375.0, "geo/rankme_slope": -0.00010740124174669868} {"step": 24380, "timestamp": 1778220888.4518692, "train/loss": 2.2388601303100586, "train/z_loss": 0.0015106008504517376, "train/perplexity": 9.382630213894004, "train/grad_norm": 0.1484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1695560.5144001702, "perf/iters_per_sec": 0.808506257248006, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2368488073349, "data/tokens_consumed": 51130662912, "data/tokens_consumed_B": 51.130662912, "train/loss_slope": -1.080591662166861e-05} {"step": 24390, "timestamp": 1778220898.833677, "train/loss": 2.208378052711487, "train/z_loss": 0.0015086118248291314, "train/perplexity": 9.10094316549062, "train/grad_norm": 0.26953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021138.8349397534, "perf/iters_per_sec": 0.9637540983866469, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037609076499939, "data/tokens_consumed": 51151634432, "data/tokens_consumed_B": 51.151634432, "train/loss_slope": -1.0149627671812612e-05} {"step": 24400, "timestamp": 1778220909.2032144, "grad/layer_0/attn": 0.0023698213044553995, "grad/layer_0/mlp": 0.0024844196159392595, "grad/layer_0/attn_mlp_ratio": 0.9538731677467247, "grad/layer_4/attn": 0.002448341576382518, "grad/layer_4/mlp": 0.002453520894050598, "grad/layer_4/attn_mlp_ratio": 0.9978889857960703, "grad/layer_8/attn": 0.003921945113688707, "grad/layer_8/mlp": 0.003718987572938204, "grad/layer_8/attn_mlp_ratio": 1.0545733028983546, "grad/layer_12/attn": 0.004957887344062328, "grad/layer_12/mlp": 0.005739565938711166, "grad/layer_12/attn_mlp_ratio": 0.8638087462751164, "grad/layer_16/attn": 0.004706225357949734, "grad/layer_16/mlp": 0.004265005234628916, "grad/layer_16/attn_mlp_ratio": 1.1034512242548478, "grad/layer_20/attn": 0.003958182875066996, "grad/layer_20/mlp": 0.005224280990660191, "grad/layer_20/attn_mlp_ratio": 0.7576512071954385, "grad/layer_24/attn": 0.004736857023090124, "grad/layer_24/mlp": 0.00772777060046792, "grad/layer_24/attn_mlp_ratio": 0.6129655248186007, "grad/layer_27/attn": 0.005649424158036709, "grad/layer_27/mlp": 0.006978299934417009, "grad/layer_27/attn_mlp_ratio": 0.8095702578240721} {"step": 24400, "timestamp": 1778220909.21879, "train/loss": 2.235788607597351, "train/z_loss": 0.0015129552106373013, "train/perplexity": 9.353855465855515, "train/grad_norm": 0.08935546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020616.7861097527, "perf/iters_per_sec": 0.9635051661061061, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0378771543502807, "data/tokens_consumed": 51172605952, "data/tokens_consumed_B": 51.172605952, "train/loss_slope": -7.806550980758801e-06} {"step": 24410, "timestamp": 1778220919.5997899, "train/loss": 2.212130808830261, "train/z_loss": 0.0015266172937117516, "train/perplexity": 9.135160950985552, "train/grad_norm": 0.103515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020907.166858723, "perf/iters_per_sec": 0.963643630437242, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0377280235290527, "data/tokens_consumed": 51193577472, "data/tokens_consumed_B": 51.193577472, "train/loss_slope": -9.545069816696957e-06} {"step": 24420, "timestamp": 1778220929.9488358, "train/loss": 2.136356806755066, "train/z_loss": 0.0015411844127811492, "train/perplexity": 8.468528872604159, "train/grad_norm": 0.11572265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027447.7057152514, "perf/iters_per_sec": 0.9667624023987061, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034380316734314, "data/tokens_consumed": 51214548992, "data/tokens_consumed_B": 51.214548992, "train/loss_slope": -1.789471215397754e-05} {"step": 24430, "timestamp": 1778220940.2954848, "train/loss": 2.1914652585983276, "train/z_loss": 0.0015306056942790748, "train/perplexity": 8.948315109103783, "train/grad_norm": 0.11181640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028021.9645201357, "perf/iters_per_sec": 0.9670362303353003, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340874195098877, "data/tokens_consumed": 51235520512, "data/tokens_consumed_B": 51.235520512, "train/loss_slope": -1.9342540042235105e-05} {"step": 24440, "timestamp": 1778220950.651816, "train/loss": 2.2437963485717773, "train/z_loss": 0.0015054970048367978, "train/perplexity": 9.42905942257723, "train/grad_norm": 0.10888671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025949.7609406868, "perf/iters_per_sec": 0.9660481266692575, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035145115852356, "data/tokens_consumed": 51256492032, "data/tokens_consumed_B": 51.256492032, "train/loss_slope": -1.812439524229673e-05} {"step": 24450, "timestamp": 1778220960.9998116, "grad/layer_0/attn": 0.002574200974777341, "grad/layer_0/mlp": 0.0026277548167854548, "grad/layer_0/attn_mlp_ratio": 0.9796198870503393, "grad/layer_4/attn": 0.0016858947928994894, "grad/layer_4/mlp": 0.0023971921764314175, "grad/layer_4/attn_mlp_ratio": 0.703278919039902, "grad/layer_8/attn": 0.004563484340906143, "grad/layer_8/mlp": 0.003883790923282504, "grad/layer_8/attn_mlp_ratio": 1.1750076957150943, "grad/layer_12/attn": 0.004250039346516132, "grad/layer_12/mlp": 0.006381145212799311, "grad/layer_12/attn_mlp_ratio": 0.6660308045314997, "grad/layer_16/attn": 0.004392571747303009, "grad/layer_16/mlp": 0.004637652076780796, "grad/layer_16/attn_mlp_ratio": 0.9471542021402933, "grad/layer_20/attn": 0.004681125748902559, "grad/layer_20/mlp": 0.005782921798527241, "grad/layer_20/attn_mlp_ratio": 0.8094741431826565, "grad/layer_24/attn": 0.010470948182046413, "grad/layer_24/mlp": 0.010113292373716831, "grad/layer_24/attn_mlp_ratio": 1.0353649129854678, "grad/layer_27/attn": 0.005088586825877428, "grad/layer_27/mlp": 0.008762537501752377, "grad/layer_27/attn_mlp_ratio": 0.580720683567713} {"step": 24450, "timestamp": 1778220961.6190357, "eos/sharpness": 34.842061996459954, "eos/L0_probe": 2.036623954772949, "eos/L_plus": 2.2316977977752686, "eos/L_minus": 2.1899707317352295, "eos/grad_norm": 0.13273108005523682, "eos/embed_grad_frac": 0.185409814119339, "eos/time_s": 0.6161537170410156} {"step": 24450, "timestamp": 1778220961.6393723, "train/loss": 2.1989574909210203, "train/z_loss": 0.0015268729883246123, "train/perplexity": 9.01560974352519, "train/grad_norm": 0.1328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1909549.658057893, "perf/iters_per_sec": 0.9105442323960747, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0982442855834962, "data/tokens_consumed": 51277463552, "data/tokens_consumed_B": 51.277463552, "train/loss_slope": -1.7439833239610583e-05} {"step": 24450, "timestamp": 1778220963.0011768, "geo/rankme_last": 440.2525329589844, "geo/layer_0/stable_rank_q_proj": 17.82806968688965, "geo/layer_0/stable_rank_k_proj": 15.696198463439941, "geo/layer_0/stable_rank_o_proj": 51.26209259033203, "geo/layer_0/stable_rank_gate_proj": 147.7091064453125, "geo/layer_0/stable_rank_down_proj": 50.6986198425293, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.04906909167766571, "geo/layer_0/attn_entropy_mean": 6.239499092102051, "geo/layer_0/attn_entropy_std": 0.33637726306915283, "geo/layer_7/stable_rank_q_proj": 42.28342819213867, "geo/layer_7/stable_rank_k_proj": 41.90262222290039, "geo/layer_7/stable_rank_o_proj": 108.8021469116211, "geo/layer_7/stable_rank_gate_proj": 100.23114776611328, "geo/layer_7/stable_rank_down_proj": 148.82777404785156, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.532399594783783, "geo/layer_7/attn_entropy_mean": 4.655905723571777, "geo/layer_7/attn_entropy_std": 0.8261256814002991, "geo/layer_14/stable_rank_q_proj": 57.13058090209961, "geo/layer_14/stable_rank_k_proj": 34.9871940612793, "geo/layer_14/stable_rank_o_proj": 54.268672943115234, "geo/layer_14/stable_rank_gate_proj": 84.415771484375, "geo/layer_14/stable_rank_down_proj": 135.6689910888672, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.364389032125473, "geo/layer_14/attn_entropy_mean": 5.499648571014404, "geo/layer_14/attn_entropy_std": 0.4440349042415619, "geo/layer_21/stable_rank_q_proj": 46.20281982421875, "geo/layer_21/stable_rank_k_proj": 31.539958953857422, "geo/layer_21/stable_rank_o_proj": 81.5918197631836, "geo/layer_21/stable_rank_gate_proj": 83.44963073730469, "geo/layer_21/stable_rank_down_proj": 59.13336944580078, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1564633548259735, "geo/layer_21/attn_entropy_mean": 5.726559162139893, "geo/layer_21/attn_entropy_std": 0.2974870800971985, "geo/layer_27/stable_rank_q_proj": 41.50639343261719, "geo/layer_27/stable_rank_k_proj": 31.452312469482422, "geo/layer_27/stable_rank_o_proj": 119.8310317993164, "geo/layer_27/stable_rank_gate_proj": 90.37622833251953, "geo/layer_27/stable_rank_down_proj": 136.33245849609375, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0806172639131546, "geo/layer_27/attn_entropy_mean": 4.348085403442383, "geo/layer_27/attn_entropy_std": 0.5478919744491577, "attnres/final_alpha/block_0": 0.2410528063774109, "attnres/block_norm/0": 1.648868203163147, "attnres/final_alpha/block_1": 0.006350578740239143, "attnres/block_norm/1": 31975.17578125, "attnres/final_alpha/block_2": 0.013523198664188385, "attnres/block_norm/2": 22570.1171875, "attnres/final_alpha/block_3": 0.015466708689928055, "attnres/block_norm/3": 34146.171875, "attnres/final_alpha/block_4": 0.019476301968097687, "attnres/block_norm/4": 10223.0263671875, "attnres/final_alpha/block_5": 0.5764932632446289, "attnres/block_norm/5": 5290.833984375, "attnres/final_alpha/block_6": 0.1276371330022812, "attnres/block_norm/6": 22757.642578125, "geo/tier1_time_s": 1.357940912246704, "geo/step": 24450.0, "geo/rankme_slope": -0.00011040218821903761} {"step": 24460, "timestamp": 1778220973.3573804, "train/loss": 2.1953094482421873, "train/z_loss": 0.001522840978577733, "train/perplexity": 8.982780332360031, "train/grad_norm": 0.1591796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1790243.3061072417, "perf/iters_per_sec": 0.8536545305763443, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1714340686798095, "data/tokens_consumed": 51298435072, "data/tokens_consumed_B": 51.298435072, "train/loss_slope": -2.1940761681186747e-05} {"step": 24470, "timestamp": 1778220983.7183044, "train/loss": 2.2003729820251463, "train/z_loss": 0.0015202732640318573, "train/perplexity": 9.028380295084212, "train/grad_norm": 0.1708984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025252.5839732424, "perf/iters_per_sec": 0.9657156867853367, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0355014562606812, "data/tokens_consumed": 51319406592, "data/tokens_consumed_B": 51.319406592, "train/loss_slope": -1.9877580705076744e-05} {"step": 24480, "timestamp": 1778220994.0745153, "train/loss": 2.2394672632217407, "train/z_loss": 0.0015140175935812294, "train/perplexity": 9.388328447112428, "train/grad_norm": 0.103515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026389.2298708085, "perf/iters_per_sec": 0.9662576817850154, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034920620918274, "data/tokens_consumed": 51340378112, "data/tokens_consumed_B": 51.340378112, "train/loss_slope": -1.8207532823747482e-05} {"step": 24490, "timestamp": 1778221004.4353836, "train/loss": 2.2049112796783445, "train/z_loss": 0.0015139938681386412, "train/perplexity": 9.069446887916074, "train/grad_norm": 0.1591796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025217.425307716, "perf/iters_per_sec": 0.9656989218271809, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0355194330215454, "data/tokens_consumed": 51361349632, "data/tokens_consumed_B": 51.361349632, "train/loss_slope": -1.803137831883931e-05} {"step": 24500, "timestamp": 1778221014.788984, "grad/layer_0/attn": 0.0026531287003308535, "grad/layer_0/mlp": 0.0028780403081327677, "grad/layer_0/attn_mlp_ratio": 0.9218524843618044, "grad/layer_4/attn": 0.0018403686117380857, "grad/layer_4/mlp": 0.002621277002617717, "grad/layer_4/attn_mlp_ratio": 0.7020885391705504, "grad/layer_8/attn": 0.004683769308030605, "grad/layer_8/mlp": 0.0037956763990223408, "grad/layer_8/attn_mlp_ratio": 1.2339748419648018, "grad/layer_12/attn": 0.0045526097528636456, "grad/layer_12/mlp": 0.006309176795184612, "grad/layer_12/attn_mlp_ratio": 0.7215853713561843, "grad/layer_16/attn": 0.004077644553035498, "grad/layer_16/mlp": 0.004707717802375555, "grad/layer_16/attn_mlp_ratio": 0.8661616174957899, "grad/layer_20/attn": 0.00390899321064353, "grad/layer_20/mlp": 0.005611171945929527, "grad/layer_20/attn_mlp_ratio": 0.6966446900303482, "grad/layer_24/attn": 0.006097843870520592, "grad/layer_24/mlp": 0.008906158618628979, "grad/layer_24/attn_mlp_ratio": 0.6846772063208075, "grad/layer_27/attn": 0.005133193917572498, "grad/layer_27/mlp": 0.007804805412888527, "grad/layer_27/attn_mlp_ratio": 0.6576965830981641} {"step": 24500, "timestamp": 1778221014.8053167, "train/loss": 2.181350326538086, "train/z_loss": 0.001520068128593266, "train/perplexity": 8.858259729554202, "train/grad_norm": 0.1083984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023515.881983526, "perf/iters_per_sec": 0.9648875627439146, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0363901853561401, "data/tokens_consumed": 51382321152, "data/tokens_consumed_B": 51.382321152, "train/loss_slope": -2.0533072251011758e-05} {"step": 24500, "timestamp": 1778221021.6272323, "geo/ww_alpha_mean": 8.136143113576805, "geo/ww_alpha_std": 5.268968009670801, "geo/ww_alpha_min": 1.3790664942305664, "geo/ww_alpha_max": 38.26885427447151, "geo/ww_alpha_healthy_frac": 0.16751269035532995, "geo/ww_alpha_by_type/q_proj": 4.155445570327816, "geo/ww_alpha_by_type/k_proj": 4.735768891565686, "geo/ww_alpha_by_type/v_proj": 7.932573134684209, "geo/ww_alpha_by_type/o_proj": 8.338930302455326, "geo/ww_alpha_by_type/gate_proj": 8.501324423532296, "geo/ww_alpha_by_type/up_proj": 13.96889782468524, "geo/ww_alpha_by_type/down_proj": 9.45249510326498, "geo/twonn_id/layer_0": 0.7178983688354492, "geo/twonn_id/layer_7": 2.9801321029663086, "geo/twonn_id/layer_14": 4.0313897132873535, "geo/twonn_id/layer_21": 8.525031089782715, "geo/twonn_id/layer_27": 6.563308238983154, "geo/tier2_time_s": 6.817176580429077} {"step": 24500, "timestamp": 1778221022.2687871, "eoc/jacobian_sigma/layer_0/attn": 1028.7882080078125, "eoc/jacobian_sigma/layer_0/mlp": 6000.29248046875, "eoc/jacobian_sigma/layer_0": 6000.29248046875, "eoc/jacobian_sigma/layer_7/attn": 1.1508424282073975, "eoc/jacobian_sigma/layer_7/mlp": 1.7623622417449951, "eoc/jacobian_sigma/layer_7": 1.7623622417449951, "eoc/jacobian_sigma/layer_14/attn": 1.631757378578186, "eoc/jacobian_sigma/layer_14/mlp": 7.488888740539551, "eoc/jacobian_sigma/layer_14": 7.488888740539551, "eoc/jacobian_sigma/layer_21/attn": 1.0804914236068726, "eoc/jacobian_sigma/layer_21/mlp": 4.089718818664551, "eoc/jacobian_sigma/layer_21": 4.089718818664551, "eoc/jacobian_sigma/layer_27/attn": 3.72404146194458, "eoc/jacobian_sigma/layer_27/mlp": 22.226221084594727, "eoc/jacobian_sigma/layer_27": 22.226221084594727, "eoc/layer0_sigma": 6000.29248046875, "eoc/sigma_max": 22.226221084594727, "eoc/sigma_min": 1.7623622417449951, "eoc/sigma_mean": 8.891797721385956, "eoc/time_s": 0.6358382701873779} {"step": 24510, "timestamp": 1778221032.6358514, "train/loss": 2.259553909301758, "train/z_loss": 0.0015002824366092682, "train/perplexity": 9.578815193187463, "train/grad_norm": 0.17578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1176389.3008989987, "perf/iters_per_sec": 0.5609461311812395, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.782702374458313, "data/tokens_consumed": 51403292672, "data/tokens_consumed_B": 51.403292672, "train/loss_slope": -1.5879199542764675e-05} {"step": 24520, "timestamp": 1778221042.9959707, "train/loss": 2.19065465927124, "train/z_loss": 0.0015198468696326017, "train/perplexity": 8.941064549943976, "train/grad_norm": 0.22265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025212.715817909, "perf/iters_per_sec": 0.9656966761674447, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0355218410491944, "data/tokens_consumed": 51424264192, "data/tokens_consumed_B": 51.424264192, "train/loss_slope": -1.9370902890097563e-05} {"step": 24525, "timestamp": 1778221048.7878597, "eos/sharpness": 41.357874870300286, "eos/L0_probe": 2.033573865890503, "eos/L_plus": 2.1827921867370605, "eos/L_minus": 2.2979342937469482, "eos/grad_norm": 0.15197832882404327, "eos/embed_grad_frac": 0.13146209716796875, "eos/time_s": 0.6228492259979248} {"step": 24525, "timestamp": 1778221050.163624, "geo/rankme_last": 440.6412658691406, "geo/layer_0/stable_rank_q_proj": 17.85014533996582, "geo/layer_0/stable_rank_k_proj": 15.755048751831055, "geo/layer_0/stable_rank_o_proj": 51.22808837890625, "geo/layer_0/stable_rank_gate_proj": 147.87368774414062, "geo/layer_0/stable_rank_down_proj": 50.706504821777344, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.048394884914159775, "geo/layer_0/attn_entropy_mean": 6.237128257751465, "geo/layer_0/attn_entropy_std": 0.3335108160972595, "geo/layer_7/stable_rank_q_proj": 42.363075256347656, "geo/layer_7/stable_rank_k_proj": 41.97364807128906, "geo/layer_7/stable_rank_o_proj": 108.65904235839844, "geo/layer_7/stable_rank_gate_proj": 100.15399932861328, "geo/layer_7/stable_rank_down_proj": 148.88209533691406, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5400869846343994, "geo/layer_7/attn_entropy_mean": 4.66655158996582, "geo/layer_7/attn_entropy_std": 0.8062353134155273, "geo/layer_14/stable_rank_q_proj": 57.238651275634766, "geo/layer_14/stable_rank_k_proj": 34.81816864013672, "geo/layer_14/stable_rank_o_proj": 54.271209716796875, "geo/layer_14/stable_rank_gate_proj": 84.19605255126953, "geo/layer_14/stable_rank_down_proj": 135.45413208007812, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3673417270183563, "geo/layer_14/attn_entropy_mean": 5.490354537963867, "geo/layer_14/attn_entropy_std": 0.45263567566871643, "geo/layer_21/stable_rank_q_proj": 46.194091796875, "geo/layer_21/stable_rank_k_proj": 31.68311882019043, "geo/layer_21/stable_rank_o_proj": 81.63140106201172, "geo/layer_21/stable_rank_gate_proj": 83.62588500976562, "geo/layer_21/stable_rank_down_proj": 59.17362594604492, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1507270634174347, "geo/layer_21/attn_entropy_mean": 5.741013526916504, "geo/layer_21/attn_entropy_std": 0.29574984312057495, "geo/layer_27/stable_rank_q_proj": 41.431678771972656, "geo/layer_27/stable_rank_k_proj": 31.423757553100586, "geo/layer_27/stable_rank_o_proj": 119.84671783447266, "geo/layer_27/stable_rank_gate_proj": 90.43234252929688, "geo/layer_27/stable_rank_down_proj": 136.07078552246094, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0743272602558136, "geo/layer_27/attn_entropy_mean": 4.361615180969238, "geo/layer_27/attn_entropy_std": 0.5948458313941956, "attnres/final_alpha/block_0": 0.24278175830841064, "attnres/block_norm/0": 1.6492373943328857, "attnres/final_alpha/block_1": 0.006495066452771425, "attnres/block_norm/1": 32152.2890625, "attnres/final_alpha/block_2": 0.013298748061060905, "attnres/block_norm/2": 22602.810546875, "attnres/final_alpha/block_3": 0.015291293151676655, "attnres/block_norm/3": 34132.78515625, "attnres/final_alpha/block_4": 0.01937221735715866, "attnres/block_norm/4": 10297.671875, "attnres/final_alpha/block_5": 0.571500301361084, "attnres/block_norm/5": 5266.0224609375, "attnres/final_alpha/block_6": 0.13126066327095032, "attnres/block_norm/6": 22615.34375, "geo/tier1_time_s": 1.3547894954681396, "geo/step": 24525.0, "geo/rankme_slope": -9.380443583683473e-05} {"step": 24530, "timestamp": 1778221055.3424802, "train/loss": 2.1718810319900514, "train/z_loss": 0.0015287053771317004, "train/perplexity": 8.774774157555827, "train/grad_norm": 0.107421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1699141.021864117, "perf/iters_per_sec": 0.8102135762520394, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2342424631118774, "data/tokens_consumed": 51445235712, "data/tokens_consumed_B": 51.445235712, "train/loss_slope": -2.154776487771081e-05} {"step": 24540, "timestamp": 1778221065.7117841, "train/loss": 2.2392188787460325, "train/z_loss": 0.001500288606621325, "train/perplexity": 9.385996821655088, "train/grad_norm": 0.158203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023756.9486724664, "perf/iters_per_sec": 0.9650025122988064, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0362667322158814, "data/tokens_consumed": 51466207232, "data/tokens_consumed_B": 51.466207232, "train/loss_slope": -2.2371963055470803e-05} {"step": 24550, "timestamp": 1778221076.0607219, "grad/layer_0/attn": 0.002684331499040127, "grad/layer_0/mlp": 0.0027143710758537054, "grad/layer_0/attn_mlp_ratio": 0.9889330990983093, "grad/layer_4/attn": 0.0017257772851735353, "grad/layer_4/mlp": 0.002471622545272112, "grad/layer_4/attn_mlp_ratio": 0.6982365566502312, "grad/layer_8/attn": 0.007027830928564072, "grad/layer_8/mlp": 0.003662650939077139, "grad/layer_8/attn_mlp_ratio": 1.9187825576566098, "grad/layer_12/attn": 0.004551257938146591, "grad/layer_12/mlp": 0.006481108721345663, "grad/layer_12/attn_mlp_ratio": 0.7022344576528219, "grad/layer_16/attn": 0.004552026279270649, "grad/layer_16/mlp": 0.004840520676225424, "grad/layer_16/attn_mlp_ratio": 0.9404001118285171, "grad/layer_20/attn": 0.004094010218977928, "grad/layer_20/mlp": 0.006411472335457802, "grad/layer_20/attn_mlp_ratio": 0.638544462319848, "grad/layer_24/attn": 0.011930635198950768, "grad/layer_24/mlp": 0.011902467347681522, "grad/layer_24/attn_mlp_ratio": 1.002366547221663, "grad/layer_27/attn": 0.004704257473349571, "grad/layer_27/mlp": 0.010504930280148983, "grad/layer_27/attn_mlp_ratio": 0.44781424560786615} {"step": 24550, "timestamp": 1778221076.0767593, "train/loss": 2.1917073130607605, "train/z_loss": 0.001521129719913006, "train/perplexity": 8.950481350870863, "train/grad_norm": 0.1376953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024088.7071214884, "perf/iters_per_sec": 0.9651607070548479, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0360968828201294, "data/tokens_consumed": 51487178752, "data/tokens_consumed_B": 51.487178752, "train/loss_slope": -2.0593459749951492e-05} {"step": 24560, "timestamp": 1778221086.444462, "train/loss": 2.2164209604263307, "train/z_loss": 0.0015207085176371037, "train/perplexity": 9.17443636481949, "train/grad_norm": 0.27734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024294.7836845499, "perf/iters_per_sec": 0.9652589720175504, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035991406440735, "data/tokens_consumed": 51508150272, "data/tokens_consumed_B": 51.508150272, "train/loss_slope": -2.1451206075655213e-05} {"step": 24570, "timestamp": 1778221096.806879, "train/loss": 2.141443133354187, "train/z_loss": 0.001531482010614127, "train/perplexity": 8.511712305687649, "train/grad_norm": 0.1962890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024740.7588591024, "perf/iters_per_sec": 0.9654716295524132, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0357632160186767, "data/tokens_consumed": 51529121792, "data/tokens_consumed_B": 51.529121792, "train/loss_slope": -2.4554033591301662e-05} {"step": 24580, "timestamp": 1778221107.170452, "train/loss": 2.2314144134521485, "train/z_loss": 0.0015192255028523504, "train/perplexity": 9.313029242052211, "train/grad_norm": 0.2177734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024507.3324454227, "perf/iters_per_sec": 0.9653603231646646, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0358826398849488, "data/tokens_consumed": 51550093312, "data/tokens_consumed_B": 51.550093312, "train/loss_slope": -2.335902731088846e-05} {"step": 24590, "timestamp": 1778221117.538856, "train/loss": 2.2008126974105835, "train/z_loss": 0.0015220593893900513, "train/perplexity": 9.032351085750433, "train/grad_norm": 0.2255859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023863.673241048, "perf/iters_per_sec": 0.9650534025387992, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0362120866775513, "data/tokens_consumed": 51571064832, "data/tokens_consumed_B": 51.571064832, "train/loss_slope": -2.565797689330856e-05} {"step": 24600, "timestamp": 1778221127.8887684, "grad/layer_0/attn": 0.0026589136105030775, "grad/layer_0/mlp": 0.002694905735552311, "grad/layer_0/attn_mlp_ratio": 0.9866443478007991, "grad/layer_4/attn": 0.0019668464083224535, "grad/layer_4/mlp": 0.002526588272303343, "grad/layer_4/attn_mlp_ratio": 0.7784593762415659, "grad/layer_8/attn": 0.011995157226920128, "grad/layer_8/mlp": 0.003776336321607232, "grad/layer_8/attn_mlp_ratio": 3.1764005871635015, "grad/layer_12/attn": 0.004055587109178305, "grad/layer_12/mlp": 0.006204061675816774, "grad/layer_12/attn_mlp_ratio": 0.6536986986472069, "grad/layer_16/attn": 0.006257192697376013, "grad/layer_16/mlp": 0.004797230940312147, "grad/layer_16/attn_mlp_ratio": 1.3043342388130776, "grad/layer_20/attn": 0.007854216732084751, "grad/layer_20/mlp": 0.0072215222753584385, "grad/layer_20/attn_mlp_ratio": 1.0876123238065727, "grad/layer_24/attn": 0.022835684940218925, "grad/layer_24/mlp": 0.015887156128883362, "grad/layer_24/attn_mlp_ratio": 1.4373676831290243, "grad/layer_27/attn": 0.006487412378191948, "grad/layer_27/mlp": 0.013887249864637852, "grad/layer_27/attn_mlp_ratio": 0.46714881597950164} {"step": 24600, "timestamp": 1778221128.4964805, "eos/sharpness": 47.664165496826165, "eos/L0_probe": 2.0360031127929688, "eos/L_plus": 2.3084771633148193, "eos/L_minus": 2.24017071723938, "eos/grad_norm": 0.23719757795333862, "eos/embed_grad_frac": 0.056202273815870285, "eos/time_s": 0.6049766540527344} {"step": 24600, "timestamp": 1778221128.516652, "train/loss": 2.214543271064758, "train/z_loss": 0.001519670884590596, "train/perplexity": 9.157225786375506, "train/grad_norm": 0.2373046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1911269.1640858504, "perf/iters_per_sec": 0.9113641567639591, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0972562313079834, "data/tokens_consumed": 51592036352, "data/tokens_consumed_B": 51.592036352, "train/loss_slope": -2.5396537809374833e-05} {"step": 24600, "timestamp": 1778221129.8775392, "geo/rankme_last": 440.35308837890625, "geo/layer_0/stable_rank_q_proj": 17.900428771972656, "geo/layer_0/stable_rank_k_proj": 15.771454811096191, "geo/layer_0/stable_rank_o_proj": 51.186527252197266, "geo/layer_0/stable_rank_gate_proj": 147.7024383544922, "geo/layer_0/stable_rank_down_proj": 50.70372009277344, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.053446389734745026, "geo/layer_0/attn_entropy_mean": 6.237835884094238, "geo/layer_0/attn_entropy_std": 0.32996729016304016, "geo/layer_7/stable_rank_q_proj": 42.33259963989258, "geo/layer_7/stable_rank_k_proj": 41.91344451904297, "geo/layer_7/stable_rank_o_proj": 108.44241333007812, "geo/layer_7/stable_rank_gate_proj": 100.06295776367188, "geo/layer_7/stable_rank_down_proj": 148.92022705078125, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5366080403327942, "geo/layer_7/attn_entropy_mean": 4.6741838455200195, "geo/layer_7/attn_entropy_std": 0.8175774812698364, "geo/layer_14/stable_rank_q_proj": 56.99573516845703, "geo/layer_14/stable_rank_k_proj": 34.78501892089844, "geo/layer_14/stable_rank_o_proj": 54.20880126953125, "geo/layer_14/stable_rank_gate_proj": 83.84678649902344, "geo/layer_14/stable_rank_down_proj": 135.2900848388672, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3966362476348877, "geo/layer_14/attn_entropy_mean": 5.488032341003418, "geo/layer_14/attn_entropy_std": 0.4824233055114746, "geo/layer_21/stable_rank_q_proj": 46.279232025146484, "geo/layer_21/stable_rank_k_proj": 31.800621032714844, "geo/layer_21/stable_rank_o_proj": 81.58257293701172, "geo/layer_21/stable_rank_gate_proj": 83.42106628417969, "geo/layer_21/stable_rank_down_proj": 59.168922424316406, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.147193044424057, "geo/layer_21/attn_entropy_mean": 5.743039131164551, "geo/layer_21/attn_entropy_std": 0.2991054356098175, "geo/layer_27/stable_rank_q_proj": 41.42774200439453, "geo/layer_27/stable_rank_k_proj": 31.502796173095703, "geo/layer_27/stable_rank_o_proj": 119.54322814941406, "geo/layer_27/stable_rank_gate_proj": 90.3236312866211, "geo/layer_27/stable_rank_down_proj": 136.0088348388672, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08426029980182648, "geo/layer_27/attn_entropy_mean": 4.3550214767456055, "geo/layer_27/attn_entropy_std": 0.5828911662101746, "attnres/final_alpha/block_0": 0.24137818813323975, "attnres/block_norm/0": 1.649972677230835, "attnres/final_alpha/block_1": 0.00635130051523447, "attnres/block_norm/1": 32187.853515625, "attnres/final_alpha/block_2": 0.01328341756016016, "attnres/block_norm/2": 22652.98828125, "attnres/final_alpha/block_3": 0.015362095087766647, "attnres/block_norm/3": 34228.5625, "attnres/final_alpha/block_4": 0.019612785428762436, "attnres/block_norm/4": 10273.4873046875, "attnres/final_alpha/block_5": 0.5751774907112122, "attnres/block_norm/5": 5274.21337890625, "attnres/final_alpha/block_6": 0.12883473932743073, "attnres/block_norm/6": 22845.111328125, "geo/tier1_time_s": 1.3565313816070557, "geo/step": 24600.0, "geo/rankme_slope": -0.00010048417804621848} {"step": 24610, "timestamp": 1778221140.243306, "train/loss": 2.1922682523727417, "train/z_loss": 0.001512196904513985, "train/perplexity": 8.955503436132563, "train/grad_norm": 0.197265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1788986.1533016788, "perf/iters_per_sec": 0.8530550734051127, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1722572565078735, "data/tokens_consumed": 51613007872, "data/tokens_consumed_B": 51.613007872, "train/loss_slope": -2.397507594005863e-05} {"step": 24620, "timestamp": 1778221150.6105986, "train/loss": 2.250089168548584, "train/z_loss": 0.001498600433114916, "train/perplexity": 9.48858188171216, "train/grad_norm": 0.2412109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024429.0541457008, "perf/iters_per_sec": 0.9653229971626762, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0359226942062378, "data/tokens_consumed": 51633979392, "data/tokens_consumed_B": 51.633979392, "train/loss_slope": -2.3921753170609742e-05} {"step": 24630, "timestamp": 1778221160.9746614, "train/loss": 2.1693594217300416, "train/z_loss": 0.0015207013115286828, "train/perplexity": 8.752675470857994, "train/grad_norm": 0.1552734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024459.4328622955, "perf/iters_per_sec": 0.9653374828635671, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0359071493148804, "data/tokens_consumed": 51654950912, "data/tokens_consumed_B": 51.654950912, "train/loss_slope": -2.3009990036326642e-05} {"step": 24640, "timestamp": 1778221171.3378212, "train/loss": 2.1972161769866942, "train/z_loss": 0.0015251924167387187, "train/perplexity": 8.999924397171819, "train/grad_norm": 0.126953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024722.4425628618, "perf/iters_per_sec": 0.965462895661765, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0357725858688354, "data/tokens_consumed": 51675922432, "data/tokens_consumed_B": 51.675922432, "train/loss_slope": -2.2154711389412867e-05} {"step": 24650, "timestamp": 1778221181.69069, "grad/layer_0/attn": 0.003003319725394249, "grad/layer_0/mlp": 0.002801216673105955, "grad/layer_0/attn_mlp_ratio": 1.072148272932196, "grad/layer_4/attn": 0.0026073013432323933, "grad/layer_4/mlp": 0.002610816853120923, "grad/layer_4/attn_mlp_ratio": 0.99865344451501, "grad/layer_8/attn": 0.004318807274103165, "grad/layer_8/mlp": 0.003948836121708155, "grad/layer_8/attn_mlp_ratio": 1.0936911615531555, "grad/layer_12/attn": 0.004330678842961788, "grad/layer_12/mlp": 0.005978252273052931, "grad/layer_12/attn_mlp_ratio": 0.7244054905547972, "grad/layer_16/attn": 0.003619835712015629, "grad/layer_16/mlp": 0.004569668788462877, "grad/layer_16/attn_mlp_ratio": 0.7921439825005027, "grad/layer_20/attn": 0.007445994298905134, "grad/layer_20/mlp": 0.0058195400051772594, "grad/layer_20/attn_mlp_ratio": 1.279481568016161, "grad/layer_24/attn": 0.009033962152898312, "grad/layer_24/mlp": 0.007767512928694487, "grad/layer_24/attn_mlp_ratio": 1.1630443514578412, "grad/layer_27/attn": 0.003540792502462864, "grad/layer_27/mlp": 0.006486644968390465, "grad/layer_27/attn_mlp_ratio": 0.5458588322825321} {"step": 24650, "timestamp": 1778221181.7063801, "train/loss": 2.2207398414611816, "train/z_loss": 0.0015146818244829775, "train/perplexity": 9.214145351512316, "train/grad_norm": 0.09326171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023751.0819414267, "perf/iters_per_sec": 0.9649997148234495, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036269736289978, "data/tokens_consumed": 51696893952, "data/tokens_consumed_B": 51.696893952, "train/loss_slope": -2.225020041238286e-05} {"step": 24660, "timestamp": 1778221192.068689, "train/loss": 2.208266997337341, "train/z_loss": 0.001521802821662277, "train/perplexity": 9.099932512962534, "train/grad_norm": 0.1904296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024739.174230048, "perf/iters_per_sec": 0.9654708739423981, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0357640266418457, "data/tokens_consumed": 51717865472, "data/tokens_consumed_B": 51.717865472, "train/loss_slope": -2.135086717671403e-05} {"step": 24670, "timestamp": 1778221202.4294713, "train/loss": 2.149580204486847, "train/z_loss": 0.0015102882287465035, "train/perplexity": 8.581255268483247, "train/grad_norm": 0.279296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025161.938650721, "perf/iters_per_sec": 0.9656724637273412, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0355478048324585, "data/tokens_consumed": 51738836992, "data/tokens_consumed_B": 51.738836992, "train/loss_slope": -2.2601751611642664e-05} {"step": 24675, "timestamp": 1778221208.2063854, "eos/sharpness": 51.51216983795165, "eos/L0_probe": 2.0365889072418213, "eos/L_plus": 2.3917407989501953, "eos/L_minus": 2.196558713912964, "eos/grad_norm": 0.13583587110042572, "eos/embed_grad_frac": 0.12290690094232559, "eos/time_s": 0.6034867763519287} {"step": 24675, "timestamp": 1778221209.584776, "geo/rankme_last": 441.099365234375, "geo/layer_0/stable_rank_q_proj": 17.87659454345703, "geo/layer_0/stable_rank_k_proj": 15.764498710632324, "geo/layer_0/stable_rank_o_proj": 51.243751525878906, "geo/layer_0/stable_rank_gate_proj": 147.49063110351562, "geo/layer_0/stable_rank_down_proj": 50.60850524902344, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05358274281024933, "geo/layer_0/attn_entropy_mean": 6.235393524169922, "geo/layer_0/attn_entropy_std": 0.3351554572582245, "geo/layer_7/stable_rank_q_proj": 42.4480094909668, "geo/layer_7/stable_rank_k_proj": 41.904823303222656, "geo/layer_7/stable_rank_o_proj": 108.29875183105469, "geo/layer_7/stable_rank_gate_proj": 100.3038558959961, "geo/layer_7/stable_rank_down_proj": 148.65936279296875, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5455173850059509, "geo/layer_7/attn_entropy_mean": 4.666735649108887, "geo/layer_7/attn_entropy_std": 0.8539784550666809, "geo/layer_14/stable_rank_q_proj": 56.941585540771484, "geo/layer_14/stable_rank_k_proj": 34.78870391845703, "geo/layer_14/stable_rank_o_proj": 54.30316162109375, "geo/layer_14/stable_rank_gate_proj": 83.86290740966797, "geo/layer_14/stable_rank_down_proj": 135.62142944335938, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38551533222198486, "geo/layer_14/attn_entropy_mean": 5.483824729919434, "geo/layer_14/attn_entropy_std": 0.4335554838180542, "geo/layer_21/stable_rank_q_proj": 46.37237548828125, "geo/layer_21/stable_rank_k_proj": 31.728286743164062, "geo/layer_21/stable_rank_o_proj": 81.60954284667969, "geo/layer_21/stable_rank_gate_proj": 83.24717712402344, "geo/layer_21/stable_rank_down_proj": 59.03459548950195, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15600645542144775, "geo/layer_21/attn_entropy_mean": 5.73219108581543, "geo/layer_21/attn_entropy_std": 0.2915828227996826, "geo/layer_27/stable_rank_q_proj": 41.51230239868164, "geo/layer_27/stable_rank_k_proj": 31.56884002685547, "geo/layer_27/stable_rank_o_proj": 119.56153106689453, "geo/layer_27/stable_rank_gate_proj": 90.3003921508789, "geo/layer_27/stable_rank_down_proj": 136.07614135742188, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07765576988458633, "geo/layer_27/attn_entropy_mean": 4.361420631408691, "geo/layer_27/attn_entropy_std": 0.5983635187149048, "attnres/final_alpha/block_0": 0.24233442544937134, "attnres/block_norm/0": 1.6506876945495605, "attnres/final_alpha/block_1": 0.006469452753663063, "attnres/block_norm/1": 32242.697265625, "attnres/final_alpha/block_2": 0.013348989188671112, "attnres/block_norm/2": 22672.90234375, "attnres/final_alpha/block_3": 0.015267952345311642, "attnres/block_norm/3": 34120.765625, "attnres/final_alpha/block_4": 0.019046835601329803, "attnres/block_norm/4": 10319.771484375, "attnres/final_alpha/block_5": 0.5732930898666382, "attnres/block_norm/5": 5313.15869140625, "attnres/final_alpha/block_6": 0.13023923337459564, "attnres/block_norm/6": 22663.583984375, "geo/tier1_time_s": 1.3589699268341064, "geo/step": 24675.0, "geo/rankme_slope": -9.113262492496999e-05} {"step": 24680, "timestamp": 1778221214.767481, "train/loss": 2.183216381072998, "train/z_loss": 0.0015233867103233933, "train/perplexity": 8.874805157828543, "train/grad_norm": 0.08544921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1700586.398795048, "perf/iters_per_sec": 0.8109027856803169, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2331934452056885, "data/tokens_consumed": 51759808512, "data/tokens_consumed_B": 51.759808512, "train/loss_slope": -2.313869095335058e-05} {"step": 24690, "timestamp": 1778221225.1291556, "train/loss": 2.2270384311676024, "train/z_loss": 0.001501876104157418, "train/perplexity": 9.272364629776277, "train/grad_norm": 0.197265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024841.2946609925, "perf/iters_per_sec": 0.9655195687584841, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0357117891311645, "data/tokens_consumed": 51780780032, "data/tokens_consumed_B": 51.780780032, "train/loss_slope": -2.108462580753241e-05} {"step": 24700, "timestamp": 1778221235.482953, "grad/layer_0/attn": 0.002447008155286312, "grad/layer_0/mlp": 0.0025038528256118298, "grad/layer_0/attn_mlp_ratio": 0.9772970809331274, "grad/layer_4/attn": 0.0023602915462106466, "grad/layer_4/mlp": 0.0025073890574276447, "grad/layer_4/attn_mlp_ratio": 0.9413343513984455, "grad/layer_8/attn": 0.0050278217531740665, "grad/layer_8/mlp": 0.0035372423008084297, "grad/layer_8/attn_mlp_ratio": 1.42139587380977, "grad/layer_12/attn": 0.004325789399445057, "grad/layer_12/mlp": 0.00614565797150135, "grad/layer_12/attn_mlp_ratio": 0.7038773308109364, "grad/layer_16/attn": 0.003788983216509223, "grad/layer_16/mlp": 0.0044130380265414715, "grad/layer_16/attn_mlp_ratio": 0.8585883710636952, "grad/layer_20/attn": 0.0035320131573826075, "grad/layer_20/mlp": 0.00578315882012248, "grad/layer_20/attn_mlp_ratio": 0.6107411548198997, "grad/layer_24/attn": 0.008266429416835308, "grad/layer_24/mlp": 0.00946716032922268, "grad/layer_24/attn_mlp_ratio": 0.8731688322634711, "grad/layer_27/attn": 0.004402983468025923, "grad/layer_27/mlp": 0.007609083317220211, "grad/layer_27/attn_mlp_ratio": 0.5786483373360681} {"step": 24700, "timestamp": 1778221235.49869, "train/loss": 2.2297881603240968, "train/z_loss": 0.0014979470521211623, "train/perplexity": 9.29789620752325, "train/grad_norm": 0.1416015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023408.9613636031, "perf/iters_per_sec": 0.9648365790193573, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0364449501037598, "data/tokens_consumed": 51801751552, "data/tokens_consumed_B": 51.801751552, "train/loss_slope": -1.9617637363311396e-05} {"step": 24710, "timestamp": 1778221245.8603024, "train/loss": 2.2033632040023803, "train/z_loss": 0.001519651326816529, "train/perplexity": 9.055417559827042, "train/grad_norm": 0.30078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025157.0895411344, "perf/iters_per_sec": 0.9656701514917061, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0355502843856812, "data/tokens_consumed": 51822723072, "data/tokens_consumed_B": 51.822723072, "train/loss_slope": -1.815997063487708e-05} {"step": 24720, "timestamp": 1778221256.22094, "train/loss": 2.1707597017288207, "train/z_loss": 0.0015265961294062436, "train/perplexity": 8.764940252315673, "train/grad_norm": 0.11962890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025138.1595855998, "perf/iters_per_sec": 0.965661124985504, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0355599641799926, "data/tokens_consumed": 51843694592, "data/tokens_consumed_B": 51.843694592, "train/loss_slope": -2.030944356394719e-05} {"step": 24730, "timestamp": 1778221266.5819113, "train/loss": 2.2229320287704466, "train/z_loss": 0.0015130993677303195, "train/perplexity": 9.23436664034591, "train/grad_norm": 0.1513671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025037.9206460125, "perf/iters_per_sec": 0.96561332733441, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0356112241744995, "data/tokens_consumed": 51864666112, "data/tokens_consumed_B": 51.864666112, "train/loss_slope": -1.768259089378638e-05} {"step": 24740, "timestamp": 1778221276.9436915, "train/loss": 2.2169842958450316, "train/z_loss": 0.0015091591631062329, "train/perplexity": 9.179606105782788, "train/grad_norm": 0.1552734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024935.0811453764, "perf/iters_per_sec": 0.9655642896391756, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0356638193130494, "data/tokens_consumed": 51885637632, "data/tokens_consumed_B": 51.885637632, "train/loss_slope": -1.632971838720207e-05} {"step": 24750, "timestamp": 1778221287.3093631, "grad/layer_0/attn": 0.0032090998720377684, "grad/layer_0/mlp": 0.00293336040340364, "grad/layer_0/attn_mlp_ratio": 1.0940011868006616, "grad/layer_4/attn": 0.0032129394821822643, "grad/layer_4/mlp": 0.0024375764187425375, "grad/layer_4/attn_mlp_ratio": 1.318087640522444, "grad/layer_8/attn": 0.010950000956654549, "grad/layer_8/mlp": 0.003812953596934676, "grad/layer_8/attn_mlp_ratio": 2.871789648391879, "grad/layer_12/attn": 0.005837719421833754, "grad/layer_12/mlp": 0.006526496261358261, "grad/layer_12/attn_mlp_ratio": 0.8944645179605735, "grad/layer_16/attn": 0.006064657121896744, "grad/layer_16/mlp": 0.0043336511589586735, "grad/layer_16/attn_mlp_ratio": 1.3994335860227969, "grad/layer_20/attn": 0.0037833056412637234, "grad/layer_20/mlp": 0.006003136280924082, "grad/layer_20/attn_mlp_ratio": 0.630221504426549, "grad/layer_24/attn": 0.01010469626635313, "grad/layer_24/mlp": 0.009242188185453415, "grad/layer_24/attn_mlp_ratio": 1.0933229181510233, "grad/layer_27/attn": 0.004915263503789902, "grad/layer_27/mlp": 0.008052410557866096, "grad/layer_27/attn_mlp_ratio": 0.6104089461692278} {"step": 24750, "timestamp": 1778221287.9152567, "eos/sharpness": 22.15654850006103, "eos/L0_probe": 2.0343782901763916, "eos/L_plus": 2.1610074043273926, "eos/L_minus": 2.129314661026001, "eos/grad_norm": 0.12320301681756973, "eos/embed_grad_frac": 0.15391148626804352, "eos/time_s": 0.6029839515686035} {"step": 24750, "timestamp": 1778221287.934072, "train/loss": 2.2036485195159914, "train/z_loss": 0.0015158134745433926, "train/perplexity": 9.058001579552016, "train/grad_norm": 0.123046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1909341.9933988173, "perf/iters_per_sec": 0.9104452101701819, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.098363733291626, "data/tokens_consumed": 51906609152, "data/tokens_consumed_B": 51.906609152, "train/loss_slope": -1.827615836773271e-05} {"step": 24750, "timestamp": 1778221289.293223, "geo/rankme_last": 440.64337158203125, "geo/layer_0/stable_rank_q_proj": 17.907791137695312, "geo/layer_0/stable_rank_k_proj": 15.774845123291016, "geo/layer_0/stable_rank_o_proj": 51.23273849487305, "geo/layer_0/stable_rank_gate_proj": 147.56700134277344, "geo/layer_0/stable_rank_down_proj": 50.68928146362305, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.04985319450497627, "geo/layer_0/attn_entropy_mean": 6.239904880523682, "geo/layer_0/attn_entropy_std": 0.3291735053062439, "geo/layer_7/stable_rank_q_proj": 42.68149185180664, "geo/layer_7/stable_rank_k_proj": 42.010589599609375, "geo/layer_7/stable_rank_o_proj": 108.13796997070312, "geo/layer_7/stable_rank_gate_proj": 100.2403564453125, "geo/layer_7/stable_rank_down_proj": 148.167724609375, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5377436280250549, "geo/layer_7/attn_entropy_mean": 4.690581321716309, "geo/layer_7/attn_entropy_std": 0.8255394101142883, "geo/layer_14/stable_rank_q_proj": 56.82978057861328, "geo/layer_14/stable_rank_k_proj": 34.80924606323242, "geo/layer_14/stable_rank_o_proj": 54.245994567871094, "geo/layer_14/stable_rank_gate_proj": 83.80213165283203, "geo/layer_14/stable_rank_down_proj": 135.62612915039062, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3732250928878784, "geo/layer_14/attn_entropy_mean": 5.481091499328613, "geo/layer_14/attn_entropy_std": 0.46526843309402466, "geo/layer_21/stable_rank_q_proj": 46.294395446777344, "geo/layer_21/stable_rank_k_proj": 31.717472076416016, "geo/layer_21/stable_rank_o_proj": 81.61973571777344, "geo/layer_21/stable_rank_gate_proj": 83.04768371582031, "geo/layer_21/stable_rank_down_proj": 59.045372009277344, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1545751839876175, "geo/layer_21/attn_entropy_mean": 5.725637435913086, "geo/layer_21/attn_entropy_std": 0.298152893781662, "geo/layer_27/stable_rank_q_proj": 41.50445556640625, "geo/layer_27/stable_rank_k_proj": 31.566455841064453, "geo/layer_27/stable_rank_o_proj": 119.69206237792969, "geo/layer_27/stable_rank_gate_proj": 90.15653228759766, "geo/layer_27/stable_rank_down_proj": 135.85931396484375, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07757141441106796, "geo/layer_27/attn_entropy_mean": 4.351667404174805, "geo/layer_27/attn_entropy_std": 0.5856766700744629, "attnres/final_alpha/block_0": 0.24150632321834564, "attnres/block_norm/0": 1.6510682106018066, "attnres/final_alpha/block_1": 0.00631718710064888, "attnres/block_norm/1": 32288.224609375, "attnres/final_alpha/block_2": 0.013321641832590103, "attnres/block_norm/2": 22645.3671875, "attnres/final_alpha/block_3": 0.015355873852968216, "attnres/block_norm/3": 34494.08203125, "attnres/final_alpha/block_4": 0.019391674548387527, "attnres/block_norm/4": 10331.955078125, "attnres/final_alpha/block_5": 0.5759996175765991, "attnres/block_norm/5": 5274.5244140625, "attnres/final_alpha/block_6": 0.12810763716697693, "attnres/block_norm/6": 22887.134765625, "geo/tier1_time_s": 1.3554556369781494, "geo/step": 24750.0, "geo/rankme_slope": -7.911774084633854e-05} {"step": 24760, "timestamp": 1778221299.6570566, "train/loss": 2.196700119972229, "train/z_loss": 0.0015191306709311902, "train/perplexity": 8.995281121257602, "train/grad_norm": 0.1474609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1789467.14870447, "perf/iters_per_sec": 0.8532844298860883, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.171942162513733, "data/tokens_consumed": 51927580672, "data/tokens_consumed_B": 51.927580672, "train/loss_slope": -1.7453986740741843e-05} {"step": 24770, "timestamp": 1778221310.0256157, "train/loss": 2.167827272415161, "train/z_loss": 0.0015211788704618812, "train/perplexity": 8.73927533325915, "train/grad_norm": 0.1259765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023731.7126582598, "perf/iters_per_sec": 0.9649904788295077, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0362796545028687, "data/tokens_consumed": 51948552192, "data/tokens_consumed_B": 51.948552192, "train/loss_slope": -1.67387383570015e-05} {"step": 24780, "timestamp": 1778221320.3854043, "train/loss": 2.1987916946411135, "train/z_loss": 0.0015235453960485757, "train/perplexity": 9.014115112874146, "train/grad_norm": 0.259765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025337.687918657, "perf/iters_per_sec": 0.96575626750882, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354579448699952, "data/tokens_consumed": 51969523712, "data/tokens_consumed_B": 51.969523712, "train/loss_slope": -1.7220525608526322e-05} {"step": 24790, "timestamp": 1778221330.7505682, "train/loss": 2.2105005860328673, "train/z_loss": 0.001509005983825773, "train/perplexity": 9.120280735674013, "train/grad_norm": 0.1884765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024640.6058573965, "perf/iters_per_sec": 0.9654238728796942, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0358144521713257, "data/tokens_consumed": 51990495232, "data/tokens_consumed_B": 51.990495232, "train/loss_slope": -1.7891409459358765e-05} {"step": 24800, "timestamp": 1778221341.0993145, "grad/layer_0/attn": 0.0026673241518437862, "grad/layer_0/mlp": 0.002644917694851756, "grad/layer_0/attn_mlp_ratio": 1.0084714757621742, "grad/layer_4/attn": 0.0017421329393982887, "grad/layer_4/mlp": 0.0024704253301024437, "grad/layer_4/attn_mlp_ratio": 0.7051955174077227, "grad/layer_8/attn": 0.006071522831916809, "grad/layer_8/mlp": 0.0037289124447852373, "grad/layer_8/attn_mlp_ratio": 1.628228809068648, "grad/layer_12/attn": 0.006800292991101742, "grad/layer_12/mlp": 0.0064036124385893345, "grad/layer_12/attn_mlp_ratio": 1.0619463545181629, "grad/layer_16/attn": 0.004056377336382866, "grad/layer_16/mlp": 0.004508036654442549, "grad/layer_16/attn_mlp_ratio": 0.899810173993239, "grad/layer_20/attn": 0.0037059274036437273, "grad/layer_20/mlp": 0.00552584882825613, "grad/layer_20/attn_mlp_ratio": 0.6706530438596806, "grad/layer_24/attn": 0.015297538600862026, "grad/layer_24/mlp": 0.009466860443353653, "grad/layer_24/attn_mlp_ratio": 1.6159040825421143, "grad/layer_27/attn": 0.00455224746838212, "grad/layer_27/mlp": 0.008107410743832588, "grad/layer_27/attn_mlp_ratio": 0.5614921404710942} {"step": 24800, "timestamp": 1778221341.1149719, "train/loss": 2.18739857673645, "train/z_loss": 0.0015280302497558296, "train/perplexity": 8.911999051331213, "train/grad_norm": 0.1552734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024443.684272704, "perf/iters_per_sec": 0.9653299733508606, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035915207862854, "data/tokens_consumed": 52011466752, "data/tokens_consumed_B": 52.011466752, "train/loss_slope": -1.866987762313834e-05} {"step": 24810, "timestamp": 1778221351.4794095, "train/loss": 2.193744444847107, "train/z_loss": 0.001513830805197358, "train/perplexity": 8.968733245379084, "train/grad_norm": 0.11474609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024364.2461580639, "perf/iters_per_sec": 0.9652920943060226, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0359558582305908, "data/tokens_consumed": 52032438272, "data/tokens_consumed_B": 52.032438272, "train/loss_slope": -1.6754039100008386e-05} {"step": 24820, "timestamp": 1778221361.8431742, "train/loss": 2.209606742858887, "train/z_loss": 0.0015218662680126726, "train/perplexity": 9.112132277258134, "train/grad_norm": 0.11279296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024545.2623225686, "perf/iters_per_sec": 0.9653784095394938, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0358632326126098, "data/tokens_consumed": 52053409792, "data/tokens_consumed_B": 52.053409792, "train/loss_slope": -1.5200293690028856e-05} {"step": 24825, "timestamp": 1778221367.6133428, "eos/sharpness": 44.86093521118163, "eos/L0_probe": 2.030179977416992, "eos/L_plus": 2.334439754486084, "eos/L_minus": 2.174529552459717, "eos/grad_norm": 0.12688173353672028, "eos/embed_grad_frac": 0.14715032279491425, "eos/time_s": 0.6007940769195557} {"step": 24825, "timestamp": 1778221368.9905376, "geo/rankme_last": 440.001708984375, "geo/layer_0/stable_rank_q_proj": 17.901098251342773, "geo/layer_0/stable_rank_k_proj": 15.77502727508545, "geo/layer_0/stable_rank_o_proj": 51.05281448364258, "geo/layer_0/stable_rank_gate_proj": 147.37876892089844, "geo/layer_0/stable_rank_down_proj": 50.533287048339844, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05598549544811249, "geo/layer_0/attn_entropy_mean": 6.235522270202637, "geo/layer_0/attn_entropy_std": 0.3326561152935028, "geo/layer_7/stable_rank_q_proj": 42.60588073730469, "geo/layer_7/stable_rank_k_proj": 41.95120620727539, "geo/layer_7/stable_rank_o_proj": 108.34923553466797, "geo/layer_7/stable_rank_gate_proj": 100.01642608642578, "geo/layer_7/stable_rank_down_proj": 148.2166290283203, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5454225540161133, "geo/layer_7/attn_entropy_mean": 4.671499252319336, "geo/layer_7/attn_entropy_std": 0.8588883876800537, "geo/layer_14/stable_rank_q_proj": 56.77228927612305, "geo/layer_14/stable_rank_k_proj": 34.79051208496094, "geo/layer_14/stable_rank_o_proj": 54.298095703125, "geo/layer_14/stable_rank_gate_proj": 83.76773834228516, "geo/layer_14/stable_rank_down_proj": 136.0121307373047, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.36008358001708984, "geo/layer_14/attn_entropy_mean": 5.51595401763916, "geo/layer_14/attn_entropy_std": 0.4382118880748749, "geo/layer_21/stable_rank_q_proj": 46.3085823059082, "geo/layer_21/stable_rank_k_proj": 31.69281768798828, "geo/layer_21/stable_rank_o_proj": 81.68692779541016, "geo/layer_21/stable_rank_gate_proj": 82.98602294921875, "geo/layer_21/stable_rank_down_proj": 58.93036651611328, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1527591347694397, "geo/layer_21/attn_entropy_mean": 5.746336936950684, "geo/layer_21/attn_entropy_std": 0.2944962680339813, "geo/layer_27/stable_rank_q_proj": 41.57996368408203, "geo/layer_27/stable_rank_k_proj": 31.585784912109375, "geo/layer_27/stable_rank_o_proj": 119.2470703125, "geo/layer_27/stable_rank_gate_proj": 90.29230499267578, "geo/layer_27/stable_rank_down_proj": 135.76779174804688, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08040514588356018, "geo/layer_27/attn_entropy_mean": 4.3415069580078125, "geo/layer_27/attn_entropy_std": 0.5658993721008301, "attnres/final_alpha/block_0": 0.24021127820014954, "attnres/block_norm/0": 1.6516568660736084, "attnres/final_alpha/block_1": 0.006268410477787256, "attnres/block_norm/1": 32441.734375, "attnres/final_alpha/block_2": 0.013197360560297966, "attnres/block_norm/2": 22681.30078125, "attnres/final_alpha/block_3": 0.015130866318941116, "attnres/block_norm/3": 34566.3515625, "attnres/final_alpha/block_4": 0.019147053360939026, "attnres/block_norm/4": 10323.5400390625, "attnres/final_alpha/block_5": 0.5785168409347534, "attnres/block_norm/5": 5318.54345703125, "attnres/final_alpha/block_6": 0.1275281310081482, "attnres/block_norm/6": 22937.58203125, "geo/tier1_time_s": 1.3569936752319336, "geo/step": 24825.0, "geo/rankme_slope": -9.860508656587636e-05} {"step": 24830, "timestamp": 1778221374.1758215, "train/loss": 2.1987937688827515, "train/z_loss": 0.001511506934184581, "train/perplexity": 9.014133810346433, "train/grad_norm": 0.134765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1701286.5973819417, "perf/iters_per_sec": 0.8112366663846692, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2326858997344972, "data/tokens_consumed": 52074381312, "data/tokens_consumed_B": 52.074381312, "train/loss_slope": -1.707283737349724e-05} {"step": 24840, "timestamp": 1778221384.538076, "train/loss": 2.2331155061721804, "train/z_loss": 0.0015016914927400648, "train/perplexity": 9.328885050574016, "train/grad_norm": 0.16015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024886.788411352, "perf/iters_per_sec": 0.9655412618691216, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0356885194778442, "data/tokens_consumed": 52095352832, "data/tokens_consumed_B": 52.095352832, "train/loss_slope": -1.258656975030546e-05} {"step": 24850, "timestamp": 1778221394.8939853, "grad/layer_0/attn": 0.0033021753188222647, "grad/layer_0/mlp": 0.0030341739766299725, "grad/layer_0/attn_mlp_ratio": 1.08832757627735, "grad/layer_4/attn": 0.0016904447693377733, "grad/layer_4/mlp": 0.0025272571947425604, "grad/layer_4/attn_mlp_ratio": 0.6688851083165914, "grad/layer_8/attn": 0.005399438552558422, "grad/layer_8/mlp": 0.003951114136725664, "grad/layer_8/attn_mlp_ratio": 1.3665609823098919, "grad/layer_12/attn": 0.004711231216788292, "grad/layer_12/mlp": 0.007202183827757835, "grad/layer_12/attn_mlp_ratio": 0.6541392533215935, "grad/layer_16/attn": 0.004094647243618965, "grad/layer_16/mlp": 0.0049058822914958, "grad/layer_16/attn_mlp_ratio": 0.8346403188786005, "grad/layer_20/attn": 0.004161551129072905, "grad/layer_20/mlp": 0.006918018683791161, "grad/layer_20/attn_mlp_ratio": 0.6015524471866673, "grad/layer_24/attn": 0.014201831072568893, "grad/layer_24/mlp": 0.011719133704900742, "grad/layer_24/attn_mlp_ratio": 1.2118498951372951, "grad/layer_27/attn": 0.010194078087806702, "grad/layer_27/mlp": 0.010312973521649837, "grad/layer_27/attn_mlp_ratio": 0.9884712655917649} {"step": 24850, "timestamp": 1778221394.9100635, "train/loss": 2.2046642303466797, "train/z_loss": 0.0015068502863869071, "train/perplexity": 9.067206563870519, "train/grad_norm": 0.2041015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023143.7799700303, "perf/iters_per_sec": 0.9647101306772377, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0365808010101318, "data/tokens_consumed": 52116324352, "data/tokens_consumed_B": 52.116324352, "train/loss_slope": -1.3298541691937788e-05} {"step": 24860, "timestamp": 1778221405.2749798, "train/loss": 2.1952962636947633, "train/z_loss": 0.001512715278659016, "train/perplexity": 8.982661899247484, "train/grad_norm": 0.138671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024316.260170954, "perf/iters_per_sec": 0.9652692128042956, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0359804153442382, "data/tokens_consumed": 52137295872, "data/tokens_consumed_B": 52.137295872, "train/loss_slope": -1.4760688293312864e-05} {"step": 24870, "timestamp": 1778221415.6387613, "train/loss": 2.176166272163391, "train/z_loss": 0.0015229358337819575, "train/perplexity": 8.81245685432597, "train/grad_norm": 0.1494140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024492.7479938262, "perf/iters_per_sec": 0.9653533687562114, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0358901023864746, "data/tokens_consumed": 52158267392, "data/tokens_consumed_B": 52.158267392, "train/loss_slope": -1.3668814293443387e-05} {"step": 24880, "timestamp": 1778221426.0029485, "train/loss": 2.166592240333557, "train/z_loss": 0.0015240543405525386, "train/perplexity": 8.728488710136482, "train/grad_norm": 0.1318359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024737.496390223, "perf/iters_per_sec": 0.965470073886024, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0357648849487304, "data/tokens_consumed": 52179238912, "data/tokens_consumed_B": 52.179238912, "train/loss_slope": -1.4332421041271807e-05} {"step": 24890, "timestamp": 1778221436.3686082, "train/loss": 2.209460496902466, "train/z_loss": 0.0014998631202615797, "train/perplexity": 9.110799762198056, "train/grad_norm": 0.142578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024313.5115279593, "perf/iters_per_sec": 0.965267902149181, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035981822013855, "data/tokens_consumed": 52200210432, "data/tokens_consumed_B": 52.200210432, "train/loss_slope": -1.5205387943255281e-05} {"step": 24900, "timestamp": 1778221446.723336, "grad/layer_0/attn": 0.0029681778978556395, "grad/layer_0/mlp": 0.003094841493293643, "grad/layer_0/attn_mlp_ratio": 0.959072640191835, "grad/layer_4/attn": 0.0020148225594311953, "grad/layer_4/mlp": 0.002626947592943907, "grad/layer_4/attn_mlp_ratio": 0.7669823669664586, "grad/layer_8/attn": 0.003962661139667034, "grad/layer_8/mlp": 0.00398470601066947, "grad/layer_8/attn_mlp_ratio": 0.9944676042874511, "grad/layer_12/attn": 0.004517780616879463, "grad/layer_12/mlp": 0.006248694844543934, "grad/layer_12/attn_mlp_ratio": 0.7229958666527924, "grad/layer_16/attn": 0.003882336663082242, "grad/layer_16/mlp": 0.0045481412671506405, "grad/layer_16/attn_mlp_ratio": 0.8536094966446662, "grad/layer_20/attn": 0.0052641890943050385, "grad/layer_20/mlp": 0.0061124153435230255, "grad/layer_20/attn_mlp_ratio": 0.8612289434421865, "grad/layer_24/attn": 0.0069994330406188965, "grad/layer_24/mlp": 0.010026685893535614, "grad/layer_24/attn_mlp_ratio": 0.6980804071386656, "grad/layer_27/attn": 0.0049088564701378345, "grad/layer_27/mlp": 0.008022683672606945, "grad/layer_27/attn_mlp_ratio": 0.6118721127833657} {"step": 24900, "timestamp": 1778221447.3359137, "eos/sharpness": 3.706955909729003, "eos/L0_probe": 2.032586097717285, "eos/L_plus": 2.0567803382873535, "eos/L_minus": 2.045461416244507, "eos/grad_norm": 0.09223700314760208, "eos/embed_grad_frac": 0.3381265103816986, "eos/time_s": 0.6097593307495117} {"step": 24900, "timestamp": 1778221447.3563385, "train/loss": 2.246290349960327, "train/z_loss": 0.0015163345378823578, "train/perplexity": 9.452604858840756, "train/grad_norm": 0.09228515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1909896.5712069375, "perf/iters_per_sec": 0.9107096534762085, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0980448007583619, "data/tokens_consumed": 52221181952, "data/tokens_consumed_B": 52.221181952, "train/loss_slope": -1.1119570239971585e-05} {"step": 24900, "timestamp": 1778221448.718561, "geo/rankme_last": 439.971923828125, "geo/layer_0/stable_rank_q_proj": 17.850513458251953, "geo/layer_0/stable_rank_k_proj": 15.732734680175781, "geo/layer_0/stable_rank_o_proj": 51.0667724609375, "geo/layer_0/stable_rank_gate_proj": 147.53932189941406, "geo/layer_0/stable_rank_down_proj": 50.677181243896484, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.04694538936018944, "geo/layer_0/attn_entropy_mean": 6.235445022583008, "geo/layer_0/attn_entropy_std": 0.3359922766685486, "geo/layer_7/stable_rank_q_proj": 42.741695404052734, "geo/layer_7/stable_rank_k_proj": 41.916446685791016, "geo/layer_7/stable_rank_o_proj": 108.21324920654297, "geo/layer_7/stable_rank_gate_proj": 99.72100830078125, "geo/layer_7/stable_rank_down_proj": 148.1482391357422, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5397933721542358, "geo/layer_7/attn_entropy_mean": 4.672615051269531, "geo/layer_7/attn_entropy_std": 0.8708582520484924, "geo/layer_14/stable_rank_q_proj": 56.740169525146484, "geo/layer_14/stable_rank_k_proj": 34.788761138916016, "geo/layer_14/stable_rank_o_proj": 54.3376350402832, "geo/layer_14/stable_rank_gate_proj": 83.82722473144531, "geo/layer_14/stable_rank_down_proj": 136.02088928222656, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3915150463581085, "geo/layer_14/attn_entropy_mean": 5.514096260070801, "geo/layer_14/attn_entropy_std": 0.44184374809265137, "geo/layer_21/stable_rank_q_proj": 46.26768112182617, "geo/layer_21/stable_rank_k_proj": 31.63334846496582, "geo/layer_21/stable_rank_o_proj": 81.70471954345703, "geo/layer_21/stable_rank_gate_proj": 82.92172241210938, "geo/layer_21/stable_rank_down_proj": 58.9207763671875, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15103983879089355, "geo/layer_21/attn_entropy_mean": 5.72055721282959, "geo/layer_21/attn_entropy_std": 0.29129675030708313, "geo/layer_27/stable_rank_q_proj": 41.586421966552734, "geo/layer_27/stable_rank_k_proj": 31.510093688964844, "geo/layer_27/stable_rank_o_proj": 119.22779846191406, "geo/layer_27/stable_rank_gate_proj": 90.22811126708984, "geo/layer_27/stable_rank_down_proj": 135.82504272460938, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08273260295391083, "geo/layer_27/attn_entropy_mean": 4.3334784507751465, "geo/layer_27/attn_entropy_std": 0.5804125666618347, "attnres/final_alpha/block_0": 0.24035640060901642, "attnres/block_norm/0": 1.6524653434753418, "attnres/final_alpha/block_1": 0.006313025020062923, "attnres/block_norm/1": 32303.8984375, "attnres/final_alpha/block_2": 0.013296397402882576, "attnres/block_norm/2": 22695.599609375, "attnres/final_alpha/block_3": 0.015332093462347984, "attnres/block_norm/3": 34530.828125, "attnres/final_alpha/block_4": 0.019220303744077682, "attnres/block_norm/4": 10315.525390625, "attnres/final_alpha/block_5": 0.5761168003082275, "attnres/block_norm/5": 5316.6953125, "attnres/final_alpha/block_6": 0.1293649822473526, "attnres/block_norm/6": 23059.787109375, "geo/tier1_time_s": 1.3580622673034668, "geo/step": 24900.0, "geo/rankme_slope": -0.00011644552352190877} {"step": 24910, "timestamp": 1778221459.0796065, "train/loss": 2.193050265312195, "train/z_loss": 0.0015140788396820425, "train/perplexity": 8.962509494756143, "train/grad_norm": 0.2021484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1789471.626495732, "perf/iters_per_sec": 0.8532865650633488, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.17193922996521, "data/tokens_consumed": 52242153472, "data/tokens_consumed_B": 52.242153472, "train/loss_slope": -9.940107033984535e-06} {"step": 24920, "timestamp": 1778221469.4491367, "train/loss": 2.2073961734771728, "train/z_loss": 0.0015146164456382393, "train/perplexity": 9.09201152399798, "train/grad_norm": 0.1044921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023547.9091659286, "perf/iters_per_sec": 0.9649028344945567, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036373782157898, "data/tokens_consumed": 52263124992, "data/tokens_consumed_B": 52.263124992, "train/loss_slope": -7.920820627919302e-06} {"step": 24930, "timestamp": 1778221479.8119414, "train/loss": 2.2133336544036863, "train/z_loss": 0.0015031316434033215, "train/perplexity": 9.146155750095051, "train/grad_norm": 0.11279296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025073.7257884403, "perf/iters_per_sec": 0.9656304005567743, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0355929136276245, "data/tokens_consumed": 52284096512, "data/tokens_consumed_B": 52.284096512, "train/loss_slope": -8.410395304075902e-06} {"step": 24940, "timestamp": 1778221490.1723232, "train/loss": 2.2706722736358644, "train/z_loss": 0.0014965614769607781, "train/perplexity": 9.685910207852858, "train/grad_norm": 0.1416015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025316.6560865175, "perf/iters_per_sec": 0.9657462387497509, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354686975479126, "data/tokens_consumed": 52305068032, "data/tokens_consumed_B": 52.305068032, "train/loss_slope": -7.321235620685827e-06} {"step": 24950, "timestamp": 1778221500.5231705, "grad/layer_0/attn": 0.0026371688582003117, "grad/layer_0/mlp": 0.002595318481326103, "grad/layer_0/attn_mlp_ratio": 1.0161252946652988, "grad/layer_4/attn": 0.0023883681278675795, "grad/layer_4/mlp": 0.002546785632148385, "grad/layer_4/attn_mlp_ratio": 0.9377970426482758, "grad/layer_8/attn": 0.011004263535141945, "grad/layer_8/mlp": 0.0037779638078063726, "grad/layer_8/attn_mlp_ratio": 2.912749778367108, "grad/layer_12/attn": 0.004596874583512545, "grad/layer_12/mlp": 0.0069815171882510185, "grad/layer_12/attn_mlp_ratio": 0.6584348922616698, "grad/layer_16/attn": 0.003942182287573814, "grad/layer_16/mlp": 0.00427910964936018, "grad/layer_16/attn_mlp_ratio": 0.9212622527765867, "grad/layer_20/attn": 0.005035609006881714, "grad/layer_20/mlp": 0.005923797842115164, "grad/layer_20/attn_mlp_ratio": 0.8500642756703631, "grad/layer_24/attn": 0.00543178990483284, "grad/layer_24/mlp": 0.0079480716958642, "grad/layer_24/attn_mlp_ratio": 0.6834097683490084, "grad/layer_27/attn": 0.00480106845498085, "grad/layer_27/mlp": 0.006569158751517534, "grad/layer_27/attn_mlp_ratio": 0.7308498033765405} {"step": 24950, "timestamp": 1778221500.5393004, "train/loss": 2.1955597758293153, "train/z_loss": 0.0015119465882889927, "train/perplexity": 8.985029251557846, "train/grad_norm": 0.095703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023943.491204395, "perf/iters_per_sec": 0.9650914627096152, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0361712217330932, "data/tokens_consumed": 52326039552, "data/tokens_consumed_B": 52.326039552, "train/loss_slope": -6.472144132614751e-06} {"step": 24960, "timestamp": 1778221510.9136577, "train/loss": 2.1949485301971436, "train/z_loss": 0.0015103440033271908, "train/perplexity": 8.979538869829765, "train/grad_norm": 0.228515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022892.810584355, "perf/iters_per_sec": 0.9645904591485762, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0367094039916993, "data/tokens_consumed": 52347011072, "data/tokens_consumed_B": 52.347011072, "train/loss_slope": -7.864136421176176e-06} {"step": 24970, "timestamp": 1778221521.2759008, "train/loss": 2.2367470502853393, "train/z_loss": 0.0015116000082343816, "train/perplexity": 9.362824897887918, "train/grad_norm": 0.1376953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024864.041280639, "perf/iters_per_sec": 0.965530415191955, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0357001543045044, "data/tokens_consumed": 52367982592, "data/tokens_consumed_B": 52.367982592, "train/loss_slope": -5.215300442111649e-06} {"step": 24975, "timestamp": 1778221527.0536504, "eos/sharpness": 22.854399681091305, "eos/L0_probe": 2.0340189933776855, "eos/L_plus": 2.182978868484497, "eos/L_minus": 2.113603115081787, "eos/grad_norm": 0.10716169327497482, "eos/embed_grad_frac": 0.2215835303068161, "eos/time_s": 0.6084349155426025} {"step": 24975, "timestamp": 1778221528.4316583, "geo/rankme_last": 440.40350341796875, "geo/layer_0/stable_rank_q_proj": 17.87240219116211, "geo/layer_0/stable_rank_k_proj": 15.714492797851562, "geo/layer_0/stable_rank_o_proj": 51.126197814941406, "geo/layer_0/stable_rank_gate_proj": 147.37542724609375, "geo/layer_0/stable_rank_down_proj": 50.73040008544922, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.052332062274217606, "geo/layer_0/attn_entropy_mean": 6.238797187805176, "geo/layer_0/attn_entropy_std": 0.33047372102737427, "geo/layer_7/stable_rank_q_proj": 42.74958038330078, "geo/layer_7/stable_rank_k_proj": 42.033180236816406, "geo/layer_7/stable_rank_o_proj": 108.0505142211914, "geo/layer_7/stable_rank_gate_proj": 99.69779968261719, "geo/layer_7/stable_rank_down_proj": 148.25758361816406, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5373883247375488, "geo/layer_7/attn_entropy_mean": 4.680332183837891, "geo/layer_7/attn_entropy_std": 0.8045173287391663, "geo/layer_14/stable_rank_q_proj": 56.80868148803711, "geo/layer_14/stable_rank_k_proj": 34.899925231933594, "geo/layer_14/stable_rank_o_proj": 54.359230041503906, "geo/layer_14/stable_rank_gate_proj": 83.70756530761719, "geo/layer_14/stable_rank_down_proj": 136.0026397705078, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3599817752838135, "geo/layer_14/attn_entropy_mean": 5.495429039001465, "geo/layer_14/attn_entropy_std": 0.4438960552215576, "geo/layer_21/stable_rank_q_proj": 46.399837493896484, "geo/layer_21/stable_rank_k_proj": 31.682601928710938, "geo/layer_21/stable_rank_o_proj": 81.4944839477539, "geo/layer_21/stable_rank_gate_proj": 83.00894165039062, "geo/layer_21/stable_rank_down_proj": 58.85333251953125, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15467609465122223, "geo/layer_21/attn_entropy_mean": 5.7456841468811035, "geo/layer_21/attn_entropy_std": 0.28496766090393066, "geo/layer_27/stable_rank_q_proj": 41.40931701660156, "geo/layer_27/stable_rank_k_proj": 31.556798934936523, "geo/layer_27/stable_rank_o_proj": 119.4921646118164, "geo/layer_27/stable_rank_gate_proj": 90.1067886352539, "geo/layer_27/stable_rank_down_proj": 136.09776306152344, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08215624839067459, "geo/layer_27/attn_entropy_mean": 4.319173812866211, "geo/layer_27/attn_entropy_std": 0.5867388248443604, "attnres/final_alpha/block_0": 0.2400665134191513, "attnres/block_norm/0": 1.652782678604126, "attnres/final_alpha/block_1": 0.006340160965919495, "attnres/block_norm/1": 32399.6875, "attnres/final_alpha/block_2": 0.013150685466825962, "attnres/block_norm/2": 22842.13671875, "attnres/final_alpha/block_3": 0.015233932062983513, "attnres/block_norm/3": 34631.52734375, "attnres/final_alpha/block_4": 0.019248146563768387, "attnres/block_norm/4": 10367.6796875, "attnres/final_alpha/block_5": 0.5762524008750916, "attnres/block_norm/5": 5312.9658203125, "attnres/final_alpha/block_6": 0.1297082006931305, "attnres/block_norm/6": 22961.65625, "geo/tier1_time_s": 1.3582830429077148, "geo/step": 24975.0, "geo/rankme_slope": -9.308561315151061e-05} {"step": 24980, "timestamp": 1778221533.6153603, "train/loss": 2.1526206970214843, "train/z_loss": 0.0015086130937561394, "train/perplexity": 8.607386216410104, "train/grad_norm": 0.1728515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1700203.0600486195, "perf/iters_per_sec": 0.8107199955218408, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2334714889526368, "data/tokens_consumed": 52388954112, "data/tokens_consumed_B": 52.388954112, "train/loss_slope": -6.458584188592369e-06} {"step": 24990, "timestamp": 1778221543.9817183, "train/loss": 2.200755500793457, "train/z_loss": 0.001511239691171795, "train/perplexity": 9.031834480597803, "train/grad_norm": 0.171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024096.7649329635, "perf/iters_per_sec": 0.965164549318773, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036092758178711, "data/tokens_consumed": 52409925632, "data/tokens_consumed_B": 52.409925632, "train/loss_slope": -6.739964994481489e-06} {"step": 25000, "timestamp": 1778221554.3335319, "grad/layer_0/attn": 0.0030393891502171755, "grad/layer_0/mlp": 0.002872878685593605, "grad/layer_0/attn_mlp_ratio": 1.0579594118131808, "grad/layer_4/attn": 0.002137810457497835, "grad/layer_4/mlp": 0.0025626379065215588, "grad/layer_4/attn_mlp_ratio": 0.8342225675485198, "grad/layer_8/attn": 0.003645658725872636, "grad/layer_8/mlp": 0.0038497259374707937, "grad/layer_8/attn_mlp_ratio": 0.9469917314603958, "grad/layer_12/attn": 0.00595942884683609, "grad/layer_12/mlp": 0.006902647204697132, "grad/layer_12/attn_mlp_ratio": 0.8633540993440013, "grad/layer_16/attn": 0.00515255331993103, "grad/layer_16/mlp": 0.004833206068724394, "grad/layer_16/attn_mlp_ratio": 1.0660735627776707, "grad/layer_20/attn": 0.004188926424831152, "grad/layer_20/mlp": 0.007052500266581774, "grad/layer_20/attn_mlp_ratio": 0.5939633047990117, "grad/layer_24/attn": 0.023054517805576324, "grad/layer_24/mlp": 0.014130808413028717, "grad/layer_24/attn_mlp_ratio": 1.6315073397477489, "grad/layer_27/attn": 0.013111899606883526, "grad/layer_27/mlp": 0.013402733020484447, "grad/layer_27/attn_mlp_ratio": 0.9783004323829729} {"step": 25000, "timestamp": 1778221554.349403, "train/loss": 2.1787997007369997, "train/z_loss": 0.0015273046912625432, "train/perplexity": 8.835694413807136, "train/grad_norm": 0.302734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023923.0005781557, "perf/iters_per_sec": 0.9650816920176295, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0361817121505736, "data/tokens_consumed": 52430897152, "data/tokens_consumed_B": 52.430897152, "train/loss_slope": -8.83660506029012e-06} {"step": 25000, "timestamp": 1778221561.2975173, "geo/ww_alpha_mean": 7.712870592788469, "geo/ww_alpha_std": 4.099670310244802, "geo/ww_alpha_min": 1.3620191086418594, "geo/ww_alpha_max": 25.612355267532266, "geo/ww_alpha_healthy_frac": 0.15228426395939088, "geo/ww_alpha_by_type/q_proj": 4.079999133189026, "geo/ww_alpha_by_type/k_proj": 4.748638558954566, "geo/ww_alpha_by_type/v_proj": 8.098159473024754, "geo/ww_alpha_by_type/o_proj": 7.881554153408844, "geo/ww_alpha_by_type/gate_proj": 8.916228229785663, "geo/ww_alpha_by_type/up_proj": 11.566701910249567, "geo/ww_alpha_by_type/down_proj": 8.815137071457638, "geo/twonn_id/layer_0": 0.7796710133552551, "geo/twonn_id/layer_7": 2.977771520614624, "geo/twonn_id/layer_14": 4.141265869140625, "geo/twonn_id/layer_21": 6.6548919677734375, "geo/twonn_id/layer_27": 5.949453353881836, "geo/tier2_time_s": 6.940420150756836} {"step": 25000, "timestamp": 1778221561.9196455, "eoc/jacobian_sigma/layer_0/attn": 839.882080078125, "eoc/jacobian_sigma/layer_0/mlp": 5608.00927734375, "eoc/jacobian_sigma/layer_0": 5608.00927734375, "eoc/jacobian_sigma/layer_7/attn": 1.1575905084609985, "eoc/jacobian_sigma/layer_7/mlp": 1.6670130491256714, "eoc/jacobian_sigma/layer_7": 1.6670130491256714, "eoc/jacobian_sigma/layer_14/attn": 1.624687910079956, "eoc/jacobian_sigma/layer_14/mlp": 6.158542633056641, "eoc/jacobian_sigma/layer_14": 6.158542633056641, "eoc/jacobian_sigma/layer_21/attn": 1.0828118324279785, "eoc/jacobian_sigma/layer_21/mlp": 3.870637893676758, "eoc/jacobian_sigma/layer_21": 3.870637893676758, "eoc/jacobian_sigma/layer_27/attn": 3.7023184299468994, "eoc/jacobian_sigma/layer_27/mlp": 24.081022262573242, "eoc/jacobian_sigma/layer_27": 24.081022262573242, "eoc/layer0_sigma": 5608.00927734375, "eoc/sigma_max": 24.081022262573242, "eoc/sigma_min": 1.6670130491256714, "eoc/sigma_mean": 8.944303959608078, "eoc/time_s": 0.6139101982116699} {"step": 25010, "timestamp": 1778221572.3069491, "train/loss": 2.155526947975159, "train/z_loss": 0.0015302753658033908, "train/perplexity": 8.632437826310001, "train/grad_norm": 0.220703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1168248.1353793486, "perf/iters_per_sec": 0.5570641209503883, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.7951254844665527, "data/tokens_consumed": 52451868672, "data/tokens_consumed_B": 52.451868672, "train/loss_slope": -1.0233125785360667e-05} {"step": 25020, "timestamp": 1778221582.6725338, "train/loss": 2.2253662586212157, "train/z_loss": 0.001509089907631278, "train/perplexity": 9.256872592491849, "train/grad_norm": 0.115234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024474.2964008928, "perf/iters_per_sec": 0.9653445703510727, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035899543762207, "data/tokens_consumed": 52472840192, "data/tokens_consumed_B": 52.472840192, "train/loss_slope": -1.1624614741041816e-05} {"step": 25030, "timestamp": 1778221593.0414226, "train/loss": 2.2202751278877257, "train/z_loss": 0.0015058713383041323, "train/perplexity": 9.209864407883021, "train/grad_norm": 0.15625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023629.4247809069, "perf/iters_per_sec": 0.9649417041687521, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0363320350646972, "data/tokens_consumed": 52493811712, "data/tokens_consumed_B": 52.493811712, "train/loss_slope": -9.534614607624699e-06} {"step": 25040, "timestamp": 1778221603.4109144, "train/loss": 2.162596273422241, "train/z_loss": 0.0015238717780448497, "train/perplexity": 8.693679552504687, "train/grad_norm": 0.1240234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023456.4388684295, "perf/iters_per_sec": 0.9648592180578373, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0364206314086915, "data/tokens_consumed": 52514783232, "data/tokens_consumed_B": 52.514783232, "train/loss_slope": -1.112687416774821e-05} {"step": 25050, "timestamp": 1778221613.7586906, "grad/layer_0/attn": 0.0025295361410826445, "grad/layer_0/mlp": 0.0027375833597034216, "grad/layer_0/attn_mlp_ratio": 0.9240032964535386, "grad/layer_4/attn": 0.0016823374899104238, "grad/layer_4/mlp": 0.002642398700118065, "grad/layer_4/attn_mlp_ratio": 0.6366705471692068, "grad/layer_8/attn": 0.004276875406503677, "grad/layer_8/mlp": 0.0039190021343529224, "grad/layer_8/attn_mlp_ratio": 1.0913174197793856, "grad/layer_12/attn": 0.003746544476598501, "grad/layer_12/mlp": 0.006054621189832687, "grad/layer_12/attn_mlp_ratio": 0.6187908867049937, "grad/layer_16/attn": 0.006228105165064335, "grad/layer_16/mlp": 0.004463937133550644, "grad/layer_16/attn_mlp_ratio": 1.3952044661950727, "grad/layer_20/attn": 0.004553642589598894, "grad/layer_20/mlp": 0.00658357935026288, "grad/layer_20/attn_mlp_ratio": 0.6916666874001298, "grad/layer_24/attn": 0.00823745783418417, "grad/layer_24/mlp": 0.00864703580737114, "grad/layer_24/attn_mlp_ratio": 0.9526337027422511, "grad/layer_27/attn": 0.0070630768314003944, "grad/layer_27/mlp": 0.0069905719719827175, "grad/layer_27/attn_mlp_ratio": 1.0103717919894235} {"step": 25050, "timestamp": 1778221614.3781745, "eos/sharpness": 17.6579475402832, "eos/L0_probe": 2.034169912338257, "eos/L_plus": 2.135615110397339, "eos/L_minus": 2.109304189682007, "eos/grad_norm": 0.10384614020586014, "eos/embed_grad_frac": 0.22197647392749786, "eos/time_s": 0.6167111396789551} {"step": 25050, "timestamp": 1778221614.3974643, "train/loss": 2.188383138179779, "train/z_loss": 0.0015112358960323036, "train/perplexity": 8.92077778287129, "train/grad_norm": 0.103515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1909677.9684801823, "perf/iters_per_sec": 0.9106054155732071, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0981704950332642, "data/tokens_consumed": 52535754752, "data/tokens_consumed_B": 52.535754752, "train/loss_slope": -1.2090854826468516e-05} {"step": 25050, "timestamp": 1778221615.7589192, "geo/rankme_last": 439.7403869628906, "geo/layer_0/stable_rank_q_proj": 17.925844192504883, "geo/layer_0/stable_rank_k_proj": 15.720857620239258, "geo/layer_0/stable_rank_o_proj": 51.141231536865234, "geo/layer_0/stable_rank_gate_proj": 147.32164001464844, "geo/layer_0/stable_rank_down_proj": 50.7548828125, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.051920924335718155, "geo/layer_0/attn_entropy_mean": 6.2455267906188965, "geo/layer_0/attn_entropy_std": 0.3290897607803345, "geo/layer_7/stable_rank_q_proj": 42.788360595703125, "geo/layer_7/stable_rank_k_proj": 42.14133071899414, "geo/layer_7/stable_rank_o_proj": 108.18842315673828, "geo/layer_7/stable_rank_gate_proj": 99.92010498046875, "geo/layer_7/stable_rank_down_proj": 148.56240844726562, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5491071343421936, "geo/layer_7/attn_entropy_mean": 4.649389266967773, "geo/layer_7/attn_entropy_std": 0.8398730754852295, "geo/layer_14/stable_rank_q_proj": 56.73506546020508, "geo/layer_14/stable_rank_k_proj": 34.86075973510742, "geo/layer_14/stable_rank_o_proj": 54.328609466552734, "geo/layer_14/stable_rank_gate_proj": 83.63446044921875, "geo/layer_14/stable_rank_down_proj": 135.8323974609375, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3636588752269745, "geo/layer_14/attn_entropy_mean": 5.5254130363464355, "geo/layer_14/attn_entropy_std": 0.42803582549095154, "geo/layer_21/stable_rank_q_proj": 46.1992073059082, "geo/layer_21/stable_rank_k_proj": 31.697412490844727, "geo/layer_21/stable_rank_o_proj": 81.5684814453125, "geo/layer_21/stable_rank_gate_proj": 82.77587127685547, "geo/layer_21/stable_rank_down_proj": 58.855812072753906, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15688066184520721, "geo/layer_21/attn_entropy_mean": 5.74948787689209, "geo/layer_21/attn_entropy_std": 0.2913225293159485, "geo/layer_27/stable_rank_q_proj": 41.40671920776367, "geo/layer_27/stable_rank_k_proj": 31.511245727539062, "geo/layer_27/stable_rank_o_proj": 119.52403259277344, "geo/layer_27/stable_rank_gate_proj": 90.10340118408203, "geo/layer_27/stable_rank_down_proj": 136.05174255371094, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07638553529977798, "geo/layer_27/attn_entropy_mean": 4.364776611328125, "geo/layer_27/attn_entropy_std": 0.5496808290481567, "attnres/final_alpha/block_0": 0.24091842770576477, "attnres/block_norm/0": 1.6533747911453247, "attnres/final_alpha/block_1": 0.006340966559946537, "attnres/block_norm/1": 32468.265625, "attnres/final_alpha/block_2": 0.013113620691001415, "attnres/block_norm/2": 22758.6640625, "attnres/final_alpha/block_3": 0.015159408561885357, "attnres/block_norm/3": 34721.7109375, "attnres/final_alpha/block_4": 0.01922135055065155, "attnres/block_norm/4": 10388.04296875, "attnres/final_alpha/block_5": 0.5765136480331421, "attnres/block_norm/5": 5322.57373046875, "attnres/final_alpha/block_6": 0.1287325769662857, "attnres/block_norm/6": 23238.8125, "geo/tier1_time_s": 1.3569977283477783, "geo/step": 25050.0, "geo/rankme_slope": -0.0001028180412790116} {"step": 25060, "timestamp": 1778221626.1184201, "train/loss": 2.1794797658920286, "train/z_loss": 0.0015049618436023594, "train/perplexity": 8.841705305365736, "train/grad_norm": 0.1630859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1789854.9065288887, "perf/iters_per_sec": 0.8534693272251552, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1716882705688476, "data/tokens_consumed": 52556726272, "data/tokens_consumed_B": 52.556726272, "train/loss_slope": -1.1549937378133408e-05} {"step": 25070, "timestamp": 1778221636.49256, "train/loss": 2.202871870994568, "train/z_loss": 0.0015107539016753434, "train/perplexity": 9.05096942712707, "train/grad_norm": 0.1337890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022554.5610762709, "perf/iters_per_sec": 0.9644291692143778, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0368827819824218, "data/tokens_consumed": 52577697792, "data/tokens_consumed_B": 52.577697792, "train/loss_slope": -9.597015044655549e-06} {"step": 25080, "timestamp": 1778221646.8678129, "train/loss": 2.182550072669983, "train/z_loss": 0.0015269409283064305, "train/perplexity": 8.868893770199094, "train/grad_norm": 0.193359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022400.8233282343, "perf/iters_per_sec": 0.9643558613434955, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036961603164673, "data/tokens_consumed": 52598669312, "data/tokens_consumed_B": 52.598669312, "train/loss_slope": -9.793257884996406e-06} {"step": 25090, "timestamp": 1778221657.2252371, "train/loss": 2.184365081787109, "train/z_loss": 0.001523600541986525, "train/perplexity": 8.885005510305403, "train/grad_norm": 0.10546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025715.5891785373, "perf/iters_per_sec": 0.9659364648716627, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035264778137207, "data/tokens_consumed": 52619640832, "data/tokens_consumed_B": 52.619640832, "train/loss_slope": -7.84320624569248e-06} {"step": 25100, "timestamp": 1778221667.5787835, "grad/layer_0/attn": 0.002726934617385268, "grad/layer_0/mlp": 0.00272186566144228, "grad/layer_0/attn_mlp_ratio": 1.0018622725686157, "grad/layer_4/attn": 0.0018128533847630024, "grad/layer_4/mlp": 0.0025162023957818747, "grad/layer_4/attn_mlp_ratio": 0.7204719762427879, "grad/layer_8/attn": 0.0040123360231518745, "grad/layer_8/mlp": 0.0036508888006210327, "grad/layer_8/attn_mlp_ratio": 1.0990024983968583, "grad/layer_12/attn": 0.004269038327038288, "grad/layer_12/mlp": 0.006243123672902584, "grad/layer_12/attn_mlp_ratio": 0.6837984448694513, "grad/layer_16/attn": 0.005809272173792124, "grad/layer_16/mlp": 0.004277708940207958, "grad/layer_16/attn_mlp_ratio": 1.3580334985826221, "grad/layer_20/attn": 0.0050948793068528175, "grad/layer_20/mlp": 0.006288518663495779, "grad/layer_20/attn_mlp_ratio": 0.8101875017735626, "grad/layer_24/attn": 0.018761275336146355, "grad/layer_24/mlp": 0.012601490132510662, "grad/layer_24/attn_mlp_ratio": 1.4888140204040332, "grad/layer_27/attn": 0.008364911191165447, "grad/layer_27/mlp": 0.01367337629199028, "grad/layer_27/attn_mlp_ratio": 0.611766322476541} {"step": 25100, "timestamp": 1778221667.594666, "train/loss": 2.1609987378120423, "train/z_loss": 0.0015109099098481239, "train/perplexity": 8.6798021775876, "train/grad_norm": 0.212890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023514.159620176, "perf/iters_per_sec": 0.9648867414570694, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0363910675048829, "data/tokens_consumed": 52640612352, "data/tokens_consumed_B": 52.640612352, "train/loss_slope": -9.680123905716368e-06} {"step": 25110, "timestamp": 1778221677.9587774, "train/loss": 2.1958082914352417, "train/z_loss": 0.001515659352298826, "train/perplexity": 8.98726244902728, "train/grad_norm": 0.15625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024637.9961396481, "perf/iters_per_sec": 0.9654226284692994, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0358157873153686, "data/tokens_consumed": 52661583872, "data/tokens_consumed_B": 52.661583872, "train/loss_slope": -1.113555184578058e-05} {"step": 25120, "timestamp": 1778221688.317712, "train/loss": 2.2381457567214964, "train/z_loss": 0.0015074254712089895, "train/perplexity": 9.37592990422503, "train/grad_norm": 0.21484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025439.0292327215, "perf/iters_per_sec": 0.9658045908130272, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354061365127563, "data/tokens_consumed": 52682555392, "data/tokens_consumed_B": 52.682555392, "train/loss_slope": -7.619620041914461e-06} {"step": 25125, "timestamp": 1778221694.0975013, "eos/sharpness": 15.3965950012207, "eos/L0_probe": 2.031867265701294, "eos/L_plus": 2.112252950668335, "eos/L_minus": 2.10544753074646, "eos/grad_norm": 0.11087503284215927, "eos/embed_grad_frac": 0.19817806780338287, "eos/time_s": 0.6070420742034912} {"step": 25125, "timestamp": 1778221695.4734054, "geo/rankme_last": 440.5557861328125, "geo/layer_0/stable_rank_q_proj": 17.953960418701172, "geo/layer_0/stable_rank_k_proj": 15.721479415893555, "geo/layer_0/stable_rank_o_proj": 51.2677001953125, "geo/layer_0/stable_rank_gate_proj": 147.3001251220703, "geo/layer_0/stable_rank_down_proj": 50.78427505493164, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.060280900448560715, "geo/layer_0/attn_entropy_mean": 6.238702774047852, "geo/layer_0/attn_entropy_std": 0.3362545669078827, "geo/layer_7/stable_rank_q_proj": 42.846214294433594, "geo/layer_7/stable_rank_k_proj": 41.9625244140625, "geo/layer_7/stable_rank_o_proj": 108.26396942138672, "geo/layer_7/stable_rank_gate_proj": 99.75375366210938, "geo/layer_7/stable_rank_down_proj": 148.13023376464844, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5418644547462463, "geo/layer_7/attn_entropy_mean": 4.65194845199585, "geo/layer_7/attn_entropy_std": 0.8352773785591125, "geo/layer_14/stable_rank_q_proj": 56.698394775390625, "geo/layer_14/stable_rank_k_proj": 34.892173767089844, "geo/layer_14/stable_rank_o_proj": 54.409488677978516, "geo/layer_14/stable_rank_gate_proj": 83.56904602050781, "geo/layer_14/stable_rank_down_proj": 135.63934326171875, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3786732256412506, "geo/layer_14/attn_entropy_mean": 5.443833351135254, "geo/layer_14/attn_entropy_std": 0.42383354902267456, "geo/layer_21/stable_rank_q_proj": 46.17230224609375, "geo/layer_21/stable_rank_k_proj": 31.5526065826416, "geo/layer_21/stable_rank_o_proj": 81.6341552734375, "geo/layer_21/stable_rank_gate_proj": 82.710205078125, "geo/layer_21/stable_rank_down_proj": 58.82115936279297, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14739100635051727, "geo/layer_21/attn_entropy_mean": 5.722867012023926, "geo/layer_21/attn_entropy_std": 0.281507670879364, "geo/layer_27/stable_rank_q_proj": 41.409175872802734, "geo/layer_27/stable_rank_k_proj": 31.505041122436523, "geo/layer_27/stable_rank_o_proj": 119.28839111328125, "geo/layer_27/stable_rank_gate_proj": 89.99341583251953, "geo/layer_27/stable_rank_down_proj": 135.9522247314453, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07569101452827454, "geo/layer_27/attn_entropy_mean": 4.333759784698486, "geo/layer_27/attn_entropy_std": 0.569856584072113, "attnres/final_alpha/block_0": 0.24171635508537292, "attnres/block_norm/0": 1.6539279222488403, "attnres/final_alpha/block_1": 0.006351072806864977, "attnres/block_norm/1": 32573.46484375, "attnres/final_alpha/block_2": 0.013019464910030365, "attnres/block_norm/2": 22933.833984375, "attnres/final_alpha/block_3": 0.015291839838027954, "attnres/block_norm/3": 35033.7109375, "attnres/final_alpha/block_4": 0.01919535920023918, "attnres/block_norm/4": 10402.1474609375, "attnres/final_alpha/block_5": 0.5764973163604736, "attnres/block_norm/5": 5322.3818359375, "attnres/final_alpha/block_6": 0.12792864441871643, "attnres/block_norm/6": 23231.287109375, "geo/tier1_time_s": 1.356276273727417, "geo/step": 25125.0, "geo/rankme_slope": -0.00011263806303771509} {"step": 25130, "timestamp": 1778221700.6561363, "train/loss": 2.204831850528717, "train/z_loss": 0.0015164599870331585, "train/perplexity": 9.068726538070942, "train/grad_norm": 0.1357421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1700367.8857040105, "perf/iters_per_sec": 0.8107985905189564, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2333519220352174, "data/tokens_consumed": 52703526912, "data/tokens_consumed_B": 52.703526912, "train/loss_slope": -1.0436781819718672e-05} {"step": 25140, "timestamp": 1778221711.0138893, "train/loss": 2.1625476121902465, "train/z_loss": 0.0015240335371345282, "train/perplexity": 8.69325651763988, "train/grad_norm": 0.271484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025633.2056927704, "perf/iters_per_sec": 0.9658971813644268, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0353068828582763, "data/tokens_consumed": 52724498432, "data/tokens_consumed_B": 52.724498432, "train/loss_slope": -1.4771650955550412e-05} {"step": 25150, "timestamp": 1778221721.3642588, "grad/layer_0/attn": 0.002805289113894105, "grad/layer_0/mlp": 0.0027253534644842148, "grad/layer_0/attn_mlp_ratio": 1.0293303410066057, "grad/layer_4/attn": 0.0022148038260638714, "grad/layer_4/mlp": 0.002528999699279666, "grad/layer_4/attn_mlp_ratio": 0.8757627528063517, "grad/layer_8/attn": 0.007563661318272352, "grad/layer_8/mlp": 0.003729742718860507, "grad/layer_8/attn_mlp_ratio": 2.027931062706132, "grad/layer_12/attn": 0.005654563196003437, "grad/layer_12/mlp": 0.00637310603633523, "grad/layer_12/attn_mlp_ratio": 0.8872538876710154, "grad/layer_16/attn": 0.003891567001119256, "grad/layer_16/mlp": 0.004728643223643303, "grad/layer_16/attn_mlp_ratio": 0.8229774873611951, "grad/layer_20/attn": 0.004358039237558842, "grad/layer_20/mlp": 0.006006618961691856, "grad/layer_20/attn_mlp_ratio": 0.7255394745028717, "grad/layer_24/attn": 0.007942496798932552, "grad/layer_24/mlp": 0.010299638845026493, "grad/layer_24/attn_mlp_ratio": 0.7711432256339276, "grad/layer_27/attn": 0.008442210964858532, "grad/layer_27/mlp": 0.0095816133543849, "grad/layer_27/attn_mlp_ratio": 0.8810844859322796} {"step": 25150, "timestamp": 1778221721.3800495, "train/loss": 2.179590845108032, "train/z_loss": 0.0015278105158358813, "train/perplexity": 8.842687489608307, "train/grad_norm": 0.11767578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024268.6957240354, "perf/iters_per_sec": 0.9652465323085954, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0360047578811646, "data/tokens_consumed": 52745469952, "data/tokens_consumed_B": 52.745469952, "train/loss_slope": -1.3602837470426872e-05} {"step": 25160, "timestamp": 1778221731.7451863, "train/loss": 2.2692379474639894, "train/z_loss": 0.0014969038777053357, "train/perplexity": 9.672027411952133, "train/grad_norm": 0.0966796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024332.891949659, "perf/iters_per_sec": 0.9652771434543891, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0359719038009643, "data/tokens_consumed": 52766441472, "data/tokens_consumed_B": 52.766441472, "train/loss_slope": -8.451775484459885e-06} {"step": 25170, "timestamp": 1778221742.6818123, "train/loss": 2.2258772611618043, "train/z_loss": 0.0015038663521409034, "train/perplexity": 9.26160408670434, "train/grad_norm": 0.142578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1918643.4048100424, "perf/iters_per_sec": 0.9148804687547886, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0930389642715455, "data/tokens_consumed": 52787412992, "data/tokens_consumed_B": 52.787412992, "train/loss_slope": -4.946452985466504e-06} {"step": 25180, "timestamp": 1778221753.0489602, "train/loss": 2.143290662765503, "train/z_loss": 0.0015196009306237101, "train/perplexity": 8.527452480253544, "train/grad_norm": 0.1708984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024134.539690647, "perf/iters_per_sec": 0.9651825617268787, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036073422431946, "data/tokens_consumed": 52808384512, "data/tokens_consumed_B": 52.808384512, "train/loss_slope": -8.383885389900796e-06} {"step": 25190, "timestamp": 1778221763.4200952, "train/loss": 2.1964709520339967, "train/z_loss": 0.001487995614297688, "train/perplexity": 8.993219927418016, "train/grad_norm": 0.2021484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023150.666919182, "perf/iters_per_sec": 0.9647134146304999, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0365772724151612, "data/tokens_consumed": 52829356032, "data/tokens_consumed_B": 52.829356032, "train/loss_slope": -9.85204593362012e-06} {"step": 25200, "timestamp": 1778221773.768957, "grad/layer_0/attn": 0.002676942152902484, "grad/layer_0/mlp": 0.0026215421967208385, "grad/layer_0/attn_mlp_ratio": 1.0211325433318175, "grad/layer_4/attn": 0.0016348224598914385, "grad/layer_4/mlp": 0.0024729447904974222, "grad/layer_4/attn_mlp_ratio": 0.6610832559081415, "grad/layer_8/attn": 0.007179029751569033, "grad/layer_8/mlp": 0.0036292430013418198, "grad/layer_8/attn_mlp_ratio": 1.9781066054557686, "grad/layer_12/attn": 0.0078920628875494, "grad/layer_12/mlp": 0.006481165066361427, "grad/layer_12/attn_mlp_ratio": 1.217691986698753, "grad/layer_16/attn": 0.0036006495356559753, "grad/layer_16/mlp": 0.004443721380084753, "grad/layer_16/attn_mlp_ratio": 0.8102779509905967, "grad/layer_20/attn": 0.004476467147469521, "grad/layer_20/mlp": 0.0058550238609313965, "grad/layer_20/attn_mlp_ratio": 0.7645514650904043, "grad/layer_24/attn": 0.0076356204226613045, "grad/layer_24/mlp": 0.009667839854955673, "grad/layer_24/attn_mlp_ratio": 0.7897959066593085, "grad/layer_27/attn": 0.004716916009783745, "grad/layer_27/mlp": 0.008531619794666767, "grad/layer_27/attn_mlp_ratio": 0.5528746085760752} {"step": 25200, "timestamp": 1778221774.3783672, "eos/sharpness": 24.039673805236813, "eos/L0_probe": 2.0346338748931885, "eos/L_plus": 2.159968614578247, "eos/L_minus": 2.149695873260498, "eos/grad_norm": 0.11885664612054825, "eos/embed_grad_frac": 0.15856988728046417, "eos/time_s": 0.6066625118255615} {"step": 25200, "timestamp": 1778221774.3975117, "train/loss": 2.260943150520325, "train/z_loss": 0.0014965119538828731, "train/perplexity": 9.592131725875191, "train/grad_norm": 0.119140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1911251.5143255326, "perf/iters_per_sec": 0.9113557407024062, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0972663640975953, "data/tokens_consumed": 52850327552, "data/tokens_consumed_B": 52.850327552, "train/loss_slope": -7.611679351261803e-06} {"step": 25200, "timestamp": 1778221775.757272, "geo/rankme_last": 440.5255126953125, "geo/layer_0/stable_rank_q_proj": 17.95821762084961, "geo/layer_0/stable_rank_k_proj": 15.747690200805664, "geo/layer_0/stable_rank_o_proj": 51.35505294799805, "geo/layer_0/stable_rank_gate_proj": 147.0024871826172, "geo/layer_0/stable_rank_down_proj": 50.77933120727539, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.052552156150341034, "geo/layer_0/attn_entropy_mean": 6.239321708679199, "geo/layer_0/attn_entropy_std": 0.3327871263027191, "geo/layer_7/stable_rank_q_proj": 42.918983459472656, "geo/layer_7/stable_rank_k_proj": 41.71019744873047, "geo/layer_7/stable_rank_o_proj": 108.34516906738281, "geo/layer_7/stable_rank_gate_proj": 99.73207092285156, "geo/layer_7/stable_rank_down_proj": 147.95758056640625, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5471615195274353, "geo/layer_7/attn_entropy_mean": 4.625523567199707, "geo/layer_7/attn_entropy_std": 0.8260458707809448, "geo/layer_14/stable_rank_q_proj": 56.74479675292969, "geo/layer_14/stable_rank_k_proj": 34.915950775146484, "geo/layer_14/stable_rank_o_proj": 54.37968826293945, "geo/layer_14/stable_rank_gate_proj": 83.48341369628906, "geo/layer_14/stable_rank_down_proj": 135.4089813232422, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.35983675718307495, "geo/layer_14/attn_entropy_mean": 5.49240779876709, "geo/layer_14/attn_entropy_std": 0.44701534509658813, "geo/layer_21/stable_rank_q_proj": 46.08891677856445, "geo/layer_21/stable_rank_k_proj": 31.54184913635254, "geo/layer_21/stable_rank_o_proj": 81.58431243896484, "geo/layer_21/stable_rank_gate_proj": 82.66707611083984, "geo/layer_21/stable_rank_down_proj": 58.88898849487305, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15432943403720856, "geo/layer_21/attn_entropy_mean": 5.73709774017334, "geo/layer_21/attn_entropy_std": 0.2865999937057495, "geo/layer_27/stable_rank_q_proj": 41.295387268066406, "geo/layer_27/stable_rank_k_proj": 31.50190544128418, "geo/layer_27/stable_rank_o_proj": 119.48096466064453, "geo/layer_27/stable_rank_gate_proj": 90.18675231933594, "geo/layer_27/stable_rank_down_proj": 135.67962646484375, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07491029798984528, "geo/layer_27/attn_entropy_mean": 4.354225158691406, "geo/layer_27/attn_entropy_std": 0.5846174955368042, "attnres/final_alpha/block_0": 0.24068856239318848, "attnres/block_norm/0": 1.6545276641845703, "attnres/final_alpha/block_1": 0.006249162368476391, "attnres/block_norm/1": 32527.37109375, "attnres/final_alpha/block_2": 0.013007711619138718, "attnres/block_norm/2": 22830.125, "attnres/final_alpha/block_3": 0.015120680443942547, "attnres/block_norm/3": 34589.1171875, "attnres/final_alpha/block_4": 0.019290955737233162, "attnres/block_norm/4": 10409.060546875, "attnres/final_alpha/block_5": 0.5776406526565552, "attnres/block_norm/5": 5356.296875, "attnres/final_alpha/block_6": 0.12800228595733643, "attnres/block_norm/6": 23187.40234375, "geo/tier1_time_s": 1.3563461303710938, "geo/step": 25200.0, "geo/rankme_slope": -0.00013433564832182872} {"step": 25210, "timestamp": 1778221786.124012, "train/loss": 2.2005628824234007, "train/z_loss": 0.0015121732954867183, "train/perplexity": 9.0300949508996, "train/grad_norm": 0.12255859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1788869.8011664585, "perf/iters_per_sec": 0.8529995923836033, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1723335027694701, "data/tokens_consumed": 52871299072, "data/tokens_consumed_B": 52.871299072, "train/loss_slope": -6.182254473082166e-06} {"step": 25220, "timestamp": 1778221796.4914193, "train/loss": 2.2231838703155518, "train/z_loss": 0.0015093352063558995, "train/perplexity": 9.236692530374256, "train/grad_norm": 0.244140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024195.0008683742, "perf/iters_per_sec": 0.9652113918630477, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0360424757003783, "data/tokens_consumed": 52892270592, "data/tokens_consumed_B": 52.892270592, "train/loss_slope": -4.312395269792745e-06} {"step": 25230, "timestamp": 1778221806.8528285, "train/loss": 2.17887544631958, "train/z_loss": 0.0015123913646675647, "train/perplexity": 8.836363703975579, "train/grad_norm": 0.09814453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024973.493227196, "perf/iters_per_sec": 0.9655826059471111, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0356441736221313, "data/tokens_consumed": 52913242112, "data/tokens_consumed_B": 52.913242112, "train/loss_slope": -4.536000894706112e-06} {"step": 25240, "timestamp": 1778221817.2317863, "train/loss": 2.170683407783508, "train/z_loss": 0.00151674262015149, "train/perplexity": 8.76427156595208, "train/grad_norm": 0.142578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021800.3261619483, "perf/iters_per_sec": 0.9640695219812146, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0372695922851562, "data/tokens_consumed": 52934213632, "data/tokens_consumed_B": 52.934213632, "train/loss_slope": -8.299707915737814e-06} {"step": 25250, "timestamp": 1778221827.602023, "grad/layer_0/attn": 0.002605761867016554, "grad/layer_0/mlp": 0.002590198302641511, "grad/layer_0/attn_mlp_ratio": 1.0060085993255077, "grad/layer_4/attn": 0.0016787261702120304, "grad/layer_4/mlp": 0.002436306094750762, "grad/layer_4/attn_mlp_ratio": 0.6890456436998744, "grad/layer_8/attn": 0.005404194816946983, "grad/layer_8/mlp": 0.0037405500188469887, "grad/layer_8/attn_mlp_ratio": 1.444759365665927, "grad/layer_12/attn": 0.003810897935181856, "grad/layer_12/mlp": 0.005946971010416746, "grad/layer_12/attn_mlp_ratio": 0.6408132584512926, "grad/layer_16/attn": 0.006366897374391556, "grad/layer_16/mlp": 0.004280634690076113, "grad/layer_16/attn_mlp_ratio": 1.4873722442176265, "grad/layer_20/attn": 0.0071250698529183865, "grad/layer_20/mlp": 0.006159116514027119, "grad/layer_20/attn_mlp_ratio": 1.1568330816616383, "grad/layer_24/attn": 0.012623236514627934, "grad/layer_24/mlp": 0.010551255196332932, "grad/layer_24/attn_mlp_ratio": 1.1963729584872347, "grad/layer_27/attn": 0.004471663851290941, "grad/layer_27/mlp": 0.009356070309877396, "grad/layer_27/attn_mlp_ratio": 0.47794251810782806} {"step": 25250, "timestamp": 1778221827.6179347, "train/loss": 2.17261688709259, "train/z_loss": 0.0015185259864665568, "train/perplexity": 8.78123349617046, "train/grad_norm": 0.1396484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020711.3421879497, "perf/iters_per_sec": 0.9635502539577244, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0378285884857177, "data/tokens_consumed": 52955185152, "data/tokens_consumed_B": 52.955185152, "train/loss_slope": -1.2664057305007198e-05} {"step": 25260, "timestamp": 1778221837.9897442, "train/loss": 2.2272502899169924, "train/z_loss": 0.0015034782234579324, "train/perplexity": 9.274329269456334, "train/grad_norm": 0.259765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025152.8465892952, "perf/iters_per_sec": 0.9656681282946087, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035552453994751, "data/tokens_consumed": 52976156672, "data/tokens_consumed_B": 52.976156672, "train/loss_slope": -1.0670257423004831e-05} {"step": 25270, "timestamp": 1778221848.3562777, "train/loss": 2.211168885231018, "train/z_loss": 0.001502416201401502, "train/perplexity": 9.126377849097654, "train/grad_norm": 0.1611328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023977.02221503, "perf/iters_per_sec": 0.9651074515414381, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0361540555953979, "data/tokens_consumed": 52997128192, "data/tokens_consumed_B": 52.997128192, "train/loss_slope": -8.563414701569899e-06} {"step": 25275, "timestamp": 1778221854.1322665, "eos/sharpness": 8.923077583312987, "eos/L0_probe": 2.0327553749084473, "eos/L_plus": 2.0762557983398438, "eos/L_minus": 2.0784857273101807, "eos/grad_norm": 0.10351172834634781, "eos/embed_grad_frac": 0.2022484391927719, "eos/time_s": 0.606278657913208} {"step": 25275, "timestamp": 1778221855.5109105, "geo/rankme_last": 440.7093505859375, "geo/layer_0/stable_rank_q_proj": 17.981386184692383, "geo/layer_0/stable_rank_k_proj": 15.763471603393555, "geo/layer_0/stable_rank_o_proj": 51.237648010253906, "geo/layer_0/stable_rank_gate_proj": 147.10565185546875, "geo/layer_0/stable_rank_down_proj": 50.88639831542969, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.049300797283649445, "geo/layer_0/attn_entropy_mean": 6.2408342361450195, "geo/layer_0/attn_entropy_std": 0.32825905084609985, "geo/layer_7/stable_rank_q_proj": 42.94207000732422, "geo/layer_7/stable_rank_k_proj": 41.798858642578125, "geo/layer_7/stable_rank_o_proj": 108.2796401977539, "geo/layer_7/stable_rank_gate_proj": 99.62349700927734, "geo/layer_7/stable_rank_down_proj": 147.72996520996094, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5347890853881836, "geo/layer_7/attn_entropy_mean": 4.631378650665283, "geo/layer_7/attn_entropy_std": 0.8260785341262817, "geo/layer_14/stable_rank_q_proj": 56.778831481933594, "geo/layer_14/stable_rank_k_proj": 34.97135543823242, "geo/layer_14/stable_rank_o_proj": 54.297603607177734, "geo/layer_14/stable_rank_gate_proj": 83.2783203125, "geo/layer_14/stable_rank_down_proj": 135.31407165527344, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3689803183078766, "geo/layer_14/attn_entropy_mean": 5.479854583740234, "geo/layer_14/attn_entropy_std": 0.42758217453956604, "geo/layer_21/stable_rank_q_proj": 46.244022369384766, "geo/layer_21/stable_rank_k_proj": 31.625545501708984, "geo/layer_21/stable_rank_o_proj": 81.49671173095703, "geo/layer_21/stable_rank_gate_proj": 82.60965728759766, "geo/layer_21/stable_rank_down_proj": 58.93159484863281, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1537446230649948, "geo/layer_21/attn_entropy_mean": 5.70902156829834, "geo/layer_21/attn_entropy_std": 0.28781449794769287, "geo/layer_27/stable_rank_q_proj": 41.2857780456543, "geo/layer_27/stable_rank_k_proj": 31.52047348022461, "geo/layer_27/stable_rank_o_proj": 119.35881805419922, "geo/layer_27/stable_rank_gate_proj": 90.1184310913086, "geo/layer_27/stable_rank_down_proj": 135.43853759765625, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08084838837385178, "geo/layer_27/attn_entropy_mean": 4.33627986907959, "geo/layer_27/attn_entropy_std": 0.5678663849830627, "attnres/final_alpha/block_0": 0.2418796420097351, "attnres/block_norm/0": 1.655043601989746, "attnres/final_alpha/block_1": 0.0063670724630355835, "attnres/block_norm/1": 32591.31640625, "attnres/final_alpha/block_2": 0.01314477063715458, "attnres/block_norm/2": 22864.404296875, "attnres/final_alpha/block_3": 0.0151902474462986, "attnres/block_norm/3": 34659.99609375, "attnres/final_alpha/block_4": 0.019092753529548645, "attnres/block_norm/4": 10403.0, "attnres/final_alpha/block_5": 0.5736185908317566, "attnres/block_norm/5": 5337.328125, "attnres/final_alpha/block_6": 0.13070690631866455, "attnres/block_norm/6": 23333.5, "geo/tier1_time_s": 1.3593907356262207, "geo/step": 25275.0, "geo/rankme_slope": -0.00014614879154786916} {"step": 25280, "timestamp": 1778221860.6949944, "train/loss": 2.192627739906311, "train/z_loss": 0.001505722850561142, "train/perplexity": 8.958723406709655, "train/grad_norm": 0.091796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1700418.9996515298, "perf/iters_per_sec": 0.8108229635484361, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.233314847946167, "data/tokens_consumed": 53018099712, "data/tokens_consumed_B": 53.018099712, "train/loss_slope": -5.983701597775064e-06} {"step": 25290, "timestamp": 1778221871.0584393, "train/loss": 2.2300368547439575, "train/z_loss": 0.0015006205881945788, "train/perplexity": 9.300208829982736, "train/grad_norm": 0.26171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025029.2492785335, "perf/iters_per_sec": 0.9656091925041835, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035615658760071, "data/tokens_consumed": 53039071232, "data/tokens_consumed_B": 53.039071232, "train/loss_slope": -4.346536924772433e-06} {"step": 25300, "timestamp": 1778221881.42975, "grad/layer_0/attn": 0.0032251987140625715, "grad/layer_0/mlp": 0.0027310606092214584, "grad/layer_0/attn_mlp_ratio": 1.1809326329409853, "grad/layer_4/attn": 0.0018264653626829386, "grad/layer_4/mlp": 0.0026026363484561443, "grad/layer_4/attn_mlp_ratio": 0.70177506496014, "grad/layer_8/attn": 0.008715484291315079, "grad/layer_8/mlp": 0.0037840399891138077, "grad/layer_8/attn_mlp_ratio": 2.3032219760008346, "grad/layer_12/attn": 0.004546647891402245, "grad/layer_12/mlp": 0.006270105484873056, "grad/layer_12/attn_mlp_ratio": 0.725130993387298, "grad/layer_16/attn": 0.004819108173251152, "grad/layer_16/mlp": 0.005084243603050709, "grad/layer_16/attn_mlp_ratio": 0.9478515300829371, "grad/layer_20/attn": 0.005566098727285862, "grad/layer_20/mlp": 0.007090723607689142, "grad/layer_20/attn_mlp_ratio": 0.7849831634604539, "grad/layer_24/attn": 0.021295275539159775, "grad/layer_24/mlp": 0.014151097275316715, "grad/layer_24/attn_mlp_ratio": 1.5048497635458584, "grad/layer_27/attn": 0.008826463483273983, "grad/layer_27/mlp": 0.013088400475680828, "grad/layer_27/attn_mlp_ratio": 0.674372963467681} {"step": 25300, "timestamp": 1778221881.4450376, "train/loss": 2.2041099309921264, "train/z_loss": 0.0015081453952006997, "train/perplexity": 9.06218200980675, "train/grad_norm": 0.228515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020138.5709935238, "perf/iters_per_sec": 0.9632771353690738, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0381228446960449, "data/tokens_consumed": 53060042752, "data/tokens_consumed_B": 53.060042752, "train/loss_slope": -4.162977981452872e-06} {"step": 25310, "timestamp": 1778221891.8259907, "train/loss": 2.2199880361557005, "train/z_loss": 0.0015025031752884388, "train/perplexity": 9.207220711468294, "train/grad_norm": 0.1796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021501.3727283229, "perf/iters_per_sec": 0.9639269698754896, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0374229907989503, "data/tokens_consumed": 53081014272, "data/tokens_consumed_B": 53.081014272, "train/loss_slope": -3.731765312151293e-06} {"step": 25320, "timestamp": 1778221902.2074485, "train/loss": 2.157785415649414, "train/z_loss": 0.001519976044073701, "train/perplexity": 8.651955940309207, "train/grad_norm": 0.185546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021460.3513585434, "perf/iters_per_sec": 0.9639074093620984, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037444043159485, "data/tokens_consumed": 53101985792, "data/tokens_consumed_B": 53.101985792, "train/loss_slope": -8.23502207245867e-06} {"step": 25330, "timestamp": 1778221912.5833216, "train/loss": 2.2012776374816894, "train/z_loss": 0.0015052191098220646, "train/perplexity": 9.036551564116182, "train/grad_norm": 0.2265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022378.2715107005, "perf/iters_per_sec": 0.9643451077989104, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0369731664657593, "data/tokens_consumed": 53122957312, "data/tokens_consumed_B": 53.122957312, "train/loss_slope": -8.489787424787373e-06} {"step": 25340, "timestamp": 1778221923.989134, "train/loss": 2.1747508525848387, "train/z_loss": 0.0015170633094385266, "train/perplexity": 8.799992353689287, "train/grad_norm": 0.115234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1839548.524482279, "perf/iters_per_sec": 0.8771650907908817, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1400362491607665, "data/tokens_consumed": 53143928832, "data/tokens_consumed_B": 53.143928832, "train/loss_slope": -9.384411875635793e-06} {"step": 25350, "timestamp": 1778221934.8658721, "grad/layer_0/attn": 0.004358490463346243, "grad/layer_0/mlp": 0.003289624350145459, "grad/layer_0/attn_mlp_ratio": 1.324920375988062, "grad/layer_4/attn": 0.0018127833027392626, "grad/layer_4/mlp": 0.0024696472100913525, "grad/layer_4/attn_mlp_ratio": 0.7340251765229617, "grad/layer_8/attn": 0.003979152534157038, "grad/layer_8/mlp": 0.003729417687281966, "grad/layer_8/attn_mlp_ratio": 1.0669634675221198, "grad/layer_12/attn": 0.004466804675757885, "grad/layer_12/mlp": 0.00685241911560297, "grad/layer_12/attn_mlp_ratio": 0.6518580570183102, "grad/layer_16/attn": 0.004478049464523792, "grad/layer_16/mlp": 0.004797407425940037, "grad/layer_16/attn_mlp_ratio": 0.9334311167668278, "grad/layer_20/attn": 0.0061826445162296295, "grad/layer_20/mlp": 0.005690934602171183, "grad/layer_20/attn_mlp_ratio": 1.0864022941382288, "grad/layer_24/attn": 0.004893253557384014, "grad/layer_24/mlp": 0.007584534119814634, "grad/layer_24/attn_mlp_ratio": 0.6451620383754568, "grad/layer_27/attn": 0.007101472932845354, "grad/layer_27/mlp": 0.006886965595185757, "grad/layer_27/attn_mlp_ratio": 1.031146842768441} {"step": 25350, "timestamp": 1778221935.456835, "eos/sharpness": 17.40474700927734, "eos/L0_probe": 2.03149676322937, "eos/L_plus": 2.137726306915283, "eos/L_minus": 2.0993146896362305, "eos/grad_norm": 0.09600379317998886, "eos/embed_grad_frac": 0.25469207763671875, "eos/time_s": 0.5878150463104248} {"step": 25350, "timestamp": 1778221935.4759727, "train/loss": 2.2054807424545286, "train/z_loss": 0.0015046400600112974, "train/perplexity": 9.074613071154234, "train/grad_norm": 0.095703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1826876.9180256014, "perf/iters_per_sec": 0.8711227979782111, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1479437828063965, "data/tokens_consumed": 53164900352, "data/tokens_consumed_B": 53.164900352, "train/loss_slope": -7.191526442721469e-06} {"step": 25350, "timestamp": 1778221936.8386297, "geo/rankme_last": 440.4097900390625, "geo/layer_0/stable_rank_q_proj": 17.950780868530273, "geo/layer_0/stable_rank_k_proj": 15.753741264343262, "geo/layer_0/stable_rank_o_proj": 51.26901626586914, "geo/layer_0/stable_rank_gate_proj": 146.81088256835938, "geo/layer_0/stable_rank_down_proj": 50.9292106628418, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.050998009741306305, "geo/layer_0/attn_entropy_mean": 6.2408294677734375, "geo/layer_0/attn_entropy_std": 0.33073002099990845, "geo/layer_7/stable_rank_q_proj": 42.88957977294922, "geo/layer_7/stable_rank_k_proj": 41.68387985229492, "geo/layer_7/stable_rank_o_proj": 107.95575714111328, "geo/layer_7/stable_rank_gate_proj": 99.3251724243164, "geo/layer_7/stable_rank_down_proj": 147.5618896484375, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5400427579879761, "geo/layer_7/attn_entropy_mean": 4.641862869262695, "geo/layer_7/attn_entropy_std": 0.8230540752410889, "geo/layer_14/stable_rank_q_proj": 56.87398910522461, "geo/layer_14/stable_rank_k_proj": 34.911895751953125, "geo/layer_14/stable_rank_o_proj": 54.27361297607422, "geo/layer_14/stable_rank_gate_proj": 83.35535430908203, "geo/layer_14/stable_rank_down_proj": 135.00331115722656, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37208905816078186, "geo/layer_14/attn_entropy_mean": 5.46529483795166, "geo/layer_14/attn_entropy_std": 0.4183458685874939, "geo/layer_21/stable_rank_q_proj": 46.283348083496094, "geo/layer_21/stable_rank_k_proj": 31.530271530151367, "geo/layer_21/stable_rank_o_proj": 81.4869155883789, "geo/layer_21/stable_rank_gate_proj": 82.4421615600586, "geo/layer_21/stable_rank_down_proj": 58.86597442626953, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14960359036922455, "geo/layer_21/attn_entropy_mean": 5.728935718536377, "geo/layer_21/attn_entropy_std": 0.2960864305496216, "geo/layer_27/stable_rank_q_proj": 41.369285583496094, "geo/layer_27/stable_rank_k_proj": 31.56564712524414, "geo/layer_27/stable_rank_o_proj": 119.11625671386719, "geo/layer_27/stable_rank_gate_proj": 90.00218963623047, "geo/layer_27/stable_rank_down_proj": 135.75120544433594, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08063296973705292, "geo/layer_27/attn_entropy_mean": 4.331872940063477, "geo/layer_27/attn_entropy_std": 0.5746247172355652, "attnres/final_alpha/block_0": 0.24126654863357544, "attnres/block_norm/0": 1.655615210533142, "attnres/final_alpha/block_1": 0.0063321152701973915, "attnres/block_norm/1": 32551.58984375, "attnres/final_alpha/block_2": 0.013134825974702835, "attnres/block_norm/2": 23105.65234375, "attnres/final_alpha/block_3": 0.015352758578956127, "attnres/block_norm/3": 35150.515625, "attnres/final_alpha/block_4": 0.01912282407283783, "attnres/block_norm/4": 10395.0859375, "attnres/final_alpha/block_5": 0.5767306089401245, "attnres/block_norm/5": 5351.81201171875, "attnres/final_alpha/block_6": 0.12806031107902527, "attnres/block_norm/6": 23409.630859375, "geo/tier1_time_s": 1.3585000038146973, "geo/step": 25350.0, "geo/rankme_slope": -0.00014350923963335333} {"step": 25360, "timestamp": 1778221947.254207, "train/loss": 2.1747082710266112, "train/z_loss": 0.0015182892209850251, "train/perplexity": 8.79961764428039, "train/grad_norm": 0.142578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1781167.9729961916, "perf/iters_per_sec": 0.8493270745259245, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.177402710914612, "data/tokens_consumed": 53185871872, "data/tokens_consumed_B": 53.185871872, "train/loss_slope": -1.0793623937131258e-05} {"step": 25370, "timestamp": 1778221957.633921, "train/loss": 2.210898995399475, "train/z_loss": 0.0014960555359721183, "train/perplexity": 9.123915064872524, "train/grad_norm": 0.095703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021443.7667473443, "perf/iters_per_sec": 0.9638995012032243, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0374525547027589, "data/tokens_consumed": 53206843392, "data/tokens_consumed_B": 53.206843392, "train/loss_slope": -1.042217358504191e-05} {"step": 25380, "timestamp": 1778221968.0222516, "train/loss": 2.1849302768707277, "train/z_loss": 0.0015227332129143178, "train/perplexity": 8.890028691142588, "train/grad_norm": 0.1484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019914.8783914335, "perf/iters_per_sec": 0.963170470424382, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0382378101348877, "data/tokens_consumed": 53227814912, "data/tokens_consumed_B": 53.227814912, "train/loss_slope": -9.111110290678405e-06} {"step": 25390, "timestamp": 1778221978.4142482, "train/loss": 2.2095680236816406, "train/z_loss": 0.0015044621773995458, "train/perplexity": 9.11177946982365, "train/grad_norm": 0.251953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019806.0653537668, "perf/iters_per_sec": 0.9631185843247255, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0382937431335448, "data/tokens_consumed": 53248786432, "data/tokens_consumed_B": 53.248786432, "train/loss_slope": -8.11909481887517e-06} {"step": 25400, "timestamp": 1778221989.3148954, "grad/layer_0/attn": 0.003017488634213805, "grad/layer_0/mlp": 0.0028607440181076527, "grad/layer_0/attn_mlp_ratio": 1.0547915191414732, "grad/layer_4/attn": 0.002635095501318574, "grad/layer_4/mlp": 0.0024791548494249582, "grad/layer_4/attn_mlp_ratio": 1.0629006879661904, "grad/layer_8/attn": 0.005375818815082312, "grad/layer_8/mlp": 0.003687313525006175, "grad/layer_8/attn_mlp_ratio": 1.4579228570700462, "grad/layer_12/attn": 0.00443678954616189, "grad/layer_12/mlp": 0.006245833821594715, "grad/layer_12/attn_mlp_ratio": 0.71035983374804, "grad/layer_16/attn": 0.0036358239594846964, "grad/layer_16/mlp": 0.004237358924001455, "grad/layer_16/attn_mlp_ratio": 0.858040099715527, "grad/layer_20/attn": 0.004633781034499407, "grad/layer_20/mlp": 0.005495613906532526, "grad/layer_20/attn_mlp_ratio": 0.8431780377936523, "grad/layer_24/attn": 0.010210266336798668, "grad/layer_24/mlp": 0.010060468688607216, "grad/layer_24/attn_mlp_ratio": 1.0148897184951349, "grad/layer_27/attn": 0.007301808800548315, "grad/layer_27/mlp": 0.009126988239586353, "grad/layer_27/attn_mlp_ratio": 0.8000238993270417} {"step": 25400, "timestamp": 1778221989.3303404, "train/loss": 2.203691005706787, "train/z_loss": 0.0015092544606886804, "train/perplexity": 9.05838642771066, "train/grad_norm": 0.1162109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1922227.956256446, "perf/iters_per_sec": 0.9165897160799246, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0910006761550903, "data/tokens_consumed": 53269757952, "data/tokens_consumed_B": 53.269757952, "train/loss_slope": -5.816391485072861e-06} {"step": 25410, "timestamp": 1778222000.129391, "train/loss": 2.2541441202163695, "train/z_loss": 0.001496868149843067, "train/perplexity": 9.527135736820636, "train/grad_norm": 0.224609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1943272.1259622918, "perf/iters_per_sec": 0.9266243581592044, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.079185962677002, "data/tokens_consumed": 53290729472, "data/tokens_consumed_B": 53.290729472, "train/loss_slope": -1.9117584394471446e-06} {"step": 25420, "timestamp": 1778222010.5075846, "train/loss": 2.2038439750671386, "train/z_loss": 0.0015116859343834222, "train/perplexity": 9.05977218927515, "train/grad_norm": 0.14453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022185.3694017255, "perf/iters_per_sec": 0.9642531249054553, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0370720863342284, "data/tokens_consumed": 53311700992, "data/tokens_consumed_B": 53.311700992, "train/loss_slope": -5.638037606327147e-06} {"step": 25425, "timestamp": 1778222016.2827892, "eos/sharpness": 56.175231933593736, "eos/L0_probe": 2.0331714153289795, "eos/L_plus": 2.380307912826538, "eos/L_minus": 2.2477872371673584, "eos/grad_norm": 0.17453807592391968, "eos/embed_grad_frac": 0.07303161919116974, "eos/time_s": 0.5958859920501709} {"step": 25425, "timestamp": 1778222017.6688259, "geo/rankme_last": 439.88720703125, "geo/layer_0/stable_rank_q_proj": 18.01671600341797, "geo/layer_0/stable_rank_k_proj": 15.818483352661133, "geo/layer_0/stable_rank_o_proj": 51.274444580078125, "geo/layer_0/stable_rank_gate_proj": 146.68637084960938, "geo/layer_0/stable_rank_down_proj": 50.89234161376953, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.051753390580415726, "geo/layer_0/attn_entropy_mean": 6.243681907653809, "geo/layer_0/attn_entropy_std": 0.332014799118042, "geo/layer_7/stable_rank_q_proj": 42.85245132446289, "geo/layer_7/stable_rank_k_proj": 41.69236373901367, "geo/layer_7/stable_rank_o_proj": 107.64388275146484, "geo/layer_7/stable_rank_gate_proj": 99.14769744873047, "geo/layer_7/stable_rank_down_proj": 147.63829040527344, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.543738067150116, "geo/layer_7/attn_entropy_mean": 4.619256973266602, "geo/layer_7/attn_entropy_std": 0.8309463262557983, "geo/layer_14/stable_rank_q_proj": 56.74124526977539, "geo/layer_14/stable_rank_k_proj": 34.90302276611328, "geo/layer_14/stable_rank_o_proj": 54.341556549072266, "geo/layer_14/stable_rank_gate_proj": 83.35385131835938, "geo/layer_14/stable_rank_down_proj": 134.899658203125, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37665218114852905, "geo/layer_14/attn_entropy_mean": 5.499006271362305, "geo/layer_14/attn_entropy_std": 0.4364275634288788, "geo/layer_21/stable_rank_q_proj": 46.26814270019531, "geo/layer_21/stable_rank_k_proj": 31.52431869506836, "geo/layer_21/stable_rank_o_proj": 81.31336975097656, "geo/layer_21/stable_rank_gate_proj": 82.34574890136719, "geo/layer_21/stable_rank_down_proj": 58.94593048095703, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15432631969451904, "geo/layer_21/attn_entropy_mean": 5.729459762573242, "geo/layer_21/attn_entropy_std": 0.29164978861808777, "geo/layer_27/stable_rank_q_proj": 41.34538650512695, "geo/layer_27/stable_rank_k_proj": 31.435672760009766, "geo/layer_27/stable_rank_o_proj": 119.00391387939453, "geo/layer_27/stable_rank_gate_proj": 89.8209457397461, "geo/layer_27/stable_rank_down_proj": 136.15185546875, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07702549546957016, "geo/layer_27/attn_entropy_mean": 4.329057216644287, "geo/layer_27/attn_entropy_std": 0.5625851154327393, "attnres/final_alpha/block_0": 0.2409135401248932, "attnres/block_norm/0": 1.6562618017196655, "attnres/final_alpha/block_1": 0.006224929820746183, "attnres/block_norm/1": 32734.7421875, "attnres/final_alpha/block_2": 0.01326904259622097, "attnres/block_norm/2": 23025.98046875, "attnres/final_alpha/block_3": 0.015543704852461815, "attnres/block_norm/3": 34867.28125, "attnres/final_alpha/block_4": 0.019101332873106003, "attnres/block_norm/4": 10439.599609375, "attnres/final_alpha/block_5": 0.5768441557884216, "attnres/block_norm/5": 5367.111328125, "attnres/final_alpha/block_6": 0.12810328602790833, "attnres/block_norm/6": 23412.623046875, "geo/tier1_time_s": 1.3584928512573242, "geo/step": 25425.0, "geo/rankme_slope": -0.00015577963607317928} {"step": 25430, "timestamp": 1778222022.8797498, "train/loss": 2.1861862897872926, "train/z_loss": 0.0015033267904073, "train/perplexity": 8.901201697258445, "train/grad_norm": 0.1044921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1696140.0384060973, "perf/iters_per_sec": 0.8087825958281027, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2364262104034425, "data/tokens_consumed": 53332672512, "data/tokens_consumed_B": 53.332672512, "train/loss_slope": -7.154374704896005e-06} {"step": 25440, "timestamp": 1778222033.2589943, "train/loss": 2.1791099548339843, "train/z_loss": 0.001500996307004243, "train/perplexity": 8.838436149494095, "train/grad_norm": 0.09521484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021560.5151872723, "perf/iters_per_sec": 0.9639551711975443, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373926401138305, "data/tokens_consumed": 53353644032, "data/tokens_consumed_B": 53.353644032, "train/loss_slope": -5.9131715306998045e-06} {"step": 25450, "timestamp": 1778222043.6225107, "grad/layer_0/attn": 0.0023900840897113085, "grad/layer_0/mlp": 0.0023550994228571653, "grad/layer_0/attn_mlp_ratio": 1.0148548146329293, "grad/layer_4/attn": 0.001511194626800716, "grad/layer_4/mlp": 0.0022797586861997843, "grad/layer_4/attn_mlp_ratio": 0.662874790064866, "grad/layer_8/attn": 0.003502399194985628, "grad/layer_8/mlp": 0.003457986284047365, "grad/layer_8/attn_mlp_ratio": 1.0128435470836876, "grad/layer_12/attn": 0.004144032020121813, "grad/layer_12/mlp": 0.0062240553088486195, "grad/layer_12/attn_mlp_ratio": 0.6658089859274586, "grad/layer_16/attn": 0.005550615023821592, "grad/layer_16/mlp": 0.0043805683963000774, "grad/layer_16/attn_mlp_ratio": 1.267099242600535, "grad/layer_20/attn": 0.003931714687496424, "grad/layer_20/mlp": 0.005586451850831509, "grad/layer_20/attn_mlp_ratio": 0.7037945948700431, "grad/layer_24/attn": 0.005161500535905361, "grad/layer_24/mlp": 0.007942414842545986, "grad/layer_24/attn_mlp_ratio": 0.6498653839219855, "grad/layer_27/attn": 0.004186018370091915, "grad/layer_27/mlp": 0.007083664182573557, "grad/layer_27/attn_mlp_ratio": 0.5909396892777502} {"step": 25450, "timestamp": 1778222043.6385474, "train/loss": 2.2130499124526977, "train/z_loss": 0.0015008003218099474, "train/perplexity": 9.143560970159843, "train/grad_norm": 0.103515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021973.3543711966, "perf/iters_per_sec": 0.9641520282608016, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0371808290481568, "data/tokens_consumed": 53374615552, "data/tokens_consumed_B": 53.374615552, "train/loss_slope": -5.295608339100812e-06} {"step": 25460, "timestamp": 1778222054.0065632, "train/loss": 2.182848405838013, "train/z_loss": 0.0015151059604249894, "train/perplexity": 8.871540050091383, "train/grad_norm": 0.09912109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023596.7432602358, "perf/iters_per_sec": 0.9649261204053096, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0363487720489502, "data/tokens_consumed": 53395587072, "data/tokens_consumed_B": 53.395587072, "train/loss_slope": -6.7102001480894465e-06} {"step": 25470, "timestamp": 1778222064.3870304, "train/loss": 2.1810086011886596, "train/z_loss": 0.0015134362853132187, "train/perplexity": 8.855233154810918, "train/grad_norm": 0.11083984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021374.4116441333, "perf/iters_per_sec": 0.9638664301129023, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0374881505966187, "data/tokens_consumed": 53416558592, "data/tokens_consumed_B": 53.416558592, "train/loss_slope": -7.912251701091747e-06} {"step": 25480, "timestamp": 1778222074.7587972, "train/loss": 2.2241936683654786, "train/z_loss": 0.0014939458575099706, "train/perplexity": 9.246024435355956, "train/grad_norm": 0.177734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023208.4632997178, "perf/iters_per_sec": 0.9647409740923489, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0365476608276367, "data/tokens_consumed": 53437530112, "data/tokens_consumed_B": 53.437530112, "train/loss_slope": -4.156263762324413e-06} {"step": 25490, "timestamp": 1778222085.1254995, "train/loss": 2.202170991897583, "train/z_loss": 0.0014996935962699354, "train/perplexity": 9.044628014389554, "train/grad_norm": 0.173828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024164.117704589, "perf/iters_per_sec": 0.965196665622992, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036058282852173, "data/tokens_consumed": 53458501632, "data/tokens_consumed_B": 53.458501632, "train/loss_slope": -3.784525519621982e-06} {"step": 25500, "timestamp": 1778222095.4893, "grad/layer_0/attn": 0.002652416005730629, "grad/layer_0/mlp": 0.0026252621319144964, "grad/layer_0/attn_mlp_ratio": 1.010343261517281, "grad/layer_4/attn": 0.0016948350239545107, "grad/layer_4/mlp": 0.0025153711903840303, "grad/layer_4/attn_mlp_ratio": 0.6737911935441365, "grad/layer_8/attn": 0.00473161181434989, "grad/layer_8/mlp": 0.004053956363350153, "grad/layer_8/attn_mlp_ratio": 1.1671590104940912, "grad/layer_12/attn": 0.0035261495504528284, "grad/layer_12/mlp": 0.006060351617634296, "grad/layer_12/attn_mlp_ratio": 0.5818390936276034, "grad/layer_16/attn": 0.006713082082569599, "grad/layer_16/mlp": 0.004111425951123238, "grad/layer_16/attn_mlp_ratio": 1.632786774976919, "grad/layer_20/attn": 0.0033203288912773132, "grad/layer_20/mlp": 0.005987707059830427, "grad/layer_20/attn_mlp_ratio": 0.5545242615658154, "grad/layer_24/attn": 0.010653134435415268, "grad/layer_24/mlp": 0.009896618314087391, "grad/layer_24/attn_mlp_ratio": 1.0764418703110763, "grad/layer_27/attn": 0.004628276918083429, "grad/layer_27/mlp": 0.008601071313023567, "grad/layer_27/attn_mlp_ratio": 0.5381046960121023} {"step": 25500, "timestamp": 1778222096.095679, "eos/sharpness": 10.517263412475584, "eos/L0_probe": 2.0323290824890137, "eos/L_plus": 2.099242687225342, "eos/L_minus": 2.0705881118774414, "eos/grad_norm": 0.1191723495721817, "eos/embed_grad_frac": 0.19086547195911407, "eos/time_s": 0.6035158634185791} {"step": 25500, "timestamp": 1778222096.1142874, "train/loss": 2.25527970790863, "train/z_loss": 0.0015061254263855517, "train/perplexity": 9.53796078013555, "train/grad_norm": 0.119140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1909410.1737204075, "perf/iters_per_sec": 0.9104777210809744, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0983245134353639, "data/tokens_consumed": 53479473152, "data/tokens_consumed_B": 53.479473152, "train/loss_slope": -1.6824619282912344e-06} {"step": 25500, "timestamp": 1778222097.4815853, "geo/rankme_last": 440.24517822265625, "geo/layer_0/stable_rank_q_proj": 18.056589126586914, "geo/layer_0/stable_rank_k_proj": 15.802480697631836, "geo/layer_0/stable_rank_o_proj": 51.17808151245117, "geo/layer_0/stable_rank_gate_proj": 146.4164276123047, "geo/layer_0/stable_rank_down_proj": 50.87211608886719, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.050229545682668686, "geo/layer_0/attn_entropy_mean": 6.243775844573975, "geo/layer_0/attn_entropy_std": 0.3383851945400238, "geo/layer_7/stable_rank_q_proj": 42.72165298461914, "geo/layer_7/stable_rank_k_proj": 41.73371505737305, "geo/layer_7/stable_rank_o_proj": 107.59416961669922, "geo/layer_7/stable_rank_gate_proj": 99.10416412353516, "geo/layer_7/stable_rank_down_proj": 148.40118408203125, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5492346882820129, "geo/layer_7/attn_entropy_mean": 4.629219055175781, "geo/layer_7/attn_entropy_std": 0.8078653216362, "geo/layer_14/stable_rank_q_proj": 56.72389221191406, "geo/layer_14/stable_rank_k_proj": 34.89673614501953, "geo/layer_14/stable_rank_o_proj": 54.51793670654297, "geo/layer_14/stable_rank_gate_proj": 83.21795654296875, "geo/layer_14/stable_rank_down_proj": 135.15260314941406, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3872213661670685, "geo/layer_14/attn_entropy_mean": 5.4841837882995605, "geo/layer_14/attn_entropy_std": 0.44545987248420715, "geo/layer_21/stable_rank_q_proj": 46.08536148071289, "geo/layer_21/stable_rank_k_proj": 31.51669692993164, "geo/layer_21/stable_rank_o_proj": 81.29914855957031, "geo/layer_21/stable_rank_gate_proj": 82.43695831298828, "geo/layer_21/stable_rank_down_proj": 58.815826416015625, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15267117321491241, "geo/layer_21/attn_entropy_mean": 5.707756042480469, "geo/layer_21/attn_entropy_std": 0.28664666414260864, "geo/layer_27/stable_rank_q_proj": 41.323158264160156, "geo/layer_27/stable_rank_k_proj": 31.552637100219727, "geo/layer_27/stable_rank_o_proj": 119.3042984008789, "geo/layer_27/stable_rank_gate_proj": 89.812255859375, "geo/layer_27/stable_rank_down_proj": 136.4346923828125, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07926680892705917, "geo/layer_27/attn_entropy_mean": 4.345620155334473, "geo/layer_27/attn_entropy_std": 0.583993136882782, "attnres/final_alpha/block_0": 0.24054935574531555, "attnres/block_norm/0": 1.6568589210510254, "attnres/final_alpha/block_1": 0.006256772670894861, "attnres/block_norm/1": 32775.9296875, "attnres/final_alpha/block_2": 0.013007125817239285, "attnres/block_norm/2": 22857.7265625, "attnres/final_alpha/block_3": 0.015014205127954483, "attnres/block_norm/3": 35206.90234375, "attnres/final_alpha/block_4": 0.01917678862810135, "attnres/block_norm/4": 10459.62890625, "attnres/final_alpha/block_5": 0.5773429870605469, "attnres/block_norm/5": 5376.55419921875, "attnres/final_alpha/block_6": 0.12865281105041504, "attnres/block_norm/6": 23376.12890625, "geo/tier1_time_s": 1.363572120666504, "geo/step": 25500.0, "geo/rankme_slope": -0.00016476811427696078} {"step": 25500, "timestamp": 1778222104.5512824, "geo/ww_alpha_mean": 8.028736828085387, "geo/ww_alpha_std": 5.025566834354331, "geo/ww_alpha_min": 1.354095622544355, "geo/ww_alpha_max": 42.68095185000542, "geo/ww_alpha_healthy_frac": 0.14720812182741116, "geo/ww_alpha_by_type/q_proj": 4.2033599709030245, "geo/ww_alpha_by_type/k_proj": 4.707643739800386, "geo/ww_alpha_by_type/v_proj": 7.946319284750742, "geo/ww_alpha_by_type/o_proj": 8.135185966731934, "geo/ww_alpha_by_type/gate_proj": 8.353936399975286, "geo/ww_alpha_by_type/up_proj": 14.218019878064922, "geo/ww_alpha_by_type/down_proj": 8.762973250641606, "geo/twonn_id/layer_0": 0.7397857904434204, "geo/twonn_id/layer_7": 3.0058751106262207, "geo/twonn_id/layer_14": 4.614168167114258, "geo/twonn_id/layer_21": 7.216447830200195, "geo/twonn_id/layer_27": 5.7529215812683105, "geo/tier2_time_s": 7.063687562942505} {"step": 25500, "timestamp": 1778222105.2452614, "eoc/jacobian_sigma/layer_0/attn": 1066.9635009765625, "eoc/jacobian_sigma/layer_0/mlp": 6185.3935546875, "eoc/jacobian_sigma/layer_0": 6185.3935546875, "eoc/jacobian_sigma/layer_7/attn": 1.1534188985824585, "eoc/jacobian_sigma/layer_7/mlp": 1.6808000802993774, "eoc/jacobian_sigma/layer_7": 1.6808000802993774, "eoc/jacobian_sigma/layer_14/attn": 1.6103816032409668, "eoc/jacobian_sigma/layer_14/mlp": 7.616322994232178, "eoc/jacobian_sigma/layer_14": 7.616322994232178, "eoc/jacobian_sigma/layer_21/attn": 1.0845670700073242, "eoc/jacobian_sigma/layer_21/mlp": 4.207259654998779, "eoc/jacobian_sigma/layer_21": 4.207259654998779, "eoc/jacobian_sigma/layer_27/attn": 3.6524713039398193, "eoc/jacobian_sigma/layer_27/mlp": 25.84589958190918, "eoc/jacobian_sigma/layer_27": 25.84589958190918, "eoc/layer0_sigma": 6185.3935546875, "eoc/sigma_max": 25.84589958190918, "eoc/sigma_min": 1.6808000802993774, "eoc/sigma_mean": 9.837570577859879, "eoc/time_s": 0.68650221824646} {"step": 25510, "timestamp": 1778222115.6329353, "train/loss": 2.2006297588348387, "train/z_loss": 0.001510852633509785, "train/perplexity": 9.030698871438652, "train/grad_norm": 0.1318359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1074643.0050365846, "perf/iters_per_sec": 0.5124297166045115, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.951487135887146, "data/tokens_consumed": 53500444672, "data/tokens_consumed_B": 53.500444672, "train/loss_slope": 1.823956981421995e-06} {"step": 25520, "timestamp": 1778222126.0238676, "train/loss": 2.1717233419418336, "train/z_loss": 0.0015020868740975857, "train/perplexity": 8.773390572087518, "train/grad_norm": 0.2373046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019593.1578138084, "perf/iters_per_sec": 0.9630170620984118, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0384032011032105, "data/tokens_consumed": 53521416192, "data/tokens_consumed_B": 53.521416192, "train/loss_slope": -4.918330644939095e-07} {"step": 25530, "timestamp": 1778222136.3935103, "train/loss": 2.2000386476516725, "train/z_loss": 0.001513126923236996, "train/perplexity": 9.025362301752338, "train/grad_norm": 0.134765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023586.2220836165, "perf/iters_per_sec": 0.9649211035173495, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036354160308838, "data/tokens_consumed": 53542387712, "data/tokens_consumed_B": 53.542387712, "train/loss_slope": -2.2406014815272454e-06} {"step": 25540, "timestamp": 1778222146.7694588, "train/loss": 2.214667820930481, "train/z_loss": 0.0015058447374030949, "train/perplexity": 9.158366388647046, "train/grad_norm": 0.134765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022396.5919179532, "perf/iters_per_sec": 0.9643538436498419, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0369637727737426, "data/tokens_consumed": 53563359232, "data/tokens_consumed_B": 53.563359232, "train/loss_slope": 9.269790037093747e-07} {"step": 25550, "timestamp": 1778222157.1295426, "grad/layer_0/attn": 0.004604455083608627, "grad/layer_0/mlp": 0.0035246701445430517, "grad/layer_0/attn_mlp_ratio": 1.306350598538208, "grad/layer_4/attn": 0.0024742083624005318, "grad/layer_4/mlp": 0.002532415557652712, "grad/layer_4/attn_mlp_ratio": 0.977015110028923, "grad/layer_8/attn": 0.004794301465153694, "grad/layer_8/mlp": 0.003929812926799059, "grad/layer_8/attn_mlp_ratio": 1.219982078653443, "grad/layer_12/attn": 0.005420183762907982, "grad/layer_12/mlp": 0.007292633410543203, "grad/layer_12/attn_mlp_ratio": 0.7432409369087095, "grad/layer_16/attn": 0.004538797307759523, "grad/layer_16/mlp": 0.005745031405240297, "grad/layer_16/attn_mlp_ratio": 0.7900387149521262, "grad/layer_20/attn": 0.007297459989786148, "grad/layer_20/mlp": 0.007838360033929348, "grad/layer_20/attn_mlp_ratio": 0.9309932007586837, "grad/layer_24/attn": 0.019928226247429848, "grad/layer_24/mlp": 0.014799668453633785, "grad/layer_24/attn_mlp_ratio": 1.3465319290908606, "grad/layer_27/attn": 0.00850533414632082, "grad/layer_27/mlp": 0.014767419546842575, "grad/layer_27/attn_mlp_ratio": 0.5759526274544076} {"step": 25550, "timestamp": 1778222157.1455045, "train/loss": 2.2177663087844848, "train/z_loss": 0.0015054127899929882, "train/perplexity": 9.186787484136014, "train/grad_norm": 0.2470703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022204.8950561245, "perf/iters_per_sec": 0.964262435463011, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0370620727539062, "data/tokens_consumed": 53584330752, "data/tokens_consumed_B": 53.584330752, "train/loss_slope": 1.4286034607221822e-06} {"step": 25560, "timestamp": 1778222167.5207114, "train/loss": 2.230587935447693, "train/z_loss": 0.0015095503069460392, "train/perplexity": 9.305335408059035, "train/grad_norm": 0.126953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022549.2128795842, "perf/iters_per_sec": 0.9644266189954682, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0368855237960815, "data/tokens_consumed": 53605302272, "data/tokens_consumed_B": 53.605302272, "train/loss_slope": 4.158431597382606e-06} {"step": 25570, "timestamp": 1778222177.8929718, "train/loss": 2.178451681137085, "train/z_loss": 0.0015060951700434089, "train/perplexity": 8.832619953989441, "train/grad_norm": 0.205078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023396.0218025746, "perf/iters_per_sec": 0.964830408955848, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0364515781402588, "data/tokens_consumed": 53626273792, "data/tokens_consumed_B": 53.626273792, "train/loss_slope": -7.700562942074588e-07} {"step": 25575, "timestamp": 1778222183.7899313, "eos/sharpness": 12.364840507507322, "eos/L0_probe": 2.0363388061523438, "eos/L_plus": 2.1004891395568848, "eos/L_minus": 2.095836877822876, "eos/grad_norm": 0.12980404496192932, "eos/embed_grad_frac": 0.1486882120370865, "eos/time_s": 0.7248499393463135} {"step": 25575, "timestamp": 1778222185.1701736, "geo/rankme_last": 440.3060607910156, "geo/layer_0/stable_rank_q_proj": 18.044837951660156, "geo/layer_0/stable_rank_k_proj": 15.784534454345703, "geo/layer_0/stable_rank_o_proj": 51.13712692260742, "geo/layer_0/stable_rank_gate_proj": 146.47360229492188, "geo/layer_0/stable_rank_down_proj": 50.91973876953125, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05197178199887276, "geo/layer_0/attn_entropy_mean": 6.237862586975098, "geo/layer_0/attn_entropy_std": 0.33832889795303345, "geo/layer_7/stable_rank_q_proj": 42.68717956542969, "geo/layer_7/stable_rank_k_proj": 42.021385192871094, "geo/layer_7/stable_rank_o_proj": 107.62081909179688, "geo/layer_7/stable_rank_gate_proj": 99.11961364746094, "geo/layer_7/stable_rank_down_proj": 148.6793975830078, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.536938488483429, "geo/layer_7/attn_entropy_mean": 4.645431041717529, "geo/layer_7/attn_entropy_std": 0.8247663974761963, "geo/layer_14/stable_rank_q_proj": 56.518917083740234, "geo/layer_14/stable_rank_k_proj": 34.93782043457031, "geo/layer_14/stable_rank_o_proj": 54.55208969116211, "geo/layer_14/stable_rank_gate_proj": 83.16809844970703, "geo/layer_14/stable_rank_down_proj": 135.05691528320312, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.386606901884079, "geo/layer_14/attn_entropy_mean": 5.504544258117676, "geo/layer_14/attn_entropy_std": 0.41974666714668274, "geo/layer_21/stable_rank_q_proj": 46.05339431762695, "geo/layer_21/stable_rank_k_proj": 31.417802810668945, "geo/layer_21/stable_rank_o_proj": 81.18263244628906, "geo/layer_21/stable_rank_gate_proj": 82.09017944335938, "geo/layer_21/stable_rank_down_proj": 58.900001525878906, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15112179517745972, "geo/layer_21/attn_entropy_mean": 5.723275184631348, "geo/layer_21/attn_entropy_std": 0.2873234152793884, "geo/layer_27/stable_rank_q_proj": 41.39944076538086, "geo/layer_27/stable_rank_k_proj": 31.44286346435547, "geo/layer_27/stable_rank_o_proj": 119.33924865722656, "geo/layer_27/stable_rank_gate_proj": 89.99555969238281, "geo/layer_27/stable_rank_down_proj": 136.38450622558594, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08302395790815353, "geo/layer_27/attn_entropy_mean": 4.372245788574219, "geo/layer_27/attn_entropy_std": 0.5784626603126526, "attnres/final_alpha/block_0": 0.241281658411026, "attnres/block_norm/0": 1.6574064493179321, "attnres/final_alpha/block_1": 0.006333284080028534, "attnres/block_norm/1": 32888.3671875, "attnres/final_alpha/block_2": 0.013171002268791199, "attnres/block_norm/2": 22976.76171875, "attnres/final_alpha/block_3": 0.015245998278260231, "attnres/block_norm/3": 35004.2734375, "attnres/final_alpha/block_4": 0.019086483865976334, "attnres/block_norm/4": 10489.9501953125, "attnres/final_alpha/block_5": 0.575046956539154, "attnres/block_norm/5": 5379.82080078125, "attnres/final_alpha/block_6": 0.12983465194702148, "attnres/block_norm/6": 23461.09375, "geo/tier1_time_s": 1.3611481189727783, "geo/step": 25575.0, "geo/rankme_slope": -0.00014489231239370748} {"step": 25580, "timestamp": 1778222190.3542268, "train/loss": 2.2187536478042604, "train/z_loss": 0.0014947505900636316, "train/perplexity": 9.19586243717589, "train/grad_norm": 0.26171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1683714.2247689161, "perf/iters_per_sec": 0.8028575061649876, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2455510377883912, "data/tokens_consumed": 53647245312, "data/tokens_consumed_B": 53.647245312, "train/loss_slope": 2.1040222074690202e-06} {"step": 25590, "timestamp": 1778222200.716843, "train/loss": 2.180996370315552, "train/z_loss": 0.0015073847491294146, "train/perplexity": 8.855124848240207, "train/grad_norm": 0.1845703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024878.0717250453, "perf/iters_per_sec": 0.9655371054291941, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0356929779052735, "data/tokens_consumed": 53668216832, "data/tokens_consumed_B": 53.668216832, "train/loss_slope": 8.956391068146086e-07} {"step": 25600, "timestamp": 1778222211.0672793, "grad/layer_0/attn": 0.0028120719362050295, "grad/layer_0/mlp": 0.0028842065948992968, "grad/layer_0/attn_mlp_ratio": 0.9749897402215184, "grad/layer_4/attn": 0.0023086979053914547, "grad/layer_4/mlp": 0.0025075855664908886, "grad/layer_4/attn_mlp_ratio": 0.9206855567260611, "grad/layer_8/attn": 0.004139150958508253, "grad/layer_8/mlp": 0.0035188852343708277, "grad/layer_8/attn_mlp_ratio": 1.176267643074061, "grad/layer_12/attn": 0.004428664222359657, "grad/layer_12/mlp": 0.006361154839396477, "grad/layer_12/attn_mlp_ratio": 0.696204425855383, "grad/layer_16/attn": 0.0038422842044383287, "grad/layer_16/mlp": 0.004026442766189575, "grad/layer_16/attn_mlp_ratio": 0.954262690947972, "grad/layer_20/attn": 0.004055304918438196, "grad/layer_20/mlp": 0.005812520161271095, "grad/layer_20/attn_mlp_ratio": 0.6976844356928527, "grad/layer_24/attn": 0.011274915188550949, "grad/layer_24/mlp": 0.008971448056399822, "grad/layer_24/attn_mlp_ratio": 1.2567553188732343, "grad/layer_27/attn": 0.005713034421205521, "grad/layer_27/mlp": 0.008411956019699574, "grad/layer_27/attn_mlp_ratio": 0.6791564696618457} {"step": 25600, "timestamp": 1778222211.0826452, "train/loss": 2.2296300411224363, "train/z_loss": 0.0014953264617361128, "train/perplexity": 9.29642614782319, "train/grad_norm": 0.1279296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024293.9451325026, "perf/iters_per_sec": 0.9652585721647752, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0359918355941773, "data/tokens_consumed": 53689188352, "data/tokens_consumed_B": 53.689188352, "train/loss_slope": 3.4323229147370072e-06} {"step": 25610, "timestamp": 1778222221.4594934, "train/loss": 2.2238152980804444, "train/z_loss": 0.001497145986650139, "train/perplexity": 9.242526676220708, "train/grad_norm": 0.130859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022048.0961002447, "perf/iters_per_sec": 0.9641876678944801, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0371424913406373, "data/tokens_consumed": 53710159872, "data/tokens_consumed_B": 53.710159872, "train/loss_slope": 4.2554693754249545e-06} {"step": 25620, "timestamp": 1778222231.8276448, "train/loss": 2.1988316774368286, "train/z_loss": 0.0015070101711899043, "train/perplexity": 9.014475529602443, "train/grad_norm": 0.19921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024330.5625524302, "perf/iters_per_sec": 0.9652760327112342, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0359730958938598, "data/tokens_consumed": 53731131392, "data/tokens_consumed_B": 53.731131392, "train/loss_slope": 7.06087806151146e-06} {"step": 25630, "timestamp": 1778222242.1878605, "train/loss": 2.226902794837952, "train/z_loss": 0.001492866931948811, "train/perplexity": 9.271107045559695, "train/grad_norm": 0.1533203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025336.0090867458, "perf/iters_per_sec": 0.9657554669793824, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354588031768799, "data/tokens_consumed": 53752102912, "data/tokens_consumed_B": 53.752102912, "train/loss_slope": 6.702681092789643e-06} {"step": 25640, "timestamp": 1778222252.5694537, "train/loss": 2.1837393999099732, "train/z_loss": 0.0015056277392432094, "train/perplexity": 8.879448062157948, "train/grad_norm": 0.134765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021159.037019356, "perf/iters_per_sec": 0.9637637314888744, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0375987052917481, "data/tokens_consumed": 53773074432, "data/tokens_consumed_B": 53.773074432, "train/loss_slope": 5.399551292886437e-06} {"step": 25650, "timestamp": 1778222262.937467, "grad/layer_0/attn": 0.00240090349689126, "grad/layer_0/mlp": 0.002483950462192297, "grad/layer_0/attn_mlp_ratio": 0.9665665385756548, "grad/layer_4/attn": 0.00183543108869344, "grad/layer_4/mlp": 0.002532878890633583, "grad/layer_4/attn_mlp_ratio": 0.7246422333955713, "grad/layer_8/attn": 0.0038019255734980106, "grad/layer_8/mlp": 0.0038245010655373335, "grad/layer_8/attn_mlp_ratio": 0.9940971146138607, "grad/layer_12/attn": 0.004668523091822863, "grad/layer_12/mlp": 0.006166448350995779, "grad/layer_12/attn_mlp_ratio": 0.7570845891153074, "grad/layer_16/attn": 0.004551774822175503, "grad/layer_16/mlp": 0.0043959361501038074, "grad/layer_16/attn_mlp_ratio": 1.035450598736052, "grad/layer_20/attn": 0.007419017609208822, "grad/layer_20/mlp": 0.006108683533966541, "grad/layer_20/attn_mlp_ratio": 1.214503492692982, "grad/layer_24/attn": 0.01806510053575039, "grad/layer_24/mlp": 0.012929282151162624, "grad/layer_24/attn_mlp_ratio": 1.397223773510393, "grad/layer_27/attn": 0.004209187347441912, "grad/layer_27/mlp": 0.012250254862010479, "grad/layer_27/attn_mlp_ratio": 0.34359997897963024} {"step": 25650, "timestamp": 1778222263.5484319, "eos/sharpness": 45.63429355621337, "eos/L0_probe": 2.0325424671173096, "eos/L_plus": 2.242368459701538, "eos/L_minus": 2.279059410095215, "eos/grad_norm": 0.1745646446943283, "eos/embed_grad_frac": 0.0959174707531929, "eos/time_s": 0.6079111099243164} {"step": 25650, "timestamp": 1778222263.567354, "train/loss": 2.2545082569122314, "train/z_loss": 0.0014907473116181791, "train/perplexity": 9.530605548253366, "train/grad_norm": 0.173828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1908369.14120934, "perf/iters_per_sec": 0.9099813180967998, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.098923659324646, "data/tokens_consumed": 53794045952, "data/tokens_consumed_B": 53.794045952, "train/loss_slope": 9.742362392653337e-06} {"step": 25650, "timestamp": 1778222264.935439, "geo/rankme_last": 440.1024169921875, "geo/layer_0/stable_rank_q_proj": 18.032554626464844, "geo/layer_0/stable_rank_k_proj": 15.837271690368652, "geo/layer_0/stable_rank_o_proj": 50.97578811645508, "geo/layer_0/stable_rank_gate_proj": 146.53810119628906, "geo/layer_0/stable_rank_down_proj": 50.94174575805664, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05278369411826134, "geo/layer_0/attn_entropy_mean": 6.237682819366455, "geo/layer_0/attn_entropy_std": 0.33617547154426575, "geo/layer_7/stable_rank_q_proj": 42.640594482421875, "geo/layer_7/stable_rank_k_proj": 42.07736587524414, "geo/layer_7/stable_rank_o_proj": 107.51385498046875, "geo/layer_7/stable_rank_gate_proj": 99.0976333618164, "geo/layer_7/stable_rank_down_proj": 148.6247100830078, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.537828803062439, "geo/layer_7/attn_entropy_mean": 4.633593559265137, "geo/layer_7/attn_entropy_std": 0.8194338083267212, "geo/layer_14/stable_rank_q_proj": 56.299861907958984, "geo/layer_14/stable_rank_k_proj": 35.01587677001953, "geo/layer_14/stable_rank_o_proj": 54.291500091552734, "geo/layer_14/stable_rank_gate_proj": 83.14804077148438, "geo/layer_14/stable_rank_down_proj": 134.95785522460938, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3618859648704529, "geo/layer_14/attn_entropy_mean": 5.525146007537842, "geo/layer_14/attn_entropy_std": 0.42560794949531555, "geo/layer_21/stable_rank_q_proj": 46.14203643798828, "geo/layer_21/stable_rank_k_proj": 31.24742889404297, "geo/layer_21/stable_rank_o_proj": 81.26215362548828, "geo/layer_21/stable_rank_gate_proj": 82.12615203857422, "geo/layer_21/stable_rank_down_proj": 58.908058166503906, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15121299028396606, "geo/layer_21/attn_entropy_mean": 5.721963882446289, "geo/layer_21/attn_entropy_std": 0.29549404978752136, "geo/layer_27/stable_rank_q_proj": 41.443504333496094, "geo/layer_27/stable_rank_k_proj": 31.357091903686523, "geo/layer_27/stable_rank_o_proj": 119.23038482666016, "geo/layer_27/stable_rank_gate_proj": 90.09907531738281, "geo/layer_27/stable_rank_down_proj": 136.32595825195312, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08743716031312943, "geo/layer_27/attn_entropy_mean": 4.34217643737793, "geo/layer_27/attn_entropy_std": 0.5852675437927246, "attnres/final_alpha/block_0": 0.24115104973316193, "attnres/block_norm/0": 1.6579606533050537, "attnres/final_alpha/block_1": 0.006199754308909178, "attnres/block_norm/1": 32990.83984375, "attnres/final_alpha/block_2": 0.013053103350102901, "attnres/block_norm/2": 23036.703125, "attnres/final_alpha/block_3": 0.015210584737360477, "attnres/block_norm/3": 35206.4765625, "attnres/final_alpha/block_4": 0.01917935162782669, "attnres/block_norm/4": 10510.96484375, "attnres/final_alpha/block_5": 0.5757354497909546, "attnres/block_norm/5": 5385.1005859375, "attnres/final_alpha/block_6": 0.12947067618370056, "attnres/block_norm/6": 23452.7734375, "geo/tier1_time_s": 1.364356279373169, "geo/step": 25650.0, "geo/rankme_slope": -0.00014059787977691076} {"step": 25660, "timestamp": 1778222275.3091567, "train/loss": 2.201276731491089, "train/z_loss": 0.001498770690523088, "train/perplexity": 9.036543377089114, "train/grad_norm": 0.119140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1786630.95464394, "perf/iters_per_sec": 0.8519320271701527, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1738025665283203, "data/tokens_consumed": 53815017472, "data/tokens_consumed_B": 53.815017472, "train/loss_slope": 1.0126448557464893e-05} {"step": 25670, "timestamp": 1778222285.687577, "train/loss": 2.216472864151001, "train/z_loss": 0.0015076523297466338, "train/perplexity": 9.174912564596738, "train/grad_norm": 0.1533203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021953.8796664206, "perf/iters_per_sec": 0.9641427419979194, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037190818786621, "data/tokens_consumed": 53835988992, "data/tokens_consumed_B": 53.835988992, "train/loss_slope": 7.864889109989534e-06} {"step": 25680, "timestamp": 1778222296.068227, "train/loss": 2.1897512912750243, "train/z_loss": 0.001507397962268442, "train/perplexity": 8.932991125563102, "train/grad_norm": 0.099609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021757.1550582848, "perf/iters_per_sec": 0.9640489363948272, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0372917413711549, "data/tokens_consumed": 53856960512, "data/tokens_consumed_B": 53.856960512, "train/loss_slope": 5.974186428881563e-06} {"step": 25690, "timestamp": 1778222306.4529588, "train/loss": 2.21177453994751, "train/z_loss": 0.0015142093761824071, "train/perplexity": 9.131906957082597, "train/grad_norm": 0.2333984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020883.487605494, "perf/iters_per_sec": 0.9636323392894239, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0377401828765869, "data/tokens_consumed": 53877932032, "data/tokens_consumed_B": 53.877932032, "train/loss_slope": 8.039834697504813e-06} {"step": 25700, "timestamp": 1778222316.807012, "grad/layer_0/attn": 0.0023594556841999292, "grad/layer_0/mlp": 0.0024431596975773573, "grad/layer_0/attn_mlp_ratio": 0.9657394029402296, "grad/layer_4/attn": 0.00172700104303658, "grad/layer_4/mlp": 0.0023930540774017572, "grad/layer_4/attn_mlp_ratio": 0.7216723546608794, "grad/layer_8/attn": 0.007499643135815859, "grad/layer_8/mlp": 0.0035338522866368294, "grad/layer_8/attn_mlp_ratio": 2.122228750746795, "grad/layer_12/attn": 0.0047616115771234035, "grad/layer_12/mlp": 0.006389884743839502, "grad/layer_12/attn_mlp_ratio": 0.7451795601158729, "grad/layer_16/attn": 0.0038616093806922436, "grad/layer_16/mlp": 0.004346638452261686, "grad/layer_16/attn_mlp_ratio": 0.8884128123979697, "grad/layer_20/attn": 0.011594348587095737, "grad/layer_20/mlp": 0.005212181247770786, "grad/layer_20/attn_mlp_ratio": 2.224471447459241, "grad/layer_24/attn": 0.004833687096834183, "grad/layer_24/mlp": 0.007594354450702667, "grad/layer_24/attn_mlp_ratio": 0.6364842547925217, "grad/layer_27/attn": 0.006364135071635246, "grad/layer_27/mlp": 0.0065930248238146305, "grad/layer_27/attn_mlp_ratio": 0.9652830294403086} {"step": 25700, "timestamp": 1778222316.8226786, "train/loss": 2.1467617988586425, "train/z_loss": 0.0015181247843429446, "train/perplexity": 8.557103860556554, "train/grad_norm": 0.08740234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023610.150935887, "perf/iters_per_sec": 0.9649325136832652, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036341905593872, "data/tokens_consumed": 53898903552, "data/tokens_consumed_B": 53.898903552, "train/loss_slope": 6.4283086843688325e-06} {"step": 25710, "timestamp": 1778222327.1917317, "train/loss": 2.201452040672302, "train/z_loss": 0.0015039267600513995, "train/perplexity": 9.038127704979104, "train/grad_norm": 0.193359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023545.0229538572, "perf/iters_per_sec": 0.9649014582413946, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0363752603530885, "data/tokens_consumed": 53919875072, "data/tokens_consumed_B": 53.919875072, "train/loss_slope": 6.5638370210617446e-06} {"step": 25720, "timestamp": 1778222337.5579038, "train/loss": 2.1856600284576415, "train/z_loss": 0.001503841020166874, "train/perplexity": 8.896518571399858, "train/grad_norm": 0.1787109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024017.726629199, "perf/iters_per_sec": 0.9651268609186168, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0361332178115845, "data/tokens_consumed": 53940846592, "data/tokens_consumed_B": 53.940846592, "train/loss_slope": 3.787549808867311e-06} {"step": 25725, "timestamp": 1778222343.3328786, "eos/sharpness": 22.936344146728512, "eos/L0_probe": 2.0364959239959717, "eos/L_plus": 2.141789674758911, "eos/L_minus": 2.1605656147003174, "eos/grad_norm": 0.11082417517900467, "eos/embed_grad_frac": 0.16778315603733063, "eos/time_s": 0.6039004325866699} {"step": 25725, "timestamp": 1778222344.71079, "geo/rankme_last": 440.2779235839844, "geo/layer_0/stable_rank_q_proj": 18.043567657470703, "geo/layer_0/stable_rank_k_proj": 15.8311128616333, "geo/layer_0/stable_rank_o_proj": 50.98957824707031, "geo/layer_0/stable_rank_gate_proj": 146.11215209960938, "geo/layer_0/stable_rank_down_proj": 50.87675094604492, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05148737505078316, "geo/layer_0/attn_entropy_mean": 6.236635208129883, "geo/layer_0/attn_entropy_std": 0.338756799697876, "geo/layer_7/stable_rank_q_proj": 42.62134552001953, "geo/layer_7/stable_rank_k_proj": 41.86298370361328, "geo/layer_7/stable_rank_o_proj": 107.63910675048828, "geo/layer_7/stable_rank_gate_proj": 99.02130889892578, "geo/layer_7/stable_rank_down_proj": 148.4840545654297, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5353342294692993, "geo/layer_7/attn_entropy_mean": 4.626043319702148, "geo/layer_7/attn_entropy_std": 0.8284541368484497, "geo/layer_14/stable_rank_q_proj": 56.21930694580078, "geo/layer_14/stable_rank_k_proj": 34.881629943847656, "geo/layer_14/stable_rank_o_proj": 54.330169677734375, "geo/layer_14/stable_rank_gate_proj": 83.10601806640625, "geo/layer_14/stable_rank_down_proj": 134.89468383789062, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37581661343574524, "geo/layer_14/attn_entropy_mean": 5.4740447998046875, "geo/layer_14/attn_entropy_std": 0.44080087542533875, "geo/layer_21/stable_rank_q_proj": 46.21877670288086, "geo/layer_21/stable_rank_k_proj": 31.214086532592773, "geo/layer_21/stable_rank_o_proj": 81.23441314697266, "geo/layer_21/stable_rank_gate_proj": 82.29847717285156, "geo/layer_21/stable_rank_down_proj": 58.9002571105957, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15081165730953217, "geo/layer_21/attn_entropy_mean": 5.73052978515625, "geo/layer_21/attn_entropy_std": 0.29509416222572327, "geo/layer_27/stable_rank_q_proj": 41.52882766723633, "geo/layer_27/stable_rank_k_proj": 31.292879104614258, "geo/layer_27/stable_rank_o_proj": 119.19758605957031, "geo/layer_27/stable_rank_gate_proj": 90.1427993774414, "geo/layer_27/stable_rank_down_proj": 136.28646850585938, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07546167075634003, "geo/layer_27/attn_entropy_mean": 4.333699703216553, "geo/layer_27/attn_entropy_std": 0.5837867259979248, "attnres/final_alpha/block_0": 0.24172136187553406, "attnres/block_norm/0": 1.6587494611740112, "attnres/final_alpha/block_1": 0.006242323666810989, "attnres/block_norm/1": 32998.03515625, "attnres/final_alpha/block_2": 0.013029972091317177, "attnres/block_norm/2": 22972.55859375, "attnres/final_alpha/block_3": 0.01513439230620861, "attnres/block_norm/3": 35202.703125, "attnres/final_alpha/block_4": 0.01899093948304653, "attnres/block_norm/4": 10497.5859375, "attnres/final_alpha/block_5": 0.5751384496688843, "attnres/block_norm/5": 5396.9384765625, "attnres/final_alpha/block_6": 0.1297425925731659, "attnres/block_norm/6": 23501.140625, "geo/tier1_time_s": 1.3560936450958252, "geo/step": 25725.0, "geo/rankme_slope": -0.00014449473148634453} {"step": 25730, "timestamp": 1778222349.8943489, "train/loss": 2.1615792751312255, "train/z_loss": 0.0015063768136315049, "train/perplexity": 8.684842589606896, "train/grad_norm": 0.11865234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1700635.158513819, "perf/iters_per_sec": 0.8109260361260505, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2331580877304078, "data/tokens_consumed": 53961818112, "data/tokens_consumed_B": 53.961818112, "train/loss_slope": 2.724799829454987e-06} {"step": 25740, "timestamp": 1778222360.2580848, "train/loss": 2.190301203727722, "train/z_loss": 0.0015167313278652728, "train/perplexity": 8.937904839555317, "train/grad_norm": 0.296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024394.6229296492, "perf/iters_per_sec": 0.9653065790794607, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0359403133392333, "data/tokens_consumed": 53982789632, "data/tokens_consumed_B": 53.982789632, "train/loss_slope": 3.0814650392804026e-06} {"step": 25750, "timestamp": 1778222370.6109679, "grad/layer_0/attn": 0.00252595660276711, "grad/layer_0/mlp": 0.0026304845232516527, "grad/layer_0/attn_mlp_ratio": 0.9602628277844364, "grad/layer_4/attn": 0.0020225520711392164, "grad/layer_4/mlp": 0.0026357604656368494, "grad/layer_4/attn_mlp_ratio": 0.7673504556930534, "grad/layer_8/attn": 0.011092282831668854, "grad/layer_8/mlp": 0.0038525033742189407, "grad/layer_8/attn_mlp_ratio": 2.8792401891130575, "grad/layer_12/attn": 0.0035506568383425474, "grad/layer_12/mlp": 0.0058276355266571045, "grad/layer_12/attn_mlp_ratio": 0.6092791426596629, "grad/layer_16/attn": 0.0036642157938331366, "grad/layer_16/mlp": 0.004289483185857534, "grad/layer_16/attn_mlp_ratio": 0.8542324447128842, "grad/layer_20/attn": 0.005036414135247469, "grad/layer_20/mlp": 0.005743410438299179, "grad/layer_20/attn_mlp_ratio": 0.8769030355157102, "grad/layer_24/attn": 0.006178320851176977, "grad/layer_24/mlp": 0.007931195199489594, "grad/layer_24/attn_mlp_ratio": 0.7789898770459704, "grad/layer_27/attn": 0.005593458656221628, "grad/layer_27/mlp": 0.007253620307892561, "grad/layer_27/attn_mlp_ratio": 0.7711264639841735} {"step": 25750, "timestamp": 1778222370.6267347, "train/loss": 2.1858224868774414, "train/z_loss": 0.001500357873737812, "train/perplexity": 8.897964003156789, "train/grad_norm": 0.103515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023493.770784933, "perf/iters_per_sec": 0.964877019302813, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0364015102386475, "data/tokens_consumed": 54003761152, "data/tokens_consumed_B": 54.003761152, "train/loss_slope": 2.395861906365805e-06} {"step": 25760, "timestamp": 1778222380.9908273, "train/loss": 2.2611072063446045, "train/z_loss": 0.0014788586995564401, "train/perplexity": 9.593705500041958, "train/grad_norm": 0.1396484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024314.676206283, "perf/iters_per_sec": 0.9652684575110831, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0359812259674073, "data/tokens_consumed": 54024732672, "data/tokens_consumed_B": 54.024732672, "train/loss_slope": 5.782896600397643e-06} {"step": 25770, "timestamp": 1778222391.3576906, "train/loss": 2.2126330614089964, "train/z_loss": 0.0014963006135076285, "train/perplexity": 9.1397502615304, "train/grad_norm": 0.11181640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023934.363510552, "perf/iters_per_sec": 0.9650871102860222, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0361758947372437, "data/tokens_consumed": 54045704192, "data/tokens_consumed_B": 54.045704192, "train/loss_slope": 4.463113337853539e-06} {"step": 25780, "timestamp": 1778222401.7233772, "train/loss": 2.1924289226531983, "train/z_loss": 0.0015067467698827385, "train/perplexity": 8.956942434980355, "train/grad_norm": 0.11962890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024135.098637228, "perf/iters_per_sec": 0.9651828282533779, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036073136329651, "data/tokens_consumed": 54066675712, "data/tokens_consumed_B": 54.066675712, "train/loss_slope": 3.765943534661079e-06} {"step": 25790, "timestamp": 1778222412.0928164, "train/loss": 2.1861892461776735, "train/z_loss": 0.00151450497796759, "train/perplexity": 8.901228012724422, "train/grad_norm": 0.2578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023718.5827300453, "perf/iters_per_sec": 0.9649842179918505, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0362863779067992, "data/tokens_consumed": 54087647232, "data/tokens_consumed_B": 54.087647232, "train/loss_slope": 3.4153657527503516e-06} {"step": 25800, "timestamp": 1778222422.4503858, "grad/layer_0/attn": 0.00268101436085999, "grad/layer_0/mlp": 0.0028696339577436447, "grad/layer_0/attn_mlp_ratio": 0.9342704703498105, "grad/layer_4/attn": 0.0018765126587823033, "grad/layer_4/mlp": 0.002686304273083806, "grad/layer_4/attn_mlp_ratio": 0.6985480415341455, "grad/layer_8/attn": 0.00403615552932024, "grad/layer_8/mlp": 0.003919944632798433, "grad/layer_8/attn_mlp_ratio": 1.029646029330328, "grad/layer_12/attn": 0.004999466706067324, "grad/layer_12/mlp": 0.006638708058744669, "grad/layer_12/attn_mlp_ratio": 0.753078247532527, "grad/layer_16/attn": 0.0037333841901272535, "grad/layer_16/mlp": 0.004568800795823336, "grad/layer_16/attn_mlp_ratio": 0.8171474912684868, "grad/layer_20/attn": 0.0044213272631168365, "grad/layer_20/mlp": 0.006741489749401808, "grad/layer_20/attn_mlp_ratio": 0.6558383030880265, "grad/layer_24/attn": 0.017991533502936363, "grad/layer_24/mlp": 0.014420842751860619, "grad/layer_24/attn_mlp_ratio": 1.2476062382591628, "grad/layer_27/attn": 0.0061852773651480675, "grad/layer_27/mlp": 0.013571426272392273, "grad/layer_27/attn_mlp_ratio": 0.4557573533855286} {"step": 25800, "timestamp": 1778222423.057659, "eos/sharpness": 34.80439186096191, "eos/L0_probe": 2.0269217491149902, "eos/L_plus": 2.2036101818084717, "eos/L_minus": 2.198277235031128, "eos/grad_norm": 0.17486189305782318, "eos/embed_grad_frac": 0.0936218872666359, "eos/time_s": 0.6042888164520264} {"step": 25800, "timestamp": 1778222423.077358, "train/loss": 2.2220739126205444, "train/z_loss": 0.0015125325182452798, "train/perplexity": 9.226445880149878, "train/grad_norm": 0.1748046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1910421.9697002375, "perf/iters_per_sec": 0.9109601830006778, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0977428197860717, "data/tokens_consumed": 54108618752, "data/tokens_consumed_B": 54.108618752, "train/loss_slope": 3.825605083720796e-06} {"step": 25800, "timestamp": 1778222424.440935, "geo/rankme_last": 439.92620849609375, "geo/layer_0/stable_rank_q_proj": 18.06909942626953, "geo/layer_0/stable_rank_k_proj": 15.873537063598633, "geo/layer_0/stable_rank_o_proj": 51.009742736816406, "geo/layer_0/stable_rank_gate_proj": 146.5083770751953, "geo/layer_0/stable_rank_down_proj": 50.849021911621094, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.04855727404356003, "geo/layer_0/attn_entropy_mean": 6.237606048583984, "geo/layer_0/attn_entropy_std": 0.3324478566646576, "geo/layer_7/stable_rank_q_proj": 42.73561477661133, "geo/layer_7/stable_rank_k_proj": 41.838504791259766, "geo/layer_7/stable_rank_o_proj": 107.34042358398438, "geo/layer_7/stable_rank_gate_proj": 98.81861114501953, "geo/layer_7/stable_rank_down_proj": 147.88211059570312, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.531074047088623, "geo/layer_7/attn_entropy_mean": 4.661843299865723, "geo/layer_7/attn_entropy_std": 0.8335606455802917, "geo/layer_14/stable_rank_q_proj": 56.22406768798828, "geo/layer_14/stable_rank_k_proj": 34.836936950683594, "geo/layer_14/stable_rank_o_proj": 54.446372985839844, "geo/layer_14/stable_rank_gate_proj": 82.93553924560547, "geo/layer_14/stable_rank_down_proj": 135.06285095214844, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37278565764427185, "geo/layer_14/attn_entropy_mean": 5.466201305389404, "geo/layer_14/attn_entropy_std": 0.4371935725212097, "geo/layer_21/stable_rank_q_proj": 46.14898681640625, "geo/layer_21/stable_rank_k_proj": 31.31874656677246, "geo/layer_21/stable_rank_o_proj": 81.1447982788086, "geo/layer_21/stable_rank_gate_proj": 82.22895050048828, "geo/layer_21/stable_rank_down_proj": 58.88984680175781, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15511804819107056, "geo/layer_21/attn_entropy_mean": 5.747821807861328, "geo/layer_21/attn_entropy_std": 0.2911593019962311, "geo/layer_27/stable_rank_q_proj": 41.451290130615234, "geo/layer_27/stable_rank_k_proj": 31.30770492553711, "geo/layer_27/stable_rank_o_proj": 119.6575698852539, "geo/layer_27/stable_rank_gate_proj": 90.10285186767578, "geo/layer_27/stable_rank_down_proj": 136.22691345214844, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08427444845438004, "geo/layer_27/attn_entropy_mean": 4.331905364990234, "geo/layer_27/attn_entropy_std": 0.5760497450828552, "attnres/final_alpha/block_0": 0.24124369025230408, "attnres/block_norm/0": 1.6593172550201416, "attnres/final_alpha/block_1": 0.006173156201839447, "attnres/block_norm/1": 33104.8203125, "attnres/final_alpha/block_2": 0.01288607157766819, "attnres/block_norm/2": 23081.03515625, "attnres/final_alpha/block_3": 0.014880681410431862, "attnres/block_norm/3": 35406.5625, "attnres/final_alpha/block_4": 0.019341817125678062, "attnres/block_norm/4": 10536.73046875, "attnres/final_alpha/block_5": 0.5747348666191101, "attnres/block_norm/5": 5362.5703125, "attnres/final_alpha/block_6": 0.1307397186756134, "attnres/block_norm/6": 23542.748046875, "geo/tier1_time_s": 1.3592684268951416, "geo/step": 25800.0, "geo/rankme_slope": -0.0001468718737494998} {"step": 25810, "timestamp": 1778222434.803262, "train/loss": 2.2368824005126955, "train/z_loss": 0.0014889619196765125, "train/perplexity": 9.36409224413241, "train/grad_norm": 0.2021484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1789092.258685767, "perf/iters_per_sec": 0.8531056683949313, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1721877336502076, "data/tokens_consumed": 54129590272, "data/tokens_consumed_B": 54.129590272, "train/loss_slope": 5.458540069495382e-06} {"step": 25820, "timestamp": 1778222445.1607902, "train/loss": 2.196982169151306, "train/z_loss": 0.001500186463817954, "train/perplexity": 8.997818590742192, "train/grad_norm": 0.1884765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026115.5195209535, "perf/iters_per_sec": 0.9661271665196197, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035060429573059, "data/tokens_consumed": 54150561792, "data/tokens_consumed_B": 54.150561792, "train/loss_slope": 5.630744952108278e-06} {"step": 25830, "timestamp": 1778222455.515915, "train/loss": 2.211368465423584, "train/z_loss": 0.0014853631029836834, "train/perplexity": 9.128199475120395, "train/grad_norm": 0.232421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026253.1122750503, "perf/iters_per_sec": 0.9661927758574725, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03499014377594, "data/tokens_consumed": 54171533312, "data/tokens_consumed_B": 54.171533312, "train/loss_slope": 6.0174005438606856e-06} {"step": 25840, "timestamp": 1778222465.870302, "train/loss": 2.1984189987182616, "train/z_loss": 0.0014871596358716488, "train/perplexity": 9.010756214886133, "train/grad_norm": 0.10107421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026141.4217493155, "perf/iters_per_sec": 0.966139517664583, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350471973419189, "data/tokens_consumed": 54192504832, "data/tokens_consumed_B": 54.192504832, "train/loss_slope": 7.699795031096892e-06} {"step": 25850, "timestamp": 1778222476.2172778, "grad/layer_0/attn": 0.003058710601180792, "grad/layer_0/mlp": 0.0027523627504706383, "grad/layer_0/attn_mlp_ratio": 1.1113035480252067, "grad/layer_4/attn": 0.0023751903790980577, "grad/layer_4/mlp": 0.002555408515036106, "grad/layer_4/attn_mlp_ratio": 0.9294757656847369, "grad/layer_8/attn": 0.0046552624553442, "grad/layer_8/mlp": 0.0038466989062726498, "grad/layer_8/attn_mlp_ratio": 1.2101967031350933, "grad/layer_12/attn": 0.0038772253319621086, "grad/layer_12/mlp": 0.00624444056302309, "grad/layer_12/attn_mlp_ratio": 0.6209083473114543, "grad/layer_16/attn": 0.0038693828973919153, "grad/layer_16/mlp": 0.004467605613172054, "grad/layer_16/attn_mlp_ratio": 0.8660976697168307, "grad/layer_20/attn": 0.003964691888540983, "grad/layer_20/mlp": 0.006062133703380823, "grad/layer_20/attn_mlp_ratio": 0.6540093005419797, "grad/layer_24/attn": 0.01679142192006111, "grad/layer_24/mlp": 0.01039180252701044, "grad/layer_24/attn_mlp_ratio": 1.6158334143507247, "grad/layer_27/attn": 0.009629787877202034, "grad/layer_27/mlp": 0.009062991477549076, "grad/layer_27/attn_mlp_ratio": 1.0625396476211046} {"step": 25850, "timestamp": 1778222476.233261, "train/loss": 2.220616340637207, "train/z_loss": 0.0014847116777673363, "train/perplexity": 9.213007467235451, "train/grad_norm": 0.220703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024677.2824238418, "perf/iters_per_sec": 0.9654413616294106, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0357956886291504, "data/tokens_consumed": 54213476352, "data/tokens_consumed_B": 54.213476352, "train/loss_slope": 9.018163719657947e-06} {"step": 25860, "timestamp": 1778222486.594691, "train/loss": 2.231483554840088, "train/z_loss": 0.0014938575099222363, "train/perplexity": 9.313673180081055, "train/grad_norm": 0.169921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025075.217696852, "perf/iters_per_sec": 0.9656311119541416, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0355921506881713, "data/tokens_consumed": 54234447872, "data/tokens_consumed_B": 54.234447872, "train/loss_slope": 1.0395209488123356e-05} {"step": 25870, "timestamp": 1778222496.9518898, "train/loss": 2.1878727436065675, "train/z_loss": 0.001500697433948517, "train/perplexity": 8.916225828047422, "train/grad_norm": 0.271484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025717.408595017, "perf/iters_per_sec": 0.9659373324370465, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352638483047485, "data/tokens_consumed": 54255419392, "data/tokens_consumed_B": 54.255419392, "train/loss_slope": 7.978691476764125e-06} {"step": 25875, "timestamp": 1778222502.7340875, "eos/sharpness": 32.29377269744872, "eos/L0_probe": 2.027322769165039, "eos/L_plus": 2.2113070487976074, "eos/L_minus": 2.166276216506958, "eos/grad_norm": 0.1424773782491684, "eos/embed_grad_frac": 0.1532057821750641, "eos/time_s": 0.6160669326782227} {"step": 25875, "timestamp": 1778222504.111473, "geo/rankme_last": 440.1101989746094, "geo/layer_0/stable_rank_q_proj": 18.106874465942383, "geo/layer_0/stable_rank_k_proj": 15.908186912536621, "geo/layer_0/stable_rank_o_proj": 51.034332275390625, "geo/layer_0/stable_rank_gate_proj": 146.44583129882812, "geo/layer_0/stable_rank_down_proj": 50.93523406982422, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05023685097694397, "geo/layer_0/attn_entropy_mean": 6.237579345703125, "geo/layer_0/attn_entropy_std": 0.3327499330043793, "geo/layer_7/stable_rank_q_proj": 42.665748596191406, "geo/layer_7/stable_rank_k_proj": 41.865867614746094, "geo/layer_7/stable_rank_o_proj": 107.04053497314453, "geo/layer_7/stable_rank_gate_proj": 98.89850616455078, "geo/layer_7/stable_rank_down_proj": 147.7750701904297, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.546049177646637, "geo/layer_7/attn_entropy_mean": 4.633940696716309, "geo/layer_7/attn_entropy_std": 0.834740936756134, "geo/layer_14/stable_rank_q_proj": 56.23989486694336, "geo/layer_14/stable_rank_k_proj": 34.79530334472656, "geo/layer_14/stable_rank_o_proj": 54.36310577392578, "geo/layer_14/stable_rank_gate_proj": 83.05155944824219, "geo/layer_14/stable_rank_down_proj": 135.1147918701172, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.36708715558052063, "geo/layer_14/attn_entropy_mean": 5.475868225097656, "geo/layer_14/attn_entropy_std": 0.44019022583961487, "geo/layer_21/stable_rank_q_proj": 46.148094177246094, "geo/layer_21/stable_rank_k_proj": 31.38558578491211, "geo/layer_21/stable_rank_o_proj": 81.25141143798828, "geo/layer_21/stable_rank_gate_proj": 82.19573974609375, "geo/layer_21/stable_rank_down_proj": 58.85929489135742, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14995920658111572, "geo/layer_21/attn_entropy_mean": 5.736995220184326, "geo/layer_21/attn_entropy_std": 0.29163074493408203, "geo/layer_27/stable_rank_q_proj": 41.39018249511719, "geo/layer_27/stable_rank_k_proj": 31.232954025268555, "geo/layer_27/stable_rank_o_proj": 119.89513397216797, "geo/layer_27/stable_rank_gate_proj": 90.04744720458984, "geo/layer_27/stable_rank_down_proj": 136.40782165527344, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07422354072332382, "geo/layer_27/attn_entropy_mean": 4.341081619262695, "geo/layer_27/attn_entropy_std": 0.5742135047912598, "attnres/final_alpha/block_0": 0.24008172750473022, "attnres/block_norm/0": 1.6598163843154907, "attnres/final_alpha/block_1": 0.006194875575602055, "attnres/block_norm/1": 33112.8359375, "attnres/final_alpha/block_2": 0.012729496695101261, "attnres/block_norm/2": 23135.50390625, "attnres/final_alpha/block_3": 0.014998426660895348, "attnres/block_norm/3": 35385.29296875, "attnres/final_alpha/block_4": 0.018859179690480232, "attnres/block_norm/4": 10582.708984375, "attnres/final_alpha/block_5": 0.579364538192749, "attnres/block_norm/5": 5380.4794921875, "attnres/final_alpha/block_6": 0.12777170538902283, "attnres/block_norm/6": 23637.46484375, "geo/tier1_time_s": 1.3574635982513428, "geo/step": 25875.0, "geo/rankme_slope": -0.00015549041882377952} {"step": 25880, "timestamp": 1778222509.291785, "train/loss": 2.1848256587982178, "train/z_loss": 0.001513441267888993, "train/perplexity": 8.889098682125086, "train/grad_norm": 0.1953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1700231.2901311761, "perf/iters_per_sec": 0.810733456674183, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.233451008796692, "data/tokens_consumed": 54276390912, "data/tokens_consumed_B": 54.276390912, "train/loss_slope": 4.786865002322861e-06} {"step": 25890, "timestamp": 1778222519.660813, "train/loss": 2.2380059003829955, "train/z_loss": 0.0015126827755011618, "train/perplexity": 9.374618712689946, "train/grad_norm": 0.0947265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023468.1689481435, "perf/iters_per_sec": 0.9648648113957136, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036414623260498, "data/tokens_consumed": 54297362432, "data/tokens_consumed_B": 54.297362432, "train/loss_slope": 7.330454665549432e-06} {"step": 25900, "timestamp": 1778222530.0080457, "grad/layer_0/attn": 0.0028216654900461435, "grad/layer_0/mlp": 0.0026250253431499004, "grad/layer_0/attn_mlp_ratio": 1.074909767983157, "grad/layer_4/attn": 0.0023529238533228636, "grad/layer_4/mlp": 0.0025427015498280525, "grad/layer_4/attn_mlp_ratio": 0.9253637183434311, "grad/layer_8/attn": 0.007452310062944889, "grad/layer_8/mlp": 0.003778360551223159, "grad/layer_8/attn_mlp_ratio": 1.972365994371758, "grad/layer_12/attn": 0.004431312903761864, "grad/layer_12/mlp": 0.006320268847048283, "grad/layer_12/attn_mlp_ratio": 0.7011272686159018, "grad/layer_16/attn": 0.003564210841432214, "grad/layer_16/mlp": 0.0044304677285254, "grad/layer_16/attn_mlp_ratio": 0.8044773101576715, "grad/layer_20/attn": 0.0035871563013643026, "grad/layer_20/mlp": 0.005903841461986303, "grad/layer_20/attn_mlp_ratio": 0.6075969796447978, "grad/layer_24/attn": 0.014723381958901882, "grad/layer_24/mlp": 0.009424952790141106, "grad/layer_24/attn_mlp_ratio": 1.5621703504007065, "grad/layer_27/attn": 0.01257292926311493, "grad/layer_27/mlp": 0.007500660605728626, "grad/layer_27/attn_mlp_ratio": 1.6762429013103286} {"step": 25900, "timestamp": 1778222530.023726, "train/loss": 2.2231576204299928, "train/z_loss": 0.0014924776274710893, "train/perplexity": 9.236450071434664, "train/grad_norm": 0.126953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024728.5945643333, "perf/iters_per_sec": 0.965465829164664, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0357694387435914, "data/tokens_consumed": 54318333952, "data/tokens_consumed_B": 54.318333952, "train/loss_slope": 1.1189822688533373e-05} {"step": 25910, "timestamp": 1778222540.382297, "train/loss": 2.1755632162094116, "train/z_loss": 0.0015140316914767026, "train/perplexity": 8.807144051870369, "train/grad_norm": 0.1142578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025940.9417761879, "perf/iters_per_sec": 0.9660439213639201, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035149621963501, "data/tokens_consumed": 54339305472, "data/tokens_consumed_B": 54.339305472, "train/loss_slope": 9.022890616564885e-06} {"step": 25920, "timestamp": 1778222550.7346392, "train/loss": 2.2253690481185915, "train/z_loss": 0.0015047009801492095, "train/perplexity": 9.256898414549669, "train/grad_norm": 0.1259765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026784.6627683502, "perf/iters_per_sec": 0.9664462388841392, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347187042236328, "data/tokens_consumed": 54360276992, "data/tokens_consumed_B": 54.360276992, "train/loss_slope": 1.0705156373505577e-05} {"step": 25930, "timestamp": 1778222561.094465, "train/loss": 2.176254677772522, "train/z_loss": 0.0015123208635486663, "train/perplexity": 8.813235959380235, "train/grad_norm": 0.1474609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025349.1133764214, "perf/iters_per_sec": 0.9657617155916316, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354521036148072, "data/tokens_consumed": 54381248512, "data/tokens_consumed_B": 54.381248512, "train/loss_slope": 9.808014485702271e-06} {"step": 25940, "timestamp": 1778222571.44537, "train/loss": 2.2479968070983887, "train/z_loss": 0.001491620682645589, "train/perplexity": 9.468749094680982, "train/grad_norm": 0.11279296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027171.8873337621, "perf/iters_per_sec": 0.9666308819454966, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345210552215576, "data/tokens_consumed": 54402220032, "data/tokens_consumed_B": 54.402220032, "train/loss_slope": 1.6692348324855547e-05} {"step": 25950, "timestamp": 1778222581.78586, "grad/layer_0/attn": 0.003686151932924986, "grad/layer_0/mlp": 0.003012764500454068, "grad/layer_0/attn_mlp_ratio": 1.223511432778196, "grad/layer_4/attn": 0.0017459457740187645, "grad/layer_4/mlp": 0.002561175264418125, "grad/layer_4/attn_mlp_ratio": 0.6816970826265256, "grad/layer_8/attn": 0.0044046188704669476, "grad/layer_8/mlp": 0.0038470756262540817, "grad/layer_8/attn_mlp_ratio": 1.1449264802374317, "grad/layer_12/attn": 0.004333182238042355, "grad/layer_12/mlp": 0.007216856349259615, "grad/layer_12/attn_mlp_ratio": 0.60042516690032, "grad/layer_16/attn": 0.004497292451560497, "grad/layer_16/mlp": 0.004453767091035843, "grad/layer_16/attn_mlp_ratio": 1.0097726842598012, "grad/layer_20/attn": 0.007373313885182142, "grad/layer_20/mlp": 0.0061563607305288315, "grad/layer_20/attn_mlp_ratio": 1.1976740948351423, "grad/layer_24/attn": 0.01332335826009512, "grad/layer_24/mlp": 0.012872098945081234, "grad/layer_24/attn_mlp_ratio": 1.0350571583883457, "grad/layer_27/attn": 0.006418691482394934, "grad/layer_27/mlp": 0.010427637957036495, "grad/layer_27/attn_mlp_ratio": 0.615546056286797} {"step": 25950, "timestamp": 1778222582.4037693, "eos/sharpness": 16.475081443786618, "eos/L0_probe": 2.030177354812622, "eos/L_plus": 2.1192009449005127, "eos/L_minus": 2.1059045791625977, "eos/grad_norm": 0.13399085402488708, "eos/embed_grad_frac": 0.18661247193813324, "eos/time_s": 0.6149537563323975} {"step": 25950, "timestamp": 1778222582.4249318, "train/loss": 2.2285879850387573, "train/z_loss": 0.0014958580257371068, "train/perplexity": 9.286743796052319, "train/grad_norm": 0.1337890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1910854.3355911777, "perf/iters_per_sec": 0.9111663511234177, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0974944353103637, "data/tokens_consumed": 54423191552, "data/tokens_consumed_B": 54.423191552, "train/loss_slope": 1.789862352057047e-05} {"step": 25950, "timestamp": 1778222583.805161, "geo/rankme_last": 440.4593505859375, "geo/layer_0/stable_rank_q_proj": 18.113969802856445, "geo/layer_0/stable_rank_k_proj": 15.915196418762207, "geo/layer_0/stable_rank_o_proj": 51.14485168457031, "geo/layer_0/stable_rank_gate_proj": 146.46585083007812, "geo/layer_0/stable_rank_down_proj": 50.94322204589844, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05304266884922981, "geo/layer_0/attn_entropy_mean": 6.242794036865234, "geo/layer_0/attn_entropy_std": 0.33455148339271545, "geo/layer_7/stable_rank_q_proj": 42.651424407958984, "geo/layer_7/stable_rank_k_proj": 41.9721565246582, "geo/layer_7/stable_rank_o_proj": 107.2900161743164, "geo/layer_7/stable_rank_gate_proj": 98.82332611083984, "geo/layer_7/stable_rank_down_proj": 147.5701141357422, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5448086261749268, "geo/layer_7/attn_entropy_mean": 4.628750801086426, "geo/layer_7/attn_entropy_std": 0.8427187204360962, "geo/layer_14/stable_rank_q_proj": 56.09810256958008, "geo/layer_14/stable_rank_k_proj": 34.8731575012207, "geo/layer_14/stable_rank_o_proj": 54.273582458496094, "geo/layer_14/stable_rank_gate_proj": 82.85769653320312, "geo/layer_14/stable_rank_down_proj": 135.10140991210938, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3609558641910553, "geo/layer_14/attn_entropy_mean": 5.465517044067383, "geo/layer_14/attn_entropy_std": 0.43553605675697327, "geo/layer_21/stable_rank_q_proj": 46.1712760925293, "geo/layer_21/stable_rank_k_proj": 31.392127990722656, "geo/layer_21/stable_rank_o_proj": 81.14571380615234, "geo/layer_21/stable_rank_gate_proj": 82.24503326416016, "geo/layer_21/stable_rank_down_proj": 58.818572998046875, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14796695113182068, "geo/layer_21/attn_entropy_mean": 5.727434158325195, "geo/layer_21/attn_entropy_std": 0.2861405313014984, "geo/layer_27/stable_rank_q_proj": 41.37244415283203, "geo/layer_27/stable_rank_k_proj": 31.34107780456543, "geo/layer_27/stable_rank_o_proj": 119.99263763427734, "geo/layer_27/stable_rank_gate_proj": 90.08743286132812, "geo/layer_27/stable_rank_down_proj": 136.04959106445312, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08220039308071136, "geo/layer_27/attn_entropy_mean": 4.36214542388916, "geo/layer_27/attn_entropy_std": 0.619135320186615, "attnres/final_alpha/block_0": 0.24068789184093475, "attnres/block_norm/0": 1.6603378057479858, "attnres/final_alpha/block_1": 0.0062945447862148285, "attnres/block_norm/1": 33027.921875, "attnres/final_alpha/block_2": 0.012791378423571587, "attnres/block_norm/2": 23152.84375, "attnres/final_alpha/block_3": 0.014919744804501534, "attnres/block_norm/3": 35315.5078125, "attnres/final_alpha/block_4": 0.018903132528066635, "attnres/block_norm/4": 10559.7392578125, "attnres/final_alpha/block_5": 0.5782617330551147, "attnres/block_norm/5": 5370.345703125, "attnres/final_alpha/block_6": 0.12814156711101532, "attnres/block_norm/6": 23720.595703125, "geo/tier1_time_s": 1.3758611679077148, "geo/step": 25950.0, "geo/rankme_slope": -0.0001433425518644958} {"step": 25960, "timestamp": 1778222594.1783023, "train/loss": 2.2138797283172607, "train/z_loss": 0.0015016519813798369, "train/perplexity": 9.15115159108474, "train/grad_norm": 0.11083984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1784945.8862744998, "perf/iters_per_sec": 0.8511285239574908, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1749106884002685, "data/tokens_consumed": 54444163072, "data/tokens_consumed_B": 54.444163072, "train/loss_slope": 1.8154457886584584e-05} {"step": 25970, "timestamp": 1778222604.5366783, "train/loss": 2.235442805290222, "train/z_loss": 0.0015028864610940218, "train/perplexity": 9.350621440253855, "train/grad_norm": 0.11279296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025685.5926036083, "perf/iters_per_sec": 0.9659221613901178, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352801084518433, "data/tokens_consumed": 54465134592, "data/tokens_consumed_B": 54.465134592, "train/loss_slope": 2.2201791054273287e-05} {"step": 25980, "timestamp": 1778222614.9045417, "train/loss": 2.202907371520996, "train/z_loss": 0.00150227346457541, "train/perplexity": 9.051290747009896, "train/grad_norm": 0.09521484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023925.8878685588, "perf/iters_per_sec": 0.9650830687849802, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0361802339553834, "data/tokens_consumed": 54486106112, "data/tokens_consumed_B": 54.486106112, "train/loss_slope": 1.9219324562308527e-05} {"step": 25990, "timestamp": 1778222625.2400565, "train/loss": 2.223013663291931, "train/z_loss": 0.001486890623345971, "train/perplexity": 9.235120514218453, "train/grad_norm": 0.21875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2030459.8587935825, "perf/iters_per_sec": 0.9681987089126504, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0328458309173585, "data/tokens_consumed": 54507077632, "data/tokens_consumed_B": 54.507077632, "train/loss_slope": 2.0288202035115915e-05} {"step": 26000, "timestamp": 1778222635.5635786, "grad/layer_0/attn": 0.00305430730804801, "grad/layer_0/mlp": 0.00316276540979743, "grad/layer_0/attn_mlp_ratio": 0.9657077954677814, "grad/layer_4/attn": 0.0017767325043678284, "grad/layer_4/mlp": 0.002658247947692871, "grad/layer_4/attn_mlp_ratio": 0.6683847678962384, "grad/layer_8/attn": 0.005830514244735241, "grad/layer_8/mlp": 0.003897011047229171, "grad/layer_8/attn_mlp_ratio": 1.4961502609200432, "grad/layer_12/attn": 0.005027213133871555, "grad/layer_12/mlp": 0.006596503779292107, "grad/layer_12/attn_mlp_ratio": 0.7621026570837162, "grad/layer_16/attn": 0.0042092762887477875, "grad/layer_16/mlp": 0.004564214497804642, "grad/layer_16/attn_mlp_ratio": 0.9222345265650793, "grad/layer_20/attn": 0.006271359510719776, "grad/layer_20/mlp": 0.007236894685775042, "grad/layer_20/attn_mlp_ratio": 0.8665815513922992, "grad/layer_24/attn": 0.016983114182949066, "grad/layer_24/mlp": 0.012281364761292934, "grad/layer_24/attn_mlp_ratio": 1.3828360589200142, "grad/layer_27/attn": 0.006183754652738571, "grad/layer_27/mlp": 0.010983414947986603, "grad/layer_27/attn_mlp_ratio": 0.5630083745102695} {"step": 26000, "timestamp": 1778222635.5780869, "train/loss": 2.1890084028244017, "train/z_loss": 0.0014933342230506242, "train/perplexity": 8.926357374000204, "train/grad_norm": 0.2275390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029564.2787100358, "perf/iters_per_sec": 0.9677716630506686, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033301591873169, "data/tokens_consumed": 54528049152, "data/tokens_consumed_B": 54.528049152, "train/loss_slope": 1.797959800958275e-05} {"step": 26000, "timestamp": 1778222642.6323123, "geo/ww_alpha_mean": 7.722029328373927, "geo/ww_alpha_std": 4.157295948695384, "geo/ww_alpha_min": 1.3480958348159193, "geo/ww_alpha_max": 28.183631033488652, "geo/ww_alpha_healthy_frac": 0.15228426395939088, "geo/ww_alpha_by_type/q_proj": 4.172470047307462, "geo/ww_alpha_by_type/k_proj": 4.652302152824128, "geo/ww_alpha_by_type/v_proj": 7.962048718299022, "geo/ww_alpha_by_type/o_proj": 7.580581265315587, "geo/ww_alpha_by_type/gate_proj": 8.595450178965788, "geo/ww_alpha_by_type/up_proj": 11.814985892273183, "geo/ww_alpha_by_type/down_proj": 9.389955778148812, "geo/twonn_id/layer_0": 0.7160465121269226, "geo/twonn_id/layer_7": 3.143099069595337, "geo/twonn_id/layer_14": 4.1062750816345215, "geo/twonn_id/layer_21": 6.455090045928955, "geo/twonn_id/layer_27": 6.454526424407959, "geo/tier2_time_s": 7.045276641845703} {"step": 26000, "timestamp": 1778222643.289106, "eoc/jacobian_sigma/layer_0/attn": 918.5782470703125, "eoc/jacobian_sigma/layer_0/mlp": 6025.5986328125, "eoc/jacobian_sigma/layer_0": 6025.5986328125, "eoc/jacobian_sigma/layer_7/attn": 1.1545521020889282, "eoc/jacobian_sigma/layer_7/mlp": 1.6370333433151245, "eoc/jacobian_sigma/layer_7": 1.6370333433151245, "eoc/jacobian_sigma/layer_14/attn": 1.6042753458023071, "eoc/jacobian_sigma/layer_14/mlp": 7.254449367523193, "eoc/jacobian_sigma/layer_14": 7.254449367523193, "eoc/jacobian_sigma/layer_21/attn": 1.0861519575119019, "eoc/jacobian_sigma/layer_21/mlp": 4.003536701202393, "eoc/jacobian_sigma/layer_21": 4.003536701202393, "eoc/jacobian_sigma/layer_27/attn": 3.3956432342529297, "eoc/jacobian_sigma/layer_27/mlp": 26.605369567871094, "eoc/jacobian_sigma/layer_27": 26.605369567871094, "eoc/layer0_sigma": 6025.5986328125, "eoc/sigma_max": 26.605369567871094, "eoc/sigma_min": 1.6370333433151245, "eoc/sigma_mean": 9.875097244977951, "eoc/time_s": 0.6508424282073975} {"step": 26010, "timestamp": 1778222653.6499233, "train/loss": 2.2177529096603394, "train/z_loss": 0.0014933781581930815, "train/perplexity": 9.186664390054696, "train/grad_norm": 0.125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1160943.9031672156, "perf/iters_per_sec": 0.553581191619499, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.8064197540283202, "data/tokens_consumed": 54549020672, "data/tokens_consumed_B": 54.549020672, "train/loss_slope": 1.59558669175252e-05} {"step": 26020, "timestamp": 1778222663.9962234, "train/loss": 2.1945227146148683, "train/z_loss": 0.0015060158213600517, "train/perplexity": 8.975716056221907, "train/grad_norm": 0.17578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028103.5136701632, "perf/iters_per_sec": 0.9670751160002533, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340458393096923, "data/tokens_consumed": 54569992192, "data/tokens_consumed_B": 54.569992192, "train/loss_slope": 1.671013111042398e-05} {"step": 26025, "timestamp": 1778222669.7753448, "eos/sharpness": 23.186659812927243, "eos/L0_probe": 2.027127504348755, "eos/L_plus": 2.1396329402923584, "eos/L_minus": 2.146488666534424, "eos/grad_norm": 0.10940065234899521, "eos/embed_grad_frac": 0.19345270097255707, "eos/time_s": 0.6200487613677979} {"step": 26025, "timestamp": 1778222671.1555946, "geo/rankme_last": 441.2903747558594, "geo/layer_0/stable_rank_q_proj": 18.096599578857422, "geo/layer_0/stable_rank_k_proj": 15.920843124389648, "geo/layer_0/stable_rank_o_proj": 51.045799255371094, "geo/layer_0/stable_rank_gate_proj": 146.4247589111328, "geo/layer_0/stable_rank_down_proj": 50.95109558105469, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.04813355579972267, "geo/layer_0/attn_entropy_mean": 6.239534854888916, "geo/layer_0/attn_entropy_std": 0.33194395899772644, "geo/layer_7/stable_rank_q_proj": 42.51396179199219, "geo/layer_7/stable_rank_k_proj": 42.01463317871094, "geo/layer_7/stable_rank_o_proj": 107.29883575439453, "geo/layer_7/stable_rank_gate_proj": 98.77339935302734, "geo/layer_7/stable_rank_down_proj": 147.99571228027344, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.538374125957489, "geo/layer_7/attn_entropy_mean": 4.626832485198975, "geo/layer_7/attn_entropy_std": 0.8326941728591919, "geo/layer_14/stable_rank_q_proj": 56.168636322021484, "geo/layer_14/stable_rank_k_proj": 34.8978157043457, "geo/layer_14/stable_rank_o_proj": 54.43575668334961, "geo/layer_14/stable_rank_gate_proj": 82.65740203857422, "geo/layer_14/stable_rank_down_proj": 134.9810333251953, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3785887658596039, "geo/layer_14/attn_entropy_mean": 5.534617900848389, "geo/layer_14/attn_entropy_std": 0.42904818058013916, "geo/layer_21/stable_rank_q_proj": 46.175697326660156, "geo/layer_21/stable_rank_k_proj": 31.388296127319336, "geo/layer_21/stable_rank_o_proj": 81.01649475097656, "geo/layer_21/stable_rank_gate_proj": 82.17717742919922, "geo/layer_21/stable_rank_down_proj": 58.80604934692383, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15075844526290894, "geo/layer_21/attn_entropy_mean": 5.724658966064453, "geo/layer_21/attn_entropy_std": 0.2909507155418396, "geo/layer_27/stable_rank_q_proj": 41.30967712402344, "geo/layer_27/stable_rank_k_proj": 31.38040542602539, "geo/layer_27/stable_rank_o_proj": 119.96709442138672, "geo/layer_27/stable_rank_gate_proj": 90.00926208496094, "geo/layer_27/stable_rank_down_proj": 136.0143280029297, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0786510556936264, "geo/layer_27/attn_entropy_mean": 4.361061096191406, "geo/layer_27/attn_entropy_std": 0.5964824557304382, "attnres/final_alpha/block_0": 0.24160191416740417, "attnres/block_norm/0": 1.660728931427002, "attnres/final_alpha/block_1": 0.0062349094077944756, "attnres/block_norm/1": 33115.85546875, "attnres/final_alpha/block_2": 0.012962408363819122, "attnres/block_norm/2": 23110.517578125, "attnres/final_alpha/block_3": 0.014964194968342781, "attnres/block_norm/3": 35531.5625, "attnres/final_alpha/block_4": 0.01921803131699562, "attnres/block_norm/4": 10548.41015625, "attnres/final_alpha/block_5": 0.5760313272476196, "attnres/block_norm/5": 5392.49365234375, "attnres/final_alpha/block_6": 0.12898719310760498, "attnres/block_norm/6": 23629.041015625, "geo/tier1_time_s": 1.362196683883667, "geo/step": 26025.0, "geo/rankme_slope": -6.842678477641056e-05} {"step": 26030, "timestamp": 1778222676.3263252, "train/loss": 2.2283342838287354, "train/z_loss": 0.0015037445235066115, "train/perplexity": 9.284388036756225, "train/grad_norm": 0.1630859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1701515.9778723244, "perf/iters_per_sec": 0.8113460435258505, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2325197219848634, "data/tokens_consumed": 54590963712, "data/tokens_consumed_B": 54.590963712, "train/loss_slope": 1.9201465952049474e-05} {"step": 26040, "timestamp": 1778222686.6712427, "train/loss": 2.2253172636032104, "train/z_loss": 0.001500268978998065, "train/perplexity": 9.25641906296294, "train/grad_norm": 0.228515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028583.776650768, "perf/iters_per_sec": 0.967304123235115, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0338010311126709, "data/tokens_consumed": 54611935232, "data/tokens_consumed_B": 54.611935232, "train/loss_slope": 1.800821176563838e-05} {"step": 26050, "timestamp": 1778222696.9935331, "grad/layer_0/attn": 0.0032516366336494684, "grad/layer_0/mlp": 0.0028539698105305433, "grad/layer_0/attn_mlp_ratio": 1.1393380924065175, "grad/layer_4/attn": 0.0026014780160039663, "grad/layer_4/mlp": 0.00252146041020751, "grad/layer_4/attn_mlp_ratio": 1.0317345861545422, "grad/layer_8/attn": 0.004861264489591122, "grad/layer_8/mlp": 0.0039459154941141605, "grad/layer_8/attn_mlp_ratio": 1.2319737646801978, "grad/layer_12/attn": 0.004769749473780394, "grad/layer_12/mlp": 0.006167367100715637, "grad/layer_12/attn_mlp_ratio": 0.7733850310107903, "grad/layer_16/attn": 0.004189902916550636, "grad/layer_16/mlp": 0.004399522207677364, "grad/layer_16/attn_mlp_ratio": 0.9523540565390628, "grad/layer_20/attn": 0.008431804366409779, "grad/layer_20/mlp": 0.006045723333954811, "grad/layer_20/attn_mlp_ratio": 1.3946725248882437, "grad/layer_24/attn": 0.009603235870599747, "grad/layer_24/mlp": 0.008928441442549229, "grad/layer_24/attn_mlp_ratio": 1.07557806419348, "grad/layer_27/attn": 0.009113458916544914, "grad/layer_27/mlp": 0.007677544839680195, "grad/layer_27/attn_mlp_ratio": 1.1870277527707356} {"step": 26050, "timestamp": 1778222697.0079215, "train/loss": 2.250684404373169, "train/z_loss": 0.0015005035325884818, "train/perplexity": 9.494231506835488, "train/grad_norm": 0.123046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029915.979354062, "perf/iters_per_sec": 0.9679393669863042, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0331225633621215, "data/tokens_consumed": 54632906752, "data/tokens_consumed_B": 54.632906752, "train/loss_slope": 1.9809484009695526e-05} {"step": 26060, "timestamp": 1778222707.34385, "train/loss": 2.250462555885315, "train/z_loss": 0.0014873960753902793, "train/perplexity": 9.492125459552701, "train/grad_norm": 0.2236328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2030108.8599069833, "perf/iters_per_sec": 0.9680313396010319, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0330244064331056, "data/tokens_consumed": 54653878272, "data/tokens_consumed_B": 54.653878272, "train/loss_slope": 2.0983209818860896e-05} {"step": 26070, "timestamp": 1778222717.687346, "train/loss": 2.1951709508895876, "train/z_loss": 0.001495755254290998, "train/perplexity": 8.981536327212712, "train/grad_norm": 0.1845703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028924.092770858, "perf/iters_per_sec": 0.9674663986067095, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0336276292800903, "data/tokens_consumed": 54674849792, "data/tokens_consumed_B": 54.674849792, "train/loss_slope": 2.0204801299069122e-05} {"step": 26080, "timestamp": 1778222728.021401, "train/loss": 2.2246477365493775, "train/z_loss": 0.0014810560038313269, "train/perplexity": 9.25022371418691, "train/grad_norm": 0.1572265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2030441.20453104, "perf/iters_per_sec": 0.9681898138671112, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0328553199768067, "data/tokens_consumed": 54695821312, "data/tokens_consumed_B": 54.695821312, "train/loss_slope": 1.995510684453151e-05} {"step": 26090, "timestamp": 1778222738.3605306, "train/loss": 2.1294171094894407, "train/z_loss": 0.0015010791015811265, "train/perplexity": 8.409963294673652, "train/grad_norm": 0.1494140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029351.0882160156, "perf/iters_per_sec": 0.9676700059013441, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0334101438522338, "data/tokens_consumed": 54716792832, "data/tokens_consumed_B": 54.716792832, "train/loss_slope": 1.4107625719809217e-05} {"step": 26100, "timestamp": 1778222748.6873603, "grad/layer_0/attn": 0.00312153622508049, "grad/layer_0/mlp": 0.0026080745737999678, "grad/layer_0/attn_mlp_ratio": 1.1968737921650106, "grad/layer_4/attn": 0.0022166690323501825, "grad/layer_4/mlp": 0.002332038711756468, "grad/layer_4/attn_mlp_ratio": 0.9505283622104943, "grad/layer_8/attn": 0.00440113665536046, "grad/layer_8/mlp": 0.003717514919117093, "grad/layer_8/attn_mlp_ratio": 1.1838920980084504, "grad/layer_12/attn": 0.005286956671625376, "grad/layer_12/mlp": 0.006351079326122999, "grad/layer_12/attn_mlp_ratio": 0.8324500949994866, "grad/layer_16/attn": 0.003452213481068611, "grad/layer_16/mlp": 0.004262552596628666, "grad/layer_16/attn_mlp_ratio": 0.8098934433817165, "grad/layer_20/attn": 0.004729248117655516, "grad/layer_20/mlp": 0.006397294346243143, "grad/layer_20/attn_mlp_ratio": 0.7392575341647438, "grad/layer_24/attn": 0.01095933374017477, "grad/layer_24/mlp": 0.011699297465384007, "grad/layer_24/attn_mlp_ratio": 0.9367514313510028, "grad/layer_27/attn": 0.010482682846486568, "grad/layer_27/mlp": 0.011118423193693161, "grad/layer_27/attn_mlp_ratio": 0.9428209890545172} {"step": 26100, "timestamp": 1778222749.2857404, "eos/sharpness": 23.64726066589355, "eos/L0_probe": 2.029048204421997, "eos/L_plus": 2.15425968170166, "eos/L_minus": 2.1403093338012695, "eos/grad_norm": 0.13477261364459991, "eos/embed_grad_frac": 0.13705085217952728, "eos/time_s": 0.5955040454864502} {"step": 26100, "timestamp": 1778222749.3031616, "train/loss": 2.2406155824661256, "train/z_loss": 0.0014851165004074573, "train/perplexity": 9.399115437609899, "train/grad_norm": 0.134765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1917799.6133847567, "perf/iters_per_sec": 0.9144781176494392, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0935198783874511, "data/tokens_consumed": 54737764352, "data/tokens_consumed_B": 54.737764352, "train/loss_slope": 1.3515796576968341e-05} {"step": 26100, "timestamp": 1778222750.669352, "geo/rankme_last": 439.5200500488281, "geo/layer_0/stable_rank_q_proj": 18.100255966186523, "geo/layer_0/stable_rank_k_proj": 15.913397789001465, "geo/layer_0/stable_rank_o_proj": 51.048553466796875, "geo/layer_0/stable_rank_gate_proj": 146.6754913330078, "geo/layer_0/stable_rank_down_proj": 51.01462173461914, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.051333870738744736, "geo/layer_0/attn_entropy_mean": 6.239138603210449, "geo/layer_0/attn_entropy_std": 0.3343747854232788, "geo/layer_7/stable_rank_q_proj": 42.5284309387207, "geo/layer_7/stable_rank_k_proj": 42.171695709228516, "geo/layer_7/stable_rank_o_proj": 107.13566589355469, "geo/layer_7/stable_rank_gate_proj": 98.67052459716797, "geo/layer_7/stable_rank_down_proj": 147.74855041503906, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.55130535364151, "geo/layer_7/attn_entropy_mean": 4.645123481750488, "geo/layer_7/attn_entropy_std": 0.8250156044960022, "geo/layer_14/stable_rank_q_proj": 56.30439758300781, "geo/layer_14/stable_rank_k_proj": 34.91244125366211, "geo/layer_14/stable_rank_o_proj": 54.337650299072266, "geo/layer_14/stable_rank_gate_proj": 82.55584716796875, "geo/layer_14/stable_rank_down_proj": 135.0184783935547, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3593204617500305, "geo/layer_14/attn_entropy_mean": 5.460750579833984, "geo/layer_14/attn_entropy_std": 0.4488030672073364, "geo/layer_21/stable_rank_q_proj": 46.184879302978516, "geo/layer_21/stable_rank_k_proj": 31.311838150024414, "geo/layer_21/stable_rank_o_proj": 81.10941314697266, "geo/layer_21/stable_rank_gate_proj": 82.19916534423828, "geo/layer_21/stable_rank_down_proj": 58.745880126953125, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15304610133171082, "geo/layer_21/attn_entropy_mean": 5.722192764282227, "geo/layer_21/attn_entropy_std": 0.292458176612854, "geo/layer_27/stable_rank_q_proj": 41.31330871582031, "geo/layer_27/stable_rank_k_proj": 31.405977249145508, "geo/layer_27/stable_rank_o_proj": 119.80207824707031, "geo/layer_27/stable_rank_gate_proj": 89.94120025634766, "geo/layer_27/stable_rank_down_proj": 136.02391052246094, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08034561574459076, "geo/layer_27/attn_entropy_mean": 4.354807376861572, "geo/layer_27/attn_entropy_std": 0.5832706689834595, "attnres/final_alpha/block_0": 0.2430608570575714, "attnres/block_norm/0": 1.661153793334961, "attnres/final_alpha/block_1": 0.006353871431201696, "attnres/block_norm/1": 33198.9453125, "attnres/final_alpha/block_2": 0.013145066797733307, "attnres/block_norm/2": 23130.59765625, "attnres/final_alpha/block_3": 0.015037333592772484, "attnres/block_norm/3": 35411.71875, "attnres/final_alpha/block_4": 0.018829770386219025, "attnres/block_norm/4": 10599.94140625, "attnres/final_alpha/block_5": 0.5742696523666382, "attnres/block_norm/5": 5404.71875, "attnres/final_alpha/block_6": 0.12930342555046082, "attnres/block_norm/6": 23778.4453125, "geo/tier1_time_s": 1.3623275756835938, "geo/step": 26100.0, "geo/rankme_slope": -7.101905215211084e-05} {"step": 26110, "timestamp": 1778222761.009785, "train/loss": 2.181745433807373, "train/z_loss": 0.0014871480292640627, "train/perplexity": 8.861760383888027, "train/grad_norm": 0.244140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1791989.1034133222, "perf/iters_per_sec": 0.8544869916025745, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1702928304672242, "data/tokens_consumed": 54758735872, "data/tokens_consumed_B": 54.758735872, "train/loss_slope": 1.1440851028137942e-05} {"step": 26120, "timestamp": 1778222771.3680174, "train/loss": 2.1856086611747743, "train/z_loss": 0.0014997036545537413, "train/perplexity": 8.896061593150833, "train/grad_norm": 0.2890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025566.734661831, "perf/iters_per_sec": 0.9658654855069309, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0353408575057983, "data/tokens_consumed": 54779707392, "data/tokens_consumed_B": 54.779707392, "train/loss_slope": 1.2178188238707597e-05} {"step": 26130, "timestamp": 1778222781.7073462, "train/loss": 2.212988090515137, "train/z_loss": 0.0014909004559740425, "train/perplexity": 9.142995714977229, "train/grad_norm": 0.14453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029302.4441688186, "perf/iters_per_sec": 0.9676468106121152, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0334349155426026, "data/tokens_consumed": 54800678912, "data/tokens_consumed_B": 54.800678912, "train/loss_slope": 1.2586052404163929e-05} {"step": 26140, "timestamp": 1778222792.0608013, "train/loss": 2.2308660745620728, "train/z_loss": 0.0014845278928987682, "train/perplexity": 9.307923945778548, "train/grad_norm": 0.220703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026653.5817734897, "perf/iters_per_sec": 0.9663837345950554, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347856283187866, "data/tokens_consumed": 54821650432, "data/tokens_consumed_B": 54.821650432, "train/loss_slope": 1.148350544960122e-05} {"step": 26150, "timestamp": 1778222802.4252143, "grad/layer_0/attn": 0.0033627126831561327, "grad/layer_0/mlp": 0.0028660432435572147, "grad/layer_0/attn_mlp_ratio": 1.1732944272163284, "grad/layer_4/attn": 0.002433682791888714, "grad/layer_4/mlp": 0.0024071361403912306, "grad/layer_4/attn_mlp_ratio": 1.0110282712926828, "grad/layer_8/attn": 0.004359356593340635, "grad/layer_8/mlp": 0.0035697452258318663, "grad/layer_8/attn_mlp_ratio": 1.221195406236652, "grad/layer_12/attn": 0.006135254632681608, "grad/layer_12/mlp": 0.0067346845753490925, "grad/layer_12/attn_mlp_ratio": 0.9109935993200138, "grad/layer_16/attn": 0.0036560161970555782, "grad/layer_16/mlp": 0.004739555064588785, "grad/layer_16/attn_mlp_ratio": 0.7713838261386252, "grad/layer_20/attn": 0.004346984438598156, "grad/layer_20/mlp": 0.006871227640658617, "grad/layer_20/attn_mlp_ratio": 0.6326357679685191, "grad/layer_24/attn": 0.015783684328198433, "grad/layer_24/mlp": 0.01259599905461073, "grad/layer_24/attn_mlp_ratio": 1.2530712438497473, "grad/layer_27/attn": 0.006741097662597895, "grad/layer_27/mlp": 0.01103026419878006, "grad/layer_27/attn_mlp_ratio": 0.6111456153723765} {"step": 26150, "timestamp": 1778222802.4422877, "train/loss": 2.1787317037582397, "train/z_loss": 0.0015042036306113006, "train/perplexity": 8.835093633707597, "train/grad_norm": 0.185546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021171.5300017062, "perf/iters_per_sec": 0.9637696886070758, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0375922918319702, "data/tokens_consumed": 54842621952, "data/tokens_consumed_B": 54.842621952, "train/loss_slope": 8.234800013414282e-06} {"step": 26160, "timestamp": 1778222812.832739, "train/loss": 2.143137884140015, "train/z_loss": 0.00151570268208161, "train/perplexity": 8.526149767300577, "train/grad_norm": 0.1455078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019588.6135549103, "perf/iters_per_sec": 0.9630148952269126, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0384055376052856, "data/tokens_consumed": 54863593472, "data/tokens_consumed_B": 54.863593472, "train/loss_slope": 8.305799034741302e-06} {"step": 26170, "timestamp": 1778222823.211666, "train/loss": 2.227204966545105, "train/z_loss": 0.0014725228887982667, "train/perplexity": 9.273908935107404, "train/grad_norm": 0.12890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021642.5677227585, "perf/iters_per_sec": 0.9639942968953888, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373505353927612, "data/tokens_consumed": 54884564992, "data/tokens_consumed_B": 54.884564992, "train/loss_slope": 1.08942969869001e-05} {"step": 26175, "timestamp": 1778222829.018866, "eos/sharpness": 66.49124622344969, "eos/L0_probe": 2.027369499206543, "eos/L_plus": 2.260723352432251, "eos/L_minus": 2.458928108215332, "eos/grad_norm": 0.18042875826358795, "eos/embed_grad_frac": 0.0782582014799118, "eos/time_s": 0.6248905658721924} {"step": 26175, "timestamp": 1778222830.405604, "geo/rankme_last": 439.870849609375, "geo/layer_0/stable_rank_q_proj": 18.116722106933594, "geo/layer_0/stable_rank_k_proj": 15.9169921875, "geo/layer_0/stable_rank_o_proj": 51.076019287109375, "geo/layer_0/stable_rank_gate_proj": 146.69215393066406, "geo/layer_0/stable_rank_down_proj": 50.88743209838867, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.053898412734270096, "geo/layer_0/attn_entropy_mean": 6.242207050323486, "geo/layer_0/attn_entropy_std": 0.33829590678215027, "geo/layer_7/stable_rank_q_proj": 42.384578704833984, "geo/layer_7/stable_rank_k_proj": 42.17936706542969, "geo/layer_7/stable_rank_o_proj": 107.08294677734375, "geo/layer_7/stable_rank_gate_proj": 98.86011505126953, "geo/layer_7/stable_rank_down_proj": 147.83982849121094, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5519964098930359, "geo/layer_7/attn_entropy_mean": 4.629456520080566, "geo/layer_7/attn_entropy_std": 0.8518531918525696, "geo/layer_14/stable_rank_q_proj": 56.205711364746094, "geo/layer_14/stable_rank_k_proj": 34.94464111328125, "geo/layer_14/stable_rank_o_proj": 54.19827651977539, "geo/layer_14/stable_rank_gate_proj": 82.55182647705078, "geo/layer_14/stable_rank_down_proj": 135.00929260253906, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.36110013723373413, "geo/layer_14/attn_entropy_mean": 5.494692802429199, "geo/layer_14/attn_entropy_std": 0.4274706542491913, "geo/layer_21/stable_rank_q_proj": 46.07131576538086, "geo/layer_21/stable_rank_k_proj": 31.38882827758789, "geo/layer_21/stable_rank_o_proj": 80.89064025878906, "geo/layer_21/stable_rank_gate_proj": 81.9889144897461, "geo/layer_21/stable_rank_down_proj": 58.65409851074219, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1514228731393814, "geo/layer_21/attn_entropy_mean": 5.726563453674316, "geo/layer_21/attn_entropy_std": 0.300139456987381, "geo/layer_27/stable_rank_q_proj": 41.31981658935547, "geo/layer_27/stable_rank_k_proj": 31.367530822753906, "geo/layer_27/stable_rank_o_proj": 119.58616638183594, "geo/layer_27/stable_rank_gate_proj": 89.92803955078125, "geo/layer_27/stable_rank_down_proj": 135.74325561523438, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08478236198425293, "geo/layer_27/attn_entropy_mean": 4.354143142700195, "geo/layer_27/attn_entropy_std": 0.6097481846809387, "attnres/final_alpha/block_0": 0.24255666136741638, "attnres/block_norm/0": 1.6614632606506348, "attnres/final_alpha/block_1": 0.006318016443401575, "attnres/block_norm/1": 33250.875, "attnres/final_alpha/block_2": 0.013034224510192871, "attnres/block_norm/2": 23273.85546875, "attnres/final_alpha/block_3": 0.015002770349383354, "attnres/block_norm/3": 35501.34375, "attnres/final_alpha/block_4": 0.018967056646943092, "attnres/block_norm/4": 10634.41796875, "attnres/final_alpha/block_5": 0.5743928551673889, "attnres/block_norm/5": 5440.9873046875, "attnres/final_alpha/block_6": 0.12972837686538696, "attnres/block_norm/6": 23720.859375, "geo/tier1_time_s": 1.367112636566162, "geo/step": 26175.0, "geo/rankme_slope": -5.994948760754302e-05} {"step": 26180, "timestamp": 1778222835.6019874, "train/loss": 2.2005842447280886, "train/z_loss": 0.0014742640079930425, "train/perplexity": 9.030287856599747, "train/grad_norm": 0.08984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1693520.769381684, "perf/iters_per_sec": 0.807533631029932, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2383385181427002, "data/tokens_consumed": 54905536512, "data/tokens_consumed_B": 54.905536512, "train/loss_slope": 6.894524126771077e-06} {"step": 26190, "timestamp": 1778222845.994862, "train/loss": 2.2753387689590454, "train/z_loss": 0.0014752118499018253, "train/perplexity": 9.731215087829918, "train/grad_norm": 0.216796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2018840.3881888804, "perf/iters_per_sec": 0.9626581135696795, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0387903928756714, "data/tokens_consumed": 54926508032, "data/tokens_consumed_B": 54.926508032, "train/loss_slope": 1.0489902752425091e-05} {"step": 26200, "timestamp": 1778222856.364579, "grad/layer_0/attn": 0.002729554660618305, "grad/layer_0/mlp": 0.0026813126169145107, "grad/layer_0/attn_mlp_ratio": 1.0179919124686465, "grad/layer_4/attn": 0.0015682067023590207, "grad/layer_4/mlp": 0.0024903039447963238, "grad/layer_4/attn_mlp_ratio": 0.6297249950807837, "grad/layer_8/attn": 0.003747036447748542, "grad/layer_8/mlp": 0.003875270951539278, "grad/layer_8/attn_mlp_ratio": 0.9669095136610378, "grad/layer_12/attn": 0.0038402013015002012, "grad/layer_12/mlp": 0.006556535139679909, "grad/layer_12/attn_mlp_ratio": 0.5857058890280106, "grad/layer_16/attn": 0.007196103688329458, "grad/layer_16/mlp": 0.0043817320838570595, "grad/layer_16/attn_mlp_ratio": 1.6422965590733625, "grad/layer_20/attn": 0.004381002392619848, "grad/layer_20/mlp": 0.006174913607537746, "grad/layer_20/attn_mlp_ratio": 0.7094839863546495, "grad/layer_24/attn": 0.02140740491449833, "grad/layer_24/mlp": 0.012773863039910793, "grad/layer_24/attn_mlp_ratio": 1.6758755499432907, "grad/layer_27/attn": 0.007119883317500353, "grad/layer_27/mlp": 0.01099290233105421, "grad/layer_27/attn_mlp_ratio": 0.6476800246481913} {"step": 26200, "timestamp": 1778222856.3817687, "train/loss": 2.173084926605225, "train/z_loss": 0.0015047014108859002, "train/perplexity": 8.785344422379243, "train/grad_norm": 0.1884765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020045.0427176678, "perf/iters_per_sec": 0.9632325376118029, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0381709098815919, "data/tokens_consumed": 54947479552, "data/tokens_consumed_B": 54.947479552, "train/loss_slope": 1.1823550817167495e-05} {"step": 26210, "timestamp": 1778222867.251697, "train/loss": 2.2065812826156614, "train/z_loss": 0.0014943659072741867, "train/perplexity": 9.084605544836515, "train/grad_norm": 0.14453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1930150.7896162057, "perf/iters_per_sec": 0.9203676174241093, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.086522364616394, "data/tokens_consumed": 54968451072, "data/tokens_consumed_B": 54.968451072, "train/loss_slope": 1.159311164223039e-05} {"step": 26220, "timestamp": 1778222877.635623, "train/loss": 2.2047128200531008, "train/z_loss": 0.001498331967741251, "train/perplexity": 9.067647147479343, "train/grad_norm": 0.1796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020641.1553865345, "perf/iters_per_sec": 0.9635167862827942, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0378646373748779, "data/tokens_consumed": 54989422592, "data/tokens_consumed_B": 54.989422592, "train/loss_slope": 1.2615420851949285e-05} {"step": 26230, "timestamp": 1778222888.0153058, "train/loss": 2.200198173522949, "train/z_loss": 0.0014991457457654179, "train/perplexity": 9.026802195384203, "train/grad_norm": 0.140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021392.2028879037, "perf/iters_per_sec": 0.9638749136390227, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0374790191650392, "data/tokens_consumed": 55010394112, "data/tokens_consumed_B": 55.010394112, "train/loss_slope": 1.070634168509858e-05} {"step": 26240, "timestamp": 1778222898.3960297, "train/loss": 2.204452729225159, "train/z_loss": 0.0014950961340218782, "train/perplexity": 9.065289042299327, "train/grad_norm": 0.1142578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021238.0379942316, "perf/iters_per_sec": 0.9638014020892294, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0375581502914428, "data/tokens_consumed": 55031365632, "data/tokens_consumed_B": 55.031365632, "train/loss_slope": 8.527931385915893e-06} {"step": 26250, "timestamp": 1778222908.7594593, "grad/layer_0/attn": 0.0033247428946197033, "grad/layer_0/mlp": 0.0029646328184753656, "grad/layer_0/attn_mlp_ratio": 1.121468656001274, "grad/layer_4/attn": 0.0019080624915659428, "grad/layer_4/mlp": 0.002653407398611307, "grad/layer_4/attn_mlp_ratio": 0.719098929419831, "grad/layer_8/attn": 0.006318263243883848, "grad/layer_8/mlp": 0.0037876141723245382, "grad/layer_8/attn_mlp_ratio": 1.668137985974532, "grad/layer_12/attn": 0.004754427820444107, "grad/layer_12/mlp": 0.006084814667701721, "grad/layer_12/attn_mlp_ratio": 0.7813594993360643, "grad/layer_16/attn": 0.0036665992811322212, "grad/layer_16/mlp": 0.004292248748242855, "grad/layer_16/attn_mlp_ratio": 0.854237349876217, "grad/layer_20/attn": 0.004652719013392925, "grad/layer_20/mlp": 0.006582834292203188, "grad/layer_20/attn_mlp_ratio": 0.7067956956206705, "grad/layer_24/attn": 0.018433399498462677, "grad/layer_24/mlp": 0.014297716319561005, "grad/layer_24/attn_mlp_ratio": 1.2892547982867781, "grad/layer_27/attn": 0.00526896957308054, "grad/layer_27/mlp": 0.0135720269754529, "grad/layer_27/attn_mlp_ratio": 0.38822274254155315} {"step": 26250, "timestamp": 1778222909.3716233, "eos/sharpness": 37.59560585021972, "eos/L0_probe": 2.025454521179199, "eos/L_plus": 2.224158763885498, "eos/L_minus": 2.2027063369750977, "eos/grad_norm": 0.21044693887233734, "eos/embed_grad_frac": 0.05725320056080818, "eos/time_s": 0.6091728210449219} {"step": 26250, "timestamp": 1778222909.39125, "train/loss": 2.2004238605499267, "train/z_loss": 0.0015004442655481398, "train/perplexity": 9.028839657440521, "train/grad_norm": 0.2109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1908285.261810563, "perf/iters_per_sec": 0.9099413212826553, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.098971962928772, "data/tokens_consumed": 55052337152, "data/tokens_consumed_B": 55.052337152, "train/loss_slope": 6.186835679284058e-06} {"step": 26250, "timestamp": 1778222910.750693, "geo/rankme_last": 440.1834411621094, "geo/layer_0/stable_rank_q_proj": 18.153470993041992, "geo/layer_0/stable_rank_k_proj": 15.891803741455078, "geo/layer_0/stable_rank_o_proj": 51.27273941040039, "geo/layer_0/stable_rank_gate_proj": 146.17433166503906, "geo/layer_0/stable_rank_down_proj": 51.02696228027344, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.052139267325401306, "geo/layer_0/attn_entropy_mean": 6.238565444946289, "geo/layer_0/attn_entropy_std": 0.3352508842945099, "geo/layer_7/stable_rank_q_proj": 42.44514083862305, "geo/layer_7/stable_rank_k_proj": 42.16121292114258, "geo/layer_7/stable_rank_o_proj": 107.063720703125, "geo/layer_7/stable_rank_gate_proj": 98.85700225830078, "geo/layer_7/stable_rank_down_proj": 147.93597412109375, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5315604209899902, "geo/layer_7/attn_entropy_mean": 4.645943641662598, "geo/layer_7/attn_entropy_std": 0.8088961839675903, "geo/layer_14/stable_rank_q_proj": 56.15953826904297, "geo/layer_14/stable_rank_k_proj": 35.0174560546875, "geo/layer_14/stable_rank_o_proj": 54.22540283203125, "geo/layer_14/stable_rank_gate_proj": 82.31587219238281, "geo/layer_14/stable_rank_down_proj": 135.24609375, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.36369916796684265, "geo/layer_14/attn_entropy_mean": 5.510828971862793, "geo/layer_14/attn_entropy_std": 0.42700669169425964, "geo/layer_21/stable_rank_q_proj": 45.93476486206055, "geo/layer_21/stable_rank_k_proj": 31.33040428161621, "geo/layer_21/stable_rank_o_proj": 80.86085510253906, "geo/layer_21/stable_rank_gate_proj": 81.98282623291016, "geo/layer_21/stable_rank_down_proj": 58.695003509521484, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15432390570640564, "geo/layer_21/attn_entropy_mean": 5.731603622436523, "geo/layer_21/attn_entropy_std": 0.29175814986228943, "geo/layer_27/stable_rank_q_proj": 41.3204345703125, "geo/layer_27/stable_rank_k_proj": 31.429651260375977, "geo/layer_27/stable_rank_o_proj": 119.65135192871094, "geo/layer_27/stable_rank_gate_proj": 89.87511444091797, "geo/layer_27/stable_rank_down_proj": 135.4728546142578, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08906902372837067, "geo/layer_27/attn_entropy_mean": 4.350569248199463, "geo/layer_27/attn_entropy_std": 0.5919021368026733, "attnres/final_alpha/block_0": 0.24113120138645172, "attnres/block_norm/0": 1.6617100238800049, "attnres/final_alpha/block_1": 0.00618365965783596, "attnres/block_norm/1": 33270.9140625, "attnres/final_alpha/block_2": 0.01281535904854536, "attnres/block_norm/2": 23317.09375, "attnres/final_alpha/block_3": 0.014734980650246143, "attnres/block_norm/3": 35810.95703125, "attnres/final_alpha/block_4": 0.018821030855178833, "attnres/block_norm/4": 10617.4990234375, "attnres/final_alpha/block_5": 0.5795409083366394, "attnres/block_norm/5": 5410.8974609375, "attnres/final_alpha/block_6": 0.12677282094955444, "attnres/block_norm/6": 23827.21484375, "geo/tier1_time_s": 1.3556208610534668, "geo/step": 26250.0, "geo/rankme_slope": -7.042633459633853e-05} {"step": 26260, "timestamp": 1778222921.1279955, "train/loss": 2.1535012483596803, "train/z_loss": 0.0014944559196010232, "train/perplexity": 8.614968799798365, "train/grad_norm": 0.0888671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1787385.7301818184, "perf/iters_per_sec": 0.8522919321927158, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1733068943023681, "data/tokens_consumed": 55073308672, "data/tokens_consumed_B": 55.073308672, "train/loss_slope": 4.336001677732062e-06} {"step": 26270, "timestamp": 1778222931.5058892, "train/loss": 2.192685842514038, "train/z_loss": 0.0014908551354892551, "train/perplexity": 8.959243947023717, "train/grad_norm": 0.1884765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021686.8491231245, "perf/iters_per_sec": 0.9640154119125006, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037327814102173, "data/tokens_consumed": 55094280192, "data/tokens_consumed_B": 55.094280192, "train/loss_slope": 3.92684246232627e-06} {"step": 26280, "timestamp": 1778222941.885161, "train/loss": 2.2476540565490724, "train/z_loss": 0.0014862387091852725, "train/perplexity": 9.465504231848469, "train/grad_norm": 0.11181640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021592.6663497945, "perf/iters_per_sec": 0.9639705020665142, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373761415481568, "data/tokens_consumed": 55115251712, "data/tokens_consumed_B": 55.115251712, "train/loss_slope": 5.681597796639624e-06} {"step": 26290, "timestamp": 1778222952.2702284, "train/loss": 2.189880394935608, "train/z_loss": 0.0014874202315695584, "train/perplexity": 8.934144481867031, "train/grad_norm": 0.185546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020334.7951071206, "perf/iters_per_sec": 0.9633707023177722, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.038022017478943, "data/tokens_consumed": 55136223232, "data/tokens_consumed_B": 55.136223232, "train/loss_slope": 6.205435318521996e-06} {"step": 26300, "timestamp": 1778222962.6408806, "grad/layer_0/attn": 0.0027685982640832663, "grad/layer_0/mlp": 0.0026912586763501167, "grad/layer_0/attn_mlp_ratio": 1.0287372914164863, "grad/layer_4/attn": 0.002280701184645295, "grad/layer_4/mlp": 0.0024891828652471304, "grad/layer_4/attn_mlp_ratio": 0.9162448950066887, "grad/layer_8/attn": 0.006300370208919048, "grad/layer_8/mlp": 0.0038049526046961546, "grad/layer_8/attn_mlp_ratio": 1.6558340399719018, "grad/layer_12/attn": 0.004386344458907843, "grad/layer_12/mlp": 0.005961896385997534, "grad/layer_12/attn_mlp_ratio": 0.7357297244609787, "grad/layer_16/attn": 0.0032821521162986755, "grad/layer_16/mlp": 0.004461758304387331, "grad/layer_16/attn_mlp_ratio": 0.7356185205078056, "grad/layer_20/attn": 0.005051139276474714, "grad/layer_20/mlp": 0.006157082039862871, "grad/layer_20/attn_mlp_ratio": 0.8203787381318275, "grad/layer_24/attn": 0.019265122711658478, "grad/layer_24/mlp": 0.011338961310684681, "grad/layer_24/attn_mlp_ratio": 1.6990200437144967, "grad/layer_27/attn": 0.0072647579945623875, "grad/layer_27/mlp": 0.010437259450554848, "grad/layer_27/attn_mlp_ratio": 0.6960407527832525} {"step": 26300, "timestamp": 1778222962.6581666, "train/loss": 2.201235365867615, "train/z_loss": 0.001485709077678621, "train/perplexity": 9.036169582569444, "train/grad_norm": 0.1826171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019852.5854351753, "perf/iters_per_sec": 0.9631407668281438, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.038269829750061, "data/tokens_consumed": 55157194752, "data/tokens_consumed_B": 55.157194752, "train/loss_slope": 5.880689427833385e-06} {"step": 26310, "timestamp": 1778222973.0418706, "train/loss": 2.2034340858459474, "train/z_loss": 0.001500108977779746, "train/perplexity": 9.056059447266765, "train/grad_norm": 0.11083984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020693.005879773, "perf/iters_per_sec": 0.9635415105246415, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0378380060195922, "data/tokens_consumed": 55178166272, "data/tokens_consumed_B": 55.178166272, "train/loss_slope": 6.652319792545266e-06} {"step": 26320, "timestamp": 1778222983.4231575, "train/loss": 2.2064690589904785, "train/z_loss": 0.0014997290796600281, "train/perplexity": 9.083586094673182, "train/grad_norm": 0.126953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021475.4960871555, "perf/iters_per_sec": 0.9639146309314516, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037436270713806, "data/tokens_consumed": 55199137792, "data/tokens_consumed_B": 55.199137792, "train/loss_slope": 3.854256530370356e-06} {"step": 26325, "timestamp": 1778222989.2151062, "eos/sharpness": 34.89630222320556, "eos/L0_probe": 2.024587869644165, "eos/L_plus": 2.236161231994629, "eos/L_minus": 2.161977529525757, "eos/grad_norm": 0.11788471043109894, "eos/embed_grad_frac": 0.16755175590515137, "eos/time_s": 0.6108427047729492} {"step": 26325, "timestamp": 1778222990.594666, "geo/rankme_last": 440.0128173828125, "geo/layer_0/stable_rank_q_proj": 18.210494995117188, "geo/layer_0/stable_rank_k_proj": 15.88337516784668, "geo/layer_0/stable_rank_o_proj": 51.28678512573242, "geo/layer_0/stable_rank_gate_proj": 145.83380126953125, "geo/layer_0/stable_rank_down_proj": 51.092308044433594, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.050254594534635544, "geo/layer_0/attn_entropy_mean": 6.23855447769165, "geo/layer_0/attn_entropy_std": 0.3370125889778137, "geo/layer_7/stable_rank_q_proj": 42.51235580444336, "geo/layer_7/stable_rank_k_proj": 42.147090911865234, "geo/layer_7/stable_rank_o_proj": 107.30144500732422, "geo/layer_7/stable_rank_gate_proj": 98.99105072021484, "geo/layer_7/stable_rank_down_proj": 148.2327423095703, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5429995656013489, "geo/layer_7/attn_entropy_mean": 4.636885643005371, "geo/layer_7/attn_entropy_std": 0.8082057237625122, "geo/layer_14/stable_rank_q_proj": 56.048370361328125, "geo/layer_14/stable_rank_k_proj": 35.04802322387695, "geo/layer_14/stable_rank_o_proj": 54.195831298828125, "geo/layer_14/stable_rank_gate_proj": 82.32313537597656, "geo/layer_14/stable_rank_down_proj": 135.0975341796875, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.36882224678993225, "geo/layer_14/attn_entropy_mean": 5.492685794830322, "geo/layer_14/attn_entropy_std": 0.4324641525745392, "geo/layer_21/stable_rank_q_proj": 45.97803497314453, "geo/layer_21/stable_rank_k_proj": 31.28762435913086, "geo/layer_21/stable_rank_o_proj": 80.76195526123047, "geo/layer_21/stable_rank_gate_proj": 82.09992980957031, "geo/layer_21/stable_rank_down_proj": 58.704715728759766, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14899463951587677, "geo/layer_21/attn_entropy_mean": 5.731637477874756, "geo/layer_21/attn_entropy_std": 0.29102233052253723, "geo/layer_27/stable_rank_q_proj": 41.18518829345703, "geo/layer_27/stable_rank_k_proj": 31.28864860534668, "geo/layer_27/stable_rank_o_proj": 119.7669677734375, "geo/layer_27/stable_rank_gate_proj": 89.87904357910156, "geo/layer_27/stable_rank_down_proj": 135.54214477539062, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08193736523389816, "geo/layer_27/attn_entropy_mean": 4.339033126831055, "geo/layer_27/attn_entropy_std": 0.5695151090621948, "attnres/final_alpha/block_0": 0.23983582854270935, "attnres/block_norm/0": 1.6621955633163452, "attnres/final_alpha/block_1": 0.006217606365680695, "attnres/block_norm/1": 33242.4375, "attnres/final_alpha/block_2": 0.012644218280911446, "attnres/block_norm/2": 23232.7734375, "attnres/final_alpha/block_3": 0.014566982164978981, "attnres/block_norm/3": 36064.8828125, "attnres/final_alpha/block_4": 0.0185711607336998, "attnres/block_norm/4": 10598.59375, "attnres/final_alpha/block_5": 0.5824193954467773, "attnres/block_norm/5": 5336.501953125, "attnres/final_alpha/block_6": 0.12574473023414612, "attnres/block_norm/6": 23909.078125, "geo/tier1_time_s": 1.3598620891571045, "geo/step": 26325.0, "geo/rankme_slope": -6.69054731267507e-05} {"step": 26330, "timestamp": 1778222995.7893846, "train/loss": 2.2015954732894896, "train/z_loss": 0.0015010361559689045, "train/perplexity": 9.039424160265067, "train/grad_norm": 0.1396484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1696891.9334164087, "perf/iters_per_sec": 0.8091411273080867, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.235878348350525, "data/tokens_consumed": 55220109312, "data/tokens_consumed_B": 55.220109312, "train/loss_slope": 3.344139343190806e-06} {"step": 26340, "timestamp": 1778223006.163918, "train/loss": 2.157722282409668, "train/z_loss": 0.001507909770589322, "train/perplexity": 8.6514097315427, "train/grad_norm": 0.091796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022348.7921612787, "perf/iters_per_sec": 0.9643310509497064, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0369882822036742, "data/tokens_consumed": 55241080832, "data/tokens_consumed_B": 55.241080832, "train/loss_slope": -1.380371389323201e-06} {"step": 26350, "timestamp": 1778223016.530905, "grad/layer_0/attn": 0.002495118649676442, "grad/layer_0/mlp": 0.00258428486995399, "grad/layer_0/attn_mlp_ratio": 0.9654967152174648, "grad/layer_4/attn": 0.001679846434853971, "grad/layer_4/mlp": 0.002402709098532796, "grad/layer_4/attn_mlp_ratio": 0.6991467947431017, "grad/layer_8/attn": 0.00544979190453887, "grad/layer_8/mlp": 0.0035605852026492357, "grad/layer_8/attn_mlp_ratio": 1.5305887772114262, "grad/layer_12/attn": 0.004348989576101303, "grad/layer_12/mlp": 0.006019802764058113, "grad/layer_12/attn_mlp_ratio": 0.7224471754826753, "grad/layer_16/attn": 0.004512677900493145, "grad/layer_16/mlp": 0.004057350102812052, "grad/layer_16/attn_mlp_ratio": 1.1122229225777742, "grad/layer_20/attn": 0.013236958533525467, "grad/layer_20/mlp": 0.005334778223186731, "grad/layer_20/attn_mlp_ratio": 2.481257464062418, "grad/layer_24/attn": 0.00411309814080596, "grad/layer_24/mlp": 0.007358195260167122, "grad/layer_24/attn_mlp_ratio": 0.5589819160105233, "grad/layer_27/attn": 0.005230836570262909, "grad/layer_27/mlp": 0.006347746588289738, "grad/layer_27/attn_mlp_ratio": 0.8240462052325916} {"step": 26350, "timestamp": 1778223016.5477743, "train/loss": 2.1671447277069094, "train/z_loss": 0.0014914086204953492, "train/perplexity": 8.73331242233462, "train/grad_norm": 0.1064453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020615.9041850427, "perf/iters_per_sec": 0.9635047455716337, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037877607345581, "data/tokens_consumed": 55262052352, "data/tokens_consumed_B": 55.262052352, "train/loss_slope": -3.662279000555515e-06} {"step": 26360, "timestamp": 1778223026.926208, "train/loss": 2.2015639781951903, "train/z_loss": 0.0014920659945346416, "train/perplexity": 9.03913946723197, "train/grad_norm": 0.1650390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021974.0515632872, "perf/iters_per_sec": 0.9641523607078968, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0371804714202881, "data/tokens_consumed": 55283023872, "data/tokens_consumed_B": 55.283023872, "train/loss_slope": -5.718469111868598e-06} {"step": 26370, "timestamp": 1778223037.3026586, "train/loss": 2.229119157791138, "train/z_loss": 0.0014817481045611204, "train/perplexity": 9.291677971648923, "train/grad_norm": 0.10986328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022185.4623801387, "perf/iters_per_sec": 0.9642531692410177, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0370720386505128, "data/tokens_consumed": 55303995392, "data/tokens_consumed_B": 55.303995392, "train/loss_slope": -3.976570697936624e-06} {"step": 26380, "timestamp": 1778223048.1133933, "train/loss": 2.1668363332748415, "train/z_loss": 0.0015068783424794674, "train/perplexity": 8.730619532667497, "train/grad_norm": 0.125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1940856.835588195, "perf/iters_per_sec": 0.9254726579609847, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0805289506912232, "data/tokens_consumed": 55324966912, "data/tokens_consumed_B": 55.324966912, "train/loss_slope": -7.530370167773114e-06} {"step": 26390, "timestamp": 1778223059.0108204, "train/loss": 2.160790801048279, "train/z_loss": 0.0015130467829294503, "train/perplexity": 8.677997515247013, "train/grad_norm": 0.10400390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1925747.6729772622, "perf/iters_per_sec": 0.9182680477987586, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0890066385269166, "data/tokens_consumed": 55345938432, "data/tokens_consumed_B": 55.345938432, "train/loss_slope": -9.928398485695847e-06} {"step": 26400, "timestamp": 1778223069.380088, "grad/layer_0/attn": 0.0033796303905546665, "grad/layer_0/mlp": 0.0029504208359867334, "grad/layer_0/attn_mlp_ratio": 1.1454739726568504, "grad/layer_4/attn": 0.001636843546293676, "grad/layer_4/mlp": 0.0023744215723127127, "grad/layer_4/attn_mlp_ratio": 0.6893651474716251, "grad/layer_8/attn": 0.0046783750876784325, "grad/layer_8/mlp": 0.003620576811954379, "grad/layer_8/attn_mlp_ratio": 1.2921628794105788, "grad/layer_12/attn": 0.004396243952214718, "grad/layer_12/mlp": 0.006022335030138493, "grad/layer_12/attn_mlp_ratio": 0.7299899220509869, "grad/layer_16/attn": 0.004051032941788435, "grad/layer_16/mlp": 0.004728525411337614, "grad/layer_16/attn_mlp_ratio": 0.8567222344629947, "grad/layer_20/attn": 0.0049624270759522915, "grad/layer_20/mlp": 0.0061859614215791225, "grad/layer_20/attn_mlp_ratio": 0.8022078796709856, "grad/layer_24/attn": 0.0181194469332695, "grad/layer_24/mlp": 0.01098787784576416, "grad/layer_24/attn_mlp_ratio": 1.6490396983572764, "grad/layer_27/attn": 0.0043619475327432156, "grad/layer_27/mlp": 0.010758941993117332, "grad/layer_27/attn_mlp_ratio": 0.405425319235952} {"step": 26400, "timestamp": 1778223069.9957154, "eos/sharpness": 52.17278003692626, "eos/L0_probe": 2.0223748683929443, "eos/L_plus": 2.2492432594299316, "eos/L_minus": 2.3172342777252197, "eos/grad_norm": 0.1829504519701004, "eos/embed_grad_frac": 0.06795737147331238, "eos/time_s": 0.6126437187194824} {"step": 26400, "timestamp": 1778223070.0159237, "train/loss": 2.2209877252578734, "train/z_loss": 0.0014898460474796593, "train/perplexity": 9.216429671956634, "train/grad_norm": 0.1826171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1906592.566302811, "perf/iters_per_sec": 0.9091341811670356, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0999476432800293, "data/tokens_consumed": 55366909952, "data/tokens_consumed_B": 55.366909952, "train/loss_slope": -9.048016286633057e-06} {"step": 26400, "timestamp": 1778223071.3809848, "geo/rankme_last": 439.9115295410156, "geo/layer_0/stable_rank_q_proj": 18.197914123535156, "geo/layer_0/stable_rank_k_proj": 15.855384826660156, "geo/layer_0/stable_rank_o_proj": 51.222042083740234, "geo/layer_0/stable_rank_gate_proj": 146.18409729003906, "geo/layer_0/stable_rank_down_proj": 51.15795135498047, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.054840825498104095, "geo/layer_0/attn_entropy_mean": 6.237798690795898, "geo/layer_0/attn_entropy_std": 0.3370257616043091, "geo/layer_7/stable_rank_q_proj": 42.47609329223633, "geo/layer_7/stable_rank_k_proj": 42.31563186645508, "geo/layer_7/stable_rank_o_proj": 107.30044555664062, "geo/layer_7/stable_rank_gate_proj": 98.94519805908203, "geo/layer_7/stable_rank_down_proj": 148.41537475585938, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5413525700569153, "geo/layer_7/attn_entropy_mean": 4.653387069702148, "geo/layer_7/attn_entropy_std": 0.8276017308235168, "geo/layer_14/stable_rank_q_proj": 56.075199127197266, "geo/layer_14/stable_rank_k_proj": 34.97249221801758, "geo/layer_14/stable_rank_o_proj": 54.09055709838867, "geo/layer_14/stable_rank_gate_proj": 82.4391860961914, "geo/layer_14/stable_rank_down_proj": 135.2727508544922, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3569178879261017, "geo/layer_14/attn_entropy_mean": 5.512905120849609, "geo/layer_14/attn_entropy_std": 0.42040398716926575, "geo/layer_21/stable_rank_q_proj": 46.062870025634766, "geo/layer_21/stable_rank_k_proj": 31.228675842285156, "geo/layer_21/stable_rank_o_proj": 80.80374145507812, "geo/layer_21/stable_rank_gate_proj": 81.85796356201172, "geo/layer_21/stable_rank_down_proj": 58.81526184082031, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14810281991958618, "geo/layer_21/attn_entropy_mean": 5.73123025894165, "geo/layer_21/attn_entropy_std": 0.29392513632774353, "geo/layer_27/stable_rank_q_proj": 41.16169738769531, "geo/layer_27/stable_rank_k_proj": 31.286413192749023, "geo/layer_27/stable_rank_o_proj": 119.82286834716797, "geo/layer_27/stable_rank_gate_proj": 89.89236450195312, "geo/layer_27/stable_rank_down_proj": 136.0022735595703, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08149871230125427, "geo/layer_27/attn_entropy_mean": 4.334183692932129, "geo/layer_27/attn_entropy_std": 0.5893095135688782, "attnres/final_alpha/block_0": 0.2425766885280609, "attnres/block_norm/0": 1.6626737117767334, "attnres/final_alpha/block_1": 0.006314633414149284, "attnres/block_norm/1": 33379.9921875, "attnres/final_alpha/block_2": 0.012971512041985989, "attnres/block_norm/2": 23286.10546875, "attnres/final_alpha/block_3": 0.015101851895451546, "attnres/block_norm/3": 35999.875, "attnres/final_alpha/block_4": 0.019154824316501617, "attnres/block_norm/4": 10637.8916015625, "attnres/final_alpha/block_5": 0.5749351978302002, "attnres/block_norm/5": 5406.99951171875, "attnres/final_alpha/block_6": 0.12894529104232788, "attnres/block_norm/6": 24068.48828125, "geo/tier1_time_s": 1.3608977794647217, "geo/step": 26400.0, "geo/rankme_slope": -0.00010763258428371348} {"step": 26410, "timestamp": 1778223082.0952356, "train/loss": 2.1509183168411257, "train/z_loss": 0.0014940754859708249, "train/perplexity": 8.592745638161535, "train/grad_norm": 0.2060546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1736752.2208578931, "perf/iters_per_sec": 0.8281479934968439, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2075136423110961, "data/tokens_consumed": 55387881472, "data/tokens_consumed_B": 55.387881472, "train/loss_slope": -9.293166824979433e-06} {"step": 26420, "timestamp": 1778223093.0321934, "train/loss": 2.2071059942245483, "train/z_loss": 0.0014956516912207007, "train/perplexity": 9.089373593643932, "train/grad_norm": 0.140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1919018.791853437, "perf/iters_per_sec": 0.9150594672457871, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0928251504898072, "data/tokens_consumed": 55408852992, "data/tokens_consumed_B": 55.408852992, "train/loss_slope": -9.125045831590427e-06} {"step": 26430, "timestamp": 1778223103.4195964, "train/loss": 2.222165846824646, "train/z_loss": 0.0015117418835870922, "train/perplexity": 9.22729414510024, "train/grad_norm": 0.1328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020135.5089187948, "perf/iters_per_sec": 0.9632756752580618, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.038124418258667, "data/tokens_consumed": 55429824512, "data/tokens_consumed_B": 55.429824512, "train/loss_slope": -9.136357687988165e-06} {"step": 26440, "timestamp": 1778223114.2889972, "train/loss": 2.2292393922805784, "train/z_loss": 0.0014952946454286574, "train/perplexity": 9.292795218970376, "train/grad_norm": 0.27734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1930586.497065983, "perf/iters_per_sec": 0.9205753789262691, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.086277151107788, "data/tokens_consumed": 55450796032, "data/tokens_consumed_B": 55.450796032, "train/loss_slope": -9.199507494713848e-06} {"step": 26450, "timestamp": 1778223124.9803069, "grad/layer_0/attn": 0.0029806937091052532, "grad/layer_0/mlp": 0.002888845046982169, "grad/layer_0/attn_mlp_ratio": 1.0317942144524548, "grad/layer_4/attn": 0.002426199149340391, "grad/layer_4/mlp": 0.002578769577667117, "grad/layer_4/attn_mlp_ratio": 0.9408359227859586, "grad/layer_8/attn": 0.00826270878314972, "grad/layer_8/mlp": 0.003840748919174075, "grad/layer_8/attn_mlp_ratio": 2.151327447302598, "grad/layer_12/attn": 0.00485332403331995, "grad/layer_12/mlp": 0.006392272654920816, "grad/layer_12/attn_mlp_ratio": 0.7592485833123794, "grad/layer_16/attn": 0.0067222206853330135, "grad/layer_16/mlp": 0.0047219302505254745, "grad/layer_16/attn_mlp_ratio": 1.423617077406688, "grad/layer_20/attn": 0.003888083156198263, "grad/layer_20/mlp": 0.006048949435353279, "grad/layer_20/attn_mlp_ratio": 0.6427699774108276, "grad/layer_24/attn": 0.010259641334414482, "grad/layer_24/mlp": 0.011327600106596947, "grad/layer_24/attn_mlp_ratio": 0.9057206422627355, "grad/layer_27/attn": 0.013879377394914627, "grad/layer_27/mlp": 0.007980963215231895, "grad/layer_27/attn_mlp_ratio": 1.7390604174843707} {"step": 26450, "timestamp": 1778223124.9983985, "train/loss": 2.209841275215149, "train/z_loss": 0.0014983379165641963, "train/perplexity": 9.114269617739643, "train/grad_norm": 0.12890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1959208.7566991826, "perf/iters_per_sec": 0.934223535871116, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0704076290130615, "data/tokens_consumed": 55471767552, "data/tokens_consumed_B": 55.471767552, "train/loss_slope": -8.418214904128201e-06} {"step": 26460, "timestamp": 1778223135.3767872, "train/loss": 2.1668980360031127, "train/z_loss": 0.0015088428976014257, "train/perplexity": 8.731158252332225, "train/grad_norm": 0.1533203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022223.9561777024, "perf/iters_per_sec": 0.9642715245140564, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037052297592163, "data/tokens_consumed": 55492739072, "data/tokens_consumed_B": 55.492739072, "train/loss_slope": -1.2014549421613147e-05} {"step": 26470, "timestamp": 1778223145.7704515, "train/loss": 2.1446767330169676, "train/z_loss": 0.0015015384647995233, "train/perplexity": 8.539280323674527, "train/grad_norm": 0.2021484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019439.1281129334, "perf/iters_per_sec": 0.9629436150135676, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.038482403755188, "data/tokens_consumed": 55513710592, "data/tokens_consumed_B": 55.513710592, "train/loss_slope": -1.702332229825996e-05} {"step": 26475, "timestamp": 1778223151.5691094, "eos/sharpness": 47.780156135559075, "eos/L0_probe": 2.023869037628174, "eos/L_plus": 2.2558884620666504, "eos/L_minus": 2.269651174545288, "eos/grad_norm": 0.1978929042816162, "eos/embed_grad_frac": 0.06933888047933578, "eos/time_s": 0.6201374530792236} {"step": 26475, "timestamp": 1778223152.9506073, "geo/rankme_last": 439.97320556640625, "geo/layer_0/stable_rank_q_proj": 18.17388343811035, "geo/layer_0/stable_rank_k_proj": 15.858942985534668, "geo/layer_0/stable_rank_o_proj": 51.211761474609375, "geo/layer_0/stable_rank_gate_proj": 146.11517333984375, "geo/layer_0/stable_rank_down_proj": 51.09584426879883, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.058579955250024796, "geo/layer_0/attn_entropy_mean": 6.238864421844482, "geo/layer_0/attn_entropy_std": 0.3392643928527832, "geo/layer_7/stable_rank_q_proj": 42.538272857666016, "geo/layer_7/stable_rank_k_proj": 42.2939453125, "geo/layer_7/stable_rank_o_proj": 107.0439224243164, "geo/layer_7/stable_rank_gate_proj": 98.89525604248047, "geo/layer_7/stable_rank_down_proj": 148.4442901611328, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5311281085014343, "geo/layer_7/attn_entropy_mean": 4.648162364959717, "geo/layer_7/attn_entropy_std": 0.8418161869049072, "geo/layer_14/stable_rank_q_proj": 56.301246643066406, "geo/layer_14/stable_rank_k_proj": 34.93754196166992, "geo/layer_14/stable_rank_o_proj": 54.1704216003418, "geo/layer_14/stable_rank_gate_proj": 82.40013122558594, "geo/layer_14/stable_rank_down_proj": 135.39559936523438, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3730924725532532, "geo/layer_14/attn_entropy_mean": 5.519650459289551, "geo/layer_14/attn_entropy_std": 0.4248582720756531, "geo/layer_21/stable_rank_q_proj": 46.13418197631836, "geo/layer_21/stable_rank_k_proj": 31.146085739135742, "geo/layer_21/stable_rank_o_proj": 80.92889404296875, "geo/layer_21/stable_rank_gate_proj": 81.80548858642578, "geo/layer_21/stable_rank_down_proj": 58.769779205322266, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14384311437606812, "geo/layer_21/attn_entropy_mean": 5.737924575805664, "geo/layer_21/attn_entropy_std": 0.2938575744628906, "geo/layer_27/stable_rank_q_proj": 41.1950569152832, "geo/layer_27/stable_rank_k_proj": 31.262048721313477, "geo/layer_27/stable_rank_o_proj": 119.59618377685547, "geo/layer_27/stable_rank_gate_proj": 89.82151794433594, "geo/layer_27/stable_rank_down_proj": 135.9305877685547, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08199853450059891, "geo/layer_27/attn_entropy_mean": 4.3595805168151855, "geo/layer_27/attn_entropy_std": 0.6129158735275269, "attnres/final_alpha/block_0": 0.24149614572525024, "attnres/block_norm/0": 1.663055658340454, "attnres/final_alpha/block_1": 0.006276700180023909, "attnres/block_norm/1": 33317.3828125, "attnres/final_alpha/block_2": 0.012857233174145222, "attnres/block_norm/2": 23306.369140625, "attnres/final_alpha/block_3": 0.015022825449705124, "attnres/block_norm/3": 35922.390625, "attnres/final_alpha/block_4": 0.01919795759022236, "attnres/block_norm/4": 10659.8310546875, "attnres/final_alpha/block_5": 0.5755342245101929, "attnres/block_norm/5": 5405.3486328125, "attnres/final_alpha/block_6": 0.129614919424057, "attnres/block_norm/6": 23995.28125, "geo/tier1_time_s": 1.3615822792053223, "geo/step": 26475.0, "geo/rankme_slope": -0.00011177555006377551} {"step": 26480, "timestamp": 1778223158.1476138, "train/loss": 2.238900089263916, "train/z_loss": 0.0014881472918204964, "train/perplexity": 9.383005141472587, "train/grad_norm": 0.12158203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1695214.329545294, "perf/iters_per_sec": 0.8083411834455938, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2371013879776, "data/tokens_consumed": 55534682112, "data/tokens_consumed_B": 55.534682112, "train/loss_slope": -1.3773788670466333e-05} {"step": 26490, "timestamp": 1778223168.5266569, "train/loss": 2.1719410181045533, "train/z_loss": 0.0014950495329685508, "train/perplexity": 8.77530053795077, "train/grad_norm": 0.171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021679.4610106943, "perf/iters_per_sec": 0.9640118889859649, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373316049575805, "data/tokens_consumed": 55555653632, "data/tokens_consumed_B": 55.555653632, "train/loss_slope": -1.5854378780945277e-05} {"step": 26500, "timestamp": 1778223178.8981726, "grad/layer_0/attn": 0.003002389334142208, "grad/layer_0/mlp": 0.0028798917774111032, "grad/layer_0/attn_mlp_ratio": 1.042535436032142, "grad/layer_4/attn": 0.0026766920927911997, "grad/layer_4/mlp": 0.0025268520694226027, "grad/layer_4/attn_mlp_ratio": 1.0592990461340823, "grad/layer_8/attn": 0.006026852410286665, "grad/layer_8/mlp": 0.003781696781516075, "grad/layer_8/attn_mlp_ratio": 1.5936899754563383, "grad/layer_12/attn": 0.005758434534072876, "grad/layer_12/mlp": 0.007674405816942453, "grad/layer_12/attn_mlp_ratio": 0.7503427100930679, "grad/layer_16/attn": 0.004069007467478514, "grad/layer_16/mlp": 0.004984367173165083, "grad/layer_16/attn_mlp_ratio": 0.8163538608772476, "grad/layer_20/attn": 0.00481216236948967, "grad/layer_20/mlp": 0.0065410505048930645, "grad/layer_20/attn_mlp_ratio": 0.735686460809508, "grad/layer_24/attn": 0.014526602812111378, "grad/layer_24/mlp": 0.011681074276566505, "grad/layer_24/attn_mlp_ratio": 1.2436016023708667, "grad/layer_27/attn": 0.0059130494482815266, "grad/layer_27/mlp": 0.009734513238072395, "grad/layer_27/attn_mlp_ratio": 0.6074314393463469} {"step": 26500, "timestamp": 1778223178.9149067, "train/loss": 2.2012415885925294, "train/z_loss": 0.0014887418365105987, "train/perplexity": 9.03622581234199, "train/grad_norm": 0.19921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020214.058710208, "perf/iters_per_sec": 0.9633131307173767, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.038084053993225, "data/tokens_consumed": 55576625152, "data/tokens_consumed_B": 55.576625152, "train/loss_slope": -1.2939351283856583e-05} {"step": 26500, "timestamp": 1778223185.9168115, "geo/ww_alpha_mean": 7.749308984703915, "geo/ww_alpha_std": 4.127010898043807, "geo/ww_alpha_min": 1.3660917853420664, "geo/ww_alpha_max": 29.90868061012223, "geo/ww_alpha_healthy_frac": 0.14720812182741116, "geo/ww_alpha_by_type/q_proj": 4.1570746663864595, "geo/ww_alpha_by_type/k_proj": 4.640554025599544, "geo/ww_alpha_by_type/v_proj": 8.294686176091629, "geo/ww_alpha_by_type/o_proj": 9.14971281864244, "geo/ww_alpha_by_type/gate_proj": 8.548403641666196, "geo/ww_alpha_by_type/up_proj": 10.591347111031439, "geo/ww_alpha_by_type/down_proj": 8.977184506445203, "geo/twonn_id/layer_0": 0.7166092991828918, "geo/twonn_id/layer_7": 3.149993419647217, "geo/twonn_id/layer_14": 3.904581308364868, "geo/twonn_id/layer_21": 6.764953136444092, "geo/twonn_id/layer_27": 5.690842628479004, "geo/tier2_time_s": 6.995551586151123} {"step": 26500, "timestamp": 1778223186.5856435, "eoc/jacobian_sigma/layer_0/attn": 902.6802978515625, "eoc/jacobian_sigma/layer_0/mlp": 6586.2978515625, "eoc/jacobian_sigma/layer_0": 6586.2978515625, "eoc/jacobian_sigma/layer_7/attn": 1.1648874282836914, "eoc/jacobian_sigma/layer_7/mlp": 1.6307789087295532, "eoc/jacobian_sigma/layer_7": 1.6307789087295532, "eoc/jacobian_sigma/layer_14/attn": 1.6181172132492065, "eoc/jacobian_sigma/layer_14/mlp": 6.84258508682251, "eoc/jacobian_sigma/layer_14": 6.84258508682251, "eoc/jacobian_sigma/layer_21/attn": 1.091607928276062, "eoc/jacobian_sigma/layer_21/mlp": 4.437602519989014, "eoc/jacobian_sigma/layer_21": 4.437602519989014, "eoc/jacobian_sigma/layer_27/attn": 3.616065740585327, "eoc/jacobian_sigma/layer_27/mlp": 30.12218475341797, "eoc/jacobian_sigma/layer_27": 30.12218475341797, "eoc/layer0_sigma": 6586.2978515625, "eoc/sigma_max": 30.12218475341797, "eoc/sigma_min": 1.6307789087295532, "eoc/sigma_mean": 10.758287817239761, "eoc/time_s": 0.6634657382965088} {"step": 26510, "timestamp": 1778223196.978402, "train/loss": 2.215081477165222, "train/z_loss": 0.0014856900786980986, "train/perplexity": 9.162155587662621, "train/grad_norm": 0.1708984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1161331.2039592366, "perf/iters_per_sec": 0.5537658710285361, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.8058173179626464, "data/tokens_consumed": 55597596672, "data/tokens_consumed_B": 55.597596672, "train/loss_slope": -1.2449418085672277e-05} {"step": 26520, "timestamp": 1778223207.3660533, "train/loss": 2.2263001203536987, "train/z_loss": 0.0014975972240790724, "train/perplexity": 9.265521269274048, "train/grad_norm": 0.15625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020087.6768050066, "perf/iters_per_sec": 0.9632528671288522, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0381489992141724, "data/tokens_consumed": 55618568192, "data/tokens_consumed_B": 55.618568192, "train/loss_slope": -1.3062278687185e-05} {"step": 26530, "timestamp": 1778223217.744646, "train/loss": 2.1867338895797728, "train/z_loss": 0.0015001144143752753, "train/perplexity": 8.906077328286122, "train/grad_norm": 0.1396484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021819.2402402684, "perf/iters_per_sec": 0.9640785409165709, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0372598886489868, "data/tokens_consumed": 55639539712, "data/tokens_consumed_B": 55.639539712, "train/loss_slope": -1.4375027993617419e-05} {"step": 26540, "timestamp": 1778223228.1199903, "train/loss": 2.1566378831863404, "train/z_loss": 0.0014989604707807303, "train/perplexity": 8.642033234401072, "train/grad_norm": 0.1689453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022339.725341764, "perf/iters_per_sec": 0.9643267275532551, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0369929313659667, "data/tokens_consumed": 55660511232, "data/tokens_consumed_B": 55.660511232, "train/loss_slope": -1.6573074999207567e-05} {"step": 26550, "timestamp": 1778223238.4905593, "grad/layer_0/attn": 0.003143560141324997, "grad/layer_0/mlp": 0.0026448729913681746, "grad/layer_0/attn_mlp_ratio": 1.1885485740636632, "grad/layer_4/attn": 0.0016403894405812025, "grad/layer_4/mlp": 0.002457445953041315, "grad/layer_4/attn_mlp_ratio": 0.6675179862243853, "grad/layer_8/attn": 0.00864576455205679, "grad/layer_8/mlp": 0.0036421208642423153, "grad/layer_8/attn_mlp_ratio": 2.373826854445347, "grad/layer_12/attn": 0.004174899309873581, "grad/layer_12/mlp": 0.00632118247449398, "grad/layer_12/attn_mlp_ratio": 0.66046175073622, "grad/layer_16/attn": 0.004921691957861185, "grad/layer_16/mlp": 0.004525710828602314, "grad/layer_16/attn_mlp_ratio": 1.087495873135928, "grad/layer_20/attn": 0.008389991708099842, "grad/layer_20/mlp": 0.0060948836617171764, "grad/layer_20/attn_mlp_ratio": 1.3765630381334855, "grad/layer_24/attn": 0.015130153857171535, "grad/layer_24/mlp": 0.009155991487205029, "grad/layer_24/attn_mlp_ratio": 1.65248664910473, "grad/layer_27/attn": 0.007174546830356121, "grad/layer_27/mlp": 0.007577220443636179, "grad/layer_27/attn_mlp_ratio": 0.9468573323211177} {"step": 26550, "timestamp": 1778223239.1270468, "eos/sharpness": 31.77318572998046, "eos/L0_probe": 2.026359796524048, "eos/L_plus": 2.160682439804077, "eos/L_minus": 2.2097690105438232, "eos/grad_norm": 0.13834896683692932, "eos/embed_grad_frac": 0.13294018805027008, "eos/time_s": 0.6335034370422363} {"step": 26550, "timestamp": 1778223239.1476352, "train/loss": 2.2040568351745606, "train/z_loss": 0.0015065457206219434, "train/perplexity": 9.06170085861768, "train/grad_norm": 0.138671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1902534.2656852352, "perf/iters_per_sec": 0.9071990326334167, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1022939443588258, "data/tokens_consumed": 55681482752, "data/tokens_consumed_B": 55.681482752, "train/loss_slope": -1.5696724527704115e-05} {"step": 26550, "timestamp": 1778223240.507989, "geo/rankme_last": 440.69696044921875, "geo/layer_0/stable_rank_q_proj": 18.206130981445312, "geo/layer_0/stable_rank_k_proj": 15.902202606201172, "geo/layer_0/stable_rank_o_proj": 51.17448425292969, "geo/layer_0/stable_rank_gate_proj": 145.8732147216797, "geo/layer_0/stable_rank_down_proj": 51.119503021240234, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05827416479587555, "geo/layer_0/attn_entropy_mean": 6.243450164794922, "geo/layer_0/attn_entropy_std": 0.3408350348472595, "geo/layer_7/stable_rank_q_proj": 42.65657424926758, "geo/layer_7/stable_rank_k_proj": 42.139957427978516, "geo/layer_7/stable_rank_o_proj": 107.25750732421875, "geo/layer_7/stable_rank_gate_proj": 98.63544464111328, "geo/layer_7/stable_rank_down_proj": 148.3571319580078, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5444996356964111, "geo/layer_7/attn_entropy_mean": 4.665273666381836, "geo/layer_7/attn_entropy_std": 0.825596034526825, "geo/layer_14/stable_rank_q_proj": 56.33467483520508, "geo/layer_14/stable_rank_k_proj": 34.94845199584961, "geo/layer_14/stable_rank_o_proj": 54.06604766845703, "geo/layer_14/stable_rank_gate_proj": 82.32052612304688, "geo/layer_14/stable_rank_down_proj": 135.21551513671875, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37271326780319214, "geo/layer_14/attn_entropy_mean": 5.500146389007568, "geo/layer_14/attn_entropy_std": 0.42835068702697754, "geo/layer_21/stable_rank_q_proj": 46.0908203125, "geo/layer_21/stable_rank_k_proj": 31.131988525390625, "geo/layer_21/stable_rank_o_proj": 80.92674255371094, "geo/layer_21/stable_rank_gate_proj": 81.69241333007812, "geo/layer_21/stable_rank_down_proj": 58.78704071044922, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14885136485099792, "geo/layer_21/attn_entropy_mean": 5.748142242431641, "geo/layer_21/attn_entropy_std": 0.30517005920410156, "geo/layer_27/stable_rank_q_proj": 41.184871673583984, "geo/layer_27/stable_rank_k_proj": 31.176563262939453, "geo/layer_27/stable_rank_o_proj": 119.4386978149414, "geo/layer_27/stable_rank_gate_proj": 89.51055145263672, "geo/layer_27/stable_rank_down_proj": 135.56729125976562, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08464658260345459, "geo/layer_27/attn_entropy_mean": 4.337523460388184, "geo/layer_27/attn_entropy_std": 0.6117454767227173, "attnres/final_alpha/block_0": 0.2411384880542755, "attnres/block_norm/0": 1.663684606552124, "attnres/final_alpha/block_1": 0.006220594979822636, "attnres/block_norm/1": 33382.62890625, "attnres/final_alpha/block_2": 0.013014093972742558, "attnres/block_norm/2": 23315.6796875, "attnres/final_alpha/block_3": 0.014994052238762379, "attnres/block_norm/3": 36145.2734375, "attnres/final_alpha/block_4": 0.018998021259903908, "attnres/block_norm/4": 10725.7763671875, "attnres/final_alpha/block_5": 0.577973484992981, "attnres/block_norm/5": 5398.7138671875, "attnres/final_alpha/block_6": 0.12766127288341522, "attnres/block_norm/6": 24046.42578125, "geo/tier1_time_s": 1.3566205501556396, "geo/step": 26550.0, "geo/rankme_slope": -9.78217849639856e-05} {"step": 26560, "timestamp": 1778223250.8895714, "train/loss": 2.2276781797409058, "train/z_loss": 0.001500646723434329, "train/perplexity": 9.278298509712553, "train/grad_norm": 0.1328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1786588.4607567787, "perf/iters_per_sec": 0.8519117645057577, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1738304853439332, "data/tokens_consumed": 55702454272, "data/tokens_consumed_B": 55.702454272, "train/loss_slope": -1.2623604567888611e-05} {"step": 26570, "timestamp": 1778223261.2672284, "train/loss": 2.200301241874695, "train/z_loss": 0.001484917977359146, "train/perplexity": 9.027732620955907, "train/grad_norm": 0.2001953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021817.2884025434, "perf/iters_per_sec": 0.9640776102078168, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037260890007019, "data/tokens_consumed": 55723425792, "data/tokens_consumed_B": 55.723425792, "train/loss_slope": -1.433311828506843e-05} {"step": 26580, "timestamp": 1778223271.6551802, "train/loss": 2.171358919143677, "train/z_loss": 0.0015054542920552195, "train/perplexity": 8.77019393104573, "train/grad_norm": 0.1884765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020165.758912624, "perf/iters_per_sec": 0.963290099579155, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0381088733673096, "data/tokens_consumed": 55744397312, "data/tokens_consumed_B": 55.744397312, "train/loss_slope": -1.534565656778157e-05} {"step": 26590, "timestamp": 1778223282.0400355, "train/loss": 2.2080824613571166, "train/z_loss": 0.001493901910725981, "train/perplexity": 9.09825340292915, "train/grad_norm": 0.12060546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020523.956154175, "perf/iters_per_sec": 0.9634609013338924, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037924838066101, "data/tokens_consumed": 55765368832, "data/tokens_consumed_B": 55.765368832, "train/loss_slope": -1.6408038768831264e-05} {"step": 26600, "timestamp": 1778223292.410201, "grad/layer_0/attn": 0.0026912738103419542, "grad/layer_0/mlp": 0.0027340957894921303, "grad/layer_0/attn_mlp_ratio": 0.9843377551918523, "grad/layer_4/attn": 0.0021834629587829113, "grad/layer_4/mlp": 0.0023053723853081465, "grad/layer_4/attn_mlp_ratio": 0.9471193799257389, "grad/layer_8/attn": 0.005137437488883734, "grad/layer_8/mlp": 0.003922704141587019, "grad/layer_8/attn_mlp_ratio": 1.3096673041058196, "grad/layer_12/attn": 0.0046427552588284016, "grad/layer_12/mlp": 0.00633516488596797, "grad/layer_12/attn_mlp_ratio": 0.7328546721532653, "grad/layer_16/attn": 0.004132473841309547, "grad/layer_16/mlp": 0.004544320981949568, "grad/layer_16/attn_mlp_ratio": 0.9093710076350211, "grad/layer_20/attn": 0.004717010073363781, "grad/layer_20/mlp": 0.00583994947373867, "grad/layer_20/attn_mlp_ratio": 0.8077141786592524, "grad/layer_24/attn": 0.013287922367453575, "grad/layer_24/mlp": 0.008898396044969559, "grad/layer_24/attn_mlp_ratio": 1.4932940892910802, "grad/layer_27/attn": 0.0050138323567807674, "grad/layer_27/mlp": 0.007630040869116783, "grad/layer_27/attn_mlp_ratio": 0.6571173571773291} {"step": 26600, "timestamp": 1778223292.4269373, "train/loss": 2.2402920484542848, "train/z_loss": 0.0014868746511638165, "train/perplexity": 9.396074995954278, "train/grad_norm": 0.138671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020513.8382045403, "perf/iters_per_sec": 0.9634560767195417, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0379300355911254, "data/tokens_consumed": 55786340352, "data/tokens_consumed_B": 55.786340352, "train/loss_slope": -1.2641991511716028e-05} {"step": 26610, "timestamp": 1778223302.8056338, "train/loss": 2.2372249603271483, "train/z_loss": 0.001488014217466116, "train/perplexity": 9.367300555321949, "train/grad_norm": 0.2578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021926.5971190424, "perf/iters_per_sec": 0.964129732665559, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0372048139572143, "data/tokens_consumed": 55807311872, "data/tokens_consumed_B": 55.807311872, "train/loss_slope": -9.423351867256894e-06} {"step": 26620, "timestamp": 1778223313.1819701, "train/loss": 2.2305869102478026, "train/z_loss": 0.0014805089798755945, "train/perplexity": 9.305325868235085, "train/grad_norm": 0.1728515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022225.3509080238, "perf/iters_per_sec": 0.9642721895732993, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0370515823364257, "data/tokens_consumed": 55828283392, "data/tokens_consumed_B": 55.828283392, "train/loss_slope": -8.129303862374013e-06} {"step": 26625, "timestamp": 1778223318.98952, "eos/sharpness": 42.40527153015136, "eos/L0_probe": 2.026055097579956, "eos/L_plus": 2.27691912651062, "eos/L_minus": 2.1992437839508057, "eos/grad_norm": 0.13889996707439423, "eos/embed_grad_frac": 0.13239555060863495, "eos/time_s": 0.6141400337219238} {"step": 26625, "timestamp": 1778223320.3696694, "geo/rankme_last": 440.6793212890625, "geo/layer_0/stable_rank_q_proj": 18.205995559692383, "geo/layer_0/stable_rank_k_proj": 15.91513729095459, "geo/layer_0/stable_rank_o_proj": 51.12834548950195, "geo/layer_0/stable_rank_gate_proj": 145.67051696777344, "geo/layer_0/stable_rank_down_proj": 51.18064880371094, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05068790167570114, "geo/layer_0/attn_entropy_mean": 6.245870590209961, "geo/layer_0/attn_entropy_std": 0.3351883888244629, "geo/layer_7/stable_rank_q_proj": 42.57180404663086, "geo/layer_7/stable_rank_k_proj": 42.07318115234375, "geo/layer_7/stable_rank_o_proj": 107.21404266357422, "geo/layer_7/stable_rank_gate_proj": 98.61858367919922, "geo/layer_7/stable_rank_down_proj": 148.59832763671875, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5304964184761047, "geo/layer_7/attn_entropy_mean": 4.6471076011657715, "geo/layer_7/attn_entropy_std": 0.8524825572967529, "geo/layer_14/stable_rank_q_proj": 56.34248352050781, "geo/layer_14/stable_rank_k_proj": 34.95559310913086, "geo/layer_14/stable_rank_o_proj": 53.90913009643555, "geo/layer_14/stable_rank_gate_proj": 82.15082550048828, "geo/layer_14/stable_rank_down_proj": 135.01393127441406, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3730889558792114, "geo/layer_14/attn_entropy_mean": 5.459504127502441, "geo/layer_14/attn_entropy_std": 0.4346693158149719, "geo/layer_21/stable_rank_q_proj": 45.98039627075195, "geo/layer_21/stable_rank_k_proj": 31.183616638183594, "geo/layer_21/stable_rank_o_proj": 81.10203552246094, "geo/layer_21/stable_rank_gate_proj": 81.42876434326172, "geo/layer_21/stable_rank_down_proj": 58.80175018310547, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1506023108959198, "geo/layer_21/attn_entropy_mean": 5.72872257232666, "geo/layer_21/attn_entropy_std": 0.2925605773925781, "geo/layer_27/stable_rank_q_proj": 41.29756546020508, "geo/layer_27/stable_rank_k_proj": 31.28714370727539, "geo/layer_27/stable_rank_o_proj": 119.48245239257812, "geo/layer_27/stable_rank_gate_proj": 89.53649139404297, "geo/layer_27/stable_rank_down_proj": 135.7245330810547, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07595496624708176, "geo/layer_27/attn_entropy_mean": 4.3476033210754395, "geo/layer_27/attn_entropy_std": 0.6113555431365967, "attnres/final_alpha/block_0": 0.2402913123369217, "attnres/block_norm/0": 1.663900375366211, "attnres/final_alpha/block_1": 0.0061669666320085526, "attnres/block_norm/1": 33431.6328125, "attnres/final_alpha/block_2": 0.012546776793897152, "attnres/block_norm/2": 23287.03515625, "attnres/final_alpha/block_3": 0.01508269552141428, "attnres/block_norm/3": 36018.49609375, "attnres/final_alpha/block_4": 0.018918901681900024, "attnres/block_norm/4": 10717.1884765625, "attnres/final_alpha/block_5": 0.5803348422050476, "attnres/block_norm/5": 5426.5087890625, "attnres/final_alpha/block_6": 0.12665849924087524, "attnres/block_norm/6": 24097.8125, "geo/tier1_time_s": 1.3605601787567139, "geo/step": 26625.0, "geo/rankme_slope": -7.9847681260004e-05} {"step": 26630, "timestamp": 1778223325.5620027, "train/loss": 2.2171104669570925, "train/z_loss": 0.001490952877793461, "train/perplexity": 9.18076437996227, "train/grad_norm": 0.2421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1694718.4996004275, "perf/iters_per_sec": 0.8081047533037317, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2374633312225343, "data/tokens_consumed": 55849254912, "data/tokens_consumed_B": 55.849254912, "train/loss_slope": -5.9726668949281634e-06} {"step": 26640, "timestamp": 1778223335.9457428, "train/loss": 2.181073546409607, "train/z_loss": 0.0015025062952190638, "train/perplexity": 8.855808278560266, "train/grad_norm": 0.126953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020812.5924534383, "perf/iters_per_sec": 0.9635985338465873, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0377765893936157, "data/tokens_consumed": 55870226432, "data/tokens_consumed_B": 55.870226432, "train/loss_slope": -8.561048449033581e-06} {"step": 26650, "timestamp": 1778223346.31417, "grad/layer_0/attn": 0.002890833420678973, "grad/layer_0/mlp": 0.002697348827496171, "grad/layer_0/attn_mlp_ratio": 1.0717313548908947, "grad/layer_4/attn": 0.0018468297785148025, "grad/layer_4/mlp": 0.002469934755936265, "grad/layer_4/attn_mlp_ratio": 0.7477240843320682, "grad/layer_8/attn": 0.005187090951949358, "grad/layer_8/mlp": 0.0037315073423087597, "grad/layer_8/attn_mlp_ratio": 1.3900792192283553, "grad/layer_12/attn": 0.0038611809723079205, "grad/layer_12/mlp": 0.0057572899386286736, "grad/layer_12/attn_mlp_ratio": 0.6706594502624039, "grad/layer_16/attn": 0.003746915142983198, "grad/layer_16/mlp": 0.004472556058317423, "grad/layer_16/attn_mlp_ratio": 0.8377569806508118, "grad/layer_20/attn": 0.008112020790576935, "grad/layer_20/mlp": 0.006606843788176775, "grad/layer_20/attn_mlp_ratio": 1.2278208669488524, "grad/layer_24/attn": 0.010813970118761063, "grad/layer_24/mlp": 0.012270966544747353, "grad/layer_24/attn_mlp_ratio": 0.8812647309566305, "grad/layer_27/attn": 0.012960943393409252, "grad/layer_27/mlp": 0.008877159096300602, "grad/layer_27/attn_mlp_ratio": 1.4600327770184065} {"step": 26650, "timestamp": 1778223346.3308072, "train/loss": 2.1994336366653444, "train/z_loss": 0.0014918009052053093, "train/perplexity": 9.019903509885223, "train/grad_norm": 0.12255859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020416.0984592836, "perf/iters_per_sec": 0.96340947077717, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0379802465438843, "data/tokens_consumed": 55891197952, "data/tokens_consumed_B": 55.891197952, "train/loss_slope": -5.766510591469737e-06} {"step": 26660, "timestamp": 1778223356.7084582, "train/loss": 2.210370397567749, "train/z_loss": 0.0014892866369336844, "train/perplexity": 9.119093457610221, "train/grad_norm": 0.263671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021824.3057414698, "perf/iters_per_sec": 0.9640809563357686, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0372572898864747, "data/tokens_consumed": 55912169472, "data/tokens_consumed_B": 55.912169472, "train/loss_slope": -5.482321067361366e-06} {"step": 26670, "timestamp": 1778223367.0882869, "train/loss": 2.18269305229187, "train/z_loss": 0.0014872208703309298, "train/perplexity": 8.870161931935394, "train/grad_norm": 0.1201171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021508.4343325354, "perf/iters_per_sec": 0.9639303371107747, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037419366836548, "data/tokens_consumed": 55933140992, "data/tokens_consumed_B": 55.933140992, "train/loss_slope": -5.93226602618984e-06} {"step": 26680, "timestamp": 1778223377.4712794, "train/loss": 2.206273007392883, "train/z_loss": 0.00150842210277915, "train/perplexity": 9.081805417665416, "train/grad_norm": 0.1845703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021086.0792293667, "perf/iters_per_sec": 0.9637289425036272, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037636160850525, "data/tokens_consumed": 55954112512, "data/tokens_consumed_B": 55.954112512, "train/loss_slope": -6.560371010073863e-06} {"step": 26690, "timestamp": 1778223387.8533723, "train/loss": 2.199318528175354, "train/z_loss": 0.0015046001179143786, "train/perplexity": 9.018865302166748, "train/grad_norm": 0.25, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021064.624789851, "perf/iters_per_sec": 0.9637187122296577, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0376471757888794, "data/tokens_consumed": 55975084032, "data/tokens_consumed_B": 55.975084032, "train/loss_slope": -6.286699071575682e-06} {"step": 26700, "timestamp": 1778223398.219881, "grad/layer_0/attn": 0.0032951885368674994, "grad/layer_0/mlp": 0.003032789332792163, "grad/layer_0/attn_mlp_ratio": 1.0865207129905343, "grad/layer_4/attn": 0.0025573468301445246, "grad/layer_4/mlp": 0.002544508781284094, "grad/layer_4/attn_mlp_ratio": 1.0050453543136983, "grad/layer_8/attn": 0.0063881403766572475, "grad/layer_8/mlp": 0.0038380997721105814, "grad/layer_8/attn_mlp_ratio": 1.6644018106658574, "grad/layer_12/attn": 0.003897168906405568, "grad/layer_12/mlp": 0.006336729507893324, "grad/layer_12/attn_mlp_ratio": 0.6150126559844173, "grad/layer_16/attn": 0.00469348207116127, "grad/layer_16/mlp": 0.004474068060517311, "grad/layer_16/attn_mlp_ratio": 1.0490412534570304, "grad/layer_20/attn": 0.0055645909160375595, "grad/layer_20/mlp": 0.005796661134809256, "grad/layer_20/attn_mlp_ratio": 0.9599648298613517, "grad/layer_24/attn": 0.015177258290350437, "grad/layer_24/mlp": 0.012743593193590641, "grad/layer_24/attn_mlp_ratio": 1.1909716467476879, "grad/layer_27/attn": 0.007207239046692848, "grad/layer_27/mlp": 0.01279967837035656, "grad/layer_27/attn_mlp_ratio": 0.5630796948051834} {"step": 26700, "timestamp": 1778223398.8426402, "eos/sharpness": 35.52556037902831, "eos/L0_probe": 2.0237865447998047, "eos/L_plus": 2.2026264667510986, "eos/L_minus": 2.200202226638794, "eos/grad_norm": 0.15540456771850586, "eos/embed_grad_frac": 0.10765673965215683, "eos/time_s": 0.6197531223297119} {"step": 26700, "timestamp": 1778223398.8629744, "train/loss": 2.200333905220032, "train/z_loss": 0.0014950662036426366, "train/perplexity": 9.028027501719986, "train/grad_norm": 0.1552734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1905909.3164260713, "perf/iters_per_sec": 0.9088083822374684, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1003419637680054, "data/tokens_consumed": 55996055552, "data/tokens_consumed_B": 55.996055552, "train/loss_slope": -9.877925122281331e-06} {"step": 26700, "timestamp": 1778223400.2273276, "geo/rankme_last": 439.83160400390625, "geo/layer_0/stable_rank_q_proj": 18.228252410888672, "geo/layer_0/stable_rank_k_proj": 15.839792251586914, "geo/layer_0/stable_rank_o_proj": 50.88725280761719, "geo/layer_0/stable_rank_gate_proj": 145.0047149658203, "geo/layer_0/stable_rank_down_proj": 51.15684509277344, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05156564339995384, "geo/layer_0/attn_entropy_mean": 6.240331649780273, "geo/layer_0/attn_entropy_std": 0.335634708404541, "geo/layer_7/stable_rank_q_proj": 42.73514938354492, "geo/layer_7/stable_rank_k_proj": 41.928428649902344, "geo/layer_7/stable_rank_o_proj": 107.36299133300781, "geo/layer_7/stable_rank_gate_proj": 98.36581420898438, "geo/layer_7/stable_rank_down_proj": 147.97850036621094, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5487187504768372, "geo/layer_7/attn_entropy_mean": 4.641498565673828, "geo/layer_7/attn_entropy_std": 0.8516897559165955, "geo/layer_14/stable_rank_q_proj": 56.2584342956543, "geo/layer_14/stable_rank_k_proj": 35.057491302490234, "geo/layer_14/stable_rank_o_proj": 53.886356353759766, "geo/layer_14/stable_rank_gate_proj": 82.25531768798828, "geo/layer_14/stable_rank_down_proj": 134.99684143066406, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3659714162349701, "geo/layer_14/attn_entropy_mean": 5.492260456085205, "geo/layer_14/attn_entropy_std": 0.430066853761673, "geo/layer_21/stable_rank_q_proj": 45.95360565185547, "geo/layer_21/stable_rank_k_proj": 31.192596435546875, "geo/layer_21/stable_rank_o_proj": 81.10275268554688, "geo/layer_21/stable_rank_gate_proj": 81.28288269042969, "geo/layer_21/stable_rank_down_proj": 58.83818054199219, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1536455899477005, "geo/layer_21/attn_entropy_mean": 5.732630729675293, "geo/layer_21/attn_entropy_std": 0.2890819311141968, "geo/layer_27/stable_rank_q_proj": 41.28095626831055, "geo/layer_27/stable_rank_k_proj": 31.292692184448242, "geo/layer_27/stable_rank_o_proj": 119.5455551147461, "geo/layer_27/stable_rank_gate_proj": 89.43370819091797, "geo/layer_27/stable_rank_down_proj": 135.51780700683594, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08703260868787766, "geo/layer_27/attn_entropy_mean": 4.339900970458984, "geo/layer_27/attn_entropy_std": 0.5841988921165466, "attnres/final_alpha/block_0": 0.23935644328594208, "attnres/block_norm/0": 1.6642401218414307, "attnres/final_alpha/block_1": 0.006098032463341951, "attnres/block_norm/1": 33538.0234375, "attnres/final_alpha/block_2": 0.01283295638859272, "attnres/block_norm/2": 23371.9765625, "attnres/final_alpha/block_3": 0.015158332884311676, "attnres/block_norm/3": 36273.33203125, "attnres/final_alpha/block_4": 0.01869126409292221, "attnres/block_norm/4": 10734.486328125, "attnres/final_alpha/block_5": 0.5821504592895508, "attnres/block_norm/5": 5402.57666015625, "attnres/final_alpha/block_6": 0.12571248412132263, "attnres/block_norm/6": 24304.955078125, "geo/tier1_time_s": 1.3590409755706787, "geo/step": 26700.0, "geo/rankme_slope": -9.630201690051021e-05} {"step": 26710, "timestamp": 1778223410.6186173, "train/loss": 2.1682578206062315, "train/z_loss": 0.0015008686343207956, "train/perplexity": 8.74303882256877, "train/grad_norm": 0.126953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1784707.9098931688, "perf/iters_per_sec": 0.8510150479760975, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1750673532485962, "data/tokens_consumed": 56017027072, "data/tokens_consumed_B": 56.017027072, "train/loss_slope": -1.2124393997055025e-05} {"step": 26720, "timestamp": 1778223420.9977849, "train/loss": 2.2580612897872925, "train/z_loss": 0.0014812708250246942, "train/perplexity": 9.564528331781212, "train/grad_norm": 0.1103515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021461.8379476266, "perf/iters_per_sec": 0.9639081182230123, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0374432802200317, "data/tokens_consumed": 56037998592, "data/tokens_consumed_B": 56.037998592, "train/loss_slope": -9.953257918107474e-06} {"step": 26730, "timestamp": 1778223431.3755405, "train/loss": 2.169450879096985, "train/z_loss": 0.001503319782204926, "train/perplexity": 8.753476004117042, "train/grad_norm": 0.14453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021854.3274897703, "perf/iters_per_sec": 0.9640952718209125, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0372418880462646, "data/tokens_consumed": 56058970112, "data/tokens_consumed_B": 56.058970112, "train/loss_slope": -1.4592436430799754e-05} {"step": 26740, "timestamp": 1778223441.7518167, "train/loss": 2.1982646226882934, "train/z_loss": 0.0014994258293882013, "train/perplexity": 9.009365277481127, "train/grad_norm": 0.1259765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022367.3910198412, "perf/iters_per_sec": 0.9643399195765692, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0369787454605102, "data/tokens_consumed": 56079941632, "data/tokens_consumed_B": 56.079941632, "train/loss_slope": -1.5788631327617936e-05} {"step": 26750, "timestamp": 1778223452.1182492, "grad/layer_0/attn": 0.0029855843167752028, "grad/layer_0/mlp": 0.002879687352105975, "grad/layer_0/attn_mlp_ratio": 1.0367737354940318, "grad/layer_4/attn": 0.001737047336064279, "grad/layer_4/mlp": 0.0026080573443323374, "grad/layer_4/attn_mlp_ratio": 0.6660310875587186, "grad/layer_8/attn": 0.0051560113206505775, "grad/layer_8/mlp": 0.0038811524864286184, "grad/layer_8/attn_mlp_ratio": 1.3284742626919148, "grad/layer_12/attn": 0.0049267057329416275, "grad/layer_12/mlp": 0.006418474949896336, "grad/layer_12/attn_mlp_ratio": 0.7675819715184837, "grad/layer_16/attn": 0.003722556633874774, "grad/layer_16/mlp": 0.004641042090952396, "grad/layer_16/attn_mlp_ratio": 0.8020949779624524, "grad/layer_20/attn": 0.003613952547311783, "grad/layer_20/mlp": 0.00631912425160408, "grad/layer_20/attn_mlp_ratio": 0.5719071735618556, "grad/layer_24/attn": 0.008345706388354301, "grad/layer_24/mlp": 0.009375674650073051, "grad/layer_24/attn_mlp_ratio": 0.890144614742451, "grad/layer_27/attn": 0.0041841319762170315, "grad/layer_27/mlp": 0.008083198219537735, "grad/layer_27/attn_mlp_ratio": 0.5176332202692159} {"step": 26750, "timestamp": 1778223452.1452217, "train/loss": 2.174158215522766, "train/z_loss": 0.0014934271457605064, "train/perplexity": 8.794778697130184, "train/grad_norm": 0.1171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2018727.6138035904, "perf/iters_per_sec": 0.9626043385522797, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0388484239578246, "data/tokens_consumed": 56100913152, "data/tokens_consumed_B": 56.100913152, "train/loss_slope": -1.8697884459771668e-05} {"step": 26760, "timestamp": 1778223462.5186353, "train/loss": 2.178634595870972, "train/z_loss": 0.0014927544281817973, "train/perplexity": 8.834235718086875, "train/grad_norm": 0.1259765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022419.469643807, "perf/iters_per_sec": 0.9643647525996242, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0369520425796508, "data/tokens_consumed": 56121884672, "data/tokens_consumed_B": 56.121884672, "train/loss_slope": -1.676450521066816e-05} {"step": 26770, "timestamp": 1778223472.89291, "train/loss": 2.1867974996566772, "train/z_loss": 0.0014945478062145412, "train/perplexity": 8.906643862568346, "train/grad_norm": 0.1005859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022651.996219364, "perf/iters_per_sec": 0.9644756299111195, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0368328332901, "data/tokens_consumed": 56142856192, "data/tokens_consumed_B": 56.142856192, "train/loss_slope": -1.7185050919719053e-05} {"step": 26775, "timestamp": 1778223478.6808658, "eos/sharpness": 56.646680831909165, "eos/L0_probe": 2.023679494857788, "eos/L_plus": 2.366812229156494, "eos/L_minus": 2.247013568878174, "eos/grad_norm": 0.19794802367687225, "eos/embed_grad_frac": 0.06450065970420837, "eos/time_s": 0.6067554950714111} {"step": 26775, "timestamp": 1778223480.0592022, "geo/rankme_last": 440.5177307128906, "geo/layer_0/stable_rank_q_proj": 18.229629516601562, "geo/layer_0/stable_rank_k_proj": 15.84298324584961, "geo/layer_0/stable_rank_o_proj": 50.81220626831055, "geo/layer_0/stable_rank_gate_proj": 144.9580535888672, "geo/layer_0/stable_rank_down_proj": 51.20175552368164, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05496615171432495, "geo/layer_0/attn_entropy_mean": 6.237529277801514, "geo/layer_0/attn_entropy_std": 0.33980512619018555, "geo/layer_7/stable_rank_q_proj": 42.693355560302734, "geo/layer_7/stable_rank_k_proj": 41.88815689086914, "geo/layer_7/stable_rank_o_proj": 107.467529296875, "geo/layer_7/stable_rank_gate_proj": 98.36294555664062, "geo/layer_7/stable_rank_down_proj": 148.06256103515625, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5356928706169128, "geo/layer_7/attn_entropy_mean": 4.583325386047363, "geo/layer_7/attn_entropy_std": 0.8225864768028259, "geo/layer_14/stable_rank_q_proj": 56.33059310913086, "geo/layer_14/stable_rank_k_proj": 35.08053207397461, "geo/layer_14/stable_rank_o_proj": 53.861656188964844, "geo/layer_14/stable_rank_gate_proj": 82.34188079833984, "geo/layer_14/stable_rank_down_proj": 134.96768188476562, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3705950379371643, "geo/layer_14/attn_entropy_mean": 5.496484279632568, "geo/layer_14/attn_entropy_std": 0.4189065992832184, "geo/layer_21/stable_rank_q_proj": 45.914817810058594, "geo/layer_21/stable_rank_k_proj": 31.198413848876953, "geo/layer_21/stable_rank_o_proj": 81.01914978027344, "geo/layer_21/stable_rank_gate_proj": 81.22191619873047, "geo/layer_21/stable_rank_down_proj": 58.77435302734375, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14794862270355225, "geo/layer_21/attn_entropy_mean": 5.726593494415283, "geo/layer_21/attn_entropy_std": 0.2811393439769745, "geo/layer_27/stable_rank_q_proj": 41.23090362548828, "geo/layer_27/stable_rank_k_proj": 31.392963409423828, "geo/layer_27/stable_rank_o_proj": 119.6435317993164, "geo/layer_27/stable_rank_gate_proj": 89.33824920654297, "geo/layer_27/stable_rank_down_proj": 135.31594848632812, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07631567120552063, "geo/layer_27/attn_entropy_mean": 4.315441131591797, "geo/layer_27/attn_entropy_std": 0.6043960452079773, "attnres/final_alpha/block_0": 0.24085941910743713, "attnres/block_norm/0": 1.6647007465362549, "attnres/final_alpha/block_1": 0.006060509011149406, "attnres/block_norm/1": 33606.921875, "attnres/final_alpha/block_2": 0.012867648154497147, "attnres/block_norm/2": 23295.33984375, "attnres/final_alpha/block_3": 0.014930750243365765, "attnres/block_norm/3": 36392.15625, "attnres/final_alpha/block_4": 0.018629122525453568, "attnres/block_norm/4": 10734.65625, "attnres/final_alpha/block_5": 0.5803539156913757, "attnres/block_norm/5": 5410.8466796875, "attnres/final_alpha/block_6": 0.12629862129688263, "attnres/block_norm/6": 24108.587890625, "geo/tier1_time_s": 1.3566298484802246, "geo/step": 26775.0, "geo/rankme_slope": -8.811065441801721e-05} {"step": 26780, "timestamp": 1778223485.2468836, "train/loss": 2.1887048959732054, "train/z_loss": 0.0015054168412461876, "train/perplexity": 8.92364857447137, "train/grad_norm": 0.201171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1698312.0069694158, "perf/iters_per_sec": 0.8098182711455421, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.234844946861267, "data/tokens_consumed": 56163827712, "data/tokens_consumed_B": 56.163827712, "train/loss_slope": -1.8685773439747854e-05} {"step": 26790, "timestamp": 1778223495.6256912, "train/loss": 2.179589533805847, "train/z_loss": 0.0015034643583931029, "train/perplexity": 8.842675894180482, "train/grad_norm": 0.125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021986.972943728, "perf/iters_per_sec": 0.9641585221022263, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0371738433837892, "data/tokens_consumed": 56184799232, "data/tokens_consumed_B": 56.184799232, "train/loss_slope": -2.1101695554878065e-05} {"step": 26800, "timestamp": 1778223505.9922004, "grad/layer_0/attn": 0.002863673958927393, "grad/layer_0/mlp": 0.0026705695781856775, "grad/layer_0/attn_mlp_ratio": 1.0723082727700652, "grad/layer_4/attn": 0.0018124744528904557, "grad/layer_4/mlp": 0.002457324182614684, "grad/layer_4/attn_mlp_ratio": 0.7375804918030255, "grad/layer_8/attn": 0.006280165631324053, "grad/layer_8/mlp": 0.0037035555578768253, "grad/layer_8/attn_mlp_ratio": 1.6957125021105655, "grad/layer_12/attn": 0.005237102974206209, "grad/layer_12/mlp": 0.00665404973551631, "grad/layer_12/attn_mlp_ratio": 0.7870549670747763, "grad/layer_16/attn": 0.0038743081968277693, "grad/layer_16/mlp": 0.00426039332523942, "grad/layer_16/attn_mlp_ratio": 0.9093780339335789, "grad/layer_20/attn": 0.009031415916979313, "grad/layer_20/mlp": 0.005982399918138981, "grad/layer_20/attn_mlp_ratio": 1.5096643303014745, "grad/layer_24/attn": 0.011781839653849602, "grad/layer_24/mlp": 0.010934289544820786, "grad/layer_24/attn_mlp_ratio": 1.077513038026231, "grad/layer_27/attn": 0.007708971854299307, "grad/layer_27/mlp": 0.010452698916196823, "grad/layer_27/attn_mlp_ratio": 0.7375101724783222} {"step": 26800, "timestamp": 1778223506.008906, "train/loss": 2.22900447845459, "train/z_loss": 0.0014937943429686128, "train/perplexity": 9.29061246928044, "train/grad_norm": 0.126953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020682.8862370092, "perf/iters_per_sec": 0.963536685102944, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0378432035446168, "data/tokens_consumed": 56205770752, "data/tokens_consumed_B": 56.205770752, "train/loss_slope": -1.8399327687590537e-05} {"step": 26810, "timestamp": 1778223516.3907416, "train/loss": 2.2683469533920286, "train/z_loss": 0.0014694042038172483, "train/perplexity": 9.66341353089258, "train/grad_norm": 0.23828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021023.4818360137, "perf/iters_per_sec": 0.9636990937404698, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0376682996749877, "data/tokens_consumed": 56226742272, "data/tokens_consumed_B": 56.226742272, "train/loss_slope": -1.2470616487422363e-05} {"step": 26820, "timestamp": 1778223526.7722838, "train/loss": 2.2301652669906615, "train/z_loss": 0.0014905278454534709, "train/perplexity": 9.301403167375543, "train/grad_norm": 0.2109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021232.6967577757, "perf/iters_per_sec": 0.9637988551892165, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0375608921051025, "data/tokens_consumed": 56247713792, "data/tokens_consumed_B": 56.247713792, "train/loss_slope": -1.1266081361058173e-05} {"step": 26830, "timestamp": 1778223537.153869, "train/loss": 2.179179072380066, "train/z_loss": 0.001494744699448347, "train/perplexity": 8.839047061624077, "train/grad_norm": 0.12451171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021018.420346799, "perf/iters_per_sec": 0.9636966802343364, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0376708984375, "data/tokens_consumed": 56268685312, "data/tokens_consumed_B": 56.268685312, "train/loss_slope": -1.2258356038850971e-05} {"step": 26840, "timestamp": 1778223547.5289257, "train/loss": 2.182262897491455, "train/z_loss": 0.001490865962114185, "train/perplexity": 8.866347209719272, "train/grad_norm": 0.205078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022141.2985963968, "perf/iters_per_sec": 0.9642321103078827, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0370946884155274, "data/tokens_consumed": 56289656832, "data/tokens_consumed_B": 56.289656832, "train/loss_slope": -1.3813618214467342e-05} {"step": 26850, "timestamp": 1778223557.8912082, "grad/layer_0/attn": 0.002557617612183094, "grad/layer_0/mlp": 0.002571217715740204, "grad/layer_0/attn_mlp_ratio": 0.9947105984277748, "grad/layer_4/attn": 0.0028981100767850876, "grad/layer_4/mlp": 0.0026144806761294603, "grad/layer_4/attn_mlp_ratio": 1.1084839878132586, "grad/layer_8/attn": 0.003731298493221402, "grad/layer_8/mlp": 0.0039159152656793594, "grad/layer_8/attn_mlp_ratio": 0.9528547337677385, "grad/layer_12/attn": 0.003938083071261644, "grad/layer_12/mlp": 0.006063093431293964, "grad/layer_12/attn_mlp_ratio": 0.6495171237150934, "grad/layer_16/attn": 0.00339717511087656, "grad/layer_16/mlp": 0.004589658696204424, "grad/layer_16/attn_mlp_ratio": 0.7401803187822963, "grad/layer_20/attn": 0.00760392751544714, "grad/layer_20/mlp": 0.005438776221126318, "grad/layer_20/attn_mlp_ratio": 1.398095282188851, "grad/layer_24/attn": 0.0070508746430277824, "grad/layer_24/mlp": 0.008213006891310215, "grad/layer_24/attn_mlp_ratio": 0.8585009912311011, "grad/layer_27/attn": 0.003452721517533064, "grad/layer_27/mlp": 0.006642746739089489, "grad/layer_27/attn_mlp_ratio": 0.5197731602859523} {"step": 26850, "timestamp": 1778223558.5000417, "eos/sharpness": 19.061303138732907, "eos/L0_probe": 2.0213210582733154, "eos/L_plus": 2.1051156520843506, "eos/L_minus": 2.1281394958496094, "eos/grad_norm": 0.10231257230043411, "eos/embed_grad_frac": 0.22471915185451508, "eos/time_s": 0.6060957908630371} {"step": 26850, "timestamp": 1778223558.5197024, "train/loss": 2.235613250732422, "train/z_loss": 0.001493547682184726, "train/perplexity": 9.352215346893285, "train/grad_norm": 0.1025390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1909093.6841155319, "perf/iters_per_sec": 0.9103268070771846, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0985065937042235, "data/tokens_consumed": 56310628352, "data/tokens_consumed_B": 56.310628352, "train/loss_slope": -1.0834869764747495e-05} {"step": 26850, "timestamp": 1778223559.8862352, "geo/rankme_last": 440.12384033203125, "geo/layer_0/stable_rank_q_proj": 18.218645095825195, "geo/layer_0/stable_rank_k_proj": 15.857356071472168, "geo/layer_0/stable_rank_o_proj": 50.71254348754883, "geo/layer_0/stable_rank_gate_proj": 144.67575073242188, "geo/layer_0/stable_rank_down_proj": 51.21900177001953, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05519216135144234, "geo/layer_0/attn_entropy_mean": 6.238327980041504, "geo/layer_0/attn_entropy_std": 0.3383464515209198, "geo/layer_7/stable_rank_q_proj": 42.806907653808594, "geo/layer_7/stable_rank_k_proj": 42.00847625732422, "geo/layer_7/stable_rank_o_proj": 107.23497772216797, "geo/layer_7/stable_rank_gate_proj": 98.02410888671875, "geo/layer_7/stable_rank_down_proj": 148.3297882080078, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5388049483299255, "geo/layer_7/attn_entropy_mean": 4.643157005310059, "geo/layer_7/attn_entropy_std": 0.7987843155860901, "geo/layer_14/stable_rank_q_proj": 56.294124603271484, "geo/layer_14/stable_rank_k_proj": 35.0683479309082, "geo/layer_14/stable_rank_o_proj": 53.7025260925293, "geo/layer_14/stable_rank_gate_proj": 82.2608413696289, "geo/layer_14/stable_rank_down_proj": 135.00112915039062, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.36785200238227844, "geo/layer_14/attn_entropy_mean": 5.469903945922852, "geo/layer_14/attn_entropy_std": 0.4292568564414978, "geo/layer_21/stable_rank_q_proj": 45.894439697265625, "geo/layer_21/stable_rank_k_proj": 31.18828582763672, "geo/layer_21/stable_rank_o_proj": 81.01220703125, "geo/layer_21/stable_rank_gate_proj": 81.05613708496094, "geo/layer_21/stable_rank_down_proj": 58.70978546142578, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15197505056858063, "geo/layer_21/attn_entropy_mean": 5.744436264038086, "geo/layer_21/attn_entropy_std": 0.2879585325717926, "geo/layer_27/stable_rank_q_proj": 41.33440017700195, "geo/layer_27/stable_rank_k_proj": 31.436952590942383, "geo/layer_27/stable_rank_o_proj": 119.50006866455078, "geo/layer_27/stable_rank_gate_proj": 89.27296447753906, "geo/layer_27/stable_rank_down_proj": 135.28565979003906, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0818466767668724, "geo/layer_27/attn_entropy_mean": 4.356186866760254, "geo/layer_27/attn_entropy_std": 0.5905580520629883, "attnres/final_alpha/block_0": 0.24265044927597046, "attnres/block_norm/0": 1.665117621421814, "attnres/final_alpha/block_1": 0.006212197244167328, "attnres/block_norm/1": 33736.28125, "attnres/final_alpha/block_2": 0.012788886204361916, "attnres/block_norm/2": 23498.314453125, "attnres/final_alpha/block_3": 0.015124398283660412, "attnres/block_norm/3": 36306.26953125, "attnres/final_alpha/block_4": 0.018889185041189194, "attnres/block_norm/4": 10763.265625, "attnres/final_alpha/block_5": 0.5756814479827881, "attnres/block_norm/5": 5456.11767578125, "attnres/final_alpha/block_6": 0.128653421998024, "attnres/block_norm/6": 24148.890625, "geo/tier1_time_s": 1.3623931407928467, "geo/step": 26850.0, "geo/rankme_slope": -8.714628429496799e-05} {"step": 26860, "timestamp": 1778223570.2616787, "train/loss": 2.1821962118148805, "train/z_loss": 0.001492558908648789, "train/perplexity": 8.865755971070643, "train/grad_norm": 0.2578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1786661.3656862678, "perf/iters_per_sec": 0.8519465282851543, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1737825870513916, "data/tokens_consumed": 56331599872, "data/tokens_consumed_B": 56.331599872, "train/loss_slope": -1.0388789804998268e-05} {"step": 26870, "timestamp": 1778223580.6378775, "train/loss": 2.1761627316474916, "train/z_loss": 0.0015019973972812295, "train/perplexity": 8.812425653737595, "train/grad_norm": 0.09912109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022325.7300517529, "perf/iters_per_sec": 0.964320054078938, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0370001077651978, "data/tokens_consumed": 56352571392, "data/tokens_consumed_B": 56.352571392, "train/loss_slope": -1.2885063151643677e-05} {"step": 26880, "timestamp": 1778223591.0095038, "train/loss": 2.212045097351074, "train/z_loss": 0.0014879900962114334, "train/perplexity": 9.13437799638241, "train/grad_norm": 0.20703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023021.1722032563, "perf/iters_per_sec": 0.9646516667381555, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036643624305725, "data/tokens_consumed": 56373542912, "data/tokens_consumed_B": 56.373542912, "train/loss_slope": -1.342032919980154e-05} {"step": 26890, "timestamp": 1778223601.380478, "train/loss": 2.226948285102844, "train/z_loss": 0.0014926555915735662, "train/perplexity": 9.271528800267836, "train/grad_norm": 0.181640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023234.105102496, "perf/iters_per_sec": 0.9647532010567169, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0365345239639283, "data/tokens_consumed": 56394514432, "data/tokens_consumed_B": 56.394514432, "train/loss_slope": -9.87987901010637e-06} {"step": 26900, "timestamp": 1778223611.7392614, "grad/layer_0/attn": 0.0028305966407060623, "grad/layer_0/mlp": 0.0027503082528710365, "grad/layer_0/attn_mlp_ratio": 1.0291924677286506, "grad/layer_4/attn": 0.002183899749070406, "grad/layer_4/mlp": 0.002501593204215169, "grad/layer_4/attn_mlp_ratio": 0.8730035155556854, "grad/layer_8/attn": 0.007057725917547941, "grad/layer_8/mlp": 0.0037464157212525606, "grad/layer_8/attn_mlp_ratio": 1.8838608030403554, "grad/layer_12/attn": 0.004140879493206739, "grad/layer_12/mlp": 0.006845321040600538, "grad/layer_12/attn_mlp_ratio": 0.604921143676754, "grad/layer_16/attn": 0.004976852331310511, "grad/layer_16/mlp": 0.004242141265422106, "grad/layer_16/attn_mlp_ratio": 1.1731934187476798, "grad/layer_20/attn": 0.004803289659321308, "grad/layer_20/mlp": 0.005868555512279272, "grad/layer_20/attn_mlp_ratio": 0.8184790222096527, "grad/layer_24/attn": 0.011014129966497421, "grad/layer_24/mlp": 0.009662194177508354, "grad/layer_24/attn_mlp_ratio": 1.1399201516922612, "grad/layer_27/attn": 0.012251108884811401, "grad/layer_27/mlp": 0.007432505954056978, "grad/layer_27/attn_mlp_ratio": 1.6483146862859568} {"step": 26900, "timestamp": 1778223611.7559006, "train/loss": 2.1550952672958372, "train/z_loss": 0.0014979069237597287, "train/perplexity": 8.628712173888879, "train/grad_norm": 0.130859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022284.6287251366, "perf/iters_per_sec": 0.9643004554391559, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0370211839675902, "data/tokens_consumed": 56415485952, "data/tokens_consumed_B": 56.415485952, "train/loss_slope": -1.1494550217102869e-05} {"step": 26910, "timestamp": 1778223622.1359158, "train/loss": 2.21051561832428, "train/z_loss": 0.0014821094577200712, "train/perplexity": 9.120417835422256, "train/grad_norm": 0.23828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021400.4715098843, "perf/iters_per_sec": 0.9638788564252302, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037474775314331, "data/tokens_consumed": 56436457472, "data/tokens_consumed_B": 56.436457472, "train/loss_slope": -1.2619749697843482e-05} {"step": 26920, "timestamp": 1778223632.511962, "train/loss": 2.236841154098511, "train/z_loss": 0.0014808500884100795, "train/perplexity": 9.363706016870546, "train/grad_norm": 0.10791015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022231.5342356202, "perf/iters_per_sec": 0.9642751380136586, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0370484113693237, "data/tokens_consumed": 56457428992, "data/tokens_consumed_B": 56.457428992, "train/loss_slope": -9.204467774295881e-06} {"step": 26925, "timestamp": 1778223638.2977626, "eos/sharpness": 10.613489151000975, "eos/L0_probe": 2.025399923324585, "eos/L_plus": 2.0780718326568604, "eos/L_minus": 2.0788629055023193, "eos/grad_norm": 0.0986938402056694, "eos/embed_grad_frac": 0.23064520955085754, "eos/time_s": 0.6092984676361084} {"step": 26925, "timestamp": 1778223639.6726058, "geo/rankme_last": 440.0510559082031, "geo/layer_0/stable_rank_q_proj": 18.205018997192383, "geo/layer_0/stable_rank_k_proj": 15.912799835205078, "geo/layer_0/stable_rank_o_proj": 50.883872985839844, "geo/layer_0/stable_rank_gate_proj": 145.0980682373047, "geo/layer_0/stable_rank_down_proj": 51.24822998046875, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.056636739522218704, "geo/layer_0/attn_entropy_mean": 6.235278129577637, "geo/layer_0/attn_entropy_std": 0.3425436317920685, "geo/layer_7/stable_rank_q_proj": 42.79878616333008, "geo/layer_7/stable_rank_k_proj": 41.99226379394531, "geo/layer_7/stable_rank_o_proj": 107.02751159667969, "geo/layer_7/stable_rank_gate_proj": 98.025390625, "geo/layer_7/stable_rank_down_proj": 147.7777557373047, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5253730416297913, "geo/layer_7/attn_entropy_mean": 4.655516624450684, "geo/layer_7/attn_entropy_std": 0.8398430347442627, "geo/layer_14/stable_rank_q_proj": 56.220088958740234, "geo/layer_14/stable_rank_k_proj": 35.01771545410156, "geo/layer_14/stable_rank_o_proj": 53.574066162109375, "geo/layer_14/stable_rank_gate_proj": 82.072509765625, "geo/layer_14/stable_rank_down_proj": 134.9263916015625, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3725811541080475, "geo/layer_14/attn_entropy_mean": 5.5335774421691895, "geo/layer_14/attn_entropy_std": 0.42239490151405334, "geo/layer_21/stable_rank_q_proj": 45.79404830932617, "geo/layer_21/stable_rank_k_proj": 31.175643920898438, "geo/layer_21/stable_rank_o_proj": 81.31548309326172, "geo/layer_21/stable_rank_gate_proj": 81.21412658691406, "geo/layer_21/stable_rank_down_proj": 58.69663619995117, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14939936995506287, "geo/layer_21/attn_entropy_mean": 5.733794689178467, "geo/layer_21/attn_entropy_std": 0.2817964553833008, "geo/layer_27/stable_rank_q_proj": 41.211395263671875, "geo/layer_27/stable_rank_k_proj": 31.39198875427246, "geo/layer_27/stable_rank_o_proj": 119.00655364990234, "geo/layer_27/stable_rank_gate_proj": 89.23141479492188, "geo/layer_27/stable_rank_down_proj": 135.6061553955078, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08356712758541107, "geo/layer_27/attn_entropy_mean": 4.359967231750488, "geo/layer_27/attn_entropy_std": 0.5873059630393982, "attnres/final_alpha/block_0": 0.24057577550411224, "attnres/block_norm/0": 1.6655737161636353, "attnres/final_alpha/block_1": 0.0060677058063447475, "attnres/block_norm/1": 33750.40625, "attnres/final_alpha/block_2": 0.012842396274209023, "attnres/block_norm/2": 23502.486328125, "attnres/final_alpha/block_3": 0.014827432110905647, "attnres/block_norm/3": 36618.6484375, "attnres/final_alpha/block_4": 0.019053597003221512, "attnres/block_norm/4": 10749.7451171875, "attnres/final_alpha/block_5": 0.5810908675193787, "attnres/block_norm/5": 5410.0185546875, "attnres/final_alpha/block_6": 0.12554223835468292, "attnres/block_norm/6": 24385.9921875, "geo/tier1_time_s": 1.3546786308288574, "geo/step": 26925.0, "geo/rankme_slope": -0.000105988293755002} {"step": 26930, "timestamp": 1778223644.867988, "train/loss": 2.2111706495285035, "train/z_loss": 0.001488606899511069, "train/perplexity": 9.126393950757349, "train/grad_norm": 0.134765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1698026.0910663614, "perf/iters_per_sec": 0.8096819358188445, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2350528717041016, "data/tokens_consumed": 56478400512, "data/tokens_consumed_B": 56.478400512, "train/loss_slope": -1.0304562730042377e-05} {"step": 26940, "timestamp": 1778223655.2454228, "train/loss": 2.1919540643692015, "train/z_loss": 0.0014981516287662089, "train/perplexity": 8.952690166358217, "train/grad_norm": 0.1416015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021884.5825065114, "perf/iters_per_sec": 0.9641096985371167, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0372263669967652, "data/tokens_consumed": 56499372032, "data/tokens_consumed_B": 56.499372032, "train/loss_slope": -8.24013252069455e-06} {"step": 26950, "timestamp": 1778223665.6237996, "grad/layer_0/attn": 0.0026806392706930637, "grad/layer_0/mlp": 0.0026606163010001183, "grad/layer_0/attn_mlp_ratio": 1.0075256507046333, "grad/layer_4/attn": 0.001648647477850318, "grad/layer_4/mlp": 0.0024261297658085823, "grad/layer_4/attn_mlp_ratio": 0.6795380169399363, "grad/layer_8/attn": 0.005318830255419016, "grad/layer_8/mlp": 0.0036423029378056526, "grad/layer_8/attn_mlp_ratio": 1.4602931717135228, "grad/layer_12/attn": 0.004603562876582146, "grad/layer_12/mlp": 0.007307284511625767, "grad/layer_12/attn_mlp_ratio": 0.629996383233514, "grad/layer_16/attn": 0.0044214664958417416, "grad/layer_16/mlp": 0.004613346420228481, "grad/layer_16/attn_mlp_ratio": 0.9584076280536507, "grad/layer_20/attn": 0.005229968577623367, "grad/layer_20/mlp": 0.006656547077000141, "grad/layer_20/attn_mlp_ratio": 0.7856879007323915, "grad/layer_24/attn": 0.017241954803466797, "grad/layer_24/mlp": 0.01137747336179018, "grad/layer_24/attn_mlp_ratio": 1.515446716827927, "grad/layer_27/attn": 0.007150569930672646, "grad/layer_27/mlp": 0.009940102696418762, "grad/layer_27/attn_mlp_ratio": 0.7193657930025498} {"step": 26950, "timestamp": 1778223665.6404638, "train/loss": 2.17009596824646, "train/z_loss": 0.0015069570392370224, "train/perplexity": 8.75912459823499, "train/grad_norm": 0.1806640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2018428.1324895432, "perf/iters_per_sec": 0.9624615347335544, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0390025615692138, "data/tokens_consumed": 56520343552, "data/tokens_consumed_B": 56.520343552, "train/loss_slope": -8.583237300075083e-06} {"step": 26960, "timestamp": 1778223676.0143952, "train/loss": 2.2369436979293824, "train/z_loss": 0.0014905696152709424, "train/perplexity": 9.364666256389148, "train/grad_norm": 0.130859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022821.820903973, "perf/iters_per_sec": 0.9645566086311211, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0367457866668701, "data/tokens_consumed": 56541315072, "data/tokens_consumed_B": 56.541315072, "train/loss_slope": -5.776401879441955e-06} {"step": 26970, "timestamp": 1778223686.3974628, "train/loss": 2.200204610824585, "train/z_loss": 0.0014868160244077445, "train/perplexity": 9.026860303819772, "train/grad_norm": 0.15625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021240.6854001556, "perf/iters_per_sec": 0.9638026644707468, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037556791305542, "data/tokens_consumed": 56562286592, "data/tokens_consumed_B": 56.562286592, "train/loss_slope": -3.872912866447649e-06} {"step": 26980, "timestamp": 1778223696.7838473, "train/loss": 2.197111797332764, "train/z_loss": 0.0014985132729634643, "train/perplexity": 8.99898503720373, "train/grad_norm": 0.1455078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020294.3778962472, "perf/iters_per_sec": 0.9633514298897968, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0380427837371826, "data/tokens_consumed": 56583258112, "data/tokens_consumed_B": 56.583258112, "train/loss_slope": -4.082709756514158e-06} {"step": 26990, "timestamp": 1778223707.1604383, "train/loss": 2.185579228401184, "train/z_loss": 0.0014882884686812758, "train/perplexity": 8.895799761237354, "train/grad_norm": 0.1025390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022133.9536489316, "perf/iters_per_sec": 0.9642286079640062, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0370984554290772, "data/tokens_consumed": 56604229632, "data/tokens_consumed_B": 56.604229632, "train/loss_slope": -3.752091229230231e-06} {"step": 27000, "timestamp": 1778223717.5075586, "grad/layer_0/attn": 0.003435234073549509, "grad/layer_0/mlp": 0.003085861913859844, "grad/layer_0/attn_mlp_ratio": 1.1132170065027194, "grad/layer_4/attn": 0.0020157587714493275, "grad/layer_4/mlp": 0.0026157975662499666, "grad/layer_4/attn_mlp_ratio": 0.7706095916581878, "grad/layer_8/attn": 0.006429805886000395, "grad/layer_8/mlp": 0.004005650524049997, "grad/layer_8/attn_mlp_ratio": 1.6051838988142717, "grad/layer_12/attn": 0.004756392911076546, "grad/layer_12/mlp": 0.00659192306920886, "grad/layer_12/attn_mlp_ratio": 0.7215485965148759, "grad/layer_16/attn": 0.0037644829135388136, "grad/layer_16/mlp": 0.004537770990282297, "grad/layer_16/attn_mlp_ratio": 0.8295885443848212, "grad/layer_20/attn": 0.0037557941395789385, "grad/layer_20/mlp": 0.006180812604725361, "grad/layer_20/attn_mlp_ratio": 0.6076537696584068, "grad/layer_24/attn": 0.008998465724289417, "grad/layer_24/mlp": 0.009226711466908455, "grad/layer_24/attn_mlp_ratio": 0.9752624929300229, "grad/layer_27/attn": 0.004254220053553581, "grad/layer_27/mlp": 0.007998982444405556, "grad/layer_27/attn_mlp_ratio": 0.5318451477968231} {"step": 27000, "timestamp": 1778223718.1150885, "eos/sharpness": 15.740823745727536, "eos/L0_probe": 2.0248420238494873, "eos/L_plus": 2.1101226806640625, "eos/L_minus": 2.0969696044921875, "eos/grad_norm": 0.10234575718641281, "eos/embed_grad_frac": 0.2286030501127243, "eos/time_s": 0.6045174598693848} {"step": 27000, "timestamp": 1778223718.1423736, "train/loss": 2.236360502243042, "train/z_loss": 0.001481081312522292, "train/perplexity": 9.359206415656937, "train/grad_norm": 0.10205078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1910606.2968251884, "perf/iters_per_sec": 0.9110480770231192, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.097636914253235, "data/tokens_consumed": 56625201152, "data/tokens_consumed_B": 56.625201152, "train/loss_slope": -2.4207624271758325e-06} {"step": 27000, "timestamp": 1778223719.5139573, "geo/rankme_last": 440.60870361328125, "geo/layer_0/stable_rank_q_proj": 18.21630096435547, "geo/layer_0/stable_rank_k_proj": 15.926849365234375, "geo/layer_0/stable_rank_o_proj": 50.9560661315918, "geo/layer_0/stable_rank_gate_proj": 144.93560791015625, "geo/layer_0/stable_rank_down_proj": 51.17643737792969, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.051348257809877396, "geo/layer_0/attn_entropy_mean": 6.234349250793457, "geo/layer_0/attn_entropy_std": 0.3439744710922241, "geo/layer_7/stable_rank_q_proj": 42.722679138183594, "geo/layer_7/stable_rank_k_proj": 42.15605926513672, "geo/layer_7/stable_rank_o_proj": 107.10759735107422, "geo/layer_7/stable_rank_gate_proj": 97.7646713256836, "geo/layer_7/stable_rank_down_proj": 147.64398193359375, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5295926332473755, "geo/layer_7/attn_entropy_mean": 4.669720649719238, "geo/layer_7/attn_entropy_std": 0.8409909009933472, "geo/layer_14/stable_rank_q_proj": 56.15596008300781, "geo/layer_14/stable_rank_k_proj": 34.9795036315918, "geo/layer_14/stable_rank_o_proj": 53.667381286621094, "geo/layer_14/stable_rank_gate_proj": 82.17250061035156, "geo/layer_14/stable_rank_down_proj": 134.65513610839844, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37878477573394775, "geo/layer_14/attn_entropy_mean": 5.512240886688232, "geo/layer_14/attn_entropy_std": 0.4221615195274353, "geo/layer_21/stable_rank_q_proj": 45.67121887207031, "geo/layer_21/stable_rank_k_proj": 31.120166778564453, "geo/layer_21/stable_rank_o_proj": 81.28022766113281, "geo/layer_21/stable_rank_gate_proj": 81.02227020263672, "geo/layer_21/stable_rank_down_proj": 58.656150817871094, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.148389533162117, "geo/layer_21/attn_entropy_mean": 5.752043724060059, "geo/layer_21/attn_entropy_std": 0.28865525126457214, "geo/layer_27/stable_rank_q_proj": 41.171836853027344, "geo/layer_27/stable_rank_k_proj": 31.346250534057617, "geo/layer_27/stable_rank_o_proj": 118.84564971923828, "geo/layer_27/stable_rank_gate_proj": 89.1356201171875, "geo/layer_27/stable_rank_down_proj": 135.48974609375, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08389817923307419, "geo/layer_27/attn_entropy_mean": 4.352726936340332, "geo/layer_27/attn_entropy_std": 0.582058310508728, "attnres/final_alpha/block_0": 0.24150070548057556, "attnres/block_norm/0": 1.666080355644226, "attnres/final_alpha/block_1": 0.0060777971521019936, "attnres/block_norm/1": 33853.421875, "attnres/final_alpha/block_2": 0.012803709134459496, "attnres/block_norm/2": 23514.8828125, "attnres/final_alpha/block_3": 0.014773629605770111, "attnres/block_norm/3": 36709.1484375, "attnres/final_alpha/block_4": 0.018784359097480774, "attnres/block_norm/4": 10799.076171875, "attnres/final_alpha/block_5": 0.578467607498169, "attnres/block_norm/5": 5398.5419921875, "attnres/final_alpha/block_6": 0.12759216129779816, "attnres/block_norm/6": 24216.19140625, "geo/tier1_time_s": 1.3672504425048828, "geo/step": 27000.0, "geo/rankme_slope": -8.21419388067727e-05} {"step": 27000, "timestamp": 1778223726.4612617, "geo/ww_alpha_mean": 7.790477190510149, "geo/ww_alpha_std": 4.301195190667283, "geo/ww_alpha_min": 1.3639538549537225, "geo/ww_alpha_max": 25.228989276543256, "geo/ww_alpha_healthy_frac": 0.15736040609137056, "geo/ww_alpha_by_type/q_proj": 4.133483760736107, "geo/ww_alpha_by_type/k_proj": 4.5961004708376345, "geo/ww_alpha_by_type/v_proj": 8.159207522768721, "geo/ww_alpha_by_type/o_proj": 7.918699387762898, "geo/ww_alpha_by_type/gate_proj": 8.958536062431309, "geo/ww_alpha_by_type/up_proj": 12.067912779335519, "geo/ww_alpha_by_type/down_proj": 8.81959522499629, "geo/twonn_id/layer_0": 0.7334230542182922, "geo/twonn_id/layer_7": 3.3695123195648193, "geo/twonn_id/layer_14": 4.496463298797607, "geo/twonn_id/layer_21": 7.031209468841553, "geo/twonn_id/layer_27": 4.877251625061035, "geo/tier2_time_s": 6.941235303878784} {"step": 27000, "timestamp": 1778223727.147722, "eoc/jacobian_sigma/layer_0/attn": 969.4791259765625, "eoc/jacobian_sigma/layer_0/mlp": 6081.20361328125, "eoc/jacobian_sigma/layer_0": 6081.20361328125, "eoc/jacobian_sigma/layer_7/attn": 1.1536037921905518, "eoc/jacobian_sigma/layer_7/mlp": 1.6709704399108887, "eoc/jacobian_sigma/layer_7": 1.6709704399108887, "eoc/jacobian_sigma/layer_14/attn": 1.6388096809387207, "eoc/jacobian_sigma/layer_14/mlp": 7.895770072937012, "eoc/jacobian_sigma/layer_14": 7.895770072937012, "eoc/jacobian_sigma/layer_21/attn": 1.0889086723327637, "eoc/jacobian_sigma/layer_21/mlp": 3.9150519371032715, "eoc/jacobian_sigma/layer_21": 3.9150519371032715, "eoc/jacobian_sigma/layer_27/attn": 3.319497585296631, "eoc/jacobian_sigma/layer_27/mlp": 23.170902252197266, "eoc/jacobian_sigma/layer_27": 23.170902252197266, "eoc/layer0_sigma": 6081.20361328125, "eoc/sigma_max": 23.170902252197266, "eoc/sigma_min": 1.6709704399108887, "eoc/sigma_mean": 9.16317367553711, "eoc/time_s": 0.6789700984954834} {"step": 27010, "timestamp": 1778223737.5175722, "train/loss": 2.248165798187256, "train/z_loss": 0.0014877123176120222, "train/perplexity": 9.470349364112533, "train/grad_norm": 0.2431640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1082623.5529165498, "perf/iters_per_sec": 0.5162351383764981, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.9371017694473267, "data/tokens_consumed": 56646172672, "data/tokens_consumed_B": 56.646172672, "train/loss_slope": 1.297134205703414e-06} {"step": 27020, "timestamp": 1778223747.854914, "train/loss": 2.2163702487945556, "train/z_loss": 0.0014870834536850452, "train/perplexity": 9.173971125977422, "train/grad_norm": 0.11767578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029920.6170524175, "perf/iters_per_sec": 0.9679415784132087, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0331202030181885, "data/tokens_consumed": 56667144192, "data/tokens_consumed_B": 56.667144192, "train/loss_slope": 1.6817965952440649e-06} {"step": 27030, "timestamp": 1778223758.1932652, "train/loss": 2.1968541622161863, "train/z_loss": 0.0014896540786139667, "train/perplexity": 8.9966668812766, "train/grad_norm": 0.271484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029681.2645435878, "perf/iters_per_sec": 0.9678274462430896, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0332420349121094, "data/tokens_consumed": 56688115712, "data/tokens_consumed_B": 56.688115712, "train/loss_slope": 2.9300538548613167e-06} {"step": 27040, "timestamp": 1778223768.5457666, "train/loss": 2.2435322046279906, "train/z_loss": 0.0014752542949281633, "train/perplexity": 9.426569122548466, "train/grad_norm": 0.146484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026981.6196716162, "perf/iters_per_sec": 0.966540155254181, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346181631088256, "data/tokens_consumed": 56709087232, "data/tokens_consumed_B": 56.709087232, "train/loss_slope": 6.80619434233077e-06} {"step": 27050, "timestamp": 1778223778.8807127, "grad/layer_0/attn": 0.002560555702075362, "grad/layer_0/mlp": 0.0025478610768914223, "grad/layer_0/attn_mlp_ratio": 1.0049824241991976, "grad/layer_4/attn": 0.002487895777449012, "grad/layer_4/mlp": 0.002412902656942606, "grad/layer_4/attn_mlp_ratio": 1.0310800011689778, "grad/layer_8/attn": 0.0036657496821135283, "grad/layer_8/mlp": 0.0035445510875433683, "grad/layer_8/attn_mlp_ratio": 1.0341929028972945, "grad/layer_12/attn": 0.005862872116267681, "grad/layer_12/mlp": 0.006706757005304098, "grad/layer_12/attn_mlp_ratio": 0.8741739150849785, "grad/layer_16/attn": 0.004735404625535011, "grad/layer_16/mlp": 0.004469308070838451, "grad/layer_16/attn_mlp_ratio": 1.0595386230989392, "grad/layer_20/attn": 0.0042762369848787785, "grad/layer_20/mlp": 0.0055648693814873695, "grad/layer_20/attn_mlp_ratio": 0.7684343719299294, "grad/layer_24/attn": 0.0045115710236132145, "grad/layer_24/mlp": 0.008733351714909077, "grad/layer_24/attn_mlp_ratio": 0.5165910087248884, "grad/layer_27/attn": 0.010327894240617752, "grad/layer_27/mlp": 0.007891223765909672, "grad/layer_27/attn_mlp_ratio": 1.308782315153239} {"step": 27050, "timestamp": 1778223778.8950677, "train/loss": 2.2103442192077636, "train/z_loss": 0.0014834789675660432, "train/perplexity": 9.118854737823606, "train/grad_norm": 0.1396484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027593.7988391435, "perf/iters_per_sec": 0.9668320650287359, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034305787086487, "data/tokens_consumed": 56730058752, "data/tokens_consumed_B": 56.730058752, "train/loss_slope": 1.022631369515502e-05} {"step": 27060, "timestamp": 1778223789.2388532, "train/loss": 2.1928068876266478, "train/z_loss": 0.0015016249381005764, "train/perplexity": 8.960328485353891, "train/grad_norm": 0.134765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028679.5477402452, "perf/iters_per_sec": 0.9673497904492594, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0337522268295287, "data/tokens_consumed": 56751030272, "data/tokens_consumed_B": 56.751030272, "train/loss_slope": 1.2639579070497776e-05} {"step": 27070, "timestamp": 1778223799.5808501, "train/loss": 2.1731874465942385, "train/z_loss": 0.0015001702005974949, "train/perplexity": 8.786245141962999, "train/grad_norm": 0.1689453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029021.9085701855, "perf/iters_per_sec": 0.9675130408144882, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033577799797058, "data/tokens_consumed": 56772001792, "data/tokens_consumed_B": 56.772001792, "train/loss_slope": 1.0605520455285688e-05} {"step": 27075, "timestamp": 1778223805.3727014, "eos/sharpness": 67.85547733306883, "eos/L0_probe": 2.0250625610351562, "eos/L_plus": 2.49100923538208, "eos/L_minus": 2.237670660018921, "eos/grad_norm": 0.16352887451648712, "eos/embed_grad_frac": 0.08036015927791595, "eos/time_s": 0.6243033409118652} {"step": 27075, "timestamp": 1778223806.75832, "geo/rankme_last": 439.987060546875, "geo/layer_0/stable_rank_q_proj": 18.202590942382812, "geo/layer_0/stable_rank_k_proj": 15.934242248535156, "geo/layer_0/stable_rank_o_proj": 50.85919189453125, "geo/layer_0/stable_rank_gate_proj": 144.20571899414062, "geo/layer_0/stable_rank_down_proj": 51.279537200927734, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.050911519676446915, "geo/layer_0/attn_entropy_mean": 6.23399019241333, "geo/layer_0/attn_entropy_std": 0.33825239539146423, "geo/layer_7/stable_rank_q_proj": 42.66737747192383, "geo/layer_7/stable_rank_k_proj": 42.19717025756836, "geo/layer_7/stable_rank_o_proj": 107.2569580078125, "geo/layer_7/stable_rank_gate_proj": 97.5982437133789, "geo/layer_7/stable_rank_down_proj": 148.1593475341797, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5388546586036682, "geo/layer_7/attn_entropy_mean": 4.633752822875977, "geo/layer_7/attn_entropy_std": 0.8341144323348999, "geo/layer_14/stable_rank_q_proj": 56.0776252746582, "geo/layer_14/stable_rank_k_proj": 34.87643814086914, "geo/layer_14/stable_rank_o_proj": 53.51805114746094, "geo/layer_14/stable_rank_gate_proj": 82.03585052490234, "geo/layer_14/stable_rank_down_proj": 134.50267028808594, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3872503936290741, "geo/layer_14/attn_entropy_mean": 5.471337795257568, "geo/layer_14/attn_entropy_std": 0.4163910746574402, "geo/layer_21/stable_rank_q_proj": 45.70285415649414, "geo/layer_21/stable_rank_k_proj": 31.164501190185547, "geo/layer_21/stable_rank_o_proj": 81.05038452148438, "geo/layer_21/stable_rank_gate_proj": 81.0445556640625, "geo/layer_21/stable_rank_down_proj": 58.538265228271484, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1525481939315796, "geo/layer_21/attn_entropy_mean": 5.723544597625732, "geo/layer_21/attn_entropy_std": 0.28699302673339844, "geo/layer_27/stable_rank_q_proj": 41.21816635131836, "geo/layer_27/stable_rank_k_proj": 31.383182525634766, "geo/layer_27/stable_rank_o_proj": 118.88978576660156, "geo/layer_27/stable_rank_gate_proj": 89.0893325805664, "geo/layer_27/stable_rank_down_proj": 135.6554718017578, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07299534976482391, "geo/layer_27/attn_entropy_mean": 4.309447288513184, "geo/layer_27/attn_entropy_std": 0.5890619158744812, "attnres/final_alpha/block_0": 0.24151504039764404, "attnres/block_norm/0": 1.6666960716247559, "attnres/final_alpha/block_1": 0.00604163808748126, "attnres/block_norm/1": 33779.1875, "attnres/final_alpha/block_2": 0.012724677100777626, "attnres/block_norm/2": 23517.404296875, "attnres/final_alpha/block_3": 0.014638751745223999, "attnres/block_norm/3": 36741.27734375, "attnres/final_alpha/block_4": 0.01882103830575943, "attnres/block_norm/4": 10768.140625, "attnres/final_alpha/block_5": 0.5801709294319153, "attnres/block_norm/5": 5433.66259765625, "attnres/final_alpha/block_6": 0.12608790397644043, "attnres/block_norm/6": 24487.296875, "geo/tier1_time_s": 1.3679468631744385, "geo/step": 27075.0, "geo/rankme_slope": -0.00010261153289440776} {"step": 27080, "timestamp": 1778223811.9443297, "train/loss": 2.285243821144104, "train/z_loss": 0.0014928729506209493, "train/perplexity": 9.828082226160237, "train/grad_norm": 0.2236328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1696970.4694392665, "perf/iters_per_sec": 0.8091785762020428, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2358211517333983, "data/tokens_consumed": 56792973312, "data/tokens_consumed_B": 56.792973312, "train/loss_slope": 1.7041130871376448e-05} {"step": 27090, "timestamp": 1778223822.3129091, "train/loss": 2.1840893983840943, "train/z_loss": 0.0014892973355017602, "train/perplexity": 8.882556399355648, "train/grad_norm": 0.1591796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023522.166307102, "perf/iters_per_sec": 0.9648905593429098, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0363869667053223, "data/tokens_consumed": 56813944832, "data/tokens_consumed_B": 56.813944832, "train/loss_slope": 1.1623291929240957e-05} {"step": 27100, "timestamp": 1778223832.6755772, "grad/layer_0/attn": 0.003017554758116603, "grad/layer_0/mlp": 0.0028212100733071566, "grad/layer_0/attn_mlp_ratio": 1.069595872958051, "grad/layer_4/attn": 0.002041536383330822, "grad/layer_4/mlp": 0.0024547260254621506, "grad/layer_4/attn_mlp_ratio": 0.8316758281726694, "grad/layer_8/attn": 0.00495337788015604, "grad/layer_8/mlp": 0.0035987012088298798, "grad/layer_8/attn_mlp_ratio": 1.3764348455378295, "grad/layer_12/attn": 0.006158316507935524, "grad/layer_12/mlp": 0.006393211428076029, "grad/layer_12/attn_mlp_ratio": 0.9632586816330174, "grad/layer_16/attn": 0.0036358071956783533, "grad/layer_16/mlp": 0.004263961222022772, "grad/layer_16/attn_mlp_ratio": 0.8526829680419264, "grad/layer_20/attn": 0.004810130223631859, "grad/layer_20/mlp": 0.006670747883617878, "grad/layer_20/attn_mlp_ratio": 0.7210780913091978, "grad/layer_24/attn": 0.013334391638636589, "grad/layer_24/mlp": 0.011660738848149776, "grad/layer_24/attn_mlp_ratio": 1.1435288705054472, "grad/layer_27/attn": 0.004307053983211517, "grad/layer_27/mlp": 0.010949499905109406, "grad/layer_27/attn_mlp_ratio": 0.39335622459488573} {"step": 27100, "timestamp": 1778223832.69058, "train/loss": 2.2350346565246584, "train/z_loss": 0.0014781659236177802, "train/perplexity": 9.34680577438839, "train/grad_norm": 0.16015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022270.4947183225, "perf/iters_per_sec": 0.9642937158195126, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037028431892395, "data/tokens_consumed": 56834916352, "data/tokens_consumed_B": 56.834916352, "train/loss_slope": 1.5905592050274804e-05} {"step": 27110, "timestamp": 1778223843.0570612, "train/loss": 2.19700973033905, "train/z_loss": 0.0014920158078894018, "train/perplexity": 8.998066584727146, "train/grad_norm": 0.134765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024475.041913279, "perf/iters_per_sec": 0.9653449258390804, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0358991622924805, "data/tokens_consumed": 56855887872, "data/tokens_consumed_B": 56.855887872, "train/loss_slope": 1.4367795648640605e-05} {"step": 27120, "timestamp": 1778223853.4231973, "train/loss": 2.180977535247803, "train/z_loss": 0.0014899247442372143, "train/perplexity": 8.854958062934475, "train/grad_norm": 0.10107421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024074.967070701, "perf/iters_per_sec": 0.9651541552880769, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0361039161682128, "data/tokens_consumed": 56876859392, "data/tokens_consumed_B": 56.876859392, "train/loss_slope": 1.2093407622050333e-05} {"step": 27130, "timestamp": 1778223863.7875602, "train/loss": 2.177354097366333, "train/z_loss": 0.0014926637522876263, "train/perplexity": 8.822930732013829, "train/grad_norm": 0.1767578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024614.7419874924, "perf/iters_per_sec": 0.965411540025469, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0358276844024659, "data/tokens_consumed": 56897830912, "data/tokens_consumed_B": 56.897830912, "train/loss_slope": 1.1268683120791624e-05} {"step": 27140, "timestamp": 1778223874.1497777, "train/loss": 2.220528817176819, "train/z_loss": 0.0014891918515786528, "train/perplexity": 9.212201148227768, "train/grad_norm": 0.232421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024973.5398446855, "perf/iters_per_sec": 0.9655826281760623, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0356441497802735, "data/tokens_consumed": 56918802432, "data/tokens_consumed_B": 56.918802432, "train/loss_slope": 1.4135072712707975e-05} {"step": 27150, "timestamp": 1778223884.5077033, "grad/layer_0/attn": 0.0025330749340355396, "grad/layer_0/mlp": 0.0025125350803136826, "grad/layer_0/attn_mlp_ratio": 1.0081749118909025, "grad/layer_4/attn": 0.0031018403824418783, "grad/layer_4/mlp": 0.002501189708709717, "grad/layer_4/attn_mlp_ratio": 1.2401459384012194, "grad/layer_8/attn": 0.004255929961800575, "grad/layer_8/mlp": 0.003659473964944482, "grad/layer_8/attn_mlp_ratio": 1.162989513320992, "grad/layer_12/attn": 0.004790504928678274, "grad/layer_12/mlp": 0.0068914867006242275, "grad/layer_12/attn_mlp_ratio": 0.6951337305390113, "grad/layer_16/attn": 0.0036314092576503754, "grad/layer_16/mlp": 0.0041512795723974705, "grad/layer_16/attn_mlp_ratio": 0.8747686362343166, "grad/layer_20/attn": 0.004498686175793409, "grad/layer_20/mlp": 0.005826005712151527, "grad/layer_20/attn_mlp_ratio": 0.7721733072099456, "grad/layer_24/attn": 0.009027375839650631, "grad/layer_24/mlp": 0.008204487152397633, "grad/layer_24/attn_mlp_ratio": 1.1002973814131434, "grad/layer_27/attn": 0.006773608736693859, "grad/layer_27/mlp": 0.006882617715746164, "grad/layer_27/attn_mlp_ratio": 0.9841616835380701} {"step": 27150, "timestamp": 1778223885.149856, "eos/sharpness": 7.402014732360838, "eos/L0_probe": 2.0235860347747803, "eos/L_plus": 2.0664117336273193, "eos/L_minus": 2.0547804832458496, "eos/grad_norm": 0.10630632191896439, "eos/embed_grad_frac": 0.200787752866745, "eos/time_s": 0.6382436752319336} {"step": 27150, "timestamp": 1778223885.1737666, "train/loss": 2.2097978830337524, "train/z_loss": 0.001488211948890239, "train/perplexity": 9.113874138279513, "train/grad_norm": 0.1064453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1903169.8829034967, "perf/iters_per_sec": 0.9075021185414776, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.101925802230835, "data/tokens_consumed": 56939773952, "data/tokens_consumed_B": 56.939773952, "train/loss_slope": 1.3216728181263853e-05} {"step": 27150, "timestamp": 1778223886.5414035, "geo/rankme_last": 440.3522644042969, "geo/layer_0/stable_rank_q_proj": 18.178617477416992, "geo/layer_0/stable_rank_k_proj": 15.893623352050781, "geo/layer_0/stable_rank_o_proj": 50.75782775878906, "geo/layer_0/stable_rank_gate_proj": 144.61407470703125, "geo/layer_0/stable_rank_down_proj": 51.32949447631836, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.050693318247795105, "geo/layer_0/attn_entropy_mean": 6.23408842086792, "geo/layer_0/attn_entropy_std": 0.33718419075012207, "geo/layer_7/stable_rank_q_proj": 42.69367599487305, "geo/layer_7/stable_rank_k_proj": 42.148277282714844, "geo/layer_7/stable_rank_o_proj": 107.5439224243164, "geo/layer_7/stable_rank_gate_proj": 97.5778579711914, "geo/layer_7/stable_rank_down_proj": 148.4370880126953, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5431938767433167, "geo/layer_7/attn_entropy_mean": 4.658851146697998, "geo/layer_7/attn_entropy_std": 0.8323763012886047, "geo/layer_14/stable_rank_q_proj": 56.16810989379883, "geo/layer_14/stable_rank_k_proj": 34.945438385009766, "geo/layer_14/stable_rank_o_proj": 53.58930969238281, "geo/layer_14/stable_rank_gate_proj": 82.01116943359375, "geo/layer_14/stable_rank_down_proj": 134.57423400878906, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.36509427428245544, "geo/layer_14/attn_entropy_mean": 5.49053430557251, "geo/layer_14/attn_entropy_std": 0.43559107184410095, "geo/layer_21/stable_rank_q_proj": 45.730316162109375, "geo/layer_21/stable_rank_k_proj": 31.095415115356445, "geo/layer_21/stable_rank_o_proj": 81.05818939208984, "geo/layer_21/stable_rank_gate_proj": 81.0790786743164, "geo/layer_21/stable_rank_down_proj": 58.48035430908203, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15268352627754211, "geo/layer_21/attn_entropy_mean": 5.721613883972168, "geo/layer_21/attn_entropy_std": 0.29863396286964417, "geo/layer_27/stable_rank_q_proj": 41.28333282470703, "geo/layer_27/stable_rank_k_proj": 31.357858657836914, "geo/layer_27/stable_rank_o_proj": 118.72506713867188, "geo/layer_27/stable_rank_gate_proj": 89.18069458007812, "geo/layer_27/stable_rank_down_proj": 135.7498779296875, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07930566370487213, "geo/layer_27/attn_entropy_mean": 4.337409973144531, "geo/layer_27/attn_entropy_std": 0.5947253108024597, "attnres/final_alpha/block_0": 0.24240049719810486, "attnres/block_norm/0": 1.6670295000076294, "attnres/final_alpha/block_1": 0.006092295050621033, "attnres/block_norm/1": 33877.87890625, "attnres/final_alpha/block_2": 0.012994397431612015, "attnres/block_norm/2": 23488.865234375, "attnres/final_alpha/block_3": 0.01467728428542614, "attnres/block_norm/3": 36605.9765625, "attnres/final_alpha/block_4": 0.01927073672413826, "attnres/block_norm/4": 10832.5283203125, "attnres/final_alpha/block_5": 0.5766940116882324, "attnres/block_norm/5": 5447.96044921875, "attnres/final_alpha/block_6": 0.12787073850631714, "attnres/block_norm/6": 24394.359375, "geo/tier1_time_s": 1.3634495735168457, "geo/step": 27150.0, "geo/rankme_slope": -9.093174379126651e-05} {"step": 27160, "timestamp": 1778223896.9264476, "train/loss": 2.2055608510971068, "train/z_loss": 0.0014837338356301188, "train/perplexity": 9.07534005520775, "train/grad_norm": 0.1513671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1784991.7432236578, "perf/iters_per_sec": 0.8511503902548112, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1748805046081543, "data/tokens_consumed": 56960745472, "data/tokens_consumed_B": 56.960745472, "train/loss_slope": 9.852194449867947e-06} {"step": 27170, "timestamp": 1778223907.3055768, "train/loss": 2.2001937866210937, "train/z_loss": 0.0014872110332362354, "train/perplexity": 9.026762595775763, "train/grad_norm": 0.1669921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021644.1010441787, "perf/iters_per_sec": 0.9639950280400175, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373497486114502, "data/tokens_consumed": 56981716992, "data/tokens_consumed_B": 56.981716992, "train/loss_slope": 1.11888848444094e-05} {"step": 27180, "timestamp": 1778223917.6824203, "train/loss": 2.2523043870925905, "train/z_loss": 0.0014871588558889926, "train/perplexity": 9.509624462605556, "train/grad_norm": 0.09228515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022306.155569061, "perf/iters_per_sec": 0.9643107202382378, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0370101451873779, "data/tokens_consumed": 57002688512, "data/tokens_consumed_B": 57.002688512, "train/loss_slope": 1.4040294052636762e-05} {"step": 27190, "timestamp": 1778223928.044608, "train/loss": 2.163843107223511, "train/z_loss": 0.001507710397709161, "train/perplexity": 8.704525886410888, "train/grad_norm": 0.12451171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025402.651524294, "perf/iters_per_sec": 0.9657872445699186, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354247331619262, "data/tokens_consumed": 57023660032, "data/tokens_consumed_B": 57.023660032, "train/loss_slope": 1.610508457233053e-05} {"step": 27200, "timestamp": 1778223938.4081483, "grad/layer_0/attn": 0.003324283752590418, "grad/layer_0/mlp": 0.0029834157321602106, "grad/layer_0/attn_mlp_ratio": 1.1142542439963503, "grad/layer_4/attn": 0.0033425427973270416, "grad/layer_4/mlp": 0.002611924195662141, "grad/layer_4/attn_mlp_ratio": 1.2797242258813952, "grad/layer_8/attn": 0.007906259968876839, "grad/layer_8/mlp": 0.0038316508289426565, "grad/layer_8/attn_mlp_ratio": 2.063408205887526, "grad/layer_12/attn": 0.004398568067699671, "grad/layer_12/mlp": 0.00637496542185545, "grad/layer_12/attn_mlp_ratio": 0.689975193217901, "grad/layer_16/attn": 0.005332798231393099, "grad/layer_16/mlp": 0.0047500948421657085, "grad/layer_16/attn_mlp_ratio": 1.1226719247345651, "grad/layer_20/attn": 0.0049651083536446095, "grad/layer_20/mlp": 0.00527274189516902, "grad/layer_20/attn_mlp_ratio": 0.9416558515841911, "grad/layer_24/attn": 0.006877798121422529, "grad/layer_24/mlp": 0.00852309912443161, "grad/layer_24/attn_mlp_ratio": 0.8069597619733445, "grad/layer_27/attn": 0.005099030211567879, "grad/layer_27/mlp": 0.008456896990537643, "grad/layer_27/attn_mlp_ratio": 0.6029433912910144} {"step": 27200, "timestamp": 1778223938.425441, "train/loss": 2.1735761880874636, "train/z_loss": 0.0014953003264963627, "train/perplexity": 8.789661383993819, "train/grad_norm": 0.12451171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021570.4113176593, "perf/iters_per_sec": 0.9639598900402352, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373875617980957, "data/tokens_consumed": 57044631552, "data/tokens_consumed_B": 57.044631552, "train/loss_slope": 1.268468352362735e-05} {"step": 27210, "timestamp": 1778223948.7809129, "train/loss": 2.2000171184539794, "train/z_loss": 0.0014968147035688162, "train/perplexity": 9.025167995034733, "train/grad_norm": 0.154296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026687.249338465, "perf/iters_per_sec": 0.9663997885410619, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347684383392335, "data/tokens_consumed": 57065603072, "data/tokens_consumed_B": 57.065603072, "train/loss_slope": 1.2864523423243822e-05} {"step": 27220, "timestamp": 1778223959.1320262, "train/loss": 2.2585699796676635, "train/z_loss": 0.0014708045171573758, "train/perplexity": 9.569394948248442, "train/grad_norm": 0.1328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026869.4751818494, "perf/iters_per_sec": 0.9664866805943725, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034675407409668, "data/tokens_consumed": 57086574592, "data/tokens_consumed_B": 57.086574592, "train/loss_slope": 1.6417388747198402e-05} {"step": 27225, "timestamp": 1778223964.900651, "eos/sharpness": 39.462757110595696, "eos/L0_probe": 2.0257205963134766, "eos/L_plus": 2.2343294620513916, "eos/L_minus": 2.2117393016815186, "eos/grad_norm": 0.17801575362682343, "eos/embed_grad_frac": 0.07929062843322754, "eos/time_s": 0.6057891845703125} {"step": 27225, "timestamp": 1778223966.2763693, "geo/rankme_last": 439.74310302734375, "geo/layer_0/stable_rank_q_proj": 18.235637664794922, "geo/layer_0/stable_rank_k_proj": 15.87579345703125, "geo/layer_0/stable_rank_o_proj": 50.59682083129883, "geo/layer_0/stable_rank_gate_proj": 144.56649780273438, "geo/layer_0/stable_rank_down_proj": 51.3509635925293, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0503210611641407, "geo/layer_0/attn_entropy_mean": 6.237909317016602, "geo/layer_0/attn_entropy_std": 0.33071503043174744, "geo/layer_7/stable_rank_q_proj": 42.605106353759766, "geo/layer_7/stable_rank_k_proj": 41.99110794067383, "geo/layer_7/stable_rank_o_proj": 107.41007995605469, "geo/layer_7/stable_rank_gate_proj": 97.34465789794922, "geo/layer_7/stable_rank_down_proj": 148.1698760986328, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5281054377555847, "geo/layer_7/attn_entropy_mean": 4.6634321212768555, "geo/layer_7/attn_entropy_std": 0.8167319297790527, "geo/layer_14/stable_rank_q_proj": 56.070560455322266, "geo/layer_14/stable_rank_k_proj": 34.95525360107422, "geo/layer_14/stable_rank_o_proj": 53.469058990478516, "geo/layer_14/stable_rank_gate_proj": 81.84410095214844, "geo/layer_14/stable_rank_down_proj": 134.63491821289062, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3684186339378357, "geo/layer_14/attn_entropy_mean": 5.471492767333984, "geo/layer_14/attn_entropy_std": 0.41159990429878235, "geo/layer_21/stable_rank_q_proj": 45.735206604003906, "geo/layer_21/stable_rank_k_proj": 31.06680679321289, "geo/layer_21/stable_rank_o_proj": 81.052978515625, "geo/layer_21/stable_rank_gate_proj": 81.0127182006836, "geo/layer_21/stable_rank_down_proj": 58.5384521484375, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14696799218654633, "geo/layer_21/attn_entropy_mean": 5.717554092407227, "geo/layer_21/attn_entropy_std": 0.29572540521621704, "geo/layer_27/stable_rank_q_proj": 41.29378128051758, "geo/layer_27/stable_rank_k_proj": 31.46823501586914, "geo/layer_27/stable_rank_o_proj": 118.81442260742188, "geo/layer_27/stable_rank_gate_proj": 89.32957458496094, "geo/layer_27/stable_rank_down_proj": 136.1120147705078, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08678141236305237, "geo/layer_27/attn_entropy_mean": 4.351971626281738, "geo/layer_27/attn_entropy_std": 0.5880329012870789, "attnres/final_alpha/block_0": 0.24174998700618744, "attnres/block_norm/0": 1.667593002319336, "attnres/final_alpha/block_1": 0.006061695981770754, "attnres/block_norm/1": 33842.234375, "attnres/final_alpha/block_2": 0.013064553029835224, "attnres/block_norm/2": 23565.345703125, "attnres/final_alpha/block_3": 0.014905646443367004, "attnres/block_norm/3": 37039.3828125, "attnres/final_alpha/block_4": 0.019171185791492462, "attnres/block_norm/4": 10840.228515625, "attnres/final_alpha/block_5": 0.5786254405975342, "attnres/block_norm/5": 5452.642578125, "attnres/final_alpha/block_6": 0.1264214813709259, "attnres/block_norm/6": 24471.99609375, "geo/tier1_time_s": 1.3560502529144287, "geo/step": 27225.0, "geo/rankme_slope": -9.966486594637855e-05} {"step": 27230, "timestamp": 1778223971.457794, "train/loss": 2.1998255252838135, "train/z_loss": 0.0014871968072839082, "train/perplexity": 9.023439000124382, "train/grad_norm": 0.16796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1702085.9780033063, "perf/iters_per_sec": 0.8116178407684833, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2321069717407227, "data/tokens_consumed": 57107546112, "data/tokens_consumed_B": 57.107546112, "train/loss_slope": 1.6142234507531272e-05} {"step": 27240, "timestamp": 1778223981.804456, "train/loss": 2.220011568069458, "train/z_loss": 0.0014817307703197003, "train/perplexity": 9.207437377541297, "train/grad_norm": 0.166015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027867.1607534115, "perf/iters_per_sec": 0.9669624141470964, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341663599014281, "data/tokens_consumed": 57128517632, "data/tokens_consumed_B": 57.128517632, "train/loss_slope": 1.7324550209766483e-05} {"step": 27250, "timestamp": 1778223992.6025493, "grad/layer_0/attn": 0.0024764311965554953, "grad/layer_0/mlp": 0.0025888734962791204, "grad/layer_0/attn_mlp_ratio": 0.956567056852357, "grad/layer_4/attn": 0.0016737201949581504, "grad/layer_4/mlp": 0.002441790420562029, "grad/layer_4/attn_mlp_ratio": 0.6854479042587605, "grad/layer_8/attn": 0.01009424403309822, "grad/layer_8/mlp": 0.003534364979714155, "grad/layer_8/attn_mlp_ratio": 2.856027548210894, "grad/layer_12/attn": 0.004253242630511522, "grad/layer_12/mlp": 0.005818550940603018, "grad/layer_12/attn_mlp_ratio": 0.730979689072338, "grad/layer_16/attn": 0.003627041121944785, "grad/layer_16/mlp": 0.004241200163960457, "grad/layer_16/attn_mlp_ratio": 0.8551921381231439, "grad/layer_20/attn": 0.007758129853755236, "grad/layer_20/mlp": 0.005303282290697098, "grad/layer_20/attn_mlp_ratio": 1.4628920887494843, "grad/layer_24/attn": 0.006087185349315405, "grad/layer_24/mlp": 0.010224021971225739, "grad/layer_24/attn_mlp_ratio": 0.5953806933229384, "grad/layer_27/attn": 0.005934969987720251, "grad/layer_27/mlp": 0.009116819128394127, "grad/layer_27/attn_mlp_ratio": 0.650991298504189} {"step": 27250, "timestamp": 1778223992.6182427, "train/loss": 2.250083303451538, "train/z_loss": 0.001487312326207757, "train/perplexity": 9.488526230421796, "train/grad_norm": 0.1259765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1940340.6321093508, "perf/iters_per_sec": 0.925226512961078, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0808164119720458, "data/tokens_consumed": 57149489152, "data/tokens_consumed_B": 57.149489152, "train/loss_slope": 2.00304593047043e-05} {"step": 27260, "timestamp": 1778224002.9796567, "train/loss": 2.2187575578689573, "train/z_loss": 0.0014895400614477693, "train/perplexity": 9.19589839366326, "train/grad_norm": 0.21484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025014.75054504, "perf/iters_per_sec": 0.965602278969307, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0356230735778809, "data/tokens_consumed": 57170460672, "data/tokens_consumed_B": 57.170460672, "train/loss_slope": 1.7972041049090888e-05} {"step": 27270, "timestamp": 1778224013.3436751, "train/loss": 2.1858291387557984, "train/z_loss": 0.0014994309633038938, "train/perplexity": 8.89802319152782, "train/grad_norm": 0.28125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025030.2749202768, "perf/iters_per_sec": 0.9656096815682778, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0356151342391968, "data/tokens_consumed": 57191432192, "data/tokens_consumed_B": 57.191432192, "train/loss_slope": 1.6253987568976342e-05} {"step": 27280, "timestamp": 1778224023.7073543, "train/loss": 2.1877821922302245, "train/z_loss": 0.0014903766452334822, "train/perplexity": 8.915418488080332, "train/grad_norm": 0.10791015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025160.4466145285, "perf/iters_per_sec": 0.9656717522690432, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0355485677719116, "data/tokens_consumed": 57212403712, "data/tokens_consumed_B": 57.212403712, "train/loss_slope": 1.79915928461514e-05} {"step": 27290, "timestamp": 1778224034.0674827, "train/loss": 2.242263925075531, "train/z_loss": 0.0014772244729101657, "train/perplexity": 9.41462117595023, "train/grad_norm": 0.10009765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025263.588793135, "perf/iters_per_sec": 0.9657209342923808, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354958295822143, "data/tokens_consumed": 57233375232, "data/tokens_consumed_B": 57.233375232, "train/loss_slope": 1.9536156093541363e-05} {"step": 27300, "timestamp": 1778224044.4086978, "grad/layer_0/attn": 0.003241264959797263, "grad/layer_0/mlp": 0.002884829184040427, "grad/layer_0/attn_mlp_ratio": 1.123555205754713, "grad/layer_4/attn": 0.0021660884376615286, "grad/layer_4/mlp": 0.0026020624209195375, "grad/layer_4/attn_mlp_ratio": 0.8324505734382043, "grad/layer_8/attn": 0.004762222524732351, "grad/layer_8/mlp": 0.0038553166668862104, "grad/layer_8/attn_mlp_ratio": 1.2352350825321712, "grad/layer_12/attn": 0.004164230078458786, "grad/layer_12/mlp": 0.0065845344215631485, "grad/layer_12/attn_mlp_ratio": 0.6324258859637979, "grad/layer_16/attn": 0.008512363769114017, "grad/layer_16/mlp": 0.004602737259119749, "grad/layer_16/attn_mlp_ratio": 1.849413317544142, "grad/layer_20/attn": 0.005748700350522995, "grad/layer_20/mlp": 0.0059837764129042625, "grad/layer_20/attn_mlp_ratio": 0.9607144147388668, "grad/layer_24/attn": 0.011239560320973396, "grad/layer_24/mlp": 0.014012495055794716, "grad/layer_24/attn_mlp_ratio": 0.8021098452494664, "grad/layer_27/attn": 0.012430792674422264, "grad/layer_27/mlp": 0.012013427913188934, "grad/layer_27/attn_mlp_ratio": 1.0347415126452773} {"step": 27300, "timestamp": 1778224045.0207143, "eos/sharpness": 16.009521484374996, "eos/L0_probe": 2.0277650356292725, "eos/L_plus": 2.10825777053833, "eos/L_minus": 2.107367515563965, "eos/grad_norm": 0.13978208601474762, "eos/embed_grad_frac": 0.155943363904953, "eos/time_s": 0.6092507839202881} {"step": 27300, "timestamp": 1778224045.0397704, "train/loss": 2.158089780807495, "train/z_loss": 0.0014885142096318304, "train/perplexity": 8.654589695037945, "train/grad_norm": 0.1396484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1912299.145428725, "perf/iters_per_sec": 0.9118552901404977, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0966652393341065, "data/tokens_consumed": 57254346752, "data/tokens_consumed_B": 57.254346752, "train/loss_slope": 1.6705588660653666e-05} {"step": 27300, "timestamp": 1778224046.4032161, "geo/rankme_last": 440.468994140625, "geo/layer_0/stable_rank_q_proj": 18.260530471801758, "geo/layer_0/stable_rank_k_proj": 15.890323638916016, "geo/layer_0/stable_rank_o_proj": 50.625370025634766, "geo/layer_0/stable_rank_gate_proj": 144.2338409423828, "geo/layer_0/stable_rank_down_proj": 51.32048416137695, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.049182306975126266, "geo/layer_0/attn_entropy_mean": 6.237200736999512, "geo/layer_0/attn_entropy_std": 0.3336009383201599, "geo/layer_7/stable_rank_q_proj": 42.54689025878906, "geo/layer_7/stable_rank_k_proj": 41.9985237121582, "geo/layer_7/stable_rank_o_proj": 107.56117248535156, "geo/layer_7/stable_rank_gate_proj": 97.48439025878906, "geo/layer_7/stable_rank_down_proj": 148.00682067871094, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5433032512664795, "geo/layer_7/attn_entropy_mean": 4.676608085632324, "geo/layer_7/attn_entropy_std": 0.8260529637336731, "geo/layer_14/stable_rank_q_proj": 56.160953521728516, "geo/layer_14/stable_rank_k_proj": 34.90499496459961, "geo/layer_14/stable_rank_o_proj": 53.464698791503906, "geo/layer_14/stable_rank_gate_proj": 81.75643920898438, "geo/layer_14/stable_rank_down_proj": 134.779541015625, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38422638177871704, "geo/layer_14/attn_entropy_mean": 5.470041275024414, "geo/layer_14/attn_entropy_std": 0.4370764493942261, "geo/layer_21/stable_rank_q_proj": 45.76136016845703, "geo/layer_21/stable_rank_k_proj": 31.001922607421875, "geo/layer_21/stable_rank_o_proj": 81.04911804199219, "geo/layer_21/stable_rank_gate_proj": 80.86410522460938, "geo/layer_21/stable_rank_down_proj": 58.640785217285156, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15025334060192108, "geo/layer_21/attn_entropy_mean": 5.707184314727783, "geo/layer_21/attn_entropy_std": 0.2951587438583374, "geo/layer_27/stable_rank_q_proj": 41.370601654052734, "geo/layer_27/stable_rank_k_proj": 31.450469970703125, "geo/layer_27/stable_rank_o_proj": 118.76592254638672, "geo/layer_27/stable_rank_gate_proj": 89.23919677734375, "geo/layer_27/stable_rank_down_proj": 136.56283569335938, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08156169950962067, "geo/layer_27/attn_entropy_mean": 4.306461334228516, "geo/layer_27/attn_entropy_std": 0.6293036341667175, "attnres/final_alpha/block_0": 0.24200020730495453, "attnres/block_norm/0": 1.6682109832763672, "attnres/final_alpha/block_1": 0.006222845055162907, "attnres/block_norm/1": 33938.16796875, "attnres/final_alpha/block_2": 0.012983450666069984, "attnres/block_norm/2": 23663.0078125, "attnres/final_alpha/block_3": 0.014742573723196983, "attnres/block_norm/3": 37080.421875, "attnres/final_alpha/block_4": 0.018791038542985916, "attnres/block_norm/4": 10852.119140625, "attnres/final_alpha/block_5": 0.576504111289978, "attnres/block_norm/5": 5467.53515625, "attnres/final_alpha/block_6": 0.12875574827194214, "attnres/block_norm/6": 24311.30859375, "geo/tier1_time_s": 1.3594965934753418, "geo/step": 27300.0, "geo/rankme_slope": -7.030474299094638e-05} {"step": 27310, "timestamp": 1778224056.7652695, "train/loss": 2.1760257720947265, "train/z_loss": 0.0014930046861991286, "train/perplexity": 8.811218790508894, "train/grad_norm": 0.2421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1789092.1131279226, "perf/iters_per_sec": 0.8531055989875425, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.172187829017639, "data/tokens_consumed": 57275318272, "data/tokens_consumed_B": 57.275318272, "train/loss_slope": 1.5125561296993534e-05} {"step": 27320, "timestamp": 1778224067.115874, "train/loss": 2.182049059867859, "train/z_loss": 0.0014935745391994714, "train/perplexity": 8.864451453801161, "train/grad_norm": 0.203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027174.9240579123, "perf/iters_per_sec": 0.9666323299684106, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345195055007934, "data/tokens_consumed": 57296289792, "data/tokens_consumed_B": 57.296289792, "train/loss_slope": 1.4120184041128295e-05} {"step": 27330, "timestamp": 1778224077.4689543, "train/loss": 2.1800456285476684, "train/z_loss": 0.0014799324213527142, "train/perplexity": 8.846709912036685, "train/grad_norm": 0.2021484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026712.559118276, "perf/iters_per_sec": 0.9664118571845417, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347555160522461, "data/tokens_consumed": 57317261312, "data/tokens_consumed_B": 57.317261312, "train/loss_slope": 1.2729729162548686e-05} {"step": 27340, "timestamp": 1778224087.8245955, "train/loss": 2.1484773635864256, "train/z_loss": 0.0014978889841586352, "train/perplexity": 8.571796725788749, "train/grad_norm": 0.177734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026723.019433294, "perf/iters_per_sec": 0.9664168450514288, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347501754760742, "data/tokens_consumed": 57338232832, "data/tokens_consumed_B": 57.338232832, "train/loss_slope": 6.830812945510396e-06} {"step": 27350, "timestamp": 1778224098.163903, "grad/layer_0/attn": 0.0032204012386500835, "grad/layer_0/mlp": 0.002976449439302087, "grad/layer_0/attn_mlp_ratio": 1.0819606367004624, "grad/layer_4/attn": 0.003875468159094453, "grad/layer_4/mlp": 0.0027076152618974447, "grad/layer_4/attn_mlp_ratio": 1.4313215287634502, "grad/layer_8/attn": 0.004041534848511219, "grad/layer_8/mlp": 0.003987088333815336, "grad/layer_8/attn_mlp_ratio": 1.0136556827368342, "grad/layer_12/attn": 0.00553086819127202, "grad/layer_12/mlp": 0.006709903012961149, "grad/layer_12/attn_mlp_ratio": 0.8242843597232199, "grad/layer_16/attn": 0.0038436155300587416, "grad/layer_16/mlp": 0.004612088203430176, "grad/layer_16/attn_mlp_ratio": 0.8333785645864817, "grad/layer_20/attn": 0.006405222229659557, "grad/layer_20/mlp": 0.005871907342225313, "grad/layer_20/attn_mlp_ratio": 1.0908247946142917, "grad/layer_24/attn": 0.006786133162677288, "grad/layer_24/mlp": 0.009424720890820026, "grad/layer_24/attn_mlp_ratio": 0.720035443944409, "grad/layer_27/attn": 0.0044497353956103325, "grad/layer_27/mlp": 0.008037311024963856, "grad/layer_27/attn_mlp_ratio": 0.553634832150453} {"step": 27350, "timestamp": 1778224098.1798055, "train/loss": 2.1981099128723143, "train/z_loss": 0.0014752801158465444, "train/perplexity": 9.00797154805155, "train/grad_norm": 0.1123046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026071.603922761, "perf/iters_per_sec": 0.9661062259305768, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350828647613526, "data/tokens_consumed": 57359204352, "data/tokens_consumed_B": 57.359204352, "train/loss_slope": 4.4625173057123475e-06} {"step": 27360, "timestamp": 1778224108.5316093, "train/loss": 2.2108243703842163, "train/z_loss": 0.0014882021234370769, "train/perplexity": 9.123234217976007, "train/grad_norm": 0.1708984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026949.2968714912, "perf/iters_per_sec": 0.9665247425420243, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346346616744995, "data/tokens_consumed": 57380175872, "data/tokens_consumed_B": 57.380175872, "train/loss_slope": 4.898389216267656e-06} {"step": 27370, "timestamp": 1778224118.8789275, "train/loss": 2.2005420923233032, "train/z_loss": 0.0014840529067441822, "train/perplexity": 9.029907216273196, "train/grad_norm": 0.09521484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027747.812945885, "perf/iters_per_sec": 0.9669055046777176, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034227228164673, "data/tokens_consumed": 57401147392, "data/tokens_consumed_B": 57.401147392, "train/loss_slope": 6.382330029782607e-06} {"step": 27375, "timestamp": 1778224124.6647618, "eos/sharpness": 16.594052314758297, "eos/L0_probe": 2.02729868888855, "eos/L_plus": 2.1135916709899902, "eos/L_minus": 2.1069462299346924, "eos/grad_norm": 0.10085487365722656, "eos/embed_grad_frac": 0.23242293298244476, "eos/time_s": 0.6097071170806885} {"step": 27375, "timestamp": 1778224126.0404372, "geo/rankme_last": 439.8347473144531, "geo/layer_0/stable_rank_q_proj": 18.252914428710938, "geo/layer_0/stable_rank_k_proj": 15.826119422912598, "geo/layer_0/stable_rank_o_proj": 50.86861801147461, "geo/layer_0/stable_rank_gate_proj": 144.4378204345703, "geo/layer_0/stable_rank_down_proj": 51.33654022216797, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.052380844950675964, "geo/layer_0/attn_entropy_mean": 6.237161159515381, "geo/layer_0/attn_entropy_std": 0.3309096693992615, "geo/layer_7/stable_rank_q_proj": 42.3991813659668, "geo/layer_7/stable_rank_k_proj": 42.10194396972656, "geo/layer_7/stable_rank_o_proj": 107.38410186767578, "geo/layer_7/stable_rank_gate_proj": 97.61459350585938, "geo/layer_7/stable_rank_down_proj": 147.96267700195312, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.525173008441925, "geo/layer_7/attn_entropy_mean": 4.671971321105957, "geo/layer_7/attn_entropy_std": 0.8294327259063721, "geo/layer_14/stable_rank_q_proj": 56.325870513916016, "geo/layer_14/stable_rank_k_proj": 35.06358337402344, "geo/layer_14/stable_rank_o_proj": 53.41475296020508, "geo/layer_14/stable_rank_gate_proj": 81.7645492553711, "geo/layer_14/stable_rank_down_proj": 134.575927734375, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38009944558143616, "geo/layer_14/attn_entropy_mean": 5.515928268432617, "geo/layer_14/attn_entropy_std": 0.4234603941440582, "geo/layer_21/stable_rank_q_proj": 45.7834587097168, "geo/layer_21/stable_rank_k_proj": 31.051212310791016, "geo/layer_21/stable_rank_o_proj": 81.0810546875, "geo/layer_21/stable_rank_gate_proj": 80.86226654052734, "geo/layer_21/stable_rank_down_proj": 58.597259521484375, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14573903381824493, "geo/layer_21/attn_entropy_mean": 5.716842174530029, "geo/layer_21/attn_entropy_std": 0.299159437417984, "geo/layer_27/stable_rank_q_proj": 41.428768157958984, "geo/layer_27/stable_rank_k_proj": 31.389371871948242, "geo/layer_27/stable_rank_o_proj": 118.5771713256836, "geo/layer_27/stable_rank_gate_proj": 89.29984283447266, "geo/layer_27/stable_rank_down_proj": 136.51438903808594, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08771004527807236, "geo/layer_27/attn_entropy_mean": 4.37068510055542, "geo/layer_27/attn_entropy_std": 0.5823036432266235, "attnres/final_alpha/block_0": 0.2410239279270172, "attnres/block_norm/0": 1.668527603149414, "attnres/final_alpha/block_1": 0.006086799316108227, "attnres/block_norm/1": 33976.1875, "attnres/final_alpha/block_2": 0.012830393388867378, "attnres/block_norm/2": 23724.98828125, "attnres/final_alpha/block_3": 0.014893583953380585, "attnres/block_norm/3": 36922.3046875, "attnres/final_alpha/block_4": 0.01876210793852806, "attnres/block_norm/4": 10899.7548828125, "attnres/final_alpha/block_5": 0.5791791677474976, "attnres/block_norm/5": 5446.21435546875, "attnres/final_alpha/block_6": 0.12722401320934296, "attnres/block_norm/6": 24578.083984375, "geo/tier1_time_s": 1.3547101020812988, "geo/step": 27375.0, "geo/rankme_slope": -7.188742684573829e-05} {"step": 27380, "timestamp": 1778224131.2245796, "train/loss": 2.200639176368713, "train/z_loss": 0.0014946504030376672, "train/perplexity": 9.030783918751652, "train/grad_norm": 0.103515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1699778.2431628804, "perf/iters_per_sec": 0.8105174270452883, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2337797641754151, "data/tokens_consumed": 57422118912, "data/tokens_consumed_B": 57.422118912, "train/loss_slope": 4.1316174831327415e-06} {"step": 27390, "timestamp": 1778224141.5727613, "train/loss": 2.1783981800079344, "train/z_loss": 0.0014949778327718378, "train/perplexity": 8.832147411489437, "train/grad_norm": 0.234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027332.0519070802, "perf/iters_per_sec": 0.9667072543654824, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344393253326416, "data/tokens_consumed": 57443090432, "data/tokens_consumed_B": 57.443090432, "train/loss_slope": 1.5269426694331522e-07} {"step": 27400, "timestamp": 1778224151.920785, "grad/layer_0/attn": 0.00261924066580832, "grad/layer_0/mlp": 0.00271680043078959, "grad/layer_0/attn_mlp_ratio": 0.9640901627205897, "grad/layer_4/attn": 0.002041222294792533, "grad/layer_4/mlp": 0.002532324520871043, "grad/layer_4/attn_mlp_ratio": 0.8060665990327947, "grad/layer_8/attn": 0.0037904209457337856, "grad/layer_8/mlp": 0.003729751566424966, "grad/layer_8/attn_mlp_ratio": 1.0162663053025651, "grad/layer_12/attn": 0.0048739411868155, "grad/layer_12/mlp": 0.006310426630079746, "grad/layer_12/attn_mlp_ratio": 0.7723631689728703, "grad/layer_16/attn": 0.004390863701701164, "grad/layer_16/mlp": 0.00482561532407999, "grad/layer_16/attn_mlp_ratio": 0.9099075072975357, "grad/layer_20/attn": 0.004317858722060919, "grad/layer_20/mlp": 0.006927125621587038, "grad/layer_20/attn_mlp_ratio": 0.6233261666675333, "grad/layer_24/attn": 0.01525744330137968, "grad/layer_24/mlp": 0.011465768329799175, "grad/layer_24/attn_mlp_ratio": 1.3306952250776374, "grad/layer_27/attn": 0.00834206584841013, "grad/layer_27/mlp": 0.009996220469474792, "grad/layer_27/attn_mlp_ratio": 0.8345219866280349} {"step": 27400, "timestamp": 1778224151.936655, "train/loss": 2.222690391540527, "train/z_loss": 0.0014683306217193604, "train/perplexity": 9.232135543139805, "train/grad_norm": 0.2158203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024526.7165785327, "perf/iters_per_sec": 0.965369566239611, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035872721672058, "data/tokens_consumed": 57464061952, "data/tokens_consumed_B": 57.464061952, "train/loss_slope": 2.4321590069353615e-06} {"step": 27410, "timestamp": 1778224162.2927914, "train/loss": 2.1413837909698485, "train/z_loss": 0.0015005519147962333, "train/perplexity": 8.511207215371405, "train/grad_norm": 0.11083984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026235.51541602, "perf/iters_per_sec": 0.9661843850212193, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034999132156372, "data/tokens_consumed": 57485033472, "data/tokens_consumed_B": 57.485033472, "train/loss_slope": -4.36714555349504e-06} {"step": 27420, "timestamp": 1778224172.985167, "train/loss": 2.1755290031433105, "train/z_loss": 0.0015046861255541444, "train/perplexity": 8.806842737623231, "train/grad_norm": 0.1484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1962624.2780767325, "perf/iters_per_sec": 0.9358521833785689, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0685448169708252, "data/tokens_consumed": 57506004992, "data/tokens_consumed_B": 57.506004992, "train/loss_slope": -5.7212653118606454e-06} {"step": 27430, "timestamp": 1778224183.343218, "train/loss": 2.245133137702942, "train/z_loss": 0.0014749440946616233, "train/perplexity": 9.441672515374814, "train/grad_norm": 0.1240234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026322.7558932807, "perf/iters_per_sec": 0.9662259845224765, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034954571723938, "data/tokens_consumed": 57526976512, "data/tokens_consumed_B": 57.526976512, "train/loss_slope": -1.9898714691606385e-06} {"step": 27440, "timestamp": 1778224194.2167761, "train/loss": 2.195067572593689, "train/z_loss": 0.0014856437221169472, "train/perplexity": 8.980607879284161, "train/grad_norm": 0.103515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1929643.524112205, "perf/iters_per_sec": 0.9201257343827272, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0868079900741576, "data/tokens_consumed": 57547948032, "data/tokens_consumed_B": 57.547948032, "train/loss_slope": -8.315333939514648e-07} {"step": 27450, "timestamp": 1778224204.5558844, "grad/layer_0/attn": 0.0026886321138590574, "grad/layer_0/mlp": 0.0026242483872920275, "grad/layer_0/attn_mlp_ratio": 1.024534119721821, "grad/layer_4/attn": 0.0025055164005607367, "grad/layer_4/mlp": 0.0025154370814561844, "grad/layer_4/attn_mlp_ratio": 0.9960560410855879, "grad/layer_8/attn": 0.006322432775050402, "grad/layer_8/mlp": 0.0037925310898572206, "grad/layer_8/attn_mlp_ratio": 1.6670746945889785, "grad/layer_12/attn": 0.005132810212671757, "grad/layer_12/mlp": 0.006710719782859087, "grad/layer_12/attn_mlp_ratio": 0.7648673022073654, "grad/layer_16/attn": 0.0036502869334071875, "grad/layer_16/mlp": 0.004369575064629316, "grad/layer_16/attn_mlp_ratio": 0.8353871476923895, "grad/layer_20/attn": 0.004402133636176586, "grad/layer_20/mlp": 0.006176521070301533, "grad/layer_20/attn_mlp_ratio": 0.71272056142919, "grad/layer_24/attn": 0.006495010107755661, "grad/layer_24/mlp": 0.009496000595390797, "grad/layer_24/attn_mlp_ratio": 0.6839732131557481, "grad/layer_27/attn": 0.004008416552096605, "grad/layer_27/mlp": 0.008276114240288734, "grad/layer_27/attn_mlp_ratio": 0.48433556948136136} {"step": 27450, "timestamp": 1778224205.1629062, "eos/sharpness": 4.563164710998534, "eos/L0_probe": 2.024775505065918, "eos/L_plus": 2.0526349544525146, "eos/L_minus": 2.0425477027893066, "eos/grad_norm": 0.09336038678884506, "eos/embed_grad_frac": 0.30334755778312683, "eos/time_s": 0.6043219566345215} {"step": 27450, "timestamp": 1778224205.182109, "train/loss": 2.1991512298583986, "train/z_loss": 0.0014898743364028632, "train/perplexity": 9.01735658738709, "train/grad_norm": 0.09326171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1913609.623981426, "perf/iters_per_sec": 0.9124801750094538, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0959142208099366, "data/tokens_consumed": 57568919552, "data/tokens_consumed_B": 57.568919552, "train/loss_slope": -5.652349917265919e-07} {"step": 27450, "timestamp": 1778224206.5416415, "geo/rankme_last": 440.1192321777344, "geo/layer_0/stable_rank_q_proj": 18.287111282348633, "geo/layer_0/stable_rank_k_proj": 15.871504783630371, "geo/layer_0/stable_rank_o_proj": 50.772056579589844, "geo/layer_0/stable_rank_gate_proj": 144.57225036621094, "geo/layer_0/stable_rank_down_proj": 51.34222412109375, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05077607184648514, "geo/layer_0/attn_entropy_mean": 6.2366228103637695, "geo/layer_0/attn_entropy_std": 0.3314797282218933, "geo/layer_7/stable_rank_q_proj": 42.34485626220703, "geo/layer_7/stable_rank_k_proj": 41.95726776123047, "geo/layer_7/stable_rank_o_proj": 106.99466705322266, "geo/layer_7/stable_rank_gate_proj": 97.64411163330078, "geo/layer_7/stable_rank_down_proj": 148.14414978027344, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5285878777503967, "geo/layer_7/attn_entropy_mean": 4.641899585723877, "geo/layer_7/attn_entropy_std": 0.8432042002677917, "geo/layer_14/stable_rank_q_proj": 56.22860336303711, "geo/layer_14/stable_rank_k_proj": 35.13214874267578, "geo/layer_14/stable_rank_o_proj": 53.51203155517578, "geo/layer_14/stable_rank_gate_proj": 81.74974822998047, "geo/layer_14/stable_rank_down_proj": 134.39389038085938, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3636943995952606, "geo/layer_14/attn_entropy_mean": 5.513815402984619, "geo/layer_14/attn_entropy_std": 0.41142308712005615, "geo/layer_21/stable_rank_q_proj": 45.88742446899414, "geo/layer_21/stable_rank_k_proj": 31.137550354003906, "geo/layer_21/stable_rank_o_proj": 81.11253356933594, "geo/layer_21/stable_rank_gate_proj": 80.88732147216797, "geo/layer_21/stable_rank_down_proj": 58.521427154541016, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1501537710428238, "geo/layer_21/attn_entropy_mean": 5.739323616027832, "geo/layer_21/attn_entropy_std": 0.2889992892742157, "geo/layer_27/stable_rank_q_proj": 41.4334602355957, "geo/layer_27/stable_rank_k_proj": 31.315845489501953, "geo/layer_27/stable_rank_o_proj": 118.68363952636719, "geo/layer_27/stable_rank_gate_proj": 89.11229705810547, "geo/layer_27/stable_rank_down_proj": 136.605712890625, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08153099566698074, "geo/layer_27/attn_entropy_mean": 4.3324127197265625, "geo/layer_27/attn_entropy_std": 0.5954399108886719, "attnres/final_alpha/block_0": 0.24140509963035583, "attnres/block_norm/0": 1.6690425872802734, "attnres/final_alpha/block_1": 0.006138271652162075, "attnres/block_norm/1": 33987.6328125, "attnres/final_alpha/block_2": 0.012799616903066635, "attnres/block_norm/2": 23729.482421875, "attnres/final_alpha/block_3": 0.014859361574053764, "attnres/block_norm/3": 36982.578125, "attnres/final_alpha/block_4": 0.01876000687479973, "attnres/block_norm/4": 10901.201171875, "attnres/final_alpha/block_5": 0.5792703628540039, "attnres/block_norm/5": 5471.2607421875, "attnres/final_alpha/block_6": 0.1267673224210739, "attnres/block_norm/6": 24377.5546875, "geo/tier1_time_s": 1.3555748462677002, "geo/step": 27450.0, "geo/rankme_slope": -5.7117983912314926e-05} {"step": 27460, "timestamp": 1778224216.904991, "train/loss": 2.155604028701782, "train/z_loss": 0.0014869404374621808, "train/perplexity": 8.633103246535391, "train/grad_norm": 0.12890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1789493.324078086, "perf/iters_per_sec": 0.8532969112768584, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1719250202178955, "data/tokens_consumed": 57589891072, "data/tokens_consumed_B": 57.589891072, "train/loss_slope": -5.475690255392616e-06} {"step": 27470, "timestamp": 1778224227.2626858, "train/loss": 2.2080934762954714, "train/z_loss": 0.0014904760057106615, "train/perplexity": 9.098353620181461, "train/grad_norm": 0.19921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025682.6536401634, "perf/iters_per_sec": 0.9659207599831406, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352816104888916, "data/tokens_consumed": 57610862592, "data/tokens_consumed_B": 57.610862592, "train/loss_slope": -8.601151984838875e-06} {"step": 27480, "timestamp": 1778224238.1406758, "train/loss": 2.2304360628128053, "train/z_loss": 0.0014771599322557448, "train/perplexity": 9.30392228956183, "train/grad_norm": 0.142578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1928804.1580613293, "perf/iters_per_sec": 0.9197254934603354, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0872809410095214, "data/tokens_consumed": 57631834112, "data/tokens_consumed_B": 57.631834112, "train/loss_slope": -4.764932610414249e-06} {"step": 27490, "timestamp": 1778224248.499728, "train/loss": 2.17177529335022, "train/z_loss": 0.001490685110911727, "train/perplexity": 8.773846373923732, "train/grad_norm": 0.10302734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025561.650390651, "perf/iters_per_sec": 0.9658630611375099, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0353434562683106, "data/tokens_consumed": 57652805632, "data/tokens_consumed_B": 57.652805632, "train/loss_slope": -8.461479099169555e-06} {"step": 27500, "timestamp": 1778224259.269001, "grad/layer_0/attn": 0.0023006757255643606, "grad/layer_0/mlp": 0.0023975803051143885, "grad/layer_0/attn_mlp_ratio": 0.9595823024982536, "grad/layer_4/attn": 0.0018093379912897944, "grad/layer_4/mlp": 0.0023752888664603233, "grad/layer_4/attn_mlp_ratio": 0.7617338424242697, "grad/layer_8/attn": 0.004781773313879967, "grad/layer_8/mlp": 0.003584008663892746, "grad/layer_8/attn_mlp_ratio": 1.3341968808933038, "grad/layer_12/attn": 0.003990366589277983, "grad/layer_12/mlp": 0.006219453644007444, "grad/layer_12/attn_mlp_ratio": 0.641594383288528, "grad/layer_16/attn": 0.0036137988790869713, "grad/layer_16/mlp": 0.004473009146749973, "grad/layer_16/attn_mlp_ratio": 0.8079122308349145, "grad/layer_20/attn": 0.005041830707341433, "grad/layer_20/mlp": 0.006221298594027758, "grad/layer_20/attn_mlp_ratio": 0.8104145059264595, "grad/layer_24/attn": 0.011218439787626266, "grad/layer_24/mlp": 0.010362956672906876, "grad/layer_24/attn_mlp_ratio": 1.0825520199945236, "grad/layer_27/attn": 0.0040700542740523815, "grad/layer_27/mlp": 0.009093930013477802, "grad/layer_27/attn_mlp_ratio": 0.4475572412878227} {"step": 27500, "timestamp": 1778224259.2851782, "train/loss": 2.1871604681015016, "train/z_loss": 0.0014904470182955265, "train/perplexity": 8.909877280018499, "train/grad_norm": 0.13671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1945766.818949551, "perf/iters_per_sec": 0.9278139204738384, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0778023242950439, "data/tokens_consumed": 57673777152, "data/tokens_consumed_B": 57.673777152, "train/loss_slope": -9.468063770240409e-06} {"step": 27500, "timestamp": 1778224266.2435188, "geo/ww_alpha_mean": 7.752638622902379, "geo/ww_alpha_std": 4.246961676472739, "geo/ww_alpha_min": 1.3512861587848506, "geo/ww_alpha_max": 25.758183450556714, "geo/ww_alpha_healthy_frac": 0.16751269035532995, "geo/ww_alpha_by_type/q_proj": 4.055756295422527, "geo/ww_alpha_by_type/k_proj": 4.615016298833851, "geo/ww_alpha_by_type/v_proj": 8.1114547558177, "geo/ww_alpha_by_type/o_proj": 7.504326861186036, "geo/ww_alpha_by_type/gate_proj": 8.782762716267044, "geo/ww_alpha_by_type/up_proj": 12.154355602214507, "geo/ww_alpha_by_type/down_proj": 9.164866290425227, "geo/twonn_id/layer_0": 0.6842548847198486, "geo/twonn_id/layer_7": 3.396336793899536, "geo/twonn_id/layer_14": 4.1334919929504395, "geo/twonn_id/layer_21": 7.081839561462402, "geo/twonn_id/layer_27": 5.699623107910156, "geo/tier2_time_s": 6.950738906860352} {"step": 27500, "timestamp": 1778224266.8757074, "eoc/jacobian_sigma/layer_0/attn": 977.9716186523438, "eoc/jacobian_sigma/layer_0/mlp": 6373.84521484375, "eoc/jacobian_sigma/layer_0": 6373.84521484375, "eoc/jacobian_sigma/layer_7/attn": 1.1482396125793457, "eoc/jacobian_sigma/layer_7/mlp": 1.6462738513946533, "eoc/jacobian_sigma/layer_7": 1.6462738513946533, "eoc/jacobian_sigma/layer_14/attn": 1.637562870979309, "eoc/jacobian_sigma/layer_14/mlp": 6.666217803955078, "eoc/jacobian_sigma/layer_14": 6.666217803955078, "eoc/jacobian_sigma/layer_21/attn": 1.0904875993728638, "eoc/jacobian_sigma/layer_21/mlp": 3.950453996658325, "eoc/jacobian_sigma/layer_21": 3.950453996658325, "eoc/jacobian_sigma/layer_27/attn": 3.554072856903076, "eoc/jacobian_sigma/layer_27/mlp": 26.04104232788086, "eoc/jacobian_sigma/layer_27": 26.04104232788086, "eoc/layer0_sigma": 6373.84521484375, "eoc/sigma_max": 26.04104232788086, "eoc/sigma_min": 1.6462738513946533, "eoc/sigma_mean": 9.575996994972229, "eoc/time_s": 0.6261429786682129} {"step": 27510, "timestamp": 1778224277.2439353, "train/loss": 2.189168095588684, "train/z_loss": 0.0014828254585154354, "train/perplexity": 8.927782962509287, "train/grad_norm": 0.1533203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1168179.7447594092, "perf/iters_per_sec": 0.5570315097615286, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.7952305793762207, "data/tokens_consumed": 57694748672, "data/tokens_consumed_B": 57.694748672, "train/loss_slope": -9.499703281962277e-06} {"step": 27520, "timestamp": 1778224288.0989377, "train/loss": 2.190484118461609, "train/z_loss": 0.001473264954984188, "train/perplexity": 8.939539863570985, "train/grad_norm": 0.21484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1933458.13975686, "perf/iters_per_sec": 0.9219446848663616, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0846637725830077, "data/tokens_consumed": 57715720192, "data/tokens_consumed_B": 57.715720192, "train/loss_slope": -8.742146280267528e-06} {"step": 27525, "timestamp": 1778224293.8887591, "eos/sharpness": 35.39361953735351, "eos/L0_probe": 2.024099349975586, "eos/L_plus": 2.2039825916290283, "eos/L_minus": 2.1981523036956787, "eos/grad_norm": 0.18577657639980316, "eos/embed_grad_frac": 0.13410136103630066, "eos/time_s": 0.6268501281738281} {"step": 27525, "timestamp": 1778224295.2711365, "geo/rankme_last": 440.5872497558594, "geo/layer_0/stable_rank_q_proj": 18.260173797607422, "geo/layer_0/stable_rank_k_proj": 15.829322814941406, "geo/layer_0/stable_rank_o_proj": 50.73567199707031, "geo/layer_0/stable_rank_gate_proj": 144.33224487304688, "geo/layer_0/stable_rank_down_proj": 51.27519607543945, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.052838604897260666, "geo/layer_0/attn_entropy_mean": 6.242101669311523, "geo/layer_0/attn_entropy_std": 0.3321099877357483, "geo/layer_7/stable_rank_q_proj": 42.2093391418457, "geo/layer_7/stable_rank_k_proj": 41.90367889404297, "geo/layer_7/stable_rank_o_proj": 106.63518524169922, "geo/layer_7/stable_rank_gate_proj": 97.65059661865234, "geo/layer_7/stable_rank_down_proj": 147.86875915527344, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5420088768005371, "geo/layer_7/attn_entropy_mean": 4.658080577850342, "geo/layer_7/attn_entropy_std": 0.836489200592041, "geo/layer_14/stable_rank_q_proj": 56.287349700927734, "geo/layer_14/stable_rank_k_proj": 35.13284683227539, "geo/layer_14/stable_rank_o_proj": 53.348663330078125, "geo/layer_14/stable_rank_gate_proj": 81.7715835571289, "geo/layer_14/stable_rank_down_proj": 134.4619140625, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38125231862068176, "geo/layer_14/attn_entropy_mean": 5.489933967590332, "geo/layer_14/attn_entropy_std": 0.4328419268131256, "geo/layer_21/stable_rank_q_proj": 45.63962936401367, "geo/layer_21/stable_rank_k_proj": 31.04900360107422, "geo/layer_21/stable_rank_o_proj": 80.97496032714844, "geo/layer_21/stable_rank_gate_proj": 80.7355728149414, "geo/layer_21/stable_rank_down_proj": 58.485958099365234, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15280526876449585, "geo/layer_21/attn_entropy_mean": 5.743321895599365, "geo/layer_21/attn_entropy_std": 0.2838405668735504, "geo/layer_27/stable_rank_q_proj": 41.40760803222656, "geo/layer_27/stable_rank_k_proj": 31.2519474029541, "geo/layer_27/stable_rank_o_proj": 118.79744720458984, "geo/layer_27/stable_rank_gate_proj": 89.04047393798828, "geo/layer_27/stable_rank_down_proj": 136.28546142578125, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07840970903635025, "geo/layer_27/attn_entropy_mean": 4.358044147491455, "geo/layer_27/attn_entropy_std": 0.6069666743278503, "attnres/final_alpha/block_0": 0.24125361442565918, "attnres/block_norm/0": 1.6695899963378906, "attnres/final_alpha/block_1": 0.006041158922016621, "attnres/block_norm/1": 34036.7109375, "attnres/final_alpha/block_2": 0.012828847393393517, "attnres/block_norm/2": 23727.958984375, "attnres/final_alpha/block_3": 0.014794256538152695, "attnres/block_norm/3": 37097.6171875, "attnres/final_alpha/block_4": 0.01867694966495037, "attnres/block_norm/4": 10882.9755859375, "attnres/final_alpha/block_5": 0.5786997675895691, "attnres/block_norm/5": 5489.89453125, "attnres/final_alpha/block_6": 0.1277053952217102, "attnres/block_norm/6": 24570.73046875, "geo/tier1_time_s": 1.3620331287384033, "geo/step": 27525.0, "geo/rankme_slope": -4.014291263380352e-05} {"step": 27530, "timestamp": 1778224300.4497888, "train/loss": 2.2111064195632935, "train/z_loss": 0.0014846684178337454, "train/perplexity": 9.125807781616412, "train/grad_norm": 0.09228515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1698652.4383193012, "perf/iters_per_sec": 0.8099806014629847, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2345974683761596, "data/tokens_consumed": 57736691712, "data/tokens_consumed_B": 57.736691712, "train/loss_slope": -9.114471964030677e-06} {"step": 27540, "timestamp": 1778224310.8077419, "train/loss": 2.2011260747909547, "train/z_loss": 0.0014968606876209379, "train/perplexity": 9.035182063831357, "train/grad_norm": 0.154296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025714.8894038384, "perf/iters_per_sec": 0.965936131193084, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352651357650757, "data/tokens_consumed": 57757663232, "data/tokens_consumed_B": 57.757663232, "train/loss_slope": -1.1932939764904687e-05} {"step": 27550, "timestamp": 1778224321.1501899, "grad/layer_0/attn": 0.002487874124199152, "grad/layer_0/mlp": 0.002434490015730262, "grad/layer_0/attn_mlp_ratio": 1.0219282091654238, "grad/layer_4/attn": 0.0019344320753589272, "grad/layer_4/mlp": 0.002451966516673565, "grad/layer_4/attn_mlp_ratio": 0.7889308370695741, "grad/layer_8/attn": 0.004398939199745655, "grad/layer_8/mlp": 0.0037512073758989573, "grad/layer_8/attn_mlp_ratio": 1.1726728601412462, "grad/layer_12/attn": 0.003939324524253607, "grad/layer_12/mlp": 0.0065372115932404995, "grad/layer_12/attn_mlp_ratio": 0.602600115937332, "grad/layer_16/attn": 0.010022451169788837, "grad/layer_16/mlp": 0.004497859627008438, "grad/layer_16/attn_mlp_ratio": 2.228271172977386, "grad/layer_20/attn": 0.00351792573928833, "grad/layer_20/mlp": 0.005286633502691984, "grad/layer_20/attn_mlp_ratio": 0.6654377820882049, "grad/layer_24/attn": 0.007372557185590267, "grad/layer_24/mlp": 0.008610998280346394, "grad/layer_24/attn_mlp_ratio": 0.8561791397403204, "grad/layer_27/attn": 0.004408854991197586, "grad/layer_27/mlp": 0.007493176963180304, "grad/layer_27/attn_mlp_ratio": 0.5883825984656968} {"step": 27550, "timestamp": 1778224321.1659386, "train/loss": 2.206797647476196, "train/z_loss": 0.001483679050579667, "train/perplexity": 9.086571346905812, "train/grad_norm": 0.11376953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025965.5796344613, "perf/iters_per_sec": 0.9660556696102435, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351370334625245, "data/tokens_consumed": 57778634752, "data/tokens_consumed_B": 57.778634752, "train/loss_slope": -1.1593997746732843e-05} {"step": 27560, "timestamp": 1778224331.514088, "train/loss": 2.179234576225281, "train/z_loss": 0.0014968212344683707, "train/perplexity": 8.839537676339411, "train/grad_norm": 0.09912109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027572.720124622, "perf/iters_per_sec": 0.9668220139144049, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343165397644043, "data/tokens_consumed": 57799606272, "data/tokens_consumed_B": 57.799606272, "train/loss_slope": -1.1464158565190472e-05} {"step": 27570, "timestamp": 1778224341.8863475, "train/loss": 2.2064812421798705, "train/z_loss": 0.001482529635541141, "train/perplexity": 9.083696762397073, "train/grad_norm": 0.208984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022886.157997743, "perf/iters_per_sec": 0.9645872869480815, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0367128133773804, "data/tokens_consumed": 57820577792, "data/tokens_consumed_B": 57.820577792, "train/loss_slope": -1.1316775782058942e-05} {"step": 27580, "timestamp": 1778224352.2636402, "train/loss": 2.1989181280136108, "train/z_loss": 0.0014870890765450895, "train/perplexity": 9.015254869898088, "train/grad_norm": 0.1015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022256.3144166085, "perf/iters_per_sec": 0.9642869541247409, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0370357036590576, "data/tokens_consumed": 57841549312, "data/tokens_consumed_B": 57.841549312, "train/loss_slope": -1.3380183817350996e-05} {"step": 27590, "timestamp": 1778224362.6375325, "train/loss": 2.1819817304611204, "train/z_loss": 0.0014927679090760647, "train/perplexity": 8.863854635635647, "train/grad_norm": 0.13671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022542.2370133686, "perf/iters_per_sec": 0.964423292643246, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036889100074768, "data/tokens_consumed": 57862520832, "data/tokens_consumed_B": 57.862520832, "train/loss_slope": -1.4257119612546438e-05} {"step": 27600, "timestamp": 1778224373.0031219, "grad/layer_0/attn": 0.0032105369027704, "grad/layer_0/mlp": 0.0028192901518195868, "grad/layer_0/attn_mlp_ratio": 1.1387748745267716, "grad/layer_4/attn": 0.0017468417063355446, "grad/layer_4/mlp": 0.002410824876278639, "grad/layer_4/attn_mlp_ratio": 0.7245825489297767, "grad/layer_8/attn": 0.005104387179017067, "grad/layer_8/mlp": 0.003730298252776265, "grad/layer_8/attn_mlp_ratio": 1.368358961212349, "grad/layer_12/attn": 0.004547233693301678, "grad/layer_12/mlp": 0.0062478939071297646, "grad/layer_12/attn_mlp_ratio": 0.7278026304723826, "grad/layer_16/attn": 0.005592554807662964, "grad/layer_16/mlp": 0.004518970847129822, "grad/layer_16/attn_mlp_ratio": 1.2375726405621215, "grad/layer_20/attn": 0.0039916085079312325, "grad/layer_20/mlp": 0.005840423982590437, "grad/layer_20/attn_mlp_ratio": 0.6834449778792111, "grad/layer_24/attn": 0.018351120874285698, "grad/layer_24/mlp": 0.011378795839846134, "grad/layer_24/attn_mlp_ratio": 1.6127471633465154, "grad/layer_27/attn": 0.005221688188612461, "grad/layer_27/mlp": 0.010475901886820793, "grad/layer_27/attn_mlp_ratio": 0.498447598610755} {"step": 27600, "timestamp": 1778224373.6155577, "eos/sharpness": 46.23539447784423, "eos/L0_probe": 2.025404214859009, "eos/L_plus": 2.2397429943084717, "eos/L_minus": 2.2734193801879883, "eos/grad_norm": 0.15547122061252594, "eos/embed_grad_frac": 0.10069190710783005, "eos/time_s": 0.6093771457672119} {"step": 27600, "timestamp": 1778224373.6354823, "train/loss": 2.2176085472106934, "train/z_loss": 0.0014866136712953447, "train/perplexity": 9.185338276402083, "train/grad_norm": 0.15625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1907799.4776918439, "perf/iters_per_sec": 0.909709681363985, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0992517948150635, "data/tokens_consumed": 57883492352, "data/tokens_consumed_B": 57.883492352, "train/loss_slope": -1.1034190651177999e-05} {"step": 27600, "timestamp": 1778224374.9951048, "geo/rankme_last": 439.6860656738281, "geo/layer_0/stable_rank_q_proj": 18.264665603637695, "geo/layer_0/stable_rank_k_proj": 15.844595909118652, "geo/layer_0/stable_rank_o_proj": 50.621681213378906, "geo/layer_0/stable_rank_gate_proj": 144.6001434326172, "geo/layer_0/stable_rank_down_proj": 51.19702911376953, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.050180595368146896, "geo/layer_0/attn_entropy_mean": 6.238010883331299, "geo/layer_0/attn_entropy_std": 0.331157922744751, "geo/layer_7/stable_rank_q_proj": 42.15657043457031, "geo/layer_7/stable_rank_k_proj": 41.799049377441406, "geo/layer_7/stable_rank_o_proj": 106.41815185546875, "geo/layer_7/stable_rank_gate_proj": 97.5308837890625, "geo/layer_7/stable_rank_down_proj": 147.7093505859375, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5251797437667847, "geo/layer_7/attn_entropy_mean": 4.653326511383057, "geo/layer_7/attn_entropy_std": 0.816897451877594, "geo/layer_14/stable_rank_q_proj": 56.23143005371094, "geo/layer_14/stable_rank_k_proj": 35.146080017089844, "geo/layer_14/stable_rank_o_proj": 53.3773193359375, "geo/layer_14/stable_rank_gate_proj": 81.88397216796875, "geo/layer_14/stable_rank_down_proj": 134.6359100341797, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3868047893047333, "geo/layer_14/attn_entropy_mean": 5.495508193969727, "geo/layer_14/attn_entropy_std": 0.4189194142818451, "geo/layer_21/stable_rank_q_proj": 45.769676208496094, "geo/layer_21/stable_rank_k_proj": 31.104467391967773, "geo/layer_21/stable_rank_o_proj": 80.96479034423828, "geo/layer_21/stable_rank_gate_proj": 80.76305389404297, "geo/layer_21/stable_rank_down_proj": 58.51511001586914, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14916811883449554, "geo/layer_21/attn_entropy_mean": 5.702672481536865, "geo/layer_21/attn_entropy_std": 0.3014675974845886, "geo/layer_27/stable_rank_q_proj": 41.48084259033203, "geo/layer_27/stable_rank_k_proj": 31.2463321685791, "geo/layer_27/stable_rank_o_proj": 118.94644927978516, "geo/layer_27/stable_rank_gate_proj": 89.05087280273438, "geo/layer_27/stable_rank_down_proj": 136.04522705078125, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08560828864574432, "geo/layer_27/attn_entropy_mean": 4.311599254608154, "geo/layer_27/attn_entropy_std": 0.5945575833320618, "attnres/final_alpha/block_0": 0.24185988306999207, "attnres/block_norm/0": 1.6699832677841187, "attnres/final_alpha/block_1": 0.006084160879254341, "attnres/block_norm/1": 34051.3125, "attnres/final_alpha/block_2": 0.012952841818332672, "attnres/block_norm/2": 23705.734375, "attnres/final_alpha/block_3": 0.01488536037504673, "attnres/block_norm/3": 37297.8828125, "attnres/final_alpha/block_4": 0.018650837242603302, "attnres/block_norm/4": 10909.2734375, "attnres/final_alpha/block_5": 0.5775448679924011, "attnres/block_norm/5": 5490.63671875, "attnres/final_alpha/block_6": 0.12802201509475708, "attnres/block_norm/6": 24744.4453125, "geo/tier1_time_s": 1.3558287620544434, "geo/step": 27600.0, "geo/rankme_slope": -9.315771230367147e-05} {"step": 27610, "timestamp": 1778224385.3724604, "train/loss": 2.239634299278259, "train/z_loss": 0.0014797179028391838, "train/perplexity": 9.38989676745291, "train/grad_norm": 0.130859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1787358.3452634246, "perf/iters_per_sec": 0.8522788740460513, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1733248710632325, "data/tokens_consumed": 57904463872, "data/tokens_consumed_B": 57.904463872, "train/loss_slope": -6.66146237607694e-06} {"step": 27620, "timestamp": 1778224395.7557423, "train/loss": 2.177132618427277, "train/z_loss": 0.0014818368013948201, "train/perplexity": 8.820976855055221, "train/grad_norm": 0.0986328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020784.8764478634, "perf/iters_per_sec": 0.9635853178252523, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0377908229827881, "data/tokens_consumed": 57925435392, "data/tokens_consumed_B": 57.925435392, "train/loss_slope": -6.406902801944014e-06} {"step": 27630, "timestamp": 1778224406.1351614, "train/loss": 2.190579152107239, "train/z_loss": 0.0014808618929237128, "train/perplexity": 8.940389461004004, "train/grad_norm": 0.08740234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021552.5240165414, "perf/iters_per_sec": 0.9639513607104022, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037396740913391, "data/tokens_consumed": 57946406912, "data/tokens_consumed_B": 57.946406912, "train/loss_slope": -6.106141820312996e-06} {"step": 27640, "timestamp": 1778224416.518237, "train/loss": 2.1456489324569703, "train/z_loss": 0.0015051112044602633, "train/perplexity": 8.5475862440746, "train/grad_norm": 0.08837890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020812.4531751277, "perf/iters_per_sec": 0.9635984674335135, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0377766609191894, "data/tokens_consumed": 57967378432, "data/tokens_consumed_B": 57.967378432, "train/loss_slope": -1.0626717826964559e-05} {"step": 27650, "timestamp": 1778224426.8805563, "grad/layer_0/attn": 0.0024650346022099257, "grad/layer_0/mlp": 0.002608755836263299, "grad/layer_0/attn_mlp_ratio": 0.9449080950595803, "grad/layer_4/attn": 0.002229035133495927, "grad/layer_4/mlp": 0.002358831698074937, "grad/layer_4/attn_mlp_ratio": 0.9449741754859595, "grad/layer_8/attn": 0.003301717806607485, "grad/layer_8/mlp": 0.003626427613198757, "grad/layer_8/attn_mlp_ratio": 0.9104601188079781, "grad/layer_12/attn": 0.004415706265717745, "grad/layer_12/mlp": 0.006361539475619793, "grad/layer_12/attn_mlp_ratio": 0.6941254099307448, "grad/layer_16/attn": 0.004217876121401787, "grad/layer_16/mlp": 0.004510285332798958, "grad/layer_16/attn_mlp_ratio": 0.9351683356288807, "grad/layer_20/attn": 0.0045845299027860165, "grad/layer_20/mlp": 0.006305213551968336, "grad/layer_20/attn_mlp_ratio": 0.7271014363414691, "grad/layer_24/attn": 0.016268882900476456, "grad/layer_24/mlp": 0.014063563197851181, "grad/layer_24/attn_mlp_ratio": 1.1568108704685272, "grad/layer_27/attn": 0.005115351174026728, "grad/layer_27/mlp": 0.012884064577519894, "grad/layer_27/attn_mlp_ratio": 0.39702929953091504} {"step": 27650, "timestamp": 1778224426.8971965, "train/loss": 2.204374361038208, "train/z_loss": 0.001491572812665254, "train/perplexity": 9.064578639869735, "train/grad_norm": 0.1953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021570.0860911773, "perf/iters_per_sec": 0.9639597349601637, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037387728691101, "data/tokens_consumed": 57988349952, "data/tokens_consumed_B": 57.988349952, "train/loss_slope": -1.0503407981064503e-05} {"step": 27660, "timestamp": 1778224437.283252, "train/loss": 2.1791621446609497, "train/z_loss": 0.0014943163841962814, "train/perplexity": 8.838897437984562, "train/grad_norm": 0.310546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020242.2694586536, "perf/iters_per_sec": 0.9633265826504963, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0380695581436157, "data/tokens_consumed": 58009321472, "data/tokens_consumed_B": 58.009321472, "train/loss_slope": -1.1220949031624936e-05} {"step": 27670, "timestamp": 1778224447.6570878, "train/loss": 2.188983988761902, "train/z_loss": 0.0014871070510707796, "train/perplexity": 8.926139448013622, "train/grad_norm": 0.107421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022607.3004608396, "perf/iters_per_sec": 0.964454317312641, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0368557453155518, "data/tokens_consumed": 58030292992, "data/tokens_consumed_B": 58.030292992, "train/loss_slope": -1.2994975411351834e-05} {"step": 27675, "timestamp": 1778224453.4475749, "eos/sharpness": 66.42839908599852, "eos/L0_probe": 2.026282548904419, "eos/L_plus": 2.4427073001861572, "eos/L_minus": 2.274141788482666, "eos/grad_norm": 0.260049045085907, "eos/embed_grad_frac": 0.036977723240852356, "eos/time_s": 0.605273962020874} {"step": 27675, "timestamp": 1778224454.83106, "geo/rankme_last": 439.58734130859375, "geo/layer_0/stable_rank_q_proj": 18.26445198059082, "geo/layer_0/stable_rank_k_proj": 15.789144515991211, "geo/layer_0/stable_rank_o_proj": 50.66551971435547, "geo/layer_0/stable_rank_gate_proj": 144.3749237060547, "geo/layer_0/stable_rank_down_proj": 51.36415100097656, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05040647089481354, "geo/layer_0/attn_entropy_mean": 6.2327351570129395, "geo/layer_0/attn_entropy_std": 0.3353107273578644, "geo/layer_7/stable_rank_q_proj": 42.2120361328125, "geo/layer_7/stable_rank_k_proj": 41.703880310058594, "geo/layer_7/stable_rank_o_proj": 106.4574966430664, "geo/layer_7/stable_rank_gate_proj": 97.63301849365234, "geo/layer_7/stable_rank_down_proj": 147.92140197753906, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5379388928413391, "geo/layer_7/attn_entropy_mean": 4.674627304077148, "geo/layer_7/attn_entropy_std": 0.8454583883285522, "geo/layer_14/stable_rank_q_proj": 56.05887222290039, "geo/layer_14/stable_rank_k_proj": 35.24998474121094, "geo/layer_14/stable_rank_o_proj": 53.34931564331055, "geo/layer_14/stable_rank_gate_proj": 81.6910171508789, "geo/layer_14/stable_rank_down_proj": 134.99562072753906, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3591383993625641, "geo/layer_14/attn_entropy_mean": 5.5050249099731445, "geo/layer_14/attn_entropy_std": 0.4184521734714508, "geo/layer_21/stable_rank_q_proj": 45.80316162109375, "geo/layer_21/stable_rank_k_proj": 31.110822677612305, "geo/layer_21/stable_rank_o_proj": 80.91954040527344, "geo/layer_21/stable_rank_gate_proj": 80.61609649658203, "geo/layer_21/stable_rank_down_proj": 58.51678466796875, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15317635238170624, "geo/layer_21/attn_entropy_mean": 5.748938083648682, "geo/layer_21/attn_entropy_std": 0.28074154257774353, "geo/layer_27/stable_rank_q_proj": 41.40437316894531, "geo/layer_27/stable_rank_k_proj": 31.12563705444336, "geo/layer_27/stable_rank_o_proj": 118.71446228027344, "geo/layer_27/stable_rank_gate_proj": 89.04866790771484, "geo/layer_27/stable_rank_down_proj": 136.03094482421875, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08850546926259995, "geo/layer_27/attn_entropy_mean": 4.361284255981445, "geo/layer_27/attn_entropy_std": 0.5890225172042847, "attnres/final_alpha/block_0": 0.2390919327735901, "attnres/block_norm/0": 1.6701271533966064, "attnres/final_alpha/block_1": 0.005871265660971403, "attnres/block_norm/1": 34169.8671875, "attnres/final_alpha/block_2": 0.012557156383991241, "attnres/block_norm/2": 23841.2578125, "attnres/final_alpha/block_3": 0.014637865126132965, "attnres/block_norm/3": 37484.9765625, "attnres/final_alpha/block_4": 0.01831923797726631, "attnres/block_norm/4": 10930.818359375, "attnres/final_alpha/block_5": 0.5861408710479736, "attnres/block_norm/5": 5441.59765625, "attnres/final_alpha/block_6": 0.12338167428970337, "attnres/block_norm/6": 24773.046875, "geo/tier1_time_s": 1.3637022972106934, "geo/step": 27675.0, "geo/rankme_slope": -0.00011334549444777911} {"step": 27680, "timestamp": 1778224460.0257928, "train/loss": 2.210625410079956, "train/z_loss": 0.0014710651012137532, "train/perplexity": 9.121419237080724, "train/grad_norm": 0.216796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1696515.3954266317, "perf/iters_per_sec": 0.8089615800030859, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2361526489257812, "data/tokens_consumed": 58051264512, "data/tokens_consumed_B": 58.051264512, "train/loss_slope": -1.2061834492699128e-05} {"step": 27690, "timestamp": 1778224470.411084, "train/loss": 2.1947199583053587, "train/z_loss": 0.0014818562660366297, "train/perplexity": 8.977486634193557, "train/grad_norm": 0.103515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020271.0378095605, "perf/iters_per_sec": 0.963340300469189, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0380547761917114, "data/tokens_consumed": 58072236032, "data/tokens_consumed_B": 58.072236032, "train/loss_slope": -1.2500278891796029e-05} {"step": 27700, "timestamp": 1778224480.7822587, "grad/layer_0/attn": 0.002851430093869567, "grad/layer_0/mlp": 0.0027395538054406643, "grad/layer_0/attn_mlp_ratio": 1.0408373743647534, "grad/layer_4/attn": 0.0016971423756331205, "grad/layer_4/mlp": 0.0024945829063653946, "grad/layer_4/attn_mlp_ratio": 0.680331089926671, "grad/layer_8/attn": 0.008175840601325035, "grad/layer_8/mlp": 0.0038726856000721455, "grad/layer_8/attn_mlp_ratio": 2.111155212304661, "grad/layer_12/attn": 0.004828968085348606, "grad/layer_12/mlp": 0.0062429760582745075, "grad/layer_12/attn_mlp_ratio": 0.7735041689928994, "grad/layer_16/attn": 0.004766872618347406, "grad/layer_16/mlp": 0.004318685736507177, "grad/layer_16/attn_mlp_ratio": 1.1037785101318482, "grad/layer_20/attn": 0.003738617291674018, "grad/layer_20/mlp": 0.005459012929350138, "grad/layer_20/attn_mlp_ratio": 0.6848522382294218, "grad/layer_24/attn": 0.0050513241440057755, "grad/layer_24/mlp": 0.007828235626220703, "grad/layer_24/attn_mlp_ratio": 0.6452698054411351, "grad/layer_27/attn": 0.009198633953928947, "grad/layer_27/mlp": 0.007013864815235138, "grad/layer_27/attn_mlp_ratio": 1.3114928880292764} {"step": 27700, "timestamp": 1778224480.7989051, "train/loss": 2.1700472831726074, "train/z_loss": 0.0015000290470197797, "train/perplexity": 8.758698169987472, "train/grad_norm": 0.09814453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020364.4942032162, "perf/iters_per_sec": 0.9633848639503556, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0380067586898805, "data/tokens_consumed": 58093207552, "data/tokens_consumed_B": 58.093207552, "train/loss_slope": -1.4337369925690364e-05} {"step": 27710, "timestamp": 1778224491.1822548, "train/loss": 2.1919607162475585, "train/z_loss": 0.001496390497777611, "train/perplexity": 8.952749718762238, "train/grad_norm": 0.1064453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020662.4151594942, "perf/iters_per_sec": 0.9635269237325164, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037853717803955, "data/tokens_consumed": 58114179072, "data/tokens_consumed_B": 58.114179072, "train/loss_slope": -1.678033040301634e-05} {"step": 27720, "timestamp": 1778224501.5596802, "train/loss": 2.1566707611083986, "train/z_loss": 0.0014992432319559158, "train/perplexity": 8.642317371167065, "train/grad_norm": 0.158203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022525.3556162561, "perf/iters_per_sec": 0.9644152429658204, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0368977546691895, "data/tokens_consumed": 58135150592, "data/tokens_consumed_B": 58.135150592, "train/loss_slope": -1.5905535682485926e-05} {"step": 27730, "timestamp": 1778224511.9355216, "train/loss": 2.2572299957275392, "train/z_loss": 0.001482840022072196, "train/perplexity": 9.556580700061865, "train/grad_norm": 0.2099609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022272.26145837, "perf/iters_per_sec": 0.9642945582668161, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0370275259017945, "data/tokens_consumed": 58156122112, "data/tokens_consumed_B": 58.156122112, "train/loss_slope": -1.4305572162116544e-05} {"step": 27740, "timestamp": 1778224522.3188083, "train/loss": 2.1753260850906373, "train/z_loss": 0.001487041136715561, "train/perplexity": 8.805055851546566, "train/grad_norm": 0.1064453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020757.439743941, "perf/iters_per_sec": 0.9635722349853234, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037804913520813, "data/tokens_consumed": 58177093632, "data/tokens_consumed_B": 58.177093632, "train/loss_slope": -1.593024547558589e-05} {"step": 27750, "timestamp": 1778224532.6904113, "grad/layer_0/attn": 0.003307424718514085, "grad/layer_0/mlp": 0.002880230313166976, "grad/layer_0/attn_mlp_ratio": 1.1483194897860218, "grad/layer_4/attn": 0.0022675388026982546, "grad/layer_4/mlp": 0.0024577216245234013, "grad/layer_4/attn_mlp_ratio": 0.9226182037097683, "grad/layer_8/attn": 0.004997742362320423, "grad/layer_8/mlp": 0.003765407484024763, "grad/layer_8/attn_mlp_ratio": 1.327277924313958, "grad/layer_12/attn": 0.004677889868617058, "grad/layer_12/mlp": 0.006486504338681698, "grad/layer_12/attn_mlp_ratio": 0.7211726921392166, "grad/layer_16/attn": 0.005374874919652939, "grad/layer_16/mlp": 0.004606984090059996, "grad/layer_16/attn_mlp_ratio": 1.1666796971541036, "grad/layer_20/attn": 0.0089305080473423, "grad/layer_20/mlp": 0.0070685953833162785, "grad/layer_20/attn_mlp_ratio": 1.263406297392547, "grad/layer_24/attn": 0.023575881496071815, "grad/layer_24/mlp": 0.016129324212670326, "grad/layer_24/attn_mlp_ratio": 1.461678185585981, "grad/layer_27/attn": 0.005188276991248131, "grad/layer_27/mlp": 0.015619010664522648, "grad/layer_27/attn_mlp_ratio": 0.33217705458228464} {"step": 27750, "timestamp": 1778224533.3006234, "eos/sharpness": 59.21058654785155, "eos/L0_probe": 2.019914388656616, "eos/L_plus": 2.2831575870513916, "eos/L_minus": 2.3487770557403564, "eos/grad_norm": 0.23370765149593353, "eos/embed_grad_frac": 0.054267656058073044, "eos/time_s": 0.6072566509246826} {"step": 27750, "timestamp": 1778224533.3207574, "train/loss": 2.201175403594971, "train/z_loss": 0.001463673822581768, "train/perplexity": 9.03562776954961, "train/grad_norm": 0.2333984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1907780.1127105264, "perf/iters_per_sec": 0.9097004474213249, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0992629528045654, "data/tokens_consumed": 58198065152, "data/tokens_consumed_B": 58.198065152, "train/loss_slope": -1.7452781085241203e-05} {"step": 27750, "timestamp": 1778224534.6792393, "geo/rankme_last": 440.8535461425781, "geo/layer_0/stable_rank_q_proj": 18.27802848815918, "geo/layer_0/stable_rank_k_proj": 15.822160720825195, "geo/layer_0/stable_rank_o_proj": 50.7347297668457, "geo/layer_0/stable_rank_gate_proj": 143.81019592285156, "geo/layer_0/stable_rank_down_proj": 51.32253646850586, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.050018876791000366, "geo/layer_0/attn_entropy_mean": 6.231683254241943, "geo/layer_0/attn_entropy_std": 0.3384333550930023, "geo/layer_7/stable_rank_q_proj": 42.19083023071289, "geo/layer_7/stable_rank_k_proj": 41.75722885131836, "geo/layer_7/stable_rank_o_proj": 106.49830627441406, "geo/layer_7/stable_rank_gate_proj": 97.42892456054688, "geo/layer_7/stable_rank_down_proj": 147.9031524658203, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5270476937294006, "geo/layer_7/attn_entropy_mean": 4.674595832824707, "geo/layer_7/attn_entropy_std": 0.7837966084480286, "geo/layer_14/stable_rank_q_proj": 56.03521728515625, "geo/layer_14/stable_rank_k_proj": 35.32980728149414, "geo/layer_14/stable_rank_o_proj": 53.2097282409668, "geo/layer_14/stable_rank_gate_proj": 81.47369384765625, "geo/layer_14/stable_rank_down_proj": 134.84881591796875, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37565869092941284, "geo/layer_14/attn_entropy_mean": 5.517086029052734, "geo/layer_14/attn_entropy_std": 0.40282389521598816, "geo/layer_21/stable_rank_q_proj": 45.767845153808594, "geo/layer_21/stable_rank_k_proj": 31.100751876831055, "geo/layer_21/stable_rank_o_proj": 80.72566223144531, "geo/layer_21/stable_rank_gate_proj": 80.57415771484375, "geo/layer_21/stable_rank_down_proj": 58.46535873413086, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15139932930469513, "geo/layer_21/attn_entropy_mean": 5.719708442687988, "geo/layer_21/attn_entropy_std": 0.28818854689598083, "geo/layer_27/stable_rank_q_proj": 41.431640625, "geo/layer_27/stable_rank_k_proj": 31.158958435058594, "geo/layer_27/stable_rank_o_proj": 118.30455017089844, "geo/layer_27/stable_rank_gate_proj": 89.20506286621094, "geo/layer_27/stable_rank_down_proj": 135.92626953125, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08100054413080215, "geo/layer_27/attn_entropy_mean": 4.355565071105957, "geo/layer_27/attn_entropy_std": 0.5919737815856934, "attnres/final_alpha/block_0": 0.2428075075149536, "attnres/block_norm/0": 1.6709907054901123, "attnres/final_alpha/block_1": 0.006213285028934479, "attnres/block_norm/1": 34169.0859375, "attnres/final_alpha/block_2": 0.012929883785545826, "attnres/block_norm/2": 23777.267578125, "attnres/final_alpha/block_3": 0.015161502175033092, "attnres/block_norm/3": 36928.265625, "attnres/final_alpha/block_4": 0.01895684190094471, "attnres/block_norm/4": 10972.763671875, "attnres/final_alpha/block_5": 0.5754194855690002, "attnres/block_norm/5": 5542.40380859375, "attnres/final_alpha/block_6": 0.1285114735364914, "attnres/block_norm/6": 24657.23828125, "geo/tier1_time_s": 1.3545095920562744, "geo/step": 27750.0, "geo/rankme_slope": -6.283552483493398e-05} {"step": 27760, "timestamp": 1778224545.0370603, "train/loss": 2.189427876472473, "train/z_loss": 0.0014865635894238949, "train/perplexity": 8.930102531134313, "train/grad_norm": 0.2158203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1790534.0785876964, "perf/iters_per_sec": 0.853793181699608, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1712438344955445, "data/tokens_consumed": 58219036672, "data/tokens_consumed_B": 58.219036672, "train/loss_slope": -1.9434317682657047e-05} {"step": 27770, "timestamp": 1778224555.3950448, "train/loss": 2.1579321146011354, "train/z_loss": 0.0015027436311356723, "train/perplexity": 8.65322526627811, "train/grad_norm": 0.0888671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025690.350943457, "perf/iters_per_sec": 0.9659244303433691, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352776765823364, "data/tokens_consumed": 58240008192, "data/tokens_consumed_B": 58.240008192, "train/loss_slope": -2.2805121429253313e-05} {"step": 27780, "timestamp": 1778224565.75243, "train/loss": 2.2064050674438476, "train/z_loss": 0.0014821290504187345, "train/perplexity": 9.083004840547902, "train/grad_norm": 0.09716796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025709.524480542, "perf/iters_per_sec": 0.9659335729983053, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352678775787354, "data/tokens_consumed": 58260979712, "data/tokens_consumed_B": 58.260979712, "train/loss_slope": -2.3146102115838224e-05} {"step": 27790, "timestamp": 1778224576.1144307, "train/loss": 2.2029659509658814, "train/z_loss": 0.0014787912834435702, "train/perplexity": 9.051820982127639, "train/grad_norm": 0.259765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024986.0334092458, "perf/iters_per_sec": 0.9655885855718831, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0356377601623534, "data/tokens_consumed": 58281951232, "data/tokens_consumed_B": 58.281951232, "train/loss_slope": -2.426507527356812e-05} {"step": 27800, "timestamp": 1778224586.4765668, "grad/layer_0/attn": 0.0029560967814177275, "grad/layer_0/mlp": 0.0030989539809525013, "grad/layer_0/attn_mlp_ratio": 0.9539014468098007, "grad/layer_4/attn": 0.0022242888808250427, "grad/layer_4/mlp": 0.0025316725950688124, "grad/layer_4/attn_mlp_ratio": 0.8785846942843396, "grad/layer_8/attn": 0.0037156222388148308, "grad/layer_8/mlp": 0.0037918228190392256, "grad/layer_8/attn_mlp_ratio": 0.979903945450147, "grad/layer_12/attn": 0.00469239754602313, "grad/layer_12/mlp": 0.006839902605861425, "grad/layer_12/attn_mlp_ratio": 0.6860327913731873, "grad/layer_16/attn": 0.0064952862448990345, "grad/layer_16/mlp": 0.004704161547124386, "grad/layer_16/attn_mlp_ratio": 1.3807531994291795, "grad/layer_20/attn": 0.00946267694234848, "grad/layer_20/mlp": 0.007656840607523918, "grad/layer_20/attn_mlp_ratio": 1.2358461281622426, "grad/layer_24/attn": 0.02103143185377121, "grad/layer_24/mlp": 0.017258362844586372, "grad/layer_24/attn_mlp_ratio": 1.2186226423270567, "grad/layer_27/attn": 0.004493351094424725, "grad/layer_27/mlp": 0.016120294108986855, "grad/layer_27/attn_mlp_ratio": 0.278738777107414} {"step": 27800, "timestamp": 1778224586.4925888, "train/loss": 2.2720171451568603, "train/z_loss": 0.0014629345969296992, "train/perplexity": 9.698945275927548, "train/grad_norm": 0.2353515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021987.4842247681, "perf/iters_per_sec": 0.9641587659000245, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037173581123352, "data/tokens_consumed": 58302922752, "data/tokens_consumed_B": 58.302922752, "train/loss_slope": -1.8315206667055675e-05} {"step": 27810, "timestamp": 1778224596.856988, "train/loss": 2.2468754529953, "train/z_loss": 0.001470438519027084, "train/perplexity": 9.458137224976342, "train/grad_norm": 0.1435546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024311.8809805575, "perf/iters_per_sec": 0.9652671246435917, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0359826564788819, "data/tokens_consumed": 58323894272, "data/tokens_consumed_B": 58.323894272, "train/loss_slope": -1.15261318731551e-05} {"step": 27820, "timestamp": 1778224607.233847, "train/loss": 2.232848560810089, "train/z_loss": 0.001477874640841037, "train/perplexity": 9.326395080334073, "train/grad_norm": 0.12451171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022054.929108598, "perf/iters_per_sec": 0.9641909261267653, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0371389865875245, "data/tokens_consumed": 58344865792, "data/tokens_consumed_B": 58.344865792, "train/loss_slope": -7.858611025897495e-06} {"step": 27825, "timestamp": 1778224613.0385954, "eos/sharpness": 9.658670425415037, "eos/L0_probe": 2.0254061222076416, "eos/L_plus": 2.0729217529296875, "eos/L_minus": 2.074477195739746, "eos/grad_norm": 0.11235076189041138, "eos/embed_grad_frac": 0.20253044366836548, "eos/time_s": 0.6249361038208008} {"step": 27825, "timestamp": 1778224614.4216113, "geo/rankme_last": 440.65576171875, "geo/layer_0/stable_rank_q_proj": 18.275732040405273, "geo/layer_0/stable_rank_k_proj": 15.820414543151855, "geo/layer_0/stable_rank_o_proj": 50.74143600463867, "geo/layer_0/stable_rank_gate_proj": 143.6392059326172, "geo/layer_0/stable_rank_down_proj": 51.39985275268555, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.050263404846191406, "geo/layer_0/attn_entropy_mean": 6.2279253005981445, "geo/layer_0/attn_entropy_std": 0.3320024609565735, "geo/layer_7/stable_rank_q_proj": 42.21818161010742, "geo/layer_7/stable_rank_k_proj": 41.73369598388672, "geo/layer_7/stable_rank_o_proj": 106.48807525634766, "geo/layer_7/stable_rank_gate_proj": 97.53477478027344, "geo/layer_7/stable_rank_down_proj": 148.1042022705078, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5297790765762329, "geo/layer_7/attn_entropy_mean": 4.648038864135742, "geo/layer_7/attn_entropy_std": 0.8218982815742493, "geo/layer_14/stable_rank_q_proj": 55.98981475830078, "geo/layer_14/stable_rank_k_proj": 35.31267166137695, "geo/layer_14/stable_rank_o_proj": 53.20828628540039, "geo/layer_14/stable_rank_gate_proj": 81.41673278808594, "geo/layer_14/stable_rank_down_proj": 134.85922241210938, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3825482428073883, "geo/layer_14/attn_entropy_mean": 5.537169456481934, "geo/layer_14/attn_entropy_std": 0.4246515929698944, "geo/layer_21/stable_rank_q_proj": 45.71509552001953, "geo/layer_21/stable_rank_k_proj": 31.037609100341797, "geo/layer_21/stable_rank_o_proj": 80.66780853271484, "geo/layer_21/stable_rank_gate_proj": 80.26654815673828, "geo/layer_21/stable_rank_down_proj": 58.4522705078125, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15084946155548096, "geo/layer_21/attn_entropy_mean": 5.731834411621094, "geo/layer_21/attn_entropy_std": 0.2942727506160736, "geo/layer_27/stable_rank_q_proj": 41.4393310546875, "geo/layer_27/stable_rank_k_proj": 31.1967716217041, "geo/layer_27/stable_rank_o_proj": 118.50343322753906, "geo/layer_27/stable_rank_gate_proj": 89.23548126220703, "geo/layer_27/stable_rank_down_proj": 136.34201049804688, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07946807891130447, "geo/layer_27/attn_entropy_mean": 4.334377288818359, "geo/layer_27/attn_entropy_std": 0.5962207317352295, "attnres/final_alpha/block_0": 0.24088707566261292, "attnres/block_norm/0": 1.6715693473815918, "attnres/final_alpha/block_1": 0.006006930489093065, "attnres/block_norm/1": 34138.01953125, "attnres/final_alpha/block_2": 0.012943753972649574, "attnres/block_norm/2": 23759.6953125, "attnres/final_alpha/block_3": 0.015244180336594582, "attnres/block_norm/3": 37106.2890625, "attnres/final_alpha/block_4": 0.018514122813940048, "attnres/block_norm/4": 10929.3271484375, "attnres/final_alpha/block_5": 0.5794795751571655, "attnres/block_norm/5": 5445.994140625, "attnres/final_alpha/block_6": 0.12692436575889587, "attnres/block_norm/6": 24664.337890625, "geo/tier1_time_s": 1.3639163970947266, "geo/step": 27825.0, "geo/rankme_slope": -4.529782616171468e-05} {"step": 27830, "timestamp": 1778224619.6449358, "train/loss": 2.216711711883545, "train/z_loss": 0.0014876192319206894, "train/perplexity": 9.177104233386224, "train/grad_norm": 0.1064453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1690433.479985535, "perf/iters_per_sec": 0.8060614967277216, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2406001329421996, "data/tokens_consumed": 58365837312, "data/tokens_consumed_B": 58.365837312, "train/loss_slope": -8.24300754259841e-06} {"step": 27840, "timestamp": 1778224630.0205112, "train/loss": 2.212554693222046, "train/z_loss": 0.0014956457307562232, "train/perplexity": 9.139034023938711, "train/grad_norm": 0.228515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022153.8967001562, "perf/iters_per_sec": 0.964238117551878, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0370882272720336, "data/tokens_consumed": 58386808832, "data/tokens_consumed_B": 58.386808832, "train/loss_slope": -8.732500843124752e-06} {"step": 27850, "timestamp": 1778224640.3911848, "grad/layer_0/attn": 0.003176316851750016, "grad/layer_0/mlp": 0.0031629859004169703, "grad/layer_0/attn_mlp_ratio": 1.0042146412697648, "grad/layer_4/attn": 0.001728287898004055, "grad/layer_4/mlp": 0.0024502845481038094, "grad/layer_4/attn_mlp_ratio": 0.7053416831964879, "grad/layer_8/attn": 0.012357870116829872, "grad/layer_8/mlp": 0.003514858428388834, "grad/layer_8/attn_mlp_ratio": 3.515894030163018, "grad/layer_12/attn": 0.004169086925685406, "grad/layer_12/mlp": 0.005993452854454517, "grad/layer_12/attn_mlp_ratio": 0.6956068492348494, "grad/layer_16/attn": 0.005449783988296986, "grad/layer_16/mlp": 0.004490986000746489, "grad/layer_16/attn_mlp_ratio": 1.2134938443454828, "grad/layer_20/attn": 0.003878530580550432, "grad/layer_20/mlp": 0.0056500728242099285, "grad/layer_20/attn_mlp_ratio": 0.6864567294222635, "grad/layer_24/attn": 0.007101472932845354, "grad/layer_24/mlp": 0.009772082790732384, "grad/layer_24/attn_mlp_ratio": 0.7267102635386183, "grad/layer_27/attn": 0.006999057251960039, "grad/layer_27/mlp": 0.008168802596628666, "grad/layer_27/attn_mlp_ratio": 0.856803317681869} {"step": 27850, "timestamp": 1778224640.4069982, "train/loss": 2.207166576385498, "train/z_loss": 0.0014865899342112244, "train/perplexity": 9.089924264218153, "train/grad_norm": 0.109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020165.4805343177, "perf/iters_per_sec": 0.9632899668380345, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.038109016418457, "data/tokens_consumed": 58407780352, "data/tokens_consumed_B": 58.407780352, "train/loss_slope": -6.3450793407358764e-06} {"step": 27860, "timestamp": 1778224650.784782, "train/loss": 2.1510504245758058, "train/z_loss": 0.0014933263533748685, "train/perplexity": 8.593880881308023, "train/grad_norm": 0.1396484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021798.0025895105, "perf/iters_per_sec": 0.9640684140155366, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0372707843780518, "data/tokens_consumed": 58428751872, "data/tokens_consumed_B": 58.428751872, "train/loss_slope": -1.0494547493994974e-05} {"step": 27870, "timestamp": 1778224661.1585844, "train/loss": 2.1687624931335447, "train/z_loss": 0.0014945918112061918, "train/perplexity": 8.747452307656417, "train/grad_norm": 0.12158203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022565.8156347282, "perf/iters_per_sec": 0.9644345358060494, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0368770122528077, "data/tokens_consumed": 58449723392, "data/tokens_consumed_B": 58.449723392, "train/loss_slope": -1.3920100336850213e-05} {"step": 27880, "timestamp": 1778224671.5437703, "train/loss": 2.1375402927398683, "train/z_loss": 0.00150615437887609, "train/perplexity": 8.478557190853326, "train/grad_norm": 0.1630859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020809.2962052377, "perf/iters_per_sec": 0.9635969620729626, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0377782821655273, "data/tokens_consumed": 58470694912, "data/tokens_consumed_B": 58.470694912, "train/loss_slope": -1.7016867356653688e-05} {"step": 27890, "timestamp": 1778224681.9340448, "train/loss": 2.1212727427482605, "train/z_loss": 0.001499446388334036, "train/perplexity": 8.341747633070446, "train/grad_norm": 0.1474609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019380.5269558623, "perf/iters_per_sec": 0.9629156718043624, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0385125398635864, "data/tokens_consumed": 58491666432, "data/tokens_consumed_B": 58.491666432, "train/loss_slope": -2.008738527775809e-05} {"step": 27900, "timestamp": 1778224692.3097384, "grad/layer_0/attn": 0.002453708089888096, "grad/layer_0/mlp": 0.0025007473304867744, "grad/layer_0/attn_mlp_ratio": 0.981189887461157, "grad/layer_4/attn": 0.0025471358094364405, "grad/layer_4/mlp": 0.0023957404773682356, "grad/layer_4/attn_mlp_ratio": 1.063193499955039, "grad/layer_8/attn": 0.004013508092612028, "grad/layer_8/mlp": 0.003581542056053877, "grad/layer_8/attn_mlp_ratio": 1.1206089214468689, "grad/layer_12/attn": 0.0045368908904492855, "grad/layer_12/mlp": 0.0060530188493430614, "grad/layer_12/attn_mlp_ratio": 0.7495253076882699, "grad/layer_16/attn": 0.010265386663377285, "grad/layer_16/mlp": 0.0046575311571359634, "grad/layer_16/attn_mlp_ratio": 2.2040403159182955, "grad/layer_20/attn": 0.003913975786417723, "grad/layer_20/mlp": 0.0062797158025205135, "grad/layer_20/attn_mlp_ratio": 0.6232727478717238, "grad/layer_24/attn": 0.005838977172970772, "grad/layer_24/mlp": 0.008333862759172916, "grad/layer_24/attn_mlp_ratio": 0.7006327403796819, "grad/layer_27/attn": 0.01097638439387083, "grad/layer_27/mlp": 0.007077016867697239, "grad/layer_27/attn_mlp_ratio": 1.5509902610057456} {"step": 27900, "timestamp": 1778224692.92565, "eos/sharpness": 24.946236610412594, "eos/L0_probe": 2.025294542312622, "eos/L_plus": 2.175018787384033, "eos/L_minus": 2.125032663345337, "eos/grad_norm": 0.12539266049861908, "eos/embed_grad_frac": 0.14307135343551636, "eos/time_s": 0.6131775379180908} {"step": 27900, "timestamp": 1778224692.9457645, "train/loss": 2.261950206756592, "train/z_loss": 0.0014848232152871788, "train/perplexity": 9.60179640757102, "train/grad_norm": 0.125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1905567.7718881855, "perf/iters_per_sec": 0.9086455211106231, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1005391836166383, "data/tokens_consumed": 58512637952, "data/tokens_consumed_B": 58.512637952, "train/loss_slope": -1.902873114784162e-05} {"step": 27900, "timestamp": 1778224694.312939, "geo/rankme_last": 439.8074951171875, "geo/layer_0/stable_rank_q_proj": 18.276945114135742, "geo/layer_0/stable_rank_k_proj": 15.806775093078613, "geo/layer_0/stable_rank_o_proj": 50.66263198852539, "geo/layer_0/stable_rank_gate_proj": 143.87464904785156, "geo/layer_0/stable_rank_down_proj": 51.47610855102539, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05007757991552353, "geo/layer_0/attn_entropy_mean": 6.222357273101807, "geo/layer_0/attn_entropy_std": 0.3342605531215668, "geo/layer_7/stable_rank_q_proj": 42.1751823425293, "geo/layer_7/stable_rank_k_proj": 41.885807037353516, "geo/layer_7/stable_rank_o_proj": 106.27559661865234, "geo/layer_7/stable_rank_gate_proj": 97.46598052978516, "geo/layer_7/stable_rank_down_proj": 148.06671142578125, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5327353477478027, "geo/layer_7/attn_entropy_mean": 4.628573417663574, "geo/layer_7/attn_entropy_std": 0.8185478448867798, "geo/layer_14/stable_rank_q_proj": 55.75705337524414, "geo/layer_14/stable_rank_k_proj": 35.31208801269531, "geo/layer_14/stable_rank_o_proj": 53.279754638671875, "geo/layer_14/stable_rank_gate_proj": 81.38609313964844, "geo/layer_14/stable_rank_down_proj": 134.9901885986328, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3570939004421234, "geo/layer_14/attn_entropy_mean": 5.479187965393066, "geo/layer_14/attn_entropy_std": 0.4040316045284271, "geo/layer_21/stable_rank_q_proj": 45.80787658691406, "geo/layer_21/stable_rank_k_proj": 31.22346305847168, "geo/layer_21/stable_rank_o_proj": 80.82235717773438, "geo/layer_21/stable_rank_gate_proj": 80.23793029785156, "geo/layer_21/stable_rank_down_proj": 58.52777862548828, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15246576070785522, "geo/layer_21/attn_entropy_mean": 5.740478038787842, "geo/layer_21/attn_entropy_std": 0.2846432328224182, "geo/layer_27/stable_rank_q_proj": 41.56025695800781, "geo/layer_27/stable_rank_k_proj": 31.237749099731445, "geo/layer_27/stable_rank_o_proj": 118.90522766113281, "geo/layer_27/stable_rank_gate_proj": 89.14615631103516, "geo/layer_27/stable_rank_down_proj": 136.02294921875, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08780299872159958, "geo/layer_27/attn_entropy_mean": 4.365365505218506, "geo/layer_27/attn_entropy_std": 0.6100482940673828, "attnres/final_alpha/block_0": 0.24020196497440338, "attnres/block_norm/0": 1.6720032691955566, "attnres/final_alpha/block_1": 0.005916462279856205, "attnres/block_norm/1": 34523.33984375, "attnres/final_alpha/block_2": 0.012749791145324707, "attnres/block_norm/2": 23848.521484375, "attnres/final_alpha/block_3": 0.01486991811543703, "attnres/block_norm/3": 37623.0, "attnres/final_alpha/block_4": 0.018786316737532616, "attnres/block_norm/4": 10951.1123046875, "attnres/final_alpha/block_5": 0.5833341479301453, "attnres/block_norm/5": 5393.62744140625, "attnres/final_alpha/block_6": 0.1241413950920105, "attnres/block_norm/6": 24924.984375, "geo/tier1_time_s": 1.3629395961761475, "geo/step": 27900.0, "geo/rankme_slope": -5.221234978366347e-05} {"step": 27910, "timestamp": 1778224704.6954174, "train/loss": 2.175143098831177, "train/z_loss": 0.0015021772822365164, "train/perplexity": 8.803444794717079, "train/grad_norm": 0.103515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1785483.1689863799, "perf/iters_per_sec": 0.8513847203189754, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.174557137489319, "data/tokens_consumed": 58533609472, "data/tokens_consumed_B": 58.533609472, "train/loss_slope": -1.989636451247355e-05} {"step": 27920, "timestamp": 1778224715.0798776, "train/loss": 2.2167325019836426, "train/z_loss": 0.0014907145057804882, "train/perplexity": 9.177295028285156, "train/grad_norm": 0.16015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020414.1957375684, "perf/iters_per_sec": 0.9634085634887545, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0379812240600585, "data/tokens_consumed": 58554580992, "data/tokens_consumed_B": 58.554580992, "train/loss_slope": -1.665540197894907e-05} {"step": 27930, "timestamp": 1778224725.4657278, "train/loss": 2.1656620502471924, "train/z_loss": 0.0014826647704467178, "train/perplexity": 8.72037333147691, "train/grad_norm": 0.205078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020217.1210230698, "perf/iters_per_sec": 0.9633145909419393, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.038082480430603, "data/tokens_consumed": 58575552512, "data/tokens_consumed_B": 58.575552512, "train/loss_slope": -1.797998302733253e-05} {"step": 27940, "timestamp": 1778224735.8438947, "train/loss": 2.2044140577316282, "train/z_loss": 0.0014913875493220985, "train/perplexity": 9.064938480811188, "train/grad_norm": 0.1748046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021686.3379941108, "perf/iters_per_sec": 0.9640151681871942, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373280763626098, "data/tokens_consumed": 58596524032, "data/tokens_consumed_B": 58.596524032, "train/loss_slope": -1.8112490365285587e-05} {"step": 27950, "timestamp": 1778224746.2202628, "grad/layer_0/attn": 0.0035499429795891047, "grad/layer_0/mlp": 0.0029531687032431364, "grad/layer_0/attn_mlp_ratio": 1.202079263363001, "grad/layer_4/attn": 0.0019986701663583517, "grad/layer_4/mlp": 0.0026184755843132734, "grad/layer_4/attn_mlp_ratio": 0.763295293644297, "grad/layer_8/attn": 0.004482778254896402, "grad/layer_8/mlp": 0.0037790643982589245, "grad/layer_8/attn_mlp_ratio": 1.1862137460108686, "grad/layer_12/attn": 0.0049663083627820015, "grad/layer_12/mlp": 0.006473594810813665, "grad/layer_12/attn_mlp_ratio": 0.7671639067940671, "grad/layer_16/attn": 0.0036771378945559263, "grad/layer_16/mlp": 0.004692495800554752, "grad/layer_16/attn_mlp_ratio": 0.7836209071852809, "grad/layer_20/attn": 0.0037830073852092028, "grad/layer_20/mlp": 0.006584829185158014, "grad/layer_20/attn_mlp_ratio": 0.5745034869371596, "grad/layer_24/attn": 0.017964042723178864, "grad/layer_24/mlp": 0.012638346292078495, "grad/layer_24/attn_mlp_ratio": 1.4213918629765068, "grad/layer_27/attn": 0.016165336593985558, "grad/layer_27/mlp": 0.011443271301686764, "grad/layer_27/attn_mlp_ratio": 1.4126499343188479} {"step": 27950, "timestamp": 1778224746.236011, "train/loss": 2.2433281421661375, "train/z_loss": 0.0014628418372012674, "train/perplexity": 9.424645709901325, "train/grad_norm": 0.25390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2018773.250235544, "perf/iters_per_sec": 0.962626099698803, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0388249397277831, "data/tokens_consumed": 58617495552, "data/tokens_consumed_B": 58.617495552, "train/loss_slope": -1.72729565007816e-05} {"step": 27960, "timestamp": 1778224756.619392, "train/loss": 2.197815752029419, "train/z_loss": 0.0014812899753451347, "train/perplexity": 9.005322145242586, "train/grad_norm": 0.09423828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020938.6007638727, "perf/iters_per_sec": 0.9636586192912449, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0377118825912475, "data/tokens_consumed": 58638467072, "data/tokens_consumed_B": 58.638467072, "train/loss_slope": -1.5173638821935766e-05} {"step": 27970, "timestamp": 1778224766.9958792, "train/loss": 2.2225608825683594, "train/z_loss": 0.0014790618908591569, "train/perplexity": 9.230939976174694, "train/grad_norm": 0.1025390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021955.0416311221, "perf/iters_per_sec": 0.9641432960658656, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0371902227401733, "data/tokens_consumed": 58659438592, "data/tokens_consumed_B": 58.659438592, "train/loss_slope": -1.3783966497083043e-05} {"step": 27975, "timestamp": 1778224772.7912707, "eos/sharpness": 58.36384296417235, "eos/L0_probe": 2.024632692337036, "eos/L_plus": 2.2621207237243652, "eos/L_minus": 2.3707830905914307, "eos/grad_norm": 0.17197883129119873, "eos/embed_grad_frac": 0.07498181611299515, "eos/time_s": 0.611375093460083} {"step": 27975, "timestamp": 1778224774.1775568, "geo/rankme_last": 440.6372985839844, "geo/layer_0/stable_rank_q_proj": 18.306928634643555, "geo/layer_0/stable_rank_k_proj": 15.851564407348633, "geo/layer_0/stable_rank_o_proj": 50.58599853515625, "geo/layer_0/stable_rank_gate_proj": 144.13653564453125, "geo/layer_0/stable_rank_down_proj": 51.274070739746094, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.053185611963272095, "geo/layer_0/attn_entropy_mean": 6.220620155334473, "geo/layer_0/attn_entropy_std": 0.33482441306114197, "geo/layer_7/stable_rank_q_proj": 42.11325454711914, "geo/layer_7/stable_rank_k_proj": 42.04825973510742, "geo/layer_7/stable_rank_o_proj": 106.02327728271484, "geo/layer_7/stable_rank_gate_proj": 97.3996353149414, "geo/layer_7/stable_rank_down_proj": 147.98475646972656, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5265136957168579, "geo/layer_7/attn_entropy_mean": 4.659552574157715, "geo/layer_7/attn_entropy_std": 0.8344561457633972, "geo/layer_14/stable_rank_q_proj": 55.797203063964844, "geo/layer_14/stable_rank_k_proj": 35.38750457763672, "geo/layer_14/stable_rank_o_proj": 53.244384765625, "geo/layer_14/stable_rank_gate_proj": 81.31128692626953, "geo/layer_14/stable_rank_down_proj": 134.60891723632812, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3685726821422577, "geo/layer_14/attn_entropy_mean": 5.505771636962891, "geo/layer_14/attn_entropy_std": 0.41901716589927673, "geo/layer_21/stable_rank_q_proj": 45.767852783203125, "geo/layer_21/stable_rank_k_proj": 31.252456665039062, "geo/layer_21/stable_rank_o_proj": 80.91490173339844, "geo/layer_21/stable_rank_gate_proj": 80.18487548828125, "geo/layer_21/stable_rank_down_proj": 58.44062805175781, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1505182683467865, "geo/layer_21/attn_entropy_mean": 5.739086151123047, "geo/layer_21/attn_entropy_std": 0.2944503128528595, "geo/layer_27/stable_rank_q_proj": 41.5709228515625, "geo/layer_27/stable_rank_k_proj": 31.182846069335938, "geo/layer_27/stable_rank_o_proj": 118.9930648803711, "geo/layer_27/stable_rank_gate_proj": 89.11937713623047, "geo/layer_27/stable_rank_down_proj": 135.90301513671875, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08147237449884415, "geo/layer_27/attn_entropy_mean": 4.356501579284668, "geo/layer_27/attn_entropy_std": 0.6216376423835754, "attnres/final_alpha/block_0": 0.2398771494626999, "attnres/block_norm/0": 1.6724905967712402, "attnres/final_alpha/block_1": 0.005911952815949917, "attnres/block_norm/1": 34511.73046875, "attnres/final_alpha/block_2": 0.012768741697072983, "attnres/block_norm/2": 23838.625, "attnres/final_alpha/block_3": 0.014902198687195778, "attnres/block_norm/3": 37760.3984375, "attnres/final_alpha/block_4": 0.018832296133041382, "attnres/block_norm/4": 10967.0146484375, "attnres/final_alpha/block_5": 0.5797865390777588, "attnres/block_norm/5": 5477.677734375, "attnres/final_alpha/block_6": 0.12792116403579712, "attnres/block_norm/6": 24762.2421875, "geo/tier1_time_s": 1.3658788204193115, "geo/step": 27975.0, "geo/rankme_slope": -4.378202452856142e-05} {"step": 27980, "timestamp": 1778224779.3739507, "train/loss": 2.2107373237609864, "train/z_loss": 0.0014887295896187426, "train/perplexity": 9.122440105807291, "train/grad_norm": 0.185546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1695051.710036452, "perf/iters_per_sec": 0.8082636404211292, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2372200727462768, "data/tokens_consumed": 58680410112, "data/tokens_consumed_B": 58.680410112, "train/loss_slope": -1.3310957255393477e-05} {"step": 27990, "timestamp": 1778224789.75702, "train/loss": 2.1979361295700075, "train/z_loss": 0.0014881618553772569, "train/perplexity": 9.0064062490242, "train/grad_norm": 0.17578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021270.7362658163, "perf/iters_per_sec": 0.96381699384013, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0375413656234742, "data/tokens_consumed": 58701381632, "data/tokens_consumed_B": 58.701381632, "train/loss_slope": -1.4313710833897691e-05} {"step": 28000, "timestamp": 1778224800.1239967, "grad/layer_0/attn": 0.0031872165855020285, "grad/layer_0/mlp": 0.0028983522206544876, "grad/layer_0/attn_mlp_ratio": 1.0996649933788283, "grad/layer_4/attn": 0.0021549195516854525, "grad/layer_4/mlp": 0.0024765331763774157, "grad/layer_4/attn_mlp_ratio": 0.8701355125086757, "grad/layer_8/attn": 0.008586643263697624, "grad/layer_8/mlp": 0.003792324336245656, "grad/layer_8/attn_mlp_ratio": 2.264216421366699, "grad/layer_12/attn": 0.0041346475481987, "grad/layer_12/mlp": 0.006379852537065744, "grad/layer_12/attn_mlp_ratio": 0.6480788481190264, "grad/layer_16/attn": 0.00595187908038497, "grad/layer_16/mlp": 0.004620829131454229, "grad/layer_16/attn_mlp_ratio": 1.2880543258058996, "grad/layer_20/attn": 0.006114623975008726, "grad/layer_20/mlp": 0.006155375391244888, "grad/layer_20/attn_mlp_ratio": 0.9933795239146459, "grad/layer_24/attn": 0.008756720460951328, "grad/layer_24/mlp": 0.011665972881019115, "grad/layer_24/attn_mlp_ratio": 0.7506206704917603, "grad/layer_27/attn": 0.007293534930795431, "grad/layer_27/mlp": 0.009516937658190727, "grad/layer_27/attn_mlp_ratio": 0.7663741337930123} {"step": 28000, "timestamp": 1778224800.1397643, "train/loss": 2.144740843772888, "train/z_loss": 0.0014851278974674642, "train/perplexity": 8.539827800940497, "train/grad_norm": 0.1279296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020825.0347266472, "perf/iters_per_sec": 0.9636044667847858, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0377701997756958, "data/tokens_consumed": 58722353152, "data/tokens_consumed_B": 58.722353152, "train/loss_slope": -1.5413757204615998e-05} {"step": 28000, "timestamp": 1778224807.1708493, "geo/ww_alpha_mean": 7.577347574179914, "geo/ww_alpha_std": 4.045785310260781, "geo/ww_alpha_min": 1.3571449870311272, "geo/ww_alpha_max": 24.523090596488856, "geo/ww_alpha_healthy_frac": 0.15228426395939088, "geo/ww_alpha_by_type/q_proj": 4.047078886434684, "geo/ww_alpha_by_type/k_proj": 4.615852963399299, "geo/ww_alpha_by_type/v_proj": 7.997720900584341, "geo/ww_alpha_by_type/o_proj": 6.742306141045259, "geo/ww_alpha_by_type/gate_proj": 8.470499427672056, "geo/ww_alpha_by_type/up_proj": 12.177931355841887, "geo/ww_alpha_by_type/down_proj": 9.103222018102837, "geo/twonn_id/layer_0": 0.7645363807678223, "geo/twonn_id/layer_7": 2.8758955001831055, "geo/twonn_id/layer_14": 4.198861122131348, "geo/twonn_id/layer_21": 6.182560920715332, "geo/twonn_id/layer_27": 5.966292858123779, "geo/tier2_time_s": 7.0238938331604} {"step": 28000, "timestamp": 1778224807.8033388, "eoc/jacobian_sigma/layer_0/attn": 966.9615478515625, "eoc/jacobian_sigma/layer_0/mlp": 6104.31591796875, "eoc/jacobian_sigma/layer_0": 6104.31591796875, "eoc/jacobian_sigma/layer_7/attn": 1.142985224723816, "eoc/jacobian_sigma/layer_7/mlp": 1.7171025276184082, "eoc/jacobian_sigma/layer_7": 1.7171025276184082, "eoc/jacobian_sigma/layer_14/attn": 1.68097722530365, "eoc/jacobian_sigma/layer_14/mlp": 12.345038414001465, "eoc/jacobian_sigma/layer_14": 12.345038414001465, "eoc/jacobian_sigma/layer_21/attn": 1.093437671661377, "eoc/jacobian_sigma/layer_21/mlp": 4.164409637451172, "eoc/jacobian_sigma/layer_21": 4.164409637451172, "eoc/jacobian_sigma/layer_27/attn": 3.3208937644958496, "eoc/jacobian_sigma/layer_27/mlp": 26.572742462158203, "eoc/jacobian_sigma/layer_27": 26.572742462158203, "eoc/layer0_sigma": 6104.31591796875, "eoc/sigma_max": 26.572742462158203, "eoc/sigma_min": 1.7171025276184082, "eoc/sigma_mean": 11.199823260307312, "eoc/time_s": 0.6263840198516846} {"step": 28010, "timestamp": 1778224818.1962228, "train/loss": 2.144067168235779, "train/z_loss": 0.0014823377947323023, "train/perplexity": 8.53407666527709, "train/grad_norm": 0.18359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1161810.2151317971, "perf/iters_per_sec": 0.5539942813548074, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.8050727844238281, "data/tokens_consumed": 58743324672, "data/tokens_consumed_B": 58.743324672, "train/loss_slope": -1.572839683718123e-05} {"step": 28020, "timestamp": 1778224828.5782948, "train/loss": 2.164533257484436, "train/z_loss": 0.0014904479146935045, "train/perplexity": 8.710535390714574, "train/grad_norm": 0.09765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021008.2974444258, "perf/iters_per_sec": 0.963691853258336, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0376760959625244, "data/tokens_consumed": 58764296192, "data/tokens_consumed_B": 58.764296192, "train/loss_slope": -1.6629301332118342e-05} {"step": 28030, "timestamp": 1778224838.9696083, "train/loss": 2.1896633386611937, "train/z_loss": 0.0014863049611449242, "train/perplexity": 8.932205480194572, "train/grad_norm": 0.1494140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019224.258378636, "perf/iters_per_sec": 0.9628411571400814, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0385929107666017, "data/tokens_consumed": 58785267712, "data/tokens_consumed_B": 58.785267712, "train/loss_slope": -1.715791223287366e-05} {"step": 28040, "timestamp": 1778224849.3545516, "train/loss": 2.212083911895752, "train/z_loss": 0.0014735263655893504, "train/perplexity": 9.13473254998613, "train/grad_norm": 0.158203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020965.7173554017, "perf/iters_per_sec": 0.9636715494896897, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037697958946228, "data/tokens_consumed": 58806239232, "data/tokens_consumed_B": 58.806239232, "train/loss_slope": -1.3517005773338856e-05} {"step": 28050, "timestamp": 1778224859.71793, "grad/layer_0/attn": 0.002428169595077634, "grad/layer_0/mlp": 0.002382091013714671, "grad/layer_0/attn_mlp_ratio": 1.019343710699255, "grad/layer_4/attn": 0.001676480402238667, "grad/layer_4/mlp": 0.002341067185625434, "grad/layer_4/attn_mlp_ratio": 0.7161179913676776, "grad/layer_8/attn": 0.003912437707185745, "grad/layer_8/mlp": 0.003559232922270894, "grad/layer_8/attn_mlp_ratio": 1.099236178891568, "grad/layer_12/attn": 0.004311581142246723, "grad/layer_12/mlp": 0.006996401585638523, "grad/layer_12/attn_mlp_ratio": 0.6162569469241715, "grad/layer_16/attn": 0.004030941985547543, "grad/layer_16/mlp": 0.004349308088421822, "grad/layer_16/attn_mlp_ratio": 0.926800725751789, "grad/layer_20/attn": 0.007467837538570166, "grad/layer_20/mlp": 0.005398271605372429, "grad/layer_20/attn_mlp_ratio": 1.383375633193505, "grad/layer_24/attn": 0.004344489891082048, "grad/layer_24/mlp": 0.007835355587303638, "grad/layer_24/attn_mlp_ratio": 0.5544725810114572, "grad/layer_27/attn": 0.005808137357234955, "grad/layer_27/mlp": 0.0064606573432683945, "grad/layer_27/attn_mlp_ratio": 0.8990009775687264} {"step": 28050, "timestamp": 1778224860.3620248, "eos/sharpness": 4.234790802001952, "eos/L0_probe": 2.0185816287994385, "eos/L_plus": 2.0441150665283203, "eos/L_minus": 2.035396099090576, "eos/grad_norm": 0.08759721368551254, "eos/embed_grad_frac": 0.30290687084198, "eos/time_s": 0.641284704208374} {"step": 28050, "timestamp": 1778224860.3841076, "train/loss": 2.1306163907051086, "train/z_loss": 0.0014912794693373143, "train/perplexity": 8.42005525601766, "train/grad_norm": 0.08740234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1902262.957955178, "perf/iters_per_sec": 0.9070696630264178, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1024511575698852, "data/tokens_consumed": 58827210752, "data/tokens_consumed_B": 58.827210752, "train/loss_slope": -1.6689405368320795e-05} {"step": 28050, "timestamp": 1778224861.753812, "geo/rankme_last": 440.4969482421875, "geo/layer_0/stable_rank_q_proj": 18.33190155029297, "geo/layer_0/stable_rank_k_proj": 15.87503433227539, "geo/layer_0/stable_rank_o_proj": 50.455345153808594, "geo/layer_0/stable_rank_gate_proj": 144.2577667236328, "geo/layer_0/stable_rank_down_proj": 51.32088088989258, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05177243798971176, "geo/layer_0/attn_entropy_mean": 6.226291656494141, "geo/layer_0/attn_entropy_std": 0.3361685574054718, "geo/layer_7/stable_rank_q_proj": 42.06372833251953, "geo/layer_7/stable_rank_k_proj": 42.02607727050781, "geo/layer_7/stable_rank_o_proj": 105.94216918945312, "geo/layer_7/stable_rank_gate_proj": 97.44212341308594, "geo/layer_7/stable_rank_down_proj": 148.00970458984375, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.538625180721283, "geo/layer_7/attn_entropy_mean": 4.6628336906433105, "geo/layer_7/attn_entropy_std": 0.8479504585266113, "geo/layer_14/stable_rank_q_proj": 55.63100814819336, "geo/layer_14/stable_rank_k_proj": 35.303401947021484, "geo/layer_14/stable_rank_o_proj": 53.2655143737793, "geo/layer_14/stable_rank_gate_proj": 81.3210220336914, "geo/layer_14/stable_rank_down_proj": 134.50869750976562, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.384374737739563, "geo/layer_14/attn_entropy_mean": 5.48974609375, "geo/layer_14/attn_entropy_std": 0.4247119128704071, "geo/layer_21/stable_rank_q_proj": 45.677268981933594, "geo/layer_21/stable_rank_k_proj": 31.211353302001953, "geo/layer_21/stable_rank_o_proj": 81.05242156982422, "geo/layer_21/stable_rank_gate_proj": 80.09075927734375, "geo/layer_21/stable_rank_down_proj": 58.413818359375, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15003688633441925, "geo/layer_21/attn_entropy_mean": 5.7379560470581055, "geo/layer_21/attn_entropy_std": 0.28554362058639526, "geo/layer_27/stable_rank_q_proj": 41.57554244995117, "geo/layer_27/stable_rank_k_proj": 31.217361450195312, "geo/layer_27/stable_rank_o_proj": 119.31777954101562, "geo/layer_27/stable_rank_gate_proj": 89.22205352783203, "geo/layer_27/stable_rank_down_proj": 135.576904296875, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0786101371049881, "geo/layer_27/attn_entropy_mean": 4.343008041381836, "geo/layer_27/attn_entropy_std": 0.6058141589164734, "attnres/final_alpha/block_0": 0.23856334388256073, "attnres/block_norm/0": 1.673029899597168, "attnres/final_alpha/block_1": 0.005883256904780865, "attnres/block_norm/1": 34492.703125, "attnres/final_alpha/block_2": 0.012476368807256222, "attnres/block_norm/2": 23866.94921875, "attnres/final_alpha/block_3": 0.01467522606253624, "attnres/block_norm/3": 37739.6328125, "attnres/final_alpha/block_4": 0.018242936581373215, "attnres/block_norm/4": 11035.853515625, "attnres/final_alpha/block_5": 0.5850110054016113, "attnres/block_norm/5": 5473.01953125, "attnres/final_alpha/block_6": 0.12514781951904297, "attnres/block_norm/6": 24814.109375, "geo/tier1_time_s": 1.366307020187378, "geo/step": 28050.0, "geo/rankme_slope": -3.288692430097039e-05} {"step": 28060, "timestamp": 1778224872.129705, "train/loss": 2.107508969306946, "train/z_loss": 0.0014895973727107048, "train/perplexity": 8.227720232320008, "train/grad_norm": 0.1572265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1785965.473406502, "perf/iters_per_sec": 0.8516147009880553, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.174239945411682, "data/tokens_consumed": 58848182272, "data/tokens_consumed_B": 58.848182272, "train/loss_slope": -2.2201708684338128e-05} {"step": 28070, "timestamp": 1778224882.5108275, "train/loss": 2.225811743736267, "train/z_loss": 0.0014805217273533345, "train/perplexity": 9.260997310125669, "train/grad_norm": 0.1611328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021286.1568405416, "perf/iters_per_sec": 0.9638243469431599, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037533450126648, "data/tokens_consumed": 58869153792, "data/tokens_consumed_B": 58.869153792, "train/loss_slope": -2.1772813704004517e-05} {"step": 28080, "timestamp": 1778224892.8926585, "train/loss": 2.228142428398132, "train/z_loss": 0.0014956697588786483, "train/perplexity": 9.282606947352845, "train/grad_norm": 0.0986328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020969.8499135582, "perf/iters_per_sec": 0.9636735200469772, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037695837020874, "data/tokens_consumed": 58890125312, "data/tokens_consumed_B": 58.890125312, "train/loss_slope": -1.447732225395772e-05} {"step": 28090, "timestamp": 1778224903.2690146, "train/loss": 2.2273486614227296, "train/z_loss": 0.0014756023418158292, "train/perplexity": 9.27524164406637, "train/grad_norm": 0.1884765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022360.509402302, "perf/iters_per_sec": 0.964336638165618, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036982274055481, "data/tokens_consumed": 58911096832, "data/tokens_consumed_B": 58.911096832, "train/loss_slope": -1.3291027829913865e-05} {"step": 28100, "timestamp": 1778224913.6337004, "grad/layer_0/attn": 0.002781784860417247, "grad/layer_0/mlp": 0.0029548206366598606, "grad/layer_0/attn_mlp_ratio": 0.9414394673437232, "grad/layer_4/attn": 0.0017922351835295558, "grad/layer_4/mlp": 0.0024937435518950224, "grad/layer_4/attn_mlp_ratio": 0.7186926299211297, "grad/layer_8/attn": 0.004656799137592316, "grad/layer_8/mlp": 0.0036048279143869877, "grad/layer_8/attn_mlp_ratio": 1.2918228328804804, "grad/layer_12/attn": 0.004366382956504822, "grad/layer_12/mlp": 0.006260163616389036, "grad/layer_12/attn_mlp_ratio": 0.6974870233942416, "grad/layer_16/attn": 0.00491742417216301, "grad/layer_16/mlp": 0.0048009189777076244, "grad/layer_16/attn_mlp_ratio": 1.0242672481184614, "grad/layer_20/attn": 0.00744960131123662, "grad/layer_20/mlp": 0.006295339670032263, "grad/layer_20/attn_mlp_ratio": 1.1833517464297947, "grad/layer_24/attn": 0.01215896662324667, "grad/layer_24/mlp": 0.011085095815360546, "grad/layer_24/attn_mlp_ratio": 1.0968751841288147, "grad/layer_27/attn": 0.0046968054957687855, "grad/layer_27/mlp": 0.01049040723592043, "grad/layer_27/attn_mlp_ratio": 0.44772384382886193} {"step": 28100, "timestamp": 1778224913.6501868, "train/loss": 2.177388882637024, "train/z_loss": 0.001484088005963713, "train/perplexity": 8.823237645385632, "train/grad_norm": 0.1552734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021302.5530711547, "perf/iters_per_sec": 0.9638321652751707, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0375250339508058, "data/tokens_consumed": 58932068352, "data/tokens_consumed_B": 58.932068352, "train/loss_slope": -1.2036965961324645e-05} {"step": 28110, "timestamp": 1778224924.0307555, "train/loss": 2.1880579710006716, "train/z_loss": 0.00148245666641742, "train/perplexity": 8.917877510286472, "train/grad_norm": 0.09521484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021354.344611855, "perf/iters_per_sec": 0.9638568614062571, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037498450279236, "data/tokens_consumed": 58953039872, "data/tokens_consumed_B": 58.953039872, "train/loss_slope": -1.238445599730792e-05} {"step": 28120, "timestamp": 1778224934.408138, "train/loss": 2.218333601951599, "train/z_loss": 0.0014756495598703622, "train/perplexity": 9.192000564436107, "train/grad_norm": 0.2158203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022238.0895339893, "perf/iters_per_sec": 0.9642782638235041, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0370450496673584, "data/tokens_consumed": 58974011392, "data/tokens_consumed_B": 58.974011392, "train/loss_slope": -1.1894298701396433e-05} {"step": 28125, "timestamp": 1778224940.2211425, "eos/sharpness": 12.49728202819824, "eos/L0_probe": 2.021510124206543, "eos/L_plus": 2.0940163135528564, "eos/L_minus": 2.073976755142212, "eos/grad_norm": 0.09896272420883179, "eos/embed_grad_frac": 0.22365184128284454, "eos/time_s": 0.6316132545471191} {"step": 28125, "timestamp": 1778224941.6037586, "geo/rankme_last": 440.7955017089844, "geo/layer_0/stable_rank_q_proj": 18.355321884155273, "geo/layer_0/stable_rank_k_proj": 15.878885269165039, "geo/layer_0/stable_rank_o_proj": 50.45448684692383, "geo/layer_0/stable_rank_gate_proj": 144.0216522216797, "geo/layer_0/stable_rank_down_proj": 51.33843231201172, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05682911351323128, "geo/layer_0/attn_entropy_mean": 6.226576805114746, "geo/layer_0/attn_entropy_std": 0.33987921476364136, "geo/layer_7/stable_rank_q_proj": 42.03716278076172, "geo/layer_7/stable_rank_k_proj": 42.21162033081055, "geo/layer_7/stable_rank_o_proj": 106.33795928955078, "geo/layer_7/stable_rank_gate_proj": 97.28410339355469, "geo/layer_7/stable_rank_down_proj": 147.73446655273438, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5330805778503418, "geo/layer_7/attn_entropy_mean": 4.626993179321289, "geo/layer_7/attn_entropy_std": 0.817677915096283, "geo/layer_14/stable_rank_q_proj": 55.67243576049805, "geo/layer_14/stable_rank_k_proj": 35.289161682128906, "geo/layer_14/stable_rank_o_proj": 53.26715087890625, "geo/layer_14/stable_rank_gate_proj": 81.10884857177734, "geo/layer_14/stable_rank_down_proj": 134.9039306640625, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3640690743923187, "geo/layer_14/attn_entropy_mean": 5.495577812194824, "geo/layer_14/attn_entropy_std": 0.42241373658180237, "geo/layer_21/stable_rank_q_proj": 45.58140563964844, "geo/layer_21/stable_rank_k_proj": 31.207195281982422, "geo/layer_21/stable_rank_o_proj": 81.05288696289062, "geo/layer_21/stable_rank_gate_proj": 80.04621124267578, "geo/layer_21/stable_rank_down_proj": 58.35457229614258, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1541808545589447, "geo/layer_21/attn_entropy_mean": 5.732425212860107, "geo/layer_21/attn_entropy_std": 0.2802996039390564, "geo/layer_27/stable_rank_q_proj": 41.56350326538086, "geo/layer_27/stable_rank_k_proj": 30.99701499938965, "geo/layer_27/stable_rank_o_proj": 119.29308319091797, "geo/layer_27/stable_rank_gate_proj": 89.26798248291016, "geo/layer_27/stable_rank_down_proj": 135.8188934326172, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07880976051092148, "geo/layer_27/attn_entropy_mean": 4.34085750579834, "geo/layer_27/attn_entropy_std": 0.6034785509109497, "attnres/final_alpha/block_0": 0.23923854529857635, "attnres/block_norm/0": 1.6734857559204102, "attnres/final_alpha/block_1": 0.005948184989392757, "attnres/block_norm/1": 34471.6875, "attnres/final_alpha/block_2": 0.012518556788563728, "attnres/block_norm/2": 23840.115234375, "attnres/final_alpha/block_3": 0.01472117193043232, "attnres/block_norm/3": 37737.67578125, "attnres/final_alpha/block_4": 0.01828855462372303, "attnres/block_norm/4": 10969.77734375, "attnres/final_alpha/block_5": 0.5835572481155396, "attnres/block_norm/5": 5468.5791015625, "attnres/final_alpha/block_6": 0.12572777271270752, "attnres/block_norm/6": 25031.1484375, "geo/tier1_time_s": 1.3627920150756836, "geo/step": 28125.0, "geo/rankme_slope": -2.8692688012705085e-05} {"step": 28130, "timestamp": 1778224946.7936225, "train/loss": 2.128489065170288, "train/z_loss": 0.0014807004132308065, "train/perplexity": 8.402162096502478, "train/grad_norm": 0.11962890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1694159.6221640268, "perf/iters_per_sec": 0.8078382597751745, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2378715515136718, "data/tokens_consumed": 58994982912, "data/tokens_consumed_B": 58.994982912, "train/loss_slope": -1.7005874679283847e-05} {"step": 28140, "timestamp": 1778224957.1693237, "train/loss": 2.1817276954650877, "train/z_loss": 0.001493189379107207, "train/perplexity": 8.861603192343251, "train/grad_norm": 0.11962890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022177.8846675132, "perf/iters_per_sec": 0.9642495559060636, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037075924873352, "data/tokens_consumed": 59015954432, "data/tokens_consumed_B": 59.015954432, "train/loss_slope": -1.627946616005974e-05} {"step": 28150, "timestamp": 1778224967.5330162, "grad/layer_0/attn": 0.0029155041556805372, "grad/layer_0/mlp": 0.002914754208177328, "grad/layer_0/attn_mlp_ratio": 1.0002572592486116, "grad/layer_4/attn": 0.0022956766188144684, "grad/layer_4/mlp": 0.0025472010020166636, "grad/layer_4/attn_mlp_ratio": 0.9012545640769946, "grad/layer_8/attn": 0.003752609482035041, "grad/layer_8/mlp": 0.0036506641190499067, "grad/layer_8/attn_mlp_ratio": 1.027925127283184, "grad/layer_12/attn": 0.005283404607325792, "grad/layer_12/mlp": 0.00640160171315074, "grad/layer_12/attn_mlp_ratio": 0.8253254047248226, "grad/layer_16/attn": 0.0038868682458996773, "grad/layer_16/mlp": 0.00506178941577673, "grad/layer_16/attn_mlp_ratio": 0.7678842104723984, "grad/layer_20/attn": 0.004574864637106657, "grad/layer_20/mlp": 0.007136850617825985, "grad/layer_20/attn_mlp_ratio": 0.641020082664731, "grad/layer_24/attn": 0.014940234832465649, "grad/layer_24/mlp": 0.01631993055343628, "grad/layer_24/attn_mlp_ratio": 0.9154594556637943, "grad/layer_27/attn": 0.009992877952754498, "grad/layer_27/mlp": 0.015846602618694305, "grad/layer_27/attn_mlp_ratio": 0.6306006486151038} {"step": 28150, "timestamp": 1778224967.5491672, "train/loss": 2.1841673135757445, "train/z_loss": 0.0014734851312823594, "train/perplexity": 8.883248512402556, "train/grad_norm": 0.1943359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021665.5677882854, "perf/iters_per_sec": 0.9640052641812732, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373387336730957, "data/tokens_consumed": 59036925952, "data/tokens_consumed_B": 59.036925952, "train/loss_slope": -1.6011923474661757e-05} {"step": 28160, "timestamp": 1778224977.9045823, "train/loss": 2.192729949951172, "train/z_loss": 0.001483487302903086, "train/perplexity": 8.959639125027957, "train/grad_norm": 0.173828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026173.485424962, "perf/iters_per_sec": 0.9661548068165597, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350308179855348, "data/tokens_consumed": 59057897472, "data/tokens_consumed_B": 59.057897472, "train/loss_slope": -1.5461739376433217e-05} {"step": 28170, "timestamp": 1778224988.259001, "train/loss": 2.2615626573562624, "train/z_loss": 0.0014537388109602035, "train/perplexity": 9.598075958106724, "train/grad_norm": 0.1357421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026510.0985782142, "perf/iters_per_sec": 0.9663153164759704, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348588943481445, "data/tokens_consumed": 59078868992, "data/tokens_consumed_B": 59.078868992, "train/loss_slope": -1.113236156484224e-05} {"step": 28180, "timestamp": 1778224998.6153374, "train/loss": 2.158277428150177, "train/z_loss": 0.0014828375191427768, "train/perplexity": 8.656213858176406, "train/grad_norm": 0.1337890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026005.5241022317, "perf/iters_per_sec": 0.966074716616741, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351166248321533, "data/tokens_consumed": 59099840512, "data/tokens_consumed_B": 59.099840512, "train/loss_slope": -9.854171743201218e-06} {"step": 28190, "timestamp": 1778225008.977282, "train/loss": 2.1613758325576784, "train/z_loss": 0.0014828373212367296, "train/perplexity": 8.683075902595386, "train/grad_norm": 0.09228515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024942.2599916873, "perf/iters_per_sec": 0.9655677127798497, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0356601476669312, "data/tokens_consumed": 59120812032, "data/tokens_consumed_B": 59.120812032, "train/loss_slope": -1.364036434733252e-05} {"step": 28200, "timestamp": 1778225019.3245475, "grad/layer_0/attn": 0.0038561534602195024, "grad/layer_0/mlp": 0.0030678908806294203, "grad/layer_0/attn_mlp_ratio": 1.2569395342165512, "grad/layer_4/attn": 0.0029719131998717785, "grad/layer_4/mlp": 0.0025730556808412075, "grad/layer_4/attn_mlp_ratio": 1.1550131256385638, "grad/layer_8/attn": 0.006790332496166229, "grad/layer_8/mlp": 0.003691363614052534, "grad/layer_8/attn_mlp_ratio": 1.8395186771534677, "grad/layer_12/attn": 0.004835174418985844, "grad/layer_12/mlp": 0.006486030295491219, "grad/layer_12/attn_mlp_ratio": 0.7454751402871976, "grad/layer_16/attn": 0.003924649674445391, "grad/layer_16/mlp": 0.005251414608210325, "grad/layer_16/attn_mlp_ratio": 0.7473509316088476, "grad/layer_20/attn": 0.00436904514208436, "grad/layer_20/mlp": 0.006704532075673342, "grad/layer_20/attn_mlp_ratio": 0.6516554813379762, "grad/layer_24/attn": 0.01582545042037964, "grad/layer_24/mlp": 0.01239653117954731, "grad/layer_24/attn_mlp_ratio": 1.2766031128796171, "grad/layer_27/attn": 0.00488569401204586, "grad/layer_27/mlp": 0.01180235669016838, "grad/layer_27/attn_mlp_ratio": 0.41395918619540034} {"step": 28200, "timestamp": 1778225019.9482949, "eos/sharpness": 59.45992469787596, "eos/L0_probe": 2.0215003490448, "eos/L_plus": 2.265676736831665, "eos/L_minus": 2.3719232082366943, "eos/grad_norm": 0.2226589471101761, "eos/embed_grad_frac": 0.042314860969781876, "eos/time_s": 0.6208014488220215} {"step": 28200, "timestamp": 1778225019.9701536, "train/loss": 2.1924167394638063, "train/z_loss": 0.0014686648733913898, "train/perplexity": 8.956833311519034, "train/grad_norm": 0.22265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1908674.4565788314, "perf/iters_per_sec": 0.9101269038099439, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0987478733062743, "data/tokens_consumed": 59141783552, "data/tokens_consumed_B": 59.141783552, "train/loss_slope": -1.4989697965386687e-05} {"step": 28200, "timestamp": 1778225021.3355749, "geo/rankme_last": 440.2987060546875, "geo/layer_0/stable_rank_q_proj": 18.36703872680664, "geo/layer_0/stable_rank_k_proj": 15.889622688293457, "geo/layer_0/stable_rank_o_proj": 50.57453918457031, "geo/layer_0/stable_rank_gate_proj": 144.10072326660156, "geo/layer_0/stable_rank_down_proj": 51.292327880859375, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.051126450300216675, "geo/layer_0/attn_entropy_mean": 6.228211402893066, "geo/layer_0/attn_entropy_std": 0.3424585163593292, "geo/layer_7/stable_rank_q_proj": 42.090179443359375, "geo/layer_7/stable_rank_k_proj": 42.20899200439453, "geo/layer_7/stable_rank_o_proj": 106.23389434814453, "geo/layer_7/stable_rank_gate_proj": 97.27151489257812, "geo/layer_7/stable_rank_down_proj": 148.09605407714844, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.538845956325531, "geo/layer_7/attn_entropy_mean": 4.650026321411133, "geo/layer_7/attn_entropy_std": 0.8373139500617981, "geo/layer_14/stable_rank_q_proj": 55.661808013916016, "geo/layer_14/stable_rank_k_proj": 35.276023864746094, "geo/layer_14/stable_rank_o_proj": 53.22247314453125, "geo/layer_14/stable_rank_gate_proj": 81.15843963623047, "geo/layer_14/stable_rank_down_proj": 134.90089416503906, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38105323910713196, "geo/layer_14/attn_entropy_mean": 5.5244221687316895, "geo/layer_14/attn_entropy_std": 0.4134282171726227, "geo/layer_21/stable_rank_q_proj": 45.600738525390625, "geo/layer_21/stable_rank_k_proj": 31.027381896972656, "geo/layer_21/stable_rank_o_proj": 80.96734619140625, "geo/layer_21/stable_rank_gate_proj": 79.95747375488281, "geo/layer_21/stable_rank_down_proj": 58.263572692871094, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15319274365901947, "geo/layer_21/attn_entropy_mean": 5.745663642883301, "geo/layer_21/attn_entropy_std": 0.28411686420440674, "geo/layer_27/stable_rank_q_proj": 41.57310485839844, "geo/layer_27/stable_rank_k_proj": 31.015222549438477, "geo/layer_27/stable_rank_o_proj": 118.97240447998047, "geo/layer_27/stable_rank_gate_proj": 89.14823150634766, "geo/layer_27/stable_rank_down_proj": 135.26589965820312, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07634570449590683, "geo/layer_27/attn_entropy_mean": 4.353077411651611, "geo/layer_27/attn_entropy_std": 0.6129395365715027, "attnres/final_alpha/block_0": 0.24121394753456116, "attnres/block_norm/0": 1.6738991737365723, "attnres/final_alpha/block_1": 0.006011521443724632, "attnres/block_norm/1": 34683.42578125, "attnres/final_alpha/block_2": 0.012847860343754292, "attnres/block_norm/2": 23821.34375, "attnres/final_alpha/block_3": 0.014897097833454609, "attnres/block_norm/3": 37654.875, "attnres/final_alpha/block_4": 0.018648598343133926, "attnres/block_norm/4": 11010.1240234375, "attnres/final_alpha/block_5": 0.5772503614425659, "attnres/block_norm/5": 5525.02783203125, "attnres/final_alpha/block_6": 0.12913060188293457, "attnres/block_norm/6": 24908.896484375, "geo/tier1_time_s": 1.3616721630096436, "geo/step": 28200.0, "geo/rankme_slope": -2.8370430203331334e-05} {"step": 28210, "timestamp": 1778225031.6938155, "train/loss": 2.195644497871399, "train/z_loss": 0.0014760843478143215, "train/perplexity": 8.98579051383161, "train/grad_norm": 0.1328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1789347.0940063507, "perf/iters_per_sec": 0.8532271833450082, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1720207929611206, "data/tokens_consumed": 59162755072, "data/tokens_consumed_B": 59.162755072, "train/loss_slope": -1.456741385369763e-05} {"step": 28220, "timestamp": 1778225042.0476403, "train/loss": 2.1116474151611326, "train/z_loss": 0.0014942026929929853, "train/perplexity": 8.261840761287932, "train/grad_norm": 0.228515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026391.61069107, "perf/iters_per_sec": 0.9662588170485831, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349194049835204, "data/tokens_consumed": 59183726592, "data/tokens_consumed_B": 59.183726592, "train/loss_slope": -1.558114952272811e-05} {"step": 28230, "timestamp": 1778225052.413789, "train/loss": 2.177621102333069, "train/z_loss": 0.0014821187243796885, "train/perplexity": 8.825286812869091, "train/grad_norm": 0.13671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024008.9242773254, "perf/iters_per_sec": 0.9651226636301639, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0361377239227294, "data/tokens_consumed": 59204698112, "data/tokens_consumed_B": 59.204698112, "train/loss_slope": -1.605960150363024e-05} {"step": 28240, "timestamp": 1778225062.7932365, "train/loss": 2.208373475074768, "train/z_loss": 0.0014868110069073738, "train/perplexity": 9.100901504774361, "train/grad_norm": 0.10009765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021644.6121518356, "perf/iters_per_sec": 0.9639952717551401, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037349486351013, "data/tokens_consumed": 59225669632, "data/tokens_consumed_B": 59.225669632, "train/loss_slope": -1.3461135351511187e-05} {"step": 28250, "timestamp": 1778225073.160183, "grad/layer_0/attn": 0.0031338047701865435, "grad/layer_0/mlp": 0.0030176800210028887, "grad/layer_0/attn_mlp_ratio": 1.038481430942741, "grad/layer_4/attn": 0.003043085802346468, "grad/layer_4/mlp": 0.002557154046371579, "grad/layer_4/attn_mlp_ratio": 1.190028300274503, "grad/layer_8/attn": 0.005189584102481604, "grad/layer_8/mlp": 0.0038731463719159365, "grad/layer_8/attn_mlp_ratio": 1.339888418914987, "grad/layer_12/attn": 0.005445065908133984, "grad/layer_12/mlp": 0.007176005747169256, "grad/layer_12/attn_mlp_ratio": 0.7587878304589061, "grad/layer_16/attn": 0.004270999692380428, "grad/layer_16/mlp": 0.004805621691048145, "grad/layer_16/attn_mlp_ratio": 0.8887506920199996, "grad/layer_20/attn": 0.005172924138605595, "grad/layer_20/mlp": 0.007206349167972803, "grad/layer_20/attn_mlp_ratio": 0.717828673888405, "grad/layer_24/attn": 0.016080012544989586, "grad/layer_24/mlp": 0.013465307652950287, "grad/layer_24/attn_mlp_ratio": 1.1941808416125068, "grad/layer_27/attn": 0.00942136812955141, "grad/layer_27/mlp": 0.011468219570815563, "grad/layer_27/attn_mlp_ratio": 0.8215196778561018} {"step": 28250, "timestamp": 1778225073.176406, "train/loss": 2.2159086942672728, "train/z_loss": 0.001481714763212949, "train/perplexity": 9.16973781509812, "train/grad_norm": 0.1494140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020733.346197032, "perf/iters_per_sec": 0.9635607462868843, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0378172874450684, "data/tokens_consumed": 59246641152, "data/tokens_consumed_B": 59.246641152, "train/loss_slope": -8.578535890278753e-06} {"step": 28260, "timestamp": 1778225083.5563707, "train/loss": 2.181818056106567, "train/z_loss": 0.001481457892805338, "train/perplexity": 8.862403968671034, "train/grad_norm": 0.0888671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021470.5717026484, "perf/iters_per_sec": 0.9639122828019373, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0374387979507447, "data/tokens_consumed": 59267612672, "data/tokens_consumed_B": 59.267612672, "train/loss_slope": -7.578639257358341e-06} {"step": 28270, "timestamp": 1778225093.9292939, "train/loss": 2.1924710035324098, "train/z_loss": 0.0014831860200501979, "train/perplexity": 8.957319358923655, "train/grad_norm": 0.1787109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022920.5844430819, "perf/iters_per_sec": 0.9646037027564439, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036695170402527, "data/tokens_consumed": 59288584192, "data/tokens_consumed_B": 59.288584192, "train/loss_slope": -7.897224191642165e-06} {"step": 28275, "timestamp": 1778225099.7286565, "eos/sharpness": 21.60842418670654, "eos/L0_probe": 2.019353151321411, "eos/L_plus": 2.1111931800842285, "eos/L_minus": 2.143597364425659, "eos/grad_norm": 0.1110147014260292, "eos/embed_grad_frac": 0.17635171115398407, "eos/time_s": 0.6246178150177002} {"step": 28275, "timestamp": 1778225101.1066432, "geo/rankme_last": 440.3872985839844, "geo/layer_0/stable_rank_q_proj": 18.364761352539062, "geo/layer_0/stable_rank_k_proj": 15.885749816894531, "geo/layer_0/stable_rank_o_proj": 50.640953063964844, "geo/layer_0/stable_rank_gate_proj": 143.5368194580078, "geo/layer_0/stable_rank_down_proj": 51.333717346191406, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.055963244289159775, "geo/layer_0/attn_entropy_mean": 6.2299675941467285, "geo/layer_0/attn_entropy_std": 0.3409050405025482, "geo/layer_7/stable_rank_q_proj": 42.010440826416016, "geo/layer_7/stable_rank_k_proj": 42.22456741333008, "geo/layer_7/stable_rank_o_proj": 105.96597290039062, "geo/layer_7/stable_rank_gate_proj": 97.38692474365234, "geo/layer_7/stable_rank_down_proj": 148.17337036132812, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5255559682846069, "geo/layer_7/attn_entropy_mean": 4.642830848693848, "geo/layer_7/attn_entropy_std": 0.8455252051353455, "geo/layer_14/stable_rank_q_proj": 55.64926528930664, "geo/layer_14/stable_rank_k_proj": 35.248817443847656, "geo/layer_14/stable_rank_o_proj": 53.15410232543945, "geo/layer_14/stable_rank_gate_proj": 80.89663696289062, "geo/layer_14/stable_rank_down_proj": 134.89096069335938, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3708325922489166, "geo/layer_14/attn_entropy_mean": 5.496216773986816, "geo/layer_14/attn_entropy_std": 0.4245821237564087, "geo/layer_21/stable_rank_q_proj": 45.54061508178711, "geo/layer_21/stable_rank_k_proj": 31.191356658935547, "geo/layer_21/stable_rank_o_proj": 80.99943542480469, "geo/layer_21/stable_rank_gate_proj": 79.98741149902344, "geo/layer_21/stable_rank_down_proj": 58.06093978881836, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15049158036708832, "geo/layer_21/attn_entropy_mean": 5.737987518310547, "geo/layer_21/attn_entropy_std": 0.2818169891834259, "geo/layer_27/stable_rank_q_proj": 41.58280944824219, "geo/layer_27/stable_rank_k_proj": 31.130273818969727, "geo/layer_27/stable_rank_o_proj": 118.73991394042969, "geo/layer_27/stable_rank_gate_proj": 89.0656509399414, "geo/layer_27/stable_rank_down_proj": 135.2793731689453, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08134929835796356, "geo/layer_27/attn_entropy_mean": 4.361236572265625, "geo/layer_27/attn_entropy_std": 0.5949750542640686, "attnres/final_alpha/block_0": 0.24035564064979553, "attnres/block_norm/0": 1.6742390394210815, "attnres/final_alpha/block_1": 0.006062074098736048, "attnres/block_norm/1": 34603.0703125, "attnres/final_alpha/block_2": 0.0127236507833004, "attnres/block_norm/2": 23817.80078125, "attnres/final_alpha/block_3": 0.014615233987569809, "attnres/block_norm/3": 37769.125, "attnres/final_alpha/block_4": 0.018414152786135674, "attnres/block_norm/4": 11061.806640625, "attnres/final_alpha/block_5": 0.5814578533172607, "attnres/block_norm/5": 5479.50244140625, "attnres/final_alpha/block_6": 0.12637139856815338, "attnres/block_norm/6": 24891.4453125, "geo/tier1_time_s": 1.35837984085083, "geo/step": 28275.0, "geo/rankme_slope": -1.2634604623099242e-05} {"step": 28280, "timestamp": 1778225106.2963495, "train/loss": 2.283230423927307, "train/z_loss": 0.0014661390217952431, "train/perplexity": 9.808314299781491, "train/grad_norm": 0.1708984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1696436.1817251204, "perf/iters_per_sec": 0.8089238079667666, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2362103700637816, "data/tokens_consumed": 59309555712, "data/tokens_consumed_B": 59.309555712, "train/loss_slope": -2.7137645471929215e-06} {"step": 28290, "timestamp": 1778225117.0815187, "train/loss": 2.181189846992493, "train/z_loss": 0.0014786348561756312, "train/perplexity": 8.856838274118369, "train/grad_norm": 0.16796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1945308.7454322164, "perf/iters_per_sec": 0.9275954939995844, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0780561208724975, "data/tokens_consumed": 59330527232, "data/tokens_consumed_B": 59.330527232, "train/loss_slope": -4.047474845407429e-07} {"step": 28300, "timestamp": 1778225127.4539626, "grad/layer_0/attn": 0.002898668171837926, "grad/layer_0/mlp": 0.0027463636361062527, "grad/layer_0/attn_mlp_ratio": 1.055456760417179, "grad/layer_4/attn": 0.001557500334456563, "grad/layer_4/mlp": 0.00243235076777637, "grad/layer_4/attn_mlp_ratio": 0.6403271645921764, "grad/layer_8/attn": 0.004247935023158789, "grad/layer_8/mlp": 0.003480935702100396, "grad/layer_8/attn_mlp_ratio": 1.2203428229258309, "grad/layer_12/attn": 0.004752535838633776, "grad/layer_12/mlp": 0.0066075557842850685, "grad/layer_12/attn_mlp_ratio": 0.7192577591264665, "grad/layer_16/attn": 0.00370381330139935, "grad/layer_16/mlp": 0.004362351726740599, "grad/layer_16/attn_mlp_ratio": 0.8490404828641972, "grad/layer_20/attn": 0.005767400376498699, "grad/layer_20/mlp": 0.005853018257766962, "grad/layer_20/attn_mlp_ratio": 0.9853719950912767, "grad/layer_24/attn": 0.010079157538712025, "grad/layer_24/mlp": 0.01115476805716753, "grad/layer_24/attn_mlp_ratio": 0.9035739153606372, "grad/layer_27/attn": 0.008891649544239044, "grad/layer_27/mlp": 0.009894274175167084, "grad/layer_27/attn_mlp_ratio": 0.8986661676193418} {"step": 28300, "timestamp": 1778225127.4702332, "train/loss": 2.1727043628692626, "train/z_loss": 0.0014775285962969065, "train/perplexity": 8.78200167498871, "train/grad_norm": 0.125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019903.885237605, "perf/iters_per_sec": 0.9631652284801507, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0382434606552124, "data/tokens_consumed": 59351498752, "data/tokens_consumed_B": 59.351498752, "train/loss_slope": -3.6279856532749246e-06} {"step": 28310, "timestamp": 1778225137.853511, "train/loss": 2.2273671627044678, "train/z_loss": 0.00148703622398898, "train/perplexity": 9.275413249512672, "train/grad_norm": 0.18359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020588.8433957985, "perf/iters_per_sec": 0.9634918419817917, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0378915071487427, "data/tokens_consumed": 59372470272, "data/tokens_consumed_B": 59.372470272, "train/loss_slope": -2.5344382597095834e-06} {"step": 28320, "timestamp": 1778225148.2338197, "train/loss": 2.1718695878982546, "train/z_loss": 0.0014902158291079104, "train/perplexity": 8.774673738809472, "train/grad_norm": 0.251953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021563.7209653798, "perf/iters_per_sec": 0.9639566998316669, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373909950256348, "data/tokens_consumed": 59393441792, "data/tokens_consumed_B": 59.393441792, "train/loss_slope": -4.434344584685033e-06} {"step": 28330, "timestamp": 1778225158.6137898, "train/loss": 2.1770984172821044, "train/z_loss": 0.0014944886905141174, "train/perplexity": 8.82067517270421, "train/grad_norm": 0.1884765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021387.2324566827, "perf/iters_per_sec": 0.9638725435527242, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0374815702438354, "data/tokens_consumed": 59414413312, "data/tokens_consumed_B": 59.414413312, "train/loss_slope": -6.13283087389624e-06} {"step": 28340, "timestamp": 1778225168.9857595, "train/loss": 2.2230685234069822, "train/z_loss": 0.0014704412664286793, "train/perplexity": 9.235627167889787, "train/grad_norm": 0.09228515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022864.5257633312, "perf/iters_per_sec": 0.964576971894899, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0367238998413086, "data/tokens_consumed": 59435384832, "data/tokens_consumed_B": 59.435384832, "train/loss_slope": -7.010110941800208e-06} {"step": 28350, "timestamp": 1778225179.3525538, "grad/layer_0/attn": 0.004342667292803526, "grad/layer_0/mlp": 0.003360999748110771, "grad/layer_0/attn_mlp_ratio": 1.2920760157857647, "grad/layer_4/attn": 0.0027465696912258863, "grad/layer_4/mlp": 0.0025060807820409536, "grad/layer_4/attn_mlp_ratio": 1.0959621099655326, "grad/layer_8/attn": 0.004973873030394316, "grad/layer_8/mlp": 0.0036355892661958933, "grad/layer_8/attn_mlp_ratio": 1.3681063864478014, "grad/layer_12/attn": 0.00560357328504324, "grad/layer_12/mlp": 0.00743426289409399, "grad/layer_12/attn_mlp_ratio": 0.7537496708812821, "grad/layer_16/attn": 0.005961595568805933, "grad/layer_16/mlp": 0.005065994802862406, "grad/layer_16/attn_mlp_ratio": 1.1767867285925402, "grad/layer_20/attn": 0.007035079877823591, "grad/layer_20/mlp": 0.007034497335553169, "grad/layer_20/attn_mlp_ratio": 1.0000827979931415, "grad/layer_24/attn": 0.017267214134335518, "grad/layer_24/mlp": 0.013140789233148098, "grad/layer_24/attn_mlp_ratio": 1.3140165097067928, "grad/layer_27/attn": 0.004408986307680607, "grad/layer_27/mlp": 0.012392833828926086, "grad/layer_27/attn_mlp_ratio": 0.35576901400974975} {"step": 28350, "timestamp": 1778225179.9863336, "eos/sharpness": 65.0585889816284, "eos/L0_probe": 2.026080369949341, "eos/L_plus": 2.2811858654022217, "eos/L_minus": 2.421560764312744, "eos/grad_norm": 0.22323568165302277, "eos/embed_grad_frac": 0.046816885471343994, "eos/time_s": 0.6309201717376709} {"step": 28350, "timestamp": 1778225180.0060463, "train/loss": 2.1963309764862062, "train/z_loss": 0.001464630465488881, "train/perplexity": 8.991961184630936, "train/grad_norm": 0.2236328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1903994.046814727, "perf/iters_per_sec": 0.9078951105188021, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1014488220214844, "data/tokens_consumed": 59456356352, "data/tokens_consumed_B": 59.456356352, "train/loss_slope": -6.557245187275842e-06} {"step": 28350, "timestamp": 1778225181.3678305, "geo/rankme_last": 440.3002014160156, "geo/layer_0/stable_rank_q_proj": 18.382984161376953, "geo/layer_0/stable_rank_k_proj": 15.861665725708008, "geo/layer_0/stable_rank_o_proj": 50.60298538208008, "geo/layer_0/stable_rank_gate_proj": 143.3094482421875, "geo/layer_0/stable_rank_down_proj": 51.49095916748047, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05391460284590721, "geo/layer_0/attn_entropy_mean": 6.227338790893555, "geo/layer_0/attn_entropy_std": 0.34299278259277344, "geo/layer_7/stable_rank_q_proj": 42.01120376586914, "geo/layer_7/stable_rank_k_proj": 42.23748016357422, "geo/layer_7/stable_rank_o_proj": 105.83914184570312, "geo/layer_7/stable_rank_gate_proj": 97.27544403076172, "geo/layer_7/stable_rank_down_proj": 147.74295043945312, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.544628381729126, "geo/layer_7/attn_entropy_mean": 4.632699012756348, "geo/layer_7/attn_entropy_std": 0.8312632441520691, "geo/layer_14/stable_rank_q_proj": 55.59259796142578, "geo/layer_14/stable_rank_k_proj": 35.23164367675781, "geo/layer_14/stable_rank_o_proj": 53.25172805786133, "geo/layer_14/stable_rank_gate_proj": 80.97952270507812, "geo/layer_14/stable_rank_down_proj": 135.14071655273438, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.371847540140152, "geo/layer_14/attn_entropy_mean": 5.456121921539307, "geo/layer_14/attn_entropy_std": 0.44292542338371277, "geo/layer_21/stable_rank_q_proj": 45.714813232421875, "geo/layer_21/stable_rank_k_proj": 31.334823608398438, "geo/layer_21/stable_rank_o_proj": 80.94928741455078, "geo/layer_21/stable_rank_gate_proj": 80.17736053466797, "geo/layer_21/stable_rank_down_proj": 58.214080810546875, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15097834169864655, "geo/layer_21/attn_entropy_mean": 5.735731601715088, "geo/layer_21/attn_entropy_std": 0.2890380620956421, "geo/layer_27/stable_rank_q_proj": 41.54419708251953, "geo/layer_27/stable_rank_k_proj": 31.180767059326172, "geo/layer_27/stable_rank_o_proj": 118.7491226196289, "geo/layer_27/stable_rank_gate_proj": 88.99249267578125, "geo/layer_27/stable_rank_down_proj": 135.5458984375, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08588976413011551, "geo/layer_27/attn_entropy_mean": 4.353362560272217, "geo/layer_27/attn_entropy_std": 0.6064472794532776, "attnres/final_alpha/block_0": 0.24178344011306763, "attnres/block_norm/0": 1.674901008605957, "attnres/final_alpha/block_1": 0.006221695337444544, "attnres/block_norm/1": 34578.49609375, "attnres/final_alpha/block_2": 0.012849746271967888, "attnres/block_norm/2": 23870.9921875, "attnres/final_alpha/block_3": 0.01489298976957798, "attnres/block_norm/3": 37690.7109375, "attnres/final_alpha/block_4": 0.018888508901000023, "attnres/block_norm/4": 11044.32421875, "attnres/final_alpha/block_5": 0.5759434700012207, "attnres/block_norm/5": 5542.18017578125, "attnres/final_alpha/block_6": 0.12942013144493103, "attnres/block_norm/6": 24999.041015625, "geo/tier1_time_s": 1.357924222946167, "geo/step": 28350.0, "geo/rankme_slope": -8.715908238295321e-06} {"step": 28360, "timestamp": 1778225191.7445698, "train/loss": 2.191271114349365, "train/z_loss": 0.0014710968593135477, "train/perplexity": 8.946578013816431, "train/grad_norm": 0.193359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1787126.0793566608, "perf/iters_per_sec": 0.8521681210311226, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1734773635864257, "data/tokens_consumed": 59477327872, "data/tokens_consumed_B": 59.477327872, "train/loss_slope": -5.632257182570231e-06} {"step": 28370, "timestamp": 1778225202.1184554, "train/loss": 2.16465220451355, "train/z_loss": 0.001475418242625892, "train/perplexity": 8.711571544643785, "train/grad_norm": 0.1396484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022477.1777516988, "perf/iters_per_sec": 0.9643922699697965, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0369224548339844, "data/tokens_consumed": 59498299392, "data/tokens_consumed_B": 59.498299392, "train/loss_slope": -6.888292667710538e-06} {"step": 28380, "timestamp": 1778225212.4706414, "train/loss": 2.170532298088074, "train/z_loss": 0.001491224963683635, "train/perplexity": 8.76294729960231, "train/grad_norm": 0.291015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026866.4393727593, "perf/iters_per_sec": 0.9664852330077931, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346769571304322, "data/tokens_consumed": 59519270912, "data/tokens_consumed_B": 59.519270912, "train/loss_slope": -7.746059609146198e-06} {"step": 28390, "timestamp": 1778225222.8219924, "train/loss": 2.1820191621780394, "train/z_loss": 0.0014680873835459351, "train/perplexity": 8.86418643114298, "train/grad_norm": 0.2353515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026908.1942215615, "perf/iters_per_sec": 0.9665051432712372, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346556425094604, "data/tokens_consumed": 59540242432, "data/tokens_consumed_B": 59.540242432, "train/loss_slope": -9.233245891098391e-06} {"step": 28400, "timestamp": 1778225233.1690006, "grad/layer_0/attn": 0.003977111075073481, "grad/layer_0/mlp": 0.0030984305776655674, "grad/layer_0/attn_mlp_ratio": 1.2835888515246474, "grad/layer_4/attn": 0.0019240421243011951, "grad/layer_4/mlp": 0.002542432164773345, "grad/layer_4/attn_mlp_ratio": 0.756772225934885, "grad/layer_8/attn": 0.0042796023190021515, "grad/layer_8/mlp": 0.0035667947959154844, "grad/layer_8/attn_mlp_ratio": 1.1998453636633097, "grad/layer_12/attn": 0.004653099458664656, "grad/layer_12/mlp": 0.006637616083025932, "grad/layer_12/attn_mlp_ratio": 0.7010196628367592, "grad/layer_16/attn": 0.004341672174632549, "grad/layer_16/mlp": 0.0050675272941589355, "grad/layer_16/attn_mlp_ratio": 0.8567634344980474, "grad/layer_20/attn": 0.004159616306424141, "grad/layer_20/mlp": 0.006542513146996498, "grad/layer_20/attn_mlp_ratio": 0.6357826341939339, "grad/layer_24/attn": 0.011470377445220947, "grad/layer_24/mlp": 0.011664032004773617, "grad/layer_24/attn_mlp_ratio": 0.9833972799617541, "grad/layer_27/attn": 0.004728919360786676, "grad/layer_27/mlp": 0.010308608412742615, "grad/layer_27/attn_mlp_ratio": 0.45873498396424633} {"step": 28400, "timestamp": 1778225233.1850853, "train/loss": 2.1949299335479737, "train/z_loss": 0.001472336845472455, "train/perplexity": 8.979371882048406, "train/grad_norm": 0.1640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024549.0367492733, "perf/iters_per_sec": 0.9653802093263976, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0358613014221192, "data/tokens_consumed": 59561213952, "data/tokens_consumed_B": 59.561213952, "train/loss_slope": -7.273424812669449e-06} {"step": 28410, "timestamp": 1778225243.5385454, "train/loss": 2.1814034938812257, "train/z_loss": 0.0014908511890098452, "train/perplexity": 8.858730712209201, "train/grad_norm": 0.1591796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026750.9919668376, "perf/iters_per_sec": 0.9664301833948314, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347358942031861, "data/tokens_consumed": 59582185472, "data/tokens_consumed_B": 59.582185472, "train/loss_slope": -1.1011511486212598e-05} {"step": 28420, "timestamp": 1778225253.8886175, "train/loss": 2.19419207572937, "train/z_loss": 0.0014751815819181503, "train/perplexity": 8.972748826036401, "train/grad_norm": 0.1494140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027039.3081554791, "perf/iters_per_sec": 0.9665676632668873, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345887184143066, "data/tokens_consumed": 59603156992, "data/tokens_consumed_B": 59.603156992, "train/loss_slope": -1.1968502074149267e-05} {"step": 28425, "timestamp": 1778225259.6690178, "eos/sharpness": 58.57355594635008, "eos/L0_probe": 2.0230350494384766, "eos/L_plus": 2.284473180770874, "eos/L_minus": 2.34733247756958, "eos/grad_norm": 0.25277361273765564, "eos/embed_grad_frac": 0.0371568463742733, "eos/time_s": 0.6107935905456543} {"step": 28425, "timestamp": 1778225261.0486753, "geo/rankme_last": 439.55926513671875, "geo/layer_0/stable_rank_q_proj": 18.4210147857666, "geo/layer_0/stable_rank_k_proj": 15.904836654663086, "geo/layer_0/stable_rank_o_proj": 50.61388397216797, "geo/layer_0/stable_rank_gate_proj": 143.5016632080078, "geo/layer_0/stable_rank_down_proj": 51.46205139160156, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05333220586180687, "geo/layer_0/attn_entropy_mean": 6.228019714355469, "geo/layer_0/attn_entropy_std": 0.3429422676563263, "geo/layer_7/stable_rank_q_proj": 42.09996032714844, "geo/layer_7/stable_rank_k_proj": 42.32803726196289, "geo/layer_7/stable_rank_o_proj": 105.87066650390625, "geo/layer_7/stable_rank_gate_proj": 97.30715942382812, "geo/layer_7/stable_rank_down_proj": 147.19700622558594, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5193409323692322, "geo/layer_7/attn_entropy_mean": 4.671151638031006, "geo/layer_7/attn_entropy_std": 0.8296241760253906, "geo/layer_14/stable_rank_q_proj": 55.69094467163086, "geo/layer_14/stable_rank_k_proj": 35.3028564453125, "geo/layer_14/stable_rank_o_proj": 53.06708908081055, "geo/layer_14/stable_rank_gate_proj": 81.05523681640625, "geo/layer_14/stable_rank_down_proj": 135.2819061279297, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38537710905075073, "geo/layer_14/attn_entropy_mean": 5.494418144226074, "geo/layer_14/attn_entropy_std": 0.4120687246322632, "geo/layer_21/stable_rank_q_proj": 45.52043151855469, "geo/layer_21/stable_rank_k_proj": 31.230344772338867, "geo/layer_21/stable_rank_o_proj": 80.87227630615234, "geo/layer_21/stable_rank_gate_proj": 80.17461395263672, "geo/layer_21/stable_rank_down_proj": 58.18050003051758, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1504094898700714, "geo/layer_21/attn_entropy_mean": 5.720968723297119, "geo/layer_21/attn_entropy_std": 0.2913268804550171, "geo/layer_27/stable_rank_q_proj": 41.605003356933594, "geo/layer_27/stable_rank_k_proj": 31.243879318237305, "geo/layer_27/stable_rank_o_proj": 118.75784301757812, "geo/layer_27/stable_rank_gate_proj": 88.9215316772461, "geo/layer_27/stable_rank_down_proj": 135.5185546875, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08598900586366653, "geo/layer_27/attn_entropy_mean": 4.3336181640625, "geo/layer_27/attn_entropy_std": 0.6046958565711975, "attnres/final_alpha/block_0": 0.2411966323852539, "attnres/block_norm/0": 1.6754603385925293, "attnres/final_alpha/block_1": 0.006042883265763521, "attnres/block_norm/1": 34850.0234375, "attnres/final_alpha/block_2": 0.012751409783959389, "attnres/block_norm/2": 23941.7265625, "attnres/final_alpha/block_3": 0.014550081454217434, "attnres/block_norm/3": 38056.046875, "attnres/final_alpha/block_4": 0.018714863806962967, "attnres/block_norm/4": 11022.8671875, "attnres/final_alpha/block_5": 0.5795738101005554, "attnres/block_norm/5": 5527.046875, "attnres/final_alpha/block_6": 0.1271703541278839, "attnres/block_norm/6": 25002.45703125, "geo/tier1_time_s": 1.3583078384399414, "geo/step": 28425.0, "geo/rankme_slope": -3.606325342637054e-06} {"step": 28430, "timestamp": 1778225266.2569747, "train/loss": 2.173922562599182, "train/z_loss": 0.0014802203280851246, "train/perplexity": 8.792706425995895, "train/grad_norm": 0.138671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1696281.0149839658, "perf/iters_per_sec": 0.8088498186988667, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2363234519958497, "data/tokens_consumed": 59624128512, "data/tokens_consumed_B": 59.624128512, "train/loss_slope": -9.933587684310932e-06} {"step": 28440, "timestamp": 1778225276.6086378, "train/loss": 2.1693975210189818, "train/z_loss": 0.0014787155902013182, "train/perplexity": 8.753008947922337, "train/grad_norm": 0.12353515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026792.4618514387, "perf/iters_per_sec": 0.9664499577767557, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347147226333617, "data/tokens_consumed": 59645100032, "data/tokens_consumed_B": 59.645100032, "train/loss_slope": -1.1116303082811559e-05} {"step": 28450, "timestamp": 1778225286.9484892, "grad/layer_0/attn": 0.0026427044067531824, "grad/layer_0/mlp": 0.0026606780011206865, "grad/layer_0/attn_mlp_ratio": 0.9932446941402142, "grad/layer_4/attn": 0.0019377212738618255, "grad/layer_4/mlp": 0.0025398992002010345, "grad/layer_4/attn_mlp_ratio": 0.7629126374059214, "grad/layer_8/attn": 0.00393883278593421, "grad/layer_8/mlp": 0.0036026407033205032, "grad/layer_8/attn_mlp_ratio": 1.093318207661403, "grad/layer_12/attn": 0.004653082229197025, "grad/layer_12/mlp": 0.005873384419828653, "grad/layer_12/attn_mlp_ratio": 0.7922318406854063, "grad/layer_16/attn": 0.003956806845963001, "grad/layer_16/mlp": 0.004498412366956472, "grad/layer_16/attn_mlp_ratio": 0.879600720260339, "grad/layer_20/attn": 0.004545815289020538, "grad/layer_20/mlp": 0.005914366338402033, "grad/layer_20/attn_mlp_ratio": 0.7686056209680415, "grad/layer_24/attn": 0.006607954856008291, "grad/layer_24/mlp": 0.008259430527687073, "grad/layer_24/attn_mlp_ratio": 0.8000496830686195, "grad/layer_27/attn": 0.005912586115300655, "grad/layer_27/mlp": 0.007874686270952225, "grad/layer_27/attn_mlp_ratio": 0.7508344887373201} {"step": 28450, "timestamp": 1778225286.9643312, "train/loss": 2.167684555053711, "train/z_loss": 0.001478922867681831, "train/perplexity": 8.738028175940219, "train/grad_norm": 0.09814453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026298.2027040198, "perf/iters_per_sec": 0.9662142766494846, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349671125411988, "data/tokens_consumed": 59666071552, "data/tokens_consumed_B": 59.666071552, "train/loss_slope": -1.2122477313401416e-05} {"step": 28460, "timestamp": 1778225297.3268292, "train/loss": 2.223924684524536, "train/z_loss": 0.0014750819304026664, "train/perplexity": 9.243537738645495, "train/grad_norm": 0.10498046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025018.340244284, "perf/iters_per_sec": 0.9656039906712932, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0356212377548217, "data/tokens_consumed": 59687043072, "data/tokens_consumed_B": 59.687043072, "train/loss_slope": -1.2389114280023205e-05} {"step": 28470, "timestamp": 1778225308.0688503, "train/loss": 2.2014605522155763, "train/z_loss": 0.0014942622045055031, "train/perplexity": 9.038204633721573, "train/grad_norm": 0.208984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1953658.194546184, "perf/iters_per_sec": 0.9315768215876503, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0734487771987915, "data/tokens_consumed": 59708014592, "data/tokens_consumed_B": 59.708014592, "train/loss_slope": -1.0891068500332301e-05} {"step": 28480, "timestamp": 1778225318.427272, "train/loss": 2.187613272666931, "train/z_loss": 0.001474695187062025, "train/perplexity": 8.913912626671053, "train/grad_norm": 0.11181640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025700.194246842, "perf/iters_per_sec": 0.9659291239961825, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352726459503174, "data/tokens_consumed": 59728986112, "data/tokens_consumed_B": 59.728986112, "train/loss_slope": -8.853576929405496e-06} {"step": 28490, "timestamp": 1778225329.288299, "train/loss": 2.1872788190841677, "train/z_loss": 0.00147568688262254, "train/perplexity": 8.91093183515261, "train/grad_norm": 0.1552734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1931691.9196130384, "perf/iters_per_sec": 0.9211024854722206, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0856555223464965, "data/tokens_consumed": 59749957632, "data/tokens_consumed_B": 59.749957632, "train/loss_slope": -1.0339759549971608e-05} {"step": 28500, "timestamp": 1778225339.6288183, "grad/layer_0/attn": 0.00339091126807034, "grad/layer_0/mlp": 0.002841660287231207, "grad/layer_0/attn_mlp_ratio": 1.193285194566934, "grad/layer_4/attn": 0.0025334912352263927, "grad/layer_4/mlp": 0.002446763915941119, "grad/layer_4/attn_mlp_ratio": 1.0354456820192832, "grad/layer_8/attn": 0.0048975092358887196, "grad/layer_8/mlp": 0.0037708240561187267, "grad/layer_8/attn_mlp_ratio": 1.2987901405961286, "grad/layer_12/attn": 0.0041399323381483555, "grad/layer_12/mlp": 0.006340494379401207, "grad/layer_12/attn_mlp_ratio": 0.6529352484412744, "grad/layer_16/attn": 0.0036366682033985853, "grad/layer_16/mlp": 0.004383142571896315, "grad/layer_16/attn_mlp_ratio": 0.8296942343939775, "grad/layer_20/attn": 0.006846804637461901, "grad/layer_20/mlp": 0.0059347692877054214, "grad/layer_20/attn_mlp_ratio": 1.1536766115370665, "grad/layer_24/attn": 0.004498017020523548, "grad/layer_24/mlp": 0.00801064632833004, "grad/layer_24/attn_mlp_ratio": 0.5615048748895086, "grad/layer_27/attn": 0.003972821868956089, "grad/layer_27/mlp": 0.006798203103244305, "grad/layer_27/attn_mlp_ratio": 0.5843929270987576} {"step": 28500, "timestamp": 1778225340.2515564, "eos/sharpness": 6.178665161132812, "eos/L0_probe": 2.0226118564605713, "eos/L_plus": 2.059126853942871, "eos/L_minus": 2.0478835105895996, "eos/grad_norm": 0.08924338966608047, "eos/embed_grad_frac": 0.2987236976623535, "eos/time_s": 0.619856595993042} {"step": 28500, "timestamp": 1778225340.271598, "train/loss": 2.1668183326721193, "train/z_loss": 0.0014880542294122279, "train/perplexity": 8.730462377668216, "train/grad_norm": 0.08935546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1910311.7306945885, "perf/iters_per_sec": 0.910907616946501, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.097806167602539, "data/tokens_consumed": 59770929152, "data/tokens_consumed_B": 59.770929152, "train/loss_slope": -1.2127587868936303e-05} {"step": 28500, "timestamp": 1778225341.639464, "geo/rankme_last": 440.03948974609375, "geo/layer_0/stable_rank_q_proj": 18.411569595336914, "geo/layer_0/stable_rank_k_proj": 15.86972713470459, "geo/layer_0/stable_rank_o_proj": 50.73324966430664, "geo/layer_0/stable_rank_gate_proj": 143.51119995117188, "geo/layer_0/stable_rank_down_proj": 51.500545501708984, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05828271433711052, "geo/layer_0/attn_entropy_mean": 6.2222723960876465, "geo/layer_0/attn_entropy_std": 0.34399595856666565, "geo/layer_7/stable_rank_q_proj": 42.18955612182617, "geo/layer_7/stable_rank_k_proj": 42.18798828125, "geo/layer_7/stable_rank_o_proj": 105.48921966552734, "geo/layer_7/stable_rank_gate_proj": 97.1342544555664, "geo/layer_7/stable_rank_down_proj": 147.6554412841797, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5320289134979248, "geo/layer_7/attn_entropy_mean": 4.6830267906188965, "geo/layer_7/attn_entropy_std": 0.8320130109786987, "geo/layer_14/stable_rank_q_proj": 55.644554138183594, "geo/layer_14/stable_rank_k_proj": 35.43185806274414, "geo/layer_14/stable_rank_o_proj": 53.06528091430664, "geo/layer_14/stable_rank_gate_proj": 81.03736877441406, "geo/layer_14/stable_rank_down_proj": 135.6033935546875, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3708004653453827, "geo/layer_14/attn_entropy_mean": 5.480878829956055, "geo/layer_14/attn_entropy_std": 0.4058583378791809, "geo/layer_21/stable_rank_q_proj": 45.56707000732422, "geo/layer_21/stable_rank_k_proj": 31.208948135375977, "geo/layer_21/stable_rank_o_proj": 80.72528076171875, "geo/layer_21/stable_rank_gate_proj": 80.22848510742188, "geo/layer_21/stable_rank_down_proj": 58.22303009033203, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14949113130569458, "geo/layer_21/attn_entropy_mean": 5.7314558029174805, "geo/layer_21/attn_entropy_std": 0.29015108942985535, "geo/layer_27/stable_rank_q_proj": 41.661964416503906, "geo/layer_27/stable_rank_k_proj": 31.213088989257812, "geo/layer_27/stable_rank_o_proj": 118.85697937011719, "geo/layer_27/stable_rank_gate_proj": 89.0185317993164, "geo/layer_27/stable_rank_down_proj": 135.4752655029297, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08890700340270996, "geo/layer_27/attn_entropy_mean": 4.333700656890869, "geo/layer_27/attn_entropy_std": 0.5921734571456909, "attnres/final_alpha/block_0": 0.2398211658000946, "attnres/block_norm/0": 1.6759037971496582, "attnres/final_alpha/block_1": 0.006009833887219429, "attnres/block_norm/1": 34790.3984375, "attnres/final_alpha/block_2": 0.012653147801756859, "attnres/block_norm/2": 23868.7890625, "attnres/final_alpha/block_3": 0.014696422964334488, "attnres/block_norm/3": 37809.7265625, "attnres/final_alpha/block_4": 0.018322911113500595, "attnres/block_norm/4": 11075.53125, "attnres/final_alpha/block_5": 0.5826258659362793, "attnres/block_norm/5": 5492.4150390625, "attnres/final_alpha/block_6": 0.12587058544158936, "attnres/block_norm/6": 25088.513671875, "geo/tier1_time_s": 1.3636574745178223, "geo/step": 28500.0, "geo/rankme_slope": 3.65165597488996e-06} {"step": 28500, "timestamp": 1778225348.6036234, "geo/ww_alpha_mean": 7.661236734491661, "geo/ww_alpha_std": 3.9675355665937087, "geo/ww_alpha_min": 1.3659291491464138, "geo/ww_alpha_max": 24.36166653721292, "geo/ww_alpha_healthy_frac": 0.14720812182741116, "geo/ww_alpha_by_type/q_proj": 4.060752643538647, "geo/ww_alpha_by_type/k_proj": 4.622078571911677, "geo/ww_alpha_by_type/v_proj": 8.302476438313429, "geo/ww_alpha_by_type/o_proj": 8.069815103779874, "geo/ww_alpha_by_type/gate_proj": 8.286995743199085, "geo/ww_alpha_by_type/up_proj": 11.360929930049151, "geo/ww_alpha_by_type/down_proj": 9.040405408259383, "geo/twonn_id/layer_0": 0.6633111834526062, "geo/twonn_id/layer_7": 3.2116811275482178, "geo/twonn_id/layer_14": 3.794393539428711, "geo/twonn_id/layer_21": 6.7741899490356445, "geo/twonn_id/layer_27": 6.171803951263428, "geo/tier2_time_s": 6.95764684677124} {"step": 28500, "timestamp": 1778225349.2056222, "eoc/jacobian_sigma/layer_0/attn": 927.059814453125, "eoc/jacobian_sigma/layer_0/mlp": 6651.15673828125, "eoc/jacobian_sigma/layer_0": 6651.15673828125, "eoc/jacobian_sigma/layer_7/attn": 1.1542267799377441, "eoc/jacobian_sigma/layer_7/mlp": 1.7130401134490967, "eoc/jacobian_sigma/layer_7": 1.7130401134490967, "eoc/jacobian_sigma/layer_14/attn": 1.6225416660308838, "eoc/jacobian_sigma/layer_14/mlp": 9.567486763000488, "eoc/jacobian_sigma/layer_14": 9.567486763000488, "eoc/jacobian_sigma/layer_21/attn": 1.0879448652267456, "eoc/jacobian_sigma/layer_21/mlp": 4.228311538696289, "eoc/jacobian_sigma/layer_21": 4.228311538696289, "eoc/jacobian_sigma/layer_27/attn": 3.0545389652252197, "eoc/jacobian_sigma/layer_27/mlp": 25.042200088500977, "eoc/jacobian_sigma/layer_27": 25.042200088500977, "eoc/layer0_sigma": 6651.15673828125, "eoc/sigma_max": 25.042200088500977, "eoc/sigma_min": 1.7130401134490967, "eoc/sigma_mean": 10.137759625911713, "eoc/time_s": 0.5962326526641846} {"step": 28510, "timestamp": 1778225359.5903988, "train/loss": 2.2169212818145754, "train/z_loss": 0.001474298571702093, "train/perplexity": 9.179027680028723, "train/grad_norm": 0.109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1085785.6816536959, "perf/iters_per_sec": 0.5177429588573913, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.9314603567123414, "data/tokens_consumed": 59791900672, "data/tokens_consumed_B": 59.791900672, "train/loss_slope": -1.0792915970578854e-05} {"step": 28520, "timestamp": 1778225369.9855158, "train/loss": 2.203411889076233, "train/z_loss": 0.0014816582668572664, "train/perplexity": 9.055858434231624, "train/grad_norm": 0.11328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2018671.7409411515, "perf/iters_per_sec": 0.9625776962953336, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0388771772384644, "data/tokens_consumed": 59812872192, "data/tokens_consumed_B": 59.812872192, "train/loss_slope": -1.021433044688822e-05} {"step": 28530, "timestamp": 1778225380.8862703, "train/loss": 2.1869026899337767, "train/z_loss": 0.0014730129274539649, "train/perplexity": 8.907580804181984, "train/grad_norm": 0.1943359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1924867.7574925986, "perf/iters_per_sec": 0.9178484713995927, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0895044565200807, "data/tokens_consumed": 59833843712, "data/tokens_consumed_B": 59.833843712, "train/loss_slope": -9.382167872529906e-06} {"step": 28540, "timestamp": 1778225391.2380164, "train/loss": 2.178844666481018, "train/z_loss": 0.0014793558162637054, "train/perplexity": 8.83609172631303, "train/grad_norm": 0.1640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027308.2218528425, "perf/iters_per_sec": 0.9666958913101399, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344514846801758, "data/tokens_consumed": 59854815232, "data/tokens_consumed_B": 59.854815232, "train/loss_slope": -9.604521760559994e-06} {"step": 28550, "timestamp": 1778225401.9118185, "grad/layer_0/attn": 0.003022746881470084, "grad/layer_0/mlp": 0.0030273606535047293, "grad/layer_0/attn_mlp_ratio": 0.9984759424428346, "grad/layer_4/attn": 0.002115771872922778, "grad/layer_4/mlp": 0.0025257004890590906, "grad/layer_4/attn_mlp_ratio": 0.8376970263569424, "grad/layer_8/attn": 0.004184349440038204, "grad/layer_8/mlp": 0.0036680700723081827, "grad/layer_8/attn_mlp_ratio": 1.1407495613436276, "grad/layer_12/attn": 0.005545519758015871, "grad/layer_12/mlp": 0.0066056218929588795, "grad/layer_12/attn_mlp_ratio": 0.8395151529904373, "grad/layer_16/attn": 0.004983645398169756, "grad/layer_16/mlp": 0.004842648282647133, "grad/layer_16/attn_mlp_ratio": 1.0291156830687653, "grad/layer_20/attn": 0.005344782955944538, "grad/layer_20/mlp": 0.007026563864201307, "grad/layer_20/attn_mlp_ratio": 0.7606538534588103, "grad/layer_24/attn": 0.01687568984925747, "grad/layer_24/mlp": 0.013270074501633644, "grad/layer_24/attn_mlp_ratio": 1.2717102469928803, "grad/layer_27/attn": 0.007371821906417608, "grad/layer_27/mlp": 0.015047122724354267, "grad/layer_27/attn_mlp_ratio": 0.48991571295517505} {"step": 28550, "timestamp": 1778225401.9277503, "train/loss": 2.1519694566726684, "train/z_loss": 0.001488216209691018, "train/perplexity": 8.601782564068438, "train/grad_norm": 0.2294921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1962915.1807808804, "perf/iters_per_sec": 0.9359908965973284, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0683864593505858, "data/tokens_consumed": 59875786752, "data/tokens_consumed_B": 59.875786752, "train/loss_slope": -1.1052950724016049e-05} {"step": 28560, "timestamp": 1778225412.281731, "train/loss": 2.2105332136154177, "train/z_loss": 0.001468605501577258, "train/perplexity": 9.120578313241191, "train/grad_norm": 0.138671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026650.173053266, "perf/iters_per_sec": 0.9663821091905909, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034787368774414, "data/tokens_consumed": 59896758272, "data/tokens_consumed_B": 59.896758272, "train/loss_slope": -1.0627033379283204e-05} {"step": 28570, "timestamp": 1778225422.6336772, "train/loss": 2.198618757724762, "train/z_loss": 0.0014823622885160148, "train/perplexity": 9.012556374388485, "train/grad_norm": 0.181640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027095.5050342348, "perf/iters_per_sec": 0.966594460026853, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345600366592407, "data/tokens_consumed": 59917729792, "data/tokens_consumed_B": 59.917729792, "train/loss_slope": -9.29515452632453e-06} {"step": 28575, "timestamp": 1778225428.413394, "eos/sharpness": 44.05672550201415, "eos/L0_probe": 2.0236456394195557, "eos/L_plus": 2.302321434020996, "eos/L_minus": 2.185537099838257, "eos/grad_norm": 0.1588583141565323, "eos/embed_grad_frac": 0.08168142288923264, "eos/time_s": 0.6169357299804688} {"step": 28575, "timestamp": 1778225429.7923138, "geo/rankme_last": 440.2216491699219, "geo/layer_0/stable_rank_q_proj": 18.443328857421875, "geo/layer_0/stable_rank_k_proj": 15.893411636352539, "geo/layer_0/stable_rank_o_proj": 50.742210388183594, "geo/layer_0/stable_rank_gate_proj": 143.44105529785156, "geo/layer_0/stable_rank_down_proj": 51.49314498901367, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.052766311913728714, "geo/layer_0/attn_entropy_mean": 6.222782135009766, "geo/layer_0/attn_entropy_std": 0.3387058675289154, "geo/layer_7/stable_rank_q_proj": 42.15522766113281, "geo/layer_7/stable_rank_k_proj": 42.09808349609375, "geo/layer_7/stable_rank_o_proj": 105.50059509277344, "geo/layer_7/stable_rank_gate_proj": 97.06388092041016, "geo/layer_7/stable_rank_down_proj": 147.53726196289062, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5099997520446777, "geo/layer_7/attn_entropy_mean": 4.638972759246826, "geo/layer_7/attn_entropy_std": 0.8327423334121704, "geo/layer_14/stable_rank_q_proj": 55.629581451416016, "geo/layer_14/stable_rank_k_proj": 35.482337951660156, "geo/layer_14/stable_rank_o_proj": 53.011749267578125, "geo/layer_14/stable_rank_gate_proj": 81.13155364990234, "geo/layer_14/stable_rank_down_proj": 135.46075439453125, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3853195309638977, "geo/layer_14/attn_entropy_mean": 5.487139701843262, "geo/layer_14/attn_entropy_std": 0.42115679383277893, "geo/layer_21/stable_rank_q_proj": 45.55803298950195, "geo/layer_21/stable_rank_k_proj": 31.21375846862793, "geo/layer_21/stable_rank_o_proj": 80.74730682373047, "geo/layer_21/stable_rank_gate_proj": 80.34101104736328, "geo/layer_21/stable_rank_down_proj": 58.18206787109375, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1517602950334549, "geo/layer_21/attn_entropy_mean": 5.738422870635986, "geo/layer_21/attn_entropy_std": 0.2832499146461487, "geo/layer_27/stable_rank_q_proj": 41.63954162597656, "geo/layer_27/stable_rank_k_proj": 31.306474685668945, "geo/layer_27/stable_rank_o_proj": 118.62059020996094, "geo/layer_27/stable_rank_gate_proj": 88.96587371826172, "geo/layer_27/stable_rank_down_proj": 135.45469665527344, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07938934117555618, "geo/layer_27/attn_entropy_mean": 4.351471900939941, "geo/layer_27/attn_entropy_std": 0.6142563819885254, "attnres/final_alpha/block_0": 0.2403048574924469, "attnres/block_norm/0": 1.6762371063232422, "attnres/final_alpha/block_1": 0.005936278961598873, "attnres/block_norm/1": 34888.2734375, "attnres/final_alpha/block_2": 0.012567203491926193, "attnres/block_norm/2": 24035.162109375, "attnres/final_alpha/block_3": 0.014538518153131008, "attnres/block_norm/3": 38214.9296875, "attnres/final_alpha/block_4": 0.018276682123541832, "attnres/block_norm/4": 11070.740234375, "attnres/final_alpha/block_5": 0.5846033692359924, "attnres/block_norm/5": 5488.24609375, "attnres/final_alpha/block_6": 0.12377305328845978, "attnres/block_norm/6": 25117.64453125, "geo/tier1_time_s": 1.3596062660217285, "geo/step": 28575.0, "geo/rankme_slope": -3.5545272796618686e-06} {"step": 28580, "timestamp": 1778225434.9842246, "train/loss": 2.1772057056427, "train/z_loss": 0.0014907435281202198, "train/perplexity": 8.821621579251131, "train/grad_norm": 0.216796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1698683.2741020687, "perf/iters_per_sec": 0.8099953051100105, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.234575057029724, "data/tokens_consumed": 59938701312, "data/tokens_consumed_B": 59.938701312, "train/loss_slope": -9.684272803882583e-06} {"step": 28590, "timestamp": 1778225445.335622, "train/loss": 2.216936159133911, "train/z_loss": 0.0014818527270108461, "train/perplexity": 9.179164240370532, "train/grad_norm": 0.265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026804.5575541197, "perf/iters_per_sec": 0.9664557254572486, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347085475921631, "data/tokens_consumed": 59959672832, "data/tokens_consumed_B": 59.959672832, "train/loss_slope": -8.713556945484937e-06} {"step": 28600, "timestamp": 1778225455.681376, "grad/layer_0/attn": 0.0026731642428785563, "grad/layer_0/mlp": 0.002655190182849765, "grad/layer_0/attn_mlp_ratio": 1.0067693679601375, "grad/layer_4/attn": 0.001810133340768516, "grad/layer_4/mlp": 0.0025214802008122206, "grad/layer_4/attn_mlp_ratio": 0.7178851804574619, "grad/layer_8/attn": 0.004889745265245438, "grad/layer_8/mlp": 0.003638191381469369, "grad/layer_8/attn_mlp_ratio": 1.3440043741926961, "grad/layer_12/attn": 0.00487893633544445, "grad/layer_12/mlp": 0.006320821586996317, "grad/layer_12/attn_mlp_ratio": 0.7718832419338416, "grad/layer_16/attn": 0.003846029518172145, "grad/layer_16/mlp": 0.0046435752883553505, "grad/layer_16/attn_mlp_ratio": 0.8282474594504908, "grad/layer_20/attn": 0.004712022375315428, "grad/layer_20/mlp": 0.006416781339794397, "grad/layer_20/attn_mlp_ratio": 0.7343280146793355, "grad/layer_24/attn": 0.006990536116063595, "grad/layer_24/mlp": 0.008449935354292393, "grad/layer_24/attn_mlp_ratio": 0.8272886998814348, "grad/layer_27/attn": 0.012944916263222694, "grad/layer_27/mlp": 0.008073212578892708, "grad/layer_27/attn_mlp_ratio": 1.6034405109959486} {"step": 28600, "timestamp": 1778225455.6974773, "train/loss": 2.2297669410705567, "train/z_loss": 0.0014636334381066263, "train/perplexity": 9.297698915199438, "train/grad_norm": 0.1435546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025524.4287056793, "perf/iters_per_sec": 0.9658453124550244, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0353624820709229, "data/tokens_consumed": 59980644352, "data/tokens_consumed_B": 59.980644352, "train/loss_slope": -4.863364911816228e-06} {"step": 28610, "timestamp": 1778225466.049301, "train/loss": 2.185171961784363, "train/z_loss": 0.0014841695316135882, "train/perplexity": 8.892177536620327, "train/grad_norm": 0.11328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026899.7870667048, "perf/iters_per_sec": 0.9665011344274067, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346599340438842, "data/tokens_consumed": 60001615872, "data/tokens_consumed_B": 60.001615872, "train/loss_slope": -2.3420770355005264e-06} {"step": 28620, "timestamp": 1778225476.9042604, "train/loss": 2.1599605560302733, "train/z_loss": 0.0014855709974654018, "train/perplexity": 8.670795641117463, "train/grad_norm": 0.21875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1932762.042705723, "perf/iters_per_sec": 0.9216127599266639, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0850544214248656, "data/tokens_consumed": 60022587392, "data/tokens_consumed_B": 60.022587392, "train/loss_slope": -5.041115748213504e-06} {"step": 28630, "timestamp": 1778225487.2576666, "train/loss": 2.22531578540802, "train/z_loss": 0.0014722557389177382, "train/perplexity": 9.256405380178915, "train/grad_norm": 0.138671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026851.4005756832, "perf/iters_per_sec": 0.9664780619505325, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346846342086793, "data/tokens_consumed": 60043558912, "data/tokens_consumed_B": 60.043558912, "train/loss_slope": -3.0221156161217328e-06} {"step": 28640, "timestamp": 1778225497.608447, "train/loss": 2.193473243713379, "train/z_loss": 0.001485222694464028, "train/perplexity": 8.966301244550431, "train/grad_norm": 0.173828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027093.5429959486, "perf/iters_per_sec": 0.9665935244540923, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034561038017273, "data/tokens_consumed": 60064530432, "data/tokens_consumed_B": 60.064530432, "train/loss_slope": -5.6594833682472875e-06} {"step": 28650, "timestamp": 1778225507.9531658, "grad/layer_0/attn": 0.002757494803518057, "grad/layer_0/mlp": 0.002797025954350829, "grad/layer_0/attn_mlp_ratio": 0.9858666848057133, "grad/layer_4/attn": 0.001737118698656559, "grad/layer_4/mlp": 0.0024002871941775084, "grad/layer_4/attn_mlp_ratio": 0.7237128250732197, "grad/layer_8/attn": 0.006141645833849907, "grad/layer_8/mlp": 0.0038012643344700336, "grad/layer_8/attn_mlp_ratio": 1.6156849752827485, "grad/layer_12/attn": 0.0034786956384778023, "grad/layer_12/mlp": 0.006359729450196028, "grad/layer_12/attn_mlp_ratio": 0.5469879829041752, "grad/layer_16/attn": 0.0037108352407813072, "grad/layer_16/mlp": 0.0043170880526304245, "grad/layer_16/attn_mlp_ratio": 0.8595690218927488, "grad/layer_20/attn": 0.0034357786644250154, "grad/layer_20/mlp": 0.0054159220308065414, "grad/layer_20/attn_mlp_ratio": 0.6343847975364738, "grad/layer_24/attn": 0.011001098901033401, "grad/layer_24/mlp": 0.009104223921895027, "grad/layer_24/attn_mlp_ratio": 1.2083510768821728, "grad/layer_27/attn": 0.004365699365735054, "grad/layer_27/mlp": 0.007733848877251148, "grad/layer_27/attn_mlp_ratio": 0.564492451116722} {"step": 28650, "timestamp": 1778225508.5710669, "eos/sharpness": 24.314999580383297, "eos/L0_probe": 2.022608518600464, "eos/L_plus": 2.147963047027588, "eos/L_minus": 2.140403985977173, "eos/grad_norm": 0.11917176097631454, "eos/embed_grad_frac": 0.16737602651119232, "eos/time_s": 0.6149640083312988} {"step": 28650, "timestamp": 1778225508.5918298, "train/loss": 2.2235955476760862, "train/z_loss": 0.0014732402982190252, "train/perplexity": 9.240495850391888, "train/grad_norm": 0.119140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1910109.1680574494, "perf/iters_per_sec": 0.9108110275542495, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0979225873947143, "data/tokens_consumed": 60085501952, "data/tokens_consumed_B": 60.085501952, "train/loss_slope": -3.005685349895669e-06} {"step": 28650, "timestamp": 1778225509.955572, "geo/rankme_last": 440.1398010253906, "geo/layer_0/stable_rank_q_proj": 18.452489852905273, "geo/layer_0/stable_rank_k_proj": 15.907645225524902, "geo/layer_0/stable_rank_o_proj": 50.608619689941406, "geo/layer_0/stable_rank_gate_proj": 143.58131408691406, "geo/layer_0/stable_rank_down_proj": 51.48910903930664, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.054711245000362396, "geo/layer_0/attn_entropy_mean": 6.224733352661133, "geo/layer_0/attn_entropy_std": 0.3463191092014313, "geo/layer_7/stable_rank_q_proj": 42.143646240234375, "geo/layer_7/stable_rank_k_proj": 41.99751281738281, "geo/layer_7/stable_rank_o_proj": 105.86026000976562, "geo/layer_7/stable_rank_gate_proj": 96.93050384521484, "geo/layer_7/stable_rank_down_proj": 147.18531799316406, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5251580476760864, "geo/layer_7/attn_entropy_mean": 4.639205455780029, "geo/layer_7/attn_entropy_std": 0.8236970901489258, "geo/layer_14/stable_rank_q_proj": 55.61279296875, "geo/layer_14/stable_rank_k_proj": 35.43323516845703, "geo/layer_14/stable_rank_o_proj": 53.11526107788086, "geo/layer_14/stable_rank_gate_proj": 81.24261474609375, "geo/layer_14/stable_rank_down_proj": 135.29214477539062, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38514241576194763, "geo/layer_14/attn_entropy_mean": 5.476914405822754, "geo/layer_14/attn_entropy_std": 0.42229053378105164, "geo/layer_21/stable_rank_q_proj": 45.497154235839844, "geo/layer_21/stable_rank_k_proj": 31.173786163330078, "geo/layer_21/stable_rank_o_proj": 80.64356994628906, "geo/layer_21/stable_rank_gate_proj": 80.14067077636719, "geo/layer_21/stable_rank_down_proj": 58.225887298583984, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15011274814605713, "geo/layer_21/attn_entropy_mean": 5.725831031799316, "geo/layer_21/attn_entropy_std": 0.2902345359325409, "geo/layer_27/stable_rank_q_proj": 41.54914093017578, "geo/layer_27/stable_rank_k_proj": 31.274763107299805, "geo/layer_27/stable_rank_o_proj": 118.68061065673828, "geo/layer_27/stable_rank_gate_proj": 88.98897552490234, "geo/layer_27/stable_rank_down_proj": 135.34259033203125, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07998624444007874, "geo/layer_27/attn_entropy_mean": 4.334198951721191, "geo/layer_27/attn_entropy_std": 0.6171584129333496, "attnres/final_alpha/block_0": 0.2430337518453598, "attnres/block_norm/0": 1.6765110492706299, "attnres/final_alpha/block_1": 0.006030944641679525, "attnres/block_norm/1": 34815.0703125, "attnres/final_alpha/block_2": 0.012625083327293396, "attnres/block_norm/2": 24021.08203125, "attnres/final_alpha/block_3": 0.014554568566381931, "attnres/block_norm/3": 38061.8125, "attnres/final_alpha/block_4": 0.018741950392723083, "attnres/block_norm/4": 11073.080078125, "attnres/final_alpha/block_5": 0.5783311724662781, "attnres/block_norm/5": 5555.5166015625, "attnres/final_alpha/block_6": 0.12668251991271973, "attnres/block_norm/6": 24983.541015625, "geo/tier1_time_s": 1.3595902919769287, "geo/step": 28650.0, "geo/rankme_slope": -1.4582708083233295e-05} {"step": 28660, "timestamp": 1778225520.3066049, "train/loss": 2.203159141540527, "train/z_loss": 0.0014708617469295858, "train/perplexity": 9.053569877554384, "train/grad_norm": 0.1396484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1790757.8616834558, "perf/iters_per_sec": 0.8538998897950438, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.171097469329834, "data/tokens_consumed": 60106473472, "data/tokens_consumed_B": 60.106473472, "train/loss_slope": -3.1170121049008922e-06} {"step": 28670, "timestamp": 1778225530.6607866, "train/loss": 2.1720166206359863, "train/z_loss": 0.0014710460673086345, "train/perplexity": 8.775963997964837, "train/grad_norm": 0.09765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026259.0868749395, "perf/iters_per_sec": 0.9661956247687051, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349870920181274, "data/tokens_consumed": 60127444992, "data/tokens_consumed_B": 60.127444992, "train/loss_slope": -4.511925604048038e-06} {"step": 28680, "timestamp": 1778225541.0152564, "train/loss": 2.202313446998596, "train/z_loss": 0.0014751311275176704, "train/perplexity": 9.045916559564708, "train/grad_norm": 0.10400390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026647.5581523732, "perf/iters_per_sec": 0.9663808623086801, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347887039184571, "data/tokens_consumed": 60148416512, "data/tokens_consumed_B": 60.148416512, "train/loss_slope": -2.7750634660672766e-06} {"step": 28690, "timestamp": 1778225551.3664737, "train/loss": 2.201067066192627, "train/z_loss": 0.0014724764740094542, "train/perplexity": 9.034648926132144, "train/grad_norm": 0.1123046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027006.4229817144, "perf/iters_per_sec": 0.9665519823940822, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346055030822754, "data/tokens_consumed": 60169388032, "data/tokens_consumed_B": 60.169388032, "train/loss_slope": -2.0662351731885557e-06} {"step": 28700, "timestamp": 1778225561.7041407, "grad/layer_0/attn": 0.0029903845861554146, "grad/layer_0/mlp": 0.0027796097565442324, "grad/layer_0/attn_mlp_ratio": 1.0758288898403998, "grad/layer_4/attn": 0.0018899098504334688, "grad/layer_4/mlp": 0.0025218375958502293, "grad/layer_4/attn_mlp_ratio": 0.7494177176998268, "grad/layer_8/attn": 0.003886191872879863, "grad/layer_8/mlp": 0.0035473075695335865, "grad/layer_8/attn_mlp_ratio": 1.0955327913213782, "grad/layer_12/attn": 0.004203976131975651, "grad/layer_12/mlp": 0.0065214973874390125, "grad/layer_12/attn_mlp_ratio": 0.6446335584845175, "grad/layer_16/attn": 0.0033730496652424335, "grad/layer_16/mlp": 0.004383885767310858, "grad/layer_16/attn_mlp_ratio": 0.7694200458990311, "grad/layer_20/attn": 0.0041579301469028, "grad/layer_20/mlp": 0.005784288048744202, "grad/layer_20/attn_mlp_ratio": 0.7188317801569943, "grad/layer_24/attn": 0.008152399212121964, "grad/layer_24/mlp": 0.010823472402989864, "grad/layer_24/attn_mlp_ratio": 0.7532147570818842, "grad/layer_27/attn": 0.007454224396497011, "grad/layer_27/mlp": 0.010552733205258846, "grad/layer_27/attn_mlp_ratio": 0.7063785448630902} {"step": 28700, "timestamp": 1778225561.72018, "train/loss": 2.12697411775589, "train/z_loss": 0.0014892115956172346, "train/perplexity": 8.38944289964884, "train/grad_norm": 0.1357421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026336.19971668, "perf/iters_per_sec": 0.9662323950370216, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349477052688598, "data/tokens_consumed": 60190359552, "data/tokens_consumed_B": 60.190359552, "train/loss_slope": -7.261899109184348e-06} {"step": 28710, "timestamp": 1778225572.075311, "train/loss": 2.2179550886154176, "train/z_loss": 0.0014698819955810904, "train/perplexity": 9.188521928032946, "train/grad_norm": 0.10302734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026546.842908639, "perf/iters_per_sec": 0.9663328375380702, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348401308059691, "data/tokens_consumed": 60211331072, "data/tokens_consumed_B": 60.211331072, "train/loss_slope": -5.672973074285022e-06} {"step": 28720, "timestamp": 1778225582.4251537, "train/loss": 2.203861081600189, "train/z_loss": 0.001470071857329458, "train/perplexity": 9.059927171893142, "train/grad_norm": 0.12890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027187.9119961772, "perf/iters_per_sec": 0.9666385230999838, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345128774642944, "data/tokens_consumed": 60232302592, "data/tokens_consumed_B": 60.232302592, "train/loss_slope": -7.091296309291775e-06} {"step": 28725, "timestamp": 1778225588.203467, "eos/sharpness": 9.48045253753662, "eos/L0_probe": 2.0254135131835938, "eos/L_plus": 2.0838441848754883, "eos/L_minus": 2.0617873668670654, "eos/grad_norm": 0.11214505136013031, "eos/embed_grad_frac": 0.1888824999332428, "eos/time_s": 0.6145825386047363} {"step": 28725, "timestamp": 1778225589.5836232, "geo/rankme_last": 440.5299377441406, "geo/layer_0/stable_rank_q_proj": 18.41767692565918, "geo/layer_0/stable_rank_k_proj": 15.899432182312012, "geo/layer_0/stable_rank_o_proj": 50.547035217285156, "geo/layer_0/stable_rank_gate_proj": 143.7111053466797, "geo/layer_0/stable_rank_down_proj": 51.42053985595703, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05757972598075867, "geo/layer_0/attn_entropy_mean": 6.231163501739502, "geo/layer_0/attn_entropy_std": 0.3417299687862396, "geo/layer_7/stable_rank_q_proj": 42.18048095703125, "geo/layer_7/stable_rank_k_proj": 41.993160247802734, "geo/layer_7/stable_rank_o_proj": 105.35579681396484, "geo/layer_7/stable_rank_gate_proj": 96.85408020019531, "geo/layer_7/stable_rank_down_proj": 147.74554443359375, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5402958989143372, "geo/layer_7/attn_entropy_mean": 4.642236232757568, "geo/layer_7/attn_entropy_std": 0.8202973008155823, "geo/layer_14/stable_rank_q_proj": 55.610595703125, "geo/layer_14/stable_rank_k_proj": 35.37190628051758, "geo/layer_14/stable_rank_o_proj": 53.04420471191406, "geo/layer_14/stable_rank_gate_proj": 81.32616424560547, "geo/layer_14/stable_rank_down_proj": 135.26490783691406, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3694673478603363, "geo/layer_14/attn_entropy_mean": 5.498865127563477, "geo/layer_14/attn_entropy_std": 0.40353184938430786, "geo/layer_21/stable_rank_q_proj": 45.43518829345703, "geo/layer_21/stable_rank_k_proj": 31.127450942993164, "geo/layer_21/stable_rank_o_proj": 80.51553344726562, "geo/layer_21/stable_rank_gate_proj": 80.1690444946289, "geo/layer_21/stable_rank_down_proj": 58.1756706237793, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14615680277347565, "geo/layer_21/attn_entropy_mean": 5.728572368621826, "geo/layer_21/attn_entropy_std": 0.2945745289325714, "geo/layer_27/stable_rank_q_proj": 41.52106857299805, "geo/layer_27/stable_rank_k_proj": 31.340076446533203, "geo/layer_27/stable_rank_o_proj": 118.85476684570312, "geo/layer_27/stable_rank_gate_proj": 88.96293640136719, "geo/layer_27/stable_rank_down_proj": 135.2980194091797, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08548996597528458, "geo/layer_27/attn_entropy_mean": 4.308818340301514, "geo/layer_27/attn_entropy_std": 0.5907153487205505, "attnres/final_alpha/block_0": 0.240993469953537, "attnres/block_norm/0": 1.6769144535064697, "attnres/final_alpha/block_1": 0.005926020909100771, "attnres/block_norm/1": 34930.4296875, "attnres/final_alpha/block_2": 0.01267133466899395, "attnres/block_norm/2": 24032.2421875, "attnres/final_alpha/block_3": 0.014425015076994896, "attnres/block_norm/3": 38239.953125, "attnres/final_alpha/block_4": 0.018355106934905052, "attnres/block_norm/4": 11105.4150390625, "attnres/final_alpha/block_5": 0.5813307762145996, "attnres/block_norm/5": 5574.55029296875, "attnres/final_alpha/block_6": 0.12629824876785278, "attnres/block_norm/6": 25431.34375, "geo/tier1_time_s": 1.3603436946868896, "geo/step": 28725.0, "geo/rankme_slope": 5.061204169167648e-07} {"step": 28730, "timestamp": 1778225594.7624, "train/loss": 2.1389508485794066, "train/z_loss": 0.0014893927494995297, "train/perplexity": 8.490525107933058, "train/grad_norm": 0.1474609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1700455.0935375735, "perf/iters_per_sec": 0.8108401744544856, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2332886695861816, "data/tokens_consumed": 60253274112, "data/tokens_consumed_B": 60.253274112, "train/loss_slope": -6.327807971245929e-06} {"step": 28740, "timestamp": 1778225605.116151, "train/loss": 2.230257821083069, "train/z_loss": 0.0014713449636474252, "train/perplexity": 9.302264090144165, "train/grad_norm": 0.197265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026425.8764494113, "perf/iters_per_sec": 0.9662751562354142, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349019050598145, "data/tokens_consumed": 60274245632, "data/tokens_consumed_B": 60.274245632, "train/loss_slope": -4.962067249262643e-06} {"step": 28750, "timestamp": 1778225615.458878, "grad/layer_0/attn": 0.0027356224600225687, "grad/layer_0/mlp": 0.002744109835475683, "grad/layer_0/attn_mlp_ratio": 0.9969070206177278, "grad/layer_4/attn": 0.001672857441008091, "grad/layer_4/mlp": 0.0025243088603019714, "grad/layer_4/attn_mlp_ratio": 0.6626991653224472, "grad/layer_8/attn": 0.009020366705954075, "grad/layer_8/mlp": 0.0037780082784593105, "grad/layer_8/attn_mlp_ratio": 2.387598385801522, "grad/layer_12/attn": 0.004178803414106369, "grad/layer_12/mlp": 0.006150317844003439, "grad/layer_12/attn_mlp_ratio": 0.6794451038390142, "grad/layer_16/attn": 0.003301385324448347, "grad/layer_16/mlp": 0.0043928008526563644, "grad/layer_16/attn_mlp_ratio": 0.7515444838109433, "grad/layer_20/attn": 0.0031931460835039616, "grad/layer_20/mlp": 0.005480469204485416, "grad/layer_20/attn_mlp_ratio": 0.5826409940642445, "grad/layer_24/attn": 0.007448130287230015, "grad/layer_24/mlp": 0.009270680136978626, "grad/layer_24/attn_mlp_ratio": 0.8034070960101853, "grad/layer_27/attn": 0.004735079128295183, "grad/layer_27/mlp": 0.007724422961473465, "grad/layer_27/attn_mlp_ratio": 0.6130010086982403} {"step": 28750, "timestamp": 1778225615.4747157, "train/loss": 2.1707677125930784, "train/z_loss": 0.0014817313756793737, "train/perplexity": 8.765010467343505, "train/grad_norm": 0.11328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025580.1218098602, "perf/iters_per_sec": 0.9658718689965535, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035334014892578, "data/tokens_consumed": 60295217152, "data/tokens_consumed_B": 60.295217152, "train/loss_slope": -5.629691689929366e-06} {"step": 28760, "timestamp": 1778225625.8254492, "train/loss": 2.221816325187683, "train/z_loss": 0.0014777016127482056, "train/perplexity": 9.224069569708169, "train/grad_norm": 0.09423828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027067.3828483936, "perf/iters_per_sec": 0.966581050323674, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345743894577026, "data/tokens_consumed": 60316188672, "data/tokens_consumed_B": 60.316188672, "train/loss_slope": -3.940203745659179e-06} {"step": 28770, "timestamp": 1778225636.1778905, "train/loss": 2.1575260877609255, "train/z_loss": 0.0015049554756842553, "train/perplexity": 8.64971253774491, "train/grad_norm": 0.26171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026771.3997948181, "perf/iters_per_sec": 0.9664399146055308, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347254753112793, "data/tokens_consumed": 60337160192, "data/tokens_consumed_B": 60.337160192, "train/loss_slope": -8.017641905010065e-06} {"step": 28780, "timestamp": 1778225646.5314152, "train/loss": 2.138061547279358, "train/z_loss": 0.0014951506862416863, "train/perplexity": 8.48297782931622, "train/grad_norm": 0.1298828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026496.83919784, "perf/iters_per_sec": 0.9663089939107132, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034865665435791, "data/tokens_consumed": 60358131712, "data/tokens_consumed_B": 60.358131712, "train/loss_slope": -1.0313147401223149e-05} {"step": 28790, "timestamp": 1778225656.883975, "train/loss": 2.1279601097106933, "train/z_loss": 0.001487947814166546, "train/perplexity": 8.397718902218669, "train/grad_norm": 0.12890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026828.2356603015, "perf/iters_per_sec": 0.9664670160581119, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346964597702026, "data/tokens_consumed": 60379103232, "data/tokens_consumed_B": 60.379103232, "train/loss_slope": -1.3335149173009825e-05} {"step": 28800, "timestamp": 1778225667.2366812, "grad/layer_0/attn": 0.0024277069605886936, "grad/layer_0/mlp": 0.0025941741187125444, "grad/layer_0/attn_mlp_ratio": 0.9358303475059333, "grad/layer_4/attn": 0.0016136167105287313, "grad/layer_4/mlp": 0.0023296077270060778, "grad/layer_4/attn_mlp_ratio": 0.6926559448430809, "grad/layer_8/attn": 0.003630797378718853, "grad/layer_8/mlp": 0.003721376182511449, "grad/layer_8/attn_mlp_ratio": 0.9756598374052444, "grad/layer_12/attn": 0.00422645453363657, "grad/layer_12/mlp": 0.006410099565982819, "grad/layer_12/attn_mlp_ratio": 0.659343029573403, "grad/layer_16/attn": 0.003656307701021433, "grad/layer_16/mlp": 0.004410479683429003, "grad/layer_16/attn_mlp_ratio": 0.829004525711435, "grad/layer_20/attn": 0.0031627975404262543, "grad/layer_20/mlp": 0.005266102030873299, "grad/layer_20/attn_mlp_ratio": 0.60059555660417, "grad/layer_24/attn": 0.0061686900444328785, "grad/layer_24/mlp": 0.008089961484074593, "grad/layer_24/attn_mlp_ratio": 0.7625116609423941, "grad/layer_27/attn": 0.006699136458337307, "grad/layer_27/mlp": 0.006737225688993931, "grad/layer_27/attn_mlp_ratio": 0.9943464369683369} {"step": 28800, "timestamp": 1778225667.8365464, "eos/sharpness": 12.157416343688963, "eos/L0_probe": 2.0205001831054688, "eos/L_plus": 2.0867011547088623, "eos/L_minus": 2.075873374938965, "eos/grad_norm": 0.09024504572153091, "eos/embed_grad_frac": 0.2522119879722595, "eos/time_s": 0.5968003273010254} {"step": 28800, "timestamp": 1778225667.8563747, "train/loss": 2.1248581409454346, "train/z_loss": 0.0014947471325285732, "train/perplexity": 8.3717098010497, "train/grad_norm": 0.09033203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1912528.0377119754, "perf/iters_per_sec": 0.9119644344863774, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0965339899063111, "data/tokens_consumed": 60400074752, "data/tokens_consumed_B": 60.400074752, "train/loss_slope": -1.2266489445823427e-05} {"step": 28800, "timestamp": 1778225669.2159164, "geo/rankme_last": 440.2182922363281, "geo/layer_0/stable_rank_q_proj": 18.434906005859375, "geo/layer_0/stable_rank_k_proj": 15.910343170166016, "geo/layer_0/stable_rank_o_proj": 50.59401321411133, "geo/layer_0/stable_rank_gate_proj": 144.1187744140625, "geo/layer_0/stable_rank_down_proj": 51.51298904418945, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05624450370669365, "geo/layer_0/attn_entropy_mean": 6.231404781341553, "geo/layer_0/attn_entropy_std": 0.340270459651947, "geo/layer_7/stable_rank_q_proj": 42.18798828125, "geo/layer_7/stable_rank_k_proj": 42.101463317871094, "geo/layer_7/stable_rank_o_proj": 105.14551544189453, "geo/layer_7/stable_rank_gate_proj": 96.8707504272461, "geo/layer_7/stable_rank_down_proj": 148.1162872314453, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5294005870819092, "geo/layer_7/attn_entropy_mean": 4.6565985679626465, "geo/layer_7/attn_entropy_std": 0.7972931265830994, "geo/layer_14/stable_rank_q_proj": 55.67109298706055, "geo/layer_14/stable_rank_k_proj": 35.509334564208984, "geo/layer_14/stable_rank_o_proj": 52.966468811035156, "geo/layer_14/stable_rank_gate_proj": 81.3620376586914, "geo/layer_14/stable_rank_down_proj": 134.88670349121094, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3521731197834015, "geo/layer_14/attn_entropy_mean": 5.508993148803711, "geo/layer_14/attn_entropy_std": 0.403752863407135, "geo/layer_21/stable_rank_q_proj": 45.50300979614258, "geo/layer_21/stable_rank_k_proj": 31.089946746826172, "geo/layer_21/stable_rank_o_proj": 80.36336517333984, "geo/layer_21/stable_rank_gate_proj": 80.00489044189453, "geo/layer_21/stable_rank_down_proj": 58.12615203857422, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14771156013011932, "geo/layer_21/attn_entropy_mean": 5.737488269805908, "geo/layer_21/attn_entropy_std": 0.29045170545578003, "geo/layer_27/stable_rank_q_proj": 41.56005096435547, "geo/layer_27/stable_rank_k_proj": 31.326557159423828, "geo/layer_27/stable_rank_o_proj": 118.71179962158203, "geo/layer_27/stable_rank_gate_proj": 89.05253601074219, "geo/layer_27/stable_rank_down_proj": 135.6633758544922, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08337192982435226, "geo/layer_27/attn_entropy_mean": 4.344921112060547, "geo/layer_27/attn_entropy_std": 0.5894617438316345, "attnres/final_alpha/block_0": 0.24023857712745667, "attnres/block_norm/0": 1.6773545742034912, "attnres/final_alpha/block_1": 0.006109694018959999, "attnres/block_norm/1": 34831.69140625, "attnres/final_alpha/block_2": 0.012631013989448547, "attnres/block_norm/2": 23987.93359375, "attnres/final_alpha/block_3": 0.014505553990602493, "attnres/block_norm/3": 38251.08203125, "attnres/final_alpha/block_4": 0.01820388063788414, "attnres/block_norm/4": 11142.279296875, "attnres/final_alpha/block_5": 0.5828216075897217, "attnres/block_norm/5": 5533.701171875, "attnres/final_alpha/block_6": 0.12548966705799103, "attnres/block_norm/6": 25293.49609375, "geo/tier1_time_s": 1.355440616607666, "geo/step": 28800.0, "geo/rankme_slope": -1.5996847957933173e-05} {"step": 28810, "timestamp": 1778225679.936505, "train/loss": 2.1648925662040712, "train/z_loss": 0.0014766170410439373, "train/perplexity": 8.713665724377561, "train/grad_norm": 0.2578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1736605.6715464653, "perf/iters_per_sec": 0.828078113339646, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2076155424118042, "data/tokens_consumed": 60421046272, "data/tokens_consumed_B": 60.421046272, "train/loss_slope": -1.0166677482987283e-05} {"step": 28820, "timestamp": 1778225690.2850459, "train/loss": 2.173502516746521, "train/z_loss": 0.0014924823539331556, "train/perplexity": 8.789013861705442, "train/grad_norm": 0.21875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027507.5705050465, "perf/iters_per_sec": 0.9667909481549485, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343497753143311, "data/tokens_consumed": 60442017792, "data/tokens_consumed_B": 60.442017792, "train/loss_slope": -8.307108684520351e-06} {"step": 28830, "timestamp": 1778225700.6408749, "train/loss": 2.2077610015869142, "train/z_loss": 0.001473437761887908, "train/perplexity": 9.095329150520936, "train/grad_norm": 0.1962890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026177.3592710777, "perf/iters_per_sec": 0.9661566540103329, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035028839111328, "data/tokens_consumed": 60462989312, "data/tokens_consumed_B": 60.462989312, "train/loss_slope": -5.3191508397017294e-06} {"step": 28840, "timestamp": 1778225710.9897168, "train/loss": 2.2050805568695067, "train/z_loss": 0.0014602765790186823, "train/perplexity": 9.070982268359433, "train/grad_norm": 0.1337890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027481.2127381084, "perf/iters_per_sec": 0.9667783797922651, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343632221221923, "data/tokens_consumed": 60483960832, "data/tokens_consumed_B": 60.483960832, "train/loss_slope": -2.7316259329218174e-06} {"step": 28850, "timestamp": 1778225721.3348925, "grad/layer_0/attn": 0.003614646615460515, "grad/layer_0/mlp": 0.0028995894826948643, "grad/layer_0/attn_mlp_ratio": 1.2466062911224423, "grad/layer_4/attn": 0.0018110625678673387, "grad/layer_4/mlp": 0.002512817271053791, "grad/layer_4/attn_mlp_ratio": 0.7207298822149737, "grad/layer_8/attn": 0.004143026191741228, "grad/layer_8/mlp": 0.0036128005012869835, "grad/layer_8/attn_mlp_ratio": 1.146763037590661, "grad/layer_12/attn": 0.004622561391443014, "grad/layer_12/mlp": 0.006684901658445597, "grad/layer_12/attn_mlp_ratio": 0.69149279353327, "grad/layer_16/attn": 0.004018574021756649, "grad/layer_16/mlp": 0.0049917553551495075, "grad/layer_16/attn_mlp_ratio": 0.805042245731625, "grad/layer_20/attn": 0.003667807672172785, "grad/layer_20/mlp": 0.006915999110788107, "grad/layer_20/attn_mlp_ratio": 0.5303366238751815, "grad/layer_24/attn": 0.01947859488427639, "grad/layer_24/mlp": 0.011280340142548084, "grad/layer_24/attn_mlp_ratio": 1.7267737023397112, "grad/layer_27/attn": 0.007501411251723766, "grad/layer_27/mlp": 0.010033400729298592, "grad/layer_27/attn_mlp_ratio": 0.7476439324360343} {"step": 28850, "timestamp": 1778225721.3505037, "train/loss": 2.151703190803528, "train/z_loss": 0.0014836501912213862, "train/perplexity": 8.599492507853293, "train/grad_norm": 0.1806640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025174.6676738712, "perf/iters_per_sec": 0.9656785333985668, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035541296005249, "data/tokens_consumed": 60504932352, "data/tokens_consumed_B": 60.504932352, "train/loss_slope": -3.632616338664046e-06} {"step": 28860, "timestamp": 1778225731.7025585, "train/loss": 2.2061745166778564, "train/z_loss": 0.0014754345058463514, "train/perplexity": 9.080910988203318, "train/grad_norm": 0.1171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026752.813243708, "perf/iters_per_sec": 0.9664310518473187, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347349643707275, "data/tokens_consumed": 60525903872, "data/tokens_consumed_B": 60.525903872, "train/loss_slope": -4.632102574023117e-06} {"step": 28870, "timestamp": 1778225742.0589147, "train/loss": 2.143679702281952, "train/z_loss": 0.0014872281113639473, "train/perplexity": 8.530770641649104, "train/grad_norm": 0.1474609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026042.7168042597, "perf/iters_per_sec": 0.966092451479082, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350976228713988, "data/tokens_consumed": 60546875392, "data/tokens_consumed_B": 60.546875392, "train/loss_slope": -8.3368488664281e-06} {"step": 28875, "timestamp": 1778225747.826641, "eos/sharpness": 5.680632591247558, "eos/L0_probe": 2.0178308486938477, "eos/L_plus": 2.0554440021514893, "eos/L_minus": 2.0370240211486816, "eos/grad_norm": 0.09342367947101593, "eos/embed_grad_frac": 0.2544281780719757, "eos/time_s": 0.604344367980957} {"step": 28875, "timestamp": 1778225749.2037446, "geo/rankme_last": 440.51226806640625, "geo/layer_0/stable_rank_q_proj": 18.432430267333984, "geo/layer_0/stable_rank_k_proj": 15.952651977539062, "geo/layer_0/stable_rank_o_proj": 50.58662414550781, "geo/layer_0/stable_rank_gate_proj": 144.7117919921875, "geo/layer_0/stable_rank_down_proj": 51.533809661865234, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.052389852702617645, "geo/layer_0/attn_entropy_mean": 6.227156162261963, "geo/layer_0/attn_entropy_std": 0.3469257950782776, "geo/layer_7/stable_rank_q_proj": 42.10127258300781, "geo/layer_7/stable_rank_k_proj": 42.21488571166992, "geo/layer_7/stable_rank_o_proj": 104.98916625976562, "geo/layer_7/stable_rank_gate_proj": 96.91226196289062, "geo/layer_7/stable_rank_down_proj": 148.03050231933594, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5293923616409302, "geo/layer_7/attn_entropy_mean": 4.643956184387207, "geo/layer_7/attn_entropy_std": 0.8356555700302124, "geo/layer_14/stable_rank_q_proj": 55.516353607177734, "geo/layer_14/stable_rank_k_proj": 35.52913284301758, "geo/layer_14/stable_rank_o_proj": 52.899375915527344, "geo/layer_14/stable_rank_gate_proj": 81.31660461425781, "geo/layer_14/stable_rank_down_proj": 134.646728515625, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.379946231842041, "geo/layer_14/attn_entropy_mean": 5.4610795974731445, "geo/layer_14/attn_entropy_std": 0.4167577028274536, "geo/layer_21/stable_rank_q_proj": 45.379337310791016, "geo/layer_21/stable_rank_k_proj": 31.06344223022461, "geo/layer_21/stable_rank_o_proj": 80.52482604980469, "geo/layer_21/stable_rank_gate_proj": 79.8447265625, "geo/layer_21/stable_rank_down_proj": 58.20326232910156, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14810998737812042, "geo/layer_21/attn_entropy_mean": 5.737801551818848, "geo/layer_21/attn_entropy_std": 0.28497201204299927, "geo/layer_27/stable_rank_q_proj": 41.63631057739258, "geo/layer_27/stable_rank_k_proj": 31.20197868347168, "geo/layer_27/stable_rank_o_proj": 118.76535034179688, "geo/layer_27/stable_rank_gate_proj": 88.97846221923828, "geo/layer_27/stable_rank_down_proj": 135.6635284423828, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08424123376607895, "geo/layer_27/attn_entropy_mean": 4.326903343200684, "geo/layer_27/attn_entropy_std": 0.6177307367324829, "attnres/final_alpha/block_0": 0.24129830300807953, "attnres/block_norm/0": 1.6778059005737305, "attnres/final_alpha/block_1": 0.0060356371104717255, "attnres/block_norm/1": 34940.0625, "attnres/final_alpha/block_2": 0.012663519009947777, "attnres/block_norm/2": 24068.53125, "attnres/final_alpha/block_3": 0.014572828076779842, "attnres/block_norm/3": 38425.83984375, "attnres/final_alpha/block_4": 0.018543336540460587, "attnres/block_norm/4": 11118.6650390625, "attnres/final_alpha/block_5": 0.5819876194000244, "attnres/block_norm/5": 5559.0634765625, "attnres/final_alpha/block_6": 0.1248987540602684, "attnres/block_norm/6": 25308.45703125, "geo/tier1_time_s": 1.3572003841400146, "geo/step": 28875.0, "geo/rankme_slope": 2.7362702893657455e-06} {"step": 28880, "timestamp": 1778225754.3838727, "train/loss": 2.1744830131530763, "train/z_loss": 0.0014817762072198092, "train/perplexity": 8.797635684356338, "train/grad_norm": 0.2470703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1702283.6509495385, "perf/iters_per_sec": 0.8117120985744183, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2319638967514037, "data/tokens_consumed": 60567846912, "data/tokens_consumed_B": 60.567846912, "train/loss_slope": -1.2073847713178599e-05} {"step": 28890, "timestamp": 1778225764.736361, "train/loss": 2.1685447216033937, "train/z_loss": 0.0014859019778668881, "train/perplexity": 8.74554756898891, "train/grad_norm": 0.126953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027024.1733691108, "perf/iters_per_sec": 0.9665604464383654, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345964431762695, "data/tokens_consumed": 60588818432, "data/tokens_consumed_B": 60.588818432, "train/loss_slope": -1.719386417372891e-05} {"step": 28900, "timestamp": 1778225775.0747037, "grad/layer_0/attn": 0.0026564686559140682, "grad/layer_0/mlp": 0.0026571466587483883, "grad/layer_0/attn_mlp_ratio": 0.9997448003833106, "grad/layer_4/attn": 0.002186184050515294, "grad/layer_4/mlp": 0.002424932084977627, "grad/layer_4/attn_mlp_ratio": 0.9015444077399899, "grad/layer_8/attn": 0.004752031993120909, "grad/layer_8/mlp": 0.0038640624843537807, "grad/layer_8/attn_mlp_ratio": 1.229802025557935, "grad/layer_12/attn": 0.0045082177966833115, "grad/layer_12/mlp": 0.006395199801772833, "grad/layer_12/attn_mlp_ratio": 0.7049377448597932, "grad/layer_16/attn": 0.0034631199669092894, "grad/layer_16/mlp": 0.004341219086199999, "grad/layer_16/attn_mlp_ratio": 0.7977298123803475, "grad/layer_20/attn": 0.0054104323498904705, "grad/layer_20/mlp": 0.005726845469325781, "grad/layer_20/attn_mlp_ratio": 0.9447491266169132, "grad/layer_24/attn": 0.007844530045986176, "grad/layer_24/mlp": 0.008041203953325748, "grad/layer_24/attn_mlp_ratio": 0.9755417216084412, "grad/layer_27/attn": 0.00450528971850872, "grad/layer_27/mlp": 0.006960907019674778, "grad/layer_27/attn_mlp_ratio": 0.6472273858926613} {"step": 28900, "timestamp": 1778225775.0903473, "train/loss": 2.241828751564026, "train/z_loss": 0.001468082726933062, "train/perplexity": 9.410525073515883, "train/grad_norm": 0.09765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026399.8268980768, "perf/iters_per_sec": 0.9662627348413834, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349152088165283, "data/tokens_consumed": 60609789952, "data/tokens_consumed_B": 60.609789952, "train/loss_slope": -9.491199042179722e-06} {"step": 28910, "timestamp": 1778225785.4406197, "train/loss": 2.2185775518417357, "train/z_loss": 0.001472540234681219, "train/perplexity": 9.194243225501273, "train/grad_norm": 0.1904296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027210.5244872388, "perf/iters_per_sec": 0.9666493055759615, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345013380050658, "data/tokens_consumed": 60630761472, "data/tokens_consumed_B": 60.630761472, "train/loss_slope": -8.406681982990899e-06} {"step": 28920, "timestamp": 1778225795.8074455, "train/loss": 2.1855842590332033, "train/z_loss": 0.001471540390048176, "train/perplexity": 8.895844512845036, "train/grad_norm": 0.0791015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024595.3095686506, "perf/iters_per_sec": 0.9654022739260915, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0358376264572144, "data/tokens_consumed": 60651732992, "data/tokens_consumed_B": 60.651732992, "train/loss_slope": -6.813719184628226e-06} {"step": 28930, "timestamp": 1778225806.1595902, "train/loss": 2.1941739082336427, "train/z_loss": 0.0014694438199512661, "train/perplexity": 8.972585815141196, "train/grad_norm": 0.251953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026737.2624465432, "perf/iters_per_sec": 0.9664236366493908, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347429037094116, "data/tokens_consumed": 60672704512, "data/tokens_consumed_B": 60.672704512, "train/loss_slope": -7.76827745955514e-06} {"step": 28940, "timestamp": 1778225816.5233858, "train/loss": 2.1145502090454102, "train/z_loss": 0.0015011379262432456, "train/perplexity": 8.285858023839555, "train/grad_norm": 0.294921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024382.0434811306, "perf/iters_per_sec": 0.9653005807309774, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035946750640869, "data/tokens_consumed": 60693676032, "data/tokens_consumed_B": 60.693676032, "train/loss_slope": -1.1138567615478087e-05} {"step": 28950, "timestamp": 1778225826.8704345, "grad/layer_0/attn": 0.002930722665041685, "grad/layer_0/mlp": 0.002775040455162525, "grad/layer_0/attn_mlp_ratio": 1.0561008413334854, "grad/layer_4/attn": 0.003096667118370533, "grad/layer_4/mlp": 0.00251209014095366, "grad/layer_4/attn_mlp_ratio": 1.2327053653912334, "grad/layer_8/attn": 0.004602754022926092, "grad/layer_8/mlp": 0.0036821067333221436, "grad/layer_8/attn_mlp_ratio": 1.2500327207435489, "grad/layer_12/attn": 0.005423699505627155, "grad/layer_12/mlp": 0.006705746985971928, "grad/layer_12/attn_mlp_ratio": 0.808813609593665, "grad/layer_16/attn": 0.0037598845083266497, "grad/layer_16/mlp": 0.004528946243226528, "grad/layer_16/attn_mlp_ratio": 0.8301896784336861, "grad/layer_20/attn": 0.0033209556713700294, "grad/layer_20/mlp": 0.005555340088903904, "grad/layer_20/attn_mlp_ratio": 0.5977951949735181, "grad/layer_24/attn": 0.007632838096469641, "grad/layer_24/mlp": 0.008158829063177109, "grad/layer_24/attn_mlp_ratio": 0.9355310601328191, "grad/layer_27/attn": 0.003875791560858488, "grad/layer_27/mlp": 0.007170379627496004, "grad/layer_27/attn_mlp_ratio": 0.540528076357815} {"step": 28950, "timestamp": 1778225827.4842508, "eos/sharpness": 16.89343452453613, "eos/L0_probe": 2.020644187927246, "eos/L_plus": 2.10613751411438, "eos/L_minus": 2.1040852069854736, "eos/grad_norm": 0.11072707176208496, "eos/embed_grad_frac": 0.18145370483398438, "eos/time_s": 0.6108977794647217} {"step": 28950, "timestamp": 1778225827.5042381, "train/loss": 2.1909976243972777, "train/z_loss": 0.0014792280737310648, "train/perplexity": 8.944131549181085, "train/grad_norm": 0.11083984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1910908.5922778994, "perf/iters_per_sec": 0.9111922227277276, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.097463274002075, "data/tokens_consumed": 60714647552, "data/tokens_consumed_B": 60.714647552, "train/loss_slope": -7.501150393607601e-06} {"step": 28950, "timestamp": 1778225828.8659868, "geo/rankme_last": 439.4846496582031, "geo/layer_0/stable_rank_q_proj": 18.426082611083984, "geo/layer_0/stable_rank_k_proj": 15.952412605285645, "geo/layer_0/stable_rank_o_proj": 50.577537536621094, "geo/layer_0/stable_rank_gate_proj": 144.36082458496094, "geo/layer_0/stable_rank_down_proj": 51.56718826293945, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05147426202893257, "geo/layer_0/attn_entropy_mean": 6.224881172180176, "geo/layer_0/attn_entropy_std": 0.34610798954963684, "geo/layer_7/stable_rank_q_proj": 42.24420166015625, "geo/layer_7/stable_rank_k_proj": 41.98423767089844, "geo/layer_7/stable_rank_o_proj": 105.32347869873047, "geo/layer_7/stable_rank_gate_proj": 96.92450714111328, "geo/layer_7/stable_rank_down_proj": 148.07247924804688, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5339091420173645, "geo/layer_7/attn_entropy_mean": 4.672231197357178, "geo/layer_7/attn_entropy_std": 0.8222737312316895, "geo/layer_14/stable_rank_q_proj": 55.5651969909668, "geo/layer_14/stable_rank_k_proj": 35.52630615234375, "geo/layer_14/stable_rank_o_proj": 52.84111022949219, "geo/layer_14/stable_rank_gate_proj": 81.184814453125, "geo/layer_14/stable_rank_down_proj": 134.6967010498047, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3689180910587311, "geo/layer_14/attn_entropy_mean": 5.516946792602539, "geo/layer_14/attn_entropy_std": 0.4267079532146454, "geo/layer_21/stable_rank_q_proj": 45.434505462646484, "geo/layer_21/stable_rank_k_proj": 31.07926368713379, "geo/layer_21/stable_rank_o_proj": 80.47355651855469, "geo/layer_21/stable_rank_gate_proj": 79.8213119506836, "geo/layer_21/stable_rank_down_proj": 58.28562545776367, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15086854994297028, "geo/layer_21/attn_entropy_mean": 5.741123199462891, "geo/layer_21/attn_entropy_std": 0.2858273684978485, "geo/layer_27/stable_rank_q_proj": 41.636592864990234, "geo/layer_27/stable_rank_k_proj": 31.194438934326172, "geo/layer_27/stable_rank_o_proj": 118.79508972167969, "geo/layer_27/stable_rank_gate_proj": 88.92821502685547, "geo/layer_27/stable_rank_down_proj": 135.504150390625, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08569516986608505, "geo/layer_27/attn_entropy_mean": 4.33128023147583, "geo/layer_27/attn_entropy_std": 0.6099525094032288, "attnres/final_alpha/block_0": 0.24082010984420776, "attnres/block_norm/0": 1.6783336400985718, "attnres/final_alpha/block_1": 0.005975707899779081, "attnres/block_norm/1": 34976.89453125, "attnres/final_alpha/block_2": 0.012553281150758266, "attnres/block_norm/2": 24183.685546875, "attnres/final_alpha/block_3": 0.014595047570765018, "attnres/block_norm/3": 38441.609375, "attnres/final_alpha/block_4": 0.018265539780259132, "attnres/block_norm/4": 11153.009765625, "attnres/final_alpha/block_5": 0.5823615789413452, "attnres/block_norm/5": 5504.50830078125, "attnres/final_alpha/block_6": 0.12542879581451416, "attnres/block_norm/6": 25435.5703125, "geo/tier1_time_s": 1.357954740524292, "geo/step": 28950.0, "geo/rankme_slope": -1.1702415341136452e-05} {"step": 28960, "timestamp": 1778225839.217143, "train/loss": 2.2338669061660767, "train/z_loss": 0.0014617542619816959, "train/perplexity": 9.335897408957168, "train/grad_norm": 0.232421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1791013.3905561082, "perf/iters_per_sec": 0.8540217354565183, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1709303855895996, "data/tokens_consumed": 60735619072, "data/tokens_consumed_B": 60.735619072, "train/loss_slope": -4.01256693900969e-06} {"step": 28970, "timestamp": 1778225849.5719004, "train/loss": 2.178455185890198, "train/z_loss": 0.0014827341306954621, "train/perplexity": 8.83265091019597, "train/grad_norm": 0.30078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026278.3645351408, "perf/iters_per_sec": 0.9662048170734123, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349772453308106, "data/tokens_consumed": 60756590592, "data/tokens_consumed_B": 60.756590592, "train/loss_slope": -2.359329677722516e-06} {"step": 28980, "timestamp": 1778225859.9246984, "train/loss": 2.201137137413025, "train/z_loss": 0.001460600970312953, "train/perplexity": 9.035282017188738, "train/grad_norm": 0.1953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026677.5832240612, "perf/iters_per_sec": 0.9663951793785387, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347733736038207, "data/tokens_consumed": 60777562112, "data/tokens_consumed_B": 60.777562112, "train/loss_slope": -2.229694414518867e-08} {"step": 28990, "timestamp": 1778225870.275216, "train/loss": 2.20000741481781, "train/z_loss": 0.0014758517732843756, "train/perplexity": 9.025080418513049, "train/grad_norm": 0.205078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027090.4130855922, "perf/iters_per_sec": 0.9665920319965325, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034562635421753, "data/tokens_consumed": 60798533632, "data/tokens_consumed_B": 60.798533632, "train/loss_slope": 1.483314980839745e-06} {"step": 29000, "timestamp": 1778225880.6155052, "grad/layer_0/attn": 0.0028233325574547052, "grad/layer_0/mlp": 0.002880273386836052, "grad/layer_0/attn_mlp_ratio": 0.9802307212694945, "grad/layer_4/attn": 0.001824028673581779, "grad/layer_4/mlp": 0.0026094764471054077, "grad/layer_4/attn_mlp_ratio": 0.6990017502188721, "grad/layer_8/attn": 0.009103130549192429, "grad/layer_8/mlp": 0.003766290144994855, "grad/layer_8/attn_mlp_ratio": 2.4170018657722605, "grad/layer_12/attn": 0.005085399840027094, "grad/layer_12/mlp": 0.006294384133070707, "grad/layer_12/attn_mlp_ratio": 0.8079265026924148, "grad/layer_16/attn": 0.004029074218124151, "grad/layer_16/mlp": 0.004791284445673227, "grad/layer_16/attn_mlp_ratio": 0.8409173322345486, "grad/layer_20/attn": 0.0064185974188148975, "grad/layer_20/mlp": 0.007115280721336603, "grad/layer_20/attn_mlp_ratio": 0.9020863097303821, "grad/layer_24/attn": 0.016472166404128075, "grad/layer_24/mlp": 0.012320260517299175, "grad/layer_24/attn_mlp_ratio": 1.3369982109793286, "grad/layer_27/attn": 0.0058294497430324554, "grad/layer_27/mlp": 0.010819056071341038, "grad/layer_27/attn_mlp_ratio": 0.5388131506770694} {"step": 29000, "timestamp": 1778225880.6315138, "train/loss": 2.1880001544952394, "train/z_loss": 0.001486606616526842, "train/perplexity": 8.917361924677778, "train/grad_norm": 0.1884765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026148.7958381372, "perf/iters_per_sec": 0.966143033904141, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035043430328369, "data/tokens_consumed": 60819505152, "data/tokens_consumed_B": 60.819505152, "train/loss_slope": -9.508181934487483e-07} {"step": 29000, "timestamp": 1778225887.6370456, "geo/ww_alpha_mean": 7.759309896229992, "geo/ww_alpha_std": 4.103030655032098, "geo/ww_alpha_min": 1.3710607519955422, "geo/ww_alpha_max": 22.033898626697795, "geo/ww_alpha_healthy_frac": 0.15228426395939088, "geo/ww_alpha_by_type/q_proj": 4.097857720205668, "geo/ww_alpha_by_type/k_proj": 4.611355749633094, "geo/ww_alpha_by_type/v_proj": 8.204497379505938, "geo/ww_alpha_by_type/o_proj": 7.473913256190146, "geo/ww_alpha_by_type/gate_proj": 8.450825741495601, "geo/ww_alpha_by_type/up_proj": 12.39840395192597, "geo/ww_alpha_by_type/down_proj": 9.197407345231325, "geo/twonn_id/layer_0": 0.7497855424880981, "geo/twonn_id/layer_7": 3.326157808303833, "geo/twonn_id/layer_14": 4.032294273376465, "geo/twonn_id/layer_21": 7.196681022644043, "geo/twonn_id/layer_27": 5.416452884674072, "geo/tier2_time_s": 6.999233245849609} {"step": 29000, "timestamp": 1778225888.2656107, "eoc/jacobian_sigma/layer_0/attn": 972.061279296875, "eoc/jacobian_sigma/layer_0/mlp": 7031.3251953125, "eoc/jacobian_sigma/layer_0": 7031.3251953125, "eoc/jacobian_sigma/layer_7/attn": 1.1427087783813477, "eoc/jacobian_sigma/layer_7/mlp": 1.7139756679534912, "eoc/jacobian_sigma/layer_7": 1.7139756679534912, "eoc/jacobian_sigma/layer_14/attn": 1.6129387617111206, "eoc/jacobian_sigma/layer_14/mlp": 8.143218994140625, "eoc/jacobian_sigma/layer_14": 8.143218994140625, "eoc/jacobian_sigma/layer_21/attn": 1.0882445573806763, "eoc/jacobian_sigma/layer_21/mlp": 3.9835872650146484, "eoc/jacobian_sigma/layer_21": 3.9835872650146484, "eoc/jacobian_sigma/layer_27/attn": 3.264007568359375, "eoc/jacobian_sigma/layer_27/mlp": 27.295217514038086, "eoc/jacobian_sigma/layer_27": 27.295217514038086, "eoc/layer0_sigma": 7031.3251953125, "eoc/sigma_max": 27.295217514038086, "eoc/sigma_min": 1.7139756679534912, "eoc/sigma_mean": 10.283999860286713, "eoc/time_s": 0.6228797435760498} {"step": 29010, "timestamp": 1778225898.6308374, "train/loss": 2.1480315804481505, "train/z_loss": 0.0014895817497745157, "train/perplexity": 8.567976414922018, "train/grad_norm": 0.119140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1165460.0593281265, "perf/iters_per_sec": 0.5557346626892693, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.79941987991333, "data/tokens_consumed": 60840476672, "data/tokens_consumed_B": 60.840476672, "train/loss_slope": -5.852067288142014e-06} {"step": 29020, "timestamp": 1778225908.9820614, "train/loss": 2.202382135391235, "train/z_loss": 0.0014774470939300955, "train/perplexity": 9.046537930373368, "train/grad_norm": 0.11376953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027078.8278298664, "perf/iters_per_sec": 0.9665865077161152, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345685482025146, "data/tokens_consumed": 60861448192, "data/tokens_consumed_B": 60.861448192, "train/loss_slope": -6.288959269690948e-06} {"step": 29025, "timestamp": 1778225914.7673218, "eos/sharpness": 47.10547924041747, "eos/L0_probe": 2.021214246749878, "eos/L_plus": 2.3030145168304443, "eos/L_minus": 2.2104687690734863, "eos/grad_norm": 0.1384100466966629, "eos/embed_grad_frac": 0.12484721839427948, "eos/time_s": 0.621654748916626} {"step": 29025, "timestamp": 1778225916.1434634, "geo/rankme_last": 440.7480163574219, "geo/layer_0/stable_rank_q_proj": 18.445579528808594, "geo/layer_0/stable_rank_k_proj": 15.94353199005127, "geo/layer_0/stable_rank_o_proj": 50.6866340637207, "geo/layer_0/stable_rank_gate_proj": 144.42726135253906, "geo/layer_0/stable_rank_down_proj": 51.61033248901367, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05074375122785568, "geo/layer_0/attn_entropy_mean": 6.227540016174316, "geo/layer_0/attn_entropy_std": 0.3455590605735779, "geo/layer_7/stable_rank_q_proj": 42.2319221496582, "geo/layer_7/stable_rank_k_proj": 42.02164077758789, "geo/layer_7/stable_rank_o_proj": 105.66573333740234, "geo/layer_7/stable_rank_gate_proj": 96.68527221679688, "geo/layer_7/stable_rank_down_proj": 147.73814392089844, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5300070643424988, "geo/layer_7/attn_entropy_mean": 4.6766357421875, "geo/layer_7/attn_entropy_std": 0.8065875172615051, "geo/layer_14/stable_rank_q_proj": 55.56332778930664, "geo/layer_14/stable_rank_k_proj": 35.491397857666016, "geo/layer_14/stable_rank_o_proj": 52.81591033935547, "geo/layer_14/stable_rank_gate_proj": 81.1542739868164, "geo/layer_14/stable_rank_down_proj": 134.38966369628906, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3854949474334717, "geo/layer_14/attn_entropy_mean": 5.458817005157471, "geo/layer_14/attn_entropy_std": 0.43383246660232544, "geo/layer_21/stable_rank_q_proj": 45.34079360961914, "geo/layer_21/stable_rank_k_proj": 31.074796676635742, "geo/layer_21/stable_rank_o_proj": 80.41124725341797, "geo/layer_21/stable_rank_gate_proj": 79.85843658447266, "geo/layer_21/stable_rank_down_proj": 58.28797149658203, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.150019571185112, "geo/layer_21/attn_entropy_mean": 5.7342095375061035, "geo/layer_21/attn_entropy_std": 0.2822554111480713, "geo/layer_27/stable_rank_q_proj": 41.5811653137207, "geo/layer_27/stable_rank_k_proj": 31.171680450439453, "geo/layer_27/stable_rank_o_proj": 118.75521850585938, "geo/layer_27/stable_rank_gate_proj": 88.77082061767578, "geo/layer_27/stable_rank_down_proj": 135.5518798828125, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07306695729494095, "geo/layer_27/attn_entropy_mean": 4.324681758880615, "geo/layer_27/attn_entropy_std": 0.6080275774002075, "attnres/final_alpha/block_0": 0.24041447043418884, "attnres/block_norm/0": 1.678688406944275, "attnres/final_alpha/block_1": 0.0059896367602050304, "attnres/block_norm/1": 35029.2578125, "attnres/final_alpha/block_2": 0.012486634775996208, "attnres/block_norm/2": 24110.9453125, "attnres/final_alpha/block_3": 0.014286108314990997, "attnres/block_norm/3": 38686.6171875, "attnres/final_alpha/block_4": 0.01843259483575821, "attnres/block_norm/4": 11175.5029296875, "attnres/final_alpha/block_5": 0.5834393501281738, "attnres/block_norm/5": 5565.91064453125, "attnres/final_alpha/block_6": 0.12495123594999313, "attnres/block_norm/6": 25426.703125, "geo/tier1_time_s": 1.355942964553833, "geo/step": 29025.0, "geo/rankme_slope": 2.0829640449929974e-05} {"step": 29030, "timestamp": 1778225921.3216128, "train/loss": 2.233008027076721, "train/z_loss": 0.0014668167917989195, "train/perplexity": 9.327882444327708, "train/grad_norm": 0.283203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1700220.5763922618, "perf/iters_per_sec": 0.8107283479653653, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2334587812423705, "data/tokens_consumed": 60882419712, "data/tokens_consumed_B": 60.882419712, "train/loss_slope": -3.4288793221057475e-06} {"step": 29040, "timestamp": 1778225931.674214, "train/loss": 2.214187216758728, "train/z_loss": 0.0014731668634340167, "train/perplexity": 9.153965897086227, "train/grad_norm": 0.11474609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027048.6507290155, "perf/iters_per_sec": 0.9665721181531026, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345839500427245, "data/tokens_consumed": 60903391232, "data/tokens_consumed_B": 60.903391232, "train/loss_slope": -3.800635219084643e-07} {"step": 29050, "timestamp": 1778225942.0117202, "grad/layer_0/attn": 0.0026690715458244085, "grad/layer_0/mlp": 0.0026033823378384113, "grad/layer_0/attn_mlp_ratio": 1.0252322159937972, "grad/layer_4/attn": 0.0026640219148248434, "grad/layer_4/mlp": 0.0025626281276345253, "grad/layer_4/attn_mlp_ratio": 1.039566288272689, "grad/layer_8/attn": 0.005404218565672636, "grad/layer_8/mlp": 0.0037437276914715767, "grad/layer_8/attn_mlp_ratio": 1.4435393988803755, "grad/layer_12/attn": 0.004440666642040014, "grad/layer_12/mlp": 0.007116756401956081, "grad/layer_12/attn_mlp_ratio": 0.6239733846197312, "grad/layer_16/attn": 0.0052043842151761055, "grad/layer_16/mlp": 0.004759531002491713, "grad/layer_16/attn_mlp_ratio": 1.0934657433904578, "grad/layer_20/attn": 0.005112855229526758, "grad/layer_20/mlp": 0.006885859649628401, "grad/layer_20/attn_mlp_ratio": 0.7425151564846607, "grad/layer_24/attn": 0.016216928139328957, "grad/layer_24/mlp": 0.01199035719037056, "grad/layer_24/attn_mlp_ratio": 1.3524974899916244, "grad/layer_27/attn": 0.005890762899070978, "grad/layer_27/mlp": 0.011389225721359253, "grad/layer_27/attn_mlp_ratio": 0.5172224162965927} {"step": 29050, "timestamp": 1778225942.027577, "train/loss": 2.177227592468262, "train/z_loss": 0.0014811525237746538, "train/perplexity": 8.821814658656749, "train/grad_norm": 0.1708984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026502.5351075872, "perf/iters_per_sec": 0.9663117099321304, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034862756729126, "data/tokens_consumed": 60924362752, "data/tokens_consumed_B": 60.924362752, "train/loss_slope": -4.4668187116524466e-06} {"step": 29060, "timestamp": 1778225952.3883307, "train/loss": 2.178267407417297, "train/z_loss": 0.0014889792073518038, "train/perplexity": 8.83099248420961, "train/grad_norm": 0.1357421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025343.7037558607, "perf/iters_per_sec": 0.9657591360835365, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354548692703247, "data/tokens_consumed": 60945334272, "data/tokens_consumed_B": 60.945334272, "train/loss_slope": -9.948191534031427e-06} {"step": 29070, "timestamp": 1778225962.7450175, "train/loss": 2.2113373994827272, "train/z_loss": 0.0014619911438785494, "train/perplexity": 9.127915903420107, "train/grad_norm": 0.140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026370.697010241, "perf/iters_per_sec": 0.966248844628449, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349300861358643, "data/tokens_consumed": 60966305792, "data/tokens_consumed_B": 60.966305792, "train/loss_slope": -6.3800638729910865e-06} {"step": 29080, "timestamp": 1778225973.098598, "train/loss": 2.1846605777740478, "train/z_loss": 0.0014710123767144978, "train/perplexity": 8.887631381725752, "train/grad_norm": 0.10546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026616.2732542888, "perf/iters_per_sec": 0.9663659445067829, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034804677963257, "data/tokens_consumed": 60987277312, "data/tokens_consumed_B": 60.987277312, "train/loss_slope": -4.238073331544991e-06} {"step": 29090, "timestamp": 1778225983.4480846, "train/loss": 2.1493244647979735, "train/z_loss": 0.001475822855718434, "train/perplexity": 8.579060981525833, "train/grad_norm": 0.1796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027174.0363991428, "perf/iters_per_sec": 0.9666319066997255, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345199584960938, "data/tokens_consumed": 61008248832, "data/tokens_consumed_B": 61.008248832, "train/loss_slope": -4.1911814687060645e-06} {"step": 29100, "timestamp": 1778225993.7958388, "grad/layer_0/attn": 0.0028093024156987667, "grad/layer_0/mlp": 0.0027678015176206827, "grad/layer_0/attn_mlp_ratio": 1.0149941375183384, "grad/layer_4/attn": 0.0020904913544654846, "grad/layer_4/mlp": 0.0024724092800170183, "grad/layer_4/attn_mlp_ratio": 0.8455279984624129, "grad/layer_8/attn": 0.01068985927850008, "grad/layer_8/mlp": 0.0036699469201266766, "grad/layer_8/attn_mlp_ratio": 2.9128102449095015, "grad/layer_12/attn": 0.004823256749659777, "grad/layer_12/mlp": 0.006973701529204845, "grad/layer_12/attn_mlp_ratio": 0.6916350893850521, "grad/layer_16/attn": 0.005071099381893873, "grad/layer_16/mlp": 0.00454188184812665, "grad/layer_16/attn_mlp_ratio": 1.1165194163590062, "grad/layer_20/attn": 0.005329248961061239, "grad/layer_20/mlp": 0.007029073312878609, "grad/layer_20/attn_mlp_ratio": 0.7581723291290478, "grad/layer_24/attn": 0.020533140748739243, "grad/layer_24/mlp": 0.014223617501556873, "grad/layer_24/attn_mlp_ratio": 1.4435948240405274, "grad/layer_27/attn": 0.00592966889962554, "grad/layer_27/mlp": 0.013560529798269272, "grad/layer_27/attn_mlp_ratio": 0.4372741289691299} {"step": 29100, "timestamp": 1778225994.4087734, "eos/sharpness": 58.81385803222655, "eos/L0_probe": 2.016242027282715, "eos/L_plus": 2.270939588546753, "eos/L_minus": 2.3496830463409424, "eos/grad_norm": 0.214507058262825, "eos/embed_grad_frac": 0.07074219733476639, "eos/time_s": 0.6100468635559082} {"step": 29100, "timestamp": 1778225994.428922, "train/loss": 2.218778896331787, "train/z_loss": 0.001461022556759417, "train/perplexity": 9.196094622092916, "train/grad_norm": 0.2138671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1910597.0422741408, "perf/iters_per_sec": 0.9110436641092972, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0976422309875489, "data/tokens_consumed": 61029220352, "data/tokens_consumed_B": 61.029220352, "train/loss_slope": -2.9525107533374148e-06} {"step": 29100, "timestamp": 1778225995.790316, "geo/rankme_last": 440.43646240234375, "geo/layer_0/stable_rank_q_proj": 18.46742057800293, "geo/layer_0/stable_rank_k_proj": 15.986593246459961, "geo/layer_0/stable_rank_o_proj": 50.624176025390625, "geo/layer_0/stable_rank_gate_proj": 144.33187866210938, "geo/layer_0/stable_rank_down_proj": 51.679931640625, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05248643830418587, "geo/layer_0/attn_entropy_mean": 6.222538948059082, "geo/layer_0/attn_entropy_std": 0.34096699953079224, "geo/layer_7/stable_rank_q_proj": 42.15778350830078, "geo/layer_7/stable_rank_k_proj": 41.924434661865234, "geo/layer_7/stable_rank_o_proj": 105.9538803100586, "geo/layer_7/stable_rank_gate_proj": 96.4903335571289, "geo/layer_7/stable_rank_down_proj": 147.52471923828125, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5392265915870667, "geo/layer_7/attn_entropy_mean": 4.650449752807617, "geo/layer_7/attn_entropy_std": 0.8248550891876221, "geo/layer_14/stable_rank_q_proj": 55.52070999145508, "geo/layer_14/stable_rank_k_proj": 35.39438247680664, "geo/layer_14/stable_rank_o_proj": 52.67832565307617, "geo/layer_14/stable_rank_gate_proj": 81.01811981201172, "geo/layer_14/stable_rank_down_proj": 134.52235412597656, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3672637939453125, "geo/layer_14/attn_entropy_mean": 5.51450252532959, "geo/layer_14/attn_entropy_std": 0.4140934944152832, "geo/layer_21/stable_rank_q_proj": 45.351219177246094, "geo/layer_21/stable_rank_k_proj": 31.017333984375, "geo/layer_21/stable_rank_o_proj": 80.41133117675781, "geo/layer_21/stable_rank_gate_proj": 79.85407257080078, "geo/layer_21/stable_rank_down_proj": 58.21174240112305, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14776577055454254, "geo/layer_21/attn_entropy_mean": 5.733944416046143, "geo/layer_21/attn_entropy_std": 0.2898227572441101, "geo/layer_27/stable_rank_q_proj": 41.691566467285156, "geo/layer_27/stable_rank_k_proj": 31.30263328552246, "geo/layer_27/stable_rank_o_proj": 118.9805679321289, "geo/layer_27/stable_rank_gate_proj": 88.78569793701172, "geo/layer_27/stable_rank_down_proj": 135.43531799316406, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08484764397144318, "geo/layer_27/attn_entropy_mean": 4.326980113983154, "geo/layer_27/attn_entropy_std": 0.6131030321121216, "attnres/final_alpha/block_0": 0.2427729070186615, "attnres/block_norm/0": 1.6789653301239014, "attnres/final_alpha/block_1": 0.006055775564163923, "attnres/block_norm/1": 35086.42578125, "attnres/final_alpha/block_2": 0.012742774561047554, "attnres/block_norm/2": 24108.669921875, "attnres/final_alpha/block_3": 0.014621801674365997, "attnres/block_norm/3": 38501.83984375, "attnres/final_alpha/block_4": 0.01843872293829918, "attnres/block_norm/4": 11220.6357421875, "attnres/final_alpha/block_5": 0.5795877575874329, "attnres/block_norm/5": 5575.6904296875, "attnres/final_alpha/block_6": 0.12578028440475464, "attnres/block_norm/6": 25491.126953125, "geo/tier1_time_s": 1.3571043014526367, "geo/step": 29100.0, "geo/rankme_slope": 3.375637364320729e-05} {"step": 29110, "timestamp": 1778226006.143977, "train/loss": 2.208314085006714, "train/z_loss": 0.0014679540880024433, "train/perplexity": 9.100361017664582, "train/grad_norm": 0.193359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1790736.4250764293, "perf/iters_per_sec": 0.8538896680242678, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.171111488342285, "data/tokens_consumed": 61050191872, "data/tokens_consumed_B": 61.050191872, "train/loss_slope": -1.7385735656275507e-06} {"step": 29120, "timestamp": 1778226016.497657, "train/loss": 2.1951119184494017, "train/z_loss": 0.00148084337124601, "train/perplexity": 8.98100614085595, "train/grad_norm": 0.1455078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026489.4625754252, "perf/iters_per_sec": 0.9663054764630438, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348694324493408, "data/tokens_consumed": 61071163392, "data/tokens_consumed_B": 61.071163392, "train/loss_slope": 5.01653456380365e-07} {"step": 29130, "timestamp": 1778226026.851223, "train/loss": 2.228799843788147, "train/z_loss": 0.0014746441040188075, "train/perplexity": 9.288711482407278, "train/grad_norm": 0.1650390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026514.9541693607, "perf/iters_per_sec": 0.966317631802254, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034856414794922, "data/tokens_consumed": 61092134912, "data/tokens_consumed_B": 61.092134912, "train/loss_slope": -6.741112644093564e-07} {"step": 29140, "timestamp": 1778226037.20048, "train/loss": 2.2206584930419924, "train/z_loss": 0.0014830033527687193, "train/perplexity": 9.213395825840568, "train/grad_norm": 0.1328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027266.1701825396, "perf/iters_per_sec": 0.966675839511175, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034472942352295, "data/tokens_consumed": 61113106432, "data/tokens_consumed_B": 61.113106432, "train/loss_slope": 7.72678097411039e-07} {"step": 29150, "timestamp": 1778226047.5398648, "grad/layer_0/attn": 0.0033331753220409155, "grad/layer_0/mlp": 0.0029169758781790733, "grad/layer_0/attn_mlp_ratio": 1.1426817865403387, "grad/layer_4/attn": 0.002345151035115123, "grad/layer_4/mlp": 0.0024675142485648394, "grad/layer_4/attn_mlp_ratio": 0.9504102930461638, "grad/layer_8/attn": 0.003930455539375544, "grad/layer_8/mlp": 0.0035863923840224743, "grad/layer_8/attn_mlp_ratio": 1.0959356949597359, "grad/layer_12/attn": 0.004597747232764959, "grad/layer_12/mlp": 0.005999377463012934, "grad/layer_12/attn_mlp_ratio": 0.7663707083732759, "grad/layer_16/attn": 0.004181540571153164, "grad/layer_16/mlp": 0.004514683969318867, "grad/layer_16/attn_mlp_ratio": 0.9262089012097885, "grad/layer_20/attn": 0.0037871217355132103, "grad/layer_20/mlp": 0.006044973153620958, "grad/layer_20/attn_mlp_ratio": 0.6264910656543786, "grad/layer_24/attn": 0.0183268990367651, "grad/layer_24/mlp": 0.012307891622185707, "grad/layer_24/attn_mlp_ratio": 1.4890364207323805, "grad/layer_27/attn": 0.007336313370615244, "grad/layer_27/mlp": 0.01328545156866312, "grad/layer_27/attn_mlp_ratio": 0.5522065454439666} {"step": 29150, "timestamp": 1778226047.555652, "train/loss": 2.1602003812789916, "train/z_loss": 0.0014918617205694317, "train/perplexity": 8.672875366214006, "train/grad_norm": 0.2470703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026150.2893310173, "perf/iters_per_sec": 0.9661437460570418, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035042667388916, "data/tokens_consumed": 61134077952, "data/tokens_consumed_B": 61.134077952, "train/loss_slope": -1.2709695227754948e-06} {"step": 29160, "timestamp": 1778226057.905506, "train/loss": 2.1934714794158934, "train/z_loss": 0.001487905893009156, "train/perplexity": 8.966285425341647, "train/grad_norm": 0.134765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027367.611127715, "perf/iters_per_sec": 0.9667242103231979, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034421181678772, "data/tokens_consumed": 61155049472, "data/tokens_consumed_B": 61.155049472, "train/loss_slope": -7.904054093496974e-07} {"step": 29170, "timestamp": 1778226068.255316, "train/loss": 2.195670223236084, "train/z_loss": 0.0014671689830720424, "train/perplexity": 8.986021679542961, "train/grad_norm": 0.1806640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027263.1331850085, "perf/iters_per_sec": 0.9666743913579028, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034474492073059, "data/tokens_consumed": 61176020992, "data/tokens_consumed_B": 61.176020992, "train/loss_slope": 3.991566461161953e-06} {"step": 29175, "timestamp": 1778226074.0318935, "eos/sharpness": 20.41826248168945, "eos/L0_probe": 2.0116257667541504, "eos/L_plus": 2.1192171573638916, "eos/L_minus": 2.1082170009613037, "eos/grad_norm": 0.13457562029361725, "eos/embed_grad_frac": 0.14166106283664703, "eos/time_s": 0.6127643585205078} {"step": 29175, "timestamp": 1778226075.4091399, "geo/rankme_last": 439.8918151855469, "geo/layer_0/stable_rank_q_proj": 18.479520797729492, "geo/layer_0/stable_rank_k_proj": 15.978560447692871, "geo/layer_0/stable_rank_o_proj": 50.67122268676758, "geo/layer_0/stable_rank_gate_proj": 144.40643310546875, "geo/layer_0/stable_rank_down_proj": 51.68175506591797, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05466078966856003, "geo/layer_0/attn_entropy_mean": 6.226268768310547, "geo/layer_0/attn_entropy_std": 0.33783355355262756, "geo/layer_7/stable_rank_q_proj": 42.176448822021484, "geo/layer_7/stable_rank_k_proj": 42.027713775634766, "geo/layer_7/stable_rank_o_proj": 105.92316436767578, "geo/layer_7/stable_rank_gate_proj": 96.26066589355469, "geo/layer_7/stable_rank_down_proj": 147.45912170410156, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5228312015533447, "geo/layer_7/attn_entropy_mean": 4.6357808113098145, "geo/layer_7/attn_entropy_std": 0.8369765281677246, "geo/layer_14/stable_rank_q_proj": 55.4539909362793, "geo/layer_14/stable_rank_k_proj": 35.4291877746582, "geo/layer_14/stable_rank_o_proj": 52.58525085449219, "geo/layer_14/stable_rank_gate_proj": 80.7490005493164, "geo/layer_14/stable_rank_down_proj": 134.76278686523438, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3772193193435669, "geo/layer_14/attn_entropy_mean": 5.48567533493042, "geo/layer_14/attn_entropy_std": 0.3916907012462616, "geo/layer_21/stable_rank_q_proj": 45.20867156982422, "geo/layer_21/stable_rank_k_proj": 31.121980667114258, "geo/layer_21/stable_rank_o_proj": 80.33885955810547, "geo/layer_21/stable_rank_gate_proj": 79.74285125732422, "geo/layer_21/stable_rank_down_proj": 58.14965057373047, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14848165214061737, "geo/layer_21/attn_entropy_mean": 5.725892543792725, "geo/layer_21/attn_entropy_std": 0.2931172251701355, "geo/layer_27/stable_rank_q_proj": 41.7504768371582, "geo/layer_27/stable_rank_k_proj": 31.316940307617188, "geo/layer_27/stable_rank_o_proj": 118.84632110595703, "geo/layer_27/stable_rank_gate_proj": 88.82518005371094, "geo/layer_27/stable_rank_down_proj": 135.47653198242188, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08372543007135391, "geo/layer_27/attn_entropy_mean": 4.322417736053467, "geo/layer_27/attn_entropy_std": 0.5955919027328491, "attnres/final_alpha/block_0": 0.23931151628494263, "attnres/block_norm/0": 1.6794712543487549, "attnres/final_alpha/block_1": 0.005864589475095272, "attnres/block_norm/1": 35107.6484375, "attnres/final_alpha/block_2": 0.01233562920242548, "attnres/block_norm/2": 24152.70703125, "attnres/final_alpha/block_3": 0.014187954366207123, "attnres/block_norm/3": 38695.17578125, "attnres/final_alpha/block_4": 0.017864642664790154, "attnres/block_norm/4": 11209.115234375, "attnres/final_alpha/block_5": 0.584055483341217, "attnres/block_norm/5": 5563.89794921875, "attnres/final_alpha/block_6": 0.12638017535209656, "attnres/block_norm/6": 25696.341796875, "geo/tier1_time_s": 1.3562018871307373, "geo/step": 29175.0, "geo/rankme_slope": 1.2498046093437375e-05} {"step": 29180, "timestamp": 1778226080.5896876, "train/loss": 2.179870939254761, "train/z_loss": 0.0014781793695874512, "train/perplexity": 8.845164621514376, "train/grad_norm": 0.15625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1700948.1355550827, "perf/iters_per_sec": 0.8110752752089895, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2329311847686768, "data/tokens_consumed": 61196992512, "data/tokens_consumed_B": 61.196992512, "train/loss_slope": 1.6543349739026742e-06} {"step": 29190, "timestamp": 1778226090.9459784, "train/loss": 2.229602646827698, "train/z_loss": 0.0014598639332689344, "train/perplexity": 9.29617148227349, "train/grad_norm": 0.12255859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025915.5579428985, "perf/iters_per_sec": 0.9660318174089901, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035162591934204, "data/tokens_consumed": 61217964032, "data/tokens_consumed_B": 61.217964032, "train/loss_slope": 2.4333296328595563e-06} {"step": 29200, "timestamp": 1778226101.2845058, "grad/layer_0/attn": 0.0030546458438038826, "grad/layer_0/mlp": 0.0028736728709191084, "grad/layer_0/attn_mlp_ratio": 1.0629761544602245, "grad/layer_4/attn": 0.0020288233645260334, "grad/layer_4/mlp": 0.002448789542540908, "grad/layer_4/attn_mlp_ratio": 0.8285004678559854, "grad/layer_8/attn": 0.005844741594046354, "grad/layer_8/mlp": 0.0038916063494980335, "grad/layer_8/attn_mlp_ratio": 1.5018840342399602, "grad/layer_12/attn": 0.005330909509211779, "grad/layer_12/mlp": 0.007681956514716148, "grad/layer_12/attn_mlp_ratio": 0.693952045888866, "grad/layer_16/attn": 0.005267421714961529, "grad/layer_16/mlp": 0.005391822662204504, "grad/layer_16/attn_mlp_ratio": 0.976927830767176, "grad/layer_20/attn": 0.004637441597878933, "grad/layer_20/mlp": 0.0066954209469258785, "grad/layer_20/attn_mlp_ratio": 0.6926288227994502, "grad/layer_24/attn": 0.019622420892119408, "grad/layer_24/mlp": 0.012841667979955673, "grad/layer_24/attn_mlp_ratio": 1.528027415904612, "grad/layer_27/attn": 0.010880090296268463, "grad/layer_27/mlp": 0.010462401434779167, "grad/layer_27/attn_mlp_ratio": 1.0399228379928656} {"step": 29200, "timestamp": 1778226101.3002706, "train/loss": 2.1924111604690553, "train/z_loss": 0.00147866765037179, "train/perplexity": 8.956783341532395, "train/grad_norm": 0.2001953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026223.9866051334, "perf/iters_per_sec": 0.9661788876557986, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035005021095276, "data/tokens_consumed": 61238935552, "data/tokens_consumed_B": 61.238935552, "train/loss_slope": 2.8023157826494787e-06} {"step": 29210, "timestamp": 1778226111.6546197, "train/loss": 2.1637996673583983, "train/z_loss": 0.0014846758567728102, "train/perplexity": 8.70414777119321, "train/grad_norm": 0.1650390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026404.0750766937, "perf/iters_per_sec": 0.9662647605308026, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349130392074586, "data/tokens_consumed": 61259907072, "data/tokens_consumed_B": 61.259907072, "train/loss_slope": 1.667237689535901e-06} {"step": 29220, "timestamp": 1778226122.020242, "train/loss": 2.1891384601593016, "train/z_loss": 0.0014801020151935518, "train/perplexity": 8.927518387748172, "train/grad_norm": 0.19140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024346.7286796956, "perf/iters_per_sec": 0.9652837413214186, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0359648227691651, "data/tokens_consumed": 61280878592, "data/tokens_consumed_B": 61.280878592, "train/loss_slope": -3.015080250815285e-06} {"step": 29230, "timestamp": 1778226132.3697488, "train/loss": 2.195604395866394, "train/z_loss": 0.0014703803462907672, "train/perplexity": 8.9854301728407, "train/grad_norm": 0.232421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027196.788740373, "perf/iters_per_sec": 0.9666427558614602, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345083475112915, "data/tokens_consumed": 61301850112, "data/tokens_consumed_B": 61.301850112, "train/loss_slope": -3.407876469371451e-06} {"step": 29240, "timestamp": 1778226142.7250235, "train/loss": 2.1828057765960693, "train/z_loss": 0.0014746354660019278, "train/perplexity": 8.871161871124974, "train/grad_norm": 0.2333984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026621.3628211764, "perf/iters_per_sec": 0.9663683714013941, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348020792007446, "data/tokens_consumed": 61322821632, "data/tokens_consumed_B": 61.322821632, "train/loss_slope": -2.718788648273985e-06} {"step": 29250, "timestamp": 1778226153.065167, "grad/layer_0/attn": 0.0028075992595404387, "grad/layer_0/mlp": 0.0027813345659524202, "grad/layer_0/attn_mlp_ratio": 1.0094431619141468, "grad/layer_4/attn": 0.0021934728138148785, "grad/layer_4/mlp": 0.002477649599313736, "grad/layer_4/attn_mlp_ratio": 0.8853038484102215, "grad/layer_8/attn": 0.004119206219911575, "grad/layer_8/mlp": 0.0037654542829841375, "grad/layer_8/attn_mlp_ratio": 1.0939466531651587, "grad/layer_12/attn": 0.00529889203608036, "grad/layer_12/mlp": 0.005914062727242708, "grad/layer_12/attn_mlp_ratio": 0.8959816949646515, "grad/layer_16/attn": 0.00423469441011548, "grad/layer_16/mlp": 0.004107717890292406, "grad/layer_16/attn_mlp_ratio": 1.0309116692341471, "grad/layer_20/attn": 0.0054227798245847225, "grad/layer_20/mlp": 0.005953439511358738, "grad/layer_20/attn_mlp_ratio": 0.910865008899804, "grad/layer_24/attn": 0.012338036671280861, "grad/layer_24/mlp": 0.011160722933709621, "grad/layer_24/attn_mlp_ratio": 1.1054872192433507, "grad/layer_27/attn": 0.0054457359947264194, "grad/layer_27/mlp": 0.010841610841453075, "grad/layer_27/attn_mlp_ratio": 0.5022995221037272} {"step": 29250, "timestamp": 1778226153.7943206, "eos/sharpness": 49.6436595916748, "eos/L0_probe": 2.0163867473602295, "eos/L_plus": 2.247823715209961, "eos/L_minus": 2.281386375427246, "eos/grad_norm": 0.16904690861701965, "eos/embed_grad_frac": 0.07911591976881027, "eos/time_s": 0.7261667251586914} {"step": 29250, "timestamp": 1778226153.8142145, "train/loss": 2.2272971510887145, "train/z_loss": 0.0014643727685324849, "train/perplexity": 9.274763885576068, "train/grad_norm": 0.1689453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1892079.3573502977, "perf/iters_per_sec": 0.902213743853711, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1083847999572753, "data/tokens_consumed": 61343793152, "data/tokens_consumed_B": 61.343793152, "train/loss_slope": 1.1007153805476125e-06} {"step": 29250, "timestamp": 1778226155.186493, "geo/rankme_last": 439.59405517578125, "geo/layer_0/stable_rank_q_proj": 18.49765396118164, "geo/layer_0/stable_rank_k_proj": 15.998261451721191, "geo/layer_0/stable_rank_o_proj": 50.68512725830078, "geo/layer_0/stable_rank_gate_proj": 144.6462860107422, "geo/layer_0/stable_rank_down_proj": 51.722103118896484, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.057860180735588074, "geo/layer_0/attn_entropy_mean": 6.228184700012207, "geo/layer_0/attn_entropy_std": 0.33975401520729065, "geo/layer_7/stable_rank_q_proj": 42.139808654785156, "geo/layer_7/stable_rank_k_proj": 41.97917175292969, "geo/layer_7/stable_rank_o_proj": 105.82921600341797, "geo/layer_7/stable_rank_gate_proj": 96.31883239746094, "geo/layer_7/stable_rank_down_proj": 146.8250732421875, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5247195363044739, "geo/layer_7/attn_entropy_mean": 4.667379379272461, "geo/layer_7/attn_entropy_std": 0.8225244879722595, "geo/layer_14/stable_rank_q_proj": 55.336490631103516, "geo/layer_14/stable_rank_k_proj": 35.554466247558594, "geo/layer_14/stable_rank_o_proj": 52.72193145751953, "geo/layer_14/stable_rank_gate_proj": 80.65791320800781, "geo/layer_14/stable_rank_down_proj": 134.7263946533203, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3749328851699829, "geo/layer_14/attn_entropy_mean": 5.469052314758301, "geo/layer_14/attn_entropy_std": 0.4182586669921875, "geo/layer_21/stable_rank_q_proj": 45.045631408691406, "geo/layer_21/stable_rank_k_proj": 31.089595794677734, "geo/layer_21/stable_rank_o_proj": 80.42561340332031, "geo/layer_21/stable_rank_gate_proj": 79.60401153564453, "geo/layer_21/stable_rank_down_proj": 57.9538688659668, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14675264060497284, "geo/layer_21/attn_entropy_mean": 5.7196173667907715, "geo/layer_21/attn_entropy_std": 0.29160869121551514, "geo/layer_27/stable_rank_q_proj": 41.65465545654297, "geo/layer_27/stable_rank_k_proj": 31.350460052490234, "geo/layer_27/stable_rank_o_proj": 118.64665222167969, "geo/layer_27/stable_rank_gate_proj": 89.00003814697266, "geo/layer_27/stable_rank_down_proj": 135.29913330078125, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08508096635341644, "geo/layer_27/attn_entropy_mean": 4.317647933959961, "geo/layer_27/attn_entropy_std": 0.6025261878967285, "attnres/final_alpha/block_0": 0.24229232966899872, "attnres/block_norm/0": 1.6798176765441895, "attnres/final_alpha/block_1": 0.005968429613858461, "attnres/block_norm/1": 35048.51171875, "attnres/final_alpha/block_2": 0.01265023835003376, "attnres/block_norm/2": 24156.9140625, "attnres/final_alpha/block_3": 0.014491645619273186, "attnres/block_norm/3": 38618.859375, "attnres/final_alpha/block_4": 0.018538406118750572, "attnres/block_norm/4": 11221.78515625, "attnres/final_alpha/block_5": 0.5796167254447937, "attnres/block_norm/5": 5587.3798828125, "attnres/final_alpha/block_6": 0.12644225358963013, "attnres/block_norm/6": 25577.3984375, "geo/tier1_time_s": 1.358640432357788, "geo/step": 29250.0, "geo/rankme_slope": -6.418817527010806e-06} {"step": 29260, "timestamp": 1778226165.5403442, "train/loss": 2.1645633220672607, "train/z_loss": 0.0014776274445466697, "train/perplexity": 8.710797273263951, "train/grad_norm": 0.18359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1790510.606293334, "perf/iters_per_sec": 0.8537819892374677, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1712591886520385, "data/tokens_consumed": 61364764672, "data/tokens_consumed_B": 61.364764672, "train/loss_slope": -8.863095081213842e-07} {"step": 29270, "timestamp": 1778226175.894074, "train/loss": 2.142635774612427, "train/z_loss": 0.0014979569823481143, "train/perplexity": 8.521869780869677, "train/grad_norm": 0.142578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026550.1579010545, "perf/iters_per_sec": 0.9663344182496331, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348384380340576, "data/tokens_consumed": 61385736192, "data/tokens_consumed_B": 61.385736192, "train/loss_slope": -3.5096200493195313e-06} {"step": 29280, "timestamp": 1778226186.258891, "train/loss": 2.15858051776886, "train/z_loss": 0.0014754872536286712, "train/perplexity": 8.658837864368348, "train/grad_norm": 0.25, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024569.6332515704, "perf/iters_per_sec": 0.9653900305040218, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0358507633209229, "data/tokens_consumed": 61406707712, "data/tokens_consumed_B": 61.406707712, "train/loss_slope": 3.7466106277453157e-07} {"step": 29290, "timestamp": 1778226196.613893, "train/loss": 2.166130304336548, "train/z_loss": 0.0014873390784487129, "train/perplexity": 8.724457638122097, "train/grad_norm": 0.0986328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026238.7360384562, "perf/iters_per_sec": 0.9661859207336694, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349974870681762, "data/tokens_consumed": 61427679232, "data/tokens_consumed_B": 61.427679232, "train/loss_slope": -1.3272381959074308e-06} {"step": 29300, "timestamp": 1778226206.9554524, "grad/layer_0/attn": 0.002759487833827734, "grad/layer_0/mlp": 0.0026348948013037443, "grad/layer_0/attn_mlp_ratio": 1.0472857313824324, "grad/layer_4/attn": 0.0031365707982331514, "grad/layer_4/mlp": 0.0024980513844639063, "grad/layer_4/attn_mlp_ratio": 1.2556069471507607, "grad/layer_8/attn": 0.003882698016241193, "grad/layer_8/mlp": 0.003776740515604615, "grad/layer_8/attn_mlp_ratio": 1.0280552495977051, "grad/layer_12/attn": 0.004380758851766586, "grad/layer_12/mlp": 0.006692391354590654, "grad/layer_12/attn_mlp_ratio": 0.6545879573080859, "grad/layer_16/attn": 0.0052514527924358845, "grad/layer_16/mlp": 0.004891383461654186, "grad/layer_16/attn_mlp_ratio": 1.0736129616995174, "grad/layer_20/attn": 0.0033788850996643305, "grad/layer_20/mlp": 0.005921030882745981, "grad/layer_20/attn_mlp_ratio": 0.5706582366331943, "grad/layer_24/attn": 0.014531923457980156, "grad/layer_24/mlp": 0.011560004204511642, "grad/layer_24/attn_mlp_ratio": 1.2570863362316078, "grad/layer_27/attn": 0.004421079531311989, "grad/layer_27/mlp": 0.01111427042633295, "grad/layer_27/attn_mlp_ratio": 0.3977840489699402} {"step": 29300, "timestamp": 1778226206.97121, "train/loss": 2.182999348640442, "train/z_loss": 0.0014790331479161978, "train/perplexity": 8.872879246276876, "train/grad_norm": 0.2080078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025646.033937779, "perf/iters_per_sec": 0.9659032983483214, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035300326347351, "data/tokens_consumed": 61448650752, "data/tokens_consumed_B": 61.448650752, "train/loss_slope": -2.5232145173249755e-06} {"step": 29310, "timestamp": 1778226217.320412, "train/loss": 2.1403090000152587, "train/z_loss": 0.00147997330641374, "train/perplexity": 8.502064361052378, "train/grad_norm": 0.1357421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027493.7839989145, "perf/iters_per_sec": 0.966784374236543, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343568086624146, "data/tokens_consumed": 61469622272, "data/tokens_consumed_B": 61.469622272, "train/loss_slope": -2.9547094118476977e-06} {"step": 29320, "timestamp": 1778226227.670824, "train/loss": 2.1819932222366334, "train/z_loss": 0.0014710712130181491, "train/perplexity": 8.863956497648587, "train/grad_norm": 0.265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027407.3772526837, "perf/iters_per_sec": 0.9667431722892207, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344008922576904, "data/tokens_consumed": 61490593792, "data/tokens_consumed_B": 61.490593792, "train/loss_slope": -4.168923016321228e-06} {"step": 29325, "timestamp": 1778226233.4484036, "eos/sharpness": 21.22688293457031, "eos/L0_probe": 2.0142581462860107, "eos/L_plus": 2.1122512817382812, "eos/L_minus": 2.1285338401794434, "eos/grad_norm": 0.09809063374996185, "eos/embed_grad_frac": 0.22198261320590973, "eos/time_s": 0.6080873012542725} {"step": 29325, "timestamp": 1778226234.8279386, "geo/rankme_last": 440.48980712890625, "geo/layer_0/stable_rank_q_proj": 18.524208068847656, "geo/layer_0/stable_rank_k_proj": 15.977679252624512, "geo/layer_0/stable_rank_o_proj": 50.66617202758789, "geo/layer_0/stable_rank_gate_proj": 144.8411407470703, "geo/layer_0/stable_rank_down_proj": 51.80399703979492, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.058051060885190964, "geo/layer_0/attn_entropy_mean": 6.2232561111450195, "geo/layer_0/attn_entropy_std": 0.33757004141807556, "geo/layer_7/stable_rank_q_proj": 42.10777282714844, "geo/layer_7/stable_rank_k_proj": 42.069183349609375, "geo/layer_7/stable_rank_o_proj": 105.60118103027344, "geo/layer_7/stable_rank_gate_proj": 96.42863464355469, "geo/layer_7/stable_rank_down_proj": 147.50906372070312, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5253298282623291, "geo/layer_7/attn_entropy_mean": 4.645438194274902, "geo/layer_7/attn_entropy_std": 0.8325352668762207, "geo/layer_14/stable_rank_q_proj": 55.293792724609375, "geo/layer_14/stable_rank_k_proj": 35.55357360839844, "geo/layer_14/stable_rank_o_proj": 52.850154876708984, "geo/layer_14/stable_rank_gate_proj": 80.67797088623047, "geo/layer_14/stable_rank_down_proj": 134.49168395996094, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37507012486457825, "geo/layer_14/attn_entropy_mean": 5.469229698181152, "geo/layer_14/attn_entropy_std": 0.40959134697914124, "geo/layer_21/stable_rank_q_proj": 45.07734298706055, "geo/layer_21/stable_rank_k_proj": 31.208396911621094, "geo/layer_21/stable_rank_o_proj": 80.2543716430664, "geo/layer_21/stable_rank_gate_proj": 79.6645278930664, "geo/layer_21/stable_rank_down_proj": 57.95360565185547, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15108437836170197, "geo/layer_21/attn_entropy_mean": 5.73613166809082, "geo/layer_21/attn_entropy_std": 0.2802785336971283, "geo/layer_27/stable_rank_q_proj": 41.538108825683594, "geo/layer_27/stable_rank_k_proj": 31.341625213623047, "geo/layer_27/stable_rank_o_proj": 118.67412567138672, "geo/layer_27/stable_rank_gate_proj": 88.9979019165039, "geo/layer_27/stable_rank_down_proj": 135.18190002441406, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07744692265987396, "geo/layer_27/attn_entropy_mean": 4.323606491088867, "geo/layer_27/attn_entropy_std": 0.5951195359230042, "attnres/final_alpha/block_0": 0.24107837677001953, "attnres/block_norm/0": 1.6800665855407715, "attnres/final_alpha/block_1": 0.005949804559350014, "attnres/block_norm/1": 35156.734375, "attnres/final_alpha/block_2": 0.012506970204412937, "attnres/block_norm/2": 24245.78125, "attnres/final_alpha/block_3": 0.014037326909601688, "attnres/block_norm/3": 38832.328125, "attnres/final_alpha/block_4": 0.018054544925689697, "attnres/block_norm/4": 11260.326171875, "attnres/final_alpha/block_5": 0.5811759233474731, "attnres/block_norm/5": 5615.9609375, "attnres/final_alpha/block_6": 0.12719708681106567, "attnres/block_norm/6": 25717.068359375, "geo/tier1_time_s": 1.3604235649108887, "geo/step": 29325.0, "geo/rankme_slope": 5.588114151910765e-06} {"step": 29330, "timestamp": 1778226240.006218, "train/loss": 2.168222761154175, "train/z_loss": 0.001479194953572005, "train/perplexity": 8.742732301791598, "train/grad_norm": 0.0986328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1700834.1718737404, "perf/iters_per_sec": 0.8110209330910398, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2330137968063355, "data/tokens_consumed": 61511565312, "data/tokens_consumed_B": 61.511565312, "train/loss_slope": -5.896434639439411e-06} {"step": 29340, "timestamp": 1778226250.745812, "train/loss": 2.230674386024475, "train/z_loss": 0.001468844769988209, "train/perplexity": 9.306139894445868, "train/grad_norm": 0.1865234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1953543.169739963, "perf/iters_per_sec": 0.931521973485929, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0735119819641112, "data/tokens_consumed": 61532536832, "data/tokens_consumed_B": 61.532536832, "train/loss_slope": -1.1172298550999053e-06} {"step": 29350, "timestamp": 1778226261.1056445, "grad/layer_0/attn": 0.0033858725801110268, "grad/layer_0/mlp": 0.0029229435604065657, "grad/layer_0/attn_mlp_ratio": 1.158377640313488, "grad/layer_4/attn": 0.0022522038780152798, "grad/layer_4/mlp": 0.0026309776585549116, "grad/layer_4/attn_mlp_ratio": 0.8560330358901715, "grad/layer_8/attn": 0.0040891836397349834, "grad/layer_8/mlp": 0.003913295920938253, "grad/layer_8/attn_mlp_ratio": 1.0449461573710848, "grad/layer_12/attn": 0.005282198078930378, "grad/layer_12/mlp": 0.00653931125998497, "grad/layer_12/attn_mlp_ratio": 0.8077606017130401, "grad/layer_16/attn": 0.007686768192797899, "grad/layer_16/mlp": 0.004798275884240866, "grad/layer_16/attn_mlp_ratio": 1.6019854251910068, "grad/layer_20/attn": 0.003722701920196414, "grad/layer_20/mlp": 0.005852425936609507, "grad/layer_20/attn_mlp_ratio": 0.6360955092656057, "grad/layer_24/attn": 0.007754756137728691, "grad/layer_24/mlp": 0.009277082048356533, "grad/layer_24/attn_mlp_ratio": 0.8359046533938985, "grad/layer_27/attn": 0.003824256593361497, "grad/layer_27/mlp": 0.008957776241004467, "grad/layer_27/attn_mlp_ratio": 0.4269203034078734} {"step": 29350, "timestamp": 1778226261.1226087, "train/loss": 2.1603296518325807, "train/z_loss": 0.0014761223341338337, "train/perplexity": 8.673996586082602, "train/grad_norm": 0.11376953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022046.2832690452, "perf/iters_per_sec": 0.9641868034692026, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0371434211730957, "data/tokens_consumed": 61553508352, "data/tokens_consumed_B": 61.553508352, "train/loss_slope": -2.1465055083427004e-06} {"step": 29360, "timestamp": 1778226271.4855278, "train/loss": 2.2042150020599367, "train/z_loss": 0.0014657804858870804, "train/perplexity": 9.063134232971887, "train/grad_norm": 0.314453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024915.4095026967, "perf/iters_per_sec": 0.965554909468983, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0356738805770873, "data/tokens_consumed": 61574479872, "data/tokens_consumed_B": 61.574479872, "train/loss_slope": -8.321830541780336e-07} {"step": 29370, "timestamp": 1778226281.8514469, "train/loss": 2.1702929496765138, "train/z_loss": 0.0014810325112193823, "train/perplexity": 8.76085015306992, "train/grad_norm": 0.1357421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024230.0774598105, "perf/iters_per_sec": 0.9652281176852276, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036024522781372, "data/tokens_consumed": 61595451392, "data/tokens_consumed_B": 61.595451392, "train/loss_slope": -3.161833510182793e-06} {"step": 29380, "timestamp": 1778226292.222997, "train/loss": 2.2169432163238527, "train/z_loss": 0.0014715164317749441, "train/perplexity": 9.179229019704662, "train/grad_norm": 0.2041015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023329.5113469379, "perf/iters_per_sec": 0.9647986942991914, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0364856481552125, "data/tokens_consumed": 61616422912, "data/tokens_consumed_B": 61.616422912, "train/loss_slope": -2.3705812606444646e-06} {"step": 29390, "timestamp": 1778226302.5783622, "train/loss": 2.184053921699524, "train/z_loss": 0.0014714236254803835, "train/perplexity": 8.882241281293792, "train/grad_norm": 0.0966796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026454.4943808757, "perf/iters_per_sec": 0.9662888023285273, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348872900009156, "data/tokens_consumed": 61637394432, "data/tokens_consumed_B": 61.637394432, "train/loss_slope": -2.8926735842796168e-06} {"step": 29400, "timestamp": 1778226312.9142919, "grad/layer_0/attn": 0.0028409285005182028, "grad/layer_0/mlp": 0.0027466057799756527, "grad/layer_0/attn_mlp_ratio": 1.034341519921084, "grad/layer_4/attn": 0.0016500967321917415, "grad/layer_4/mlp": 0.002366398461163044, "grad/layer_4/attn_mlp_ratio": 0.6973029646285567, "grad/layer_8/attn": 0.0034808937925845385, "grad/layer_8/mlp": 0.0035376951564103365, "grad/layer_8/attn_mlp_ratio": 0.9839439353282696, "grad/layer_12/attn": 0.004060282371938229, "grad/layer_12/mlp": 0.006011919118463993, "grad/layer_12/attn_mlp_ratio": 0.6753720774338025, "grad/layer_16/attn": 0.0037911117542535067, "grad/layer_16/mlp": 0.004442352335900068, "grad/layer_16/attn_mlp_ratio": 0.8534018425950005, "grad/layer_20/attn": 0.004764040466398001, "grad/layer_20/mlp": 0.006029084790498018, "grad/layer_20/attn_mlp_ratio": 0.7901763788242959, "grad/layer_24/attn": 0.009642972610890865, "grad/layer_24/mlp": 0.010547947138547897, "grad/layer_24/attn_mlp_ratio": 0.914203720668059, "grad/layer_27/attn": 0.008461595512926579, "grad/layer_27/mlp": 0.009470085613429546, "grad/layer_27/attn_mlp_ratio": 0.8935078064739345} {"step": 29400, "timestamp": 1778226313.523193, "eos/sharpness": 43.01605224609374, "eos/L0_probe": 2.0205790996551514, "eos/L_plus": 2.2683370113372803, "eos/L_minus": 2.20298171043396, "eos/grad_norm": 0.14586536586284637, "eos/embed_grad_frac": 0.12562201917171478, "eos/time_s": 0.6060867309570312} {"step": 29400, "timestamp": 1778226313.5439746, "train/loss": 2.230833721160889, "train/z_loss": 0.0014589290600270032, "train/perplexity": 9.307622807652388, "train/grad_norm": 0.1455078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1913338.7693842323, "perf/iters_per_sec": 0.9123510214730417, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0960693597793578, "data/tokens_consumed": 61658365952, "data/tokens_consumed_B": 61.658365952, "train/loss_slope": 1.4426111638498298e-07} {"step": 29400, "timestamp": 1778226314.9072418, "geo/rankme_last": 439.450927734375, "geo/layer_0/stable_rank_q_proj": 18.507659912109375, "geo/layer_0/stable_rank_k_proj": 15.960763931274414, "geo/layer_0/stable_rank_o_proj": 50.50295639038086, "geo/layer_0/stable_rank_gate_proj": 144.5253143310547, "geo/layer_0/stable_rank_down_proj": 51.781124114990234, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.055739376693964005, "geo/layer_0/attn_entropy_mean": 6.2239885330200195, "geo/layer_0/attn_entropy_std": 0.34032076597213745, "geo/layer_7/stable_rank_q_proj": 42.01248550415039, "geo/layer_7/stable_rank_k_proj": 42.10810089111328, "geo/layer_7/stable_rank_o_proj": 105.89464569091797, "geo/layer_7/stable_rank_gate_proj": 96.45328521728516, "geo/layer_7/stable_rank_down_proj": 147.59730529785156, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5134920477867126, "geo/layer_7/attn_entropy_mean": 4.646101474761963, "geo/layer_7/attn_entropy_std": 0.8189684152603149, "geo/layer_14/stable_rank_q_proj": 55.28247833251953, "geo/layer_14/stable_rank_k_proj": 35.682586669921875, "geo/layer_14/stable_rank_o_proj": 52.76043701171875, "geo/layer_14/stable_rank_gate_proj": 80.50190734863281, "geo/layer_14/stable_rank_down_proj": 134.7652130126953, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3906697928905487, "geo/layer_14/attn_entropy_mean": 5.46904182434082, "geo/layer_14/attn_entropy_std": 0.3959892988204956, "geo/layer_21/stable_rank_q_proj": 45.05875015258789, "geo/layer_21/stable_rank_k_proj": 31.117048263549805, "geo/layer_21/stable_rank_o_proj": 80.20087432861328, "geo/layer_21/stable_rank_gate_proj": 79.50834655761719, "geo/layer_21/stable_rank_down_proj": 57.88670349121094, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1482270210981369, "geo/layer_21/attn_entropy_mean": 5.743462085723877, "geo/layer_21/attn_entropy_std": 0.28079938888549805, "geo/layer_27/stable_rank_q_proj": 41.561256408691406, "geo/layer_27/stable_rank_k_proj": 31.220972061157227, "geo/layer_27/stable_rank_o_proj": 118.62010955810547, "geo/layer_27/stable_rank_gate_proj": 88.83348846435547, "geo/layer_27/stable_rank_down_proj": 135.33731079101562, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07987716794013977, "geo/layer_27/attn_entropy_mean": 4.294000625610352, "geo/layer_27/attn_entropy_std": 0.5788766741752625, "attnres/final_alpha/block_0": 0.2406081259250641, "attnres/block_norm/0": 1.6804461479187012, "attnres/final_alpha/block_1": 0.005815903656184673, "attnres/block_norm/1": 35149.81640625, "attnres/final_alpha/block_2": 0.01259290985763073, "attnres/block_norm/2": 24171.8828125, "attnres/final_alpha/block_3": 0.01425845455378294, "attnres/block_norm/3": 39149.4375, "attnres/final_alpha/block_4": 0.0181790292263031, "attnres/block_norm/4": 11225.54296875, "attnres/final_alpha/block_5": 0.5857058763504028, "attnres/block_norm/5": 5548.6787109375, "attnres/final_alpha/block_6": 0.12283973395824432, "attnres/block_norm/6": 25898.013671875, "geo/tier1_time_s": 1.3593692779541016, "geo/step": 29400.0, "geo/rankme_slope": -2.1882131758953582e-05} {"step": 29410, "timestamp": 1778226325.2588592, "train/loss": 2.130214262008667, "train/z_loss": 0.0014872818253934383, "train/perplexity": 8.416669990875338, "train/grad_norm": 0.13671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1790700.4068991805, "perf/iters_per_sec": 0.8538724932189848, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1711350440979005, "data/tokens_consumed": 61679337472, "data/tokens_consumed_B": 61.679337472, "train/loss_slope": -3.6590708722017608e-06} {"step": 29420, "timestamp": 1778226335.6115623, "train/loss": 2.2271618366241457, "train/z_loss": 0.0014711893862113356, "train/perplexity": 9.273508960773544, "train/grad_norm": 0.08349609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026620.242180401, "perf/iters_per_sec": 0.9663678370382314, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348026514053346, "data/tokens_consumed": 61700308992, "data/tokens_consumed_B": 61.700308992, "train/loss_slope": -8.666422143197957e-07} {"step": 29430, "timestamp": 1778226345.9664984, "train/loss": 2.2094738245010377, "train/z_loss": 0.0014661430031992495, "train/perplexity": 9.110921188089112, "train/grad_norm": 0.2236328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026241.0698292279, "perf/iters_per_sec": 0.9661870335718288, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349962949752807, "data/tokens_consumed": 61721280512, "data/tokens_consumed_B": 61.721280512, "train/loss_slope": -3.930095649144142e-07} {"step": 29440, "timestamp": 1778226356.3356473, "train/loss": 2.193525457382202, "train/z_loss": 0.0014854824054054915, "train/perplexity": 8.966769420256663, "train/grad_norm": 0.14453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023613.5959923489, "perf/iters_per_sec": 0.9649341564141983, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0363401412963866, "data/tokens_consumed": 61742252032, "data/tokens_consumed_B": 61.742252032, "train/loss_slope": -1.183715390257709e-06} {"step": 29450, "timestamp": 1778226366.6819608, "grad/layer_0/attn": 0.002476999070495367, "grad/layer_0/mlp": 0.0026622849982231855, "grad/layer_0/attn_mlp_ratio": 0.930403386229567, "grad/layer_4/attn": 0.0023782423231750727, "grad/layer_4/mlp": 0.0025148873683065176, "grad/layer_4/attn_mlp_ratio": 0.945665503187123, "grad/layer_8/attn": 0.003335352288559079, "grad/layer_8/mlp": 0.003606474259868264, "grad/layer_8/attn_mlp_ratio": 0.9248235134218205, "grad/layer_12/attn": 0.005310139153152704, "grad/layer_12/mlp": 0.006165825761854649, "grad/layer_12/attn_mlp_ratio": 0.8612210711308416, "grad/layer_16/attn": 0.0036838853266090155, "grad/layer_16/mlp": 0.004437573719769716, "grad/layer_16/attn_mlp_ratio": 0.8301575311709829, "grad/layer_20/attn": 0.0041048722341656685, "grad/layer_20/mlp": 0.0051012407056987286, "grad/layer_20/attn_mlp_ratio": 0.804681133574406, "grad/layer_24/attn": 0.004006868693977594, "grad/layer_24/mlp": 0.00795779936015606, "grad/layer_24/attn_mlp_ratio": 0.5035146605590657, "grad/layer_27/attn": 0.006478835828602314, "grad/layer_27/mlp": 0.00713047431781888, "grad/layer_27/attn_mlp_ratio": 0.9086121692564891} {"step": 29450, "timestamp": 1778226366.697836, "train/loss": 2.162717509269714, "train/z_loss": 0.0014827282517217099, "train/perplexity": 8.694733602005895, "train/grad_norm": 0.09619140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024832.9978654536, "perf/iters_per_sec": 0.9655156125380772, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0357160329818726, "data/tokens_consumed": 61763223552, "data/tokens_consumed_B": 61.763223552, "train/loss_slope": -3.937368889381438e-06} {"step": 29460, "timestamp": 1778226377.0503955, "train/loss": 2.1797950983047487, "train/z_loss": 0.001472254318650812, "train/perplexity": 8.844493821263853, "train/grad_norm": 0.248046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026625.8920902705, "perf/iters_per_sec": 0.9663705311251977, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347997665405273, "data/tokens_consumed": 61784195072, "data/tokens_consumed_B": 61.784195072, "train/loss_slope": -2.262058402552728e-06} {"step": 29470, "timestamp": 1778226387.399759, "train/loss": 2.165776014328003, "train/z_loss": 0.0014906407799571753, "train/perplexity": 8.72136719743939, "train/grad_norm": 0.126953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027456.8651261493, "perf/iters_per_sec": 0.9667667699461695, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343756437301637, "data/tokens_consumed": 61805166592, "data/tokens_consumed_B": 61.805166592, "train/loss_slope": -2.7280665502653794e-06} {"step": 29475, "timestamp": 1778226393.1673427, "eos/sharpness": 39.6601915359497, "eos/L0_probe": 2.0250682830810547, "eos/L_plus": 2.2485315799713135, "eos/L_minus": 2.198206901550293, "eos/grad_norm": 0.17873455584049225, "eos/embed_grad_frac": 0.08246432989835739, "eos/time_s": 0.6056089401245117} {"step": 29475, "timestamp": 1778226394.5491111, "geo/rankme_last": 439.9303894042969, "geo/layer_0/stable_rank_q_proj": 18.504091262817383, "geo/layer_0/stable_rank_k_proj": 15.995499610900879, "geo/layer_0/stable_rank_o_proj": 50.60029220581055, "geo/layer_0/stable_rank_gate_proj": 144.19631958007812, "geo/layer_0/stable_rank_down_proj": 51.719669342041016, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.057230401784181595, "geo/layer_0/attn_entropy_mean": 6.223996162414551, "geo/layer_0/attn_entropy_std": 0.33992618322372437, "geo/layer_7/stable_rank_q_proj": 42.00341033935547, "geo/layer_7/stable_rank_k_proj": 42.27090072631836, "geo/layer_7/stable_rank_o_proj": 105.81866455078125, "geo/layer_7/stable_rank_gate_proj": 96.2939224243164, "geo/layer_7/stable_rank_down_proj": 147.82003784179688, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5261188745498657, "geo/layer_7/attn_entropy_mean": 4.660394668579102, "geo/layer_7/attn_entropy_std": 0.8154792785644531, "geo/layer_14/stable_rank_q_proj": 55.17851638793945, "geo/layer_14/stable_rank_k_proj": 35.65623092651367, "geo/layer_14/stable_rank_o_proj": 52.71215057373047, "geo/layer_14/stable_rank_gate_proj": 80.52764129638672, "geo/layer_14/stable_rank_down_proj": 134.54098510742188, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3565131723880768, "geo/layer_14/attn_entropy_mean": 5.476312637329102, "geo/layer_14/attn_entropy_std": 0.4109560549259186, "geo/layer_21/stable_rank_q_proj": 45.08686447143555, "geo/layer_21/stable_rank_k_proj": 31.19362449645996, "geo/layer_21/stable_rank_o_proj": 80.36129760742188, "geo/layer_21/stable_rank_gate_proj": 79.40351867675781, "geo/layer_21/stable_rank_down_proj": 57.87030029296875, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15189559757709503, "geo/layer_21/attn_entropy_mean": 5.721057891845703, "geo/layer_21/attn_entropy_std": 0.2861933708190918, "geo/layer_27/stable_rank_q_proj": 41.60069274902344, "geo/layer_27/stable_rank_k_proj": 31.16334342956543, "geo/layer_27/stable_rank_o_proj": 118.82125091552734, "geo/layer_27/stable_rank_gate_proj": 88.83271026611328, "geo/layer_27/stable_rank_down_proj": 135.04994201660156, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07473060488700867, "geo/layer_27/attn_entropy_mean": 4.323614120483398, "geo/layer_27/attn_entropy_std": 0.6112021207809448, "attnres/final_alpha/block_0": 0.23942649364471436, "attnres/block_norm/0": 1.6810736656188965, "attnres/final_alpha/block_1": 0.0059526944532990456, "attnres/block_norm/1": 35281.546875, "attnres/final_alpha/block_2": 0.0121763925999403, "attnres/block_norm/2": 24267.26171875, "attnres/final_alpha/block_3": 0.014039207249879837, "attnres/block_norm/3": 39195.65625, "attnres/final_alpha/block_4": 0.017916753888130188, "attnres/block_norm/4": 11215.0556640625, "attnres/final_alpha/block_5": 0.5870805978775024, "attnres/block_norm/5": 5554.873046875, "attnres/final_alpha/block_6": 0.12340783327817917, "attnres/block_norm/6": 25948.265625, "geo/tier1_time_s": 1.3625526428222656, "geo/step": 29475.0, "geo/rankme_slope": -2.774537549394758e-05} {"step": 29480, "timestamp": 1778226399.726401, "train/loss": 2.2340198516845704, "train/z_loss": 0.001469303760677576, "train/perplexity": 9.337325401826755, "train/grad_norm": 0.11328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1701886.342322343, "perf/iters_per_sec": 0.8115226470576968, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2322515010833741, "data/tokens_consumed": 61826138112, "data/tokens_consumed_B": 61.826138112, "train/loss_slope": 6.361131442050169e-08} {"step": 29490, "timestamp": 1778226410.0806615, "train/loss": 2.206102442741394, "train/z_loss": 0.0014615598833188415, "train/perplexity": 9.080256514787257, "train/grad_norm": 0.08447265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026469.900779821, "perf/iters_per_sec": 0.9662961486720185, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348794221878053, "data/tokens_consumed": 61847109632, "data/tokens_consumed_B": 61.847109632, "train/loss_slope": 1.120865863613538e-06} {"step": 29500, "timestamp": 1778226420.426308, "grad/layer_0/attn": 0.002638942562043667, "grad/layer_0/mlp": 0.002668078290298581, "grad/layer_0/attn_mlp_ratio": 0.9890798454944745, "grad/layer_4/attn": 0.0016532555455341935, "grad/layer_4/mlp": 0.0024506354238837957, "grad/layer_4/attn_mlp_ratio": 0.6746231862803062, "grad/layer_8/attn": 0.011767568066716194, "grad/layer_8/mlp": 0.003648258512839675, "grad/layer_8/attn_mlp_ratio": 3.2255301269765897, "grad/layer_12/attn": 0.005240003578364849, "grad/layer_12/mlp": 0.006965241394937038, "grad/layer_12/attn_mlp_ratio": 0.7523075233175696, "grad/layer_16/attn": 0.0038400269113481045, "grad/layer_16/mlp": 0.00423726812005043, "grad/layer_16/attn_mlp_ratio": 0.9062506105177365, "grad/layer_20/attn": 0.0058426340110599995, "grad/layer_20/mlp": 0.0056790998205542564, "grad/layer_20/attn_mlp_ratio": 1.0287957762309954, "grad/layer_24/attn": 0.007589534390717745, "grad/layer_24/mlp": 0.008766347542405128, "grad/layer_24/attn_mlp_ratio": 0.865757861803834, "grad/layer_27/attn": 0.006372697651386261, "grad/layer_27/mlp": 0.00831026304513216, "grad/layer_27/attn_mlp_ratio": 0.7668466738167188} {"step": 29500, "timestamp": 1778226420.4420197, "train/loss": 2.2104143142700194, "train/z_loss": 0.0014737981953658163, "train/perplexity": 9.119493946916595, "train/grad_norm": 0.11669921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025085.8476079158, "perf/iters_per_sec": 0.9656361806907252, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0355867147445679, "data/tokens_consumed": 61868081152, "data/tokens_consumed_B": 61.868081152, "train/loss_slope": 1.1716511097178023e-06} {"step": 29500, "timestamp": 1778226427.4594262, "geo/ww_alpha_mean": 7.815125188142741, "geo/ww_alpha_std": 4.576302432788054, "geo/ww_alpha_min": 1.3635357011182145, "geo/ww_alpha_max": 29.828944764764742, "geo/ww_alpha_healthy_frac": 0.16243654822335024, "geo/ww_alpha_by_type/q_proj": 4.125925970117175, "geo/ww_alpha_by_type/k_proj": 4.510079716378159, "geo/ww_alpha_by_type/v_proj": 8.390949929770978, "geo/ww_alpha_by_type/o_proj": 7.1896673943409155, "geo/ww_alpha_by_type/gate_proj": 8.931015035339593, "geo/ww_alpha_by_type/up_proj": 12.496928863183626, "geo/ww_alpha_by_type/down_proj": 9.18082230694164, "geo/twonn_id/layer_0": 0.7045556902885437, "geo/twonn_id/layer_7": 3.269803524017334, "geo/twonn_id/layer_14": 4.669790744781494, "geo/twonn_id/layer_21": 6.606216907501221, "geo/twonn_id/layer_27": 5.958815574645996, "geo/tier2_time_s": 7.008760929107666} {"step": 29500, "timestamp": 1778226428.0888917, "eoc/jacobian_sigma/layer_0/attn": 1055.422607421875, "eoc/jacobian_sigma/layer_0/mlp": 6540.59619140625, "eoc/jacobian_sigma/layer_0": 6540.59619140625, "eoc/jacobian_sigma/layer_7/attn": 1.1598149538040161, "eoc/jacobian_sigma/layer_7/mlp": 1.6899924278259277, "eoc/jacobian_sigma/layer_7": 1.6899924278259277, "eoc/jacobian_sigma/layer_14/attn": 1.6457889080047607, "eoc/jacobian_sigma/layer_14/mlp": 5.309127330780029, "eoc/jacobian_sigma/layer_14": 5.309127330780029, "eoc/jacobian_sigma/layer_21/attn": 1.092499852180481, "eoc/jacobian_sigma/layer_21/mlp": 4.0841240882873535, "eoc/jacobian_sigma/layer_21": 4.0841240882873535, "eoc/jacobian_sigma/layer_27/attn": 3.5617706775665283, "eoc/jacobian_sigma/layer_27/mlp": 26.61821174621582, "eoc/jacobian_sigma/layer_27": 26.61821174621582, "eoc/layer0_sigma": 6540.59619140625, "eoc/sigma_max": 26.61821174621582, "eoc/sigma_min": 1.6899924278259277, "eoc/sigma_mean": 9.425363898277283, "eoc/time_s": 0.623694896697998} {"step": 29510, "timestamp": 1778226438.4623432, "train/loss": 2.164226770401001, "train/z_loss": 0.0014657379942946136, "train/perplexity": 8.707866133194887, "train/grad_norm": 0.1123046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1164250.2735989653, "perf/iters_per_sec": 0.5551577919001414, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.801289677619934, "data/tokens_consumed": 61889052672, "data/tokens_consumed_B": 61.889052672, "train/loss_slope": 1.4628438308652308e-06} {"step": 29520, "timestamp": 1778226449.2092683, "train/loss": 2.2226644277572634, "train/z_loss": 0.0014760866295546293, "train/perplexity": 9.231895845085248, "train/grad_norm": 0.1494140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1952478.3909936915, "perf/iters_per_sec": 0.9310142474144418, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0740974187850951, "data/tokens_consumed": 61910024192, "data/tokens_consumed_B": 61.910024192, "train/loss_slope": 4.470069215993791e-06} {"step": 29530, "timestamp": 1778226460.0492523, "train/loss": 2.2326941967010496, "train/z_loss": 0.0014682508306577801, "train/perplexity": 9.324955530777197, "train/grad_norm": 0.154296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1935612.8202247308, "perf/iters_per_sec": 0.9229721165774969, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0834563493728637, "data/tokens_consumed": 61930995712, "data/tokens_consumed_B": 61.930995712, "train/loss_slope": 7.04945957843089e-06} {"step": 29540, "timestamp": 1778226470.3957038, "train/loss": 2.1873283982276917, "train/z_loss": 0.0014782599289901554, "train/perplexity": 8.911373642473121, "train/grad_norm": 0.2294921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027814.0999662126, "perf/iters_per_sec": 0.9669371127921165, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341934204101562, "data/tokens_consumed": 61951967232, "data/tokens_consumed_B": 61.951967232, "train/loss_slope": 6.39053165036358e-06} {"step": 29550, "timestamp": 1778226480.732017, "grad/layer_0/attn": 0.0025928854010999203, "grad/layer_0/mlp": 0.0026885857805609703, "grad/layer_0/attn_mlp_ratio": 0.9644049014193729, "grad/layer_4/attn": 0.0018625340890139341, "grad/layer_4/mlp": 0.0025430216919630766, "grad/layer_4/attn_mlp_ratio": 0.7324098027395028, "grad/layer_8/attn": 0.003915614448487759, "grad/layer_8/mlp": 0.003807293949648738, "grad/layer_8/attn_mlp_ratio": 1.0284507572639456, "grad/layer_12/attn": 0.004457938950508833, "grad/layer_12/mlp": 0.006119541823863983, "grad/layer_12/attn_mlp_ratio": 0.7284759228667911, "grad/layer_16/attn": 0.0045953490771353245, "grad/layer_16/mlp": 0.004448347724974155, "grad/layer_16/attn_mlp_ratio": 1.0330462585088034, "grad/layer_20/attn": 0.006032322999089956, "grad/layer_20/mlp": 0.005870656576007605, "grad/layer_20/attn_mlp_ratio": 1.0275380305823463, "grad/layer_24/attn": 0.007071890868246555, "grad/layer_24/mlp": 0.008270629681646824, "grad/layer_24/attn_mlp_ratio": 0.8550607456689255, "grad/layer_27/attn": 0.004444985184818506, "grad/layer_27/mlp": 0.0069626811891794205, "grad/layer_27/attn_mlp_ratio": 0.638401357207945} {"step": 29550, "timestamp": 1778226481.3549707, "eos/sharpness": 40.7984972000122, "eos/L0_probe": 2.0205280780792236, "eos/L_plus": 2.278160572052002, "eos/L_minus": 2.1708805561065674, "eos/grad_norm": 0.11754655092954636, "eos/embed_grad_frac": 0.18651075661182404, "eos/time_s": 0.620213508605957} {"step": 29550, "timestamp": 1778226481.3732522, "train/loss": 2.2175057888031007, "train/z_loss": 0.0014598604873754084, "train/perplexity": 9.184394454161264, "train/grad_norm": 0.11767578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1911342.715184851, "perf/iters_per_sec": 0.9113992286609893, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0972140073776244, "data/tokens_consumed": 61972938752, "data/tokens_consumed_B": 61.972938752, "train/loss_slope": 5.8853379034116734e-06} {"step": 29550, "timestamp": 1778226482.7361932, "geo/rankme_last": 439.5668029785156, "geo/layer_0/stable_rank_q_proj": 18.478736877441406, "geo/layer_0/stable_rank_k_proj": 15.974523544311523, "geo/layer_0/stable_rank_o_proj": 50.53811264038086, "geo/layer_0/stable_rank_gate_proj": 144.09861755371094, "geo/layer_0/stable_rank_down_proj": 51.7647590637207, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.052547600120306015, "geo/layer_0/attn_entropy_mean": 6.228785037994385, "geo/layer_0/attn_entropy_std": 0.3400057554244995, "geo/layer_7/stable_rank_q_proj": 41.96567916870117, "geo/layer_7/stable_rank_k_proj": 42.15581512451172, "geo/layer_7/stable_rank_o_proj": 105.73722076416016, "geo/layer_7/stable_rank_gate_proj": 96.2212905883789, "geo/layer_7/stable_rank_down_proj": 147.82705688476562, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5205604434013367, "geo/layer_7/attn_entropy_mean": 4.656816482543945, "geo/layer_7/attn_entropy_std": 0.8331054449081421, "geo/layer_14/stable_rank_q_proj": 55.10234832763672, "geo/layer_14/stable_rank_k_proj": 35.70651626586914, "geo/layer_14/stable_rank_o_proj": 52.67521667480469, "geo/layer_14/stable_rank_gate_proj": 80.34941101074219, "geo/layer_14/stable_rank_down_proj": 134.39291381835938, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3688136637210846, "geo/layer_14/attn_entropy_mean": 5.501563549041748, "geo/layer_14/attn_entropy_std": 0.39772793650627136, "geo/layer_21/stable_rank_q_proj": 45.117469787597656, "geo/layer_21/stable_rank_k_proj": 31.083786010742188, "geo/layer_21/stable_rank_o_proj": 80.4004898071289, "geo/layer_21/stable_rank_gate_proj": 79.3231201171875, "geo/layer_21/stable_rank_down_proj": 57.869895935058594, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1483682096004486, "geo/layer_21/attn_entropy_mean": 5.732090950012207, "geo/layer_21/attn_entropy_std": 0.2846880555152893, "geo/layer_27/stable_rank_q_proj": 41.59022903442383, "geo/layer_27/stable_rank_k_proj": 31.051393508911133, "geo/layer_27/stable_rank_o_proj": 118.5473861694336, "geo/layer_27/stable_rank_gate_proj": 88.6535873413086, "geo/layer_27/stable_rank_down_proj": 135.32090759277344, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08092872053384781, "geo/layer_27/attn_entropy_mean": 4.314212799072266, "geo/layer_27/attn_entropy_std": 0.607464075088501, "attnres/final_alpha/block_0": 0.23957255482673645, "attnres/block_norm/0": 1.6812503337860107, "attnres/final_alpha/block_1": 0.005881068296730518, "attnres/block_norm/1": 35345.6796875, "attnres/final_alpha/block_2": 0.012212575413286686, "attnres/block_norm/2": 24215.853515625, "attnres/final_alpha/block_3": 0.014083517715334892, "attnres/block_norm/3": 39056.7265625, "attnres/final_alpha/block_4": 0.018261101096868515, "attnres/block_norm/4": 11274.32421875, "attnres/final_alpha/block_5": 0.5870310664176941, "attnres/block_norm/5": 5547.46484375, "attnres/final_alpha/block_6": 0.12295807898044586, "attnres/block_norm/6": 25960.8984375, "geo/tier1_time_s": 1.3588969707489014, "geo/step": 29550.0, "geo/rankme_slope": -5.6054804734393755e-05} {"step": 29560, "timestamp": 1778226493.0908248, "train/loss": 2.1714157104492187, "train/z_loss": 0.001490261557046324, "train/perplexity": 8.77069201595224, "train/grad_norm": 0.2216796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1790343.3657871361, "perf/iters_per_sec": 0.8537022427497559, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1713685989379883, "data/tokens_consumed": 61993910272, "data/tokens_consumed_B": 61.993910272, "train/loss_slope": 6.112786838681486e-06} {"step": 29570, "timestamp": 1778226503.446513, "train/loss": 2.2310736179351807, "train/z_loss": 0.0014553020359016956, "train/perplexity": 9.309855944190684, "train/grad_norm": 0.1259765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026482.179380665, "perf/iters_per_sec": 0.9663020035651517, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348731517791747, "data/tokens_consumed": 62014881792, "data/tokens_consumed_B": 62.014881792, "train/loss_slope": 9.209127344600198e-06} {"step": 29580, "timestamp": 1778226514.3383126, "train/loss": 2.1772974252700807, "train/z_loss": 0.0014815549715422095, "train/perplexity": 8.82243073220231, "train/grad_norm": 0.10107421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1926415.9428392267, "perf/iters_per_sec": 0.9185867037006505, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0886288642883302, "data/tokens_consumed": 62035853312, "data/tokens_consumed_B": 62.035853312, "train/loss_slope": 7.774132084209906e-06} {"step": 29590, "timestamp": 1778226525.0755956, "train/loss": 2.219015645980835, "train/z_loss": 0.001462486549280584, "train/perplexity": 9.198272052010022, "train/grad_norm": 0.28125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1954177.0809195377, "perf/iters_per_sec": 0.9318242458913506, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0731637477874756, "data/tokens_consumed": 62056824832, "data/tokens_consumed_B": 62.056824832, "train/loss_slope": 1.122524316268681e-05} {"step": 29600, "timestamp": 1778226535.413888, "grad/layer_0/attn": 0.0024636148009449244, "grad/layer_0/mlp": 0.0026040789671242237, "grad/layer_0/attn_mlp_ratio": 0.9460599073382134, "grad/layer_4/attn": 0.0019444265635684133, "grad/layer_4/mlp": 0.002457613591104746, "grad/layer_4/attn_mlp_ratio": 0.7911847865293895, "grad/layer_8/attn": 0.006855836603790522, "grad/layer_8/mlp": 0.0036938507109880447, "grad/layer_8/attn_mlp_ratio": 1.856013400269595, "grad/layer_12/attn": 0.004185431636869907, "grad/layer_12/mlp": 0.006623280234634876, "grad/layer_12/attn_mlp_ratio": 0.6319272966573956, "grad/layer_16/attn": 0.003803456900641322, "grad/layer_16/mlp": 0.004408002831041813, "grad/layer_16/attn_mlp_ratio": 0.8628526251325317, "grad/layer_20/attn": 0.004154224414378405, "grad/layer_20/mlp": 0.006112441886216402, "grad/layer_20/attn_mlp_ratio": 0.6796341664667263, "grad/layer_24/attn": 0.008451447822153568, "grad/layer_24/mlp": 0.009940489195287228, "grad/layer_24/attn_mlp_ratio": 0.8502044085657221, "grad/layer_27/attn": 0.004064954351633787, "grad/layer_27/mlp": 0.009184146299958229, "grad/layer_27/attn_mlp_ratio": 0.44260556992561395} {"step": 29600, "timestamp": 1778226535.4296837, "train/loss": 2.152582883834839, "train/z_loss": 0.0014780758880078792, "train/perplexity": 8.607060749862075, "train/grad_norm": 0.1337890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026547.263118345, "perf/iters_per_sec": 0.9663330379096723, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034839916229248, "data/tokens_consumed": 62077796352, "data/tokens_consumed_B": 62.077796352, "train/loss_slope": 1.1504981033515215e-05} {"step": 29610, "timestamp": 1778226545.7814536, "train/loss": 2.1871687173843384, "train/z_loss": 0.0014675067621283233, "train/perplexity": 8.909950780419384, "train/grad_norm": 0.1435546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026987.505147897, "perf/iters_per_sec": 0.9665429616679654, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034615159034729, "data/tokens_consumed": 62098767872, "data/tokens_consumed_B": 62.098767872, "train/loss_slope": 1.1229226991932165e-05} {"step": 29620, "timestamp": 1778226556.1313515, "train/loss": 2.212850308418274, "train/z_loss": 0.0014658061205409466, "train/perplexity": 9.14173606063691, "train/grad_norm": 0.28515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027265.0021054894, "perf/iters_per_sec": 0.9666752825286338, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344735383987427, "data/tokens_consumed": 62119739392, "data/tokens_consumed_B": 62.119739392, "train/loss_slope": 1.0948751690221076e-05} {"step": 29625, "timestamp": 1778226561.904658, "eos/sharpness": 26.134681701660153, "eos/L0_probe": 2.022540330886841, "eos/L_plus": 2.1767334938049316, "eos/L_minus": 2.1296939849853516, "eos/grad_norm": 0.1146729364991188, "eos/embed_grad_frac": 0.17212368547916412, "eos/time_s": 0.6104786396026611} {"step": 29625, "timestamp": 1778226563.2829735, "geo/rankme_last": 439.8016052246094, "geo/layer_0/stable_rank_q_proj": 18.469480514526367, "geo/layer_0/stable_rank_k_proj": 16.004655838012695, "geo/layer_0/stable_rank_o_proj": 50.56252670288086, "geo/layer_0/stable_rank_gate_proj": 144.00497436523438, "geo/layer_0/stable_rank_down_proj": 51.75338363647461, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05301222577691078, "geo/layer_0/attn_entropy_mean": 6.227085113525391, "geo/layer_0/attn_entropy_std": 0.341096431016922, "geo/layer_7/stable_rank_q_proj": 41.958927154541016, "geo/layer_7/stable_rank_k_proj": 42.16034698486328, "geo/layer_7/stable_rank_o_proj": 105.77014923095703, "geo/layer_7/stable_rank_gate_proj": 96.1773452758789, "geo/layer_7/stable_rank_down_proj": 147.65309143066406, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5180953145027161, "geo/layer_7/attn_entropy_mean": 4.652971267700195, "geo/layer_7/attn_entropy_std": 0.8174768686294556, "geo/layer_14/stable_rank_q_proj": 55.21242141723633, "geo/layer_14/stable_rank_k_proj": 35.771507263183594, "geo/layer_14/stable_rank_o_proj": 52.57111740112305, "geo/layer_14/stable_rank_gate_proj": 80.31822204589844, "geo/layer_14/stable_rank_down_proj": 134.09693908691406, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.36675459146499634, "geo/layer_14/attn_entropy_mean": 5.489079475402832, "geo/layer_14/attn_entropy_std": 0.39102059602737427, "geo/layer_21/stable_rank_q_proj": 45.12525177001953, "geo/layer_21/stable_rank_k_proj": 31.08452606201172, "geo/layer_21/stable_rank_o_proj": 80.3276138305664, "geo/layer_21/stable_rank_gate_proj": 79.34020233154297, "geo/layer_21/stable_rank_down_proj": 57.920005798339844, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15357211232185364, "geo/layer_21/attn_entropy_mean": 5.740326881408691, "geo/layer_21/attn_entropy_std": 0.27936968207359314, "geo/layer_27/stable_rank_q_proj": 41.579833984375, "geo/layer_27/stable_rank_k_proj": 31.026508331298828, "geo/layer_27/stable_rank_o_proj": 118.93659973144531, "geo/layer_27/stable_rank_gate_proj": 88.70187377929688, "geo/layer_27/stable_rank_down_proj": 135.84701538085938, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08368641138076782, "geo/layer_27/attn_entropy_mean": 4.331246376037598, "geo/layer_27/attn_entropy_std": 0.6124786734580994, "attnres/final_alpha/block_0": 0.23956099152565002, "attnres/block_norm/0": 1.6815688610076904, "attnres/final_alpha/block_1": 0.005947730969637632, "attnres/block_norm/1": 35339.4453125, "attnres/final_alpha/block_2": 0.012587904930114746, "attnres/block_norm/2": 24109.404296875, "attnres/final_alpha/block_3": 0.01419912837445736, "attnres/block_norm/3": 39242.28515625, "attnres/final_alpha/block_4": 0.018008053302764893, "attnres/block_norm/4": 11291.615234375, "attnres/final_alpha/block_5": 0.5864672660827637, "attnres/block_norm/5": 5565.3349609375, "attnres/final_alpha/block_6": 0.12322891503572464, "attnres/block_norm/6": 25998.66015625, "geo/tier1_time_s": 1.3591113090515137, "geo/step": 29625.0, "geo/rankme_slope": -7.052981348789516e-05} {"step": 29630, "timestamp": 1778226568.4602613, "train/loss": 2.199418377876282, "train/z_loss": 0.0014749802532605827, "train/perplexity": 9.01976587813025, "train/grad_norm": 0.09765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1701656.8948901172, "perf/iters_per_sec": 0.8114132379961573, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2324176549911499, "data/tokens_consumed": 62140710912, "data/tokens_consumed_B": 62.140710912, "train/loss_slope": 1.3767788862988964e-05} {"step": 29640, "timestamp": 1778226578.809801, "train/loss": 2.191783595085144, "train/z_loss": 0.0014816667069680988, "train/perplexity": 8.951164137749366, "train/grad_norm": 0.1455078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027303.4559092205, "perf/iters_per_sec": 0.966693618731127, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344539165496827, "data/tokens_consumed": 62161682432, "data/tokens_consumed_B": 62.161682432, "train/loss_slope": 1.4234503568059306e-05} {"step": 29650, "timestamp": 1778226589.149392, "grad/layer_0/attn": 0.002650425536558032, "grad/layer_0/mlp": 0.002803522627800703, "grad/layer_0/attn_mlp_ratio": 0.9453911360430554, "grad/layer_4/attn": 0.0039061878342181444, "grad/layer_4/mlp": 0.0026011820882558823, "grad/layer_4/attn_mlp_ratio": 1.5016971328860593, "grad/layer_8/attn": 0.0044283801689744, "grad/layer_8/mlp": 0.003818741999566555, "grad/layer_8/attn_mlp_ratio": 1.1596436872437763, "grad/layer_12/attn": 0.006221597548574209, "grad/layer_12/mlp": 0.006750467699021101, "grad/layer_12/attn_mlp_ratio": 0.9216542814227506, "grad/layer_16/attn": 0.004589956719428301, "grad/layer_16/mlp": 0.004518219735473394, "grad/layer_16/attn_mlp_ratio": 1.0158772451467915, "grad/layer_20/attn": 0.010118338279426098, "grad/layer_20/mlp": 0.0057905372232198715, "grad/layer_20/attn_mlp_ratio": 1.7473919456234022, "grad/layer_24/attn": 0.011629986576735973, "grad/layer_24/mlp": 0.01038381364196539, "grad/layer_24/attn_mlp_ratio": 1.1200110928159537, "grad/layer_27/attn": 0.0050240508280694485, "grad/layer_27/mlp": 0.00907167885452509, "grad/layer_27/attn_mlp_ratio": 0.5538170886838293} {"step": 29650, "timestamp": 1778226589.1650872, "train/loss": 2.223057174682617, "train/z_loss": 0.0014623236376792193, "train/perplexity": 9.235522355897462, "train/grad_norm": 0.13671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026221.092754173, "perf/iters_per_sec": 0.9661775077601304, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350064992904664, "data/tokens_consumed": 62182653952, "data/tokens_consumed_B": 62.182653952, "train/loss_slope": 1.8386676645550754e-05} {"step": 29660, "timestamp": 1778226599.5232263, "train/loss": 2.176753067970276, "train/z_loss": 0.001472466520499438, "train/perplexity": 8.817629484546877, "train/grad_norm": 0.1279296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025643.188385772, "perf/iters_per_sec": 0.9659019414833889, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0353017807006837, "data/tokens_consumed": 62203625472, "data/tokens_consumed_B": 62.203625472, "train/loss_slope": 1.8550186753809486e-05} {"step": 29670, "timestamp": 1778226609.87648, "train/loss": 2.17879079580307, "train/z_loss": 0.001473914587404579, "train/perplexity": 8.835615732882482, "train/grad_norm": 0.1533203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026916.8349811728, "perf/iters_per_sec": 0.9665092635064949, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034651231765747, "data/tokens_consumed": 62224596992, "data/tokens_consumed_B": 62.224596992, "train/loss_slope": 1.697901494861397e-05} {"step": 29680, "timestamp": 1778226620.2295759, "train/loss": 2.162339997291565, "train/z_loss": 0.0014700653613545001, "train/perplexity": 8.691451855412623, "train/grad_norm": 0.244140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026508.0909848276, "perf/iters_per_sec": 0.9663143591808451, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348599195480346, "data/tokens_consumed": 62245568512, "data/tokens_consumed_B": 62.245568512, "train/loss_slope": 1.625860945583996e-05} {"step": 29690, "timestamp": 1778226630.579347, "train/loss": 2.2161262035369873, "train/z_loss": 0.0014693561708554626, "train/perplexity": 9.171732535000926, "train/grad_norm": 0.1396484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027332.6126209847, "perf/iters_per_sec": 0.9667075217347072, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344390392303466, "data/tokens_consumed": 62266540032, "data/tokens_consumed_B": 62.266540032, "train/loss_slope": 1.8705858842338675e-05} {"step": 29700, "timestamp": 1778226641.479454, "grad/layer_0/attn": 0.0028194112237542868, "grad/layer_0/mlp": 0.0029331303667277098, "grad/layer_0/attn_mlp_ratio": 0.9612293949200659, "grad/layer_4/attn": 0.0022855279967188835, "grad/layer_4/mlp": 0.0025657054502516985, "grad/layer_4/attn_mlp_ratio": 0.8907990227072903, "grad/layer_8/attn": 0.003663032315671444, "grad/layer_8/mlp": 0.0037242916878312826, "grad/layer_8/attn_mlp_ratio": 0.9835513768389477, "grad/layer_12/attn": 0.005146097857505083, "grad/layer_12/mlp": 0.006649816408753395, "grad/layer_12/attn_mlp_ratio": 0.7738706550370357, "grad/layer_16/attn": 0.004721559584140778, "grad/layer_16/mlp": 0.004868158604949713, "grad/layer_16/attn_mlp_ratio": 0.9698861253927733, "grad/layer_20/attn": 0.0038701596204191446, "grad/layer_20/mlp": 0.0069679636508226395, "grad/layer_20/attn_mlp_ratio": 0.555421893514046, "grad/layer_24/attn": 0.018267720937728882, "grad/layer_24/mlp": 0.013052739202976227, "grad/layer_24/attn_mlp_ratio": 1.3995315859532687, "grad/layer_27/attn": 0.005160406697541475, "grad/layer_27/mlp": 0.012938430532813072, "grad/layer_27/attn_mlp_ratio": 0.3988433252835317} {"step": 29700, "timestamp": 1778226642.090652, "eos/sharpness": 54.65071201324462, "eos/L0_probe": 2.0225577354431152, "eos/L_plus": 2.2609024047851562, "eos/L_minus": 2.3307201862335205, "eos/grad_norm": 0.20010049641132355, "eos/embed_grad_frac": 0.06470554322004318, "eos/time_s": 0.608090877532959} {"step": 29700, "timestamp": 1778226642.110346, "train/loss": 2.1456223011016844, "train/z_loss": 0.0014671103446744383, "train/perplexity": 8.547358613299568, "train/grad_norm": 0.2001953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1819328.0008326296, "perf/iters_per_sec": 0.8675231937564037, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1527069330215454, "data/tokens_consumed": 62287511552, "data/tokens_consumed_B": 62.287511552, "train/loss_slope": 1.2456203093587374e-05} {"step": 29700, "timestamp": 1778226643.4692533, "geo/rankme_last": 440.2172546386719, "geo/layer_0/stable_rank_q_proj": 18.474334716796875, "geo/layer_0/stable_rank_k_proj": 16.033178329467773, "geo/layer_0/stable_rank_o_proj": 50.679744720458984, "geo/layer_0/stable_rank_gate_proj": 143.80540466308594, "geo/layer_0/stable_rank_down_proj": 51.804298400878906, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05449628084897995, "geo/layer_0/attn_entropy_mean": 6.226357460021973, "geo/layer_0/attn_entropy_std": 0.3398192524909973, "geo/layer_7/stable_rank_q_proj": 41.98836898803711, "geo/layer_7/stable_rank_k_proj": 42.142616271972656, "geo/layer_7/stable_rank_o_proj": 105.9151382446289, "geo/layer_7/stable_rank_gate_proj": 96.07777404785156, "geo/layer_7/stable_rank_down_proj": 147.43536376953125, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.527972400188446, "geo/layer_7/attn_entropy_mean": 4.6454386711120605, "geo/layer_7/attn_entropy_std": 0.8444457054138184, "geo/layer_14/stable_rank_q_proj": 55.19307327270508, "geo/layer_14/stable_rank_k_proj": 35.72947311401367, "geo/layer_14/stable_rank_o_proj": 52.55414581298828, "geo/layer_14/stable_rank_gate_proj": 80.24825286865234, "geo/layer_14/stable_rank_down_proj": 133.92196655273438, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3663370609283447, "geo/layer_14/attn_entropy_mean": 5.513932228088379, "geo/layer_14/attn_entropy_std": 0.4133259356021881, "geo/layer_21/stable_rank_q_proj": 44.97706604003906, "geo/layer_21/stable_rank_k_proj": 31.07628059387207, "geo/layer_21/stable_rank_o_proj": 80.50920104980469, "geo/layer_21/stable_rank_gate_proj": 79.32913970947266, "geo/layer_21/stable_rank_down_proj": 58.03022003173828, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15213769674301147, "geo/layer_21/attn_entropy_mean": 5.721292018890381, "geo/layer_21/attn_entropy_std": 0.2888922691345215, "geo/layer_27/stable_rank_q_proj": 41.572235107421875, "geo/layer_27/stable_rank_k_proj": 31.07712745666504, "geo/layer_27/stable_rank_o_proj": 118.72795867919922, "geo/layer_27/stable_rank_gate_proj": 88.68035888671875, "geo/layer_27/stable_rank_down_proj": 135.45257568359375, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08697953820228577, "geo/layer_27/attn_entropy_mean": 4.31502628326416, "geo/layer_27/attn_entropy_std": 0.6180062890052795, "attnres/final_alpha/block_0": 0.24091795086860657, "attnres/block_norm/0": 1.6819655895233154, "attnres/final_alpha/block_1": 0.005962420254945755, "attnres/block_norm/1": 35446.53125, "attnres/final_alpha/block_2": 0.012547231279313564, "attnres/block_norm/2": 24250.33203125, "attnres/final_alpha/block_3": 0.014205786399543285, "attnres/block_norm/3": 39181.140625, "attnres/final_alpha/block_4": 0.018337760120630264, "attnres/block_norm/4": 11251.873046875, "attnres/final_alpha/block_5": 0.5810003280639648, "attnres/block_norm/5": 5562.81591796875, "attnres/final_alpha/block_6": 0.12702849507331848, "attnres/block_norm/6": 25776.041015625, "geo/tier1_time_s": 1.3552038669586182, "geo/step": 29700.0, "geo/rankme_slope": -6.016881361919768e-05} {"step": 29710, "timestamp": 1778226653.8253257, "train/loss": 2.1382225275039675, "train/z_loss": 0.001490765914786607, "train/perplexity": 8.484343530915067, "train/grad_norm": 0.11865234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1790709.8487711037, "perf/iters_per_sec": 0.8538769954543608, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1711288690567017, "data/tokens_consumed": 62308483072, "data/tokens_consumed_B": 62.308483072, "train/loss_slope": 1.1258575009970617e-05} {"step": 29720, "timestamp": 1778226664.1717572, "train/loss": 2.203115153312683, "train/z_loss": 0.0014619730412960053, "train/perplexity": 9.053171635818845, "train/grad_norm": 0.1826171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027864.168705633, "perf/iters_per_sec": 0.9669609874275364, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341678857803345, "data/tokens_consumed": 62329454592, "data/tokens_consumed_B": 62.329454592, "train/loss_slope": 1.3157460798989716e-05} {"step": 29730, "timestamp": 1778226674.5168078, "train/loss": 2.2003805160522463, "train/z_loss": 0.001459728879854083, "train/perplexity": 9.028448315402256, "train/grad_norm": 0.08984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028215.7012275257, "perf/iters_per_sec": 0.9671286111962918, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033988642692566, "data/tokens_consumed": 62350426112, "data/tokens_consumed_B": 62.350426112, "train/loss_slope": 1.0960834576423308e-05} {"step": 29740, "timestamp": 1778226684.8762307, "train/loss": 2.225520300865173, "train/z_loss": 0.0014663079055026174, "train/perplexity": 9.258298651751888, "train/grad_norm": 0.13671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025944.954719826, "perf/iters_per_sec": 0.9660458348845606, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351475715637206, "data/tokens_consumed": 62371397632, "data/tokens_consumed_B": 62.371397632, "train/loss_slope": 1.571769385304921e-05} {"step": 29750, "timestamp": 1778226695.215291, "grad/layer_0/attn": 0.0028738200198858976, "grad/layer_0/mlp": 0.002807818353176117, "grad/layer_0/attn_mlp_ratio": 1.0235063512155218, "grad/layer_4/attn": 0.0020196805708110332, "grad/layer_4/mlp": 0.002445199992507696, "grad/layer_4/attn_mlp_ratio": 0.8259776273522599, "grad/layer_8/attn": 0.0038188647013157606, "grad/layer_8/mlp": 0.0036777853965759277, "grad/layer_8/attn_mlp_ratio": 1.0383598241037113, "grad/layer_12/attn": 0.003913309425115585, "grad/layer_12/mlp": 0.005882845725864172, "grad/layer_12/attn_mlp_ratio": 0.6652068643224612, "grad/layer_16/attn": 0.0041541955433785915, "grad/layer_16/mlp": 0.004618166014552116, "grad/layer_16/attn_mlp_ratio": 0.8995335898136003, "grad/layer_20/attn": 0.0036722745280712843, "grad/layer_20/mlp": 0.00575275020673871, "grad/layer_20/attn_mlp_ratio": 0.6383511072555368, "grad/layer_24/attn": 0.014397737570106983, "grad/layer_24/mlp": 0.010607927106320858, "grad/layer_24/attn_mlp_ratio": 1.357262101264036, "grad/layer_27/attn": 0.004380602855235338, "grad/layer_27/mlp": 0.009216568432748318, "grad/layer_27/attn_mlp_ratio": 0.47529650972269966} {"step": 29750, "timestamp": 1778226695.2308803, "train/loss": 2.2664083003997804, "train/z_loss": 0.0014625206124037505, "train/perplexity": 9.644697672974328, "train/grad_norm": 0.158203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026367.8961036233, "perf/iters_per_sec": 0.966247509052097, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034931516647339, "data/tokens_consumed": 62392369152, "data/tokens_consumed_B": 62.392369152, "train/loss_slope": 1.9303767673253184e-05} {"step": 29760, "timestamp": 1778226705.576971, "train/loss": 2.2277953147888185, "train/z_loss": 0.0014688228722661734, "train/perplexity": 9.279385387307524, "train/grad_norm": 0.2138671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027974.272529101, "perf/iters_per_sec": 0.9670134890218263, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034111738204956, "data/tokens_consumed": 62413340672, "data/tokens_consumed_B": 62.413340672, "train/loss_slope": 2.3575075889470625e-05} {"step": 29770, "timestamp": 1778226715.931807, "train/loss": 2.1576722860336304, "train/z_loss": 0.0014645771472714842, "train/perplexity": 8.650977203221023, "train/grad_norm": 0.2099609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026590.1721157273, "perf/iters_per_sec": 0.9663534985140454, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348180055618286, "data/tokens_consumed": 62434312192, "data/tokens_consumed_B": 62.434312192, "train/loss_slope": 1.977710609424587e-05} {"step": 29775, "timestamp": 1778226721.7037165, "eos/sharpness": 18.66989135742187, "eos/L0_probe": 2.023668050765991, "eos/L_plus": 2.119781494140625, "eos/L_minus": 2.114253520965576, "eos/grad_norm": 0.10850393772125244, "eos/embed_grad_frac": 0.20628702640533447, "eos/time_s": 0.6060631275177002} {"step": 29775, "timestamp": 1778226723.082848, "geo/rankme_last": 440.0489501953125, "geo/layer_0/stable_rank_q_proj": 18.48733139038086, "geo/layer_0/stable_rank_k_proj": 16.062131881713867, "geo/layer_0/stable_rank_o_proj": 50.722877502441406, "geo/layer_0/stable_rank_gate_proj": 143.1129913330078, "geo/layer_0/stable_rank_down_proj": 51.771549224853516, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0534764863550663, "geo/layer_0/attn_entropy_mean": 6.221638202667236, "geo/layer_0/attn_entropy_std": 0.3396945893764496, "geo/layer_7/stable_rank_q_proj": 41.98964309692383, "geo/layer_7/stable_rank_k_proj": 42.19342041015625, "geo/layer_7/stable_rank_o_proj": 105.84388732910156, "geo/layer_7/stable_rank_gate_proj": 96.18816375732422, "geo/layer_7/stable_rank_down_proj": 147.5249786376953, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5343163013458252, "geo/layer_7/attn_entropy_mean": 4.662578582763672, "geo/layer_7/attn_entropy_std": 0.8282003998756409, "geo/layer_14/stable_rank_q_proj": 55.30413818359375, "geo/layer_14/stable_rank_k_proj": 35.8184700012207, "geo/layer_14/stable_rank_o_proj": 52.4406623840332, "geo/layer_14/stable_rank_gate_proj": 80.26461791992188, "geo/layer_14/stable_rank_down_proj": 133.9837188720703, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3794252276420593, "geo/layer_14/attn_entropy_mean": 5.514975547790527, "geo/layer_14/attn_entropy_std": 0.3862176537513733, "geo/layer_21/stable_rank_q_proj": 45.08305358886719, "geo/layer_21/stable_rank_k_proj": 31.022932052612305, "geo/layer_21/stable_rank_o_proj": 80.49353790283203, "geo/layer_21/stable_rank_gate_proj": 79.28618621826172, "geo/layer_21/stable_rank_down_proj": 58.01081085205078, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15125153958797455, "geo/layer_21/attn_entropy_mean": 5.731939315795898, "geo/layer_21/attn_entropy_std": 0.28908759355545044, "geo/layer_27/stable_rank_q_proj": 41.602928161621094, "geo/layer_27/stable_rank_k_proj": 31.021251678466797, "geo/layer_27/stable_rank_o_proj": 118.74613952636719, "geo/layer_27/stable_rank_gate_proj": 88.79265594482422, "geo/layer_27/stable_rank_down_proj": 135.15557861328125, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08108685910701752, "geo/layer_27/attn_entropy_mean": 4.3015241622924805, "geo/layer_27/attn_entropy_std": 0.6152500510215759, "attnres/final_alpha/block_0": 0.23963090777397156, "attnres/block_norm/0": 1.682198166847229, "attnres/final_alpha/block_1": 0.005861228797584772, "attnres/block_norm/1": 35449.33203125, "attnres/final_alpha/block_2": 0.012530416250228882, "attnres/block_norm/2": 24249.38671875, "attnres/final_alpha/block_3": 0.014189922250807285, "attnres/block_norm/3": 39559.36328125, "attnres/final_alpha/block_4": 0.018139507621526718, "attnres/block_norm/4": 11315.5234375, "attnres/final_alpha/block_5": 0.585423469543457, "attnres/block_norm/5": 5571.8154296875, "attnres/final_alpha/block_6": 0.12422455847263336, "attnres/block_norm/6": 25924.8984375, "geo/tier1_time_s": 1.3597867488861084, "geo/step": 29775.0, "geo/rankme_slope": -2.764246323529412e-05} {"step": 29780, "timestamp": 1778226728.2595718, "train/loss": 2.236468029022217, "train/z_loss": 0.0014671909739263356, "train/perplexity": 9.360212835085994, "train/grad_norm": 0.146484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1702066.7764293335, "perf/iters_per_sec": 0.8116086847445171, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2321208715438843, "data/tokens_consumed": 62455283712, "data/tokens_consumed_B": 62.455283712, "train/loss_slope": 1.948022695049806e-05} {"step": 29790, "timestamp": 1778226738.610948, "train/loss": 2.247429132461548, "train/z_loss": 0.001461028354242444, "train/perplexity": 9.46337545136214, "train/grad_norm": 0.150390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026876.200698993, "perf/iters_per_sec": 0.9664898875708546, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346719741821289, "data/tokens_consumed": 62476255232, "data/tokens_consumed_B": 62.476255232, "train/loss_slope": 1.9104194540967427e-05} {"step": 29800, "timestamp": 1778226748.95278, "grad/layer_0/attn": 0.0030464339070022106, "grad/layer_0/mlp": 0.0028824356850236654, "grad/layer_0/attn_mlp_ratio": 1.0568956723444227, "grad/layer_4/attn": 0.0030287038534879684, "grad/layer_4/mlp": 0.0025546723045408726, "grad/layer_4/attn_mlp_ratio": 1.1855546911238073, "grad/layer_8/attn": 0.009900476783514023, "grad/layer_8/mlp": 0.003713612910360098, "grad/layer_8/attn_mlp_ratio": 2.665995825600041, "grad/layer_12/attn": 0.003950071986764669, "grad/layer_12/mlp": 0.006174015812575817, "grad/layer_12/attn_mlp_ratio": 0.6397897321124151, "grad/layer_16/attn": 0.0037998363841325045, "grad/layer_16/mlp": 0.004294241778552532, "grad/layer_16/attn_mlp_ratio": 0.8848678047481858, "grad/layer_20/attn": 0.0035074353218078613, "grad/layer_20/mlp": 0.005375596694648266, "grad/layer_20/attn_mlp_ratio": 0.6524736612127843, "grad/layer_24/attn": 0.004998598713427782, "grad/layer_24/mlp": 0.007619578391313553, "grad/layer_24/attn_mlp_ratio": 0.6560203716158668, "grad/layer_27/attn": 0.00508037069812417, "grad/layer_27/mlp": 0.006366724148392677, "grad/layer_27/attn_mlp_ratio": 0.7979567670779435} {"step": 29800, "timestamp": 1778226748.9685957, "train/loss": 2.208378720283508, "train/z_loss": 0.0014653863618150353, "train/perplexity": 9.100949241027672, "train/grad_norm": 0.0947265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026169.8449565708, "perf/iters_per_sec": 0.9661530709059576, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350326776504517, "data/tokens_consumed": 62497226752, "data/tokens_consumed_B": 62.497226752, "train/loss_slope": 1.607696039341173e-05} {"step": 29810, "timestamp": 1778226759.3175302, "train/loss": 2.1753986358642576, "train/z_loss": 0.0014840202289633452, "train/perplexity": 8.805694688334137, "train/grad_norm": 0.11962890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027472.941027, "perf/iters_per_sec": 0.9667744355330468, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343674421310425, "data/tokens_consumed": 62518198272, "data/tokens_consumed_B": 62.518198272, "train/loss_slope": 1.341660739493515e-05} {"step": 29820, "timestamp": 1778226769.663477, "train/loss": 2.1689982652664184, "train/z_loss": 0.001481819769833237, "train/perplexity": 8.749514956292183, "train/grad_norm": 0.134765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027927.3775918654, "perf/iters_per_sec": 0.9669911277732207, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03413565158844, "data/tokens_consumed": 62539169792, "data/tokens_consumed_B": 62.539169792, "train/loss_slope": 1.0885241000887607e-05} {"step": 29830, "timestamp": 1778226780.01957, "train/loss": 2.20512170791626, "train/z_loss": 0.001478762470651418, "train/perplexity": 9.071355556455401, "train/grad_norm": 0.234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026505.102946223, "perf/iters_per_sec": 0.9663129343730082, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034861445426941, "data/tokens_consumed": 62560141312, "data/tokens_consumed_B": 62.560141312, "train/loss_slope": 1.2581499145798492e-05} {"step": 29840, "timestamp": 1778226790.3682675, "train/loss": 2.243282699584961, "train/z_loss": 0.001468000968452543, "train/perplexity": 9.424217439404524, "train/grad_norm": 0.1796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027433.592907395, "perf/iters_per_sec": 0.9667556728875136, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343875169754029, "data/tokens_consumed": 62581112832, "data/tokens_consumed_B": 62.581112832, "train/loss_slope": 1.638546316894792e-05} {"step": 29850, "timestamp": 1778226800.7090247, "grad/layer_0/attn": 0.002516260603442788, "grad/layer_0/mlp": 0.002677545417100191, "grad/layer_0/attn_mlp_ratio": 0.9397638947209845, "grad/layer_4/attn": 0.0028939591720700264, "grad/layer_4/mlp": 0.002585430396720767, "grad/layer_4/attn_mlp_ratio": 1.1193335793557715, "grad/layer_8/attn": 0.004249094519764185, "grad/layer_8/mlp": 0.003731408389285207, "grad/layer_8/attn_mlp_ratio": 1.138737431713928, "grad/layer_12/attn": 0.006113806739449501, "grad/layer_12/mlp": 0.006056889425963163, "grad/layer_12/attn_mlp_ratio": 1.0093971027938284, "grad/layer_16/attn": 0.004411251284182072, "grad/layer_16/mlp": 0.004279953893274069, "grad/layer_16/attn_mlp_ratio": 1.03067726688519, "grad/layer_20/attn": 0.0034956156741827726, "grad/layer_20/mlp": 0.005624345503747463, "grad/layer_20/attn_mlp_ratio": 0.6215150917919532, "grad/layer_24/attn": 0.011091594584286213, "grad/layer_24/mlp": 0.009978389367461205, "grad/layer_24/attn_mlp_ratio": 1.1115616022460426, "grad/layer_27/attn": 0.006360647268593311, "grad/layer_27/mlp": 0.009947169572114944, "grad/layer_27/attn_mlp_ratio": 0.6394429247974138} {"step": 29850, "timestamp": 1778226801.3144798, "eos/sharpness": 39.227533340454094, "eos/L0_probe": 2.0205719470977783, "eos/L_plus": 2.262752056121826, "eos/L_minus": 2.1706671714782715, "eos/grad_norm": 0.1700127273797989, "eos/embed_grad_frac": 0.08993794023990631, "eos/time_s": 0.6025712490081787} {"step": 29850, "timestamp": 1778226801.3353462, "train/loss": 2.1770333766937258, "train/z_loss": 0.001473084557801485, "train/perplexity": 8.82010148945763, "train/grad_norm": 0.169921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1913042.8192802838, "perf/iters_per_sec": 0.9122099014665049, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.096238923072815, "data/tokens_consumed": 62602084352, "data/tokens_consumed_B": 62.602084352, "train/loss_slope": 1.297298481326325e-05} {"step": 29850, "timestamp": 1778226802.6976385, "geo/rankme_last": 440.359375, "geo/layer_0/stable_rank_q_proj": 18.516460418701172, "geo/layer_0/stable_rank_k_proj": 16.050107955932617, "geo/layer_0/stable_rank_o_proj": 50.71625900268555, "geo/layer_0/stable_rank_gate_proj": 143.1929931640625, "geo/layer_0/stable_rank_down_proj": 51.71577072143555, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0569942407310009, "geo/layer_0/attn_entropy_mean": 6.223180770874023, "geo/layer_0/attn_entropy_std": 0.3400326669216156, "geo/layer_7/stable_rank_q_proj": 41.874088287353516, "geo/layer_7/stable_rank_k_proj": 42.0927848815918, "geo/layer_7/stable_rank_o_proj": 105.37080383300781, "geo/layer_7/stable_rank_gate_proj": 96.14984130859375, "geo/layer_7/stable_rank_down_proj": 147.26686096191406, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5313536524772644, "geo/layer_7/attn_entropy_mean": 4.647940635681152, "geo/layer_7/attn_entropy_std": 0.8078203201293945, "geo/layer_14/stable_rank_q_proj": 55.25004959106445, "geo/layer_14/stable_rank_k_proj": 35.92647171020508, "geo/layer_14/stable_rank_o_proj": 52.54416275024414, "geo/layer_14/stable_rank_gate_proj": 80.49458312988281, "geo/layer_14/stable_rank_down_proj": 134.1891326904297, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4014293849468231, "geo/layer_14/attn_entropy_mean": 5.538484573364258, "geo/layer_14/attn_entropy_std": 0.39904940128326416, "geo/layer_21/stable_rank_q_proj": 45.084442138671875, "geo/layer_21/stable_rank_k_proj": 30.99408721923828, "geo/layer_21/stable_rank_o_proj": 80.44519805908203, "geo/layer_21/stable_rank_gate_proj": 79.09432220458984, "geo/layer_21/stable_rank_down_proj": 58.022396087646484, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1501455307006836, "geo/layer_21/attn_entropy_mean": 5.729133605957031, "geo/layer_21/attn_entropy_std": 0.28388482332229614, "geo/layer_27/stable_rank_q_proj": 41.63575744628906, "geo/layer_27/stable_rank_k_proj": 31.057985305786133, "geo/layer_27/stable_rank_o_proj": 118.77478790283203, "geo/layer_27/stable_rank_gate_proj": 88.7139663696289, "geo/layer_27/stable_rank_down_proj": 135.14706420898438, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08469750732183456, "geo/layer_27/attn_entropy_mean": 4.3069634437561035, "geo/layer_27/attn_entropy_std": 0.6038535833358765, "attnres/final_alpha/block_0": 0.23886215686798096, "attnres/block_norm/0": 1.6823375225067139, "attnres/final_alpha/block_1": 0.005945492535829544, "attnres/block_norm/1": 35327.5859375, "attnres/final_alpha/block_2": 0.012205539271235466, "attnres/block_norm/2": 24348.65234375, "attnres/final_alpha/block_3": 0.013785598799586296, "attnres/block_norm/3": 39494.203125, "attnres/final_alpha/block_4": 0.01793350838124752, "attnres/block_norm/4": 11273.306640625, "attnres/final_alpha/block_5": 0.5871303677558899, "attnres/block_norm/5": 5567.55810546875, "attnres/final_alpha/block_6": 0.12413731217384338, "attnres/block_norm/6": 26128.59375, "geo/tier1_time_s": 1.3584692478179932, "geo/step": 29850.0, "geo/rankme_slope": -4.159415719412765e-05} {"step": 29860, "timestamp": 1778226813.0591352, "train/loss": 2.1715074300765993, "train/z_loss": 0.0014628589153289795, "train/perplexity": 8.771496497448622, "train/grad_norm": 0.1591796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1789357.213216684, "perf/iters_per_sec": 0.8532320085605068, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1720141649246216, "data/tokens_consumed": 62623055872, "data/tokens_consumed_B": 62.623055872, "train/loss_slope": 1.2503125615353104e-05} {"step": 29870, "timestamp": 1778226823.4181612, "train/loss": 2.2528223752975465, "train/z_loss": 0.0014664617483504117, "train/perplexity": 9.51455161190319, "train/grad_norm": 0.1162109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025736.8159074944, "perf/iters_per_sec": 0.9659465865647766, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035253930091858, "data/tokens_consumed": 62644027392, "data/tokens_consumed_B": 62.644027392, "train/loss_slope": 1.3117897378192993e-05} {"step": 29880, "timestamp": 1778226833.7753358, "train/loss": 2.19372181892395, "train/z_loss": 0.0014687763992697, "train/perplexity": 8.968530321805536, "train/grad_norm": 0.291015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025844.962501191, "perf/iters_per_sec": 0.9659981548791843, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351986646652223, "data/tokens_consumed": 62664998912, "data/tokens_consumed_B": 62.664998912, "train/loss_slope": 1.195762801950053e-05} {"step": 29890, "timestamp": 1778226844.126137, "train/loss": 2.2124171257019043, "train/z_loss": 0.001470796880312264, "train/perplexity": 9.137776876164882, "train/grad_norm": 0.1259765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027086.8160363731, "perf/iters_per_sec": 0.966590316789805, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034564471244812, "data/tokens_consumed": 62685970432, "data/tokens_consumed_B": 62.685970432, "train/loss_slope": 1.1524985611277782e-05} {"step": 29900, "timestamp": 1778226854.4783702, "grad/layer_0/attn": 0.003362998366355896, "grad/layer_0/mlp": 0.002987116575241089, "grad/layer_0/attn_mlp_ratio": 1.1258342850248626, "grad/layer_4/attn": 0.0017314115539193153, "grad/layer_4/mlp": 0.0026778816245496273, "grad/layer_4/attn_mlp_ratio": 0.6465601292418949, "grad/layer_8/attn": 0.004207280930131674, "grad/layer_8/mlp": 0.0038228053599596024, "grad/layer_8/attn_mlp_ratio": 1.10057416580548, "grad/layer_12/attn": 0.004812024999409914, "grad/layer_12/mlp": 0.006686716806143522, "grad/layer_12/attn_mlp_ratio": 0.7196394085397564, "grad/layer_16/attn": 0.007543173618614674, "grad/layer_16/mlp": 0.004269078373908997, "grad/layer_16/attn_mlp_ratio": 1.7669325276440142, "grad/layer_20/attn": 0.006248285062611103, "grad/layer_20/mlp": 0.006725749000906944, "grad/layer_20/attn_mlp_ratio": 0.9290095376541095, "grad/layer_24/attn": 0.013728058896958828, "grad/layer_24/mlp": 0.012875760905444622, "grad/layer_24/attn_mlp_ratio": 1.066193981944353, "grad/layer_27/attn": 0.00636912090703845, "grad/layer_27/mlp": 0.012024492025375366, "grad/layer_27/attn_mlp_ratio": 0.5296789952232288} {"step": 29900, "timestamp": 1778226854.4943123, "train/loss": 2.1982267618179323, "train/z_loss": 0.001468119479250163, "train/perplexity": 9.009024181527456, "train/grad_norm": 0.2109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023620.6723613334, "perf/iters_per_sec": 0.9649375306898753, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0363365173339845, "data/tokens_consumed": 62706941952, "data/tokens_consumed_B": 62.706941952, "train/loss_slope": 1.4638155500272926e-05} {"step": 29910, "timestamp": 1778226864.8738606, "train/loss": 2.205512046813965, "train/z_loss": 0.0014536118833348155, "train/perplexity": 9.074897150549997, "train/grad_norm": 0.11181640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021828.8135941587, "perf/iters_per_sec": 0.9640831058474344, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0372549772262574, "data/tokens_consumed": 62727913472, "data/tokens_consumed_B": 62.727913472, "train/loss_slope": 1.682727859787778e-05} {"step": 29920, "timestamp": 1778226875.238543, "train/loss": 2.130364751815796, "train/z_loss": 0.0014706496731378138, "train/perplexity": 8.417936709230645, "train/grad_norm": 0.1484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024552.4850032562, "perf/iters_per_sec": 0.9653818535820275, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0358595371246337, "data/tokens_consumed": 62748884992, "data/tokens_consumed_B": 62.748884992, "train/loss_slope": 1.256829262352047e-05} {"step": 29925, "timestamp": 1778226881.0124726, "eos/sharpness": 59.713840484619126, "eos/L0_probe": 2.024160146713257, "eos/L_plus": 2.2936296463012695, "eos/L_minus": 2.3518290519714355, "eos/grad_norm": 0.2546765208244324, "eos/embed_grad_frac": 0.04057205468416214, "eos/time_s": 0.6105175018310547} {"step": 29925, "timestamp": 1778226882.3904235, "geo/rankme_last": 439.9966735839844, "geo/layer_0/stable_rank_q_proj": 18.5457706451416, "geo/layer_0/stable_rank_k_proj": 16.069154739379883, "geo/layer_0/stable_rank_o_proj": 50.508872985839844, "geo/layer_0/stable_rank_gate_proj": 143.4631805419922, "geo/layer_0/stable_rank_down_proj": 51.661041259765625, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.051269419491291046, "geo/layer_0/attn_entropy_mean": 6.219822883605957, "geo/layer_0/attn_entropy_std": 0.34447619318962097, "geo/layer_7/stable_rank_q_proj": 41.890567779541016, "geo/layer_7/stable_rank_k_proj": 42.085086822509766, "geo/layer_7/stable_rank_o_proj": 105.42141723632812, "geo/layer_7/stable_rank_gate_proj": 96.26434326171875, "geo/layer_7/stable_rank_down_proj": 147.47418212890625, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5254457592964172, "geo/layer_7/attn_entropy_mean": 4.607346057891846, "geo/layer_7/attn_entropy_std": 0.8441597819328308, "geo/layer_14/stable_rank_q_proj": 55.2639274597168, "geo/layer_14/stable_rank_k_proj": 35.97995376586914, "geo/layer_14/stable_rank_o_proj": 52.54700469970703, "geo/layer_14/stable_rank_gate_proj": 80.51093292236328, "geo/layer_14/stable_rank_down_proj": 134.45765686035156, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37138333916664124, "geo/layer_14/attn_entropy_mean": 5.516017913818359, "geo/layer_14/attn_entropy_std": 0.41132915019989014, "geo/layer_21/stable_rank_q_proj": 44.91053009033203, "geo/layer_21/stable_rank_k_proj": 31.054853439331055, "geo/layer_21/stable_rank_o_proj": 80.35662841796875, "geo/layer_21/stable_rank_gate_proj": 78.8533935546875, "geo/layer_21/stable_rank_down_proj": 58.025543212890625, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15033672749996185, "geo/layer_21/attn_entropy_mean": 5.7333526611328125, "geo/layer_21/attn_entropy_std": 0.29756295680999756, "geo/layer_27/stable_rank_q_proj": 41.674461364746094, "geo/layer_27/stable_rank_k_proj": 31.05784034729004, "geo/layer_27/stable_rank_o_proj": 119.1258316040039, "geo/layer_27/stable_rank_gate_proj": 88.6844711303711, "geo/layer_27/stable_rank_down_proj": 135.16427612304688, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07423733174800873, "geo/layer_27/attn_entropy_mean": 4.30302619934082, "geo/layer_27/attn_entropy_std": 0.6302888989448547, "attnres/final_alpha/block_0": 0.24111448228359222, "attnres/block_norm/0": 1.6828911304473877, "attnres/final_alpha/block_1": 0.005926785059273243, "attnres/block_norm/1": 35672.26953125, "attnres/final_alpha/block_2": 0.012532643973827362, "attnres/block_norm/2": 24209.259765625, "attnres/final_alpha/block_3": 0.014086725190281868, "attnres/block_norm/3": 39395.828125, "attnres/final_alpha/block_4": 0.01831815019249916, "attnres/block_norm/4": 11343.4130859375, "attnres/final_alpha/block_5": 0.5823537707328796, "attnres/block_norm/5": 5649.4453125, "attnres/final_alpha/block_6": 0.12566743791103363, "attnres/block_norm/6": 26063.630859375, "geo/tier1_time_s": 1.3585469722747803, "geo/step": 29925.0, "geo/rankme_slope": -5.6544805422168866e-05} {"step": 29930, "timestamp": 1778226887.5675812, "train/loss": 2.2249358892440796, "train/z_loss": 0.001470151066314429, "train/perplexity": 9.252889575145813, "train/grad_norm": 0.12255859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1701616.9643861295, "perf/iters_per_sec": 0.8113941976481102, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2324465751647948, "data/tokens_consumed": 62769856512, "data/tokens_consumed_B": 62.769856512, "train/loss_slope": 1.4514248856831487e-05} {"step": 29940, "timestamp": 1778226897.9218435, "train/loss": 2.1507163763046266, "train/z_loss": 0.0014824080164544285, "train/perplexity": 8.591010589691571, "train/grad_norm": 0.150390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026466.959540112, "perf/iters_per_sec": 0.9662947461796341, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348809242248536, "data/tokens_consumed": 62790828032, "data/tokens_consumed_B": 62.790828032, "train/loss_slope": 7.18852830584116e-06} {"step": 29950, "timestamp": 1778226908.2630346, "grad/layer_0/attn": 0.0030365961138159037, "grad/layer_0/mlp": 0.002845799783244729, "grad/layer_0/attn_mlp_ratio": 1.0670448515001105, "grad/layer_4/attn": 0.002235200721770525, "grad/layer_4/mlp": 0.00249918713234365, "grad/layer_4/attn_mlp_ratio": 0.8943710550547397, "grad/layer_8/attn": 0.004555011633783579, "grad/layer_8/mlp": 0.0037659720983356237, "grad/layer_8/attn_mlp_ratio": 1.2095181254382792, "grad/layer_12/attn": 0.005690900143235922, "grad/layer_12/mlp": 0.006712721660733223, "grad/layer_12/attn_mlp_ratio": 0.847778344772972, "grad/layer_16/attn": 0.004585884511470795, "grad/layer_16/mlp": 0.005163955502212048, "grad/layer_16/attn_mlp_ratio": 0.8880565335430792, "grad/layer_20/attn": 0.004826148971915245, "grad/layer_20/mlp": 0.00691727502271533, "grad/layer_20/attn_mlp_ratio": 0.6976951019436644, "grad/layer_24/attn": 0.010721374303102493, "grad/layer_24/mlp": 0.01175989955663681, "grad/layer_24/attn_mlp_ratio": 0.9116892674379058, "grad/layer_27/attn": 0.007794560398906469, "grad/layer_27/mlp": 0.011028628796339035, "grad/layer_27/attn_mlp_ratio": 0.7067569751570735} {"step": 29950, "timestamp": 1778226908.2788193, "train/loss": 2.178913104534149, "train/z_loss": 0.0014693587087094783, "train/perplexity": 8.836696471921636, "train/grad_norm": 0.12451171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025651.118632583, "perf/iters_per_sec": 0.9659057229197421, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035297727584839, "data/tokens_consumed": 62811799552, "data/tokens_consumed_B": 62.811799552, "train/loss_slope": 6.1276338639074765e-06} {"step": 29960, "timestamp": 1778226918.6283784, "train/loss": 2.151741600036621, "train/z_loss": 0.0014666255563497543, "train/perplexity": 8.599822814108874, "train/grad_norm": 0.166015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027364.5270986003, "perf/iters_per_sec": 0.966722739743519, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034422755241394, "data/tokens_consumed": 62832771072, "data/tokens_consumed_B": 62.832771072, "train/loss_slope": 6.065231803083729e-06} {"step": 29970, "timestamp": 1778226928.9848526, "train/loss": 2.1889291763305665, "train/z_loss": 0.0014632547390647233, "train/perplexity": 8.92565019801665, "train/grad_norm": 0.1123046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026055.9235752884, "perf/iters_per_sec": 0.9660987489582483, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350908756256103, "data/tokens_consumed": 62853742592, "data/tokens_consumed_B": 62.853742592, "train/loss_slope": 4.952266836466842e-06} {"step": 29980, "timestamp": 1778226939.3333323, "train/loss": 2.176180028915405, "train/z_loss": 0.001457806897815317, "train/perplexity": 8.812578085943423, "train/grad_norm": 0.134765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027608.7084384374, "perf/iters_per_sec": 0.9668391744796931, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342981815338135, "data/tokens_consumed": 62874714112, "data/tokens_consumed_B": 62.874714112, "train/loss_slope": 4.444020459003171e-06} {"step": 29990, "timestamp": 1778226949.6817164, "train/loss": 2.2243109941482544, "train/z_loss": 0.0014630225487053394, "train/perplexity": 9.247109296050219, "train/grad_norm": 0.1376953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027532.0128055871, "perf/iters_per_sec": 0.9668026031520782, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034337306022644, "data/tokens_consumed": 62895685632, "data/tokens_consumed_B": 62.895685632, "train/loss_slope": 6.7565228321729254e-06} {"step": 30000, "timestamp": 1778226960.0417163, "grad/layer_0/attn": 0.0032372502610087395, "grad/layer_0/mlp": 0.0030880779959261417, "grad/layer_0/attn_mlp_ratio": 1.0483058266173348, "grad/layer_4/attn": 0.0027742006350308657, "grad/layer_4/mlp": 0.0024835967924445868, "grad/layer_4/attn_mlp_ratio": 1.1170092229823334, "grad/layer_8/attn": 0.006370514631271362, "grad/layer_8/mlp": 0.0038174663204699755, "grad/layer_8/attn_mlp_ratio": 1.6687807906079961, "grad/layer_12/attn": 0.004221292212605476, "grad/layer_12/mlp": 0.006487402133643627, "grad/layer_12/attn_mlp_ratio": 0.650690686437459, "grad/layer_16/attn": 0.005237685050815344, "grad/layer_16/mlp": 0.004787335637956858, "grad/layer_16/attn_mlp_ratio": 1.0940709692215334, "grad/layer_20/attn": 0.00432094931602478, "grad/layer_20/mlp": 0.006666393950581551, "grad/layer_20/attn_mlp_ratio": 0.6481689025940249, "grad/layer_24/attn": 0.011554254218935966, "grad/layer_24/mlp": 0.01214824989438057, "grad/layer_24/attn_mlp_ratio": 0.9511044162147332, "grad/layer_27/attn": 0.007515409030020237, "grad/layer_27/mlp": 0.011176896281540394, "grad/layer_27/attn_mlp_ratio": 0.6724057174255084} {"step": 30000, "timestamp": 1778226960.656475, "eos/sharpness": 46.75269126892089, "eos/L0_probe": 2.0210158824920654, "eos/L_plus": 2.292602777481079, "eos/L_minus": 2.2169559001922607, "eos/grad_norm": 0.17528192698955536, "eos/embed_grad_frac": 0.0885566845536232, "eos/time_s": 0.6119246482849121} {"step": 30000, "timestamp": 1778226960.6773324, "train/loss": 2.1900922060012817, "train/z_loss": 0.0014572713524103164, "train/perplexity": 8.936037032955175, "train/grad_norm": 0.17578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1908086.72969606, "perf/iters_per_sec": 0.9098466537933636, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.099086308479309, "data/tokens_consumed": 62916657152, "data/tokens_consumed_B": 62.916657152, "train/loss_slope": 6.279345950742727e-06} {"step": 30000, "timestamp": 1778226962.0385828, "geo/rankme_last": 439.73931884765625, "geo/layer_0/stable_rank_q_proj": 18.561012268066406, "geo/layer_0/stable_rank_k_proj": 16.09564971923828, "geo/layer_0/stable_rank_o_proj": 50.44280242919922, "geo/layer_0/stable_rank_gate_proj": 143.3249969482422, "geo/layer_0/stable_rank_down_proj": 51.68698501586914, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05175041779875755, "geo/layer_0/attn_entropy_mean": 6.219605445861816, "geo/layer_0/attn_entropy_std": 0.3438509404659271, "geo/layer_7/stable_rank_q_proj": 41.97614669799805, "geo/layer_7/stable_rank_k_proj": 41.97615051269531, "geo/layer_7/stable_rank_o_proj": 105.58036041259766, "geo/layer_7/stable_rank_gate_proj": 96.1959457397461, "geo/layer_7/stable_rank_down_proj": 147.50979614257812, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5428515076637268, "geo/layer_7/attn_entropy_mean": 4.626245498657227, "geo/layer_7/attn_entropy_std": 0.818651556968689, "geo/layer_14/stable_rank_q_proj": 55.085968017578125, "geo/layer_14/stable_rank_k_proj": 36.002784729003906, "geo/layer_14/stable_rank_o_proj": 52.57981491088867, "geo/layer_14/stable_rank_gate_proj": 80.41766357421875, "geo/layer_14/stable_rank_down_proj": 134.69186401367188, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.36424922943115234, "geo/layer_14/attn_entropy_mean": 5.468362808227539, "geo/layer_14/attn_entropy_std": 0.3925419747829437, "geo/layer_21/stable_rank_q_proj": 44.898746490478516, "geo/layer_21/stable_rank_k_proj": 31.043943405151367, "geo/layer_21/stable_rank_o_proj": 80.31151580810547, "geo/layer_21/stable_rank_gate_proj": 78.72479248046875, "geo/layer_21/stable_rank_down_proj": 57.974735260009766, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14951007068157196, "geo/layer_21/attn_entropy_mean": 5.714132785797119, "geo/layer_21/attn_entropy_std": 0.2862415015697479, "geo/layer_27/stable_rank_q_proj": 41.65104293823242, "geo/layer_27/stable_rank_k_proj": 31.082693099975586, "geo/layer_27/stable_rank_o_proj": 119.11518096923828, "geo/layer_27/stable_rank_gate_proj": 88.633544921875, "geo/layer_27/stable_rank_down_proj": 135.22723388671875, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08508601039648056, "geo/layer_27/attn_entropy_mean": 4.301269054412842, "geo/layer_27/attn_entropy_std": 0.5898798108100891, "attnres/final_alpha/block_0": 0.2381487786769867, "attnres/block_norm/0": 1.6833856105804443, "attnres/final_alpha/block_1": 0.005784319713711739, "attnres/block_norm/1": 35731.7578125, "attnres/final_alpha/block_2": 0.012224869802594185, "attnres/block_norm/2": 24257.49609375, "attnres/final_alpha/block_3": 0.01393476314842701, "attnres/block_norm/3": 39607.25, "attnres/final_alpha/block_4": 0.018055176362395287, "attnres/block_norm/4": 11304.123046875, "attnres/final_alpha/block_5": 0.5909602642059326, "attnres/block_norm/5": 5558.14501953125, "attnres/final_alpha/block_6": 0.12089177221059799, "attnres/block_norm/6": 26334.29296875, "geo/tier1_time_s": 1.3569765090942383, "geo/step": 30000.0, "geo/rankme_slope": -6.952337575655261e-05} {"step": 30000, "timestamp": 1778226968.9748003, "geo/ww_alpha_mean": 7.925917751146573, "geo/ww_alpha_std": 4.562333966449749, "geo/ww_alpha_min": 1.3545643529079185, "geo/ww_alpha_max": 29.111909267887967, "geo/ww_alpha_healthy_frac": 0.15228426395939088, "geo/ww_alpha_by_type/q_proj": 4.132830755288842, "geo/ww_alpha_by_type/k_proj": 4.588533469048383, "geo/ww_alpha_by_type/v_proj": 9.124867586792787, "geo/ww_alpha_by_type/o_proj": 7.198545291650843, "geo/ww_alpha_by_type/gate_proj": 8.880064925087337, "geo/ww_alpha_by_type/up_proj": 12.653709988984774, "geo/ww_alpha_by_type/down_proj": 9.026828549371308, "geo/twonn_id/layer_0": 0.7094221711158752, "geo/twonn_id/layer_7": 2.9822256565093994, "geo/twonn_id/layer_14": 4.27069616317749, "geo/twonn_id/layer_21": 6.988086223602295, "geo/twonn_id/layer_27": 6.004968166351318, "geo/tier2_time_s": 6.920898675918579} {"step": 30000, "timestamp": 1778226969.6071494, "eoc/jacobian_sigma/layer_0/attn": 902.3929443359375, "eoc/jacobian_sigma/layer_0/mlp": 7377.68212890625, "eoc/jacobian_sigma/layer_0": 7377.68212890625, "eoc/jacobian_sigma/layer_7/attn": 1.1719133853912354, "eoc/jacobian_sigma/layer_7/mlp": 1.690916895866394, "eoc/jacobian_sigma/layer_7": 1.690916895866394, "eoc/jacobian_sigma/layer_14/attn": 1.6699326038360596, "eoc/jacobian_sigma/layer_14/mlp": 7.226251125335693, "eoc/jacobian_sigma/layer_14": 7.226251125335693, "eoc/jacobian_sigma/layer_21/attn": 1.0952095985412598, "eoc/jacobian_sigma/layer_21/mlp": 4.031754970550537, "eoc/jacobian_sigma/layer_21": 4.031754970550537, "eoc/jacobian_sigma/layer_27/attn": 3.675236225128174, "eoc/jacobian_sigma/layer_27/mlp": 26.04814910888672, "eoc/jacobian_sigma/layer_27": 26.04814910888672, "eoc/layer0_sigma": 7377.68212890625, "eoc/sigma_max": 26.04814910888672, "eoc/sigma_min": 1.690916895866394, "eoc/sigma_mean": 9.749268025159836, "eoc/time_s": 0.6252985000610352} {"step": 30010, "timestamp": 1778226980.0060234, "train/loss": 2.2203933715820314, "train/z_loss": 0.0014606540789827704, "train/perplexity": 9.21095348066139, "train/grad_norm": 0.10986328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1085227.1222986586, "perf/iters_per_sec": 0.5174766170018475, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.9324544668197632, "data/tokens_consumed": 62937628672, "data/tokens_consumed_B": 62.937628672, "train/loss_slope": 5.177389777819067e-06} {"step": 30020, "timestamp": 1778226990.3763542, "train/loss": 2.258522868156433, "train/z_loss": 0.0014486038475297392, "train/perplexity": 9.568944130210314, "train/grad_norm": 0.25390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023341.5657516948, "perf/iters_per_sec": 0.9648044422872996, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0364794731140137, "data/tokens_consumed": 62958600192, "data/tokens_consumed_B": 62.958600192, "train/loss_slope": 9.547682406485303e-06} {"step": 30030, "timestamp": 1778227000.7232475, "train/loss": 2.192729949951172, "train/z_loss": 0.0014536719070747494, "train/perplexity": 8.959639125027957, "train/grad_norm": 0.14453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027804.0023687894, "perf/iters_per_sec": 0.9669322978824565, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341985702514649, "data/tokens_consumed": 62979571712, "data/tokens_consumed_B": 62.979571712, "train/loss_slope": 1.1798223875465268e-05} {"step": 30040, "timestamp": 1778227011.0687962, "train/loss": 2.164406752586365, "train/z_loss": 0.0014800028293393552, "train/perplexity": 8.709433535019368, "train/grad_norm": 0.1806640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027981.0988922212, "perf/iters_per_sec": 0.9670167440854174, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341082572937013, "data/tokens_consumed": 63000543232, "data/tokens_consumed_B": 63.000543232, "train/loss_slope": 1.1273882519973531e-05} {"step": 30050, "timestamp": 1778227021.4092412, "grad/layer_0/attn": 0.00276166875846684, "grad/layer_0/mlp": 0.0026652072556316853, "grad/layer_0/attn_mlp_ratio": 1.0361928322880123, "grad/layer_4/attn": 0.0018632644787430763, "grad/layer_4/mlp": 0.002532517071813345, "grad/layer_4/attn_mlp_ratio": 0.7357361677468638, "grad/layer_8/attn": 0.007835762575268745, "grad/layer_8/mlp": 0.0037100256886333227, "grad/layer_8/attn_mlp_ratio": 2.1120507030640443, "grad/layer_12/attn": 0.004531875252723694, "grad/layer_12/mlp": 0.006841891445219517, "grad/layer_12/attn_mlp_ratio": 0.6623716881174696, "grad/layer_16/attn": 0.004142620135098696, "grad/layer_16/mlp": 0.004605041351169348, "grad/layer_16/attn_mlp_ratio": 0.8995836799789869, "grad/layer_20/attn": 0.0034889057278633118, "grad/layer_20/mlp": 0.005586433224380016, "grad/layer_20/attn_mlp_ratio": 0.6245318838116647, "grad/layer_24/attn": 0.00740246195346117, "grad/layer_24/mlp": 0.00864684022963047, "grad/layer_24/attn_mlp_ratio": 0.8560886602814742, "grad/layer_27/attn": 0.005881365388631821, "grad/layer_27/mlp": 0.007144276052713394, "grad/layer_27/attn_mlp_ratio": 0.8232276108753271} {"step": 30050, "timestamp": 1778227021.4249372, "train/loss": 2.177686357498169, "train/z_loss": 0.0014691696036607028, "train/perplexity": 8.825862727207607, "train/grad_norm": 0.11962890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026063.203706431, "perf/iters_per_sec": 0.9661022203952937, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350871562957764, "data/tokens_consumed": 63021514752, "data/tokens_consumed_B": 63.021514752, "train/loss_slope": 9.35819388651973e-06} {"step": 30060, "timestamp": 1778227031.8149223, "train/loss": 2.1848040342330934, "train/z_loss": 0.0014704908011481167, "train/perplexity": 8.888906461310091, "train/grad_norm": 0.1298828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019547.7625130622, "perf/iters_per_sec": 0.9629954159322082, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0384265422821044, "data/tokens_consumed": 63042486272, "data/tokens_consumed_B": 63.042486272, "train/loss_slope": 7.927806049075705e-06} {"step": 30070, "timestamp": 1778227042.1929657, "train/loss": 2.165806198120117, "train/z_loss": 0.0014669419499114156, "train/perplexity": 8.721630445346719, "train/grad_norm": 0.1552734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022165.8906126954, "perf/iters_per_sec": 0.964243836695049, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0370820760726929, "data/tokens_consumed": 63063457792, "data/tokens_consumed_B": 63.063457792, "train/loss_slope": 7.3652310947953095e-06} {"step": 30075, "timestamp": 1778227047.9833198, "eos/sharpness": 62.876629829406724, "eos/L0_probe": 2.0199053287506104, "eos/L_plus": 2.4197468757629395, "eos/L_minus": 2.2488300800323486, "eos/grad_norm": 0.18704652786254883, "eos/embed_grad_frac": 0.07935187965631485, "eos/time_s": 0.6249215602874756} {"step": 30075, "timestamp": 1778227049.3614054, "geo/rankme_last": 439.9459228515625, "geo/layer_0/stable_rank_q_proj": 18.57451057434082, "geo/layer_0/stable_rank_k_proj": 16.092041015625, "geo/layer_0/stable_rank_o_proj": 50.409305572509766, "geo/layer_0/stable_rank_gate_proj": 143.32315063476562, "geo/layer_0/stable_rank_down_proj": 51.808597564697266, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.052115947008132935, "geo/layer_0/attn_entropy_mean": 6.225406646728516, "geo/layer_0/attn_entropy_std": 0.3417397141456604, "geo/layer_7/stable_rank_q_proj": 42.092247009277344, "geo/layer_7/stable_rank_k_proj": 42.1658821105957, "geo/layer_7/stable_rank_o_proj": 105.61528778076172, "geo/layer_7/stable_rank_gate_proj": 96.09380340576172, "geo/layer_7/stable_rank_down_proj": 147.06732177734375, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.540367603302002, "geo/layer_7/attn_entropy_mean": 4.656777858734131, "geo/layer_7/attn_entropy_std": 0.8225235342979431, "geo/layer_14/stable_rank_q_proj": 55.071868896484375, "geo/layer_14/stable_rank_k_proj": 35.8966064453125, "geo/layer_14/stable_rank_o_proj": 52.437103271484375, "geo/layer_14/stable_rank_gate_proj": 80.29739379882812, "geo/layer_14/stable_rank_down_proj": 134.67218017578125, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38014093041419983, "geo/layer_14/attn_entropy_mean": 5.497323513031006, "geo/layer_14/attn_entropy_std": 0.3957696557044983, "geo/layer_21/stable_rank_q_proj": 44.947265625, "geo/layer_21/stable_rank_k_proj": 30.969327926635742, "geo/layer_21/stable_rank_o_proj": 80.25654602050781, "geo/layer_21/stable_rank_gate_proj": 78.58071899414062, "geo/layer_21/stable_rank_down_proj": 57.94009017944336, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15072844922542572, "geo/layer_21/attn_entropy_mean": 5.729499816894531, "geo/layer_21/attn_entropy_std": 0.2875765860080719, "geo/layer_27/stable_rank_q_proj": 41.684234619140625, "geo/layer_27/stable_rank_k_proj": 31.15189552307129, "geo/layer_27/stable_rank_o_proj": 118.8547592163086, "geo/layer_27/stable_rank_gate_proj": 88.67266845703125, "geo/layer_27/stable_rank_down_proj": 135.4511260986328, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08110281080007553, "geo/layer_27/attn_entropy_mean": 4.321012496948242, "geo/layer_27/attn_entropy_std": 0.6124204993247986, "attnres/final_alpha/block_0": 0.2378617823123932, "attnres/block_norm/0": 1.6838278770446777, "attnres/final_alpha/block_1": 0.005857343785464764, "attnres/block_norm/1": 35479.2265625, "attnres/final_alpha/block_2": 0.012424485757946968, "attnres/block_norm/2": 24343.14453125, "attnres/final_alpha/block_3": 0.013785355724394321, "attnres/block_norm/3": 39643.75, "attnres/final_alpha/block_4": 0.0179107915610075, "attnres/block_norm/4": 11329.888671875, "attnres/final_alpha/block_5": 0.5892674922943115, "attnres/block_norm/5": 5585.4921875, "attnres/final_alpha/block_6": 0.12289277464151382, "attnres/block_norm/6": 26212.990234375, "geo/tier1_time_s": 1.3567841053009033, "geo/step": 30075.0, "geo/rankme_slope": -8.102293651835734e-05} {"step": 30080, "timestamp": 1778227054.5448575, "train/loss": 2.180865478515625, "train/z_loss": 0.0014735994394868613, "train/perplexity": 8.853965860862871, "train/grad_norm": 0.10546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1698647.0585740265, "perf/iters_per_sec": 0.809978036200536, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.234601378440857, "data/tokens_consumed": 63084429312, "data/tokens_consumed_B": 63.084429312, "train/loss_slope": 6.1351326897521714e-06} {"step": 30090, "timestamp": 1778227064.8936229, "train/loss": 2.186498498916626, "train/z_loss": 0.001458036992698908, "train/perplexity": 8.903981167555793, "train/grad_norm": 0.125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027290.7468357796, "perf/iters_per_sec": 0.9666875585726641, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344604015350343, "data/tokens_consumed": 63105400832, "data/tokens_consumed_B": 63.105400832, "train/loss_slope": 3.1026411299730166e-06} {"step": 30100, "timestamp": 1778227075.2356296, "grad/layer_0/attn": 0.0030774769838899374, "grad/layer_0/mlp": 0.00293124676682055, "grad/layer_0/attn_mlp_ratio": 1.049886660425841, "grad/layer_4/attn": 0.003337711561471224, "grad/layer_4/mlp": 0.0025644975248724222, "grad/layer_4/attn_mlp_ratio": 1.3015069809773239, "grad/layer_8/attn": 0.0060968538746237755, "grad/layer_8/mlp": 0.003819130128249526, "grad/layer_8/attn_mlp_ratio": 1.5963985280015522, "grad/layer_12/attn": 0.004315208178013563, "grad/layer_12/mlp": 0.006434448063373566, "grad/layer_12/attn_mlp_ratio": 0.6706415326456077, "grad/layer_16/attn": 0.005468976683914661, "grad/layer_16/mlp": 0.004824525211006403, "grad/layer_16/attn_mlp_ratio": 1.133578192954661, "grad/layer_20/attn": 0.003963470458984375, "grad/layer_20/mlp": 0.006648038048297167, "grad/layer_20/attn_mlp_ratio": 0.5961864794653113, "grad/layer_24/attn": 0.01108397077769041, "grad/layer_24/mlp": 0.010518582537770271, "grad/layer_24/attn_mlp_ratio": 1.05375136169867, "grad/layer_27/attn": 0.005973291117697954, "grad/layer_27/mlp": 0.011282422579824924, "grad/layer_27/attn_mlp_ratio": 0.5294333750125594} {"step": 30100, "timestamp": 1778227075.2514355, "train/loss": 2.1635470628738402, "train/z_loss": 0.001463326287921518, "train/perplexity": 8.701949342110163, "train/grad_norm": 0.177734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026013.177194077, "perf/iters_per_sec": 0.966078365895308, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351127147674561, "data/tokens_consumed": 63126372352, "data/tokens_consumed_B": 63.126372352, "train/loss_slope": 2.87144410871295e-06} {"step": 30110, "timestamp": 1778227085.6057048, "train/loss": 2.2190624952316282, "train/z_loss": 0.001462440739851445, "train/perplexity": 9.198702994258834, "train/grad_norm": 0.11279296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026334.5659091752, "perf/iters_per_sec": 0.966231615976894, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349485397338867, "data/tokens_consumed": 63147343872, "data/tokens_consumed_B": 63.147343872, "train/loss_slope": 5.3702474033871855e-06} {"step": 30120, "timestamp": 1778227095.960279, "train/loss": 2.1573308944702148, "train/z_loss": 0.0014742317725904286, "train/perplexity": 8.648024336659088, "train/grad_norm": 0.1259765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026402.6745762804, "perf/iters_per_sec": 0.9662640927201654, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034913754463196, "data/tokens_consumed": 63168315392, "data/tokens_consumed_B": 63.168315392, "train/loss_slope": 3.3887962017408203e-06} {"step": 30130, "timestamp": 1778227106.3127406, "train/loss": 2.1946335792541505, "train/z_loss": 0.0014664835296571254, "train/perplexity": 8.976711200906939, "train/grad_norm": 0.11083984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027079.5285472162, "perf/iters_per_sec": 0.966586841844185, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034568190574646, "data/tokens_consumed": 63189286912, "data/tokens_consumed_B": 63.189286912, "train/loss_slope": 5.710380284091619e-06} {"step": 30140, "timestamp": 1778227116.6661344, "train/loss": 2.1452013492584228, "train/z_loss": 0.0014769696164876223, "train/perplexity": 8.543761344127946, "train/grad_norm": 0.1748046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026593.0203289369, "perf/iters_per_sec": 0.9663548566479382, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348165512084961, "data/tokens_consumed": 63210258432, "data/tokens_consumed_B": 63.210258432, "train/loss_slope": 4.642984668950272e-06} {"step": 30150, "timestamp": 1778227127.004333, "grad/layer_0/attn": 0.0030551364179700613, "grad/layer_0/mlp": 0.0030102713499218225, "grad/layer_0/attn_mlp_ratio": 1.0149039609200041, "grad/layer_4/attn": 0.0018606212688609958, "grad/layer_4/mlp": 0.0023902838584035635, "grad/layer_4/attn_mlp_ratio": 0.7784101392303524, "grad/layer_8/attn": 0.00350481946952641, "grad/layer_8/mlp": 0.003514933865517378, "grad/layer_8/attn_mlp_ratio": 0.9971224221876729, "grad/layer_12/attn": 0.004388811532407999, "grad/layer_12/mlp": 0.006460146047174931, "grad/layer_12/attn_mlp_ratio": 0.67936722055851, "grad/layer_16/attn": 0.004795947577804327, "grad/layer_16/mlp": 0.004643601831048727, "grad/layer_16/attn_mlp_ratio": 1.0328076456633728, "grad/layer_20/attn": 0.004443501587957144, "grad/layer_20/mlp": 0.007213156670331955, "grad/layer_20/attn_mlp_ratio": 0.6160273136213357, "grad/layer_24/attn": 0.020353971049189568, "grad/layer_24/mlp": 0.013175045140087605, "grad/layer_24/attn_mlp_ratio": 1.5448881334584494, "grad/layer_27/attn": 0.008905607275664806, "grad/layer_27/mlp": 0.011506881564855576, "grad/layer_27/attn_mlp_ratio": 0.7739375040993421} {"step": 30150, "timestamp": 1778227127.6141202, "eos/sharpness": 71.49541378021239, "eos/L0_probe": 2.0167927742004395, "eos/L_plus": 2.302216053009033, "eos/L_minus": 2.4463236331939697, "eos/grad_norm": 0.24967528879642487, "eos/embed_grad_frac": 0.0340348556637764, "eos/time_s": 0.6070108413696289} {"step": 30150, "timestamp": 1778227127.6352184, "train/loss": 2.18396577835083, "train/z_loss": 0.0014660686836577951, "train/perplexity": 8.881458405306518, "train/grad_norm": 0.25, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1912756.8600144908, "perf/iters_per_sec": 0.9120735454628424, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0964028120040894, "data/tokens_consumed": 63231229952, "data/tokens_consumed_B": 63.231229952, "train/loss_slope": 2.304856068301814e-06} {"step": 30150, "timestamp": 1778227128.9986212, "geo/rankme_last": 440.1234436035156, "geo/layer_0/stable_rank_q_proj": 18.585691452026367, "geo/layer_0/stable_rank_k_proj": 16.053070068359375, "geo/layer_0/stable_rank_o_proj": 50.509639739990234, "geo/layer_0/stable_rank_gate_proj": 143.0527801513672, "geo/layer_0/stable_rank_down_proj": 51.80938720703125, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.053419046103954315, "geo/layer_0/attn_entropy_mean": 6.227766513824463, "geo/layer_0/attn_entropy_std": 0.34106671810150146, "geo/layer_7/stable_rank_q_proj": 42.165523529052734, "geo/layer_7/stable_rank_k_proj": 42.0914306640625, "geo/layer_7/stable_rank_o_proj": 105.17168426513672, "geo/layer_7/stable_rank_gate_proj": 95.95207977294922, "geo/layer_7/stable_rank_down_proj": 147.4098663330078, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5311887264251709, "geo/layer_7/attn_entropy_mean": 4.63761043548584, "geo/layer_7/attn_entropy_std": 0.826963484287262, "geo/layer_14/stable_rank_q_proj": 55.06483459472656, "geo/layer_14/stable_rank_k_proj": 36.00094223022461, "geo/layer_14/stable_rank_o_proj": 52.50103759765625, "geo/layer_14/stable_rank_gate_proj": 80.2503433227539, "geo/layer_14/stable_rank_down_proj": 134.6394805908203, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38556358218193054, "geo/layer_14/attn_entropy_mean": 5.496903419494629, "geo/layer_14/attn_entropy_std": 0.42161834239959717, "geo/layer_21/stable_rank_q_proj": 44.87139892578125, "geo/layer_21/stable_rank_k_proj": 30.950851440429688, "geo/layer_21/stable_rank_o_proj": 80.24180603027344, "geo/layer_21/stable_rank_gate_proj": 78.37188720703125, "geo/layer_21/stable_rank_down_proj": 57.95762252807617, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14725066721439362, "geo/layer_21/attn_entropy_mean": 5.726532936096191, "geo/layer_21/attn_entropy_std": 0.291688472032547, "geo/layer_27/stable_rank_q_proj": 41.65058517456055, "geo/layer_27/stable_rank_k_proj": 31.18014144897461, "geo/layer_27/stable_rank_o_proj": 118.96337890625, "geo/layer_27/stable_rank_gate_proj": 88.7151870727539, "geo/layer_27/stable_rank_down_proj": 135.54983520507812, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08054571598768234, "geo/layer_27/attn_entropy_mean": 4.334888458251953, "geo/layer_27/attn_entropy_std": 0.6217423677444458, "attnres/final_alpha/block_0": 0.24146801233291626, "attnres/block_norm/0": 1.6842617988586426, "attnres/final_alpha/block_1": 0.006051328964531422, "attnres/block_norm/1": 35611.890625, "attnres/final_alpha/block_2": 0.012586064636707306, "attnres/block_norm/2": 24412.908203125, "attnres/final_alpha/block_3": 0.01422263402491808, "attnres/block_norm/3": 39288.28515625, "attnres/final_alpha/block_4": 0.018544528633356094, "attnres/block_norm/4": 11376.9794921875, "attnres/final_alpha/block_5": 0.5797873735427856, "attnres/block_norm/5": 5650.421875, "attnres/final_alpha/block_6": 0.12734000384807587, "attnres/block_norm/6": 26061.6015625, "geo/tier1_time_s": 1.3594183921813965, "geo/step": 30150.0, "geo/rankme_slope": -9.017491762329932e-05} {"step": 30160, "timestamp": 1778227139.3488443, "train/loss": 2.230154824256897, "train/z_loss": 0.0014699290739372372, "train/perplexity": 9.301306035805792, "train/grad_norm": 0.185546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1790939.9477082014, "perf/iters_per_sec": 0.853986715177632, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1709784030914308, "data/tokens_consumed": 63252201472, "data/tokens_consumed_B": 63.252201472, "train/loss_slope": 4.6985400463416454e-06} {"step": 30170, "timestamp": 1778227149.6970057, "train/loss": 2.1713629007339477, "train/z_loss": 0.001481140370015055, "train/perplexity": 8.770228850434076, "train/grad_norm": 0.11328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027590.1532651545, "perf/iters_per_sec": 0.9668303266835949, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343076467514039, "data/tokens_consumed": 63273172992, "data/tokens_consumed_B": 63.273172992, "train/loss_slope": 3.6888672931395023e-06} {"step": 30180, "timestamp": 1778227160.0558376, "train/loss": 2.146506977081299, "train/z_loss": 0.0014709321549162269, "train/perplexity": 8.554923601942379, "train/grad_norm": 0.216796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025452.5079718537, "perf/iters_per_sec": 0.9658110179766911, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0353992462158204, "data/tokens_consumed": 63294144512, "data/tokens_consumed_B": 63.294144512, "train/loss_slope": 2.742443576861821e-07} {"step": 30190, "timestamp": 1778227170.4167137, "train/loss": 2.2221374988555906, "train/z_loss": 0.0014650705619715155, "train/perplexity": 9.227032573758876, "train/grad_norm": 0.181640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025083.563100071, "perf/iters_per_sec": 0.9656350913524966, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0355878829956056, "data/tokens_consumed": 63315116032, "data/tokens_consumed_B": 63.315116032, "train/loss_slope": 4.406607347746476e-06} {"step": 30200, "timestamp": 1778227180.7673068, "grad/layer_0/attn": 0.002840425120666623, "grad/layer_0/mlp": 0.002778516849502921, "grad/layer_0/attn_mlp_ratio": 1.0222810126009048, "grad/layer_4/attn": 0.0027615532744675875, "grad/layer_4/mlp": 0.002507362747564912, "grad/layer_4/attn_mlp_ratio": 1.1013775996360236, "grad/layer_8/attn": 0.01117405854165554, "grad/layer_8/mlp": 0.0038077039644122124, "grad/layer_8/attn_mlp_ratio": 2.9345921722465733, "grad/layer_12/attn": 0.003665971802547574, "grad/layer_12/mlp": 0.006352959666401148, "grad/layer_12/attn_mlp_ratio": 0.5770494285098062, "grad/layer_16/attn": 0.0034416653215885162, "grad/layer_16/mlp": 0.004298169631510973, "grad/layer_16/attn_mlp_ratio": 0.8007281090732166, "grad/layer_20/attn": 0.005107419565320015, "grad/layer_20/mlp": 0.006018270738422871, "grad/layer_20/attn_mlp_ratio": 0.8486523292890636, "grad/layer_24/attn": 0.01325314212590456, "grad/layer_24/mlp": 0.010842958465218544, "grad/layer_24/attn_mlp_ratio": 1.2222809896569442, "grad/layer_27/attn": 0.004246148280799389, "grad/layer_27/mlp": 0.010282929986715317, "grad/layer_27/attn_mlp_ratio": 0.41293174659283705} {"step": 30200, "timestamp": 1778227180.783, "train/loss": 2.207151746749878, "train/z_loss": 0.0014614276587963104, "train/perplexity": 9.089789464953016, "train/grad_norm": 0.1494140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024759.494955373, "perf/iters_per_sec": 0.9654805636193147, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0357536315917968, "data/tokens_consumed": 63336087552, "data/tokens_consumed_B": 63.336087552, "train/loss_slope": 5.403657235650009e-06} {"step": 30210, "timestamp": 1778227191.1324072, "train/loss": 2.181397891044617, "train/z_loss": 0.0014781589969061315, "train/perplexity": 8.858681078327503, "train/grad_norm": 0.1865234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027389.807068564, "perf/iters_per_sec": 0.9667347941725559, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344098567962647, "data/tokens_consumed": 63357059072, "data/tokens_consumed_B": 63.357059072, "train/loss_slope": 3.11905474338021e-06} {"step": 30220, "timestamp": 1778227201.4859653, "train/loss": 2.2086653232574465, "train/z_loss": 0.001463099627289921, "train/perplexity": 9.103557973963264, "train/grad_norm": 0.119140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026598.9969334996, "perf/iters_per_sec": 0.9663577065150736, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348134994506837, "data/tokens_consumed": 63378030592, "data/tokens_consumed_B": 63.378030592, "train/loss_slope": 3.968864141052764e-06} {"step": 30225, "timestamp": 1778227207.262674, "eos/sharpness": 67.2166347503662, "eos/L0_probe": 2.0151500701904297, "eos/L_plus": 2.2620787620544434, "eos/L_minus": 2.440387725830078, "eos/grad_norm": 0.18417923152446747, "eos/embed_grad_frac": 0.062238745391368866, "eos/time_s": 0.6103732585906982} {"step": 30225, "timestamp": 1778227208.641917, "geo/rankme_last": 439.98956298828125, "geo/layer_0/stable_rank_q_proj": 18.57583236694336, "geo/layer_0/stable_rank_k_proj": 16.07344627380371, "geo/layer_0/stable_rank_o_proj": 50.59333801269531, "geo/layer_0/stable_rank_gate_proj": 143.06307983398438, "geo/layer_0/stable_rank_down_proj": 51.798580169677734, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.054686035960912704, "geo/layer_0/attn_entropy_mean": 6.224058628082275, "geo/layer_0/attn_entropy_std": 0.3430601954460144, "geo/layer_7/stable_rank_q_proj": 42.15073776245117, "geo/layer_7/stable_rank_k_proj": 42.20668411254883, "geo/layer_7/stable_rank_o_proj": 105.11358642578125, "geo/layer_7/stable_rank_gate_proj": 95.7895736694336, "geo/layer_7/stable_rank_down_proj": 147.85879516601562, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5379278063774109, "geo/layer_7/attn_entropy_mean": 4.663407325744629, "geo/layer_7/attn_entropy_std": 0.8207433819770813, "geo/layer_14/stable_rank_q_proj": 54.976112365722656, "geo/layer_14/stable_rank_k_proj": 36.05705642700195, "geo/layer_14/stable_rank_o_proj": 52.43098449707031, "geo/layer_14/stable_rank_gate_proj": 80.36096954345703, "geo/layer_14/stable_rank_down_proj": 134.36412048339844, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.36665862798690796, "geo/layer_14/attn_entropy_mean": 5.507774353027344, "geo/layer_14/attn_entropy_std": 0.37926891446113586, "geo/layer_21/stable_rank_q_proj": 44.80158996582031, "geo/layer_21/stable_rank_k_proj": 30.874460220336914, "geo/layer_21/stable_rank_o_proj": 80.06571197509766, "geo/layer_21/stable_rank_gate_proj": 78.25765991210938, "geo/layer_21/stable_rank_down_proj": 57.93042755126953, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14806677401065826, "geo/layer_21/attn_entropy_mean": 5.737240314483643, "geo/layer_21/attn_entropy_std": 0.28384965658187866, "geo/layer_27/stable_rank_q_proj": 41.62421798706055, "geo/layer_27/stable_rank_k_proj": 31.12554931640625, "geo/layer_27/stable_rank_o_proj": 119.2273178100586, "geo/layer_27/stable_rank_gate_proj": 88.60613250732422, "geo/layer_27/stable_rank_down_proj": 135.58685302734375, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08771906048059464, "geo/layer_27/attn_entropy_mean": 4.334206581115723, "geo/layer_27/attn_entropy_std": 0.6423653364181519, "attnres/final_alpha/block_0": 0.23988009989261627, "attnres/block_norm/0": 1.684564471244812, "attnres/final_alpha/block_1": 0.00589729193598032, "attnres/block_norm/1": 35676.2578125, "attnres/final_alpha/block_2": 0.012423575855791569, "attnres/block_norm/2": 24372.19140625, "attnres/final_alpha/block_3": 0.014011664316058159, "attnres/block_norm/3": 39888.28515625, "attnres/final_alpha/block_4": 0.01828598976135254, "attnres/block_norm/4": 11357.2294921875, "attnres/final_alpha/block_5": 0.5839695334434509, "attnres/block_norm/5": 5591.439453125, "attnres/final_alpha/block_6": 0.12553182244300842, "attnres/block_norm/6": 26362.60546875, "geo/tier1_time_s": 1.359978437423706, "geo/step": 30225.0, "geo/rankme_slope": -0.00010178452240271108} {"step": 30230, "timestamp": 1778227213.821566, "train/loss": 2.187850594520569, "train/z_loss": 0.0014635349274612964, "train/perplexity": 8.916028343981827, "train/grad_norm": 0.13671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1700957.937473788, "perf/iters_per_sec": 0.8110799491280498, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2329240798950196, "data/tokens_consumed": 63399002112, "data/tokens_consumed_B": 63.399002112, "train/loss_slope": 3.950595312064158e-06} {"step": 30240, "timestamp": 1778227224.1708755, "train/loss": 2.1600428104400633, "train/z_loss": 0.0014799133874475955, "train/perplexity": 8.67150888162852, "train/grad_norm": 0.1220703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027261.4978824146, "perf/iters_per_sec": 0.9666736115848611, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034475326538086, "data/tokens_consumed": 63419973632, "data/tokens_consumed_B": 63.419973632, "train/loss_slope": 1.5140105109296138e-06} {"step": 30250, "timestamp": 1778227234.509043, "grad/layer_0/attn": 0.0024157906882464886, "grad/layer_0/mlp": 0.0025618954095989466, "grad/layer_0/attn_mlp_ratio": 0.942970031055119, "grad/layer_4/attn": 0.002374421339482069, "grad/layer_4/mlp": 0.0024403149727731943, "grad/layer_4/attn_mlp_ratio": 0.9729978583395614, "grad/layer_8/attn": 0.003576034214347601, "grad/layer_8/mlp": 0.0037386512849479914, "grad/layer_8/attn_mlp_ratio": 0.9565037887043719, "grad/layer_12/attn": 0.004749143496155739, "grad/layer_12/mlp": 0.00646160077303648, "grad/layer_12/attn_mlp_ratio": 0.7349793943438028, "grad/layer_16/attn": 0.00645857909694314, "grad/layer_16/mlp": 0.004479505587369204, "grad/layer_16/attn_mlp_ratio": 1.4418062053485725, "grad/layer_20/attn": 0.005131092853844166, "grad/layer_20/mlp": 0.006224013399332762, "grad/layer_20/attn_mlp_ratio": 0.8244025907710899, "grad/layer_24/attn": 0.009469840675592422, "grad/layer_24/mlp": 0.011340954340994358, "grad/layer_24/attn_mlp_ratio": 0.8350126724221388, "grad/layer_27/attn": 0.010908171534538269, "grad/layer_27/mlp": 0.009805534966289997, "grad/layer_27/attn_mlp_ratio": 1.112450413036508} {"step": 30250, "timestamp": 1778227234.5250099, "train/loss": 2.2086934804916383, "train/z_loss": 0.0014528881059959532, "train/perplexity": 9.103814308585935, "train/grad_norm": 0.1259765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026421.0212851206, "perf/iters_per_sec": 0.9662728411126712, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034904384613037, "data/tokens_consumed": 63440945152, "data/tokens_consumed_B": 63.440945152, "train/loss_slope": 4.691329754904636e-06} {"step": 30260, "timestamp": 1778227244.877162, "train/loss": 2.259730267524719, "train/z_loss": 0.0014422426000237464, "train/perplexity": 9.580504644982987, "train/grad_norm": 0.13671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026889.8386902171, "perf/iters_per_sec": 0.9664963906718336, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034665012359619, "data/tokens_consumed": 63461916672, "data/tokens_consumed_B": 63.461916672, "train/loss_slope": 7.120813731134271e-06} {"step": 30270, "timestamp": 1778227255.228559, "train/loss": 2.2636754512786865, "train/z_loss": 0.0014514880953356625, "train/perplexity": 9.6183761521698, "train/grad_norm": 0.0986328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027406.3492016078, "perf/iters_per_sec": 0.9667426820762671, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344014167785645, "data/tokens_consumed": 63482888192, "data/tokens_consumed_B": 63.482888192, "train/loss_slope": 8.341511033370303e-06} {"step": 30280, "timestamp": 1778227265.5766442, "train/loss": 2.155796027183533, "train/z_loss": 0.001473030971828848, "train/perplexity": 8.63476094838455, "train/grad_norm": 0.23828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027480.6052090072, "perf/iters_per_sec": 0.966778090099815, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343635320663451, "data/tokens_consumed": 63503859712, "data/tokens_consumed_B": 63.503859712, "train/loss_slope": 3.974615608361807e-06} {"step": 30290, "timestamp": 1778227275.9297657, "train/loss": 2.1965139627456667, "train/z_loss": 0.0014582532923668623, "train/perplexity": 8.993606740525793, "train/grad_norm": 0.10302734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026535.1704864583, "perf/iters_per_sec": 0.9663272716934482, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348460912704467, "data/tokens_consumed": 63524831232, "data/tokens_consumed_B": 63.524831232, "train/loss_slope": 2.487511989628969e-06} {"step": 30300, "timestamp": 1778227286.2789834, "grad/layer_0/attn": 0.0034574526362121105, "grad/layer_0/mlp": 0.0031975144520401955, "grad/layer_0/attn_mlp_ratio": 1.0812937923944896, "grad/layer_4/attn": 0.002625236054882407, "grad/layer_4/mlp": 0.0025989320129156113, "grad/layer_4/attn_mlp_ratio": 1.0101210577360125, "grad/layer_8/attn": 0.0072297994047403336, "grad/layer_8/mlp": 0.003624396165832877, "grad/layer_8/attn_mlp_ratio": 1.9947596439427806, "grad/layer_12/attn": 0.006186590064316988, "grad/layer_12/mlp": 0.00614971574395895, "grad/layer_12/attn_mlp_ratio": 1.005996085232826, "grad/layer_16/attn": 0.0040842704474925995, "grad/layer_16/mlp": 0.004553480073809624, "grad/layer_16/attn_mlp_ratio": 0.8969557989917711, "grad/layer_20/attn": 0.005168762523680925, "grad/layer_20/mlp": 0.0063322437927126884, "grad/layer_20/attn_mlp_ratio": 0.8162608091626529, "grad/layer_24/attn": 0.012302831746637821, "grad/layer_24/mlp": 0.00997539795935154, "grad/layer_24/attn_mlp_ratio": 1.2333173747492117, "grad/layer_27/attn": 0.006174566689878702, "grad/layer_27/mlp": 0.00943033304065466, "grad/layer_27/attn_mlp_ratio": 0.6547559452867919} {"step": 30300, "timestamp": 1778227286.892341, "eos/sharpness": 64.37275409698485, "eos/L0_probe": 2.0183064937591553, "eos/L_plus": 2.2787928581237793, "eos/L_minus": 2.40154767036438, "eos/grad_norm": 0.18226847052574158, "eos/embed_grad_frac": 0.06694304198026657, "eos/time_s": 0.6105506420135498} {"step": 30300, "timestamp": 1778227286.9116824, "train/loss": 2.1689399003982546, "train/z_loss": 0.0014738687430508434, "train/perplexity": 8.749004306907398, "train/grad_norm": 0.181640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1910835.1160961261, "perf/iters_per_sec": 0.9111571865540152, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.097505474090576, "data/tokens_consumed": 63545802752, "data/tokens_consumed_B": 63.545802752, "train/loss_slope": 3.482475461024154e-07} {"step": 30300, "timestamp": 1778227288.2744944, "geo/rankme_last": 439.9962463378906, "geo/layer_0/stable_rank_q_proj": 18.578678131103516, "geo/layer_0/stable_rank_k_proj": 16.062131881713867, "geo/layer_0/stable_rank_o_proj": 50.524723052978516, "geo/layer_0/stable_rank_gate_proj": 142.62539672851562, "geo/layer_0/stable_rank_down_proj": 51.81563186645508, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05493119731545448, "geo/layer_0/attn_entropy_mean": 6.226720809936523, "geo/layer_0/attn_entropy_std": 0.3376064598560333, "geo/layer_7/stable_rank_q_proj": 42.29695510864258, "geo/layer_7/stable_rank_k_proj": 42.27415466308594, "geo/layer_7/stable_rank_o_proj": 105.22383880615234, "geo/layer_7/stable_rank_gate_proj": 95.79557800292969, "geo/layer_7/stable_rank_down_proj": 147.753173828125, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5305343866348267, "geo/layer_7/attn_entropy_mean": 4.636028289794922, "geo/layer_7/attn_entropy_std": 0.830748975276947, "geo/layer_14/stable_rank_q_proj": 54.92210006713867, "geo/layer_14/stable_rank_k_proj": 36.11214828491211, "geo/layer_14/stable_rank_o_proj": 52.27981185913086, "geo/layer_14/stable_rank_gate_proj": 80.22938537597656, "geo/layer_14/stable_rank_down_proj": 134.57241821289062, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3774903118610382, "geo/layer_14/attn_entropy_mean": 5.5065178871154785, "geo/layer_14/attn_entropy_std": 0.40150678157806396, "geo/layer_21/stable_rank_q_proj": 44.803279876708984, "geo/layer_21/stable_rank_k_proj": 30.85369300842285, "geo/layer_21/stable_rank_o_proj": 80.020751953125, "geo/layer_21/stable_rank_gate_proj": 77.98516082763672, "geo/layer_21/stable_rank_down_proj": 57.88117599487305, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1502787321805954, "geo/layer_21/attn_entropy_mean": 5.730030059814453, "geo/layer_21/attn_entropy_std": 0.29254111647605896, "geo/layer_27/stable_rank_q_proj": 41.64505386352539, "geo/layer_27/stable_rank_k_proj": 31.121755599975586, "geo/layer_27/stable_rank_o_proj": 119.26441955566406, "geo/layer_27/stable_rank_gate_proj": 88.5245132446289, "geo/layer_27/stable_rank_down_proj": 135.22531127929688, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07730977237224579, "geo/layer_27/attn_entropy_mean": 4.309449672698975, "geo/layer_27/attn_entropy_std": 0.6061952710151672, "attnres/final_alpha/block_0": 0.24151907861232758, "attnres/block_norm/0": 1.6849493980407715, "attnres/final_alpha/block_1": 0.005866173189133406, "attnres/block_norm/1": 35751.89453125, "attnres/final_alpha/block_2": 0.012499218806624413, "attnres/block_norm/2": 24437.47265625, "attnres/final_alpha/block_3": 0.014062875881791115, "attnres/block_norm/3": 39635.36328125, "attnres/final_alpha/block_4": 0.01833375357091427, "attnres/block_norm/4": 11431.94140625, "attnres/final_alpha/block_5": 0.5836875438690186, "attnres/block_norm/5": 5617.279296875, "attnres/final_alpha/block_6": 0.12403139472007751, "attnres/block_norm/6": 26283.095703125, "geo/tier1_time_s": 1.3595540523529053, "geo/step": 30300.0, "geo/rankme_slope": -8.95725868472389e-05} {"step": 30310, "timestamp": 1778227298.6245723, "train/loss": 2.154695010185242, "train/z_loss": 0.0014706762740388513, "train/perplexity": 8.625259161578462, "train/grad_norm": 0.13671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1790960.1493607503, "perf/iters_per_sec": 0.8539963480762245, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1709651947021484, "data/tokens_consumed": 63566774272, "data/tokens_consumed_B": 63.566774272, "train/loss_slope": -5.207668830065843e-06} {"step": 30320, "timestamp": 1778227308.9760256, "train/loss": 2.193307864665985, "train/z_loss": 0.001465206267312169, "train/perplexity": 8.964818528800398, "train/grad_norm": 0.09912109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027231.1284565527, "perf/iters_per_sec": 0.9666591303141369, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344908237457275, "data/tokens_consumed": 63587745792, "data/tokens_consumed_B": 63.587745792, "train/loss_slope": -5.9607007072167325e-06} {"step": 30330, "timestamp": 1778227319.324602, "train/loss": 2.15020272731781, "train/z_loss": 0.0014705535140819848, "train/perplexity": 8.586598958918279, "train/grad_norm": 0.1376953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027590.2934792961, "perf/iters_per_sec": 0.9668303935429078, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343075752258302, "data/tokens_consumed": 63608717312, "data/tokens_consumed_B": 63.608717312, "train/loss_slope": -1.0122586140717045e-05} {"step": 30340, "timestamp": 1778227329.672704, "train/loss": 2.1909476041793825, "train/z_loss": 0.0014744130545295774, "train/perplexity": 8.943684172961133, "train/grad_norm": 0.1474609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027656.944130159, "perf/iters_per_sec": 0.9668621750498576, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342735767364502, "data/tokens_consumed": 63629688832, "data/tokens_consumed_B": 63.629688832, "train/loss_slope": -8.057410674806188e-06} {"step": 30350, "timestamp": 1778227340.0115054, "grad/layer_0/attn": 0.0027610743418335915, "grad/layer_0/mlp": 0.0026918102521449327, "grad/layer_0/attn_mlp_ratio": 1.0257313780049424, "grad/layer_4/attn": 0.0016280676936730742, "grad/layer_4/mlp": 0.0024306573905050755, "grad/layer_4/attn_mlp_ratio": 0.6698054744581772, "grad/layer_8/attn": 0.004355841316282749, "grad/layer_8/mlp": 0.0038065616972744465, "grad/layer_8/attn_mlp_ratio": 1.1442980695601985, "grad/layer_12/attn": 0.005439104046672583, "grad/layer_12/mlp": 0.006777417846024036, "grad/layer_12/attn_mlp_ratio": 0.8025333674254849, "grad/layer_16/attn": 0.003851652145385742, "grad/layer_16/mlp": 0.004517207387834787, "grad/layer_16/attn_mlp_ratio": 0.8526622157070624, "grad/layer_20/attn": 0.0060465713031589985, "grad/layer_20/mlp": 0.006246537901461124, "grad/layer_20/attn_mlp_ratio": 0.9679875959683025, "grad/layer_24/attn": 0.013777473010122776, "grad/layer_24/mlp": 0.011516825295984745, "grad/layer_24/attn_mlp_ratio": 1.1962908645750754, "grad/layer_27/attn": 0.005705689545720816, "grad/layer_27/mlp": 0.01165764406323433, "grad/layer_27/attn_mlp_ratio": 0.48943761413779624} {"step": 30350, "timestamp": 1778227340.027046, "train/loss": 2.176506233215332, "train/z_loss": 0.0014654955244623124, "train/perplexity": 8.815453255729379, "train/grad_norm": 0.1787109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026600.257628027, "perf/iters_per_sec": 0.9663583076610693, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03481285572052, "data/tokens_consumed": 63650660352, "data/tokens_consumed_B": 63.650660352, "train/loss_slope": -1.1065778752328948e-05} {"step": 30360, "timestamp": 1778227350.3728337, "train/loss": 2.172458791732788, "train/z_loss": 0.001477281586267054, "train/perplexity": 8.779845333635295, "train/grad_norm": 0.2109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028138.1646922543, "perf/iters_per_sec": 0.967091638895156, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034028172492981, "data/tokens_consumed": 63671631872, "data/tokens_consumed_B": 63.671631872, "train/loss_slope": -1.1674284541567537e-05} {"step": 30370, "timestamp": 1778227360.7449493, "train/loss": 2.1459023475646974, "train/z_loss": 0.0014548308914527296, "train/perplexity": 8.549752606046281, "train/grad_norm": 0.279296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023008.4237097115, "perf/iters_per_sec": 0.9646455877827222, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0366501569747926, "data/tokens_consumed": 63692603392, "data/tokens_consumed_B": 63.692603392, "train/loss_slope": -1.5878171574557936e-05} {"step": 30375, "timestamp": 1778227366.5264833, "eos/sharpness": 23.055672645568844, "eos/L0_probe": 2.021023750305176, "eos/L_plus": 2.1407201290130615, "eos/L_minus": 2.1318840980529785, "eos/grad_norm": 0.12327826768159866, "eos/embed_grad_frac": 0.14771324396133423, "eos/time_s": 0.6091578006744385} {"step": 30375, "timestamp": 1778227367.901571, "geo/rankme_last": 438.9608459472656, "geo/layer_0/stable_rank_q_proj": 18.59998321533203, "geo/layer_0/stable_rank_k_proj": 16.086170196533203, "geo/layer_0/stable_rank_o_proj": 50.637699127197266, "geo/layer_0/stable_rank_gate_proj": 143.04673767089844, "geo/layer_0/stable_rank_down_proj": 51.834434509277344, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05190027505159378, "geo/layer_0/attn_entropy_mean": 6.223764419555664, "geo/layer_0/attn_entropy_std": 0.3427789509296417, "geo/layer_7/stable_rank_q_proj": 42.365291595458984, "geo/layer_7/stable_rank_k_proj": 42.33343505859375, "geo/layer_7/stable_rank_o_proj": 105.23453521728516, "geo/layer_7/stable_rank_gate_proj": 95.72551727294922, "geo/layer_7/stable_rank_down_proj": 147.68655395507812, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5417532920837402, "geo/layer_7/attn_entropy_mean": 4.64042329788208, "geo/layer_7/attn_entropy_std": 0.829356849193573, "geo/layer_14/stable_rank_q_proj": 54.91450881958008, "geo/layer_14/stable_rank_k_proj": 36.09813690185547, "geo/layer_14/stable_rank_o_proj": 52.37775421142578, "geo/layer_14/stable_rank_gate_proj": 80.22261810302734, "geo/layer_14/stable_rank_down_proj": 134.37571716308594, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3695736229419708, "geo/layer_14/attn_entropy_mean": 5.47293758392334, "geo/layer_14/attn_entropy_std": 0.39197319746017456, "geo/layer_21/stable_rank_q_proj": 44.905784606933594, "geo/layer_21/stable_rank_k_proj": 30.933204650878906, "geo/layer_21/stable_rank_o_proj": 80.0545654296875, "geo/layer_21/stable_rank_gate_proj": 78.04779052734375, "geo/layer_21/stable_rank_down_proj": 57.85619354248047, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15693575143814087, "geo/layer_21/attn_entropy_mean": 5.739451885223389, "geo/layer_21/attn_entropy_std": 0.27872633934020996, "geo/layer_27/stable_rank_q_proj": 41.652488708496094, "geo/layer_27/stable_rank_k_proj": 31.14365577697754, "geo/layer_27/stable_rank_o_proj": 119.08951568603516, "geo/layer_27/stable_rank_gate_proj": 88.54747009277344, "geo/layer_27/stable_rank_down_proj": 135.23597717285156, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08428157866001129, "geo/layer_27/attn_entropy_mean": 4.298600196838379, "geo/layer_27/attn_entropy_std": 0.607114315032959, "attnres/final_alpha/block_0": 0.23810164630413055, "attnres/block_norm/0": 1.6852869987487793, "attnres/final_alpha/block_1": 0.005739094223827124, "attnres/block_norm/1": 35884.1015625, "attnres/final_alpha/block_2": 0.012162541039288044, "attnres/block_norm/2": 24557.2890625, "attnres/final_alpha/block_3": 0.013921098783612251, "attnres/block_norm/3": 39598.5859375, "attnres/final_alpha/block_4": 0.01789890043437481, "attnres/block_norm/4": 11405.140625, "attnres/final_alpha/block_5": 0.589690625667572, "attnres/block_norm/5": 5586.6728515625, "attnres/final_alpha/block_6": 0.12248606979846954, "attnres/block_norm/6": 26636.541015625, "geo/tier1_time_s": 1.35528564453125, "geo/step": 30375.0, "geo/rankme_slope": -0.00010952250040641257} {"step": 30380, "timestamp": 1778227373.0793128, "train/loss": 2.1480137586593626, "train/z_loss": 0.0014684549998492003, "train/perplexity": 8.567823719616669, "train/grad_norm": 0.1865234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1700993.2976831135, "perf/iters_per_sec": 0.8110968101897781, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2328984498977662, "data/tokens_consumed": 63713574912, "data/tokens_consumed_B": 63.713574912, "train/loss_slope": -1.710006770807235e-05} {"step": 30390, "timestamp": 1778227383.8400915, "train/loss": 2.1876960515975954, "train/z_loss": 0.0014561648131348193, "train/perplexity": 8.914650541367799, "train/grad_norm": 0.1318359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1949736.26465293, "perf/iters_per_sec": 0.9297066996826792, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0756080389022826, "data/tokens_consumed": 63734546432, "data/tokens_consumed_B": 63.734546432, "train/loss_slope": -1.7875166866395163e-05} {"step": 30400, "timestamp": 1778227394.2005877, "grad/layer_0/attn": 0.0031715549994260073, "grad/layer_0/mlp": 0.0030150278471410275, "grad/layer_0/attn_mlp_ratio": 1.0519156223521593, "grad/layer_4/attn": 0.0019288850016891956, "grad/layer_4/mlp": 0.002641142113134265, "grad/layer_4/attn_mlp_ratio": 0.7303222795413853, "grad/layer_8/attn": 0.0032160598784685135, "grad/layer_8/mlp": 0.003694278420880437, "grad/layer_8/attn_mlp_ratio": 0.8705515462061176, "grad/layer_12/attn": 0.0038931570015847683, "grad/layer_12/mlp": 0.005909082945436239, "grad/layer_12/attn_mlp_ratio": 0.6588428308841537, "grad/layer_16/attn": 0.004679462406784296, "grad/layer_16/mlp": 0.004707218613475561, "grad/layer_16/attn_mlp_ratio": 0.9941034593077638, "grad/layer_20/attn": 0.004134419374167919, "grad/layer_20/mlp": 0.006385188549757004, "grad/layer_20/attn_mlp_ratio": 0.6475015228133095, "grad/layer_24/attn": 0.01846911758184433, "grad/layer_24/mlp": 0.012800995260477066, "grad/layer_24/attn_mlp_ratio": 1.4427876162558053, "grad/layer_27/attn": 0.009779179468750954, "grad/layer_27/mlp": 0.011675441637635231, "grad/layer_27/attn_mlp_ratio": 0.8375853940693511} {"step": 30400, "timestamp": 1778227394.2164767, "train/loss": 2.1639516830444334, "train/z_loss": 0.0014636803301982582, "train/perplexity": 8.705471038764165, "train/grad_norm": 0.23046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022015.6050801617, "perf/iters_per_sec": 0.9641721749687966, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0371591567993164, "data/tokens_consumed": 63755517952, "data/tokens_consumed_B": 63.755517952, "train/loss_slope": -1.7230054204112698e-05} {"step": 30410, "timestamp": 1778227404.572507, "train/loss": 2.157997179031372, "train/z_loss": 0.0014751668786630034, "train/perplexity": 8.65378830176636, "train/grad_norm": 0.2158203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026336.5731587652, "perf/iters_per_sec": 0.9662325731080843, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349475145339966, "data/tokens_consumed": 63776489472, "data/tokens_consumed_B": 63.776489472, "train/loss_slope": -2.295655702063888e-05} {"step": 30420, "timestamp": 1778227414.93369, "train/loss": 2.1808403968811034, "train/z_loss": 0.0014735602657310666, "train/perplexity": 8.853743791712022, "train/grad_norm": 0.21484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025015.543074947, "perf/iters_per_sec": 0.9656026568770156, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0356226682662963, "data/tokens_consumed": 63797460992, "data/tokens_consumed_B": 63.797460992, "train/loss_slope": -2.148376924656118e-05} {"step": 30430, "timestamp": 1778227425.287524, "train/loss": 2.230404424667358, "train/z_loss": 0.0014614920830354095, "train/perplexity": 9.30362793537164, "train/grad_norm": 0.1591796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026880.030527311, "perf/iters_per_sec": 0.9664917137753062, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346700191497802, "data/tokens_consumed": 63818432512, "data/tokens_consumed_B": 63.818432512, "train/loss_slope": -1.80829932301244e-05} {"step": 30440, "timestamp": 1778227435.6506662, "train/loss": 2.2469090461730956, "train/z_loss": 0.001455588627140969, "train/perplexity": 9.458454959198576, "train/grad_norm": 0.10986328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024545.8680938075, "perf/iters_per_sec": 0.9653786983937299, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035862922668457, "data/tokens_consumed": 63839404032, "data/tokens_consumed_B": 63.839404032, "train/loss_slope": -1.4693431585284998e-05} {"step": 30450, "timestamp": 1778227445.9983315, "grad/layer_0/attn": 0.002806372009217739, "grad/layer_0/mlp": 0.002676902338862419, "grad/layer_0/attn_mlp_ratio": 1.0483654422648065, "grad/layer_4/attn": 0.0035239295102655888, "grad/layer_4/mlp": 0.0024734761100262403, "grad/layer_4/attn_mlp_ratio": 1.424687044080446, "grad/layer_8/attn": 0.004100858699530363, "grad/layer_8/mlp": 0.0035091836471110582, "grad/layer_8/attn_mlp_ratio": 1.1686075723183211, "grad/layer_12/attn": 0.003766498761251569, "grad/layer_12/mlp": 0.006199450232088566, "grad/layer_12/attn_mlp_ratio": 0.6075536635491768, "grad/layer_16/attn": 0.004475889727473259, "grad/layer_16/mlp": 0.0044017662294209, "grad/layer_16/attn_mlp_ratio": 1.0168394668197007, "grad/layer_20/attn": 0.004217860754579306, "grad/layer_20/mlp": 0.006221035495400429, "grad/layer_20/attn_mlp_ratio": 0.6779997783163006, "grad/layer_24/attn": 0.012710846960544586, "grad/layer_24/mlp": 0.010648438706994057, "grad/layer_24/attn_mlp_ratio": 1.1936817397303263, "grad/layer_27/attn": 0.012515711598098278, "grad/layer_27/mlp": 0.009117767214775085, "grad/layer_27/attn_mlp_ratio": 1.3726728447891974} {"step": 30450, "timestamp": 1778227446.6433647, "eos/sharpness": 58.58635902404784, "eos/L0_probe": 2.0181381702423096, "eos/L_plus": 2.254871368408203, "eos/L_minus": 2.3672685623168945, "eos/grad_norm": 0.16474872827529907, "eos/embed_grad_frac": 0.09631697833538055, "eos/time_s": 0.6421511173248291} {"step": 30450, "timestamp": 1778227446.6729522, "train/loss": 2.204052472114563, "train/z_loss": 0.0014548685168847442, "train/perplexity": 9.061661321959402, "train/grad_norm": 0.1640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1903538.662203021, "perf/iters_per_sec": 0.9076779662146669, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1017123222351075, "data/tokens_consumed": 63860375552, "data/tokens_consumed_B": 63.860375552, "train/loss_slope": -1.5781020116705905e-05} {"step": 30450, "timestamp": 1778227448.0336165, "geo/rankme_last": 440.04241943359375, "geo/layer_0/stable_rank_q_proj": 18.619991302490234, "geo/layer_0/stable_rank_k_proj": 16.079484939575195, "geo/layer_0/stable_rank_o_proj": 50.67472457885742, "geo/layer_0/stable_rank_gate_proj": 142.96957397460938, "geo/layer_0/stable_rank_down_proj": 51.820213317871094, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.058876171708106995, "geo/layer_0/attn_entropy_mean": 6.221444129943848, "geo/layer_0/attn_entropy_std": 0.3419317305088043, "geo/layer_7/stable_rank_q_proj": 42.337562561035156, "geo/layer_7/stable_rank_k_proj": 42.38053894042969, "geo/layer_7/stable_rank_o_proj": 105.3449478149414, "geo/layer_7/stable_rank_gate_proj": 95.80889129638672, "geo/layer_7/stable_rank_down_proj": 148.16061401367188, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5366830825805664, "geo/layer_7/attn_entropy_mean": 4.6366376876831055, "geo/layer_7/attn_entropy_std": 0.8091118931770325, "geo/layer_14/stable_rank_q_proj": 54.905826568603516, "geo/layer_14/stable_rank_k_proj": 36.03322219848633, "geo/layer_14/stable_rank_o_proj": 52.32325744628906, "geo/layer_14/stable_rank_gate_proj": 80.10974884033203, "geo/layer_14/stable_rank_down_proj": 134.3725128173828, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3791464567184448, "geo/layer_14/attn_entropy_mean": 5.466951370239258, "geo/layer_14/attn_entropy_std": 0.4051359295845032, "geo/layer_21/stable_rank_q_proj": 44.89802932739258, "geo/layer_21/stable_rank_k_proj": 30.914676666259766, "geo/layer_21/stable_rank_o_proj": 80.12516021728516, "geo/layer_21/stable_rank_gate_proj": 77.97505950927734, "geo/layer_21/stable_rank_down_proj": 57.759605407714844, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14778879284858704, "geo/layer_21/attn_entropy_mean": 5.730297088623047, "geo/layer_21/attn_entropy_std": 0.2813117504119873, "geo/layer_27/stable_rank_q_proj": 41.592529296875, "geo/layer_27/stable_rank_k_proj": 31.153064727783203, "geo/layer_27/stable_rank_o_proj": 118.63086700439453, "geo/layer_27/stable_rank_gate_proj": 88.38436889648438, "geo/layer_27/stable_rank_down_proj": 134.76300048828125, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08870598673820496, "geo/layer_27/attn_entropy_mean": 4.325376510620117, "geo/layer_27/attn_entropy_std": 0.6029244065284729, "attnres/final_alpha/block_0": 0.24092505872249603, "attnres/block_norm/0": 1.6857891082763672, "attnres/final_alpha/block_1": 0.005827778019011021, "attnres/block_norm/1": 35803.5703125, "attnres/final_alpha/block_2": 0.012466778978705406, "attnres/block_norm/2": 24418.31640625, "attnres/final_alpha/block_3": 0.014375640079379082, "attnres/block_norm/3": 39729.2421875, "attnres/final_alpha/block_4": 0.018291903659701347, "attnres/block_norm/4": 11432.994140625, "attnres/final_alpha/block_5": 0.5848464369773865, "attnres/block_norm/5": 5605.08203125, "attnres/final_alpha/block_6": 0.12326636910438538, "attnres/block_norm/6": 26497.42578125, "geo/tier1_time_s": 1.3567075729370117, "geo/step": 30450.0, "geo/rankme_slope": -0.00012102073251175471} {"step": 30460, "timestamp": 1778227458.389719, "train/loss": 2.170798420906067, "train/z_loss": 0.0014564921730197966, "train/perplexity": 8.765279630161029, "train/grad_norm": 0.2060546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1790465.084917547, "perf/iters_per_sec": 0.853760282954, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1712889671325684, "data/tokens_consumed": 63881347072, "data/tokens_consumed_B": 63.881347072, "train/loss_slope": -1.785869835877327e-05} {"step": 30470, "timestamp": 1778227468.753692, "train/loss": 2.1819947719573975, "train/z_loss": 0.0014641129644587636, "train/perplexity": 8.863970234316668, "train/grad_norm": 0.1904296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024661.5770478358, "perf/iters_per_sec": 0.9654338727225474, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0358037233352662, "data/tokens_consumed": 63902318592, "data/tokens_consumed_B": 63.902318592, "train/loss_slope": -2.011009119596348e-05} {"step": 30480, "timestamp": 1778227479.1231358, "train/loss": 2.197733998298645, "train/z_loss": 0.001460024924017489, "train/perplexity": 9.004585956653882, "train/grad_norm": 0.09765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023804.2562546246, "perf/iters_per_sec": 0.9650250703118441, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0362425088882445, "data/tokens_consumed": 63923290112, "data/tokens_consumed_B": 63.923290112, "train/loss_slope": -1.730995480090765e-05} {"step": 30490, "timestamp": 1778227489.493239, "train/loss": 2.211646056175232, "train/z_loss": 0.001469899108633399, "train/perplexity": 9.130733730600557, "train/grad_norm": 0.1376953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023592.6930598246, "perf/iters_per_sec": 0.9649241891192554, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0363508462905884, "data/tokens_consumed": 63944261632, "data/tokens_consumed_B": 63.944261632, "train/loss_slope": -1.5331776321667496e-05} {"step": 30500, "timestamp": 1778227499.849581, "grad/layer_0/attn": 0.002573868725448847, "grad/layer_0/mlp": 0.0027170414105057716, "grad/layer_0/attn_mlp_ratio": 0.9473056320621787, "grad/layer_4/attn": 0.0017887076828628778, "grad/layer_4/mlp": 0.002415674040094018, "grad/layer_4/attn_mlp_ratio": 0.7404590102509693, "grad/layer_8/attn": 0.0036347457207739353, "grad/layer_8/mlp": 0.003499707207083702, "grad/layer_8/attn_mlp_ratio": 1.0385856306945738, "grad/layer_12/attn": 0.004691087640821934, "grad/layer_12/mlp": 0.006403978914022446, "grad/layer_12/attn_mlp_ratio": 0.7325270164924204, "grad/layer_16/attn": 0.004903809167444706, "grad/layer_16/mlp": 0.004483602941036224, "grad/layer_16/attn_mlp_ratio": 1.0937206355162445, "grad/layer_20/attn": 0.00478952145203948, "grad/layer_20/mlp": 0.006296803709119558, "grad/layer_20/attn_mlp_ratio": 0.7606273908523075, "grad/layer_24/attn": 0.01096823625266552, "grad/layer_24/mlp": 0.011838107369840145, "grad/layer_24/attn_mlp_ratio": 0.9265194019068682, "grad/layer_27/attn": 0.006089170929044485, "grad/layer_27/mlp": 0.011639603413641453, "grad/layer_27/attn_mlp_ratio": 0.5231424697506287} {"step": 30500, "timestamp": 1778227499.865337, "train/loss": 2.2030077457427977, "train/z_loss": 0.0014712390839122236, "train/perplexity": 9.05219930887226, "train/grad_norm": 0.19140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022925.3763249952, "perf/iters_per_sec": 0.9646059877037979, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0366927146911622, "data/tokens_consumed": 63965233152, "data/tokens_consumed_B": 63.965233152, "train/loss_slope": -1.3612092253040925e-05} {"step": 30500, "timestamp": 1778227506.7098727, "geo/ww_alpha_mean": 7.625780266651858, "geo/ww_alpha_std": 4.642888142530699, "geo/ww_alpha_min": 1.3537361539668755, "geo/ww_alpha_max": 45.26459198093718, "geo/ww_alpha_healthy_frac": 0.16243654822335024, "geo/ww_alpha_by_type/q_proj": 4.135323122059907, "geo/ww_alpha_by_type/k_proj": 4.4935810314558084, "geo/ww_alpha_by_type/v_proj": 7.433058385947549, "geo/ww_alpha_by_type/o_proj": 8.538991455661185, "geo/ww_alpha_by_type/gate_proj": 8.408625083044083, "geo/ww_alpha_by_type/up_proj": 11.450369634826941, "geo/ww_alpha_by_type/down_proj": 9.033140545008006, "geo/twonn_id/layer_0": 0.71798175573349, "geo/twonn_id/layer_7": 2.896064281463623, "geo/twonn_id/layer_14": 4.252801418304443, "geo/twonn_id/layer_21": 6.7823166847229, "geo/twonn_id/layer_27": 5.774855613708496, "geo/tier2_time_s": 6.838211297988892} {"step": 30500, "timestamp": 1778227507.3516214, "eoc/jacobian_sigma/layer_0/attn": 996.4449462890625, "eoc/jacobian_sigma/layer_0/mlp": 7405.0458984375, "eoc/jacobian_sigma/layer_0": 7405.0458984375, "eoc/jacobian_sigma/layer_7/attn": 1.1587313413619995, "eoc/jacobian_sigma/layer_7/mlp": 1.6666990518569946, "eoc/jacobian_sigma/layer_7": 1.6666990518569946, "eoc/jacobian_sigma/layer_14/attn": 1.6432865858078003, "eoc/jacobian_sigma/layer_14/mlp": 9.234596252441406, "eoc/jacobian_sigma/layer_14": 9.234596252441406, "eoc/jacobian_sigma/layer_21/attn": 1.089451789855957, "eoc/jacobian_sigma/layer_21/mlp": 4.092410087585449, "eoc/jacobian_sigma/layer_21": 4.092410087585449, "eoc/jacobian_sigma/layer_27/attn": 3.4490816593170166, "eoc/jacobian_sigma/layer_27/mlp": 22.376346588134766, "eoc/jacobian_sigma/layer_27": 22.376346588134766, "eoc/layer0_sigma": 7405.0458984375, "eoc/sigma_max": 22.376346588134766, "eoc/sigma_min": 1.6666990518569946, "eoc/sigma_mean": 9.342512995004654, "eoc/time_s": 0.6350667476654053} {"step": 30510, "timestamp": 1778227517.7487786, "train/loss": 2.2042097091674804, "train/z_loss": 0.0014643308473750948, "train/perplexity": 9.063086262904026, "train/grad_norm": 0.1259765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1173015.0061129224, "perf/iters_per_sec": 0.5593371420445072, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.7878304958343505, "data/tokens_consumed": 63986204672, "data/tokens_consumed_B": 63.986204672, "train/loss_slope": -1.4611360728472459e-05} {"step": 30520, "timestamp": 1778227528.1118228, "train/loss": 2.178198051452637, "train/z_loss": 0.0014636600972153246, "train/perplexity": 8.830380023446109, "train/grad_norm": 0.1064453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024785.1294796804, "perf/iters_per_sec": 0.9654927871130373, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0357405185699462, "data/tokens_consumed": 64007176192, "data/tokens_consumed_B": 64.007176192, "train/loss_slope": -1.366218431840742e-05} {"step": 30525, "timestamp": 1778227533.9045632, "eos/sharpness": 64.24925327301024, "eos/L0_probe": 2.015000343322754, "eos/L_plus": 2.4062016010284424, "eos/L_minus": 2.266291618347168, "eos/grad_norm": 0.24139373004436493, "eos/embed_grad_frac": 0.043444402515888214, "eos/time_s": 0.6194212436676025} {"step": 30525, "timestamp": 1778227535.279732, "geo/rankme_last": 439.2037048339844, "geo/layer_0/stable_rank_q_proj": 18.62156867980957, "geo/layer_0/stable_rank_k_proj": 16.102794647216797, "geo/layer_0/stable_rank_o_proj": 50.621150970458984, "geo/layer_0/stable_rank_gate_proj": 142.9093780517578, "geo/layer_0/stable_rank_down_proj": 51.92167282104492, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05550459772348404, "geo/layer_0/attn_entropy_mean": 6.219615936279297, "geo/layer_0/attn_entropy_std": 0.34148409962654114, "geo/layer_7/stable_rank_q_proj": 42.30604934692383, "geo/layer_7/stable_rank_k_proj": 42.517948150634766, "geo/layer_7/stable_rank_o_proj": 105.21587371826172, "geo/layer_7/stable_rank_gate_proj": 95.74118041992188, "geo/layer_7/stable_rank_down_proj": 148.17941284179688, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5134467482566833, "geo/layer_7/attn_entropy_mean": 4.622847557067871, "geo/layer_7/attn_entropy_std": 0.8480402827262878, "geo/layer_14/stable_rank_q_proj": 55.06272506713867, "geo/layer_14/stable_rank_k_proj": 36.10076141357422, "geo/layer_14/stable_rank_o_proj": 52.366615295410156, "geo/layer_14/stable_rank_gate_proj": 80.03313446044922, "geo/layer_14/stable_rank_down_proj": 134.30966186523438, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38011041283607483, "geo/layer_14/attn_entropy_mean": 5.478878021240234, "geo/layer_14/attn_entropy_std": 0.4049108326435089, "geo/layer_21/stable_rank_q_proj": 44.92933654785156, "geo/layer_21/stable_rank_k_proj": 30.934724807739258, "geo/layer_21/stable_rank_o_proj": 80.27784729003906, "geo/layer_21/stable_rank_gate_proj": 77.91654205322266, "geo/layer_21/stable_rank_down_proj": 57.80728530883789, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14879454672336578, "geo/layer_21/attn_entropy_mean": 5.742684841156006, "geo/layer_21/attn_entropy_std": 0.2796074151992798, "geo/layer_27/stable_rank_q_proj": 41.63569641113281, "geo/layer_27/stable_rank_k_proj": 31.212617874145508, "geo/layer_27/stable_rank_o_proj": 118.60179138183594, "geo/layer_27/stable_rank_gate_proj": 88.39981842041016, "geo/layer_27/stable_rank_down_proj": 134.7927703857422, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08406800031661987, "geo/layer_27/attn_entropy_mean": 4.3141984939575195, "geo/layer_27/attn_entropy_std": 0.6107887029647827, "attnres/final_alpha/block_0": 0.23852390050888062, "attnres/block_norm/0": 1.685862421989441, "attnres/final_alpha/block_1": 0.005820172838866711, "attnres/block_norm/1": 35895.3515625, "attnres/final_alpha/block_2": 0.012414196506142616, "attnres/block_norm/2": 24492.76171875, "attnres/final_alpha/block_3": 0.014126951806247234, "attnres/block_norm/3": 40196.28125, "attnres/final_alpha/block_4": 0.018039576709270477, "attnres/block_norm/4": 11445.638671875, "attnres/final_alpha/block_5": 0.5895785093307495, "attnres/block_norm/5": 5590.6669921875, "attnres/final_alpha/block_6": 0.12149670720100403, "attnres/block_norm/6": 26772.74609375, "geo/tier1_time_s": 1.3556547164916992, "geo/step": 30525.0, "geo/rankme_slope": -0.0001366973938012705} {"step": 30530, "timestamp": 1778227540.4671545, "train/loss": 2.2401287317276, "train/z_loss": 0.0014580053510144352, "train/perplexity": 9.394540585043151, "train/grad_norm": 0.2060546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1698089.3245786005, "perf/iters_per_sec": 0.8097120879071238, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2350068807601928, "data/tokens_consumed": 64028147712, "data/tokens_consumed_B": 64.028147712, "train/loss_slope": -8.372727822441976e-06} {"step": 30540, "timestamp": 1778227550.8315842, "train/loss": 2.190924572944641, "train/z_loss": 0.0014724296634085476, "train/perplexity": 8.943478191243507, "train/grad_norm": 0.0888671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024692.5686024236, "perf/iters_per_sec": 0.9654486506473654, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035787868499756, "data/tokens_consumed": 64049119232, "data/tokens_consumed_B": 64.049119232, "train/loss_slope": -8.764655326101625e-06} {"step": 30550, "timestamp": 1778227561.1966445, "grad/layer_0/attn": 0.0024356823414564133, "grad/layer_0/mlp": 0.0024966043420135975, "grad/layer_0/attn_mlp_ratio": 0.9755980164370578, "grad/layer_4/attn": 0.0018579306779429317, "grad/layer_4/mlp": 0.002433139132335782, "grad/layer_4/attn_mlp_ratio": 0.7635940653340015, "grad/layer_8/attn": 0.007530306465923786, "grad/layer_8/mlp": 0.003471303265541792, "grad/layer_8/attn_mlp_ratio": 2.1693023262311377, "grad/layer_12/attn": 0.004913171287626028, "grad/layer_12/mlp": 0.006310786120593548, "grad/layer_12/attn_mlp_ratio": 0.7785355288368381, "grad/layer_16/attn": 0.0034714972134679556, "grad/layer_16/mlp": 0.004343928769230843, "grad/layer_16/attn_mlp_ratio": 0.7991606948395166, "grad/layer_20/attn": 0.0038656392134726048, "grad/layer_20/mlp": 0.0055879815481603146, "grad/layer_20/attn_mlp_ratio": 0.691777363790244, "grad/layer_24/attn": 0.010438165627419949, "grad/layer_24/mlp": 0.009304705075919628, "grad/layer_24/attn_mlp_ratio": 1.1218158372641072, "grad/layer_27/attn": 0.00604055542498827, "grad/layer_27/mlp": 0.008821806870400906, "grad/layer_27/attn_mlp_ratio": 0.6847299476462879} {"step": 30550, "timestamp": 1778227561.2125816, "train/loss": 2.144673728942871, "train/z_loss": 0.001475385727826506, "train/perplexity": 8.539254671082233, "train/grad_norm": 0.11962890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021405.813633089, "perf/iters_per_sec": 0.9638814037480778, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0374720335006713, "data/tokens_consumed": 64070090752, "data/tokens_consumed_B": 64.070090752, "train/loss_slope": -1.007954080005399e-05} {"step": 30560, "timestamp": 1778227572.2262282, "train/loss": 2.184479236602783, "train/z_loss": 0.00146578288404271, "train/perplexity": 8.88601983436558, "train/grad_norm": 0.171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1905056.509734712, "perf/iters_per_sec": 0.908401732318264, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1008345365524292, "data/tokens_consumed": 64091062272, "data/tokens_consumed_B": 64.091062272, "train/loss_slope": -1.1735673520145157e-05} {"step": 30570, "timestamp": 1778227583.142039, "train/loss": 2.2135243892669676, "train/z_loss": 0.0014591517392545938, "train/perplexity": 9.147900407239778, "train/grad_norm": 0.15625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1921999.0042699566, "perf/iters_per_sec": 0.916480543265322, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0911306381225585, "data/tokens_consumed": 64112033792, "data/tokens_consumed_B": 64.112033792, "train/loss_slope": -8.066398668961625e-06} {"step": 30580, "timestamp": 1778227593.502164, "train/loss": 2.2419084310531616, "train/z_loss": 0.0014530298416502775, "train/perplexity": 9.411274929219902, "train/grad_norm": 0.208984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025391.7851033227, "perf/iters_per_sec": 0.9657820630566228, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354302883148194, "data/tokens_consumed": 64133005312, "data/tokens_consumed_B": 64.133005312, "train/loss_slope": -5.949045128912479e-06} {"step": 30590, "timestamp": 1778227603.8529608, "train/loss": 2.172987866401672, "train/z_loss": 0.001467885437887162, "train/perplexity": 8.78449175644196, "train/grad_norm": 0.1572265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027310.3244821522, "perf/iters_per_sec": 0.9666968939219247, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344504117965698, "data/tokens_consumed": 64153976832, "data/tokens_consumed_B": 64.153976832, "train/loss_slope": -5.475146506521577e-06} {"step": 30600, "timestamp": 1778227614.1995726, "grad/layer_0/attn": 0.0029485567938536406, "grad/layer_0/mlp": 0.0027590859681367874, "grad/layer_0/attn_mlp_ratio": 1.068671553201964, "grad/layer_4/attn": 0.002892423188313842, "grad/layer_4/mlp": 0.002626546425744891, "grad/layer_4/attn_mlp_ratio": 1.1012267096595778, "grad/layer_8/attn": 0.008293228223919868, "grad/layer_8/mlp": 0.0036490370985120535, "grad/layer_8/attn_mlp_ratio": 2.2727168216595737, "grad/layer_12/attn": 0.004577583633363247, "grad/layer_12/mlp": 0.006350312847644091, "grad/layer_12/attn_mlp_ratio": 0.7208437869288772, "grad/layer_16/attn": 0.0037421896122395992, "grad/layer_16/mlp": 0.004791611339896917, "grad/layer_16/attn_mlp_ratio": 0.7809877030262925, "grad/layer_20/attn": 0.003558097407221794, "grad/layer_20/mlp": 0.006187673658132553, "grad/layer_20/attn_mlp_ratio": 0.5750298975516176, "grad/layer_24/attn": 0.014368237927556038, "grad/layer_24/mlp": 0.011089758016169071, "grad/layer_24/attn_mlp_ratio": 1.2956313182887986, "grad/layer_27/attn": 0.00874567124992609, "grad/layer_27/mlp": 0.008630499243736267, "grad/layer_27/attn_mlp_ratio": 1.0133447558018078} {"step": 30600, "timestamp": 1778227614.8293805, "eos/sharpness": 65.93096256256102, "eos/L0_probe": 2.0199477672576904, "eos/L_plus": 2.4622738361358643, "eos/L_minus": 2.236931324005127, "eos/grad_norm": 0.17686881124973297, "eos/embed_grad_frac": 0.08360965549945831, "eos/time_s": 0.6269590854644775} {"step": 30600, "timestamp": 1778227614.8493924, "train/loss": 2.18700795173645, "train/z_loss": 0.0014625722775235771, "train/perplexity": 8.908518481544832, "train/grad_norm": 0.1767578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1908180.3608379497, "perf/iters_per_sec": 0.9098913006009816, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0990323781967164, "data/tokens_consumed": 64174948352, "data/tokens_consumed_B": 64.174948352, "train/loss_slope": -8.139360677076521e-06} {"step": 30600, "timestamp": 1778227616.2155783, "geo/rankme_last": 439.9996337890625, "geo/layer_0/stable_rank_q_proj": 18.640043258666992, "geo/layer_0/stable_rank_k_proj": 16.134967803955078, "geo/layer_0/stable_rank_o_proj": 50.597713470458984, "geo/layer_0/stable_rank_gate_proj": 143.0184783935547, "geo/layer_0/stable_rank_down_proj": 51.810001373291016, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05772877484560013, "geo/layer_0/attn_entropy_mean": 6.221884250640869, "geo/layer_0/attn_entropy_std": 0.3452285826206207, "geo/layer_7/stable_rank_q_proj": 42.21520233154297, "geo/layer_7/stable_rank_k_proj": 42.572566986083984, "geo/layer_7/stable_rank_o_proj": 105.29747009277344, "geo/layer_7/stable_rank_gate_proj": 95.71881103515625, "geo/layer_7/stable_rank_down_proj": 148.24046325683594, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5361112952232361, "geo/layer_7/attn_entropy_mean": 4.633714199066162, "geo/layer_7/attn_entropy_std": 0.8244494199752808, "geo/layer_14/stable_rank_q_proj": 54.940860748291016, "geo/layer_14/stable_rank_k_proj": 36.21283721923828, "geo/layer_14/stable_rank_o_proj": 52.32149887084961, "geo/layer_14/stable_rank_gate_proj": 80.03558349609375, "geo/layer_14/stable_rank_down_proj": 134.55572509765625, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37395811080932617, "geo/layer_14/attn_entropy_mean": 5.4719648361206055, "geo/layer_14/attn_entropy_std": 0.38537928462028503, "geo/layer_21/stable_rank_q_proj": 44.903465270996094, "geo/layer_21/stable_rank_k_proj": 30.976789474487305, "geo/layer_21/stable_rank_o_proj": 80.06394958496094, "geo/layer_21/stable_rank_gate_proj": 77.83567810058594, "geo/layer_21/stable_rank_down_proj": 57.73351287841797, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15018287301063538, "geo/layer_21/attn_entropy_mean": 5.735989570617676, "geo/layer_21/attn_entropy_std": 0.28780996799468994, "geo/layer_27/stable_rank_q_proj": 41.610958099365234, "geo/layer_27/stable_rank_k_proj": 31.263078689575195, "geo/layer_27/stable_rank_o_proj": 118.85132598876953, "geo/layer_27/stable_rank_gate_proj": 88.39139556884766, "geo/layer_27/stable_rank_down_proj": 134.9647216796875, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08548164367675781, "geo/layer_27/attn_entropy_mean": 4.3240556716918945, "geo/layer_27/attn_entropy_std": 0.6225568056106567, "attnres/final_alpha/block_0": 0.2398281693458557, "attnres/block_norm/0": 1.6862716674804688, "attnres/final_alpha/block_1": 0.005838099401444197, "attnres/block_norm/1": 36020.15625, "attnres/final_alpha/block_2": 0.012323768809437752, "attnres/block_norm/2": 24540.26171875, "attnres/final_alpha/block_3": 0.01405867375433445, "attnres/block_norm/3": 39904.4765625, "attnres/final_alpha/block_4": 0.01786862313747406, "attnres/block_norm/4": 11456.09765625, "attnres/final_alpha/block_5": 0.5873589515686035, "attnres/block_norm/5": 5614.021484375, "attnres/final_alpha/block_6": 0.12272372096776962, "attnres/block_norm/6": 26596.6015625, "geo/tier1_time_s": 1.3618924617767334, "geo/step": 30600.0, "geo/rankme_slope": -0.00013858283547794117} {"step": 30610, "timestamp": 1778227626.5719712, "train/loss": 2.1750108003616333, "train/z_loss": 0.0014644315233454108, "train/perplexity": 8.80228018948347, "train/grad_norm": 0.11181640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1789570.944569745, "perf/iters_per_sec": 0.8533339236115194, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.171874189376831, "data/tokens_consumed": 64195919872, "data/tokens_consumed_B": 64.195919872, "train/loss_slope": -9.461479998192899e-06} {"step": 30620, "timestamp": 1778227637.4202209, "train/loss": 2.190091276168823, "train/z_loss": 0.0014467017026618123, "train/perplexity": 8.936028723941755, "train/grad_norm": 0.28515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1933989.0992873032, "perf/iters_per_sec": 0.922197866099979, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0843659877777099, "data/tokens_consumed": 64216891392, "data/tokens_consumed_B": 64.216891392, "train/loss_slope": -8.31667844719124e-06} {"step": 30630, "timestamp": 1778227648.1786585, "train/loss": 2.2062137365341186, "train/z_loss": 0.001454744371585548, "train/perplexity": 9.081267147211213, "train/grad_norm": 0.10888671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1950406.8886259154, "perf/iters_per_sec": 0.9300264781121804, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0752382040023805, "data/tokens_consumed": 64237862912, "data/tokens_consumed_B": 64.237862912, "train/loss_slope": -7.00084979992679e-06} {"step": 30640, "timestamp": 1778227658.5333648, "train/loss": 2.166382384300232, "train/z_loss": 0.0014703431865200401, "train/perplexity": 8.726657176304785, "train/grad_norm": 0.12353515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026479.0513579606, "perf/iters_per_sec": 0.9663005120076945, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348747491836547, "data/tokens_consumed": 64258834432, "data/tokens_consumed_B": 64.258834432, "train/loss_slope": -8.52210945171747e-06} {"step": 30650, "timestamp": 1778227668.8881724, "grad/layer_0/attn": 0.0027123023755848408, "grad/layer_0/mlp": 0.002691598841920495, "grad/layer_0/attn_mlp_ratio": 1.007691871675939, "grad/layer_4/attn": 0.0019123932579532266, "grad/layer_4/mlp": 0.0025752796791493893, "grad/layer_4/attn_mlp_ratio": 0.7425963087338366, "grad/layer_8/attn": 0.010241846553981304, "grad/layer_8/mlp": 0.0037394091486930847, "grad/layer_8/attn_mlp_ratio": 2.7388942672056573, "grad/layer_12/attn": 0.005237999372184277, "grad/layer_12/mlp": 0.006801318377256393, "grad/layer_12/attn_mlp_ratio": 0.7701446991050551, "grad/layer_16/attn": 0.006669418420642614, "grad/layer_16/mlp": 0.004331890493631363, "grad/layer_16/attn_mlp_ratio": 1.5396091559763367, "grad/layer_20/attn": 0.0036754952743649483, "grad/layer_20/mlp": 0.005421825684607029, "grad/layer_20/attn_mlp_ratio": 0.6779072991979853, "grad/layer_24/attn": 0.008272075094282627, "grad/layer_24/mlp": 0.009920243173837662, "grad/layer_24/attn_mlp_ratio": 0.8338580885509436, "grad/layer_27/attn": 0.006265711504966021, "grad/layer_27/mlp": 0.008682969957590103, "grad/layer_27/attn_mlp_ratio": 0.721609249301618} {"step": 30650, "timestamp": 1778227668.9044518, "train/loss": 2.1939204931259155, "train/z_loss": 0.0014634793624281883, "train/perplexity": 8.970312314422143, "train/grad_norm": 0.1376953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023127.865712835, "perf/iters_per_sec": 0.9647025421680617, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0365889549255372, "data/tokens_consumed": 64279805952, "data/tokens_consumed_B": 64.279805952, "train/loss_slope": -6.481588884691901e-06} {"step": 30660, "timestamp": 1778227679.2617571, "train/loss": 2.168625867366791, "train/z_loss": 0.0014761145343072713, "train/perplexity": 8.74625726191662, "train/grad_norm": 0.1142578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025951.114152616, "perf/iters_per_sec": 0.9660487719309883, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351444244384767, "data/tokens_consumed": 64300777472, "data/tokens_consumed_B": 64.300777472, "train/loss_slope": -8.715061335960008e-06} {"step": 30670, "timestamp": 1778227689.6166167, "train/loss": 2.1540109872817994, "train/z_loss": 0.0014807409839704632, "train/perplexity": 8.619361304128565, "train/grad_norm": 0.330078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026277.290951199, "perf/iters_per_sec": 0.9662043051486964, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349777936935425, "data/tokens_consumed": 64321748992, "data/tokens_consumed_B": 64.321748992, "train/loss_slope": -1.1683492212727653e-05} {"step": 30675, "timestamp": 1778227695.4146085, "eos/sharpness": 24.470043182373043, "eos/L0_probe": 2.017744541168213, "eos/L_plus": 2.1157116889953613, "eos/L_minus": 2.164477825164795, "eos/grad_norm": 0.09911602735519409, "eos/embed_grad_frac": 0.23555690050125122, "eos/time_s": 0.6260843276977539} {"step": 30675, "timestamp": 1778227696.8040435, "geo/rankme_last": 440.00299072265625, "geo/layer_0/stable_rank_q_proj": 18.635461807250977, "geo/layer_0/stable_rank_k_proj": 16.15994644165039, "geo/layer_0/stable_rank_o_proj": 50.68627166748047, "geo/layer_0/stable_rank_gate_proj": 143.50457763671875, "geo/layer_0/stable_rank_down_proj": 51.88697814941406, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.052151914685964584, "geo/layer_0/attn_entropy_mean": 6.223025321960449, "geo/layer_0/attn_entropy_std": 0.34741950035095215, "geo/layer_7/stable_rank_q_proj": 42.20054626464844, "geo/layer_7/stable_rank_k_proj": 42.543922424316406, "geo/layer_7/stable_rank_o_proj": 105.65737915039062, "geo/layer_7/stable_rank_gate_proj": 95.66122436523438, "geo/layer_7/stable_rank_down_proj": 148.3875732421875, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5305685997009277, "geo/layer_7/attn_entropy_mean": 4.635418891906738, "geo/layer_7/attn_entropy_std": 0.8186837434768677, "geo/layer_14/stable_rank_q_proj": 54.949100494384766, "geo/layer_14/stable_rank_k_proj": 36.029483795166016, "geo/layer_14/stable_rank_o_proj": 52.30344009399414, "geo/layer_14/stable_rank_gate_proj": 79.96244049072266, "geo/layer_14/stable_rank_down_proj": 134.9303436279297, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3957526683807373, "geo/layer_14/attn_entropy_mean": 5.501080513000488, "geo/layer_14/attn_entropy_std": 0.3871590793132782, "geo/layer_21/stable_rank_q_proj": 44.93204116821289, "geo/layer_21/stable_rank_k_proj": 30.968454360961914, "geo/layer_21/stable_rank_o_proj": 79.86244201660156, "geo/layer_21/stable_rank_gate_proj": 77.8412094116211, "geo/layer_21/stable_rank_down_proj": 57.63719177246094, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14304204285144806, "geo/layer_21/attn_entropy_mean": 5.737282752990723, "geo/layer_21/attn_entropy_std": 0.28821948170661926, "geo/layer_27/stable_rank_q_proj": 41.6208610534668, "geo/layer_27/stable_rank_k_proj": 31.299291610717773, "geo/layer_27/stable_rank_o_proj": 118.74077606201172, "geo/layer_27/stable_rank_gate_proj": 88.47496032714844, "geo/layer_27/stable_rank_down_proj": 135.07652282714844, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0751807689666748, "geo/layer_27/attn_entropy_mean": 4.333743095397949, "geo/layer_27/attn_entropy_std": 0.5856099724769592, "attnres/final_alpha/block_0": 0.2404024600982666, "attnres/block_norm/0": 1.6865674257278442, "attnres/final_alpha/block_1": 0.005767395719885826, "attnres/block_norm/1": 36001.609375, "attnres/final_alpha/block_2": 0.012224774807691574, "attnres/block_norm/2": 24450.94140625, "attnres/final_alpha/block_3": 0.014247545972466469, "attnres/block_norm/3": 40133.98046875, "attnres/final_alpha/block_4": 0.017995648086071014, "attnres/block_norm/4": 11425.7275390625, "attnres/final_alpha/block_5": 0.5869014263153076, "attnres/block_norm/5": 5553.2197265625, "attnres/final_alpha/block_6": 0.12246073782444, "attnres/block_norm/6": 26406.435546875, "geo/tier1_time_s": 1.367142677307129, "geo/step": 30675.0, "geo/rankme_slope": -0.00014258054784413766} {"step": 30680, "timestamp": 1778227701.9851627, "train/loss": 2.2110722780227663, "train/z_loss": 0.0014565516728907823, "train/perplexity": 9.125496217798856, "train/grad_norm": 0.09912109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1696207.3509810052, "perf/iters_per_sec": 0.8088146929650332, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2363771438598632, "data/tokens_consumed": 64342720512, "data/tokens_consumed_B": 64.342720512, "train/loss_slope": -1.2229422789023483e-05} {"step": 30690, "timestamp": 1778227712.3377123, "train/loss": 2.2376538276672364, "train/z_loss": 0.001461809070315212, "train/perplexity": 9.371318746168738, "train/grad_norm": 0.2421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026914.593047277, "perf/iters_per_sec": 0.9665081944691072, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346523761749267, "data/tokens_consumed": 64363692032, "data/tokens_consumed_B": 64.363692032, "train/loss_slope": -7.994965916097721e-06} {"step": 30700, "timestamp": 1778227722.6763248, "grad/layer_0/attn": 0.0031451391987502575, "grad/layer_0/mlp": 0.003079021582379937, "grad/layer_0/attn_mlp_ratio": 1.021473546857005, "grad/layer_4/attn": 0.002597728744149208, "grad/layer_4/mlp": 0.00276753562502563, "grad/layer_4/attn_mlp_ratio": 0.9386432560414918, "grad/layer_8/attn": 0.005789659917354584, "grad/layer_8/mlp": 0.0038579010870307684, "grad/layer_8/attn_mlp_ratio": 1.5007278923622698, "grad/layer_12/attn": 0.004403560422360897, "grad/layer_12/mlp": 0.006387500092387199, "grad/layer_12/attn_mlp_ratio": 0.6894027850847164, "grad/layer_16/attn": 0.005627393256872892, "grad/layer_16/mlp": 0.0047352802939713, "grad/layer_16/attn_mlp_ratio": 1.1883970512152526, "grad/layer_20/attn": 0.004620017018169165, "grad/layer_20/mlp": 0.006807243451476097, "grad/layer_20/attn_mlp_ratio": 0.6786913062876017, "grad/layer_24/attn": 0.015680495649576187, "grad/layer_24/mlp": 0.010605867952108383, "grad/layer_24/attn_mlp_ratio": 1.4784735744905857, "grad/layer_27/attn": 0.010027838870882988, "grad/layer_27/mlp": 0.008873218670487404, "grad/layer_27/attn_mlp_ratio": 1.1301241556486679} {"step": 30700, "timestamp": 1778227722.6923764, "train/loss": 2.175221490859985, "train/z_loss": 0.0014521711505949496, "train/perplexity": 8.804134941665694, "train/grad_norm": 0.166015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026123.920171101, "perf/iters_per_sec": 0.9661311722617631, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350561380386352, "data/tokens_consumed": 64384663552, "data/tokens_consumed_B": 64.384663552, "train/loss_slope": -1.1768160251179141e-05} {"step": 30710, "timestamp": 1778227733.0570505, "train/loss": 2.1846729278564454, "train/z_loss": 0.0014665948692709208, "train/perplexity": 8.88774114538343, "train/grad_norm": 0.2177734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024271.3044898282, "perf/iters_per_sec": 0.9652477762650624, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0360034227371215, "data/tokens_consumed": 64405635072, "data/tokens_consumed_B": 64.405635072, "train/loss_slope": -1.5463876788622667e-05} {"step": 30720, "timestamp": 1778227743.410884, "train/loss": 2.203054690361023, "train/z_loss": 0.0014692505006678402, "train/perplexity": 9.052624270887675, "train/grad_norm": 0.1748046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026478.6311765418, "perf/iters_per_sec": 0.9663003116495809, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034874963760376, "data/tokens_consumed": 64426606592, "data/tokens_consumed_B": 64.426606592, "train/loss_slope": -1.4190467077084205e-05} {"step": 30730, "timestamp": 1778227753.7586055, "train/loss": 2.1597563862800597, "train/z_loss": 0.0014729685964994133, "train/perplexity": 8.669025507647268, "train/grad_norm": 0.10009765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027472.4269680853, "perf/iters_per_sec": 0.9667741904106547, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343677043914794, "data/tokens_consumed": 64447578112, "data/tokens_consumed_B": 64.447578112, "train/loss_slope": -1.5654896874346304e-05} {"step": 30740, "timestamp": 1778227764.1168997, "train/loss": 2.1530803203582765, "train/z_loss": 0.0014734096475876868, "train/perplexity": 8.611343281293971, "train/grad_norm": 0.1484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025663.0607684662, "perf/iters_per_sec": 0.9659114173738795, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352916240692138, "data/tokens_consumed": 64468549632, "data/tokens_consumed_B": 64.468549632, "train/loss_slope": -1.5943547494054585e-05} {"step": 30750, "timestamp": 1778227774.4562309, "grad/layer_0/attn": 0.0025870096869766712, "grad/layer_0/mlp": 0.002615459728986025, "grad/layer_0/attn_mlp_ratio": 0.9891223173477746, "grad/layer_4/attn": 0.0022811281960457563, "grad/layer_4/mlp": 0.0025436854921281338, "grad/layer_4/attn_mlp_ratio": 0.8967807197183072, "grad/layer_8/attn": 0.00591745600104332, "grad/layer_8/mlp": 0.0036798580549657345, "grad/layer_8/attn_mlp_ratio": 1.608066330779106, "grad/layer_12/attn": 0.007075143046677113, "grad/layer_12/mlp": 0.006127679254859686, "grad/layer_12/attn_mlp_ratio": 1.1546203116953273, "grad/layer_16/attn": 0.008496852591633797, "grad/layer_16/mlp": 0.004443975165486336, "grad/layer_16/attn_mlp_ratio": 1.9119936732374965, "grad/layer_20/attn": 0.006114579737186432, "grad/layer_20/mlp": 0.005366950761526823, "grad/layer_20/attn_mlp_ratio": 1.1393023515491847, "grad/layer_24/attn": 0.00685089360922575, "grad/layer_24/mlp": 0.008511276915669441, "grad/layer_24/attn_mlp_ratio": 0.80491959039908, "grad/layer_27/attn": 0.005087587516754866, "grad/layer_27/mlp": 0.007641438394784927, "grad/layer_27/attn_mlp_ratio": 0.6657892385349964} {"step": 30750, "timestamp": 1778227775.0817564, "eos/sharpness": 29.054927825927727, "eos/L0_probe": 2.017392158508301, "eos/L_plus": 2.149742364883423, "eos/L_minus": 2.175591230392456, "eos/grad_norm": 0.11546038091182709, "eos/embed_grad_frac": 0.21795383095741272, "eos/time_s": 0.622650146484375} {"step": 30750, "timestamp": 1778227775.101186, "train/loss": 2.2069474935531614, "train/z_loss": 0.0014652069658041, "train/perplexity": 9.087933035994547, "train/grad_norm": 0.11572265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1910186.6951491977, "perf/iters_per_sec": 0.9108479953523625, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0978780269622803, "data/tokens_consumed": 64489521152, "data/tokens_consumed_B": 64.489521152, "train/loss_slope": -1.0467170774847332e-05} {"step": 30750, "timestamp": 1778227776.4676023, "geo/rankme_last": 438.891357421875, "geo/layer_0/stable_rank_q_proj": 18.618087768554688, "geo/layer_0/stable_rank_k_proj": 16.12671661376953, "geo/layer_0/stable_rank_o_proj": 50.65449905395508, "geo/layer_0/stable_rank_gate_proj": 143.3064422607422, "geo/layer_0/stable_rank_down_proj": 51.968780517578125, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05508628115057945, "geo/layer_0/attn_entropy_mean": 6.2138237953186035, "geo/layer_0/attn_entropy_std": 0.3536902666091919, "geo/layer_7/stable_rank_q_proj": 42.27133560180664, "geo/layer_7/stable_rank_k_proj": 42.44085693359375, "geo/layer_7/stable_rank_o_proj": 105.54154205322266, "geo/layer_7/stable_rank_gate_proj": 95.45457458496094, "geo/layer_7/stable_rank_down_proj": 147.71786499023438, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5301461219787598, "geo/layer_7/attn_entropy_mean": 4.649852752685547, "geo/layer_7/attn_entropy_std": 0.8357181549072266, "geo/layer_14/stable_rank_q_proj": 54.90345001220703, "geo/layer_14/stable_rank_k_proj": 36.04609298706055, "geo/layer_14/stable_rank_o_proj": 52.27622604370117, "geo/layer_14/stable_rank_gate_proj": 79.90019989013672, "geo/layer_14/stable_rank_down_proj": 135.37510681152344, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.36931902170181274, "geo/layer_14/attn_entropy_mean": 5.522088527679443, "geo/layer_14/attn_entropy_std": 0.3818967044353485, "geo/layer_21/stable_rank_q_proj": 44.84050369262695, "geo/layer_21/stable_rank_k_proj": 30.99239730834961, "geo/layer_21/stable_rank_o_proj": 79.76058197021484, "geo/layer_21/stable_rank_gate_proj": 77.85163116455078, "geo/layer_21/stable_rank_down_proj": 57.755069732666016, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14602030813694, "geo/layer_21/attn_entropy_mean": 5.761151313781738, "geo/layer_21/attn_entropy_std": 0.271958589553833, "geo/layer_27/stable_rank_q_proj": 41.665557861328125, "geo/layer_27/stable_rank_k_proj": 31.358396530151367, "geo/layer_27/stable_rank_o_proj": 118.9859848022461, "geo/layer_27/stable_rank_gate_proj": 88.47431182861328, "geo/layer_27/stable_rank_down_proj": 135.07220458984375, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07956580817699432, "geo/layer_27/attn_entropy_mean": 4.342324256896973, "geo/layer_27/attn_entropy_std": 0.6143599152565002, "attnres/final_alpha/block_0": 0.24067384004592896, "attnres/block_norm/0": 1.6868479251861572, "attnres/final_alpha/block_1": 0.005877100862562656, "attnres/block_norm/1": 36040.38671875, "attnres/final_alpha/block_2": 0.012274296954274178, "attnres/block_norm/2": 24517.8671875, "attnres/final_alpha/block_3": 0.014141029678285122, "attnres/block_norm/3": 40229.265625, "attnres/final_alpha/block_4": 0.017830740660429, "attnres/block_norm/4": 11495.1171875, "attnres/final_alpha/block_5": 0.5851578712463379, "attnres/block_norm/5": 5620.552734375, "attnres/final_alpha/block_6": 0.12404510378837585, "attnres/block_norm/6": 26520.55078125, "geo/tier1_time_s": 1.361889123916626, "geo/step": 30750.0, "geo/rankme_slope": -0.00016318257771858743} {"step": 30760, "timestamp": 1778227786.8202114, "train/loss": 2.0982631921768187, "train/z_loss": 0.0014656454673968255, "train/perplexity": 8.151999154282176, "train/grad_norm": 0.2734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1790140.9623835243, "perf/iters_per_sec": 0.853605729285967, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1715010404586792, "data/tokens_consumed": 64510492672, "data/tokens_consumed_B": 64.510492672, "train/loss_slope": -1.3716108048602479e-05} {"step": 30770, "timestamp": 1778227797.1747305, "train/loss": 2.1437079668045045, "train/z_loss": 0.0014773882692679762, "train/perplexity": 8.531011763215874, "train/grad_norm": 0.126953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026664.7419103384, "perf/iters_per_sec": 0.9663890561629955, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034779930114746, "data/tokens_consumed": 64531464192, "data/tokens_consumed_B": 64.531464192, "train/loss_slope": -1.8359782178588715e-05} {"step": 30780, "timestamp": 1778227807.5282564, "train/loss": 2.2081903219223022, "train/z_loss": 0.0014583428273908794, "train/perplexity": 9.099234798609386, "train/grad_norm": 0.1845703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026524.758799335, "perf/iters_per_sec": 0.9663223070141482, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348514080047608, "data/tokens_consumed": 64552435712, "data/tokens_consumed_B": 64.552435712, "train/loss_slope": -1.4380562917055311e-05} {"step": 30790, "timestamp": 1778227817.886444, "train/loss": 2.2113717794418335, "train/z_loss": 0.0014410478295758366, "train/perplexity": 9.128229726190169, "train/grad_norm": 0.1845703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025806.6106621663, "perf/iters_per_sec": 0.9659798672972518, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352182626724242, "data/tokens_consumed": 64573407232, "data/tokens_consumed_B": 64.573407232, "train/loss_slope": -9.51410023757661e-06} {"step": 30800, "timestamp": 1778227828.7924092, "grad/layer_0/attn": 0.002607849659398198, "grad/layer_0/mlp": 0.002740326104685664, "grad/layer_0/attn_mlp_ratio": 0.951656651291753, "grad/layer_4/attn": 0.002239222638309002, "grad/layer_4/mlp": 0.002536489861086011, "grad/layer_4/attn_mlp_ratio": 0.8828036667451534, "grad/layer_8/attn": 0.006029020994901657, "grad/layer_8/mlp": 0.003788192057982087, "grad/layer_8/attn_mlp_ratio": 1.591529875853295, "grad/layer_12/attn": 0.0047449045814573765, "grad/layer_12/mlp": 0.006581354420632124, "grad/layer_12/attn_mlp_ratio": 0.7209617057677725, "grad/layer_16/attn": 0.0038424073718488216, "grad/layer_16/mlp": 0.004889020696282387, "grad/layer_16/attn_mlp_ratio": 0.7859257573153687, "grad/layer_20/attn": 0.003952103666961193, "grad/layer_20/mlp": 0.006426216568797827, "grad/layer_20/attn_mlp_ratio": 0.6149969524293252, "grad/layer_24/attn": 0.01764531061053276, "grad/layer_24/mlp": 0.01185144204646349, "grad/layer_24/attn_mlp_ratio": 1.4888745515074873, "grad/layer_27/attn": 0.007577064912766218, "grad/layer_27/mlp": 0.01124129444360733, "grad/layer_27/attn_mlp_ratio": 0.6740384644644974} {"step": 30800, "timestamp": 1778227828.8086827, "train/loss": 2.2293107748031615, "train/z_loss": 0.0014519866905175149, "train/perplexity": 9.293458585811072, "train/grad_norm": 0.1943359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1921075.0564360062, "perf/iters_per_sec": 0.9160399706058532, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.091655421257019, "data/tokens_consumed": 64594378752, "data/tokens_consumed_B": 64.594378752, "train/loss_slope": -5.9053731615609185e-06} {"step": 30810, "timestamp": 1778227839.1665237, "train/loss": 2.155313563346863, "train/z_loss": 0.0014696231693960727, "train/perplexity": 8.630595993289564, "train/grad_norm": 0.10400390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025769.893169635, "perf/iters_per_sec": 0.9659623590324569, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352370262145996, "data/tokens_consumed": 64615350272, "data/tokens_consumed_B": 64.615350272, "train/loss_slope": -8.71643400607158e-06} {"step": 30820, "timestamp": 1778227849.5233245, "train/loss": 2.1762651205062866, "train/z_loss": 0.0014655424631200731, "train/perplexity": 8.813327994137511, "train/grad_norm": 0.11376953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026023.3969617572, "perf/iters_per_sec": 0.9660832390602861, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351074934005737, "data/tokens_consumed": 64636321792, "data/tokens_consumed_B": 64.636321792, "train/loss_slope": -1.0646644641499753e-05} {"step": 30825, "timestamp": 1778227855.3098528, "eos/sharpness": 21.45721912384033, "eos/L0_probe": 2.0180132389068604, "eos/L_plus": 2.135161876678467, "eos/L_minus": 2.1154367923736572, "eos/grad_norm": 0.1256726086139679, "eos/embed_grad_frac": 0.15344291925430298, "eos/time_s": 0.6211822032928467} {"step": 30825, "timestamp": 1778227856.6937735, "geo/rankme_last": 439.921142578125, "geo/layer_0/stable_rank_q_proj": 18.650081634521484, "geo/layer_0/stable_rank_k_proj": 16.15342903137207, "geo/layer_0/stable_rank_o_proj": 50.69549560546875, "geo/layer_0/stable_rank_gate_proj": 143.28529357910156, "geo/layer_0/stable_rank_down_proj": 52.01810836791992, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05397968366742134, "geo/layer_0/attn_entropy_mean": 6.222873687744141, "geo/layer_0/attn_entropy_std": 0.3536737561225891, "geo/layer_7/stable_rank_q_proj": 42.26028060913086, "geo/layer_7/stable_rank_k_proj": 42.42838668823242, "geo/layer_7/stable_rank_o_proj": 105.78572082519531, "geo/layer_7/stable_rank_gate_proj": 95.39524841308594, "geo/layer_7/stable_rank_down_proj": 148.0144500732422, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5210654139518738, "geo/layer_7/attn_entropy_mean": 4.628699779510498, "geo/layer_7/attn_entropy_std": 0.8014273047447205, "geo/layer_14/stable_rank_q_proj": 54.8563117980957, "geo/layer_14/stable_rank_k_proj": 35.9434814453125, "geo/layer_14/stable_rank_o_proj": 52.236732482910156, "geo/layer_14/stable_rank_gate_proj": 79.93509674072266, "geo/layer_14/stable_rank_down_proj": 135.0892333984375, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38122186064720154, "geo/layer_14/attn_entropy_mean": 5.475772380828857, "geo/layer_14/attn_entropy_std": 0.38462650775909424, "geo/layer_21/stable_rank_q_proj": 44.92363739013672, "geo/layer_21/stable_rank_k_proj": 30.9571475982666, "geo/layer_21/stable_rank_o_proj": 79.8708267211914, "geo/layer_21/stable_rank_gate_proj": 77.89781188964844, "geo/layer_21/stable_rank_down_proj": 57.80673599243164, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14594177901744843, "geo/layer_21/attn_entropy_mean": 5.728126525878906, "geo/layer_21/attn_entropy_std": 0.2863060534000397, "geo/layer_27/stable_rank_q_proj": 41.57854461669922, "geo/layer_27/stable_rank_k_proj": 31.26127052307129, "geo/layer_27/stable_rank_o_proj": 119.08930206298828, "geo/layer_27/stable_rank_gate_proj": 88.48106384277344, "geo/layer_27/stable_rank_down_proj": 135.16876220703125, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08282072842121124, "geo/layer_27/attn_entropy_mean": 4.3293256759643555, "geo/layer_27/attn_entropy_std": 0.6101406812667847, "attnres/final_alpha/block_0": 0.24096164107322693, "attnres/block_norm/0": 1.6870497465133667, "attnres/final_alpha/block_1": 0.005728497169911861, "attnres/block_norm/1": 36091.875, "attnres/final_alpha/block_2": 0.012313233688473701, "attnres/block_norm/2": 24487.93359375, "attnres/final_alpha/block_3": 0.014199711382389069, "attnres/block_norm/3": 40263.25, "attnres/final_alpha/block_4": 0.018119648098945618, "attnres/block_norm/4": 11441.9765625, "attnres/final_alpha/block_5": 0.5841504335403442, "attnres/block_norm/5": 5645.60400390625, "attnres/final_alpha/block_6": 0.12452682107686996, "attnres/block_norm/6": 26381.765625, "geo/tier1_time_s": 1.36421799659729, "geo/step": 30825.0, "geo/rankme_slope": -0.0001695770886479592} {"step": 30830, "timestamp": 1778227861.879364, "train/loss": 2.20824556350708, "train/z_loss": 0.0014527768013067543, "train/perplexity": 9.099737468643943, "train/grad_norm": 0.14453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1697960.6662205225, "perf/iters_per_sec": 0.8096507388212788, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2351004600524902, "data/tokens_consumed": 64657293312, "data/tokens_consumed_B": 64.657293312, "train/loss_slope": -8.496448523712924e-06} {"step": 30840, "timestamp": 1778227872.2414436, "train/loss": 2.1598955392837524, "train/z_loss": 0.00147376375971362, "train/perplexity": 8.67023191252123, "train/grad_norm": 0.1982421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024786.854007345, "perf/iters_per_sec": 0.9654936094319081, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0357396364212037, "data/tokens_consumed": 64678264832, "data/tokens_consumed_B": 64.678264832, "train/loss_slope": -6.909492700406908e-06} {"step": 30850, "timestamp": 1778227882.5907989, "grad/layer_0/attn": 0.0031405703630298376, "grad/layer_0/mlp": 0.0029001382645219564, "grad/layer_0/attn_mlp_ratio": 1.0829036302023172, "grad/layer_4/attn": 0.002439856994897127, "grad/layer_4/mlp": 0.0026579545810818672, "grad/layer_4/attn_mlp_ratio": 0.9179452954043716, "grad/layer_8/attn": 0.0045631867833435535, "grad/layer_8/mlp": 0.0036638411693274975, "grad/layer_8/attn_mlp_ratio": 1.2454651956527396, "grad/layer_12/attn": 0.004449024796485901, "grad/layer_12/mlp": 0.006414424162358046, "grad/layer_12/attn_mlp_ratio": 0.6935969020001131, "grad/layer_16/attn": 0.0033606947399675846, "grad/layer_16/mlp": 0.004530676174908876, "grad/layer_16/attn_mlp_ratio": 0.7417644819558813, "grad/layer_20/attn": 0.004518700297921896, "grad/layer_20/mlp": 0.0052850027568638325, "grad/layer_20/attn_mlp_ratio": 0.8550043245583661, "grad/layer_24/attn": 0.006284553091973066, "grad/layer_24/mlp": 0.008144657127559185, "grad/layer_24/attn_mlp_ratio": 0.7716166459047461, "grad/layer_27/attn": 0.006396673619747162, "grad/layer_27/mlp": 0.006750714499503374, "grad/layer_27/attn_mlp_ratio": 0.947555036650156} {"step": 30850, "timestamp": 1778227882.6066651, "train/loss": 2.2207072973251343, "train/z_loss": 0.0014457633369602263, "train/perplexity": 9.21384548999183, "train/grad_norm": 0.11962890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024133.235483159, "perf/iters_per_sec": 0.9651819398322864, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0360740900039673, "data/tokens_consumed": 64699236352, "data/tokens_consumed_B": 64.699236352, "train/loss_slope": -5.6249930699094835e-06} {"step": 30860, "timestamp": 1778227892.9683185, "train/loss": 2.1743138313293455, "train/z_loss": 0.0014694954617880285, "train/perplexity": 8.796147410204757, "train/grad_norm": 0.1484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025060.2054686868, "perf/iters_per_sec": 0.965623953565925, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0355998277664185, "data/tokens_consumed": 64720207872, "data/tokens_consumed_B": 64.720207872, "train/loss_slope": -7.48386072604613e-06} {"step": 30870, "timestamp": 1778227903.3306832, "train/loss": 2.2369045972824098, "train/z_loss": 0.0014553675893694163, "train/perplexity": 9.364300099038381, "train/grad_norm": 0.107421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024627.5107343714, "perf/iters_per_sec": 0.9654176286384446, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0358211517333984, "data/tokens_consumed": 64741179392, "data/tokens_consumed_B": 64.741179392, "train/loss_slope": -6.996545783042701e-07} {"step": 30880, "timestamp": 1778227913.6925807, "train/loss": 2.2169331550598144, "train/z_loss": 0.0014585828059352935, "train/perplexity": 9.179136665522428, "train/grad_norm": 0.14453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024817.942617966, "perf/iters_per_sec": 0.9655084336366492, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0357237339019776, "data/tokens_consumed": 64762150912, "data/tokens_consumed_B": 64.762150912, "train/loss_slope": 1.3353806016968682e-06} {"step": 30890, "timestamp": 1778227924.051411, "train/loss": 2.2288161754608153, "train/z_loss": 0.0014681452768854797, "train/perplexity": 9.288863183841485, "train/grad_norm": 0.1552734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025513.8874745443, "perf/iters_per_sec": 0.965840286004326, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0353678703308105, "data/tokens_consumed": 64783122432, "data/tokens_consumed_B": 64.783122432, "train/loss_slope": 5.181530292349523e-06} {"step": 30900, "timestamp": 1778227934.401778, "grad/layer_0/attn": 0.0027328901924192905, "grad/layer_0/mlp": 0.002758823800832033, "grad/layer_0/attn_mlp_ratio": 0.9905997231628589, "grad/layer_4/attn": 0.0019852141849696636, "grad/layer_4/mlp": 0.002551544923335314, "grad/layer_4/attn_mlp_ratio": 0.7780439564317947, "grad/layer_8/attn": 0.008874636143445969, "grad/layer_8/mlp": 0.003825495019555092, "grad/layer_8/attn_mlp_ratio": 2.319866021546015, "grad/layer_12/attn": 0.004701306112110615, "grad/layer_12/mlp": 0.006517566740512848, "grad/layer_12/attn_mlp_ratio": 0.7213284078480864, "grad/layer_16/attn": 0.0034630000591278076, "grad/layer_16/mlp": 0.00441137095913291, "grad/layer_16/attn_mlp_ratio": 0.7850167244395189, "grad/layer_20/attn": 0.0036139150615781546, "grad/layer_20/mlp": 0.005828924011439085, "grad/layer_20/attn_mlp_ratio": 0.6199969312494491, "grad/layer_24/attn": 0.014582780189812183, "grad/layer_24/mlp": 0.009611680172383785, "grad/layer_24/attn_mlp_ratio": 1.5171936411275904, "grad/layer_27/attn": 0.007749080192297697, "grad/layer_27/mlp": 0.009815621189773083, "grad/layer_27/attn_mlp_ratio": 0.7894640556651754} {"step": 30900, "timestamp": 1778227935.022624, "eos/sharpness": 51.46489143371581, "eos/L0_probe": 2.0174779891967773, "eos/L_plus": 2.2298078536987305, "eos/L_minus": 2.3197970390319824, "eos/grad_norm": 0.1591091752052307, "eos/embed_grad_frac": 0.09733609110116959, "eos/time_s": 0.6179277896881104} {"step": 30900, "timestamp": 1778227935.0420308, "train/loss": 2.2032802581787108, "train/z_loss": 0.0014696873025968672, "train/perplexity": 9.05466648190869, "train/grad_norm": 0.1591796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1908931.10802296, "perf/iters_per_sec": 0.9102492847552108, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.098600149154663, "data/tokens_consumed": 64804093952, "data/tokens_consumed_B": 64.804093952, "train/loss_slope": 6.6309919165591235e-06} {"step": 30900, "timestamp": 1778227936.4078438, "geo/rankme_last": 439.58148193359375, "geo/layer_0/stable_rank_q_proj": 18.65407943725586, "geo/layer_0/stable_rank_k_proj": 16.16225814819336, "geo/layer_0/stable_rank_o_proj": 50.6966667175293, "geo/layer_0/stable_rank_gate_proj": 143.3237762451172, "geo/layer_0/stable_rank_down_proj": 52.00287628173828, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05575575307011604, "geo/layer_0/attn_entropy_mean": 6.223296165466309, "geo/layer_0/attn_entropy_std": 0.3581705391407013, "geo/layer_7/stable_rank_q_proj": 42.34345245361328, "geo/layer_7/stable_rank_k_proj": 42.39668273925781, "geo/layer_7/stable_rank_o_proj": 105.63079833984375, "geo/layer_7/stable_rank_gate_proj": 95.30892944335938, "geo/layer_7/stable_rank_down_proj": 148.00437927246094, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5094043016433716, "geo/layer_7/attn_entropy_mean": 4.646002769470215, "geo/layer_7/attn_entropy_std": 0.818915605545044, "geo/layer_14/stable_rank_q_proj": 54.8177375793457, "geo/layer_14/stable_rank_k_proj": 36.00117874145508, "geo/layer_14/stable_rank_o_proj": 52.154380798339844, "geo/layer_14/stable_rank_gate_proj": 79.94437408447266, "geo/layer_14/stable_rank_down_proj": 135.21156311035156, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3798934519290924, "geo/layer_14/attn_entropy_mean": 5.514366149902344, "geo/layer_14/attn_entropy_std": 0.3766292929649353, "geo/layer_21/stable_rank_q_proj": 44.87369155883789, "geo/layer_21/stable_rank_k_proj": 30.906320571899414, "geo/layer_21/stable_rank_o_proj": 79.80782318115234, "geo/layer_21/stable_rank_gate_proj": 77.91007232666016, "geo/layer_21/stable_rank_down_proj": 57.7166633605957, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14888793230056763, "geo/layer_21/attn_entropy_mean": 5.748892784118652, "geo/layer_21/attn_entropy_std": 0.27675506472587585, "geo/layer_27/stable_rank_q_proj": 41.55111312866211, "geo/layer_27/stable_rank_k_proj": 31.197418212890625, "geo/layer_27/stable_rank_o_proj": 118.79742431640625, "geo/layer_27/stable_rank_gate_proj": 88.41378784179688, "geo/layer_27/stable_rank_down_proj": 135.5061798095703, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07933181524276733, "geo/layer_27/attn_entropy_mean": 4.337381362915039, "geo/layer_27/attn_entropy_std": 0.6178697347640991, "attnres/final_alpha/block_0": 0.24101950228214264, "attnres/block_norm/0": 1.6873652935028076, "attnres/final_alpha/block_1": 0.005845896899700165, "attnres/block_norm/1": 36076.765625, "attnres/final_alpha/block_2": 0.012548423372209072, "attnres/block_norm/2": 24448.2890625, "attnres/final_alpha/block_3": 0.014057561755180359, "attnres/block_norm/3": 40236.98046875, "attnres/final_alpha/block_4": 0.018107831478118896, "attnres/block_norm/4": 11499.265625, "attnres/final_alpha/block_5": 0.5835099816322327, "attnres/block_norm/5": 5692.7314453125, "attnres/final_alpha/block_6": 0.12491083890199661, "attnres/block_norm/6": 26646.91796875, "geo/tier1_time_s": 1.3622045516967773, "geo/step": 30900.0, "geo/rankme_slope": -0.00017461818711859745} {"step": 30910, "timestamp": 1778227946.7706828, "train/loss": 2.1646868228912353, "train/z_loss": 0.0014592450577765703, "train/perplexity": 8.711873130337924, "train/grad_norm": 0.08837890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1788637.578745533, "perf/iters_per_sec": 0.8528888601043382, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1724857091903687, "data/tokens_consumed": 64825065472, "data/tokens_consumed_B": 64.825065472, "train/loss_slope": 6.223241838649106e-06} {"step": 30920, "timestamp": 1778227957.1320386, "train/loss": 2.155991053581238, "train/z_loss": 0.0014530091895721853, "train/perplexity": 8.636445118930878, "train/grad_norm": 0.251953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025075.7305408765, "perf/iters_per_sec": 0.9656313564972289, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0355918884277344, "data/tokens_consumed": 64846036992, "data/tokens_consumed_B": 64.846036992, "train/loss_slope": 7.935250588733005e-07} {"step": 30930, "timestamp": 1778227967.4915938, "train/loss": 2.189006972312927, "train/z_loss": 0.001465854316484183, "train/perplexity": 8.926344604752689, "train/grad_norm": 0.1533203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025351.6316579191, "perf/iters_per_sec": 0.9657629164018245, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03545081615448, "data/tokens_consumed": 64867008512, "data/tokens_consumed_B": 64.867008512, "train/loss_slope": 3.02597943491087e-06} {"step": 30940, "timestamp": 1778227977.8532996, "train/loss": 2.1511153101921083, "train/z_loss": 0.0014645439689047635, "train/perplexity": 8.594438518656563, "train/grad_norm": 0.1787109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024978.341457602, "perf/iters_per_sec": 0.9655849177635203, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0356416940689086, "data/tokens_consumed": 64887980032, "data/tokens_consumed_B": 64.887980032, "train/loss_slope": -1.447589305153269e-06} {"step": 30950, "timestamp": 1778227988.2062063, "grad/layer_0/attn": 0.0030015273950994015, "grad/layer_0/mlp": 0.002797419670969248, "grad/layer_0/attn_mlp_ratio": 1.07296281603795, "grad/layer_4/attn": 0.002940872684121132, "grad/layer_4/mlp": 0.002489273203536868, "grad/layer_4/attn_mlp_ratio": 1.181418159244552, "grad/layer_8/attn": 0.003432398894801736, "grad/layer_8/mlp": 0.0035232941154390574, "grad/layer_8/attn_mlp_ratio": 0.974201609323735, "grad/layer_12/attn": 0.0041178432293236256, "grad/layer_12/mlp": 0.006222267169505358, "grad/layer_12/attn_mlp_ratio": 0.6617914420849004, "grad/layer_16/attn": 0.003530609654262662, "grad/layer_16/mlp": 0.004273628816008568, "grad/layer_16/attn_mlp_ratio": 0.8261385636542674, "grad/layer_20/attn": 0.004843704868108034, "grad/layer_20/mlp": 0.005998471286147833, "grad/layer_20/attn_mlp_ratio": 0.8074898680509703, "grad/layer_24/attn": 0.011591310612857342, "grad/layer_24/mlp": 0.009909196756780148, "grad/layer_24/attn_mlp_ratio": 1.1697527842457025, "grad/layer_27/attn": 0.004721582867205143, "grad/layer_27/mlp": 0.008185305632650852, "grad/layer_27/attn_mlp_ratio": 0.576836470306899} {"step": 30950, "timestamp": 1778227988.2221622, "train/loss": 2.137923336029053, "train/z_loss": 0.0014674763777293264, "train/perplexity": 8.481805467362792, "train/grad_norm": 0.1318359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023824.232265933, "perf/iters_per_sec": 0.9650345956163087, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0362322807312012, "data/tokens_consumed": 64908951552, "data/tokens_consumed_B": 64.908951552, "train/loss_slope": -4.996425827237031e-06} {"step": 30960, "timestamp": 1778227998.5924265, "train/loss": 2.1597659945487977, "train/z_loss": 0.0014660687185823918, "train/perplexity": 8.669108802374199, "train/grad_norm": 0.177734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023797.69079855, "perf/iters_per_sec": 0.9650219396584272, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03624587059021, "data/tokens_consumed": 64929923072, "data/tokens_consumed_B": 64.929923072, "train/loss_slope": -8.845243881745261e-06} {"step": 30970, "timestamp": 1778228008.9625704, "train/loss": 2.203628182411194, "train/z_loss": 0.0014546833583153785, "train/perplexity": 9.057817367897808, "train/grad_norm": 0.208984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023575.0027193516, "perf/iters_per_sec": 0.9649157537075765, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0363599061965942, "data/tokens_consumed": 64950894592, "data/tokens_consumed_B": 64.950894592, "train/loss_slope": -7.844225143549413e-06} {"step": 30975, "timestamp": 1778228014.7451162, "eos/sharpness": 49.03779029846191, "eos/L0_probe": 2.0132477283477783, "eos/L_plus": 2.2942280769348145, "eos/L_minus": 2.2226452827453613, "eos/grad_norm": 0.18643122911453247, "eos/embed_grad_frac": 0.07812245935201645, "eos/time_s": 0.6127631664276123} {"step": 30975, "timestamp": 1778228016.1260436, "geo/rankme_last": 440.351806640625, "geo/layer_0/stable_rank_q_proj": 18.660335540771484, "geo/layer_0/stable_rank_k_proj": 16.116300582885742, "geo/layer_0/stable_rank_o_proj": 50.64102554321289, "geo/layer_0/stable_rank_gate_proj": 142.88693237304688, "geo/layer_0/stable_rank_down_proj": 52.014366149902344, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05761360377073288, "geo/layer_0/attn_entropy_mean": 6.217959403991699, "geo/layer_0/attn_entropy_std": 0.35680443048477173, "geo/layer_7/stable_rank_q_proj": 42.338661193847656, "geo/layer_7/stable_rank_k_proj": 42.34021759033203, "geo/layer_7/stable_rank_o_proj": 105.64244842529297, "geo/layer_7/stable_rank_gate_proj": 95.26332092285156, "geo/layer_7/stable_rank_down_proj": 147.8550567626953, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5349529385566711, "geo/layer_7/attn_entropy_mean": 4.648557186126709, "geo/layer_7/attn_entropy_std": 0.8194402456283569, "geo/layer_14/stable_rank_q_proj": 54.85113525390625, "geo/layer_14/stable_rank_k_proj": 35.953426361083984, "geo/layer_14/stable_rank_o_proj": 52.00377655029297, "geo/layer_14/stable_rank_gate_proj": 80.20924377441406, "geo/layer_14/stable_rank_down_proj": 134.9917755126953, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.36530378460884094, "geo/layer_14/attn_entropy_mean": 5.498980522155762, "geo/layer_14/attn_entropy_std": 0.392970472574234, "geo/layer_21/stable_rank_q_proj": 44.89968490600586, "geo/layer_21/stable_rank_k_proj": 30.986215591430664, "geo/layer_21/stable_rank_o_proj": 79.54590606689453, "geo/layer_21/stable_rank_gate_proj": 77.84185028076172, "geo/layer_21/stable_rank_down_proj": 57.658573150634766, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14697708189487457, "geo/layer_21/attn_entropy_mean": 5.738497734069824, "geo/layer_21/attn_entropy_std": 0.28104454278945923, "geo/layer_27/stable_rank_q_proj": 41.6071662902832, "geo/layer_27/stable_rank_k_proj": 31.234535217285156, "geo/layer_27/stable_rank_o_proj": 118.69979858398438, "geo/layer_27/stable_rank_gate_proj": 88.37533569335938, "geo/layer_27/stable_rank_down_proj": 135.75804138183594, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08315455913543701, "geo/layer_27/attn_entropy_mean": 4.330387115478516, "geo/layer_27/attn_entropy_std": 0.623916745185852, "attnres/final_alpha/block_0": 0.23928353190422058, "attnres/block_norm/0": 1.6877905130386353, "attnres/final_alpha/block_1": 0.005810993257910013, "attnres/block_norm/1": 36157.98046875, "attnres/final_alpha/block_2": 0.012386634014546871, "attnres/block_norm/2": 24553.111328125, "attnres/final_alpha/block_3": 0.014205316081643105, "attnres/block_norm/3": 40604.63671875, "attnres/final_alpha/block_4": 0.017886221408843994, "attnres/block_norm/4": 11492.240234375, "attnres/final_alpha/block_5": 0.5891177654266357, "attnres/block_norm/5": 5628.24072265625, "attnres/final_alpha/block_6": 0.1213095635175705, "attnres/block_norm/6": 26579.125, "geo/tier1_time_s": 1.3613784313201904, "geo/step": 30975.0, "geo/rankme_slope": -0.00017439622333308323} {"step": 30980, "timestamp": 1778228021.3112402, "train/loss": 2.1743059396743774, "train/z_loss": 0.0014628577046096325, "train/perplexity": 8.79607799431825, "train/grad_norm": 0.09130859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1699004.3279742121, "perf/iters_per_sec": 0.8101483955260335, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2343417644500732, "data/tokens_consumed": 64971866112, "data/tokens_consumed_B": 64.971866112, "train/loss_slope": -9.375437987734232e-06} {"step": 30990, "timestamp": 1778228031.6772647, "train/loss": 2.170793128013611, "train/z_loss": 0.0014645804301835596, "train/perplexity": 8.765233236601379, "train/grad_norm": 0.2109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023960.0702874598, "perf/iters_per_sec": 0.9650993682324694, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0361627340316772, "data/tokens_consumed": 64992837632, "data/tokens_consumed_B": 64.992837632, "train/loss_slope": -8.196055384823966e-06} {"step": 31000, "timestamp": 1778228042.0242558, "grad/layer_0/attn": 0.0024651777930557728, "grad/layer_0/mlp": 0.0025680819526314735, "grad/layer_0/attn_mlp_ratio": 0.9599295281588622, "grad/layer_4/attn": 0.0018898911075666547, "grad/layer_4/mlp": 0.0024046748876571655, "grad/layer_4/attn_mlp_ratio": 0.7859237182850826, "grad/layer_8/attn": 0.005088884383440018, "grad/layer_8/mlp": 0.0035337950102984905, "grad/layer_8/attn_mlp_ratio": 1.4400620931897132, "grad/layer_12/attn": 0.004382607061415911, "grad/layer_12/mlp": 0.006126361433416605, "grad/layer_12/attn_mlp_ratio": 0.7153686633592743, "grad/layer_16/attn": 0.004093270283192396, "grad/layer_16/mlp": 0.00423560431227088, "grad/layer_16/attn_mlp_ratio": 0.9663957926131793, "grad/layer_20/attn": 0.003665846772491932, "grad/layer_20/mlp": 0.00535577442497015, "grad/layer_20/attn_mlp_ratio": 0.6844662252678315, "grad/layer_24/attn": 0.0047645242884755135, "grad/layer_24/mlp": 0.008373032324016094, "grad/layer_24/attn_mlp_ratio": 0.5690321077474375, "grad/layer_27/attn": 0.005054009612649679, "grad/layer_27/mlp": 0.007707588374614716, "grad/layer_27/attn_mlp_ratio": 0.6557186634049397} {"step": 31000, "timestamp": 1778228042.0400698, "train/loss": 2.1610488653182984, "train/z_loss": 0.001460684498306364, "train/perplexity": 8.680237285330902, "train/grad_norm": 0.0947265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024917.9267057907, "perf/iters_per_sec": 0.965556109764953, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0356725931167603, "data/tokens_consumed": 65013809152, "data/tokens_consumed_B": 65.013809152, "train/loss_slope": -9.605177958877937e-06} {"step": 31000, "timestamp": 1778228048.8968449, "geo/ww_alpha_mean": 7.630171913597687, "geo/ww_alpha_std": 4.148323949628134, "geo/ww_alpha_min": 1.3436291637092643, "geo/ww_alpha_max": 30.299524221852305, "geo/ww_alpha_healthy_frac": 0.16751269035532995, "geo/ww_alpha_by_type/q_proj": 4.134844690140263, "geo/ww_alpha_by_type/k_proj": 4.601331391763133, "geo/ww_alpha_by_type/v_proj": 7.680531350315545, "geo/ww_alpha_by_type/o_proj": 7.578604697548937, "geo/ww_alpha_by_type/gate_proj": 8.291056285471614, "geo/ww_alpha_by_type/up_proj": 12.163707839282798, "geo/ww_alpha_by_type/down_proj": 9.07503895322611, "geo/twonn_id/layer_0": 0.719781219959259, "geo/twonn_id/layer_7": 3.0820460319519043, "geo/twonn_id/layer_14": 4.421426773071289, "geo/twonn_id/layer_21": 7.358016014099121, "geo/twonn_id/layer_27": 5.502708435058594, "geo/tier2_time_s": 6.851006984710693} {"step": 31000, "timestamp": 1778228049.5393674, "eoc/jacobian_sigma/layer_0/attn": 1125.8790283203125, "eoc/jacobian_sigma/layer_0/mlp": 7531.12744140625, "eoc/jacobian_sigma/layer_0": 7531.12744140625, "eoc/jacobian_sigma/layer_7/attn": 1.1721792221069336, "eoc/jacobian_sigma/layer_7/mlp": 1.6648701429367065, "eoc/jacobian_sigma/layer_7": 1.6648701429367065, "eoc/jacobian_sigma/layer_14/attn": 1.625092625617981, "eoc/jacobian_sigma/layer_14/mlp": 5.94326639175415, "eoc/jacobian_sigma/layer_14": 5.94326639175415, "eoc/jacobian_sigma/layer_21/attn": 1.0895230770111084, "eoc/jacobian_sigma/layer_21/mlp": 4.232021808624268, "eoc/jacobian_sigma/layer_21": 4.232021808624268, "eoc/jacobian_sigma/layer_27/attn": 2.9371185302734375, "eoc/jacobian_sigma/layer_27/mlp": 25.532695770263672, "eoc/jacobian_sigma/layer_27": 25.532695770263672, "eoc/layer0_sigma": 7531.12744140625, "eoc/sigma_max": 25.532695770263672, "eoc/sigma_min": 1.6648701429367065, "eoc/sigma_mean": 9.343213528394699, "eoc/time_s": 0.636284351348877} {"step": 31010, "timestamp": 1778228059.9189205, "train/loss": 2.224202108383179, "train/z_loss": 0.0014607822289690376, "train/perplexity": 9.246102472295163, "train/grad_norm": 0.107421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1173295.0652630476, "perf/iters_per_sec": 0.5594706846537817, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.7874037504196167, "data/tokens_consumed": 65034780672, "data/tokens_consumed_B": 65.034780672, "train/loss_slope": -5.391332955107213e-06} {"step": 31020, "timestamp": 1778228070.2807727, "train/loss": 2.192019462585449, "train/z_loss": 0.0014550019521266222, "train/perplexity": 8.953275675471133, "train/grad_norm": 0.0927734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024843.485399251, "perf/iters_per_sec": 0.9655206133838897, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0357106685638429, "data/tokens_consumed": 65055752192, "data/tokens_consumed_B": 65.055752192, "train/loss_slope": -7.830206567448077e-07} {"step": 31030, "timestamp": 1778228080.6404922, "train/loss": 2.176869809627533, "train/z_loss": 0.0014558327849954367, "train/perplexity": 8.818658929314358, "train/grad_norm": 0.14453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025305.930486645, "perf/iters_per_sec": 0.9657411243851877, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035474181175232, "data/tokens_consumed": 65076723712, "data/tokens_consumed_B": 65.076723712, "train/loss_slope": -9.823252623266731e-07} {"step": 31040, "timestamp": 1778228091.0111585, "train/loss": 2.207144570350647, "train/z_loss": 0.0014496218878775834, "train/perplexity": 9.089724233228955, "train/grad_norm": 0.162109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023221.074704072, "perf/iters_per_sec": 0.9647469876785622, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0365411996841432, "data/tokens_consumed": 65097695232, "data/tokens_consumed_B": 65.097695232, "train/loss_slope": -1.0806525751452186e-06} {"step": 31050, "timestamp": 1778228101.366109, "grad/layer_0/attn": 0.002861063927412033, "grad/layer_0/mlp": 0.002729433123022318, "grad/layer_0/attn_mlp_ratio": 1.0482263875442828, "grad/layer_4/attn": 0.002074343850836158, "grad/layer_4/mlp": 0.0024000329431146383, "grad/layer_4/attn_mlp_ratio": 0.8642980382237496, "grad/layer_8/attn": 0.005424252711236477, "grad/layer_8/mlp": 0.003442284883931279, "grad/layer_8/attn_mlp_ratio": 1.57577095927766, "grad/layer_12/attn": 0.0061792959459125996, "grad/layer_12/mlp": 0.006487210281193256, "grad/layer_12/attn_mlp_ratio": 0.952535154991533, "grad/layer_16/attn": 0.0043753269128501415, "grad/layer_16/mlp": 0.005290237255394459, "grad/layer_16/attn_mlp_ratio": 0.8270568254160879, "grad/layer_20/attn": 0.01099822111427784, "grad/layer_20/mlp": 0.005899412557482719, "grad/layer_20/attn_mlp_ratio": 1.8642908629773296, "grad/layer_24/attn": 0.00475861132144928, "grad/layer_24/mlp": 0.008045156486332417, "grad/layer_24/attn_mlp_ratio": 0.5914877193979652, "grad/layer_27/attn": 0.00358314230106771, "grad/layer_27/mlp": 0.0067440541461110115, "grad/layer_27/attn_mlp_ratio": 0.5313038967819609} {"step": 31050, "timestamp": 1778228101.9894423, "eos/sharpness": 5.970239639282226, "eos/L0_probe": 2.015427589416504, "eos/L_plus": 2.045105457305908, "eos/L_minus": 2.045452117919922, "eos/grad_norm": 0.10896790772676468, "eos/embed_grad_frac": 0.18711112439632416, "eos/time_s": 0.6206250190734863} {"step": 31050, "timestamp": 1778228102.0096817, "train/loss": 2.1875946044921877, "train/z_loss": 0.001469532516784966, "train/perplexity": 8.913746221745734, "train/grad_norm": 0.10888671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1907622.4765275565, "perf/iters_per_sec": 0.9096252806318076, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0993537902832031, "data/tokens_consumed": 65118666752, "data/tokens_consumed_B": 65.118666752, "train/loss_slope": -1.5868299686261391e-06} {"step": 31050, "timestamp": 1778228103.3707335, "geo/rankme_last": 439.5552673339844, "geo/layer_0/stable_rank_q_proj": 18.659448623657227, "geo/layer_0/stable_rank_k_proj": 16.12250518798828, "geo/layer_0/stable_rank_o_proj": 50.5831184387207, "geo/layer_0/stable_rank_gate_proj": 143.0588836669922, "geo/layer_0/stable_rank_down_proj": 52.06939697265625, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05726588889956474, "geo/layer_0/attn_entropy_mean": 6.219814300537109, "geo/layer_0/attn_entropy_std": 0.3543727993965149, "geo/layer_7/stable_rank_q_proj": 42.42997741699219, "geo/layer_7/stable_rank_k_proj": 42.34752655029297, "geo/layer_7/stable_rank_o_proj": 105.5924072265625, "geo/layer_7/stable_rank_gate_proj": 95.20146179199219, "geo/layer_7/stable_rank_down_proj": 148.14089965820312, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5288832187652588, "geo/layer_7/attn_entropy_mean": 4.672292709350586, "geo/layer_7/attn_entropy_std": 0.8117650747299194, "geo/layer_14/stable_rank_q_proj": 54.91630935668945, "geo/layer_14/stable_rank_k_proj": 36.028751373291016, "geo/layer_14/stable_rank_o_proj": 51.858699798583984, "geo/layer_14/stable_rank_gate_proj": 80.3795166015625, "geo/layer_14/stable_rank_down_proj": 134.66290283203125, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3772612512111664, "geo/layer_14/attn_entropy_mean": 5.501195430755615, "geo/layer_14/attn_entropy_std": 0.3762210011482239, "geo/layer_21/stable_rank_q_proj": 44.85879135131836, "geo/layer_21/stable_rank_k_proj": 31.025463104248047, "geo/layer_21/stable_rank_o_proj": 79.5568618774414, "geo/layer_21/stable_rank_gate_proj": 77.76107788085938, "geo/layer_21/stable_rank_down_proj": 57.615394592285156, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14714854955673218, "geo/layer_21/attn_entropy_mean": 5.738419055938721, "geo/layer_21/attn_entropy_std": 0.2898238003253937, "geo/layer_27/stable_rank_q_proj": 41.55912780761719, "geo/layer_27/stable_rank_k_proj": 31.212005615234375, "geo/layer_27/stable_rank_o_proj": 118.73446655273438, "geo/layer_27/stable_rank_gate_proj": 88.24467468261719, "geo/layer_27/stable_rank_down_proj": 135.65045166015625, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07994415611028671, "geo/layer_27/attn_entropy_mean": 4.327342987060547, "geo/layer_27/attn_entropy_std": 0.6148096919059753, "attnres/final_alpha/block_0": 0.24256516993045807, "attnres/block_norm/0": 1.688122272491455, "attnres/final_alpha/block_1": 0.005822960287332535, "attnres/block_norm/1": 36133.546875, "attnres/final_alpha/block_2": 0.012420456856489182, "attnres/block_norm/2": 24594.74609375, "attnres/final_alpha/block_3": 0.01411623228341341, "attnres/block_norm/3": 40328.8671875, "attnres/final_alpha/block_4": 0.018249953165650368, "attnres/block_norm/4": 11517.59765625, "attnres/final_alpha/block_5": 0.5819151401519775, "attnres/block_norm/5": 5649.9140625, "attnres/final_alpha/block_6": 0.12491010874509811, "attnres/block_norm/6": 26643.00390625, "geo/tier1_time_s": 1.3569860458374023, "geo/step": 31050.0, "geo/rankme_slope": -0.00017624061343287316} {"step": 31060, "timestamp": 1778228113.731807, "train/loss": 2.1952999353408815, "train/z_loss": 0.0014652493875473738, "train/perplexity": 8.982694880463725, "train/grad_norm": 0.1455078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1789648.6446195787, "perf/iters_per_sec": 0.8533709738824743, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1718233108520508, "data/tokens_consumed": 65139638272, "data/tokens_consumed_B": 65.139638272, "train/loss_slope": -1.215781673382165e-06} {"step": 31070, "timestamp": 1778228124.0965712, "train/loss": 2.145018196105957, "train/z_loss": 0.0014675383572466671, "train/perplexity": 8.542196670595674, "train/grad_norm": 0.1005859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024245.776142187, "perf/iters_per_sec": 0.9652356034003196, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0360164880752563, "data/tokens_consumed": 65160609792, "data/tokens_consumed_B": 65.160609792, "train/loss_slope": -4.99574773990459e-06} {"step": 31080, "timestamp": 1778228134.460939, "train/loss": 2.125525975227356, "train/z_loss": 0.0014763901010155678, "train/perplexity": 8.377302583171087, "train/grad_norm": 0.232421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024881.2414204737, "perf/iters_per_sec": 0.9655386168577546, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0356913566589356, "data/tokens_consumed": 65181581312, "data/tokens_consumed_B": 65.181581312, "train/loss_slope": -8.996035764426946e-06} {"step": 31090, "timestamp": 1778228144.818661, "train/loss": 2.155862545967102, "train/z_loss": 0.0014595620799809695, "train/perplexity": 8.635335341282996, "train/grad_norm": 0.173828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025537.2089295126, "perf/iters_per_sec": 0.9658514065406383, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0353559494018554, "data/tokens_consumed": 65202552832, "data/tokens_consumed_B": 65.202552832, "train/loss_slope": -1.078634204858776e-05} {"step": 31100, "timestamp": 1778228155.1722538, "grad/layer_0/attn": 0.003218551864847541, "grad/layer_0/mlp": 0.0029323811177164316, "grad/layer_0/attn_mlp_ratio": 1.0975898513474869, "grad/layer_4/attn": 0.002077179029583931, "grad/layer_4/mlp": 0.0025854192208498716, "grad/layer_4/attn_mlp_ratio": 0.8034205565158098, "grad/layer_8/attn": 0.007066552992910147, "grad/layer_8/mlp": 0.0039650737307965755, "grad/layer_8/attn_mlp_ratio": 1.7821995994184276, "grad/layer_12/attn": 0.006572674959897995, "grad/layer_12/mlp": 0.00658101961016655, "grad/layer_12/attn_mlp_ratio": 0.9987319973748653, "grad/layer_16/attn": 0.003958878107368946, "grad/layer_16/mlp": 0.004887125454843044, "grad/layer_16/attn_mlp_ratio": 0.8100626969662723, "grad/layer_20/attn": 0.0044096969068050385, "grad/layer_20/mlp": 0.006088028196245432, "grad/layer_20/attn_mlp_ratio": 0.7243226693812441, "grad/layer_24/attn": 0.007666910067200661, "grad/layer_24/mlp": 0.008508549071848392, "grad/layer_24/attn_mlp_ratio": 0.9010831238500213, "grad/layer_27/attn": 0.004277446772903204, "grad/layer_27/mlp": 0.007428332697600126, "grad/layer_27/attn_mlp_ratio": 0.5758286400799275} {"step": 31100, "timestamp": 1778228155.188498, "train/loss": 2.2114492654800415, "train/z_loss": 0.0014573668129742145, "train/perplexity": 9.12893706395155, "train/grad_norm": 0.1162109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023279.6662792359, "perf/iters_per_sec": 0.9647749263187579, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0365111827850342, "data/tokens_consumed": 65223524352, "data/tokens_consumed_B": 65.223524352, "train/loss_slope": -1.0628696279128008e-05} {"step": 31110, "timestamp": 1778228165.549536, "train/loss": 2.1563052892684937, "train/z_loss": 0.0014621297363191844, "train/perplexity": 8.639159424641797, "train/grad_norm": 0.201171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025396.1689672798, "perf/iters_per_sec": 0.9657841534458541, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354280471801758, "data/tokens_consumed": 65244495872, "data/tokens_consumed_B": 65.244495872, "train/loss_slope": -1.0439847018530567e-05} {"step": 31120, "timestamp": 1778228175.913, "train/loss": 2.2203744888305663, "train/z_loss": 0.0014574894681572913, "train/perplexity": 9.210779554158169, "train/grad_norm": 0.1455078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024600.342389534, "perf/iters_per_sec": 0.9654046737620993, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03583505153656, "data/tokens_consumed": 65265467392, "data/tokens_consumed_B": 65.265467392, "train/loss_slope": -1.011089965312143e-05} {"step": 31125, "timestamp": 1778228181.6917255, "eos/sharpness": 11.6457462310791, "eos/L0_probe": 2.0142555236816406, "eos/L_plus": 2.0718448162078857, "eos/L_minus": 2.0731236934661865, "eos/grad_norm": 0.10171985626220703, "eos/embed_grad_frac": 0.21479113399982452, "eos/time_s": 0.6029648780822754} {"step": 31125, "timestamp": 1778228183.0688918, "geo/rankme_last": 438.9895324707031, "geo/layer_0/stable_rank_q_proj": 18.661727905273438, "geo/layer_0/stable_rank_k_proj": 16.165285110473633, "geo/layer_0/stable_rank_o_proj": 50.38016891479492, "geo/layer_0/stable_rank_gate_proj": 143.3514862060547, "geo/layer_0/stable_rank_down_proj": 52.11345291137695, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.054349806159734726, "geo/layer_0/attn_entropy_mean": 6.217715263366699, "geo/layer_0/attn_entropy_std": 0.35262659192085266, "geo/layer_7/stable_rank_q_proj": 42.431888580322266, "geo/layer_7/stable_rank_k_proj": 42.23604965209961, "geo/layer_7/stable_rank_o_proj": 105.2576904296875, "geo/layer_7/stable_rank_gate_proj": 95.0053482055664, "geo/layer_7/stable_rank_down_proj": 147.9800567626953, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.532831072807312, "geo/layer_7/attn_entropy_mean": 4.634840488433838, "geo/layer_7/attn_entropy_std": 0.8299829363822937, "geo/layer_14/stable_rank_q_proj": 54.80057144165039, "geo/layer_14/stable_rank_k_proj": 36.05117416381836, "geo/layer_14/stable_rank_o_proj": 51.8885383605957, "geo/layer_14/stable_rank_gate_proj": 80.53654479980469, "geo/layer_14/stable_rank_down_proj": 134.22230529785156, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3709709048271179, "geo/layer_14/attn_entropy_mean": 5.479186058044434, "geo/layer_14/attn_entropy_std": 0.3900119364261627, "geo/layer_21/stable_rank_q_proj": 44.8663215637207, "geo/layer_21/stable_rank_k_proj": 31.204570770263672, "geo/layer_21/stable_rank_o_proj": 79.4178695678711, "geo/layer_21/stable_rank_gate_proj": 77.73323822021484, "geo/layer_21/stable_rank_down_proj": 57.66379928588867, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1420111358165741, "geo/layer_21/attn_entropy_mean": 5.745231628417969, "geo/layer_21/attn_entropy_std": 0.28068655729293823, "geo/layer_27/stable_rank_q_proj": 41.59636688232422, "geo/layer_27/stable_rank_k_proj": 31.3040714263916, "geo/layer_27/stable_rank_o_proj": 118.49446105957031, "geo/layer_27/stable_rank_gate_proj": 88.25932312011719, "geo/layer_27/stable_rank_down_proj": 135.6940155029297, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0940147265791893, "geo/layer_27/attn_entropy_mean": 4.316314697265625, "geo/layer_27/attn_entropy_std": 0.6267739534378052, "attnres/final_alpha/block_0": 0.24024488031864166, "attnres/block_norm/0": 1.6885380744934082, "attnres/final_alpha/block_1": 0.005734918639063835, "attnres/block_norm/1": 36298.3046875, "attnres/final_alpha/block_2": 0.01240997388958931, "attnres/block_norm/2": 24617.0703125, "attnres/final_alpha/block_3": 0.014246846549212933, "attnres/block_norm/3": 40566.01171875, "attnres/final_alpha/block_4": 0.017964569851756096, "attnres/block_norm/4": 11544.4609375, "attnres/final_alpha/block_5": 0.5866384506225586, "attnres/block_norm/5": 5657.431640625, "attnres/final_alpha/block_6": 0.12276032567024231, "attnres/block_norm/6": 26891.45703125, "geo/tier1_time_s": 1.357787847518921, "geo/step": 31125.0, "geo/rankme_slope": -0.00021537361038165265} {"step": 31130, "timestamp": 1778228188.2578032, "train/loss": 2.2275413990020754, "train/z_loss": 0.0014373169862665236, "train/perplexity": 9.277029503977055, "train/grad_norm": 0.134765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1699677.2120270794, "perf/iters_per_sec": 0.8104692516456029, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2338531017303467, "data/tokens_consumed": 65286438912, "data/tokens_consumed_B": 65.286438912, "train/loss_slope": -7.171086352257026e-06} {"step": 31140, "timestamp": 1778228198.6355863, "train/loss": 2.2763505220413207, "train/z_loss": 0.0014458910096436738, "train/perplexity": 9.741065657020895, "train/grad_norm": 0.1748046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022233.765821783, "perf/iters_per_sec": 0.9642762021168628, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037047266960144, "data/tokens_consumed": 65307410432, "data/tokens_consumed_B": 65.307410432, "train/loss_slope": -4.367107247719691e-06} {"step": 31150, "timestamp": 1778228208.9914885, "grad/layer_0/attn": 0.0029765733052045107, "grad/layer_0/mlp": 0.0028437587898224592, "grad/layer_0/attn_mlp_ratio": 1.0467038242438138, "grad/layer_4/attn": 0.001829486689530313, "grad/layer_4/mlp": 0.0024808261077851057, "grad/layer_4/attn_mlp_ratio": 0.7374505653758342, "grad/layer_8/attn": 0.005156523548066616, "grad/layer_8/mlp": 0.0036226343363523483, "grad/layer_8/attn_mlp_ratio": 1.423418133588646, "grad/layer_12/attn": 0.004430980887264013, "grad/layer_12/mlp": 0.006378274876624346, "grad/layer_12/attn_mlp_ratio": 0.6946989434452192, "grad/layer_16/attn": 0.005855457857251167, "grad/layer_16/mlp": 0.004898295737802982, "grad/layer_16/attn_mlp_ratio": 1.1954071479434158, "grad/layer_20/attn": 0.0041173165664076805, "grad/layer_20/mlp": 0.006295849569141865, "grad/layer_20/attn_mlp_ratio": 0.6539731383022171, "grad/layer_24/attn": 0.011858291923999786, "grad/layer_24/mlp": 0.012081939727067947, "grad/layer_24/attn_mlp_ratio": 0.981489073255678, "grad/layer_27/attn": 0.004937388468533754, "grad/layer_27/mlp": 0.011687623336911201, "grad/layer_27/attn_mlp_ratio": 0.4224458886090366} {"step": 31150, "timestamp": 1778228209.0070558, "train/loss": 2.194871520996094, "train/z_loss": 0.0014476965996436774, "train/perplexity": 8.978847389341125, "train/grad_norm": 0.1650390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023047.0419029652, "perf/iters_per_sec": 0.9646640023722483, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0366303682327271, "data/tokens_consumed": 65328381952, "data/tokens_consumed_B": 65.328381952, "train/loss_slope": -4.211500246818785e-06} {"step": 31160, "timestamp": 1778228219.3761725, "train/loss": 2.171723461151123, "train/z_loss": 0.0014540501055307687, "train/perplexity": 8.773391617957238, "train/grad_norm": 0.1484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023520.909439264, "perf/iters_per_sec": 0.9648899600216218, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0363876104354859, "data/tokens_consumed": 65349353472, "data/tokens_consumed_B": 65.349353472, "train/loss_slope": -2.644777598411045e-06} {"step": 31170, "timestamp": 1778228229.7437353, "train/loss": 2.2014451742172243, "train/z_loss": 0.001447574410121888, "train/perplexity": 9.038065645294294, "train/grad_norm": 0.1044921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023934.8292072243, "perf/iters_per_sec": 0.9650873323475, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0361756563186646, "data/tokens_consumed": 65370324992, "data/tokens_consumed_B": 65.370324992, "train/loss_slope": -2.80543086123183e-06} {"step": 31180, "timestamp": 1778228240.108505, "train/loss": 2.1868396043777465, "train/z_loss": 0.0014563785050995647, "train/perplexity": 8.907018882218836, "train/grad_norm": 0.2041015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024245.1705505175, "perf/iters_per_sec": 0.9652353146317089, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0360167980194093, "data/tokens_consumed": 65391296512, "data/tokens_consumed_B": 65.391296512, "train/loss_slope": -5.37626369200483e-06} {"step": 31190, "timestamp": 1778228250.4746568, "train/loss": 2.1682738065719604, "train/z_loss": 0.00147567184176296, "train/perplexity": 8.743178589604907, "train/grad_norm": 0.095703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024088.3810842242, "perf/iters_per_sec": 0.9651605515881654, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0360970497131348, "data/tokens_consumed": 65412268032, "data/tokens_consumed_B": 65.412268032, "train/loss_slope": -4.514751211144024e-06} {"step": 31200, "timestamp": 1778228260.828304, "grad/layer_0/attn": 0.002850182354450226, "grad/layer_0/mlp": 0.0028688390739262104, "grad/layer_0/attn_mlp_ratio": 0.9934967356673217, "grad/layer_4/attn": 0.003017204813659191, "grad/layer_4/mlp": 0.0025213160552084446, "grad/layer_4/attn_mlp_ratio": 1.1966784916783868, "grad/layer_8/attn": 0.0044251116923987865, "grad/layer_8/mlp": 0.0037241836544126272, "grad/layer_8/attn_mlp_ratio": 1.1882098156826069, "grad/layer_12/attn": 0.004878987558186054, "grad/layer_12/mlp": 0.007291075307875872, "grad/layer_12/attn_mlp_ratio": 0.6691725548354276, "grad/layer_16/attn": 0.003695509862154722, "grad/layer_16/mlp": 0.004567817784845829, "grad/layer_16/attn_mlp_ratio": 0.8090317861434293, "grad/layer_20/attn": 0.006022843066602945, "grad/layer_20/mlp": 0.005718311294913292, "grad/layer_20/attn_mlp_ratio": 1.0532555243424744, "grad/layer_24/attn": 0.01028130017220974, "grad/layer_24/mlp": 0.008849400095641613, "grad/layer_24/attn_mlp_ratio": 1.1618075739498535, "grad/layer_27/attn": 0.00635903887450695, "grad/layer_27/mlp": 0.008962978608906269, "grad/layer_27/attn_mlp_ratio": 0.709478297453518} {"step": 31200, "timestamp": 1778228261.42801, "eos/sharpness": 59.39660072326659, "eos/L0_probe": 2.0169363021850586, "eos/L_plus": 2.2511136531829834, "eos/L_minus": 2.3767249584198, "eos/grad_norm": 0.15452200174331665, "eos/embed_grad_frac": 0.10751979798078537, "eos/time_s": 0.5969750881195068} {"step": 31200, "timestamp": 1778228261.447482, "train/loss": 2.1449679136276245, "train/z_loss": 0.0014644726063124836, "train/perplexity": 8.541767158575226, "train/grad_norm": 0.1552734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1912385.5401110146, "perf/iters_per_sec": 0.9118964863352845, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.096615695953369, "data/tokens_consumed": 65433239552, "data/tokens_consumed_B": 65.433239552, "train/loss_slope": -5.8813317273423075e-06} {"step": 31200, "timestamp": 1778228262.8109164, "geo/rankme_last": 439.5894470214844, "geo/layer_0/stable_rank_q_proj": 18.660743713378906, "geo/layer_0/stable_rank_k_proj": 16.19283676147461, "geo/layer_0/stable_rank_o_proj": 50.35044479370117, "geo/layer_0/stable_rank_gate_proj": 142.89996337890625, "geo/layer_0/stable_rank_down_proj": 52.158531188964844, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05222347378730774, "geo/layer_0/attn_entropy_mean": 6.217698574066162, "geo/layer_0/attn_entropy_std": 0.3478192687034607, "geo/layer_7/stable_rank_q_proj": 42.450443267822266, "geo/layer_7/stable_rank_k_proj": 42.20399475097656, "geo/layer_7/stable_rank_o_proj": 105.18783569335938, "geo/layer_7/stable_rank_gate_proj": 94.96458435058594, "geo/layer_7/stable_rank_down_proj": 147.67605590820312, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5332212448120117, "geo/layer_7/attn_entropy_mean": 4.6564435958862305, "geo/layer_7/attn_entropy_std": 0.8173828721046448, "geo/layer_14/stable_rank_q_proj": 54.676876068115234, "geo/layer_14/stable_rank_k_proj": 36.103553771972656, "geo/layer_14/stable_rank_o_proj": 51.920265197753906, "geo/layer_14/stable_rank_gate_proj": 80.44923400878906, "geo/layer_14/stable_rank_down_proj": 134.46617126464844, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37611666321754456, "geo/layer_14/attn_entropy_mean": 5.528779983520508, "geo/layer_14/attn_entropy_std": 0.3811361491680145, "geo/layer_21/stable_rank_q_proj": 44.87202835083008, "geo/layer_21/stable_rank_k_proj": 31.26491355895996, "geo/layer_21/stable_rank_o_proj": 79.43135070800781, "geo/layer_21/stable_rank_gate_proj": 77.82628631591797, "geo/layer_21/stable_rank_down_proj": 57.726966857910156, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1493026316165924, "geo/layer_21/attn_entropy_mean": 5.7508649826049805, "geo/layer_21/attn_entropy_std": 0.2859390377998352, "geo/layer_27/stable_rank_q_proj": 41.65618896484375, "geo/layer_27/stable_rank_k_proj": 31.314985275268555, "geo/layer_27/stable_rank_o_proj": 118.36911010742188, "geo/layer_27/stable_rank_gate_proj": 88.18194580078125, "geo/layer_27/stable_rank_down_proj": 135.6822052001953, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.085246242582798, "geo/layer_27/attn_entropy_mean": 4.316425323486328, "geo/layer_27/attn_entropy_std": 0.6377770900726318, "attnres/final_alpha/block_0": 0.24103789031505585, "attnres/block_norm/0": 1.6890008449554443, "attnres/final_alpha/block_1": 0.005793898366391659, "attnres/block_norm/1": 36223.640625, "attnres/final_alpha/block_2": 0.012486930936574936, "attnres/block_norm/2": 24600.611328125, "attnres/final_alpha/block_3": 0.014140361919999123, "attnres/block_norm/3": 40568.2890625, "attnres/final_alpha/block_4": 0.01778854802250862, "attnres/block_norm/4": 11604.3544921875, "attnres/final_alpha/block_5": 0.5848451852798462, "attnres/block_norm/5": 5649.744140625, "attnres/final_alpha/block_6": 0.12390722334384918, "attnres/block_norm/6": 26721.44140625, "geo/tier1_time_s": 1.3591692447662354, "geo/step": 31200.0, "geo/rankme_slope": -0.00022531414909713886} {"step": 31210, "timestamp": 1778228273.5239294, "train/loss": 2.1784220695495606, "train/z_loss": 0.0014563682489097118, "train/perplexity": 8.832358409962993, "train/grad_norm": 0.1845703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1737163.781530937, "perf/iters_per_sec": 0.8283442409186063, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2072275638580323, "data/tokens_consumed": 65454211072, "data/tokens_consumed_B": 65.454211072, "train/loss_slope": -6.746748321377494e-06} {"step": 31220, "timestamp": 1778228283.878537, "train/loss": 2.1901718378067017, "train/z_loss": 0.0014449277659878135, "train/perplexity": 8.93674865405087, "train/grad_norm": 0.111328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026409.7237813438, "perf/iters_per_sec": 0.9662674540430755, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349101543426513, "data/tokens_consumed": 65475182592, "data/tokens_consumed_B": 65.475182592, "train/loss_slope": -5.258015888144232e-06} {"step": 31230, "timestamp": 1778228294.237348, "train/loss": 2.1888864040374756, "train/z_loss": 0.001449043012689799, "train/perplexity": 8.925268435654838, "train/grad_norm": 0.29296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025792.007484757, "perf/iters_per_sec": 0.9659729039596353, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352257251739503, "data/tokens_consumed": 65496154112, "data/tokens_consumed_B": 65.496154112, "train/loss_slope": -5.0849501663880346e-06} {"step": 31240, "timestamp": 1778228304.5896792, "train/loss": 2.190255904197693, "train/z_loss": 0.0014556705602444709, "train/perplexity": 8.937499965837002, "train/grad_norm": 0.1201171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026707.5157906865, "perf/iters_per_sec": 0.966409452338546, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347580909729004, "data/tokens_consumed": 65517125632, "data/tokens_consumed_B": 65.517125632, "train/loss_slope": -6.517091342503648e-06} {"step": 31250, "timestamp": 1778228314.9267936, "grad/layer_0/attn": 0.0026434774044901133, "grad/layer_0/mlp": 0.002652863971889019, "grad/layer_0/attn_mlp_ratio": 0.9964616855049713, "grad/layer_4/attn": 0.00221836450509727, "grad/layer_4/mlp": 0.0024909544736146927, "grad/layer_4/attn_mlp_ratio": 0.8905680290580893, "grad/layer_8/attn": 0.0051222569309175014, "grad/layer_8/mlp": 0.0037244639825075865, "grad/layer_8/attn_mlp_ratio": 1.3753003969013486, "grad/layer_12/attn": 0.0038800037000328302, "grad/layer_12/mlp": 0.006468135863542557, "grad/layer_12/attn_mlp_ratio": 0.5998642764936217, "grad/layer_16/attn": 0.007244022563099861, "grad/layer_16/mlp": 0.004231317900121212, "grad/layer_16/attn_mlp_ratio": 1.7120014527133973, "grad/layer_20/attn": 0.005036901216953993, "grad/layer_20/mlp": 0.005811113398522139, "grad/layer_20/attn_mlp_ratio": 0.8667704078116801, "grad/layer_24/attn": 0.011613716371357441, "grad/layer_24/mlp": 0.010019677691161633, "grad/layer_24/attn_mlp_ratio": 1.1590908024609246, "grad/layer_27/attn": 0.008908303454518318, "grad/layer_27/mlp": 0.009768265299499035, "grad/layer_27/attn_mlp_ratio": 0.9119636998166717} {"step": 31250, "timestamp": 1778228314.9427233, "train/loss": 2.1273823022842406, "train/z_loss": 0.0014603271731175483, "train/perplexity": 8.392868039438936, "train/grad_norm": 0.171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026624.8648315875, "perf/iters_per_sec": 0.9663700412900865, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348002910614014, "data/tokens_consumed": 65538097152, "data/tokens_consumed_B": 65.538097152, "train/loss_slope": -8.772031859596131e-06} {"step": 31260, "timestamp": 1778228325.294399, "train/loss": 2.1343581914901733, "train/z_loss": 0.0014742462313733994, "train/perplexity": 8.451620443878406, "train/grad_norm": 0.09130859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027006.98351551, "perf/iters_per_sec": 0.9665522496774245, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346052169799804, "data/tokens_consumed": 65559068672, "data/tokens_consumed_B": 65.559068672, "train/loss_slope": -7.421841336698642e-06} {"step": 31270, "timestamp": 1778228335.6462326, "train/loss": 2.235963726043701, "train/z_loss": 0.0014561142073944212, "train/perplexity": 9.355493641925317, "train/grad_norm": 0.09716796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027034.3099139987, "perf/iters_per_sec": 0.9665652799196237, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034591269493103, "data/tokens_consumed": 65580040192, "data/tokens_consumed_B": 65.580040192, "train/loss_slope": 3.5388511900356376e-07} {"step": 31275, "timestamp": 1778228341.4416215, "eos/sharpness": 28.303670883178704, "eos/L0_probe": 2.0165462493896484, "eos/L_plus": 2.155179262161255, "eos/L_minus": 2.160949945449829, "eos/grad_norm": 0.14178434014320374, "eos/embed_grad_frac": 0.12117040902376175, "eos/time_s": 0.6117124557495117} {"step": 31275, "timestamp": 1778228342.8175068, "geo/rankme_last": 440.2556457519531, "geo/layer_0/stable_rank_q_proj": 18.65747833251953, "geo/layer_0/stable_rank_k_proj": 16.22666358947754, "geo/layer_0/stable_rank_o_proj": 50.270503997802734, "geo/layer_0/stable_rank_gate_proj": 142.7109832763672, "geo/layer_0/stable_rank_down_proj": 52.14637756347656, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05421072244644165, "geo/layer_0/attn_entropy_mean": 6.220058917999268, "geo/layer_0/attn_entropy_std": 0.3483175039291382, "geo/layer_7/stable_rank_q_proj": 42.47407531738281, "geo/layer_7/stable_rank_k_proj": 42.24281311035156, "geo/layer_7/stable_rank_o_proj": 105.09944915771484, "geo/layer_7/stable_rank_gate_proj": 94.71755981445312, "geo/layer_7/stable_rank_down_proj": 147.63043212890625, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5281212329864502, "geo/layer_7/attn_entropy_mean": 4.642925262451172, "geo/layer_7/attn_entropy_std": 0.8034443259239197, "geo/layer_14/stable_rank_q_proj": 54.64303970336914, "geo/layer_14/stable_rank_k_proj": 36.009525299072266, "geo/layer_14/stable_rank_o_proj": 51.945247650146484, "geo/layer_14/stable_rank_gate_proj": 80.5112075805664, "geo/layer_14/stable_rank_down_proj": 134.7499542236328, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38784030079841614, "geo/layer_14/attn_entropy_mean": 5.508008003234863, "geo/layer_14/attn_entropy_std": 0.38713338971138, "geo/layer_21/stable_rank_q_proj": 44.89731979370117, "geo/layer_21/stable_rank_k_proj": 31.14853286743164, "geo/layer_21/stable_rank_o_proj": 79.11714172363281, "geo/layer_21/stable_rank_gate_proj": 77.84186553955078, "geo/layer_21/stable_rank_down_proj": 57.736759185791016, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14618229866027832, "geo/layer_21/attn_entropy_mean": 5.7330732345581055, "geo/layer_21/attn_entropy_std": 0.2776961028575897, "geo/layer_27/stable_rank_q_proj": 41.55826950073242, "geo/layer_27/stable_rank_k_proj": 31.35889434814453, "geo/layer_27/stable_rank_o_proj": 118.30545806884766, "geo/layer_27/stable_rank_gate_proj": 88.12389373779297, "geo/layer_27/stable_rank_down_proj": 135.46812438964844, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08342531323432922, "geo/layer_27/attn_entropy_mean": 4.316011905670166, "geo/layer_27/attn_entropy_std": 0.6223679184913635, "attnres/final_alpha/block_0": 0.23992080986499786, "attnres/block_norm/0": 1.689449429512024, "attnres/final_alpha/block_1": 0.0057394178584218025, "attnres/block_norm/1": 36237.18359375, "attnres/final_alpha/block_2": 0.012500353157520294, "attnres/block_norm/2": 24572.96484375, "attnres/final_alpha/block_3": 0.014334343373775482, "attnres/block_norm/3": 40546.3046875, "attnres/final_alpha/block_4": 0.017803218215703964, "attnres/block_norm/4": 11570.1044921875, "attnres/final_alpha/block_5": 0.5872439742088318, "attnres/block_norm/5": 5683.8984375, "attnres/final_alpha/block_6": 0.12245789915323257, "attnres/block_norm/6": 26728.646484375, "geo/tier1_time_s": 1.3550682067871094, "geo/step": 31275.0, "geo/rankme_slope": -0.0001983937324929972} {"step": 31280, "timestamp": 1778228347.9989705, "train/loss": 2.2031504631042482, "train/z_loss": 0.001443515217397362, "train/perplexity": 9.05349130706604, "train/grad_norm": 0.1220703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1698438.5212997599, "perf/iters_per_sec": 0.8098785978792953, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2347529649734497, "data/tokens_consumed": 65601011712, "data/tokens_consumed_B": 65.601011712, "train/loss_slope": -3.245806930088346e-07} {"step": 31290, "timestamp": 1778228358.3510513, "train/loss": 2.150915277004242, "train/z_loss": 0.0014519280986860395, "train/perplexity": 8.592719517656114, "train/grad_norm": 0.10302734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026646.3907880818, "perf/iters_per_sec": 0.9663803056660089, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347892999649049, "data/tokens_consumed": 65621983232, "data/tokens_consumed_B": 65.621983232, "train/loss_slope": -1.6952042031709573e-06} {"step": 31300, "timestamp": 1778228368.6932697, "grad/layer_0/attn": 0.0027630047407001257, "grad/layer_0/mlp": 0.0027700907085090876, "grad/layer_0/attn_mlp_ratio": 0.9974419366371691, "grad/layer_4/attn": 0.0021533179096877575, "grad/layer_4/mlp": 0.0024714386090636253, "grad/layer_4/attn_mlp_ratio": 0.8712811294048247, "grad/layer_8/attn": 0.003771737450733781, "grad/layer_8/mlp": 0.0038089649751782417, "grad/layer_8/attn_mlp_ratio": 0.9902263150988018, "grad/layer_12/attn": 0.005415666848421097, "grad/layer_12/mlp": 0.006427401676774025, "grad/layer_12/attn_mlp_ratio": 0.8425903711187124, "grad/layer_16/attn": 0.004162810742855072, "grad/layer_16/mlp": 0.0045332531444728374, "grad/layer_16/attn_mlp_ratio": 0.9182832986289865, "grad/layer_20/attn": 0.005451914854347706, "grad/layer_20/mlp": 0.006465110462158918, "grad/layer_20/attn_mlp_ratio": 0.8432825397075852, "grad/layer_24/attn": 0.011221768334507942, "grad/layer_24/mlp": 0.009533287025988102, "grad/layer_24/attn_mlp_ratio": 1.1771142719405752, "grad/layer_27/attn": 0.008278229273855686, "grad/layer_27/mlp": 0.007621235679835081, "grad/layer_27/attn_mlp_ratio": 1.0862056381668357} {"step": 31300, "timestamp": 1778228368.7091818, "train/loss": 2.1993942737579344, "train/z_loss": 0.0014592233579605817, "train/perplexity": 9.019548467246318, "train/grad_norm": 0.1259765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025945.1880309917, "perf/iters_per_sec": 0.9660459461359938, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035147452354431, "data/tokens_consumed": 65642954752, "data/tokens_consumed_B": 65.642954752, "train/loss_slope": -1.8023187106269513e-06} {"step": 31310, "timestamp": 1778228379.058247, "train/loss": 2.171760320663452, "train/z_loss": 0.0014719071099534632, "train/perplexity": 8.773715006853688, "train/grad_norm": 0.12890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027377.237097117, "perf/iters_per_sec": 0.9667288003430924, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344162702560424, "data/tokens_consumed": 65663926272, "data/tokens_consumed_B": 65.663926272, "train/loss_slope": -4.450929776491223e-06} {"step": 31320, "timestamp": 1778228389.40413, "train/loss": 2.1921170473098757, "train/z_loss": 0.0014656142448075116, "train/perplexity": 8.954149421042054, "train/grad_norm": 0.1044921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027906.245188471, "perf/iters_per_sec": 0.9669810510580401, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341464281082153, "data/tokens_consumed": 65684897792, "data/tokens_consumed_B": 65.684897792, "train/loss_slope": -3.570537770291549e-06} {"step": 31330, "timestamp": 1778228399.749941, "train/loss": 2.1708285093307493, "train/z_loss": 0.0014545867103151976, "train/perplexity": 8.765543367584703, "train/grad_norm": 0.1083984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028245.9130868923, "perf/iters_per_sec": 0.9671430173334562, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0339732408523559, "data/tokens_consumed": 65705869312, "data/tokens_consumed_B": 65.705869312, "train/loss_slope": -6.565814829430573e-06} {"step": 31340, "timestamp": 1778228410.097917, "train/loss": 2.164311385154724, "train/z_loss": 0.0014544082921929657, "train/perplexity": 8.708602978316748, "train/grad_norm": 0.1328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027485.325252367, "perf/iters_per_sec": 0.9667803407918772, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343611240386963, "data/tokens_consumed": 65726840832, "data/tokens_consumed_B": 65.726840832, "train/loss_slope": -7.503614717512627e-06} {"step": 31350, "timestamp": 1778228420.4345443, "grad/layer_0/attn": 0.0026494271587580442, "grad/layer_0/mlp": 0.002706783125177026, "grad/layer_0/attn_mlp_ratio": 0.9788102475715503, "grad/layer_4/attn": 0.0019402200123295188, "grad/layer_4/mlp": 0.0026435262989252806, "grad/layer_4/attn_mlp_ratio": 0.7339514419520496, "grad/layer_8/attn": 0.005276658106595278, "grad/layer_8/mlp": 0.0037953821010887623, "grad/layer_8/attn_mlp_ratio": 1.3902837255972769, "grad/layer_12/attn": 0.0047806547954678535, "grad/layer_12/mlp": 0.0063659693114459515, "grad/layer_12/attn_mlp_ratio": 0.7509704314432116, "grad/layer_16/attn": 0.004253305494785309, "grad/layer_16/mlp": 0.004500263370573521, "grad/layer_16/attn_mlp_ratio": 0.9451236627804064, "grad/layer_20/attn": 0.00422297278419137, "grad/layer_20/mlp": 0.006232710089534521, "grad/layer_20/attn_mlp_ratio": 0.6775499992414625, "grad/layer_24/attn": 0.012698731385171413, "grad/layer_24/mlp": 0.008777259849011898, "grad/layer_24/attn_mlp_ratio": 1.446776267188142, "grad/layer_27/attn": 0.007343259174376726, "grad/layer_27/mlp": 0.00791235826909542, "grad/layer_27/attn_mlp_ratio": 0.9280746437191827} {"step": 31350, "timestamp": 1778228421.0411427, "eos/sharpness": 48.227119445800774, "eos/L0_probe": 2.016157627105713, "eos/L_plus": 2.216343402862549, "eos/L_minus": 2.2982430458068848, "eos/grad_norm": 0.1343315839767456, "eos/embed_grad_frac": 0.12539392709732056, "eos/time_s": 0.6038200855255127} {"step": 31350, "timestamp": 1778228421.0604503, "train/loss": 2.222028064727783, "train/z_loss": 0.00145397869637236, "train/perplexity": 9.226022876745578, "train/grad_norm": 0.1337890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1913837.705800216, "perf/iters_per_sec": 0.9125889328957634, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0957836151123046, "data/tokens_consumed": 65747812352, "data/tokens_consumed_B": 65.747812352, "train/loss_slope": -5.855968911977484e-06} {"step": 31350, "timestamp": 1778228422.4301658, "geo/rankme_last": 440.0684509277344, "geo/layer_0/stable_rank_q_proj": 18.66929054260254, "geo/layer_0/stable_rank_k_proj": 16.180883407592773, "geo/layer_0/stable_rank_o_proj": 50.22205352783203, "geo/layer_0/stable_rank_gate_proj": 142.99560546875, "geo/layer_0/stable_rank_down_proj": 52.21682357788086, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05324149131774902, "geo/layer_0/attn_entropy_mean": 6.221919059753418, "geo/layer_0/attn_entropy_std": 0.34926748275756836, "geo/layer_7/stable_rank_q_proj": 42.46894073486328, "geo/layer_7/stable_rank_k_proj": 42.297637939453125, "geo/layer_7/stable_rank_o_proj": 104.88481140136719, "geo/layer_7/stable_rank_gate_proj": 94.6507568359375, "geo/layer_7/stable_rank_down_proj": 147.73777770996094, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.535323441028595, "geo/layer_7/attn_entropy_mean": 4.617982864379883, "geo/layer_7/attn_entropy_std": 0.8091065287590027, "geo/layer_14/stable_rank_q_proj": 54.789222717285156, "geo/layer_14/stable_rank_k_proj": 36.08089065551758, "geo/layer_14/stable_rank_o_proj": 51.90776824951172, "geo/layer_14/stable_rank_gate_proj": 80.54263305664062, "geo/layer_14/stable_rank_down_proj": 134.7786407470703, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3915702700614929, "geo/layer_14/attn_entropy_mean": 5.517294883728027, "geo/layer_14/attn_entropy_std": 0.39464911818504333, "geo/layer_21/stable_rank_q_proj": 45.012474060058594, "geo/layer_21/stable_rank_k_proj": 31.148595809936523, "geo/layer_21/stable_rank_o_proj": 79.24191284179688, "geo/layer_21/stable_rank_gate_proj": 77.88246154785156, "geo/layer_21/stable_rank_down_proj": 57.77420425415039, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14575153589248657, "geo/layer_21/attn_entropy_mean": 5.69590950012207, "geo/layer_21/attn_entropy_std": 0.29591771960258484, "geo/layer_27/stable_rank_q_proj": 41.59142303466797, "geo/layer_27/stable_rank_k_proj": 31.33955955505371, "geo/layer_27/stable_rank_o_proj": 118.46681213378906, "geo/layer_27/stable_rank_gate_proj": 88.17988586425781, "geo/layer_27/stable_rank_down_proj": 135.44744873046875, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08860525488853455, "geo/layer_27/attn_entropy_mean": 4.314830780029297, "geo/layer_27/attn_entropy_std": 0.6320729851722717, "attnres/final_alpha/block_0": 0.23944388329982758, "attnres/block_norm/0": 1.6897157430648804, "attnres/final_alpha/block_1": 0.005761052016168833, "attnres/block_norm/1": 36261.44921875, "attnres/final_alpha/block_2": 0.012450789101421833, "attnres/block_norm/2": 24576.7109375, "attnres/final_alpha/block_3": 0.014292228035628796, "attnres/block_norm/3": 40744.7265625, "attnres/final_alpha/block_4": 0.018039101734757423, "attnres/block_norm/4": 11547.8759765625, "attnres/final_alpha/block_5": 0.5857903957366943, "attnres/block_norm/5": 5653.35107421875, "attnres/final_alpha/block_6": 0.12422256171703339, "attnres/block_norm/6": 26685.154296875, "geo/tier1_time_s": 1.3658106327056885, "geo/step": 31350.0, "geo/rankme_slope": -0.00020634789071878753} {"step": 31360, "timestamp": 1778228432.786305, "train/loss": 2.190749669075012, "train/z_loss": 0.0014632301637902856, "train/perplexity": 8.941914079088539, "train/grad_norm": 0.1875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1789111.1814072228, "perf/iters_per_sec": 0.8531146914516557, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1721753358840943, "data/tokens_consumed": 65768783872, "data/tokens_consumed_B": 65.768783872, "train/loss_slope": -6.366376765239932e-06} {"step": 31370, "timestamp": 1778228443.1690445, "train/loss": 2.2151088953018188, "train/z_loss": 0.0014509942033328117, "train/perplexity": 9.162406800339921, "train/grad_norm": 0.130859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021904.334734078, "perf/iters_per_sec": 0.9641191171331778, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0372162342071534, "data/tokens_consumed": 65789755392, "data/tokens_consumed_B": 65.789755392, "train/loss_slope": -7.061134585023674e-06} {"step": 31380, "timestamp": 1778228453.5507905, "train/loss": 2.22418327331543, "train/z_loss": 0.0014517985284328461, "train/perplexity": 9.245928322968746, "train/grad_norm": 0.193359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021590.8078732453, "perf/iters_per_sec": 0.963969615875838, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373770952224732, "data/tokens_consumed": 65810726912, "data/tokens_consumed_B": 65.810726912, "train/loss_slope": -7.171912297736046e-06} {"step": 31390, "timestamp": 1778228463.9327412, "train/loss": 2.1455065250396728, "train/z_loss": 0.0014563818927854299, "train/perplexity": 8.546369091061319, "train/grad_norm": 0.1806640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021146.7299572711, "perf/iters_per_sec": 0.9637578630243641, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0376050233840943, "data/tokens_consumed": 65831698432, "data/tokens_consumed_B": 65.831698432, "train/loss_slope": -9.642981352692988e-06} {"step": 31400, "timestamp": 1778228474.2921822, "grad/layer_0/attn": 0.00273093837313354, "grad/layer_0/mlp": 0.002639074344187975, "grad/layer_0/attn_mlp_ratio": 1.0348091464975067, "grad/layer_4/attn": 0.001805678359232843, "grad/layer_4/mlp": 0.0025152412708848715, "grad/layer_4/attn_mlp_ratio": 0.7178946641600438, "grad/layer_8/attn": 0.007176793646067381, "grad/layer_8/mlp": 0.003718775464221835, "grad/layer_8/attn_mlp_ratio": 1.929880822901757, "grad/layer_12/attn": 0.005116025917232037, "grad/layer_12/mlp": 0.007205494679510593, "grad/layer_12/attn_mlp_ratio": 0.710017295658845, "grad/layer_16/attn": 0.00400818046182394, "grad/layer_16/mlp": 0.004384508356451988, "grad/layer_16/attn_mlp_ratio": 0.9141687150644526, "grad/layer_20/attn": 0.0048003774136304855, "grad/layer_20/mlp": 0.00634398590773344, "grad/layer_20/attn_mlp_ratio": 0.7566815891111257, "grad/layer_24/attn": 0.010969860479235649, "grad/layer_24/mlp": 0.01007102057337761, "grad/layer_24/attn_mlp_ratio": 1.0892501202220835, "grad/layer_27/attn": 0.00693925004452467, "grad/layer_27/mlp": 0.008959789760410786, "grad/layer_27/attn_mlp_ratio": 0.774488035169892} {"step": 31400, "timestamp": 1778228474.3076978, "train/loss": 2.176490271091461, "train/z_loss": 0.0014575372682884336, "train/perplexity": 8.815312543495567, "train/grad_norm": 0.1357421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023060.4422953066, "perf/iters_per_sec": 0.9646703921772511, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0366235017776488, "data/tokens_consumed": 65852669952, "data/tokens_consumed_B": 65.852669952, "train/loss_slope": -1.1661851974782526e-05} {"step": 31410, "timestamp": 1778228484.6663487, "train/loss": 2.199697732925415, "train/z_loss": 0.0014569217921234668, "train/perplexity": 9.022285947250934, "train/grad_norm": 0.1728515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025592.0164626662, "perf/iters_per_sec": 0.9658775408089953, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035327935218811, "data/tokens_consumed": 65873641472, "data/tokens_consumed_B": 65.873641472, "train/loss_slope": -1.2677988339357408e-05} {"step": 31420, "timestamp": 1778228495.01316, "train/loss": 2.2127472162246704, "train/z_loss": 0.0014533943845890462, "train/perplexity": 9.140793667590593, "train/grad_norm": 0.1904296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027887.9183279774, "perf/iters_per_sec": 0.9669723121299636, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341557741165162, "data/tokens_consumed": 65894612992, "data/tokens_consumed_B": 65.894612992, "train/loss_slope": -1.1584516098075058e-05} {"step": 31425, "timestamp": 1778228501.1787977, "eos/sharpness": 40.79039096832275, "eos/L0_probe": 2.022338628768921, "eos/L_plus": 2.2511987686157227, "eos/L_minus": 2.2013823986053467, "eos/grad_norm": 0.15276673436164856, "eos/embed_grad_frac": 0.11325927078723907, "eos/time_s": 0.6121573448181152} {"step": 31425, "timestamp": 1778228502.5642455, "geo/rankme_last": 439.3995666503906, "geo/layer_0/stable_rank_q_proj": 18.690582275390625, "geo/layer_0/stable_rank_k_proj": 16.20060920715332, "geo/layer_0/stable_rank_o_proj": 50.23351287841797, "geo/layer_0/stable_rank_gate_proj": 143.0521697998047, "geo/layer_0/stable_rank_down_proj": 52.207923889160156, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.054317276924848557, "geo/layer_0/attn_entropy_mean": 6.2211689949035645, "geo/layer_0/attn_entropy_std": 0.35448893904685974, "geo/layer_7/stable_rank_q_proj": 42.432796478271484, "geo/layer_7/stable_rank_k_proj": 42.44670867919922, "geo/layer_7/stable_rank_o_proj": 104.97918701171875, "geo/layer_7/stable_rank_gate_proj": 94.58782958984375, "geo/layer_7/stable_rank_down_proj": 147.44561767578125, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5134575366973877, "geo/layer_7/attn_entropy_mean": 4.653778553009033, "geo/layer_7/attn_entropy_std": 0.8251955509185791, "geo/layer_14/stable_rank_q_proj": 54.746246337890625, "geo/layer_14/stable_rank_k_proj": 36.115562438964844, "geo/layer_14/stable_rank_o_proj": 51.8626594543457, "geo/layer_14/stable_rank_gate_proj": 80.423583984375, "geo/layer_14/stable_rank_down_proj": 134.5385284423828, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39080482721328735, "geo/layer_14/attn_entropy_mean": 5.49098014831543, "geo/layer_14/attn_entropy_std": 0.40153974294662476, "geo/layer_21/stable_rank_q_proj": 44.86178970336914, "geo/layer_21/stable_rank_k_proj": 31.24369239807129, "geo/layer_21/stable_rank_o_proj": 79.1558609008789, "geo/layer_21/stable_rank_gate_proj": 77.73636627197266, "geo/layer_21/stable_rank_down_proj": 57.74596405029297, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1466672420501709, "geo/layer_21/attn_entropy_mean": 5.723095893859863, "geo/layer_21/attn_entropy_std": 0.29684725403785706, "geo/layer_27/stable_rank_q_proj": 41.702571868896484, "geo/layer_27/stable_rank_k_proj": 31.426843643188477, "geo/layer_27/stable_rank_o_proj": 118.43865203857422, "geo/layer_27/stable_rank_gate_proj": 88.2706298828125, "geo/layer_27/stable_rank_down_proj": 135.29457092285156, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07716427743434906, "geo/layer_27/attn_entropy_mean": 4.296785831451416, "geo/layer_27/attn_entropy_std": 0.6400042772293091, "attnres/final_alpha/block_0": 0.2388772815465927, "attnres/block_norm/0": 1.6899526119232178, "attnres/final_alpha/block_1": 0.005735321901738644, "attnres/block_norm/1": 36301.9296875, "attnres/final_alpha/block_2": 0.012405134737491608, "attnres/block_norm/2": 24633.86328125, "attnres/final_alpha/block_3": 0.01416081190109253, "attnres/block_norm/3": 40566.421875, "attnres/final_alpha/block_4": 0.017816122621297836, "attnres/block_norm/4": 11584.787109375, "attnres/final_alpha/block_5": 0.5898933410644531, "attnres/block_norm/5": 5674.3291015625, "attnres/final_alpha/block_6": 0.12111194431781769, "attnres/block_norm/6": 26939.84375, "geo/tier1_time_s": 1.3582870960235596, "geo/step": 31425.0, "geo/rankme_slope": -0.00023899995545093036} {"step": 31430, "timestamp": 1778228507.7439604, "train/loss": 2.1928279638290404, "train/z_loss": 0.0014529337873682379, "train/perplexity": 8.960517337040685, "train/grad_norm": 0.2109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1648109.521117201, "perf/iters_per_sec": 0.7858798604570394, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.272459125518799, "data/tokens_consumed": 65915584512, "data/tokens_consumed_B": 65.915584512, "train/loss_slope": -8.708777321804868e-06} {"step": 31440, "timestamp": 1778228518.0980754, "train/loss": 2.2003403902053833, "train/z_loss": 0.001446799689438194, "train/perplexity": 9.028086048535924, "train/grad_norm": 0.12109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026335.8729549681, "perf/iters_per_sec": 0.9662322392248955, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349478721618652, "data/tokens_consumed": 65936556032, "data/tokens_consumed_B": 65.936556032, "train/loss_slope": -4.3413794366153525e-06} {"step": 31450, "timestamp": 1778228528.435617, "grad/layer_0/attn": 0.0026756254956126213, "grad/layer_0/mlp": 0.002706586616113782, "grad/layer_0/attn_mlp_ratio": 0.9885607875347826, "grad/layer_4/attn": 0.0028585465624928474, "grad/layer_4/mlp": 0.00257371598854661, "grad/layer_4/attn_mlp_ratio": 1.110668956538669, "grad/layer_8/attn": 0.004602245520800352, "grad/layer_8/mlp": 0.003935299348086119, "grad/layer_8/attn_mlp_ratio": 1.1694778457173303, "grad/layer_12/attn": 0.004247257951647043, "grad/layer_12/mlp": 0.006405350752174854, "grad/layer_12/attn_mlp_ratio": 0.6630796734896952, "grad/layer_16/attn": 0.0036842231638729572, "grad/layer_16/mlp": 0.004493468441069126, "grad/layer_16/attn_mlp_ratio": 0.8199062995991009, "grad/layer_20/attn": 0.005155297461897135, "grad/layer_20/mlp": 0.007246812339872122, "grad/layer_20/attn_mlp_ratio": 0.7113882834241128, "grad/layer_24/attn": 0.023687750101089478, "grad/layer_24/mlp": 0.014296197332441807, "grad/layer_24/attn_mlp_ratio": 1.6569266207345308, "grad/layer_27/attn": 0.02131844311952591, "grad/layer_27/mlp": 0.01335760485380888, "grad/layer_27/attn_mlp_ratio": 1.5959779611124838} {"step": 31450, "timestamp": 1778228528.4512603, "train/loss": 2.145907497406006, "train/z_loss": 0.0014715149067342282, "train/perplexity": 8.549796636028804, "train/grad_norm": 0.333984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026675.0149482982, "perf/iters_per_sec": 0.9663939547292224, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347746849060058, "data/tokens_consumed": 65957527552, "data/tokens_consumed_B": 65.957527552, "train/loss_slope": -5.7490988962291605e-06} {"step": 31460, "timestamp": 1778228538.8066301, "train/loss": 2.186073112487793, "train/z_loss": 0.001450278761330992, "train/perplexity": 8.900194340294092, "train/grad_norm": 0.1435546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026370.323555578, "perf/iters_per_sec": 0.9662486665513887, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349302768707276, "data/tokens_consumed": 65978499072, "data/tokens_consumed_B": 65.978499072, "train/loss_slope": -6.716358307564593e-06} {"step": 31470, "timestamp": 1778228549.1743298, "train/loss": 2.1664222955703734, "train/z_loss": 0.0014559031114913523, "train/perplexity": 8.727005475227259, "train/grad_norm": 0.2177734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024128.3447200472, "perf/iters_per_sec": 0.9651796077347027, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036076593399048, "data/tokens_consumed": 65999470592, "data/tokens_consumed_B": 65.999470592, "train/loss_slope": -8.19075771398168e-06} {"step": 31480, "timestamp": 1778228559.5597324, "train/loss": 2.2120847940444945, "train/z_loss": 0.0014499793527647854, "train/perplexity": 9.134740608182517, "train/grad_norm": 0.1494140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020195.1282549666, "perf/iters_per_sec": 0.9633041039728959, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0380937814712525, "data/tokens_consumed": 66020442112, "data/tokens_consumed_B": 66.020442112, "train/loss_slope": -5.979952102590162e-06} {"step": 31490, "timestamp": 1778228569.9470649, "train/loss": 2.22082781791687, "train/z_loss": 0.001464160392060876, "train/perplexity": 9.21495601502167, "train/grad_norm": 0.1298828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019983.9014331768, "perf/iters_per_sec": 0.9632033831754574, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0382023334503174, "data/tokens_consumed": 66041413632, "data/tokens_consumed_B": 66.041413632, "train/loss_slope": -2.423826588763138e-06} {"step": 31500, "timestamp": 1778228580.3167772, "grad/layer_0/attn": 0.0027711617294698954, "grad/layer_0/mlp": 0.002771723549813032, "grad/layer_0/attn_mlp_ratio": 0.9997972668223346, "grad/layer_4/attn": 0.0018933025421574712, "grad/layer_4/mlp": 0.0025812468957155943, "grad/layer_4/attn_mlp_ratio": 0.7334836787413265, "grad/layer_8/attn": 0.005402490962296724, "grad/layer_8/mlp": 0.0037951692938804626, "grad/layer_8/attn_mlp_ratio": 1.4235177410020208, "grad/layer_12/attn": 0.0035789040848612785, "grad/layer_12/mlp": 0.0061272685416042805, "grad/layer_12/attn_mlp_ratio": 0.5840945279533601, "grad/layer_16/attn": 0.003214299213141203, "grad/layer_16/mlp": 0.0042498852126300335, "grad/layer_16/attn_mlp_ratio": 0.7563261068689969, "grad/layer_20/attn": 0.003166486043483019, "grad/layer_20/mlp": 0.005528504028916359, "grad/layer_20/attn_mlp_ratio": 0.5727563857501689, "grad/layer_24/attn": 0.00736994668841362, "grad/layer_24/mlp": 0.009211520664393902, "grad/layer_24/attn_mlp_ratio": 0.8000792569345684, "grad/layer_27/attn": 0.009864180348813534, "grad/layer_27/mlp": 0.0072817131876945496, "grad/layer_27/attn_mlp_ratio": 1.3546510222371877} {"step": 31500, "timestamp": 1778228580.9378977, "eos/sharpness": 23.37086200714111, "eos/L0_probe": 2.0219762325286865, "eos/L_plus": 2.118894100189209, "eos/L_minus": 2.158766984939575, "eos/grad_norm": 0.11818524450063705, "eos/embed_grad_frac": 0.1749572455883026, "eos/time_s": 0.618384599685669} {"step": 31500, "timestamp": 1778228580.9580238, "train/loss": 2.1872140645980833, "train/z_loss": 0.0014511791639961302, "train/perplexity": 8.910354831023097, "train/grad_norm": 0.1181640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1905848.6536019335, "perf/iters_per_sec": 0.9087794559487979, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1003769874572753, "data/tokens_consumed": 66062385152, "data/tokens_consumed_B": 66.062385152, "train/loss_slope": -1.3991108893489785e-06} {"step": 31500, "timestamp": 1778228582.3236775, "geo/rankme_last": 438.5971984863281, "geo/layer_0/stable_rank_q_proj": 18.70294761657715, "geo/layer_0/stable_rank_k_proj": 16.216114044189453, "geo/layer_0/stable_rank_o_proj": 50.2658576965332, "geo/layer_0/stable_rank_gate_proj": 143.1255645751953, "geo/layer_0/stable_rank_down_proj": 52.26689910888672, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05507862567901611, "geo/layer_0/attn_entropy_mean": 6.219423294067383, "geo/layer_0/attn_entropy_std": 0.3552127480506897, "geo/layer_7/stable_rank_q_proj": 42.4883918762207, "geo/layer_7/stable_rank_k_proj": 42.49724197387695, "geo/layer_7/stable_rank_o_proj": 105.05010986328125, "geo/layer_7/stable_rank_gate_proj": 94.61902618408203, "geo/layer_7/stable_rank_down_proj": 147.6219024658203, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5252390503883362, "geo/layer_7/attn_entropy_mean": 4.665011882781982, "geo/layer_7/attn_entropy_std": 0.8037970662117004, "geo/layer_14/stable_rank_q_proj": 54.85087203979492, "geo/layer_14/stable_rank_k_proj": 36.104652404785156, "geo/layer_14/stable_rank_o_proj": 51.76293182373047, "geo/layer_14/stable_rank_gate_proj": 80.427490234375, "geo/layer_14/stable_rank_down_proj": 134.59133911132812, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3867371380329132, "geo/layer_14/attn_entropy_mean": 5.497203826904297, "geo/layer_14/attn_entropy_std": 0.3857010006904602, "geo/layer_21/stable_rank_q_proj": 44.88202667236328, "geo/layer_21/stable_rank_k_proj": 31.15462303161621, "geo/layer_21/stable_rank_o_proj": 79.00748443603516, "geo/layer_21/stable_rank_gate_proj": 77.64399719238281, "geo/layer_21/stable_rank_down_proj": 57.7064094543457, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14895349740982056, "geo/layer_21/attn_entropy_mean": 5.741639137268066, "geo/layer_21/attn_entropy_std": 0.29088568687438965, "geo/layer_27/stable_rank_q_proj": 41.769962310791016, "geo/layer_27/stable_rank_k_proj": 31.330297470092773, "geo/layer_27/stable_rank_o_proj": 118.76341247558594, "geo/layer_27/stable_rank_gate_proj": 88.1700668334961, "geo/layer_27/stable_rank_down_proj": 135.25502014160156, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08447565138339996, "geo/layer_27/attn_entropy_mean": 4.319460868835449, "geo/layer_27/attn_entropy_std": 0.6225051879882812, "attnres/final_alpha/block_0": 0.23866596817970276, "attnres/block_norm/0": 1.69035804271698, "attnres/final_alpha/block_1": 0.0056884875521063805, "attnres/block_norm/1": 36413.98046875, "attnres/final_alpha/block_2": 0.012256610207259655, "attnres/block_norm/2": 24699.109375, "attnres/final_alpha/block_3": 0.014075180515646935, "attnres/block_norm/3": 40797.5703125, "attnres/final_alpha/block_4": 0.017476748675107956, "attnres/block_norm/4": 11627.9677734375, "attnres/final_alpha/block_5": 0.5884654521942139, "attnres/block_norm/5": 5631.31005859375, "attnres/final_alpha/block_6": 0.12337160110473633, "attnres/block_norm/6": 27034.97265625, "geo/tier1_time_s": 1.3575234413146973, "geo/step": 31500.0, "geo/rankme_slope": -0.0002552384430334634} {"step": 31500, "timestamp": 1778228589.2445695, "geo/ww_alpha_mean": 7.838961973496056, "geo/ww_alpha_std": 4.477950906309612, "geo/ww_alpha_min": 1.3444892102558692, "geo/ww_alpha_max": 31.525094733776843, "geo/ww_alpha_healthy_frac": 0.15228426395939088, "geo/ww_alpha_by_type/q_proj": 4.13482757522261, "geo/ww_alpha_by_type/k_proj": 4.6062966523931745, "geo/ww_alpha_by_type/v_proj": 8.54561163754685, "geo/ww_alpha_by_type/o_proj": 7.592579590215399, "geo/ww_alpha_by_type/gate_proj": 8.432190397103279, "geo/ww_alpha_by_type/up_proj": 12.67162701366166, "geo/ww_alpha_by_type/down_proj": 9.009597593199683, "geo/twonn_id/layer_0": 0.7435894012451172, "geo/twonn_id/layer_7": 3.0114262104034424, "geo/twonn_id/layer_14": 4.345608234405518, "geo/twonn_id/layer_21": 5.976420879364014, "geo/twonn_id/layer_27": 5.183547019958496, "geo/tier2_time_s": 6.913632154464722} {"step": 31500, "timestamp": 1778228589.8789957, "eoc/jacobian_sigma/layer_0/attn": 918.3526000976562, "eoc/jacobian_sigma/layer_0/mlp": 7743.34033203125, "eoc/jacobian_sigma/layer_0": 7743.34033203125, "eoc/jacobian_sigma/layer_7/attn": 1.1723500490188599, "eoc/jacobian_sigma/layer_7/mlp": 1.7022141218185425, "eoc/jacobian_sigma/layer_7": 1.7022141218185425, "eoc/jacobian_sigma/layer_14/attn": 1.6526834964752197, "eoc/jacobian_sigma/layer_14/mlp": 6.617771625518799, "eoc/jacobian_sigma/layer_14": 6.617771625518799, "eoc/jacobian_sigma/layer_21/attn": 1.090806245803833, "eoc/jacobian_sigma/layer_21/mlp": 3.949544668197632, "eoc/jacobian_sigma/layer_21": 3.949544668197632, "eoc/jacobian_sigma/layer_27/attn": 3.135976791381836, "eoc/jacobian_sigma/layer_27/mlp": 23.785873413085938, "eoc/jacobian_sigma/layer_27": 23.785873413085938, "eoc/layer0_sigma": 7743.34033203125, "eoc/sigma_max": 23.785873413085938, "eoc/sigma_min": 1.7022141218185425, "eoc/sigma_mean": 9.013850957155228, "eoc/time_s": 0.6282262802124023} {"step": 31510, "timestamp": 1778228600.27828, "train/loss": 2.200669026374817, "train/z_loss": 0.0014336511958390474, "train/perplexity": 9.031053491730107, "train/grad_norm": 0.1318359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1085924.043898867, "perf/iters_per_sec": 0.5178089351171813, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.9312142610549927, "data/tokens_consumed": 66083356672, "data/tokens_consumed_B": 66.083356672, "train/loss_slope": 5.16710051036686e-07} {"step": 31520, "timestamp": 1778228610.644369, "train/loss": 2.146729016304016, "train/z_loss": 0.0014564473880454899, "train/perplexity": 8.556823341429892, "train/grad_norm": 0.2333984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024487.482732255, "perf/iters_per_sec": 0.9653508580838466, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0358927965164184, "data/tokens_consumed": 66104328192, "data/tokens_consumed_B": 66.104328192, "train/loss_slope": -2.344040902140901e-06} {"step": 31530, "timestamp": 1778228621.000617, "train/loss": 2.1708446025848387, "train/z_loss": 0.0014614219893701375, "train/perplexity": 8.765684434836462, "train/grad_norm": 0.1552734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026043.4634722346, "perf/iters_per_sec": 0.9660928075181172, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350972414016724, "data/tokens_consumed": 66125299712, "data/tokens_consumed_B": 66.125299712, "train/loss_slope": 1.895841163019287e-08} {"step": 31540, "timestamp": 1778228631.348657, "train/loss": 2.2215137481689453, "train/z_loss": 0.0014459039084613323, "train/perplexity": 9.221279000439491, "train/grad_norm": 0.130859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027681.15632113, "perf/iters_per_sec": 0.9668737203221941, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342612266540527, "data/tokens_consumed": 66146271232, "data/tokens_consumed_B": 66.146271232, "train/loss_slope": 2.4930850185505193e-06} {"step": 31550, "timestamp": 1778228641.6858566, "grad/layer_0/attn": 0.0028214524500072002, "grad/layer_0/mlp": 0.002749532926827669, "grad/layer_0/attn_mlp_ratio": 1.0261569591919066, "grad/layer_4/attn": 0.0017269803211092949, "grad/layer_4/mlp": 0.0023965344298630953, "grad/layer_4/attn_mlp_ratio": 0.7206156638218564, "grad/layer_8/attn": 0.006535668857395649, "grad/layer_8/mlp": 0.003726705675944686, "grad/layer_8/attn_mlp_ratio": 1.7537388917532446, "grad/layer_12/attn": 0.0046341135166585445, "grad/layer_12/mlp": 0.0068022883497178555, "grad/layer_12/attn_mlp_ratio": 0.6812580135220168, "grad/layer_16/attn": 0.004408629611134529, "grad/layer_16/mlp": 0.004500091541558504, "grad/layer_16/attn_mlp_ratio": 0.9796755182540463, "grad/layer_20/attn": 0.0036055154632776976, "grad/layer_20/mlp": 0.006078382954001427, "grad/layer_20/attn_mlp_ratio": 0.5931701624010305, "grad/layer_24/attn": 0.016480091959238052, "grad/layer_24/mlp": 0.013141012750566006, "grad/layer_24/attn_mlp_ratio": 1.2540960233920042, "grad/layer_27/attn": 0.005788784474134445, "grad/layer_27/mlp": 0.012628431431949139, "grad/layer_27/attn_mlp_ratio": 0.45839298882756613} {"step": 31550, "timestamp": 1778228641.7016485, "train/loss": 2.099238121509552, "train/z_loss": 0.0014688165509141981, "train/perplexity": 8.159950652822868, "train/grad_norm": 0.1875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026561.36358951, "perf/iters_per_sec": 0.966339761538272, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348327159881592, "data/tokens_consumed": 66167242752, "data/tokens_consumed_B": 66.167242752, "train/loss_slope": -5.136479123948488e-06} {"step": 31560, "timestamp": 1778228652.078551, "train/loss": 2.2635594844818114, "train/z_loss": 0.001445019419770688, "train/perplexity": 9.617260804569186, "train/grad_norm": 0.12890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022100.4836778264, "perf/iters_per_sec": 0.9642126482380993, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0371156215667725, "data/tokens_consumed": 66188214272, "data/tokens_consumed_B": 66.188214272, "train/loss_slope": -5.373949479527681e-07} {"step": 31570, "timestamp": 1778228662.4546897, "train/loss": 2.190415143966675, "train/z_loss": 0.0014486981322988867, "train/perplexity": 8.938923284588306, "train/grad_norm": 0.11962890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022190.2507799752, "perf/iters_per_sec": 0.964255452527988, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0370695829391479, "data/tokens_consumed": 66209185792, "data/tokens_consumed_B": 66.209185792, "train/loss_slope": 1.3818873752532983e-06} {"step": 31575, "timestamp": 1778228668.2618349, "eos/sharpness": 20.224952697753903, "eos/L0_probe": 2.0158960819244385, "eos/L_plus": 2.1144015789031982, "eos/L_minus": 2.1196401119232178, "eos/grad_norm": 0.11061538755893707, "eos/embed_grad_frac": 0.1966368705034256, "eos/time_s": 0.630298376083374} {"step": 31575, "timestamp": 1778228669.642145, "geo/rankme_last": 439.25439453125, "geo/layer_0/stable_rank_q_proj": 18.717723846435547, "geo/layer_0/stable_rank_k_proj": 16.237028121948242, "geo/layer_0/stable_rank_o_proj": 50.26063919067383, "geo/layer_0/stable_rank_gate_proj": 143.31590270996094, "geo/layer_0/stable_rank_down_proj": 52.33708953857422, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05996907502412796, "geo/layer_0/attn_entropy_mean": 6.218778610229492, "geo/layer_0/attn_entropy_std": 0.35513612627983093, "geo/layer_7/stable_rank_q_proj": 42.40010452270508, "geo/layer_7/stable_rank_k_proj": 42.54007339477539, "geo/layer_7/stable_rank_o_proj": 104.56851196289062, "geo/layer_7/stable_rank_gate_proj": 94.52214813232422, "geo/layer_7/stable_rank_down_proj": 147.5311737060547, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5156444907188416, "geo/layer_7/attn_entropy_mean": 4.616122245788574, "geo/layer_7/attn_entropy_std": 0.7852992415428162, "geo/layer_14/stable_rank_q_proj": 54.88765335083008, "geo/layer_14/stable_rank_k_proj": 36.05082702636719, "geo/layer_14/stable_rank_o_proj": 51.696529388427734, "geo/layer_14/stable_rank_gate_proj": 80.40814208984375, "geo/layer_14/stable_rank_down_proj": 134.43600463867188, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3903413712978363, "geo/layer_14/attn_entropy_mean": 5.50111198425293, "geo/layer_14/attn_entropy_std": 0.3944922983646393, "geo/layer_21/stable_rank_q_proj": 44.8410758972168, "geo/layer_21/stable_rank_k_proj": 31.149681091308594, "geo/layer_21/stable_rank_o_proj": 79.02436065673828, "geo/layer_21/stable_rank_gate_proj": 77.54984283447266, "geo/layer_21/stable_rank_down_proj": 57.67469024658203, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14345325529575348, "geo/layer_21/attn_entropy_mean": 5.7344865798950195, "geo/layer_21/attn_entropy_std": 0.29596400260925293, "geo/layer_27/stable_rank_q_proj": 41.7856330871582, "geo/layer_27/stable_rank_k_proj": 31.31719970703125, "geo/layer_27/stable_rank_o_proj": 118.5721206665039, "geo/layer_27/stable_rank_gate_proj": 88.09982299804688, "geo/layer_27/stable_rank_down_proj": 135.2010955810547, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08194494992494583, "geo/layer_27/attn_entropy_mean": 4.3027753829956055, "geo/layer_27/attn_entropy_std": 0.6261110901832581, "attnres/final_alpha/block_0": 0.23938238620758057, "attnres/block_norm/0": 1.6905744075775146, "attnres/final_alpha/block_1": 0.005767684895545244, "attnres/block_norm/1": 36310.0078125, "attnres/final_alpha/block_2": 0.012489967048168182, "attnres/block_norm/2": 24727.9375, "attnres/final_alpha/block_3": 0.014355617575347424, "attnres/block_norm/3": 40760.9609375, "attnres/final_alpha/block_4": 0.01798475906252861, "attnres/block_norm/4": 11592.4521484375, "attnres/final_alpha/block_5": 0.5871944427490234, "attnres/block_norm/5": 5690.2890625, "attnres/final_alpha/block_6": 0.12282513082027435, "attnres/block_norm/6": 27026.14453125, "geo/tier1_time_s": 1.3605737686157227, "geo/step": 31575.0, "geo/rankme_slope": -0.00025442800166941776} {"step": 31580, "timestamp": 1778228674.8319895, "train/loss": 2.2076969385147094, "train/z_loss": 0.001457145472522825, "train/perplexity": 9.094746494456407, "train/grad_norm": 0.169921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1695243.9297828756, "perf/iters_per_sec": 0.8083552979387644, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2370797872543335, "data/tokens_consumed": 66230157312, "data/tokens_consumed_B": 66.230157312, "train/loss_slope": 6.075789783224804e-06} {"step": 31590, "timestamp": 1778228685.2170262, "train/loss": 2.1987600564956664, "train/z_loss": 0.0014599154470488428, "train/perplexity": 9.01382992750052, "train/grad_norm": 0.1416015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020232.896721279, "perf/iters_per_sec": 0.963322113381042, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0380743741989136, "data/tokens_consumed": 66251128832, "data/tokens_consumed_B": 66.251128832, "train/loss_slope": 6.102842287440665e-06} {"step": 31600, "timestamp": 1778228695.9924746, "grad/layer_0/attn": 0.0026637441478669643, "grad/layer_0/mlp": 0.002599864499643445, "grad/layer_0/attn_mlp_ratio": 1.0245703365599423, "grad/layer_4/attn": 0.0027580605819821358, "grad/layer_4/mlp": 0.0025213826447725296, "grad/layer_4/attn_mlp_ratio": 1.0938682703767604, "grad/layer_8/attn": 0.0058025186881423, "grad/layer_8/mlp": 0.003660866292193532, "grad/layer_8/attn_mlp_ratio": 1.58501241687368, "grad/layer_12/attn": 0.005039629992097616, "grad/layer_12/mlp": 0.006324035581201315, "grad/layer_12/attn_mlp_ratio": 0.7969009420801192, "grad/layer_16/attn": 0.003533814335241914, "grad/layer_16/mlp": 0.004590337630361319, "grad/layer_16/attn_mlp_ratio": 0.7698375463462375, "grad/layer_20/attn": 0.004613906145095825, "grad/layer_20/mlp": 0.005962417460978031, "grad/layer_20/attn_mlp_ratio": 0.7738314363107092, "grad/layer_24/attn": 0.012248524464666843, "grad/layer_24/mlp": 0.01083331648260355, "grad/layer_24/attn_mlp_ratio": 1.1306347757191806, "grad/layer_27/attn": 0.006150017026811838, "grad/layer_27/mlp": 0.011274165473878384, "grad/layer_27/attn_mlp_ratio": 0.5454964260114369} {"step": 31600, "timestamp": 1778228696.0084531, "train/loss": 2.1983863592147825, "train/z_loss": 0.001452613715082407, "train/perplexity": 9.010462113077002, "train/grad_norm": 0.2021484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1944368.4882847401, "perf/iters_per_sec": 0.9271471444534016, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.078577446937561, "data/tokens_consumed": 66272100352, "data/tokens_consumed_B": 66.272100352, "train/loss_slope": 6.92646737360496e-06} {"step": 31610, "timestamp": 1778228706.9238536, "train/loss": 2.1518593549728395, "train/z_loss": 0.0014638549415394663, "train/perplexity": 8.600835545321718, "train/grad_norm": 0.1572265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1922479.8202312416, "perf/iters_per_sec": 0.9167098141819199, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.090857744216919, "data/tokens_consumed": 66293071872, "data/tokens_consumed_B": 66.293071872, "train/loss_slope": 4.245356471911165e-06} {"step": 31620, "timestamp": 1778228717.2860239, "train/loss": 2.1638644695281983, "train/z_loss": 0.001469883311074227, "train/perplexity": 8.704711837131194, "train/grad_norm": 0.111328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024840.7819357016, "perf/iters_per_sec": 0.9655193242720135, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0357120513916016, "data/tokens_consumed": 66314043392, "data/tokens_consumed_B": 66.314043392, "train/loss_slope": 3.2191734359745164e-06} {"step": 31630, "timestamp": 1778228727.6418, "train/loss": 2.169249987602234, "train/z_loss": 0.0014734470401890575, "train/perplexity": 8.751717681860216, "train/grad_norm": 0.1611328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025965.066339635, "perf/iters_per_sec": 0.9660554248521972, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351372957229614, "data/tokens_consumed": 66335014912, "data/tokens_consumed_B": 66.335014912, "train/loss_slope": 3.521516292330464e-06} {"step": 31640, "timestamp": 1778228737.9916115, "train/loss": 2.174970579147339, "train/z_loss": 0.0014488369575701653, "train/perplexity": 8.801926158205521, "train/grad_norm": 0.1748046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027313.68869812, "perf/iters_per_sec": 0.9666984981051063, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344486951828002, "data/tokens_consumed": 66355986432, "data/tokens_consumed_B": 66.355986432, "train/loss_slope": 1.7940358574812757e-06} {"step": 31650, "timestamp": 1778228748.337874, "grad/layer_0/attn": 0.002648783614858985, "grad/layer_0/mlp": 0.0026648135390132666, "grad/layer_0/attn_mlp_ratio": 0.9939845609015205, "grad/layer_4/attn": 0.0019741440191864967, "grad/layer_4/mlp": 0.0024197923485189676, "grad/layer_4/attn_mlp_ratio": 0.8158319612885685, "grad/layer_8/attn": 0.006856943015009165, "grad/layer_8/mlp": 0.0037168445996940136, "grad/layer_8/attn_mlp_ratio": 1.8448290335008233, "grad/layer_12/attn": 0.004857190884649754, "grad/layer_12/mlp": 0.007599272765219212, "grad/layer_12/attn_mlp_ratio": 0.6391652163038419, "grad/layer_16/attn": 0.005150658078491688, "grad/layer_16/mlp": 0.005082158837467432, "grad/layer_16/attn_mlp_ratio": 1.013478354743937, "grad/layer_20/attn": 0.005109651479870081, "grad/layer_20/mlp": 0.006848411168903112, "grad/layer_20/attn_mlp_ratio": 0.7461075684913528, "grad/layer_24/attn": 0.008335149846971035, "grad/layer_24/mlp": 0.010645189322531223, "grad/layer_24/attn_mlp_ratio": 0.7829968557749811, "grad/layer_27/attn": 0.009833883494138718, "grad/layer_27/mlp": 0.008557227440178394, "grad/layer_27/attn_mlp_ratio": 1.1491903712932834} {"step": 31650, "timestamp": 1778228748.950242, "eos/sharpness": 8.829998970031737, "eos/L0_probe": 2.017530918121338, "eos/L_plus": 2.0782759189605713, "eos/L_minus": 2.045085906982422, "eos/grad_norm": 0.11824357509613037, "eos/embed_grad_frac": 0.1928950846195221, "eos/time_s": 0.6095759868621826} {"step": 31650, "timestamp": 1778228748.9699595, "train/loss": 2.1848937392234804, "train/z_loss": 0.001465116860345006, "train/perplexity": 8.889703876344273, "train/grad_norm": 0.1181640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1911087.9892253669, "perf/iters_per_sec": 0.9112777658583483, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0973602533340454, "data/tokens_consumed": 66376957952, "data/tokens_consumed_B": 66.376957952, "train/loss_slope": 2.314719508583366e-06} {"step": 31650, "timestamp": 1778228750.3315446, "geo/rankme_last": 439.726318359375, "geo/layer_0/stable_rank_q_proj": 18.70949363708496, "geo/layer_0/stable_rank_k_proj": 16.226118087768555, "geo/layer_0/stable_rank_o_proj": 50.32246017456055, "geo/layer_0/stable_rank_gate_proj": 142.9564971923828, "geo/layer_0/stable_rank_down_proj": 52.258583068847656, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05360632762312889, "geo/layer_0/attn_entropy_mean": 6.218591213226318, "geo/layer_0/attn_entropy_std": 0.34946200251579285, "geo/layer_7/stable_rank_q_proj": 42.38872146606445, "geo/layer_7/stable_rank_k_proj": 42.49991226196289, "geo/layer_7/stable_rank_o_proj": 104.51020812988281, "geo/layer_7/stable_rank_gate_proj": 94.59191131591797, "geo/layer_7/stable_rank_down_proj": 147.43853759765625, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5254210829734802, "geo/layer_7/attn_entropy_mean": 4.646684646606445, "geo/layer_7/attn_entropy_std": 0.810559093952179, "geo/layer_14/stable_rank_q_proj": 54.95370864868164, "geo/layer_14/stable_rank_k_proj": 36.06584930419922, "geo/layer_14/stable_rank_o_proj": 51.68196487426758, "geo/layer_14/stable_rank_gate_proj": 80.27067565917969, "geo/layer_14/stable_rank_down_proj": 134.4833984375, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3668636679649353, "geo/layer_14/attn_entropy_mean": 5.4851861000061035, "geo/layer_14/attn_entropy_std": 0.3999825716018677, "geo/layer_21/stable_rank_q_proj": 44.73143768310547, "geo/layer_21/stable_rank_k_proj": 31.08819007873535, "geo/layer_21/stable_rank_o_proj": 78.916015625, "geo/layer_21/stable_rank_gate_proj": 77.49533081054688, "geo/layer_21/stable_rank_down_proj": 57.71402359008789, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14700603485107422, "geo/layer_21/attn_entropy_mean": 5.74165153503418, "geo/layer_21/attn_entropy_std": 0.2847308814525604, "geo/layer_27/stable_rank_q_proj": 41.828365325927734, "geo/layer_27/stable_rank_k_proj": 31.31330108642578, "geo/layer_27/stable_rank_o_proj": 118.52539825439453, "geo/layer_27/stable_rank_gate_proj": 88.03429412841797, "geo/layer_27/stable_rank_down_proj": 135.12745666503906, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08760649710893631, "geo/layer_27/attn_entropy_mean": 4.3089470863342285, "geo/layer_27/attn_entropy_std": 0.6209636926651001, "attnres/final_alpha/block_0": 0.2406582534313202, "attnres/block_norm/0": 1.6908950805664062, "attnres/final_alpha/block_1": 0.00577901117503643, "attnres/block_norm/1": 36377.484375, "attnres/final_alpha/block_2": 0.012372739613056183, "attnres/block_norm/2": 24832.470703125, "attnres/final_alpha/block_3": 0.014378927648067474, "attnres/block_norm/3": 41005.0625, "attnres/final_alpha/block_4": 0.01802992634475231, "attnres/block_norm/4": 11630.248046875, "attnres/final_alpha/block_5": 0.5852718949317932, "attnres/block_norm/5": 5674.689453125, "attnres/final_alpha/block_6": 0.1235092356801033, "attnres/block_norm/6": 26953.84375, "geo/tier1_time_s": 1.3575143814086914, "geo/step": 31650.0, "geo/rankme_slope": -0.00026471614036239496} {"step": 31660, "timestamp": 1778228761.1550157, "train/loss": 2.2324219703674317, "train/z_loss": 0.0014569079503417015, "train/perplexity": 9.322417377813617, "train/grad_norm": 0.298828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1721677.86620524, "perf/iters_per_sec": 0.8209599810625267, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2180861711502076, "data/tokens_consumed": 66397929472, "data/tokens_consumed_B": 66.397929472, "train/loss_slope": 4.136688001800343e-06} {"step": 31670, "timestamp": 1778228771.5070047, "train/loss": 2.2150859117507933, "train/z_loss": 0.0014418161939829589, "train/perplexity": 9.162196218115682, "train/grad_norm": 0.1611328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026943.7385747693, "perf/iters_per_sec": 0.9665220921396109, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346374988555909, "data/tokens_consumed": 66418900992, "data/tokens_consumed_B": 66.418900992, "train/loss_slope": 3.966478338622107e-06} {"step": 31680, "timestamp": 1778228782.2755888, "train/loss": 2.174863576889038, "train/z_loss": 0.0014589697238989174, "train/perplexity": 8.800984382616152, "train/grad_norm": 0.16015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1948578.8035298854, "perf/iters_per_sec": 0.929154779210036, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0762469530105592, "data/tokens_consumed": 66439872512, "data/tokens_consumed_B": 66.439872512, "train/loss_slope": 4.7917858459124584e-06} {"step": 31690, "timestamp": 1778228792.6313944, "train/loss": 2.189006781578064, "train/z_loss": 0.0014600605936720967, "train/perplexity": 8.926342902187736, "train/grad_norm": 0.240234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026391.1438631269, "perf/iters_per_sec": 0.9662585944476733, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349196434020995, "data/tokens_consumed": 66460844032, "data/tokens_consumed_B": 66.460844032, "train/loss_slope": 8.111741526363632e-06} {"step": 31700, "timestamp": 1778228802.9740062, "grad/layer_0/attn": 0.0024818882811814547, "grad/layer_0/mlp": 0.0024787054862827063, "grad/layer_0/attn_mlp_ratio": 1.0012840148972761, "grad/layer_4/attn": 0.0019188560545444489, "grad/layer_4/mlp": 0.002502444665879011, "grad/layer_4/attn_mlp_ratio": 0.7667925704927315, "grad/layer_8/attn": 0.007743521127849817, "grad/layer_8/mlp": 0.0036899971310049295, "grad/layer_8/attn_mlp_ratio": 2.09851678553725, "grad/layer_12/attn": 0.003620302304625511, "grad/layer_12/mlp": 0.0064935931004583836, "grad/layer_12/attn_mlp_ratio": 0.5575191104317951, "grad/layer_16/attn": 0.004658862482756376, "grad/layer_16/mlp": 0.0041486844420433044, "grad/layer_16/attn_mlp_ratio": 1.1229734233930933, "grad/layer_20/attn": 0.00320951989851892, "grad/layer_20/mlp": 0.0051965294405817986, "grad/layer_20/attn_mlp_ratio": 0.6176275672936106, "grad/layer_24/attn": 0.007771858014166355, "grad/layer_24/mlp": 0.008605563081800938, "grad/layer_24/attn_mlp_ratio": 0.9031202084021992, "grad/layer_27/attn": 0.007160429377108812, "grad/layer_27/mlp": 0.007731564808636904, "grad/layer_27/attn_mlp_ratio": 0.9261293750647973} {"step": 31700, "timestamp": 1778228802.9904244, "train/loss": 2.1654559850692747, "train/z_loss": 0.0014633428188972175, "train/perplexity": 8.718576551328118, "train/grad_norm": 0.10009765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025447.0045203648, "perf/iters_per_sec": 0.9658083937265228, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354020595550537, "data/tokens_consumed": 66481815552, "data/tokens_consumed_B": 66.481815552, "train/loss_slope": 6.307244293688989e-06} {"step": 31710, "timestamp": 1778228813.3623648, "train/loss": 2.256275510787964, "train/z_loss": 0.0014430174953304232, "train/perplexity": 9.547463439545929, "train/grad_norm": 0.1650390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022731.3465216507, "perf/iters_per_sec": 0.9645134670837644, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0367921590805054, "data/tokens_consumed": 66502787072, "data/tokens_consumed_B": 66.502787072, "train/loss_slope": 1.0482500552988026e-05} {"step": 31720, "timestamp": 1778228823.7186432, "train/loss": 2.186266314983368, "train/z_loss": 0.0014667312148958444, "train/perplexity": 8.901914046172127, "train/grad_norm": 0.12890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025997.5444155992, "perf/iters_per_sec": 0.9660709116056438, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035120701789856, "data/tokens_consumed": 66523758592, "data/tokens_consumed_B": 66.523758592, "train/loss_slope": 1.1526909204992911e-05} {"step": 31725, "timestamp": 1778228829.5088108, "eos/sharpness": 64.27443027496336, "eos/L0_probe": 2.0188186168670654, "eos/L_plus": 2.305084228515625, "eos/L_minus": 2.3752973079681396, "eos/grad_norm": 0.24638791382312775, "eos/embed_grad_frac": 0.04281553998589516, "eos/time_s": 0.6229047775268555} {"step": 31725, "timestamp": 1778228830.8916876, "geo/rankme_last": 439.7785339355469, "geo/layer_0/stable_rank_q_proj": 18.68617820739746, "geo/layer_0/stable_rank_k_proj": 16.15682029724121, "geo/layer_0/stable_rank_o_proj": 50.35929489135742, "geo/layer_0/stable_rank_gate_proj": 143.07444763183594, "geo/layer_0/stable_rank_down_proj": 52.24433135986328, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05228453129529953, "geo/layer_0/attn_entropy_mean": 6.216554641723633, "geo/layer_0/attn_entropy_std": 0.35029447078704834, "geo/layer_7/stable_rank_q_proj": 42.27745056152344, "geo/layer_7/stable_rank_k_proj": 42.40319061279297, "geo/layer_7/stable_rank_o_proj": 104.23355102539062, "geo/layer_7/stable_rank_gate_proj": 94.5257797241211, "geo/layer_7/stable_rank_down_proj": 147.62892150878906, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5193493366241455, "geo/layer_7/attn_entropy_mean": 4.628295421600342, "geo/layer_7/attn_entropy_std": 0.800234854221344, "geo/layer_14/stable_rank_q_proj": 54.860877990722656, "geo/layer_14/stable_rank_k_proj": 36.035099029541016, "geo/layer_14/stable_rank_o_proj": 51.62299728393555, "geo/layer_14/stable_rank_gate_proj": 80.3730239868164, "geo/layer_14/stable_rank_down_proj": 134.105712890625, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3709132671356201, "geo/layer_14/attn_entropy_mean": 5.511600494384766, "geo/layer_14/attn_entropy_std": 0.3988300561904907, "geo/layer_21/stable_rank_q_proj": 44.7347526550293, "geo/layer_21/stable_rank_k_proj": 31.179378509521484, "geo/layer_21/stable_rank_o_proj": 78.91370391845703, "geo/layer_21/stable_rank_gate_proj": 77.42168426513672, "geo/layer_21/stable_rank_down_proj": 57.66292953491211, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14753657579421997, "geo/layer_21/attn_entropy_mean": 5.731823921203613, "geo/layer_21/attn_entropy_std": 0.2874003052711487, "geo/layer_27/stable_rank_q_proj": 41.73689651489258, "geo/layer_27/stable_rank_k_proj": 31.21246337890625, "geo/layer_27/stable_rank_o_proj": 118.41815185546875, "geo/layer_27/stable_rank_gate_proj": 87.89112091064453, "geo/layer_27/stable_rank_down_proj": 135.19232177734375, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.087254099547863, "geo/layer_27/attn_entropy_mean": 4.2780351638793945, "geo/layer_27/attn_entropy_std": 0.6305046677589417, "attnres/final_alpha/block_0": 0.23961223661899567, "attnres/block_norm/0": 1.6914641857147217, "attnres/final_alpha/block_1": 0.005738001316785812, "attnres/block_norm/1": 36428.69140625, "attnres/final_alpha/block_2": 0.012516777031123638, "attnres/block_norm/2": 24720.87109375, "attnres/final_alpha/block_3": 0.014329493045806885, "attnres/block_norm/3": 40844.6875, "attnres/final_alpha/block_4": 0.017752358689904213, "attnres/block_norm/4": 11650.6435546875, "attnres/final_alpha/block_5": 0.5859339237213135, "attnres/block_norm/5": 5682.8310546875, "attnres/final_alpha/block_6": 0.12411724030971527, "attnres/block_norm/6": 26860.2265625, "geo/tier1_time_s": 1.3627450466156006, "geo/step": 31725.0, "geo/rankme_slope": -0.00024616657209758906} {"step": 31730, "timestamp": 1778228836.150287, "train/loss": 2.1832266807556153, "train/z_loss": 0.001453538122586906, "train/perplexity": 8.874896565975698, "train/grad_norm": 0.14453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1687637.9148229987, "perf/iters_per_sec": 0.8047284673800462, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2426551818847655, "data/tokens_consumed": 66544730112, "data/tokens_consumed_B": 66.544730112, "train/loss_slope": 9.786753950625464e-06} {"step": 31740, "timestamp": 1778228846.5097775, "train/loss": 2.2200857162475587, "train/z_loss": 0.001451593148522079, "train/perplexity": 9.208120117559469, "train/grad_norm": 0.1337890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025534.3636832437, "perf/iters_per_sec": 0.965850049821493, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035357403755188, "data/tokens_consumed": 66565701632, "data/tokens_consumed_B": 66.565701632, "train/loss_slope": 9.803467143093402e-06} {"step": 31750, "timestamp": 1778228856.8514607, "grad/layer_0/attn": 0.002820901805534959, "grad/layer_0/mlp": 0.0026558435056358576, "grad/layer_0/attn_mlp_ratio": 1.0621490661380957, "grad/layer_4/attn": 0.0019173822365701199, "grad/layer_4/mlp": 0.0024932129308581352, "grad/layer_4/attn_mlp_ratio": 0.7690406767648648, "grad/layer_8/attn": 0.003901429707184434, "grad/layer_8/mlp": 0.0037938677705824375, "grad/layer_8/attn_mlp_ratio": 1.028351497804135, "grad/layer_12/attn": 0.004057639744132757, "grad/layer_12/mlp": 0.006802769843488932, "grad/layer_12/attn_mlp_ratio": 0.5964687587320818, "grad/layer_16/attn": 0.005272898823022842, "grad/layer_16/mlp": 0.004739333409816027, "grad/layer_16/attn_mlp_ratio": 1.112582351949215, "grad/layer_20/attn": 0.005060678813606501, "grad/layer_20/mlp": 0.0060517312958836555, "grad/layer_20/attn_mlp_ratio": 0.8362365218404668, "grad/layer_24/attn": 0.00966805499047041, "grad/layer_24/mlp": 0.009853329509496689, "grad/layer_24/attn_mlp_ratio": 0.9811967500967682, "grad/layer_27/attn": 0.005527405068278313, "grad/layer_27/mlp": 0.009240084327757359, "grad/layer_27/attn_mlp_ratio": 0.5981985458567782} {"step": 31750, "timestamp": 1778228856.8675869, "train/loss": 2.1738098859786987, "train/z_loss": 0.0014670733129605652, "train/perplexity": 8.791715749365007, "train/grad_norm": 0.1640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025781.090230873, "perf/iters_per_sec": 0.9659676982073179, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352313041687011, "data/tokens_consumed": 66586673152, "data/tokens_consumed_B": 66.586673152, "train/loss_slope": 1.0255383758476244e-05} {"step": 31760, "timestamp": 1778228867.2231588, "train/loss": 2.1630829334259034, "train/z_loss": 0.00145714043173939, "train/perplexity": 8.697911448291235, "train/grad_norm": 0.109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026090.5980020554, "perf/iters_per_sec": 0.9661152830133702, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035073161125183, "data/tokens_consumed": 66607644672, "data/tokens_consumed_B": 66.607644672, "train/loss_slope": 3.522897417610401e-06} {"step": 31770, "timestamp": 1778228877.5812736, "train/loss": 2.2389732837677, "train/z_loss": 0.0014439310762099922, "train/perplexity": 9.383691951012958, "train/grad_norm": 0.0888671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025774.7452143074, "perf/iters_per_sec": 0.96596467266765, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035234546661377, "data/tokens_consumed": 66628616192, "data/tokens_consumed_B": 66.628616192, "train/loss_slope": 3.975186017480214e-06} {"step": 31780, "timestamp": 1778228887.9320412, "train/loss": 2.1979641199111937, "train/z_loss": 0.0014496050542220473, "train/perplexity": 9.006658344936081, "train/grad_norm": 0.154296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027116.9475573613, "perf/iters_per_sec": 0.9666046846186453, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03454909324646, "data/tokens_consumed": 66649587712, "data/tokens_consumed_B": 66.649587712, "train/loss_slope": 5.784978374431963e-06} {"step": 31790, "timestamp": 1778228898.2836068, "train/loss": 2.148679089546204, "train/z_loss": 0.0014620141941122712, "train/perplexity": 8.573526054129628, "train/grad_norm": 0.109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026904.0840482516, "perf/iters_per_sec": 0.9665031833878763, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346577405929565, "data/tokens_consumed": 66670559232, "data/tokens_consumed_B": 66.670559232, "train/loss_slope": 4.872035422269326e-06} {"step": 31800, "timestamp": 1778228908.6262965, "grad/layer_0/attn": 0.002775835804641247, "grad/layer_0/mlp": 0.0026563520077615976, "grad/layer_0/attn_mlp_ratio": 1.0449803685778438, "grad/layer_4/attn": 0.0018807153683155775, "grad/layer_4/mlp": 0.0024877015966922045, "grad/layer_4/attn_mlp_ratio": 0.7560051797272511, "grad/layer_8/attn": 0.004847424570471048, "grad/layer_8/mlp": 0.0035718162544071674, "grad/layer_8/attn_mlp_ratio": 1.357131523430633, "grad/layer_12/attn": 0.003949603531509638, "grad/layer_12/mlp": 0.006356053054332733, "grad/layer_12/attn_mlp_ratio": 0.6213924640981503, "grad/layer_16/attn": 0.004235398955643177, "grad/layer_16/mlp": 0.004437812138348818, "grad/layer_16/attn_mlp_ratio": 0.9543889484650769, "grad/layer_20/attn": 0.005014478694647551, "grad/layer_20/mlp": 0.006176644004881382, "grad/layer_20/attn_mlp_ratio": 0.811845171827955, "grad/layer_24/attn": 0.01849302276968956, "grad/layer_24/mlp": 0.014071082696318626, "grad/layer_24/attn_mlp_ratio": 1.3142572634515257, "grad/layer_27/attn": 0.009823691099882126, "grad/layer_27/mlp": 0.013516101986169815, "grad/layer_27/attn_mlp_ratio": 0.7268139170045258} {"step": 31800, "timestamp": 1778228909.2396693, "eos/sharpness": 62.7690553665161, "eos/L0_probe": 2.017618179321289, "eos/L_plus": 2.298492908477783, "eos/L_minus": 2.364434003829956, "eos/grad_norm": 0.22096514701843262, "eos/embed_grad_frac": 0.05615340173244476, "eos/time_s": 0.6104996204376221} {"step": 31800, "timestamp": 1778228909.260205, "train/loss": 2.1655516147613527, "train/z_loss": 0.0014511976391077042, "train/perplexity": 8.719410345986208, "train/grad_norm": 0.220703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1911398.2871888147, "perf/iters_per_sec": 0.911425727457435, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0971821069717407, "data/tokens_consumed": 66691530752, "data/tokens_consumed_B": 66.691530752, "train/loss_slope": 6.123871292540012e-06} {"step": 31800, "timestamp": 1778228910.6246262, "geo/rankme_last": 438.72186279296875, "geo/layer_0/stable_rank_q_proj": 18.728458404541016, "geo/layer_0/stable_rank_k_proj": 16.21481704711914, "geo/layer_0/stable_rank_o_proj": 50.253875732421875, "geo/layer_0/stable_rank_gate_proj": 143.0952606201172, "geo/layer_0/stable_rank_down_proj": 52.175689697265625, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.055464159697294235, "geo/layer_0/attn_entropy_mean": 6.219512462615967, "geo/layer_0/attn_entropy_std": 0.3525860905647278, "geo/layer_7/stable_rank_q_proj": 42.34638977050781, "geo/layer_7/stable_rank_k_proj": 42.20705032348633, "geo/layer_7/stable_rank_o_proj": 104.16769409179688, "geo/layer_7/stable_rank_gate_proj": 94.5927963256836, "geo/layer_7/stable_rank_down_proj": 147.43638610839844, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5250048041343689, "geo/layer_7/attn_entropy_mean": 4.603212833404541, "geo/layer_7/attn_entropy_std": 0.7963526248931885, "geo/layer_14/stable_rank_q_proj": 54.921268463134766, "geo/layer_14/stable_rank_k_proj": 36.02051544189453, "geo/layer_14/stable_rank_o_proj": 51.537071228027344, "geo/layer_14/stable_rank_gate_proj": 80.44271850585938, "geo/layer_14/stable_rank_down_proj": 133.8757781982422, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3848126232624054, "geo/layer_14/attn_entropy_mean": 5.49537992477417, "geo/layer_14/attn_entropy_std": 0.39109158515930176, "geo/layer_21/stable_rank_q_proj": 44.66083908081055, "geo/layer_21/stable_rank_k_proj": 31.270566940307617, "geo/layer_21/stable_rank_o_proj": 78.89569091796875, "geo/layer_21/stable_rank_gate_proj": 77.3120346069336, "geo/layer_21/stable_rank_down_proj": 57.635093688964844, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14748375117778778, "geo/layer_21/attn_entropy_mean": 5.7285332679748535, "geo/layer_21/attn_entropy_std": 0.28789880871772766, "geo/layer_27/stable_rank_q_proj": 41.70333480834961, "geo/layer_27/stable_rank_k_proj": 31.22458267211914, "geo/layer_27/stable_rank_o_proj": 118.5096435546875, "geo/layer_27/stable_rank_gate_proj": 87.95693969726562, "geo/layer_27/stable_rank_down_proj": 134.9674835205078, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08063501864671707, "geo/layer_27/attn_entropy_mean": 4.295943260192871, "geo/layer_27/attn_entropy_std": 0.63602614402771, "attnres/final_alpha/block_0": 0.2390843629837036, "attnres/block_norm/0": 1.691812515258789, "attnres/final_alpha/block_1": 0.005719085223972797, "attnres/block_norm/1": 36610.875, "attnres/final_alpha/block_2": 0.012309328652918339, "attnres/block_norm/2": 24805.78515625, "attnres/final_alpha/block_3": 0.014191258698701859, "attnres/block_norm/3": 41121.76953125, "attnres/final_alpha/block_4": 0.017872914671897888, "attnres/block_norm/4": 11596.3525390625, "attnres/final_alpha/block_5": 0.5864803791046143, "attnres/block_norm/5": 5689.35888671875, "attnres/final_alpha/block_6": 0.12434262782335281, "attnres/block_norm/6": 27207.646484375, "geo/tier1_time_s": 1.3602745532989502, "geo/step": 31800.0, "geo/rankme_slope": -0.00026425068074104643} {"step": 31810, "timestamp": 1778228920.9756866, "train/loss": 2.1996888399124144, "train/z_loss": 0.0014557322021573782, "train/perplexity": 9.022205712301474, "train/grad_norm": 0.1884765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1790653.1990330494, "perf/iters_per_sec": 0.853849982754254, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1711659193038941, "data/tokens_consumed": 66712502272, "data/tokens_consumed_B": 66.712502272, "train/loss_slope": 4.995500293895348e-06} {"step": 31820, "timestamp": 1778228931.3244038, "train/loss": 2.160775923728943, "train/z_loss": 0.0014637461164966226, "train/perplexity": 8.677868410867147, "train/grad_norm": 0.10205078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027425.508506638, "perf/iters_per_sec": 0.9667518179448309, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343916416168213, "data/tokens_consumed": 66733473792, "data/tokens_consumed_B": 66.733473792, "train/loss_slope": 2.772006593664724e-06} {"step": 31830, "timestamp": 1778228941.6810734, "train/loss": 2.1414995908737184, "train/z_loss": 0.0014669256168417632, "train/perplexity": 8.512192869416983, "train/grad_norm": 0.173828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026109.3124186518, "perf/iters_per_sec": 0.9661242067425975, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350636005401612, "data/tokens_consumed": 66754445312, "data/tokens_consumed_B": 66.754445312, "train/loss_slope": 1.3601817707023863e-06} {"step": 31840, "timestamp": 1778228952.0284452, "train/loss": 2.1864007234573366, "train/z_loss": 0.001461500022560358, "train/perplexity": 8.903110619267453, "train/grad_norm": 0.146484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027769.3627640796, "perf/iters_per_sec": 0.9669157804317854, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342162370681762, "data/tokens_consumed": 66775416832, "data/tokens_consumed_B": 66.775416832, "train/loss_slope": -2.3445035829724794e-07} {"step": 31850, "timestamp": 1778228962.3651626, "grad/layer_0/attn": 0.00261776614934206, "grad/layer_0/mlp": 0.0027642850764095783, "grad/layer_0/attn_mlp_ratio": 0.9469956904888424, "grad/layer_4/attn": 0.0017077423399314284, "grad/layer_4/mlp": 0.002463560551404953, "grad/layer_4/attn_mlp_ratio": 0.6932008509542941, "grad/layer_8/attn": 0.011338314972817898, "grad/layer_8/mlp": 0.0036807660944759846, "grad/layer_8/attn_mlp_ratio": 3.0804224918806846, "grad/layer_12/attn": 0.005258697550743818, "grad/layer_12/mlp": 0.006238524802029133, "grad/layer_12/attn_mlp_ratio": 0.842939257809708, "grad/layer_16/attn": 0.0038876342587172985, "grad/layer_16/mlp": 0.004425684921443462, "grad/layer_16/attn_mlp_ratio": 0.8784254279011761, "grad/layer_20/attn": 0.004287364427000284, "grad/layer_20/mlp": 0.006558945868164301, "grad/layer_20/attn_mlp_ratio": 0.6536666787331715, "grad/layer_24/attn": 0.019539734348654747, "grad/layer_24/mlp": 0.01437439490109682, "grad/layer_24/attn_mlp_ratio": 1.3593430782418177, "grad/layer_27/attn": 0.005378311034291983, "grad/layer_27/mlp": 0.014206216670572758, "grad/layer_27/attn_mlp_ratio": 0.3785885518396988} {"step": 31850, "timestamp": 1778228962.3809795, "train/loss": 2.2116755485534667, "train/z_loss": 0.0014547377126291395, "train/perplexity": 9.131003021624297, "train/grad_norm": 0.25, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026695.4679430444, "perf/iters_per_sec": 0.9664037074771139, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347642421722412, "data/tokens_consumed": 66796388352, "data/tokens_consumed_B": 66.796388352, "train/loss_slope": 3.326143962357652e-06} {"step": 31860, "timestamp": 1778228972.7318966, "train/loss": 2.2004695177078246, "train/z_loss": 0.0014486122177913784, "train/perplexity": 9.029251898009191, "train/grad_norm": 0.154296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027696.300919858, "perf/iters_per_sec": 0.9668809418296137, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342535018920898, "data/tokens_consumed": 66817359872, "data/tokens_consumed_B": 66.817359872, "train/loss_slope": 3.4201474508793994e-06} {"step": 31870, "timestamp": 1778228983.080152, "train/loss": 2.2038668394088745, "train/z_loss": 0.0014438887708820402, "train/perplexity": 9.059979337370681, "train/grad_norm": 0.13671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027534.0691636743, "perf/iters_per_sec": 0.9668035837000247, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034336256980896, "data/tokens_consumed": 66838331392, "data/tokens_consumed_B": 66.838331392, "train/loss_slope": 7.477961822633351e-06} {"step": 31875, "timestamp": 1778228988.8546731, "eos/sharpness": 56.559562683105455, "eos/L0_probe": 2.0199339389801025, "eos/L_plus": 2.28511643409729, "eos/L_minus": 2.3203470706939697, "eos/grad_norm": 0.19823357462882996, "eos/embed_grad_frac": 0.06385254859924316, "eos/time_s": 0.6089460849761963} {"step": 31875, "timestamp": 1778228990.2316592, "geo/rankme_last": 440.26446533203125, "geo/layer_0/stable_rank_q_proj": 18.71527671813965, "geo/layer_0/stable_rank_k_proj": 16.163789749145508, "geo/layer_0/stable_rank_o_proj": 50.2457160949707, "geo/layer_0/stable_rank_gate_proj": 143.06517028808594, "geo/layer_0/stable_rank_down_proj": 52.16825485229492, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05733920633792877, "geo/layer_0/attn_entropy_mean": 6.219675064086914, "geo/layer_0/attn_entropy_std": 0.3512116074562073, "geo/layer_7/stable_rank_q_proj": 42.30625915527344, "geo/layer_7/stable_rank_k_proj": 42.15855407714844, "geo/layer_7/stable_rank_o_proj": 103.89726257324219, "geo/layer_7/stable_rank_gate_proj": 94.45025634765625, "geo/layer_7/stable_rank_down_proj": 147.6866912841797, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5209578275680542, "geo/layer_7/attn_entropy_mean": 4.636863708496094, "geo/layer_7/attn_entropy_std": 0.8068708181381226, "geo/layer_14/stable_rank_q_proj": 54.96308517456055, "geo/layer_14/stable_rank_k_proj": 36.151893615722656, "geo/layer_14/stable_rank_o_proj": 51.43567657470703, "geo/layer_14/stable_rank_gate_proj": 80.17809295654297, "geo/layer_14/stable_rank_down_proj": 133.9911651611328, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38192710280418396, "geo/layer_14/attn_entropy_mean": 5.478711128234863, "geo/layer_14/attn_entropy_std": 0.3830300271511078, "geo/layer_21/stable_rank_q_proj": 44.66603469848633, "geo/layer_21/stable_rank_k_proj": 31.31839942932129, "geo/layer_21/stable_rank_o_proj": 78.8952865600586, "geo/layer_21/stable_rank_gate_proj": 77.45521545410156, "geo/layer_21/stable_rank_down_proj": 57.574405670166016, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15276917815208435, "geo/layer_21/attn_entropy_mean": 5.726117134094238, "geo/layer_21/attn_entropy_std": 0.2872787415981293, "geo/layer_27/stable_rank_q_proj": 41.69181823730469, "geo/layer_27/stable_rank_k_proj": 31.158992767333984, "geo/layer_27/stable_rank_o_proj": 118.57064056396484, "geo/layer_27/stable_rank_gate_proj": 87.8281021118164, "geo/layer_27/stable_rank_down_proj": 134.63230895996094, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08193560689687729, "geo/layer_27/attn_entropy_mean": 4.302222728729248, "geo/layer_27/attn_entropy_std": 0.6367390155792236, "attnres/final_alpha/block_0": 0.2402420938014984, "attnres/block_norm/0": 1.6924798488616943, "attnres/final_alpha/block_1": 0.005823940970003605, "attnres/block_norm/1": 36340.8828125, "attnres/final_alpha/block_2": 0.012598229572176933, "attnres/block_norm/2": 24693.283203125, "attnres/final_alpha/block_3": 0.0146124716848135, "attnres/block_norm/3": 41008.640625, "attnres/final_alpha/block_4": 0.017632126808166504, "attnres/block_norm/4": 11672.19921875, "attnres/final_alpha/block_5": 0.5845903158187866, "attnres/block_norm/5": 5711.2314453125, "attnres/final_alpha/block_6": 0.1245008111000061, "attnres/block_norm/6": 27016.986328125, "geo/tier1_time_s": 1.355879306793213, "geo/step": 31875.0, "geo/rankme_slope": -0.00022191771239745897} {"step": 31880, "timestamp": 1778228995.408135, "train/loss": 2.1787461757659914, "train/z_loss": 0.0014548744773492217, "train/perplexity": 8.835221496176363, "train/grad_norm": 0.1728515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1701851.010828638, "perf/iters_per_sec": 0.8115057996886434, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2322770833969117, "data/tokens_consumed": 66859302912, "data/tokens_consumed_B": 66.859302912, "train/loss_slope": 8.872718378977469e-06} {"step": 31890, "timestamp": 1778229006.2604027, "train/loss": 2.13180615901947, "train/z_loss": 0.0014648053329437971, "train/perplexity": 8.430079132829512, "train/grad_norm": 0.12158203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1933326.9114431345, "perf/iters_per_sec": 0.921882110330169, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0847373962402345, "data/tokens_consumed": 66880274432, "data/tokens_consumed_B": 66.880274432, "train/loss_slope": 8.2449711016481e-06} {"step": 31900, "timestamp": 1778229016.6085327, "grad/layer_0/attn": 0.0034795452374964952, "grad/layer_0/mlp": 0.0030301716178655624, "grad/layer_0/attn_mlp_ratio": 1.148299687764055, "grad/layer_4/attn": 0.002266723895445466, "grad/layer_4/mlp": 0.0026268374640494585, "grad/layer_4/attn_mlp_ratio": 0.8629098070118765, "grad/layer_8/attn": 0.003658825531601906, "grad/layer_8/mlp": 0.003963563125580549, "grad/layer_8/attn_mlp_ratio": 0.9231152181421283, "grad/layer_12/attn": 0.004863034933805466, "grad/layer_12/mlp": 0.006780050694942474, "grad/layer_12/attn_mlp_ratio": 0.717256416048241, "grad/layer_16/attn": 0.004611698444932699, "grad/layer_16/mlp": 0.00561620993539691, "grad/layer_16/attn_mlp_ratio": 0.8211406653004174, "grad/layer_20/attn": 0.00515461852774024, "grad/layer_20/mlp": 0.007750381715595722, "grad/layer_20/attn_mlp_ratio": 0.6650793019471438, "grad/layer_24/attn": 0.024307111278176308, "grad/layer_24/mlp": 0.01537671685218811, "grad/layer_24/attn_mlp_ratio": 1.5807737993588677, "grad/layer_27/attn": 0.012741363607347012, "grad/layer_27/mlp": 0.016388075426220894, "grad/layer_27/attn_mlp_ratio": 0.7774777207342528} {"step": 31900, "timestamp": 1778229016.6245556, "train/loss": 2.20511155128479, "train/z_loss": 0.0014422885607928038, "train/perplexity": 9.071263422507966, "train/grad_norm": 0.32421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024408.4605035896, "perf/iters_per_sec": 0.9653131773488949, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0359332323074342, "data/tokens_consumed": 66901245952, "data/tokens_consumed_B": 66.901245952, "train/loss_slope": 1.0540791914121781e-05} {"step": 31910, "timestamp": 1778229026.974593, "train/loss": 2.2366326808929444, "train/z_loss": 0.0014494610484689474, "train/perplexity": 9.361754138525463, "train/grad_norm": 0.173828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027523.6939449853, "perf/iters_per_sec": 0.9667986364102293, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343415498733521, "data/tokens_consumed": 66922217472, "data/tokens_consumed_B": 66.922217472, "train/loss_slope": 1.2367961230021876e-05} {"step": 31920, "timestamp": 1778229037.3257866, "train/loss": 2.1628780364990234, "train/z_loss": 0.0014469034154899417, "train/perplexity": 8.69612945553386, "train/grad_norm": 0.2470703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026908.2409281721, "perf/iters_per_sec": 0.9665051655426846, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346556186676026, "data/tokens_consumed": 66943188992, "data/tokens_consumed_B": 66.943188992, "train/loss_slope": 9.200306536734254e-06} {"step": 31930, "timestamp": 1778229047.6758733, "train/loss": 2.2021487236022947, "train/z_loss": 0.0014441925915889442, "train/perplexity": 9.044426608184652, "train/grad_norm": 0.2451171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027026.929378193, "perf/iters_per_sec": 0.9665617606059042, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345950365066527, "data/tokens_consumed": 66964160512, "data/tokens_consumed_B": 66.964160512, "train/loss_slope": 1.035826351418704e-05} {"step": 31940, "timestamp": 1778229058.034724, "train/loss": 2.146396255493164, "train/z_loss": 0.0014639224275015295, "train/perplexity": 8.553976439651425, "train/grad_norm": 0.12353515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025897.1737255247, "perf/iters_per_sec": 0.9660230511310218, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351719856262207, "data/tokens_consumed": 66985132032, "data/tokens_consumed_B": 66.985132032, "train/loss_slope": 5.891956633502802e-06} {"step": 31950, "timestamp": 1778229068.3768108, "grad/layer_0/attn": 0.0027673605363816023, "grad/layer_0/mlp": 0.0028076358139514923, "grad/layer_0/attn_mlp_ratio": 0.985655056850585, "grad/layer_4/attn": 0.0025490145199000835, "grad/layer_4/mlp": 0.002451315987855196, "grad/layer_4/attn_mlp_ratio": 1.0398555015115858, "grad/layer_8/attn": 0.004339432343840599, "grad/layer_8/mlp": 0.003621810581535101, "grad/layer_8/attn_mlp_ratio": 1.19813892149695, "grad/layer_12/attn": 0.004386283922940493, "grad/layer_12/mlp": 0.006879232358187437, "grad/layer_12/attn_mlp_ratio": 0.637612400743935, "grad/layer_16/attn": 0.004549791105091572, "grad/layer_16/mlp": 0.004477699287235737, "grad/layer_16/attn_mlp_ratio": 1.0161001692211278, "grad/layer_20/attn": 0.0059033227153122425, "grad/layer_20/mlp": 0.006558587308973074, "grad/layer_20/attn_mlp_ratio": 0.9000905754851515, "grad/layer_24/attn": 0.018768806010484695, "grad/layer_24/mlp": 0.013356312178075314, "grad/layer_24/attn_mlp_ratio": 1.4052386332187001, "grad/layer_27/attn": 0.008843110874295235, "grad/layer_27/mlp": 0.012164944782853127, "grad/layer_27/attn_mlp_ratio": 0.7269339038896822} {"step": 31950, "timestamp": 1778229068.984054, "eos/sharpness": 65.42270183563231, "eos/L0_probe": 2.0156402587890625, "eos/L_plus": 2.2840418815612793, "eos/L_minus": 2.401465654373169, "eos/grad_norm": 0.23224696516990662, "eos/embed_grad_frac": 0.04439937323331833, "eos/time_s": 0.6043739318847656} {"step": 31950, "timestamp": 1778229069.0043154, "train/loss": 2.1559814691543577, "train/z_loss": 0.0014657380525022745, "train/perplexity": 8.636362343950806, "train/grad_norm": 0.232421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1912807.1899669932, "perf/iters_per_sec": 0.9120975446543661, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.096373963356018, "data/tokens_consumed": 67006103552, "data/tokens_consumed_B": 67.006103552, "train/loss_slope": 1.2012182324989997e-06} {"step": 31950, "timestamp": 1778229070.368081, "geo/rankme_last": 440.1268005371094, "geo/layer_0/stable_rank_q_proj": 18.742063522338867, "geo/layer_0/stable_rank_k_proj": 16.18121910095215, "geo/layer_0/stable_rank_o_proj": 50.22032165527344, "geo/layer_0/stable_rank_gate_proj": 142.67861938476562, "geo/layer_0/stable_rank_down_proj": 52.15978240966797, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05848388001322746, "geo/layer_0/attn_entropy_mean": 6.226541519165039, "geo/layer_0/attn_entropy_std": 0.34718990325927734, "geo/layer_7/stable_rank_q_proj": 42.3982048034668, "geo/layer_7/stable_rank_k_proj": 42.119422912597656, "geo/layer_7/stable_rank_o_proj": 103.82363891601562, "geo/layer_7/stable_rank_gate_proj": 94.3723373413086, "geo/layer_7/stable_rank_down_proj": 147.5557098388672, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5280120372772217, "geo/layer_7/attn_entropy_mean": 4.656764984130859, "geo/layer_7/attn_entropy_std": 0.8378876447677612, "geo/layer_14/stable_rank_q_proj": 54.8963508605957, "geo/layer_14/stable_rank_k_proj": 36.039794921875, "geo/layer_14/stable_rank_o_proj": 51.499237060546875, "geo/layer_14/stable_rank_gate_proj": 80.17662048339844, "geo/layer_14/stable_rank_down_proj": 134.27822875976562, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3721091151237488, "geo/layer_14/attn_entropy_mean": 5.501918792724609, "geo/layer_14/attn_entropy_std": 0.3801082968711853, "geo/layer_21/stable_rank_q_proj": 44.66603088378906, "geo/layer_21/stable_rank_k_proj": 31.322065353393555, "geo/layer_21/stable_rank_o_proj": 78.8484115600586, "geo/layer_21/stable_rank_gate_proj": 77.3124008178711, "geo/layer_21/stable_rank_down_proj": 57.54540252685547, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14438529312610626, "geo/layer_21/attn_entropy_mean": 5.707265853881836, "geo/layer_21/attn_entropy_std": 0.29360634088516235, "geo/layer_27/stable_rank_q_proj": 41.78542709350586, "geo/layer_27/stable_rank_k_proj": 31.193408966064453, "geo/layer_27/stable_rank_o_proj": 118.48907470703125, "geo/layer_27/stable_rank_gate_proj": 87.8057632446289, "geo/layer_27/stable_rank_down_proj": 134.81349182128906, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08675173670053482, "geo/layer_27/attn_entropy_mean": 4.290135383605957, "geo/layer_27/attn_entropy_std": 0.6301651000976562, "attnres/final_alpha/block_0": 0.23873327672481537, "attnres/block_norm/0": 1.6929051876068115, "attnres/final_alpha/block_1": 0.005720142275094986, "attnres/block_norm/1": 36580.90625, "attnres/final_alpha/block_2": 0.012181148864328861, "attnres/block_norm/2": 24805.541015625, "attnres/final_alpha/block_3": 0.014183539897203445, "attnres/block_norm/3": 40941.96875, "attnres/final_alpha/block_4": 0.0176593866199255, "attnres/block_norm/4": 11684.4296875, "attnres/final_alpha/block_5": 0.5874513387680054, "attnres/block_norm/5": 5687.103515625, "attnres/final_alpha/block_6": 0.12407118082046509, "attnres/block_norm/6": 27143.591796875, "geo/tier1_time_s": 1.359727144241333, "geo/step": 31950.0, "geo/rankme_slope": -0.00019944553993472388} {"step": 31960, "timestamp": 1778229080.7201393, "train/loss": 2.1628062248229982, "train/z_loss": 0.0014523385907523334, "train/perplexity": 8.695504994324798, "train/grad_norm": 0.10302734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1790617.0383975604, "perf/iters_per_sec": 0.8538327400195886, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1711895704269408, "data/tokens_consumed": 67027075072, "data/tokens_consumed_B": 67.027075072, "train/loss_slope": -1.7819635795824929e-06} {"step": 31970, "timestamp": 1778229091.0702188, "train/loss": 2.256879734992981, "train/z_loss": 0.0014454041374847293, "train/perplexity": 9.553233991230556, "train/grad_norm": 0.16015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027334.9022059802, "perf/iters_per_sec": 0.9667086134939099, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344378709793092, "data/tokens_consumed": 67048046592, "data/tokens_consumed_B": 67.048046592, "train/loss_slope": 3.47804573967828e-06} {"step": 31980, "timestamp": 1778229101.4196236, "train/loss": 2.1502827048301696, "train/z_loss": 0.0014631370780989529, "train/perplexity": 8.587285721205046, "train/grad_norm": 0.173828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027341.4906324723, "perf/iters_per_sec": 0.9667117551004755, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344345092773437, "data/tokens_consumed": 67069018112, "data/tokens_consumed_B": 67.069018112, "train/loss_slope": 5.645443432472741e-07} {"step": 31990, "timestamp": 1778229111.7828474, "train/loss": 2.1634382367134095, "train/z_loss": 0.0014630366931669414, "train/perplexity": 8.701002393902302, "train/grad_norm": 0.1279296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024700.3049880897, "perf/iters_per_sec": 0.9654523396435212, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0357839107513427, "data/tokens_consumed": 67089989632, "data/tokens_consumed_B": 67.089989632, "train/loss_slope": -1.751507219165268e-06} {"step": 32000, "timestamp": 1778229122.121481, "grad/layer_0/attn": 0.002821682719513774, "grad/layer_0/mlp": 0.002747995313256979, "grad/layer_0/attn_mlp_ratio": 1.0268149305858771, "grad/layer_4/attn": 0.001811614609323442, "grad/layer_4/mlp": 0.002501150593161583, "grad/layer_4/attn_mlp_ratio": 0.7243124591719294, "grad/layer_8/attn": 0.003692715661600232, "grad/layer_8/mlp": 0.0037959343753755093, "grad/layer_8/attn_mlp_ratio": 0.972808062298002, "grad/layer_12/attn": 0.005520664155483246, "grad/layer_12/mlp": 0.00634619640186429, "grad/layer_12/attn_mlp_ratio": 0.8699169894694355, "grad/layer_16/attn": 0.004057849291712046, "grad/layer_16/mlp": 0.004441744182258844, "grad/layer_16/attn_mlp_ratio": 0.9135711184274731, "grad/layer_20/attn": 0.00385344959795475, "grad/layer_20/mlp": 0.005992122460156679, "grad/layer_20/attn_mlp_ratio": 0.6430859114226782, "grad/layer_24/attn": 0.004648379981517792, "grad/layer_24/mlp": 0.007394406478852034, "grad/layer_24/attn_mlp_ratio": 0.6286346215816879, "grad/layer_27/attn": 0.004742998629808426, "grad/layer_27/mlp": 0.006615510676056147, "grad/layer_27/attn_mlp_ratio": 0.7169512363240343} {"step": 32000, "timestamp": 1778229122.1374326, "train/loss": 2.203294372558594, "train/z_loss": 0.0014619170455262066, "train/perplexity": 9.05479428381305, "train/grad_norm": 0.08837890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026477.6507539093, "perf/iters_per_sec": 0.9662998441476389, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034875464439392, "data/tokens_consumed": 67110961152, "data/tokens_consumed_B": 67.110961152, "train/loss_slope": -2.2816021664880552e-06} {"step": 32000, "timestamp": 1778229129.103922, "geo/ww_alpha_mean": 8.041706326536668, "geo/ww_alpha_std": 4.793371532902682, "geo/ww_alpha_min": 1.3654553788534707, "geo/ww_alpha_max": 26.26916882214205, "geo/ww_alpha_healthy_frac": 0.15228426395939088, "geo/ww_alpha_by_type/q_proj": 4.113805334879926, "geo/ww_alpha_by_type/k_proj": 4.551005830011635, "geo/ww_alpha_by_type/v_proj": 8.426368731491367, "geo/ww_alpha_by_type/o_proj": 8.998129923533167, "geo/ww_alpha_by_type/gate_proj": 8.608866066800399, "geo/ww_alpha_by_type/up_proj": 12.565525022254187, "geo/ww_alpha_by_type/down_proj": 9.155016019149263, "geo/twonn_id/layer_0": 0.7247359752655029, "geo/twonn_id/layer_7": 3.03995943069458, "geo/twonn_id/layer_14": 4.216907501220703, "geo/twonn_id/layer_21": 9.117034912109375, "geo/twonn_id/layer_27": 6.359879016876221, "geo/tier2_time_s": 6.960192918777466} {"step": 32000, "timestamp": 1778229129.7155588, "eoc/jacobian_sigma/layer_0/attn": 995.0257568359375, "eoc/jacobian_sigma/layer_0/mlp": 7527.27099609375, "eoc/jacobian_sigma/layer_0": 7527.27099609375, "eoc/jacobian_sigma/layer_7/attn": 1.1653155088424683, "eoc/jacobian_sigma/layer_7/mlp": 1.6996036767959595, "eoc/jacobian_sigma/layer_7": 1.6996036767959595, "eoc/jacobian_sigma/layer_14/attn": 1.6293680667877197, "eoc/jacobian_sigma/layer_14/mlp": 7.525699138641357, "eoc/jacobian_sigma/layer_14": 7.525699138641357, "eoc/jacobian_sigma/layer_21/attn": 1.0915530920028687, "eoc/jacobian_sigma/layer_21/mlp": 3.594999074935913, "eoc/jacobian_sigma/layer_21": 3.594999074935913, "eoc/jacobian_sigma/layer_27/attn": 3.1292946338653564, "eoc/jacobian_sigma/layer_27/mlp": 20.260509490966797, "eoc/jacobian_sigma/layer_27": 20.260509490966797, "eoc/layer0_sigma": 7527.27099609375, "eoc/sigma_max": 20.260509490966797, "eoc/sigma_min": 1.6996036767959595, "eoc/sigma_mean": 8.270202845335007, "eoc/time_s": 0.6060633659362793} {"step": 32010, "timestamp": 1778229140.0864851, "train/loss": 2.2238981246948244, "train/z_loss": 0.0014510027365759015, "train/perplexity": 9.243292235117504, "train/grad_norm": 0.126953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1168729.5786341273, "perf/iters_per_sec": 0.557293690983833, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.7943860054016114, "data/tokens_consumed": 67131932672, "data/tokens_consumed_B": 67.131932672, "train/loss_slope": 2.1890577596119988e-06} {"step": 32020, "timestamp": 1778229150.434711, "train/loss": 2.19444797039032, "train/z_loss": 0.001448457094375044, "train/perplexity": 8.975045198357206, "train/grad_norm": 0.12255859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027704.6679402466, "perf/iters_per_sec": 0.9668849315358384, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034249234199524, "data/tokens_consumed": 67152904192, "data/tokens_consumed_B": 67.152904192, "train/loss_slope": 2.960105003839904e-06} {"step": 32025, "timestamp": 1778229156.2186368, "eos/sharpness": 36.62190437316894, "eos/L0_probe": 2.0147526264190674, "eos/L_plus": 2.156409978866577, "eos/L_minus": 2.239314317703247, "eos/grad_norm": 0.11143267154693604, "eos/embed_grad_frac": 0.18682776391506195, "eos/time_s": 0.6201963424682617} {"step": 32025, "timestamp": 1778229157.5956945, "geo/rankme_last": 439.5064392089844, "geo/layer_0/stable_rank_q_proj": 18.7662410736084, "geo/layer_0/stable_rank_k_proj": 16.148513793945312, "geo/layer_0/stable_rank_o_proj": 50.28116226196289, "geo/layer_0/stable_rank_gate_proj": 142.5634765625, "geo/layer_0/stable_rank_down_proj": 52.26163101196289, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05659569427371025, "geo/layer_0/attn_entropy_mean": 6.2217912673950195, "geo/layer_0/attn_entropy_std": 0.3520245850086212, "geo/layer_7/stable_rank_q_proj": 42.414100646972656, "geo/layer_7/stable_rank_k_proj": 42.103721618652344, "geo/layer_7/stable_rank_o_proj": 103.68930053710938, "geo/layer_7/stable_rank_gate_proj": 94.5419921875, "geo/layer_7/stable_rank_down_proj": 147.10189819335938, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.520737886428833, "geo/layer_7/attn_entropy_mean": 4.627648830413818, "geo/layer_7/attn_entropy_std": 0.8219183683395386, "geo/layer_14/stable_rank_q_proj": 55.0024299621582, "geo/layer_14/stable_rank_k_proj": 36.029624938964844, "geo/layer_14/stable_rank_o_proj": 51.41563415527344, "geo/layer_14/stable_rank_gate_proj": 80.14671325683594, "geo/layer_14/stable_rank_down_proj": 134.2695770263672, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3838842511177063, "geo/layer_14/attn_entropy_mean": 5.501354217529297, "geo/layer_14/attn_entropy_std": 0.38607320189476013, "geo/layer_21/stable_rank_q_proj": 44.687870025634766, "geo/layer_21/stable_rank_k_proj": 31.154077529907227, "geo/layer_21/stable_rank_o_proj": 78.96678161621094, "geo/layer_21/stable_rank_gate_proj": 77.31317901611328, "geo/layer_21/stable_rank_down_proj": 57.600982666015625, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14397411048412323, "geo/layer_21/attn_entropy_mean": 5.7332868576049805, "geo/layer_21/attn_entropy_std": 0.2846081852912903, "geo/layer_27/stable_rank_q_proj": 41.90421676635742, "geo/layer_27/stable_rank_k_proj": 31.143022537231445, "geo/layer_27/stable_rank_o_proj": 118.45573425292969, "geo/layer_27/stable_rank_gate_proj": 87.68551635742188, "geo/layer_27/stable_rank_down_proj": 134.76983642578125, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09061003476381302, "geo/layer_27/attn_entropy_mean": 4.301614761352539, "geo/layer_27/attn_entropy_std": 0.6065851449966431, "attnres/final_alpha/block_0": 0.23993706703186035, "attnres/block_norm/0": 1.6931004524230957, "attnres/final_alpha/block_1": 0.005840997211635113, "attnres/block_norm/1": 36630.2109375, "attnres/final_alpha/block_2": 0.0121532641351223, "attnres/block_norm/2": 24741.755859375, "attnres/final_alpha/block_3": 0.014127257280051708, "attnres/block_norm/3": 40966.8359375, "attnres/final_alpha/block_4": 0.01765727624297142, "attnres/block_norm/4": 11717.3203125, "attnres/final_alpha/block_5": 0.5858800411224365, "attnres/block_norm/5": 5715.685546875, "attnres/final_alpha/block_6": 0.12440404295921326, "attnres/block_norm/6": 27143.126953125, "geo/tier1_time_s": 1.357055902481079, "geo/step": 32025.0, "geo/rankme_slope": -0.00019332273925195078} {"step": 32030, "timestamp": 1778229162.7732503, "train/loss": 2.2294450998306274, "train/z_loss": 0.0014432316180318593, "train/perplexity": 9.294707013736545, "train/grad_norm": 0.10302734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1700417.0930954209, "perf/iters_per_sec": 0.8108220544316391, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2333162307739258, "data/tokens_consumed": 67173875712, "data/tokens_consumed_B": 67.173875712, "train/loss_slope": 4.889114354417143e-06} {"step": 32040, "timestamp": 1778229173.1289136, "train/loss": 2.1684826612472534, "train/z_loss": 0.0014553535380400716, "train/perplexity": 8.745004834033473, "train/grad_norm": 0.146484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026073.7973240493, "perf/iters_per_sec": 0.9661072718258139, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350817441940308, "data/tokens_consumed": 67194847232, "data/tokens_consumed_B": 67.194847232, "train/loss_slope": 4.968330025351006e-06} {"step": 32050, "timestamp": 1778229183.4668088, "grad/layer_0/attn": 0.002710498869419098, "grad/layer_0/mlp": 0.0027023928705602884, "grad/layer_0/attn_mlp_ratio": 1.0029995263261542, "grad/layer_4/attn": 0.002043958753347397, "grad/layer_4/mlp": 0.0024057291448116302, "grad/layer_4/attn_mlp_ratio": 0.8496212771057035, "grad/layer_8/attn": 0.009633452631533146, "grad/layer_8/mlp": 0.0036565058398991823, "grad/layer_8/attn_mlp_ratio": 2.6346060391737693, "grad/layer_12/attn": 0.005673286970704794, "grad/layer_12/mlp": 0.0066222851164639, "grad/layer_12/attn_mlp_ratio": 0.8566962589591027, "grad/layer_16/attn": 0.004602618981152773, "grad/layer_16/mlp": 0.005007173400372267, "grad/layer_16/attn_mlp_ratio": 0.9192050127303525, "grad/layer_20/attn": 0.004260794725269079, "grad/layer_20/mlp": 0.007357345428317785, "grad/layer_20/attn_mlp_ratio": 0.5791211937606641, "grad/layer_24/attn": 0.022027751430869102, "grad/layer_24/mlp": 0.01415637694299221, "grad/layer_24/attn_mlp_ratio": 1.5560302868433018, "grad/layer_27/attn": 0.009599339216947556, "grad/layer_27/mlp": 0.013839848339557648, "grad/layer_27/attn_mlp_ratio": 0.693601469616554} {"step": 32050, "timestamp": 1778229183.482434, "train/loss": 2.140689492225647, "train/z_loss": 0.0014667472103610634, "train/perplexity": 8.505299945832345, "train/grad_norm": 0.2314453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026544.0882048823, "perf/iters_per_sec": 0.9663315239929592, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034841537475586, "data/tokens_consumed": 67215818752, "data/tokens_consumed_B": 67.215818752, "train/loss_slope": 2.2580188636196107e-06} {"step": 32060, "timestamp": 1778229193.8368099, "train/loss": 2.137587833404541, "train/z_loss": 0.0014570773346349597, "train/perplexity": 8.47896027667905, "train/grad_norm": 0.1591796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026367.7093767908, "perf/iters_per_sec": 0.9662474200138048, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349316120147705, "data/tokens_consumed": 67236790272, "data/tokens_consumed_B": 67.236790272, "train/loss_slope": -1.1326774547953245e-07} {"step": 32070, "timestamp": 1778229204.1900141, "train/loss": 2.1849100828170775, "train/z_loss": 0.00145830794936046, "train/perplexity": 8.88984916723891, "train/grad_norm": 0.1572265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026540.2596459968, "perf/iters_per_sec": 0.9663296983938202, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348434925079346, "data/tokens_consumed": 67257761792, "data/tokens_consumed_B": 67.257761792, "train/loss_slope": -2.651448304181765e-06} {"step": 32080, "timestamp": 1778229214.5755703, "train/loss": 2.246347951889038, "train/z_loss": 0.0014406067552044989, "train/perplexity": 9.453149362794056, "train/grad_norm": 0.1455078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020664.968219085, "perf/iters_per_sec": 0.9635281411261963, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03785240650177, "data/tokens_consumed": 67278733312, "data/tokens_consumed_B": 67.278733312, "train/loss_slope": -2.7690762340432464e-06} {"step": 32090, "timestamp": 1778229224.9475245, "train/loss": 2.2366469621658327, "train/z_loss": 0.0014479554025456308, "train/perplexity": 9.361887837245721, "train/grad_norm": 0.2001953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023632.3112337508, "perf/iters_per_sec": 0.9649430805367235, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036330556869507, "data/tokens_consumed": 67299704832, "data/tokens_consumed_B": 67.299704832, "train/loss_slope": -1.7694214103531589e-06} {"step": 32100, "timestamp": 1778229235.3052652, "grad/layer_0/attn": 0.0029047064017504454, "grad/layer_0/mlp": 0.002800491638481617, "grad/layer_0/attn_mlp_ratio": 1.0372129872182132, "grad/layer_4/attn": 0.0017980528064072132, "grad/layer_4/mlp": 0.0024250266142189503, "grad/layer_4/attn_mlp_ratio": 0.7414569067897167, "grad/layer_8/attn": 0.004040905274450779, "grad/layer_8/mlp": 0.003566707717254758, "grad/layer_8/attn_mlp_ratio": 1.132951024163511, "grad/layer_12/attn": 0.003978548105806112, "grad/layer_12/mlp": 0.006027615163475275, "grad/layer_12/attn_mlp_ratio": 0.6600534260894823, "grad/layer_16/attn": 0.0057022408582270145, "grad/layer_16/mlp": 0.004930879920721054, "grad/layer_16/attn_mlp_ratio": 1.1564347204280916, "grad/layer_20/attn": 0.005818825680762529, "grad/layer_20/mlp": 0.006429907865822315, "grad/layer_20/attn_mlp_ratio": 0.9049625145013044, "grad/layer_24/attn": 0.017157157883048058, "grad/layer_24/mlp": 0.013621254824101925, "grad/layer_24/attn_mlp_ratio": 1.2595871656942257, "grad/layer_27/attn": 0.010935246013104916, "grad/layer_27/mlp": 0.012043802998960018, "grad/layer_27/attn_mlp_ratio": 0.9079562263890858} {"step": 32100, "timestamp": 1778229235.9148989, "eos/sharpness": 64.41988945007323, "eos/L0_probe": 2.0111024379730225, "eos/L_plus": 2.4001317024230957, "eos/L_minus": 2.2662720680236816, "eos/grad_norm": 0.23481029272079468, "eos/embed_grad_frac": 0.04318199306726456, "eos/time_s": 0.6068389415740967} {"step": 32100, "timestamp": 1778229235.9348757, "train/loss": 2.24135844707489, "train/z_loss": 0.0014502590871416031, "train/perplexity": 9.406100301905216, "train/grad_norm": 0.234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1910016.3429328157, "perf/iters_per_sec": 0.9107667650856093, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0979759454727174, "data/tokens_consumed": 67320676352, "data/tokens_consumed_B": 67.320676352, "train/loss_slope": 2.782064703109557e-06} {"step": 32100, "timestamp": 1778229237.2975128, "geo/rankme_last": 439.4460754394531, "geo/layer_0/stable_rank_q_proj": 18.724720001220703, "geo/layer_0/stable_rank_k_proj": 16.156818389892578, "geo/layer_0/stable_rank_o_proj": 50.28484344482422, "geo/layer_0/stable_rank_gate_proj": 142.3203125, "geo/layer_0/stable_rank_down_proj": 52.22624588012695, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.057748328894376755, "geo/layer_0/attn_entropy_mean": 6.211899280548096, "geo/layer_0/attn_entropy_std": 0.35349029302597046, "geo/layer_7/stable_rank_q_proj": 42.432186126708984, "geo/layer_7/stable_rank_k_proj": 42.015743255615234, "geo/layer_7/stable_rank_o_proj": 103.74282836914062, "geo/layer_7/stable_rank_gate_proj": 94.3901596069336, "geo/layer_7/stable_rank_down_proj": 147.00860595703125, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5302650332450867, "geo/layer_7/attn_entropy_mean": 4.668636322021484, "geo/layer_7/attn_entropy_std": 0.8139551281929016, "geo/layer_14/stable_rank_q_proj": 55.02920913696289, "geo/layer_14/stable_rank_k_proj": 35.994300842285156, "geo/layer_14/stable_rank_o_proj": 51.34846878051758, "geo/layer_14/stable_rank_gate_proj": 80.1535873413086, "geo/layer_14/stable_rank_down_proj": 134.2714080810547, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3765649199485779, "geo/layer_14/attn_entropy_mean": 5.486075401306152, "geo/layer_14/attn_entropy_std": 0.3960595428943634, "geo/layer_21/stable_rank_q_proj": 44.68939971923828, "geo/layer_21/stable_rank_k_proj": 31.279624938964844, "geo/layer_21/stable_rank_o_proj": 79.06814575195312, "geo/layer_21/stable_rank_gate_proj": 77.31468200683594, "geo/layer_21/stable_rank_down_proj": 57.624839782714844, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15055756270885468, "geo/layer_21/attn_entropy_mean": 5.73541259765625, "geo/layer_21/attn_entropy_std": 0.27865082025527954, "geo/layer_27/stable_rank_q_proj": 41.885475158691406, "geo/layer_27/stable_rank_k_proj": 31.172086715698242, "geo/layer_27/stable_rank_o_proj": 118.36688995361328, "geo/layer_27/stable_rank_gate_proj": 87.65110778808594, "geo/layer_27/stable_rank_down_proj": 134.9143524169922, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08138225227594376, "geo/layer_27/attn_entropy_mean": 4.314760208129883, "geo/layer_27/attn_entropy_std": 0.5992744565010071, "attnres/final_alpha/block_0": 0.23745772242546082, "attnres/block_norm/0": 1.6937233209609985, "attnres/final_alpha/block_1": 0.0056120227091014385, "attnres/block_norm/1": 36723.3046875, "attnres/final_alpha/block_2": 0.012143553234636784, "attnres/block_norm/2": 24769.7265625, "attnres/final_alpha/block_3": 0.013937532901763916, "attnres/block_norm/3": 41657.8515625, "attnres/final_alpha/block_4": 0.017375096678733826, "attnres/block_norm/4": 11696.5078125, "attnres/final_alpha/block_5": 0.5939656496047974, "attnres/block_norm/5": 5652.16064453125, "attnres/final_alpha/block_6": 0.11950838565826416, "attnres/block_norm/6": 27190.7265625, "geo/tier1_time_s": 1.3583507537841797, "geo/step": 32100.0, "geo/rankme_slope": -0.0001908097418654962} {"step": 32110, "timestamp": 1778229247.6561213, "train/loss": 2.2077581167221068, "train/z_loss": 0.001449920842424035, "train/perplexity": 9.095302911763806, "train/grad_norm": 0.09375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1789820.9996907322, "perf/iters_per_sec": 0.853453159184805, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.171710467338562, "data/tokens_consumed": 67341647872, "data/tokens_consumed_B": 67.341647872, "train/loss_slope": 1.9595378159832783e-06} {"step": 32120, "timestamp": 1778229258.0139475, "train/loss": 2.183730936050415, "train/z_loss": 0.00144311300246045, "train/perplexity": 8.879372908074643, "train/grad_norm": 0.2216796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025799.7989302112, "perf/iters_per_sec": 0.965976619210344, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352217435836792, "data/tokens_consumed": 67362619392, "data/tokens_consumed_B": 67.362619392, "train/loss_slope": 3.5308858849713918e-06} {"step": 32130, "timestamp": 1778229268.373557, "train/loss": 2.1347093105316164, "train/z_loss": 0.001456261775456369, "train/perplexity": 8.454588489785527, "train/grad_norm": 0.10107421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025386.2353453194, "perf/iters_per_sec": 0.9657794167257878, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354331254959106, "data/tokens_consumed": 67383590912, "data/tokens_consumed_B": 67.383590912, "train/loss_slope": 2.6683930337804716e-06} {"step": 32140, "timestamp": 1778229278.731489, "train/loss": 2.182427442073822, "train/z_loss": 0.0014458424178883434, "train/perplexity": 8.86780623915242, "train/grad_norm": 0.134765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025777.3578633738, "perf/iters_per_sec": 0.9659659184758062, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035233211517334, "data/tokens_consumed": 67404562432, "data/tokens_consumed_B": 67.404562432, "train/loss_slope": 7.710178986419774e-06} {"step": 32150, "timestamp": 1778229289.0708542, "grad/layer_0/attn": 0.0028286089655011892, "grad/layer_0/mlp": 0.0026949821040034294, "grad/layer_0/attn_mlp_ratio": 1.0495835413307204, "grad/layer_4/attn": 0.002056804718449712, "grad/layer_4/mlp": 0.0024940278381109238, "grad/layer_4/attn_mlp_ratio": 0.8246919318825344, "grad/layer_8/attn": 0.007086269091814756, "grad/layer_8/mlp": 0.0034572752192616463, "grad/layer_8/attn_mlp_ratio": 2.0496687239036944, "grad/layer_12/attn": 0.005414256360381842, "grad/layer_12/mlp": 0.0065827760845422745, "grad/layer_12/attn_mlp_ratio": 0.8224882949986421, "grad/layer_16/attn": 0.0036525398027151823, "grad/layer_16/mlp": 0.004646765999495983, "grad/layer_16/attn_mlp_ratio": 0.7860390913825767, "grad/layer_20/attn": 0.005383115727454424, "grad/layer_20/mlp": 0.005805176217108965, "grad/layer_20/attn_mlp_ratio": 0.9272958190071424, "grad/layer_24/attn": 0.010074685327708721, "grad/layer_24/mlp": 0.008842125535011292, "grad/layer_24/attn_mlp_ratio": 1.1393963107487395, "grad/layer_27/attn": 0.0074594938196241856, "grad/layer_27/mlp": 0.008384726010262966, "grad/layer_27/attn_mlp_ratio": 0.8896526519207003} {"step": 32150, "timestamp": 1778229289.0865264, "train/loss": 2.203672432899475, "train/z_loss": 0.0014410451636649667, "train/perplexity": 9.058218189607311, "train/grad_norm": 0.162109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026349.2235907128, "perf/iters_per_sec": 0.9662386053041042, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034941053390503, "data/tokens_consumed": 67425533952, "data/tokens_consumed_B": 67.425533952, "train/loss_slope": 9.188641294358638e-06} {"step": 32160, "timestamp": 1778229299.4346843, "train/loss": 2.182881784439087, "train/z_loss": 0.0014520901488140226, "train/perplexity": 8.871836174629712, "train/grad_norm": 0.2353515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027422.3775710636, "perf/iters_per_sec": 0.9667503249984091, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343932390213013, "data/tokens_consumed": 67446505472, "data/tokens_consumed_B": 67.446505472, "train/loss_slope": 8.018540706571604e-06} {"step": 32170, "timestamp": 1778229309.7832987, "train/loss": 2.2259879112243652, "train/z_loss": 0.0014537409413605928, "train/perplexity": 9.262628940474968, "train/grad_norm": 0.1640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027457.612836815, "perf/iters_per_sec": 0.9667671264823985, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034375262260437, "data/tokens_consumed": 67467476992, "data/tokens_consumed_B": 67.467476992, "train/loss_slope": 1.1197124742152602e-05} {"step": 32175, "timestamp": 1778229315.5649974, "eos/sharpness": 38.3556842803955, "eos/L0_probe": 2.0119237899780273, "eos/L_plus": 2.2493765354156494, "eos/L_minus": 2.1580278873443604, "eos/grad_norm": 0.12475807219743729, "eos/embed_grad_frac": 0.1470465213060379, "eos/time_s": 0.6095528602600098} {"step": 32175, "timestamp": 1778229316.9447072, "geo/rankme_last": 439.455322265625, "geo/layer_0/stable_rank_q_proj": 18.77231216430664, "geo/layer_0/stable_rank_k_proj": 16.160327911376953, "geo/layer_0/stable_rank_o_proj": 50.28955841064453, "geo/layer_0/stable_rank_gate_proj": 141.84014892578125, "geo/layer_0/stable_rank_down_proj": 52.133766174316406, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06105571985244751, "geo/layer_0/attn_entropy_mean": 6.215094566345215, "geo/layer_0/attn_entropy_std": 0.35642874240875244, "geo/layer_7/stable_rank_q_proj": 42.428016662597656, "geo/layer_7/stable_rank_k_proj": 42.14853286743164, "geo/layer_7/stable_rank_o_proj": 103.78096008300781, "geo/layer_7/stable_rank_gate_proj": 94.51876068115234, "geo/layer_7/stable_rank_down_proj": 146.98011779785156, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5144118070602417, "geo/layer_7/attn_entropy_mean": 4.632983207702637, "geo/layer_7/attn_entropy_std": 0.818043053150177, "geo/layer_14/stable_rank_q_proj": 55.041969299316406, "geo/layer_14/stable_rank_k_proj": 35.93058776855469, "geo/layer_14/stable_rank_o_proj": 51.28567123413086, "geo/layer_14/stable_rank_gate_proj": 80.04821014404297, "geo/layer_14/stable_rank_down_proj": 134.04867553710938, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3892483115196228, "geo/layer_14/attn_entropy_mean": 5.482166290283203, "geo/layer_14/attn_entropy_std": 0.39182907342910767, "geo/layer_21/stable_rank_q_proj": 44.68999481201172, "geo/layer_21/stable_rank_k_proj": 31.355112075805664, "geo/layer_21/stable_rank_o_proj": 79.12326049804688, "geo/layer_21/stable_rank_gate_proj": 77.34928131103516, "geo/layer_21/stable_rank_down_proj": 57.537105560302734, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15264414250850677, "geo/layer_21/attn_entropy_mean": 5.720337390899658, "geo/layer_21/attn_entropy_std": 0.28976571559906006, "geo/layer_27/stable_rank_q_proj": 41.807193756103516, "geo/layer_27/stable_rank_k_proj": 31.017719268798828, "geo/layer_27/stable_rank_o_proj": 118.72193145751953, "geo/layer_27/stable_rank_gate_proj": 87.65913391113281, "geo/layer_27/stable_rank_down_proj": 134.78611755371094, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08133206516504288, "geo/layer_27/attn_entropy_mean": 4.334048748016357, "geo/layer_27/attn_entropy_std": 0.6460585594177246, "attnres/final_alpha/block_0": 0.23952113091945648, "attnres/block_norm/0": 1.6941897869110107, "attnres/final_alpha/block_1": 0.005710347089916468, "attnres/block_norm/1": 36560.734375, "attnres/final_alpha/block_2": 0.012270866893231869, "attnres/block_norm/2": 24825.62109375, "attnres/final_alpha/block_3": 0.014160439372062683, "attnres/block_norm/3": 41172.984375, "attnres/final_alpha/block_4": 0.01785072311758995, "attnres/block_norm/4": 11693.4736328125, "attnres/final_alpha/block_5": 0.5880142450332642, "attnres/block_norm/5": 5728.1533203125, "attnres/final_alpha/block_6": 0.12247221171855927, "attnres/block_norm/6": 27246.2734375, "geo/tier1_time_s": 1.359013557434082, "geo/step": 32175.0, "geo/rankme_slope": -0.0002111067669255202} {"step": 32180, "timestamp": 1778229322.131163, "train/loss": 2.1565430879592897, "train/z_loss": 0.0014591710059903562, "train/perplexity": 8.64121404972645, "train/grad_norm": 0.0966796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1699056.2132690907, "perf/iters_per_sec": 0.8101731363625959, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2343040704727173, "data/tokens_consumed": 67488448512, "data/tokens_consumed_B": 67.488448512, "train/loss_slope": 9.335633451574575e-06} {"step": 32190, "timestamp": 1778229332.4879222, "train/loss": 2.1817704677581786, "train/z_loss": 0.001445941673591733, "train/perplexity": 8.861982231538379, "train/grad_norm": 0.109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025890.1747456896, "perf/iters_per_sec": 0.966019713757367, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351755619049072, "data/tokens_consumed": 67509420032, "data/tokens_consumed_B": 67.509420032, "train/loss_slope": 7.883957975303495e-06} {"step": 32200, "timestamp": 1778229342.8381445, "grad/layer_0/attn": 0.0030160900205373764, "grad/layer_0/mlp": 0.0027656857855618, "grad/layer_0/attn_mlp_ratio": 1.0905396148864213, "grad/layer_4/attn": 0.003125952323898673, "grad/layer_4/mlp": 0.0023090180475264788, "grad/layer_4/attn_mlp_ratio": 1.3538015399520917, "grad/layer_8/attn": 0.006225796416401863, "grad/layer_8/mlp": 0.003432536032050848, "grad/layer_8/attn_mlp_ratio": 1.813759907221171, "grad/layer_12/attn": 0.0035644820891320705, "grad/layer_12/mlp": 0.006147204432636499, "grad/layer_12/attn_mlp_ratio": 0.579854154877662, "grad/layer_16/attn": 0.0059957862831652164, "grad/layer_16/mlp": 0.004392008762806654, "grad/layer_16/attn_mlp_ratio": 1.3651580564738868, "grad/layer_20/attn": 0.007089368067681789, "grad/layer_20/mlp": 0.0059076338075101376, "grad/layer_20/attn_mlp_ratio": 1.2000351035072367, "grad/layer_24/attn": 0.008551348932087421, "grad/layer_24/mlp": 0.009558767080307007, "grad/layer_24/attn_mlp_ratio": 0.8946079312094691, "grad/layer_27/attn": 0.010226700454950333, "grad/layer_27/mlp": 0.008633825927972794, "grad/layer_27/attn_mlp_ratio": 1.1844923006111976} {"step": 32200, "timestamp": 1778229342.8540628, "train/loss": 2.180857515335083, "train/z_loss": 0.0014564747689291835, "train/perplexity": 8.853895355414933, "train/grad_norm": 0.1787109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024630.0738234096, "perf/iters_per_sec": 0.9654188508145378, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0358198404312133, "data/tokens_consumed": 67530391552, "data/tokens_consumed_B": 67.530391552, "train/loss_slope": 4.949371728173233e-06} {"step": 32210, "timestamp": 1778229353.2043467, "train/loss": 2.17847945690155, "train/z_loss": 0.0014563640695996582, "train/perplexity": 8.83286529016808, "train/grad_norm": 0.1103515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027240.519520292, "perf/iters_per_sec": 0.9666636083222828, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344860315322877, "data/tokens_consumed": 67551363072, "data/tokens_consumed_B": 67.551363072, "train/loss_slope": 3.857967455108904e-06} {"step": 32220, "timestamp": 1778229363.5571651, "train/loss": 2.187191939353943, "train/z_loss": 0.0014456333010457456, "train/perplexity": 8.910157689427994, "train/grad_norm": 0.125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026828.7960955524, "perf/iters_per_sec": 0.9664672832944643, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346961736679077, "data/tokens_consumed": 67572334592, "data/tokens_consumed_B": 67.572334592, "train/loss_slope": 3.9961746924281015e-06} {"step": 32230, "timestamp": 1778229373.9108317, "train/loss": 2.167232036590576, "train/z_loss": 0.001459925202652812, "train/perplexity": 8.734074951380224, "train/grad_norm": 0.248046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026439.741806293, "perf/iters_per_sec": 0.9662817677527871, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348948240280151, "data/tokens_consumed": 67593306112, "data/tokens_consumed_B": 67.593306112, "train/loss_slope": 2.8743162907676225e-06} {"step": 32240, "timestamp": 1778229384.2614622, "train/loss": 2.1469441175460817, "train/z_loss": 0.0014618760556913911, "train/perplexity": 8.558664122728846, "train/grad_norm": 0.2314453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027137.0357356789, "perf/iters_per_sec": 0.9666142634085078, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345388412475587, "data/tokens_consumed": 67614277632, "data/tokens_consumed_B": 67.614277632, "train/loss_slope": 6.562228250032052e-07} {"step": 32250, "timestamp": 1778229394.6042838, "grad/layer_0/attn": 0.0024754994083195925, "grad/layer_0/mlp": 0.0027736681513488293, "grad/layer_0/attn_mlp_ratio": 0.892500178100161, "grad/layer_4/attn": 0.0016872448613867164, "grad/layer_4/mlp": 0.0025158997159451246, "grad/layer_4/attn_mlp_ratio": 0.6706327695138706, "grad/layer_8/attn": 0.004940408747643232, "grad/layer_8/mlp": 0.00357093196362257, "grad/layer_8/attn_mlp_ratio": 1.383506787477604, "grad/layer_12/attn": 0.0041749123483896255, "grad/layer_12/mlp": 0.006643193308264017, "grad/layer_12/attn_mlp_ratio": 0.6284496162938907, "grad/layer_16/attn": 0.003717479296028614, "grad/layer_16/mlp": 0.004504610784351826, "grad/layer_16/attn_mlp_ratio": 0.8252609140874834, "grad/layer_20/attn": 0.0041175116784870625, "grad/layer_20/mlp": 0.005460482556372881, "grad/layer_20/attn_mlp_ratio": 0.7540563605090022, "grad/layer_24/attn": 0.011217940598726273, "grad/layer_24/mlp": 0.008872374892234802, "grad/layer_24/attn_mlp_ratio": 1.2643672757907927, "grad/layer_27/attn": 0.0044684261083602905, "grad/layer_27/mlp": 0.00812106765806675, "grad/layer_27/attn_mlp_ratio": 0.5502264285286564} {"step": 32250, "timestamp": 1778229395.2134705, "eos/sharpness": 42.808938026428216, "eos/L0_probe": 2.0115468502044678, "eos/L_plus": 2.191481113433838, "eos/L_minus": 2.25970196723938, "eos/grad_norm": 0.15321634709835052, "eos/embed_grad_frac": 0.09785246104001999, "eos/time_s": 0.6064608097076416} {"step": 32250, "timestamp": 1778229395.2330055, "train/loss": 2.1716449737548826, "train/z_loss": 0.0014462631195783615, "train/perplexity": 8.772703044315476, "train/grad_norm": 0.1533203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1912340.304578063, "perf/iters_per_sec": 0.9118749163523021, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0966416358947755, "data/tokens_consumed": 67635249152, "data/tokens_consumed_B": 67.635249152, "train/loss_slope": -3.8530350303706654e-06} {"step": 32250, "timestamp": 1778229396.595295, "geo/rankme_last": 439.4129638671875, "geo/layer_0/stable_rank_q_proj": 18.7817325592041, "geo/layer_0/stable_rank_k_proj": 16.16263771057129, "geo/layer_0/stable_rank_o_proj": 50.314186096191406, "geo/layer_0/stable_rank_gate_proj": 141.81402587890625, "geo/layer_0/stable_rank_down_proj": 52.12355422973633, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05505184084177017, "geo/layer_0/attn_entropy_mean": 6.2098894119262695, "geo/layer_0/attn_entropy_std": 0.3535953164100647, "geo/layer_7/stable_rank_q_proj": 42.56822204589844, "geo/layer_7/stable_rank_k_proj": 42.184444427490234, "geo/layer_7/stable_rank_o_proj": 103.61934661865234, "geo/layer_7/stable_rank_gate_proj": 94.50343322753906, "geo/layer_7/stable_rank_down_proj": 147.2542724609375, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5133702754974365, "geo/layer_7/attn_entropy_mean": 4.637362957000732, "geo/layer_7/attn_entropy_std": 0.8093544244766235, "geo/layer_14/stable_rank_q_proj": 54.94565963745117, "geo/layer_14/stable_rank_k_proj": 36.056217193603516, "geo/layer_14/stable_rank_o_proj": 51.20466613769531, "geo/layer_14/stable_rank_gate_proj": 79.91815948486328, "geo/layer_14/stable_rank_down_proj": 133.85948181152344, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3637060523033142, "geo/layer_14/attn_entropy_mean": 5.476221084594727, "geo/layer_14/attn_entropy_std": 0.3857063055038452, "geo/layer_21/stable_rank_q_proj": 44.70730972290039, "geo/layer_21/stable_rank_k_proj": 31.28700065612793, "geo/layer_21/stable_rank_o_proj": 79.14017486572266, "geo/layer_21/stable_rank_gate_proj": 77.27685546875, "geo/layer_21/stable_rank_down_proj": 57.33671951293945, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14914177358150482, "geo/layer_21/attn_entropy_mean": 5.711452007293701, "geo/layer_21/attn_entropy_std": 0.29520779848098755, "geo/layer_27/stable_rank_q_proj": 41.811729431152344, "geo/layer_27/stable_rank_k_proj": 31.087566375732422, "geo/layer_27/stable_rank_o_proj": 118.52748107910156, "geo/layer_27/stable_rank_gate_proj": 87.62097930908203, "geo/layer_27/stable_rank_down_proj": 134.738525390625, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09092297405004501, "geo/layer_27/attn_entropy_mean": 4.316218376159668, "geo/layer_27/attn_entropy_std": 0.6221767067909241, "attnres/final_alpha/block_0": 0.2393723726272583, "attnres/block_norm/0": 1.694493293762207, "attnres/final_alpha/block_1": 0.005633493885397911, "attnres/block_norm/1": 36743.296875, "attnres/final_alpha/block_2": 0.012321345508098602, "attnres/block_norm/2": 24934.052734375, "attnres/final_alpha/block_3": 0.013991067185997963, "attnres/block_norm/3": 41437.8828125, "attnres/final_alpha/block_4": 0.01771477796137333, "attnres/block_norm/4": 11748.5849609375, "attnres/final_alpha/block_5": 0.5863726139068604, "attnres/block_norm/5": 5737.091796875, "attnres/final_alpha/block_6": 0.12459434568881989, "attnres/block_norm/6": 27057.810546875, "geo/tier1_time_s": 1.3588130474090576, "geo/step": 32250.0, "geo/rankme_slope": -0.0002169187792304422} {"step": 32260, "timestamp": 1778229406.9425738, "train/loss": 2.2363932371139525, "train/z_loss": 0.001445761090144515, "train/perplexity": 9.359512793085361, "train/grad_norm": 0.1630859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1791515.6545279867, "perf/iters_per_sec": 0.8542612335815366, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1706021070480346, "data/tokens_consumed": 67656220672, "data/tokens_consumed_B": 67.656220672, "train/loss_slope": -4.14620075288775e-06} {"step": 32270, "timestamp": 1778229417.3035657, "train/loss": 2.1816668033599855, "train/z_loss": 0.0014624032191932201, "train/perplexity": 8.861063607098696, "train/grad_norm": 0.1064453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025339.0403130539, "perf/iters_per_sec": 0.965756912380721, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354572534561157, "data/tokens_consumed": 67677192192, "data/tokens_consumed_B": 67.677192192, "train/loss_slope": -1.6549848630339888e-06} {"step": 32280, "timestamp": 1778229427.6545558, "train/loss": 2.1711955308914184, "train/z_loss": 0.0014550023013725876, "train/perplexity": 8.768761101444317, "train/grad_norm": 0.1044921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026960.4602654045, "perf/iters_per_sec": 0.9665300656630538, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034628963470459, "data/tokens_consumed": 67698163712, "data/tokens_consumed_B": 67.698163712, "train/loss_slope": -1.7093445422804118e-06} {"step": 32290, "timestamp": 1778229438.0057416, "train/loss": 2.1669679641723634, "train/z_loss": 0.0014716780511662363, "train/perplexity": 8.731768827592205, "train/grad_norm": 0.1708984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027133.858974001, "perf/iters_per_sec": 0.966612748610497, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345404624938965, "data/tokens_consumed": 67719135232, "data/tokens_consumed_B": 67.719135232, "train/loss_slope": -5.142265899811503e-06} {"step": 32300, "timestamp": 1778229448.3469043, "grad/layer_0/attn": 0.002579272259026766, "grad/layer_0/mlp": 0.002645410830155015, "grad/layer_0/attn_mlp_ratio": 0.9749987155589572, "grad/layer_4/attn": 0.0018227876862511039, "grad/layer_4/mlp": 0.002576302969828248, "grad/layer_4/attn_mlp_ratio": 0.7075206747211701, "grad/layer_8/attn": 0.00979836005717516, "grad/layer_8/mlp": 0.003828170243650675, "grad/layer_8/attn_mlp_ratio": 2.5595412893333025, "grad/layer_12/attn": 0.005074852611869574, "grad/layer_12/mlp": 0.006753223016858101, "grad/layer_12/attn_mlp_ratio": 0.7514711899864817, "grad/layer_16/attn": 0.004911103285849094, "grad/layer_16/mlp": 0.00460400665178895, "grad/layer_16/attn_mlp_ratio": 1.0667020164427037, "grad/layer_20/attn": 0.00486789969727397, "grad/layer_20/mlp": 0.0059187510050833225, "grad/layer_20/attn_mlp_ratio": 0.8224538607634931, "grad/layer_24/attn": 0.007832699455320835, "grad/layer_24/mlp": 0.008903924375772476, "grad/layer_24/attn_mlp_ratio": 0.8796906888230648, "grad/layer_27/attn": 0.008637456223368645, "grad/layer_27/mlp": 0.007119215093553066, "grad/layer_27/attn_mlp_ratio": 1.2132596063665064} {"step": 32300, "timestamp": 1778229448.362775, "train/loss": 2.2130157709121705, "train/z_loss": 0.0014545465703122318, "train/perplexity": 9.14324880023143, "train/grad_norm": 0.10107421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025810.8563538876, "perf/iters_per_sec": 0.9659818918008268, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352160930633545, "data/tokens_consumed": 67740106752, "data/tokens_consumed_B": 67.740106752, "train/loss_slope": -2.920818121412452e-06} {"step": 32310, "timestamp": 1778229458.7232077, "train/loss": 2.188327193260193, "train/z_loss": 0.0014417500351555645, "train/perplexity": 8.920278724635597, "train/grad_norm": 0.1162109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025595.0951014438, "perf/iters_per_sec": 0.9658790088183612, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035326361656189, "data/tokens_consumed": 67761078272, "data/tokens_consumed_B": 67.761078272, "train/loss_slope": -3.857150987716051e-06} {"step": 32320, "timestamp": 1778229469.0764582, "train/loss": 2.174401843547821, "train/z_loss": 0.0014558844617567957, "train/perplexity": 8.796921612721507, "train/grad_norm": 0.166015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026811.0024276313, "perf/iters_per_sec": 0.9664587986124188, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347052574157716, "data/tokens_consumed": 67782049792, "data/tokens_consumed_B": 67.782049792, "train/loss_slope": -4.406873578249786e-06} {"step": 32325, "timestamp": 1778229474.8522232, "eos/sharpness": 50.172615051269524, "eos/L0_probe": 2.0133824348449707, "eos/L_plus": 2.194934368133545, "eos/L_minus": 2.333556652069092, "eos/grad_norm": 0.13144676387310028, "eos/embed_grad_frac": 0.11970925331115723, "eos/time_s": 0.6027002334594727} {"step": 32325, "timestamp": 1778229476.22847, "geo/rankme_last": 439.9222106933594, "geo/layer_0/stable_rank_q_proj": 18.777385711669922, "geo/layer_0/stable_rank_k_proj": 16.172622680664062, "geo/layer_0/stable_rank_o_proj": 50.291770935058594, "geo/layer_0/stable_rank_gate_proj": 141.7307891845703, "geo/layer_0/stable_rank_down_proj": 52.20360565185547, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05670933052897453, "geo/layer_0/attn_entropy_mean": 6.2117767333984375, "geo/layer_0/attn_entropy_std": 0.3535325527191162, "geo/layer_7/stable_rank_q_proj": 42.72141647338867, "geo/layer_7/stable_rank_k_proj": 42.13079833984375, "geo/layer_7/stable_rank_o_proj": 103.673095703125, "geo/layer_7/stable_rank_gate_proj": 94.24674224853516, "geo/layer_7/stable_rank_down_proj": 147.02500915527344, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5127950310707092, "geo/layer_7/attn_entropy_mean": 4.6274824142456055, "geo/layer_7/attn_entropy_std": 0.8099645972251892, "geo/layer_14/stable_rank_q_proj": 54.920902252197266, "geo/layer_14/stable_rank_k_proj": 36.10806655883789, "geo/layer_14/stable_rank_o_proj": 51.3311767578125, "geo/layer_14/stable_rank_gate_proj": 79.69464111328125, "geo/layer_14/stable_rank_down_proj": 133.97337341308594, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3880905508995056, "geo/layer_14/attn_entropy_mean": 5.497586727142334, "geo/layer_14/attn_entropy_std": 0.39540159702301025, "geo/layer_21/stable_rank_q_proj": 44.68451690673828, "geo/layer_21/stable_rank_k_proj": 31.199005126953125, "geo/layer_21/stable_rank_o_proj": 79.14494323730469, "geo/layer_21/stable_rank_gate_proj": 77.31209564208984, "geo/layer_21/stable_rank_down_proj": 57.40324020385742, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14698798954486847, "geo/layer_21/attn_entropy_mean": 5.740266799926758, "geo/layer_21/attn_entropy_std": 0.2995249032974243, "geo/layer_27/stable_rank_q_proj": 41.72849655151367, "geo/layer_27/stable_rank_k_proj": 31.13643455505371, "geo/layer_27/stable_rank_o_proj": 118.44414520263672, "geo/layer_27/stable_rank_gate_proj": 87.64070892333984, "geo/layer_27/stable_rank_down_proj": 134.32470703125, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08151090145111084, "geo/layer_27/attn_entropy_mean": 4.3125834465026855, "geo/layer_27/attn_entropy_std": 0.6291594505310059, "attnres/final_alpha/block_0": 0.24068626761436462, "attnres/block_norm/0": 1.6948350667953491, "attnres/final_alpha/block_1": 0.005709030199795961, "attnres/block_norm/1": 36803.79296875, "attnres/final_alpha/block_2": 0.012290006503462791, "attnres/block_norm/2": 24879.908203125, "attnres/final_alpha/block_3": 0.014130723662674427, "attnres/block_norm/3": 41625.4296875, "attnres/final_alpha/block_4": 0.017893347889184952, "attnres/block_norm/4": 11738.025390625, "attnres/final_alpha/block_5": 0.5850647687911987, "attnres/block_norm/5": 5719.974609375, "attnres/final_alpha/block_6": 0.12422582507133484, "attnres/block_norm/6": 27256.26953125, "geo/tier1_time_s": 1.356311559677124, "geo/step": 32325.0, "geo/rankme_slope": -0.00020000408366471588} {"step": 32330, "timestamp": 1778229481.4147751, "train/loss": 2.2033957481384276, "train/z_loss": 0.001467224955558777, "train/perplexity": 9.055712265363518, "train/grad_norm": 0.0986328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1700463.3118496488, "perf/iters_per_sec": 0.8108440932510609, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.233282709121704, "data/tokens_consumed": 67803021312, "data/tokens_consumed_B": 67.803021312, "train/loss_slope": -4.503140047509948e-06} {"step": 32340, "timestamp": 1778229491.7657468, "train/loss": 2.1735158205032348, "train/z_loss": 0.0014663046575151383, "train/perplexity": 8.789130789385396, "train/grad_norm": 0.1396484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027030.1992291973, "perf/iters_per_sec": 0.9665633197923648, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034593367576599, "data/tokens_consumed": 67823992832, "data/tokens_consumed_B": 67.823992832, "train/loss_slope": -6.80851354063929e-06} {"step": 32350, "timestamp": 1778229502.1088924, "grad/layer_0/attn": 0.0024753969628363848, "grad/layer_0/mlp": 0.0025775444228202105, "grad/layer_0/attn_mlp_ratio": 0.960370205410822, "grad/layer_4/attn": 0.002142571611329913, "grad/layer_4/mlp": 0.002492510015144944, "grad/layer_4/attn_mlp_ratio": 0.85960397845981, "grad/layer_8/attn": 0.005462624132633209, "grad/layer_8/mlp": 0.0037388408090919256, "grad/layer_8/attn_mlp_ratio": 1.46104749184419, "grad/layer_12/attn": 0.005625236313790083, "grad/layer_12/mlp": 0.00646124267950654, "grad/layer_12/attn_mlp_ratio": 0.8706121261426576, "grad/layer_16/attn": 0.00412089005112648, "grad/layer_16/mlp": 0.004664978943765163, "grad/layer_16/attn_mlp_ratio": 0.8833673232967962, "grad/layer_20/attn": 0.004208254162222147, "grad/layer_20/mlp": 0.006437990814447403, "grad/layer_20/attn_mlp_ratio": 0.6536595372911236, "grad/layer_24/attn": 0.01299931388348341, "grad/layer_24/mlp": 0.009308172389864922, "grad/layer_24/attn_mlp_ratio": 1.3965484521948361, "grad/layer_27/attn": 0.009152185171842575, "grad/layer_27/mlp": 0.008462519384920597, "grad/layer_27/attn_mlp_ratio": 1.0814964961854323} {"step": 32350, "timestamp": 1778229502.1247, "train/loss": 2.1683456897735596, "train/z_loss": 0.0014606073265895248, "train/perplexity": 8.743807099863476, "train/grad_norm": 0.162109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025520.4174278022, "perf/iters_per_sec": 0.9658433997286807, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035364532470703, "data/tokens_consumed": 67844964352, "data/tokens_consumed_B": 67.844964352, "train/loss_slope": -5.934089359634772e-06} {"step": 32360, "timestamp": 1778229512.484592, "train/loss": 2.180449438095093, "train/z_loss": 0.0014533156878314911, "train/perplexity": 8.85028301934133, "train/grad_norm": 0.216796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025450.1759972323, "perf/iters_per_sec": 0.9658099060045396, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354004383087159, "data/tokens_consumed": 67865935872, "data/tokens_consumed_B": 67.865935872, "train/loss_slope": -6.17186567499366e-06} {"step": 32370, "timestamp": 1778229522.8325765, "train/loss": 2.2162551164627073, "train/z_loss": 0.0014303669682703911, "train/perplexity": 9.172914966089623, "train/grad_norm": 0.150390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027679.0996646758, "perf/iters_per_sec": 0.9668727396319751, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342622756958009, "data/tokens_consumed": 67886907392, "data/tokens_consumed_B": 67.886907392, "train/loss_slope": -2.7938937339702276e-06} {"step": 32380, "timestamp": 1778229533.1904576, "train/loss": 2.205301809310913, "train/z_loss": 0.0014555201167240738, "train/perplexity": 9.072989467372913, "train/grad_norm": 0.12109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026160.2304491024, "perf/iters_per_sec": 0.9661484863515388, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035037589073181, "data/tokens_consumed": 67907878912, "data/tokens_consumed_B": 67.907878912, "train/loss_slope": 4.819733939870761e-07} {"step": 32390, "timestamp": 1778229543.5422683, "train/loss": 2.1657048940658568, "train/z_loss": 0.00145784723572433, "train/perplexity": 8.72074695357425, "train/grad_norm": 0.205078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026860.3210774562, "perf/iters_per_sec": 0.9664823155772477, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346800804138183, "data/tokens_consumed": 67928850432, "data/tokens_consumed_B": 67.928850432, "train/loss_slope": -3.34007922429204e-06} {"step": 32400, "timestamp": 1778229553.8879158, "grad/layer_0/attn": 0.0032030241563916206, "grad/layer_0/mlp": 0.003046545432880521, "grad/layer_0/attn_mlp_ratio": 1.0513626406768812, "grad/layer_4/attn": 0.0034422778990119696, "grad/layer_4/mlp": 0.002476073568686843, "grad/layer_4/attn_mlp_ratio": 1.3902162696304354, "grad/layer_8/attn": 0.008664341643452644, "grad/layer_8/mlp": 0.003737477120012045, "grad/layer_8/attn_mlp_ratio": 2.3182326295020794, "grad/layer_12/attn": 0.00494422297924757, "grad/layer_12/mlp": 0.0065420703031122684, "grad/layer_12/attn_mlp_ratio": 0.7557581430024733, "grad/layer_16/attn": 0.006809887941926718, "grad/layer_16/mlp": 0.004509073216468096, "grad/layer_16/attn_mlp_ratio": 1.5102632989034703, "grad/layer_20/attn": 0.0035648797638714314, "grad/layer_20/mlp": 0.005731673911213875, "grad/layer_20/attn_mlp_ratio": 0.6219613601361199, "grad/layer_24/attn": 0.00565304234623909, "grad/layer_24/mlp": 0.008517466485500336, "grad/layer_24/attn_mlp_ratio": 0.663699973400837, "grad/layer_27/attn": 0.006243663839995861, "grad/layer_27/mlp": 0.007116094697266817, "grad/layer_27/attn_mlp_ratio": 0.8774003182748432} {"step": 32400, "timestamp": 1778229554.496693, "eos/sharpness": 19.11520957946777, "eos/L0_probe": 2.0106589794158936, "eos/L_plus": 2.1293623447418213, "eos/L_minus": 2.0831077098846436, "eos/grad_norm": 0.0937931165099144, "eos/embed_grad_frac": 0.2702481746673584, "eos/time_s": 0.6058821678161621} {"step": 32400, "timestamp": 1778229554.5165422, "train/loss": 2.1873342275619505, "train/z_loss": 0.001451901765540242, "train/perplexity": 8.9114255900002, "train/grad_norm": 0.09375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1911671.8748180252, "perf/iters_per_sec": 0.9115561842050672, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0970250844955445, "data/tokens_consumed": 67949821952, "data/tokens_consumed_B": 67.949821952, "train/loss_slope": -4.023658617673746e-06} {"step": 32400, "timestamp": 1778229555.8794003, "geo/rankme_last": 439.0841369628906, "geo/layer_0/stable_rank_q_proj": 18.77251625061035, "geo/layer_0/stable_rank_k_proj": 16.178972244262695, "geo/layer_0/stable_rank_o_proj": 50.204620361328125, "geo/layer_0/stable_rank_gate_proj": 141.27151489257812, "geo/layer_0/stable_rank_down_proj": 52.20090866088867, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05947941914200783, "geo/layer_0/attn_entropy_mean": 6.213707447052002, "geo/layer_0/attn_entropy_std": 0.3503969609737396, "geo/layer_7/stable_rank_q_proj": 42.67353057861328, "geo/layer_7/stable_rank_k_proj": 42.09357833862305, "geo/layer_7/stable_rank_o_proj": 103.6081771850586, "geo/layer_7/stable_rank_gate_proj": 94.23770904541016, "geo/layer_7/stable_rank_down_proj": 146.5822296142578, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5247693657875061, "geo/layer_7/attn_entropy_mean": 4.653169631958008, "geo/layer_7/attn_entropy_std": 0.7911980152130127, "geo/layer_14/stable_rank_q_proj": 55.07674026489258, "geo/layer_14/stable_rank_k_proj": 36.192962646484375, "geo/layer_14/stable_rank_o_proj": 51.269771575927734, "geo/layer_14/stable_rank_gate_proj": 79.63885498046875, "geo/layer_14/stable_rank_down_proj": 134.03985595703125, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37218713760375977, "geo/layer_14/attn_entropy_mean": 5.463862895965576, "geo/layer_14/attn_entropy_std": 0.3950236141681671, "geo/layer_21/stable_rank_q_proj": 44.64300537109375, "geo/layer_21/stable_rank_k_proj": 31.226938247680664, "geo/layer_21/stable_rank_o_proj": 79.16769409179688, "geo/layer_21/stable_rank_gate_proj": 77.43476104736328, "geo/layer_21/stable_rank_down_proj": 57.38452911376953, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15193352103233337, "geo/layer_21/attn_entropy_mean": 5.725386619567871, "geo/layer_21/attn_entropy_std": 0.28729110956192017, "geo/layer_27/stable_rank_q_proj": 41.76034164428711, "geo/layer_27/stable_rank_k_proj": 31.21219825744629, "geo/layer_27/stable_rank_o_proj": 118.24180603027344, "geo/layer_27/stable_rank_gate_proj": 87.54920196533203, "geo/layer_27/stable_rank_down_proj": 133.90570068359375, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0893925130367279, "geo/layer_27/attn_entropy_mean": 4.312314987182617, "geo/layer_27/attn_entropy_std": 0.6140061020851135, "attnres/final_alpha/block_0": 0.23778635263442993, "attnres/block_norm/0": 1.6951093673706055, "attnres/final_alpha/block_1": 0.0056835319846868515, "attnres/block_norm/1": 36656.15234375, "attnres/final_alpha/block_2": 0.011910778470337391, "attnres/block_norm/2": 24935.0703125, "attnres/final_alpha/block_3": 0.013877476565539837, "attnres/block_norm/3": 41579.3125, "attnres/final_alpha/block_4": 0.017257582396268845, "attnres/block_norm/4": 11738.451171875, "attnres/final_alpha/block_5": 0.5911482572555542, "attnres/block_norm/5": 5681.16162109375, "attnres/final_alpha/block_6": 0.12233605980873108, "attnres/block_norm/6": 27438.125, "geo/tier1_time_s": 1.358569622039795, "geo/step": 32400.0, "geo/rankme_slope": -0.0002116710551408063} {"step": 32410, "timestamp": 1778229566.2276416, "train/loss": 2.175494408607483, "train/z_loss": 0.0014482729602605104, "train/perplexity": 8.806538074256492, "train/grad_norm": 0.2333984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1791348.189745919, "perf/iters_per_sec": 0.8541813801507564, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1707115411758422, "data/tokens_consumed": 67970793472, "data/tokens_consumed_B": 67.970793472, "train/loss_slope": -4.01709480325702e-06} {"step": 32420, "timestamp": 1778229576.582544, "train/loss": 2.1779206514358522, "train/z_loss": 0.001447736332193017, "train/perplexity": 8.827930815600244, "train/grad_norm": 0.208984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026332.605343647, "perf/iters_per_sec": 0.9662306811063991, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034949541091919, "data/tokens_consumed": 67991764992, "data/tokens_consumed_B": 67.991764992, "train/loss_slope": -3.0464730819280754e-06} {"step": 32430, "timestamp": 1778229586.9303925, "train/loss": 2.199016571044922, "train/z_loss": 0.0014523814432322979, "train/perplexity": 9.016142402600511, "train/grad_norm": 0.1689453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027609.362783241, "perf/iters_per_sec": 0.9668394864956098, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342978477478026, "data/tokens_consumed": 68012736512, "data/tokens_consumed_B": 68.012736512, "train/loss_slope": -1.9880597740903382e-06} {"step": 32440, "timestamp": 1778229597.2786129, "train/loss": 2.148455500602722, "train/z_loss": 0.0014443213469348848, "train/perplexity": 8.571609322785225, "train/grad_norm": 0.166015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027560.9424084735, "perf/iters_per_sec": 0.9668163978617065, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343225479125977, "data/tokens_consumed": 68033708032, "data/tokens_consumed_B": 68.033708032, "train/loss_slope": -3.485402916416962e-06} {"step": 32450, "timestamp": 1778229607.6249685, "grad/layer_0/attn": 0.0026844660751521587, "grad/layer_0/mlp": 0.0027393586933612823, "grad/layer_0/attn_mlp_ratio": 0.9799614718808819, "grad/layer_4/attn": 0.0018693297170102596, "grad/layer_4/mlp": 0.0024763038381934166, "grad/layer_4/attn_mlp_ratio": 0.754887026660397, "grad/layer_8/attn": 0.003494995879009366, "grad/layer_8/mlp": 0.0037317906972020864, "grad/layer_8/attn_mlp_ratio": 0.936546572125571, "grad/layer_12/attn": 0.004016352817416191, "grad/layer_12/mlp": 0.007331282831728458, "grad/layer_12/attn_mlp_ratio": 0.5478376506292162, "grad/layer_16/attn": 0.007876553572714329, "grad/layer_16/mlp": 0.004733968526124954, "grad/layer_16/attn_mlp_ratio": 1.6638372990574244, "grad/layer_20/attn": 0.004004678223282099, "grad/layer_20/mlp": 0.006664360407739878, "grad/layer_20/attn_mlp_ratio": 0.6009096024488969, "grad/layer_24/attn": 0.0167307760566473, "grad/layer_24/mlp": 0.013213339261710644, "grad/layer_24/attn_mlp_ratio": 1.266203462928487, "grad/layer_27/attn": 0.0038473058957606554, "grad/layer_27/mlp": 0.012380317784845829, "grad/layer_27/attn_mlp_ratio": 0.31075986348217793} {"step": 32450, "timestamp": 1778229607.6409397, "train/loss": 2.1933646202087402, "train/z_loss": 0.0014342151349410414, "train/perplexity": 8.965327346380677, "train/grad_norm": 0.1806640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024866.4651308951, "perf/iters_per_sec": 0.9655315709738231, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035698914527893, "data/tokens_consumed": 68054679552, "data/tokens_consumed_B": 68.054679552, "train/loss_slope": -5.551572672926603e-06} {"step": 32460, "timestamp": 1778229617.9983385, "train/loss": 2.2134777069091798, "train/z_loss": 0.001435206481255591, "train/perplexity": 9.14747337164755, "train/grad_norm": 0.11669921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026059.656969339, "perf/iters_per_sec": 0.9661005291792578, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350889682769775, "data/tokens_consumed": 68075651072, "data/tokens_consumed_B": 68.075651072, "train/loss_slope": -4.045580136607852e-06} {"step": 32470, "timestamp": 1778229628.7616801, "train/loss": 2.1898632287979125, "train/z_loss": 0.0014449319220148026, "train/perplexity": 8.933991118428997, "train/grad_norm": 0.140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1949387.2143509195, "perf/iters_per_sec": 0.9295402595285985, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.075800633430481, "data/tokens_consumed": 68096622592, "data/tokens_consumed_B": 68.096622592, "train/loss_slope": -5.166275287368366e-06} {"step": 32475, "timestamp": 1778229634.5305326, "eos/sharpness": 61.49792671203612, "eos/L0_probe": 2.0167300701141357, "eos/L_plus": 2.395641565322876, "eos/L_minus": 2.252797842025757, "eos/grad_norm": 0.20784033834934235, "eos/embed_grad_frac": 0.05605461075901985, "eos/time_s": 0.604661226272583} {"step": 32475, "timestamp": 1778229635.9067328, "geo/rankme_last": 438.84844970703125, "geo/layer_0/stable_rank_q_proj": 18.758472442626953, "geo/layer_0/stable_rank_k_proj": 16.150854110717773, "geo/layer_0/stable_rank_o_proj": 50.302490234375, "geo/layer_0/stable_rank_gate_proj": 141.33401489257812, "geo/layer_0/stable_rank_down_proj": 52.141632080078125, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06399407982826233, "geo/layer_0/attn_entropy_mean": 6.208368301391602, "geo/layer_0/attn_entropy_std": 0.3563084006309509, "geo/layer_7/stable_rank_q_proj": 42.70404052734375, "geo/layer_7/stable_rank_k_proj": 42.05869674682617, "geo/layer_7/stable_rank_o_proj": 103.33273315429688, "geo/layer_7/stable_rank_gate_proj": 94.24739837646484, "geo/layer_7/stable_rank_down_proj": 146.983154296875, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5191320776939392, "geo/layer_7/attn_entropy_mean": 4.647092819213867, "geo/layer_7/attn_entropy_std": 0.8079068064689636, "geo/layer_14/stable_rank_q_proj": 55.16849136352539, "geo/layer_14/stable_rank_k_proj": 36.23421096801758, "geo/layer_14/stable_rank_o_proj": 51.31386184692383, "geo/layer_14/stable_rank_gate_proj": 79.74524688720703, "geo/layer_14/stable_rank_down_proj": 134.02952575683594, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.387183278799057, "geo/layer_14/attn_entropy_mean": 5.531593322753906, "geo/layer_14/attn_entropy_std": 0.36785808205604553, "geo/layer_21/stable_rank_q_proj": 44.591827392578125, "geo/layer_21/stable_rank_k_proj": 31.260967254638672, "geo/layer_21/stable_rank_o_proj": 78.91497802734375, "geo/layer_21/stable_rank_gate_proj": 77.41246795654297, "geo/layer_21/stable_rank_down_proj": 57.40262985229492, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1469433754682541, "geo/layer_21/attn_entropy_mean": 5.746705532073975, "geo/layer_21/attn_entropy_std": 0.2804727554321289, "geo/layer_27/stable_rank_q_proj": 41.862327575683594, "geo/layer_27/stable_rank_k_proj": 31.263818740844727, "geo/layer_27/stable_rank_o_proj": 118.0884780883789, "geo/layer_27/stable_rank_gate_proj": 87.49163055419922, "geo/layer_27/stable_rank_down_proj": 134.02999877929688, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09240207821130753, "geo/layer_27/attn_entropy_mean": 4.287339210510254, "geo/layer_27/attn_entropy_std": 0.6228699684143066, "attnres/final_alpha/block_0": 0.23746329545974731, "attnres/block_norm/0": 1.695579171180725, "attnres/final_alpha/block_1": 0.005584820173680782, "attnres/block_norm/1": 36704.6328125, "attnres/final_alpha/block_2": 0.011842655017971992, "attnres/block_norm/2": 25089.568359375, "attnres/final_alpha/block_3": 0.013851102441549301, "attnres/block_norm/3": 41881.8671875, "attnres/final_alpha/block_4": 0.017613496631383896, "attnres/block_norm/4": 11708.5869140625, "attnres/final_alpha/block_5": 0.5948806405067444, "attnres/block_norm/5": 5635.57763671875, "attnres/final_alpha/block_6": 0.11876403540372849, "attnres/block_norm/6": 27550.453125, "geo/tier1_time_s": 1.3557262420654297, "geo/step": 32475.0, "geo/rankme_slope": -0.00021664118772509003} {"step": 32480, "timestamp": 1778229641.0816016, "train/loss": 2.177160048484802, "train/z_loss": 0.0014537853305228055, "train/perplexity": 8.821218818276302, "train/grad_norm": 0.189453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1703045.752085783, "perf/iters_per_sec": 0.8120754967144885, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2314126014709472, "data/tokens_consumed": 68117594112, "data/tokens_consumed_B": 68.117594112, "train/loss_slope": -4.3023225962847535e-06} {"step": 32490, "timestamp": 1778229651.429168, "train/loss": 2.1823585748672487, "train/z_loss": 0.0014421745552681386, "train/perplexity": 8.86719555913645, "train/grad_norm": 0.1259765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027881.5133553583, "perf/iters_per_sec": 0.9669692580010215, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03415904045105, "data/tokens_consumed": 68138565632, "data/tokens_consumed_B": 68.138565632, "train/loss_slope": -2.5577524349992045e-06} {"step": 32500, "timestamp": 1778229661.7684357, "grad/layer_0/attn": 0.003539626020938158, "grad/layer_0/mlp": 0.003225102322176099, "grad/layer_0/attn_mlp_ratio": 1.0975236000566575, "grad/layer_4/attn": 0.0017558705294504762, "grad/layer_4/mlp": 0.002350252354517579, "grad/layer_4/attn_mlp_ratio": 0.7470986897919834, "grad/layer_8/attn": 0.0048824637196958065, "grad/layer_8/mlp": 0.003604625351727009, "grad/layer_8/attn_mlp_ratio": 1.354499596443945, "grad/layer_12/attn": 0.008031483739614487, "grad/layer_12/mlp": 0.006590852048248053, "grad/layer_12/attn_mlp_ratio": 1.2185804747189446, "grad/layer_16/attn": 0.004224638920277357, "grad/layer_16/mlp": 0.004285056609660387, "grad/layer_16/attn_mlp_ratio": 0.9859003524394855, "grad/layer_20/attn": 0.004089586436748505, "grad/layer_20/mlp": 0.005805002059787512, "grad/layer_20/attn_mlp_ratio": 0.7044935254422371, "grad/layer_24/attn": 0.005650200415402651, "grad/layer_24/mlp": 0.008239970542490482, "grad/layer_24/attn_mlp_ratio": 0.6857063769458906, "grad/layer_27/attn": 0.005610247142612934, "grad/layer_27/mlp": 0.006554591469466686, "grad/layer_27/attn_mlp_ratio": 0.8559262744527363} {"step": 32500, "timestamp": 1778229661.7840476, "train/loss": 2.188597846031189, "train/z_loss": 0.0014475035248324275, "train/perplexity": 8.922693349538196, "train/grad_norm": 0.09814453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026373.2178328172, "perf/iters_per_sec": 0.9662500466503225, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034928798675537, "data/tokens_consumed": 68159537152, "data/tokens_consumed_B": 68.159537152, "train/loss_slope": -2.4335619640035804e-06} {"step": 32500, "timestamp": 1778229668.8182056, "geo/ww_alpha_mean": 7.615914145709312, "geo/ww_alpha_std": 4.225944556675377, "geo/ww_alpha_min": 1.3489669786152216, "geo/ww_alpha_max": 27.734568673840073, "geo/ww_alpha_healthy_frac": 0.15736040609137056, "geo/ww_alpha_by_type/q_proj": 4.089993410395585, "geo/ww_alpha_by_type/k_proj": 4.487814848906955, "geo/ww_alpha_by_type/v_proj": 7.859597282789556, "geo/ww_alpha_by_type/o_proj": 8.40941958609464, "geo/ww_alpha_by_type/gate_proj": 8.338376175098698, "geo/ww_alpha_by_type/up_proj": 11.607850238496882, "geo/ww_alpha_by_type/down_proj": 8.628832654089118, "geo/twonn_id/layer_0": 0.6800287365913391, "geo/twonn_id/layer_7": 3.1486175060272217, "geo/twonn_id/layer_14": 4.125717639923096, "geo/twonn_id/layer_21": 7.118529319763184, "geo/twonn_id/layer_27": 5.551860332489014, "geo/tier2_time_s": 7.027682065963745} {"step": 32500, "timestamp": 1778229669.4397845, "eoc/jacobian_sigma/layer_0/attn": 1019.1929931640625, "eoc/jacobian_sigma/layer_0/mlp": 7445.880859375, "eoc/jacobian_sigma/layer_0": 7445.880859375, "eoc/jacobian_sigma/layer_7/attn": 1.1649917364120483, "eoc/jacobian_sigma/layer_7/mlp": 1.6988028287887573, "eoc/jacobian_sigma/layer_7": 1.6988028287887573, "eoc/jacobian_sigma/layer_14/attn": 1.6291576623916626, "eoc/jacobian_sigma/layer_14/mlp": 6.022490501403809, "eoc/jacobian_sigma/layer_14": 6.022490501403809, "eoc/jacobian_sigma/layer_21/attn": 1.0916920900344849, "eoc/jacobian_sigma/layer_21/mlp": 3.9036548137664795, "eoc/jacobian_sigma/layer_21": 3.9036548137664795, "eoc/jacobian_sigma/layer_27/attn": 3.335245132446289, "eoc/jacobian_sigma/layer_27/mlp": 19.281797409057617, "eoc/jacobian_sigma/layer_27": 19.281797409057617, "eoc/layer0_sigma": 7445.880859375, "eoc/sigma_max": 19.281797409057617, "eoc/sigma_min": 1.6988028287887573, "eoc/sigma_mean": 7.726686388254166, "eoc/time_s": 0.6154327392578125} {"step": 32510, "timestamp": 1778229679.8135257, "train/loss": 2.1532377004623413, "train/z_loss": 0.0014621478039771318, "train/perplexity": 8.612698642046329, "train/grad_norm": 0.2265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1163519.6353645276, "perf/iters_per_sec": 0.5548093964407576, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.8024208068847656, "data/tokens_consumed": 68180508672, "data/tokens_consumed_B": 68.180508672, "train/loss_slope": -3.596182674965357e-06} {"step": 32520, "timestamp": 1778229690.1595964, "train/loss": 2.1750385999679565, "train/z_loss": 0.0014487715670838952, "train/perplexity": 8.802524892808796, "train/grad_norm": 0.337890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028068.2559507587, "perf/iters_per_sec": 0.9670583038095277, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340638160705566, "data/tokens_consumed": 68201480192, "data/tokens_consumed_B": 68.201480192, "train/loss_slope": -6.675868681018669e-06} {"step": 32530, "timestamp": 1778229700.5208948, "train/loss": 2.1696261882781984, "train/z_loss": 0.001448007661383599, "train/perplexity": 8.755010703347597, "train/grad_norm": 0.1513671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025088.1321209148, "perf/iters_per_sec": 0.9656372700314115, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0355855464935302, "data/tokens_consumed": 68222451712, "data/tokens_consumed_B": 68.222451712, "train/loss_slope": -8.649508308584636e-06} {"step": 32540, "timestamp": 1778229710.867487, "train/loss": 2.1589113593101503, "train/z_loss": 0.0014488499728031456, "train/perplexity": 8.661703041566865, "train/grad_norm": 0.1455078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027927.3775918654, "perf/iters_per_sec": 0.9669911277732207, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03413565158844, "data/tokens_consumed": 68243423232, "data/tokens_consumed_B": 68.243423232, "train/loss_slope": -8.187352877781996e-06} {"step": 32550, "timestamp": 1778229721.2048888, "grad/layer_0/attn": 0.002751111052930355, "grad/layer_0/mlp": 0.0026677539572119713, "grad/layer_0/attn_mlp_ratio": 1.0312461321136548, "grad/layer_4/attn": 0.002408779226243496, "grad/layer_4/mlp": 0.0024896571412682533, "grad/layer_4/attn_mlp_ratio": 0.9675143976913234, "grad/layer_8/attn": 0.007695722859352827, "grad/layer_8/mlp": 0.0036289978306740522, "grad/layer_8/attn_mlp_ratio": 2.120619247066757, "grad/layer_12/attn": 0.006285571493208408, "grad/layer_12/mlp": 0.00588116142898798, "grad/layer_12/attn_mlp_ratio": 1.0687636213062863, "grad/layer_16/attn": 0.0038988653104752302, "grad/layer_16/mlp": 0.004348066635429859, "grad/layer_16/attn_mlp_ratio": 0.8966893904147449, "grad/layer_20/attn": 0.004860175773501396, "grad/layer_20/mlp": 0.00628406647592783, "grad/layer_20/attn_mlp_ratio": 0.7734125211402302, "grad/layer_24/attn": 0.010655349120497704, "grad/layer_24/mlp": 0.010776398703455925, "grad/layer_24/attn_mlp_ratio": 0.9887671489181152, "grad/layer_27/attn": 0.00819131825119257, "grad/layer_27/mlp": 0.01003669761121273, "grad/layer_27/attn_mlp_ratio": 0.8161367898966857} {"step": 32550, "timestamp": 1778229721.8234675, "eos/sharpness": 67.3093557357788, "eos/L0_probe": 2.0120604038238525, "eos/L_plus": 2.285698413848877, "eos/L_minus": 2.411515951156616, "eos/grad_norm": 0.19979269802570343, "eos/embed_grad_frac": 0.05633346736431122, "eos/time_s": 0.615875244140625} {"step": 32550, "timestamp": 1778229721.8433778, "train/loss": 2.157348465919495, "train/z_loss": 0.0014456590404734015, "train/perplexity": 8.648176296315164, "train/grad_norm": 0.2001953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1911533.7419098602, "perf/iters_per_sec": 0.9114903173016835, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0971043586730957, "data/tokens_consumed": 68264394752, "data/tokens_consumed_B": 68.264394752, "train/loss_slope": -1.5153556259194197e-05} {"step": 32550, "timestamp": 1778229723.2055109, "geo/rankme_last": 440.0302429199219, "geo/layer_0/stable_rank_q_proj": 18.782651901245117, "geo/layer_0/stable_rank_k_proj": 16.158769607543945, "geo/layer_0/stable_rank_o_proj": 50.29001235961914, "geo/layer_0/stable_rank_gate_proj": 141.33135986328125, "geo/layer_0/stable_rank_down_proj": 52.10405349731445, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0554787814617157, "geo/layer_0/attn_entropy_mean": 6.213245868682861, "geo/layer_0/attn_entropy_std": 0.35463955998420715, "geo/layer_7/stable_rank_q_proj": 42.79706954956055, "geo/layer_7/stable_rank_k_proj": 42.07872772216797, "geo/layer_7/stable_rank_o_proj": 103.02317810058594, "geo/layer_7/stable_rank_gate_proj": 94.20216369628906, "geo/layer_7/stable_rank_down_proj": 146.6280059814453, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5123894214630127, "geo/layer_7/attn_entropy_mean": 4.653874397277832, "geo/layer_7/attn_entropy_std": 0.8090358972549438, "geo/layer_14/stable_rank_q_proj": 55.03258514404297, "geo/layer_14/stable_rank_k_proj": 36.12035369873047, "geo/layer_14/stable_rank_o_proj": 51.28157043457031, "geo/layer_14/stable_rank_gate_proj": 79.59080505371094, "geo/layer_14/stable_rank_down_proj": 134.2134552001953, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3722740709781647, "geo/layer_14/attn_entropy_mean": 5.490743637084961, "geo/layer_14/attn_entropy_std": 0.39340588450431824, "geo/layer_21/stable_rank_q_proj": 44.53378677368164, "geo/layer_21/stable_rank_k_proj": 31.23055648803711, "geo/layer_21/stable_rank_o_proj": 78.72505187988281, "geo/layer_21/stable_rank_gate_proj": 77.35773468017578, "geo/layer_21/stable_rank_down_proj": 57.3077392578125, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14972974359989166, "geo/layer_21/attn_entropy_mean": 5.733798027038574, "geo/layer_21/attn_entropy_std": 0.29159048199653625, "geo/layer_27/stable_rank_q_proj": 41.93834686279297, "geo/layer_27/stable_rank_k_proj": 31.2403621673584, "geo/layer_27/stable_rank_o_proj": 117.74015808105469, "geo/layer_27/stable_rank_gate_proj": 87.55463409423828, "geo/layer_27/stable_rank_down_proj": 134.13662719726562, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0824672132730484, "geo/layer_27/attn_entropy_mean": 4.340082168579102, "geo/layer_27/attn_entropy_std": 0.6179707646369934, "attnres/final_alpha/block_0": 0.2409210205078125, "attnres/block_norm/0": 1.6960557699203491, "attnres/final_alpha/block_1": 0.005877777002751827, "attnres/block_norm/1": 36697.890625, "attnres/final_alpha/block_2": 0.012421994470059872, "attnres/block_norm/2": 24859.44921875, "attnres/final_alpha/block_3": 0.01408588420599699, "attnres/block_norm/3": 41643.9453125, "attnres/final_alpha/block_4": 0.01805567741394043, "attnres/block_norm/4": 11830.953125, "attnres/final_alpha/block_5": 0.5836505889892578, "attnres/block_norm/5": 5765.8349609375, "attnres/final_alpha/block_6": 0.12498704344034195, "attnres/block_norm/6": 27107.171875, "geo/tier1_time_s": 1.3593616485595703, "geo/step": 32550.0, "geo/rankme_slope": -0.00019255846869997999} {"step": 32560, "timestamp": 1778229733.5716605, "train/loss": 2.1794466018676757, "train/z_loss": 0.0014513929607346654, "train/perplexity": 8.841412083697898, "train/grad_norm": 0.12158203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1788549.310756587, "perf/iters_per_sec": 0.8528467706473288, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1725435733795166, "data/tokens_consumed": 68285366272, "data/tokens_consumed_B": 68.285366272, "train/loss_slope": -1.0917867993292729e-05} {"step": 32570, "timestamp": 1778229743.9273138, "train/loss": 2.1834984540939333, "train/z_loss": 0.0014360800967551767, "train/perplexity": 8.8773088540256, "train/grad_norm": 0.10986328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026353.7983245484, "perf/iters_per_sec": 0.9662407867071859, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349387168884276, "data/tokens_consumed": 68306337792, "data/tokens_consumed_B": 68.306337792, "train/loss_slope": -1.0773520415300667e-05} {"step": 32580, "timestamp": 1778229754.2823546, "train/loss": 2.1862635612487793, "train/z_loss": 0.001452104840427637, "train/perplexity": 8.901889532697265, "train/grad_norm": 0.1806640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027114.6584646364, "perf/iters_per_sec": 0.9666035930941755, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345502614974975, "data/tokens_consumed": 68327309312, "data/tokens_consumed_B": 68.327309312, "train/loss_slope": -9.40922670023886e-06} {"step": 32590, "timestamp": 1778229764.6490731, "train/loss": 2.1745765686035154, "train/z_loss": 0.0014610599144361913, "train/perplexity": 8.79845878962798, "train/grad_norm": 0.1142578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024367.786961801, "perf/iters_per_sec": 0.9652937826928143, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0359540462493897, "data/tokens_consumed": 68348280832, "data/tokens_consumed_B": 68.348280832, "train/loss_slope": -9.255116364278522e-06} {"step": 32600, "timestamp": 1778229774.9927013, "grad/layer_0/attn": 0.0028466631192713976, "grad/layer_0/mlp": 0.0029834904707968235, "grad/layer_0/attn_mlp_ratio": 0.9541384669136452, "grad/layer_4/attn": 0.0032278059516102076, "grad/layer_4/mlp": 0.0026099332608282566, "grad/layer_4/attn_mlp_ratio": 1.2367388378781712, "grad/layer_8/attn": 0.003879183903336525, "grad/layer_8/mlp": 0.0038074476178735495, "grad/layer_8/attn_mlp_ratio": 1.0188410165492805, "grad/layer_12/attn": 0.005354246590286493, "grad/layer_12/mlp": 0.006976116448640823, "grad/layer_12/attn_mlp_ratio": 0.7675110576140929, "grad/layer_16/attn": 0.0037107185926288366, "grad/layer_16/mlp": 0.004545297473669052, "grad/layer_16/attn_mlp_ratio": 0.8163862832931031, "grad/layer_20/attn": 0.0034243487752974033, "grad/layer_20/mlp": 0.006241349037736654, "grad/layer_20/attn_mlp_ratio": 0.5486552185637222, "grad/layer_24/attn": 0.017197245731949806, "grad/layer_24/mlp": 0.012270602397620678, "grad/layer_24/attn_mlp_ratio": 1.4014997010362307, "grad/layer_27/attn": 0.0076012033969163895, "grad/layer_27/mlp": 0.01014222763478756, "grad/layer_27/attn_mlp_ratio": 0.7494609267000063} {"step": 32600, "timestamp": 1778229775.0087752, "train/loss": 2.0775512933731077, "train/z_loss": 0.0014703052933327855, "train/perplexity": 7.984892296699832, "train/grad_norm": 0.2080078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025536.0428440096, "perf/iters_per_sec": 0.9658508505077408, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0353565454483031, "data/tokens_consumed": 68369252352, "data/tokens_consumed_B": 68.369252352, "train/loss_slope": -1.4858508997052259e-05} {"step": 32610, "timestamp": 1778229785.3610258, "train/loss": 2.1817758083343506, "train/z_loss": 0.0014347796328365802, "train/perplexity": 8.862029559755902, "train/grad_norm": 0.1630859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027275.1877826755, "perf/iters_per_sec": 0.9666801394379976, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344683408737183, "data/tokens_consumed": 68390223872, "data/tokens_consumed_B": 68.390223872, "train/loss_slope": -1.6945148143831307e-05} {"step": 32620, "timestamp": 1778229795.7154448, "train/loss": 2.1439136028289796, "train/z_loss": 0.0014483762672170997, "train/perplexity": 8.532766226943902, "train/grad_norm": 0.1767578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026485.5408486645, "perf/iters_per_sec": 0.966303606438, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348714351654054, "data/tokens_consumed": 68411195392, "data/tokens_consumed_B": 68.411195392, "train/loss_slope": -2.0589347855664967e-05} {"step": 32625, "timestamp": 1778229801.4999242, "eos/sharpness": 73.58207702636717, "eos/L0_probe": 2.0149424076080322, "eos/L_plus": 2.4988749027252197, "eos/L_minus": 2.2668306827545166, "eos/grad_norm": 0.29207003116607666, "eos/embed_grad_frac": 0.029223458841443062, "eos/time_s": 0.6177771091461182} {"step": 32625, "timestamp": 1778229802.877643, "geo/rankme_last": 439.0971374511719, "geo/layer_0/stable_rank_q_proj": 18.725135803222656, "geo/layer_0/stable_rank_k_proj": 16.147705078125, "geo/layer_0/stable_rank_o_proj": 50.152565002441406, "geo/layer_0/stable_rank_gate_proj": 141.44580078125, "geo/layer_0/stable_rank_down_proj": 52.221397399902344, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.053611356765031815, "geo/layer_0/attn_entropy_mean": 6.2118659019470215, "geo/layer_0/attn_entropy_std": 0.35742974281311035, "geo/layer_7/stable_rank_q_proj": 42.800907135009766, "geo/layer_7/stable_rank_k_proj": 42.11244201660156, "geo/layer_7/stable_rank_o_proj": 102.67933654785156, "geo/layer_7/stable_rank_gate_proj": 93.97013854980469, "geo/layer_7/stable_rank_down_proj": 146.7716522216797, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5158897638320923, "geo/layer_7/attn_entropy_mean": 4.6317901611328125, "geo/layer_7/attn_entropy_std": 0.7906610369682312, "geo/layer_14/stable_rank_q_proj": 55.04691696166992, "geo/layer_14/stable_rank_k_proj": 36.16066360473633, "geo/layer_14/stable_rank_o_proj": 51.239593505859375, "geo/layer_14/stable_rank_gate_proj": 79.52882385253906, "geo/layer_14/stable_rank_down_proj": 134.4857940673828, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39907270669937134, "geo/layer_14/attn_entropy_mean": 5.465179443359375, "geo/layer_14/attn_entropy_std": 0.37588420510292053, "geo/layer_21/stable_rank_q_proj": 44.5410270690918, "geo/layer_21/stable_rank_k_proj": 31.250295639038086, "geo/layer_21/stable_rank_o_proj": 78.56533813476562, "geo/layer_21/stable_rank_gate_proj": 77.37141418457031, "geo/layer_21/stable_rank_down_proj": 57.384830474853516, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14890111982822418, "geo/layer_21/attn_entropy_mean": 5.7384033203125, "geo/layer_21/attn_entropy_std": 0.28694891929626465, "geo/layer_27/stable_rank_q_proj": 41.99978256225586, "geo/layer_27/stable_rank_k_proj": 31.207372665405273, "geo/layer_27/stable_rank_o_proj": 117.6875, "geo/layer_27/stable_rank_gate_proj": 87.45563507080078, "geo/layer_27/stable_rank_down_proj": 134.19595336914062, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09208012372255325, "geo/layer_27/attn_entropy_mean": 4.318936347961426, "geo/layer_27/attn_entropy_std": 0.6438333988189697, "attnres/final_alpha/block_0": 0.23917651176452637, "attnres/block_norm/0": 1.6962687969207764, "attnres/final_alpha/block_1": 0.005664866417646408, "attnres/block_norm/1": 36764.14453125, "attnres/final_alpha/block_2": 0.011999152600765228, "attnres/block_norm/2": 24882.75, "attnres/final_alpha/block_3": 0.013800431042909622, "attnres/block_norm/3": 42059.23046875, "attnres/final_alpha/block_4": 0.01736840419471264, "attnres/block_norm/4": 11750.126953125, "attnres/final_alpha/block_5": 0.5921992659568787, "attnres/block_norm/5": 5704.3720703125, "attnres/final_alpha/block_6": 0.11979136615991592, "attnres/block_norm/6": 27532.4140625, "geo/tier1_time_s": 1.357187271118164, "geo/step": 32625.0, "geo/rankme_slope": -0.00018790963651085435} {"step": 32630, "timestamp": 1778229808.055397, "train/loss": 2.1769069910049437, "train/z_loss": 0.001468179444782436, "train/perplexity": 8.81898682529604, "train/grad_norm": 0.240234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1700259.5540167387, "perf/iters_per_sec": 0.8107469339450544, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2334305047988892, "data/tokens_consumed": 68432166912, "data/tokens_consumed_B": 68.432166912, "train/loss_slope": -2.1923205840348025e-05} {"step": 32640, "timestamp": 1778229818.4046788, "train/loss": 2.1937609910964966, "train/z_loss": 0.001440092024859041, "train/perplexity": 8.968881645503803, "train/grad_norm": 0.1904296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027155.4891805637, "perf/iters_per_sec": 0.9666230626967257, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345294237136842, "data/tokens_consumed": 68453138432, "data/tokens_consumed_B": 68.453138432, "train/loss_slope": -2.1918322901950417e-05} {"step": 32650, "timestamp": 1778229829.2037022, "grad/layer_0/attn": 0.0028765760362148285, "grad/layer_0/mlp": 0.0029621145222336054, "grad/layer_0/attn_mlp_ratio": 0.971122459145664, "grad/layer_4/attn": 0.0020489832386374474, "grad/layer_4/mlp": 0.0024554876144975424, "grad/layer_4/attn_mlp_ratio": 0.8344506170973547, "grad/layer_8/attn": 0.004208084661513567, "grad/layer_8/mlp": 0.003606092417612672, "grad/layer_8/attn_mlp_ratio": 1.166937520587916, "grad/layer_12/attn": 0.0061724535189569, "grad/layer_12/mlp": 0.005885417107492685, "grad/layer_12/attn_mlp_ratio": 1.0487707670237534, "grad/layer_16/attn": 0.00364944851025939, "grad/layer_16/mlp": 0.004532539751380682, "grad/layer_16/attn_mlp_ratio": 0.805166337180182, "grad/layer_20/attn": 0.004212143365293741, "grad/layer_20/mlp": 0.005819777492433786, "grad/layer_20/attn_mlp_ratio": 0.7237636315810233, "grad/layer_24/attn": 0.01043796818703413, "grad/layer_24/mlp": 0.009243703447282314, "grad/layer_24/attn_mlp_ratio": 1.129197635303107, "grad/layer_27/attn": 0.004879376385360956, "grad/layer_27/mlp": 0.008509758859872818, "grad/layer_27/attn_mlp_ratio": 0.5733859687882253} {"step": 32650, "timestamp": 1778229829.219748, "train/loss": 2.1902008295059203, "train/z_loss": 0.0014437179546803237, "train/perplexity": 8.937007749335628, "train/grad_norm": 0.126953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1939996.9484867665, "perf/iters_per_sec": 0.925062631839164, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0810078859329224, "data/tokens_consumed": 68474109952, "data/tokens_consumed_B": 68.474109952, "train/loss_slope": -2.1546081819943497e-05} {"step": 32660, "timestamp": 1778229840.0840924, "train/loss": 2.153598427772522, "train/z_loss": 0.0014418632490560412, "train/perplexity": 8.615806038088483, "train/grad_norm": 0.12255859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1931356.9751984857, "perf/iters_per_sec": 0.9209427715294293, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.085843801498413, "data/tokens_consumed": 68495081472, "data/tokens_consumed_B": 68.495081472, "train/loss_slope": -2.047411112895502e-05} {"step": 32670, "timestamp": 1778229850.434561, "train/loss": 2.177363467216492, "train/z_loss": 0.0014496638323180377, "train/perplexity": 8.823013401940052, "train/grad_norm": 0.09716796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027303.3157347473, "perf/iters_per_sec": 0.9666935518907296, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344539880752563, "data/tokens_consumed": 68516052992, "data/tokens_consumed_B": 68.516052992, "train/loss_slope": -1.894642842484782e-05} {"step": 32680, "timestamp": 1778229860.789725, "train/loss": 2.143858480453491, "train/z_loss": 0.0014610955957323312, "train/perplexity": 8.53229589356305, "train/grad_norm": 0.10595703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026548.8038887577, "perf/iters_per_sec": 0.9663337726062573, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034839129447937, "data/tokens_consumed": 68537024512, "data/tokens_consumed_B": 68.537024512, "train/loss_slope": -2.180158677774977e-05} {"step": 32690, "timestamp": 1778229871.139072, "train/loss": 2.2060215950012205, "train/z_loss": 0.0014439173624850809, "train/perplexity": 9.079522426242939, "train/grad_norm": 0.1298828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027346.116574534, "perf/iters_per_sec": 0.9667139609215422, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344321489334107, "data/tokens_consumed": 68557996032, "data/tokens_consumed_B": 68.557996032, "train/loss_slope": -2.0069513083434237e-05} {"step": 32700, "timestamp": 1778229881.479489, "grad/layer_0/attn": 0.002826028037816286, "grad/layer_0/mlp": 0.0028803953900933266, "grad/layer_0/attn_mlp_ratio": 0.9811250043738685, "grad/layer_4/attn": 0.0023778388276696205, "grad/layer_4/mlp": 0.0024233083240687847, "grad/layer_4/attn_mlp_ratio": 0.9812365624005797, "grad/layer_8/attn": 0.0038159789983183146, "grad/layer_8/mlp": 0.0036669287364929914, "grad/layer_8/attn_mlp_ratio": 1.0406471378287976, "grad/layer_12/attn": 0.003951212391257286, "grad/layer_12/mlp": 0.0057046678848564625, "grad/layer_12/attn_mlp_ratio": 0.6926279323785579, "grad/layer_16/attn": 0.0033955734688788652, "grad/layer_16/mlp": 0.004318546038120985, "grad/layer_16/attn_mlp_ratio": 0.7862769923667625, "grad/layer_20/attn": 0.005294990260154009, "grad/layer_20/mlp": 0.005578962154686451, "grad/layer_20/attn_mlp_ratio": 0.949099495288053, "grad/layer_24/attn": 0.01079684216529131, "grad/layer_24/mlp": 0.007924718782305717, "grad/layer_24/attn_mlp_ratio": 1.3624258886202838, "grad/layer_27/attn": 0.00595976784825325, "grad/layer_27/mlp": 0.007402174640446901, "grad/layer_27/attn_mlp_ratio": 0.8051374166686361} {"step": 32700, "timestamp": 1778229882.0873296, "eos/sharpness": 39.848661422729485, "eos/L0_probe": 2.016211748123169, "eos/L_plus": 2.26369571685791, "eos/L_minus": 2.1672143936157227, "eos/grad_norm": 0.11994118988513947, "eos/embed_grad_frac": 0.14897629618644714, "eos/time_s": 0.6043891906738281} {"step": 32700, "timestamp": 1778229882.1087108, "train/loss": 2.174835753440857, "train/z_loss": 0.0014506282866932451, "train/perplexity": 8.800739512289825, "train/grad_norm": 0.1201171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1912781.3591398646, "perf/iters_per_sec": 0.9120852275561641, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0963887691497802, "data/tokens_consumed": 68578967552, "data/tokens_consumed_B": 68.578967552, "train/loss_slope": -2.163780445885169e-05} {"step": 32700, "timestamp": 1778229883.4712336, "geo/rankme_last": 439.9742431640625, "geo/layer_0/stable_rank_q_proj": 18.74589729309082, "geo/layer_0/stable_rank_k_proj": 16.15342903137207, "geo/layer_0/stable_rank_o_proj": 50.05109786987305, "geo/layer_0/stable_rank_gate_proj": 141.35731506347656, "geo/layer_0/stable_rank_down_proj": 52.321556091308594, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05673649534583092, "geo/layer_0/attn_entropy_mean": 6.2058234214782715, "geo/layer_0/attn_entropy_std": 0.3588517904281616, "geo/layer_7/stable_rank_q_proj": 42.808536529541016, "geo/layer_7/stable_rank_k_proj": 42.07609176635742, "geo/layer_7/stable_rank_o_proj": 102.60807037353516, "geo/layer_7/stable_rank_gate_proj": 93.76042175292969, "geo/layer_7/stable_rank_down_proj": 147.03929138183594, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.524682343006134, "geo/layer_7/attn_entropy_mean": 4.6390485763549805, "geo/layer_7/attn_entropy_std": 0.8040968775749207, "geo/layer_14/stable_rank_q_proj": 55.08363342285156, "geo/layer_14/stable_rank_k_proj": 36.2158088684082, "geo/layer_14/stable_rank_o_proj": 51.2052001953125, "geo/layer_14/stable_rank_gate_proj": 79.5535659790039, "geo/layer_14/stable_rank_down_proj": 134.77552795410156, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3957582116127014, "geo/layer_14/attn_entropy_mean": 5.511120796203613, "geo/layer_14/attn_entropy_std": 0.39785102009773254, "geo/layer_21/stable_rank_q_proj": 44.45655822753906, "geo/layer_21/stable_rank_k_proj": 31.276103973388672, "geo/layer_21/stable_rank_o_proj": 78.55028533935547, "geo/layer_21/stable_rank_gate_proj": 77.23322296142578, "geo/layer_21/stable_rank_down_proj": 57.47669219970703, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15098628401756287, "geo/layer_21/attn_entropy_mean": 5.720272064208984, "geo/layer_21/attn_entropy_std": 0.2921162247657776, "geo/layer_27/stable_rank_q_proj": 42.04584884643555, "geo/layer_27/stable_rank_k_proj": 31.25186538696289, "geo/layer_27/stable_rank_o_proj": 117.59190368652344, "geo/layer_27/stable_rank_gate_proj": 87.51837158203125, "geo/layer_27/stable_rank_down_proj": 134.08934020996094, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07772815227508545, "geo/layer_27/attn_entropy_mean": 4.299434661865234, "geo/layer_27/attn_entropy_std": 0.6129103899002075, "attnres/final_alpha/block_0": 0.2385663241147995, "attnres/block_norm/0": 1.6964595317840576, "attnres/final_alpha/block_1": 0.005673019215464592, "attnres/block_norm/1": 36913.82421875, "attnres/final_alpha/block_2": 0.011919173412024975, "attnres/block_norm/2": 25037.41796875, "attnres/final_alpha/block_3": 0.013724596239626408, "attnres/block_norm/3": 41789.20703125, "attnres/final_alpha/block_4": 0.017560850828886032, "attnres/block_norm/4": 11773.71875, "attnres/final_alpha/block_5": 0.5912142395973206, "attnres/block_norm/5": 5681.7060546875, "attnres/final_alpha/block_6": 0.12134182453155518, "attnres/block_norm/6": 27522.359375, "geo/tier1_time_s": 1.3583366870880127, "geo/step": 32700.0, "geo/rankme_slope": -0.00018748655712284913} {"step": 32710, "timestamp": 1778229893.8245082, "train/loss": 2.2121728658676147, "train/z_loss": 0.0014502546517178416, "train/perplexity": 9.135545156870123, "train/grad_norm": 0.255859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1790643.794211115, "perf/iters_per_sec": 0.8538454981856894, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.171172070503235, "data/tokens_consumed": 68599939072, "data/tokens_consumed_B": 68.599939072, "train/loss_slope": -1.5495092719301566e-05} {"step": 32720, "timestamp": 1778229904.1794767, "train/loss": 2.2035058736801147, "train/z_loss": 0.00145008887629956, "train/perplexity": 9.056709585496305, "train/grad_norm": 0.0986328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026610.996941292, "perf/iters_per_sec": 0.9663634285646877, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348073720932007, "data/tokens_consumed": 68620910592, "data/tokens_consumed_B": 68.620910592, "train/loss_slope": -1.4057304899935055e-05} {"step": 32730, "timestamp": 1778229914.9531696, "train/loss": 2.177591395378113, "train/z_loss": 0.0014583191950805485, "train/perplexity": 8.825024644365401, "train/grad_norm": 0.203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1947588.292383228, "perf/iters_per_sec": 0.9286824666896953, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0767943143844605, "data/tokens_consumed": 68641882112, "data/tokens_consumed_B": 68.641882112, "train/loss_slope": -1.4363900834720778e-05} {"step": 32740, "timestamp": 1778229925.3017504, "train/loss": 2.231196427345276, "train/z_loss": 0.0014441794366575778, "train/perplexity": 9.310999352316463, "train/grad_norm": 0.185546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027789.3703805949, "perf/iters_per_sec": 0.966925320806787, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342060327529907, "data/tokens_consumed": 68662853632, "data/tokens_consumed_B": 68.662853632, "train/loss_slope": -9.245395467262037e-06} {"step": 32750, "timestamp": 1778229935.6410432, "grad/layer_0/attn": 0.002727026352658868, "grad/layer_0/mlp": 0.0027201741468161345, "grad/layer_0/attn_mlp_ratio": 1.0025189951896476, "grad/layer_4/attn": 0.0033661213237792253, "grad/layer_4/mlp": 0.00260869855992496, "grad/layer_4/attn_mlp_ratio": 1.2903450197180124, "grad/layer_8/attn": 0.004179667681455612, "grad/layer_8/mlp": 0.0037868167273700237, "grad/layer_8/attn_mlp_ratio": 1.103741710252837, "grad/layer_12/attn": 0.004939975682646036, "grad/layer_12/mlp": 0.006284892093390226, "grad/layer_12/attn_mlp_ratio": 0.7860080221966849, "grad/layer_16/attn": 0.004255607724189758, "grad/layer_16/mlp": 0.004436443094164133, "grad/layer_16/attn_mlp_ratio": 0.959238637336267, "grad/layer_20/attn": 0.0037520607002079487, "grad/layer_20/mlp": 0.0058868261985480785, "grad/layer_20/attn_mlp_ratio": 0.6373656211214104, "grad/layer_24/attn": 0.008334601297974586, "grad/layer_24/mlp": 0.008017626591026783, "grad/layer_24/attn_mlp_ratio": 1.0395347175870082, "grad/layer_27/attn": 0.008692785166203976, "grad/layer_27/mlp": 0.007188418880105019, "grad/layer_27/attn_mlp_ratio": 1.2092763638656159} {"step": 32750, "timestamp": 1778229935.6567852, "train/loss": 2.206298995018005, "train/z_loss": 0.0014543548342771829, "train/perplexity": 9.082041435286797, "train/grad_norm": 0.11669921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026298.4360965537, "perf/iters_per_sec": 0.9662143879397171, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349669933319092, "data/tokens_consumed": 68683825152, "data/tokens_consumed_B": 68.683825152, "train/loss_slope": -8.423875446187979e-06} {"step": 32760, "timestamp": 1778229946.0203454, "train/loss": 2.1839319705963134, "train/z_loss": 0.0014599811634980141, "train/perplexity": 8.881158148216537, "train/grad_norm": 0.25, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024696.9494400586, "perf/iters_per_sec": 0.9654507395935338, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0357856273651123, "data/tokens_consumed": 68704796672, "data/tokens_consumed_B": 68.704796672, "train/loss_slope": -9.62019871563803e-06} {"step": 32770, "timestamp": 1778229956.3818007, "train/loss": 2.213623595237732, "train/z_loss": 0.0014343925518915056, "train/perplexity": 9.148807978597638, "train/grad_norm": 0.09814453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025288.0702832919, "perf/iters_per_sec": 0.9657326079765758, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354833126068115, "data/tokens_consumed": 68725768192, "data/tokens_consumed_B": 68.725768192, "train/loss_slope": -4.478269270007915e-06} {"step": 32775, "timestamp": 1778229962.1615484, "eos/sharpness": 27.04396247863769, "eos/L0_probe": 2.0149405002593994, "eos/L_plus": 2.147522449493408, "eos/L_minus": 2.1527981758117676, "eos/grad_norm": 0.11809905618429184, "eos/embed_grad_frac": 0.18120130896568298, "eos/time_s": 0.6124622821807861} {"step": 32775, "timestamp": 1778229963.5392656, "geo/rankme_last": 439.0711669921875, "geo/layer_0/stable_rank_q_proj": 18.74540138244629, "geo/layer_0/stable_rank_k_proj": 16.147254943847656, "geo/layer_0/stable_rank_o_proj": 49.986717224121094, "geo/layer_0/stable_rank_gate_proj": 140.7217559814453, "geo/layer_0/stable_rank_down_proj": 52.36461639404297, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.059276264160871506, "geo/layer_0/attn_entropy_mean": 6.20166015625, "geo/layer_0/attn_entropy_std": 0.36483699083328247, "geo/layer_7/stable_rank_q_proj": 42.84416580200195, "geo/layer_7/stable_rank_k_proj": 42.07915115356445, "geo/layer_7/stable_rank_o_proj": 102.41543579101562, "geo/layer_7/stable_rank_gate_proj": 93.86898803710938, "geo/layer_7/stable_rank_down_proj": 146.81689453125, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5241084098815918, "geo/layer_7/attn_entropy_mean": 4.6506667137146, "geo/layer_7/attn_entropy_std": 0.8428062796592712, "geo/layer_14/stable_rank_q_proj": 55.20882797241211, "geo/layer_14/stable_rank_k_proj": 36.25191116333008, "geo/layer_14/stable_rank_o_proj": 51.09825134277344, "geo/layer_14/stable_rank_gate_proj": 79.43656158447266, "geo/layer_14/stable_rank_down_proj": 135.1818389892578, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3732249438762665, "geo/layer_14/attn_entropy_mean": 5.496189117431641, "geo/layer_14/attn_entropy_std": 0.3864635229110718, "geo/layer_21/stable_rank_q_proj": 44.345706939697266, "geo/layer_21/stable_rank_k_proj": 31.213716506958008, "geo/layer_21/stable_rank_o_proj": 78.37808990478516, "geo/layer_21/stable_rank_gate_proj": 77.01284790039062, "geo/layer_21/stable_rank_down_proj": 57.424991607666016, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14333927631378174, "geo/layer_21/attn_entropy_mean": 5.737667083740234, "geo/layer_21/attn_entropy_std": 0.2859092056751251, "geo/layer_27/stable_rank_q_proj": 42.06527328491211, "geo/layer_27/stable_rank_k_proj": 31.324390411376953, "geo/layer_27/stable_rank_o_proj": 117.62555694580078, "geo/layer_27/stable_rank_gate_proj": 87.52957916259766, "geo/layer_27/stable_rank_down_proj": 134.32518005371094, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08872222155332565, "geo/layer_27/attn_entropy_mean": 4.309159755706787, "geo/layer_27/attn_entropy_std": 0.6200004816055298, "attnres/final_alpha/block_0": 0.24042250216007233, "attnres/block_norm/0": 1.6965441703796387, "attnres/final_alpha/block_1": 0.005759121384471655, "attnres/block_norm/1": 36831.328125, "attnres/final_alpha/block_2": 0.012314219027757645, "attnres/block_norm/2": 25098.224609375, "attnres/final_alpha/block_3": 0.014212521724402905, "attnres/block_norm/3": 41921.9765625, "attnres/final_alpha/block_4": 0.017546750605106354, "attnres/block_norm/4": 11856.751953125, "attnres/final_alpha/block_5": 0.5868886709213257, "attnres/block_norm/5": 5711.08447265625, "attnres/final_alpha/block_6": 0.1228562593460083, "attnres/block_norm/6": 27457.83984375, "geo/tier1_time_s": 1.3579401969909668, "geo/step": 32775.0, "geo/rankme_slope": -0.00017476941948654461} {"step": 32780, "timestamp": 1778229968.7229683, "train/loss": 2.1033055782318115, "train/z_loss": 0.0014567505801096558, "train/perplexity": 8.193208490514719, "train/grad_norm": 0.2421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1699827.0550950714, "perf/iters_per_sec": 0.8105407023883207, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2337443351745605, "data/tokens_consumed": 68746739712, "data/tokens_consumed_B": 68.746739712, "train/loss_slope": -8.34486659067444e-06} {"step": 32790, "timestamp": 1778229979.0765567, "train/loss": 2.1526413202285766, "train/z_loss": 0.0014551807660609483, "train/perplexity": 8.607563730149014, "train/grad_norm": 0.125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026404.401860402, "perf/iters_per_sec": 0.9662649163534174, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349128723144532, "data/tokens_consumed": 68767711232, "data/tokens_consumed_B": 68.767711232, "train/loss_slope": -1.2153997680213136e-05} {"step": 32800, "timestamp": 1778229989.4177136, "grad/layer_0/attn": 0.0030627513770014048, "grad/layer_0/mlp": 0.002912115305662155, "grad/layer_0/attn_mlp_ratio": 1.051727335752684, "grad/layer_4/attn": 0.0018323517870157957, "grad/layer_4/mlp": 0.0024658050388097763, "grad/layer_4/attn_mlp_ratio": 0.7431048618465678, "grad/layer_8/attn": 0.005424578674137592, "grad/layer_8/mlp": 0.003795451717451215, "grad/layer_8/attn_mlp_ratio": 1.4292313366213132, "grad/layer_12/attn": 0.0036597067955881357, "grad/layer_12/mlp": 0.0059256781823933125, "grad/layer_12/attn_mlp_ratio": 0.6176013312200984, "grad/layer_16/attn": 0.0040214285254478455, "grad/layer_16/mlp": 0.004799953196197748, "grad/layer_16/attn_mlp_ratio": 0.837805761283843, "grad/layer_20/attn": 0.003277583746239543, "grad/layer_20/mlp": 0.006065843626856804, "grad/layer_20/attn_mlp_ratio": 0.5403343531136302, "grad/layer_24/attn": 0.011729241348803043, "grad/layer_24/mlp": 0.012329667806625366, "grad/layer_24/attn_mlp_ratio": 0.9513022927811638, "grad/layer_27/attn": 0.009309764951467514, "grad/layer_27/mlp": 0.011683186516165733, "grad/layer_27/attn_mlp_ratio": 0.7968515146874249} {"step": 32800, "timestamp": 1778229989.4336193, "train/loss": 2.1885045051574705, "train/z_loss": 0.0014588614576496184, "train/perplexity": 8.921860536413387, "train/grad_norm": 0.146484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025905.8525276824, "perf/iters_per_sec": 0.9660271895063793, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351675510406495, "data/tokens_consumed": 68788682752, "data/tokens_consumed_B": 68.788682752, "train/loss_slope": -1.2814820402919673e-05} {"step": 32810, "timestamp": 1778229999.7873378, "train/loss": 2.2134505987167357, "train/z_loss": 0.0014525704318657517, "train/perplexity": 9.147225403540013, "train/grad_norm": 0.150390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026600.257628027, "perf/iters_per_sec": 0.9663583076610693, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03481285572052, "data/tokens_consumed": 68809654272, "data/tokens_consumed_B": 68.809654272, "train/loss_slope": -9.952320460260325e-06} {"step": 32820, "timestamp": 1778230010.1365228, "train/loss": 2.1683403491973876, "train/z_loss": 0.0014633007813245058, "train/perplexity": 8.74376040302032, "train/grad_norm": 0.1123046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027510.7484381348, "perf/iters_per_sec": 0.9667924635115313, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034348154067993, "data/tokens_consumed": 68830625792, "data/tokens_consumed_B": 68.830625792, "train/loss_slope": -1.2144511639446004e-05} {"step": 32830, "timestamp": 1778230020.4869223, "train/loss": 2.2178895235061646, "train/z_loss": 0.001455699559301138, "train/perplexity": 9.187919501338165, "train/grad_norm": 0.2431640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027209.403194803, "perf/iters_per_sec": 0.9666487709020629, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345019102096558, "data/tokens_consumed": 68851597312, "data/tokens_consumed_B": 68.851597312, "train/loss_slope": -1.2570528328830073e-05} {"step": 32840, "timestamp": 1778230030.8364727, "train/loss": 2.1019444823265077, "train/z_loss": 0.0014700497034937144, "train/perplexity": 8.182064333840442, "train/grad_norm": 0.275390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027397.3304345885, "perf/iters_per_sec": 0.9667383815930312, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034406018257141, "data/tokens_consumed": 68872568832, "data/tokens_consumed_B": 68.872568832, "train/loss_slope": -1.7254765570932716e-05} {"step": 32850, "timestamp": 1778230041.1763754, "grad/layer_0/attn": 0.0027146590873599052, "grad/layer_0/mlp": 0.0027327409479767084, "grad/layer_0/attn_mlp_ratio": 0.993383214765193, "grad/layer_4/attn": 0.002388743916526437, "grad/layer_4/mlp": 0.0025628244038671255, "grad/layer_4/attn_mlp_ratio": 0.9320747140204048, "grad/layer_8/attn": 0.004747329279780388, "grad/layer_8/mlp": 0.0038057139609009027, "grad/layer_8/attn_mlp_ratio": 1.2474214309880607, "grad/layer_12/attn": 0.004465368576347828, "grad/layer_12/mlp": 0.00675841374322772, "grad/layer_12/attn_mlp_ratio": 0.660712510350688, "grad/layer_16/attn": 0.005987075157463551, "grad/layer_16/mlp": 0.004954497795552015, "grad/layer_16/attn_mlp_ratio": 1.208412090120888, "grad/layer_20/attn": 0.007129771634936333, "grad/layer_20/mlp": 0.006775174755603075, "grad/layer_20/attn_mlp_ratio": 1.0523376572399465, "grad/layer_24/attn": 0.013144378550350666, "grad/layer_24/mlp": 0.01282875519245863, "grad/layer_24/attn_mlp_ratio": 1.0246027966623992, "grad/layer_27/attn": 0.005048760678619146, "grad/layer_27/mlp": 0.013173947110772133, "grad/layer_27/attn_mlp_ratio": 0.38323826548286555} {"step": 32850, "timestamp": 1778230041.7909439, "eos/sharpness": 45.10862827301025, "eos/L0_probe": 2.018423557281494, "eos/L_plus": 2.26543927192688, "eos/L_minus": 2.222494125366211, "eos/grad_norm": 0.18354032933712006, "eos/embed_grad_frac": 0.08987191319465637, "eos/time_s": 0.6117925643920898} {"step": 32850, "timestamp": 1778230041.8117223, "train/loss": 2.1944482922554016, "train/z_loss": 0.001440264587290585, "train/perplexity": 8.975048087111327, "train/grad_norm": 0.18359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1911691.7759107128, "perf/iters_per_sec": 0.9115656737855495, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0970136642456054, "data/tokens_consumed": 68893540352, "data/tokens_consumed_B": 68.893540352, "train/loss_slope": -1.481056176897217e-05} {"step": 32850, "timestamp": 1778230043.1764429, "geo/rankme_last": 439.4159240722656, "geo/layer_0/stable_rank_q_proj": 18.772254943847656, "geo/layer_0/stable_rank_k_proj": 16.113344192504883, "geo/layer_0/stable_rank_o_proj": 49.93976593017578, "geo/layer_0/stable_rank_gate_proj": 140.7568359375, "geo/layer_0/stable_rank_down_proj": 52.240867614746094, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.057225871831178665, "geo/layer_0/attn_entropy_mean": 6.203583240509033, "geo/layer_0/attn_entropy_std": 0.3652356266975403, "geo/layer_7/stable_rank_q_proj": 42.898921966552734, "geo/layer_7/stable_rank_k_proj": 42.062965393066406, "geo/layer_7/stable_rank_o_proj": 102.1159439086914, "geo/layer_7/stable_rank_gate_proj": 93.62122344970703, "geo/layer_7/stable_rank_down_proj": 146.8162078857422, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5207006335258484, "geo/layer_7/attn_entropy_mean": 4.650937080383301, "geo/layer_7/attn_entropy_std": 0.8190252780914307, "geo/layer_14/stable_rank_q_proj": 55.182552337646484, "geo/layer_14/stable_rank_k_proj": 36.27878189086914, "geo/layer_14/stable_rank_o_proj": 51.07155990600586, "geo/layer_14/stable_rank_gate_proj": 79.25601959228516, "geo/layer_14/stable_rank_down_proj": 135.2752227783203, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3987281024456024, "geo/layer_14/attn_entropy_mean": 5.464698791503906, "geo/layer_14/attn_entropy_std": 0.3952910602092743, "geo/layer_21/stable_rank_q_proj": 44.30778121948242, "geo/layer_21/stable_rank_k_proj": 31.170700073242188, "geo/layer_21/stable_rank_o_proj": 78.20182800292969, "geo/layer_21/stable_rank_gate_proj": 77.04495239257812, "geo/layer_21/stable_rank_down_proj": 57.39606475830078, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14896507561206818, "geo/layer_21/attn_entropy_mean": 5.737429618835449, "geo/layer_21/attn_entropy_std": 0.27764880657196045, "geo/layer_27/stable_rank_q_proj": 42.11345672607422, "geo/layer_27/stable_rank_k_proj": 31.44623565673828, "geo/layer_27/stable_rank_o_proj": 117.91513061523438, "geo/layer_27/stable_rank_gate_proj": 87.35051727294922, "geo/layer_27/stable_rank_down_proj": 134.0886993408203, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07989561557769775, "geo/layer_27/attn_entropy_mean": 4.292387962341309, "geo/layer_27/attn_entropy_std": 0.6419063210487366, "attnres/final_alpha/block_0": 0.23949545621871948, "attnres/block_norm/0": 1.696758508682251, "attnres/final_alpha/block_1": 0.005651956889778376, "attnres/block_norm/1": 37062.69921875, "attnres/final_alpha/block_2": 0.012053059414029121, "attnres/block_norm/2": 24981.37890625, "attnres/final_alpha/block_3": 0.013844411820173264, "attnres/block_norm/3": 42209.4453125, "attnres/final_alpha/block_4": 0.017456311732530594, "attnres/block_norm/4": 11795.0576171875, "attnres/final_alpha/block_5": 0.5906124711036682, "attnres/block_norm/5": 5713.31201171875, "attnres/final_alpha/block_6": 0.12088631838560104, "attnres/block_norm/6": 27388.85546875, "geo/tier1_time_s": 1.3608791828155518, "geo/step": 32850.0, "geo/rankme_slope": -0.00015926233774759903} {"step": 32860, "timestamp": 1778230053.5266852, "train/loss": 2.178126311302185, "train/z_loss": 0.0014516251510940492, "train/perplexity": 8.829746553377571, "train/grad_norm": 0.10546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1790688.4497679744, "perf/iters_per_sec": 0.8538667916145203, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.171142864227295, "data/tokens_consumed": 68914511872, "data/tokens_consumed_B": 68.914511872, "train/loss_slope": -1.399445920744496e-05} {"step": 32870, "timestamp": 1778230063.8725107, "train/loss": 2.149294137954712, "train/z_loss": 0.001459905446972698, "train/perplexity": 8.578800809633233, "train/grad_norm": 0.1416015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027916.6710758095, "perf/iters_per_sec": 0.9669860225085304, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341411113739014, "data/tokens_consumed": 68935483392, "data/tokens_consumed_B": 68.935483392, "train/loss_slope": -1.4658446208943573e-05} {"step": 32880, "timestamp": 1778230074.2200234, "train/loss": 2.2493664264678954, "train/z_loss": 0.0014335785410366953, "train/perplexity": 9.481726561912645, "train/grad_norm": 0.1611328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027783.5737407939, "perf/iters_per_sec": 0.9669225567535371, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342089891433717, "data/tokens_consumed": 68956454912, "data/tokens_consumed_B": 68.956454912, "train/loss_slope": -1.083451545599736e-05} {"step": 32890, "timestamp": 1778230084.5691352, "train/loss": 2.1508466005325317, "train/z_loss": 0.0014532832545228302, "train/perplexity": 8.59212942026038, "train/grad_norm": 0.2138671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027815.1751792699, "perf/iters_per_sec": 0.9669376254936551, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341928720474243, "data/tokens_consumed": 68977426432, "data/tokens_consumed_B": 68.977426432, "train/loss_slope": -1.5792849905813503e-05} {"step": 32900, "timestamp": 1778230094.9254863, "grad/layer_0/attn": 0.0025368519127368927, "grad/layer_0/mlp": 0.0028016511350870132, "grad/layer_0/attn_mlp_ratio": 0.9054845517408249, "grad/layer_4/attn": 0.0019656000658869743, "grad/layer_4/mlp": 0.002599230967462063, "grad/layer_4/attn_mlp_ratio": 0.7562236734136233, "grad/layer_8/attn": 0.004538666922599077, "grad/layer_8/mlp": 0.0038124765269458294, "grad/layer_8/attn_mlp_ratio": 1.1904773108694433, "grad/layer_12/attn": 0.004296812228858471, "grad/layer_12/mlp": 0.006724583450704813, "grad/layer_12/attn_mlp_ratio": 0.6389707550600834, "grad/layer_16/attn": 0.003739542094990611, "grad/layer_16/mlp": 0.004697095137089491, "grad/layer_16/attn_mlp_ratio": 0.796139295933838, "grad/layer_20/attn": 0.004479057155549526, "grad/layer_20/mlp": 0.006344255991280079, "grad/layer_20/attn_mlp_ratio": 0.70600194744752, "grad/layer_24/attn": 0.01267383061349392, "grad/layer_24/mlp": 0.011596640571951866, "grad/layer_24/attn_mlp_ratio": 1.0928881019955539, "grad/layer_27/attn": 0.004962463863193989, "grad/layer_27/mlp": 0.010979311540722847, "grad/layer_27/attn_mlp_ratio": 0.4519831502721854} {"step": 32900, "timestamp": 1778230094.941955, "train/loss": 2.1789193153381348, "train/z_loss": 0.0014484876999631524, "train/perplexity": 8.83675135508174, "train/grad_norm": 0.1552734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022759.3485247637, "perf/iters_per_sec": 0.9645268194793528, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0367778062820434, "data/tokens_consumed": 68998397952, "data/tokens_consumed_B": 68.998397952, "train/loss_slope": -1.4663598096088166e-05} {"step": 32910, "timestamp": 1778230105.303446, "train/loss": 2.198481297492981, "train/z_loss": 0.001430295023601502, "train/perplexity": 9.01131759144397, "train/grad_norm": 0.11083984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025716.2423020257, "perf/iters_per_sec": 0.9659367763052109, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352644443511962, "data/tokens_consumed": 69019369472, "data/tokens_consumed_B": 69.019369472, "train/loss_slope": -1.0430443018648037e-05} {"step": 32920, "timestamp": 1778230115.6647544, "train/loss": 2.1695210218429564, "train/z_loss": 0.0014453788520768286, "train/perplexity": 8.754090018494841, "train/grad_norm": 0.1865234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025571.0259931486, "perf/iters_per_sec": 0.9658675317731612, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0353386640548705, "data/tokens_consumed": 69040340992, "data/tokens_consumed_B": 69.040340992, "train/loss_slope": -1.2341892565473455e-05} {"step": 32925, "timestamp": 1778230121.4519055, "eos/sharpness": 13.406300544738768, "eos/L0_probe": 2.011500597000122, "eos/L_plus": 2.088301658630371, "eos/L_minus": 2.0687625408172607, "eos/grad_norm": 0.11663083732128143, "eos/embed_grad_frac": 0.18057510256767273, "eos/time_s": 0.6200714111328125} {"step": 32925, "timestamp": 1778230122.8318968, "geo/rankme_last": 439.48004150390625, "geo/layer_0/stable_rank_q_proj": 18.767704010009766, "geo/layer_0/stable_rank_k_proj": 16.1247501373291, "geo/layer_0/stable_rank_o_proj": 49.941917419433594, "geo/layer_0/stable_rank_gate_proj": 140.9161834716797, "geo/layer_0/stable_rank_down_proj": 52.262672424316406, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05828885734081268, "geo/layer_0/attn_entropy_mean": 6.2046027183532715, "geo/layer_0/attn_entropy_std": 0.3633807897567749, "geo/layer_7/stable_rank_q_proj": 42.785770416259766, "geo/layer_7/stable_rank_k_proj": 41.76081085205078, "geo/layer_7/stable_rank_o_proj": 102.42185974121094, "geo/layer_7/stable_rank_gate_proj": 93.80159759521484, "geo/layer_7/stable_rank_down_proj": 147.0181121826172, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5308700203895569, "geo/layer_7/attn_entropy_mean": 4.664315223693848, "geo/layer_7/attn_entropy_std": 0.813011109828949, "geo/layer_14/stable_rank_q_proj": 54.98360061645508, "geo/layer_14/stable_rank_k_proj": 36.33965301513672, "geo/layer_14/stable_rank_o_proj": 51.082210540771484, "geo/layer_14/stable_rank_gate_proj": 79.30445098876953, "geo/layer_14/stable_rank_down_proj": 135.10047912597656, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3783304691314697, "geo/layer_14/attn_entropy_mean": 5.4931640625, "geo/layer_14/attn_entropy_std": 0.39249685406684875, "geo/layer_21/stable_rank_q_proj": 44.39535140991211, "geo/layer_21/stable_rank_k_proj": 31.03798484802246, "geo/layer_21/stable_rank_o_proj": 78.15093994140625, "geo/layer_21/stable_rank_gate_proj": 77.01569366455078, "geo/layer_21/stable_rank_down_proj": 57.32947540283203, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14615623652935028, "geo/layer_21/attn_entropy_mean": 5.739374160766602, "geo/layer_21/attn_entropy_std": 0.28861740231513977, "geo/layer_27/stable_rank_q_proj": 42.125064849853516, "geo/layer_27/stable_rank_k_proj": 31.374725341796875, "geo/layer_27/stable_rank_o_proj": 117.84272766113281, "geo/layer_27/stable_rank_gate_proj": 87.32353210449219, "geo/layer_27/stable_rank_down_proj": 133.98915100097656, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09292415529489517, "geo/layer_27/attn_entropy_mean": 4.332818984985352, "geo/layer_27/attn_entropy_std": 0.6308423280715942, "attnres/final_alpha/block_0": 0.23868101835250854, "attnres/block_norm/0": 1.6969023942947388, "attnres/final_alpha/block_1": 0.005711473058909178, "attnres/block_norm/1": 36967.859375, "attnres/final_alpha/block_2": 0.011931659653782845, "attnres/block_norm/2": 25030.62890625, "attnres/final_alpha/block_3": 0.013697950169444084, "attnres/block_norm/3": 42130.24609375, "attnres/final_alpha/block_4": 0.017302000895142555, "attnres/block_norm/4": 11840.23046875, "attnres/final_alpha/block_5": 0.5916315913200378, "attnres/block_norm/5": 5689.3544921875, "attnres/final_alpha/block_6": 0.12104429304599762, "attnres/block_norm/6": 27574.271484375, "geo/tier1_time_s": 1.359832525253296, "geo/step": 32925.0, "geo/rankme_slope": -0.00015822131587009804} {"step": 32930, "timestamp": 1778230128.0157776, "train/loss": 2.215201210975647, "train/z_loss": 0.0014361374895088374, "train/perplexity": 9.163252673140642, "train/grad_norm": 0.12353515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1698697.4458160158, "perf/iters_per_sec": 0.8100020627098159, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.234564757347107, "data/tokens_consumed": 69061312512, "data/tokens_consumed_B": 69.061312512, "train/loss_slope": -9.167598247385032e-06} {"step": 32940, "timestamp": 1778230138.375995, "train/loss": 2.160148811340332, "train/z_loss": 0.00144166168756783, "train/perplexity": 8.672428118095748, "train/grad_norm": 0.236328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025193.458429073, "perf/iters_per_sec": 0.9656874935288777, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0355316877365113, "data/tokens_consumed": 69082284032, "data/tokens_consumed_B": 69.082284032, "train/loss_slope": -1.2658345524055126e-05} {"step": 32950, "timestamp": 1778230148.7162359, "grad/layer_0/attn": 0.0030948999337852, "grad/layer_0/mlp": 0.0031054578721523285, "grad/layer_0/attn_mlp_ratio": 0.9966001670408018, "grad/layer_4/attn": 0.002423814730718732, "grad/layer_4/mlp": 0.0024788270238786936, "grad/layer_4/attn_mlp_ratio": 0.9778070876221968, "grad/layer_8/attn": 0.005036462564021349, "grad/layer_8/mlp": 0.003629186423495412, "grad/layer_8/attn_mlp_ratio": 1.3877662477294042, "grad/layer_12/attn": 0.004785553086549044, "grad/layer_12/mlp": 0.006312756799161434, "grad/layer_12/attn_mlp_ratio": 0.7580765682874214, "grad/layer_16/attn": 0.00447506457567215, "grad/layer_16/mlp": 0.0047212932258844376, "grad/layer_16/attn_mlp_ratio": 0.9478471822832228, "grad/layer_20/attn": 0.004223427269607782, "grad/layer_20/mlp": 0.00658737076446414, "grad/layer_20/attn_mlp_ratio": 0.6411400476009702, "grad/layer_24/attn": 0.02017693594098091, "grad/layer_24/mlp": 0.012611542828381062, "grad/layer_24/attn_mlp_ratio": 1.5998784649556768, "grad/layer_27/attn": 0.012617915868759155, "grad/layer_27/mlp": 0.012405242770910263, "grad/layer_27/attn_mlp_ratio": 1.0171437996065036} {"step": 32950, "timestamp": 1778230148.7319927, "train/loss": 2.2065064191818236, "train/z_loss": 0.001440627675037831, "train/perplexity": 9.08392546552722, "train/grad_norm": 0.2373046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025880.702869988, "perf/iters_per_sec": 0.9660151972150746, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035180401802063, "data/tokens_consumed": 69103255552, "data/tokens_consumed_B": 69.103255552, "train/loss_slope": -1.2830758180626895e-05} {"step": 32960, "timestamp": 1778230159.0791354, "train/loss": 2.1997951745986937, "train/z_loss": 0.0014442330808378755, "train/perplexity": 9.023165136724582, "train/grad_norm": 0.08544921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027828.9193074866, "perf/iters_per_sec": 0.966944179204696, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341858625411988, "data/tokens_consumed": 69124227072, "data/tokens_consumed_B": 69.124227072, "train/loss_slope": -1.3048873082174421e-05} {"step": 32970, "timestamp": 1778230169.4300308, "train/loss": 2.178981304168701, "train/z_loss": 0.00144300990505144, "train/perplexity": 8.837299151942714, "train/grad_norm": 0.125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027210.8515310995, "perf/iters_per_sec": 0.9666494615226267, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345011711120606, "data/tokens_consumed": 69145198592, "data/tokens_consumed_B": 69.145198592, "train/loss_slope": -8.84642185408043e-06} {"step": 32980, "timestamp": 1778230179.7779243, "train/loss": 2.1749612927436828, "train/z_loss": 0.0014397597056813537, "train/perplexity": 8.801844420345791, "train/grad_norm": 0.119140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027740.8946632019, "perf/iters_per_sec": 0.9669022057834634, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342307567596436, "data/tokens_consumed": 69166170112, "data/tokens_consumed_B": 69.166170112, "train/loss_slope": -1.124972179062622e-05} {"step": 32990, "timestamp": 1778230190.597181, "train/loss": 2.1975085735321045, "train/z_loss": 0.001440521946642548, "train/perplexity": 9.002556328739603, "train/grad_norm": 0.1494140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1939413.0811164612, "perf/iters_per_sec": 0.9247842221815401, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.081333327293396, "data/tokens_consumed": 69187141632, "data/tokens_consumed_B": 69.187141632, "train/loss_slope": -1.1545891608699786e-05} {"step": 33000, "timestamp": 1778230200.9415905, "grad/layer_0/attn": 0.0026149339973926544, "grad/layer_0/mlp": 0.0025895514991134405, "grad/layer_0/attn_mlp_ratio": 1.0098018507481774, "grad/layer_4/attn": 0.0026754471473395824, "grad/layer_4/mlp": 0.0023724788334220648, "grad/layer_4/attn_mlp_ratio": 1.1277011187115227, "grad/layer_8/attn": 0.003762161126360297, "grad/layer_8/mlp": 0.0034191515296697617, "grad/layer_8/attn_mlp_ratio": 1.1003200600154914, "grad/layer_12/attn": 0.005576528143137693, "grad/layer_12/mlp": 0.005964695475995541, "grad/layer_12/attn_mlp_ratio": 0.9349225072910683, "grad/layer_16/attn": 0.003759685205295682, "grad/layer_16/mlp": 0.004547114484012127, "grad/layer_16/attn_mlp_ratio": 0.8268287802807776, "grad/layer_20/attn": 0.004700419493019581, "grad/layer_20/mlp": 0.00561267975717783, "grad/layer_20/attn_mlp_ratio": 0.8374643864656568, "grad/layer_24/attn": 0.017581989988684654, "grad/layer_24/mlp": 0.011075576767325401, "grad/layer_24/attn_mlp_ratio": 1.587455913068884, "grad/layer_27/attn": 0.0069676912389695644, "grad/layer_27/mlp": 0.009142369031906128, "grad/layer_27/attn_mlp_ratio": 0.7621319089657939} {"step": 33000, "timestamp": 1778230201.5496714, "eos/sharpness": 63.23783397674559, "eos/L0_probe": 2.0132737159729004, "eos/L_plus": 2.409525156021118, "eos/L_minus": 2.2494006156921387, "eos/grad_norm": 0.17091114819049835, "eos/embed_grad_frac": 0.0774257630109787, "eos/time_s": 0.6053512096405029} {"step": 33000, "timestamp": 1778230201.613767, "train/loss": 2.164043354988098, "train/z_loss": 0.001449190068524331, "train/perplexity": 8.7062691227952, "train/grad_norm": 0.1708984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1904636.5787233773, "perf/iters_per_sec": 0.9082014936081778, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1010772466659546, "data/tokens_consumed": 69208113152, "data/tokens_consumed_B": 69.208113152, "train/loss_slope": -1.1455459320040933e-05} {"step": 33000, "timestamp": 1778230202.977841, "geo/rankme_last": 439.0518798828125, "geo/layer_0/stable_rank_q_proj": 18.782238006591797, "geo/layer_0/stable_rank_k_proj": 16.113359451293945, "geo/layer_0/stable_rank_o_proj": 50.03015899658203, "geo/layer_0/stable_rank_gate_proj": 141.1059112548828, "geo/layer_0/stable_rank_down_proj": 52.31092071533203, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05714819207787514, "geo/layer_0/attn_entropy_mean": 6.200746536254883, "geo/layer_0/attn_entropy_std": 0.3650917410850525, "geo/layer_7/stable_rank_q_proj": 42.83407974243164, "geo/layer_7/stable_rank_k_proj": 41.7344856262207, "geo/layer_7/stable_rank_o_proj": 102.76795959472656, "geo/layer_7/stable_rank_gate_proj": 93.57894134521484, "geo/layer_7/stable_rank_down_proj": 146.7447509765625, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5204828381538391, "geo/layer_7/attn_entropy_mean": 4.652095794677734, "geo/layer_7/attn_entropy_std": 0.8247661590576172, "geo/layer_14/stable_rank_q_proj": 54.8387565612793, "geo/layer_14/stable_rank_k_proj": 36.327781677246094, "geo/layer_14/stable_rank_o_proj": 51.081024169921875, "geo/layer_14/stable_rank_gate_proj": 79.37765502929688, "geo/layer_14/stable_rank_down_proj": 134.94192504882812, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3841388523578644, "geo/layer_14/attn_entropy_mean": 5.484667778015137, "geo/layer_14/attn_entropy_std": 0.3828297555446625, "geo/layer_21/stable_rank_q_proj": 44.37168884277344, "geo/layer_21/stable_rank_k_proj": 31.117992401123047, "geo/layer_21/stable_rank_o_proj": 78.10948181152344, "geo/layer_21/stable_rank_gate_proj": 77.08865356445312, "geo/layer_21/stable_rank_down_proj": 57.3019905090332, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14910724759101868, "geo/layer_21/attn_entropy_mean": 5.738936424255371, "geo/layer_21/attn_entropy_std": 0.2841927409172058, "geo/layer_27/stable_rank_q_proj": 42.24959182739258, "geo/layer_27/stable_rank_k_proj": 31.492517471313477, "geo/layer_27/stable_rank_o_proj": 117.84590911865234, "geo/layer_27/stable_rank_gate_proj": 87.28836059570312, "geo/layer_27/stable_rank_down_proj": 133.5286865234375, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08917828649282455, "geo/layer_27/attn_entropy_mean": 4.330901145935059, "geo/layer_27/attn_entropy_std": 0.6222677826881409, "attnres/final_alpha/block_0": 0.23819133639335632, "attnres/block_norm/0": 1.6971988677978516, "attnres/final_alpha/block_1": 0.005619844421744347, "attnres/block_norm/1": 36986.80078125, "attnres/final_alpha/block_2": 0.011838595382869244, "attnres/block_norm/2": 25059.4140625, "attnres/final_alpha/block_3": 0.013777665793895721, "attnres/block_norm/3": 42024.1640625, "attnres/final_alpha/block_4": 0.01733335107564926, "attnres/block_norm/4": 11855.580078125, "attnres/final_alpha/block_5": 0.59308922290802, "attnres/block_norm/5": 5720.2724609375, "attnres/final_alpha/block_6": 0.12015000730752945, "attnres/block_norm/6": 27763.15234375, "geo/tier1_time_s": 1.3601024150848389, "geo/step": 33000.0, "geo/rankme_slope": -0.00017980831004276712} {"step": 33000, "timestamp": 1778230209.837877, "geo/ww_alpha_mean": 7.53934010263507, "geo/ww_alpha_std": 4.067025573569906, "geo/ww_alpha_min": 1.3438245285501658, "geo/ww_alpha_max": 24.03602089468095, "geo/ww_alpha_healthy_frac": 0.17258883248730963, "geo/ww_alpha_by_type/q_proj": 4.137423001047044, "geo/ww_alpha_by_type/k_proj": 4.533267504889135, "geo/ww_alpha_by_type/v_proj": 8.124382214461551, "geo/ww_alpha_by_type/o_proj": 7.756004201763363, "geo/ww_alpha_by_type/gate_proj": 8.38904142136552, "geo/ww_alpha_by_type/up_proj": 10.90647208388387, "geo/ww_alpha_by_type/down_proj": 9.037482922648298, "geo/twonn_id/layer_0": 0.7347115278244019, "geo/twonn_id/layer_7": 3.030355930328369, "geo/twonn_id/layer_14": 4.604554653167725, "geo/twonn_id/layer_21": 7.452495098114014, "geo/twonn_id/layer_27": 5.399207592010498, "geo/tier2_time_s": 6.8541319370269775} {"step": 33000, "timestamp": 1778230210.4519324, "eoc/jacobian_sigma/layer_0/attn": 964.6555786132812, "eoc/jacobian_sigma/layer_0/mlp": 7799.134765625, "eoc/jacobian_sigma/layer_0": 7799.134765625, "eoc/jacobian_sigma/layer_7/attn": 1.1802932024002075, "eoc/jacobian_sigma/layer_7/mlp": 1.6696312427520752, "eoc/jacobian_sigma/layer_7": 1.6696312427520752, "eoc/jacobian_sigma/layer_14/attn": 1.6216133832931519, "eoc/jacobian_sigma/layer_14/mlp": 6.01896333694458, "eoc/jacobian_sigma/layer_14": 6.01896333694458, "eoc/jacobian_sigma/layer_21/attn": 1.092013955116272, "eoc/jacobian_sigma/layer_21/mlp": 4.093536376953125, "eoc/jacobian_sigma/layer_21": 4.093536376953125, "eoc/jacobian_sigma/layer_27/attn": 3.332326650619507, "eoc/jacobian_sigma/layer_27/mlp": 25.81877326965332, "eoc/jacobian_sigma/layer_27": 25.81877326965332, "eoc/layer0_sigma": 7799.134765625, "eoc/sigma_max": 25.81877326965332, "eoc/sigma_min": 1.6696312427520752, "eoc/sigma_mean": 9.400226056575775, "eoc/time_s": 0.6079435348510742} {"step": 33010, "timestamp": 1778230220.8440773, "train/loss": 2.201880121231079, "train/z_loss": 0.0014492410933598875, "train/perplexity": 9.041997579987378, "train/grad_norm": 0.2353515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1090771.9673686046, "perf/iters_per_sec": 0.5201206051676772, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.9226310014724732, "data/tokens_consumed": 69229084672, "data/tokens_consumed_B": 69.229084672, "train/loss_slope": -7.821480161321834e-06} {"step": 33020, "timestamp": 1778230231.1924112, "train/loss": 2.155583620071411, "train/z_loss": 0.0014556309906765819, "train/perplexity": 8.632927058520163, "train/grad_norm": 0.2236328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027617.0279966784, "perf/iters_per_sec": 0.9668431415542023, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342939376831055, "data/tokens_consumed": 69250056192, "data/tokens_consumed_B": 69.250056192, "train/loss_slope": -8.696221790262247e-06} {"step": 33030, "timestamp": 1778230241.5369675, "train/loss": 2.189435076713562, "train/z_loss": 0.0014556314330548049, "train/perplexity": 8.930166830256972, "train/grad_norm": 0.1826171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028186.1449853224, "perf/iters_per_sec": 0.9671145176817524, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340037107467652, "data/tokens_consumed": 69271027712, "data/tokens_consumed_B": 69.271027712, "train/loss_slope": -5.392305764428082e-06} {"step": 33040, "timestamp": 1778230251.949387, "train/loss": 2.2180060386657714, "train/z_loss": 0.0014447755529545248, "train/perplexity": 9.188990095614336, "train/grad_norm": 0.1533203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024301.4921259393, "perf/iters_per_sec": 0.9652621708516785, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0359879732131958, "data/tokens_consumed": 69291999232, "data/tokens_consumed_B": 69.291999232, "train/loss_slope": -4.037781288199642e-06} {"step": 33050, "timestamp": 1778230262.297909, "grad/layer_0/attn": 0.0026909802109003067, "grad/layer_0/mlp": 0.0027192486450076103, "grad/layer_0/attn_mlp_ratio": 0.9896042853162282, "grad/layer_4/attn": 0.001973561244085431, "grad/layer_4/mlp": 0.0024482703302055597, "grad/layer_4/attn_mlp_ratio": 0.8061042684405286, "grad/layer_8/attn": 0.004582648631185293, "grad/layer_8/mlp": 0.003691502381116152, "grad/layer_8/attn_mlp_ratio": 1.2414047273780233, "grad/layer_12/attn": 0.0038403584621846676, "grad/layer_12/mlp": 0.006779703311622143, "grad/layer_12/attn_mlp_ratio": 0.5664493310432006, "grad/layer_16/attn": 0.004139923490583897, "grad/layer_16/mlp": 0.004466937854886055, "grad/layer_16/attn_mlp_ratio": 0.9267922528575843, "grad/layer_20/attn": 0.004045198671519756, "grad/layer_20/mlp": 0.0061395480297505856, "grad/layer_20/attn_mlp_ratio": 0.6588756348236477, "grad/layer_24/attn": 0.00885708723217249, "grad/layer_24/mlp": 0.012090340256690979, "grad/layer_24/attn_mlp_ratio": 0.7325755082875597, "grad/layer_27/attn": 0.00876208208501339, "grad/layer_27/mlp": 0.01006647851318121, "grad/layer_27/attn_mlp_ratio": 0.870421765317236} {"step": 33050, "timestamp": 1778230262.3136117, "train/loss": 2.1607621908187866, "train/z_loss": 0.0014634581748396158, "train/perplexity": 8.677749239298201, "train/grad_norm": 0.138671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025140.397591453, "perf/iters_per_sec": 0.9656621921498552, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035558819770813, "data/tokens_consumed": 69312970752, "data/tokens_consumed_B": 69.312970752, "train/loss_slope": -7.827749919004371e-06} {"step": 33060, "timestamp": 1778230272.6630306, "train/loss": 2.24168860912323, "train/z_loss": 0.0014348130906000733, "train/perplexity": 9.409206351969495, "train/grad_norm": 0.09423828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027686.4382079581, "perf/iters_per_sec": 0.9668762389220992, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034258532524109, "data/tokens_consumed": 69333942272, "data/tokens_consumed_B": 69.333942272, "train/loss_slope": -7.022277521293112e-06} {"step": 33070, "timestamp": 1778230283.0165935, "train/loss": 2.1607444524765014, "train/z_loss": 0.0014402853208594023, "train/perplexity": 8.677595311777143, "train/grad_norm": 0.33203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026673.800856567, "perf/iters_per_sec": 0.9663933758051715, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347753047943116, "data/tokens_consumed": 69354913792, "data/tokens_consumed_B": 69.354913792, "train/loss_slope": -8.282287193067184e-06} {"step": 33075, "timestamp": 1778230288.8055046, "eos/sharpness": 20.561432838439938, "eos/L0_probe": 2.0070712566375732, "eos/L_plus": 2.116459846496582, "eos/L_minus": 2.103296995162964, "eos/grad_norm": 0.12464534491300583, "eos/embed_grad_frac": 0.15688401460647583, "eos/time_s": 0.6248264312744141} {"step": 33075, "timestamp": 1778230290.193572, "geo/rankme_last": 440.0201721191406, "geo/layer_0/stable_rank_q_proj": 18.81498146057129, "geo/layer_0/stable_rank_k_proj": 16.129610061645508, "geo/layer_0/stable_rank_o_proj": 49.919124603271484, "geo/layer_0/stable_rank_gate_proj": 141.24761962890625, "geo/layer_0/stable_rank_down_proj": 52.30915069580078, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05515666678547859, "geo/layer_0/attn_entropy_mean": 6.211918354034424, "geo/layer_0/attn_entropy_std": 0.3592139482498169, "geo/layer_7/stable_rank_q_proj": 42.827022552490234, "geo/layer_7/stable_rank_k_proj": 41.840065002441406, "geo/layer_7/stable_rank_o_proj": 102.8430404663086, "geo/layer_7/stable_rank_gate_proj": 93.4957275390625, "geo/layer_7/stable_rank_down_proj": 146.8571319580078, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5142418146133423, "geo/layer_7/attn_entropy_mean": 4.643074989318848, "geo/layer_7/attn_entropy_std": 0.8120523691177368, "geo/layer_14/stable_rank_q_proj": 54.737037658691406, "geo/layer_14/stable_rank_k_proj": 36.397525787353516, "geo/layer_14/stable_rank_o_proj": 51.101749420166016, "geo/layer_14/stable_rank_gate_proj": 79.64041137695312, "geo/layer_14/stable_rank_down_proj": 135.0322723388672, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3751479983329773, "geo/layer_14/attn_entropy_mean": 5.5008625984191895, "geo/layer_14/attn_entropy_std": 0.3899957239627838, "geo/layer_21/stable_rank_q_proj": 44.183258056640625, "geo/layer_21/stable_rank_k_proj": 31.151729583740234, "geo/layer_21/stable_rank_o_proj": 78.15007781982422, "geo/layer_21/stable_rank_gate_proj": 76.85189056396484, "geo/layer_21/stable_rank_down_proj": 57.26988220214844, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.148011714220047, "geo/layer_21/attn_entropy_mean": 5.7290143966674805, "geo/layer_21/attn_entropy_std": 0.28567054867744446, "geo/layer_27/stable_rank_q_proj": 42.157623291015625, "geo/layer_27/stable_rank_k_proj": 31.53168487548828, "geo/layer_27/stable_rank_o_proj": 117.68868255615234, "geo/layer_27/stable_rank_gate_proj": 87.22933197021484, "geo/layer_27/stable_rank_down_proj": 133.7050323486328, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0850747674703598, "geo/layer_27/attn_entropy_mean": 4.316570281982422, "geo/layer_27/attn_entropy_std": 0.6018099188804626, "attnres/final_alpha/block_0": 0.24079421162605286, "attnres/block_norm/0": 1.69744873046875, "attnres/final_alpha/block_1": 0.005719408392906189, "attnres/block_norm/1": 37042.515625, "attnres/final_alpha/block_2": 0.012093332596123219, "attnres/block_norm/2": 25136.8203125, "attnres/final_alpha/block_3": 0.013875615783035755, "attnres/block_norm/3": 41928.0, "attnres/final_alpha/block_4": 0.017509549856185913, "attnres/block_norm/4": 11907.28125, "attnres/final_alpha/block_5": 0.5886187553405762, "attnres/block_norm/5": 5731.390625, "attnres/final_alpha/block_6": 0.12138918042182922, "attnres/block_norm/6": 27611.5, "geo/tier1_time_s": 1.3592071533203125, "geo/step": 33075.0, "geo/rankme_slope": -0.0001410742031187475} {"step": 33080, "timestamp": 1778230295.3770714, "train/loss": 2.222192072868347, "train/z_loss": 0.0014424997149035334, "train/perplexity": 9.227536143693051, "train/grad_norm": 0.11328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1697403.088003611, "perf/iters_per_sec": 0.8093848648088507, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2355061769485474, "data/tokens_consumed": 69375885312, "data/tokens_consumed_B": 69.375885312, "train/loss_slope": -2.139434305140162e-06} {"step": 33090, "timestamp": 1778230305.7333972, "train/loss": 2.1824877738952635, "train/z_loss": 0.001446523144841194, "train/perplexity": 8.868341266194433, "train/grad_norm": 0.197265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026082.710982404, "perf/iters_per_sec": 0.9661115221893329, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03507719039917, "data/tokens_consumed": 69396856832, "data/tokens_consumed_B": 69.396856832, "train/loss_slope": 1.0857985393799474e-06} {"step": 33100, "timestamp": 1778230316.095597, "grad/layer_0/attn": 0.002557705156505108, "grad/layer_0/mlp": 0.002647838555276394, "grad/layer_0/attn_mlp_ratio": 0.965959595539677, "grad/layer_4/attn": 0.0021146582439541817, "grad/layer_4/mlp": 0.0024744251277297735, "grad/layer_4/attn_mlp_ratio": 0.8546058374511201, "grad/layer_8/attn": 0.007299710996448994, "grad/layer_8/mlp": 0.003730672411620617, "grad/layer_8/attn_mlp_ratio": 1.9566742922921354, "grad/layer_12/attn": 0.004599053878337145, "grad/layer_12/mlp": 0.0059014009311795235, "grad/layer_12/attn_mlp_ratio": 0.7793155987939908, "grad/layer_16/attn": 0.00626197038218379, "grad/layer_16/mlp": 0.004549968987703323, "grad/layer_16/attn_mlp_ratio": 1.3762665770867093, "grad/layer_20/attn": 0.0056955222971737385, "grad/layer_20/mlp": 0.0059206816367805, "grad/layer_20/attn_mlp_ratio": 0.9619706902656118, "grad/layer_24/attn": 0.0070385560393333435, "grad/layer_24/mlp": 0.00880105048418045, "grad/layer_24/attn_mlp_ratio": 0.7997404368956675, "grad/layer_27/attn": 0.007255145348608494, "grad/layer_27/mlp": 0.007929059676826, "grad/layer_27/attn_mlp_ratio": 0.9150070188413596} {"step": 33100, "timestamp": 1778230316.1111567, "train/loss": 2.1375955820083616, "train/z_loss": 0.0014568466576747596, "train/perplexity": 8.47902597703759, "train/grad_norm": 0.1328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022058.647772005, "perf/iters_per_sec": 0.9641926993236566, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0371370792388916, "data/tokens_consumed": 69417828352, "data/tokens_consumed_B": 69.417828352, "train/loss_slope": 1.9947105651497654e-06} {"step": 33110, "timestamp": 1778230326.4603271, "train/loss": 2.1722729921340944, "train/z_loss": 0.0014463173341937364, "train/perplexity": 8.778214193433005, "train/grad_norm": 0.1171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027704.3407370346, "perf/iters_per_sec": 0.9668847755131886, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342494010925294, "data/tokens_consumed": 69438799872, "data/tokens_consumed_B": 69.438799872, "train/loss_slope": 3.051810989929183e-06} {"step": 33120, "timestamp": 1778230336.8129342, "train/loss": 2.1828107595443726, "train/z_loss": 0.0014445664826780557, "train/perplexity": 8.871206075776103, "train/grad_norm": 0.10693359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026764.6749731496, "perf/iters_per_sec": 0.9664367079606769, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347289085388183, "data/tokens_consumed": 69459771392, "data/tokens_consumed_B": 69.459771392, "train/loss_slope": 3.3213110396427233e-06} {"step": 33130, "timestamp": 1778230347.1611688, "train/loss": 2.186697220802307, "train/z_loss": 0.00144049251684919, "train/perplexity": 8.90575075930596, "train/grad_norm": 0.2373046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027591.0412883798, "perf/iters_per_sec": 0.9668307501260661, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343071937561035, "data/tokens_consumed": 69480742912, "data/tokens_consumed_B": 69.480742912, "train/loss_slope": 8.517866910057897e-07} {"step": 33140, "timestamp": 1778230357.5080323, "train/loss": 2.179642128944397, "train/z_loss": 0.0014471488539129496, "train/perplexity": 8.843140988175023, "train/grad_norm": 0.16796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027964.8279106983, "perf/iters_per_sec": 0.9670089854768268, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034116554260254, "data/tokens_consumed": 69501714432, "data/tokens_consumed_B": 69.501714432, "train/loss_slope": 7.927642153291104e-07} {"step": 33150, "timestamp": 1778230367.871892, "grad/layer_0/attn": 0.0030679304618388414, "grad/layer_0/mlp": 0.0027805210556834936, "grad/layer_0/attn_mlp_ratio": 1.1033652650215848, "grad/layer_4/attn": 0.004284645896404982, "grad/layer_4/mlp": 0.0024010103661566973, "grad/layer_4/attn_mlp_ratio": 1.7845177923207571, "grad/layer_8/attn": 0.004538824316114187, "grad/layer_8/mlp": 0.003873964073136449, "grad/layer_8/attn_mlp_ratio": 1.17162268757882, "grad/layer_12/attn": 0.004644232336431742, "grad/layer_12/mlp": 0.007046400103718042, "grad/layer_12/attn_mlp_ratio": 0.6590928988082747, "grad/layer_16/attn": 0.003634908702224493, "grad/layer_16/mlp": 0.00484847417101264, "grad/layer_16/attn_mlp_ratio": 0.7497015553854461, "grad/layer_20/attn": 0.0045481594279408455, "grad/layer_20/mlp": 0.00595918670296669, "grad/layer_20/attn_mlp_ratio": 0.7632181333326578, "grad/layer_24/attn": 0.013741127215325832, "grad/layer_24/mlp": 0.009767495095729828, "grad/layer_24/attn_mlp_ratio": 1.406822009119923, "grad/layer_27/attn": 0.004713921807706356, "grad/layer_27/mlp": 0.008692193776369095, "grad/layer_27/attn_mlp_ratio": 0.5423166895209034} {"step": 33150, "timestamp": 1778230368.4837074, "eos/sharpness": 62.64545917510985, "eos/L0_probe": 2.0098519325256348, "eos/L_plus": 2.413397789001465, "eos/L_minus": 2.2327606678009033, "eos/grad_norm": 0.16986949741840363, "eos/embed_grad_frac": 0.07876279950141907, "eos/time_s": 0.6090877056121826} {"step": 33150, "timestamp": 1778230368.5028837, "train/loss": 2.1953497648239138, "train/z_loss": 0.001453502126969397, "train/perplexity": 8.983142494657955, "train/grad_norm": 0.169921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1908348.7710031944, "perf/iters_per_sec": 0.9099716048255894, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0989353895187377, "data/tokens_consumed": 69522685952, "data/tokens_consumed_B": 69.522685952, "train/loss_slope": 2.957786535165165e-06} {"step": 33150, "timestamp": 1778230369.8627424, "geo/rankme_last": 438.1617736816406, "geo/layer_0/stable_rank_q_proj": 18.817564010620117, "geo/layer_0/stable_rank_k_proj": 16.103361129760742, "geo/layer_0/stable_rank_o_proj": 49.94557189941406, "geo/layer_0/stable_rank_gate_proj": 140.80091857910156, "geo/layer_0/stable_rank_down_proj": 52.30735397338867, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05993563309311867, "geo/layer_0/attn_entropy_mean": 6.207169055938721, "geo/layer_0/attn_entropy_std": 0.360734224319458, "geo/layer_7/stable_rank_q_proj": 42.86906051635742, "geo/layer_7/stable_rank_k_proj": 41.993408203125, "geo/layer_7/stable_rank_o_proj": 102.88676452636719, "geo/layer_7/stable_rank_gate_proj": 93.37625885009766, "geo/layer_7/stable_rank_down_proj": 146.7721710205078, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5324399471282959, "geo/layer_7/attn_entropy_mean": 4.633538246154785, "geo/layer_7/attn_entropy_std": 0.7986077666282654, "geo/layer_14/stable_rank_q_proj": 54.6122932434082, "geo/layer_14/stable_rank_k_proj": 36.27496337890625, "geo/layer_14/stable_rank_o_proj": 51.021610260009766, "geo/layer_14/stable_rank_gate_proj": 79.6189193725586, "geo/layer_14/stable_rank_down_proj": 134.55239868164062, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37973496317863464, "geo/layer_14/attn_entropy_mean": 5.4588165283203125, "geo/layer_14/attn_entropy_std": 0.39911654591560364, "geo/layer_21/stable_rank_q_proj": 44.22796630859375, "geo/layer_21/stable_rank_k_proj": 31.06702995300293, "geo/layer_21/stable_rank_o_proj": 78.21464538574219, "geo/layer_21/stable_rank_gate_proj": 76.70480346679688, "geo/layer_21/stable_rank_down_proj": 57.28832244873047, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1475648581981659, "geo/layer_21/attn_entropy_mean": 5.729574203491211, "geo/layer_21/attn_entropy_std": 0.27751052379608154, "geo/layer_27/stable_rank_q_proj": 42.001216888427734, "geo/layer_27/stable_rank_k_proj": 31.47105598449707, "geo/layer_27/stable_rank_o_proj": 117.60446166992188, "geo/layer_27/stable_rank_gate_proj": 87.23046875, "geo/layer_27/stable_rank_down_proj": 133.8976287841797, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08583226054906845, "geo/layer_27/attn_entropy_mean": 4.291447639465332, "geo/layer_27/attn_entropy_std": 0.6185410022735596, "attnres/final_alpha/block_0": 0.23865479230880737, "attnres/block_norm/0": 1.6975237131118774, "attnres/final_alpha/block_1": 0.0055624498054385185, "attnres/block_norm/1": 37271.31640625, "attnres/final_alpha/block_2": 0.011843651533126831, "attnres/block_norm/2": 25189.84765625, "attnres/final_alpha/block_3": 0.013701449148356915, "attnres/block_norm/3": 42172.97265625, "attnres/final_alpha/block_4": 0.017031366005539894, "attnres/block_norm/4": 11852.4072265625, "attnres/final_alpha/block_5": 0.5936347246170044, "attnres/block_norm/5": 5697.875, "attnres/final_alpha/block_6": 0.11957157403230667, "attnres/block_norm/6": 28055.734375, "geo/tier1_time_s": 1.3559675216674805, "geo/step": 33150.0, "geo/rankme_slope": -0.00019396401138580433} {"step": 33160, "timestamp": 1778230380.2195134, "train/loss": 2.1781657695770265, "train/z_loss": 0.0014437630656175316, "train/perplexity": 8.830094966817704, "train/grad_norm": 0.12451171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1790469.203249427, "perf/iters_per_sec": 0.8537622467276702, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1712862730026246, "data/tokens_consumed": 69543657472, "data/tokens_consumed_B": 69.543657472, "train/loss_slope": 2.8519263540771887e-06} {"step": 33170, "timestamp": 1778230390.573944, "train/loss": 2.176103401184082, "train/z_loss": 0.001454373518936336, "train/perplexity": 8.811902823949815, "train/grad_norm": 0.181640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026304.7843940982, "perf/iters_per_sec": 0.9662174150438777, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349637508392333, "data/tokens_consumed": 69564628992, "data/tokens_consumed_B": 69.564628992, "train/loss_slope": 5.241701555008832e-06} {"step": 33180, "timestamp": 1778230400.9281628, "train/loss": 2.1570518016815186, "train/z_loss": 0.0014512212481349706, "train/perplexity": 8.645611072208274, "train/grad_norm": 0.265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026486.567966111, "perf/iters_per_sec": 0.9663040962057643, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348709106445313, "data/tokens_consumed": 69585600512, "data/tokens_consumed_B": 69.585600512, "train/loss_slope": 2.3507887988772735e-06} {"step": 33190, "timestamp": 1778230411.286689, "train/loss": 2.181446886062622, "train/z_loss": 0.001454256137367338, "train/perplexity": 8.859115120199299, "train/grad_norm": 0.11181640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026005.7107623206, "perf/iters_per_sec": 0.9660748056232074, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351165294647218, "data/tokens_consumed": 69606572032, "data/tokens_consumed_B": 69.606572032, "train/loss_slope": 2.437410567781767e-06} {"step": 33200, "timestamp": 1778230421.6297245, "grad/layer_0/attn": 0.0030642314814031124, "grad/layer_0/mlp": 0.002958041848614812, "grad/layer_0/attn_mlp_ratio": 1.0358985892130526, "grad/layer_4/attn": 0.0019875746220350266, "grad/layer_4/mlp": 0.0024516151752322912, "grad/layer_4/attn_mlp_ratio": 0.8107204430135162, "grad/layer_8/attn": 0.008474335074424744, "grad/layer_8/mlp": 0.003713957266882062, "grad/layer_8/attn_mlp_ratio": 2.281753460605583, "grad/layer_12/attn": 0.0035394695587456226, "grad/layer_12/mlp": 0.006562595255672932, "grad/layer_12/attn_mlp_ratio": 0.5393399054668188, "grad/layer_16/attn": 0.0038998336531221867, "grad/layer_16/mlp": 0.004672865383327007, "grad/layer_16/attn_mlp_ratio": 0.8345700656346663, "grad/layer_20/attn": 0.004046091344207525, "grad/layer_20/mlp": 0.006003635935485363, "grad/layer_20/attn_mlp_ratio": 0.6739401456538195, "grad/layer_24/attn": 0.011573468334972858, "grad/layer_24/mlp": 0.010356823913753033, "grad/layer_24/attn_mlp_ratio": 1.1174727232599704, "grad/layer_27/attn": 0.004492840263992548, "grad/layer_27/mlp": 0.009904949925839901, "grad/layer_27/attn_mlp_ratio": 0.4535954499792212} {"step": 33200, "timestamp": 1778230421.6457934, "train/loss": 2.156068432331085, "train/z_loss": 0.001453543733805418, "train/perplexity": 8.637113422113202, "train/grad_norm": 0.154296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025640.7160274042, "perf/iters_per_sec": 0.9659007625710507, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0353030443191529, "data/tokens_consumed": 69627543552, "data/tokens_consumed_B": 69.627543552, "train/loss_slope": 9.61459308925256e-07} {"step": 33210, "timestamp": 1778230432.0419972, "train/loss": 2.1718945026397707, "train/z_loss": 0.0014424502034671605, "train/perplexity": 8.774892360261001, "train/grad_norm": 0.28515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021449.01618868, "perf/iters_per_sec": 0.963902004331913, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0374498605728149, "data/tokens_consumed": 69648515072, "data/tokens_consumed_B": 69.648515072, "train/loss_slope": 3.112957637087416e-07} {"step": 33220, "timestamp": 1778230442.3956838, "train/loss": 2.1892916440963743, "train/z_loss": 0.0014596472843550145, "train/perplexity": 8.928886044911977, "train/grad_norm": 0.1865234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026803.7636232618, "perf/iters_per_sec": 0.9664553468815145, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347089529037476, "data/tokens_consumed": 69669486592, "data/tokens_consumed_B": 69.669486592, "train/loss_slope": 1.2305577452963013e-06} {"step": 33225, "timestamp": 1778230448.1706223, "eos/sharpness": 72.22244739532469, "eos/L0_probe": 2.0114448070526123, "eos/L_plus": 2.4596142768859863, "eos/L_minus": 2.2854998111724854, "eos/grad_norm": 0.2820049524307251, "eos/embed_grad_frac": 0.029700445011258125, "eos/time_s": 0.6055879592895508} {"step": 33225, "timestamp": 1778230449.5455675, "geo/rankme_last": 439.3191223144531, "geo/layer_0/stable_rank_q_proj": 18.845291137695312, "geo/layer_0/stable_rank_k_proj": 16.163969039916992, "geo/layer_0/stable_rank_o_proj": 49.91130828857422, "geo/layer_0/stable_rank_gate_proj": 140.79391479492188, "geo/layer_0/stable_rank_down_proj": 52.28557205200195, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05520552769303322, "geo/layer_0/attn_entropy_mean": 6.214051723480225, "geo/layer_0/attn_entropy_std": 0.3574323058128357, "geo/layer_7/stable_rank_q_proj": 42.82093048095703, "geo/layer_7/stable_rank_k_proj": 41.80324935913086, "geo/layer_7/stable_rank_o_proj": 102.78836822509766, "geo/layer_7/stable_rank_gate_proj": 93.31652069091797, "geo/layer_7/stable_rank_down_proj": 146.84918212890625, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5295256972312927, "geo/layer_7/attn_entropy_mean": 4.648650169372559, "geo/layer_7/attn_entropy_std": 0.8339521288871765, "geo/layer_14/stable_rank_q_proj": 54.52162170410156, "geo/layer_14/stable_rank_k_proj": 36.28425979614258, "geo/layer_14/stable_rank_o_proj": 51.01564407348633, "geo/layer_14/stable_rank_gate_proj": 79.54204559326172, "geo/layer_14/stable_rank_down_proj": 134.33709716796875, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3806019723415375, "geo/layer_14/attn_entropy_mean": 5.501792907714844, "geo/layer_14/attn_entropy_std": 0.3735713064670563, "geo/layer_21/stable_rank_q_proj": 44.20814895629883, "geo/layer_21/stable_rank_k_proj": 31.067541122436523, "geo/layer_21/stable_rank_o_proj": 78.29728698730469, "geo/layer_21/stable_rank_gate_proj": 76.74626159667969, "geo/layer_21/stable_rank_down_proj": 57.26873016357422, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14938919246196747, "geo/layer_21/attn_entropy_mean": 5.722291469573975, "geo/layer_21/attn_entropy_std": 0.2822887599468231, "geo/layer_27/stable_rank_q_proj": 42.07207489013672, "geo/layer_27/stable_rank_k_proj": 31.532136917114258, "geo/layer_27/stable_rank_o_proj": 117.59201049804688, "geo/layer_27/stable_rank_gate_proj": 87.18408203125, "geo/layer_27/stable_rank_down_proj": 133.8745574951172, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07720629870891571, "geo/layer_27/attn_entropy_mean": 4.286671161651611, "geo/layer_27/attn_entropy_std": 0.6291865110397339, "attnres/final_alpha/block_0": 0.2386530190706253, "attnres/block_norm/0": 1.6978583335876465, "attnres/final_alpha/block_1": 0.005632401444017887, "attnres/block_norm/1": 37160.76171875, "attnres/final_alpha/block_2": 0.011868751607835293, "attnres/block_norm/2": 25099.33203125, "attnres/final_alpha/block_3": 0.013723913580179214, "attnres/block_norm/3": 42241.10546875, "attnres/final_alpha/block_4": 0.01709240861237049, "attnres/block_norm/4": 11848.064453125, "attnres/final_alpha/block_5": 0.5934547185897827, "attnres/block_norm/5": 5691.8642578125, "attnres/final_alpha/block_6": 0.11957474052906036, "attnres/block_norm/6": 27877.9453125, "geo/tier1_time_s": 1.3560020923614502, "geo/step": 33225.0, "geo/rankme_slope": -0.00019323817417592036} {"step": 33230, "timestamp": 1778230454.7266128, "train/loss": 2.1890507459640505, "train/z_loss": 0.0014482948696240782, "train/perplexity": 8.926735351999378, "train/grad_norm": 0.1162109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1701446.79520488, "perf/iters_per_sec": 0.8113130546593094, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2325698375701903, "data/tokens_consumed": 69690458112, "data/tokens_consumed_B": 69.690458112, "train/loss_slope": 9.232979724974398e-07} {"step": 33240, "timestamp": 1778230465.0843182, "train/loss": 2.2087094306945803, "train/z_loss": 0.0014489417662844062, "train/perplexity": 9.103959517429756, "train/grad_norm": 0.1201171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025591.5966490123, "perf/iters_per_sec": 0.9658773406262456, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0353281497955322, "data/tokens_consumed": 69711429632, "data/tokens_consumed_B": 69.711429632, "train/loss_slope": 5.281249360211458e-07} {"step": 33250, "timestamp": 1778230475.4211268, "grad/layer_0/attn": 0.0026876921765506268, "grad/layer_0/mlp": 0.002769951708614826, "grad/layer_0/attn_mlp_ratio": 0.9703028652670549, "grad/layer_4/attn": 0.001823101076297462, "grad/layer_4/mlp": 0.0026187202893197536, "grad/layer_4/attn_mlp_ratio": 0.696180120540107, "grad/layer_8/attn": 0.010418851859867573, "grad/layer_8/mlp": 0.0037167135160416365, "grad/layer_8/attn_mlp_ratio": 2.803243116418485, "grad/layer_12/attn": 0.0037407472264021635, "grad/layer_12/mlp": 0.006337514612823725, "grad/layer_12/attn_mlp_ratio": 0.5902546023022086, "grad/layer_16/attn": 0.003666289383545518, "grad/layer_16/mlp": 0.004544184543192387, "grad/layer_16/attn_mlp_ratio": 0.8068090694857575, "grad/layer_20/attn": 0.003719345200806856, "grad/layer_20/mlp": 0.006346563808619976, "grad/layer_20/attn_mlp_ratio": 0.5860407701488989, "grad/layer_24/attn": 0.019022969529032707, "grad/layer_24/mlp": 0.012362088076770306, "grad/layer_24/attn_mlp_ratio": 1.538815227412705, "grad/layer_27/attn": 0.007271434646099806, "grad/layer_27/mlp": 0.011333219707012177, "grad/layer_27/attn_mlp_ratio": 0.641603601617324} {"step": 33250, "timestamp": 1778230475.4366488, "train/loss": 2.2411409854888915, "train/z_loss": 0.0014319256180897356, "train/perplexity": 9.404055058804467, "train/grad_norm": 0.1806640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026935.19100148, "perf/iters_per_sec": 0.9665180163390541, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346418619155884, "data/tokens_consumed": 69732401152, "data/tokens_consumed_B": 69.732401152, "train/loss_slope": 3.4824747170838327e-06} {"step": 33260, "timestamp": 1778230485.802904, "train/loss": 2.1250836849212646, "train/z_loss": 0.0014593464555218815, "train/perplexity": 8.373598202713529, "train/grad_norm": 0.2392578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024111.436835378, "perf/iters_per_sec": 0.9651715454270258, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0360852479934692, "data/tokens_consumed": 69753372672, "data/tokens_consumed_B": 69.753372672, "train/loss_slope": 3.3830650139598907e-06} {"step": 33270, "timestamp": 1778230496.1762695, "train/loss": 2.197517824172974, "train/z_loss": 0.0014455064432695509, "train/perplexity": 9.002639608540301, "train/grad_norm": 0.267578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023126.0044125577, "perf/iters_per_sec": 0.9647016546309269, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0365899085998536, "data/tokens_consumed": 69774344192, "data/tokens_consumed_B": 69.774344192, "train/loss_slope": 4.4035047444716615e-06} {"step": 33280, "timestamp": 1778230506.5633798, "train/loss": 2.1365426540374757, "train/z_loss": 0.0014500356512144208, "train/perplexity": 8.47010287193836, "train/grad_norm": 0.115234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020931.728866524, "perf/iters_per_sec": 0.9636553425152417, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0377154111862184, "data/tokens_consumed": 69795315712, "data/tokens_consumed_B": 69.795315712, "train/loss_slope": 1.1480114390604948e-06} {"step": 33290, "timestamp": 1778230516.9414742, "train/loss": 2.18473961353302, "train/z_loss": 0.0014520264929160475, "train/perplexity": 8.888333850177169, "train/grad_norm": 0.22265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022021.2758267615, "perf/iters_per_sec": 0.9641748789914901, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0371562480926513, "data/tokens_consumed": 69816287232, "data/tokens_consumed_B": 69.816287232, "train/loss_slope": 5.41075292450534e-07} {"step": 33300, "timestamp": 1778230527.324785, "grad/layer_0/attn": 0.002649230882525444, "grad/layer_0/mlp": 0.002660550409927964, "grad/layer_0/attn_mlp_ratio": 0.9957453815064662, "grad/layer_4/attn": 0.0016352214151993394, "grad/layer_4/mlp": 0.002582345623522997, "grad/layer_4/attn_mlp_ratio": 0.6332310194966724, "grad/layer_8/attn": 0.005604931619018316, "grad/layer_8/mlp": 0.00393650634214282, "grad/layer_8/attn_mlp_ratio": 1.4238339759879313, "grad/layer_12/attn": 0.005076240282505751, "grad/layer_12/mlp": 0.006470460910350084, "grad/layer_12/attn_mlp_ratio": 0.7845252872068695, "grad/layer_16/attn": 0.006013018079102039, "grad/layer_16/mlp": 0.004485482815653086, "grad/layer_16/attn_mlp_ratio": 1.340550882072983, "grad/layer_20/attn": 0.004227817989885807, "grad/layer_20/mlp": 0.00576014444231987, "grad/layer_20/attn_mlp_ratio": 0.7339777602495847, "grad/layer_24/attn": 0.004574383143335581, "grad/layer_24/mlp": 0.008555549196898937, "grad/layer_24/attn_mlp_ratio": 0.5346685507374286, "grad/layer_27/attn": 0.009438751265406609, "grad/layer_27/mlp": 0.0066799623891711235, "grad/layer_27/attn_mlp_ratio": 1.412994650899276} {"step": 33300, "timestamp": 1778230527.9410305, "eos/sharpness": 24.926447868347164, "eos/L0_probe": 2.006382703781128, "eos/L_plus": 2.116649627685547, "eos/L_minus": 2.1453802585601807, "eos/grad_norm": 0.09940396249294281, "eos/embed_grad_frac": 0.22253242135047913, "eos/time_s": 0.6133589744567871} {"step": 33300, "timestamp": 1778230527.9611385, "train/loss": 2.1890445947647095, "train/z_loss": 0.0014428795780986547, "train/perplexity": 8.926680442039645, "train/grad_norm": 0.099609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1904259.9533729693, "perf/iters_per_sec": 0.908021904646382, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1012950181961059, "data/tokens_consumed": 69837258752, "data/tokens_consumed_B": 69.837258752, "train/loss_slope": 2.959328659153788e-06} {"step": 33300, "timestamp": 1778230529.3258593, "geo/rankme_last": 439.78253173828125, "geo/layer_0/stable_rank_q_proj": 18.80821990966797, "geo/layer_0/stable_rank_k_proj": 16.144495010375977, "geo/layer_0/stable_rank_o_proj": 49.93998336791992, "geo/layer_0/stable_rank_gate_proj": 140.53897094726562, "geo/layer_0/stable_rank_down_proj": 52.28900146484375, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06271824985742569, "geo/layer_0/attn_entropy_mean": 6.205051422119141, "geo/layer_0/attn_entropy_std": 0.3609008491039276, "geo/layer_7/stable_rank_q_proj": 42.721378326416016, "geo/layer_7/stable_rank_k_proj": 41.86738204956055, "geo/layer_7/stable_rank_o_proj": 102.75978088378906, "geo/layer_7/stable_rank_gate_proj": 93.3153076171875, "geo/layer_7/stable_rank_down_proj": 146.37564086914062, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5191293358802795, "geo/layer_7/attn_entropy_mean": 4.65078067779541, "geo/layer_7/attn_entropy_std": 0.8199083805084229, "geo/layer_14/stable_rank_q_proj": 54.62297058105469, "geo/layer_14/stable_rank_k_proj": 36.25817108154297, "geo/layer_14/stable_rank_o_proj": 51.02708053588867, "geo/layer_14/stable_rank_gate_proj": 79.57479095458984, "geo/layer_14/stable_rank_down_proj": 134.3663787841797, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3823654353618622, "geo/layer_14/attn_entropy_mean": 5.507208824157715, "geo/layer_14/attn_entropy_std": 0.39536961913108826, "geo/layer_21/stable_rank_q_proj": 44.151817321777344, "geo/layer_21/stable_rank_k_proj": 31.09584617614746, "geo/layer_21/stable_rank_o_proj": 78.36273193359375, "geo/layer_21/stable_rank_gate_proj": 76.8025131225586, "geo/layer_21/stable_rank_down_proj": 57.263648986816406, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15062308311462402, "geo/layer_21/attn_entropy_mean": 5.727890491485596, "geo/layer_21/attn_entropy_std": 0.2922159731388092, "geo/layer_27/stable_rank_q_proj": 42.09002685546875, "geo/layer_27/stable_rank_k_proj": 31.523380279541016, "geo/layer_27/stable_rank_o_proj": 117.64652252197266, "geo/layer_27/stable_rank_gate_proj": 87.13304901123047, "geo/layer_27/stable_rank_down_proj": 134.04930114746094, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08138515800237656, "geo/layer_27/attn_entropy_mean": 4.3118672370910645, "geo/layer_27/attn_entropy_std": 0.62883061170578, "attnres/final_alpha/block_0": 0.24038568139076233, "attnres/block_norm/0": 1.6982190608978271, "attnres/final_alpha/block_1": 0.005735164508223534, "attnres/block_norm/1": 37228.33203125, "attnres/final_alpha/block_2": 0.012037236243486404, "attnres/block_norm/2": 25054.078125, "attnres/final_alpha/block_3": 0.01400919258594513, "attnres/block_norm/3": 42147.16796875, "attnres/final_alpha/block_4": 0.017582204192876816, "attnres/block_norm/4": 11913.46875, "attnres/final_alpha/block_5": 0.5874550938606262, "attnres/block_norm/5": 5756.26318359375, "attnres/final_alpha/block_6": 0.12279541045427322, "attnres/block_norm/6": 27629.27734375, "geo/tier1_time_s": 1.36067533493042, "geo/step": 33300.0, "geo/rankme_slope": -0.00018906347695328132} {"step": 33310, "timestamp": 1778230539.676621, "train/loss": 2.2344007015228273, "train/z_loss": 0.0014569843653589488, "train/perplexity": 9.340882197955416, "train/grad_norm": 0.10009765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1790644.7055267473, "perf/iters_per_sec": 0.8538459327348458, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1711714744567872, "data/tokens_consumed": 69858230272, "data/tokens_consumed_B": 69.858230272, "train/loss_slope": 6.6044950642601915e-06} {"step": 33320, "timestamp": 1778230550.0288908, "train/loss": 2.2437937259674072, "train/z_loss": 0.0014404861605726183, "train/perplexity": 9.42903469391721, "train/grad_norm": 0.11865234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026795.1705290694, "perf/iters_per_sec": 0.9664512493748996, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034713339805603, "data/tokens_consumed": 69879201792, "data/tokens_consumed_B": 69.879201792, "train/loss_slope": 9.908408593602165e-06} {"step": 33330, "timestamp": 1778230560.381916, "train/loss": 2.175475335121155, "train/z_loss": 0.0014513022149913013, "train/perplexity": 8.806370104474825, "train/grad_norm": 0.201171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026563.7448142716, "perf/iters_per_sec": 0.9663408969947203, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348315000534059, "data/tokens_consumed": 69900173312, "data/tokens_consumed_B": 69.900173312, "train/loss_slope": 1.0827731606912411e-05} {"step": 33340, "timestamp": 1778230570.7373211, "train/loss": 2.2176573276519775, "train/z_loss": 0.001444172253832221, "train/perplexity": 9.185786352185128, "train/grad_norm": 0.10302734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026605.8140411149, "perf/iters_per_sec": 0.966360957165296, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348100185394287, "data/tokens_consumed": 69921144832, "data/tokens_consumed_B": 69.921144832, "train/loss_slope": 1.2475518703889924e-05} {"step": 33350, "timestamp": 1778230581.0727491, "grad/layer_0/attn": 0.0029572162311524153, "grad/layer_0/mlp": 0.0029429253190755844, "grad/layer_0/attn_mlp_ratio": 1.0048559885290331, "grad/layer_4/attn": 0.002991091925650835, "grad/layer_4/mlp": 0.0024903106968849897, "grad/layer_4/attn_mlp_ratio": 1.2010918192991205, "grad/layer_8/attn": 0.0097917215898633, "grad/layer_8/mlp": 0.003940383903682232, "grad/layer_8/attn_mlp_ratio": 2.4849663334114336, "grad/layer_12/attn": 0.004544912837445736, "grad/layer_12/mlp": 0.00666860630735755, "grad/layer_12/attn_mlp_ratio": 0.6815386243865408, "grad/layer_16/attn": 0.005725560709834099, "grad/layer_16/mlp": 0.00440247030928731, "grad/layer_16/attn_mlp_ratio": 1.3005336044407343, "grad/layer_20/attn": 0.00402410002425313, "grad/layer_20/mlp": 0.0065030911937355995, "grad/layer_20/attn_mlp_ratio": 0.6187980212010139, "grad/layer_24/attn": 0.014129175804555416, "grad/layer_24/mlp": 0.013657854869961739, "grad/layer_24/attn_mlp_ratio": 1.0345091403906597, "grad/layer_27/attn": 0.006574024446308613, "grad/layer_27/mlp": 0.013186842203140259, "grad/layer_27/attn_mlp_ratio": 0.4985290864320947} {"step": 33350, "timestamp": 1778230581.0884027, "train/loss": 2.2153591752052306, "train/z_loss": 0.0014423181535676121, "train/perplexity": 9.164700253619591, "train/grad_norm": 0.181640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027003.667028394, "perf/iters_per_sec": 0.9665506682531328, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346069097518922, "data/tokens_consumed": 69942116352, "data/tokens_consumed_B": 69.942116352, "train/loss_slope": 1.3620465530706849e-05} {"step": 33360, "timestamp": 1778230591.438213, "train/loss": 2.202490496635437, "train/z_loss": 0.0014490175875835122, "train/perplexity": 9.047518277594085, "train/grad_norm": 0.130859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027315.0904580685, "perf/iters_per_sec": 0.9666991665163367, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034447979927063, "data/tokens_consumed": 69963087872, "data/tokens_consumed_B": 69.963087872, "train/loss_slope": 1.467807508251744e-05} {"step": 33370, "timestamp": 1778230601.7850146, "train/loss": 2.186450552940369, "train/z_loss": 0.0014435822144150735, "train/perplexity": 8.903554267720285, "train/grad_norm": 0.251953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027933.5490847987, "perf/iters_per_sec": 0.9669940705703729, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341325044631957, "data/tokens_consumed": 69984059392, "data/tokens_consumed_B": 69.984059392, "train/loss_slope": 1.6926405889795556e-05} {"step": 33375, "timestamp": 1778230607.5525022, "eos/sharpness": 65.1113986968994, "eos/L0_probe": 2.0129892826080322, "eos/L_plus": 2.275317907333374, "eos/L_minus": 2.4017746448516846, "eos/grad_norm": 0.18648436665534973, "eos/embed_grad_frac": 0.05801470950245857, "eos/time_s": 0.6025559902191162} {"step": 33375, "timestamp": 1778230608.9288237, "geo/rankme_last": 440.109375, "geo/layer_0/stable_rank_q_proj": 18.823606491088867, "geo/layer_0/stable_rank_k_proj": 16.14212417602539, "geo/layer_0/stable_rank_o_proj": 49.937477111816406, "geo/layer_0/stable_rank_gate_proj": 140.64622497558594, "geo/layer_0/stable_rank_down_proj": 52.44575119018555, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05241319164633751, "geo/layer_0/attn_entropy_mean": 6.204599857330322, "geo/layer_0/attn_entropy_std": 0.3621003031730652, "geo/layer_7/stable_rank_q_proj": 42.71072769165039, "geo/layer_7/stable_rank_k_proj": 41.95921325683594, "geo/layer_7/stable_rank_o_proj": 102.6414566040039, "geo/layer_7/stable_rank_gate_proj": 93.23734283447266, "geo/layer_7/stable_rank_down_proj": 146.08265686035156, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5308863520622253, "geo/layer_7/attn_entropy_mean": 4.646556377410889, "geo/layer_7/attn_entropy_std": 0.8152611255645752, "geo/layer_14/stable_rank_q_proj": 54.61655807495117, "geo/layer_14/stable_rank_k_proj": 36.22507095336914, "geo/layer_14/stable_rank_o_proj": 51.136634826660156, "geo/layer_14/stable_rank_gate_proj": 79.49081420898438, "geo/layer_14/stable_rank_down_proj": 134.45535278320312, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3801727294921875, "geo/layer_14/attn_entropy_mean": 5.490683078765869, "geo/layer_14/attn_entropy_std": 0.37629562616348267, "geo/layer_21/stable_rank_q_proj": 44.02873992919922, "geo/layer_21/stable_rank_k_proj": 31.090608596801758, "geo/layer_21/stable_rank_o_proj": 78.3330307006836, "geo/layer_21/stable_rank_gate_proj": 76.59210205078125, "geo/layer_21/stable_rank_down_proj": 57.23249435424805, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14623604714870453, "geo/layer_21/attn_entropy_mean": 5.717371940612793, "geo/layer_21/attn_entropy_std": 0.28727537393569946, "geo/layer_27/stable_rank_q_proj": 42.211326599121094, "geo/layer_27/stable_rank_k_proj": 31.55300521850586, "geo/layer_27/stable_rank_o_proj": 117.81513977050781, "geo/layer_27/stable_rank_gate_proj": 87.14993286132812, "geo/layer_27/stable_rank_down_proj": 133.73159790039062, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08651197701692581, "geo/layer_27/attn_entropy_mean": 4.315577507019043, "geo/layer_27/attn_entropy_std": 0.6329650282859802, "attnres/final_alpha/block_0": 0.24256333708763123, "attnres/block_norm/0": 1.6984708309173584, "attnres/final_alpha/block_1": 0.005852853413671255, "attnres/block_norm/1": 37235.83203125, "attnres/final_alpha/block_2": 0.012243712320923805, "attnres/block_norm/2": 25086.0390625, "attnres/final_alpha/block_3": 0.01400733646005392, "attnres/block_norm/3": 41952.4921875, "attnres/final_alpha/block_4": 0.01767764240503311, "attnres/block_norm/4": 11919.94140625, "attnres/final_alpha/block_5": 0.5835639238357544, "attnres/block_norm/5": 5786.28662109375, "attnres/final_alpha/block_6": 0.12409120053052902, "attnres/block_norm/6": 27614.7265625, "geo/tier1_time_s": 1.357266902923584, "geo/step": 33375.0, "geo/rankme_slope": -0.00016724410467311925} {"step": 33380, "timestamp": 1778230614.1068435, "train/loss": 2.2116087675094604, "train/z_loss": 0.0014381446409970522, "train/perplexity": 9.130393264070038, "train/grad_norm": 0.1630859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1702861.4511145514, "perf/iters_per_sec": 0.8119876151631124, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.231545877456665, "data/tokens_consumed": 70005030912, "data/tokens_consumed_B": 70.005030912, "train/loss_slope": 2.00412163532714e-05} {"step": 33390, "timestamp": 1778230624.4630072, "train/loss": 2.212103176116943, "train/z_loss": 0.0014371402212418616, "train/perplexity": 9.134908525189502, "train/grad_norm": 0.263671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026274.443625384, "perf/iters_per_sec": 0.9662029474379463, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034979248046875, "data/tokens_consumed": 70026002432, "data/tokens_consumed_B": 70.026002432, "train/loss_slope": 2.0778015473208703e-05} {"step": 33400, "timestamp": 1778230634.8010614, "grad/layer_0/attn": 0.0030444376170635223, "grad/layer_0/mlp": 0.0028798053972423077, "grad/layer_0/attn_mlp_ratio": 1.0571677913591269, "grad/layer_4/attn": 0.0023905872367322445, "grad/layer_4/mlp": 0.0026071234606206417, "grad/layer_4/attn_mlp_ratio": 0.9169443569307281, "grad/layer_8/attn": 0.006530356593430042, "grad/layer_8/mlp": 0.0037211282178759575, "grad/layer_8/attn_mlp_ratio": 1.754939909505085, "grad/layer_12/attn": 0.004492572043091059, "grad/layer_12/mlp": 0.00664499681442976, "grad/layer_12/attn_mlp_ratio": 0.6760833903978704, "grad/layer_16/attn": 0.00905358511954546, "grad/layer_16/mlp": 0.004482107236981392, "grad/layer_16/attn_mlp_ratio": 2.019939380934787, "grad/layer_20/attn": 0.010290959849953651, "grad/layer_20/mlp": 0.006103557534515858, "grad/layer_20/attn_mlp_ratio": 1.6860592569418646, "grad/layer_24/attn": 0.012188803404569626, "grad/layer_24/mlp": 0.009444518946111202, "grad/layer_24/attn_mlp_ratio": 1.2905689898087915, "grad/layer_27/attn": 0.004126585088670254, "grad/layer_27/mlp": 0.008387277834117413, "grad/layer_27/attn_mlp_ratio": 0.4920052871843327} {"step": 33400, "timestamp": 1778230634.8166735, "train/loss": 2.1647545099258423, "train/z_loss": 0.0014565133722499013, "train/perplexity": 8.712462831153314, "train/grad_norm": 0.13671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026676.8360886225, "perf/iters_per_sec": 0.9663948231165993, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347737550735474, "data/tokens_consumed": 70046973952, "data/tokens_consumed_B": 70.046973952, "train/loss_slope": 1.9957207732110483e-05} {"step": 33410, "timestamp": 1778230645.1637135, "train/loss": 2.191927361488342, "train/z_loss": 0.0014409937895834446, "train/perplexity": 8.952451106931136, "train/grad_norm": 0.205078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027869.8723043373, "perf/iters_per_sec": 0.9669637071153342, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341649770736694, "data/tokens_consumed": 70067945472, "data/tokens_consumed_B": 70.067945472, "train/loss_slope": 2.0060162382586514e-05} {"step": 33420, "timestamp": 1778230655.517228, "train/loss": 2.1734177350997923, "train/z_loss": 0.0014530487824231386, "train/perplexity": 8.788268746223629, "train/grad_norm": 0.189453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027048.136885233, "perf/iters_per_sec": 0.9665718731332936, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345842123031617, "data/tokens_consumed": 70088916992, "data/tokens_consumed_B": 70.088916992, "train/loss_slope": 1.9190858771698692e-05} {"step": 33430, "timestamp": 1778230665.8639452, "train/loss": 2.1403467655181885, "train/z_loss": 0.0014552020234987139, "train/perplexity": 8.502385451851955, "train/grad_norm": 0.080078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027907.5075099336, "perf/iters_per_sec": 0.9669816529798191, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341457843780517, "data/tokens_consumed": 70109888512, "data/tokens_consumed_B": 70.109888512, "train/loss_slope": 1.7640887728833298e-05} {"step": 33440, "timestamp": 1778230676.2209282, "train/loss": 2.1512033939361572, "train/z_loss": 0.0014554630615748465, "train/perplexity": 8.595195582321297, "train/grad_norm": 0.1337890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025934.8757287737, "perf/iters_per_sec": 0.9660410288471096, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351527214050293, "data/tokens_consumed": 70130860032, "data/tokens_consumed_B": 70.130860032, "train/loss_slope": 1.374196841700313e-05} {"step": 33450, "timestamp": 1778230686.607705, "grad/layer_0/attn": 0.0026166548486799, "grad/layer_0/mlp": 0.002701409161090851, "grad/layer_0/attn_mlp_ratio": 0.9686258525756567, "grad/layer_4/attn": 0.0017797444015741348, "grad/layer_4/mlp": 0.0024683771189302206, "grad/layer_4/attn_mlp_ratio": 0.7210179983533737, "grad/layer_8/attn": 0.003858415875583887, "grad/layer_8/mlp": 0.0036320118233561516, "grad/layer_8/attn_mlp_ratio": 1.062335685291068, "grad/layer_12/attn": 0.005605421960353851, "grad/layer_12/mlp": 0.006222262512892485, "grad/layer_12/attn_mlp_ratio": 0.9008655386449708, "grad/layer_16/attn": 0.0038639912381768227, "grad/layer_16/mlp": 0.004612623248249292, "grad/layer_16/attn_mlp_ratio": 0.8376992757588563, "grad/layer_20/attn": 0.00413084588944912, "grad/layer_20/mlp": 0.006126823369413614, "grad/layer_20/attn_mlp_ratio": 0.6742230962049368, "grad/layer_24/attn": 0.007267051842063665, "grad/layer_24/mlp": 0.009695030748844147, "grad/layer_24/attn_mlp_ratio": 0.749564591940422, "grad/layer_27/attn": 0.015165034681558609, "grad/layer_27/mlp": 0.008379439823329449, "grad/layer_27/attn_mlp_ratio": 1.8097909669758696} {"step": 33450, "timestamp": 1778230687.2126095, "eos/sharpness": 62.7293825149536, "eos/L0_probe": 2.0115315914154053, "eos/L_plus": 2.2678894996643066, "eos/L_minus": 2.38246750831604, "eos/grad_norm": 0.1651526391506195, "eos/embed_grad_frac": 0.07821496576070786, "eos/time_s": 0.6016218662261963} {"step": 33450, "timestamp": 1778230687.2321107, "train/loss": 2.1794809103012085, "train/z_loss": 0.001453549030702561, "train/perplexity": 8.841715423900242, "train/grad_norm": 0.1650390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1906235.9048625838, "perf/iters_per_sec": 0.908964111739437, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1001534461975098, "data/tokens_consumed": 70151831552, "data/tokens_consumed_B": 70.151831552, "train/loss_slope": 1.4241368585329595e-05} {"step": 33450, "timestamp": 1778230688.6004398, "geo/rankme_last": 439.713134765625, "geo/layer_0/stable_rank_q_proj": 18.812925338745117, "geo/layer_0/stable_rank_k_proj": 16.133604049682617, "geo/layer_0/stable_rank_o_proj": 49.945899963378906, "geo/layer_0/stable_rank_gate_proj": 140.3625030517578, "geo/layer_0/stable_rank_down_proj": 52.463958740234375, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.061285339295864105, "geo/layer_0/attn_entropy_mean": 6.207222938537598, "geo/layer_0/attn_entropy_std": 0.3601645231246948, "geo/layer_7/stable_rank_q_proj": 42.85552215576172, "geo/layer_7/stable_rank_k_proj": 41.94292449951172, "geo/layer_7/stable_rank_o_proj": 102.57569885253906, "geo/layer_7/stable_rank_gate_proj": 93.10062408447266, "geo/layer_7/stable_rank_down_proj": 146.13096618652344, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5299353003501892, "geo/layer_7/attn_entropy_mean": 4.6369171142578125, "geo/layer_7/attn_entropy_std": 0.8153902292251587, "geo/layer_14/stable_rank_q_proj": 54.62714767456055, "geo/layer_14/stable_rank_k_proj": 36.195945739746094, "geo/layer_14/stable_rank_o_proj": 51.06532669067383, "geo/layer_14/stable_rank_gate_proj": 79.4390640258789, "geo/layer_14/stable_rank_down_proj": 134.3794708251953, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.36491724848747253, "geo/layer_14/attn_entropy_mean": 5.486176490783691, "geo/layer_14/attn_entropy_std": 0.38802191615104675, "geo/layer_21/stable_rank_q_proj": 44.06267166137695, "geo/layer_21/stable_rank_k_proj": 31.023616790771484, "geo/layer_21/stable_rank_o_proj": 78.41031646728516, "geo/layer_21/stable_rank_gate_proj": 76.43148803710938, "geo/layer_21/stable_rank_down_proj": 57.23075866699219, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14862759411334991, "geo/layer_21/attn_entropy_mean": 5.714897632598877, "geo/layer_21/attn_entropy_std": 0.2858206331729889, "geo/layer_27/stable_rank_q_proj": 42.217708587646484, "geo/layer_27/stable_rank_k_proj": 31.460126876831055, "geo/layer_27/stable_rank_o_proj": 117.7927017211914, "geo/layer_27/stable_rank_gate_proj": 87.20958709716797, "geo/layer_27/stable_rank_down_proj": 133.88412475585938, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08408400416374207, "geo/layer_27/attn_entropy_mean": 4.336778163909912, "geo/layer_27/attn_entropy_std": 0.6173849105834961, "attnres/final_alpha/block_0": 0.2431592494249344, "attnres/block_norm/0": 1.6988087892532349, "attnres/final_alpha/block_1": 0.005718533881008625, "attnres/block_norm/1": 37271.94921875, "attnres/final_alpha/block_2": 0.012232719920575619, "attnres/block_norm/2": 25128.068359375, "attnres/final_alpha/block_3": 0.01393083855509758, "attnres/block_norm/3": 41996.640625, "attnres/final_alpha/block_4": 0.01757635921239853, "attnres/block_norm/4": 11944.83203125, "attnres/final_alpha/block_5": 0.5840552449226379, "attnres/block_norm/5": 5775.8173828125, "attnres/final_alpha/block_6": 0.12332703918218613, "attnres/block_norm/6": 27766.072265625, "geo/tier1_time_s": 1.3643693923950195, "geo/step": 33450.0, "geo/rankme_slope": -0.00014467767575780313} {"step": 33460, "timestamp": 1778230698.9767745, "train/loss": 2.1238370180130004, "train/z_loss": 0.0014563866774551571, "train/perplexity": 8.36316561926056, "train/grad_norm": 0.10400390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1786211.8377334275, "perf/iters_per_sec": 0.851732176653589, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1740779876708984, "data/tokens_consumed": 70172803072, "data/tokens_consumed_B": 70.172803072, "train/loss_slope": 1.267082806360317e-05} {"step": 33470, "timestamp": 1778230709.3471596, "train/loss": 2.179129219055176, "train/z_loss": 0.0014637213200330735, "train/perplexity": 8.838606416723094, "train/grad_norm": 0.11376953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023387.6437578024, "perf/iters_per_sec": 0.9648264139927876, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0364558696746826, "data/tokens_consumed": 70193774592, "data/tokens_consumed_B": 70.193774592, "train/loss_slope": 1.306137184534493e-05} {"step": 33480, "timestamp": 1778230719.7160769, "train/loss": 2.1817126274108887, "train/z_loss": 0.0014247757499106228, "train/perplexity": 8.86146966623205, "train/grad_norm": 0.09375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023982.9833999025, "perf/iters_per_sec": 0.9651102940558922, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0361510038375854, "data/tokens_consumed": 70214746112, "data/tokens_consumed_B": 70.214746112, "train/loss_slope": 1.2848377814351558e-05} {"step": 33490, "timestamp": 1778230730.0861685, "train/loss": 2.156599998474121, "train/z_loss": 0.0014358490705490112, "train/perplexity": 8.641705839660665, "train/grad_norm": 0.1357421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023652.1907374628, "perf/iters_per_sec": 0.96495255982278, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0363203763961792, "data/tokens_consumed": 70235717632, "data/tokens_consumed_B": 70.235717632, "train/loss_slope": 1.145314300450126e-05} {"step": 33500, "timestamp": 1778230740.441036, "grad/layer_0/attn": 0.0026179819833487272, "grad/layer_0/mlp": 0.002749283565208316, "grad/layer_0/attn_mlp_ratio": 0.952241493476578, "grad/layer_4/attn": 0.0024878098629415035, "grad/layer_4/mlp": 0.0025807213969528675, "grad/layer_4/attn_mlp_ratio": 0.9639978067679649, "grad/layer_8/attn": 0.013122014701366425, "grad/layer_8/mlp": 0.0035874072927981615, "grad/layer_8/attn_mlp_ratio": 3.6577988682604885, "grad/layer_12/attn": 0.006006890442222357, "grad/layer_12/mlp": 0.006221409887075424, "grad/layer_12/attn_mlp_ratio": 0.9655191435223338, "grad/layer_16/attn": 0.0037634961772710085, "grad/layer_16/mlp": 0.004400500562041998, "grad/layer_16/attn_mlp_ratio": 0.8552427249322592, "grad/layer_20/attn": 0.005242913495749235, "grad/layer_20/mlp": 0.005863755010068417, "grad/layer_20/attn_mlp_ratio": 0.8941221789339123, "grad/layer_24/attn": 0.008169413544237614, "grad/layer_24/mlp": 0.011672921478748322, "grad/layer_24/attn_mlp_ratio": 0.6998602268613556, "grad/layer_27/attn": 0.011586545966565609, "grad/layer_27/mlp": 0.009590855799615383, "grad/layer_27/attn_mlp_ratio": 1.208082582809972} {"step": 33500, "timestamp": 1778230740.4597456, "train/loss": 2.140118730068207, "train/z_loss": 0.001448007149156183, "train/perplexity": 8.500446827605453, "train/grad_norm": 0.107421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022946.777239762, "perf/iters_per_sec": 0.9646161924551783, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0366817474365235, "data/tokens_consumed": 70256689152, "data/tokens_consumed_B": 70.256689152, "train/loss_slope": 9.487873972123932e-06} {"step": 33500, "timestamp": 1778230747.4156454, "geo/ww_alpha_mean": 7.798589195694792, "geo/ww_alpha_std": 4.663102660000605, "geo/ww_alpha_min": 1.381540629314212, "geo/ww_alpha_max": 28.509771403066726, "geo/ww_alpha_healthy_frac": 0.16243654822335024, "geo/ww_alpha_by_type/q_proj": 4.116603799427618, "geo/ww_alpha_by_type/k_proj": 4.523297454914429, "geo/ww_alpha_by_type/v_proj": 7.730010205095357, "geo/ww_alpha_by_type/o_proj": 9.26163440643759, "geo/ww_alpha_by_type/gate_proj": 8.221936068957335, "geo/ww_alpha_by_type/up_proj": 12.062490997002111, "geo/ww_alpha_by_type/down_proj": 8.791379916931183, "geo/twonn_id/layer_0": 0.7642085552215576, "geo/twonn_id/layer_7": 2.986470937728882, "geo/twonn_id/layer_14": 4.124358177185059, "geo/twonn_id/layer_21": 8.315630912780762, "geo/twonn_id/layer_27": 6.183341979980469, "geo/tier2_time_s": 6.947063684463501} {"step": 33500, "timestamp": 1778230748.0426922, "eoc/jacobian_sigma/layer_0/attn": 1099.564697265625, "eoc/jacobian_sigma/layer_0/mlp": 7415.89111328125, "eoc/jacobian_sigma/layer_0": 7415.89111328125, "eoc/jacobian_sigma/layer_7/attn": 1.1770222187042236, "eoc/jacobian_sigma/layer_7/mlp": 1.7081092596054077, "eoc/jacobian_sigma/layer_7": 1.7081092596054077, "eoc/jacobian_sigma/layer_14/attn": 1.6218128204345703, "eoc/jacobian_sigma/layer_14/mlp": 5.100194454193115, "eoc/jacobian_sigma/layer_14": 5.100194454193115, "eoc/jacobian_sigma/layer_21/attn": 1.0949867963790894, "eoc/jacobian_sigma/layer_21/mlp": 4.092069149017334, "eoc/jacobian_sigma/layer_21": 4.092069149017334, "eoc/jacobian_sigma/layer_27/attn": 2.844313621520996, "eoc/jacobian_sigma/layer_27/mlp": 23.478654861450195, "eoc/jacobian_sigma/layer_27": 23.478654861450195, "eoc/layer0_sigma": 7415.89111328125, "eoc/sigma_max": 23.478654861450195, "eoc/sigma_min": 1.7081092596054077, "eoc/sigma_mean": 8.594756931066513, "eoc/time_s": 0.6208071708679199} {"step": 33510, "timestamp": 1778230758.4204574, "train/loss": 2.193804144859314, "train/z_loss": 0.0014445161796174943, "train/perplexity": 8.969268694846328, "train/grad_norm": 0.1591796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1168139.6108895862, "perf/iters_per_sec": 0.5570123724410945, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.7952922582626343, "data/tokens_consumed": 70277660672, "data/tokens_consumed_B": 70.277660672, "train/loss_slope": 8.626979133441152e-06} {"step": 33520, "timestamp": 1778230769.1960185, "train/loss": 2.2057103395462034, "train/z_loss": 0.0014374881167896092, "train/perplexity": 9.07669681512507, "train/grad_norm": 0.265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1947360.6750141345, "perf/iters_per_sec": 0.9285739302702591, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.076920175552368, "data/tokens_consumed": 70298632192, "data/tokens_consumed_B": 70.298632192, "train/loss_slope": 9.74596502399644e-06} {"step": 33525, "timestamp": 1778230774.9937923, "eos/sharpness": 54.963612556457505, "eos/L0_probe": 2.0087592601776123, "eos/L_plus": 2.2451560497283936, "eos/L_minus": 2.3219985961914062, "eos/grad_norm": 0.16958042979240417, "eos/embed_grad_frac": 0.08858484774827957, "eos/time_s": 0.6133944988250732} {"step": 33525, "timestamp": 1778230776.3797247, "geo/rankme_last": 439.1513366699219, "geo/layer_0/stable_rank_q_proj": 18.811365127563477, "geo/layer_0/stable_rank_k_proj": 16.156230926513672, "geo/layer_0/stable_rank_o_proj": 49.90565872192383, "geo/layer_0/stable_rank_gate_proj": 140.611328125, "geo/layer_0/stable_rank_down_proj": 52.409156799316406, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05941954627633095, "geo/layer_0/attn_entropy_mean": 6.21033239364624, "geo/layer_0/attn_entropy_std": 0.36243951320648193, "geo/layer_7/stable_rank_q_proj": 42.81854248046875, "geo/layer_7/stable_rank_k_proj": 41.909358978271484, "geo/layer_7/stable_rank_o_proj": 102.64955139160156, "geo/layer_7/stable_rank_gate_proj": 92.94969177246094, "geo/layer_7/stable_rank_down_proj": 145.95274353027344, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5276591181755066, "geo/layer_7/attn_entropy_mean": 4.629975318908691, "geo/layer_7/attn_entropy_std": 0.8193954229354858, "geo/layer_14/stable_rank_q_proj": 54.811302185058594, "geo/layer_14/stable_rank_k_proj": 36.129268646240234, "geo/layer_14/stable_rank_o_proj": 51.09647750854492, "geo/layer_14/stable_rank_gate_proj": 79.37178039550781, "geo/layer_14/stable_rank_down_proj": 134.2373046875, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3762737810611725, "geo/layer_14/attn_entropy_mean": 5.504777908325195, "geo/layer_14/attn_entropy_std": 0.4038179814815521, "geo/layer_21/stable_rank_q_proj": 44.08660125732422, "geo/layer_21/stable_rank_k_proj": 31.04500389099121, "geo/layer_21/stable_rank_o_proj": 78.32836151123047, "geo/layer_21/stable_rank_gate_proj": 76.45769500732422, "geo/layer_21/stable_rank_down_proj": 57.17130661010742, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1510397493839264, "geo/layer_21/attn_entropy_mean": 5.7037835121154785, "geo/layer_21/attn_entropy_std": 0.2895757555961609, "geo/layer_27/stable_rank_q_proj": 42.289676666259766, "geo/layer_27/stable_rank_k_proj": 31.398784637451172, "geo/layer_27/stable_rank_o_proj": 118.00617980957031, "geo/layer_27/stable_rank_gate_proj": 87.26325225830078, "geo/layer_27/stable_rank_down_proj": 133.9852294921875, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08990409970283508, "geo/layer_27/attn_entropy_mean": 4.307666778564453, "geo/layer_27/attn_entropy_std": 0.6327478289604187, "attnres/final_alpha/block_0": 0.24229007959365845, "attnres/block_norm/0": 1.6991713047027588, "attnres/final_alpha/block_1": 0.0057524750009179115, "attnres/block_norm/1": 37263.97265625, "attnres/final_alpha/block_2": 0.011982602998614311, "attnres/block_norm/2": 25200.22265625, "attnres/final_alpha/block_3": 0.01398690976202488, "attnres/block_norm/3": 41921.5078125, "attnres/final_alpha/block_4": 0.017429769039154053, "attnres/block_norm/4": 11957.564453125, "attnres/final_alpha/block_5": 0.583916425704956, "attnres/block_norm/5": 5795.1513671875, "attnres/final_alpha/block_6": 0.12464174628257751, "attnres/block_norm/6": 27709.515625, "geo/tier1_time_s": 1.3655142784118652, "geo/step": 33525.0, "geo/rankme_slope": -0.0001445864087822629} {"step": 33530, "timestamp": 1778230781.5643528, "train/loss": 2.17146475315094, "train/z_loss": 0.0014611324178986252, "train/perplexity": 8.771122164932418, "train/grad_norm": 0.2001953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1696440.2387527307, "perf/iters_per_sec": 0.808925742508283, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.236207413673401, "data/tokens_consumed": 70319603712, "data/tokens_consumed_B": 70.319603712, "train/loss_slope": 8.465724924180816e-06} {"step": 33540, "timestamp": 1778230791.9336123, "train/loss": 2.1111387610435486, "train/z_loss": 0.0014580289600417018, "train/perplexity": 8.257639410573633, "train/grad_norm": 0.09423828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023585.5703329113, "perf/iters_per_sec": 0.9649207927383954, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0363544940948486, "data/tokens_consumed": 70340575232, "data/tokens_consumed_B": 70.340575232, "train/loss_slope": 2.950172448637558e-06} {"step": 33550, "timestamp": 1778230802.284439, "grad/layer_0/attn": 0.003207109635695815, "grad/layer_0/mlp": 0.0029430000577121973, "grad/layer_0/attn_mlp_ratio": 1.089741577923981, "grad/layer_4/attn": 0.002207415644079447, "grad/layer_4/mlp": 0.0024432740174233913, "grad/layer_4/attn_mlp_ratio": 0.9034662252335918, "grad/layer_8/attn": 0.006054082419723272, "grad/layer_8/mlp": 0.003587715793401003, "grad/layer_8/attn_mlp_ratio": 1.6874475570539835, "grad/layer_12/attn": 0.005534996744245291, "grad/layer_12/mlp": 0.00681016081944108, "grad/layer_12/attn_mlp_ratio": 0.8127556469986551, "grad/layer_16/attn": 0.005514646414667368, "grad/layer_16/mlp": 0.004922615829855204, "grad/layer_16/attn_mlp_ratio": 1.1202674539814401, "grad/layer_20/attn": 0.008417141623795033, "grad/layer_20/mlp": 0.007428920362144709, "grad/layer_20/attn_mlp_ratio": 1.1330235216120523, "grad/layer_24/attn": 0.030803298577666283, "grad/layer_24/mlp": 0.01832754537463188, "grad/layer_24/attn_mlp_ratio": 1.6807105250565464, "grad/layer_27/attn": 0.00951852835714817, "grad/layer_27/mlp": 0.017477668821811676, "grad/layer_27/attn_mlp_ratio": 0.5446108631380066} {"step": 33550, "timestamp": 1778230802.3001666, "train/loss": 2.2231413602828978, "train/z_loss": 0.0014399886946193873, "train/perplexity": 9.236299886618884, "train/grad_norm": 0.314453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024473.6440780053, "perf/iters_per_sec": 0.9653442592992808, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0358998775482178, "data/tokens_consumed": 70361546752, "data/tokens_consumed_B": 70.361546752, "train/loss_slope": 4.050851769537483e-06} {"step": 33560, "timestamp": 1778230812.6819487, "train/loss": 2.190406584739685, "train/z_loss": 0.001443783228751272, "train/perplexity": 8.938846774642302, "train/grad_norm": 0.1025390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021193.4975905635, "perf/iters_per_sec": 0.9637801635697191, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0375810146331788, "data/tokens_consumed": 70382518272, "data/tokens_consumed_B": 70.382518272, "train/loss_slope": 4.467212556540858e-06} {"step": 33570, "timestamp": 1778230823.0541794, "train/loss": 2.183792293071747, "train/z_loss": 0.0014468417852185667, "train/perplexity": 8.879917736661934, "train/grad_norm": 0.185546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023207.6721836522, "perf/iters_per_sec": 0.9647405968588124, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0365480661392212, "data/tokens_consumed": 70403489792, "data/tokens_consumed_B": 70.403489792, "train/loss_slope": 4.723058632462398e-06} {"step": 33580, "timestamp": 1778230833.4133298, "train/loss": 2.158987057209015, "train/z_loss": 0.0014458558172918857, "train/perplexity": 8.662358739104851, "train/grad_norm": 0.15625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025817.6215038404, "perf/iters_per_sec": 0.9659851176757052, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352126359939575, "data/tokens_consumed": 70424461312, "data/tokens_consumed_B": 70.424461312, "train/loss_slope": 3.6725559500721307e-06} {"step": 33590, "timestamp": 1778230843.772214, "train/loss": 2.175652599334717, "train/z_loss": 0.001453061494976282, "train/perplexity": 8.807931297113434, "train/grad_norm": 0.111328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025637.3107134954, "perf/iters_per_sec": 0.9658991387908437, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0353047847747803, "data/tokens_consumed": 70445432832, "data/tokens_consumed_B": 70.445432832, "train/loss_slope": 2.9365179669632098e-06} {"step": 33600, "timestamp": 1778230854.1127048, "grad/layer_0/attn": 0.003199934959411621, "grad/layer_0/mlp": 0.0028357880655676126, "grad/layer_0/attn_mlp_ratio": 1.1284111409538662, "grad/layer_4/attn": 0.00155706238001585, "grad/layer_4/mlp": 0.00244418578222394, "grad/layer_4/attn_mlp_ratio": 0.6370474485349268, "grad/layer_8/attn": 0.00623917393386364, "grad/layer_8/mlp": 0.003558259690180421, "grad/layer_8/attn_mlp_ratio": 1.7534340665854764, "grad/layer_12/attn": 0.004836983513087034, "grad/layer_12/mlp": 0.006360882893204689, "grad/layer_12/attn_mlp_ratio": 0.760426424798942, "grad/layer_16/attn": 0.0034936387091875076, "grad/layer_16/mlp": 0.004366003908216953, "grad/layer_16/attn_mlp_ratio": 0.8001913655169287, "grad/layer_20/attn": 0.003222899278625846, "grad/layer_20/mlp": 0.005893965717405081, "grad/layer_20/attn_mlp_ratio": 0.5468133644597183, "grad/layer_24/attn": 0.006968027912080288, "grad/layer_24/mlp": 0.008392327465116978, "grad/layer_24/attn_mlp_ratio": 0.8302855028017682, "grad/layer_27/attn": 0.005868837703019381, "grad/layer_27/mlp": 0.00770642189309001, "grad/layer_27/attn_mlp_ratio": 0.7615515615783427} {"step": 33600, "timestamp": 1778230854.7327468, "eos/sharpness": 5.385231971740722, "eos/L0_probe": 2.013355255126953, "eos/L_plus": 2.0409421920776367, "eos/L_minus": 2.0396206378936768, "eos/grad_norm": 0.09906917810440063, "eos/embed_grad_frac": 0.2897113263607025, "eos/time_s": 0.6169741153717041} {"step": 33600, "timestamp": 1778230854.7539482, "train/loss": 2.1117769360542296, "train/z_loss": 0.0014546801918186246, "train/perplexity": 8.262910911583873, "train/grad_norm": 0.09912109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1910633.9365113657, "perf/iters_per_sec": 0.9110612566525296, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0976210355758667, "data/tokens_consumed": 70466404352, "data/tokens_consumed_B": 70.466404352, "train/loss_slope": -7.475724951340329e-06} {"step": 33600, "timestamp": 1778230856.1211398, "geo/rankme_last": 438.6580810546875, "geo/layer_0/stable_rank_q_proj": 18.808847427368164, "geo/layer_0/stable_rank_k_proj": 16.134294509887695, "geo/layer_0/stable_rank_o_proj": 49.91730499267578, "geo/layer_0/stable_rank_gate_proj": 140.77047729492188, "geo/layer_0/stable_rank_down_proj": 52.49076843261719, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05221013352274895, "geo/layer_0/attn_entropy_mean": 6.209519386291504, "geo/layer_0/attn_entropy_std": 0.36150726675987244, "geo/layer_7/stable_rank_q_proj": 42.76737594604492, "geo/layer_7/stable_rank_k_proj": 42.028072357177734, "geo/layer_7/stable_rank_o_proj": 102.63777923583984, "geo/layer_7/stable_rank_gate_proj": 92.92456817626953, "geo/layer_7/stable_rank_down_proj": 145.94097900390625, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5165092349052429, "geo/layer_7/attn_entropy_mean": 4.632418155670166, "geo/layer_7/attn_entropy_std": 0.8287877440452576, "geo/layer_14/stable_rank_q_proj": 54.83686828613281, "geo/layer_14/stable_rank_k_proj": 36.16913986206055, "geo/layer_14/stable_rank_o_proj": 51.02111053466797, "geo/layer_14/stable_rank_gate_proj": 79.2729263305664, "geo/layer_14/stable_rank_down_proj": 134.31582641601562, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38207513093948364, "geo/layer_14/attn_entropy_mean": 5.528171062469482, "geo/layer_14/attn_entropy_std": 0.3886542320251465, "geo/layer_21/stable_rank_q_proj": 44.155174255371094, "geo/layer_21/stable_rank_k_proj": 31.100034713745117, "geo/layer_21/stable_rank_o_proj": 78.35901641845703, "geo/layer_21/stable_rank_gate_proj": 76.35858154296875, "geo/layer_21/stable_rank_down_proj": 57.097686767578125, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15214073657989502, "geo/layer_21/attn_entropy_mean": 5.730412483215332, "geo/layer_21/attn_entropy_std": 0.2945655286312103, "geo/layer_27/stable_rank_q_proj": 42.28556823730469, "geo/layer_27/stable_rank_k_proj": 31.463342666625977, "geo/layer_27/stable_rank_o_proj": 118.04412841796875, "geo/layer_27/stable_rank_gate_proj": 87.25968170166016, "geo/layer_27/stable_rank_down_proj": 133.92379760742188, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0844624936580658, "geo/layer_27/attn_entropy_mean": 4.2871198654174805, "geo/layer_27/attn_entropy_std": 0.6341674327850342, "attnres/final_alpha/block_0": 0.2391413450241089, "attnres/block_norm/0": 1.6993651390075684, "attnres/final_alpha/block_1": 0.005596456117928028, "attnres/block_norm/1": 37268.0, "attnres/final_alpha/block_2": 0.012076010927557945, "attnres/block_norm/2": 25195.486328125, "attnres/final_alpha/block_3": 0.013995206914842129, "attnres/block_norm/3": 42175.28125, "attnres/final_alpha/block_4": 0.017077673226594925, "attnres/block_norm/4": 12030.0390625, "attnres/final_alpha/block_5": 0.5910658836364746, "attnres/block_norm/5": 5749.634765625, "attnres/final_alpha/block_6": 0.12104736268520355, "attnres/block_norm/6": 27919.65234375, "geo/tier1_time_s": 1.3630623817443848, "geo/step": 33600.0, "geo/rankme_slope": -0.0001486841025472689} {"step": 33610, "timestamp": 1778230866.7979398, "train/loss": 2.16371693611145, "train/z_loss": 0.0014360293513163923, "train/perplexity": 8.703427695981247, "train/grad_norm": 0.15234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1741821.7072666306, "perf/iters_per_sec": 0.8305653129895356, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2039992332458496, "data/tokens_consumed": 70487375872, "data/tokens_consumed_B": 70.487375872, "train/loss_slope": -8.526860920116045e-06} {"step": 33620, "timestamp": 1778230877.155908, "train/loss": 2.150605845451355, "train/z_loss": 0.0014377604704350234, "train/perplexity": 8.590061070437178, "train/grad_norm": 0.09228515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025900.7665539572, "perf/iters_per_sec": 0.9660247643251215, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351701498031616, "data/tokens_consumed": 70508347392, "data/tokens_consumed_B": 70.508347392, "train/loss_slope": -1.262987986935747e-05} {"step": 33630, "timestamp": 1778230887.508715, "train/loss": 2.22363657951355, "train/z_loss": 0.0014321029302664101, "train/perplexity": 9.240875012694515, "train/grad_norm": 0.1552734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026723.3930179703, "perf/iters_per_sec": 0.9664170231904842, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034749984741211, "data/tokens_consumed": 70529318912, "data/tokens_consumed_B": 70.529318912, "train/loss_slope": -1.0402871580264092e-05} {"step": 33640, "timestamp": 1778230897.8696733, "train/loss": 2.172528886795044, "train/z_loss": 0.001429530105087906, "train/perplexity": 8.780460779010141, "train/grad_norm": 0.1259765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025047.244779924, "perf/iters_per_sec": 0.965617773427927, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0356064558029174, "data/tokens_consumed": 70550290432, "data/tokens_consumed_B": 70.550290432, "train/loss_slope": -1.0246590412024287e-05} {"step": 33650, "timestamp": 1778230908.2172117, "grad/layer_0/attn": 0.0026870043948292732, "grad/layer_0/mlp": 0.0028992951847612858, "grad/layer_0/attn_mlp_ratio": 0.9267784516300168, "grad/layer_4/attn": 0.0022608500439673662, "grad/layer_4/mlp": 0.0025510243140161037, "grad/layer_4/attn_mlp_ratio": 0.8862518255590063, "grad/layer_8/attn": 0.00665042782202363, "grad/layer_8/mlp": 0.003697938285768032, "grad/layer_8/attn_mlp_ratio": 1.7984149891784609, "grad/layer_12/attn": 0.004414362832903862, "grad/layer_12/mlp": 0.0067476993426680565, "grad/layer_12/attn_mlp_ratio": 0.6542026464590741, "grad/layer_16/attn": 0.00623871898278594, "grad/layer_16/mlp": 0.004561992362141609, "grad/layer_16/attn_mlp_ratio": 1.3675425890241824, "grad/layer_20/attn": 0.0042763929814100266, "grad/layer_20/mlp": 0.0063644349575042725, "grad/layer_20/attn_mlp_ratio": 0.6719202792976502, "grad/layer_24/attn": 0.007704285439103842, "grad/layer_24/mlp": 0.011356623843312263, "grad/layer_24/attn_mlp_ratio": 0.6783957519030798, "grad/layer_27/attn": 0.010474905371665955, "grad/layer_27/mlp": 0.008536959998309612, "grad/layer_27/attn_mlp_ratio": 1.227006481351608} {"step": 33650, "timestamp": 1778230908.2330883, "train/loss": 2.162387466430664, "train/z_loss": 0.0014394217752851546, "train/perplexity": 8.691864440942183, "train/grad_norm": 0.11376953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024829.6418775828, "perf/iters_per_sec": 0.9655140122783579, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035717749595642, "data/tokens_consumed": 70571261952, "data/tokens_consumed_B": 70.571261952, "train/loss_slope": -1.088305615534697e-05} {"step": 33660, "timestamp": 1778230918.5900083, "train/loss": 2.1889602661132814, "train/z_loss": 0.0014520147233270109, "train/perplexity": 8.925927698855594, "train/grad_norm": 0.201171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025941.8750174758, "perf/iters_per_sec": 0.9660443663680438, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351491451263428, "data/tokens_consumed": 70592233472, "data/tokens_consumed_B": 70.592233472, "train/loss_slope": -1.2125887743460298e-05} {"step": 33670, "timestamp": 1778230928.9419181, "train/loss": 2.2060556650161742, "train/z_loss": 0.00143983936868608, "train/perplexity": 9.079831770977433, "train/grad_norm": 0.1455078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026789.239468515, "perf/iters_per_sec": 0.9664484212248396, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347163677215576, "data/tokens_consumed": 70613204992, "data/tokens_consumed_B": 70.613204992, "train/loss_slope": -1.0955284108923864e-05} {"step": 33675, "timestamp": 1778230934.719242, "eos/sharpness": 47.95489311218261, "eos/L0_probe": 2.016352415084839, "eos/L_plus": 2.2929790019989014, "eos/L_minus": 2.2192747592926025, "eos/grad_norm": 0.16846375167369843, "eos/embed_grad_frac": 0.10752346366643906, "eos/time_s": 0.6115128993988037} {"step": 33675, "timestamp": 1778230936.096062, "geo/rankme_last": 438.9154357910156, "geo/layer_0/stable_rank_q_proj": 18.831480026245117, "geo/layer_0/stable_rank_k_proj": 16.108623504638672, "geo/layer_0/stable_rank_o_proj": 49.9708137512207, "geo/layer_0/stable_rank_gate_proj": 140.69676208496094, "geo/layer_0/stable_rank_down_proj": 52.4898567199707, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05455309525132179, "geo/layer_0/attn_entropy_mean": 6.20758056640625, "geo/layer_0/attn_entropy_std": 0.36577123403549194, "geo/layer_7/stable_rank_q_proj": 42.881202697753906, "geo/layer_7/stable_rank_k_proj": 42.05498123168945, "geo/layer_7/stable_rank_o_proj": 102.91836547851562, "geo/layer_7/stable_rank_gate_proj": 92.93598937988281, "geo/layer_7/stable_rank_down_proj": 145.63633728027344, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.512065589427948, "geo/layer_7/attn_entropy_mean": 4.651028633117676, "geo/layer_7/attn_entropy_std": 0.8190205097198486, "geo/layer_14/stable_rank_q_proj": 54.78685760498047, "geo/layer_14/stable_rank_k_proj": 36.25754928588867, "geo/layer_14/stable_rank_o_proj": 51.07680130004883, "geo/layer_14/stable_rank_gate_proj": 79.31558990478516, "geo/layer_14/stable_rank_down_proj": 134.18760681152344, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37583687901496887, "geo/layer_14/attn_entropy_mean": 5.489401340484619, "geo/layer_14/attn_entropy_std": 0.3901136815547943, "geo/layer_21/stable_rank_q_proj": 44.28286361694336, "geo/layer_21/stable_rank_k_proj": 31.079708099365234, "geo/layer_21/stable_rank_o_proj": 78.32127380371094, "geo/layer_21/stable_rank_gate_proj": 76.26447296142578, "geo/layer_21/stable_rank_down_proj": 57.10653305053711, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14950688183307648, "geo/layer_21/attn_entropy_mean": 5.7112908363342285, "geo/layer_21/attn_entropy_std": 0.2860909700393677, "geo/layer_27/stable_rank_q_proj": 42.25938415527344, "geo/layer_27/stable_rank_k_proj": 31.489044189453125, "geo/layer_27/stable_rank_o_proj": 118.42505645751953, "geo/layer_27/stable_rank_gate_proj": 87.21540832519531, "geo/layer_27/stable_rank_down_proj": 133.74757385253906, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08179926127195358, "geo/layer_27/attn_entropy_mean": 4.271223545074463, "geo/layer_27/attn_entropy_std": 0.6151125431060791, "attnres/final_alpha/block_0": 0.24032866954803467, "attnres/block_norm/0": 1.6994736194610596, "attnres/final_alpha/block_1": 0.005550289060920477, "attnres/block_norm/1": 37453.7421875, "attnres/final_alpha/block_2": 0.011849047616124153, "attnres/block_norm/2": 25183.99609375, "attnres/final_alpha/block_3": 0.01382347010076046, "attnres/block_norm/3": 42422.98828125, "attnres/final_alpha/block_4": 0.01713588833808899, "attnres/block_norm/4": 11986.865234375, "attnres/final_alpha/block_5": 0.5897059440612793, "attnres/block_norm/5": 5745.70654296875, "attnres/final_alpha/block_6": 0.12160669267177582, "attnres/block_norm/6": 27978.447265625, "geo/tier1_time_s": 1.3568768501281738, "geo/step": 33675.0, "geo/rankme_slope": -0.0001543727061136955} {"step": 33680, "timestamp": 1778230941.27698, "train/loss": 2.1796274542808534, "train/z_loss": 0.0014438074664212762, "train/perplexity": 8.843011219008515, "train/grad_norm": 0.1357421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1700867.2904805422, "perf/iters_per_sec": 0.8110367252733909, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.23298978805542, "data/tokens_consumed": 70634176512, "data/tokens_consumed_B": 70.634176512, "train/loss_slope": -1.3419712530468568e-05} {"step": 33690, "timestamp": 1778230951.6285312, "train/loss": 2.1603625297546385, "train/z_loss": 0.0014532010070979595, "train/perplexity": 8.674281773754451, "train/grad_norm": 0.1220703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026730.3043593226, "perf/iters_per_sec": 0.966420318774854, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347464561462403, "data/tokens_consumed": 70655148032, "data/tokens_consumed_B": 70.655148032, "train/loss_slope": -1.3304057487524413e-05} {"step": 33700, "timestamp": 1778230962.9569006, "grad/layer_0/attn": 0.0030369083397090435, "grad/layer_0/mlp": 0.0029229936189949512, "grad/layer_0/attn_mlp_ratio": 1.0389719006147098, "grad/layer_4/attn": 0.0019885487854480743, "grad/layer_4/mlp": 0.0024884731974452734, "grad/layer_4/attn_mlp_ratio": 0.7991039274922365, "grad/layer_8/attn": 0.003682208713144064, "grad/layer_8/mlp": 0.0036997657734900713, "grad/layer_8/attn_mlp_ratio": 0.9952545212463821, "grad/layer_12/attn": 0.005533487070351839, "grad/layer_12/mlp": 0.006882823538035154, "grad/layer_12/attn_mlp_ratio": 0.8039559578097062, "grad/layer_16/attn": 0.004425994586199522, "grad/layer_16/mlp": 0.004544760100543499, "grad/layer_16/attn_mlp_ratio": 0.9738675729624252, "grad/layer_20/attn": 0.006410467438399792, "grad/layer_20/mlp": 0.0062627834267914295, "grad/layer_20/attn_mlp_ratio": 1.023581193725855, "grad/layer_24/attn": 0.006164271850138903, "grad/layer_24/mlp": 0.008729199878871441, "grad/layer_24/attn_mlp_ratio": 0.7061668726869806, "grad/layer_27/attn": 0.0052046398632228374, "grad/layer_27/mlp": 0.0073380060493946075, "grad/layer_27/attn_mlp_ratio": 0.7092716682517669} {"step": 33700, "timestamp": 1778230962.972901, "train/loss": 2.1810644626617433, "train/z_loss": 0.0014549556537531317, "train/perplexity": 8.855727834996099, "train/grad_norm": 0.12109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1859089.3293983166, "perf/iters_per_sec": 0.8864828726760466, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1280533790588378, "data/tokens_consumed": 70676119552, "data/tokens_consumed_B": 70.676119552, "train/loss_slope": -1.3793839293845287e-05} {"step": 33710, "timestamp": 1778230973.328009, "train/loss": 2.207673501968384, "train/z_loss": 0.0014451006310991943, "train/perplexity": 9.094533347506596, "train/grad_norm": 0.19921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026345.39576807, "perf/iters_per_sec": 0.9662367800560331, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349430084228515, "data/tokens_consumed": 70697091072, "data/tokens_consumed_B": 70.697091072, "train/loss_slope": -1.0447505991844422e-05} {"step": 33720, "timestamp": 1778230983.6871908, "train/loss": 2.233021855354309, "train/z_loss": 0.0014323156094178558, "train/perplexity": 9.328011433767305, "train/grad_norm": 0.1171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025627.5613164403, "perf/iters_per_sec": 0.9658944899160578, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0353097677230836, "data/tokens_consumed": 70718062592, "data/tokens_consumed_B": 70.718062592, "train/loss_slope": -6.115202391573199e-06} {"step": 33730, "timestamp": 1778230994.0345066, "train/loss": 2.174792504310608, "train/z_loss": 0.0014412387507036328, "train/perplexity": 8.800358896191087, "train/grad_norm": 0.1650390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027452.6592639305, "perf/iters_per_sec": 0.9667647644347813, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343777894973756, "data/tokens_consumed": 70739034112, "data/tokens_consumed_B": 70.739034112, "train/loss_slope": -6.848065323061337e-06} {"step": 33740, "timestamp": 1778231004.3942778, "train/loss": 2.1886242866516112, "train/z_loss": 0.001438874891027808, "train/perplexity": 8.922929274205181, "train/grad_norm": 0.154296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025459.5972076769, "perf/iters_per_sec": 0.9658143983877548, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035395622253418, "data/tokens_consumed": 70760005632, "data/tokens_consumed_B": 70.760005632, "train/loss_slope": -3.507089393116668e-06} {"step": 33750, "timestamp": 1778231014.7394814, "grad/layer_0/attn": 0.0028917465824633837, "grad/layer_0/mlp": 0.0027888359036296606, "grad/layer_0/attn_mlp_ratio": 1.0369009072960136, "grad/layer_4/attn": 0.0021432661451399326, "grad/layer_4/mlp": 0.002592748962342739, "grad/layer_4/attn_mlp_ratio": 0.826638480471895, "grad/layer_8/attn": 0.00688930694013834, "grad/layer_8/mlp": 0.003609107807278633, "grad/layer_8/attn_mlp_ratio": 1.9088669879458016, "grad/layer_12/attn": 0.006793429143726826, "grad/layer_12/mlp": 0.006581883877515793, "grad/layer_12/attn_mlp_ratio": 1.0321405188747914, "grad/layer_16/attn": 0.003959940746426582, "grad/layer_16/mlp": 0.0046339756809175014, "grad/layer_16/attn_mlp_ratio": 0.8545449811657273, "grad/layer_20/attn": 0.004998623859137297, "grad/layer_20/mlp": 0.0064070383086800575, "grad/layer_20/attn_mlp_ratio": 0.7801769773012935, "grad/layer_24/attn": 0.010518405586481094, "grad/layer_24/mlp": 0.010281410999596119, "grad/layer_24/attn_mlp_ratio": 1.0230507743138766, "grad/layer_27/attn": 0.007987185381352901, "grad/layer_27/mlp": 0.00944247655570507, "grad/layer_27/attn_mlp_ratio": 0.8458782237525688} {"step": 33750, "timestamp": 1778231015.351636, "eos/sharpness": 46.50945663452148, "eos/L0_probe": 2.011972427368164, "eos/L_plus": 2.217013359069824, "eos/L_minus": 2.2720260620117188, "eos/grad_norm": 0.15005294978618622, "eos/embed_grad_frac": 0.1162792295217514, "eos/time_s": 0.6093108654022217} {"step": 33750, "timestamp": 1778231015.3711715, "train/loss": 2.1699591159820555, "train/z_loss": 0.0014537779032252729, "train/perplexity": 8.7579259742186, "train/grad_norm": 0.150390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1911453.073216788, "perf/iters_per_sec": 0.911451851471323, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0971506595611573, "data/tokens_consumed": 70780977152, "data/tokens_consumed_B": 70.780977152, "train/loss_slope": -2.732779097707266e-06} {"step": 33750, "timestamp": 1778231016.734586, "geo/rankme_last": 439.5936584472656, "geo/layer_0/stable_rank_q_proj": 18.849943161010742, "geo/layer_0/stable_rank_k_proj": 16.173006057739258, "geo/layer_0/stable_rank_o_proj": 49.99455261230469, "geo/layer_0/stable_rank_gate_proj": 140.44081115722656, "geo/layer_0/stable_rank_down_proj": 52.443931579589844, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.057011187076568604, "geo/layer_0/attn_entropy_mean": 6.204665184020996, "geo/layer_0/attn_entropy_std": 0.36300498247146606, "geo/layer_7/stable_rank_q_proj": 42.94943618774414, "geo/layer_7/stable_rank_k_proj": 41.94435501098633, "geo/layer_7/stable_rank_o_proj": 102.70992279052734, "geo/layer_7/stable_rank_gate_proj": 93.05289459228516, "geo/layer_7/stable_rank_down_proj": 146.21507263183594, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5142890810966492, "geo/layer_7/attn_entropy_mean": 4.637140274047852, "geo/layer_7/attn_entropy_std": 0.8143704533576965, "geo/layer_14/stable_rank_q_proj": 54.727516174316406, "geo/layer_14/stable_rank_k_proj": 36.1812858581543, "geo/layer_14/stable_rank_o_proj": 50.97872543334961, "geo/layer_14/stable_rank_gate_proj": 79.40241241455078, "geo/layer_14/stable_rank_down_proj": 134.0049591064453, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38116776943206787, "geo/layer_14/attn_entropy_mean": 5.503597259521484, "geo/layer_14/attn_entropy_std": 0.3916034698486328, "geo/layer_21/stable_rank_q_proj": 44.25728988647461, "geo/layer_21/stable_rank_k_proj": 31.122512817382812, "geo/layer_21/stable_rank_o_proj": 78.30387115478516, "geo/layer_21/stable_rank_gate_proj": 76.18923950195312, "geo/layer_21/stable_rank_down_proj": 57.09831237792969, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14772018790245056, "geo/layer_21/attn_entropy_mean": 5.709630489349365, "geo/layer_21/attn_entropy_std": 0.28165191411972046, "geo/layer_27/stable_rank_q_proj": 42.236427307128906, "geo/layer_27/stable_rank_k_proj": 31.470853805541992, "geo/layer_27/stable_rank_o_proj": 118.37798309326172, "geo/layer_27/stable_rank_gate_proj": 87.29605865478516, "geo/layer_27/stable_rank_down_proj": 133.66091918945312, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07819680124521255, "geo/layer_27/attn_entropy_mean": 4.3027448654174805, "geo/layer_27/attn_entropy_std": 0.6254252791404724, "attnres/final_alpha/block_0": 0.24032950401306152, "attnres/block_norm/0": 1.6998114585876465, "attnres/final_alpha/block_1": 0.005642460193485022, "attnres/block_norm/1": 37285.35546875, "attnres/final_alpha/block_2": 0.011916114017367363, "attnres/block_norm/2": 25200.658203125, "attnres/final_alpha/block_3": 0.013808729127049446, "attnres/block_norm/3": 42410.8515625, "attnres/final_alpha/block_4": 0.017368480563163757, "attnres/block_norm/4": 12004.5966796875, "attnres/final_alpha/block_5": 0.5887484550476074, "attnres/block_norm/5": 5710.19287109375, "attnres/final_alpha/block_6": 0.12218626588582993, "attnres/block_norm/6": 27889.259765625, "geo/tier1_time_s": 1.3597078323364258, "geo/step": 33750.0, "geo/rankme_slope": -0.00014580261792216886} {"step": 33760, "timestamp": 1778231027.5702603, "train/loss": 2.1981082916259767, "train/z_loss": 0.001439520006533712, "train/perplexity": 9.007956943922506, "train/grad_norm": 0.1162109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1719639.8886562542, "perf/iters_per_sec": 0.8199881976395865, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2195297479629517, "data/tokens_consumed": 70801948672, "data/tokens_consumed_B": 70.801948672, "train/loss_slope": -1.5982055678368825e-06} {"step": 33770, "timestamp": 1778231038.321354, "train/loss": 2.136981749534607, "train/z_loss": 0.001461205177474767, "train/perplexity": 8.473822872627672, "train/grad_norm": 0.296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1951719.0338471674, "perf/iters_per_sec": 0.930652157710632, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0745153188705445, "data/tokens_consumed": 70822920192, "data/tokens_consumed_B": 70.822920192, "train/loss_slope": -2.3124326502207124e-06} {"step": 33780, "timestamp": 1778231048.6811862, "train/loss": 2.1826548337936402, "train/z_loss": 0.0014584315358661115, "train/perplexity": 8.869822934145336, "train/grad_norm": 0.130859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025635.4447929214, "perf/iters_per_sec": 0.9658982490505797, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0353057384490967, "data/tokens_consumed": 70843891712, "data/tokens_consumed_B": 70.843891712, "train/loss_slope": -6.907368214181155e-06} {"step": 33790, "timestamp": 1778231059.0272696, "train/loss": 2.1717607259750364, "train/z_loss": 0.0014434283482842147, "train/perplexity": 8.773718562942738, "train/grad_norm": 0.1943359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027984.8393846538, "perf/iters_per_sec": 0.9670185276911992, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341063499450684, "data/tokens_consumed": 70864863232, "data/tokens_consumed_B": 70.864863232, "train/loss_slope": -9.254662160075087e-06} {"step": 33800, "timestamp": 1778231069.3758197, "grad/layer_0/attn": 0.002976830583065748, "grad/layer_0/mlp": 0.0028325440362095833, "grad/layer_0/attn_mlp_ratio": 1.0509388168084273, "grad/layer_4/attn": 0.0026050484739243984, "grad/layer_4/mlp": 0.0023988436441868544, "grad/layer_4/attn_mlp_ratio": 1.0859600506440832, "grad/layer_8/attn": 0.005571682937443256, "grad/layer_8/mlp": 0.003721361979842186, "grad/layer_8/attn_mlp_ratio": 1.497216024106834, "grad/layer_12/attn": 0.00438132556155324, "grad/layer_12/mlp": 0.006464771460741758, "grad/layer_12/attn_mlp_ratio": 0.6777231833154725, "grad/layer_16/attn": 0.005396679975092411, "grad/layer_16/mlp": 0.00436134310439229, "grad/layer_16/attn_mlp_ratio": 1.2373894284809868, "grad/layer_20/attn": 0.0033814182970672846, "grad/layer_20/mlp": 0.0057494547218084335, "grad/layer_20/attn_mlp_ratio": 0.5881285098964728, "grad/layer_24/attn": 0.006016266066581011, "grad/layer_24/mlp": 0.008447818458080292, "grad/layer_24/attn_mlp_ratio": 0.7121680023331569, "grad/layer_27/attn": 0.00408925162628293, "grad/layer_27/mlp": 0.007750730495899916, "grad/layer_27/attn_mlp_ratio": 0.5275956344613651} {"step": 33800, "timestamp": 1778231069.3917668, "train/loss": 2.2066232681274416, "train/z_loss": 0.0014544032281264662, "train/perplexity": 9.084986974656843, "train/grad_norm": 0.1005859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024494.6118097953, "perf/iters_per_sec": 0.9653542574929215, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0358891487121582, "data/tokens_consumed": 70885834752, "data/tokens_consumed_B": 70.885834752, "train/loss_slope": -7.380333220032339e-06} {"step": 33810, "timestamp": 1778231079.7709408, "train/loss": 2.2085346937179566, "train/z_loss": 0.0014423957094550134, "train/perplexity": 9.10236885804593, "train/grad_norm": 0.1728515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021763.9396306202, "perf/iters_per_sec": 0.9640521715310193, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0372882604598999, "data/tokens_consumed": 70906806272, "data/tokens_consumed_B": 70.906806272, "train/loss_slope": -3.902314476805178e-06} {"step": 33820, "timestamp": 1778231090.154265, "train/loss": 2.211705279350281, "train/z_loss": 0.0014265263103879987, "train/perplexity": 9.131274497655422, "train/grad_norm": 0.125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021700.2315019947, "perf/iters_per_sec": 0.9640217931280111, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373209476470948, "data/tokens_consumed": 70927777792, "data/tokens_consumed_B": 70.927777792, "train/loss_slope": -2.963998952213773e-06} {"step": 33825, "timestamp": 1778231095.9644566, "eos/sharpness": 62.315368652343736, "eos/L0_probe": 2.0101301670074463, "eos/L_plus": 2.3914332389831543, "eos/L_minus": 2.251980781555176, "eos/grad_norm": 0.17836087942123413, "eos/embed_grad_frac": 0.07896095514297485, "eos/time_s": 0.609743595123291} {"step": 33825, "timestamp": 1778231097.354488, "geo/rankme_last": 438.2787780761719, "geo/layer_0/stable_rank_q_proj": 18.855886459350586, "geo/layer_0/stable_rank_k_proj": 16.18360137939453, "geo/layer_0/stable_rank_o_proj": 49.97417068481445, "geo/layer_0/stable_rank_gate_proj": 140.39974975585938, "geo/layer_0/stable_rank_down_proj": 52.47848892211914, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05785394459962845, "geo/layer_0/attn_entropy_mean": 6.205158233642578, "geo/layer_0/attn_entropy_std": 0.3655022382736206, "geo/layer_7/stable_rank_q_proj": 42.90971374511719, "geo/layer_7/stable_rank_k_proj": 41.940650939941406, "geo/layer_7/stable_rank_o_proj": 102.48799133300781, "geo/layer_7/stable_rank_gate_proj": 93.01329040527344, "geo/layer_7/stable_rank_down_proj": 146.4607696533203, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5104133486747742, "geo/layer_7/attn_entropy_mean": 4.630620002746582, "geo/layer_7/attn_entropy_std": 0.8297948837280273, "geo/layer_14/stable_rank_q_proj": 54.611122131347656, "geo/layer_14/stable_rank_k_proj": 36.151451110839844, "geo/layer_14/stable_rank_o_proj": 50.93211364746094, "geo/layer_14/stable_rank_gate_proj": 79.3093032836914, "geo/layer_14/stable_rank_down_proj": 134.14651489257812, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38112419843673706, "geo/layer_14/attn_entropy_mean": 5.498978137969971, "geo/layer_14/attn_entropy_std": 0.39152607321739197, "geo/layer_21/stable_rank_q_proj": 44.32237243652344, "geo/layer_21/stable_rank_k_proj": 31.22312355041504, "geo/layer_21/stable_rank_o_proj": 78.27892303466797, "geo/layer_21/stable_rank_gate_proj": 76.02552032470703, "geo/layer_21/stable_rank_down_proj": 57.074005126953125, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1498718410730362, "geo/layer_21/attn_entropy_mean": 5.748650550842285, "geo/layer_21/attn_entropy_std": 0.272628515958786, "geo/layer_27/stable_rank_q_proj": 42.34920120239258, "geo/layer_27/stable_rank_k_proj": 31.573135375976562, "geo/layer_27/stable_rank_o_proj": 118.16646575927734, "geo/layer_27/stable_rank_gate_proj": 87.20320892333984, "geo/layer_27/stable_rank_down_proj": 133.61309814453125, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07873688638210297, "geo/layer_27/attn_entropy_mean": 4.3157196044921875, "geo/layer_27/attn_entropy_std": 0.6325533390045166, "attnres/final_alpha/block_0": 0.2399192899465561, "attnres/block_norm/0": 1.700160264968872, "attnres/final_alpha/block_1": 0.005519072990864515, "attnres/block_norm/1": 37536.77734375, "attnres/final_alpha/block_2": 0.011736719869077206, "attnres/block_norm/2": 25251.517578125, "attnres/final_alpha/block_3": 0.013733475469052792, "attnres/block_norm/3": 42683.12109375, "attnres/final_alpha/block_4": 0.017251506447792053, "attnres/block_norm/4": 11960.1875, "attnres/final_alpha/block_5": 0.592602550983429, "attnres/block_norm/5": 5699.8603515625, "attnres/final_alpha/block_6": 0.11923738569021225, "attnres/block_norm/6": 28160.421875, "geo/tier1_time_s": 1.3614115715026855, "geo/step": 33825.0, "geo/rankme_slope": -0.00017155090551845739} {"step": 33830, "timestamp": 1778231102.5462687, "train/loss": 2.1446191549301146, "train/z_loss": 0.0014402175438590348, "train/perplexity": 8.538788662404981, "train/grad_norm": 0.1025390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1693524.0299407877, "perf/iters_per_sec": 0.8075351857856692, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2383361339569092, "data/tokens_consumed": 70948749312, "data/tokens_consumed_B": 70.948749312, "train/loss_slope": -3.0600606089699675e-06} {"step": 33840, "timestamp": 1778231112.9151387, "train/loss": 2.1627850770950316, "train/z_loss": 0.0014503581332974136, "train/perplexity": 8.695321106095061, "train/grad_norm": 0.185546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023700.7505117175, "perf/iters_per_sec": 0.9649757149275386, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036295509338379, "data/tokens_consumed": 70969720832, "data/tokens_consumed_B": 70.969720832, "train/loss_slope": -9.015997529852454e-06} {"step": 33850, "timestamp": 1778231123.2601376, "grad/layer_0/attn": 0.0033174084965139627, "grad/layer_0/mlp": 0.0028316271491348743, "grad/layer_0/attn_mlp_ratio": 1.1715555066534642, "grad/layer_4/attn": 0.0036120200529694557, "grad/layer_4/mlp": 0.0025909149553626776, "grad/layer_4/attn_mlp_ratio": 1.3941097935624305, "grad/layer_8/attn": 0.004653298296034336, "grad/layer_8/mlp": 0.0037018435541540384, "grad/layer_8/attn_mlp_ratio": 1.257021833110812, "grad/layer_12/attn": 0.004303209949284792, "grad/layer_12/mlp": 0.006433749105781317, "grad/layer_12/attn_mlp_ratio": 0.6688495015344958, "grad/layer_16/attn": 0.0038955353666096926, "grad/layer_16/mlp": 0.00463998643681407, "grad/layer_16/attn_mlp_ratio": 0.8395574719241458, "grad/layer_20/attn": 0.006110535003244877, "grad/layer_20/mlp": 0.005994186736643314, "grad/layer_20/attn_mlp_ratio": 1.0194101668453692, "grad/layer_24/attn": 0.0077560520730912685, "grad/layer_24/mlp": 0.011343664489686489, "grad/layer_24/attn_mlp_ratio": 0.6837342564009667, "grad/layer_27/attn": 0.012845366261899471, "grad/layer_27/mlp": 0.009416256099939346, "grad/layer_27/attn_mlp_ratio": 1.3641691548263326} {"step": 33850, "timestamp": 1778231123.2764926, "train/loss": 2.202667236328125, "train/z_loss": 0.001438747881911695, "train/perplexity": 9.04911747451068, "train/grad_norm": 0.12353515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025248.1541006905, "perf/iters_per_sec": 0.9657135744574978, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0355037212371827, "data/tokens_consumed": 70990692352, "data/tokens_consumed_B": 70.990692352, "train/loss_slope": -7.0694217730526835e-06} {"step": 33860, "timestamp": 1778231133.6365085, "train/loss": 2.1600149154663084, "train/z_loss": 0.0014358837157487868, "train/perplexity": 8.6712669934896, "train/grad_norm": 0.208984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025386.1887088262, "perf/iters_per_sec": 0.965779394487775, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354331493377686, "data/tokens_consumed": 71011663872, "data/tokens_consumed_B": 71.011663872, "train/loss_slope": -8.655721941689547e-06} {"step": 33870, "timestamp": 1778231143.9934192, "train/loss": 2.174919676780701, "train/z_loss": 0.0014401081018149853, "train/perplexity": 8.80147813073602, "train/grad_norm": 0.322265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025828.2125534704, "perf/iters_per_sec": 0.9659901678817131, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352072238922119, "data/tokens_consumed": 71032635392, "data/tokens_consumed_B": 71.032635392, "train/loss_slope": -1.1082259341828476e-05} {"step": 33880, "timestamp": 1778231154.3497117, "train/loss": 2.18487434387207, "train/z_loss": 0.0014593551983125507, "train/perplexity": 8.88953145908571, "train/grad_norm": 0.158203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026056.8569225112, "perf/iters_per_sec": 0.9660991940128857, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035090398788452, "data/tokens_consumed": 71053606912, "data/tokens_consumed_B": 71.053606912, "train/loss_slope": -6.883196919450084e-06} {"step": 33890, "timestamp": 1778231164.7098138, "train/loss": 2.178285312652588, "train/z_loss": 0.001435366051737219, "train/perplexity": 8.831150606623497, "train/grad_norm": 0.1640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025399.0138253204, "perf/iters_per_sec": 0.9657855099798777, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354265928268434, "data/tokens_consumed": 71074578432, "data/tokens_consumed_B": 71.074578432, "train/loss_slope": -8.969062396866955e-06} {"step": 33900, "timestamp": 1778231175.0691829, "grad/layer_0/attn": 0.0025246390141546726, "grad/layer_0/mlp": 0.002673002891242504, "grad/layer_0/attn_mlp_ratio": 0.9444953942910229, "grad/layer_4/attn": 0.0023274533450603485, "grad/layer_4/mlp": 0.0026061790995299816, "grad/layer_4/attn_mlp_ratio": 0.8930519227074252, "grad/layer_8/attn": 0.010033692233264446, "grad/layer_8/mlp": 0.0036116933915764093, "grad/layer_8/attn_mlp_ratio": 2.778112887116842, "grad/layer_12/attn": 0.005729453172534704, "grad/layer_12/mlp": 0.006011390592902899, "grad/layer_12/attn_mlp_ratio": 0.9530994515626722, "grad/layer_16/attn": 0.004815336316823959, "grad/layer_16/mlp": 0.004525349009782076, "grad/layer_16/attn_mlp_ratio": 1.064080626711217, "grad/layer_20/attn": 0.0061884112656116486, "grad/layer_20/mlp": 0.006168810650706291, "grad/layer_20/attn_mlp_ratio": 1.0031773571434517, "grad/layer_24/attn": 0.008280106820166111, "grad/layer_24/mlp": 0.009366607293486595, "grad/layer_24/attn_mlp_ratio": 0.8840027634684442, "grad/layer_27/attn": 0.0070724948309361935, "grad/layer_27/mlp": 0.008366944268345833, "grad/layer_27/attn_mlp_ratio": 0.8452900508927903} {"step": 33900, "timestamp": 1778231175.680257, "eos/sharpness": 57.423448562622056, "eos/L0_probe": 2.011863946914673, "eos/L_plus": 2.3720548152923584, "eos/L_minus": 2.225907564163208, "eos/grad_norm": 0.1418130248785019, "eos/embed_grad_frac": 0.12190528213977814, "eos/time_s": 0.6080284118652344} {"step": 33900, "timestamp": 1778231175.7009513, "train/loss": 2.2388298749923705, "train/z_loss": 0.0014284679433330893, "train/perplexity": 9.382346343730445, "train/grad_norm": 0.1416015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1908906.4174402582, "perf/iters_per_sec": 0.910237511367921, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0986143589019775, "data/tokens_consumed": 71095549952, "data/tokens_consumed_B": 71.095549952, "train/loss_slope": -5.78977429326717e-06} {"step": 33900, "timestamp": 1778231177.0606406, "geo/rankme_last": 439.9283142089844, "geo/layer_0/stable_rank_q_proj": 18.83519744873047, "geo/layer_0/stable_rank_k_proj": 16.164531707763672, "geo/layer_0/stable_rank_o_proj": 49.88908386230469, "geo/layer_0/stable_rank_gate_proj": 140.39004516601562, "geo/layer_0/stable_rank_down_proj": 52.51197052001953, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05340820923447609, "geo/layer_0/attn_entropy_mean": 6.198117256164551, "geo/layer_0/attn_entropy_std": 0.3675808608531952, "geo/layer_7/stable_rank_q_proj": 42.965091705322266, "geo/layer_7/stable_rank_k_proj": 42.012760162353516, "geo/layer_7/stable_rank_o_proj": 102.41507720947266, "geo/layer_7/stable_rank_gate_proj": 93.15697479248047, "geo/layer_7/stable_rank_down_proj": 146.00216674804688, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5317032933235168, "geo/layer_7/attn_entropy_mean": 4.634188652038574, "geo/layer_7/attn_entropy_std": 0.8152592182159424, "geo/layer_14/stable_rank_q_proj": 54.60093688964844, "geo/layer_14/stable_rank_k_proj": 36.2293815612793, "geo/layer_14/stable_rank_o_proj": 50.993526458740234, "geo/layer_14/stable_rank_gate_proj": 79.27759552001953, "geo/layer_14/stable_rank_down_proj": 134.00936889648438, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39687228202819824, "geo/layer_14/attn_entropy_mean": 5.49691104888916, "geo/layer_14/attn_entropy_std": 0.3943121135234833, "geo/layer_21/stable_rank_q_proj": 44.3218879699707, "geo/layer_21/stable_rank_k_proj": 31.18065643310547, "geo/layer_21/stable_rank_o_proj": 78.28009796142578, "geo/layer_21/stable_rank_gate_proj": 75.85662078857422, "geo/layer_21/stable_rank_down_proj": 57.02310562133789, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14745216071605682, "geo/layer_21/attn_entropy_mean": 5.750003814697266, "geo/layer_21/attn_entropy_std": 0.29565927386283875, "geo/layer_27/stable_rank_q_proj": 42.39631271362305, "geo/layer_27/stable_rank_k_proj": 31.63245391845703, "geo/layer_27/stable_rank_o_proj": 118.17528533935547, "geo/layer_27/stable_rank_gate_proj": 87.10433959960938, "geo/layer_27/stable_rank_down_proj": 133.2939453125, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08433286845684052, "geo/layer_27/attn_entropy_mean": 4.301466941833496, "geo/layer_27/attn_entropy_std": 0.6304152607917786, "attnres/final_alpha/block_0": 0.23908080160617828, "attnres/block_norm/0": 1.700488805770874, "attnres/final_alpha/block_1": 0.005513102747499943, "attnres/block_norm/1": 37515.8203125, "attnres/final_alpha/block_2": 0.0116195660084486, "attnres/block_norm/2": 25353.330078125, "attnres/final_alpha/block_3": 0.013604417443275452, "attnres/block_norm/3": 42596.0, "attnres/final_alpha/block_4": 0.017116200178861618, "attnres/block_norm/4": 11973.2109375, "attnres/final_alpha/block_5": 0.5944056510925293, "attnres/block_norm/5": 5693.8525390625, "attnres/final_alpha/block_6": 0.11866024136543274, "attnres/block_norm/6": 28015.6953125, "geo/tier1_time_s": 1.3560223579406738, "geo/step": 33900.0, "geo/rankme_slope": -0.00013761764471413566} {"step": 33910, "timestamp": 1778231187.4204106, "train/loss": 2.1944674611091615, "train/z_loss": 0.0014289293554611503, "train/perplexity": 8.975220130144525, "train/grad_norm": 0.09326171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1790002.348835941, "perf/iters_per_sec": 0.8535396331958489, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1715917587280273, "data/tokens_consumed": 71116521472, "data/tokens_consumed_B": 71.116521472, "train/loss_slope": -4.13220229894233e-06} {"step": 33920, "timestamp": 1778231197.7736366, "train/loss": 2.197024416923523, "train/z_loss": 0.00142524519469589, "train/perplexity": 8.998198736562564, "train/grad_norm": 0.1484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026653.8152478973, "perf/iters_per_sec": 0.9663838459243285, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034785509109497, "data/tokens_consumed": 71137492992, "data/tokens_consumed_B": 71.137492992, "train/loss_slope": -4.073083075252382e-06} {"step": 33930, "timestamp": 1778231208.133524, "train/loss": 2.2278538227081297, "train/z_loss": 0.0014429762843064964, "train/perplexity": 9.279928320721819, "train/grad_norm": 0.1669921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025346.0354853175, "perf/iters_per_sec": 0.9657602479387843, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354536771774292, "data/tokens_consumed": 71158464512, "data/tokens_consumed_B": 71.158464512, "train/loss_slope": 5.529749332660606e-07} {"step": 33940, "timestamp": 1778231218.5010216, "train/loss": 2.1815449476242064, "train/z_loss": 0.0014488941873423756, "train/perplexity": 8.859983901458566, "train/grad_norm": 0.09619140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023928.6820284051, "perf/iters_per_sec": 0.9650844011442209, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0361788034439088, "data/tokens_consumed": 71179436032, "data/tokens_consumed_B": 71.179436032, "train/loss_slope": -9.236830617799634e-07} {"step": 33950, "timestamp": 1778231228.8449602, "grad/layer_0/attn": 0.002618159633129835, "grad/layer_0/mlp": 0.0028544727247208357, "grad/layer_0/attn_mlp_ratio": 0.9172130175686252, "grad/layer_4/attn": 0.002172111766412854, "grad/layer_4/mlp": 0.0025164762046188116, "grad/layer_4/attn_mlp_ratio": 0.8631560577089873, "grad/layer_8/attn": 0.005814748350530863, "grad/layer_8/mlp": 0.0036359333898872137, "grad/layer_8/attn_mlp_ratio": 1.5992449715330836, "grad/layer_12/attn": 0.003966853953897953, "grad/layer_12/mlp": 0.006230517290532589, "grad/layer_12/attn_mlp_ratio": 0.6366813067443928, "grad/layer_16/attn": 0.0037695039063692093, "grad/layer_16/mlp": 0.00457145506516099, "grad/layer_16/attn_mlp_ratio": 0.8245741826577581, "grad/layer_20/attn": 0.0046508703380823135, "grad/layer_20/mlp": 0.00566999614238739, "grad/layer_20/attn_mlp_ratio": 0.8202598624869694, "grad/layer_24/attn": 0.006850159261375666, "grad/layer_24/mlp": 0.008005267940461636, "grad/layer_24/attn_mlp_ratio": 0.8557064206660395, "grad/layer_27/attn": 0.005450038239359856, "grad/layer_27/mlp": 0.006710330490022898, "grad/layer_27/attn_mlp_ratio": 0.8121862501771703} {"step": 33950, "timestamp": 1778231228.860858, "train/loss": 2.2195982694625855, "train/z_loss": 0.0014447223860770465, "train/perplexity": 9.203632742779574, "train/grad_norm": 0.09912109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025344.6364469992, "perf/iters_per_sec": 0.9657595808253284, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354543924331665, "data/tokens_consumed": 71200407552, "data/tokens_consumed_B": 71.200407552, "train/loss_slope": 2.6441263787710206e-06} {"step": 33960, "timestamp": 1778231239.213602, "train/loss": 2.1956328630447386, "train/z_loss": 0.0014324376825243235, "train/perplexity": 8.985685966324773, "train/grad_norm": 0.107421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026927.8111874494, "perf/iters_per_sec": 0.9665144973695037, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346456289291381, "data/tokens_consumed": 71221379072, "data/tokens_consumed_B": 71.221379072, "train/loss_slope": 4.365794421887568e-06} {"step": 33970, "timestamp": 1778231249.5613546, "train/loss": 2.1962148904800416, "train/z_loss": 0.0014353593694977462, "train/perplexity": 8.99091740435474, "train/grad_norm": 0.28515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027696.8618352667, "perf/iters_per_sec": 0.9668812092949232, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034253215789795, "data/tokens_consumed": 71242350592, "data/tokens_consumed_B": 71.242350592, "train/loss_slope": 4.865586932915104e-06} {"step": 33975, "timestamp": 1778231255.3380942, "eos/sharpness": 59.251451492309556, "eos/L0_probe": 2.0128095149993896, "eos/L_plus": 2.256791591644287, "eos/L_minus": 2.361341953277588, "eos/grad_norm": 0.17336629331111908, "eos/embed_grad_frac": 0.07583983987569809, "eos/time_s": 0.612863302230835} {"step": 33975, "timestamp": 1778231256.7146149, "geo/rankme_last": 439.05377197265625, "geo/layer_0/stable_rank_q_proj": 18.859249114990234, "geo/layer_0/stable_rank_k_proj": 16.205337524414062, "geo/layer_0/stable_rank_o_proj": 49.9518928527832, "geo/layer_0/stable_rank_gate_proj": 140.52682495117188, "geo/layer_0/stable_rank_down_proj": 52.56997299194336, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0592864565551281, "geo/layer_0/attn_entropy_mean": 6.203769207000732, "geo/layer_0/attn_entropy_std": 0.3617487847805023, "geo/layer_7/stable_rank_q_proj": 42.869911193847656, "geo/layer_7/stable_rank_k_proj": 41.978981018066406, "geo/layer_7/stable_rank_o_proj": 102.67694854736328, "geo/layer_7/stable_rank_gate_proj": 92.96200561523438, "geo/layer_7/stable_rank_down_proj": 145.85604858398438, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5164850950241089, "geo/layer_7/attn_entropy_mean": 4.655938625335693, "geo/layer_7/attn_entropy_std": 0.8319681882858276, "geo/layer_14/stable_rank_q_proj": 54.65227508544922, "geo/layer_14/stable_rank_k_proj": 36.21404266357422, "geo/layer_14/stable_rank_o_proj": 51.04500961303711, "geo/layer_14/stable_rank_gate_proj": 79.24779510498047, "geo/layer_14/stable_rank_down_proj": 133.99562072753906, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3856702744960785, "geo/layer_14/attn_entropy_mean": 5.501822471618652, "geo/layer_14/attn_entropy_std": 0.379365473985672, "geo/layer_21/stable_rank_q_proj": 44.25617218017578, "geo/layer_21/stable_rank_k_proj": 31.143463134765625, "geo/layer_21/stable_rank_o_proj": 78.25202941894531, "geo/layer_21/stable_rank_gate_proj": 75.84471130371094, "geo/layer_21/stable_rank_down_proj": 56.96685791015625, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14854173362255096, "geo/layer_21/attn_entropy_mean": 5.729894638061523, "geo/layer_21/attn_entropy_std": 0.2858838438987732, "geo/layer_27/stable_rank_q_proj": 42.3311882019043, "geo/layer_27/stable_rank_k_proj": 31.54400634765625, "geo/layer_27/stable_rank_o_proj": 118.09296417236328, "geo/layer_27/stable_rank_gate_proj": 87.18403625488281, "geo/layer_27/stable_rank_down_proj": 133.2855987548828, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07792545855045319, "geo/layer_27/attn_entropy_mean": 4.288636684417725, "geo/layer_27/attn_entropy_std": 0.6495624780654907, "attnres/final_alpha/block_0": 0.24205349385738373, "attnres/block_norm/0": 1.700896978378296, "attnres/final_alpha/block_1": 0.005578044801950455, "attnres/block_norm/1": 37470.2890625, "attnres/final_alpha/block_2": 0.011946738697588444, "attnres/block_norm/2": 25199.5234375, "attnres/final_alpha/block_3": 0.013774852268397808, "attnres/block_norm/3": 42795.4296875, "attnres/final_alpha/block_4": 0.01760273613035679, "attnres/block_norm/4": 12002.99609375, "attnres/final_alpha/block_5": 0.5869874954223633, "attnres/block_norm/5": 5771.39013671875, "attnres/final_alpha/block_6": 0.12205658853054047, "attnres/block_norm/6": 27952.0703125, "geo/tier1_time_s": 1.3565008640289307, "geo/step": 33975.0, "geo/rankme_slope": -0.0001352427494435274} {"step": 33980, "timestamp": 1778231261.8917987, "train/loss": 2.186735820770264, "train/z_loss": 0.001449603820219636, "train/perplexity": 8.906094527634579, "train/grad_norm": 0.365234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1701478.4566028155, "perf/iters_per_sec": 0.8113281519903257, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.232546901702881, "data/tokens_consumed": 71263322112, "data/tokens_consumed_B": 71.263322112, "train/loss_slope": 4.537946964004126e-06} {"step": 33990, "timestamp": 1778231272.2471876, "train/loss": 2.2284509181976317, "train/z_loss": 0.0014376134844496847, "train/perplexity": 9.285470978648375, "train/grad_norm": 0.1171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026547.8700882278, "perf/iters_per_sec": 0.9663333273354663, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348396062850953, "data/tokens_consumed": 71284293632, "data/tokens_consumed_B": 71.284293632, "train/loss_slope": 8.04080261398715e-06} {"step": 34000, "timestamp": 1778231282.590188, "grad/layer_0/attn": 0.0025418263394385576, "grad/layer_0/mlp": 0.0024718150962144136, "grad/layer_0/attn_mlp_ratio": 1.0283237773322884, "grad/layer_4/attn": 0.0017989333719015121, "grad/layer_4/mlp": 0.002394805895164609, "grad/layer_4/attn_mlp_ratio": 0.7511812545708367, "grad/layer_8/attn": 0.003156725550070405, "grad/layer_8/mlp": 0.0033268809784203768, "grad/layer_8/attn_mlp_ratio": 0.9488543400442908, "grad/layer_12/attn": 0.004849692806601524, "grad/layer_12/mlp": 0.006204542238265276, "grad/layer_12/attn_mlp_ratio": 0.7816358632436131, "grad/layer_16/attn": 0.004386697430163622, "grad/layer_16/mlp": 0.004255405627191067, "grad/layer_16/attn_mlp_ratio": 1.030852922468387, "grad/layer_20/attn": 0.005267188418656588, "grad/layer_20/mlp": 0.005455820821225643, "grad/layer_20/attn_mlp_ratio": 0.9654254592860282, "grad/layer_24/attn": 0.00810203142464161, "grad/layer_24/mlp": 0.009307350032031536, "grad/layer_24/attn_mlp_ratio": 0.8704981879598808, "grad/layer_27/attn": 0.008760267868638039, "grad/layer_27/mlp": 0.0072889141738414764, "grad/layer_27/attn_mlp_ratio": 1.2018618328489568} {"step": 34000, "timestamp": 1778231282.6062737, "train/loss": 2.195976901054382, "train/z_loss": 0.0014337682398036123, "train/perplexity": 8.988777915683464, "train/grad_norm": 0.1279296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025553.954065408, "perf/iters_per_sec": 0.9658593912436524, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0353473901748658, "data/tokens_consumed": 71305265152, "data/tokens_consumed_B": 71.305265152, "train/loss_slope": 7.549179140383359e-06} {"step": 34000, "timestamp": 1778231289.676008, "geo/ww_alpha_mean": 7.570880418495146, "geo/ww_alpha_std": 4.292247085096086, "geo/ww_alpha_min": 1.3658278568213393, "geo/ww_alpha_max": 25.717820076401853, "geo/ww_alpha_healthy_frac": 0.17258883248730963, "geo/ww_alpha_by_type/q_proj": 4.079519712009999, "geo/ww_alpha_by_type/k_proj": 4.565891794388085, "geo/ww_alpha_by_type/v_proj": 7.4289485440965155, "geo/ww_alpha_by_type/o_proj": 8.222788554342511, "geo/ww_alpha_by_type/gate_proj": 8.121909964940869, "geo/ww_alpha_by_type/up_proj": 11.960619471870194, "geo/ww_alpha_by_type/down_proj": 8.721772928541803, "geo/twonn_id/layer_0": 0.7243766784667969, "geo/twonn_id/layer_7": 3.259624481201172, "geo/twonn_id/layer_14": 5.956668853759766, "geo/twonn_id/layer_21": 7.421369552612305, "geo/twonn_id/layer_27": 5.5919270515441895, "geo/tier2_time_s": 7.063131809234619} {"step": 34000, "timestamp": 1778231290.330262, "eoc/jacobian_sigma/layer_0/attn": 982.0134887695312, "eoc/jacobian_sigma/layer_0/mlp": 7573.99853515625, "eoc/jacobian_sigma/layer_0": 7573.99853515625, "eoc/jacobian_sigma/layer_7/attn": 1.1557486057281494, "eoc/jacobian_sigma/layer_7/mlp": 1.6971944570541382, "eoc/jacobian_sigma/layer_7": 1.6971944570541382, "eoc/jacobian_sigma/layer_14/attn": 1.6613566875457764, "eoc/jacobian_sigma/layer_14/mlp": 8.168014526367188, "eoc/jacobian_sigma/layer_14": 8.168014526367188, "eoc/jacobian_sigma/layer_21/attn": 1.0954418182373047, "eoc/jacobian_sigma/layer_21/mlp": 4.106914520263672, "eoc/jacobian_sigma/layer_21": 4.106914520263672, "eoc/jacobian_sigma/layer_27/attn": 2.5966670513153076, "eoc/jacobian_sigma/layer_27/mlp": 24.092670440673828, "eoc/jacobian_sigma/layer_27": 24.092670440673828, "eoc/layer0_sigma": 7573.99853515625, "eoc/sigma_max": 24.092670440673828, "eoc/sigma_min": 1.6971944570541382, "eoc/sigma_mean": 9.516198486089706, "eoc/time_s": 0.6485364437103271} {"step": 34010, "timestamp": 1778231300.700044, "train/loss": 2.1884658098220826, "train/z_loss": 0.0014452560339123012, "train/perplexity": 8.92151530870704, "train/grad_norm": 0.255859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1159409.092790828, "perf/iters_per_sec": 0.5528493370012417, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.808811068534851, "data/tokens_consumed": 71326236672, "data/tokens_consumed_B": 71.326236672, "train/loss_slope": 8.866165487608171e-06} {"step": 34020, "timestamp": 1778231311.0470402, "train/loss": 2.1902788162231444, "train/z_loss": 0.0014436765923164785, "train/perplexity": 8.937704744409631, "train/grad_norm": 0.11279296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027874.6409114865, "perf/iters_per_sec": 0.9669659809644158, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341625452041625, "data/tokens_consumed": 71347208192, "data/tokens_consumed_B": 71.347208192, "train/loss_slope": 7.501105413828871e-06} {"step": 34030, "timestamp": 1778231321.394453, "train/loss": 2.176795983314514, "train/z_loss": 0.001436526293400675, "train/perplexity": 8.81800790427152, "train/grad_norm": 0.1728515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027791.8479867487, "perf/iters_per_sec": 0.9669265022214645, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342047691345215, "data/tokens_consumed": 71368179712, "data/tokens_consumed_B": 71.368179712, "train/loss_slope": 7.345050002875489e-06} {"step": 34040, "timestamp": 1778231331.746732, "train/loss": 2.101793313026428, "train/z_loss": 0.0014600491151213646, "train/perplexity": 8.18082755038609, "train/grad_norm": 0.185546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026807.5931778415, "perf/iters_per_sec": 0.9664571729554374, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034706997871399, "data/tokens_consumed": 71389151232, "data/tokens_consumed_B": 71.389151232, "train/loss_slope": 4.480132296963042e-06} {"step": 34050, "timestamp": 1778231342.1650662, "grad/layer_0/attn": 0.002870198804885149, "grad/layer_0/mlp": 0.002685526153072715, "grad/layer_0/attn_mlp_ratio": 1.0687658709726395, "grad/layer_4/attn": 0.002263793721795082, "grad/layer_4/mlp": 0.002456466667354107, "grad/layer_4/attn_mlp_ratio": 0.9215649696060999, "grad/layer_8/attn": 0.003646611236035824, "grad/layer_8/mlp": 0.0037181617226451635, "grad/layer_8/attn_mlp_ratio": 0.9807564624612176, "grad/layer_12/attn": 0.006482838653028011, "grad/layer_12/mlp": 0.006290131714195013, "grad/layer_12/attn_mlp_ratio": 1.0306363752820116, "grad/layer_16/attn": 0.005001097917556763, "grad/layer_16/mlp": 0.004484215285629034, "grad/layer_16/attn_mlp_ratio": 1.1152671063892758, "grad/layer_20/attn": 0.004917604383081198, "grad/layer_20/mlp": 0.007183495443314314, "grad/layer_20/attn_mlp_ratio": 0.6845698383787566, "grad/layer_24/attn": 0.02063407376408577, "grad/layer_24/mlp": 0.013957488350570202, "grad/layer_24/attn_mlp_ratio": 1.4783514840195202, "grad/layer_27/attn": 0.009321815334260464, "grad/layer_27/mlp": 0.013145643286406994, "grad/layer_27/attn_mlp_ratio": 0.7091182272523467} {"step": 34050, "timestamp": 1778231342.7832904, "eos/sharpness": 67.82896518707274, "eos/L0_probe": 2.0137579441070557, "eos/L_plus": 2.303614616394043, "eos/L_minus": 2.402190923690796, "eos/grad_norm": 0.2354147732257843, "eos/embed_grad_frac": 0.0417671836912632, "eos/time_s": 0.6155450344085693} {"step": 34050, "timestamp": 1778231342.8040266, "train/loss": 2.1847164630889893, "train/z_loss": 0.0014369964716024696, "train/perplexity": 8.888128083683647, "train/grad_norm": 0.2353515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1911237.3116962814, "perf/iters_per_sec": 0.9113489683610351, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0972745180130006, "data/tokens_consumed": 71410122752, "data/tokens_consumed_B": 71.410122752, "train/loss_slope": 3.2114874232898855e-06} {"step": 34050, "timestamp": 1778231344.1687415, "geo/rankme_last": 440.24139404296875, "geo/layer_0/stable_rank_q_proj": 18.875350952148438, "geo/layer_0/stable_rank_k_proj": 16.247365951538086, "geo/layer_0/stable_rank_o_proj": 49.93405532836914, "geo/layer_0/stable_rank_gate_proj": 140.31028747558594, "geo/layer_0/stable_rank_down_proj": 52.61676788330078, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.061925433576107025, "geo/layer_0/attn_entropy_mean": 6.206214904785156, "geo/layer_0/attn_entropy_std": 0.3612975776195526, "geo/layer_7/stable_rank_q_proj": 42.8807373046875, "geo/layer_7/stable_rank_k_proj": 42.169891357421875, "geo/layer_7/stable_rank_o_proj": 102.41143798828125, "geo/layer_7/stable_rank_gate_proj": 92.95410919189453, "geo/layer_7/stable_rank_down_proj": 145.83148193359375, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5052208304405212, "geo/layer_7/attn_entropy_mean": 4.649981498718262, "geo/layer_7/attn_entropy_std": 0.8288871049880981, "geo/layer_14/stable_rank_q_proj": 54.6749153137207, "geo/layer_14/stable_rank_k_proj": 36.319175720214844, "geo/layer_14/stable_rank_o_proj": 50.93698501586914, "geo/layer_14/stable_rank_gate_proj": 79.26444244384766, "geo/layer_14/stable_rank_down_proj": 133.92718505859375, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39243084192276, "geo/layer_14/attn_entropy_mean": 5.482800483703613, "geo/layer_14/attn_entropy_std": 0.38519012928009033, "geo/layer_21/stable_rank_q_proj": 44.227867126464844, "geo/layer_21/stable_rank_k_proj": 31.117618560791016, "geo/layer_21/stable_rank_o_proj": 78.27124786376953, "geo/layer_21/stable_rank_gate_proj": 75.82820129394531, "geo/layer_21/stable_rank_down_proj": 56.8530158996582, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14613118767738342, "geo/layer_21/attn_entropy_mean": 5.720698356628418, "geo/layer_21/attn_entropy_std": 0.2974563241004944, "geo/layer_27/stable_rank_q_proj": 42.34236526489258, "geo/layer_27/stable_rank_k_proj": 31.64683723449707, "geo/layer_27/stable_rank_o_proj": 118.0196533203125, "geo/layer_27/stable_rank_gate_proj": 87.11803436279297, "geo/layer_27/stable_rank_down_proj": 133.32437133789062, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07841439545154572, "geo/layer_27/attn_entropy_mean": 4.299203872680664, "geo/layer_27/attn_entropy_std": 0.6484688520431519, "attnres/final_alpha/block_0": 0.24191510677337646, "attnres/block_norm/0": 1.7012065649032593, "attnres/final_alpha/block_1": 0.005674717947840691, "attnres/block_norm/1": 37529.8203125, "attnres/final_alpha/block_2": 0.01218387484550476, "attnres/block_norm/2": 25147.5078125, "attnres/final_alpha/block_3": 0.014038624241948128, "attnres/block_norm/3": 42710.0859375, "attnres/final_alpha/block_4": 0.017593849450349808, "attnres/block_norm/4": 12039.234375, "attnres/final_alpha/block_5": 0.5847153663635254, "attnres/block_norm/5": 5812.037109375, "attnres/final_alpha/block_6": 0.12387849390506744, "attnres/block_norm/6": 27958.197265625, "geo/tier1_time_s": 1.3606393337249756, "geo/step": 34050.0, "geo/rankme_slope": -9.419259891456583e-05} {"step": 34060, "timestamp": 1778231355.046676, "train/loss": 2.1774199485778807, "train/z_loss": 0.0014483233680948615, "train/perplexity": 8.823511751822153, "train/grad_norm": 0.140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1713560.2767982283, "perf/iters_per_sec": 0.8170892127982274, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2238565683364868, "data/tokens_consumed": 71431094272, "data/tokens_consumed_B": 71.431094272, "train/loss_slope": 6.38526965289223e-06} {"step": 34070, "timestamp": 1778231365.4086726, "train/loss": 2.1666199207305907, "train/z_loss": 0.0014521054341457783, "train/perplexity": 8.728730321513416, "train/grad_norm": 0.185546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025481.9379530705, "perf/iters_per_sec": 0.9658250512853005, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035384202003479, "data/tokens_consumed": 71452065792, "data/tokens_consumed_B": 71.452065792, "train/loss_slope": 4.08888971201117e-06} {"step": 34080, "timestamp": 1778231375.7596679, "train/loss": 2.196529579162598, "train/z_loss": 0.001436226605437696, "train/perplexity": 8.99374718953502, "train/grad_norm": 0.279296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027327.3793032714, "perf/iters_per_sec": 0.9667050262943608, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344417095184326, "data/tokens_consumed": 71473037312, "data/tokens_consumed_B": 71.473037312, "train/loss_slope": 7.286368101665326e-06} {"step": 34090, "timestamp": 1778231386.1149423, "train/loss": 2.181861472129822, "train/z_loss": 0.0014482286176644265, "train/perplexity": 8.86278874736055, "train/grad_norm": 0.13671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026280.4650287523, "perf/iters_per_sec": 0.9662058186668169, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349761724472046, "data/tokens_consumed": 71494008832, "data/tokens_consumed_B": 71.494008832, "train/loss_slope": 7.236950718673366e-06} {"step": 34100, "timestamp": 1778231396.459792, "grad/layer_0/attn": 0.0028997212648391724, "grad/layer_0/mlp": 0.002869714517146349, "grad/layer_0/attn_mlp_ratio": 1.010456317681743, "grad/layer_4/attn": 0.0020350210834294558, "grad/layer_4/mlp": 0.0025439637247473, "grad/layer_4/attn_mlp_ratio": 0.7999410461866928, "grad/layer_8/attn": 0.00354253058321774, "grad/layer_8/mlp": 0.00369640882126987, "grad/layer_8/attn_mlp_ratio": 0.9583708563285076, "grad/layer_12/attn": 0.007229043170809746, "grad/layer_12/mlp": 0.006688171066343784, "grad/layer_12/attn_mlp_ratio": 1.080869940528397, "grad/layer_16/attn": 0.005950489547103643, "grad/layer_16/mlp": 0.004363385960459709, "grad/layer_16/attn_mlp_ratio": 1.363732079777677, "grad/layer_20/attn": 0.004346577916294336, "grad/layer_20/mlp": 0.006644828245043755, "grad/layer_20/attn_mlp_ratio": 0.6541294508437321, "grad/layer_24/attn": 0.015383264049887657, "grad/layer_24/mlp": 0.011821489781141281, "grad/layer_24/attn_mlp_ratio": 1.3012965543732726, "grad/layer_27/attn": 0.005805217660963535, "grad/layer_27/mlp": 0.01177260559052229, "grad/layer_27/attn_mlp_ratio": 0.49311238425636833} {"step": 34100, "timestamp": 1778231396.4758077, "train/loss": 2.1801364183425904, "train/z_loss": 0.0014420509920455515, "train/perplexity": 8.84751313947721, "train/grad_norm": 0.1748046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025346.1753892556, "perf/iters_per_sec": 0.9657603146501806, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354536056518555, "data/tokens_consumed": 71514980352, "data/tokens_consumed_B": 71.514980352, "train/loss_slope": 4.365067627921354e-06} {"step": 34110, "timestamp": 1778231406.8298414, "train/loss": 2.2041056156158447, "train/z_loss": 0.0014425691915675998, "train/perplexity": 9.062142903165824, "train/grad_norm": 0.0927734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026254.0924804285, "perf/iters_per_sec": 0.9661932432558196, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349896430969239, "data/tokens_consumed": 71535951872, "data/tokens_consumed_B": 71.535951872, "train/loss_slope": 4.967704357200981e-06} {"step": 34120, "timestamp": 1778231417.1800048, "train/loss": 2.1931374788284304, "train/z_loss": 0.00144733494380489, "train/perplexity": 8.963291180809772, "train/grad_norm": 0.1728515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027132.5508985568, "perf/iters_per_sec": 0.9666121248715195, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034541130065918, "data/tokens_consumed": 71556923392, "data/tokens_consumed_B": 71.556923392, "train/loss_slope": 5.519218205904898e-06} {"step": 34125, "timestamp": 1778231422.9486532, "eos/sharpness": 49.83751773834228, "eos/L0_probe": 2.015049457550049, "eos/L_plus": 2.2536425590515137, "eos/L_minus": 2.274831533432007, "eos/grad_norm": 0.1941707283258438, "eos/embed_grad_frac": 0.06698870658874512, "eos/time_s": 0.6037616729736328} {"step": 34125, "timestamp": 1778231424.3286839, "geo/rankme_last": 439.0904235839844, "geo/layer_0/stable_rank_q_proj": 18.92080307006836, "geo/layer_0/stable_rank_k_proj": 16.253446578979492, "geo/layer_0/stable_rank_o_proj": 49.99733352661133, "geo/layer_0/stable_rank_gate_proj": 140.5283203125, "geo/layer_0/stable_rank_down_proj": 52.5786018371582, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05364900454878807, "geo/layer_0/attn_entropy_mean": 6.209184646606445, "geo/layer_0/attn_entropy_std": 0.35817626118659973, "geo/layer_7/stable_rank_q_proj": 42.84587478637695, "geo/layer_7/stable_rank_k_proj": 42.24209976196289, "geo/layer_7/stable_rank_o_proj": 102.26116943359375, "geo/layer_7/stable_rank_gate_proj": 92.8795394897461, "geo/layer_7/stable_rank_down_proj": 145.5384521484375, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.533596932888031, "geo/layer_7/attn_entropy_mean": 4.651359558105469, "geo/layer_7/attn_entropy_std": 0.8179603815078735, "geo/layer_14/stable_rank_q_proj": 54.685546875, "geo/layer_14/stable_rank_k_proj": 36.34836196899414, "geo/layer_14/stable_rank_o_proj": 50.912811279296875, "geo/layer_14/stable_rank_gate_proj": 79.14622497558594, "geo/layer_14/stable_rank_down_proj": 134.06527709960938, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37094780802726746, "geo/layer_14/attn_entropy_mean": 5.498948574066162, "geo/layer_14/attn_entropy_std": 0.3574516475200653, "geo/layer_21/stable_rank_q_proj": 44.284767150878906, "geo/layer_21/stable_rank_k_proj": 31.079687118530273, "geo/layer_21/stable_rank_o_proj": 78.27373504638672, "geo/layer_21/stable_rank_gate_proj": 75.68860626220703, "geo/layer_21/stable_rank_down_proj": 56.8736572265625, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1542663872241974, "geo/layer_21/attn_entropy_mean": 5.70393180847168, "geo/layer_21/attn_entropy_std": 0.2933793067932129, "geo/layer_27/stable_rank_q_proj": 42.34627914428711, "geo/layer_27/stable_rank_k_proj": 31.546266555786133, "geo/layer_27/stable_rank_o_proj": 118.0840072631836, "geo/layer_27/stable_rank_gate_proj": 87.19353485107422, "geo/layer_27/stable_rank_down_proj": 133.2101287841797, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09155237674713135, "geo/layer_27/attn_entropy_mean": 4.3151469230651855, "geo/layer_27/attn_entropy_std": 0.6428532004356384, "attnres/final_alpha/block_0": 0.24098148941993713, "attnres/block_norm/0": 1.701643466949463, "attnres/final_alpha/block_1": 0.005557245574891567, "attnres/block_norm/1": 37641.8203125, "attnres/final_alpha/block_2": 0.011849211528897285, "attnres/block_norm/2": 25251.89453125, "attnres/final_alpha/block_3": 0.01387539878487587, "attnres/block_norm/3": 42730.875, "attnres/final_alpha/block_4": 0.017448708415031433, "attnres/block_norm/4": 11979.328125, "attnres/final_alpha/block_5": 0.5882871150970459, "attnres/block_norm/5": 5798.25048828125, "attnres/final_alpha/block_6": 0.12200085818767548, "attnres/block_norm/6": 28071.177734375, "geo/tier1_time_s": 1.3601219654083252, "geo/step": 34125.0, "geo/rankme_slope": -0.0001233742325055022} {"step": 34130, "timestamp": 1778231429.5069425, "train/loss": 2.172929310798645, "train/z_loss": 0.0014519252232275902, "train/perplexity": 8.783977390289532, "train/grad_norm": 0.10498046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1702201.888299388, "perf/iters_per_sec": 0.8116731111046734, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2320230722427368, "data/tokens_consumed": 71577894912, "data/tokens_consumed_B": 71.577894912, "train/loss_slope": 5.093396629186985e-06} {"step": 34140, "timestamp": 1778231439.8566527, "train/loss": 2.117906641960144, "train/z_loss": 0.001460481563117355, "train/perplexity": 8.31371567545413, "train/grad_norm": 0.1884765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027560.9424084735, "perf/iters_per_sec": 0.9668163978617065, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343225479125977, "data/tokens_consumed": 71598866432, "data/tokens_consumed_B": 71.598866432, "train/loss_slope": 9.878434757194788e-07} {"step": 34150, "timestamp": 1778231450.2041292, "grad/layer_0/attn": 0.0028208685107529163, "grad/layer_0/mlp": 0.002935107098892331, "grad/layer_0/attn_mlp_ratio": 0.9610785295397296, "grad/layer_4/attn": 0.0018194738076999784, "grad/layer_4/mlp": 0.002485221717506647, "grad/layer_4/attn_mlp_ratio": 0.7321172681179039, "grad/layer_8/attn": 0.004744693636894226, "grad/layer_8/mlp": 0.003601099830120802, "grad/layer_8/attn_mlp_ratio": 1.3175678900793684, "grad/layer_12/attn": 0.004192472435534, "grad/layer_12/mlp": 0.0065080514177680016, "grad/layer_12/attn_mlp_ratio": 0.6441977946990575, "grad/layer_16/attn": 0.004597632680088282, "grad/layer_16/mlp": 0.004522427450865507, "grad/layer_16/attn_mlp_ratio": 1.016629371809037, "grad/layer_20/attn": 0.0047359648160636425, "grad/layer_20/mlp": 0.006230812519788742, "grad/layer_20/attn_mlp_ratio": 0.760087825626863, "grad/layer_24/attn": 0.01088690385222435, "grad/layer_24/mlp": 0.010736742056906223, "grad/layer_24/attn_mlp_ratio": 1.0139857782857846, "grad/layer_27/attn": 0.005595072638243437, "grad/layer_27/mlp": 0.00910263229161501, "grad/layer_27/attn_mlp_ratio": 0.6146653404785851} {"step": 34150, "timestamp": 1778231450.2197902, "train/loss": 2.1260823249816894, "train/z_loss": 0.0014540462754666806, "train/perplexity": 8.381964590138177, "train/grad_norm": 0.1298828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024665.9111480296, "perf/iters_per_sec": 0.9654359393825672, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0358015060424806, "data/tokens_consumed": 71619837952, "data/tokens_consumed_B": 71.619837952, "train/loss_slope": -1.6059576123818792e-06} {"step": 34160, "timestamp": 1778231460.570324, "train/loss": 2.155222308635712, "train/z_loss": 0.0014558262773789465, "train/perplexity": 8.62980844667935, "train/grad_norm": 0.212890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027227.1104160575, "perf/iters_per_sec": 0.9666572143631256, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344928741455077, "data/tokens_consumed": 71640809472, "data/tokens_consumed_B": 71.640809472, "train/loss_slope": -3.4269955792728824e-06} {"step": 34170, "timestamp": 1778231470.9190276, "train/loss": 2.1762214422225954, "train/z_loss": 0.001444085198454559, "train/perplexity": 8.812943051503996, "train/grad_norm": 0.11962890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027492.8960608784, "perf/iters_per_sec": 0.9667839508346932, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343572616577148, "data/tokens_consumed": 71661780992, "data/tokens_consumed_B": 71.661780992, "train/loss_slope": -4.098017380969349e-06} {"step": 34180, "timestamp": 1778231481.2646496, "train/loss": 2.1189191937446594, "train/z_loss": 0.001455092162359506, "train/perplexity": 8.322138006400788, "train/grad_norm": 0.1171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028041.1354623083, "perf/iters_per_sec": 0.967045371752886, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340776443481445, "data/tokens_consumed": 71682752512, "data/tokens_consumed_B": 71.682752512, "train/loss_slope": -9.327917209159413e-06} {"step": 34190, "timestamp": 1778231491.609431, "train/loss": 2.1879305839538574, "train/z_loss": 0.0014449835056439043, "train/perplexity": 8.916741560560762, "train/grad_norm": 0.10498046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028122.7329167456, "perf/iters_per_sec": 0.9670842804511764, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340360403060913, "data/tokens_consumed": 71703724032, "data/tokens_consumed_B": 71.703724032, "train/loss_slope": -8.933876821882459e-06} {"step": 34200, "timestamp": 1778231501.9471319, "grad/layer_0/attn": 0.002453909022733569, "grad/layer_0/mlp": 0.0027016669046133757, "grad/layer_0/attn_mlp_ratio": 0.908294404359703, "grad/layer_4/attn": 0.0023526574950665236, "grad/layer_4/mlp": 0.002413906389847398, "grad/layer_4/attn_mlp_ratio": 0.9746266083468932, "grad/layer_8/attn": 0.00504002720117569, "grad/layer_8/mlp": 0.00361380516551435, "grad/layer_8/attn_mlp_ratio": 1.394659321926232, "grad/layer_12/attn": 0.005364538636058569, "grad/layer_12/mlp": 0.0061206575483083725, "grad/layer_12/attn_mlp_ratio": 0.876464416783909, "grad/layer_16/attn": 0.004886234179139137, "grad/layer_16/mlp": 0.004380341153591871, "grad/layer_16/attn_mlp_ratio": 1.115491669771718, "grad/layer_20/attn": 0.004013928584754467, "grad/layer_20/mlp": 0.006031075492501259, "grad/layer_20/attn_mlp_ratio": 0.6655410835415805, "grad/layer_24/attn": 0.018310286104679108, "grad/layer_24/mlp": 0.011439208872616291, "grad/layer_24/attn_mlp_ratio": 1.6006601635227682, "grad/layer_27/attn": 0.010010682977735996, "grad/layer_27/mlp": 0.009972590953111649, "grad/layer_27/attn_mlp_ratio": 1.0038196617530468} {"step": 34200, "timestamp": 1778231502.5551114, "eos/sharpness": 69.38886642456053, "eos/L0_probe": 2.0108606815338135, "eos/L_plus": 2.2920939922332764, "eos/L_minus": 2.423516035079956, "eos/grad_norm": 0.20124351978302002, "eos/embed_grad_frac": 0.052405908703804016, "eos/time_s": 0.6049172878265381} {"step": 34200, "timestamp": 1778231502.5754404, "train/loss": 2.220221662521362, "train/z_loss": 0.001436637423466891, "train/perplexity": 9.20937201227147, "train/grad_norm": 0.201171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1913345.5949605498, "perf/iters_per_sec": 0.912354276161456, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0960654497146607, "data/tokens_consumed": 71724695552, "data/tokens_consumed_B": 71.724695552, "train/loss_slope": -8.167423860038898e-06} {"step": 34200, "timestamp": 1778231503.9379654, "geo/rankme_last": 438.90411376953125, "geo/layer_0/stable_rank_q_proj": 18.89015769958496, "geo/layer_0/stable_rank_k_proj": 16.236631393432617, "geo/layer_0/stable_rank_o_proj": 50.095802307128906, "geo/layer_0/stable_rank_gate_proj": 140.50643920898438, "geo/layer_0/stable_rank_down_proj": 52.6340217590332, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06232919171452522, "geo/layer_0/attn_entropy_mean": 6.204268455505371, "geo/layer_0/attn_entropy_std": 0.3602219521999359, "geo/layer_7/stable_rank_q_proj": 42.927757263183594, "geo/layer_7/stable_rank_k_proj": 42.3211669921875, "geo/layer_7/stable_rank_o_proj": 102.22938537597656, "geo/layer_7/stable_rank_gate_proj": 92.80764770507812, "geo/layer_7/stable_rank_down_proj": 145.50808715820312, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5216435790061951, "geo/layer_7/attn_entropy_mean": 4.663778781890869, "geo/layer_7/attn_entropy_std": 0.8188925385475159, "geo/layer_14/stable_rank_q_proj": 54.6649284362793, "geo/layer_14/stable_rank_k_proj": 36.34747314453125, "geo/layer_14/stable_rank_o_proj": 50.900352478027344, "geo/layer_14/stable_rank_gate_proj": 79.09981536865234, "geo/layer_14/stable_rank_down_proj": 133.8972930908203, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38318201899528503, "geo/layer_14/attn_entropy_mean": 5.531708717346191, "geo/layer_14/attn_entropy_std": 0.37441977858543396, "geo/layer_21/stable_rank_q_proj": 44.256202697753906, "geo/layer_21/stable_rank_k_proj": 31.179340362548828, "geo/layer_21/stable_rank_o_proj": 78.18993377685547, "geo/layer_21/stable_rank_gate_proj": 75.50611877441406, "geo/layer_21/stable_rank_down_proj": 56.96549606323242, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14744696021080017, "geo/layer_21/attn_entropy_mean": 5.729522705078125, "geo/layer_21/attn_entropy_std": 0.2868301272392273, "geo/layer_27/stable_rank_q_proj": 42.29071807861328, "geo/layer_27/stable_rank_k_proj": 31.393024444580078, "geo/layer_27/stable_rank_o_proj": 117.79461669921875, "geo/layer_27/stable_rank_gate_proj": 87.24272155761719, "geo/layer_27/stable_rank_down_proj": 133.02230834960938, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08489830791950226, "geo/layer_27/attn_entropy_mean": 4.289486885070801, "geo/layer_27/attn_entropy_std": 0.6296249032020569, "attnres/final_alpha/block_0": 0.2415887415409088, "attnres/block_norm/0": 1.7021641731262207, "attnres/final_alpha/block_1": 0.005632233805954456, "attnres/block_norm/1": 37444.9453125, "attnres/final_alpha/block_2": 0.012045076116919518, "attnres/block_norm/2": 25298.21875, "attnres/final_alpha/block_3": 0.01396099105477333, "attnres/block_norm/3": 42659.765625, "attnres/final_alpha/block_4": 0.017448604106903076, "attnres/block_norm/4": 12056.984375, "attnres/final_alpha/block_5": 0.5869632959365845, "attnres/block_norm/5": 5794.74560546875, "attnres/final_alpha/block_6": 0.12236107885837555, "attnres/block_norm/6": 28121.29296875, "geo/tier1_time_s": 1.358151912689209, "geo/step": 34200.0, "geo/rankme_slope": -0.00012325012036064426} {"step": 34210, "timestamp": 1778231514.2939765, "train/loss": 2.1630266189575194, "train/z_loss": 0.001450239063706249, "train/perplexity": 8.697421643823642, "train/grad_norm": 0.1328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1790231.2457722363, "perf/iters_per_sec": 0.8536487797604734, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1714419603347779, "data/tokens_consumed": 71745667072, "data/tokens_consumed_B": 71.745667072, "train/loss_slope": -9.916532057525517e-06} {"step": 34220, "timestamp": 1778231524.6517048, "train/loss": 2.1716569662094116, "train/z_loss": 0.0014450859744101763, "train/perplexity": 8.772808251188673, "train/grad_norm": 0.12451171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025886.7219334512, "perf/iters_per_sec": 0.9660180673281914, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351773262023927, "data/tokens_consumed": 71766638592, "data/tokens_consumed_B": 71.766638592, "train/loss_slope": -1.008793162469782e-05} {"step": 34230, "timestamp": 1778231535.0029614, "train/loss": 2.2339131593704225, "train/z_loss": 0.0014405292924493552, "train/perplexity": 9.336329234114347, "train/grad_norm": 0.2392578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026908.614581135, "perf/iters_per_sec": 0.9665053437143016, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346554279327393, "data/tokens_consumed": 71787610112, "data/tokens_consumed_B": 71.787610112, "train/loss_slope": -6.554379636304997e-06} {"step": 34240, "timestamp": 1778231545.357366, "train/loss": 2.1682364106178285, "train/z_loss": 0.001448938436806202, "train/perplexity": 8.742851636212809, "train/grad_norm": 0.1767578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026517.8955483914, "perf/iters_per_sec": 0.9663190343610722, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348549127578734, "data/tokens_consumed": 71808581632, "data/tokens_consumed_B": 71.808581632, "train/loss_slope": -5.784821524621466e-06} {"step": 34250, "timestamp": 1778231555.7008274, "grad/layer_0/attn": 0.003049686085432768, "grad/layer_0/mlp": 0.0030285585671663284, "grad/layer_0/attn_mlp_ratio": 1.0069760637280991, "grad/layer_4/attn": 0.002084232633933425, "grad/layer_4/mlp": 0.002568404655903578, "grad/layer_4/attn_mlp_ratio": 0.8114891662393692, "grad/layer_8/attn": 0.007468858268111944, "grad/layer_8/mlp": 0.0036368086002767086, "grad/layer_8/attn_mlp_ratio": 2.053684667973785, "grad/layer_12/attn": 0.004218850284814835, "grad/layer_12/mlp": 0.006864560302346945, "grad/layer_12/attn_mlp_ratio": 0.6145841885770922, "grad/layer_16/attn": 0.005381960887461901, "grad/layer_16/mlp": 0.004861642140895128, "grad/layer_16/attn_mlp_ratio": 1.1070252850343367, "grad/layer_20/attn": 0.005621842574328184, "grad/layer_20/mlp": 0.006604164373129606, "grad/layer_20/attn_mlp_ratio": 0.8512572025124148, "grad/layer_24/attn": 0.011469523422420025, "grad/layer_24/mlp": 0.011861628852784634, "grad/layer_24/attn_mlp_ratio": 0.9669433657109501, "grad/layer_27/attn": 0.00832943245768547, "grad/layer_27/mlp": 0.011615435592830181, "grad/layer_27/attn_mlp_ratio": 0.7171003032479402} {"step": 34250, "timestamp": 1778231555.7170658, "train/loss": 2.099908375740051, "train/z_loss": 0.001448466954752803, "train/perplexity": 8.165421727569267, "train/grad_norm": 0.2119140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025459.9703267266, "perf/iters_per_sec": 0.9658145763047822, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0353954315185547, "data/tokens_consumed": 71829553152, "data/tokens_consumed_B": 71.829553152, "train/loss_slope": -7.060233337042954e-06} {"step": 34260, "timestamp": 1778231566.071381, "train/loss": 2.2034810066223143, "train/z_loss": 0.0014391828211955727, "train/perplexity": 9.056484374575739, "train/grad_norm": 0.1298828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026460.843656306, "perf/iters_per_sec": 0.9662918298989801, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348840475082397, "data/tokens_consumed": 71850524672, "data/tokens_consumed_B": 71.850524672, "train/loss_slope": -9.047095302772925e-06} {"step": 34270, "timestamp": 1778231576.445314, "train/loss": 2.1945942401885987, "train/z_loss": 0.0014409483061172067, "train/perplexity": 8.976358072422485, "train/grad_norm": 0.203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022716.1829698067, "perf/iters_per_sec": 0.9645062365387949, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036799931526184, "data/tokens_consumed": 71871496192, "data/tokens_consumed_B": 71.871496192, "train/loss_slope": -7.266022348561287e-06} {"step": 34275, "timestamp": 1778231582.2366905, "eos/sharpness": 23.96900653839111, "eos/L0_probe": 2.0125153064727783, "eos/L_plus": 2.1448988914489746, "eos/L_minus": 2.119821786880493, "eos/grad_norm": 0.10893818736076355, "eos/embed_grad_frac": 0.196655735373497, "eos/time_s": 0.6143040657043457} {"step": 34275, "timestamp": 1778231583.616749, "geo/rankme_last": 439.5093688964844, "geo/layer_0/stable_rank_q_proj": 18.908235549926758, "geo/layer_0/stable_rank_k_proj": 16.253822326660156, "geo/layer_0/stable_rank_o_proj": 50.013999938964844, "geo/layer_0/stable_rank_gate_proj": 140.4513397216797, "geo/layer_0/stable_rank_down_proj": 52.647621154785156, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.054186686873435974, "geo/layer_0/attn_entropy_mean": 6.2068352699279785, "geo/layer_0/attn_entropy_std": 0.35874536633491516, "geo/layer_7/stable_rank_q_proj": 42.89900588989258, "geo/layer_7/stable_rank_k_proj": 42.34431076049805, "geo/layer_7/stable_rank_o_proj": 101.93277740478516, "geo/layer_7/stable_rank_gate_proj": 92.80238342285156, "geo/layer_7/stable_rank_down_proj": 145.6826629638672, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5163251757621765, "geo/layer_7/attn_entropy_mean": 4.665209770202637, "geo/layer_7/attn_entropy_std": 0.8328678607940674, "geo/layer_14/stable_rank_q_proj": 54.57321548461914, "geo/layer_14/stable_rank_k_proj": 36.35137176513672, "geo/layer_14/stable_rank_o_proj": 50.81520080566406, "geo/layer_14/stable_rank_gate_proj": 79.05081176757812, "geo/layer_14/stable_rank_down_proj": 133.8662872314453, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37487924098968506, "geo/layer_14/attn_entropy_mean": 5.495851516723633, "geo/layer_14/attn_entropy_std": 0.3794155716896057, "geo/layer_21/stable_rank_q_proj": 44.31856918334961, "geo/layer_21/stable_rank_k_proj": 31.231653213500977, "geo/layer_21/stable_rank_o_proj": 78.2152099609375, "geo/layer_21/stable_rank_gate_proj": 75.57300567626953, "geo/layer_21/stable_rank_down_proj": 56.983009338378906, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14902406930923462, "geo/layer_21/attn_entropy_mean": 5.753975868225098, "geo/layer_21/attn_entropy_std": 0.27898895740509033, "geo/layer_27/stable_rank_q_proj": 42.36711120605469, "geo/layer_27/stable_rank_k_proj": 31.456932067871094, "geo/layer_27/stable_rank_o_proj": 117.81035614013672, "geo/layer_27/stable_rank_gate_proj": 87.08578491210938, "geo/layer_27/stable_rank_down_proj": 133.18136596679688, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0854683443903923, "geo/layer_27/attn_entropy_mean": 4.3107171058654785, "geo/layer_27/attn_entropy_std": 0.6405898332595825, "attnres/final_alpha/block_0": 0.2417510449886322, "attnres/block_norm/0": 1.7022955417633057, "attnres/final_alpha/block_1": 0.00550062395632267, "attnres/block_norm/1": 37833.3359375, "attnres/final_alpha/block_2": 0.011847439222037792, "attnres/block_norm/2": 25243.935546875, "attnres/final_alpha/block_3": 0.013903355225920677, "attnres/block_norm/3": 43115.37890625, "attnres/final_alpha/block_4": 0.017459990456700325, "attnres/block_norm/4": 12038.435546875, "attnres/final_alpha/block_5": 0.5902887582778931, "attnres/block_norm/5": 5746.466796875, "attnres/final_alpha/block_6": 0.11924879252910614, "attnres/block_norm/6": 28157.9296875, "geo/tier1_time_s": 1.3579940795898438, "geo/step": 34275.0, "geo/rankme_slope": -0.0001300665969512805} {"step": 34280, "timestamp": 1778231588.8039563, "train/loss": 2.199726390838623, "train/z_loss": 0.0014495867188088595, "train/perplexity": 9.022544510843476, "train/grad_norm": 0.2578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1697927.0052005702, "perf/iters_per_sec": 0.8096346879961825, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.235124945640564, "data/tokens_consumed": 71892467712, "data/tokens_consumed_B": 71.892467712, "train/loss_slope": -8.872025362764996e-06} {"step": 34290, "timestamp": 1778231599.15292, "train/loss": 2.136746275424957, "train/z_loss": 0.0014402212342247366, "train/perplexity": 8.471827741641476, "train/grad_norm": 0.11328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027408.5922234813, "perf/iters_per_sec": 0.9667437516324431, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344002723693848, "data/tokens_consumed": 71913439232, "data/tokens_consumed_B": 71.913439232, "train/loss_slope": -1.1374221590593207e-05} {"step": 34300, "timestamp": 1778231609.491065, "grad/layer_0/attn": 0.0029042076785117388, "grad/layer_0/mlp": 0.0029411755967885256, "grad/layer_0/attn_mlp_ratio": 0.9874308704790563, "grad/layer_4/attn": 0.0018192385323345661, "grad/layer_4/mlp": 0.0025685408618301153, "grad/layer_4/attn_mlp_ratio": 0.7082770177191714, "grad/layer_8/attn": 0.0052941846661269665, "grad/layer_8/mlp": 0.003771289950236678, "grad/layer_8/attn_mlp_ratio": 1.4038126464960483, "grad/layer_12/attn": 0.004586379509419203, "grad/layer_12/mlp": 0.006620520260185003, "grad/layer_12/attn_mlp_ratio": 0.6927521191538246, "grad/layer_16/attn": 0.0038351097609847784, "grad/layer_16/mlp": 0.004720549564808607, "grad/layer_16/attn_mlp_ratio": 0.8124286435487109, "grad/layer_20/attn": 0.005933832377195358, "grad/layer_20/mlp": 0.007191261742264032, "grad/layer_20/attn_mlp_ratio": 0.8251448087067852, "grad/layer_24/attn": 0.021983599290251732, "grad/layer_24/mlp": 0.015921374782919884, "grad/layer_24/attn_mlp_ratio": 1.3807601072087858, "grad/layer_27/attn": 0.010057162493467331, "grad/layer_27/mlp": 0.01643131487071514, "grad/layer_27/attn_mlp_ratio": 0.612072893215899} {"step": 34300, "timestamp": 1778231609.5067694, "train/loss": 2.2002798080444337, "train/z_loss": 0.0014309572288766503, "train/perplexity": 9.027539124140961, "train/grad_norm": 0.314453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026529.941284934, "perf/iters_per_sec": 0.9663247782158537, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348487615585327, "data/tokens_consumed": 71934410752, "data/tokens_consumed_B": 71.934410752, "train/loss_slope": -9.7836428349561e-06} {"step": 34310, "timestamp": 1778231619.852799, "train/loss": 2.2357184886932373, "train/z_loss": 0.0014278493938036263, "train/perplexity": 9.353199606755341, "train/grad_norm": 0.13671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028114.783305859, "perf/iters_per_sec": 0.9670804897813124, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034040093421936, "data/tokens_consumed": 71955382272, "data/tokens_consumed_B": 71.955382272, "train/loss_slope": -3.3524245712229786e-06} {"step": 34320, "timestamp": 1778231630.208529, "train/loss": 2.185967946052551, "train/z_loss": 0.0014387381845153869, "train/perplexity": 8.899258387798623, "train/grad_norm": 0.125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026153.0429642869, "perf/iters_per_sec": 0.9661450590917048, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350412607192994, "data/tokens_consumed": 71976353792, "data/tokens_consumed_B": 71.976353792, "train/loss_slope": 6.91008617882976e-07} {"step": 34330, "timestamp": 1778231640.5587873, "train/loss": 2.184538149833679, "train/z_loss": 0.0014498691307380795, "train/perplexity": 8.886543353924791, "train/grad_norm": 0.287109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027225.5686140493, "perf/iters_per_sec": 0.9666564791746375, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034493660926819, "data/tokens_consumed": 71997325312, "data/tokens_consumed_B": 71.997325312, "train/loss_slope": 5.78392757297896e-07} {"step": 34340, "timestamp": 1778231650.9068162, "train/loss": 2.2172632694244383, "train/z_loss": 0.0014263421879149973, "train/perplexity": 9.182167330596078, "train/grad_norm": 0.140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027657.1778358023, "perf/iters_per_sec": 0.9668622864893924, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342734575271606, "data/tokens_consumed": 72018296832, "data/tokens_consumed_B": 72.018296832, "train/loss_slope": 4.95545133756082e-06} {"step": 34350, "timestamp": 1778231661.2487473, "grad/layer_0/attn": 0.003330003237351775, "grad/layer_0/mlp": 0.003115906147286296, "grad/layer_0/attn_mlp_ratio": 1.0687109858494428, "grad/layer_4/attn": 0.0020533259958028793, "grad/layer_4/mlp": 0.0025926304515451193, "grad/layer_4/attn_mlp_ratio": 0.7919855741031699, "grad/layer_8/attn": 0.00799062941223383, "grad/layer_8/mlp": 0.003813772229477763, "grad/layer_8/attn_mlp_ratio": 2.095203573236901, "grad/layer_12/attn": 0.004665717948228121, "grad/layer_12/mlp": 0.006414521485567093, "grad/layer_12/attn_mlp_ratio": 0.727368032984121, "grad/layer_16/attn": 0.0053083342500030994, "grad/layer_16/mlp": 0.004600306041538715, "grad/layer_16/attn_mlp_ratio": 1.1539089109899026, "grad/layer_20/attn": 0.004114147275686264, "grad/layer_20/mlp": 0.006592232268303633, "grad/layer_20/attn_mlp_ratio": 0.6240901481973928, "grad/layer_24/attn": 0.01130757387727499, "grad/layer_24/mlp": 0.013306114822626114, "grad/layer_24/attn_mlp_ratio": 0.8498028119422939, "grad/layer_27/attn": 0.0048053571954369545, "grad/layer_27/mlp": 0.012209860607981682, "grad/layer_27/attn_mlp_ratio": 0.39356363765031777} {"step": 34350, "timestamp": 1778231661.8614333, "eos/sharpness": 37.065243721008294, "eos/L0_probe": 2.006955146789551, "eos/L_plus": 2.2062814235687256, "eos/L_minus": 2.178281307220459, "eos/grad_norm": 0.17291879653930664, "eos/embed_grad_frac": 0.0943179801106453, "eos/time_s": 0.6099755764007568} {"step": 34350, "timestamp": 1778231661.8824537, "train/loss": 2.163539695739746, "train/z_loss": 0.0014423080254346132, "train/perplexity": 8.701885233918624, "train/grad_norm": 0.1728515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1911761.453773582, "perf/iters_per_sec": 0.9115988987796697, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.09697368144989, "data/tokens_consumed": 72039268352, "data/tokens_consumed_B": 72.039268352, "train/loss_slope": 6.002201441705588e-06} {"step": 34350, "timestamp": 1778231663.3084972, "geo/rankme_last": 438.59991455078125, "geo/layer_0/stable_rank_q_proj": 18.910198211669922, "geo/layer_0/stable_rank_k_proj": 16.263235092163086, "geo/layer_0/stable_rank_o_proj": 50.01765060424805, "geo/layer_0/stable_rank_gate_proj": 140.10565185546875, "geo/layer_0/stable_rank_down_proj": 52.63357162475586, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06334497779607773, "geo/layer_0/attn_entropy_mean": 6.204548358917236, "geo/layer_0/attn_entropy_std": 0.3639047145843506, "geo/layer_7/stable_rank_q_proj": 42.87440490722656, "geo/layer_7/stable_rank_k_proj": 42.301998138427734, "geo/layer_7/stable_rank_o_proj": 102.04368591308594, "geo/layer_7/stable_rank_gate_proj": 92.79988861083984, "geo/layer_7/stable_rank_down_proj": 145.84368896484375, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5340316295623779, "geo/layer_7/attn_entropy_mean": 4.661068916320801, "geo/layer_7/attn_entropy_std": 0.8336097598075867, "geo/layer_14/stable_rank_q_proj": 54.62934494018555, "geo/layer_14/stable_rank_k_proj": 36.40818405151367, "geo/layer_14/stable_rank_o_proj": 50.732181549072266, "geo/layer_14/stable_rank_gate_proj": 78.94685363769531, "geo/layer_14/stable_rank_down_proj": 133.83277893066406, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3881663978099823, "geo/layer_14/attn_entropy_mean": 5.489506721496582, "geo/layer_14/attn_entropy_std": 0.367147833108902, "geo/layer_21/stable_rank_q_proj": 44.360191345214844, "geo/layer_21/stable_rank_k_proj": 31.117053985595703, "geo/layer_21/stable_rank_o_proj": 77.96717071533203, "geo/layer_21/stable_rank_gate_proj": 75.6121597290039, "geo/layer_21/stable_rank_down_proj": 57.040164947509766, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14852406084537506, "geo/layer_21/attn_entropy_mean": 5.712279319763184, "geo/layer_21/attn_entropy_std": 0.29390230774879456, "geo/layer_27/stable_rank_q_proj": 42.28302001953125, "geo/layer_27/stable_rank_k_proj": 31.497285842895508, "geo/layer_27/stable_rank_o_proj": 117.71446990966797, "geo/layer_27/stable_rank_gate_proj": 87.171142578125, "geo/layer_27/stable_rank_down_proj": 133.29579162597656, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09322649985551834, "geo/layer_27/attn_entropy_mean": 4.293162822723389, "geo/layer_27/attn_entropy_std": 0.6461739540100098, "attnres/final_alpha/block_0": 0.24272382259368896, "attnres/block_norm/0": 1.7023003101348877, "attnres/final_alpha/block_1": 0.005603479687124491, "attnres/block_norm/1": 37835.328125, "attnres/final_alpha/block_2": 0.011871833354234695, "attnres/block_norm/2": 25243.75390625, "attnres/final_alpha/block_3": 0.013880844227969646, "attnres/block_norm/3": 42809.234375, "attnres/final_alpha/block_4": 0.017396599054336548, "attnres/block_norm/4": 12028.998046875, "attnres/final_alpha/block_5": 0.5876926779747009, "attnres/block_norm/5": 5860.74658203125, "attnres/final_alpha/block_6": 0.12083075195550919, "attnres/block_norm/6": 28478.56640625, "geo/tier1_time_s": 1.3603546619415283, "geo/step": 34350.0, "geo/rankme_slope": -0.00013981686424569827} {"step": 34360, "timestamp": 1778231673.6592042, "train/loss": 2.1018566608428957, "train/z_loss": 0.0014642191585153342, "train/perplexity": 8.181345804363259, "train/grad_norm": 0.1796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1790723.9935377452, "perf/iters_per_sec": 0.8538837402046896, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1711196184158326, "data/tokens_consumed": 72060239872, "data/tokens_consumed_B": 72.060239872, "train/loss_slope": 2.6668825201039576e-06} {"step": 34370, "timestamp": 1778231684.0154123, "train/loss": 2.1873027324676513, "train/z_loss": 0.001435778313316405, "train/perplexity": 8.911144928230659, "train/grad_norm": 0.150390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026150.0093009346, "perf/iters_per_sec": 0.966143612528293, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350428104400635, "data/tokens_consumed": 72081211392, "data/tokens_consumed_B": 72.081211392, "train/loss_slope": 3.556222733956282e-06} {"step": 34380, "timestamp": 1778231694.3701572, "train/loss": 2.1467220783233643, "train/z_loss": 0.0014470946392975748, "train/perplexity": 8.556763974561052, "train/grad_norm": 0.12060546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026406.4092483511, "perf/iters_per_sec": 0.9662658735505825, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034911847114563, "data/tokens_consumed": 72102182912, "data/tokens_consumed_B": 72.102182912, "train/loss_slope": 3.5585485776551557e-06} {"step": 34390, "timestamp": 1778231704.7162845, "train/loss": 2.2101672410964968, "train/z_loss": 0.0014242334058508278, "train/perplexity": 9.117241042933738, "train/grad_norm": 0.29296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028020.4215088428, "perf/iters_per_sec": 0.9670354945701803, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340882062911987, "data/tokens_consumed": 72123154432, "data/tokens_consumed_B": 72.123154432, "train/loss_slope": 7.437729964269159e-06} {"step": 34400, "timestamp": 1778231715.0566206, "grad/layer_0/attn": 0.0030038279946893454, "grad/layer_0/mlp": 0.00289786234498024, "grad/layer_0/attn_mlp_ratio": 1.0365667976727682, "grad/layer_4/attn": 0.0017257292056456208, "grad/layer_4/mlp": 0.002393855946138501, "grad/layer_4/attn_mlp_ratio": 0.7208993240965231, "grad/layer_8/attn": 0.007285045925527811, "grad/layer_8/mlp": 0.0035436900798231363, "grad/layer_8/attn_mlp_ratio": 2.0557795845153155, "grad/layer_12/attn": 0.0041101351380348206, "grad/layer_12/mlp": 0.006589769851416349, "grad/layer_12/attn_mlp_ratio": 0.6237145102692733, "grad/layer_16/attn": 0.006200368516147137, "grad/layer_16/mlp": 0.004284773021936417, "grad/layer_16/attn_mlp_ratio": 1.4470704374996186, "grad/layer_20/attn": 0.004888087045401335, "grad/layer_20/mlp": 0.00603579543530941, "grad/layer_20/attn_mlp_ratio": 0.8098496738012447, "grad/layer_24/attn": 0.005242592189460993, "grad/layer_24/mlp": 0.007608959451317787, "grad/layer_24/attn_mlp_ratio": 0.6890025047580959, "grad/layer_27/attn": 0.006635059602558613, "grad/layer_27/mlp": 0.006502537988126278, "grad/layer_27/attn_mlp_ratio": 1.0203799674275376} {"step": 34400, "timestamp": 1778231715.0722907, "train/loss": 2.1581737875938414, "train/z_loss": 0.00144035016419366, "train/perplexity": 8.655316769844552, "train/grad_norm": 0.10107421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026151.7828260988, "perf/iters_per_sec": 0.9661444582109923, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035041904449463, "data/tokens_consumed": 72144125952, "data/tokens_consumed_B": 72.144125952, "train/loss_slope": 5.360898562390407e-06} {"step": 34410, "timestamp": 1778231725.4200544, "train/loss": 2.1629735231399536, "train/z_loss": 0.0014556158101186156, "train/perplexity": 8.696959859370269, "train/grad_norm": 0.296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027769.3160177744, "perf/iters_per_sec": 0.9669157581414101, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342162609100343, "data/tokens_consumed": 72165097472, "data/tokens_consumed_B": 72.165097472, "train/loss_slope": 5.223937110431602e-06} {"step": 34420, "timestamp": 1778231735.769988, "train/loss": 2.198620009422302, "train/z_loss": 0.001443397137336433, "train/perplexity": 9.012567655390189, "train/grad_norm": 0.109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027578.141658997, "perf/iters_per_sec": 0.9668245991034493, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343137741088868, "data/tokens_consumed": 72186068992, "data/tokens_consumed_B": 72.186068992, "train/loss_slope": 6.117541247075104e-06} {"step": 34425, "timestamp": 1778231741.5425565, "eos/sharpness": 19.438338279724118, "eos/L0_probe": 2.0121567249298096, "eos/L_plus": 2.1201701164245605, "eos/L_minus": 2.0985267162323, "eos/grad_norm": 0.11879327893257141, "eos/embed_grad_frac": 0.18301324546337128, "eos/time_s": 0.6061511039733887} {"step": 34425, "timestamp": 1778231742.9177396, "geo/rankme_last": 440.0214538574219, "geo/layer_0/stable_rank_q_proj": 18.899003982543945, "geo/layer_0/stable_rank_k_proj": 16.247594833374023, "geo/layer_0/stable_rank_o_proj": 50.00871658325195, "geo/layer_0/stable_rank_gate_proj": 140.43563842773438, "geo/layer_0/stable_rank_down_proj": 52.695926666259766, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.055993154644966125, "geo/layer_0/attn_entropy_mean": 6.2020697593688965, "geo/layer_0/attn_entropy_std": 0.36101195216178894, "geo/layer_7/stable_rank_q_proj": 42.92498779296875, "geo/layer_7/stable_rank_k_proj": 42.288368225097656, "geo/layer_7/stable_rank_o_proj": 102.01319122314453, "geo/layer_7/stable_rank_gate_proj": 92.73645782470703, "geo/layer_7/stable_rank_down_proj": 146.05374145507812, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5140697956085205, "geo/layer_7/attn_entropy_mean": 4.635505676269531, "geo/layer_7/attn_entropy_std": 0.8316506147384644, "geo/layer_14/stable_rank_q_proj": 54.54420852661133, "geo/layer_14/stable_rank_k_proj": 36.39314270019531, "geo/layer_14/stable_rank_o_proj": 50.61269760131836, "geo/layer_14/stable_rank_gate_proj": 79.08087921142578, "geo/layer_14/stable_rank_down_proj": 133.64659118652344, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3808184564113617, "geo/layer_14/attn_entropy_mean": 5.491300106048584, "geo/layer_14/attn_entropy_std": 0.384281188249588, "geo/layer_21/stable_rank_q_proj": 44.356327056884766, "geo/layer_21/stable_rank_k_proj": 31.225637435913086, "geo/layer_21/stable_rank_o_proj": 77.85022735595703, "geo/layer_21/stable_rank_gate_proj": 75.6174087524414, "geo/layer_21/stable_rank_down_proj": 57.033348083496094, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14785730838775635, "geo/layer_21/attn_entropy_mean": 5.728603363037109, "geo/layer_21/attn_entropy_std": 0.28601470589637756, "geo/layer_27/stable_rank_q_proj": 42.38825988769531, "geo/layer_27/stable_rank_k_proj": 31.484323501586914, "geo/layer_27/stable_rank_o_proj": 117.71774291992188, "geo/layer_27/stable_rank_gate_proj": 87.18575286865234, "geo/layer_27/stable_rank_down_proj": 133.1783447265625, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08686268329620361, "geo/layer_27/attn_entropy_mean": 4.308165550231934, "geo/layer_27/attn_entropy_std": 0.6401512622833252, "attnres/final_alpha/block_0": 0.24078746140003204, "attnres/block_norm/0": 1.702745795249939, "attnres/final_alpha/block_1": 0.0054919179528951645, "attnres/block_norm/1": 37617.3125, "attnres/final_alpha/block_2": 0.011786472983658314, "attnres/block_norm/2": 25304.50390625, "attnres/final_alpha/block_3": 0.013766255229711533, "attnres/block_norm/3": 42957.65625, "attnres/final_alpha/block_4": 0.017417144030332565, "attnres/block_norm/4": 12031.7158203125, "attnres/final_alpha/block_5": 0.5919453501701355, "attnres/block_norm/5": 5740.99267578125, "attnres/final_alpha/block_6": 0.11880543828010559, "attnres/block_norm/6": 28148.59375, "geo/tier1_time_s": 1.355821132659912, "geo/step": 34425.0, "geo/rankme_slope": -0.00010306784823304322} {"step": 34430, "timestamp": 1778231748.108861, "train/loss": 2.214632177352905, "train/z_loss": 0.0014271218446083368, "train/perplexity": 9.158039957521828, "train/grad_norm": 0.1611328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1700375.8073428858, "perf/iters_per_sec": 0.810802367850726, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.233346176147461, "data/tokens_consumed": 72207040512, "data/tokens_consumed_B": 72.207040512, "train/loss_slope": 5.927816340059863e-06} {"step": 34440, "timestamp": 1778231758.459324, "train/loss": 2.1809090971946716, "train/z_loss": 0.0014388495008461177, "train/perplexity": 8.854352067580896, "train/grad_norm": 0.12158203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027056.8722649654, "perf/iters_per_sec": 0.9665760384869411, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345797538757324, "data/tokens_consumed": 72228012032, "data/tokens_consumed_B": 72.228012032, "train/loss_slope": 4.303566207050194e-06} {"step": 34450, "timestamp": 1778231768.7926123, "grad/layer_0/attn": 0.0026098170783370733, "grad/layer_0/mlp": 0.0027112322859466076, "grad/layer_0/attn_mlp_ratio": 0.96259438765367, "grad/layer_4/attn": 0.002183408010751009, "grad/layer_4/mlp": 0.002404690720140934, "grad/layer_4/attn_mlp_ratio": 0.9079786858516156, "grad/layer_8/attn": 0.00763088371604681, "grad/layer_8/mlp": 0.0035683345049619675, "grad/layer_8/attn_mlp_ratio": 2.1385000457736343, "grad/layer_12/attn": 0.003825834719464183, "grad/layer_12/mlp": 0.006386830471456051, "grad/layer_12/attn_mlp_ratio": 0.59901929081422, "grad/layer_16/attn": 0.005526970140635967, "grad/layer_16/mlp": 0.004349174909293652, "grad/layer_16/attn_mlp_ratio": 1.2708088611806867, "grad/layer_20/attn": 0.005008121021091938, "grad/layer_20/mlp": 0.005764165427535772, "grad/layer_20/attn_mlp_ratio": 0.8688371277972218, "grad/layer_24/attn": 0.006152214948087931, "grad/layer_24/mlp": 0.008204867132008076, "grad/layer_24/attn_mlp_ratio": 0.749825045807868, "grad/layer_27/attn": 0.004911268129944801, "grad/layer_27/mlp": 0.006900763139128685, "grad/layer_27/attn_mlp_ratio": 0.7116992656836776} {"step": 34450, "timestamp": 1778231768.8081324, "train/loss": 2.152622842788696, "train/z_loss": 0.0014473290997557343, "train/perplexity": 8.607404685877043, "train/grad_norm": 0.1025390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027542.6685245056, "perf/iters_per_sec": 0.9668076841948059, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343318700790405, "data/tokens_consumed": 72248983552, "data/tokens_consumed_B": 72.248983552, "train/loss_slope": 2.677082986828755e-06} {"step": 34460, "timestamp": 1778231779.1567261, "train/loss": 2.113759469985962, "train/z_loss": 0.0014450007351115347, "train/perplexity": 8.279308662018426, "train/grad_norm": 0.1298828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027539.0698700645, "perf/iters_per_sec": 0.9668059682226489, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343337059020996, "data/tokens_consumed": 72269955072, "data/tokens_consumed_B": 72.269955072, "train/loss_slope": -4.598239653467496e-06} {"step": 34470, "timestamp": 1778231789.5099776, "train/loss": 2.2494919061660767, "train/z_loss": 0.001431993255391717, "train/perplexity": 9.48291640074862, "train/grad_norm": 0.275390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026697.4759077043, "perf/iters_per_sec": 0.9664046649492761, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034763216972351, "data/tokens_consumed": 72290926592, "data/tokens_consumed_B": 72.290926592, "train/loss_slope": -4.471124988971689e-07} {"step": 34480, "timestamp": 1778231799.8606074, "train/loss": 2.1994419574737547, "train/z_loss": 0.001432449440471828, "train/perplexity": 9.01997856308646, "train/grad_norm": 0.1162109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027302.4746883153, "perf/iters_per_sec": 0.966693150848539, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344544172286987, "data/tokens_consumed": 72311898112, "data/tokens_consumed_B": 72.311898112, "train/loss_slope": 8.028769113979165e-07} {"step": 34490, "timestamp": 1778231810.2137797, "train/loss": 2.216944456100464, "train/z_loss": 0.001428000198211521, "train/perplexity": 9.179240399905165, "train/grad_norm": 0.095703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026684.6810382036, "perf/iters_per_sec": 0.9663985638800638, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347697496414185, "data/tokens_consumed": 72332869632, "data/tokens_consumed_B": 72.332869632, "train/loss_slope": 1.5493638802318244e-06} {"step": 34500, "timestamp": 1778231820.5743294, "grad/layer_0/attn": 0.0029200913850218058, "grad/layer_0/mlp": 0.0027625516522675753, "grad/layer_0/attn_mlp_ratio": 1.0570268530263434, "grad/layer_4/attn": 0.0017855096375569701, "grad/layer_4/mlp": 0.0025208117440342903, "grad/layer_4/attn_mlp_ratio": 0.7083073819184587, "grad/layer_8/attn": 0.00680858688428998, "grad/layer_8/mlp": 0.003833569586277008, "grad/layer_8/attn_mlp_ratio": 1.7760435942152328, "grad/layer_12/attn": 0.0051061129197478294, "grad/layer_12/mlp": 0.006925283931195736, "grad/layer_12/attn_mlp_ratio": 0.7373145847515796, "grad/layer_16/attn": 0.00541701028123498, "grad/layer_16/mlp": 0.00457586906850338, "grad/layer_16/attn_mlp_ratio": 1.183821058198373, "grad/layer_20/attn": 0.0040067038498818874, "grad/layer_20/mlp": 0.006019498221576214, "grad/layer_20/attn_mlp_ratio": 0.665620893276157, "grad/layer_24/attn": 0.00737658329308033, "grad/layer_24/mlp": 0.009436458349227905, "grad/layer_24/attn_mlp_ratio": 0.7817109917634285, "grad/layer_27/attn": 0.004502520896494389, "grad/layer_27/mlp": 0.00913753267377615, "grad/layer_27/attn_mlp_ratio": 0.49275017753328293} {"step": 34500, "timestamp": 1778231821.186296, "eos/sharpness": 26.758074760437008, "eos/L0_probe": 2.0112266540527344, "eos/L_plus": 2.1574854850769043, "eos/L_minus": 2.1325485706329346, "eos/grad_norm": 0.11977369338274002, "eos/embed_grad_frac": 0.1780695915222168, "eos/time_s": 0.6091611385345459} {"step": 34500, "timestamp": 1778231821.2044764, "train/loss": 2.2333884954452516, "train/z_loss": 0.001433107873890549, "train/perplexity": 9.331432083763088, "train/grad_norm": 0.11962890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1909031.4511576127, "perf/iters_per_sec": 0.9102971320903839, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0985424041748046, "data/tokens_consumed": 72353841152, "data/tokens_consumed_B": 72.353841152, "train/loss_slope": 2.201439130424256e-06} {"step": 34500, "timestamp": 1778231822.566626, "geo/rankme_last": 439.8360290527344, "geo/layer_0/stable_rank_q_proj": 18.92586898803711, "geo/layer_0/stable_rank_k_proj": 16.260982513427734, "geo/layer_0/stable_rank_o_proj": 49.93556213378906, "geo/layer_0/stable_rank_gate_proj": 140.3858642578125, "geo/layer_0/stable_rank_down_proj": 52.70433807373047, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0597727969288826, "geo/layer_0/attn_entropy_mean": 6.203192710876465, "geo/layer_0/attn_entropy_std": 0.35704073309898376, "geo/layer_7/stable_rank_q_proj": 42.9279670715332, "geo/layer_7/stable_rank_k_proj": 42.26051330566406, "geo/layer_7/stable_rank_o_proj": 101.55387115478516, "geo/layer_7/stable_rank_gate_proj": 92.67706298828125, "geo/layer_7/stable_rank_down_proj": 145.953857421875, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5121886134147644, "geo/layer_7/attn_entropy_mean": 4.651385307312012, "geo/layer_7/attn_entropy_std": 0.8317728042602539, "geo/layer_14/stable_rank_q_proj": 54.52141189575195, "geo/layer_14/stable_rank_k_proj": 36.41132354736328, "geo/layer_14/stable_rank_o_proj": 50.69522476196289, "geo/layer_14/stable_rank_gate_proj": 78.9693374633789, "geo/layer_14/stable_rank_down_proj": 133.6287078857422, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3788999021053314, "geo/layer_14/attn_entropy_mean": 5.490828037261963, "geo/layer_14/attn_entropy_std": 0.3869883120059967, "geo/layer_21/stable_rank_q_proj": 44.37376022338867, "geo/layer_21/stable_rank_k_proj": 31.273937225341797, "geo/layer_21/stable_rank_o_proj": 77.84927368164062, "geo/layer_21/stable_rank_gate_proj": 75.61502838134766, "geo/layer_21/stable_rank_down_proj": 56.9570198059082, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14419052004814148, "geo/layer_21/attn_entropy_mean": 5.719571113586426, "geo/layer_21/attn_entropy_std": 0.27721136808395386, "geo/layer_27/stable_rank_q_proj": 42.256343841552734, "geo/layer_27/stable_rank_k_proj": 31.442594528198242, "geo/layer_27/stable_rank_o_proj": 117.63377380371094, "geo/layer_27/stable_rank_gate_proj": 87.19437408447266, "geo/layer_27/stable_rank_down_proj": 133.04383850097656, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08310220390558243, "geo/layer_27/attn_entropy_mean": 4.299449443817139, "geo/layer_27/attn_entropy_std": 0.6467477679252625, "attnres/final_alpha/block_0": 0.2409771978855133, "attnres/block_norm/0": 1.7029869556427002, "attnres/final_alpha/block_1": 0.00550577649846673, "attnres/block_norm/1": 37805.4296875, "attnres/final_alpha/block_2": 0.011805248446762562, "attnres/block_norm/2": 25326.34375, "attnres/final_alpha/block_3": 0.013797616586089134, "attnres/block_norm/3": 43015.62890625, "attnres/final_alpha/block_4": 0.0172316562384367, "attnres/block_norm/4": 12055.248046875, "attnres/final_alpha/block_5": 0.5900311470031738, "attnres/block_norm/5": 5781.2802734375, "attnres/final_alpha/block_6": 0.12065131962299347, "attnres/block_norm/6": 28266.7578125, "geo/tier1_time_s": 1.3585214614868164, "geo/step": 34500.0, "geo/rankme_slope": -0.00010845795349389756} {"step": 34500, "timestamp": 1778231829.5048065, "geo/ww_alpha_mean": 7.565862052552359, "geo/ww_alpha_std": 4.171249310520511, "geo/ww_alpha_min": 2.646988412580324, "geo/ww_alpha_max": 26.33321083164988, "geo/ww_alpha_healthy_frac": 0.17766497461928935, "geo/ww_alpha_by_type/q_proj": 4.06509518723218, "geo/ww_alpha_by_type/k_proj": 4.865013657166395, "geo/ww_alpha_by_type/v_proj": 7.884450798713146, "geo/ww_alpha_by_type/o_proj": 7.449353148621533, "geo/ww_alpha_by_type/gate_proj": 8.328936663045202, "geo/ww_alpha_by_type/up_proj": 11.59981828739522, "geo/ww_alpha_by_type/down_proj": 8.876836187288582, "geo/twonn_id/layer_0": 0.6795082688331604, "geo/twonn_id/layer_7": 3.4629688262939453, "geo/twonn_id/layer_14": 4.452800750732422, "geo/twonn_id/layer_21": 6.866807460784912, "geo/twonn_id/layer_27": 6.192887306213379, "geo/tier2_time_s": 6.930584669113159} {"step": 34500, "timestamp": 1778231830.1881804, "eoc/jacobian_sigma/layer_0/attn": 1185.59326171875, "eoc/jacobian_sigma/layer_0/mlp": 7947.05224609375, "eoc/jacobian_sigma/layer_0": 7947.05224609375, "eoc/jacobian_sigma/layer_7/attn": 1.1707959175109863, "eoc/jacobian_sigma/layer_7/mlp": 1.626129388809204, "eoc/jacobian_sigma/layer_7": 1.626129388809204, "eoc/jacobian_sigma/layer_14/attn": 1.6362303495407104, "eoc/jacobian_sigma/layer_14/mlp": 9.057618141174316, "eoc/jacobian_sigma/layer_14": 9.057618141174316, "eoc/jacobian_sigma/layer_21/attn": 1.091959834098816, "eoc/jacobian_sigma/layer_21/mlp": 4.419153690338135, "eoc/jacobian_sigma/layer_21": 4.419153690338135, "eoc/jacobian_sigma/layer_27/attn": 3.334049701690674, "eoc/jacobian_sigma/layer_27/mlp": 20.474498748779297, "eoc/jacobian_sigma/layer_27": 20.474498748779297, "eoc/layer0_sigma": 7947.05224609375, "eoc/sigma_max": 20.474498748779297, "eoc/sigma_min": 1.626129388809204, "eoc/sigma_mean": 8.894349992275238, "eoc/time_s": 0.6767668724060059} {"step": 34510, "timestamp": 1778231840.55907, "train/loss": 2.1490135312080385, "train/z_loss": 0.0014421574771404267, "train/perplexity": 8.576393877964108, "train/grad_norm": 0.1513671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1083735.153282096, "perf/iters_per_sec": 0.5167651907358627, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.935114860534668, "data/tokens_consumed": 72374812672, "data/tokens_consumed_B": 72.374812672, "train/loss_slope": 9.828668580626014e-07} {"step": 34520, "timestamp": 1778231850.9343455, "train/loss": 2.189062547683716, "train/z_loss": 0.0014428035588935017, "train/perplexity": 8.926840703449194, "train/grad_norm": 0.09326171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022487.222368049, "perf/iters_per_sec": 0.9643970596161122, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0369173049926759, "data/tokens_consumed": 72395784192, "data/tokens_consumed_B": 72.395784192, "train/loss_slope": 2.918785747879441e-06} {"step": 34530, "timestamp": 1778231861.3079574, "train/loss": 2.1431219577789307, "train/z_loss": 0.0014467121683992446, "train/perplexity": 8.526013977842045, "train/grad_norm": 0.1455078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022774.0940772255, "perf/iters_per_sec": 0.9645338507066848, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0367702484130858, "data/tokens_consumed": 72416755712, "data/tokens_consumed_B": 72.416755712, "train/loss_slope": 7.004994656494493e-08} {"step": 34540, "timestamp": 1778231871.681791, "train/loss": 2.166505241394043, "train/z_loss": 0.0014481042744591832, "train/perplexity": 8.72772937390634, "train/grad_norm": 0.20703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023162.85879549, "perf/iters_per_sec": 0.9647192281701518, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0365710258483887, "data/tokens_consumed": 72437727232, "data/tokens_consumed_B": 72.437727232, "train/loss_slope": -5.011685875275453e-06} {"step": 34550, "timestamp": 1778231882.025318, "grad/layer_0/attn": 0.0027903823647648096, "grad/layer_0/mlp": 0.0028228459414094687, "grad/layer_0/attn_mlp_ratio": 0.9884996644632983, "grad/layer_4/attn": 0.0039266301319003105, "grad/layer_4/mlp": 0.002415744122117758, "grad/layer_4/attn_mlp_ratio": 1.625432898048303, "grad/layer_8/attn": 0.007255956996232271, "grad/layer_8/mlp": 0.003675475949421525, "grad/layer_8/attn_mlp_ratio": 1.974154340462719, "grad/layer_12/attn": 0.004527553450316191, "grad/layer_12/mlp": 0.006696776952594519, "grad/layer_12/attn_mlp_ratio": 0.6760794655037964, "grad/layer_16/attn": 0.003537377342581749, "grad/layer_16/mlp": 0.004554594401270151, "grad/layer_16/attn_mlp_ratio": 0.7766613123506981, "grad/layer_20/attn": 0.007307067513465881, "grad/layer_20/mlp": 0.0062612565234303474, "grad/layer_20/attn_mlp_ratio": 1.1670289133529503, "grad/layer_24/attn": 0.025820063427090645, "grad/layer_24/mlp": 0.011340582743287086, "grad/layer_24/attn_mlp_ratio": 2.2767845166242493, "grad/layer_27/attn": 0.009934643283486366, "grad/layer_27/mlp": 0.010173698887228966, "grad/layer_27/attn_mlp_ratio": 0.9765025774752442} {"step": 34550, "timestamp": 1778231882.0414615, "train/loss": 2.200528693199158, "train/z_loss": 0.0014462731895036995, "train/perplexity": 9.02978622423598, "train/grad_norm": 0.20703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025438.3296490824, "perf/iters_per_sec": 0.9658042572255527, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035406494140625, "data/tokens_consumed": 72458698752, "data/tokens_consumed_B": 72.458698752, "train/loss_slope": -1.3506366439027303e-06} {"step": 34560, "timestamp": 1778231892.8066301, "train/loss": 2.1572497844696046, "train/z_loss": 0.0014454158255830408, "train/perplexity": 8.647322923846046, "train/grad_norm": 0.1484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1949157.75098736, "perf/iters_per_sec": 0.9294308428704071, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0759272813796996, "data/tokens_consumed": 72479670272, "data/tokens_consumed_B": 72.479670272, "train/loss_slope": -2.217399569699399e-06} {"step": 34570, "timestamp": 1778231903.159661, "train/loss": 2.1919626951217652, "train/z_loss": 0.0014357421547174453, "train/perplexity": 8.952767435145265, "train/grad_norm": 0.16015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026651.4338114646, "perf/iters_per_sec": 0.9663827103669475, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347867250442504, "data/tokens_consumed": 72500641792, "data/tokens_consumed_B": 72.500641792, "train/loss_slope": -1.3830834096020907e-06} {"step": 34575, "timestamp": 1778231909.0477607, "eos/sharpness": 76.72660350799559, "eos/L0_probe": 2.0105140209198, "eos/L_plus": 2.3066725730895996, "eos/L_minus": 2.481621503829956, "eos/grad_norm": 0.21652594208717346, "eos/embed_grad_frac": 0.046850092709064484, "eos/time_s": 0.7225675582885742} {"step": 34575, "timestamp": 1778231910.4297285, "geo/rankme_last": 440.3744812011719, "geo/layer_0/stable_rank_q_proj": 18.933773040771484, "geo/layer_0/stable_rank_k_proj": 16.23397445678711, "geo/layer_0/stable_rank_o_proj": 49.88002014160156, "geo/layer_0/stable_rank_gate_proj": 140.635498046875, "geo/layer_0/stable_rank_down_proj": 52.68573760986328, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05773165822029114, "geo/layer_0/attn_entropy_mean": 6.206480979919434, "geo/layer_0/attn_entropy_std": 0.3624016344547272, "geo/layer_7/stable_rank_q_proj": 42.97579574584961, "geo/layer_7/stable_rank_k_proj": 42.3433837890625, "geo/layer_7/stable_rank_o_proj": 101.62858581542969, "geo/layer_7/stable_rank_gate_proj": 92.4893569946289, "geo/layer_7/stable_rank_down_proj": 145.65274047851562, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5086371898651123, "geo/layer_7/attn_entropy_mean": 4.651218414306641, "geo/layer_7/attn_entropy_std": 0.8146876096725464, "geo/layer_14/stable_rank_q_proj": 54.515621185302734, "geo/layer_14/stable_rank_k_proj": 36.40225601196289, "geo/layer_14/stable_rank_o_proj": 50.66526412963867, "geo/layer_14/stable_rank_gate_proj": 78.96180725097656, "geo/layer_14/stable_rank_down_proj": 133.7397003173828, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3921874165534973, "geo/layer_14/attn_entropy_mean": 5.5274176597595215, "geo/layer_14/attn_entropy_std": 0.3856264054775238, "geo/layer_21/stable_rank_q_proj": 44.41907501220703, "geo/layer_21/stable_rank_k_proj": 31.336437225341797, "geo/layer_21/stable_rank_o_proj": 77.79985809326172, "geo/layer_21/stable_rank_gate_proj": 75.69583892822266, "geo/layer_21/stable_rank_down_proj": 56.88082504272461, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14271581172943115, "geo/layer_21/attn_entropy_mean": 5.730347633361816, "geo/layer_21/attn_entropy_std": 0.295498788356781, "geo/layer_27/stable_rank_q_proj": 42.16606140136719, "geo/layer_27/stable_rank_k_proj": 31.393354415893555, "geo/layer_27/stable_rank_o_proj": 117.64906311035156, "geo/layer_27/stable_rank_gate_proj": 87.16693115234375, "geo/layer_27/stable_rank_down_proj": 132.85476684570312, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08295605331659317, "geo/layer_27/attn_entropy_mean": 4.302870750427246, "geo/layer_27/attn_entropy_std": 0.6430118083953857, "attnres/final_alpha/block_0": 0.24296316504478455, "attnres/block_norm/0": 1.7033493518829346, "attnres/final_alpha/block_1": 0.0054848771542310715, "attnres/block_norm/1": 37781.41015625, "attnres/final_alpha/block_2": 0.01201704889535904, "attnres/block_norm/2": 25347.4765625, "attnres/final_alpha/block_3": 0.014128414914011955, "attnres/block_norm/3": 43167.1171875, "attnres/final_alpha/block_4": 0.017453771084547043, "attnres/block_norm/4": 12098.830078125, "attnres/final_alpha/block_5": 0.5857816338539124, "attnres/block_norm/5": 5835.8408203125, "attnres/final_alpha/block_6": 0.12217110395431519, "attnres/block_norm/6": 28153.025390625, "geo/tier1_time_s": 1.3602263927459717, "geo/step": 34575.0, "geo/rankme_slope": -6.45394681310024e-05} {"step": 34580, "timestamp": 1778231915.6102395, "train/loss": 2.157277798652649, "train/z_loss": 0.0014448442845605314, "train/perplexity": 8.647565174926497, "train/grad_norm": 0.12890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1685053.695807615, "perf/iters_per_sec": 0.8034962157285762, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2445609331130982, "data/tokens_consumed": 72521613312, "data/tokens_consumed_B": 72.521613312, "train/loss_slope": -4.122409239710873e-06} {"step": 34590, "timestamp": 1778231925.962177, "train/loss": 2.2255016803741454, "train/z_loss": 0.0014278927352279424, "train/perplexity": 9.258126259289932, "train/grad_norm": 0.1103515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026954.4815286854, "perf/iters_per_sec": 0.966527214779227, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346320152282715, "data/tokens_consumed": 72542584832, "data/tokens_consumed_B": 72.542584832, "train/loss_slope": -1.7967470336739843e-06} {"step": 34600, "timestamp": 1778231936.3051581, "grad/layer_0/attn": 0.00265708239749074, "grad/layer_0/mlp": 0.002788941841572523, "grad/layer_0/attn_mlp_ratio": 0.9527205847793894, "grad/layer_4/attn": 0.0018859035335481167, "grad/layer_4/mlp": 0.0024542121682316065, "grad/layer_4/attn_mlp_ratio": 0.7684353786182533, "grad/layer_8/attn": 0.004732668865472078, "grad/layer_8/mlp": 0.0037314367946237326, "grad/layer_8/attn_mlp_ratio": 1.2683234365536034, "grad/layer_12/attn": 0.004937019199132919, "grad/layer_12/mlp": 0.007002809550613165, "grad/layer_12/attn_mlp_ratio": 0.7050054828636725, "grad/layer_16/attn": 0.004375105258077383, "grad/layer_16/mlp": 0.004631274379789829, "grad/layer_16/attn_mlp_ratio": 0.9446871000994804, "grad/layer_20/attn": 0.004036467522382736, "grad/layer_20/mlp": 0.006607511080801487, "grad/layer_20/attn_mlp_ratio": 0.6108907593090316, "grad/layer_24/attn": 0.016900476068258286, "grad/layer_24/mlp": 0.01249481737613678, "grad/layer_24/attn_mlp_ratio": 1.3525988755366496, "grad/layer_27/attn": 0.005982244852930307, "grad/layer_27/mlp": 0.011717927642166615, "grad/layer_27/attn_mlp_ratio": 0.5105207153141402} {"step": 34600, "timestamp": 1778231936.321329, "train/loss": 2.2199817657470704, "train/z_loss": 0.0014343383838422597, "train/perplexity": 9.20716297861309, "train/grad_norm": 0.19921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025625.9286514665, "perf/iters_per_sec": 0.9658937114007313, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0353106021881104, "data/tokens_consumed": 72563556352, "data/tokens_consumed_B": 72.563556352, "train/loss_slope": -3.7300777478222165e-06} {"step": 34610, "timestamp": 1778231946.6782424, "train/loss": 2.159437108039856, "train/z_loss": 0.0014498758013360203, "train/perplexity": 8.66625811824602, "train/grad_norm": 0.181640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026036.743480272, "perf/iters_per_sec": 0.9660896031762466, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351006746292115, "data/tokens_consumed": 72584527872, "data/tokens_consumed_B": 72.584527872, "train/loss_slope": -6.242100791175762e-06} {"step": 34620, "timestamp": 1778231957.0309017, "train/loss": 2.1604610681533813, "train/z_loss": 0.0014426945243030786, "train/perplexity": 8.675136565704905, "train/grad_norm": 0.1611328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026853.6890738856, "perf/iters_per_sec": 0.9664791531915119, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346834659576416, "data/tokens_consumed": 72605499392, "data/tokens_consumed_B": 72.605499392, "train/loss_slope": -9.482769768695166e-06} {"step": 34630, "timestamp": 1778231967.3847826, "train/loss": 2.206342077255249, "train/z_loss": 0.001432729884982109, "train/perplexity": 9.082432718379186, "train/grad_norm": 0.125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026500.0139632577, "perf/iters_per_sec": 0.9663105077568329, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348640441894532, "data/tokens_consumed": 72626470912, "data/tokens_consumed_B": 72.626470912, "train/loss_slope": -5.583556600422455e-06} {"step": 34640, "timestamp": 1778231977.733758, "train/loss": 2.24632842540741, "train/z_loss": 0.00142535837367177, "train/perplexity": 9.452964777848848, "train/grad_norm": 0.166015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027407.7978193334, "perf/iters_per_sec": 0.9667433728310267, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344006776809693, "data/tokens_consumed": 72647442432, "data/tokens_consumed_B": 72.647442432, "train/loss_slope": -2.3855972533250186e-06} {"step": 34650, "timestamp": 1778231988.0777574, "grad/layer_0/attn": 0.0027513604145497084, "grad/layer_0/mlp": 0.002824518596753478, "grad/layer_0/attn_mlp_ratio": 0.974098850084491, "grad/layer_4/attn": 0.002675859723240137, "grad/layer_4/mlp": 0.0024392257910221815, "grad/layer_4/attn_mlp_ratio": 1.0970118565438776, "grad/layer_8/attn": 0.0064705912955105305, "grad/layer_8/mlp": 0.003711441997438669, "grad/layer_8/attn_mlp_ratio": 1.7434170130192785, "grad/layer_12/attn": 0.007603348698467016, "grad/layer_12/mlp": 0.006624107714742422, "grad/layer_12/attn_mlp_ratio": 1.1478298528815043, "grad/layer_16/attn": 0.003642068011686206, "grad/layer_16/mlp": 0.00426795519888401, "grad/layer_16/attn_mlp_ratio": 0.8533519581703535, "grad/layer_20/attn": 0.006405610125511885, "grad/layer_20/mlp": 0.005825694650411606, "grad/layer_20/attn_mlp_ratio": 1.099544414863018, "grad/layer_24/attn": 0.008603470399975777, "grad/layer_24/mlp": 0.010357295162975788, "grad/layer_24/attn_mlp_ratio": 0.8306676773742845, "grad/layer_27/attn": 0.00546153774484992, "grad/layer_27/mlp": 0.009122708812355995, "grad/layer_27/attn_mlp_ratio": 0.5986749985470539} {"step": 34650, "timestamp": 1778231988.6905363, "eos/sharpness": 21.407985687255856, "eos/L0_probe": 2.008612632751465, "eos/L_plus": 2.1137824058532715, "eos/L_minus": 2.117522716522217, "eos/grad_norm": 0.10429587215185165, "eos/embed_grad_frac": 0.2584723234176636, "eos/time_s": 0.6099810600280762} {"step": 34650, "timestamp": 1778231988.7104666, "train/loss": 2.201736330986023, "train/z_loss": 0.001435508008580655, "train/perplexity": 9.040697522409594, "train/grad_norm": 0.10400390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1911751.4816749713, "perf/iters_per_sec": 0.9115941437125069, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0969794034957885, "data/tokens_consumed": 72668413952, "data/tokens_consumed_B": 72.668413952, "train/loss_slope": -2.53987305878947e-06} {"step": 34650, "timestamp": 1778231990.0738888, "geo/rankme_last": 439.6535949707031, "geo/layer_0/stable_rank_q_proj": 18.9365177154541, "geo/layer_0/stable_rank_k_proj": 16.230899810791016, "geo/layer_0/stable_rank_o_proj": 50.004154205322266, "geo/layer_0/stable_rank_gate_proj": 140.77818298339844, "geo/layer_0/stable_rank_down_proj": 52.645782470703125, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06058604270219803, "geo/layer_0/attn_entropy_mean": 6.207452774047852, "geo/layer_0/attn_entropy_std": 0.36247140169143677, "geo/layer_7/stable_rank_q_proj": 42.96982955932617, "geo/layer_7/stable_rank_k_proj": 42.37236785888672, "geo/layer_7/stable_rank_o_proj": 101.7789306640625, "geo/layer_7/stable_rank_gate_proj": 92.56037139892578, "geo/layer_7/stable_rank_down_proj": 145.9169464111328, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5305365920066833, "geo/layer_7/attn_entropy_mean": 4.634557247161865, "geo/layer_7/attn_entropy_std": 0.8178699612617493, "geo/layer_14/stable_rank_q_proj": 54.480918884277344, "geo/layer_14/stable_rank_k_proj": 36.45060348510742, "geo/layer_14/stable_rank_o_proj": 50.617000579833984, "geo/layer_14/stable_rank_gate_proj": 78.99459838867188, "geo/layer_14/stable_rank_down_proj": 133.94761657714844, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3880279064178467, "geo/layer_14/attn_entropy_mean": 5.519263744354248, "geo/layer_14/attn_entropy_std": 0.38691428303718567, "geo/layer_21/stable_rank_q_proj": 44.35478210449219, "geo/layer_21/stable_rank_k_proj": 31.208120346069336, "geo/layer_21/stable_rank_o_proj": 77.79963684082031, "geo/layer_21/stable_rank_gate_proj": 75.54376220703125, "geo/layer_21/stable_rank_down_proj": 56.88502883911133, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14304012060165405, "geo/layer_21/attn_entropy_mean": 5.723715782165527, "geo/layer_21/attn_entropy_std": 0.2926757037639618, "geo/layer_27/stable_rank_q_proj": 42.19777297973633, "geo/layer_27/stable_rank_k_proj": 31.381851196289062, "geo/layer_27/stable_rank_o_proj": 117.60963439941406, "geo/layer_27/stable_rank_gate_proj": 87.05103302001953, "geo/layer_27/stable_rank_down_proj": 132.5404052734375, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08239038288593292, "geo/layer_27/attn_entropy_mean": 4.299281120300293, "geo/layer_27/attn_entropy_std": 0.621461808681488, "attnres/final_alpha/block_0": 0.24178358912467957, "attnres/block_norm/0": 1.7037949562072754, "attnres/final_alpha/block_1": 0.005473030731081963, "attnres/block_norm/1": 37882.5, "attnres/final_alpha/block_2": 0.011971860192716122, "attnres/block_norm/2": 25325.5078125, "attnres/final_alpha/block_3": 0.013724742457270622, "attnres/block_norm/3": 43203.484375, "attnres/final_alpha/block_4": 0.017156314104795456, "attnres/block_norm/4": 12054.009765625, "attnres/final_alpha/block_5": 0.5881727933883667, "attnres/block_norm/5": 5835.87841796875, "attnres/final_alpha/block_6": 0.12171768397092819, "attnres/block_norm/6": 28291.423828125, "geo/tier1_time_s": 1.3593506813049316, "geo/step": 34650.0, "geo/rankme_slope": -5.4908486832232896e-05} {"step": 34660, "timestamp": 1778232000.4252822, "train/loss": 2.1683486223220827, "train/z_loss": 0.0014329832862131297, "train/perplexity": 8.74383274153967, "train/grad_norm": 0.1982421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1790761.0699272994, "perf/iters_per_sec": 0.853901419604921, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.171095371246338, "data/tokens_consumed": 72689385472, "data/tokens_consumed_B": 72.689385472, "train/loss_slope": -3.11432775348072e-06} {"step": 34670, "timestamp": 1778232010.7852392, "train/loss": 2.227458429336548, "train/z_loss": 0.0014152768184430898, "train/perplexity": 9.276259823872515, "train/grad_norm": 0.1279296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025659.8886248886, "perf/iters_per_sec": 0.9659099047779506, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352932453155517, "data/tokens_consumed": 72710356992, "data/tokens_consumed_B": 72.710356992, "train/loss_slope": 8.835124640432331e-07} {"step": 34680, "timestamp": 1778232021.1537488, "train/loss": 2.176558756828308, "train/z_loss": 0.0014505396015010775, "train/perplexity": 8.81591628734433, "train/grad_norm": 0.26953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023743.259686285, "perf/iters_per_sec": 0.9649959848815369, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0362737417221068, "data/tokens_consumed": 72731328512, "data/tokens_consumed_B": 72.731328512, "train/loss_slope": 2.3021413297797953e-07} {"step": 34690, "timestamp": 1778232031.5234544, "train/loss": 2.1805516719818114, "train/z_loss": 0.0014465372543781996, "train/perplexity": 8.851187864425082, "train/grad_norm": 0.158203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024059.8299425743, "perf/iters_per_sec": 0.9651469373429176, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0361116647720336, "data/tokens_consumed": 72752300032, "data/tokens_consumed_B": 72.752300032, "train/loss_slope": -1.3497696958645155e-06} {"step": 34700, "timestamp": 1778232041.8703115, "grad/layer_0/attn": 0.003006688551977277, "grad/layer_0/mlp": 0.002904849126935005, "grad/layer_0/attn_mlp_ratio": 1.0350583858528608, "grad/layer_4/attn": 0.0023283790796995163, "grad/layer_4/mlp": 0.002584194764494896, "grad/layer_4/attn_mlp_ratio": 0.9010075484979415, "grad/layer_8/attn": 0.004100314807146788, "grad/layer_8/mlp": 0.0037303501740098, "grad/layer_8/attn_mlp_ratio": 1.0991768884854098, "grad/layer_12/attn": 0.007013252470642328, "grad/layer_12/mlp": 0.0064781950786709785, "grad/layer_12/attn_mlp_ratio": 1.0825935738603538, "grad/layer_16/attn": 0.0036867058370262384, "grad/layer_16/mlp": 0.0045562125742435455, "grad/layer_16/attn_mlp_ratio": 0.8091601732876378, "grad/layer_20/attn": 0.004004772752523422, "grad/layer_20/mlp": 0.006328628398478031, "grad/layer_20/attn_mlp_ratio": 0.6328026291141167, "grad/layer_24/attn": 0.01379895769059658, "grad/layer_24/mlp": 0.011247127316892147, "grad/layer_24/attn_mlp_ratio": 1.2268872912270754, "grad/layer_27/attn": 0.00476763816550374, "grad/layer_27/mlp": 0.0101319570094347, "grad/layer_27/attn_mlp_ratio": 0.4705545151848501} {"step": 34700, "timestamp": 1778232041.886263, "train/loss": 2.161775231361389, "train/z_loss": 0.0014346859068609774, "train/perplexity": 8.686544605375172, "train/grad_norm": 0.1611328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025409.6471359695, "perf/iters_per_sec": 0.9657905803375099, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354211568832397, "data/tokens_consumed": 72773271552, "data/tokens_consumed_B": 72.773271552, "train/loss_slope": -2.814752433953077e-06} {"step": 34710, "timestamp": 1778232052.245353, "train/loss": 2.194862127304077, "train/z_loss": 0.0014349124510772526, "train/perplexity": 8.978763045210236, "train/grad_norm": 0.1416015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026234.3952019203, "perf/iters_per_sec": 0.9661838508615114, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034999704360962, "data/tokens_consumed": 72794243072, "data/tokens_consumed_B": 72.794243072, "train/loss_slope": -6.783585558415011e-07} {"step": 34720, "timestamp": 1778232062.5969408, "train/loss": 2.175363826751709, "train/z_loss": 0.0014341354253701866, "train/perplexity": 8.805388175251416, "train/grad_norm": 0.09423828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026866.8130103103, "perf/iters_per_sec": 0.9664854111720611, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034676766395569, "data/tokens_consumed": 72815214592, "data/tokens_consumed_B": 72.815214592, "train/loss_slope": 1.851359466658065e-06} {"step": 34725, "timestamp": 1778232068.378043, "eos/sharpness": 57.4697971343994, "eos/L0_probe": 2.0114643573760986, "eos/L_plus": 2.3647243976593018, "eos/L_minus": 2.2329022884368896, "eos/grad_norm": 0.17655512690544128, "eos/embed_grad_frac": 0.08369328826665878, "eos/time_s": 0.6125741004943848} {"step": 34725, "timestamp": 1778232069.762563, "geo/rankme_last": 439.6410217285156, "geo/layer_0/stable_rank_q_proj": 18.925155639648438, "geo/layer_0/stable_rank_k_proj": 16.302982330322266, "geo/layer_0/stable_rank_o_proj": 49.990604400634766, "geo/layer_0/stable_rank_gate_proj": 140.62535095214844, "geo/layer_0/stable_rank_down_proj": 52.67515182495117, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05464360862970352, "geo/layer_0/attn_entropy_mean": 6.2014875411987305, "geo/layer_0/attn_entropy_std": 0.3620629608631134, "geo/layer_7/stable_rank_q_proj": 43.04294967651367, "geo/layer_7/stable_rank_k_proj": 42.39999008178711, "geo/layer_7/stable_rank_o_proj": 101.72356414794922, "geo/layer_7/stable_rank_gate_proj": 92.60980987548828, "geo/layer_7/stable_rank_down_proj": 145.8419647216797, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5048197507858276, "geo/layer_7/attn_entropy_mean": 4.674365997314453, "geo/layer_7/attn_entropy_std": 0.8136105537414551, "geo/layer_14/stable_rank_q_proj": 54.40570068359375, "geo/layer_14/stable_rank_k_proj": 36.526405334472656, "geo/layer_14/stable_rank_o_proj": 50.56985855102539, "geo/layer_14/stable_rank_gate_proj": 78.98880767822266, "geo/layer_14/stable_rank_down_proj": 133.83116149902344, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38289251923561096, "geo/layer_14/attn_entropy_mean": 5.497305870056152, "geo/layer_14/attn_entropy_std": 0.3916686475276947, "geo/layer_21/stable_rank_q_proj": 44.33377456665039, "geo/layer_21/stable_rank_k_proj": 31.101078033447266, "geo/layer_21/stable_rank_o_proj": 77.69281768798828, "geo/layer_21/stable_rank_gate_proj": 75.4421157836914, "geo/layer_21/stable_rank_down_proj": 56.85732650756836, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14903661608695984, "geo/layer_21/attn_entropy_mean": 5.723084449768066, "geo/layer_21/attn_entropy_std": 0.2787749767303467, "geo/layer_27/stable_rank_q_proj": 42.214500427246094, "geo/layer_27/stable_rank_k_proj": 31.382001876831055, "geo/layer_27/stable_rank_o_proj": 117.53736114501953, "geo/layer_27/stable_rank_gate_proj": 86.91342163085938, "geo/layer_27/stable_rank_down_proj": 132.31088256835938, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0863351821899414, "geo/layer_27/attn_entropy_mean": 4.317875862121582, "geo/layer_27/attn_entropy_std": 0.6289277672767639, "attnres/final_alpha/block_0": 0.24030272662639618, "attnres/block_norm/0": 1.7039470672607422, "attnres/final_alpha/block_1": 0.005405692383646965, "attnres/block_norm/1": 37991.25, "attnres/final_alpha/block_2": 0.011676780879497528, "attnres/block_norm/2": 25450.427734375, "attnres/final_alpha/block_3": 0.013712371699512005, "attnres/block_norm/3": 43292.484375, "attnres/final_alpha/block_4": 0.016963519155979156, "attnres/block_norm/4": 12093.42578125, "attnres/final_alpha/block_5": 0.5945771932601929, "attnres/block_norm/5": 5704.9443359375, "attnres/final_alpha/block_6": 0.11736170947551727, "attnres/block_norm/6": 28506.63671875, "geo/tier1_time_s": 1.3585586547851562, "geo/step": 34725.0, "geo/rankme_slope": -2.061084199304722e-05} {"step": 34730, "timestamp": 1778232074.942486, "train/loss": 2.1583357095718383, "train/z_loss": 0.0014537862269207835, "train/perplexity": 8.656718369327933, "train/grad_norm": 0.169921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1699623.1869757993, "perf/iters_per_sec": 0.8104434904936787, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2338923215866089, "data/tokens_consumed": 72836186112, "data/tokens_consumed_B": 72.836186112, "train/loss_slope": -9.034885288604694e-08} {"step": 34740, "timestamp": 1778232085.7252321, "train/loss": 2.197298860549927, "train/z_loss": 0.0014393990160897375, "train/perplexity": 9.000668573754963, "train/grad_norm": 0.0947265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1945951.830769996, "perf/iters_per_sec": 0.9279021409845333, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.077699851989746, "data/tokens_consumed": 72857157632, "data/tokens_consumed_B": 72.857157632, "train/loss_slope": 1.1406254274796129e-06} {"step": 34750, "timestamp": 1778232096.5951688, "grad/layer_0/attn": 0.0028757192194461823, "grad/layer_0/mlp": 0.0028267980087548494, "grad/layer_0/attn_mlp_ratio": 1.0173061919561286, "grad/layer_4/attn": 0.003298569703474641, "grad/layer_4/mlp": 0.002574730897322297, "grad/layer_4/attn_mlp_ratio": 1.2811317791664958, "grad/layer_8/attn": 0.00510401139035821, "grad/layer_8/mlp": 0.003701084991917014, "grad/layer_8/attn_mlp_ratio": 1.3790581042043903, "grad/layer_12/attn": 0.0055332123301923275, "grad/layer_12/mlp": 0.006703895051032305, "grad/layer_12/attn_mlp_ratio": 0.8253727430895591, "grad/layer_16/attn": 0.006011803634464741, "grad/layer_16/mlp": 0.004887644667178392, "grad/layer_16/attn_mlp_ratio": 1.2300001167914907, "grad/layer_20/attn": 0.003804009174928069, "grad/layer_20/mlp": 0.006952802184969187, "grad/layer_20/attn_mlp_ratio": 0.5471188477704465, "grad/layer_24/attn": 0.014662420377135277, "grad/layer_24/mlp": 0.011656449176371098, "grad/layer_24/attn_mlp_ratio": 1.257880511422772, "grad/layer_27/attn": 0.005928452592343092, "grad/layer_27/mlp": 0.011222714558243752, "grad/layer_27/attn_mlp_ratio": 0.5282547737225317} {"step": 34750, "timestamp": 1778232096.61125, "train/loss": 2.228800058364868, "train/z_loss": 0.0014202012331224977, "train/perplexity": 9.288713475548747, "train/grad_norm": 0.203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1927596.5958618429, "perf/iters_per_sec": 0.9191496829327788, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0879620790481568, "data/tokens_consumed": 72878129152, "data/tokens_consumed_B": 72.878129152, "train/loss_slope": 3.1013249993288762e-06} {"step": 34760, "timestamp": 1778232106.968183, "train/loss": 2.1474074602127073, "train/z_loss": 0.001445171155501157, "train/perplexity": 8.562630635842643, "train/grad_norm": 0.09423828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026506.223459667, "perf/iters_per_sec": 0.9663134686754546, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034860873222351, "data/tokens_consumed": 72899100672, "data/tokens_consumed_B": 72.899100672, "train/loss_slope": 1.8622151731622337e-06} {"step": 34770, "timestamp": 1778232117.3284097, "train/loss": 2.2310743808746336, "train/z_loss": 0.00142226405441761, "train/perplexity": 9.309863047049795, "train/grad_norm": 0.2451171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025523.2159920873, "perf/iters_per_sec": 0.9658447341881214, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0353631019592284, "data/tokens_consumed": 72920072192, "data/tokens_consumed_B": 72.920072192, "train/loss_slope": 1.949625633301555e-06} {"step": 34780, "timestamp": 1778232127.6900663, "train/loss": 2.2061112403869627, "train/z_loss": 0.0014388703857548535, "train/perplexity": 9.080336400017146, "train/grad_norm": 0.1796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025045.0069799542, "perf/iters_per_sec": 0.9656167063617488, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035607600212097, "data/tokens_consumed": 72941043712, "data/tokens_consumed_B": 72.941043712, "train/loss_slope": 3.209220515404811e-06} {"step": 34790, "timestamp": 1778232138.0462205, "train/loss": 2.1380902767181396, "train/z_loss": 0.0014515916118398309, "train/perplexity": 8.48322154400933, "train/grad_norm": 0.140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026570.468302628, "perf/iters_per_sec": 0.9663441030038014, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348280668258667, "data/tokens_consumed": 72962015232, "data/tokens_consumed_B": 72.962015232, "train/loss_slope": -2.604333910182567e-07} {"step": 34800, "timestamp": 1778232148.8785617, "grad/layer_0/attn": 0.003213664283975959, "grad/layer_0/mlp": 0.0030442853458225727, "grad/layer_0/attn_mlp_ratio": 1.0556382905505168, "grad/layer_4/attn": 0.001649423036724329, "grad/layer_4/mlp": 0.0025789046194404364, "grad/layer_4/attn_mlp_ratio": 0.6395827749240058, "grad/layer_8/attn": 0.012516738846898079, "grad/layer_8/mlp": 0.003859973745420575, "grad/layer_8/attn_mlp_ratio": 3.2427004296279835, "grad/layer_12/attn": 0.003798814956098795, "grad/layer_12/mlp": 0.006694538984447718, "grad/layer_12/attn_mlp_ratio": 0.5674498136733468, "grad/layer_16/attn": 0.003504555905237794, "grad/layer_16/mlp": 0.00445909146219492, "grad/layer_16/attn_mlp_ratio": 0.7859349502822791, "grad/layer_20/attn": 0.0034820078872144222, "grad/layer_20/mlp": 0.005702991969883442, "grad/layer_20/attn_mlp_ratio": 0.6105580797845276, "grad/layer_24/attn": 0.012693498283624649, "grad/layer_24/mlp": 0.009870320558547974, "grad/layer_24/attn_mlp_ratio": 1.2860269410428653, "grad/layer_27/attn": 0.0058969841338694096, "grad/layer_27/mlp": 0.008796618320047855, "grad/layer_27/attn_mlp_ratio": 0.6703694365587054} {"step": 34800, "timestamp": 1778232149.494807, "eos/sharpness": 41.338038444519036, "eos/L0_probe": 2.010176181793213, "eos/L_plus": 2.2508246898651123, "eos/L_minus": 2.182908058166504, "eos/grad_norm": 0.13309074938297272, "eos/embed_grad_frac": 0.1578063815832138, "eos/time_s": 0.6133203506469727} {"step": 34800, "timestamp": 1778232149.5154634, "train/loss": 2.1964876651763916, "train/z_loss": 0.0014253467554226518, "train/perplexity": 8.993370233639293, "train/grad_norm": 0.1328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1829365.3251354676, "perf/iters_per_sec": 0.8723093629529322, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1463822841644287, "data/tokens_consumed": 72982986752, "data/tokens_consumed_B": 72.982986752, "train/loss_slope": 1.8923544146940898e-06} {"step": 34800, "timestamp": 1778232150.8792927, "geo/rankme_last": 438.8827819824219, "geo/layer_0/stable_rank_q_proj": 18.959741592407227, "geo/layer_0/stable_rank_k_proj": 16.30110740661621, "geo/layer_0/stable_rank_o_proj": 49.9482421875, "geo/layer_0/stable_rank_gate_proj": 140.68951416015625, "geo/layer_0/stable_rank_down_proj": 52.72412109375, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0590277723968029, "geo/layer_0/attn_entropy_mean": 6.204784393310547, "geo/layer_0/attn_entropy_std": 0.35620996356010437, "geo/layer_7/stable_rank_q_proj": 43.14379119873047, "geo/layer_7/stable_rank_k_proj": 42.48752975463867, "geo/layer_7/stable_rank_o_proj": 101.85513305664062, "geo/layer_7/stable_rank_gate_proj": 92.6142578125, "geo/layer_7/stable_rank_down_proj": 145.80584716796875, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.528511643409729, "geo/layer_7/attn_entropy_mean": 4.6542439460754395, "geo/layer_7/attn_entropy_std": 0.8278757929801941, "geo/layer_14/stable_rank_q_proj": 54.39756774902344, "geo/layer_14/stable_rank_k_proj": 36.63140106201172, "geo/layer_14/stable_rank_o_proj": 50.47599792480469, "geo/layer_14/stable_rank_gate_proj": 78.9703140258789, "geo/layer_14/stable_rank_down_proj": 133.99777221679688, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3738100826740265, "geo/layer_14/attn_entropy_mean": 5.479065418243408, "geo/layer_14/attn_entropy_std": 0.3840232789516449, "geo/layer_21/stable_rank_q_proj": 44.34912109375, "geo/layer_21/stable_rank_k_proj": 31.107547760009766, "geo/layer_21/stable_rank_o_proj": 77.7536392211914, "geo/layer_21/stable_rank_gate_proj": 75.30892181396484, "geo/layer_21/stable_rank_down_proj": 56.87504196166992, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14482970535755157, "geo/layer_21/attn_entropy_mean": 5.7589802742004395, "geo/layer_21/attn_entropy_std": 0.28744518756866455, "geo/layer_27/stable_rank_q_proj": 42.21524429321289, "geo/layer_27/stable_rank_k_proj": 31.35367202758789, "geo/layer_27/stable_rank_o_proj": 117.40068054199219, "geo/layer_27/stable_rank_gate_proj": 86.8141860961914, "geo/layer_27/stable_rank_down_proj": 132.3701934814453, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08687243610620499, "geo/layer_27/attn_entropy_mean": 4.285882472991943, "geo/layer_27/attn_entropy_std": 0.6413767337799072, "attnres/final_alpha/block_0": 0.24171218276023865, "attnres/block_norm/0": 1.704239845275879, "attnres/final_alpha/block_1": 0.005486675538122654, "attnres/block_norm/1": 37900.4453125, "attnres/final_alpha/block_2": 0.011873706243932247, "attnres/block_norm/2": 25406.482421875, "attnres/final_alpha/block_3": 0.014004722237586975, "attnres/block_norm/3": 43253.05859375, "attnres/final_alpha/block_4": 0.01731545850634575, "attnres/block_norm/4": 12120.4716796875, "attnres/final_alpha/block_5": 0.5892863869667053, "attnres/block_norm/5": 5815.181640625, "attnres/final_alpha/block_6": 0.1203208789229393, "attnres/block_norm/6": 28510.94921875, "geo/tier1_time_s": 1.3597514629364014, "geo/step": 34800.0, "geo/rankme_slope": -3.52003887492497e-05} {"step": 34810, "timestamp": 1778232161.6557825, "train/loss": 2.2208446741104124, "train/z_loss": 0.001418648916296661, "train/perplexity": 9.215111345412879, "train/grad_norm": 0.23828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1728021.729586083, "perf/iters_per_sec": 0.8239849708490767, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2136143684387206, "data/tokens_consumed": 73003958272, "data/tokens_consumed_B": 73.003958272, "train/loss_slope": 5.620101161307834e-06} {"step": 34820, "timestamp": 1778232172.013196, "train/loss": 2.138444149494171, "train/z_loss": 0.001439040165860206, "train/perplexity": 8.486224056389162, "train/grad_norm": 0.1083984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026060.3103147123, "perf/iters_per_sec": 0.966100840718609, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350886344909669, "data/tokens_consumed": 73024929792, "data/tokens_consumed_B": 73.024929792, "train/loss_slope": 4.630150488822837e-06} {"step": 34830, "timestamp": 1778232182.36539, "train/loss": 2.2115163087844847, "train/z_loss": 0.001427355920895934, "train/perplexity": 9.129549118575223, "train/grad_norm": 0.09716796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026923.7476419057, "perf/iters_per_sec": 0.9665125597199944, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346477031707764, "data/tokens_consumed": 73045901312, "data/tokens_consumed_B": 73.045901312, "train/loss_slope": 4.003216595825668e-06} {"step": 34840, "timestamp": 1778232192.7252092, "train/loss": 2.206431007385254, "train/z_loss": 0.0014252834022045135, "train/perplexity": 9.083240456217176, "train/grad_norm": 0.109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025574.337793879, "perf/iters_per_sec": 0.96586911096281, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035336971282959, "data/tokens_consumed": 73066872832, "data/tokens_consumed_B": 73.066872832, "train/loss_slope": 4.094865956608305e-06} {"step": 34850, "timestamp": 1778232203.0639799, "grad/layer_0/attn": 0.002918911399319768, "grad/layer_0/mlp": 0.002810173435136676, "grad/layer_0/attn_mlp_ratio": 1.038694359200064, "grad/layer_4/attn": 0.0027073731180280447, "grad/layer_4/mlp": 0.0025012476835399866, "grad/layer_4/attn_mlp_ratio": 1.0824090023565482, "grad/layer_8/attn": 0.0038820572663098574, "grad/layer_8/mlp": 0.003505890490487218, "grad/layer_8/attn_mlp_ratio": 1.1072956118035513, "grad/layer_12/attn": 0.004212636500597, "grad/layer_12/mlp": 0.006697452627122402, "grad/layer_12/attn_mlp_ratio": 0.6289908525277464, "grad/layer_16/attn": 0.0038074434269219637, "grad/layer_16/mlp": 0.00487831886857748, "grad/layer_16/attn_mlp_ratio": 0.780482672708918, "grad/layer_20/attn": 0.006349510978907347, "grad/layer_20/mlp": 0.0066011035814881325, "grad/layer_20/attn_mlp_ratio": 0.9618862670970701, "grad/layer_24/attn": 0.011544618755578995, "grad/layer_24/mlp": 0.01271532941609621, "grad/layer_24/attn_mlp_ratio": 0.9079291842940268, "grad/layer_27/attn": 0.0049674659967422485, "grad/layer_27/mlp": 0.013116537593305111, "grad/layer_27/attn_mlp_ratio": 0.37871777696927766} {"step": 34850, "timestamp": 1778232203.079705, "train/loss": 2.1813925981521605, "train/z_loss": 0.0014341649715788663, "train/perplexity": 8.858634190405338, "train/grad_norm": 0.1884765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026652.3677074378, "perf/iters_per_sec": 0.9663831556832494, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347862482070922, "data/tokens_consumed": 73087844352, "data/tokens_consumed_B": 73.087844352, "train/loss_slope": 5.063805271117749e-06} {"step": 34860, "timestamp": 1778232213.4275107, "train/loss": 2.16839656829834, "train/z_loss": 0.001435409381520003, "train/perplexity": 8.744251983187088, "train/grad_norm": 0.12060546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027781.283142266, "perf/iters_per_sec": 0.9669214645110445, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342101573944091, "data/tokens_consumed": 73108815872, "data/tokens_consumed_B": 73.108815872, "train/loss_slope": 2.70124618405517e-06} {"step": 34870, "timestamp": 1778232223.785943, "train/loss": 2.1399412870407106, "train/z_loss": 0.001451182528398931, "train/perplexity": 8.498938616400029, "train/grad_norm": 0.1396484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025742.0410166972, "perf/iters_per_sec": 0.9659490780910002, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035251259803772, "data/tokens_consumed": 73129787392, "data/tokens_consumed_B": 73.129787392, "train/loss_slope": -4.584657727438205e-07} {"step": 34875, "timestamp": 1778232229.572777, "eos/sharpness": 7.682442665100096, "eos/L0_probe": 2.0082314014434814, "eos/L_plus": 2.0457704067230225, "eos/L_minus": 2.0475168228149414, "eos/grad_norm": 0.14091277122497559, "eos/embed_grad_frac": 0.23187227547168732, "eos/time_s": 0.6207518577575684} {"step": 34875, "timestamp": 1778232230.9537616, "geo/rankme_last": 439.30487060546875, "geo/layer_0/stable_rank_q_proj": 18.96183204650879, "geo/layer_0/stable_rank_k_proj": 16.2028865814209, "geo/layer_0/stable_rank_o_proj": 50.00693130493164, "geo/layer_0/stable_rank_gate_proj": 141.12274169921875, "geo/layer_0/stable_rank_down_proj": 52.66273880004883, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.055159952491521835, "geo/layer_0/attn_entropy_mean": 6.206155300140381, "geo/layer_0/attn_entropy_std": 0.36027735471725464, "geo/layer_7/stable_rank_q_proj": 43.278438568115234, "geo/layer_7/stable_rank_k_proj": 42.43328857421875, "geo/layer_7/stable_rank_o_proj": 101.9103012084961, "geo/layer_7/stable_rank_gate_proj": 92.60699462890625, "geo/layer_7/stable_rank_down_proj": 145.35638427734375, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5098649859428406, "geo/layer_7/attn_entropy_mean": 4.634342670440674, "geo/layer_7/attn_entropy_std": 0.8145756125450134, "geo/layer_14/stable_rank_q_proj": 54.489845275878906, "geo/layer_14/stable_rank_k_proj": 36.6971435546875, "geo/layer_14/stable_rank_o_proj": 50.438411712646484, "geo/layer_14/stable_rank_gate_proj": 78.90843200683594, "geo/layer_14/stable_rank_down_proj": 134.04722595214844, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37320148944854736, "geo/layer_14/attn_entropy_mean": 5.479807376861572, "geo/layer_14/attn_entropy_std": 0.38360095024108887, "geo/layer_21/stable_rank_q_proj": 44.3480339050293, "geo/layer_21/stable_rank_k_proj": 31.16034507751465, "geo/layer_21/stable_rank_o_proj": 77.85077667236328, "geo/layer_21/stable_rank_gate_proj": 75.2354965209961, "geo/layer_21/stable_rank_down_proj": 56.78972244262695, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15173062682151794, "geo/layer_21/attn_entropy_mean": 5.741805553436279, "geo/layer_21/attn_entropy_std": 0.2959510087966919, "geo/layer_27/stable_rank_q_proj": 42.315181732177734, "geo/layer_27/stable_rank_k_proj": 31.487245559692383, "geo/layer_27/stable_rank_o_proj": 117.42549133300781, "geo/layer_27/stable_rank_gate_proj": 86.75931549072266, "geo/layer_27/stable_rank_down_proj": 132.5924072265625, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09509075433015823, "geo/layer_27/attn_entropy_mean": 4.29448127746582, "geo/layer_27/attn_entropy_std": 0.6274422407150269, "attnres/final_alpha/block_0": 0.24198225140571594, "attnres/block_norm/0": 1.7045907974243164, "attnres/final_alpha/block_1": 0.00548552256077528, "attnres/block_norm/1": 37938.63671875, "attnres/final_alpha/block_2": 0.012063771486282349, "attnres/block_norm/2": 25437.92578125, "attnres/final_alpha/block_3": 0.014242498204112053, "attnres/block_norm/3": 43086.43359375, "attnres/final_alpha/block_4": 0.017322681844234467, "attnres/block_norm/4": 12167.92578125, "attnres/final_alpha/block_5": 0.589806079864502, "attnres/block_norm/5": 5805.611328125, "attnres/final_alpha/block_6": 0.11909718066453934, "attnres/block_norm/6": 28608.5859375, "geo/tier1_time_s": 1.3615179061889648, "geo/step": 34875.0, "geo/rankme_slope": -5.4159788915566224e-05} {"step": 34880, "timestamp": 1778232236.1366224, "train/loss": 2.2171354055404664, "train/z_loss": 0.0014296831795945764, "train/perplexity": 9.18099333807513, "train/grad_norm": 0.173828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1698780.8734530557, "perf/iters_per_sec": 0.8100418441071776, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2345041275024413, "data/tokens_consumed": 73150758912, "data/tokens_consumed_B": 73.150758912, "train/loss_slope": 1.6129029108316866e-06} {"step": 34890, "timestamp": 1778232246.4933982, "train/loss": 2.1800049185752868, "train/z_loss": 0.00144585024099797, "train/perplexity": 8.846349770051232, "train/grad_norm": 0.126953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025967.5861530956, "perf/iters_per_sec": 0.9660566263928869, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351360082626342, "data/tokens_consumed": 73171730432, "data/tokens_consumed_B": 73.171730432, "train/loss_slope": 1.040447720385432e-06} {"step": 34900, "timestamp": 1778232256.840057, "grad/layer_0/attn": 0.0031392232049256563, "grad/layer_0/mlp": 0.002880816115066409, "grad/layer_0/attn_mlp_ratio": 1.0896992277771147, "grad/layer_4/attn": 0.002371292095631361, "grad/layer_4/mlp": 0.002583099761977792, "grad/layer_4/attn_mlp_ratio": 0.9180024862901519, "grad/layer_8/attn": 0.005830331239849329, "grad/layer_8/mlp": 0.0036696256138384342, "grad/layer_8/attn_mlp_ratio": 1.5888081495239996, "grad/layer_12/attn": 0.005020138807594776, "grad/layer_12/mlp": 0.006680672988295555, "grad/layer_12/attn_mlp_ratio": 0.7514420689720605, "grad/layer_16/attn": 0.007269140798598528, "grad/layer_16/mlp": 0.005129126366227865, "grad/layer_16/attn_mlp_ratio": 1.4172278352778662, "grad/layer_20/attn": 0.0035232414957135916, "grad/layer_20/mlp": 0.006389436777681112, "grad/layer_20/attn_mlp_ratio": 0.5514165900942847, "grad/layer_24/attn": 0.013302542269229889, "grad/layer_24/mlp": 0.01431933231651783, "grad/layer_24/attn_mlp_ratio": 0.9289917911176474, "grad/layer_27/attn": 0.005324529949575663, "grad/layer_27/mlp": 0.013170955702662468, "grad/layer_27/attn_mlp_ratio": 0.40426298814998113} {"step": 34900, "timestamp": 1778232256.8557208, "train/loss": 2.2156745910644533, "train/z_loss": 0.0014256076654419303, "train/perplexity": 9.167591401357559, "train/grad_norm": 0.1650390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024777.7187050446, "perf/iters_per_sec": 0.9654892533803199, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035744309425354, "data/tokens_consumed": 73192701952, "data/tokens_consumed_B": 73.192701952, "train/loss_slope": 6.2542866558918435e-06} {"step": 34910, "timestamp": 1778232267.200776, "train/loss": 2.1298644065856935, "train/z_loss": 0.0014520945609547199, "train/perplexity": 8.413725888270815, "train/grad_norm": 0.2109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028273.2261306806, "perf/iters_per_sec": 0.9671560412076381, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0339593172073365, "data/tokens_consumed": 73213673472, "data/tokens_consumed_B": 73.213673472, "train/loss_slope": 3.7096488450763237e-06} {"step": 34920, "timestamp": 1778232277.5466487, "train/loss": 2.1950297355651855, "train/z_loss": 0.0014530167332850396, "train/perplexity": 8.980268086196277, "train/grad_norm": 0.1162109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028121.1429895826, "perf/iters_per_sec": 0.9670835223148263, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340368509292603, "data/tokens_consumed": 73234644992, "data/tokens_consumed_B": 73.234644992, "train/loss_slope": 5.268717134031086e-06} {"step": 34930, "timestamp": 1778232287.9066794, "train/loss": 2.1646886348724363, "train/z_loss": 0.0014501491212286055, "train/perplexity": 8.711888916102565, "train/grad_norm": 0.2177734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025542.0132159463, "perf/iters_per_sec": 0.9658536974029285, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0353534936904907, "data/tokens_consumed": 73255616512, "data/tokens_consumed_B": 73.255616512, "train/loss_slope": 6.896186490120421e-06} {"step": 34940, "timestamp": 1778232298.2733605, "train/loss": 2.1243303298950194, "train/z_loss": 0.0014404229004867375, "train/perplexity": 8.367292286015017, "train/grad_norm": 0.154296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024137.5207426453, "perf/iters_per_sec": 0.965183983203242, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0360718965530396, "data/tokens_consumed": 73276588032, "data/tokens_consumed_B": 73.276588032, "train/loss_slope": 3.3953400942930754e-06} {"step": 34950, "timestamp": 1778232308.6302772, "grad/layer_0/attn": 0.0027392832562327385, "grad/layer_0/mlp": 0.0026357057504355907, "grad/layer_0/attn_mlp_ratio": 1.0392977865037671, "grad/layer_4/attn": 0.0017435922054573894, "grad/layer_4/mlp": 0.002474502893164754, "grad/layer_4/attn_mlp_ratio": 0.704623195152183, "grad/layer_8/attn": 0.009718749672174454, "grad/layer_8/mlp": 0.003708216594532132, "grad/layer_8/attn_mlp_ratio": 2.620868863058898, "grad/layer_12/attn": 0.0036628530360758305, "grad/layer_12/mlp": 0.00663360208272934, "grad/layer_12/attn_mlp_ratio": 0.5521665205688866, "grad/layer_16/attn": 0.0034599960781633854, "grad/layer_16/mlp": 0.00447398005053401, "grad/layer_16/attn_mlp_ratio": 0.7733597293118084, "grad/layer_20/attn": 0.003358604619279504, "grad/layer_20/mlp": 0.005452007986605167, "grad/layer_20/attn_mlp_ratio": 0.6160307479240783, "grad/layer_24/attn": 0.006609936244785786, "grad/layer_24/mlp": 0.008763543330132961, "grad/layer_24/attn_mlp_ratio": 0.7542538355042419, "grad/layer_27/attn": 0.005022129975259304, "grad/layer_27/mlp": 0.007345767226070166, "grad/layer_27/attn_mlp_ratio": 0.6836767014707558} {"step": 34950, "timestamp": 1778232309.2726698, "eos/sharpness": 7.044601440429686, "eos/L0_probe": 2.0043578147888184, "eos/L_plus": 2.0443813800811768, "eos/L_minus": 2.034780263900757, "eos/grad_norm": 0.09119340777397156, "eos/embed_grad_frac": 0.29536429047584534, "eos/time_s": 0.6395630836486816} {"step": 34950, "timestamp": 1778232309.2934353, "train/loss": 2.151421642303467, "train/z_loss": 0.001438147691078484, "train/perplexity": 8.59707167444342, "train/grad_norm": 0.09130859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1904382.6062096679, "perf/iters_per_sec": 0.9080803900764789, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1012240886688232, "data/tokens_consumed": 73297559552, "data/tokens_consumed_B": 73.297559552, "train/loss_slope": 3.8788049313316e-06} {"step": 34950, "timestamp": 1778232310.654637, "geo/rankme_last": 439.7114562988281, "geo/layer_0/stable_rank_q_proj": 18.943588256835938, "geo/layer_0/stable_rank_k_proj": 16.172983169555664, "geo/layer_0/stable_rank_o_proj": 50.124717712402344, "geo/layer_0/stable_rank_gate_proj": 140.93870544433594, "geo/layer_0/stable_rank_down_proj": 52.72863006591797, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0561690554022789, "geo/layer_0/attn_entropy_mean": 6.202889919281006, "geo/layer_0/attn_entropy_std": 0.36384978890419006, "geo/layer_7/stable_rank_q_proj": 43.21446990966797, "geo/layer_7/stable_rank_k_proj": 42.224430084228516, "geo/layer_7/stable_rank_o_proj": 102.07390594482422, "geo/layer_7/stable_rank_gate_proj": 92.48030090332031, "geo/layer_7/stable_rank_down_proj": 145.62127685546875, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5244902968406677, "geo/layer_7/attn_entropy_mean": 4.640735149383545, "geo/layer_7/attn_entropy_std": 0.8215305805206299, "geo/layer_14/stable_rank_q_proj": 54.52899932861328, "geo/layer_14/stable_rank_k_proj": 36.69281005859375, "geo/layer_14/stable_rank_o_proj": 50.3087158203125, "geo/layer_14/stable_rank_gate_proj": 78.97999572753906, "geo/layer_14/stable_rank_down_proj": 134.08575439453125, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39244160056114197, "geo/layer_14/attn_entropy_mean": 5.459045886993408, "geo/layer_14/attn_entropy_std": 0.38698020577430725, "geo/layer_21/stable_rank_q_proj": 44.34427261352539, "geo/layer_21/stable_rank_k_proj": 31.21794891357422, "geo/layer_21/stable_rank_o_proj": 77.79670715332031, "geo/layer_21/stable_rank_gate_proj": 75.24076080322266, "geo/layer_21/stable_rank_down_proj": 56.7109489440918, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14872093498706818, "geo/layer_21/attn_entropy_mean": 5.736416339874268, "geo/layer_21/attn_entropy_std": 0.28098687529563904, "geo/layer_27/stable_rank_q_proj": 42.28302001953125, "geo/layer_27/stable_rank_k_proj": 31.476837158203125, "geo/layer_27/stable_rank_o_proj": 117.6158676147461, "geo/layer_27/stable_rank_gate_proj": 86.75495910644531, "geo/layer_27/stable_rank_down_proj": 132.9542694091797, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08710391819477081, "geo/layer_27/attn_entropy_mean": 4.30202579498291, "geo/layer_27/attn_entropy_std": 0.64290851354599, "attnres/final_alpha/block_0": 0.2406756430864334, "attnres/block_norm/0": 1.7049534320831299, "attnres/final_alpha/block_1": 0.005461032502353191, "attnres/block_norm/1": 37919.5625, "attnres/final_alpha/block_2": 0.011996306478977203, "attnres/block_norm/2": 25393.830078125, "attnres/final_alpha/block_3": 0.013942724093794823, "attnres/block_norm/3": 43538.5546875, "attnres/final_alpha/block_4": 0.017199302092194557, "attnres/block_norm/4": 12162.650390625, "attnres/final_alpha/block_5": 0.590789794921875, "attnres/block_norm/5": 5778.3564453125, "attnres/final_alpha/block_6": 0.11993518471717834, "attnres/block_norm/6": 28447.26953125, "geo/tier1_time_s": 1.3576695919036865, "geo/step": 34950.0, "geo/rankme_slope": -4.117830726040416e-05} {"step": 34960, "timestamp": 1778232321.018891, "train/loss": 2.1902557611465454, "train/z_loss": 0.0014327680110000074, "train/perplexity": 8.93749868731747, "train/grad_norm": 0.24609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1789056.5613337874, "perf/iters_per_sec": 0.853088646571058, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1722111225128173, "data/tokens_consumed": 73318531072, "data/tokens_consumed_B": 73.318531072, "train/loss_slope": 5.298618386180188e-06} {"step": 34970, "timestamp": 1778232331.3677065, "train/loss": 2.1831897497177124, "train/z_loss": 0.0014265642850659788, "train/perplexity": 8.874568812886404, "train/grad_norm": 0.2392578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027530.330333891, "perf/iters_per_sec": 0.9668018008870559, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343381643295289, "data/tokens_consumed": 73339502592, "data/tokens_consumed_B": 73.339502592, "train/loss_slope": 6.340396343463532e-06} {"step": 34980, "timestamp": 1778232341.7254784, "train/loss": 2.2183151721954344, "train/z_loss": 0.0014200247940607369, "train/perplexity": 9.191831159668087, "train/grad_norm": 0.11279296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025981.585230907, "perf/iters_per_sec": 0.966063301673368, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351288557052611, "data/tokens_consumed": 73360474112, "data/tokens_consumed_B": 73.360474112, "train/loss_slope": 8.909975720567809e-06} {"step": 34990, "timestamp": 1778232352.0747478, "train/loss": 2.17871298789978, "train/z_loss": 0.0014414951438084245, "train/perplexity": 8.834928278893056, "train/grad_norm": 0.20703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027743.5123862328, "perf/iters_per_sec": 0.9669034540110745, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342294216156005, "data/tokens_consumed": 73381445632, "data/tokens_consumed_B": 73.381445632, "train/loss_slope": 1.1617238801268875e-05} {"step": 35000, "timestamp": 1778232362.418106, "grad/layer_0/attn": 0.0028167658019810915, "grad/layer_0/mlp": 0.0026476529892534018, "grad/layer_0/attn_mlp_ratio": 1.0638726853658071, "grad/layer_4/attn": 0.0017646093619987369, "grad/layer_4/mlp": 0.002416031900793314, "grad/layer_4/attn_mlp_ratio": 0.7303749956206376, "grad/layer_8/attn": 0.003306623548269272, "grad/layer_8/mlp": 0.003526850137859583, "grad/layer_8/attn_mlp_ratio": 0.9375571190332241, "grad/layer_12/attn": 0.005001606419682503, "grad/layer_12/mlp": 0.006051874253898859, "grad/layer_12/attn_mlp_ratio": 0.826455760182838, "grad/layer_16/attn": 0.005795703269541264, "grad/layer_16/mlp": 0.004246068652719259, "grad/layer_16/attn_mlp_ratio": 1.3649574717389092, "grad/layer_20/attn": 0.006405510939657688, "grad/layer_20/mlp": 0.005249457433819771, "grad/layer_20/attn_mlp_ratio": 1.2202234037307689, "grad/layer_24/attn": 0.007136684376746416, "grad/layer_24/mlp": 0.007593050133436918, "grad/layer_24/attn_mlp_ratio": 0.9398969001046721, "grad/layer_27/attn": 0.004459625110030174, "grad/layer_27/mlp": 0.0066072652116417885, "grad/layer_27/attn_mlp_ratio": 0.6749577774896459} {"step": 35000, "timestamp": 1778232362.4343226, "train/loss": 2.183033847808838, "train/z_loss": 0.001437628548592329, "train/perplexity": 8.873185358512426, "train/grad_norm": 0.08837890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025398.5008175422, "perf/iters_per_sec": 0.9657852653587066, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354268550872803, "data/tokens_consumed": 73402417152, "data/tokens_consumed_B": 73.402417152, "train/loss_slope": 1.2672755863442607e-05} {"step": 35000, "timestamp": 1778232369.2588322, "geo/ww_alpha_mean": 7.6090904756814615, "geo/ww_alpha_std": 4.181801119604493, "geo/ww_alpha_min": 1.3553939009436835, "geo/ww_alpha_max": 24.479779148536885, "geo/ww_alpha_healthy_frac": 0.17258883248730963, "geo/ww_alpha_by_type/q_proj": 4.095959427953052, "geo/ww_alpha_by_type/k_proj": 4.55290950395873, "geo/ww_alpha_by_type/v_proj": 8.256826051270489, "geo/ww_alpha_by_type/o_proj": 7.823627161715038, "geo/ww_alpha_by_type/gate_proj": 8.07616468635421, "geo/ww_alpha_by_type/up_proj": 11.865473319167737, "geo/ww_alpha_by_type/down_proj": 8.699712603409, "geo/twonn_id/layer_0": 0.6993277072906494, "geo/twonn_id/layer_7": 2.9906959533691406, "geo/twonn_id/layer_14": 4.382958889007568, "geo/twonn_id/layer_21": 6.220691204071045, "geo/twonn_id/layer_27": 6.599081039428711, "geo/tier2_time_s": 6.816519260406494} {"step": 35000, "timestamp": 1778232369.9044404, "eoc/jacobian_sigma/layer_0/attn": 1080.2979736328125, "eoc/jacobian_sigma/layer_0/mlp": 8067.28173828125, "eoc/jacobian_sigma/layer_0": 8067.28173828125, "eoc/jacobian_sigma/layer_7/attn": 1.1630398035049438, "eoc/jacobian_sigma/layer_7/mlp": 1.670946717262268, "eoc/jacobian_sigma/layer_7": 1.670946717262268, "eoc/jacobian_sigma/layer_14/attn": 1.618021845817566, "eoc/jacobian_sigma/layer_14/mlp": 6.812384605407715, "eoc/jacobian_sigma/layer_14": 6.812384605407715, "eoc/jacobian_sigma/layer_21/attn": 1.091848611831665, "eoc/jacobian_sigma/layer_21/mlp": 3.9606404304504395, "eoc/jacobian_sigma/layer_21": 3.9606404304504395, "eoc/jacobian_sigma/layer_27/attn": 4.497761249542236, "eoc/jacobian_sigma/layer_27/mlp": 22.637571334838867, "eoc/jacobian_sigma/layer_27": 22.637571334838867, "eoc/layer0_sigma": 8067.28173828125, "eoc/sigma_max": 22.637571334838867, "eoc/sigma_min": 1.670946717262268, "eoc/sigma_mean": 8.770385771989822, "eoc/time_s": 0.6400845050811768} {"step": 35010, "timestamp": 1778232380.313923, "train/loss": 2.1973833799362184, "train/z_loss": 0.001431398035492748, "train/perplexity": 9.001429336888194, "train/grad_norm": 0.11669921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1173388.1141053713, "perf/iters_per_sec": 0.5595150537993294, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.7872620105743409, "data/tokens_consumed": 73423388672, "data/tokens_consumed_B": 73.423388672, "train/loss_slope": 1.4141035945502033e-05} {"step": 35020, "timestamp": 1778232390.6678822, "train/loss": 2.142357039451599, "train/z_loss": 0.0014409763854928315, "train/perplexity": 8.519494767141044, "train/grad_norm": 0.126953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026567.3866982602, "perf/iters_per_sec": 0.9663426335803319, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348296403884887, "data/tokens_consumed": 73444360192, "data/tokens_consumed_B": 73.444360192, "train/loss_slope": 1.2439601535093198e-05} {"step": 35025, "timestamp": 1778232396.4466386, "eos/sharpness": 47.05939292907714, "eos/L0_probe": 2.0091865062713623, "eos/L_plus": 2.1924617290496826, "eos/L_minus": 2.2965052127838135, "eos/grad_norm": 0.13561739027500153, "eos/embed_grad_frac": 0.1350666880607605, "eos/time_s": 0.6140689849853516} {"step": 35025, "timestamp": 1778232397.8245335, "geo/rankme_last": 438.92169189453125, "geo/layer_0/stable_rank_q_proj": 18.949281692504883, "geo/layer_0/stable_rank_k_proj": 16.220117568969727, "geo/layer_0/stable_rank_o_proj": 50.01918411254883, "geo/layer_0/stable_rank_gate_proj": 141.02593994140625, "geo/layer_0/stable_rank_down_proj": 52.781227111816406, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06119421496987343, "geo/layer_0/attn_entropy_mean": 6.203094959259033, "geo/layer_0/attn_entropy_std": 0.36591920256614685, "geo/layer_7/stable_rank_q_proj": 43.1519889831543, "geo/layer_7/stable_rank_k_proj": 42.26116180419922, "geo/layer_7/stable_rank_o_proj": 101.7337875366211, "geo/layer_7/stable_rank_gate_proj": 92.37857055664062, "geo/layer_7/stable_rank_down_proj": 145.57310485839844, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5045844912528992, "geo/layer_7/attn_entropy_mean": 4.654187202453613, "geo/layer_7/attn_entropy_std": 0.8132025599479675, "geo/layer_14/stable_rank_q_proj": 54.5272102355957, "geo/layer_14/stable_rank_k_proj": 36.72138977050781, "geo/layer_14/stable_rank_o_proj": 50.25519943237305, "geo/layer_14/stable_rank_gate_proj": 78.85205841064453, "geo/layer_14/stable_rank_down_proj": 134.34410095214844, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3920742869377136, "geo/layer_14/attn_entropy_mean": 5.474823951721191, "geo/layer_14/attn_entropy_std": 0.3782445788383484, "geo/layer_21/stable_rank_q_proj": 44.362327575683594, "geo/layer_21/stable_rank_k_proj": 31.216657638549805, "geo/layer_21/stable_rank_o_proj": 77.77042388916016, "geo/layer_21/stable_rank_gate_proj": 75.2252197265625, "geo/layer_21/stable_rank_down_proj": 56.649253845214844, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14485852420330048, "geo/layer_21/attn_entropy_mean": 5.709291458129883, "geo/layer_21/attn_entropy_std": 0.2959035336971283, "geo/layer_27/stable_rank_q_proj": 42.356605529785156, "geo/layer_27/stable_rank_k_proj": 31.5059814453125, "geo/layer_27/stable_rank_o_proj": 117.81011199951172, "geo/layer_27/stable_rank_gate_proj": 86.69330596923828, "geo/layer_27/stable_rank_down_proj": 132.85923767089844, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.091123566031456, "geo/layer_27/attn_entropy_mean": 4.26827335357666, "geo/layer_27/attn_entropy_std": 0.6500847339630127, "attnres/final_alpha/block_0": 0.24219605326652527, "attnres/block_norm/0": 1.7053020000457764, "attnres/final_alpha/block_1": 0.005493274424225092, "attnres/block_norm/1": 37922.44140625, "attnres/final_alpha/block_2": 0.011925147846341133, "attnres/block_norm/2": 25438.3203125, "attnres/final_alpha/block_3": 0.013995185494422913, "attnres/block_norm/3": 43326.6328125, "attnres/final_alpha/block_4": 0.017273422330617905, "attnres/block_norm/4": 12141.576171875, "attnres/final_alpha/block_5": 0.5882361531257629, "attnres/block_norm/5": 5813.150390625, "attnres/final_alpha/block_6": 0.12088076025247574, "attnres/block_norm/6": 28643.5703125, "geo/tier1_time_s": 1.3583476543426514, "geo/step": 35025.0, "geo/rankme_slope": -3.137655452806122e-05} {"step": 35030, "timestamp": 1778232403.004294, "train/loss": 2.1992230892181395, "train/z_loss": 0.0014339063316583633, "train/perplexity": 9.01800459214034, "train/grad_norm": 0.1865234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1700888.6357279017, "perf/iters_per_sec": 0.8110469034804829, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2329743146896361, "data/tokens_consumed": 73465331712, "data/tokens_consumed_B": 73.465331712, "train/loss_slope": 1.3356718793846645e-05} {"step": 35040, "timestamp": 1778232413.3562288, "train/loss": 2.2020256996154783, "train/z_loss": 0.0014262417913414537, "train/perplexity": 9.04331399520529, "train/grad_norm": 0.09814453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027040.6161105898, "perf/iters_per_sec": 0.9665682869484853, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345880508422851, "data/tokens_consumed": 73486303232, "data/tokens_consumed_B": 73.486303232, "train/loss_slope": 9.867796159670372e-06} {"step": 35050, "timestamp": 1778232423.7036324, "grad/layer_0/attn": 0.00275576114654541, "grad/layer_0/mlp": 0.002833763137459755, "grad/layer_0/attn_mlp_ratio": 0.9724740268053348, "grad/layer_4/attn": 0.0016166851855814457, "grad/layer_4/mlp": 0.0024194372817873955, "grad/layer_4/attn_mlp_ratio": 0.6682070789478731, "grad/layer_8/attn": 0.00645109498873353, "grad/layer_8/mlp": 0.0037049194797873497, "grad/layer_8/attn_mlp_ratio": 1.7412240265425145, "grad/layer_12/attn": 0.004959775600582361, "grad/layer_12/mlp": 0.006024294998496771, "grad/layer_12/attn_mlp_ratio": 0.8232955921797329, "grad/layer_16/attn": 0.003437468782067299, "grad/layer_16/mlp": 0.004597593564540148, "grad/layer_16/attn_mlp_ratio": 0.747666939029314, "grad/layer_20/attn": 0.003359424415975809, "grad/layer_20/mlp": 0.00592105882242322, "grad/layer_20/attn_mlp_ratio": 0.5673688541172209, "grad/layer_24/attn": 0.005319599993526936, "grad/layer_24/mlp": 0.007591315545141697, "grad/layer_24/attn_mlp_ratio": 0.7007480971933215, "grad/layer_27/attn": 0.006705989129841328, "grad/layer_27/mlp": 0.007043702062219381, "grad/layer_27/attn_mlp_ratio": 0.9520546121059094} {"step": 35050, "timestamp": 1778232423.7194135, "train/loss": 2.1780359745025635, "train/z_loss": 0.0014460248756222426, "train/perplexity": 8.828948938360005, "train/grad_norm": 0.083984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024633.615557124, "perf/iters_per_sec": 0.9654205396447773, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0358180284500123, "data/tokens_consumed": 73507274752, "data/tokens_consumed_B": 73.507274752, "train/loss_slope": 9.859095905909342e-06} {"step": 35060, "timestamp": 1778232434.1161666, "train/loss": 2.1508031606674196, "train/z_loss": 0.0014365094481036066, "train/perplexity": 8.591756187423988, "train/grad_norm": 0.2138671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2018189.954271594, "perf/iters_per_sec": 0.9623479625089617, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.039125180244446, "data/tokens_consumed": 73528246272, "data/tokens_consumed_B": 73.528246272, "train/loss_slope": 7.798409118617941e-06} {"step": 35070, "timestamp": 1778232444.4930792, "train/loss": 2.171972322463989, "train/z_loss": 0.0014420193387195469, "train/perplexity": 8.775575247412746, "train/grad_norm": 0.1015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022087.8862395869, "perf/iters_per_sec": 0.9642066413114485, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037122082710266, "data/tokens_consumed": 73549217792, "data/tokens_consumed_B": 73.549217792, "train/loss_slope": 6.37269249700612e-06} {"step": 35080, "timestamp": 1778232454.8727195, "train/loss": 2.1569724917411803, "train/z_loss": 0.0014346642186865211, "train/perplexity": 8.644925416499966, "train/grad_norm": 0.11865234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021401.307666435, "perf/iters_per_sec": 0.9638792551357436, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0374743461608886, "data/tokens_consumed": 73570189312, "data/tokens_consumed_B": 73.570189312, "train/loss_slope": 5.862179869758446e-06} {"step": 35090, "timestamp": 1778232465.252177, "train/loss": 2.212983989715576, "train/z_loss": 0.001428781228605658, "train/perplexity": 9.142958221461292, "train/grad_norm": 0.1572265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021484.7874440188, "perf/iters_per_sec": 0.9639190613956541, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037431502342224, "data/tokens_consumed": 73591160832, "data/tokens_consumed_B": 73.591160832, "train/loss_slope": 7.83758003218833e-06} {"step": 35100, "timestamp": 1778232475.618783, "grad/layer_0/attn": 0.0029819298069924116, "grad/layer_0/mlp": 0.0029098056256771088, "grad/layer_0/attn_mlp_ratio": 1.0247865624425905, "grad/layer_4/attn": 0.0018604072974994779, "grad/layer_4/mlp": 0.0024400276597589254, "grad/layer_4/attn_mlp_ratio": 0.7624533327781836, "grad/layer_8/attn": 0.006537930574268103, "grad/layer_8/mlp": 0.0037472506519407034, "grad/layer_8/attn_mlp_ratio": 1.7447272699544072, "grad/layer_12/attn": 0.0050597297959029675, "grad/layer_12/mlp": 0.006345207802951336, "grad/layer_12/attn_mlp_ratio": 0.7974096157746926, "grad/layer_16/attn": 0.003513224422931671, "grad/layer_16/mlp": 0.0042723133228719234, "grad/layer_16/attn_mlp_ratio": 0.8223236628950366, "grad/layer_20/attn": 0.0039099701680243015, "grad/layer_20/mlp": 0.005869911052286625, "grad/layer_20/attn_mlp_ratio": 0.6661038074658374, "grad/layer_24/attn": 0.008662394247949123, "grad/layer_24/mlp": 0.0106444600969553, "grad/layer_24/attn_mlp_ratio": 0.8137936623997976, "grad/layer_27/attn": 0.0056389071978628635, "grad/layer_27/mlp": 0.009572597220540047, "grad/layer_27/attn_mlp_ratio": 0.5890676280473417} {"step": 35100, "timestamp": 1778232476.2289262, "eos/sharpness": 16.313266754150387, "eos/L0_probe": 2.008681297302246, "eos/L_plus": 2.101391077041626, "eos/L_minus": 2.07910418510437, "eos/grad_norm": 0.11544711142778397, "eos/embed_grad_frac": 0.16777659952640533, "eos/time_s": 0.6073503494262695} {"step": 35100, "timestamp": 1778232476.2496917, "train/loss": 2.182831144332886, "train/z_loss": 0.0014296583482064306, "train/perplexity": 8.871386915278995, "train/grad_norm": 0.11572265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1907799.1466632194, "perf/iters_per_sec": 0.9097095235172364, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0992519855499268, "data/tokens_consumed": 73612132352, "data/tokens_consumed_B": 73.612132352, "train/loss_slope": 7.879822582516071e-06} {"step": 35100, "timestamp": 1778232477.6154447, "geo/rankme_last": 439.07952880859375, "geo/layer_0/stable_rank_q_proj": 18.943157196044922, "geo/layer_0/stable_rank_k_proj": 16.241975784301758, "geo/layer_0/stable_rank_o_proj": 49.9765510559082, "geo/layer_0/stable_rank_gate_proj": 140.9345245361328, "geo/layer_0/stable_rank_down_proj": 52.736698150634766, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.062142424285411835, "geo/layer_0/attn_entropy_mean": 6.195611000061035, "geo/layer_0/attn_entropy_std": 0.36658796668052673, "geo/layer_7/stable_rank_q_proj": 43.261051177978516, "geo/layer_7/stable_rank_k_proj": 42.420413970947266, "geo/layer_7/stable_rank_o_proj": 101.55278015136719, "geo/layer_7/stable_rank_gate_proj": 92.41356658935547, "geo/layer_7/stable_rank_down_proj": 145.9518585205078, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5128747224807739, "geo/layer_7/attn_entropy_mean": 4.623138427734375, "geo/layer_7/attn_entropy_std": 0.8058304190635681, "geo/layer_14/stable_rank_q_proj": 54.53178405761719, "geo/layer_14/stable_rank_k_proj": 36.78190994262695, "geo/layer_14/stable_rank_o_proj": 50.18373107910156, "geo/layer_14/stable_rank_gate_proj": 78.90628051757812, "geo/layer_14/stable_rank_down_proj": 134.3626708984375, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37497562170028687, "geo/layer_14/attn_entropy_mean": 5.49713134765625, "geo/layer_14/attn_entropy_std": 0.36845290660858154, "geo/layer_21/stable_rank_q_proj": 44.381591796875, "geo/layer_21/stable_rank_k_proj": 31.186437606811523, "geo/layer_21/stable_rank_o_proj": 77.9470443725586, "geo/layer_21/stable_rank_gate_proj": 75.23839569091797, "geo/layer_21/stable_rank_down_proj": 56.59049987792969, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14302869141101837, "geo/layer_21/attn_entropy_mean": 5.722026348114014, "geo/layer_21/attn_entropy_std": 0.29628440737724304, "geo/layer_27/stable_rank_q_proj": 42.37926483154297, "geo/layer_27/stable_rank_k_proj": 31.438365936279297, "geo/layer_27/stable_rank_o_proj": 117.926513671875, "geo/layer_27/stable_rank_gate_proj": 86.66287994384766, "geo/layer_27/stable_rank_down_proj": 132.9946746826172, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09023195505142212, "geo/layer_27/attn_entropy_mean": 4.29484748840332, "geo/layer_27/attn_entropy_std": 0.6598519086837769, "attnres/final_alpha/block_0": 0.24262209236621857, "attnres/block_norm/0": 1.7056554555892944, "attnres/final_alpha/block_1": 0.005450839176774025, "attnres/block_norm/1": 38014.31640625, "attnres/final_alpha/block_2": 0.011805367656052113, "attnres/block_norm/2": 25528.578125, "attnres/final_alpha/block_3": 0.013690547086298466, "attnres/block_norm/3": 43204.7421875, "attnres/final_alpha/block_4": 0.01724664494395256, "attnres/block_norm/4": 12173.908203125, "attnres/final_alpha/block_5": 0.5880609750747681, "attnres/block_norm/5": 5839.060546875, "attnres/final_alpha/block_6": 0.12112349271774292, "attnres/block_norm/6": 28542.71484375, "geo/tier1_time_s": 1.3614227771759033, "geo/step": 35100.0, "geo/rankme_slope": -2.1027395333133258e-05} {"step": 35110, "timestamp": 1778232487.9903302, "train/loss": 2.142467474937439, "train/z_loss": 0.0014319620910100638, "train/perplexity": 8.520435673638543, "train/grad_norm": 0.09375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1786860.3331537012, "perf/iters_per_sec": 0.8520414033669, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.173651885986328, "data/tokens_consumed": 73633103872, "data/tokens_consumed_B": 73.633103872, "train/loss_slope": 6.9736680146133036e-06} {"step": 35120, "timestamp": 1778232498.3640568, "train/loss": 2.1752997159957888, "train/z_loss": 0.0014309583115391434, "train/perplexity": 8.80482367325485, "train/grad_norm": 0.1259765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022576.6982607772, "perf/iters_per_sec": 0.9644397250465284, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0368714332580566, "data/tokens_consumed": 73654075392, "data/tokens_consumed_B": 73.654075392, "train/loss_slope": 7.427181211849797e-06} {"step": 35130, "timestamp": 1778232508.7389753, "train/loss": 2.2396524429321287, "train/z_loss": 0.001426903496030718, "train/perplexity": 9.39006713603528, "train/grad_norm": 0.1171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022476.294202257, "perf/iters_per_sec": 0.9643918486605916, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0369229078292848, "data/tokens_consumed": 73675046912, "data/tokens_consumed_B": 73.675046912, "train/loss_slope": 1.0500298682326384e-05} {"step": 35140, "timestamp": 1778232519.1156654, "train/loss": 2.223792862892151, "train/z_loss": 0.0014310000231489539, "train/perplexity": 9.242319320720458, "train/grad_norm": 0.2099609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021959.550066807, "perf/iters_per_sec": 0.9641454458555255, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0371879100799561, "data/tokens_consumed": 73696018432, "data/tokens_consumed_B": 73.696018432, "train/loss_slope": 9.216479914631155e-06} {"step": 35150, "timestamp": 1778232529.4895134, "grad/layer_0/attn": 0.003341676201671362, "grad/layer_0/mlp": 0.0030515051912516356, "grad/layer_0/attn_mlp_ratio": 1.0950910723476763, "grad/layer_4/attn": 0.0030226227827370167, "grad/layer_4/mlp": 0.002535806503146887, "grad/layer_4/attn_mlp_ratio": 1.1919768561947903, "grad/layer_8/attn": 0.008390265516936779, "grad/layer_8/mlp": 0.003693397156894207, "grad/layer_8/attn_mlp_ratio": 2.2716932226218707, "grad/layer_12/attn": 0.0044581168331205845, "grad/layer_12/mlp": 0.006717537064105272, "grad/layer_12/attn_mlp_ratio": 0.6636534676640489, "grad/layer_16/attn": 0.00404376769438386, "grad/layer_16/mlp": 0.004680824466049671, "grad/layer_16/attn_mlp_ratio": 0.8639007160647663, "grad/layer_20/attn": 0.0035110171884298325, "grad/layer_20/mlp": 0.006068773102015257, "grad/layer_20/attn_mlp_ratio": 0.5785382105338736, "grad/layer_24/attn": 0.014397742226719856, "grad/layer_24/mlp": 0.01128931250423193, "grad/layer_24/attn_mlp_ratio": 1.2753426830720143, "grad/layer_27/attn": 0.00734983803704381, "grad/layer_27/mlp": 0.010936906561255455, "grad/layer_27/attn_mlp_ratio": 0.6720216478651119} {"step": 35150, "timestamp": 1778232529.5052967, "train/loss": 2.169326829910278, "train/z_loss": 0.0014419913524761796, "train/perplexity": 8.752390209885212, "train/grad_norm": 0.1875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019765.9939407662, "perf/iters_per_sec": 0.9630994767860251, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0383143424987793, "data/tokens_consumed": 73716989952, "data/tokens_consumed_B": 73.716989952, "train/loss_slope": 5.0654749951847295e-06} {"step": 35160, "timestamp": 1778232539.8949535, "train/loss": 2.13781623840332, "train/z_loss": 0.001434892148245126, "train/perplexity": 8.48089713477631, "train/grad_norm": 0.107421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019673.7986254974, "perf/iters_per_sec": 0.9630555146338927, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0383617401123046, "data/tokens_consumed": 73737961472, "data/tokens_consumed_B": 73.737961472, "train/loss_slope": 7.567147682137002e-07} {"step": 35170, "timestamp": 1778232550.7084439, "train/loss": 2.1541093826293944, "train/z_loss": 0.0014344026683829724, "train/perplexity": 8.620209450906296, "train/grad_norm": 0.1806640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1940589.130730499, "perf/iters_per_sec": 0.9253450063374038, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0806780099868774, "data/tokens_consumed": 73758932992, "data/tokens_consumed_B": 73.758932992, "train/loss_slope": -1.2905717647150977e-06} {"step": 35175, "timestamp": 1778232556.494108, "eos/sharpness": 3.953838348388671, "eos/L0_probe": 2.008106231689453, "eos/L_plus": 2.029174566268921, "eos/L_minus": 2.026576280593872, "eos/grad_norm": 0.09134458005428314, "eos/embed_grad_frac": 0.28963202238082886, "eos/time_s": 0.6054768562316895} {"step": 35175, "timestamp": 1778232557.8720524, "geo/rankme_last": 438.8143310546875, "geo/layer_0/stable_rank_q_proj": 18.959999084472656, "geo/layer_0/stable_rank_k_proj": 16.25638198852539, "geo/layer_0/stable_rank_o_proj": 49.93661880493164, "geo/layer_0/stable_rank_gate_proj": 140.5876007080078, "geo/layer_0/stable_rank_down_proj": 52.770774841308594, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.058314647525548935, "geo/layer_0/attn_entropy_mean": 6.201457500457764, "geo/layer_0/attn_entropy_std": 0.36299413442611694, "geo/layer_7/stable_rank_q_proj": 43.25822067260742, "geo/layer_7/stable_rank_k_proj": 42.31586456298828, "geo/layer_7/stable_rank_o_proj": 101.32131958007812, "geo/layer_7/stable_rank_gate_proj": 92.33854675292969, "geo/layer_7/stable_rank_down_proj": 145.9015655517578, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5171704888343811, "geo/layer_7/attn_entropy_mean": 4.636923789978027, "geo/layer_7/attn_entropy_std": 0.8367297053337097, "geo/layer_14/stable_rank_q_proj": 54.641563415527344, "geo/layer_14/stable_rank_k_proj": 36.67185974121094, "geo/layer_14/stable_rank_o_proj": 50.24950408935547, "geo/layer_14/stable_rank_gate_proj": 78.84294891357422, "geo/layer_14/stable_rank_down_proj": 134.2435760498047, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39090222120285034, "geo/layer_14/attn_entropy_mean": 5.51985502243042, "geo/layer_14/attn_entropy_std": 0.3751031160354614, "geo/layer_21/stable_rank_q_proj": 44.24697494506836, "geo/layer_21/stable_rank_k_proj": 31.135433197021484, "geo/layer_21/stable_rank_o_proj": 78.03112030029297, "geo/layer_21/stable_rank_gate_proj": 75.32489776611328, "geo/layer_21/stable_rank_down_proj": 56.7413330078125, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14770947396755219, "geo/layer_21/attn_entropy_mean": 5.7225341796875, "geo/layer_21/attn_entropy_std": 0.2995989918708801, "geo/layer_27/stable_rank_q_proj": 42.31705856323242, "geo/layer_27/stable_rank_k_proj": 31.43022346496582, "geo/layer_27/stable_rank_o_proj": 117.95372009277344, "geo/layer_27/stable_rank_gate_proj": 86.72991180419922, "geo/layer_27/stable_rank_down_proj": 132.73175048828125, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08405669033527374, "geo/layer_27/attn_entropy_mean": 4.292702674865723, "geo/layer_27/attn_entropy_std": 0.6630955338478088, "attnres/final_alpha/block_0": 0.24180589616298676, "attnres/block_norm/0": 1.7058155536651611, "attnres/final_alpha/block_1": 0.005456362850964069, "attnres/block_norm/1": 38152.1484375, "attnres/final_alpha/block_2": 0.011747175827622414, "attnres/block_norm/2": 25573.11328125, "attnres/final_alpha/block_3": 0.013805249705910683, "attnres/block_norm/3": 43500.66015625, "attnres/final_alpha/block_4": 0.01721072755753994, "attnres/block_norm/4": 12174.751953125, "attnres/final_alpha/block_5": 0.5881765484809875, "attnres/block_norm/5": 5850.22412109375, "attnres/final_alpha/block_6": 0.12179800868034363, "attnres/block_norm/6": 28782.935546875, "geo/tier1_time_s": 1.357978343963623, "geo/step": 35175.0, "geo/rankme_slope": -3.957295808948579e-05} {"step": 35180, "timestamp": 1778232563.0619545, "train/loss": 2.180528998374939, "train/z_loss": 0.0014419739018194377, "train/perplexity": 8.850987178346237, "train/grad_norm": 0.251953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1698395.7246850093, "perf/iters_per_sec": 0.8098581908631369, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2347840785980224, "data/tokens_consumed": 73779904512, "data/tokens_consumed_B": 73.779904512, "train/loss_slope": -5.214702593039168e-06} {"step": 35190, "timestamp": 1778232573.439642, "train/loss": 2.130539131164551, "train/z_loss": 0.0014542934601195156, "train/perplexity": 8.419404751546272, "train/grad_norm": 0.1787109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021707.5732989956, "perf/iters_per_sec": 0.9640252939696291, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373171806335448, "data/tokens_consumed": 73800876032, "data/tokens_consumed_B": 73.800876032, "train/loss_slope": -7.999959165113599e-06} {"step": 35200, "timestamp": 1778232583.807057, "grad/layer_0/attn": 0.002862316556274891, "grad/layer_0/mlp": 0.0026940128300338984, "grad/layer_0/attn_mlp_ratio": 1.0624732065554245, "grad/layer_4/attn": 0.001836411189287901, "grad/layer_4/mlp": 0.002579975174739957, "grad/layer_4/attn_mlp_ratio": 0.7117941041016365, "grad/layer_8/attn": 0.003440556116402149, "grad/layer_8/mlp": 0.0035774584393948317, "grad/layer_8/attn_mlp_ratio": 0.9617319330230893, "grad/layer_12/attn": 0.004266649484634399, "grad/layer_12/mlp": 0.005936179775744677, "grad/layer_12/attn_mlp_ratio": 0.7187534026837691, "grad/layer_16/attn": 0.006347094662487507, "grad/layer_16/mlp": 0.004283884074538946, "grad/layer_16/attn_mlp_ratio": 1.4816214453722047, "grad/layer_20/attn": 0.0038532312028110027, "grad/layer_20/mlp": 0.005452383775264025, "grad/layer_20/attn_mlp_ratio": 0.7067057806204842, "grad/layer_24/attn": 0.014970134012401104, "grad/layer_24/mlp": 0.009631656110286713, "grad/layer_24/attn_mlp_ratio": 1.5542637408935798, "grad/layer_27/attn": 0.006178359966725111, "grad/layer_27/mlp": 0.008698228746652603, "grad/layer_27/attn_mlp_ratio": 0.7103009216758847} {"step": 35200, "timestamp": 1778232583.8227339, "train/loss": 2.1631598114967345, "train/z_loss": 0.0014452814706601203, "train/perplexity": 8.698580152647661, "train/grad_norm": 0.1865234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020814.913761441, "perf/iters_per_sec": 0.9635996407324986, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0377753973007202, "data/tokens_consumed": 73821847552, "data/tokens_consumed_B": 73.821847552, "train/loss_slope": -6.821441843529002e-06} {"step": 35210, "timestamp": 1778232594.1970654, "train/loss": 2.1781551599502564, "train/z_loss": 0.0014405493275262415, "train/perplexity": 8.830001283302737, "train/grad_norm": 0.09033203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022575.8146243792, "perf/iters_per_sec": 0.9644393036958595, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036871886253357, "data/tokens_consumed": 73842819072, "data/tokens_consumed_B": 73.842819072, "train/loss_slope": -8.149996949787311e-06} {"step": 35220, "timestamp": 1778232604.5752914, "train/loss": 2.1826250553131104, "train/z_loss": 0.0014157851808704435, "train/perplexity": 8.869558808228442, "train/grad_norm": 0.1689453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021892.2044968328, "perf/iters_per_sec": 0.9641133329853214, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0372224569320678, "data/tokens_consumed": 73863790592, "data/tokens_consumed_B": 73.863790592, "train/loss_slope": -8.708118433856491e-06} {"step": 35230, "timestamp": 1778232614.9534671, "train/loss": 2.2325171709060667, "train/z_loss": 0.001427070889621973, "train/perplexity": 9.323304919215904, "train/grad_norm": 0.130859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021803.439757388, "perf/iters_per_sec": 0.964071006659216, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0372679948806762, "data/tokens_consumed": 73884762112, "data/tokens_consumed_B": 73.884762112, "train/loss_slope": -2.5424122703064864e-06} {"step": 35240, "timestamp": 1778232625.3295588, "train/loss": 2.1482494354248045, "train/z_loss": 0.0014441559673286975, "train/perplexity": 8.569843194560098, "train/grad_norm": 0.1689453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022141.624006696, "perf/iters_per_sec": 0.964232265475605, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037094521522522, "data/tokens_consumed": 73905733632, "data/tokens_consumed_B": 73.905733632, "train/loss_slope": -5.361443856368385e-06} {"step": 35250, "timestamp": 1778232635.6950688, "grad/layer_0/attn": 0.0026541375555098057, "grad/layer_0/mlp": 0.0026615390088409185, "grad/layer_0/attn_mlp_ratio": 0.9972190702340135, "grad/layer_4/attn": 0.001736326958052814, "grad/layer_4/mlp": 0.002286250237375498, "grad/layer_4/attn_mlp_ratio": 0.7594649324561856, "grad/layer_8/attn": 0.004463883116841316, "grad/layer_8/mlp": 0.0035299763549119234, "grad/layer_8/attn_mlp_ratio": 1.264564558392414, "grad/layer_12/attn": 0.0032972933258861303, "grad/layer_12/mlp": 0.006036759819835424, "grad/layer_12/attn_mlp_ratio": 0.5462024943301078, "grad/layer_16/attn": 0.00394043792039156, "grad/layer_16/mlp": 0.004496979061514139, "grad/layer_16/attn_mlp_ratio": 0.8762410895995366, "grad/layer_20/attn": 0.006184134632349014, "grad/layer_20/mlp": 0.006701235193759203, "grad/layer_20/attn_mlp_ratio": 0.9228350238810806, "grad/layer_24/attn": 0.01620345190167427, "grad/layer_24/mlp": 0.010948660783469677, "grad/layer_24/attn_mlp_ratio": 1.4799482853778305, "grad/layer_27/attn": 0.008247838355600834, "grad/layer_27/mlp": 0.010318445973098278, "grad/layer_27/attn_mlp_ratio": 0.7993295014744686} {"step": 35250, "timestamp": 1778232636.3031764, "eos/sharpness": 76.56755447387694, "eos/L0_probe": 2.0087974071502686, "eos/L_plus": 2.514846086502075, "eos/L_minus": 2.2684242725372314, "eos/grad_norm": 0.20596638321876526, "eos/embed_grad_frac": 0.050944458693265915, "eos/time_s": 0.60530686378479} {"step": 35250, "timestamp": 1778232636.322597, "train/loss": 2.1564709186553954, "train/z_loss": 0.0014383612433448434, "train/perplexity": 8.640590441826625, "train/grad_norm": 0.2060546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1908876.010788204, "perf/iters_per_sec": 0.9102230123463649, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0986318588256836, "data/tokens_consumed": 73926705152, "data/tokens_consumed_B": 73.926705152, "train/loss_slope": -1.1809176752025817e-05} {"step": 35250, "timestamp": 1778232637.6855109, "geo/rankme_last": 439.4234619140625, "geo/layer_0/stable_rank_q_proj": 18.9910945892334, "geo/layer_0/stable_rank_k_proj": 16.269588470458984, "geo/layer_0/stable_rank_o_proj": 49.938289642333984, "geo/layer_0/stable_rank_gate_proj": 140.71279907226562, "geo/layer_0/stable_rank_down_proj": 52.766639709472656, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06287345290184021, "geo/layer_0/attn_entropy_mean": 6.204346656799316, "geo/layer_0/attn_entropy_std": 0.36456578969955444, "geo/layer_7/stable_rank_q_proj": 43.329185485839844, "geo/layer_7/stable_rank_k_proj": 42.16691970825195, "geo/layer_7/stable_rank_o_proj": 101.28834533691406, "geo/layer_7/stable_rank_gate_proj": 92.01312255859375, "geo/layer_7/stable_rank_down_proj": 146.220703125, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.529613733291626, "geo/layer_7/attn_entropy_mean": 4.633780479431152, "geo/layer_7/attn_entropy_std": 0.8031231164932251, "geo/layer_14/stable_rank_q_proj": 54.627193450927734, "geo/layer_14/stable_rank_k_proj": 36.706581115722656, "geo/layer_14/stable_rank_o_proj": 50.1356201171875, "geo/layer_14/stable_rank_gate_proj": 78.7120132446289, "geo/layer_14/stable_rank_down_proj": 134.182861328125, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3786211907863617, "geo/layer_14/attn_entropy_mean": 5.4802141189575195, "geo/layer_14/attn_entropy_std": 0.39079225063323975, "geo/layer_21/stable_rank_q_proj": 44.22282791137695, "geo/layer_21/stable_rank_k_proj": 31.29737663269043, "geo/layer_21/stable_rank_o_proj": 77.98963165283203, "geo/layer_21/stable_rank_gate_proj": 75.32185363769531, "geo/layer_21/stable_rank_down_proj": 56.77457046508789, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14672890305519104, "geo/layer_21/attn_entropy_mean": 5.715275764465332, "geo/layer_21/attn_entropy_std": 0.29103025794029236, "geo/layer_27/stable_rank_q_proj": 42.285484313964844, "geo/layer_27/stable_rank_k_proj": 31.44146728515625, "geo/layer_27/stable_rank_o_proj": 117.8196792602539, "geo/layer_27/stable_rank_gate_proj": 86.62494659423828, "geo/layer_27/stable_rank_down_proj": 132.67140197753906, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08771912008523941, "geo/layer_27/attn_entropy_mean": 4.2777605056762695, "geo/layer_27/attn_entropy_std": 0.6498169302940369, "attnres/final_alpha/block_0": 0.2388973832130432, "attnres/block_norm/0": 1.706014633178711, "attnres/final_alpha/block_1": 0.005395590327680111, "attnres/block_norm/1": 38115.9921875, "attnres/final_alpha/block_2": 0.011446230113506317, "attnres/block_norm/2": 25499.55078125, "attnres/final_alpha/block_3": 0.013460349291563034, "attnres/block_norm/3": 43584.828125, "attnres/final_alpha/block_4": 0.016817469149827957, "attnres/block_norm/4": 12143.451171875, "attnres/final_alpha/block_5": 0.5962660908699036, "attnres/block_norm/5": 5802.47216796875, "attnres/final_alpha/block_6": 0.11771689355373383, "attnres/block_norm/6": 29017.1875, "geo/tier1_time_s": 1.3588933944702148, "geo/step": 35250.0, "geo/rankme_slope": -6.445890856342537e-05} {"step": 35260, "timestamp": 1778232648.0671992, "train/loss": 2.2098931312561034, "train/z_loss": 0.0014254477573558688, "train/perplexity": 9.114742259932783, "train/grad_norm": 0.26953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1786222.0303317844, "perf/iters_per_sec": 0.8517370368632242, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1740712881088258, "data/tokens_consumed": 73947676672, "data/tokens_consumed_B": 73.947676672, "train/loss_slope": -8.874065554825179e-06} {"step": 35270, "timestamp": 1778232658.4398735, "train/loss": 2.1753811120986937, "train/z_loss": 0.0014290757826529443, "train/perplexity": 8.80554038075682, "train/grad_norm": 0.09228515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023042.7612594904, "perf/iters_per_sec": 0.9646619612023785, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0366325616836547, "data/tokens_consumed": 73968648192, "data/tokens_consumed_B": 73.968648192, "train/loss_slope": -8.53546054163679e-06} {"step": 35280, "timestamp": 1778232668.8164332, "train/loss": 2.177136492729187, "train/z_loss": 0.0014354088460095226, "train/perplexity": 8.821011030248904, "train/grad_norm": 0.1875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022151.9907039006, "perf/iters_per_sec": 0.96423720870204, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037089204788208, "data/tokens_consumed": 73989619712, "data/tokens_consumed_B": 73.989619712, "train/loss_slope": -7.758477998144705e-06} {"step": 35290, "timestamp": 1778232679.2084596, "train/loss": 2.1515282511711122, "train/z_loss": 0.001434783823788166, "train/perplexity": 8.59798824737623, "train/grad_norm": 0.1376953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019272.8840088525, "perf/iters_per_sec": 0.9628643436474097, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0385679006576538, "data/tokens_consumed": 74010591232, "data/tokens_consumed_B": 74.010591232, "train/loss_slope": -1.2292643234317065e-05} {"step": 35300, "timestamp": 1778232689.5935252, "grad/layer_0/attn": 0.0030589879024773836, "grad/layer_0/mlp": 0.0029386922251433134, "grad/layer_0/attn_mlp_ratio": 1.0409350704409668, "grad/layer_4/attn": 0.0018395279766991735, "grad/layer_4/mlp": 0.0026018654461950064, "grad/layer_4/attn_mlp_ratio": 0.7070034727156894, "grad/layer_8/attn": 0.0044251903891563416, "grad/layer_8/mlp": 0.0038165031000971794, "grad/layer_8/attn_mlp_ratio": 1.1594881904052021, "grad/layer_12/attn": 0.005532546900212765, "grad/layer_12/mlp": 0.0064529720693826675, "grad/layer_12/attn_mlp_ratio": 0.8573641346948571, "grad/layer_16/attn": 0.010929624550044537, "grad/layer_16/mlp": 0.004685458727180958, "grad/layer_16/attn_mlp_ratio": 2.332668998528033, "grad/layer_20/attn": 0.003866836428642273, "grad/layer_20/mlp": 0.006903754081577063, "grad/layer_20/attn_mlp_ratio": 0.5601063315610334, "grad/layer_24/attn": 0.017082907259464264, "grad/layer_24/mlp": 0.01244085282087326, "grad/layer_24/attn_mlp_ratio": 1.3731299106351917, "grad/layer_27/attn": 0.0068545714020729065, "grad/layer_27/mlp": 0.012648831121623516, "grad/layer_27/attn_mlp_ratio": 0.5419134212459751} {"step": 35300, "timestamp": 1778232689.6093378, "train/loss": 2.1516807794570925, "train/z_loss": 0.0014304663054645061, "train/perplexity": 8.59929978380714, "train/grad_norm": 0.189453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2017931.925710241, "perf/iters_per_sec": 0.9622249249030309, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0392580509185791, "data/tokens_consumed": 74031562752, "data/tokens_consumed_B": 74.031562752, "train/loss_slope": -1.2984970400650316e-05} {"step": 35310, "timestamp": 1778232700.0023644, "train/loss": 2.202898716926575, "train/z_loss": 0.0014391749165952206, "train/perplexity": 9.05121241209847, "train/grad_norm": 0.1533203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019779.2117658472, "perf/iters_per_sec": 0.9631057795361744, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.038307547569275, "data/tokens_consumed": 74052534272, "data/tokens_consumed_B": 74.052534272, "train/loss_slope": -8.428524322349547e-06} {"step": 35320, "timestamp": 1778232710.3920286, "train/loss": 2.1817647933959963, "train/z_loss": 0.0014301383984275163, "train/perplexity": 8.861931945584214, "train/grad_norm": 0.1044921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019721.2401333838, "perf/iters_per_sec": 0.9630781365076941, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0383373498916626, "data/tokens_consumed": 74073505792, "data/tokens_consumed_B": 74.073505792, "train/loss_slope": -8.103355587405061e-06} {"step": 35325, "timestamp": 1778232716.1783435, "eos/sharpness": 65.9558057785034, "eos/L0_probe": 2.0095551013946533, "eos/L_plus": 2.301663875579834, "eos/L_minus": 2.377004384994507, "eos/grad_norm": 0.23596982657909393, "eos/embed_grad_frac": 0.038647860288619995, "eos/time_s": 0.6059482097625732} {"step": 35325, "timestamp": 1778232717.5593467, "geo/rankme_last": 439.0854187011719, "geo/layer_0/stable_rank_q_proj": 18.96271324157715, "geo/layer_0/stable_rank_k_proj": 16.281408309936523, "geo/layer_0/stable_rank_o_proj": 49.870296478271484, "geo/layer_0/stable_rank_gate_proj": 140.92410278320312, "geo/layer_0/stable_rank_down_proj": 52.725929260253906, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.056348446756601334, "geo/layer_0/attn_entropy_mean": 6.1997222900390625, "geo/layer_0/attn_entropy_std": 0.36023756861686707, "geo/layer_7/stable_rank_q_proj": 43.3958854675293, "geo/layer_7/stable_rank_k_proj": 42.3054084777832, "geo/layer_7/stable_rank_o_proj": 101.36124420166016, "geo/layer_7/stable_rank_gate_proj": 92.157958984375, "geo/layer_7/stable_rank_down_proj": 146.2881317138672, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5055705308914185, "geo/layer_7/attn_entropy_mean": 4.681088447570801, "geo/layer_7/attn_entropy_std": 0.7987872958183289, "geo/layer_14/stable_rank_q_proj": 54.582218170166016, "geo/layer_14/stable_rank_k_proj": 36.675331115722656, "geo/layer_14/stable_rank_o_proj": 50.149208068847656, "geo/layer_14/stable_rank_gate_proj": 78.65129852294922, "geo/layer_14/stable_rank_down_proj": 134.19651794433594, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38034990429878235, "geo/layer_14/attn_entropy_mean": 5.4945831298828125, "geo/layer_14/attn_entropy_std": 0.3675086200237274, "geo/layer_21/stable_rank_q_proj": 44.189083099365234, "geo/layer_21/stable_rank_k_proj": 31.303682327270508, "geo/layer_21/stable_rank_o_proj": 77.81576538085938, "geo/layer_21/stable_rank_gate_proj": 75.05883026123047, "geo/layer_21/stable_rank_down_proj": 56.771419525146484, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1505805402994156, "geo/layer_21/attn_entropy_mean": 5.745404243469238, "geo/layer_21/attn_entropy_std": 0.2935180962085724, "geo/layer_27/stable_rank_q_proj": 42.197383880615234, "geo/layer_27/stable_rank_k_proj": 31.530529022216797, "geo/layer_27/stable_rank_o_proj": 117.77528381347656, "geo/layer_27/stable_rank_gate_proj": 86.54303741455078, "geo/layer_27/stable_rank_down_proj": 132.6741180419922, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09037046134471893, "geo/layer_27/attn_entropy_mean": 4.290554523468018, "geo/layer_27/attn_entropy_std": 0.648513674736023, "attnres/final_alpha/block_0": 0.24233782291412354, "attnres/block_norm/0": 1.7060456275939941, "attnres/final_alpha/block_1": 0.00548391742631793, "attnres/block_norm/1": 38113.3046875, "attnres/final_alpha/block_2": 0.011863614432513714, "attnres/block_norm/2": 25466.0234375, "attnres/final_alpha/block_3": 0.013689685612916946, "attnres/block_norm/3": 43427.09375, "attnres/final_alpha/block_4": 0.017115168273448944, "attnres/block_norm/4": 12235.85546875, "attnres/final_alpha/block_5": 0.5883241891860962, "attnres/block_norm/5": 5848.3271484375, "attnres/final_alpha/block_6": 0.12118561565876007, "attnres/block_norm/6": 28868.056640625, "geo/tier1_time_s": 1.3611195087432861, "geo/step": 35325.0, "geo/rankme_slope": -7.954871401685674e-05} {"step": 35330, "timestamp": 1778232722.7504256, "train/loss": 2.180134916305542, "train/z_loss": 0.0014370784279890358, "train/perplexity": 8.847499850194666, "train/grad_norm": 0.1669921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1697832.6827922135, "perf/iters_per_sec": 0.8095897115670269, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2351935625076294, "data/tokens_consumed": 74094477312, "data/tokens_consumed_B": 74.094477312, "train/loss_slope": -7.956621262750374e-06} {"step": 35340, "timestamp": 1778232733.1328814, "train/loss": 2.155559849739075, "train/z_loss": 0.0014388450072146952, "train/perplexity": 8.632721853413853, "train/grad_norm": 0.1328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020878.4732465951, "perf/iters_per_sec": 0.9636299482567764, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0377427577972411, "data/tokens_consumed": 74115448832, "data/tokens_consumed_B": 74.115448832, "train/loss_slope": -7.281166892705522e-06} {"step": 35350, "timestamp": 1778232743.5048454, "grad/layer_0/attn": 0.003491902956739068, "grad/layer_0/mlp": 0.0028488885145634413, "grad/layer_0/attn_mlp_ratio": 1.2257070841199458, "grad/layer_4/attn": 0.0018826017621904612, "grad/layer_4/mlp": 0.0025254185311496258, "grad/layer_4/attn_mlp_ratio": 0.7454612629247372, "grad/layer_8/attn": 0.004977173637598753, "grad/layer_8/mlp": 0.0037045686040073633, "grad/layer_8/attn_mlp_ratio": 1.3435231022209866, "grad/layer_12/attn": 0.005073904059827328, "grad/layer_12/mlp": 0.006611577235162258, "grad/layer_12/attn_mlp_ratio": 0.7674271664104806, "grad/layer_16/attn": 0.00502076605334878, "grad/layer_16/mlp": 0.005091647617518902, "grad/layer_16/attn_mlp_ratio": 0.9860788357516883, "grad/layer_20/attn": 0.004049468785524368, "grad/layer_20/mlp": 0.0066583044826984406, "grad/layer_20/attn_mlp_ratio": 0.6081831696385422, "grad/layer_24/attn": 0.01416260190308094, "grad/layer_24/mlp": 0.011233612895011902, "grad/layer_24/attn_mlp_ratio": 1.2607343611863437, "grad/layer_27/attn": 0.006418260280042887, "grad/layer_27/mlp": 0.011525453068315983, "grad/layer_27/attn_mlp_ratio": 0.5568770430378381} {"step": 35350, "timestamp": 1778232743.5206404, "train/loss": 2.2177417039871217, "train/z_loss": 0.0014276502537541092, "train/perplexity": 9.186561447872348, "train/grad_norm": 0.16015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019900.3136479794, "perf/iters_per_sec": 0.9631635254135034, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0382452964782716, "data/tokens_consumed": 74136420352, "data/tokens_consumed_B": 74.136420352, "train/loss_slope": -6.093663637108054e-06} {"step": 35360, "timestamp": 1778232753.9056528, "train/loss": 2.174523210525513, "train/z_loss": 0.001428492774721235, "train/perplexity": 8.797989333302334, "train/grad_norm": 0.150390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020464.1317677826, "perf/iters_per_sec": 0.9634323748434938, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0379555702209473, "data/tokens_consumed": 74157391872, "data/tokens_consumed_B": 74.157391872, "train/loss_slope": -1.1277010302768256e-05} {"step": 35370, "timestamp": 1778232764.2891595, "train/loss": 2.2220064401626587, "train/z_loss": 0.0014319027308374644, "train/perplexity": 9.22582337017017, "train/grad_norm": 0.2216796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020689.0601354472, "perf/iters_per_sec": 0.9635396290471302, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0378400325775146, "data/tokens_consumed": 74178363392, "data/tokens_consumed_B": 74.178363392, "train/loss_slope": -8.548229834427543e-06} {"step": 35380, "timestamp": 1778232774.6720922, "train/loss": 2.1981555223464966, "train/z_loss": 0.0014337370404973627, "train/perplexity": 9.00838240626675, "train/grad_norm": 0.0908203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020812.4531751277, "perf/iters_per_sec": 0.9635984674335135, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0377766609191894, "data/tokens_consumed": 74199334912, "data/tokens_consumed_B": 74.199334912, "train/loss_slope": -9.73741777158521e-06} {"step": 35390, "timestamp": 1778232785.0459585, "train/loss": 2.198967385292053, "train/z_loss": 0.0014294292544946075, "train/perplexity": 9.015698947754387, "train/grad_norm": 0.1337890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022627.5319070374, "perf/iters_per_sec": 0.9644639644179522, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0368453741073609, "data/tokens_consumed": 74220306432, "data/tokens_consumed_B": 74.220306432, "train/loss_slope": -7.094941152097089e-06} {"step": 35400, "timestamp": 1778232795.4157481, "grad/layer_0/attn": 0.0028616469353437424, "grad/layer_0/mlp": 0.002638791920617223, "grad/layer_0/attn_mlp_ratio": 1.0844533835881438, "grad/layer_4/attn": 0.0021316860802471638, "grad/layer_4/mlp": 0.0024517285637557507, "grad/layer_4/attn_mlp_ratio": 0.8694624783566708, "grad/layer_8/attn": 0.005766437854617834, "grad/layer_8/mlp": 0.003600032301619649, "grad/layer_8/attn_mlp_ratio": 1.60177387626387, "grad/layer_12/attn": 0.008499657735228539, "grad/layer_12/mlp": 0.0060465713031589985, "grad/layer_12/attn_mlp_ratio": 1.405698728834648, "grad/layer_16/attn": 0.0037092319689691067, "grad/layer_16/mlp": 0.00466878293082118, "grad/layer_16/attn_mlp_ratio": 0.794475122206889, "grad/layer_20/attn": 0.0038333721458911896, "grad/layer_20/mlp": 0.006007174961268902, "grad/layer_20/attn_mlp_ratio": 0.638132251315057, "grad/layer_24/attn": 0.0125874700024724, "grad/layer_24/mlp": 0.01177087239921093, "grad/layer_24/attn_mlp_ratio": 1.0693744243101955, "grad/layer_27/attn": 0.006106473971158266, "grad/layer_27/mlp": 0.011790066957473755, "grad/layer_27/attn_mlp_ratio": 0.5179337777631516} {"step": 35400, "timestamp": 1778232796.0224915, "eos/sharpness": 55.639505386352525, "eos/L0_probe": 2.006542682647705, "eos/L_plus": 2.342782735824585, "eos/L_minus": 2.2266976833343506, "eos/grad_norm": 0.1905844360589981, "eos/embed_grad_frac": 0.06083974987268448, "eos/time_s": 0.6038918495178223} {"step": 35400, "timestamp": 1778232796.0424225, "train/loss": 2.17888400554657, "train/z_loss": 0.0014400466228835286, "train/perplexity": 8.836439336741964, "train/grad_norm": 0.1904296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1908123.8167947177, "perf/iters_per_sec": 0.9098643383000935, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0990649461746216, "data/tokens_consumed": 74241277952, "data/tokens_consumed_B": 74.241277952, "train/loss_slope": -8.783213819714959e-06} {"step": 35400, "timestamp": 1778232797.405667, "geo/rankme_last": 440.4496765136719, "geo/layer_0/stable_rank_q_proj": 18.946332931518555, "geo/layer_0/stable_rank_k_proj": 16.283187866210938, "geo/layer_0/stable_rank_o_proj": 49.94053268432617, "geo/layer_0/stable_rank_gate_proj": 140.96926879882812, "geo/layer_0/stable_rank_down_proj": 52.69831466674805, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.056889407336711884, "geo/layer_0/attn_entropy_mean": 6.2016191482543945, "geo/layer_0/attn_entropy_std": 0.36310869455337524, "geo/layer_7/stable_rank_q_proj": 43.366085052490234, "geo/layer_7/stable_rank_k_proj": 42.226715087890625, "geo/layer_7/stable_rank_o_proj": 101.4951171875, "geo/layer_7/stable_rank_gate_proj": 92.18978118896484, "geo/layer_7/stable_rank_down_proj": 146.12744140625, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5025759935379028, "geo/layer_7/attn_entropy_mean": 4.653685569763184, "geo/layer_7/attn_entropy_std": 0.8209224343299866, "geo/layer_14/stable_rank_q_proj": 54.519927978515625, "geo/layer_14/stable_rank_k_proj": 36.632164001464844, "geo/layer_14/stable_rank_o_proj": 49.99689865112305, "geo/layer_14/stable_rank_gate_proj": 78.47877502441406, "geo/layer_14/stable_rank_down_proj": 134.15420532226562, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.379104346036911, "geo/layer_14/attn_entropy_mean": 5.503255844116211, "geo/layer_14/attn_entropy_std": 0.38045698404312134, "geo/layer_21/stable_rank_q_proj": 44.17947769165039, "geo/layer_21/stable_rank_k_proj": 31.297611236572266, "geo/layer_21/stable_rank_o_proj": 77.71322631835938, "geo/layer_21/stable_rank_gate_proj": 75.00016784667969, "geo/layer_21/stable_rank_down_proj": 56.751914978027344, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14978815615177155, "geo/layer_21/attn_entropy_mean": 5.7264814376831055, "geo/layer_21/attn_entropy_std": 0.3012051284313202, "geo/layer_27/stable_rank_q_proj": 42.24776840209961, "geo/layer_27/stable_rank_k_proj": 31.555274963378906, "geo/layer_27/stable_rank_o_proj": 117.80965423583984, "geo/layer_27/stable_rank_gate_proj": 86.47020721435547, "geo/layer_27/stable_rank_down_proj": 132.49110412597656, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08501316606998444, "geo/layer_27/attn_entropy_mean": 4.301765441894531, "geo/layer_27/attn_entropy_std": 0.6595155596733093, "attnres/final_alpha/block_0": 0.24125955998897552, "attnres/block_norm/0": 1.7063264846801758, "attnres/final_alpha/block_1": 0.005523949861526489, "attnres/block_norm/1": 38339.28125, "attnres/final_alpha/block_2": 0.01169087365269661, "attnres/block_norm/2": 25477.07421875, "attnres/final_alpha/block_3": 0.013564360328018665, "attnres/block_norm/3": 43989.84375, "attnres/final_alpha/block_4": 0.016864821314811707, "attnres/block_norm/4": 12162.666015625, "attnres/final_alpha/block_5": 0.591571569442749, "attnres/block_norm/5": 5809.3681640625, "attnres/final_alpha/block_6": 0.11952480673789978, "attnres/block_norm/6": 28596.857421875, "geo/tier1_time_s": 1.3589212894439697, "geo/step": 35400.0, "geo/rankme_slope": -3.6212141106442576e-05} {"step": 35410, "timestamp": 1778232807.7910488, "train/loss": 2.1862943172454834, "train/z_loss": 0.0014403679873794318, "train/perplexity": 8.902163323392722, "train/grad_norm": 0.10888671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1785641.9265524684, "perf/iters_per_sec": 0.8514604218256323, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1744527101516724, "data/tokens_consumed": 74262249472, "data/tokens_consumed_B": 74.262249472, "train/loss_slope": -9.765231634380947e-06} {"step": 35420, "timestamp": 1778232818.1728754, "train/loss": 2.1518526554107664, "train/z_loss": 0.0014365918934345246, "train/perplexity": 8.600777923683122, "train/grad_norm": 0.11083984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021110.6455863717, "perf/iters_per_sec": 0.9637406566554888, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0376235485076903, "data/tokens_consumed": 74283220992, "data/tokens_consumed_B": 74.283220992, "train/loss_slope": -1.06608834096415e-05} {"step": 35430, "timestamp": 1778232828.552576, "train/loss": 2.179927670955658, "train/z_loss": 0.001441229274496436, "train/perplexity": 8.845666436982357, "train/grad_norm": 0.1044921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021728.2978997554, "perf/iters_per_sec": 0.9640351762293603, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037306547164917, "data/tokens_consumed": 74304192512, "data/tokens_consumed_B": 74.304192512, "train/loss_slope": -8.862151621770798e-06} {"step": 35440, "timestamp": 1778232838.9307604, "train/loss": 2.205925178527832, "train/z_loss": 0.001440482004545629, "train/perplexity": 9.078647052911434, "train/grad_norm": 0.2353515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021766.4490045263, "perf/iters_per_sec": 0.9640533680937415, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0372869729995728, "data/tokens_consumed": 74325164032, "data/tokens_consumed_B": 74.325164032, "train/loss_slope": -7.52118691073288e-06} {"step": 35450, "timestamp": 1778232849.2971423, "grad/layer_0/attn": 0.002983525861054659, "grad/layer_0/mlp": 0.0029761490877717733, "grad/layer_0/attn_mlp_ratio": 1.002478596608394, "grad/layer_4/attn": 0.003326460486277938, "grad/layer_4/mlp": 0.0024988281074911356, "grad/layer_4/attn_mlp_ratio": 1.3312081544084051, "grad/layer_8/attn": 0.009567547589540482, "grad/layer_8/mlp": 0.003802558872848749, "grad/layer_8/attn_mlp_ratio": 2.5160813173063814, "grad/layer_12/attn": 0.0077785346657037735, "grad/layer_12/mlp": 0.006533637642860413, "grad/layer_12/attn_mlp_ratio": 1.1905365696473935, "grad/layer_16/attn": 0.0037752706557512283, "grad/layer_16/mlp": 0.004531479440629482, "grad/layer_16/attn_mlp_ratio": 0.8331209755890885, "grad/layer_20/attn": 0.004090441856533289, "grad/layer_20/mlp": 0.006650904659181833, "grad/layer_20/attn_mlp_ratio": 0.6150203625884546, "grad/layer_24/attn": 0.0140389334410429, "grad/layer_24/mlp": 0.010404326021671295, "grad/layer_24/attn_mlp_ratio": 1.34933615852362, "grad/layer_27/attn": 0.007780546322464943, "grad/layer_27/mlp": 0.008191590197384357, "grad/layer_27/attn_mlp_ratio": 0.9498212239629875} {"step": 35450, "timestamp": 1778232849.3132284, "train/loss": 2.159777021408081, "train/z_loss": 0.001434037322178483, "train/perplexity": 8.669204395944123, "train/grad_norm": 0.171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020953.1340520615, "perf/iters_per_sec": 0.9636655493030841, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0377044200897216, "data/tokens_consumed": 74346135552, "data/tokens_consumed_B": 74.346135552, "train/loss_slope": -1.0666037628752984e-05} {"step": 35460, "timestamp": 1778232859.688268, "train/loss": 2.212654972076416, "train/z_loss": 0.001439700135961175, "train/perplexity": 9.139950521752588, "train/grad_norm": 0.09375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022421.98064524, "perf/iters_per_sec": 0.9643659499384117, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0369507551193238, "data/tokens_consumed": 74367107072, "data/tokens_consumed_B": 74.367107072, "train/loss_slope": -1.3033565751003376e-05} {"step": 35470, "timestamp": 1778232870.0731342, "train/loss": 2.1777450323104857, "train/z_loss": 0.0014378578052856028, "train/perplexity": 8.826380598239504, "train/grad_norm": 0.1044921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020393.2660352262, "perf/iters_per_sec": 0.9633985834289676, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0379919767379762, "data/tokens_consumed": 74388078592, "data/tokens_consumed_B": 74.388078592, "train/loss_slope": -9.367429898946191e-06} {"step": 35475, "timestamp": 1778232875.8598967, "eos/sharpness": 67.26536750793456, "eos/L0_probe": 2.0054636001586914, "eos/L_plus": 2.2524759769439697, "eos/L_minus": 2.431104898452759, "eos/grad_norm": 0.17874568700790405, "eos/embed_grad_frac": 0.0713362768292427, "eos/time_s": 0.6053638458251953} {"step": 35475, "timestamp": 1778232877.241287, "geo/rankme_last": 438.7748107910156, "geo/layer_0/stable_rank_q_proj": 18.940839767456055, "geo/layer_0/stable_rank_k_proj": 16.277149200439453, "geo/layer_0/stable_rank_o_proj": 49.88426208496094, "geo/layer_0/stable_rank_gate_proj": 140.8046112060547, "geo/layer_0/stable_rank_down_proj": 52.65995788574219, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06113356724381447, "geo/layer_0/attn_entropy_mean": 6.202308654785156, "geo/layer_0/attn_entropy_std": 0.36383193731307983, "geo/layer_7/stable_rank_q_proj": 43.45220184326172, "geo/layer_7/stable_rank_k_proj": 42.20201873779297, "geo/layer_7/stable_rank_o_proj": 101.39786529541016, "geo/layer_7/stable_rank_gate_proj": 92.05363464355469, "geo/layer_7/stable_rank_down_proj": 146.23435974121094, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5083209872245789, "geo/layer_7/attn_entropy_mean": 4.647800445556641, "geo/layer_7/attn_entropy_std": 0.8059408664703369, "geo/layer_14/stable_rank_q_proj": 54.57280731201172, "geo/layer_14/stable_rank_k_proj": 36.669410705566406, "geo/layer_14/stable_rank_o_proj": 50.002647399902344, "geo/layer_14/stable_rank_gate_proj": 78.32628631591797, "geo/layer_14/stable_rank_down_proj": 133.79766845703125, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3879150152206421, "geo/layer_14/attn_entropy_mean": 5.498845100402832, "geo/layer_14/attn_entropy_std": 0.3578334450721741, "geo/layer_21/stable_rank_q_proj": 44.25295639038086, "geo/layer_21/stable_rank_k_proj": 31.298954010009766, "geo/layer_21/stable_rank_o_proj": 77.65821075439453, "geo/layer_21/stable_rank_gate_proj": 75.02342224121094, "geo/layer_21/stable_rank_down_proj": 56.80134963989258, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14040713012218475, "geo/layer_21/attn_entropy_mean": 5.726706027984619, "geo/layer_21/attn_entropy_std": 0.29037901759147644, "geo/layer_27/stable_rank_q_proj": 42.23393630981445, "geo/layer_27/stable_rank_k_proj": 31.559860229492188, "geo/layer_27/stable_rank_o_proj": 117.94619750976562, "geo/layer_27/stable_rank_gate_proj": 86.45326232910156, "geo/layer_27/stable_rank_down_proj": 132.70643615722656, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09050906449556351, "geo/layer_27/attn_entropy_mean": 4.3131279945373535, "geo/layer_27/attn_entropy_std": 0.6359830498695374, "attnres/final_alpha/block_0": 0.24252387881278992, "attnres/block_norm/0": 1.7065783739089966, "attnres/final_alpha/block_1": 0.005408453289419413, "attnres/block_norm/1": 38321.13671875, "attnres/final_alpha/block_2": 0.01173615362495184, "attnres/block_norm/2": 25563.66796875, "attnres/final_alpha/block_3": 0.013767844066023827, "attnres/block_norm/3": 43888.078125, "attnres/final_alpha/block_4": 0.017312046140432358, "attnres/block_norm/4": 12172.447265625, "attnres/final_alpha/block_5": 0.5883662104606628, "attnres/block_norm/5": 5857.9130859375, "attnres/final_alpha/block_6": 0.12088539451360703, "attnres/block_norm/6": 28859.65625, "geo/tier1_time_s": 1.360978364944458, "geo/step": 35475.0, "geo/rankme_slope": -4.464164572078832e-05} {"step": 35480, "timestamp": 1778232882.4321353, "train/loss": 2.1899072408676146, "train/z_loss": 0.001429638860281557, "train/perplexity": 8.934384330521794, "train/grad_norm": 0.11376953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1697561.0154812222, "perf/iters_per_sec": 0.8094601704984771, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2353912353515626, "data/tokens_consumed": 74409050112, "data/tokens_consumed_B": 74.409050112, "train/loss_slope": -7.926011972516116e-06} {"step": 35490, "timestamp": 1778232892.8068342, "train/loss": 2.1883477330207826, "train/z_loss": 0.0014343288377858699, "train/perplexity": 8.92046194690666, "train/grad_norm": 0.1123046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022296.1592454477, "perf/iters_per_sec": 0.9643059536196936, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0370152711868286, "data/tokens_consumed": 74430021632, "data/tokens_consumed_B": 74.430021632, "train/loss_slope": -5.505037779855296e-06} {"step": 35500, "timestamp": 1778232903.1727376, "grad/layer_0/attn": 0.0027412313502281904, "grad/layer_0/mlp": 0.00258769397623837, "grad/layer_0/attn_mlp_ratio": 1.059333626567253, "grad/layer_4/attn": 0.004164178390055895, "grad/layer_4/mlp": 0.0024193597491830587, "grad/layer_4/attn_mlp_ratio": 1.721190169979057, "grad/layer_8/attn": 0.004346126224845648, "grad/layer_8/mlp": 0.003510242560878396, "grad/layer_8/attn_mlp_ratio": 1.2381270028089972, "grad/layer_12/attn": 0.004707726184278727, "grad/layer_12/mlp": 0.006527291610836983, "grad/layer_12/attn_mlp_ratio": 0.7212372899563675, "grad/layer_16/attn": 0.0035656155087053776, "grad/layer_16/mlp": 0.0044478061608970165, "grad/layer_16/attn_mlp_ratio": 0.8016571090455451, "grad/layer_20/attn": 0.0030404042918235064, "grad/layer_20/mlp": 0.005588501691818237, "grad/layer_20/attn_mlp_ratio": 0.544046401895185, "grad/layer_24/attn": 0.005652082618325949, "grad/layer_24/mlp": 0.008591121062636375, "grad/layer_24/attn_mlp_ratio": 0.657898138243866, "grad/layer_27/attn": 0.0049491217359900475, "grad/layer_27/mlp": 0.007648431695997715, "grad/layer_27/attn_mlp_ratio": 0.647076664601995} {"step": 35500, "timestamp": 1778232903.188524, "train/loss": 2.17211709022522, "train/z_loss": 0.001437189558055252, "train/perplexity": 8.77684575975722, "train/grad_norm": 0.09619140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021264.0478969254, "perf/iters_per_sec": 0.963813804577315, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0375447988510131, "data/tokens_consumed": 74450993152, "data/tokens_consumed_B": 74.450993152, "train/loss_slope": -3.017332244126841e-06} {"step": 35500, "timestamp": 1778232910.1701264, "geo/ww_alpha_mean": 7.673800483757908, "geo/ww_alpha_std": 4.3710344111001715, "geo/ww_alpha_min": 1.3599480582501697, "geo/ww_alpha_max": 26.55560984808629, "geo/ww_alpha_healthy_frac": 0.17766497461928935, "geo/ww_alpha_by_type/q_proj": 4.070279198225151, "geo/ww_alpha_by_type/k_proj": 4.587556203699251, "geo/ww_alpha_by_type/v_proj": 7.847459499712831, "geo/ww_alpha_by_type/o_proj": 8.175442637963998, "geo/ww_alpha_by_type/gate_proj": 7.967062425634074, "geo/ww_alpha_by_type/up_proj": 12.560669683803914, "geo/ww_alpha_by_type/down_proj": 8.618648223018075, "geo/twonn_id/layer_0": 0.7114284038543701, "geo/twonn_id/layer_7": 3.7082579135894775, "geo/twonn_id/layer_14": 4.603508949279785, "geo/twonn_id/layer_21": 7.442824363708496, "geo/twonn_id/layer_27": 5.415610313415527, "geo/tier2_time_s": 6.975205183029175} {"step": 35500, "timestamp": 1778232910.8070545, "eoc/jacobian_sigma/layer_0/attn": 1260.5283203125, "eoc/jacobian_sigma/layer_0/mlp": 7943.35205078125, "eoc/jacobian_sigma/layer_0": 7943.35205078125, "eoc/jacobian_sigma/layer_7/attn": 1.15226149559021, "eoc/jacobian_sigma/layer_7/mlp": 1.6521267890930176, "eoc/jacobian_sigma/layer_7": 1.6521267890930176, "eoc/jacobian_sigma/layer_14/attn": 1.6408114433288574, "eoc/jacobian_sigma/layer_14/mlp": 5.868249893188477, "eoc/jacobian_sigma/layer_14": 5.868249893188477, "eoc/jacobian_sigma/layer_21/attn": 1.0946705341339111, "eoc/jacobian_sigma/layer_21/mlp": 3.7501254081726074, "eoc/jacobian_sigma/layer_21": 3.7501254081726074, "eoc/jacobian_sigma/layer_27/attn": 3.706268310546875, "eoc/jacobian_sigma/layer_27/mlp": 24.71184539794922, "eoc/jacobian_sigma/layer_27": 24.71184539794922, "eoc/layer0_sigma": 7943.35205078125, "eoc/sigma_max": 24.71184539794922, "eoc/sigma_min": 1.6521267890930176, "eoc/sigma_mean": 8.99558687210083, "eoc/time_s": 0.630382776260376} {"step": 35510, "timestamp": 1778232921.204295, "train/loss": 2.2070853471755982, "train/z_loss": 0.001425306312739849, "train/perplexity": 9.089185926839809, "train/grad_norm": 0.15625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1164394.5753934998, "perf/iters_per_sec": 0.5552266003577708, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.8010664463043213, "data/tokens_consumed": 74471964672, "data/tokens_consumed_B": 74.471964672, "train/loss_slope": -3.492405669953141e-06} {"step": 35520, "timestamp": 1778232931.5844493, "train/loss": 2.187596917152405, "train/z_loss": 0.0014177937060594558, "train/perplexity": 8.913766836235844, "train/grad_norm": 0.12890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021361.4516402201, "perf/iters_per_sec": 0.963860250301466, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0374948024749755, "data/tokens_consumed": 74492936192, "data/tokens_consumed_B": 74.492936192, "train/loss_slope": -2.7676876407466433e-06} {"step": 35530, "timestamp": 1778232941.971442, "train/loss": 2.2048663854599, "train/z_loss": 0.001428166450932622, "train/perplexity": 9.069039731325873, "train/grad_norm": 0.14453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020323.6581711913, "perf/iters_per_sec": 0.9633653918128926, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0380277395248414, "data/tokens_consumed": 74513907712, "data/tokens_consumed_B": 74.513907712, "train/loss_slope": -3.7995798466908637e-06} {"step": 35540, "timestamp": 1778232952.3198938, "train/loss": 2.1252156019210817, "train/z_loss": 0.0014389708521775901, "train/perplexity": 8.374702895528383, "train/grad_norm": 0.119140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027551.2212220316, "perf/iters_per_sec": 0.9668117624387892, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034327507019043, "data/tokens_consumed": 74534879232, "data/tokens_consumed_B": 74.534879232, "train/loss_slope": -8.220133787155705e-06} {"step": 35550, "timestamp": 1778232962.6864765, "grad/layer_0/attn": 0.002638275967910886, "grad/layer_0/mlp": 0.002691740868613124, "grad/layer_0/attn_mlp_ratio": 0.9801373901405583, "grad/layer_4/attn": 0.0019841643515974283, "grad/layer_4/mlp": 0.0024229236878454685, "grad/layer_4/attn_mlp_ratio": 0.8189132326616855, "grad/layer_8/attn": 0.0058652497828006744, "grad/layer_8/mlp": 0.0036494641099125147, "grad/layer_8/attn_mlp_ratio": 1.6071536656996772, "grad/layer_12/attn": 0.010195104405283928, "grad/layer_12/mlp": 0.00611506262794137, "grad/layer_12/attn_mlp_ratio": 1.667211745629321, "grad/layer_16/attn": 0.004212317522615194, "grad/layer_16/mlp": 0.00457006087526679, "grad/layer_16/attn_mlp_ratio": 0.9217202014179446, "grad/layer_20/attn": 0.0047351885586977005, "grad/layer_20/mlp": 0.006288379430770874, "grad/layer_20/attn_mlp_ratio": 0.7530061656627185, "grad/layer_24/attn": 0.005856268107891083, "grad/layer_24/mlp": 0.010043277405202389, "grad/layer_24/attn_mlp_ratio": 0.5831032852430447, "grad/layer_27/attn": 0.01499228086322546, "grad/layer_27/mlp": 0.008717222139239311, "grad/layer_27/attn_mlp_ratio": 1.719846122052491} {"step": 35550, "timestamp": 1778232963.3034678, "eos/sharpness": 21.21763229370117, "eos/L0_probe": 2.008084774017334, "eos/L_plus": 2.097707509994507, "eos/L_minus": 2.130638360977173, "eos/grad_norm": 0.12968255579471588, "eos/embed_grad_frac": 0.13982218503952026, "eos/time_s": 0.6140482425689697} {"step": 35550, "timestamp": 1778232963.3246052, "train/loss": 2.1694117069244383, "train/z_loss": 0.0014313398860394955, "train/perplexity": 8.753133118160465, "train/grad_norm": 0.1298828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1906645.2587251887, "perf/iters_per_sec": 0.909159306871981, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0999172449111938, "data/tokens_consumed": 74555850752, "data/tokens_consumed_B": 74.555850752, "train/loss_slope": -7.903596634554335e-06} {"step": 35550, "timestamp": 1778232964.6853049, "geo/rankme_last": 438.9129333496094, "geo/layer_0/stable_rank_q_proj": 18.94434928894043, "geo/layer_0/stable_rank_k_proj": 16.301939010620117, "geo/layer_0/stable_rank_o_proj": 49.730831146240234, "geo/layer_0/stable_rank_gate_proj": 140.80418395996094, "geo/layer_0/stable_rank_down_proj": 52.64344024658203, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05787228047847748, "geo/layer_0/attn_entropy_mean": 6.203187465667725, "geo/layer_0/attn_entropy_std": 0.3652147650718689, "geo/layer_7/stable_rank_q_proj": 43.34345245361328, "geo/layer_7/stable_rank_k_proj": 42.13325119018555, "geo/layer_7/stable_rank_o_proj": 101.18618774414062, "geo/layer_7/stable_rank_gate_proj": 92.17683410644531, "geo/layer_7/stable_rank_down_proj": 146.16464233398438, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5099117159843445, "geo/layer_7/attn_entropy_mean": 4.622851848602295, "geo/layer_7/attn_entropy_std": 0.8173016905784607, "geo/layer_14/stable_rank_q_proj": 54.666378021240234, "geo/layer_14/stable_rank_k_proj": 36.66188430786133, "geo/layer_14/stable_rank_o_proj": 49.951866149902344, "geo/layer_14/stable_rank_gate_proj": 78.4319076538086, "geo/layer_14/stable_rank_down_proj": 133.64605712890625, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37549933791160583, "geo/layer_14/attn_entropy_mean": 5.509808540344238, "geo/layer_14/attn_entropy_std": 0.3737308084964752, "geo/layer_21/stable_rank_q_proj": 44.16652297973633, "geo/layer_21/stable_rank_k_proj": 31.273479461669922, "geo/layer_21/stable_rank_o_proj": 77.55630493164062, "geo/layer_21/stable_rank_gate_proj": 74.86505126953125, "geo/layer_21/stable_rank_down_proj": 56.742774963378906, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14890801906585693, "geo/layer_21/attn_entropy_mean": 5.7244791984558105, "geo/layer_21/attn_entropy_std": 0.2932547628879547, "geo/layer_27/stable_rank_q_proj": 42.274879455566406, "geo/layer_27/stable_rank_k_proj": 31.59328269958496, "geo/layer_27/stable_rank_o_proj": 117.93818664550781, "geo/layer_27/stable_rank_gate_proj": 86.34366607666016, "geo/layer_27/stable_rank_down_proj": 132.468994140625, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08852408826351166, "geo/layer_27/attn_entropy_mean": 4.287132263183594, "geo/layer_27/attn_entropy_std": 0.6580557823181152, "attnres/final_alpha/block_0": 0.24405473470687866, "attnres/block_norm/0": 1.7068692445755005, "attnres/final_alpha/block_1": 0.005604993551969528, "attnres/block_norm/1": 38240.3828125, "attnres/final_alpha/block_2": 0.011849349364638329, "attnres/block_norm/2": 25508.703125, "attnres/final_alpha/block_3": 0.013735910877585411, "attnres/block_norm/3": 43855.609375, "attnres/final_alpha/block_4": 0.017113231122493744, "attnres/block_norm/4": 12184.451171875, "attnres/final_alpha/block_5": 0.5860294103622437, "attnres/block_norm/5": 5851.9736328125, "attnres/final_alpha/block_6": 0.12161239981651306, "attnres/block_norm/6": 28909.314453125, "geo/tier1_time_s": 1.3566954135894775, "geo/step": 35550.0, "geo/rankme_slope": -8.195616137079832e-05} {"step": 35560, "timestamp": 1778232975.0678134, "train/loss": 2.191103291511536, "train/z_loss": 0.001425656967330724, "train/perplexity": 8.945076699686215, "train/grad_norm": 0.2431640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1786420.9362355205, "perf/iters_per_sec": 0.8518318825891116, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1739405632019042, "data/tokens_consumed": 74576822272, "data/tokens_consumed_B": 74.576822272, "train/loss_slope": -8.884070544066851e-06} {"step": 35570, "timestamp": 1778232985.4475744, "train/loss": 2.143443202972412, "train/z_loss": 0.0014502790989354252, "train/perplexity": 8.528753358834912, "train/grad_norm": 0.275390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021664.220295267, "perf/iters_per_sec": 0.9640046216465316, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373394250869752, "data/tokens_consumed": 74597793792, "data/tokens_consumed_B": 74.597793792, "train/loss_slope": -1.0632652346521986e-05} {"step": 35580, "timestamp": 1778232995.8379118, "train/loss": 2.1707588911056517, "train/z_loss": 0.0014340530498884619, "train/perplexity": 8.76493314725491, "train/grad_norm": 0.146484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019617.0850653215, "perf/iters_per_sec": 0.9630284715010269, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0383908987045287, "data/tokens_consumed": 74618765312, "data/tokens_consumed_B": 74.618765312, "train/loss_slope": -1.2802405726469667e-05} {"step": 35590, "timestamp": 1778233006.221468, "train/loss": 2.143350803852081, "train/z_loss": 0.0014463268569670617, "train/perplexity": 8.527965345933447, "train/grad_norm": 0.11865234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020665.8037672613, "perf/iters_per_sec": 0.9635285395466143, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0378519773483277, "data/tokens_consumed": 74639736832, "data/tokens_consumed_B": 74.639736832, "train/loss_slope": -1.2481760528042515e-05} {"step": 35600, "timestamp": 1778233017.0522654, "grad/layer_0/attn": 0.0025065846275538206, "grad/layer_0/mlp": 0.0027618531603366137, "grad/layer_0/attn_mlp_ratio": 0.9075734267099762, "grad/layer_4/attn": 0.0025400612503290176, "grad/layer_4/mlp": 0.0023727898951619864, "grad/layer_4/attn_mlp_ratio": 1.0704956003304507, "grad/layer_8/attn": 0.004816628526896238, "grad/layer_8/mlp": 0.0034858391154557467, "grad/layer_8/attn_mlp_ratio": 1.3817701360234715, "grad/layer_12/attn": 0.005167064256966114, "grad/layer_12/mlp": 0.006489270832389593, "grad/layer_12/attn_mlp_ratio": 0.7962472688843965, "grad/layer_16/attn": 0.0037083798088133335, "grad/layer_16/mlp": 0.004601702094078064, "grad/layer_16/attn_mlp_ratio": 0.8058713172672608, "grad/layer_20/attn": 0.004865133669227362, "grad/layer_20/mlp": 0.005942641291767359, "grad/layer_20/attn_mlp_ratio": 0.8186820217634666, "grad/layer_24/attn": 0.009426039643585682, "grad/layer_24/mlp": 0.009219052270054817, "grad/layer_24/attn_mlp_ratio": 1.0224521203722845, "grad/layer_27/attn": 0.004959460347890854, "grad/layer_27/mlp": 0.008885946124792099, "grad/layer_27/attn_mlp_ratio": 0.5581240559450819} {"step": 35600, "timestamp": 1778233017.0684376, "train/loss": 2.235790455341339, "train/z_loss": 0.0014304393087513744, "train/perplexity": 9.353872749401685, "train/grad_norm": 0.14453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1934351.1180937018, "perf/iters_per_sec": 0.9223704901188382, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0841630458831788, "data/tokens_consumed": 74660708352, "data/tokens_consumed_B": 74.660708352, "train/loss_slope": -6.905600254220178e-06} {"step": 35610, "timestamp": 1778233027.4455786, "train/loss": 2.159202742576599, "train/z_loss": 0.0014307549106888473, "train/perplexity": 8.66422728463536, "train/grad_norm": 0.1025390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021934.1729484804, "perf/iters_per_sec": 0.9641333451025392, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037200927734375, "data/tokens_consumed": 74681679872, "data/tokens_consumed_B": 74.681679872, "train/loss_slope": -9.567550614257115e-06} {"step": 35620, "timestamp": 1778233037.8283188, "train/loss": 2.2132288455963134, "train/z_loss": 0.0014162857201881706, "train/perplexity": 9.145197202651822, "train/grad_norm": 0.279296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021127.6891376725, "perf/iters_per_sec": 0.9637487836540568, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0376147985458375, "data/tokens_consumed": 74702651392, "data/tokens_consumed_B": 74.702651392, "train/loss_slope": -8.95769005239527e-06} {"step": 35625, "timestamp": 1778233043.6067936, "eos/sharpness": 50.13713836669921, "eos/L0_probe": 2.002142906188965, "eos/L_plus": 2.20721697807312, "eos/L_minus": 2.2984402179718018, "eos/grad_norm": 0.12549103796482086, "eos/embed_grad_frac": 0.14659640192985535, "eos/time_s": 0.6034102439880371} {"step": 35625, "timestamp": 1778233044.9836607, "geo/rankme_last": 438.8116149902344, "geo/layer_0/stable_rank_q_proj": 18.946962356567383, "geo/layer_0/stable_rank_k_proj": 16.304641723632812, "geo/layer_0/stable_rank_o_proj": 49.68746566772461, "geo/layer_0/stable_rank_gate_proj": 140.58279418945312, "geo/layer_0/stable_rank_down_proj": 52.676395416259766, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.054833900183439255, "geo/layer_0/attn_entropy_mean": 6.199624061584473, "geo/layer_0/attn_entropy_std": 0.3656938076019287, "geo/layer_7/stable_rank_q_proj": 43.355751037597656, "geo/layer_7/stable_rank_k_proj": 42.040000915527344, "geo/layer_7/stable_rank_o_proj": 101.09977722167969, "geo/layer_7/stable_rank_gate_proj": 92.25091552734375, "geo/layer_7/stable_rank_down_proj": 146.2238311767578, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5007099509239197, "geo/layer_7/attn_entropy_mean": 4.679776191711426, "geo/layer_7/attn_entropy_std": 0.8305255174636841, "geo/layer_14/stable_rank_q_proj": 54.6207160949707, "geo/layer_14/stable_rank_k_proj": 36.649654388427734, "geo/layer_14/stable_rank_o_proj": 49.85591506958008, "geo/layer_14/stable_rank_gate_proj": 78.42357635498047, "geo/layer_14/stable_rank_down_proj": 133.41957092285156, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.393659770488739, "geo/layer_14/attn_entropy_mean": 5.456441402435303, "geo/layer_14/attn_entropy_std": 0.3824310600757599, "geo/layer_21/stable_rank_q_proj": 44.17499542236328, "geo/layer_21/stable_rank_k_proj": 31.226085662841797, "geo/layer_21/stable_rank_o_proj": 77.48118591308594, "geo/layer_21/stable_rank_gate_proj": 74.98933410644531, "geo/layer_21/stable_rank_down_proj": 56.770137786865234, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1457909643650055, "geo/layer_21/attn_entropy_mean": 5.7295942306518555, "geo/layer_21/attn_entropy_std": 0.29141366481781006, "geo/layer_27/stable_rank_q_proj": 42.329986572265625, "geo/layer_27/stable_rank_k_proj": 31.545597076416016, "geo/layer_27/stable_rank_o_proj": 117.8737564086914, "geo/layer_27/stable_rank_gate_proj": 86.317626953125, "geo/layer_27/stable_rank_down_proj": 132.51712036132812, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08673591166734695, "geo/layer_27/attn_entropy_mean": 4.283060073852539, "geo/layer_27/attn_entropy_std": 0.6460391879081726, "attnres/final_alpha/block_0": 0.24303114414215088, "attnres/block_norm/0": 1.7073003053665161, "attnres/final_alpha/block_1": 0.005548474378883839, "attnres/block_norm/1": 38176.5, "attnres/final_alpha/block_2": 0.01188630424439907, "attnres/block_norm/2": 25443.587890625, "attnres/final_alpha/block_3": 0.013799789361655712, "attnres/block_norm/3": 43911.9140625, "attnres/final_alpha/block_4": 0.017087766900658607, "attnres/block_norm/4": 12229.6806640625, "attnres/final_alpha/block_5": 0.5878006219863892, "attnres/block_norm/5": 5841.3935546875, "attnres/final_alpha/block_6": 0.12084587663412094, "attnres/block_norm/6": 28987.4609375, "geo/tier1_time_s": 1.3572273254394531, "geo/step": 35625.0, "geo/rankme_slope": -7.232326524359744e-05} {"step": 35630, "timestamp": 1778233050.1759446, "train/loss": 2.162612962722778, "train/z_loss": 0.0014457559445872903, "train/perplexity": 8.693824645146256, "train/grad_norm": 0.162109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1699146.1749811524, "perf/iters_per_sec": 0.8102160334497225, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2342387199401856, "data/tokens_consumed": 74723622912, "data/tokens_consumed_B": 74.723622912, "train/loss_slope": -8.637374069037598e-06} {"step": 35640, "timestamp": 1778233060.5590332, "train/loss": 2.1926621198654175, "train/z_loss": 0.0014179222052916884, "train/perplexity": 8.959031412548606, "train/grad_norm": 0.154296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020965.7173554017, "perf/iters_per_sec": 0.9636715494896897, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037697958946228, "data/tokens_consumed": 74744594432, "data/tokens_consumed_B": 74.744594432, "train/loss_slope": -4.056064416580335e-06} {"step": 35650, "timestamp": 1778233070.9318428, "grad/layer_0/attn": 0.0028721520211547613, "grad/layer_0/mlp": 0.002782090101391077, "grad/layer_0/attn_mlp_ratio": 1.0323719984774944, "grad/layer_4/attn": 0.0018378694076091051, "grad/layer_4/mlp": 0.0024292264133691788, "grad/layer_4/attn_mlp_ratio": 0.7565656794434127, "grad/layer_8/attn": 0.004327340517193079, "grad/layer_8/mlp": 0.0037668796721845865, "grad/layer_8/attn_mlp_ratio": 1.14878647020992, "grad/layer_12/attn": 0.004421604331582785, "grad/layer_12/mlp": 0.006818662863224745, "grad/layer_12/attn_mlp_ratio": 0.6484562084135742, "grad/layer_16/attn": 0.005568372551351786, "grad/layer_16/mlp": 0.004614435136318207, "grad/layer_16/attn_mlp_ratio": 1.2067289421521306, "grad/layer_20/attn": 0.0042526815086603165, "grad/layer_20/mlp": 0.006241332273930311, "grad/layer_20/attn_mlp_ratio": 0.6813739845715515, "grad/layer_24/attn": 0.00574800418689847, "grad/layer_24/mlp": 0.009447219781577587, "grad/layer_24/attn_mlp_ratio": 0.6084334078120994, "grad/layer_27/attn": 0.01205371879041195, "grad/layer_27/mlp": 0.007007961627095938, "grad/layer_27/attn_mlp_ratio": 1.7200035131194913} {"step": 35650, "timestamp": 1778233070.9475107, "train/loss": 2.204125761985779, "train/z_loss": 0.0014371331199072301, "train/perplexity": 9.062325474288214, "train/grad_norm": 0.1123046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019725.7386129636, "perf/iters_per_sec": 0.9630802815499132, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0383350372314453, "data/tokens_consumed": 74765565952, "data/tokens_consumed_B": 74.765565952, "train/loss_slope": -1.4318914911319654e-06} {"step": 35660, "timestamp": 1778233081.331248, "train/loss": 2.170734715461731, "train/z_loss": 0.0014356281375512482, "train/perplexity": 8.76472125191352, "train/grad_norm": 0.1416015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021106.0944959586, "perf/iters_per_sec": 0.9637384865264695, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0376258850097657, "data/tokens_consumed": 74786537472, "data/tokens_consumed_B": 74.786537472, "train/loss_slope": -2.817710190609006e-06} {"step": 35670, "timestamp": 1778233091.7095985, "train/loss": 2.164419674873352, "train/z_loss": 0.0014409682364203036, "train/perplexity": 8.70954608154618, "train/grad_norm": 0.12060546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021915.9073949321, "perf/iters_per_sec": 0.9641246354078923, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0372102975845336, "data/tokens_consumed": 74807508992, "data/tokens_consumed_B": 74.807508992, "train/loss_slope": -9.991308679245326e-07} {"step": 35680, "timestamp": 1778233102.0869405, "train/loss": 2.1781516075134277, "train/z_loss": 0.001440908305812627, "train/perplexity": 8.829969915336697, "train/grad_norm": 0.2333984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022088.5835106627, "perf/iters_per_sec": 0.9642069737962068, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0371217250823974, "data/tokens_consumed": 74828480512, "data/tokens_consumed_B": 74.828480512, "train/loss_slope": -1.3739677366822115e-06} {"step": 35690, "timestamp": 1778233112.4648755, "train/loss": 2.240742015838623, "train/z_loss": 0.0014263472403399646, "train/perplexity": 9.400303874600631, "train/grad_norm": 0.2080078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022050.141349603, "perf/iters_per_sec": 0.9641886431453719, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0371414422988892, "data/tokens_consumed": 74849452032, "data/tokens_consumed_B": 74.849452032, "train/loss_slope": 2.209520719089316e-06} {"step": 35700, "timestamp": 1778233122.8340309, "grad/layer_0/attn": 0.00291140447370708, "grad/layer_0/mlp": 0.0029237675480544567, "grad/layer_0/attn_mlp_ratio": 0.9957714921855697, "grad/layer_4/attn": 0.002028611721470952, "grad/layer_4/mlp": 0.002537589520215988, "grad/layer_4/attn_mlp_ratio": 0.7994246608315988, "grad/layer_8/attn": 0.005028414539992809, "grad/layer_8/mlp": 0.0037070673424750566, "grad/layer_8/attn_mlp_ratio": 1.3564399941522358, "grad/layer_12/attn": 0.004336763173341751, "grad/layer_12/mlp": 0.006514816544950008, "grad/layer_12/attn_mlp_ratio": 0.6656769345463336, "grad/layer_16/attn": 0.003755182959139347, "grad/layer_16/mlp": 0.004722666926681995, "grad/layer_16/attn_mlp_ratio": 0.795140317520464, "grad/layer_20/attn": 0.003912082873284817, "grad/layer_20/mlp": 0.005916179623454809, "grad/layer_20/attn_mlp_ratio": 0.6612515265172368, "grad/layer_24/attn": 0.013112152926623821, "grad/layer_24/mlp": 0.010936240665614605, "grad/layer_24/attn_mlp_ratio": 1.1989634470969817, "grad/layer_27/attn": 0.005340683273971081, "grad/layer_27/mlp": 0.009621602483093739, "grad/layer_27/attn_mlp_ratio": 0.5550721127637589} {"step": 35700, "timestamp": 1778233123.438728, "eos/sharpness": 63.54713439941405, "eos/L0_probe": 2.0075433254241943, "eos/L_plus": 2.279048442840576, "eos/L_minus": 2.371509552001953, "eos/grad_norm": 0.18986766040325165, "eos/embed_grad_frac": 0.06077536568045616, "eos/time_s": 0.6018738746643066} {"step": 35700, "timestamp": 1778233123.4593678, "train/loss": 2.246500086784363, "train/z_loss": 0.0014273861423134803, "train/perplexity": 9.4545876260851, "train/grad_norm": 0.189453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1908742.4649755103, "perf/iters_per_sec": 0.9101593327405502, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.098708724975586, "data/tokens_consumed": 74870423552, "data/tokens_consumed_B": 74.870423552, "train/loss_slope": 4.924871061000657e-06} {"step": 35700, "timestamp": 1778233124.8191733, "geo/rankme_last": 439.33587646484375, "geo/layer_0/stable_rank_q_proj": 18.979928970336914, "geo/layer_0/stable_rank_k_proj": 16.280961990356445, "geo/layer_0/stable_rank_o_proj": 49.57675552368164, "geo/layer_0/stable_rank_gate_proj": 140.72283935546875, "geo/layer_0/stable_rank_down_proj": 52.65309143066406, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06422276794910431, "geo/layer_0/attn_entropy_mean": 6.201537132263184, "geo/layer_0/attn_entropy_std": 0.363007128238678, "geo/layer_7/stable_rank_q_proj": 43.28199768066406, "geo/layer_7/stable_rank_k_proj": 41.98839569091797, "geo/layer_7/stable_rank_o_proj": 101.12939453125, "geo/layer_7/stable_rank_gate_proj": 92.42942810058594, "geo/layer_7/stable_rank_down_proj": 146.23956298828125, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5208300352096558, "geo/layer_7/attn_entropy_mean": 4.634692668914795, "geo/layer_7/attn_entropy_std": 0.8061484694480896, "geo/layer_14/stable_rank_q_proj": 54.700504302978516, "geo/layer_14/stable_rank_k_proj": 36.64360809326172, "geo/layer_14/stable_rank_o_proj": 49.7738151550293, "geo/layer_14/stable_rank_gate_proj": 78.28911590576172, "geo/layer_14/stable_rank_down_proj": 133.66502380371094, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.36289891600608826, "geo/layer_14/attn_entropy_mean": 5.479620933532715, "geo/layer_14/attn_entropy_std": 0.3704529106616974, "geo/layer_21/stable_rank_q_proj": 44.15546798706055, "geo/layer_21/stable_rank_k_proj": 31.22066879272461, "geo/layer_21/stable_rank_o_proj": 77.43842315673828, "geo/layer_21/stable_rank_gate_proj": 75.04734802246094, "geo/layer_21/stable_rank_down_proj": 56.716957092285156, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14522424340248108, "geo/layer_21/attn_entropy_mean": 5.7225470542907715, "geo/layer_21/attn_entropy_std": 0.2928965091705322, "geo/layer_27/stable_rank_q_proj": 42.37196731567383, "geo/layer_27/stable_rank_k_proj": 31.526260375976562, "geo/layer_27/stable_rank_o_proj": 117.84159851074219, "geo/layer_27/stable_rank_gate_proj": 86.4129638671875, "geo/layer_27/stable_rank_down_proj": 132.6277313232422, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09274830669164658, "geo/layer_27/attn_entropy_mean": 4.300154685974121, "geo/layer_27/attn_entropy_std": 0.6263970136642456, "attnres/final_alpha/block_0": 0.24138493835926056, "attnres/block_norm/0": 1.7073345184326172, "attnres/final_alpha/block_1": 0.005413886625319719, "attnres/block_norm/1": 38368.9921875, "attnres/final_alpha/block_2": 0.011602578684687614, "attnres/block_norm/2": 25558.994140625, "attnres/final_alpha/block_3": 0.013599367812275887, "attnres/block_norm/3": 44041.26171875, "attnres/final_alpha/block_4": 0.017153127118945122, "attnres/block_norm/4": 12219.94921875, "attnres/final_alpha/block_5": 0.5910301208496094, "attnres/block_norm/5": 5807.63671875, "attnres/final_alpha/block_6": 0.11981593072414398, "attnres/block_norm/6": 28764.48046875, "geo/tier1_time_s": 1.3557348251342773, "geo/step": 35700.0, "geo/rankme_slope": -4.887773468762505e-05} {"step": 35710, "timestamp": 1778233135.1943767, "train/loss": 2.1810285806655885, "train/z_loss": 0.0014303727308288216, "train/perplexity": 8.85541007950486, "train/grad_norm": 0.1416015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1787704.56836161, "perf/iters_per_sec": 0.8524439660842943, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1730976343154906, "data/tokens_consumed": 74891395072, "data/tokens_consumed_B": 74.891395072, "train/loss_slope": 5.654411426078433e-06} {"step": 35720, "timestamp": 1778233145.5733447, "train/loss": 2.240435838699341, "train/z_loss": 0.0014211318688467146, "train/perplexity": 9.397426157020075, "train/grad_norm": 0.306640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021765.7054856825, "perf/iters_per_sec": 0.964053013556329, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0372873544692993, "data/tokens_consumed": 74912366592, "data/tokens_consumed_B": 74.912366592, "train/loss_slope": 8.74798256393104e-06} {"step": 35730, "timestamp": 1778233155.9478092, "train/loss": 2.165451979637146, "train/z_loss": 0.0014263068558648228, "train/perplexity": 8.718541629731423, "train/grad_norm": 0.111328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022536.6098496858, "perf/iters_per_sec": 0.9644206094025067, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0368919849395752, "data/tokens_consumed": 74933338112, "data/tokens_consumed_B": 74.933338112, "train/loss_slope": 6.276965699251716e-06} {"step": 35740, "timestamp": 1778233166.3307679, "train/loss": 2.1650427103042604, "train/z_loss": 0.001421165221836418, "train/perplexity": 8.71497412809919, "train/grad_norm": 0.142578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020926.296386155, "perf/iters_per_sec": 0.9636527521067405, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0377182006835937, "data/tokens_consumed": 74954309632, "data/tokens_consumed_B": 74.954309632, "train/loss_slope": 6.134498516360959e-06} {"step": 35750, "timestamp": 1778233176.697991, "grad/layer_0/attn": 0.0025558099150657654, "grad/layer_0/mlp": 0.0025729096960276365, "grad/layer_0/attn_mlp_ratio": 0.993353874672065, "grad/layer_4/attn": 0.0017734774155542254, "grad/layer_4/mlp": 0.0025199605152010918, "grad/layer_4/attn_mlp_ratio": 0.7037718783603698, "grad/layer_8/attn": 0.0032688495703041553, "grad/layer_8/mlp": 0.003518851241096854, "grad/layer_8/attn_mlp_ratio": 0.9289535855428405, "grad/layer_12/attn": 0.005292203277349472, "grad/layer_12/mlp": 0.006872334983199835, "grad/layer_12/attn_mlp_ratio": 0.770073521340197, "grad/layer_16/attn": 0.006114100571721792, "grad/layer_16/mlp": 0.00445295637473464, "grad/layer_16/attn_mlp_ratio": 1.3730429673885693, "grad/layer_20/attn": 0.005328590050339699, "grad/layer_20/mlp": 0.00565022649243474, "grad/layer_20/attn_mlp_ratio": 0.9430754613406673, "grad/layer_24/attn": 0.009701928123831749, "grad/layer_24/mlp": 0.008613512851297855, "grad/layer_24/attn_mlp_ratio": 1.1263613555454044, "grad/layer_27/attn": 0.003593068802729249, "grad/layer_27/mlp": 0.006838691886514425, "grad/layer_27/attn_mlp_ratio": 0.525402929363482} {"step": 35750, "timestamp": 1778233176.7137084, "train/loss": 2.185493326187134, "train/z_loss": 0.0014393750461749732, "train/perplexity": 8.895035625163079, "train/grad_norm": 0.103515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020989.352101469, "perf/iters_per_sec": 0.9636828194148392, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0376858234405517, "data/tokens_consumed": 74975281152, "data/tokens_consumed_B": 74.975281152, "train/loss_slope": 9.15479416107589e-06} {"step": 35760, "timestamp": 1778233187.0905297, "train/loss": 2.2183727264404296, "train/z_loss": 0.0014430961920879782, "train/perplexity": 9.192360203794827, "train/grad_norm": 0.1435546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022165.4722180024, "perf/iters_per_sec": 0.9642436371889126, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0370822906494142, "data/tokens_consumed": 74996252672, "data/tokens_consumed_B": 74.996252672, "train/loss_slope": 9.247410043452686e-06} {"step": 35770, "timestamp": 1778233197.4613204, "train/loss": 2.145389938354492, "train/z_loss": 0.0014346946380101144, "train/perplexity": 8.545372756299473, "train/grad_norm": 0.1279296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023264.0291170285, "perf/iters_per_sec": 0.9647674699387686, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036519193649292, "data/tokens_consumed": 75017224192, "data/tokens_consumed_B": 75.017224192, "train/loss_slope": 9.98997038060974e-06} {"step": 35775, "timestamp": 1778233203.8500516, "eos/sharpness": 74.78525638580321, "eos/L0_probe": 2.0037989616394043, "eos/L_plus": 2.3196539878845215, "eos/L_minus": 2.4357964992523193, "eos/grad_norm": 0.302521288394928, "eos/embed_grad_frac": 0.02627415396273136, "eos/time_s": 0.6243557929992676} {"step": 35775, "timestamp": 1778233205.2260015, "geo/rankme_last": 439.5165100097656, "geo/layer_0/stable_rank_q_proj": 18.969276428222656, "geo/layer_0/stable_rank_k_proj": 16.287044525146484, "geo/layer_0/stable_rank_o_proj": 49.500247955322266, "geo/layer_0/stable_rank_gate_proj": 140.37789916992188, "geo/layer_0/stable_rank_down_proj": 52.672157287597656, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.058508168905973434, "geo/layer_0/attn_entropy_mean": 6.203123569488525, "geo/layer_0/attn_entropy_std": 0.3655712306499481, "geo/layer_7/stable_rank_q_proj": 43.274776458740234, "geo/layer_7/stable_rank_k_proj": 41.98204803466797, "geo/layer_7/stable_rank_o_proj": 101.25146484375, "geo/layer_7/stable_rank_gate_proj": 92.59260559082031, "geo/layer_7/stable_rank_down_proj": 146.2114715576172, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.512643575668335, "geo/layer_7/attn_entropy_mean": 4.680692195892334, "geo/layer_7/attn_entropy_std": 0.8377139568328857, "geo/layer_14/stable_rank_q_proj": 54.77680206298828, "geo/layer_14/stable_rank_k_proj": 36.627647399902344, "geo/layer_14/stable_rank_o_proj": 49.92609786987305, "geo/layer_14/stable_rank_gate_proj": 78.31716918945312, "geo/layer_14/stable_rank_down_proj": 133.49795532226562, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38669419288635254, "geo/layer_14/attn_entropy_mean": 5.488874435424805, "geo/layer_14/attn_entropy_std": 0.3735358715057373, "geo/layer_21/stable_rank_q_proj": 44.1749382019043, "geo/layer_21/stable_rank_k_proj": 31.220252990722656, "geo/layer_21/stable_rank_o_proj": 77.50279235839844, "geo/layer_21/stable_rank_gate_proj": 75.1185531616211, "geo/layer_21/stable_rank_down_proj": 56.724857330322266, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14739301800727844, "geo/layer_21/attn_entropy_mean": 5.729390621185303, "geo/layer_21/attn_entropy_std": 0.2897387146949768, "geo/layer_27/stable_rank_q_proj": 42.476104736328125, "geo/layer_27/stable_rank_k_proj": 31.609310150146484, "geo/layer_27/stable_rank_o_proj": 117.54341125488281, "geo/layer_27/stable_rank_gate_proj": 86.4091567993164, "geo/layer_27/stable_rank_down_proj": 132.68875122070312, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08381623029708862, "geo/layer_27/attn_entropy_mean": 4.320017337799072, "geo/layer_27/attn_entropy_std": 0.6332317590713501, "attnres/final_alpha/block_0": 0.24273014068603516, "attnres/block_norm/0": 1.7077913284301758, "attnres/final_alpha/block_1": 0.005575294606387615, "attnres/block_norm/1": 38338.74609375, "attnres/final_alpha/block_2": 0.011858182027935982, "attnres/block_norm/2": 25498.9453125, "attnres/final_alpha/block_3": 0.013797273859381676, "attnres/block_norm/3": 44069.5703125, "attnres/final_alpha/block_4": 0.017114849761128426, "attnres/block_norm/4": 12261.4267578125, "attnres/final_alpha/block_5": 0.5886614322662354, "attnres/block_norm/5": 5809.8642578125, "attnres/final_alpha/block_6": 0.12026280909776688, "attnres/block_norm/6": 28634.47265625, "geo/tier1_time_s": 1.3552374839782715, "geo/step": 35775.0, "geo/rankme_slope": -3.900917788990596e-05} {"step": 35780, "timestamp": 1778233210.4155579, "train/loss": 2.1561322689056395, "train/z_loss": 0.0014426837791688741, "train/perplexity": 8.63766480344707, "train/grad_norm": 0.1904296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1619663.3563120796, "perf/iters_per_sec": 0.7723156720695875, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2948073387145995, "data/tokens_consumed": 75038195712, "data/tokens_consumed_B": 75.038195712, "train/loss_slope": 9.960602977202205e-06} {"step": 35790, "timestamp": 1778233221.2980268, "train/loss": 2.18773672580719, "train/z_loss": 0.001427259622141719, "train/perplexity": 8.91501314510664, "train/grad_norm": 0.259765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1927948.4498443576, "perf/iters_per_sec": 0.9193174599859035, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.087763524055481, "data/tokens_consumed": 75059167232, "data/tokens_consumed_B": 75.059167232, "train/loss_slope": 7.746226180254322e-06} {"step": 35800, "timestamp": 1778233231.667965, "grad/layer_0/attn": 0.002741679083555937, "grad/layer_0/mlp": 0.0027583404444158077, "grad/layer_0/attn_mlp_ratio": 0.9939596070203871, "grad/layer_4/attn": 0.002087197033688426, "grad/layer_4/mlp": 0.00263132038526237, "grad/layer_4/attn_mlp_ratio": 0.7932127786708242, "grad/layer_8/attn": 0.004113983362913132, "grad/layer_8/mlp": 0.0038053514435887337, "grad/layer_8/attn_mlp_ratio": 1.0811046800247357, "grad/layer_12/attn": 0.004924485459923744, "grad/layer_12/mlp": 0.006333362776786089, "grad/layer_12/attn_mlp_ratio": 0.7775467087119933, "grad/layer_16/attn": 0.004512239247560501, "grad/layer_16/mlp": 0.004412900656461716, "grad/layer_16/attn_mlp_ratio": 1.0225109279770968, "grad/layer_20/attn": 0.004353925585746765, "grad/layer_20/mlp": 0.006367362104356289, "grad/layer_20/attn_mlp_ratio": 0.6837879558301219, "grad/layer_24/attn": 0.021090302616357803, "grad/layer_24/mlp": 0.013629820197820663, "grad/layer_24/attn_mlp_ratio": 1.547364686805888, "grad/layer_27/attn": 0.007414681371301413, "grad/layer_27/mlp": 0.01334442850202322, "grad/layer_27/attn_mlp_ratio": 0.5556387307716745} {"step": 35800, "timestamp": 1778233231.6836789, "train/loss": 2.164397358894348, "train/z_loss": 0.001432925893459469, "train/perplexity": 8.709351721667366, "train/grad_norm": 0.251953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020253.776700698, "perf/iters_per_sec": 0.9633320697310915, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.038063645362854, "data/tokens_consumed": 75080138752, "data/tokens_consumed_B": 75.080138752, "train/loss_slope": 7.625006306039119e-06} {"step": 35810, "timestamp": 1778233242.058587, "train/loss": 2.2092418670654297, "train/z_loss": 0.0014250941341742874, "train/perplexity": 9.108808087258495, "train/grad_norm": 0.1572265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022375.4351321433, "perf/iters_per_sec": 0.9643437553082196, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0369746208190918, "data/tokens_consumed": 75101110272, "data/tokens_consumed_B": 75.101110272, "train/loss_slope": 1.1682511198602952e-05} {"step": 35820, "timestamp": 1778233252.428715, "train/loss": 2.213726782798767, "train/z_loss": 0.0014497646130621432, "train/perplexity": 9.149752070487759, "train/grad_norm": 0.201171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023279.3405025578, "perf/iters_per_sec": 0.9647747709763326, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0365113496780396, "data/tokens_consumed": 75122081792, "data/tokens_consumed_B": 75.122081792, "train/loss_slope": 1.1026400300857741e-05} {"step": 35830, "timestamp": 1778233262.8067045, "train/loss": 2.1461036682128904, "train/z_loss": 0.0014450193382799625, "train/perplexity": 8.5514740210552, "train/grad_norm": 0.1279296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022006.4018050192, "perf/iters_per_sec": 0.9641677865052315, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0371638774871825, "data/tokens_consumed": 75143053312, "data/tokens_consumed_B": 75.143053312, "train/loss_slope": 1.0691342195018145e-05} {"step": 35840, "timestamp": 1778233273.6412768, "train/loss": 2.2155242204666137, "train/z_loss": 0.0014119495986960827, "train/perplexity": 9.166212968798249, "train/grad_norm": 0.1708984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1936842.77536946, "perf/iters_per_sec": 0.9235586048934269, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0827683210372925, "data/tokens_consumed": 75164024832, "data/tokens_consumed_B": 75.164024832, "train/loss_slope": 1.4250580226555981e-05} {"step": 35850, "timestamp": 1778233284.4074216, "grad/layer_0/attn": 0.002614134456962347, "grad/layer_0/mlp": 0.002658444456756115, "grad/layer_0/attn_mlp_ratio": 0.9833323212699098, "grad/layer_4/attn": 0.0017825283575803041, "grad/layer_4/mlp": 0.0024069722276180983, "grad/layer_4/attn_mlp_ratio": 0.7405686958371748, "grad/layer_8/attn": 0.004163441248238087, "grad/layer_8/mlp": 0.0037635855842381716, "grad/layer_8/attn_mlp_ratio": 1.1062432471444725, "grad/layer_12/attn": 0.004017756786197424, "grad/layer_12/mlp": 0.00644207838922739, "grad/layer_12/attn_mlp_ratio": 0.6236739885917287, "grad/layer_16/attn": 0.006554555147886276, "grad/layer_16/mlp": 0.004250533413141966, "grad/layer_16/attn_mlp_ratio": 1.5420546921040963, "grad/layer_20/attn": 0.003712399397045374, "grad/layer_20/mlp": 0.005735313519835472, "grad/layer_20/attn_mlp_ratio": 0.64728795025369, "grad/layer_24/attn": 0.005830296780914068, "grad/layer_24/mlp": 0.007983025163412094, "grad/layer_24/attn_mlp_ratio": 0.7303367568728062, "grad/layer_27/attn": 0.0043226503767073154, "grad/layer_27/mlp": 0.006417457479983568, "grad/layer_27/attn_mlp_ratio": 0.6735767744207487} {"step": 35850, "timestamp": 1778233285.0168195, "eos/sharpness": 3.53405475616455, "eos/L0_probe": 2.0028693675994873, "eos/L_plus": 2.022416591644287, "eos/L_minus": 2.018662691116333, "eos/grad_norm": 0.08355376124382019, "eos/embed_grad_frac": 0.31500008702278137, "eos/time_s": 0.6063554286956787} {"step": 35850, "timestamp": 1778233285.0379655, "train/loss": 2.212044382095337, "train/z_loss": 0.0014378081890754403, "train/perplexity": 9.134371462968478, "train/grad_norm": 0.08349609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1840872.6460703185, "perf/iters_per_sec": 0.8777964811660378, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1392162322998047, "data/tokens_consumed": 75184996352, "data/tokens_consumed_B": 75.184996352, "train/loss_slope": 1.607470289208027e-05} {"step": 35850, "timestamp": 1778233286.3988261, "geo/rankme_last": 439.464111328125, "geo/layer_0/stable_rank_q_proj": 18.989320755004883, "geo/layer_0/stable_rank_k_proj": 16.31367301940918, "geo/layer_0/stable_rank_o_proj": 49.4634895324707, "geo/layer_0/stable_rank_gate_proj": 140.3049774169922, "geo/layer_0/stable_rank_down_proj": 52.68023681640625, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05577242001891136, "geo/layer_0/attn_entropy_mean": 6.207244873046875, "geo/layer_0/attn_entropy_std": 0.36504390835762024, "geo/layer_7/stable_rank_q_proj": 43.21416473388672, "geo/layer_7/stable_rank_k_proj": 41.95325469970703, "geo/layer_7/stable_rank_o_proj": 101.21040344238281, "geo/layer_7/stable_rank_gate_proj": 92.57814025878906, "geo/layer_7/stable_rank_down_proj": 145.81619262695312, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5140711665153503, "geo/layer_7/attn_entropy_mean": 4.63723087310791, "geo/layer_7/attn_entropy_std": 0.8372848033905029, "geo/layer_14/stable_rank_q_proj": 54.769195556640625, "geo/layer_14/stable_rank_k_proj": 36.642276763916016, "geo/layer_14/stable_rank_o_proj": 49.83052062988281, "geo/layer_14/stable_rank_gate_proj": 78.0906982421875, "geo/layer_14/stable_rank_down_proj": 133.38009643554688, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38195228576660156, "geo/layer_14/attn_entropy_mean": 5.505802154541016, "geo/layer_14/attn_entropy_std": 0.3809809684753418, "geo/layer_21/stable_rank_q_proj": 44.08857727050781, "geo/layer_21/stable_rank_k_proj": 31.22368049621582, "geo/layer_21/stable_rank_o_proj": 77.50421142578125, "geo/layer_21/stable_rank_gate_proj": 75.11446380615234, "geo/layer_21/stable_rank_down_proj": 56.822021484375, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1429767906665802, "geo/layer_21/attn_entropy_mean": 5.720078945159912, "geo/layer_21/attn_entropy_std": 0.2875339984893799, "geo/layer_27/stable_rank_q_proj": 42.46250534057617, "geo/layer_27/stable_rank_k_proj": 31.630558013916016, "geo/layer_27/stable_rank_o_proj": 117.48481750488281, "geo/layer_27/stable_rank_gate_proj": 86.35917663574219, "geo/layer_27/stable_rank_down_proj": 132.72991943359375, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09126811474561691, "geo/layer_27/attn_entropy_mean": 4.305727005004883, "geo/layer_27/attn_entropy_std": 0.6551384329795837, "attnres/final_alpha/block_0": 0.24170154333114624, "attnres/block_norm/0": 1.7081409692764282, "attnres/final_alpha/block_1": 0.005491897463798523, "attnres/block_norm/1": 38283.49609375, "attnres/final_alpha/block_2": 0.011627974919974804, "attnres/block_norm/2": 25567.23046875, "attnres/final_alpha/block_3": 0.013639423996210098, "attnres/block_norm/3": 43920.890625, "attnres/final_alpha/block_4": 0.016963912174105644, "attnres/block_norm/4": 12240.595703125, "attnres/final_alpha/block_5": 0.590513288974762, "attnres/block_norm/5": 5850.8916015625, "attnres/final_alpha/block_6": 0.1200619786977768, "attnres/block_norm/6": 28967.24609375, "geo/tier1_time_s": 1.356773853302002, "geo/step": 35850.0, "geo/rankme_slope": -3.276845894607843e-05} {"step": 35860, "timestamp": 1778233296.7782269, "train/loss": 2.1642774105072022, "train/z_loss": 0.001433675258886069, "train/perplexity": 8.708307111626162, "train/grad_norm": 0.1240234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1786905.7440160266, "perf/iters_per_sec": 0.8520630569534429, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1736220598220826, "data/tokens_consumed": 75205967872, "data/tokens_consumed_B": 75.205967872, "train/loss_slope": 1.423675968880915e-05} {"step": 35870, "timestamp": 1778233307.153879, "train/loss": 2.1623494386672975, "train/z_loss": 0.0014469709480181337, "train/perplexity": 8.691533915062626, "train/grad_norm": 0.169921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022418.6791446465, "perf/iters_per_sec": 0.9643643756602509, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0369524478912353, "data/tokens_consumed": 75226939392, "data/tokens_consumed_B": 75.226939392, "train/loss_slope": 1.0564664488184978e-05} {"step": 35880, "timestamp": 1778233317.5344622, "train/loss": 2.206591081619263, "train/z_loss": 0.0014216471812687815, "train/perplexity": 9.084694565355122, "train/grad_norm": 0.08740234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021407.300141959, "perf/iters_per_sec": 0.9638821125707431, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0374712705612184, "data/tokens_consumed": 75247910912, "data/tokens_consumed_B": 75.247910912, "train/loss_slope": 1.4172324653577451e-05} {"step": 35890, "timestamp": 1778233327.9087439, "train/loss": 2.1293232679367065, "train/z_loss": 0.001446825743187219, "train/perplexity": 8.409174127688548, "train/grad_norm": 0.216796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022669.8099598095, "perf/iters_per_sec": 0.9644841241644905, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0368237018585205, "data/tokens_consumed": 75268882432, "data/tokens_consumed_B": 75.268882432, "train/loss_slope": 1.0952139580317764e-05} {"step": 35900, "timestamp": 1778233338.2788987, "grad/layer_0/attn": 0.0031795974355190992, "grad/layer_0/mlp": 0.0032184403389692307, "grad/layer_0/attn_mlp_ratio": 0.9879311100557226, "grad/layer_4/attn": 0.002907932735979557, "grad/layer_4/mlp": 0.002576421480625868, "grad/layer_4/attn_mlp_ratio": 1.1286711607473643, "grad/layer_8/attn": 0.014929593540728092, "grad/layer_8/mlp": 0.0037309869658201933, "grad/layer_8/attn_mlp_ratio": 4.001513078804008, "grad/layer_12/attn": 0.005067890509963036, "grad/layer_12/mlp": 0.007107499521225691, "grad/layer_12/attn_mlp_ratio": 0.713034228637648, "grad/layer_16/attn": 0.0043368567712605, "grad/layer_16/mlp": 0.004964929539710283, "grad/layer_16/attn_mlp_ratio": 0.8734981331001435, "grad/layer_20/attn": 0.00428802240639925, "grad/layer_20/mlp": 0.006958430167287588, "grad/layer_20/attn_mlp_ratio": 0.6162341565105216, "grad/layer_24/attn": 0.016031581908464432, "grad/layer_24/mlp": 0.013483401387929916, "grad/layer_24/attn_mlp_ratio": 1.188986467755603, "grad/layer_27/attn": 0.010157479904592037, "grad/layer_27/mlp": 0.011991812847554684, "grad/layer_27/attn_mlp_ratio": 0.8470345517408445} {"step": 35900, "timestamp": 1778233338.2946172, "train/loss": 2.194379949569702, "train/z_loss": 0.0014293882297351957, "train/perplexity": 8.974434729180274, "train/grad_norm": 0.23046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020853.7268212815, "perf/iters_per_sec": 0.9636181482416541, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0377554655075074, "data/tokens_consumed": 75289853952, "data/tokens_consumed_B": 75.289853952, "train/loss_slope": 1.3819330267243825e-05} {"step": 35910, "timestamp": 1778233348.669704, "train/loss": 2.174867606163025, "train/z_loss": 0.0014434057637117803, "train/perplexity": 8.801019844265026, "train/grad_norm": 0.173828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022638.3686872034, "perf/iters_per_sec": 0.9644691317974107, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0368398189544679, "data/tokens_consumed": 75310825472, "data/tokens_consumed_B": 75.310825472, "train/loss_slope": 1.0352310725171228e-05} {"step": 35920, "timestamp": 1778233359.0466714, "train/loss": 2.188235640525818, "train/z_loss": 0.0014354420127347112, "train/perplexity": 8.919462086110284, "train/grad_norm": 0.193359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022169.9815921488, "perf/iters_per_sec": 0.9642457874260658, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0370799779891968, "data/tokens_consumed": 75331796992, "data/tokens_consumed_B": 75.331796992, "train/loss_slope": 1.157483647544218e-05} {"step": 35925, "timestamp": 1778233364.8279436, "eos/sharpness": 26.343703269958493, "eos/L0_probe": 2.0031540393829346, "eos/L_plus": 2.130459785461426, "eos/L_minus": 2.1392853260040283, "eos/grad_norm": 0.11586593091487885, "eos/embed_grad_frac": 0.16524824500083923, "eos/time_s": 0.6033148765563965} {"step": 35925, "timestamp": 1778233366.204668, "geo/rankme_last": 440.0247802734375, "geo/layer_0/stable_rank_q_proj": 18.99312973022461, "geo/layer_0/stable_rank_k_proj": 16.299226760864258, "geo/layer_0/stable_rank_o_proj": 49.49055862426758, "geo/layer_0/stable_rank_gate_proj": 140.02073669433594, "geo/layer_0/stable_rank_down_proj": 52.72886657714844, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06340073049068451, "geo/layer_0/attn_entropy_mean": 6.206354141235352, "geo/layer_0/attn_entropy_std": 0.36839425563812256, "geo/layer_7/stable_rank_q_proj": 43.20343780517578, "geo/layer_7/stable_rank_k_proj": 41.93204116821289, "geo/layer_7/stable_rank_o_proj": 101.2232666015625, "geo/layer_7/stable_rank_gate_proj": 92.61106872558594, "geo/layer_7/stable_rank_down_proj": 145.80841064453125, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.49838221073150635, "geo/layer_7/attn_entropy_mean": 4.641777992248535, "geo/layer_7/attn_entropy_std": 0.8092977404594421, "geo/layer_14/stable_rank_q_proj": 54.723350524902344, "geo/layer_14/stable_rank_k_proj": 36.5521240234375, "geo/layer_14/stable_rank_o_proj": 49.70197677612305, "geo/layer_14/stable_rank_gate_proj": 78.16952514648438, "geo/layer_14/stable_rank_down_proj": 133.4113311767578, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3758833408355713, "geo/layer_14/attn_entropy_mean": 5.494924545288086, "geo/layer_14/attn_entropy_std": 0.3646720349788666, "geo/layer_21/stable_rank_q_proj": 44.04752731323242, "geo/layer_21/stable_rank_k_proj": 31.215679168701172, "geo/layer_21/stable_rank_o_proj": 77.51531219482422, "geo/layer_21/stable_rank_gate_proj": 75.06021881103516, "geo/layer_21/stable_rank_down_proj": 56.8155403137207, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1451810747385025, "geo/layer_21/attn_entropy_mean": 5.72284460067749, "geo/layer_21/attn_entropy_std": 0.282461941242218, "geo/layer_27/stable_rank_q_proj": 42.46718978881836, "geo/layer_27/stable_rank_k_proj": 31.5520076751709, "geo/layer_27/stable_rank_o_proj": 117.69065856933594, "geo/layer_27/stable_rank_gate_proj": 86.36112976074219, "geo/layer_27/stable_rank_down_proj": 132.76315307617188, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08867233246564865, "geo/layer_27/attn_entropy_mean": 4.293023586273193, "geo/layer_27/attn_entropy_std": 0.6459012627601624, "attnres/final_alpha/block_0": 0.24232056736946106, "attnres/block_norm/0": 1.7082557678222656, "attnres/final_alpha/block_1": 0.00552764069288969, "attnres/block_norm/1": 38223.02734375, "attnres/final_alpha/block_2": 0.011820647865533829, "attnres/block_norm/2": 25626.1484375, "attnres/final_alpha/block_3": 0.013636821880936623, "attnres/block_norm/3": 44062.6484375, "attnres/final_alpha/block_4": 0.017046552151441574, "attnres/block_norm/4": 12297.98828125, "attnres/final_alpha/block_5": 0.5895504951477051, "attnres/block_norm/5": 5835.16455078125, "attnres/final_alpha/block_6": 0.12009730190038681, "attnres/block_norm/6": 28960.96875, "geo/tier1_time_s": 1.3572402000427246, "geo/step": 35925.0, "geo/rankme_slope": -8.659284026110444e-06} {"step": 35930, "timestamp": 1778233371.393667, "train/loss": 2.1387244701385497, "train/z_loss": 0.0014455816941335796, "train/perplexity": 8.488603253638463, "train/grad_norm": 0.10107421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1699315.5231407098, "perf/iters_per_sec": 0.8102967849448728, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.234115719795227, "data/tokens_consumed": 75352768512, "data/tokens_consumed_B": 75.352768512, "train/loss_slope": 8.025403720925563e-06} {"step": 35940, "timestamp": 1778233381.7691553, "train/loss": 2.2095141887664793, "train/z_loss": 0.0014380470267497004, "train/perplexity": 9.111288951152558, "train/grad_norm": 0.11474609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022449.1835604303, "perf/iters_per_sec": 0.9643789212991859, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0369368076324463, "data/tokens_consumed": 75373740032, "data/tokens_consumed_B": 75.373740032, "train/loss_slope": 6.266503022162735e-06} {"step": 35950, "timestamp": 1778233392.144716, "grad/layer_0/attn": 0.0025569619610905647, "grad/layer_0/mlp": 0.002727495040744543, "grad/layer_0/attn_mlp_ratio": 0.9374762663711192, "grad/layer_4/attn": 0.0028113373555243015, "grad/layer_4/mlp": 0.0025674798525869846, "grad/layer_4/attn_mlp_ratio": 1.0949792821913202, "grad/layer_8/attn": 0.005010710097849369, "grad/layer_8/mlp": 0.003777717938646674, "grad/layer_8/attn_mlp_ratio": 1.3263854121956653, "grad/layer_12/attn": 0.004054242745041847, "grad/layer_12/mlp": 0.006471690256148577, "grad/layer_12/attn_mlp_ratio": 0.6264580846625367, "grad/layer_16/attn": 0.005072292871773243, "grad/layer_16/mlp": 0.0046868822537362576, "grad/layer_16/attn_mlp_ratio": 1.0822317457424009, "grad/layer_20/attn": 0.00808947067707777, "grad/layer_20/mlp": 0.00611605029553175, "grad/layer_20/attn_mlp_ratio": 1.3226625279261528, "grad/layer_24/attn": 0.013211281038820744, "grad/layer_24/mlp": 0.012228245846927166, "grad/layer_24/attn_mlp_ratio": 1.080390523396416, "grad/layer_27/attn": 0.009866118431091309, "grad/layer_27/mlp": 0.009832908399403095, "grad/layer_27/attn_mlp_ratio": 1.0033774271051368} {"step": 35950, "timestamp": 1778233392.1603975, "train/loss": 2.2285785913467406, "train/z_loss": 0.001424295105971396, "train/perplexity": 9.286656559650998, "train/grad_norm": 0.142578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019222.5896664462, "perf/iters_per_sec": 0.962840361436103, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0385937690734863, "data/tokens_consumed": 75394711552, "data/tokens_consumed_B": 75.394711552, "train/loss_slope": 7.179807958537112e-06} {"step": 35960, "timestamp": 1778233402.551155, "train/loss": 2.151034450531006, "train/z_loss": 0.0014348050695843994, "train/perplexity": 8.593743603366265, "train/grad_norm": 0.1962890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019500.8855445283, "perf/iters_per_sec": 0.9629730632517473, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0384506464004517, "data/tokens_consumed": 75415683072, "data/tokens_consumed_B": 75.415683072, "train/loss_slope": 5.747515633769363e-06} {"step": 35970, "timestamp": 1778233412.93014, "train/loss": 2.147060823440552, "train/z_loss": 0.00143831935711205, "train/perplexity": 8.559663027568666, "train/grad_norm": 0.1298828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021701.253774331, "perf/iters_per_sec": 0.9640222805854468, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373204231262207, "data/tokens_consumed": 75436654592, "data/tokens_consumed_B": 75.436654592, "train/loss_slope": 3.6979934169908137e-06} {"step": 35980, "timestamp": 1778233423.3050027, "train/loss": 2.2007792472839354, "train/z_loss": 0.0014306428842246533, "train/perplexity": 9.032048957515826, "train/grad_norm": 0.236328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022674.7401888436, "perf/iters_per_sec": 0.9644864750808924, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036821174621582, "data/tokens_consumed": 75457626112, "data/tokens_consumed_B": 75.457626112, "train/loss_slope": 7.011837226794442e-06} {"step": 35990, "timestamp": 1778233433.6769786, "train/loss": 2.1685386419296266, "train/z_loss": 0.0014407736714929342, "train/perplexity": 8.745494399074405, "train/grad_norm": 0.16796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022662.8797728424, "perf/iters_per_sec": 0.9644808195938313, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0368272542953492, "data/tokens_consumed": 75478597632, "data/tokens_consumed_B": 75.478597632, "train/loss_slope": 6.031310383063934e-06} {"step": 36000, "timestamp": 1778233444.042364, "grad/layer_0/attn": 0.0028726954478770494, "grad/layer_0/mlp": 0.002903154818341136, "grad/layer_0/attn_mlp_ratio": 0.9895081484382201, "grad/layer_4/attn": 0.002120143733918667, "grad/layer_4/mlp": 0.0025723387952893972, "grad/layer_4/attn_mlp_ratio": 0.824208558911575, "grad/layer_8/attn": 0.005005501676350832, "grad/layer_8/mlp": 0.0038499259389936924, "grad/layer_8/attn_mlp_ratio": 1.3001552823750318, "grad/layer_12/attn": 0.005698104854673147, "grad/layer_12/mlp": 0.006837791763246059, "grad/layer_12/attn_mlp_ratio": 0.8333252852139498, "grad/layer_16/attn": 0.0036422768607735634, "grad/layer_16/mlp": 0.004783908370882273, "grad/layer_16/attn_mlp_ratio": 0.7613600642534527, "grad/layer_20/attn": 0.005239339545369148, "grad/layer_20/mlp": 0.006253629922866821, "grad/layer_20/attn_mlp_ratio": 0.8378077254668327, "grad/layer_24/attn": 0.00865988153964281, "grad/layer_24/mlp": 0.007778699975460768, "grad/layer_24/attn_mlp_ratio": 1.1132813266527501, "grad/layer_27/attn": 0.006551395170390606, "grad/layer_27/mlp": 0.006629470735788345, "grad/layer_27/attn_mlp_ratio": 0.9882229415692942} {"step": 36000, "timestamp": 1778233444.652463, "eos/sharpness": 4.104852676391601, "eos/L0_probe": 2.0062429904937744, "eos/L_plus": 2.0300824642181396, "eos/L_minus": 2.023452043533325, "eos/grad_norm": 0.09441894292831421, "eos/embed_grad_frac": 0.31621626019477844, "eos/time_s": 0.6071271896362305} {"step": 36000, "timestamp": 1778233444.6718402, "train/loss": 2.2170096397399903, "train/z_loss": 0.0014278420363552867, "train/perplexity": 9.17983875570381, "train/grad_norm": 0.09423828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1908443.7942793982, "perf/iters_per_sec": 0.9100169154545775, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.098880672454834, "data/tokens_consumed": 75499569152, "data/tokens_consumed_B": 75.499569152, "train/loss_slope": 8.204329489517121e-06} {"step": 36000, "timestamp": 1778233446.0386484, "geo/rankme_last": 439.71063232421875, "geo/layer_0/stable_rank_q_proj": 18.97857093811035, "geo/layer_0/stable_rank_k_proj": 16.322961807250977, "geo/layer_0/stable_rank_o_proj": 49.511383056640625, "geo/layer_0/stable_rank_gate_proj": 139.8232421875, "geo/layer_0/stable_rank_down_proj": 52.749420166015625, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06218913197517395, "geo/layer_0/attn_entropy_mean": 6.205056190490723, "geo/layer_0/attn_entropy_std": 0.3698030114173889, "geo/layer_7/stable_rank_q_proj": 43.22126770019531, "geo/layer_7/stable_rank_k_proj": 41.83994674682617, "geo/layer_7/stable_rank_o_proj": 101.45942687988281, "geo/layer_7/stable_rank_gate_proj": 92.73217010498047, "geo/layer_7/stable_rank_down_proj": 145.445556640625, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.509869396686554, "geo/layer_7/attn_entropy_mean": 4.652235507965088, "geo/layer_7/attn_entropy_std": 0.8267339468002319, "geo/layer_14/stable_rank_q_proj": 54.793922424316406, "geo/layer_14/stable_rank_k_proj": 36.55817794799805, "geo/layer_14/stable_rank_o_proj": 49.66948318481445, "geo/layer_14/stable_rank_gate_proj": 78.15460205078125, "geo/layer_14/stable_rank_down_proj": 133.09718322753906, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37547120451927185, "geo/layer_14/attn_entropy_mean": 5.486481666564941, "geo/layer_14/attn_entropy_std": 0.3727295696735382, "geo/layer_21/stable_rank_q_proj": 43.97370529174805, "geo/layer_21/stable_rank_k_proj": 31.295406341552734, "geo/layer_21/stable_rank_o_proj": 77.6418228149414, "geo/layer_21/stable_rank_gate_proj": 74.90968322753906, "geo/layer_21/stable_rank_down_proj": 56.79065704345703, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14712877571582794, "geo/layer_21/attn_entropy_mean": 5.717264175415039, "geo/layer_21/attn_entropy_std": 0.29304900765419006, "geo/layer_27/stable_rank_q_proj": 42.51041793823242, "geo/layer_27/stable_rank_k_proj": 31.479406356811523, "geo/layer_27/stable_rank_o_proj": 117.5489273071289, "geo/layer_27/stable_rank_gate_proj": 86.3542251586914, "geo/layer_27/stable_rank_down_proj": 132.8321075439453, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08158126473426819, "geo/layer_27/attn_entropy_mean": 4.289625644683838, "geo/layer_27/attn_entropy_std": 0.6401599049568176, "attnres/final_alpha/block_0": 0.24278002977371216, "attnres/block_norm/0": 1.708493947982788, "attnres/final_alpha/block_1": 0.005470948293805122, "attnres/block_norm/1": 38383.33984375, "attnres/final_alpha/block_2": 0.011782050132751465, "attnres/block_norm/2": 25671.1796875, "attnres/final_alpha/block_3": 0.013599583879113197, "attnres/block_norm/3": 44107.91015625, "attnres/final_alpha/block_4": 0.01705082878470421, "attnres/block_norm/4": 12282.857421875, "attnres/final_alpha/block_5": 0.5888159275054932, "attnres/block_norm/5": 5850.7236328125, "attnres/final_alpha/block_6": 0.12050066888332367, "attnres/block_norm/6": 29058.75390625, "geo/tier1_time_s": 1.361269474029541, "geo/step": 36000.0, "geo/rankme_slope": 3.4819396508603446e-06} {"step": 36000, "timestamp": 1778233452.9494374, "geo/ww_alpha_mean": 7.632086705286134, "geo/ww_alpha_std": 4.625928732656328, "geo/ww_alpha_min": 1.3419907642539224, "geo/ww_alpha_max": 37.76550602465289, "geo/ww_alpha_healthy_frac": 0.17258883248730963, "geo/ww_alpha_by_type/q_proj": 4.102639900421261, "geo/ww_alpha_by_type/k_proj": 4.516616874782445, "geo/ww_alpha_by_type/v_proj": 8.815500321402041, "geo/ww_alpha_by_type/o_proj": 6.991359738990553, "geo/ww_alpha_by_type/gate_proj": 8.001152149686211, "geo/ww_alpha_by_type/up_proj": 12.407470875043924, "geo/ww_alpha_by_type/down_proj": 8.69897755408146, "geo/twonn_id/layer_0": 0.7459827661514282, "geo/twonn_id/layer_7": 3.368739604949951, "geo/twonn_id/layer_14": 4.617231845855713, "geo/twonn_id/layer_21": 6.38575553894043, "geo/twonn_id/layer_27": 6.459735870361328, "geo/tier2_time_s": 6.90107798576355} {"step": 36000, "timestamp": 1778233453.6028626, "eoc/jacobian_sigma/layer_0/attn": 1179.3671875, "eoc/jacobian_sigma/layer_0/mlp": 8168.7294921875, "eoc/jacobian_sigma/layer_0": 8168.7294921875, "eoc/jacobian_sigma/layer_7/attn": 1.1528176069259644, "eoc/jacobian_sigma/layer_7/mlp": 1.662562608718872, "eoc/jacobian_sigma/layer_7": 1.662562608718872, "eoc/jacobian_sigma/layer_14/attn": 1.6170690059661865, "eoc/jacobian_sigma/layer_14/mlp": 7.135120868682861, "eoc/jacobian_sigma/layer_14": 7.135120868682861, "eoc/jacobian_sigma/layer_21/attn": 1.0902334451675415, "eoc/jacobian_sigma/layer_21/mlp": 4.093120574951172, "eoc/jacobian_sigma/layer_21": 4.093120574951172, "eoc/jacobian_sigma/layer_27/attn": 3.6619958877563477, "eoc/jacobian_sigma/layer_27/mlp": 22.361539840698242, "eoc/jacobian_sigma/layer_27": 22.361539840698242, "eoc/layer0_sigma": 8168.7294921875, "eoc/sigma_max": 22.361539840698242, "eoc/sigma_min": 1.662562608718872, "eoc/sigma_mean": 8.813085973262787, "eoc/time_s": 0.6456613540649414} {"step": 36010, "timestamp": 1778233464.3341734, "train/loss": 2.160783219337463, "train/z_loss": 0.0014416716992855072, "train/perplexity": 8.677931721428807, "train/grad_norm": 0.1328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1066859.0981739897, "perf/iters_per_sec": 0.508718060576434, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.9657253742218017, "data/tokens_consumed": 75520540672, "data/tokens_consumed_B": 75.520540672, "train/loss_slope": 7.866058791681649e-06} {"step": 36020, "timestamp": 1778233474.7157302, "train/loss": 2.2294747114181517, "train/z_loss": 0.0014355925377458334, "train/perplexity": 9.29498224884185, "train/grad_norm": 0.1787109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021312.4002270207, "perf/iters_per_sec": 0.9638368607649902, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0375199794769288, "data/tokens_consumed": 75541512192, "data/tokens_consumed_B": 75.541512192, "train/loss_slope": 8.317465650545309e-06} {"step": 36030, "timestamp": 1778233485.0909865, "train/loss": 2.156106698513031, "train/z_loss": 0.0014454929274506866, "train/perplexity": 8.637443937790646, "train/grad_norm": 0.095703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022407.054228354, "perf/iters_per_sec": 0.9643588324682016, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0369584083557128, "data/tokens_consumed": 75562483712, "data/tokens_consumed_B": 75.562483712, "train/loss_slope": 7.752252290315376e-06} {"step": 36040, "timestamp": 1778233495.4651208, "train/loss": 2.2152047157287598, "train/z_loss": 0.001429466682020575, "train/perplexity": 9.163284788135247, "train/grad_norm": 0.328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022612.4164204958, "perf/iters_per_sec": 0.9644567567923049, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0368531227111817, "data/tokens_consumed": 75583455232, "data/tokens_consumed_B": 75.583455232, "train/loss_slope": 1.0919412248956494e-05} {"step": 36050, "timestamp": 1778233505.8266795, "grad/layer_0/attn": 0.0032219276763498783, "grad/layer_0/mlp": 0.0029869030695408583, "grad/layer_0/attn_mlp_ratio": 1.0786850103497487, "grad/layer_4/attn": 0.0022486404050141573, "grad/layer_4/mlp": 0.002504520583897829, "grad/layer_4/attn_mlp_ratio": 0.8978326349912827, "grad/layer_8/attn": 0.004921847488731146, "grad/layer_8/mlp": 0.003607872175052762, "grad/layer_8/attn_mlp_ratio": 1.3641967102782673, "grad/layer_12/attn": 0.005298932082951069, "grad/layer_12/mlp": 0.006066383793950081, "grad/layer_12/attn_mlp_ratio": 0.8734910575368663, "grad/layer_16/attn": 0.0036897631362080574, "grad/layer_16/mlp": 0.0050329104997217655, "grad/layer_16/attn_mlp_ratio": 0.7331270967563062, "grad/layer_20/attn": 0.0037884104531258345, "grad/layer_20/mlp": 0.006301401648670435, "grad/layer_20/attn_mlp_ratio": 0.6012012254138804, "grad/layer_24/attn": 0.006994185503572226, "grad/layer_24/mlp": 0.009658594615757465, "grad/layer_24/attn_mlp_ratio": 0.7241411105242461, "grad/layer_27/attn": 0.007790129166096449, "grad/layer_27/mlp": 0.009442133828997612, "grad/layer_27/attn_mlp_ratio": 0.8250390456941399} {"step": 36050, "timestamp": 1778233505.842856, "train/loss": 2.1742722511291506, "train/z_loss": 0.001435960887465626, "train/perplexity": 8.795781672238277, "train/grad_norm": 0.1650390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021870.5005658006, "perf/iters_per_sec": 0.9641029837445262, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037233591079712, "data/tokens_consumed": 75604426752, "data/tokens_consumed_B": 75.604426752, "train/loss_slope": 1.0185201512132701e-05} {"step": 36060, "timestamp": 1778233516.2272742, "train/loss": 2.1695456981658934, "train/z_loss": 0.0014392204815521837, "train/perplexity": 8.754306039912453, "train/grad_norm": 0.1298828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020698.9941564833, "perf/iters_per_sec": 0.9635443659574906, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037834930419922, "data/tokens_consumed": 75625398272, "data/tokens_consumed_B": 75.625398272, "train/loss_slope": 7.524248804732751e-06} {"step": 36070, "timestamp": 1778233526.6194282, "train/loss": 2.09564448595047, "train/z_loss": 0.0014501136145554483, "train/perplexity": 8.130679390622163, "train/grad_norm": 0.1279296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019008.1830959388, "perf/iters_per_sec": 0.9627381244163221, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0387040615081786, "data/tokens_consumed": 75646369792, "data/tokens_consumed_B": 75.646369792, "train/loss_slope": 1.733611283129182e-06} {"step": 36075, "timestamp": 1778233532.415382, "eos/sharpness": 49.99084472656249, "eos/L0_probe": 2.003035306930542, "eos/L_plus": 2.2791872024536133, "eos/L_minus": 2.2267918586730957, "eos/grad_norm": 0.16655856370925903, "eos/embed_grad_frac": 0.10120493173599243, "eos/time_s": 0.6183769702911377} {"step": 36075, "timestamp": 1778233533.7924335, "geo/rankme_last": 439.2560729980469, "geo/layer_0/stable_rank_q_proj": 18.94727897644043, "geo/layer_0/stable_rank_k_proj": 16.328271865844727, "geo/layer_0/stable_rank_o_proj": 49.53520965576172, "geo/layer_0/stable_rank_gate_proj": 139.60145568847656, "geo/layer_0/stable_rank_down_proj": 52.7275505065918, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05399215593934059, "geo/layer_0/attn_entropy_mean": 6.203330993652344, "geo/layer_0/attn_entropy_std": 0.36883544921875, "geo/layer_7/stable_rank_q_proj": 43.350914001464844, "geo/layer_7/stable_rank_k_proj": 41.765289306640625, "geo/layer_7/stable_rank_o_proj": 101.37572479248047, "geo/layer_7/stable_rank_gate_proj": 92.66312408447266, "geo/layer_7/stable_rank_down_proj": 145.4681854248047, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.516823410987854, "geo/layer_7/attn_entropy_mean": 4.666346549987793, "geo/layer_7/attn_entropy_std": 0.839748740196228, "geo/layer_14/stable_rank_q_proj": 54.74845504760742, "geo/layer_14/stable_rank_k_proj": 36.546016693115234, "geo/layer_14/stable_rank_o_proj": 49.826805114746094, "geo/layer_14/stable_rank_gate_proj": 78.18811798095703, "geo/layer_14/stable_rank_down_proj": 133.15985107421875, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3781384825706482, "geo/layer_14/attn_entropy_mean": 5.504174709320068, "geo/layer_14/attn_entropy_std": 0.3892931342124939, "geo/layer_21/stable_rank_q_proj": 43.94776916503906, "geo/layer_21/stable_rank_k_proj": 31.106435775756836, "geo/layer_21/stable_rank_o_proj": 77.67194366455078, "geo/layer_21/stable_rank_gate_proj": 74.7856216430664, "geo/layer_21/stable_rank_down_proj": 56.72720718383789, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14586785435676575, "geo/layer_21/attn_entropy_mean": 5.718140602111816, "geo/layer_21/attn_entropy_std": 0.28313854336738586, "geo/layer_27/stable_rank_q_proj": 42.540435791015625, "geo/layer_27/stable_rank_k_proj": 31.41185760498047, "geo/layer_27/stable_rank_o_proj": 117.33916473388672, "geo/layer_27/stable_rank_gate_proj": 86.25172424316406, "geo/layer_27/stable_rank_down_proj": 132.92526245117188, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0916270837187767, "geo/layer_27/attn_entropy_mean": 4.300826072692871, "geo/layer_27/attn_entropy_std": 0.6336459517478943, "attnres/final_alpha/block_0": 0.2407820224761963, "attnres/block_norm/0": 1.7086970806121826, "attnres/final_alpha/block_1": 0.0054252492263913155, "attnres/block_norm/1": 38420.140625, "attnres/final_alpha/block_2": 0.011760424822568893, "attnres/block_norm/2": 25609.8671875, "attnres/final_alpha/block_3": 0.013549022376537323, "attnres/block_norm/3": 44304.21875, "attnres/final_alpha/block_4": 0.01667836681008339, "attnres/block_norm/4": 12294.38671875, "attnres/final_alpha/block_5": 0.5939825177192688, "attnres/block_norm/5": 5842.984375, "attnres/final_alpha/block_6": 0.11782239377498627, "attnres/block_norm/6": 29234.07421875, "geo/tier1_time_s": 1.3569087982177734, "geo/step": 36075.0, "geo/rankme_slope": 1.7609778286314526e-05} {"step": 36080, "timestamp": 1778233538.9860086, "train/loss": 2.199742293357849, "train/z_loss": 0.0014466933673247695, "train/perplexity": 9.02268799317189, "train/grad_norm": 0.1494140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1696492.7855571127, "perf/iters_per_sec": 0.8089507987771571, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.236169123649597, "data/tokens_consumed": 75667341312, "data/tokens_consumed_B": 75.667341312, "train/loss_slope": 1.3095238385456432e-06} {"step": 36090, "timestamp": 1778233549.3668919, "train/loss": 2.215669846534729, "train/z_loss": 0.001428673602640629, "train/perplexity": 9.167547905550839, "train/grad_norm": 0.330078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021100.196684792, "perf/iters_per_sec": 0.9637356742309532, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0376289129257201, "data/tokens_consumed": 75688312832, "data/tokens_consumed_B": 75.688312832, "train/loss_slope": 5.174935017839538e-06} {"step": 36100, "timestamp": 1778233559.7354527, "grad/layer_0/attn": 0.00394936790689826, "grad/layer_0/mlp": 0.0042283120565116405, "grad/layer_0/attn_mlp_ratio": 0.9340294095402102, "grad/layer_4/attn": 0.003899188246577978, "grad/layer_4/mlp": 0.0029683697503060102, "grad/layer_4/attn_mlp_ratio": 1.3135789821393078, "grad/layer_8/attn": 0.005560618359595537, "grad/layer_8/mlp": 0.003923070151358843, "grad/layer_8/attn_mlp_ratio": 1.4174149335382138, "grad/layer_12/attn": 0.007430702447891235, "grad/layer_12/mlp": 0.008438398130238056, "grad/layer_12/attn_mlp_ratio": 0.8805821016202038, "grad/layer_16/attn": 0.004822175949811935, "grad/layer_16/mlp": 0.005280120298266411, "grad/layer_16/attn_mlp_ratio": 0.913270074560264, "grad/layer_20/attn": 0.009933230467140675, "grad/layer_20/mlp": 0.009508784860372543, "grad/layer_20/attn_mlp_ratio": 1.0446371969223187, "grad/layer_24/attn": 0.013383853249251842, "grad/layer_24/mlp": 0.014256863854825497, "grad/layer_24/attn_mlp_ratio": 0.9387655862930383, "grad/layer_27/attn": 0.009518195874989033, "grad/layer_27/mlp": 0.013371706008911133, "grad/layer_27/attn_mlp_ratio": 0.7118161136256161} {"step": 36100, "timestamp": 1778233559.75127, "train/loss": 2.149592399597168, "train/z_loss": 0.0014381186454556882, "train/perplexity": 8.581359918476048, "train/grad_norm": 0.1787109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020640.7840407637, "perf/iters_per_sec": 0.9635166092113322, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0378648281097411, "data/tokens_consumed": 75709284352, "data/tokens_consumed_B": 75.709284352, "train/loss_slope": 3.284284784527027e-06} {"step": 36110, "timestamp": 1778233570.1352677, "train/loss": 2.2272646903991697, "train/z_loss": 0.0014209304470568895, "train/perplexity": 9.274462825231318, "train/grad_norm": 0.208984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020589.6324651567, "perf/iters_per_sec": 0.9634922182393821, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0378911018371582, "data/tokens_consumed": 75730255872, "data/tokens_consumed_B": 75.730255872, "train/loss_slope": 3.6014375334704354e-06} {"step": 36120, "timestamp": 1778233580.5193317, "train/loss": 2.143964672088623, "train/z_loss": 0.0014351662481203674, "train/perplexity": 8.533202000125035, "train/grad_norm": 0.1474609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020871.7874734385, "perf/iters_per_sec": 0.963626760231704, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0377461910247803, "data/tokens_consumed": 75751227392, "data/tokens_consumed_B": 75.751227392, "train/loss_slope": 8.581402993509722e-07} {"step": 36130, "timestamp": 1778233590.913485, "train/loss": 2.193328833580017, "train/z_loss": 0.001433435664512217, "train/perplexity": 8.965006513280352, "train/grad_norm": 0.0927734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2018587.5205308674, "perf/iters_per_sec": 0.962537536874231, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.038920521736145, "data/tokens_consumed": 75772198912, "data/tokens_consumed_B": 75.772198912, "train/loss_slope": 4.985138592403845e-06} {"step": 36140, "timestamp": 1778233601.2970936, "train/loss": 2.179201769828796, "train/z_loss": 0.0014317218447104096, "train/perplexity": 8.839247687718425, "train/grad_norm": 0.1962890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020931.6360033664, "perf/iters_per_sec": 0.9636552982346375, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037715458869934, "data/tokens_consumed": 75793170432, "data/tokens_consumed_B": 75.793170432, "train/loss_slope": 7.367312597005234e-06} {"step": 36150, "timestamp": 1778233611.6760418, "grad/layer_0/attn": 0.002589720766991377, "grad/layer_0/mlp": 0.0025438752491027117, "grad/layer_0/attn_mlp_ratio": 1.0180218806337489, "grad/layer_4/attn": 0.0021862261928617954, "grad/layer_4/mlp": 0.0024960702285170555, "grad/layer_4/attn_mlp_ratio": 0.875867225328005, "grad/layer_8/attn": 0.004716407973319292, "grad/layer_8/mlp": 0.00349388737231493, "grad/layer_8/attn_mlp_ratio": 1.3499026544762651, "grad/layer_12/attn": 0.004752923734486103, "grad/layer_12/mlp": 0.006286372896283865, "grad/layer_12/attn_mlp_ratio": 0.7560677257451559, "grad/layer_16/attn": 0.004145825281739235, "grad/layer_16/mlp": 0.004277103114873171, "grad/layer_16/attn_mlp_ratio": 0.9693068119849363, "grad/layer_20/attn": 0.003606742015108466, "grad/layer_20/mlp": 0.005805658176541328, "grad/layer_20/attn_mlp_ratio": 0.6212460057599449, "grad/layer_24/attn": 0.005819744896143675, "grad/layer_24/mlp": 0.009410307742655277, "grad/layer_24/attn_mlp_ratio": 0.6184436251664149, "grad/layer_27/attn": 0.005542835686355829, "grad/layer_27/mlp": 0.007255328353494406, "grad/layer_27/attn_mlp_ratio": 0.7639675752634213} {"step": 36150, "timestamp": 1778233612.281493, "eos/sharpness": 15.359520912170407, "eos/L0_probe": 2.004899024963379, "eos/L_plus": 2.076181411743164, "eos/L_minus": 2.087211847305298, "eos/grad_norm": 0.10414964705705643, "eos/embed_grad_frac": 0.23134303092956543, "eos/time_s": 0.6027536392211914} {"step": 36150, "timestamp": 1778233612.3008826, "train/loss": 2.191042900085449, "train/z_loss": 0.001440108602400869, "train/perplexity": 8.944536510059443, "train/grad_norm": 0.10400390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1906985.2878612196, "perf/iters_per_sec": 0.9093214453989122, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.099721121788025, "data/tokens_consumed": 75814141952, "data/tokens_consumed_B": 75.814141952, "train/loss_slope": 7.2054630494711265e-06} {"step": 36150, "timestamp": 1778233613.664552, "geo/rankme_last": 439.6049499511719, "geo/layer_0/stable_rank_q_proj": 19.00885009765625, "geo/layer_0/stable_rank_k_proj": 16.37700653076172, "geo/layer_0/stable_rank_o_proj": 49.51509475708008, "geo/layer_0/stable_rank_gate_proj": 139.50196838378906, "geo/layer_0/stable_rank_down_proj": 52.737579345703125, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05859595164656639, "geo/layer_0/attn_entropy_mean": 6.209268093109131, "geo/layer_0/attn_entropy_std": 0.3621707558631897, "geo/layer_7/stable_rank_q_proj": 43.243038177490234, "geo/layer_7/stable_rank_k_proj": 41.74161148071289, "geo/layer_7/stable_rank_o_proj": 101.38740539550781, "geo/layer_7/stable_rank_gate_proj": 92.71306610107422, "geo/layer_7/stable_rank_down_proj": 145.44924926757812, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5150028467178345, "geo/layer_7/attn_entropy_mean": 4.640817165374756, "geo/layer_7/attn_entropy_std": 0.824995219707489, "geo/layer_14/stable_rank_q_proj": 54.66300582885742, "geo/layer_14/stable_rank_k_proj": 36.57069396972656, "geo/layer_14/stable_rank_o_proj": 49.754783630371094, "geo/layer_14/stable_rank_gate_proj": 77.98075866699219, "geo/layer_14/stable_rank_down_proj": 133.4691619873047, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37929701805114746, "geo/layer_14/attn_entropy_mean": 5.480587482452393, "geo/layer_14/attn_entropy_std": 0.3766323924064636, "geo/layer_21/stable_rank_q_proj": 43.95588684082031, "geo/layer_21/stable_rank_k_proj": 31.097986221313477, "geo/layer_21/stable_rank_o_proj": 77.59376525878906, "geo/layer_21/stable_rank_gate_proj": 74.61058807373047, "geo/layer_21/stable_rank_down_proj": 56.759517669677734, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14871230721473694, "geo/layer_21/attn_entropy_mean": 5.719769477844238, "geo/layer_21/attn_entropy_std": 0.2927647531032562, "geo/layer_27/stable_rank_q_proj": 42.44544982910156, "geo/layer_27/stable_rank_k_proj": 31.448673248291016, "geo/layer_27/stable_rank_o_proj": 117.48521423339844, "geo/layer_27/stable_rank_gate_proj": 86.2865219116211, "geo/layer_27/stable_rank_down_proj": 133.07101440429688, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08896786719560623, "geo/layer_27/attn_entropy_mean": 4.294256210327148, "geo/layer_27/attn_entropy_std": 0.6464682221412659, "attnres/final_alpha/block_0": 0.24158434569835663, "attnres/block_norm/0": 1.7090749740600586, "attnres/final_alpha/block_1": 0.0054801031947135925, "attnres/block_norm/1": 38538.96875, "attnres/final_alpha/block_2": 0.01159842498600483, "attnres/block_norm/2": 25823.44921875, "attnres/final_alpha/block_3": 0.013606904074549675, "attnres/block_norm/3": 44140.70703125, "attnres/final_alpha/block_4": 0.01711367443203926, "attnres/block_norm/4": 12311.9375, "attnres/final_alpha/block_5": 0.5910511612892151, "attnres/block_norm/5": 5867.365234375, "attnres/final_alpha/block_6": 0.11956541240215302, "attnres/block_norm/6": 29211.80859375, "geo/tier1_time_s": 1.359834909439087, "geo/step": 36150.0, "geo/rankme_slope": 1.6170100852841136e-05} {"step": 36160, "timestamp": 1778233624.051512, "train/loss": 2.149374270439148, "train/z_loss": 0.0014431840856559575, "train/perplexity": 8.579488277799488, "train/grad_norm": 0.0947265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1785271.8984812095, "perf/iters_per_sec": 0.8512839786916778, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.174696135520935, "data/tokens_consumed": 75835113472, "data/tokens_consumed_B": 75.835113472, "train/loss_slope": 2.6324546555302152e-06} {"step": 36170, "timestamp": 1778233634.4392865, "train/loss": 2.1878644704818724, "train/z_loss": 0.0014379407861270011, "train/perplexity": 8.91615206330447, "train/grad_norm": 0.208984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020246.7238593784, "perf/iters_per_sec": 0.9633287066742794, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0380672693252564, "data/tokens_consumed": 75856084992, "data/tokens_consumed_B": 75.856084992, "train/loss_slope": 1.319585059664058e-06} {"step": 36180, "timestamp": 1778233644.8380723, "train/loss": 2.1822601318359376, "train/z_loss": 0.001441032241564244, "train/perplexity": 8.8663226884911, "train/grad_norm": 0.11376953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021763.0567042457, "perf/iters_per_sec": 0.964051750518916, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0372887134552002, "data/tokens_consumed": 75877056512, "data/tokens_consumed_B": 75.877056512, "train/loss_slope": 1.2344631317531944e-06} {"step": 36190, "timestamp": 1778233655.2185318, "train/loss": 2.189268708229065, "train/z_loss": 0.0014287375495769083, "train/perplexity": 8.92868125551495, "train/grad_norm": 0.271484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021450.363394835, "perf/iters_per_sec": 0.9639026467298675, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0374491691589356, "data/tokens_consumed": 75898028032, "data/tokens_consumed_B": 75.898028032, "train/loss_slope": -1.4660742130503042e-06} {"step": 36200, "timestamp": 1778233665.587926, "grad/layer_0/attn": 0.002647869987413287, "grad/layer_0/mlp": 0.0027870014309883118, "grad/layer_0/attn_mlp_ratio": 0.9500784115013792, "grad/layer_4/attn": 0.002401673002168536, "grad/layer_4/mlp": 0.002411456312984228, "grad/layer_4/attn_mlp_ratio": 0.9959429451998326, "grad/layer_8/attn": 0.003576225135475397, "grad/layer_8/mlp": 0.0034494970459491014, "grad/layer_8/attn_mlp_ratio": 1.0367381053424316, "grad/layer_12/attn": 0.0041995421051979065, "grad/layer_12/mlp": 0.006260924506932497, "grad/layer_12/attn_mlp_ratio": 0.6707543004986682, "grad/layer_16/attn": 0.003385247429832816, "grad/layer_16/mlp": 0.0043810163624584675, "grad/layer_16/attn_mlp_ratio": 0.7727082193918848, "grad/layer_20/attn": 0.004473094362765551, "grad/layer_20/mlp": 0.005824805237352848, "grad/layer_20/attn_mlp_ratio": 0.7679388586740998, "grad/layer_24/attn": 0.006751649547368288, "grad/layer_24/mlp": 0.00823031272739172, "grad/layer_24/attn_mlp_ratio": 0.8203393587784151, "grad/layer_27/attn": 0.009679009206593037, "grad/layer_27/mlp": 0.007105005439370871, "grad/layer_27/attn_mlp_ratio": 1.3622803181445637} {"step": 36200, "timestamp": 1778233665.603792, "train/loss": 2.2418421506881714, "train/z_loss": 0.0014204294653609394, "train/perplexity": 9.410651167154386, "train/grad_norm": 0.1083984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020478.7510541305, "perf/iters_per_sec": 0.9634393458624508, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0379480600357056, "data/tokens_consumed": 75918999552, "data/tokens_consumed_B": 75.918999552, "train/loss_slope": 8.630916301888303e-07} {"step": 36210, "timestamp": 1778233675.9886699, "train/loss": 2.134597110748291, "train/z_loss": 0.0014542541932314635, "train/perplexity": 8.453639940003402, "train/grad_norm": 0.1982421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020719.4659751633, "perf/iters_per_sec": 0.9635541276813332, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0378244161605834, "data/tokens_consumed": 75939971072, "data/tokens_consumed_B": 75.939971072, "train/loss_slope": -2.3643542509196703e-06} {"step": 36220, "timestamp": 1778233686.3712592, "train/loss": 2.1591433763504027, "train/z_loss": 0.0014367193914949894, "train/perplexity": 8.66371293742614, "train/grad_norm": 0.1259765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021011.5479066116, "perf/iters_per_sec": 0.9636934031994875, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0376744270324707, "data/tokens_consumed": 75960942592, "data/tokens_consumed_B": 75.960942592, "train/loss_slope": -3.8104285000682264e-06} {"step": 36225, "timestamp": 1778233692.1534128, "eos/sharpness": 77.34014987945555, "eos/L0_probe": 2.0037925243377686, "eos/L_plus": 2.5027947425842285, "eos/L_minus": 2.2781918048858643, "eos/grad_norm": 0.2970104515552521, "eos/embed_grad_frac": 0.026429761201143265, "eos/time_s": 0.600482702255249} {"step": 36225, "timestamp": 1778233693.5302103, "geo/rankme_last": 439.28497314453125, "geo/layer_0/stable_rank_q_proj": 18.991943359375, "geo/layer_0/stable_rank_k_proj": 16.375913619995117, "geo/layer_0/stable_rank_o_proj": 49.60420227050781, "geo/layer_0/stable_rank_gate_proj": 139.7532501220703, "geo/layer_0/stable_rank_down_proj": 52.69207763671875, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05644999071955681, "geo/layer_0/attn_entropy_mean": 6.20858097076416, "geo/layer_0/attn_entropy_std": 0.3654732406139374, "geo/layer_7/stable_rank_q_proj": 43.267906188964844, "geo/layer_7/stable_rank_k_proj": 41.71665954589844, "geo/layer_7/stable_rank_o_proj": 101.27655792236328, "geo/layer_7/stable_rank_gate_proj": 92.64608764648438, "geo/layer_7/stable_rank_down_proj": 145.60874938964844, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5201348066329956, "geo/layer_7/attn_entropy_mean": 4.658876419067383, "geo/layer_7/attn_entropy_std": 0.8259682655334473, "geo/layer_14/stable_rank_q_proj": 54.6979866027832, "geo/layer_14/stable_rank_k_proj": 36.6089973449707, "geo/layer_14/stable_rank_o_proj": 49.83445739746094, "geo/layer_14/stable_rank_gate_proj": 78.0259780883789, "geo/layer_14/stable_rank_down_proj": 132.96548461914062, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3811684250831604, "geo/layer_14/attn_entropy_mean": 5.505887508392334, "geo/layer_14/attn_entropy_std": 0.3844415247440338, "geo/layer_21/stable_rank_q_proj": 43.86366653442383, "geo/layer_21/stable_rank_k_proj": 31.200485229492188, "geo/layer_21/stable_rank_o_proj": 77.53888702392578, "geo/layer_21/stable_rank_gate_proj": 74.5942611694336, "geo/layer_21/stable_rank_down_proj": 56.84487533569336, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14768919348716736, "geo/layer_21/attn_entropy_mean": 5.737212181091309, "geo/layer_21/attn_entropy_std": 0.2899119555950165, "geo/layer_27/stable_rank_q_proj": 42.42008590698242, "geo/layer_27/stable_rank_k_proj": 31.416135787963867, "geo/layer_27/stable_rank_o_proj": 117.19319915771484, "geo/layer_27/stable_rank_gate_proj": 86.25225830078125, "geo/layer_27/stable_rank_down_proj": 132.94776916503906, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08903467655181885, "geo/layer_27/attn_entropy_mean": 4.295161724090576, "geo/layer_27/attn_entropy_std": 0.641720712184906, "attnres/final_alpha/block_0": 0.2410845309495926, "attnres/block_norm/0": 1.7093210220336914, "attnres/final_alpha/block_1": 0.0053610652685165405, "attnres/block_norm/1": 38587.27734375, "attnres/final_alpha/block_2": 0.011641569435596466, "attnres/block_norm/2": 25681.166015625, "attnres/final_alpha/block_3": 0.013622596859931946, "attnres/block_norm/3": 44435.0546875, "attnres/final_alpha/block_4": 0.01675124280154705, "attnres/block_norm/4": 12331.236328125, "attnres/final_alpha/block_5": 0.5946069359779358, "attnres/block_norm/5": 5780.658203125, "attnres/final_alpha/block_6": 0.11693201959133148, "attnres/block_norm/6": 29224.744140625, "geo/tier1_time_s": 1.357841968536377, "geo/step": 36225.0, "geo/rankme_slope": -3.670882415466183e-06} {"step": 36230, "timestamp": 1778233698.7287972, "train/loss": 2.1589951276779176, "train/z_loss": 0.0014387860079295932, "train/perplexity": 8.662428648683779, "train/grad_norm": 0.11474609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1697920.482917761, "perf/iters_per_sec": 0.8096315779293828, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2351296901702882, "data/tokens_consumed": 75981914112, "data/tokens_consumed_B": 75.981914112, "train/loss_slope": -2.213364186817753e-06} {"step": 36240, "timestamp": 1778233709.1138334, "train/loss": 2.2169128179550173, "train/z_loss": 0.0014228802174329758, "train/perplexity": 9.178949990356337, "train/grad_norm": 0.185546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020700.6188888438, "perf/iters_per_sec": 0.9635451406902522, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037834095954895, "data/tokens_consumed": 76002885632, "data/tokens_consumed_B": 76.002885632, "train/loss_slope": -2.1945452139323037e-06} {"step": 36250, "timestamp": 1778233720.013557, "grad/layer_0/attn": 0.0028386570047587156, "grad/layer_0/mlp": 0.0028127788100391626, "grad/layer_0/attn_mlp_ratio": 1.0092001879803603, "grad/layer_4/attn": 0.0034308908507227898, "grad/layer_4/mlp": 0.002720397664234042, "grad/layer_4/attn_mlp_ratio": 1.2611725005180612, "grad/layer_8/attn": 0.004225480370223522, "grad/layer_8/mlp": 0.0038755720015615225, "grad/layer_8/attn_mlp_ratio": 1.0902855783591323, "grad/layer_12/attn": 0.005750181619077921, "grad/layer_12/mlp": 0.006386578548699617, "grad/layer_12/attn_mlp_ratio": 0.9003539978715718, "grad/layer_16/attn": 0.0050925761461257935, "grad/layer_16/mlp": 0.0043875244446098804, "grad/layer_16/attn_mlp_ratio": 1.1606946227530686, "grad/layer_20/attn": 0.0034147994592785835, "grad/layer_20/mlp": 0.006050193216651678, "grad/layer_20/attn_mlp_ratio": 0.5644116279524793, "grad/layer_24/attn": 0.0064292787574231625, "grad/layer_24/mlp": 0.00867550354450941, "grad/layer_24/attn_mlp_ratio": 0.7410842091562202, "grad/layer_27/attn": 0.004272435326129198, "grad/layer_27/mlp": 0.006837718188762665, "grad/layer_27/attn_mlp_ratio": 0.6248334818283844} {"step": 36250, "timestamp": 1778233720.029543, "train/loss": 2.2109251499176024, "train/z_loss": 0.0014235648210160433, "train/perplexity": 9.124153699595153, "train/grad_norm": 0.10302734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1922360.7490361417, "perf/iters_per_sec": 0.9166530366116246, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0909253120422364, "data/tokens_consumed": 76023857152, "data/tokens_consumed_B": 76.023857152, "train/loss_slope": -2.1155618812957714e-06} {"step": 36260, "timestamp": 1778233730.406922, "train/loss": 2.1702863931655885, "train/z_loss": 0.00143181630410254, "train/perplexity": 8.760792712648483, "train/grad_norm": 0.09130859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021928.967464482, "perf/iters_per_sec": 0.9641308629343424, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037203598022461, "data/tokens_consumed": 76044828672, "data/tokens_consumed_B": 76.044828672, "train/loss_slope": -1.2784038761733593e-06} {"step": 36270, "timestamp": 1778233740.7856603, "train/loss": 2.1150463819503784, "train/z_loss": 0.0014482337981462478, "train/perplexity": 8.289970262191657, "train/grad_norm": 0.126953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021682.6206999677, "perf/iters_per_sec": 0.9640133956432189, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373299837112426, "data/tokens_consumed": 76065800192, "data/tokens_consumed_B": 76.065800192, "train/loss_slope": -5.766935381415872e-06} {"step": 36280, "timestamp": 1778233751.1596441, "train/loss": 2.145827829837799, "train/z_loss": 0.0014364471659064292, "train/perplexity": 8.54911552165387, "train/grad_norm": 0.16796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022636.415267607, "perf/iters_per_sec": 0.9644682003343615, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0368408203125, "data/tokens_consumed": 76086771712, "data/tokens_consumed_B": 76.086771712, "train/loss_slope": -8.248070400111062e-06} {"step": 36290, "timestamp": 1778233761.5410266, "train/loss": 2.215724062919617, "train/z_loss": 0.0014231955050490797, "train/perplexity": 9.168044950330428, "train/grad_norm": 0.2490234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020938.2293087672, "perf/iters_per_sec": 0.963658442167648, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037712073326111, "data/tokens_consumed": 76107743232, "data/tokens_consumed_B": 76.107743232, "train/loss_slope": -8.091394435597625e-06} {"step": 36300, "timestamp": 1778233771.9046705, "grad/layer_0/attn": 0.003296402283012867, "grad/layer_0/mlp": 0.0028705152217298746, "grad/layer_0/attn_mlp_ratio": 1.1483660296320375, "grad/layer_4/attn": 0.0018421470886096358, "grad/layer_4/mlp": 0.002640737686306238, "grad/layer_4/attn_mlp_ratio": 0.6975880370108082, "grad/layer_8/attn": 0.007437464781105518, "grad/layer_8/mlp": 0.0036783432587981224, "grad/layer_8/attn_mlp_ratio": 2.0219604467636394, "grad/layer_12/attn": 0.0043662237003445625, "grad/layer_12/mlp": 0.007023385725915432, "grad/layer_12/attn_mlp_ratio": 0.6216693498787627, "grad/layer_16/attn": 0.005762127693742514, "grad/layer_16/mlp": 0.004656464327126741, "grad/layer_16/attn_mlp_ratio": 1.2374469479836698, "grad/layer_20/attn": 0.0036619410384446383, "grad/layer_20/mlp": 0.006232499610632658, "grad/layer_20/attn_mlp_ratio": 0.587555749452721, "grad/layer_24/attn": 0.01934181898832321, "grad/layer_24/mlp": 0.011715823784470558, "grad/layer_24/attn_mlp_ratio": 1.6509141123195772, "grad/layer_27/attn": 0.0065224976278841496, "grad/layer_27/mlp": 0.008380459621548653, "grad/layer_27/attn_mlp_ratio": 0.7782983087566032} {"step": 36300, "timestamp": 1778233772.5096045, "eos/sharpness": 64.1052722930908, "eos/L0_probe": 2.002629280090332, "eos/L_plus": 2.2417409420013428, "eos/L_minus": 2.4045703411102295, "eos/grad_norm": 0.17635934054851532, "eos/embed_grad_frac": 0.07558414340019226, "eos/time_s": 0.6022436618804932} {"step": 36300, "timestamp": 1778233772.5290565, "train/loss": 2.2403538465499877, "train/z_loss": 0.0014201338402926923, "train/perplexity": 9.396655673438307, "train/grad_norm": 0.1767578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1909528.4335693459, "perf/iters_per_sec": 0.9105341117712716, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.098256492614746, "data/tokens_consumed": 76128714752, "data/tokens_consumed_B": 76.128714752, "train/loss_slope": -6.53936154771563e-06} {"step": 36300, "timestamp": 1778233773.8930633, "geo/rankme_last": 439.62213134765625, "geo/layer_0/stable_rank_q_proj": 19.010202407836914, "geo/layer_0/stable_rank_k_proj": 16.332473754882812, "geo/layer_0/stable_rank_o_proj": 49.58184051513672, "geo/layer_0/stable_rank_gate_proj": 140.0062713623047, "geo/layer_0/stable_rank_down_proj": 52.73286437988281, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.054412804543972015, "geo/layer_0/attn_entropy_mean": 6.205730438232422, "geo/layer_0/attn_entropy_std": 0.3649844229221344, "geo/layer_7/stable_rank_q_proj": 43.243675231933594, "geo/layer_7/stable_rank_k_proj": 41.739498138427734, "geo/layer_7/stable_rank_o_proj": 101.30348205566406, "geo/layer_7/stable_rank_gate_proj": 92.72542572021484, "geo/layer_7/stable_rank_down_proj": 145.958740234375, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5090333223342896, "geo/layer_7/attn_entropy_mean": 4.649052143096924, "geo/layer_7/attn_entropy_std": 0.8138555884361267, "geo/layer_14/stable_rank_q_proj": 54.76273727416992, "geo/layer_14/stable_rank_k_proj": 36.61515808105469, "geo/layer_14/stable_rank_o_proj": 49.74938201904297, "geo/layer_14/stable_rank_gate_proj": 78.01435089111328, "geo/layer_14/stable_rank_down_proj": 133.0223388671875, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38748252391815186, "geo/layer_14/attn_entropy_mean": 5.505197525024414, "geo/layer_14/attn_entropy_std": 0.37579819560050964, "geo/layer_21/stable_rank_q_proj": 43.86530685424805, "geo/layer_21/stable_rank_k_proj": 31.122161865234375, "geo/layer_21/stable_rank_o_proj": 77.5203857421875, "geo/layer_21/stable_rank_gate_proj": 74.61431884765625, "geo/layer_21/stable_rank_down_proj": 56.799896240234375, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14722396433353424, "geo/layer_21/attn_entropy_mean": 5.702089786529541, "geo/layer_21/attn_entropy_std": 0.29600754380226135, "geo/layer_27/stable_rank_q_proj": 42.4323844909668, "geo/layer_27/stable_rank_k_proj": 31.39130210876465, "geo/layer_27/stable_rank_o_proj": 117.28482818603516, "geo/layer_27/stable_rank_gate_proj": 86.22179412841797, "geo/layer_27/stable_rank_down_proj": 133.0342559814453, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0858897939324379, "geo/layer_27/attn_entropy_mean": 4.30120325088501, "geo/layer_27/attn_entropy_std": 0.6500255465507507, "attnres/final_alpha/block_0": 0.24447080492973328, "attnres/block_norm/0": 1.7094261646270752, "attnres/final_alpha/block_1": 0.005513922311365604, "attnres/block_norm/1": 38566.22265625, "attnres/final_alpha/block_2": 0.011890126392245293, "attnres/block_norm/2": 25582.9140625, "attnres/final_alpha/block_3": 0.01384084764868021, "attnres/block_norm/3": 44331.33203125, "attnres/final_alpha/block_4": 0.017004305496811867, "attnres/block_norm/4": 12362.533203125, "attnres/final_alpha/block_5": 0.5865863561630249, "attnres/block_norm/5": 5881.34814453125, "attnres/final_alpha/block_6": 0.12069365382194519, "attnres/block_norm/6": 29050.15234375, "geo/tier1_time_s": 1.3602499961853027, "geo/step": 36300.0, "geo/rankme_slope": 2.509583911689676e-05} {"step": 36310, "timestamp": 1778233784.2751632, "train/loss": 2.153830409049988, "train/z_loss": 0.0014423303538933397, "train/perplexity": 8.617804975628674, "train/grad_norm": 0.12158203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1785959.0187261996, "perf/iters_per_sec": 0.8516116231566427, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.17424418926239, "data/tokens_consumed": 76149686272, "data/tokens_consumed_B": 76.149686272, "train/loss_slope": -7.1296357479032815e-06} {"step": 36320, "timestamp": 1778233794.6532826, "train/loss": 2.1933114290237428, "train/z_loss": 0.0014269612147472799, "train/perplexity": 8.964850482677818, "train/grad_norm": 0.09765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021710.7330761433, "perf/iters_per_sec": 0.9640268006687848, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037315559387207, "data/tokens_consumed": 76170657792, "data/tokens_consumed_B": 76.170657792, "train/loss_slope": -6.596458813037537e-06} {"step": 36330, "timestamp": 1778233805.0320787, "train/loss": 2.149701714515686, "train/z_loss": 0.001427903480362147, "train/perplexity": 8.582298040410738, "train/grad_norm": 0.12451171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021623.3317061996, "perf/iters_per_sec": 0.9639851244479177, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373604059219361, "data/tokens_consumed": 76191629312, "data/tokens_consumed_B": 76.191629312, "train/loss_slope": -8.76659576577395e-06} {"step": 36340, "timestamp": 1778233815.4110696, "train/loss": 2.1660447835922243, "train/z_loss": 0.001428684452548623, "train/perplexity": 8.723711547914615, "train/grad_norm": 0.11083984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021952.7177030547, "perf/iters_per_sec": 0.96414218793061, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0371914148330688, "data/tokens_consumed": 76212600832, "data/tokens_consumed_B": 76.212600832, "train/loss_slope": -1.1418731807052924e-05} {"step": 36350, "timestamp": 1778233825.780202, "grad/layer_0/attn": 0.0025780880823731422, "grad/layer_0/mlp": 0.0026911278255283833, "grad/layer_0/attn_mlp_ratio": 0.957995366150033, "grad/layer_4/attn": 0.0016726123867556453, "grad/layer_4/mlp": 0.0023910345043987036, "grad/layer_4/attn_mlp_ratio": 0.6995349978116578, "grad/layer_8/attn": 0.0064559862948954105, "grad/layer_8/mlp": 0.003495698794722557, "grad/layer_8/attn_mlp_ratio": 1.8468370673006156, "grad/layer_12/attn": 0.004800392314791679, "grad/layer_12/mlp": 0.006476843263953924, "grad/layer_12/attn_mlp_ratio": 0.741162329400719, "grad/layer_16/attn": 0.0041944184340536594, "grad/layer_16/mlp": 0.004657858982682228, "grad/layer_16/attn_mlp_ratio": 0.9005035059236494, "grad/layer_20/attn": 0.00395035557448864, "grad/layer_20/mlp": 0.006380216218531132, "grad/layer_20/attn_mlp_ratio": 0.6191569967643511, "grad/layer_24/attn": 0.023124994710087776, "grad/layer_24/mlp": 0.012584545649588108, "grad/layer_24/attn_mlp_ratio": 1.837570872261691, "grad/layer_27/attn": 0.004619174171239138, "grad/layer_27/mlp": 0.012567702680826187, "grad/layer_27/attn_mlp_ratio": 0.3675432377575274} {"step": 36350, "timestamp": 1778233825.7961793, "train/loss": 2.179018270969391, "train/z_loss": 0.0014259617659263314, "train/perplexity": 8.837625844657456, "train/grad_norm": 0.2197265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020420.1359538184, "perf/iters_per_sec": 0.9634113960045902, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037978172302246, "data/tokens_consumed": 76233572352, "data/tokens_consumed_B": 76.233572352, "train/loss_slope": -9.544151579693469e-06} {"step": 36360, "timestamp": 1778233836.1685333, "train/loss": 2.1756899118423463, "train/z_loss": 0.0014388632611371578, "train/perplexity": 8.808259949248537, "train/grad_norm": 0.138671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023015.3097373648, "perf/iters_per_sec": 0.9646488712965797, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0366466283798217, "data/tokens_consumed": 76254543872, "data/tokens_consumed_B": 76.254543872, "train/loss_slope": -1.0440125524049869e-05} {"step": 36370, "timestamp": 1778233846.5483658, "train/loss": 2.1567906618118284, "train/z_loss": 0.001435768883675337, "train/perplexity": 8.643353653223365, "train/grad_norm": 0.1435546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021735.8257747353, "perf/iters_per_sec": 0.964038765799873, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373026847839355, "data/tokens_consumed": 76275515392, "data/tokens_consumed_B": 76.275515392, "train/loss_slope": -9.582455881429489e-06} {"step": 36375, "timestamp": 1778233852.3344576, "eos/sharpness": 29.718303680419915, "eos/L0_probe": 2.003833293914795, "eos/L_plus": 2.157492160797119, "eos/L_minus": 2.14735746383667, "eos/grad_norm": 0.13140994310379028, "eos/embed_grad_frac": 0.13383781909942627, "eos/time_s": 0.6040103435516357} {"step": 36375, "timestamp": 1778233853.7122173, "geo/rankme_last": 438.498291015625, "geo/layer_0/stable_rank_q_proj": 19.040578842163086, "geo/layer_0/stable_rank_k_proj": 16.338930130004883, "geo/layer_0/stable_rank_o_proj": 49.502559661865234, "geo/layer_0/stable_rank_gate_proj": 139.74317932128906, "geo/layer_0/stable_rank_down_proj": 52.77547836303711, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05961855128407478, "geo/layer_0/attn_entropy_mean": 6.206928253173828, "geo/layer_0/attn_entropy_std": 0.36792850494384766, "geo/layer_7/stable_rank_q_proj": 43.27690124511719, "geo/layer_7/stable_rank_k_proj": 41.733577728271484, "geo/layer_7/stable_rank_o_proj": 101.10610961914062, "geo/layer_7/stable_rank_gate_proj": 92.7018051147461, "geo/layer_7/stable_rank_down_proj": 145.85928344726562, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5041247010231018, "geo/layer_7/attn_entropy_mean": 4.652752876281738, "geo/layer_7/attn_entropy_std": 0.820465087890625, "geo/layer_14/stable_rank_q_proj": 54.613677978515625, "geo/layer_14/stable_rank_k_proj": 36.690059661865234, "geo/layer_14/stable_rank_o_proj": 49.606361389160156, "geo/layer_14/stable_rank_gate_proj": 78.0293197631836, "geo/layer_14/stable_rank_down_proj": 133.14273071289062, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3737601935863495, "geo/layer_14/attn_entropy_mean": 5.458560943603516, "geo/layer_14/attn_entropy_std": 0.3775259852409363, "geo/layer_21/stable_rank_q_proj": 43.907073974609375, "geo/layer_21/stable_rank_k_proj": 31.08595848083496, "geo/layer_21/stable_rank_o_proj": 77.5045394897461, "geo/layer_21/stable_rank_gate_proj": 74.58502960205078, "geo/layer_21/stable_rank_down_proj": 56.82081604003906, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14683271944522858, "geo/layer_21/attn_entropy_mean": 5.71476936340332, "geo/layer_21/attn_entropy_std": 0.2914506494998932, "geo/layer_27/stable_rank_q_proj": 42.45608901977539, "geo/layer_27/stable_rank_k_proj": 31.360790252685547, "geo/layer_27/stable_rank_o_proj": 117.46168518066406, "geo/layer_27/stable_rank_gate_proj": 86.20622253417969, "geo/layer_27/stable_rank_down_proj": 133.07275390625, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09193862974643707, "geo/layer_27/attn_entropy_mean": 4.302128314971924, "geo/layer_27/attn_entropy_std": 0.6403689980506897, "attnres/final_alpha/block_0": 0.24119220674037933, "attnres/block_norm/0": 1.709672212600708, "attnres/final_alpha/block_1": 0.005461357533931732, "attnres/block_norm/1": 38674.94921875, "attnres/final_alpha/block_2": 0.011623548343777657, "attnres/block_norm/2": 25725.10546875, "attnres/final_alpha/block_3": 0.013573028147220612, "attnres/block_norm/3": 44679.625, "attnres/final_alpha/block_4": 0.01666940189898014, "attnres/block_norm/4": 12368.3984375, "attnres/final_alpha/block_5": 0.5937931537628174, "attnres/block_norm/5": 5874.6328125, "attnres/final_alpha/block_6": 0.11768728494644165, "attnres/block_norm/6": 29540.01953125, "geo/tier1_time_s": 1.3569471836090088, "geo/step": 36375.0, "geo/rankme_slope": -1.1341333408363345e-05} {"step": 36380, "timestamp": 1778233858.899196, "train/loss": 2.144540536403656, "train/z_loss": 0.0014268895727582275, "train/perplexity": 8.53811738181049, "train/grad_norm": 0.142578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1698723.8871922537, "perf/iters_per_sec": 0.8100146709405202, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2345455408096313, "data/tokens_consumed": 76296486912, "data/tokens_consumed_B": 76.296486912, "train/loss_slope": -1.0819759871056213e-05} {"step": 36390, "timestamp": 1778233869.2730076, "train/loss": 2.189530944824219, "train/z_loss": 0.001425209012813866, "train/perplexity": 8.931022989517366, "train/grad_norm": 0.1015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022529.3085352157, "perf/iters_per_sec": 0.9644171278644637, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0368957281112672, "data/tokens_consumed": 76317458432, "data/tokens_consumed_B": 76.317458432, "train/loss_slope": -9.270818098293126e-06} {"step": 36400, "timestamp": 1778233879.6374698, "grad/layer_0/attn": 0.0029877102933824062, "grad/layer_0/mlp": 0.002820476656779647, "grad/layer_0/attn_mlp_ratio": 1.059292648379667, "grad/layer_4/attn": 0.002542968839406967, "grad/layer_4/mlp": 0.0024421398993581533, "grad/layer_4/attn_mlp_ratio": 1.0412870843093818, "grad/layer_8/attn": 0.005111780017614365, "grad/layer_8/mlp": 0.0035238023847341537, "grad/layer_8/attn_mlp_ratio": 1.4506431730381202, "grad/layer_12/attn": 0.0046851746737957, "grad/layer_12/mlp": 0.006382500287145376, "grad/layer_12/attn_mlp_ratio": 0.7340657092997345, "grad/layer_16/attn": 0.004326427821069956, "grad/layer_16/mlp": 0.004503391683101654, "grad/layer_16/attn_mlp_ratio": 0.9607042934403952, "grad/layer_20/attn": 0.004331635776907206, "grad/layer_20/mlp": 0.006987083703279495, "grad/layer_20/attn_mlp_ratio": 0.6199490229205618, "grad/layer_24/attn": 0.025740714743733406, "grad/layer_24/mlp": 0.014865295961499214, "grad/layer_24/attn_mlp_ratio": 1.731597852961791, "grad/layer_27/attn": 0.005665966775268316, "grad/layer_27/mlp": 0.014278993010520935, "grad/layer_27/attn_mlp_ratio": 0.3968043636839886} {"step": 36400, "timestamp": 1778233879.6535413, "train/loss": 2.18471417427063, "train/z_loss": 0.001433433045167476, "train/perplexity": 8.88810774039619, "train/grad_norm": 0.28125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021165.0745008367, "perf/iters_per_sec": 0.9637666103843864, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0375956058502198, "data/tokens_consumed": 76338429952, "data/tokens_consumed_B": 76.338429952, "train/loss_slope": -9.213870779873527e-06} {"step": 36410, "timestamp": 1778233890.0319505, "train/loss": 2.1391236543655396, "train/z_loss": 0.001436005183495581, "train/perplexity": 8.491992446577667, "train/grad_norm": 0.19140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021915.6285340767, "perf/iters_per_sec": 0.9641245024366745, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0372104406356812, "data/tokens_consumed": 76359401472, "data/tokens_consumed_B": 76.359401472, "train/loss_slope": -1.1423158280812945e-05} {"step": 36420, "timestamp": 1778233900.4072146, "train/loss": 2.1907294273376463, "train/z_loss": 0.0014259709743782878, "train/perplexity": 8.941733081044065, "train/grad_norm": 0.12890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022331.3095461465, "perf/iters_per_sec": 0.9643227145891888, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0369972467422486, "data/tokens_consumed": 76380372992, "data/tokens_consumed_B": 76.380372992, "train/loss_slope": -1.2597519286287596e-05} {"step": 36430, "timestamp": 1778233910.7876358, "train/loss": 2.160067319869995, "train/z_loss": 0.0014276059926487505, "train/perplexity": 8.67172141797242, "train/grad_norm": 0.208984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022009.4230729754, "perf/iters_per_sec": 0.9641692271580579, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0371623277664184, "data/tokens_consumed": 76401344512, "data/tokens_consumed_B": 76.401344512, "train/loss_slope": -1.3938532324835873e-05} {"step": 36440, "timestamp": 1778233921.16648, "train/loss": 2.1397465825080872, "train/z_loss": 0.0014391234028153122, "train/perplexity": 8.497283995615238, "train/grad_norm": 0.1005859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021782.5277333062, "perf/iters_per_sec": 0.9640610350290805, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0372787237167358, "data/tokens_consumed": 76422316032, "data/tokens_consumed_B": 76.422316032, "train/loss_slope": -1.4887276555624138e-05} {"step": 36450, "timestamp": 1778233931.5313244, "grad/layer_0/attn": 0.002729725791141391, "grad/layer_0/mlp": 0.0027448595501482487, "grad/layer_0/attn_mlp_ratio": 0.9944864725575165, "grad/layer_4/attn": 0.0024071726948022842, "grad/layer_4/mlp": 0.0025073981378227472, "grad/layer_4/attn_mlp_ratio": 0.9600280715250514, "grad/layer_8/attn": 0.005040301475673914, "grad/layer_8/mlp": 0.003742558415979147, "grad/layer_8/attn_mlp_ratio": 1.3467528841977892, "grad/layer_12/attn": 0.005154664162546396, "grad/layer_12/mlp": 0.007201462052762508, "grad/layer_12/attn_mlp_ratio": 0.7157802197945382, "grad/layer_16/attn": 0.004087598994374275, "grad/layer_16/mlp": 0.004877649247646332, "grad/layer_16/attn_mlp_ratio": 0.8380264145774874, "grad/layer_20/attn": 0.0034329853951931, "grad/layer_20/mlp": 0.0065471334382891655, "grad/layer_20/attn_mlp_ratio": 0.5243493774972208, "grad/layer_24/attn": 0.014903062023222446, "grad/layer_24/mlp": 0.012442750856280327, "grad/layer_24/attn_mlp_ratio": 1.1977304758077079, "grad/layer_27/attn": 0.00931016355752945, "grad/layer_27/mlp": 0.011945558711886406, "grad/layer_27/attn_mlp_ratio": 0.779382840446559} {"step": 36450, "timestamp": 1778233932.1442876, "eos/sharpness": 64.76528644561766, "eos/L0_probe": 2.000908613204956, "eos/L_plus": 2.2826573848724365, "eos/L_minus": 2.3668127059936523, "eos/grad_norm": 0.20154204964637756, "eos/embed_grad_frac": 0.06443320959806442, "eos/time_s": 0.6101605892181396} {"step": 36450, "timestamp": 1778233932.165032, "train/loss": 2.1991221427917482, "train/z_loss": 0.0014342798269353807, "train/perplexity": 9.017094302749586, "train/grad_norm": 0.2021484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1907883.7282117796, "perf/iters_per_sec": 0.9097498551424883, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0992032527923583, "data/tokens_consumed": 76443287552, "data/tokens_consumed_B": 76.443287552, "train/loss_slope": -1.5026195529270448e-05} {"step": 36450, "timestamp": 1778233933.5225549, "geo/rankme_last": 439.6871032714844, "geo/layer_0/stable_rank_q_proj": 19.035499572753906, "geo/layer_0/stable_rank_k_proj": 16.34419059753418, "geo/layer_0/stable_rank_o_proj": 49.59642028808594, "geo/layer_0/stable_rank_gate_proj": 139.70741271972656, "geo/layer_0/stable_rank_down_proj": 52.71902084350586, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0603695772588253, "geo/layer_0/attn_entropy_mean": 6.203407287597656, "geo/layer_0/attn_entropy_std": 0.36452266573905945, "geo/layer_7/stable_rank_q_proj": 43.434749603271484, "geo/layer_7/stable_rank_k_proj": 41.808021545410156, "geo/layer_7/stable_rank_o_proj": 100.86143493652344, "geo/layer_7/stable_rank_gate_proj": 92.49060821533203, "geo/layer_7/stable_rank_down_proj": 145.52085876464844, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5191434621810913, "geo/layer_7/attn_entropy_mean": 4.686013221740723, "geo/layer_7/attn_entropy_std": 0.8236786723136902, "geo/layer_14/stable_rank_q_proj": 54.625057220458984, "geo/layer_14/stable_rank_k_proj": 36.64045333862305, "geo/layer_14/stable_rank_o_proj": 49.5416374206543, "geo/layer_14/stable_rank_gate_proj": 78.03846740722656, "geo/layer_14/stable_rank_down_proj": 133.26504516601562, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3869488835334778, "geo/layer_14/attn_entropy_mean": 5.498200416564941, "geo/layer_14/attn_entropy_std": 0.3802773058414459, "geo/layer_21/stable_rank_q_proj": 43.904685974121094, "geo/layer_21/stable_rank_k_proj": 31.074188232421875, "geo/layer_21/stable_rank_o_proj": 77.41889953613281, "geo/layer_21/stable_rank_gate_proj": 74.54884338378906, "geo/layer_21/stable_rank_down_proj": 56.80461502075195, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.150170236825943, "geo/layer_21/attn_entropy_mean": 5.718424320220947, "geo/layer_21/attn_entropy_std": 0.2919314503669739, "geo/layer_27/stable_rank_q_proj": 42.52825927734375, "geo/layer_27/stable_rank_k_proj": 31.432838439941406, "geo/layer_27/stable_rank_o_proj": 117.57331085205078, "geo/layer_27/stable_rank_gate_proj": 86.0875015258789, "geo/layer_27/stable_rank_down_proj": 133.41455078125, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08618255704641342, "geo/layer_27/attn_entropy_mean": 4.304379463195801, "geo/layer_27/attn_entropy_std": 0.6479917168617249, "attnres/final_alpha/block_0": 0.2436123490333557, "attnres/block_norm/0": 1.709977626800537, "attnres/final_alpha/block_1": 0.005563684273511171, "attnres/block_norm/1": 38656.53515625, "attnres/final_alpha/block_2": 0.01180063746869564, "attnres/block_norm/2": 25746.93359375, "attnres/final_alpha/block_3": 0.013710921630263329, "attnres/block_norm/3": 44499.4375, "attnres/final_alpha/block_4": 0.017119362950325012, "attnres/block_norm/4": 12385.7763671875, "attnres/final_alpha/block_5": 0.5869358777999878, "attnres/block_norm/5": 5928.68359375, "attnres/final_alpha/block_6": 0.12125720083713531, "attnres/block_norm/6": 29231.97265625, "geo/tier1_time_s": 1.3537120819091797, "geo/step": 36450.0, "geo/rankme_slope": 1.8924581551370547e-05} {"step": 36460, "timestamp": 1778233943.9127243, "train/loss": 2.2049870252609254, "train/z_loss": 0.0014273963402956723, "train/perplexity": 9.070133884472433, "train/grad_norm": 0.11328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1785714.8267552147, "perf/iters_per_sec": 0.8514951833511423, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.174404764175415, "data/tokens_consumed": 76464259072, "data/tokens_consumed_B": 76.464259072, "train/loss_slope": -1.1659200227979455e-05} {"step": 36470, "timestamp": 1778233954.3120427, "train/loss": 2.155513381958008, "train/z_loss": 0.0014348866185173391, "train/perplexity": 8.632320719304735, "train/grad_norm": 0.2421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2017878.7354897964, "perf/iters_per_sec": 0.9621995618294699, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0392854452133178, "data/tokens_consumed": 76485230592, "data/tokens_consumed_B": 76.485230592, "train/loss_slope": -1.3337784710496926e-05} {"step": 36480, "timestamp": 1778233964.6921854, "train/loss": 2.163443422317505, "train/z_loss": 0.0014252490596845745, "train/perplexity": 8.701047513972938, "train/grad_norm": 0.1484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021529.7123593127, "perf/iters_per_sec": 0.9639404832645954, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037408447265625, "data/tokens_consumed": 76506202112, "data/tokens_consumed_B": 76.506202112, "train/loss_slope": -1.3781493489104892e-05} {"step": 36490, "timestamp": 1778233975.082839, "train/loss": 2.2331605672836305, "train/z_loss": 0.0014106584712862968, "train/perplexity": 9.329305429974296, "train/grad_norm": 0.1123046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019488.6450327293, "perf/iters_per_sec": 0.9629672265208861, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0384569406509399, "data/tokens_consumed": 76527173632, "data/tokens_consumed_B": 76.527173632, "train/loss_slope": -1.014634556431255e-05} {"step": 36500, "timestamp": 1778233985.4572814, "grad/layer_0/attn": 0.002497653942555189, "grad/layer_0/mlp": 0.002735957968980074, "grad/layer_0/attn_mlp_ratio": 0.9128992037097552, "grad/layer_4/attn": 0.001769741764292121, "grad/layer_4/mlp": 0.0026110352482646704, "grad/layer_4/attn_mlp_ratio": 0.6777931081891769, "grad/layer_8/attn": 0.006461575627326965, "grad/layer_8/mlp": 0.003659256035462022, "grad/layer_8/attn_mlp_ratio": 1.7658167092233676, "grad/layer_12/attn": 0.006983872037380934, "grad/layer_12/mlp": 0.006732023321092129, "grad/layer_12/attn_mlp_ratio": 1.0374105377440812, "grad/layer_16/attn": 0.003649350954219699, "grad/layer_16/mlp": 0.004581427201628685, "grad/layer_16/attn_mlp_ratio": 0.7965532821883619, "grad/layer_20/attn": 0.0035627633333206177, "grad/layer_20/mlp": 0.006512629333883524, "grad/layer_20/attn_mlp_ratio": 0.5470545145382422, "grad/layer_24/attn": 0.01447067130357027, "grad/layer_24/mlp": 0.010068266652524471, "grad/layer_24/attn_mlp_ratio": 1.4372554541169622, "grad/layer_27/attn": 0.006307619158178568, "grad/layer_27/mlp": 0.008676012977957726, "grad/layer_27/attn_mlp_ratio": 0.7270181708466649} {"step": 36500, "timestamp": 1778233985.473456, "train/loss": 2.1662805318832397, "train/z_loss": 0.001429012243170291, "train/perplexity": 8.725768390442376, "train/grad_norm": 0.173828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019914.0434641591, "perf/iters_per_sec": 0.9631700723000332, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03823823928833, "data/tokens_consumed": 76548145152, "data/tokens_consumed_B": 76.548145152, "train/loss_slope": -1.1521725149580952e-05} {"step": 36500, "timestamp": 1778233992.5116043, "geo/ww_alpha_mean": 8.02299158194996, "geo/ww_alpha_std": 5.636342417425179, "geo/ww_alpha_min": 1.357371967112492, "geo/ww_alpha_max": 46.99692898583475, "geo/ww_alpha_healthy_frac": 0.16751269035532995, "geo/ww_alpha_by_type/q_proj": 4.062932048664735, "geo/ww_alpha_by_type/k_proj": 4.563713395083657, "geo/ww_alpha_by_type/v_proj": 9.397506536876703, "geo/ww_alpha_by_type/o_proj": 9.092037518660586, "geo/ww_alpha_by_type/gate_proj": 8.043309682460125, "geo/ww_alpha_by_type/up_proj": 12.121085473533073, "geo/ww_alpha_by_type/down_proj": 9.002969804056374, "geo/twonn_id/layer_0": 0.7141320109367371, "geo/twonn_id/layer_7": 3.21822452545166, "geo/twonn_id/layer_14": 3.9817018508911133, "geo/twonn_id/layer_21": 7.06612491607666, "geo/twonn_id/layer_27": 5.993957996368408, "geo/tier2_time_s": 7.028737545013428} {"step": 36500, "timestamp": 1778233993.1623604, "eoc/jacobian_sigma/layer_0/attn": 951.82763671875, "eoc/jacobian_sigma/layer_0/mlp": 8751.4140625, "eoc/jacobian_sigma/layer_0": 8751.4140625, "eoc/jacobian_sigma/layer_7/attn": 1.1419496536254883, "eoc/jacobian_sigma/layer_7/mlp": 1.7256860733032227, "eoc/jacobian_sigma/layer_7": 1.7256860733032227, "eoc/jacobian_sigma/layer_14/attn": 1.6154911518096924, "eoc/jacobian_sigma/layer_14/mlp": 7.8862833976745605, "eoc/jacobian_sigma/layer_14": 7.8862833976745605, "eoc/jacobian_sigma/layer_21/attn": 1.0913622379302979, "eoc/jacobian_sigma/layer_21/mlp": 4.201053619384766, "eoc/jacobian_sigma/layer_21": 4.201053619384766, "eoc/jacobian_sigma/layer_27/attn": 3.8555281162261963, "eoc/jacobian_sigma/layer_27/mlp": 21.442890167236328, "eoc/jacobian_sigma/layer_27": 21.442890167236328, "eoc/layer0_sigma": 8751.4140625, "eoc/sigma_max": 21.442890167236328, "eoc/sigma_min": 1.7256860733032227, "eoc/sigma_mean": 8.81397831439972, "eoc/time_s": 0.6392152309417725} {"step": 36510, "timestamp": 1778234003.5648866, "train/loss": 2.1881587147712707, "train/z_loss": 0.0014339225832372903, "train/perplexity": 8.918775976149258, "train/grad_norm": 0.1328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1159706.1004375813, "perf/iters_per_sec": 0.5529909612834841, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.8083478212356567, "data/tokens_consumed": 76569116672, "data/tokens_consumed_B": 76.569116672, "train/loss_slope": -9.47111783140666e-06} {"step": 36520, "timestamp": 1778234013.9447608, "train/loss": 2.2204782724380494, "train/z_loss": 0.001418822002597153, "train/perplexity": 9.21173553169457, "train/grad_norm": 0.1435546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021599.1245823717, "perf/iters_per_sec": 0.9639735815917834, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373728275299072, "data/tokens_consumed": 76590088192, "data/tokens_consumed_B": 76.590088192, "train/loss_slope": -6.658939531247843e-06} {"step": 36525, "timestamp": 1778234019.7549417, "eos/sharpness": 42.004823684692376, "eos/L0_probe": 2.001879930496216, "eos/L_plus": 2.2333791255950928, "eos/L_minus": 2.1904289722442627, "eos/grad_norm": 0.13817527890205383, "eos/embed_grad_frac": 0.12553822994232178, "eos/time_s": 0.6316773891448975} {"step": 36525, "timestamp": 1778234021.1366413, "geo/rankme_last": 438.0677795410156, "geo/layer_0/stable_rank_q_proj": 19.066505432128906, "geo/layer_0/stable_rank_k_proj": 16.35483169555664, "geo/layer_0/stable_rank_o_proj": 49.58493423461914, "geo/layer_0/stable_rank_gate_proj": 139.74517822265625, "geo/layer_0/stable_rank_down_proj": 52.61325454711914, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06665083765983582, "geo/layer_0/attn_entropy_mean": 6.205492973327637, "geo/layer_0/attn_entropy_std": 0.36685824394226074, "geo/layer_7/stable_rank_q_proj": 43.50278854370117, "geo/layer_7/stable_rank_k_proj": 41.886566162109375, "geo/layer_7/stable_rank_o_proj": 101.06289672851562, "geo/layer_7/stable_rank_gate_proj": 92.4889144897461, "geo/layer_7/stable_rank_down_proj": 145.41162109375, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5129897594451904, "geo/layer_7/attn_entropy_mean": 4.61483907699585, "geo/layer_7/attn_entropy_std": 0.8232715725898743, "geo/layer_14/stable_rank_q_proj": 54.51184844970703, "geo/layer_14/stable_rank_k_proj": 36.70582580566406, "geo/layer_14/stable_rank_o_proj": 49.6046257019043, "geo/layer_14/stable_rank_gate_proj": 78.01995086669922, "geo/layer_14/stable_rank_down_proj": 132.83424377441406, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39495593309402466, "geo/layer_14/attn_entropy_mean": 5.520566463470459, "geo/layer_14/attn_entropy_std": 0.36017563939094543, "geo/layer_21/stable_rank_q_proj": 43.975032806396484, "geo/layer_21/stable_rank_k_proj": 31.10719871520996, "geo/layer_21/stable_rank_o_proj": 77.34294891357422, "geo/layer_21/stable_rank_gate_proj": 74.47708892822266, "geo/layer_21/stable_rank_down_proj": 56.790645599365234, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14315298199653625, "geo/layer_21/attn_entropy_mean": 5.715809345245361, "geo/layer_21/attn_entropy_std": 0.2917046844959259, "geo/layer_27/stable_rank_q_proj": 42.4398078918457, "geo/layer_27/stable_rank_k_proj": 31.484298706054688, "geo/layer_27/stable_rank_o_proj": 117.55583953857422, "geo/layer_27/stable_rank_gate_proj": 85.87744903564453, "geo/layer_27/stable_rank_down_proj": 133.56253051757812, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09646030515432358, "geo/layer_27/attn_entropy_mean": 4.285865783691406, "geo/layer_27/attn_entropy_std": 0.6457062363624573, "attnres/final_alpha/block_0": 0.24240607023239136, "attnres/block_norm/0": 1.7100168466567993, "attnres/final_alpha/block_1": 0.005492560565471649, "attnres/block_norm/1": 38766.8828125, "attnres/final_alpha/block_2": 0.01171658281236887, "attnres/block_norm/2": 25761.802734375, "attnres/final_alpha/block_3": 0.013968627899885178, "attnres/block_norm/3": 44428.140625, "attnres/final_alpha/block_4": 0.01717923954129219, "attnres/block_norm/4": 12400.017578125, "attnres/final_alpha/block_5": 0.589552104473114, "attnres/block_norm/5": 5918.0712890625, "attnres/final_alpha/block_6": 0.11968480050563812, "attnres/block_norm/6": 29552.5703125, "geo/tier1_time_s": 1.3619353771209717, "geo/step": 36525.0, "geo/rankme_slope": -3.0732331995298125e-05} {"step": 36530, "timestamp": 1778234026.332085, "train/loss": 2.1597846031188963, "train/z_loss": 0.0014401101623661816, "train/perplexity": 8.669270123594014, "train/grad_norm": 0.09228515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1693822.7830288531, "perf/iters_per_sec": 0.8076776423591867, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2381177186965941, "data/tokens_consumed": 76611059712, "data/tokens_consumed_B": 76.611059712, "train/loss_slope": -6.445152874719645e-06} {"step": 36540, "timestamp": 1778234036.7106216, "train/loss": 2.1168131709098814, "train/z_loss": 0.0014430987648665905, "train/perplexity": 8.30462983649915, "train/grad_norm": 0.2001953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021934.5447697914, "perf/iters_per_sec": 0.9641335224007566, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0372007369995118, "data/tokens_consumed": 76632031232, "data/tokens_consumed_B": 76.632031232, "train/loss_slope": -1.3557341232551553e-05} {"step": 36550, "timestamp": 1778234047.0760264, "grad/layer_0/attn": 0.0031632944010198116, "grad/layer_0/mlp": 0.003009601728990674, "grad/layer_0/attn_mlp_ratio": 1.051067410495521, "grad/layer_4/attn": 0.0017423686804249883, "grad/layer_4/mlp": 0.0025031547993421555, "grad/layer_4/attn_mlp_ratio": 0.6960690610408863, "grad/layer_8/attn": 0.0041634393855929375, "grad/layer_8/mlp": 0.0035964488051831722, "grad/layer_8/attn_mlp_ratio": 1.1576528668578117, "grad/layer_12/attn": 0.004007350653409958, "grad/layer_12/mlp": 0.006237050984054804, "grad/layer_12/attn_mlp_ratio": 0.6425072681631329, "grad/layer_16/attn": 0.003851597663015127, "grad/layer_16/mlp": 0.0047312029637396336, "grad/layer_16/attn_mlp_ratio": 0.8140842003874488, "grad/layer_20/attn": 0.004324333742260933, "grad/layer_20/mlp": 0.0054085333831608295, "grad/layer_20/attn_mlp_ratio": 0.7995390535575878, "grad/layer_24/attn": 0.005902089644223452, "grad/layer_24/mlp": 0.008175162598490715, "grad/layer_24/attn_mlp_ratio": 0.7219537839061094, "grad/layer_27/attn": 0.004570129327476025, "grad/layer_27/mlp": 0.00678856810554862, "grad/layer_27/attn_mlp_ratio": 0.6732096060757908} {"step": 36550, "timestamp": 1778234047.0920563, "train/loss": 2.1915667057037354, "train/z_loss": 0.001436375838238746, "train/perplexity": 8.949222935817296, "train/grad_norm": 0.0966796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021324.6164074994, "perf/iters_per_sec": 0.9638426858937738, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0375137090682984, "data/tokens_consumed": 76653002752, "data/tokens_consumed_B": 76.653002752, "train/loss_slope": -1.354008979923257e-05} {"step": 36560, "timestamp": 1778234057.4789822, "train/loss": 2.218476748466492, "train/z_loss": 0.0014278306858614086, "train/perplexity": 9.193316461462587, "train/grad_norm": 0.12158203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020344.0759808552, "perf/iters_per_sec": 0.9633751277832294, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.038017249107361, "data/tokens_consumed": 76673974272, "data/tokens_consumed_B": 76.673974272, "train/loss_slope": -1.0636169131439456e-05} {"step": 36570, "timestamp": 1778234067.8573961, "train/loss": 2.163466453552246, "train/z_loss": 0.0014378062332980335, "train/perplexity": 8.701247912148425, "train/grad_norm": 0.17578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021715.4262927838, "perf/iters_per_sec": 0.9640290385688705, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373131513595581, "data/tokens_consumed": 76694945792, "data/tokens_consumed_B": 76.694945792, "train/loss_slope": -1.3921528423365346e-05} {"step": 36580, "timestamp": 1778234078.2348316, "train/loss": 2.159302806854248, "train/z_loss": 0.0014410112402401865, "train/perplexity": 8.665094307658277, "train/grad_norm": 0.12255859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022092.767147216, "perf/iters_per_sec": 0.9642089687095718, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0371195793151855, "data/tokens_consumed": 76715917312, "data/tokens_consumed_B": 76.715917312, "train/loss_slope": -1.5822767116913966e-05} {"step": 36590, "timestamp": 1778234088.6263585, "train/loss": 2.2052932024002074, "train/z_loss": 0.001428162434604019, "train/perplexity": 9.072911377298793, "train/grad_norm": 0.1162109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019300.9294076299, "perf/iters_per_sec": 0.9628777167356634, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0385534763336182, "data/tokens_consumed": 76736888832, "data/tokens_consumed_B": 76.736888832, "train/loss_slope": -1.663925062597884e-05} {"step": 36600, "timestamp": 1778234098.9992993, "grad/layer_0/attn": 0.0031497501768171787, "grad/layer_0/mlp": 0.0030372554901987314, "grad/layer_0/attn_mlp_ratio": 1.037038235103252, "grad/layer_4/attn": 0.0020507106091827154, "grad/layer_4/mlp": 0.0024639596231281757, "grad/layer_4/attn_mlp_ratio": 0.832282520665228, "grad/layer_8/attn": 0.0042320191860198975, "grad/layer_8/mlp": 0.0039007754530757666, "grad/layer_8/attn_mlp_ratio": 1.084917378207763, "grad/layer_12/attn": 0.004230814520269632, "grad/layer_12/mlp": 0.006878138519823551, "grad/layer_12/attn_mlp_ratio": 0.615110388743251, "grad/layer_16/attn": 0.0035210829228162766, "grad/layer_16/mlp": 0.004575865808874369, "grad/layer_16/attn_mlp_ratio": 0.7694899704092157, "grad/layer_20/attn": 0.0033637327142059803, "grad/layer_20/mlp": 0.006525137461721897, "grad/layer_20/attn_mlp_ratio": 0.5155037242338744, "grad/layer_24/attn": 0.017115293070673943, "grad/layer_24/mlp": 0.015497037209570408, "grad/layer_24/attn_mlp_ratio": 1.1044235571468979, "grad/layer_27/attn": 0.006665433757007122, "grad/layer_27/mlp": 0.014987120404839516, "grad/layer_27/attn_mlp_ratio": 0.4447441224519931} {"step": 36600, "timestamp": 1778234099.6240797, "eos/sharpness": 63.26010227203368, "eos/L0_probe": 2.001319169998169, "eos/L_plus": 2.281507968902588, "eos/L_minus": 2.353731393814087, "eos/grad_norm": 0.22117960453033447, "eos/embed_grad_frac": 0.0603952631354332, "eos/time_s": 0.6218712329864502} {"step": 36600, "timestamp": 1778234099.6436489, "train/loss": 2.108327841758728, "train/z_loss": 0.0014328663586638869, "train/perplexity": 8.234460445069857, "train/grad_norm": 0.2216796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1904478.1008353764, "perf/iters_per_sec": 0.9081259254624254, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1011688709259033, "data/tokens_consumed": 76757860352, "data/tokens_consumed_B": 76.757860352, "train/loss_slope": -1.7687987780520863e-05} {"step": 36600, "timestamp": 1778234101.0063179, "geo/rankme_last": 439.39141845703125, "geo/layer_0/stable_rank_q_proj": 19.063770294189453, "geo/layer_0/stable_rank_k_proj": 16.313085556030273, "geo/layer_0/stable_rank_o_proj": 49.546260833740234, "geo/layer_0/stable_rank_gate_proj": 139.8645782470703, "geo/layer_0/stable_rank_down_proj": 52.740806579589844, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05649133771657944, "geo/layer_0/attn_entropy_mean": 6.204105854034424, "geo/layer_0/attn_entropy_std": 0.3655969202518463, "geo/layer_7/stable_rank_q_proj": 43.508541107177734, "geo/layer_7/stable_rank_k_proj": 41.966590881347656, "geo/layer_7/stable_rank_o_proj": 100.88021087646484, "geo/layer_7/stable_rank_gate_proj": 92.30852508544922, "geo/layer_7/stable_rank_down_proj": 145.59616088867188, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4978848099708557, "geo/layer_7/attn_entropy_mean": 4.648066520690918, "geo/layer_7/attn_entropy_std": 0.8174840807914734, "geo/layer_14/stable_rank_q_proj": 54.40694046020508, "geo/layer_14/stable_rank_k_proj": 36.747337341308594, "geo/layer_14/stable_rank_o_proj": 49.64586639404297, "geo/layer_14/stable_rank_gate_proj": 77.84184265136719, "geo/layer_14/stable_rank_down_proj": 132.76373291015625, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38754069805145264, "geo/layer_14/attn_entropy_mean": 5.473267555236816, "geo/layer_14/attn_entropy_std": 0.37497320771217346, "geo/layer_21/stable_rank_q_proj": 43.88829040527344, "geo/layer_21/stable_rank_k_proj": 31.0432186126709, "geo/layer_21/stable_rank_o_proj": 77.2459716796875, "geo/layer_21/stable_rank_gate_proj": 74.446533203125, "geo/layer_21/stable_rank_down_proj": 56.70270538330078, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1483653485774994, "geo/layer_21/attn_entropy_mean": 5.710271835327148, "geo/layer_21/attn_entropy_std": 0.29403620958328247, "geo/layer_27/stable_rank_q_proj": 42.42770004272461, "geo/layer_27/stable_rank_k_proj": 31.458717346191406, "geo/layer_27/stable_rank_o_proj": 117.38742065429688, "geo/layer_27/stable_rank_gate_proj": 85.8652114868164, "geo/layer_27/stable_rank_down_proj": 133.0417022705078, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08478804677724838, "geo/layer_27/attn_entropy_mean": 4.286764144897461, "geo/layer_27/attn_entropy_std": 0.6312673091888428, "attnres/final_alpha/block_0": 0.24190635979175568, "attnres/block_norm/0": 1.7106492519378662, "attnres/final_alpha/block_1": 0.005451574455946684, "attnres/block_norm/1": 38843.9921875, "attnres/final_alpha/block_2": 0.011610232293605804, "attnres/block_norm/2": 25832.130859375, "attnres/final_alpha/block_3": 0.013708933256566525, "attnres/block_norm/3": 44520.6640625, "attnres/final_alpha/block_4": 0.01699323207139969, "attnres/block_norm/4": 12340.494140625, "attnres/final_alpha/block_5": 0.5901605486869812, "attnres/block_norm/5": 5868.92919921875, "attnres/final_alpha/block_6": 0.12016913294792175, "attnres/block_norm/6": 29354.87109375, "geo/tier1_time_s": 1.3587560653686523, "geo/step": 36600.0, "geo/rankme_slope": -2.6321094844187667e-05} {"step": 36610, "timestamp": 1778234111.3587325, "train/loss": 2.172977638244629, "train/z_loss": 0.0014165079570375382, "train/perplexity": 8.784401907740227, "train/grad_norm": 0.10302734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1790696.9072344936, "perf/iters_per_sec": 0.8538708244488209, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1711373329162598, "data/tokens_consumed": 76778831872, "data/tokens_consumed_B": 76.778831872, "train/loss_slope": -1.9384852093760354e-05} {"step": 36620, "timestamp": 1778234121.7168684, "train/loss": 2.2008331298828123, "train/z_loss": 0.0014206511666998267, "train/perplexity": 9.032535640898606, "train/grad_norm": 0.1181640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025656.3432997058, "perf/iters_per_sec": 0.9659082142351655, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035295057296753, "data/tokens_consumed": 76799803392, "data/tokens_consumed_B": 76.799803392, "train/loss_slope": -1.6169156979555955e-05} {"step": 36630, "timestamp": 1778234132.0827043, "train/loss": 2.179031324386597, "train/z_loss": 0.001415980071760714, "train/perplexity": 8.837741206627648, "train/grad_norm": 0.349609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024762.8041029326, "perf/iters_per_sec": 0.965482141543833, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0357519388198853, "data/tokens_consumed": 76820774912, "data/tokens_consumed_B": 76.820774912, "train/loss_slope": -1.730137154607486e-05} {"step": 36640, "timestamp": 1778234142.4318585, "train/loss": 2.1515334606170655, "train/z_loss": 0.0014382493100129068, "train/perplexity": 8.59803303824798, "train/grad_norm": 0.10693359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027290.2795934693, "perf/iters_per_sec": 0.9666873357741687, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344606399536134, "data/tokens_consumed": 76841746432, "data/tokens_consumed_B": 76.841746432, "train/loss_slope": -1.826566553959927e-05} {"step": 36650, "timestamp": 1778234153.1987708, "grad/layer_0/attn": 0.0029761434998363256, "grad/layer_0/mlp": 0.0028771404176950455, "grad/layer_0/attn_mlp_ratio": 1.0344102005211042, "grad/layer_4/attn": 0.002611977281048894, "grad/layer_4/mlp": 0.0026780751068145037, "grad/layer_4/attn_mlp_ratio": 0.9753188687168232, "grad/layer_8/attn": 0.005763696040958166, "grad/layer_8/mlp": 0.0036503756418824196, "grad/layer_8/attn_mlp_ratio": 1.5789322657469647, "grad/layer_12/attn": 0.005020835902541876, "grad/layer_12/mlp": 0.006528135389089584, "grad/layer_12/attn_mlp_ratio": 0.7691071839628872, "grad/layer_16/attn": 0.005181265529245138, "grad/layer_16/mlp": 0.005311774555593729, "grad/layer_16/attn_mlp_ratio": 0.9754302215717766, "grad/layer_20/attn": 0.006830023135989904, "grad/layer_20/mlp": 0.007968787103891373, "grad/layer_20/attn_mlp_ratio": 0.85709694100686, "grad/layer_24/attn": 0.024948641657829285, "grad/layer_24/mlp": 0.017189878970384598, "grad/layer_24/attn_mlp_ratio": 1.4513564380340398, "grad/layer_27/attn": 0.007117724046111107, "grad/layer_27/mlp": 0.015961498022079468, "grad/layer_27/attn_mlp_ratio": 0.44593082627157604} {"step": 36650, "timestamp": 1778234153.214567, "train/loss": 2.205078363418579, "train/z_loss": 0.0014384597074240447, "train/perplexity": 9.070962371626782, "train/grad_norm": 0.333984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1945686.2047480391, "perf/iters_per_sec": 0.9277754806270786, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0778469800949098, "data/tokens_consumed": 76862717952, "data/tokens_consumed_B": 76.862717952, "train/loss_slope": -1.5304948702038746e-05} {"step": 36660, "timestamp": 1778234163.5665836, "train/loss": 2.175312542915344, "train/z_loss": 0.001427216944284737, "train/perplexity": 8.804936612744138, "train/grad_norm": 0.0830078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026725.6812271173, "perf/iters_per_sec": 0.9664181142936312, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347488164901733, "data/tokens_consumed": 76883689472, "data/tokens_consumed_B": 76.883689472, "train/loss_slope": -1.613734145440602e-05} {"step": 36670, "timestamp": 1778234173.9307308, "train/loss": 2.1973472118377684, "train/z_loss": 0.0014337526052258908, "train/perplexity": 9.001103778193201, "train/grad_norm": 0.12451171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024411.7685038277, "perf/iters_per_sec": 0.9653147547263278, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0359315395355224, "data/tokens_consumed": 76904660992, "data/tokens_consumed_B": 76.904660992, "train/loss_slope": -1.60489676487256e-05} {"step": 36675, "timestamp": 1778234179.6954064, "eos/sharpness": 7.329869270324705, "eos/L0_probe": 2.004633665084839, "eos/L_plus": 2.038490056991577, "eos/L_minus": 2.0440759658813477, "eos/grad_norm": 0.10297198593616486, "eos/embed_grad_frac": 0.25304070115089417, "eos/time_s": 0.6019749641418457} {"step": 36675, "timestamp": 1778234181.073839, "geo/rankme_last": 439.29229736328125, "geo/layer_0/stable_rank_q_proj": 19.05040740966797, "geo/layer_0/stable_rank_k_proj": 16.24771499633789, "geo/layer_0/stable_rank_o_proj": 49.59521484375, "geo/layer_0/stable_rank_gate_proj": 139.8122100830078, "geo/layer_0/stable_rank_down_proj": 52.76670837402344, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05665481463074684, "geo/layer_0/attn_entropy_mean": 6.199968338012695, "geo/layer_0/attn_entropy_std": 0.3702346682548523, "geo/layer_7/stable_rank_q_proj": 43.522056579589844, "geo/layer_7/stable_rank_k_proj": 42.13243103027344, "geo/layer_7/stable_rank_o_proj": 100.6601333618164, "geo/layer_7/stable_rank_gate_proj": 92.15825653076172, "geo/layer_7/stable_rank_down_proj": 145.43386840820312, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4974621832370758, "geo/layer_7/attn_entropy_mean": 4.648881912231445, "geo/layer_7/attn_entropy_std": 0.7983742952346802, "geo/layer_14/stable_rank_q_proj": 54.53108596801758, "geo/layer_14/stable_rank_k_proj": 36.72412872314453, "geo/layer_14/stable_rank_o_proj": 49.61054611206055, "geo/layer_14/stable_rank_gate_proj": 77.70966339111328, "geo/layer_14/stable_rank_down_proj": 133.0316619873047, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.40653863549232483, "geo/layer_14/attn_entropy_mean": 5.4747395515441895, "geo/layer_14/attn_entropy_std": 0.3579384982585907, "geo/layer_21/stable_rank_q_proj": 43.8883056640625, "geo/layer_21/stable_rank_k_proj": 31.01995277404785, "geo/layer_21/stable_rank_o_proj": 77.30955505371094, "geo/layer_21/stable_rank_gate_proj": 74.45394134521484, "geo/layer_21/stable_rank_down_proj": 56.62066650390625, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14740705490112305, "geo/layer_21/attn_entropy_mean": 5.72572135925293, "geo/layer_21/attn_entropy_std": 0.288809210062027, "geo/layer_27/stable_rank_q_proj": 42.52107620239258, "geo/layer_27/stable_rank_k_proj": 31.440753936767578, "geo/layer_27/stable_rank_o_proj": 117.36544799804688, "geo/layer_27/stable_rank_gate_proj": 85.71864318847656, "geo/layer_27/stable_rank_down_proj": 132.65879821777344, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08831850439310074, "geo/layer_27/attn_entropy_mean": 4.2836594581604, "geo/layer_27/attn_entropy_std": 0.648578941822052, "attnres/final_alpha/block_0": 0.24280112981796265, "attnres/block_norm/0": 1.7106798887252808, "attnres/final_alpha/block_1": 0.005399888381361961, "attnres/block_norm/1": 38872.52734375, "attnres/final_alpha/block_2": 0.011722708120942116, "attnres/block_norm/2": 25856.0, "attnres/final_alpha/block_3": 0.013885840773582458, "attnres/block_norm/3": 44585.21875, "attnres/final_alpha/block_4": 0.01668575406074524, "attnres/block_norm/4": 12398.625, "attnres/final_alpha/block_5": 0.5902373194694519, "attnres/block_norm/5": 5929.0927734375, "attnres/final_alpha/block_6": 0.11926740407943726, "attnres/block_norm/6": 29610.623046875, "geo/tier1_time_s": 1.3574678897857666, "geo/step": 36675.0, "geo/rankme_slope": -2.2894528905312133e-05} {"step": 36680, "timestamp": 1778234186.251608, "train/loss": 2.161982607841492, "train/z_loss": 0.0014305833959951998, "train/perplexity": 8.688346177215047, "train/grad_norm": 0.138671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1702781.3470306022, "perf/iters_per_sec": 0.8119494185593615, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2316038131713867, "data/tokens_consumed": 76925632512, "data/tokens_consumed_B": 76.925632512, "train/loss_slope": -1.726874006141455e-05} {"step": 36690, "timestamp": 1778234196.6090083, "train/loss": 2.1789329767227175, "train/z_loss": 0.0014263278688304126, "train/perplexity": 8.836872078165083, "train/grad_norm": 0.10205078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025542.6195838149, "perf/iters_per_sec": 0.9658539865416598, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035353183746338, "data/tokens_consumed": 76946604032, "data/tokens_consumed_B": 76.946604032, "train/loss_slope": -1.3668797046902835e-05} {"step": 36700, "timestamp": 1778234206.945563, "grad/layer_0/attn": 0.0027435789816081524, "grad/layer_0/mlp": 0.002848152769729495, "grad/layer_0/attn_mlp_ratio": 0.9632836112019245, "grad/layer_4/attn": 0.002777841640636325, "grad/layer_4/mlp": 0.0024981191381812096, "grad/layer_4/attn_mlp_ratio": 1.1119731989489705, "grad/layer_8/attn": 0.006998767610639334, "grad/layer_8/mlp": 0.003534784074872732, "grad/layer_8/attn_mlp_ratio": 1.9799702794842686, "grad/layer_12/attn": 0.004690074361860752, "grad/layer_12/mlp": 0.006938184145838022, "grad/layer_12/attn_mlp_ratio": 0.6759800829264755, "grad/layer_16/attn": 0.0044061774387955666, "grad/layer_16/mlp": 0.004779356066137552, "grad/layer_16/attn_mlp_ratio": 0.921918619502348, "grad/layer_20/attn": 0.0038000624626874924, "grad/layer_20/mlp": 0.006465252488851547, "grad/layer_20/attn_mlp_ratio": 0.5877670532532921, "grad/layer_24/attn": 0.01736394874751568, "grad/layer_24/mlp": 0.014193073846399784, "grad/layer_24/attn_mlp_ratio": 1.2234100106214285, "grad/layer_27/attn": 0.005229394417256117, "grad/layer_27/mlp": 0.012709376402199268, "grad/layer_27/attn_mlp_ratio": 0.41145955636385523} {"step": 36700, "timestamp": 1778234206.9614587, "train/loss": 2.1728063344955446, "train/z_loss": 0.0014477930730208755, "train/perplexity": 8.78289723564163, "train/grad_norm": 0.2431640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026736.1416775885, "perf/iters_per_sec": 0.9664231022251074, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347434759140015, "data/tokens_consumed": 76967575552, "data/tokens_consumed_B": 76.967575552, "train/loss_slope": -1.000966071510252e-05} {"step": 36710, "timestamp": 1778234217.3080208, "train/loss": 2.1898743391036986, "train/z_loss": 0.0014313047868199646, "train/perplexity": 8.934090378353616, "train/grad_norm": 0.11865234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027929.9490426893, "perf/iters_per_sec": 0.966992353936524, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341343402862548, "data/tokens_consumed": 76988547072, "data/tokens_consumed_B": 76.988547072, "train/loss_slope": -9.216112236414323e-06} {"step": 36720, "timestamp": 1778234227.662981, "train/loss": 2.1258186221122743, "train/z_loss": 0.0014398011262528598, "train/perplexity": 8.379754533436376, "train/grad_norm": 0.25, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026226.27368674, "perf/iters_per_sec": 0.9661799782212925, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350038528442382, "data/tokens_consumed": 77009518592, "data/tokens_consumed_B": 77.009518592, "train/loss_slope": -8.638029966917986e-06} {"step": 36730, "timestamp": 1778234238.0266542, "train/loss": 2.180619788169861, "train/z_loss": 0.0014209000510163605, "train/perplexity": 8.851790794136521, "train/grad_norm": 0.123046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024740.292791476, "perf/iters_per_sec": 0.9654714073140507, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0357634544372558, "data/tokens_consumed": 77030490112, "data/tokens_consumed_B": 77.030490112, "train/loss_slope": -9.211354773573134e-06} {"step": 36740, "timestamp": 1778234248.3999906, "train/loss": 2.2030576705932616, "train/z_loss": 0.0014216512092389167, "train/perplexity": 9.052651249850573, "train/grad_norm": 0.232421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022669.9029827765, "perf/iters_per_sec": 0.9644841685212977, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0368236541748046, "data/tokens_consumed": 77051461632, "data/tokens_consumed_B": 77.051461632, "train/loss_slope": -8.494743462478644e-06} {"step": 36750, "timestamp": 1778234258.7654073, "grad/layer_0/attn": 0.002362053608521819, "grad/layer_0/mlp": 0.002622470259666443, "grad/layer_0/attn_mlp_ratio": 0.9006979239308746, "grad/layer_4/attn": 0.0023233676329255104, "grad/layer_4/mlp": 0.0024786063004285097, "grad/layer_4/attn_mlp_ratio": 0.9373685279453159, "grad/layer_8/attn": 0.0033066521864384413, "grad/layer_8/mlp": 0.003624449949711561, "grad/layer_8/attn_mlp_ratio": 0.9123183217000298, "grad/layer_12/attn": 0.005432910285890102, "grad/layer_12/mlp": 0.006558581721037626, "grad/layer_12/attn_mlp_ratio": 0.8283666246967043, "grad/layer_16/attn": 0.004741396754980087, "grad/layer_16/mlp": 0.004631385672837496, "grad/layer_16/attn_mlp_ratio": 1.0237533618529022, "grad/layer_20/attn": 0.003295636037364602, "grad/layer_20/mlp": 0.005425042938441038, "grad/layer_20/attn_mlp_ratio": 0.6074856944013571, "grad/layer_24/attn": 0.004437802359461784, "grad/layer_24/mlp": 0.007374591194093227, "grad/layer_24/attn_mlp_ratio": 0.601769261845914, "grad/layer_27/attn": 0.004538979846984148, "grad/layer_27/mlp": 0.006331036798655987, "grad/layer_27/attn_mlp_ratio": 0.7169409876520743} {"step": 36750, "timestamp": 1778234259.376162, "eos/sharpness": 7.039308547973631, "eos/L0_probe": 2.000211238861084, "eos/L_plus": 2.046811103820801, "eos/L_minus": 2.0240044593811035, "eos/grad_norm": 0.09616269171237946, "eos/embed_grad_frac": 0.25373291969299316, "eos/time_s": 0.6079962253570557} {"step": 36750, "timestamp": 1778234259.3964753, "train/loss": 2.1646213293075562, "train/z_loss": 0.0014302385505288839, "train/perplexity": 8.71130257723005, "train/grad_norm": 0.09619140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1908108.8741630258, "perf/iters_per_sec": 0.9098572130980614, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.099073553085327, "data/tokens_consumed": 77072433152, "data/tokens_consumed_B": 77.072433152, "train/loss_slope": -8.867670407425322e-06} {"step": 36750, "timestamp": 1778234260.7599397, "geo/rankme_last": 439.76763916015625, "geo/layer_0/stable_rank_q_proj": 19.057437896728516, "geo/layer_0/stable_rank_k_proj": 16.254680633544922, "geo/layer_0/stable_rank_o_proj": 49.460079193115234, "geo/layer_0/stable_rank_gate_proj": 139.7432098388672, "geo/layer_0/stable_rank_down_proj": 52.781246185302734, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05756699666380882, "geo/layer_0/attn_entropy_mean": 6.2037272453308105, "geo/layer_0/attn_entropy_std": 0.3688584864139557, "geo/layer_7/stable_rank_q_proj": 43.3819465637207, "geo/layer_7/stable_rank_k_proj": 42.08272171020508, "geo/layer_7/stable_rank_o_proj": 100.68376159667969, "geo/layer_7/stable_rank_gate_proj": 92.24915313720703, "geo/layer_7/stable_rank_down_proj": 145.44789123535156, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4942304193973541, "geo/layer_7/attn_entropy_mean": 4.655107498168945, "geo/layer_7/attn_entropy_std": 0.7998711466789246, "geo/layer_14/stable_rank_q_proj": 54.65177536010742, "geo/layer_14/stable_rank_k_proj": 36.78593444824219, "geo/layer_14/stable_rank_o_proj": 49.654640197753906, "geo/layer_14/stable_rank_gate_proj": 77.65184020996094, "geo/layer_14/stable_rank_down_proj": 133.3261260986328, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3926021456718445, "geo/layer_14/attn_entropy_mean": 5.495133399963379, "geo/layer_14/attn_entropy_std": 0.3770413398742676, "geo/layer_21/stable_rank_q_proj": 43.86470413208008, "geo/layer_21/stable_rank_k_proj": 31.009902954101562, "geo/layer_21/stable_rank_o_proj": 77.30233764648438, "geo/layer_21/stable_rank_gate_proj": 74.39340209960938, "geo/layer_21/stable_rank_down_proj": 56.60911560058594, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14327438175678253, "geo/layer_21/attn_entropy_mean": 5.722546577453613, "geo/layer_21/attn_entropy_std": 0.2881866693496704, "geo/layer_27/stable_rank_q_proj": 42.52232360839844, "geo/layer_27/stable_rank_k_proj": 31.388824462890625, "geo/layer_27/stable_rank_o_proj": 117.66488647460938, "geo/layer_27/stable_rank_gate_proj": 85.7428207397461, "geo/layer_27/stable_rank_down_proj": 132.8674774169922, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08580513298511505, "geo/layer_27/attn_entropy_mean": 4.29940128326416, "geo/layer_27/attn_entropy_std": 0.6547209620475769, "attnres/final_alpha/block_0": 0.24222931265830994, "attnres/block_norm/0": 1.7109113931655884, "attnres/final_alpha/block_1": 0.0054545532912015915, "attnres/block_norm/1": 38784.6171875, "attnres/final_alpha/block_2": 0.01158426608890295, "attnres/block_norm/2": 25789.01953125, "attnres/final_alpha/block_3": 0.01361069455742836, "attnres/block_norm/3": 44524.9765625, "attnres/final_alpha/block_4": 0.016989972442388535, "attnres/block_norm/4": 12434.1513671875, "attnres/final_alpha/block_5": 0.5888732671737671, "attnres/block_norm/5": 5950.591796875, "attnres/final_alpha/block_6": 0.12125793099403381, "attnres/block_norm/6": 29485.05859375, "geo/tier1_time_s": 1.3593978881835938, "geo/step": 36750.0, "geo/rankme_slope": -1.8295658106992797e-05} {"step": 36760, "timestamp": 1778234271.135899, "train/loss": 2.2112895488739013, "train/z_loss": 0.0014173696981742978, "train/perplexity": 9.127479137536657, "train/grad_norm": 0.216796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1786986.1170988078, "perf/iters_per_sec": 0.8521013818258323, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1735692739486694, "data/tokens_consumed": 77093404672, "data/tokens_consumed_B": 77.093404672, "train/loss_slope": -4.450488047595418e-06} {"step": 36770, "timestamp": 1778234281.5147498, "train/loss": 2.2146979570388794, "train/z_loss": 0.0014235488371923566, "train/perplexity": 9.158642390328076, "train/grad_norm": 0.2080078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021588.206011817, "perf/iters_per_sec": 0.9639683752116285, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373784303665161, "data/tokens_consumed": 77114376192, "data/tokens_consumed_B": 77.114376192, "train/loss_slope": -4.245524609585973e-06} {"step": 36780, "timestamp": 1778234291.8920074, "train/loss": 2.2355191230773928, "train/z_loss": 0.0014115030760876834, "train/perplexity": 9.351335086222441, "train/grad_norm": 0.11962890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022144.9710901387, "perf/iters_per_sec": 0.9642338614893621, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0370928049087524, "data/tokens_consumed": 77135347712, "data/tokens_consumed_B": 77.135347712, "train/loss_slope": -2.23578782257567e-06} {"step": 36790, "timestamp": 1778234302.2411277, "train/loss": 2.2081091165542603, "train/z_loss": 0.001425730052869767, "train/perplexity": 9.098495921899449, "train/grad_norm": 0.1298828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027619.0377878402, "perf/iters_per_sec": 0.9668440998973085, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342929124832154, "data/tokens_consumed": 77156319232, "data/tokens_consumed_B": 77.156319232, "train/loss_slope": -3.422037674568792e-08} {"step": 36800, "timestamp": 1778234312.5864625, "grad/layer_0/attn": 0.002795339096337557, "grad/layer_0/mlp": 0.002639617770910263, "grad/layer_0/attn_mlp_ratio": 1.0589938517780961, "grad/layer_4/attn": 0.0028963852673768997, "grad/layer_4/mlp": 0.002333669690415263, "grad/layer_4/attn_mlp_ratio": 1.2411290060285283, "grad/layer_8/attn": 0.00846004206687212, "grad/layer_8/mlp": 0.003567920532077551, "grad/layer_8/attn_mlp_ratio": 2.371140767765889, "grad/layer_12/attn": 0.0055077411234378815, "grad/layer_12/mlp": 0.006633174605667591, "grad/layer_12/attn_mlp_ratio": 0.8303325885163098, "grad/layer_16/attn": 0.003995545674115419, "grad/layer_16/mlp": 0.0046322825364768505, "grad/layer_16/attn_mlp_ratio": 0.8625435854566702, "grad/layer_20/attn": 0.0036875454243272543, "grad/layer_20/mlp": 0.00567262526601553, "grad/layer_20/attn_mlp_ratio": 0.6500597494802303, "grad/layer_24/attn": 0.007063192781060934, "grad/layer_24/mlp": 0.008441849611699581, "grad/layer_24/attn_mlp_ratio": 0.8366878139600185, "grad/layer_27/attn": 0.006751417648047209, "grad/layer_27/mlp": 0.00749545730650425, "grad/layer_27/attn_mlp_ratio": 0.9007345758763977} {"step": 36800, "timestamp": 1778234312.6027944, "train/loss": 2.1399384260177614, "train/z_loss": 0.0014389282558113336, "train/perplexity": 8.498914300776388, "train/grad_norm": 0.11962890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024919.8379197267, "perf/iters_per_sec": 0.965557021102775, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0356716156005858, "data/tokens_consumed": 77177290752, "data/tokens_consumed_B": 77.177290752, "train/loss_slope": -3.321353322637619e-06} {"step": 36810, "timestamp": 1778234322.9752927, "train/loss": 2.273841118812561, "train/z_loss": 0.001412276248447597, "train/perplexity": 9.716652040025522, "train/grad_norm": 0.130859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023246.3910604552, "perf/iters_per_sec": 0.964759059457996, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03652822971344, "data/tokens_consumed": 77198262272, "data/tokens_consumed_B": 77.198262272, "train/loss_slope": 4.093331779905843e-06} {"step": 36820, "timestamp": 1778234334.0280252, "train/loss": 2.1478874921798705, "train/z_loss": 0.0014308292069472373, "train/perplexity": 8.566741958975175, "train/grad_norm": 0.197265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1898525.6175345764, "perf/iters_per_sec": 0.9052875602410204, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1046213865280152, "data/tokens_consumed": 77219233792, "data/tokens_consumed_B": 77.219233792, "train/loss_slope": 4.219909441067029e-06} {"step": 36825, "timestamp": 1778234339.7959638, "eos/sharpness": 58.61461162567137, "eos/L0_probe": 2.003624439239502, "eos/L_plus": 2.37736439704895, "eos/L_minus": 2.2160305976867676, "eos/grad_norm": 0.14341478049755096, "eos/embed_grad_frac": 0.11697298288345337, "eos/time_s": 0.604611873626709} {"step": 36825, "timestamp": 1778234341.172256, "geo/rankme_last": 438.5430603027344, "geo/layer_0/stable_rank_q_proj": 19.090662002563477, "geo/layer_0/stable_rank_k_proj": 16.302457809448242, "geo/layer_0/stable_rank_o_proj": 49.53688049316406, "geo/layer_0/stable_rank_gate_proj": 139.88755798339844, "geo/layer_0/stable_rank_down_proj": 52.77481460571289, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06288314610719681, "geo/layer_0/attn_entropy_mean": 6.20428991317749, "geo/layer_0/attn_entropy_std": 0.3668130338191986, "geo/layer_7/stable_rank_q_proj": 43.396541595458984, "geo/layer_7/stable_rank_k_proj": 41.93181610107422, "geo/layer_7/stable_rank_o_proj": 100.58805847167969, "geo/layer_7/stable_rank_gate_proj": 92.16773223876953, "geo/layer_7/stable_rank_down_proj": 145.37124633789062, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5166261792182922, "geo/layer_7/attn_entropy_mean": 4.649309158325195, "geo/layer_7/attn_entropy_std": 0.8051777482032776, "geo/layer_14/stable_rank_q_proj": 54.70734405517578, "geo/layer_14/stable_rank_k_proj": 36.82963943481445, "geo/layer_14/stable_rank_o_proj": 49.705787658691406, "geo/layer_14/stable_rank_gate_proj": 77.59744262695312, "geo/layer_14/stable_rank_down_proj": 133.375, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38777655363082886, "geo/layer_14/attn_entropy_mean": 5.481711387634277, "geo/layer_14/attn_entropy_std": 0.3736189901828766, "geo/layer_21/stable_rank_q_proj": 43.780635833740234, "geo/layer_21/stable_rank_k_proj": 31.065921783447266, "geo/layer_21/stable_rank_o_proj": 77.23847198486328, "geo/layer_21/stable_rank_gate_proj": 74.2760009765625, "geo/layer_21/stable_rank_down_proj": 56.575042724609375, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15363456308841705, "geo/layer_21/attn_entropy_mean": 5.714693069458008, "geo/layer_21/attn_entropy_std": 0.29170992970466614, "geo/layer_27/stable_rank_q_proj": 42.618228912353516, "geo/layer_27/stable_rank_k_proj": 31.332334518432617, "geo/layer_27/stable_rank_o_proj": 117.81068420410156, "geo/layer_27/stable_rank_gate_proj": 85.71868133544922, "geo/layer_27/stable_rank_down_proj": 132.87939453125, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09467755258083344, "geo/layer_27/attn_entropy_mean": 4.307034492492676, "geo/layer_27/attn_entropy_std": 0.6288446187973022, "attnres/final_alpha/block_0": 0.2407759428024292, "attnres/block_norm/0": 1.7111880779266357, "attnres/final_alpha/block_1": 0.00530229602009058, "attnres/block_norm/1": 38895.7734375, "attnres/final_alpha/block_2": 0.01159886084496975, "attnres/block_norm/2": 25795.01171875, "attnres/final_alpha/block_3": 0.013528779149055481, "attnres/block_norm/3": 44760.63671875, "attnres/final_alpha/block_4": 0.016617218032479286, "attnres/block_norm/4": 12402.427734375, "attnres/final_alpha/block_5": 0.5952675938606262, "attnres/block_norm/5": 5853.7080078125, "attnres/final_alpha/block_6": 0.11690926551818848, "attnres/block_norm/6": 29594.82421875, "geo/tier1_time_s": 1.3569073677062988, "geo/step": 36825.0, "geo/rankme_slope": -2.1413780355892365e-05} {"step": 36830, "timestamp": 1778234346.3642688, "train/loss": 2.1160420894622805, "train/z_loss": 0.0014370246208272874, "train/perplexity": 8.298228758696357, "train/grad_norm": 0.107421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1700784.7430254738, "perf/iters_per_sec": 0.8109973635794991, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2330496311187744, "data/tokens_consumed": 77240205312, "data/tokens_consumed_B": 77.240205312, "train/loss_slope": -1.5646745305213549e-06} {"step": 36840, "timestamp": 1778234357.236583, "train/loss": 2.194502568244934, "train/z_loss": 0.0014330704230815171, "train/perplexity": 8.975535229947317, "train/grad_norm": 0.1513671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1929867.7804890324, "perf/iters_per_sec": 0.9202326681561624, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0866816997528077, "data/tokens_consumed": 77261176832, "data/tokens_consumed_B": 77.261176832, "train/loss_slope": 1.5551437388803506e-06} {"step": 36850, "timestamp": 1778234367.5745103, "grad/layer_0/attn": 0.0028557816985994577, "grad/layer_0/mlp": 0.0031226533465087414, "grad/layer_0/attn_mlp_ratio": 0.9145368666485685, "grad/layer_4/attn": 0.0020153624936938286, "grad/layer_4/mlp": 0.0024867532774806023, "grad/layer_4/attn_mlp_ratio": 0.8104392305020803, "grad/layer_8/attn": 0.00517090130597353, "grad/layer_8/mlp": 0.003842621576040983, "grad/layer_8/attn_mlp_ratio": 1.3456701548878631, "grad/layer_12/attn": 0.004412367939949036, "grad/layer_12/mlp": 0.006577654741704464, "grad/layer_12/attn_mlp_ratio": 0.6708117172663414, "grad/layer_16/attn": 0.0033717306796461344, "grad/layer_16/mlp": 0.004388968925923109, "grad/layer_16/attn_mlp_ratio": 0.7682284062000132, "grad/layer_20/attn": 0.00568372430279851, "grad/layer_20/mlp": 0.005498299840837717, "grad/layer_20/attn_mlp_ratio": 1.0337239444839275, "grad/layer_24/attn": 0.007687645964324474, "grad/layer_24/mlp": 0.00808650441467762, "grad/layer_24/attn_mlp_ratio": 0.9506760245259016, "grad/layer_27/attn": 0.003931900020688772, "grad/layer_27/mlp": 0.007331004366278648, "grad/layer_27/attn_mlp_ratio": 0.5363385111514843} {"step": 36850, "timestamp": 1778234367.5904608, "train/loss": 2.209717845916748, "train/z_loss": 0.0014323199517093598, "train/perplexity": 9.113144719259434, "train/grad_norm": 0.130859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026316.4074831286, "perf/iters_per_sec": 0.9662229573646205, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349578142166138, "data/tokens_consumed": 77282148352, "data/tokens_consumed_B": 77.282148352, "train/loss_slope": 5.393169100063624e-06} {"step": 36860, "timestamp": 1778234377.9484227, "train/loss": 2.1547781944274904, "train/z_loss": 0.0014403707580640912, "train/perplexity": 8.625976677068593, "train/grad_norm": 0.0947265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025742.5075456153, "perf/iters_per_sec": 0.9659493005493237, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352510213851929, "data/tokens_consumed": 77303119872, "data/tokens_consumed_B": 77.303119872, "train/loss_slope": 3.0752769362058334e-06} {"step": 36870, "timestamp": 1778234388.3122306, "train/loss": 2.167706346511841, "train/z_loss": 0.001430857798550278, "train/perplexity": 8.738218592390071, "train/grad_norm": 0.09423828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025127.995371264, "perf/iters_per_sec": 0.9656562783104248, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035565161705017, "data/tokens_consumed": 77324091392, "data/tokens_consumed_B": 77.324091392, "train/loss_slope": 1.4199472282014733e-06} {"step": 36880, "timestamp": 1778234399.2774744, "train/loss": 2.1896505117416383, "train/z_loss": 0.0014406980248168112, "train/perplexity": 8.93209090824823, "train/grad_norm": 0.1103515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1913357.6646964033, "perf/iters_per_sec": 0.9123600314600007, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0960585355758667, "data/tokens_consumed": 77345062912, "data/tokens_consumed_B": 77.345062912, "train/loss_slope": 3.743114057976505e-06} {"step": 36890, "timestamp": 1778234409.632239, "train/loss": 2.136069130897522, "train/z_loss": 0.0014272902510128916, "train/perplexity": 8.46609303168171, "train/grad_norm": 0.2001953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026753.8406321045, "perf/iters_per_sec": 0.966431541744282, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347344398498535, "data/tokens_consumed": 77366034432, "data/tokens_consumed_B": 77.366034432, "train/loss_slope": -1.7793385216396688e-06} {"step": 36900, "timestamp": 1778234420.3563254, "grad/layer_0/attn": 0.002691373461857438, "grad/layer_0/mlp": 0.0028343508020043373, "grad/layer_0/attn_mlp_ratio": 0.9495554908018653, "grad/layer_4/attn": 0.0027966350317001343, "grad/layer_4/mlp": 0.002471860032528639, "grad/layer_4/attn_mlp_ratio": 1.1313888657766653, "grad/layer_8/attn": 0.0055709173902869225, "grad/layer_8/mlp": 0.003570529166609049, "grad/layer_8/attn_mlp_ratio": 1.5602497485135174, "grad/layer_12/attn": 0.004671710543334484, "grad/layer_12/mlp": 0.006623166613280773, "grad/layer_12/attn_mlp_ratio": 0.7053590443324899, "grad/layer_16/attn": 0.005707652308046818, "grad/layer_16/mlp": 0.00448776688426733, "grad/layer_16/attn_mlp_ratio": 1.2718245684448444, "grad/layer_20/attn": 0.004434281960129738, "grad/layer_20/mlp": 0.005816112272441387, "grad/layer_20/attn_mlp_ratio": 0.7624133916567352, "grad/layer_24/attn": 0.008073898032307625, "grad/layer_24/mlp": 0.00898511242121458, "grad/layer_24/attn_mlp_ratio": 0.8985861905729612, "grad/layer_27/attn": 0.006711025256663561, "grad/layer_27/mlp": 0.008154855109751225, "grad/layer_27/attn_mlp_ratio": 0.8229484256984485} {"step": 36900, "timestamp": 1778234420.9644032, "eos/sharpness": 33.03339481353759, "eos/L0_probe": 2.0026190280914307, "eos/L_plus": 2.185188055038452, "eos/L_minus": 2.150383949279785, "eos/grad_norm": 0.12812063097953796, "eos/embed_grad_frac": 0.1366446316242218, "eos/time_s": 0.6050829887390137} {"step": 36900, "timestamp": 1778234420.9858634, "train/loss": 2.1632357835769653, "train/z_loss": 0.0014429921749979258, "train/perplexity": 8.699241026980593, "train/grad_norm": 0.1279296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1848418.5129571485, "perf/iters_per_sec": 0.8813946308885329, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.134565567970276, "data/tokens_consumed": 77387005952, "data/tokens_consumed_B": 77.387005952, "train/loss_slope": -1.7531972013003671e-06} {"step": 36900, "timestamp": 1778234422.3490152, "geo/rankme_last": 438.7827453613281, "geo/layer_0/stable_rank_q_proj": 19.061119079589844, "geo/layer_0/stable_rank_k_proj": 16.28173828125, "geo/layer_0/stable_rank_o_proj": 49.51278305053711, "geo/layer_0/stable_rank_gate_proj": 139.56637573242188, "geo/layer_0/stable_rank_down_proj": 52.7762336730957, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05877931788563728, "geo/layer_0/attn_entropy_mean": 6.201207637786865, "geo/layer_0/attn_entropy_std": 0.36484041810035706, "geo/layer_7/stable_rank_q_proj": 43.35015869140625, "geo/layer_7/stable_rank_k_proj": 42.04963302612305, "geo/layer_7/stable_rank_o_proj": 100.55884552001953, "geo/layer_7/stable_rank_gate_proj": 92.08939361572266, "geo/layer_7/stable_rank_down_proj": 145.21847534179688, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5097599029541016, "geo/layer_7/attn_entropy_mean": 4.665936470031738, "geo/layer_7/attn_entropy_std": 0.8127914667129517, "geo/layer_14/stable_rank_q_proj": 54.658199310302734, "geo/layer_14/stable_rank_k_proj": 36.767208099365234, "geo/layer_14/stable_rank_o_proj": 49.68890380859375, "geo/layer_14/stable_rank_gate_proj": 77.58850860595703, "geo/layer_14/stable_rank_down_proj": 133.1404266357422, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3798677623271942, "geo/layer_14/attn_entropy_mean": 5.496246814727783, "geo/layer_14/attn_entropy_std": 0.3585469126701355, "geo/layer_21/stable_rank_q_proj": 43.924842834472656, "geo/layer_21/stable_rank_k_proj": 31.058290481567383, "geo/layer_21/stable_rank_o_proj": 77.22632598876953, "geo/layer_21/stable_rank_gate_proj": 74.1629638671875, "geo/layer_21/stable_rank_down_proj": 56.53251266479492, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14971917867660522, "geo/layer_21/attn_entropy_mean": 5.733763694763184, "geo/layer_21/attn_entropy_std": 0.2864168584346771, "geo/layer_27/stable_rank_q_proj": 42.48324966430664, "geo/layer_27/stable_rank_k_proj": 31.387489318847656, "geo/layer_27/stable_rank_o_proj": 117.64763641357422, "geo/layer_27/stable_rank_gate_proj": 85.53982543945312, "geo/layer_27/stable_rank_down_proj": 132.98863220214844, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08954024314880371, "geo/layer_27/attn_entropy_mean": 4.283111572265625, "geo/layer_27/attn_entropy_std": 0.6623440384864807, "attnres/final_alpha/block_0": 0.24201498925685883, "attnres/block_norm/0": 1.7116351127624512, "attnres/final_alpha/block_1": 0.005342681426554918, "attnres/block_norm/1": 38896.125, "attnres/final_alpha/block_2": 0.011605913750827312, "attnres/block_norm/2": 25796.2734375, "attnres/final_alpha/block_3": 0.013699783012270927, "attnres/block_norm/3": 44575.3203125, "attnres/final_alpha/block_4": 0.016575276851654053, "attnres/block_norm/4": 12443.8896484375, "attnres/final_alpha/block_5": 0.5929505825042725, "attnres/block_norm/5": 5904.634765625, "attnres/final_alpha/block_6": 0.11781077086925507, "attnres/block_norm/6": 29678.474609375, "geo/tier1_time_s": 1.359011173248291, "geo/step": 36900.0, "geo/rankme_slope": -7.580330960509203e-05} {"step": 36910, "timestamp": 1778234432.702061, "train/loss": 2.212409865856171, "train/z_loss": 0.001423900667577982, "train/perplexity": 9.137710537555218, "train/grad_norm": 0.126953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1790563.8936625558, "perf/iters_per_sec": 0.8538073986351756, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.171224331855774, "data/tokens_consumed": 77407977472, "data/tokens_consumed_B": 77.407977472, "train/loss_slope": 4.8987199478439156e-08} {"step": 36920, "timestamp": 1778234443.0807111, "train/loss": 2.1638691425323486, "train/z_loss": 0.0014428563066758216, "train/perplexity": 8.704752514380779, "train/grad_norm": 0.1845703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021666.7758870015, "perf/iters_per_sec": 0.9640058402476318, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373381137847901, "data/tokens_consumed": 77428948992, "data/tokens_consumed_B": 77.428948992, "train/loss_slope": -2.673070422885754e-07} {"step": 36930, "timestamp": 1778234453.4416428, "train/loss": 2.1306114077568052, "train/z_loss": 0.0014384122798219324, "train/perplexity": 8.420013299422141, "train/grad_norm": 0.099609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025192.339366889, "perf/iters_per_sec": 0.9656869599184461, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035532259941101, "data/tokens_consumed": 77449920512, "data/tokens_consumed_B": 77.449920512, "train/loss_slope": -5.5307425717279196e-06} {"step": 36940, "timestamp": 1778234463.79503, "train/loss": 2.147340774536133, "train/z_loss": 0.0014323239563964307, "train/perplexity": 8.562059650063931, "train/grad_norm": 0.267578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026579.8999378693, "perf/iters_per_sec": 0.966348600357947, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348232507705688, "data/tokens_consumed": 77470892032, "data/tokens_consumed_B": 77.470892032, "train/loss_slope": -5.500331706125076e-06} {"step": 36950, "timestamp": 1778234474.134253, "grad/layer_0/attn": 0.002734506968408823, "grad/layer_0/mlp": 0.002903854940086603, "grad/layer_0/attn_mlp_ratio": 0.9416816372236234, "grad/layer_4/attn": 0.002173233777284622, "grad/layer_4/mlp": 0.0026453323662281036, "grad/layer_4/attn_mlp_ratio": 0.8215352153384969, "grad/layer_8/attn": 0.007003447972238064, "grad/layer_8/mlp": 0.003708089701831341, "grad/layer_8/attn_mlp_ratio": 1.8886942729324465, "grad/layer_12/attn": 0.004204650409519672, "grad/layer_12/mlp": 0.006653816904872656, "grad/layer_12/attn_mlp_ratio": 0.6319155465863527, "grad/layer_16/attn": 0.004893460310995579, "grad/layer_16/mlp": 0.004483464173972607, "grad/layer_16/attn_mlp_ratio": 1.0914462594032655, "grad/layer_20/attn": 0.003986977972090244, "grad/layer_20/mlp": 0.006344834342598915, "grad/layer_20/attn_mlp_ratio": 0.6283817187288395, "grad/layer_24/attn": 0.011300116777420044, "grad/layer_24/mlp": 0.01057827565819025, "grad/layer_24/attn_mlp_ratio": 1.0682380603162955, "grad/layer_27/attn": 0.004867679439485073, "grad/layer_27/mlp": 0.009213341400027275, "grad/layer_27/attn_mlp_ratio": 0.5283294274363609} {"step": 36950, "timestamp": 1778234474.149913, "train/loss": 2.2024728536605833, "train/z_loss": 0.0014259087387472392, "train/perplexity": 9.047358653864748, "train/grad_norm": 0.13671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026189.0742452461, "perf/iters_per_sec": 0.9661622401453238, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350228548049927, "data/tokens_consumed": 77491863552, "data/tokens_consumed_B": 77.491863552, "train/loss_slope": -9.647139263982088e-07} {"step": 36960, "timestamp": 1778234484.4963791, "train/loss": 2.2019572973251345, "train/z_loss": 0.0014281151234172285, "train/perplexity": 9.042695432971497, "train/grad_norm": 0.138671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027685.456616289, "perf/iters_per_sec": 0.9668757708627171, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034259033203125, "data/tokens_consumed": 77512835072, "data/tokens_consumed_B": 77.512835072, "train/loss_slope": -1.1280381354060096e-06} {"step": 36970, "timestamp": 1778234494.8445888, "train/loss": 2.1595326542854307, "train/z_loss": 0.0014177517499774695, "train/perplexity": 8.667086186231165, "train/grad_norm": 0.09912109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027480.231345126, "perf/iters_per_sec": 0.9667779118276243, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343637228012086, "data/tokens_consumed": 77533806592, "data/tokens_consumed_B": 77.533806592, "train/loss_slope": -4.113577580330271e-06} {"step": 36975, "timestamp": 1778234500.6180158, "eos/sharpness": 55.69868087768553, "eos/L0_probe": 2.006829261779785, "eos/L_plus": 2.211294651031494, "eos/L_minus": 2.3593506813049316, "eos/grad_norm": 0.13649342954158783, "eos/embed_grad_frac": 0.12281631678342819, "eos/time_s": 0.5998454093933105} {"step": 36975, "timestamp": 1778234501.9948597, "geo/rankme_last": 439.67266845703125, "geo/layer_0/stable_rank_q_proj": 19.0557861328125, "geo/layer_0/stable_rank_k_proj": 16.27996253967285, "geo/layer_0/stable_rank_o_proj": 49.47549057006836, "geo/layer_0/stable_rank_gate_proj": 139.6335906982422, "geo/layer_0/stable_rank_down_proj": 52.86885070800781, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05585124343633652, "geo/layer_0/attn_entropy_mean": 6.204596519470215, "geo/layer_0/attn_entropy_std": 0.3669837415218353, "geo/layer_7/stable_rank_q_proj": 43.30507278442383, "geo/layer_7/stable_rank_k_proj": 42.07024002075195, "geo/layer_7/stable_rank_o_proj": 100.41485595703125, "geo/layer_7/stable_rank_gate_proj": 92.14144134521484, "geo/layer_7/stable_rank_down_proj": 145.22640991210938, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5015411376953125, "geo/layer_7/attn_entropy_mean": 4.655417442321777, "geo/layer_7/attn_entropy_std": 0.8035162091255188, "geo/layer_14/stable_rank_q_proj": 54.49472427368164, "geo/layer_14/stable_rank_k_proj": 36.636688232421875, "geo/layer_14/stable_rank_o_proj": 49.66232681274414, "geo/layer_14/stable_rank_gate_proj": 77.56958770751953, "geo/layer_14/stable_rank_down_proj": 133.0044403076172, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3760761320590973, "geo/layer_14/attn_entropy_mean": 5.503694534301758, "geo/layer_14/attn_entropy_std": 0.3719744086265564, "geo/layer_21/stable_rank_q_proj": 43.91945266723633, "geo/layer_21/stable_rank_k_proj": 31.13873863220215, "geo/layer_21/stable_rank_o_proj": 77.32110595703125, "geo/layer_21/stable_rank_gate_proj": 73.96475219726562, "geo/layer_21/stable_rank_down_proj": 56.47731018066406, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14900803565979004, "geo/layer_21/attn_entropy_mean": 5.714539527893066, "geo/layer_21/attn_entropy_std": 0.29174309968948364, "geo/layer_27/stable_rank_q_proj": 42.4328727722168, "geo/layer_27/stable_rank_k_proj": 31.458066940307617, "geo/layer_27/stable_rank_o_proj": 117.52837371826172, "geo/layer_27/stable_rank_gate_proj": 85.55174255371094, "geo/layer_27/stable_rank_down_proj": 132.78182983398438, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08899465948343277, "geo/layer_27/attn_entropy_mean": 4.283206462860107, "geo/layer_27/attn_entropy_std": 0.653927743434906, "attnres/final_alpha/block_0": 0.24341663718223572, "attnres/block_norm/0": 1.71177339553833, "attnres/final_alpha/block_1": 0.005431121215224266, "attnres/block_norm/1": 38934.3671875, "attnres/final_alpha/block_2": 0.011590497568249702, "attnres/block_norm/2": 25731.916015625, "attnres/final_alpha/block_3": 0.01369692012667656, "attnres/block_norm/3": 44514.1015625, "attnres/final_alpha/block_4": 0.016855396330356598, "attnres/block_norm/4": 12481.19140625, "attnres/final_alpha/block_5": 0.5887563228607178, "attnres/block_norm/5": 5893.44091796875, "attnres/final_alpha/block_6": 0.12025308609008789, "attnres/block_norm/6": 29575.3046875, "geo/tier1_time_s": 1.3571949005126953, "geo/step": 36975.0, "geo/rankme_slope": -6.527786896008403e-05} {"step": 36980, "timestamp": 1778234507.171429, "train/loss": 2.1394174695014954, "train/z_loss": 0.0014202246209606528, "train/perplexity": 8.49448788907433, "train/grad_norm": 0.18359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1701955.494877799, "perf/iters_per_sec": 0.8115556215657229, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2322014331817628, "data/tokens_consumed": 77554778112, "data/tokens_consumed_B": 77.554778112, "train/loss_slope": -5.053384140236611e-06} {"step": 36990, "timestamp": 1778234517.5377333, "train/loss": 2.1739211559295653, "train/z_loss": 0.0014354399405419826, "train/perplexity": 8.792694057571616, "train/grad_norm": 0.1025390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024220.3416040516, "perf/iters_per_sec": 0.9652234752674349, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0360295057296753, "data/tokens_consumed": 77575749632, "data/tokens_consumed_B": 77.575749632, "train/loss_slope": -5.823801338511616e-06} {"step": 37000, "timestamp": 1778234527.8886178, "grad/layer_0/attn": 0.0039115301333367825, "grad/layer_0/mlp": 0.0031478572636842728, "grad/layer_0/attn_mlp_ratio": 1.2426008174521295, "grad/layer_4/attn": 0.0019408742664381862, "grad/layer_4/mlp": 0.0025368034839630127, "grad/layer_4/attn_mlp_ratio": 0.7650865359493613, "grad/layer_8/attn": 0.006035581696778536, "grad/layer_8/mlp": 0.0038469305727630854, "grad/layer_8/attn_mlp_ratio": 1.5689343557739335, "grad/layer_12/attn": 0.004591164644807577, "grad/layer_12/mlp": 0.007075024768710136, "grad/layer_12/attn_mlp_ratio": 0.6489255840092337, "grad/layer_16/attn": 0.003688414115458727, "grad/layer_16/mlp": 0.004546589683741331, "grad/layer_16/attn_mlp_ratio": 0.8112484941237822, "grad/layer_20/attn": 0.005674754269421101, "grad/layer_20/mlp": 0.005123774986714125, "grad/layer_20/attn_mlp_ratio": 1.1075338346009094, "grad/layer_24/attn": 0.010139011777937412, "grad/layer_24/mlp": 0.007942793890833855, "grad/layer_24/attn_mlp_ratio": 1.2765044378134494, "grad/layer_27/attn": 0.010051148943603039, "grad/layer_27/mlp": 0.006665781140327454, "grad/layer_27/attn_mlp_ratio": 1.5078726080589588} {"step": 37000, "timestamp": 1778234527.9043605, "train/loss": 2.1606915950775147, "train/z_loss": 0.0014322827802971006, "train/perplexity": 8.677136648781476, "train/grad_norm": 0.109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024107.337996785, "perf/iters_per_sec": 0.9651695909484792, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0360873460769653, "data/tokens_consumed": 77596721152, "data/tokens_consumed_B": 77.596721152, "train/loss_slope": -4.448956474684098e-06} {"step": 37000, "timestamp": 1778234534.788849, "geo/ww_alpha_mean": 7.7714235128605855, "geo/ww_alpha_std": 4.777536421765268, "geo/ww_alpha_min": 1.3460867897107538, "geo/ww_alpha_max": 37.076371176773506, "geo/ww_alpha_healthy_frac": 0.17258883248730963, "geo/ww_alpha_by_type/q_proj": 4.040601081378438, "geo/ww_alpha_by_type/k_proj": 4.523839974056188, "geo/ww_alpha_by_type/v_proj": 8.383454520956464, "geo/ww_alpha_by_type/o_proj": 8.769017519246725, "geo/ww_alpha_by_type/gate_proj": 7.776654064970525, "geo/ww_alpha_by_type/up_proj": 12.05545599405176, "geo/ww_alpha_by_type/down_proj": 8.965266649184798, "geo/twonn_id/layer_0": 0.7348678708076477, "geo/twonn_id/layer_7": 3.2609009742736816, "geo/twonn_id/layer_14": 4.045636177062988, "geo/twonn_id/layer_21": 7.288363456726074, "geo/twonn_id/layer_27": 6.673467636108398, "geo/tier2_time_s": 6.877158880233765} {"step": 37000, "timestamp": 1778234535.4183402, "eoc/jacobian_sigma/layer_0/attn": 1062.384521484375, "eoc/jacobian_sigma/layer_0/mlp": 7954.3349609375, "eoc/jacobian_sigma/layer_0": 7954.3349609375, "eoc/jacobian_sigma/layer_7/attn": 1.1341912746429443, "eoc/jacobian_sigma/layer_7/mlp": 1.7226017713546753, "eoc/jacobian_sigma/layer_7": 1.7226017713546753, "eoc/jacobian_sigma/layer_14/attn": 1.6349871158599854, "eoc/jacobian_sigma/layer_14/mlp": 5.778046131134033, "eoc/jacobian_sigma/layer_14": 5.778046131134033, "eoc/jacobian_sigma/layer_21/attn": 1.09697425365448, "eoc/jacobian_sigma/layer_21/mlp": 4.3008928298950195, "eoc/jacobian_sigma/layer_21": 4.3008928298950195, "eoc/jacobian_sigma/layer_27/attn": 3.712000608444214, "eoc/jacobian_sigma/layer_27/mlp": 20.54033660888672, "eoc/jacobian_sigma/layer_27": 20.54033660888672, "eoc/layer0_sigma": 7954.3349609375, "eoc/sigma_max": 20.54033660888672, "eoc/sigma_min": 1.7226017713546753, "eoc/sigma_mean": 8.085469335317612, "eoc/time_s": 0.6232566833496094} {"step": 37010, "timestamp": 1778234545.782029, "train/loss": 2.135456371307373, "train/z_loss": 0.001431420282460749, "train/perplexity": 8.460906941061102, "train/grad_norm": 0.2421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1173464.8337624408, "perf/iters_per_sec": 0.5595516365825848, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.787145161628723, "data/tokens_consumed": 77617692672, "data/tokens_consumed_B": 77.617692672, "train/loss_slope": -7.913307245164613e-06} {"step": 37020, "timestamp": 1778234556.158473, "train/loss": 2.2217885732650755, "train/z_loss": 0.0014254891662858426, "train/perplexity": 9.223813587595357, "train/grad_norm": 0.1015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022065.8527212786, "perf/iters_per_sec": 0.9641961349111932, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0371333837509156, "data/tokens_consumed": 77638664192, "data/tokens_consumed_B": 77.638664192, "train/loss_slope": -2.055496365466742e-06} {"step": 37030, "timestamp": 1778234566.5386138, "train/loss": 2.206118369102478, "train/z_loss": 0.0014124170644208788, "train/perplexity": 9.08040113138285, "train/grad_norm": 0.328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021654.4161760062, "perf/iters_per_sec": 0.9639999466781646, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373444557189941, "data/tokens_consumed": 77659635712, "data/tokens_consumed_B": 77.659635712, "train/loss_slope": -1.5659106518104038e-06} {"step": 37040, "timestamp": 1778234576.9200027, "train/loss": 2.2088646411895754, "train/z_loss": 0.001424252090509981, "train/perplexity": 9.105372657157094, "train/grad_norm": 0.0986328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021176.4064865382, "perf/iters_per_sec": 0.9637720138962451, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0375897884368896, "data/tokens_consumed": 77680607232, "data/tokens_consumed_B": 77.680607232, "train/loss_slope": 2.608497944673644e-06} {"step": 37050, "timestamp": 1778234587.2806897, "grad/layer_0/attn": 0.002408880041912198, "grad/layer_0/mlp": 0.002535976469516754, "grad/layer_0/attn_mlp_ratio": 0.9498826096690731, "grad/layer_4/attn": 0.0021235886961221695, "grad/layer_4/mlp": 0.0024048949126154184, "grad/layer_4/attn_mlp_ratio": 0.8830276103457351, "grad/layer_8/attn": 0.0036411737091839314, "grad/layer_8/mlp": 0.0035627775359898806, "grad/layer_8/attn_mlp_ratio": 1.0220042004311811, "grad/layer_12/attn": 0.004267217591404915, "grad/layer_12/mlp": 0.006578963715583086, "grad/layer_12/attn_mlp_ratio": 0.6486154523752639, "grad/layer_16/attn": 0.005300747696310282, "grad/layer_16/mlp": 0.00428931787610054, "grad/layer_16/attn_mlp_ratio": 1.2358019913294567, "grad/layer_20/attn": 0.003632629755884409, "grad/layer_20/mlp": 0.005420361179858446, "grad/layer_20/attn_mlp_ratio": 0.6701822200270894, "grad/layer_24/attn": 0.007201930973678827, "grad/layer_24/mlp": 0.008113324642181396, "grad/layer_24/attn_mlp_ratio": 0.8876670418769006, "grad/layer_27/attn": 0.004246744327247143, "grad/layer_27/mlp": 0.007544298656284809, "grad/layer_27/attn_mlp_ratio": 0.5629077617995145} {"step": 37050, "timestamp": 1778234587.897312, "eos/sharpness": 18.844270706176754, "eos/L0_probe": 2.0045876502990723, "eos/L_plus": 2.099724292755127, "eos/L_minus": 2.097893714904785, "eos/grad_norm": 0.0977044403553009, "eos/embed_grad_frac": 0.20787854492664337, "eos/time_s": 0.6138644218444824} {"step": 37050, "timestamp": 1778234587.917806, "train/loss": 2.221633219718933, "train/z_loss": 0.0014118362800218166, "train/perplexity": 9.222380746746902, "train/grad_norm": 0.09765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1907491.5052269183, "perf/iters_per_sec": 0.9095628286490051, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0994292736053466, "data/tokens_consumed": 77701578752, "data/tokens_consumed_B": 77.701578752, "train/loss_slope": 5.068289378796002e-06} {"step": 37050, "timestamp": 1778234589.27749, "geo/rankme_last": 438.65765380859375, "geo/layer_0/stable_rank_q_proj": 19.05030632019043, "geo/layer_0/stable_rank_k_proj": 16.26229476928711, "geo/layer_0/stable_rank_o_proj": 49.48740768432617, "geo/layer_0/stable_rank_gate_proj": 139.51808166503906, "geo/layer_0/stable_rank_down_proj": 52.78300094604492, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06388651579618454, "geo/layer_0/attn_entropy_mean": 6.200542449951172, "geo/layer_0/attn_entropy_std": 0.3661293387413025, "geo/layer_7/stable_rank_q_proj": 43.24201965332031, "geo/layer_7/stable_rank_k_proj": 42.100990295410156, "geo/layer_7/stable_rank_o_proj": 100.50205993652344, "geo/layer_7/stable_rank_gate_proj": 92.14629364013672, "geo/layer_7/stable_rank_down_proj": 144.8921661376953, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5142958760261536, "geo/layer_7/attn_entropy_mean": 4.638617992401123, "geo/layer_7/attn_entropy_std": 0.808312714099884, "geo/layer_14/stable_rank_q_proj": 54.56134033203125, "geo/layer_14/stable_rank_k_proj": 36.63580322265625, "geo/layer_14/stable_rank_o_proj": 49.56632614135742, "geo/layer_14/stable_rank_gate_proj": 77.4695053100586, "geo/layer_14/stable_rank_down_proj": 133.45411682128906, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3811955451965332, "geo/layer_14/attn_entropy_mean": 5.52617073059082, "geo/layer_14/attn_entropy_std": 0.3624270558357239, "geo/layer_21/stable_rank_q_proj": 43.93050003051758, "geo/layer_21/stable_rank_k_proj": 31.11952018737793, "geo/layer_21/stable_rank_o_proj": 77.37175750732422, "geo/layer_21/stable_rank_gate_proj": 73.8507308959961, "geo/layer_21/stable_rank_down_proj": 56.54798126220703, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14764699339866638, "geo/layer_21/attn_entropy_mean": 5.731090545654297, "geo/layer_21/attn_entropy_std": 0.2837575078010559, "geo/layer_27/stable_rank_q_proj": 42.47615432739258, "geo/layer_27/stable_rank_k_proj": 31.433605194091797, "geo/layer_27/stable_rank_o_proj": 117.64240264892578, "geo/layer_27/stable_rank_gate_proj": 85.6637191772461, "geo/layer_27/stable_rank_down_proj": 132.8931121826172, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08571913093328476, "geo/layer_27/attn_entropy_mean": 4.291798114776611, "geo/layer_27/attn_entropy_std": 0.6534958481788635, "attnres/final_alpha/block_0": 0.24176433682441711, "attnres/block_norm/0": 1.7119574546813965, "attnres/final_alpha/block_1": 0.005349435843527317, "attnres/block_norm/1": 39028.36328125, "attnres/final_alpha/block_2": 0.011577746830880642, "attnres/block_norm/2": 25811.865234375, "attnres/final_alpha/block_3": 0.013967989943921566, "attnres/block_norm/3": 44572.4296875, "attnres/final_alpha/block_4": 0.016970878466963768, "attnres/block_norm/4": 12519.0888671875, "attnres/final_alpha/block_5": 0.590467631816864, "attnres/block_norm/5": 5888.724609375, "attnres/final_alpha/block_6": 0.11990202963352203, "attnres/block_norm/6": 29602.189453125, "geo/tier1_time_s": 1.3561761379241943, "geo/step": 37050.0, "geo/rankme_slope": -7.191700899109644e-05} {"step": 37060, "timestamp": 1778234599.6555293, "train/loss": 2.191883111000061, "train/z_loss": 0.001410823001060635, "train/perplexity": 8.952054965363134, "train/grad_norm": 0.1044921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1787199.8270457487, "perf/iters_per_sec": 0.8522032866696113, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1734289407730103, "data/tokens_consumed": 77722550272, "data/tokens_consumed_B": 77.722550272, "train/loss_slope": 5.417451020156959e-06} {"step": 37070, "timestamp": 1778234610.0393677, "train/loss": 2.137033247947693, "train/z_loss": 0.0014381670975126327, "train/perplexity": 8.474259272295233, "train/grad_norm": 0.1220703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021040.8025367975, "perf/iters_per_sec": 0.9637073528942096, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0376594066619873, "data/tokens_consumed": 77743521792, "data/tokens_consumed_B": 77.743521792, "train/loss_slope": -1.997463967111265e-06} {"step": 37080, "timestamp": 1778234620.4202654, "train/loss": 2.1898420810699464, "train/z_loss": 0.0014080997905693948, "train/perplexity": 8.933802186812917, "train/grad_norm": 0.33203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021427.1824082725, "perf/iters_per_sec": 0.9638915931741107, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0374610662460326, "data/tokens_consumed": 77764493312, "data/tokens_consumed_B": 77.764493312, "train/loss_slope": -1.593400793247639e-08} {"step": 37090, "timestamp": 1778234630.8035223, "train/loss": 2.1899885654449465, "train/z_loss": 0.0014203742728568613, "train/perplexity": 8.935110945096607, "train/grad_norm": 0.12255859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020786.9191425908, "perf/iters_per_sec": 0.9635862918580012, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03778977394104, "data/tokens_consumed": 77785464832, "data/tokens_consumed_B": 77.785464832, "train/loss_slope": 2.9514856893595326e-06} {"step": 37100, "timestamp": 1778234641.1673832, "grad/layer_0/attn": 0.0034015050623565912, "grad/layer_0/mlp": 0.0031658352818340063, "grad/layer_0/attn_mlp_ratio": 1.0744415461002457, "grad/layer_4/attn": 0.003041781485080719, "grad/layer_4/mlp": 0.0024656897876411676, "grad/layer_4/attn_mlp_ratio": 1.233643168318574, "grad/layer_8/attn": 0.0045760441571474075, "grad/layer_8/mlp": 0.003580356016755104, "grad/layer_8/attn_mlp_ratio": 1.2780974874909095, "grad/layer_12/attn": 0.00489827012643218, "grad/layer_12/mlp": 0.006949443835765123, "grad/layer_12/attn_mlp_ratio": 0.7048434625428615, "grad/layer_16/attn": 0.004834913648664951, "grad/layer_16/mlp": 0.004889489617198706, "grad/layer_16/attn_mlp_ratio": 0.9888380850169736, "grad/layer_20/attn": 0.003198579652234912, "grad/layer_20/mlp": 0.005929979030042887, "grad/layer_20/attn_mlp_ratio": 0.5393913843693038, "grad/layer_24/attn": 0.011758019216358662, "grad/layer_24/mlp": 0.009943703189492226, "grad/layer_24/attn_mlp_ratio": 1.1824587755734497, "grad/layer_27/attn": 0.008375906385481358, "grad/layer_27/mlp": 0.009789064526557922, "grad/layer_27/attn_mlp_ratio": 0.8556390937247836} {"step": 37100, "timestamp": 1778234641.1831799, "train/loss": 2.1473888516426087, "train/z_loss": 0.001430226454976946, "train/perplexity": 8.562471299012747, "train/grad_norm": 0.1708984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021672.6305396946, "perf/iters_per_sec": 0.9640086319635842, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373351097106933, "data/tokens_consumed": 77806436352, "data/tokens_consumed_B": 77.806436352, "train/loss_slope": -5.85643815235974e-07} {"step": 37110, "timestamp": 1778234651.559419, "train/loss": 2.195379638671875, "train/z_loss": 0.0014132547657936811, "train/perplexity": 8.983410859699582, "train/grad_norm": 0.14453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022244.7378587225, "perf/iters_per_sec": 0.9642814339917767, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0370416402816773, "data/tokens_consumed": 77827407872, "data/tokens_consumed_B": 77.827407872, "train/loss_slope": 3.4382206092466167e-06} {"step": 37120, "timestamp": 1778234661.9348805, "train/loss": 2.1652143239974975, "train/z_loss": 0.0014261781587265431, "train/perplexity": 8.716469865336602, "train/grad_norm": 0.1337890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022219.5395443768, "perf/iters_per_sec": 0.9642694184991726, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0370545625686645, "data/tokens_consumed": 77848379392, "data/tokens_consumed_B": 77.848379392, "train/loss_slope": 6.598660726286963e-07} {"step": 37125, "timestamp": 1778234667.7143745, "eos/sharpness": 32.61263370513915, "eos/L0_probe": 2.003737688064575, "eos/L_plus": 2.1763505935668945, "eos/L_minus": 2.1572511196136475, "eos/grad_norm": 0.14300674200057983, "eos/embed_grad_frac": 0.121795654296875, "eos/time_s": 0.6032819747924805} {"step": 37125, "timestamp": 1778234669.0927474, "geo/rankme_last": 439.5437316894531, "geo/layer_0/stable_rank_q_proj": 19.033828735351562, "geo/layer_0/stable_rank_k_proj": 16.217267990112305, "geo/layer_0/stable_rank_o_proj": 49.54063415527344, "geo/layer_0/stable_rank_gate_proj": 139.21600341796875, "geo/layer_0/stable_rank_down_proj": 52.86514663696289, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.059634361416101456, "geo/layer_0/attn_entropy_mean": 6.203662872314453, "geo/layer_0/attn_entropy_std": 0.3662204444408417, "geo/layer_7/stable_rank_q_proj": 43.123802185058594, "geo/layer_7/stable_rank_k_proj": 42.125144958496094, "geo/layer_7/stable_rank_o_proj": 100.4779052734375, "geo/layer_7/stable_rank_gate_proj": 92.05412292480469, "geo/layer_7/stable_rank_down_proj": 145.0270233154297, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5112214088439941, "geo/layer_7/attn_entropy_mean": 4.66541862487793, "geo/layer_7/attn_entropy_std": 0.7958498597145081, "geo/layer_14/stable_rank_q_proj": 54.45216751098633, "geo/layer_14/stable_rank_k_proj": 36.69655990600586, "geo/layer_14/stable_rank_o_proj": 49.509735107421875, "geo/layer_14/stable_rank_gate_proj": 77.63191986083984, "geo/layer_14/stable_rank_down_proj": 133.3116455078125, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3812878727912903, "geo/layer_14/attn_entropy_mean": 5.507350444793701, "geo/layer_14/attn_entropy_std": 0.37749627232551575, "geo/layer_21/stable_rank_q_proj": 43.88473129272461, "geo/layer_21/stable_rank_k_proj": 31.207763671875, "geo/layer_21/stable_rank_o_proj": 77.40957641601562, "geo/layer_21/stable_rank_gate_proj": 73.75580596923828, "geo/layer_21/stable_rank_down_proj": 56.490203857421875, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14936774969100952, "geo/layer_21/attn_entropy_mean": 5.71612024307251, "geo/layer_21/attn_entropy_std": 0.29313260316848755, "geo/layer_27/stable_rank_q_proj": 42.49898910522461, "geo/layer_27/stable_rank_k_proj": 31.45569610595703, "geo/layer_27/stable_rank_o_proj": 117.3457260131836, "geo/layer_27/stable_rank_gate_proj": 85.65750122070312, "geo/layer_27/stable_rank_down_proj": 132.84027099609375, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08337540924549103, "geo/layer_27/attn_entropy_mean": 4.278528213500977, "geo/layer_27/attn_entropy_std": 0.6573271155357361, "attnres/final_alpha/block_0": 0.24250227212905884, "attnres/block_norm/0": 1.7118878364562988, "attnres/final_alpha/block_1": 0.0053057922050356865, "attnres/block_norm/1": 38886.07421875, "attnres/final_alpha/block_2": 0.011772365309298038, "attnres/block_norm/2": 25780.474609375, "attnres/final_alpha/block_3": 0.014030753634870052, "attnres/block_norm/3": 44476.1171875, "attnres/final_alpha/block_4": 0.01697157695889473, "attnres/block_norm/4": 12427.72265625, "attnres/final_alpha/block_5": 0.5928400754928589, "attnres/block_norm/5": 5938.765625, "attnres/final_alpha/block_6": 0.11657717823982239, "attnres/block_norm/6": 29934.95703125, "geo/tier1_time_s": 1.3589375019073486, "geo/step": 37125.0, "geo/rankme_slope": -3.864477431597639e-05} {"step": 37130, "timestamp": 1778234674.2867029, "train/loss": 2.1842052578926086, "train/z_loss": 0.0014270033105276525, "train/perplexity": 8.883585587593897, "train/grad_norm": 0.1220703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1698738.9125594208, "perf/iters_per_sec": 0.8100218355939011, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2345346212387085, "data/tokens_consumed": 77869350912, "data/tokens_consumed_B": 77.869350912, "train/loss_slope": 1.9759510574203562e-06} {"step": 37140, "timestamp": 1778234684.6678975, "train/loss": 2.1640027284622194, "train/z_loss": 0.0014383573085069656, "train/perplexity": 8.705915424512193, "train/grad_norm": 0.26171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021147.3801379334, "perf/iters_per_sec": 0.9637581730546634, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0376046895980835, "data/tokens_consumed": 77890322432, "data/tokens_consumed_B": 77.890322432, "train/loss_slope": 1.246649482891869e-06} {"step": 37150, "timestamp": 1778234695.030566, "grad/layer_0/attn": 0.002777933143079281, "grad/layer_0/mlp": 0.0028350315988063812, "grad/layer_0/attn_mlp_ratio": 0.9798596411633985, "grad/layer_4/attn": 0.0023756849113851786, "grad/layer_4/mlp": 0.002406321233138442, "grad/layer_4/attn_mlp_ratio": 0.9872683579988438, "grad/layer_8/attn": 0.00572910625487566, "grad/layer_8/mlp": 0.0036845840513706207, "grad/layer_8/attn_mlp_ratio": 1.5548854414804183, "grad/layer_12/attn": 0.004048728384077549, "grad/layer_12/mlp": 0.00633371202275157, "grad/layer_12/attn_mlp_ratio": 0.63923467085502, "grad/layer_16/attn": 0.003923693206161261, "grad/layer_16/mlp": 0.004283358808606863, "grad/layer_16/attn_mlp_ratio": 0.9160318548784461, "grad/layer_20/attn": 0.0035102602560073137, "grad/layer_20/mlp": 0.005606113467365503, "grad/layer_20/attn_mlp_ratio": 0.6261486168316959, "grad/layer_24/attn": 0.00505880918353796, "grad/layer_24/mlp": 0.00790899433195591, "grad/layer_24/attn_mlp_ratio": 0.6396273542813592, "grad/layer_27/attn": 0.00429584551602602, "grad/layer_27/mlp": 0.006229067221283913, "grad/layer_27/attn_mlp_ratio": 0.6896450615243273} {"step": 37150, "timestamp": 1778234695.0462837, "train/loss": 2.171660804748535, "train/z_loss": 0.001446006540209055, "train/perplexity": 8.772841926020998, "train/grad_norm": 0.083984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021663.987969066, "perf/iters_per_sec": 0.9640045108647661, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373395442962647, "data/tokens_consumed": 77911293952, "data/tokens_consumed_B": 77.911293952, "train/loss_slope": 1.7081680649792573e-06} {"step": 37160, "timestamp": 1778234705.4221396, "train/loss": 2.1895191192626955, "train/z_loss": 0.0014306371100246905, "train/perplexity": 8.93091737578001, "train/grad_norm": 0.287109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022165.5651945858, "perf/iters_per_sec": 0.9642436815236024, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0370822429656983, "data/tokens_consumed": 77932265472, "data/tokens_consumed_B": 77.932265472, "train/loss_slope": 7.284659649780002e-07} {"step": 37170, "timestamp": 1778234715.8062189, "train/loss": 2.252498173713684, "train/z_loss": 0.001401535701006651, "train/perplexity": 9.511467479168243, "train/grad_norm": 0.111328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020628.947465834, "perf/iters_per_sec": 0.9635109650925798, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0378709077835082, "data/tokens_consumed": 77953236992, "data/tokens_consumed_B": 77.953236992, "train/loss_slope": 5.774654592439056e-06} {"step": 37180, "timestamp": 1778234726.1761928, "train/loss": 2.1767924070358275, "train/z_loss": 0.0014257438015192747, "train/perplexity": 8.817976368674184, "train/grad_norm": 0.267578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023257.699858185, "perf/iters_per_sec": 0.9647644519129681, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0365224361419678, "data/tokens_consumed": 77974208512, "data/tokens_consumed_B": 77.974208512, "train/loss_slope": 5.906245853676994e-06} {"step": 37190, "timestamp": 1778234736.5507808, "train/loss": 2.2043641328811647, "train/z_loss": 0.0014148279093205928, "train/perplexity": 9.064485926410022, "train/grad_norm": 0.1728515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022433.001225265, "perf/iters_per_sec": 0.9643712049604726, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036945104598999, "data/tokens_consumed": 77995180032, "data/tokens_consumed_B": 77.995180032, "train/loss_slope": 8.107085554155534e-06} {"step": 37200, "timestamp": 1778234746.9365222, "grad/layer_0/attn": 0.003039916744455695, "grad/layer_0/mlp": 0.002966594882309437, "grad/layer_0/attn_mlp_ratio": 1.0247157979378698, "grad/layer_4/attn": 0.002238778630271554, "grad/layer_4/mlp": 0.0025239097885787487, "grad/layer_4/attn_mlp_ratio": 0.8870279562683767, "grad/layer_8/attn": 0.003284611040726304, "grad/layer_8/mlp": 0.0036871337797492743, "grad/layer_8/attn_mlp_ratio": 0.8908304248907984, "grad/layer_12/attn": 0.007158439140766859, "grad/layer_12/mlp": 0.006733093876391649, "grad/layer_12/attn_mlp_ratio": 1.0631723195705576, "grad/layer_16/attn": 0.008095039054751396, "grad/layer_16/mlp": 0.004805866628885269, "grad/layer_16/attn_mlp_ratio": 1.6844077273505789, "grad/layer_20/attn": 0.007826441898941994, "grad/layer_20/mlp": 0.006448805332183838, "grad/layer_20/attn_mlp_ratio": 1.2136266136799279, "grad/layer_24/attn": 0.006383545696735382, "grad/layer_24/mlp": 0.009514396078884602, "grad/layer_24/attn_mlp_ratio": 0.6709354515741579, "grad/layer_27/attn": 0.006152737885713577, "grad/layer_27/mlp": 0.008285108022391796, "grad/layer_27/attn_mlp_ratio": 0.7426261425707704} {"step": 37200, "timestamp": 1778234747.540471, "eos/sharpness": 15.373992919921871, "eos/L0_probe": 2.0010054111480713, "eos/L_plus": 2.07956862449646, "eos/L_minus": 2.0761821269989014, "eos/grad_norm": 0.11270292103290558, "eos/embed_grad_frac": 0.20782417058944702, "eos/time_s": 0.6012113094329834} {"step": 37200, "timestamp": 1778234747.5598333, "train/loss": 2.2046926259994506, "train/z_loss": 0.0014169969246722758, "train/perplexity": 9.067464036775245, "train/grad_norm": 0.11279296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1907419.863344657, "perf/iters_per_sec": 0.9095286671374593, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.099470567703247, "data/tokens_consumed": 78016151552, "data/tokens_consumed_B": 78.016151552, "train/loss_slope": 1.3495592616035724e-05} {"step": 37200, "timestamp": 1778234748.921502, "geo/rankme_last": 439.4249267578125, "geo/layer_0/stable_rank_q_proj": 19.071792602539062, "geo/layer_0/stable_rank_k_proj": 16.23807716369629, "geo/layer_0/stable_rank_o_proj": 49.60356903076172, "geo/layer_0/stable_rank_gate_proj": 139.29344177246094, "geo/layer_0/stable_rank_down_proj": 52.93288803100586, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0586417131125927, "geo/layer_0/attn_entropy_mean": 6.205513954162598, "geo/layer_0/attn_entropy_std": 0.3660964369773865, "geo/layer_7/stable_rank_q_proj": 42.97283935546875, "geo/layer_7/stable_rank_k_proj": 42.128082275390625, "geo/layer_7/stable_rank_o_proj": 100.32389068603516, "geo/layer_7/stable_rank_gate_proj": 91.98101806640625, "geo/layer_7/stable_rank_down_proj": 145.07586669921875, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5029211044311523, "geo/layer_7/attn_entropy_mean": 4.682422161102295, "geo/layer_7/attn_entropy_std": 0.824356198310852, "geo/layer_14/stable_rank_q_proj": 54.25723648071289, "geo/layer_14/stable_rank_k_proj": 36.75919723510742, "geo/layer_14/stable_rank_o_proj": 49.487060546875, "geo/layer_14/stable_rank_gate_proj": 77.53887176513672, "geo/layer_14/stable_rank_down_proj": 132.9548797607422, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39784711599349976, "geo/layer_14/attn_entropy_mean": 5.494953155517578, "geo/layer_14/attn_entropy_std": 0.3838343620300293, "geo/layer_21/stable_rank_q_proj": 43.936187744140625, "geo/layer_21/stable_rank_k_proj": 31.177736282348633, "geo/layer_21/stable_rank_o_proj": 77.2850570678711, "geo/layer_21/stable_rank_gate_proj": 73.79096984863281, "geo/layer_21/stable_rank_down_proj": 56.3929557800293, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1449451893568039, "geo/layer_21/attn_entropy_mean": 5.7301530838012695, "geo/layer_21/attn_entropy_std": 0.2906210422515869, "geo/layer_27/stable_rank_q_proj": 42.5808219909668, "geo/layer_27/stable_rank_k_proj": 31.499629974365234, "geo/layer_27/stable_rank_o_proj": 117.28118133544922, "geo/layer_27/stable_rank_gate_proj": 85.59567260742188, "geo/layer_27/stable_rank_down_proj": 132.5475616455078, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07813043147325516, "geo/layer_27/attn_entropy_mean": 4.313312530517578, "geo/layer_27/attn_entropy_std": 0.6300859451293945, "attnres/final_alpha/block_0": 0.24292099475860596, "attnres/block_norm/0": 1.7119848728179932, "attnres/final_alpha/block_1": 0.005359259434044361, "attnres/block_norm/1": 38983.05078125, "attnres/final_alpha/block_2": 0.011565550230443478, "attnres/block_norm/2": 25790.87890625, "attnres/final_alpha/block_3": 0.013751400634646416, "attnres/block_norm/3": 45213.5078125, "attnres/final_alpha/block_4": 0.017027921974658966, "attnres/block_norm/4": 12485.404296875, "attnres/final_alpha/block_5": 0.5909042358398438, "attnres/block_norm/5": 5910.4814453125, "attnres/final_alpha/block_6": 0.118470698595047, "attnres/block_norm/6": 29916.716796875, "geo/tier1_time_s": 1.3578581809997559, "geo/step": 37200.0, "geo/rankme_slope": -2.1313896652410965e-05} {"step": 37210, "timestamp": 1778234759.2992926, "train/loss": 2.188908004760742, "train/z_loss": 0.001422716665547341, "train/perplexity": 8.925461229990637, "train/grad_norm": 0.185546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1786988.8761947746, "perf/iters_per_sec": 0.8521026974653123, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1735674619674683, "data/tokens_consumed": 78037123072, "data/tokens_consumed_B": 78.037123072, "train/loss_slope": 1.1491283904028947e-05} {"step": 37220, "timestamp": 1778234769.67854, "train/loss": 2.1882951974868776, "train/z_loss": 0.0014358353335410356, "train/perplexity": 8.919993317985542, "train/grad_norm": 0.1044921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021708.6885133316, "perf/iters_per_sec": 0.9640258257452639, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373166084289551, "data/tokens_consumed": 78058094592, "data/tokens_consumed_B": 78.058094592, "train/loss_slope": 1.0873043669475399e-05} {"step": 37230, "timestamp": 1778234780.0589778, "train/loss": 2.205604815483093, "train/z_loss": 0.0014172824332490564, "train/perplexity": 9.075739055731738, "train/grad_norm": 0.1630859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021725.091599658, "perf/iters_per_sec": 0.9640336473463335, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373081922531129, "data/tokens_consumed": 78079066112, "data/tokens_consumed_B": 78.079066112, "train/loss_slope": 1.1239127121349331e-05} {"step": 37240, "timestamp": 1778234790.4322927, "train/loss": 2.1501498460769652, "train/z_loss": 0.0014412917313165962, "train/perplexity": 8.586144900916375, "train/grad_norm": 0.1875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022671.4843745222, "perf/iters_per_sec": 0.9644849225876437, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0368228435516358, "data/tokens_consumed": 78100037632, "data/tokens_consumed_B": 78.100037632, "train/loss_slope": 1.1765081783046278e-05} {"step": 37250, "timestamp": 1778234800.7988164, "grad/layer_0/attn": 0.0035407193936407566, "grad/layer_0/mlp": 0.0029662970919162035, "grad/layer_0/attn_mlp_ratio": 1.193649578771128, "grad/layer_4/attn": 0.0020361740607768297, "grad/layer_4/mlp": 0.002422233112156391, "grad/layer_4/attn_mlp_ratio": 0.8406185046749183, "grad/layer_8/attn": 0.0038964413106441498, "grad/layer_8/mlp": 0.003665878903120756, "grad/layer_8/attn_mlp_ratio": 1.0628941400758498, "grad/layer_12/attn": 0.005134690552949905, "grad/layer_12/mlp": 0.006793332286179066, "grad/layer_12/attn_mlp_ratio": 0.7558426794184777, "grad/layer_16/attn": 0.003855527378618717, "grad/layer_16/mlp": 0.004603131208568811, "grad/layer_16/attn_mlp_ratio": 0.8375879635329073, "grad/layer_20/attn": 0.004050910007208586, "grad/layer_20/mlp": 0.005551053676754236, "grad/layer_20/attn_mlp_ratio": 0.729755136614295, "grad/layer_24/attn": 0.010049659758806229, "grad/layer_24/mlp": 0.008447869680821896, "grad/layer_24/attn_mlp_ratio": 1.1896087439250862, "grad/layer_27/attn": 0.005454082973301411, "grad/layer_27/mlp": 0.006802702788263559, "grad/layer_27/attn_mlp_ratio": 0.8017523421037731} {"step": 37250, "timestamp": 1778234800.814632, "train/loss": 2.135612654685974, "train/z_loss": 0.0014295845525339247, "train/perplexity": 8.462229343515952, "train/grad_norm": 0.1337890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021067.643244067, "perf/iters_per_sec": 0.9637201515407882, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0376456260681153, "data/tokens_consumed": 78121009152, "data/tokens_consumed_B": 78.121009152, "train/loss_slope": 1.1144675523212506e-05} {"step": 37260, "timestamp": 1778234811.19921, "train/loss": 2.2019542694091796, "train/z_loss": 0.0014292853651568294, "train/perplexity": 9.042668052491175, "train/grad_norm": 0.12060546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021013.3588829364, "perf/iters_per_sec": 0.9636942667402918, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0376734972000121, "data/tokens_consumed": 78141980672, "data/tokens_consumed_B": 78.141980672, "train/loss_slope": 1.209278436455318e-05} {"step": 37270, "timestamp": 1778234821.5738904, "train/loss": 2.1794852018356323, "train/z_loss": 0.0014080806053243578, "train/perplexity": 8.84175336850777, "train/grad_norm": 0.1552734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022379.2944688534, "perf/iters_per_sec": 0.964345595583369, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0369726419448853, "data/tokens_consumed": 78162952192, "data/tokens_consumed_B": 78.162952192, "train/loss_slope": 8.3202123928098e-06} {"step": 37275, "timestamp": 1778234827.353792, "eos/sharpness": 65.32928943634032, "eos/L0_probe": 1.9996922016143799, "eos/L_plus": 2.2720298767089844, "eos/L_minus": 2.3806474208831787, "eos/grad_norm": 0.19194376468658447, "eos/embed_grad_frac": 0.06429723650217056, "eos/time_s": 0.6039218902587891} {"step": 37275, "timestamp": 1778234828.7288194, "geo/rankme_last": 439.7729187011719, "geo/layer_0/stable_rank_q_proj": 19.093122482299805, "geo/layer_0/stable_rank_k_proj": 16.242877960205078, "geo/layer_0/stable_rank_o_proj": 49.59242248535156, "geo/layer_0/stable_rank_gate_proj": 138.8518524169922, "geo/layer_0/stable_rank_down_proj": 53.019840240478516, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05927982181310654, "geo/layer_0/attn_entropy_mean": 6.203144073486328, "geo/layer_0/attn_entropy_std": 0.36733463406562805, "geo/layer_7/stable_rank_q_proj": 43.064048767089844, "geo/layer_7/stable_rank_k_proj": 41.962215423583984, "geo/layer_7/stable_rank_o_proj": 100.1368637084961, "geo/layer_7/stable_rank_gate_proj": 91.9565200805664, "geo/layer_7/stable_rank_down_proj": 145.07424926757812, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5125149488449097, "geo/layer_7/attn_entropy_mean": 4.657806396484375, "geo/layer_7/attn_entropy_std": 0.8171520233154297, "geo/layer_14/stable_rank_q_proj": 54.22187805175781, "geo/layer_14/stable_rank_k_proj": 36.7380485534668, "geo/layer_14/stable_rank_o_proj": 49.48698425292969, "geo/layer_14/stable_rank_gate_proj": 77.50282287597656, "geo/layer_14/stable_rank_down_proj": 133.23928833007812, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3789095878601074, "geo/layer_14/attn_entropy_mean": 5.506797790527344, "geo/layer_14/attn_entropy_std": 0.35730618238449097, "geo/layer_21/stable_rank_q_proj": 43.87743377685547, "geo/layer_21/stable_rank_k_proj": 31.19588851928711, "geo/layer_21/stable_rank_o_proj": 77.24186706542969, "geo/layer_21/stable_rank_gate_proj": 73.84687042236328, "geo/layer_21/stable_rank_down_proj": 56.3524169921875, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14570662379264832, "geo/layer_21/attn_entropy_mean": 5.708703517913818, "geo/layer_21/attn_entropy_std": 0.29097816348075867, "geo/layer_27/stable_rank_q_proj": 42.61860656738281, "geo/layer_27/stable_rank_k_proj": 31.416833877563477, "geo/layer_27/stable_rank_o_proj": 117.33948516845703, "geo/layer_27/stable_rank_gate_proj": 85.54940795898438, "geo/layer_27/stable_rank_down_proj": 132.64031982421875, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08710134029388428, "geo/layer_27/attn_entropy_mean": 4.305079936981201, "geo/layer_27/attn_entropy_std": 0.6452351808547974, "attnres/final_alpha/block_0": 0.24431520700454712, "attnres/block_norm/0": 1.712378978729248, "attnres/final_alpha/block_1": 0.005432371515780687, "attnres/block_norm/1": 39009.60546875, "attnres/final_alpha/block_2": 0.011955436319112778, "attnres/block_norm/2": 25739.44140625, "attnres/final_alpha/block_3": 0.013984128832817078, "attnres/block_norm/3": 44874.8984375, "attnres/final_alpha/block_4": 0.017279190942645073, "attnres/block_norm/4": 12515.5234375, "attnres/final_alpha/block_5": 0.5864627361297607, "attnres/block_norm/5": 5916.64404296875, "attnres/final_alpha/block_6": 0.12057092785835266, "attnres/block_norm/6": 29565.41796875, "geo/tier1_time_s": 1.3562214374542236, "geo/step": 37275.0, "geo/rankme_slope": -1.1040998430622248e-05} {"step": 37280, "timestamp": 1778234833.9184468, "train/loss": 2.1660295009613035, "train/z_loss": 0.0014425195869989694, "train/perplexity": 8.723578227669513, "train/grad_norm": 0.25, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1699628.5072235283, "perf/iters_per_sec": 0.8104460273854868, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2338884592056274, "data/tokens_consumed": 78183923712, "data/tokens_consumed_B": 78.183923712, "train/loss_slope": 5.53649983318789e-06} {"step": 37290, "timestamp": 1778234844.2975235, "train/loss": 2.1888606905937196, "train/z_loss": 0.001418115000706166, "train/perplexity": 8.925038939217494, "train/grad_norm": 0.1435546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021691.5886953792, "perf/iters_per_sec": 0.9640176719166657, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037325382232666, "data/tokens_consumed": 78204895232, "data/tokens_consumed_B": 78.204895232, "train/loss_slope": 8.320986477061794e-06} {"step": 37300, "timestamp": 1778234854.6590807, "grad/layer_0/attn": 0.0025936109013855457, "grad/layer_0/mlp": 0.002746060024946928, "grad/layer_0/attn_mlp_ratio": 0.944484382488046, "grad/layer_4/attn": 0.001979661639779806, "grad/layer_4/mlp": 0.0025710253976285458, "grad/layer_4/attn_mlp_ratio": 0.7699891119733352, "grad/layer_8/attn": 0.008119695819914341, "grad/layer_8/mlp": 0.003548674052581191, "grad/layer_8/attn_mlp_ratio": 2.2880928117923522, "grad/layer_12/attn": 0.00380191789008677, "grad/layer_12/mlp": 0.0061871809884905815, "grad/layer_12/attn_mlp_ratio": 0.6144830474024937, "grad/layer_16/attn": 0.0031929491087794304, "grad/layer_16/mlp": 0.004394374322146177, "grad/layer_16/attn_mlp_ratio": 0.7265992384918397, "grad/layer_20/attn": 0.004387520253658295, "grad/layer_20/mlp": 0.005762853659689426, "grad/layer_20/attn_mlp_ratio": 0.7613450621198391, "grad/layer_24/attn": 0.005813136231154203, "grad/layer_24/mlp": 0.00882711447775364, "grad/layer_24/attn_mlp_ratio": 0.6585545231059586, "grad/layer_27/attn": 0.006117724813520908, "grad/layer_27/mlp": 0.007187597453594208, "grad/layer_27/attn_mlp_ratio": 0.8511501608018799} {"step": 37300, "timestamp": 1778234854.6746964, "train/loss": 2.123180937767029, "train/z_loss": 0.0014518995070829988, "train/perplexity": 8.357680511036303, "train/grad_norm": 0.0966796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022108.0608103718, "perf/iters_per_sec": 0.9642162612964496, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0371117353439332, "data/tokens_consumed": 78225866752, "data/tokens_consumed_B": 78.225866752, "train/loss_slope": 8.728659056415375e-06} {"step": 37310, "timestamp": 1778234865.0513418, "train/loss": 2.2178085803985597, "train/z_loss": 0.0014247287763282657, "train/perplexity": 9.187175832679133, "train/grad_norm": 0.1748046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022261.521586264, "perf/iters_per_sec": 0.9642894370967217, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0370330333709716, "data/tokens_consumed": 78246838272, "data/tokens_consumed_B": 78.246838272, "train/loss_slope": 9.654552608218677e-06} {"step": 37320, "timestamp": 1778234875.4360127, "train/loss": 2.214735579490662, "train/z_loss": 0.0014356840867549181, "train/perplexity": 9.158986967391675, "train/grad_norm": 0.1015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021063.092347315, "perf/iters_per_sec": 0.9637179815041137, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0376479625701904, "data/tokens_consumed": 78267809792, "data/tokens_consumed_B": 78.267809792, "train/loss_slope": 1.271389925619373e-05} {"step": 37330, "timestamp": 1778234885.7827208, "train/loss": 2.2123759269714354, "train/z_loss": 0.0014343104790896176, "train/perplexity": 9.137400419113105, "train/grad_norm": 0.1708984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028063.0655942578, "perf/iters_per_sec": 0.9670558288546838, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340664625167846, "data/tokens_consumed": 78288781312, "data/tokens_consumed_B": 78.288781312, "train/loss_slope": 1.2964343974585235e-05} {"step": 37340, "timestamp": 1778234896.131711, "train/loss": 2.2486201524734497, "train/z_loss": 0.0013973781024105847, "train/perplexity": 9.474653235605059, "train/grad_norm": 0.111328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027636.3782445125, "perf/iters_per_sec": 0.96685236847139, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342840671539306, "data/tokens_consumed": 78309752832, "data/tokens_consumed_B": 78.309752832, "train/loss_slope": 1.628318345836425e-05} {"step": 37350, "timestamp": 1778234906.495693, "grad/layer_0/attn": 0.00294479844160378, "grad/layer_0/mlp": 0.002790592610836029, "grad/layer_0/attn_mlp_ratio": 1.0552591319288411, "grad/layer_4/attn": 0.0019386028870940208, "grad/layer_4/mlp": 0.0026404005475342274, "grad/layer_4/attn_mlp_ratio": 0.7342078517153866, "grad/layer_8/attn": 0.007093885447829962, "grad/layer_8/mlp": 0.0039597563445568085, "grad/layer_8/attn_mlp_ratio": 1.791495398051922, "grad/layer_12/attn": 0.00456895399838686, "grad/layer_12/mlp": 0.006860788445919752, "grad/layer_12/attn_mlp_ratio": 0.6659517295725589, "grad/layer_16/attn": 0.0039199586026370525, "grad/layer_16/mlp": 0.00467159366235137, "grad/layer_16/attn_mlp_ratio": 0.8391051966522034, "grad/layer_20/attn": 0.0044981068931519985, "grad/layer_20/mlp": 0.007047509774565697, "grad/layer_20/attn_mlp_ratio": 0.63825478406005, "grad/layer_24/attn": 0.02639533393085003, "grad/layer_24/mlp": 0.01492504682391882, "grad/layer_24/attn_mlp_ratio": 1.7685260264441096, "grad/layer_27/attn": 0.013680645264685154, "grad/layer_27/mlp": 0.015013691037893295, "grad/layer_27/attn_mlp_ratio": 0.9112113163268928} {"step": 37350, "timestamp": 1778234907.0986063, "eos/sharpness": 77.54414081573485, "eos/L0_probe": 2.0019168853759766, "eos/L_plus": 2.4895877838134766, "eos/L_minus": 2.289687395095825, "eos/grad_norm": 0.3002174198627472, "eos/embed_grad_frac": 0.02333642914891243, "eos/time_s": 0.6002118587493896} {"step": 37350, "timestamp": 1778234907.1193004, "train/loss": 2.209468054771423, "train/z_loss": 0.0014142293366603553, "train/perplexity": 9.110868620688967, "train/grad_norm": 0.298828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1909942.1059173294, "perf/iters_per_sec": 0.9107313661181113, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0980186223983766, "data/tokens_consumed": 78330724352, "data/tokens_consumed_B": 78.330724352, "train/loss_slope": 1.796332736148365e-05} {"step": 37350, "timestamp": 1778234908.4786038, "geo/rankme_last": 438.92022705078125, "geo/layer_0/stable_rank_q_proj": 19.102684020996094, "geo/layer_0/stable_rank_k_proj": 16.242231369018555, "geo/layer_0/stable_rank_o_proj": 49.57170104980469, "geo/layer_0/stable_rank_gate_proj": 138.83828735351562, "geo/layer_0/stable_rank_down_proj": 52.98728942871094, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05475423112511635, "geo/layer_0/attn_entropy_mean": 6.203105926513672, "geo/layer_0/attn_entropy_std": 0.3708474338054657, "geo/layer_7/stable_rank_q_proj": 43.04469299316406, "geo/layer_7/stable_rank_k_proj": 41.86372375488281, "geo/layer_7/stable_rank_o_proj": 99.97445678710938, "geo/layer_7/stable_rank_gate_proj": 91.9413070678711, "geo/layer_7/stable_rank_down_proj": 144.82391357421875, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5111547112464905, "geo/layer_7/attn_entropy_mean": 4.649727821350098, "geo/layer_7/attn_entropy_std": 0.7991390228271484, "geo/layer_14/stable_rank_q_proj": 54.29086685180664, "geo/layer_14/stable_rank_k_proj": 36.74140548706055, "geo/layer_14/stable_rank_o_proj": 49.38452911376953, "geo/layer_14/stable_rank_gate_proj": 77.50174713134766, "geo/layer_14/stable_rank_down_proj": 133.03884887695312, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3879201412200928, "geo/layer_14/attn_entropy_mean": 5.526527404785156, "geo/layer_14/attn_entropy_std": 0.36419156193733215, "geo/layer_21/stable_rank_q_proj": 43.85189437866211, "geo/layer_21/stable_rank_k_proj": 31.15266990661621, "geo/layer_21/stable_rank_o_proj": 77.13761901855469, "geo/layer_21/stable_rank_gate_proj": 73.80863189697266, "geo/layer_21/stable_rank_down_proj": 56.34200668334961, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1511484831571579, "geo/layer_21/attn_entropy_mean": 5.715569496154785, "geo/layer_21/attn_entropy_std": 0.27715468406677246, "geo/layer_27/stable_rank_q_proj": 42.56439971923828, "geo/layer_27/stable_rank_k_proj": 31.379188537597656, "geo/layer_27/stable_rank_o_proj": 117.37799835205078, "geo/layer_27/stable_rank_gate_proj": 85.49942779541016, "geo/layer_27/stable_rank_down_proj": 132.45156860351562, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08257424831390381, "geo/layer_27/attn_entropy_mean": 4.2709174156188965, "geo/layer_27/attn_entropy_std": 0.6373962163925171, "attnres/final_alpha/block_0": 0.2397049069404602, "attnres/block_norm/0": 1.7124695777893066, "attnres/final_alpha/block_1": 0.005254390649497509, "attnres/block_norm/1": 39222.765625, "attnres/final_alpha/block_2": 0.011331839486956596, "attnres/block_norm/2": 25848.490234375, "attnres/final_alpha/block_3": 0.013307040557265282, "attnres/block_norm/3": 45373.7890625, "attnres/final_alpha/block_4": 0.01637139357626438, "attnres/block_norm/4": 12482.037109375, "attnres/final_alpha/block_5": 0.5981922149658203, "attnres/block_norm/5": 5879.12939453125, "attnres/final_alpha/block_6": 0.1158381849527359, "attnres/block_norm/6": 29771.71484375, "geo/tier1_time_s": 1.3555676937103271, "geo/step": 37350.0, "geo/rankme_slope": -4.4421459990246096e-05} {"step": 37360, "timestamp": 1778234919.369883, "train/loss": 2.150469517707825, "train/z_loss": 0.0014324380550533532, "train/perplexity": 8.588890086615265, "train/grad_norm": 0.146484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1712396.6790383877, "perf/iters_per_sec": 0.8165343661491336, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2246881961822509, "data/tokens_consumed": 78351695872, "data/tokens_consumed_B": 78.351695872, "train/loss_slope": 1.5900345546792182e-05} {"step": 37370, "timestamp": 1778234929.7225072, "train/loss": 2.177476239204407, "train/z_loss": 0.0014160747407004238, "train/perplexity": 8.824008446806328, "train/grad_norm": 0.1484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027339.434664999, "perf/iters_per_sec": 0.9667107747387881, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344355583190918, "data/tokens_consumed": 78372667392, "data/tokens_consumed_B": 78.372667392, "train/loss_slope": 1.4326581831919719e-05} {"step": 37380, "timestamp": 1778234940.0804558, "train/loss": 2.1897481441497804, "train/z_loss": 0.0014307077741250395, "train/perplexity": 8.932963012365464, "train/grad_norm": 0.1962890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025895.8205856523, "perf/iters_per_sec": 0.9660224059036504, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351726770401002, "data/tokens_consumed": 78393638912, "data/tokens_consumed_B": 78.393638912, "train/loss_slope": 1.2714585154899867e-05} {"step": 37390, "timestamp": 1778234950.4318473, "train/loss": 2.107885420322418, "train/z_loss": 0.0014151809271425008, "train/perplexity": 8.230818149026847, "train/grad_norm": 0.171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027064.7668707743, "perf/iters_per_sec": 0.9665798029283401, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345757246017455, "data/tokens_consumed": 78414610432, "data/tokens_consumed_B": 78.414610432, "train/loss_slope": 8.911893234001082e-06} {"step": 37400, "timestamp": 1778234960.7701607, "grad/layer_0/attn": 0.002598751336336136, "grad/layer_0/mlp": 0.0027668331749737263, "grad/layer_0/attn_mlp_ratio": 0.9392511503465324, "grad/layer_4/attn": 0.0017382603837177157, "grad/layer_4/mlp": 0.00246681016869843, "grad/layer_4/attn_mlp_ratio": 0.7046591323924069, "grad/layer_8/attn": 0.005344347562640905, "grad/layer_8/mlp": 0.0035111841280013323, "grad/layer_8/attn_mlp_ratio": 1.5220926091033005, "grad/layer_12/attn": 0.0043837567791342735, "grad/layer_12/mlp": 0.00616113469004631, "grad/layer_12/attn_mlp_ratio": 0.711517752576441, "grad/layer_16/attn": 0.00341129582375288, "grad/layer_16/mlp": 0.004510446451604366, "grad/layer_16/attn_mlp_ratio": 0.7563099982948434, "grad/layer_20/attn": 0.0036260338965803385, "grad/layer_20/mlp": 0.0060048229061067104, "grad/layer_20/attn_mlp_ratio": 0.6038535845091154, "grad/layer_24/attn": 0.009791433811187744, "grad/layer_24/mlp": 0.011045322753489017, "grad/layer_24/attn_mlp_ratio": 0.8864778278613022, "grad/layer_27/attn": 0.005068095866590738, "grad/layer_27/mlp": 0.01069946214556694, "grad/layer_27/attn_mlp_ratio": 0.4736776251246253} {"step": 37400, "timestamp": 1778234960.7859051, "train/loss": 2.2275107383728026, "train/z_loss": 0.0014202536316588522, "train/perplexity": 9.276745068775185, "train/grad_norm": 0.1416015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026303.4307096675, "perf/iters_per_sec": 0.9662167695568407, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349644422531128, "data/tokens_consumed": 78435581952, "data/tokens_consumed_B": 78.435581952, "train/loss_slope": 1.2021714790497547e-05} {"step": 37410, "timestamp": 1778234971.1391823, "train/loss": 2.1207982778549193, "train/z_loss": 0.0014510698849335313, "train/perplexity": 8.337790705455292, "train/grad_norm": 0.1171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026832.3922291202, "perf/iters_per_sec": 0.9664689980645753, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346943378448485, "data/tokens_consumed": 78456553472, "data/tokens_consumed_B": 78.456553472, "train/loss_slope": 5.977759431369098e-06} {"step": 37420, "timestamp": 1778234981.519856, "train/loss": 2.1261045932769775, "train/z_loss": 0.0014304425218142568, "train/perplexity": 8.382151244278992, "train/grad_norm": 0.142578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021344.9615529133, "perf/iters_per_sec": 0.9638523872150961, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0375032663345336, "data/tokens_consumed": 78477524992, "data/tokens_consumed_B": 78.477524992, "train/loss_slope": 3.3986459864247384e-06} {"step": 37425, "timestamp": 1778234987.307325, "eos/sharpness": 56.90331459045409, "eos/L0_probe": 2.0004117488861084, "eos/L_plus": 2.2581050395965576, "eos/L_minus": 2.3117516040802, "eos/grad_norm": 0.1760619878768921, "eos/embed_grad_frac": 0.1143963411450386, "eos/time_s": 0.605388879776001} {"step": 37425, "timestamp": 1778234988.6836512, "geo/rankme_last": 439.7334899902344, "geo/layer_0/stable_rank_q_proj": 19.097707748413086, "geo/layer_0/stable_rank_k_proj": 16.248985290527344, "geo/layer_0/stable_rank_o_proj": 49.57652282714844, "geo/layer_0/stable_rank_gate_proj": 139.37271118164062, "geo/layer_0/stable_rank_down_proj": 52.99047088623047, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06483550369739532, "geo/layer_0/attn_entropy_mean": 6.203449249267578, "geo/layer_0/attn_entropy_std": 0.3695485293865204, "geo/layer_7/stable_rank_q_proj": 43.13743591308594, "geo/layer_7/stable_rank_k_proj": 42.01636505126953, "geo/layer_7/stable_rank_o_proj": 99.85144805908203, "geo/layer_7/stable_rank_gate_proj": 91.87299346923828, "geo/layer_7/stable_rank_down_proj": 144.8527069091797, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5144954323768616, "geo/layer_7/attn_entropy_mean": 4.644800186157227, "geo/layer_7/attn_entropy_std": 0.821744978427887, "geo/layer_14/stable_rank_q_proj": 54.32857894897461, "geo/layer_14/stable_rank_k_proj": 36.74872589111328, "geo/layer_14/stable_rank_o_proj": 49.33780288696289, "geo/layer_14/stable_rank_gate_proj": 77.4444580078125, "geo/layer_14/stable_rank_down_proj": 132.924072265625, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37600815296173096, "geo/layer_14/attn_entropy_mean": 5.52117395401001, "geo/layer_14/attn_entropy_std": 0.38624364137649536, "geo/layer_21/stable_rank_q_proj": 43.892032623291016, "geo/layer_21/stable_rank_k_proj": 31.13421630859375, "geo/layer_21/stable_rank_o_proj": 77.12036895751953, "geo/layer_21/stable_rank_gate_proj": 73.72296905517578, "geo/layer_21/stable_rank_down_proj": 56.265167236328125, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14575043320655823, "geo/layer_21/attn_entropy_mean": 5.713237762451172, "geo/layer_21/attn_entropy_std": 0.2846381664276123, "geo/layer_27/stable_rank_q_proj": 42.549434661865234, "geo/layer_27/stable_rank_k_proj": 31.39519500732422, "geo/layer_27/stable_rank_o_proj": 117.06663513183594, "geo/layer_27/stable_rank_gate_proj": 85.37080383300781, "geo/layer_27/stable_rank_down_proj": 132.48736572265625, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08463414013385773, "geo/layer_27/attn_entropy_mean": 4.289781093597412, "geo/layer_27/attn_entropy_std": 0.6458303332328796, "attnres/final_alpha/block_0": 0.24315780401229858, "attnres/block_norm/0": 1.7127951383590698, "attnres/final_alpha/block_1": 0.005430556833744049, "attnres/block_norm/1": 39069.234375, "attnres/final_alpha/block_2": 0.011570591479539871, "attnres/block_norm/2": 25943.087890625, "attnres/final_alpha/block_3": 0.013598271645605564, "attnres/block_norm/3": 45227.1328125, "attnres/final_alpha/block_4": 0.016922544687986374, "attnres/block_norm/4": 12496.53125, "attnres/final_alpha/block_5": 0.5880815982818604, "attnres/block_norm/5": 5948.8984375, "attnres/final_alpha/block_6": 0.12123863399028778, "attnres/block_norm/6": 29716.75, "geo/tier1_time_s": 1.3569509983062744, "geo/step": 37425.0, "geo/rankme_slope": -4.4220051301770707e-05} {"step": 37430, "timestamp": 1778234993.8758886, "train/loss": 2.2253352642059325, "train/z_loss": 0.0014220676850527525, "train/perplexity": 9.256585685584772, "train/grad_norm": 0.1884765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1697940.0826924366, "perf/iters_per_sec": 0.8096409238302406, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2351154327392577, "data/tokens_consumed": 78498496512, "data/tokens_consumed_B": 78.498496512, "train/loss_slope": 4.933671896928989e-06} {"step": 37440, "timestamp": 1778235004.256666, "train/loss": 2.123097777366638, "train/z_loss": 0.0014339760295115412, "train/perplexity": 8.356985511877271, "train/grad_norm": 0.1611328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021348.5847039626, "perf/iters_per_sec": 0.9638541148681462, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0375014066696167, "data/tokens_consumed": 78519468032, "data/tokens_consumed_B": 78.519468032, "train/loss_slope": -9.147054684831777e-07} {"step": 37450, "timestamp": 1778235014.6237857, "grad/layer_0/attn": 0.0030969269573688507, "grad/layer_0/mlp": 0.002849825192242861, "grad/layer_0/attn_mlp_ratio": 1.086707653903765, "grad/layer_4/attn": 0.0023396878968924284, "grad/layer_4/mlp": 0.0025069231633096933, "grad/layer_4/attn_mlp_ratio": 0.9332905921514019, "grad/layer_8/attn": 0.004481917712837458, "grad/layer_8/mlp": 0.003605096135288477, "grad/layer_8/attn_mlp_ratio": 1.24321721816084, "grad/layer_12/attn": 0.004163085483014584, "grad/layer_12/mlp": 0.006494299042969942, "grad/layer_12/attn_mlp_ratio": 0.6410369142790582, "grad/layer_16/attn": 0.0036845095455646515, "grad/layer_16/mlp": 0.00446332385763526, "grad/layer_16/attn_mlp_ratio": 0.8255079802714486, "grad/layer_20/attn": 0.005736180115491152, "grad/layer_20/mlp": 0.005578441079705954, "grad/layer_20/attn_mlp_ratio": 1.0282765257719386, "grad/layer_24/attn": 0.007229712791740894, "grad/layer_24/mlp": 0.008395462296903133, "grad/layer_24/attn_mlp_ratio": 0.8611452770496294, "grad/layer_27/attn": 0.006438367068767548, "grad/layer_27/mlp": 0.007900632917881012, "grad/layer_27/attn_mlp_ratio": 0.8149178748330661} {"step": 37450, "timestamp": 1778235014.6394072, "train/loss": 2.175767517089844, "train/z_loss": 0.0014169581583701075, "train/perplexity": 8.808943542966807, "train/grad_norm": 0.13671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020794.8114110199, "perf/iters_per_sec": 0.9635900551848506, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0377857208251953, "data/tokens_consumed": 78540439552, "data/tokens_consumed_B": 78.540439552, "train/loss_slope": -1.568803549749395e-08} {"step": 37460, "timestamp": 1778235025.019434, "train/loss": 2.1522390127182005, "train/z_loss": 0.0014325045282021164, "train/perplexity": 8.604101539093762, "train/grad_norm": 0.2138671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021376.501982909, "perf/iters_per_sec": 0.9638674268641038, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0374870777130127, "data/tokens_consumed": 78561411072, "data/tokens_consumed_B": 78.561411072, "train/loss_slope": -1.309277594286955e-07} {"step": 37470, "timestamp": 1778235035.4021244, "train/loss": 2.1248146653175355, "train/z_loss": 0.0014363989233970643, "train/perplexity": 8.371345843621206, "train/grad_norm": 0.263671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020891.798495876, "perf/iters_per_sec": 0.9636363022307758, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037735915184021, "data/tokens_consumed": 78582382592, "data/tokens_consumed_B": 78.582382592, "train/loss_slope": -4.810435314848076e-06} {"step": 37480, "timestamp": 1778235045.7798674, "train/loss": 2.204417371749878, "train/z_loss": 0.001419925014488399, "train/perplexity": 9.064968522232522, "train/grad_norm": 0.10986328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021792.6119220236, "perf/iters_per_sec": 0.9640658435449713, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0372735500335692, "data/tokens_consumed": 78603354112, "data/tokens_consumed_B": 78.603354112, "train/loss_slope": -4.243618570002205e-06} {"step": 37490, "timestamp": 1778235056.159472, "train/loss": 2.13432981967926, "train/z_loss": 0.0014271851163357497, "train/perplexity": 8.451380659502838, "train/grad_norm": 0.10498046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021323.6409656107, "perf/iters_per_sec": 0.9638422207668356, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0375142097473145, "data/tokens_consumed": 78624325632, "data/tokens_consumed_B": 78.624325632, "train/loss_slope": -3.664310923432491e-06} {"step": 37500, "timestamp": 1778235066.5302792, "grad/layer_0/attn": 0.002965199062600732, "grad/layer_0/mlp": 0.0029867880512028933, "grad/layer_0/attn_mlp_ratio": 0.9927718045240448, "grad/layer_4/attn": 0.002479043323546648, "grad/layer_4/mlp": 0.0026648258790373802, "grad/layer_4/attn_mlp_ratio": 0.9302833817471853, "grad/layer_8/attn": 0.010354368016123772, "grad/layer_8/mlp": 0.003842871403321624, "grad/layer_8/attn_mlp_ratio": 2.6944351397578274, "grad/layer_12/attn": 0.0042741065844893456, "grad/layer_12/mlp": 0.006974807940423489, "grad/layer_12/attn_mlp_ratio": 0.6127920022627369, "grad/layer_16/attn": 0.005204949993640184, "grad/layer_16/mlp": 0.004333198070526123, "grad/layer_16/attn_mlp_ratio": 1.2011797727238993, "grad/layer_20/attn": 0.007001953199505806, "grad/layer_20/mlp": 0.006089452188462019, "grad/layer_20/attn_mlp_ratio": 1.149849422873835, "grad/layer_24/attn": 0.012365284375846386, "grad/layer_24/mlp": 0.011859056539833546, "grad/layer_24/attn_mlp_ratio": 1.0426870156191401, "grad/layer_27/attn": 0.016038425266742706, "grad/layer_27/mlp": 0.01034162100404501, "grad/layer_27/attn_mlp_ratio": 1.5508618141569175} {"step": 37500, "timestamp": 1778235067.1381836, "eos/sharpness": 75.39505958557127, "eos/L0_probe": 1.9998672008514404, "eos/L_plus": 2.2776753902435303, "eos/L_minus": 2.4760096073150635, "eos/grad_norm": 0.21882204711437225, "eos/embed_grad_frac": 0.04840918630361557, "eos/time_s": 0.6051254272460938} {"step": 37500, "timestamp": 1778235067.158212, "train/loss": 2.2181572914123535, "train/z_loss": 0.0014000458177179098, "train/perplexity": 9.190380060719985, "train/grad_norm": 0.21875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1907434.4642847192, "perf/iters_per_sec": 0.9095356294082256, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0994621515274048, "data/tokens_consumed": 78645297152, "data/tokens_consumed_B": 78.645297152, "train/loss_slope": -2.039880191746962e-06} {"step": 37500, "timestamp": 1778235068.524089, "geo/rankme_last": 439.3476257324219, "geo/layer_0/stable_rank_q_proj": 19.09693145751953, "geo/layer_0/stable_rank_k_proj": 16.283517837524414, "geo/layer_0/stable_rank_o_proj": 49.53899002075195, "geo/layer_0/stable_rank_gate_proj": 138.9883270263672, "geo/layer_0/stable_rank_down_proj": 52.98772430419922, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0584145188331604, "geo/layer_0/attn_entropy_mean": 6.200481414794922, "geo/layer_0/attn_entropy_std": 0.36744698882102966, "geo/layer_7/stable_rank_q_proj": 43.11488723754883, "geo/layer_7/stable_rank_k_proj": 41.95299530029297, "geo/layer_7/stable_rank_o_proj": 99.88512420654297, "geo/layer_7/stable_rank_gate_proj": 91.68190002441406, "geo/layer_7/stable_rank_down_proj": 144.85488891601562, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.49963080883026123, "geo/layer_7/attn_entropy_mean": 4.6726531982421875, "geo/layer_7/attn_entropy_std": 0.8161014914512634, "geo/layer_14/stable_rank_q_proj": 54.334495544433594, "geo/layer_14/stable_rank_k_proj": 36.76844024658203, "geo/layer_14/stable_rank_o_proj": 49.197425842285156, "geo/layer_14/stable_rank_gate_proj": 77.42200469970703, "geo/layer_14/stable_rank_down_proj": 132.6915283203125, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38416367769241333, "geo/layer_14/attn_entropy_mean": 5.489051342010498, "geo/layer_14/attn_entropy_std": 0.38542836904525757, "geo/layer_21/stable_rank_q_proj": 43.92020034790039, "geo/layer_21/stable_rank_k_proj": 31.112232208251953, "geo/layer_21/stable_rank_o_proj": 77.23507690429688, "geo/layer_21/stable_rank_gate_proj": 73.65359497070312, "geo/layer_21/stable_rank_down_proj": 56.32640838623047, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1442463994026184, "geo/layer_21/attn_entropy_mean": 5.72680139541626, "geo/layer_21/attn_entropy_std": 0.2931358814239502, "geo/layer_27/stable_rank_q_proj": 42.490745544433594, "geo/layer_27/stable_rank_k_proj": 31.399089813232422, "geo/layer_27/stable_rank_o_proj": 117.15298461914062, "geo/layer_27/stable_rank_gate_proj": 85.38552856445312, "geo/layer_27/stable_rank_down_proj": 132.8829803466797, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08426076173782349, "geo/layer_27/attn_entropy_mean": 4.296160697937012, "geo/layer_27/attn_entropy_std": 0.6667993068695068, "attnres/final_alpha/block_0": 0.24437585473060608, "attnres/block_norm/0": 1.712931513786316, "attnres/final_alpha/block_1": 0.005372071638703346, "attnres/block_norm/1": 39275.1875, "attnres/final_alpha/block_2": 0.011625485494732857, "attnres/block_norm/2": 25867.701171875, "attnres/final_alpha/block_3": 0.013722049072384834, "attnres/block_norm/3": 45332.4765625, "attnres/final_alpha/block_4": 0.016899418085813522, "attnres/block_norm/4": 12515.7421875, "attnres/final_alpha/block_5": 0.5893121361732483, "attnres/block_norm/5": 5912.19287109375, "attnres/final_alpha/block_6": 0.1186930388212204, "attnres/block_norm/6": 29722.37890625, "geo/tier1_time_s": 1.3621995449066162, "geo/step": 37500.0, "geo/rankme_slope": -3.502567042441977e-05} {"step": 37500, "timestamp": 1778235075.4055383, "geo/ww_alpha_mean": 7.567602524687869, "geo/ww_alpha_std": 4.085535980434726, "geo/ww_alpha_min": 1.358979875218646, "geo/ww_alpha_max": 23.74970439676369, "geo/ww_alpha_healthy_frac": 0.16751269035532995, "geo/ww_alpha_by_type/q_proj": 4.050533347485049, "geo/ww_alpha_by_type/k_proj": 4.524353132425723, "geo/ww_alpha_by_type/v_proj": 8.299250466326422, "geo/ww_alpha_by_type/o_proj": 7.99731141224552, "geo/ww_alpha_by_type/gate_proj": 8.081966006373586, "geo/ww_alpha_by_type/up_proj": 11.233304871440717, "geo/ww_alpha_by_type/down_proj": 8.893156028389553, "geo/twonn_id/layer_0": 0.7392644882202148, "geo/twonn_id/layer_7": 3.2655365467071533, "geo/twonn_id/layer_14": 4.561884880065918, "geo/twonn_id/layer_21": 7.0673298835754395, "geo/twonn_id/layer_27": 5.381228923797607, "geo/tier2_time_s": 6.873910427093506} {"step": 37500, "timestamp": 1778235076.0069323, "eoc/jacobian_sigma/layer_0/attn": 1021.3309326171875, "eoc/jacobian_sigma/layer_0/mlp": 8545.60546875, "eoc/jacobian_sigma/layer_0": 8545.60546875, "eoc/jacobian_sigma/layer_7/attn": 1.1387311220169067, "eoc/jacobian_sigma/layer_7/mlp": 1.674783706665039, "eoc/jacobian_sigma/layer_7": 1.674783706665039, "eoc/jacobian_sigma/layer_14/attn": 1.6169134378433228, "eoc/jacobian_sigma/layer_14/mlp": 6.839232444763184, "eoc/jacobian_sigma/layer_14": 6.839232444763184, "eoc/jacobian_sigma/layer_21/attn": 1.0999314785003662, "eoc/jacobian_sigma/layer_21/mlp": 4.108652114868164, "eoc/jacobian_sigma/layer_21": 4.108652114868164, "eoc/jacobian_sigma/layer_27/attn": 3.875551700592041, "eoc/jacobian_sigma/layer_27/mlp": 25.816682815551758, "eoc/jacobian_sigma/layer_27": 25.816682815551758, "eoc/layer0_sigma": 8545.60546875, "eoc/sigma_max": 25.816682815551758, "eoc/sigma_min": 1.674783706665039, "eoc/sigma_mean": 9.609837770462036, "eoc/time_s": 0.594428300857544} {"step": 37510, "timestamp": 1778235086.4146588, "train/loss": 2.162368559837341, "train/z_loss": 0.001424037991091609, "train/perplexity": 8.691700108949464, "train/grad_norm": 0.142578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1089263.5908291712, "perf/iters_per_sec": 0.5194013551851135, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.9252933979034423, "data/tokens_consumed": 78666268672, "data/tokens_consumed_B": 78.666268672, "train/loss_slope": -2.46593939303072e-06} {"step": 37520, "timestamp": 1778235096.7931664, "train/loss": 2.183234679698944, "train/z_loss": 0.0014204650768078863, "train/perplexity": 8.8749675560543, "train/grad_norm": 0.091796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021816.730735314, "perf/iters_per_sec": 0.9640773442913599, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037261176109314, "data/tokens_consumed": 78687240192, "data/tokens_consumed_B": 78.687240192, "train/loss_slope": 3.372852343751003e-07} {"step": 37530, "timestamp": 1778235107.1755123, "train/loss": 2.195059561729431, "train/z_loss": 0.0014279216062277555, "train/perplexity": 8.980535937141648, "train/grad_norm": 0.2216796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021492.4528777304, "perf/iters_per_sec": 0.9639227165592815, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0374275684356689, "data/tokens_consumed": 78708211712, "data/tokens_consumed_B": 78.708211712, "train/loss_slope": 2.0927068102002916e-07} {"step": 37540, "timestamp": 1778235117.5657258, "train/loss": 2.170652687549591, "train/z_loss": 0.0014305360498838126, "train/perplexity": 8.764002329614986, "train/grad_norm": 0.09033203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019431.7564196174, "perf/iters_per_sec": 0.9629400999162757, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0384861946105957, "data/tokens_consumed": 78729183232, "data/tokens_consumed_B": 78.729183232, "train/loss_slope": -4.015320598489098e-06} {"step": 37550, "timestamp": 1778235127.9320066, "grad/layer_0/attn": 0.003891012165695429, "grad/layer_0/mlp": 0.003215854289010167, "grad/layer_0/attn_mlp_ratio": 1.2099466253797249, "grad/layer_4/attn": 0.0017859297804534435, "grad/layer_4/mlp": 0.0025706076994538307, "grad/layer_4/attn_mlp_ratio": 0.6947500045836998, "grad/layer_8/attn": 0.005765282083302736, "grad/layer_8/mlp": 0.0036266513634473085, "grad/layer_8/attn_mlp_ratio": 1.5896984150284337, "grad/layer_12/attn": 0.005160157103091478, "grad/layer_12/mlp": 0.006414089351892471, "grad/layer_12/attn_mlp_ratio": 0.8045034516269459, "grad/layer_16/attn": 0.003587705548852682, "grad/layer_16/mlp": 0.004510107450187206, "grad/layer_16/attn_mlp_ratio": 0.7954811518194868, "grad/layer_20/attn": 0.004630546551197767, "grad/layer_20/mlp": 0.005729855503886938, "grad/layer_20/attn_mlp_ratio": 0.8081436726008528, "grad/layer_24/attn": 0.00812302902340889, "grad/layer_24/mlp": 0.008976682089269161, "grad/layer_24/attn_mlp_ratio": 0.9049032651639668, "grad/layer_27/attn": 0.0051583596505224705, "grad/layer_27/mlp": 0.007891636341810226, "grad/layer_27/attn_mlp_ratio": 0.6536489216854012} {"step": 37550, "timestamp": 1778235127.9476292, "train/loss": 2.1797630310058596, "train/z_loss": 0.0014187159133143723, "train/perplexity": 8.844210206784362, "train/grad_norm": 0.130859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021029.4256355313, "perf/iters_per_sec": 0.9637019279649407, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0376652479171753, "data/tokens_consumed": 78750154752, "data/tokens_consumed_B": 78.750154752, "train/loss_slope": -3.2327999769658756e-06} {"step": 37560, "timestamp": 1778235138.3282495, "train/loss": 2.1876731872558595, "train/z_loss": 0.0014251721673645078, "train/perplexity": 8.914446716081537, "train/grad_norm": 0.1875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021658.5980161938, "perf/iters_per_sec": 0.9640019407349557, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373423099517822, "data/tokens_consumed": 78771126272, "data/tokens_consumed_B": 78.771126272, "train/loss_slope": -3.3529157435405985e-07} {"step": 37570, "timestamp": 1778235148.7043092, "train/loss": 2.184455156326294, "train/z_loss": 0.00141651201993227, "train/perplexity": 8.885805859127382, "train/grad_norm": 0.173828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022198.200503567, "perf/iters_per_sec": 0.9642592432515941, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0370655059814453, "data/tokens_consumed": 78792097792, "data/tokens_consumed_B": 78.792097792, "train/loss_slope": -9.259426232541173e-07} {"step": 37575, "timestamp": 1778235154.498093, "eos/sharpness": 63.73252868652342, "eos/L0_probe": 1.9983998537063599, "eos/L_plus": 2.409726619720459, "eos/L_minus": 2.224398374557495, "eos/grad_norm": 0.15604880452156067, "eos/embed_grad_frac": 0.08955730497837067, "eos/time_s": 0.6177372932434082} {"step": 37575, "timestamp": 1778235155.8750787, "geo/rankme_last": 439.1417236328125, "geo/layer_0/stable_rank_q_proj": 19.131412506103516, "geo/layer_0/stable_rank_k_proj": 16.30204963684082, "geo/layer_0/stable_rank_o_proj": 49.57600784301758, "geo/layer_0/stable_rank_gate_proj": 138.67880249023438, "geo/layer_0/stable_rank_down_proj": 53.00916290283203, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05911803990602493, "geo/layer_0/attn_entropy_mean": 6.200809478759766, "geo/layer_0/attn_entropy_std": 0.370413601398468, "geo/layer_7/stable_rank_q_proj": 43.05371856689453, "geo/layer_7/stable_rank_k_proj": 41.925228118896484, "geo/layer_7/stable_rank_o_proj": 99.83186340332031, "geo/layer_7/stable_rank_gate_proj": 91.786865234375, "geo/layer_7/stable_rank_down_proj": 145.02252197265625, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.503731369972229, "geo/layer_7/attn_entropy_mean": 4.665349960327148, "geo/layer_7/attn_entropy_std": 0.821760892868042, "geo/layer_14/stable_rank_q_proj": 54.38691329956055, "geo/layer_14/stable_rank_k_proj": 36.80046081542969, "geo/layer_14/stable_rank_o_proj": 49.19216537475586, "geo/layer_14/stable_rank_gate_proj": 77.37498474121094, "geo/layer_14/stable_rank_down_proj": 132.69961547851562, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38030004501342773, "geo/layer_14/attn_entropy_mean": 5.541533470153809, "geo/layer_14/attn_entropy_std": 0.3713122606277466, "geo/layer_21/stable_rank_q_proj": 43.86285400390625, "geo/layer_21/stable_rank_k_proj": 31.149702072143555, "geo/layer_21/stable_rank_o_proj": 77.23640441894531, "geo/layer_21/stable_rank_gate_proj": 73.70565032958984, "geo/layer_21/stable_rank_down_proj": 56.24680709838867, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14936576783657074, "geo/layer_21/attn_entropy_mean": 5.725461006164551, "geo/layer_21/attn_entropy_std": 0.27505648136138916, "geo/layer_27/stable_rank_q_proj": 42.50520706176758, "geo/layer_27/stable_rank_k_proj": 31.29643440246582, "geo/layer_27/stable_rank_o_proj": 117.00965118408203, "geo/layer_27/stable_rank_gate_proj": 85.5324478149414, "geo/layer_27/stable_rank_down_proj": 132.89076232910156, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0906674712896347, "geo/layer_27/attn_entropy_mean": 4.283475875854492, "geo/layer_27/attn_entropy_std": 0.6573973298072815, "attnres/final_alpha/block_0": 0.24090439081192017, "attnres/block_norm/0": 1.7132164239883423, "attnres/final_alpha/block_1": 0.005265214946120977, "attnres/block_norm/1": 39312.2421875, "attnres/final_alpha/block_2": 0.0112131517380476, "attnres/block_norm/2": 25904.314453125, "attnres/final_alpha/block_3": 0.013305777683854103, "attnres/block_norm/3": 45510.5703125, "attnres/final_alpha/block_4": 0.016504526138305664, "attnres/block_norm/4": 12538.005859375, "attnres/final_alpha/block_5": 0.5955913662910461, "attnres/block_norm/5": 5884.271484375, "attnres/final_alpha/block_6": 0.11721555888652802, "attnres/block_norm/6": 30008.591796875, "geo/tier1_time_s": 1.3560900688171387, "geo/step": 37575.0, "geo/rankme_slope": -7.491078462635054e-05} {"step": 37580, "timestamp": 1778235161.0649192, "train/loss": 2.23096649646759, "train/z_loss": 0.001415533816907555, "train/perplexity": 9.308858712172318, "train/grad_norm": 0.2216796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1697388.4138040217, "perf/iters_per_sec": 0.8093778676052197, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.235516858100891, "data/tokens_consumed": 78813069312, "data/tokens_consumed_B": 78.813069312, "train/loss_slope": 9.689250508835909e-07} {"step": 37590, "timestamp": 1778235171.4505713, "train/loss": 2.18471999168396, "train/z_loss": 0.0014421977568417788, "train/perplexity": 8.888159446343035, "train/grad_norm": 0.12060546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020312.2893424095, "perf/iters_per_sec": 0.9633599707328842, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0380335807800294, "data/tokens_consumed": 78834040832, "data/tokens_consumed_B": 78.834040832, "train/loss_slope": 2.817767280878385e-06} {"step": 37600, "timestamp": 1778235181.8125315, "grad/layer_0/attn": 0.002917723497375846, "grad/layer_0/mlp": 0.0029300672467797995, "grad/layer_0/attn_mlp_ratio": 0.9957871789474328, "grad/layer_4/attn": 0.00272191921249032, "grad/layer_4/mlp": 0.00261073000729084, "grad/layer_4/attn_mlp_ratio": 1.0425892760377522, "grad/layer_8/attn": 0.005848986096680164, "grad/layer_8/mlp": 0.003785382490605116, "grad/layer_8/attn_mlp_ratio": 1.5451505776976613, "grad/layer_12/attn": 0.004039875231683254, "grad/layer_12/mlp": 0.006252086255699396, "grad/layer_12/attn_mlp_ratio": 0.6461643364859329, "grad/layer_16/attn": 0.0038096997886896133, "grad/layer_16/mlp": 0.0044182054698467255, "grad/layer_16/attn_mlp_ratio": 0.8622730944639541, "grad/layer_20/attn": 0.003056492656469345, "grad/layer_20/mlp": 0.006290304474532604, "grad/layer_20/attn_mlp_ratio": 0.48590535168107585, "grad/layer_24/attn": 0.008287539705634117, "grad/layer_24/mlp": 0.010856752283871174, "grad/layer_24/attn_mlp_ratio": 0.7633534792546354, "grad/layer_27/attn": 0.004774631932377815, "grad/layer_27/mlp": 0.010233398526906967, "grad/layer_27/attn_mlp_ratio": 0.466573433367849} {"step": 37600, "timestamp": 1778235181.8284373, "train/loss": 2.1102200984954833, "train/z_loss": 0.0014424788998439908, "train/perplexity": 8.250056909924659, "train/grad_norm": 0.16796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021740.5655766316, "perf/iters_per_sec": 0.9640410259135397, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373002529144286, "data/tokens_consumed": 78855012352, "data/tokens_consumed_B": 78.855012352, "train/loss_slope": -5.6111249199795816e-06} {"step": 37610, "timestamp": 1778235192.2026064, "train/loss": 2.1491697072982787, "train/z_loss": 0.001430005009751767, "train/perplexity": 8.577733410227061, "train/grad_norm": 0.166015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022588.464650616, "perf/iters_per_sec": 0.9644453356984215, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0368654012680054, "data/tokens_consumed": 78875983872, "data/tokens_consumed_B": 78.875983872, "train/loss_slope": -7.810280425320455e-06} {"step": 37620, "timestamp": 1778235202.58689, "train/loss": 2.1546057939529417, "train/z_loss": 0.0014371375553309917, "train/perplexity": 8.624489682778965, "train/grad_norm": 0.08740234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020535.0952984856, "perf/iters_per_sec": 0.9634662128918102, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0379191160202026, "data/tokens_consumed": 78896955392, "data/tokens_consumed_B": 78.896955392, "train/loss_slope": -7.969716100981853e-06} {"step": 37630, "timestamp": 1778235212.9654799, "train/loss": 2.186859655380249, "train/z_loss": 0.0014183684950694441, "train/perplexity": 8.907197478667245, "train/grad_norm": 0.0888671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021754.6457074496, "perf/iters_per_sec": 0.9640477398431061, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037293028831482, "data/tokens_consumed": 78917926912, "data/tokens_consumed_B": 78.917926912, "train/loss_slope": -7.478923995037861e-06} {"step": 37640, "timestamp": 1778235223.3398435, "train/loss": 2.183535122871399, "train/z_loss": 0.0014298459980636835, "train/perplexity": 8.877634380056746, "train/grad_norm": 0.10986328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022460.2044369264, "perf/iters_per_sec": 0.9643841764626152, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0369311571121216, "data/tokens_consumed": 78938898432, "data/tokens_consumed_B": 78.938898432, "train/loss_slope": -8.861561004657096e-06} {"step": 37650, "timestamp": 1778235233.7292182, "grad/layer_0/attn": 0.002822493901476264, "grad/layer_0/mlp": 0.00278307287953794, "grad/layer_0/attn_mlp_ratio": 1.0141645304410483, "grad/layer_4/attn": 0.0020524172578006983, "grad/layer_4/mlp": 0.0025760666467249393, "grad/layer_4/attn_mlp_ratio": 0.7967251859487032, "grad/layer_8/attn": 0.009928117506206036, "grad/layer_8/mlp": 0.0038631651550531387, "grad/layer_8/attn_mlp_ratio": 2.5699437768601157, "grad/layer_12/attn": 0.004983475897461176, "grad/layer_12/mlp": 0.0071919639594852924, "grad/layer_12/attn_mlp_ratio": 0.692922803318045, "grad/layer_16/attn": 0.0065807620994746685, "grad/layer_16/mlp": 0.004972535651177168, "grad/layer_16/attn_mlp_ratio": 1.3234217768905492, "grad/layer_20/attn": 0.007601388264447451, "grad/layer_20/mlp": 0.007067549508064985, "grad/layer_20/attn_mlp_ratio": 1.07553376856008, "grad/layer_24/attn": 0.01967030204832554, "grad/layer_24/mlp": 0.011586206033825874, "grad/layer_24/attn_mlp_ratio": 1.6977345147431984, "grad/layer_27/attn": 0.013594037853181362, "grad/layer_27/mlp": 0.00900549441576004, "grad/layer_27/attn_mlp_ratio": 1.5095270814269173} {"step": 37650, "timestamp": 1778235234.3360574, "eos/sharpness": 61.382770538330064, "eos/L0_probe": 1.9997273683547974, "eos/L_plus": 2.3808810710906982, "eos/L_minus": 2.2324013710021973, "eos/grad_norm": 0.18563632667064667, "eos/embed_grad_frac": 0.06784925609827042, "eos/time_s": 0.6040728092193604} {"step": 37650, "timestamp": 1778235234.356896, "train/loss": 2.1353357672691344, "train/z_loss": 0.001431731937918812, "train/perplexity": 8.459886583047737, "train/grad_norm": 0.185546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1906240.3251271306, "perf/iters_per_sec": 0.908966219485822, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1001508951187133, "data/tokens_consumed": 78959869952, "data/tokens_consumed_B": 78.959869952, "train/loss_slope": -9.900786280810968e-06} {"step": 37650, "timestamp": 1778235235.7182534, "geo/rankme_last": 440.1532287597656, "geo/layer_0/stable_rank_q_proj": 19.146427154541016, "geo/layer_0/stable_rank_k_proj": 16.318449020385742, "geo/layer_0/stable_rank_o_proj": 49.65562438964844, "geo/layer_0/stable_rank_gate_proj": 138.3699188232422, "geo/layer_0/stable_rank_down_proj": 52.96026611328125, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06177671626210213, "geo/layer_0/attn_entropy_mean": 6.197182655334473, "geo/layer_0/attn_entropy_std": 0.37158554792404175, "geo/layer_7/stable_rank_q_proj": 43.14838409423828, "geo/layer_7/stable_rank_k_proj": 42.021480560302734, "geo/layer_7/stable_rank_o_proj": 99.71104431152344, "geo/layer_7/stable_rank_gate_proj": 91.87521362304688, "geo/layer_7/stable_rank_down_proj": 145.50958251953125, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5110873579978943, "geo/layer_7/attn_entropy_mean": 4.663205623626709, "geo/layer_7/attn_entropy_std": 0.80504310131073, "geo/layer_14/stable_rank_q_proj": 54.3635368347168, "geo/layer_14/stable_rank_k_proj": 36.886146545410156, "geo/layer_14/stable_rank_o_proj": 49.16546630859375, "geo/layer_14/stable_rank_gate_proj": 77.40276336669922, "geo/layer_14/stable_rank_down_proj": 132.98190307617188, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3813740611076355, "geo/layer_14/attn_entropy_mean": 5.49603271484375, "geo/layer_14/attn_entropy_std": 0.3734741806983948, "geo/layer_21/stable_rank_q_proj": 43.75137710571289, "geo/layer_21/stable_rank_k_proj": 31.04058265686035, "geo/layer_21/stable_rank_o_proj": 77.28390502929688, "geo/layer_21/stable_rank_gate_proj": 73.69843292236328, "geo/layer_21/stable_rank_down_proj": 56.208213806152344, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14851456880569458, "geo/layer_21/attn_entropy_mean": 5.733092308044434, "geo/layer_21/attn_entropy_std": 0.28008270263671875, "geo/layer_27/stable_rank_q_proj": 42.49586486816406, "geo/layer_27/stable_rank_k_proj": 31.29193878173828, "geo/layer_27/stable_rank_o_proj": 116.38663482666016, "geo/layer_27/stable_rank_gate_proj": 85.58594512939453, "geo/layer_27/stable_rank_down_proj": 132.64096069335938, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08701412379741669, "geo/layer_27/attn_entropy_mean": 4.280678749084473, "geo/layer_27/attn_entropy_std": 0.6669764518737793, "attnres/final_alpha/block_0": 0.2400253415107727, "attnres/block_norm/0": 1.7134020328521729, "attnres/final_alpha/block_1": 0.005245364271104336, "attnres/block_norm/1": 39285.7734375, "attnres/final_alpha/block_2": 0.01122736930847168, "attnres/block_norm/2": 26005.078125, "attnres/final_alpha/block_3": 0.013324793428182602, "attnres/block_norm/3": 45315.1953125, "attnres/final_alpha/block_4": 0.01642375998198986, "attnres/block_norm/4": 12542.8037109375, "attnres/final_alpha/block_5": 0.5981236696243286, "attnres/block_norm/5": 5863.96240234375, "attnres/final_alpha/block_6": 0.11562970280647278, "attnres/block_norm/6": 29828.046875, "geo/tier1_time_s": 1.3578312397003174, "geo/step": 37650.0, "geo/rankme_slope": -3.0305012630052015e-05} {"step": 37660, "timestamp": 1778235246.076762, "train/loss": 2.165065860748291, "train/z_loss": 0.0014333975617773832, "train/perplexity": 8.715175885955354, "train/grad_norm": 0.1640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1789877.8153132196, "perf/iters_per_sec": 0.8534802509847734, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1716732740402223, "data/tokens_consumed": 78980841472, "data/tokens_consumed_B": 78.980841472, "train/loss_slope": -1.089415702596654e-05} {"step": 37670, "timestamp": 1778235256.4271429, "train/loss": 2.2145729780197145, "train/z_loss": 0.0014156088582240045, "train/perplexity": 9.157497823710147, "train/grad_norm": 0.103515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027436.8640705314, "perf/iters_per_sec": 0.9667572326996476, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343858480453492, "data/tokens_consumed": 79001812992, "data/tokens_consumed_B": 79.001812992, "train/loss_slope": -7.598779175040577e-06} {"step": 37680, "timestamp": 1778235266.7865884, "train/loss": 2.1413610696792604, "train/z_loss": 0.0014337449450977147, "train/perplexity": 8.511013831955978, "train/grad_norm": 0.2353515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025540.8471249116, "perf/iters_per_sec": 0.9658531413673933, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0353540897369384, "data/tokens_consumed": 79022784512, "data/tokens_consumed_B": 79.022784512, "train/loss_slope": -1.0816605921590027e-05} {"step": 37690, "timestamp": 1778235277.1425261, "train/loss": 2.1740456342697145, "train/z_loss": 0.001423961005639285, "train/perplexity": 8.793788625656939, "train/grad_norm": 0.2060546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026454.7744951292, "perf/iters_per_sec": 0.9662889358974119, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034887146949768, "data/tokens_consumed": 79043756032, "data/tokens_consumed_B": 79.043756032, "train/loss_slope": -1.1040731961398882e-05} {"step": 37700, "timestamp": 1778235288.066728, "grad/layer_0/attn": 0.003929257858544588, "grad/layer_0/mlp": 0.0032556564547121525, "grad/layer_0/attn_mlp_ratio": 1.2069018314777344, "grad/layer_4/attn": 0.0025922588538378477, "grad/layer_4/mlp": 0.0024943409953266382, "grad/layer_4/attn_mlp_ratio": 1.039255961702538, "grad/layer_8/attn": 0.0034007916692644358, "grad/layer_8/mlp": 0.0036562946625053883, "grad/layer_8/attn_mlp_ratio": 0.9301196676315355, "grad/layer_12/attn": 0.005213938187807798, "grad/layer_12/mlp": 0.00632383581250906, "grad/layer_12/attn_mlp_ratio": 0.8244897970066248, "grad/layer_16/attn": 0.003749793628230691, "grad/layer_16/mlp": 0.0046988665126264095, "grad/layer_16/attn_mlp_ratio": 0.7980208712787362, "grad/layer_20/attn": 0.004145113751292229, "grad/layer_20/mlp": 0.006218191236257553, "grad/layer_20/attn_mlp_ratio": 0.6666108402169214, "grad/layer_24/attn": 0.01443299651145935, "grad/layer_24/mlp": 0.010445255786180496, "grad/layer_24/attn_mlp_ratio": 1.3817752928920393, "grad/layer_27/attn": 0.013548985123634338, "grad/layer_27/mlp": 0.008772371336817741, "grad/layer_27/attn_mlp_ratio": 1.544506547769865} {"step": 37700, "timestamp": 1778235288.0826576, "train/loss": 2.1293214321136475, "train/z_loss": 0.0014350855257362128, "train/perplexity": 8.409158689946949, "train/grad_norm": 0.1640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1918046.930380648, "perf/iters_per_sec": 0.9145960475829353, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0933788776397706, "data/tokens_consumed": 79064727552, "data/tokens_consumed_B": 79.064727552, "train/loss_slope": -1.4287187550256093e-05} {"step": 37710, "timestamp": 1778235298.4500694, "train/loss": 2.134120559692383, "train/z_loss": 0.0014410438598133624, "train/perplexity": 8.44961230872592, "train/grad_norm": 0.1474609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024406.0843410762, "perf/iters_per_sec": 0.9653120443063146, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0359344482421875, "data/tokens_consumed": 79085699072, "data/tokens_consumed_B": 79.085699072, "train/loss_slope": -1.6161934827992905e-05} {"step": 37720, "timestamp": 1778235308.811924, "train/loss": 2.1647211074829102, "train/z_loss": 0.0014378285151906312, "train/perplexity": 8.712171818471093, "train/grad_norm": 0.1259765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025458.3845717153, "perf/iters_per_sec": 0.965813820157869, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0353962421417235, "data/tokens_consumed": 79106670592, "data/tokens_consumed_B": 79.106670592, "train/loss_slope": -2.0034081221270674e-05} {"step": 37725, "timestamp": 1778235314.5779119, "eos/sharpness": 15.81017971038818, "eos/L0_probe": 2.004833936691284, "eos/L_plus": 2.0873465538024902, "eos/L_minus": 2.08042311668396, "eos/grad_norm": 0.10929182171821594, "eos/embed_grad_frac": 0.1934433877468109, "eos/time_s": 0.6018006801605225} {"step": 37725, "timestamp": 1778235315.956389, "geo/rankme_last": 438.99224853515625, "geo/layer_0/stable_rank_q_proj": 19.12826919555664, "geo/layer_0/stable_rank_k_proj": 16.263368606567383, "geo/layer_0/stable_rank_o_proj": 49.536991119384766, "geo/layer_0/stable_rank_gate_proj": 138.7288360595703, "geo/layer_0/stable_rank_down_proj": 52.89706039428711, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05756090581417084, "geo/layer_0/attn_entropy_mean": 6.1974992752075195, "geo/layer_0/attn_entropy_std": 0.3687712848186493, "geo/layer_7/stable_rank_q_proj": 43.2189826965332, "geo/layer_7/stable_rank_k_proj": 42.01655578613281, "geo/layer_7/stable_rank_o_proj": 99.40583801269531, "geo/layer_7/stable_rank_gate_proj": 91.57777404785156, "geo/layer_7/stable_rank_down_proj": 145.791015625, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5012407302856445, "geo/layer_7/attn_entropy_mean": 4.6332244873046875, "geo/layer_7/attn_entropy_std": 0.813845157623291, "geo/layer_14/stable_rank_q_proj": 54.2369499206543, "geo/layer_14/stable_rank_k_proj": 36.8515510559082, "geo/layer_14/stable_rank_o_proj": 49.16068649291992, "geo/layer_14/stable_rank_gate_proj": 77.42211151123047, "geo/layer_14/stable_rank_down_proj": 132.9748992919922, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3796750009059906, "geo/layer_14/attn_entropy_mean": 5.480283737182617, "geo/layer_14/attn_entropy_std": 0.377337247133255, "geo/layer_21/stable_rank_q_proj": 43.660648345947266, "geo/layer_21/stable_rank_k_proj": 31.055057525634766, "geo/layer_21/stable_rank_o_proj": 77.27377319335938, "geo/layer_21/stable_rank_gate_proj": 73.65518188476562, "geo/layer_21/stable_rank_down_proj": 56.181766510009766, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14738351106643677, "geo/layer_21/attn_entropy_mean": 5.702434062957764, "geo/layer_21/attn_entropy_std": 0.2889902591705322, "geo/layer_27/stable_rank_q_proj": 42.58572006225586, "geo/layer_27/stable_rank_k_proj": 31.334686279296875, "geo/layer_27/stable_rank_o_proj": 116.61605072021484, "geo/layer_27/stable_rank_gate_proj": 85.57987213134766, "geo/layer_27/stable_rank_down_proj": 132.6296844482422, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08260520547628403, "geo/layer_27/attn_entropy_mean": 4.259865760803223, "geo/layer_27/attn_entropy_std": 0.6453613042831421, "attnres/final_alpha/block_0": 0.2420853078365326, "attnres/block_norm/0": 1.7137370109558105, "attnres/final_alpha/block_1": 0.005322497338056564, "attnres/block_norm/1": 39464.8046875, "attnres/final_alpha/block_2": 0.011331599205732346, "attnres/block_norm/2": 25922.552734375, "attnres/final_alpha/block_3": 0.013391295447945595, "attnres/block_norm/3": 45620.66015625, "attnres/final_alpha/block_4": 0.016720237210392952, "attnres/block_norm/4": 12521.4248046875, "attnres/final_alpha/block_5": 0.592293381690979, "attnres/block_norm/5": 5947.3828125, "attnres/final_alpha/block_6": 0.11885564029216766, "attnres/block_norm/6": 30007.40234375, "geo/tier1_time_s": 1.359269142150879, "geo/step": 37725.0, "geo/rankme_slope": -5.0966636654661864e-05} {"step": 37730, "timestamp": 1778235321.1329598, "train/loss": 2.1484952330589295, "train/z_loss": 0.0014319785521365702, "train/perplexity": 8.571949900643222, "train/grad_norm": 0.11083984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1702993.7547310286, "perf/iters_per_sec": 0.8120507024436133, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2314502000808716, "data/tokens_consumed": 79127642112, "data/tokens_consumed_B": 79.127642112, "train/loss_slope": -2.1595545806507106e-05} {"step": 37740, "timestamp": 1778235331.4879696, "train/loss": 2.213077926635742, "train/z_loss": 0.001416116871405393, "train/perplexity": 9.143817123138481, "train/grad_norm": 0.283203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026331.3916621262, "perf/iters_per_sec": 0.9662301023779517, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349501609802245, "data/tokens_consumed": 79148613632, "data/tokens_consumed_B": 79.148613632, "train/loss_slope": -1.7921989750225523e-05} {"step": 37750, "timestamp": 1778235341.8344076, "grad/layer_0/attn": 0.0032872078008949757, "grad/layer_0/mlp": 0.003067829180508852, "grad/layer_0/attn_mlp_ratio": 1.0715093638945046, "grad/layer_4/attn": 0.0018688072450459003, "grad/layer_4/mlp": 0.002572152763605118, "grad/layer_4/attn_mlp_ratio": 0.7265537252815483, "grad/layer_8/attn": 0.00542685529217124, "grad/layer_8/mlp": 0.003521442413330078, "grad/layer_8/attn_mlp_ratio": 1.5410887077180464, "grad/layer_12/attn": 0.00448229955509305, "grad/layer_12/mlp": 0.0068781026639044285, "grad/layer_12/attn_mlp_ratio": 0.6516767354241484, "grad/layer_16/attn": 0.0037586961407214403, "grad/layer_16/mlp": 0.00476031331345439, "grad/layer_16/attn_mlp_ratio": 0.7895900572634542, "grad/layer_20/attn": 0.0037982396315783262, "grad/layer_20/mlp": 0.006643553264439106, "grad/layer_20/attn_mlp_ratio": 0.5717180886826518, "grad/layer_24/attn": 0.009315540082752705, "grad/layer_24/mlp": 0.01104775257408619, "grad/layer_24/attn_mlp_ratio": 0.8432067912421144, "grad/layer_27/attn": 0.008189437910914421, "grad/layer_27/mlp": 0.011331702582538128, "grad/layer_27/attn_mlp_ratio": 0.7227014457001366} {"step": 37750, "timestamp": 1778235341.85, "train/loss": 2.185558032989502, "train/z_loss": 0.00142881590873003, "train/perplexity": 8.89561121309736, "train/grad_norm": 0.173828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024954.2870041232, "perf/iters_per_sec": 0.9655734477062813, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0356539964675904, "data/tokens_consumed": 79169585152, "data/tokens_consumed_B": 79.169585152, "train/loss_slope": -1.8224779600285952e-05} {"step": 37760, "timestamp": 1778235352.1958244, "train/loss": 2.1791320204734803, "train/z_loss": 0.0014239947544410826, "train/perplexity": 8.838631177391578, "train/grad_norm": 0.09619140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028208.2653196703, "perf/iters_per_sec": 0.9671250654791214, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0339924335479735, "data/tokens_consumed": 79190556672, "data/tokens_consumed_B": 79.190556672, "train/loss_slope": -1.610606238178905e-05} {"step": 37770, "timestamp": 1778235362.5686572, "train/loss": 2.1303173065185548, "train/z_loss": 0.001420590199995786, "train/perplexity": 8.417537327195792, "train/grad_norm": 0.15234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022663.4379069472, "perf/iters_per_sec": 0.9644810857329117, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0368269681930542, "data/tokens_consumed": 79211528192, "data/tokens_consumed_B": 79.211528192, "train/loss_slope": -1.664206607542776e-05} {"step": 37780, "timestamp": 1778235372.9220695, "train/loss": 2.2134660720825194, "train/z_loss": 0.001420098845846951, "train/perplexity": 9.147366942999632, "train/grad_norm": 0.1767578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026830.1971850356, "perf/iters_per_sec": 0.9664679513859918, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346954584121704, "data/tokens_consumed": 79232499712, "data/tokens_consumed_B": 79.232499712, "train/loss_slope": -1.0875383383370096e-05} {"step": 37790, "timestamp": 1778235383.2733293, "train/loss": 2.2102496147155763, "train/z_loss": 0.001413243159186095, "train/perplexity": 9.117992094007437, "train/grad_norm": 0.287109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027027.1162265376, "perf/iters_per_sec": 0.9665618497021378, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345949411392212, "data/tokens_consumed": 79253471232, "data/tokens_consumed_B": 79.253471232, "train/loss_slope": -6.934523575305227e-06} {"step": 37800, "timestamp": 1778235393.622744, "grad/layer_0/attn": 0.002982366131618619, "grad/layer_0/mlp": 0.0028796393889933825, "grad/layer_0/attn_mlp_ratio": 1.0356734386432331, "grad/layer_4/attn": 0.0017763610230758786, "grad/layer_4/mlp": 0.0024965829215943813, "grad/layer_4/attn_mlp_ratio": 0.7115169043893639, "grad/layer_8/attn": 0.005474905017763376, "grad/layer_8/mlp": 0.0036060865968465805, "grad/layer_8/attn_mlp_ratio": 1.5182399864515248, "grad/layer_12/attn": 0.004086191300302744, "grad/layer_12/mlp": 0.006171874701976776, "grad/layer_12/attn_mlp_ratio": 0.6620664597723186, "grad/layer_16/attn": 0.0038984150160104036, "grad/layer_16/mlp": 0.004604156129062176, "grad/layer_16/attn_mlp_ratio": 0.8467164931118063, "grad/layer_20/attn": 0.003840632038190961, "grad/layer_20/mlp": 0.0061742267571389675, "grad/layer_20/attn_mlp_ratio": 0.6220425855830385, "grad/layer_24/attn": 0.007106411270797253, "grad/layer_24/mlp": 0.008300086483359337, "grad/layer_24/attn_mlp_ratio": 0.856185197518872, "grad/layer_27/attn": 0.004101170692592859, "grad/layer_27/mlp": 0.007147908676415682, "grad/layer_27/attn_mlp_ratio": 0.5737581187555937} {"step": 37800, "timestamp": 1778235394.2356713, "eos/sharpness": 11.558556556701658, "eos/L0_probe": 2.003618001937866, "eos/L_plus": 2.064284563064575, "eos/L_minus": 2.058537006378174, "eos/grad_norm": 0.114071324467659, "eos/embed_grad_frac": 0.18952448666095734, "eos/time_s": 0.6100914478302002} {"step": 37800, "timestamp": 1778235394.2554653, "train/loss": 2.2114439725875856, "train/z_loss": 0.0014115784550085663, "train/perplexity": 9.128888745597306, "train/grad_norm": 0.11376953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1910407.4059708766, "perf/iters_per_sec": 0.9109532384733565, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0977511882781983, "data/tokens_consumed": 79274442752, "data/tokens_consumed_B": 79.274442752, "train/loss_slope": -7.056837671338834e-06} {"step": 37800, "timestamp": 1778235395.62035, "geo/rankme_last": 438.4844970703125, "geo/layer_0/stable_rank_q_proj": 19.17641830444336, "geo/layer_0/stable_rank_k_proj": 16.27726173400879, "geo/layer_0/stable_rank_o_proj": 49.50261688232422, "geo/layer_0/stable_rank_gate_proj": 138.5894317626953, "geo/layer_0/stable_rank_down_proj": 52.889122009277344, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05698566511273384, "geo/layer_0/attn_entropy_mean": 6.201225757598877, "geo/layer_0/attn_entropy_std": 0.3709622919559479, "geo/layer_7/stable_rank_q_proj": 43.22256851196289, "geo/layer_7/stable_rank_k_proj": 42.08026885986328, "geo/layer_7/stable_rank_o_proj": 99.26448822021484, "geo/layer_7/stable_rank_gate_proj": 91.57994079589844, "geo/layer_7/stable_rank_down_proj": 145.85890197753906, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.52349454164505, "geo/layer_7/attn_entropy_mean": 4.654299736022949, "geo/layer_7/attn_entropy_std": 0.8212385773658752, "geo/layer_14/stable_rank_q_proj": 54.218467712402344, "geo/layer_14/stable_rank_k_proj": 36.88722610473633, "geo/layer_14/stable_rank_o_proj": 49.03983688354492, "geo/layer_14/stable_rank_gate_proj": 77.37267303466797, "geo/layer_14/stable_rank_down_proj": 132.99610900878906, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4007478654384613, "geo/layer_14/attn_entropy_mean": 5.506157398223877, "geo/layer_14/attn_entropy_std": 0.37324264645576477, "geo/layer_21/stable_rank_q_proj": 43.698123931884766, "geo/layer_21/stable_rank_k_proj": 31.033971786499023, "geo/layer_21/stable_rank_o_proj": 77.2353744506836, "geo/layer_21/stable_rank_gate_proj": 73.69599151611328, "geo/layer_21/stable_rank_down_proj": 56.11153030395508, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15022510290145874, "geo/layer_21/attn_entropy_mean": 5.727943420410156, "geo/layer_21/attn_entropy_std": 0.28770914673805237, "geo/layer_27/stable_rank_q_proj": 42.64408874511719, "geo/layer_27/stable_rank_k_proj": 31.37302589416504, "geo/layer_27/stable_rank_o_proj": 116.6578598022461, "geo/layer_27/stable_rank_gate_proj": 85.532470703125, "geo/layer_27/stable_rank_down_proj": 132.27366638183594, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09023534506559372, "geo/layer_27/attn_entropy_mean": 4.2754058837890625, "geo/layer_27/attn_entropy_std": 0.6326360106468201, "attnres/final_alpha/block_0": 0.2422356754541397, "attnres/block_norm/0": 1.7136948108673096, "attnres/final_alpha/block_1": 0.005305481143295765, "attnres/block_norm/1": 39344.5703125, "attnres/final_alpha/block_2": 0.01134673971682787, "attnres/block_norm/2": 25971.013671875, "attnres/final_alpha/block_3": 0.013514449819922447, "attnres/block_norm/3": 45767.6328125, "attnres/final_alpha/block_4": 0.016631444916129112, "attnres/block_norm/4": 12607.857421875, "attnres/final_alpha/block_5": 0.5925022959709167, "attnres/block_norm/5": 5909.0078125, "attnres/final_alpha/block_6": 0.11846393346786499, "attnres/block_norm/6": 30138.564453125, "geo/tier1_time_s": 1.3607702255249023, "geo/step": 37800.0, "geo/rankme_slope": -4.8699440713785516e-05} {"step": 37810, "timestamp": 1778235405.9809675, "train/loss": 2.1951876878738403, "train/z_loss": 0.0014182913699187338, "train/perplexity": 8.981686652302777, "train/grad_norm": 0.1943359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1789127.9574623257, "perf/iters_per_sec": 0.8531226908980969, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1721643447875976, "data/tokens_consumed": 79295414272, "data/tokens_consumed_B": 79.295414272, "train/loss_slope": -1.1537217392849294e-07} {"step": 37820, "timestamp": 1778235416.329997, "train/loss": 2.146481251716614, "train/z_loss": 0.001420175505336374, "train/perplexity": 8.554703526243642, "train/grad_norm": 0.232421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027309.9506810673, "perf/iters_per_sec": 0.9666967156796776, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034450602531433, "data/tokens_consumed": 79316385792, "data/tokens_consumed_B": 79.316385792, "train/loss_slope": -3.6065179021945567e-06} {"step": 37830, "timestamp": 1778235426.680446, "train/loss": 2.1598353147506715, "train/z_loss": 0.0014350449084304274, "train/perplexity": 8.66970976757572, "train/grad_norm": 0.10302734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027146.6595156498, "perf/iters_per_sec": 0.9666188523844003, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034533929824829, "data/tokens_consumed": 79337357312, "data/tokens_consumed_B": 79.337357312, "train/loss_slope": -8.232689704975102e-06} {"step": 37840, "timestamp": 1778235437.0345914, "train/loss": 2.1813410997390745, "train/z_loss": 0.0014263416291214527, "train/perplexity": 8.858177996549154, "train/grad_norm": 0.14453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026688.7903217464, "perf/iters_per_sec": 0.9664005233391506, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347676515579223, "data/tokens_consumed": 79358328832, "data/tokens_consumed_B": 79.358328832, "train/loss_slope": -6.87866671608259e-06} {"step": 37850, "timestamp": 1778235447.3711991, "grad/layer_0/attn": 0.0029704563785344362, "grad/layer_0/mlp": 0.003004632890224457, "grad/layer_0/attn_mlp_ratio": 0.9886253622984192, "grad/layer_4/attn": 0.0021601407788693905, "grad/layer_4/mlp": 0.002515084110200405, "grad/layer_4/attn_mlp_ratio": 0.8588741363444319, "grad/layer_8/attn": 0.004173539113253355, "grad/layer_8/mlp": 0.0038382119964808226, "grad/layer_8/attn_mlp_ratio": 1.0873654212803892, "grad/layer_12/attn": 0.005984458141028881, "grad/layer_12/mlp": 0.006693254690617323, "grad/layer_12/attn_mlp_ratio": 0.8941028435699115, "grad/layer_16/attn": 0.007063285447657108, "grad/layer_16/mlp": 0.004534061998128891, "grad/layer_16/attn_mlp_ratio": 1.5578272407367277, "grad/layer_20/attn": 0.005693159531801939, "grad/layer_20/mlp": 0.006242736242711544, "grad/layer_20/attn_mlp_ratio": 0.911965397745615, "grad/layer_24/attn": 0.01648854836821556, "grad/layer_24/mlp": 0.012514617294073105, "grad/layer_24/attn_mlp_ratio": 1.3175431456677613, "grad/layer_27/attn": 0.00885503925383091, "grad/layer_27/mlp": 0.011681731790304184, "grad/layer_27/attn_mlp_ratio": 0.7580245238448399} {"step": 37850, "timestamp": 1778235447.3868785, "train/loss": 2.136500668525696, "train/z_loss": 0.0014445289736613632, "train/perplexity": 8.469747257799828, "train/grad_norm": 0.1943359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026919.2170413712, "perf/iters_per_sec": 0.9665103993613106, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346500158309937, "data/tokens_consumed": 79379300352, "data/tokens_consumed_B": 79.379300352, "train/loss_slope": -7.250498337606848e-06} {"step": 37860, "timestamp": 1778235457.7366962, "train/loss": 2.207300543785095, "train/z_loss": 0.001413724524900317, "train/perplexity": 9.091142099307644, "train/grad_norm": 0.1591796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027270.608887607, "perf/iters_per_sec": 0.9666779560506854, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344706773757935, "data/tokens_consumed": 79400271872, "data/tokens_consumed_B": 79.400271872, "train/loss_slope": -6.658203223428976e-06} {"step": 37870, "timestamp": 1778235468.7759, "train/loss": 2.1442670822143555, "train/z_loss": 0.0014251764630898833, "train/perplexity": 8.535782917042829, "train/grad_norm": 0.10546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1900822.998861031, "perf/iters_per_sec": 0.9063830370240359, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1032863140106202, "data/tokens_consumed": 79421243392, "data/tokens_consumed_B": 79.421243392, "train/loss_slope": -9.089978936553262e-06} {"step": 37875, "timestamp": 1778235474.5596042, "eos/sharpness": 72.36921787261961, "eos/L0_probe": 1.9999419450759888, "eos/L_plus": 2.44644832611084, "eos/L_minus": 2.277127742767334, "eos/grad_norm": 0.2385471910238266, "eos/embed_grad_frac": 0.037177469581365585, "eos/time_s": 0.6128873825073242} {"step": 37875, "timestamp": 1778235475.9384596, "geo/rankme_last": 439.5064392089844, "geo/layer_0/stable_rank_q_proj": 19.163541793823242, "geo/layer_0/stable_rank_k_proj": 16.231969833374023, "geo/layer_0/stable_rank_o_proj": 49.54204177856445, "geo/layer_0/stable_rank_gate_proj": 138.50531005859375, "geo/layer_0/stable_rank_down_proj": 52.91139221191406, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05871429666876793, "geo/layer_0/attn_entropy_mean": 6.2012128829956055, "geo/layer_0/attn_entropy_std": 0.37085357308387756, "geo/layer_7/stable_rank_q_proj": 43.20872497558594, "geo/layer_7/stable_rank_k_proj": 42.11558151245117, "geo/layer_7/stable_rank_o_proj": 99.56410217285156, "geo/layer_7/stable_rank_gate_proj": 91.51216888427734, "geo/layer_7/stable_rank_down_proj": 145.5072479248047, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5132117867469788, "geo/layer_7/attn_entropy_mean": 4.669337272644043, "geo/layer_7/attn_entropy_std": 0.831118106842041, "geo/layer_14/stable_rank_q_proj": 54.13441848754883, "geo/layer_14/stable_rank_k_proj": 36.86350631713867, "geo/layer_14/stable_rank_o_proj": 49.07657241821289, "geo/layer_14/stable_rank_gate_proj": 77.46083068847656, "geo/layer_14/stable_rank_down_proj": 133.07449340820312, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38319844007492065, "geo/layer_14/attn_entropy_mean": 5.494892120361328, "geo/layer_14/attn_entropy_std": 0.3678641617298126, "geo/layer_21/stable_rank_q_proj": 43.64634323120117, "geo/layer_21/stable_rank_k_proj": 31.127243041992188, "geo/layer_21/stable_rank_o_proj": 77.1110610961914, "geo/layer_21/stable_rank_gate_proj": 73.7086181640625, "geo/layer_21/stable_rank_down_proj": 56.14491653442383, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1464722603559494, "geo/layer_21/attn_entropy_mean": 5.7378997802734375, "geo/layer_21/attn_entropy_std": 0.2842857241630554, "geo/layer_27/stable_rank_q_proj": 42.69165802001953, "geo/layer_27/stable_rank_k_proj": 31.44045639038086, "geo/layer_27/stable_rank_o_proj": 116.34513092041016, "geo/layer_27/stable_rank_gate_proj": 85.48014068603516, "geo/layer_27/stable_rank_down_proj": 132.3135223388672, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08607853204011917, "geo/layer_27/attn_entropy_mean": 4.274877548217773, "geo/layer_27/attn_entropy_std": 0.6279263496398926, "attnres/final_alpha/block_0": 0.23988136649131775, "attnres/block_norm/0": 1.71372389793396, "attnres/final_alpha/block_1": 0.0053346604108810425, "attnres/block_norm/1": 39242.77734375, "attnres/final_alpha/block_2": 0.011456922627985477, "attnres/block_norm/2": 26073.3671875, "attnres/final_alpha/block_3": 0.013540356419980526, "attnres/block_norm/3": 45625.84375, "attnres/final_alpha/block_4": 0.016557544469833374, "attnres/block_norm/4": 12587.4560546875, "attnres/final_alpha/block_5": 0.5966044664382935, "attnres/block_norm/5": 5923.19287109375, "attnres/final_alpha/block_6": 0.11662471294403076, "attnres/block_norm/6": 30072.287109375, "geo/tier1_time_s": 1.3587329387664795, "geo/step": 37875.0, "geo/rankme_slope": -4.970429578081232e-05} {"step": 37880, "timestamp": 1778235481.7118442, "train/loss": 2.1979734659194947, "train/z_loss": 0.0014190171961672605, "train/perplexity": 9.006742521633095, "train/grad_norm": 0.1455078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1621889.760051729, "perf/iters_per_sec": 0.7733773041018147, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2930299282073974, "data/tokens_consumed": 79442214912, "data/tokens_consumed_B": 79.442214912, "train/loss_slope": -6.973197071751781e-06} {"step": 37890, "timestamp": 1778235492.0624683, "train/loss": 2.2121629238128664, "train/z_loss": 0.0014342542504891752, "train/perplexity": 9.135454331231516, "train/grad_norm": 0.1630859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027146.7996684604, "perf/iters_per_sec": 0.9666189192144682, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345338582992554, "data/tokens_consumed": 79463186432, "data/tokens_consumed_B": 79.463186432, "train/loss_slope": -7.270822072937485e-06} {"step": 37900, "timestamp": 1778235502.401059, "grad/layer_0/attn": 0.003172178752720356, "grad/layer_0/mlp": 0.002763546770438552, "grad/layer_0/attn_mlp_ratio": 1.1478650087874058, "grad/layer_4/attn": 0.004240324255079031, "grad/layer_4/mlp": 0.002534640021622181, "grad/layer_4/attn_mlp_ratio": 1.6729492360300857, "grad/layer_8/attn": 0.007172063924372196, "grad/layer_8/mlp": 0.0035911540035158396, "grad/layer_8/attn_mlp_ratio": 1.9971473564307771, "grad/layer_12/attn": 0.004769814666360617, "grad/layer_12/mlp": 0.006230582483112812, "grad/layer_12/attn_mlp_ratio": 0.7655487432729938, "grad/layer_16/attn": 0.004201102070510387, "grad/layer_16/mlp": 0.004785384051501751, "grad/layer_16/attn_mlp_ratio": 0.8779027842920398, "grad/layer_20/attn": 0.0038727836217731237, "grad/layer_20/mlp": 0.006731233559548855, "grad/layer_20/attn_mlp_ratio": 0.5753452959219794, "grad/layer_24/attn": 0.01528456062078476, "grad/layer_24/mlp": 0.012100933119654655, "grad/layer_24/attn_mlp_ratio": 1.2630894116462996, "grad/layer_27/attn": 0.008082625456154346, "grad/layer_27/mlp": 0.010977421887218952, "grad/layer_27/attn_mlp_ratio": 0.73629541303641} {"step": 37900, "timestamp": 1778235502.416682, "train/loss": 2.1851405620574953, "train/z_loss": 0.0014232689514756202, "train/perplexity": 8.891898329057964, "train/grad_norm": 0.2412109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026445.4373950749, "perf/iters_per_sec": 0.9662844836211562, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03489191532135, "data/tokens_consumed": 79484157952, "data/tokens_consumed_B": 79.484157952, "train/loss_slope": -7.618593776186597e-06} {"step": 37910, "timestamp": 1778235512.7707098, "train/loss": 2.153497815132141, "train/z_loss": 0.0014239358715713024, "train/perplexity": 8.614939222701006, "train/grad_norm": 0.1708984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026356.5991921935, "perf/iters_per_sec": 0.9662421222649543, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034937286376953, "data/tokens_consumed": 79505129472, "data/tokens_consumed_B": 79.505129472, "train/loss_slope": -6.892173575668152e-06} {"step": 37920, "timestamp": 1778235523.130591, "train/loss": 2.195061779022217, "train/z_loss": 0.0014215184724889695, "train/perplexity": 8.98055584964127, "train/grad_norm": 0.2109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025428.209059836, "perf/iters_per_sec": 0.9657994313525372, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354116678237915, "data/tokens_consumed": 79526100992, "data/tokens_consumed_B": 79.526100992, "train/loss_slope": -6.5677674940458855e-06} {"step": 37930, "timestamp": 1778235534.0598385, "train/loss": 2.1645847082138063, "train/z_loss": 0.0014335375046357513, "train/perplexity": 8.710983565642998, "train/grad_norm": 0.2353515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1919902.124924152, "perf/iters_per_sec": 0.9154806732769737, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0923223495483398, "data/tokens_consumed": 79547072512, "data/tokens_consumed_B": 79.547072512, "train/loss_slope": -1.010693566705444e-05} {"step": 37940, "timestamp": 1778235544.8050663, "train/loss": 2.177721607685089, "train/z_loss": 0.0014107929193414748, "train/perplexity": 8.826173846001938, "train/grad_norm": 0.130859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1953044.5240168204, "perf/iters_per_sec": 0.9312842006763555, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0737860679626465, "data/tokens_consumed": 79568044032, "data/tokens_consumed_B": 79.568044032, "train/loss_slope": -1.1892564991305083e-05} {"step": 37950, "timestamp": 1778235555.1453962, "grad/layer_0/attn": 0.0027237662579864264, "grad/layer_0/mlp": 0.0027000324334949255, "grad/layer_0/attn_mlp_ratio": 1.0087901624136284, "grad/layer_4/attn": 0.0019501956412568688, "grad/layer_4/mlp": 0.0024772430770099163, "grad/layer_4/attn_mlp_ratio": 0.7872443284356101, "grad/layer_8/attn": 0.003689027391374111, "grad/layer_8/mlp": 0.003763537621125579, "grad/layer_8/attn_mlp_ratio": 0.9802020504980661, "grad/layer_12/attn": 0.0041737183928489685, "grad/layer_12/mlp": 0.00644091097638011, "grad/layer_12/attn_mlp_ratio": 0.6480012444442351, "grad/layer_16/attn": 0.00371119799092412, "grad/layer_16/mlp": 0.004361580591648817, "grad/layer_16/attn_mlp_ratio": 0.8508837170042517, "grad/layer_20/attn": 0.0050081852823495865, "grad/layer_20/mlp": 0.006162272300571203, "grad/layer_20/attn_mlp_ratio": 0.8127172829758971, "grad/layer_24/attn": 0.007510030642151833, "grad/layer_24/mlp": 0.008812880143523216, "grad/layer_24/attn_mlp_ratio": 0.8521652892845246, "grad/layer_27/attn": 0.003941243514418602, "grad/layer_27/mlp": 0.00800379179418087, "grad/layer_27/attn_mlp_ratio": 0.4924220377698813} {"step": 37950, "timestamp": 1778235555.7518995, "eos/sharpness": 20.321202278137203, "eos/L0_probe": 1.9965224266052246, "eos/L_plus": 2.1043574810028076, "eos/L_minus": 2.0918993949890137, "eos/grad_norm": 0.11204151809215546, "eos/embed_grad_frac": 0.19550248980522156, "eos/time_s": 0.6037311553955078} {"step": 37950, "timestamp": 1778235555.7718863, "train/loss": 2.110861909389496, "train/z_loss": 0.0014346585841849447, "train/perplexity": 8.25535358587602, "train/grad_norm": 0.11181640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1913209.4255949608, "perf/iters_per_sec": 0.9122893455481342, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0961434602737428, "data/tokens_consumed": 79589015552, "data/tokens_consumed_B": 79.589015552, "train/loss_slope": -1.434518009772921e-05} {"step": 37950, "timestamp": 1778235557.1308212, "geo/rankme_last": 438.4380798339844, "geo/layer_0/stable_rank_q_proj": 19.17752456665039, "geo/layer_0/stable_rank_k_proj": 16.267610549926758, "geo/layer_0/stable_rank_o_proj": 49.48309326171875, "geo/layer_0/stable_rank_gate_proj": 138.48731994628906, "geo/layer_0/stable_rank_down_proj": 52.922401428222656, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06215735897421837, "geo/layer_0/attn_entropy_mean": 6.20434045791626, "geo/layer_0/attn_entropy_std": 0.3723108470439911, "geo/layer_7/stable_rank_q_proj": 43.133235931396484, "geo/layer_7/stable_rank_k_proj": 42.192081451416016, "geo/layer_7/stable_rank_o_proj": 99.58238220214844, "geo/layer_7/stable_rank_gate_proj": 91.59873962402344, "geo/layer_7/stable_rank_down_proj": 145.5963897705078, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.49867552518844604, "geo/layer_7/attn_entropy_mean": 4.665266513824463, "geo/layer_7/attn_entropy_std": 0.8176717758178711, "geo/layer_14/stable_rank_q_proj": 54.14830017089844, "geo/layer_14/stable_rank_k_proj": 37.03565216064453, "geo/layer_14/stable_rank_o_proj": 48.98624038696289, "geo/layer_14/stable_rank_gate_proj": 77.39553833007812, "geo/layer_14/stable_rank_down_proj": 132.8883056640625, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3913307785987854, "geo/layer_14/attn_entropy_mean": 5.503499984741211, "geo/layer_14/attn_entropy_std": 0.3742704391479492, "geo/layer_21/stable_rank_q_proj": 43.639888763427734, "geo/layer_21/stable_rank_k_proj": 31.1866455078125, "geo/layer_21/stable_rank_o_proj": 77.04862976074219, "geo/layer_21/stable_rank_gate_proj": 73.53447723388672, "geo/layer_21/stable_rank_down_proj": 56.0672721862793, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14757774770259857, "geo/layer_21/attn_entropy_mean": 5.73447322845459, "geo/layer_21/attn_entropy_std": 0.2877781391143799, "geo/layer_27/stable_rank_q_proj": 42.70737075805664, "geo/layer_27/stable_rank_k_proj": 31.415668487548828, "geo/layer_27/stable_rank_o_proj": 116.39388275146484, "geo/layer_27/stable_rank_gate_proj": 85.37255859375, "geo/layer_27/stable_rank_down_proj": 132.4801483154297, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08508816361427307, "geo/layer_27/attn_entropy_mean": 4.2548418045043945, "geo/layer_27/attn_entropy_std": 0.652675449848175, "attnres/final_alpha/block_0": 0.2414969503879547, "attnres/block_norm/0": 1.7140494585037231, "attnres/final_alpha/block_1": 0.005324453115463257, "attnres/block_norm/1": 39472.8203125, "attnres/final_alpha/block_2": 0.011432862840592861, "attnres/block_norm/2": 25977.16796875, "attnres/final_alpha/block_3": 0.013322876766324043, "attnres/block_norm/3": 45735.67578125, "attnres/final_alpha/block_4": 0.016768867149949074, "attnres/block_norm/4": 12630.9443359375, "attnres/final_alpha/block_5": 0.592859148979187, "attnres/block_norm/5": 5953.84912109375, "attnres/final_alpha/block_6": 0.11879484355449677, "attnres/block_norm/6": 30118.212890625, "geo/tier1_time_s": 1.354776382446289, "geo/step": 37950.0, "geo/rankme_slope": -9.084260657387955e-05} {"step": 37960, "timestamp": 1778235567.4795341, "train/loss": 2.192653274536133, "train/z_loss": 0.001415938977152109, "train/perplexity": 8.958952167316166, "train/grad_norm": 0.134765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1791850.057512971, "perf/iters_per_sec": 0.8544206893505912, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1703836441040039, "data/tokens_consumed": 79609987072, "data/tokens_consumed_B": 79.609987072, "train/loss_slope": -1.1860203936119197e-05} {"step": 37970, "timestamp": 1778235577.832945, "train/loss": 2.151080918312073, "train/z_loss": 0.0014242430916056037, "train/perplexity": 8.594142944840756, "train/grad_norm": 0.1337890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026676.7426967314, "perf/iters_per_sec": 0.9663947785838753, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347738027572633, "data/tokens_consumed": 79630958592, "data/tokens_consumed_B": 79.630958592, "train/loss_slope": -1.4404897242024342e-05} {"step": 37980, "timestamp": 1778235588.1853929, "train/loss": 2.1664462089538574, "train/z_loss": 0.00142511788289994, "train/perplexity": 8.727214169951143, "train/grad_norm": 0.1142578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026768.4109796782, "perf/iters_per_sec": 0.9664384894274131, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347270011901855, "data/tokens_consumed": 79651930112, "data/tokens_consumed_B": 79.651930112, "train/loss_slope": -1.7245760015492697e-05} {"step": 37990, "timestamp": 1778235598.5419288, "train/loss": 2.2043647527694703, "train/z_loss": 0.0014312355662696064, "train/perplexity": 9.064491545380584, "train/grad_norm": 0.1787109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026584.7091718116, "perf/iters_per_sec": 0.9663508935793932, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034820795059204, "data/tokens_consumed": 79672901632, "data/tokens_consumed_B": 79.672901632, "train/loss_slope": -1.5775341247961353e-05} {"step": 38000, "timestamp": 1778235608.887706, "grad/layer_0/attn": 0.0026130639016628265, "grad/layer_0/mlp": 0.0027087985072284937, "grad/layer_0/attn_mlp_ratio": 0.9646578725674929, "grad/layer_4/attn": 0.0020294790156185627, "grad/layer_4/mlp": 0.002490475308150053, "grad/layer_4/attn_mlp_ratio": 0.8148962278355029, "grad/layer_8/attn": 0.006351733114570379, "grad/layer_8/mlp": 0.00345591246150434, "grad/layer_8/attn_mlp_ratio": 1.8379322397571032, "grad/layer_12/attn": 0.0042294845916330814, "grad/layer_12/mlp": 0.006769984029233456, "grad/layer_12/attn_mlp_ratio": 0.6247406952358651, "grad/layer_16/attn": 0.0034450357779860497, "grad/layer_16/mlp": 0.004521477036178112, "grad/layer_16/attn_mlp_ratio": 0.7619270592835619, "grad/layer_20/attn": 0.004081698600202799, "grad/layer_20/mlp": 0.005996898282319307, "grad/layer_20/attn_mlp_ratio": 0.6806349449303487, "grad/layer_24/attn": 0.010278448462486267, "grad/layer_24/mlp": 0.010193935595452785, "grad/layer_24/attn_mlp_ratio": 1.0082904944231874, "grad/layer_27/attn": 0.0041351597756147385, "grad/layer_27/mlp": 0.009618924930691719, "grad/layer_27/attn_mlp_ratio": 0.42989832672782247} {"step": 38000, "timestamp": 1778235608.903658, "train/loss": 2.182274317741394, "train/z_loss": 0.0014282561256550252, "train/perplexity": 8.86644846619864, "train/grad_norm": 0.130859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025077.0359632012, "perf/iters_per_sec": 0.9656319789711004, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035591220855713, "data/tokens_consumed": 79693873152, "data/tokens_consumed_B": 79.693873152, "train/loss_slope": -1.645555308800063e-05} {"step": 38000, "timestamp": 1778235615.9097054, "geo/ww_alpha_mean": 7.6153637751572445, "geo/ww_alpha_std": 4.5410921651114595, "geo/ww_alpha_min": 1.3523297183568372, "geo/ww_alpha_max": 36.24119354940324, "geo/ww_alpha_healthy_frac": 0.16751269035532995, "geo/ww_alpha_by_type/q_proj": 4.068755220714746, "geo/ww_alpha_by_type/k_proj": 4.5322804389842295, "geo/ww_alpha_by_type/v_proj": 8.043283964678425, "geo/ww_alpha_by_type/o_proj": 8.424965200147488, "geo/ww_alpha_by_type/gate_proj": 7.8520618441021055, "geo/ww_alpha_by_type/up_proj": 11.68911365323993, "geo/ww_alpha_by_type/down_proj": 8.804161213589731, "geo/twonn_id/layer_0": 0.7291220426559448, "geo/twonn_id/layer_7": 3.2277185916900635, "geo/twonn_id/layer_14": 4.451842308044434, "geo/twonn_id/layer_21": 7.689896106719971, "geo/twonn_id/layer_27": 6.055816650390625, "geo/tier2_time_s": 6.996649503707886} {"step": 38000, "timestamp": 1778235616.517104, "eoc/jacobian_sigma/layer_0/attn": 1030.405029296875, "eoc/jacobian_sigma/layer_0/mlp": 8921.6875, "eoc/jacobian_sigma/layer_0": 8921.6875, "eoc/jacobian_sigma/layer_7/attn": 1.1314666271209717, "eoc/jacobian_sigma/layer_7/mlp": 1.648082971572876, "eoc/jacobian_sigma/layer_7": 1.648082971572876, "eoc/jacobian_sigma/layer_14/attn": 1.6280994415283203, "eoc/jacobian_sigma/layer_14/mlp": 8.692882537841797, "eoc/jacobian_sigma/layer_14": 8.692882537841797, "eoc/jacobian_sigma/layer_21/attn": 1.0939548015594482, "eoc/jacobian_sigma/layer_21/mlp": 3.903740167617798, "eoc/jacobian_sigma/layer_21": 3.903740167617798, "eoc/jacobian_sigma/layer_27/attn": 3.5626814365386963, "eoc/jacobian_sigma/layer_27/mlp": 28.174514770507812, "eoc/jacobian_sigma/layer_27": 28.174514770507812, "eoc/layer0_sigma": 8921.6875, "eoc/sigma_max": 28.174514770507812, "eoc/sigma_min": 1.648082971572876, "eoc/sigma_mean": 10.60480511188507, "eoc/time_s": 0.6000361442565918} {"step": 38010, "timestamp": 1778235626.8880033, "train/loss": 2.149240279197693, "train/z_loss": 0.0014273803099058568, "train/perplexity": 8.578338778527232, "train/grad_norm": 0.1015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1166637.0044176823, "perf/iters_per_sec": 0.5562958738411342, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.7976045608520508, "data/tokens_consumed": 79714844672, "data/tokens_consumed_B": 79.714844672, "train/loss_slope": -2.0653492417952016e-05} {"step": 38020, "timestamp": 1778235637.2379377, "train/loss": 2.1474079370498655, "train/z_loss": 0.0014271511230617762, "train/perplexity": 8.562634718824075, "train/grad_norm": 0.1484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027484.5775212608, "perf/iters_per_sec": 0.9667799842459015, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343615055084228, "data/tokens_consumed": 79735816192, "data/tokens_consumed_B": 79.735816192, "train/loss_slope": -1.9744571436284782e-05} {"step": 38025, "timestamp": 1778235643.0242786, "eos/sharpness": 67.46435165405272, "eos/L0_probe": 1.997380018234253, "eos/L_plus": 2.3028767108917236, "eos/L_minus": 2.3665268421173096, "eos/grad_norm": 0.2354070246219635, "eos/embed_grad_frac": 0.04665461182594299, "eos/time_s": 0.6185548305511475} {"step": 38025, "timestamp": 1778235644.404743, "geo/rankme_last": 439.3289489746094, "geo/layer_0/stable_rank_q_proj": 19.198814392089844, "geo/layer_0/stable_rank_k_proj": 16.262773513793945, "geo/layer_0/stable_rank_o_proj": 49.49300765991211, "geo/layer_0/stable_rank_gate_proj": 138.48587036132812, "geo/layer_0/stable_rank_down_proj": 52.98318862915039, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05641490966081619, "geo/layer_0/attn_entropy_mean": 6.2094573974609375, "geo/layer_0/attn_entropy_std": 0.3727026879787445, "geo/layer_7/stable_rank_q_proj": 43.17272186279297, "geo/layer_7/stable_rank_k_proj": 42.28908920288086, "geo/layer_7/stable_rank_o_proj": 99.66022491455078, "geo/layer_7/stable_rank_gate_proj": 91.55671691894531, "geo/layer_7/stable_rank_down_proj": 146.2650909423828, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5102893114089966, "geo/layer_7/attn_entropy_mean": 4.6588287353515625, "geo/layer_7/attn_entropy_std": 0.8312027454376221, "geo/layer_14/stable_rank_q_proj": 54.10133361816406, "geo/layer_14/stable_rank_k_proj": 37.001163482666016, "geo/layer_14/stable_rank_o_proj": 48.915897369384766, "geo/layer_14/stable_rank_gate_proj": 77.40068054199219, "geo/layer_14/stable_rank_down_proj": 133.1061553955078, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38900113105773926, "geo/layer_14/attn_entropy_mean": 5.521755218505859, "geo/layer_14/attn_entropy_std": 0.35389941930770874, "geo/layer_21/stable_rank_q_proj": 43.60338592529297, "geo/layer_21/stable_rank_k_proj": 31.194095611572266, "geo/layer_21/stable_rank_o_proj": 76.89472961425781, "geo/layer_21/stable_rank_gate_proj": 73.55364227294922, "geo/layer_21/stable_rank_down_proj": 56.03378677368164, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1463003009557724, "geo/layer_21/attn_entropy_mean": 5.733053207397461, "geo/layer_21/attn_entropy_std": 0.2926676869392395, "geo/layer_27/stable_rank_q_proj": 42.678192138671875, "geo/layer_27/stable_rank_k_proj": 31.39101219177246, "geo/layer_27/stable_rank_o_proj": 116.3503189086914, "geo/layer_27/stable_rank_gate_proj": 85.45378875732422, "geo/layer_27/stable_rank_down_proj": 132.1343231201172, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08652431517839432, "geo/layer_27/attn_entropy_mean": 4.27947998046875, "geo/layer_27/attn_entropy_std": 0.6734803318977356, "attnres/final_alpha/block_0": 0.24380633234977722, "attnres/block_norm/0": 1.7144933938980103, "attnres/final_alpha/block_1": 0.005532830487936735, "attnres/block_norm/1": 39305.21875, "attnres/final_alpha/block_2": 0.011508259922266006, "attnres/block_norm/2": 26080.31640625, "attnres/final_alpha/block_3": 0.01351124793291092, "attnres/block_norm/3": 45364.5546875, "attnres/final_alpha/block_4": 0.01670488342642784, "attnres/block_norm/4": 12636.267578125, "attnres/final_alpha/block_5": 0.5882091522216797, "attnres/block_norm/5": 5972.9658203125, "attnres/final_alpha/block_6": 0.1207273080945015, "attnres/block_norm/6": 29893.95703125, "geo/tier1_time_s": 1.3590359687805176, "geo/step": 38025.0, "geo/rankme_slope": -8.367188672343938e-05} {"step": 38030, "timestamp": 1778235649.5817983, "train/loss": 2.158677577972412, "train/z_loss": 0.0014304353389889002, "train/perplexity": 8.659678333721391, "train/grad_norm": 0.1513671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1699757.3199292254, "perf/iters_per_sec": 0.8105074500700118, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2337949514389037, "data/tokens_consumed": 79756787712, "data/tokens_consumed_B": 79.756787712, "train/loss_slope": -1.902661048861684e-05} {"step": 38040, "timestamp": 1778235659.9361756, "train/loss": 2.203599715232849, "train/z_loss": 0.001421375386416912, "train/perplexity": 9.057559521065484, "train/grad_norm": 0.1953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026282.518848939, "perf/iters_per_sec": 0.9662067980045982, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349751234054565, "data/tokens_consumed": 79777759232, "data/tokens_consumed_B": 79.777759232, "train/loss_slope": -1.5416632348125478e-05} {"step": 38050, "timestamp": 1778235670.2826028, "grad/layer_0/attn": 0.0031695824582129717, "grad/layer_0/mlp": 0.002989437198266387, "grad/layer_0/attn_mlp_ratio": 1.0602605580826376, "grad/layer_4/attn": 0.0019079819321632385, "grad/layer_4/mlp": 0.0025849591474980116, "grad/layer_4/attn_mlp_ratio": 0.7381090955340898, "grad/layer_8/attn": 0.004049654118716717, "grad/layer_8/mlp": 0.003831523470580578, "grad/layer_8/attn_mlp_ratio": 1.056930498825846, "grad/layer_12/attn": 0.005130615085363388, "grad/layer_12/mlp": 0.006662216503173113, "grad/layer_12/attn_mlp_ratio": 0.7701063161050263, "grad/layer_16/attn": 0.0037260439712554216, "grad/layer_16/mlp": 0.004400050267577171, "grad/layer_16/attn_mlp_ratio": 0.8468184816045908, "grad/layer_20/attn": 0.003342072246596217, "grad/layer_20/mlp": 0.005949782207608223, "grad/layer_20/attn_mlp_ratio": 0.561713365936528, "grad/layer_24/attn": 0.014965305104851723, "grad/layer_24/mlp": 0.010930265299975872, "grad/layer_24/attn_mlp_ratio": 1.3691620978283618, "grad/layer_27/attn": 0.008418118581175804, "grad/layer_27/mlp": 0.01093754731118679, "grad/layer_27/attn_mlp_ratio": 0.7696532197488676} {"step": 38050, "timestamp": 1778235670.3002641, "train/loss": 2.169475960731506, "train/z_loss": 0.0014440755359828473, "train/perplexity": 8.753695558356346, "train/grad_norm": 0.177734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024596.8007722297, "perf/iters_per_sec": 0.9654029849873684, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0358368635177613, "data/tokens_consumed": 79798730752, "data/tokens_consumed_B": 79.798730752, "train/loss_slope": -1.3053636153181296e-05} {"step": 38060, "timestamp": 1778235680.6554437, "train/loss": 2.1779889106750487, "train/z_loss": 0.001419512869324535, "train/perplexity": 8.828533424007958, "train/grad_norm": 0.2333984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026617.113731462, "perf/iters_per_sec": 0.9663663452775297, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348042488098144, "data/tokens_consumed": 79819702272, "data/tokens_consumed_B": 79.819702272, "train/loss_slope": -1.1925362081381732e-05} {"step": 38070, "timestamp": 1778235691.0047386, "train/loss": 2.1830926656723024, "train/z_loss": 0.0014165793196298181, "train/perplexity": 8.873707275666215, "train/grad_norm": 0.208984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027420.9756627465, "perf/iters_per_sec": 0.9667496565164311, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343939542770386, "data/tokens_consumed": 79840673792, "data/tokens_consumed_B": 79.840673792, "train/loss_slope": -1.380145405993383e-05} {"step": 38080, "timestamp": 1778235701.3510394, "train/loss": 2.138505697250366, "train/z_loss": 0.0014448727597482502, "train/perplexity": 8.486746380512175, "train/grad_norm": 0.138671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027805.0308231865, "perf/iters_per_sec": 0.9669327882877286, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341980457305908, "data/tokens_consumed": 79861645312, "data/tokens_consumed_B": 79.861645312, "train/loss_slope": -1.518101832403846e-05} {"step": 38090, "timestamp": 1778235711.724743, "train/loss": 2.1314231157302856, "train/z_loss": 0.0014294274151325226, "train/perplexity": 8.426850665951147, "train/grad_norm": 0.1484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023108.7410156787, "perf/iters_per_sec": 0.9646934228018182, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0365987539291381, "data/tokens_consumed": 79882616832, "data/tokens_consumed_B": 79.882616832, "train/loss_slope": -1.691084234270283e-05} {"step": 38100, "timestamp": 1778235722.066632, "grad/layer_0/attn": 0.0024084695614874363, "grad/layer_0/mlp": 0.002718100557103753, "grad/layer_0/attn_mlp_ratio": 0.8860854932626957, "grad/layer_4/attn": 0.0017209345242008567, "grad/layer_4/mlp": 0.002395652001723647, "grad/layer_4/attn_mlp_ratio": 0.718357445541722, "grad/layer_8/attn": 0.005244317930191755, "grad/layer_8/mlp": 0.0036031436175107956, "grad/layer_8/attn_mlp_ratio": 1.4554839721505064, "grad/layer_12/attn": 0.004791622515767813, "grad/layer_12/mlp": 0.0064469282515347, "grad/layer_12/attn_mlp_ratio": 0.7432411614481741, "grad/layer_16/attn": 0.0033097541891038418, "grad/layer_16/mlp": 0.004324361681938171, "grad/layer_16/attn_mlp_ratio": 0.7653740265044189, "grad/layer_20/attn": 0.004157709423452616, "grad/layer_20/mlp": 0.006243682932108641, "grad/layer_20/attn_mlp_ratio": 0.6659065493990105, "grad/layer_24/attn": 0.004972629249095917, "grad/layer_24/mlp": 0.008048420771956444, "grad/layer_24/attn_mlp_ratio": 0.6178391175369969, "grad/layer_27/attn": 0.00887832697480917, "grad/layer_27/mlp": 0.006868620403110981, "grad/layer_27/attn_mlp_ratio": 1.292592445715692} {"step": 38100, "timestamp": 1778235722.6742952, "eos/sharpness": 12.672281265258787, "eos/L0_probe": 1.9972352981567383, "eos/L_plus": 2.0777904987335205, "eos/L_minus": 2.043402910232544, "eos/grad_norm": 0.09878797829151154, "eos/embed_grad_frac": 0.23835568130016327, "eos/time_s": 0.6048891544342041} {"step": 38100, "timestamp": 1778235722.6943665, "train/loss": 2.197475862503052, "train/z_loss": 0.001425420818850398, "train/perplexity": 9.002261850674353, "train/grad_norm": 0.09912109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1912741.5119857343, "perf/iters_per_sec": 0.9120662269524261, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0964116096496581, "data/tokens_consumed": 79903588352, "data/tokens_consumed_B": 79.903588352, "train/loss_slope": -1.72282560788007e-05} {"step": 38100, "timestamp": 1778235724.0587552, "geo/rankme_last": 439.4209289550781, "geo/layer_0/stable_rank_q_proj": 19.18984603881836, "geo/layer_0/stable_rank_k_proj": 16.289207458496094, "geo/layer_0/stable_rank_o_proj": 49.31074523925781, "geo/layer_0/stable_rank_gate_proj": 138.39610290527344, "geo/layer_0/stable_rank_down_proj": 52.86186599731445, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.061032846570014954, "geo/layer_0/attn_entropy_mean": 6.203875541687012, "geo/layer_0/attn_entropy_std": 0.3769802749156952, "geo/layer_7/stable_rank_q_proj": 43.16233444213867, "geo/layer_7/stable_rank_k_proj": 42.321083068847656, "geo/layer_7/stable_rank_o_proj": 99.25379180908203, "geo/layer_7/stable_rank_gate_proj": 91.51580047607422, "geo/layer_7/stable_rank_down_proj": 146.0422821044922, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5005032420158386, "geo/layer_7/attn_entropy_mean": 4.6572418212890625, "geo/layer_7/attn_entropy_std": 0.8006570339202881, "geo/layer_14/stable_rank_q_proj": 54.13457489013672, "geo/layer_14/stable_rank_k_proj": 36.99136734008789, "geo/layer_14/stable_rank_o_proj": 48.95440673828125, "geo/layer_14/stable_rank_gate_proj": 77.37767791748047, "geo/layer_14/stable_rank_down_proj": 133.02700805664062, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3925379812717438, "geo/layer_14/attn_entropy_mean": 5.496679306030273, "geo/layer_14/attn_entropy_std": 0.37765708565711975, "geo/layer_21/stable_rank_q_proj": 43.58167266845703, "geo/layer_21/stable_rank_k_proj": 31.162715911865234, "geo/layer_21/stable_rank_o_proj": 76.87519073486328, "geo/layer_21/stable_rank_gate_proj": 73.55821228027344, "geo/layer_21/stable_rank_down_proj": 55.980926513671875, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14573699235916138, "geo/layer_21/attn_entropy_mean": 5.722376823425293, "geo/layer_21/attn_entropy_std": 0.2783925235271454, "geo/layer_27/stable_rank_q_proj": 42.71824645996094, "geo/layer_27/stable_rank_k_proj": 31.4837703704834, "geo/layer_27/stable_rank_o_proj": 116.19281768798828, "geo/layer_27/stable_rank_gate_proj": 85.39979553222656, "geo/layer_27/stable_rank_down_proj": 132.28248596191406, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08984961360692978, "geo/layer_27/attn_entropy_mean": 4.29597806930542, "geo/layer_27/attn_entropy_std": 0.6630445122718811, "attnres/final_alpha/block_0": 0.24089279770851135, "attnres/block_norm/0": 1.7150259017944336, "attnres/final_alpha/block_1": 0.005346188321709633, "attnres/block_norm/1": 39549.1328125, "attnres/final_alpha/block_2": 0.011458463966846466, "attnres/block_norm/2": 26091.5625, "attnres/final_alpha/block_3": 0.01342422142624855, "attnres/block_norm/3": 45483.43359375, "attnres/final_alpha/block_4": 0.016542676836252213, "attnres/block_norm/4": 12609.3671875, "attnres/final_alpha/block_5": 0.5942825078964233, "attnres/block_norm/5": 5910.63916015625, "attnres/final_alpha/block_6": 0.118053138256073, "attnres/block_norm/6": 30018.3984375, "geo/tier1_time_s": 1.360483169555664, "geo/step": 38100.0, "geo/rankme_slope": -0.00010308095894607844} {"step": 38110, "timestamp": 1778235734.4214358, "train/loss": 2.190033507347107, "train/z_loss": 0.0014274552580900491, "train/perplexity": 8.935512515002086, "train/grad_norm": 0.09326171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1788899.7427493383, "perf/iters_per_sec": 0.8530138696428958, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1723138809204101, "data/tokens_consumed": 79924559872, "data/tokens_consumed_B": 79.924559872, "train/loss_slope": -1.5139367800019203e-05} {"step": 38120, "timestamp": 1778235744.7732742, "train/loss": 2.21498966217041, "train/z_loss": 0.001421323453541845, "train/perplexity": 9.161314403012147, "train/grad_norm": 0.1689453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027040.3825470535, "perf/iters_per_sec": 0.9665681755767124, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345881700515747, "data/tokens_consumed": 79945531392, "data/tokens_consumed_B": 79.945531392, "train/loss_slope": -1.3389720560514986e-05} {"step": 38130, "timestamp": 1778235755.120227, "train/loss": 2.1940728187561036, "train/z_loss": 0.0014198775636032224, "train/perplexity": 8.971678826973221, "train/grad_norm": 0.1494140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027578.7025090354, "perf/iters_per_sec": 0.9668248665375878, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343134880065918, "data/tokens_consumed": 79966502912, "data/tokens_consumed_B": 79.966502912, "train/loss_slope": -1.1791428767128953e-05} {"step": 38140, "timestamp": 1778235765.4802186, "train/loss": 2.2009233713150023, "train/z_loss": 0.0014322847477160395, "train/perplexity": 9.033350786630553, "train/grad_norm": 0.109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025289.8422998355, "perf/iters_per_sec": 0.9657334529399088, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035482406616211, "data/tokens_consumed": 79987474432, "data/tokens_consumed_B": 79.987474432, "train/loss_slope": -1.1022411435708313e-05} {"step": 38150, "timestamp": 1778235775.8191185, "grad/layer_0/attn": 0.003092673607170582, "grad/layer_0/mlp": 0.0030149915255606174, "grad/layer_0/attn_mlp_ratio": 1.025765239595158, "grad/layer_4/attn": 0.001943587907589972, "grad/layer_4/mlp": 0.002726426348090172, "grad/layer_4/attn_mlp_ratio": 0.7128701047304729, "grad/layer_8/attn": 0.0037486942019313574, "grad/layer_8/mlp": 0.0037115919403731823, "grad/layer_8/attn_mlp_ratio": 1.0099962930070419, "grad/layer_12/attn": 0.004937293007969856, "grad/layer_12/mlp": 0.006667265202850103, "grad/layer_12/attn_mlp_ratio": 0.7405274552160503, "grad/layer_16/attn": 0.0037946519441902637, "grad/layer_16/mlp": 0.0045354366302490234, "grad/layer_16/attn_mlp_ratio": 0.8366673751354292, "grad/layer_20/attn": 0.0033255591988563538, "grad/layer_20/mlp": 0.005642507690936327, "grad/layer_20/attn_mlp_ratio": 0.5893760934097896, "grad/layer_24/attn": 0.0075009409338235855, "grad/layer_24/mlp": 0.00786388199776411, "grad/layer_24/attn_mlp_ratio": 0.9538470745837202, "grad/layer_27/attn": 0.004836498759686947, "grad/layer_27/mlp": 0.007615944836288691, "grad/layer_27/attn_mlp_ratio": 0.6350490714083091} {"step": 38150, "timestamp": 1778235775.835053, "train/loss": 2.180974268913269, "train/z_loss": 0.00142608581809327, "train/perplexity": 8.854929139726394, "train/grad_norm": 0.1025390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026204.7566537927, "perf/iters_per_sec": 0.9661697181004489, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035014843940735, "data/tokens_consumed": 80008445952, "data/tokens_consumed_B": 80.008445952, "train/loss_slope": -1.1018672663279141e-05} {"step": 38160, "timestamp": 1778235786.185464, "train/loss": 2.181904649734497, "train/z_loss": 0.0014211031841114164, "train/perplexity": 8.863171429610997, "train/grad_norm": 0.09033203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027120.684862717, "perf/iters_per_sec": 0.9666064667047105, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345471858978272, "data/tokens_consumed": 80029417472, "data/tokens_consumed_B": 80.029417472, "train/loss_slope": -9.88851892601209e-06} {"step": 38170, "timestamp": 1778235796.5372856, "train/loss": 2.201671576499939, "train/z_loss": 0.0014291879371739924, "train/perplexity": 9.040112115641753, "train/grad_norm": 0.09619140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026988.2992227103, "perf/iters_per_sec": 0.9665433403123428, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346147537231445, "data/tokens_consumed": 80050388992, "data/tokens_consumed_B": 80.050388992, "train/loss_slope": -3.7580416433595094e-06} {"step": 38175, "timestamp": 1778235802.3179672, "eos/sharpness": 31.715464591979973, "eos/L0_probe": 1.993498682975769, "eos/L_plus": 2.1565990447998047, "eos/L_minus": 2.147552967071533, "eos/grad_norm": 0.12790921330451965, "eos/embed_grad_frac": 0.13049417734146118, "eos/time_s": 0.6112799644470215} {"step": 38175, "timestamp": 1778235803.6983726, "geo/rankme_last": 439.39996337890625, "geo/layer_0/stable_rank_q_proj": 19.175434112548828, "geo/layer_0/stable_rank_k_proj": 16.258445739746094, "geo/layer_0/stable_rank_o_proj": 49.2475700378418, "geo/layer_0/stable_rank_gate_proj": 138.21939086914062, "geo/layer_0/stable_rank_down_proj": 52.91513442993164, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.057031258940696716, "geo/layer_0/attn_entropy_mean": 6.20260763168335, "geo/layer_0/attn_entropy_std": 0.37827005982398987, "geo/layer_7/stable_rank_q_proj": 43.16059875488281, "geo/layer_7/stable_rank_k_proj": 42.283241271972656, "geo/layer_7/stable_rank_o_proj": 99.50218963623047, "geo/layer_7/stable_rank_gate_proj": 91.4848861694336, "geo/layer_7/stable_rank_down_proj": 146.20339965820312, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5058362483978271, "geo/layer_7/attn_entropy_mean": 4.650754928588867, "geo/layer_7/attn_entropy_std": 0.8279147744178772, "geo/layer_14/stable_rank_q_proj": 54.18737030029297, "geo/layer_14/stable_rank_k_proj": 36.9662971496582, "geo/layer_14/stable_rank_o_proj": 48.92450714111328, "geo/layer_14/stable_rank_gate_proj": 77.22178649902344, "geo/layer_14/stable_rank_down_proj": 132.99916076660156, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3789098262786865, "geo/layer_14/attn_entropy_mean": 5.498814582824707, "geo/layer_14/attn_entropy_std": 0.36393845081329346, "geo/layer_21/stable_rank_q_proj": 43.598602294921875, "geo/layer_21/stable_rank_k_proj": 31.0797176361084, "geo/layer_21/stable_rank_o_proj": 76.96195983886719, "geo/layer_21/stable_rank_gate_proj": 73.44581604003906, "geo/layer_21/stable_rank_down_proj": 55.939598083496094, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1442248523235321, "geo/layer_21/attn_entropy_mean": 5.718923091888428, "geo/layer_21/attn_entropy_std": 0.2885664701461792, "geo/layer_27/stable_rank_q_proj": 42.621822357177734, "geo/layer_27/stable_rank_k_proj": 31.402450561523438, "geo/layer_27/stable_rank_o_proj": 116.26862335205078, "geo/layer_27/stable_rank_gate_proj": 85.34803009033203, "geo/layer_27/stable_rank_down_proj": 132.4696502685547, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08020272105932236, "geo/layer_27/attn_entropy_mean": 4.290322303771973, "geo/layer_27/attn_entropy_std": 0.6607220768928528, "attnres/final_alpha/block_0": 0.24109847843647003, "attnres/block_norm/0": 1.7156097888946533, "attnres/final_alpha/block_1": 0.005315390415489674, "attnres/block_norm/1": 39446.40234375, "attnres/final_alpha/block_2": 0.011398445814847946, "attnres/block_norm/2": 26036.52734375, "attnres/final_alpha/block_3": 0.013426201418042183, "attnres/block_norm/3": 45685.36328125, "attnres/final_alpha/block_4": 0.0165992584079504, "attnres/block_norm/4": 12626.0078125, "attnres/final_alpha/block_5": 0.5931831002235413, "attnres/block_norm/5": 5942.18017578125, "attnres/final_alpha/block_6": 0.11897913366556168, "attnres/block_norm/6": 29950.373046875, "geo/tier1_time_s": 1.3602550029754639, "geo/step": 38175.0, "geo/rankme_slope": -7.77814836872249e-05} {"step": 38180, "timestamp": 1778235808.8972683, "train/loss": 2.1571568965911867, "train/z_loss": 0.0014278760878369211, "train/perplexity": 8.646519729669734, "train/grad_norm": 0.267578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1697498.2471736644, "perf/iters_per_sec": 0.809430240237076, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2354369163513184, "data/tokens_consumed": 80071360512, "data/tokens_consumed_B": 80.071360512, "train/loss_slope": -4.799231057978154e-06} {"step": 38190, "timestamp": 1778235819.2530432, "train/loss": 2.236552381515503, "train/z_loss": 0.0014046365395188332, "train/perplexity": 9.36100242567782, "train/grad_norm": 0.09375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026343.2484592604, "perf/iters_per_sec": 0.9662357561394026, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349441051483155, "data/tokens_consumed": 80092332032, "data/tokens_consumed_B": 80.092332032, "train/loss_slope": 5.707216448803328e-07} {"step": 38200, "timestamp": 1778235829.6018205, "grad/layer_0/attn": 0.0026624437887221575, "grad/layer_0/mlp": 0.0027829348109662533, "grad/layer_0/attn_mlp_ratio": 0.9567035787400933, "grad/layer_4/attn": 0.002168683335185051, "grad/layer_4/mlp": 0.002343194093555212, "grad/layer_4/attn_mlp_ratio": 0.9255243723076215, "grad/layer_8/attn": 0.003349805949255824, "grad/layer_8/mlp": 0.0037750289775431156, "grad/layer_8/attn_mlp_ratio": 0.8873589793475081, "grad/layer_12/attn": 0.005193871911615133, "grad/layer_12/mlp": 0.006650153547525406, "grad/layer_12/attn_mlp_ratio": 0.7810153248937082, "grad/layer_16/attn": 0.0060807173140347, "grad/layer_16/mlp": 0.00480744568631053, "grad/layer_16/attn_mlp_ratio": 1.2648540585418317, "grad/layer_20/attn": 0.0038987270090729, "grad/layer_20/mlp": 0.006139036267995834, "grad/layer_20/attn_mlp_ratio": 0.6350714958128988, "grad/layer_24/attn": 0.011307038366794586, "grad/layer_24/mlp": 0.010521226562559605, "grad/layer_24/attn_mlp_ratio": 1.0746882211968058, "grad/layer_27/attn": 0.008327904157340527, "grad/layer_27/mlp": 0.01069905236363411, "grad/layer_27/attn_mlp_ratio": 0.7783777288359811} {"step": 38200, "timestamp": 1778235829.617472, "train/loss": 2.1708690881729127, "train/z_loss": 0.0014273066306486725, "train/perplexity": 8.765899070402448, "train/grad_norm": 0.201171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024923.7069736174, "perf/iters_per_sec": 0.9655588660114371, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0356696367263794, "data/tokens_consumed": 80113303552, "data/tokens_consumed_B": 80.113303552, "train/loss_slope": 2.019975437904896e-06} {"step": 38210, "timestamp": 1778235839.9813044, "train/loss": 2.113012266159058, "train/z_loss": 0.0014321905327960849, "train/perplexity": 8.273124641551641, "train/grad_norm": 0.228515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025092.4214245514, "perf/iters_per_sec": 0.9656393153307683, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0355833530426026, "data/tokens_consumed": 80134275072, "data/tokens_consumed_B": 80.134275072, "train/loss_slope": -8.838611074012422e-07} {"step": 38220, "timestamp": 1778235850.3339195, "train/loss": 2.1411973595619203, "train/z_loss": 0.0014249862055294217, "train/perplexity": 8.509620606928493, "train/grad_norm": 0.236328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026997.2209295528, "perf/iters_per_sec": 0.9665475945136799, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346101999282837, "data/tokens_consumed": 80155246592, "data/tokens_consumed_B": 80.155246592, "train/loss_slope": -2.05939150843709e-06} {"step": 38230, "timestamp": 1778235860.6920533, "train/loss": 2.1649471282958985, "train/z_loss": 0.0014197857584804297, "train/perplexity": 8.71414117317759, "train/grad_norm": 0.091796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026099.7451550043, "perf/iters_per_sec": 0.966119644715788, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350684881210328, "data/tokens_consumed": 80176218112, "data/tokens_consumed_B": 80.176218112, "train/loss_slope": -7.184537437774476e-07} {"step": 38240, "timestamp": 1778235871.0474172, "train/loss": 2.1360297203063965, "train/z_loss": 0.0014355463674291969, "train/perplexity": 8.465759384525466, "train/grad_norm": 0.2314453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026573.8767547384, "perf/iters_per_sec": 0.9663457282804195, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348263263702393, "data/tokens_consumed": 80197189632, "data/tokens_consumed_B": 80.197189632, "train/loss_slope": -4.40749493297633e-06} {"step": 38250, "timestamp": 1778235881.3848662, "grad/layer_0/attn": 0.0026195060927420855, "grad/layer_0/mlp": 0.0026741144247353077, "grad/layer_0/attn_mlp_ratio": 0.9795788731230097, "grad/layer_4/attn": 0.001871407264843583, "grad/layer_4/mlp": 0.0025386177003383636, "grad/layer_4/attn_mlp_ratio": 0.7371756648811608, "grad/layer_8/attn": 0.005320959724485874, "grad/layer_8/mlp": 0.00379198812879622, "grad/layer_8/attn_mlp_ratio": 1.4032110342745003, "grad/layer_12/attn": 0.005409239791333675, "grad/layer_12/mlp": 0.007180275861173868, "grad/layer_12/attn_mlp_ratio": 0.7533470608348799, "grad/layer_16/attn": 0.003295535920187831, "grad/layer_16/mlp": 0.004241663962602615, "grad/layer_16/attn_mlp_ratio": 0.7769441123929425, "grad/layer_20/attn": 0.005795569624751806, "grad/layer_20/mlp": 0.0056608738377690315, "grad/layer_20/attn_mlp_ratio": 1.0237941505964465, "grad/layer_24/attn": 0.0049814824014902115, "grad/layer_24/mlp": 0.007934462279081345, "grad/layer_24/attn_mlp_ratio": 0.6278285992789558, "grad/layer_27/attn": 0.0051830122247338295, "grad/layer_27/mlp": 0.006598086096346378, "grad/layer_27/attn_mlp_ratio": 0.7855326636387183} {"step": 38250, "timestamp": 1778235881.9927347, "eos/sharpness": 6.250500679016112, "eos/L0_probe": 2.0000786781311035, "eos/L_plus": 2.0299110412597656, "eos/L_minus": 2.0327513217926025, "eos/grad_norm": 0.08922646194696426, "eos/embed_grad_frac": 0.25941506028175354, "eos/time_s": 0.6051256656646729} {"step": 38250, "timestamp": 1778235882.0139682, "train/loss": 2.215085804462433, "train/z_loss": 0.0014004197670146822, "train/perplexity": 9.162195235118725, "train/grad_norm": 0.08935546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1913331.0698596423, "perf/iters_per_sec": 0.9123473500536167, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0960737705230712, "data/tokens_consumed": 80218161152, "data/tokens_consumed_B": 80.218161152, "train/loss_slope": -4.264231140177844e-06} {"step": 38250, "timestamp": 1778235883.3737473, "geo/rankme_last": 438.7356872558594, "geo/layer_0/stable_rank_q_proj": 19.191574096679688, "geo/layer_0/stable_rank_k_proj": 16.245563507080078, "geo/layer_0/stable_rank_o_proj": 49.267574310302734, "geo/layer_0/stable_rank_gate_proj": 138.11911010742188, "geo/layer_0/stable_rank_down_proj": 52.95777130126953, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06043484807014465, "geo/layer_0/attn_entropy_mean": 6.201786041259766, "geo/layer_0/attn_entropy_std": 0.3758879005908966, "geo/layer_7/stable_rank_q_proj": 43.05504608154297, "geo/layer_7/stable_rank_k_proj": 42.40318298339844, "geo/layer_7/stable_rank_o_proj": 99.4146728515625, "geo/layer_7/stable_rank_gate_proj": 91.34598541259766, "geo/layer_7/stable_rank_down_proj": 146.13160705566406, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.51467365026474, "geo/layer_7/attn_entropy_mean": 4.649755001068115, "geo/layer_7/attn_entropy_std": 0.810055673122406, "geo/layer_14/stable_rank_q_proj": 54.325714111328125, "geo/layer_14/stable_rank_k_proj": 36.88921356201172, "geo/layer_14/stable_rank_o_proj": 48.94596862792969, "geo/layer_14/stable_rank_gate_proj": 77.2682876586914, "geo/layer_14/stable_rank_down_proj": 133.0750732421875, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38606274127960205, "geo/layer_14/attn_entropy_mean": 5.502662181854248, "geo/layer_14/attn_entropy_std": 0.3767530620098114, "geo/layer_21/stable_rank_q_proj": 43.495872497558594, "geo/layer_21/stable_rank_k_proj": 31.10118293762207, "geo/layer_21/stable_rank_o_proj": 76.84529876708984, "geo/layer_21/stable_rank_gate_proj": 73.49394989013672, "geo/layer_21/stable_rank_down_proj": 55.967281341552734, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14139896631240845, "geo/layer_21/attn_entropy_mean": 5.726709365844727, "geo/layer_21/attn_entropy_std": 0.292917400598526, "geo/layer_27/stable_rank_q_proj": 42.632972717285156, "geo/layer_27/stable_rank_k_proj": 31.34543228149414, "geo/layer_27/stable_rank_o_proj": 116.53985595703125, "geo/layer_27/stable_rank_gate_proj": 85.27143859863281, "geo/layer_27/stable_rank_down_proj": 132.1831817626953, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09020122140645981, "geo/layer_27/attn_entropy_mean": 4.295827865600586, "geo/layer_27/attn_entropy_std": 0.6373781561851501, "attnres/final_alpha/block_0": 0.24195781350135803, "attnres/block_norm/0": 1.7159301042556763, "attnres/final_alpha/block_1": 0.005400598049163818, "attnres/block_norm/1": 39490.2109375, "attnres/final_alpha/block_2": 0.01164859626442194, "attnres/block_norm/2": 26148.11328125, "attnres/final_alpha/block_3": 0.013629917055368423, "attnres/block_norm/3": 45804.2890625, "attnres/final_alpha/block_4": 0.016707655042409897, "attnres/block_norm/4": 12652.3154296875, "attnres/final_alpha/block_5": 0.590653121471405, "attnres/block_norm/5": 5964.7626953125, "attnres/final_alpha/block_6": 0.12000226229429245, "attnres/block_norm/6": 30070.69140625, "geo/tier1_time_s": 1.3559863567352295, "geo/step": 38250.0, "geo/rankme_slope": -7.85809050182573e-05} {"step": 38260, "timestamp": 1778235893.7378561, "train/loss": 2.2287696838378905, "train/z_loss": 0.0014189883251674474, "train/perplexity": 9.28843133955559, "train/grad_norm": 0.2080078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1789359.4336342637, "perf/iters_per_sec": 0.8532330673381155, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.172012710571289, "data/tokens_consumed": 80239132672, "data/tokens_consumed_B": 80.239132672, "train/loss_slope": 6.172629830503847e-07} {"step": 38270, "timestamp": 1778235904.0897312, "train/loss": 2.1640202045440673, "train/z_loss": 0.001420517556834966, "train/perplexity": 8.706067571132172, "train/grad_norm": 0.1767578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026916.7415671616, "perf/iters_per_sec": 0.9665092189632233, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034651279449463, "data/tokens_consumed": 80260104192, "data/tokens_consumed_B": 80.260104192, "train/loss_slope": 2.583099992433894e-07} {"step": 38280, "timestamp": 1778235914.444153, "train/loss": 2.201738691329956, "train/z_loss": 0.0014170910697430372, "train/perplexity": 9.040718861590326, "train/grad_norm": 0.11083984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026440.1619715847, "perf/iters_per_sec": 0.9662819681032108, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348946094512939, "data/tokens_consumed": 80281075712, "data/tokens_consumed_B": 80.281075712, "train/loss_slope": 1.343122030785281e-06} {"step": 38290, "timestamp": 1778235924.8037655, "train/loss": 2.1947171211242678, "train/z_loss": 0.0014086310635320841, "train/perplexity": 8.977461163474366, "train/grad_norm": 0.1591796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025363.243814729, "perf/iters_per_sec": 0.9657684535096783, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354448795318603, "data/tokens_consumed": 80302047232, "data/tokens_consumed_B": 80.302047232, "train/loss_slope": 3.3516642284079195e-06} {"step": 38300, "timestamp": 1778235935.139976, "grad/layer_0/attn": 0.002641416620463133, "grad/layer_0/mlp": 0.0029184790328145027, "grad/layer_0/attn_mlp_ratio": 0.9050661321384271, "grad/layer_4/attn": 0.0019104414386674762, "grad/layer_4/mlp": 0.0024720625951886177, "grad/layer_4/attn_mlp_ratio": 0.7728126970184751, "grad/layer_8/attn": 0.005730283912271261, "grad/layer_8/mlp": 0.0035934573970735073, "grad/layer_8/attn_mlp_ratio": 1.594643575703337, "grad/layer_12/attn": 0.004811884369701147, "grad/layer_12/mlp": 0.007215356454253197, "grad/layer_12/attn_mlp_ratio": 0.6668948836443449, "grad/layer_16/attn": 0.003816274693235755, "grad/layer_16/mlp": 0.00514012947678566, "grad/layer_16/attn_mlp_ratio": 0.7424471769099318, "grad/layer_20/attn": 0.007876518182456493, "grad/layer_20/mlp": 0.006321795284748077, "grad/layer_20/attn_mlp_ratio": 1.2459305787497221, "grad/layer_24/attn": 0.012545718811452389, "grad/layer_24/mlp": 0.010768256150186062, "grad/layer_24/attn_mlp_ratio": 1.165065031883469, "grad/layer_27/attn": 0.003591579617932439, "grad/layer_27/mlp": 0.00986446626484394, "grad/layer_27/attn_mlp_ratio": 0.36409264171983} {"step": 38300, "timestamp": 1778235935.1557264, "train/loss": 2.188591480255127, "train/z_loss": 0.0014236447517760099, "train/perplexity": 8.922636549851251, "train/grad_norm": 0.1474609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026846.8702985647, "perf/iters_per_sec": 0.9664759017460655, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346869468688964, "data/tokens_consumed": 80323018752, "data/tokens_consumed_B": 80.323018752, "train/loss_slope": 1.0086874685737401e-06} {"step": 38310, "timestamp": 1778235945.50503, "train/loss": 2.122510826587677, "train/z_loss": 0.0014313276158645749, "train/perplexity": 8.352081811977328, "train/grad_norm": 0.1640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027485.4654520107, "perf/iters_per_sec": 0.966780407644277, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343610525131226, "data/tokens_consumed": 80343990272, "data/tokens_consumed_B": 80.343990272, "train/loss_slope": 3.966354658537927e-07} {"step": 38320, "timestamp": 1778235955.852434, "train/loss": 2.237700414657593, "train/z_loss": 0.0014167288783937693, "train/perplexity": 9.371755337874461, "train/grad_norm": 0.1318359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027645.3056572664, "perf/iters_per_sec": 0.9668566253935177, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342795133590699, "data/tokens_consumed": 80364961792, "data/tokens_consumed_B": 80.364961792, "train/loss_slope": 6.555655551250107e-06} {"step": 38325, "timestamp": 1778235961.627135, "eos/sharpness": 6.1208248138427725, "eos/L0_probe": 2.0017573833465576, "eos/L_plus": 2.0324366092681885, "eos/L_minus": 2.0322864055633545, "eos/grad_norm": 0.09276562184095383, "eos/embed_grad_frac": 0.26233550906181335, "eos/time_s": 0.6105690002441406} {"step": 38325, "timestamp": 1778235963.0063593, "geo/rankme_last": 438.9200439453125, "geo/layer_0/stable_rank_q_proj": 19.191003799438477, "geo/layer_0/stable_rank_k_proj": 16.237287521362305, "geo/layer_0/stable_rank_o_proj": 49.24409103393555, "geo/layer_0/stable_rank_gate_proj": 137.88400268554688, "geo/layer_0/stable_rank_down_proj": 52.987579345703125, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05813330411911011, "geo/layer_0/attn_entropy_mean": 6.196353435516357, "geo/layer_0/attn_entropy_std": 0.36778607964515686, "geo/layer_7/stable_rank_q_proj": 42.95077896118164, "geo/layer_7/stable_rank_k_proj": 42.42216873168945, "geo/layer_7/stable_rank_o_proj": 99.6531982421875, "geo/layer_7/stable_rank_gate_proj": 91.40325927734375, "geo/layer_7/stable_rank_down_proj": 145.6254119873047, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5164411664009094, "geo/layer_7/attn_entropy_mean": 4.672338485717773, "geo/layer_7/attn_entropy_std": 0.8151953220367432, "geo/layer_14/stable_rank_q_proj": 54.14951705932617, "geo/layer_14/stable_rank_k_proj": 36.73750305175781, "geo/layer_14/stable_rank_o_proj": 48.93351745605469, "geo/layer_14/stable_rank_gate_proj": 77.04029846191406, "geo/layer_14/stable_rank_down_proj": 133.5009002685547, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3929762840270996, "geo/layer_14/attn_entropy_mean": 5.503259658813477, "geo/layer_14/attn_entropy_std": 0.35922932624816895, "geo/layer_21/stable_rank_q_proj": 43.55655288696289, "geo/layer_21/stable_rank_k_proj": 31.05701446533203, "geo/layer_21/stable_rank_o_proj": 76.83473205566406, "geo/layer_21/stable_rank_gate_proj": 73.41292572021484, "geo/layer_21/stable_rank_down_proj": 55.94599151611328, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15156902372837067, "geo/layer_21/attn_entropy_mean": 5.731772422790527, "geo/layer_21/attn_entropy_std": 0.2911251485347748, "geo/layer_27/stable_rank_q_proj": 42.586429595947266, "geo/layer_27/stable_rank_k_proj": 31.313806533813477, "geo/layer_27/stable_rank_o_proj": 116.49270629882812, "geo/layer_27/stable_rank_gate_proj": 85.09709167480469, "geo/layer_27/stable_rank_down_proj": 132.18861389160156, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08909512311220169, "geo/layer_27/attn_entropy_mean": 4.28645133972168, "geo/layer_27/attn_entropy_std": 0.6529196500778198, "attnres/final_alpha/block_0": 0.24045029282569885, "attnres/block_norm/0": 1.7162953615188599, "attnres/final_alpha/block_1": 0.005423463881015778, "attnres/block_norm/1": 39495.984375, "attnres/final_alpha/block_2": 0.011448712088167667, "attnres/block_norm/2": 26160.50390625, "attnres/final_alpha/block_3": 0.013384316116571426, "attnres/block_norm/3": 45714.30859375, "attnres/final_alpha/block_4": 0.016556251794099808, "attnres/block_norm/4": 12644.537109375, "attnres/final_alpha/block_5": 0.5936565399169922, "attnres/block_norm/5": 5954.833984375, "attnres/final_alpha/block_6": 0.11908039450645447, "attnres/block_norm/6": 30315.87890625, "geo/tier1_time_s": 1.3579599857330322, "geo/step": 38325.0, "geo/rankme_slope": -5.4605475002501e-05} {"step": 38330, "timestamp": 1778235968.1836274, "train/loss": 2.156835341453552, "train/z_loss": 0.0014235719689168036, "train/perplexity": 8.643739843795245, "train/grad_norm": 0.228515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1701412.93000344, "perf/iters_per_sec": 0.8112969064728928, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.23259437084198, "data/tokens_consumed": 80385933312, "data/tokens_consumed_B": 80.385933312, "train/loss_slope": 7.740240099907284e-06} {"step": 38340, "timestamp": 1778235978.5350266, "train/loss": 2.191235613822937, "train/z_loss": 0.0014211072120815516, "train/perplexity": 8.946260411224777, "train/grad_norm": 0.1630859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026932.3418264035, "perf/iters_per_sec": 0.9665166577465074, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346433162689208, "data/tokens_consumed": 80406904832, "data/tokens_consumed_B": 80.406904832, "train/loss_slope": 1.3231680280912698e-05} {"step": 38350, "timestamp": 1778235988.8807347, "grad/layer_0/attn": 0.002548488089814782, "grad/layer_0/mlp": 0.002739825053140521, "grad/layer_0/attn_mlp_ratio": 0.9301644985971389, "grad/layer_4/attn": 0.0017918006051331758, "grad/layer_4/mlp": 0.0025611829478293657, "grad/layer_4/attn_mlp_ratio": 0.6995988071417812, "grad/layer_8/attn": 0.0034120259806513786, "grad/layer_8/mlp": 0.0035833800211548805, "grad/layer_8/attn_mlp_ratio": 0.9521808642371224, "grad/layer_12/attn": 0.0056356750428676605, "grad/layer_12/mlp": 0.0066415672190487385, "grad/layer_12/attn_mlp_ratio": 0.8485459488913002, "grad/layer_16/attn": 0.005149081815034151, "grad/layer_16/mlp": 0.004345115274190903, "grad/layer_16/attn_mlp_ratio": 1.1850276394543273, "grad/layer_20/attn": 0.004718002863228321, "grad/layer_20/mlp": 0.005527763161808252, "grad/layer_20/attn_mlp_ratio": 0.8535102969813073, "grad/layer_24/attn": 0.007164679933339357, "grad/layer_24/mlp": 0.008556564338505268, "grad/layer_24/attn_mlp_ratio": 0.8373313828033233, "grad/layer_27/attn": 0.006401576101779938, "grad/layer_27/mlp": 0.006885398644953966, "grad/layer_27/attn_mlp_ratio": 0.9297320807268276} {"step": 38350, "timestamp": 1778235988.8966312, "train/loss": 2.1322181105613707, "train/z_loss": 0.0014231552369892598, "train/perplexity": 8.43355263233424, "train/grad_norm": 0.1025390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025020.4847460084, "perf/iters_per_sec": 0.9656050132494013, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035620141029358, "data/tokens_consumed": 80427876352, "data/tokens_consumed_B": 80.427876352, "train/loss_slope": 1.2913144096182292e-05} {"step": 38360, "timestamp": 1778235999.2576418, "train/loss": 2.172906589508057, "train/z_loss": 0.0014196998788975179, "train/perplexity": 8.783777809254106, "train/grad_norm": 0.2099609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025617.9053075556, "perf/iters_per_sec": 0.9658898855722216, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0353147029876708, "data/tokens_consumed": 80448847872, "data/tokens_consumed_B": 80.448847872, "train/loss_slope": 1.1528785565648468e-05} {"step": 38370, "timestamp": 1778236009.6124403, "train/loss": 2.1490928411483763, "train/z_loss": 0.001423294621054083, "train/perplexity": 8.57707409822464, "train/grad_norm": 0.12158203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026406.1291474681, "perf/iters_per_sec": 0.9662657399880734, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349119901657104, "data/tokens_consumed": 80469819392, "data/tokens_consumed_B": 80.469819392, "train/loss_slope": 1.0339592735652268e-05} {"step": 38380, "timestamp": 1778236019.965087, "train/loss": 2.167035698890686, "train/z_loss": 0.0014187583001330495, "train/perplexity": 8.732360291525296, "train/grad_norm": 0.0849609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026770.4190888787, "perf/iters_per_sec": 0.9664394469684976, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347259759902954, "data/tokens_consumed": 80490790912, "data/tokens_consumed_B": 80.490790912, "train/loss_slope": 1.0994127545670137e-05} {"step": 38390, "timestamp": 1778236030.3133516, "train/loss": 2.2551405906677244, "train/z_loss": 0.0014048975659534334, "train/perplexity": 9.536633977640642, "train/grad_norm": 0.1259765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027568.3268335504, "perf/iters_per_sec": 0.9668199190299751, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034318780899048, "data/tokens_consumed": 80511762432, "data/tokens_consumed_B": 80.511762432, "train/loss_slope": 1.1948496781059392e-05} {"step": 38400, "timestamp": 1778236040.6578114, "grad/layer_0/attn": 0.0030773780308663845, "grad/layer_0/mlp": 0.003148482646793127, "grad/layer_0/attn_mlp_ratio": 0.9774161964205883, "grad/layer_4/attn": 0.002099480014294386, "grad/layer_4/mlp": 0.0027878093533217907, "grad/layer_4/attn_mlp_ratio": 0.7530930823814979, "grad/layer_8/attn": 0.005921011790633202, "grad/layer_8/mlp": 0.0038156749214977026, "grad/layer_8/attn_mlp_ratio": 1.551759978843567, "grad/layer_12/attn": 0.006812450475990772, "grad/layer_12/mlp": 0.006340173073112965, "grad/layer_12/attn_mlp_ratio": 1.0744896535130324, "grad/layer_16/attn": 0.005142866633832455, "grad/layer_16/mlp": 0.0050599765963852406, "grad/layer_16/attn_mlp_ratio": 1.0163814860069276, "grad/layer_20/attn": 0.004080900922417641, "grad/layer_20/mlp": 0.006686351262032986, "grad/layer_20/attn_mlp_ratio": 0.6103330054699433, "grad/layer_24/attn": 0.014828301034867764, "grad/layer_24/mlp": 0.012227721512317657, "grad/layer_24/attn_mlp_ratio": 1.2126789850964876, "grad/layer_27/attn": 0.005469632800668478, "grad/layer_27/mlp": 0.01178866159170866, "grad/layer_27/attn_mlp_ratio": 0.4639740238296468} {"step": 38400, "timestamp": 1778236041.264095, "eos/sharpness": 59.057497978210435, "eos/L0_probe": 2.001347541809082, "eos/L_plus": 2.347968101501465, "eos/L_minus": 2.2453019618988037, "eos/grad_norm": 0.2012527585029602, "eos/embed_grad_frac": 0.06388308107852936, "eos/time_s": 0.6034660339355469} {"step": 38400, "timestamp": 1778236041.2845216, "train/loss": 2.150836873054504, "train/z_loss": 0.0014205983374267817, "train/perplexity": 8.592045840916743, "train/grad_norm": 0.201171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1912646.018674151, "perf/iters_per_sec": 0.9120206921931033, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.09646635055542, "data/tokens_consumed": 80532733952, "data/tokens_consumed_B": 80.532733952, "train/loss_slope": 1.3779900963156961e-05} {"step": 38400, "timestamp": 1778236042.6456528, "geo/rankme_last": 438.91204833984375, "geo/layer_0/stable_rank_q_proj": 19.19407844543457, "geo/layer_0/stable_rank_k_proj": 16.245840072631836, "geo/layer_0/stable_rank_o_proj": 49.285186767578125, "geo/layer_0/stable_rank_gate_proj": 138.107421875, "geo/layer_0/stable_rank_down_proj": 53.034244537353516, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06035982072353363, "geo/layer_0/attn_entropy_mean": 6.202007293701172, "geo/layer_0/attn_entropy_std": 0.3692852258682251, "geo/layer_7/stable_rank_q_proj": 42.94679260253906, "geo/layer_7/stable_rank_k_proj": 42.38327407836914, "geo/layer_7/stable_rank_o_proj": 99.55679321289062, "geo/layer_7/stable_rank_gate_proj": 91.36995697021484, "geo/layer_7/stable_rank_down_proj": 145.8174285888672, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5096937417984009, "geo/layer_7/attn_entropy_mean": 4.662264823913574, "geo/layer_7/attn_entropy_std": 0.8049674034118652, "geo/layer_14/stable_rank_q_proj": 54.19947814941406, "geo/layer_14/stable_rank_k_proj": 36.75178527832031, "geo/layer_14/stable_rank_o_proj": 48.930660247802734, "geo/layer_14/stable_rank_gate_proj": 76.98724365234375, "geo/layer_14/stable_rank_down_proj": 133.18812561035156, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37817782163619995, "geo/layer_14/attn_entropy_mean": 5.512942790985107, "geo/layer_14/attn_entropy_std": 0.364599347114563, "geo/layer_21/stable_rank_q_proj": 43.566368103027344, "geo/layer_21/stable_rank_k_proj": 31.010936737060547, "geo/layer_21/stable_rank_o_proj": 76.8002700805664, "geo/layer_21/stable_rank_gate_proj": 73.44969940185547, "geo/layer_21/stable_rank_down_proj": 55.961204528808594, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14969530701637268, "geo/layer_21/attn_entropy_mean": 5.721315383911133, "geo/layer_21/attn_entropy_std": 0.27703914046287537, "geo/layer_27/stable_rank_q_proj": 42.61013412475586, "geo/layer_27/stable_rank_k_proj": 31.298873901367188, "geo/layer_27/stable_rank_o_proj": 116.54590606689453, "geo/layer_27/stable_rank_gate_proj": 85.12641906738281, "geo/layer_27/stable_rank_down_proj": 132.24447631835938, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09018928557634354, "geo/layer_27/attn_entropy_mean": 4.290349960327148, "geo/layer_27/attn_entropy_std": 0.6393610835075378, "attnres/final_alpha/block_0": 0.2414686381816864, "attnres/block_norm/0": 1.7161781787872314, "attnres/final_alpha/block_1": 0.0053693987429142, "attnres/block_norm/1": 39422.40234375, "attnres/final_alpha/block_2": 0.01145471353083849, "attnres/block_norm/2": 26092.064453125, "attnres/final_alpha/block_3": 0.013253485783934593, "attnres/block_norm/3": 45796.421875, "attnres/final_alpha/block_4": 0.016461733728647232, "attnres/block_norm/4": 12645.0947265625, "attnres/final_alpha/block_5": 0.5957064032554626, "attnres/block_norm/5": 5937.470703125, "attnres/final_alpha/block_6": 0.11628561466932297, "attnres/block_norm/6": 30092.1640625, "geo/tier1_time_s": 1.3571741580963135, "geo/step": 38400.0, "geo/rankme_slope": -5.2557605073279314e-05} {"step": 38410, "timestamp": 1778236053.3422968, "train/loss": 2.206762671470642, "train/z_loss": 0.0014118037186563015, "train/perplexity": 9.086253540493747, "train/grad_norm": 0.20703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1739843.4296342444, "perf/iters_per_sec": 0.8296219967051718, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2053682327270507, "data/tokens_consumed": 80553705472, "data/tokens_consumed_B": 80.553705472, "train/loss_slope": 1.2558225794236246e-05} {"step": 38420, "timestamp": 1778236063.697141, "train/loss": 2.1549738168716432, "train/z_loss": 0.001420687115751207, "train/perplexity": 8.627664276770174, "train/grad_norm": 0.16015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026980.2650827076, "perf/iters_per_sec": 0.9665395093358553, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034618854522705, "data/tokens_consumed": 80574676992, "data/tokens_consumed_B": 80.574676992, "train/loss_slope": 8.478412290539388e-06} {"step": 38430, "timestamp": 1778236074.0522494, "train/loss": 2.1674201250076295, "train/z_loss": 0.001418650452978909, "train/perplexity": 8.73571788421573, "train/grad_norm": 0.109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026749.8244833858, "perf/iters_per_sec": 0.96642962669534, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347364902496339, "data/tokens_consumed": 80595648512, "data/tokens_consumed_B": 80.595648512, "train/loss_slope": 1.1117316951917695e-05} {"step": 38440, "timestamp": 1778236084.9035249, "train/loss": 2.165950083732605, "train/z_loss": 0.0014098354615271092, "train/perplexity": 8.722885452771832, "train/grad_norm": 0.1455078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1933837.4336199623, "perf/iters_per_sec": 0.9221255462741672, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0844510316848754, "data/tokens_consumed": 80616620032, "data/tokens_consumed_B": 80.616620032, "train/loss_slope": 7.542186222597195e-06} {"step": 38450, "timestamp": 1778236095.2465754, "grad/layer_0/attn": 0.002728835679590702, "grad/layer_0/mlp": 0.0026006586849689484, "grad/layer_0/attn_mlp_ratio": 1.0492863175140772, "grad/layer_4/attn": 0.0016601636307314038, "grad/layer_4/mlp": 0.0024800009559839964, "grad/layer_4/attn_mlp_ratio": 0.6694205338040456, "grad/layer_8/attn": 0.0042690010741353035, "grad/layer_8/mlp": 0.003512088442221284, "grad/layer_8/attn_mlp_ratio": 1.215516358091387, "grad/layer_12/attn": 0.00405280664563179, "grad/layer_12/mlp": 0.006373267155140638, "grad/layer_12/attn_mlp_ratio": 0.6359072173480285, "grad/layer_16/attn": 0.0038297411520034075, "grad/layer_16/mlp": 0.004864872898906469, "grad/layer_16/attn_mlp_ratio": 0.7872232538987679, "grad/layer_20/attn": 0.0063150664791464806, "grad/layer_20/mlp": 0.0063795531168580055, "grad/layer_20/attn_mlp_ratio": 0.9898916529858051, "grad/layer_24/attn": 0.011326545849442482, "grad/layer_24/mlp": 0.009482004679739475, "grad/layer_24/attn_mlp_ratio": 1.1945307044818518, "grad/layer_27/attn": 0.01057196781039238, "grad/layer_27/mlp": 0.007463174872100353, "grad/layer_27/attn_mlp_ratio": 1.4165509786268782} {"step": 38450, "timestamp": 1778236095.2623725, "train/loss": 2.199671196937561, "train/z_loss": 0.0014143159729428588, "train/perplexity": 9.022046535157154, "train/grad_norm": 0.1494140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025980.8852724219, "perf/iters_per_sec": 0.9660629679071531, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351292133331298, "data/tokens_consumed": 80637591552, "data/tokens_consumed_B": 80.637591552, "train/loss_slope": 9.110967468435697e-06} {"step": 38460, "timestamp": 1778236105.6239624, "train/loss": 2.140282917022705, "train/z_loss": 0.001427842304110527, "train/perplexity": 8.501842604663006, "train/grad_norm": 0.2021484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025153.8257304493, "perf/iters_per_sec": 0.9656685951854941, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035551953315735, "data/tokens_consumed": 80658563072, "data/tokens_consumed_B": 80.658563072, "train/loss_slope": 5.697074839205406e-06} {"step": 38470, "timestamp": 1778236116.0015733, "train/loss": 2.1981774806976317, "train/z_loss": 0.0014183021965436637, "train/perplexity": 9.008580217662583, "train/grad_norm": 0.2158203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022582.0466028235, "perf/iters_per_sec": 0.9644422753347509, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036868691444397, "data/tokens_consumed": 80679534592, "data/tokens_consumed_B": 80.679534592, "train/loss_slope": 4.074730309430009e-06} {"step": 38475, "timestamp": 1778236121.7958658, "eos/sharpness": 5.227899551391601, "eos/L0_probe": 2.0000858306884766, "eos/L_plus": 2.030658721923828, "eos/L_minus": 2.021791934967041, "eos/grad_norm": 0.09445593506097794, "eos/embed_grad_frac": 0.2655918002128601, "eos/time_s": 0.6205160617828369} {"step": 38475, "timestamp": 1778236123.1815584, "geo/rankme_last": 438.6446533203125, "geo/layer_0/stable_rank_q_proj": 19.191728591918945, "geo/layer_0/stable_rank_k_proj": 16.24390983581543, "geo/layer_0/stable_rank_o_proj": 49.241390228271484, "geo/layer_0/stable_rank_gate_proj": 137.78736877441406, "geo/layer_0/stable_rank_down_proj": 52.955474853515625, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06129666045308113, "geo/layer_0/attn_entropy_mean": 6.201367378234863, "geo/layer_0/attn_entropy_std": 0.36988911032676697, "geo/layer_7/stable_rank_q_proj": 42.88249588012695, "geo/layer_7/stable_rank_k_proj": 42.2809944152832, "geo/layer_7/stable_rank_o_proj": 99.60002899169922, "geo/layer_7/stable_rank_gate_proj": 91.19064331054688, "geo/layer_7/stable_rank_down_proj": 145.55381774902344, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5123705267906189, "geo/layer_7/attn_entropy_mean": 4.679008960723877, "geo/layer_7/attn_entropy_std": 0.8011105060577393, "geo/layer_14/stable_rank_q_proj": 54.1119499206543, "geo/layer_14/stable_rank_k_proj": 36.682010650634766, "geo/layer_14/stable_rank_o_proj": 48.811946868896484, "geo/layer_14/stable_rank_gate_proj": 77.0899658203125, "geo/layer_14/stable_rank_down_proj": 133.0767822265625, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37283051013946533, "geo/layer_14/attn_entropy_mean": 5.52393913269043, "geo/layer_14/attn_entropy_std": 0.3729199171066284, "geo/layer_21/stable_rank_q_proj": 43.4552116394043, "geo/layer_21/stable_rank_k_proj": 30.9099178314209, "geo/layer_21/stable_rank_o_proj": 76.67029571533203, "geo/layer_21/stable_rank_gate_proj": 73.5170669555664, "geo/layer_21/stable_rank_down_proj": 56.008792877197266, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14928846061229706, "geo/layer_21/attn_entropy_mean": 5.738531589508057, "geo/layer_21/attn_entropy_std": 0.2874814569950104, "geo/layer_27/stable_rank_q_proj": 42.69833755493164, "geo/layer_27/stable_rank_k_proj": 31.349260330200195, "geo/layer_27/stable_rank_o_proj": 116.74715423583984, "geo/layer_27/stable_rank_gate_proj": 85.19979858398438, "geo/layer_27/stable_rank_down_proj": 132.1719970703125, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08481530845165253, "geo/layer_27/attn_entropy_mean": 4.298017501831055, "geo/layer_27/attn_entropy_std": 0.6365455389022827, "attnres/final_alpha/block_0": 0.24091507494449615, "attnres/block_norm/0": 1.7163007259368896, "attnres/final_alpha/block_1": 0.005388589575886726, "attnres/block_norm/1": 39697.9453125, "attnres/final_alpha/block_2": 0.011426623910665512, "attnres/block_norm/2": 26066.134765625, "attnres/final_alpha/block_3": 0.013342127203941345, "attnres/block_norm/3": 46185.30078125, "attnres/final_alpha/block_4": 0.016401926055550575, "attnres/block_norm/4": 12651.013671875, "attnres/final_alpha/block_5": 0.5943312644958496, "attnres/block_norm/5": 5924.634765625, "attnres/final_alpha/block_6": 0.11819440126419067, "attnres/block_norm/6": 30245.16015625, "geo/tier1_time_s": 1.362825632095337, "geo/step": 38475.0, "geo/rankme_slope": -5.8359593837535015e-05} {"step": 38480, "timestamp": 1778236128.4263184, "train/loss": 2.2013460636138915, "train/z_loss": 0.0014305129530839621, "train/perplexity": 9.037169921543821, "train/grad_norm": 0.1298828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1689047.6599060541, "perf/iters_per_sec": 0.8054006862192412, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.241618013381958, "data/tokens_consumed": 80700506112, "data/tokens_consumed_B": 80.700506112, "train/loss_slope": 7.376980695715878e-06} {"step": 38490, "timestamp": 1778236138.783339, "train/loss": 2.2107890129089354, "train/z_loss": 0.0014142423402518033, "train/perplexity": 9.122911649150307, "train/grad_norm": 0.11474609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026022.043653265, "perf/iters_per_sec": 0.9660825937525106, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351081848144532, "data/tokens_consumed": 80721477632, "data/tokens_consumed_B": 80.721477632, "train/loss_slope": 6.996153876690572e-06} {"step": 38500, "timestamp": 1778236149.1388497, "grad/layer_0/attn": 0.0030686401296406984, "grad/layer_0/mlp": 0.0027864929288625717, "grad/layer_0/attn_mlp_ratio": 1.1012552688471275, "grad/layer_4/attn": 0.0021065936889499426, "grad/layer_4/mlp": 0.0025527486577630043, "grad/layer_4/attn_mlp_ratio": 0.8252256249439796, "grad/layer_8/attn": 0.003941304981708527, "grad/layer_8/mlp": 0.003654652973636985, "grad/layer_8/attn_mlp_ratio": 1.0784347795251261, "grad/layer_12/attn": 0.004236678592860699, "grad/layer_12/mlp": 0.006707096938043833, "grad/layer_12/attn_mlp_ratio": 0.6316709850520298, "grad/layer_16/attn": 0.005839518271386623, "grad/layer_16/mlp": 0.004634535871446133, "grad/layer_16/attn_mlp_ratio": 1.2600006359567635, "grad/layer_20/attn": 0.00435266038402915, "grad/layer_20/mlp": 0.00655804667621851, "grad/layer_20/attn_mlp_ratio": 0.6637129213248726, "grad/layer_24/attn": 0.01776862144470215, "grad/layer_24/mlp": 0.011976636946201324, "grad/layer_24/attn_mlp_ratio": 1.4836069070272018, "grad/layer_27/attn": 0.015116816386580467, "grad/layer_27/mlp": 0.010517057031393051, "grad/layer_27/attn_mlp_ratio": 1.437361820680549} {"step": 38500, "timestamp": 1778236149.1547863, "train/loss": 2.1649327278137207, "train/z_loss": 0.0014254008769057692, "train/perplexity": 8.71401568624647, "train/grad_norm": 0.2275390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023220.4697253653, "perf/iters_per_sec": 0.9647466992022349, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036541509628296, "data/tokens_consumed": 80742449152, "data/tokens_consumed_B": 80.742449152, "train/loss_slope": 8.879883933846982e-06} {"step": 38500, "timestamp": 1778236156.293147, "geo/ww_alpha_mean": 7.613521158879633, "geo/ww_alpha_std": 4.159418535517838, "geo/ww_alpha_min": 1.3623680113761567, "geo/ww_alpha_max": 25.947102911525693, "geo/ww_alpha_healthy_frac": 0.16751269035532995, "geo/ww_alpha_by_type/q_proj": 4.072897770315914, "geo/ww_alpha_by_type/k_proj": 4.6038273060657025, "geo/ww_alpha_by_type/v_proj": 7.8835145853967665, "geo/ww_alpha_by_type/o_proj": 7.863960187113261, "geo/ww_alpha_by_type/gate_proj": 8.483836804400053, "geo/ww_alpha_by_type/up_proj": 11.592358984709026, "geo/ww_alpha_by_type/down_proj": 8.902190791382557, "geo/twonn_id/layer_0": 0.7247450947761536, "geo/twonn_id/layer_7": 3.2903809547424316, "geo/twonn_id/layer_14": 4.330845832824707, "geo/twonn_id/layer_21": 6.691437721252441, "geo/twonn_id/layer_27": 5.437766075134277, "geo/tier2_time_s": 7.131191730499268} {"step": 38500, "timestamp": 1778236157.048464, "eoc/jacobian_sigma/layer_0/attn": 1091.6876220703125, "eoc/jacobian_sigma/layer_0/mlp": 9220.8125, "eoc/jacobian_sigma/layer_0": 9220.8125, "eoc/jacobian_sigma/layer_7/attn": 1.122749924659729, "eoc/jacobian_sigma/layer_7/mlp": 1.7005678415298462, "eoc/jacobian_sigma/layer_7": 1.7005678415298462, "eoc/jacobian_sigma/layer_14/attn": 1.6273356676101685, "eoc/jacobian_sigma/layer_14/mlp": 7.047020435333252, "eoc/jacobian_sigma/layer_14": 7.047020435333252, "eoc/jacobian_sigma/layer_21/attn": 1.0964136123657227, "eoc/jacobian_sigma/layer_21/mlp": 4.002790451049805, "eoc/jacobian_sigma/layer_21": 4.002790451049805, "eoc/jacobian_sigma/layer_27/attn": 4.284756183624268, "eoc/jacobian_sigma/layer_27/mlp": 27.91025161743164, "eoc/jacobian_sigma/layer_27": 27.91025161743164, "eoc/layer0_sigma": 9220.8125, "eoc/sigma_max": 27.91025161743164, "eoc/sigma_min": 1.7005678415298462, "eoc/sigma_mean": 10.165157586336136, "eoc/time_s": 0.7471334934234619} {"step": 38510, "timestamp": 1778236167.4278176, "train/loss": 2.1581364393234255, "train/z_loss": 0.0014311014208942651, "train/perplexity": 8.654993514769844, "train/grad_norm": 0.1953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1148040.9998496051, "perf/iters_per_sec": 0.54742860786896, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.82672221660614, "data/tokens_consumed": 80763420672, "data/tokens_consumed_B": 80.763420672, "train/loss_slope": 7.0426146976231295e-06} {"step": 38520, "timestamp": 1778236177.7793713, "train/loss": 2.2124993085861204, "train/z_loss": 0.0014194278861396014, "train/perplexity": 9.138527875883128, "train/grad_norm": 0.1513671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027360.6954391343, "perf/iters_per_sec": 0.966720912665908, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344247102737427, "data/tokens_consumed": 80784392192, "data/tokens_consumed_B": 80.784392192, "train/loss_slope": 9.704515220809122e-06} {"step": 38530, "timestamp": 1778236188.131287, "train/loss": 2.183457374572754, "train/z_loss": 0.0014086849871091545, "train/perplexity": 8.876944185918763, "train/grad_norm": 0.0927734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027173.3356164526, "perf/iters_per_sec": 0.966631572540499, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345203161239624, "data/tokens_consumed": 80805363712, "data/tokens_consumed_B": 80.805363712, "train/loss_slope": 1.1322690787488401e-05} {"step": 38540, "timestamp": 1778236198.5035026, "train/loss": 2.2038951396942137, "train/z_loss": 0.0014164321240969003, "train/perplexity": 9.060235740999225, "train/grad_norm": 0.109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024986.4995900113, "perf/iters_per_sec": 0.9655888078641945, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0356375217437743, "data/tokens_consumed": 80826335232, "data/tokens_consumed_B": 80.826335232, "train/loss_slope": 1.2689710536090294e-05} {"step": 38550, "timestamp": 1778236208.8419757, "grad/layer_0/attn": 0.003080636030063033, "grad/layer_0/mlp": 0.0028789215721189976, "grad/layer_0/attn_mlp_ratio": 1.0700659416675153, "grad/layer_4/attn": 0.002376468386501074, "grad/layer_4/mlp": 0.0025696749798953533, "grad/layer_4/attn_mlp_ratio": 0.9248127925176642, "grad/layer_8/attn": 0.005149892531335354, "grad/layer_8/mlp": 0.003708950011059642, "grad/layer_8/attn_mlp_ratio": 1.3885041257306199, "grad/layer_12/attn": 0.0038683046586811543, "grad/layer_12/mlp": 0.006365987937897444, "grad/layer_12/attn_mlp_ratio": 0.6076518893300932, "grad/layer_16/attn": 0.0034708408638834953, "grad/layer_16/mlp": 0.004260450601577759, "grad/layer_16/attn_mlp_ratio": 0.814665186149942, "grad/layer_20/attn": 0.0033881328999996185, "grad/layer_20/mlp": 0.005631233099848032, "grad/layer_20/attn_mlp_ratio": 0.6016680147593698, "grad/layer_24/attn": 0.007663020398467779, "grad/layer_24/mlp": 0.008053756318986416, "grad/layer_24/attn_mlp_ratio": 0.9514840032165002, "grad/layer_27/attn": 0.003565556835383177, "grad/layer_27/mlp": 0.006642100401222706, "grad/layer_27/attn_mlp_ratio": 0.5368116358261689} {"step": 38550, "timestamp": 1778236209.469286, "eos/sharpness": 8.447194099426268, "eos/L0_probe": 2.0049514770507812, "eos/L_plus": 2.0575973987579346, "eos/L_minus": 2.0367774963378906, "eos/grad_norm": 0.09449102729558945, "eos/embed_grad_frac": 0.24899522960186005, "eos/time_s": 0.6244421005249023} {"step": 38550, "timestamp": 1778236209.4900746, "train/loss": 2.201542115211487, "train/z_loss": 0.0014129142160527407, "train/perplexity": 9.038941846833398, "train/grad_norm": 0.0947265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1909998.5918812412, "perf/iters_per_sec": 0.9107583007246214, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0979861497879029, "data/tokens_consumed": 80847306752, "data/tokens_consumed_B": 80.847306752, "train/loss_slope": 1.4429193747640832e-05} {"step": 38550, "timestamp": 1778236210.8527658, "geo/rankme_last": 439.92620849609375, "geo/layer_0/stable_rank_q_proj": 19.151424407958984, "geo/layer_0/stable_rank_k_proj": 16.2414608001709, "geo/layer_0/stable_rank_o_proj": 49.271793365478516, "geo/layer_0/stable_rank_gate_proj": 137.74546813964844, "geo/layer_0/stable_rank_down_proj": 52.94055938720703, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06153562292456627, "geo/layer_0/attn_entropy_mean": 6.200201988220215, "geo/layer_0/attn_entropy_std": 0.3664278984069824, "geo/layer_7/stable_rank_q_proj": 42.89995574951172, "geo/layer_7/stable_rank_k_proj": 42.20705032348633, "geo/layer_7/stable_rank_o_proj": 99.55061340332031, "geo/layer_7/stable_rank_gate_proj": 91.17020416259766, "geo/layer_7/stable_rank_down_proj": 145.6854705810547, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5192545056343079, "geo/layer_7/attn_entropy_mean": 4.648395538330078, "geo/layer_7/attn_entropy_std": 0.8066909313201904, "geo/layer_14/stable_rank_q_proj": 54.02925491333008, "geo/layer_14/stable_rank_k_proj": 36.705291748046875, "geo/layer_14/stable_rank_o_proj": 48.701087951660156, "geo/layer_14/stable_rank_gate_proj": 77.05677795410156, "geo/layer_14/stable_rank_down_proj": 133.15135192871094, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39457258582115173, "geo/layer_14/attn_entropy_mean": 5.532427787780762, "geo/layer_14/attn_entropy_std": 0.367471843957901, "geo/layer_21/stable_rank_q_proj": 43.48717498779297, "geo/layer_21/stable_rank_k_proj": 30.934385299682617, "geo/layer_21/stable_rank_o_proj": 76.66271209716797, "geo/layer_21/stable_rank_gate_proj": 73.52527618408203, "geo/layer_21/stable_rank_down_proj": 56.01801300048828, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14780069887638092, "geo/layer_21/attn_entropy_mean": 5.722343444824219, "geo/layer_21/attn_entropy_std": 0.2848552167415619, "geo/layer_27/stable_rank_q_proj": 42.66862106323242, "geo/layer_27/stable_rank_k_proj": 31.361066818237305, "geo/layer_27/stable_rank_o_proj": 116.72711181640625, "geo/layer_27/stable_rank_gate_proj": 85.08673858642578, "geo/layer_27/stable_rank_down_proj": 132.07962036132812, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09083417057991028, "geo/layer_27/attn_entropy_mean": 4.285585403442383, "geo/layer_27/attn_entropy_std": 0.6557416319847107, "attnres/final_alpha/block_0": 0.24215656518936157, "attnres/block_norm/0": 1.7165863513946533, "attnres/final_alpha/block_1": 0.005245620384812355, "attnres/block_norm/1": 39623.1015625, "attnres/final_alpha/block_2": 0.011755025014281273, "attnres/block_norm/2": 26124.5234375, "attnres/final_alpha/block_3": 0.01376650482416153, "attnres/block_norm/3": 46014.46484375, "attnres/final_alpha/block_4": 0.01685892790555954, "attnres/block_norm/4": 12689.9921875, "attnres/final_alpha/block_5": 0.5923677682876587, "attnres/block_norm/5": 5930.4501953125, "attnres/final_alpha/block_6": 0.11784954369068146, "attnres/block_norm/6": 29950.12109375, "geo/tier1_time_s": 1.3587656021118164, "geo/step": 38550.0, "geo/rankme_slope": -4.7438916973039216e-05} {"step": 38560, "timestamp": 1778236221.2067683, "train/loss": 2.1660105705261232, "train/z_loss": 0.0014283585362136364, "train/perplexity": 8.72341308810042, "train/grad_norm": 0.130859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1790438.9175522996, "perf/iters_per_sec": 0.8537478053819177, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1713060855865478, "data/tokens_consumed": 80868278272, "data/tokens_consumed_B": 80.868278272, "train/loss_slope": 1.4511158020690161e-05} {"step": 38570, "timestamp": 1778236231.555895, "train/loss": 2.224661636352539, "train/z_loss": 0.0014094836078584194, "train/perplexity": 9.250352291369332, "train/grad_norm": 0.10400390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027445.0887559098, "perf/iters_per_sec": 0.9667611545352506, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343816518783568, "data/tokens_consumed": 80889249792, "data/tokens_consumed_B": 80.889249792, "train/loss_slope": 1.790830958830205e-05} {"step": 38580, "timestamp": 1778236241.919654, "train/loss": 2.1687400102615357, "train/z_loss": 0.0014220064040273428, "train/perplexity": 8.747255642016592, "train/grad_norm": 0.1455078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024953.0749731408, "perf/iters_per_sec": 0.965572869764872, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035654616355896, "data/tokens_consumed": 80910221312, "data/tokens_consumed_B": 80.910221312, "train/loss_slope": 2.0754000856609473e-05} {"step": 38590, "timestamp": 1778236252.275032, "train/loss": 2.2475554227828978, "train/z_loss": 0.0014037616201676428, "train/perplexity": 9.464570659558971, "train/grad_norm": 0.23046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026256.0995705908, "perf/iters_per_sec": 0.9661942003109888, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349886178970338, "data/tokens_consumed": 80931192832, "data/tokens_consumed_B": 80.931192832, "train/loss_slope": 2.555365661869447e-05} {"step": 38600, "timestamp": 1778236262.6317263, "grad/layer_0/attn": 0.003294165013357997, "grad/layer_0/mlp": 0.003124586306512356, "grad/layer_0/attn_mlp_ratio": 1.054272337129868, "grad/layer_4/attn": 0.0024093487299978733, "grad/layer_4/mlp": 0.0025583289097994566, "grad/layer_4/attn_mlp_ratio": 0.9417665674622273, "grad/layer_8/attn": 0.0054374332539737225, "grad/layer_8/mlp": 0.0036564194597303867, "grad/layer_8/attn_mlp_ratio": 1.4870922674898561, "grad/layer_12/attn": 0.005749816540628672, "grad/layer_12/mlp": 0.006475446745753288, "grad/layer_12/attn_mlp_ratio": 0.8879412768864751, "grad/layer_16/attn": 0.0057316129095852375, "grad/layer_16/mlp": 0.004656494595110416, "grad/layer_16/attn_mlp_ratio": 1.230885738064677, "grad/layer_20/attn": 0.003085886826738715, "grad/layer_20/mlp": 0.0058472175151109695, "grad/layer_20/attn_mlp_ratio": 0.5277530322736502, "grad/layer_24/attn": 0.006811806466430426, "grad/layer_24/mlp": 0.00828782469034195, "grad/layer_24/attn_mlp_ratio": 0.8219052210621571, "grad/layer_27/attn": 0.00442441226914525, "grad/layer_27/mlp": 0.007614020258188248, "grad/layer_27/attn_mlp_ratio": 0.5810875281397379} {"step": 38600, "timestamp": 1778236262.6476228, "train/loss": 2.169117879867554, "train/z_loss": 0.0014228406711481512, "train/perplexity": 8.750561588628814, "train/grad_norm": 0.125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022977.9024217867, "perf/iters_per_sec": 0.9646310340985235, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0366657972335815, "data/tokens_consumed": 80952164352, "data/tokens_consumed_B": 80.952164352, "train/loss_slope": 2.1103101265956135e-05} {"step": 38610, "timestamp": 1778236272.9984124, "train/loss": 2.10144499540329, "train/z_loss": 0.0014372014789842068, "train/perplexity": 8.177978520190953, "train/grad_norm": 0.169921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027164.1320487573, "perf/iters_per_sec": 0.9666271839374339, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345250129699708, "data/tokens_consumed": 80973135872, "data/tokens_consumed_B": 80.973135872, "train/loss_slope": 1.4922272504502562e-05} {"step": 38620, "timestamp": 1778236283.366791, "train/loss": 2.153760004043579, "train/z_loss": 0.0014260362600907683, "train/perplexity": 8.617198260372282, "train/grad_norm": 0.099609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023703.7768451672, "perf/iters_per_sec": 0.9649771579957805, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0362939596176148, "data/tokens_consumed": 80994107392, "data/tokens_consumed_B": 80.994107392, "train/loss_slope": 1.2236001217576178e-05} {"step": 38625, "timestamp": 1778236289.134505, "eos/sharpness": 61.79146766662596, "eos/L0_probe": 1.9991538524627686, "eos/L_plus": 2.2499938011169434, "eos/L_minus": 2.3662285804748535, "eos/grad_norm": 0.14940573275089264, "eos/embed_grad_frac": 0.09881048649549484, "eos/time_s": 0.6034562587738037} {"step": 38625, "timestamp": 1778236290.51199, "geo/rankme_last": 439.198486328125, "geo/layer_0/stable_rank_q_proj": 19.18221664428711, "geo/layer_0/stable_rank_k_proj": 16.288776397705078, "geo/layer_0/stable_rank_o_proj": 49.271121978759766, "geo/layer_0/stable_rank_gate_proj": 137.84341430664062, "geo/layer_0/stable_rank_down_proj": 52.93370056152344, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.062186144292354584, "geo/layer_0/attn_entropy_mean": 6.201647758483887, "geo/layer_0/attn_entropy_std": 0.3665882349014282, "geo/layer_7/stable_rank_q_proj": 42.93601989746094, "geo/layer_7/stable_rank_k_proj": 42.35728073120117, "geo/layer_7/stable_rank_o_proj": 99.42047119140625, "geo/layer_7/stable_rank_gate_proj": 91.0141372680664, "geo/layer_7/stable_rank_down_proj": 145.80239868164062, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.514037013053894, "geo/layer_7/attn_entropy_mean": 4.661227226257324, "geo/layer_7/attn_entropy_std": 0.8285431861877441, "geo/layer_14/stable_rank_q_proj": 53.9583625793457, "geo/layer_14/stable_rank_k_proj": 36.6722297668457, "geo/layer_14/stable_rank_o_proj": 48.639869689941406, "geo/layer_14/stable_rank_gate_proj": 76.94843292236328, "geo/layer_14/stable_rank_down_proj": 133.03567504882812, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37965813279151917, "geo/layer_14/attn_entropy_mean": 5.501064300537109, "geo/layer_14/attn_entropy_std": 0.3518180251121521, "geo/layer_21/stable_rank_q_proj": 43.4085807800293, "geo/layer_21/stable_rank_k_proj": 30.858135223388672, "geo/layer_21/stable_rank_o_proj": 76.65259552001953, "geo/layer_21/stable_rank_gate_proj": 73.51310729980469, "geo/layer_21/stable_rank_down_proj": 55.89783477783203, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14754144847393036, "geo/layer_21/attn_entropy_mean": 5.705666542053223, "geo/layer_21/attn_entropy_std": 0.2926230728626251, "geo/layer_27/stable_rank_q_proj": 42.697654724121094, "geo/layer_27/stable_rank_k_proj": 31.438867568969727, "geo/layer_27/stable_rank_o_proj": 116.5969467163086, "geo/layer_27/stable_rank_gate_proj": 85.09088134765625, "geo/layer_27/stable_rank_down_proj": 131.9897003173828, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08940800279378891, "geo/layer_27/attn_entropy_mean": 4.262732982635498, "geo/layer_27/attn_entropy_std": 0.6537818908691406, "attnres/final_alpha/block_0": 0.24040035903453827, "attnres/block_norm/0": 1.7168407440185547, "attnres/final_alpha/block_1": 0.005273844115436077, "attnres/block_norm/1": 39594.8359375, "attnres/final_alpha/block_2": 0.01138114370405674, "attnres/block_norm/2": 26221.455078125, "attnres/final_alpha/block_3": 0.013323711231350899, "attnres/block_norm/3": 46008.578125, "attnres/final_alpha/block_4": 0.016404613852500916, "attnres/block_norm/4": 12698.20703125, "attnres/final_alpha/block_5": 0.5951829552650452, "attnres/block_norm/5": 5980.54296875, "attnres/final_alpha/block_6": 0.11803341656923294, "attnres/block_norm/6": 30343.015625, "geo/tier1_time_s": 1.357560634613037, "geo/step": 38625.0, "geo/rankme_slope": -4.690237423094238e-05} {"step": 38630, "timestamp": 1778236295.6971242, "train/loss": 2.1998544454574587, "train/z_loss": 0.0014103386085480452, "train/perplexity": 9.023699963320675, "train/grad_norm": 0.119140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1701531.1843317174, "perf/iters_per_sec": 0.8113532945307338, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2325087070465088, "data/tokens_consumed": 81015078912, "data/tokens_consumed_B": 81.015078912, "train/loss_slope": 1.4243808108838115e-05} {"step": 38640, "timestamp": 1778236306.0614796, "train/loss": 2.1467416524887084, "train/z_loss": 0.0014297836227342487, "train/perplexity": 8.556931467713166, "train/grad_norm": 0.29296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024594.3309674955, "perf/iters_per_sec": 0.9654018072926976, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0358381271362305, "data/tokens_consumed": 81036050432, "data/tokens_consumed_B": 81.036050432, "train/loss_slope": 1.2879317449872304e-05} {"step": 38650, "timestamp": 1778236316.4019568, "grad/layer_0/attn": 0.00283618806861341, "grad/layer_0/mlp": 0.002885195193812251, "grad/layer_0/attn_mlp_ratio": 0.9830142433325242, "grad/layer_4/attn": 0.0017600473947823048, "grad/layer_4/mlp": 0.002402504440397024, "grad/layer_4/attn_mlp_ratio": 0.7325885821183294, "grad/layer_8/attn": 0.004019867163151503, "grad/layer_8/mlp": 0.003925854805856943, "grad/layer_8/attn_mlp_ratio": 1.0239469515682569, "grad/layer_12/attn": 0.004616405349224806, "grad/layer_12/mlp": 0.005962111987173557, "grad/layer_12/attn_mlp_ratio": 0.7742902652159451, "grad/layer_16/attn": 0.004524350166320801, "grad/layer_16/mlp": 0.0045082950964570045, "grad/layer_16/attn_mlp_ratio": 1.0035612064348436, "grad/layer_20/attn": 0.0037572423461824656, "grad/layer_20/mlp": 0.005888273939490318, "grad/layer_20/attn_mlp_ratio": 0.6380889070352589, "grad/layer_24/attn": 0.007572751492261887, "grad/layer_24/mlp": 0.0093661118298769, "grad/layer_24/attn_mlp_ratio": 0.8085266916473223, "grad/layer_27/attn": 0.005099690984934568, "grad/layer_27/mlp": 0.008861307054758072, "grad/layer_27/attn_mlp_ratio": 0.5755009837568145} {"step": 38650, "timestamp": 1778236316.4176486, "train/loss": 2.183645725250244, "train/z_loss": 0.001422293414361775, "train/perplexity": 8.878616321839246, "train/grad_norm": 0.138671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026038.5168071545, "perf/iters_per_sec": 0.9660904487643979, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350997686386108, "data/tokens_consumed": 81057021952, "data/tokens_consumed_B": 81.057021952, "train/loss_slope": 1.0830131458370466e-05} {"step": 38660, "timestamp": 1778236326.765899, "train/loss": 2.1390904426574706, "train/z_loss": 0.0014134245924651624, "train/perplexity": 8.491710417686964, "train/grad_norm": 0.1416015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027658.4398472072, "perf/iters_per_sec": 0.9668628882633243, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034272813796997, "data/tokens_consumed": 81077993472, "data/tokens_consumed_B": 81.077993472, "train/loss_slope": 7.877943095880344e-06} {"step": 38670, "timestamp": 1778236337.1143851, "train/loss": 2.1838567733764647, "train/z_loss": 0.001407400518655777, "train/perplexity": 8.880490334923921, "train/grad_norm": 0.1787109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027607.8203997365, "perf/iters_per_sec": 0.9668387510298426, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342986345291139, "data/tokens_consumed": 81098964992, "data/tokens_consumed_B": 81.098964992, "train/loss_slope": 1.0616745685550928e-05} {"step": 38680, "timestamp": 1778236347.4722843, "train/loss": 2.161893606185913, "train/z_loss": 0.0014241026015952229, "train/perplexity": 8.687572934431488, "train/grad_norm": 0.1806640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025691.0507012028, "perf/iters_per_sec": 0.965924764013864, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352773189544677, "data/tokens_consumed": 81119936512, "data/tokens_consumed_B": 81.119936512, "train/loss_slope": 7.650583447760386e-06} {"step": 38690, "timestamp": 1778236357.8198576, "train/loss": 2.148163878917694, "train/z_loss": 0.0014341337839141489, "train/perplexity": 8.569110020074259, "train/grad_norm": 0.236328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027793.6243873402, "perf/iters_per_sec": 0.9669273492752744, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034203863143921, "data/tokens_consumed": 81140908032, "data/tokens_consumed_B": 81.140908032, "train/loss_slope": 5.825035041994367e-06} {"step": 38700, "timestamp": 1778236368.1570396, "grad/layer_0/attn": 0.0028202987741678953, "grad/layer_0/mlp": 0.002940608188509941, "grad/layer_0/attn_mlp_ratio": 0.9590868614455926, "grad/layer_4/attn": 0.004584634210914373, "grad/layer_4/mlp": 0.002581451553851366, "grad/layer_4/attn_mlp_ratio": 1.77599073144538, "grad/layer_8/attn": 0.006494275759905577, "grad/layer_8/mlp": 0.0036803623661398888, "grad/layer_8/attn_mlp_ratio": 1.7645750438045937, "grad/layer_12/attn": 0.004748599138110876, "grad/layer_12/mlp": 0.006228660233318806, "grad/layer_12/attn_mlp_ratio": 0.7623788879141992, "grad/layer_16/attn": 0.004146852996200323, "grad/layer_16/mlp": 0.0048200045712292194, "grad/layer_16/attn_mlp_ratio": 0.8603421114823887, "grad/layer_20/attn": 0.004411311354488134, "grad/layer_20/mlp": 0.006785531062632799, "grad/layer_20/attn_mlp_ratio": 0.6501055331940347, "grad/layer_24/attn": 0.016732411459088326, "grad/layer_24/mlp": 0.014453171752393246, "grad/layer_24/attn_mlp_ratio": 1.1576982291480655, "grad/layer_27/attn": 0.008151774294674397, "grad/layer_27/mlp": 0.015599743463099003, "grad/layer_27/attn_mlp_ratio": 0.522558224223462} {"step": 38700, "timestamp": 1778236368.7610815, "eos/sharpness": 78.08070182800292, "eos/L0_probe": 1.9992142915725708, "eos/L_plus": 2.3257126808166504, "eos/L_minus": 2.4535229206085205, "eos/grad_norm": 0.2906601130962372, "eos/embed_grad_frac": 0.031022794544696808, "eos/time_s": 0.60129714012146} {"step": 38700, "timestamp": 1778236368.779379, "train/loss": 2.184573531150818, "train/z_loss": 0.001427387516014278, "train/perplexity": 8.886857777095784, "train/grad_norm": 0.291015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1914484.6913380974, "perf/iters_per_sec": 0.9128974396410453, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0954133033752442, "data/tokens_consumed": 81161879552, "data/tokens_consumed_B": 81.161879552, "train/loss_slope": 3.4829397799550433e-06} {"step": 38700, "timestamp": 1778236370.1437185, "geo/rankme_last": 439.39495849609375, "geo/layer_0/stable_rank_q_proj": 19.195629119873047, "geo/layer_0/stable_rank_k_proj": 16.25455093383789, "geo/layer_0/stable_rank_o_proj": 49.392940521240234, "geo/layer_0/stable_rank_gate_proj": 138.13128662109375, "geo/layer_0/stable_rank_down_proj": 52.976905822753906, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0571490116417408, "geo/layer_0/attn_entropy_mean": 6.202264785766602, "geo/layer_0/attn_entropy_std": 0.3715173006057739, "geo/layer_7/stable_rank_q_proj": 42.90934371948242, "geo/layer_7/stable_rank_k_proj": 42.383872985839844, "geo/layer_7/stable_rank_o_proj": 99.37007141113281, "geo/layer_7/stable_rank_gate_proj": 90.92771911621094, "geo/layer_7/stable_rank_down_proj": 145.9743194580078, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5070309638977051, "geo/layer_7/attn_entropy_mean": 4.681633472442627, "geo/layer_7/attn_entropy_std": 0.82688969373703, "geo/layer_14/stable_rank_q_proj": 54.00052261352539, "geo/layer_14/stable_rank_k_proj": 36.671630859375, "geo/layer_14/stable_rank_o_proj": 48.589141845703125, "geo/layer_14/stable_rank_gate_proj": 77.02401733398438, "geo/layer_14/stable_rank_down_proj": 132.97608947753906, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.370919793844223, "geo/layer_14/attn_entropy_mean": 5.530864715576172, "geo/layer_14/attn_entropy_std": 0.35822197794914246, "geo/layer_21/stable_rank_q_proj": 43.40411376953125, "geo/layer_21/stable_rank_k_proj": 30.774110794067383, "geo/layer_21/stable_rank_o_proj": 76.51680755615234, "geo/layer_21/stable_rank_gate_proj": 73.41357421875, "geo/layer_21/stable_rank_down_proj": 55.89104461669922, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1433033049106598, "geo/layer_21/attn_entropy_mean": 5.702573299407959, "geo/layer_21/attn_entropy_std": 0.28607234358787537, "geo/layer_27/stable_rank_q_proj": 42.55739212036133, "geo/layer_27/stable_rank_k_proj": 31.409847259521484, "geo/layer_27/stable_rank_o_proj": 116.72868347167969, "geo/layer_27/stable_rank_gate_proj": 84.94033813476562, "geo/layer_27/stable_rank_down_proj": 131.805908203125, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08474083989858627, "geo/layer_27/attn_entropy_mean": 4.27028751373291, "geo/layer_27/attn_entropy_std": 0.6584030389785767, "attnres/final_alpha/block_0": 0.2407725751399994, "attnres/block_norm/0": 1.716970682144165, "attnres/final_alpha/block_1": 0.005301165394484997, "attnres/block_norm/1": 39579.8203125, "attnres/final_alpha/block_2": 0.011392584070563316, "attnres/block_norm/2": 26197.23828125, "attnres/final_alpha/block_3": 0.013360696844756603, "attnres/block_norm/3": 46100.875, "attnres/final_alpha/block_4": 0.016553251072764397, "attnres/block_norm/4": 12661.4677734375, "attnres/final_alpha/block_5": 0.5942385792732239, "attnres/block_norm/5": 5964.8896484375, "attnres/final_alpha/block_6": 0.11838111281394958, "attnres/block_norm/6": 30339.748046875, "geo/tier1_time_s": 1.3601961135864258, "geo/step": 38700.0, "geo/rankme_slope": -2.678950486444578e-05} {"step": 38710, "timestamp": 1778236380.5047326, "train/loss": 2.123713970184326, "train/z_loss": 0.0014353979961015284, "train/perplexity": 8.362136613200045, "train/grad_norm": 0.11328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1789174.3934767018, "perf/iters_per_sec": 0.8531448333152303, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1721339225769043, "data/tokens_consumed": 81182851072, "data/tokens_consumed_B": 81.182851072, "train/loss_slope": -2.25002787830675e-06} {"step": 38720, "timestamp": 1778236390.8604915, "train/loss": 2.1648218870162963, "train/z_loss": 0.0014204988139681519, "train/perplexity": 8.713049871325873, "train/grad_norm": 0.109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026259.50696532, "perf/iters_per_sec": 0.9661958250834084, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349868774414062, "data/tokens_consumed": 81203822592, "data/tokens_consumed_B": 81.203822592, "train/loss_slope": -3.6738732610063388e-06} {"step": 38730, "timestamp": 1778236401.2196577, "train/loss": 2.1764121294021606, "train/z_loss": 0.0014111436088569463, "train/perplexity": 8.814623726994698, "train/grad_norm": 0.1826171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025583.013830238, "perf/iters_per_sec": 0.9658732480193319, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0353325366973878, "data/tokens_consumed": 81224794112, "data/tokens_consumed_B": 81.224794112, "train/loss_slope": -5.392696669321445e-06} {"step": 38740, "timestamp": 1778236412.1148832, "train/loss": 2.0784835696220396, "train/z_loss": 0.0014339573797769844, "train/perplexity": 7.992339893207577, "train/grad_norm": 0.12353515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1926013.027515945, "perf/iters_per_sec": 0.9183945787029004, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0888566017150878, "data/tokens_consumed": 81245765632, "data/tokens_consumed_B": 81.245765632, "train/loss_slope": -9.048459267351743e-06} {"step": 38750, "timestamp": 1778236422.4581456, "grad/layer_0/attn": 0.0025447693187743425, "grad/layer_0/mlp": 0.002717354102060199, "grad/layer_0/attn_mlp_ratio": 0.9364878957792809, "grad/layer_4/attn": 0.0022858602460473776, "grad/layer_4/mlp": 0.002401373814791441, "grad/layer_4/attn_mlp_ratio": 0.951896842040072, "grad/layer_8/attn": 0.0037626291159540415, "grad/layer_8/mlp": 0.003543297527357936, "grad/layer_8/attn_mlp_ratio": 1.061900385364932, "grad/layer_12/attn": 0.005379996728152037, "grad/layer_12/mlp": 0.006635944824665785, "grad/layer_12/attn_mlp_ratio": 0.8107355906699296, "grad/layer_16/attn": 0.004026191774755716, "grad/layer_16/mlp": 0.004665564280003309, "grad/layer_16/attn_mlp_ratio": 0.8629592149691568, "grad/layer_20/attn": 0.007198642008006573, "grad/layer_20/mlp": 0.006347585469484329, "grad/layer_20/attn_mlp_ratio": 1.134075614925721, "grad/layer_24/attn": 0.0073884017765522, "grad/layer_24/mlp": 0.009966063313186169, "grad/layer_24/attn_mlp_ratio": 0.7413560871764627, "grad/layer_27/attn": 0.007125783246010542, "grad/layer_27/mlp": 0.008922478184103966, "grad/layer_27/attn_mlp_ratio": 0.7986327362326715} {"step": 38750, "timestamp": 1778236422.473736, "train/loss": 2.236709785461426, "train/z_loss": 0.001410148898139596, "train/perplexity": 9.362476000367609, "train/grad_norm": 0.1259765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025719.3679702657, "perf/iters_per_sec": 0.965938266739972, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352628469467162, "data/tokens_consumed": 81266737152, "data/tokens_consumed_B": 81.266737152, "train/loss_slope": -4.8109876476464165e-06} {"step": 38760, "timestamp": 1778236432.824675, "train/loss": 2.1912181854248045, "train/z_loss": 0.0014044927433133126, "train/perplexity": 8.946104493595232, "train/grad_norm": 0.1533203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027213.5146064651, "perf/iters_per_sec": 0.9666507313759161, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344998121261597, "data/tokens_consumed": 81287708672, "data/tokens_consumed_B": 81.287708672, "train/loss_slope": -3.7268308630371457e-06} {"step": 38770, "timestamp": 1778236443.1806426, "train/loss": 2.1905758380889893, "train/z_loss": 0.0014250783249735832, "train/perplexity": 8.940359832439267, "train/grad_norm": 0.10888671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026335.4528329223, "perf/iters_per_sec": 0.9662320388950931, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349480867385865, "data/tokens_consumed": 81308680192, "data/tokens_consumed_B": 81.308680192, "train/loss_slope": -5.653805686946229e-06} {"step": 38775, "timestamp": 1778236448.9496865, "eos/sharpness": 19.06805038452148, "eos/L0_probe": 2.0020742416381836, "eos/L_plus": 2.102416515350342, "eos/L_minus": 2.0924124717712402, "eos/grad_norm": 0.10128631442785263, "eos/embed_grad_frac": 0.21729403734207153, "eos/time_s": 0.6026122570037842} {"step": 38775, "timestamp": 1778236450.3323486, "geo/rankme_last": 439.7646789550781, "geo/layer_0/stable_rank_q_proj": 19.22380828857422, "geo/layer_0/stable_rank_k_proj": 16.31296730041504, "geo/layer_0/stable_rank_o_proj": 49.380794525146484, "geo/layer_0/stable_rank_gate_proj": 138.1619873046875, "geo/layer_0/stable_rank_down_proj": 53.025970458984375, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05740153416991234, "geo/layer_0/attn_entropy_mean": 6.210421562194824, "geo/layer_0/attn_entropy_std": 0.3679768741130829, "geo/layer_7/stable_rank_q_proj": 42.97523880004883, "geo/layer_7/stable_rank_k_proj": 42.31422805786133, "geo/layer_7/stable_rank_o_proj": 99.1516342163086, "geo/layer_7/stable_rank_gate_proj": 90.7573013305664, "geo/layer_7/stable_rank_down_proj": 146.05538940429688, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5259320735931396, "geo/layer_7/attn_entropy_mean": 4.667430877685547, "geo/layer_7/attn_entropy_std": 0.8160120248794556, "geo/layer_14/stable_rank_q_proj": 53.99773406982422, "geo/layer_14/stable_rank_k_proj": 36.799442291259766, "geo/layer_14/stable_rank_o_proj": 48.58348083496094, "geo/layer_14/stable_rank_gate_proj": 77.09617614746094, "geo/layer_14/stable_rank_down_proj": 133.16015625, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3781437873840332, "geo/layer_14/attn_entropy_mean": 5.505402565002441, "geo/layer_14/attn_entropy_std": 0.3813995122909546, "geo/layer_21/stable_rank_q_proj": 43.49108123779297, "geo/layer_21/stable_rank_k_proj": 30.74012565612793, "geo/layer_21/stable_rank_o_proj": 76.32310485839844, "geo/layer_21/stable_rank_gate_proj": 73.38764953613281, "geo/layer_21/stable_rank_down_proj": 55.85497283935547, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15114931762218475, "geo/layer_21/attn_entropy_mean": 5.724255561828613, "geo/layer_21/attn_entropy_std": 0.28461065888404846, "geo/layer_27/stable_rank_q_proj": 42.50273132324219, "geo/layer_27/stable_rank_k_proj": 31.359783172607422, "geo/layer_27/stable_rank_o_proj": 116.66278076171875, "geo/layer_27/stable_rank_gate_proj": 85.0115737915039, "geo/layer_27/stable_rank_down_proj": 131.93031311035156, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09022817760705948, "geo/layer_27/attn_entropy_mean": 4.266602039337158, "geo/layer_27/attn_entropy_std": 0.661622941493988, "attnres/final_alpha/block_0": 0.24102434515953064, "attnres/block_norm/0": 1.7171822786331177, "attnres/final_alpha/block_1": 0.005255853291600943, "attnres/block_norm/1": 39750.22265625, "attnres/final_alpha/block_2": 0.011469703167676926, "attnres/block_norm/2": 26225.193359375, "attnres/final_alpha/block_3": 0.013403883203864098, "attnres/block_norm/3": 46351.640625, "attnres/final_alpha/block_4": 0.01646290346980095, "attnres/block_norm/4": 12689.134765625, "attnres/final_alpha/block_5": 0.5957884788513184, "attnres/block_norm/5": 5881.86669921875, "attnres/final_alpha/block_6": 0.11659488081932068, "attnres/block_norm/6": 30209.34375, "geo/tier1_time_s": 1.3628852367401123, "geo/step": 38775.0, "geo/rankme_slope": -2.046050842211885e-05} {"step": 38780, "timestamp": 1778236455.5110662, "train/loss": 2.1569319009780883, "train/z_loss": 0.0014267597580328584, "train/perplexity": 8.644574519502072, "train/grad_norm": 0.12060546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1701656.7302922725, "perf/iters_per_sec": 0.8114131595097888, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2324177742004394, "data/tokens_consumed": 81329651712, "data/tokens_consumed_B": 81.329651712, "train/loss_slope": -4.612428584758483e-06} {"step": 38790, "timestamp": 1778236465.8634381, "train/loss": 2.2094691038131713, "train/z_loss": 0.0014256185153499247, "train/perplexity": 9.110878178375525, "train/grad_norm": 0.19921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026661.333152573, "perf/iters_per_sec": 0.9663874307406297, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347816705703736, "data/tokens_consumed": 81350623232, "data/tokens_consumed_B": 81.350623232, "train/loss_slope": -5.771185519851821e-07} {"step": 38800, "timestamp": 1778236476.2134242, "grad/layer_0/attn": 0.0027326010167598724, "grad/layer_0/mlp": 0.002915706718340516, "grad/layer_0/attn_mlp_ratio": 0.9372002011900294, "grad/layer_4/attn": 0.0021016141399741173, "grad/layer_4/mlp": 0.0025212643668055534, "grad/layer_4/attn_mlp_ratio": 0.8335556097519853, "grad/layer_8/attn": 0.005902425851672888, "grad/layer_8/mlp": 0.0037464885972440243, "grad/layer_8/attn_mlp_ratio": 1.5754554006835266, "grad/layer_12/attn": 0.003771101590245962, "grad/layer_12/mlp": 0.006600095424801111, "grad/layer_12/attn_mlp_ratio": 0.5713707591163389, "grad/layer_16/attn": 0.0061277286149561405, "grad/layer_16/mlp": 0.004535906948149204, "grad/layer_16/attn_mlp_ratio": 1.3509378719426033, "grad/layer_20/attn": 0.005885530728846788, "grad/layer_20/mlp": 0.006979756988584995, "grad/layer_20/attn_mlp_ratio": 0.8432285900711712, "grad/layer_24/attn": 0.02303251437842846, "grad/layer_24/mlp": 0.01489293109625578, "grad/layer_24/attn_mlp_ratio": 1.5465400380160923, "grad/layer_27/attn": 0.00431550620123744, "grad/layer_27/mlp": 0.015530516393482685, "grad/layer_27/attn_mlp_ratio": 0.27787267751516337} {"step": 38800, "timestamp": 1778236476.229078, "train/loss": 2.1985216617584227, "train/z_loss": 0.0014216811861842871, "train/perplexity": 9.01168133400026, "train/grad_norm": 0.251953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024448.3435932146, "perf/iters_per_sec": 0.965332195088012, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035912823677063, "data/tokens_consumed": 81371594752, "data/tokens_consumed_B": 81.371594752, "train/loss_slope": 2.881170420756697e-06} {"step": 38810, "timestamp": 1778236486.578398, "train/loss": 2.1880320072174073, "train/z_loss": 0.0014066001866012811, "train/perplexity": 8.917645971453442, "train/grad_norm": 0.1904296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027520.9833197594, "perf/iters_per_sec": 0.9667973438833997, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343429327011109, "data/tokens_consumed": 81392566272, "data/tokens_consumed_B": 81.392566272, "train/loss_slope": 4.746590546219565e-06} {"step": 38820, "timestamp": 1778236496.9263017, "train/loss": 2.1479705810546874, "train/z_loss": 0.0014207097585313023, "train/perplexity": 8.567453789497582, "train/grad_norm": 0.09619140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027653.5320339014, "perf/iters_per_sec": 0.9668605480355746, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342753171920775, "data/tokens_consumed": 81413537792, "data/tokens_consumed_B": 81.413537792, "train/loss_slope": 1.2888064228517842e-06} {"step": 38830, "timestamp": 1778236507.2745693, "train/loss": 2.1333714723587036, "train/z_loss": 0.0014218302443623544, "train/perplexity": 8.443285181252339, "train/grad_norm": 0.1650390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027904.7491058097, "perf/iters_per_sec": 0.9669803376702355, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341471910476685, "data/tokens_consumed": 81434509312, "data/tokens_consumed_B": 81.434509312, "train/loss_slope": -2.22870170479966e-06} {"step": 38840, "timestamp": 1778236517.6266046, "train/loss": 2.1501984119415285, "train/z_loss": 0.0014191181166097523, "train/perplexity": 8.586561904592743, "train/grad_norm": 0.228515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027044.9604021783, "perf/iters_per_sec": 0.9665703584681408, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345858335494995, "data/tokens_consumed": 81455480832, "data/tokens_consumed_B": 81.455480832, "train/loss_slope": -3.4114489639768716e-06} {"step": 38850, "timestamp": 1778236527.9733071, "grad/layer_0/attn": 0.003107772907242179, "grad/layer_0/mlp": 0.0030562386382371187, "grad/layer_0/attn_mlp_ratio": 1.0168619579224318, "grad/layer_4/attn": 0.0022028060629963875, "grad/layer_4/mlp": 0.0024337167851626873, "grad/layer_4/attn_mlp_ratio": 0.9051200969290789, "grad/layer_8/attn": 0.004133824724704027, "grad/layer_8/mlp": 0.003573860041797161, "grad/layer_8/attn_mlp_ratio": 1.1566834069296514, "grad/layer_12/attn": 0.00650820741429925, "grad/layer_12/mlp": 0.006918067578226328, "grad/layer_12/attn_mlp_ratio": 0.9407550947763843, "grad/layer_16/attn": 0.004059202503412962, "grad/layer_16/mlp": 0.005205618217587471, "grad/layer_16/attn_mlp_ratio": 0.7797733632715101, "grad/layer_20/attn": 0.00428894255310297, "grad/layer_20/mlp": 0.006309571675956249, "grad/layer_20/attn_mlp_ratio": 0.679751765317379, "grad/layer_24/attn": 0.013324465602636337, "grad/layer_24/mlp": 0.01329582929611206, "grad/layer_24/attn_mlp_ratio": 1.0021537736136004, "grad/layer_27/attn": 0.009063045494258404, "grad/layer_27/mlp": 0.0125490827485919, "grad/layer_27/attn_mlp_ratio": 0.7222077982595633} {"step": 38850, "timestamp": 1778236528.579475, "eos/sharpness": 74.08699989318846, "eos/L0_probe": 2.0028817653656006, "eos/L_plus": 2.3137242794036865, "eos/L_minus": 2.4329092502593994, "eos/grad_norm": 0.23724447190761566, "eos/embed_grad_frac": 0.04523007944226265, "eos/time_s": 0.6029987335205078} {"step": 38850, "timestamp": 1778236528.5998597, "train/loss": 2.1504544258117675, "train/z_loss": 0.00141756342491135, "train/perplexity": 8.58876046495695, "train/grad_norm": 0.2373046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1912072.4276959067, "perf/iters_per_sec": 0.9117471827010664, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0967952728271484, "data/tokens_consumed": 81476452352, "data/tokens_consumed_B": 81.476452352, "train/loss_slope": -7.259214407253982e-06} {"step": 38850, "timestamp": 1778236529.9606562, "geo/rankme_last": 439.8737487792969, "geo/layer_0/stable_rank_q_proj": 19.203208923339844, "geo/layer_0/stable_rank_k_proj": 16.33547592163086, "geo/layer_0/stable_rank_o_proj": 49.30533981323242, "geo/layer_0/stable_rank_gate_proj": 138.06784057617188, "geo/layer_0/stable_rank_down_proj": 52.9652214050293, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05956396088004112, "geo/layer_0/attn_entropy_mean": 6.207253456115723, "geo/layer_0/attn_entropy_std": 0.3668198585510254, "geo/layer_7/stable_rank_q_proj": 42.94826889038086, "geo/layer_7/stable_rank_k_proj": 42.36100387573242, "geo/layer_7/stable_rank_o_proj": 99.14642333984375, "geo/layer_7/stable_rank_gate_proj": 90.5960464477539, "geo/layer_7/stable_rank_down_proj": 145.8794708251953, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4996562898159027, "geo/layer_7/attn_entropy_mean": 4.6367692947387695, "geo/layer_7/attn_entropy_std": 0.7995331287384033, "geo/layer_14/stable_rank_q_proj": 54.0717887878418, "geo/layer_14/stable_rank_k_proj": 36.821922302246094, "geo/layer_14/stable_rank_o_proj": 48.70746612548828, "geo/layer_14/stable_rank_gate_proj": 77.31824493408203, "geo/layer_14/stable_rank_down_proj": 133.6792755126953, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38861751556396484, "geo/layer_14/attn_entropy_mean": 5.481207370758057, "geo/layer_14/attn_entropy_std": 0.37909185886383057, "geo/layer_21/stable_rank_q_proj": 43.477474212646484, "geo/layer_21/stable_rank_k_proj": 30.758413314819336, "geo/layer_21/stable_rank_o_proj": 76.23485565185547, "geo/layer_21/stable_rank_gate_proj": 73.23905181884766, "geo/layer_21/stable_rank_down_proj": 55.75653076171875, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1462050825357437, "geo/layer_21/attn_entropy_mean": 5.719574928283691, "geo/layer_21/attn_entropy_std": 0.29027047753334045, "geo/layer_27/stable_rank_q_proj": 42.57401657104492, "geo/layer_27/stable_rank_k_proj": 31.434497833251953, "geo/layer_27/stable_rank_o_proj": 116.58199310302734, "geo/layer_27/stable_rank_gate_proj": 84.99039459228516, "geo/layer_27/stable_rank_down_proj": 132.0914306640625, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08218895643949509, "geo/layer_27/attn_entropy_mean": 4.274974822998047, "geo/layer_27/attn_entropy_std": 0.6479436755180359, "attnres/final_alpha/block_0": 0.24113374948501587, "attnres/block_norm/0": 1.7173213958740234, "attnres/final_alpha/block_1": 0.005324792582541704, "attnres/block_norm/1": 39605.125, "attnres/final_alpha/block_2": 0.011351138353347778, "attnres/block_norm/2": 26150.0703125, "attnres/final_alpha/block_3": 0.013297331519424915, "attnres/block_norm/3": 46089.6796875, "attnres/final_alpha/block_4": 0.01659843698143959, "attnres/block_norm/4": 12672.693359375, "attnres/final_alpha/block_5": 0.5926845669746399, "attnres/block_norm/5": 6006.0009765625, "attnres/final_alpha/block_6": 0.11960998177528381, "attnres/block_norm/6": 30377.609375, "geo/tier1_time_s": 1.3571009635925293, "geo/step": 38850.0, "geo/rankme_slope": -6.6353103741496576e-06} {"step": 38860, "timestamp": 1778236540.3083265, "train/loss": 2.182800316810608, "train/z_loss": 0.001402996655087918, "train/perplexity": 8.871113436616584, "train/grad_norm": 0.2109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1791685.230837746, "perf/iters_per_sec": 0.8543420938671809, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1704913139343263, "data/tokens_consumed": 81497423872, "data/tokens_consumed_B": 81.497423872, "train/loss_slope": -4.9112864238810035e-06} {"step": 38870, "timestamp": 1778236550.6587698, "train/loss": 2.1341896653175354, "train/z_loss": 0.0014363726950250566, "train/perplexity": 8.450196244643205, "train/grad_norm": 0.115234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027473.0344922852, "perf/iters_per_sec": 0.9667744801007677, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343673944473266, "data/tokens_consumed": 81518395392, "data/tokens_consumed_B": 81.518395392, "train/loss_slope": -9.241926494819402e-06} {"step": 38880, "timestamp": 1778236561.0158179, "train/loss": 2.1761513233184813, "train/z_loss": 0.001417054235935211, "train/perplexity": 8.812325119259825, "train/grad_norm": 0.1259765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025901.4664570831, "perf/iters_per_sec": 0.9660250980649391, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035169792175293, "data/tokens_consumed": 81539366912, "data/tokens_consumed_B": 81.539366912, "train/loss_slope": -7.812768347872205e-06} {"step": 38890, "timestamp": 1778236571.370735, "train/loss": 2.211977505683899, "train/z_loss": 0.0014217678341083228, "train/perplexity": 9.133760609410368, "train/grad_norm": 0.11474609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026357.0193230081, "perf/iters_per_sec": 0.966242322598938, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034937071800232, "data/tokens_consumed": 81560338432, "data/tokens_consumed_B": 81.560338432, "train/loss_slope": -3.369165804520024e-06} {"step": 38900, "timestamp": 1778236581.707647, "grad/layer_0/attn": 0.0026267028879374266, "grad/layer_0/mlp": 0.002617352642118931, "grad/layer_0/attn_mlp_ratio": 1.003572367479565, "grad/layer_4/attn": 0.0023944827262312174, "grad/layer_4/mlp": 0.0024677931796759367, "grad/layer_4/attn_mlp_ratio": 0.9702930735534111, "grad/layer_8/attn": 0.004337822087109089, "grad/layer_8/mlp": 0.0036011047195643187, "grad/layer_8/attn_mlp_ratio": 1.2045809007119939, "grad/layer_12/attn": 0.007878029718995094, "grad/layer_12/mlp": 0.0065050567500293255, "grad/layer_12/attn_mlp_ratio": 1.2110623935530371, "grad/layer_16/attn": 0.0032876317854970694, "grad/layer_16/mlp": 0.004405108280479908, "grad/layer_16/attn_mlp_ratio": 0.746322564971467, "grad/layer_20/attn": 0.0035946806892752647, "grad/layer_20/mlp": 0.005412288475781679, "grad/layer_20/attn_mlp_ratio": 0.6641701821592351, "grad/layer_24/attn": 0.004790916573256254, "grad/layer_24/mlp": 0.007957187481224537, "grad/layer_24/attn_mlp_ratio": 0.6020866699888677, "grad/layer_27/attn": 0.006457493174821138, "grad/layer_27/mlp": 0.006797441281378269, "grad/layer_27/attn_mlp_ratio": 0.9499887990960209} {"step": 38900, "timestamp": 1778236581.7234526, "train/loss": 2.098201274871826, "train/z_loss": 0.0014299066155217589, "train/perplexity": 8.151494420090293, "train/grad_norm": 0.0908203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026731.3317248994, "perf/iters_per_sec": 0.966420808660936, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347459316253662, "data/tokens_consumed": 81581309952, "data/tokens_consumed_B": 81.581309952, "train/loss_slope": -7.322043640063578e-06} {"step": 38910, "timestamp": 1778236592.4723885, "train/loss": 2.1819731473922728, "train/z_loss": 0.0014132131123915314, "train/perplexity": 8.86377855688755, "train/grad_norm": 0.1259765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1951983.8406491454, "perf/iters_per_sec": 0.9307784274335601, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0743695497512817, "data/tokens_consumed": 81602281472, "data/tokens_consumed_B": 81.602281472, "train/loss_slope": -8.111779252723974e-06} {"step": 38920, "timestamp": 1778236603.3049471, "train/loss": 2.201763129234314, "train/z_loss": 0.0014082178473472594, "train/perplexity": 9.040939800512824, "train/grad_norm": 0.0869140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1937006.8135647795, "perf/iters_per_sec": 0.9236368244003198, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.08267662525177, "data/tokens_consumed": 81623252992, "data/tokens_consumed_B": 81.623252992, "train/loss_slope": -5.241018081262274e-06} {"step": 38925, "timestamp": 1778236609.0889814, "eos/sharpness": 62.54940032958983, "eos/L0_probe": 2.0010039806365967, "eos/L_plus": 2.268702983856201, "eos/L_minus": 2.3587989807128906, "eos/grad_norm": 0.16203506290912628, "eos/embed_grad_frac": 0.08103276044130325, "eos/time_s": 0.612680196762085} {"step": 38925, "timestamp": 1778236610.4744086, "geo/rankme_last": 439.6334533691406, "geo/layer_0/stable_rank_q_proj": 19.18844223022461, "geo/layer_0/stable_rank_k_proj": 16.317264556884766, "geo/layer_0/stable_rank_o_proj": 49.37042999267578, "geo/layer_0/stable_rank_gate_proj": 137.76690673828125, "geo/layer_0/stable_rank_down_proj": 52.971595764160156, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0648484155535698, "geo/layer_0/attn_entropy_mean": 6.208123683929443, "geo/layer_0/attn_entropy_std": 0.36990875005722046, "geo/layer_7/stable_rank_q_proj": 42.936614990234375, "geo/layer_7/stable_rank_k_proj": 42.27035140991211, "geo/layer_7/stable_rank_o_proj": 99.07013702392578, "geo/layer_7/stable_rank_gate_proj": 90.41260528564453, "geo/layer_7/stable_rank_down_proj": 145.7027587890625, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5081062912940979, "geo/layer_7/attn_entropy_mean": 4.627910614013672, "geo/layer_7/attn_entropy_std": 0.8103640675544739, "geo/layer_14/stable_rank_q_proj": 54.124549865722656, "geo/layer_14/stable_rank_k_proj": 36.85207748413086, "geo/layer_14/stable_rank_o_proj": 48.70743179321289, "geo/layer_14/stable_rank_gate_proj": 77.41295623779297, "geo/layer_14/stable_rank_down_proj": 133.5687255859375, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38706332445144653, "geo/layer_14/attn_entropy_mean": 5.520028114318848, "geo/layer_14/attn_entropy_std": 0.37124061584472656, "geo/layer_21/stable_rank_q_proj": 43.5003776550293, "geo/layer_21/stable_rank_k_proj": 30.6992130279541, "geo/layer_21/stable_rank_o_proj": 76.19111633300781, "geo/layer_21/stable_rank_gate_proj": 73.20536041259766, "geo/layer_21/stable_rank_down_proj": 55.67744445800781, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14453518390655518, "geo/layer_21/attn_entropy_mean": 5.728989124298096, "geo/layer_21/attn_entropy_std": 0.2856716513633728, "geo/layer_27/stable_rank_q_proj": 42.62809753417969, "geo/layer_27/stable_rank_k_proj": 31.489587783813477, "geo/layer_27/stable_rank_o_proj": 116.6510238647461, "geo/layer_27/stable_rank_gate_proj": 84.92664337158203, "geo/layer_27/stable_rank_down_proj": 131.8988494873047, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0845930352807045, "geo/layer_27/attn_entropy_mean": 4.26474142074585, "geo/layer_27/attn_entropy_std": 0.6737774014472961, "attnres/final_alpha/block_0": 0.24097727239131927, "attnres/block_norm/0": 1.71750807762146, "attnres/final_alpha/block_1": 0.005326736718416214, "attnres/block_norm/1": 39811.4296875, "attnres/final_alpha/block_2": 0.011436091735959053, "attnres/block_norm/2": 26163.5859375, "attnres/final_alpha/block_3": 0.013442562893033028, "attnres/block_norm/3": 46220.3359375, "attnres/final_alpha/block_4": 0.01667650043964386, "attnres/block_norm/4": 12693.0498046875, "attnres/final_alpha/block_5": 0.5926538109779358, "attnres/block_norm/5": 5979.859375, "attnres/final_alpha/block_6": 0.11948704719543457, "attnres/block_norm/6": 30491.267578125, "geo/tier1_time_s": 1.3654429912567139, "geo/step": 38925.0, "geo/rankme_slope": -1.0025318721238495e-05} {"step": 38930, "timestamp": 1778236615.6527982, "train/loss": 2.170265316963196, "train/z_loss": 0.0014001043513417245, "train/perplexity": 8.760608070353937, "train/grad_norm": 0.1962890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1699123.0682546513, "perf/iters_per_sec": 0.8102050153039223, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2342555046081543, "data/tokens_consumed": 81644224512, "data/tokens_consumed_B": 81.644224512, "train/loss_slope": -6.096551699428155e-06} {"step": 38940, "timestamp": 1778236626.005724, "train/loss": 2.130715584754944, "train/z_loss": 0.0014143796870484948, "train/perplexity": 8.420890516824107, "train/grad_norm": 0.1630859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026845.8895195872, "perf/iters_per_sec": 0.966475434074205, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346874475479126, "data/tokens_consumed": 81665196032, "data/tokens_consumed_B": 81.665196032, "train/loss_slope": -8.512216046376618e-06} {"step": 38950, "timestamp": 1778236636.3500516, "grad/layer_0/attn": 0.003871928434818983, "grad/layer_0/mlp": 0.0031952487770467997, "grad/layer_0/attn_mlp_ratio": 1.2117767922971943, "grad/layer_4/attn": 0.0023510134778916836, "grad/layer_4/mlp": 0.0025809379294514656, "grad/layer_4/attn_mlp_ratio": 0.9109143462818251, "grad/layer_8/attn": 0.004980751778930426, "grad/layer_8/mlp": 0.004139816854149103, "grad/layer_8/attn_mlp_ratio": 1.2031333351438405, "grad/layer_12/attn": 0.006238837726414204, "grad/layer_12/mlp": 0.006416881922632456, "grad/layer_12/attn_mlp_ratio": 0.9722537681711637, "grad/layer_16/attn": 0.0037047613877803087, "grad/layer_16/mlp": 0.004524803254753351, "grad/layer_16/attn_mlp_ratio": 0.818767379998607, "grad/layer_20/attn": 0.004478820599615574, "grad/layer_20/mlp": 0.006099503953009844, "grad/layer_20/attn_mlp_ratio": 0.7342925852152632, "grad/layer_24/attn": 0.009189593605697155, "grad/layer_24/mlp": 0.010713323019444942, "grad/layer_24/attn_mlp_ratio": 0.8577724673512198, "grad/layer_27/attn": 0.012452959083020687, "grad/layer_27/mlp": 0.008786127902567387, "grad/layer_27/attn_mlp_ratio": 1.4173432346287034} {"step": 38950, "timestamp": 1778236636.3659148, "train/loss": 2.201969575881958, "train/z_loss": 0.0014255471643991769, "train/perplexity": 9.042806464902865, "train/grad_norm": 0.12109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025503.3463531262, "perf/iters_per_sec": 0.9658352596059447, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0353732585906983, "data/tokens_consumed": 81686167552, "data/tokens_consumed_B": 81.686167552, "train/loss_slope": -1.0690660087546613e-05} {"step": 38960, "timestamp": 1778236646.7164905, "train/loss": 2.1578194737434386, "train/z_loss": 0.001419892709236592, "train/perplexity": 8.652250614456111, "train/grad_norm": 0.2392578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027331.9584547929, "perf/iters_per_sec": 0.9667072098039593, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344393730163575, "data/tokens_consumed": 81707139072, "data/tokens_consumed_B": 81.707139072, "train/loss_slope": -1.0644170174253981e-05} {"step": 38970, "timestamp": 1778236657.594632, "train/loss": 2.1642935991287233, "train/z_loss": 0.001419256255030632, "train/perplexity": 8.708448088255187, "train/grad_norm": 0.1259765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1928950.1282867654, "perf/iters_per_sec": 0.9197950974878146, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0871986627578736, "data/tokens_consumed": 81728110592, "data/tokens_consumed_B": 81.728110592, "train/loss_slope": -1.269081075235138e-05} {"step": 38980, "timestamp": 1778236667.9513733, "train/loss": 2.174119973182678, "train/z_loss": 0.0014355705585330724, "train/perplexity": 8.794442370643246, "train/grad_norm": 0.09912109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026404.3551770088, "perf/iters_per_sec": 0.9662648940930408, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034912896156311, "data/tokens_consumed": 81749082112, "data/tokens_consumed_B": 81.749082112, "train/loss_slope": -1.3238333403939052e-05} {"step": 38990, "timestamp": 1778236678.7188902, "train/loss": 2.1349376440048218, "train/z_loss": 0.0014303216710686683, "train/perplexity": 8.45651917575165, "train/grad_norm": 0.146484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1948725.6236206088, "perf/iters_per_sec": 0.9292247884848637, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0761658668518066, "data/tokens_consumed": 81770053632, "data/tokens_consumed_B": 81.770053632, "train/loss_slope": -1.3824635048439128e-05} {"step": 39000, "timestamp": 1778236689.073685, "grad/layer_0/attn": 0.00288918218575418, "grad/layer_0/mlp": 0.002924637869000435, "grad/layer_0/attn_mlp_ratio": 0.9878768642060771, "grad/layer_4/attn": 0.002222458366304636, "grad/layer_4/mlp": 0.0024837309028953314, "grad/layer_4/attn_mlp_ratio": 0.8948063875330601, "grad/layer_8/attn": 0.004312427248805761, "grad/layer_8/mlp": 0.00368491024710238, "grad/layer_8/attn_mlp_ratio": 1.1702936686632899, "grad/layer_12/attn": 0.0040233563631772995, "grad/layer_12/mlp": 0.006173364818096161, "grad/layer_12/attn_mlp_ratio": 0.6517282578555368, "grad/layer_16/attn": 0.006860972382128239, "grad/layer_16/mlp": 0.004413137212395668, "grad/layer_16/attn_mlp_ratio": 1.5546700445637769, "grad/layer_20/attn": 0.0062920707277953625, "grad/layer_20/mlp": 0.0058394563384354115, "grad/layer_20/attn_mlp_ratio": 1.0775096610672246, "grad/layer_24/attn": 0.01084109116345644, "grad/layer_24/mlp": 0.010574792511761189, "grad/layer_24/attn_mlp_ratio": 1.0251823899978025, "grad/layer_27/attn": 0.006826132070273161, "grad/layer_27/mlp": 0.01025089155882597, "grad/layer_27/attn_mlp_ratio": 0.6659061765027917} {"step": 39000, "timestamp": 1778236689.6774724, "eos/sharpness": 50.53663253784179, "eos/L0_probe": 2.0037496089935303, "eos/L_plus": 2.2412445545196533, "eos/L_minus": 2.271620988845825, "eos/grad_norm": 0.16055545210838318, "eos/embed_grad_frac": 0.09760235249996185, "eos/time_s": 0.6009604930877686} {"step": 39000, "timestamp": 1778236689.696697, "train/loss": 2.214596748352051, "train/z_loss": 0.0014214507187716662, "train/perplexity": 9.15771550306393, "train/grad_norm": 0.16015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1911517.3750024068, "perf/iters_per_sec": 0.9114825129520449, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0971137523651122, "data/tokens_consumed": 81791025152, "data/tokens_consumed_B": 81.791025152, "train/loss_slope": -1.0934206053833729e-05} {"step": 39000, "timestamp": 1778236691.0619366, "geo/rankme_last": 439.5890197753906, "geo/layer_0/stable_rank_q_proj": 19.218753814697266, "geo/layer_0/stable_rank_k_proj": 16.350008010864258, "geo/layer_0/stable_rank_o_proj": 49.30712127685547, "geo/layer_0/stable_rank_gate_proj": 138.0293731689453, "geo/layer_0/stable_rank_down_proj": 52.909000396728516, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.057572729885578156, "geo/layer_0/attn_entropy_mean": 6.203606605529785, "geo/layer_0/attn_entropy_std": 0.3726079761981964, "geo/layer_7/stable_rank_q_proj": 42.80585479736328, "geo/layer_7/stable_rank_k_proj": 42.21208572387695, "geo/layer_7/stable_rank_o_proj": 98.90440368652344, "geo/layer_7/stable_rank_gate_proj": 90.20853424072266, "geo/layer_7/stable_rank_down_proj": 146.11007690429688, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.496795654296875, "geo/layer_7/attn_entropy_mean": 4.606932163238525, "geo/layer_7/attn_entropy_std": 0.8359630107879639, "geo/layer_14/stable_rank_q_proj": 54.208499908447266, "geo/layer_14/stable_rank_k_proj": 36.9119873046875, "geo/layer_14/stable_rank_o_proj": 48.7669677734375, "geo/layer_14/stable_rank_gate_proj": 77.39391326904297, "geo/layer_14/stable_rank_down_proj": 133.71096801757812, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3794000744819641, "geo/layer_14/attn_entropy_mean": 5.501633644104004, "geo/layer_14/attn_entropy_std": 0.36609068512916565, "geo/layer_21/stable_rank_q_proj": 43.42860794067383, "geo/layer_21/stable_rank_k_proj": 30.709596633911133, "geo/layer_21/stable_rank_o_proj": 76.20048522949219, "geo/layer_21/stable_rank_gate_proj": 73.3641128540039, "geo/layer_21/stable_rank_down_proj": 55.596317291259766, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14155909419059753, "geo/layer_21/attn_entropy_mean": 5.729222297668457, "geo/layer_21/attn_entropy_std": 0.27669692039489746, "geo/layer_27/stable_rank_q_proj": 42.612518310546875, "geo/layer_27/stable_rank_k_proj": 31.448625564575195, "geo/layer_27/stable_rank_o_proj": 116.72040557861328, "geo/layer_27/stable_rank_gate_proj": 84.86946105957031, "geo/layer_27/stable_rank_down_proj": 131.86526489257812, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08105259388685226, "geo/layer_27/attn_entropy_mean": 4.270421981811523, "geo/layer_27/attn_entropy_std": 0.6544432640075684, "attnres/final_alpha/block_0": 0.24264198541641235, "attnres/block_norm/0": 1.7178494930267334, "attnres/final_alpha/block_1": 0.005342152900993824, "attnres/block_norm/1": 39718.61328125, "attnres/final_alpha/block_2": 0.011543757282197475, "attnres/block_norm/2": 26178.68359375, "attnres/final_alpha/block_3": 0.01347142830491066, "attnres/block_norm/3": 46437.68359375, "attnres/final_alpha/block_4": 0.016910944133996964, "attnres/block_norm/4": 12737.6171875, "attnres/final_alpha/block_5": 0.5910872220993042, "attnres/block_norm/5": 5995.544921875, "attnres/final_alpha/block_6": 0.11900251358747482, "attnres/block_norm/6": 30414.275390625, "geo/tier1_time_s": 1.3617987632751465, "geo/step": 39000.0, "geo/rankme_slope": 4.0317689575830314e-06} {"step": 39000, "timestamp": 1778236697.957613, "geo/ww_alpha_mean": 7.579861537848562, "geo/ww_alpha_std": 4.348835142652437, "geo/ww_alpha_min": 1.3630417221676545, "geo/ww_alpha_max": 26.081182209191613, "geo/ww_alpha_healthy_frac": 0.18274111675126903, "geo/ww_alpha_by_type/q_proj": 4.0347930956839475, "geo/ww_alpha_by_type/k_proj": 4.527924871206751, "geo/ww_alpha_by_type/v_proj": 8.595294904517091, "geo/ww_alpha_by_type/o_proj": 8.018989206204866, "geo/ww_alpha_by_type/gate_proj": 8.280660592294035, "geo/ww_alpha_by_type/up_proj": 11.411581280921173, "geo/ww_alpha_by_type/down_proj": 8.292414217188053, "geo/twonn_id/layer_0": 0.7441140413284302, "geo/twonn_id/layer_7": 3.2577483654022217, "geo/twonn_id/layer_14": 4.662929058074951, "geo/twonn_id/layer_21": 7.342471122741699, "geo/twonn_id/layer_27": 6.468634128570557, "geo/tier2_time_s": 6.889519214630127} {"step": 39000, "timestamp": 1778236698.606313, "eoc/jacobian_sigma/layer_0/attn": 1025.130126953125, "eoc/jacobian_sigma/layer_0/mlp": 9326.3623046875, "eoc/jacobian_sigma/layer_0": 9326.3623046875, "eoc/jacobian_sigma/layer_7/attn": 1.132874608039856, "eoc/jacobian_sigma/layer_7/mlp": 1.702884316444397, "eoc/jacobian_sigma/layer_7": 1.702884316444397, "eoc/jacobian_sigma/layer_14/attn": 1.6382193565368652, "eoc/jacobian_sigma/layer_14/mlp": 6.43255090713501, "eoc/jacobian_sigma/layer_14": 6.43255090713501, "eoc/jacobian_sigma/layer_21/attn": 1.0922791957855225, "eoc/jacobian_sigma/layer_21/mlp": 3.7660880088806152, "eoc/jacobian_sigma/layer_21": 3.7660880088806152, "eoc/jacobian_sigma/layer_27/attn": 4.3013811111450195, "eoc/jacobian_sigma/layer_27/mlp": 29.16370391845703, "eoc/jacobian_sigma/layer_27": 29.16370391845703, "eoc/layer0_sigma": 9326.3623046875, "eoc/sigma_max": 29.16370391845703, "eoc/sigma_min": 1.702884316444397, "eoc/sigma_mean": 10.266306787729263, "eoc/time_s": 0.640718936920166} {"step": 39010, "timestamp": 1778236708.983185, "train/loss": 2.165980815887451, "train/z_loss": 0.0014345127972774207, "train/perplexity": 8.723153529957546, "train/grad_norm": 0.1123046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1087557.938610872, "perf/iters_per_sec": 0.5185880368284569, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.92831289768219, "data/tokens_consumed": 81811996672, "data/tokens_consumed_B": 81.811996672, "train/loss_slope": -1.2972705993000011e-05} {"step": 39020, "timestamp": 1778236719.3413591, "train/loss": 2.2305665493011473, "train/z_loss": 0.0014212359441444279, "train/perplexity": 9.305136404920304, "train/grad_norm": 0.236328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025997.9643975392, "perf/iters_per_sec": 0.9660711118686386, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351204872131348, "data/tokens_consumed": 81832968192, "data/tokens_consumed_B": 81.832968192, "train/loss_slope": -1.130557138212378e-05} {"step": 39030, "timestamp": 1778236729.6996, "train/loss": 2.1964475393295286, "train/z_loss": 0.0014144805609248579, "train/perplexity": 8.993009374282458, "train/grad_norm": 0.1396484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025857.3268464208, "perf/iters_per_sec": 0.9660040506584266, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035192346572876, "data/tokens_consumed": 81853939712, "data/tokens_consumed_B": 81.853939712, "train/loss_slope": -1.1082100424722266e-05} {"step": 39040, "timestamp": 1778236740.069084, "train/loss": 2.155160117149353, "train/z_loss": 0.001430468470789492, "train/perplexity": 8.629271762753815, "train/grad_norm": 0.1826171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023562.1075871228, "perf/iters_per_sec": 0.9649096048293699, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0363665103912354, "data/tokens_consumed": 81874911232, "data/tokens_consumed_B": 81.874911232, "train/loss_slope": -1.0634122329755495e-05} {"step": 39050, "timestamp": 1778236750.409517, "grad/layer_0/attn": 0.002521788002923131, "grad/layer_0/mlp": 0.002684572711586952, "grad/layer_0/attn_mlp_ratio": 0.9393628632603215, "grad/layer_4/attn": 0.003620747709646821, "grad/layer_4/mlp": 0.0023759484756737947, "grad/layer_4/attn_mlp_ratio": 1.5239166986684525, "grad/layer_8/attn": 0.005155079998075962, "grad/layer_8/mlp": 0.003679939778521657, "grad/layer_8/attn_mlp_ratio": 1.4008598423479988, "grad/layer_12/attn": 0.003970615565776825, "grad/layer_12/mlp": 0.00631952378898859, "grad/layer_12/attn_mlp_ratio": 0.6283092896753498, "grad/layer_16/attn": 0.0031550235580652952, "grad/layer_16/mlp": 0.004219052381813526, "grad/layer_16/attn_mlp_ratio": 0.747803818906072, "grad/layer_20/attn": 0.004634444136172533, "grad/layer_20/mlp": 0.006139915902167559, "grad/layer_20/attn_mlp_ratio": 0.754805787984143, "grad/layer_24/attn": 0.010814063251018524, "grad/layer_24/mlp": 0.009387670084834099, "grad/layer_24/attn_mlp_ratio": 1.1519432445005133, "grad/layer_27/attn": 0.004194601438939571, "grad/layer_27/mlp": 0.00872074719518423, "grad/layer_27/attn_mlp_ratio": 0.48099105466063824} {"step": 39050, "timestamp": 1778236750.425274, "train/loss": 2.198891615867615, "train/z_loss": 0.0014250955311581493, "train/perplexity": 9.015015859313143, "train/grad_norm": 0.1357421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026257.4531917823, "perf/iters_per_sec": 0.9661948457678711, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349879264831543, "data/tokens_consumed": 81895882752, "data/tokens_consumed_B": 81.895882752, "train/loss_slope": -9.598206422223888e-06} {"step": 39060, "timestamp": 1778236760.774821, "train/loss": 2.180688500404358, "train/z_loss": 0.001420811377465725, "train/perplexity": 8.852399041358058, "train/grad_norm": 0.25390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027640.8185708728, "perf/iters_per_sec": 0.9668544857839931, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342818021774292, "data/tokens_consumed": 81916854272, "data/tokens_consumed_B": 81.916854272, "train/loss_slope": -9.163029666709597e-06} {"step": 39070, "timestamp": 1778236771.1281445, "train/loss": 2.150921034812927, "train/z_loss": 0.0014152846415527165, "train/perplexity": 8.592768993033614, "train/grad_norm": 0.2080078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027018.6613734292, "perf/iters_per_sec": 0.9665578181140085, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345992565155029, "data/tokens_consumed": 81937825792, "data/tokens_consumed_B": 81.937825792, "train/loss_slope": -1.0190138550731847e-05} {"step": 39075, "timestamp": 1778236776.9235742, "eos/sharpness": 42.39957332611083, "eos/L0_probe": 2.002393960952759, "eos/L_plus": 2.2191081047058105, "eos/L_minus": 2.2096755504608154, "eos/grad_norm": 0.1692776381969452, "eos/embed_grad_frac": 0.09376857429742813, "eos/time_s": 0.62314772605896} {"step": 39075, "timestamp": 1778236778.3030844, "geo/rankme_last": 439.6737060546875, "geo/layer_0/stable_rank_q_proj": 19.225879669189453, "geo/layer_0/stable_rank_k_proj": 16.351987838745117, "geo/layer_0/stable_rank_o_proj": 49.26914596557617, "geo/layer_0/stable_rank_gate_proj": 137.8399658203125, "geo/layer_0/stable_rank_down_proj": 52.899654388427734, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05731970816850662, "geo/layer_0/attn_entropy_mean": 6.2035956382751465, "geo/layer_0/attn_entropy_std": 0.37289103865623474, "geo/layer_7/stable_rank_q_proj": 42.76976013183594, "geo/layer_7/stable_rank_k_proj": 42.30127716064453, "geo/layer_7/stable_rank_o_proj": 98.87995910644531, "geo/layer_7/stable_rank_gate_proj": 90.35240173339844, "geo/layer_7/stable_rank_down_proj": 146.52017211914062, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5209529995918274, "geo/layer_7/attn_entropy_mean": 4.629209518432617, "geo/layer_7/attn_entropy_std": 0.834112823009491, "geo/layer_14/stable_rank_q_proj": 54.16297149658203, "geo/layer_14/stable_rank_k_proj": 36.93193054199219, "geo/layer_14/stable_rank_o_proj": 48.6704216003418, "geo/layer_14/stable_rank_gate_proj": 77.4423828125, "geo/layer_14/stable_rank_down_proj": 134.14244079589844, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.40048539638519287, "geo/layer_14/attn_entropy_mean": 5.458481788635254, "geo/layer_14/attn_entropy_std": 0.3812096416950226, "geo/layer_21/stable_rank_q_proj": 43.43688201904297, "geo/layer_21/stable_rank_k_proj": 30.803895950317383, "geo/layer_21/stable_rank_o_proj": 76.10688018798828, "geo/layer_21/stable_rank_gate_proj": 73.43991088867188, "geo/layer_21/stable_rank_down_proj": 55.46258544921875, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14378638565540314, "geo/layer_21/attn_entropy_mean": 5.718072891235352, "geo/layer_21/attn_entropy_std": 0.2864287495613098, "geo/layer_27/stable_rank_q_proj": 42.694583892822266, "geo/layer_27/stable_rank_k_proj": 31.461610794067383, "geo/layer_27/stable_rank_o_proj": 116.4481201171875, "geo/layer_27/stable_rank_gate_proj": 84.89600372314453, "geo/layer_27/stable_rank_down_proj": 132.0251922607422, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08655264228582382, "geo/layer_27/attn_entropy_mean": 4.256982803344727, "geo/layer_27/attn_entropy_std": 0.6433738470077515, "attnres/final_alpha/block_0": 0.23915070295333862, "attnres/block_norm/0": 1.7179155349731445, "attnres/final_alpha/block_1": 0.005211616866290569, "attnres/block_norm/1": 39884.9609375, "attnres/final_alpha/block_2": 0.011278828606009483, "attnres/block_norm/2": 26284.2734375, "attnres/final_alpha/block_3": 0.013161132112145424, "attnres/block_norm/3": 46623.3125, "attnres/final_alpha/block_4": 0.016643032431602478, "attnres/block_norm/4": 12687.888671875, "attnres/final_alpha/block_5": 0.5985822081565857, "attnres/block_norm/5": 5954.115234375, "attnres/final_alpha/block_6": 0.11597248166799545, "attnres/block_norm/6": 30368.09765625, "geo/tier1_time_s": 1.3580565452575684, "geo/step": 39075.0, "geo/rankme_slope": 9.495536495848343e-06} {"step": 39080, "timestamp": 1778236783.481484, "train/loss": 2.1396203517913817, "train/z_loss": 0.0014303790987469256, "train/perplexity": 8.496211445062258, "train/grad_norm": 0.10986328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1698327.7792413482, "perf/iters_per_sec": 0.8098257919508687, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2348334789276123, "data/tokens_consumed": 81958797312, "data/tokens_consumed_B": 81.958797312, "train/loss_slope": -1.4552205831411645e-05} {"step": 39090, "timestamp": 1778236793.8394663, "train/loss": 2.2059116125106812, "train/z_loss": 0.0014072095626033842, "train/perplexity": 9.078523892665205, "train/grad_norm": 0.1865234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026215.6317998788, "perf/iters_per_sec": 0.9661749037742037, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350092887878417, "data/tokens_consumed": 81979768832, "data/tokens_consumed_B": 81.979768832, "train/loss_slope": -1.5406763499969933e-05} {"step": 39100, "timestamp": 1778236804.1851609, "grad/layer_0/attn": 0.0026578442193567753, "grad/layer_0/mlp": 0.0027970157098025084, "grad/layer_0/attn_mlp_ratio": 0.9502428302485857, "grad/layer_4/attn": 0.0019374056719243526, "grad/layer_4/mlp": 0.0025756265968084335, "grad/layer_4/attn_mlp_ratio": 0.7522074819014234, "grad/layer_8/attn": 0.003785737557336688, "grad/layer_8/mlp": 0.0035232866648584604, "grad/layer_8/attn_mlp_ratio": 1.0744903296251478, "grad/layer_12/attn": 0.005606427788734436, "grad/layer_12/mlp": 0.006175563205033541, "grad/layer_12/attn_mlp_ratio": 0.9078407121443939, "grad/layer_16/attn": 0.005331715103238821, "grad/layer_16/mlp": 0.0044301594607532024, "grad/layer_16/attn_mlp_ratio": 1.203504079282494, "grad/layer_20/attn": 0.005968847777694464, "grad/layer_20/mlp": 0.005909004248678684, "grad/layer_20/attn_mlp_ratio": 1.0101274978802752, "grad/layer_24/attn": 0.01437317579984665, "grad/layer_24/mlp": 0.013025823049247265, "grad/layer_24/attn_mlp_ratio": 1.1034370446429134, "grad/layer_27/attn": 0.005558433011174202, "grad/layer_27/mlp": 0.012049855664372444, "grad/layer_27/attn_mlp_ratio": 0.4612862693019699} {"step": 39100, "timestamp": 1778236804.2010708, "train/loss": 2.2031890869140627, "train/z_loss": 0.0014201788348145784, "train/perplexity": 9.05384099414552, "train/grad_norm": 0.1572265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025122.7268049002, "perf/iters_per_sec": 0.9656537660622121, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035567855834961, "data/tokens_consumed": 82000740352, "data/tokens_consumed_B": 82.000740352, "train/loss_slope": -1.2509253723929748e-05} {"step": 39110, "timestamp": 1778236814.580687, "train/loss": 2.2411604166030883, "train/z_loss": 0.0013945490005426108, "train/perplexity": 9.404237791847574, "train/grad_norm": 0.1669921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023725.473593112, "perf/iters_per_sec": 0.9649875038114129, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0362828493118286, "data/tokens_consumed": 82021711872, "data/tokens_consumed_B": 82.021711872, "train/loss_slope": -7.813929774687588e-06} {"step": 39120, "timestamp": 1778236824.9367664, "train/loss": 2.1644341945648193, "train/z_loss": 0.001424653606954962, "train/perplexity": 8.709672542386189, "train/grad_norm": 0.255859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026102.5919898665, "perf/iters_per_sec": 0.9661210021924336, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350670337677002, "data/tokens_consumed": 82042683392, "data/tokens_consumed_B": 82.042683392, "train/loss_slope": -6.225463409091979e-06} {"step": 39130, "timestamp": 1778236835.2890446, "train/loss": 2.155538010597229, "train/z_loss": 0.0014186894986778498, "train/perplexity": 8.632533324235444, "train/grad_norm": 0.099609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026960.506974424, "perf/iters_per_sec": 0.9665300879356499, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034628939628601, "data/tokens_consumed": 82063654912, "data/tokens_consumed_B": 82.063654912, "train/loss_slope": -6.372498071054273e-06} {"step": 39140, "timestamp": 1778236845.6690605, "train/loss": 2.1781858444213866, "train/z_loss": 0.0014195567113347352, "train/perplexity": 8.830272231379123, "train/grad_norm": 0.09326171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021724.7663234046, "perf/iters_per_sec": 0.9640334922425292, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037308359146118, "data/tokens_consumed": 82084626432, "data/tokens_consumed_B": 82.084626432, "train/loss_slope": -4.712685464274809e-06} {"step": 39150, "timestamp": 1778236856.00804, "grad/layer_0/attn": 0.0024421371053904295, "grad/layer_0/mlp": 0.00263391830958426, "grad/layer_0/attn_mlp_ratio": 0.9271878341045114, "grad/layer_4/attn": 0.0020724916830658913, "grad/layer_4/mlp": 0.0024048914201557636, "grad/layer_4/attn_mlp_ratio": 0.8617817750597153, "grad/layer_8/attn": 0.0029796771705150604, "grad/layer_8/mlp": 0.0034077907912433147, "grad/layer_8/attn_mlp_ratio": 0.874372068477459, "grad/layer_12/attn": 0.003828255459666252, "grad/layer_12/mlp": 0.0061809890903532505, "grad/layer_12/attn_mlp_ratio": 0.6193596755744307, "grad/layer_16/attn": 0.004346172325313091, "grad/layer_16/mlp": 0.004318662919104099, "grad/layer_16/attn_mlp_ratio": 1.0063698663422689, "grad/layer_20/attn": 0.003502055536955595, "grad/layer_20/mlp": 0.005200324114412069, "grad/layer_20/attn_mlp_ratio": 0.6734302309940738, "grad/layer_24/attn": 0.007233908865600824, "grad/layer_24/mlp": 0.007861251942813396, "grad/layer_24/attn_mlp_ratio": 0.9201980582996221, "grad/layer_27/attn": 0.005526276770979166, "grad/layer_27/mlp": 0.006826724391430616, "grad/layer_27/attn_mlp_ratio": 0.8095063420116245} {"step": 39150, "timestamp": 1778236856.6210544, "eos/sharpness": 6.662940979003905, "eos/L0_probe": 2.0012030601501465, "eos/L_plus": 2.03826904296875, "eos/L_minus": 2.030766487121582, "eos/grad_norm": 0.09055197238922119, "eos/embed_grad_frac": 0.2796943783760071, "eos/time_s": 0.610215425491333} {"step": 39150, "timestamp": 1778236856.6408114, "train/loss": 2.1441309452056885, "train/z_loss": 0.001424159517046064, "train/perplexity": 8.534620960184332, "train/grad_norm": 0.09033203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1912130.8686716931, "perf/iters_per_sec": 0.911775049529883, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0967617511749268, "data/tokens_consumed": 82105597952, "data/tokens_consumed_B": 82.105597952, "train/loss_slope": -6.2576849253873055e-06} {"step": 39150, "timestamp": 1778236858.0040822, "geo/rankme_last": 440.5547180175781, "geo/layer_0/stable_rank_q_proj": 19.20332908630371, "geo/layer_0/stable_rank_k_proj": 16.35714340209961, "geo/layer_0/stable_rank_o_proj": 49.27737808227539, "geo/layer_0/stable_rank_gate_proj": 137.5081024169922, "geo/layer_0/stable_rank_down_proj": 52.9908332824707, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05863214284181595, "geo/layer_0/attn_entropy_mean": 6.200794219970703, "geo/layer_0/attn_entropy_std": 0.3752327263355255, "geo/layer_7/stable_rank_q_proj": 42.94725799560547, "geo/layer_7/stable_rank_k_proj": 42.37686538696289, "geo/layer_7/stable_rank_o_proj": 99.03170776367188, "geo/layer_7/stable_rank_gate_proj": 90.28458404541016, "geo/layer_7/stable_rank_down_proj": 146.30030822753906, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5030023455619812, "geo/layer_7/attn_entropy_mean": 4.643651008605957, "geo/layer_7/attn_entropy_std": 0.8249899744987488, "geo/layer_14/stable_rank_q_proj": 54.1710205078125, "geo/layer_14/stable_rank_k_proj": 36.968406677246094, "geo/layer_14/stable_rank_o_proj": 48.62336349487305, "geo/layer_14/stable_rank_gate_proj": 77.47705078125, "geo/layer_14/stable_rank_down_proj": 134.40335083007812, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3814769983291626, "geo/layer_14/attn_entropy_mean": 5.511951446533203, "geo/layer_14/attn_entropy_std": 0.36979779601097107, "geo/layer_21/stable_rank_q_proj": 43.45993423461914, "geo/layer_21/stable_rank_k_proj": 30.8351993560791, "geo/layer_21/stable_rank_o_proj": 76.18245697021484, "geo/layer_21/stable_rank_gate_proj": 73.55374908447266, "geo/layer_21/stable_rank_down_proj": 55.52888107299805, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1483219712972641, "geo/layer_21/attn_entropy_mean": 5.706228256225586, "geo/layer_21/attn_entropy_std": 0.2829136252403259, "geo/layer_27/stable_rank_q_proj": 42.61231994628906, "geo/layer_27/stable_rank_k_proj": 31.350954055786133, "geo/layer_27/stable_rank_o_proj": 116.41124725341797, "geo/layer_27/stable_rank_gate_proj": 84.80447387695312, "geo/layer_27/stable_rank_down_proj": 132.14276123046875, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08741980046033859, "geo/layer_27/attn_entropy_mean": 4.278740406036377, "geo/layer_27/attn_entropy_std": 0.6580243706703186, "attnres/final_alpha/block_0": 0.2405206859111786, "attnres/block_norm/0": 1.7180595397949219, "attnres/final_alpha/block_1": 0.005290976259857416, "attnres/block_norm/1": 39759.328125, "attnres/final_alpha/block_2": 0.011543240398168564, "attnres/block_norm/2": 26299.642578125, "attnres/final_alpha/block_3": 0.013372361660003662, "attnres/block_norm/3": 46436.0234375, "attnres/final_alpha/block_4": 0.01653294824063778, "attnres/block_norm/4": 12706.5615234375, "attnres/final_alpha/block_5": 0.5956852436065674, "attnres/block_norm/5": 5964.67041015625, "attnres/final_alpha/block_6": 0.1170545443892479, "attnres/block_norm/6": 30285.29296875, "geo/tier1_time_s": 1.3588447570800781, "geo/step": 39150.0, "geo/rankme_slope": 8.639277976815727e-05} {"step": 39160, "timestamp": 1778236868.358667, "train/loss": 2.1942189216613768, "train/z_loss": 0.0014075946412049234, "train/perplexity": 8.972989711074677, "train/grad_norm": 0.275390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1790348.1030485462, "perf/iters_per_sec": 0.8537045016520244, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.17136549949646, "data/tokens_consumed": 82126569472, "data/tokens_consumed_B": 82.126569472, "train/loss_slope": -4.726557906168358e-06} {"step": 39170, "timestamp": 1778236878.7179208, "train/loss": 2.1898272752761843, "train/z_loss": 0.0014252933789975942, "train/perplexity": 8.93366991575942, "train/grad_norm": 0.2431640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025772.8323862285, "perf/iters_per_sec": 0.9659637605601447, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352355241775513, "data/tokens_consumed": 82147540992, "data/tokens_consumed_B": 82.147540992, "train/loss_slope": -2.2731038138490125e-06} {"step": 39180, "timestamp": 1778236889.070899, "train/loss": 2.1456634402275085, "train/z_loss": 0.0014388147043064236, "train/perplexity": 8.547710251394017, "train/grad_norm": 0.09423828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026735.3944989739, "perf/iters_per_sec": 0.9664227459425802, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034743857383728, "data/tokens_consumed": 82168512512, "data/tokens_consumed_B": 82.168512512, "train/loss_slope": -5.12688868831957e-06} {"step": 39190, "timestamp": 1778236899.4368362, "train/loss": 2.1997865438461304, "train/z_loss": 0.0014227574225515126, "train/perplexity": 9.023087260355016, "train/grad_norm": 0.22265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024264.6428333735, "perf/iters_per_sec": 0.9652445997397296, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0360068321228026, "data/tokens_consumed": 82189484032, "data/tokens_consumed_B": 82.189484032, "train/loss_slope": 6.020141370363061e-08} {"step": 39200, "timestamp": 1778236909.7824266, "grad/layer_0/attn": 0.0028276522643864155, "grad/layer_0/mlp": 0.0028949035331606865, "grad/layer_0/attn_mlp_ratio": 0.9767690475068266, "grad/layer_4/attn": 0.0027316035702824593, "grad/layer_4/mlp": 0.0025404721964150667, "grad/layer_4/attn_mlp_ratio": 1.0752345436465114, "grad/layer_8/attn": 0.0037652519531548023, "grad/layer_8/mlp": 0.0038427477702498436, "grad/layer_8/attn_mlp_ratio": 0.9798332027726808, "grad/layer_12/attn": 0.004741801414638758, "grad/layer_12/mlp": 0.006354217883199453, "grad/layer_12/attn_mlp_ratio": 0.746244687729643, "grad/layer_16/attn": 0.005083896219730377, "grad/layer_16/mlp": 0.004659698344767094, "grad/layer_16/attn_mlp_ratio": 1.0910354564767306, "grad/layer_20/attn": 0.0045509347692132, "grad/layer_20/mlp": 0.006987015251070261, "grad/layer_20/attn_mlp_ratio": 0.6513417447288268, "grad/layer_24/attn": 0.020399069413542747, "grad/layer_24/mlp": 0.013722454197704792, "grad/layer_24/attn_mlp_ratio": 1.4865467190482595, "grad/layer_27/attn": 0.005055967252701521, "grad/layer_27/mlp": 0.013314872048795223, "grad/layer_27/attn_mlp_ratio": 0.3797233046025908} {"step": 39200, "timestamp": 1778236909.7983217, "train/loss": 2.2044260025024416, "train/z_loss": 0.0014162087463773787, "train/perplexity": 9.065046760070462, "train/grad_norm": 0.2109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025523.0294208943, "perf/iters_per_sec": 0.965844645224044, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0353631973266602, "data/tokens_consumed": 82210455552, "data/tokens_consumed_B": 82.210455552, "train/loss_slope": 1.5862206814705752e-06} {"step": 39210, "timestamp": 1778236920.1820838, "train/loss": 2.138863682746887, "train/z_loss": 0.0014297298970632256, "train/perplexity": 8.489785056497565, "train/grad_norm": 0.1875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020982.8977642758, "perf/iters_per_sec": 0.963679741747034, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0376891374588013, "data/tokens_consumed": 82231427072, "data/tokens_consumed_B": 82.231427072, "train/loss_slope": -4.329297709243301e-06} {"step": 39220, "timestamp": 1778236930.5572293, "train/loss": 2.192089486122131, "train/z_loss": 0.0014284481876529753, "train/perplexity": 8.953902637449609, "train/grad_norm": 0.09814453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022493.7328208266, "perf/iters_per_sec": 0.9644001640419133, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0369139671325684, "data/tokens_consumed": 82252398592, "data/tokens_consumed_B": 82.252398592, "train/loss_slope": -5.405724510000618e-06} {"step": 39225, "timestamp": 1778236936.3434837, "eos/sharpness": 68.85833740234374, "eos/L0_probe": 2.0021400451660156, "eos/L_plus": 2.283836841583252, "eos/L_minus": 2.409026622772217, "eos/grad_norm": 0.19134853780269623, "eos/embed_grad_frac": 0.07359547168016434, "eos/time_s": 0.6016790866851807} {"step": 39225, "timestamp": 1778236937.7230341, "geo/rankme_last": 440.2947082519531, "geo/layer_0/stable_rank_q_proj": 19.21270751953125, "geo/layer_0/stable_rank_k_proj": 16.332563400268555, "geo/layer_0/stable_rank_o_proj": 49.346336364746094, "geo/layer_0/stable_rank_gate_proj": 137.80352783203125, "geo/layer_0/stable_rank_down_proj": 53.09374237060547, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.058314621448516846, "geo/layer_0/attn_entropy_mean": 6.199828147888184, "geo/layer_0/attn_entropy_std": 0.3754079341888428, "geo/layer_7/stable_rank_q_proj": 42.93614196777344, "geo/layer_7/stable_rank_k_proj": 42.28832244873047, "geo/layer_7/stable_rank_o_proj": 99.0452651977539, "geo/layer_7/stable_rank_gate_proj": 90.39407348632812, "geo/layer_7/stable_rank_down_proj": 145.98623657226562, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5063100457191467, "geo/layer_7/attn_entropy_mean": 4.651787757873535, "geo/layer_7/attn_entropy_std": 0.8395344614982605, "geo/layer_14/stable_rank_q_proj": 54.03214645385742, "geo/layer_14/stable_rank_k_proj": 37.02364730834961, "geo/layer_14/stable_rank_o_proj": 48.50815200805664, "geo/layer_14/stable_rank_gate_proj": 77.35497283935547, "geo/layer_14/stable_rank_down_proj": 134.2634735107422, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39656728506088257, "geo/layer_14/attn_entropy_mean": 5.486411094665527, "geo/layer_14/attn_entropy_std": 0.371330201625824, "geo/layer_21/stable_rank_q_proj": 43.499542236328125, "geo/layer_21/stable_rank_k_proj": 30.73040771484375, "geo/layer_21/stable_rank_o_proj": 76.2605209350586, "geo/layer_21/stable_rank_gate_proj": 73.49337768554688, "geo/layer_21/stable_rank_down_proj": 55.42539978027344, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14684230089187622, "geo/layer_21/attn_entropy_mean": 5.7333598136901855, "geo/layer_21/attn_entropy_std": 0.29057127237319946, "geo/layer_27/stable_rank_q_proj": 42.64262008666992, "geo/layer_27/stable_rank_k_proj": 31.338796615600586, "geo/layer_27/stable_rank_o_proj": 116.3792495727539, "geo/layer_27/stable_rank_gate_proj": 84.74239349365234, "geo/layer_27/stable_rank_down_proj": 131.61549377441406, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08085977286100388, "geo/layer_27/attn_entropy_mean": 4.294743537902832, "geo/layer_27/attn_entropy_std": 0.6487751007080078, "attnres/final_alpha/block_0": 0.2409898042678833, "attnres/block_norm/0": 1.718440055847168, "attnres/final_alpha/block_1": 0.005245907697826624, "attnres/block_norm/1": 39926.2890625, "attnres/final_alpha/block_2": 0.011458570137619972, "attnres/block_norm/2": 26258.861328125, "attnres/final_alpha/block_3": 0.013247650116682053, "attnres/block_norm/3": 46648.23828125, "attnres/final_alpha/block_4": 0.016537021845579147, "attnres/block_norm/4": 12796.7578125, "attnres/final_alpha/block_5": 0.5951493382453918, "attnres/block_norm/5": 5917.798828125, "attnres/final_alpha/block_6": 0.11737170815467834, "attnres/block_norm/6": 30302.001953125, "geo/tier1_time_s": 1.3593876361846924, "geo/step": 39225.0, "geo/rankme_slope": 0.00010030885401035414} {"step": 39230, "timestamp": 1778236942.9052124, "train/loss": 2.1240889549255373, "train/z_loss": 0.0014235727139748633, "train/perplexity": 8.365272874822294, "train/grad_norm": 0.1357421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1699246.716250687, "perf/iters_per_sec": 0.8102639752629694, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2341656923294066, "data/tokens_consumed": 82273370112, "data/tokens_consumed_B": 82.273370112, "train/loss_slope": -9.143483563654073e-06} {"step": 39240, "timestamp": 1778236953.2648506, "train/loss": 2.1656465768814086, "train/z_loss": 0.0014188495464622975, "train/perplexity": 8.720238398994512, "train/grad_norm": 0.091796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025157.1827930228, "perf/iters_per_sec": 0.9656701959576716, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0355502367019653, "data/tokens_consumed": 82294341632, "data/tokens_consumed_B": 82.294341632, "train/loss_slope": -1.2116008465833014e-05} {"step": 39250, "timestamp": 1778236963.613537, "grad/layer_0/attn": 0.002828693250194192, "grad/layer_0/mlp": 0.002669984707608819, "grad/layer_0/attn_mlp_ratio": 1.0594417024895018, "grad/layer_4/attn": 0.0020131345372647047, "grad/layer_4/mlp": 0.0024595032446086407, "grad/layer_4/attn_mlp_ratio": 0.8185126243790644, "grad/layer_8/attn": 0.003848467255011201, "grad/layer_8/mlp": 0.003530181245878339, "grad/layer_8/attn_mlp_ratio": 1.0901613480861194, "grad/layer_12/attn": 0.00489288242533803, "grad/layer_12/mlp": 0.006917720194905996, "grad/layer_12/attn_mlp_ratio": 0.7072969441885362, "grad/layer_16/attn": 0.0038675195537507534, "grad/layer_16/mlp": 0.0046340469270944595, "grad/layer_16/attn_mlp_ratio": 0.834587895016612, "grad/layer_20/attn": 0.0031844964250922203, "grad/layer_20/mlp": 0.006388933397829533, "grad/layer_20/attn_mlp_ratio": 0.49843943847186184, "grad/layer_24/attn": 0.010712645947933197, "grad/layer_24/mlp": 0.009932661429047585, "grad/layer_24/attn_mlp_ratio": 1.0785272322634358, "grad/layer_27/attn": 0.008307259529829025, "grad/layer_27/mlp": 0.010150226764380932, "grad/layer_27/attn_mlp_ratio": 0.8184309218723743} {"step": 39250, "timestamp": 1778236963.6308546, "train/loss": 2.1966622829437257, "train/z_loss": 0.0013986287987791002, "train/perplexity": 8.994940772988347, "train/grad_norm": 0.130859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024223.4626576605, "perf/iters_per_sec": 0.9652249635017683, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0360279083251953, "data/tokens_consumed": 82315313152, "data/tokens_consumed_B": 82.315313152, "train/loss_slope": -8.490282130820811e-06} {"step": 39260, "timestamp": 1778236973.9862611, "train/loss": 2.1233930587768555, "train/z_loss": 0.0014170598005875945, "train/perplexity": 8.359453538707672, "train/grad_norm": 0.08984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026362.4343579893, "perf/iters_per_sec": 0.9662449046888301, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349343061447143, "data/tokens_consumed": 82336284672, "data/tokens_consumed_B": 82.336284672, "train/loss_slope": -8.365746514417377e-06} {"step": 39270, "timestamp": 1778236984.3321655, "train/loss": 2.167039728164673, "train/z_loss": 0.0014188551809638738, "train/perplexity": 8.732395476668348, "train/grad_norm": 0.220703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028154.0643429453, "perf/iters_per_sec": 0.967099220439408, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340200662612915, "data/tokens_consumed": 82357256192, "data/tokens_consumed_B": 82.357256192, "train/loss_slope": -9.446085718992128e-06} {"step": 39280, "timestamp": 1778236994.6862636, "train/loss": 2.1362748742103577, "train/z_loss": 0.0014296260313130915, "train/perplexity": 8.467835052907287, "train/grad_norm": 0.18359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026496.5123843178, "perf/iters_per_sec": 0.966308838073882, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348658323287965, "data/tokens_consumed": 82378227712, "data/tokens_consumed_B": 82.378227712, "train/loss_slope": -1.007169490457117e-05} {"step": 39290, "timestamp": 1778237005.0388792, "train/loss": 2.163978695869446, "train/z_loss": 0.0014087250106967985, "train/perplexity": 8.705706201306173, "train/grad_norm": 0.1220703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026862.8431183733, "perf/iters_per_sec": 0.9664835181800715, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346787929534913, "data/tokens_consumed": 82399199232, "data/tokens_consumed_B": 82.399199232, "train/loss_slope": -9.398517778890447e-06} {"step": 39300, "timestamp": 1778237015.3764539, "grad/layer_0/attn": 0.0027753065805882215, "grad/layer_0/mlp": 0.0030270335264503956, "grad/layer_0/attn_mlp_ratio": 0.9168403536509907, "grad/layer_4/attn": 0.002008459297940135, "grad/layer_4/mlp": 0.0024289474822580814, "grad/layer_4/attn_mlp_ratio": 0.826884578576604, "grad/layer_8/attn": 0.0033343175891786814, "grad/layer_8/mlp": 0.0036694305017590523, "grad/layer_8/attn_mlp_ratio": 0.9086743833172034, "grad/layer_12/attn": 0.005390217062085867, "grad/layer_12/mlp": 0.006518503185361624, "grad/layer_12/attn_mlp_ratio": 0.8269102317077127, "grad/layer_16/attn": 0.004154201131314039, "grad/layer_16/mlp": 0.004511475097388029, "grad/layer_16/attn_mlp_ratio": 0.9208077069157251, "grad/layer_20/attn": 0.003933336120098829, "grad/layer_20/mlp": 0.005680387374013662, "grad/layer_20/attn_mlp_ratio": 0.6924415170783417, "grad/layer_24/attn": 0.009135537780821323, "grad/layer_24/mlp": 0.008805153891444206, "grad/layer_24/attn_mlp_ratio": 1.03752163672528, "grad/layer_27/attn": 0.003547112224623561, "grad/layer_27/mlp": 0.007473637815564871, "grad/layer_27/attn_mlp_ratio": 0.47461654748301574} {"step": 39300, "timestamp": 1778237015.9826121, "eos/sharpness": 37.73136138916015, "eos/L0_probe": 2.0040042400360107, "eos/L_plus": 2.1868937015533447, "eos/L_minus": 2.1984283924102783, "eos/grad_norm": 0.13229186832904816, "eos/embed_grad_frac": 0.12273207306861877, "eos/time_s": 0.6034190654754639} {"step": 39300, "timestamp": 1778237016.003957, "train/loss": 2.181591010093689, "train/z_loss": 0.0014245493221096694, "train/perplexity": 8.860392023596129, "train/grad_norm": 0.1318359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1913559.7097102317, "perf/iters_per_sec": 0.9124563740302237, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0959428071975708, "data/tokens_consumed": 82420170752, "data/tokens_consumed_B": 82.420170752, "train/loss_slope": -8.01342572316082e-06} {"step": 39300, "timestamp": 1778237017.367392, "geo/rankme_last": 439.756103515625, "geo/layer_0/stable_rank_q_proj": 19.211284637451172, "geo/layer_0/stable_rank_k_proj": 16.335376739501953, "geo/layer_0/stable_rank_o_proj": 49.290855407714844, "geo/layer_0/stable_rank_gate_proj": 137.94143676757812, "geo/layer_0/stable_rank_down_proj": 53.201568603515625, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05963999032974243, "geo/layer_0/attn_entropy_mean": 6.198185920715332, "geo/layer_0/attn_entropy_std": 0.37356284260749817, "geo/layer_7/stable_rank_q_proj": 42.89766311645508, "geo/layer_7/stable_rank_k_proj": 42.34833908081055, "geo/layer_7/stable_rank_o_proj": 98.97035217285156, "geo/layer_7/stable_rank_gate_proj": 90.18402862548828, "geo/layer_7/stable_rank_down_proj": 145.422607421875, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5054532885551453, "geo/layer_7/attn_entropy_mean": 4.671348571777344, "geo/layer_7/attn_entropy_std": 0.8115226626396179, "geo/layer_14/stable_rank_q_proj": 54.0338134765625, "geo/layer_14/stable_rank_k_proj": 36.98827362060547, "geo/layer_14/stable_rank_o_proj": 48.476951599121094, "geo/layer_14/stable_rank_gate_proj": 77.38684844970703, "geo/layer_14/stable_rank_down_proj": 134.11639404296875, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38540002703666687, "geo/layer_14/attn_entropy_mean": 5.509747505187988, "geo/layer_14/attn_entropy_std": 0.3499786853790283, "geo/layer_21/stable_rank_q_proj": 43.517391204833984, "geo/layer_21/stable_rank_k_proj": 30.654808044433594, "geo/layer_21/stable_rank_o_proj": 76.27261352539062, "geo/layer_21/stable_rank_gate_proj": 73.4422607421875, "geo/layer_21/stable_rank_down_proj": 55.440345764160156, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14412051439285278, "geo/layer_21/attn_entropy_mean": 5.709141731262207, "geo/layer_21/attn_entropy_std": 0.28317275643348694, "geo/layer_27/stable_rank_q_proj": 42.55754852294922, "geo/layer_27/stable_rank_k_proj": 31.323657989501953, "geo/layer_27/stable_rank_o_proj": 116.29410552978516, "geo/layer_27/stable_rank_gate_proj": 84.83441925048828, "geo/layer_27/stable_rank_down_proj": 131.76930236816406, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08800916373729706, "geo/layer_27/attn_entropy_mean": 4.264801979064941, "geo/layer_27/attn_entropy_std": 0.6445745229721069, "attnres/final_alpha/block_0": 0.24048002064228058, "attnres/block_norm/0": 1.7187564373016357, "attnres/final_alpha/block_1": 0.005290915258228779, "attnres/block_norm/1": 39717.7421875, "attnres/final_alpha/block_2": 0.011468786746263504, "attnres/block_norm/2": 26181.560546875, "attnres/final_alpha/block_3": 0.013494035229086876, "attnres/block_norm/3": 45976.12890625, "attnres/final_alpha/block_4": 0.016386331990361214, "attnres/block_norm/4": 12753.5615234375, "attnres/final_alpha/block_5": 0.5948252081871033, "attnres/block_norm/5": 6054.5126953125, "attnres/final_alpha/block_6": 0.11805470287799835, "attnres/block_norm/6": 30615.2734375, "geo/tier1_time_s": 1.3592875003814697, "geo/step": 39300.0, "geo/rankme_slope": 9.98914018732493e-05} {"step": 39310, "timestamp": 1778237027.723315, "train/loss": 2.185950016975403, "train/z_loss": 0.0014232123387046159, "train/perplexity": 8.899098833738758, "train/grad_norm": 0.2314453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1790067.1903462755, "perf/iters_per_sec": 0.8535705520373704, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1715493202209473, "data/tokens_consumed": 82441142272, "data/tokens_consumed_B": 82.441142272, "train/loss_slope": -1.036586946029041e-05} {"step": 39320, "timestamp": 1778237038.0748227, "train/loss": 2.1388917922973634, "train/z_loss": 0.0014150041970424354, "train/perplexity": 8.490023703893263, "train/grad_norm": 0.154296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027222.8587859005, "perf/iters_per_sec": 0.9666551870278838, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344950437545777, "data/tokens_consumed": 82462113792, "data/tokens_consumed_B": 82.462113792, "train/loss_slope": -8.608798783282504e-06} {"step": 39330, "timestamp": 1778237048.4220245, "train/loss": 2.2027772665023804, "train/z_loss": 0.0014149650232866406, "train/perplexity": 9.050113205262466, "train/grad_norm": 0.21484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028054.0409835968, "perf/iters_per_sec": 0.9670515255849823, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340710639953614, "data/tokens_consumed": 82483085312, "data/tokens_consumed_B": 82.483085312, "train/loss_slope": -7.838882743292886e-06} {"step": 39340, "timestamp": 1778237058.770448, "train/loss": 2.1575173139572144, "train/z_loss": 0.0014170082286000252, "train/perplexity": 8.64963664719787, "train/grad_norm": 0.1875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027816.530884315, "perf/iters_per_sec": 0.9669382719441962, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034192180633545, "data/tokens_consumed": 82504056832, "data/tokens_consumed_B": 82.504056832, "train/loss_slope": -7.727947587048367e-06} {"step": 39350, "timestamp": 1778237069.1144047, "grad/layer_0/attn": 0.0027758735232055187, "grad/layer_0/mlp": 0.002713194116950035, "grad/layer_0/attn_mlp_ratio": 1.0231016658755607, "grad/layer_4/attn": 0.0018279646756127477, "grad/layer_4/mlp": 0.0023830938152968884, "grad/layer_4/attn_mlp_ratio": 0.7670552401981262, "grad/layer_8/attn": 0.003993636462837458, "grad/layer_8/mlp": 0.003587808459997177, "grad/layer_8/attn_mlp_ratio": 1.1131130315494313, "grad/layer_12/attn": 0.0045403121039271355, "grad/layer_12/mlp": 0.006398776080459356, "grad/layer_12/attn_mlp_ratio": 0.7095594494760395, "grad/layer_16/attn": 0.003836076706647873, "grad/layer_16/mlp": 0.0045720841735601425, "grad/layer_16/attn_mlp_ratio": 0.8390214346729085, "grad/layer_20/attn": 0.00553364772349596, "grad/layer_20/mlp": 0.0053513809107244015, "grad/layer_20/attn_mlp_ratio": 1.0340597525024453, "grad/layer_24/attn": 0.009464668110013008, "grad/layer_24/mlp": 0.010577160865068436, "grad/layer_24/attn_mlp_ratio": 0.8948212229416299, "grad/layer_27/attn": 0.008980807848274708, "grad/layer_27/mlp": 0.009185204282402992, "grad/layer_27/attn_mlp_ratio": 0.9777471980351502} {"step": 39350, "timestamp": 1778237069.130023, "train/loss": 2.162769305706024, "train/z_loss": 0.0014144132612273097, "train/perplexity": 8.695183969884768, "train/grad_norm": 0.166015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025528.0668551675, "perf/iters_per_sec": 0.9658470472598875, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035360622406006, "data/tokens_consumed": 82525028352, "data/tokens_consumed_B": 82.525028352, "train/loss_slope": -1.084136529402294e-05} {"step": 39360, "timestamp": 1778237079.4771395, "train/loss": 2.178269600868225, "train/z_loss": 0.0014289135811850428, "train/perplexity": 8.831011854579513, "train/grad_norm": 0.1669921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027955.8509292072, "perf/iters_per_sec": 0.9670047049184833, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341211318969727, "data/tokens_consumed": 82545999872, "data/tokens_consumed_B": 82.545999872, "train/loss_slope": -1.0604670067073773e-05} {"step": 39370, "timestamp": 1778237089.8252711, "train/loss": 2.1925496816635133, "train/z_loss": 0.0014003683580085635, "train/perplexity": 8.958024131795256, "train/grad_norm": 0.240234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027744.120072903, "perf/iters_per_sec": 0.9669037437786594, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342291116714477, "data/tokens_consumed": 82566971392, "data/tokens_consumed_B": 82.566971392, "train/loss_slope": -1.0969346942323555e-05} {"step": 39375, "timestamp": 1778237095.5930734, "eos/sharpness": 71.81408405303954, "eos/L0_probe": 2.00369930267334, "eos/L_plus": 2.4437477588653564, "eos/L_minus": 2.2817916870117188, "eos/grad_norm": 0.2215135395526886, "eos/embed_grad_frac": 0.05051485449075699, "eos/time_s": 0.60402512550354} {"step": 39375, "timestamp": 1778237096.9681509, "geo/rankme_last": 438.8936767578125, "geo/layer_0/stable_rank_q_proj": 19.228607177734375, "geo/layer_0/stable_rank_k_proj": 16.385385513305664, "geo/layer_0/stable_rank_o_proj": 49.32607650756836, "geo/layer_0/stable_rank_gate_proj": 137.86489868164062, "geo/layer_0/stable_rank_down_proj": 53.239681243896484, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05801145359873772, "geo/layer_0/attn_entropy_mean": 6.2035441398620605, "geo/layer_0/attn_entropy_std": 0.3716675937175751, "geo/layer_7/stable_rank_q_proj": 42.79461669921875, "geo/layer_7/stable_rank_k_proj": 42.38642501831055, "geo/layer_7/stable_rank_o_proj": 98.93838500976562, "geo/layer_7/stable_rank_gate_proj": 90.20806121826172, "geo/layer_7/stable_rank_down_proj": 145.52061462402344, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5066518783569336, "geo/layer_7/attn_entropy_mean": 4.676325798034668, "geo/layer_7/attn_entropy_std": 0.8029743432998657, "geo/layer_14/stable_rank_q_proj": 54.159751892089844, "geo/layer_14/stable_rank_k_proj": 37.04024887084961, "geo/layer_14/stable_rank_o_proj": 48.50605010986328, "geo/layer_14/stable_rank_gate_proj": 77.31088256835938, "geo/layer_14/stable_rank_down_proj": 134.55014038085938, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37348672747612, "geo/layer_14/attn_entropy_mean": 5.4934468269348145, "geo/layer_14/attn_entropy_std": 0.37225422263145447, "geo/layer_21/stable_rank_q_proj": 43.48069381713867, "geo/layer_21/stable_rank_k_proj": 30.713781356811523, "geo/layer_21/stable_rank_o_proj": 76.37445831298828, "geo/layer_21/stable_rank_gate_proj": 73.31043243408203, "geo/layer_21/stable_rank_down_proj": 55.48422622680664, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15138991177082062, "geo/layer_21/attn_entropy_mean": 5.736199855804443, "geo/layer_21/attn_entropy_std": 0.283715158700943, "geo/layer_27/stable_rank_q_proj": 42.546810150146484, "geo/layer_27/stable_rank_k_proj": 31.40387725830078, "geo/layer_27/stable_rank_o_proj": 116.33390808105469, "geo/layer_27/stable_rank_gate_proj": 84.76296997070312, "geo/layer_27/stable_rank_down_proj": 131.73623657226562, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0879337266087532, "geo/layer_27/attn_entropy_mean": 4.289960861206055, "geo/layer_27/attn_entropy_std": 0.652925968170166, "attnres/final_alpha/block_0": 0.23866228759288788, "attnres/block_norm/0": 1.7188942432403564, "attnres/final_alpha/block_1": 0.005174671299755573, "attnres/block_norm/1": 39908.1328125, "attnres/final_alpha/block_2": 0.011254331097006798, "attnres/block_norm/2": 26188.53125, "attnres/final_alpha/block_3": 0.013292166404426098, "attnres/block_norm/3": 46368.421875, "attnres/final_alpha/block_4": 0.016026953235268593, "attnres/block_norm/4": 12806.60546875, "attnres/final_alpha/block_5": 0.5987498760223389, "attnres/block_norm/5": 5955.744140625, "attnres/final_alpha/block_6": 0.1168397068977356, "attnres/block_norm/6": 30824.7890625, "geo/tier1_time_s": 1.3561761379241943, "geo/step": 39375.0, "geo/rankme_slope": 6.802937972063826e-05} {"step": 39380, "timestamp": 1778237102.1563132, "train/loss": 2.185185980796814, "train/z_loss": 0.0014084284659475089, "train/perplexity": 8.892302197041738, "train/grad_norm": 0.1328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1701485.1049878327, "perf/iters_per_sec": 0.8113313221873439, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.232542085647583, "data/tokens_consumed": 82587942912, "data/tokens_consumed_B": 82.587942912, "train/loss_slope": -1.0736178893997105e-05} {"step": 39390, "timestamp": 1778237112.5154335, "train/loss": 2.1731733322143554, "train/z_loss": 0.0014226455125026405, "train/perplexity": 8.786121130436493, "train/grad_norm": 0.16015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025738.6820148297, "perf/iters_per_sec": 0.9659474763940953, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352529764175415, "data/tokens_consumed": 82608914432, "data/tokens_consumed_B": 82.608914432, "train/loss_slope": -5.898725639546916e-06} {"step": 39400, "timestamp": 1778237122.8548539, "grad/layer_0/attn": 0.0025450594257563353, "grad/layer_0/mlp": 0.002774817869067192, "grad/layer_0/attn_mlp_ratio": 0.9171986970417038, "grad/layer_4/attn": 0.00336016109213233, "grad/layer_4/mlp": 0.0026196353137493134, "grad/layer_4/attn_mlp_ratio": 1.2826827254267228, "grad/layer_8/attn": 0.00508610624819994, "grad/layer_8/mlp": 0.003991161938756704, "grad/layer_8/attn_mlp_ratio": 1.2743422088130314, "grad/layer_12/attn": 0.005544956307858229, "grad/layer_12/mlp": 0.006717756390571594, "grad/layer_12/attn_mlp_ratio": 0.8254178780729257, "grad/layer_16/attn": 0.0048153758980333805, "grad/layer_16/mlp": 0.004486904479563236, "grad/layer_16/attn_mlp_ratio": 1.0732066645602967, "grad/layer_20/attn": 0.005428816191852093, "grad/layer_20/mlp": 0.005806737579405308, "grad/layer_20/attn_mlp_ratio": 0.9349167280461825, "grad/layer_24/attn": 0.017511218786239624, "grad/layer_24/mlp": 0.010350248776376247, "grad/layer_24/attn_mlp_ratio": 1.6918645141188646, "grad/layer_27/attn": 0.0072161550633609295, "grad/layer_27/mlp": 0.008971828036010265, "grad/layer_27/attn_mlp_ratio": 0.8043126722855308} {"step": 39400, "timestamp": 1778237122.870758, "train/loss": 2.191384720802307, "train/z_loss": 0.0014210247667506338, "train/perplexity": 8.94759446054691, "train/grad_norm": 0.15625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026600.210935609, "perf/iters_per_sec": 0.9663582853963895, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348128795623779, "data/tokens_consumed": 82629885952, "data/tokens_consumed_B": 82.629885952, "train/loss_slope": -6.202474600411039e-06} {"step": 39410, "timestamp": 1778237133.22139, "train/loss": 2.1687358140945436, "train/z_loss": 0.0014269224018789827, "train/perplexity": 8.747218937148205, "train/grad_norm": 0.1767578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026941.683413953, "perf/iters_per_sec": 0.9665211121625676, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346385478973388, "data/tokens_consumed": 82650857472, "data/tokens_consumed_B": 82.650857472, "train/loss_slope": -4.510923122951879e-06} {"step": 39420, "timestamp": 1778237143.5765705, "train/loss": 2.167045295238495, "train/z_loss": 0.001405111502390355, "train/perplexity": 8.732444090693928, "train/grad_norm": 0.263671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026279.5781531408, "perf/iters_per_sec": 0.9662053957715706, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349766254425048, "data/tokens_consumed": 82671828992, "data/tokens_consumed_B": 82.671828992, "train/loss_slope": -6.012880176243102e-06} {"step": 39430, "timestamp": 1778237153.9299402, "train/loss": 2.178543734550476, "train/z_loss": 0.0014038445078767836, "train/perplexity": 8.833433064229604, "train/grad_norm": 0.1357421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027075.3242503835, "perf/iters_per_sec": 0.966584837079231, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034570336341858, "data/tokens_consumed": 82692800512, "data/tokens_consumed_B": 82.692800512, "train/loss_slope": -6.0919271443935945e-06} {"step": 39440, "timestamp": 1778237164.2787817, "train/loss": 2.1730273246765135, "train/z_loss": 0.0014014334767125547, "train/perplexity": 8.78483838417065, "train/grad_norm": 0.1201171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027526.124166867, "perf/iters_per_sec": 0.9667997952303252, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343403100967408, "data/tokens_consumed": 82713772032, "data/tokens_consumed_B": 82.713772032, "train/loss_slope": -6.601124706834787e-06} {"step": 39450, "timestamp": 1778237174.6178327, "grad/layer_0/attn": 0.0029592979699373245, "grad/layer_0/mlp": 0.003049098188057542, "grad/layer_0/attn_mlp_ratio": 0.970548565629406, "grad/layer_4/attn": 0.002440802752971649, "grad/layer_4/mlp": 0.0026650691870599985, "grad/layer_4/attn_mlp_ratio": 0.915849642192323, "grad/layer_8/attn": 0.004295971244573593, "grad/layer_8/mlp": 0.0038119121454656124, "grad/layer_8/attn_mlp_ratio": 1.126985871640614, "grad/layer_12/attn": 0.00626418087631464, "grad/layer_12/mlp": 0.007024314254522324, "grad/layer_12/attn_mlp_ratio": 0.8917853843317388, "grad/layer_16/attn": 0.004240844398736954, "grad/layer_16/mlp": 0.00495751341804862, "grad/layer_16/attn_mlp_ratio": 0.8554377881769727, "grad/layer_20/attn": 0.007601459510624409, "grad/layer_20/mlp": 0.006439474876970053, "grad/layer_20/attn_mlp_ratio": 1.180447092008284, "grad/layer_24/attn": 0.013026181608438492, "grad/layer_24/mlp": 0.010772092267870903, "grad/layer_24/attn_mlp_ratio": 1.2092526840273565, "grad/layer_27/attn": 0.008468542248010635, "grad/layer_27/mlp": 0.010021545924246311, "grad/layer_27/attn_mlp_ratio": 0.8450335135438873} {"step": 39450, "timestamp": 1778237175.2253373, "eos/sharpness": 68.40555667877196, "eos/L0_probe": 2.006770610809326, "eos/L_plus": 2.2852838039398193, "eos/L_minus": 2.4123129844665527, "eos/grad_norm": 0.20409271121025085, "eos/embed_grad_frac": 0.05535213649272919, "eos/time_s": 0.6046702861785889} {"step": 39450, "timestamp": 1778237175.246073, "train/loss": 2.1396763801574705, "train/z_loss": 0.0014209133922122418, "train/perplexity": 8.496687487243278, "train/grad_norm": 0.2041015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1913101.028328342, "perf/iters_per_sec": 0.9122376577035627, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0962055683135987, "data/tokens_consumed": 82734743552, "data/tokens_consumed_B": 82.734743552, "train/loss_slope": -7.056356206013167e-06} {"step": 39450, "timestamp": 1778237176.6090426, "geo/rankme_last": 439.3601989746094, "geo/layer_0/stable_rank_q_proj": 19.215309143066406, "geo/layer_0/stable_rank_k_proj": 16.363264083862305, "geo/layer_0/stable_rank_o_proj": 49.23099136352539, "geo/layer_0/stable_rank_gate_proj": 138.004638671875, "geo/layer_0/stable_rank_down_proj": 53.31477737426758, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0626877024769783, "geo/layer_0/attn_entropy_mean": 6.195131301879883, "geo/layer_0/attn_entropy_std": 0.37434518337249756, "geo/layer_7/stable_rank_q_proj": 42.87823486328125, "geo/layer_7/stable_rank_k_proj": 42.26778793334961, "geo/layer_7/stable_rank_o_proj": 98.81470489501953, "geo/layer_7/stable_rank_gate_proj": 90.2806396484375, "geo/layer_7/stable_rank_down_proj": 145.21963500976562, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5052257776260376, "geo/layer_7/attn_entropy_mean": 4.661660194396973, "geo/layer_7/attn_entropy_std": 0.8091283440589905, "geo/layer_14/stable_rank_q_proj": 54.293968200683594, "geo/layer_14/stable_rank_k_proj": 36.98137664794922, "geo/layer_14/stable_rank_o_proj": 48.43424606323242, "geo/layer_14/stable_rank_gate_proj": 77.40926361083984, "geo/layer_14/stable_rank_down_proj": 134.3096923828125, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3865249454975128, "geo/layer_14/attn_entropy_mean": 5.522947311401367, "geo/layer_14/attn_entropy_std": 0.3692292869091034, "geo/layer_21/stable_rank_q_proj": 43.469512939453125, "geo/layer_21/stable_rank_k_proj": 30.801801681518555, "geo/layer_21/stable_rank_o_proj": 76.3343505859375, "geo/layer_21/stable_rank_gate_proj": 73.15090942382812, "geo/layer_21/stable_rank_down_proj": 55.503963470458984, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14278338849544525, "geo/layer_21/attn_entropy_mean": 5.711303234100342, "geo/layer_21/attn_entropy_std": 0.28710803389549255, "geo/layer_27/stable_rank_q_proj": 42.585411071777344, "geo/layer_27/stable_rank_k_proj": 31.36810302734375, "geo/layer_27/stable_rank_o_proj": 116.35501098632812, "geo/layer_27/stable_rank_gate_proj": 84.86631774902344, "geo/layer_27/stable_rank_down_proj": 131.36468505859375, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0860278457403183, "geo/layer_27/attn_entropy_mean": 4.29712438583374, "geo/layer_27/attn_entropy_std": 0.6500241756439209, "attnres/final_alpha/block_0": 0.24156644940376282, "attnres/block_norm/0": 1.7192497253417969, "attnres/final_alpha/block_1": 0.0051844436675310135, "attnres/block_norm/1": 39972.76171875, "attnres/final_alpha/block_2": 0.011377613991498947, "attnres/block_norm/2": 26269.83203125, "attnres/final_alpha/block_3": 0.013529304414987564, "attnres/block_norm/3": 46120.15234375, "attnres/final_alpha/block_4": 0.016422579064965248, "attnres/block_norm/4": 12821.99609375, "attnres/final_alpha/block_5": 0.5930361747741699, "attnres/block_norm/5": 5978.44921875, "attnres/final_alpha/block_6": 0.11888342350721359, "attnres/block_norm/6": 30612.4921875, "geo/tier1_time_s": 1.3592286109924316, "geo/step": 39450.0, "geo/rankme_slope": 6.781700961634654e-05} {"step": 39460, "timestamp": 1778237186.9564838, "train/loss": 2.154607152938843, "train/z_loss": 0.0014089734177105129, "train/perplexity": 8.624501403346812, "train/grad_norm": 0.11767578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1791405.394237703, "perf/iters_per_sec": 0.8542086573780551, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1706741571426391, "data/tokens_consumed": 82755715072, "data/tokens_consumed_B": 82.755715072, "train/loss_slope": -1.015189981589326e-05} {"step": 39470, "timestamp": 1778237197.311456, "train/loss": 2.1114615201950073, "train/z_loss": 0.001440596964675933, "train/perplexity": 8.260305069422595, "train/grad_norm": 0.154296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026471.7682380357, "perf/iters_per_sec": 0.9662970391454867, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348784685134889, "data/tokens_consumed": 82776686592, "data/tokens_consumed_B": 82.776686592, "train/loss_slope": -1.2318979688782286e-05} {"step": 39480, "timestamp": 1778237207.6640565, "train/loss": 2.1762857913970945, "train/z_loss": 0.001413983420934528, "train/perplexity": 8.81351017536105, "train/grad_norm": 0.263671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026833.139479255, "perf/iters_per_sec": 0.9664693543812061, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034693956375122, "data/tokens_consumed": 82797658112, "data/tokens_consumed_B": 82.797658112, "train/loss_slope": -1.0339007869769647e-05} {"step": 39490, "timestamp": 1778237218.0164218, "train/loss": 2.17069935798645, "train/z_loss": 0.0014202221878804266, "train/perplexity": 8.764411358977057, "train/grad_norm": 0.255859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027032.6749805468, "perf/iters_per_sec": 0.9665645003226027, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345921039581298, "data/tokens_consumed": 82818629632, "data/tokens_consumed_B": 82.818629632, "train/loss_slope": -8.088528090613457e-06} {"step": 39500, "timestamp": 1778237228.3555195, "grad/layer_0/attn": 0.0030469533521682024, "grad/layer_0/mlp": 0.0029401795472949743, "grad/layer_0/attn_mlp_ratio": 1.0363153676583885, "grad/layer_4/attn": 0.003797943703830242, "grad/layer_4/mlp": 0.0024591197725385427, "grad/layer_4/attn_mlp_ratio": 1.5444321142058175, "grad/layer_8/attn": 0.00394669221714139, "grad/layer_8/mlp": 0.003691817866638303, "grad/layer_8/attn_mlp_ratio": 1.0690375995800168, "grad/layer_12/attn": 0.003695996478199959, "grad/layer_12/mlp": 0.006115708500146866, "grad/layer_12/attn_mlp_ratio": 0.6043447652347598, "grad/layer_16/attn": 0.0033144140616059303, "grad/layer_16/mlp": 0.0044787731021642685, "grad/layer_16/attn_mlp_ratio": 0.7400272154893471, "grad/layer_20/attn": 0.009169339202344418, "grad/layer_20/mlp": 0.0057141948491334915, "grad/layer_20/attn_mlp_ratio": 1.6046598486694734, "grad/layer_24/attn": 0.01298324391245842, "grad/layer_24/mlp": 0.011315351352095604, "grad/layer_24/attn_mlp_ratio": 1.1474008533825897, "grad/layer_27/attn": 0.006387659348547459, "grad/layer_27/mlp": 0.010550272651016712, "grad/layer_27/attn_mlp_ratio": 0.6054496882966264} {"step": 39500, "timestamp": 1778237228.3714626, "train/loss": 2.2164585828781127, "train/z_loss": 0.001413310004863888, "train/perplexity": 9.174781536102307, "train/grad_norm": 0.1708984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026235.1420111826, "perf/iters_per_sec": 0.9661842069679177, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349993228912353, "data/tokens_consumed": 82839601152, "data/tokens_consumed_B": 82.839601152, "train/loss_slope": -5.850734912415635e-06} {"step": 39500, "timestamp": 1778237235.4315856, "geo/ww_alpha_mean": 7.4849329107748686, "geo/ww_alpha_std": 4.253767196616813, "geo/ww_alpha_min": 1.3507541591856222, "geo/ww_alpha_max": 35.306463969036955, "geo/ww_alpha_healthy_frac": 0.17766497461928935, "geo/ww_alpha_by_type/q_proj": 4.058705644795331, "geo/ww_alpha_by_type/k_proj": 4.424436968564319, "geo/ww_alpha_by_type/v_proj": 8.749149187548836, "geo/ww_alpha_by_type/o_proj": 7.822797632131123, "geo/ww_alpha_by_type/gate_proj": 8.013768911709393, "geo/ww_alpha_by_type/up_proj": 11.083046981941953, "geo/ww_alpha_by_type/down_proj": 8.343496230173999, "geo/twonn_id/layer_0": 0.6903827786445618, "geo/twonn_id/layer_7": 2.88761830329895, "geo/twonn_id/layer_14": 4.526767730712891, "geo/twonn_id/layer_21": 7.745633602142334, "geo/twonn_id/layer_27": 5.436585426330566, "geo/tier2_time_s": 7.053074359893799} {"step": 39500, "timestamp": 1778237236.0687857, "eoc/jacobian_sigma/layer_0/attn": 1134.472900390625, "eoc/jacobian_sigma/layer_0/mlp": 9673.255859375, "eoc/jacobian_sigma/layer_0": 9673.255859375, "eoc/jacobian_sigma/layer_7/attn": 1.129186749458313, "eoc/jacobian_sigma/layer_7/mlp": 1.6535452604293823, "eoc/jacobian_sigma/layer_7": 1.6535452604293823, "eoc/jacobian_sigma/layer_14/attn": 1.6335604190826416, "eoc/jacobian_sigma/layer_14/mlp": 6.034005641937256, "eoc/jacobian_sigma/layer_14": 6.034005641937256, "eoc/jacobian_sigma/layer_21/attn": 1.0945662260055542, "eoc/jacobian_sigma/layer_21/mlp": 3.863569736480713, "eoc/jacobian_sigma/layer_21": 3.863569736480713, "eoc/jacobian_sigma/layer_27/attn": 4.3628034591674805, "eoc/jacobian_sigma/layer_27/mlp": 25.926513671875, "eoc/jacobian_sigma/layer_27": 25.926513671875, "eoc/layer0_sigma": 9673.255859375, "eoc/sigma_max": 25.926513671875, "eoc/sigma_min": 1.6535452604293823, "eoc/sigma_mean": 9.369408577680588, "eoc/time_s": 0.6306836605072021} {"step": 39510, "timestamp": 1778237246.4339764, "train/loss": 2.1926347732543947, "train/z_loss": 0.0014193459646776319, "train/perplexity": 8.958786416751344, "train/grad_norm": 0.1259765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1161431.9494043551, "perf/iters_per_sec": 0.5538139102002884, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.8056606769561767, "data/tokens_consumed": 82860572672, "data/tokens_consumed_B": 82.860572672, "train/loss_slope": -5.50195103538594e-06} {"step": 39520, "timestamp": 1778237256.788409, "train/loss": 2.1890546321868896, "train/z_loss": 0.001405796525068581, "train/perplexity": 8.926770043349592, "train/grad_norm": 0.2333984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026471.3013731593, "perf/iters_per_sec": 0.9662968165269658, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034878706932068, "data/tokens_consumed": 82881544192, "data/tokens_consumed_B": 82.881544192, "train/loss_slope": -2.112531597607827e-06} {"step": 39525, "timestamp": 1778237262.5761561, "eos/sharpness": 55.8589458465576, "eos/L0_probe": 2.000922441482544, "eos/L_plus": 2.2389233112335205, "eos/L_minus": 2.3215110301971436, "eos/grad_norm": 0.160055473446846, "eos/embed_grad_frac": 0.08594606816768646, "eos/time_s": 0.6221435070037842} {"step": 39525, "timestamp": 1778237263.9564762, "geo/rankme_last": 439.45025634765625, "geo/layer_0/stable_rank_q_proj": 19.210630416870117, "geo/layer_0/stable_rank_k_proj": 16.384645462036133, "geo/layer_0/stable_rank_o_proj": 49.308292388916016, "geo/layer_0/stable_rank_gate_proj": 138.281005859375, "geo/layer_0/stable_rank_down_proj": 53.371482849121094, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05762441083788872, "geo/layer_0/attn_entropy_mean": 6.194673538208008, "geo/layer_0/attn_entropy_std": 0.37552326917648315, "geo/layer_7/stable_rank_q_proj": 42.7879753112793, "geo/layer_7/stable_rank_k_proj": 42.3699836730957, "geo/layer_7/stable_rank_o_proj": 98.99781799316406, "geo/layer_7/stable_rank_gate_proj": 90.2331771850586, "geo/layer_7/stable_rank_down_proj": 145.18667602539062, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.50958651304245, "geo/layer_7/attn_entropy_mean": 4.664669036865234, "geo/layer_7/attn_entropy_std": 0.8191052079200745, "geo/layer_14/stable_rank_q_proj": 54.321842193603516, "geo/layer_14/stable_rank_k_proj": 36.900821685791016, "geo/layer_14/stable_rank_o_proj": 48.41677474975586, "geo/layer_14/stable_rank_gate_proj": 77.3505859375, "geo/layer_14/stable_rank_down_proj": 134.40782165527344, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37737253308296204, "geo/layer_14/attn_entropy_mean": 5.46174955368042, "geo/layer_14/attn_entropy_std": 0.3652696907520294, "geo/layer_21/stable_rank_q_proj": 43.47554016113281, "geo/layer_21/stable_rank_k_proj": 30.824247360229492, "geo/layer_21/stable_rank_o_proj": 76.43228149414062, "geo/layer_21/stable_rank_gate_proj": 73.109375, "geo/layer_21/stable_rank_down_proj": 55.51969528198242, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14423391222953796, "geo/layer_21/attn_entropy_mean": 5.6961565017700195, "geo/layer_21/attn_entropy_std": 0.2928924858570099, "geo/layer_27/stable_rank_q_proj": 42.52717590332031, "geo/layer_27/stable_rank_k_proj": 31.31886863708496, "geo/layer_27/stable_rank_o_proj": 116.54402923583984, "geo/layer_27/stable_rank_gate_proj": 84.87274169921875, "geo/layer_27/stable_rank_down_proj": 131.11358642578125, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08266086131334305, "geo/layer_27/attn_entropy_mean": 4.28416633605957, "geo/layer_27/attn_entropy_std": 0.6485084891319275, "attnres/final_alpha/block_0": 0.24216406047344208, "attnres/block_norm/0": 1.7193868160247803, "attnres/final_alpha/block_1": 0.0052343555726110935, "attnres/block_norm/1": 39867.6328125, "attnres/final_alpha/block_2": 0.011336127296090126, "attnres/block_norm/2": 26201.81640625, "attnres/final_alpha/block_3": 0.01341789960861206, "attnres/block_norm/3": 46330.5078125, "attnres/final_alpha/block_4": 0.016479849815368652, "attnres/block_norm/4": 12765.279296875, "attnres/final_alpha/block_5": 0.5921904444694519, "attnres/block_norm/5": 6087.24267578125, "attnres/final_alpha/block_6": 0.11917723715305328, "attnres/block_norm/6": 30671.13671875, "geo/tier1_time_s": 1.3600149154663086, "geo/step": 39525.0, "geo/rankme_slope": 7.629706179346739e-05} {"step": 39530, "timestamp": 1778237269.1408234, "train/loss": 2.1841245174407957, "train/z_loss": 0.0014213245594874024, "train/perplexity": 8.882868351835196, "train/grad_norm": 0.1005859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1698452.6233387282, "perf/iters_per_sec": 0.8098853222554818, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2347427129745483, "data/tokens_consumed": 82902515712, "data/tokens_consumed_B": 82.902515712, "train/loss_slope": -7.479710511677521e-07} {"step": 39540, "timestamp": 1778237279.4896052, "train/loss": 2.176490902900696, "train/z_loss": 0.0014107939437963068, "train/perplexity": 8.815318113093198, "train/grad_norm": 0.201171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027366.8634834317, "perf/iters_per_sec": 0.9667238538186225, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344215631484985, "data/tokens_consumed": 82923487232, "data/tokens_consumed_B": 82.923487232, "train/loss_slope": 1.4009592234820194e-06} {"step": 39550, "timestamp": 1778237290.2626705, "grad/layer_0/attn": 0.0026136545930057764, "grad/layer_0/mlp": 0.0027480628341436386, "grad/layer_0/attn_mlp_ratio": 0.9510897878400492, "grad/layer_4/attn": 0.002437685150653124, "grad/layer_4/mlp": 0.002582890447229147, "grad/layer_4/attn_mlp_ratio": 0.9437818235341814, "grad/layer_8/attn": 0.003588898340240121, "grad/layer_8/mlp": 0.003726891940459609, "grad/layer_8/attn_mlp_ratio": 0.9629735182233853, "grad/layer_12/attn": 0.005175742786377668, "grad/layer_12/mlp": 0.006328881252557039, "grad/layer_12/attn_mlp_ratio": 0.8177974112734043, "grad/layer_16/attn": 0.003431235905736685, "grad/layer_16/mlp": 0.004323143046349287, "grad/layer_16/attn_mlp_ratio": 0.7936900975935113, "grad/layer_20/attn": 0.004064524080604315, "grad/layer_20/mlp": 0.006380212958902121, "grad/layer_20/attn_mlp_ratio": 0.6370514656925454, "grad/layer_24/attn": 0.017024213448166847, "grad/layer_24/mlp": 0.011282338760793209, "grad/layer_24/attn_mlp_ratio": 1.5089259113929816, "grad/layer_27/attn": 0.011561021208763123, "grad/layer_27/mlp": 0.00998474657535553, "grad/layer_27/attn_mlp_ratio": 1.157868254915087} {"step": 39550, "timestamp": 1778237290.2786355, "train/loss": 2.1622854709625243, "train/z_loss": 0.0014220470329746604, "train/perplexity": 8.690977955369044, "train/grad_norm": 0.20703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1944782.087704588, "perf/iters_per_sec": 0.9273443640253963, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0783480644226073, "data/tokens_consumed": 82944458752, "data/tokens_consumed_B": 82.944458752, "train/loss_slope": 2.596283280881657e-06} {"step": 39560, "timestamp": 1778237300.6447158, "train/loss": 2.2055468559265137, "train/z_loss": 0.001421472104266286, "train/perplexity": 9.075213045164253, "train/grad_norm": 0.1474609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024551.1802571246, "perf/iters_per_sec": 0.96538123143059, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0358602046966552, "data/tokens_consumed": 82965430272, "data/tokens_consumed_B": 82.965430272, "train/loss_slope": 4.255276337207873e-06} {"step": 39570, "timestamp": 1778237310.9964495, "train/loss": 2.1448694109916686, "train/z_loss": 0.0014259337913244963, "train/perplexity": 8.540925813432422, "train/grad_norm": 0.1279296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027068.3171277512, "perf/iters_per_sec": 0.9665814958227879, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345739126205444, "data/tokens_consumed": 82986401792, "data/tokens_consumed_B": 82.986401792, "train/loss_slope": 5.81683046711197e-06} {"step": 39580, "timestamp": 1778237321.343367, "train/loss": 2.194173240661621, "train/z_loss": 0.001415270706638694, "train/perplexity": 8.972579825295943, "train/grad_norm": 0.19921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027792.0817234907, "perf/iters_per_sec": 0.9669266136758283, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034204649925232, "data/tokens_consumed": 83007373312, "data/tokens_consumed_B": 83.007373312, "train/loss_slope": 7.0138957598457745e-06} {"step": 39590, "timestamp": 1778237331.6997242, "train/loss": 2.1740653276443482, "train/z_loss": 0.0014261397300288081, "train/perplexity": 8.793961806736048, "train/grad_norm": 0.1953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026264.127971002, "perf/iters_per_sec": 0.9661980285506258, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349845170974732, "data/tokens_consumed": 83028344832, "data/tokens_consumed_B": 83.028344832, "train/loss_slope": 1.1762600305593316e-05} {"step": 39600, "timestamp": 1778237342.039103, "grad/layer_0/attn": 0.0028779797721654177, "grad/layer_0/mlp": 0.0028289463371038437, "grad/layer_0/attn_mlp_ratio": 1.0173327194953088, "grad/layer_4/attn": 0.0018826819723472, "grad/layer_4/mlp": 0.0024622667115181684, "grad/layer_4/attn_mlp_ratio": 0.764613308168008, "grad/layer_8/attn": 0.0031596897169947624, "grad/layer_8/mlp": 0.0036883291322737932, "grad/layer_8/attn_mlp_ratio": 0.8566723624742324, "grad/layer_12/attn": 0.00425309082493186, "grad/layer_12/mlp": 0.0067519242875278, "grad/layer_12/attn_mlp_ratio": 0.6299079463609217, "grad/layer_16/attn": 0.003571141278371215, "grad/layer_16/mlp": 0.004431404639035463, "grad/layer_16/attn_mlp_ratio": 0.8058711602020148, "grad/layer_20/attn": 0.004565471317619085, "grad/layer_20/mlp": 0.005735497456043959, "grad/layer_20/attn_mlp_ratio": 0.7960026611480426, "grad/layer_24/attn": 0.009811131283640862, "grad/layer_24/mlp": 0.009304084815084934, "grad/layer_24/attn_mlp_ratio": 1.0544971776572933, "grad/layer_27/attn": 0.0050162021070718765, "grad/layer_27/mlp": 0.007614154368638992, "grad/layer_27/attn_mlp_ratio": 0.6587996247951755} {"step": 39600, "timestamp": 1778237342.6533997, "eos/sharpness": 40.00666141510009, "eos/L0_probe": 2.0026350021362305, "eos/L_plus": 2.2343339920043945, "eos/L_minus": 2.1710026264190674, "eos/grad_norm": 0.11505108326673508, "eos/embed_grad_frac": 0.17274926602840424, "eos/time_s": 0.6114716529846191} {"step": 39600, "timestamp": 1778237342.674259, "train/loss": 2.171590256690979, "train/z_loss": 0.0014135798788629471, "train/perplexity": 8.7722230408947, "train/grad_norm": 0.11474609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1911918.1124759794, "perf/iters_per_sec": 0.9116735994701287, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0968837976455688, "data/tokens_consumed": 83049316352, "data/tokens_consumed_B": 83.049316352, "train/loss_slope": 1.1698677387460747e-05} {"step": 39600, "timestamp": 1778237344.0367932, "geo/rankme_last": 439.0305480957031, "geo/layer_0/stable_rank_q_proj": 19.226388931274414, "geo/layer_0/stable_rank_k_proj": 16.373464584350586, "geo/layer_0/stable_rank_o_proj": 49.33794021606445, "geo/layer_0/stable_rank_gate_proj": 138.3603515625, "geo/layer_0/stable_rank_down_proj": 53.30385208129883, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05970331281423569, "geo/layer_0/attn_entropy_mean": 6.196152687072754, "geo/layer_0/attn_entropy_std": 0.3740506172180176, "geo/layer_7/stable_rank_q_proj": 42.90402603149414, "geo/layer_7/stable_rank_k_proj": 42.314369201660156, "geo/layer_7/stable_rank_o_proj": 98.9144287109375, "geo/layer_7/stable_rank_gate_proj": 90.19569396972656, "geo/layer_7/stable_rank_down_proj": 145.1339569091797, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5043065547943115, "geo/layer_7/attn_entropy_mean": 4.6609649658203125, "geo/layer_7/attn_entropy_std": 0.8250154852867126, "geo/layer_14/stable_rank_q_proj": 54.19157409667969, "geo/layer_14/stable_rank_k_proj": 36.92753601074219, "geo/layer_14/stable_rank_o_proj": 48.437557220458984, "geo/layer_14/stable_rank_gate_proj": 77.1241455078125, "geo/layer_14/stable_rank_down_proj": 134.36590576171875, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3760747015476227, "geo/layer_14/attn_entropy_mean": 5.4848480224609375, "geo/layer_14/attn_entropy_std": 0.35760119557380676, "geo/layer_21/stable_rank_q_proj": 43.45302963256836, "geo/layer_21/stable_rank_k_proj": 30.85397720336914, "geo/layer_21/stable_rank_o_proj": 76.35677337646484, "geo/layer_21/stable_rank_gate_proj": 73.2175521850586, "geo/layer_21/stable_rank_down_proj": 55.419677734375, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1478312611579895, "geo/layer_21/attn_entropy_mean": 5.712165832519531, "geo/layer_21/attn_entropy_std": 0.2857552468776703, "geo/layer_27/stable_rank_q_proj": 42.573646545410156, "geo/layer_27/stable_rank_k_proj": 31.340890884399414, "geo/layer_27/stable_rank_o_proj": 116.61868286132812, "geo/layer_27/stable_rank_gate_proj": 84.79974365234375, "geo/layer_27/stable_rank_down_proj": 131.2794647216797, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09422683715820312, "geo/layer_27/attn_entropy_mean": 4.280941009521484, "geo/layer_27/attn_entropy_std": 0.6650946140289307, "attnres/final_alpha/block_0": 0.24114152789115906, "attnres/block_norm/0": 1.7194898128509521, "attnres/final_alpha/block_1": 0.005154299084097147, "attnres/block_norm/1": 39954.55859375, "attnres/final_alpha/block_2": 0.01133896503597498, "attnres/block_norm/2": 26222.69921875, "attnres/final_alpha/block_3": 0.013446513563394547, "attnres/block_norm/3": 46590.67578125, "attnres/final_alpha/block_4": 0.016525492072105408, "attnres/block_norm/4": 12791.955078125, "attnres/final_alpha/block_5": 0.5956067442893982, "attnres/block_norm/5": 6024.982421875, "attnres/final_alpha/block_6": 0.11678645759820938, "attnres/block_norm/6": 30766.203125, "geo/tier1_time_s": 1.358821153640747, "geo/step": 39600.0, "geo/rankme_slope": 6.99836575255102e-05} {"step": 39610, "timestamp": 1778237354.3924367, "train/loss": 2.1374865055084227, "train/z_loss": 0.0014273375156335533, "train/perplexity": 8.47810116499967, "train/grad_norm": 0.1630859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1790167.99543744, "perf/iters_per_sec": 0.8536186196505737, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1714833498001098, "data/tokens_consumed": 83070287872, "data/tokens_consumed_B": 83.070287872, "train/loss_slope": 5.5044349664115405e-06} {"step": 39620, "timestamp": 1778237364.743933, "train/loss": 2.1818349838256834, "train/z_loss": 0.0014188881032168865, "train/perplexity": 8.862553990225868, "train/grad_norm": 0.1708984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026965.8785260215, "perf/iters_per_sec": 0.9665326492910488, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346261978149414, "data/tokens_consumed": 83091259392, "data/tokens_consumed_B": 83.091259392, "train/loss_slope": 5.0721080604344865e-06} {"step": 39630, "timestamp": 1778237375.1203985, "train/loss": 2.2030741453170775, "train/z_loss": 0.001408691587857902, "train/perplexity": 9.052800391008242, "train/grad_norm": 0.109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021990.3659954653, "perf/iters_per_sec": 0.9641601400353743, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0371721029281615, "data/tokens_consumed": 83112230912, "data/tokens_consumed_B": 83.112230912, "train/loss_slope": 8.661422685141895e-06} {"step": 39640, "timestamp": 1778237385.5018811, "train/loss": 2.1271358013153074, "train/z_loss": 0.001430195476859808, "train/perplexity": 8.390799444300912, "train/grad_norm": 0.216796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021282.4874655844, "perf/iters_per_sec": 0.9638225972488329, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0375353336334228, "data/tokens_consumed": 83133202432, "data/tokens_consumed_B": 83.133202432, "train/loss_slope": 4.5167273814135134e-06} {"step": 39650, "timestamp": 1778237395.8543992, "grad/layer_0/attn": 0.0026835387106984854, "grad/layer_0/mlp": 0.002687098691239953, "grad/layer_0/attn_mlp_ratio": 0.9986751210811178, "grad/layer_4/attn": 0.001744248322211206, "grad/layer_4/mlp": 0.0023838328197598457, "grad/layer_4/attn_mlp_ratio": 0.7316990665549357, "grad/layer_8/attn": 0.0033708212431520224, "grad/layer_8/mlp": 0.003411500481888652, "grad/layer_8/attn_mlp_ratio": 0.9880758224247147, "grad/layer_12/attn": 0.004469422623515129, "grad/layer_12/mlp": 0.006128629203885794, "grad/layer_12/attn_mlp_ratio": 0.7292695318806994, "grad/layer_16/attn": 0.006353355478495359, "grad/layer_16/mlp": 0.004374997690320015, "grad/layer_16/attn_mlp_ratio": 1.4521962714021475, "grad/layer_20/attn": 0.0050734104588627815, "grad/layer_20/mlp": 0.005634379107505083, "grad/layer_20/attn_mlp_ratio": 0.9004382332138593, "grad/layer_24/attn": 0.010061156935989857, "grad/layer_24/mlp": 0.008303926326334476, "grad/layer_24/attn_mlp_ratio": 1.2116144121993453, "grad/layer_27/attn": 0.003528572851791978, "grad/layer_27/mlp": 0.007070439867675304, "grad/layer_27/attn_mlp_ratio": 0.4990598700963358} {"step": 39650, "timestamp": 1778237395.8703117, "train/loss": 2.220361828804016, "train/z_loss": 0.0014058439526706934, "train/perplexity": 9.210662946182596, "train/grad_norm": 0.1083984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023780.5090619244, "perf/iters_per_sec": 0.9650137467679617, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0362546682357787, "data/tokens_consumed": 83154173952, "data/tokens_consumed_B": 83.154173952, "train/loss_slope": 8.170351773241126e-06} {"step": 39660, "timestamp": 1778237406.2218597, "train/loss": 2.184345233440399, "train/z_loss": 0.0014126422349363566, "train/perplexity": 8.884829159385655, "train/grad_norm": 0.0986328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026973.118488438, "perf/iters_per_sec": 0.9665361015741529, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346225023269653, "data/tokens_consumed": 83175145472, "data/tokens_consumed_B": 83.175145472, "train/loss_slope": 6.939992552722047e-06} {"step": 39670, "timestamp": 1778237416.5701702, "train/loss": 2.1576839447021485, "train/z_loss": 0.0014168034074828028, "train/perplexity": 8.651078062684535, "train/grad_norm": 0.1162109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027618.757351625, "perf/iters_per_sec": 0.9668439661749005, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342930555343628, "data/tokens_consumed": 83196116992, "data/tokens_consumed_B": 83.196116992, "train/loss_slope": 6.784594169866182e-06} {"step": 39675, "timestamp": 1778237422.3526814, "eos/sharpness": 75.52561759948729, "eos/L0_probe": 1.9983692169189453, "eos/L_plus": 2.314528703689575, "eos/L_minus": 2.4374659061431885, "eos/grad_norm": 0.2737424075603485, "eos/embed_grad_frac": 0.03528610244393349, "eos/time_s": 0.6096551418304443} {"step": 39675, "timestamp": 1778237423.7361903, "geo/rankme_last": 439.11846923828125, "geo/layer_0/stable_rank_q_proj": 19.21958351135254, "geo/layer_0/stable_rank_k_proj": 16.384532928466797, "geo/layer_0/stable_rank_o_proj": 49.25822448730469, "geo/layer_0/stable_rank_gate_proj": 137.9990234375, "geo/layer_0/stable_rank_down_proj": 53.26649475097656, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05712980031967163, "geo/layer_0/attn_entropy_mean": 6.198322296142578, "geo/layer_0/attn_entropy_std": 0.37277841567993164, "geo/layer_7/stable_rank_q_proj": 42.94306945800781, "geo/layer_7/stable_rank_k_proj": 42.24531173706055, "geo/layer_7/stable_rank_o_proj": 98.82677459716797, "geo/layer_7/stable_rank_gate_proj": 90.35907745361328, "geo/layer_7/stable_rank_down_proj": 145.15792846679688, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5073403120040894, "geo/layer_7/attn_entropy_mean": 4.692877769470215, "geo/layer_7/attn_entropy_std": 0.8158835172653198, "geo/layer_14/stable_rank_q_proj": 54.12773895263672, "geo/layer_14/stable_rank_k_proj": 36.958099365234375, "geo/layer_14/stable_rank_o_proj": 48.38447952270508, "geo/layer_14/stable_rank_gate_proj": 77.02241516113281, "geo/layer_14/stable_rank_down_proj": 134.05972290039062, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38338780403137207, "geo/layer_14/attn_entropy_mean": 5.491382598876953, "geo/layer_14/attn_entropy_std": 0.38871705532073975, "geo/layer_21/stable_rank_q_proj": 43.32961654663086, "geo/layer_21/stable_rank_k_proj": 30.74599266052246, "geo/layer_21/stable_rank_o_proj": 76.1853256225586, "geo/layer_21/stable_rank_gate_proj": 73.08446502685547, "geo/layer_21/stable_rank_down_proj": 55.406044006347656, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14506985247135162, "geo/layer_21/attn_entropy_mean": 5.726900577545166, "geo/layer_21/attn_entropy_std": 0.29466935992240906, "geo/layer_27/stable_rank_q_proj": 42.63593292236328, "geo/layer_27/stable_rank_k_proj": 31.38017463684082, "geo/layer_27/stable_rank_o_proj": 116.36892700195312, "geo/layer_27/stable_rank_gate_proj": 84.854736328125, "geo/layer_27/stable_rank_down_proj": 131.4359130859375, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08392626792192459, "geo/layer_27/attn_entropy_mean": 4.2679924964904785, "geo/layer_27/attn_entropy_std": 0.653584361076355, "attnres/final_alpha/block_0": 0.24268405139446259, "attnres/block_norm/0": 1.7197034358978271, "attnres/final_alpha/block_1": 0.005251377820968628, "attnres/block_norm/1": 39996.8359375, "attnres/final_alpha/block_2": 0.011499271728098392, "attnres/block_norm/2": 26334.984375, "attnres/final_alpha/block_3": 0.013234851881861687, "attnres/block_norm/3": 46598.765625, "attnres/final_alpha/block_4": 0.016476310789585114, "attnres/block_norm/4": 12902.5625, "attnres/final_alpha/block_5": 0.5909544229507446, "attnres/block_norm/5": 6074.77880859375, "attnres/final_alpha/block_6": 0.1198997050523758, "attnres/block_norm/6": 30814.14453125, "geo/tier1_time_s": 1.361966609954834, "geo/step": 39675.0, "geo/rankme_slope": 8.529130402160865e-05} {"step": 39680, "timestamp": 1778237428.9139466, "train/loss": 2.171023464202881, "train/z_loss": 0.0014151022769510746, "train/perplexity": 8.767252419559687, "train/grad_norm": 0.1357421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1699708.347780766, "perf/iters_per_sec": 0.8104840983299093, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.233830499649048, "data/tokens_consumed": 83217088512, "data/tokens_consumed_B": 83.217088512, "train/loss_slope": 6.121951983635238e-06} {"step": 39690, "timestamp": 1778237439.265597, "train/loss": 2.206684970855713, "train/z_loss": 0.0014239699812605977, "train/perplexity": 9.085547560434145, "train/grad_norm": 0.1015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026871.2032618548, "perf/iters_per_sec": 0.9664875046071314, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346745252609253, "data/tokens_consumed": 83238060032, "data/tokens_consumed_B": 83.238060032, "train/loss_slope": 6.734753534881817e-06} {"step": 39700, "timestamp": 1778237449.6079097, "grad/layer_0/attn": 0.0030204111244529486, "grad/layer_0/mlp": 0.0028449499513953924, "grad/layer_0/attn_mlp_ratio": 1.061674570691143, "grad/layer_4/attn": 0.0021411909256130457, "grad/layer_4/mlp": 0.002494205255061388, "grad/layer_4/attn_mlp_ratio": 0.858466172910749, "grad/layer_8/attn": 0.0038636119570583105, "grad/layer_8/mlp": 0.0036608888767659664, "grad/layer_8/attn_mlp_ratio": 1.0553753423222985, "grad/layer_12/attn": 0.004056330770254135, "grad/layer_12/mlp": 0.006437487900257111, "grad/layer_12/attn_mlp_ratio": 0.6301108087645565, "grad/layer_16/attn": 0.003701235866174102, "grad/layer_16/mlp": 0.004790649749338627, "grad/layer_16/attn_mlp_ratio": 0.7725957819031746, "grad/layer_20/attn": 0.00312665943056345, "grad/layer_20/mlp": 0.005656178575009108, "grad/layer_20/attn_mlp_ratio": 0.552786538441241, "grad/layer_24/attn": 0.008517404086887836, "grad/layer_24/mlp": 0.007575556635856628, "grad/layer_24/attn_mlp_ratio": 1.1243271463565518, "grad/layer_27/attn": 0.004152799490839243, "grad/layer_27/mlp": 0.0064211948774755, "grad/layer_27/attn_mlp_ratio": 0.646733124505109} {"step": 39700, "timestamp": 1778237449.6236734, "train/loss": 2.1493175268173217, "train/z_loss": 0.0014284133329056203, "train/perplexity": 8.579001460373211, "train/grad_norm": 0.10009765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025622.4767399007, "perf/iters_per_sec": 0.9658920654010299, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0353123664855957, "data/tokens_consumed": 83259031552, "data/tokens_consumed_B": 83.259031552, "train/loss_slope": 6.0760013698780235e-06} {"step": 39710, "timestamp": 1778237459.987355, "train/loss": 2.1680373668670656, "train/z_loss": 0.0014245872502215207, "train/perplexity": 8.741111599408244, "train/grad_norm": 0.10595703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024709.4861971908, "perf/iters_per_sec": 0.9654567175851778, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0357792139053346, "data/tokens_consumed": 83280003072, "data/tokens_consumed_B": 83.280003072, "train/loss_slope": 2.883172106750004e-06} {"step": 39720, "timestamp": 1778237470.3358157, "train/loss": 2.207889199256897, "train/z_loss": 0.0014211463974788785, "train/perplexity": 9.096495225268194, "train/grad_norm": 0.1728515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027575.0569893317, "perf/iters_per_sec": 0.9668231282183322, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343153476715088, "data/tokens_consumed": 83300974592, "data/tokens_consumed_B": 83.300974592, "train/loss_slope": 4.4959739346851755e-06} {"step": 39730, "timestamp": 1778237480.6851947, "train/loss": 2.1627536058425902, "train/z_loss": 0.001404563244432211, "train/perplexity": 8.695047457755523, "train/grad_norm": 0.1015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027447.2851320445, "perf/iters_per_sec": 0.966762201849005, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343805313110352, "data/tokens_consumed": 83321946112, "data/tokens_consumed_B": 83.321946112, "train/loss_slope": 4.078206370765996e-06} {"step": 39740, "timestamp": 1778237491.0595706, "train/loss": 2.1909390449523927, "train/z_loss": 0.0014140288927592336, "train/perplexity": 8.943607622265779, "train/grad_norm": 0.10693359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022928.772526894, "perf/iters_per_sec": 0.96460760713906, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0366909742355346, "data/tokens_consumed": 83342917632, "data/tokens_consumed_B": 83.342917632, "train/loss_slope": -5.83851033418791e-07} {"step": 39750, "timestamp": 1778237501.413926, "grad/layer_0/attn": 0.0029409711714833975, "grad/layer_0/mlp": 0.0029135968070477247, "grad/layer_0/attn_mlp_ratio": 1.0093953505954982, "grad/layer_4/attn": 0.002089574234560132, "grad/layer_4/mlp": 0.002709485124796629, "grad/layer_4/attn_mlp_ratio": 0.7712070969927408, "grad/layer_8/attn": 0.004653900861740112, "grad/layer_8/mlp": 0.0036534746177494526, "grad/layer_8/attn_mlp_ratio": 1.2738286757892017, "grad/layer_12/attn": 0.005077849142253399, "grad/layer_12/mlp": 0.006565080489963293, "grad/layer_12/attn_mlp_ratio": 0.773463337223373, "grad/layer_16/attn": 0.006089824717491865, "grad/layer_16/mlp": 0.0051591056399047375, "grad/layer_16/attn_mlp_ratio": 1.180403159870942, "grad/layer_20/attn": 0.004063420929014683, "grad/layer_20/mlp": 0.00771511672064662, "grad/layer_20/attn_mlp_ratio": 0.5266830073318474, "grad/layer_24/attn": 0.01591709442436695, "grad/layer_24/mlp": 0.013441145420074463, "grad/layer_24/attn_mlp_ratio": 1.184206688380439, "grad/layer_27/attn": 0.013457399792969227, "grad/layer_27/mlp": 0.011190544813871384, "grad/layer_27/attn_mlp_ratio": 1.202568766449249} {"step": 39750, "timestamp": 1778237502.02297, "eos/sharpness": 71.27509117126463, "eos/L0_probe": 2.0031256675720215, "eos/L_plus": 2.4473655223846436, "eos/L_minus": 2.271636724472046, "eos/grad_norm": 0.23883318901062012, "eos/embed_grad_frac": 0.044660236686468124, "eos/time_s": 0.6063063144683838} {"step": 39750, "timestamp": 1778237502.0437305, "train/loss": 2.1778936862945555, "train/z_loss": 0.0014103082590736448, "train/perplexity": 8.827692772407891, "train/grad_norm": 0.2392578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1910602.2712919666, "perf/iters_per_sec": 0.9110461574992974, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.097639226913452, "data/tokens_consumed": 83363889152, "data/tokens_consumed_B": 83.363889152, "train/loss_slope": 3.433627321167158e-06} {"step": 39750, "timestamp": 1778237503.4064958, "geo/rankme_last": 439.8418884277344, "geo/layer_0/stable_rank_q_proj": 19.187639236450195, "geo/layer_0/stable_rank_k_proj": 16.387590408325195, "geo/layer_0/stable_rank_o_proj": 49.22478485107422, "geo/layer_0/stable_rank_gate_proj": 137.68020629882812, "geo/layer_0/stable_rank_down_proj": 53.302886962890625, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06056743860244751, "geo/layer_0/attn_entropy_mean": 6.198236465454102, "geo/layer_0/attn_entropy_std": 0.3727734684944153, "geo/layer_7/stable_rank_q_proj": 42.87056350708008, "geo/layer_7/stable_rank_k_proj": 42.25712203979492, "geo/layer_7/stable_rank_o_proj": 98.75028228759766, "geo/layer_7/stable_rank_gate_proj": 90.30635070800781, "geo/layer_7/stable_rank_down_proj": 145.49119567871094, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5017590522766113, "geo/layer_7/attn_entropy_mean": 4.674564838409424, "geo/layer_7/attn_entropy_std": 0.8309302926063538, "geo/layer_14/stable_rank_q_proj": 54.11650085449219, "geo/layer_14/stable_rank_k_proj": 36.92241287231445, "geo/layer_14/stable_rank_o_proj": 48.37002944946289, "geo/layer_14/stable_rank_gate_proj": 76.9620132446289, "geo/layer_14/stable_rank_down_proj": 133.8142547607422, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39216479659080505, "geo/layer_14/attn_entropy_mean": 5.490653038024902, "geo/layer_14/attn_entropy_std": 0.3671551048755646, "geo/layer_21/stable_rank_q_proj": 43.29496383666992, "geo/layer_21/stable_rank_k_proj": 30.68764877319336, "geo/layer_21/stable_rank_o_proj": 76.10823059082031, "geo/layer_21/stable_rank_gate_proj": 73.08236694335938, "geo/layer_21/stable_rank_down_proj": 55.476680755615234, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14466911554336548, "geo/layer_21/attn_entropy_mean": 5.709367752075195, "geo/layer_21/attn_entropy_std": 0.2912304997444153, "geo/layer_27/stable_rank_q_proj": 42.56411361694336, "geo/layer_27/stable_rank_k_proj": 31.384174346923828, "geo/layer_27/stable_rank_o_proj": 115.96163177490234, "geo/layer_27/stable_rank_gate_proj": 84.82269287109375, "geo/layer_27/stable_rank_down_proj": 131.513916015625, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09655972570180893, "geo/layer_27/attn_entropy_mean": 4.276307582855225, "geo/layer_27/attn_entropy_std": 0.6375783681869507, "attnres/final_alpha/block_0": 0.24035149812698364, "attnres/block_norm/0": 1.7201439142227173, "attnres/final_alpha/block_1": 0.005136589985340834, "attnres/block_norm/1": 40012.28515625, "attnres/final_alpha/block_2": 0.011314399540424347, "attnres/block_norm/2": 26350.841796875, "attnres/final_alpha/block_3": 0.013479437679052353, "attnres/block_norm/3": 46825.87890625, "attnres/final_alpha/block_4": 0.016101907938718796, "attnres/block_norm/4": 12823.228515625, "attnres/final_alpha/block_5": 0.5975652933120728, "attnres/block_norm/5": 6012.8916015625, "attnres/final_alpha/block_6": 0.11605088412761688, "attnres/block_norm/6": 30873.447265625, "geo/tier1_time_s": 1.3586187362670898, "geo/step": 39750.0, "geo/rankme_slope": 0.00011419710462309924} {"step": 39760, "timestamp": 1778237513.761583, "train/loss": 2.2052524089813232, "train/z_loss": 0.0014079307205975057, "train/perplexity": 9.072541269773506, "train/grad_norm": 0.08642578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1790341.288688116, "perf/iters_per_sec": 0.8537012523117619, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1713699579238892, "data/tokens_consumed": 83384860672, "data/tokens_consumed_B": 83.384860672, "train/loss_slope": 6.38989603916444e-06} {"step": 39770, "timestamp": 1778237524.1110713, "train/loss": 2.175588059425354, "train/z_loss": 0.001399163657333702, "train/perplexity": 8.807362852369726, "train/grad_norm": 0.11474609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027434.1536774687, "perf/iters_per_sec": 0.966755940283522, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034387230873108, "data/tokens_consumed": 83405832192, "data/tokens_consumed_B": 83.405832192, "train/loss_slope": 7.528153275570751e-06} {"step": 39780, "timestamp": 1778237534.9816656, "train/loss": 2.179453897476196, "train/z_loss": 0.0014162847073748708, "train/perplexity": 8.841476587414526, "train/grad_norm": 0.2255859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1930395.4986988842, "perf/iters_per_sec": 0.9204843038076802, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0863846302032472, "data/tokens_consumed": 83426803712, "data/tokens_consumed_B": 83.426803712, "train/loss_slope": 6.875024899111572e-06} {"step": 39790, "timestamp": 1778237545.3413582, "train/loss": 2.113267517089844, "train/z_loss": 0.0014327978366054594, "train/perplexity": 8.27523663384944, "train/grad_norm": 0.0849609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025738.8219730183, "perf/iters_per_sec": 0.9659475431313602, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352529048919679, "data/tokens_consumed": 83447775232, "data/tokens_consumed_B": 83.447775232, "train/loss_slope": 5.447076141673271e-06} {"step": 39800, "timestamp": 1778237555.6833093, "grad/layer_0/attn": 0.002658348297700286, "grad/layer_0/mlp": 0.00275005423463881, "grad/layer_0/attn_mlp_ratio": 0.966653009075703, "grad/layer_4/attn": 0.0019850649405270815, "grad/layer_4/mlp": 0.002501780865713954, "grad/layer_4/attn_mlp_ratio": 0.7934607256717164, "grad/layer_8/attn": 0.003890648949891329, "grad/layer_8/mlp": 0.0036649173125624657, "grad/layer_8/attn_mlp_ratio": 1.061592530449693, "grad/layer_12/attn": 0.006551670376211405, "grad/layer_12/mlp": 0.006801435723900795, "grad/layer_12/attn_mlp_ratio": 0.9632775410727696, "grad/layer_16/attn": 0.0038519552908837795, "grad/layer_16/mlp": 0.005013412795960903, "grad/layer_16/attn_mlp_ratio": 0.7683299522341634, "grad/layer_20/attn": 0.004303296562284231, "grad/layer_20/mlp": 0.0060422951355576515, "grad/layer_20/attn_mlp_ratio": 0.7121956797079735, "grad/layer_24/attn": 0.015201154164969921, "grad/layer_24/mlp": 0.009439107030630112, "grad/layer_24/attn_mlp_ratio": 1.610444076393819, "grad/layer_27/attn": 0.00480899540707469, "grad/layer_27/mlp": 0.008907235227525234, "grad/layer_27/attn_mlp_ratio": 0.5398976483998218} {"step": 39800, "timestamp": 1778237555.699293, "train/loss": 2.149980640411377, "train/z_loss": 0.0014128780574537813, "train/perplexity": 8.584692199459703, "train/grad_norm": 0.138671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025649.9057672839, "perf/iters_per_sec": 0.9659051445804996, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352983474731445, "data/tokens_consumed": 83468746752, "data/tokens_consumed_B": 83.468746752, "train/loss_slope": 5.65207712005785e-06} {"step": 39810, "timestamp": 1778237566.0525317, "train/loss": 2.1587302446365357, "train/z_loss": 0.0014154735719785095, "train/perplexity": 8.660134422101834, "train/grad_norm": 0.1611328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026815.1122234461, "perf/iters_per_sec": 0.9664607583157759, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347031593322753, "data/tokens_consumed": 83489718272, "data/tokens_consumed_B": 83.489718272, "train/loss_slope": 5.79937497953589e-06} {"step": 39820, "timestamp": 1778237576.4122596, "train/loss": 2.1629473209381103, "train/z_loss": 0.001432294864207506, "train/perplexity": 8.696731982858056, "train/grad_norm": 0.1455078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025451.0155074773, "perf/iters_per_sec": 0.9658103063142192, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354000091552735, "data/tokens_consumed": 83510689792, "data/tokens_consumed_B": 83.510689792, "train/loss_slope": 3.804392632943068e-06} {"step": 39825, "timestamp": 1778237582.187587, "eos/sharpness": 6.051921844482421, "eos/L0_probe": 1.9996038675308228, "eos/L_plus": 2.032074213027954, "eos/L_minus": 2.0276527404785156, "eos/grad_norm": 0.0909975990653038, "eos/embed_grad_frac": 0.2690742611885071, "eos/time_s": 0.6092302799224854} {"step": 39825, "timestamp": 1778237583.5662212, "geo/rankme_last": 439.4997253417969, "geo/layer_0/stable_rank_q_proj": 19.238487243652344, "geo/layer_0/stable_rank_k_proj": 16.435714721679688, "geo/layer_0/stable_rank_o_proj": 49.14968490600586, "geo/layer_0/stable_rank_gate_proj": 137.7397003173828, "geo/layer_0/stable_rank_down_proj": 53.3166389465332, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06598351150751114, "geo/layer_0/attn_entropy_mean": 6.1977996826171875, "geo/layer_0/attn_entropy_std": 0.36989131569862366, "geo/layer_7/stable_rank_q_proj": 42.907554626464844, "geo/layer_7/stable_rank_k_proj": 42.25094223022461, "geo/layer_7/stable_rank_o_proj": 98.29084777832031, "geo/layer_7/stable_rank_gate_proj": 90.19063568115234, "geo/layer_7/stable_rank_down_proj": 145.77993774414062, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.49801093339920044, "geo/layer_7/attn_entropy_mean": 4.6710968017578125, "geo/layer_7/attn_entropy_std": 0.8269239664077759, "geo/layer_14/stable_rank_q_proj": 54.25288391113281, "geo/layer_14/stable_rank_k_proj": 36.96649932861328, "geo/layer_14/stable_rank_o_proj": 48.28095626831055, "geo/layer_14/stable_rank_gate_proj": 77.08470916748047, "geo/layer_14/stable_rank_down_proj": 133.68612670898438, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3877514600753784, "geo/layer_14/attn_entropy_mean": 5.519583702087402, "geo/layer_14/attn_entropy_std": 0.3740864396095276, "geo/layer_21/stable_rank_q_proj": 43.26815414428711, "geo/layer_21/stable_rank_k_proj": 30.718812942504883, "geo/layer_21/stable_rank_o_proj": 76.04195404052734, "geo/layer_21/stable_rank_gate_proj": 72.97833251953125, "geo/layer_21/stable_rank_down_proj": 55.46177291870117, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14743196964263916, "geo/layer_21/attn_entropy_mean": 5.73837947845459, "geo/layer_21/attn_entropy_std": 0.28624242544174194, "geo/layer_27/stable_rank_q_proj": 42.60012435913086, "geo/layer_27/stable_rank_k_proj": 31.391294479370117, "geo/layer_27/stable_rank_o_proj": 115.81940460205078, "geo/layer_27/stable_rank_gate_proj": 84.79884338378906, "geo/layer_27/stable_rank_down_proj": 131.56982421875, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09003733843564987, "geo/layer_27/attn_entropy_mean": 4.2914719581604, "geo/layer_27/attn_entropy_std": 0.6572308540344238, "attnres/final_alpha/block_0": 0.24045652151107788, "attnres/block_norm/0": 1.7201151847839355, "attnres/final_alpha/block_1": 0.005252827890217304, "attnres/block_norm/1": 40134.01953125, "attnres/final_alpha/block_2": 0.011231523007154465, "attnres/block_norm/2": 26362.119140625, "attnres/final_alpha/block_3": 0.013188375160098076, "attnres/block_norm/3": 46616.11328125, "attnres/final_alpha/block_4": 0.016277635470032692, "attnres/block_norm/4": 12827.544921875, "attnres/final_alpha/block_5": 0.5960619449615479, "attnres/block_norm/5": 5998.78662109375, "attnres/final_alpha/block_6": 0.1175311803817749, "attnres/block_norm/6": 30802.82421875, "geo/tier1_time_s": 1.3594098091125488, "geo/step": 39825.0, "geo/rankme_slope": 0.00011735754848814526} {"step": 39830, "timestamp": 1778237588.7442682, "train/loss": 2.174557685852051, "train/z_loss": 0.0014154019067063929, "train/perplexity": 8.798292652085955, "train/grad_norm": 0.2265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1701409.737726478, "perf/iters_per_sec": 0.811295384276618, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2325966835021973, "data/tokens_consumed": 83531661312, "data/tokens_consumed_B": 83.531661312, "train/loss_slope": 1.5963665830312569e-06} {"step": 39840, "timestamp": 1778237599.1033175, "train/loss": 2.16399245262146, "train/z_loss": 0.001413857494480908, "train/perplexity": 8.705825964371265, "train/grad_norm": 0.11962890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025363.4303564874, "perf/iters_per_sec": 0.9657685424597203, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354447841644288, "data/tokens_consumed": 83552632832, "data/tokens_consumed_B": 83.552632832, "train/loss_slope": -2.689109634001417e-07} {"step": 39850, "timestamp": 1778237609.454401, "grad/layer_0/attn": 0.0032776757143437862, "grad/layer_0/mlp": 0.0029090556781738997, "grad/layer_0/attn_mlp_ratio": 1.1267146332963338, "grad/layer_4/attn": 0.0021801383700221777, "grad/layer_4/mlp": 0.0023808395490050316, "grad/layer_4/attn_mlp_ratio": 0.9157014715095457, "grad/layer_8/attn": 0.004010295029729605, "grad/layer_8/mlp": 0.0034462837502360344, "grad/layer_8/attn_mlp_ratio": 1.1636577844436526, "grad/layer_12/attn": 0.004855550359934568, "grad/layer_12/mlp": 0.00600045220926404, "grad/layer_12/attn_mlp_ratio": 0.8091973920762826, "grad/layer_16/attn": 0.004397895187139511, "grad/layer_16/mlp": 0.004518957808613777, "grad/layer_16/attn_mlp_ratio": 0.9732100356050002, "grad/layer_20/attn": 0.005066561512649059, "grad/layer_20/mlp": 0.005783618427813053, "grad/layer_20/attn_mlp_ratio": 0.8760193101056524, "grad/layer_24/attn": 0.009519795887172222, "grad/layer_24/mlp": 0.008229496888816357, "grad/layer_24/attn_mlp_ratio": 1.156789521900226, "grad/layer_27/attn": 0.005860293749719858, "grad/layer_27/mlp": 0.00647301459684968, "grad/layer_27/attn_mlp_ratio": 0.9053422592369482} {"step": 39850, "timestamp": 1778237609.470304, "train/loss": 2.1084163665771483, "train/z_loss": 0.0014266474056057632, "train/perplexity": 8.235189431451763, "train/grad_norm": 0.09130859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024144.5076176731, "perf/iters_per_sec": 0.965187314804875, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036068320274353, "data/tokens_consumed": 83573604352, "data/tokens_consumed_B": 83.573604352, "train/loss_slope": -5.436776687960891e-06} {"step": 39860, "timestamp": 1778237619.826087, "train/loss": 2.152623510360718, "train/z_loss": 0.001412950421217829, "train/perplexity": 8.607410431941508, "train/grad_norm": 0.30078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026032.9168381018, "perf/iters_per_sec": 0.9660877784910687, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03510262966156, "data/tokens_consumed": 83594575872, "data/tokens_consumed_B": 83.594575872, "train/loss_slope": -5.967667708695956e-06} {"step": 39870, "timestamp": 1778237630.1965022, "train/loss": 2.1487441539764403, "train/z_loss": 0.0014312671730294824, "train/perplexity": 8.574083903865347, "train/grad_norm": 0.236328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023210.1851427052, "perf/iters_per_sec": 0.964741795131066, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036546778678894, "data/tokens_consumed": 83615547392, "data/tokens_consumed_B": 83.615547392, "train/loss_slope": -9.638899845509041e-06} {"step": 39880, "timestamp": 1778237640.5687313, "train/loss": 2.1511626124382017, "train/z_loss": 0.0014194993302226067, "train/perplexity": 8.594845064517598, "train/grad_norm": 0.1708984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023255.372934742, "perf/iters_per_sec": 0.9647633423494062, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0365236282348633, "data/tokens_consumed": 83636518912, "data/tokens_consumed_B": 83.636518912, "train/loss_slope": -1.0640797527781406e-05} {"step": 39890, "timestamp": 1778237650.955706, "train/loss": 2.1911232471466064, "train/z_loss": 0.0014158627134747803, "train/perplexity": 8.945255206153613, "train/grad_norm": 0.12109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020597.3839439692, "perf/iters_per_sec": 0.9634959144325109, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0378871202468871, "data/tokens_consumed": 83657490432, "data/tokens_consumed_B": 83.657490432, "train/loss_slope": -7.067522886741558e-06} {"step": 39900, "timestamp": 1778237661.298008, "grad/layer_0/attn": 0.0028326669707894325, "grad/layer_0/mlp": 0.0028512475546449423, "grad/layer_0/attn_mlp_ratio": 0.9934833146376324, "grad/layer_4/attn": 0.0018711044685915112, "grad/layer_4/mlp": 0.00242229038849473, "grad/layer_4/attn_mlp_ratio": 0.7724525516154179, "grad/layer_8/attn": 0.005700539797544479, "grad/layer_8/mlp": 0.003623155876994133, "grad/layer_8/attn_mlp_ratio": 1.5733630662717975, "grad/layer_12/attn": 0.006063137669116259, "grad/layer_12/mlp": 0.0067138406448066235, "grad/layer_12/attn_mlp_ratio": 0.9030803528973033, "grad/layer_16/attn": 0.004748601466417313, "grad/layer_16/mlp": 0.00468381168320775, "grad/layer_16/attn_mlp_ratio": 1.013832682910497, "grad/layer_20/attn": 0.006757997442036867, "grad/layer_20/mlp": 0.005975543987005949, "grad/layer_20/attn_mlp_ratio": 1.1309426126957032, "grad/layer_24/attn": 0.006626727990806103, "grad/layer_24/mlp": 0.007937203161418438, "grad/layer_24/attn_mlp_ratio": 0.8348945809435976, "grad/layer_27/attn": 0.007670605555176735, "grad/layer_27/mlp": 0.00717179337516427, "grad/layer_27/attn_mlp_ratio": 1.0695519303142007} {"step": 39900, "timestamp": 1778237661.907556, "eos/sharpness": 41.95239543914794, "eos/L0_probe": 2.000478982925415, "eos/L_plus": 2.246715784072876, "eos/L_minus": 2.1737661361694336, "eos/grad_norm": 0.1266796588897705, "eos/embed_grad_frac": 0.1308472752571106, "eos/time_s": 0.6067242622375488} {"step": 39900, "timestamp": 1778237661.927187, "train/loss": 2.208967995643616, "train/z_loss": 0.0014129347866401077, "train/perplexity": 9.106313786611032, "train/grad_norm": 0.126953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1912520.1783764963, "perf/iters_per_sec": 0.9119606868631822, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.096538496017456, "data/tokens_consumed": 83678461952, "data/tokens_consumed_B": 83.678461952, "train/loss_slope": -9.304665703692003e-06} {"step": 39900, "timestamp": 1778237663.2876484, "geo/rankme_last": 439.50494384765625, "geo/layer_0/stable_rank_q_proj": 19.28302001953125, "geo/layer_0/stable_rank_k_proj": 16.45633316040039, "geo/layer_0/stable_rank_o_proj": 49.2298698425293, "geo/layer_0/stable_rank_gate_proj": 137.45358276367188, "geo/layer_0/stable_rank_down_proj": 53.232093811035156, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06419957429170609, "geo/layer_0/attn_entropy_mean": 6.198568344116211, "geo/layer_0/attn_entropy_std": 0.3693007528781891, "geo/layer_7/stable_rank_q_proj": 42.858741760253906, "geo/layer_7/stable_rank_k_proj": 42.27433395385742, "geo/layer_7/stable_rank_o_proj": 98.28092193603516, "geo/layer_7/stable_rank_gate_proj": 90.1424789428711, "geo/layer_7/stable_rank_down_proj": 145.84962463378906, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.49922093749046326, "geo/layer_7/attn_entropy_mean": 4.663376808166504, "geo/layer_7/attn_entropy_std": 0.81834876537323, "geo/layer_14/stable_rank_q_proj": 54.188148498535156, "geo/layer_14/stable_rank_k_proj": 36.94078063964844, "geo/layer_14/stable_rank_o_proj": 48.215904235839844, "geo/layer_14/stable_rank_gate_proj": 77.12799835205078, "geo/layer_14/stable_rank_down_proj": 133.78370666503906, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3880859613418579, "geo/layer_14/attn_entropy_mean": 5.505220890045166, "geo/layer_14/attn_entropy_std": 0.3737504482269287, "geo/layer_21/stable_rank_q_proj": 43.301692962646484, "geo/layer_21/stable_rank_k_proj": 30.68631935119629, "geo/layer_21/stable_rank_o_proj": 76.06723022460938, "geo/layer_21/stable_rank_gate_proj": 72.9804916381836, "geo/layer_21/stable_rank_down_proj": 55.367103576660156, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14393925666809082, "geo/layer_21/attn_entropy_mean": 5.729062080383301, "geo/layer_21/attn_entropy_std": 0.2791002094745636, "geo/layer_27/stable_rank_q_proj": 42.65861511230469, "geo/layer_27/stable_rank_k_proj": 31.413005828857422, "geo/layer_27/stable_rank_o_proj": 116.02901458740234, "geo/layer_27/stable_rank_gate_proj": 84.74746704101562, "geo/layer_27/stable_rank_down_proj": 131.44766235351562, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08201269060373306, "geo/layer_27/attn_entropy_mean": 4.293667316436768, "geo/layer_27/attn_entropy_std": 0.6423304677009583, "attnres/final_alpha/block_0": 0.23963016271591187, "attnres/block_norm/0": 1.7202435731887817, "attnres/final_alpha/block_1": 0.005250409245491028, "attnres/block_norm/1": 40069.671875, "attnres/final_alpha/block_2": 0.011157708242535591, "attnres/block_norm/2": 26350.015625, "attnres/final_alpha/block_3": 0.013155165128409863, "attnres/block_norm/3": 46846.80078125, "attnres/final_alpha/block_4": 0.016255710273981094, "attnres/block_norm/4": 12825.716796875, "attnres/final_alpha/block_5": 0.5977690815925598, "attnres/block_norm/5": 5993.912109375, "attnres/final_alpha/block_6": 0.11678175628185272, "attnres/block_norm/6": 30848.08984375, "geo/tier1_time_s": 1.3566560745239258, "geo/step": 39900.0, "geo/rankme_slope": 0.00013176161089435774} {"step": 39910, "timestamp": 1778237673.64039, "train/loss": 2.1738625526428224, "train/z_loss": 0.0014214712078683077, "train/perplexity": 8.792178791898795, "train/grad_norm": 0.1669921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1790977.5435675215, "perf/iters_per_sec": 0.8540046422803504, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1709538221359252, "data/tokens_consumed": 83699433472, "data/tokens_consumed_B": 83.699433472, "train/loss_slope": -8.683130557280329e-06} {"step": 39920, "timestamp": 1778237683.9925609, "train/loss": 2.1264426827430727, "train/z_loss": 0.0014188907225616276, "train/perplexity": 8.384985640430642, "train/grad_norm": 0.09423828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026848.3181168851, "perf/iters_per_sec": 0.966476592119639, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346862077713013, "data/tokens_consumed": 83720404992, "data/tokens_consumed_B": 83.720404992, "train/loss_slope": -9.669490859131572e-06} {"step": 39930, "timestamp": 1778237694.3409088, "train/loss": 2.176489996910095, "train/z_loss": 0.0014156392076984047, "train/perplexity": 8.815310126501462, "train/grad_norm": 0.2109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027561.3163021214, "perf/iters_per_sec": 0.966816576148091, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343223571777345, "data/tokens_consumed": 83741376512, "data/tokens_consumed_B": 83.741376512, "train/loss_slope": -9.501308132999162e-06} {"step": 39940, "timestamp": 1778237704.6968565, "train/loss": 2.1702088117599487, "train/z_loss": 0.0014150745468214155, "train/perplexity": 8.760113064399691, "train/grad_norm": 0.248046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026111.459231687, "perf/iters_per_sec": 0.9661252304228244, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350625038146972, "data/tokens_consumed": 83762348032, "data/tokens_consumed_B": 83.762348032, "train/loss_slope": -1.2110688948895992e-05} {"step": 39950, "timestamp": 1778237715.6066487, "grad/layer_0/attn": 0.0027952161617577076, "grad/layer_0/mlp": 0.002768756588920951, "grad/layer_0/attn_mlp_ratio": 1.009556445657588, "grad/layer_4/attn": 0.0021125709172338247, "grad/layer_4/mlp": 0.0024737538769841194, "grad/layer_4/attn_mlp_ratio": 0.8539939447856365, "grad/layer_8/attn": 0.0038499345537275076, "grad/layer_8/mlp": 0.0036401168908923864, "grad/layer_8/attn_mlp_ratio": 1.0576403350112338, "grad/layer_12/attn": 0.00696578249335289, "grad/layer_12/mlp": 0.006899100728332996, "grad/layer_12/attn_mlp_ratio": 1.0096652689501286, "grad/layer_16/attn": 0.0037652275059372187, "grad/layer_16/mlp": 0.00449948338791728, "grad/layer_16/attn_mlp_ratio": 0.8368132733564195, "grad/layer_20/attn": 0.0035853227600455284, "grad/layer_20/mlp": 0.00610462436452508, "grad/layer_20/attn_mlp_ratio": 0.587312582597078, "grad/layer_24/attn": 0.006450367160141468, "grad/layer_24/mlp": 0.009937992319464684, "grad/layer_24/attn_mlp_ratio": 0.6490613886470363, "grad/layer_27/attn": 0.013522559776902199, "grad/layer_27/mlp": 0.008057844825088978, "grad/layer_27/attn_mlp_ratio": 1.6781856566633886} {"step": 39950, "timestamp": 1778237715.6226182, "train/loss": 2.1428513288497926, "train/z_loss": 0.001406478334683925, "train/perplexity": 8.523706904003946, "train/grad_norm": 0.1328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1920511.4528624662, "perf/iters_per_sec": 0.9157712234794932, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.091975784301758, "data/tokens_consumed": 83783319552, "data/tokens_consumed_B": 83.783319552, "train/loss_slope": -1.2074239676756233e-05} {"step": 39960, "timestamp": 1778237725.9787393, "train/loss": 2.142668676376343, "train/z_loss": 0.0014074253151193262, "train/perplexity": 8.522150170029954, "train/grad_norm": 0.13671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026323.4560880126, "perf/iters_per_sec": 0.9662263184013427, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349542140960692, "data/tokens_consumed": 83804291072, "data/tokens_consumed_B": 83.804291072, "train/loss_slope": -1.4653455821236731e-05} {"step": 39970, "timestamp": 1778237736.846602, "train/loss": 2.1770703077316282, "train/z_loss": 0.0014144151122309268, "train/perplexity": 8.820427230974989, "train/grad_norm": 0.1826171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1931003.7065412959, "perf/iters_per_sec": 0.9207743199068527, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0860424518585206, "data/tokens_consumed": 83825262592, "data/tokens_consumed_B": 83.825262592, "train/loss_slope": -1.4778456686496594e-05} {"step": 39975, "timestamp": 1778237742.625564, "eos/sharpness": 44.25010681152343, "eos/L0_probe": 2.0039572715759277, "eos/L_plus": 2.2529358863830566, "eos/L_minus": 2.197479724884033, "eos/grad_norm": 0.12549757957458496, "eos/embed_grad_frac": 0.15416578948497772, "eos/time_s": 0.6109747886657715} {"step": 39975, "timestamp": 1778237744.005698, "geo/rankme_last": 439.995849609375, "geo/layer_0/stable_rank_q_proj": 19.272756576538086, "geo/layer_0/stable_rank_k_proj": 16.41543197631836, "geo/layer_0/stable_rank_o_proj": 49.31616973876953, "geo/layer_0/stable_rank_gate_proj": 137.55242919921875, "geo/layer_0/stable_rank_down_proj": 53.18358612060547, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06336680799722672, "geo/layer_0/attn_entropy_mean": 6.19514799118042, "geo/layer_0/attn_entropy_std": 0.3696407675743103, "geo/layer_7/stable_rank_q_proj": 42.84621810913086, "geo/layer_7/stable_rank_k_proj": 42.23258590698242, "geo/layer_7/stable_rank_o_proj": 98.07779693603516, "geo/layer_7/stable_rank_gate_proj": 90.052734375, "geo/layer_7/stable_rank_down_proj": 146.04347229003906, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5072503089904785, "geo/layer_7/attn_entropy_mean": 4.666796684265137, "geo/layer_7/attn_entropy_std": 0.8178215026855469, "geo/layer_14/stable_rank_q_proj": 54.167240142822266, "geo/layer_14/stable_rank_k_proj": 36.94442367553711, "geo/layer_14/stable_rank_o_proj": 48.21147918701172, "geo/layer_14/stable_rank_gate_proj": 77.02286529541016, "geo/layer_14/stable_rank_down_proj": 133.9010467529297, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3824681341648102, "geo/layer_14/attn_entropy_mean": 5.515852451324463, "geo/layer_14/attn_entropy_std": 0.37683776021003723, "geo/layer_21/stable_rank_q_proj": 43.36795425415039, "geo/layer_21/stable_rank_k_proj": 30.76349449157715, "geo/layer_21/stable_rank_o_proj": 76.02899169921875, "geo/layer_21/stable_rank_gate_proj": 72.94806671142578, "geo/layer_21/stable_rank_down_proj": 55.380008697509766, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14903181791305542, "geo/layer_21/attn_entropy_mean": 5.715770721435547, "geo/layer_21/attn_entropy_std": 0.2817193865776062, "geo/layer_27/stable_rank_q_proj": 42.6273193359375, "geo/layer_27/stable_rank_k_proj": 31.36707878112793, "geo/layer_27/stable_rank_o_proj": 116.04499053955078, "geo/layer_27/stable_rank_gate_proj": 84.79320526123047, "geo/layer_27/stable_rank_down_proj": 131.44847106933594, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08570043742656708, "geo/layer_27/attn_entropy_mean": 4.27485990524292, "geo/layer_27/attn_entropy_std": 0.6577066779136658, "attnres/final_alpha/block_0": 0.240046888589859, "attnres/block_norm/0": 1.7203960418701172, "attnres/final_alpha/block_1": 0.005156739614903927, "attnres/block_norm/1": 40109.0390625, "attnres/final_alpha/block_2": 0.011414764449000359, "attnres/block_norm/2": 26279.1640625, "attnres/final_alpha/block_3": 0.013432569801807404, "attnres/block_norm/3": 46508.90625, "attnres/final_alpha/block_4": 0.016334161162376404, "attnres/block_norm/4": 12762.708984375, "attnres/final_alpha/block_5": 0.596931517124176, "attnres/block_norm/5": 6028.513671875, "attnres/final_alpha/block_6": 0.11668334901332855, "attnres/block_norm/6": 30892.0234375, "geo/tier1_time_s": 1.3603110313415527, "geo/step": 39975.0, "geo/rankme_slope": 0.00015124647515256102} {"step": 39980, "timestamp": 1778237749.1846647, "train/loss": 2.179979109764099, "train/z_loss": 0.0014112244243733585, "train/perplexity": 8.846121459226545, "train/grad_norm": 0.2080078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1700656.8596249248, "perf/iters_per_sec": 0.8109363840222, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.233142352104187, "data/tokens_consumed": 83846234112, "data/tokens_consumed_B": 83.846234112, "train/loss_slope": -1.4150453193245665e-05} {"step": 39990, "timestamp": 1778237759.5410335, "train/loss": 2.1415764331817626, "train/z_loss": 0.001403756788931787, "train/perplexity": 8.512846991095374, "train/grad_norm": 0.1220703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026276.1240134218, "perf/iters_per_sec": 0.966203748709403, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349783897399902, "data/tokens_consumed": 83867205632, "data/tokens_consumed_B": 83.867205632, "train/loss_slope": -1.818551510760208e-05} {"step": 40000, "timestamp": 1778237769.8803732, "grad/layer_0/attn": 0.002762536983937025, "grad/layer_0/mlp": 0.002742389915511012, "grad/layer_0/attn_mlp_ratio": 1.0073465000645645, "grad/layer_4/attn": 0.0017773255240172148, "grad/layer_4/mlp": 0.0024384702555835247, "grad/layer_4/attn_mlp_ratio": 0.7288690305164287, "grad/layer_8/attn": 0.0061063701286911964, "grad/layer_8/mlp": 0.003659298876300454, "grad/layer_8/attn_mlp_ratio": 1.6687267611198928, "grad/layer_12/attn": 0.004779533948749304, "grad/layer_12/mlp": 0.006253881845623255, "grad/layer_12/attn_mlp_ratio": 0.7642507470250913, "grad/layer_16/attn": 0.004260173998773098, "grad/layer_16/mlp": 0.004265188239514828, "grad/layer_16/attn_mlp_ratio": 0.9988243565482738, "grad/layer_20/attn": 0.0032749564852565527, "grad/layer_20/mlp": 0.005831500049680471, "grad/layer_20/attn_mlp_ratio": 0.5615975994506318, "grad/layer_24/attn": 0.012410136871039867, "grad/layer_24/mlp": 0.00990943145006895, "grad/layer_24/attn_mlp_ratio": 1.2523560820149686, "grad/layer_27/attn": 0.012688199989497662, "grad/layer_27/mlp": 0.008317273110151291, "grad/layer_27/attn_mlp_ratio": 1.5255240111640944} {"step": 40000, "timestamp": 1778237769.895998, "train/loss": 2.183076786994934, "train/z_loss": 0.0014014318468980492, "train/perplexity": 8.873566374049991, "train/grad_norm": 0.1904296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026502.4884196722, "perf/iters_per_sec": 0.9663116876695977, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348627805709838, "data/tokens_consumed": 83888177152, "data/tokens_consumed_B": 83.888177152, "train/loss_slope": -1.493535230655483e-05} {"step": 40000, "timestamp": 1778237776.7702422, "geo/ww_alpha_mean": 7.720287355488439, "geo/ww_alpha_std": 4.40045708738544, "geo/ww_alpha_min": 1.357418036994146, "geo/ww_alpha_max": 31.36894194025179, "geo/ww_alpha_healthy_frac": 0.18274111675126903, "geo/ww_alpha_by_type/q_proj": 4.042788790751449, "geo/ww_alpha_by_type/k_proj": 4.480330517349665, "geo/ww_alpha_by_type/v_proj": 8.480875519404538, "geo/ww_alpha_by_type/o_proj": 8.593329482494326, "geo/ww_alpha_by_type/gate_proj": 7.91750850500876, "geo/ww_alpha_by_type/up_proj": 11.899295954239122, "geo/ww_alpha_by_type/down_proj": 8.737107101309322, "geo/twonn_id/layer_0": 0.6993090510368347, "geo/twonn_id/layer_7": 3.3207149505615234, "geo/twonn_id/layer_14": 4.444146633148193, "geo/twonn_id/layer_21": 6.787056922912598, "geo/twonn_id/layer_27": 5.382778644561768, "geo/tier2_time_s": 6.867618560791016} {"step": 40000, "timestamp": 1778237777.415201, "eoc/jacobian_sigma/layer_0/attn": 966.9035034179688, "eoc/jacobian_sigma/layer_0/mlp": 9690.1689453125, "eoc/jacobian_sigma/layer_0": 9690.1689453125, "eoc/jacobian_sigma/layer_7/attn": 1.1349292993545532, "eoc/jacobian_sigma/layer_7/mlp": 1.704549789428711, "eoc/jacobian_sigma/layer_7": 1.704549789428711, "eoc/jacobian_sigma/layer_14/attn": 1.6038291454315186, "eoc/jacobian_sigma/layer_14/mlp": 6.492241382598877, "eoc/jacobian_sigma/layer_14": 6.492241382598877, "eoc/jacobian_sigma/layer_21/attn": 1.0933876037597656, "eoc/jacobian_sigma/layer_21/mlp": 4.345794200897217, "eoc/jacobian_sigma/layer_21": 4.345794200897217, "eoc/jacobian_sigma/layer_27/attn": 4.2458062171936035, "eoc/jacobian_sigma/layer_27/mlp": 29.80388641357422, "eoc/jacobian_sigma/layer_27": 29.80388641357422, "eoc/layer0_sigma": 9690.1689453125, "eoc/sigma_max": 29.80388641357422, "eoc/sigma_min": 1.704549789428711, "eoc/sigma_mean": 10.586617946624756, "eoc/time_s": 0.6374096870422363} {"step": 40010, "timestamp": 1778237788.2760916, "train/loss": 2.165894031524658, "train/z_loss": 0.0014147271285764873, "train/perplexity": 8.72239652948528, "train/grad_norm": 0.2275390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1141344.6779849941, "perf/iters_per_sec": 0.5442355527806254, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.8374396800994872, "data/tokens_consumed": 83909148672, "data/tokens_consumed_B": 83.909148672, "train/loss_slope": -1.5614539669184524e-05} {"step": 40020, "timestamp": 1778237798.6301675, "train/loss": 2.2048962116241455, "train/z_loss": 0.0014049645513296127, "train/perplexity": 9.069310230028398, "train/grad_norm": 0.12451171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026947.2884078138, "perf/iters_per_sec": 0.9665237848319119, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346356868743896, "data/tokens_consumed": 83930120192, "data/tokens_consumed_B": 83.930120192, "train/loss_slope": -1.006237481257741e-05} {"step": 40030, "timestamp": 1778237809.4270134, "train/loss": 2.1886349439620973, "train/z_loss": 0.0014207176165655256, "train/perplexity": 8.923024369139629, "train/grad_norm": 0.2392578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1943529.7928649127, "perf/iters_per_sec": 0.9267472233128131, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0790428876876832, "data/tokens_consumed": 83951091712, "data/tokens_consumed_B": 83.951091712, "train/loss_slope": -7.513237156883737e-06} {"step": 40040, "timestamp": 1778237819.7786236, "train/loss": 2.199212574958801, "train/z_loss": 0.0014158713864162564, "train/perplexity": 9.01790977499981, "train/grad_norm": 0.294921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027022.1180451822, "perf/iters_per_sec": 0.9665594663835441, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345974922180177, "data/tokens_consumed": 83972063232, "data/tokens_consumed_B": 83.972063232, "train/loss_slope": -6.828617341447616e-06} {"step": 40050, "timestamp": 1778237830.1187317, "grad/layer_0/attn": 0.0028031491674482822, "grad/layer_0/mlp": 0.0027841106057167053, "grad/layer_0/attn_mlp_ratio": 1.0068382559976816, "grad/layer_4/attn": 0.0020883295219391584, "grad/layer_4/mlp": 0.0024228468537330627, "grad/layer_4/attn_mlp_ratio": 0.8619320831311742, "grad/layer_8/attn": 0.007727340329438448, "grad/layer_8/mlp": 0.0036156910937279463, "grad/layer_8/attn_mlp_ratio": 2.137168224665558, "grad/layer_12/attn": 0.003652299055829644, "grad/layer_12/mlp": 0.006261005066335201, "grad/layer_12/attn_mlp_ratio": 0.583340687126037, "grad/layer_16/attn": 0.0035046818666160107, "grad/layer_16/mlp": 0.004524144809693098, "grad/layer_16/attn_mlp_ratio": 0.7746617176446181, "grad/layer_20/attn": 0.006796180736273527, "grad/layer_20/mlp": 0.0060663907788693905, "grad/layer_20/attn_mlp_ratio": 1.1203005002440842, "grad/layer_24/attn": 0.008423911407589912, "grad/layer_24/mlp": 0.009296508505940437, "grad/layer_24/attn_mlp_ratio": 0.9061371063763726, "grad/layer_27/attn": 0.006935732904821634, "grad/layer_27/mlp": 0.007333795074373484, "grad/layer_27/attn_mlp_ratio": 0.9457221997496195} {"step": 40050, "timestamp": 1778237830.7386086, "eos/sharpness": 41.44678115844726, "eos/L0_probe": 1.9981415271759033, "eos/L_plus": 2.2387964725494385, "eos/L_minus": 2.171954393386841, "eos/grad_norm": 0.12409310042858124, "eos/embed_grad_frac": 0.1406569629907608, "eos/time_s": 0.617030143737793} {"step": 40050, "timestamp": 1778237830.7582674, "train/loss": 2.1585331916809083, "train/z_loss": 0.0014193727634847163, "train/perplexity": 8.65842808514272, "train/grad_norm": 0.1240234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1910770.2375892538, "perf/iters_per_sec": 0.9111262500711698, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0975427389144898, "data/tokens_consumed": 83993034752, "data/tokens_consumed_B": 83.993034752, "train/loss_slope": -5.9630689269030835e-06} {"step": 40050, "timestamp": 1778237832.121816, "geo/rankme_last": 439.9496765136719, "geo/layer_0/stable_rank_q_proj": 19.262451171875, "geo/layer_0/stable_rank_k_proj": 16.392213821411133, "geo/layer_0/stable_rank_o_proj": 49.298126220703125, "geo/layer_0/stable_rank_gate_proj": 137.38009643554688, "geo/layer_0/stable_rank_down_proj": 53.1645622253418, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05447348207235336, "geo/layer_0/attn_entropy_mean": 6.19911003112793, "geo/layer_0/attn_entropy_std": 0.3735961318016052, "geo/layer_7/stable_rank_q_proj": 42.851985931396484, "geo/layer_7/stable_rank_k_proj": 42.31922912597656, "geo/layer_7/stable_rank_o_proj": 98.18883514404297, "geo/layer_7/stable_rank_gate_proj": 90.04643249511719, "geo/layer_7/stable_rank_down_proj": 146.0477294921875, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5130506753921509, "geo/layer_7/attn_entropy_mean": 4.659140586853027, "geo/layer_7/attn_entropy_std": 0.827561616897583, "geo/layer_14/stable_rank_q_proj": 54.02618408203125, "geo/layer_14/stable_rank_k_proj": 36.9224739074707, "geo/layer_14/stable_rank_o_proj": 48.20475769042969, "geo/layer_14/stable_rank_gate_proj": 76.99061584472656, "geo/layer_14/stable_rank_down_proj": 133.6742401123047, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3899495601654053, "geo/layer_14/attn_entropy_mean": 5.447531700134277, "geo/layer_14/attn_entropy_std": 0.37019938230514526, "geo/layer_21/stable_rank_q_proj": 43.342105865478516, "geo/layer_21/stable_rank_k_proj": 30.795608520507812, "geo/layer_21/stable_rank_o_proj": 75.8683853149414, "geo/layer_21/stable_rank_gate_proj": 72.76927947998047, "geo/layer_21/stable_rank_down_proj": 55.32929611206055, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14579416811466217, "geo/layer_21/attn_entropy_mean": 5.713983058929443, "geo/layer_21/attn_entropy_std": 0.2845354676246643, "geo/layer_27/stable_rank_q_proj": 42.56122589111328, "geo/layer_27/stable_rank_k_proj": 31.346223831176758, "geo/layer_27/stable_rank_o_proj": 115.90966033935547, "geo/layer_27/stable_rank_gate_proj": 84.84050750732422, "geo/layer_27/stable_rank_down_proj": 131.54660034179688, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08567047864198685, "geo/layer_27/attn_entropy_mean": 4.283010482788086, "geo/layer_27/attn_entropy_std": 0.6403182744979858, "attnres/final_alpha/block_0": 0.2403118908405304, "attnres/block_norm/0": 1.7204350233078003, "attnres/final_alpha/block_1": 0.005250241607427597, "attnres/block_norm/1": 40067.57421875, "attnres/final_alpha/block_2": 0.011330206878483295, "attnres/block_norm/2": 26372.16015625, "attnres/final_alpha/block_3": 0.013320261612534523, "attnres/block_norm/3": 46867.75, "attnres/final_alpha/block_4": 0.01592952013015747, "attnres/block_norm/4": 12797.390625, "attnres/final_alpha/block_5": 0.5979946851730347, "attnres/block_norm/5": 6021.08447265625, "attnres/final_alpha/block_6": 0.11586323380470276, "attnres/block_norm/6": 30852.57421875, "geo/tier1_time_s": 1.3592522144317627, "geo/step": 40050.0, "geo/rankme_slope": 0.00017938173316201482} {"step": 40060, "timestamp": 1778237842.485073, "train/loss": 2.1826064825057983, "train/z_loss": 0.0014080171938985586, "train/perplexity": 8.869394077151517, "train/grad_norm": 0.09423828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1788942.7468734204, "perf/iters_per_sec": 0.8530343756072142, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1722856998443603, "data/tokens_consumed": 84014006272, "data/tokens_consumed_B": 84.014006272, "train/loss_slope": -4.722208194177592e-06} {"step": 40070, "timestamp": 1778237852.842373, "train/loss": 2.2136348128318786, "train/z_loss": 0.0014210956986062228, "train/perplexity": 9.148910606788085, "train/grad_norm": 0.2138671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026352.4912556305, "perf/iters_per_sec": 0.9662401634481576, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349393844604493, "data/tokens_consumed": 84034977792, "data/tokens_consumed_B": 84.034977792, "train/loss_slope": -3.444470945793801e-06} {"step": 40080, "timestamp": 1778237863.1996782, "train/loss": 2.18325457572937, "train/z_loss": 0.0014037301065400244, "train/perplexity": 8.875144134435425, "train/grad_norm": 0.1083984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026020.3170209173, "perf/iters_per_sec": 0.9660817704300486, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351090669631957, "data/tokens_consumed": 84055949312, "data/tokens_consumed_B": 84.055949312, "train/loss_slope": -4.731654198077468e-06} {"step": 40090, "timestamp": 1778237873.561718, "train/loss": 2.2110766410827636, "train/z_loss": 0.0014159628772176802, "train/perplexity": 9.12553603297322, "train/grad_norm": 0.09130859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025584.1333242143, "perf/iters_per_sec": 0.9658737818356582, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0353319644927979, "data/tokens_consumed": 84076920832, "data/tokens_consumed_B": 84.076920832, "train/loss_slope": -4.007556293234937e-07} {"step": 40100, "timestamp": 1778237883.9047182, "grad/layer_0/attn": 0.002911238931119442, "grad/layer_0/mlp": 0.0028325512539595366, "grad/layer_0/attn_mlp_ratio": 1.0277797530660513, "grad/layer_4/attn": 0.0021684153471142054, "grad/layer_4/mlp": 0.002462375909090042, "grad/layer_4/attn_mlp_ratio": 0.8806191008641007, "grad/layer_8/attn": 0.004814447835087776, "grad/layer_8/mlp": 0.0036220396868884563, "grad/layer_8/attn_mlp_ratio": 1.3292089867471246, "grad/layer_12/attn": 0.004170936066657305, "grad/layer_12/mlp": 0.006785931531339884, "grad/layer_12/attn_mlp_ratio": 0.6146445754617413, "grad/layer_16/attn": 0.006635956931859255, "grad/layer_16/mlp": 0.004426296334713697, "grad/layer_16/attn_mlp_ratio": 1.4992120454961086, "grad/layer_20/attn": 0.005476548336446285, "grad/layer_20/mlp": 0.0061815292574465275, "grad/layer_20/attn_mlp_ratio": 0.8859536240572905, "grad/layer_24/attn": 0.005015157163143158, "grad/layer_24/mlp": 0.008697791956365108, "grad/layer_24/attn_mlp_ratio": 0.5766011800055658, "grad/layer_27/attn": 0.009921601973474026, "grad/layer_27/mlp": 0.007349752355366945, "grad/layer_27/attn_mlp_ratio": 1.3499232843181082} {"step": 40100, "timestamp": 1778237883.9206562, "train/loss": 2.176890420913696, "train/z_loss": 0.0014037118875421585, "train/perplexity": 8.818840695090332, "train/grad_norm": 0.1337890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025802.4582995647, "perf/iters_per_sec": 0.965977887296469, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352203845977783, "data/tokens_consumed": 84097892352, "data/tokens_consumed_B": 84.097892352, "train/loss_slope": 1.7280781694692918e-06} {"step": 40110, "timestamp": 1778237894.277083, "train/loss": 2.2072835206985473, "train/z_loss": 0.001403197564650327, "train/perplexity": 9.090987341326102, "train/grad_norm": 0.26953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026281.118516542, "perf/iters_per_sec": 0.9662061302740774, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349758386611938, "data/tokens_consumed": 84118863872, "data/tokens_consumed_B": 84.118863872, "train/loss_slope": 7.995296912332034e-06} {"step": 40120, "timestamp": 1778237904.6358745, "train/loss": 2.1699121952056886, "train/z_loss": 0.0014070893521420658, "train/perplexity": 8.75751505517292, "train/grad_norm": 0.1259765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025731.3109108866, "perf/iters_per_sec": 0.9659439615778382, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352567434310913, "data/tokens_consumed": 84139835392, "data/tokens_consumed_B": 84.139835392, "train/loss_slope": 7.4330191908388855e-06} {"step": 40125, "timestamp": 1778237910.4203763, "eos/sharpness": 15.311980247497555, "eos/L0_probe": 2.001697540283203, "eos/L_plus": 2.0711233615875244, "eos/L_minus": 2.0853915214538574, "eos/grad_norm": 0.10312116891145706, "eos/embed_grad_frac": 0.23064576089382172, "eos/time_s": 0.616196870803833} {"step": 40125, "timestamp": 1778237911.8037975, "geo/rankme_last": 440.10601806640625, "geo/layer_0/stable_rank_q_proj": 19.246829986572266, "geo/layer_0/stable_rank_k_proj": 16.348108291625977, "geo/layer_0/stable_rank_o_proj": 49.256134033203125, "geo/layer_0/stable_rank_gate_proj": 137.47335815429688, "geo/layer_0/stable_rank_down_proj": 53.18861389160156, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0593816377222538, "geo/layer_0/attn_entropy_mean": 6.191515922546387, "geo/layer_0/attn_entropy_std": 0.3778432309627533, "geo/layer_7/stable_rank_q_proj": 42.896114349365234, "geo/layer_7/stable_rank_k_proj": 42.31946563720703, "geo/layer_7/stable_rank_o_proj": 98.29962921142578, "geo/layer_7/stable_rank_gate_proj": 89.80465698242188, "geo/layer_7/stable_rank_down_proj": 145.90223693847656, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5033413767814636, "geo/layer_7/attn_entropy_mean": 4.662144184112549, "geo/layer_7/attn_entropy_std": 0.8026915192604065, "geo/layer_14/stable_rank_q_proj": 54.008182525634766, "geo/layer_14/stable_rank_k_proj": 37.02959060668945, "geo/layer_14/stable_rank_o_proj": 48.15263748168945, "geo/layer_14/stable_rank_gate_proj": 76.92396545410156, "geo/layer_14/stable_rank_down_proj": 133.7465362548828, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3949754536151886, "geo/layer_14/attn_entropy_mean": 5.528464317321777, "geo/layer_14/attn_entropy_std": 0.36709147691726685, "geo/layer_21/stable_rank_q_proj": 43.315372467041016, "geo/layer_21/stable_rank_k_proj": 30.839963912963867, "geo/layer_21/stable_rank_o_proj": 75.97199249267578, "geo/layer_21/stable_rank_gate_proj": 72.70333862304688, "geo/layer_21/stable_rank_down_proj": 55.318519592285156, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.148146390914917, "geo/layer_21/attn_entropy_mean": 5.70768928527832, "geo/layer_21/attn_entropy_std": 0.30169275403022766, "geo/layer_27/stable_rank_q_proj": 42.581851959228516, "geo/layer_27/stable_rank_k_proj": 31.299114227294922, "geo/layer_27/stable_rank_o_proj": 115.90673065185547, "geo/layer_27/stable_rank_gate_proj": 84.86524200439453, "geo/layer_27/stable_rank_down_proj": 131.2539825439453, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08661046624183655, "geo/layer_27/attn_entropy_mean": 4.28604793548584, "geo/layer_27/attn_entropy_std": 0.648055374622345, "attnres/final_alpha/block_0": 0.23968036472797394, "attnres/block_norm/0": 1.7205467224121094, "attnres/final_alpha/block_1": 0.005160936154425144, "attnres/block_norm/1": 40442.625, "attnres/final_alpha/block_2": 0.01132307481020689, "attnres/block_norm/2": 26349.1328125, "attnres/final_alpha/block_3": 0.013260377570986748, "attnres/block_norm/3": 46992.1796875, "attnres/final_alpha/block_4": 0.01622677408158779, "attnres/block_norm/4": 12819.1318359375, "attnres/final_alpha/block_5": 0.597101628780365, "attnres/block_norm/5": 5944.54150390625, "attnres/final_alpha/block_6": 0.1172468513250351, "attnres/block_norm/6": 30801.015625, "geo/tier1_time_s": 1.3633437156677246, "geo/step": 40125.0, "geo/rankme_slope": 0.00017530557144732894} {"step": 40130, "timestamp": 1778237916.987365, "train/loss": 2.1817973852157593, "train/z_loss": 0.001410446537192911, "train/perplexity": 8.86222077677968, "train/grad_norm": 0.3046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1698788.288181437, "perf/iters_per_sec": 0.8100453797251878, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2344987392425537, "data/tokens_consumed": 84160806912, "data/tokens_consumed_B": 84.160806912, "train/loss_slope": 7.031055438135788e-06} {"step": 40140, "timestamp": 1778237927.344018, "train/loss": 2.210286808013916, "train/z_loss": 0.0014009912963956594, "train/perplexity": 9.118331228514377, "train/grad_norm": 0.1044921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026055.6435712893, "perf/iters_per_sec": 0.9660986154419371, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035091018676758, "data/tokens_consumed": 84181778432, "data/tokens_consumed_B": 84.181778432, "train/loss_slope": 9.662614232099606e-06} {"step": 40150, "timestamp": 1778237937.6982672, "grad/layer_0/attn": 0.002893699100241065, "grad/layer_0/mlp": 0.0029300337191671133, "grad/layer_0/attn_mlp_ratio": 0.9875992151734355, "grad/layer_4/attn": 0.00309342541731894, "grad/layer_4/mlp": 0.0024730716831982136, "grad/layer_4/attn_mlp_ratio": 1.250843359394314, "grad/layer_8/attn": 0.0038276209961622953, "grad/layer_8/mlp": 0.0035464197862893343, "grad/layer_8/attn_mlp_ratio": 1.0792915443994946, "grad/layer_12/attn": 0.004121837206184864, "grad/layer_12/mlp": 0.006243631709367037, "grad/layer_12/attn_mlp_ratio": 0.6601666036746529, "grad/layer_16/attn": 0.00486740143969655, "grad/layer_16/mlp": 0.004398979712277651, "grad/layer_16/attn_mlp_ratio": 1.1064841502821916, "grad/layer_20/attn": 0.00342661514878273, "grad/layer_20/mlp": 0.006356116384267807, "grad/layer_20/attn_mlp_ratio": 0.539105152849863, "grad/layer_24/attn": 0.01245774608105421, "grad/layer_24/mlp": 0.01039170939475298, "grad/layer_24/attn_mlp_ratio": 1.1988158529012405, "grad/layer_27/attn": 0.01109460648149252, "grad/layer_27/mlp": 0.008942397311329842, "grad/layer_27/attn_mlp_ratio": 1.2406747286176156} {"step": 40150, "timestamp": 1778237937.714022, "train/loss": 2.178683042526245, "train/z_loss": 0.0014213484711945057, "train/perplexity": 8.834663717626801, "train/grad_norm": 0.173828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023627.5160020979, "perf/iters_per_sec": 0.9649407939920892, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0363330125808716, "data/tokens_consumed": 84202749952, "data/tokens_consumed_B": 84.202749952, "train/loss_slope": 8.314263316342443e-06} {"step": 40160, "timestamp": 1778237948.0686374, "train/loss": 2.182871699333191, "train/z_loss": 0.0014106166898272931, "train/perplexity": 8.87174670167357, "train/grad_norm": 0.1357421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026324.8097991995, "perf/iters_per_sec": 0.966226963901138, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03495352268219, "data/tokens_consumed": 84223721472, "data/tokens_consumed_B": 84.223721472, "train/loss_slope": 1.0208911771284976e-05} {"step": 40170, "timestamp": 1778237958.4207911, "train/loss": 2.1536914587020872, "train/z_loss": 0.0014108598814345896, "train/perplexity": 8.616607611818157, "train/grad_norm": 0.1728515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026872.1840653308, "perf/iters_per_sec": 0.9664879722906736, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346740245819093, "data/tokens_consumed": 84244692992, "data/tokens_consumed_B": 84.244692992, "train/loss_slope": 1.011753822639112e-05} {"step": 40180, "timestamp": 1778237968.7878544, "train/loss": 2.1483909606933596, "train/z_loss": 0.0014239528100006282, "train/perplexity": 8.571056129748147, "train/grad_norm": 0.0986328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024114.6507090677, "perf/iters_per_sec": 0.9651730779214228, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0360836029052733, "data/tokens_consumed": 84265664512, "data/tokens_consumed_B": 84.265664512, "train/loss_slope": 7.078054864736335e-06} {"step": 40190, "timestamp": 1778237979.149312, "train/loss": 2.1757062435150147, "train/z_loss": 0.0014182346407324077, "train/perplexity": 8.808403804041498, "train/grad_norm": 0.1640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025152.4735833902, "perf/iters_per_sec": 0.965667950431533, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0355526447296142, "data/tokens_consumed": 84286636032, "data/tokens_consumed_B": 84.286636032, "train/loss_slope": 8.93817631789881e-06} {"step": 40200, "timestamp": 1778237989.4873152, "grad/layer_0/attn": 0.0028506803791970015, "grad/layer_0/mlp": 0.0029752382542937994, "grad/layer_0/attn_mlp_ratio": 0.9581351272522288, "grad/layer_4/attn": 0.002569198375567794, "grad/layer_4/mlp": 0.002413755049929023, "grad/layer_4/attn_mlp_ratio": 1.0643989203475515, "grad/layer_8/attn": 0.003574020927771926, "grad/layer_8/mlp": 0.0035928792785853148, "grad/layer_8/attn_mlp_ratio": 0.994751159494585, "grad/layer_12/attn": 0.0035921530798077583, "grad/layer_12/mlp": 0.0064509473741054535, "grad/layer_12/attn_mlp_ratio": 0.5568411608103938, "grad/layer_16/attn": 0.0036123311147093773, "grad/layer_16/mlp": 0.004469932988286018, "grad/layer_16/attn_mlp_ratio": 0.8081398632511778, "grad/layer_20/attn": 0.0031150691211223602, "grad/layer_20/mlp": 0.0057192412205040455, "grad/layer_20/attn_mlp_ratio": 0.5446647459960344, "grad/layer_24/attn": 0.0075395312160253525, "grad/layer_24/mlp": 0.01079419907182455, "grad/layer_24/attn_mlp_ratio": 0.6984799053648498, "grad/layer_27/attn": 0.010903701186180115, "grad/layer_27/mlp": 0.008089221082627773, "grad/layer_27/attn_mlp_ratio": 1.347929663438633} {"step": 40200, "timestamp": 1778237990.0948708, "eos/sharpness": 18.495273590087887, "eos/L0_probe": 1.9953669309616089, "eos/L_plus": 2.074404001235962, "eos/L_minus": 2.1012825965881348, "eos/grad_norm": 0.1068325936794281, "eos/embed_grad_frac": 0.2258390635251999, "eos/time_s": 0.6048426628112793} {"step": 40200, "timestamp": 1778237990.1144602, "train/loss": 2.1705039381980895, "train/z_loss": 0.0014107493683695794, "train/perplexity": 8.762698786904867, "train/grad_norm": 0.10693359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1913408.8587169808, "perf/iters_per_sec": 0.9123844426712898, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0960292100906373, "data/tokens_consumed": 84307607552, "data/tokens_consumed_B": 84.307607552, "train/loss_slope": 1.0799328493277946e-05} {"step": 40200, "timestamp": 1778237991.4763727, "geo/rankme_last": 439.91424560546875, "geo/layer_0/stable_rank_q_proj": 19.21725082397461, "geo/layer_0/stable_rank_k_proj": 16.344194412231445, "geo/layer_0/stable_rank_o_proj": 49.26658248901367, "geo/layer_0/stable_rank_gate_proj": 137.4916229248047, "geo/layer_0/stable_rank_down_proj": 53.21405792236328, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.061918970197439194, "geo/layer_0/attn_entropy_mean": 6.192321300506592, "geo/layer_0/attn_entropy_std": 0.375014990568161, "geo/layer_7/stable_rank_q_proj": 42.776145935058594, "geo/layer_7/stable_rank_k_proj": 42.27378845214844, "geo/layer_7/stable_rank_o_proj": 98.3909683227539, "geo/layer_7/stable_rank_gate_proj": 89.8807601928711, "geo/layer_7/stable_rank_down_proj": 146.1550750732422, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5108600854873657, "geo/layer_7/attn_entropy_mean": 4.635007858276367, "geo/layer_7/attn_entropy_std": 0.8316243290901184, "geo/layer_14/stable_rank_q_proj": 53.955810546875, "geo/layer_14/stable_rank_k_proj": 37.0904541015625, "geo/layer_14/stable_rank_o_proj": 48.11079788208008, "geo/layer_14/stable_rank_gate_proj": 76.88539123535156, "geo/layer_14/stable_rank_down_proj": 133.25437927246094, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3895297646522522, "geo/layer_14/attn_entropy_mean": 5.499875068664551, "geo/layer_14/attn_entropy_std": 0.3731597661972046, "geo/layer_21/stable_rank_q_proj": 43.38332748413086, "geo/layer_21/stable_rank_k_proj": 30.911914825439453, "geo/layer_21/stable_rank_o_proj": 75.91260528564453, "geo/layer_21/stable_rank_gate_proj": 72.79087829589844, "geo/layer_21/stable_rank_down_proj": 55.27324295043945, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.149359330534935, "geo/layer_21/attn_entropy_mean": 5.721041202545166, "geo/layer_21/attn_entropy_std": 0.3014206886291504, "geo/layer_27/stable_rank_q_proj": 42.50815963745117, "geo/layer_27/stable_rank_k_proj": 31.363113403320312, "geo/layer_27/stable_rank_o_proj": 116.12305450439453, "geo/layer_27/stable_rank_gate_proj": 84.8568115234375, "geo/layer_27/stable_rank_down_proj": 131.3013153076172, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08775558322668076, "geo/layer_27/attn_entropy_mean": 4.262252330780029, "geo/layer_27/attn_entropy_std": 0.663264274597168, "attnres/final_alpha/block_0": 0.24184295535087585, "attnres/block_norm/0": 1.7207375764846802, "attnres/final_alpha/block_1": 0.005299737676978111, "attnres/block_norm/1": 40229.85546875, "attnres/final_alpha/block_2": 0.011360477656126022, "attnres/block_norm/2": 26341.189453125, "attnres/final_alpha/block_3": 0.013283111155033112, "attnres/block_norm/3": 47012.5625, "attnres/final_alpha/block_4": 0.016193751245737076, "attnres/block_norm/4": 12873.96875, "attnres/final_alpha/block_5": 0.5933152437210083, "attnres/block_norm/5": 6015.859375, "attnres/final_alpha/block_6": 0.11870475858449936, "attnres/block_norm/6": 30979.29296875, "geo/tier1_time_s": 1.3576853275299072, "geo/step": 40200.0, "geo/rankme_slope": 0.00020197266406562624} {"step": 40210, "timestamp": 1778238001.8313735, "train/loss": 2.1542891025543214, "train/z_loss": 0.0014074873179197311, "train/perplexity": 8.621758813523165, "train/grad_norm": 0.1201171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1790441.0313222092, "perf/iters_per_sec": 0.8537488133059545, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.171304702758789, "data/tokens_consumed": 84328579072, "data/tokens_consumed_B": 84.328579072, "train/loss_slope": 7.764459729301947e-06} {"step": 40220, "timestamp": 1778238012.1915088, "train/loss": 2.1564805030822756, "train/z_loss": 0.001405868772417307, "train/perplexity": 8.640673257330784, "train/grad_norm": 0.251953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025191.4534435372, "perf/iters_per_sec": 0.9656865374772726, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0355327129364014, "data/tokens_consumed": 84349550592, "data/tokens_consumed_B": 84.349550592, "train/loss_slope": 8.067067085069265e-06} {"step": 40230, "timestamp": 1778238022.5476582, "train/loss": 2.1865286588668824, "train/z_loss": 0.0014136725338175892, "train/perplexity": 8.904249715234561, "train/grad_norm": 0.291015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026099.4651388926, "perf/iters_per_sec": 0.9661195111937011, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350686311721802, "data/tokens_consumed": 84370522112, "data/tokens_consumed_B": 84.370522112, "train/loss_slope": 6.076204136546037e-06} {"step": 40240, "timestamp": 1778238032.9185076, "train/loss": 2.178312635421753, "train/z_loss": 0.0014145264867693186, "train/perplexity": 8.831391901409388, "train/grad_norm": 0.173828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023178.122114943, "perf/iters_per_sec": 0.9647265062880245, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0365632057189942, "data/tokens_consumed": 84391493632, "data/tokens_consumed_B": 84.391493632, "train/loss_slope": 6.040969203979907e-06} {"step": 40250, "timestamp": 1778238043.2966871, "grad/layer_0/attn": 0.0028035182040184736, "grad/layer_0/mlp": 0.002850467339158058, "grad/layer_0/attn_mlp_ratio": 0.9835292855850155, "grad/layer_4/attn": 0.0019781526643782854, "grad/layer_4/mlp": 0.0026328307576477528, "grad/layer_4/attn_mlp_ratio": 0.7513405802853679, "grad/layer_8/attn": 0.0033001352567225695, "grad/layer_8/mlp": 0.003600452793762088, "grad/layer_8/attn_mlp_ratio": 0.9165889275874629, "grad/layer_12/attn": 0.004883603658527136, "grad/layer_12/mlp": 0.006956190336495638, "grad/layer_12/attn_mlp_ratio": 0.7020514609412244, "grad/layer_16/attn": 0.0046389042399823666, "grad/layer_16/mlp": 0.004909536335617304, "grad/layer_16/attn_mlp_ratio": 0.9448762221883972, "grad/layer_20/attn": 0.004399250261485577, "grad/layer_20/mlp": 0.0062305317260324955, "grad/layer_20/attn_mlp_ratio": 0.7060794141367784, "grad/layer_24/attn": 0.009116964414715767, "grad/layer_24/mlp": 0.00978669710457325, "grad/layer_24/attn_mlp_ratio": 0.9315670265608584, "grad/layer_27/attn": 0.012120927684009075, "grad/layer_27/mlp": 0.007392249070107937, "grad/layer_27/attn_mlp_ratio": 1.63968061750712} {"step": 40250, "timestamp": 1778238043.313285, "train/loss": 2.1292463302612306, "train/z_loss": 0.0014161299914121628, "train/perplexity": 8.408527170266511, "train/grad_norm": 0.1259765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2018828.572697326, "perf/iters_per_sec": 0.9626524795042639, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0387964725494385, "data/tokens_consumed": 84412465152, "data/tokens_consumed_B": 84.412465152, "train/loss_slope": 4.955443232902892e-06} {"step": 40260, "timestamp": 1778238053.710693, "train/loss": 2.139013886451721, "train/z_loss": 0.0014311163453385235, "train/perplexity": 8.49106034944076, "train/grad_norm": 0.1328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2018078.4563542495, "perf/iters_per_sec": 0.9622947961589096, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0391825914382935, "data/tokens_consumed": 84433436672, "data/tokens_consumed_B": 84.433436672, "train/loss_slope": 9.051631815331454e-08} {"step": 40270, "timestamp": 1778238064.0860043, "train/loss": 2.13291517496109, "train/z_loss": 0.0014101981068961322, "train/perplexity": 8.439433411040019, "train/grad_norm": 0.2041015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022354.2787889813, "perf/iters_per_sec": 0.9643336671776682, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0369854688644409, "data/tokens_consumed": 84454408192, "data/tokens_consumed_B": 84.454408192, "train/loss_slope": -2.5102044668348696e-06} {"step": 40275, "timestamp": 1778238069.872895, "eos/sharpness": 72.89063930511473, "eos/L0_probe": 1.9996678829193115, "eos/L_plus": 2.4387779235839844, "eos/L_minus": 2.289464235305786, "eos/grad_norm": 0.19334113597869873, "eos/embed_grad_frac": 0.07230951637029648, "eos/time_s": 0.6076769828796387} {"step": 40275, "timestamp": 1778238071.2531846, "geo/rankme_last": 438.99664306640625, "geo/layer_0/stable_rank_q_proj": 19.202638626098633, "geo/layer_0/stable_rank_k_proj": 16.33180046081543, "geo/layer_0/stable_rank_o_proj": 49.21816635131836, "geo/layer_0/stable_rank_gate_proj": 137.7368621826172, "geo/layer_0/stable_rank_down_proj": 53.11806869506836, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05632958188652992, "geo/layer_0/attn_entropy_mean": 6.191545486450195, "geo/layer_0/attn_entropy_std": 0.37754741311073303, "geo/layer_7/stable_rank_q_proj": 42.84500503540039, "geo/layer_7/stable_rank_k_proj": 42.29267501831055, "geo/layer_7/stable_rank_o_proj": 98.348388671875, "geo/layer_7/stable_rank_gate_proj": 89.96905517578125, "geo/layer_7/stable_rank_down_proj": 146.25563049316406, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5061020851135254, "geo/layer_7/attn_entropy_mean": 4.654356956481934, "geo/layer_7/attn_entropy_std": 0.8317530751228333, "geo/layer_14/stable_rank_q_proj": 53.98335647583008, "geo/layer_14/stable_rank_k_proj": 37.019710540771484, "geo/layer_14/stable_rank_o_proj": 48.05333709716797, "geo/layer_14/stable_rank_gate_proj": 77.01834869384766, "geo/layer_14/stable_rank_down_proj": 133.2300262451172, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39352476596832275, "geo/layer_14/attn_entropy_mean": 5.517995834350586, "geo/layer_14/attn_entropy_std": 0.37340304255485535, "geo/layer_21/stable_rank_q_proj": 43.40761947631836, "geo/layer_21/stable_rank_k_proj": 30.804826736450195, "geo/layer_21/stable_rank_o_proj": 75.86487579345703, "geo/layer_21/stable_rank_gate_proj": 72.76419067382812, "geo/layer_21/stable_rank_down_proj": 55.311256408691406, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14326432347297668, "geo/layer_21/attn_entropy_mean": 5.7438836097717285, "geo/layer_21/attn_entropy_std": 0.27481603622436523, "geo/layer_27/stable_rank_q_proj": 42.543453216552734, "geo/layer_27/stable_rank_k_proj": 31.40249252319336, "geo/layer_27/stable_rank_o_proj": 116.1100082397461, "geo/layer_27/stable_rank_gate_proj": 84.85159301757812, "geo/layer_27/stable_rank_down_proj": 131.26194763183594, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08535180985927582, "geo/layer_27/attn_entropy_mean": 4.2979536056518555, "geo/layer_27/attn_entropy_std": 0.6648500561714172, "attnres/final_alpha/block_0": 0.24022716283798218, "attnres/block_norm/0": 1.7207058668136597, "attnres/final_alpha/block_1": 0.005202808883041143, "attnres/block_norm/1": 40240.6796875, "attnres/final_alpha/block_2": 0.011204063892364502, "attnres/block_norm/2": 26416.189453125, "attnres/final_alpha/block_3": 0.013363787904381752, "attnres/block_norm/3": 47220.37890625, "attnres/final_alpha/block_4": 0.016127515584230423, "attnres/block_norm/4": 12854.216796875, "attnres/final_alpha/block_5": 0.5976760387420654, "attnres/block_norm/5": 5960.6416015625, "attnres/final_alpha/block_6": 0.11619866639375687, "attnres/block_norm/6": 31158.69921875, "geo/tier1_time_s": 1.3588917255401611, "geo/step": 40275.0, "geo/rankme_slope": 0.000146685490602491} {"step": 40280, "timestamp": 1778238076.4438646, "train/loss": 2.172469162940979, "train/z_loss": 0.0014050122932530939, "train/perplexity": 8.779936391711324, "train/grad_norm": 0.25390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1697730.3754756602, "perf/iters_per_sec": 0.8095409276369382, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.235267996788025, "data/tokens_consumed": 84475379712, "data/tokens_consumed_B": 84.475379712, "train/loss_slope": -4.5847664357232594e-06} {"step": 40290, "timestamp": 1778238086.8386247, "train/loss": 2.18066782951355, "train/z_loss": 0.0014164355467073618, "train/perplexity": 8.852216056275326, "train/grad_norm": 0.1513671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020108.739357767, "perf/iters_per_sec": 0.9632629105366549, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.038138175010681, "data/tokens_consumed": 84496351232, "data/tokens_consumed_B": 84.496351232, "train/loss_slope": -4.53669690098657e-06} {"step": 40300, "timestamp": 1778238097.2080526, "grad/layer_0/attn": 0.002663157880306244, "grad/layer_0/mlp": 0.0026105856522917747, "grad/layer_0/attn_mlp_ratio": 1.0201380582761235, "grad/layer_4/attn": 0.00200498104095459, "grad/layer_4/mlp": 0.0025024469941854477, "grad/layer_4/attn_mlp_ratio": 0.801208163646399, "grad/layer_8/attn": 0.0032628572080284357, "grad/layer_8/mlp": 0.003479830687865615, "grad/layer_8/attn_mlp_ratio": 0.9376482383586641, "grad/layer_12/attn": 0.004531041719019413, "grad/layer_12/mlp": 0.006751885171979666, "grad/layer_12/attn_mlp_ratio": 0.6710780080673532, "grad/layer_16/attn": 0.0035945475101470947, "grad/layer_16/mlp": 0.004433289635926485, "grad/layer_16/attn_mlp_ratio": 0.810808163747478, "grad/layer_20/attn": 0.005353773012757301, "grad/layer_20/mlp": 0.005466120317578316, "grad/layer_20/attn_mlp_ratio": 0.9794465916887378, "grad/layer_24/attn": 0.0053744628094136715, "grad/layer_24/mlp": 0.007524801418185234, "grad/layer_24/attn_mlp_ratio": 0.7142331656755563, "grad/layer_27/attn": 0.004890098236501217, "grad/layer_27/mlp": 0.006829424295574427, "grad/layer_27/attn_mlp_ratio": 0.716033731872056} {"step": 40300, "timestamp": 1778238097.2249184, "train/loss": 2.1388779163360594, "train/z_loss": 0.0014058656059205532, "train/perplexity": 8.489905897470218, "train/grad_norm": 0.10986328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020272.0586370644, "perf/iters_per_sec": 0.9633407872376749, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0380542516708373, "data/tokens_consumed": 84517322752, "data/tokens_consumed_B": 84.517322752, "train/loss_slope": -5.9238124458845684e-06} {"step": 40310, "timestamp": 1778238107.6003082, "train/loss": 2.123634600639343, "train/z_loss": 0.0014110456104390323, "train/perplexity": 8.361472940560017, "train/grad_norm": 0.138671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022008.5864132529, "perf/iters_per_sec": 0.9641688282076134, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037162756919861, "data/tokens_consumed": 84538294272, "data/tokens_consumed_B": 84.538294272, "train/loss_slope": -7.901028423669783e-06} {"step": 40320, "timestamp": 1778238117.9896035, "train/loss": 2.2099520683288576, "train/z_loss": 0.0014041471295058727, "train/perplexity": 9.115279471991188, "train/grad_norm": 0.138671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019320.260326105, "perf/iters_per_sec": 0.9628869344358945, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0385435342788696, "data/tokens_consumed": 84559265792, "data/tokens_consumed_B": 84.559265792, "train/loss_slope": -7.527701682311852e-06} {"step": 40330, "timestamp": 1778238128.373641, "train/loss": 2.1205398082733153, "train/z_loss": 0.0014017987763509155, "train/perplexity": 8.335635918665568, "train/grad_norm": 0.181640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020617.0646124526, "perf/iters_per_sec": 0.9635052989065421, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0378770112991333, "data/tokens_consumed": 84580237312, "data/tokens_consumed_B": 84.580237312, "train/loss_slope": -8.679428288001597e-06} {"step": 40340, "timestamp": 1778238138.8784769, "train/loss": 2.195302891731262, "train/z_loss": 0.0013990726904012263, "train/perplexity": 8.982721436855716, "train/grad_norm": 0.1240234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020392.8483737388, "perf/iters_per_sec": 0.9633983842724508, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0379921913146972, "data/tokens_consumed": 84601208832, "data/tokens_consumed_B": 84.601208832, "train/loss_slope": -8.034116206782805e-06} {"step": 40350, "timestamp": 1778238149.2469332, "grad/layer_0/attn": 0.002611882286146283, "grad/layer_0/mlp": 0.002715452341362834, "grad/layer_0/attn_mlp_ratio": 0.9618589691946249, "grad/layer_4/attn": 0.0024101310409605503, "grad/layer_4/mlp": 0.0024100271984934807, "grad/layer_4/attn_mlp_ratio": 1.0000430461792422, "grad/layer_8/attn": 0.00447040144354105, "grad/layer_8/mlp": 0.0035367351956665516, "grad/layer_8/attn_mlp_ratio": 1.2639909605388024, "grad/layer_12/attn": 0.004257356282323599, "grad/layer_12/mlp": 0.006654718425124884, "grad/layer_12/attn_mlp_ratio": 0.639750015909757, "grad/layer_16/attn": 0.004004512447863817, "grad/layer_16/mlp": 0.004623245447874069, "grad/layer_16/attn_mlp_ratio": 0.8661691027216223, "grad/layer_20/attn": 0.004564653150737286, "grad/layer_20/mlp": 0.0062857395969331264, "grad/layer_20/attn_mlp_ratio": 0.7261918836639742, "grad/layer_24/attn": 0.011219991371035576, "grad/layer_24/mlp": 0.009379531256854534, "grad/layer_24/attn_mlp_ratio": 1.1962208925114408, "grad/layer_27/attn": 0.003952051047235727, "grad/layer_27/mlp": 0.00944809801876545, "grad/layer_27/attn_mlp_ratio": 0.4182906440594976} {"step": 40350, "timestamp": 1778238149.8707078, "eos/sharpness": 59.802317619323716, "eos/L0_probe": 2.003112554550171, "eos/L_plus": 2.256767988204956, "eos/L_minus": 2.347480297088623, "eos/grad_norm": 0.1461205929517746, "eos/embed_grad_frac": 0.1310022920370102, "eos/time_s": 0.6207525730133057} {"step": 40350, "timestamp": 1778238149.892051, "train/loss": 2.2282093286514284, "train/z_loss": 0.001392970432061702, "train/perplexity": 9.283227976882158, "train/grad_norm": 0.1455078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1905065.3806083722, "perf/iters_per_sec": 0.9084059622804509, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1008294105529786, "data/tokens_consumed": 84622180352, "data/tokens_consumed_B": 84.622180352, "train/loss_slope": -5.161010986543217e-06} {"step": 40350, "timestamp": 1778238151.2517548, "geo/rankme_last": 439.0301208496094, "geo/layer_0/stable_rank_q_proj": 19.1881103515625, "geo/layer_0/stable_rank_k_proj": 16.338281631469727, "geo/layer_0/stable_rank_o_proj": 49.22054672241211, "geo/layer_0/stable_rank_gate_proj": 137.61317443847656, "geo/layer_0/stable_rank_down_proj": 53.15098571777344, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06464377045631409, "geo/layer_0/attn_entropy_mean": 6.189968109130859, "geo/layer_0/attn_entropy_std": 0.3775680363178253, "geo/layer_7/stable_rank_q_proj": 42.892337799072266, "geo/layer_7/stable_rank_k_proj": 42.27092742919922, "geo/layer_7/stable_rank_o_proj": 98.27845001220703, "geo/layer_7/stable_rank_gate_proj": 90.03020477294922, "geo/layer_7/stable_rank_down_proj": 146.2552947998047, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5030258893966675, "geo/layer_7/attn_entropy_mean": 4.684493064880371, "geo/layer_7/attn_entropy_std": 0.8166630864143372, "geo/layer_14/stable_rank_q_proj": 54.032958984375, "geo/layer_14/stable_rank_k_proj": 37.03438949584961, "geo/layer_14/stable_rank_o_proj": 48.05849838256836, "geo/layer_14/stable_rank_gate_proj": 76.92910766601562, "geo/layer_14/stable_rank_down_proj": 133.62049865722656, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3748050034046173, "geo/layer_14/attn_entropy_mean": 5.50260066986084, "geo/layer_14/attn_entropy_std": 0.401001900434494, "geo/layer_21/stable_rank_q_proj": 43.4488410949707, "geo/layer_21/stable_rank_k_proj": 30.82957649230957, "geo/layer_21/stable_rank_o_proj": 75.91806030273438, "geo/layer_21/stable_rank_gate_proj": 72.70901489257812, "geo/layer_21/stable_rank_down_proj": 55.212554931640625, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1451990157365799, "geo/layer_21/attn_entropy_mean": 5.713024139404297, "geo/layer_21/attn_entropy_std": 0.2846423089504242, "geo/layer_27/stable_rank_q_proj": 42.55674743652344, "geo/layer_27/stable_rank_k_proj": 31.37178611755371, "geo/layer_27/stable_rank_o_proj": 116.11544799804688, "geo/layer_27/stable_rank_gate_proj": 84.94034576416016, "geo/layer_27/stable_rank_down_proj": 131.5358123779297, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09293150156736374, "geo/layer_27/attn_entropy_mean": 4.268830299377441, "geo/layer_27/attn_entropy_std": 0.6865265965461731, "attnres/final_alpha/block_0": 0.24110597372055054, "attnres/block_norm/0": 1.7211153507232666, "attnres/final_alpha/block_1": 0.005177884362637997, "attnres/block_norm/1": 40271.0859375, "attnres/final_alpha/block_2": 0.011374873109161854, "attnres/block_norm/2": 26507.927734375, "attnres/final_alpha/block_3": 0.013321101665496826, "attnres/block_norm/3": 47224.12109375, "attnres/final_alpha/block_4": 0.016312353312969208, "attnres/block_norm/4": 12870.8115234375, "attnres/final_alpha/block_5": 0.5954982042312622, "attnres/block_norm/5": 6019.9111328125, "attnres/final_alpha/block_6": 0.11720956861972809, "attnres/block_norm/6": 31154.76171875, "geo/tier1_time_s": 1.3555591106414795, "geo/step": 40350.0, "geo/rankme_slope": 0.00013448004201680673} {"step": 40360, "timestamp": 1778238161.6380062, "train/loss": 2.1797979354858397, "train/z_loss": 0.0014065438881516457, "train/perplexity": 8.84451891473008, "train/grad_norm": 0.1376953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1786040.2132991906, "perf/iters_per_sec": 0.8516503397460893, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1741908073425293, "data/tokens_consumed": 84643151872, "data/tokens_consumed_B": 84.643151872, "train/loss_slope": -4.302954165884652e-06} {"step": 40370, "timestamp": 1778238172.0243273, "train/loss": 2.1484595060348513, "train/z_loss": 0.0014193441718816757, "train/perplexity": 8.571643655853364, "train/grad_norm": 0.09423828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020408.5340014668, "perf/iters_per_sec": 0.9634058637626013, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0379841327667236, "data/tokens_consumed": 84664123392, "data/tokens_consumed_B": 84.664123392, "train/loss_slope": -4.442960966323588e-06} {"step": 40380, "timestamp": 1778238182.4229784, "train/loss": 2.1732704877853393, "train/z_loss": 0.0014207719708792866, "train/perplexity": 8.786974792519997, "train/grad_norm": 0.1865234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2017991.785421375, "perf/iters_per_sec": 0.9622534682375788, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0392272233963014, "data/tokens_consumed": 84685094912, "data/tokens_consumed_B": 84.685094912, "train/loss_slope": -3.5024194577202066e-06} {"step": 40390, "timestamp": 1778238192.8045719, "train/loss": 2.1443650007247923, "train/z_loss": 0.0014169376925565302, "train/perplexity": 8.536618769113506, "train/grad_norm": 0.1396484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021372.1355023833, "perf/iters_per_sec": 0.9638653447639386, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0374893188476562, "data/tokens_consumed": 84706066432, "data/tokens_consumed_B": 84.706066432, "train/loss_slope": -4.992774882689901e-06} {"step": 40400, "timestamp": 1778238203.1736035, "grad/layer_0/attn": 0.0028568971902132034, "grad/layer_0/mlp": 0.0028721336275339127, "grad/layer_0/attn_mlp_ratio": 0.9946950459950235, "grad/layer_4/attn": 0.00215551839210093, "grad/layer_4/mlp": 0.0025909002870321274, "grad/layer_4/attn_mlp_ratio": 0.8319572619965035, "grad/layer_8/attn": 0.003910963423550129, "grad/layer_8/mlp": 0.0036590814124792814, "grad/layer_8/attn_mlp_ratio": 1.068837469242433, "grad/layer_12/attn": 0.005400936119258404, "grad/layer_12/mlp": 0.006871535908430815, "grad/layer_12/attn_mlp_ratio": 0.78598673027863, "grad/layer_16/attn": 0.0043299440294504166, "grad/layer_16/mlp": 0.00452288705855608, "grad/layer_16/attn_mlp_ratio": 0.9573407157106127, "grad/layer_20/attn": 0.004058836959302425, "grad/layer_20/mlp": 0.005746304988861084, "grad/layer_20/attn_mlp_ratio": 0.7063385769701424, "grad/layer_24/attn": 0.007208252791315317, "grad/layer_24/mlp": 0.009268726222217083, "grad/layer_24/attn_mlp_ratio": 0.7776961516316624, "grad/layer_27/attn": 0.0040421877056360245, "grad/layer_27/mlp": 0.007790326606482267, "grad/layer_27/attn_mlp_ratio": 0.5188726812025166} {"step": 40400, "timestamp": 1778238203.1903753, "train/loss": 2.126128005981445, "train/z_loss": 0.0014131566626019777, "train/perplexity": 8.382347495406252, "train/grad_norm": 0.11669921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020495.9695960379, "perf/iters_per_sec": 0.9634475563030424, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0379392147064208, "data/tokens_consumed": 84727037952, "data/tokens_consumed_B": 84.727037952, "train/loss_slope": -6.428222158382149e-06} {"step": 40410, "timestamp": 1778238213.576514, "train/loss": 2.1693854331970215, "train/z_loss": 0.00141792333452031, "train/perplexity": 8.75290314374803, "train/grad_norm": 0.091796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020395.400752192, "perf/iters_per_sec": 0.9633996013413391, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0379908800125122, "data/tokens_consumed": 84748009472, "data/tokens_consumed_B": 84.748009472, "train/loss_slope": -6.588266405871764e-06} {"step": 40420, "timestamp": 1778238223.9619083, "train/loss": 2.1707412719726564, "train/z_loss": 0.0014171804301440715, "train/perplexity": 8.764778718092552, "train/grad_norm": 0.126953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020601.7934943833, "perf/iters_per_sec": 0.9634980170699994, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0378848552703857, "data/tokens_consumed": 84768980992, "data/tokens_consumed_B": 84.768980992, "train/loss_slope": -6.771001084731329e-06} {"step": 40425, "timestamp": 1778238229.74097, "eos/sharpness": 39.71252441406249, "eos/L0_probe": 2.0019373893737793, "eos/L_plus": 2.2059593200683594, "eos/L_minus": 2.195040702819824, "eos/grad_norm": 0.14239367842674255, "eos/embed_grad_frac": 0.1274441033601761, "eos/time_s": 0.6025457382202148} {"step": 40425, "timestamp": 1778238231.1180022, "geo/rankme_last": 439.1390075683594, "geo/layer_0/stable_rank_q_proj": 19.20704460144043, "geo/layer_0/stable_rank_k_proj": 16.36802864074707, "geo/layer_0/stable_rank_o_proj": 49.226253509521484, "geo/layer_0/stable_rank_gate_proj": 137.43038940429688, "geo/layer_0/stable_rank_down_proj": 53.063358306884766, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05819041281938553, "geo/layer_0/attn_entropy_mean": 6.190084457397461, "geo/layer_0/attn_entropy_std": 0.3769765794277191, "geo/layer_7/stable_rank_q_proj": 42.94308853149414, "geo/layer_7/stable_rank_k_proj": 42.05599594116211, "geo/layer_7/stable_rank_o_proj": 98.16497802734375, "geo/layer_7/stable_rank_gate_proj": 90.179443359375, "geo/layer_7/stable_rank_down_proj": 146.55340576171875, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.491967111825943, "geo/layer_7/attn_entropy_mean": 4.673093795776367, "geo/layer_7/attn_entropy_std": 0.8111458420753479, "geo/layer_14/stable_rank_q_proj": 54.12570571899414, "geo/layer_14/stable_rank_k_proj": 37.183162689208984, "geo/layer_14/stable_rank_o_proj": 47.9571418762207, "geo/layer_14/stable_rank_gate_proj": 76.96097564697266, "geo/layer_14/stable_rank_down_proj": 133.7264862060547, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.382914274930954, "geo/layer_14/attn_entropy_mean": 5.481634140014648, "geo/layer_14/attn_entropy_std": 0.3889581859111786, "geo/layer_21/stable_rank_q_proj": 43.40035629272461, "geo/layer_21/stable_rank_k_proj": 30.804218292236328, "geo/layer_21/stable_rank_o_proj": 75.89693450927734, "geo/layer_21/stable_rank_gate_proj": 72.81217956542969, "geo/layer_21/stable_rank_down_proj": 55.17272186279297, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1456761360168457, "geo/layer_21/attn_entropy_mean": 5.714859962463379, "geo/layer_21/attn_entropy_std": 0.2881907820701599, "geo/layer_27/stable_rank_q_proj": 42.648433685302734, "geo/layer_27/stable_rank_k_proj": 31.428070068359375, "geo/layer_27/stable_rank_o_proj": 115.87077331542969, "geo/layer_27/stable_rank_gate_proj": 85.08881378173828, "geo/layer_27/stable_rank_down_proj": 131.47886657714844, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08710233122110367, "geo/layer_27/attn_entropy_mean": 4.2728400230407715, "geo/layer_27/attn_entropy_std": 0.6693605780601501, "attnres/final_alpha/block_0": 0.24139438569545746, "attnres/block_norm/0": 1.7212728261947632, "attnres/final_alpha/block_1": 0.0052118804305791855, "attnres/block_norm/1": 40297.109375, "attnres/final_alpha/block_2": 0.011229119263589382, "attnres/block_norm/2": 26427.40625, "attnres/final_alpha/block_3": 0.013129840604960918, "attnres/block_norm/3": 47264.9140625, "attnres/final_alpha/block_4": 0.01612846739590168, "attnres/block_norm/4": 12891.001953125, "attnres/final_alpha/block_5": 0.5957085490226746, "attnres/block_norm/5": 6041.7138671875, "attnres/final_alpha/block_6": 0.11719777435064316, "attnres/block_norm/6": 31149.142578125, "geo/tier1_time_s": 1.3560585975646973, "geo/step": 40425.0, "geo/rankme_slope": 0.0001229166862057323} {"step": 40430, "timestamp": 1778238236.3088787, "train/loss": 2.151503586769104, "train/z_loss": 0.0014132068608887494, "train/perplexity": 8.597776185752833, "train/grad_norm": 0.1650390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1699364.8009646789, "perf/iters_per_sec": 0.8103202824424166, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.234079933166504, "data/tokens_consumed": 84789952512, "data/tokens_consumed_B": 84.789952512, "train/loss_slope": -7.404129053785877e-06} {"step": 40440, "timestamp": 1778238246.6913974, "train/loss": 2.160418677330017, "train/z_loss": 0.0013992457534186542, "train/perplexity": 8.674768827317507, "train/grad_norm": 0.16015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020900.3880354222, "perf/iters_per_sec": 0.9636403980424033, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0377315044403077, "data/tokens_consumed": 84810924032, "data/tokens_consumed_B": 84.810924032, "train/loss_slope": -7.809524131257509e-06} {"step": 40450, "timestamp": 1778238257.0595903, "grad/layer_0/attn": 0.002680022967979312, "grad/layer_0/mlp": 0.002872374840080738, "grad/layer_0/attn_mlp_ratio": 0.9330338217976447, "grad/layer_4/attn": 0.0019037170568481088, "grad/layer_4/mlp": 0.0025983252562582493, "grad/layer_4/attn_mlp_ratio": 0.7326707766842487, "grad/layer_8/attn": 0.003900987096130848, "grad/layer_8/mlp": 0.003773022908717394, "grad/layer_8/attn_mlp_ratio": 1.0339155332786996, "grad/layer_12/attn": 0.005003645550459623, "grad/layer_12/mlp": 0.007060900330543518, "grad/layer_12/attn_mlp_ratio": 0.7086412844479757, "grad/layer_16/attn": 0.00448865070939064, "grad/layer_16/mlp": 0.004706901498138905, "grad/layer_16/attn_mlp_ratio": 0.9536317290264662, "grad/layer_20/attn": 0.004316244274377823, "grad/layer_20/mlp": 0.006060553248971701, "grad/layer_20/attn_mlp_ratio": 0.7121864994572095, "grad/layer_24/attn": 0.009426863864064217, "grad/layer_24/mlp": 0.009115945547819138, "grad/layer_24/attn_mlp_ratio": 1.0341070721850412, "grad/layer_27/attn": 0.009805655106902122, "grad/layer_27/mlp": 0.007863814942538738, "grad/layer_27/attn_mlp_ratio": 1.2469335880687862} {"step": 40450, "timestamp": 1778238257.0763774, "train/loss": 2.152052712440491, "train/z_loss": 0.0014072593534365297, "train/perplexity": 8.602498741893005, "train/grad_norm": 0.1279296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020448.8165518488, "perf/iters_per_sec": 0.9634250719794506, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0379634380340577, "data/tokens_consumed": 84831895552, "data/tokens_consumed_B": 84.831895552, "train/loss_slope": -1.0718044696753997e-05} {"step": 40460, "timestamp": 1778238267.4497633, "train/loss": 2.173732137680054, "train/z_loss": 0.0014033869840204716, "train/perplexity": 8.79103223499471, "train/grad_norm": 0.1533203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023045.971740398, "perf/iters_per_sec": 0.964663492078971, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036630916595459, "data/tokens_consumed": 84852867072, "data/tokens_consumed_B": 84.852867072, "train/loss_slope": -1.1448636378320518e-05} {"step": 40470, "timestamp": 1778238277.8216467, "train/loss": 2.1523011207580565, "train/z_loss": 0.0014186217915266753, "train/perplexity": 8.604635939570189, "train/grad_norm": 0.2392578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022928.6329567283, "perf/iters_per_sec": 0.9646075405868189, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0366910457611085, "data/tokens_consumed": 84873838592, "data/tokens_consumed_B": 84.873838592, "train/loss_slope": -1.6090196887902368e-05} {"step": 40480, "timestamp": 1778238288.2035859, "train/loss": 2.169832944869995, "train/z_loss": 0.0014120217645540833, "train/perplexity": 8.756821046665523, "train/grad_norm": 0.1484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021320.1572522654, "perf/iters_per_sec": 0.963840559602864, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0375159978866577, "data/tokens_consumed": 84894810112, "data/tokens_consumed_B": 84.894810112, "train/loss_slope": -1.5810531608485334e-05} {"step": 40490, "timestamp": 1778238298.5779846, "train/loss": 2.1208260536193846, "train/z_loss": 0.0014071482117287814, "train/perplexity": 8.338022297182393, "train/grad_norm": 0.12890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022479.781901913, "perf/iters_per_sec": 0.9643935117253842, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0369211196899415, "data/tokens_consumed": 84915781632, "data/tokens_consumed_B": 84.915781632, "train/loss_slope": -1.8772994331007287e-05} {"step": 40500, "timestamp": 1778238308.9416182, "grad/layer_0/attn": 0.0028057971503585577, "grad/layer_0/mlp": 0.002723437501117587, "grad/layer_0/attn_mlp_ratio": 1.0302410267109383, "grad/layer_4/attn": 0.0016847762744873762, "grad/layer_4/mlp": 0.002365640364587307, "grad/layer_4/attn_mlp_ratio": 0.712186107613479, "grad/layer_8/attn": 0.0043149287812411785, "grad/layer_8/mlp": 0.0035098448861390352, "grad/layer_8/attn_mlp_ratio": 1.2293787327592967, "grad/layer_12/attn": 0.003553429152816534, "grad/layer_12/mlp": 0.005997620057314634, "grad/layer_12/attn_mlp_ratio": 0.5924731909677223, "grad/layer_16/attn": 0.003830031957477331, "grad/layer_16/mlp": 0.004391215275973082, "grad/layer_16/attn_mlp_ratio": 0.8722031669031052, "grad/layer_20/attn": 0.0034406138584017754, "grad/layer_20/mlp": 0.00554026709869504, "grad/layer_20/attn_mlp_ratio": 0.6210194806510743, "grad/layer_24/attn": 0.014313303865492344, "grad/layer_24/mlp": 0.010368749499320984, "grad/layer_24/attn_mlp_ratio": 1.3804271892562316, "grad/layer_27/attn": 0.005922691896557808, "grad/layer_27/mlp": 0.010103235021233559, "grad/layer_27/attn_mlp_ratio": 0.5862173675549059} {"step": 40500, "timestamp": 1778238309.5491762, "eos/sharpness": 73.14858436584471, "eos/L0_probe": 1.9964401721954346, "eos/L_plus": 2.279644250869751, "eos/L_minus": 2.4447219371795654, "eos/grad_norm": 0.18639469146728516, "eos/embed_grad_frac": 0.06478899717330933, "eos/time_s": 0.6047718524932861} {"step": 40500, "timestamp": 1778238309.5695627, "train/loss": 2.1568662405014036, "train/z_loss": 0.0014126094058156014, "train/perplexity": 8.644006931252648, "train/grad_norm": 0.1865234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1908976.14098217, "perf/iters_per_sec": 0.910270758143506, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0985742330551147, "data/tokens_consumed": 84936753152, "data/tokens_consumed_B": 84.936753152, "train/loss_slope": -1.6761315544911226e-05} {"step": 40500, "timestamp": 1778238310.9329362, "geo/rankme_last": 439.6277770996094, "geo/layer_0/stable_rank_q_proj": 19.2436466217041, "geo/layer_0/stable_rank_k_proj": 16.39035987854004, "geo/layer_0/stable_rank_o_proj": 49.17156982421875, "geo/layer_0/stable_rank_gate_proj": 137.2615966796875, "geo/layer_0/stable_rank_down_proj": 53.0912971496582, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.060242846608161926, "geo/layer_0/attn_entropy_mean": 6.194275856018066, "geo/layer_0/attn_entropy_std": 0.3757547438144684, "geo/layer_7/stable_rank_q_proj": 42.98295593261719, "geo/layer_7/stable_rank_k_proj": 41.944358825683594, "geo/layer_7/stable_rank_o_proj": 98.11356353759766, "geo/layer_7/stable_rank_gate_proj": 90.26670837402344, "geo/layer_7/stable_rank_down_proj": 146.74124145507812, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5012840032577515, "geo/layer_7/attn_entropy_mean": 4.668982028961182, "geo/layer_7/attn_entropy_std": 0.8294557929039001, "geo/layer_14/stable_rank_q_proj": 54.14067077636719, "geo/layer_14/stable_rank_k_proj": 37.2547492980957, "geo/layer_14/stable_rank_o_proj": 47.95355224609375, "geo/layer_14/stable_rank_gate_proj": 76.97663879394531, "geo/layer_14/stable_rank_down_proj": 133.8042449951172, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3844932019710541, "geo/layer_14/attn_entropy_mean": 5.510024070739746, "geo/layer_14/attn_entropy_std": 0.3632957637310028, "geo/layer_21/stable_rank_q_proj": 43.41044235229492, "geo/layer_21/stable_rank_k_proj": 30.850297927856445, "geo/layer_21/stable_rank_o_proj": 75.76458740234375, "geo/layer_21/stable_rank_gate_proj": 72.87353515625, "geo/layer_21/stable_rank_down_proj": 55.17473220825195, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14297279715538025, "geo/layer_21/attn_entropy_mean": 5.710535049438477, "geo/layer_21/attn_entropy_std": 0.2886846363544464, "geo/layer_27/stable_rank_q_proj": 42.64609909057617, "geo/layer_27/stable_rank_k_proj": 31.392955780029297, "geo/layer_27/stable_rank_o_proj": 115.99629974365234, "geo/layer_27/stable_rank_gate_proj": 85.1695327758789, "geo/layer_27/stable_rank_down_proj": 131.32723999023438, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08902868628501892, "geo/layer_27/attn_entropy_mean": 4.2808122634887695, "geo/layer_27/attn_entropy_std": 0.6580255031585693, "attnres/final_alpha/block_0": 0.2402077168226242, "attnres/block_norm/0": 1.7213913202285767, "attnres/final_alpha/block_1": 0.005216761492192745, "attnres/block_norm/1": 40231.60546875, "attnres/final_alpha/block_2": 0.01127352099865675, "attnres/block_norm/2": 26494.51171875, "attnres/final_alpha/block_3": 0.013236820697784424, "attnres/block_norm/3": 47111.2421875, "attnres/final_alpha/block_4": 0.016270359978079796, "attnres/block_norm/4": 12877.16796875, "attnres/final_alpha/block_5": 0.5951722860336304, "attnres/block_norm/5": 6071.359375, "attnres/final_alpha/block_6": 0.1186225414276123, "attnres/block_norm/6": 31203.94140625, "geo/tier1_time_s": 1.3593075275421143, "geo/step": 40500.0, "geo/rankme_slope": 0.00014240493072228893} {"step": 40500, "timestamp": 1778238317.6880426, "geo/ww_alpha_mean": 7.494807547422158, "geo/ww_alpha_std": 3.900229879856229, "geo/ww_alpha_min": 1.3520155290909874, "geo/ww_alpha_max": 24.643232432941566, "geo/ww_alpha_healthy_frac": 0.16751269035532995, "geo/ww_alpha_by_type/q_proj": 4.061705144516727, "geo/ww_alpha_by_type/k_proj": 4.52612926459175, "geo/ww_alpha_by_type/v_proj": 7.930502979164738, "geo/ww_alpha_by_type/o_proj": 7.941079157542773, "geo/ww_alpha_by_type/gate_proj": 8.048788085270258, "geo/ww_alpha_by_type/up_proj": 11.275740856102013, "geo/ww_alpha_by_type/down_proj": 8.780490871500104, "geo/twonn_id/layer_0": 0.7025563716888428, "geo/twonn_id/layer_7": 3.0762555599212646, "geo/twonn_id/layer_14": 4.736948013305664, "geo/twonn_id/layer_21": 8.351760864257812, "geo/twonn_id/layer_27": 5.569180965423584, "geo/tier2_time_s": 6.748663663864136} {"step": 40500, "timestamp": 1778238318.3655045, "eoc/jacobian_sigma/layer_0/attn": 1054.730712890625, "eoc/jacobian_sigma/layer_0/mlp": 8918.677734375, "eoc/jacobian_sigma/layer_0": 8918.677734375, "eoc/jacobian_sigma/layer_7/attn": 1.1330312490463257, "eoc/jacobian_sigma/layer_7/mlp": 1.7411450147628784, "eoc/jacobian_sigma/layer_7": 1.7411450147628784, "eoc/jacobian_sigma/layer_14/attn": 1.621249794960022, "eoc/jacobian_sigma/layer_14/mlp": 7.095860481262207, "eoc/jacobian_sigma/layer_14": 7.095860481262207, "eoc/jacobian_sigma/layer_21/attn": 1.0939475297927856, "eoc/jacobian_sigma/layer_21/mlp": 4.138255596160889, "eoc/jacobian_sigma/layer_21": 4.138255596160889, "eoc/jacobian_sigma/layer_27/attn": 3.986016273498535, "eoc/jacobian_sigma/layer_27/mlp": 28.75240707397461, "eoc/jacobian_sigma/layer_27": 28.75240707397461, "eoc/layer0_sigma": 8918.677734375, "eoc/sigma_max": 28.75240707397461, "eoc/sigma_min": 1.7411450147628784, "eoc/sigma_mean": 10.431917041540146, "eoc/time_s": 0.6686358451843262} {"step": 40510, "timestamp": 1778238328.7601948, "train/loss": 2.175289750099182, "train/z_loss": 0.0014149747556075453, "train/perplexity": 8.804735925729725, "train/grad_norm": 0.193359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1093018.1436120686, "perf/iters_per_sec": 0.521191665464434, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.91867995262146, "data/tokens_consumed": 84957724672, "data/tokens_consumed_B": 84.957724672, "train/loss_slope": -1.5027520129389435e-05} {"step": 40520, "timestamp": 1778238339.134596, "train/loss": 2.1985623836517334, "train/z_loss": 0.0013996989699080586, "train/perplexity": 9.012048314198108, "train/grad_norm": 0.09326171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022704.0895024675, "perf/iters_per_sec": 0.964500469924196, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0368061304092406, "data/tokens_consumed": 84978696192, "data/tokens_consumed_B": 84.978696192, "train/loss_slope": -1.2107354164695739e-05} {"step": 40530, "timestamp": 1778238349.5049846, "train/loss": 2.1739646911621096, "train/z_loss": 0.0014087367802858354, "train/perplexity": 8.793076857884692, "train/grad_norm": 0.16796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024693.6405077525, "perf/iters_per_sec": 0.9654491617716563, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035787320137024, "data/tokens_consumed": 84999667712, "data/tokens_consumed_B": 84.999667712, "train/loss_slope": -1.0958642534690267e-05} {"step": 40540, "timestamp": 1778238359.8587286, "train/loss": 2.2303463220596313, "train/z_loss": 0.001404970942530781, "train/perplexity": 9.30308738603109, "train/grad_norm": 0.09814453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026948.4094103058, "perf/iters_per_sec": 0.9665243193675546, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346351146697998, "data/tokens_consumed": 85020639232, "data/tokens_consumed_B": 85.020639232, "train/loss_slope": -6.910977383615526e-06} {"step": 40550, "timestamp": 1778238370.2119873, "grad/layer_0/attn": 0.00265199551358819, "grad/layer_0/mlp": 0.0027159242890775204, "grad/layer_0/attn_mlp_ratio": 0.9764614671356723, "grad/layer_4/attn": 0.004047869239002466, "grad/layer_4/mlp": 0.00253841420635581, "grad/layer_4/attn_mlp_ratio": 1.5946448256563974, "grad/layer_8/attn": 0.0034742404241114855, "grad/layer_8/mlp": 0.0036878499668091536, "grad/layer_8/attn_mlp_ratio": 0.9420774600843548, "grad/layer_12/attn": 0.005150761920958757, "grad/layer_12/mlp": 0.0066468375734984875, "grad/layer_12/attn_mlp_ratio": 0.7749191681775662, "grad/layer_16/attn": 0.005262772552669048, "grad/layer_16/mlp": 0.004690517671406269, "grad/layer_16/attn_mlp_ratio": 1.1220024758783105, "grad/layer_20/attn": 0.00431267824023962, "grad/layer_20/mlp": 0.006004612892866135, "grad/layer_20/attn_mlp_ratio": 0.7182275103097167, "grad/layer_24/attn": 0.008803222328424454, "grad/layer_24/mlp": 0.008927207440137863, "grad/layer_24/attn_mlp_ratio": 0.9861115347485812, "grad/layer_27/attn": 0.012746274471282959, "grad/layer_27/mlp": 0.008406409993767738, "grad/layer_27/attn_mlp_ratio": 1.5162565624454447} {"step": 40550, "timestamp": 1778238370.226759, "train/loss": 2.1961823225021364, "train/z_loss": 0.0014021028531715275, "train/perplexity": 8.990624593123528, "train/grad_norm": 0.173828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024022.4305629125, "perf/iters_per_sec": 0.9651291039290011, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0361308097839355, "data/tokens_consumed": 85041610752, "data/tokens_consumed_B": 85.041610752, "train/loss_slope": -5.818424995976241e-06} {"step": 40560, "timestamp": 1778238380.598561, "train/loss": 2.153757429122925, "train/z_loss": 0.0014046133612282575, "train/perplexity": 8.617176071799067, "train/grad_norm": 0.2412109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023294.0470971225, "perf/iters_per_sec": 0.9647817836270917, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0365038156509399, "data/tokens_consumed": 85062582272, "data/tokens_consumed_B": 85.062582272, "train/loss_slope": -4.664941474978566e-06} {"step": 40570, "timestamp": 1778238390.9613442, "train/loss": 2.19223837852478, "train/z_loss": 0.0014105153968557715, "train/perplexity": 8.955235904780608, "train/grad_norm": 0.220703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025047.7576097802, "perf/iters_per_sec": 0.9656180179642583, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0356061935424805, "data/tokens_consumed": 85083553792, "data/tokens_consumed_B": 85.083553792, "train/loss_slope": -4.8407283374363396e-06} {"step": 40575, "timestamp": 1778238396.7453616, "eos/sharpness": 22.184967994689938, "eos/L0_probe": 1.9932160377502441, "eos/L_plus": 2.117955446243286, "eos/L_minus": 2.0903263092041016, "eos/grad_norm": 0.13729487359523773, "eos/embed_grad_frac": 0.1292417049407959, "eos/time_s": 0.620786190032959} {"step": 40575, "timestamp": 1778238398.2487545, "geo/rankme_last": 439.4662170410156, "geo/layer_0/stable_rank_q_proj": 19.229944229125977, "geo/layer_0/stable_rank_k_proj": 16.391027450561523, "geo/layer_0/stable_rank_o_proj": 49.22813415527344, "geo/layer_0/stable_rank_gate_proj": 137.3473358154297, "geo/layer_0/stable_rank_down_proj": 53.13310241699219, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.058421310037374496, "geo/layer_0/attn_entropy_mean": 6.193599700927734, "geo/layer_0/attn_entropy_std": 0.3747963309288025, "geo/layer_7/stable_rank_q_proj": 42.913909912109375, "geo/layer_7/stable_rank_k_proj": 42.04001998901367, "geo/layer_7/stable_rank_o_proj": 98.26554870605469, "geo/layer_7/stable_rank_gate_proj": 90.2293701171875, "geo/layer_7/stable_rank_down_proj": 146.5551300048828, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4883694052696228, "geo/layer_7/attn_entropy_mean": 4.6620001792907715, "geo/layer_7/attn_entropy_std": 0.8227857351303101, "geo/layer_14/stable_rank_q_proj": 54.24858856201172, "geo/layer_14/stable_rank_k_proj": 37.15805435180664, "geo/layer_14/stable_rank_o_proj": 47.86592483520508, "geo/layer_14/stable_rank_gate_proj": 76.93994903564453, "geo/layer_14/stable_rank_down_proj": 133.85690307617188, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39223983883857727, "geo/layer_14/attn_entropy_mean": 5.494186878204346, "geo/layer_14/attn_entropy_std": 0.37232768535614014, "geo/layer_21/stable_rank_q_proj": 43.350528717041016, "geo/layer_21/stable_rank_k_proj": 30.851612091064453, "geo/layer_21/stable_rank_o_proj": 75.86090850830078, "geo/layer_21/stable_rank_gate_proj": 73.00633239746094, "geo/layer_21/stable_rank_down_proj": 55.21445083618164, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15040531754493713, "geo/layer_21/attn_entropy_mean": 5.725091934204102, "geo/layer_21/attn_entropy_std": 0.28837788105010986, "geo/layer_27/stable_rank_q_proj": 42.5589599609375, "geo/layer_27/stable_rank_k_proj": 31.412046432495117, "geo/layer_27/stable_rank_o_proj": 116.19374084472656, "geo/layer_27/stable_rank_gate_proj": 85.1891098022461, "geo/layer_27/stable_rank_down_proj": 131.43765258789062, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0896296501159668, "geo/layer_27/attn_entropy_mean": 4.256411075592041, "geo/layer_27/attn_entropy_std": 0.6503946781158447, "attnres/final_alpha/block_0": 0.23948195576667786, "attnres/block_norm/0": 1.7215380668640137, "attnres/final_alpha/block_1": 0.005115206353366375, "attnres/block_norm/1": 40384.859375, "attnres/final_alpha/block_2": 0.011151967570185661, "attnres/block_norm/2": 26468.640625, "attnres/final_alpha/block_3": 0.01321727316826582, "attnres/block_norm/3": 47223.72265625, "attnres/final_alpha/block_4": 0.016225401312112808, "attnres/block_norm/4": 12892.7666015625, "attnres/final_alpha/block_5": 0.6000802516937256, "attnres/block_norm/5": 6002.400390625, "attnres/final_alpha/block_6": 0.1147279441356659, "attnres/block_norm/6": 31451.498046875, "geo/tier1_time_s": 1.363339900970459, "geo/step": 40575.0, "geo/rankme_slope": 0.00011701747105092036} {"step": 40580, "timestamp": 1778238403.4250743, "train/loss": 2.1600476503372192, "train/z_loss": 0.0014106018468737602, "train/perplexity": 8.671550850941257, "train/grad_norm": 0.2353515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1683467.128997941, "perf/iters_per_sec": 0.8027396817197519, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2457338571548462, "data/tokens_consumed": 85104525312, "data/tokens_consumed_B": 85.104525312, "train/loss_slope": -3.9975732430801675e-06} {"step": 40590, "timestamp": 1778238413.776167, "train/loss": 2.159059691429138, "train/z_loss": 0.0014160391758196056, "train/perplexity": 8.662987945626977, "train/grad_norm": 0.205078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026997.9683012303, "perf/iters_per_sec": 0.9665479508882667, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034609818458557, "data/tokens_consumed": 85125496832, "data/tokens_consumed_B": 85.125496832, "train/loss_slope": -4.390815368043397e-06} {"step": 40600, "timestamp": 1778238424.1090608, "grad/layer_0/attn": 0.002642442239448428, "grad/layer_0/mlp": 0.0028038497548550367, "grad/layer_0/attn_mlp_ratio": 0.9424335739208284, "grad/layer_4/attn": 0.002347240224480629, "grad/layer_4/mlp": 0.00255575031042099, "grad/layer_4/attn_mlp_ratio": 0.9184152783111499, "grad/layer_8/attn": 0.003841627622023225, "grad/layer_8/mlp": 0.003654484637081623, "grad/layer_8/attn_mlp_ratio": 1.0512091028983335, "grad/layer_12/attn": 0.004877480678260326, "grad/layer_12/mlp": 0.006661680527031422, "grad/layer_12/attn_mlp_ratio": 0.7321696958075022, "grad/layer_16/attn": 0.007436155807226896, "grad/layer_16/mlp": 0.004680004436522722, "grad/layer_16/attn_mlp_ratio": 1.5889206408231422, "grad/layer_20/attn": 0.0040873028337955475, "grad/layer_20/mlp": 0.0060808113776147366, "grad/layer_20/attn_mlp_ratio": 0.6721640440329579, "grad/layer_24/attn": 0.007257008925080299, "grad/layer_24/mlp": 0.010290693491697311, "grad/layer_24/attn_mlp_ratio": 0.7052011470767495, "grad/layer_27/attn": 0.009091874584555626, "grad/layer_27/mlp": 0.0086887888610363, "grad/layer_27/attn_mlp_ratio": 1.046391462069905} {"step": 40600, "timestamp": 1778238424.1234133, "train/loss": 2.2113840341567994, "train/z_loss": 0.0013973421417176723, "train/perplexity": 9.128341590729038, "train/grad_norm": 0.11376953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027930.3698261722, "perf/iters_per_sec": 0.9669925545817243, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341341257095338, "data/tokens_consumed": 85146468352, "data/tokens_consumed_B": 85.146468352, "train/loss_slope": -1.8076764331934954e-06} {"step": 40610, "timestamp": 1778238434.471872, "train/loss": 2.169018578529358, "train/z_loss": 0.001416586700361222, "train/perplexity": 8.749692689295246, "train/grad_norm": 0.09521484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028138.3049821996, "perf/iters_per_sec": 0.9670917057906149, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340281009674073, "data/tokens_consumed": 85167439872, "data/tokens_consumed_B": 85.167439872, "train/loss_slope": -3.855948558341495e-06} {"step": 40620, "timestamp": 1778238444.8228886, "train/loss": 2.1831297159194945, "train/z_loss": 0.0014099246123805641, "train/perplexity": 8.874036054804925, "train/grad_norm": 0.1845703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027649.8395043146, "perf/iters_per_sec": 0.9668587873002599, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342772006988525, "data/tokens_consumed": 85188411392, "data/tokens_consumed_B": 85.188411392, "train/loss_slope": -2.4159910011462797e-06} {"step": 40630, "timestamp": 1778238455.1708515, "train/loss": 2.1633363485336305, "train/z_loss": 0.0014029065961949528, "train/perplexity": 8.700115909768028, "train/grad_norm": 0.08837890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027822.5147075518, "perf/iters_per_sec": 0.9669411252534637, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341891288757323, "data/tokens_consumed": 85209382912, "data/tokens_consumed_B": 85.209382912, "train/loss_slope": -8.662089763587172e-07} {"step": 40640, "timestamp": 1778238465.5148444, "train/loss": 2.1678898334503174, "train/z_loss": 0.0014141574501991272, "train/perplexity": 8.739822088473122, "train/grad_norm": 0.1708984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028693.2100102636, "perf/iters_per_sec": 0.9673563051272696, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0337452650070191, "data/tokens_consumed": 85230354432, "data/tokens_consumed_B": 85.230354432, "train/loss_slope": -3.6005566508570114e-06} {"step": 40650, "timestamp": 1778238476.3618681, "grad/layer_0/attn": 0.0028312935028225183, "grad/layer_0/mlp": 0.0027693521697074175, "grad/layer_0/attn_mlp_ratio": 1.0223666861715808, "grad/layer_4/attn": 0.001717879669740796, "grad/layer_4/mlp": 0.002558161271736026, "grad/layer_4/attn_mlp_ratio": 0.6715290476671549, "grad/layer_8/attn": 0.003254520008340478, "grad/layer_8/mlp": 0.003853191388770938, "grad/layer_8/attn_mlp_ratio": 0.8446297096380695, "grad/layer_12/attn": 0.004226849880069494, "grad/layer_12/mlp": 0.006635197438299656, "grad/layer_12/attn_mlp_ratio": 0.6370345201738594, "grad/layer_16/attn": 0.0054978271946311, "grad/layer_16/mlp": 0.004671625327318907, "grad/layer_16/attn_mlp_ratio": 1.176855310890445, "grad/layer_20/attn": 0.003938402980566025, "grad/layer_20/mlp": 0.0065558492206037045, "grad/layer_20/attn_mlp_ratio": 0.6007464155999472, "grad/layer_24/attn": 0.015374581329524517, "grad/layer_24/mlp": 0.012669293209910393, "grad/layer_24/attn_mlp_ratio": 1.2135310907592571, "grad/layer_27/attn": 0.009028922766447067, "grad/layer_27/mlp": 0.011968838982284069, "grad/layer_27/attn_mlp_ratio": 0.7543691334117291} {"step": 40650, "timestamp": 1778238476.964412, "eos/sharpness": 76.9625425338745, "eos/L0_probe": 1.9962692260742188, "eos/L_plus": 2.3168601989746094, "eos/L_minus": 2.4453036785125732, "eos/grad_norm": 0.23473457992076874, "eos/embed_grad_frac": 0.04168478399515152, "eos/time_s": 0.5994777679443359} {"step": 40650, "timestamp": 1778238476.9852746, "train/loss": 2.172940766811371, "train/z_loss": 0.001423687522765249, "train/perplexity": 8.784078020222708, "train/grad_norm": 0.234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1829513.2225979266, "perf/iters_per_sec": 0.8723798859586366, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1462896108627318, "data/tokens_consumed": 85251325952, "data/tokens_consumed_B": 85.251325952, "train/loss_slope": -4.3369633565602074e-07} {"step": 40650, "timestamp": 1778238478.3524017, "geo/rankme_last": 439.70684814453125, "geo/layer_0/stable_rank_q_proj": 19.202871322631836, "geo/layer_0/stable_rank_k_proj": 16.344650268554688, "geo/layer_0/stable_rank_o_proj": 49.16957092285156, "geo/layer_0/stable_rank_gate_proj": 137.3454132080078, "geo/layer_0/stable_rank_down_proj": 53.13848876953125, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.059976302087306976, "geo/layer_0/attn_entropy_mean": 6.190891742706299, "geo/layer_0/attn_entropy_std": 0.37431395053863525, "geo/layer_7/stable_rank_q_proj": 42.791786193847656, "geo/layer_7/stable_rank_k_proj": 42.0275993347168, "geo/layer_7/stable_rank_o_proj": 98.13961029052734, "geo/layer_7/stable_rank_gate_proj": 90.09577941894531, "geo/layer_7/stable_rank_down_proj": 146.2869110107422, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5003625154495239, "geo/layer_7/attn_entropy_mean": 4.658932685852051, "geo/layer_7/attn_entropy_std": 0.8234086632728577, "geo/layer_14/stable_rank_q_proj": 54.28459548950195, "geo/layer_14/stable_rank_k_proj": 37.20781326293945, "geo/layer_14/stable_rank_o_proj": 47.911006927490234, "geo/layer_14/stable_rank_gate_proj": 76.85044860839844, "geo/layer_14/stable_rank_down_proj": 134.05294799804688, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3867136836051941, "geo/layer_14/attn_entropy_mean": 5.499942779541016, "geo/layer_14/attn_entropy_std": 0.3759656250476837, "geo/layer_21/stable_rank_q_proj": 43.31045913696289, "geo/layer_21/stable_rank_k_proj": 30.83293342590332, "geo/layer_21/stable_rank_o_proj": 75.89668273925781, "geo/layer_21/stable_rank_gate_proj": 72.91361999511719, "geo/layer_21/stable_rank_down_proj": 55.138702392578125, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1473710834980011, "geo/layer_21/attn_entropy_mean": 5.7339186668396, "geo/layer_21/attn_entropy_std": 0.2832992970943451, "geo/layer_27/stable_rank_q_proj": 42.61153793334961, "geo/layer_27/stable_rank_k_proj": 31.37637710571289, "geo/layer_27/stable_rank_o_proj": 116.21246337890625, "geo/layer_27/stable_rank_gate_proj": 85.14146423339844, "geo/layer_27/stable_rank_down_proj": 131.3944854736328, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07998146861791611, "geo/layer_27/attn_entropy_mean": 4.280482769012451, "geo/layer_27/attn_entropy_std": 0.6685451865196228, "attnres/final_alpha/block_0": 0.24311277270317078, "attnres/block_norm/0": 1.7219078540802002, "attnres/final_alpha/block_1": 0.005216216668486595, "attnres/block_norm/1": 40417.609375, "attnres/final_alpha/block_2": 0.011428664438426495, "attnres/block_norm/2": 26413.171875, "attnres/final_alpha/block_3": 0.013383704237639904, "attnres/block_norm/3": 46985.859375, "attnres/final_alpha/block_4": 0.016469810158014297, "attnres/block_norm/4": 12883.0673828125, "attnres/final_alpha/block_5": 0.5926231741905212, "attnres/block_norm/5": 6036.150390625, "attnres/final_alpha/block_6": 0.11776567995548248, "attnres/block_norm/6": 31164.5859375, "geo/tier1_time_s": 1.3627564907073975, "geo/step": 40650.0, "geo/rankme_slope": 0.00010582355989270709} {"step": 40660, "timestamp": 1778238488.7076108, "train/loss": 2.230720591545105, "train/z_loss": 0.0014004422817379236, "train/perplexity": 9.306569899418973, "train/grad_norm": 0.1259765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1789689.536343264, "perf/iters_per_sec": 0.8533904725757904, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1717965364456178, "data/tokens_consumed": 85272297472, "data/tokens_consumed_B": 85.272297472, "train/loss_slope": 4.039715819268732e-06} {"step": 40670, "timestamp": 1778238499.0574229, "train/loss": 2.1460572242736817, "train/z_loss": 0.0014172282302752138, "train/perplexity": 8.551076866138411, "train/grad_norm": 0.1650390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027432.0975220483, "perf/iters_per_sec": 0.9667549598322145, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034388279914856, "data/tokens_consumed": 85293268992, "data/tokens_consumed_B": 85.293268992, "train/loss_slope": 1.812129331143001e-06} {"step": 40680, "timestamp": 1778238509.4052942, "train/loss": 2.190400266647339, "train/z_loss": 0.001405442098621279, "train/perplexity": 8.938790298361322, "train/grad_norm": 0.134765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027777.0759339617, "perf/iters_per_sec": 0.9669194583577927, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034212303161621, "data/tokens_consumed": 85314240512, "data/tokens_consumed_B": 85.314240512, "train/loss_slope": 3.0411921712038633e-06} {"step": 40690, "timestamp": 1778238519.7514093, "train/loss": 2.1510388135910032, "train/z_loss": 0.0014061669469811022, "train/perplexity": 8.593781098467005, "train/grad_norm": 0.271484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028222.2953801062, "perf/iters_per_sec": 0.967131755533269, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0339852809906005, "data/tokens_consumed": 85335212032, "data/tokens_consumed_B": 85.335212032, "train/loss_slope": 4.069999814426972e-06} {"step": 40700, "timestamp": 1778238530.0954306, "grad/layer_0/attn": 0.002668689237907529, "grad/layer_0/mlp": 0.002651412272825837, "grad/layer_0/attn_mlp_ratio": 1.0065160988380235, "grad/layer_4/attn": 0.0015384876169264317, "grad/layer_4/mlp": 0.0025323473382741213, "grad/layer_4/attn_mlp_ratio": 0.6075341770539046, "grad/layer_8/attn": 0.0034416464623063803, "grad/layer_8/mlp": 0.0037639529909938574, "grad/layer_8/attn_mlp_ratio": 0.9143701791983881, "grad/layer_12/attn": 0.003630513558164239, "grad/layer_12/mlp": 0.006218495778739452, "grad/layer_12/attn_mlp_ratio": 0.5838250324450128, "grad/layer_16/attn": 0.005110145080834627, "grad/layer_16/mlp": 0.0044924612157046795, "grad/layer_16/attn_mlp_ratio": 1.1374933965420377, "grad/layer_20/attn": 0.003082862589508295, "grad/layer_20/mlp": 0.005206335335969925, "grad/layer_20/attn_mlp_ratio": 0.5921367586516191, "grad/layer_24/attn": 0.004719872958958149, "grad/layer_24/mlp": 0.00745352590456605, "grad/layer_24/attn_mlp_ratio": 0.6332402886991664, "grad/layer_27/attn": 0.003965470008552074, "grad/layer_27/mlp": 0.006486070342361927, "grad/layer_27/attn_mlp_ratio": 0.6113825071421879} {"step": 40700, "timestamp": 1778238530.1098473, "train/loss": 2.187881588935852, "train/z_loss": 0.0014095834689214826, "train/perplexity": 8.91630469534965, "train/grad_norm": 0.08984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025796.5797029673, "perf/iters_per_sec": 0.9659750841631733, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035223388671875, "data/tokens_consumed": 85356183552, "data/tokens_consumed_B": 85.356183552, "train/loss_slope": 3.877454457825265e-06} {"step": 40710, "timestamp": 1778238540.4569168, "train/loss": 2.205679106712341, "train/z_loss": 0.0014016380766406656, "train/perplexity": 9.076413328588492, "train/grad_norm": 0.10791015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028136.8085571157, "perf/iters_per_sec": 0.9670909922395304, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340288639068604, "data/tokens_consumed": 85377155072, "data/tokens_consumed_B": 85.377155072, "train/loss_slope": 5.830441642873424e-06} {"step": 40720, "timestamp": 1778238550.7982311, "train/loss": 2.2144339323043822, "train/z_loss": 0.0013961211196146905, "train/perplexity": 9.156224601394701, "train/grad_norm": 0.318359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029124.5550974617, "perf/iters_per_sec": 0.967561986492854, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0335255146026612, "data/tokens_consumed": 85398126592, "data/tokens_consumed_B": 85.398126592, "train/loss_slope": 1.0673605440759556e-05} {"step": 40725, "timestamp": 1778238556.5637116, "eos/sharpness": 23.602032661437985, "eos/L0_probe": 1.9958560466766357, "eos/L_plus": 2.12518310546875, "eos/L_minus": 2.1025493144989014, "eos/grad_norm": 0.13180631399154663, "eos/embed_grad_frac": 0.1566849648952484, "eos/time_s": 0.5992875099182129} {"step": 40725, "timestamp": 1778238557.9432216, "geo/rankme_last": 438.4604187011719, "geo/layer_0/stable_rank_q_proj": 19.226224899291992, "geo/layer_0/stable_rank_k_proj": 16.343738555908203, "geo/layer_0/stable_rank_o_proj": 49.12322235107422, "geo/layer_0/stable_rank_gate_proj": 137.2938690185547, "geo/layer_0/stable_rank_down_proj": 53.12830352783203, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05909644067287445, "geo/layer_0/attn_entropy_mean": 6.189513206481934, "geo/layer_0/attn_entropy_std": 0.37358635663986206, "geo/layer_7/stable_rank_q_proj": 42.85546875, "geo/layer_7/stable_rank_k_proj": 42.063419342041016, "geo/layer_7/stable_rank_o_proj": 98.12259674072266, "geo/layer_7/stable_rank_gate_proj": 89.99117279052734, "geo/layer_7/stable_rank_down_proj": 146.58590698242188, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5084902048110962, "geo/layer_7/attn_entropy_mean": 4.63520622253418, "geo/layer_7/attn_entropy_std": 0.7956357598304749, "geo/layer_14/stable_rank_q_proj": 54.2559928894043, "geo/layer_14/stable_rank_k_proj": 37.1599006652832, "geo/layer_14/stable_rank_o_proj": 47.85204315185547, "geo/layer_14/stable_rank_gate_proj": 76.76007843017578, "geo/layer_14/stable_rank_down_proj": 133.41921997070312, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3847161829471588, "geo/layer_14/attn_entropy_mean": 5.492241382598877, "geo/layer_14/attn_entropy_std": 0.3655758798122406, "geo/layer_21/stable_rank_q_proj": 43.361690521240234, "geo/layer_21/stable_rank_k_proj": 30.815872192382812, "geo/layer_21/stable_rank_o_proj": 75.95438385009766, "geo/layer_21/stable_rank_gate_proj": 72.7980728149414, "geo/layer_21/stable_rank_down_proj": 55.13882827758789, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14597173035144806, "geo/layer_21/attn_entropy_mean": 5.702870845794678, "geo/layer_21/attn_entropy_std": 0.2750494182109833, "geo/layer_27/stable_rank_q_proj": 42.56058120727539, "geo/layer_27/stable_rank_k_proj": 31.309295654296875, "geo/layer_27/stable_rank_o_proj": 115.98529815673828, "geo/layer_27/stable_rank_gate_proj": 85.1695556640625, "geo/layer_27/stable_rank_down_proj": 131.19082641601562, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08662780374288559, "geo/layer_27/attn_entropy_mean": 4.239351272583008, "geo/layer_27/attn_entropy_std": 0.6622693538665771, "attnres/final_alpha/block_0": 0.24029715359210968, "attnres/block_norm/0": 1.7221310138702393, "attnres/final_alpha/block_1": 0.0051817512139678, "attnres/block_norm/1": 40425.4140625, "attnres/final_alpha/block_2": 0.011173894628882408, "attnres/block_norm/2": 26523.892578125, "attnres/final_alpha/block_3": 0.013094323687255383, "attnres/block_norm/3": 47764.9296875, "attnres/final_alpha/block_4": 0.01587020978331566, "attnres/block_norm/4": 12906.353515625, "attnres/final_alpha/block_5": 0.5982190370559692, "attnres/block_norm/5": 6006.560546875, "attnres/final_alpha/block_6": 0.11616364121437073, "attnres/block_norm/6": 31500.57421875, "geo/tier1_time_s": 1.3609838485717773, "geo/step": 40725.0, "geo/rankme_slope": 8.340136835984394e-05} {"step": 40730, "timestamp": 1778238563.163459, "train/loss": 2.1659825801849366, "train/z_loss": 0.0014034684980288147, "train/perplexity": 8.72316892020896, "train/grad_norm": 0.1767578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1696822.504329205, "perf/iters_per_sec": 0.8091080209394479, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2359289169311523, "data/tokens_consumed": 85419098112, "data/tokens_consumed_B": 85.419098112, "train/loss_slope": 9.895126117874571e-06} {"step": 40740, "timestamp": 1778238573.507061, "train/loss": 2.1198493957519533, "train/z_loss": 0.0014208034262992442, "train/perplexity": 8.32988287746817, "train/grad_norm": 0.142578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028837.9852320333, "perf/iters_per_sec": 0.9674253393325964, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033671498298645, "data/tokens_consumed": 85440069632, "data/tokens_consumed_B": 85.440069632, "train/loss_slope": 8.080394855319621e-06} {"step": 40750, "timestamp": 1778238583.83968, "grad/layer_0/attn": 0.002856627805158496, "grad/layer_0/mlp": 0.002601753920316696, "grad/layer_0/attn_mlp_ratio": 1.0979622911510964, "grad/layer_4/attn": 0.0021473229862749577, "grad/layer_4/mlp": 0.002324197441339493, "grad/layer_4/attn_mlp_ratio": 0.9238986566681427, "grad/layer_8/attn": 0.003932167310267687, "grad/layer_8/mlp": 0.003451134543865919, "grad/layer_8/attn_mlp_ratio": 1.1393839174767522, "grad/layer_12/attn": 0.004865220282226801, "grad/layer_12/mlp": 0.006466908846050501, "grad/layer_12/attn_mlp_ratio": 0.7523254653520536, "grad/layer_16/attn": 0.006520709954202175, "grad/layer_16/mlp": 0.004463616758584976, "grad/layer_16/attn_mlp_ratio": 1.4608579008435156, "grad/layer_20/attn": 0.0031814565882086754, "grad/layer_20/mlp": 0.00523674301803112, "grad/layer_20/attn_mlp_ratio": 0.6075258068806745, "grad/layer_24/attn": 0.005943562835454941, "grad/layer_24/mlp": 0.007737098727375269, "grad/layer_24/attn_mlp_ratio": 0.768190114675223, "grad/layer_27/attn": 0.004996250383555889, "grad/layer_27/mlp": 0.006565562915056944, "grad/layer_27/attn_mlp_ratio": 0.7609782088905221} {"step": 40750, "timestamp": 1778238583.8539774, "train/loss": 2.1641283512115477, "train/z_loss": 0.0014002404757775366, "train/perplexity": 8.707009154240469, "train/grad_norm": 0.10693359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028194.328985136, "perf/iters_per_sec": 0.9671184201169662, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033999538421631, "data/tokens_consumed": 85461041152, "data/tokens_consumed_B": 85.461041152, "train/loss_slope": 8.190784903571252e-06} {"step": 40760, "timestamp": 1778238594.2028883, "train/loss": 2.2468138694763184, "train/z_loss": 0.0013993973494507372, "train/perplexity": 9.457554777537782, "train/grad_norm": 0.0908203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027805.0308231865, "perf/iters_per_sec": 0.9669327882877286, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341980457305908, "data/tokens_consumed": 85482012672, "data/tokens_consumed_B": 85.482012672, "train/loss_slope": 1.488781039721254e-05} {"step": 40770, "timestamp": 1778238604.5462046, "train/loss": 2.203309345245361, "train/z_loss": 0.0013942604535259306, "train/perplexity": 9.054929859426567, "train/grad_norm": 0.1201171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028908.4150388592, "perf/iters_per_sec": 0.9674589228815361, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0336356163024902, "data/tokens_consumed": 85502984192, "data/tokens_consumed_B": 85.502984192, "train/loss_slope": 1.7152690601320275e-05} {"step": 40780, "timestamp": 1778238614.8908308, "train/loss": 2.1710771560668944, "train/z_loss": 0.0014019366702996195, "train/perplexity": 8.767723162321785, "train/grad_norm": 0.2294921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028642.6324584002, "perf/iters_per_sec": 0.9673321878711701, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03377103805542, "data/tokens_consumed": 85523955712, "data/tokens_consumed_B": 85.523955712, "train/loss_slope": 1.7703811635207092e-05} {"step": 40790, "timestamp": 1778238625.237454, "train/loss": 2.130796504020691, "train/z_loss": 0.0014160900609567761, "train/perplexity": 8.4215719566721, "train/grad_norm": 0.15625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028262.6095005805, "perf/iters_per_sec": 0.9671509788039114, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0339647293090821, "data/tokens_consumed": 85544927232, "data/tokens_consumed_B": 85.544927232, "train/loss_slope": 1.1860779960556798e-05} {"step": 40800, "timestamp": 1778238635.5786834, "grad/layer_0/attn": 0.002511267550289631, "grad/layer_0/mlp": 0.0027425780426710844, "grad/layer_0/attn_mlp_ratio": 0.9156594341716102, "grad/layer_4/attn": 0.0024211714044213295, "grad/layer_4/mlp": 0.0023759317118674517, "grad/layer_4/attn_mlp_ratio": 1.0190407790021212, "grad/layer_8/attn": 0.0035512482281774282, "grad/layer_8/mlp": 0.0037011378444731236, "grad/layer_8/attn_mlp_ratio": 0.9595017212153564, "grad/layer_12/attn": 0.004490419290959835, "grad/layer_12/mlp": 0.00642162561416626, "grad/layer_12/attn_mlp_ratio": 0.6992651846796152, "grad/layer_16/attn": 0.005587584339082241, "grad/layer_16/mlp": 0.0046343086287379265, "grad/layer_16/attn_mlp_ratio": 1.2056996342157642, "grad/layer_20/attn": 0.0047599985264241695, "grad/layer_20/mlp": 0.0058232867158949375, "grad/layer_20/attn_mlp_ratio": 0.8174075357977436, "grad/layer_24/attn": 0.011848627589643002, "grad/layer_24/mlp": 0.010314102284610271, "grad/layer_24/attn_mlp_ratio": 1.1487793263834964, "grad/layer_27/attn": 0.004355579614639282, "grad/layer_27/mlp": 0.010183899663388729, "grad/layer_27/attn_mlp_ratio": 0.42769270277950455} {"step": 40800, "timestamp": 1778238636.158265, "eos/sharpness": 46.708989143371575, "eos/L0_probe": 1.9990407228469849, "eos/L_plus": 2.2595105171203613, "eos/L_minus": 2.205660820007324, "eos/grad_norm": 0.155116006731987, "eos/embed_grad_frac": 0.10390853881835938, "eos/time_s": 0.5768411159515381} {"step": 40800, "timestamp": 1778238636.1775594, "train/loss": 2.1488540172576904, "train/z_loss": 0.001410694804508239, "train/perplexity": 8.575025932602978, "train/grad_norm": 0.1552734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1918152.5843731794, "perf/iters_per_sec": 0.9146464273324868, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0933186531066894, "data/tokens_consumed": 85565898752, "data/tokens_consumed_B": 85.565898752, "train/loss_slope": 9.294472733596923e-06} {"step": 40800, "timestamp": 1778238637.543489, "geo/rankme_last": 439.0635986328125, "geo/layer_0/stable_rank_q_proj": 19.226816177368164, "geo/layer_0/stable_rank_k_proj": 16.353851318359375, "geo/layer_0/stable_rank_o_proj": 48.985740661621094, "geo/layer_0/stable_rank_gate_proj": 137.3800048828125, "geo/layer_0/stable_rank_down_proj": 53.13153839111328, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.059255339205265045, "geo/layer_0/attn_entropy_mean": 6.192668914794922, "geo/layer_0/attn_entropy_std": 0.37052154541015625, "geo/layer_7/stable_rank_q_proj": 42.895755767822266, "geo/layer_7/stable_rank_k_proj": 42.008872985839844, "geo/layer_7/stable_rank_o_proj": 97.9150390625, "geo/layer_7/stable_rank_gate_proj": 89.97982025146484, "geo/layer_7/stable_rank_down_proj": 146.2449188232422, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5008317232131958, "geo/layer_7/attn_entropy_mean": 4.679842472076416, "geo/layer_7/attn_entropy_std": 0.8327779769897461, "geo/layer_14/stable_rank_q_proj": 54.17669677734375, "geo/layer_14/stable_rank_k_proj": 37.22252655029297, "geo/layer_14/stable_rank_o_proj": 47.86027145385742, "geo/layer_14/stable_rank_gate_proj": 76.79238891601562, "geo/layer_14/stable_rank_down_proj": 133.23318481445312, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3934994041919708, "geo/layer_14/attn_entropy_mean": 5.493798732757568, "geo/layer_14/attn_entropy_std": 0.37588465213775635, "geo/layer_21/stable_rank_q_proj": 43.315528869628906, "geo/layer_21/stable_rank_k_proj": 30.76140594482422, "geo/layer_21/stable_rank_o_proj": 75.86090087890625, "geo/layer_21/stable_rank_gate_proj": 72.75989532470703, "geo/layer_21/stable_rank_down_proj": 55.18095779418945, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15114907920360565, "geo/layer_21/attn_entropy_mean": 5.7405686378479, "geo/layer_21/attn_entropy_std": 0.28676262497901917, "geo/layer_27/stable_rank_q_proj": 42.492347717285156, "geo/layer_27/stable_rank_k_proj": 31.29649543762207, "geo/layer_27/stable_rank_o_proj": 116.13763427734375, "geo/layer_27/stable_rank_gate_proj": 85.11470794677734, "geo/layer_27/stable_rank_down_proj": 131.10626220703125, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08670210838317871, "geo/layer_27/attn_entropy_mean": 4.268792152404785, "geo/layer_27/attn_entropy_std": 0.6385869383811951, "attnres/final_alpha/block_0": 0.24138054251670837, "attnres/block_norm/0": 1.72226881980896, "attnres/final_alpha/block_1": 0.005198065657168627, "attnres/block_norm/1": 40356.9765625, "attnres/final_alpha/block_2": 0.011542235501110554, "attnres/block_norm/2": 26475.287109375, "attnres/final_alpha/block_3": 0.013460756279528141, "attnres/block_norm/3": 47512.30859375, "attnres/final_alpha/block_4": 0.01618240773677826, "attnres/block_norm/4": 12973.5146484375, "attnres/final_alpha/block_5": 0.5970543026924133, "attnres/block_norm/5": 6029.6123046875, "attnres/final_alpha/block_6": 0.11518165469169617, "attnres/block_norm/6": 31467.134765625, "geo/tier1_time_s": 1.3628630638122559, "geo/step": 40800.0, "geo/rankme_slope": 4.8311433948579434e-05} {"step": 40810, "timestamp": 1778238648.5541434, "train/loss": 2.1883122444152834, "train/z_loss": 0.0014140671468339861, "train/perplexity": 8.920145377769087, "train/grad_norm": 0.171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1694901.5319462535, "perf/iters_per_sec": 0.8081920299273746, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2373296976089478, "data/tokens_consumed": 85586870272, "data/tokens_consumed_B": 85.586870272, "train/loss_slope": 9.603849732049655e-06} {"step": 40820, "timestamp": 1778238658.9099667, "train/loss": 2.1832983016967775, "train/z_loss": 0.001402360969223082, "train/perplexity": 8.875532217183167, "train/grad_norm": 0.1953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026406.2225144205, "perf/iters_per_sec": 0.9662657845089057, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349119424819946, "data/tokens_consumed": 85607841792, "data/tokens_consumed_B": 85.607841792, "train/loss_slope": 9.835447179208927e-06} {"step": 40830, "timestamp": 1778238669.9057596, "train/loss": 2.1604153633117678, "train/z_loss": 0.0014113482553511857, "train/perplexity": 8.674740079022943, "train/grad_norm": 0.19921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1908387.4002393226, "perf/iters_per_sec": 0.9099900246807683, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0989131450653076, "data/tokens_consumed": 85628813312, "data/tokens_consumed_B": 85.628813312, "train/loss_slope": 9.386897008411218e-06} {"step": 40840, "timestamp": 1778238680.3002007, "train/loss": 2.2556466341018675, "train/z_loss": 0.0013903630780987442, "train/perplexity": 9.541461149925262, "train/grad_norm": 0.2451171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019103.4692834853, "perf/iters_per_sec": 0.9627835604112078, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0386550426483154, "data/tokens_consumed": 85649784832, "data/tokens_consumed_B": 85.649784832, "train/loss_slope": 1.3972305371673365e-05} {"step": 40850, "timestamp": 1778238690.6484487, "grad/layer_0/attn": 0.0027878708206117153, "grad/layer_0/mlp": 0.0030598349403589964, "grad/layer_0/attn_mlp_ratio": 0.9111180125202525, "grad/layer_4/attn": 0.0027906999457627535, "grad/layer_4/mlp": 0.0027627309318631887, "grad/layer_4/attn_mlp_ratio": 1.01012364706408, "grad/layer_8/attn": 0.003704385831952095, "grad/layer_8/mlp": 0.0040145921520888805, "grad/layer_8/attn_mlp_ratio": 0.9227302797748942, "grad/layer_12/attn": 0.005884801968932152, "grad/layer_12/mlp": 0.007241664454340935, "grad/layer_12/attn_mlp_ratio": 0.8126311188226156, "grad/layer_16/attn": 0.004697763826698065, "grad/layer_16/mlp": 0.004823224153369665, "grad/layer_16/attn_mlp_ratio": 0.9739882659231636, "grad/layer_20/attn": 0.003150342497974634, "grad/layer_20/mlp": 0.006012682802975178, "grad/layer_20/attn_mlp_ratio": 0.523949549445854, "grad/layer_24/attn": 0.005709374323487282, "grad/layer_24/mlp": 0.007902977988123894, "grad/layer_24/attn_mlp_ratio": 0.7224332725997274, "grad/layer_27/attn": 0.005758806597441435, "grad/layer_27/mlp": 0.0069291722029447556, "grad/layer_27/attn_mlp_ratio": 0.8310958864443393} {"step": 40850, "timestamp": 1778238690.6630518, "train/loss": 2.166886878013611, "train/z_loss": 0.0014120895531959832, "train/perplexity": 8.731060830703624, "train/grad_norm": 0.107421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025296.4174409518, "perf/iters_per_sec": 0.9657365882115134, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354790449142457, "data/tokens_consumed": 85670756352, "data/tokens_consumed_B": 85.670756352, "train/loss_slope": 9.806613281185934e-06} {"step": 40860, "timestamp": 1778238701.007968, "train/loss": 2.184620404243469, "train/z_loss": 0.0014066421659663319, "train/perplexity": 8.8872743413665, "train/grad_norm": 0.1025390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028537.1809931858, "perf/iters_per_sec": 0.967281904694169, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0338247776031495, "data/tokens_consumed": 85691727872, "data/tokens_consumed_B": 85.691727872, "train/loss_slope": 9.30344720091366e-06} {"step": 40870, "timestamp": 1778238711.3616, "train/loss": 2.172905707359314, "train/z_loss": 0.0014121183310635387, "train/perplexity": 8.78377006065897, "train/grad_norm": 0.09521484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026818.521498531, "perf/iters_per_sec": 0.966462383984819, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034701418876648, "data/tokens_consumed": 85712699392, "data/tokens_consumed_B": 85.712699392, "train/loss_slope": 7.830845912655242e-06} {"step": 40875, "timestamp": 1778238717.1269512, "eos/sharpness": 80.07919788360594, "eos/L0_probe": 2.0021610260009766, "eos/L_plus": 2.521029472351074, "eos/L_minus": 2.2840845584869385, "eos/grad_norm": 0.21656085550785065, "eos/embed_grad_frac": 0.04476368427276611, "eos/time_s": 0.5979461669921875} {"step": 40875, "timestamp": 1778238718.5063033, "geo/rankme_last": 439.9937438964844, "geo/layer_0/stable_rank_q_proj": 19.2268123626709, "geo/layer_0/stable_rank_k_proj": 16.36956214904785, "geo/layer_0/stable_rank_o_proj": 49.00593566894531, "geo/layer_0/stable_rank_gate_proj": 137.0416717529297, "geo/layer_0/stable_rank_down_proj": 53.18438720703125, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06097983941435814, "geo/layer_0/attn_entropy_mean": 6.194849014282227, "geo/layer_0/attn_entropy_std": 0.3643154203891754, "geo/layer_7/stable_rank_q_proj": 42.847129821777344, "geo/layer_7/stable_rank_k_proj": 42.010986328125, "geo/layer_7/stable_rank_o_proj": 97.82230377197266, "geo/layer_7/stable_rank_gate_proj": 89.78839874267578, "geo/layer_7/stable_rank_down_proj": 146.14234924316406, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5068550705909729, "geo/layer_7/attn_entropy_mean": 4.6605730056762695, "geo/layer_7/attn_entropy_std": 0.8350314497947693, "geo/layer_14/stable_rank_q_proj": 54.29132080078125, "geo/layer_14/stable_rank_k_proj": 37.19142532348633, "geo/layer_14/stable_rank_o_proj": 47.722930908203125, "geo/layer_14/stable_rank_gate_proj": 76.64514923095703, "geo/layer_14/stable_rank_down_proj": 133.0289764404297, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3978595733642578, "geo/layer_14/attn_entropy_mean": 5.519111156463623, "geo/layer_14/attn_entropy_std": 0.3680857717990875, "geo/layer_21/stable_rank_q_proj": 43.35265350341797, "geo/layer_21/stable_rank_k_proj": 30.738412857055664, "geo/layer_21/stable_rank_o_proj": 75.88082122802734, "geo/layer_21/stable_rank_gate_proj": 72.69410705566406, "geo/layer_21/stable_rank_down_proj": 55.133358001708984, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14455534517765045, "geo/layer_21/attn_entropy_mean": 5.717371463775635, "geo/layer_21/attn_entropy_std": 0.2895922064781189, "geo/layer_27/stable_rank_q_proj": 42.51655197143555, "geo/layer_27/stable_rank_k_proj": 31.355331420898438, "geo/layer_27/stable_rank_o_proj": 116.0888671875, "geo/layer_27/stable_rank_gate_proj": 85.24484252929688, "geo/layer_27/stable_rank_down_proj": 131.12957763671875, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08337045460939407, "geo/layer_27/attn_entropy_mean": 4.266335964202881, "geo/layer_27/attn_entropy_std": 0.644555926322937, "attnres/final_alpha/block_0": 0.23897618055343628, "attnres/block_norm/0": 1.7225619554519653, "attnres/final_alpha/block_1": 0.005108097568154335, "attnres/block_norm/1": 40455.6953125, "attnres/final_alpha/block_2": 0.011151561513543129, "attnres/block_norm/2": 26485.25390625, "attnres/final_alpha/block_3": 0.013212434947490692, "attnres/block_norm/3": 47324.68359375, "attnres/final_alpha/block_4": 0.016063343733549118, "attnres/block_norm/4": 12930.716796875, "attnres/final_alpha/block_5": 0.6012458801269531, "attnres/block_norm/5": 6010.1484375, "attnres/final_alpha/block_6": 0.11424250900745392, "attnres/block_norm/6": 31429.4921875, "geo/tier1_time_s": 1.359102487564087, "geo/step": 40875.0, "geo/rankme_slope": 7.081586540866346e-05} {"step": 40880, "timestamp": 1778238723.678849, "train/loss": 2.1518454551696777, "train/z_loss": 0.0014146619942039251, "train/perplexity": 8.600715996231468, "train/grad_norm": 0.2080078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1703723.029746891, "perf/iters_per_sec": 0.8123984478697257, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2309230804443358, "data/tokens_consumed": 85733670912, "data/tokens_consumed_B": 85.733670912, "train/loss_slope": 5.224717010294323e-06} {"step": 40890, "timestamp": 1778238734.0466835, "train/loss": 2.194204640388489, "train/z_loss": 0.001402228232473135, "train/perplexity": 8.97286156627503, "train/grad_norm": 0.09619140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024094.482656149, "perf/iters_per_sec": 0.9651634610443826, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0360939264297486, "data/tokens_consumed": 85754642432, "data/tokens_consumed_B": 85.754642432, "train/loss_slope": 7.5560125044219924e-06} {"step": 40900, "timestamp": 1778238744.3881736, "grad/layer_0/attn": 0.0034016992431133986, "grad/layer_0/mlp": 0.0032484326511621475, "grad/layer_0/attn_mlp_ratio": 1.0471816730379961, "grad/layer_4/attn": 0.0023626957554370165, "grad/layer_4/mlp": 0.0027285234536975622, "grad/layer_4/attn_mlp_ratio": 0.8659246324757601, "grad/layer_8/attn": 0.0033956982661038637, "grad/layer_8/mlp": 0.003809282323345542, "grad/layer_8/attn_mlp_ratio": 0.8914272791360942, "grad/layer_12/attn": 0.004867952782660723, "grad/layer_12/mlp": 0.006869517732411623, "grad/layer_12/attn_mlp_ratio": 0.7086309259862235, "grad/layer_16/attn": 0.003914080560207367, "grad/layer_16/mlp": 0.004839685745537281, "grad/layer_16/attn_mlp_ratio": 0.8087468247172649, "grad/layer_20/attn": 0.003962756134569645, "grad/layer_20/mlp": 0.00694846548140049, "grad/layer_20/attn_mlp_ratio": 0.5703066508924027, "grad/layer_24/attn": 0.013001828454434872, "grad/layer_24/mlp": 0.01180457603186369, "grad/layer_24/attn_mlp_ratio": 1.10142272871107, "grad/layer_27/attn": 0.008532586507499218, "grad/layer_27/mlp": 0.010513441637158394, "grad/layer_27/attn_mlp_ratio": 0.8115883191078997} {"step": 40900, "timestamp": 1778238744.4024522, "train/loss": 2.189042592048645, "train/z_loss": 0.0013931067893281578, "train/perplexity": 8.926662564451222, "train/grad_norm": 0.18359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026792.788760424, "perf/iters_per_sec": 0.9664501136591072, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347145557403565, "data/tokens_consumed": 85775613952, "data/tokens_consumed_B": 85.775613952, "train/loss_slope": 1.0658453528744462e-05} {"step": 40910, "timestamp": 1778238754.7515256, "train/loss": 2.229941487312317, "train/z_loss": 0.0014032036066055297, "train/perplexity": 9.299321935244008, "train/grad_norm": 0.10400390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027922.7957501956, "perf/iters_per_sec": 0.9669889429808596, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034137988090515, "data/tokens_consumed": 85796585472, "data/tokens_consumed_B": 85.796585472, "train/loss_slope": 1.408684216495611e-05} {"step": 40920, "timestamp": 1778238765.0968847, "train/loss": 2.1612024784088133, "train/z_loss": 0.001405488094314933, "train/perplexity": 8.681570785825647, "train/grad_norm": 0.115234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028393.3841979033, "perf/iters_per_sec": 0.9672133370389477, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0338980674743652, "data/tokens_consumed": 85817556992, "data/tokens_consumed_B": 85.817556992, "train/loss_slope": 1.0490492360927399e-05} {"step": 40930, "timestamp": 1778238775.4663353, "train/loss": 2.1158008098602297, "train/z_loss": 0.0014022408053278924, "train/perplexity": 8.296226806888512, "train/grad_norm": 0.1708984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023679.7992206938, "perf/iters_per_sec": 0.9649657245734662, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0363062381744386, "data/tokens_consumed": 85838528512, "data/tokens_consumed_B": 85.838528512, "train/loss_slope": 7.188468361415936e-06} {"step": 40940, "timestamp": 1778238785.8440475, "train/loss": 2.1992799758911135, "train/z_loss": 0.001406224293168634, "train/perplexity": 9.01851761101028, "train/grad_norm": 0.10546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022271.2851016463, "perf/iters_per_sec": 0.9642940927036506, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0370280265808105, "data/tokens_consumed": 85859500032, "data/tokens_consumed_B": 85.859500032, "train/loss_slope": 8.537759159979749e-06} {"step": 40950, "timestamp": 1778238796.2070882, "grad/layer_0/attn": 0.0030502991285175085, "grad/layer_0/mlp": 0.0029539281968027353, "grad/layer_0/attn_mlp_ratio": 1.0326246347343917, "grad/layer_4/attn": 0.00191376858856529, "grad/layer_4/mlp": 0.002606173511594534, "grad/layer_4/attn_mlp_ratio": 0.7343212209851169, "grad/layer_8/attn": 0.004163108766078949, "grad/layer_8/mlp": 0.003794301301240921, "grad/layer_8/attn_mlp_ratio": 1.097200334353357, "grad/layer_12/attn": 0.0037891773972660303, "grad/layer_12/mlp": 0.006846412550657988, "grad/layer_12/attn_mlp_ratio": 0.5534544279772363, "grad/layer_16/attn": 0.003428675467148423, "grad/layer_16/mlp": 0.004479489289224148, "grad/layer_16/attn_mlp_ratio": 0.7654165841750713, "grad/layer_20/attn": 0.003726964583620429, "grad/layer_20/mlp": 0.00568520650267601, "grad/layer_20/attn_mlp_ratio": 0.6555548186878833, "grad/layer_24/attn": 0.006375768221914768, "grad/layer_24/mlp": 0.00846068561077118, "grad/layer_24/attn_mlp_ratio": 0.7535758258692754, "grad/layer_27/attn": 0.003855413058772683, "grad/layer_27/mlp": 0.007536532357335091, "grad/layer_27/attn_mlp_ratio": 0.5115632528087} {"step": 40950, "timestamp": 1778238796.7983565, "eos/sharpness": 16.946077346801754, "eos/L0_probe": 1.9984471797943115, "eos/L_plus": 2.0890166759490967, "eos/L_minus": 2.077338457107544, "eos/grad_norm": 0.09658841788768768, "eos/embed_grad_frac": 0.24555055797100067, "eos/time_s": 0.5883240699768066} {"step": 40950, "timestamp": 1778238796.8172657, "train/loss": 2.13796865940094, "train/z_loss": 0.0014140175189822912, "train/perplexity": 8.482189900098092, "train/grad_norm": 0.0966796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1912095.870161803, "perf/iters_per_sec": 0.9117583609398856, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0967818260192872, "data/tokens_consumed": 85880471552, "data/tokens_consumed_B": 85.880471552, "train/loss_slope": 4.5518753802565684e-06} {"step": 40950, "timestamp": 1778238798.179638, "geo/rankme_last": 439.1117858886719, "geo/layer_0/stable_rank_q_proj": 19.2240047454834, "geo/layer_0/stable_rank_k_proj": 16.390365600585938, "geo/layer_0/stable_rank_o_proj": 49.12918472290039, "geo/layer_0/stable_rank_gate_proj": 136.9457550048828, "geo/layer_0/stable_rank_down_proj": 53.221500396728516, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05863358452916145, "geo/layer_0/attn_entropy_mean": 6.194033145904541, "geo/layer_0/attn_entropy_std": 0.36991000175476074, "geo/layer_7/stable_rank_q_proj": 42.84981155395508, "geo/layer_7/stable_rank_k_proj": 42.04448318481445, "geo/layer_7/stable_rank_o_proj": 97.85285186767578, "geo/layer_7/stable_rank_gate_proj": 89.7408676147461, "geo/layer_7/stable_rank_down_proj": 146.13534545898438, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5111204385757446, "geo/layer_7/attn_entropy_mean": 4.682219505310059, "geo/layer_7/attn_entropy_std": 0.820620059967041, "geo/layer_14/stable_rank_q_proj": 54.38083267211914, "geo/layer_14/stable_rank_k_proj": 37.231136322021484, "geo/layer_14/stable_rank_o_proj": 47.727325439453125, "geo/layer_14/stable_rank_gate_proj": 76.56524658203125, "geo/layer_14/stable_rank_down_proj": 132.9591522216797, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3887384235858917, "geo/layer_14/attn_entropy_mean": 5.515481948852539, "geo/layer_14/attn_entropy_std": 0.363938570022583, "geo/layer_21/stable_rank_q_proj": 43.466678619384766, "geo/layer_21/stable_rank_k_proj": 30.7001953125, "geo/layer_21/stable_rank_o_proj": 75.8330078125, "geo/layer_21/stable_rank_gate_proj": 72.62163543701172, "geo/layer_21/stable_rank_down_proj": 55.2498893737793, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.146641805768013, "geo/layer_21/attn_entropy_mean": 5.72245454788208, "geo/layer_21/attn_entropy_std": 0.2895148694515228, "geo/layer_27/stable_rank_q_proj": 42.63064193725586, "geo/layer_27/stable_rank_k_proj": 31.346708297729492, "geo/layer_27/stable_rank_o_proj": 116.00328826904297, "geo/layer_27/stable_rank_gate_proj": 85.14939880371094, "geo/layer_27/stable_rank_down_proj": 131.19622802734375, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08823377639055252, "geo/layer_27/attn_entropy_mean": 4.274469375610352, "geo/layer_27/attn_entropy_std": 0.6658686399459839, "attnres/final_alpha/block_0": 0.24024388194084167, "attnres/block_norm/0": 1.722580909729004, "attnres/final_alpha/block_1": 0.005128586199134588, "attnres/block_norm/1": 40544.9765625, "attnres/final_alpha/block_2": 0.011169839650392532, "attnres/block_norm/2": 26482.6328125, "attnres/final_alpha/block_3": 0.013070641085505486, "attnres/block_norm/3": 47663.23046875, "attnres/final_alpha/block_4": 0.015933983027935028, "attnres/block_norm/4": 12956.4609375, "attnres/final_alpha/block_5": 0.5991995334625244, "attnres/block_norm/5": 6030.4833984375, "attnres/final_alpha/block_6": 0.11525358259677887, "attnres/block_norm/6": 31589.35546875, "geo/tier1_time_s": 1.358321189880371, "geo/step": 40950.0, "geo/rankme_slope": 6.119543520533214e-05} {"step": 40960, "timestamp": 1778238808.5634222, "train/loss": 2.1701457262039185, "train/z_loss": 0.0014006817364133894, "train/perplexity": 8.759560445227462, "train/grad_norm": 0.1083984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1785989.4431613449, "perf/iters_per_sec": 0.8516261306578373, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1742241859436036, "data/tokens_consumed": 85901443072, "data/tokens_consumed_B": 85.901443072, "train/loss_slope": 2.472290461964035e-06} {"step": 40970, "timestamp": 1778238818.9423447, "train/loss": 2.1892797470092775, "train/z_loss": 0.0014006888028234243, "train/perplexity": 8.928779817808921, "train/grad_norm": 0.212890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022032.2920401664, "perf/iters_per_sec": 0.9641801319313843, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0371505975723267, "data/tokens_consumed": 85922414592, "data/tokens_consumed_B": 85.922414592, "train/loss_slope": 3.5813516444571e-06} {"step": 40980, "timestamp": 1778238829.3226974, "train/loss": 2.163201999664307, "train/z_loss": 0.001413644850254059, "train/perplexity": 8.698947137545938, "train/grad_norm": 0.11279296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021417.8451270189, "perf/iters_per_sec": 0.9638871408114523, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0374658584594727, "data/tokens_consumed": 85943386112, "data/tokens_consumed_B": 85.943386112, "train/loss_slope": 3.302877957206483e-06} {"step": 40990, "timestamp": 1778238839.7041252, "train/loss": 2.1362316370010377, "train/z_loss": 0.0014176153694279493, "train/perplexity": 8.467468935265625, "train/grad_norm": 0.162109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021244.4939611938, "perf/iters_per_sec": 0.9638044805341691, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0375548362731934, "data/tokens_consumed": 85964357632, "data/tokens_consumed_B": 85.964357632, "train/loss_slope": -8.85095891028256e-07} {"step": 41000, "timestamp": 1778238850.6355762, "grad/layer_0/attn": 0.002901744097471237, "grad/layer_0/mlp": 0.002827799879014492, "grad/layer_0/attn_mlp_ratio": 1.0261489917976854, "grad/layer_4/attn": 0.0019585161935538054, "grad/layer_4/mlp": 0.0025978069752454758, "grad/layer_4/attn_mlp_ratio": 0.7539113324528683, "grad/layer_8/attn": 0.006150263361632824, "grad/layer_8/mlp": 0.0037155989557504654, "grad/layer_8/attn_mlp_ratio": 1.655254851062129, "grad/layer_12/attn": 0.0052215466275811195, "grad/layer_12/mlp": 0.006565330550074577, "grad/layer_12/attn_mlp_ratio": 0.7953211964308008, "grad/layer_16/attn": 0.00445387652143836, "grad/layer_16/mlp": 0.004385847132652998, "grad/layer_16/attn_mlp_ratio": 1.01551109402053, "grad/layer_20/attn": 0.003294338472187519, "grad/layer_20/mlp": 0.0061328839510679245, "grad/layer_20/attn_mlp_ratio": 0.5371597513919855, "grad/layer_24/attn": 0.011376968584954739, "grad/layer_24/mlp": 0.008948898874223232, "grad/layer_24/attn_mlp_ratio": 1.2713260723721898, "grad/layer_27/attn": 0.0052787791937589645, "grad/layer_27/mlp": 0.008513101376593113, "grad/layer_27/attn_mlp_ratio": 0.6200770903851011} {"step": 41000, "timestamp": 1778238850.6498563, "train/loss": 2.1609986782073975, "train/z_loss": 0.001409342372789979, "train/perplexity": 8.67980166023109, "train/grad_norm": 0.1318359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1916871.6354220186, "perf/iters_per_sec": 0.914035623274812, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0940492630004883, "data/tokens_consumed": 85985329152, "data/tokens_consumed_B": 85.985329152, "train/loss_slope": -1.080173005436786e-06} {"step": 41000, "timestamp": 1778238857.8094256, "geo/ww_alpha_mean": 7.528985688306521, "geo/ww_alpha_std": 3.9860726203621266, "geo/ww_alpha_min": 1.3644949242264064, "geo/ww_alpha_max": 24.845353845046436, "geo/ww_alpha_healthy_frac": 0.15228426395939088, "geo/ww_alpha_by_type/q_proj": 4.078165206390816, "geo/ww_alpha_by_type/k_proj": 4.4708217212766135, "geo/ww_alpha_by_type/v_proj": 7.933289730523429, "geo/ww_alpha_by_type/o_proj": 8.838475138472171, "geo/ww_alpha_by_type/gate_proj": 7.738748248170978, "geo/ww_alpha_by_type/up_proj": 11.352722194122064, "geo/ww_alpha_by_type/down_proj": 8.392513966769462, "geo/twonn_id/layer_0": 0.6940186619758606, "geo/twonn_id/layer_7": 3.2905657291412354, "geo/twonn_id/layer_14": 4.5518269538879395, "geo/twonn_id/layer_21": 7.1663007736206055, "geo/twonn_id/layer_27": 6.213376045227051, "geo/tier2_time_s": 7.1502838134765625} {"step": 41000, "timestamp": 1778238858.487776, "eoc/jacobian_sigma/layer_0/attn": 1026.0926513671875, "eoc/jacobian_sigma/layer_0/mlp": 8920.240234375, "eoc/jacobian_sigma/layer_0": 8920.240234375, "eoc/jacobian_sigma/layer_7/attn": 1.1455817222595215, "eoc/jacobian_sigma/layer_7/mlp": 1.771694302558899, "eoc/jacobian_sigma/layer_7": 1.771694302558899, "eoc/jacobian_sigma/layer_14/attn": 1.6267305612564087, "eoc/jacobian_sigma/layer_14/mlp": 6.3061370849609375, "eoc/jacobian_sigma/layer_14": 6.3061370849609375, "eoc/jacobian_sigma/layer_21/attn": 1.094710111618042, "eoc/jacobian_sigma/layer_21/mlp": 4.340605735778809, "eoc/jacobian_sigma/layer_21": 4.340605735778809, "eoc/jacobian_sigma/layer_27/attn": 4.762975692749023, "eoc/jacobian_sigma/layer_27/mlp": 26.71478843688965, "eoc/jacobian_sigma/layer_27": 26.71478843688965, "eoc/layer0_sigma": 8920.240234375, "eoc/sigma_max": 26.71478843688965, "eoc/sigma_min": 1.771694302558899, "eoc/sigma_mean": 9.783306390047073, "eoc/time_s": 0.6723198890686035} {"step": 41010, "timestamp": 1778238868.8893306, "train/loss": 2.2062854766845703, "train/z_loss": 0.0014162318198941647, "train/perplexity": 9.081918662052253, "train/grad_norm": 0.1416015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1150296.995697953, "perf/iters_per_sec": 0.5485043505182042, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.8231395959854126, "data/tokens_consumed": 86006300672, "data/tokens_consumed_B": 86.006300672, "train/loss_slope": 4.0017200095716244e-07} {"step": 41020, "timestamp": 1778238879.7590103, "train/loss": 2.1477198362350465, "train/z_loss": 0.0014117154409177602, "train/perplexity": 8.565305814150452, "train/grad_norm": 0.287109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1930403.844554799, "perf/iters_per_sec": 0.9204882834218974, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0863799333572388, "data/tokens_consumed": 86027272192, "data/tokens_consumed_B": 86.027272192, "train/loss_slope": 7.166639591816145e-07} {"step": 41025, "timestamp": 1778238885.5324237, "eos/sharpness": 57.83438682556151, "eos/L0_probe": 2.002781629562378, "eos/L_plus": 2.335799217224121, "eos/L_minus": 2.24810791015625, "eos/grad_norm": 0.172504723072052, "eos/embed_grad_frac": 0.08579617738723755, "eos/time_s": 0.6112899780273438} {"step": 41025, "timestamp": 1778238886.912557, "geo/rankme_last": 438.794677734375, "geo/layer_0/stable_rank_q_proj": 19.239608764648438, "geo/layer_0/stable_rank_k_proj": 16.359691619873047, "geo/layer_0/stable_rank_o_proj": 49.103179931640625, "geo/layer_0/stable_rank_gate_proj": 137.31739807128906, "geo/layer_0/stable_rank_down_proj": 53.22878646850586, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.059833720326423645, "geo/layer_0/attn_entropy_mean": 6.191027641296387, "geo/layer_0/attn_entropy_std": 0.3741723299026489, "geo/layer_7/stable_rank_q_proj": 42.754032135009766, "geo/layer_7/stable_rank_k_proj": 41.98306655883789, "geo/layer_7/stable_rank_o_proj": 97.9552993774414, "geo/layer_7/stable_rank_gate_proj": 89.6798095703125, "geo/layer_7/stable_rank_down_proj": 146.25543212890625, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.513811469078064, "geo/layer_7/attn_entropy_mean": 4.678668975830078, "geo/layer_7/attn_entropy_std": 0.8358007669448853, "geo/layer_14/stable_rank_q_proj": 54.36558532714844, "geo/layer_14/stable_rank_k_proj": 37.32625198364258, "geo/layer_14/stable_rank_o_proj": 47.72500228881836, "geo/layer_14/stable_rank_gate_proj": 76.44214630126953, "geo/layer_14/stable_rank_down_proj": 132.95843505859375, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39744940400123596, "geo/layer_14/attn_entropy_mean": 5.518877983093262, "geo/layer_14/attn_entropy_std": 0.37532806396484375, "geo/layer_21/stable_rank_q_proj": 43.47303771972656, "geo/layer_21/stable_rank_k_proj": 30.737350463867188, "geo/layer_21/stable_rank_o_proj": 75.83528900146484, "geo/layer_21/stable_rank_gate_proj": 72.50811767578125, "geo/layer_21/stable_rank_down_proj": 55.23933792114258, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14513848721981049, "geo/layer_21/attn_entropy_mean": 5.725671291351318, "geo/layer_21/attn_entropy_std": 0.29038479924201965, "geo/layer_27/stable_rank_q_proj": 42.57293701171875, "geo/layer_27/stable_rank_k_proj": 31.346986770629883, "geo/layer_27/stable_rank_o_proj": 116.0428237915039, "geo/layer_27/stable_rank_gate_proj": 85.09925079345703, "geo/layer_27/stable_rank_down_proj": 131.15426635742188, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08672716468572617, "geo/layer_27/attn_entropy_mean": 4.279875755310059, "geo/layer_27/attn_entropy_std": 0.6691937446594238, "attnres/final_alpha/block_0": 0.24189044535160065, "attnres/block_norm/0": 1.7227380275726318, "attnres/final_alpha/block_1": 0.005262045189738274, "attnres/block_norm/1": 40508.00390625, "attnres/final_alpha/block_2": 0.011024044826626778, "attnres/block_norm/2": 26641.1171875, "attnres/final_alpha/block_3": 0.013056084513664246, "attnres/block_norm/3": 47752.0625, "attnres/final_alpha/block_4": 0.016160478815436363, "attnres/block_norm/4": 12945.20703125, "attnres/final_alpha/block_5": 0.5955177545547485, "attnres/block_norm/5": 6091.07080078125, "attnres/final_alpha/block_6": 0.11708914488554001, "attnres/block_norm/6": 31515.45703125, "geo/tier1_time_s": 1.3617222309112549, "geo/step": 41025.0, "geo/rankme_slope": 5.339049682372949e-05} {"step": 41030, "timestamp": 1778238892.0891826, "train/loss": 2.183772659301758, "train/z_loss": 0.0014088845578953623, "train/perplexity": 8.879743392111099, "train/grad_norm": 0.203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1701667.3304584823, "perf/iters_per_sec": 0.8114182140629207, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2324100971221923, "data/tokens_consumed": 86048243712, "data/tokens_consumed_B": 86.048243712, "train/loss_slope": 2.2579949216159458e-06} {"step": 41040, "timestamp": 1778238902.4428964, "train/loss": 2.194388914108276, "train/z_loss": 0.0013917525648139417, "train/perplexity": 8.974515181207192, "train/grad_norm": 0.1962890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026744.033785346, "perf/iters_per_sec": 0.9664268654753427, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347394466400146, "data/tokens_consumed": 86069215232, "data/tokens_consumed_B": 86.069215232, "train/loss_slope": 5.0768983567973985e-06} {"step": 41050, "timestamp": 1778238912.7765317, "grad/layer_0/attn": 0.002744599711149931, "grad/layer_0/mlp": 0.002703595208004117, "grad/layer_0/attn_mlp_ratio": 1.0151666201758889, "grad/layer_4/attn": 0.0027349486481398344, "grad/layer_4/mlp": 0.0025524820666760206, "grad/layer_4/attn_mlp_ratio": 1.0714858986464268, "grad/layer_8/attn": 0.0037908004596829414, "grad/layer_8/mlp": 0.003620533039793372, "grad/layer_8/attn_mlp_ratio": 1.0470282450996393, "grad/layer_12/attn": 0.0046604410745203495, "grad/layer_12/mlp": 0.007277387194335461, "grad/layer_12/attn_mlp_ratio": 0.640400309345625, "grad/layer_16/attn": 0.006547237280756235, "grad/layer_16/mlp": 0.004438191652297974, "grad/layer_16/attn_mlp_ratio": 1.475203787075278, "grad/layer_20/attn": 0.004470695275813341, "grad/layer_20/mlp": 0.005890377797186375, "grad/layer_20/attn_mlp_ratio": 0.7589827603333964, "grad/layer_24/attn": 0.016742097213864326, "grad/layer_24/mlp": 0.011951372027397156, "grad/layer_24/attn_mlp_ratio": 1.4008514700571477, "grad/layer_27/attn": 0.007463319692760706, "grad/layer_27/mlp": 0.011281123384833336, "grad/layer_27/attn_mlp_ratio": 0.661575923957805} {"step": 41050, "timestamp": 1778238912.7908342, "train/loss": 2.11721875667572, "train/z_loss": 0.0014157663681544363, "train/perplexity": 8.307998759298778, "train/grad_norm": 0.18359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027768.521330915, "perf/iters_per_sec": 0.9669153792051863, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342166662216186, "data/tokens_consumed": 86090186752, "data/tokens_consumed_B": 86.090186752, "train/loss_slope": 8.518078074668902e-07} {"step": 41060, "timestamp": 1778238923.7065773, "train/loss": 2.1380353212356566, "train/z_loss": 0.0014079273445531727, "train/perplexity": 8.482755357286244, "train/grad_norm": 0.091796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1922580.7103694484, "perf/iters_per_sec": 0.9167579223487131, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0908004999160767, "data/tokens_consumed": 86111158272, "data/tokens_consumed_B": 86.111158272, "train/loss_slope": -6.280855412887743e-07} {"step": 41070, "timestamp": 1778238934.0604885, "train/loss": 2.15063898563385, "train/z_loss": 0.0014066681382246316, "train/perplexity": 8.59034575134586, "train/grad_norm": 0.12158203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027141.9877663874, "perf/iters_per_sec": 0.9666166247207582, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03453631401062, "data/tokens_consumed": 86132129792, "data/tokens_consumed_B": 86.132129792, "train/loss_slope": 5.747492593078151e-07} {"step": 41080, "timestamp": 1778238944.7143044, "train/loss": 2.2102877378463743, "train/z_loss": 0.0013918404350988568, "train/perplexity": 9.11833970703866, "train/grad_norm": 0.1123046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1969867.9284724244, "perf/iters_per_sec": 0.9393062250482676, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.064615535736084, "data/tokens_consumed": 86153101312, "data/tokens_consumed_B": 86.153101312, "train/loss_slope": 3.5554503521832086e-06} {"step": 41090, "timestamp": 1778238955.0620701, "train/loss": 2.204831266403198, "train/z_loss": 0.0014134591561742126, "train/perplexity": 9.068721240797894, "train/grad_norm": 0.09228515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028080.2733646221, "perf/iters_per_sec": 0.9670640341590033, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340576887130737, "data/tokens_consumed": 86174072832, "data/tokens_consumed_B": 86.174072832, "train/loss_slope": 7.865747366801347e-06} {"step": 41100, "timestamp": 1778238965.3977783, "grad/layer_0/attn": 0.002941999351605773, "grad/layer_0/mlp": 0.0027780865784734488, "grad/layer_0/attn_mlp_ratio": 1.0590020010543348, "grad/layer_4/attn": 0.002197135705500841, "grad/layer_4/mlp": 0.0024635614827275276, "grad/layer_4/attn_mlp_ratio": 0.8918533723310811, "grad/layer_8/attn": 0.0038280857261270285, "grad/layer_8/mlp": 0.003695972030982375, "grad/layer_8/attn_mlp_ratio": 1.0357452898622204, "grad/layer_12/attn": 0.0047979094088077545, "grad/layer_12/mlp": 0.0069602541625499725, "grad/layer_12/attn_mlp_ratio": 0.6893296175433085, "grad/layer_16/attn": 0.004669153597205877, "grad/layer_16/mlp": 0.004558259155601263, "grad/layer_16/attn_mlp_ratio": 1.024328221671101, "grad/layer_20/attn": 0.006315415725111961, "grad/layer_20/mlp": 0.006907416507601738, "grad/layer_20/attn_mlp_ratio": 0.9142948925596484, "grad/layer_24/attn": 0.01661512441933155, "grad/layer_24/mlp": 0.012248467653989792, "grad/layer_24/attn_mlp_ratio": 1.3565063608808843, "grad/layer_27/attn": 0.004871782381087542, "grad/layer_27/mlp": 0.0124704884365201, "grad/layer_27/attn_mlp_ratio": 0.3906649179637526} {"step": 41100, "timestamp": 1778238966.001566, "eos/sharpness": 64.39397335052489, "eos/L0_probe": 1.9965442419052124, "eos/L_plus": 2.2763967514038086, "eos/L_minus": 2.3606314659118652, "eos/grad_norm": 0.20611344277858734, "eos/embed_grad_frac": 0.05673625320196152, "eos/time_s": 0.6008880138397217} {"step": 41100, "timestamp": 1778238966.0206933, "train/loss": 2.184806740283966, "train/z_loss": 0.0014045632211491465, "train/perplexity": 8.888930515175725, "train/grad_norm": 0.2060546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1914915.2714919916, "perf/iters_per_sec": 0.9131027562580069, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0951669931411743, "data/tokens_consumed": 86195044352, "data/tokens_consumed_B": 86.195044352, "train/loss_slope": 8.922071640032956e-06} {"step": 41100, "timestamp": 1778238967.381803, "geo/rankme_last": 438.70745849609375, "geo/layer_0/stable_rank_q_proj": 19.266366958618164, "geo/layer_0/stable_rank_k_proj": 16.40091896057129, "geo/layer_0/stable_rank_o_proj": 49.08125686645508, "geo/layer_0/stable_rank_gate_proj": 137.3774871826172, "geo/layer_0/stable_rank_down_proj": 53.21832275390625, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05510189011693001, "geo/layer_0/attn_entropy_mean": 6.196813583374023, "geo/layer_0/attn_entropy_std": 0.37259387969970703, "geo/layer_7/stable_rank_q_proj": 42.77228927612305, "geo/layer_7/stable_rank_k_proj": 42.039825439453125, "geo/layer_7/stable_rank_o_proj": 98.0719223022461, "geo/layer_7/stable_rank_gate_proj": 89.44913482666016, "geo/layer_7/stable_rank_down_proj": 146.22512817382812, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5127553343772888, "geo/layer_7/attn_entropy_mean": 4.687071800231934, "geo/layer_7/attn_entropy_std": 0.8231058120727539, "geo/layer_14/stable_rank_q_proj": 54.388648986816406, "geo/layer_14/stable_rank_k_proj": 37.43148422241211, "geo/layer_14/stable_rank_o_proj": 47.67601776123047, "geo/layer_14/stable_rank_gate_proj": 76.41816711425781, "geo/layer_14/stable_rank_down_proj": 133.18472290039062, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38799935579299927, "geo/layer_14/attn_entropy_mean": 5.515944957733154, "geo/layer_14/attn_entropy_std": 0.3741018772125244, "geo/layer_21/stable_rank_q_proj": 43.5118408203125, "geo/layer_21/stable_rank_k_proj": 30.749853134155273, "geo/layer_21/stable_rank_o_proj": 75.84925842285156, "geo/layer_21/stable_rank_gate_proj": 72.58695983886719, "geo/layer_21/stable_rank_down_proj": 55.19602584838867, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14399303495883942, "geo/layer_21/attn_entropy_mean": 5.7329254150390625, "geo/layer_21/attn_entropy_std": 0.29350244998931885, "geo/layer_27/stable_rank_q_proj": 42.641029357910156, "geo/layer_27/stable_rank_k_proj": 31.3575496673584, "geo/layer_27/stable_rank_o_proj": 116.3945083618164, "geo/layer_27/stable_rank_gate_proj": 85.0929183959961, "geo/layer_27/stable_rank_down_proj": 131.16014099121094, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08630932122468948, "geo/layer_27/attn_entropy_mean": 4.2664618492126465, "geo/layer_27/attn_entropy_std": 0.6561599969863892, "attnres/final_alpha/block_0": 0.2426607459783554, "attnres/block_norm/0": 1.7229359149932861, "attnres/final_alpha/block_1": 0.005146347917616367, "attnres/block_norm/1": 40501.20703125, "attnres/final_alpha/block_2": 0.011604277417063713, "attnres/block_norm/2": 26528.56640625, "attnres/final_alpha/block_3": 0.013796606101095676, "attnres/block_norm/3": 47396.9140625, "attnres/final_alpha/block_4": 0.016518037766218185, "attnres/block_norm/4": 12953.650390625, "attnres/final_alpha/block_5": 0.592698335647583, "attnres/block_norm/5": 6068.33203125, "attnres/final_alpha/block_6": 0.11757563054561615, "attnres/block_norm/6": 31640.796875, "geo/tier1_time_s": 1.3569982051849365, "geo/step": 41100.0, "geo/rankme_slope": 1.6258925445178067e-05} {"step": 41110, "timestamp": 1778238977.7310936, "train/loss": 2.249084711074829, "train/z_loss": 0.0013819538988173008, "train/perplexity": 9.479055789803779, "train/grad_norm": 0.11279296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1791431.9911926673, "perf/iters_per_sec": 0.8542213397944771, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1706567764282227, "data/tokens_consumed": 86216015872, "data/tokens_consumed_B": 86.216015872, "train/loss_slope": 1.5629394741842335e-05} {"step": 41120, "timestamp": 1778238988.1129103, "train/loss": 2.1624979257583616, "train/z_loss": 0.0013982690637931229, "train/perplexity": 8.692824591472581, "train/grad_norm": 0.169921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021262.3758116188, "perf/iters_per_sec": 0.9638130072649091, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037545657157898, "data/tokens_consumed": 86236987392, "data/tokens_consumed_B": 86.236987392, "train/loss_slope": 1.4877853156066015e-05} {"step": 41130, "timestamp": 1778238998.493745, "train/loss": 2.1544471263885496, "train/z_loss": 0.0014102336601354182, "train/perplexity": 8.623121364563604, "train/grad_norm": 0.1015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021197.1666424284, "perf/iters_per_sec": 0.9637819131099836, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0375791311264038, "data/tokens_consumed": 86257958912, "data/tokens_consumed_B": 86.257958912, "train/loss_slope": 1.437725882039495e-05} {"step": 41140, "timestamp": 1778239008.872627, "train/loss": 2.1828819036483766, "train/z_loss": 0.0013928479049354792, "train/perplexity": 8.871837232235062, "train/grad_norm": 0.1533203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021623.610486434, "perf/iters_per_sec": 0.9639852573806925, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373602628707885, "data/tokens_consumed": 86278930432, "data/tokens_consumed_B": 86.278930432, "train/loss_slope": 1.7325314451115206e-05} {"step": 41150, "timestamp": 1778239019.2465236, "grad/layer_0/attn": 0.0027541767340153456, "grad/layer_0/mlp": 0.002952429698780179, "grad/layer_0/attn_mlp_ratio": 0.9328508793513928, "grad/layer_4/attn": 0.0020600352436304092, "grad/layer_4/mlp": 0.002572273137047887, "grad/layer_4/attn_mlp_ratio": 0.8008617490397886, "grad/layer_8/attn": 0.010147711262106895, "grad/layer_8/mlp": 0.0038296866696327925, "grad/layer_8/attn_mlp_ratio": 2.6497496721069704, "grad/layer_12/attn": 0.004968344233930111, "grad/layer_12/mlp": 0.007094061933457851, "grad/layer_12/attn_mlp_ratio": 0.7003525216579473, "grad/layer_16/attn": 0.006712615955621004, "grad/layer_16/mlp": 0.004906745161861181, "grad/layer_16/attn_mlp_ratio": 1.3680384037452227, "grad/layer_20/attn": 0.0032105185091495514, "grad/layer_20/mlp": 0.006136145908385515, "grad/layer_20/attn_mlp_ratio": 0.5232141648458383, "grad/layer_24/attn": 0.008849112316966057, "grad/layer_24/mlp": 0.008233406580984592, "grad/layer_24/attn_mlp_ratio": 1.0747813948512301, "grad/layer_27/attn": 0.005571797490119934, "grad/layer_27/mlp": 0.0066172280348837376, "grad/layer_27/attn_mlp_ratio": 0.8420138125127266} {"step": 41150, "timestamp": 1778239019.2611737, "train/loss": 2.2035495758056642, "train/z_loss": 0.0013988315127789973, "train/perplexity": 9.057105391604397, "train/grad_norm": 0.08935546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019878.3277770428, "perf/iters_per_sec": 0.9631530417332853, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0382565975189209, "data/tokens_consumed": 86299901952, "data/tokens_consumed_B": 86.299901952, "train/loss_slope": 1.961866203385933e-05} {"step": 41160, "timestamp": 1778239029.648351, "train/loss": 2.162282371520996, "train/z_loss": 0.0014119391213171183, "train/perplexity": 8.690951018232793, "train/grad_norm": 0.1318359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020231.690335657, "perf/iters_per_sec": 0.9633215381315503, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0380749940872191, "data/tokens_consumed": 86320873472, "data/tokens_consumed_B": 86.320873472, "train/loss_slope": 1.968450769446756e-05} {"step": 41170, "timestamp": 1778239040.021545, "train/loss": 2.1589990615844727, "train/z_loss": 0.0014205114683136344, "train/perplexity": 8.662462725935653, "train/grad_norm": 0.1826171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022696.0892881646, "perf/iters_per_sec": 0.9644966551247428, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0368102312088012, "data/tokens_consumed": 86341844992, "data/tokens_consumed_B": 86.341844992, "train/loss_slope": 1.7811515472187768e-05} {"step": 41175, "timestamp": 1778239045.7865727, "eos/sharpness": 32.89277553558349, "eos/L0_probe": 1.998353362083435, "eos/L_plus": 2.176670551300049, "eos/L_minus": 2.1489639282226562, "eos/grad_norm": 0.14504404366016388, "eos/embed_grad_frac": 0.11132248491048813, "eos/time_s": 0.5877907276153564} {"step": 41175, "timestamp": 1778239047.1602602, "geo/rankme_last": 439.6669921875, "geo/layer_0/stable_rank_q_proj": 19.25999641418457, "geo/layer_0/stable_rank_k_proj": 16.423234939575195, "geo/layer_0/stable_rank_o_proj": 49.07472610473633, "geo/layer_0/stable_rank_gate_proj": 137.1356658935547, "geo/layer_0/stable_rank_down_proj": 53.18047332763672, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05989181250333786, "geo/layer_0/attn_entropy_mean": 6.1929521560668945, "geo/layer_0/attn_entropy_std": 0.3721947968006134, "geo/layer_7/stable_rank_q_proj": 42.827823638916016, "geo/layer_7/stable_rank_k_proj": 42.054012298583984, "geo/layer_7/stable_rank_o_proj": 98.04116821289062, "geo/layer_7/stable_rank_gate_proj": 89.40306091308594, "geo/layer_7/stable_rank_down_proj": 146.1005096435547, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.514478325843811, "geo/layer_7/attn_entropy_mean": 4.681354522705078, "geo/layer_7/attn_entropy_std": 0.8286541700363159, "geo/layer_14/stable_rank_q_proj": 54.386863708496094, "geo/layer_14/stable_rank_k_proj": 37.323829650878906, "geo/layer_14/stable_rank_o_proj": 47.66452407836914, "geo/layer_14/stable_rank_gate_proj": 76.47758483886719, "geo/layer_14/stable_rank_down_proj": 132.96551513671875, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.40084999799728394, "geo/layer_14/attn_entropy_mean": 5.542512893676758, "geo/layer_14/attn_entropy_std": 0.36206790804862976, "geo/layer_21/stable_rank_q_proj": 43.46250534057617, "geo/layer_21/stable_rank_k_proj": 30.7747859954834, "geo/layer_21/stable_rank_o_proj": 75.8794937133789, "geo/layer_21/stable_rank_gate_proj": 72.61481475830078, "geo/layer_21/stable_rank_down_proj": 55.1161994934082, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14626510441303253, "geo/layer_21/attn_entropy_mean": 5.710660934448242, "geo/layer_21/attn_entropy_std": 0.2907075583934784, "geo/layer_27/stable_rank_q_proj": 42.700653076171875, "geo/layer_27/stable_rank_k_proj": 31.39685821533203, "geo/layer_27/stable_rank_o_proj": 116.42634582519531, "geo/layer_27/stable_rank_gate_proj": 85.1666030883789, "geo/layer_27/stable_rank_down_proj": 131.0372772216797, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08185523748397827, "geo/layer_27/attn_entropy_mean": 4.25557804107666, "geo/layer_27/attn_entropy_std": 0.6589297652244568, "attnres/final_alpha/block_0": 0.24207690358161926, "attnres/block_norm/0": 1.723384141921997, "attnres/final_alpha/block_1": 0.005175474099814892, "attnres/block_norm/1": 40489.828125, "attnres/final_alpha/block_2": 0.011310738511383533, "attnres/block_norm/2": 26620.68359375, "attnres/final_alpha/block_3": 0.013533690944314003, "attnres/block_norm/3": 47494.90234375, "attnres/final_alpha/block_4": 0.01621444895863533, "attnres/block_norm/4": 13000.39453125, "attnres/final_alpha/block_5": 0.5961343050003052, "attnres/block_norm/5": 6032.4970703125, "attnres/final_alpha/block_6": 0.11555443704128265, "attnres/block_norm/6": 31615.57421875, "geo/tier1_time_s": 1.354762315750122, "geo/step": 41175.0, "geo/rankme_slope": 3.605834912089835e-05} {"step": 41180, "timestamp": 1778239052.3533022, "train/loss": 2.165202784538269, "train/z_loss": 0.0014054925879463554, "train/perplexity": 8.71636928256831, "train/grad_norm": 0.15625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1701273.863146885, "perf/iters_per_sec": 0.8112305942282129, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2326951265335082, "data/tokens_consumed": 86362816512, "data/tokens_consumed_B": 86.362816512, "train/loss_slope": 1.5979449183169037e-05} {"step": 41190, "timestamp": 1778239062.743148, "train/loss": 2.132882845401764, "train/z_loss": 0.0014055404928512872, "train/perplexity": 8.43916057228728, "train/grad_norm": 0.126953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019490.8242038973, "perf/iters_per_sec": 0.9629682656306731, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0384558200836183, "data/tokens_consumed": 86383788032, "data/tokens_consumed_B": 86.383788032, "train/loss_slope": 1.3862681989729891e-05} {"step": 41200, "timestamp": 1778239073.1139395, "grad/layer_0/attn": 0.0026593112852424383, "grad/layer_0/mlp": 0.0027348273433744907, "grad/layer_0/attn_mlp_ratio": 0.9723872311158055, "grad/layer_4/attn": 0.0021075312979519367, "grad/layer_4/mlp": 0.002529023913666606, "grad/layer_4/attn_mlp_ratio": 0.8333377961470659, "grad/layer_8/attn": 0.004729993175715208, "grad/layer_8/mlp": 0.003533909097313881, "grad/layer_8/attn_mlp_ratio": 1.3384591713082126, "grad/layer_12/attn": 0.004572248086333275, "grad/layer_12/mlp": 0.00677866954356432, "grad/layer_12/attn_mlp_ratio": 0.6745052239968907, "grad/layer_16/attn": 0.004216158762574196, "grad/layer_16/mlp": 0.004588983952999115, "grad/layer_16/attn_mlp_ratio": 0.9187564641499985, "grad/layer_20/attn": 0.003688688622787595, "grad/layer_20/mlp": 0.006958189886063337, "grad/layer_20/attn_mlp_ratio": 0.5301218607390319, "grad/layer_24/attn": 0.024828346446156502, "grad/layer_24/mlp": 0.013671363703906536, "grad/layer_24/attn_mlp_ratio": 1.8160841012117537, "grad/layer_27/attn": 0.006406251806765795, "grad/layer_27/mlp": 0.013419106602668762, "grad/layer_27/attn_mlp_ratio": 0.47739778427216273} {"step": 41200, "timestamp": 1778239073.1281867, "train/loss": 2.179453158378601, "train/z_loss": 0.0014098037034273147, "train/perplexity": 8.841470052702858, "train/grad_norm": 0.2578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020336.1872327449, "perf/iters_per_sec": 0.9633713661349987, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0380213022232057, "data/tokens_consumed": 86404759552, "data/tokens_consumed_B": 86.404759552, "train/loss_slope": 1.4248570028168274e-05} {"step": 41210, "timestamp": 1778239083.5148923, "train/loss": 2.217334818840027, "train/z_loss": 0.001398170366883278, "train/perplexity": 9.182824332806202, "train/grad_norm": 0.2099609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020473.2745982728, "perf/iters_per_sec": 0.9634367344848026, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037950873374939, "data/tokens_consumed": 86425731072, "data/tokens_consumed_B": 86.425731072, "train/loss_slope": 1.5891396287608503e-05} {"step": 41220, "timestamp": 1778239093.892293, "train/loss": 2.182443952560425, "train/z_loss": 0.0014009104110300542, "train/perplexity": 8.867952652157198, "train/grad_norm": 0.2001953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021861.670406014, "perf/iters_per_sec": 0.964098773196227, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0372381210327148, "data/tokens_consumed": 86446702592, "data/tokens_consumed_B": 86.446702592, "train/loss_slope": 1.5518647544991335e-05} {"step": 41230, "timestamp": 1778239104.2857757, "train/loss": 2.135351574420929, "train/z_loss": 0.0014074160251766443, "train/perplexity": 8.460020310816045, "train/grad_norm": 0.16796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2018738.0845182717, "perf/iters_per_sec": 0.9626093313781127, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.038843035697937, "data/tokens_consumed": 86467674112, "data/tokens_consumed_B": 86.467674112, "train/loss_slope": 1.4138272879469223e-05} {"step": 41240, "timestamp": 1778239114.6673265, "train/loss": 2.215224003791809, "train/z_loss": 0.0013821286847814918, "train/perplexity": 9.163461531854496, "train/grad_norm": 0.0986328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021085.6612813896, "perf/iters_per_sec": 0.9637287432105015, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0376363754272462, "data/tokens_consumed": 86488645632, "data/tokens_consumed_B": 86.488645632, "train/loss_slope": 1.7066272836599954e-05} {"step": 41250, "timestamp": 1778239125.0411072, "grad/layer_0/attn": 0.002709053922444582, "grad/layer_0/mlp": 0.002857708605006337, "grad/layer_0/attn_mlp_ratio": 0.9479811282719854, "grad/layer_4/attn": 0.0034107742831110954, "grad/layer_4/mlp": 0.002446278464049101, "grad/layer_4/attn_mlp_ratio": 1.394270600755113, "grad/layer_8/attn": 0.003377610817551613, "grad/layer_8/mlp": 0.003529950510710478, "grad/layer_8/attn_mlp_ratio": 0.956843647416301, "grad/layer_12/attn": 0.0055874926038086414, "grad/layer_12/mlp": 0.006698831915855408, "grad/layer_12/attn_mlp_ratio": 0.8340995251983709, "grad/layer_16/attn": 0.004371319897472858, "grad/layer_16/mlp": 0.0043298122473061085, "grad/layer_16/attn_mlp_ratio": 1.0095864547553834, "grad/layer_20/attn": 0.005135081708431244, "grad/layer_20/mlp": 0.005654544569551945, "grad/layer_20/attn_mlp_ratio": 0.9081335471770423, "grad/layer_24/attn": 0.008909633383154869, "grad/layer_24/mlp": 0.009176300838589668, "grad/layer_24/attn_mlp_ratio": 0.9709395368330429, "grad/layer_27/attn": 0.004765110556036234, "grad/layer_27/mlp": 0.00926384050399065, "grad/layer_27/attn_mlp_ratio": 0.5143774336945666} {"step": 41250, "timestamp": 1778239125.6307251, "eos/sharpness": 26.83751583099365, "eos/L0_probe": 1.9925410747528076, "eos/L_plus": 2.132502317428589, "eos/L_minus": 2.120954990386963, "eos/grad_norm": 0.11420290172100067, "eos/embed_grad_frac": 0.19388815760612488, "eos/time_s": 0.58673095703125} {"step": 41250, "timestamp": 1778239125.648713, "train/loss": 2.1808300018310547, "train/z_loss": 0.001407444290816784, "train/perplexity": 8.853651757080542, "train/grad_norm": 0.1142578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1910589.7382971535, "perf/iters_per_sec": 0.911040181301667, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.097646427154541, "data/tokens_consumed": 86509617152, "data/tokens_consumed_B": 86.509617152, "train/loss_slope": 1.4933051220332803e-05} {"step": 41250, "timestamp": 1778239127.0103846, "geo/rankme_last": 440.196044921875, "geo/layer_0/stable_rank_q_proj": 19.266141891479492, "geo/layer_0/stable_rank_k_proj": 16.419919967651367, "geo/layer_0/stable_rank_o_proj": 48.98670959472656, "geo/layer_0/stable_rank_gate_proj": 137.14012145996094, "geo/layer_0/stable_rank_down_proj": 53.137916564941406, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.060657672584056854, "geo/layer_0/attn_entropy_mean": 6.193787574768066, "geo/layer_0/attn_entropy_std": 0.37004414200782776, "geo/layer_7/stable_rank_q_proj": 42.8216552734375, "geo/layer_7/stable_rank_k_proj": 42.167442321777344, "geo/layer_7/stable_rank_o_proj": 98.07452392578125, "geo/layer_7/stable_rank_gate_proj": 89.35269165039062, "geo/layer_7/stable_rank_down_proj": 146.30975341796875, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.49748924374580383, "geo/layer_7/attn_entropy_mean": 4.670055389404297, "geo/layer_7/attn_entropy_std": 0.8187623620033264, "geo/layer_14/stable_rank_q_proj": 54.315887451171875, "geo/layer_14/stable_rank_k_proj": 37.28459930419922, "geo/layer_14/stable_rank_o_proj": 47.70817947387695, "geo/layer_14/stable_rank_gate_proj": 76.45315551757812, "geo/layer_14/stable_rank_down_proj": 133.14569091796875, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.390267550945282, "geo/layer_14/attn_entropy_mean": 5.517693519592285, "geo/layer_14/attn_entropy_std": 0.38405704498291016, "geo/layer_21/stable_rank_q_proj": 43.25886154174805, "geo/layer_21/stable_rank_k_proj": 30.818334579467773, "geo/layer_21/stable_rank_o_proj": 75.85026550292969, "geo/layer_21/stable_rank_gate_proj": 72.55607604980469, "geo/layer_21/stable_rank_down_proj": 55.13310241699219, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1462620496749878, "geo/layer_21/attn_entropy_mean": 5.71578311920166, "geo/layer_21/attn_entropy_std": 0.28838375210762024, "geo/layer_27/stable_rank_q_proj": 42.67707061767578, "geo/layer_27/stable_rank_k_proj": 31.511127471923828, "geo/layer_27/stable_rank_o_proj": 116.34820556640625, "geo/layer_27/stable_rank_gate_proj": 85.07288360595703, "geo/layer_27/stable_rank_down_proj": 131.018310546875, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08478404581546783, "geo/layer_27/attn_entropy_mean": 4.271733283996582, "geo/layer_27/attn_entropy_std": 0.6854239702224731, "attnres/final_alpha/block_0": 0.24106036126613617, "attnres/block_norm/0": 1.7236549854278564, "attnres/final_alpha/block_1": 0.005134339444339275, "attnres/block_norm/1": 40546.859375, "attnres/final_alpha/block_2": 0.011222891509532928, "attnres/block_norm/2": 26576.328125, "attnres/final_alpha/block_3": 0.013256557285785675, "attnres/block_norm/3": 48121.83203125, "attnres/final_alpha/block_4": 0.01605592481791973, "attnres/block_norm/4": 12997.5458984375, "attnres/final_alpha/block_5": 0.5995279550552368, "attnres/block_norm/5": 5983.57275390625, "attnres/final_alpha/block_6": 0.11374196410179138, "attnres/block_norm/6": 31673.595703125, "geo/tier1_time_s": 1.3581790924072266, "geo/step": 41250.0, "geo/rankme_slope": 5.994100765306122e-05} {"step": 41260, "timestamp": 1778239137.3891225, "train/loss": 2.163528847694397, "train/z_loss": 0.0014151764800772071, "train/perplexity": 8.701790835985001, "train/grad_norm": 0.130859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1786766.6510494838, "perf/iters_per_sec": 0.8519967322585505, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1737134218215943, "data/tokens_consumed": 86530588672, "data/tokens_consumed_B": 86.530588672, "train/loss_slope": 1.2302104775125223e-05} {"step": 41270, "timestamp": 1778239147.7619004, "train/loss": 2.1626949071884156, "train/z_loss": 0.0014092232100665568, "train/perplexity": 8.694537085151007, "train/grad_norm": 0.201171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022906.2089335525, "perf/iters_per_sec": 0.9645968479793322, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036702537536621, "data/tokens_consumed": 86551560192, "data/tokens_consumed_B": 86.551560192, "train/loss_slope": 9.222577535959514e-06} {"step": 41280, "timestamp": 1778239158.1457658, "train/loss": 2.2195348501205445, "train/z_loss": 0.0013914270442910493, "train/perplexity": 9.203049072954814, "train/grad_norm": 0.14453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020721.5085376534, "perf/iters_per_sec": 0.9635551016510264, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0378233671188355, "data/tokens_consumed": 86572531712, "data/tokens_consumed_B": 86.572531712, "train/loss_slope": 1.188115271011206e-05} {"step": 41290, "timestamp": 1778239168.5268471, "train/loss": 2.1905827045440676, "train/z_loss": 0.0013918731361627578, "train/perplexity": 8.940421221229201, "train/grad_norm": 0.2353515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021204.7834508554, "perf/iters_per_sec": 0.9637855450872685, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0375752210617066, "data/tokens_consumed": 86593503232, "data/tokens_consumed_B": 86.593503232, "train/loss_slope": 1.3260202854201195e-05} {"step": 41300, "timestamp": 1778239178.8958066, "grad/layer_0/attn": 0.0037924679927527905, "grad/layer_0/mlp": 0.0031483906786888838, "grad/layer_0/attn_mlp_ratio": 1.2045734660460816, "grad/layer_4/attn": 0.001718559768050909, "grad/layer_4/mlp": 0.0024985880590975285, "grad/layer_4/attn_mlp_ratio": 0.6878123398582182, "grad/layer_8/attn": 0.0037415758706629276, "grad/layer_8/mlp": 0.003559176344424486, "grad/layer_8/attn_mlp_ratio": 1.0512476493049883, "grad/layer_12/attn": 0.004325740970671177, "grad/layer_12/mlp": 0.006603800691664219, "grad/layer_12/attn_mlp_ratio": 0.655038076880125, "grad/layer_16/attn": 0.0050298902206122875, "grad/layer_16/mlp": 0.00474198954179883, "grad/layer_16/attn_mlp_ratio": 1.060713034097697, "grad/layer_20/attn": 0.0035919188521802425, "grad/layer_20/mlp": 0.006237312685698271, "grad/layer_20/attn_mlp_ratio": 0.5758760183417875, "grad/layer_24/attn": 0.011981120333075523, "grad/layer_24/mlp": 0.010584264062345028, "grad/layer_24/attn_mlp_ratio": 1.131974802339118, "grad/layer_27/attn": 0.008028967306017876, "grad/layer_27/mlp": 0.0085495850071311, "grad/layer_27/attn_mlp_ratio": 0.9391060742024799} {"step": 41300, "timestamp": 1778239178.9100592, "train/loss": 2.1936708450317384, "train/z_loss": 0.00139889131532982, "train/perplexity": 8.968073172559052, "train/grad_norm": 0.1611328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020684.6502041419, "perf/iters_per_sec": 0.9635375262280187, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037842297554016, "data/tokens_consumed": 86614474752, "data/tokens_consumed_B": 86.614474752, "train/loss_slope": 1.2278085837949394e-05} {"step": 41310, "timestamp": 1778239189.3090377, "train/loss": 2.189531183242798, "train/z_loss": 0.0013922951882705092, "train/perplexity": 8.931025118839433, "train/grad_norm": 0.1630859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2018244.272296178, "perf/iters_per_sec": 0.9623738633614435, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.039097213745117, "data/tokens_consumed": 86635446272, "data/tokens_consumed_B": 86.635446272, "train/loss_slope": 1.006045291418796e-05} {"step": 41320, "timestamp": 1778239199.6904414, "train/loss": 2.1829034805297853, "train/z_loss": 0.0014228549669496715, "train/perplexity": 8.872028660880108, "train/grad_norm": 0.2236328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021623.3781695666, "perf/iters_per_sec": 0.9639851466033776, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037360382080078, "data/tokens_consumed": 86656417792, "data/tokens_consumed_B": 86.656417792, "train/loss_slope": 1.2601372947429709e-05} {"step": 41325, "timestamp": 1778239205.4599388, "eos/sharpness": 39.40484523773193, "eos/L0_probe": 1.9952012300491333, "eos/L_plus": 2.1432902812957764, "eos/L_minus": 2.2411606311798096, "eos/grad_norm": 0.13000409305095673, "eos/embed_grad_frac": 0.13606764376163483, "eos/time_s": 0.5912067890167236} {"step": 41325, "timestamp": 1778239206.8364964, "geo/rankme_last": 439.6248474121094, "geo/layer_0/stable_rank_q_proj": 19.267318725585938, "geo/layer_0/stable_rank_k_proj": 16.386037826538086, "geo/layer_0/stable_rank_o_proj": 48.9776611328125, "geo/layer_0/stable_rank_gate_proj": 137.07398986816406, "geo/layer_0/stable_rank_down_proj": 53.10319900512695, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06152660772204399, "geo/layer_0/attn_entropy_mean": 6.19612979888916, "geo/layer_0/attn_entropy_std": 0.37121886014938354, "geo/layer_7/stable_rank_q_proj": 42.77592086791992, "geo/layer_7/stable_rank_k_proj": 42.294189453125, "geo/layer_7/stable_rank_o_proj": 98.01994323730469, "geo/layer_7/stable_rank_gate_proj": 89.2000732421875, "geo/layer_7/stable_rank_down_proj": 146.72222900390625, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.49834510684013367, "geo/layer_7/attn_entropy_mean": 4.6602463722229, "geo/layer_7/attn_entropy_std": 0.8144068717956543, "geo/layer_14/stable_rank_q_proj": 54.31815719604492, "geo/layer_14/stable_rank_k_proj": 37.446407318115234, "geo/layer_14/stable_rank_o_proj": 47.725643157958984, "geo/layer_14/stable_rank_gate_proj": 76.34510803222656, "geo/layer_14/stable_rank_down_proj": 132.7936553955078, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39958783984184265, "geo/layer_14/attn_entropy_mean": 5.543517112731934, "geo/layer_14/attn_entropy_std": 0.3701237440109253, "geo/layer_21/stable_rank_q_proj": 43.25855255126953, "geo/layer_21/stable_rank_k_proj": 30.832233428955078, "geo/layer_21/stable_rank_o_proj": 75.80538940429688, "geo/layer_21/stable_rank_gate_proj": 72.552001953125, "geo/layer_21/stable_rank_down_proj": 55.17439270019531, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1448884904384613, "geo/layer_21/attn_entropy_mean": 5.726449012756348, "geo/layer_21/attn_entropy_std": 0.29124364256858826, "geo/layer_27/stable_rank_q_proj": 42.64166259765625, "geo/layer_27/stable_rank_k_proj": 31.542686462402344, "geo/layer_27/stable_rank_o_proj": 116.33958435058594, "geo/layer_27/stable_rank_gate_proj": 85.06732940673828, "geo/layer_27/stable_rank_down_proj": 130.7197723388672, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08674494177103043, "geo/layer_27/attn_entropy_mean": 4.267165184020996, "geo/layer_27/attn_entropy_std": 0.6686947345733643, "attnres/final_alpha/block_0": 0.2422763556241989, "attnres/block_norm/0": 1.7237818241119385, "attnres/final_alpha/block_1": 0.005249295383691788, "attnres/block_norm/1": 40614.49609375, "attnres/final_alpha/block_2": 0.011197203770279884, "attnres/block_norm/2": 26537.498046875, "attnres/final_alpha/block_3": 0.013075204566121101, "attnres/block_norm/3": 47445.0703125, "attnres/final_alpha/block_4": 0.01589115336537361, "attnres/block_norm/4": 13043.703125, "attnres/final_alpha/block_5": 0.5956768989562988, "attnres/block_norm/5": 6074.265625, "attnres/final_alpha/block_6": 0.11663384735584259, "attnres/block_norm/6": 31734.30859375, "geo/tier1_time_s": 1.3577535152435303, "geo/step": 41325.0, "geo/rankme_slope": 5.809399931847739e-05} {"step": 41330, "timestamp": 1778239212.0253272, "train/loss": 2.1604435443878174, "train/z_loss": 0.0014073360594920813, "train/perplexity": 8.674984545977475, "train/grad_norm": 0.1357421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1700875.5127774118, "perf/iters_per_sec": 0.8110406459700641, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2329838275909424, "data/tokens_consumed": 86677389312, "data/tokens_consumed_B": 86.677389312, "train/loss_slope": 8.421576048138333e-06} {"step": 41340, "timestamp": 1778239222.4058309, "train/loss": 2.122640585899353, "train/z_loss": 0.001404611871112138, "train/perplexity": 8.353165642681354, "train/grad_norm": 0.21875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021521.4891386542, "perf/iters_per_sec": 0.9639365621274253, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037412667274475, "data/tokens_consumed": 86698360832, "data/tokens_consumed_B": 86.698360832, "train/loss_slope": 6.479265537485209e-06} {"step": 41350, "timestamp": 1778239232.7767913, "grad/layer_0/attn": 0.002391152549535036, "grad/layer_0/mlp": 0.0026635746471583843, "grad/layer_0/attn_mlp_ratio": 0.8977230888999905, "grad/layer_4/attn": 0.0035377335734665394, "grad/layer_4/mlp": 0.0025791171938180923, "grad/layer_4/attn_mlp_ratio": 1.371683863291585, "grad/layer_8/attn": 0.005896365735679865, "grad/layer_8/mlp": 0.0035547094885259867, "grad/layer_8/attn_mlp_ratio": 1.658747526017978, "grad/layer_12/attn": 0.0042654974386096, "grad/layer_12/mlp": 0.006424427963793278, "grad/layer_12/attn_mlp_ratio": 0.6639497549437972, "grad/layer_16/attn": 0.0036124109756201506, "grad/layer_16/mlp": 0.004525449126958847, "grad/layer_16/attn_mlp_ratio": 0.7982436205670861, "grad/layer_20/attn": 0.006945968139916658, "grad/layer_20/mlp": 0.006212811451405287, "grad/layer_20/attn_mlp_ratio": 1.1180072150016425, "grad/layer_24/attn": 0.015687348321080208, "grad/layer_24/mlp": 0.010763198137283325, "grad/layer_24/attn_mlp_ratio": 1.4574987819828318, "grad/layer_27/attn": 0.005738893058151007, "grad/layer_27/mlp": 0.010284306481480598, "grad/layer_27/attn_mlp_ratio": 0.5580243074905296} {"step": 41350, "timestamp": 1778239232.7911892, "train/loss": 2.16582453250885, "train/z_loss": 0.0014156632823869586, "train/perplexity": 8.721790352575587, "train/grad_norm": 0.1748046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020320.5955353363, "perf/iters_per_sec": 0.9633639314343149, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0380293130874634, "data/tokens_consumed": 86719332352, "data/tokens_consumed_B": 86.719332352, "train/loss_slope": 9.183870991393037e-06} {"step": 41360, "timestamp": 1778239243.1707234, "train/loss": 2.146078372001648, "train/z_loss": 0.001408318686299026, "train/perplexity": 8.55125770389794, "train/grad_norm": 0.15234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021771.328360509, "perf/iters_per_sec": 0.9640556947519822, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0372844696044923, "data/tokens_consumed": 86740303872, "data/tokens_consumed_B": 86.740303872, "train/loss_slope": 7.85628262132707e-06} {"step": 41370, "timestamp": 1778239253.5475266, "train/loss": 2.137369382381439, "train/z_loss": 0.0014085477334447206, "train/perplexity": 8.477108241428633, "train/grad_norm": 0.146484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022471.1324288412, "perf/iters_per_sec": 0.9643893873352247, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0369255542755127, "data/tokens_consumed": 86761275392, "data/tokens_consumed_B": 86.761275392, "train/loss_slope": 4.1524972995766635e-06} {"step": 41380, "timestamp": 1778239263.9244125, "train/loss": 2.137723708152771, "train/z_loss": 0.0014034477411769331, "train/perplexity": 8.480112431544502, "train/grad_norm": 0.109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022351.1634967176, "perf/iters_per_sec": 0.9643321816905582, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036987066268921, "data/tokens_consumed": 86782246912, "data/tokens_consumed_B": 86.782246912, "train/loss_slope": 1.986766376546908e-06} {"step": 41390, "timestamp": 1778239274.3058615, "train/loss": 2.1400222539901734, "train/z_loss": 0.001403072173707187, "train/perplexity": 8.499626777392242, "train/grad_norm": 0.212890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021410.2731662577, "perf/iters_per_sec": 0.9638835302192009, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037469744682312, "data/tokens_consumed": 86803218432, "data/tokens_consumed_B": 86.803218432, "train/loss_slope": -1.7516045680533558e-06} {"step": 41400, "timestamp": 1778239284.6789715, "grad/layer_0/attn": 0.003159497631713748, "grad/layer_0/mlp": 0.0027869476471096277, "grad/layer_0/attn_mlp_ratio": 1.1336766665218232, "grad/layer_4/attn": 0.0017856457270681858, "grad/layer_4/mlp": 0.0024832026101648808, "grad/layer_4/attn_mlp_ratio": 0.7190897947069419, "grad/layer_8/attn": 0.006819851230829954, "grad/layer_8/mlp": 0.0034833133686333895, "grad/layer_8/attn_mlp_ratio": 1.9578631932616706, "grad/layer_12/attn": 0.004130818881094456, "grad/layer_12/mlp": 0.005988787859678268, "grad/layer_12/attn_mlp_ratio": 0.6897587473302984, "grad/layer_16/attn": 0.0049475072883069515, "grad/layer_16/mlp": 0.0045831105671823025, "grad/layer_16/attn_mlp_ratio": 1.0795085799987196, "grad/layer_20/attn": 0.00399665767326951, "grad/layer_20/mlp": 0.006256808061152697, "grad/layer_20/attn_mlp_ratio": 0.6387694125071598, "grad/layer_24/attn": 0.004343434702605009, "grad/layer_24/mlp": 0.007671830710023642, "grad/layer_24/attn_mlp_ratio": 0.5661536092441048, "grad/layer_27/attn": 0.006832872051745653, "grad/layer_27/mlp": 0.006477419286966324, "grad/layer_27/attn_mlp_ratio": 1.0548756601270193} {"step": 41400, "timestamp": 1778239285.2757738, "eos/sharpness": 15.784931182861325, "eos/L0_probe": 1.994607925415039, "eos/L_plus": 2.0869457721710205, "eos/L_minus": 2.060119390487671, "eos/grad_norm": 0.09452495723962784, "eos/embed_grad_frac": 0.2546534538269043, "eos/time_s": 0.5941150188446045} {"step": 41400, "timestamp": 1778239285.2957416, "train/loss": 2.2226011991500854, "train/z_loss": 0.0013970379251986742, "train/perplexity": 9.23131214362286, "train/grad_norm": 0.09423828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1909448.3899875542, "perf/iters_per_sec": 0.9104959440171977, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0983025312423706, "data/tokens_consumed": 86824189952, "data/tokens_consumed_B": 86.824189952, "train/loss_slope": -1.6843561935882417e-06} {"step": 41400, "timestamp": 1778239286.657804, "geo/rankme_last": 439.8121032714844, "geo/layer_0/stable_rank_q_proj": 19.28328514099121, "geo/layer_0/stable_rank_k_proj": 16.405317306518555, "geo/layer_0/stable_rank_o_proj": 48.97125244140625, "geo/layer_0/stable_rank_gate_proj": 136.9997100830078, "geo/layer_0/stable_rank_down_proj": 53.082763671875, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05849329009652138, "geo/layer_0/attn_entropy_mean": 6.19500732421875, "geo/layer_0/attn_entropy_std": 0.37147772312164307, "geo/layer_7/stable_rank_q_proj": 42.79606628417969, "geo/layer_7/stable_rank_k_proj": 42.198951721191406, "geo/layer_7/stable_rank_o_proj": 98.03194427490234, "geo/layer_7/stable_rank_gate_proj": 89.07898712158203, "geo/layer_7/stable_rank_down_proj": 146.80738830566406, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.49374836683273315, "geo/layer_7/attn_entropy_mean": 4.678735733032227, "geo/layer_7/attn_entropy_std": 0.8052682280540466, "geo/layer_14/stable_rank_q_proj": 54.2760124206543, "geo/layer_14/stable_rank_k_proj": 37.418399810791016, "geo/layer_14/stable_rank_o_proj": 47.73528289794922, "geo/layer_14/stable_rank_gate_proj": 76.33563232421875, "geo/layer_14/stable_rank_down_proj": 132.9222869873047, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4031393527984619, "geo/layer_14/attn_entropy_mean": 5.520831108093262, "geo/layer_14/attn_entropy_std": 0.3642938435077667, "geo/layer_21/stable_rank_q_proj": 43.368988037109375, "geo/layer_21/stable_rank_k_proj": 30.705610275268555, "geo/layer_21/stable_rank_o_proj": 75.72330474853516, "geo/layer_21/stable_rank_gate_proj": 72.55519104003906, "geo/layer_21/stable_rank_down_proj": 55.08643341064453, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14350371062755585, "geo/layer_21/attn_entropy_mean": 5.709700107574463, "geo/layer_21/attn_entropy_std": 0.2935914993286133, "geo/layer_27/stable_rank_q_proj": 42.63911056518555, "geo/layer_27/stable_rank_k_proj": 31.542686462402344, "geo/layer_27/stable_rank_o_proj": 116.33638000488281, "geo/layer_27/stable_rank_gate_proj": 85.13631439208984, "geo/layer_27/stable_rank_down_proj": 130.6923828125, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07891029119491577, "geo/layer_27/attn_entropy_mean": 4.235939979553223, "geo/layer_27/attn_entropy_std": 0.654741108417511, "attnres/final_alpha/block_0": 0.24131911993026733, "attnres/block_norm/0": 1.723983645439148, "attnres/final_alpha/block_1": 0.005060551688075066, "attnres/block_norm/1": 40643.48828125, "attnres/final_alpha/block_2": 0.011255140416324139, "attnres/block_norm/2": 26561.70703125, "attnres/final_alpha/block_3": 0.013283143751323223, "attnres/block_norm/3": 47664.62890625, "attnres/final_alpha/block_4": 0.01583685167133808, "attnres/block_norm/4": 12989.3125, "attnres/final_alpha/block_5": 0.5990062355995178, "attnres/block_norm/5": 6026.80615234375, "attnres/final_alpha/block_6": 0.1142389252781868, "attnres/block_norm/6": 31831.20703125, "geo/tier1_time_s": 1.3579399585723877, "geo/step": 41400.0, "geo/rankme_slope": 9.45317775547719e-05} {"step": 41410, "timestamp": 1778239297.0410526, "train/loss": 2.179283285140991, "train/z_loss": 0.001410402578767389, "train/perplexity": 8.839968251121334, "train/grad_norm": 0.1015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1786107.7420551435, "perf/iters_per_sec": 0.8516825399661748, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1741464138031006, "data/tokens_consumed": 86845161472, "data/tokens_consumed_B": 86.845161472, "train/loss_slope": -1.6845663996598193e-06} {"step": 41420, "timestamp": 1778239307.422614, "train/loss": 2.131160628795624, "train/z_loss": 0.0014016901259310544, "train/perplexity": 8.42463901802803, "train/grad_norm": 0.1025390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021462.9993468693, "perf/iters_per_sec": 0.9639086720213267, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037442684173584, "data/tokens_consumed": 86866132992, "data/tokens_consumed_B": 86.866132992, "train/loss_slope": -4.47325483299822e-06} {"step": 41430, "timestamp": 1778239317.8031402, "train/loss": 2.162324070930481, "train/z_loss": 0.0014157323050312698, "train/perplexity": 8.691313433314312, "train/grad_norm": 0.130859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021481.117347852, "perf/iters_per_sec": 0.9639173113574276, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037433385848999, "data/tokens_consumed": 86887104512, "data/tokens_consumed_B": 86.887104512, "train/loss_slope": -6.5290684675690824e-06} {"step": 41440, "timestamp": 1778239328.1903615, "train/loss": 2.2019158363342286, "train/z_loss": 0.0013971038861200213, "train/perplexity": 9.042320521630538, "train/grad_norm": 0.19921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020388.9502081834, "perf/iters_per_sec": 0.9633965254822652, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0379941940307618, "data/tokens_consumed": 86908076032, "data/tokens_consumed_B": 86.908076032, "train/loss_slope": -5.7055733992893725e-06} {"step": 41450, "timestamp": 1778239338.5596204, "grad/layer_0/attn": 0.0027505564503371716, "grad/layer_0/mlp": 0.0027472875081002712, "grad/layer_0/attn_mlp_ratio": 1.0011898434758932, "grad/layer_4/attn": 0.0023075940553098917, "grad/layer_4/mlp": 0.0026116438675671816, "grad/layer_4/attn_mlp_ratio": 0.8835791110759554, "grad/layer_8/attn": 0.004693794995546341, "grad/layer_8/mlp": 0.0035595742519944906, "grad/layer_8/attn_mlp_ratio": 1.318639402184794, "grad/layer_12/attn": 0.004595989361405373, "grad/layer_12/mlp": 0.005909872706979513, "grad/layer_12/attn_mlp_ratio": 0.7776799114826204, "grad/layer_16/attn": 0.0050453003495931625, "grad/layer_16/mlp": 0.004532346967607737, "grad/layer_16/attn_mlp_ratio": 1.113176081693182, "grad/layer_20/attn": 0.006892205215990543, "grad/layer_20/mlp": 0.006168493535369635, "grad/layer_20/attn_mlp_ratio": 1.117323875714357, "grad/layer_24/attn": 0.013239061459898949, "grad/layer_24/mlp": 0.01089512463659048, "grad/layer_24/attn_mlp_ratio": 1.2151362907701762, "grad/layer_27/attn": 0.0041668601334095, "grad/layer_27/mlp": 0.009355107322335243, "grad/layer_27/attn_mlp_ratio": 0.44541018561274404} {"step": 41450, "timestamp": 1778239338.5743458, "train/loss": 2.1837383031845095, "train/z_loss": 0.0014055307139642537, "train/perplexity": 8.879438323846493, "train/grad_norm": 0.1318359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020700.8974146536, "perf/iters_per_sec": 0.9635452735017078, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0378339529037475, "data/tokens_consumed": 86929047552, "data/tokens_consumed_B": 86.929047552, "train/loss_slope": -6.518761531533364e-06} {"step": 41460, "timestamp": 1778239348.9550848, "train/loss": 2.104149889945984, "train/z_loss": 0.00141243077814579, "train/perplexity": 8.20012903355658, "train/grad_norm": 0.08544921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021632.0203929858, "perf/iters_per_sec": 0.9639892675366334, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373559474945069, "data/tokens_consumed": 86950019072, "data/tokens_consumed_B": 86.950019072, "train/loss_slope": -1.0784096164171112e-05} {"step": 41470, "timestamp": 1778239359.339207, "train/loss": 2.2022672653198243, "train/z_loss": 0.0014186033164151012, "train/perplexity": 9.045498813598137, "train/grad_norm": 0.11865234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020841.8413240097, "perf/iters_per_sec": 0.9636124807949112, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0377615690231323, "data/tokens_consumed": 86970990592, "data/tokens_consumed_B": 86.970990592, "train/loss_slope": -1.043601828654388e-05} {"step": 41475, "timestamp": 1778239365.1287296, "eos/sharpness": 79.98321056365965, "eos/L0_probe": 1.99562668800354, "eos/L_plus": 2.492434501647949, "eos/L_minus": 2.2986509799957275, "eos/grad_norm": 0.360133558511734, "eos/embed_grad_frac": 0.019212782382965088, "eos/time_s": 0.6072385311126709} {"step": 41475, "timestamp": 1778239366.5057294, "geo/rankme_last": 439.9276428222656, "geo/layer_0/stable_rank_q_proj": 19.284013748168945, "geo/layer_0/stable_rank_k_proj": 16.391817092895508, "geo/layer_0/stable_rank_o_proj": 48.92593002319336, "geo/layer_0/stable_rank_gate_proj": 136.91773986816406, "geo/layer_0/stable_rank_down_proj": 53.09172058105469, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05956181511282921, "geo/layer_0/attn_entropy_mean": 6.194254398345947, "geo/layer_0/attn_entropy_std": 0.37501510977745056, "geo/layer_7/stable_rank_q_proj": 42.83450698852539, "geo/layer_7/stable_rank_k_proj": 42.11495590209961, "geo/layer_7/stable_rank_o_proj": 97.9083023071289, "geo/layer_7/stable_rank_gate_proj": 89.02177429199219, "geo/layer_7/stable_rank_down_proj": 146.6912078857422, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4950142800807953, "geo/layer_7/attn_entropy_mean": 4.661722183227539, "geo/layer_7/attn_entropy_std": 0.8423481583595276, "geo/layer_14/stable_rank_q_proj": 54.41328811645508, "geo/layer_14/stable_rank_k_proj": 37.523162841796875, "geo/layer_14/stable_rank_o_proj": 47.707130432128906, "geo/layer_14/stable_rank_gate_proj": 76.29234313964844, "geo/layer_14/stable_rank_down_proj": 132.9866485595703, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3811262547969818, "geo/layer_14/attn_entropy_mean": 5.52906608581543, "geo/layer_14/attn_entropy_std": 0.360064297914505, "geo/layer_21/stable_rank_q_proj": 43.2459602355957, "geo/layer_21/stable_rank_k_proj": 30.664169311523438, "geo/layer_21/stable_rank_o_proj": 75.66180419921875, "geo/layer_21/stable_rank_gate_proj": 72.45297241210938, "geo/layer_21/stable_rank_down_proj": 55.05184555053711, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14454573392868042, "geo/layer_21/attn_entropy_mean": 5.727078914642334, "geo/layer_21/attn_entropy_std": 0.29465919733047485, "geo/layer_27/stable_rank_q_proj": 42.63052749633789, "geo/layer_27/stable_rank_k_proj": 31.511831283569336, "geo/layer_27/stable_rank_o_proj": 116.46202087402344, "geo/layer_27/stable_rank_gate_proj": 85.13629913330078, "geo/layer_27/stable_rank_down_proj": 130.90562438964844, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08482042700052261, "geo/layer_27/attn_entropy_mean": 4.262602806091309, "geo/layer_27/attn_entropy_std": 0.6727079153060913, "attnres/final_alpha/block_0": 0.24047169089317322, "attnres/block_norm/0": 1.7241376638412476, "attnres/final_alpha/block_1": 0.005071665160357952, "attnres/block_norm/1": 40760.1953125, "attnres/final_alpha/block_2": 0.011030340567231178, "attnres/block_norm/2": 26596.37109375, "attnres/final_alpha/block_3": 0.01316773146390915, "attnres/block_norm/3": 48096.453125, "attnres/final_alpha/block_4": 0.01571166142821312, "attnres/block_norm/4": 12987.1796875, "attnres/final_alpha/block_5": 0.6021595001220703, "attnres/block_norm/5": 5984.8857421875, "attnres/final_alpha/block_6": 0.11238740384578705, "attnres/block_norm/6": 31661.17578125, "geo/tier1_time_s": 1.3590967655181885, "geo/step": 41475.0, "geo/rankme_slope": 9.712158300820329e-05} {"step": 41480, "timestamp": 1778239371.7031288, "train/loss": 2.176203167438507, "train/z_loss": 0.0013997258851304651, "train/perplexity": 8.812781998344159, "train/grad_norm": 0.09326171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1697027.4690794968, "perf/iters_per_sec": 0.8092057557485088, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2357796430587769, "data/tokens_consumed": 86991962112, "data/tokens_consumed_B": 86.991962112, "train/loss_slope": -1.0633733234640096e-05} {"step": 41490, "timestamp": 1778239382.0834947, "train/loss": 2.1859363555908202, "train/z_loss": 0.0014041257905773818, "train/perplexity": 8.898977260557583, "train/grad_norm": 0.20703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021702.2295807074, "perf/iters_per_sec": 0.9640227458861863, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373199224472045, "data/tokens_consumed": 87012933632, "data/tokens_consumed_B": 87.012933632, "train/loss_slope": -1.3230998638403696e-05} {"step": 41500, "timestamp": 1778239392.4511442, "grad/layer_0/attn": 0.0026248027570545673, "grad/layer_0/mlp": 0.002556596416980028, "grad/layer_0/attn_mlp_ratio": 1.0266785312510351, "grad/layer_4/attn": 0.0018986185314133763, "grad/layer_4/mlp": 0.0024925798643380404, "grad/layer_4/attn_mlp_ratio": 0.7617081732892755, "grad/layer_8/attn": 0.005860878154635429, "grad/layer_8/mlp": 0.0035639714915305376, "grad/layer_8/attn_mlp_ratio": 1.64447948141991, "grad/layer_12/attn": 0.0038152371998876333, "grad/layer_12/mlp": 0.006609630770981312, "grad/layer_12/attn_mlp_ratio": 0.5772239440235483, "grad/layer_16/attn": 0.004197854548692703, "grad/layer_16/mlp": 0.00447839917615056, "grad/layer_16/attn_mlp_ratio": 0.9373560260801487, "grad/layer_20/attn": 0.005212754942476749, "grad/layer_20/mlp": 0.005515066906809807, "grad/layer_20/attn_mlp_ratio": 0.9451843352111999, "grad/layer_24/attn": 0.01335114985704422, "grad/layer_24/mlp": 0.009243594482541084, "grad/layer_24/attn_mlp_ratio": 1.444367744368767, "grad/layer_27/attn": 0.007530175615102053, "grad/layer_27/mlp": 0.006643061526119709, "grad/layer_27/attn_mlp_ratio": 1.1335399306690639} {"step": 41500, "timestamp": 1778239392.4659638, "train/loss": 2.2107558727264403, "train/z_loss": 0.0013968044542707503, "train/perplexity": 9.122609319203029, "train/grad_norm": 0.10693359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020719.1410224207, "perf/iters_per_sec": 0.9635539727317909, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0378245830535888, "data/tokens_consumed": 87033905152, "data/tokens_consumed_B": 87.033905152, "train/loss_slope": -1.2247723707593378e-05} {"step": 41500, "timestamp": 1778239399.7857807, "geo/ww_alpha_mean": 7.684813723676632, "geo/ww_alpha_std": 4.3970056478883475, "geo/ww_alpha_min": 1.3583510047802838, "geo/ww_alpha_max": 27.792469130614297, "geo/ww_alpha_healthy_frac": 0.17766497461928935, "geo/ww_alpha_by_type/q_proj": 4.034336390897738, "geo/ww_alpha_by_type/k_proj": 4.432331889310281, "geo/ww_alpha_by_type/v_proj": 8.651090079107165, "geo/ww_alpha_by_type/o_proj": 8.750847528219756, "geo/ww_alpha_by_type/gate_proj": 7.640188210698808, "geo/ww_alpha_by_type/up_proj": 11.834651493810409, "geo/ww_alpha_by_type/down_proj": 8.557390095374512, "geo/twonn_id/layer_0": 0.7610974311828613, "geo/twonn_id/layer_7": 3.1493120193481445, "geo/twonn_id/layer_14": 4.372745990753174, "geo/twonn_id/layer_21": 7.1515727043151855, "geo/twonn_id/layer_27": 5.996939182281494, "geo/tier2_time_s": 7.3110127449035645} {"step": 41500, "timestamp": 1778239400.510633, "eoc/jacobian_sigma/layer_0/attn": 1199.5626220703125, "eoc/jacobian_sigma/layer_0/mlp": 8654.1337890625, "eoc/jacobian_sigma/layer_0": 8654.1337890625, "eoc/jacobian_sigma/layer_7/attn": 1.1421411037445068, "eoc/jacobian_sigma/layer_7/mlp": 1.7412281036376953, "eoc/jacobian_sigma/layer_7": 1.7412281036376953, "eoc/jacobian_sigma/layer_14/attn": 1.6137583255767822, "eoc/jacobian_sigma/layer_14/mlp": 6.077701568603516, "eoc/jacobian_sigma/layer_14": 6.077701568603516, "eoc/jacobian_sigma/layer_21/attn": 1.0880602598190308, "eoc/jacobian_sigma/layer_21/mlp": 3.90145206451416, "eoc/jacobian_sigma/layer_21": 3.90145206451416, "eoc/jacobian_sigma/layer_27/attn": 4.476311683654785, "eoc/jacobian_sigma/layer_27/mlp": 25.101396560668945, "eoc/jacobian_sigma/layer_27": 25.101396560668945, "eoc/layer0_sigma": 8654.1337890625, "eoc/sigma_max": 25.101396560668945, "eoc/sigma_min": 1.7412281036376953, "eoc/sigma_mean": 9.205444574356079, "eoc/time_s": 0.7154512405395508} {"step": 41510, "timestamp": 1778239410.917711, "train/loss": 2.1593968868255615, "train/z_loss": 0.0013969000545330345, "train/perplexity": 8.665909557830926, "train/grad_norm": 0.171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1136993.7789037637, "perf/iters_per_sec": 0.5421608824271029, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.8444709539413453, "data/tokens_consumed": 87054876672, "data/tokens_consumed_B": 87.054876672, "train/loss_slope": -1.3263575328995147e-05} {"step": 41520, "timestamp": 1778239421.29772, "train/loss": 2.2192853927612304, "train/z_loss": 0.0014012044412083924, "train/perplexity": 9.200753590959778, "train/grad_norm": 0.2431640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021327.8214374762, "perf/iters_per_sec": 0.9638442141711598, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0375120639801025, "data/tokens_consumed": 87075848192, "data/tokens_consumed_B": 87.075848192, "train/loss_slope": -9.292157941704362e-06} {"step": 41530, "timestamp": 1778239431.6847851, "train/loss": 2.1332311153411867, "train/z_loss": 0.001399751880671829, "train/perplexity": 8.442100190089112, "train/grad_norm": 0.25390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020140.8443580389, "perf/iters_per_sec": 0.9632782193937487, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0381216764450074, "data/tokens_consumed": 87096819712, "data/tokens_consumed_B": 87.096819712, "train/loss_slope": -1.1948515186429973e-05} {"step": 41540, "timestamp": 1778239442.060931, "train/loss": 2.2204028606414794, "train/z_loss": 0.0014201427227817475, "train/perplexity": 9.211040884361228, "train/grad_norm": 0.181640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022571.535974846, "perf/iters_per_sec": 0.9644372634767752, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0368740797042846, "data/tokens_consumed": 87117791232, "data/tokens_consumed_B": 87.117791232, "train/loss_slope": -5.96039918961912e-06} {"step": 41550, "timestamp": 1778239452.4267175, "grad/layer_0/attn": 0.002859650179743767, "grad/layer_0/mlp": 0.0030219200998544693, "grad/layer_0/attn_mlp_ratio": 0.946302347719666, "grad/layer_4/attn": 0.0020781869534403086, "grad/layer_4/mlp": 0.0026968561578541994, "grad/layer_4/attn_mlp_ratio": 0.7705961144157731, "grad/layer_8/attn": 0.0032584061846137047, "grad/layer_8/mlp": 0.00392140680924058, "grad/layer_8/attn_mlp_ratio": 0.830927842998249, "grad/layer_12/attn": 0.00536619545891881, "grad/layer_12/mlp": 0.006180219352245331, "grad/layer_12/attn_mlp_ratio": 0.868285584417107, "grad/layer_16/attn": 0.0054486901499331, "grad/layer_16/mlp": 0.004435801412910223, "grad/layer_16/attn_mlp_ratio": 1.2283439946703893, "grad/layer_20/attn": 0.0037963802460581064, "grad/layer_20/mlp": 0.0056631495244801044, "grad/layer_20/attn_mlp_ratio": 0.6703655205660625, "grad/layer_24/attn": 0.014157334342598915, "grad/layer_24/mlp": 0.012022050097584724, "grad/layer_24/attn_mlp_ratio": 1.177613976810975, "grad/layer_27/attn": 0.006509620696306229, "grad/layer_27/mlp": 0.012573261745274067, "grad/layer_27/attn_mlp_ratio": 0.5177352366007562} {"step": 41550, "timestamp": 1778239453.0495083, "eos/sharpness": 73.07171821594237, "eos/L0_probe": 1.9949977397918701, "eos/L_plus": 2.4387247562408447, "eos/L_minus": 2.2819879055023193, "eos/grad_norm": 0.2229069024324417, "eos/embed_grad_frac": 0.047280121594667435, "eos/time_s": 0.6198861598968506} {"step": 41550, "timestamp": 1778239453.0700815, "train/loss": 2.1660696268081665, "train/z_loss": 0.0014057583641260863, "train/perplexity": 8.723928275656514, "train/grad_norm": 0.22265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1905855.756197259, "perf/iters_per_sec": 0.9087828427301688, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.100372886657715, "data/tokens_consumed": 87138762752, "data/tokens_consumed_B": 87.138762752, "train/loss_slope": -5.258612134883668e-06} {"step": 41550, "timestamp": 1778239454.4315815, "geo/rankme_last": 439.1088562011719, "geo/layer_0/stable_rank_q_proj": 19.294403076171875, "geo/layer_0/stable_rank_k_proj": 16.3989315032959, "geo/layer_0/stable_rank_o_proj": 48.883399963378906, "geo/layer_0/stable_rank_gate_proj": 136.87998962402344, "geo/layer_0/stable_rank_down_proj": 53.243682861328125, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06589051336050034, "geo/layer_0/attn_entropy_mean": 6.1938323974609375, "geo/layer_0/attn_entropy_std": 0.38322487473487854, "geo/layer_7/stable_rank_q_proj": 42.89324951171875, "geo/layer_7/stable_rank_k_proj": 42.02985763549805, "geo/layer_7/stable_rank_o_proj": 97.89103698730469, "geo/layer_7/stable_rank_gate_proj": 88.84481811523438, "geo/layer_7/stable_rank_down_proj": 146.27357482910156, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5037762522697449, "geo/layer_7/attn_entropy_mean": 4.659603118896484, "geo/layer_7/attn_entropy_std": 0.8113581538200378, "geo/layer_14/stable_rank_q_proj": 54.371551513671875, "geo/layer_14/stable_rank_k_proj": 37.4893798828125, "geo/layer_14/stable_rank_o_proj": 47.69243621826172, "geo/layer_14/stable_rank_gate_proj": 76.32737731933594, "geo/layer_14/stable_rank_down_proj": 133.1831512451172, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.387606143951416, "geo/layer_14/attn_entropy_mean": 5.507504463195801, "geo/layer_14/attn_entropy_std": 0.37091776728630066, "geo/layer_21/stable_rank_q_proj": 43.0955810546875, "geo/layer_21/stable_rank_k_proj": 30.726974487304688, "geo/layer_21/stable_rank_o_proj": 75.62421417236328, "geo/layer_21/stable_rank_gate_proj": 72.49288177490234, "geo/layer_21/stable_rank_down_proj": 55.109432220458984, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14952591061592102, "geo/layer_21/attn_entropy_mean": 5.724212646484375, "geo/layer_21/attn_entropy_std": 0.2873905599117279, "geo/layer_27/stable_rank_q_proj": 42.570735931396484, "geo/layer_27/stable_rank_k_proj": 31.580453872680664, "geo/layer_27/stable_rank_o_proj": 116.3759765625, "geo/layer_27/stable_rank_gate_proj": 85.15489196777344, "geo/layer_27/stable_rank_down_proj": 131.0681610107422, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0841832309961319, "geo/layer_27/attn_entropy_mean": 4.269996643066406, "geo/layer_27/attn_entropy_std": 0.6565539240837097, "attnres/final_alpha/block_0": 0.24303507804870605, "attnres/block_norm/0": 1.7244882583618164, "attnres/final_alpha/block_1": 0.005143929738551378, "attnres/block_norm/1": 40760.8828125, "attnres/final_alpha/block_2": 0.011147812008857727, "attnres/block_norm/2": 26475.275390625, "attnres/final_alpha/block_3": 0.013121032156050205, "attnres/block_norm/3": 47685.75, "attnres/final_alpha/block_4": 0.01589076779782772, "attnres/block_norm/4": 13126.0126953125, "attnres/final_alpha/block_5": 0.5960901975631714, "attnres/block_norm/5": 6084.06787109375, "attnres/final_alpha/block_6": 0.11557120084762573, "attnres/block_norm/6": 31812.513671875, "geo/tier1_time_s": 1.3574423789978027, "geo/step": 41550.0, "geo/rankme_slope": 5.6246541585384155e-05} {"step": 41560, "timestamp": 1778239464.8165271, "train/loss": 2.1909188985824586, "train/z_loss": 0.0014143336680717765, "train/perplexity": 8.943427442853062, "train/grad_norm": 0.1962890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1785939.6187038934, "perf/iters_per_sec": 0.8516023725051372, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.174256944656372, "data/tokens_consumed": 87159734272, "data/tokens_consumed_B": 87.159734272, "train/loss_slope": -5.6156975279475015e-06} {"step": 41570, "timestamp": 1778239475.2028377, "train/loss": 2.117722678184509, "train/z_loss": 0.0014092789380811155, "train/perplexity": 8.312186393599461, "train/grad_norm": 0.123046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020339.2499158718, "perf/iters_per_sec": 0.9633728265361174, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0380197286605835, "data/tokens_consumed": 87180705792, "data/tokens_consumed_B": 87.180705792, "train/loss_slope": -8.033492713227773e-06} {"step": 41580, "timestamp": 1778239485.5855947, "train/loss": 2.165091705322266, "train/z_loss": 0.0014019985566847027, "train/perplexity": 8.715401128873882, "train/grad_norm": 0.1015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020914.8279123984, "perf/iters_per_sec": 0.9636472835123054, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0377240896224975, "data/tokens_consumed": 87201677312, "data/tokens_consumed_B": 87.201677312, "train/loss_slope": -9.498811726665974e-06} {"step": 41590, "timestamp": 1778239495.9610837, "train/loss": 2.1008705735206603, "train/z_loss": 0.0014099265215918421, "train/perplexity": 8.173282259319917, "train/grad_norm": 0.12060546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022358.4170278755, "perf/iters_per_sec": 0.9643356404437425, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036983346939087, "data/tokens_consumed": 87222648832, "data/tokens_consumed_B": 87.222648832, "train/loss_slope": -1.4845177249582266e-05} {"step": 41600, "timestamp": 1778239506.3293812, "grad/layer_0/attn": 0.0030454813968390226, "grad/layer_0/mlp": 0.0030884467996656895, "grad/layer_0/attn_mlp_ratio": 0.9860883142166641, "grad/layer_4/attn": 0.004528623074293137, "grad/layer_4/mlp": 0.0025748261250555515, "grad/layer_4/attn_mlp_ratio": 1.758807266379863, "grad/layer_8/attn": 0.007436172571033239, "grad/layer_8/mlp": 0.003747182432562113, "grad/layer_8/attn_mlp_ratio": 1.984470333754679, "grad/layer_12/attn": 0.004907610360532999, "grad/layer_12/mlp": 0.00672542629763484, "grad/layer_12/attn_mlp_ratio": 0.7297099202897945, "grad/layer_16/attn": 0.004918438848108053, "grad/layer_16/mlp": 0.004585093818604946, "grad/layer_16/attn_mlp_ratio": 1.0727018759965838, "grad/layer_20/attn": 0.0048357220366597176, "grad/layer_20/mlp": 0.006111668422818184, "grad/layer_20/attn_mlp_ratio": 0.7912277995125777, "grad/layer_24/attn": 0.00842302106320858, "grad/layer_24/mlp": 0.008743403479456902, "grad/layer_24/attn_mlp_ratio": 0.9633572311586901, "grad/layer_27/attn": 0.00988709181547165, "grad/layer_27/mlp": 0.007928553968667984, "grad/layer_27/attn_mlp_ratio": 1.2470233197429281} {"step": 41600, "timestamp": 1778239506.353721, "train/loss": 2.122746205329895, "train/z_loss": 0.001418848056346178, "train/perplexity": 8.354047945873118, "train/grad_norm": 0.1455078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019062.1280625504, "perf/iters_per_sec": 0.9627638473809006, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0386763095855713, "data/tokens_consumed": 87243620352, "data/tokens_consumed_B": 87.243620352, "train/loss_slope": -1.5650994077374047e-05} {"step": 41610, "timestamp": 1778239516.732801, "train/loss": 2.1099441766738893, "train/z_loss": 0.0014151850598864257, "train/perplexity": 8.247780853215115, "train/grad_norm": 0.0986328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021263.6763221736, "perf/iters_per_sec": 0.9638136273966663, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0375449895858764, "data/tokens_consumed": 87264591872, "data/tokens_consumed_B": 87.264591872, "train/loss_slope": -1.9678554798152528e-05} {"step": 41620, "timestamp": 1778239527.113665, "train/loss": 2.1701876759529113, "train/z_loss": 0.0014081424684263765, "train/perplexity": 8.759927914296991, "train/grad_norm": 0.11279296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021362.427118602, "perf/iters_per_sec": 0.9638607154458055, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0374943017959595, "data/tokens_consumed": 87285563392, "data/tokens_consumed_B": 87.285563392, "train/loss_slope": -1.9201176959832365e-05} {"step": 41625, "timestamp": 1778239532.9023852, "eos/sharpness": 58.10959339141844, "eos/L0_probe": 1.9956960678100586, "eos/L_plus": 2.2708685398101807, "eos/L_minus": 2.301619529724121, "eos/grad_norm": 0.1862550526857376, "eos/embed_grad_frac": 0.08155376464128494, "eos/time_s": 0.6062877178192139} {"step": 41625, "timestamp": 1778239534.2789986, "geo/rankme_last": 439.2424621582031, "geo/layer_0/stable_rank_q_proj": 19.30716896057129, "geo/layer_0/stable_rank_k_proj": 16.403839111328125, "geo/layer_0/stable_rank_o_proj": 48.88967514038086, "geo/layer_0/stable_rank_gate_proj": 137.1465606689453, "geo/layer_0/stable_rank_down_proj": 53.273460388183594, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05933014303445816, "geo/layer_0/attn_entropy_mean": 6.1923651695251465, "geo/layer_0/attn_entropy_std": 0.3801219165325165, "geo/layer_7/stable_rank_q_proj": 42.81806182861328, "geo/layer_7/stable_rank_k_proj": 42.04127883911133, "geo/layer_7/stable_rank_o_proj": 97.89115905761719, "geo/layer_7/stable_rank_gate_proj": 88.81974029541016, "geo/layer_7/stable_rank_down_proj": 146.3375244140625, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.48932263255119324, "geo/layer_7/attn_entropy_mean": 4.675773620605469, "geo/layer_7/attn_entropy_std": 0.8155823945999146, "geo/layer_14/stable_rank_q_proj": 54.268795013427734, "geo/layer_14/stable_rank_k_proj": 37.52782440185547, "geo/layer_14/stable_rank_o_proj": 47.74193572998047, "geo/layer_14/stable_rank_gate_proj": 76.33578491210938, "geo/layer_14/stable_rank_down_proj": 132.7984619140625, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3885030150413513, "geo/layer_14/attn_entropy_mean": 5.491141319274902, "geo/layer_14/attn_entropy_std": 0.36824533343315125, "geo/layer_21/stable_rank_q_proj": 43.10651397705078, "geo/layer_21/stable_rank_k_proj": 30.69991683959961, "geo/layer_21/stable_rank_o_proj": 75.61920166015625, "geo/layer_21/stable_rank_gate_proj": 72.51005554199219, "geo/layer_21/stable_rank_down_proj": 55.126773834228516, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1438000202178955, "geo/layer_21/attn_entropy_mean": 5.706600189208984, "geo/layer_21/attn_entropy_std": 0.2979229688644409, "geo/layer_27/stable_rank_q_proj": 42.491607666015625, "geo/layer_27/stable_rank_k_proj": 31.583139419555664, "geo/layer_27/stable_rank_o_proj": 116.34473419189453, "geo/layer_27/stable_rank_gate_proj": 85.00515747070312, "geo/layer_27/stable_rank_down_proj": 131.1306915283203, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09207747876644135, "geo/layer_27/attn_entropy_mean": 4.2406511306762695, "geo/layer_27/attn_entropy_std": 0.6581645011901855, "attnres/final_alpha/block_0": 0.24392113089561462, "attnres/block_norm/0": 1.7248402833938599, "attnres/final_alpha/block_1": 0.005207482259720564, "attnres/block_norm/1": 40768.890625, "attnres/final_alpha/block_2": 0.01128637045621872, "attnres/block_norm/2": 26579.564453125, "attnres/final_alpha/block_3": 0.013215875253081322, "attnres/block_norm/3": 48049.10546875, "attnres/final_alpha/block_4": 0.01618245244026184, "attnres/block_norm/4": 13077.2041015625, "attnres/final_alpha/block_5": 0.5926945209503174, "attnres/block_norm/5": 6133.9951171875, "attnres/final_alpha/block_6": 0.11749213933944702, "attnres/block_norm/6": 31974.1953125, "geo/tier1_time_s": 1.358137607574463, "geo/step": 41625.0, "geo/rankme_slope": 5.213309933348339e-05} {"step": 41630, "timestamp": 1778239539.4887593, "train/loss": 2.174904298782349, "train/z_loss": 0.0014006490702740848, "train/perplexity": 8.801342782660525, "train/grad_norm": 0.11181640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1695487.8280369465, "perf/iters_per_sec": 0.8084715976891262, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.236901831626892, "data/tokens_consumed": 87306534912, "data/tokens_consumed_B": 87.306534912, "train/loss_slope": -1.962766972812773e-05} {"step": 41640, "timestamp": 1778239549.8417954, "train/loss": 2.1589267253875732, "train/z_loss": 0.001414172339718789, "train/perplexity": 8.661836138989006, "train/grad_norm": 0.1025390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026481.1055807378, "perf/iters_per_sec": 0.9663014915374459, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348737001419068, "data/tokens_consumed": 87327506432, "data/tokens_consumed_B": 87.327506432, "train/loss_slope": -2.0741239418112174e-05} {"step": 41650, "timestamp": 1778239560.1824508, "grad/layer_0/attn": 0.002475128509104252, "grad/layer_0/mlp": 0.002650586888194084, "grad/layer_0/attn_mlp_ratio": 0.933803915935852, "grad/layer_4/attn": 0.0018899178830906749, "grad/layer_4/mlp": 0.0025719625409692526, "grad/layer_4/attn_mlp_ratio": 0.7348154490993901, "grad/layer_8/attn": 0.005926576908677816, "grad/layer_8/mlp": 0.003632349893450737, "grad/layer_8/attn_mlp_ratio": 1.63160954185683, "grad/layer_12/attn": 0.004205962643027306, "grad/layer_12/mlp": 0.006443314719945192, "grad/layer_12/attn_mlp_ratio": 0.6527637963626745, "grad/layer_16/attn": 0.003777344711124897, "grad/layer_16/mlp": 0.0044773044064641, "grad/layer_16/attn_mlp_ratio": 0.8436649117055504, "grad/layer_20/attn": 0.0049163275398314, "grad/layer_20/mlp": 0.0052594346925616264, "grad/layer_20/attn_mlp_ratio": 0.9347634743536549, "grad/layer_24/attn": 0.008387580513954163, "grad/layer_24/mlp": 0.011781368404626846, "grad/layer_24/attn_mlp_ratio": 0.7119360124131712, "grad/layer_27/attn": 0.007158275227993727, "grad/layer_27/mlp": 0.010545751079916954, "grad/layer_27/attn_mlp_ratio": 0.6787828677037018} {"step": 41650, "timestamp": 1778239560.1971543, "train/loss": 2.1457545161247253, "train/z_loss": 0.0014255619957111777, "train/perplexity": 8.548488777226243, "train/grad_norm": 0.12109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026378.2128145965, "perf/iters_per_sec": 0.9662524284432394, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349262475967407, "data/tokens_consumed": 87348477952, "data/tokens_consumed_B": 87.348477952, "train/loss_slope": -2.2320442610304862e-05} {"step": 41660, "timestamp": 1778239570.553962, "train/loss": 2.1523324251174927, "train/z_loss": 0.0014175624120980502, "train/perplexity": 8.604905306402614, "train/grad_norm": 0.16015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026259.6936722118, "perf/iters_per_sec": 0.9661959141121921, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349867820739747, "data/tokens_consumed": 87369449472, "data/tokens_consumed_B": 87.369449472, "train/loss_slope": -1.997444460851951e-05} {"step": 41670, "timestamp": 1778239580.903709, "train/loss": 2.1591430425643923, "train/z_loss": 0.0014134504017420113, "train/perplexity": 8.663710045600448, "train/grad_norm": 0.10302734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027516.964130179, "perf/iters_per_sec": 0.9667954273844619, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034344983100891, "data/tokens_consumed": 87390420992, "data/tokens_consumed_B": 87.390420992, "train/loss_slope": -2.226089343200597e-05} {"step": 41680, "timestamp": 1778239591.248433, "train/loss": 2.1882302522659303, "train/z_loss": 0.0013954505557194352, "train/perplexity": 8.919414025859988, "train/grad_norm": 0.150390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028790.0208869057, "perf/iters_per_sec": 0.9674024681505707, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033695936203003, "data/tokens_consumed": 87411392512, "data/tokens_consumed_B": 87.411392512, "train/loss_slope": -2.0147636642765035e-05} {"step": 41690, "timestamp": 1778239601.5907292, "train/loss": 2.155281686782837, "train/z_loss": 0.0014105176087468863, "train/perplexity": 8.630320883928594, "train/grad_norm": 0.0908203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028755.7687134019, "perf/iters_per_sec": 0.9673861354414949, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0337133884429932, "data/tokens_consumed": 87432364032, "data/tokens_consumed_B": 87.432364032, "train/loss_slope": -2.237465870906165e-05} {"step": 41700, "timestamp": 1778239611.928745, "grad/layer_0/attn": 0.002896807622164488, "grad/layer_0/mlp": 0.0028980167116969824, "grad/layer_0/attn_mlp_ratio": 0.9995827527543615, "grad/layer_4/attn": 0.0017694946145638824, "grad/layer_4/mlp": 0.0025054230354726315, "grad/layer_4/attn_mlp_ratio": 0.7062657758327433, "grad/layer_8/attn": 0.005081671290099621, "grad/layer_8/mlp": 0.0037423076573759317, "grad/layer_8/attn_mlp_ratio": 1.3578977517506035, "grad/layer_12/attn": 0.004621712025254965, "grad/layer_12/mlp": 0.006851253565400839, "grad/layer_12/attn_mlp_ratio": 0.6745790261123791, "grad/layer_16/attn": 0.00500645162537694, "grad/layer_16/mlp": 0.00454002944752574, "grad/layer_16/attn_mlp_ratio": 1.1027354718661235, "grad/layer_20/attn": 0.006867315154522657, "grad/layer_20/mlp": 0.006348317489027977, "grad/layer_20/attn_mlp_ratio": 1.0817535604065687, "grad/layer_24/attn": 0.009884980507194996, "grad/layer_24/mlp": 0.011268957518041134, "grad/layer_24/attn_mlp_ratio": 0.8771867676003637, "grad/layer_27/attn": 0.004593223799020052, "grad/layer_27/mlp": 0.010713424533605576, "grad/layer_27/attn_mlp_ratio": 0.42873534430924676} {"step": 41700, "timestamp": 1778239612.5351074, "eos/sharpness": 18.994045257568356, "eos/L0_probe": 1.9987592697143555, "eos/L_plus": 2.106487274169922, "eos/L_minus": 2.0809717178344727, "eos/grad_norm": 0.12734626233577728, "eos/embed_grad_frac": 0.16354788839817047, "eos/time_s": 0.6035244464874268} {"step": 41700, "timestamp": 1778239612.5558379, "train/loss": 2.1253987431526182, "train/z_loss": 0.001421269529964775, "train/perplexity": 8.376236789385741, "train/grad_norm": 0.126953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1913372.0237184274, "perf/iters_per_sec": 0.9123668783752572, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0960503101348877, "data/tokens_consumed": 87453335552, "data/tokens_consumed_B": 87.453335552, "train/loss_slope": -2.414910163101118e-05} {"step": 41700, "timestamp": 1778239613.920481, "geo/rankme_last": 439.3249816894531, "geo/layer_0/stable_rank_q_proj": 19.314533233642578, "geo/layer_0/stable_rank_k_proj": 16.447744369506836, "geo/layer_0/stable_rank_o_proj": 48.82622528076172, "geo/layer_0/stable_rank_gate_proj": 137.19688415527344, "geo/layer_0/stable_rank_down_proj": 53.25844955444336, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05569567531347275, "geo/layer_0/attn_entropy_mean": 6.193793773651123, "geo/layer_0/attn_entropy_std": 0.3776378631591797, "geo/layer_7/stable_rank_q_proj": 42.72218704223633, "geo/layer_7/stable_rank_k_proj": 42.04098129272461, "geo/layer_7/stable_rank_o_proj": 97.6938705444336, "geo/layer_7/stable_rank_gate_proj": 88.88300323486328, "geo/layer_7/stable_rank_down_proj": 146.06118774414062, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.501735270023346, "geo/layer_7/attn_entropy_mean": 4.652152061462402, "geo/layer_7/attn_entropy_std": 0.8256786465644836, "geo/layer_14/stable_rank_q_proj": 54.28205490112305, "geo/layer_14/stable_rank_k_proj": 37.67671585083008, "geo/layer_14/stable_rank_o_proj": 47.784446716308594, "geo/layer_14/stable_rank_gate_proj": 76.35704040527344, "geo/layer_14/stable_rank_down_proj": 132.7386474609375, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.389326274394989, "geo/layer_14/attn_entropy_mean": 5.524901390075684, "geo/layer_14/attn_entropy_std": 0.35531777143478394, "geo/layer_21/stable_rank_q_proj": 43.202335357666016, "geo/layer_21/stable_rank_k_proj": 30.650741577148438, "geo/layer_21/stable_rank_o_proj": 75.60509490966797, "geo/layer_21/stable_rank_gate_proj": 72.51231384277344, "geo/layer_21/stable_rank_down_proj": 55.148590087890625, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.143450528383255, "geo/layer_21/attn_entropy_mean": 5.727257251739502, "geo/layer_21/attn_entropy_std": 0.287431925535202, "geo/layer_27/stable_rank_q_proj": 42.4267463684082, "geo/layer_27/stable_rank_k_proj": 31.524433135986328, "geo/layer_27/stable_rank_o_proj": 116.47639465332031, "geo/layer_27/stable_rank_gate_proj": 84.94158935546875, "geo/layer_27/stable_rank_down_proj": 131.24070739746094, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09605465829372406, "geo/layer_27/attn_entropy_mean": 4.274419784545898, "geo/layer_27/attn_entropy_std": 0.6479487419128418, "attnres/final_alpha/block_0": 0.24166499078273773, "attnres/block_norm/0": 1.7249619960784912, "attnres/final_alpha/block_1": 0.0051663713529706, "attnres/block_norm/1": 40825.11328125, "attnres/final_alpha/block_2": 0.01123354397714138, "attnres/block_norm/2": 26576.78125, "attnres/final_alpha/block_3": 0.013246500864624977, "attnres/block_norm/3": 47994.8203125, "attnres/final_alpha/block_4": 0.01598888263106346, "attnres/block_norm/4": 13076.361328125, "attnres/final_alpha/block_5": 0.596156120300293, "attnres/block_norm/5": 6115.5732421875, "attnres/final_alpha/block_6": 0.11654353141784668, "attnres/block_norm/6": 32050.853515625, "geo/tier1_time_s": 1.3604621887207031, "geo/step": 41700.0, "geo/rankme_slope": 1.60613268744998e-05} {"step": 41710, "timestamp": 1778239624.2667508, "train/loss": 2.231752610206604, "train/z_loss": 0.0013947904924862086, "train/perplexity": 9.316179410974451, "train/grad_norm": 0.2001953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1791369.3491449265, "perf/iters_per_sec": 0.8541914697384484, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1706977128982543, "data/tokens_consumed": 87474307072, "data/tokens_consumed_B": 87.474307072, "train/loss_slope": -1.8451868690172547e-05} {"step": 41720, "timestamp": 1778239634.606879, "train/loss": 2.1368703603744508, "train/z_loss": 0.0014139480306766927, "train/perplexity": 8.472879033182297, "train/grad_norm": 0.1630859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029163.6412065995, "perf/iters_per_sec": 0.9675806242020605, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0335056066513062, "data/tokens_consumed": 87495278592, "data/tokens_consumed_B": 87.495278592, "train/loss_slope": -1.789190088919323e-05} {"step": 41730, "timestamp": 1778239645.5594923, "train/loss": 2.1759218692779543, "train/z_loss": 0.0014149157446809112, "train/perplexity": 8.810303327617774, "train/grad_norm": 0.140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1915895.6873805341, "perf/iters_per_sec": 0.9135702549841567, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0946065664291382, "data/tokens_consumed": 87516250112, "data/tokens_consumed_B": 87.516250112, "train/loss_slope": -1.7855401458305304e-05} {"step": 41740, "timestamp": 1778239655.9133906, "train/loss": 2.166601777076721, "train/z_loss": 0.0014087412389926612, "train/perplexity": 8.728571951888448, "train/grad_norm": 0.2158203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026901.375079502, "perf/iters_per_sec": 0.966501891650916, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346591234207154, "data/tokens_consumed": 87537221632, "data/tokens_consumed_B": 87.537221632, "train/loss_slope": -2.1180449782496543e-05} {"step": 41750, "timestamp": 1778239666.2433295, "grad/layer_0/attn": 0.003854696173220873, "grad/layer_0/mlp": 0.003293609246611595, "grad/layer_0/attn_mlp_ratio": 1.1703562164063306, "grad/layer_4/attn": 0.0020371852442622185, "grad/layer_4/mlp": 0.002515374682843685, "grad/layer_4/attn_mlp_ratio": 0.8098933241109847, "grad/layer_8/attn": 0.00589055847376585, "grad/layer_8/mlp": 0.00374617800116539, "grad/layer_8/attn_mlp_ratio": 1.5724181591722426, "grad/layer_12/attn": 0.003944911062717438, "grad/layer_12/mlp": 0.00731603242456913, "grad/layer_12/attn_mlp_ratio": 0.5392145332144719, "grad/layer_16/attn": 0.004022142384201288, "grad/layer_16/mlp": 0.005083299707621336, "grad/layer_16/attn_mlp_ratio": 0.7912463432062249, "grad/layer_20/attn": 0.004004329442977905, "grad/layer_20/mlp": 0.007565394975244999, "grad/layer_20/attn_mlp_ratio": 0.5292954833357766, "grad/layer_24/attn": 0.02259075827896595, "grad/layer_24/mlp": 0.014321804977953434, "grad/layer_24/attn_mlp_ratio": 1.5773680870536007, "grad/layer_27/attn": 0.006801909767091274, "grad/layer_27/mlp": 0.012607386335730553, "grad/layer_27/attn_mlp_ratio": 0.5395178296283522} {"step": 41750, "timestamp": 1778239666.2580206, "train/loss": 2.202942204475403, "train/z_loss": 0.0013976421556435525, "train/perplexity": 9.051606035698997, "train/grad_norm": 0.26171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028431.319498837, "perf/iters_per_sec": 0.9672314260000405, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0338787317276001, "data/tokens_consumed": 87558193152, "data/tokens_consumed_B": 87.558193152, "train/loss_slope": -1.971919624575829e-05} {"step": 41760, "timestamp": 1778239676.6148474, "train/loss": 2.1698108434677126, "train/z_loss": 0.0014037217129953205, "train/perplexity": 8.75662751077957, "train/grad_norm": 0.1943359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026533.8631835473, "perf/iters_per_sec": 0.9663266483228432, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348467588424684, "data/tokens_consumed": 87579164672, "data/tokens_consumed_B": 87.579164672, "train/loss_slope": -1.526148012368983e-05} {"step": 41770, "timestamp": 1778239686.9642406, "train/loss": 2.146300792694092, "train/z_loss": 0.0014094431884586812, "train/perplexity": 8.553159892092873, "train/grad_norm": 0.11962890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027498.457369818, "perf/iters_per_sec": 0.9667866026734437, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343544244766236, "data/tokens_consumed": 87600136192, "data/tokens_consumed_B": 87.600136192, "train/loss_slope": -1.4744625640924064e-05} {"step": 41775, "timestamp": 1778239692.7242692, "eos/sharpness": 13.796234130859371, "eos/L0_probe": 1.9948482513427734, "eos/L_plus": 2.07035231590271, "eos/L_minus": 2.0573065280914307, "eos/grad_norm": 0.1091577410697937, "eos/embed_grad_frac": 0.1987960934638977, "eos/time_s": 0.5991928577423096} {"step": 41775, "timestamp": 1778239694.1092854, "geo/rankme_last": 438.8254089355469, "geo/layer_0/stable_rank_q_proj": 19.29361343383789, "geo/layer_0/stable_rank_k_proj": 16.417945861816406, "geo/layer_0/stable_rank_o_proj": 48.8575439453125, "geo/layer_0/stable_rank_gate_proj": 136.93173217773438, "geo/layer_0/stable_rank_down_proj": 53.315189361572266, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05929524824023247, "geo/layer_0/attn_entropy_mean": 6.189577102661133, "geo/layer_0/attn_entropy_std": 0.37793004512786865, "geo/layer_7/stable_rank_q_proj": 42.81486129760742, "geo/layer_7/stable_rank_k_proj": 42.166481018066406, "geo/layer_7/stable_rank_o_proj": 97.65694427490234, "geo/layer_7/stable_rank_gate_proj": 89.0638427734375, "geo/layer_7/stable_rank_down_proj": 146.2794647216797, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4996698200702667, "geo/layer_7/attn_entropy_mean": 4.6677446365356445, "geo/layer_7/attn_entropy_std": 0.8208427429199219, "geo/layer_14/stable_rank_q_proj": 54.24923324584961, "geo/layer_14/stable_rank_k_proj": 37.637733459472656, "geo/layer_14/stable_rank_o_proj": 47.717041015625, "geo/layer_14/stable_rank_gate_proj": 76.34326171875, "geo/layer_14/stable_rank_down_proj": 132.66851806640625, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3992844223976135, "geo/layer_14/attn_entropy_mean": 5.540088653564453, "geo/layer_14/attn_entropy_std": 0.3549460172653198, "geo/layer_21/stable_rank_q_proj": 43.25448226928711, "geo/layer_21/stable_rank_k_proj": 30.702444076538086, "geo/layer_21/stable_rank_o_proj": 75.69084167480469, "geo/layer_21/stable_rank_gate_proj": 72.44255065917969, "geo/layer_21/stable_rank_down_proj": 55.104148864746094, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1517697274684906, "geo/layer_21/attn_entropy_mean": 5.712133407592773, "geo/layer_21/attn_entropy_std": 0.2879338264465332, "geo/layer_27/stable_rank_q_proj": 42.422203063964844, "geo/layer_27/stable_rank_k_proj": 31.604177474975586, "geo/layer_27/stable_rank_o_proj": 116.30575561523438, "geo/layer_27/stable_rank_gate_proj": 84.84650421142578, "geo/layer_27/stable_rank_down_proj": 131.32118225097656, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08694756776094437, "geo/layer_27/attn_entropy_mean": 4.249373435974121, "geo/layer_27/attn_entropy_std": 0.6629052758216858, "attnres/final_alpha/block_0": 0.243021160364151, "attnres/block_norm/0": 1.7251596450805664, "attnres/final_alpha/block_1": 0.005159967113286257, "attnres/block_norm/1": 40858.19140625, "attnres/final_alpha/block_2": 0.011180035769939423, "attnres/block_norm/2": 26586.8125, "attnres/final_alpha/block_3": 0.013157233595848083, "attnres/block_norm/3": 48272.13671875, "attnres/final_alpha/block_4": 0.016000041738152504, "attnres/block_norm/4": 13091.9599609375, "attnres/final_alpha/block_5": 0.595524251461029, "attnres/block_norm/5": 6139.82373046875, "attnres/final_alpha/block_6": 0.11595730483531952, "attnres/block_norm/6": 31819.42578125, "geo/tier1_time_s": 1.3636667728424072, "geo/step": 41775.0, "geo/rankme_slope": -7.729400353891557e-06} {"step": 41780, "timestamp": 1778239699.2897048, "train/loss": 2.1576470136642456, "train/z_loss": 0.0014050274505279959, "train/perplexity": 8.650758575292238, "train/grad_norm": 0.1728515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1702303.0221399816, "perf/iters_per_sec": 0.8117213354778202, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2319498777389526, "data/tokens_consumed": 87621107712, "data/tokens_consumed_B": 87.621107712, "train/loss_slope": -1.543878713528243e-05} {"step": 41790, "timestamp": 1778239709.6409, "train/loss": 2.1854694843292237, "train/z_loss": 0.0014113431912846862, "train/perplexity": 8.8948235535157, "train/grad_norm": 0.08837890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027168.8039000877, "perf/iters_per_sec": 0.9666294116497458, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345226287841798, "data/tokens_consumed": 87642079232, "data/tokens_consumed_B": 87.642079232, "train/loss_slope": -1.6905262446639543e-05} {"step": 41800, "timestamp": 1778239719.970659, "grad/layer_0/attn": 0.0026062014512717724, "grad/layer_0/mlp": 0.0028286741580814123, "grad/layer_0/attn_mlp_ratio": 0.9213508567930564, "grad/layer_4/attn": 0.0020391112193465233, "grad/layer_4/mlp": 0.0026009969878941774, "grad/layer_4/attn_mlp_ratio": 0.7839728959471581, "grad/layer_8/attn": 0.004419989418238401, "grad/layer_8/mlp": 0.0038021740037947893, "grad/layer_8/attn_mlp_ratio": 1.162490011655962, "grad/layer_12/attn": 0.004154691472649574, "grad/layer_12/mlp": 0.006732659880071878, "grad/layer_12/attn_mlp_ratio": 0.6170950983633691, "grad/layer_16/attn": 0.0033593049738556147, "grad/layer_16/mlp": 0.004566285293549299, "grad/layer_16/attn_mlp_ratio": 0.7356756497526936, "grad/layer_20/attn": 0.0072845714166760445, "grad/layer_20/mlp": 0.005859214346855879, "grad/layer_20/attn_mlp_ratio": 1.2432675886415163, "grad/layer_24/attn": 0.012903058901429176, "grad/layer_24/mlp": 0.010572719387710094, "grad/layer_24/attn_mlp_ratio": 1.2204105969545402, "grad/layer_27/attn": 0.003946531098335981, "grad/layer_27/mlp": 0.011502702720463276, "grad/layer_27/attn_mlp_ratio": 0.3430959801304363} {"step": 41800, "timestamp": 1778239719.9850883, "train/loss": 2.1959867000579836, "train/z_loss": 0.0013992049382068217, "train/perplexity": 8.988865997182186, "train/grad_norm": 0.189453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028515.66163296, "perf/iters_per_sec": 0.9672716434635925, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0338357448577882, "data/tokens_consumed": 87663050752, "data/tokens_consumed_B": 87.663050752, "train/loss_slope": -1.6718172061346736e-05} {"step": 41810, "timestamp": 1778239730.332554, "train/loss": 2.1523855686187745, "train/z_loss": 0.0014102159067988396, "train/perplexity": 8.605362613350133, "train/grad_norm": 0.1103515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027760.1070376756, "perf/iters_per_sec": 0.9669113669575098, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342209577560424, "data/tokens_consumed": 87684022272, "data/tokens_consumed_B": 87.684022272, "train/loss_slope": -1.6786405069492507e-05} {"step": 41820, "timestamp": 1778239740.6727023, "train/loss": 2.20040442943573, "train/z_loss": 0.001399043563287705, "train/perplexity": 9.028664218730563, "train/grad_norm": 0.271484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029569.523582744, "perf/iters_per_sec": 0.9677741640008659, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033298921585083, "data/tokens_consumed": 87704993792, "data/tokens_consumed_B": 87.704993792, "train/loss_slope": -1.4262791478236835e-05} {"step": 41830, "timestamp": 1778239751.0150707, "train/loss": 2.1721550464630126, "train/z_loss": 0.0014090492157265544, "train/perplexity": 8.777178902124325, "train/grad_norm": 0.138671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028899.850887587, "perf/iters_per_sec": 0.9674548391759811, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0336399793624877, "data/tokens_consumed": 87725965312, "data/tokens_consumed_B": 87.725965312, "train/loss_slope": -1.4824733210511256e-05} {"step": 41840, "timestamp": 1778239761.36476, "train/loss": 2.1956074476242065, "train/z_loss": 0.0014097727136686445, "train/perplexity": 8.985457594239268, "train/grad_norm": 0.181640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027211.6925015117, "perf/iters_per_sec": 0.9666498625285681, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034500741958618, "data/tokens_consumed": 87746936832, "data/tokens_consumed_B": 87.746936832, "train/loss_slope": -8.235959852203155e-06} {"step": 41850, "timestamp": 1778239771.6967812, "grad/layer_0/attn": 0.002964137587696314, "grad/layer_0/mlp": 0.0029887156561017036, "grad/layer_0/attn_mlp_ratio": 0.9917763446205242, "grad/layer_4/attn": 0.002010270720347762, "grad/layer_4/mlp": 0.0024900739081203938, "grad/layer_4/attn_mlp_ratio": 0.8073136436074018, "grad/layer_8/attn": 0.004903927445411682, "grad/layer_8/mlp": 0.0036888199392706156, "grad/layer_8/attn_mlp_ratio": 1.32940273399223, "grad/layer_12/attn": 0.003647848730906844, "grad/layer_12/mlp": 0.006741343066096306, "grad/layer_12/attn_mlp_ratio": 0.5411160121995682, "grad/layer_16/attn": 0.003623599885031581, "grad/layer_16/mlp": 0.0045587606728076935, "grad/layer_16/attn_mlp_ratio": 0.7948651104146103, "grad/layer_20/attn": 0.0036396449431777, "grad/layer_20/mlp": 0.0066417427733540535, "grad/layer_20/attn_mlp_ratio": 0.5479954603150263, "grad/layer_24/attn": 0.009489000774919987, "grad/layer_24/mlp": 0.010459189303219318, "grad/layer_24/attn_mlp_ratio": 0.9072405526951535, "grad/layer_27/attn": 0.005536697804927826, "grad/layer_27/mlp": 0.010019984096288681, "grad/layer_27/attn_mlp_ratio": 0.5525655227059713} {"step": 41850, "timestamp": 1778239772.298756, "eos/sharpness": 49.96931552886962, "eos/L0_probe": 1.9938009977340698, "eos/L_plus": 2.2803282737731934, "eos/L_minus": 2.2069668769836426, "eos/grad_norm": 0.15123584866523743, "eos/embed_grad_frac": 0.10229663550853729, "eos/time_s": 0.598827600479126} {"step": 41850, "timestamp": 1778239772.316985, "train/loss": 2.183627128601074, "train/z_loss": 0.0014095286489464343, "train/perplexity": 8.878451210861655, "train/grad_norm": 0.1513671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1916180.1649104373, "perf/iters_per_sec": 0.9137059044410883, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0944440603256225, "data/tokens_consumed": 87767908352, "data/tokens_consumed_B": 87.767908352, "train/loss_slope": -7.666213541749044e-06} {"step": 41850, "timestamp": 1778239773.678709, "geo/rankme_last": 439.4488220214844, "geo/layer_0/stable_rank_q_proj": 19.287269592285156, "geo/layer_0/stable_rank_k_proj": 16.402101516723633, "geo/layer_0/stable_rank_o_proj": 48.75139236450195, "geo/layer_0/stable_rank_gate_proj": 136.62393188476562, "geo/layer_0/stable_rank_down_proj": 53.36016845703125, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0574396513402462, "geo/layer_0/attn_entropy_mean": 6.1922736167907715, "geo/layer_0/attn_entropy_std": 0.3812437355518341, "geo/layer_7/stable_rank_q_proj": 42.912078857421875, "geo/layer_7/stable_rank_k_proj": 42.17340087890625, "geo/layer_7/stable_rank_o_proj": 97.70054626464844, "geo/layer_7/stable_rank_gate_proj": 89.00273132324219, "geo/layer_7/stable_rank_down_proj": 146.189208984375, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.503136932849884, "geo/layer_7/attn_entropy_mean": 4.634574890136719, "geo/layer_7/attn_entropy_std": 0.8193787336349487, "geo/layer_14/stable_rank_q_proj": 54.2294921875, "geo/layer_14/stable_rank_k_proj": 37.60860061645508, "geo/layer_14/stable_rank_o_proj": 47.69184112548828, "geo/layer_14/stable_rank_gate_proj": 76.24689483642578, "geo/layer_14/stable_rank_down_proj": 132.41732788085938, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38645562529563904, "geo/layer_14/attn_entropy_mean": 5.498102188110352, "geo/layer_14/attn_entropy_std": 0.37155574560165405, "geo/layer_21/stable_rank_q_proj": 43.22280502319336, "geo/layer_21/stable_rank_k_proj": 30.697288513183594, "geo/layer_21/stable_rank_o_proj": 75.54363250732422, "geo/layer_21/stable_rank_gate_proj": 72.4167251586914, "geo/layer_21/stable_rank_down_proj": 55.11907958984375, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14964160323143005, "geo/layer_21/attn_entropy_mean": 5.727518081665039, "geo/layer_21/attn_entropy_std": 0.29033946990966797, "geo/layer_27/stable_rank_q_proj": 42.44187927246094, "geo/layer_27/stable_rank_k_proj": 31.655290603637695, "geo/layer_27/stable_rank_o_proj": 116.4176025390625, "geo/layer_27/stable_rank_gate_proj": 84.63745880126953, "geo/layer_27/stable_rank_down_proj": 131.29049682617188, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08304315060377121, "geo/layer_27/attn_entropy_mean": 4.26024055480957, "geo/layer_27/attn_entropy_std": 0.6857293248176575, "attnres/final_alpha/block_0": 0.24091731011867523, "attnres/block_norm/0": 1.7252366542816162, "attnres/final_alpha/block_1": 0.005117062013596296, "attnres/block_norm/1": 40929.13671875, "attnres/final_alpha/block_2": 0.010951112024486065, "attnres/block_norm/2": 26702.892578125, "attnres/final_alpha/block_3": 0.012970498763024807, "attnres/block_norm/3": 48025.078125, "attnres/final_alpha/block_4": 0.015707172453403473, "attnres/block_norm/4": 13070.064453125, "attnres/final_alpha/block_5": 0.599611759185791, "attnres/block_norm/5": 6057.005859375, "attnres/final_alpha/block_6": 0.11472508311271667, "attnres/block_norm/6": 31933.4375, "geo/tier1_time_s": 1.35819411277771, "geo/step": 41850.0, "geo/rankme_slope": -8.31385288490396e-06} {"step": 41860, "timestamp": 1778239784.0252473, "train/loss": 2.139193820953369, "train/z_loss": 0.001421795762144029, "train/perplexity": 8.492588321616532, "train/grad_norm": 0.10498046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1791683.8440280342, "perf/iters_per_sec": 0.8543414325847789, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1704922199249268, "data/tokens_consumed": 87788879872, "data/tokens_consumed_B": 87.788879872, "train/loss_slope": -8.681400810102058e-06} {"step": 41870, "timestamp": 1778239794.3732283, "train/loss": 2.183210515975952, "train/z_loss": 0.0014086472569033504, "train/perplexity": 8.874753106387674, "train/grad_norm": 0.1259765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027771.0456325037, "perf/iters_per_sec": 0.9669165828859824, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342153787612915, "data/tokens_consumed": 87809851392, "data/tokens_consumed_B": 87.809851392, "train/loss_slope": -7.737199186456124e-06} {"step": 41880, "timestamp": 1778239805.1146593, "train/loss": 2.142328453063965, "train/z_loss": 0.0014200826874002815, "train/perplexity": 8.51925122904214, "train/grad_norm": 0.224609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1953900.3940297468, "perf/iters_per_sec": 0.9316923113011106, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.073315715789795, "data/tokens_consumed": 87830822912, "data/tokens_consumed_B": 87.830822912, "train/loss_slope": -1.051038089114697e-05} {"step": 41890, "timestamp": 1778239815.4626155, "train/loss": 2.219122815132141, "train/z_loss": 0.0014121488202363253, "train/perplexity": 9.199257875843326, "train/grad_norm": 0.1845703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027951.0351854314, "perf/iters_per_sec": 0.9670024085929066, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341235876083374, "data/tokens_consumed": 87851794432, "data/tokens_consumed_B": 87.851794432, "train/loss_slope": -6.1428763935811285e-06} {"step": 41900, "timestamp": 1778239825.7975817, "grad/layer_0/attn": 0.0030639630276709795, "grad/layer_0/mlp": 0.0028799078427255154, "grad/layer_0/attn_mlp_ratio": 1.0639100584483532, "grad/layer_4/attn": 0.0019885783549398184, "grad/layer_4/mlp": 0.002448718762025237, "grad/layer_4/attn_mlp_ratio": 0.8120892870875119, "grad/layer_8/attn": 0.0038809694815427065, "grad/layer_8/mlp": 0.003700205823406577, "grad/layer_8/attn_mlp_ratio": 1.0488522968391203, "grad/layer_12/attn": 0.0047442070208489895, "grad/layer_12/mlp": 0.006399052683264017, "grad/layer_12/attn_mlp_ratio": 0.741392074973489, "grad/layer_16/attn": 0.0038087619468569756, "grad/layer_16/mlp": 0.004276579711586237, "grad/layer_16/attn_mlp_ratio": 0.8906093454723246, "grad/layer_20/attn": 0.0032975308131426573, "grad/layer_20/mlp": 0.005400892347097397, "grad/layer_20/attn_mlp_ratio": 0.6105529494324314, "grad/layer_24/attn": 0.013010416179895401, "grad/layer_24/mlp": 0.008668899536132812, "grad/layer_24/attn_mlp_ratio": 1.500815181394733, "grad/layer_27/attn": 0.00412767706438899, "grad/layer_27/mlp": 0.00845513679087162, "grad/layer_27/attn_mlp_ratio": 0.4881857168800348} {"step": 41900, "timestamp": 1778239825.8120947, "train/loss": 2.199196994304657, "train/z_loss": 0.0013992383959703148, "train/perplexity": 9.017769271161075, "train/grad_norm": 0.1337890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027333.5471448482, "perf/iters_per_sec": 0.9667079673504105, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344385623931884, "data/tokens_consumed": 87872765952, "data/tokens_consumed_B": 87.872765952, "train/loss_slope": -3.3018402390890696e-06} {"step": 41910, "timestamp": 1778239836.155467, "train/loss": 2.241396689414978, "train/z_loss": 0.0013862449442967773, "train/perplexity": 9.406460020070051, "train/grad_norm": 0.15234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028852.1643947037, "perf/iters_per_sec": 0.9674321004842299, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0336642742156983, "data/tokens_consumed": 87893737472, "data/tokens_consumed_B": 87.893737472, "train/loss_slope": 4.512642919927813e-06} {"step": 41920, "timestamp": 1778239846.5007365, "train/loss": 2.165039539337158, "train/z_loss": 0.001410345046315342, "train/perplexity": 8.714946493246748, "train/grad_norm": 0.10986328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028264.199649554, "perf/iters_per_sec": 0.9671517370460291, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033963918685913, "data/tokens_consumed": 87914708992, "data/tokens_consumed_B": 87.914708992, "train/loss_slope": 3.611309600598859e-06} {"step": 41925, "timestamp": 1778239852.2663846, "eos/sharpness": 33.59370231628417, "eos/L0_probe": 1.9987047910690308, "eos/L_plus": 2.175189256668091, "eos/L_minus": 2.1581573486328125, "eos/grad_norm": 0.1431114375591278, "eos/embed_grad_frac": 0.12982484698295593, "eos/time_s": 0.6009438037872314} {"step": 41925, "timestamp": 1778239853.6434708, "geo/rankme_last": 438.87646484375, "geo/layer_0/stable_rank_q_proj": 19.28182029724121, "geo/layer_0/stable_rank_k_proj": 16.371517181396484, "geo/layer_0/stable_rank_o_proj": 48.772769927978516, "geo/layer_0/stable_rank_gate_proj": 136.60755920410156, "geo/layer_0/stable_rank_down_proj": 53.29533767700195, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0633217841386795, "geo/layer_0/attn_entropy_mean": 6.196194648742676, "geo/layer_0/attn_entropy_std": 0.3813488185405731, "geo/layer_7/stable_rank_q_proj": 42.887935638427734, "geo/layer_7/stable_rank_k_proj": 42.24774169921875, "geo/layer_7/stable_rank_o_proj": 97.72347259521484, "geo/layer_7/stable_rank_gate_proj": 88.98780822753906, "geo/layer_7/stable_rank_down_proj": 146.08187866210938, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.500872015953064, "geo/layer_7/attn_entropy_mean": 4.714939117431641, "geo/layer_7/attn_entropy_std": 0.8406795263290405, "geo/layer_14/stable_rank_q_proj": 54.1593017578125, "geo/layer_14/stable_rank_k_proj": 37.757930755615234, "geo/layer_14/stable_rank_o_proj": 47.723087310791016, "geo/layer_14/stable_rank_gate_proj": 76.30113983154297, "geo/layer_14/stable_rank_down_proj": 132.6847686767578, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.396965891122818, "geo/layer_14/attn_entropy_mean": 5.540591239929199, "geo/layer_14/attn_entropy_std": 0.38052448630332947, "geo/layer_21/stable_rank_q_proj": 43.206260681152344, "geo/layer_21/stable_rank_k_proj": 30.66287612915039, "geo/layer_21/stable_rank_o_proj": 75.48681640625, "geo/layer_21/stable_rank_gate_proj": 72.44231414794922, "geo/layer_21/stable_rank_down_proj": 55.106998443603516, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15050850808620453, "geo/layer_21/attn_entropy_mean": 5.716963768005371, "geo/layer_21/attn_entropy_std": 0.2834332287311554, "geo/layer_27/stable_rank_q_proj": 42.412532806396484, "geo/layer_27/stable_rank_k_proj": 31.563947677612305, "geo/layer_27/stable_rank_o_proj": 116.23074340820312, "geo/layer_27/stable_rank_gate_proj": 84.65433502197266, "geo/layer_27/stable_rank_down_proj": 131.14122009277344, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0843271017074585, "geo/layer_27/attn_entropy_mean": 4.2617082595825195, "geo/layer_27/attn_entropy_std": 0.652365505695343, "attnres/final_alpha/block_0": 0.24207715690135956, "attnres/block_norm/0": 1.7253117561340332, "attnres/final_alpha/block_1": 0.005172674544155598, "attnres/block_norm/1": 40859.9921875, "attnres/final_alpha/block_2": 0.011262839660048485, "attnres/block_norm/2": 26714.34375, "attnres/final_alpha/block_3": 0.013106431812047958, "attnres/block_norm/3": 48112.9140625, "attnres/final_alpha/block_4": 0.015898220241069794, "attnres/block_norm/4": 13062.6875, "attnres/final_alpha/block_5": 0.5984960794448853, "attnres/block_norm/5": 6096.41796875, "attnres/final_alpha/block_6": 0.1139865294098854, "attnres/block_norm/6": 32116.48828125, "geo/tier1_time_s": 1.3577110767364502, "geo/step": 41925.0, "geo/rankme_slope": -2.757491668542417e-05} {"step": 41930, "timestamp": 1778239858.8218153, "train/loss": 2.1866217851638794, "train/z_loss": 0.001395559695083648, "train/perplexity": 8.905078973650362, "train/grad_norm": 0.19921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1702737.3094774021, "perf/iters_per_sec": 0.8119284198176394, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2316356658935548, "data/tokens_consumed": 87935680512, "data/tokens_consumed_B": 87.935680512, "train/loss_slope": 1.2358686854593785e-06} {"step": 41940, "timestamp": 1778239869.1626892, "train/loss": 2.175285053253174, "train/z_loss": 0.0014093060977756976, "train/perplexity": 8.804694571338057, "train/grad_norm": 0.1337890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029023.1722809763, "perf/iters_per_sec": 0.9675136433987505, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0335771560668945, "data/tokens_consumed": 87956652032, "data/tokens_consumed_B": 87.956652032, "train/loss_slope": 3.1613082739815417e-06} {"step": 41950, "timestamp": 1778239879.495557, "grad/layer_0/attn": 0.0036611934192478657, "grad/layer_0/mlp": 0.003145181108266115, "grad/layer_0/attn_mlp_ratio": 1.1640643819267313, "grad/layer_4/attn": 0.003674770938232541, "grad/layer_4/mlp": 0.0024839916732162237, "grad/layer_4/attn_mlp_ratio": 1.4793812837288587, "grad/layer_8/attn": 0.004352067597210407, "grad/layer_8/mlp": 0.00369284744374454, "grad/layer_8/attn_mlp_ratio": 1.1785126641858112, "grad/layer_12/attn": 0.004677029326558113, "grad/layer_12/mlp": 0.00660787895321846, "grad/layer_12/attn_mlp_ratio": 0.7077958432486897, "grad/layer_16/attn": 0.003661937080323696, "grad/layer_16/mlp": 0.0045908051542937756, "grad/layer_16/attn_mlp_ratio": 0.7976677026102753, "grad/layer_20/attn": 0.003489379771053791, "grad/layer_20/mlp": 0.005798819940537214, "grad/layer_20/attn_mlp_ratio": 0.6017396205884893, "grad/layer_24/attn": 0.012339377775788307, "grad/layer_24/mlp": 0.010010664351284504, "grad/layer_24/attn_mlp_ratio": 1.2326232525159704, "grad/layer_27/attn": 0.004795518238097429, "grad/layer_27/mlp": 0.009699316695332527, "grad/layer_27/attn_mlp_ratio": 0.4944181471013621} {"step": 41950, "timestamp": 1778239879.5099244, "train/loss": 2.1755823612213137, "train/z_loss": 0.0014147819369100035, "train/perplexity": 8.807312666362122, "train/grad_norm": 0.1572265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027829.3867981173, "perf/iters_per_sec": 0.9669444021215998, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341856241226197, "data/tokens_consumed": 87977623552, "data/tokens_consumed_B": 87.977623552, "train/loss_slope": 1.4173691458005437e-06} {"step": 41960, "timestamp": 1778239889.8668666, "train/loss": 2.152148938179016, "train/z_loss": 0.0014083332731388508, "train/perplexity": 8.603326563515841, "train/grad_norm": 0.140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025847.0154386288, "perf/iters_per_sec": 0.9659991337960381, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351976156234741, "data/tokens_consumed": 87998595072, "data/tokens_consumed_B": 87.998595072, "train/loss_slope": 1.8632978495986482e-07} {"step": 41970, "timestamp": 1778239900.2068427, "train/loss": 2.1490169048309324, "train/z_loss": 0.0014183368533849717, "train/perplexity": 8.576422811531646, "train/grad_norm": 0.142578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029313.8675853945, "perf/iters_per_sec": 0.9676522577216122, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0334290981292724, "data/tokens_consumed": 88019566592, "data/tokens_consumed_B": 88.019566592, "train/loss_slope": -4.9535032868050465e-08} {"step": 41980, "timestamp": 1778239910.5451524, "train/loss": 2.1641267776489257, "train/z_loss": 0.0014202377176843583, "train/perplexity": 8.706995453227094, "train/grad_norm": 0.1982421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029554.2573322065, "perf/iters_per_sec": 0.9677668844853432, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0333066940307618, "data/tokens_consumed": 88040538112, "data/tokens_consumed_B": 88.040538112, "train/loss_slope": -9.199329371070184e-07} {"step": 41990, "timestamp": 1778239920.8959565, "train/loss": 2.157395362854004, "train/z_loss": 0.0014081696979701519, "train/perplexity": 8.648581878782768, "train/grad_norm": 0.1494140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027126.1974132706, "perf/iters_per_sec": 0.9666090952936509, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345443725585937, "data/tokens_consumed": 88061509632, "data/tokens_consumed_B": 88.061509632, "train/loss_slope": -3.825894147470605e-06} {"step": 42000, "timestamp": 1778239931.2254121, "grad/layer_0/attn": 0.0028523572254925966, "grad/layer_0/mlp": 0.002749794628471136, "grad/layer_0/attn_mlp_ratio": 1.0372982375591666, "grad/layer_4/attn": 0.001946445438079536, "grad/layer_4/mlp": 0.0024362450931221247, "grad/layer_4/attn_mlp_ratio": 0.7989530132577112, "grad/layer_8/attn": 0.004736926406621933, "grad/layer_8/mlp": 0.0036848881281912327, "grad/layer_8/attn_mlp_ratio": 1.2855007026758867, "grad/layer_12/attn": 0.00509998295456171, "grad/layer_12/mlp": 0.007281146477907896, "grad/layer_12/attn_mlp_ratio": 0.7004367924738437, "grad/layer_16/attn": 0.005176670849323273, "grad/layer_16/mlp": 0.004683578386902809, "grad/layer_16/attn_mlp_ratio": 1.1052811143870762, "grad/layer_20/attn": 0.006297921296209097, "grad/layer_20/mlp": 0.006491638720035553, "grad/layer_20/attn_mlp_ratio": 0.9701589183876692, "grad/layer_24/attn": 0.017949387431144714, "grad/layer_24/mlp": 0.0103604132309556, "grad/layer_24/attn_mlp_ratio": 1.7324972332439887, "grad/layer_27/attn": 0.009334011003375053, "grad/layer_27/mlp": 0.01059089507907629, "grad/layer_27/attn_mlp_ratio": 0.8813240850325498} {"step": 42000, "timestamp": 1778239931.8275852, "eos/sharpness": 79.69851493835448, "eos/L0_probe": 1.9968597888946533, "eos/L_plus": 2.3091089725494385, "eos/L_minus": 2.481595754623413, "eos/grad_norm": 0.23269832134246826, "eos/embed_grad_frac": 0.0406758114695549, "eos/time_s": 0.5994420051574707} {"step": 42000, "timestamp": 1778239931.8459554, "train/loss": 2.1459198713302614, "train/z_loss": 0.0014330653240904212, "train/perplexity": 8.549902431219326, "train/grad_norm": 0.232421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1916103.5279671806, "perf/iters_per_sec": 0.9136693610988524, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0944878339767456, "data/tokens_consumed": 88082481152, "data/tokens_consumed_B": 88.082481152, "train/loss_slope": -5.937933936120476e-06} {"step": 42000, "timestamp": 1778239933.2064993, "geo/rankme_last": 439.3333435058594, "geo/layer_0/stable_rank_q_proj": 19.278730392456055, "geo/layer_0/stable_rank_k_proj": 16.353853225708008, "geo/layer_0/stable_rank_o_proj": 48.81574630737305, "geo/layer_0/stable_rank_gate_proj": 136.72140502929688, "geo/layer_0/stable_rank_down_proj": 53.3176155090332, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06137523800134659, "geo/layer_0/attn_entropy_mean": 6.197361946105957, "geo/layer_0/attn_entropy_std": 0.3815251290798187, "geo/layer_7/stable_rank_q_proj": 42.839515686035156, "geo/layer_7/stable_rank_k_proj": 42.263267517089844, "geo/layer_7/stable_rank_o_proj": 97.58779907226562, "geo/layer_7/stable_rank_gate_proj": 89.04352569580078, "geo/layer_7/stable_rank_down_proj": 146.3985595703125, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5047670006752014, "geo/layer_7/attn_entropy_mean": 4.623184680938721, "geo/layer_7/attn_entropy_std": 0.8165650963783264, "geo/layer_14/stable_rank_q_proj": 54.123966217041016, "geo/layer_14/stable_rank_k_proj": 37.74567413330078, "geo/layer_14/stable_rank_o_proj": 47.738948822021484, "geo/layer_14/stable_rank_gate_proj": 76.28663635253906, "geo/layer_14/stable_rank_down_proj": 132.8548126220703, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3732450008392334, "geo/layer_14/attn_entropy_mean": 5.492480754852295, "geo/layer_14/attn_entropy_std": 0.36423245072364807, "geo/layer_21/stable_rank_q_proj": 43.23350524902344, "geo/layer_21/stable_rank_k_proj": 30.657394409179688, "geo/layer_21/stable_rank_o_proj": 75.42874145507812, "geo/layer_21/stable_rank_gate_proj": 72.43452453613281, "geo/layer_21/stable_rank_down_proj": 55.035221099853516, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15192067623138428, "geo/layer_21/attn_entropy_mean": 5.7104387283325195, "geo/layer_21/attn_entropy_std": 0.30031177401542664, "geo/layer_27/stable_rank_q_proj": 42.44975662231445, "geo/layer_27/stable_rank_k_proj": 31.608060836791992, "geo/layer_27/stable_rank_o_proj": 116.30990600585938, "geo/layer_27/stable_rank_gate_proj": 84.6585922241211, "geo/layer_27/stable_rank_down_proj": 131.3418731689453, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08749553561210632, "geo/layer_27/attn_entropy_mean": 4.253627777099609, "geo/layer_27/attn_entropy_std": 0.6820307970046997, "attnres/final_alpha/block_0": 0.2431258112192154, "attnres/block_norm/0": 1.7256379127502441, "attnres/final_alpha/block_1": 0.005159899592399597, "attnres/block_norm/1": 40801.83984375, "attnres/final_alpha/block_2": 0.01121225580573082, "attnres/block_norm/2": 26635.70703125, "attnres/final_alpha/block_3": 0.013252376578748226, "attnres/block_norm/3": 47815.26953125, "attnres/final_alpha/block_4": 0.015896419063210487, "attnres/block_norm/4": 13121.87890625, "attnres/final_alpha/block_5": 0.5940852165222168, "attnres/block_norm/5": 6137.3251953125, "attnres/final_alpha/block_6": 0.11726799607276917, "attnres/block_norm/6": 31824.423828125, "geo/tier1_time_s": 1.3565661907196045, "geo/step": 42000.0, "geo/rankme_slope": -5.3522834915216086e-05} {"step": 42000, "timestamp": 1778239940.3667276, "geo/ww_alpha_mean": 7.922350360389992, "geo/ww_alpha_std": 4.770318184901103, "geo/ww_alpha_min": 2.609456780323366, "geo/ww_alpha_max": 31.863594803369672, "geo/ww_alpha_healthy_frac": 0.16751269035532995, "geo/ww_alpha_by_type/q_proj": 4.0508306676221215, "geo/ww_alpha_by_type/k_proj": 4.719459128118223, "geo/ww_alpha_by_type/v_proj": 8.970696913916766, "geo/ww_alpha_by_type/o_proj": 9.965904157338983, "geo/ww_alpha_by_type/gate_proj": 7.7039008854490465, "geo/ww_alpha_by_type/up_proj": 11.757103500904662, "geo/ww_alpha_by_type/down_proj": 8.402609485700097, "geo/twonn_id/layer_0": 0.6645107865333557, "geo/twonn_id/layer_7": 3.3122451305389404, "geo/twonn_id/layer_14": 4.9220871925354, "geo/twonn_id/layer_21": 6.7103142738342285, "geo/twonn_id/layer_27": 6.093667030334473, "geo/tier2_time_s": 7.152913570404053} {"step": 42000, "timestamp": 1778239941.1128535, "eoc/jacobian_sigma/layer_0/attn": 1187.7178955078125, "eoc/jacobian_sigma/layer_0/mlp": 8482.580078125, "eoc/jacobian_sigma/layer_0": 8482.580078125, "eoc/jacobian_sigma/layer_7/attn": 1.1379674673080444, "eoc/jacobian_sigma/layer_7/mlp": 1.828607201576233, "eoc/jacobian_sigma/layer_7": 1.828607201576233, "eoc/jacobian_sigma/layer_14/attn": 1.5966897010803223, "eoc/jacobian_sigma/layer_14/mlp": 6.754530906677246, "eoc/jacobian_sigma/layer_14": 6.754530906677246, "eoc/jacobian_sigma/layer_21/attn": 1.090198278427124, "eoc/jacobian_sigma/layer_21/mlp": 3.8807103633880615, "eoc/jacobian_sigma/layer_21": 3.8807103633880615, "eoc/jacobian_sigma/layer_27/attn": 3.5817134380340576, "eoc/jacobian_sigma/layer_27/mlp": 28.124271392822266, "eoc/jacobian_sigma/layer_27": 28.124271392822266, "eoc/layer0_sigma": 8482.580078125, "eoc/sigma_max": 28.124271392822266, "eoc/sigma_min": 1.828607201576233, "eoc/sigma_mean": 10.147029966115952, "eoc/time_s": 0.7385008335113525} {"step": 42010, "timestamp": 1778239951.4797552, "train/loss": 2.200884795188904, "train/z_loss": 0.001401662698481232, "train/perplexity": 9.033002321672775, "train/grad_norm": 0.1865234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1068335.3974370302, "perf/iters_per_sec": 0.5094220149216796, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.963008999824524, "data/tokens_consumed": 88103452672, "data/tokens_consumed_B": 88.103452672, "train/loss_slope": -2.021979892214496e-06} {"step": 42020, "timestamp": 1778239961.8250575, "train/loss": 2.173549008369446, "train/z_loss": 0.0014136982848867775, "train/perplexity": 8.789422486722527, "train/grad_norm": 0.263671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028350.9600858835, "perf/iters_per_sec": 0.9671931076459329, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0339196920394897, "data/tokens_consumed": 88124424192, "data/tokens_consumed_B": 88.124424192, "train/loss_slope": -3.272885264772307e-06} {"step": 42030, "timestamp": 1778239972.162702, "train/loss": 2.1595109701156616, "train/z_loss": 0.0013941628043539823, "train/perplexity": 8.66689824970053, "train/grad_norm": 0.1875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029601.2743730708, "perf/iters_per_sec": 0.9677893039574961, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03328275680542, "data/tokens_consumed": 88145395712, "data/tokens_consumed_B": 88.145395712, "train/loss_slope": -3.203712030462789e-06} {"step": 42040, "timestamp": 1778239982.5016153, "train/loss": 2.2074441075325013, "train/z_loss": 0.0013973087538033725, "train/perplexity": 9.092447351426824, "train/grad_norm": 0.1123046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029464.537629834, "perf/iters_per_sec": 0.967724102797429, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0333523750305176, "data/tokens_consumed": 88166367232, "data/tokens_consumed_B": 88.166367232, "train/loss_slope": 3.855006386488113e-07} {"step": 42050, "timestamp": 1778239993.3685906, "grad/layer_0/attn": 0.002596496604382992, "grad/layer_0/mlp": 0.0027518486604094505, "grad/layer_0/attn_mlp_ratio": 0.9435462594233036, "grad/layer_4/attn": 0.001906918827444315, "grad/layer_4/mlp": 0.002454021479934454, "grad/layer_4/attn_mlp_ratio": 0.7770587035731154, "grad/layer_8/attn": 0.004314493387937546, "grad/layer_8/mlp": 0.003617574693635106, "grad/layer_8/attn_mlp_ratio": 1.1926480125659389, "grad/layer_12/attn": 0.0046966285444796085, "grad/layer_12/mlp": 0.00682623079046607, "grad/layer_12/attn_mlp_ratio": 0.6880266167145337, "grad/layer_16/attn": 0.005668331868946552, "grad/layer_16/mlp": 0.004469582810997963, "grad/layer_16/attn_mlp_ratio": 1.268201526142164, "grad/layer_20/attn": 0.003616102272644639, "grad/layer_20/mlp": 0.006305175367742777, "grad/layer_20/attn_mlp_ratio": 0.5735133448933479, "grad/layer_24/attn": 0.012219510041177273, "grad/layer_24/mlp": 0.012522997334599495, "grad/layer_24/attn_mlp_ratio": 0.9757655948580071, "grad/layer_27/attn": 0.007309246342629194, "grad/layer_27/mlp": 0.011515157297253609, "grad/layer_27/attn_mlp_ratio": 0.6347500160416797} {"step": 42050, "timestamp": 1778239993.3832633, "train/loss": 2.1750919938087465, "train/z_loss": 0.0014089475153014064, "train/perplexity": 8.802994905969266, "train/grad_norm": 0.1357421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1928292.231440411, "perf/iters_per_sec": 0.9194813878252082, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0875695943832397, "data/tokens_consumed": 88187338752, "data/tokens_consumed_B": 88.187338752, "train/loss_slope": -2.6398414754309114e-06} {"step": 42060, "timestamp": 1778240004.2735221, "train/loss": 2.1601385116577148, "train/z_loss": 0.0014090015669353307, "train/perplexity": 8.672338795298609, "train/grad_norm": 0.119140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1926891.1624930177, "perf/iters_per_sec": 0.9188133060898865, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0883603811264038, "data/tokens_consumed": 88208310272, "data/tokens_consumed_B": 88.208310272, "train/loss_slope": -5.361354118562303e-06} {"step": 42070, "timestamp": 1778240014.616074, "train/loss": 2.1394253849983214, "train/z_loss": 0.001401923387311399, "train/perplexity": 8.494555127432367, "train/grad_norm": 0.1494140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029512.6275277126, "perf/iters_per_sec": 0.9677470338476718, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0333278894424438, "data/tokens_consumed": 88229281792, "data/tokens_consumed_B": 88.229281792, "train/loss_slope": -8.576017566318084e-06} {"step": 42075, "timestamp": 1778240020.4014025, "eos/sharpness": 79.05125617980956, "eos/L0_probe": 1.9943931102752686, "eos/L_plus": 2.4989066123962402, "eos/L_minus": 2.2803921699523926, "eos/grad_norm": 0.2373453825712204, "eos/embed_grad_frac": 0.038095008581876755, "eos/time_s": 0.6164717674255371} {"step": 42075, "timestamp": 1778240021.7810056, "geo/rankme_last": 438.6298522949219, "geo/layer_0/stable_rank_q_proj": 19.31597137451172, "geo/layer_0/stable_rank_k_proj": 16.40068817138672, "geo/layer_0/stable_rank_o_proj": 48.80044174194336, "geo/layer_0/stable_rank_gate_proj": 136.8564910888672, "geo/layer_0/stable_rank_down_proj": 53.28511047363281, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05972440168261528, "geo/layer_0/attn_entropy_mean": 6.197070121765137, "geo/layer_0/attn_entropy_std": 0.38194045424461365, "geo/layer_7/stable_rank_q_proj": 42.96735763549805, "geo/layer_7/stable_rank_k_proj": 42.15168380737305, "geo/layer_7/stable_rank_o_proj": 97.4942626953125, "geo/layer_7/stable_rank_gate_proj": 89.29145812988281, "geo/layer_7/stable_rank_down_proj": 146.30165100097656, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.49852079153060913, "geo/layer_7/attn_entropy_mean": 4.630703926086426, "geo/layer_7/attn_entropy_std": 0.8118925094604492, "geo/layer_14/stable_rank_q_proj": 54.277374267578125, "geo/layer_14/stable_rank_k_proj": 37.851043701171875, "geo/layer_14/stable_rank_o_proj": 47.74430465698242, "geo/layer_14/stable_rank_gate_proj": 76.28770446777344, "geo/layer_14/stable_rank_down_proj": 132.74346923828125, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39948898553848267, "geo/layer_14/attn_entropy_mean": 5.507390975952148, "geo/layer_14/attn_entropy_std": 0.36622461676597595, "geo/layer_21/stable_rank_q_proj": 43.10483932495117, "geo/layer_21/stable_rank_k_proj": 30.62575340270996, "geo/layer_21/stable_rank_o_proj": 75.38095092773438, "geo/layer_21/stable_rank_gate_proj": 72.28227233886719, "geo/layer_21/stable_rank_down_proj": 55.09367370605469, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14610368013381958, "geo/layer_21/attn_entropy_mean": 5.714034080505371, "geo/layer_21/attn_entropy_std": 0.2928059995174408, "geo/layer_27/stable_rank_q_proj": 42.517879486083984, "geo/layer_27/stable_rank_k_proj": 31.583948135375977, "geo/layer_27/stable_rank_o_proj": 116.2032241821289, "geo/layer_27/stable_rank_gate_proj": 84.7121353149414, "geo/layer_27/stable_rank_down_proj": 131.39224243164062, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08464560657739639, "geo/layer_27/attn_entropy_mean": 4.25855827331543, "geo/layer_27/attn_entropy_std": 0.6656018495559692, "attnres/final_alpha/block_0": 0.24031373858451843, "attnres/block_norm/0": 1.7256836891174316, "attnres/final_alpha/block_1": 0.0050179604440927505, "attnres/block_norm/1": 40909.56640625, "attnres/final_alpha/block_2": 0.011365922167897224, "attnres/block_norm/2": 26636.255859375, "attnres/final_alpha/block_3": 0.013351308181881905, "attnres/block_norm/3": 48089.20703125, "attnres/final_alpha/block_4": 0.015758689492940903, "attnres/block_norm/4": 13116.34765625, "attnres/final_alpha/block_5": 0.5999162197113037, "attnres/block_norm/5": 6057.2861328125, "attnres/final_alpha/block_6": 0.11427615582942963, "attnres/block_norm/6": 32012.099609375, "geo/tier1_time_s": 1.3609182834625244, "geo/step": 42075.0, "geo/rankme_slope": -9.628656149959983e-05} {"step": 42080, "timestamp": 1778240026.9542804, "train/loss": 2.2045530915260314, "train/z_loss": 0.0014117548009380699, "train/perplexity": 9.066198901222691, "train/grad_norm": 0.1259765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1700625.6233522585, "perf/iters_per_sec": 0.8109214894067089, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2331650018692017, "data/tokens_consumed": 88250253312, "data/tokens_consumed_B": 88.250253312, "train/loss_slope": -4.293174790863952e-06} {"step": 42090, "timestamp": 1778240037.3046098, "train/loss": 2.17939692735672, "train/z_loss": 0.0014023056253790855, "train/perplexity": 8.840972901784648, "train/grad_norm": 0.1953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027342.1448048155, "perf/iters_per_sec": 0.9667120670341566, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034434175491333, "data/tokens_consumed": 88271224832, "data/tokens_consumed_B": 88.271224832, "train/loss_slope": -1.8285705847959217e-06} {"step": 42100, "timestamp": 1778240047.636561, "grad/layer_0/attn": 0.003006167709827423, "grad/layer_0/mlp": 0.0029532667249441147, "grad/layer_0/attn_mlp_ratio": 1.0179126668936558, "grad/layer_4/attn": 0.0017349333502352238, "grad/layer_4/mlp": 0.0024800323881208897, "grad/layer_4/attn_mlp_ratio": 0.6995607350086673, "grad/layer_8/attn": 0.006199829746037722, "grad/layer_8/mlp": 0.0036793143954128027, "grad/layer_8/attn_mlp_ratio": 1.6850502325276568, "grad/layer_12/attn": 0.0039021095726639032, "grad/layer_12/mlp": 0.0062561980448663235, "grad/layer_12/attn_mlp_ratio": 0.6237189875237368, "grad/layer_16/attn": 0.006425286643207073, "grad/layer_16/mlp": 0.004418421071022749, "grad/layer_16/attn_mlp_ratio": 1.4542042042858916, "grad/layer_20/attn": 0.0031709359027445316, "grad/layer_20/mlp": 0.005355771165341139, "grad/layer_20/attn_mlp_ratio": 0.5920596204816744, "grad/layer_24/attn": 0.00861598365008831, "grad/layer_24/mlp": 0.00854938942939043, "grad/layer_24/attn_mlp_ratio": 1.0077893422061246, "grad/layer_27/attn": 0.00524553656578064, "grad/layer_27/mlp": 0.008125620894134045, "grad/layer_27/attn_mlp_ratio": 0.6455551605923335} {"step": 42100, "timestamp": 1778240047.651219, "train/loss": 2.206202530860901, "train/z_loss": 0.0013988319551572204, "train/perplexity": 9.081165386069308, "train/grad_norm": 0.14453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028025.7986795746, "perf/iters_per_sec": 0.9670380586049913, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034085464477539, "data/tokens_consumed": 88292196352, "data/tokens_consumed_B": 88.292196352, "train/loss_slope": 1.0453623263212666e-06} {"step": 42110, "timestamp": 1778240058.5732555, "train/loss": 2.157445025444031, "train/z_loss": 0.001413400680758059, "train/perplexity": 8.649011400424419, "train/grad_norm": 0.10595703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1920854.0552244068, "perf/iters_per_sec": 0.9159345890161547, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0917810201644897, "data/tokens_consumed": 88313167872, "data/tokens_consumed_B": 88.313167872, "train/loss_slope": 4.892766844071024e-06} {"step": 42120, "timestamp": 1778240069.3299768, "train/loss": 2.1659495353698732, "train/z_loss": 0.0013986970647238196, "train/perplexity": 8.722880669467848, "train/grad_norm": 0.1357421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1950576.2171409219, "perf/iters_per_sec": 0.9301072202400789, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0751448631286622, "data/tokens_consumed": 88334139392, "data/tokens_consumed_B": 88.334139392, "train/loss_slope": 4.107684435302189e-06} {"step": 42130, "timestamp": 1778240079.6733606, "train/loss": 2.2175577878952026, "train/z_loss": 0.0014006830286234617, "train/perplexity": 9.18487204675147, "train/grad_norm": 0.1630859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028558.186200346, "perf/iters_per_sec": 0.9672919207574587, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0338140726089478, "data/tokens_consumed": 88355110912, "data/tokens_consumed_B": 88.355110912, "train/loss_slope": 5.896369229913839e-06} {"step": 42140, "timestamp": 1778240090.019192, "train/loss": 2.1451536417007446, "train/z_loss": 0.0014062904985621572, "train/perplexity": 8.543353751863526, "train/grad_norm": 0.08642578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027908.4425638819, "perf/iters_per_sec": 0.9669820988482866, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341453075408935, "data/tokens_consumed": 88376082432, "data/tokens_consumed_B": 88.376082432, "train/loss_slope": 5.031397083018065e-06} {"step": 42150, "timestamp": 1778240100.3568182, "grad/layer_0/attn": 0.0025254441425204277, "grad/layer_0/mlp": 0.0028079331386834383, "grad/layer_0/attn_mlp_ratio": 0.8993960781291717, "grad/layer_4/attn": 0.0021368388552218676, "grad/layer_4/mlp": 0.002543041715398431, "grad/layer_4/attn_mlp_ratio": 0.84026886317128, "grad/layer_8/attn": 0.01147879846394062, "grad/layer_8/mlp": 0.003807550761848688, "grad/layer_8/attn_mlp_ratio": 3.014745929977491, "grad/layer_12/attn": 0.004004252143204212, "grad/layer_12/mlp": 0.006416071206331253, "grad/layer_12/attn_mlp_ratio": 0.6240972009230782, "grad/layer_16/attn": 0.0035798184107989073, "grad/layer_16/mlp": 0.004723682533949614, "grad/layer_16/attn_mlp_ratio": 0.7578448190126849, "grad/layer_20/attn": 0.004045832436531782, "grad/layer_20/mlp": 0.006614085286855698, "grad/layer_20/attn_mlp_ratio": 0.61169945652231, "grad/layer_24/attn": 0.02045772597193718, "grad/layer_24/mlp": 0.013175994157791138, "grad/layer_24/attn_mlp_ratio": 1.5526514031258218, "grad/layer_27/attn": 0.008650436997413635, "grad/layer_27/mlp": 0.012533270753920078, "grad/layer_27/attn_mlp_ratio": 0.6901978819605583} {"step": 42150, "timestamp": 1778240100.9579701, "eos/sharpness": 74.2967128753662, "eos/L0_probe": 1.9959582090377808, "eos/L_plus": 2.317281484603882, "eos/L_minus": 2.417602062225342, "eos/grad_norm": 0.23536966741085052, "eos/embed_grad_frac": 0.03839336708188057, "eos/time_s": 0.5984320640563965} {"step": 42150, "timestamp": 1778240100.9781866, "train/loss": 2.1243204832077027, "train/z_loss": 0.0014179755933582782, "train/perplexity": 8.367209896309824, "train/grad_norm": 0.2353515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1914436.4397517596, "perf/iters_per_sec": 0.9128744314917372, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0954409122467041, "data/tokens_consumed": 88397053952, "data/tokens_consumed_B": 88.397053952, "train/loss_slope": 4.226676193830159e-06} {"step": 42150, "timestamp": 1778240102.3401842, "geo/rankme_last": 439.3924560546875, "geo/layer_0/stable_rank_q_proj": 19.340770721435547, "geo/layer_0/stable_rank_k_proj": 16.428897857666016, "geo/layer_0/stable_rank_o_proj": 48.79621505737305, "geo/layer_0/stable_rank_gate_proj": 137.04478454589844, "geo/layer_0/stable_rank_down_proj": 53.30033493041992, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05992765352129936, "geo/layer_0/attn_entropy_mean": 6.194882869720459, "geo/layer_0/attn_entropy_std": 0.381766676902771, "geo/layer_7/stable_rank_q_proj": 42.9693489074707, "geo/layer_7/stable_rank_k_proj": 42.173828125, "geo/layer_7/stable_rank_o_proj": 97.2106704711914, "geo/layer_7/stable_rank_gate_proj": 89.47809600830078, "geo/layer_7/stable_rank_down_proj": 145.7991485595703, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4933239221572876, "geo/layer_7/attn_entropy_mean": 4.662647724151611, "geo/layer_7/attn_entropy_std": 0.8117230534553528, "geo/layer_14/stable_rank_q_proj": 54.31202697753906, "geo/layer_14/stable_rank_k_proj": 37.76835250854492, "geo/layer_14/stable_rank_o_proj": 47.68169403076172, "geo/layer_14/stable_rank_gate_proj": 76.30526733398438, "geo/layer_14/stable_rank_down_proj": 132.85792541503906, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3853808641433716, "geo/layer_14/attn_entropy_mean": 5.53956413269043, "geo/layer_14/attn_entropy_std": 0.35735100507736206, "geo/layer_21/stable_rank_q_proj": 42.96641540527344, "geo/layer_21/stable_rank_k_proj": 30.640331268310547, "geo/layer_21/stable_rank_o_proj": 75.32795715332031, "geo/layer_21/stable_rank_gate_proj": 72.19526672363281, "geo/layer_21/stable_rank_down_proj": 55.06599044799805, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1440313160419464, "geo/layer_21/attn_entropy_mean": 5.724128723144531, "geo/layer_21/attn_entropy_std": 0.29477575421333313, "geo/layer_27/stable_rank_q_proj": 42.51217269897461, "geo/layer_27/stable_rank_k_proj": 31.579914093017578, "geo/layer_27/stable_rank_o_proj": 115.98992919921875, "geo/layer_27/stable_rank_gate_proj": 84.64840698242188, "geo/layer_27/stable_rank_down_proj": 131.43099975585938, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08409208059310913, "geo/layer_27/attn_entropy_mean": 4.268206596374512, "geo/layer_27/attn_entropy_std": 0.6579523682594299, "attnres/final_alpha/block_0": 0.24123913049697876, "attnres/block_norm/0": 1.7259986400604248, "attnres/final_alpha/block_1": 0.005159676540642977, "attnres/block_norm/1": 40852.65625, "attnres/final_alpha/block_2": 0.011074703186750412, "attnres/block_norm/2": 26627.99609375, "attnres/final_alpha/block_3": 0.013089461252093315, "attnres/block_norm/3": 48166.9453125, "attnres/final_alpha/block_4": 0.015883225947618484, "attnres/block_norm/4": 13133.3291015625, "attnres/final_alpha/block_5": 0.5964669585227966, "attnres/block_norm/5": 6132.45361328125, "attnres/final_alpha/block_6": 0.11708684265613556, "attnres/block_norm/6": 31996.896484375, "geo/tier1_time_s": 1.3585889339447021, "geo/step": 42150.0, "geo/rankme_slope": -0.00011501495129301721} {"step": 42160, "timestamp": 1778240112.6872122, "train/loss": 2.181953287124634, "train/z_loss": 0.0014010965009219945, "train/perplexity": 8.863602521621177, "train/grad_norm": 0.173828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1791558.4561068136, "perf/iters_per_sec": 0.8542816429647511, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.170574140548706, "data/tokens_consumed": 88418025472, "data/tokens_consumed_B": 88.418025472, "train/loss_slope": 4.4397279660407635e-06} {"step": 42170, "timestamp": 1778240123.0359306, "train/loss": 2.174765872955322, "train/z_loss": 0.001404886762611568, "train/perplexity": 8.800124533827386, "train/grad_norm": 0.10107421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027479.296686026, "perf/iters_per_sec": 0.9667774661474352, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343641996383668, "data/tokens_consumed": 88438996992, "data/tokens_consumed_B": 88.438996992, "train/loss_slope": 4.003208698612609e-06} {"step": 42180, "timestamp": 1778240133.392233, "train/loss": 2.1846466779708864, "train/z_loss": 0.0014032458304427565, "train/perplexity": 8.887507846257536, "train/grad_norm": 0.1396484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026590.7324193327, "perf/iters_per_sec": 0.9663537656876243, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348177194595336, "data/tokens_consumed": 88459968512, "data/tokens_consumed_B": 88.459968512, "train/loss_slope": 4.5107290904299236e-06} {"step": 42190, "timestamp": 1778240143.7363586, "train/loss": 2.208561062812805, "train/z_loss": 0.0013980434625409544, "train/perplexity": 9.102608882438293, "train/grad_norm": 0.236328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028528.5264129909, "perf/iters_per_sec": 0.9672777778687434, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0338291883468629, "data/tokens_consumed": 88480940032, "data/tokens_consumed_B": 88.480940032, "train/loss_slope": 4.456786811798838e-06} {"step": 42200, "timestamp": 1778240154.0742698, "grad/layer_0/attn": 0.0029383122455328703, "grad/layer_0/mlp": 0.003107756143435836, "grad/layer_0/attn_mlp_ratio": 0.9454770629901033, "grad/layer_4/attn": 0.0028174209874123335, "grad/layer_4/mlp": 0.0025013431441038847, "grad/layer_4/attn_mlp_ratio": 1.1263632026726043, "grad/layer_8/attn": 0.0035823313519358635, "grad/layer_8/mlp": 0.0037072249688208103, "grad/layer_8/attn_mlp_ratio": 0.9663107271432342, "grad/layer_12/attn": 0.004944896325469017, "grad/layer_12/mlp": 0.006533633917570114, "grad/layer_12/attn_mlp_ratio": 0.7568370545658517, "grad/layer_16/attn": 0.004334471188485622, "grad/layer_16/mlp": 0.004832186736166477, "grad/layer_16/attn_mlp_ratio": 0.8969999164859057, "grad/layer_20/attn": 0.0050188712775707245, "grad/layer_20/mlp": 0.007714748848229647, "grad/layer_20/attn_mlp_ratio": 0.6505553597725869, "grad/layer_24/attn": 0.021084163337945938, "grad/layer_24/mlp": 0.014527508988976479, "grad/layer_24/attn_mlp_ratio": 1.4513268041212013, "grad/layer_27/attn": 0.0071146306581795216, "grad/layer_27/mlp": 0.01521555706858635, "grad/layer_27/attn_mlp_ratio": 0.46758922984878964} {"step": 42200, "timestamp": 1778240154.0884252, "train/loss": 2.186736226081848, "train/z_loss": 0.001406074350234121, "train/perplexity": 8.906098137378594, "train/grad_norm": 0.28125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026832.9526666696, "perf/iters_per_sec": 0.9664692653020237, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346940517425538, "data/tokens_consumed": 88501911552, "data/tokens_consumed_B": 88.501911552, "train/loss_slope": 5.837939846383295e-06} {"step": 42210, "timestamp": 1778240164.4447927, "train/loss": 2.148396062850952, "train/z_loss": 0.0014122819062322377, "train/perplexity": 8.571099860738816, "train/grad_norm": 0.1904296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026486.6146532923, "perf/iters_per_sec": 0.9663041184679472, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348708868026733, "data/tokens_consumed": 88522883072, "data/tokens_consumed_B": 88.522883072, "train/loss_slope": 7.228577076190561e-06} {"step": 42220, "timestamp": 1778240174.7869565, "train/loss": 2.1275411367416384, "train/z_loss": 0.0014161437749862671, "train/perplexity": 8.394201221954848, "train/grad_norm": 0.216796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028960.2694770284, "perf/iters_per_sec": 0.9674836490044729, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0336091995239258, "data/tokens_consumed": 88543854592, "data/tokens_consumed_B": 88.543854592, "train/loss_slope": 5.348444628779878e-06} {"step": 42225, "timestamp": 1778240180.5463614, "eos/sharpness": 42.109799385070794, "eos/L0_probe": 1.9967211484909058, "eos/L_plus": 2.1918933391571045, "eos/L_minus": 2.222646951675415, "eos/grad_norm": 0.11619146168231964, "eos/embed_grad_frac": 0.17322663962841034, "eos/time_s": 0.5934672355651855} {"step": 42225, "timestamp": 1778240181.9237642, "geo/rankme_last": 439.0608825683594, "geo/layer_0/stable_rank_q_proj": 19.31736183166504, "geo/layer_0/stable_rank_k_proj": 16.449230194091797, "geo/layer_0/stable_rank_o_proj": 48.79128646850586, "geo/layer_0/stable_rank_gate_proj": 136.92999267578125, "geo/layer_0/stable_rank_down_proj": 53.36016082763672, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05982197821140289, "geo/layer_0/attn_entropy_mean": 6.1936163902282715, "geo/layer_0/attn_entropy_std": 0.3804972469806671, "geo/layer_7/stable_rank_q_proj": 42.92274475097656, "geo/layer_7/stable_rank_k_proj": 42.26655960083008, "geo/layer_7/stable_rank_o_proj": 97.0113525390625, "geo/layer_7/stable_rank_gate_proj": 89.40410614013672, "geo/layer_7/stable_rank_down_proj": 145.99244689941406, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5027087330818176, "geo/layer_7/attn_entropy_mean": 4.66684627532959, "geo/layer_7/attn_entropy_std": 0.8148603439331055, "geo/layer_14/stable_rank_q_proj": 54.23830032348633, "geo/layer_14/stable_rank_k_proj": 37.74916458129883, "geo/layer_14/stable_rank_o_proj": 47.66810989379883, "geo/layer_14/stable_rank_gate_proj": 76.3552017211914, "geo/layer_14/stable_rank_down_proj": 132.8779754638672, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.40242400765419006, "geo/layer_14/attn_entropy_mean": 5.49901008605957, "geo/layer_14/attn_entropy_std": 0.3702167570590973, "geo/layer_21/stable_rank_q_proj": 42.96923065185547, "geo/layer_21/stable_rank_k_proj": 30.665891647338867, "geo/layer_21/stable_rank_o_proj": 75.32736206054688, "geo/layer_21/stable_rank_gate_proj": 72.16732788085938, "geo/layer_21/stable_rank_down_proj": 55.04989242553711, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1481035202741623, "geo/layer_21/attn_entropy_mean": 5.714670658111572, "geo/layer_21/attn_entropy_std": 0.29569679498672485, "geo/layer_27/stable_rank_q_proj": 42.46154022216797, "geo/layer_27/stable_rank_k_proj": 31.637758255004883, "geo/layer_27/stable_rank_o_proj": 115.8255386352539, "geo/layer_27/stable_rank_gate_proj": 84.6221923828125, "geo/layer_27/stable_rank_down_proj": 131.05795288085938, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09111585468053818, "geo/layer_27/attn_entropy_mean": 4.254307746887207, "geo/layer_27/attn_entropy_std": 0.6613641977310181, "attnres/final_alpha/block_0": 0.24150815606117249, "attnres/block_norm/0": 1.726233720779419, "attnres/final_alpha/block_1": 0.005142892710864544, "attnres/block_norm/1": 41009.33203125, "attnres/final_alpha/block_2": 0.011047991923987865, "attnres/block_norm/2": 26739.484375, "attnres/final_alpha/block_3": 0.013321064412593842, "attnres/block_norm/3": 48405.640625, "attnres/final_alpha/block_4": 0.0162762850522995, "attnres/block_norm/4": 13077.71875, "attnres/final_alpha/block_5": 0.5972979068756104, "attnres/block_norm/5": 6131.6396484375, "attnres/final_alpha/block_6": 0.11540570110082626, "attnres/block_norm/6": 32272.27734375, "geo/tier1_time_s": 1.3592960834503174, "geo/step": 42225.0, "geo/rankme_slope": -0.00015349208042592037} {"step": 42230, "timestamp": 1778240187.0983195, "train/loss": 2.130364680290222, "train/z_loss": 0.0014176827040500938, "train/perplexity": 8.417936107132912, "train/grad_norm": 0.35546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1704239.3983486297, "perf/iters_per_sec": 0.8126446716063641, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2305501222610473, "data/tokens_consumed": 88564826112, "data/tokens_consumed_B": 88.564826112, "train/loss_slope": 8.478538860546333e-07} {"step": 42240, "timestamp": 1778240197.4478717, "train/loss": 2.1999815940856933, "train/z_loss": 0.001393193646799773, "train/perplexity": 9.024847387337761, "train/grad_norm": 0.1357421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027856.7821254844, "perf/iters_per_sec": 0.9669574652316496, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341716527938842, "data/tokens_consumed": 88585797632, "data/tokens_consumed_B": 88.585797632, "train/loss_slope": 5.3296595552537515e-06} {"step": 42250, "timestamp": 1778240207.785397, "grad/layer_0/attn": 0.0029277552384883165, "grad/layer_0/mlp": 0.0029977881349623203, "grad/layer_0/attn_mlp_ratio": 0.9766384444180447, "grad/layer_4/attn": 0.0018348073353990912, "grad/layer_4/mlp": 0.0025280818808823824, "grad/layer_4/attn_mlp_ratio": 0.7257705047835055, "grad/layer_8/attn": 0.005016101524233818, "grad/layer_8/mlp": 0.003505784086883068, "grad/layer_8/attn_mlp_ratio": 1.4308072764437754, "grad/layer_12/attn": 0.005130831617861986, "grad/layer_12/mlp": 0.006828854791820049, "grad/layer_12/attn_mlp_ratio": 0.7513458257852218, "grad/layer_16/attn": 0.0034336470998823643, "grad/layer_16/mlp": 0.004636127036064863, "grad/layer_16/attn_mlp_ratio": 0.7406283303086546, "grad/layer_20/attn": 0.003850130131468177, "grad/layer_20/mlp": 0.006430656183511019, "grad/layer_20/attn_mlp_ratio": 0.598714961852397, "grad/layer_24/attn": 0.013564850203692913, "grad/layer_24/mlp": 0.012830698862671852, "grad/layer_24/attn_mlp_ratio": 1.057218335739691, "grad/layer_27/attn": 0.010092301294207573, "grad/layer_27/mlp": 0.012271387502551079, "grad/layer_27/attn_mlp_ratio": 0.8224254355806919} {"step": 42250, "timestamp": 1778240207.7998695, "train/loss": 2.163915514945984, "train/z_loss": 0.0014096807455644012, "train/perplexity": 8.705156184124466, "train/grad_norm": 0.1669921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027109.0992609703, "perf/iters_per_sec": 0.9666009422592975, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034553098678589, "data/tokens_consumed": 88606769152, "data/tokens_consumed_B": 88.606769152, "train/loss_slope": 5.602733623219629e-06} {"step": 42260, "timestamp": 1778240218.1411612, "train/loss": 2.135951018333435, "train/z_loss": 0.0014088811702094972, "train/perplexity": 8.465093138777059, "train/grad_norm": 0.1435546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028957.2274016032, "perf/iters_per_sec": 0.9674821984298722, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03361074924469, "data/tokens_consumed": 88627740672, "data/tokens_consumed_B": 88.627740672, "train/loss_slope": 3.1862950847200774e-06} {"step": 42270, "timestamp": 1778240228.4939082, "train/loss": 2.1713425159454345, "train/z_loss": 0.0013974159024655818, "train/perplexity": 8.770050072995923, "train/grad_norm": 0.08544921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026865.7855073763, "perf/iters_per_sec": 0.966484921220482, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346772909164428, "data/tokens_consumed": 88648712192, "data/tokens_consumed_B": 88.648712192, "train/loss_slope": 2.8548766069500584e-06} {"step": 42280, "timestamp": 1778240238.8352315, "train/loss": 2.190621042251587, "train/z_loss": 0.001396100502461195, "train/perplexity": 8.940763983053388, "train/grad_norm": 0.10205078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029086.219406445, "perf/iters_per_sec": 0.9675437066108918, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0335450410842895, "data/tokens_consumed": 88669683712, "data/tokens_consumed_B": 88.669683712, "train/loss_slope": 7.1031839755287095e-06} {"step": 42290, "timestamp": 1778240249.1837606, "train/loss": 2.127193641662598, "train/z_loss": 0.001418130425736308, "train/perplexity": 8.39128478509082, "train/grad_norm": 0.1904296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027565.5226151613, "perf/iters_per_sec": 0.9668185818744475, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343202114105225, "data/tokens_consumed": 88690655232, "data/tokens_consumed_B": 88.690655232, "train/loss_slope": 5.863551502645515e-06} {"step": 42300, "timestamp": 1778240259.5184793, "grad/layer_0/attn": 0.002973288530483842, "grad/layer_0/mlp": 0.0029866157565265894, "grad/layer_0/attn_mlp_ratio": 0.9955376497403831, "grad/layer_4/attn": 0.001848863554187119, "grad/layer_4/mlp": 0.002437255810946226, "grad/layer_4/attn_mlp_ratio": 0.7585840887218638, "grad/layer_8/attn": 0.007520226761698723, "grad/layer_8/mlp": 0.0036168189253658056, "grad/layer_8/attn_mlp_ratio": 2.079237780203317, "grad/layer_12/attn": 0.0044224318116903305, "grad/layer_12/mlp": 0.0058637927286326885, "grad/layer_12/attn_mlp_ratio": 0.7541930523356412, "grad/layer_16/attn": 0.0036564024630934, "grad/layer_16/mlp": 0.0046080248430371284, "grad/layer_16/attn_mlp_ratio": 0.7934858227315675, "grad/layer_20/attn": 0.008968597277998924, "grad/layer_20/mlp": 0.006266586482524872, "grad/layer_20/attn_mlp_ratio": 1.431177429672628, "grad/layer_24/attn": 0.015977349132299423, "grad/layer_24/mlp": 0.011089234612882137, "grad/layer_24/attn_mlp_ratio": 1.4407981746241572, "grad/layer_27/attn": 0.008318338543176651, "grad/layer_27/mlp": 0.011281006969511509, "grad/layer_27/attn_mlp_ratio": 0.7373755279046069} {"step": 42300, "timestamp": 1778240260.1186578, "eos/sharpness": 66.03107452392577, "eos/L0_probe": 1.9981151819229126, "eos/L_plus": 2.2888660430908203, "eos/L_minus": 2.3676750659942627, "eos/grad_norm": 0.18408171832561493, "eos/embed_grad_frac": 0.0673857182264328, "eos/time_s": 0.5973808765411377} {"step": 42300, "timestamp": 1778240260.1378138, "train/loss": 2.1930706977844237, "train/z_loss": 0.0014048539218492807, "train/perplexity": 8.962692622853368, "train/grad_norm": 0.1845703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1915546.8419438824, "perf/iters_per_sec": 0.9134039125174915, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.094805908203125, "data/tokens_consumed": 88711626752, "data/tokens_consumed_B": 88.711626752, "train/loss_slope": 8.800642024708883e-06} {"step": 42300, "timestamp": 1778240261.5045469, "geo/rankme_last": 438.79949951171875, "geo/layer_0/stable_rank_q_proj": 19.33548355102539, "geo/layer_0/stable_rank_k_proj": 16.46114158630371, "geo/layer_0/stable_rank_o_proj": 48.70051193237305, "geo/layer_0/stable_rank_gate_proj": 136.8486328125, "geo/layer_0/stable_rank_down_proj": 53.340999603271484, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0634690672159195, "geo/layer_0/attn_entropy_mean": 6.192645072937012, "geo/layer_0/attn_entropy_std": 0.38001710176467896, "geo/layer_7/stable_rank_q_proj": 42.931026458740234, "geo/layer_7/stable_rank_k_proj": 42.22174835205078, "geo/layer_7/stable_rank_o_proj": 96.90968322753906, "geo/layer_7/stable_rank_gate_proj": 89.19617462158203, "geo/layer_7/stable_rank_down_proj": 145.98365783691406, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5060124397277832, "geo/layer_7/attn_entropy_mean": 4.658224105834961, "geo/layer_7/attn_entropy_std": 0.8170153498649597, "geo/layer_14/stable_rank_q_proj": 54.174903869628906, "geo/layer_14/stable_rank_k_proj": 37.816776275634766, "geo/layer_14/stable_rank_o_proj": 47.606815338134766, "geo/layer_14/stable_rank_gate_proj": 76.27165222167969, "geo/layer_14/stable_rank_down_proj": 132.62588500976562, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38996410369873047, "geo/layer_14/attn_entropy_mean": 5.522266387939453, "geo/layer_14/attn_entropy_std": 0.3777192533016205, "geo/layer_21/stable_rank_q_proj": 42.90147018432617, "geo/layer_21/stable_rank_k_proj": 30.738746643066406, "geo/layer_21/stable_rank_o_proj": 75.3530502319336, "geo/layer_21/stable_rank_gate_proj": 72.17676544189453, "geo/layer_21/stable_rank_down_proj": 55.076087951660156, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1481696218252182, "geo/layer_21/attn_entropy_mean": 5.732210159301758, "geo/layer_21/attn_entropy_std": 0.2876584529876709, "geo/layer_27/stable_rank_q_proj": 42.46010208129883, "geo/layer_27/stable_rank_k_proj": 31.544206619262695, "geo/layer_27/stable_rank_o_proj": 115.94721221923828, "geo/layer_27/stable_rank_gate_proj": 84.62982177734375, "geo/layer_27/stable_rank_down_proj": 131.3120574951172, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09661991894245148, "geo/layer_27/attn_entropy_mean": 4.261140823364258, "geo/layer_27/attn_entropy_std": 0.6473477482795715, "attnres/final_alpha/block_0": 0.24196118116378784, "attnres/block_norm/0": 1.7264765501022339, "attnres/final_alpha/block_1": 0.005135776475071907, "attnres/block_norm/1": 41024.6796875, "attnres/final_alpha/block_2": 0.01121661439538002, "attnres/block_norm/2": 26767.2734375, "attnres/final_alpha/block_3": 0.013433240354061127, "attnres/block_norm/3": 48501.734375, "attnres/final_alpha/block_4": 0.01615825667977333, "attnres/block_norm/4": 13095.9873046875, "attnres/final_alpha/block_5": 0.5956801772117615, "attnres/block_norm/5": 6098.6875, "attnres/final_alpha/block_6": 0.11641475558280945, "attnres/block_norm/6": 32071.982421875, "geo/tier1_time_s": 1.362429141998291, "geo/step": 42300.0, "geo/rankme_slope": -0.0001588557493309824} {"step": 42310, "timestamp": 1778240271.8534367, "train/loss": 2.134900188446045, "train/z_loss": 0.00141937758307904, "train/perplexity": 8.456202438032447, "train/grad_norm": 0.10986328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1790670.0769263403, "perf/iters_per_sec": 0.8538580307609274, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1711548805236816, "data/tokens_consumed": 88732598272, "data/tokens_consumed_B": 88.732598272, "train/loss_slope": 8.031890380142882e-06} {"step": 42320, "timestamp": 1778240282.198256, "train/loss": 2.2209781408309937, "train/z_loss": 0.0013903955928981303, "train/perplexity": 9.216341338183666, "train/grad_norm": 0.294921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028297.0788767936, "perf/iters_per_sec": 0.96716741508331, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0339471578598023, "data/tokens_consumed": 88753569792, "data/tokens_consumed_B": 88.753569792, "train/loss_slope": 1.2040565259242742e-05} {"step": 42330, "timestamp": 1778240292.5429358, "train/loss": 2.12920241355896, "train/z_loss": 0.0014023911789990962, "train/perplexity": 8.408157903590785, "train/grad_norm": 0.1650390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028801.2045656033, "perf/iters_per_sec": 0.9674078009441391, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0336902379989623, "data/tokens_consumed": 88774541312, "data/tokens_consumed_B": 88.774541312, "train/loss_slope": 9.190314323237567e-06} {"step": 42340, "timestamp": 1778240302.8885682, "train/loss": 2.154167652130127, "train/z_loss": 0.0014106003800407052, "train/perplexity": 8.620711760841743, "train/grad_norm": 0.1259765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028238.7107972247, "perf/iters_per_sec": 0.9671395830141185, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0339769124984741, "data/tokens_consumed": 88795512832, "data/tokens_consumed_B": 88.795512832, "train/loss_slope": 5.5695515797726726e-06} {"step": 42350, "timestamp": 1778240313.2273176, "grad/layer_0/attn": 0.0027058327104896307, "grad/layer_0/mlp": 0.0028810680378228426, "grad/layer_0/attn_mlp_ratio": 0.9391769236441477, "grad/layer_4/attn": 0.004553536884486675, "grad/layer_4/mlp": 0.002495157765224576, "grad/layer_4/attn_mlp_ratio": 1.8249494142034317, "grad/layer_8/attn": 0.0044866143725812435, "grad/layer_8/mlp": 0.003683385904878378, "grad/layer_8/attn_mlp_ratio": 1.218067931690854, "grad/layer_12/attn": 0.004842577967792749, "grad/layer_12/mlp": 0.006135954987257719, "grad/layer_12/attn_mlp_ratio": 0.7892133985545507, "grad/layer_16/attn": 0.004018435720354319, "grad/layer_16/mlp": 0.005009260959923267, "grad/layer_16/attn_mlp_ratio": 0.8022012972140593, "grad/layer_20/attn": 0.0051579950377345085, "grad/layer_20/mlp": 0.00776489544659853, "grad/layer_20/attn_mlp_ratio": 0.664271013921624, "grad/layer_24/attn": 0.029184024780988693, "grad/layer_24/mlp": 0.016509991139173508, "grad/layer_24/attn_mlp_ratio": 1.7676584050355708, "grad/layer_27/attn": 0.010614424012601376, "grad/layer_27/mlp": 0.016361769288778305, "grad/layer_27/attn_mlp_ratio": 0.6487332610787964} {"step": 42350, "timestamp": 1778240313.241743, "train/loss": 2.2044307231903075, "train/z_loss": 0.0013947706203907729, "train/perplexity": 9.065089553427713, "train/grad_norm": 0.341796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026594.0942474713, "perf/iters_per_sec": 0.9663553687322003, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348160028457642, "data/tokens_consumed": 88816484352, "data/tokens_consumed_B": 88.816484352, "train/loss_slope": 7.514086472104671e-06} {"step": 42360, "timestamp": 1778240323.5918229, "train/loss": 2.1537325620651244, "train/z_loss": 0.001405870052985847, "train/perplexity": 8.616961790647895, "train/grad_norm": 0.12060546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027469.7164799368, "perf/iters_per_sec": 0.9667728979491886, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343690872192384, "data/tokens_consumed": 88837455872, "data/tokens_consumed_B": 88.837455872, "train/loss_slope": 5.2037803539455414e-06} {"step": 42370, "timestamp": 1778240333.9378557, "train/loss": 2.135499882698059, "train/z_loss": 0.0014027163852006197, "train/perplexity": 8.461275094897958, "train/grad_norm": 0.10888671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027975.0206214474, "perf/iters_per_sec": 0.9670138457400548, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341113567352296, "data/tokens_consumed": 88858427392, "data/tokens_consumed_B": 88.858427392, "train/loss_slope": 1.2733412690252158e-06} {"step": 42375, "timestamp": 1778240339.7002604, "eos/sharpness": 75.43730735778807, "eos/L0_probe": 1.995363712310791, "eos/L_plus": 2.4548730850219727, "eos/L_minus": 2.2902274131774902, "eos/grad_norm": 0.2868773937225342, "eos/embed_grad_frac": 0.032946132123470306, "eos/time_s": 0.59808349609375} {"step": 42375, "timestamp": 1778240341.0743005, "geo/rankme_last": 439.6427001953125, "geo/layer_0/stable_rank_q_proj": 19.328289031982422, "geo/layer_0/stable_rank_k_proj": 16.409595489501953, "geo/layer_0/stable_rank_o_proj": 48.65363311767578, "geo/layer_0/stable_rank_gate_proj": 136.820556640625, "geo/layer_0/stable_rank_down_proj": 53.35686111450195, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06039921194314957, "geo/layer_0/attn_entropy_mean": 6.189852714538574, "geo/layer_0/attn_entropy_std": 0.37960776686668396, "geo/layer_7/stable_rank_q_proj": 42.952667236328125, "geo/layer_7/stable_rank_k_proj": 42.26454544067383, "geo/layer_7/stable_rank_o_proj": 96.95608520507812, "geo/layer_7/stable_rank_gate_proj": 89.01708984375, "geo/layer_7/stable_rank_down_proj": 145.47799682617188, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.49212372303009033, "geo/layer_7/attn_entropy_mean": 4.65216064453125, "geo/layer_7/attn_entropy_std": 0.8020168542861938, "geo/layer_14/stable_rank_q_proj": 54.131103515625, "geo/layer_14/stable_rank_k_proj": 37.851444244384766, "geo/layer_14/stable_rank_o_proj": 47.67701721191406, "geo/layer_14/stable_rank_gate_proj": 76.3775863647461, "geo/layer_14/stable_rank_down_proj": 132.91128540039062, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3862384557723999, "geo/layer_14/attn_entropy_mean": 5.499537467956543, "geo/layer_14/attn_entropy_std": 0.36495938897132874, "geo/layer_21/stable_rank_q_proj": 42.979488372802734, "geo/layer_21/stable_rank_k_proj": 30.81035804748535, "geo/layer_21/stable_rank_o_proj": 75.2935791015625, "geo/layer_21/stable_rank_gate_proj": 72.17526245117188, "geo/layer_21/stable_rank_down_proj": 55.038021087646484, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1473199427127838, "geo/layer_21/attn_entropy_mean": 5.716562747955322, "geo/layer_21/attn_entropy_std": 0.29394209384918213, "geo/layer_27/stable_rank_q_proj": 42.43913650512695, "geo/layer_27/stable_rank_k_proj": 31.45808982849121, "geo/layer_27/stable_rank_o_proj": 116.04749298095703, "geo/layer_27/stable_rank_gate_proj": 84.66973114013672, "geo/layer_27/stable_rank_down_proj": 131.31048583984375, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09235042333602905, "geo/layer_27/attn_entropy_mean": 4.266366958618164, "geo/layer_27/attn_entropy_std": 0.6571192145347595, "attnres/final_alpha/block_0": 0.23930105566978455, "attnres/block_norm/0": 1.726640224456787, "attnres/final_alpha/block_1": 0.005068268161267042, "attnres/block_norm/1": 41100.1640625, "attnres/final_alpha/block_2": 0.010968091897666454, "attnres/block_norm/2": 26692.0390625, "attnres/final_alpha/block_3": 0.01296919398009777, "attnres/block_norm/3": 48411.59375, "attnres/final_alpha/block_4": 0.015712391585111618, "attnres/block_norm/4": 13141.6748046875, "attnres/final_alpha/block_5": 0.6015478372573853, "attnres/block_norm/5": 6112.04345703125, "attnres/final_alpha/block_6": 0.114433154463768, "attnres/block_norm/6": 32217.51953125, "geo/tier1_time_s": 1.355792760848999, "geo/step": 42375.0, "geo/rankme_slope": -0.00016008565535589235} {"step": 42380, "timestamp": 1778240346.2529685, "train/loss": 2.181810903549194, "train/z_loss": 0.00140341694932431, "train/perplexity": 8.862340580044881, "train/grad_norm": 0.150390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1703934.1541035622, "perf/iters_per_sec": 0.8124991198079883, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2307705640792848, "data/tokens_consumed": 88879398912, "data/tokens_consumed_B": 88.879398912, "train/loss_slope": 1.1776985079176188e-07} {"step": 42390, "timestamp": 1778240356.592902, "train/loss": 2.167597842216492, "train/z_loss": 0.0014009813661687077, "train/perplexity": 8.737270509575557, "train/grad_norm": 0.1865234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029360.1711935, "perf/iters_per_sec": 0.9676743370025158, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0334055185317994, "data/tokens_consumed": 88900370432, "data/tokens_consumed_B": 88.900370432, "train/loss_slope": -1.7957459617727575e-06} {"step": 42400, "timestamp": 1778240366.921589, "grad/layer_0/attn": 0.002950418973341584, "grad/layer_0/mlp": 0.002783491276204586, "grad/layer_0/attn_mlp_ratio": 1.059970581753557, "grad/layer_4/attn": 0.0033536155242472887, "grad/layer_4/mlp": 0.0024573614355176687, "grad/layer_4/attn_mlp_ratio": 1.3647220711220316, "grad/layer_8/attn": 0.005199299659579992, "grad/layer_8/mlp": 0.0036933589726686478, "grad/layer_8/attn_mlp_ratio": 1.4077428046613465, "grad/layer_12/attn": 0.00662215007469058, "grad/layer_12/mlp": 0.006504622288048267, "grad/layer_12/attn_mlp_ratio": 1.0180683335066862, "grad/layer_16/attn": 0.0040813530795276165, "grad/layer_16/mlp": 0.004715912044048309, "grad/layer_16/attn_mlp_ratio": 0.8654429842757919, "grad/layer_20/attn": 0.006432969123125076, "grad/layer_20/mlp": 0.007591411471366882, "grad/layer_20/attn_mlp_ratio": 0.8474009165026445, "grad/layer_24/attn": 0.02871091291308403, "grad/layer_24/mlp": 0.01579449512064457, "grad/layer_24/attn_mlp_ratio": 1.817779708183187, "grad/layer_27/attn": 0.014091938734054565, "grad/layer_27/mlp": 0.015769431367516518, "grad/layer_27/attn_mlp_ratio": 0.8936237658968604} {"step": 42400, "timestamp": 1778240366.9360595, "train/loss": 2.1260133743286134, "train/z_loss": 0.0014126455294899642, "train/perplexity": 8.381386668129906, "train/grad_norm": 0.32421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028630.889095281, "perf/iters_per_sec": 0.9673265881992726, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0337770223617553, "data/tokens_consumed": 88921341952, "data/tokens_consumed_B": 88.921341952, "train/loss_slope": -1.2079356634470894e-06} {"step": 42410, "timestamp": 1778240377.284166, "train/loss": 2.132045340538025, "train/z_loss": 0.0014158611069433392, "train/perplexity": 8.432095693110401, "train/grad_norm": 0.21875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027994.6114862454, "perf/iters_per_sec": 0.9670231873923518, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341013669967651, "data/tokens_consumed": 88942313472, "data/tokens_consumed_B": 88.942313472, "train/loss_slope": -2.7712018790991075e-06} {"step": 42420, "timestamp": 1778240387.6253855, "train/loss": 2.1751715183258056, "train/z_loss": 0.0013992153806611895, "train/perplexity": 8.803694987724299, "train/grad_norm": 0.103515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029281.704442029, "perf/iters_per_sec": 0.9676369211397309, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0334454774856567, "data/tokens_consumed": 88963284992, "data/tokens_consumed_B": 88.963284992, "train/loss_slope": -4.6323503633895195e-06} {"step": 42430, "timestamp": 1778240397.9660075, "train/loss": 2.1982665777206423, "train/z_loss": 0.0014012574916705488, "train/perplexity": 9.009382891098904, "train/grad_norm": 0.09033203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029013.062638726, "perf/iters_per_sec": 0.9675088227456694, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0335823059082032, "data/tokens_consumed": 88984256512, "data/tokens_consumed_B": 88.984256512, "train/loss_slope": -3.2856400078065273e-06} {"step": 42440, "timestamp": 1778240408.3114169, "train/loss": 2.113630199432373, "train/z_loss": 0.0013975634588859974, "train/perplexity": 8.27823846037862, "train/grad_norm": 0.10302734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028227.673621405, "perf/iters_per_sec": 0.967134320078566, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0339825391769408, "data/tokens_consumed": 89005228032, "data/tokens_consumed_B": 89.005228032, "train/loss_slope": -4.6104677010327e-06} {"step": 42450, "timestamp": 1778240418.6464899, "grad/layer_0/attn": 0.002566716866567731, "grad/layer_0/mlp": 0.002708631567656994, "grad/layer_0/attn_mlp_ratio": 0.9476064601976608, "grad/layer_4/attn": 0.0022526595275849104, "grad/layer_4/mlp": 0.002426626393571496, "grad/layer_4/attn_mlp_ratio": 0.9283091293829311, "grad/layer_8/attn": 0.0038733994588255882, "grad/layer_8/mlp": 0.0035864890087395906, "grad/layer_8/attn_mlp_ratio": 1.0799975523100995, "grad/layer_12/attn": 0.005311440676450729, "grad/layer_12/mlp": 0.0063200537115335464, "grad/layer_12/attn_mlp_ratio": 0.8404106728898121, "grad/layer_16/attn": 0.004660673905164003, "grad/layer_16/mlp": 0.004537531640380621, "grad/layer_16/attn_mlp_ratio": 1.0271385792605063, "grad/layer_20/attn": 0.003800570499151945, "grad/layer_20/mlp": 0.005535116419196129, "grad/layer_20/attn_mlp_ratio": 0.6866288154858764, "grad/layer_24/attn": 0.009411643259227276, "grad/layer_24/mlp": 0.008018278516829014, "grad/layer_24/attn_mlp_ratio": 1.1737735378107, "grad/layer_27/attn": 0.005203443579375744, "grad/layer_27/mlp": 0.0073191630654037, "grad/layer_27/attn_mlp_ratio": 0.7109342231870767} {"step": 42450, "timestamp": 1778240419.2424824, "eos/sharpness": 32.220983505249016, "eos/L0_probe": 1.991655707359314, "eos/L_plus": 2.132814884185791, "eos/L_minus": 2.172706365585327, "eos/grad_norm": 0.10758039355278015, "eos/embed_grad_frac": 0.18766973912715912, "eos/time_s": 0.5931296348571777} {"step": 42450, "timestamp": 1778240419.260773, "train/loss": 2.1843019485473634, "train/z_loss": 0.001401678111869842, "train/perplexity": 8.88444458882896, "train/grad_norm": 0.107421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1916738.054498327, "perf/iters_per_sec": 0.9139719269267688, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.094125509262085, "data/tokens_consumed": 89026199552, "data/tokens_consumed_B": 89.026199552, "train/loss_slope": -2.732688980777849e-06} {"step": 42450, "timestamp": 1778240420.625765, "geo/rankme_last": 438.4334716796875, "geo/layer_0/stable_rank_q_proj": 19.34109878540039, "geo/layer_0/stable_rank_k_proj": 16.418254852294922, "geo/layer_0/stable_rank_o_proj": 48.677955627441406, "geo/layer_0/stable_rank_gate_proj": 136.89642333984375, "geo/layer_0/stable_rank_down_proj": 53.2978630065918, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05684611201286316, "geo/layer_0/attn_entropy_mean": 6.19431209564209, "geo/layer_0/attn_entropy_std": 0.3764837384223938, "geo/layer_7/stable_rank_q_proj": 42.948402404785156, "geo/layer_7/stable_rank_k_proj": 42.28214645385742, "geo/layer_7/stable_rank_o_proj": 97.04026794433594, "geo/layer_7/stable_rank_gate_proj": 88.90411376953125, "geo/layer_7/stable_rank_down_proj": 145.64639282226562, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4935550093650818, "geo/layer_7/attn_entropy_mean": 4.655435562133789, "geo/layer_7/attn_entropy_std": 0.8089849948883057, "geo/layer_14/stable_rank_q_proj": 54.086002349853516, "geo/layer_14/stable_rank_k_proj": 37.73958206176758, "geo/layer_14/stable_rank_o_proj": 47.68408203125, "geo/layer_14/stable_rank_gate_proj": 76.4012451171875, "geo/layer_14/stable_rank_down_proj": 133.2716827392578, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.40225908160209656, "geo/layer_14/attn_entropy_mean": 5.488439559936523, "geo/layer_14/attn_entropy_std": 0.37594980001449585, "geo/layer_21/stable_rank_q_proj": 43.07215118408203, "geo/layer_21/stable_rank_k_proj": 30.82902717590332, "geo/layer_21/stable_rank_o_proj": 75.31556701660156, "geo/layer_21/stable_rank_gate_proj": 72.21247100830078, "geo/layer_21/stable_rank_down_proj": 54.997806549072266, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1436072140932083, "geo/layer_21/attn_entropy_mean": 5.7045793533325195, "geo/layer_21/attn_entropy_std": 0.293484628200531, "geo/layer_27/stable_rank_q_proj": 42.50033187866211, "geo/layer_27/stable_rank_k_proj": 31.567607879638672, "geo/layer_27/stable_rank_o_proj": 116.22530364990234, "geo/layer_27/stable_rank_gate_proj": 84.64120483398438, "geo/layer_27/stable_rank_down_proj": 131.1778564453125, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0907711386680603, "geo/layer_27/attn_entropy_mean": 4.259634017944336, "geo/layer_27/attn_entropy_std": 0.6828368902206421, "attnres/final_alpha/block_0": 0.24214939773082733, "attnres/block_norm/0": 1.72696852684021, "attnres/final_alpha/block_1": 0.005168771371245384, "attnres/block_norm/1": 41011.4765625, "attnres/final_alpha/block_2": 0.011173955164849758, "attnres/block_norm/2": 26672.94140625, "attnres/final_alpha/block_3": 0.013291483744978905, "attnres/block_norm/3": 48296.90234375, "attnres/final_alpha/block_4": 0.01614942029118538, "attnres/block_norm/4": 13129.048828125, "attnres/final_alpha/block_5": 0.5956665277481079, "attnres/block_norm/5": 6107.7080078125, "attnres/final_alpha/block_6": 0.11640042066574097, "attnres/block_norm/6": 32237.755859375, "geo/tier1_time_s": 1.3607807159423828, "geo/step": 42450.0, "geo/rankme_slope": -0.00019340554581207483} {"step": 42460, "timestamp": 1778240430.969305, "train/loss": 2.2195667266845702, "train/z_loss": 0.0013926049578003585, "train/perplexity": 9.203342439213548, "train/grad_norm": 0.11181640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1791740.2305221758, "perf/iters_per_sec": 0.8543683197604064, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1704553842544556, "data/tokens_consumed": 89047171072, "data/tokens_consumed_B": 89.047171072, "train/loss_slope": -3.5841895837953008e-06} {"step": 42470, "timestamp": 1778240441.325329, "train/loss": 2.1901589393615724, "train/z_loss": 0.0014061757014133037, "train/perplexity": 8.93663338463212, "train/grad_norm": 0.1748046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026228.980851233, "perf/iters_per_sec": 0.9661812690979161, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350024700164795, "data/tokens_consumed": 89068142592, "data/tokens_consumed_B": 89.068142592, "train/loss_slope": -3.7469391060278906e-07} {"step": 42480, "timestamp": 1778240451.6749592, "train/loss": 2.1543916702270507, "train/z_loss": 0.0013999310322105885, "train/perplexity": 8.622643172612053, "train/grad_norm": 0.1708984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027431.30309948, "perf/iters_per_sec": 0.9667545810220146, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343886852264403, "data/tokens_consumed": 89089114112, "data/tokens_consumed_B": 89.089114112, "train/loss_slope": -8.550973543704762e-07} {"step": 42490, "timestamp": 1778240462.02135, "train/loss": 2.117479705810547, "train/z_loss": 0.0014226248837076127, "train/perplexity": 8.310167007276071, "train/grad_norm": 0.171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027741.6893284076, "perf/iters_per_sec": 0.9669025847093619, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034230351448059, "data/tokens_consumed": 89110085632, "data/tokens_consumed_B": 89.110085632, "train/loss_slope": -2.9122241581305765e-06} {"step": 42500, "timestamp": 1778240472.3537242, "grad/layer_0/attn": 0.0036405608989298344, "grad/layer_0/mlp": 0.0032644658349454403, "grad/layer_0/attn_mlp_ratio": 1.115208726780811, "grad/layer_4/attn": 0.002176129026338458, "grad/layer_4/mlp": 0.002637421479448676, "grad/layer_4/attn_mlp_ratio": 0.8250971491608686, "grad/layer_8/attn": 0.004040643572807312, "grad/layer_8/mlp": 0.00390056986361742, "grad/layer_8/attn_mlp_ratio": 1.0359110618438918, "grad/layer_12/attn": 0.005263347644358873, "grad/layer_12/mlp": 0.006684430874884129, "grad/layer_12/attn_mlp_ratio": 0.7874039935688185, "grad/layer_16/attn": 0.006540103815495968, "grad/layer_16/mlp": 0.005480996333062649, "grad/layer_16/attn_mlp_ratio": 1.1932326348626203, "grad/layer_20/attn": 0.0036133830435574055, "grad/layer_20/mlp": 0.00693257711827755, "grad/layer_20/attn_mlp_ratio": 0.5212178573403872, "grad/layer_24/attn": 0.023578297346830368, "grad/layer_24/mlp": 0.01401342824101448, "grad/layer_24/attn_mlp_ratio": 1.6825502491650415, "grad/layer_27/attn": 0.012212849222123623, "grad/layer_27/mlp": 0.015174657106399536, "grad/layer_27/attn_mlp_ratio": 0.804818788062847} {"step": 42500, "timestamp": 1778240472.367941, "train/loss": 2.188002824783325, "train/z_loss": 0.0014087762450799347, "train/perplexity": 8.917385736634873, "train/grad_norm": 0.296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028000.3158187447, "perf/iters_per_sec": 0.9670259074300502, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340984582901, "data/tokens_consumed": 89131057152, "data/tokens_consumed_B": 89.131057152, "train/loss_slope": 8.06510609404393e-07} {"step": 42500, "timestamp": 1778240479.6295588, "geo/ww_alpha_mean": 7.3566065998379395, "geo/ww_alpha_std": 3.950298848120299, "geo/ww_alpha_min": 1.3466740690341945, "geo/ww_alpha_max": 30.844077069486605, "geo/ww_alpha_healthy_frac": 0.16751269035532995, "geo/ww_alpha_by_type/q_proj": 4.039162351188199, "geo/ww_alpha_by_type/k_proj": 4.383660135303699, "geo/ww_alpha_by_type/v_proj": 7.838767883943208, "geo/ww_alpha_by_type/o_proj": 8.938238259500062, "geo/ww_alpha_by_type/gate_proj": 7.896804253559009, "geo/ww_alpha_by_type/up_proj": 10.326313023881358, "geo/ww_alpha_by_type/down_proj": 8.167691581101126, "geo/twonn_id/layer_0": 0.6986799240112305, "geo/twonn_id/layer_7": 2.8907437324523926, "geo/twonn_id/layer_14": 4.5358428955078125, "geo/twonn_id/layer_21": 6.184976100921631, "geo/twonn_id/layer_27": 5.623140335083008, "geo/tier2_time_s": 7.253987789154053} {"step": 42500, "timestamp": 1778240480.3281288, "eoc/jacobian_sigma/layer_0/attn": 1075.948974609375, "eoc/jacobian_sigma/layer_0/mlp": 7882.39794921875, "eoc/jacobian_sigma/layer_0": 7882.39794921875, "eoc/jacobian_sigma/layer_7/attn": 1.15982186794281, "eoc/jacobian_sigma/layer_7/mlp": 1.8054970502853394, "eoc/jacobian_sigma/layer_7": 1.8054970502853394, "eoc/jacobian_sigma/layer_14/attn": 1.5917435884475708, "eoc/jacobian_sigma/layer_14/mlp": 5.977336883544922, "eoc/jacobian_sigma/layer_14": 5.977336883544922, "eoc/jacobian_sigma/layer_21/attn": 1.0915018320083618, "eoc/jacobian_sigma/layer_21/mlp": 4.0676374435424805, "eoc/jacobian_sigma/layer_21": 4.0676374435424805, "eoc/jacobian_sigma/layer_27/attn": 3.38419508934021, "eoc/jacobian_sigma/layer_27/mlp": 30.888566970825195, "eoc/jacobian_sigma/layer_27": 30.888566970825195, "eoc/layer0_sigma": 7882.39794921875, "eoc/sigma_max": 30.888566970825195, "eoc/sigma_min": 1.8054970502853394, "eoc/sigma_mean": 10.684759587049484, "eoc/time_s": 0.6912462711334229} {"step": 42510, "timestamp": 1778240490.7096658, "train/loss": 2.1711920142173766, "train/z_loss": 0.0014082985580898821, "train/perplexity": 8.768730264623995, "train/grad_norm": 0.171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1143749.7820004243, "perf/iters_per_sec": 0.5453823957445261, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.8335758686065673, "data/tokens_consumed": 89152028672, "data/tokens_consumed_B": 89.152028672, "train/loss_slope": 4.4122393911672185e-07} {"step": 42520, "timestamp": 1778240501.0691698, "train/loss": 2.1663333892822267, "train/z_loss": 0.0014083335176110269, "train/perplexity": 8.726229624053355, "train/grad_norm": 0.236328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025911.918401295, "perf/iters_per_sec": 0.9660300819403148, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351644515991212, "data/tokens_consumed": 89173000192, "data/tokens_consumed_B": 89.173000192, "train/loss_slope": 3.402756936479267e-06} {"step": 42525, "timestamp": 1778240506.8525877, "eos/sharpness": 42.82953739166259, "eos/L0_probe": 1.9915729761123657, "eos/L_plus": 2.223282814025879, "eos/L_minus": 2.1881585121154785, "eos/grad_norm": 0.2095986157655716, "eos/embed_grad_frac": 0.06481366604566574, "eos/time_s": 0.6100976467132568} {"step": 42525, "timestamp": 1778240508.2271798, "geo/rankme_last": 439.6334533691406, "geo/layer_0/stable_rank_q_proj": 19.322084426879883, "geo/layer_0/stable_rank_k_proj": 16.430299758911133, "geo/layer_0/stable_rank_o_proj": 48.76393127441406, "geo/layer_0/stable_rank_gate_proj": 136.53150939941406, "geo/layer_0/stable_rank_down_proj": 53.260406494140625, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0610661581158638, "geo/layer_0/attn_entropy_mean": 6.1939287185668945, "geo/layer_0/attn_entropy_std": 0.38086220622062683, "geo/layer_7/stable_rank_q_proj": 42.961997985839844, "geo/layer_7/stable_rank_k_proj": 42.214599609375, "geo/layer_7/stable_rank_o_proj": 97.04219055175781, "geo/layer_7/stable_rank_gate_proj": 88.86998748779297, "geo/layer_7/stable_rank_down_proj": 145.58966064453125, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.49682101607322693, "geo/layer_7/attn_entropy_mean": 4.6749773025512695, "geo/layer_7/attn_entropy_std": 0.7948180437088013, "geo/layer_14/stable_rank_q_proj": 54.13802719116211, "geo/layer_14/stable_rank_k_proj": 37.720848083496094, "geo/layer_14/stable_rank_o_proj": 47.681427001953125, "geo/layer_14/stable_rank_gate_proj": 76.36632537841797, "geo/layer_14/stable_rank_down_proj": 133.6251983642578, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3715428411960602, "geo/layer_14/attn_entropy_mean": 5.540102005004883, "geo/layer_14/attn_entropy_std": 0.38587892055511475, "geo/layer_21/stable_rank_q_proj": 43.10464859008789, "geo/layer_21/stable_rank_k_proj": 30.82862663269043, "geo/layer_21/stable_rank_o_proj": 75.30509948730469, "geo/layer_21/stable_rank_gate_proj": 72.1685562133789, "geo/layer_21/stable_rank_down_proj": 55.023529052734375, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14434075355529785, "geo/layer_21/attn_entropy_mean": 5.73287296295166, "geo/layer_21/attn_entropy_std": 0.28696298599243164, "geo/layer_27/stable_rank_q_proj": 42.50466537475586, "geo/layer_27/stable_rank_k_proj": 31.649091720581055, "geo/layer_27/stable_rank_o_proj": 116.02081298828125, "geo/layer_27/stable_rank_gate_proj": 84.61873626708984, "geo/layer_27/stable_rank_down_proj": 131.16244506835938, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09444595873355865, "geo/layer_27/attn_entropy_mean": 4.286505699157715, "geo/layer_27/attn_entropy_std": 0.6607917547225952, "attnres/final_alpha/block_0": 0.24048857390880585, "attnres/block_norm/0": 1.7269823551177979, "attnres/final_alpha/block_1": 0.005051353946328163, "attnres/block_norm/1": 41190.94140625, "attnres/final_alpha/block_2": 0.011008611880242825, "attnres/block_norm/2": 26557.91015625, "attnres/final_alpha/block_3": 0.013046281412243843, "attnres/block_norm/3": 48494.18359375, "attnres/final_alpha/block_4": 0.01571265608072281, "attnres/block_norm/4": 13179.9736328125, "attnres/final_alpha/block_5": 0.6007074117660522, "attnres/block_norm/5": 6076.1416015625, "attnres/final_alpha/block_6": 0.11398513615131378, "attnres/block_norm/6": 32316.84765625, "geo/tier1_time_s": 1.3564445972442627, "geo/step": 42525.0, "geo/rankme_slope": -0.00017577525150685273} {"step": 42530, "timestamp": 1778240513.4143984, "train/loss": 2.1392569065093996, "train/z_loss": 0.0013970086351037026, "train/perplexity": 8.493124098172643, "train/grad_norm": 0.08984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1699741.8495492388, "perf/iters_per_sec": 0.8105000732179827, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2338061809539795, "data/tokens_consumed": 89193971712, "data/tokens_consumed_B": 89.193971712, "train/loss_slope": -3.960759869360411e-07} {"step": 42540, "timestamp": 1778240523.763333, "train/loss": 2.1322476863861084, "train/z_loss": 0.0013999074930325151, "train/perplexity": 8.433802065297385, "train/grad_norm": 0.2255859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027260.2830879097, "perf/iters_per_sec": 0.9666730323257016, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344759464263915, "data/tokens_consumed": 89214943232, "data/tokens_consumed_B": 89.214943232, "train/loss_slope": 6.646061816302346e-07} {"step": 42550, "timestamp": 1778240534.1096497, "grad/layer_0/attn": 0.002578055951744318, "grad/layer_0/mlp": 0.0026763209607452154, "grad/layer_0/attn_mlp_ratio": 0.9632835124147864, "grad/layer_4/attn": 0.0015620279591530561, "grad/layer_4/mlp": 0.0022981984075158834, "grad/layer_4/attn_mlp_ratio": 0.6796749515086266, "grad/layer_8/attn": 0.004489255137741566, "grad/layer_8/mlp": 0.0034925704821944237, "grad/layer_8/attn_mlp_ratio": 1.2853727740330771, "grad/layer_12/attn": 0.006112006958574057, "grad/layer_12/mlp": 0.006262690760195255, "grad/layer_12/attn_mlp_ratio": 0.9759394316301124, "grad/layer_16/attn": 0.003324189456179738, "grad/layer_16/mlp": 0.004386971239000559, "grad/layer_16/attn_mlp_ratio": 0.7577413206754745, "grad/layer_20/attn": 0.0031516633462160826, "grad/layer_20/mlp": 0.005315029993653297, "grad/layer_20/attn_mlp_ratio": 0.5929718723473455, "grad/layer_24/attn": 0.004101383499801159, "grad/layer_24/mlp": 0.007408992853015661, "grad/layer_24/attn_mlp_ratio": 0.5535682819257892, "grad/layer_27/attn": 0.009406879544258118, "grad/layer_27/mlp": 0.006715328432619572, "grad/layer_27/attn_mlp_ratio": 1.4008070489127065} {"step": 42550, "timestamp": 1778240534.1240232, "train/loss": 2.1911651611328127, "train/z_loss": 0.0013930082321166992, "train/perplexity": 8.945630145314476, "train/grad_norm": 0.08642578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025682.7935906104, "perf/iters_per_sec": 0.9659208267167141, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352815389633179, "data/tokens_consumed": 89235914752, "data/tokens_consumed_B": 89.235914752, "train/loss_slope": 2.03820988111152e-06} {"step": 42560, "timestamp": 1778240544.4803798, "train/loss": 2.155697786808014, "train/z_loss": 0.001404298224952072, "train/perplexity": 8.633912707892879, "train/grad_norm": 0.18359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026442.12274525, "perf/iters_per_sec": 0.9662829030729532, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348936080932618, "data/tokens_consumed": 89256886272, "data/tokens_consumed_B": 89.256886272, "train/loss_slope": 2.780739666640883e-06} {"step": 42570, "timestamp": 1778240554.8257327, "train/loss": 2.191033959388733, "train/z_loss": 0.0013992004794999957, "train/perplexity": 8.944456540028732, "train/grad_norm": 0.158203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028145.5533224517, "perf/iters_per_sec": 0.9670951620685824, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340244054794312, "data/tokens_consumed": 89277857792, "data/tokens_consumed_B": 89.277857792, "train/loss_slope": 1.22858301283462e-06} {"step": 42580, "timestamp": 1778240565.1771262, "train/loss": 2.162656831741333, "train/z_loss": 0.001395683945156634, "train/perplexity": 8.69420604306664, "train/grad_norm": 0.16015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027361.1159864168, "perf/iters_per_sec": 0.9667211131984791, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344244956970214, "data/tokens_consumed": 89298829312, "data/tokens_consumed_B": 89.298829312, "train/loss_slope": 7.735241936592436e-07} {"step": 42590, "timestamp": 1778240575.518879, "train/loss": 2.202386260032654, "train/z_loss": 0.0013971724663861095, "train/perplexity": 9.046575244175363, "train/grad_norm": 0.1015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029211.67003254, "perf/iters_per_sec": 0.9676035261309338, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0334811449050902, "data/tokens_consumed": 89319800832, "data/tokens_consumed_B": 89.319800832, "train/loss_slope": -1.210638234252713e-06} {"step": 42600, "timestamp": 1778240585.850501, "grad/layer_0/attn": 0.0025966463144868612, "grad/layer_0/mlp": 0.0027832866180688143, "grad/layer_0/attn_mlp_ratio": 0.9329424444954584, "grad/layer_4/attn": 0.0028010907117277384, "grad/layer_4/mlp": 0.0025204643607139587, "grad/layer_4/attn_mlp_ratio": 1.1113391025296537, "grad/layer_8/attn": 0.005540353246033192, "grad/layer_8/mlp": 0.0034877362195402384, "grad/layer_8/attn_mlp_ratio": 1.5885241137619985, "grad/layer_12/attn": 0.004572055768221617, "grad/layer_12/mlp": 0.006851118057966232, "grad/layer_12/attn_mlp_ratio": 0.667344463021032, "grad/layer_16/attn": 0.0034429735969752073, "grad/layer_16/mlp": 0.004435919225215912, "grad/layer_16/attn_mlp_ratio": 0.7761578479129861, "grad/layer_20/attn": 0.003518186043947935, "grad/layer_20/mlp": 0.005755140446126461, "grad/layer_20/attn_mlp_ratio": 0.6113119246611407, "grad/layer_24/attn": 0.016991320997476578, "grad/layer_24/mlp": 0.011445450596511364, "grad/layer_24/attn_mlp_ratio": 1.4845480049689634, "grad/layer_27/attn": 0.008393911644816399, "grad/layer_27/mlp": 0.011166870594024658, "grad/layer_27/attn_mlp_ratio": 0.75167984611015} {"step": 42600, "timestamp": 1778240586.4514043, "eos/sharpness": 75.49622058868407, "eos/L0_probe": 1.9942328929901123, "eos/L_plus": 2.3105173110961914, "eos/L_minus": 2.432910680770874, "eos/grad_norm": 0.21064573526382446, "eos/embed_grad_frac": 0.048372574150562286, "eos/time_s": 0.5978620052337646} {"step": 42600, "timestamp": 1778240586.4692917, "train/loss": 2.203935384750366, "train/z_loss": 0.0013880917336791753, "train/perplexity": 9.060600378032747, "train/grad_norm": 0.2109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1916449.4863612377, "perf/iters_per_sec": 0.9138343269163312, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0942902565002441, "data/tokens_consumed": 89340772352, "data/tokens_consumed_B": 89.340772352, "train/loss_slope": -1.8988085909001908e-06} {"step": 42600, "timestamp": 1778240587.8290575, "geo/rankme_last": 438.2228088378906, "geo/layer_0/stable_rank_q_proj": 19.300588607788086, "geo/layer_0/stable_rank_k_proj": 16.417421340942383, "geo/layer_0/stable_rank_o_proj": 48.772850036621094, "geo/layer_0/stable_rank_gate_proj": 136.67832946777344, "geo/layer_0/stable_rank_down_proj": 53.287513732910156, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05530122295022011, "geo/layer_0/attn_entropy_mean": 6.193498611450195, "geo/layer_0/attn_entropy_std": 0.38514745235443115, "geo/layer_7/stable_rank_q_proj": 43.00330352783203, "geo/layer_7/stable_rank_k_proj": 42.135921478271484, "geo/layer_7/stable_rank_o_proj": 97.1553955078125, "geo/layer_7/stable_rank_gate_proj": 88.84858703613281, "geo/layer_7/stable_rank_down_proj": 145.56167602539062, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.49522408843040466, "geo/layer_7/attn_entropy_mean": 4.645471572875977, "geo/layer_7/attn_entropy_std": 0.8194149136543274, "geo/layer_14/stable_rank_q_proj": 54.287147521972656, "geo/layer_14/stable_rank_k_proj": 37.73440170288086, "geo/layer_14/stable_rank_o_proj": 47.77512741088867, "geo/layer_14/stable_rank_gate_proj": 76.42150115966797, "geo/layer_14/stable_rank_down_proj": 133.48959350585938, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3810061812400818, "geo/layer_14/attn_entropy_mean": 5.510690689086914, "geo/layer_14/attn_entropy_std": 0.3651108741760254, "geo/layer_21/stable_rank_q_proj": 43.009525299072266, "geo/layer_21/stable_rank_k_proj": 30.806793212890625, "geo/layer_21/stable_rank_o_proj": 75.34484100341797, "geo/layer_21/stable_rank_gate_proj": 72.22250366210938, "geo/layer_21/stable_rank_down_proj": 55.042503356933594, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1471438705921173, "geo/layer_21/attn_entropy_mean": 5.7397379875183105, "geo/layer_21/attn_entropy_std": 0.28486669063568115, "geo/layer_27/stable_rank_q_proj": 42.485721588134766, "geo/layer_27/stable_rank_k_proj": 31.685792922973633, "geo/layer_27/stable_rank_o_proj": 116.0210952758789, "geo/layer_27/stable_rank_gate_proj": 84.63265991210938, "geo/layer_27/stable_rank_down_proj": 131.391845703125, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08583488315343857, "geo/layer_27/attn_entropy_mean": 4.272074222564697, "geo/layer_27/attn_entropy_std": 0.6775547862052917, "attnres/final_alpha/block_0": 0.24172717332839966, "attnres/block_norm/0": 1.7268955707550049, "attnres/final_alpha/block_1": 0.005077811889350414, "attnres/block_norm/1": 41033.0546875, "attnres/final_alpha/block_2": 0.010953043587505817, "attnres/block_norm/2": 26695.228515625, "attnres/final_alpha/block_3": 0.013096317648887634, "attnres/block_norm/3": 48312.09375, "attnres/final_alpha/block_4": 0.016005612909793854, "attnres/block_norm/4": 13200.189453125, "attnres/final_alpha/block_5": 0.5971423387527466, "attnres/block_norm/5": 6098.4697265625, "attnres/final_alpha/block_6": 0.11599764227867126, "attnres/block_norm/6": 32326.5546875, "geo/tier1_time_s": 1.3555781841278076, "geo/step": 42600.0, "geo/rankme_slope": -0.00019867097229516806} {"step": 42610, "timestamp": 1778240598.1849742, "train/loss": 2.1835968017578127, "train/z_loss": 0.0014044697163626552, "train/perplexity": 8.87818195954617, "train/grad_norm": 0.173828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1790619.4077493264, "perf/iters_per_sec": 0.8538338698145516, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1711880207061767, "data/tokens_consumed": 89361743872, "data/tokens_consumed_B": 89.361743872, "train/loss_slope": -4.668528884157524e-06} {"step": 42620, "timestamp": 1778240608.5331974, "train/loss": 2.1396320819854737, "train/z_loss": 0.0014049599296413362, "train/perplexity": 8.496311107856087, "train/grad_norm": 0.1376953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027843.8323648383, "perf/iters_per_sec": 0.9669512903045837, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341782569885254, "data/tokens_consumed": 89382715392, "data/tokens_consumed_B": 89.382715392, "train/loss_slope": -6.48728552931896e-06} {"step": 42630, "timestamp": 1778240618.8794782, "train/loss": 2.152372694015503, "train/z_loss": 0.0014175719697959721, "train/perplexity": 8.605251823433669, "train/grad_norm": 0.123046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028808.5044391248, "perf/iters_per_sec": 0.9674112817950844, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0336865186691284, "data/tokens_consumed": 89403686912, "data/tokens_consumed_B": 89.403686912, "train/loss_slope": -7.226647824236792e-06} {"step": 42640, "timestamp": 1778240629.2521124, "train/loss": 2.1576395988464356, "train/z_loss": 0.0014145137625746428, "train/perplexity": 8.65069443173129, "train/grad_norm": 0.08935546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023120.560128906, "perf/iters_per_sec": 0.964699058594182, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036592698097229, "data/tokens_consumed": 89424658432, "data/tokens_consumed_B": 89.424658432, "train/loss_slope": -8.594421835372032e-06} {"step": 42650, "timestamp": 1778240639.602723, "grad/layer_0/attn": 0.002379432786256075, "grad/layer_0/mlp": 0.002582307904958725, "grad/layer_0/attn_mlp_ratio": 0.9214364753108168, "grad/layer_4/attn": 0.001768536283634603, "grad/layer_4/mlp": 0.0025862781330943108, "grad/layer_4/attn_mlp_ratio": 0.6838151676815786, "grad/layer_8/attn": 0.005410360172390938, "grad/layer_8/mlp": 0.0036752193700522184, "grad/layer_8/attn_mlp_ratio": 1.472118935067044, "grad/layer_12/attn": 0.005168345291167498, "grad/layer_12/mlp": 0.006146267056465149, "grad/layer_12/attn_mlp_ratio": 0.8408917412141786, "grad/layer_16/attn": 0.006514654029160738, "grad/layer_16/mlp": 0.004327606409788132, "grad/layer_16/attn_mlp_ratio": 1.505371159421719, "grad/layer_20/attn": 0.0036505593452602625, "grad/layer_20/mlp": 0.00528436154127121, "grad/layer_20/attn_mlp_ratio": 0.6908231482018861, "grad/layer_24/attn": 0.008931068703532219, "grad/layer_24/mlp": 0.008883868344128132, "grad/layer_24/attn_mlp_ratio": 1.005313029982483, "grad/layer_27/attn": 0.004555476829409599, "grad/layer_27/mlp": 0.008451288565993309, "grad/layer_27/attn_mlp_ratio": 0.5390274796481797} {"step": 42650, "timestamp": 1778240639.6170466, "train/loss": 2.148314583301544, "train/z_loss": 0.0014356401981785894, "train/perplexity": 8.57040151983487, "train/grad_norm": 0.1083984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024659.9925427192, "perf/iters_per_sec": 0.9654331171716305, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035804533958435, "data/tokens_consumed": 89445629952, "data/tokens_consumed_B": 89.445629952, "train/loss_slope": -1.1312928172585661e-05} {"step": 42660, "timestamp": 1778240649.9676087, "train/loss": 2.1101505279541017, "train/z_loss": 0.0014178986661136151, "train/perplexity": 8.24948296896393, "train/grad_norm": 0.2490234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027651.8961014126, "perf/iters_per_sec": 0.9668597679621757, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342761516571044, "data/tokens_consumed": 89466601472, "data/tokens_consumed_B": 89.466601472, "train/loss_slope": -1.5903017351372e-05} {"step": 42670, "timestamp": 1778240660.3209736, "train/loss": 2.1139206171035765, "train/z_loss": 0.0014113055542111397, "train/perplexity": 8.280642956251096, "train/grad_norm": 0.142578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026486.2878430677, "perf/iters_per_sec": 0.9663039626326884, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348710536956787, "data/tokens_consumed": 89487572992, "data/tokens_consumed_B": 89.487572992, "train/loss_slope": -1.9805752805428882e-05} {"step": 42675, "timestamp": 1778240666.0762117, "eos/sharpness": 68.7817096710205, "eos/L0_probe": 1.9973664283752441, "eos/L_plus": 2.4103989601135254, "eos/L_minus": 2.272150993347168, "eos/grad_norm": 0.21972446143627167, "eos/embed_grad_frac": 0.043437473475933075, "eos/time_s": 0.5901634693145752} {"step": 42675, "timestamp": 1778240667.4568117, "geo/rankme_last": 439.558837890625, "geo/layer_0/stable_rank_q_proj": 19.300670623779297, "geo/layer_0/stable_rank_k_proj": 16.413986206054688, "geo/layer_0/stable_rank_o_proj": 48.82053756713867, "geo/layer_0/stable_rank_gate_proj": 136.46604919433594, "geo/layer_0/stable_rank_down_proj": 53.28926467895508, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06010251119732857, "geo/layer_0/attn_entropy_mean": 6.189864158630371, "geo/layer_0/attn_entropy_std": 0.38949406147003174, "geo/layer_7/stable_rank_q_proj": 42.98774337768555, "geo/layer_7/stable_rank_k_proj": 42.19506072998047, "geo/layer_7/stable_rank_o_proj": 97.28543090820312, "geo/layer_7/stable_rank_gate_proj": 88.77336120605469, "geo/layer_7/stable_rank_down_proj": 145.24600219726562, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4885135889053345, "geo/layer_7/attn_entropy_mean": 4.622363090515137, "geo/layer_7/attn_entropy_std": 0.8317384719848633, "geo/layer_14/stable_rank_q_proj": 54.223995208740234, "geo/layer_14/stable_rank_k_proj": 37.73360824584961, "geo/layer_14/stable_rank_o_proj": 47.803558349609375, "geo/layer_14/stable_rank_gate_proj": 76.2714614868164, "geo/layer_14/stable_rank_down_proj": 133.3303985595703, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3968229293823242, "geo/layer_14/attn_entropy_mean": 5.469021320343018, "geo/layer_14/attn_entropy_std": 0.3684816360473633, "geo/layer_21/stable_rank_q_proj": 42.91791915893555, "geo/layer_21/stable_rank_k_proj": 30.913124084472656, "geo/layer_21/stable_rank_o_proj": 75.33092498779297, "geo/layer_21/stable_rank_gate_proj": 72.06948852539062, "geo/layer_21/stable_rank_down_proj": 54.91730499267578, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14684568345546722, "geo/layer_21/attn_entropy_mean": 5.710697174072266, "geo/layer_21/attn_entropy_std": 0.287326455116272, "geo/layer_27/stable_rank_q_proj": 42.45359420776367, "geo/layer_27/stable_rank_k_proj": 31.670907974243164, "geo/layer_27/stable_rank_o_proj": 116.23482513427734, "geo/layer_27/stable_rank_gate_proj": 84.63762664794922, "geo/layer_27/stable_rank_down_proj": 131.40701293945312, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08778034895658493, "geo/layer_27/attn_entropy_mean": 4.257027626037598, "geo/layer_27/attn_entropy_std": 0.6759445071220398, "attnres/final_alpha/block_0": 0.23982185125350952, "attnres/block_norm/0": 1.7270631790161133, "attnres/final_alpha/block_1": 0.005049956031143665, "attnres/block_norm/1": 41329.40625, "attnres/final_alpha/block_2": 0.011136909946799278, "attnres/block_norm/2": 26566.0078125, "attnres/final_alpha/block_3": 0.01319936104118824, "attnres/block_norm/3": 48679.50390625, "attnres/final_alpha/block_4": 0.015988904982805252, "attnres/block_norm/4": 13131.740234375, "attnres/final_alpha/block_5": 0.600695013999939, "attnres/block_norm/5": 6089.9853515625, "attnres/final_alpha/block_6": 0.11410801112651825, "attnres/block_norm/6": 32297.38671875, "geo/tier1_time_s": 1.3597362041473389, "geo/step": 42675.0, "geo/rankme_slope": -0.0001853843881302521} {"step": 42680, "timestamp": 1778240672.6344492, "train/loss": 2.154155945777893, "train/z_loss": 0.0014136084471829235, "train/perplexity": 8.620610844344048, "train/grad_norm": 0.2177734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1704155.8299197622, "perf/iters_per_sec": 0.8126048230742274, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.230610466003418, "data/tokens_consumed": 89508544512, "data/tokens_consumed_B": 89.508544512, "train/loss_slope": -1.9501137182657488e-05} {"step": 42690, "timestamp": 1778240682.9822319, "train/loss": 2.1939868211746214, "train/z_loss": 0.0014159728307276964, "train/perplexity": 8.970907317466718, "train/grad_norm": 0.2021484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027603.193263302, "perf/iters_per_sec": 0.9668365446392546, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343009948730468, "data/tokens_consumed": 89529516032, "data/tokens_consumed_B": 89.529516032, "train/loss_slope": -1.8786320441698437e-05} {"step": 42700, "timestamp": 1778240693.328789, "grad/layer_0/attn": 0.003208838403224945, "grad/layer_0/mlp": 0.003123549511656165, "grad/layer_0/attn_mlp_ratio": 1.0273050862552358, "grad/layer_4/attn": 0.0023077288642525673, "grad/layer_4/mlp": 0.0027641986962407827, "grad/layer_4/attn_mlp_ratio": 0.8348635660325882, "grad/layer_8/attn": 0.004720726050436497, "grad/layer_8/mlp": 0.0037809619680047035, "grad/layer_8/attn_mlp_ratio": 1.248551549983607, "grad/layer_12/attn": 0.0044492981396615505, "grad/layer_12/mlp": 0.006571515928953886, "grad/layer_12/attn_mlp_ratio": 0.6770580974098044, "grad/layer_16/attn": 0.0036672961432486773, "grad/layer_16/mlp": 0.004756533075124025, "grad/layer_16/attn_mlp_ratio": 0.771001905847751, "grad/layer_20/attn": 0.007091455161571503, "grad/layer_20/mlp": 0.007467043120414019, "grad/layer_20/attn_mlp_ratio": 0.9497005644997872, "grad/layer_24/attn": 0.02643490768969059, "grad/layer_24/mlp": 0.016216011717915535, "grad/layer_24/attn_mlp_ratio": 1.6301731884830746, "grad/layer_27/attn": 0.007856962271034718, "grad/layer_27/mlp": 0.014932839199900627, "grad/layer_27/attn_mlp_ratio": 0.5261532728800613} {"step": 42700, "timestamp": 1778240693.3433473, "train/loss": 2.16600558757782, "train/z_loss": 0.0014164250693283974, "train/perplexity": 8.723369619892274, "train/grad_norm": 0.298828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025348.5071244035, "perf/iters_per_sec": 0.9657614265081422, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03545241355896, "data/tokens_consumed": 89550487552, "data/tokens_consumed_B": 89.550487552, "train/loss_slope": -2.1591293500630772e-05} {"step": 42710, "timestamp": 1778240703.7182996, "train/loss": 2.187172269821167, "train/z_loss": 0.001393043261487037, "train/perplexity": 8.9099824325129, "train/grad_norm": 0.1875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022411.983176543, "perf/iters_per_sec": 0.964361182773849, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0369558811187745, "data/tokens_consumed": 89571459072, "data/tokens_consumed_B": 89.571459072, "train/loss_slope": -1.6741884065897618e-05} {"step": 42720, "timestamp": 1778240714.0944138, "train/loss": 2.145897698402405, "train/z_loss": 0.001392986113205552, "train/perplexity": 8.549712856951254, "train/grad_norm": 0.2177734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022617.9509878128, "perf/iters_per_sec": 0.9644593958796562, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0368502855300903, "data/tokens_consumed": 89592430592, "data/tokens_consumed_B": 89.592430592, "train/loss_slope": -2.0041366994041185e-05} {"step": 42730, "timestamp": 1778240724.4775681, "train/loss": 2.159558129310608, "train/z_loss": 0.0013912645517848432, "train/perplexity": 8.667306983282367, "train/grad_norm": 0.22265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021032.0724950049, "perf/iters_per_sec": 0.9637031900858902, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0376638889312744, "data/tokens_consumed": 89613402112, "data/tokens_consumed_B": 89.613402112, "train/loss_slope": -2.0173414948821385e-05} {"step": 42740, "timestamp": 1778240734.8497512, "train/loss": 2.127126097679138, "train/z_loss": 0.0013917601085267962, "train/perplexity": 8.390718023430976, "train/grad_norm": 0.208984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022877.5050484587, "perf/iters_per_sec": 0.9645831609003347, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0367172479629516, "data/tokens_consumed": 89634373632, "data/tokens_consumed_B": 89.634373632, "train/loss_slope": -2.277733386904135e-05} {"step": 42750, "timestamp": 1778240745.2160044, "grad/layer_0/attn": 0.002424311824142933, "grad/layer_0/mlp": 0.002715208102017641, "grad/layer_0/attn_mlp_ratio": 0.8928640618945736, "grad/layer_4/attn": 0.0019084627274423838, "grad/layer_4/mlp": 0.0024718206841498613, "grad/layer_4/attn_mlp_ratio": 0.7720878227418763, "grad/layer_8/attn": 0.009800477884709835, "grad/layer_8/mlp": 0.0036303664091974497, "grad/layer_8/attn_mlp_ratio": 2.699583598482564, "grad/layer_12/attn": 0.0036751513835042715, "grad/layer_12/mlp": 0.006101712118834257, "grad/layer_12/attn_mlp_ratio": 0.6023147686579057, "grad/layer_16/attn": 0.005059762857854366, "grad/layer_16/mlp": 0.00440192362293601, "grad/layer_16/attn_mlp_ratio": 1.1494435561185936, "grad/layer_20/attn": 0.0034431328531354666, "grad/layer_20/mlp": 0.006367701571434736, "grad/layer_20/attn_mlp_ratio": 0.5407183047819636, "grad/layer_24/attn": 0.005974875297397375, "grad/layer_24/mlp": 0.00867715198546648, "grad/layer_24/attn_mlp_ratio": 0.688575610816457, "grad/layer_27/attn": 0.008593681268393993, "grad/layer_27/mlp": 0.006730965804308653, "grad/layer_27/attn_mlp_ratio": 1.2767381963549944} {"step": 42750, "timestamp": 1778240745.8067536, "eos/sharpness": 28.49841117858886, "eos/L0_probe": 1.998842477798462, "eos/L_plus": 2.172966480255127, "eos/L_minus": 2.1097025871276855, "eos/grad_norm": 0.10576290637254715, "eos/embed_grad_frac": 0.20812873542308807, "eos/time_s": 0.5879766941070557} {"step": 42750, "timestamp": 1778240745.8264322, "train/loss": 2.1468828320503235, "train/z_loss": 0.0014166268054395914, "train/perplexity": 8.558139616827521, "train/grad_norm": 0.10595703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1911856.4432277994, "perf/iters_per_sec": 0.911644193281078, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0969191789627075, "data/tokens_consumed": 89655345152, "data/tokens_consumed_B": 89.655345152, "train/loss_slope": -2.1957759702667033e-05} {"step": 42750, "timestamp": 1778240747.185274, "geo/rankme_last": 439.63616943359375, "geo/layer_0/stable_rank_q_proj": 19.339862823486328, "geo/layer_0/stable_rank_k_proj": 16.38563346862793, "geo/layer_0/stable_rank_o_proj": 48.90571594238281, "geo/layer_0/stable_rank_gate_proj": 136.11866760253906, "geo/layer_0/stable_rank_down_proj": 53.3044548034668, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06776563823223114, "geo/layer_0/attn_entropy_mean": 6.194036483764648, "geo/layer_0/attn_entropy_std": 0.3881848454475403, "geo/layer_7/stable_rank_q_proj": 42.96523666381836, "geo/layer_7/stable_rank_k_proj": 42.18517303466797, "geo/layer_7/stable_rank_o_proj": 97.2935791015625, "geo/layer_7/stable_rank_gate_proj": 88.63629913330078, "geo/layer_7/stable_rank_down_proj": 145.56820678710938, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5043376684188843, "geo/layer_7/attn_entropy_mean": 4.60722541809082, "geo/layer_7/attn_entropy_std": 0.8304729461669922, "geo/layer_14/stable_rank_q_proj": 54.31835174560547, "geo/layer_14/stable_rank_k_proj": 37.867279052734375, "geo/layer_14/stable_rank_o_proj": 47.818878173828125, "geo/layer_14/stable_rank_gate_proj": 76.12862396240234, "geo/layer_14/stable_rank_down_proj": 133.18817138671875, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37913748621940613, "geo/layer_14/attn_entropy_mean": 5.470998764038086, "geo/layer_14/attn_entropy_std": 0.3602316975593567, "geo/layer_21/stable_rank_q_proj": 42.928104400634766, "geo/layer_21/stable_rank_k_proj": 30.81999397277832, "geo/layer_21/stable_rank_o_proj": 75.21623992919922, "geo/layer_21/stable_rank_gate_proj": 72.03681945800781, "geo/layer_21/stable_rank_down_proj": 54.87171173095703, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14641204476356506, "geo/layer_21/attn_entropy_mean": 5.711555480957031, "geo/layer_21/attn_entropy_std": 0.29509618878364563, "geo/layer_27/stable_rank_q_proj": 42.55962371826172, "geo/layer_27/stable_rank_k_proj": 31.687095642089844, "geo/layer_27/stable_rank_o_proj": 116.52518463134766, "geo/layer_27/stable_rank_gate_proj": 84.47020721435547, "geo/layer_27/stable_rank_down_proj": 131.67800903320312, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09105511009693146, "geo/layer_27/attn_entropy_mean": 4.257785797119141, "geo/layer_27/attn_entropy_std": 0.6807461380958557, "attnres/final_alpha/block_0": 0.23964938521385193, "attnres/block_norm/0": 1.7272248268127441, "attnres/final_alpha/block_1": 0.005002231802791357, "attnres/block_norm/1": 41246.4609375, "attnres/final_alpha/block_2": 0.0110408840700984, "attnres/block_norm/2": 26640.0, "attnres/final_alpha/block_3": 0.013357419520616531, "attnres/block_norm/3": 48490.32421875, "attnres/final_alpha/block_4": 0.015761848539114, "attnres/block_norm/4": 13258.423828125, "attnres/final_alpha/block_5": 0.6002793908119202, "attnres/block_norm/5": 6142.943359375, "attnres/final_alpha/block_6": 0.11490881443023682, "attnres/block_norm/6": 32333.9296875, "geo/tier1_time_s": 1.355591058731079, "geo/step": 42750.0, "geo/rankme_slope": -0.0001710270631690176} {"step": 42760, "timestamp": 1778240757.555811, "train/loss": 2.13882292509079, "train/z_loss": 0.0014099595719017088, "train/perplexity": 8.489439039809355, "train/grad_norm": 0.10595703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1788437.4516898259, "perf/iters_per_sec": 0.8527934320878152, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1726169109344482, "data/tokens_consumed": 89676316672, "data/tokens_consumed_B": 89.676316672, "train/loss_slope": -2.35576751888102e-05} {"step": 42770, "timestamp": 1778240767.908959, "train/loss": 2.1798192620277406, "train/z_loss": 0.0013970999862067401, "train/perplexity": 8.844707539744661, "train/grad_norm": 0.16796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027121.806057011, "perf/iters_per_sec": 0.9666070013318114, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345466136932373, "data/tokens_consumed": 89697288192, "data/tokens_consumed_B": 89.697288192, "train/loss_slope": -2.4109827059366057e-05} {"step": 42780, "timestamp": 1778240778.2690742, "train/loss": 2.175798201560974, "train/z_loss": 0.0014011918334290384, "train/perplexity": 8.809213844887653, "train/grad_norm": 0.1728515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025261.723561007, "perf/iters_per_sec": 0.9657200448803935, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354967832565307, "data/tokens_consumed": 89718259712, "data/tokens_consumed_B": 89.718259712, "train/loss_slope": -2.4253430239187833e-05} {"step": 42790, "timestamp": 1778240788.6153154, "train/loss": 2.1835813999176024, "train/z_loss": 0.0014024315867573022, "train/perplexity": 8.878045220259292, "train/grad_norm": 0.11669921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028551.683428255, "perf/iters_per_sec": 0.9672888199940943, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0338173866271974, "data/tokens_consumed": 89739231232, "data/tokens_consumed_B": 89.739231232, "train/loss_slope": -2.2270238460308413e-05} {"step": 42800, "timestamp": 1778240798.951853, "grad/layer_0/attn": 0.0026980440597981215, "grad/layer_0/mlp": 0.0027190707623958588, "grad/layer_0/attn_mlp_ratio": 0.992266916288011, "grad/layer_4/attn": 0.001883368706330657, "grad/layer_4/mlp": 0.0024961247108876705, "grad/layer_4/attn_mlp_ratio": 0.7545170410212358, "grad/layer_8/attn": 0.005585900973528624, "grad/layer_8/mlp": 0.003672506893053651, "grad/layer_8/attn_mlp_ratio": 1.521004856925815, "grad/layer_12/attn": 0.0066858455538749695, "grad/layer_12/mlp": 0.007080855779349804, "grad/layer_12/attn_mlp_ratio": 0.9442143249057194, "grad/layer_16/attn": 0.0036863572895526886, "grad/layer_16/mlp": 0.004490075167268515, "grad/layer_16/attn_mlp_ratio": 0.8210012238381121, "grad/layer_20/attn": 0.009865586645901203, "grad/layer_20/mlp": 0.005971988197416067, "grad/layer_20/attn_mlp_ratio": 1.6519768885297048, "grad/layer_24/attn": 0.006298295222222805, "grad/layer_24/mlp": 0.007968779653310776, "grad/layer_24/attn_mlp_ratio": 0.7903713513485001, "grad/layer_27/attn": 0.004299499094486237, "grad/layer_27/mlp": 0.00654178811237216, "grad/layer_27/attn_mlp_ratio": 0.6572360576202707} {"step": 42800, "timestamp": 1778240798.9660077, "train/loss": 2.1828653812408447, "train/z_loss": 0.0013899482786655427, "train/perplexity": 8.87169064933571, "train/grad_norm": 0.1318359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027536.0787903806, "perf/iters_per_sec": 0.9668045419647124, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034335231781006, "data/tokens_consumed": 89760202752, "data/tokens_consumed_B": 89.760202752, "train/loss_slope": -1.9689909502653726e-05} {"step": 42810, "timestamp": 1778240809.3175313, "train/loss": 2.11974937915802, "train/z_loss": 0.00140371595043689, "train/perplexity": 8.329049792616754, "train/grad_norm": 0.1865234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027344.3409579128, "perf/iters_per_sec": 0.9667131142415585, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344330549240113, "data/tokens_consumed": 89781174272, "data/tokens_consumed_B": 89.781174272, "train/loss_slope": -2.3485791672467063e-05} {"step": 42820, "timestamp": 1778240819.6639633, "train/loss": 2.1969864845275877, "train/z_loss": 0.001401781931053847, "train/perplexity": 8.997857419798905, "train/grad_norm": 0.1220703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028275.5646103967, "perf/iters_per_sec": 0.9671571562816604, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033958125114441, "data/tokens_consumed": 89802145792, "data/tokens_consumed_B": 89.802145792, "train/loss_slope": -1.974392961604507e-05} {"step": 42825, "timestamp": 1778240825.4147928, "eos/sharpness": 21.59771919250488, "eos/L0_probe": 1.9976533651351929, "eos/L_plus": 2.0971744060516357, "eos/L_minus": 2.114109516143799, "eos/grad_norm": 0.09084615856409073, "eos/embed_grad_frac": 0.2676924169063568, "eos/time_s": 0.5850622653961182} {"step": 42825, "timestamp": 1778240826.7874856, "geo/rankme_last": 439.7309265136719, "geo/layer_0/stable_rank_q_proj": 19.28475570678711, "geo/layer_0/stable_rank_k_proj": 16.365602493286133, "geo/layer_0/stable_rank_o_proj": 48.90995788574219, "geo/layer_0/stable_rank_gate_proj": 135.95208740234375, "geo/layer_0/stable_rank_down_proj": 53.29920196533203, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06109461560845375, "geo/layer_0/attn_entropy_mean": 6.187894821166992, "geo/layer_0/attn_entropy_std": 0.3818792998790741, "geo/layer_7/stable_rank_q_proj": 43.09656524658203, "geo/layer_7/stable_rank_k_proj": 42.36044692993164, "geo/layer_7/stable_rank_o_proj": 97.08472442626953, "geo/layer_7/stable_rank_gate_proj": 88.70077514648438, "geo/layer_7/stable_rank_down_proj": 145.93984985351562, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5050371289253235, "geo/layer_7/attn_entropy_mean": 4.648014545440674, "geo/layer_7/attn_entropy_std": 0.8466233611106873, "geo/layer_14/stable_rank_q_proj": 54.32177734375, "geo/layer_14/stable_rank_k_proj": 37.96385955810547, "geo/layer_14/stable_rank_o_proj": 47.73480987548828, "geo/layer_14/stable_rank_gate_proj": 76.09358215332031, "geo/layer_14/stable_rank_down_proj": 133.2589111328125, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.393718421459198, "geo/layer_14/attn_entropy_mean": 5.504273414611816, "geo/layer_14/attn_entropy_std": 0.37160825729370117, "geo/layer_21/stable_rank_q_proj": 42.856666564941406, "geo/layer_21/stable_rank_k_proj": 30.843664169311523, "geo/layer_21/stable_rank_o_proj": 75.1365737915039, "geo/layer_21/stable_rank_gate_proj": 72.0315933227539, "geo/layer_21/stable_rank_down_proj": 54.85641860961914, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14484091103076935, "geo/layer_21/attn_entropy_mean": 5.696744918823242, "geo/layer_21/attn_entropy_std": 0.2952643930912018, "geo/layer_27/stable_rank_q_proj": 42.5992317199707, "geo/layer_27/stable_rank_k_proj": 31.57851791381836, "geo/layer_27/stable_rank_o_proj": 116.64219665527344, "geo/layer_27/stable_rank_gate_proj": 84.47710418701172, "geo/layer_27/stable_rank_down_proj": 131.94305419921875, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0819576159119606, "geo/layer_27/attn_entropy_mean": 4.26197624206543, "geo/layer_27/attn_entropy_std": 0.6613396406173706, "attnres/final_alpha/block_0": 0.24123913049697876, "attnres/block_norm/0": 1.727447509765625, "attnres/final_alpha/block_1": 0.00517460610717535, "attnres/block_norm/1": 41132.1640625, "attnres/final_alpha/block_2": 0.01127554103732109, "attnres/block_norm/2": 26671.90625, "attnres/final_alpha/block_3": 0.013452501967549324, "attnres/block_norm/3": 48543.203125, "attnres/final_alpha/block_4": 0.01591164991259575, "attnres/block_norm/4": 13169.3134765625, "attnres/final_alpha/block_5": 0.5964767336845398, "attnres/block_norm/5": 6176.390625, "attnres/final_alpha/block_6": 0.1164698451757431, "attnres/block_norm/6": 32386.09375, "geo/tier1_time_s": 1.3551979064941406, "geo/step": 42825.0, "geo/rankme_slope": -0.00015099207651810725} {"step": 42830, "timestamp": 1778240832.4763348, "train/loss": 2.1837814807891847, "train/z_loss": 0.0014038434834219514, "train/perplexity": 8.87982172500129, "train/grad_norm": 0.2353515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1637764.3885246213, "perf/iters_per_sec": 0.7809469168303591, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2804967641830445, "data/tokens_consumed": 89823117312, "data/tokens_consumed_B": 89.823117312, "train/loss_slope": -1.849450509969512e-05} {"step": 42840, "timestamp": 1778240842.8275578, "train/loss": 2.218345808982849, "train/z_loss": 0.0013939710799604655, "train/perplexity": 9.192112772159104, "train/grad_norm": 0.173828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027158.2922648757, "perf/iters_per_sec": 0.9666243993114833, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345279932022096, "data/tokens_consumed": 89844088832, "data/tokens_consumed_B": 89.844088832, "train/loss_slope": -1.3784349626368876e-05} {"step": 42850, "timestamp": 1778240853.161412, "grad/layer_0/attn": 0.0029232334345579147, "grad/layer_0/mlp": 0.0029503987170755863, "grad/layer_0/attn_mlp_ratio": 0.9907926405201729, "grad/layer_4/attn": 0.0032434938475489616, "grad/layer_4/mlp": 0.0025736542884260416, "grad/layer_4/attn_mlp_ratio": 1.2602678363245854, "grad/layer_8/attn": 0.012538308277726173, "grad/layer_8/mlp": 0.0037697781808674335, "grad/layer_8/attn_mlp_ratio": 3.3260068215049197, "grad/layer_12/attn": 0.005092905834317207, "grad/layer_12/mlp": 0.005829021334648132, "grad/layer_12/attn_mlp_ratio": 0.8737154068510712, "grad/layer_16/attn": 0.0037728725001215935, "grad/layer_16/mlp": 0.004460758529603481, "grad/layer_16/attn_mlp_ratio": 0.8457916720898584, "grad/layer_20/attn": 0.005566139239817858, "grad/layer_20/mlp": 0.005650432780385017, "grad/layer_20/attn_mlp_ratio": 0.9850819145450295, "grad/layer_24/attn": 0.005444697570055723, "grad/layer_24/mlp": 0.0077438042499125, "grad/layer_24/attn_mlp_ratio": 0.7031037102735227, "grad/layer_27/attn": 0.003305488033220172, "grad/layer_27/mlp": 0.006548806559294462, "grad/layer_27/attn_mlp_ratio": 0.5047466210548183} {"step": 42850, "timestamp": 1778240853.175669, "train/loss": 2.143769931793213, "train/z_loss": 0.0014169761561788619, "train/perplexity": 8.531540403641884, "train/grad_norm": 0.08837890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027798.9068601192, "perf/iters_per_sec": 0.9669298681545826, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034201169013977, "data/tokens_consumed": 89865060352, "data/tokens_consumed_B": 89.865060352, "train/loss_slope": -1.4257812986422522e-05} {"step": 42860, "timestamp": 1778240863.5203147, "train/loss": 2.1950664043426515, "train/z_loss": 0.0013994640437886118, "train/perplexity": 8.980597387685819, "train/grad_norm": 0.25, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028475.524568001, "perf/iters_per_sec": 0.9672525046195989, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033856201171875, "data/tokens_consumed": 89886031872, "data/tokens_consumed_B": 89.886031872, "train/loss_slope": -1.4329055462232605e-05} {"step": 42870, "timestamp": 1778240873.8725302, "train/loss": 2.183179783821106, "train/z_loss": 0.001403604308143258, "train/perplexity": 8.874480370291892, "train/grad_norm": 0.212890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026777.424152125, "perf/iters_per_sec": 0.966442787242949, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347223997116088, "data/tokens_consumed": 89907003392, "data/tokens_consumed_B": 89.907003392, "train/loss_slope": -1.2505809129840289e-05} {"step": 42880, "timestamp": 1778240884.214754, "train/loss": 2.1915048122406007, "train/z_loss": 0.0014002539333887397, "train/perplexity": 8.948669054558424, "train/grad_norm": 0.10107421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028795.6829012965, "perf/iters_per_sec": 0.9674051680094226, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033693051338196, "data/tokens_consumed": 89927974912, "data/tokens_consumed_B": 89.927974912, "train/loss_slope": -1.266567055398623e-05} {"step": 42890, "timestamp": 1778240894.5714145, "train/loss": 2.209590196609497, "train/z_loss": 0.0014058360364288091, "train/perplexity": 9.111981506892347, "train/grad_norm": 0.1435546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026135.541185307, "perf/iters_per_sec": 0.9661367135931526, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350502014160157, "data/tokens_consumed": 89948946432, "data/tokens_consumed_B": 89.948946432, "train/loss_slope": -7.155966479750389e-06} {"step": 42900, "timestamp": 1778240904.9097507, "grad/layer_0/attn": 0.003274213057011366, "grad/layer_0/mlp": 0.003153610974550247, "grad/layer_0/attn_mlp_ratio": 1.0382425034698732, "grad/layer_4/attn": 0.00221975427120924, "grad/layer_4/mlp": 0.0024950618389993906, "grad/layer_4/attn_mlp_ratio": 0.8896589846180095, "grad/layer_8/attn": 0.005420370027422905, "grad/layer_8/mlp": 0.0037320582196116447, "grad/layer_8/attn_mlp_ratio": 1.4523808481071498, "grad/layer_12/attn": 0.00711170956492424, "grad/layer_12/mlp": 0.006659981794655323, "grad/layer_12/attn_mlp_ratio": 1.0678271619073667, "grad/layer_16/attn": 0.006739066913723946, "grad/layer_16/mlp": 0.004683252424001694, "grad/layer_16/attn_mlp_ratio": 1.438971500935769, "grad/layer_20/attn": 0.0042393747717142105, "grad/layer_20/mlp": 0.005847375839948654, "grad/layer_20/attn_mlp_ratio": 0.7250046542674382, "grad/layer_24/attn": 0.01441335678100586, "grad/layer_24/mlp": 0.010448390617966652, "grad/layer_24/attn_mlp_ratio": 1.3794810291905724, "grad/layer_27/attn": 0.008676555939018726, "grad/layer_27/mlp": 0.010361137799918652, "grad/layer_27/attn_mlp_ratio": 0.837413421462796} {"step": 42900, "timestamp": 1778240905.5069532, "eos/sharpness": 66.87283515930174, "eos/L0_probe": 1.9974658489227295, "eos/L_plus": 2.278144598007202, "eos/L_minus": 2.3855154514312744, "eos/grad_norm": 0.17608575522899628, "eos/embed_grad_frac": 0.09811533242464066, "eos/time_s": 0.5944211483001709} {"step": 42900, "timestamp": 1778240905.5272253, "train/loss": 2.1426359534263613, "train/z_loss": 0.0014141584280878306, "train/perplexity": 8.521871304698877, "train/grad_norm": 0.17578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1915083.4132817772, "perf/iters_per_sec": 0.9131829325112234, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0950708389282227, "data/tokens_consumed": 89969917952, "data/tokens_consumed_B": 89.969917952, "train/loss_slope": -6.819927417012319e-06} {"step": 42900, "timestamp": 1778240906.889023, "geo/rankme_last": 438.7285461425781, "geo/layer_0/stable_rank_q_proj": 19.254005432128906, "geo/layer_0/stable_rank_k_proj": 16.380176544189453, "geo/layer_0/stable_rank_o_proj": 48.94729232788086, "geo/layer_0/stable_rank_gate_proj": 135.96475219726562, "geo/layer_0/stable_rank_down_proj": 53.27668762207031, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05974467843770981, "geo/layer_0/attn_entropy_mean": 6.186985015869141, "geo/layer_0/attn_entropy_std": 0.3821367919445038, "geo/layer_7/stable_rank_q_proj": 43.0634880065918, "geo/layer_7/stable_rank_k_proj": 42.350460052490234, "geo/layer_7/stable_rank_o_proj": 97.16466522216797, "geo/layer_7/stable_rank_gate_proj": 88.76068115234375, "geo/layer_7/stable_rank_down_proj": 146.2256622314453, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5059811472892761, "geo/layer_7/attn_entropy_mean": 4.6397175788879395, "geo/layer_7/attn_entropy_std": 0.8275281190872192, "geo/layer_14/stable_rank_q_proj": 54.23572540283203, "geo/layer_14/stable_rank_k_proj": 37.90282440185547, "geo/layer_14/stable_rank_o_proj": 47.71821212768555, "geo/layer_14/stable_rank_gate_proj": 76.16889953613281, "geo/layer_14/stable_rank_down_proj": 133.33334350585938, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3860415816307068, "geo/layer_14/attn_entropy_mean": 5.50757360458374, "geo/layer_14/attn_entropy_std": 0.3612622320652008, "geo/layer_21/stable_rank_q_proj": 42.86722946166992, "geo/layer_21/stable_rank_k_proj": 30.798233032226562, "geo/layer_21/stable_rank_o_proj": 75.02177429199219, "geo/layer_21/stable_rank_gate_proj": 71.97604370117188, "geo/layer_21/stable_rank_down_proj": 54.79572677612305, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1460399627685547, "geo/layer_21/attn_entropy_mean": 5.7206926345825195, "geo/layer_21/attn_entropy_std": 0.2947290539741516, "geo/layer_27/stable_rank_q_proj": 42.635799407958984, "geo/layer_27/stable_rank_k_proj": 31.640336990356445, "geo/layer_27/stable_rank_o_proj": 116.52413940429688, "geo/layer_27/stable_rank_gate_proj": 84.55952453613281, "geo/layer_27/stable_rank_down_proj": 132.11605834960938, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08867582678794861, "geo/layer_27/attn_entropy_mean": 4.269533634185791, "geo/layer_27/attn_entropy_std": 0.6532101631164551, "attnres/final_alpha/block_0": 0.2420491874217987, "attnres/block_norm/0": 1.727676510810852, "attnres/final_alpha/block_1": 0.005179641302675009, "attnres/block_norm/1": 41237.47265625, "attnres/final_alpha/block_2": 0.011005384847521782, "attnres/block_norm/2": 26768.7578125, "attnres/final_alpha/block_3": 0.013073116540908813, "attnres/block_norm/3": 48589.73046875, "attnres/final_alpha/block_4": 0.016237910836935043, "attnres/block_norm/4": 13212.7958984375, "attnres/final_alpha/block_5": 0.5941779017448425, "attnres/block_norm/5": 6138.8427734375, "attnres/final_alpha/block_6": 0.11827686429023743, "attnres/block_norm/6": 32431.96875, "geo/tier1_time_s": 1.3576421737670898, "geo/step": 42900.0, "geo/rankme_slope": -0.0001337098511279512} {"step": 42910, "timestamp": 1778240917.2370276, "train/loss": 2.178963613510132, "train/z_loss": 0.0013996896333992482, "train/perplexity": 8.837142815683594, "train/grad_norm": 0.10009765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1791544.663047334, "perf/iters_per_sec": 0.8542750659214658, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.170583152770996, "data/tokens_consumed": 89990889472, "data/tokens_consumed_B": 89.990889472, "train/loss_slope": -1.7003722209455288e-06} {"step": 42920, "timestamp": 1778240928.0867095, "train/loss": 2.1699344873428346, "train/z_loss": 0.0013960468117147685, "train/perplexity": 8.757710281075582, "train/grad_norm": 0.21875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1934169.2411430124, "perf/iters_per_sec": 0.9222837644305288, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0842649936676025, "data/tokens_consumed": 90011860992, "data/tokens_consumed_B": 90.011860992, "train/loss_slope": -1.6699796391076912e-06} {"step": 42930, "timestamp": 1778240938.434491, "train/loss": 2.2178977489471436, "train/z_loss": 0.001395439403131604, "train/perplexity": 9.187995076338561, "train/grad_norm": 0.193359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027575.4776255516, "perf/iters_per_sec": 0.9668233287933119, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343151330947875, "data/tokens_consumed": 90032832512, "data/tokens_consumed_B": 90.032832512, "train/loss_slope": 2.511855985822908e-06} {"step": 42940, "timestamp": 1778240948.7763343, "train/loss": 2.152144455909729, "train/z_loss": 0.0014111513970419764, "train/perplexity": 8.60328800117584, "train/grad_norm": 0.16796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028938.7879396412, "perf/iters_per_sec": 0.9674734058092314, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0336201429367065, "data/tokens_consumed": 90053804032, "data/tokens_consumed_B": 90.053804032, "train/loss_slope": 2.062945819423275e-06} {"step": 42950, "timestamp": 1778240959.107241, "grad/layer_0/attn": 0.0025306919123977423, "grad/layer_0/mlp": 0.002719076117500663, "grad/layer_0/attn_mlp_ratio": 0.9307175341792806, "grad/layer_4/attn": 0.002148995641618967, "grad/layer_4/mlp": 0.00239288411103189, "grad/layer_4/attn_mlp_ratio": 0.8980775717067605, "grad/layer_8/attn": 0.004011428449302912, "grad/layer_8/mlp": 0.0036995885893702507, "grad/layer_8/attn_mlp_ratio": 1.08429038634177, "grad/layer_12/attn": 0.006345285102725029, "grad/layer_12/mlp": 0.006561426445841789, "grad/layer_12/attn_mlp_ratio": 0.9670587727216521, "grad/layer_16/attn": 0.0044358340092003345, "grad/layer_16/mlp": 0.004572049714624882, "grad/layer_16/attn_mlp_ratio": 0.970206841362746, "grad/layer_20/attn": 0.003932445775717497, "grad/layer_20/mlp": 0.006818362977355719, "grad/layer_20/attn_mlp_ratio": 0.5767433812343371, "grad/layer_24/attn": 0.013266481459140778, "grad/layer_24/mlp": 0.009743588976562023, "grad/layer_24/attn_mlp_ratio": 1.3615600324374302, "grad/layer_27/attn": 0.006604115478694439, "grad/layer_27/mlp": 0.008184270933270454, "grad/layer_27/attn_mlp_ratio": 0.8069277583608346} {"step": 42950, "timestamp": 1778240959.1215599, "train/loss": 2.1817332983016966, "train/z_loss": 0.0013998119626194238, "train/perplexity": 8.861652842597122, "train/grad_norm": 0.1845703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028134.8912657045, "perf/iters_per_sec": 0.9670900780037425, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340298414230347, "data/tokens_consumed": 90074775552, "data/tokens_consumed_B": 90.074775552, "train/loss_slope": 3.4175788251051293e-06} {"step": 42960, "timestamp": 1778240969.4681294, "train/loss": 2.209561657905579, "train/z_loss": 0.001397275587078184, "train/perplexity": 9.11172146646064, "train/grad_norm": 0.12109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028455.4566311424, "perf/iters_per_sec": 0.9672429354816162, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0338664293289184, "data/tokens_consumed": 90095747072, "data/tokens_consumed_B": 90.095747072, "train/loss_slope": 4.9977923884536655e-06} {"step": 42970, "timestamp": 1778240979.812237, "train/loss": 2.1126683592796325, "train/z_loss": 0.0014108952251262962, "train/perplexity": 8.270279946256247, "train/grad_norm": 0.2490234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029005.7612930876, "perf/iters_per_sec": 0.9675053411927641, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033586025238037, "data/tokens_consumed": 90116718592, "data/tokens_consumed_B": 90.116718592, "train/loss_slope": 5.632460528653463e-07} {"step": 42975, "timestamp": 1778240985.5607014, "eos/sharpness": 30.55839538574218, "eos/L0_probe": 1.9961460828781128, "eos/L_plus": 2.1482863426208496, "eos/L_minus": 2.149589776992798, "eos/grad_norm": 0.19090130925178528, "eos/embed_grad_frac": 0.08330365270376205, "eos/time_s": 0.5863440036773682} {"step": 42975, "timestamp": 1778240986.9372683, "geo/rankme_last": 439.7777099609375, "geo/layer_0/stable_rank_q_proj": 19.27334976196289, "geo/layer_0/stable_rank_k_proj": 16.425090789794922, "geo/layer_0/stable_rank_o_proj": 48.96414566040039, "geo/layer_0/stable_rank_gate_proj": 135.806884765625, "geo/layer_0/stable_rank_down_proj": 53.33019256591797, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.059041399508714676, "geo/layer_0/attn_entropy_mean": 6.189427375793457, "geo/layer_0/attn_entropy_std": 0.38513821363449097, "geo/layer_7/stable_rank_q_proj": 43.09230422973633, "geo/layer_7/stable_rank_k_proj": 42.39208221435547, "geo/layer_7/stable_rank_o_proj": 97.33615112304688, "geo/layer_7/stable_rank_gate_proj": 88.75328826904297, "geo/layer_7/stable_rank_down_proj": 146.2118682861328, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.48012322187423706, "geo/layer_7/attn_entropy_mean": 4.622489929199219, "geo/layer_7/attn_entropy_std": 0.8172120451927185, "geo/layer_14/stable_rank_q_proj": 54.38096237182617, "geo/layer_14/stable_rank_k_proj": 37.75492858886719, "geo/layer_14/stable_rank_o_proj": 47.699581146240234, "geo/layer_14/stable_rank_gate_proj": 76.14871978759766, "geo/layer_14/stable_rank_down_proj": 133.531494140625, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3876497745513916, "geo/layer_14/attn_entropy_mean": 5.529880523681641, "geo/layer_14/attn_entropy_std": 0.3497324585914612, "geo/layer_21/stable_rank_q_proj": 42.856449127197266, "geo/layer_21/stable_rank_k_proj": 30.848875045776367, "geo/layer_21/stable_rank_o_proj": 74.91960144042969, "geo/layer_21/stable_rank_gate_proj": 71.86864471435547, "geo/layer_21/stable_rank_down_proj": 54.8082160949707, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14468872547149658, "geo/layer_21/attn_entropy_mean": 5.712477684020996, "geo/layer_21/attn_entropy_std": 0.29305753111839294, "geo/layer_27/stable_rank_q_proj": 42.59943389892578, "geo/layer_27/stable_rank_k_proj": 31.707576751708984, "geo/layer_27/stable_rank_o_proj": 116.83509063720703, "geo/layer_27/stable_rank_gate_proj": 84.45773315429688, "geo/layer_27/stable_rank_down_proj": 131.9107208251953, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08855051547288895, "geo/layer_27/attn_entropy_mean": 4.283916473388672, "geo/layer_27/attn_entropy_std": 0.6571810245513916, "attnres/final_alpha/block_0": 0.241420716047287, "attnres/block_norm/0": 1.727873682975769, "attnres/final_alpha/block_1": 0.005037161987274885, "attnres/block_norm/1": 41285.45703125, "attnres/final_alpha/block_2": 0.011205613613128662, "attnres/block_norm/2": 26729.76953125, "attnres/final_alpha/block_3": 0.01332773081958294, "attnres/block_norm/3": 48452.84375, "attnres/final_alpha/block_4": 0.015826256945729256, "attnres/block_norm/4": 13237.634765625, "attnres/final_alpha/block_5": 0.5990258455276489, "attnres/block_norm/5": 6121.4892578125, "attnres/final_alpha/block_6": 0.11415668576955795, "attnres/block_norm/6": 32309.62890625, "geo/tier1_time_s": 1.3585476875305176, "geo/step": 42975.0, "geo/rankme_slope": -8.966436183848539e-05} {"step": 42980, "timestamp": 1778240992.115352, "train/loss": 2.162519669532776, "train/z_loss": 0.0013991802930831908, "train/perplexity": 8.693013608344485, "train/grad_norm": 0.26953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1705300.4489526248, "perf/iters_per_sec": 0.8131506199610828, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2297844648361207, "data/tokens_consumed": 90137690112, "data/tokens_consumed_B": 90.137690112, "train/loss_slope": 4.9536177391712525e-08} {"step": 42990, "timestamp": 1778241002.4704516, "train/loss": 2.174619162082672, "train/z_loss": 0.0013940007193014026, "train/perplexity": 8.798833554580261, "train/grad_norm": 0.09326171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026578.359120172, "perf/iters_per_sec": 0.9663478656388149, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348240375518798, "data/tokens_consumed": 90158661632, "data/tokens_consumed_B": 90.158661632, "train/loss_slope": -1.5142777285845317e-07} {"step": 43000, "timestamp": 1778241012.8065124, "grad/layer_0/attn": 0.002976363291963935, "grad/layer_0/mlp": 0.002969418652355671, "grad/layer_0/attn_mlp_ratio": 1.0023386865199644, "grad/layer_4/attn": 0.004562937188893557, "grad/layer_4/mlp": 0.0024931253865361214, "grad/layer_4/attn_mlp_ratio": 1.8302075902457575, "grad/layer_8/attn": 0.006310301832854748, "grad/layer_8/mlp": 0.003611101070418954, "grad/layer_8/attn_mlp_ratio": 1.7474730103235068, "grad/layer_12/attn": 0.005011794622987509, "grad/layer_12/mlp": 0.005949170328676701, "grad/layer_12/attn_mlp_ratio": 0.8424358796025119, "grad/layer_16/attn": 0.0035313779953867197, "grad/layer_16/mlp": 0.004506457597017288, "grad/layer_16/attn_mlp_ratio": 0.7836261278396225, "grad/layer_20/attn": 0.003691304475069046, "grad/layer_20/mlp": 0.005495208315551281, "grad/layer_20/attn_mlp_ratio": 0.6717314787593427, "grad/layer_24/attn": 0.010471349582076073, "grad/layer_24/mlp": 0.00799830723553896, "grad/layer_24/attn_mlp_ratio": 1.3091957013890452, "grad/layer_27/attn": 0.006290992721915245, "grad/layer_27/mlp": 0.006533605046570301, "grad/layer_27/attn_mlp_ratio": 0.9628669900900867} {"step": 43000, "timestamp": 1778241012.8207088, "train/loss": 2.1758786916732786, "train/z_loss": 0.0014081664965488016, "train/perplexity": 8.809922928036052, "train/grad_norm": 0.0947265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027462.0523802512, "perf/iters_per_sec": 0.9667692434216744, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343729972839355, "data/tokens_consumed": 90179633152, "data/tokens_consumed_B": 90.179633152, "train/loss_slope": -9.937231439818707e-07} {"step": 43000, "timestamp": 1778241020.0422122, "geo/ww_alpha_mean": 7.482191682551256, "geo/ww_alpha_std": 4.305976350731943, "geo/ww_alpha_min": 1.3632949135417578, "geo/ww_alpha_max": 34.50377554568645, "geo/ww_alpha_healthy_frac": 0.17766497461928935, "geo/ww_alpha_by_type/q_proj": 4.036048356047401, "geo/ww_alpha_by_type/k_proj": 4.50185773469843, "geo/ww_alpha_by_type/v_proj": 8.647326104115647, "geo/ww_alpha_by_type/o_proj": 7.883310397393131, "geo/ww_alpha_by_type/gate_proj": 8.396454394296114, "geo/ww_alpha_by_type/up_proj": 10.997154807452576, "geo/ww_alpha_by_type/down_proj": 8.01205128734618, "geo/twonn_id/layer_0": 0.7232057452201843, "geo/twonn_id/layer_7": 3.4925336837768555, "geo/twonn_id/layer_14": 4.407770156860352, "geo/twonn_id/layer_21": 7.032695770263672, "geo/twonn_id/layer_27": 5.057194709777832, "geo/tier2_time_s": 7.215448379516602} {"step": 43000, "timestamp": 1778241020.773389, "eoc/jacobian_sigma/layer_0/attn": 1121.58642578125, "eoc/jacobian_sigma/layer_0/mlp": 7618.4248046875, "eoc/jacobian_sigma/layer_0": 7618.4248046875, "eoc/jacobian_sigma/layer_7/attn": 1.153145432472229, "eoc/jacobian_sigma/layer_7/mlp": 1.7267314195632935, "eoc/jacobian_sigma/layer_7": 1.7267314195632935, "eoc/jacobian_sigma/layer_14/attn": 1.5822314023971558, "eoc/jacobian_sigma/layer_14/mlp": 5.864924907684326, "eoc/jacobian_sigma/layer_14": 5.864924907684326, "eoc/jacobian_sigma/layer_21/attn": 1.088356614112854, "eoc/jacobian_sigma/layer_21/mlp": 3.881096124649048, "eoc/jacobian_sigma/layer_21": 3.881096124649048, "eoc/jacobian_sigma/layer_27/attn": 4.137495994567871, "eoc/jacobian_sigma/layer_27/mlp": 23.68277359008789, "eoc/jacobian_sigma/layer_27": 23.68277359008789, "eoc/layer0_sigma": 7618.4248046875, "eoc/sigma_max": 23.68277359008789, "eoc/sigma_min": 1.7267314195632935, "eoc/sigma_mean": 8.78888151049614, "eoc/time_s": 0.7246453762054443} {"step": 43010, "timestamp": 1778241031.167875, "train/loss": 2.145992136001587, "train/z_loss": 0.001402432261966169, "train/perplexity": 8.550520309433498, "train/grad_norm": 0.212890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1143348.0641880461, "perf/iters_per_sec": 0.5451908417644721, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.8342200994491578, "data/tokens_consumed": 90200604672, "data/tokens_consumed_B": 90.200604672, "train/loss_slope": -3.162041391440607e-07} {"step": 43020, "timestamp": 1778241041.513381, "train/loss": 2.1546064138412477, "train/z_loss": 0.0014037370798178016, "train/perplexity": 8.624495029000922, "train/grad_norm": 0.09375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028665.7920789465, "perf/iters_per_sec": 0.9673432312388165, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0337592363357544, "data/tokens_consumed": 90221576192, "data/tokens_consumed_B": 90.221576192, "train/loss_slope": -7.17782430594402e-07} {"step": 43030, "timestamp": 1778241051.856547, "train/loss": 2.2107292652130126, "train/z_loss": 0.001389263547025621, "train/perplexity": 9.122366592482265, "train/grad_norm": 0.1513671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028545.6953008992, "perf/iters_per_sec": 0.9672859646324631, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0338204383850098, "data/tokens_consumed": 90242547712, "data/tokens_consumed_B": 90.242547712, "train/loss_slope": 1.3866132420890094e-06} {"step": 43040, "timestamp": 1778241062.2515945, "train/loss": 2.1869909048080443, "train/z_loss": 0.0013885546941310167, "train/perplexity": 8.908366619962464, "train/grad_norm": 0.1328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022190.948121649, "perf/iters_per_sec": 0.9642557850464101, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0370692253112792, "data/tokens_consumed": 90263519232, "data/tokens_consumed_B": 90.263519232, "train/loss_slope": 4.924379817151203e-06} {"step": 43050, "timestamp": 1778241072.5883, "grad/layer_0/attn": 0.0028623449616134167, "grad/layer_0/mlp": 0.0028609975706785917, "grad/layer_0/attn_mlp_ratio": 1.0004709164738694, "grad/layer_4/attn": 0.0018727572169154882, "grad/layer_4/mlp": 0.0024792607873678207, "grad/layer_4/attn_mlp_ratio": 0.7553691612114912, "grad/layer_8/attn": 0.0034503431525081396, "grad/layer_8/mlp": 0.0037588358391076326, "grad/layer_8/attn_mlp_ratio": 0.9179286375896654, "grad/layer_12/attn": 0.005980508867651224, "grad/layer_12/mlp": 0.006564639508724213, "grad/layer_12/attn_mlp_ratio": 0.9110186124617267, "grad/layer_16/attn": 0.003534687450155616, "grad/layer_16/mlp": 0.004345742054283619, "grad/layer_16/attn_mlp_ratio": 0.8133679644733312, "grad/layer_20/attn": 0.006283004302531481, "grad/layer_20/mlp": 0.0060937837697565556, "grad/layer_20/attn_mlp_ratio": 1.0310513856118242, "grad/layer_24/attn": 0.010524610057473183, "grad/layer_24/mlp": 0.011874143965542316, "grad/layer_24/attn_mlp_ratio": 0.8863468389283436, "grad/layer_27/attn": 0.011407473124563694, "grad/layer_27/mlp": 0.011562288738787174, "grad/layer_27/attn_mlp_ratio": 0.9866102882930816} {"step": 43050, "timestamp": 1778241073.193554, "eos/sharpness": 24.477910995483395, "eos/L0_probe": 1.9970698356628418, "eos/L_plus": 2.1341028213500977, "eos/L_minus": 2.10481595993042, "eos/grad_norm": 0.13224954903125763, "eos/embed_grad_frac": 0.1590556800365448, "eos/time_s": 0.6024281978607178} {"step": 43050, "timestamp": 1778241073.211083, "train/loss": 2.1634893894195555, "train/z_loss": 0.001403650688007474, "train/perplexity": 8.701447485104644, "train/grad_norm": 0.1318359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1914802.7626436816, "perf/iters_per_sec": 0.913049107858506, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0952313423156739, "data/tokens_consumed": 90284490752, "data/tokens_consumed_B": 90.284490752, "train/loss_slope": 5.1298288979975495e-06} {"step": 43050, "timestamp": 1778241074.5719445, "geo/rankme_last": 438.5819396972656, "geo/layer_0/stable_rank_q_proj": 19.289928436279297, "geo/layer_0/stable_rank_k_proj": 16.47305679321289, "geo/layer_0/stable_rank_o_proj": 48.9419059753418, "geo/layer_0/stable_rank_gate_proj": 135.7485809326172, "geo/layer_0/stable_rank_down_proj": 53.332786560058594, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0607050396502018, "geo/layer_0/attn_entropy_mean": 6.1893110275268555, "geo/layer_0/attn_entropy_std": 0.3770040273666382, "geo/layer_7/stable_rank_q_proj": 43.1027717590332, "geo/layer_7/stable_rank_k_proj": 42.427101135253906, "geo/layer_7/stable_rank_o_proj": 97.39891815185547, "geo/layer_7/stable_rank_gate_proj": 88.82389831542969, "geo/layer_7/stable_rank_down_proj": 145.98492431640625, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5059938430786133, "geo/layer_7/attn_entropy_mean": 4.660406112670898, "geo/layer_7/attn_entropy_std": 0.8021185994148254, "geo/layer_14/stable_rank_q_proj": 54.458255767822266, "geo/layer_14/stable_rank_k_proj": 37.81429672241211, "geo/layer_14/stable_rank_o_proj": 47.64051818847656, "geo/layer_14/stable_rank_gate_proj": 76.09175872802734, "geo/layer_14/stable_rank_down_proj": 133.56854248046875, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38193508982658386, "geo/layer_14/attn_entropy_mean": 5.529329776763916, "geo/layer_14/attn_entropy_std": 0.3677513003349304, "geo/layer_21/stable_rank_q_proj": 43.021751403808594, "geo/layer_21/stable_rank_k_proj": 30.862716674804688, "geo/layer_21/stable_rank_o_proj": 74.99856567382812, "geo/layer_21/stable_rank_gate_proj": 71.82533264160156, "geo/layer_21/stable_rank_down_proj": 54.888206481933594, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14635953307151794, "geo/layer_21/attn_entropy_mean": 5.718042373657227, "geo/layer_21/attn_entropy_std": 0.2845251262187958, "geo/layer_27/stable_rank_q_proj": 42.6551399230957, "geo/layer_27/stable_rank_k_proj": 31.71109962463379, "geo/layer_27/stable_rank_o_proj": 116.92147064208984, "geo/layer_27/stable_rank_gate_proj": 84.3847427368164, "geo/layer_27/stable_rank_down_proj": 131.75088500976562, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.089270681142807, "geo/layer_27/attn_entropy_mean": 4.272798538208008, "geo/layer_27/attn_entropy_std": 0.6726710200309753, "attnres/final_alpha/block_0": 0.24110257625579834, "attnres/block_norm/0": 1.7282021045684814, "attnres/final_alpha/block_1": 0.00508450111374259, "attnres/block_norm/1": 41350.109375, "attnres/final_alpha/block_2": 0.011087171733379364, "attnres/block_norm/2": 26742.96875, "attnres/final_alpha/block_3": 0.013126390054821968, "attnres/block_norm/3": 48781.4921875, "attnres/final_alpha/block_4": 0.015641607344150543, "attnres/block_norm/4": 13221.69140625, "attnres/final_alpha/block_5": 0.5995611548423767, "attnres/block_norm/5": 6174.38427734375, "attnres/final_alpha/block_6": 0.1143965795636177, "attnres/block_norm/6": 32658.7890625, "geo/tier1_time_s": 1.3568487167358398, "geo/step": 43050.0, "geo/rankme_slope": -0.00010005830457182873} {"step": 43060, "timestamp": 1778241084.927435, "train/loss": 2.177387237548828, "train/z_loss": 0.0013834941550157965, "train/perplexity": 8.82322313039347, "train/grad_norm": 0.09326171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1790504.8476567422, "perf/iters_per_sec": 0.8537792433055602, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1712629556655885, "data/tokens_consumed": 90305462272, "data/tokens_consumed_B": 90.305462272, "train/loss_slope": 5.268545591398451e-06} {"step": 43070, "timestamp": 1778241095.2749233, "train/loss": 2.173894691467285, "train/z_loss": 0.0014037340646609665, "train/perplexity": 8.79246136673042, "train/grad_norm": 0.1689453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028194.1419215456, "perf/iters_per_sec": 0.9671183309180954, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0339996337890625, "data/tokens_consumed": 90326433792, "data/tokens_consumed_B": 90.326433792, "train/loss_slope": 3.92374274897353e-06} {"step": 43080, "timestamp": 1778241105.6293135, "train/loss": 2.1588061094284057, "train/z_loss": 0.0013883523177355527, "train/perplexity": 8.66079144631952, "train/grad_norm": 0.09375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027138.2036652695, "perf/iters_per_sec": 0.9666148203207348, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034538245201111, "data/tokens_consumed": 90347405312, "data/tokens_consumed_B": 90.347405312, "train/loss_slope": 5.588354922757272e-06} {"step": 43090, "timestamp": 1778241116.3960605, "train/loss": 2.160392105579376, "train/z_loss": 0.001406626810785383, "train/perplexity": 8.674538326585779, "train/grad_norm": 0.1484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1948529.9403562534, "perf/iters_per_sec": 0.9291314794331805, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0762739419937133, "data/tokens_consumed": 90368376832, "data/tokens_consumed_B": 90.368376832, "train/loss_slope": 5.877470548110702e-06} {"step": 43100, "timestamp": 1778241127.275784, "grad/layer_0/attn": 0.0025600441731512547, "grad/layer_0/mlp": 0.002766559598967433, "grad/layer_0/attn_mlp_ratio": 0.9253529479615942, "grad/layer_4/attn": 0.0028735939413309097, "grad/layer_4/mlp": 0.0026385323144495487, "grad/layer_4/attn_mlp_ratio": 1.0890879814832215, "grad/layer_8/attn": 0.0038700136356055737, "grad/layer_8/mlp": 0.003725718706846237, "grad/layer_8/attn_mlp_ratio": 1.038729393236598, "grad/layer_12/attn": 0.005596595350652933, "grad/layer_12/mlp": 0.006761299446225166, "grad/layer_12/attn_mlp_ratio": 0.8277395953825937, "grad/layer_16/attn": 0.005569185130298138, "grad/layer_16/mlp": 0.004848171956837177, "grad/layer_16/attn_mlp_ratio": 1.1487185407217853, "grad/layer_20/attn": 0.0038562393747270107, "grad/layer_20/mlp": 0.007006426341831684, "grad/layer_20/attn_mlp_ratio": 0.5503860501129985, "grad/layer_24/attn": 0.02086169458925724, "grad/layer_24/mlp": 0.013366120867431164, "grad/layer_24/attn_mlp_ratio": 1.560789000794645, "grad/layer_27/attn": 0.01105661503970623, "grad/layer_27/mlp": 0.014169267378747463, "grad/layer_27/attn_mlp_ratio": 0.7803236869012522} {"step": 43100, "timestamp": 1778241127.2901742, "train/loss": 2.2350804805755615, "train/z_loss": 0.0013957386021502317, "train/perplexity": 9.347234092705541, "train/grad_norm": 0.2578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1926709.4617274033, "perf/iters_per_sec": 0.9187266644131676, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.088463020324707, "data/tokens_consumed": 90389348352, "data/tokens_consumed_B": 90.389348352, "train/loss_slope": 1.2250909439050266e-05} {"step": 43110, "timestamp": 1778241137.6350026, "train/loss": 2.1612316608428954, "train/z_loss": 0.0013994354754686355, "train/perplexity": 8.681824138889546, "train/grad_norm": 0.16796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028448.2996308382, "perf/iters_per_sec": 0.9672395227579299, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0338700771331788, "data/tokens_consumed": 90410319872, "data/tokens_consumed_B": 90.410319872, "train/loss_slope": 1.1247632398356393e-05} {"step": 43120, "timestamp": 1778241147.9881675, "train/loss": 2.2220102548599243, "train/z_loss": 0.0013931382680311799, "train/perplexity": 9.22585856396048, "train/grad_norm": 0.275390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026710.1308463034, "perf/iters_per_sec": 0.966410699294235, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347567558288575, "data/tokens_consumed": 90431291392, "data/tokens_consumed_B": 90.431291392, "train/loss_slope": 1.4365845330298706e-05} {"step": 43125, "timestamp": 1778241153.743541, "eos/sharpness": 43.19264888763427, "eos/L0_probe": 1.9935163259506226, "eos/L_plus": 2.2491612434387207, "eos/L_minus": 2.169797897338867, "eos/grad_norm": 0.1242583766579628, "eos/embed_grad_frac": 0.16099298000335693, "eos/time_s": 0.587702751159668} {"step": 43125, "timestamp": 1778241155.1191833, "geo/rankme_last": 438.9589538574219, "geo/layer_0/stable_rank_q_proj": 19.288057327270508, "geo/layer_0/stable_rank_k_proj": 16.478559494018555, "geo/layer_0/stable_rank_o_proj": 48.92625045776367, "geo/layer_0/stable_rank_gate_proj": 135.9171600341797, "geo/layer_0/stable_rank_down_proj": 53.42703628540039, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06116294488310814, "geo/layer_0/attn_entropy_mean": 6.188174247741699, "geo/layer_0/attn_entropy_std": 0.38019219040870667, "geo/layer_7/stable_rank_q_proj": 43.14241027832031, "geo/layer_7/stable_rank_k_proj": 42.48057174682617, "geo/layer_7/stable_rank_o_proj": 97.09095001220703, "geo/layer_7/stable_rank_gate_proj": 88.87284088134766, "geo/layer_7/stable_rank_down_proj": 146.24118041992188, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.486074835062027, "geo/layer_7/attn_entropy_mean": 4.651362419128418, "geo/layer_7/attn_entropy_std": 0.805465042591095, "geo/layer_14/stable_rank_q_proj": 54.37624740600586, "geo/layer_14/stable_rank_k_proj": 37.93681716918945, "geo/layer_14/stable_rank_o_proj": 47.55868148803711, "geo/layer_14/stable_rank_gate_proj": 76.15480041503906, "geo/layer_14/stable_rank_down_proj": 133.48158264160156, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3956604301929474, "geo/layer_14/attn_entropy_mean": 5.53398323059082, "geo/layer_14/attn_entropy_std": 0.37446075677871704, "geo/layer_21/stable_rank_q_proj": 43.083152770996094, "geo/layer_21/stable_rank_k_proj": 30.926414489746094, "geo/layer_21/stable_rank_o_proj": 75.13656616210938, "geo/layer_21/stable_rank_gate_proj": 71.88346099853516, "geo/layer_21/stable_rank_down_proj": 54.82103729248047, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14234532415866852, "geo/layer_21/attn_entropy_mean": 5.739806175231934, "geo/layer_21/attn_entropy_std": 0.2922024726867676, "geo/layer_27/stable_rank_q_proj": 42.628143310546875, "geo/layer_27/stable_rank_k_proj": 31.69027328491211, "geo/layer_27/stable_rank_o_proj": 116.9162826538086, "geo/layer_27/stable_rank_gate_proj": 84.37030029296875, "geo/layer_27/stable_rank_down_proj": 131.7902069091797, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08352792263031006, "geo/layer_27/attn_entropy_mean": 4.2593560218811035, "geo/layer_27/attn_entropy_std": 0.6566386818885803, "attnres/final_alpha/block_0": 0.24112892150878906, "attnres/block_norm/0": 1.7283196449279785, "attnres/final_alpha/block_1": 0.005122661124914885, "attnres/block_norm/1": 41263.5546875, "attnres/final_alpha/block_2": 0.010965690016746521, "attnres/block_norm/2": 26649.65234375, "attnres/final_alpha/block_3": 0.013004406355321407, "attnres/block_norm/3": 49070.89453125, "attnres/final_alpha/block_4": 0.01586848869919777, "attnres/block_norm/4": 13264.4140625, "attnres/final_alpha/block_5": 0.599651575088501, "attnres/block_norm/5": 6077.150390625, "attnres/final_alpha/block_6": 0.11425827443599701, "attnres/block_norm/6": 32459.734375, "geo/tier1_time_s": 1.3575501441955566, "geo/step": 43125.0, "geo/rankme_slope": -0.00012528169470913365} {"step": 43130, "timestamp": 1778241160.2923744, "train/loss": 2.147088074684143, "train/z_loss": 0.0014232089160941541, "train/perplexity": 8.559896292209249, "train/grad_norm": 0.10205078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1705591.2012218388, "perf/iters_per_sec": 0.813289261446876, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2295748233795165, "data/tokens_consumed": 90452262912, "data/tokens_consumed_B": 90.452262912, "train/loss_slope": 1.6093728966516934e-05} {"step": 43140, "timestamp": 1778241170.6386638, "train/loss": 2.1767322778701783, "train/z_loss": 0.0013979070237837732, "train/perplexity": 8.817446167052871, "train/grad_norm": 0.09619140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027685.3631314177, "perf/iters_per_sec": 0.9668757262856568, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342590808868408, "data/tokens_consumed": 90473234432, "data/tokens_consumed_B": 90.473234432, "train/loss_slope": 1.5279096416836192e-05} {"step": 43150, "timestamp": 1778241181.4953344, "grad/layer_0/attn": 0.0029964055866003036, "grad/layer_0/mlp": 0.0029378097970038652, "grad/layer_0/attn_mlp_ratio": 1.0199453646256, "grad/layer_4/attn": 0.0020268969237804413, "grad/layer_4/mlp": 0.0025372453965246677, "grad/layer_4/attn_mlp_ratio": 0.7988572357529989, "grad/layer_8/attn": 0.003473205491900444, "grad/layer_8/mlp": 0.0037614896427839994, "grad/layer_8/attn_mlp_ratio": 0.9233590224626833, "grad/layer_12/attn": 0.0077802762389183044, "grad/layer_12/mlp": 0.006625061389058828, "grad/layer_12/attn_mlp_ratio": 1.1743704193187177, "grad/layer_16/attn": 0.004267563112080097, "grad/layer_16/mlp": 0.004977197851985693, "grad/layer_16/attn_mlp_ratio": 0.8574228216857477, "grad/layer_20/attn": 0.0058633117005229, "grad/layer_20/mlp": 0.006597793661057949, "grad/layer_20/attn_mlp_ratio": 0.8886776266226795, "grad/layer_24/attn": 0.016616133973002434, "grad/layer_24/mlp": 0.01231310609728098, "grad/layer_24/attn_mlp_ratio": 1.3494672836226869, "grad/layer_27/attn": 0.0044013033621013165, "grad/layer_27/mlp": 0.011961501091718674, "grad/layer_27/attn_mlp_ratio": 0.3679557683903822} {"step": 43150, "timestamp": 1778241181.5099926, "train/loss": 2.191462445259094, "train/z_loss": 0.0014004273456521333, "train/perplexity": 8.948289934493225, "train/grad_norm": 0.2138671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1930008.5335947608, "perf/iters_per_sec": 0.9202997844671063, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0866024494171143, "data/tokens_consumed": 90494205952, "data/tokens_consumed_B": 90.494205952, "train/loss_slope": 1.403900949653832e-05} {"step": 43160, "timestamp": 1778241192.2610085, "train/loss": 2.191369915008545, "train/z_loss": 0.0014070646488107742, "train/perplexity": 8.947461985289364, "train/grad_norm": 0.158203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1951880.1875578142, "perf/iters_per_sec": 0.9307290017880507, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0744266033172607, "data/tokens_consumed": 90515177472, "data/tokens_consumed_B": 90.515177472, "train/loss_slope": 1.6205744524457007e-05} {"step": 43170, "timestamp": 1778241202.610721, "train/loss": 2.159638786315918, "train/z_loss": 0.0014072691090404987, "train/perplexity": 8.668006090501308, "train/grad_norm": 0.2421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027351.0696268107, "perf/iters_per_sec": 0.9667163227209142, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344296216964721, "data/tokens_consumed": 90536148992, "data/tokens_consumed_B": 90.536148992, "train/loss_slope": 1.604056006396382e-05} {"step": 43180, "timestamp": 1778241212.9603417, "train/loss": 2.1704389333724974, "train/z_loss": 0.001406845694873482, "train/perplexity": 8.762129187712056, "train/grad_norm": 0.10107421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027265.5160592257, "perf/iters_per_sec": 0.9666755276008728, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344732761383058, "data/tokens_consumed": 90557120512, "data/tokens_consumed_B": 90.557120512, "train/loss_slope": 1.713395948493009e-05} {"step": 43190, "timestamp": 1778241223.3218467, "train/loss": 2.1593580842018127, "train/z_loss": 0.0014221802121028304, "train/perplexity": 8.665573304326713, "train/grad_norm": 0.21875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025713.0233403384, "perf/iters_per_sec": 0.9659352413846676, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352660894393921, "data/tokens_consumed": 90578092032, "data/tokens_consumed_B": 90.578092032, "train/loss_slope": 1.903549829403485e-05} {"step": 43200, "timestamp": 1778241233.6671653, "grad/layer_0/attn": 0.0025922032073140144, "grad/layer_0/mlp": 0.0026484252884984016, "grad/layer_0/attn_mlp_ratio": 0.9787714687268332, "grad/layer_4/attn": 0.0018834752263501287, "grad/layer_4/mlp": 0.002471472602337599, "grad/layer_4/attn_mlp_ratio": 0.7620861944251607, "grad/layer_8/attn": 0.004223157186061144, "grad/layer_8/mlp": 0.003703618422150612, "grad/layer_8/attn_mlp_ratio": 1.1402786655275843, "grad/layer_12/attn": 0.00477632787078619, "grad/layer_12/mlp": 0.006850900128483772, "grad/layer_12/attn_mlp_ratio": 0.6971825178431007, "grad/layer_16/attn": 0.003746934235095978, "grad/layer_16/mlp": 0.0044655343517661095, "grad/layer_16/attn_mlp_ratio": 0.8390785639586934, "grad/layer_20/attn": 0.0033635920844972134, "grad/layer_20/mlp": 0.006098919082432985, "grad/layer_20/attn_mlp_ratio": 0.5515062560896908, "grad/layer_24/attn": 0.008429300971329212, "grad/layer_24/mlp": 0.009406817145645618, "grad/layer_24/attn_mlp_ratio": 0.8960842707166555, "grad/layer_27/attn": 0.009122198447585106, "grad/layer_27/mlp": 0.007937353104352951, "grad/layer_27/attn_mlp_ratio": 1.1492746023425504} {"step": 43200, "timestamp": 1778241234.2494283, "eos/sharpness": 21.911358833312985, "eos/L0_probe": 1.9929617643356323, "eos/L_plus": 2.0916645526885986, "eos/L_minus": 2.113372564315796, "eos/grad_norm": 0.12079747021198273, "eos/embed_grad_frac": 0.16735456883907318, "eos/time_s": 0.5795207023620605} {"step": 43200, "timestamp": 1778241234.2676187, "train/loss": 2.198825788497925, "train/z_loss": 0.00138449544319883, "train/perplexity": 9.01442244406311, "train/grad_norm": 0.12060546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1916800.8744773252, "perf/iters_per_sec": 0.9140018818270327, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.094089651107788, "data/tokens_consumed": 90599063552, "data/tokens_consumed_B": 90.599063552, "train/loss_slope": 2.2017985301585712e-05} {"step": 43200, "timestamp": 1778241235.6331275, "geo/rankme_last": 438.8802490234375, "geo/layer_0/stable_rank_q_proj": 19.284805297851562, "geo/layer_0/stable_rank_k_proj": 16.446685791015625, "geo/layer_0/stable_rank_o_proj": 48.86760711669922, "geo/layer_0/stable_rank_gate_proj": 136.0630645751953, "geo/layer_0/stable_rank_down_proj": 53.43672561645508, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05992283299565315, "geo/layer_0/attn_entropy_mean": 6.1891326904296875, "geo/layer_0/attn_entropy_std": 0.37947723269462585, "geo/layer_7/stable_rank_q_proj": 43.196048736572266, "geo/layer_7/stable_rank_k_proj": 42.52121353149414, "geo/layer_7/stable_rank_o_proj": 96.83231353759766, "geo/layer_7/stable_rank_gate_proj": 88.98390197753906, "geo/layer_7/stable_rank_down_proj": 146.33499145507812, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4929925203323364, "geo/layer_7/attn_entropy_mean": 4.649165153503418, "geo/layer_7/attn_entropy_std": 0.8228626847267151, "geo/layer_14/stable_rank_q_proj": 54.32058334350586, "geo/layer_14/stable_rank_k_proj": 37.886234283447266, "geo/layer_14/stable_rank_o_proj": 47.60989761352539, "geo/layer_14/stable_rank_gate_proj": 76.09056854248047, "geo/layer_14/stable_rank_down_proj": 133.34144592285156, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39388853311538696, "geo/layer_14/attn_entropy_mean": 5.494734764099121, "geo/layer_14/attn_entropy_std": 0.36308372020721436, "geo/layer_21/stable_rank_q_proj": 43.03581619262695, "geo/layer_21/stable_rank_k_proj": 30.951627731323242, "geo/layer_21/stable_rank_o_proj": 75.21951293945312, "geo/layer_21/stable_rank_gate_proj": 71.73932647705078, "geo/layer_21/stable_rank_down_proj": 54.792232513427734, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15016718208789825, "geo/layer_21/attn_entropy_mean": 5.710937023162842, "geo/layer_21/attn_entropy_std": 0.2850891053676605, "geo/layer_27/stable_rank_q_proj": 42.57634735107422, "geo/layer_27/stable_rank_k_proj": 31.56736183166504, "geo/layer_27/stable_rank_o_proj": 116.91133880615234, "geo/layer_27/stable_rank_gate_proj": 84.31510925292969, "geo/layer_27/stable_rank_down_proj": 131.7776641845703, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0938970074057579, "geo/layer_27/attn_entropy_mean": 4.262505531311035, "geo/layer_27/attn_entropy_std": 0.6796552538871765, "attnres/final_alpha/block_0": 0.23981115221977234, "attnres/block_norm/0": 1.7284290790557861, "attnres/final_alpha/block_1": 0.005094112362712622, "attnres/block_norm/1": 41284.50390625, "attnres/final_alpha/block_2": 0.010766888037323952, "attnres/block_norm/2": 26827.90625, "attnres/final_alpha/block_3": 0.012822997756302357, "attnres/block_norm/3": 48890.80859375, "attnres/final_alpha/block_4": 0.015400908887386322, "attnres/block_norm/4": 13268.841796875, "attnres/final_alpha/block_5": 0.6003921031951904, "attnres/block_norm/5": 6093.84619140625, "attnres/final_alpha/block_6": 0.11571186780929565, "attnres/block_norm/6": 32419.2578125, "geo/tier1_time_s": 1.3612329959869385, "geo/step": 43200.0, "geo/rankme_slope": -0.00013782446572378952} {"step": 43210, "timestamp": 1778241246.6180236, "train/loss": 2.1643412828445436, "train/z_loss": 0.0013941642479039728, "train/perplexity": 8.70886334931958, "train/grad_norm": 0.2734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1698621.1444248168, "perf/iters_per_sec": 0.8099656793712696, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.234620213508606, "data/tokens_consumed": 90620035072, "data/tokens_consumed_B": 90.620035072, "train/loss_slope": 2.0613732632666475e-05} {"step": 43220, "timestamp": 1778241256.9990437, "train/loss": 2.2129652976989744, "train/z_loss": 0.0013981390395201743, "train/perplexity": 9.142787322731655, "train/grad_norm": 0.1328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021891.3214584256, "perf/iters_per_sec": 0.9641129119197968, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0372229099273682, "data/tokens_consumed": 90641006592, "data/tokens_consumed_B": 90.641006592, "train/loss_slope": 2.081496420830817e-05} {"step": 43230, "timestamp": 1778241267.3840013, "train/loss": 2.1666315317153932, "train/z_loss": 0.0014005484641529619, "train/perplexity": 8.728831671256907, "train/grad_norm": 0.2099609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020354.470660668, "perf/iters_per_sec": 0.9633800843528119, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.038011908531189, "data/tokens_consumed": 90661978112, "data/tokens_consumed_B": 90.661978112, "train/loss_slope": 1.8332299440786196e-05} {"step": 43240, "timestamp": 1778241277.7647436, "train/loss": 2.1773026466369627, "train/z_loss": 0.0013961325748823583, "train/perplexity": 8.822476797470214, "train/grad_norm": 0.12353515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022101.2274429246, "perf/iters_per_sec": 0.9642130028929351, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0371152400970458, "data/tokens_consumed": 90682949632, "data/tokens_consumed_B": 90.682949632, "train/loss_slope": 2.065924461489496e-05} {"step": 43250, "timestamp": 1778241288.1310477, "grad/layer_0/attn": 0.002657442819327116, "grad/layer_0/mlp": 0.0027661588974297047, "grad/layer_0/attn_mlp_ratio": 0.9606977841101662, "grad/layer_4/attn": 0.0019854605197906494, "grad/layer_4/mlp": 0.002487871563062072, "grad/layer_4/attn_mlp_ratio": 0.7980558439847114, "grad/layer_8/attn": 0.0032124188728630543, "grad/layer_8/mlp": 0.0037175307516008615, "grad/layer_8/attn_mlp_ratio": 0.8641270243876272, "grad/layer_12/attn": 0.008991417475044727, "grad/layer_12/mlp": 0.006590539589524269, "grad/layer_12/attn_mlp_ratio": 1.3642915297720881, "grad/layer_16/attn": 0.003583374200388789, "grad/layer_16/mlp": 0.004835947882384062, "grad/layer_16/attn_mlp_ratio": 0.7409869199259317, "grad/layer_20/attn": 0.004045411013066769, "grad/layer_20/mlp": 0.006237540859729052, "grad/layer_20/attn_mlp_ratio": 0.6485586289829341, "grad/layer_24/attn": 0.007985209114849567, "grad/layer_24/mlp": 0.008881387300789356, "grad/layer_24/attn_mlp_ratio": 0.8990947871657824, "grad/layer_27/attn": 0.006803106050938368, "grad/layer_27/mlp": 0.008522336371243, "grad/layer_27/attn_mlp_ratio": 0.7982677137771024} {"step": 43250, "timestamp": 1778241288.147934, "train/loss": 2.1207019090652466, "train/z_loss": 0.001418475154787302, "train/perplexity": 8.336987241371515, "train/grad_norm": 0.1435546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020712.966940167, "perf/iters_per_sec": 0.9635510286999545, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0378277540206908, "data/tokens_consumed": 90703921152, "data/tokens_consumed_B": 90.703921152, "train/loss_slope": 1.746516421575855e-05} {"step": 43260, "timestamp": 1778241298.5290084, "train/loss": 2.16458523273468, "train/z_loss": 0.0014046388794668018, "train/perplexity": 8.710988134736908, "train/grad_norm": 0.1962890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021168.5576796366, "perf/iters_per_sec": 0.9637682712934669, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0375938177108766, "data/tokens_consumed": 90724892672, "data/tokens_consumed_B": 90.724892672, "train/loss_slope": 1.5235057472288493e-05} {"step": 43270, "timestamp": 1778241308.8994195, "train/loss": 2.0917922019958497, "train/z_loss": 0.00140071006026119, "train/perplexity": 8.099417957484345, "train/grad_norm": 0.1533203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023918.9025026956, "perf/iters_per_sec": 0.965079737902973, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03618381023407, "data/tokens_consumed": 90745864192, "data/tokens_consumed_B": 90.745864192, "train/loss_slope": 1.0791187024567147e-05} {"step": 43275, "timestamp": 1778241314.6605582, "eos/sharpness": 27.345085144042965, "eos/L0_probe": 1.9945443868637085, "eos/L_plus": 2.13995361328125, "eos/L_minus": 2.1225860118865967, "eos/grad_norm": 0.13275636732578278, "eos/embed_grad_frac": 0.12398140877485275, "eos/time_s": 0.5983667373657227} {"step": 43275, "timestamp": 1778241316.040464, "geo/rankme_last": 438.4367980957031, "geo/layer_0/stable_rank_q_proj": 19.275768280029297, "geo/layer_0/stable_rank_k_proj": 16.43478012084961, "geo/layer_0/stable_rank_o_proj": 48.81729507446289, "geo/layer_0/stable_rank_gate_proj": 135.9564666748047, "geo/layer_0/stable_rank_down_proj": 53.47687911987305, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06112997233867645, "geo/layer_0/attn_entropy_mean": 6.191699981689453, "geo/layer_0/attn_entropy_std": 0.37579914927482605, "geo/layer_7/stable_rank_q_proj": 43.195945739746094, "geo/layer_7/stable_rank_k_proj": 42.48875045776367, "geo/layer_7/stable_rank_o_proj": 96.6531753540039, "geo/layer_7/stable_rank_gate_proj": 88.7585678100586, "geo/layer_7/stable_rank_down_proj": 146.35202026367188, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5034264326095581, "geo/layer_7/attn_entropy_mean": 4.603976726531982, "geo/layer_7/attn_entropy_std": 0.8254381418228149, "geo/layer_14/stable_rank_q_proj": 54.322967529296875, "geo/layer_14/stable_rank_k_proj": 37.83064270019531, "geo/layer_14/stable_rank_o_proj": 47.66129684448242, "geo/layer_14/stable_rank_gate_proj": 75.95984649658203, "geo/layer_14/stable_rank_down_proj": 133.23573303222656, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38467171788215637, "geo/layer_14/attn_entropy_mean": 5.492678165435791, "geo/layer_14/attn_entropy_std": 0.3695962727069855, "geo/layer_21/stable_rank_q_proj": 42.965370178222656, "geo/layer_21/stable_rank_k_proj": 30.849145889282227, "geo/layer_21/stable_rank_o_proj": 75.34086608886719, "geo/layer_21/stable_rank_gate_proj": 71.7787094116211, "geo/layer_21/stable_rank_down_proj": 54.789695739746094, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14378489553928375, "geo/layer_21/attn_entropy_mean": 5.715461254119873, "geo/layer_21/attn_entropy_std": 0.2927715480327606, "geo/layer_27/stable_rank_q_proj": 42.59096908569336, "geo/layer_27/stable_rank_k_proj": 31.55525779724121, "geo/layer_27/stable_rank_o_proj": 116.75834655761719, "geo/layer_27/stable_rank_gate_proj": 84.16539001464844, "geo/layer_27/stable_rank_down_proj": 131.33705139160156, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09367522597312927, "geo/layer_27/attn_entropy_mean": 4.270657062530518, "geo/layer_27/attn_entropy_std": 0.6566334962844849, "attnres/final_alpha/block_0": 0.2401323914527893, "attnres/block_norm/0": 1.7285706996917725, "attnres/final_alpha/block_1": 0.005010304041206837, "attnres/block_norm/1": 41382.63671875, "attnres/final_alpha/block_2": 0.010632958263158798, "attnres/block_norm/2": 26743.04296875, "attnres/final_alpha/block_3": 0.012954452075064182, "attnres/block_norm/3": 48986.48046875, "attnres/final_alpha/block_4": 0.015843749046325684, "attnres/block_norm/4": 13230.765625, "attnres/final_alpha/block_5": 0.6015877723693848, "attnres/block_norm/5": 6161.283203125, "attnres/final_alpha/block_6": 0.11383840441703796, "attnres/block_norm/6": 32557.15625, "geo/tier1_time_s": 1.3612985610961914, "geo/step": 43275.0, "geo/rankme_slope": -0.00016072419202055823} {"step": 43280, "timestamp": 1778241321.2180803, "train/loss": 2.173894166946411, "train/z_loss": 0.0013912353897467256, "train/perplexity": 8.79245675490211, "train/grad_norm": 0.119140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1703237.117824093, "perf/iters_per_sec": 0.8121667470093217, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2312742471694946, "data/tokens_consumed": 90766835712, "data/tokens_consumed_B": 90.766835712, "train/loss_slope": 1.2488526486792478e-05} {"step": 43290, "timestamp": 1778241331.5923738, "train/loss": 2.193813681602478, "train/z_loss": 0.001392324804328382, "train/perplexity": 8.969354232866117, "train/grad_norm": 0.224609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022483.8276485836, "perf/iters_per_sec": 0.9643954408877294, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0369190454483033, "data/tokens_consumed": 90787807232, "data/tokens_consumed_B": 90.787807232, "train/loss_slope": 1.1545192826949453e-05} {"step": 43300, "timestamp": 1778241341.9602644, "grad/layer_0/attn": 0.0028130188584327698, "grad/layer_0/mlp": 0.003025782061740756, "grad/layer_0/attn_mlp_ratio": 0.9296832052226847, "grad/layer_4/attn": 0.0026955814100801945, "grad/layer_4/mlp": 0.0026027171406894922, "grad/layer_4/attn_mlp_ratio": 1.0356796996380988, "grad/layer_8/attn": 0.006627690512686968, "grad/layer_8/mlp": 0.0037172315642237663, "grad/layer_8/attn_mlp_ratio": 1.782964074172376, "grad/layer_12/attn": 0.004485864657908678, "grad/layer_12/mlp": 0.007131802849471569, "grad/layer_12/attn_mlp_ratio": 0.6289944758276108, "grad/layer_16/attn": 0.004465071950107813, "grad/layer_16/mlp": 0.004650742746889591, "grad/layer_16/attn_mlp_ratio": 0.9600771526411196, "grad/layer_20/attn": 0.007745461072772741, "grad/layer_20/mlp": 0.006042161490768194, "grad/layer_20/attn_mlp_ratio": 1.2819023384953847, "grad/layer_24/attn": 0.006224813871085644, "grad/layer_24/mlp": 0.008903863839805126, "grad/layer_24/attn_mlp_ratio": 0.699113768266082, "grad/layer_27/attn": 0.007983538322150707, "grad/layer_27/mlp": 0.007720875553786755, "grad/layer_27/attn_mlp_ratio": 1.034019802952678} {"step": 43300, "timestamp": 1778241341.9769065, "train/loss": 2.1247573852539063, "train/z_loss": 0.0014072658610530197, "train/perplexity": 8.370866346131578, "train/grad_norm": 0.1494140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020929.1751327985, "perf/iters_per_sec": 0.9636541248001091, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0377167224884034, "data/tokens_consumed": 90808778752, "data/tokens_consumed_B": 90.808778752, "train/loss_slope": 1.041210174131348e-05} {"step": 43310, "timestamp": 1778241352.3564317, "train/loss": 2.1661125659942626, "train/z_loss": 0.0013970874832011759, "train/perplexity": 8.72430288207882, "train/grad_norm": 0.1572265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021491.663103086, "perf/iters_per_sec": 0.9639223399653845, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0374279737472534, "data/tokens_consumed": 90829750272, "data/tokens_consumed_B": 90.829750272, "train/loss_slope": 8.292252825002475e-06} {"step": 43320, "timestamp": 1778241362.7360728, "train/loss": 2.2034250020980837, "train/z_loss": 0.0014016043394804, "train/perplexity": 9.055977184679735, "train/grad_norm": 0.103515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021464.6717641235, "perf/iters_per_sec": 0.9639094694920175, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0374418258666993, "data/tokens_consumed": 90850721792, "data/tokens_consumed_B": 90.850721792, "train/loss_slope": 1.3568371271464747e-05} {"step": 43330, "timestamp": 1778241373.110937, "train/loss": 2.1540768146514893, "train/z_loss": 0.001410322659648955, "train/perplexity": 8.619928712686924, "train/grad_norm": 0.2041015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022637.4849969186, "perf/iters_per_sec": 0.9644687104210465, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036840271949768, "data/tokens_consumed": 90871693312, "data/tokens_consumed_B": 90.871693312, "train/loss_slope": 1.0371814809902746e-05} {"step": 43340, "timestamp": 1778241383.4872313, "train/loss": 2.119005298614502, "train/z_loss": 0.0014095962047576903, "train/perplexity": 8.322854613861894, "train/grad_norm": 0.08349609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022033.2681661309, "perf/iters_per_sec": 0.9641805973845152, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0371500968933105, "data/tokens_consumed": 90892664832, "data/tokens_consumed_B": 90.892664832, "train/loss_slope": 6.574994445455113e-06} {"step": 43350, "timestamp": 1778241393.859441, "grad/layer_0/attn": 0.0029259754810482264, "grad/layer_0/mlp": 0.002855646191164851, "grad/layer_0/attn_mlp_ratio": 1.0246281166196838, "grad/layer_4/attn": 0.0017099800752475858, "grad/layer_4/mlp": 0.0026056861970573664, "grad/layer_4/attn_mlp_ratio": 0.6562494023853475, "grad/layer_8/attn": 0.005323735065758228, "grad/layer_8/mlp": 0.0037622982636094093, "grad/layer_8/attn_mlp_ratio": 1.4150220294200258, "grad/layer_12/attn": 0.004040981642901897, "grad/layer_12/mlp": 0.0065909442491829395, "grad/layer_12/attn_mlp_ratio": 0.6131111763070562, "grad/layer_16/attn": 0.0038969037123024464, "grad/layer_16/mlp": 0.004434566013514996, "grad/layer_16/attn_mlp_ratio": 0.8787564809161502, "grad/layer_20/attn": 0.0053970240987837315, "grad/layer_20/mlp": 0.005843514110893011, "grad/layer_20/attn_mlp_ratio": 0.9235921919592549, "grad/layer_24/attn": 0.01337786391377449, "grad/layer_24/mlp": 0.010176073759794235, "grad/layer_24/attn_mlp_ratio": 1.3146390344738512, "grad/layer_27/attn": 0.003722486784681678, "grad/layer_27/mlp": 0.010835262015461922, "grad/layer_27/attn_mlp_ratio": 0.34355299807373263} {"step": 43350, "timestamp": 1778241394.4630454, "eos/sharpness": 55.57487010955809, "eos/L0_probe": 1.9946650266647339, "eos/L_plus": 2.321354627609253, "eos/L_minus": 2.223724126815796, "eos/grad_norm": 0.164335235953331, "eos/embed_grad_frac": 0.09479803591966629, "eos/time_s": 0.6007311344146729} {"step": 43350, "timestamp": 1778241394.4841347, "train/loss": 2.196490025520325, "train/z_loss": 0.0014001558185555041, "train/perplexity": 8.993391461111214, "train/grad_norm": 0.1640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1907842.3054889557, "perf/iters_per_sec": 0.9097301032490519, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0992271184921265, "data/tokens_consumed": 90913636352, "data/tokens_consumed_B": 90.913636352, "train/loss_slope": 1.0469672939564897e-05} {"step": 43350, "timestamp": 1778241395.8560855, "geo/rankme_last": 439.4796142578125, "geo/layer_0/stable_rank_q_proj": 19.2895450592041, "geo/layer_0/stable_rank_k_proj": 16.44675064086914, "geo/layer_0/stable_rank_o_proj": 48.76852035522461, "geo/layer_0/stable_rank_gate_proj": 135.8873291015625, "geo/layer_0/stable_rank_down_proj": 53.51781463623047, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.060816776007413864, "geo/layer_0/attn_entropy_mean": 6.190349102020264, "geo/layer_0/attn_entropy_std": 0.3829224407672882, "geo/layer_7/stable_rank_q_proj": 43.12295150756836, "geo/layer_7/stable_rank_k_proj": 42.343448638916016, "geo/layer_7/stable_rank_o_proj": 96.61225891113281, "geo/layer_7/stable_rank_gate_proj": 88.66350555419922, "geo/layer_7/stable_rank_down_proj": 146.14744567871094, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4956457316875458, "geo/layer_7/attn_entropy_mean": 4.635848045349121, "geo/layer_7/attn_entropy_std": 0.8214124441146851, "geo/layer_14/stable_rank_q_proj": 54.282432556152344, "geo/layer_14/stable_rank_k_proj": 37.869667053222656, "geo/layer_14/stable_rank_o_proj": 47.58537673950195, "geo/layer_14/stable_rank_gate_proj": 75.9808120727539, "geo/layer_14/stable_rank_down_proj": 133.54269409179688, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37707796692848206, "geo/layer_14/attn_entropy_mean": 5.520313739776611, "geo/layer_14/attn_entropy_std": 0.3582022190093994, "geo/layer_21/stable_rank_q_proj": 43.06435775756836, "geo/layer_21/stable_rank_k_proj": 30.772735595703125, "geo/layer_21/stable_rank_o_proj": 75.31290435791016, "geo/layer_21/stable_rank_gate_proj": 71.70934295654297, "geo/layer_21/stable_rank_down_proj": 54.70568084716797, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14211557805538177, "geo/layer_21/attn_entropy_mean": 5.713785171508789, "geo/layer_21/attn_entropy_std": 0.2899278402328491, "geo/layer_27/stable_rank_q_proj": 42.59996032714844, "geo/layer_27/stable_rank_k_proj": 31.559457778930664, "geo/layer_27/stable_rank_o_proj": 117.01797485351562, "geo/layer_27/stable_rank_gate_proj": 84.12483978271484, "geo/layer_27/stable_rank_down_proj": 131.17735290527344, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08114752173423767, "geo/layer_27/attn_entropy_mean": 4.267898082733154, "geo/layer_27/attn_entropy_std": 0.6556421518325806, "attnres/final_alpha/block_0": 0.24078361690044403, "attnres/block_norm/0": 1.7287769317626953, "attnres/final_alpha/block_1": 0.005021360237151384, "attnres/block_norm/1": 41399.22265625, "attnres/final_alpha/block_2": 0.010794326663017273, "attnres/block_norm/2": 26781.9765625, "attnres/final_alpha/block_3": 0.012987431138753891, "attnres/block_norm/3": 48915.8046875, "attnres/final_alpha/block_4": 0.015527043491601944, "attnres/block_norm/4": 13286.2060546875, "attnres/final_alpha/block_5": 0.6006515026092529, "attnres/block_norm/5": 6115.4130859375, "attnres/final_alpha/block_6": 0.11423470079898834, "attnres/block_norm/6": 32479.8828125, "geo/tier1_time_s": 1.3676400184631348, "geo/step": 43350.0, "geo/rankme_slope": -0.00016331513073979592} {"step": 43360, "timestamp": 1778241406.208265, "train/loss": 2.155694031715393, "train/z_loss": 0.0014012919273227453, "train/perplexity": 8.633880286811852, "train/grad_norm": 0.10205078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1789373.8482823942, "perf/iters_per_sec": 0.8532399407779666, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1720032691955566, "data/tokens_consumed": 90934607872, "data/tokens_consumed_B": 90.934607872, "train/loss_slope": 8.877741013637615e-06} {"step": 43370, "timestamp": 1778241416.551826, "train/loss": 2.18947856426239, "train/z_loss": 0.0014050632016733289, "train/perplexity": 8.930555189767382, "train/grad_norm": 0.291015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028569.6012392316, "perf/iters_per_sec": 0.9672973638721617, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0338082551956176, "data/tokens_consumed": 90955579392, "data/tokens_consumed_B": 90.955579392, "train/loss_slope": 8.185446151483835e-06} {"step": 43380, "timestamp": 1778241426.9014432, "train/loss": 2.1225051879882812, "train/z_loss": 0.0013994241831824183, "train/perplexity": 8.35203471806664, "train/grad_norm": 0.15625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027251.078730699, "perf/iters_per_sec": 0.9666686433461661, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344806432723999, "data/tokens_consumed": 90976550912, "data/tokens_consumed_B": 90.976550912, "train/loss_slope": 6.2564825174724125e-06} {"step": 43390, "timestamp": 1778241437.25237, "train/loss": 2.1345449686050415, "train/z_loss": 0.001410747563932091, "train/perplexity": 8.453199160590362, "train/grad_norm": 0.16015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027399.6201657655, "perf/iters_per_sec": 0.9667394734219387, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344048500061036, "data/tokens_consumed": 90997522432, "data/tokens_consumed_B": 90.997522432, "train/loss_slope": 4.252529695089049e-06} {"step": 43400, "timestamp": 1778241447.5927243, "grad/layer_0/attn": 0.0026872106827795506, "grad/layer_0/mlp": 0.0027694343589246273, "grad/layer_0/attn_mlp_ratio": 0.970310264653454, "grad/layer_4/attn": 0.0019716406241059303, "grad/layer_4/mlp": 0.002362644299864769, "grad/layer_4/attn_mlp_ratio": 0.8345058715644137, "grad/layer_8/attn": 0.008594225160777569, "grad/layer_8/mlp": 0.003794959280639887, "grad/layer_8/attn_mlp_ratio": 2.2646422000249324, "grad/layer_12/attn": 0.0040412042289972305, "grad/layer_12/mlp": 0.006781843025237322, "grad/layer_12/attn_mlp_ratio": 0.5958858313839004, "grad/layer_16/attn": 0.003080377820879221, "grad/layer_16/mlp": 0.004426225088536739, "grad/layer_16/attn_mlp_ratio": 0.6959378905657441, "grad/layer_20/attn": 0.004175674170255661, "grad/layer_20/mlp": 0.00536630442366004, "grad/layer_20/attn_mlp_ratio": 0.7781284405022323, "grad/layer_24/attn": 0.004689816851168871, "grad/layer_24/mlp": 0.007389909587800503, "grad/layer_24/attn_mlp_ratio": 0.6346243796336198, "grad/layer_27/attn": 0.006871072109788656, "grad/layer_27/mlp": 0.006566047668457031, "grad/layer_27/attn_mlp_ratio": 1.0464547856013091} {"step": 43400, "timestamp": 1778241447.6075304, "train/loss": 2.1581034660339355, "train/z_loss": 0.0014119325089268387, "train/perplexity": 8.654708135868114, "train/grad_norm": 0.08984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026240.6497464923, "perf/iters_per_sec": 0.9661868332607709, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034996509552002, "data/tokens_consumed": 91018493952, "data/tokens_consumed_B": 91.018493952, "train/loss_slope": 1.1674882054435672e-06} {"step": 43410, "timestamp": 1778241458.0015128, "train/loss": 2.1904826164245605, "train/z_loss": 0.0014003996970131994, "train/perplexity": 8.939526436060998, "train/grad_norm": 0.193359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021044.331724036, "perf/iters_per_sec": 0.9637090357418232, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0376575946807862, "data/tokens_consumed": 91039465472, "data/tokens_consumed_B": 91.039465472, "train/loss_slope": 3.3302234880847595e-07} {"step": 43420, "timestamp": 1778241468.3831697, "train/loss": 2.1756276845932008, "train/z_loss": 0.0014042356866411864, "train/perplexity": 8.807711852515588, "train/grad_norm": 0.1513671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021877.9365496545, "perf/iters_per_sec": 0.9641065294979355, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0372297763824463, "data/tokens_consumed": 91060436992, "data/tokens_consumed_B": 91.060436992, "train/loss_slope": 1.1596612947465256e-06} {"step": 43425, "timestamp": 1778241474.173032, "eos/sharpness": 39.60275650024413, "eos/L0_probe": 1.9970260858535767, "eos/L_plus": 2.2214982509613037, "eos/L_minus": 2.168581485748291, "eos/grad_norm": 0.12858200073242188, "eos/embed_grad_frac": 0.13432250916957855, "eos/time_s": 0.6142549514770508} {"step": 43425, "timestamp": 1778241475.554867, "geo/rankme_last": 437.93328857421875, "geo/layer_0/stable_rank_q_proj": 19.315086364746094, "geo/layer_0/stable_rank_k_proj": 16.46470069885254, "geo/layer_0/stable_rank_o_proj": 48.82564926147461, "geo/layer_0/stable_rank_gate_proj": 135.5939483642578, "geo/layer_0/stable_rank_down_proj": 53.578582763671875, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06183136627078056, "geo/layer_0/attn_entropy_mean": 6.1953325271606445, "geo/layer_0/attn_entropy_std": 0.37810564041137695, "geo/layer_7/stable_rank_q_proj": 43.072486877441406, "geo/layer_7/stable_rank_k_proj": 42.3265380859375, "geo/layer_7/stable_rank_o_proj": 96.53472900390625, "geo/layer_7/stable_rank_gate_proj": 88.6202163696289, "geo/layer_7/stable_rank_down_proj": 146.09002685546875, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.48658236861228943, "geo/layer_7/attn_entropy_mean": 4.6507568359375, "geo/layer_7/attn_entropy_std": 0.8096530437469482, "geo/layer_14/stable_rank_q_proj": 54.33163833618164, "geo/layer_14/stable_rank_k_proj": 37.96711349487305, "geo/layer_14/stable_rank_o_proj": 47.586143493652344, "geo/layer_14/stable_rank_gate_proj": 75.86369323730469, "geo/layer_14/stable_rank_down_proj": 133.51565551757812, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.40384405851364136, "geo/layer_14/attn_entropy_mean": 5.540544509887695, "geo/layer_14/attn_entropy_std": 0.37169012427330017, "geo/layer_21/stable_rank_q_proj": 43.033287048339844, "geo/layer_21/stable_rank_k_proj": 30.8687686920166, "geo/layer_21/stable_rank_o_proj": 75.22784423828125, "geo/layer_21/stable_rank_gate_proj": 71.7126235961914, "geo/layer_21/stable_rank_down_proj": 54.69881057739258, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1387178599834442, "geo/layer_21/attn_entropy_mean": 5.735464096069336, "geo/layer_21/attn_entropy_std": 0.29201561212539673, "geo/layer_27/stable_rank_q_proj": 42.67960739135742, "geo/layer_27/stable_rank_k_proj": 31.591999053955078, "geo/layer_27/stable_rank_o_proj": 117.15479278564453, "geo/layer_27/stable_rank_gate_proj": 84.30638885498047, "geo/layer_27/stable_rank_down_proj": 131.11358642578125, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.1001090481877327, "geo/layer_27/attn_entropy_mean": 4.2873029708862305, "geo/layer_27/attn_entropy_std": 0.6537861227989197, "attnres/final_alpha/block_0": 0.2403637170791626, "attnres/block_norm/0": 1.728991985321045, "attnres/final_alpha/block_1": 0.004942495375871658, "attnres/block_norm/1": 41360.9453125, "attnres/final_alpha/block_2": 0.010945259593427181, "attnres/block_norm/2": 26697.94921875, "attnres/final_alpha/block_3": 0.012943083420395851, "attnres/block_norm/3": 49250.23828125, "attnres/final_alpha/block_4": 0.015570458024740219, "attnres/block_norm/4": 13271.7177734375, "attnres/final_alpha/block_5": 0.6023372411727905, "attnres/block_norm/5": 6072.978515625, "attnres/final_alpha/block_6": 0.11289777606725693, "attnres/block_norm/6": 32482.515625, "geo/tier1_time_s": 1.36210298538208, "geo/step": 43425.0, "geo/rankme_slope": -0.00021212238801770708} {"step": 43430, "timestamp": 1778241480.7445712, "train/loss": 2.1728153228759766, "train/z_loss": 0.0013938023475930095, "train/perplexity": 8.782976180018071, "train/grad_norm": 0.26953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1697190.4351804175, "perf/iters_per_sec": 0.8092834640409553, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.235660982131958, "data/tokens_consumed": 91081408512, "data/tokens_consumed_B": 91.081408512, "train/loss_slope": 3.2183823626998886e-06} {"step": 43440, "timestamp": 1778241491.134308, "train/loss": 2.2011780738830566, "train/z_loss": 0.001400727918371558, "train/perplexity": 9.035651897311004, "train/grad_norm": 0.150390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019826.0088949478, "perf/iters_per_sec": 0.9631280941462268, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0382834911346435, "data/tokens_consumed": 91102380032, "data/tokens_consumed_B": 91.102380032, "train/loss_slope": 1.8630864203172533e-06} {"step": 43450, "timestamp": 1778241501.5018852, "grad/layer_0/attn": 0.003780993400141597, "grad/layer_0/mlp": 0.00319819082506001, "grad/layer_0/attn_mlp_ratio": 1.1822287939456433, "grad/layer_4/attn": 0.0035463101230561733, "grad/layer_4/mlp": 0.0026095143985003233, "grad/layer_4/attn_mlp_ratio": 1.358992304926538, "grad/layer_8/attn": 0.0038495687767863274, "grad/layer_8/mlp": 0.0036864341236650944, "grad/layer_8/attn_mlp_ratio": 1.044252668899933, "grad/layer_12/attn": 0.00448946887627244, "grad/layer_12/mlp": 0.006991848349571228, "grad/layer_12/attn_mlp_ratio": 0.6421004271835662, "grad/layer_16/attn": 0.00736143346875906, "grad/layer_16/mlp": 0.0054582818411290646, "grad/layer_16/attn_mlp_ratio": 1.348672265038094, "grad/layer_20/attn": 0.0058224136009812355, "grad/layer_20/mlp": 0.007758705876767635, "grad/layer_20/attn_mlp_ratio": 0.750436170466524, "grad/layer_24/attn": 0.017189806327223778, "grad/layer_24/mlp": 0.012051623314619064, "grad/layer_24/attn_mlp_ratio": 1.4263477820234498, "grad/layer_27/attn": 0.011207561939954758, "grad/layer_27/mlp": 0.011236030608415604, "grad/layer_27/attn_mlp_ratio": 0.9974662966665334} {"step": 43450, "timestamp": 1778241501.5185857, "train/loss": 2.14593768119812, "train/z_loss": 0.0014034157735295593, "train/perplexity": 8.550054705207817, "train/grad_norm": 0.21484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021090.5837907584, "perf/iters_per_sec": 0.9637310904458801, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0376338481903076, "data/tokens_consumed": 91123351552, "data/tokens_consumed_B": 91.123351552, "train/loss_slope": 1.4042513455637498e-06} {"step": 43460, "timestamp": 1778241511.896576, "train/loss": 2.137588691711426, "train/z_loss": 0.0014121534768491983, "train/perplexity": 8.478967554232158, "train/grad_norm": 0.11572265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021748.8370488589, "perf/iters_per_sec": 0.9640449700588507, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0372960090637207, "data/tokens_consumed": 91144323072, "data/tokens_consumed_B": 91.144323072, "train/loss_slope": 2.6327377033776986e-06} {"step": 43470, "timestamp": 1778241522.2813487, "train/loss": 2.164726710319519, "train/z_loss": 0.0014037920511327684, "train/perplexity": 8.712220631483046, "train/grad_norm": 0.2255859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020835.4343561318, "perf/iters_per_sec": 0.9636094257145557, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0377648591995239, "data/tokens_consumed": 91165294592, "data/tokens_consumed_B": 91.165294592, "train/loss_slope": 3.789476921992562e-06} {"step": 43480, "timestamp": 1778241532.6604333, "train/loss": 2.1468663692474363, "train/z_loss": 0.001401102531235665, "train/perplexity": 8.557998727021653, "train/grad_norm": 0.171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021881.7010373343, "perf/iters_per_sec": 0.9641083245455428, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0372278451919557, "data/tokens_consumed": 91186266112, "data/tokens_consumed_B": 91.186266112, "train/loss_slope": 1.748014226032703e-06} {"step": 43490, "timestamp": 1778241543.0437665, "train/loss": 2.132293963432312, "train/z_loss": 0.0014048165408894419, "train/perplexity": 8.43419236577614, "train/grad_norm": 0.0869140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020733.4390419675, "perf/iters_per_sec": 0.9635607905587995, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0378172397613525, "data/tokens_consumed": 91207237632, "data/tokens_consumed_B": 91.207237632, "train/loss_slope": -3.3871934323063844e-06} {"step": 43500, "timestamp": 1778241553.4129996, "grad/layer_0/attn": 0.002765002427622676, "grad/layer_0/mlp": 0.0029025424737483263, "grad/layer_0/attn_mlp_ratio": 0.9526139091396575, "grad/layer_4/attn": 0.0022806685883551836, "grad/layer_4/mlp": 0.002460918389260769, "grad/layer_4/attn_mlp_ratio": 0.9267550300051871, "grad/layer_8/attn": 0.00617081206291914, "grad/layer_8/mlp": 0.0036416223738342524, "grad/layer_8/attn_mlp_ratio": 1.6945227319024951, "grad/layer_12/attn": 0.0038555630017071962, "grad/layer_12/mlp": 0.006949784234166145, "grad/layer_12/attn_mlp_ratio": 0.5547744816702715, "grad/layer_16/attn": 0.004256744869053364, "grad/layer_16/mlp": 0.004466507118195295, "grad/layer_16/attn_mlp_ratio": 0.953036603570813, "grad/layer_20/attn": 0.004223983734846115, "grad/layer_20/mlp": 0.006400651298463345, "grad/layer_20/attn_mlp_ratio": 0.6599302901983072, "grad/layer_24/attn": 0.017440833151340485, "grad/layer_24/mlp": 0.014067358337342739, "grad/layer_24/attn_mlp_ratio": 1.2398086839845235, "grad/layer_27/attn": 0.004785127472132444, "grad/layer_27/mlp": 0.013827409595251083, "grad/layer_27/attn_mlp_ratio": 0.34606101776067716} {"step": 43500, "timestamp": 1778241554.0246873, "eos/sharpness": 67.78750419616698, "eos/L0_probe": 1.9933022260665894, "eos/L_plus": 2.30818772315979, "eos/L_minus": 2.3562917709350586, "eos/grad_norm": 0.22174684703350067, "eos/embed_grad_frac": 0.05210663378238678, "eos/time_s": 0.6087782382965088} {"step": 43500, "timestamp": 1778241554.0466754, "train/loss": 2.1746079325675964, "train/z_loss": 0.0014122619410045444, "train/perplexity": 8.798734748500985, "train/grad_norm": 0.2216796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1907219.4849265232, "perf/iters_per_sec": 0.9094331192619911, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0995860815048217, "data/tokens_consumed": 91228209152, "data/tokens_consumed_B": 91.228209152, "train/loss_slope": -1.7523504183380882e-06} {"step": 43500, "timestamp": 1778241555.4087112, "geo/rankme_last": 439.2449645996094, "geo/layer_0/stable_rank_q_proj": 19.300640106201172, "geo/layer_0/stable_rank_k_proj": 16.45541763305664, "geo/layer_0/stable_rank_o_proj": 48.81858444213867, "geo/layer_0/stable_rank_gate_proj": 135.4833526611328, "geo/layer_0/stable_rank_down_proj": 53.475990295410156, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.065338134765625, "geo/layer_0/attn_entropy_mean": 6.19370174407959, "geo/layer_0/attn_entropy_std": 0.3798145353794098, "geo/layer_7/stable_rank_q_proj": 43.04728317260742, "geo/layer_7/stable_rank_k_proj": 42.29888153076172, "geo/layer_7/stable_rank_o_proj": 96.36000061035156, "geo/layer_7/stable_rank_gate_proj": 88.85049438476562, "geo/layer_7/stable_rank_down_proj": 146.35455322265625, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4848532974720001, "geo/layer_7/attn_entropy_mean": 4.639328956604004, "geo/layer_7/attn_entropy_std": 0.8263623714447021, "geo/layer_14/stable_rank_q_proj": 54.2775764465332, "geo/layer_14/stable_rank_k_proj": 37.998043060302734, "geo/layer_14/stable_rank_o_proj": 47.491249084472656, "geo/layer_14/stable_rank_gate_proj": 75.67424011230469, "geo/layer_14/stable_rank_down_proj": 133.69285583496094, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3956949710845947, "geo/layer_14/attn_entropy_mean": 5.517004013061523, "geo/layer_14/attn_entropy_std": 0.3700474500656128, "geo/layer_21/stable_rank_q_proj": 42.96873092651367, "geo/layer_21/stable_rank_k_proj": 30.891464233398438, "geo/layer_21/stable_rank_o_proj": 75.32418060302734, "geo/layer_21/stable_rank_gate_proj": 71.62562561035156, "geo/layer_21/stable_rank_down_proj": 54.60886764526367, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14412693679332733, "geo/layer_21/attn_entropy_mean": 5.70659065246582, "geo/layer_21/attn_entropy_std": 0.3055208623409271, "geo/layer_27/stable_rank_q_proj": 42.67515563964844, "geo/layer_27/stable_rank_k_proj": 31.56178092956543, "geo/layer_27/stable_rank_o_proj": 116.71748352050781, "geo/layer_27/stable_rank_gate_proj": 84.23833465576172, "geo/layer_27/stable_rank_down_proj": 130.98126220703125, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08379829674959183, "geo/layer_27/attn_entropy_mean": 4.266427516937256, "geo/layer_27/attn_entropy_std": 0.6666932702064514, "attnres/final_alpha/block_0": 0.24261754751205444, "attnres/block_norm/0": 1.7293145656585693, "attnres/final_alpha/block_1": 0.005036883521825075, "attnres/block_norm/1": 41482.859375, "attnres/final_alpha/block_2": 0.011056307703256607, "attnres/block_norm/2": 26761.28515625, "attnres/final_alpha/block_3": 0.013120976276695728, "attnres/block_norm/3": 49034.3984375, "attnres/final_alpha/block_4": 0.015995096415281296, "attnres/block_norm/4": 13338.6025390625, "attnres/final_alpha/block_5": 0.5954973101615906, "attnres/block_norm/5": 6180.4892578125, "attnres/final_alpha/block_6": 0.11667585372924805, "attnres/block_norm/6": 32349.3359375, "geo/tier1_time_s": 1.3580176830291748, "geo/step": 43500.0, "geo/rankme_slope": -0.00019463953550170069} {"step": 43500, "timestamp": 1778241562.341608, "geo/ww_alpha_mean": 7.712317939408803, "geo/ww_alpha_std": 5.145626557932803, "geo/ww_alpha_min": 1.349423344870446, "geo/ww_alpha_max": 47.074932301734364, "geo/ww_alpha_healthy_frac": 0.17766497461928935, "geo/ww_alpha_by_type/q_proj": 4.071569431830999, "geo/ww_alpha_by_type/k_proj": 4.505210044487975, "geo/ww_alpha_by_type/v_proj": 7.31752963122589, "geo/ww_alpha_by_type/o_proj": 8.744662390072898, "geo/ww_alpha_by_type/gate_proj": 8.213984872731135, "geo/ww_alpha_by_type/up_proj": 13.129533442902611, "geo/ww_alpha_by_type/down_proj": 8.110158378586444, "geo/twonn_id/layer_0": 0.762249767780304, "geo/twonn_id/layer_7": 3.1096765995025635, "geo/twonn_id/layer_14": 4.7930684089660645, "geo/twonn_id/layer_21": 7.153388977050781, "geo/twonn_id/layer_27": 5.607086658477783, "geo/tier2_time_s": 6.92508602142334} {"step": 43500, "timestamp": 1778241563.0441356, "eoc/jacobian_sigma/layer_0/attn": 1110.2750244140625, "eoc/jacobian_sigma/layer_0/mlp": 7924.39892578125, "eoc/jacobian_sigma/layer_0": 7924.39892578125, "eoc/jacobian_sigma/layer_7/attn": 1.155030369758606, "eoc/jacobian_sigma/layer_7/mlp": 1.7235374450683594, "eoc/jacobian_sigma/layer_7": 1.7235374450683594, "eoc/jacobian_sigma/layer_14/attn": 1.5919986963272095, "eoc/jacobian_sigma/layer_14/mlp": 6.465369701385498, "eoc/jacobian_sigma/layer_14": 6.465369701385498, "eoc/jacobian_sigma/layer_21/attn": 1.0850809812545776, "eoc/jacobian_sigma/layer_21/mlp": 4.129806995391846, "eoc/jacobian_sigma/layer_21": 4.129806995391846, "eoc/jacobian_sigma/layer_27/attn": 3.588329315185547, "eoc/jacobian_sigma/layer_27/mlp": 24.34344482421875, "eoc/jacobian_sigma/layer_27": 24.34344482421875, "eoc/layer0_sigma": 7924.39892578125, "eoc/sigma_max": 24.34344482421875, "eoc/sigma_min": 1.7235374450683594, "eoc/sigma_mean": 9.165539741516113, "eoc/time_s": 0.6956040859222412} {"step": 43510, "timestamp": 1778241573.4362288, "train/loss": 2.194241428375244, "train/z_loss": 0.0014018494868651033, "train/perplexity": 8.973191665859298, "train/grad_norm": 0.1669921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1081800.1655825474, "perf/iters_per_sec": 0.515842516700052, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.9385761499404908, "data/tokens_consumed": 91249180672, "data/tokens_consumed_B": 91.249180672, "train/loss_slope": 4.607735544284882e-08} {"step": 43520, "timestamp": 1778241583.8147073, "train/loss": 2.21975634098053, "train/z_loss": 0.0014000813360325992, "train/perplexity": 9.205087689967653, "train/grad_norm": 0.1865234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021630.1618440787, "perf/iters_per_sec": 0.9639883813114541, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373569011688233, "data/tokens_consumed": 91270152192, "data/tokens_consumed_B": 91.270152192, "train/loss_slope": 3.0381183312384426e-06} {"step": 43530, "timestamp": 1778241594.1928325, "train/loss": 2.196554708480835, "train/z_loss": 0.0014058862696401774, "train/perplexity": 8.99397319911001, "train/grad_norm": 0.1494140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022086.491698878, "perf/iters_per_sec": 0.9642059763426198, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0371227979660034, "data/tokens_consumed": 91291123712, "data/tokens_consumed_B": 91.291123712, "train/loss_slope": 2.946731595709726e-06} {"step": 43540, "timestamp": 1778241604.5699399, "train/loss": 2.1251197814941407, "train/z_loss": 0.0013958508265204729, "train/perplexity": 8.373900466366596, "train/grad_norm": 0.12890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021954.809238075, "perf/iters_per_sec": 0.9641431852522254, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0371903419494628, "data/tokens_consumed": 91312095232, "data/tokens_consumed_B": 91.312095232, "train/loss_slope": -1.881879627114518e-06} {"step": 43550, "timestamp": 1778241614.9346201, "grad/layer_0/attn": 0.003136538900434971, "grad/layer_0/mlp": 0.0028879756573587656, "grad/layer_0/attn_mlp_ratio": 1.0860682927974201, "grad/layer_4/attn": 0.001890584477223456, "grad/layer_4/mlp": 0.0025294667575508356, "grad/layer_4/attn_mlp_ratio": 0.7474240951526127, "grad/layer_8/attn": 0.009299053810536861, "grad/layer_8/mlp": 0.003510100534185767, "grad/layer_8/attn_mlp_ratio": 2.649227124707195, "grad/layer_12/attn": 0.004398025572299957, "grad/layer_12/mlp": 0.006400042679160833, "grad/layer_12/attn_mlp_ratio": 0.6871869023470218, "grad/layer_16/attn": 0.004764188081026077, "grad/layer_16/mlp": 0.004610098898410797, "grad/layer_16/attn_mlp_ratio": 1.033424245914979, "grad/layer_20/attn": 0.004735231399536133, "grad/layer_20/mlp": 0.00630926014855504, "grad/layer_20/attn_mlp_ratio": 0.7505208555346256, "grad/layer_24/attn": 0.014432131312787533, "grad/layer_24/mlp": 0.011040201410651207, "grad/layer_24/attn_mlp_ratio": 1.3072344104284608, "grad/layer_27/attn": 0.009787694551050663, "grad/layer_27/mlp": 0.010373025201261044, "grad/layer_27/attn_mlp_ratio": 0.9435718381850258} {"step": 43550, "timestamp": 1778241614.951312, "train/loss": 2.240171027183533, "train/z_loss": 0.001380774495191872, "train/perplexity": 9.394937939823565, "train/grad_norm": 0.212890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021293.8207679172, "perf/iters_per_sec": 0.9638280013885103, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0375295162200928, "data/tokens_consumed": 91333066752, "data/tokens_consumed_B": 91.333066752, "train/loss_slope": 3.7035470033171013e-06} {"step": 43560, "timestamp": 1778241625.319035, "train/loss": 2.125043475627899, "train/z_loss": 0.0014054525061510503, "train/perplexity": 8.373261513015944, "train/grad_norm": 0.10791015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023894.0349915982, "perf/iters_per_sec": 0.9650678801496497, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0361965417861938, "data/tokens_consumed": 91354038272, "data/tokens_consumed_B": 91.354038272, "train/loss_slope": 2.413623868757088e-07} {"step": 43570, "timestamp": 1778241635.6671612, "train/loss": 2.1616485834121706, "train/z_loss": 0.001391691796015948, "train/perplexity": 8.685444541976963, "train/grad_norm": 0.234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027851.4058509856, "perf/iters_per_sec": 0.9669549016241958, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034174394607544, "data/tokens_consumed": 91375009792, "data/tokens_consumed_B": 91.375009792, "train/loss_slope": 1.1321136982205397e-06} {"step": 43575, "timestamp": 1778241641.4587305, "eos/sharpness": 23.669576644897457, "eos/L0_probe": 1.995483160018921, "eos/L_plus": 2.10683012008667, "eos/L_minus": 2.1208319664001465, "eos/grad_norm": 0.09838619828224182, "eos/embed_grad_frac": 0.21939899027347565, "eos/time_s": 0.6253738403320312} {"step": 43575, "timestamp": 1778241642.8422158, "geo/rankme_last": 439.4251403808594, "geo/layer_0/stable_rank_q_proj": 19.293804168701172, "geo/layer_0/stable_rank_k_proj": 16.490161895751953, "geo/layer_0/stable_rank_o_proj": 48.76242446899414, "geo/layer_0/stable_rank_gate_proj": 135.7311248779297, "geo/layer_0/stable_rank_down_proj": 53.437686920166016, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06134254112839699, "geo/layer_0/attn_entropy_mean": 6.194257736206055, "geo/layer_0/attn_entropy_std": 0.37538978457450867, "geo/layer_7/stable_rank_q_proj": 43.09654998779297, "geo/layer_7/stable_rank_k_proj": 42.43820571899414, "geo/layer_7/stable_rank_o_proj": 96.10606384277344, "geo/layer_7/stable_rank_gate_proj": 89.03302764892578, "geo/layer_7/stable_rank_down_proj": 145.92013549804688, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4786166846752167, "geo/layer_7/attn_entropy_mean": 4.628025054931641, "geo/layer_7/attn_entropy_std": 0.8013893365859985, "geo/layer_14/stable_rank_q_proj": 54.235904693603516, "geo/layer_14/stable_rank_k_proj": 37.959999084472656, "geo/layer_14/stable_rank_o_proj": 47.58384704589844, "geo/layer_14/stable_rank_gate_proj": 75.67326354980469, "geo/layer_14/stable_rank_down_proj": 133.78289794921875, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3820611238479614, "geo/layer_14/attn_entropy_mean": 5.483353137969971, "geo/layer_14/attn_entropy_std": 0.37338727712631226, "geo/layer_21/stable_rank_q_proj": 42.94268035888672, "geo/layer_21/stable_rank_k_proj": 30.876752853393555, "geo/layer_21/stable_rank_o_proj": 75.29299926757812, "geo/layer_21/stable_rank_gate_proj": 71.58804321289062, "geo/layer_21/stable_rank_down_proj": 54.59680938720703, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14362065494060516, "geo/layer_21/attn_entropy_mean": 5.703036308288574, "geo/layer_21/attn_entropy_std": 0.29328691959381104, "geo/layer_27/stable_rank_q_proj": 42.64695739746094, "geo/layer_27/stable_rank_k_proj": 31.47064971923828, "geo/layer_27/stable_rank_o_proj": 117.00420379638672, "geo/layer_27/stable_rank_gate_proj": 84.2464370727539, "geo/layer_27/stable_rank_down_proj": 130.90902709960938, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08621212840080261, "geo/layer_27/attn_entropy_mean": 4.262582302093506, "geo/layer_27/attn_entropy_std": 0.6661145091056824, "attnres/final_alpha/block_0": 0.2425832748413086, "attnres/block_norm/0": 1.7295045852661133, "attnres/final_alpha/block_1": 0.005207896698266268, "attnres/block_norm/1": 41444.34375, "attnres/final_alpha/block_2": 0.011214201338589191, "attnres/block_norm/2": 26632.55859375, "attnres/final_alpha/block_3": 0.013113836757838726, "attnres/block_norm/3": 48946.6953125, "attnres/final_alpha/block_4": 0.015793602913618088, "attnres/block_norm/4": 13283.609375, "attnres/final_alpha/block_5": 0.596369206905365, "attnres/block_norm/5": 6154.2021484375, "attnres/final_alpha/block_6": 0.11571800708770752, "attnres/block_norm/6": 32429.51953125, "geo/tier1_time_s": 1.363029956817627, "geo/step": 43575.0, "geo/rankme_slope": -0.00018191241340286115} {"step": 43580, "timestamp": 1778241648.0180044, "train/loss": 2.1525963068008425, "train/z_loss": 0.0013962692231871189, "train/perplexity": 8.607176282921309, "train/grad_norm": 0.130859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1699008.6926492362, "perf/iters_per_sec": 0.8101504767652684, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2343385934829711, "data/tokens_consumed": 91395981312, "data/tokens_consumed_B": 91.395981312, "train/loss_slope": -1.9945393730278012e-07} {"step": 43590, "timestamp": 1778241658.3669288, "train/loss": 2.1763980627059936, "train/z_loss": 0.0013958378462120891, "train/perplexity": 8.814499735232983, "train/grad_norm": 0.0849609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027455.7902930335, "perf/iters_per_sec": 0.966766257425801, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343761920928956, "data/tokens_consumed": 91416952832, "data/tokens_consumed_B": 91.416952832, "train/loss_slope": 2.302862100689868e-06} {"step": 43600, "timestamp": 1778241668.704882, "grad/layer_0/attn": 0.0024468491319566965, "grad/layer_0/mlp": 0.002590770600363612, "grad/layer_0/attn_mlp_ratio": 0.9444483572449247, "grad/layer_4/attn": 0.0018266430124640465, "grad/layer_4/mlp": 0.0024022599682211876, "grad/layer_4/attn_mlp_ratio": 0.7603852041784257, "grad/layer_8/attn": 0.004964523483067751, "grad/layer_8/mlp": 0.0035551132168620825, "grad/layer_8/attn_mlp_ratio": 1.3964459190430765, "grad/layer_12/attn": 0.003798532532528043, "grad/layer_12/mlp": 0.006455910857766867, "grad/layer_12/attn_mlp_ratio": 0.5883805643196132, "grad/layer_16/attn": 0.005897643510252237, "grad/layer_16/mlp": 0.004455383401364088, "grad/layer_16/attn_mlp_ratio": 1.3237117542062493, "grad/layer_20/attn": 0.0038079300429672003, "grad/layer_20/mlp": 0.0053672571666538715, "grad/layer_20/attn_mlp_ratio": 0.7094741045161772, "grad/layer_24/attn": 0.012096037156879902, "grad/layer_24/mlp": 0.008100188337266445, "grad/layer_24/attn_mlp_ratio": 1.4933031806062438, "grad/layer_27/attn": 0.003479311941191554, "grad/layer_27/mlp": 0.007816673256456852, "grad/layer_27/attn_mlp_ratio": 0.44511415310932984} {"step": 43600, "timestamp": 1778241668.719218, "train/loss": 2.0975124716758726, "train/z_loss": 0.0014051556121557952, "train/perplexity": 8.145881577975649, "train/grad_norm": 0.12451171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027219.0743828502, "perf/iters_per_sec": 0.9666533824838878, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344969749450683, "data/tokens_consumed": 91437924352, "data/tokens_consumed_B": 91.437924352, "train/loss_slope": 2.439807064354722e-07} {"step": 43610, "timestamp": 1778241679.060639, "train/loss": 2.1554481267929075, "train/z_loss": 0.0013978377333842217, "train/perplexity": 8.63175743416983, "train/grad_norm": 0.1650390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029362.1376220128, "perf/iters_per_sec": 0.9676752746686996, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0334045171737671, "data/tokens_consumed": 91458895872, "data/tokens_consumed_B": 91.458895872, "train/loss_slope": 5.219002725696364e-07} {"step": 43620, "timestamp": 1778241689.4030716, "train/loss": 2.185940384864807, "train/z_loss": 0.0014057409833185374, "train/perplexity": 8.899013117047406, "train/grad_norm": 0.20703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028909.77220608, "perf/iters_per_sec": 0.9674595700292968, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0336349248886108, "data/tokens_consumed": 91479867392, "data/tokens_consumed_B": 91.479867392, "train/loss_slope": -1.9505572612350705e-08} {"step": 43630, "timestamp": 1778241699.7454898, "train/loss": 2.131568193435669, "train/z_loss": 0.0014132171985693277, "train/perplexity": 8.4280733027959, "train/grad_norm": 0.1044921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029247.7633658212, "perf/iters_per_sec": 0.9676207367734057, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0334627628326416, "data/tokens_consumed": 91500838912, "data/tokens_consumed_B": 91.500838912, "train/loss_slope": -3.0743598079595856e-06} {"step": 43640, "timestamp": 1778241710.0887876, "train/loss": 2.1523423910140993, "train/z_loss": 0.001383902516681701, "train/perplexity": 8.604991062426524, "train/grad_norm": 0.212890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028445.1187580253, "perf/iters_per_sec": 0.9672380059995772, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0338716983795166, "data/tokens_consumed": 91521810432, "data/tokens_consumed_B": 91.521810432, "train/loss_slope": -4.55092904519793e-06} {"step": 43650, "timestamp": 1778241720.4219904, "grad/layer_0/attn": 0.002460360759869218, "grad/layer_0/mlp": 0.0027682741638273, "grad/layer_0/attn_mlp_ratio": 0.8887705932964988, "grad/layer_4/attn": 0.0018528790678828955, "grad/layer_4/mlp": 0.0025186650454998016, "grad/layer_4/attn_mlp_ratio": 0.7356591531008032, "grad/layer_8/attn": 0.006941688247025013, "grad/layer_8/mlp": 0.0038085300475358963, "grad/layer_8/attn_mlp_ratio": 1.822668582922012, "grad/layer_12/attn": 0.003977596759796143, "grad/layer_12/mlp": 0.006589449476450682, "grad/layer_12/attn_mlp_ratio": 0.60363110964704, "grad/layer_16/attn": 0.00335707888007164, "grad/layer_16/mlp": 0.004182728007435799, "grad/layer_16/attn_mlp_ratio": 0.802605092619726, "grad/layer_20/attn": 0.006959826219826937, "grad/layer_20/mlp": 0.0053263697773218155, "grad/layer_20/attn_mlp_ratio": 1.3066734718255146, "grad/layer_24/attn": 0.005387814249843359, "grad/layer_24/mlp": 0.007817921228706837, "grad/layer_24/attn_mlp_ratio": 0.6891619937463042, "grad/layer_27/attn": 0.004628167487680912, "grad/layer_27/mlp": 0.006742095574736595, "grad/layer_27/attn_mlp_ratio": 0.6864582929345228} {"step": 43650, "timestamp": 1778241721.0216544, "eos/sharpness": 4.246377944946288, "eos/L0_probe": 1.9923852682113647, "eos/L_plus": 2.0164871215820312, "eos/L_minus": 2.010747194290161, "eos/grad_norm": 0.08976563811302185, "eos/embed_grad_frac": 0.2807776629924774, "eos/time_s": 0.5969076156616211} {"step": 43650, "timestamp": 1778241721.042142, "train/loss": 2.2027010679244996, "train/z_loss": 0.001394256856292486, "train/perplexity": 9.049423625779388, "train/grad_norm": 0.08984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1915866.685100844, "perf/iters_per_sec": 0.9135564256195278, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0946231365203858, "data/tokens_consumed": 91542781952, "data/tokens_consumed_B": 91.542781952, "train/loss_slope": -3.594688885640445e-06} {"step": 43650, "timestamp": 1778241722.409978, "geo/rankme_last": 438.9125061035156, "geo/layer_0/stable_rank_q_proj": 19.315656661987305, "geo/layer_0/stable_rank_k_proj": 16.47296142578125, "geo/layer_0/stable_rank_o_proj": 48.79659652709961, "geo/layer_0/stable_rank_gate_proj": 135.76036071777344, "geo/layer_0/stable_rank_down_proj": 53.38241195678711, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0615263506770134, "geo/layer_0/attn_entropy_mean": 6.192020893096924, "geo/layer_0/attn_entropy_std": 0.38179266452789307, "geo/layer_7/stable_rank_q_proj": 43.078609466552734, "geo/layer_7/stable_rank_k_proj": 42.39608383178711, "geo/layer_7/stable_rank_o_proj": 95.85362243652344, "geo/layer_7/stable_rank_gate_proj": 88.88614654541016, "geo/layer_7/stable_rank_down_proj": 145.96441650390625, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4873240292072296, "geo/layer_7/attn_entropy_mean": 4.662515640258789, "geo/layer_7/attn_entropy_std": 0.8145255446434021, "geo/layer_14/stable_rank_q_proj": 54.31293869018555, "geo/layer_14/stable_rank_k_proj": 38.03311538696289, "geo/layer_14/stable_rank_o_proj": 47.583858489990234, "geo/layer_14/stable_rank_gate_proj": 75.7481689453125, "geo/layer_14/stable_rank_down_proj": 133.52325439453125, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.36817070841789246, "geo/layer_14/attn_entropy_mean": 5.504705429077148, "geo/layer_14/attn_entropy_std": 0.3856324851512909, "geo/layer_21/stable_rank_q_proj": 42.867794036865234, "geo/layer_21/stable_rank_k_proj": 30.887868881225586, "geo/layer_21/stable_rank_o_proj": 75.197021484375, "geo/layer_21/stable_rank_gate_proj": 71.44770050048828, "geo/layer_21/stable_rank_down_proj": 54.56623458862305, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15029554069042206, "geo/layer_21/attn_entropy_mean": 5.713504314422607, "geo/layer_21/attn_entropy_std": 0.2888474762439728, "geo/layer_27/stable_rank_q_proj": 42.66014099121094, "geo/layer_27/stable_rank_k_proj": 31.519689559936523, "geo/layer_27/stable_rank_o_proj": 116.88887786865234, "geo/layer_27/stable_rank_gate_proj": 84.32286834716797, "geo/layer_27/stable_rank_down_proj": 130.86184692382812, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09352219104766846, "geo/layer_27/attn_entropy_mean": 4.28663969039917, "geo/layer_27/attn_entropy_std": 0.6675812602043152, "attnres/final_alpha/block_0": 0.24143308401107788, "attnres/block_norm/0": 1.729874849319458, "attnres/final_alpha/block_1": 0.005073354113847017, "attnres/block_norm/1": 41433.26953125, "attnres/final_alpha/block_2": 0.01104788202792406, "attnres/block_norm/2": 26768.56640625, "attnres/final_alpha/block_3": 0.013073842972517014, "attnres/block_norm/3": 49005.34375, "attnres/final_alpha/block_4": 0.015599981881678104, "attnres/block_norm/4": 13278.681640625, "attnres/final_alpha/block_5": 0.5981782674789429, "attnres/block_norm/5": 6179.44140625, "attnres/final_alpha/block_6": 0.11559358239173889, "attnres/block_norm/6": 32576.3828125, "geo/tier1_time_s": 1.363502025604248, "geo/step": 43650.0, "geo/rankme_slope": -0.00018500202815501201} {"step": 43660, "timestamp": 1778241732.7510836, "train/loss": 2.160091495513916, "train/z_loss": 0.0013965115882456302, "train/perplexity": 8.671931064955768, "train/grad_norm": 0.09765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1791678.4062948762, "perf/iters_per_sec": 0.8543388396715528, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1704957723617553, "data/tokens_consumed": 91563753472, "data/tokens_consumed_B": 91.563753472, "train/loss_slope": -7.547953813382899e-06} {"step": 43670, "timestamp": 1778241743.1153963, "train/loss": 2.1616718769073486, "train/z_loss": 0.001398101611994207, "train/perplexity": 8.685646858693843, "train/grad_norm": 0.1640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024742.9027729477, "perf/iters_per_sec": 0.9654726518501986, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035762119293213, "data/tokens_consumed": 91584724992, "data/tokens_consumed_B": 91.584724992, "train/loss_slope": -1.1238779601055061e-05} {"step": 43680, "timestamp": 1778241753.4599824, "train/loss": 2.1097628593444826, "train/z_loss": 0.0013972125947475434, "train/perplexity": 8.246285523186, "train/grad_norm": 0.09716796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028237.214223972, "perf/iters_per_sec": 0.9671388693923817, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0339776754379273, "data/tokens_consumed": 91605696512, "data/tokens_consumed_B": 91.605696512, "train/loss_slope": -1.56321118767112e-05} {"step": 43690, "timestamp": 1778241763.8149536, "train/loss": 2.160540294647217, "train/z_loss": 0.0014039054745808243, "train/perplexity": 8.67582389358548, "train/grad_norm": 0.2294921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026709.9440563926, "perf/iters_per_sec": 0.9664106102258647, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034756851196289, "data/tokens_consumed": 91626668032, "data/tokens_consumed_B": 91.626668032, "train/loss_slope": -1.454169336754461e-05} {"step": 43700, "timestamp": 1778241774.1496933, "grad/layer_0/attn": 0.002561839297413826, "grad/layer_0/mlp": 0.00265862955711782, "grad/layer_0/attn_mlp_ratio": 0.9635938915204448, "grad/layer_4/attn": 0.0018858229741454124, "grad/layer_4/mlp": 0.0024502614978700876, "grad/layer_4/attn_mlp_ratio": 0.7696414847233787, "grad/layer_8/attn": 0.0035262778401374817, "grad/layer_8/mlp": 0.0035950911697000265, "grad/layer_8/attn_mlp_ratio": 0.9808590590891211, "grad/layer_12/attn": 0.004007220733910799, "grad/layer_12/mlp": 0.0064866808243095875, "grad/layer_12/attn_mlp_ratio": 0.6177613452348605, "grad/layer_16/attn": 0.0033729563001543283, "grad/layer_16/mlp": 0.004603833891451359, "grad/layer_16/attn_mlp_ratio": 0.7326407308381256, "grad/layer_20/attn": 0.0031854428816586733, "grad/layer_20/mlp": 0.006146903615444899, "grad/layer_20/attn_mlp_ratio": 0.5182190951933787, "grad/layer_24/attn": 0.005676036234945059, "grad/layer_24/mlp": 0.00780599657446146, "grad/layer_24/attn_mlp_ratio": 0.7271379263477142, "grad/layer_27/attn": 0.003918593283742666, "grad/layer_27/mlp": 0.006486319005489349, "grad/layer_27/attn_mlp_ratio": 0.6041320539451065} {"step": 43700, "timestamp": 1778241774.1637988, "train/loss": 2.135942351818085, "train/z_loss": 0.0013920987024903298, "train/perplexity": 8.46501977623533, "train/grad_norm": 0.09033203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027923.123023914, "perf/iters_per_sec": 0.9669890990371294, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341378211975099, "data/tokens_consumed": 91647639552, "data/tokens_consumed_B": 91.647639552, "train/loss_slope": -1.6568231282204134e-05} {"step": 43710, "timestamp": 1778241784.5105817, "train/loss": 2.1166968941688538, "train/z_loss": 0.0014138911734335124, "train/perplexity": 8.303664257344607, "train/grad_norm": 0.10400390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027932.6140076972, "perf/iters_per_sec": 0.9669936246908651, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034132981300354, "data/tokens_consumed": 91668611072, "data/tokens_consumed_B": 91.668611072, "train/loss_slope": -1.8419154992949076e-05} {"step": 43720, "timestamp": 1778241794.862343, "train/loss": 2.1926740646362304, "train/z_loss": 0.0014025412849150598, "train/perplexity": 8.959138426764664, "train/grad_norm": 0.083984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027340.7897340015, "perf/iters_per_sec": 0.9667114208860405, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344348669052124, "data/tokens_consumed": 91689582592, "data/tokens_consumed_B": 91.689582592, "train/loss_slope": -1.8173493696625346e-05} {"step": 43725, "timestamp": 1778241800.620431, "eos/sharpness": 66.23156070709227, "eos/L0_probe": 1.9961391687393188, "eos/L_plus": 2.269998788833618, "eos/L_minus": 2.3845951557159424, "eos/grad_norm": 0.15899990499019623, "eos/embed_grad_frac": 0.09223149716854095, "eos/time_s": 0.5922706127166748} {"step": 43725, "timestamp": 1778241801.9992945, "geo/rankme_last": 438.63446044921875, "geo/layer_0/stable_rank_q_proj": 19.282512664794922, "geo/layer_0/stable_rank_k_proj": 16.41779899597168, "geo/layer_0/stable_rank_o_proj": 48.73706817626953, "geo/layer_0/stable_rank_gate_proj": 135.72540283203125, "geo/layer_0/stable_rank_down_proj": 53.511268615722656, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.058268919587135315, "geo/layer_0/attn_entropy_mean": 6.192995071411133, "geo/layer_0/attn_entropy_std": 0.3824600875377655, "geo/layer_7/stable_rank_q_proj": 43.039146423339844, "geo/layer_7/stable_rank_k_proj": 42.36408233642578, "geo/layer_7/stable_rank_o_proj": 95.88533020019531, "geo/layer_7/stable_rank_gate_proj": 88.86811828613281, "geo/layer_7/stable_rank_down_proj": 146.1801300048828, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4788372814655304, "geo/layer_7/attn_entropy_mean": 4.646127700805664, "geo/layer_7/attn_entropy_std": 0.8195564150810242, "geo/layer_14/stable_rank_q_proj": 54.26800537109375, "geo/layer_14/stable_rank_k_proj": 37.94405746459961, "geo/layer_14/stable_rank_o_proj": 47.47739028930664, "geo/layer_14/stable_rank_gate_proj": 75.78546905517578, "geo/layer_14/stable_rank_down_proj": 133.6339111328125, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3660416901111603, "geo/layer_14/attn_entropy_mean": 5.517956733703613, "geo/layer_14/attn_entropy_std": 0.36726272106170654, "geo/layer_21/stable_rank_q_proj": 42.91444396972656, "geo/layer_21/stable_rank_k_proj": 30.816999435424805, "geo/layer_21/stable_rank_o_proj": 75.18521881103516, "geo/layer_21/stable_rank_gate_proj": 71.50819396972656, "geo/layer_21/stable_rank_down_proj": 54.568603515625, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14980323612689972, "geo/layer_21/attn_entropy_mean": 5.712876319885254, "geo/layer_21/attn_entropy_std": 0.2886323630809784, "geo/layer_27/stable_rank_q_proj": 42.75191116333008, "geo/layer_27/stable_rank_k_proj": 31.561952590942383, "geo/layer_27/stable_rank_o_proj": 116.9778060913086, "geo/layer_27/stable_rank_gate_proj": 84.32093811035156, "geo/layer_27/stable_rank_down_proj": 130.818603515625, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08423609286546707, "geo/layer_27/attn_entropy_mean": 4.284012794494629, "geo/layer_27/attn_entropy_std": 0.6758022904396057, "attnres/final_alpha/block_0": 0.24226778745651245, "attnres/block_norm/0": 1.7299542427062988, "attnres/final_alpha/block_1": 0.005114886909723282, "attnres/block_norm/1": 41336.81640625, "attnres/final_alpha/block_2": 0.011061510071158409, "attnres/block_norm/2": 26857.77734375, "attnres/final_alpha/block_3": 0.01306869089603424, "attnres/block_norm/3": 49393.109375, "attnres/final_alpha/block_4": 0.015749000012874603, "attnres/block_norm/4": 13242.63671875, "attnres/final_alpha/block_5": 0.5960472822189331, "attnres/block_norm/5": 6173.45166015625, "attnres/final_alpha/block_6": 0.11669081449508667, "attnres/block_norm/6": 32621.0703125, "geo/tier1_time_s": 1.3613393306732178, "geo/step": 43725.0, "geo/rankme_slope": -0.00018002644417141857} {"step": 43730, "timestamp": 1778241807.175523, "train/loss": 2.1335187673568727, "train/z_loss": 0.001401124382391572, "train/perplexity": 8.444528926524134, "train/grad_norm": 0.12255859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1704176.036153259, "perf/iters_per_sec": 0.812614458157186, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.230595874786377, "data/tokens_consumed": 91710554112, "data/tokens_consumed_B": 91.710554112, "train/loss_slope": -2.067024082598151e-05} {"step": 43740, "timestamp": 1778241817.53232, "train/loss": 2.156217837333679, "train/z_loss": 0.0013923575752414763, "train/perplexity": 8.638403946469436, "train/grad_norm": 0.087890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026000.8109463877, "perf/iters_per_sec": 0.9660724692089022, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351190328598023, "data/tokens_consumed": 91731525632, "data/tokens_consumed_B": 91.731525632, "train/loss_slope": -2.375285566800924e-05} {"step": 43750, "timestamp": 1778241827.8715127, "grad/layer_0/attn": 0.0038518363144248724, "grad/layer_0/mlp": 0.0032821695785969496, "grad/layer_0/attn_mlp_ratio": 1.173564041963681, "grad/layer_4/attn": 0.00186262431088835, "grad/layer_4/mlp": 0.002538626780733466, "grad/layer_4/attn_mlp_ratio": 0.7337133018737264, "grad/layer_8/attn": 0.004729948937892914, "grad/layer_8/mlp": 0.0035161483101546764, "grad/layer_8/attn_mlp_ratio": 1.3452074219144925, "grad/layer_12/attn": 0.004721282050013542, "grad/layer_12/mlp": 0.006341385655105114, "grad/layer_12/attn_mlp_ratio": 0.7445189793433854, "grad/layer_16/attn": 0.004252102691680193, "grad/layer_16/mlp": 0.00506513612344861, "grad/layer_16/attn_mlp_ratio": 0.8394843700344033, "grad/layer_20/attn": 0.0053827883675694466, "grad/layer_20/mlp": 0.007142757996916771, "grad/layer_20/attn_mlp_ratio": 0.7536008212140028, "grad/layer_24/attn": 0.01572941057384014, "grad/layer_24/mlp": 0.012048408389091492, "grad/layer_24/attn_mlp_ratio": 1.305517702863527, "grad/layer_27/attn": 0.009729035198688507, "grad/layer_27/mlp": 0.01083358284085989, "grad/layer_27/attn_mlp_ratio": 0.8980440960113505} {"step": 43750, "timestamp": 1778241827.8859277, "train/loss": 2.154239022731781, "train/z_loss": 0.001403198076877743, "train/perplexity": 8.621327048183254, "train/grad_norm": 0.234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027317.613630861, "perf/iters_per_sec": 0.9667003696588807, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034446692466736, "data/tokens_consumed": 91752497152, "data/tokens_consumed_B": 91.752497152, "train/loss_slope": -2.5790559590989434e-05} {"step": 43760, "timestamp": 1778241838.229088, "train/loss": 2.1437002182006837, "train/z_loss": 0.001403667510021478, "train/perplexity": 8.530945660041635, "train/grad_norm": 0.087890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028885.624260584, "perf/iters_per_sec": 0.9674480553915901, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0336472272872925, "data/tokens_consumed": 91773468672, "data/tokens_consumed_B": 91.773468672, "train/loss_slope": -2.8951638631194004e-05} {"step": 43770, "timestamp": 1778241848.569894, "train/loss": 2.1737178325653077, "train/z_loss": 0.0013881970779038965, "train/perplexity": 8.790906479169328, "train/grad_norm": 0.1806640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029132.1849621625, "perf/iters_per_sec": 0.9675656246958554, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0335216283798219, "data/tokens_consumed": 91794440192, "data/tokens_consumed_B": 91.794440192, "train/loss_slope": -2.7850719911716608e-05} {"step": 43780, "timestamp": 1778241858.9168544, "train/loss": 2.1119669318199157, "train/z_loss": 0.0014100970001891255, "train/perplexity": 8.264480978817655, "train/grad_norm": 0.1669921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028094.582222122, "perf/iters_per_sec": 0.9670708571539507, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340503931045533, "data/tokens_consumed": 91815411712, "data/tokens_consumed_B": 91.815411712, "train/loss_slope": -3.0654549720299434e-05} {"step": 43790, "timestamp": 1778241869.2599828, "train/loss": 2.198914313316345, "train/z_loss": 0.0013900832505896687, "train/perplexity": 9.015220479495582, "train/grad_norm": 0.19140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029035.6690613232, "perf/iters_per_sec": 0.9675196023279777, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0335707902908324, "data/tokens_consumed": 91836383232, "data/tokens_consumed_B": 91.836383232, "train/loss_slope": -2.7744874373377857e-05} {"step": 43800, "timestamp": 1778241879.5979664, "grad/layer_0/attn": 0.002749665640294552, "grad/layer_0/mlp": 0.0028057610616087914, "grad/layer_0/attn_mlp_ratio": 0.980007021951193, "grad/layer_4/attn": 0.0034789289347827435, "grad/layer_4/mlp": 0.0025957762263715267, "grad/layer_4/attn_mlp_ratio": 1.3402267751034314, "grad/layer_8/attn": 0.004605113994330168, "grad/layer_8/mlp": 0.0039309062995016575, "grad/layer_8/attn_mlp_ratio": 1.1715145379482919, "grad/layer_12/attn": 0.004915941972285509, "grad/layer_12/mlp": 0.007136876694858074, "grad/layer_12/attn_mlp_ratio": 0.6888085802219974, "grad/layer_16/attn": 0.003919822629541159, "grad/layer_16/mlp": 0.004892582539469004, "grad/layer_16/attn_mlp_ratio": 0.8011765806303439, "grad/layer_20/attn": 0.005229806527495384, "grad/layer_20/mlp": 0.006824404001235962, "grad/layer_20/attn_mlp_ratio": 0.7663389286323503, "grad/layer_24/attn": 0.008702578023076057, "grad/layer_24/mlp": 0.008685572072863579, "grad/layer_24/attn_mlp_ratio": 1.0019579424215264, "grad/layer_27/attn": 0.004038440529257059, "grad/layer_27/mlp": 0.007747539319097996, "grad/layer_27/attn_mlp_ratio": 0.5212545959175298} {"step": 43800, "timestamp": 1778241880.1990814, "eos/sharpness": 33.28025341033935, "eos/L0_probe": 1.9969788789749146, "eos/L_plus": 2.1452229022979736, "eos/L_minus": 2.181537389755249, "eos/grad_norm": 0.11447998136281967, "eos/embed_grad_frac": 0.19006992876529694, "eos/time_s": 0.5981812477111816} {"step": 43800, "timestamp": 1778241880.2193534, "train/loss": 2.139625597000122, "train/z_loss": 0.0014009747421368957, "train/perplexity": 8.496256009581664, "train/grad_norm": 0.1142578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1914986.226811435, "perf/iters_per_sec": 0.9131365903908897, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0951264142990111, "data/tokens_consumed": 91857354752, "data/tokens_consumed_B": 91.857354752, "train/loss_slope": -2.841909739336664e-05} {"step": 43800, "timestamp": 1778241881.5857568, "geo/rankme_last": 440.1595153808594, "geo/layer_0/stable_rank_q_proj": 19.301790237426758, "geo/layer_0/stable_rank_k_proj": 16.394947052001953, "geo/layer_0/stable_rank_o_proj": 48.75204086303711, "geo/layer_0/stable_rank_gate_proj": 135.53033447265625, "geo/layer_0/stable_rank_down_proj": 53.39469528198242, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06046604737639427, "geo/layer_0/attn_entropy_mean": 6.186671257019043, "geo/layer_0/attn_entropy_std": 0.38501599431037903, "geo/layer_7/stable_rank_q_proj": 43.0919075012207, "geo/layer_7/stable_rank_k_proj": 42.490840911865234, "geo/layer_7/stable_rank_o_proj": 95.94772338867188, "geo/layer_7/stable_rank_gate_proj": 88.90170288085938, "geo/layer_7/stable_rank_down_proj": 146.22691345214844, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.48828834295272827, "geo/layer_7/attn_entropy_mean": 4.626863479614258, "geo/layer_7/attn_entropy_std": 0.8113494515419006, "geo/layer_14/stable_rank_q_proj": 54.17517852783203, "geo/layer_14/stable_rank_k_proj": 37.948463439941406, "geo/layer_14/stable_rank_o_proj": 47.36909866333008, "geo/layer_14/stable_rank_gate_proj": 75.66861724853516, "geo/layer_14/stable_rank_down_proj": 133.58103942871094, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3848833441734314, "geo/layer_14/attn_entropy_mean": 5.485847473144531, "geo/layer_14/attn_entropy_std": 0.36158910393714905, "geo/layer_21/stable_rank_q_proj": 42.92124557495117, "geo/layer_21/stable_rank_k_proj": 30.955427169799805, "geo/layer_21/stable_rank_o_proj": 75.38785552978516, "geo/layer_21/stable_rank_gate_proj": 71.59329223632812, "geo/layer_21/stable_rank_down_proj": 54.50851058959961, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15053854882717133, "geo/layer_21/attn_entropy_mean": 5.694188594818115, "geo/layer_21/attn_entropy_std": 0.29566067457199097, "geo/layer_27/stable_rank_q_proj": 42.76049041748047, "geo/layer_27/stable_rank_k_proj": 31.574575424194336, "geo/layer_27/stable_rank_o_proj": 116.90685272216797, "geo/layer_27/stable_rank_gate_proj": 84.51122283935547, "geo/layer_27/stable_rank_down_proj": 130.73675537109375, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0824747160077095, "geo/layer_27/attn_entropy_mean": 4.263795375823975, "geo/layer_27/attn_entropy_std": 0.6869111061096191, "attnres/final_alpha/block_0": 0.24155358970165253, "attnres/block_norm/0": 1.7300677299499512, "attnres/final_alpha/block_1": 0.005020345561206341, "attnres/block_norm/1": 41441.203125, "attnres/final_alpha/block_2": 0.01104438304901123, "attnres/block_norm/2": 26725.84375, "attnres/final_alpha/block_3": 0.012891678139567375, "attnres/block_norm/3": 49144.66015625, "attnres/final_alpha/block_4": 0.015778927132487297, "attnres/block_norm/4": 13327.9921875, "attnres/final_alpha/block_5": 0.5986192226409912, "attnres/block_norm/5": 6124.6943359375, "attnres/final_alpha/block_6": 0.11509187519550323, "attnres/block_norm/6": 32358.30859375, "geo/tier1_time_s": 1.3621420860290527, "geo/step": 43800.0, "geo/rankme_slope": -0.00012697041707307924} {"step": 43810, "timestamp": 1778241891.9344034, "train/loss": 2.1788808584213255, "train/z_loss": 0.0013996100635267795, "train/perplexity": 8.836411527404417, "train/grad_norm": 0.1396484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1790756.0752799346, "perf/iters_per_sec": 0.8538990379714654, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1710986375808716, "data/tokens_consumed": 91878326272, "data/tokens_consumed_B": 91.878326272, "train/loss_slope": -3.053464400242513e-05} {"step": 43820, "timestamp": 1778241902.2803547, "train/loss": 2.1858790636062624, "train/z_loss": 0.0013996016117744149, "train/perplexity": 8.898467435094386, "train/grad_norm": 0.1337890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028340.2023241038, "perf/iters_per_sec": 0.9671879779453773, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033925175666809, "data/tokens_consumed": 91899297792, "data/tokens_consumed_B": 91.899297792, "train/loss_slope": -2.7624383832540647e-05} {"step": 43830, "timestamp": 1778241912.6290514, "train/loss": 2.1310011982917785, "train/z_loss": 0.0014065968221984803, "train/perplexity": 8.423295980648078, "train/grad_norm": 0.1865234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027996.3414853343, "perf/iters_per_sec": 0.9670240123202011, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341004848480224, "data/tokens_consumed": 91920269312, "data/tokens_consumed_B": 91.920269312, "train/loss_slope": -2.8761167888200698e-05} {"step": 43840, "timestamp": 1778241922.9782872, "train/loss": 2.169577956199646, "train/z_loss": 0.0014053731225430965, "train/perplexity": 8.754588441167, "train/grad_norm": 0.1748046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027393.8257540276, "perf/iters_per_sec": 0.9667367104311121, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344078063964843, "data/tokens_consumed": 91941240832, "data/tokens_consumed_B": 91.941240832, "train/loss_slope": -2.544811291269736e-05} {"step": 43850, "timestamp": 1778241933.3142736, "grad/layer_0/attn": 0.0028870615642517805, "grad/layer_0/mlp": 0.0029248998034745455, "grad/layer_0/attn_mlp_ratio": 0.987063372945578, "grad/layer_4/attn": 0.003034817986190319, "grad/layer_4/mlp": 0.0025866134092211723, "grad/layer_4/attn_mlp_ratio": 1.173278487633083, "grad/layer_8/attn": 0.0041219270788133144, "grad/layer_8/mlp": 0.003704596543684602, "grad/layer_8/attn_mlp_ratio": 1.112652057772647, "grad/layer_12/attn": 0.004407151136547327, "grad/layer_12/mlp": 0.006668122950941324, "grad/layer_12/attn_mlp_ratio": 0.660928285647815, "grad/layer_16/attn": 0.003720885142683983, "grad/layer_16/mlp": 0.004510835744440556, "grad/layer_16/attn_mlp_ratio": 0.8248770895242933, "grad/layer_20/attn": 0.006764160469174385, "grad/layer_20/mlp": 0.005775993224233389, "grad/layer_20/attn_mlp_ratio": 1.1710817671473244, "grad/layer_24/attn": 0.012255617417395115, "grad/layer_24/mlp": 0.011484932154417038, "grad/layer_24/attn_mlp_ratio": 1.067104023420049, "grad/layer_27/attn": 0.003902378026396036, "grad/layer_27/mlp": 0.010907438583672047, "grad/layer_27/attn_mlp_ratio": 0.3577721717783042} {"step": 43850, "timestamp": 1778241933.3289487, "train/loss": 2.1981225967407227, "train/z_loss": 0.0013923829304985703, "train/perplexity": 9.0080858047019, "train/grad_norm": 0.166015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027547.4355929145, "perf/iters_per_sec": 0.966809957310159, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343294382095336, "data/tokens_consumed": 91962212352, "data/tokens_consumed_B": 91.962212352, "train/loss_slope": -2.490055957356979e-05} {"step": 43860, "timestamp": 1778241943.6723366, "train/loss": 2.1433109521865843, "train/z_loss": 0.0013901124359108508, "train/perplexity": 8.527625499082887, "train/grad_norm": 0.220703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028844.8174110872, "perf/iters_per_sec": 0.9674285971694409, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0336680173873902, "data/tokens_consumed": 91983183872, "data/tokens_consumed_B": 91.983183872, "train/loss_slope": -2.456549613138878e-05} {"step": 43870, "timestamp": 1778241954.0233188, "train/loss": 2.197619342803955, "train/z_loss": 0.0013989111990667879, "train/perplexity": 9.003553590580895, "train/grad_norm": 0.197265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027176.2789070068, "perf/iters_per_sec": 0.9666329760108027, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345188140869142, "data/tokens_consumed": 92004155392, "data/tokens_consumed_B": 92.004155392, "train/loss_slope": -2.1662480152300654e-05} {"step": 43875, "timestamp": 1778241959.7926977, "eos/sharpness": 62.930250167846665, "eos/L0_probe": 1.9918533563613892, "eos/L_plus": 2.2909047603607178, "eos/L_minus": 2.3221044540405273, "eos/grad_norm": 0.2276071310043335, "eos/embed_grad_frac": 0.04696439206600189, "eos/time_s": 0.6003339290618896} {"step": 43875, "timestamp": 1778241961.172603, "geo/rankme_last": 438.25244140625, "geo/layer_0/stable_rank_q_proj": 19.327686309814453, "geo/layer_0/stable_rank_k_proj": 16.425180435180664, "geo/layer_0/stable_rank_o_proj": 48.80690383911133, "geo/layer_0/stable_rank_gate_proj": 135.54739379882812, "geo/layer_0/stable_rank_down_proj": 53.378787994384766, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06368103623390198, "geo/layer_0/attn_entropy_mean": 6.186605453491211, "geo/layer_0/attn_entropy_std": 0.3810804784297943, "geo/layer_7/stable_rank_q_proj": 43.05172348022461, "geo/layer_7/stable_rank_k_proj": 42.502811431884766, "geo/layer_7/stable_rank_o_proj": 95.71039581298828, "geo/layer_7/stable_rank_gate_proj": 89.08463287353516, "geo/layer_7/stable_rank_down_proj": 145.94862365722656, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4817292094230652, "geo/layer_7/attn_entropy_mean": 4.64314079284668, "geo/layer_7/attn_entropy_std": 0.8154610395431519, "geo/layer_14/stable_rank_q_proj": 54.1260986328125, "geo/layer_14/stable_rank_k_proj": 38.05841064453125, "geo/layer_14/stable_rank_o_proj": 47.31180953979492, "geo/layer_14/stable_rank_gate_proj": 75.65348815917969, "geo/layer_14/stable_rank_down_proj": 134.1139678955078, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38994652032852173, "geo/layer_14/attn_entropy_mean": 5.504944801330566, "geo/layer_14/attn_entropy_std": 0.3851260244846344, "geo/layer_21/stable_rank_q_proj": 42.9140739440918, "geo/layer_21/stable_rank_k_proj": 30.859888076782227, "geo/layer_21/stable_rank_o_proj": 75.4493179321289, "geo/layer_21/stable_rank_gate_proj": 71.53655242919922, "geo/layer_21/stable_rank_down_proj": 54.61759948730469, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1460287719964981, "geo/layer_21/attn_entropy_mean": 5.711793899536133, "geo/layer_21/attn_entropy_std": 0.28979986906051636, "geo/layer_27/stable_rank_q_proj": 42.71635055541992, "geo/layer_27/stable_rank_k_proj": 31.601329803466797, "geo/layer_27/stable_rank_o_proj": 117.12805938720703, "geo/layer_27/stable_rank_gate_proj": 84.52824401855469, "geo/layer_27/stable_rank_down_proj": 130.5933380126953, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08720570802688599, "geo/layer_27/attn_entropy_mean": 4.256303787231445, "geo/layer_27/attn_entropy_std": 0.69844651222229, "attnres/final_alpha/block_0": 0.24333050847053528, "attnres/block_norm/0": 1.7301510572433472, "attnres/final_alpha/block_1": 0.005080917850136757, "attnres/block_norm/1": 41666.8203125, "attnres/final_alpha/block_2": 0.011140541173517704, "attnres/block_norm/2": 26893.6171875, "attnres/final_alpha/block_3": 0.01333547942340374, "attnres/block_norm/3": 48900.00390625, "attnres/final_alpha/block_4": 0.01602335274219513, "attnres/block_norm/4": 13277.375, "attnres/final_alpha/block_5": 0.59493088722229, "attnres/block_norm/5": 6144.04931640625, "attnres/final_alpha/block_6": 0.11615833640098572, "attnres/block_norm/6": 32667.4765625, "geo/tier1_time_s": 1.3593876361846924, "geo/step": 43875.0, "geo/rankme_slope": -0.00012890783266431572} {"step": 43880, "timestamp": 1778241966.3499343, "train/loss": 2.1534345388412475, "train/z_loss": 0.0013928829925134779, "train/perplexity": 8.614394118546985, "train/grad_norm": 0.205078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1702357.3824267676, "perf/iters_per_sec": 0.8117472564824904, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2319105386734008, "data/tokens_consumed": 92025126912, "data/tokens_consumed_B": 92.025126912, "train/loss_slope": -2.0897086039341996e-05} {"step": 43890, "timestamp": 1778241976.6925745, "train/loss": 2.134777045249939, "train/z_loss": 0.0013951721251942218, "train/perplexity": 8.455161178350647, "train/grad_norm": 0.09228515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028669.9094001818, "perf/iters_per_sec": 0.9673451945305738, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0337571382522583, "data/tokens_consumed": 92046098432, "data/tokens_consumed_B": 92.046098432, "train/loss_slope": -2.009828508418849e-05} {"step": 43900, "timestamp": 1778241987.0306304, "grad/layer_0/attn": 0.0025680509861558676, "grad/layer_0/mlp": 0.0025920893531292677, "grad/layer_0/attn_mlp_ratio": 0.9907262201370481, "grad/layer_4/attn": 0.0033648062963038683, "grad/layer_4/mlp": 0.0024929335340857506, "grad/layer_4/attn_mlp_ratio": 1.3497376144703768, "grad/layer_8/attn": 0.003802073886618018, "grad/layer_8/mlp": 0.003431877586990595, "grad/layer_8/attn_mlp_ratio": 1.1078698699055465, "grad/layer_12/attn": 0.007449132390320301, "grad/layer_12/mlp": 0.006718690041452646, "grad/layer_12/attn_mlp_ratio": 1.1087179544657089, "grad/layer_16/attn": 0.0065366425551474094, "grad/layer_16/mlp": 0.004394789692014456, "grad/layer_16/attn_mlp_ratio": 1.4873618226347944, "grad/layer_20/attn": 0.0057542650029063225, "grad/layer_20/mlp": 0.005904261022806168, "grad/layer_20/attn_mlp_ratio": 0.9745952767365826, "grad/layer_24/attn": 0.005135020241141319, "grad/layer_24/mlp": 0.007988118566572666, "grad/layer_24/attn_mlp_ratio": 0.642832243170033, "grad/layer_27/attn": 0.005110766738653183, "grad/layer_27/mlp": 0.0071681104600429535, "grad/layer_27/attn_mlp_ratio": 0.7129865947020992} {"step": 43900, "timestamp": 1778241987.045051, "train/loss": 2.169950079917908, "train/z_loss": 0.0013995014247484504, "train/perplexity": 8.757846837395238, "train/grad_norm": 0.09716796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026462.8511560806, "perf/iters_per_sec": 0.9662927871494678, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348830223083496, "data/tokens_consumed": 92067069952, "data/tokens_consumed_B": 92.067069952, "train/loss_slope": -2.1178045081119406e-05} {"step": 43910, "timestamp": 1778241998.0059159, "train/loss": 2.1265218257904053, "train/z_loss": 0.0014032480656169354, "train/perplexity": 8.385649280006948, "train/grad_norm": 0.1748046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1914359.442257841, "perf/iters_per_sec": 0.9128377162255482, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.095484972000122, "data/tokens_consumed": 92088041472, "data/tokens_consumed_B": 92.088041472, "train/loss_slope": -2.2668805223951916e-05} {"step": 43920, "timestamp": 1778242008.3551545, "train/loss": 2.1911067008972167, "train/z_loss": 0.0014039307134225965, "train/perplexity": 8.945107196954618, "train/grad_norm": 0.0966796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027755.853171558, "perf/iters_per_sec": 0.9669093385560789, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342231273651123, "data/tokens_consumed": 92109012992, "data/tokens_consumed_B": 92.109012992, "train/loss_slope": -2.080712344172187e-05} {"step": 43930, "timestamp": 1778242018.7018664, "train/loss": 2.156291437149048, "train/z_loss": 0.0014031451079063117, "train/perplexity": 8.63903975480238, "train/grad_norm": 0.171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027924.1983525895, "perf/iters_per_sec": 0.9669896117937992, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341372728347777, "data/tokens_consumed": 92129984512, "data/tokens_consumed_B": 92.129984512, "train/loss_slope": -1.8132220007011932e-05} {"step": 43940, "timestamp": 1778242029.0447228, "train/loss": 2.0943333387374876, "train/z_loss": 0.0014079449814744295, "train/perplexity": 8.120025858700052, "train/grad_norm": 0.12109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028784.8268353823, "perf/iters_per_sec": 0.9673999914338027, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0336985826492309, "data/tokens_consumed": 92150956032, "data/tokens_consumed_B": 92.150956032, "train/loss_slope": -2.3049108781079172e-05} {"step": 43950, "timestamp": 1778242039.3831038, "grad/layer_0/attn": 0.0026664233300834894, "grad/layer_0/mlp": 0.0026927681174129248, "grad/layer_0/attn_mlp_ratio": 0.9902164296358391, "grad/layer_4/attn": 0.0022219554521143436, "grad/layer_4/mlp": 0.002391176763921976, "grad/layer_4/attn_mlp_ratio": 0.9292309095321047, "grad/layer_8/attn": 0.003521020757034421, "grad/layer_8/mlp": 0.0036939899437129498, "grad/layer_8/attn_mlp_ratio": 0.9531754864978814, "grad/layer_12/attn": 0.004809379577636719, "grad/layer_12/mlp": 0.006498569622635841, "grad/layer_12/attn_mlp_ratio": 0.7400673968126664, "grad/layer_16/attn": 0.006125744432210922, "grad/layer_16/mlp": 0.004767357371747494, "grad/layer_16/attn_mlp_ratio": 1.2849349914525094, "grad/layer_20/attn": 0.0035373473074287176, "grad/layer_20/mlp": 0.00587511109188199, "grad/layer_20/attn_mlp_ratio": 0.6020902740217908, "grad/layer_24/attn": 0.005175669677555561, "grad/layer_24/mlp": 0.007509526796638966, "grad/layer_24/attn_mlp_ratio": 0.6892138145043508, "grad/layer_27/attn": 0.009569689631462097, "grad/layer_27/mlp": 0.006151141598820686, "grad/layer_27/attn_mlp_ratio": 1.555758280336287} {"step": 43950, "timestamp": 1778242040.0736036, "eos/sharpness": 3.8344383239746085, "eos/L0_probe": 1.9947701692581177, "eos/L_plus": 2.018092393875122, "eos/L_minus": 2.0097923278808594, "eos/grad_norm": 0.08563779294490814, "eos/embed_grad_frac": 0.2769952118396759, "eos/time_s": 0.6876466274261475} {"step": 43950, "timestamp": 1778242040.0941255, "train/loss": 2.1199100375175477, "train/z_loss": 0.0013993115280754864, "train/perplexity": 8.330388031589619, "train/grad_norm": 0.08544921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1899274.198660512, "perf/iters_per_sec": 0.905644511537796, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1041860103607177, "data/tokens_consumed": 92171927552, "data/tokens_consumed_B": 92.171927552, "train/loss_slope": -2.458394625292069e-05} {"step": 43950, "timestamp": 1778242041.4542887, "geo/rankme_last": 439.4741516113281, "geo/layer_0/stable_rank_q_proj": 19.36979866027832, "geo/layer_0/stable_rank_k_proj": 16.430147171020508, "geo/layer_0/stable_rank_o_proj": 48.72723388671875, "geo/layer_0/stable_rank_gate_proj": 135.95091247558594, "geo/layer_0/stable_rank_down_proj": 53.47433853149414, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05792434141039848, "geo/layer_0/attn_entropy_mean": 6.18941068649292, "geo/layer_0/attn_entropy_std": 0.37967756390571594, "geo/layer_7/stable_rank_q_proj": 43.084163665771484, "geo/layer_7/stable_rank_k_proj": 42.597267150878906, "geo/layer_7/stable_rank_o_proj": 96.03938293457031, "geo/layer_7/stable_rank_gate_proj": 89.01042175292969, "geo/layer_7/stable_rank_down_proj": 145.9897918701172, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.49811944365501404, "geo/layer_7/attn_entropy_mean": 4.654107093811035, "geo/layer_7/attn_entropy_std": 0.814853310585022, "geo/layer_14/stable_rank_q_proj": 54.2980842590332, "geo/layer_14/stable_rank_k_proj": 38.022396087646484, "geo/layer_14/stable_rank_o_proj": 47.25924301147461, "geo/layer_14/stable_rank_gate_proj": 75.62580871582031, "geo/layer_14/stable_rank_down_proj": 133.85092163085938, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39414966106414795, "geo/layer_14/attn_entropy_mean": 5.518810749053955, "geo/layer_14/attn_entropy_std": 0.3822849988937378, "geo/layer_21/stable_rank_q_proj": 42.91587448120117, "geo/layer_21/stable_rank_k_proj": 30.854158401489258, "geo/layer_21/stable_rank_o_proj": 75.353271484375, "geo/layer_21/stable_rank_gate_proj": 71.50812530517578, "geo/layer_21/stable_rank_down_proj": 54.576805114746094, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14426104724407196, "geo/layer_21/attn_entropy_mean": 5.707792282104492, "geo/layer_21/attn_entropy_std": 0.29188603162765503, "geo/layer_27/stable_rank_q_proj": 42.628517150878906, "geo/layer_27/stable_rank_k_proj": 31.592998504638672, "geo/layer_27/stable_rank_o_proj": 117.03623962402344, "geo/layer_27/stable_rank_gate_proj": 84.35096740722656, "geo/layer_27/stable_rank_down_proj": 130.614013671875, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09258255362510681, "geo/layer_27/attn_entropy_mean": 4.275355339050293, "geo/layer_27/attn_entropy_std": 0.669352114200592, "attnres/final_alpha/block_0": 0.23929578065872192, "attnres/block_norm/0": 1.730220079421997, "attnres/final_alpha/block_1": 0.0050104595720767975, "attnres/block_norm/1": 41568.6171875, "attnres/final_alpha/block_2": 0.010729857720434666, "attnres/block_norm/2": 26935.91015625, "attnres/final_alpha/block_3": 0.01297617144882679, "attnres/block_norm/3": 49319.5078125, "attnres/final_alpha/block_4": 0.015551536343991756, "attnres/block_norm/4": 13355.3056640625, "attnres/final_alpha/block_5": 0.6029279232025146, "attnres/block_norm/5": 6109.119140625, "attnres/final_alpha/block_6": 0.11350826919078827, "attnres/block_norm/6": 32766.8359375, "geo/tier1_time_s": 1.3569962978363037, "geo/step": 43950.0, "geo/rankme_slope": -9.640551533113246e-05} {"step": 43960, "timestamp": 1778242052.280991, "train/loss": 2.176348423957825, "train/z_loss": 0.0014123094384558498, "train/perplexity": 8.8140622053597, "train/grad_norm": 0.1025390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1721236.896050171, "perf/iters_per_sec": 0.8207497101069312, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2183982372283935, "data/tokens_consumed": 92192899072, "data/tokens_consumed_B": 92.192899072, "train/loss_slope": -2.100524571147218e-05} {"step": 43970, "timestamp": 1778242062.6241944, "train/loss": 2.1472887992858887, "train/z_loss": 0.0014100470580160618, "train/perplexity": 8.561614646435702, "train/grad_norm": 0.1005859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029074.470907126, "perf/iters_per_sec": 0.9675381044898634, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033551025390625, "data/tokens_consumed": 92213870592, "data/tokens_consumed_B": 92.213870592, "train/loss_slope": -2.4985320769091626e-05} {"step": 43980, "timestamp": 1778242072.9801266, "train/loss": 2.148655152320862, "train/z_loss": 0.0013980491552501918, "train/perplexity": 8.573320830160752, "train/grad_norm": 0.12451171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026043.5568057701, "perf/iters_per_sec": 0.9660928520230151, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350971937179565, "data/tokens_consumed": 92234842112, "data/tokens_consumed_B": 92.234842112, "train/loss_slope": -2.5904483491867187e-05} {"step": 43990, "timestamp": 1778242083.3372717, "train/loss": 2.1964018821716307, "train/z_loss": 0.0013844985631294548, "train/perplexity": 8.992598788406642, "train/grad_norm": 0.1533203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026168.5381249548, "perf/iters_per_sec": 0.9661524477600836, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350333452224731, "data/tokens_consumed": 92255813632, "data/tokens_consumed_B": 92.255813632, "train/loss_slope": -2.3237265154223046e-05} {"step": 44000, "timestamp": 1778242093.674144, "grad/layer_0/attn": 0.0028089405968785286, "grad/layer_0/mlp": 0.0028129704296588898, "grad/layer_0/attn_mlp_ratio": 0.9985673746888312, "grad/layer_4/attn": 0.0027116124983876944, "grad/layer_4/mlp": 0.0025525251403450966, "grad/layer_4/attn_mlp_ratio": 1.0623254397363333, "grad/layer_8/attn": 0.004320891108363867, "grad/layer_8/mlp": 0.003597839502617717, "grad/layer_8/attn_mlp_ratio": 1.2009682436148827, "grad/layer_12/attn": 0.004100896883755922, "grad/layer_12/mlp": 0.006510102655738592, "grad/layer_12/attn_mlp_ratio": 0.6299281344124749, "grad/layer_16/attn": 0.006007087882608175, "grad/layer_16/mlp": 0.0044721378944814205, "grad/layer_16/attn_mlp_ratio": 1.3432250726656625, "grad/layer_20/attn": 0.004593139048665762, "grad/layer_20/mlp": 0.005598071496933699, "grad/layer_20/attn_mlp_ratio": 0.8204859421915253, "grad/layer_24/attn": 0.004748296923935413, "grad/layer_24/mlp": 0.007337772287428379, "grad/layer_24/attn_mlp_ratio": 0.6471033268993941, "grad/layer_27/attn": 0.004658971913158894, "grad/layer_27/mlp": 0.006314926780760288, "grad/layer_27/attn_mlp_ratio": 0.7377713156669163} {"step": 44000, "timestamp": 1778242093.6885142, "train/loss": 2.1734365224838257, "train/z_loss": 0.0013889778638258576, "train/perplexity": 8.78843385635454, "train/grad_norm": 0.095703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027377.5174665276, "perf/iters_per_sec": 0.9667289340336455, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034416127204895, "data/tokens_consumed": 92276785152, "data/tokens_consumed_B": 92.276785152, "train/loss_slope": -2.1884132361505e-05} {"step": 44000, "timestamp": 1778242100.9401317, "geo/ww_alpha_mean": 7.502901523222909, "geo/ww_alpha_std": 4.726862081400053, "geo/ww_alpha_min": 1.345825134433091, "geo/ww_alpha_max": 42.6141232627258, "geo/ww_alpha_healthy_frac": 0.16243654822335024, "geo/ww_alpha_by_type/q_proj": 4.0395260902117744, "geo/ww_alpha_by_type/k_proj": 4.521558109101496, "geo/ww_alpha_by_type/v_proj": 7.503312432503867, "geo/ww_alpha_by_type/o_proj": 7.762155255847836, "geo/ww_alpha_by_type/gate_proj": 7.881317966113053, "geo/ww_alpha_by_type/up_proj": 12.86381408786505, "geo/ww_alpha_by_type/down_proj": 8.048190843025436, "geo/twonn_id/layer_0": 0.6894094347953796, "geo/twonn_id/layer_7": 3.2954862117767334, "geo/twonn_id/layer_14": 4.53764009475708, "geo/twonn_id/layer_21": 6.912782669067383, "geo/twonn_id/layer_27": 5.603038311004639, "geo/tier2_time_s": 7.245211839675903} {"step": 44000, "timestamp": 1778242101.803632, "eoc/jacobian_sigma/layer_0/attn": 1072.0335693359375, "eoc/jacobian_sigma/layer_0/mlp": 7023.1640625, "eoc/jacobian_sigma/layer_0": 7023.1640625, "eoc/jacobian_sigma/layer_7/attn": 1.1530015468597412, "eoc/jacobian_sigma/layer_7/mlp": 1.770301342010498, "eoc/jacobian_sigma/layer_7": 1.770301342010498, "eoc/jacobian_sigma/layer_14/attn": 1.6013689041137695, "eoc/jacobian_sigma/layer_14/mlp": 6.26978874206543, "eoc/jacobian_sigma/layer_14": 6.26978874206543, "eoc/jacobian_sigma/layer_21/attn": 1.084296464920044, "eoc/jacobian_sigma/layer_21/mlp": 3.997880458831787, "eoc/jacobian_sigma/layer_21": 3.997880458831787, "eoc/jacobian_sigma/layer_27/attn": 3.3050591945648193, "eoc/jacobian_sigma/layer_27/mlp": 28.029747009277344, "eoc/jacobian_sigma/layer_27": 28.029747009277344, "eoc/layer0_sigma": 7023.1640625, "eoc/sigma_max": 28.029747009277344, "eoc/sigma_min": 1.770301342010498, "eoc/sigma_mean": 10.016929388046265, "eoc/time_s": 0.8535375595092773} {"step": 44010, "timestamp": 1778242112.1748729, "train/loss": 2.1519126653671266, "train/z_loss": 0.001391334889922291, "train/perplexity": 8.601294071477835, "train/grad_norm": 0.10400390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1134742.6494755747, "perf/iters_per_sec": 0.5410874602678178, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.8481300592422485, "data/tokens_consumed": 92297756672, "data/tokens_consumed_B": 92.297756672, "train/loss_slope": -2.36180200542446e-05} {"step": 44020, "timestamp": 1778242122.5186038, "train/loss": 2.1563652396202087, "train/z_loss": 0.0014008064405061304, "train/perplexity": 8.639677360812898, "train/grad_norm": 0.12890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028940.847152271, "perf/iters_per_sec": 0.9674743877183299, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0336190938949585, "data/tokens_consumed": 92318728192, "data/tokens_consumed_B": 92.318728192, "train/loss_slope": -2.457242628636027e-05} {"step": 44025, "timestamp": 1778242128.29025, "eos/sharpness": 33.81030559539794, "eos/L0_probe": 1.9903368949890137, "eos/L_plus": 2.1353230476379395, "eos/L_minus": 2.1834537982940674, "eos/grad_norm": 0.09911385178565979, "eos/embed_grad_frac": 0.20544393360614777, "eos/time_s": 0.6101465225219727} {"step": 44025, "timestamp": 1778242129.670404, "geo/rankme_last": 438.8568115234375, "geo/layer_0/stable_rank_q_proj": 19.358654022216797, "geo/layer_0/stable_rank_k_proj": 16.41612434387207, "geo/layer_0/stable_rank_o_proj": 48.678199768066406, "geo/layer_0/stable_rank_gate_proj": 136.0365753173828, "geo/layer_0/stable_rank_down_proj": 53.41034698486328, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.058761343359947205, "geo/layer_0/attn_entropy_mean": 6.189252853393555, "geo/layer_0/attn_entropy_std": 0.38262781500816345, "geo/layer_7/stable_rank_q_proj": 43.00701141357422, "geo/layer_7/stable_rank_k_proj": 42.58588409423828, "geo/layer_7/stable_rank_o_proj": 95.81132507324219, "geo/layer_7/stable_rank_gate_proj": 88.96996307373047, "geo/layer_7/stable_rank_down_proj": 145.8131561279297, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4809305965900421, "geo/layer_7/attn_entropy_mean": 4.646639823913574, "geo/layer_7/attn_entropy_std": 0.828120231628418, "geo/layer_14/stable_rank_q_proj": 54.31840515136719, "geo/layer_14/stable_rank_k_proj": 37.96114730834961, "geo/layer_14/stable_rank_o_proj": 47.208614349365234, "geo/layer_14/stable_rank_gate_proj": 75.54456329345703, "geo/layer_14/stable_rank_down_proj": 133.72584533691406, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3871038556098938, "geo/layer_14/attn_entropy_mean": 5.573322296142578, "geo/layer_14/attn_entropy_std": 0.35532182455062866, "geo/layer_21/stable_rank_q_proj": 42.95294952392578, "geo/layer_21/stable_rank_k_proj": 30.754573822021484, "geo/layer_21/stable_rank_o_proj": 75.32074737548828, "geo/layer_21/stable_rank_gate_proj": 71.43975067138672, "geo/layer_21/stable_rank_down_proj": 54.5949821472168, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14494438469409943, "geo/layer_21/attn_entropy_mean": 5.708971977233887, "geo/layer_21/attn_entropy_std": 0.28707441687583923, "geo/layer_27/stable_rank_q_proj": 42.63941192626953, "geo/layer_27/stable_rank_k_proj": 31.576852798461914, "geo/layer_27/stable_rank_o_proj": 117.04447174072266, "geo/layer_27/stable_rank_gate_proj": 84.43327331542969, "geo/layer_27/stable_rank_down_proj": 130.77581787109375, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0870075672864914, "geo/layer_27/attn_entropy_mean": 4.247978210449219, "geo/layer_27/attn_entropy_std": 0.6836656928062439, "attnres/final_alpha/block_0": 0.2407483458518982, "attnres/block_norm/0": 1.7303392887115479, "attnres/final_alpha/block_1": 0.005025898106396198, "attnres/block_norm/1": 41494.09375, "attnres/final_alpha/block_2": 0.010958063416182995, "attnres/block_norm/2": 26904.626953125, "attnres/final_alpha/block_3": 0.01307572890073061, "attnres/block_norm/3": 48918.3671875, "attnres/final_alpha/block_4": 0.01561675127595663, "attnres/block_norm/4": 13374.111328125, "attnres/final_alpha/block_5": 0.5994199514389038, "attnres/block_norm/5": 6110.94580078125, "attnres/final_alpha/block_6": 0.11515524983406067, "attnres/block_norm/6": 32930.85546875, "geo/tier1_time_s": 1.3617022037506104, "geo/step": 44025.0, "geo/rankme_slope": -0.00011266971632402961} {"step": 44030, "timestamp": 1778242134.8540616, "train/loss": 2.143385910987854, "train/z_loss": 0.0013984794262796641, "train/perplexity": 8.528264743626178, "train/grad_norm": 0.298828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1700782.17793412, "perf/iters_per_sec": 0.8109961404486274, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2330514907836914, "data/tokens_consumed": 92339699712, "data/tokens_consumed_B": 92.339699712, "train/loss_slope": -2.289860761455322e-05} {"step": 44040, "timestamp": 1778242145.1996627, "train/loss": 2.162555122375488, "train/z_loss": 0.0014018367161042987, "train/perplexity": 8.693321805851843, "train/grad_norm": 0.1728515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028462.7072385051, "perf/iters_per_sec": 0.9672463928406263, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0338627338409423, "data/tokens_consumed": 92360671232, "data/tokens_consumed_B": 92.360671232, "train/loss_slope": -2.1443892312128522e-05} {"step": 44050, "timestamp": 1778242155.5337546, "grad/layer_0/attn": 0.0024340541567653418, "grad/layer_0/mlp": 0.002803090028464794, "grad/layer_0/attn_mlp_ratio": 0.868346733502441, "grad/layer_4/attn": 0.003067266196012497, "grad/layer_4/mlp": 0.002386684063822031, "grad/layer_4/attn_mlp_ratio": 1.2851579788004222, "grad/layer_8/attn": 0.006204855162650347, "grad/layer_8/mlp": 0.003680650144815445, "grad/layer_8/attn_mlp_ratio": 1.6858040699168553, "grad/layer_12/attn": 0.007146276067942381, "grad/layer_12/mlp": 0.006686253938823938, "grad/layer_12/attn_mlp_ratio": 1.0688011592810132, "grad/layer_16/attn": 0.0034357348922640085, "grad/layer_16/mlp": 0.004410633817315102, "grad/layer_16/attn_mlp_ratio": 0.7789662340318314, "grad/layer_20/attn": 0.0045933835208415985, "grad/layer_20/mlp": 0.005606880411505699, "grad/layer_20/attn_mlp_ratio": 0.819240487008001, "grad/layer_24/attn": 0.0044832127168774605, "grad/layer_24/mlp": 0.007623268757015467, "grad/layer_24/attn_mlp_ratio": 0.5880958419499653, "grad/layer_27/attn": 0.0058406805619597435, "grad/layer_27/mlp": 0.006297273561358452, "grad/layer_27/attn_mlp_ratio": 0.9274935275243841} {"step": 44050, "timestamp": 1778242155.5500386, "train/loss": 2.241496515274048, "train/z_loss": 0.0013853168347850442, "train/perplexity": 9.40739907489256, "train/grad_norm": 0.0927734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027685.269646555, "perf/iters_per_sec": 0.9668756817086005, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342591285705567, "data/tokens_consumed": 92381642752, "data/tokens_consumed_B": 92.381642752, "train/loss_slope": -1.6694597707222475e-05} {"step": 44060, "timestamp": 1778242165.9014862, "train/loss": 2.136322724819183, "train/z_loss": 0.001401363604236394, "train/perplexity": 8.468240253664476, "train/grad_norm": 0.1240234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026958.1715260877, "perf/iters_per_sec": 0.966528974307102, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346301317214965, "data/tokens_consumed": 92402614272, "data/tokens_consumed_B": 92.402614272, "train/loss_slope": -1.744457513693033e-05} {"step": 44070, "timestamp": 1778242176.243586, "train/loss": 2.1457571506500246, "train/z_loss": 0.0013967776438221335, "train/perplexity": 8.548511298465863, "train/grad_norm": 0.1142578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029122.5891293513, "perf/iters_per_sec": 0.9675610490462071, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0335265159606934, "data/tokens_consumed": 92423585792, "data/tokens_consumed_B": 92.423585792, "train/loss_slope": -1.7796478742169697e-05} {"step": 44080, "timestamp": 1778242186.5936494, "train/loss": 2.14921875, "train/z_loss": 0.001408805837854743, "train/perplexity": 8.57815409576384, "train/grad_norm": 0.1416015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027471.8661768392, "perf/iters_per_sec": 0.9667739230045506, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343679904937744, "data/tokens_consumed": 92444557312, "data/tokens_consumed_B": 92.444557312, "train/loss_slope": -1.8823433873748526e-05} {"step": 44090, "timestamp": 1778242196.9547563, "train/loss": 2.1618351697921754, "train/z_loss": 0.0014181629987433553, "train/perplexity": 8.687065278831792, "train/grad_norm": 0.1875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025224.6061561157, "perf/iters_per_sec": 0.9657023459225252, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0355157613754273, "data/tokens_consumed": 92465528832, "data/tokens_consumed_B": 92.465528832, "train/loss_slope": -1.89932717598847e-05} {"step": 44100, "timestamp": 1778242207.288948, "grad/layer_0/attn": 0.0026473072357475758, "grad/layer_0/mlp": 0.0026762487832456827, "grad/layer_0/attn_mlp_ratio": 0.989185741401223, "grad/layer_4/attn": 0.0021441776771098375, "grad/layer_4/mlp": 0.002360341837629676, "grad/layer_4/attn_mlp_ratio": 0.908418243529191, "grad/layer_8/attn": 0.004377859644591808, "grad/layer_8/mlp": 0.003662930568680167, "grad/layer_8/attn_mlp_ratio": 1.1951794998536638, "grad/layer_12/attn": 0.004236274864524603, "grad/layer_12/mlp": 0.0068221441470086575, "grad/layer_12/attn_mlp_ratio": 0.6209594390183271, "grad/layer_16/attn": 0.0032686395570635796, "grad/layer_16/mlp": 0.004257716704159975, "grad/layer_16/attn_mlp_ratio": 0.7676977374046993, "grad/layer_20/attn": 0.0030411984771490097, "grad/layer_20/mlp": 0.005937046837061644, "grad/layer_20/attn_mlp_ratio": 0.5122409355001927, "grad/layer_24/attn": 0.017516782507300377, "grad/layer_24/mlp": 0.010161766782402992, "grad/layer_24/attn_mlp_ratio": 1.7237929889568688, "grad/layer_27/attn": 0.006372781470417976, "grad/layer_27/mlp": 0.010481704957783222, "grad/layer_27/attn_mlp_ratio": 0.6079909170584654} {"step": 44100, "timestamp": 1778242207.88424, "eos/sharpness": 68.28222274780272, "eos/L0_probe": 1.986617088317871, "eos/L_plus": 2.2775042057037354, "eos/L_minus": 2.378552198410034, "eos/grad_norm": 0.17914146184921265, "eos/embed_grad_frac": 0.06330091506242752, "eos/time_s": 0.5923783779144287} {"step": 44100, "timestamp": 1778242207.9043434, "train/loss": 2.1787970304489135, "train/z_loss": 0.0014003963908180595, "train/perplexity": 8.83567081998911, "train/grad_norm": 0.1787109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1916516.5884474274, "perf/iters_per_sec": 0.9138663236844193, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0942519426345825, "data/tokens_consumed": 92486500352, "data/tokens_consumed_B": 92.486500352, "train/loss_slope": -1.3630638028135372e-05} {"step": 44100, "timestamp": 1778242209.2659667, "geo/rankme_last": 439.3514099121094, "geo/layer_0/stable_rank_q_proj": 19.37438201904297, "geo/layer_0/stable_rank_k_proj": 16.383487701416016, "geo/layer_0/stable_rank_o_proj": 48.6406135559082, "geo/layer_0/stable_rank_gate_proj": 135.83006286621094, "geo/layer_0/stable_rank_down_proj": 53.33583450317383, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06191408634185791, "geo/layer_0/attn_entropy_mean": 6.188560962677002, "geo/layer_0/attn_entropy_std": 0.38500455021858215, "geo/layer_7/stable_rank_q_proj": 42.8956184387207, "geo/layer_7/stable_rank_k_proj": 42.491153717041016, "geo/layer_7/stable_rank_o_proj": 95.73355102539062, "geo/layer_7/stable_rank_gate_proj": 88.97904968261719, "geo/layer_7/stable_rank_down_proj": 146.02273559570312, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4899129569530487, "geo/layer_7/attn_entropy_mean": 4.64216423034668, "geo/layer_7/attn_entropy_std": 0.8303847908973694, "geo/layer_14/stable_rank_q_proj": 54.26874542236328, "geo/layer_14/stable_rank_k_proj": 37.919288635253906, "geo/layer_14/stable_rank_o_proj": 47.2003288269043, "geo/layer_14/stable_rank_gate_proj": 75.34095001220703, "geo/layer_14/stable_rank_down_proj": 133.5874481201172, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3792494535446167, "geo/layer_14/attn_entropy_mean": 5.5275444984436035, "geo/layer_14/attn_entropy_std": 0.3723197281360626, "geo/layer_21/stable_rank_q_proj": 42.97683334350586, "geo/layer_21/stable_rank_k_proj": 30.615198135375977, "geo/layer_21/stable_rank_o_proj": 75.22898864746094, "geo/layer_21/stable_rank_gate_proj": 71.52169036865234, "geo/layer_21/stable_rank_down_proj": 54.58848571777344, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1422773152589798, "geo/layer_21/attn_entropy_mean": 5.702877521514893, "geo/layer_21/attn_entropy_std": 0.2892189621925354, "geo/layer_27/stable_rank_q_proj": 42.797977447509766, "geo/layer_27/stable_rank_k_proj": 31.640792846679688, "geo/layer_27/stable_rank_o_proj": 116.94093322753906, "geo/layer_27/stable_rank_gate_proj": 84.23744201660156, "geo/layer_27/stable_rank_down_proj": 130.78604125976562, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08286865800619125, "geo/layer_27/attn_entropy_mean": 4.235315322875977, "geo/layer_27/attn_entropy_std": 0.6893567442893982, "attnres/final_alpha/block_0": 0.24200668931007385, "attnres/block_norm/0": 1.7305865287780762, "attnres/final_alpha/block_1": 0.005002405494451523, "attnres/block_norm/1": 41757.4375, "attnres/final_alpha/block_2": 0.010784805752336979, "attnres/block_norm/2": 26924.728515625, "attnres/final_alpha/block_3": 0.012675202451646328, "attnres/block_norm/3": 49830.26953125, "attnres/final_alpha/block_4": 0.01573324389755726, "attnres/block_norm/4": 13323.482421875, "attnres/final_alpha/block_5": 0.5975357294082642, "attnres/block_norm/5": 6125.7490234375, "attnres/final_alpha/block_6": 0.11626192927360535, "attnres/block_norm/6": 32686.265625, "geo/tier1_time_s": 1.3584017753601074, "geo/step": 44100.0, "geo/rankme_slope": -0.00011214489702130853} {"step": 44110, "timestamp": 1778242219.613708, "train/loss": 2.203395700454712, "train/z_loss": 0.0014044833020307123, "train/perplexity": 9.055711833553518, "train/grad_norm": 0.09912109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1791469.0239336386, "perf/iters_per_sec": 0.8542389983814423, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.170632576942444, "data/tokens_consumed": 92507471872, "data/tokens_consumed_B": 92.507471872, "train/loss_slope": -1.121483631689125e-05} {"step": 44120, "timestamp": 1778242229.9576013, "train/loss": 2.143654668331146, "train/z_loss": 0.0013928350061178207, "train/perplexity": 8.530557085429615, "train/grad_norm": 0.26953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028412.7024907405, "perf/iters_per_sec": 0.9672225487188055, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0338882207870483, "data/tokens_consumed": 92528443392, "data/tokens_consumed_B": 92.528443392, "train/loss_slope": -8.715057566185221e-06} {"step": 44130, "timestamp": 1778242240.9081914, "train/loss": 2.1546631336212156, "train/z_loss": 0.0014158247970044613, "train/perplexity": 8.62498422233464, "train/grad_norm": 0.1015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1916343.3518965694, "perf/iters_per_sec": 0.9137837180598113, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0943508625030518, "data/tokens_consumed": 92549414912, "data/tokens_consumed_B": 92.549414912, "train/loss_slope": -1.0008012665451878e-05} {"step": 44140, "timestamp": 1778242251.76871, "train/loss": 2.1244861125946044, "train/z_loss": 0.001402028847951442, "train/perplexity": 8.36859586693059, "train/grad_norm": 0.1845703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1932375.571676041, "perf/iters_per_sec": 0.9214284761791425, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.085271430015564, "data/tokens_consumed": 92570386432, "data/tokens_consumed_B": 92.570386432, "train/loss_slope": -1.1306134688042323e-05} {"step": 44150, "timestamp": 1778242262.1018343, "grad/layer_0/attn": 0.002602572785690427, "grad/layer_0/mlp": 0.0028375412803143263, "grad/layer_0/attn_mlp_ratio": 0.917192892320791, "grad/layer_4/attn": 0.0025154268369078636, "grad/layer_4/mlp": 0.002480387222021818, "grad/layer_4/attn_mlp_ratio": 1.014126630375406, "grad/layer_8/attn": 0.004123849328607321, "grad/layer_8/mlp": 0.003672919934615493, "grad/layer_8/attn_mlp_ratio": 1.1227713344537964, "grad/layer_12/attn": 0.004621248226612806, "grad/layer_12/mlp": 0.007028140127658844, "grad/layer_12/attn_mlp_ratio": 0.657535005978701, "grad/layer_16/attn": 0.005054602865129709, "grad/layer_16/mlp": 0.00468194205313921, "grad/layer_16/attn_mlp_ratio": 1.0795953259996247, "grad/layer_20/attn": 0.003585778409615159, "grad/layer_20/mlp": 0.005938782822340727, "grad/layer_20/attn_mlp_ratio": 0.6037901126384077, "grad/layer_24/attn": 0.015141848474740982, "grad/layer_24/mlp": 0.0108000123873353, "grad/layer_24/attn_mlp_ratio": 1.4020213858546147, "grad/layer_27/attn": 0.004174868110567331, "grad/layer_27/mlp": 0.01009888295084238, "grad/layer_27/attn_mlp_ratio": 0.41339899566606947} {"step": 44150, "timestamp": 1778242262.1164205, "train/loss": 2.211084079742432, "train/z_loss": 0.0013929485925473274, "train/perplexity": 9.125603914982532, "train/grad_norm": 0.154296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028085.0429613546, "perf/iters_per_sec": 0.967066308479955, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340552568435668, "data/tokens_consumed": 92591357952, "data/tokens_consumed_B": 92.591357952, "train/loss_slope": -6.5043839326750375e-06} {"step": 44160, "timestamp": 1778242272.4599843, "train/loss": 2.1407211303710936, "train/z_loss": 0.001406269276048988, "train/perplexity": 8.505569042005929, "train/grad_norm": 0.10595703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029243.7373272807, "perf/iters_per_sec": 0.9676188170086292, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0334648132324218, "data/tokens_consumed": 92612329472, "data/tokens_consumed_B": 92.612329472, "train/loss_slope": -5.911766577868589e-06} {"step": 44170, "timestamp": 1778242282.8086448, "train/loss": 2.171217179298401, "train/z_loss": 0.0013905317755416036, "train/perplexity": 8.768950933208146, "train/grad_norm": 0.10791015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027600.7161181327, "perf/iters_per_sec": 0.9668353634443916, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034302258491516, "data/tokens_consumed": 92633300992, "data/tokens_consumed_B": 92.633300992, "train/loss_slope": -5.3698168407501035e-06} {"step": 44175, "timestamp": 1778242288.5763843, "eos/sharpness": 65.16959667205809, "eos/L0_probe": 1.9888262748718262, "eos/L_plus": 2.3811230659484863, "eos/L_minus": 2.248225450515747, "eos/grad_norm": 0.17427334189414978, "eos/embed_grad_frac": 0.09384274482727051, "eos/time_s": 0.5988221168518066} {"step": 44175, "timestamp": 1778242289.9544191, "geo/rankme_last": 438.505615234375, "geo/layer_0/stable_rank_q_proj": 19.38205909729004, "geo/layer_0/stable_rank_k_proj": 16.38979148864746, "geo/layer_0/stable_rank_o_proj": 48.63264083862305, "geo/layer_0/stable_rank_gate_proj": 136.09146118164062, "geo/layer_0/stable_rank_down_proj": 53.3514289855957, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05742444470524788, "geo/layer_0/attn_entropy_mean": 6.188054084777832, "geo/layer_0/attn_entropy_std": 0.3863371014595032, "geo/layer_7/stable_rank_q_proj": 42.87483215332031, "geo/layer_7/stable_rank_k_proj": 42.48011016845703, "geo/layer_7/stable_rank_o_proj": 95.49024200439453, "geo/layer_7/stable_rank_gate_proj": 88.9514389038086, "geo/layer_7/stable_rank_down_proj": 145.6542205810547, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.485149085521698, "geo/layer_7/attn_entropy_mean": 4.628542423248291, "geo/layer_7/attn_entropy_std": 0.8122487664222717, "geo/layer_14/stable_rank_q_proj": 54.27132034301758, "geo/layer_14/stable_rank_k_proj": 37.9902458190918, "geo/layer_14/stable_rank_o_proj": 47.18705749511719, "geo/layer_14/stable_rank_gate_proj": 75.22303771972656, "geo/layer_14/stable_rank_down_proj": 133.23703002929688, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38781365752220154, "geo/layer_14/attn_entropy_mean": 5.522434234619141, "geo/layer_14/attn_entropy_std": 0.3748028874397278, "geo/layer_21/stable_rank_q_proj": 42.97169494628906, "geo/layer_21/stable_rank_k_proj": 30.715822219848633, "geo/layer_21/stable_rank_o_proj": 75.18519592285156, "geo/layer_21/stable_rank_gate_proj": 71.35442352294922, "geo/layer_21/stable_rank_down_proj": 54.52735900878906, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1457323133945465, "geo/layer_21/attn_entropy_mean": 5.72382926940918, "geo/layer_21/attn_entropy_std": 0.28386035561561584, "geo/layer_27/stable_rank_q_proj": 42.798179626464844, "geo/layer_27/stable_rank_k_proj": 31.620264053344727, "geo/layer_27/stable_rank_o_proj": 116.60148620605469, "geo/layer_27/stable_rank_gate_proj": 84.16494750976562, "geo/layer_27/stable_rank_down_proj": 130.66583251953125, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08893226832151413, "geo/layer_27/attn_entropy_mean": 4.28731632232666, "geo/layer_27/attn_entropy_std": 0.6955347657203674, "attnres/final_alpha/block_0": 0.2406899631023407, "attnres/block_norm/0": 1.7308062314987183, "attnres/final_alpha/block_1": 0.004986885003745556, "attnres/block_norm/1": 41713.9375, "attnres/final_alpha/block_2": 0.010719990357756615, "attnres/block_norm/2": 26970.556640625, "attnres/final_alpha/block_3": 0.012880977243185043, "attnres/block_norm/3": 49279.4140625, "attnres/final_alpha/block_4": 0.015617393888533115, "attnres/block_norm/4": 13350.294921875, "attnres/final_alpha/block_5": 0.6010438799858093, "attnres/block_norm/5": 6124.61962890625, "attnres/final_alpha/block_6": 0.1140608936548233, "attnres/block_norm/6": 32929.671875, "geo/tier1_time_s": 1.3583383560180664, "geo/step": 44175.0, "geo/rankme_slope": -0.0001350107425782813} {"step": 44180, "timestamp": 1778242295.1317055, "train/loss": 2.1303847670555114, "train/z_loss": 0.0013987495563924312, "train/perplexity": 8.418105197937955, "train/grad_norm": 0.1474609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1702512.707248971, "perf/iters_per_sec": 0.8118213211293083, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2317981481552125, "data/tokens_consumed": 92654272512, "data/tokens_consumed_B": 92.654272512, "train/loss_slope": -6.612896053704526e-06} {"step": 44190, "timestamp": 1778242305.9788263, "train/loss": 2.2062453269958495, "train/z_loss": 0.001397946069482714, "train/perplexity": 9.0815540331649, "train/grad_norm": 0.173828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1934670.9318586234, "perf/iters_per_sec": 0.9225229892056577, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.083983826637268, "data/tokens_consumed": 92675244032, "data/tokens_consumed_B": 92.675244032, "train/loss_slope": -3.972904159255232e-06} {"step": 44200, "timestamp": 1778242316.3173044, "grad/layer_0/attn": 0.0027743901591748, "grad/layer_0/mlp": 0.0029872555751353502, "grad/layer_0/attn_mlp_ratio": 0.9287421168089652, "grad/layer_4/attn": 0.0023059886880218983, "grad/layer_4/mlp": 0.0024612892884761095, "grad/layer_4/attn_mlp_ratio": 0.9369027058819911, "grad/layer_8/attn": 0.005824965424835682, "grad/layer_8/mlp": 0.0036742144729942083, "grad/layer_8/attn_mlp_ratio": 1.5853634318609686, "grad/layer_12/attn": 0.005852865986526012, "grad/layer_12/mlp": 0.006476539187133312, "grad/layer_12/attn_mlp_ratio": 0.9037026916757338, "grad/layer_16/attn": 0.0043438454158604145, "grad/layer_16/mlp": 0.004500347189605236, "grad/layer_16/attn_mlp_ratio": 0.9652244896507645, "grad/layer_20/attn": 0.0043395827524363995, "grad/layer_20/mlp": 0.00628955801948905, "grad/layer_20/attn_mlp_ratio": 0.6899662376899918, "grad/layer_24/attn": 0.012067001312971115, "grad/layer_24/mlp": 0.011470677331089973, "grad/layer_24/attn_mlp_ratio": 1.051986805963602, "grad/layer_27/attn": 0.004449687898159027, "grad/layer_27/mlp": 0.01244934182614088, "grad/layer_27/attn_mlp_ratio": 0.35742354291158646} {"step": 44200, "timestamp": 1778242316.3318489, "train/loss": 2.179331874847412, "train/z_loss": 0.0014023633440956474, "train/perplexity": 8.840397793019001, "train/grad_norm": 0.197265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026875.4067120065, "perf/iters_per_sec": 0.9664895089683564, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346723794937134, "data/tokens_consumed": 92696215552, "data/tokens_consumed_B": 92.696215552, "train/loss_slope": -5.960194441494893e-07} {"step": 44210, "timestamp": 1778242327.108493, "train/loss": 2.1167386293411257, "train/z_loss": 0.001423859514761716, "train/perplexity": 8.304010819434739, "train/grad_norm": 0.1142578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1946962.7419568114, "perf/iters_per_sec": 0.9283841810020501, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0771402835845947, "data/tokens_consumed": 92717187072, "data/tokens_consumed_B": 92.717187072, "train/loss_slope": -3.004120356894247e-06} {"step": 44220, "timestamp": 1778242337.4651537, "train/loss": 2.1433295965194703, "train/z_loss": 0.0014040637761354446, "train/perplexity": 8.527784492453577, "train/grad_norm": 0.1298828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026395.0185415717, "perf/iters_per_sec": 0.9662604420383318, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034917664527893, "data/tokens_consumed": 92738158592, "data/tokens_consumed_B": 92.738158592, "train/loss_slope": -8.285209648322439e-07} {"step": 44230, "timestamp": 1778242347.8112683, "train/loss": 2.174576961994171, "train/z_loss": 0.0014090960728935897, "train/perplexity": 8.798462250860132, "train/grad_norm": 0.10107421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028065.07626975, "perf/iters_per_sec": 0.9670567876194716, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340654373168945, "data/tokens_consumed": 92759130112, "data/tokens_consumed_B": 92.759130112, "train/loss_slope": 4.788217335680025e-07} {"step": 44240, "timestamp": 1778242358.1583805, "train/loss": 2.143439996242523, "train/z_loss": 0.0014186456217430532, "train/perplexity": 8.52872600947045, "train/grad_norm": 0.2119140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028197.6961356648, "perf/iters_per_sec": 0.9671200256994557, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0339978218078614, "data/tokens_consumed": 92780101632, "data/tokens_consumed_B": 92.780101632, "train/loss_slope": 5.736424620359381e-07} {"step": 44250, "timestamp": 1778242368.497634, "grad/layer_0/attn": 0.0026495400816202164, "grad/layer_0/mlp": 0.0028810380026698112, "grad/layer_0/attn_mlp_ratio": 0.9196477058616235, "grad/layer_4/attn": 0.002737782197073102, "grad/layer_4/mlp": 0.0025977471377700567, "grad/layer_4/attn_mlp_ratio": 1.053906306690274, "grad/layer_8/attn": 0.004102685488760471, "grad/layer_8/mlp": 0.0036626863293349743, "grad/layer_8/attn_mlp_ratio": 1.1201301470695066, "grad/layer_12/attn": 0.004149159416556358, "grad/layer_12/mlp": 0.006594661623239517, "grad/layer_12/attn_mlp_ratio": 0.6291694086346787, "grad/layer_16/attn": 0.0039987461641430855, "grad/layer_16/mlp": 0.004371136426925659, "grad/layer_16/attn_mlp_ratio": 0.914806970569623, "grad/layer_20/attn": 0.0037978002801537514, "grad/layer_20/mlp": 0.005491027142852545, "grad/layer_20/attn_mlp_ratio": 0.6916374864279174, "grad/layer_24/attn": 0.014080718159675598, "grad/layer_24/mlp": 0.009036317467689514, "grad/layer_24/attn_mlp_ratio": 1.5582363118825053, "grad/layer_27/attn": 0.007273311261087656, "grad/layer_27/mlp": 0.008439471013844013, "grad/layer_27/attn_mlp_ratio": 0.8618207424345109} {"step": 44250, "timestamp": 1778242369.0991573, "eos/sharpness": 56.809997558593736, "eos/L0_probe": 1.9877246618270874, "eos/L_plus": 2.244546413421631, "eos/L_minus": 2.2990028858184814, "eos/grad_norm": 0.15090268850326538, "eos/embed_grad_frac": 0.09651162475347519, "eos/time_s": 0.598717212677002} {"step": 44250, "timestamp": 1778242369.1171546, "train/loss": 2.171479415893555, "train/z_loss": 0.001404638437088579, "train/perplexity": 8.771250774582047, "train/grad_norm": 0.1513671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1914974.011449316, "perf/iters_per_sec": 0.9131307656523304, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.095133399963379, "data/tokens_consumed": 92801073152, "data/tokens_consumed_B": 92.801073152, "train/loss_slope": -1.0555371986364186e-06} {"step": 44250, "timestamp": 1778242370.4832644, "geo/rankme_last": 439.43853759765625, "geo/layer_0/stable_rank_q_proj": 19.34844207763672, "geo/layer_0/stable_rank_k_proj": 16.39116859436035, "geo/layer_0/stable_rank_o_proj": 48.66181564331055, "geo/layer_0/stable_rank_gate_proj": 136.07351684570312, "geo/layer_0/stable_rank_down_proj": 53.364776611328125, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.061343539506196976, "geo/layer_0/attn_entropy_mean": 6.185299396514893, "geo/layer_0/attn_entropy_std": 0.3858310282230377, "geo/layer_7/stable_rank_q_proj": 42.957584381103516, "geo/layer_7/stable_rank_k_proj": 42.42877197265625, "geo/layer_7/stable_rank_o_proj": 95.80024719238281, "geo/layer_7/stable_rank_gate_proj": 88.81412506103516, "geo/layer_7/stable_rank_down_proj": 146.20864868164062, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4884088337421417, "geo/layer_7/attn_entropy_mean": 4.653507232666016, "geo/layer_7/attn_entropy_std": 0.8143126964569092, "geo/layer_14/stable_rank_q_proj": 54.38090896606445, "geo/layer_14/stable_rank_k_proj": 38.08504104614258, "geo/layer_14/stable_rank_o_proj": 47.20222473144531, "geo/layer_14/stable_rank_gate_proj": 75.1939468383789, "geo/layer_14/stable_rank_down_proj": 132.78717041015625, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3822084665298462, "geo/layer_14/attn_entropy_mean": 5.524789810180664, "geo/layer_14/attn_entropy_std": 0.36037495732307434, "geo/layer_21/stable_rank_q_proj": 42.98967742919922, "geo/layer_21/stable_rank_k_proj": 30.76190185546875, "geo/layer_21/stable_rank_o_proj": 75.10491943359375, "geo/layer_21/stable_rank_gate_proj": 71.3443832397461, "geo/layer_21/stable_rank_down_proj": 54.46857452392578, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13885393738746643, "geo/layer_21/attn_entropy_mean": 5.710230827331543, "geo/layer_21/attn_entropy_std": 0.2919377386569977, "geo/layer_27/stable_rank_q_proj": 42.751060485839844, "geo/layer_27/stable_rank_k_proj": 31.588708877563477, "geo/layer_27/stable_rank_o_proj": 116.18214416503906, "geo/layer_27/stable_rank_gate_proj": 84.09170532226562, "geo/layer_27/stable_rank_down_proj": 130.7333984375, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08570399135351181, "geo/layer_27/attn_entropy_mean": 4.276504993438721, "geo/layer_27/attn_entropy_std": 0.665054976940155, "attnres/final_alpha/block_0": 0.24140223860740662, "attnres/block_norm/0": 1.7310850620269775, "attnres/final_alpha/block_1": 0.004987915046513081, "attnres/block_norm/1": 41634.6328125, "attnres/final_alpha/block_2": 0.010841465555131435, "attnres/block_norm/2": 26859.41015625, "attnres/final_alpha/block_3": 0.012983839958906174, "attnres/block_norm/3": 49682.921875, "attnres/final_alpha/block_4": 0.015810184180736542, "attnres/block_norm/4": 13388.642578125, "attnres/final_alpha/block_5": 0.5984926819801331, "attnres/block_norm/5": 6175.97265625, "attnres/final_alpha/block_6": 0.1154816523194313, "attnres/block_norm/6": 32781.078125, "geo/tier1_time_s": 1.362144947052002, "geo/step": 44250.0, "geo/rankme_slope": -0.0001118375475190076} {"step": 44260, "timestamp": 1778242380.8302205, "train/loss": 2.163999152183533, "train/z_loss": 0.001397620909847319, "train/perplexity": 8.705884289788088, "train/grad_norm": 0.322265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1791021.8510923204, "perf/iters_per_sec": 0.8540257697545626, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1709248542785644, "data/tokens_consumed": 92822044672, "data/tokens_consumed_B": 92.822044672, "train/loss_slope": -5.30432689093353e-07} {"step": 44270, "timestamp": 1778242391.2262895, "train/loss": 2.117244100570679, "train/z_loss": 0.0014098024461418391, "train/perplexity": 8.308209319014841, "train/grad_norm": 0.189453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2018945.1582640249, "perf/iters_per_sec": 0.9627080718345761, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0387364864349364, "data/tokens_consumed": 92843016192, "data/tokens_consumed_B": 92.843016192, "train/loss_slope": -7.193851463793994e-06} {"step": 44280, "timestamp": 1778242401.6105196, "train/loss": 2.1909213304519652, "train/z_loss": 0.0013961804332211613, "train/perplexity": 8.943449192127991, "train/grad_norm": 0.2041015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021063.3709730578, "perf/iters_per_sec": 0.9637181143632211, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037647819519043, "data/tokens_consumed": 92863987712, "data/tokens_consumed_B": 92.863987712, "train/loss_slope": -4.535073766184762e-06} {"step": 44290, "timestamp": 1778242411.9891543, "train/loss": 2.1859209060668947, "train/z_loss": 0.0014099126681685447, "train/perplexity": 8.898839776657516, "train/grad_norm": 0.177734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021664.4526215212, "perf/iters_per_sec": 0.9640047324283224, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373393058776856, "data/tokens_consumed": 92884959232, "data/tokens_consumed_B": 92.884959232, "train/loss_slope": -9.865423097218458e-07} {"step": 44300, "timestamp": 1778242422.3560417, "grad/layer_0/attn": 0.002979816636070609, "grad/layer_0/mlp": 0.0029902446549385786, "grad/layer_0/attn_mlp_ratio": 0.9965126202961323, "grad/layer_4/attn": 0.0023322023916989565, "grad/layer_4/mlp": 0.002561607165262103, "grad/layer_4/attn_mlp_ratio": 0.9104449473289287, "grad/layer_8/attn": 0.003566035069525242, "grad/layer_8/mlp": 0.003729460062459111, "grad/layer_8/attn_mlp_ratio": 0.956179960151093, "grad/layer_12/attn": 0.005308149382472038, "grad/layer_12/mlp": 0.00699372636154294, "grad/layer_12/attn_mlp_ratio": 0.7589872740463125, "grad/layer_16/attn": 0.005774871911853552, "grad/layer_16/mlp": 0.004486910067498684, "grad/layer_16/attn_mlp_ratio": 1.287048703066161, "grad/layer_20/attn": 0.006446977611631155, "grad/layer_20/mlp": 0.0059296307153999805, "grad/layer_20/attn_mlp_ratio": 1.0872477245779892, "grad/layer_24/attn": 0.010509461164474487, "grad/layer_24/mlp": 0.009244963526725769, "grad/layer_24/attn_mlp_ratio": 1.13677690781749, "grad/layer_27/attn": 0.008514809422194958, "grad/layer_27/mlp": 0.00875831674784422, "grad/layer_27/attn_mlp_ratio": 0.9721970065847528} {"step": 44300, "timestamp": 1778242422.3728497, "train/loss": 2.16118586063385, "train/z_loss": 0.0014149548951536417, "train/perplexity": 8.681426518634705, "train/grad_norm": 0.1650390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021048.4181667631, "perf/iters_per_sec": 0.9637109843095604, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03765549659729, "data/tokens_consumed": 92905930752, "data/tokens_consumed_B": 92.905930752, "train/loss_slope": -3.0831772967068674e-06} {"step": 44310, "timestamp": 1778242432.7436361, "train/loss": 2.154028058052063, "train/z_loss": 0.0013960665324702858, "train/perplexity": 8.619508444521104, "train/grad_norm": 0.1943359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023146.3393037675, "perf/iters_per_sec": 0.9647113510626638, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0365794897079468, "data/tokens_consumed": 92926902272, "data/tokens_consumed_B": 92.926902272, "train/loss_slope": -3.1423722306350816e-06} {"step": 44320, "timestamp": 1778242443.1083412, "train/loss": 2.1727829456329344, "train/z_loss": 0.0013833994162268937, "train/perplexity": 8.78269181606714, "train/grad_norm": 0.12451171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025171.7768194051, "perf/iters_per_sec": 0.9656771549317384, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0355427742004395, "data/tokens_consumed": 92947873792, "data/tokens_consumed_B": 92.947873792, "train/loss_slope": 1.8844719659877745e-07} {"step": 44325, "timestamp": 1778242448.8687172, "eos/sharpness": 62.82277107238768, "eos/L0_probe": 1.9855396747589111, "eos/L_plus": 2.3653945922851562, "eos/L_minus": 2.233912467956543, "eos/grad_norm": 0.1587989628314972, "eos/embed_grad_frac": 0.0777796059846878, "eos/time_s": 0.596750020980835} {"step": 44325, "timestamp": 1778242450.2488916, "geo/rankme_last": 438.9091491699219, "geo/layer_0/stable_rank_q_proj": 19.342288970947266, "geo/layer_0/stable_rank_k_proj": 16.386539459228516, "geo/layer_0/stable_rank_o_proj": 48.63692855834961, "geo/layer_0/stable_rank_gate_proj": 135.99378967285156, "geo/layer_0/stable_rank_down_proj": 53.32884979248047, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06273742020130157, "geo/layer_0/attn_entropy_mean": 6.1838836669921875, "geo/layer_0/attn_entropy_std": 0.38325589895248413, "geo/layer_7/stable_rank_q_proj": 42.919883728027344, "geo/layer_7/stable_rank_k_proj": 42.50493240356445, "geo/layer_7/stable_rank_o_proj": 95.84849548339844, "geo/layer_7/stable_rank_gate_proj": 88.71337127685547, "geo/layer_7/stable_rank_down_proj": 146.41806030273438, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4883185625076294, "geo/layer_7/attn_entropy_mean": 4.659633636474609, "geo/layer_7/attn_entropy_std": 0.8363502621650696, "geo/layer_14/stable_rank_q_proj": 54.286338806152344, "geo/layer_14/stable_rank_k_proj": 38.067344665527344, "geo/layer_14/stable_rank_o_proj": 47.19934844970703, "geo/layer_14/stable_rank_gate_proj": 75.256103515625, "geo/layer_14/stable_rank_down_proj": 132.95355224609375, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38282516598701477, "geo/layer_14/attn_entropy_mean": 5.519208908081055, "geo/layer_14/attn_entropy_std": 0.37007296085357666, "geo/layer_21/stable_rank_q_proj": 42.990604400634766, "geo/layer_21/stable_rank_k_proj": 30.716936111450195, "geo/layer_21/stable_rank_o_proj": 75.20938110351562, "geo/layer_21/stable_rank_gate_proj": 71.39823150634766, "geo/layer_21/stable_rank_down_proj": 54.43792724609375, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14627887308597565, "geo/layer_21/attn_entropy_mean": 5.70749568939209, "geo/layer_21/attn_entropy_std": 0.2895697057247162, "geo/layer_27/stable_rank_q_proj": 42.7706413269043, "geo/layer_27/stable_rank_k_proj": 31.505327224731445, "geo/layer_27/stable_rank_o_proj": 116.1306381225586, "geo/layer_27/stable_rank_gate_proj": 84.10240173339844, "geo/layer_27/stable_rank_down_proj": 130.70401000976562, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08264333754777908, "geo/layer_27/attn_entropy_mean": 4.2581634521484375, "geo/layer_27/attn_entropy_std": 0.676801860332489, "attnres/final_alpha/block_0": 0.2405162751674652, "attnres/block_norm/0": 1.7311413288116455, "attnres/final_alpha/block_1": 0.004945657216012478, "attnres/block_norm/1": 41700.25, "attnres/final_alpha/block_2": 0.010992733761668205, "attnres/block_norm/2": 27002.9453125, "attnres/final_alpha/block_3": 0.01292220689356327, "attnres/block_norm/3": 49652.4140625, "attnres/final_alpha/block_4": 0.015469708479940891, "attnres/block_norm/4": 13417.72265625, "attnres/final_alpha/block_5": 0.6016706824302673, "attnres/block_norm/5": 6130.6181640625, "attnres/final_alpha/block_6": 0.1134827509522438, "attnres/block_norm/6": 32977.44140625, "geo/tier1_time_s": 1.3622560501098633, "geo/step": 44325.0, "geo/rankme_slope": -0.00011030572385204082} {"step": 44330, "timestamp": 1778242455.4258535, "train/loss": 2.119774651527405, "train/z_loss": 0.0013982592499814928, "train/perplexity": 8.329260290099612, "train/grad_norm": 0.248046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1703688.9418814823, "perf/iters_per_sec": 0.812382193508855, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2309477090835572, "data/tokens_consumed": 92968845312, "data/tokens_consumed_B": 92.968845312, "train/loss_slope": -2.583766040807715e-06} {"step": 44340, "timestamp": 1778242465.7651231, "train/loss": 2.1544970512390136, "train/z_loss": 0.0013986811740323902, "train/perplexity": 8.623551883354965, "train/grad_norm": 0.1220703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029152.266291353, "perf/iters_per_sec": 0.9675752002197995, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0335114002227783, "data/tokens_consumed": 92989816832, "data/tokens_consumed_B": 92.989816832, "train/loss_slope": -5.377642177727134e-06} {"step": 44350, "timestamp": 1778242476.101814, "grad/layer_0/attn": 0.0030407519079744816, "grad/layer_0/mlp": 0.0031903921626508236, "grad/layer_0/attn_mlp_ratio": 0.9530965654511683, "grad/layer_4/attn": 0.004400245379656553, "grad/layer_4/mlp": 0.0026435202453285456, "grad/layer_4/attn_mlp_ratio": 1.6645400090952112, "grad/layer_8/attn": 0.006083739455789328, "grad/layer_8/mlp": 0.0038125519640743732, "grad/layer_8/attn_mlp_ratio": 1.5957131479242794, "grad/layer_12/attn": 0.004152417648583651, "grad/layer_12/mlp": 0.007430813740938902, "grad/layer_12/attn_mlp_ratio": 0.5588106144856648, "grad/layer_16/attn": 0.0061165327206254005, "grad/layer_16/mlp": 0.004880992695689201, "grad/layer_16/attn_mlp_ratio": 1.253132913047404, "grad/layer_20/attn": 0.0056053330190479755, "grad/layer_20/mlp": 0.00706947036087513, "grad/layer_20/attn_mlp_ratio": 0.7928929118623252, "grad/layer_24/attn": 0.02436705119907856, "grad/layer_24/mlp": 0.01563328132033348, "grad/layer_24/attn_mlp_ratio": 1.5586651672108627, "grad/layer_27/attn": 0.007826214656233788, "grad/layer_27/mlp": 0.017025088891386986, "grad/layer_27/attn_mlp_ratio": 0.4596871511328416} {"step": 44350, "timestamp": 1778242476.1162217, "train/loss": 2.164448642730713, "train/z_loss": 0.0013910412788391113, "train/perplexity": 8.709798382089033, "train/grad_norm": 0.34765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026987.5985684309, "perf/iters_per_sec": 0.9665430062143473, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346151113510131, "data/tokens_consumed": 93010788352, "data/tokens_consumed_B": 93.010788352, "train/loss_slope": -2.9268849693616457e-06} {"step": 44360, "timestamp": 1778242486.4684064, "train/loss": 2.184570860862732, "train/z_loss": 0.0013979209237731994, "train/perplexity": 8.886834046657025, "train/grad_norm": 0.11181640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027396.536039266, "perf/iters_per_sec": 0.9667380027958231, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344064235687256, "data/tokens_consumed": 93031759872, "data/tokens_consumed_B": 93.031759872, "train/loss_slope": -1.7147794343528788e-06} {"step": 44370, "timestamp": 1778242496.8074644, "train/loss": 2.1920022487640383, "train/z_loss": 0.0013954997877590359, "train/perplexity": 8.953121556709101, "train/grad_norm": 0.15234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029581.9334341546, "perf/iters_per_sec": 0.9677800814791463, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0332926034927368, "data/tokens_consumed": 93052731392, "data/tokens_consumed_B": 93.052731392, "train/loss_slope": 1.951686455400158e-06} {"step": 44380, "timestamp": 1778242507.1643176, "train/loss": 2.156339955329895, "train/z_loss": 0.0014081060886383056, "train/perplexity": 8.639458915463923, "train/grad_norm": 0.1865234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026172.6453157105, "perf/iters_per_sec": 0.9661544062212517, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035031247138977, "data/tokens_consumed": 93073702912, "data/tokens_consumed_B": 93.073702912, "train/loss_slope": -5.624209657789621e-07} {"step": 44390, "timestamp": 1778242517.5267124, "train/loss": 2.179695987701416, "train/z_loss": 0.001392637111712247, "train/perplexity": 8.843617281582961, "train/grad_norm": 0.09716796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025677.4754872243, "perf/iters_per_sec": 0.9659182908474084, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352842569351197, "data/tokens_consumed": 93094674432, "data/tokens_consumed_B": 93.094674432, "train/loss_slope": -9.99963423027623e-07} {"step": 44400, "timestamp": 1778242527.8615344, "grad/layer_0/attn": 0.0026319106109440327, "grad/layer_0/mlp": 0.0027353165205568075, "grad/layer_0/attn_mlp_ratio": 0.962195963408534, "grad/layer_4/attn": 0.0020045649725943804, "grad/layer_4/mlp": 0.0025736093521118164, "grad/layer_4/attn_mlp_ratio": 0.7788924504258028, "grad/layer_8/attn": 0.004063415341079235, "grad/layer_8/mlp": 0.003654034109786153, "grad/layer_8/attn_mlp_ratio": 1.1120353854916538, "grad/layer_12/attn": 0.0037013234104961157, "grad/layer_12/mlp": 0.00670792069286108, "grad/layer_12/attn_mlp_ratio": 0.5517840065188692, "grad/layer_16/attn": 0.004542105831205845, "grad/layer_16/mlp": 0.004492335487157106, "grad/layer_16/attn_mlp_ratio": 1.011078923887838, "grad/layer_20/attn": 0.0035677591804414988, "grad/layer_20/mlp": 0.006197735667228699, "grad/layer_20/attn_mlp_ratio": 0.575655257732424, "grad/layer_24/attn": 0.012251610867679119, "grad/layer_24/mlp": 0.01139848306775093, "grad/layer_24/attn_mlp_ratio": 1.0748457217835696, "grad/layer_27/attn": 0.004725947976112366, "grad/layer_27/mlp": 0.01225267443805933, "grad/layer_27/attn_mlp_ratio": 0.38570746014942275} {"step": 44400, "timestamp": 1778242528.458361, "eos/sharpness": 67.43013858795165, "eos/L0_probe": 1.9867767095565796, "eos/L_plus": 2.390737295150757, "eos/L_minus": 2.257117509841919, "eos/grad_norm": 0.19424223899841309, "eos/embed_grad_frac": 0.06587863713502884, "eos/time_s": 0.5938172340393066} {"step": 44400, "timestamp": 1778242528.4760926, "train/loss": 2.1607648849487306, "train/z_loss": 0.001417876034975052, "train/perplexity": 8.677772618313766, "train/grad_norm": 0.1943359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1916401.4279540794, "perf/iters_per_sec": 0.913811410882034, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0943176984786986, "data/tokens_consumed": 93115645952, "data/tokens_consumed_B": 93.115645952, "train/loss_slope": -1.1885247643988733e-06} {"step": 44400, "timestamp": 1778242529.8456883, "geo/rankme_last": 438.6070556640625, "geo/layer_0/stable_rank_q_proj": 19.339187622070312, "geo/layer_0/stable_rank_k_proj": 16.390033721923828, "geo/layer_0/stable_rank_o_proj": 48.49639892578125, "geo/layer_0/stable_rank_gate_proj": 135.76719665527344, "geo/layer_0/stable_rank_down_proj": 53.37022399902344, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0601627416908741, "geo/layer_0/attn_entropy_mean": 6.187485694885254, "geo/layer_0/attn_entropy_std": 0.38569363951683044, "geo/layer_7/stable_rank_q_proj": 42.86805725097656, "geo/layer_7/stable_rank_k_proj": 42.53329849243164, "geo/layer_7/stable_rank_o_proj": 95.6875, "geo/layer_7/stable_rank_gate_proj": 88.67957305908203, "geo/layer_7/stable_rank_down_proj": 146.5312042236328, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.48624444007873535, "geo/layer_7/attn_entropy_mean": 4.652205467224121, "geo/layer_7/attn_entropy_std": 0.8150559067726135, "geo/layer_14/stable_rank_q_proj": 54.31528854370117, "geo/layer_14/stable_rank_k_proj": 38.02580261230469, "geo/layer_14/stable_rank_o_proj": 47.145042419433594, "geo/layer_14/stable_rank_gate_proj": 75.2764892578125, "geo/layer_14/stable_rank_down_proj": 133.0264129638672, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3884965777397156, "geo/layer_14/attn_entropy_mean": 5.52340030670166, "geo/layer_14/attn_entropy_std": 0.35673829913139343, "geo/layer_21/stable_rank_q_proj": 42.939884185791016, "geo/layer_21/stable_rank_k_proj": 30.770000457763672, "geo/layer_21/stable_rank_o_proj": 75.14004516601562, "geo/layer_21/stable_rank_gate_proj": 71.3650131225586, "geo/layer_21/stable_rank_down_proj": 54.480072021484375, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14436595141887665, "geo/layer_21/attn_entropy_mean": 5.715209484100342, "geo/layer_21/attn_entropy_std": 0.2796119153499603, "geo/layer_27/stable_rank_q_proj": 42.70899963378906, "geo/layer_27/stable_rank_k_proj": 31.529943466186523, "geo/layer_27/stable_rank_o_proj": 116.2861557006836, "geo/layer_27/stable_rank_gate_proj": 84.06086730957031, "geo/layer_27/stable_rank_down_proj": 130.62387084960938, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09509813040494919, "geo/layer_27/attn_entropy_mean": 4.258488655090332, "geo/layer_27/attn_entropy_std": 0.6767668724060059, "attnres/final_alpha/block_0": 0.24005062878131866, "attnres/block_norm/0": 1.7313227653503418, "attnres/final_alpha/block_1": 0.0050670308992266655, "attnres/block_norm/1": 41757.32421875, "attnres/final_alpha/block_2": 0.010969194583594799, "attnres/block_norm/2": 26852.107421875, "attnres/final_alpha/block_3": 0.013118421658873558, "attnres/block_norm/3": 49382.5625, "attnres/final_alpha/block_4": 0.015577202662825584, "attnres/block_norm/4": 13371.53515625, "attnres/final_alpha/block_5": 0.6020376086235046, "attnres/block_norm/5": 6149.36279296875, "attnres/final_alpha/block_6": 0.11317993700504303, "attnres/block_norm/6": 33018.0546875, "geo/tier1_time_s": 1.3609428405761719, "geo/step": 44400.0, "geo/rankme_slope": -0.00010968066914265706} {"step": 44410, "timestamp": 1778242540.1969132, "train/loss": 2.1922914743423463, "train/z_loss": 0.0013921100995503365, "train/perplexity": 8.955711402975842, "train/grad_norm": 0.09765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1790548.9131035202, "perf/iters_per_sec": 0.8538002553479768, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.171234130859375, "data/tokens_consumed": 93136617472, "data/tokens_consumed_B": 93.136617472, "train/loss_slope": 2.454959319727316e-06} {"step": 44420, "timestamp": 1778242550.5413606, "train/loss": 2.183044695854187, "train/z_loss": 0.0013946665334515274, "train/perplexity": 8.873281615751688, "train/grad_norm": 0.15625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029024.1083640582, "perf/iters_per_sec": 0.9675140897579471, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0335766792297363, "data/tokens_consumed": 93157588992, "data/tokens_consumed_B": 93.157588992, "train/loss_slope": 4.646660084843195e-06} {"step": 44430, "timestamp": 1778242560.8941226, "train/loss": 2.154335308074951, "train/z_loss": 0.0014106003916822373, "train/perplexity": 8.622157195581542, "train/grad_norm": 0.10986328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026965.5515611975, "perf/iters_per_sec": 0.9665324933820713, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346263647079468, "data/tokens_consumed": 93178560512, "data/tokens_consumed_B": 93.178560512, "train/loss_slope": 4.9535051967301635e-06} {"step": 44440, "timestamp": 1778242571.238385, "train/loss": 2.1924996852874754, "train/z_loss": 0.0013866259017959238, "train/perplexity": 8.957576274247927, "train/grad_norm": 0.09716796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028395.0213270937, "perf/iters_per_sec": 0.9672141176829785, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0338972330093383, "data/tokens_consumed": 93199532032, "data/tokens_consumed_B": 93.199532032, "train/loss_slope": 9.268673876188124e-06} {"step": 44450, "timestamp": 1778242581.5740054, "grad/layer_0/attn": 0.003198103280737996, "grad/layer_0/mlp": 0.0032063755206763744, "grad/layer_0/attn_mlp_ratio": 0.9974200340455953, "grad/layer_4/attn": 0.002603414934128523, "grad/layer_4/mlp": 0.0024535770062357187, "grad/layer_4/attn_mlp_ratio": 1.0610691335161198, "grad/layer_8/attn": 0.007578219752758741, "grad/layer_8/mlp": 0.003769775154069066, "grad/layer_8/attn_mlp_ratio": 2.0102577055698134, "grad/layer_12/attn": 0.00394385727122426, "grad/layer_12/mlp": 0.006641147192567587, "grad/layer_12/attn_mlp_ratio": 0.5938517996187214, "grad/layer_16/attn": 0.0032749557867646217, "grad/layer_16/mlp": 0.004276845138520002, "grad/layer_16/attn_mlp_ratio": 0.7657410086454092, "grad/layer_20/attn": 0.0033751854207366705, "grad/layer_20/mlp": 0.00622200733050704, "grad/layer_20/attn_mlp_ratio": 0.542459239792586, "grad/layer_24/attn": 0.011470837518572807, "grad/layer_24/mlp": 0.012642469257116318, "grad/layer_24/attn_mlp_ratio": 0.9073257126081931, "grad/layer_27/attn": 0.005096180364489555, "grad/layer_27/mlp": 0.013280991464853287, "grad/layer_27/attn_mlp_ratio": 0.38371987058376333} {"step": 44450, "timestamp": 1778242581.5885613, "train/loss": 2.1821048974990847, "train/z_loss": 0.0013912429683841764, "train/perplexity": 8.86494643759169, "train/grad_norm": 0.1904296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027589.685884822, "perf/iters_per_sec": 0.9668301038192854, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034307885169983, "data/tokens_consumed": 93220503552, "data/tokens_consumed_B": 93.220503552, "train/loss_slope": 9.62884293066072e-06} {"step": 44460, "timestamp": 1778242591.9279413, "train/loss": 2.160040283203125, "train/z_loss": 0.0013975070090964437, "train/perplexity": 8.671486966698655, "train/grad_norm": 0.138671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029555.662192293, "perf/iters_per_sec": 0.9677675543748345, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0333059787750245, "data/tokens_consumed": 93241475072, "data/tokens_consumed_B": 93.241475072, "train/loss_slope": 8.128838456145584e-06} {"step": 44470, "timestamp": 1778242602.2729378, "train/loss": 2.2384584188461303, "train/z_loss": 0.0013839582912623881, "train/perplexity": 9.378861860721292, "train/grad_norm": 0.1923828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028173.3780774947, "perf/iters_per_sec": 0.9671084299457048, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340102195739747, "data/tokens_consumed": 93262446592, "data/tokens_consumed_B": 93.262446592, "train/loss_slope": 1.2905120885375751e-05} {"step": 44475, "timestamp": 1778242608.0390515, "eos/sharpness": 57.89132118225096, "eos/L0_probe": 1.984717845916748, "eos/L_plus": 2.2224185466766357, "eos/L_minus": 2.32593035697937, "eos/grad_norm": 0.1396511346101761, "eos/embed_grad_frac": 0.11810363084077835, "eos/time_s": 0.601611852645874} {"step": 44475, "timestamp": 1778242609.4187295, "geo/rankme_last": 439.38702392578125, "geo/layer_0/stable_rank_q_proj": 19.341381072998047, "geo/layer_0/stable_rank_k_proj": 16.415285110473633, "geo/layer_0/stable_rank_o_proj": 48.39582443237305, "geo/layer_0/stable_rank_gate_proj": 135.76348876953125, "geo/layer_0/stable_rank_down_proj": 53.405853271484375, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06452607363462448, "geo/layer_0/attn_entropy_mean": 6.182766914367676, "geo/layer_0/attn_entropy_std": 0.3838029205799103, "geo/layer_7/stable_rank_q_proj": 42.81423568725586, "geo/layer_7/stable_rank_k_proj": 42.5843505859375, "geo/layer_7/stable_rank_o_proj": 95.69171142578125, "geo/layer_7/stable_rank_gate_proj": 88.51403045654297, "geo/layer_7/stable_rank_down_proj": 146.45892333984375, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.494560569524765, "geo/layer_7/attn_entropy_mean": 4.659640312194824, "geo/layer_7/attn_entropy_std": 0.84977126121521, "geo/layer_14/stable_rank_q_proj": 54.30842208862305, "geo/layer_14/stable_rank_k_proj": 38.09994888305664, "geo/layer_14/stable_rank_o_proj": 47.05604934692383, "geo/layer_14/stable_rank_gate_proj": 75.2523193359375, "geo/layer_14/stable_rank_down_proj": 133.03627014160156, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3915599286556244, "geo/layer_14/attn_entropy_mean": 5.55373477935791, "geo/layer_14/attn_entropy_std": 0.36192649602890015, "geo/layer_21/stable_rank_q_proj": 42.858848571777344, "geo/layer_21/stable_rank_k_proj": 30.638870239257812, "geo/layer_21/stable_rank_o_proj": 75.17715454101562, "geo/layer_21/stable_rank_gate_proj": 71.4446792602539, "geo/layer_21/stable_rank_down_proj": 54.406394958496094, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13951727747917175, "geo/layer_21/attn_entropy_mean": 5.713688850402832, "geo/layer_21/attn_entropy_std": 0.2924814820289612, "geo/layer_27/stable_rank_q_proj": 42.69351577758789, "geo/layer_27/stable_rank_k_proj": 31.537382125854492, "geo/layer_27/stable_rank_o_proj": 116.26629638671875, "geo/layer_27/stable_rank_gate_proj": 84.0791244506836, "geo/layer_27/stable_rank_down_proj": 130.51315307617188, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09251415729522705, "geo/layer_27/attn_entropy_mean": 4.256813049316406, "geo/layer_27/attn_entropy_std": 0.6938873529434204, "attnres/final_alpha/block_0": 0.24204081296920776, "attnres/block_norm/0": 1.7313802242279053, "attnres/final_alpha/block_1": 0.005061480216681957, "attnres/block_norm/1": 41776.265625, "attnres/final_alpha/block_2": 0.011005330830812454, "attnres/block_norm/2": 26908.82421875, "attnres/final_alpha/block_3": 0.012902596965432167, "attnres/block_norm/3": 49752.9609375, "attnres/final_alpha/block_4": 0.01567436195909977, "attnres/block_norm/4": 13397.3642578125, "attnres/final_alpha/block_5": 0.5968791246414185, "attnres/block_norm/5": 6180.07666015625, "attnres/final_alpha/block_6": 0.11643632501363754, "attnres/block_norm/6": 33028.95703125, "geo/tier1_time_s": 1.361558198928833, "geo/step": 44475.0, "geo/rankme_slope": -0.0001238774611407063} {"step": 44480, "timestamp": 1778242614.612567, "train/loss": 2.1687093496322634, "train/z_loss": 0.0014001476461999117, "train/perplexity": 8.746987449765694, "train/grad_norm": 0.271484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1700554.869218061, "perf/iters_per_sec": 0.8108877512064271, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2332163095474242, "data/tokens_consumed": 93283418112, "data/tokens_consumed_B": 93.283418112, "train/loss_slope": 1.2366962440014612e-05} {"step": 44490, "timestamp": 1778242624.9937456, "train/loss": 2.1509262800216673, "train/z_loss": 0.001401678321417421, "train/perplexity": 8.592814064018842, "train/grad_norm": 0.142578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021276.820862077, "perf/iters_per_sec": 0.9638198952017197, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0375382423400878, "data/tokens_consumed": 93304389632, "data/tokens_consumed_B": 93.304389632, "train/loss_slope": 9.862993709897259e-06} {"step": 44500, "timestamp": 1778242635.3679254, "grad/layer_0/attn": 0.00242911116220057, "grad/layer_0/mlp": 0.002664401428773999, "grad/layer_0/attn_mlp_ratio": 0.9116911005971026, "grad/layer_4/attn": 0.002195541514083743, "grad/layer_4/mlp": 0.0025202645920217037, "grad/layer_4/attn_mlp_ratio": 0.8711551294727402, "grad/layer_8/attn": 0.006222808267921209, "grad/layer_8/mlp": 0.0038490844890475273, "grad/layer_8/attn_mlp_ratio": 1.6166982366737412, "grad/layer_12/attn": 0.00484907440841198, "grad/layer_12/mlp": 0.0067427693866193295, "grad/layer_12/attn_mlp_ratio": 0.7191517399541408, "grad/layer_16/attn": 0.006404108367860317, "grad/layer_16/mlp": 0.004167626146227121, "grad/layer_16/attn_mlp_ratio": 1.5366321232998867, "grad/layer_20/attn": 0.007669336628168821, "grad/layer_20/mlp": 0.005586082581430674, "grad/layer_20/attn_mlp_ratio": 1.3729364682809524, "grad/layer_24/attn": 0.013829868286848068, "grad/layer_24/mlp": 0.010594412684440613, "grad/layer_24/attn_mlp_ratio": 1.305392622341389, "grad/layer_27/attn": 0.010735000483691692, "grad/layer_27/mlp": 0.009157722815871239, "grad/layer_27/attn_mlp_ratio": 1.1722346900327019} {"step": 44500, "timestamp": 1778242635.3863492, "train/loss": 2.1664947986602785, "train/z_loss": 0.0013964248588308691, "train/perplexity": 8.727638233028001, "train/grad_norm": 0.1953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019679.7344888584, "perf/iters_per_sec": 0.9630583450741093, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0383586883544922, "data/tokens_consumed": 93325361152, "data/tokens_consumed_B": 93.325361152, "train/loss_slope": 1.0826009435049953e-05} {"step": 44500, "timestamp": 1778242642.3088934, "geo/ww_alpha_mean": 7.688051009108305, "geo/ww_alpha_std": 5.077552283738436, "geo/ww_alpha_min": 1.3333472038243408, "geo/ww_alpha_max": 31.216281540293334, "geo/ww_alpha_healthy_frac": 0.17766497461928935, "geo/ww_alpha_by_type/q_proj": 3.956855599672686, "geo/ww_alpha_by_type/k_proj": 4.512982452157867, "geo/ww_alpha_by_type/v_proj": 8.113659409505662, "geo/ww_alpha_by_type/o_proj": 7.905158631923331, "geo/ww_alpha_by_type/gate_proj": 8.213182999029474, "geo/ww_alpha_by_type/up_proj": 12.57959579796012, "geo/ww_alpha_by_type/down_proj": 8.640973306323195, "geo/twonn_id/layer_0": 0.6939467787742615, "geo/twonn_id/layer_7": 2.9759199619293213, "geo/twonn_id/layer_14": 4.377007007598877, "geo/twonn_id/layer_21": 7.685286045074463, "geo/twonn_id/layer_27": 6.376229763031006, "geo/tier2_time_s": 6.91455864906311} {"step": 44500, "timestamp": 1778242642.9798641, "eoc/jacobian_sigma/layer_0/attn": 971.4935913085938, "eoc/jacobian_sigma/layer_0/mlp": 7222.8525390625, "eoc/jacobian_sigma/layer_0": 7222.8525390625, "eoc/jacobian_sigma/layer_7/attn": 1.160631775856018, "eoc/jacobian_sigma/layer_7/mlp": 1.7453805208206177, "eoc/jacobian_sigma/layer_7": 1.7453805208206177, "eoc/jacobian_sigma/layer_14/attn": 1.6129329204559326, "eoc/jacobian_sigma/layer_14/mlp": 5.877610206604004, "eoc/jacobian_sigma/layer_14": 5.877610206604004, "eoc/jacobian_sigma/layer_21/attn": 1.0826551914215088, "eoc/jacobian_sigma/layer_21/mlp": 4.415916919708252, "eoc/jacobian_sigma/layer_21": 4.415916919708252, "eoc/jacobian_sigma/layer_27/attn": 3.650893449783325, "eoc/jacobian_sigma/layer_27/mlp": 30.03885841369629, "eoc/jacobian_sigma/layer_27": 30.03885841369629, "eoc/layer0_sigma": 7222.8525390625, "eoc/sigma_max": 30.03885841369629, "eoc/sigma_min": 1.7453805208206177, "eoc/sigma_mean": 10.51944151520729, "eoc/time_s": 0.6652235984802246} {"step": 44510, "timestamp": 1778242653.371529, "train/loss": 2.1408739566802977, "train/z_loss": 0.0013965951628051698, "train/perplexity": 8.506869016062888, "train/grad_norm": 0.146484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1166495.3175585475, "perf/iters_per_sec": 0.5562283122818696, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.797822904586792, "data/tokens_consumed": 93346332672, "data/tokens_consumed_B": 93.346332672, "train/loss_slope": 1.1466640514759468e-05} {"step": 44520, "timestamp": 1778242663.758635, "train/loss": 2.1308834195137023, "train/z_loss": 0.0014122522086836397, "train/perplexity": 8.42230395356112, "train/grad_norm": 0.11279296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020837.152162374, "perf/iters_per_sec": 0.9636102448284025, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0377639770507812, "data/tokens_consumed": 93367304192, "data/tokens_consumed_B": 93.367304192, "train/loss_slope": 1.3124180047532784e-05} {"step": 44530, "timestamp": 1778242674.1489794, "train/loss": 2.2212230443954466, "train/z_loss": 0.0013917071046307683, "train/perplexity": 9.218598729438902, "train/grad_norm": 0.171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019598.212166605, "perf/iters_per_sec": 0.9630194722016359, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0384006023406982, "data/tokens_consumed": 93388275712, "data/tokens_consumed_B": 93.388275712, "train/loss_slope": 1.8848928595461894e-05} {"step": 44540, "timestamp": 1778242684.5409894, "train/loss": 2.1390748262405395, "train/z_loss": 0.0014132451848126947, "train/perplexity": 8.491577808632066, "train/grad_norm": 0.185546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019180.919109222, "perf/iters_per_sec": 0.9628204913660154, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0386152029037476, "data/tokens_consumed": 93409247232, "data/tokens_consumed_B": 93.409247232, "train/loss_slope": 1.5334590497833772e-05} {"step": 44550, "timestamp": 1778242694.9100258, "grad/layer_0/attn": 0.0029739050660282373, "grad/layer_0/mlp": 0.0030490171629935503, "grad/layer_0/attn_mlp_ratio": 0.9753651125964544, "grad/layer_4/attn": 0.003234629053622484, "grad/layer_4/mlp": 0.002711828099563718, "grad/layer_4/attn_mlp_ratio": 1.1927853888911086, "grad/layer_8/attn": 0.0038739070296287537, "grad/layer_8/mlp": 0.003884917823597789, "grad/layer_8/attn_mlp_ratio": 0.9971657331800621, "grad/layer_12/attn": 0.0048705535009503365, "grad/layer_12/mlp": 0.00681548286229372, "grad/layer_12/attn_mlp_ratio": 0.7146307206541932, "grad/layer_16/attn": 0.0038394310977309942, "grad/layer_16/mlp": 0.004911663476377726, "grad/layer_16/attn_mlp_ratio": 0.7816966773124381, "grad/layer_20/attn": 0.008515233173966408, "grad/layer_20/mlp": 0.006014838814735413, "grad/layer_20/attn_mlp_ratio": 1.4157042764861785, "grad/layer_24/attn": 0.008418010547757149, "grad/layer_24/mlp": 0.008137008175253868, "grad/layer_24/attn_mlp_ratio": 1.0345338560558996, "grad/layer_27/attn": 0.004428875632584095, "grad/layer_27/mlp": 0.00742997694760561, "grad/layer_27/attn_mlp_ratio": 0.5960820072804058} {"step": 44550, "timestamp": 1778242695.527776, "eos/sharpness": 11.253380775451658, "eos/L0_probe": 1.982818603515625, "eos/L_plus": 2.047572612762451, "eos/L_minus": 2.0305984020233154, "eos/grad_norm": 0.11181460320949554, "eos/embed_grad_frac": 0.19644108414649963, "eos/time_s": 0.614912748336792} {"step": 44550, "timestamp": 1778242695.5488408, "train/loss": 2.171792769432068, "train/z_loss": 0.001391939993482083, "train/perplexity": 8.773999707720916, "train/grad_norm": 0.11181640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1906638.3155570526, "perf/iters_per_sec": 0.9091559961114181, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0999212503433227, "data/tokens_consumed": 93430218752, "data/tokens_consumed_B": 93.430218752, "train/loss_slope": 2.0719947630386937e-05} {"step": 44550, "timestamp": 1778242696.9164193, "geo/rankme_last": 439.2493591308594, "geo/layer_0/stable_rank_q_proj": 19.3669490814209, "geo/layer_0/stable_rank_k_proj": 16.43419647216797, "geo/layer_0/stable_rank_o_proj": 48.382667541503906, "geo/layer_0/stable_rank_gate_proj": 135.51100158691406, "geo/layer_0/stable_rank_down_proj": 53.47395706176758, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05825437605381012, "geo/layer_0/attn_entropy_mean": 6.184736251831055, "geo/layer_0/attn_entropy_std": 0.3795914053916931, "geo/layer_7/stable_rank_q_proj": 42.63970184326172, "geo/layer_7/stable_rank_k_proj": 42.54995346069336, "geo/layer_7/stable_rank_o_proj": 95.79347229003906, "geo/layer_7/stable_rank_gate_proj": 88.48664093017578, "geo/layer_7/stable_rank_down_proj": 146.4215087890625, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4860963523387909, "geo/layer_7/attn_entropy_mean": 4.676187515258789, "geo/layer_7/attn_entropy_std": 0.8245459794998169, "geo/layer_14/stable_rank_q_proj": 54.2173957824707, "geo/layer_14/stable_rank_k_proj": 38.143646240234375, "geo/layer_14/stable_rank_o_proj": 47.06297302246094, "geo/layer_14/stable_rank_gate_proj": 75.2132797241211, "geo/layer_14/stable_rank_down_proj": 132.83860778808594, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37195315957069397, "geo/layer_14/attn_entropy_mean": 5.52979850769043, "geo/layer_14/attn_entropy_std": 0.34946948289871216, "geo/layer_21/stable_rank_q_proj": 42.909759521484375, "geo/layer_21/stable_rank_k_proj": 30.61425018310547, "geo/layer_21/stable_rank_o_proj": 75.16271209716797, "geo/layer_21/stable_rank_gate_proj": 71.33039855957031, "geo/layer_21/stable_rank_down_proj": 54.44947052001953, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14090463519096375, "geo/layer_21/attn_entropy_mean": 5.716767311096191, "geo/layer_21/attn_entropy_std": 0.284128874540329, "geo/layer_27/stable_rank_q_proj": 42.72709655761719, "geo/layer_27/stable_rank_k_proj": 31.414913177490234, "geo/layer_27/stable_rank_o_proj": 116.13915252685547, "geo/layer_27/stable_rank_gate_proj": 84.15367126464844, "geo/layer_27/stable_rank_down_proj": 130.7285614013672, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0827527865767479, "geo/layer_27/attn_entropy_mean": 4.256227493286133, "geo/layer_27/attn_entropy_std": 0.6698734760284424, "attnres/final_alpha/block_0": 0.23880955576896667, "attnres/block_norm/0": 1.7314201593399048, "attnres/final_alpha/block_1": 0.0049501140601933, "attnres/block_norm/1": 41707.39453125, "attnres/final_alpha/block_2": 0.010801363736391068, "attnres/block_norm/2": 26970.671875, "attnres/final_alpha/block_3": 0.012901037000119686, "attnres/block_norm/3": 49585.37890625, "attnres/final_alpha/block_4": 0.015616283752024174, "attnres/block_norm/4": 13414.076171875, "attnres/final_alpha/block_5": 0.6028724312782288, "attnres/block_norm/5": 6130.96533203125, "attnres/final_alpha/block_6": 0.11404922604560852, "attnres/block_norm/6": 33024.18359375, "geo/tier1_time_s": 1.3632843494415283, "geo/step": 44550.0, "geo/rankme_slope": -0.00012388400672769107} {"step": 44560, "timestamp": 1778242707.2728016, "train/loss": 2.1870769023895265, "train/z_loss": 0.0013970970176160336, "train/perplexity": 8.90913275088897, "train/grad_norm": 0.21484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1789438.7534766386, "perf/iters_per_sec": 0.8532708899863427, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1719607591629029, "data/tokens_consumed": 93451190272, "data/tokens_consumed_B": 93.451190272, "train/loss_slope": 2.0117907805947796e-05} {"step": 44570, "timestamp": 1778242717.6168935, "train/loss": 2.215636730194092, "train/z_loss": 0.0013945470796898007, "train/perplexity": 9.167244314938523, "train/grad_norm": 0.10009765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028626.44443156, "perf/iters_per_sec": 0.9673244688184548, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0337792873382567, "data/tokens_consumed": 93472161792, "data/tokens_consumed_B": 93.472161792, "train/loss_slope": 2.3356535239438533e-05} {"step": 44580, "timestamp": 1778242727.9645307, "train/loss": 2.1425667881965635, "train/z_loss": 0.0014104360016062856, "train/perplexity": 8.521281907894899, "train/grad_norm": 0.15625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028284.7782729554, "perf/iters_per_sec": 0.9671615496983316, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0339534282684326, "data/tokens_consumed": 93493133312, "data/tokens_consumed_B": 93.493133312, "train/loss_slope": 2.1640958959119916e-05} {"step": 44590, "timestamp": 1778242738.3111842, "train/loss": 2.169482111930847, "train/z_loss": 0.001409982773475349, "train/perplexity": 8.753749404248303, "train/grad_norm": 0.1064453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028338.0040129123, "perf/iters_per_sec": 0.9671869297089158, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033926296234131, "data/tokens_consumed": 93514104832, "data/tokens_consumed_B": 93.514104832, "train/loss_slope": 2.2978880088536014e-05} {"step": 44600, "timestamp": 1778242748.6429586, "grad/layer_0/attn": 0.00252738525159657, "grad/layer_0/mlp": 0.002583251101896167, "grad/layer_0/attn_mlp_ratio": 0.9783737832934754, "grad/layer_4/attn": 0.0022270127665251493, "grad/layer_4/mlp": 0.0025217735674232244, "grad/layer_4/attn_mlp_ratio": 0.8831136573809715, "grad/layer_8/attn": 0.008117368444800377, "grad/layer_8/mlp": 0.00365687208250165, "grad/layer_8/attn_mlp_ratio": 2.219757224122425, "grad/layer_12/attn": 0.00431902427226305, "grad/layer_12/mlp": 0.006848410703241825, "grad/layer_12/attn_mlp_ratio": 0.630660805309541, "grad/layer_16/attn": 0.004548029974102974, "grad/layer_16/mlp": 0.0046691144816577435, "grad/layer_16/attn_mlp_ratio": 0.9740669016711561, "grad/layer_20/attn": 0.004680727608501911, "grad/layer_20/mlp": 0.005324781406670809, "grad/layer_20/attn_mlp_ratio": 0.8790459482023749, "grad/layer_24/attn": 0.013342027552425861, "grad/layer_24/mlp": 0.008477818220853806, "grad/layer_24/attn_mlp_ratio": 1.5737571917066253, "grad/layer_27/attn": 0.007752907928079367, "grad/layer_27/mlp": 0.007130531594157219, "grad/layer_27/attn_mlp_ratio": 1.0872832855412626} {"step": 44600, "timestamp": 1778242748.6579852, "train/loss": 2.152081561088562, "train/z_loss": 0.0014085184782743454, "train/perplexity": 8.60274691593147, "train/grad_norm": 0.1376953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027947.2948176973, "perf/iters_per_sec": 0.9670006250465857, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341254949569703, "data/tokens_consumed": 93535076352, "data/tokens_consumed_B": 93.535076352, "train/loss_slope": 1.8510460188322267e-05} {"step": 44610, "timestamp": 1778242759.0105507, "train/loss": 2.127690649032593, "train/z_loss": 0.0014080931781791151, "train/perplexity": 8.395456352036625, "train/grad_norm": 0.09326171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027361.9103539824, "perf/iters_per_sec": 0.9667214919824516, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034424090385437, "data/tokens_consumed": 93556047872, "data/tokens_consumed_B": 93.556047872, "train/loss_slope": 1.603883758451067e-05} {"step": 44620, "timestamp": 1778242769.3611367, "train/loss": 2.1573091983795165, "train/z_loss": 0.0013997541391290724, "train/perplexity": 8.647836710374104, "train/grad_norm": 0.2890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027434.948102271, "perf/iters_per_sec": 0.9667563190947871, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343868255615234, "data/tokens_consumed": 93577019392, "data/tokens_consumed_B": 93.577019392, "train/loss_slope": 1.720806070894866e-05} {"step": 44625, "timestamp": 1778242775.1257088, "eos/sharpness": 8.974170684814451, "eos/L0_probe": 1.9837779998779297, "eos/L_plus": 2.0337753295898438, "eos/L_minus": 2.02352237701416, "eos/grad_norm": 0.10804025083780289, "eos/embed_grad_frac": 0.19179485738277435, "eos/time_s": 0.59940505027771} {"step": 44625, "timestamp": 1778242776.505679, "geo/rankme_last": 440.7469787597656, "geo/layer_0/stable_rank_q_proj": 19.388051986694336, "geo/layer_0/stable_rank_k_proj": 16.441741943359375, "geo/layer_0/stable_rank_o_proj": 48.383636474609375, "geo/layer_0/stable_rank_gate_proj": 135.6117401123047, "geo/layer_0/stable_rank_down_proj": 53.460296630859375, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05929340049624443, "geo/layer_0/attn_entropy_mean": 6.191562652587891, "geo/layer_0/attn_entropy_std": 0.3748275935649872, "geo/layer_7/stable_rank_q_proj": 42.61439895629883, "geo/layer_7/stable_rank_k_proj": 42.57246017456055, "geo/layer_7/stable_rank_o_proj": 95.56062316894531, "geo/layer_7/stable_rank_gate_proj": 88.39832305908203, "geo/layer_7/stable_rank_down_proj": 146.25070190429688, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.502596914768219, "geo/layer_7/attn_entropy_mean": 4.65186882019043, "geo/layer_7/attn_entropy_std": 0.8010591268539429, "geo/layer_14/stable_rank_q_proj": 54.00041198730469, "geo/layer_14/stable_rank_k_proj": 38.10410690307617, "geo/layer_14/stable_rank_o_proj": 47.06822204589844, "geo/layer_14/stable_rank_gate_proj": 75.18604278564453, "geo/layer_14/stable_rank_down_proj": 132.68243408203125, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4085105061531067, "geo/layer_14/attn_entropy_mean": 5.538177013397217, "geo/layer_14/attn_entropy_std": 0.3732425570487976, "geo/layer_21/stable_rank_q_proj": 42.85582733154297, "geo/layer_21/stable_rank_k_proj": 30.498249053955078, "geo/layer_21/stable_rank_o_proj": 75.13856506347656, "geo/layer_21/stable_rank_gate_proj": 71.28662872314453, "geo/layer_21/stable_rank_down_proj": 54.486083984375, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1466916799545288, "geo/layer_21/attn_entropy_mean": 5.704537868499756, "geo/layer_21/attn_entropy_std": 0.29426875710487366, "geo/layer_27/stable_rank_q_proj": 42.682647705078125, "geo/layer_27/stable_rank_k_proj": 31.349794387817383, "geo/layer_27/stable_rank_o_proj": 116.2549819946289, "geo/layer_27/stable_rank_gate_proj": 83.97815704345703, "geo/layer_27/stable_rank_down_proj": 130.60858154296875, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08371911197900772, "geo/layer_27/attn_entropy_mean": 4.266388893127441, "geo/layer_27/attn_entropy_std": 0.6685901284217834, "attnres/final_alpha/block_0": 0.24106159806251526, "attnres/block_norm/0": 1.7319295406341553, "attnres/final_alpha/block_1": 0.005077224690467119, "attnres/block_norm/1": 41696.8671875, "attnres/final_alpha/block_2": 0.011213734745979309, "attnres/block_norm/2": 26888.359375, "attnres/final_alpha/block_3": 0.013316011056303978, "attnres/block_norm/3": 49543.40234375, "attnres/final_alpha/block_4": 0.015727398917078972, "attnres/block_norm/4": 13430.4189453125, "attnres/final_alpha/block_5": 0.5998325943946838, "attnres/block_norm/5": 6092.65283203125, "attnres/final_alpha/block_6": 0.113771453499794, "attnres/block_norm/6": 32691.796875, "geo/tier1_time_s": 1.35933518409729, "geo/step": 44625.0, "geo/rankme_slope": -4.677218543667467e-05} {"step": 44630, "timestamp": 1778242781.6869268, "train/loss": 2.2026722908020018, "train/z_loss": 0.0014089949778281151, "train/perplexity": 9.049163213154154, "train/grad_norm": 0.10986328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1702290.3385726009, "perf/iters_per_sec": 0.8117152874815945, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.231959056854248, "data/tokens_consumed": 93597990912, "data/tokens_consumed_B": 93.597990912, "train/loss_slope": 1.781119753782929e-05} {"step": 44640, "timestamp": 1778242792.0329432, "train/loss": 2.1380454778671263, "train/z_loss": 0.0013921736157499255, "train/perplexity": 8.482841513943786, "train/grad_norm": 0.1552734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028188.576795513, "perf/iters_per_sec": 0.967115677259213, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340024709701539, "data/tokens_consumed": 93618962432, "data/tokens_consumed_B": 93.618962432, "train/loss_slope": 1.574882658401333e-05} {"step": 44650, "timestamp": 1778242802.367697, "grad/layer_0/attn": 0.002829124918207526, "grad/layer_0/mlp": 0.0027451571077108383, "grad/layer_0/attn_mlp_ratio": 1.0305875781032985, "grad/layer_4/attn": 0.002117923693731427, "grad/layer_4/mlp": 0.0025306413881480694, "grad/layer_4/attn_mlp_ratio": 0.8369117884340582, "grad/layer_8/attn": 0.0062337350100278854, "grad/layer_8/mlp": 0.0037802925799041986, "grad/layer_8/attn_mlp_ratio": 1.6490085657034002, "grad/layer_12/attn": 0.004427594132721424, "grad/layer_12/mlp": 0.0066514876671135426, "grad/layer_12/attn_mlp_ratio": 0.6656547057956642, "grad/layer_16/attn": 0.010699674487113953, "grad/layer_16/mlp": 0.004621230997145176, "grad/layer_16/attn_mlp_ratio": 2.3153298898477104, "grad/layer_20/attn": 0.0032719981390982866, "grad/layer_20/mlp": 0.005649455823004246, "grad/layer_20/attn_mlp_ratio": 0.5791704871569856, "grad/layer_24/attn": 0.012643497437238693, "grad/layer_24/mlp": 0.009546780958771706, "grad/layer_24/attn_mlp_ratio": 1.3243728288522638, "grad/layer_27/attn": 0.004845432937145233, "grad/layer_27/mlp": 0.00789935328066349, "grad/layer_27/attn_mlp_ratio": 0.6133961482222297} {"step": 44650, "timestamp": 1778242802.3825572, "train/loss": 2.179836320877075, "train/z_loss": 0.0013949162675999105, "train/perplexity": 8.84485842156492, "train/grad_norm": 0.1513671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027377.0969124406, "perf/iters_per_sec": 0.9667287334978297, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344163417816161, "data/tokens_consumed": 93639933952, "data/tokens_consumed_B": 93.639933952, "train/loss_slope": 1.923827900387231e-05} {"step": 44660, "timestamp": 1778242812.7302768, "train/loss": 2.147956967353821, "train/z_loss": 0.0014013923122547568, "train/perplexity": 8.567337155538414, "train/grad_norm": 0.126953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027660.6366856222, "perf/iters_per_sec": 0.9668639357975112, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342716932296754, "data/tokens_consumed": 93660905472, "data/tokens_consumed_B": 93.660905472, "train/loss_slope": 1.8278950554738647e-05} {"step": 44670, "timestamp": 1778242823.0751686, "train/loss": 2.230859470367432, "train/z_loss": 0.0013990326784551143, "train/perplexity": 9.307862474640089, "train/grad_norm": 0.08935546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028692.461387963, "perf/iters_per_sec": 0.9673559481563392, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0337456464767456, "data/tokens_consumed": 93681876992, "data/tokens_consumed_B": 93.681876992, "train/loss_slope": 2.2354866855322573e-05} {"step": 44680, "timestamp": 1778242833.4198, "train/loss": 2.1893649339675902, "train/z_loss": 0.0014004763099364937, "train/perplexity": 8.929540465801228, "train/grad_norm": 0.228515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028139.8949362705, "perf/iters_per_sec": 0.9670924639397958, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340272903442382, "data/tokens_consumed": 93702848512, "data/tokens_consumed_B": 93.702848512, "train/loss_slope": 2.07367267116497e-05} {"step": 44690, "timestamp": 1778242843.7726011, "train/loss": 2.1434009790420534, "train/z_loss": 0.001390701497439295, "train/perplexity": 8.528393248949723, "train/grad_norm": 0.09375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027085.6481659955, "perf/iters_per_sec": 0.966589759905813, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345650672912599, "data/tokens_consumed": 93723820032, "data/tokens_consumed_B": 93.723820032, "train/loss_slope": 1.9369942870828204e-05} {"step": 44700, "timestamp": 1778242854.1104627, "grad/layer_0/attn": 0.0028790347278118134, "grad/layer_0/mlp": 0.002890527481213212, "grad/layer_0/attn_mlp_ratio": 0.9960239599593874, "grad/layer_4/attn": 0.0025385853368788958, "grad/layer_4/mlp": 0.0025661534164100885, "grad/layer_4/attn_mlp_ratio": 0.9892570029988855, "grad/layer_8/attn": 0.004217419307678938, "grad/layer_8/mlp": 0.003699104068800807, "grad/layer_8/attn_mlp_ratio": 1.1401190978209625, "grad/layer_12/attn": 0.003910295199602842, "grad/layer_12/mlp": 0.006924070883542299, "grad/layer_12/attn_mlp_ratio": 0.5647393287701922, "grad/layer_16/attn": 0.0041062175296247005, "grad/layer_16/mlp": 0.004383676219731569, "grad/layer_16/attn_mlp_ratio": 0.9367063692960208, "grad/layer_20/attn": 0.0031938799656927586, "grad/layer_20/mlp": 0.006094316486269236, "grad/layer_20/attn_mlp_ratio": 0.5240751642093407, "grad/layer_24/attn": 0.005521031562238932, "grad/layer_24/mlp": 0.007907679304480553, "grad/layer_24/attn_mlp_ratio": 0.6981860644363848, "grad/layer_27/attn": 0.0040457164868712425, "grad/layer_27/mlp": 0.007031511981040239, "grad/layer_27/attn_mlp_ratio": 0.575369343072041} {"step": 44700, "timestamp": 1778242854.7093441, "eos/sharpness": 3.986525535583495, "eos/L0_probe": 1.9855552911758423, "eos/L_plus": 2.011315107345581, "eos/L_minus": 1.9996607303619385, "eos/grad_norm": 0.09384140372276306, "eos/embed_grad_frac": 0.28148385882377625, "eos/time_s": 0.595940351486206} {"step": 44700, "timestamp": 1778242854.7275593, "train/loss": 2.128301763534546, "train/z_loss": 0.0014042402850463986, "train/perplexity": 8.400588505170749, "train/grad_norm": 0.09375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1915437.3871585, "perf/iters_per_sec": 0.9133517204086781, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0948684692382813, "data/tokens_consumed": 93744791552, "data/tokens_consumed_B": 93.744791552, "train/loss_slope": 1.5635960752313777e-05} {"step": 44700, "timestamp": 1778242856.0913177, "geo/rankme_last": 439.39263916015625, "geo/layer_0/stable_rank_q_proj": 19.39810562133789, "geo/layer_0/stable_rank_k_proj": 16.425996780395508, "geo/layer_0/stable_rank_o_proj": 48.466617584228516, "geo/layer_0/stable_rank_gate_proj": 135.49295043945312, "geo/layer_0/stable_rank_down_proj": 53.41188430786133, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0599214993417263, "geo/layer_0/attn_entropy_mean": 6.1909332275390625, "geo/layer_0/attn_entropy_std": 0.3765142261981964, "geo/layer_7/stable_rank_q_proj": 42.681854248046875, "geo/layer_7/stable_rank_k_proj": 42.42659378051758, "geo/layer_7/stable_rank_o_proj": 95.49637603759766, "geo/layer_7/stable_rank_gate_proj": 88.31001281738281, "geo/layer_7/stable_rank_down_proj": 146.1742706298828, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.49341312050819397, "geo/layer_7/attn_entropy_mean": 4.658069610595703, "geo/layer_7/attn_entropy_std": 0.8225114345550537, "geo/layer_14/stable_rank_q_proj": 53.97079849243164, "geo/layer_14/stable_rank_k_proj": 38.13004684448242, "geo/layer_14/stable_rank_o_proj": 47.02455139160156, "geo/layer_14/stable_rank_gate_proj": 75.28163146972656, "geo/layer_14/stable_rank_down_proj": 132.81858825683594, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4047206938266754, "geo/layer_14/attn_entropy_mean": 5.500391960144043, "geo/layer_14/attn_entropy_std": 0.3601906895637512, "geo/layer_21/stable_rank_q_proj": 42.805389404296875, "geo/layer_21/stable_rank_k_proj": 30.545391082763672, "geo/layer_21/stable_rank_o_proj": 75.09268951416016, "geo/layer_21/stable_rank_gate_proj": 71.2292251586914, "geo/layer_21/stable_rank_down_proj": 54.56761169433594, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1464490443468094, "geo/layer_21/attn_entropy_mean": 5.724849700927734, "geo/layer_21/attn_entropy_std": 0.29464036226272583, "geo/layer_27/stable_rank_q_proj": 42.84135437011719, "geo/layer_27/stable_rank_k_proj": 31.273160934448242, "geo/layer_27/stable_rank_o_proj": 116.08866882324219, "geo/layer_27/stable_rank_gate_proj": 83.91849517822266, "geo/layer_27/stable_rank_down_proj": 130.7510223388672, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08890191465616226, "geo/layer_27/attn_entropy_mean": 4.276125907897949, "geo/layer_27/attn_entropy_std": 0.6473750472068787, "attnres/final_alpha/block_0": 0.23953673243522644, "attnres/block_norm/0": 1.7319703102111816, "attnres/final_alpha/block_1": 0.005007934290915728, "attnres/block_norm/1": 41783.25, "attnres/final_alpha/block_2": 0.010927542112767696, "attnres/block_norm/2": 27039.59375, "attnres/final_alpha/block_3": 0.013148767873644829, "attnres/block_norm/3": 49309.78125, "attnres/final_alpha/block_4": 0.015483679249882698, "attnres/block_norm/4": 13422.953125, "attnres/final_alpha/block_5": 0.6005491018295288, "attnres/block_norm/5": 6132.8232421875, "attnres/final_alpha/block_6": 0.11534620821475983, "attnres/block_norm/6": 32829.50390625, "geo/tier1_time_s": 1.3594093322753906, "geo/step": 44700.0, "geo/rankme_slope": -4.1912604885704284e-05} {"step": 44710, "timestamp": 1778242866.4357686, "train/loss": 2.1999642133712767, "train/z_loss": 0.001387573848478496, "train/perplexity": 9.024690530405815, "train/grad_norm": 0.1552734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1791783.4808207862, "perf/iters_per_sec": 0.8543889431098872, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.170427131652832, "data/tokens_consumed": 93765763072, "data/tokens_consumed_B": 93.765763072, "train/loss_slope": 1.5001932121846806e-05} {"step": 44720, "timestamp": 1778242876.7886243, "train/loss": 2.1472434043884276, "train/z_loss": 0.0014001039555296302, "train/perplexity": 8.561226001638039, "train/grad_norm": 0.1484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027284.065291224, "perf/iters_per_sec": 0.9666843725639458, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344638109207154, "data/tokens_consumed": 93786734592, "data/tokens_consumed_B": 93.786734592, "train/loss_slope": 1.574072046677626e-05} {"step": 44730, "timestamp": 1778242887.1392965, "train/loss": 2.1399898648262026, "train/z_loss": 0.001405479945242405, "train/perplexity": 8.49935148604511, "train/grad_norm": 0.1962890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027938.177729149, "perf/iters_per_sec": 0.9669962776799913, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341301441192627, "data/tokens_consumed": 93807706112, "data/tokens_consumed_B": 93.807706112, "train/loss_slope": 1.2517958183815045e-05} {"step": 44740, "timestamp": 1778242897.4819133, "train/loss": 2.1615453720092774, "train/z_loss": 0.0014090133830904961, "train/perplexity": 8.6845481513207, "train/grad_norm": 0.10400390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029109.389156412, "perf/iters_per_sec": 0.9675547548086224, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033533239364624, "data/tokens_consumed": 93828677632, "data/tokens_consumed_B": 93.828677632, "train/loss_slope": 1.1943656197189382e-05} {"step": 44750, "timestamp": 1778242907.8225276, "grad/layer_0/attn": 0.0031841550953686237, "grad/layer_0/mlp": 0.002894670469686389, "grad/layer_0/attn_mlp_ratio": 1.1000060347846758, "grad/layer_4/attn": 0.0021274315658956766, "grad/layer_4/mlp": 0.002375740557909012, "grad/layer_4/attn_mlp_ratio": 0.8954813981119124, "grad/layer_8/attn": 0.0032566366717219353, "grad/layer_8/mlp": 0.0034568174742162228, "grad/layer_8/attn_mlp_ratio": 0.9420909844975909, "grad/layer_12/attn": 0.0069518801756203175, "grad/layer_12/mlp": 0.006764313206076622, "grad/layer_12/attn_mlp_ratio": 1.027728885558154, "grad/layer_16/attn": 0.008186607621610165, "grad/layer_16/mlp": 0.0049019246362149715, "grad/layer_16/attn_mlp_ratio": 1.6700802362647997, "grad/layer_20/attn": 0.0045376913622021675, "grad/layer_20/mlp": 0.006418960634618998, "grad/layer_20/attn_mlp_ratio": 0.7069199438671291, "grad/layer_24/attn": 0.013107789680361748, "grad/layer_24/mlp": 0.0107412189245224, "grad/layer_24/attn_mlp_ratio": 1.2203260775556688, "grad/layer_27/attn": 0.006819625850766897, "grad/layer_27/mlp": 0.010701301507651806, "grad/layer_27/attn_mlp_ratio": 0.6372706891926704} {"step": 44750, "timestamp": 1778242907.8367314, "train/loss": 2.1271839141845703, "train/z_loss": 0.0013998020556755364, "train/perplexity": 8.391203159449455, "train/grad_norm": 0.197265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026874.4726104662, "perf/iters_per_sec": 0.9664890635540324, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346728563308716, "data/tokens_consumed": 93849649152, "data/tokens_consumed_B": 93.849649152, "train/loss_slope": 9.201757649634275e-06} {"step": 44760, "timestamp": 1778242918.1864493, "train/loss": 2.140933465957642, "train/z_loss": 0.001400928304065019, "train/perplexity": 8.50737526875371, "train/grad_norm": 0.1279296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027773.616686879, "perf/iters_per_sec": 0.9669178088602443, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342140674591065, "data/tokens_consumed": 93870620672, "data/tokens_consumed_B": 93.870620672, "train/loss_slope": 6.6704181089724605e-06} {"step": 44770, "timestamp": 1778242928.527874, "train/loss": 2.2070181608200072, "train/z_loss": 0.0013901551137678325, "train/perplexity": 9.088575278075957, "train/grad_norm": 0.140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028803.9186150087, "perf/iters_per_sec": 0.9674090951037448, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0336888551712036, "data/tokens_consumed": 93891592192, "data/tokens_consumed_B": 93.891592192, "train/loss_slope": 9.887471815647755e-06} {"step": 44775, "timestamp": 1778242934.29145, "eos/sharpness": 74.95524883270262, "eos/L0_probe": 1.9833976030349731, "eos/L_plus": 2.3124656677246094, "eos/L_minus": 2.4038820266723633, "eos/grad_norm": 0.22516600787639618, "eos/embed_grad_frac": 0.045438800007104874, "eos/time_s": 0.5936710834503174} {"step": 44775, "timestamp": 1778242935.6715398, "geo/rankme_last": 439.0349426269531, "geo/layer_0/stable_rank_q_proj": 19.3819637298584, "geo/layer_0/stable_rank_k_proj": 16.374740600585938, "geo/layer_0/stable_rank_o_proj": 48.49577713012695, "geo/layer_0/stable_rank_gate_proj": 135.63760375976562, "geo/layer_0/stable_rank_down_proj": 53.41658020019531, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.061098407953977585, "geo/layer_0/attn_entropy_mean": 6.188053131103516, "geo/layer_0/attn_entropy_std": 0.3796481192111969, "geo/layer_7/stable_rank_q_proj": 42.7933464050293, "geo/layer_7/stable_rank_k_proj": 42.410972595214844, "geo/layer_7/stable_rank_o_proj": 95.62645721435547, "geo/layer_7/stable_rank_gate_proj": 88.22724151611328, "geo/layer_7/stable_rank_down_proj": 146.0054168701172, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.48681318759918213, "geo/layer_7/attn_entropy_mean": 4.658304214477539, "geo/layer_7/attn_entropy_std": 0.8197969198226929, "geo/layer_14/stable_rank_q_proj": 53.96931838989258, "geo/layer_14/stable_rank_k_proj": 38.2119026184082, "geo/layer_14/stable_rank_o_proj": 47.00423049926758, "geo/layer_14/stable_rank_gate_proj": 75.32721710205078, "geo/layer_14/stable_rank_down_proj": 132.72950744628906, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.398171603679657, "geo/layer_14/attn_entropy_mean": 5.552066802978516, "geo/layer_14/attn_entropy_std": 0.38154545426368713, "geo/layer_21/stable_rank_q_proj": 42.744140625, "geo/layer_21/stable_rank_k_proj": 30.45030975341797, "geo/layer_21/stable_rank_o_proj": 75.01741027832031, "geo/layer_21/stable_rank_gate_proj": 71.08927154541016, "geo/layer_21/stable_rank_down_proj": 54.59491729736328, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14126025140285492, "geo/layer_21/attn_entropy_mean": 5.723223686218262, "geo/layer_21/attn_entropy_std": 0.29313182830810547, "geo/layer_27/stable_rank_q_proj": 42.92632293701172, "geo/layer_27/stable_rank_k_proj": 31.188705444335938, "geo/layer_27/stable_rank_o_proj": 116.05741119384766, "geo/layer_27/stable_rank_gate_proj": 83.87328338623047, "geo/layer_27/stable_rank_down_proj": 130.89572143554688, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08910329639911652, "geo/layer_27/attn_entropy_mean": 4.2739763259887695, "geo/layer_27/attn_entropy_std": 0.6663240790367126, "attnres/final_alpha/block_0": 0.24097424745559692, "attnres/block_norm/0": 1.73228120803833, "attnres/final_alpha/block_1": 0.004985883831977844, "attnres/block_norm/1": 41803.140625, "attnres/final_alpha/block_2": 0.011160643771290779, "attnres/block_norm/2": 26989.35546875, "attnres/final_alpha/block_3": 0.0131258275359869, "attnres/block_norm/3": 49416.9140625, "attnres/final_alpha/block_4": 0.015779705718159676, "attnres/block_norm/4": 13450.5693359375, "attnres/final_alpha/block_5": 0.5976464748382568, "attnres/block_norm/5": 6213.421875, "attnres/final_alpha/block_6": 0.11632724851369858, "attnres/block_norm/6": 32822.53125, "geo/tier1_time_s": 1.3597981929779053, "geo/step": 44775.0, "geo/rankme_slope": -5.89890643757503e-05} {"step": 44780, "timestamp": 1778242940.84633, "train/loss": 2.206190657615662, "train/z_loss": 0.0013894103234633804, "train/perplexity": 9.081057563805725, "train/grad_norm": 0.15625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1703471.8741114289, "perf/iters_per_sec": 0.8122786875302452, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.231104564666748, "data/tokens_consumed": 93912563712, "data/tokens_consumed_B": 93.912563712, "train/loss_slope": 9.272923692725574e-06} {"step": 44790, "timestamp": 1778242951.1865294, "train/loss": 2.1759077072143556, "train/z_loss": 0.001416297082323581, "train/perplexity": 8.810178556425235, "train/grad_norm": 0.1416015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029105.7381303876, "perf/iters_per_sec": 0.9675530138637484, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033535099029541, "data/tokens_consumed": 93933535232, "data/tokens_consumed_B": 93.933535232, "train/loss_slope": 1.2015846910828637e-05} {"step": 44800, "timestamp": 1778242961.5190835, "grad/layer_0/attn": 0.0025762252043932676, "grad/layer_0/mlp": 0.0028824959881603718, "grad/layer_0/attn_mlp_ratio": 0.8937480314283555, "grad/layer_4/attn": 0.004446977283805609, "grad/layer_4/mlp": 0.0025557666085660458, "grad/layer_4/attn_mlp_ratio": 1.739977780014459, "grad/layer_8/attn": 0.004437274299561977, "grad/layer_8/mlp": 0.0037013913970440626, "grad/layer_8/attn_mlp_ratio": 1.1988124744722606, "grad/layer_12/attn": 0.004623596090823412, "grad/layer_12/mlp": 0.006457182113081217, "grad/layer_12/attn_mlp_ratio": 0.7160392781632748, "grad/layer_16/attn": 0.004568565171211958, "grad/layer_16/mlp": 0.004431499168276787, "grad/layer_16/attn_mlp_ratio": 1.0309299166348436, "grad/layer_20/attn": 0.0037357329856604338, "grad/layer_20/mlp": 0.006331976503133774, "grad/layer_20/attn_mlp_ratio": 0.5899789623056366, "grad/layer_24/attn": 0.018698494881391525, "grad/layer_24/mlp": 0.011973056942224503, "grad/layer_24/attn_mlp_ratio": 1.5617143404102156, "grad/layer_27/attn": 0.006766322534531355, "grad/layer_27/mlp": 0.011106600053608418, "grad/layer_27/attn_mlp_ratio": 0.6092163615283339} {"step": 44800, "timestamp": 1778242961.533281, "train/loss": 2.18148512840271, "train/z_loss": 0.0013942704652436077, "train/perplexity": 8.859453919970809, "train/grad_norm": 0.205078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027794.6995786838, "perf/iters_per_sec": 0.9669278619664592, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342033147811889, "data/tokens_consumed": 93954506752, "data/tokens_consumed_B": 93.954506752, "train/loss_slope": 1.1524457234789803e-05} {"step": 44810, "timestamp": 1778242971.8783484, "train/loss": 2.1727373361587525, "train/z_loss": 0.0014171551447361708, "train/perplexity": 8.782291251246354, "train/grad_norm": 0.10791015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028637.3455888547, "perf/iters_per_sec": 0.9673296668953203, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0337737321853637, "data/tokens_consumed": 93975478272, "data/tokens_consumed_B": 93.975478272, "train/loss_slope": 1.2842267023848232e-05} {"step": 44820, "timestamp": 1778242982.2203195, "train/loss": 2.215360999107361, "train/z_loss": 0.0013799094944261015, "train/perplexity": 9.164716969151149, "train/grad_norm": 0.1337890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029223.0456138032, "perf/iters_per_sec": 0.9676089504307762, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0334753513336181, "data/tokens_consumed": 93996449792, "data/tokens_consumed_B": 93.996449792, "train/loss_slope": 1.712368221351632e-05} {"step": 44830, "timestamp": 1778242992.5615635, "train/loss": 2.1725388288497927, "train/z_loss": 0.0014079767977818846, "train/perplexity": 8.780548075265878, "train/grad_norm": 0.16796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028924.8883637176, "perf/iters_per_sec": 0.9674667779749477, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0336272239685058, "data/tokens_consumed": 94017421312, "data/tokens_consumed_B": 94.017421312, "train/loss_slope": 1.549989300878637e-05} {"step": 44840, "timestamp": 1778243002.9165058, "train/loss": 2.187546968460083, "train/z_loss": 0.0014030722202733159, "train/perplexity": 8.91332161635788, "train/grad_norm": 0.1083984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026709.7105690527, "perf/iters_per_sec": 0.9664104988904251, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347569704055786, "data/tokens_consumed": 94038392832, "data/tokens_consumed_B": 94.038392832, "train/loss_slope": 1.7055811640238442e-05} {"step": 44850, "timestamp": 1778243013.2495337, "grad/layer_0/attn": 0.0032037210185080767, "grad/layer_0/mlp": 0.0031410122755914927, "grad/layer_0/attn_mlp_ratio": 1.0199644685910463, "grad/layer_4/attn": 0.0033330724108964205, "grad/layer_4/mlp": 0.002600150415673852, "grad/layer_4/attn_mlp_ratio": 1.2818767185993558, "grad/layer_8/attn": 0.0043047042563557625, "grad/layer_8/mlp": 0.0037762648425996304, "grad/layer_8/attn_mlp_ratio": 1.1399370334943575, "grad/layer_12/attn": 0.004107222426682711, "grad/layer_12/mlp": 0.006394629366695881, "grad/layer_12/attn_mlp_ratio": 0.6422924812256435, "grad/layer_16/attn": 0.0063756960444152355, "grad/layer_16/mlp": 0.004657358396798372, "grad/layer_16/attn_mlp_ratio": 1.368951101530644, "grad/layer_20/attn": 0.006238871719688177, "grad/layer_20/mlp": 0.006988006643950939, "grad/layer_20/attn_mlp_ratio": 0.8927970375942698, "grad/layer_24/attn": 0.01543564721941948, "grad/layer_24/mlp": 0.01224326342344284, "grad/layer_24/attn_mlp_ratio": 1.2607461392841874, "grad/layer_27/attn": 0.0054654269479215145, "grad/layer_27/mlp": 0.012568810023367405, "grad/layer_27/attn_mlp_ratio": 0.4348404418776621} {"step": 44850, "timestamp": 1778243013.8519, "eos/sharpness": 69.28145885467528, "eos/L0_probe": 1.981770634651184, "eos/L_plus": 2.307345390319824, "eos/L_minus": 2.349010467529297, "eos/grad_norm": 0.1977323591709137, "eos/embed_grad_frac": 0.0635417178273201, "eos/time_s": 0.5994420051574707} {"step": 44850, "timestamp": 1778243013.8722742, "train/loss": 2.1409624576568604, "train/z_loss": 0.0013965618680231273, "train/perplexity": 8.507621915593981, "train/grad_norm": 0.1982421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1915729.3639259909, "perf/iters_per_sec": 0.9134909457807497, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.094701600074768, "data/tokens_consumed": 94059364352, "data/tokens_consumed_B": 94.059364352, "train/loss_slope": 1.7552746833712024e-05} {"step": 44850, "timestamp": 1778243015.2356834, "geo/rankme_last": 439.4647521972656, "geo/layer_0/stable_rank_q_proj": 19.37447166442871, "geo/layer_0/stable_rank_k_proj": 16.388980865478516, "geo/layer_0/stable_rank_o_proj": 48.524208068847656, "geo/layer_0/stable_rank_gate_proj": 135.7831268310547, "geo/layer_0/stable_rank_down_proj": 53.542057037353516, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0604826882481575, "geo/layer_0/attn_entropy_mean": 6.185829162597656, "geo/layer_0/attn_entropy_std": 0.38046354055404663, "geo/layer_7/stable_rank_q_proj": 42.77840042114258, "geo/layer_7/stable_rank_k_proj": 42.379478454589844, "geo/layer_7/stable_rank_o_proj": 95.759765625, "geo/layer_7/stable_rank_gate_proj": 88.20403289794922, "geo/layer_7/stable_rank_down_proj": 145.95089721679688, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.49593618512153625, "geo/layer_7/attn_entropy_mean": 4.641559600830078, "geo/layer_7/attn_entropy_std": 0.8191152215003967, "geo/layer_14/stable_rank_q_proj": 53.86724090576172, "geo/layer_14/stable_rank_k_proj": 38.28544616699219, "geo/layer_14/stable_rank_o_proj": 46.863521575927734, "geo/layer_14/stable_rank_gate_proj": 75.3229751586914, "geo/layer_14/stable_rank_down_proj": 132.7869415283203, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39018747210502625, "geo/layer_14/attn_entropy_mean": 5.516626358032227, "geo/layer_14/attn_entropy_std": 0.37500080466270447, "geo/layer_21/stable_rank_q_proj": 42.7630500793457, "geo/layer_21/stable_rank_k_proj": 30.50982093811035, "geo/layer_21/stable_rank_o_proj": 75.06383514404297, "geo/layer_21/stable_rank_gate_proj": 71.10308837890625, "geo/layer_21/stable_rank_down_proj": 54.54421615600586, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14139731228351593, "geo/layer_21/attn_entropy_mean": 5.704453945159912, "geo/layer_21/attn_entropy_std": 0.2911215126514435, "geo/layer_27/stable_rank_q_proj": 42.967464447021484, "geo/layer_27/stable_rank_k_proj": 31.15506362915039, "geo/layer_27/stable_rank_o_proj": 116.10663604736328, "geo/layer_27/stable_rank_gate_proj": 83.85235595703125, "geo/layer_27/stable_rank_down_proj": 130.46572875976562, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08557836711406708, "geo/layer_27/attn_entropy_mean": 4.265354156494141, "geo/layer_27/attn_entropy_std": 0.6672444343566895, "attnres/final_alpha/block_0": 0.24349571764469147, "attnres/block_norm/0": 1.7323676347732544, "attnres/final_alpha/block_1": 0.005051467567682266, "attnres/block_norm/1": 41895.59765625, "attnres/final_alpha/block_2": 0.011207358911633492, "attnres/block_norm/2": 27002.10546875, "attnres/final_alpha/block_3": 0.013177954591810703, "attnres/block_norm/3": 49322.5703125, "attnres/final_alpha/block_4": 0.01590268313884735, "attnres/block_norm/4": 13444.8427734375, "attnres/final_alpha/block_5": 0.5937196612358093, "attnres/block_norm/5": 6221.615234375, "attnres/final_alpha/block_6": 0.11744514107704163, "attnres/block_norm/6": 32828.47265625, "geo/tier1_time_s": 1.3602089881896973, "geo/step": 44850.0, "geo/rankme_slope": -6.573693930697279e-05} {"step": 44860, "timestamp": 1778243025.577417, "train/loss": 2.167882966995239, "train/z_loss": 0.0013924061437137424, "train/perplexity": 8.739762077083391, "train/grad_norm": 0.1435546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1792135.1449180415, "perf/iters_per_sec": 0.8545566296186645, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1701974630355836, "data/tokens_consumed": 94080335872, "data/tokens_consumed_B": 94.080335872, "train/loss_slope": 1.63956013807405e-05} {"step": 44870, "timestamp": 1778243035.921365, "train/loss": 2.1642749071121217, "train/z_loss": 0.0013928276603110135, "train/perplexity": 8.708285311320266, "train/grad_norm": 0.1318359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028990.6907330037, "perf/iters_per_sec": 0.9674981549897211, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0335937023162842, "data/tokens_consumed": 94101307392, "data/tokens_consumed_B": 94.101307392, "train/loss_slope": 1.8286043983159614e-05} {"step": 44880, "timestamp": 1778243046.2637036, "train/loss": 2.170916998386383, "train/z_loss": 0.0013752899481914937, "train/perplexity": 8.766319056558903, "train/grad_norm": 0.1728515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029166.5902795182, "perf/iters_per_sec": 0.9675820304296103, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033504104614258, "data/tokens_consumed": 94122278912, "data/tokens_consumed_B": 94.122278912, "train/loss_slope": 1.7933216682969713e-05} {"step": 44890, "timestamp": 1778243056.605646, "train/loss": 2.1531689167022705, "train/z_loss": 0.0014119008439593018, "train/perplexity": 8.61210624862313, "train/grad_norm": 0.10498046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029345.3762821269, "perf/iters_per_sec": 0.9676672822390208, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033413052558899, "data/tokens_consumed": 94143250432, "data/tokens_consumed_B": 94.143250432, "train/loss_slope": 1.5374309659683768e-05} {"step": 44900, "timestamp": 1778243066.9384623, "grad/layer_0/attn": 0.0030354787595570087, "grad/layer_0/mlp": 0.0028139017522335052, "grad/layer_0/attn_mlp_ratio": 1.0787436516833837, "grad/layer_4/attn": 0.0024745124392211437, "grad/layer_4/mlp": 0.002509564394131303, "grad/layer_4/attn_mlp_ratio": 0.9860326144268738, "grad/layer_8/attn": 0.0043565756641328335, "grad/layer_8/mlp": 0.003677782602608204, "grad/layer_8/attn_mlp_ratio": 1.1845658148979938, "grad/layer_12/attn": 0.004651494324207306, "grad/layer_12/mlp": 0.006931313779205084, "grad/layer_12/attn_mlp_ratio": 0.6710840693800411, "grad/layer_16/attn": 0.0034992534201592207, "grad/layer_16/mlp": 0.004549557343125343, "grad/layer_16/attn_mlp_ratio": 0.7691414964870494, "grad/layer_20/attn": 0.005241758190095425, "grad/layer_20/mlp": 0.005584252066910267, "grad/layer_20/attn_mlp_ratio": 0.9386678884516882, "grad/layer_24/attn": 0.006382948253303766, "grad/layer_24/mlp": 0.007651707623153925, "grad/layer_24/attn_mlp_ratio": 0.8341861038404648, "grad/layer_27/attn": 0.004713721573352814, "grad/layer_27/mlp": 0.006536897737532854, "grad/layer_27/attn_mlp_ratio": 0.7210945758228126} {"step": 44900, "timestamp": 1778243066.9529183, "train/loss": 2.1478821277618407, "train/z_loss": 0.0013916106894612313, "train/perplexity": 8.566696003513416, "train/grad_norm": 0.09033203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028301.1479306752, "perf/iters_per_sec": 0.9671693553593994, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033945083618164, "data/tokens_consumed": 94164221952, "data/tokens_consumed_B": 94.164221952, "train/loss_slope": 1.4610962584467218e-05} {"step": 44910, "timestamp": 1778243077.2955904, "train/loss": 2.1912312269210816, "train/z_loss": 0.0013917899341322482, "train/perplexity": 8.946221164944463, "train/grad_norm": 0.09130859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029101.9466941918, "perf/iters_per_sec": 0.9675512059660872, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0335370302200317, "data/tokens_consumed": 94185193472, "data/tokens_consumed_B": 94.185193472, "train/loss_slope": 1.3817278308050081e-05} {"step": 44920, "timestamp": 1778243087.639792, "train/loss": 2.1962637662887574, "train/z_loss": 0.0013865001499652862, "train/perplexity": 8.991356853453102, "train/grad_norm": 0.1689453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029035.3414284128, "perf/iters_per_sec": 0.9675194461004318, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0335709571838378, "data/tokens_consumed": 94206164992, "data/tokens_consumed_B": 94.206164992, "train/loss_slope": 1.7159132574043086e-05} {"step": 44925, "timestamp": 1778243093.3984516, "eos/sharpness": 36.13090515136718, "eos/L0_probe": 1.9833556413650513, "eos/L_plus": 2.1800425052642822, "eos/L_minus": 2.147977828979492, "eos/grad_norm": 0.11454293131828308, "eos/embed_grad_frac": 0.17776961624622345, "eos/time_s": 0.595890998840332} {"step": 44925, "timestamp": 1778243094.7831388, "geo/rankme_last": 439.3149108886719, "geo/layer_0/stable_rank_q_proj": 19.368099212646484, "geo/layer_0/stable_rank_k_proj": 16.393266677856445, "geo/layer_0/stable_rank_o_proj": 48.539649963378906, "geo/layer_0/stable_rank_gate_proj": 135.83042907714844, "geo/layer_0/stable_rank_down_proj": 53.52284240722656, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.058158211410045624, "geo/layer_0/attn_entropy_mean": 6.189206123352051, "geo/layer_0/attn_entropy_std": 0.38408246636390686, "geo/layer_7/stable_rank_q_proj": 42.760440826416016, "geo/layer_7/stable_rank_k_proj": 42.314144134521484, "geo/layer_7/stable_rank_o_proj": 95.55545043945312, "geo/layer_7/stable_rank_gate_proj": 88.42801666259766, "geo/layer_7/stable_rank_down_proj": 145.82798767089844, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.48291289806365967, "geo/layer_7/attn_entropy_mean": 4.636460304260254, "geo/layer_7/attn_entropy_std": 0.8179745674133301, "geo/layer_14/stable_rank_q_proj": 53.89467239379883, "geo/layer_14/stable_rank_k_proj": 38.255126953125, "geo/layer_14/stable_rank_o_proj": 46.834346771240234, "geo/layer_14/stable_rank_gate_proj": 75.28852081298828, "geo/layer_14/stable_rank_down_proj": 132.60499572753906, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3745638430118561, "geo/layer_14/attn_entropy_mean": 5.505342483520508, "geo/layer_14/attn_entropy_std": 0.37789806723594666, "geo/layer_21/stable_rank_q_proj": 42.790496826171875, "geo/layer_21/stable_rank_k_proj": 30.552875518798828, "geo/layer_21/stable_rank_o_proj": 74.96453857421875, "geo/layer_21/stable_rank_gate_proj": 71.10832977294922, "geo/layer_21/stable_rank_down_proj": 54.491127014160156, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1435004025697708, "geo/layer_21/attn_entropy_mean": 5.6920976638793945, "geo/layer_21/attn_entropy_std": 0.29612764716148376, "geo/layer_27/stable_rank_q_proj": 43.04473876953125, "geo/layer_27/stable_rank_k_proj": 31.16136360168457, "geo/layer_27/stable_rank_o_proj": 115.8232192993164, "geo/layer_27/stable_rank_gate_proj": 83.95330047607422, "geo/layer_27/stable_rank_down_proj": 130.6346893310547, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08461908996105194, "geo/layer_27/attn_entropy_mean": 4.262905120849609, "geo/layer_27/attn_entropy_std": 0.6715094447135925, "attnres/final_alpha/block_0": 0.24016961455345154, "attnres/block_norm/0": 1.7324615716934204, "attnres/final_alpha/block_1": 0.004982208367437124, "attnres/block_norm/1": 41943.1796875, "attnres/final_alpha/block_2": 0.010927686467766762, "attnres/block_norm/2": 27091.19140625, "attnres/final_alpha/block_3": 0.012922652997076511, "attnres/block_norm/3": 49680.04296875, "attnres/final_alpha/block_4": 0.015718067064881325, "attnres/block_norm/4": 13428.921875, "attnres/final_alpha/block_5": 0.6009291410446167, "attnres/block_norm/5": 6181.2080078125, "attnres/final_alpha/block_6": 0.11435062438249588, "attnres/block_norm/6": 32848.50390625, "geo/tier1_time_s": 1.3586690425872803, "geo/step": 44925.0, "geo/rankme_slope": -4.682372949179672e-05} {"step": 44930, "timestamp": 1778243099.969171, "train/loss": 2.191840934753418, "train/z_loss": 0.0013935279683209955, "train/perplexity": 8.951677409246958, "train/grad_norm": 0.12255859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1701893.5866122525, "perf/iters_per_sec": 0.8115261014043105, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2322462558746339, "data/tokens_consumed": 94227136512, "data/tokens_consumed_B": 94.227136512, "train/loss_slope": 1.812203929047309e-05} {"step": 44940, "timestamp": 1778243110.308898, "train/loss": 2.164302659034729, "train/z_loss": 0.0014034451334737242, "train/perplexity": 8.708526986333727, "train/grad_norm": 0.15625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029115.5678457697, "perf/iters_per_sec": 0.9675577010372971, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03353009223938, "data/tokens_consumed": 94248108032, "data/tokens_consumed_B": 94.248108032, "train/loss_slope": 1.3651308904636959e-05} {"step": 44950, "timestamp": 1778243120.6455677, "grad/layer_0/attn": 0.0028376406989991665, "grad/layer_0/mlp": 0.0027052375953644514, "grad/layer_0/attn_mlp_ratio": 1.048943205198417, "grad/layer_4/attn": 0.002077365294098854, "grad/layer_4/mlp": 0.0025048519019037485, "grad/layer_4/attn_mlp_ratio": 0.829336540649909, "grad/layer_8/attn": 0.003723587840795517, "grad/layer_8/mlp": 0.003543396946042776, "grad/layer_8/attn_mlp_ratio": 1.0508525554464678, "grad/layer_12/attn": 0.005126412026584148, "grad/layer_12/mlp": 0.00700148893520236, "grad/layer_12/attn_mlp_ratio": 0.7321888245213802, "grad/layer_16/attn": 0.0034998017363250256, "grad/layer_16/mlp": 0.004574555903673172, "grad/layer_16/attn_mlp_ratio": 0.7650582337422114, "grad/layer_20/attn": 0.0035011162981390953, "grad/layer_20/mlp": 0.005454801954329014, "grad/layer_20/attn_mlp_ratio": 0.6418411270048849, "grad/layer_24/attn": 0.00543910963460803, "grad/layer_24/mlp": 0.007606542203575373, "grad/layer_24/attn_mlp_ratio": 0.7150567784328808, "grad/layer_27/attn": 0.004146183840930462, "grad/layer_27/mlp": 0.006536751054227352, "grad/layer_27/attn_mlp_ratio": 0.6342881567778671} {"step": 44950, "timestamp": 1778243120.6603472, "train/loss": 2.1201395750045777, "train/z_loss": 0.0014056713902391494, "train/perplexity": 8.332300387394652, "train/grad_norm": 0.0888671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026993.7176321608, "perf/iters_per_sec": 0.9665459240113071, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034611988067627, "data/tokens_consumed": 94269079552, "data/tokens_consumed_B": 94.269079552, "train/loss_slope": 8.023160347307634e-06} {"step": 44960, "timestamp": 1778243131.0094724, "train/loss": 2.1929649829864504, "train/z_loss": 0.0013986803824082017, "train/perplexity": 8.961745183693498, "train/grad_norm": 0.1904296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027900.4011281056, "perf/iters_per_sec": 0.9669782643929031, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034149408340454, "data/tokens_consumed": 94290051072, "data/tokens_consumed_B": 94.290051072, "train/loss_slope": 1.014150644686359e-05} {"step": 44970, "timestamp": 1778243141.351069, "train/loss": 2.146350848674774, "train/z_loss": 0.0014025176409631968, "train/perplexity": 8.553588039614787, "train/grad_norm": 0.244140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028938.0391360847, "perf/iters_per_sec": 0.9674730487518715, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0336205244064331, "data/tokens_consumed": 94311022592, "data/tokens_consumed_B": 94.311022592, "train/loss_slope": 7.709564813579003e-06} {"step": 44980, "timestamp": 1778243151.6957653, "train/loss": 2.1971646547317505, "train/z_loss": 0.0014015392051078379, "train/perplexity": 8.99946071271769, "train/grad_norm": 0.1865234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028360.8292633567, "perf/iters_per_sec": 0.9671978136364731, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0339146614074708, "data/tokens_consumed": 94331994112, "data/tokens_consumed_B": 94.331994112, "train/loss_slope": 8.380200052418749e-06} {"step": 44990, "timestamp": 1778243162.0388522, "train/loss": 2.1978617906570435, "train/z_loss": 0.0013854645774699747, "train/perplexity": 9.005736747459254, "train/grad_norm": 0.123046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029071.522101963, "perf/iters_per_sec": 0.9675366983899893, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0335525274276733, "data/tokens_consumed": 94352965632, "data/tokens_consumed_B": 94.352965632, "train/loss_slope": 1.19277732850838e-05} {"step": 45000, "timestamp": 1778243172.9169784, "grad/layer_0/attn": 0.0024372802581638098, "grad/layer_0/mlp": 0.0026100671384483576, "grad/layer_0/attn_mlp_ratio": 0.9337997972851968, "grad/layer_4/attn": 0.0023357553873211145, "grad/layer_4/mlp": 0.0026951648760586977, "grad/layer_4/attn_mlp_ratio": 0.8666465348391513, "grad/layer_8/attn": 0.0033571398817002773, "grad/layer_8/mlp": 0.003589895786717534, "grad/layer_8/attn_mlp_ratio": 0.93516357789694, "grad/layer_12/attn": 0.003651639446616173, "grad/layer_12/mlp": 0.006060915067791939, "grad/layer_12/attn_mlp_ratio": 0.602489780094795, "grad/layer_16/attn": 0.00477264728397131, "grad/layer_16/mlp": 0.004382263403385878, "grad/layer_16/attn_mlp_ratio": 1.089082680738803, "grad/layer_20/attn": 0.0030156588181853294, "grad/layer_20/mlp": 0.005345011129975319, "grad/layer_20/attn_mlp_ratio": 0.5642006514922242, "grad/layer_24/attn": 0.01410647202283144, "grad/layer_24/mlp": 0.011696657165884972, "grad/layer_24/attn_mlp_ratio": 1.2060259356299214, "grad/layer_27/attn": 0.006740150041878223, "grad/layer_27/mlp": 0.011842024512588978, "grad/layer_27/attn_mlp_ratio": 0.5691721021009303} {"step": 45000, "timestamp": 1778243173.5261736, "eos/sharpness": 67.26779937744139, "eos/L0_probe": 1.9828311204910278, "eos/L_plus": 2.279554605484009, "eos/L_minus": 2.358785629272461, "eos/grad_norm": 0.19521011412143707, "eos/embed_grad_frac": 0.06461763381958008, "eos/time_s": 0.6063089370727539} {"step": 45000, "timestamp": 1778243173.5465531, "train/loss": 2.117610847949982, "train/z_loss": 0.0014046429540030658, "train/perplexity": 8.3112568918198, "train/grad_norm": 0.1953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1823186.4502050162, "perf/iters_per_sec": 0.8693630457902032, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1502674341201782, "data/tokens_consumed": 94373937152, "data/tokens_consumed_B": 94.373937152, "train/loss_slope": 9.314371736207755e-06} {"step": 45000, "timestamp": 1778243174.9147327, "geo/rankme_last": 439.5194396972656, "geo/layer_0/stable_rank_q_proj": 19.392925262451172, "geo/layer_0/stable_rank_k_proj": 16.409757614135742, "geo/layer_0/stable_rank_o_proj": 48.552799224853516, "geo/layer_0/stable_rank_gate_proj": 135.65830993652344, "geo/layer_0/stable_rank_down_proj": 53.46646499633789, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.058980848640203476, "geo/layer_0/attn_entropy_mean": 6.189692497253418, "geo/layer_0/attn_entropy_std": 0.38107144832611084, "geo/layer_7/stable_rank_q_proj": 42.797149658203125, "geo/layer_7/stable_rank_k_proj": 42.3053092956543, "geo/layer_7/stable_rank_o_proj": 95.5223617553711, "geo/layer_7/stable_rank_gate_proj": 88.46232604980469, "geo/layer_7/stable_rank_down_proj": 145.85887145996094, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4892062246799469, "geo/layer_7/attn_entropy_mean": 4.6514482498168945, "geo/layer_7/attn_entropy_std": 0.8263973593711853, "geo/layer_14/stable_rank_q_proj": 53.8666877746582, "geo/layer_14/stable_rank_k_proj": 38.21998596191406, "geo/layer_14/stable_rank_o_proj": 46.90500259399414, "geo/layer_14/stable_rank_gate_proj": 75.14092254638672, "geo/layer_14/stable_rank_down_proj": 132.72865295410156, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3904283344745636, "geo/layer_14/attn_entropy_mean": 5.510926246643066, "geo/layer_14/attn_entropy_std": 0.37274497747421265, "geo/layer_21/stable_rank_q_proj": 42.658546447753906, "geo/layer_21/stable_rank_k_proj": 30.560245513916016, "geo/layer_21/stable_rank_o_proj": 74.8724365234375, "geo/layer_21/stable_rank_gate_proj": 71.0703353881836, "geo/layer_21/stable_rank_down_proj": 54.48960494995117, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14394734799861908, "geo/layer_21/attn_entropy_mean": 5.710149765014648, "geo/layer_21/attn_entropy_std": 0.28125476837158203, "geo/layer_27/stable_rank_q_proj": 42.9555778503418, "geo/layer_27/stable_rank_k_proj": 31.134157180786133, "geo/layer_27/stable_rank_o_proj": 115.67997741699219, "geo/layer_27/stable_rank_gate_proj": 84.00517272949219, "geo/layer_27/stable_rank_down_proj": 130.69873046875, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08745356649160385, "geo/layer_27/attn_entropy_mean": 4.264777660369873, "geo/layer_27/attn_entropy_std": 0.6667500734329224, "attnres/final_alpha/block_0": 0.24124276638031006, "attnres/block_norm/0": 1.732710838317871, "attnres/final_alpha/block_1": 0.004990807734429836, "attnres/block_norm/1": 41892.296875, "attnres/final_alpha/block_2": 0.010988321155309677, "attnres/block_norm/2": 27075.55078125, "attnres/final_alpha/block_3": 0.013049336150288582, "attnres/block_norm/3": 49603.38671875, "attnres/final_alpha/block_4": 0.015979815274477005, "attnres/block_norm/4": 13424.861328125, "attnres/final_alpha/block_5": 0.5963637828826904, "attnres/block_norm/5": 6153.7802734375, "attnres/final_alpha/block_6": 0.11738520860671997, "attnres/block_norm/6": 32859.7890625, "geo/tier1_time_s": 1.3646306991577148, "geo/step": 45000.0, "geo/rankme_slope": -3.7678938763005203e-06} {"step": 45000, "timestamp": 1778243182.1274686, "geo/ww_alpha_mean": 7.653281120016607, "geo/ww_alpha_std": 5.06936287625945, "geo/ww_alpha_min": 1.3431460092764624, "geo/ww_alpha_max": 33.105080089755134, "geo/ww_alpha_healthy_frac": 0.18274111675126903, "geo/ww_alpha_by_type/q_proj": 3.9866419407003475, "geo/ww_alpha_by_type/k_proj": 4.452552300463702, "geo/ww_alpha_by_type/v_proj": 7.167885484242589, "geo/ww_alpha_by_type/o_proj": 8.998204621671235, "geo/ww_alpha_by_type/gate_proj": 7.930383755214625, "geo/ww_alpha_by_type/up_proj": 12.710580484599918, "geo/ww_alpha_by_type/down_proj": 8.432864903538375, "geo/twonn_id/layer_0": 0.7090020775794983, "geo/twonn_id/layer_7": 3.4116463661193848, "geo/twonn_id/layer_14": 4.382567882537842, "geo/twonn_id/layer_21": 8.264062881469727, "geo/twonn_id/layer_27": 5.068920612335205, "geo/tier2_time_s": 7.20518159866333} {"step": 45000, "timestamp": 1778243182.9577796, "eoc/jacobian_sigma/layer_0/attn": 1114.8040771484375, "eoc/jacobian_sigma/layer_0/mlp": 7707.51904296875, "eoc/jacobian_sigma/layer_0": 7707.51904296875, "eoc/jacobian_sigma/layer_7/attn": 1.165468692779541, "eoc/jacobian_sigma/layer_7/mlp": 1.7669090032577515, "eoc/jacobian_sigma/layer_7": 1.7669090032577515, "eoc/jacobian_sigma/layer_14/attn": 1.6027756929397583, "eoc/jacobian_sigma/layer_14/mlp": 6.854237079620361, "eoc/jacobian_sigma/layer_14": 6.854237079620361, "eoc/jacobian_sigma/layer_21/attn": 1.0819742679595947, "eoc/jacobian_sigma/layer_21/mlp": 3.6654715538024902, "eoc/jacobian_sigma/layer_21": 3.6654715538024902, "eoc/jacobian_sigma/layer_27/attn": 3.694734573364258, "eoc/jacobian_sigma/layer_27/mlp": 26.524518966674805, "eoc/jacobian_sigma/layer_27": 26.524518966674805, "eoc/layer0_sigma": 7707.51904296875, "eoc/sigma_max": 26.524518966674805, "eoc/sigma_min": 1.7669090032577515, "eoc/sigma_mean": 9.702784150838852, "eoc/time_s": 0.8229348659515381} {"step": 45010, "timestamp": 1778243193.3288374, "train/loss": 2.171635556221008, "train/z_loss": 0.0014023651834577321, "train/perplexity": 8.772620427476408, "train/grad_norm": 0.1318359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1060297.3136017097, "perf/iters_per_sec": 0.5055891578682469, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.9778905153274535, "data/tokens_consumed": 94394908672, "data/tokens_consumed_B": 94.394908672, "train/loss_slope": 8.672880117792367e-06} {"step": 45020, "timestamp": 1778243204.2502577, "train/loss": 2.1096696138381956, "train/z_loss": 0.0013962566037662328, "train/perplexity": 8.245516629965879, "train/grad_norm": 0.1943359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1921057.5607509464, "perf/iters_per_sec": 0.9160316280131084, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0916653633117677, "data/tokens_consumed": 94415880192, "data/tokens_consumed_B": 94.415880192, "train/loss_slope": 4.596426556355733e-06} {"step": 45030, "timestamp": 1778243214.593662, "train/loss": 2.121647047996521, "train/z_loss": 0.0013970326632261275, "train/perplexity": 8.344870577419936, "train/grad_norm": 0.1923828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029257.5008274051, "perf/iters_per_sec": 0.9676253799569154, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0334578037261963, "data/tokens_consumed": 94436851712, "data/tokens_consumed_B": 94.436851712, "train/loss_slope": 5.00918114253266e-07} {"step": 45040, "timestamp": 1778243224.938891, "train/loss": 2.1159287214279177, "train/z_loss": 0.0013999056187458335, "train/perplexity": 8.297288058136985, "train/grad_norm": 0.1337890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028682.58898306, "perf/iters_per_sec": 0.9673512406268406, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0337506771087646, "data/tokens_consumed": 94457823232, "data/tokens_consumed_B": 94.457823232, "train/loss_slope": -2.7464332646376685e-06} {"step": 45050, "timestamp": 1778243235.2729723, "grad/layer_0/attn": 0.004147794563323259, "grad/layer_0/mlp": 0.0032319228630512953, "grad/layer_0/attn_mlp_ratio": 1.2833828685716193, "grad/layer_4/attn": 0.0033016283996403217, "grad/layer_4/mlp": 0.0025456936564296484, "grad/layer_4/attn_mlp_ratio": 1.296946418359008, "grad/layer_8/attn": 0.0035742155741900206, "grad/layer_8/mlp": 0.0034570733550935984, "grad/layer_8/attn_mlp_ratio": 1.0338847642718805, "grad/layer_12/attn": 0.004206388723105192, "grad/layer_12/mlp": 0.006517936009913683, "grad/layer_12/attn_mlp_ratio": 0.6453559304926814, "grad/layer_16/attn": 0.004432240501046181, "grad/layer_16/mlp": 0.005072173662483692, "grad/layer_16/attn_mlp_ratio": 0.8738345152583742, "grad/layer_20/attn": 0.005780122708529234, "grad/layer_20/mlp": 0.006754509638994932, "grad/layer_20/attn_mlp_ratio": 0.8557427454963318, "grad/layer_24/attn": 0.021452460438013077, "grad/layer_24/mlp": 0.012520217336714268, "grad/layer_24/attn_mlp_ratio": 1.7134255492325488, "grad/layer_27/attn": 0.010646562092006207, "grad/layer_27/mlp": 0.010846016928553581, "grad/layer_27/attn_mlp_ratio": 0.9816103057903853} {"step": 45050, "timestamp": 1778243235.2876182, "train/loss": 2.182686281204224, "train/z_loss": 0.0013869336224161088, "train/perplexity": 8.870101871494855, "train/grad_norm": 0.24609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028093.4131902426, "perf/iters_per_sec": 0.9670702997161115, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034050989151001, "data/tokens_consumed": 94478794752, "data/tokens_consumed_B": 94.478794752, "train/loss_slope": 2.812295105948738e-06} {"step": 45060, "timestamp": 1778243245.6305773, "train/loss": 2.1538080096244814, "train/z_loss": 0.0014027525903657079, "train/perplexity": 8.617611943910003, "train/grad_norm": 0.1748046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028553.7418551985, "perf/iters_per_sec": 0.9672898015285485, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0338163375854492, "data/tokens_consumed": 94499766272, "data/tokens_consumed_B": 94.499766272, "train/loss_slope": 3.518928109508663e-07} {"step": 45070, "timestamp": 1778243255.9739165, "train/loss": 2.1673845529556273, "train/z_loss": 0.001391837338451296, "train/perplexity": 8.73540714233177, "train/grad_norm": 0.169921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029074.6113266333, "perf/iters_per_sec": 0.9675381714471022, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0335509538650514, "data/tokens_consumed": 94520737792, "data/tokens_consumed_B": 94.520737792, "train/loss_slope": -7.51183213967301e-07} {"step": 45075, "timestamp": 1778243261.76283, "eos/sharpness": 23.875379562377926, "eos/L0_probe": 1.9809342622756958, "eos/L_plus": 2.102201223373413, "eos/L_minus": 2.098421096801758, "eos/grad_norm": 0.12175389379262924, "eos/embed_grad_frac": 0.1754775047302246, "eos/time_s": 0.6244275569915771} {"step": 45075, "timestamp": 1778243263.1417882, "geo/rankme_last": 439.3585205078125, "geo/layer_0/stable_rank_q_proj": 19.407440185546875, "geo/layer_0/stable_rank_k_proj": 16.371347427368164, "geo/layer_0/stable_rank_o_proj": 48.526641845703125, "geo/layer_0/stable_rank_gate_proj": 135.3348388671875, "geo/layer_0/stable_rank_down_proj": 53.42387771606445, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06221097707748413, "geo/layer_0/attn_entropy_mean": 6.1913604736328125, "geo/layer_0/attn_entropy_std": 0.3818328380584717, "geo/layer_7/stable_rank_q_proj": 42.740577697753906, "geo/layer_7/stable_rank_k_proj": 42.19913864135742, "geo/layer_7/stable_rank_o_proj": 95.34945678710938, "geo/layer_7/stable_rank_gate_proj": 88.51433563232422, "geo/layer_7/stable_rank_down_proj": 146.1548614501953, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4877864122390747, "geo/layer_7/attn_entropy_mean": 4.653282642364502, "geo/layer_7/attn_entropy_std": 0.825395941734314, "geo/layer_14/stable_rank_q_proj": 53.733951568603516, "geo/layer_14/stable_rank_k_proj": 38.285831451416016, "geo/layer_14/stable_rank_o_proj": 46.86212921142578, "geo/layer_14/stable_rank_gate_proj": 75.10770416259766, "geo/layer_14/stable_rank_down_proj": 132.63409423828125, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3896970748901367, "geo/layer_14/attn_entropy_mean": 5.506085395812988, "geo/layer_14/attn_entropy_std": 0.3656986653804779, "geo/layer_21/stable_rank_q_proj": 42.65940856933594, "geo/layer_21/stable_rank_k_proj": 30.49510955810547, "geo/layer_21/stable_rank_o_proj": 74.62444305419922, "geo/layer_21/stable_rank_gate_proj": 70.98603820800781, "geo/layer_21/stable_rank_down_proj": 54.554481506347656, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14295990765094757, "geo/layer_21/attn_entropy_mean": 5.707012176513672, "geo/layer_21/attn_entropy_std": 0.2899000942707062, "geo/layer_27/stable_rank_q_proj": 43.02007293701172, "geo/layer_27/stable_rank_k_proj": 31.165498733520508, "geo/layer_27/stable_rank_o_proj": 115.66024017333984, "geo/layer_27/stable_rank_gate_proj": 83.9105453491211, "geo/layer_27/stable_rank_down_proj": 130.74246215820312, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08949494361877441, "geo/layer_27/attn_entropy_mean": 4.277342796325684, "geo/layer_27/attn_entropy_std": 0.6849861145019531, "attnres/final_alpha/block_0": 0.2402038276195526, "attnres/block_norm/0": 1.7327724695205688, "attnres/final_alpha/block_1": 0.004905302077531815, "attnres/block_norm/1": 42026.4921875, "attnres/final_alpha/block_2": 0.010817786678671837, "attnres/block_norm/2": 27205.677734375, "attnres/final_alpha/block_3": 0.01287505030632019, "attnres/block_norm/3": 49645.875, "attnres/final_alpha/block_4": 0.01567930541932583, "attnres/block_norm/4": 13478.447265625, "attnres/final_alpha/block_5": 0.601554274559021, "attnres/block_norm/5": 6158.875, "attnres/final_alpha/block_6": 0.11396446079015732, "attnres/block_norm/6": 32963.796875, "geo/tier1_time_s": 1.3599979877471924, "geo/step": 45075.0, "geo/rankme_slope": 1.6454550570228086e-05} {"step": 45080, "timestamp": 1778243268.3193138, "train/loss": 2.144328498840332, "train/z_loss": 0.0013937190757133066, "train/perplexity": 8.536307172128486, "train/grad_norm": 0.109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1699812.8316787034, "perf/iters_per_sec": 0.8105339201348798, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2337546586990356, "data/tokens_consumed": 94541709312, "data/tokens_consumed_B": 94.541709312, "train/loss_slope": -3.040087398308155e-06} {"step": 45090, "timestamp": 1778243278.6630433, "train/loss": 2.209436631202698, "train/z_loss": 0.0013850508839823306, "train/perplexity": 9.110582329180891, "train/grad_norm": 0.1435546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028358.6776809345, "perf/iters_per_sec": 0.9671967876820252, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0339157581329346, "data/tokens_consumed": 94562680832, "data/tokens_consumed_B": 94.562680832, "train/loss_slope": -6.906813485036402e-07} {"step": 45100, "timestamp": 1778243288.99704, "grad/layer_0/attn": 0.003157545579597354, "grad/layer_0/mlp": 0.0029750047251582146, "grad/layer_0/attn_mlp_ratio": 1.0613581372693845, "grad/layer_4/attn": 0.0025469670072197914, "grad/layer_4/mlp": 0.002578451531007886, "grad/layer_4/attn_mlp_ratio": 0.987789329297681, "grad/layer_8/attn": 0.003278931137174368, "grad/layer_8/mlp": 0.003709999844431877, "grad/layer_8/attn_mlp_ratio": 0.8838089450905585, "grad/layer_12/attn": 0.004231829196214676, "grad/layer_12/mlp": 0.007048111874610186, "grad/layer_12/attn_mlp_ratio": 0.6004202560145518, "grad/layer_16/attn": 0.005126731004565954, "grad/layer_16/mlp": 0.00470237759873271, "grad/layer_16/attn_mlp_ratio": 1.0902422844399777, "grad/layer_20/attn": 0.0034971374552696943, "grad/layer_20/mlp": 0.006287070456892252, "grad/layer_20/attn_mlp_ratio": 0.5562427562445484, "grad/layer_24/attn": 0.0176595039665699, "grad/layer_24/mlp": 0.01279434934258461, "grad/layer_24/attn_mlp_ratio": 1.3802580620310518, "grad/layer_27/attn": 0.010570031590759754, "grad/layer_27/mlp": 0.013347802683711052, "grad/layer_27/attn_mlp_ratio": 0.7918930000718064} {"step": 45100, "timestamp": 1778243289.01167, "train/loss": 2.1387316942214967, "train/z_loss": 0.0013956601731479168, "train/perplexity": 8.48866457623397, "train/grad_norm": 0.26171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027870.620319662, "perf/iters_per_sec": 0.9669640637968359, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034164595603943, "data/tokens_consumed": 94583652352, "data/tokens_consumed_B": 94.583652352, "train/loss_slope": -1.5707044961965225e-06} {"step": 45110, "timestamp": 1778243299.3594873, "train/loss": 2.201345419883728, "train/z_loss": 0.001402387220878154, "train/perplexity": 9.037164104046823, "train/grad_norm": 0.0908203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027837.8951652553, "perf/iters_per_sec": 0.9669484592272068, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03418128490448, "data/tokens_consumed": 94604623872, "data/tokens_consumed_B": 94.604623872, "train/loss_slope": 2.807811320167839e-06} {"step": 45120, "timestamp": 1778243309.6991625, "train/loss": 2.1816256523132322, "train/z_loss": 0.0013908833148889243, "train/perplexity": 8.860698972558515, "train/grad_norm": 0.1826171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029346.4531196626, "perf/iters_per_sec": 0.9676677957151711, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033412504196167, "data/tokens_consumed": 94625595392, "data/tokens_consumed_B": 94.625595392, "train/loss_slope": 2.3966477649046724e-06} {"step": 45130, "timestamp": 1778243320.047355, "train/loss": 2.146779918670654, "train/z_loss": 0.0014090045006014407, "train/perplexity": 8.557258915074646, "train/grad_norm": 0.10791015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027987.364224846, "perf/iters_per_sec": 0.9670197316288214, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341050624847412, "data/tokens_consumed": 94646566912, "data/tokens_consumed_B": 94.646566912, "train/loss_slope": 5.370506037114858e-07} {"step": 45140, "timestamp": 1778243330.393547, "train/loss": 2.168565201759338, "train/z_loss": 0.0014024059753865003, "train/perplexity": 8.745726681001058, "train/grad_norm": 0.1708984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028460.2279927365, "perf/iters_per_sec": 0.9672452106441195, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0338639974594117, "data/tokens_consumed": 94667538432, "data/tokens_consumed_B": 94.667538432, "train/loss_slope": -1.847820885718663e-06} {"step": 45150, "timestamp": 1778243340.729786, "grad/layer_0/attn": 0.0032257779967039824, "grad/layer_0/mlp": 0.0031039821915328503, "grad/layer_0/attn_mlp_ratio": 1.0392385309360077, "grad/layer_4/attn": 0.0018514376133680344, "grad/layer_4/mlp": 0.002486796583980322, "grad/layer_4/attn_mlp_ratio": 0.7445070299855221, "grad/layer_8/attn": 0.006658496800810099, "grad/layer_8/mlp": 0.003565538441762328, "grad/layer_8/attn_mlp_ratio": 1.8674589330112874, "grad/layer_12/attn": 0.004140947014093399, "grad/layer_12/mlp": 0.006800275761634111, "grad/layer_12/attn_mlp_ratio": 0.6089380928582399, "grad/layer_16/attn": 0.004331132397055626, "grad/layer_16/mlp": 0.004663805011659861, "grad/layer_16/attn_mlp_ratio": 0.9286692503997374, "grad/layer_20/attn": 0.006022477522492409, "grad/layer_20/mlp": 0.006778279785066843, "grad/layer_20/attn_mlp_ratio": 0.8884964363540765, "grad/layer_24/attn": 0.016468752175569534, "grad/layer_24/mlp": 0.013140243478119373, "grad/layer_24/attn_mlp_ratio": 1.253306460999906, "grad/layer_27/attn": 0.009460595436394215, "grad/layer_27/mlp": 0.0131089361384511, "grad/layer_27/attn_mlp_ratio": 0.7216905524831542} {"step": 45150, "timestamp": 1778243341.3365922, "eos/sharpness": 72.7753162384033, "eos/L0_probe": 1.987274169921875, "eos/L_plus": 2.4145028591156006, "eos/L_minus": 2.2877986431121826, "eos/grad_norm": 0.23684154450893402, "eos/embed_grad_frac": 0.04200352728366852, "eos/time_s": 0.6038966178894043} {"step": 45150, "timestamp": 1778243341.3565588, "train/loss": 2.2208624362945555, "train/z_loss": 0.001384741673246026, "train/perplexity": 9.215275027371165, "train/grad_norm": 0.2373046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1913816.0111226432, "perf/iters_per_sec": 0.9125785880673615, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.095796036720276, "data/tokens_consumed": 94688509952, "data/tokens_consumed_B": 94.688509952, "train/loss_slope": 4.069535481189391e-06} {"step": 45150, "timestamp": 1778243342.718478, "geo/rankme_last": 439.18048095703125, "geo/layer_0/stable_rank_q_proj": 19.397676467895508, "geo/layer_0/stable_rank_k_proj": 16.373638153076172, "geo/layer_0/stable_rank_o_proj": 48.452632904052734, "geo/layer_0/stable_rank_gate_proj": 135.14674377441406, "geo/layer_0/stable_rank_down_proj": 53.42539978027344, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06181919202208519, "geo/layer_0/attn_entropy_mean": 6.193840026855469, "geo/layer_0/attn_entropy_std": 0.3795351982116699, "geo/layer_7/stable_rank_q_proj": 42.66030502319336, "geo/layer_7/stable_rank_k_proj": 42.22555923461914, "geo/layer_7/stable_rank_o_proj": 95.29998016357422, "geo/layer_7/stable_rank_gate_proj": 88.65092468261719, "geo/layer_7/stable_rank_down_proj": 146.110107421875, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.491605281829834, "geo/layer_7/attn_entropy_mean": 4.658470153808594, "geo/layer_7/attn_entropy_std": 0.8120591640472412, "geo/layer_14/stable_rank_q_proj": 53.74712371826172, "geo/layer_14/stable_rank_k_proj": 38.290870666503906, "geo/layer_14/stable_rank_o_proj": 46.771217346191406, "geo/layer_14/stable_rank_gate_proj": 75.03752899169922, "geo/layer_14/stable_rank_down_proj": 132.5841064453125, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3885970413684845, "geo/layer_14/attn_entropy_mean": 5.506240367889404, "geo/layer_14/attn_entropy_std": 0.380838006734848, "geo/layer_21/stable_rank_q_proj": 42.57817077636719, "geo/layer_21/stable_rank_k_proj": 30.43240737915039, "geo/layer_21/stable_rank_o_proj": 74.49945831298828, "geo/layer_21/stable_rank_gate_proj": 70.9734878540039, "geo/layer_21/stable_rank_down_proj": 54.49557876586914, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14521382749080658, "geo/layer_21/attn_entropy_mean": 5.715880393981934, "geo/layer_21/attn_entropy_std": 0.2814181447029114, "geo/layer_27/stable_rank_q_proj": 42.89567565917969, "geo/layer_27/stable_rank_k_proj": 31.163537979125977, "geo/layer_27/stable_rank_o_proj": 115.57147216796875, "geo/layer_27/stable_rank_gate_proj": 83.90130615234375, "geo/layer_27/stable_rank_down_proj": 130.7281494140625, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09152422100305557, "geo/layer_27/attn_entropy_mean": 4.24308967590332, "geo/layer_27/attn_entropy_std": 0.6772887110710144, "attnres/final_alpha/block_0": 0.23940354585647583, "attnres/block_norm/0": 1.732795000076294, "attnres/final_alpha/block_1": 0.00498015433549881, "attnres/block_norm/1": 41980.31640625, "attnres/final_alpha/block_2": 0.010820619761943817, "attnres/block_norm/2": 27120.61328125, "attnres/final_alpha/block_3": 0.01285987813025713, "attnres/block_norm/3": 50032.921875, "attnres/final_alpha/block_4": 0.015592324547469616, "attnres/block_norm/4": 13468.255859375, "attnres/final_alpha/block_5": 0.6034003496170044, "attnres/block_norm/5": 6121.27587890625, "attnres/final_alpha/block_6": 0.11294318735599518, "attnres/block_norm/6": 33111.65625, "geo/tier1_time_s": 1.3582992553710938, "geo/step": 45150.0, "geo/rankme_slope": 3.754695237469988e-05} {"step": 45160, "timestamp": 1778243353.0682085, "train/loss": 2.1125773906707765, "train/z_loss": 0.0014134883298538626, "train/perplexity": 8.269527644613122, "train/grad_norm": 0.0830078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1791158.1423817603, "perf/iters_per_sec": 0.8540907585057069, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.170835757255554, "data/tokens_consumed": 94709481472, "data/tokens_consumed_B": 94.709481472, "train/loss_slope": -7.220395148569674e-07} {"step": 45170, "timestamp": 1778243363.417745, "train/loss": 2.177373230457306, "train/z_loss": 0.0013880420243367552, "train/perplexity": 8.82309954356511, "train/grad_norm": 0.294921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027583.7969110932, "perf/iters_per_sec": 0.9668272957377878, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343108892440795, "data/tokens_consumed": 94730452992, "data/tokens_consumed_B": 94.730452992, "train/loss_slope": 2.1766455295811396e-07} {"step": 45180, "timestamp": 1778243374.221572, "train/loss": 2.141594934463501, "train/z_loss": 0.00140195416752249, "train/perplexity": 8.513004491132921, "train/grad_norm": 0.2265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1942233.0922697799, "perf/iters_per_sec": 0.9261289082859897, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0797632932662964, "data/tokens_consumed": 94751424512, "data/tokens_consumed_B": 94.751424512, "train/loss_slope": -3.4501543568663873e-06} {"step": 45190, "timestamp": 1778243385.067895, "train/loss": 2.1449994087219237, "train/z_loss": 0.0013895596493966877, "train/perplexity": 8.542036186573876, "train/grad_norm": 0.1884765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1934953.4787213902, "perf/iters_per_sec": 0.9226577180487586, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0838255405426025, "data/tokens_consumed": 94772396032, "data/tokens_consumed_B": 94.772396032, "train/loss_slope": -2.3315711240313962e-06} {"step": 45200, "timestamp": 1778243395.414693, "grad/layer_0/attn": 0.002875674981623888, "grad/layer_0/mlp": 0.00284514925442636, "grad/layer_0/attn_mlp_ratio": 1.010729006950035, "grad/layer_4/attn": 0.001961595844477415, "grad/layer_4/mlp": 0.0024112965911626816, "grad/layer_4/attn_mlp_ratio": 0.8135024825715539, "grad/layer_8/attn": 0.007044659927487373, "grad/layer_8/mlp": 0.003570452332496643, "grad/layer_8/attn_mlp_ratio": 1.9730440499276969, "grad/layer_12/attn": 0.004346105735749006, "grad/layer_12/mlp": 0.006695426534861326, "grad/layer_12/attn_mlp_ratio": 0.6491155788519848, "grad/layer_16/attn": 0.003900559153407812, "grad/layer_16/mlp": 0.0046806600876152515, "grad/layer_16/attn_mlp_ratio": 0.833335255511276, "grad/layer_20/attn": 0.003501705825328827, "grad/layer_20/mlp": 0.006874909624457359, "grad/layer_20/attn_mlp_ratio": 0.5093457173512513, "grad/layer_24/attn": 0.0167465191334486, "grad/layer_24/mlp": 0.012877048924565315, "grad/layer_24/attn_mlp_ratio": 1.3004935448720873, "grad/layer_27/attn": 0.007879470475018024, "grad/layer_27/mlp": 0.011973043903708458, "grad/layer_27/attn_mlp_ratio": 0.658100853264841} {"step": 45200, "timestamp": 1778243395.4292872, "train/loss": 2.1957767724990847, "train/z_loss": 0.0013884662068448961, "train/perplexity": 8.986979184540044, "train/grad_norm": 0.22265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025987.0915878594, "perf/iters_per_sec": 0.9660659273089692, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351260423660278, "data/tokens_consumed": 94793367552, "data/tokens_consumed_B": 94.793367552, "train/loss_slope": 2.4587330501998425e-07} {"step": 45210, "timestamp": 1778243405.7752006, "train/loss": 2.081060194969177, "train/z_loss": 0.0014169701491482556, "train/perplexity": 8.012959712125483, "train/grad_norm": 0.09423828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028622.0933585723, "perf/iters_per_sec": 0.9673223940651762, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0337815046310426, "data/tokens_consumed": 94814339072, "data/tokens_consumed_B": 94.814339072, "train/loss_slope": -7.804794337275247e-06} {"step": 45220, "timestamp": 1778243416.1193945, "train/loss": 2.208280420303345, "train/z_loss": 0.0013933121808804572, "train/perplexity": 9.10005466186709, "train/grad_norm": 0.2373046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028946.2292050205, "perf/iters_per_sec": 0.9674769540810683, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0336163520812989, "data/tokens_consumed": 94835310592, "data/tokens_consumed_B": 94.835310592, "train/loss_slope": -6.643432659534892e-06} {"step": 45225, "timestamp": 1778243421.8763673, "eos/sharpness": 74.34549331665038, "eos/L0_probe": 1.9883437156677246, "eos/L_plus": 2.451399326324463, "eos/L_minus": 2.2687430381774902, "eos/grad_norm": 0.1974121779203415, "eos/embed_grad_frac": 0.057745080441236496, "eos/time_s": 0.5962271690368652} {"step": 45225, "timestamp": 1778243423.2559395, "geo/rankme_last": 439.21124267578125, "geo/layer_0/stable_rank_q_proj": 19.426786422729492, "geo/layer_0/stable_rank_k_proj": 16.425249099731445, "geo/layer_0/stable_rank_o_proj": 48.50364303588867, "geo/layer_0/stable_rank_gate_proj": 135.14718627929688, "geo/layer_0/stable_rank_down_proj": 53.42226028442383, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.061732884496450424, "geo/layer_0/attn_entropy_mean": 6.192468643188477, "geo/layer_0/attn_entropy_std": 0.37866294384002686, "geo/layer_7/stable_rank_q_proj": 42.7189826965332, "geo/layer_7/stable_rank_k_proj": 42.34975051879883, "geo/layer_7/stable_rank_o_proj": 95.32058715820312, "geo/layer_7/stable_rank_gate_proj": 88.60787200927734, "geo/layer_7/stable_rank_down_proj": 146.15029907226562, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4866127371788025, "geo/layer_7/attn_entropy_mean": 4.631575584411621, "geo/layer_7/attn_entropy_std": 0.8174854516983032, "geo/layer_14/stable_rank_q_proj": 53.76056671142578, "geo/layer_14/stable_rank_k_proj": 38.24519348144531, "geo/layer_14/stable_rank_o_proj": 46.81535339355469, "geo/layer_14/stable_rank_gate_proj": 75.00530242919922, "geo/layer_14/stable_rank_down_proj": 132.3065948486328, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.372207373380661, "geo/layer_14/attn_entropy_mean": 5.524831295013428, "geo/layer_14/attn_entropy_std": 0.3843344748020172, "geo/layer_21/stable_rank_q_proj": 42.44758987426758, "geo/layer_21/stable_rank_k_proj": 30.32798957824707, "geo/layer_21/stable_rank_o_proj": 74.39518737792969, "geo/layer_21/stable_rank_gate_proj": 70.9404296875, "geo/layer_21/stable_rank_down_proj": 54.435611724853516, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1502133309841156, "geo/layer_21/attn_entropy_mean": 5.731795310974121, "geo/layer_21/attn_entropy_std": 0.2969559133052826, "geo/layer_27/stable_rank_q_proj": 42.83786392211914, "geo/layer_27/stable_rank_k_proj": 31.256778717041016, "geo/layer_27/stable_rank_o_proj": 115.71640014648438, "geo/layer_27/stable_rank_gate_proj": 83.86750793457031, "geo/layer_27/stable_rank_down_proj": 130.73158264160156, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08858136087656021, "geo/layer_27/attn_entropy_mean": 4.255697250366211, "geo/layer_27/attn_entropy_std": 0.6612200140953064, "attnres/final_alpha/block_0": 0.23952391743659973, "attnres/block_norm/0": 1.7331018447875977, "attnres/final_alpha/block_1": 0.00497074332088232, "attnres/block_norm/1": 41925.44921875, "attnres/final_alpha/block_2": 0.010932721197605133, "attnres/block_norm/2": 27066.884765625, "attnres/final_alpha/block_3": 0.01301099918782711, "attnres/block_norm/3": 49588.61328125, "attnres/final_alpha/block_4": 0.015703927725553513, "attnres/block_norm/4": 13564.259765625, "attnres/final_alpha/block_5": 0.6023581027984619, "attnres/block_norm/5": 6202.1005859375, "attnres/final_alpha/block_6": 0.11349959671497345, "attnres/block_norm/6": 33048.4765625, "geo/tier1_time_s": 1.3613240718841553, "geo/step": 45225.0, "geo/rankme_slope": 6.418594781662665e-05} {"step": 45230, "timestamp": 1778243428.4331555, "train/loss": 2.13297643661499, "train/z_loss": 0.0014040627866052091, "train/perplexity": 8.439950440525637, "train/grad_norm": 0.162109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1703787.546333437, "perf/iters_per_sec": 0.8124292117755113, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2308764696121215, "data/tokens_consumed": 94856282112, "data/tokens_consumed_B": 94.856282112, "train/loss_slope": -8.139744020960504e-06} {"step": 45240, "timestamp": 1778243439.1899524, "train/loss": 2.1493326306343077, "train/z_loss": 0.0014026734861545265, "train/perplexity": 8.57913103701974, "train/grad_norm": 0.1181640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1950909.3373064392, "perf/iters_per_sec": 0.9302660643131443, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.074961280822754, "data/tokens_consumed": 94877253632, "data/tokens_consumed_B": 94.877253632, "train/loss_slope": -1.0501563502545525e-05} {"step": 45250, "timestamp": 1778243449.525862, "grad/layer_0/attn": 0.0025549696292728186, "grad/layer_0/mlp": 0.0028353878296911716, "grad/layer_0/attn_mlp_ratio": 0.9011005522447519, "grad/layer_4/attn": 0.0023586417082697153, "grad/layer_4/mlp": 0.0024515383411198854, "grad/layer_4/attn_mlp_ratio": 0.9621067606805569, "grad/layer_8/attn": 0.007544501684606075, "grad/layer_8/mlp": 0.003573354333639145, "grad/layer_8/attn_mlp_ratio": 2.11132195944041, "grad/layer_12/attn": 0.004616591613739729, "grad/layer_12/mlp": 0.006222384516149759, "grad/layer_12/attn_mlp_ratio": 0.7419328599131741, "grad/layer_16/attn": 0.003482967847958207, "grad/layer_16/mlp": 0.004342386499047279, "grad/layer_16/attn_mlp_ratio": 0.8020860806641148, "grad/layer_20/attn": 0.005121761001646519, "grad/layer_20/mlp": 0.005506736226379871, "grad/layer_20/attn_mlp_ratio": 0.9300901111082551, "grad/layer_24/attn": 0.007201122120022774, "grad/layer_24/mlp": 0.008059099316596985, "grad/layer_24/attn_mlp_ratio": 0.8935393085228253, "grad/layer_27/attn": 0.006778719834983349, "grad/layer_27/mlp": 0.006445569917559624, "grad/layer_27/attn_mlp_ratio": 1.0516866338455912} {"step": 45250, "timestamp": 1778243449.5402393, "train/loss": 2.140928292274475, "train/z_loss": 0.001407136709894985, "train/perplexity": 8.507331254403347, "train/grad_norm": 0.09326171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027888.479349404, "perf/iters_per_sec": 0.9669725796458264, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341554880142212, "data/tokens_consumed": 94898225152, "data/tokens_consumed_B": 94.898225152, "train/loss_slope": -1.1670363711194924e-05} {"step": 45260, "timestamp": 1778243460.3022232, "train/loss": 2.1335190773010253, "train/z_loss": 0.0013968147337436675, "train/perplexity": 8.4445315438569, "train/grad_norm": 0.2255859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1949791.5416933552, "perf/iters_per_sec": 0.9297330578295494, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0755775451660157, "data/tokens_consumed": 94919196672, "data/tokens_consumed_B": 94.919196672, "train/loss_slope": -1.369599960532022e-05} {"step": 45270, "timestamp": 1778243470.6483924, "train/loss": 2.186233639717102, "train/z_loss": 0.0013941782526671886, "train/perplexity": 8.901623178512507, "train/grad_norm": 0.1494140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028489.4179876633, "perf/iters_per_sec": 0.9672591295183484, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0338491201400757, "data/tokens_consumed": 94940168192, "data/tokens_consumed_B": 94.940168192, "train/loss_slope": -1.5387137079968555e-05} {"step": 45280, "timestamp": 1778243480.9947622, "train/loss": 2.2057633638381957, "train/z_loss": 0.0013842690968886018, "train/perplexity": 9.077178113307456, "train/grad_norm": 0.177734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028419.8592398507, "perf/iters_per_sec": 0.9672259613227132, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0338845729827881, "data/tokens_consumed": 94961139712, "data/tokens_consumed_B": 94.961139712, "train/loss_slope": -1.1535601990737292e-05} {"step": 45290, "timestamp": 1778243491.3396087, "train/loss": 2.1519068479537964, "train/z_loss": 0.00140432333573699, "train/perplexity": 8.60124403434059, "train/grad_norm": 0.1123046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028214.672356522, "perf/iters_per_sec": 0.9671281205923662, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03398916721344, "data/tokens_consumed": 94982111232, "data/tokens_consumed_B": 94.982111232, "train/loss_slope": -1.1204332129360886e-05} {"step": 45300, "timestamp": 1778243501.6750267, "grad/layer_0/attn": 0.0029132363852113485, "grad/layer_0/mlp": 0.0030000910628587008, "grad/layer_0/attn_mlp_ratio": 0.9710492871941295, "grad/layer_4/attn": 0.0021142938639968634, "grad/layer_4/mlp": 0.0025773507077246904, "grad/layer_4/attn_mlp_ratio": 0.8203360821739986, "grad/layer_8/attn": 0.0045991926454007626, "grad/layer_8/mlp": 0.0037044815253466368, "grad/layer_8/attn_mlp_ratio": 1.2415212465712813, "grad/layer_12/attn": 0.0043703848496079445, "grad/layer_12/mlp": 0.007194144651293755, "grad/layer_12/attn_mlp_ratio": 0.6074919258223154, "grad/layer_16/attn": 0.00465426268056035, "grad/layer_16/mlp": 0.004901005886495113, "grad/layer_16/attn_mlp_ratio": 0.9496545593670622, "grad/layer_20/attn": 0.0034647309221327305, "grad/layer_20/mlp": 0.00658057164400816, "grad/layer_20/attn_mlp_ratio": 0.5265091023872643, "grad/layer_24/attn": 0.019368845969438553, "grad/layer_24/mlp": 0.01310009229928255, "grad/layer_24/attn_mlp_ratio": 1.4785274316461556, "grad/layer_27/attn": 0.0041480353102087975, "grad/layer_27/mlp": 0.012495105154812336, "grad/layer_27/attn_mlp_ratio": 0.33197281860520805} {"step": 45300, "timestamp": 1778243502.2768097, "eos/sharpness": 63.4944438934326, "eos/L0_probe": 1.9860507249832153, "eos/L_plus": 2.3486552238464355, "eos/L_minus": 2.2583906650543213, "eos/grad_norm": 0.20142443478107452, "eos/embed_grad_frac": 0.06452694535255432, "eos/time_s": 0.5989928245544434} {"step": 45300, "timestamp": 1778243502.2966263, "train/loss": 2.204033851623535, "train/z_loss": 0.0013917277450673283, "train/perplexity": 9.061492590946992, "train/grad_norm": 0.201171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1915157.7169279645, "perf/iters_per_sec": 0.9132183632507155, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0950283527374267, "data/tokens_consumed": 95003082752, "data/tokens_consumed_B": 95.003082752, "train/loss_slope": -9.234681295411524e-06} {"step": 45300, "timestamp": 1778243503.6602333, "geo/rankme_last": 438.7063903808594, "geo/layer_0/stable_rank_q_proj": 19.404354095458984, "geo/layer_0/stable_rank_k_proj": 16.378345489501953, "geo/layer_0/stable_rank_o_proj": 48.49061965942383, "geo/layer_0/stable_rank_gate_proj": 135.00668334960938, "geo/layer_0/stable_rank_down_proj": 53.489952087402344, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06100859493017197, "geo/layer_0/attn_entropy_mean": 6.189401626586914, "geo/layer_0/attn_entropy_std": 0.3812921345233917, "geo/layer_7/stable_rank_q_proj": 42.69189453125, "geo/layer_7/stable_rank_k_proj": 42.28073501586914, "geo/layer_7/stable_rank_o_proj": 95.19290161132812, "geo/layer_7/stable_rank_gate_proj": 88.49832916259766, "geo/layer_7/stable_rank_down_proj": 145.9788055419922, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4903790354728699, "geo/layer_7/attn_entropy_mean": 4.634888648986816, "geo/layer_7/attn_entropy_std": 0.7915253043174744, "geo/layer_14/stable_rank_q_proj": 53.643672943115234, "geo/layer_14/stable_rank_k_proj": 38.244773864746094, "geo/layer_14/stable_rank_o_proj": 46.82585144042969, "geo/layer_14/stable_rank_gate_proj": 74.97261810302734, "geo/layer_14/stable_rank_down_proj": 132.5240478515625, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3897731900215149, "geo/layer_14/attn_entropy_mean": 5.517258644104004, "geo/layer_14/attn_entropy_std": 0.379647433757782, "geo/layer_21/stable_rank_q_proj": 42.39079666137695, "geo/layer_21/stable_rank_k_proj": 30.351900100708008, "geo/layer_21/stable_rank_o_proj": 74.46536254882812, "geo/layer_21/stable_rank_gate_proj": 70.86613464355469, "geo/layer_21/stable_rank_down_proj": 54.324493408203125, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14523492753505707, "geo/layer_21/attn_entropy_mean": 5.7087812423706055, "geo/layer_21/attn_entropy_std": 0.28956931829452515, "geo/layer_27/stable_rank_q_proj": 42.89409255981445, "geo/layer_27/stable_rank_k_proj": 31.21674156188965, "geo/layer_27/stable_rank_o_proj": 115.67742919921875, "geo/layer_27/stable_rank_gate_proj": 83.89790344238281, "geo/layer_27/stable_rank_down_proj": 130.44558715820312, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10154610872268677, "geo/layer_27/attn_entropy_mean": 4.29467248916626, "geo/layer_27/attn_entropy_std": 0.6680660843849182, "attnres/final_alpha/block_0": 0.2401292622089386, "attnres/block_norm/0": 1.7330676317214966, "attnres/final_alpha/block_1": 0.0049570221453905106, "attnres/block_norm/1": 42053.078125, "attnres/final_alpha/block_2": 0.010774804279208183, "attnres/block_norm/2": 27182.89453125, "attnres/final_alpha/block_3": 0.012665452435612679, "attnres/block_norm/3": 50181.90625, "attnres/final_alpha/block_4": 0.01556793786585331, "attnres/block_norm/4": 13487.921875, "attnres/final_alpha/block_5": 0.6027621030807495, "attnres/block_norm/5": 6130.1005859375, "attnres/final_alpha/block_6": 0.11314346641302109, "attnres/block_norm/6": 33045.31640625, "geo/tier1_time_s": 1.3594224452972412, "geo/step": 45300.0, "geo/rankme_slope": 4.91679093512405e-05} {"step": 45310, "timestamp": 1778243514.0036545, "train/loss": 2.180788445472717, "train/z_loss": 0.0013785925111733377, "train/perplexity": 8.853283839200245, "train/grad_norm": 0.23046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1791944.8940414796, "perf/iters_per_sec": 0.8544659109313391, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1703217029571533, "data/tokens_consumed": 95024054272, "data/tokens_consumed_B": 95.024054272, "train/loss_slope": -9.131174626881214e-06} {"step": 45320, "timestamp": 1778243524.3499017, "train/loss": 2.168500065803528, "train/z_loss": 0.0014012736966833473, "train/perplexity": 8.745157038286747, "train/grad_norm": 0.1923828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027939.8141236214, "perf/iters_per_sec": 0.9669970579736812, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341293096542359, "data/tokens_consumed": 95045025792, "data/tokens_consumed_B": 95.045025792, "train/loss_slope": -8.6531265340623e-06} {"step": 45330, "timestamp": 1778243534.696924, "train/loss": 2.1781038522720335, "train/z_loss": 0.0013976189075037837, "train/perplexity": 8.829548248060378, "train/grad_norm": 0.095703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028083.4530932845, "perf/iters_per_sec": 0.9670655503717825, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340560674667358, "data/tokens_consumed": 95065997312, "data/tokens_consumed_B": 95.065997312, "train/loss_slope": -1.0812040388208637e-05} {"step": 45340, "timestamp": 1778243545.0395317, "train/loss": 2.111733078956604, "train/z_loss": 0.0013940849923528731, "train/perplexity": 8.262548532239864, "train/grad_norm": 0.23828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029036.0434990646, "perf/iters_per_sec": 0.9675197808738063, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0335705995559692, "data/tokens_consumed": 95086968832, "data/tokens_consumed_B": 95.086968832, "train/loss_slope": -1.4879386638901216e-05} {"step": 45350, "timestamp": 1778243555.370928, "grad/layer_0/attn": 0.0034725803416222334, "grad/layer_0/mlp": 0.0034597734920680523, "grad/layer_0/attn_mlp_ratio": 1.0037016149217226, "grad/layer_4/attn": 0.002700292505323887, "grad/layer_4/mlp": 0.002648629480972886, "grad/layer_4/attn_mlp_ratio": 1.0195055302267009, "grad/layer_8/attn": 0.008971644565463066, "grad/layer_8/mlp": 0.003817644901573658, "grad/layer_8/attn_mlp_ratio": 2.3500468382379442, "grad/layer_12/attn": 0.004430146422237158, "grad/layer_12/mlp": 0.006729774177074432, "grad/layer_12/attn_mlp_ratio": 0.6582904923466514, "grad/layer_16/attn": 0.004832693841308355, "grad/layer_16/mlp": 0.005012924782931805, "grad/layer_16/attn_mlp_ratio": 0.9640467300363693, "grad/layer_20/attn": 0.0082788011059165, "grad/layer_20/mlp": 0.006893993355333805, "grad/layer_20/attn_mlp_ratio": 1.200871622457269, "grad/layer_24/attn": 0.018359143286943436, "grad/layer_24/mlp": 0.011633222922682762, "grad/layer_24/attn_mlp_ratio": 1.578164817363709, "grad/layer_27/attn": 0.009212025441229343, "grad/layer_27/mlp": 0.018436523154377937, "grad/layer_27/attn_mlp_ratio": 0.49966174826600535} {"step": 45350, "timestamp": 1778243555.3856196, "train/loss": 2.225994920730591, "train/z_loss": 0.0013804120011627675, "train/perplexity": 9.262693867157745, "train/grad_norm": 0.2890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027944.4895509514, "perf/iters_per_sec": 0.9669992873911626, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341269254684449, "data/tokens_consumed": 95107940352, "data/tokens_consumed_B": 95.107940352, "train/loss_slope": -1.1504452048999049e-05} {"step": 45360, "timestamp": 1778243565.7828324, "train/loss": 2.1546957969665526, "train/z_loss": 0.0013976630754768849, "train/perplexity": 8.62526594777384, "train/grad_norm": 0.15625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020224.2200567743, "perf/iters_per_sec": 0.9633179760249969, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0380788326263428, "data/tokens_consumed": 95128911872, "data/tokens_consumed_B": 95.128911872, "train/loss_slope": -1.1219443518086141e-05} {"step": 45370, "timestamp": 1778243576.1713579, "train/loss": 2.0991246700286865, "train/z_loss": 0.0013974328641779721, "train/perplexity": 8.159024946849867, "train/grad_norm": 0.09033203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020018.1827749433, "perf/iters_per_sec": 0.9632197297930447, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0381847143173217, "data/tokens_consumed": 95149883392, "data/tokens_consumed_B": 95.149883392, "train/loss_slope": -1.3749449245213712e-05} {"step": 45375, "timestamp": 1778243581.9717603, "eos/sharpness": 73.18313121795653, "eos/L0_probe": 1.9821672439575195, "eos/L_plus": 2.3193705081939697, "eos/L_minus": 2.3767952919006348, "eos/grad_norm": 0.2880573272705078, "eos/embed_grad_frac": 0.03407980501651764, "eos/time_s": 0.6173949241638184} {"step": 45375, "timestamp": 1778243583.364782, "geo/rankme_last": 439.5202941894531, "geo/layer_0/stable_rank_q_proj": 19.385080337524414, "geo/layer_0/stable_rank_k_proj": 16.410219192504883, "geo/layer_0/stable_rank_o_proj": 48.52977752685547, "geo/layer_0/stable_rank_gate_proj": 134.86859130859375, "geo/layer_0/stable_rank_down_proj": 53.5634651184082, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06260432302951813, "geo/layer_0/attn_entropy_mean": 6.189272880554199, "geo/layer_0/attn_entropy_std": 0.3840864896774292, "geo/layer_7/stable_rank_q_proj": 42.71845626831055, "geo/layer_7/stable_rank_k_proj": 42.338722229003906, "geo/layer_7/stable_rank_o_proj": 95.00320434570312, "geo/layer_7/stable_rank_gate_proj": 88.39334106445312, "geo/layer_7/stable_rank_down_proj": 145.59654235839844, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.49161213636398315, "geo/layer_7/attn_entropy_mean": 4.642026901245117, "geo/layer_7/attn_entropy_std": 0.8328506946563721, "geo/layer_14/stable_rank_q_proj": 53.572505950927734, "geo/layer_14/stable_rank_k_proj": 38.211177825927734, "geo/layer_14/stable_rank_o_proj": 46.769046783447266, "geo/layer_14/stable_rank_gate_proj": 75.01515197753906, "geo/layer_14/stable_rank_down_proj": 132.75840759277344, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3773748278617859, "geo/layer_14/attn_entropy_mean": 5.498759746551514, "geo/layer_14/attn_entropy_std": 0.3837345540523529, "geo/layer_21/stable_rank_q_proj": 42.377323150634766, "geo/layer_21/stable_rank_k_proj": 30.390995025634766, "geo/layer_21/stable_rank_o_proj": 74.414306640625, "geo/layer_21/stable_rank_gate_proj": 70.833740234375, "geo/layer_21/stable_rank_down_proj": 54.34609603881836, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14510364830493927, "geo/layer_21/attn_entropy_mean": 5.7141618728637695, "geo/layer_21/attn_entropy_std": 0.29781845211982727, "geo/layer_27/stable_rank_q_proj": 42.82999038696289, "geo/layer_27/stable_rank_k_proj": 31.22005271911621, "geo/layer_27/stable_rank_o_proj": 116.09529113769531, "geo/layer_27/stable_rank_gate_proj": 83.98577117919922, "geo/layer_27/stable_rank_down_proj": 130.5515899658203, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08681067079305649, "geo/layer_27/attn_entropy_mean": 4.261602401733398, "geo/layer_27/attn_entropy_std": 0.6597814559936523, "attnres/final_alpha/block_0": 0.24174034595489502, "attnres/block_norm/0": 1.733301043510437, "attnres/final_alpha/block_1": 0.005034130997955799, "attnres/block_norm/1": 42161.87109375, "attnres/final_alpha/block_2": 0.011109170503914356, "attnres/block_norm/2": 27012.55078125, "attnres/final_alpha/block_3": 0.013266883790493011, "attnres/block_norm/3": 50035.79296875, "attnres/final_alpha/block_4": 0.01594911515712738, "attnres/block_norm/4": 13515.16015625, "attnres/final_alpha/block_5": 0.5965085029602051, "attnres/block_norm/5": 6207.8671875, "attnres/final_alpha/block_6": 0.11639189720153809, "attnres/block_norm/6": 32850.90625, "geo/tier1_time_s": 1.3639168739318848, "geo/step": 45375.0, "geo/rankme_slope": 6.456211390806322e-05} {"step": 45380, "timestamp": 1778243588.542271, "train/loss": 2.138843059539795, "train/z_loss": 0.0014067265554331243, "train/perplexity": 8.489609971707587, "train/grad_norm": 0.236328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1696026.7507806828, "perf/iters_per_sec": 0.8087285760787405, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2365087985992431, "data/tokens_consumed": 95170854912, "data/tokens_consumed_B": 95.170854912, "train/loss_slope": -1.5969833560866946e-05} {"step": 45390, "timestamp": 1778243598.8955853, "train/loss": 2.1798802614212036, "train/z_loss": 0.0013930810149759055, "train/perplexity": 8.845247077995529, "train/grad_norm": 0.1015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026975.8743586899, "perf/iters_per_sec": 0.9665374156754922, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346210956573487, "data/tokens_consumed": 95191826432, "data/tokens_consumed_B": 95.191826432, "train/loss_slope": -1.4315848811195694e-05} {"step": 45400, "timestamp": 1778243609.2263148, "grad/layer_0/attn": 0.00275429873727262, "grad/layer_0/mlp": 0.0027535194531083107, "grad/layer_0/attn_mlp_ratio": 1.0002829775308586, "grad/layer_4/attn": 0.0028412817046046257, "grad/layer_4/mlp": 0.002429475774988532, "grad/layer_4/attn_mlp_ratio": 1.1695039797907183, "grad/layer_8/attn": 0.011365671642124653, "grad/layer_8/mlp": 0.003681278321892023, "grad/layer_8/attn_mlp_ratio": 3.0874251658159486, "grad/layer_12/attn": 0.004292702302336693, "grad/layer_12/mlp": 0.006535063963383436, "grad/layer_12/attn_mlp_ratio": 0.6568722602719532, "grad/layer_16/attn": 0.0033466708846390247, "grad/layer_16/mlp": 0.0043384782038629055, "grad/layer_16/attn_mlp_ratio": 0.77139279033831, "grad/layer_20/attn": 0.0035724565386772156, "grad/layer_20/mlp": 0.005841889884322882, "grad/layer_20/attn_mlp_ratio": 0.6115241040594999, "grad/layer_24/attn": 0.014152973890304565, "grad/layer_24/mlp": 0.010193056426942348, "grad/layer_24/attn_mlp_ratio": 1.388491651438932, "grad/layer_27/attn": 0.008644326590001583, "grad/layer_27/mlp": 0.008828727528452873, "grad/layer_27/attn_mlp_ratio": 0.9791135205194224} {"step": 45400, "timestamp": 1778243609.240614, "train/loss": 2.1642756342887877, "train/z_loss": 0.0013964427285827695, "train/perplexity": 8.708291643784447, "train/grad_norm": 0.1650390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028219.3958183704, "perf/iters_per_sec": 0.9671303729144909, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033986759185791, "data/tokens_consumed": 95212797952, "data/tokens_consumed_B": 95.212797952, "train/loss_slope": -1.4736432325292798e-05} {"step": 45410, "timestamp": 1778243619.593666, "train/loss": 2.165040373802185, "train/z_loss": 0.0013831972843036055, "train/perplexity": 8.714953765567842, "train/grad_norm": 0.1552734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026700.6513017993, "perf/iters_per_sec": 0.9664061790951726, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347615957260132, "data/tokens_consumed": 95233769472, "data/tokens_consumed_B": 95.233769472, "train/loss_slope": -1.3205096702335353e-05} {"step": 45420, "timestamp": 1778243629.9430013, "train/loss": 2.141340970993042, "train/z_loss": 0.001389446365647018, "train/perplexity": 8.5108427734786, "train/grad_norm": 0.185546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027687.419800578, "perf/iters_per_sec": 0.9668767069819345, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342580318450927, "data/tokens_consumed": 95254740992, "data/tokens_consumed_B": 95.254740992, "train/loss_slope": -1.3609352621129444e-05} {"step": 45430, "timestamp": 1778243640.2843418, "train/loss": 2.1168567180633544, "train/z_loss": 0.001412599056493491, "train/perplexity": 8.304991487363552, "train/grad_norm": 0.134765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029016.52611593, "perf/iters_per_sec": 0.9675104742602968, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0335805416107178, "data/tokens_consumed": 95275712512, "data/tokens_consumed_B": 95.275712512, "train/loss_slope": -1.7158032035884898e-05} {"step": 45440, "timestamp": 1778243650.6342103, "train/loss": 2.16970477104187, "train/z_loss": 0.0013902472564950585, "train/perplexity": 8.755698723317545, "train/grad_norm": 0.251953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027231.2686210435, "perf/iters_per_sec": 0.9666591971497743, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344907522201539, "data/tokens_consumed": 95296684032, "data/tokens_consumed_B": 95.296684032, "train/loss_slope": -1.5209251790657622e-05} {"step": 45450, "timestamp": 1778243660.9666154, "grad/layer_0/attn": 0.002774923574179411, "grad/layer_0/mlp": 0.0027839362155646086, "grad/layer_0/attn_mlp_ratio": 0.9967625906760839, "grad/layer_4/attn": 0.0025906814262270927, "grad/layer_4/mlp": 0.0024362618569284678, "grad/layer_4/attn_mlp_ratio": 1.0633837707227962, "grad/layer_8/attn": 0.0034243501722812653, "grad/layer_8/mlp": 0.0034954871516674757, "grad/layer_8/attn_mlp_ratio": 0.9796488803235441, "grad/layer_12/attn": 0.004667619243264198, "grad/layer_12/mlp": 0.006765696685761213, "grad/layer_12/attn_mlp_ratio": 0.689894831392306, "grad/layer_16/attn": 0.004287329968065023, "grad/layer_16/mlp": 0.004430504981428385, "grad/layer_16/attn_mlp_ratio": 0.9676842457616138, "grad/layer_20/attn": 0.0035391056444495916, "grad/layer_20/mlp": 0.005933336913585663, "grad/layer_20/attn_mlp_ratio": 0.596478109425782, "grad/layer_24/attn": 0.008356896229088306, "grad/layer_24/mlp": 0.009183290414512157, "grad/layer_24/attn_mlp_ratio": 0.9100110919808191, "grad/layer_27/attn": 0.01005175057798624, "grad/layer_27/mlp": 0.007233801297843456, "grad/layer_27/attn_mlp_ratio": 1.389553019935393} {"step": 45450, "timestamp": 1778243661.5657291, "eos/sharpness": 43.75710487365722, "eos/L0_probe": 1.981734275817871, "eos/L_plus": 2.163547992706299, "eos/L_minus": 2.2374916076660156, "eos/grad_norm": 0.11252807080745697, "eos/embed_grad_frac": 0.2024235725402832, "eos/time_s": 0.5963101387023926} {"step": 45450, "timestamp": 1778243661.5836334, "train/loss": 2.1285750389099123, "train/z_loss": 0.001410058990586549, "train/perplexity": 8.402884492851957, "train/grad_norm": 0.1123046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1916393.8290174296, "perf/iters_per_sec": 0.9138077874266766, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0943220376968383, "data/tokens_consumed": 95317655552, "data/tokens_consumed_B": 95.317655552, "train/loss_slope": -1.6306452475043357e-05} {"step": 45450, "timestamp": 1778243662.9449506, "geo/rankme_last": 439.5513000488281, "geo/layer_0/stable_rank_q_proj": 19.402286529541016, "geo/layer_0/stable_rank_k_proj": 16.433242797851562, "geo/layer_0/stable_rank_o_proj": 48.43598175048828, "geo/layer_0/stable_rank_gate_proj": 134.9893341064453, "geo/layer_0/stable_rank_down_proj": 53.50959396362305, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06393682211637497, "geo/layer_0/attn_entropy_mean": 6.1882452964782715, "geo/layer_0/attn_entropy_std": 0.37947309017181396, "geo/layer_7/stable_rank_q_proj": 42.728416442871094, "geo/layer_7/stable_rank_k_proj": 42.38172149658203, "geo/layer_7/stable_rank_o_proj": 95.05500793457031, "geo/layer_7/stable_rank_gate_proj": 88.36483001708984, "geo/layer_7/stable_rank_down_proj": 145.50413513183594, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4853162169456482, "geo/layer_7/attn_entropy_mean": 4.671656608581543, "geo/layer_7/attn_entropy_std": 0.8212047815322876, "geo/layer_14/stable_rank_q_proj": 53.51255416870117, "geo/layer_14/stable_rank_k_proj": 38.2089729309082, "geo/layer_14/stable_rank_o_proj": 46.8345947265625, "geo/layer_14/stable_rank_gate_proj": 74.91127014160156, "geo/layer_14/stable_rank_down_proj": 132.6293487548828, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4057782292366028, "geo/layer_14/attn_entropy_mean": 5.556906700134277, "geo/layer_14/attn_entropy_std": 0.38588258624076843, "geo/layer_21/stable_rank_q_proj": 42.39459991455078, "geo/layer_21/stable_rank_k_proj": 30.471019744873047, "geo/layer_21/stable_rank_o_proj": 74.40597534179688, "geo/layer_21/stable_rank_gate_proj": 70.83023071289062, "geo/layer_21/stable_rank_down_proj": 54.380882263183594, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1460667997598648, "geo/layer_21/attn_entropy_mean": 5.707871437072754, "geo/layer_21/attn_entropy_std": 0.2914007306098938, "geo/layer_27/stable_rank_q_proj": 42.789459228515625, "geo/layer_27/stable_rank_k_proj": 31.226308822631836, "geo/layer_27/stable_rank_o_proj": 116.13192749023438, "geo/layer_27/stable_rank_gate_proj": 83.93756103515625, "geo/layer_27/stable_rank_down_proj": 130.62828063964844, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09679839760065079, "geo/layer_27/attn_entropy_mean": 4.255641937255859, "geo/layer_27/attn_entropy_std": 0.6617274880409241, "attnres/final_alpha/block_0": 0.2415420114994049, "attnres/block_norm/0": 1.7334226369857788, "attnres/final_alpha/block_1": 0.005014827009290457, "attnres/block_norm/1": 42273.8515625, "attnres/final_alpha/block_2": 0.011080692522227764, "attnres/block_norm/2": 27097.71484375, "attnres/final_alpha/block_3": 0.012900739908218384, "attnres/block_norm/3": 50410.6875, "attnres/final_alpha/block_4": 0.01582483947277069, "attnres/block_norm/4": 13507.537109375, "attnres/final_alpha/block_5": 0.5977954268455505, "attnres/block_norm/5": 6189.93603515625, "attnres/final_alpha/block_6": 0.11584143340587616, "attnres/block_norm/6": 33028.4765625, "geo/tier1_time_s": 1.3571255207061768, "geo/step": 45450.0, "geo/rankme_slope": 8.32677993072229e-05} {"step": 45460, "timestamp": 1778243673.294858, "train/loss": 2.1187155723571776, "train/z_loss": 0.0013980942429043352, "train/perplexity": 8.320443613626267, "train/grad_norm": 0.166015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1791296.4972475704, "perf/iters_per_sec": 0.8541567312467434, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.170745325088501, "data/tokens_consumed": 95338627072, "data/tokens_consumed_B": 95.338627072, "train/loss_slope": -1.9262371140487798e-05} {"step": 45470, "timestamp": 1778243683.6406796, "train/loss": 2.1747679233551027, "train/z_loss": 0.0013981113908812404, "train/perplexity": 8.800142577619297, "train/grad_norm": 0.17578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028197.5090714532, "perf/iters_per_sec": 0.9671199365002886, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0339979171752929, "data/tokens_consumed": 95359598592, "data/tokens_consumed_B": 95.359598592, "train/loss_slope": -1.408623827852148e-05} {"step": 45480, "timestamp": 1778243693.9840934, "train/loss": 2.1307022094726564, "train/z_loss": 0.0014181748032569886, "train/perplexity": 8.42077788578957, "train/grad_norm": 0.2294921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028663.87378726, "perf/iters_per_sec": 0.9673423165260601, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0337602138519286, "data/tokens_consumed": 95380570112, "data/tokens_consumed_B": 95.380570112, "train/loss_slope": -1.5678650675469462e-05} {"step": 45490, "timestamp": 1778243704.3260388, "train/loss": 2.148691773414612, "train/z_loss": 0.0013997451053000987, "train/perplexity": 8.573634800295551, "train/grad_norm": 0.109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028856.0952867698, "perf/iters_per_sec": 0.9674339748796319, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0336622714996337, "data/tokens_consumed": 95401541632, "data/tokens_consumed_B": 95.401541632, "train/loss_slope": -1.7234524770645715e-05} {"step": 45500, "timestamp": 1778243714.6603897, "grad/layer_0/attn": 0.004064738750457764, "grad/layer_0/mlp": 0.003231840441003442, "grad/layer_0/attn_mlp_ratio": 1.2577163690123518, "grad/layer_4/attn": 0.0023262451868504286, "grad/layer_4/mlp": 0.0026012093294411898, "grad/layer_4/attn_mlp_ratio": 0.8942936929727219, "grad/layer_8/attn": 0.007527912966907024, "grad/layer_8/mlp": 0.0037753144279122353, "grad/layer_8/attn_mlp_ratio": 1.9939829943308038, "grad/layer_12/attn": 0.005640395451337099, "grad/layer_12/mlp": 0.006845141761004925, "grad/layer_12/attn_mlp_ratio": 0.8239997893205155, "grad/layer_16/attn": 0.005684925243258476, "grad/layer_16/mlp": 0.004727791529148817, "grad/layer_16/attn_mlp_ratio": 1.2024483499248422, "grad/layer_20/attn": 0.003469649702310562, "grad/layer_20/mlp": 0.006524503231048584, "grad/layer_20/attn_mlp_ratio": 0.5317875593379364, "grad/layer_24/attn": 0.02441968396306038, "grad/layer_24/mlp": 0.013532024808228016, "grad/layer_24/attn_mlp_ratio": 1.8045846152863811, "grad/layer_27/attn": 0.010426085442304611, "grad/layer_27/mlp": 0.012941188178956509, "grad/layer_27/attn_mlp_ratio": 0.8056513217768672} {"step": 45500, "timestamp": 1778243714.6747434, "train/loss": 2.1891422271728516, "train/z_loss": 0.0013740533613599837, "train/perplexity": 8.927552017894248, "train/grad_norm": 0.298828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027821.8602251653, "perf/iters_per_sec": 0.9669408131719424, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341894626617432, "data/tokens_consumed": 95422513152, "data/tokens_consumed_B": 95.422513152, "train/loss_slope": -1.54411733526029e-05} {"step": 45500, "timestamp": 1778243721.8759232, "geo/ww_alpha_mean": 7.500075933021258, "geo/ww_alpha_std": 4.839188199127562, "geo/ww_alpha_min": 1.3521473818228693, "geo/ww_alpha_max": 47.42973229261053, "geo/ww_alpha_healthy_frac": 0.16751269035532995, "geo/ww_alpha_by_type/q_proj": 3.9790472997201087, "geo/ww_alpha_by_type/k_proj": 4.501933874560382, "geo/ww_alpha_by_type/v_proj": 7.604451613424449, "geo/ww_alpha_by_type/o_proj": 7.214410313824658, "geo/ww_alpha_by_type/gate_proj": 8.360196542975784, "geo/ww_alpha_by_type/up_proj": 12.74075782008762, "geo/ww_alpha_by_type/down_proj": 8.199606640056768, "geo/twonn_id/layer_0": 0.672245442867279, "geo/twonn_id/layer_7": 3.21368408203125, "geo/twonn_id/layer_14": 4.453795909881592, "geo/twonn_id/layer_21": 7.304177761077881, "geo/twonn_id/layer_27": 5.245694160461426, "geo/tier2_time_s": 7.191772222518921} {"step": 45500, "timestamp": 1778243722.699188, "eoc/jacobian_sigma/layer_0/attn": 1097.9566650390625, "eoc/jacobian_sigma/layer_0/mlp": 7953.97509765625, "eoc/jacobian_sigma/layer_0": 7953.97509765625, "eoc/jacobian_sigma/layer_7/attn": 1.1572450399398804, "eoc/jacobian_sigma/layer_7/mlp": 1.8282320499420166, "eoc/jacobian_sigma/layer_7": 1.8282320499420166, "eoc/jacobian_sigma/layer_14/attn": 1.5901000499725342, "eoc/jacobian_sigma/layer_14/mlp": 6.62778377532959, "eoc/jacobian_sigma/layer_14": 6.62778377532959, "eoc/jacobian_sigma/layer_21/attn": 1.0803196430206299, "eoc/jacobian_sigma/layer_21/mlp": 3.5223309993743896, "eoc/jacobian_sigma/layer_21": 3.5223309993743896, "eoc/jacobian_sigma/layer_27/attn": 3.2711589336395264, "eoc/jacobian_sigma/layer_27/mlp": 30.993640899658203, "eoc/jacobian_sigma/layer_27": 30.993640899658203, "eoc/layer0_sigma": 7953.97509765625, "eoc/sigma_max": 30.993640899658203, "eoc/sigma_min": 1.8282320499420166, "eoc/sigma_mean": 10.74299693107605, "eoc/time_s": 0.8162374496459961} {"step": 45510, "timestamp": 1778243733.0622392, "train/loss": 2.1643524289131166, "train/z_loss": 0.0014026315417140721, "train/perplexity": 8.708960419448639, "train/grad_norm": 0.11474609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1141011.0852293838, "perf/iters_per_sec": 0.544076483359043, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.837976884841919, "data/tokens_consumed": 95443484672, "data/tokens_consumed_B": 95.443484672, "train/loss_slope": -1.6700441151311935e-05} {"step": 45520, "timestamp": 1778243743.4176064, "train/loss": 2.1245278120040894, "train/z_loss": 0.0014019537484273314, "train/perplexity": 8.368944839712388, "train/grad_norm": 0.1025390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026693.7401626843, "perf/iters_per_sec": 0.966402883607237, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347651243209839, "data/tokens_consumed": 95464456192, "data/tokens_consumed_B": 95.464456192, "train/loss_slope": -2.0959191861683422e-05} {"step": 45525, "timestamp": 1778243749.1928368, "eos/sharpness": 77.49269008636473, "eos/L0_probe": 1.983916997909546, "eos/L_plus": 2.3207502365112305, "eos/L_minus": 2.422010660171509, "eos/grad_norm": 0.26592978835105896, "eos/embed_grad_frac": 0.03327180817723274, "eos/time_s": 0.6117494106292725} {"step": 45525, "timestamp": 1778243750.5714526, "geo/rankme_last": 440.3236999511719, "geo/layer_0/stable_rank_q_proj": 19.407249450683594, "geo/layer_0/stable_rank_k_proj": 16.46729278564453, "geo/layer_0/stable_rank_o_proj": 48.357933044433594, "geo/layer_0/stable_rank_gate_proj": 135.1000518798828, "geo/layer_0/stable_rank_down_proj": 53.4445915222168, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06030143424868584, "geo/layer_0/attn_entropy_mean": 6.196284294128418, "geo/layer_0/attn_entropy_std": 0.3779895007610321, "geo/layer_7/stable_rank_q_proj": 42.57776641845703, "geo/layer_7/stable_rank_k_proj": 42.3453254699707, "geo/layer_7/stable_rank_o_proj": 94.85745239257812, "geo/layer_7/stable_rank_gate_proj": 88.31342315673828, "geo/layer_7/stable_rank_down_proj": 145.2845001220703, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4761158227920532, "geo/layer_7/attn_entropy_mean": 4.650712966918945, "geo/layer_7/attn_entropy_std": 0.819698691368103, "geo/layer_14/stable_rank_q_proj": 53.32278060913086, "geo/layer_14/stable_rank_k_proj": 38.16181182861328, "geo/layer_14/stable_rank_o_proj": 46.822105407714844, "geo/layer_14/stable_rank_gate_proj": 74.7947006225586, "geo/layer_14/stable_rank_down_proj": 132.3027801513672, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38992834091186523, "geo/layer_14/attn_entropy_mean": 5.519539833068848, "geo/layer_14/attn_entropy_std": 0.37380558252334595, "geo/layer_21/stable_rank_q_proj": 42.436668395996094, "geo/layer_21/stable_rank_k_proj": 30.49153709411621, "geo/layer_21/stable_rank_o_proj": 74.40179443359375, "geo/layer_21/stable_rank_gate_proj": 70.87410736083984, "geo/layer_21/stable_rank_down_proj": 54.25240707397461, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14575225114822388, "geo/layer_21/attn_entropy_mean": 5.683897495269775, "geo/layer_21/attn_entropy_std": 0.305367648601532, "geo/layer_27/stable_rank_q_proj": 42.70527267456055, "geo/layer_27/stable_rank_k_proj": 31.16370964050293, "geo/layer_27/stable_rank_o_proj": 116.242431640625, "geo/layer_27/stable_rank_gate_proj": 84.09599304199219, "geo/layer_27/stable_rank_down_proj": 130.55455017089844, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0882330909371376, "geo/layer_27/attn_entropy_mean": 4.259838581085205, "geo/layer_27/attn_entropy_std": 0.657893717288971, "attnres/final_alpha/block_0": 0.24250394105911255, "attnres/block_norm/0": 1.7336167097091675, "attnres/final_alpha/block_1": 0.005067398305982351, "attnres/block_norm/1": 42332.53515625, "attnres/final_alpha/block_2": 0.010910630226135254, "attnres/block_norm/2": 27092.560546875, "attnres/final_alpha/block_3": 0.012660602107644081, "attnres/block_norm/3": 49869.7890625, "attnres/final_alpha/block_4": 0.015674497932195663, "attnres/block_norm/4": 13577.6728515625, "attnres/final_alpha/block_5": 0.5957610607147217, "attnres/block_norm/5": 6236.98095703125, "attnres/final_alpha/block_6": 0.11742185056209564, "attnres/block_norm/6": 32835.20703125, "geo/tier1_time_s": 1.360621452331543, "geo/step": 45525.0, "geo/rankme_slope": 0.00010960325536464585} {"step": 45530, "timestamp": 1778243755.7482145, "train/loss": 2.1545068264007567, "train/z_loss": 0.001394050708040595, "train/perplexity": 8.623636180381434, "train/grad_norm": 0.2119140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1701815.5490990262, "perf/iters_per_sec": 0.8114888902182704, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.232302761077881, "data/tokens_consumed": 95485427712, "data/tokens_consumed_B": 95.485427712, "train/loss_slope": -1.7954254751265548e-05} {"step": 45540, "timestamp": 1778243766.1014483, "train/loss": 2.154561448097229, "train/z_loss": 0.001403662795200944, "train/perplexity": 8.624107230884036, "train/grad_norm": 0.09619140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026540.1662666926, "perf/iters_per_sec": 0.9663296538670981, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348435401916505, "data/tokens_consumed": 95506399232, "data/tokens_consumed_B": 95.506399232, "train/loss_slope": -1.9844685207427634e-05} {"step": 45550, "timestamp": 1778243776.438406, "grad/layer_0/attn": 0.0028379973955452442, "grad/layer_0/mlp": 0.0028786445036530495, "grad/layer_0/attn_mlp_ratio": 0.9858797407445771, "grad/layer_4/attn": 0.0018339003436267376, "grad/layer_4/mlp": 0.0024753331672400236, "grad/layer_4/attn_mlp_ratio": 0.7408700751117532, "grad/layer_8/attn": 0.004610446747392416, "grad/layer_8/mlp": 0.003725317306816578, "grad/layer_8/attn_mlp_ratio": 1.2375983691902952, "grad/layer_12/attn": 0.006094442680478096, "grad/layer_12/mlp": 0.006783830001950264, "grad/layer_12/attn_mlp_ratio": 0.8983778468635325, "grad/layer_16/attn": 0.003546211402863264, "grad/layer_16/mlp": 0.004450878594070673, "grad/layer_16/attn_mlp_ratio": 0.7967441142773495, "grad/layer_20/attn": 0.004037707578390837, "grad/layer_20/mlp": 0.006188124883919954, "grad/layer_20/attn_mlp_ratio": 0.6524928938705913, "grad/layer_24/attn": 0.011912924237549305, "grad/layer_24/mlp": 0.009411254897713661, "grad/layer_24/attn_mlp_ratio": 1.2658167524356094, "grad/layer_27/attn": 0.008892436511814594, "grad/layer_27/mlp": 0.009933198802173138, "grad/layer_27/attn_mlp_ratio": 0.8952238447444307} {"step": 45550, "timestamp": 1778243776.4527628, "train/loss": 2.156355953216553, "train/z_loss": 0.0013930527726188301, "train/perplexity": 8.639597129654, "train/grad_norm": 0.18359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027471.492316181, "perf/iters_per_sec": 0.9667737447338968, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343681812286376, "data/tokens_consumed": 95527370752, "data/tokens_consumed_B": 95.527370752, "train/loss_slope": -1.966419155591252e-05} {"step": 45560, "timestamp": 1778243786.7948956, "train/loss": 2.1656091928482057, "train/z_loss": 0.0014071067911572755, "train/perplexity": 8.719912407406147, "train/grad_norm": 0.111328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028806.0711421142, "perf/iters_per_sec": 0.9674101215086528, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0336877584457398, "data/tokens_consumed": 95548342272, "data/tokens_consumed_B": 95.548342272, "train/loss_slope": -1.798916338014418e-05} {"step": 45570, "timestamp": 1778243797.1412494, "train/loss": 2.1609939098358155, "train/z_loss": 0.0013937964802607894, "train/perplexity": 8.679760271810194, "train/grad_norm": 0.15234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027963.4252521014, "perf/iters_per_sec": 0.9670083166370875, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341172695159913, "data/tokens_consumed": 95569313792, "data/tokens_consumed_B": 95.569313792, "train/loss_slope": -1.4831647950180158e-05} {"step": 45580, "timestamp": 1778243807.4875364, "train/loss": 2.1872388124465942, "train/z_loss": 0.0013860982959158718, "train/perplexity": 8.910575345863256, "train/grad_norm": 0.150390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028567.8234788284, "perf/iters_per_sec": 0.967296516169943, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0338091611862184, "data/tokens_consumed": 95590285312, "data/tokens_consumed_B": 95.590285312, "train/loss_slope": -1.4477932785782717e-05} {"step": 45590, "timestamp": 1778243817.830302, "train/loss": 2.1302448868751527, "train/z_loss": 0.0014006979414261877, "train/perplexity": 8.416927754217028, "train/grad_norm": 0.11279296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028649.3697480194, "perf/iters_per_sec": 0.9673354004612061, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0337676048278808, "data/tokens_consumed": 95611256832, "data/tokens_consumed_B": 95.611256832, "train/loss_slope": -1.5932375426911905e-05} {"step": 45600, "timestamp": 1778243828.1631222, "grad/layer_0/attn": 0.0028929358813911676, "grad/layer_0/mlp": 0.002908471040427685, "grad/layer_0/attn_mlp_ratio": 0.9946586167486493, "grad/layer_4/attn": 0.0018802785780280828, "grad/layer_4/mlp": 0.0023493103217333555, "grad/layer_4/attn_mlp_ratio": 0.8003533975900834, "grad/layer_8/attn": 0.008625398389995098, "grad/layer_8/mlp": 0.003446450224146247, "grad/layer_8/attn_mlp_ratio": 2.5026904724447983, "grad/layer_12/attn": 0.005346955731511116, "grad/layer_12/mlp": 0.006926464848220348, "grad/layer_12/attn_mlp_ratio": 0.7719602671034864, "grad/layer_16/attn": 0.0038108041044324636, "grad/layer_16/mlp": 0.0044822730123996735, "grad/layer_16/attn_mlp_ratio": 0.8501945349760882, "grad/layer_20/attn": 0.0068993219174444675, "grad/layer_20/mlp": 0.006448902189731598, "grad/layer_20/attn_mlp_ratio": 1.069844387071899, "grad/layer_24/attn": 0.014383620582520962, "grad/layer_24/mlp": 0.012801329605281353, "grad/layer_24/attn_mlp_ratio": 1.123603634440164, "grad/layer_27/attn": 0.005263343453407288, "grad/layer_27/mlp": 0.012333117425441742, "grad/layer_27/attn_mlp_ratio": 0.4267650448112281} {"step": 45600, "timestamp": 1778243828.745803, "eos/sharpness": 74.42147731781004, "eos/L0_probe": 1.9824105501174927, "eos/L_plus": 2.3199832439422607, "eos/L_minus": 2.389052629470825, "eos/grad_norm": 0.22161546349525452, "eos/embed_grad_frac": 0.04098793864250183, "eos/time_s": 0.579669713973999} {"step": 45600, "timestamp": 1778243828.7660537, "train/loss": 2.1509564876556397, "train/z_loss": 0.0013987781945616006, "train/perplexity": 8.593073636521398, "train/grad_norm": 0.2216796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1918514.3471245414, "perf/iters_per_sec": 0.91481892925479, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0931124925613402, "data/tokens_consumed": 95632228352, "data/tokens_consumed_B": 95.632228352, "train/loss_slope": -1.7163915402389234e-05} {"step": 45600, "timestamp": 1778243830.1327426, "geo/rankme_last": 439.8034973144531, "geo/layer_0/stable_rank_q_proj": 19.403287887573242, "geo/layer_0/stable_rank_k_proj": 16.432071685791016, "geo/layer_0/stable_rank_o_proj": 48.27827835083008, "geo/layer_0/stable_rank_gate_proj": 134.82150268554688, "geo/layer_0/stable_rank_down_proj": 53.41054153442383, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06544824689626694, "geo/layer_0/attn_entropy_mean": 6.188080310821533, "geo/layer_0/attn_entropy_std": 0.3783688545227051, "geo/layer_7/stable_rank_q_proj": 42.53171157836914, "geo/layer_7/stable_rank_k_proj": 42.339881896972656, "geo/layer_7/stable_rank_o_proj": 94.75554656982422, "geo/layer_7/stable_rank_gate_proj": 88.2821273803711, "geo/layer_7/stable_rank_down_proj": 145.29197692871094, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4690167009830475, "geo/layer_7/attn_entropy_mean": 4.6706109046936035, "geo/layer_7/attn_entropy_std": 0.820381760597229, "geo/layer_14/stable_rank_q_proj": 53.301456451416016, "geo/layer_14/stable_rank_k_proj": 38.25039291381836, "geo/layer_14/stable_rank_o_proj": 46.884090423583984, "geo/layer_14/stable_rank_gate_proj": 74.70013427734375, "geo/layer_14/stable_rank_down_proj": 132.24989318847656, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39351773262023926, "geo/layer_14/attn_entropy_mean": 5.523000717163086, "geo/layer_14/attn_entropy_std": 0.38875460624694824, "geo/layer_21/stable_rank_q_proj": 42.43720245361328, "geo/layer_21/stable_rank_k_proj": 30.511327743530273, "geo/layer_21/stable_rank_o_proj": 74.38130950927734, "geo/layer_21/stable_rank_gate_proj": 70.98543548583984, "geo/layer_21/stable_rank_down_proj": 54.14027786254883, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14884637296199799, "geo/layer_21/attn_entropy_mean": 5.699858665466309, "geo/layer_21/attn_entropy_std": 0.2888983488082886, "geo/layer_27/stable_rank_q_proj": 42.744476318359375, "geo/layer_27/stable_rank_k_proj": 31.233470916748047, "geo/layer_27/stable_rank_o_proj": 116.161376953125, "geo/layer_27/stable_rank_gate_proj": 84.0612564086914, "geo/layer_27/stable_rank_down_proj": 130.69235229492188, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08748620748519897, "geo/layer_27/attn_entropy_mean": 4.262702941894531, "geo/layer_27/attn_entropy_std": 0.6832215189933777, "attnres/final_alpha/block_0": 0.24328327178955078, "attnres/block_norm/0": 1.7337968349456787, "attnres/final_alpha/block_1": 0.005131972022354603, "attnres/block_norm/1": 42308.66015625, "attnres/final_alpha/block_2": 0.011404098942875862, "attnres/block_norm/2": 27036.69140625, "attnres/final_alpha/block_3": 0.013197943568229675, "attnres/block_norm/3": 50203.48046875, "attnres/final_alpha/block_4": 0.015851067379117012, "attnres/block_norm/4": 13536.40234375, "attnres/final_alpha/block_5": 0.5938364267349243, "attnres/block_norm/5": 6273.47265625, "attnres/final_alpha/block_6": 0.11729525029659271, "attnres/block_norm/6": 32910.3671875, "geo/tier1_time_s": 1.3630614280700684, "geo/step": 45600.0, "geo/rankme_slope": 0.0001380563944327731} {"step": 45610, "timestamp": 1778243841.1206179, "train/loss": 2.2342560291290283, "train/z_loss": 0.001387945469468832, "train/perplexity": 9.339530927915737, "train/grad_norm": 0.08984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1697954.209232036, "perf/iters_per_sec": 0.8096476598892384, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2351051568984985, "data/tokens_consumed": 95653199872, "data/tokens_consumed_B": 95.653199872, "train/loss_slope": -1.4923854636745402e-05} {"step": 45620, "timestamp": 1778243851.4683275, "train/loss": 2.1057451963424683, "train/z_loss": 0.0014039426925592125, "train/perplexity": 8.213221192081297, "train/grad_norm": 0.2470703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027945.6116567184, "perf/iters_per_sec": 0.9669998224528877, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034126353263855, "data/tokens_consumed": 95674171392, "data/tokens_consumed_B": 95.674171392, "train/loss_slope": -1.8650928417770058e-05} {"step": 45630, "timestamp": 1778243861.817224, "train/loss": 2.1824369192123414, "train/z_loss": 0.0013828527415171265, "train/perplexity": 8.867890280978747, "train/grad_norm": 0.10205078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027424.1533243814, "perf/iters_per_sec": 0.9667511717435748, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343923330307008, "data/tokens_consumed": 95695142912, "data/tokens_consumed_B": 95.695142912, "train/loss_slope": -1.5010896946551957e-05} {"step": 45640, "timestamp": 1778243872.1605463, "train/loss": 2.1923227310180664, "train/z_loss": 0.0013876794138923288, "train/perplexity": 8.955991333117828, "train/grad_norm": 0.2890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029027.8995093731, "perf/iters_per_sec": 0.9675158975169054, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0335747480392456, "data/tokens_consumed": 95716114432, "data/tokens_consumed_B": 95.716114432, "train/loss_slope": -1.4676081193591479e-05} {"step": 45650, "timestamp": 1778243882.4975274, "grad/layer_0/attn": 0.0024525756016373634, "grad/layer_0/mlp": 0.002663287101313472, "grad/layer_0/attn_mlp_ratio": 0.9208828850406406, "grad/layer_4/attn": 0.0024653382133692503, "grad/layer_4/mlp": 0.0023574463557451963, "grad/layer_4/attn_mlp_ratio": 1.0457663661293843, "grad/layer_8/attn": 0.004165581427514553, "grad/layer_8/mlp": 0.003603103570640087, "grad/layer_8/attn_mlp_ratio": 1.1561092347849493, "grad/layer_12/attn": 0.005247647874057293, "grad/layer_12/mlp": 0.0065853167325258255, "grad/layer_12/attn_mlp_ratio": 0.796870979409587, "grad/layer_16/attn": 0.004876662045717239, "grad/layer_16/mlp": 0.004382494371384382, "grad/layer_16/attn_mlp_ratio": 1.1127594290328346, "grad/layer_20/attn": 0.005255690775811672, "grad/layer_20/mlp": 0.005759058985859156, "grad/layer_20/attn_mlp_ratio": 0.9125953905763082, "grad/layer_24/attn": 0.006962559185922146, "grad/layer_24/mlp": 0.008588125929236412, "grad/layer_24/attn_mlp_ratio": 0.8107192607816448, "grad/layer_27/attn": 0.007404479663819075, "grad/layer_27/mlp": 0.006982097867876291, "grad/layer_27/attn_mlp_ratio": 1.0604949540791473} {"step": 45650, "timestamp": 1778243882.5120618, "train/loss": 2.213497185707092, "train/z_loss": 0.0013903434388339519, "train/perplexity": 9.14765155516816, "train/grad_norm": 0.09228515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026881.0113393299, "perf/iters_per_sec": 0.966492181462922, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346695184707642, "data/tokens_consumed": 95737085952, "data/tokens_consumed_B": 95.737085952, "train/loss_slope": -1.0615737896726712e-05} {"step": 45660, "timestamp": 1778243892.858855, "train/loss": 2.144033658504486, "train/z_loss": 0.0013992596184834837, "train/perplexity": 8.533790695452614, "train/grad_norm": 0.171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028006.3474839153, "perf/iters_per_sec": 0.9670287835521294, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340953826904298, "data/tokens_consumed": 95758057472, "data/tokens_consumed_B": 95.758057472, "train/loss_slope": -1.2654419879529953e-05} {"step": 45670, "timestamp": 1778243903.210028, "train/loss": 2.16395161151886, "train/z_loss": 0.0013944924459792674, "train/perplexity": 8.705470416100377, "train/grad_norm": 0.29296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027429.62079492, "perf/iters_per_sec": 0.966753778836689, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343895435333252, "data/tokens_consumed": 95779028992, "data/tokens_consumed_B": 95.779028992, "train/loss_slope": -8.480754565305643e-06} {"step": 45675, "timestamp": 1778243908.9697745, "eos/sharpness": 58.000206947326646, "eos/L0_probe": 1.982394814491272, "eos/L_plus": 2.3285534381866455, "eos/L_minus": 2.216238260269165, "eos/grad_norm": 0.1436583548784256, "eos/embed_grad_frac": 0.11280984431505203, "eos/time_s": 0.5972707271575928} {"step": 45675, "timestamp": 1778243910.3431365, "geo/rankme_last": 440.23468017578125, "geo/layer_0/stable_rank_q_proj": 19.41387939453125, "geo/layer_0/stable_rank_k_proj": 16.375160217285156, "geo/layer_0/stable_rank_o_proj": 48.27760314941406, "geo/layer_0/stable_rank_gate_proj": 134.87185668945312, "geo/layer_0/stable_rank_down_proj": 53.306907653808594, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05805053561925888, "geo/layer_0/attn_entropy_mean": 6.192148208618164, "geo/layer_0/attn_entropy_std": 0.3805270195007324, "geo/layer_7/stable_rank_q_proj": 42.67911148071289, "geo/layer_7/stable_rank_k_proj": 42.51233673095703, "geo/layer_7/stable_rank_o_proj": 94.75077056884766, "geo/layer_7/stable_rank_gate_proj": 88.28655242919922, "geo/layer_7/stable_rank_down_proj": 145.1873779296875, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4768792688846588, "geo/layer_7/attn_entropy_mean": 4.651738166809082, "geo/layer_7/attn_entropy_std": 0.8247880935668945, "geo/layer_14/stable_rank_q_proj": 53.33772659301758, "geo/layer_14/stable_rank_k_proj": 38.232357025146484, "geo/layer_14/stable_rank_o_proj": 46.92654800415039, "geo/layer_14/stable_rank_gate_proj": 74.62230682373047, "geo/layer_14/stable_rank_down_proj": 132.2498321533203, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.41054630279541016, "geo/layer_14/attn_entropy_mean": 5.523241996765137, "geo/layer_14/attn_entropy_std": 0.36920279264450073, "geo/layer_21/stable_rank_q_proj": 42.4183349609375, "geo/layer_21/stable_rank_k_proj": 30.419063568115234, "geo/layer_21/stable_rank_o_proj": 74.1878433227539, "geo/layer_21/stable_rank_gate_proj": 71.08343505859375, "geo/layer_21/stable_rank_down_proj": 54.06117248535156, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14603577554225922, "geo/layer_21/attn_entropy_mean": 5.696934223175049, "geo/layer_21/attn_entropy_std": 0.2977047860622406, "geo/layer_27/stable_rank_q_proj": 42.78463363647461, "geo/layer_27/stable_rank_k_proj": 31.258028030395508, "geo/layer_27/stable_rank_o_proj": 116.07307434082031, "geo/layer_27/stable_rank_gate_proj": 84.0103530883789, "geo/layer_27/stable_rank_down_proj": 130.7574005126953, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0852065458893776, "geo/layer_27/attn_entropy_mean": 4.266867637634277, "geo/layer_27/attn_entropy_std": 0.6817213892936707, "attnres/final_alpha/block_0": 0.24096792936325073, "attnres/block_norm/0": 1.7339775562286377, "attnres/final_alpha/block_1": 0.005028689280152321, "attnres/block_norm/1": 42268.41015625, "attnres/final_alpha/block_2": 0.01103158574551344, "attnres/block_norm/2": 27094.03515625, "attnres/final_alpha/block_3": 0.012773361057043076, "attnres/block_norm/3": 50431.265625, "attnres/final_alpha/block_4": 0.015508744865655899, "attnres/block_norm/4": 13543.35546875, "attnres/final_alpha/block_5": 0.6016648411750793, "attnres/block_norm/5": 6156.259765625, "attnres/final_alpha/block_6": 0.11302486807107925, "attnres/block_norm/6": 33042.71875, "geo/tier1_time_s": 1.355043888092041, "geo/step": 45675.0, "geo/rankme_slope": 0.00016089345894607844} {"step": 45680, "timestamp": 1778243915.529001, "train/loss": 2.2192585945129393, "train/z_loss": 0.001387313415762037, "train/perplexity": 9.200507030184296, "train/grad_norm": 0.1162109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1703158.6932325892, "perf/iters_per_sec": 0.8121293512499758, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.231330943107605, "data/tokens_consumed": 95800000512, "data/tokens_consumed_B": 95.800000512, "train/loss_slope": -3.4560488324032687e-06} {"step": 45690, "timestamp": 1778243925.8718324, "train/loss": 2.1583290100097656, "train/z_loss": 0.001404887728858739, "train/perplexity": 8.656660373300147, "train/grad_norm": 0.2275390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028608.1981214308, "perf/iters_per_sec": 0.967315768299785, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0337885856628417, "data/tokens_consumed": 95820972032, "data/tokens_consumed_B": 95.820972032, "train/loss_slope": -4.872492595080917e-06} {"step": 45700, "timestamp": 1778243936.2050323, "grad/layer_0/attn": 0.0038524551782757044, "grad/layer_0/mlp": 0.0033582502510398626, "grad/layer_0/attn_mlp_ratio": 1.147161401198935, "grad/layer_4/attn": 0.002114696428179741, "grad/layer_4/mlp": 0.0026261683087795973, "grad/layer_4/attn_mlp_ratio": 0.8052402203567967, "grad/layer_8/attn": 0.004773719236254692, "grad/layer_8/mlp": 0.0039003523997962475, "grad/layer_8/attn_mlp_ratio": 1.223919949928646, "grad/layer_12/attn": 0.004381290636956692, "grad/layer_12/mlp": 0.006396970245987177, "grad/layer_12/attn_mlp_ratio": 0.6849008827600832, "grad/layer_16/attn": 0.008916046470403671, "grad/layer_16/mlp": 0.005102331750094891, "grad/layer_16/attn_mlp_ratio": 1.7474454293359722, "grad/layer_20/attn": 0.0064327893778681755, "grad/layer_20/mlp": 0.007319079712033272, "grad/layer_20/attn_mlp_ratio": 0.8789068493681474, "grad/layer_24/attn": 0.019270017743110657, "grad/layer_24/mlp": 0.013853811658918858, "grad/layer_24/attn_mlp_ratio": 1.3909542065709775, "grad/layer_27/attn": 0.013402407988905907, "grad/layer_27/mlp": 0.013231249526143074, "grad/layer_27/attn_mlp_ratio": 1.012935918193596} {"step": 45700, "timestamp": 1778243936.2197094, "train/loss": 2.142380392551422, "train/z_loss": 0.0013984059100039304, "train/perplexity": 8.51969372607593, "train/grad_norm": 0.27734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028103.0928148245, "perf/iters_per_sec": 0.9670749153207896, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340460538864136, "data/tokens_consumed": 95841943552, "data/tokens_consumed_B": 95.841943552, "train/loss_slope": -8.169398377902392e-06} {"step": 45710, "timestamp": 1778243946.5673296, "train/loss": 2.214389967918396, "train/z_loss": 0.0013869663001969458, "train/perplexity": 9.1558220624509, "train/grad_norm": 0.0888671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027835.0434518312, "perf/iters_per_sec": 0.9669470994242817, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341827392578125, "data/tokens_consumed": 95862915072, "data/tokens_consumed_B": 95.862915072, "train/loss_slope": -2.8622248707109966e-06} {"step": 45720, "timestamp": 1778243956.9185011, "train/loss": 2.225513291358948, "train/z_loss": 0.001401780336163938, "train/perplexity": 9.258233755877297, "train/grad_norm": 0.10595703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027695.9269764246, "perf/iters_per_sec": 0.9668807635194896, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342536926269532, "data/tokens_consumed": 95883886592, "data/tokens_consumed_B": 95.883886592, "train/loss_slope": -1.0677305313696177e-07} {"step": 45730, "timestamp": 1778243967.292711, "train/loss": 2.180953621864319, "train/z_loss": 0.0013957561808638276, "train/perplexity": 8.854746313458415, "train/grad_norm": 0.1689453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022955.3377324557, "perf/iters_per_sec": 0.9646202744161871, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036677360534668, "data/tokens_consumed": 95904858112, "data/tokens_consumed_B": 95.904858112, "train/loss_slope": -5.319720304588463e-07} {"step": 45740, "timestamp": 1778243977.6716063, "train/loss": 2.182462382316589, "train/z_loss": 0.0013821232132613659, "train/perplexity": 8.868116087868291, "train/grad_norm": 0.08642578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021567.7165871095, "perf/iters_per_sec": 0.9639586050925777, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373889446258544, "data/tokens_consumed": 95925829632, "data/tokens_consumed_B": 95.925829632, "train/loss_slope": 3.8969127616111595e-07} {"step": 45750, "timestamp": 1778243988.046967, "grad/layer_0/attn": 0.003042249707505107, "grad/layer_0/mlp": 0.003041299758478999, "grad/layer_0/attn_mlp_ratio": 1.0003123167955503, "grad/layer_4/attn": 0.0029801279306411743, "grad/layer_4/mlp": 0.0026111688930541277, "grad/layer_4/attn_mlp_ratio": 1.1413002906240464, "grad/layer_8/attn": 0.006300413515418768, "grad/layer_8/mlp": 0.003795530181378126, "grad/layer_8/attn_mlp_ratio": 1.659956066305217, "grad/layer_12/attn": 0.004301801789551973, "grad/layer_12/mlp": 0.006598247680813074, "grad/layer_12/attn_mlp_ratio": 0.6519612376578379, "grad/layer_16/attn": 0.003933634143322706, "grad/layer_16/mlp": 0.004453727509826422, "grad/layer_16/attn_mlp_ratio": 0.8832228838251773, "grad/layer_20/attn": 0.003251260844990611, "grad/layer_20/mlp": 0.006127865985035896, "grad/layer_20/attn_mlp_ratio": 0.5305698264082684, "grad/layer_24/attn": 0.008123982697725296, "grad/layer_24/mlp": 0.009752187877893448, "grad/layer_24/attn_mlp_ratio": 0.8330420533464884, "grad/layer_27/attn": 0.014931034296751022, "grad/layer_27/mlp": 0.008524755015969276, "grad/layer_27/attn_mlp_ratio": 1.7514912855128206} {"step": 45750, "timestamp": 1778243988.6476147, "eos/sharpness": 10.979199409484862, "eos/L0_probe": 1.9842008352279663, "eos/L_plus": 2.036813974380493, "eos/L_minus": 2.041379690170288, "eos/grad_norm": 0.11483237147331238, "eos/embed_grad_frac": 0.1806614100933075, "eos/time_s": 0.5977671146392822} {"step": 45750, "timestamp": 1778243988.6675358, "train/loss": 2.166086506843567, "train/z_loss": 0.001405425788834691, "train/perplexity": 8.724075537117525, "train/grad_norm": 0.11474609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1908388.1041091506, "perf/iters_per_sec": 0.9099903603120568, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.098912739753723, "data/tokens_consumed": 95946801152, "data/tokens_consumed_B": 95.946801152, "train/loss_slope": -1.7690852208428214e-06} {"step": 45750, "timestamp": 1778243990.0284355, "geo/rankme_last": 438.4882507324219, "geo/layer_0/stable_rank_q_proj": 19.41905975341797, "geo/layer_0/stable_rank_k_proj": 16.41330337524414, "geo/layer_0/stable_rank_o_proj": 48.18602752685547, "geo/layer_0/stable_rank_gate_proj": 134.84793090820312, "geo/layer_0/stable_rank_down_proj": 53.432838439941406, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06135053560137749, "geo/layer_0/attn_entropy_mean": 6.193875312805176, "geo/layer_0/attn_entropy_std": 0.38413259387016296, "geo/layer_7/stable_rank_q_proj": 42.647300720214844, "geo/layer_7/stable_rank_k_proj": 42.448795318603516, "geo/layer_7/stable_rank_o_proj": 94.75992584228516, "geo/layer_7/stable_rank_gate_proj": 88.28251647949219, "geo/layer_7/stable_rank_down_proj": 145.14173889160156, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.49937793612480164, "geo/layer_7/attn_entropy_mean": 4.660676002502441, "geo/layer_7/attn_entropy_std": 0.8199586868286133, "geo/layer_14/stable_rank_q_proj": 53.2202033996582, "geo/layer_14/stable_rank_k_proj": 38.210567474365234, "geo/layer_14/stable_rank_o_proj": 46.94259262084961, "geo/layer_14/stable_rank_gate_proj": 74.52366638183594, "geo/layer_14/stable_rank_down_proj": 132.06585693359375, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38758087158203125, "geo/layer_14/attn_entropy_mean": 5.494043350219727, "geo/layer_14/attn_entropy_std": 0.38592901825904846, "geo/layer_21/stable_rank_q_proj": 42.45033645629883, "geo/layer_21/stable_rank_k_proj": 30.442424774169922, "geo/layer_21/stable_rank_o_proj": 74.2092514038086, "geo/layer_21/stable_rank_gate_proj": 70.9354476928711, "geo/layer_21/stable_rank_down_proj": 54.038856506347656, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1483326554298401, "geo/layer_21/attn_entropy_mean": 5.698269844055176, "geo/layer_21/attn_entropy_std": 0.29933035373687744, "geo/layer_27/stable_rank_q_proj": 42.72208786010742, "geo/layer_27/stable_rank_k_proj": 31.392396926879883, "geo/layer_27/stable_rank_o_proj": 116.00458526611328, "geo/layer_27/stable_rank_gate_proj": 84.07056427001953, "geo/layer_27/stable_rank_down_proj": 130.50169372558594, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08804966509342194, "geo/layer_27/attn_entropy_mean": 4.245600700378418, "geo/layer_27/attn_entropy_std": 0.6848800778388977, "attnres/final_alpha/block_0": 0.24185490608215332, "attnres/block_norm/0": 1.7340831756591797, "attnres/final_alpha/block_1": 0.005021337885409594, "attnres/block_norm/1": 42123.71484375, "attnres/final_alpha/block_2": 0.010866312310099602, "attnres/block_norm/2": 27178.658203125, "attnres/final_alpha/block_3": 0.013023528270423412, "attnres/block_norm/3": 50566.36328125, "attnres/final_alpha/block_4": 0.01542515680193901, "attnres/block_norm/4": 13576.109375, "attnres/final_alpha/block_5": 0.5999707579612732, "attnres/block_norm/5": 6207.2734375, "attnres/final_alpha/block_6": 0.11383805423974991, "attnres/block_norm/6": 33248.35546875, "geo/tier1_time_s": 1.357520341873169, "geo/step": 45750.0, "geo/rankme_slope": 0.0001421198557548019} {"step": 45760, "timestamp": 1778244000.4138074, "train/loss": 2.156744146347046, "train/z_loss": 0.0014017888344824315, "train/perplexity": 8.642951612961522, "train/grad_norm": 0.10498046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1785901.001222444, "perf/iters_per_sec": 0.8515839582550259, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1742823362350463, "data/tokens_consumed": 95967772672, "data/tokens_consumed_B": 95.967772672, "train/loss_slope": -3.6962350424152424e-06} {"step": 45770, "timestamp": 1778244010.7905571, "train/loss": 2.1502069354057314, "train/z_loss": 0.0013869537855498494, "train/perplexity": 8.586635092157668, "train/grad_norm": 0.1533203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021979.6291173205, "perf/iters_per_sec": 0.9641550202929118, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037177610397339, "data/tokens_consumed": 95988744192, "data/tokens_consumed_B": 95.988744192, "train/loss_slope": -2.025575720795363e-06} {"step": 45780, "timestamp": 1778244021.14188, "train/loss": 2.1590574383735657, "train/z_loss": 0.0013854504097253085, "train/perplexity": 8.6629684274557, "train/grad_norm": 0.095703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027279.4863560533, "perf/iters_per_sec": 0.9666821891575114, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344661474227905, "data/tokens_consumed": 96009715712, "data/tokens_consumed_B": 96.009715712, "train/loss_slope": 1.8888463353094395e-07} {"step": 45790, "timestamp": 1778244031.5027878, "train/loss": 2.203900384902954, "train/z_loss": 0.0013893613358959556, "train/perplexity": 9.060283263951558, "train/grad_norm": 0.25, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025180.7291697077, "perf/iters_per_sec": 0.965681423745016, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0355381965637207, "data/tokens_consumed": 96030687232, "data/tokens_consumed_B": 96.030687232, "train/loss_slope": 3.288517597735257e-06} {"step": 45800, "timestamp": 1778244041.8475833, "grad/layer_0/attn": 0.002934239571914077, "grad/layer_0/mlp": 0.0030095370020717382, "grad/layer_0/attn_mlp_ratio": 0.9749803615626379, "grad/layer_4/attn": 0.002848806092515588, "grad/layer_4/mlp": 0.002520335605368018, "grad/layer_4/attn_mlp_ratio": 1.1303280298921952, "grad/layer_8/attn": 0.00390930799767375, "grad/layer_8/mlp": 0.0036546506453305483, "grad/layer_8/attn_mlp_ratio": 1.0696803251770557, "grad/layer_12/attn": 0.004569696262478828, "grad/layer_12/mlp": 0.007025272119790316, "grad/layer_12/attn_mlp_ratio": 0.6504653655421229, "grad/layer_16/attn": 0.003613788401708007, "grad/layer_16/mlp": 0.0048684170469641685, "grad/layer_16/attn_mlp_ratio": 0.742292267202592, "grad/layer_20/attn": 0.0042422739788889885, "grad/layer_20/mlp": 0.006921484600752592, "grad/layer_20/attn_mlp_ratio": 0.6129138706942044, "grad/layer_24/attn": 0.01576494425535202, "grad/layer_24/mlp": 0.015524420887231827, "grad/layer_24/attn_mlp_ratio": 1.0154932198964466, "grad/layer_27/attn": 0.008201595395803452, "grad/layer_27/mlp": 0.015554091893136501, "grad/layer_27/attn_mlp_ratio": 0.5272950294637926} {"step": 45800, "timestamp": 1778244041.8623962, "train/loss": 2.1642513275146484, "train/z_loss": 0.0013955148053355515, "train/perplexity": 8.708079975878816, "train/grad_norm": 0.2197265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025345.009523695, "perf/iters_per_sec": 0.9657597587221599, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354542016983033, "data/tokens_consumed": 96051658752, "data/tokens_consumed_B": 96.051658752, "train/loss_slope": 4.3371919680028005e-06} {"step": 45810, "timestamp": 1778244052.214663, "train/loss": 2.156385970115662, "train/z_loss": 0.0014002979383803903, "train/perplexity": 8.639856467461621, "train/grad_norm": 0.0908203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027364.3401880465, "perf/iters_per_sec": 0.9667226506176216, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344228506088258, "data/tokens_consumed": 96072630272, "data/tokens_consumed_B": 96.072630272, "train/loss_slope": 4.409130786297184e-06} {"step": 45820, "timestamp": 1778244062.5604968, "train/loss": 2.1862530946731566, "train/z_loss": 0.0014000885421410203, "train/perplexity": 8.901796360884882, "train/grad_norm": 0.16015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028187.127061813, "perf/iters_per_sec": 0.9671149859723153, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034003210067749, "data/tokens_consumed": 96093601792, "data/tokens_consumed_B": 96.093601792, "train/loss_slope": 8.858230133297004e-06} {"step": 45825, "timestamp": 1778244068.3706145, "eos/sharpness": 5.936050415039062, "eos/L0_probe": 1.9825679063796997, "eos/L_plus": 2.0152463912963867, "eos/L_minus": 2.0092499256134033, "eos/grad_norm": 0.09149535000324249, "eos/embed_grad_frac": 0.24886088073253632, "eos/time_s": 0.6403284072875977} {"step": 45825, "timestamp": 1778244069.7634952, "geo/rankme_last": 438.22235107421875, "geo/layer_0/stable_rank_q_proj": 19.42372703552246, "geo/layer_0/stable_rank_k_proj": 16.432151794433594, "geo/layer_0/stable_rank_o_proj": 48.29462432861328, "geo/layer_0/stable_rank_gate_proj": 134.50967407226562, "geo/layer_0/stable_rank_down_proj": 53.43577575683594, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06359481066465378, "geo/layer_0/attn_entropy_mean": 6.193166732788086, "geo/layer_0/attn_entropy_std": 0.38237902522087097, "geo/layer_7/stable_rank_q_proj": 42.50717544555664, "geo/layer_7/stable_rank_k_proj": 42.42399597167969, "geo/layer_7/stable_rank_o_proj": 94.90737915039062, "geo/layer_7/stable_rank_gate_proj": 88.38333892822266, "geo/layer_7/stable_rank_down_proj": 144.78057861328125, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4995959997177124, "geo/layer_7/attn_entropy_mean": 4.6343560218811035, "geo/layer_7/attn_entropy_std": 0.8433067798614502, "geo/layer_14/stable_rank_q_proj": 53.28153991699219, "geo/layer_14/stable_rank_k_proj": 38.29360580444336, "geo/layer_14/stable_rank_o_proj": 46.91569137573242, "geo/layer_14/stable_rank_gate_proj": 74.62854766845703, "geo/layer_14/stable_rank_down_proj": 131.90257263183594, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3903821110725403, "geo/layer_14/attn_entropy_mean": 5.531282424926758, "geo/layer_14/attn_entropy_std": 0.3746998608112335, "geo/layer_21/stable_rank_q_proj": 42.4827880859375, "geo/layer_21/stable_rank_k_proj": 30.507049560546875, "geo/layer_21/stable_rank_o_proj": 74.14031982421875, "geo/layer_21/stable_rank_gate_proj": 70.83303833007812, "geo/layer_21/stable_rank_down_proj": 54.019100189208984, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1537722498178482, "geo/layer_21/attn_entropy_mean": 5.704673767089844, "geo/layer_21/attn_entropy_std": 0.29033729434013367, "geo/layer_27/stable_rank_q_proj": 42.701820373535156, "geo/layer_27/stable_rank_k_proj": 31.412635803222656, "geo/layer_27/stable_rank_o_proj": 116.17967987060547, "geo/layer_27/stable_rank_gate_proj": 83.95098876953125, "geo/layer_27/stable_rank_down_proj": 130.68861389160156, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09688270837068558, "geo/layer_27/attn_entropy_mean": 4.245289325714111, "geo/layer_27/attn_entropy_std": 0.6715814471244812, "attnres/final_alpha/block_0": 0.24065566062927246, "attnres/block_norm/0": 1.7341430187225342, "attnres/final_alpha/block_1": 0.0048959581181406975, "attnres/block_norm/1": 42121.32421875, "attnres/final_alpha/block_2": 0.011031346395611763, "attnres/block_norm/2": 27028.69921875, "attnres/final_alpha/block_3": 0.012986266985535622, "attnres/block_norm/3": 50321.25, "attnres/final_alpha/block_4": 0.015382309444248676, "attnres/block_norm/4": 13594.130859375, "attnres/final_alpha/block_5": 0.6015337705612183, "attnres/block_norm/5": 6220.0244140625, "attnres/final_alpha/block_6": 0.11351475864648819, "attnres/block_norm/6": 33434.1953125, "geo/tier1_time_s": 1.3673079013824463, "geo/step": 45825.0, "geo/rankme_slope": 9.31153125312625e-05} {"step": 45830, "timestamp": 1778244074.9669666, "train/loss": 2.174769473075867, "train/z_loss": 0.0013936091680079698, "train/perplexity": 8.800156215393546, "train/grad_norm": 0.166015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1691186.4684302798, "perf/iters_per_sec": 0.8064205495978736, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2400477647781372, "data/tokens_consumed": 96114573312, "data/tokens_consumed_B": 96.114573312, "train/loss_slope": 1.0064784070589253e-05} {"step": 45840, "timestamp": 1778244085.3289154, "train/loss": 2.164113450050354, "train/z_loss": 0.00139999930979684, "train/perplexity": 8.706879410660212, "train/grad_norm": 0.1005859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025828.1658966213, "perf/iters_per_sec": 0.9659901456339938, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03520724773407, "data/tokens_consumed": 96135544832, "data/tokens_consumed_B": 96.135544832, "train/loss_slope": 1.1545214115089056e-05} {"step": 45850, "timestamp": 1778244095.6695294, "grad/layer_0/attn": 0.002637159079313278, "grad/layer_0/mlp": 0.0028139250352978706, "grad/layer_0/attn_mlp_ratio": 0.9371816777329149, "grad/layer_4/attn": 0.0025345096364617348, "grad/layer_4/mlp": 0.002344897948205471, "grad/layer_4/attn_mlp_ratio": 1.0808613356991668, "grad/layer_8/attn": 0.003829043125733733, "grad/layer_8/mlp": 0.003624532837420702, "grad/layer_8/attn_mlp_ratio": 1.05642387359806, "grad/layer_12/attn": 0.00452149473130703, "grad/layer_12/mlp": 0.006629609037190676, "grad/layer_12/attn_mlp_ratio": 0.6820152798967319, "grad/layer_16/attn": 0.0036672004498541355, "grad/layer_16/mlp": 0.004621054045855999, "grad/layer_16/attn_mlp_ratio": 0.7935852587104512, "grad/layer_20/attn": 0.00346851022914052, "grad/layer_20/mlp": 0.006020777393132448, "grad/layer_20/attn_mlp_ratio": 0.5760900868860955, "grad/layer_24/attn": 0.01109300646930933, "grad/layer_24/mlp": 0.01029994897544384, "grad/layer_24/attn_mlp_ratio": 1.0769962441616554, "grad/layer_27/attn": 0.00593921821564436, "grad/layer_27/mlp": 0.01121466513723135, "grad/layer_27/attn_mlp_ratio": 0.5295938924620651} {"step": 45850, "timestamp": 1778244095.6837568, "train/loss": 2.15016450881958, "train/z_loss": 0.0013928901404142379, "train/perplexity": 8.586270798272109, "train/grad_norm": 0.1591796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026373.0777869541, "perf/iters_per_sec": 0.9662499798712512, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349288702011108, "data/tokens_consumed": 96156516352, "data/tokens_consumed_B": 96.156516352, "train/loss_slope": 9.401813534834803e-06} {"step": 45860, "timestamp": 1778244106.0494077, "train/loss": 2.196699285507202, "train/z_loss": 0.0014023983967490494, "train/perplexity": 8.99527361501323, "train/grad_norm": 0.2373046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024228.7265459993, "perf/iters_per_sec": 0.9652274735193249, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0360252141952515, "data/tokens_consumed": 96177487872, "data/tokens_consumed_B": 96.177487872, "train/loss_slope": 1.1643357588799205e-05} {"step": 45870, "timestamp": 1778244116.4015992, "train/loss": 2.1986945152282713, "train/z_loss": 0.0013844113796949386, "train/perplexity": 9.013239169022729, "train/grad_norm": 0.189453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027302.427963534, "perf/iters_per_sec": 0.966693128568427, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344544410705567, "data/tokens_consumed": 96198459392, "data/tokens_consumed_B": 96.198459392, "train/loss_slope": 1.3750176804580034e-05} {"step": 45880, "timestamp": 1778244126.7536206, "train/loss": 2.181776738166809, "train/z_loss": 0.0013986307429149746, "train/perplexity": 8.862037799962465, "train/grad_norm": 0.2216796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027151.8919003878, "perf/iters_per_sec": 0.9666213473798694, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034531259536743, "data/tokens_consumed": 96219430912, "data/tokens_consumed_B": 96.219430912, "train/loss_slope": 1.5213222887077537e-05} {"step": 45890, "timestamp": 1778244137.108338, "train/loss": 2.121944308280945, "train/z_loss": 0.0013988802209496498, "train/perplexity": 8.34735154474952, "train/grad_norm": 0.1416015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026446.0443039637, "perf/iters_per_sec": 0.966284773017866, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348916053771973, "data/tokens_consumed": 96240402432, "data/tokens_consumed_B": 96.240402432, "train/loss_slope": 1.2033192888952908e-05} {"step": 45900, "timestamp": 1778244147.4471064, "grad/layer_0/attn": 0.002693741349503398, "grad/layer_0/mlp": 0.002720238408073783, "grad/layer_0/attn_mlp_ratio": 0.990259251719384, "grad/layer_4/attn": 0.0018989215604960918, "grad/layer_4/mlp": 0.0024699782952666283, "grad/layer_4/attn_mlp_ratio": 0.7688008786372837, "grad/layer_8/attn": 0.005884367506951094, "grad/layer_8/mlp": 0.0033957380801439285, "grad/layer_8/attn_mlp_ratio": 1.7328684353107826, "grad/layer_12/attn": 0.004768917802721262, "grad/layer_12/mlp": 0.0076009249314665794, "grad/layer_12/attn_mlp_ratio": 0.6274128192264398, "grad/layer_16/attn": 0.0036047562025487423, "grad/layer_16/mlp": 0.004368421155959368, "grad/layer_16/attn_mlp_ratio": 0.8251851163921431, "grad/layer_20/attn": 0.005421007052063942, "grad/layer_20/mlp": 0.006031431723386049, "grad/layer_20/attn_mlp_ratio": 0.8987927263050096, "grad/layer_24/attn": 0.005523622501641512, "grad/layer_24/mlp": 0.007456492632627487, "grad/layer_24/attn_mlp_ratio": 0.7407802434342509, "grad/layer_27/attn": 0.007813580334186554, "grad/layer_27/mlp": 0.006626684684306383, "grad/layer_27/attn_mlp_ratio": 1.1791084967087964} {"step": 45900, "timestamp": 1778244148.04704, "eos/sharpness": 32.96647071838378, "eos/L0_probe": 1.9820656776428223, "eos/L_plus": 2.192898750305176, "eos/L_minus": 2.1008973121643066, "eos/grad_norm": 0.10688283294439316, "eos/embed_grad_frac": 0.18098331987857819, "eos/time_s": 0.5970709323883057} {"step": 45900, "timestamp": 1778244148.0649152, "train/loss": 2.103903567790985, "train/z_loss": 0.00139440594939515, "train/perplexity": 8.198109408851133, "train/grad_norm": 0.10693359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1915356.9725590344, "perf/iters_per_sec": 0.9133133757395908, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.094914436340332, "data/tokens_consumed": 96261373952, "data/tokens_consumed_B": 96.261373952, "train/loss_slope": 7.498497559507093e-06} {"step": 45900, "timestamp": 1778244149.42969, "geo/rankme_last": 438.5963439941406, "geo/layer_0/stable_rank_q_proj": 19.423297882080078, "geo/layer_0/stable_rank_k_proj": 16.43943214416504, "geo/layer_0/stable_rank_o_proj": 48.26301956176758, "geo/layer_0/stable_rank_gate_proj": 134.31954956054688, "geo/layer_0/stable_rank_down_proj": 53.454254150390625, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05937008187174797, "geo/layer_0/attn_entropy_mean": 6.1926703453063965, "geo/layer_0/attn_entropy_std": 0.3844827115535736, "geo/layer_7/stable_rank_q_proj": 42.48162841796875, "geo/layer_7/stable_rank_k_proj": 42.462852478027344, "geo/layer_7/stable_rank_o_proj": 94.76994323730469, "geo/layer_7/stable_rank_gate_proj": 88.48807525634766, "geo/layer_7/stable_rank_down_proj": 144.87725830078125, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4803242087364197, "geo/layer_7/attn_entropy_mean": 4.652647972106934, "geo/layer_7/attn_entropy_std": 0.8045811653137207, "geo/layer_14/stable_rank_q_proj": 53.2578010559082, "geo/layer_14/stable_rank_k_proj": 38.250457763671875, "geo/layer_14/stable_rank_o_proj": 46.846858978271484, "geo/layer_14/stable_rank_gate_proj": 74.63911437988281, "geo/layer_14/stable_rank_down_proj": 132.01499938964844, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4062681496143341, "geo/layer_14/attn_entropy_mean": 5.537368297576904, "geo/layer_14/attn_entropy_std": 0.3781428337097168, "geo/layer_21/stable_rank_q_proj": 42.328338623046875, "geo/layer_21/stable_rank_k_proj": 30.483755111694336, "geo/layer_21/stable_rank_o_proj": 74.19783782958984, "geo/layer_21/stable_rank_gate_proj": 70.8539047241211, "geo/layer_21/stable_rank_down_proj": 53.974239349365234, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14708228409290314, "geo/layer_21/attn_entropy_mean": 5.715710163116455, "geo/layer_21/attn_entropy_std": 0.28145623207092285, "geo/layer_27/stable_rank_q_proj": 42.777469635009766, "geo/layer_27/stable_rank_k_proj": 31.387996673583984, "geo/layer_27/stable_rank_o_proj": 115.97346496582031, "geo/layer_27/stable_rank_gate_proj": 83.86286163330078, "geo/layer_27/stable_rank_down_proj": 130.7562255859375, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08662587404251099, "geo/layer_27/attn_entropy_mean": 4.2509074211120605, "geo/layer_27/attn_entropy_std": 0.6883984208106995, "attnres/final_alpha/block_0": 0.24098610877990723, "attnres/block_norm/0": 1.734316349029541, "attnres/final_alpha/block_1": 0.004937942139804363, "attnres/block_norm/1": 42390.2421875, "attnres/final_alpha/block_2": 0.010888689197599888, "attnres/block_norm/2": 27233.890625, "attnres/final_alpha/block_3": 0.01263921894133091, "attnres/block_norm/3": 50680.90625, "attnres/final_alpha/block_4": 0.015424631536006927, "attnres/block_norm/4": 13579.78515625, "attnres/final_alpha/block_5": 0.6021221280097961, "attnres/block_norm/5": 6166.3017578125, "attnres/final_alpha/block_6": 0.1130012720823288, "attnres/block_norm/6": 33286.203125, "geo/tier1_time_s": 1.3606014251708984, "geo/step": 45900.0, "geo/rankme_slope": 8.12670380652261e-05} {"step": 45910, "timestamp": 1778244159.7816591, "train/loss": 2.155991315841675, "train/z_loss": 0.0013956505339592696, "train/perplexity": 8.636447383929045, "train/grad_norm": 0.11865234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1790405.5716701888, "perf/iters_per_sec": 0.8537319048262543, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1713279008865356, "data/tokens_consumed": 96282345472, "data/tokens_consumed_B": 96.282345472, "train/loss_slope": 8.73812157722198e-06} {"step": 45920, "timestamp": 1778244170.1316886, "train/loss": 2.17003139257431, "train/z_loss": 0.0013858087360858918, "train/perplexity": 8.758558990139075, "train/grad_norm": 0.1171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027735.004811094, "perf/iters_per_sec": 0.966899397283122, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342337608337402, "data/tokens_consumed": 96303316992, "data/tokens_consumed_B": 96.303316992, "train/loss_slope": 1.1159104067679127e-05} {"step": 45930, "timestamp": 1778244180.4846683, "train/loss": 2.118153619766235, "train/z_loss": 0.0013893687748350203, "train/perplexity": 8.31576923229316, "train/grad_norm": 0.13671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027326.0242521958, "perf/iters_per_sec": 0.9667043801556567, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034442400932312, "data/tokens_consumed": 96324288512, "data/tokens_consumed_B": 96.324288512, "train/loss_slope": 1.0261670269123469e-05} {"step": 45940, "timestamp": 1778244190.829796, "train/loss": 2.164306330680847, "train/z_loss": 0.0013889804948121309, "train/perplexity": 8.708558961021728, "train/grad_norm": 0.26953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028385.1985915925, "perf/iters_per_sec": 0.9672094338376963, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0339022397994995, "data/tokens_consumed": 96345260032, "data/tokens_consumed_B": 96.345260032, "train/loss_slope": 1.0525428887569543e-05} {"step": 45950, "timestamp": 1778244201.167359, "grad/layer_0/attn": 0.0028838173020631075, "grad/layer_0/mlp": 0.003089017001911998, "grad/layer_0/attn_mlp_ratio": 0.9335711674364384, "grad/layer_4/attn": 0.002361219609156251, "grad/layer_4/mlp": 0.0026367302052676678, "grad/layer_4/attn_mlp_ratio": 0.8955104753940891, "grad/layer_8/attn": 0.007260595448315144, "grad/layer_8/mlp": 0.003783491672948003, "grad/layer_8/attn_mlp_ratio": 1.9190197531889617, "grad/layer_12/attn": 0.004911238793283701, "grad/layer_12/mlp": 0.006654984783381224, "grad/layer_12/attn_mlp_ratio": 0.7379789555267072, "grad/layer_16/attn": 0.004134435206651688, "grad/layer_16/mlp": 0.00507287448272109, "grad/layer_16/attn_mlp_ratio": 0.8150083624645765, "grad/layer_20/attn": 0.0046081398613750935, "grad/layer_20/mlp": 0.007345420308411121, "grad/layer_20/attn_mlp_ratio": 0.6273486887283385, "grad/layer_24/attn": 0.02341354452073574, "grad/layer_24/mlp": 0.016204016283154488, "grad/layer_24/attn_mlp_ratio": 1.4449222937762638, "grad/layer_27/attn": 0.005062805023044348, "grad/layer_27/mlp": 0.014968635514378548, "grad/layer_27/attn_mlp_ratio": 0.33822755483346306} {"step": 45950, "timestamp": 1778244201.1816769, "train/loss": 2.1560381650924683, "train/z_loss": 0.0013989871134981514, "train/perplexity": 8.63685200449651, "train/grad_norm": 0.27734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027383.3117850458, "perf/iters_per_sec": 0.9667316969800214, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344131708145141, "data/tokens_consumed": 96366231552, "data/tokens_consumed_B": 96.366231552, "train/loss_slope": 7.6214543985526566e-06} {"step": 45960, "timestamp": 1778244211.523052, "train/loss": 2.1496841430664064, "train/z_loss": 0.0013882621307857335, "train/perplexity": 8.582147238320928, "train/grad_norm": 0.166015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029435.038625536, "perf/iters_per_sec": 0.9677100365760498, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033367395401001, "data/tokens_consumed": 96387203072, "data/tokens_consumed_B": 96.387203072, "train/loss_slope": 8.710591727965553e-06} {"step": 45970, "timestamp": 1778244221.863059, "train/loss": 2.174568462371826, "train/z_loss": 0.0013942159828729927, "train/perplexity": 8.798387467571597, "train/grad_norm": 0.1171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029272.2476418668, "perf/iters_per_sec": 0.9676324117860159, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0334502935409546, "data/tokens_consumed": 96408174592, "data/tokens_consumed_B": 96.408174592, "train/loss_slope": 8.504848604691566e-06} {"step": 45975, "timestamp": 1778244227.621367, "eos/sharpness": 60.176944732666, "eos/L0_probe": 1.9832336902618408, "eos/L_plus": 2.3245482444763184, "eos/L_minus": 2.2436885833740234, "eos/grad_norm": 0.21834750473499298, "eos/embed_grad_frac": 0.050128430128097534, "eos/time_s": 0.595088005065918} {"step": 45975, "timestamp": 1778244229.0013163, "geo/rankme_last": 439.1691589355469, "geo/layer_0/stable_rank_q_proj": 19.406354904174805, "geo/layer_0/stable_rank_k_proj": 16.390920639038086, "geo/layer_0/stable_rank_o_proj": 48.199989318847656, "geo/layer_0/stable_rank_gate_proj": 134.6493682861328, "geo/layer_0/stable_rank_down_proj": 53.41038131713867, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.059720929712057114, "geo/layer_0/attn_entropy_mean": 6.188509464263916, "geo/layer_0/attn_entropy_std": 0.38449546694755554, "geo/layer_7/stable_rank_q_proj": 42.39558410644531, "geo/layer_7/stable_rank_k_proj": 42.50822067260742, "geo/layer_7/stable_rank_o_proj": 94.64149475097656, "geo/layer_7/stable_rank_gate_proj": 88.22692108154297, "geo/layer_7/stable_rank_down_proj": 144.74557495117188, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.49225205183029175, "geo/layer_7/attn_entropy_mean": 4.677029609680176, "geo/layer_7/attn_entropy_std": 0.8145952224731445, "geo/layer_14/stable_rank_q_proj": 53.21813201904297, "geo/layer_14/stable_rank_k_proj": 38.24090576171875, "geo/layer_14/stable_rank_o_proj": 46.76996612548828, "geo/layer_14/stable_rank_gate_proj": 74.6258316040039, "geo/layer_14/stable_rank_down_proj": 131.95033264160156, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3920662999153137, "geo/layer_14/attn_entropy_mean": 5.503426551818848, "geo/layer_14/attn_entropy_std": 0.39528563618659973, "geo/layer_21/stable_rank_q_proj": 42.33171844482422, "geo/layer_21/stable_rank_k_proj": 30.502016067504883, "geo/layer_21/stable_rank_o_proj": 74.39938354492188, "geo/layer_21/stable_rank_gate_proj": 70.84245300292969, "geo/layer_21/stable_rank_down_proj": 53.96756362915039, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1455366015434265, "geo/layer_21/attn_entropy_mean": 5.729131698608398, "geo/layer_21/attn_entropy_std": 0.2857199013233185, "geo/layer_27/stable_rank_q_proj": 42.870521545410156, "geo/layer_27/stable_rank_k_proj": 31.37020492553711, "geo/layer_27/stable_rank_o_proj": 115.85961151123047, "geo/layer_27/stable_rank_gate_proj": 83.7687759399414, "geo/layer_27/stable_rank_down_proj": 130.28697204589844, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09549602121114731, "geo/layer_27/attn_entropy_mean": 4.285643577575684, "geo/layer_27/attn_entropy_std": 0.6776367425918579, "attnres/final_alpha/block_0": 0.24102406203746796, "attnres/block_norm/0": 1.7345638275146484, "attnres/final_alpha/block_1": 0.004916857928037643, "attnres/block_norm/1": 42250.6796875, "attnres/final_alpha/block_2": 0.010809105820953846, "attnres/block_norm/2": 27142.830078125, "attnres/final_alpha/block_3": 0.012806624174118042, "attnres/block_norm/3": 50387.58984375, "attnres/final_alpha/block_4": 0.015432124026119709, "attnres/block_norm/4": 13636.59765625, "attnres/final_alpha/block_5": 0.6015617847442627, "attnres/block_norm/5": 6170.85107421875, "attnres/final_alpha/block_6": 0.11344940960407257, "attnres/block_norm/6": 33312.61328125, "geo/tier1_time_s": 1.359886646270752, "geo/step": 45975.0, "geo/rankme_slope": 7.758197028811525e-05} {"step": 45980, "timestamp": 1778244234.1770754, "train/loss": 2.194020080566406, "train/z_loss": 0.0014051142265088857, "train/perplexity": 8.971205689349661, "train/grad_norm": 0.1171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1704066.9545948838, "perf/iters_per_sec": 0.812562444016878, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.230674648284912, "data/tokens_consumed": 96429146112, "data/tokens_consumed_B": 96.429146112, "train/loss_slope": 1.2500407242968105e-05} {"step": 45990, "timestamp": 1778244244.538442, "train/loss": 2.158729982376099, "train/z_loss": 0.0013921058853156865, "train/perplexity": 8.660132150891494, "train/grad_norm": 0.1318359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024903.802480504, "perf/iters_per_sec": 0.9655493748095054, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035679817199707, "data/tokens_consumed": 96450117632, "data/tokens_consumed_B": 96.450117632, "train/loss_slope": 1.4445548933116747e-05} {"step": 46000, "timestamp": 1778244254.8712094, "grad/layer_0/attn": 0.002855562837794423, "grad/layer_0/mlp": 0.0027554905973374844, "grad/layer_0/attn_mlp_ratio": 1.036317357396138, "grad/layer_4/attn": 0.0018463177839294076, "grad/layer_4/mlp": 0.002436094218865037, "grad/layer_4/attn_mlp_ratio": 0.7579007797980486, "grad/layer_8/attn": 0.00373268174007535, "grad/layer_8/mlp": 0.0035255826078355312, "grad/layer_8/attn_mlp_ratio": 1.0587417880679826, "grad/layer_12/attn": 0.003937806002795696, "grad/layer_12/mlp": 0.006743925623595715, "grad/layer_12/attn_mlp_ratio": 0.5839041181930669, "grad/layer_16/attn": 0.004326405469328165, "grad/layer_16/mlp": 0.004404434934258461, "grad/layer_16/attn_mlp_ratio": 0.9822838651669583, "grad/layer_20/attn": 0.004780145827680826, "grad/layer_20/mlp": 0.005999694112688303, "grad/layer_20/attn_mlp_ratio": 0.7967315763479503, "grad/layer_24/attn": 0.009247615933418274, "grad/layer_24/mlp": 0.008877143263816833, "grad/layer_24/attn_mlp_ratio": 1.0417333092886034, "grad/layer_27/attn": 0.0069163222797214985, "grad/layer_27/mlp": 0.007638330105692148, "grad/layer_27/attn_mlp_ratio": 0.9054756855847103} {"step": 46000, "timestamp": 1778244254.8856792, "train/loss": 2.177081894874573, "train/z_loss": 0.0013844784232787787, "train/perplexity": 8.820529435118274, "train/grad_norm": 0.1572265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027742.2035238727, "perf/iters_per_sec": 0.9669028298968662, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342300891876222, "data/tokens_consumed": 96471089152, "data/tokens_consumed_B": 96.471089152, "train/loss_slope": 1.2664172615763198e-05} {"step": 46000, "timestamp": 1778244262.1017776, "geo/ww_alpha_mean": 7.4680542377482935, "geo/ww_alpha_std": 4.5338012447386475, "geo/ww_alpha_min": 1.3377273257867617, "geo/ww_alpha_max": 30.89815194369376, "geo/ww_alpha_healthy_frac": 0.17766497461928935, "geo/ww_alpha_by_type/q_proj": 3.985016217282673, "geo/ww_alpha_by_type/k_proj": 4.465963616499937, "geo/ww_alpha_by_type/v_proj": 8.18427504350672, "geo/ww_alpha_by_type/o_proj": 7.955977353140468, "geo/ww_alpha_by_type/gate_proj": 8.192970019422111, "geo/ww_alpha_by_type/up_proj": 11.533521893486705, "geo/ww_alpha_by_type/down_proj": 8.055855915970733, "geo/twonn_id/layer_0": 0.660197913646698, "geo/twonn_id/layer_7": 3.3963544368743896, "geo/twonn_id/layer_14": 4.218860626220703, "geo/twonn_id/layer_21": 6.976350784301758, "geo/twonn_id/layer_27": 5.347095489501953, "geo/tier2_time_s": 7.2073822021484375} {"step": 46000, "timestamp": 1778244262.8662736, "eoc/jacobian_sigma/layer_0/attn": 1172.39501953125, "eoc/jacobian_sigma/layer_0/mlp": 7284.1484375, "eoc/jacobian_sigma/layer_0": 7284.1484375, "eoc/jacobian_sigma/layer_7/attn": 1.1696381568908691, "eoc/jacobian_sigma/layer_7/mlp": 1.8614293336868286, "eoc/jacobian_sigma/layer_7": 1.8614293336868286, "eoc/jacobian_sigma/layer_14/attn": 1.6172338724136353, "eoc/jacobian_sigma/layer_14/mlp": 6.611229419708252, "eoc/jacobian_sigma/layer_14": 6.611229419708252, "eoc/jacobian_sigma/layer_21/attn": 1.0861403942108154, "eoc/jacobian_sigma/layer_21/mlp": 4.358572483062744, "eoc/jacobian_sigma/layer_21": 4.358572483062744, "eoc/jacobian_sigma/layer_27/attn": 3.641164541244507, "eoc/jacobian_sigma/layer_27/mlp": 26.66192626953125, "eoc/jacobian_sigma/layer_27": 26.66192626953125, "eoc/layer0_sigma": 7284.1484375, "eoc/sigma_max": 26.66192626953125, "eoc/sigma_min": 1.8614293336868286, "eoc/sigma_mean": 9.873289376497269, "eoc/time_s": 0.7546746730804443} {"step": 46010, "timestamp": 1778244273.2289293, "train/loss": 2.1324966669082643, "train/z_loss": 0.0013921427074819804, "train/perplexity": 8.435902179172238, "train/grad_norm": 0.216796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1143733.066006717, "perf/iters_per_sec": 0.5453744249375901, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.8336026668548584, "data/tokens_consumed": 96492060672, "data/tokens_consumed_B": 96.492060672, "train/loss_slope": 1.1437021244620225e-05} {"step": 46020, "timestamp": 1778244283.5682187, "train/loss": 2.166959285736084, "train/z_loss": 0.0013894770061597228, "train/perplexity": 8.731693049821645, "train/grad_norm": 0.10498046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029420.008510277, "perf/iters_per_sec": 0.9677028696586022, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03337504863739, "data/tokens_consumed": 96513032192, "data/tokens_consumed_B": 96.513032192, "train/loss_slope": 8.548613865025749e-06} {"step": 46030, "timestamp": 1778244293.9192266, "train/loss": 2.155748176574707, "train/z_loss": 0.0014006509794853629, "train/perplexity": 8.634347779701368, "train/grad_norm": 0.103515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027496.5880188711, "perf/iters_per_sec": 0.9667857112974506, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03435537815094, "data/tokens_consumed": 96534003712, "data/tokens_consumed_B": 96.534003712, "train/loss_slope": 5.651350619375909e-06} {"step": 46040, "timestamp": 1778244304.2592237, "train/loss": 2.192413306236267, "train/z_loss": 0.0013999249436892568, "train/perplexity": 8.956802560725032, "train/grad_norm": 0.2099609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029590.3628537392, "perf/iters_per_sec": 0.9677841009396263, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033288311958313, "data/tokens_consumed": 96554975232, "data/tokens_consumed_B": 96.554975232, "train/loss_slope": 4.544723194853689e-06} {"step": 46050, "timestamp": 1778244315.129167, "grad/layer_0/attn": 0.003075862303376198, "grad/layer_0/mlp": 0.003013810608536005, "grad/layer_0/attn_mlp_ratio": 1.0205890816780379, "grad/layer_4/attn": 0.002804697025567293, "grad/layer_4/mlp": 0.0025488208048045635, "grad/layer_4/attn_mlp_ratio": 1.1003899961273855, "grad/layer_8/attn": 0.004640584345906973, "grad/layer_8/mlp": 0.003694927552714944, "grad/layer_8/attn_mlp_ratio": 1.2559337508265374, "grad/layer_12/attn": 0.006112511735409498, "grad/layer_12/mlp": 0.006907963193953037, "grad/layer_12/attn_mlp_ratio": 0.8848500600401502, "grad/layer_16/attn": 0.008187629282474518, "grad/layer_16/mlp": 0.0045338403433561325, "grad/layer_16/attn_mlp_ratio": 1.8058926829841642, "grad/layer_20/attn": 0.005206449888646603, "grad/layer_20/mlp": 0.005693978630006313, "grad/layer_20/attn_mlp_ratio": 0.9143781765831834, "grad/layer_24/attn": 0.01316014863550663, "grad/layer_24/mlp": 0.01027169730514288, "grad/layer_24/attn_mlp_ratio": 1.2812048599599077, "grad/layer_27/attn": 0.004879710264503956, "grad/layer_27/mlp": 0.010962671600282192, "grad/layer_27/attn_mlp_ratio": 0.44512053246822547} {"step": 46050, "timestamp": 1778244315.733712, "eos/sharpness": 73.31159114837645, "eos/L0_probe": 1.9832085371017456, "eos/L_plus": 2.4402692317962646, "eos/L_minus": 2.259263753890991, "eos/grad_norm": 0.20313239097595215, "eos/embed_grad_frac": 0.0628688707947731, "eos/time_s": 0.6018352508544922} {"step": 46050, "timestamp": 1778244315.75386, "train/loss": 2.1347156763076782, "train/z_loss": 0.0013884747284464538, "train/perplexity": 8.45464230997385, "train/grad_norm": 0.203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1825271.7578471848, "perf/iters_per_sec": 0.8703573979602741, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1489532947540284, "data/tokens_consumed": 96575946752, "data/tokens_consumed_B": 96.575946752, "train/loss_slope": 3.96463582725311e-06} {"step": 46050, "timestamp": 1778244317.1141868, "geo/rankme_last": 439.2851867675781, "geo/layer_0/stable_rank_q_proj": 19.448728561401367, "geo/layer_0/stable_rank_k_proj": 16.426143646240234, "geo/layer_0/stable_rank_o_proj": 48.034080505371094, "geo/layer_0/stable_rank_gate_proj": 134.5419158935547, "geo/layer_0/stable_rank_down_proj": 53.39422607421875, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06043718755245209, "geo/layer_0/attn_entropy_mean": 6.189387321472168, "geo/layer_0/attn_entropy_std": 0.38407498598098755, "geo/layer_7/stable_rank_q_proj": 42.33936309814453, "geo/layer_7/stable_rank_k_proj": 42.42301940917969, "geo/layer_7/stable_rank_o_proj": 94.64811706542969, "geo/layer_7/stable_rank_gate_proj": 88.29402923583984, "geo/layer_7/stable_rank_down_proj": 144.35458374023438, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.48490968346595764, "geo/layer_7/attn_entropy_mean": 4.66921329498291, "geo/layer_7/attn_entropy_std": 0.8243416547775269, "geo/layer_14/stable_rank_q_proj": 53.164390563964844, "geo/layer_14/stable_rank_k_proj": 38.24873352050781, "geo/layer_14/stable_rank_o_proj": 46.68327331542969, "geo/layer_14/stable_rank_gate_proj": 74.70890808105469, "geo/layer_14/stable_rank_down_proj": 132.12380981445312, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39043352007865906, "geo/layer_14/attn_entropy_mean": 5.526562213897705, "geo/layer_14/attn_entropy_std": 0.36892029643058777, "geo/layer_21/stable_rank_q_proj": 42.333106994628906, "geo/layer_21/stable_rank_k_proj": 30.513290405273438, "geo/layer_21/stable_rank_o_proj": 74.38021850585938, "geo/layer_21/stable_rank_gate_proj": 70.72488403320312, "geo/layer_21/stable_rank_down_proj": 53.95835494995117, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14000920951366425, "geo/layer_21/attn_entropy_mean": 5.735560417175293, "geo/layer_21/attn_entropy_std": 0.2900942862033844, "geo/layer_27/stable_rank_q_proj": 42.77608108520508, "geo/layer_27/stable_rank_k_proj": 31.468576431274414, "geo/layer_27/stable_rank_o_proj": 116.04228973388672, "geo/layer_27/stable_rank_gate_proj": 83.79611206054688, "geo/layer_27/stable_rank_down_proj": 130.4615478515625, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09038759768009186, "geo/layer_27/attn_entropy_mean": 4.226075172424316, "geo/layer_27/attn_entropy_std": 0.6929440498352051, "attnres/final_alpha/block_0": 0.2387048602104187, "attnres/block_norm/0": 1.7345662117004395, "attnres/final_alpha/block_1": 0.00485018827021122, "attnres/block_norm/1": 42307.421875, "attnres/final_alpha/block_2": 0.010685121640563011, "attnres/block_norm/2": 27122.5625, "attnres/final_alpha/block_3": 0.01262563094496727, "attnres/block_norm/3": 50456.6640625, "attnres/final_alpha/block_4": 0.01523500494658947, "attnres/block_norm/4": 13617.060546875, "attnres/final_alpha/block_5": 0.6060523390769958, "attnres/block_norm/5": 6181.2470703125, "attnres/final_alpha/block_6": 0.11184683442115784, "attnres/block_norm/6": 33431.546875, "geo/tier1_time_s": 1.3562824726104736, "geo/step": 46050.0, "geo/rankme_slope": 6.886338519782913e-05} {"step": 46060, "timestamp": 1778244327.45779, "train/loss": 2.105609178543091, "train/z_loss": 0.0013889188994653523, "train/perplexity": 8.212104123781259, "train/grad_norm": 0.14453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1792422.8067552678, "perf/iters_per_sec": 0.854693797471651, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1700096607208252, "data/tokens_consumed": 96596918272, "data/tokens_consumed_B": 96.596918272, "train/loss_slope": -3.7178212087671244e-08} {"step": 46070, "timestamp": 1778244337.8016582, "train/loss": 2.133253049850464, "train/z_loss": 0.0014024005504325032, "train/perplexity": 8.44228536544491, "train/grad_norm": 0.134765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028852.8195423237, "perf/iters_per_sec": 0.9674324128829592, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0336639404296875, "data/tokens_consumed": 96617889792, "data/tokens_consumed_B": 96.617889792, "train/loss_slope": -1.516116885068738e-06} {"step": 46080, "timestamp": 1778244348.14496, "train/loss": 2.218934988975525, "train/z_loss": 0.00138023846084252, "train/perplexity": 9.197530176851398, "train/grad_norm": 0.15234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028912.1589528378, "perf/iters_per_sec": 0.9674607081188382, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0336337089538574, "data/tokens_consumed": 96638861312, "data/tokens_consumed_B": 96.638861312, "train/loss_slope": 7.385859144653683e-07} {"step": 46090, "timestamp": 1778244359.0009308, "train/loss": 2.2161195516586303, "train/z_loss": 0.0013852646923623979, "train/perplexity": 9.171671525954693, "train/grad_norm": 0.103515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1932879.390306497, "perf/iters_per_sec": 0.9216687156231389, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.08498854637146, "data/tokens_consumed": 96659832832, "data/tokens_consumed_B": 96.659832832, "train/loss_slope": 6.6824456860702755e-06} {"step": 46100, "timestamp": 1778244369.3376093, "grad/layer_0/attn": 0.0026655530091375113, "grad/layer_0/mlp": 0.002704393118619919, "grad/layer_0/attn_mlp_ratio": 0.985638105725532, "grad/layer_4/attn": 0.002800220623612404, "grad/layer_4/mlp": 0.0024950290098786354, "grad/layer_4/attn_mlp_ratio": 1.1223198208491498, "grad/layer_8/attn": 0.0035669635981321335, "grad/layer_8/mlp": 0.00351662072353065, "grad/layer_8/attn_mlp_ratio": 1.014315667547842, "grad/layer_12/attn": 0.00664129201322794, "grad/layer_12/mlp": 0.006253241561353207, "grad/layer_12/attn_mlp_ratio": 1.0620558700414529, "grad/layer_16/attn": 0.004645670298486948, "grad/layer_16/mlp": 0.00427622627466917, "grad/layer_16/attn_mlp_ratio": 1.086394847103098, "grad/layer_20/attn": 0.006193986162543297, "grad/layer_20/mlp": 0.006054268218576908, "grad/layer_20/attn_mlp_ratio": 1.023077577109967, "grad/layer_24/attn": 0.02013394609093666, "grad/layer_24/mlp": 0.010473846457898617, "grad/layer_24/attn_mlp_ratio": 1.9223067647246652, "grad/layer_27/attn": 0.012340646237134933, "grad/layer_27/mlp": 0.008296327665448189, "grad/layer_27/attn_mlp_ratio": 1.4874829666843883} {"step": 46100, "timestamp": 1778244369.3519788, "train/loss": 2.1067993879318236, "train/z_loss": 0.001397948805242777, "train/perplexity": 8.221884066145725, "train/grad_norm": 0.171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027628.5726653156, "perf/iters_per_sec": 0.9668486464811876, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342880487442017, "data/tokens_consumed": 96680804352, "data/tokens_consumed_B": 96.680804352, "train/loss_slope": 1.8388703031795107e-06} {"step": 46110, "timestamp": 1778244379.6949608, "train/loss": 2.1066964745521544, "train/z_loss": 0.0014117425889708102, "train/perplexity": 8.221037967807396, "train/grad_norm": 0.10009765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028954.0917334051, "perf/iters_per_sec": 0.9674807032267595, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03361234664917, "data/tokens_consumed": 96701775872, "data/tokens_consumed_B": 96.701775872, "train/loss_slope": 8.222751062338172e-07} {"step": 46120, "timestamp": 1778244390.03901, "train/loss": 2.1240938663482667, "train/z_loss": 0.0014166306471452117, "train/perplexity": 8.365313960314523, "train/grad_norm": 0.2021484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028855.4401370345, "perf/iters_per_sec": 0.9674336624798939, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0336626052856446, "data/tokens_consumed": 96722747392, "data/tokens_consumed_B": 96.722747392, "train/loss_slope": -2.423590022881224e-07} {"step": 46125, "timestamp": 1778244395.790464, "eos/sharpness": 40.13774394989013, "eos/L0_probe": 1.982492446899414, "eos/L_plus": 2.2061970233917236, "eos/L_minus": 2.160165309906006, "eos/grad_norm": 0.12862500548362732, "eos/embed_grad_frac": 0.14366629719734192, "eos/time_s": 0.5891146659851074} {"step": 46125, "timestamp": 1778244397.1687133, "geo/rankme_last": 438.5309143066406, "geo/layer_0/stable_rank_q_proj": 19.429344177246094, "geo/layer_0/stable_rank_k_proj": 16.43007469177246, "geo/layer_0/stable_rank_o_proj": 47.95437240600586, "geo/layer_0/stable_rank_gate_proj": 134.71939086914062, "geo/layer_0/stable_rank_down_proj": 53.388648986816406, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06145698204636574, "geo/layer_0/attn_entropy_mean": 6.188881874084473, "geo/layer_0/attn_entropy_std": 0.38486358523368835, "geo/layer_7/stable_rank_q_proj": 42.495567321777344, "geo/layer_7/stable_rank_k_proj": 42.345184326171875, "geo/layer_7/stable_rank_o_proj": 94.8104476928711, "geo/layer_7/stable_rank_gate_proj": 88.1439437866211, "geo/layer_7/stable_rank_down_proj": 144.678955078125, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4868195056915283, "geo/layer_7/attn_entropy_mean": 4.650274276733398, "geo/layer_7/attn_entropy_std": 0.8185757994651794, "geo/layer_14/stable_rank_q_proj": 53.27898406982422, "geo/layer_14/stable_rank_k_proj": 38.305816650390625, "geo/layer_14/stable_rank_o_proj": 46.60691452026367, "geo/layer_14/stable_rank_gate_proj": 74.74241638183594, "geo/layer_14/stable_rank_down_proj": 132.1490478515625, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38900360465049744, "geo/layer_14/attn_entropy_mean": 5.53639030456543, "geo/layer_14/attn_entropy_std": 0.37020421028137207, "geo/layer_21/stable_rank_q_proj": 42.35953140258789, "geo/layer_21/stable_rank_k_proj": 30.55441665649414, "geo/layer_21/stable_rank_o_proj": 74.31204986572266, "geo/layer_21/stable_rank_gate_proj": 70.71239471435547, "geo/layer_21/stable_rank_down_proj": 53.9493522644043, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14270420372486115, "geo/layer_21/attn_entropy_mean": 5.699166297912598, "geo/layer_21/attn_entropy_std": 0.2917293906211853, "geo/layer_27/stable_rank_q_proj": 42.7205810546875, "geo/layer_27/stable_rank_k_proj": 31.504451751708984, "geo/layer_27/stable_rank_o_proj": 115.92965698242188, "geo/layer_27/stable_rank_gate_proj": 83.66334533691406, "geo/layer_27/stable_rank_down_proj": 130.6914520263672, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09178857505321503, "geo/layer_27/attn_entropy_mean": 4.243834495544434, "geo/layer_27/attn_entropy_std": 0.6751116514205933, "attnres/final_alpha/block_0": 0.2399168163537979, "attnres/block_norm/0": 1.7347567081451416, "attnres/final_alpha/block_1": 0.0048699332401156425, "attnres/block_norm/1": 42488.296875, "attnres/final_alpha/block_2": 0.010631445795297623, "attnres/block_norm/2": 27176.82421875, "attnres/final_alpha/block_3": 0.012775820679962635, "attnres/block_norm/3": 50492.859375, "attnres/final_alpha/block_4": 0.015183296985924244, "attnres/block_norm/4": 13578.400390625, "attnres/final_alpha/block_5": 0.6032668352127075, "attnres/block_norm/5": 6167.7216796875, "attnres/final_alpha/block_6": 0.11335588246583939, "attnres/block_norm/6": 33209.0078125, "geo/tier1_time_s": 1.3601679801940918, "geo/step": 46125.0, "geo/rankme_slope": 6.339250543967588e-05} {"step": 46130, "timestamp": 1778244402.3435698, "train/loss": 2.181978702545166, "train/z_loss": 0.0013794003752991557, "train/perplexity": 8.863827796669412, "train/grad_norm": 0.09228515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1705139.5907781925, "perf/iters_per_sec": 0.8130739168063128, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2299004793167114, "data/tokens_consumed": 96743718912, "data/tokens_consumed_B": 96.743718912, "train/loss_slope": 8.88924322577943e-08} {"step": 46140, "timestamp": 1778244412.700349, "train/loss": 2.1917463064193727, "train/z_loss": 0.0013796137296594678, "train/perplexity": 8.950830367004542, "train/grad_norm": 0.11572265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026103.9920755203, "perf/iters_per_sec": 0.966121669805298, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350663185119628, "data/tokens_consumed": 96764690432, "data/tokens_consumed_B": 96.764690432, "train/loss_slope": 2.278474984568202e-06} {"step": 46150, "timestamp": 1778244423.0416446, "grad/layer_0/attn": 0.003649394726380706, "grad/layer_0/mlp": 0.0032441585790365934, "grad/layer_0/attn_mlp_ratio": 1.1249125235342847, "grad/layer_4/attn": 0.002075409283861518, "grad/layer_4/mlp": 0.0024416898377239704, "grad/layer_4/attn_mlp_ratio": 0.84998887524438, "grad/layer_8/attn": 0.0036887838505208492, "grad/layer_8/mlp": 0.003835056209936738, "grad/layer_8/attn_mlp_ratio": 0.961859110376843, "grad/layer_12/attn": 0.0047914725728333, "grad/layer_12/mlp": 0.007002548314630985, "grad/layer_12/attn_mlp_ratio": 0.6842469754042817, "grad/layer_16/attn": 0.004430169705301523, "grad/layer_16/mlp": 0.004691316280514002, "grad/layer_16/attn_mlp_ratio": 0.9443340303593283, "grad/layer_20/attn": 0.003572066081687808, "grad/layer_20/mlp": 0.005740052089095116, "grad/layer_20/attn_mlp_ratio": 0.622305505945395, "grad/layer_24/attn": 0.010613761842250824, "grad/layer_24/mlp": 0.010586848482489586, "grad/layer_24/attn_mlp_ratio": 1.0025421408034259, "grad/layer_27/attn": 0.004959574900567532, "grad/layer_27/mlp": 0.010596232488751411, "grad/layer_27/attn_mlp_ratio": 0.468050777389734} {"step": 46150, "timestamp": 1778244423.0560508, "train/loss": 2.134503984451294, "train/z_loss": 0.0014020087313838304, "train/perplexity": 8.452852720475631, "train/grad_norm": 0.1708984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026179.7395935084, "perf/iters_per_sec": 0.9661577890365164, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350276231765747, "data/tokens_consumed": 96785661952, "data/tokens_consumed_B": 96.785661952, "train/loss_slope": 4.209232816744837e-06} {"step": 46160, "timestamp": 1778244433.4003532, "train/loss": 2.156629753112793, "train/z_loss": 0.001395116385538131, "train/perplexity": 8.64196297432089, "train/grad_norm": 0.1044921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028429.5419807949, "perf/iters_per_sec": 0.9672305784133887, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0338796377182007, "data/tokens_consumed": 96806633472, "data/tokens_consumed_B": 96.806633472, "train/loss_slope": 9.953032160821574e-07} {"step": 46170, "timestamp": 1778244443.7479064, "train/loss": 2.166498374938965, "train/z_loss": 0.0013911534915678203, "train/perplexity": 8.727669445550411, "train/grad_norm": 0.09423828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028175.2486766505, "perf/iters_per_sec": 0.9671093219168904, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340092658996582, "data/tokens_consumed": 96827604992, "data/tokens_consumed_B": 96.827604992, "train/loss_slope": 2.2417808487506773e-06} {"step": 46180, "timestamp": 1778244454.109248, "train/loss": 2.145030403137207, "train/z_loss": 0.0013879321049898863, "train/perplexity": 8.54230094609382, "train/grad_norm": 0.1884765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025371.218505563, "perf/iters_per_sec": 0.9657722561385932, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354408025741577, "data/tokens_consumed": 96848576512, "data/tokens_consumed_B": 96.848576512, "train/loss_slope": 5.760298024207087e-08} {"step": 46190, "timestamp": 1778244464.4550006, "train/loss": 2.1578717470169066, "train/z_loss": 0.0013913328060880304, "train/perplexity": 8.652702907739917, "train/grad_norm": 0.115234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027982.8288682562, "perf/iters_per_sec": 0.9670175690022736, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341073751449585, "data/tokens_consumed": 96869548032, "data/tokens_consumed_B": 96.869548032, "train/loss_slope": -1.161513978069372e-06} {"step": 46200, "timestamp": 1778244474.797143, "grad/layer_0/attn": 0.0027543334290385246, "grad/layer_0/mlp": 0.0028804412577301264, "grad/layer_0/attn_mlp_ratio": 0.9562192341277246, "grad/layer_4/attn": 0.0028468051459640265, "grad/layer_4/mlp": 0.0024308525025844574, "grad/layer_4/attn_mlp_ratio": 1.1711138482593855, "grad/layer_8/attn": 0.003499074373394251, "grad/layer_8/mlp": 0.0035546624567359686, "grad/layer_8/attn_mlp_ratio": 0.9843618958327338, "grad/layer_12/attn": 0.003984464798122644, "grad/layer_12/mlp": 0.006434135138988495, "grad/layer_12/attn_mlp_ratio": 0.6192696687471305, "grad/layer_16/attn": 0.0035122784320265055, "grad/layer_16/mlp": 0.00447326572611928, "grad/layer_16/attn_mlp_ratio": 0.7851709620113354, "grad/layer_20/attn": 0.004493119195103645, "grad/layer_20/mlp": 0.005608429666608572, "grad/layer_20/attn_mlp_ratio": 0.8011367498715497, "grad/layer_24/attn": 0.005222246516495943, "grad/layer_24/mlp": 0.0071089258417487144, "grad/layer_24/attn_mlp_ratio": 0.7346041524820458, "grad/layer_27/attn": 0.004855587612837553, "grad/layer_27/mlp": 0.006224215496331453, "grad/layer_27/attn_mlp_ratio": 0.7801123752363964} {"step": 46200, "timestamp": 1778244475.4011161, "eos/sharpness": 6.172990798950194, "eos/L0_probe": 1.9876576662063599, "eos/L_plus": 2.026345729827881, "eos/L_minus": 2.010699510574341, "eos/grad_norm": 0.08649903535842896, "eos/embed_grad_frac": 0.27824121713638306, "eos/time_s": 0.601226806640625} {"step": 46200, "timestamp": 1778244475.4214067, "train/loss": 2.17192280292511, "train/z_loss": 0.0013942369725555182, "train/perplexity": 8.775140695732583, "train/grad_norm": 0.08642578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1913559.2101643814, "perf/iters_per_sec": 0.9124561358282001, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0959430932998657, "data/tokens_consumed": 96890519552, "data/tokens_consumed_B": 96.890519552, "train/loss_slope": 1.5160528918912562e-06} {"step": 46200, "timestamp": 1778244476.7854939, "geo/rankme_last": 439.551513671875, "geo/layer_0/stable_rank_q_proj": 19.45220947265625, "geo/layer_0/stable_rank_k_proj": 16.41021728515625, "geo/layer_0/stable_rank_o_proj": 47.8906135559082, "geo/layer_0/stable_rank_gate_proj": 134.6300811767578, "geo/layer_0/stable_rank_down_proj": 53.45930862426758, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06266489624977112, "geo/layer_0/attn_entropy_mean": 6.187431812286377, "geo/layer_0/attn_entropy_std": 0.3845624327659607, "geo/layer_7/stable_rank_q_proj": 42.394775390625, "geo/layer_7/stable_rank_k_proj": 42.454524993896484, "geo/layer_7/stable_rank_o_proj": 94.57209014892578, "geo/layer_7/stable_rank_gate_proj": 88.0733642578125, "geo/layer_7/stable_rank_down_proj": 144.87860107421875, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4815520644187927, "geo/layer_7/attn_entropy_mean": 4.63746976852417, "geo/layer_7/attn_entropy_std": 0.8136249780654907, "geo/layer_14/stable_rank_q_proj": 53.28754806518555, "geo/layer_14/stable_rank_k_proj": 38.29671096801758, "geo/layer_14/stable_rank_o_proj": 46.63579177856445, "geo/layer_14/stable_rank_gate_proj": 74.59774017333984, "geo/layer_14/stable_rank_down_proj": 132.18800354003906, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3895489573478699, "geo/layer_14/attn_entropy_mean": 5.5250563621521, "geo/layer_14/attn_entropy_std": 0.3765924870967865, "geo/layer_21/stable_rank_q_proj": 42.345008850097656, "geo/layer_21/stable_rank_k_proj": 30.47900390625, "geo/layer_21/stable_rank_o_proj": 74.22852325439453, "geo/layer_21/stable_rank_gate_proj": 70.77508544921875, "geo/layer_21/stable_rank_down_proj": 54.0280647277832, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14680276811122894, "geo/layer_21/attn_entropy_mean": 5.706106185913086, "geo/layer_21/attn_entropy_std": 0.2918758988380432, "geo/layer_27/stable_rank_q_proj": 42.71908950805664, "geo/layer_27/stable_rank_k_proj": 31.50022315979004, "geo/layer_27/stable_rank_o_proj": 116.19257354736328, "geo/layer_27/stable_rank_gate_proj": 83.59376525878906, "geo/layer_27/stable_rank_down_proj": 130.5789031982422, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09074518084526062, "geo/layer_27/attn_entropy_mean": 4.242548942565918, "geo/layer_27/attn_entropy_std": 0.6498465538024902, "attnres/final_alpha/block_0": 0.24082127213478088, "attnres/block_norm/0": 1.7348066568374634, "attnres/final_alpha/block_1": 0.004939144477248192, "attnres/block_norm/1": 42375.84375, "attnres/final_alpha/block_2": 0.01072397269308567, "attnres/block_norm/2": 27127.908203125, "attnres/final_alpha/block_3": 0.012674478814005852, "attnres/block_norm/3": 50848.6953125, "attnres/final_alpha/block_4": 0.015406358055770397, "attnres/block_norm/4": 13587.51171875, "attnres/final_alpha/block_5": 0.6014339923858643, "attnres/block_norm/5": 6185.591796875, "attnres/final_alpha/block_6": 0.11400076001882553, "attnres/block_norm/6": 33224.515625, "geo/tier1_time_s": 1.3608481884002686, "geo/step": 46200.0, "geo/rankme_slope": 5.187907194127651e-05} {"step": 46210, "timestamp": 1778244487.1329036, "train/loss": 2.1638219356536865, "train/z_loss": 0.0013901096303015948, "train/perplexity": 8.70434159988412, "train/grad_norm": 0.1396484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1791145.7779332604, "perf/iters_per_sec": 0.8540848626772215, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1708438396453857, "data/tokens_consumed": 96911491072, "data/tokens_consumed_B": 96.911491072, "train/loss_slope": -3.211512102080679e-06} {"step": 46220, "timestamp": 1778244497.4782977, "train/loss": 2.1605666279792786, "train/z_loss": 0.0013927722349762917, "train/perplexity": 8.676052359945105, "train/grad_norm": 0.2001953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028111.0890961268, "perf/iters_per_sec": 0.967078728244842, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034041976928711, "data/tokens_consumed": 96932462592, "data/tokens_consumed_B": 96.932462592, "train/loss_slope": -5.214690446781623e-07} {"step": 46230, "timestamp": 1778244508.4911685, "train/loss": 2.1843372583389282, "train/z_loss": 0.001388550375122577, "train/perplexity": 8.884758302254106, "train/grad_norm": 0.2412109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1905615.9902905603, "perf/iters_per_sec": 0.9086685134365846, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.100511336326599, "data/tokens_consumed": 96953434112, "data/tokens_consumed_B": 96.953434112, "train/loss_slope": -9.259248545245931e-07} {"step": 46240, "timestamp": 1778244519.3367398, "train/loss": 2.1176448345184324, "train/z_loss": 0.00140114591922611, "train/perplexity": 8.311539367721231, "train/grad_norm": 0.130859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1935057.5128402996, "perf/iters_per_sec": 0.9227073253823755, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0837672710418702, "data/tokens_consumed": 96974405632, "data/tokens_consumed_B": 96.974405632, "train/loss_slope": -4.362661495889709e-06} {"step": 46250, "timestamp": 1778244529.6820552, "grad/layer_0/attn": 0.0030051220674067736, "grad/layer_0/mlp": 0.0027324818074703217, "grad/layer_0/attn_mlp_ratio": 1.0997774803891955, "grad/layer_4/attn": 0.002016223268583417, "grad/layer_4/mlp": 0.0026112005580216646, "grad/layer_4/attn_mlp_ratio": 0.7721441331555795, "grad/layer_8/attn": 0.006220343057066202, "grad/layer_8/mlp": 0.0037985851522535086, "grad/layer_8/attn_mlp_ratio": 1.637542043679548, "grad/layer_12/attn": 0.0034897238947451115, "grad/layer_12/mlp": 0.006454129237681627, "grad/layer_12/attn_mlp_ratio": 0.5406963065290302, "grad/layer_16/attn": 0.003977284301072359, "grad/layer_16/mlp": 0.0043868618085980415, "grad/layer_16/attn_mlp_ratio": 0.906635399960293, "grad/layer_20/attn": 0.004757934715598822, "grad/layer_20/mlp": 0.005318500101566315, "grad/layer_20/attn_mlp_ratio": 0.8946008339338966, "grad/layer_24/attn": 0.007511192467063665, "grad/layer_24/mlp": 0.007363751996308565, "grad/layer_24/attn_mlp_ratio": 1.0200224517103194, "grad/layer_27/attn": 0.006762049160897732, "grad/layer_27/mlp": 0.006641914136707783, "grad/layer_27/attn_mlp_ratio": 1.0180873946739633} {"step": 46250, "timestamp": 1778244529.696534, "train/loss": 2.122300362586975, "train/z_loss": 0.001411281491164118, "train/perplexity": 8.350324184390145, "train/grad_norm": 0.11083984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026079.1175086051, "perf/iters_per_sec": 0.9661098086874986, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035079026222229, "data/tokens_consumed": 96995377152, "data/tokens_consumed_B": 96.995377152, "train/loss_slope": -7.99415679511598e-06} {"step": 46260, "timestamp": 1778244540.047321, "train/loss": 2.170139122009277, "train/z_loss": 0.0013982338365167379, "train/perplexity": 8.759502595576336, "train/grad_norm": 0.23828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027149.7428819588, "perf/iters_per_sec": 0.9666203226480288, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034532356262207, "data/tokens_consumed": 97016348672, "data/tokens_consumed_B": 97.016348672, "train/loss_slope": -9.210433174531587e-06} {"step": 46270, "timestamp": 1778244550.3908198, "train/loss": 2.178142762184143, "train/z_loss": 0.0013986551202833652, "train/perplexity": 8.82989181169065, "train/grad_norm": 0.1728515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028900.1316782855, "perf/iters_per_sec": 0.9674549730674198, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0336398363113404, "data/tokens_consumed": 97037320192, "data/tokens_consumed_B": 97.037320192, "train/loss_slope": -6.800372265066915e-06} {"step": 46275, "timestamp": 1778244556.162726, "eos/sharpness": 42.42384433746337, "eos/L0_probe": 1.987141489982605, "eos/L_plus": 2.1786019802093506, "eos/L_minus": 2.219919443130493, "eos/grad_norm": 0.1207313984632492, "eos/embed_grad_frac": 0.16342459619045258, "eos/time_s": 0.6041085720062256} {"step": 46275, "timestamp": 1778244557.538985, "geo/rankme_last": 439.1457214355469, "geo/layer_0/stable_rank_q_proj": 19.43551254272461, "geo/layer_0/stable_rank_k_proj": 16.409053802490234, "geo/layer_0/stable_rank_o_proj": 47.852848052978516, "geo/layer_0/stable_rank_gate_proj": 134.79019165039062, "geo/layer_0/stable_rank_down_proj": 53.5506477355957, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06435348838567734, "geo/layer_0/attn_entropy_mean": 6.1825056076049805, "geo/layer_0/attn_entropy_std": 0.38741981983184814, "geo/layer_7/stable_rank_q_proj": 42.3040657043457, "geo/layer_7/stable_rank_k_proj": 42.33566665649414, "geo/layer_7/stable_rank_o_proj": 94.84461975097656, "geo/layer_7/stable_rank_gate_proj": 88.07698059082031, "geo/layer_7/stable_rank_down_proj": 144.65985107421875, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4658285677433014, "geo/layer_7/attn_entropy_mean": 4.656081199645996, "geo/layer_7/attn_entropy_std": 0.7875581383705139, "geo/layer_14/stable_rank_q_proj": 53.18408966064453, "geo/layer_14/stable_rank_k_proj": 38.313987731933594, "geo/layer_14/stable_rank_o_proj": 46.650630950927734, "geo/layer_14/stable_rank_gate_proj": 74.60005950927734, "geo/layer_14/stable_rank_down_proj": 132.2747344970703, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39956480264663696, "geo/layer_14/attn_entropy_mean": 5.515486717224121, "geo/layer_14/attn_entropy_std": 0.3831900358200073, "geo/layer_21/stable_rank_q_proj": 42.337493896484375, "geo/layer_21/stable_rank_k_proj": 30.60402488708496, "geo/layer_21/stable_rank_o_proj": 74.1021728515625, "geo/layer_21/stable_rank_gate_proj": 70.78129577636719, "geo/layer_21/stable_rank_down_proj": 54.00963592529297, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14530815184116364, "geo/layer_21/attn_entropy_mean": 5.715249061584473, "geo/layer_21/attn_entropy_std": 0.29145732522010803, "geo/layer_27/stable_rank_q_proj": 42.63962936401367, "geo/layer_27/stable_rank_k_proj": 31.488876342773438, "geo/layer_27/stable_rank_o_proj": 116.29585266113281, "geo/layer_27/stable_rank_gate_proj": 83.6041259765625, "geo/layer_27/stable_rank_down_proj": 130.3819580078125, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08627871423959732, "geo/layer_27/attn_entropy_mean": 4.272874355316162, "geo/layer_27/attn_entropy_std": 0.6729680299758911, "attnres/final_alpha/block_0": 0.2421427071094513, "attnres/block_norm/0": 1.7350564002990723, "attnres/final_alpha/block_1": 0.005016766022890806, "attnres/block_norm/1": 42422.3828125, "attnres/final_alpha/block_2": 0.010865351185202599, "attnres/block_norm/2": 27215.60546875, "attnres/final_alpha/block_3": 0.012883941642940044, "attnres/block_norm/3": 50474.453125, "attnres/final_alpha/block_4": 0.015455767512321472, "attnres/block_norm/4": 13632.8876953125, "attnres/final_alpha/block_5": 0.5994693040847778, "attnres/block_norm/5": 6205.564453125, "attnres/final_alpha/block_6": 0.11416614055633545, "attnres/block_norm/6": 33380.42578125, "geo/tier1_time_s": 1.3578824996948242, "geo/step": 46275.0, "geo/rankme_slope": 6.538631077430972e-05} {"step": 46280, "timestamp": 1778244562.7204034, "train/loss": 2.178663992881775, "train/z_loss": 0.0013743570423685014, "train/perplexity": 8.834495422026958, "train/grad_norm": 0.146484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1701893.4219686151, "perf/iters_per_sec": 0.8115260228961063, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2322463750839234, "data/tokens_consumed": 97058291712, "data/tokens_consumed_B": 97.058291712, "train/loss_slope": -3.1660174879983374e-06} {"step": 46290, "timestamp": 1778244573.6517513, "train/loss": 2.1646429777145384, "train/z_loss": 0.0013889575842767955, "train/perplexity": 8.711491165094893, "train/grad_norm": 0.13671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1919357.4694146006, "perf/iters_per_sec": 0.9152209612915996, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0926323175430297, "data/tokens_consumed": 97079263232, "data/tokens_consumed_B": 97.079263232, "train/loss_slope": -3.596103073346739e-06} {"step": 46300, "timestamp": 1778244584.4168878, "grad/layer_0/attn": 0.0026414617896080017, "grad/layer_0/mlp": 0.0027886973693966866, "grad/layer_0/attn_mlp_ratio": 0.9472027061363031, "grad/layer_4/attn": 0.002132897498086095, "grad/layer_4/mlp": 0.0025509221013635397, "grad/layer_4/attn_mlp_ratio": 0.8361280077244226, "grad/layer_8/attn": 0.00338699109852314, "grad/layer_8/mlp": 0.0037841794546693563, "grad/layer_8/attn_mlp_ratio": 0.8950397436463818, "grad/layer_12/attn": 0.004842509049922228, "grad/layer_12/mlp": 0.006590248551219702, "grad/layer_12/attn_mlp_ratio": 0.7347991413079675, "grad/layer_16/attn": 0.004082245286554098, "grad/layer_16/mlp": 0.004370364360511303, "grad/layer_16/attn_mlp_ratio": 0.934074337149562, "grad/layer_20/attn": 0.008356294594705105, "grad/layer_20/mlp": 0.005788515321910381, "grad/layer_20/attn_mlp_ratio": 1.4435989171032184, "grad/layer_24/attn": 0.011179154738783836, "grad/layer_24/mlp": 0.009937888942658901, "grad/layer_24/attn_mlp_ratio": 1.12490235006617, "grad/layer_27/attn": 0.004211661405861378, "grad/layer_27/mlp": 0.008715620264410973, "grad/layer_27/attn_mlp_ratio": 0.48323139716584185} {"step": 46300, "timestamp": 1778244584.4315708, "train/loss": 2.155841064453125, "train/z_loss": 0.0013966141152195633, "train/perplexity": 8.635149843198564, "train/grad_norm": 0.1396484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1947077.5534321705, "perf/iters_per_sec": 0.9284389273796895, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0770767688751222, "data/tokens_consumed": 97100234752, "data/tokens_consumed_B": 97.100234752, "train/loss_slope": -1.4051471355974965e-06} {"step": 46310, "timestamp": 1778244594.7779868, "train/loss": 2.098000001907349, "train/z_loss": 0.0014128231443464756, "train/perplexity": 8.14985390974417, "train/grad_norm": 0.10888671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028478.892652261, "perf/iters_per_sec": 0.967254110647326, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0338544845581055, "data/tokens_consumed": 97121206272, "data/tokens_consumed_B": 97.121206272, "train/loss_slope": -4.001269289011568e-06} {"step": 46320, "timestamp": 1778244605.1274736, "train/loss": 2.137541711330414, "train/z_loss": 0.0014014201704412698, "train/perplexity": 8.478569218462926, "train/grad_norm": 0.21875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027950.707902704, "perf/iters_per_sec": 0.967002252532341, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341237545013429, "data/tokens_consumed": 97142177792, "data/tokens_consumed_B": 97.142177792, "train/loss_slope": -4.893773216547419e-06} {"step": 46330, "timestamp": 1778244615.475928, "train/loss": 2.1232516527175904, "train/z_loss": 0.001398725644685328, "train/perplexity": 8.358271544897669, "train/grad_norm": 0.103515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027920.9256166886, "perf/iters_per_sec": 0.9669880512317126, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341389417648315, "data/tokens_consumed": 97163149312, "data/tokens_consumed_B": 97.163149312, "train/loss_slope": -6.015990111622551e-06} {"step": 46340, "timestamp": 1778244625.8329632, "train/loss": 2.161172926425934, "train/z_loss": 0.0013997540809214115, "train/perplexity": 8.681314231985274, "train/grad_norm": 0.10205078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026334.612589353, "perf/iters_per_sec": 0.9662316382357373, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349485158920289, "data/tokens_consumed": 97184120832, "data/tokens_consumed_B": 97.184120832, "train/loss_slope": -8.84209996116723e-06} {"step": 46350, "timestamp": 1778244636.199519, "grad/layer_0/attn": 0.0026615408714860678, "grad/layer_0/mlp": 0.0027943637687712908, "grad/layer_0/attn_mlp_ratio": 0.9524675369698269, "grad/layer_4/attn": 0.0026634689420461655, "grad/layer_4/mlp": 0.002600938780233264, "grad/layer_4/attn_mlp_ratio": 1.0240413422584131, "grad/layer_8/attn": 0.005332964938133955, "grad/layer_8/mlp": 0.0038052864838391542, "grad/layer_8/attn_mlp_ratio": 1.4014621029550751, "grad/layer_12/attn": 0.00469741877168417, "grad/layer_12/mlp": 0.0065671224147081375, "grad/layer_12/attn_mlp_ratio": 0.7152933055784383, "grad/layer_16/attn": 0.006918599363416433, "grad/layer_16/mlp": 0.004496422596275806, "grad/layer_16/attn_mlp_ratio": 1.53868971641541, "grad/layer_20/attn": 0.004167559556663036, "grad/layer_20/mlp": 0.005583036225289106, "grad/layer_20/attn_mlp_ratio": 0.7464682860445526, "grad/layer_24/attn": 0.008280959911644459, "grad/layer_24/mlp": 0.00992968026548624, "grad/layer_24/attn_mlp_ratio": 0.8339603700062257, "grad/layer_27/attn": 0.006140775978565216, "grad/layer_27/mlp": 0.009612665511667728, "grad/layer_27/attn_mlp_ratio": 0.6388213453624791} {"step": 46350, "timestamp": 1778244636.798646, "eos/sharpness": 60.971093177795396, "eos/L0_probe": 1.9848610162734985, "eos/L_plus": 2.256052255630493, "eos/L_minus": 2.323380708694458, "eos/grad_norm": 0.1538778394460678, "eos/embed_grad_frac": 0.08845280855894089, "eos/time_s": 0.5962591171264648} {"step": 46350, "timestamp": 1778244636.8180335, "train/loss": 2.126876425743103, "train/z_loss": 0.00139960526721552, "train/perplexity": 8.388623358117794, "train/grad_norm": 0.154296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1910504.9996466169, "perf/iters_per_sec": 0.9109997747643551, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0976951122283936, "data/tokens_consumed": 97205092352, "data/tokens_consumed_B": 97.205092352, "train/loss_slope": -6.839999335684133e-06} {"step": 46350, "timestamp": 1778244638.1821113, "geo/rankme_last": 439.6242370605469, "geo/layer_0/stable_rank_q_proj": 19.43376922607422, "geo/layer_0/stable_rank_k_proj": 16.431989669799805, "geo/layer_0/stable_rank_o_proj": 47.844608306884766, "geo/layer_0/stable_rank_gate_proj": 135.0216064453125, "geo/layer_0/stable_rank_down_proj": 53.42542266845703, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06567651778459549, "geo/layer_0/attn_entropy_mean": 6.188309669494629, "geo/layer_0/attn_entropy_std": 0.38617780804634094, "geo/layer_7/stable_rank_q_proj": 42.347373962402344, "geo/layer_7/stable_rank_k_proj": 42.3393669128418, "geo/layer_7/stable_rank_o_proj": 94.91763305664062, "geo/layer_7/stable_rank_gate_proj": 88.2010726928711, "geo/layer_7/stable_rank_down_proj": 144.61903381347656, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.48652195930480957, "geo/layer_7/attn_entropy_mean": 4.670152187347412, "geo/layer_7/attn_entropy_std": 0.8174136281013489, "geo/layer_14/stable_rank_q_proj": 53.21902084350586, "geo/layer_14/stable_rank_k_proj": 38.3431396484375, "geo/layer_14/stable_rank_o_proj": 46.637569427490234, "geo/layer_14/stable_rank_gate_proj": 74.44802856445312, "geo/layer_14/stable_rank_down_proj": 132.1266632080078, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38985490798950195, "geo/layer_14/attn_entropy_mean": 5.520326614379883, "geo/layer_14/attn_entropy_std": 0.36506086587905884, "geo/layer_21/stable_rank_q_proj": 42.30668640136719, "geo/layer_21/stable_rank_k_proj": 30.510826110839844, "geo/layer_21/stable_rank_o_proj": 74.1596450805664, "geo/layer_21/stable_rank_gate_proj": 70.81055450439453, "geo/layer_21/stable_rank_down_proj": 54.06948471069336, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1428452730178833, "geo/layer_21/attn_entropy_mean": 5.722046852111816, "geo/layer_21/attn_entropy_std": 0.29539063572883606, "geo/layer_27/stable_rank_q_proj": 42.69242858886719, "geo/layer_27/stable_rank_k_proj": 31.464757919311523, "geo/layer_27/stable_rank_o_proj": 116.38275146484375, "geo/layer_27/stable_rank_gate_proj": 83.57652282714844, "geo/layer_27/stable_rank_down_proj": 130.14627075195312, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08369605243206024, "geo/layer_27/attn_entropy_mean": 4.24102783203125, "geo/layer_27/attn_entropy_std": 0.6666898727416992, "attnres/final_alpha/block_0": 0.2420676350593567, "attnres/block_norm/0": 1.7352626323699951, "attnres/final_alpha/block_1": 0.0049403999000787735, "attnres/block_norm/1": 42221.28125, "attnres/final_alpha/block_2": 0.01089851837605238, "attnres/block_norm/2": 27155.234375, "attnres/final_alpha/block_3": 0.012797176837921143, "attnres/block_norm/3": 50376.28125, "attnres/final_alpha/block_4": 0.015504345297813416, "attnres/block_norm/4": 13644.529296875, "attnres/final_alpha/block_5": 0.5987597703933716, "attnres/block_norm/5": 6228.8330078125, "attnres/final_alpha/block_6": 0.11503216624259949, "attnres/block_norm/6": 33273.703125, "geo/tier1_time_s": 1.359766960144043, "geo/step": 46350.0, "geo/rankme_slope": 4.846848895808323e-05} {"step": 46360, "timestamp": 1778244648.5543044, "train/loss": 2.1435701131820677, "train/z_loss": 0.0013846233254298569, "train/perplexity": 8.529835813397597, "train/grad_norm": 0.2177734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1787524.4109945814, "perf/iters_per_sec": 0.852358060357371, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1732158660888672, "data/tokens_consumed": 97226063872, "data/tokens_consumed_B": 97.226063872, "train/loss_slope": -8.048399432514516e-06} {"step": 46370, "timestamp": 1778244658.9344141, "train/loss": 2.1837103605270385, "train/z_loss": 0.001386678288690746, "train/perplexity": 8.879190212209341, "train/grad_norm": 0.1982421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021841.2683844306, "perf/iters_per_sec": 0.9640890447542336, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0372485876083375, "data/tokens_consumed": 97247035392, "data/tokens_consumed_B": 97.247035392, "train/loss_slope": -1.0226825311525152e-05} {"step": 46380, "timestamp": 1778244669.3083184, "train/loss": 2.204602670669556, "train/z_loss": 0.0013965344871394337, "train/perplexity": 9.066648406742193, "train/grad_norm": 0.158203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022906.0228441537, "perf/iters_per_sec": 0.9645967592449921, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0367026329040527, "data/tokens_consumed": 97268006912, "data/tokens_consumed_B": 97.268006912, "train/loss_slope": -8.858461713347394e-06} {"step": 46390, "timestamp": 1778244679.687508, "train/loss": 2.1336196780204775, "train/z_loss": 0.0013991601765155793, "train/perplexity": 8.445381112538543, "train/grad_norm": 0.10400390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021981.7671878587, "perf/iters_per_sec": 0.9641560398043912, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037176513671875, "data/tokens_consumed": 97288978432, "data/tokens_consumed_B": 97.288978432, "train/loss_slope": -9.298725840639783e-06} {"step": 46400, "timestamp": 1778244690.0505044, "grad/layer_0/attn": 0.002927925670519471, "grad/layer_0/mlp": 0.0030760045628994703, "grad/layer_0/attn_mlp_ratio": 0.9518599584174822, "grad/layer_4/attn": 0.0030907331965863705, "grad/layer_4/mlp": 0.0023564205039292574, "grad/layer_4/attn_mlp_ratio": 1.3116220387110304, "grad/layer_8/attn": 0.0048048957251012325, "grad/layer_8/mlp": 0.0035296061541885138, "grad/layer_8/attn_mlp_ratio": 1.3613121065272813, "grad/layer_12/attn": 0.0044736540876328945, "grad/layer_12/mlp": 0.006597721017897129, "grad/layer_12/attn_mlp_ratio": 0.6780605011475187, "grad/layer_16/attn": 0.010448860935866833, "grad/layer_16/mlp": 0.0047718221321702, "grad/layer_16/attn_mlp_ratio": 2.189700375974556, "grad/layer_20/attn": 0.00540941534563899, "grad/layer_20/mlp": 0.00625901622697711, "grad/layer_20/attn_mlp_ratio": 0.8642596636669189, "grad/layer_24/attn": 0.009899364784359932, "grad/layer_24/mlp": 0.009556830860674381, "grad/layer_24/attn_mlp_ratio": 1.0358417790473695, "grad/layer_27/attn": 0.007404692936688662, "grad/layer_27/mlp": 0.008768109604716301, "grad/layer_27/attn_mlp_ratio": 0.844502770386841} {"step": 46400, "timestamp": 1778244690.0648596, "train/loss": 2.1436644792556763, "train/z_loss": 0.001397238508798182, "train/perplexity": 8.530640778491934, "train/grad_norm": 0.11474609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021778.8100854252, "perf/iters_per_sec": 0.9640592623164297, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0372806310653686, "data/tokens_consumed": 97309949952, "data/tokens_consumed_B": 97.309949952, "train/loss_slope": -1.0032485828290932e-05} {"step": 46410, "timestamp": 1778244700.4375129, "train/loss": 2.156237518787384, "train/z_loss": 0.001403642341028899, "train/perplexity": 8.638573964489888, "train/grad_norm": 0.349609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023196.6896951213, "perf/iters_per_sec": 0.9647353600001913, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036553692817688, "data/tokens_consumed": 97330921472, "data/tokens_consumed_B": 97.330921472, "train/loss_slope": -9.948248867512178e-06} {"step": 46420, "timestamp": 1778244710.8114858, "train/loss": 2.1272567987442015, "train/z_loss": 0.0014128590701147914, "train/perplexity": 8.391814770884752, "train/grad_norm": 0.095703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022911.2798828452, "perf/iters_per_sec": 0.9645992659963823, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036699938774109, "data/tokens_consumed": 97351892992, "data/tokens_consumed_B": 97.351892992, "train/loss_slope": -1.3011401800504157e-05} {"step": 46425, "timestamp": 1778244716.5861442, "eos/sharpness": 24.299049377441403, "eos/L0_probe": 1.9912176132202148, "eos/L_plus": 2.1369435787200928, "eos/L_minus": 2.088482141494751, "eos/grad_norm": 0.10456684231758118, "eos/embed_grad_frac": 0.19174472987651825, "eos/time_s": 0.5967898368835449} {"step": 46425, "timestamp": 1778244717.966637, "geo/rankme_last": 438.958740234375, "geo/layer_0/stable_rank_q_proj": 19.45444107055664, "geo/layer_0/stable_rank_k_proj": 16.461048126220703, "geo/layer_0/stable_rank_o_proj": 47.88498306274414, "geo/layer_0/stable_rank_gate_proj": 135.29185485839844, "geo/layer_0/stable_rank_down_proj": 53.446815490722656, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06017209216952324, "geo/layer_0/attn_entropy_mean": 6.190868377685547, "geo/layer_0/attn_entropy_std": 0.3893239498138428, "geo/layer_7/stable_rank_q_proj": 42.317230224609375, "geo/layer_7/stable_rank_k_proj": 42.24898910522461, "geo/layer_7/stable_rank_o_proj": 94.7384262084961, "geo/layer_7/stable_rank_gate_proj": 88.18402862548828, "geo/layer_7/stable_rank_down_proj": 144.39846801757812, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.48778700828552246, "geo/layer_7/attn_entropy_mean": 4.655223369598389, "geo/layer_7/attn_entropy_std": 0.812717854976654, "geo/layer_14/stable_rank_q_proj": 53.17951202392578, "geo/layer_14/stable_rank_k_proj": 38.363826751708984, "geo/layer_14/stable_rank_o_proj": 46.612369537353516, "geo/layer_14/stable_rank_gate_proj": 74.47613525390625, "geo/layer_14/stable_rank_down_proj": 132.08151245117188, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38641077280044556, "geo/layer_14/attn_entropy_mean": 5.546720504760742, "geo/layer_14/attn_entropy_std": 0.3596454858779907, "geo/layer_21/stable_rank_q_proj": 42.30650329589844, "geo/layer_21/stable_rank_k_proj": 30.586902618408203, "geo/layer_21/stable_rank_o_proj": 74.28385162353516, "geo/layer_21/stable_rank_gate_proj": 70.78582763671875, "geo/layer_21/stable_rank_down_proj": 54.0211181640625, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1457696259021759, "geo/layer_21/attn_entropy_mean": 5.717015266418457, "geo/layer_21/attn_entropy_std": 0.29753607511520386, "geo/layer_27/stable_rank_q_proj": 42.67851257324219, "geo/layer_27/stable_rank_k_proj": 31.450510025024414, "geo/layer_27/stable_rank_o_proj": 116.40138244628906, "geo/layer_27/stable_rank_gate_proj": 83.53154754638672, "geo/layer_27/stable_rank_down_proj": 130.28573608398438, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08872530609369278, "geo/layer_27/attn_entropy_mean": 4.245515823364258, "geo/layer_27/attn_entropy_std": 0.6723019480705261, "attnres/final_alpha/block_0": 0.24197784066200256, "attnres/block_norm/0": 1.7354811429977417, "attnres/final_alpha/block_1": 0.004982846789062023, "attnres/block_norm/1": 42194.1796875, "attnres/final_alpha/block_2": 0.010844605043530464, "attnres/block_norm/2": 27034.45703125, "attnres/final_alpha/block_3": 0.012786658480763435, "attnres/block_norm/3": 50721.8046875, "attnres/final_alpha/block_4": 0.015661969780921936, "attnres/block_norm/4": 13674.390625, "attnres/final_alpha/block_5": 0.6003895998001099, "attnres/block_norm/5": 6241.6298828125, "attnres/final_alpha/block_6": 0.11335648596286774, "attnres/block_norm/6": 33500.44140625, "geo/tier1_time_s": 1.3619821071624756, "geo/step": 46425.0, "geo/rankme_slope": 5.2503227853641454e-05} {"step": 46430, "timestamp": 1778244723.1793265, "train/loss": 2.1226566076278686, "train/z_loss": 0.0014072801801376044, "train/perplexity": 8.353299475905644, "train/grad_norm": 0.1279296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1696341.66501772, "perf/iters_per_sec": 0.8088787388886071, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2362792491912842, "data/tokens_consumed": 97372864512, "data/tokens_consumed_B": 97.372864512, "train/loss_slope": -1.781482483365679e-05} {"step": 46440, "timestamp": 1778244733.5606112, "train/loss": 2.168846940994263, "train/z_loss": 0.0013987238286063074, "train/perplexity": 8.74819104248238, "train/grad_norm": 0.1708984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022042.843546986, "perf/iters_per_sec": 0.964185163281911, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037145185470581, "data/tokens_consumed": 97393836032, "data/tokens_consumed_B": 97.393836032, "train/loss_slope": -1.6678315928631145e-05} {"step": 46450, "timestamp": 1778244743.936738, "grad/layer_0/attn": 0.0028628807049244642, "grad/layer_0/mlp": 0.0029048153664916754, "grad/layer_0/attn_mlp_ratio": 0.985563708934028, "grad/layer_4/attn": 0.0019059110200032592, "grad/layer_4/mlp": 0.002445185324177146, "grad/layer_4/attn_mlp_ratio": 0.7794545972498738, "grad/layer_8/attn": 0.005679891910403967, "grad/layer_8/mlp": 0.0037194201722741127, "grad/layer_8/attn_mlp_ratio": 1.527090647094632, "grad/layer_12/attn": 0.003681452013552189, "grad/layer_12/mlp": 0.006778721697628498, "grad/layer_12/attn_mlp_ratio": 0.5430894088086233, "grad/layer_16/attn": 0.004182266071438789, "grad/layer_16/mlp": 0.004623544868081808, "grad/layer_16/attn_mlp_ratio": 0.9045583205766692, "grad/layer_20/attn": 0.004096621181815863, "grad/layer_20/mlp": 0.005211427807807922, "grad/layer_20/attn_mlp_ratio": 0.7860842084523857, "grad/layer_24/attn": 0.0038509138394147158, "grad/layer_24/mlp": 0.006925477646291256, "grad/layer_24/attn_mlp_ratio": 0.5560502799214053, "grad/layer_27/attn": 0.0038099423982203007, "grad/layer_27/mlp": 0.005968295503407717, "grad/layer_27/attn_mlp_ratio": 0.6383635549226045} {"step": 46450, "timestamp": 1778244743.9509697, "train/loss": 2.148136579990387, "train/z_loss": 0.001394207391422242, "train/perplexity": 8.568876095755689, "train/grad_norm": 0.09375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019775.7797525313, "perf/iters_per_sec": 0.9631041430246979, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0383093118667603, "data/tokens_consumed": 97414807552, "data/tokens_consumed_B": 97.414807552, "train/loss_slope": -1.9263807038376728e-05} {"step": 46460, "timestamp": 1778244754.3259013, "train/loss": 2.142495059967041, "train/z_loss": 0.0013991747167892755, "train/perplexity": 8.520670713350595, "train/grad_norm": 0.126953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022391.3375540075, "perf/iters_per_sec": 0.9643513381738699, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0369664669036864, "data/tokens_consumed": 97435779072, "data/tokens_consumed_B": 97.435779072, "train/loss_slope": -2.2805457575844127e-05} {"step": 46470, "timestamp": 1778244764.6983645, "train/loss": 2.1320650577545166, "train/z_loss": 0.0013957854243926704, "train/perplexity": 8.432261952205737, "train/grad_norm": 0.2470703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023137.4979964918, "perf/iters_per_sec": 0.9647071351988277, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0365840196609497, "data/tokens_consumed": 97456750592, "data/tokens_consumed_B": 97.456750592, "train/loss_slope": -2.359813826240795e-05} {"step": 46480, "timestamp": 1778244775.0719144, "train/loss": 2.126583194732666, "train/z_loss": 0.0014017701614648104, "train/perplexity": 8.38616391422455, "train/grad_norm": 0.2333984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022927.3768261033, "perf/iters_per_sec": 0.9646069416170613, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0366916894912719, "data/tokens_consumed": 97477722112, "data/tokens_consumed_B": 97.477722112, "train/loss_slope": -2.73358749406244e-05} {"step": 46490, "timestamp": 1778244785.451945, "train/loss": 2.1828277349472045, "train/z_loss": 0.0013993867905810475, "train/perplexity": 8.871356669351034, "train/grad_norm": 0.11572265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021333.4883269456, "perf/iters_per_sec": 0.9638469163546303, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0375091552734375, "data/tokens_consumed": 97498693632, "data/tokens_consumed_B": 97.498693632, "train/loss_slope": -2.6637131889553856e-05} {"step": 46500, "timestamp": 1778244795.8231757, "grad/layer_0/attn": 0.0027668545953929424, "grad/layer_0/mlp": 0.0028769841883331537, "grad/layer_0/attn_mlp_ratio": 0.9617204399110509, "grad/layer_4/attn": 0.0019331418443471193, "grad/layer_4/mlp": 0.0024764302652329206, "grad/layer_4/attn_mlp_ratio": 0.7806162739267243, "grad/layer_8/attn": 0.009351042099297047, "grad/layer_8/mlp": 0.003643678268417716, "grad/layer_8/attn_mlp_ratio": 2.566374183942525, "grad/layer_12/attn": 0.0038008277770131826, "grad/layer_12/mlp": 0.0062529342249035835, "grad/layer_12/attn_mlp_ratio": 0.6078470649972466, "grad/layer_16/attn": 0.003418860025703907, "grad/layer_16/mlp": 0.004369590897113085, "grad/layer_16/attn_mlp_ratio": 0.7824210613676865, "grad/layer_20/attn": 0.0037750278133898973, "grad/layer_20/mlp": 0.006857104133814573, "grad/layer_20/attn_mlp_ratio": 0.5505279903394247, "grad/layer_24/attn": 0.021807337179780006, "grad/layer_24/mlp": 0.011802486144006252, "grad/layer_24/attn_mlp_ratio": 1.8476901162121324, "grad/layer_27/attn": 0.008826671168208122, "grad/layer_27/mlp": 0.009994808584451675, "grad/layer_27/attn_mlp_ratio": 0.8831255751737646} {"step": 46500, "timestamp": 1778244796.4132383, "eos/sharpness": 70.15795707702635, "eos/L0_probe": 1.9889081716537476, "eos/L_plus": 2.4078168869018555, "eos/L_minus": 2.2715790271759033, "eos/grad_norm": 0.2056926190853119, "eos/embed_grad_frac": 0.053531464189291, "eos/time_s": 0.587287425994873} {"step": 46500, "timestamp": 1778244796.4337437, "train/loss": 2.108621621131897, "train/z_loss": 0.0013971543638035655, "train/perplexity": 8.236879915075582, "train/grad_norm": 0.205078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1910715.5322017437, "perf/iters_per_sec": 0.911100164509651, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0975741624832154, "data/tokens_consumed": 97519665152, "data/tokens_consumed_B": 97.519665152, "train/loss_slope": -2.7936097426061127e-05} {"step": 46500, "timestamp": 1778244797.7970192, "geo/rankme_last": 438.5892639160156, "geo/layer_0/stable_rank_q_proj": 19.438871383666992, "geo/layer_0/stable_rank_k_proj": 16.43317222595215, "geo/layer_0/stable_rank_o_proj": 47.93443298339844, "geo/layer_0/stable_rank_gate_proj": 135.1375274658203, "geo/layer_0/stable_rank_down_proj": 53.38884353637695, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.060323331505060196, "geo/layer_0/attn_entropy_mean": 6.186349868774414, "geo/layer_0/attn_entropy_std": 0.38561320304870605, "geo/layer_7/stable_rank_q_proj": 42.383060455322266, "geo/layer_7/stable_rank_k_proj": 42.26045608520508, "geo/layer_7/stable_rank_o_proj": 94.77837371826172, "geo/layer_7/stable_rank_gate_proj": 88.13127899169922, "geo/layer_7/stable_rank_down_proj": 144.36929321289062, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.47380563616752625, "geo/layer_7/attn_entropy_mean": 4.650208950042725, "geo/layer_7/attn_entropy_std": 0.8109905123710632, "geo/layer_14/stable_rank_q_proj": 53.27214050292969, "geo/layer_14/stable_rank_k_proj": 38.574832916259766, "geo/layer_14/stable_rank_o_proj": 46.54804611206055, "geo/layer_14/stable_rank_gate_proj": 74.41641235351562, "geo/layer_14/stable_rank_down_proj": 131.8428497314453, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37981274724006653, "geo/layer_14/attn_entropy_mean": 5.549317359924316, "geo/layer_14/attn_entropy_std": 0.36608263850212097, "geo/layer_21/stable_rank_q_proj": 42.26326370239258, "geo/layer_21/stable_rank_k_proj": 30.68680763244629, "geo/layer_21/stable_rank_o_proj": 74.24686431884766, "geo/layer_21/stable_rank_gate_proj": 70.66973876953125, "geo/layer_21/stable_rank_down_proj": 54.08992004394531, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14860503375530243, "geo/layer_21/attn_entropy_mean": 5.731112480163574, "geo/layer_21/attn_entropy_std": 0.28490328788757324, "geo/layer_27/stable_rank_q_proj": 42.57521438598633, "geo/layer_27/stable_rank_k_proj": 31.386472702026367, "geo/layer_27/stable_rank_o_proj": 116.08998107910156, "geo/layer_27/stable_rank_gate_proj": 83.3958740234375, "geo/layer_27/stable_rank_down_proj": 130.2720489501953, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0901344045996666, "geo/layer_27/attn_entropy_mean": 4.269322395324707, "geo/layer_27/attn_entropy_std": 0.6734651923179626, "attnres/final_alpha/block_0": 0.23929807543754578, "attnres/block_norm/0": 1.735532283782959, "attnres/final_alpha/block_1": 0.004800899885594845, "attnres/block_norm/1": 42516.6875, "attnres/final_alpha/block_2": 0.01082666777074337, "attnres/block_norm/2": 27132.896484375, "attnres/final_alpha/block_3": 0.012631713412702084, "attnres/block_norm/3": 51070.08984375, "attnres/final_alpha/block_4": 0.015057306736707687, "attnres/block_norm/4": 13646.89453125, "attnres/final_alpha/block_5": 0.6058392524719238, "attnres/block_norm/5": 6188.69091796875, "attnres/final_alpha/block_6": 0.11154605448246002, "attnres/block_norm/6": 33653.5, "geo/tier1_time_s": 1.3591177463531494, "geo/step": 46500.0, "geo/rankme_slope": 4.824009681997799e-05} {"step": 46500, "timestamp": 1778244804.9832685, "geo/ww_alpha_mean": 7.3398777014370875, "geo/ww_alpha_std": 4.0462012797216165, "geo/ww_alpha_min": 1.3300843440042107, "geo/ww_alpha_max": 24.22685778148723, "geo/ww_alpha_healthy_frac": 0.16243654822335024, "geo/ww_alpha_by_type/q_proj": 4.078171162856277, "geo/ww_alpha_by_type/k_proj": 4.478095049192209, "geo/ww_alpha_by_type/v_proj": 7.320750557855953, "geo/ww_alpha_by_type/o_proj": 7.342644067080102, "geo/ww_alpha_by_type/gate_proj": 8.028788632559133, "geo/ww_alpha_by_type/up_proj": 11.97909225401445, "geo/ww_alpha_by_type/down_proj": 8.242554869671707, "geo/twonn_id/layer_0": 0.7028331160545349, "geo/twonn_id/layer_7": 3.6334288120269775, "geo/twonn_id/layer_14": 4.058767795562744, "geo/twonn_id/layer_21": 6.737439155578613, "geo/twonn_id/layer_27": 6.27171516418457, "geo/tier2_time_s": 7.180217027664185} {"step": 46500, "timestamp": 1778244805.6368499, "eoc/jacobian_sigma/layer_0/attn": 1074.41748046875, "eoc/jacobian_sigma/layer_0/mlp": 7594.55810546875, "eoc/jacobian_sigma/layer_0": 7594.55810546875, "eoc/jacobian_sigma/layer_7/attn": 1.1767522096633911, "eoc/jacobian_sigma/layer_7/mlp": 1.8034157752990723, "eoc/jacobian_sigma/layer_7": 1.8034157752990723, "eoc/jacobian_sigma/layer_14/attn": 1.6008888483047485, "eoc/jacobian_sigma/layer_14/mlp": 7.012548923492432, "eoc/jacobian_sigma/layer_14": 7.012548923492432, "eoc/jacobian_sigma/layer_21/attn": 1.0774534940719604, "eoc/jacobian_sigma/layer_21/mlp": 3.8543779850006104, "eoc/jacobian_sigma/layer_21": 3.8543779850006104, "eoc/jacobian_sigma/layer_27/attn": 3.4844682216644287, "eoc/jacobian_sigma/layer_27/mlp": 31.461200714111328, "eoc/jacobian_sigma/layer_27": 31.461200714111328, "eoc/layer0_sigma": 7594.55810546875, "eoc/sigma_max": 31.461200714111328, "eoc/sigma_min": 1.8034157752990723, "eoc/sigma_mean": 11.03288584947586, "eoc/time_s": 0.6437559127807617} {"step": 46510, "timestamp": 1778244816.0386035, "train/loss": 2.1532726287841797, "train/z_loss": 0.0013933504815213382, "train/perplexity": 8.612999474410149, "train/grad_norm": 0.185546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1069927.4646731226, "perf/iters_per_sec": 0.5101811717382062, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.9600880146026611, "data/tokens_consumed": 97540636672, "data/tokens_consumed_B": 97.540636672, "train/loss_slope": -2.798830547956529e-05} {"step": 46520, "timestamp": 1778244826.4280355, "train/loss": 2.114294719696045, "train/z_loss": 0.0014024602132849395, "train/perplexity": 8.283741345769977, "train/grad_norm": 0.103515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021133.4013458823, "perf/iters_per_sec": 0.9637515074471866, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0376118659973144, "data/tokens_consumed": 97561608192, "data/tokens_consumed_B": 97.561608192, "train/loss_slope": -3.2756348940977766e-05} {"step": 46530, "timestamp": 1778244836.8100324, "train/loss": 2.148510551452637, "train/z_loss": 0.0014094203012064098, "train/perplexity": 8.572081210152362, "train/grad_norm": 0.1220703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021055.4301694937, "perf/iters_per_sec": 0.9637143278930157, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0376518964767456, "data/tokens_consumed": 97582579712, "data/tokens_consumed_B": 97.582579712, "train/loss_slope": -3.3662577831383894e-05} {"step": 46540, "timestamp": 1778244847.1906364, "train/loss": 2.178917479515076, "train/z_loss": 0.00139718281570822, "train/perplexity": 8.836735132384726, "train/grad_norm": 0.193359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021223.6399430837, "perf/iters_per_sec": 0.9637945365634364, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037565541267395, "data/tokens_consumed": 97603551232, "data/tokens_consumed_B": 97.603551232, "train/loss_slope": -3.275194790425544e-05} {"step": 46550, "timestamp": 1778244857.5655293, "grad/layer_0/attn": 0.004102771170437336, "grad/layer_0/mlp": 0.003396012354642153, "grad/layer_0/attn_mlp_ratio": 1.2081142885177245, "grad/layer_4/attn": 0.002471586922183633, "grad/layer_4/mlp": 0.002680538920685649, "grad/layer_4/attn_mlp_ratio": 0.9220484772318036, "grad/layer_8/attn": 0.003639552742242813, "grad/layer_8/mlp": 0.003831042442470789, "grad/layer_8/attn_mlp_ratio": 0.9500162689124103, "grad/layer_12/attn": 0.005297234747558832, "grad/layer_12/mlp": 0.00694904662668705, "grad/layer_12/attn_mlp_ratio": 0.762296607852036, "grad/layer_16/attn": 0.006672288756817579, "grad/layer_16/mlp": 0.005573795177042484, "grad/layer_16/attn_mlp_ratio": 1.1970817773483005, "grad/layer_20/attn": 0.005537237972021103, "grad/layer_20/mlp": 0.0070493100211024284, "grad/layer_20/attn_mlp_ratio": 0.7855006911165862, "grad/layer_24/attn": 0.022290537133812904, "grad/layer_24/mlp": 0.013309052214026451, "grad/layer_24/attn_mlp_ratio": 1.6748402972554883, "grad/layer_27/attn": 0.01900983601808548, "grad/layer_27/mlp": 0.013007492758333683, "grad/layer_27/attn_mlp_ratio": 1.4614527353675384} {"step": 46550, "timestamp": 1778244857.5797307, "train/loss": 2.1737907409667967, "train/z_loss": 0.0013897717115469276, "train/perplexity": 8.791547433473557, "train/grad_norm": 0.3203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020358.0438565572, "perf/iters_per_sec": 0.9633817881853853, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0380100727081298, "data/tokens_consumed": 97624522752, "data/tokens_consumed_B": 97.624522752, "train/loss_slope": -3.2066348977465185e-05} {"step": 46560, "timestamp": 1778244867.9591603, "train/loss": 2.184842038154602, "train/z_loss": 0.0014018547837622464, "train/perplexity": 8.88924428103274, "train/grad_norm": 0.205078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022028.9453297292, "perf/iters_per_sec": 0.9641785360954901, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037152314186096, "data/tokens_consumed": 97645494272, "data/tokens_consumed_B": 97.645494272, "train/loss_slope": -3.018435878221458e-05} {"step": 46570, "timestamp": 1778244878.3424683, "train/loss": 2.2223663568496703, "train/z_loss": 0.0013732255436480045, "train/perplexity": 9.229144495580892, "train/grad_norm": 0.10302734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020855.119662145, "perf/iters_per_sec": 0.9636188123999333, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03775475025177, "data/tokens_consumed": 97666465792, "data/tokens_consumed_B": 97.666465792, "train/loss_slope": -2.637599698423516e-05} {"step": 46575, "timestamp": 1778244884.1274226, "eos/sharpness": 64.96798992156981, "eos/L0_probe": 1.9853780269622803, "eos/L_plus": 2.354957342147827, "eos/L_minus": 2.2654786109924316, "eos/grad_norm": 0.17572645843029022, "eos/embed_grad_frac": 0.07298438251018524, "eos/time_s": 0.6041295528411865} {"step": 46575, "timestamp": 1778244885.50664, "geo/rankme_last": 439.0709228515625, "geo/layer_0/stable_rank_q_proj": 19.434751510620117, "geo/layer_0/stable_rank_k_proj": 16.380596160888672, "geo/layer_0/stable_rank_o_proj": 47.998008728027344, "geo/layer_0/stable_rank_gate_proj": 134.75985717773438, "geo/layer_0/stable_rank_down_proj": 53.32600402832031, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06516008824110031, "geo/layer_0/attn_entropy_mean": 6.184834003448486, "geo/layer_0/attn_entropy_std": 0.38896557688713074, "geo/layer_7/stable_rank_q_proj": 42.37655258178711, "geo/layer_7/stable_rank_k_proj": 42.30266571044922, "geo/layer_7/stable_rank_o_proj": 94.7948226928711, "geo/layer_7/stable_rank_gate_proj": 88.04711151123047, "geo/layer_7/stable_rank_down_proj": 144.68116760253906, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.48099231719970703, "geo/layer_7/attn_entropy_mean": 4.675136566162109, "geo/layer_7/attn_entropy_std": 0.8153166770935059, "geo/layer_14/stable_rank_q_proj": 53.217742919921875, "geo/layer_14/stable_rank_k_proj": 38.459861755371094, "geo/layer_14/stable_rank_o_proj": 46.495975494384766, "geo/layer_14/stable_rank_gate_proj": 74.4757080078125, "geo/layer_14/stable_rank_down_proj": 132.0487518310547, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4064241945743561, "geo/layer_14/attn_entropy_mean": 5.513360500335693, "geo/layer_14/attn_entropy_std": 0.3691125512123108, "geo/layer_21/stable_rank_q_proj": 42.33148193359375, "geo/layer_21/stable_rank_k_proj": 30.67414665222168, "geo/layer_21/stable_rank_o_proj": 74.23111724853516, "geo/layer_21/stable_rank_gate_proj": 70.69571685791016, "geo/layer_21/stable_rank_down_proj": 54.03529357910156, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14187949895858765, "geo/layer_21/attn_entropy_mean": 5.727899551391602, "geo/layer_21/attn_entropy_std": 0.28022921085357666, "geo/layer_27/stable_rank_q_proj": 42.661888122558594, "geo/layer_27/stable_rank_k_proj": 31.452396392822266, "geo/layer_27/stable_rank_o_proj": 115.86962127685547, "geo/layer_27/stable_rank_gate_proj": 83.43478393554688, "geo/layer_27/stable_rank_down_proj": 130.64544677734375, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08764606714248657, "geo/layer_27/attn_entropy_mean": 4.28410005569458, "geo/layer_27/attn_entropy_std": 0.6468919515609741, "attnres/final_alpha/block_0": 0.23941010236740112, "attnres/block_norm/0": 1.7357077598571777, "attnres/final_alpha/block_1": 0.004833599086850882, "attnres/block_norm/1": 42397.828125, "attnres/final_alpha/block_2": 0.01089807040989399, "attnres/block_norm/2": 27200.80078125, "attnres/final_alpha/block_3": 0.012729518115520477, "attnres/block_norm/3": 50957.49609375, "attnres/final_alpha/block_4": 0.015344018116593361, "attnres/block_norm/4": 13667.2939453125, "attnres/final_alpha/block_5": 0.6043156385421753, "attnres/block_norm/5": 6173.228515625, "attnres/final_alpha/block_6": 0.11246910691261292, "attnres/block_norm/6": 33349.0, "geo/tier1_time_s": 1.3592498302459717, "geo/step": 46575.0, "geo/rankme_slope": 6.352251838235295e-05} {"step": 46580, "timestamp": 1778244890.7022526, "train/loss": 2.2057531595230104, "train/z_loss": 0.0013881249818950892, "train/perplexity": 9.077085487393587, "train/grad_norm": 0.1796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1697647.9028649956, "perf/iters_per_sec": 0.8095016016316393, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2353280067443848, "data/tokens_consumed": 97687437312, "data/tokens_consumed_B": 97.687437312, "train/loss_slope": -2.2037611943338487e-05} {"step": 46590, "timestamp": 1778244901.0803268, "train/loss": 2.1422444581985474, "train/z_loss": 0.0014046956901438534, "train/perplexity": 8.5185356857331, "train/grad_norm": 0.1396484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021623.982193533, "perf/iters_per_sec": 0.9639854346244493, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373600721359253, "data/tokens_consumed": 97708408832, "data/tokens_consumed_B": 97.708408832, "train/loss_slope": -2.4948417786324358e-05} {"step": 46600, "timestamp": 1778244911.45173, "grad/layer_0/attn": 0.002994987415149808, "grad/layer_0/mlp": 0.00298088020645082, "grad/layer_0/attn_mlp_ratio": 1.0047325310809894, "grad/layer_4/attn": 0.002544951857998967, "grad/layer_4/mlp": 0.002647865330800414, "grad/layer_4/attn_mlp_ratio": 0.9611333825335914, "grad/layer_8/attn": 0.004855803679674864, "grad/layer_8/mlp": 0.003955061547458172, "grad/layer_8/attn_mlp_ratio": 1.2277441194363123, "grad/layer_12/attn": 0.005154898390173912, "grad/layer_12/mlp": 0.006572154350578785, "grad/layer_12/attn_mlp_ratio": 0.7843544196865215, "grad/layer_16/attn": 0.006173182278871536, "grad/layer_16/mlp": 0.004569496493786573, "grad/layer_16/attn_mlp_ratio": 1.350954564068522, "grad/layer_20/attn": 0.004668806679546833, "grad/layer_20/mlp": 0.006044554058462381, "grad/layer_20/attn_mlp_ratio": 0.7723988497994513, "grad/layer_24/attn": 0.012856456451117992, "grad/layer_24/mlp": 0.011261162348091602, "grad/layer_24/attn_mlp_ratio": 1.1416633505093199, "grad/layer_27/attn": 0.004894251935184002, "grad/layer_27/mlp": 0.010049228556454182, "grad/layer_27/attn_mlp_ratio": 0.4870276219698352} {"step": 46600, "timestamp": 1778244911.466132, "train/loss": 2.184209132194519, "train/z_loss": 0.001378794654738158, "train/perplexity": 8.883620005353183, "train/grad_norm": 0.13671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020167.2899946808, "perf/iters_per_sec": 0.9632908296559719, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0381080865859986, "data/tokens_consumed": 97729380352, "data/tokens_consumed_B": 97.729380352, "train/loss_slope": -2.4125425092386384e-05} {"step": 46610, "timestamp": 1778244921.8481681, "train/loss": 2.1601685523986816, "train/z_loss": 0.0013936146860942245, "train/perplexity": 8.672599322695133, "train/grad_norm": 0.11279296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021489.8977267006, "perf/iters_per_sec": 0.9639214981683257, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037428879737854, "data/tokens_consumed": 97750351872, "data/tokens_consumed_B": 97.750351872, "train/loss_slope": -1.9722035758816496e-05} {"step": 46620, "timestamp": 1778244932.2417407, "train/loss": 2.168890118598938, "train/z_loss": 0.0013927418855018913, "train/perplexity": 8.748568776571604, "train/grad_norm": 0.158203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2018896.6410923337, "perf/iters_per_sec": 0.9626849370443028, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0387614488601684, "data/tokens_consumed": 97771323392, "data/tokens_consumed_B": 97.771323392, "train/loss_slope": -2.2500155038601835e-05} {"step": 46630, "timestamp": 1778244942.624719, "train/loss": 2.1144230365753174, "train/z_loss": 0.001394380535930395, "train/perplexity": 8.2848043578079, "train/grad_norm": 0.1591796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020737.8491862286, "perf/iters_per_sec": 0.9635628934794562, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037814974784851, "data/tokens_consumed": 97792294912, "data/tokens_consumed_B": 97.792294912, "train/loss_slope": -2.3941740838035652e-05} {"step": 46640, "timestamp": 1778244953.0135727, "train/loss": 2.111350250244141, "train/z_loss": 0.0014051427599042654, "train/perplexity": 8.259385996816908, "train/grad_norm": 0.09130859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020221.6681108098, "perf/iters_per_sec": 0.9633167591623353, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0380801439285279, "data/tokens_consumed": 97813266432, "data/tokens_consumed_B": 97.813266432, "train/loss_slope": -2.4885103468633116e-05} {"step": 46650, "timestamp": 1778244963.3905904, "grad/layer_0/attn": 0.0033512027002871037, "grad/layer_0/mlp": 0.002905914094299078, "grad/layer_0/attn_mlp_ratio": 1.1532352561757013, "grad/layer_4/attn": 0.001988399773836136, "grad/layer_4/mlp": 0.0023848009295761585, "grad/layer_4/attn_mlp_ratio": 0.8337801557346386, "grad/layer_8/attn": 0.003148679155856371, "grad/layer_8/mlp": 0.0035497385542839766, "grad/layer_8/attn_mlp_ratio": 0.8870171757733235, "grad/layer_12/attn": 0.004416388459503651, "grad/layer_12/mlp": 0.006440701894462109, "grad/layer_12/attn_mlp_ratio": 0.6856998605588307, "grad/layer_16/attn": 0.00527841504663229, "grad/layer_16/mlp": 0.005048633553087711, "grad/layer_16/attn_mlp_ratio": 1.0455135803731859, "grad/layer_20/attn": 0.00449137669056654, "grad/layer_20/mlp": 0.006222786381840706, "grad/layer_20/attn_mlp_ratio": 0.7217629439276513, "grad/layer_24/attn": 0.016643192619085312, "grad/layer_24/mlp": 0.011689740233123302, "grad/layer_24/attn_mlp_ratio": 1.4237435687023965, "grad/layer_27/attn": 0.00538780028000474, "grad/layer_27/mlp": 0.010434044525027275, "grad/layer_27/attn_mlp_ratio": 0.5163673794418582} {"step": 46650, "timestamp": 1778244963.9860344, "eos/sharpness": 66.41187667846678, "eos/L0_probe": 1.9854060411453247, "eos/L_plus": 2.279205560684204, "eos/L_minus": 2.3557252883911133, "eos/grad_norm": 0.2033669352531433, "eos/embed_grad_frac": 0.05034490302205086, "eos/time_s": 0.5926742553710938} {"step": 46650, "timestamp": 1778244964.006463, "train/loss": 2.1307183384895323, "train/z_loss": 0.0013906451407819987, "train/perplexity": 8.420913705753517, "train/grad_norm": 0.203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1908586.8645975005, "perf/iters_per_sec": 0.9100851366984847, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0987982988357543, "data/tokens_consumed": 97834237952, "data/tokens_consumed_B": 97.834237952, "train/loss_slope": -2.3297409620722796e-05} {"step": 46650, "timestamp": 1778244965.3681087, "geo/rankme_last": 439.5131530761719, "geo/layer_0/stable_rank_q_proj": 19.439743041992188, "geo/layer_0/stable_rank_k_proj": 16.350902557373047, "geo/layer_0/stable_rank_o_proj": 47.98297882080078, "geo/layer_0/stable_rank_gate_proj": 134.8737335205078, "geo/layer_0/stable_rank_down_proj": 53.35034942626953, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05890847370028496, "geo/layer_0/attn_entropy_mean": 6.188721179962158, "geo/layer_0/attn_entropy_std": 0.3872700035572052, "geo/layer_7/stable_rank_q_proj": 42.37428283691406, "geo/layer_7/stable_rank_k_proj": 42.19230651855469, "geo/layer_7/stable_rank_o_proj": 94.68223571777344, "geo/layer_7/stable_rank_gate_proj": 88.07655334472656, "geo/layer_7/stable_rank_down_proj": 144.53536987304688, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4780539870262146, "geo/layer_7/attn_entropy_mean": 4.680978775024414, "geo/layer_7/attn_entropy_std": 0.8083806037902832, "geo/layer_14/stable_rank_q_proj": 53.05804443359375, "geo/layer_14/stable_rank_k_proj": 38.43012619018555, "geo/layer_14/stable_rank_o_proj": 46.462989807128906, "geo/layer_14/stable_rank_gate_proj": 74.58686065673828, "geo/layer_14/stable_rank_down_proj": 131.9984130859375, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4050014615058899, "geo/layer_14/attn_entropy_mean": 5.548102855682373, "geo/layer_14/attn_entropy_std": 0.3680068850517273, "geo/layer_21/stable_rank_q_proj": 42.29843521118164, "geo/layer_21/stable_rank_k_proj": 30.5911922454834, "geo/layer_21/stable_rank_o_proj": 74.29833221435547, "geo/layer_21/stable_rank_gate_proj": 70.60755157470703, "geo/layer_21/stable_rank_down_proj": 54.005104064941406, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1372540295124054, "geo/layer_21/attn_entropy_mean": 5.725613117218018, "geo/layer_21/attn_entropy_std": 0.29291242361068726, "geo/layer_27/stable_rank_q_proj": 42.58527755737305, "geo/layer_27/stable_rank_k_proj": 31.500253677368164, "geo/layer_27/stable_rank_o_proj": 115.68645477294922, "geo/layer_27/stable_rank_gate_proj": 83.32052612304688, "geo/layer_27/stable_rank_down_proj": 130.5601043701172, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07835472375154495, "geo/layer_27/attn_entropy_mean": 4.2718071937561035, "geo/layer_27/attn_entropy_std": 0.661238968372345, "attnres/final_alpha/block_0": 0.24235481023788452, "attnres/block_norm/0": 1.7359615564346313, "attnres/final_alpha/block_1": 0.0050735147669911385, "attnres/block_norm/1": 42431.58203125, "attnres/final_alpha/block_2": 0.010917277075350285, "attnres/block_norm/2": 27130.263671875, "attnres/final_alpha/block_3": 0.012558290734887123, "attnres/block_norm/3": 50858.3984375, "attnres/final_alpha/block_4": 0.01569499261677265, "attnres/block_norm/4": 13659.1279296875, "attnres/final_alpha/block_5": 0.5962473750114441, "attnres/block_norm/5": 6286.755859375, "attnres/final_alpha/block_6": 0.11715376377105713, "attnres/block_norm/6": 33214.890625, "geo/tier1_time_s": 1.3576173782348633, "geo/step": 46650.0, "geo/rankme_slope": 6.0793184461284516e-05} {"step": 46660, "timestamp": 1778244975.7486174, "train/loss": 2.1723769068717957, "train/z_loss": 0.0014041087008081377, "train/perplexity": 8.779126426654818, "train/grad_norm": 0.1220703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1786595.2828671813, "perf/iters_per_sec": 0.851915017541495, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.173826003074646, "data/tokens_consumed": 97855209472, "data/tokens_consumed_B": 97.855209472, "train/loss_slope": -2.3345515522697885e-05} {"step": 46670, "timestamp": 1778244986.1303477, "train/loss": 2.1113558053970336, "train/z_loss": 0.001393161399755627, "train/perplexity": 8.259431879096361, "train/grad_norm": 0.0986328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021494.7292904526, "perf/iters_per_sec": 0.9639238020374549, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0374264001846314, "data/tokens_consumed": 97876180992, "data/tokens_consumed_B": 97.876180992, "train/loss_slope": -2.5845503985899676e-05} {"step": 46680, "timestamp": 1778244996.509997, "train/loss": 2.1747505187988283, "train/z_loss": 0.0014038626221008598, "train/perplexity": 8.799989416375439, "train/grad_norm": 0.1640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021367.954847217, "perf/iters_per_sec": 0.9638633512722097, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037491464614868, "data/tokens_consumed": 97897152512, "data/tokens_consumed_B": 97.897152512, "train/loss_slope": -2.1164410228025353e-05} {"step": 46690, "timestamp": 1778245006.8893042, "train/loss": 2.181206202507019, "train/z_loss": 0.0013960907468572258, "train/perplexity": 8.85698313345004, "train/grad_norm": 0.10693359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021945.3741254737, "perf/iters_per_sec": 0.9641386862399453, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0371951818466187, "data/tokens_consumed": 97918124032, "data/tokens_consumed_B": 97.918124032, "train/loss_slope": -1.9739097566029473e-05} {"step": 46700, "timestamp": 1778245017.2597394, "grad/layer_0/attn": 0.0029571224004030228, "grad/layer_0/mlp": 0.003016302129253745, "grad/layer_0/attn_mlp_ratio": 0.9803800069247822, "grad/layer_4/attn": 0.0026517154183238745, "grad/layer_4/mlp": 0.0024557309225201607, "grad/layer_4/attn_mlp_ratio": 1.0798069470990301, "grad/layer_8/attn": 0.004036057274788618, "grad/layer_8/mlp": 0.0035917041823267937, "grad/layer_8/attn_mlp_ratio": 1.1237164748357182, "grad/layer_12/attn": 0.004630895797163248, "grad/layer_12/mlp": 0.006603589281439781, "grad/layer_12/attn_mlp_ratio": 0.7012694959772883, "grad/layer_16/attn": 0.003863565856590867, "grad/layer_16/mlp": 0.004651930648833513, "grad/layer_16/attn_mlp_ratio": 0.8305295296065336, "grad/layer_20/attn": 0.00382872112095356, "grad/layer_20/mlp": 0.006502087228000164, "grad/layer_20/attn_mlp_ratio": 0.5888449243777157, "grad/layer_24/attn": 0.014574656262993813, "grad/layer_24/mlp": 0.01452258974313736, "grad/layer_24/attn_mlp_ratio": 1.0035852021174485, "grad/layer_27/attn": 0.005157902371138334, "grad/layer_27/mlp": 0.013929951004683971, "grad/layer_27/attn_mlp_ratio": 0.37027426244188183} {"step": 46700, "timestamp": 1778245017.2740793, "train/loss": 2.145908570289612, "train/z_loss": 0.0013860792736522854, "train/perplexity": 8.549805808970369, "train/grad_norm": 0.1806640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020875.7339314106, "perf/iters_per_sec": 0.9636286420495084, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037744164466858, "data/tokens_consumed": 97939095552, "data/tokens_consumed_B": 97.939095552, "train/loss_slope": -2.140471219944944e-05} {"step": 46710, "timestamp": 1778245027.6584592, "train/loss": 2.1409585952758787, "train/z_loss": 0.001388976431917399, "train/perplexity": 8.507589055980352, "train/grad_norm": 0.09423828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020523.074310497, "perf/iters_per_sec": 0.9634604808380589, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0379252910614014, "data/tokens_consumed": 97960067072, "data/tokens_consumed_B": 97.960067072, "train/loss_slope": -1.9004402302279322e-05} {"step": 46720, "timestamp": 1778245038.0443108, "train/loss": 2.1860305786132814, "train/z_loss": 0.001398053765296936, "train/perplexity": 8.899815788595589, "train/grad_norm": 0.2197265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020572.4587419133, "perf/iters_per_sec": 0.963484029169995, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037899923324585, "data/tokens_consumed": 97981038592, "data/tokens_consumed_B": 97.981038592, "train/loss_slope": -1.316428156611511e-05} {"step": 46725, "timestamp": 1778245043.8187966, "eos/sharpness": 63.95990848541258, "eos/L0_probe": 1.9830620288848877, "eos/L_plus": 2.2477619647979736, "eos/L_minus": 2.3579611778259277, "eos/grad_norm": 0.14540107548236847, "eos/embed_grad_frac": 0.10067924112081528, "eos/time_s": 0.593698263168335} {"step": 46725, "timestamp": 1778245045.19361, "geo/rankme_last": 438.61077880859375, "geo/layer_0/stable_rank_q_proj": 19.451932907104492, "geo/layer_0/stable_rank_k_proj": 16.389848709106445, "geo/layer_0/stable_rank_o_proj": 48.06689453125, "geo/layer_0/stable_rank_gate_proj": 134.85812377929688, "geo/layer_0/stable_rank_down_proj": 53.399993896484375, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0638236477971077, "geo/layer_0/attn_entropy_mean": 6.1861748695373535, "geo/layer_0/attn_entropy_std": 0.3855977952480316, "geo/layer_7/stable_rank_q_proj": 42.44293975830078, "geo/layer_7/stable_rank_k_proj": 42.177696228027344, "geo/layer_7/stable_rank_o_proj": 94.51668548583984, "geo/layer_7/stable_rank_gate_proj": 88.13298034667969, "geo/layer_7/stable_rank_down_proj": 144.47998046875, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4790855348110199, "geo/layer_7/attn_entropy_mean": 4.628910064697266, "geo/layer_7/attn_entropy_std": 0.8181812167167664, "geo/layer_14/stable_rank_q_proj": 53.111907958984375, "geo/layer_14/stable_rank_k_proj": 38.37877655029297, "geo/layer_14/stable_rank_o_proj": 46.46052932739258, "geo/layer_14/stable_rank_gate_proj": 74.54561614990234, "geo/layer_14/stable_rank_down_proj": 132.02723693847656, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.389877587556839, "geo/layer_14/attn_entropy_mean": 5.519479274749756, "geo/layer_14/attn_entropy_std": 0.36934638023376465, "geo/layer_21/stable_rank_q_proj": 42.330142974853516, "geo/layer_21/stable_rank_k_proj": 30.691762924194336, "geo/layer_21/stable_rank_o_proj": 74.36528015136719, "geo/layer_21/stable_rank_gate_proj": 70.53730010986328, "geo/layer_21/stable_rank_down_proj": 54.0146598815918, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1428990364074707, "geo/layer_21/attn_entropy_mean": 5.712982654571533, "geo/layer_21/attn_entropy_std": 0.2935827970504761, "geo/layer_27/stable_rank_q_proj": 42.528282165527344, "geo/layer_27/stable_rank_k_proj": 31.421724319458008, "geo/layer_27/stable_rank_o_proj": 115.57647705078125, "geo/layer_27/stable_rank_gate_proj": 83.32086181640625, "geo/layer_27/stable_rank_down_proj": 130.702392578125, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0894855484366417, "geo/layer_27/attn_entropy_mean": 4.26294469833374, "geo/layer_27/attn_entropy_std": 0.6872166991233826, "attnres/final_alpha/block_0": 0.2405889630317688, "attnres/block_norm/0": 1.7363436222076416, "attnres/final_alpha/block_1": 0.004952353425323963, "attnres/block_norm/1": 42529.8671875, "attnres/final_alpha/block_2": 0.010947196744382381, "attnres/block_norm/2": 27197.498046875, "attnres/final_alpha/block_3": 0.0128396637737751, "attnres/block_norm/3": 50597.9921875, "attnres/final_alpha/block_4": 0.01550865825265646, "attnres/block_norm/4": 13672.408203125, "attnres/final_alpha/block_5": 0.5997910499572754, "attnres/block_norm/5": 6207.357421875, "attnres/final_alpha/block_6": 0.11537209153175354, "attnres/block_norm/6": 33446.70703125, "geo/tier1_time_s": 1.3566656112670898, "geo/step": 46725.0, "geo/rankme_slope": 6.3007781237495e-05} {"step": 46730, "timestamp": 1778245050.3845327, "train/loss": 2.105392646789551, "train/z_loss": 0.001409796508960426, "train/perplexity": 8.210326134977548, "train/grad_norm": 0.1435546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1700452.2993295637, "perf/iters_per_sec": 0.8108388420722789, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.233290696144104, "data/tokens_consumed": 98002010112, "data/tokens_consumed_B": 98.002010112, "train/loss_slope": -1.476773505378259e-05} {"step": 46740, "timestamp": 1778245060.7650795, "train/loss": 2.215139365196228, "train/z_loss": 0.0013885066262446343, "train/perplexity": 9.16268598216096, "train/grad_norm": 0.1474609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021123.1844108806, "perf/iters_per_sec": 0.9637466356329348, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0376171112060546, "data/tokens_consumed": 98022981632, "data/tokens_consumed_B": 98.022981632, "train/loss_slope": -9.66945924166617e-06} {"step": 46750, "timestamp": 1778245071.1375067, "grad/layer_0/attn": 0.002434715162962675, "grad/layer_0/mlp": 0.0027192188426852226, "grad/layer_0/attn_mlp_ratio": 0.8953729781532043, "grad/layer_4/attn": 0.0036385213024914265, "grad/layer_4/mlp": 0.0025071667041629553, "grad/layer_4/attn_mlp_ratio": 1.451248196350536, "grad/layer_8/attn": 0.007590714376419783, "grad/layer_8/mlp": 0.0035883495584130287, "grad/layer_8/attn_mlp_ratio": 2.1153775688004792, "grad/layer_12/attn": 0.00573834590613842, "grad/layer_12/mlp": 0.006846013478934765, "grad/layer_12/attn_mlp_ratio": 0.83820253056397, "grad/layer_16/attn": 0.003812864888459444, "grad/layer_16/mlp": 0.004921451676636934, "grad/layer_16/attn_mlp_ratio": 0.7747439295371821, "grad/layer_20/attn": 0.00587460957467556, "grad/layer_20/mlp": 0.00589452451094985, "grad/layer_20/attn_mlp_ratio": 0.9966214347061517, "grad/layer_24/attn": 0.00702556362375617, "grad/layer_24/mlp": 0.007812585681676865, "grad/layer_24/attn_mlp_ratio": 0.8992622698919319, "grad/layer_27/attn": 0.005478810518980026, "grad/layer_27/mlp": 0.0065259505063295364, "grad/layer_27/attn_mlp_ratio": 0.8395421371510403} {"step": 46750, "timestamp": 1778245071.1518478, "train/loss": 2.1884130954742433, "train/z_loss": 0.001395589124877006, "train/perplexity": 8.921045029241151, "train/grad_norm": 0.103515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020442.783348714, "perf/iters_per_sec": 0.963422195124013, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037966537475586, "data/tokens_consumed": 98043953152, "data/tokens_consumed_B": 98.043953152, "train/loss_slope": -7.190576223435737e-06} {"step": 46760, "timestamp": 1778245081.531847, "train/loss": 2.212859535217285, "train/z_loss": 0.0013712401851080358, "train/perplexity": 9.141820409987291, "train/grad_norm": 0.103515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021471.0827225663, "perf/iters_per_sec": 0.9639125264752227, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0374385356903075, "data/tokens_consumed": 98064924672, "data/tokens_consumed_B": 98.064924672, "train/loss_slope": -3.852427710365142e-06} {"step": 46770, "timestamp": 1778245091.9078412, "train/loss": 2.1846421003341674, "train/z_loss": 0.0013774492195807396, "train/perplexity": 8.887467162568395, "train/grad_norm": 0.1787109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022180.302091139, "perf/iters_per_sec": 0.9642507086234755, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0370746850967407, "data/tokens_consumed": 98085896192, "data/tokens_consumed_B": 98.085896192, "train/loss_slope": -2.654102261346084e-06} {"step": 46780, "timestamp": 1778245102.2753897, "train/loss": 2.178233599662781, "train/z_loss": 0.001390232192352414, "train/perplexity": 8.830693933230265, "train/grad_norm": 0.11083984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024236.3662202524, "perf/iters_per_sec": 0.9652311163998853, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0360213041305542, "data/tokens_consumed": 98106867712, "data/tokens_consumed_B": 98.106867712, "train/loss_slope": -1.3414120337928977e-06} {"step": 46790, "timestamp": 1778245112.6228635, "train/loss": 2.124578070640564, "train/z_loss": 0.0014125241083092987, "train/perplexity": 8.369365462038628, "train/grad_norm": 0.1279296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027722.3838146494, "perf/iters_per_sec": 0.9668933791230437, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034240198135376, "data/tokens_consumed": 98127839232, "data/tokens_consumed_B": 98.127839232, "train/loss_slope": -5.214383356784571e-07} {"step": 46800, "timestamp": 1778245122.9643247, "grad/layer_0/attn": 0.003148728981614113, "grad/layer_0/mlp": 0.0028592213056981564, "grad/layer_0/attn_mlp_ratio": 1.101253990103388, "grad/layer_4/attn": 0.0020042420364916325, "grad/layer_4/mlp": 0.002429627813398838, "grad/layer_4/attn_mlp_ratio": 0.8249172745500245, "grad/layer_8/attn": 0.00552299665287137, "grad/layer_8/mlp": 0.0037612011656165123, "grad/layer_8/attn_mlp_ratio": 1.468412951830185, "grad/layer_12/attn": 0.006395592354238033, "grad/layer_12/mlp": 0.006461648270487785, "grad/layer_12/attn_mlp_ratio": 0.9897772189908308, "grad/layer_16/attn": 0.003902349853888154, "grad/layer_16/mlp": 0.00432586669921875, "grad/layer_16/attn_mlp_ratio": 0.9020966282625524, "grad/layer_20/attn": 0.007153112441301346, "grad/layer_20/mlp": 0.005871799774467945, "grad/layer_20/attn_mlp_ratio": 1.2182146180432458, "grad/layer_24/attn": 0.013508177362382412, "grad/layer_24/mlp": 0.011357215233147144, "grad/layer_24/attn_mlp_ratio": 1.1893916744677258, "grad/layer_27/attn": 0.01239761896431446, "grad/layer_27/mlp": 0.010743741877377033, "grad/layer_27/attn_mlp_ratio": 1.1539386361306867} {"step": 46800, "timestamp": 1778245123.560322, "eos/sharpness": 69.13363933563231, "eos/L0_probe": 1.9849050045013428, "eos/L_plus": 2.2890045642852783, "eos/L_minus": 2.3721418380737305, "eos/grad_norm": 0.19782359898090363, "eos/embed_grad_frac": 0.06692799925804138, "eos/time_s": 0.5931978225708008} {"step": 46800, "timestamp": 1778245123.5802257, "train/loss": 2.173517632484436, "train/z_loss": 0.0013840624829754232, "train/perplexity": 8.789146715139589, "train/grad_norm": 0.1982421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1914728.403177806, "perf/iters_per_sec": 0.9130136505021124, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0952738761901855, "data/tokens_consumed": 98148810752, "data/tokens_consumed_B": 98.148810752, "train/loss_slope": 8.980591877280472e-07} {"step": 46800, "timestamp": 1778245124.943602, "geo/rankme_last": 438.7103576660156, "geo/layer_0/stable_rank_q_proj": 19.46099853515625, "geo/layer_0/stable_rank_k_proj": 16.35314178466797, "geo/layer_0/stable_rank_o_proj": 48.04561996459961, "geo/layer_0/stable_rank_gate_proj": 135.03277587890625, "geo/layer_0/stable_rank_down_proj": 53.41100311279297, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.061193715780973434, "geo/layer_0/attn_entropy_mean": 6.1840667724609375, "geo/layer_0/attn_entropy_std": 0.3799455165863037, "geo/layer_7/stable_rank_q_proj": 42.59516525268555, "geo/layer_7/stable_rank_k_proj": 42.15311813354492, "geo/layer_7/stable_rank_o_proj": 94.58399200439453, "geo/layer_7/stable_rank_gate_proj": 88.13594818115234, "geo/layer_7/stable_rank_down_proj": 144.73678588867188, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5027356147766113, "geo/layer_7/attn_entropy_mean": 4.614282608032227, "geo/layer_7/attn_entropy_std": 0.7974891066551208, "geo/layer_14/stable_rank_q_proj": 53.08406448364258, "geo/layer_14/stable_rank_k_proj": 38.347808837890625, "geo/layer_14/stable_rank_o_proj": 46.50284194946289, "geo/layer_14/stable_rank_gate_proj": 74.60700225830078, "geo/layer_14/stable_rank_down_proj": 131.99624633789062, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3978949189186096, "geo/layer_14/attn_entropy_mean": 5.514440536499023, "geo/layer_14/attn_entropy_std": 0.36946627497673035, "geo/layer_21/stable_rank_q_proj": 42.32747268676758, "geo/layer_21/stable_rank_k_proj": 30.57958221435547, "geo/layer_21/stable_rank_o_proj": 74.4277114868164, "geo/layer_21/stable_rank_gate_proj": 70.42650604248047, "geo/layer_21/stable_rank_down_proj": 54.006717681884766, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14204835891723633, "geo/layer_21/attn_entropy_mean": 5.705792427062988, "geo/layer_21/attn_entropy_std": 0.28836169838905334, "geo/layer_27/stable_rank_q_proj": 42.55308151245117, "geo/layer_27/stable_rank_k_proj": 31.478105545043945, "geo/layer_27/stable_rank_o_proj": 115.5697250366211, "geo/layer_27/stable_rank_gate_proj": 83.34825134277344, "geo/layer_27/stable_rank_down_proj": 130.66549682617188, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0888599157333374, "geo/layer_27/attn_entropy_mean": 4.250960350036621, "geo/layer_27/attn_entropy_std": 0.6717557907104492, "attnres/final_alpha/block_0": 0.23988556861877441, "attnres/block_norm/0": 1.7365071773529053, "attnres/final_alpha/block_1": 0.004907294176518917, "attnres/block_norm/1": 42526.8125, "attnres/final_alpha/block_2": 0.010864745825529099, "attnres/block_norm/2": 27276.09765625, "attnres/final_alpha/block_3": 0.01290087029337883, "attnres/block_norm/3": 50381.171875, "attnres/final_alpha/block_4": 0.015326449647545815, "attnres/block_norm/4": 13646.6787109375, "attnres/final_alpha/block_5": 0.6003850698471069, "attnres/block_norm/5": 6285.625, "attnres/final_alpha/block_6": 0.11572998762130737, "attnres/block_norm/6": 33493.859375, "geo/tier1_time_s": 1.3598730564117432, "geo/step": 46800.0, "geo/rankme_slope": 3.079513055222089e-05} {"step": 46810, "timestamp": 1778245135.2974896, "train/loss": 2.1576739072799684, "train/z_loss": 0.0013997097848914564, "train/perplexity": 8.650991228597503, "train/grad_norm": 0.115234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1790271.1438755037, "perf/iters_per_sec": 0.8536678046586531, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1714158535003663, "data/tokens_consumed": 98169782272, "data/tokens_consumed_B": 98.169782272, "train/loss_slope": 8.885363076064974e-07} {"step": 46820, "timestamp": 1778245145.6462495, "train/loss": 2.1634809017181396, "train/z_loss": 0.0013965406804345548, "train/perplexity": 8.701373630129934, "train/grad_norm": 0.11865234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027439.9483197285, "perf/iters_per_sec": 0.96675870338427, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034384274482727, "data/tokens_consumed": 98190753792, "data/tokens_consumed_B": 98.190753792, "train/loss_slope": 3.032566476004185e-06} {"step": 46830, "timestamp": 1778245155.9866843, "train/loss": 2.1590890169143675, "train/z_loss": 0.001391617103945464, "train/perplexity": 8.663241995677073, "train/grad_norm": 0.1435546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029046.995864142, "perf/iters_per_sec": 0.9675250033684454, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0335650205612184, "data/tokens_consumed": 98211725312, "data/tokens_consumed_B": 98.211725312, "train/loss_slope": 4.247044894633095e-06} {"step": 46840, "timestamp": 1778245166.3289351, "train/loss": 2.178635263442993, "train/z_loss": 0.0013873597141355275, "train/perplexity": 8.834241615577438, "train/grad_norm": 0.099609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029332.3138656379, "perf/iters_per_sec": 0.9676610535934629, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033419704437256, "data/tokens_consumed": 98232696832, "data/tokens_consumed_B": 98.232696832, "train/loss_slope": 5.99568532531128e-06} {"step": 46850, "timestamp": 1778245176.6559992, "grad/layer_0/attn": 0.0027019500266760588, "grad/layer_0/mlp": 0.0028595125768333673, "grad/layer_0/attn_mlp_ratio": 0.9448987754333742, "grad/layer_4/attn": 0.0019454581197351217, "grad/layer_4/mlp": 0.0025814382825046778, "grad/layer_4/attn_mlp_ratio": 0.7536333746798608, "grad/layer_8/attn": 0.004310278221964836, "grad/layer_8/mlp": 0.003713081358000636, "grad/layer_8/attn_mlp_ratio": 1.1608358908144636, "grad/layer_12/attn": 0.004596012644469738, "grad/layer_12/mlp": 0.007134275045245886, "grad/layer_12/attn_mlp_ratio": 0.6442157823885463, "grad/layer_16/attn": 0.0033571110107004642, "grad/layer_16/mlp": 0.004569898825138807, "grad/layer_16/attn_mlp_ratio": 0.7346138428211505, "grad/layer_20/attn": 0.003675567451864481, "grad/layer_20/mlp": 0.006240285001695156, "grad/layer_20/attn_mlp_ratio": 0.5890063341602811, "grad/layer_24/attn": 0.010698260739445686, "grad/layer_24/mlp": 0.008359301835298538, "grad/layer_24/attn_mlp_ratio": 1.2798031249799109, "grad/layer_27/attn": 0.0075243934988975525, "grad/layer_27/mlp": 0.007024856749922037, "grad/layer_27/attn_mlp_ratio": 1.0711098688055203} {"step": 46850, "timestamp": 1778245176.670244, "train/loss": 2.101241040229797, "train/z_loss": 0.0014044995070435107, "train/perplexity": 8.176310749244085, "train/grad_norm": 0.11376953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028993.8733171073, "perf/iters_per_sec": 0.9674996725640809, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0335920810699464, "data/tokens_consumed": 98253668352, "data/tokens_consumed_B": 98.253668352, "train/loss_slope": 2.2838308329772656e-06} {"step": 46860, "timestamp": 1778245187.0276084, "train/loss": 2.1641064643859864, "train/z_loss": 0.0013847123947925866, "train/perplexity": 8.706818587535407, "train/grad_norm": 0.08056640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025784.6359927363, "perf/iters_per_sec": 0.9659693889583284, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352294921875, "data/tokens_consumed": 98274639872, "data/tokens_consumed_B": 98.274639872, "train/loss_slope": 5.1855595234168035e-06} {"step": 46870, "timestamp": 1778245197.416545, "train/loss": 2.1906876087188722, "train/z_loss": 0.0013984041404910385, "train/perplexity": 8.941359157935699, "train/grad_norm": 0.166015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019908.7556074494, "perf/iters_per_sec": 0.9631675508534667, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0382409572601319, "data/tokens_consumed": 98295611392, "data/tokens_consumed_B": 98.295611392, "train/loss_slope": 9.82640440529116e-06} {"step": 46875, "timestamp": 1778245203.1690435, "eos/sharpness": 79.22997474670409, "eos/L0_probe": 1.9888709783554077, "eos/L_plus": 2.4695701599121094, "eos/L_minus": 2.300471544265747, "eos/grad_norm": 0.27479153871536255, "eos/embed_grad_frac": 0.03084370866417885, "eos/time_s": 0.5939042568206787} {"step": 46875, "timestamp": 1778245204.544764, "geo/rankme_last": 438.9020080566406, "geo/layer_0/stable_rank_q_proj": 19.468053817749023, "geo/layer_0/stable_rank_k_proj": 16.376169204711914, "geo/layer_0/stable_rank_o_proj": 48.095916748046875, "geo/layer_0/stable_rank_gate_proj": 134.65924072265625, "geo/layer_0/stable_rank_down_proj": 53.43639373779297, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06187346205115318, "geo/layer_0/attn_entropy_mean": 6.181110382080078, "geo/layer_0/attn_entropy_std": 0.3844391703605652, "geo/layer_7/stable_rank_q_proj": 42.51560974121094, "geo/layer_7/stable_rank_k_proj": 42.19025802612305, "geo/layer_7/stable_rank_o_proj": 94.64720916748047, "geo/layer_7/stable_rank_gate_proj": 87.84810638427734, "geo/layer_7/stable_rank_down_proj": 144.58041381835938, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.47953927516937256, "geo/layer_7/attn_entropy_mean": 4.673633575439453, "geo/layer_7/attn_entropy_std": 0.7986114025115967, "geo/layer_14/stable_rank_q_proj": 53.00027847290039, "geo/layer_14/stable_rank_k_proj": 38.35134506225586, "geo/layer_14/stable_rank_o_proj": 46.46902847290039, "geo/layer_14/stable_rank_gate_proj": 74.58818054199219, "geo/layer_14/stable_rank_down_proj": 131.72158813476562, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39949896931648254, "geo/layer_14/attn_entropy_mean": 5.520384311676025, "geo/layer_14/attn_entropy_std": 0.34680405259132385, "geo/layer_21/stable_rank_q_proj": 42.41603088378906, "geo/layer_21/stable_rank_k_proj": 30.672075271606445, "geo/layer_21/stable_rank_o_proj": 74.29944610595703, "geo/layer_21/stable_rank_gate_proj": 70.4464111328125, "geo/layer_21/stable_rank_down_proj": 54.007015228271484, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14624401926994324, "geo/layer_21/attn_entropy_mean": 5.7290239334106445, "geo/layer_21/attn_entropy_std": 0.29162707924842834, "geo/layer_27/stable_rank_q_proj": 42.599029541015625, "geo/layer_27/stable_rank_k_proj": 31.4619197845459, "geo/layer_27/stable_rank_o_proj": 115.64549255371094, "geo/layer_27/stable_rank_gate_proj": 83.33805847167969, "geo/layer_27/stable_rank_down_proj": 130.66262817382812, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08749809116125107, "geo/layer_27/attn_entropy_mean": 4.257734298706055, "geo/layer_27/attn_entropy_std": 0.6990014910697937, "attnres/final_alpha/block_0": 0.24010300636291504, "attnres/block_norm/0": 1.7366743087768555, "attnres/final_alpha/block_1": 0.004935846664011478, "attnres/block_norm/1": 42566.9296875, "attnres/final_alpha/block_2": 0.010810328647494316, "attnres/block_norm/2": 27249.16796875, "attnres/final_alpha/block_3": 0.012472797185182571, "attnres/block_norm/3": 50766.39453125, "attnres/final_alpha/block_4": 0.015323074534535408, "attnres/block_norm/4": 13651.810546875, "attnres/final_alpha/block_5": 0.6037106513977051, "attnres/block_norm/5": 6134.91748046875, "attnres/final_alpha/block_6": 0.11264432966709137, "attnres/block_norm/6": 33461.0859375, "geo/tier1_time_s": 1.3573675155639648, "geo/step": 46875.0, "geo/rankme_slope": 1.6741266819227687e-05} {"step": 46880, "timestamp": 1778245209.7183008, "train/loss": 2.164223623275757, "train/z_loss": 0.0014133150689303875, "train/perplexity": 8.707838728492682, "train/grad_norm": 0.0927734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1705383.5014670002, "perf/iters_per_sec": 0.8131902224860192, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2297245740890503, "data/tokens_consumed": 98316582912, "data/tokens_consumed_B": 98.316582912, "train/loss_slope": 1.1879420766879115e-05} {"step": 46890, "timestamp": 1778245220.0794601, "train/loss": 2.1949660062789915, "train/z_loss": 0.0013856978970579804, "train/perplexity": 8.979695798357254, "train/grad_norm": 0.201171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025300.0547713996, "perf/iters_per_sec": 0.9657383226258276, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354771852493285, "data/tokens_consumed": 98337554432, "data/tokens_consumed_B": 98.337554432, "train/loss_slope": 1.2153575279460184e-05} {"step": 46900, "timestamp": 1778245230.4479694, "grad/layer_0/attn": 0.0029076423961669207, "grad/layer_0/mlp": 0.00276607321575284, "grad/layer_0/attn_mlp_ratio": 1.0511805235269225, "grad/layer_4/attn": 0.0022083162330091, "grad/layer_4/mlp": 0.002634983276948333, "grad/layer_4/attn_mlp_ratio": 0.8380759637150459, "grad/layer_8/attn": 0.007524946238845587, "grad/layer_8/mlp": 0.0036255207378417253, "grad/layer_8/attn_mlp_ratio": 2.0755490246541215, "grad/layer_12/attn": 0.005050417967140675, "grad/layer_12/mlp": 0.006622309796512127, "grad/layer_12/attn_mlp_ratio": 0.7626369116009892, "grad/layer_16/attn": 0.00543137826025486, "grad/layer_16/mlp": 0.004766465164721012, "grad/layer_16/attn_mlp_ratio": 1.1394981309221324, "grad/layer_20/attn": 0.0031073030550032854, "grad/layer_20/mlp": 0.005660789553076029, "grad/layer_20/attn_mlp_ratio": 0.5489168906523141, "grad/layer_24/attn": 0.008124662563204765, "grad/layer_24/mlp": 0.0090325977653265, "grad/layer_24/attn_mlp_ratio": 0.8994823731047485, "grad/layer_27/attn": 0.005684103816747665, "grad/layer_27/mlp": 0.008347880095243454, "grad/layer_27/attn_mlp_ratio": 0.680903856285146} {"step": 46900, "timestamp": 1778245230.4623308, "train/loss": 2.139824366569519, "train/z_loss": 0.0013962815632112325, "train/perplexity": 8.497944974582039, "train/grad_norm": 0.10986328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021085.6148427362, "perf/iters_per_sec": 0.9637287210668259, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037636399269104, "data/tokens_consumed": 98358525952, "data/tokens_consumed_B": 98.358525952, "train/loss_slope": 7.97097580661084e-06} {"step": 46910, "timestamp": 1778245240.83654, "train/loss": 2.1026784896850588, "train/z_loss": 0.001402133668307215, "train/perplexity": 8.188072233921316, "train/grad_norm": 0.119140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022387.5711683328, "perf/iters_per_sec": 0.964349542221228, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0369683980941773, "data/tokens_consumed": 98379497472, "data/tokens_consumed_B": 98.379497472, "train/loss_slope": 4.695414523981121e-06} {"step": 46920, "timestamp": 1778245251.213477, "train/loss": 2.193759799003601, "train/z_loss": 0.001371586532332003, "train/perplexity": 8.968870953770084, "train/grad_norm": 0.236328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022893.1362365712, "perf/iters_per_sec": 0.9645906144316536, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036709237098694, "data/tokens_consumed": 98400468992, "data/tokens_consumed_B": 98.400468992, "train/loss_slope": 7.745519622896616e-06} {"step": 46930, "timestamp": 1778245261.5906193, "train/loss": 2.208559989929199, "train/z_loss": 0.0013927346211858095, "train/perplexity": 9.10259911640369, "train/grad_norm": 0.255859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022307.4109283132, "perf/iters_per_sec": 0.9643113188401762, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0370095014572143, "data/tokens_consumed": 98421440512, "data/tokens_consumed_B": 98.421440512, "train/loss_slope": 8.502259606396602e-06} {"step": 46940, "timestamp": 1778245271.9702983, "train/loss": 2.136204218864441, "train/z_loss": 0.001409206702373922, "train/perplexity": 8.467236776228429, "train/grad_norm": 0.08740234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021393.9680936586, "perf/iters_per_sec": 0.9638757553547185, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0374781131744384, "data/tokens_consumed": 98442412032, "data/tokens_consumed_B": 98.442412032, "train/loss_slope": 7.649272453166196e-06} {"step": 46950, "timestamp": 1778245282.3377628, "grad/layer_0/attn": 0.003089823527261615, "grad/layer_0/mlp": 0.002888878108933568, "grad/layer_0/attn_mlp_ratio": 1.0695582519562934, "grad/layer_4/attn": 0.002018446335569024, "grad/layer_4/mlp": 0.002433784306049347, "grad/layer_4/attn_mlp_ratio": 0.829344756483783, "grad/layer_8/attn": 0.0035887311678379774, "grad/layer_8/mlp": 0.003527615685015917, "grad/layer_8/attn_mlp_ratio": 1.0173248410673457, "grad/layer_12/attn": 0.004701798781752586, "grad/layer_12/mlp": 0.0067123062908649445, "grad/layer_12/attn_mlp_ratio": 0.7004743985094988, "grad/layer_16/attn": 0.005960086826235056, "grad/layer_16/mlp": 0.004805508069694042, "grad/layer_16/attn_mlp_ratio": 1.2402615115342783, "grad/layer_20/attn": 0.003374222433194518, "grad/layer_20/mlp": 0.005803236737847328, "grad/layer_20/attn_mlp_ratio": 0.5814380021833063, "grad/layer_24/attn": 0.013525399379432201, "grad/layer_24/mlp": 0.009972418658435345, "grad/layer_24/attn_mlp_ratio": 1.3562807285836755, "grad/layer_27/attn": 0.00417884299531579, "grad/layer_27/mlp": 0.009741373360157013, "grad/layer_27/attn_mlp_ratio": 0.428978830593816} {"step": 46950, "timestamp": 1778245282.9244962, "eos/sharpness": 67.71800518035887, "eos/L0_probe": 1.9869070053100586, "eos/L_plus": 2.388298273086548, "eos/L_minus": 2.262695789337158, "eos/grad_norm": 0.16924221813678741, "eos/embed_grad_frac": 0.07655680924654007, "eos/time_s": 0.5840482711791992} {"step": 46950, "timestamp": 1778245282.9440513, "train/loss": 2.1672296047210695, "train/z_loss": 0.001387927064206451, "train/perplexity": 8.734053711275507, "train/grad_norm": 0.1689453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1912296.6094178741, "perf/iters_per_sec": 0.9118540808762904, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.096666693687439, "data/tokens_consumed": 98463383552, "data/tokens_consumed_B": 98.463383552, "train/loss_slope": 8.172002491062573e-06} {"step": 46950, "timestamp": 1778245284.3027065, "geo/rankme_last": 438.5965576171875, "geo/layer_0/stable_rank_q_proj": 19.49469757080078, "geo/layer_0/stable_rank_k_proj": 16.41029930114746, "geo/layer_0/stable_rank_o_proj": 48.09984588623047, "geo/layer_0/stable_rank_gate_proj": 134.58749389648438, "geo/layer_0/stable_rank_down_proj": 53.42903518676758, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06068240478634834, "geo/layer_0/attn_entropy_mean": 6.182835578918457, "geo/layer_0/attn_entropy_std": 0.38588592410087585, "geo/layer_7/stable_rank_q_proj": 42.526058197021484, "geo/layer_7/stable_rank_k_proj": 42.376373291015625, "geo/layer_7/stable_rank_o_proj": 94.78607940673828, "geo/layer_7/stable_rank_gate_proj": 87.73887634277344, "geo/layer_7/stable_rank_down_proj": 144.72406005859375, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.482525497674942, "geo/layer_7/attn_entropy_mean": 4.630392074584961, "geo/layer_7/attn_entropy_std": 0.7815183997154236, "geo/layer_14/stable_rank_q_proj": 53.02667999267578, "geo/layer_14/stable_rank_k_proj": 38.443355560302734, "geo/layer_14/stable_rank_o_proj": 46.40236282348633, "geo/layer_14/stable_rank_gate_proj": 74.47130584716797, "geo/layer_14/stable_rank_down_proj": 131.57159423828125, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3993740677833557, "geo/layer_14/attn_entropy_mean": 5.538658142089844, "geo/layer_14/attn_entropy_std": 0.3616560399532318, "geo/layer_21/stable_rank_q_proj": 42.42499542236328, "geo/layer_21/stable_rank_k_proj": 30.677093505859375, "geo/layer_21/stable_rank_o_proj": 74.20310974121094, "geo/layer_21/stable_rank_gate_proj": 70.41845703125, "geo/layer_21/stable_rank_down_proj": 53.93173599243164, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1436678022146225, "geo/layer_21/attn_entropy_mean": 5.731329917907715, "geo/layer_21/attn_entropy_std": 0.2848985493183136, "geo/layer_27/stable_rank_q_proj": 42.62214279174805, "geo/layer_27/stable_rank_k_proj": 31.4080867767334, "geo/layer_27/stable_rank_o_proj": 115.82135772705078, "geo/layer_27/stable_rank_gate_proj": 83.33970642089844, "geo/layer_27/stable_rank_down_proj": 130.55677795410156, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09111587703227997, "geo/layer_27/attn_entropy_mean": 4.247699737548828, "geo/layer_27/attn_entropy_std": 0.6744549870491028, "attnres/final_alpha/block_0": 0.2396947741508484, "attnres/block_norm/0": 1.7365241050720215, "attnres/final_alpha/block_1": 0.004895248916000128, "attnres/block_norm/1": 42509.0546875, "attnres/final_alpha/block_2": 0.0107449721544981, "attnres/block_norm/2": 27322.658203125, "attnres/final_alpha/block_3": 0.01266541052609682, "attnres/block_norm/3": 50415.4609375, "attnres/final_alpha/block_4": 0.015188157558441162, "attnres/block_norm/4": 13700.87109375, "attnres/final_alpha/block_5": 0.6039759516716003, "attnres/block_norm/5": 6203.62109375, "attnres/final_alpha/block_6": 0.11283545196056366, "attnres/block_norm/6": 33537.92578125, "geo/tier1_time_s": 1.3547818660736084, "geo/step": 46950.0, "geo/rankme_slope": -9.392389768407363e-06} {"step": 46960, "timestamp": 1778245294.680416, "train/loss": 2.1843825936317445, "train/z_loss": 0.0013835928519256413, "train/perplexity": 8.88516110450385, "train/grad_norm": 0.08447265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1787446.0235432356, "perf/iters_per_sec": 0.8523206823078325, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1732673168182373, "data/tokens_consumed": 98484355072, "data/tokens_consumed_B": 98.484355072, "train/loss_slope": 9.315198654531931e-06} {"step": 46970, "timestamp": 1778245305.055653, "train/loss": 2.2003951549530028, "train/z_loss": 0.0013888892019167543, "train/perplexity": 9.028580482928522, "train/grad_norm": 0.1005859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022113.7785614803, "perf/iters_per_sec": 0.9642189877326395, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03710880279541, "data/tokens_consumed": 98505326592, "data/tokens_consumed_B": 98.505326592, "train/loss_slope": 1.2876134343666615e-05} {"step": 46980, "timestamp": 1778245315.4345114, "train/loss": 2.1807395935058596, "train/z_loss": 0.0014029778656549751, "train/perplexity": 8.852851349435625, "train/grad_norm": 0.265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022377.853028118, "perf/iters_per_sec": 0.9643449082508649, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0369733810424804, "data/tokens_consumed": 98526298112, "data/tokens_consumed_B": 98.526298112, "train/loss_slope": 1.641730373770563e-05} {"step": 46990, "timestamp": 1778245325.8045225, "train/loss": 2.1378211975097656, "train/z_loss": 0.0013982309959828854, "train/perplexity": 8.48093919255224, "train/grad_norm": 0.10107421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023204.3215812917, "perf/iters_per_sec": 0.9647389991671046, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0365497827529908, "data/tokens_consumed": 98547269632, "data/tokens_consumed_B": 98.547269632, "train/loss_slope": 1.528600979738326e-05} {"step": 47000, "timestamp": 1778245336.1634493, "grad/layer_0/attn": 0.0026400224305689335, "grad/layer_0/mlp": 0.002917094388976693, "grad/layer_0/attn_mlp_ratio": 0.9050177978619556, "grad/layer_4/attn": 0.0018579657189548016, "grad/layer_4/mlp": 0.002499445341527462, "grad/layer_4/attn_mlp_ratio": 0.7433511802599543, "grad/layer_8/attn": 0.008716510608792305, "grad/layer_8/mlp": 0.00353940948843956, "grad/layer_8/attn_mlp_ratio": 2.4627018690524607, "grad/layer_12/attn": 0.004215108696371317, "grad/layer_12/mlp": 0.0065358905121684074, "grad/layer_12/attn_mlp_ratio": 0.6449172647601692, "grad/layer_16/attn": 0.0034893518313765526, "grad/layer_16/mlp": 0.004343351814895868, "grad/layer_16/attn_mlp_ratio": 0.8033776446733559, "grad/layer_20/attn": 0.0029975168872624636, "grad/layer_20/mlp": 0.005374690517783165, "grad/layer_20/attn_mlp_ratio": 0.5577096619002815, "grad/layer_24/attn": 0.005580130033195019, "grad/layer_24/mlp": 0.008000737056136131, "grad/layer_24/attn_mlp_ratio": 0.6974519877728219, "grad/layer_27/attn": 0.004496041219681501, "grad/layer_27/mlp": 0.006561707239598036, "grad/layer_27/attn_mlp_ratio": 0.6851938050557623} {"step": 47000, "timestamp": 1778245336.1775756, "train/loss": 2.1560623407363892, "train/z_loss": 0.0013889775960706174, "train/perplexity": 8.637060808479145, "train/grad_norm": 0.1162109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022641.6709051016, "perf/iters_per_sec": 0.964470706417609, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036838126182556, "data/tokens_consumed": 98568241152, "data/tokens_consumed_B": 98.568241152, "train/loss_slope": 1.637567830021378e-05} {"step": 47000, "timestamp": 1778245343.4354343, "geo/ww_alpha_mean": 7.444285296566501, "geo/ww_alpha_std": 4.747996182723822, "geo/ww_alpha_min": 1.3493060304343896, "geo/ww_alpha_max": 41.63662061900274, "geo/ww_alpha_healthy_frac": 0.18274111675126903, "geo/ww_alpha_by_type/q_proj": 4.0374767454483385, "geo/ww_alpha_by_type/k_proj": 4.429990915221554, "geo/ww_alpha_by_type/v_proj": 8.323347435626456, "geo/ww_alpha_by_type/o_proj": 6.728728818830545, "geo/ww_alpha_by_type/gate_proj": 8.090564960721442, "geo/ww_alpha_by_type/up_proj": 12.429329441192092, "geo/ww_alpha_by_type/down_proj": 8.166504403325451, "geo/twonn_id/layer_0": 0.7366751432418823, "geo/twonn_id/layer_7": 3.5639543533325195, "geo/twonn_id/layer_14": 4.472590923309326, "geo/twonn_id/layer_21": 6.729135513305664, "geo/twonn_id/layer_27": 5.453647136688232, "geo/tier2_time_s": 7.249437093734741} {"step": 47000, "timestamp": 1778245344.0908265, "eoc/jacobian_sigma/layer_0/attn": 974.789306640625, "eoc/jacobian_sigma/layer_0/mlp": 7779.5048828125, "eoc/jacobian_sigma/layer_0": 7779.5048828125, "eoc/jacobian_sigma/layer_7/attn": 1.1592267751693726, "eoc/jacobian_sigma/layer_7/mlp": 1.8619647026062012, "eoc/jacobian_sigma/layer_7": 1.8619647026062012, "eoc/jacobian_sigma/layer_14/attn": 1.6034570932388306, "eoc/jacobian_sigma/layer_14/mlp": 5.51155424118042, "eoc/jacobian_sigma/layer_14": 5.51155424118042, "eoc/jacobian_sigma/layer_21/attn": 1.083066463470459, "eoc/jacobian_sigma/layer_21/mlp": 4.005807399749756, "eoc/jacobian_sigma/layer_21": 4.005807399749756, "eoc/jacobian_sigma/layer_27/attn": 3.5392062664031982, "eoc/jacobian_sigma/layer_27/mlp": 27.487689971923828, "eoc/jacobian_sigma/layer_27": 27.487689971923828, "eoc/layer0_sigma": 7779.5048828125, "eoc/sigma_max": 27.487689971923828, "eoc/sigma_min": 1.8619647026062012, "eoc/sigma_mean": 9.716754078865051, "eoc/time_s": 0.6490447521209717} {"step": 47010, "timestamp": 1778245354.4804173, "train/loss": 2.168302667140961, "train/z_loss": 0.0013883454841561616, "train/perplexity": 8.743430926355149, "train/grad_norm": 0.1396484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1146259.0970104546, "perf/iters_per_sec": 0.5465789303829454, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.8295619249343873, "data/tokens_consumed": 98589212672, "data/tokens_consumed_B": 98.589212672, "train/loss_slope": 1.551558586558963e-05} {"step": 47020, "timestamp": 1778245364.854189, "train/loss": 2.1559633016586304, "train/z_loss": 0.0013944527017883956, "train/perplexity": 8.636205444300064, "train/grad_norm": 0.08642578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022531.6337888802, "perf/iters_per_sec": 0.9644182366318131, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0368945360183717, "data/tokens_consumed": 98610184192, "data/tokens_consumed_B": 98.610184192, "train/loss_slope": 1.596813388604719e-05} {"step": 47025, "timestamp": 1778245370.6280558, "eos/sharpness": 5.366253852844237, "eos/L0_probe": 1.986401081085205, "eos/L_plus": 2.021754503250122, "eos/L_minus": 2.0047101974487305, "eos/grad_norm": 0.09180830419063568, "eos/embed_grad_frac": 0.3079599440097809, "eos/time_s": 0.5978989601135254} {"step": 47025, "timestamp": 1778245372.0023293, "geo/rankme_last": 438.3034362792969, "geo/layer_0/stable_rank_q_proj": 19.477205276489258, "geo/layer_0/stable_rank_k_proj": 16.413379669189453, "geo/layer_0/stable_rank_o_proj": 48.141502380371094, "geo/layer_0/stable_rank_gate_proj": 134.88543701171875, "geo/layer_0/stable_rank_down_proj": 53.34977722167969, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06094660237431526, "geo/layer_0/attn_entropy_mean": 6.1798906326293945, "geo/layer_0/attn_entropy_std": 0.3913283050060272, "geo/layer_7/stable_rank_q_proj": 42.61946487426758, "geo/layer_7/stable_rank_k_proj": 42.28828430175781, "geo/layer_7/stable_rank_o_proj": 94.51091003417969, "geo/layer_7/stable_rank_gate_proj": 87.74275207519531, "geo/layer_7/stable_rank_down_proj": 144.6988525390625, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4837331473827362, "geo/layer_7/attn_entropy_mean": 4.641963005065918, "geo/layer_7/attn_entropy_std": 0.7915435433387756, "geo/layer_14/stable_rank_q_proj": 53.00365447998047, "geo/layer_14/stable_rank_k_proj": 38.438568115234375, "geo/layer_14/stable_rank_o_proj": 46.43268966674805, "geo/layer_14/stable_rank_gate_proj": 74.43767547607422, "geo/layer_14/stable_rank_down_proj": 131.1090545654297, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3924952745437622, "geo/layer_14/attn_entropy_mean": 5.5195159912109375, "geo/layer_14/attn_entropy_std": 0.3765106201171875, "geo/layer_21/stable_rank_q_proj": 42.40630340576172, "geo/layer_21/stable_rank_k_proj": 30.598453521728516, "geo/layer_21/stable_rank_o_proj": 74.19100952148438, "geo/layer_21/stable_rank_gate_proj": 70.302978515625, "geo/layer_21/stable_rank_down_proj": 53.91706848144531, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1470537632703781, "geo/layer_21/attn_entropy_mean": 5.71827507019043, "geo/layer_21/attn_entropy_std": 0.28710466623306274, "geo/layer_27/stable_rank_q_proj": 42.63410186767578, "geo/layer_27/stable_rank_k_proj": 31.473926544189453, "geo/layer_27/stable_rank_o_proj": 115.97207641601562, "geo/layer_27/stable_rank_gate_proj": 83.37671661376953, "geo/layer_27/stable_rank_down_proj": 130.3734588623047, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09179506450891495, "geo/layer_27/attn_entropy_mean": 4.272736072540283, "geo/layer_27/attn_entropy_std": 0.6767114996910095, "attnres/final_alpha/block_0": 0.2395917773246765, "attnres/block_norm/0": 1.7367303371429443, "attnres/final_alpha/block_1": 0.004957529250532389, "attnres/block_norm/1": 42476.3984375, "attnres/final_alpha/block_2": 0.010749749839305878, "attnres/block_norm/2": 27211.177734375, "attnres/final_alpha/block_3": 0.012613937258720398, "attnres/block_norm/3": 50728.92578125, "attnres/final_alpha/block_4": 0.015305617824196815, "attnres/block_norm/4": 13710.23046875, "attnres/final_alpha/block_5": 0.6020749807357788, "attnres/block_norm/5": 6211.3291015625, "attnres/final_alpha/block_6": 0.1147063821554184, "attnres/block_norm/6": 33514.30859375, "geo/tier1_time_s": 1.3562147617340088, "geo/step": 47025.0, "geo/rankme_slope": -5.883876988295318e-05} {"step": 47030, "timestamp": 1778245377.188607, "train/loss": 2.1687679290771484, "train/z_loss": 0.001397446054033935, "train/perplexity": 8.747499858443078, "train/grad_norm": 0.171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1701141.20152483, "perf/iters_per_sec": 0.8111673362373495, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.232791256904602, "data/tokens_consumed": 98631155712, "data/tokens_consumed_B": 98.631155712, "train/loss_slope": 1.6515088181505717e-05} {"step": 47040, "timestamp": 1778245387.5644953, "train/loss": 2.1576730012893677, "train/z_loss": 0.001385911030229181, "train/perplexity": 8.650983390884313, "train/grad_norm": 0.09521484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022259.5688945006, "perf/iters_per_sec": 0.9642885059807303, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037034034729004, "data/tokens_consumed": 98652127232, "data/tokens_consumed_B": 98.652127232, "train/loss_slope": 1.8609441660776074e-05} {"step": 47050, "timestamp": 1778245397.9373143, "grad/layer_0/attn": 0.0026359690818935633, "grad/layer_0/mlp": 0.0025771723594516516, "grad/layer_0/attn_mlp_ratio": 1.0228143918837398, "grad/layer_4/attn": 0.0022885696962475777, "grad/layer_4/mlp": 0.002313719131052494, "grad/layer_4/attn_mlp_ratio": 0.9891302563995735, "grad/layer_8/attn": 0.003743336535990238, "grad/layer_8/mlp": 0.0033712973818182945, "grad/layer_8/attn_mlp_ratio": 1.1103548577894367, "grad/layer_12/attn": 0.007246709894388914, "grad/layer_12/mlp": 0.006569802761077881, "grad/layer_12/attn_mlp_ratio": 1.1030330814523066, "grad/layer_16/attn": 0.0038011816795915365, "grad/layer_16/mlp": 0.004540934227406979, "grad/layer_16/attn_mlp_ratio": 0.8370924143626924, "grad/layer_20/attn": 0.003150138072669506, "grad/layer_20/mlp": 0.005859028548002243, "grad/layer_20/attn_mlp_ratio": 0.5376553456081173, "grad/layer_24/attn": 0.008641165681183338, "grad/layer_24/mlp": 0.007661681622266769, "grad/layer_24/attn_mlp_ratio": 1.1278419013504493, "grad/layer_27/attn": 0.00963915977627039, "grad/layer_27/mlp": 0.006515020504593849, "grad/layer_27/attn_mlp_ratio": 1.4795286709413717} {"step": 47050, "timestamp": 1778245397.951364, "train/loss": 2.166710376739502, "train/z_loss": 0.0013902977108955384, "train/perplexity": 8.729519923332738, "train/grad_norm": 0.126953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020310.8508466622, "perf/iters_per_sec": 0.9633592848046599, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0380343198776245, "data/tokens_consumed": 98673098752, "data/tokens_consumed_B": 98.673098752, "train/loss_slope": 1.7785535408074995e-05} {"step": 47060, "timestamp": 1778245408.3220026, "train/loss": 2.1713425397872923, "train/z_loss": 0.0013895212789066136, "train/perplexity": 8.770050282090214, "train/grad_norm": 0.09326171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023523.1438720552, "perf/iters_per_sec": 0.9648910254822041, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0363864660263062, "data/tokens_consumed": 98694070272, "data/tokens_consumed_B": 98.694070272, "train/loss_slope": 1.543437951039598e-05} {"step": 47070, "timestamp": 1778245418.6888037, "train/loss": 2.1360378503799438, "train/z_loss": 0.0013879712205380201, "train/perplexity": 8.465828212051681, "train/grad_norm": 0.25, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023907.493175482, "perf/iters_per_sec": 0.9650742975118074, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036189651489258, "data/tokens_consumed": 98715041792, "data/tokens_consumed_B": 98.715041792, "train/loss_slope": 1.2582413548647832e-05} {"step": 47080, "timestamp": 1778245429.0595472, "train/loss": 2.1811880826950074, "train/z_loss": 0.001393263239879161, "train/perplexity": 8.856822648034658, "train/grad_norm": 0.2236328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023111.6725151364, "perf/iters_per_sec": 0.9646948206496889, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0365972518920898, "data/tokens_consumed": 98736013312, "data/tokens_consumed_B": 98.736013312, "train/loss_slope": 1.7602142306229716e-05} {"step": 47090, "timestamp": 1778245439.9411712, "train/loss": 2.16380352973938, "train/z_loss": 0.0013951928121969104, "train/perplexity": 8.704181389992948, "train/grad_norm": 0.26953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1928538.7950237298, "perf/iters_per_sec": 0.9195989585035943, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0874305486679077, "data/tokens_consumed": 98756984832, "data/tokens_consumed_B": 98.756984832, "train/loss_slope": 2.1463793596633417e-05} {"step": 47100, "timestamp": 1778245450.306673, "grad/layer_0/attn": 0.003026437945663929, "grad/layer_0/mlp": 0.003084284020587802, "grad/layer_0/attn_mlp_ratio": 0.981244861801886, "grad/layer_4/attn": 0.0019258055835962296, "grad/layer_4/mlp": 0.0024653153959661722, "grad/layer_4/attn_mlp_ratio": 0.7811598907917846, "grad/layer_8/attn": 0.004023007582873106, "grad/layer_8/mlp": 0.003678777953609824, "grad/layer_8/attn_mlp_ratio": 1.0935717034969001, "grad/layer_12/attn": 0.004883938003331423, "grad/layer_12/mlp": 0.007146606221795082, "grad/layer_12/attn_mlp_ratio": 0.6833926178970884, "grad/layer_16/attn": 0.003447656985372305, "grad/layer_16/mlp": 0.004314614925533533, "grad/layer_16/attn_mlp_ratio": 0.7990647983584532, "grad/layer_20/attn": 0.003220798447728157, "grad/layer_20/mlp": 0.00630154786631465, "grad/layer_20/attn_mlp_ratio": 0.5111122639937287, "grad/layer_24/attn": 0.012706127017736435, "grad/layer_24/mlp": 0.011650923639535904, "grad/layer_24/attn_mlp_ratio": 1.0905682074477783, "grad/layer_27/attn": 0.004667187109589577, "grad/layer_27/mlp": 0.010772001929581165, "grad/layer_27/attn_mlp_ratio": 0.4332701661931497} {"step": 47100, "timestamp": 1778245450.89302, "eos/sharpness": 53.94415855407714, "eos/L0_probe": 1.9846889972686768, "eos/L_plus": 2.309932231903076, "eos/L_minus": 2.198887348175049, "eos/grad_norm": 0.1683819741010666, "eos/embed_grad_frac": 0.12047997862100601, "eos/time_s": 0.5836677551269531} {"step": 47100, "timestamp": 1778245450.9146216, "train/loss": 2.145395803451538, "train/z_loss": 0.001407072925940156, "train/perplexity": 8.54542287588696, "train/grad_norm": 0.16796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1911924.3461119432, "perf/iters_per_sec": 0.911676571899387, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0968802213668822, "data/tokens_consumed": 98777956352, "data/tokens_consumed_B": 98.777956352, "train/loss_slope": 1.766923762927214e-05} {"step": 47100, "timestamp": 1778245452.311862, "geo/rankme_last": 439.099853515625, "geo/layer_0/stable_rank_q_proj": 19.462810516357422, "geo/layer_0/stable_rank_k_proj": 16.45198631286621, "geo/layer_0/stable_rank_o_proj": 48.10881423950195, "geo/layer_0/stable_rank_gate_proj": 135.25917053222656, "geo/layer_0/stable_rank_down_proj": 53.36916732788086, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06431028246879578, "geo/layer_0/attn_entropy_mean": 6.184286594390869, "geo/layer_0/attn_entropy_std": 0.3904954791069031, "geo/layer_7/stable_rank_q_proj": 42.64663314819336, "geo/layer_7/stable_rank_k_proj": 42.221500396728516, "geo/layer_7/stable_rank_o_proj": 94.51361083984375, "geo/layer_7/stable_rank_gate_proj": 87.68528747558594, "geo/layer_7/stable_rank_down_proj": 144.88978576660156, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.48274901509284973, "geo/layer_7/attn_entropy_mean": 4.646070957183838, "geo/layer_7/attn_entropy_std": 0.7989923357963562, "geo/layer_14/stable_rank_q_proj": 53.095149993896484, "geo/layer_14/stable_rank_k_proj": 38.44575119018555, "geo/layer_14/stable_rank_o_proj": 46.350345611572266, "geo/layer_14/stable_rank_gate_proj": 74.48599243164062, "geo/layer_14/stable_rank_down_proj": 131.3828582763672, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3871242105960846, "geo/layer_14/attn_entropy_mean": 5.535541534423828, "geo/layer_14/attn_entropy_std": 0.3693948984146118, "geo/layer_21/stable_rank_q_proj": 42.3419075012207, "geo/layer_21/stable_rank_k_proj": 30.648632049560547, "geo/layer_21/stable_rank_o_proj": 74.22055053710938, "geo/layer_21/stable_rank_gate_proj": 70.22862243652344, "geo/layer_21/stable_rank_down_proj": 53.9455451965332, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14296996593475342, "geo/layer_21/attn_entropy_mean": 5.707663536071777, "geo/layer_21/attn_entropy_std": 0.2991779148578644, "geo/layer_27/stable_rank_q_proj": 42.600685119628906, "geo/layer_27/stable_rank_k_proj": 31.524494171142578, "geo/layer_27/stable_rank_o_proj": 115.82714080810547, "geo/layer_27/stable_rank_gate_proj": 83.3624496459961, "geo/layer_27/stable_rank_down_proj": 130.22628784179688, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09534665942192078, "geo/layer_27/attn_entropy_mean": 4.25490665435791, "geo/layer_27/attn_entropy_std": 0.6617438793182373, "attnres/final_alpha/block_0": 0.24056494235992432, "attnres/block_norm/0": 1.7368450164794922, "attnres/final_alpha/block_1": 0.0048893410712480545, "attnres/block_norm/1": 42623.578125, "attnres/final_alpha/block_2": 0.010895542800426483, "attnres/block_norm/2": 27240.77734375, "attnres/final_alpha/block_3": 0.012842508964240551, "attnres/block_norm/3": 50652.1171875, "attnres/final_alpha/block_4": 0.015166103839874268, "attnres/block_norm/4": 13740.412109375, "attnres/final_alpha/block_5": 0.601814866065979, "attnres/block_norm/5": 6271.5654296875, "attnres/final_alpha/block_6": 0.11382674425840378, "attnres/block_norm/6": 33558.6484375, "geo/tier1_time_s": 1.3747284412384033, "geo/step": 47100.0, "geo/rankme_slope": -4.907759978991597e-05} {"step": 47110, "timestamp": 1778245462.7076738, "train/loss": 2.166763973236084, "train/z_loss": 0.001387867785524577, "train/perplexity": 8.729987807555839, "train/grad_norm": 0.1171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1781675.70017089, "perf/iters_per_sec": 0.8495691777090502, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.177067184448242, "data/tokens_consumed": 98798927872, "data/tokens_consumed_B": 98.798927872, "train/loss_slope": 1.5091520381553253e-05} {"step": 47120, "timestamp": 1778245473.0926611, "train/loss": 2.1781081914901734, "train/z_loss": 0.0013877652469091117, "train/perplexity": 8.829586561479429, "train/grad_norm": 0.15234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020883.2090293467, "perf/iters_per_sec": 0.9636322064539655, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0377403259277345, "data/tokens_consumed": 98819899392, "data/tokens_consumed_B": 98.819899392, "train/loss_slope": 1.4170016263863965e-05} {"step": 47130, "timestamp": 1778245483.477638, "train/loss": 2.1754666566848755, "train/z_loss": 0.0013931822148151695, "train/perplexity": 8.806293679284645, "train/grad_norm": 0.15234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020960.4239909912, "perf/iters_per_sec": 0.9636690254168468, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0377006769180297, "data/tokens_consumed": 98840870912, "data/tokens_consumed_B": 98.840870912, "train/loss_slope": 1.6534937504637637e-05} {"step": 47140, "timestamp": 1778245493.8582606, "train/loss": 2.152614426612854, "train/z_loss": 0.001408647489733994, "train/perplexity": 8.607332244750502, "train/grad_norm": 0.1328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021769.7948460907, "perf/iters_per_sec": 0.9640549635153249, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0372852563858033, "data/tokens_consumed": 98861842432, "data/tokens_consumed_B": 98.861842432, "train/loss_slope": 1.8142091752243204e-05} {"step": 47150, "timestamp": 1778245504.226403, "grad/layer_0/attn": 0.002999706659466028, "grad/layer_0/mlp": 0.0030044857412576675, "grad/layer_0/attn_mlp_ratio": 0.9984093179185565, "grad/layer_4/attn": 0.0019652415066957474, "grad/layer_4/mlp": 0.0026109647005796432, "grad/layer_4/attn_mlp_ratio": 0.7526878593918451, "grad/layer_8/attn": 0.007873142138123512, "grad/layer_8/mlp": 0.0038667460903525352, "grad/layer_8/attn_mlp_ratio": 2.036115573803852, "grad/layer_12/attn": 0.004196988418698311, "grad/layer_12/mlp": 0.006604581605643034, "grad/layer_12/attn_mlp_ratio": 0.6354661969148407, "grad/layer_16/attn": 0.003926955163478851, "grad/layer_16/mlp": 0.0050164866261184216, "grad/layer_16/attn_mlp_ratio": 0.7828098384140227, "grad/layer_20/attn": 0.0052665602415800095, "grad/layer_20/mlp": 0.0066140382550656796, "grad/layer_20/attn_mlp_ratio": 0.7962699879940008, "grad/layer_24/attn": 0.010657419450581074, "grad/layer_24/mlp": 0.011045337654650211, "grad/layer_24/attn_mlp_ratio": 0.964879452970479, "grad/layer_27/attn": 0.004274425096809864, "grad/layer_27/mlp": 0.010816230438649654, "grad/layer_27/attn_mlp_ratio": 0.39518620479991196} {"step": 47150, "timestamp": 1778245504.2406864, "train/loss": 2.1726595878601076, "train/z_loss": 0.001389631920028478, "train/perplexity": 8.781608469586265, "train/grad_norm": 0.1650390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021261.9577907242, "perf/iters_per_sec": 0.9638128079370137, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037545871734619, "data/tokens_consumed": 98882813952, "data/tokens_consumed_B": 98.882813952, "train/loss_slope": 1.7517778975735453e-05} {"step": 47160, "timestamp": 1778245514.616928, "train/loss": 2.179684948921204, "train/z_loss": 0.0014017493696883321, "train/perplexity": 8.843519659374325, "train/grad_norm": 0.1640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022128.1892971515, "perf/iters_per_sec": 0.9642258593068845, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037101411819458, "data/tokens_consumed": 98903785472, "data/tokens_consumed_B": 98.903785472, "train/loss_slope": 1.8605978754308604e-05} {"step": 47170, "timestamp": 1778245524.9994142, "train/loss": 2.1678914546966555, "train/z_loss": 0.0013964294688776136, "train/perplexity": 8.739836257889165, "train/grad_norm": 0.19140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021246.8162615537, "perf/iters_per_sec": 0.9638055878932732, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037553644180298, "data/tokens_consumed": 98924756992, "data/tokens_consumed_B": 98.924756992, "train/loss_slope": 1.9564004206206843e-05} {"step": 47175, "timestamp": 1778245530.7682006, "eos/sharpness": 62.40248680114745, "eos/L0_probe": 1.9881306886672974, "eos/L_plus": 2.268648624420166, "eos/L_minus": 2.3316376209259033, "eos/grad_norm": 0.1816454380750656, "eos/embed_grad_frac": 0.08274488151073456, "eos/time_s": 0.5908420085906982} {"step": 47175, "timestamp": 1778245532.1503615, "geo/rankme_last": 439.1092529296875, "geo/layer_0/stable_rank_q_proj": 19.4626407623291, "geo/layer_0/stable_rank_k_proj": 16.426074981689453, "geo/layer_0/stable_rank_o_proj": 48.115478515625, "geo/layer_0/stable_rank_gate_proj": 135.10955810546875, "geo/layer_0/stable_rank_down_proj": 53.3406867980957, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05907667800784111, "geo/layer_0/attn_entropy_mean": 6.179811477661133, "geo/layer_0/attn_entropy_std": 0.388771116733551, "geo/layer_7/stable_rank_q_proj": 42.59468460083008, "geo/layer_7/stable_rank_k_proj": 42.14827346801758, "geo/layer_7/stable_rank_o_proj": 94.35433197021484, "geo/layer_7/stable_rank_gate_proj": 87.70767211914062, "geo/layer_7/stable_rank_down_proj": 144.8897247314453, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.46992793679237366, "geo/layer_7/attn_entropy_mean": 4.657632350921631, "geo/layer_7/attn_entropy_std": 0.798434317111969, "geo/layer_14/stable_rank_q_proj": 53.16017150878906, "geo/layer_14/stable_rank_k_proj": 38.410491943359375, "geo/layer_14/stable_rank_o_proj": 46.340126037597656, "geo/layer_14/stable_rank_gate_proj": 74.50437927246094, "geo/layer_14/stable_rank_down_proj": 131.77235412597656, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4100235402584076, "geo/layer_14/attn_entropy_mean": 5.495326042175293, "geo/layer_14/attn_entropy_std": 0.3776010572910309, "geo/layer_21/stable_rank_q_proj": 42.31912612915039, "geo/layer_21/stable_rank_k_proj": 30.62972068786621, "geo/layer_21/stable_rank_o_proj": 74.22269439697266, "geo/layer_21/stable_rank_gate_proj": 70.17098236083984, "geo/layer_21/stable_rank_down_proj": 53.9041862487793, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1436675786972046, "geo/layer_21/attn_entropy_mean": 5.721965789794922, "geo/layer_21/attn_entropy_std": 0.29970696568489075, "geo/layer_27/stable_rank_q_proj": 42.6998291015625, "geo/layer_27/stable_rank_k_proj": 31.604726791381836, "geo/layer_27/stable_rank_o_proj": 115.47885131835938, "geo/layer_27/stable_rank_gate_proj": 83.3402328491211, "geo/layer_27/stable_rank_down_proj": 130.11773681640625, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09170699864625931, "geo/layer_27/attn_entropy_mean": 4.2531843185424805, "geo/layer_27/attn_entropy_std": 0.6802331209182739, "attnres/final_alpha/block_0": 0.24127915501594543, "attnres/block_norm/0": 1.7371692657470703, "attnres/final_alpha/block_1": 0.004986613057553768, "attnres/block_norm/1": 42559.45703125, "attnres/final_alpha/block_2": 0.010929265059530735, "attnres/block_norm/2": 27297.669921875, "attnres/final_alpha/block_3": 0.01267585065215826, "attnres/block_norm/3": 50999.8359375, "attnres/final_alpha/block_4": 0.01537352055311203, "attnres/block_norm/4": 13712.6171875, "attnres/final_alpha/block_5": 0.6008857488632202, "attnres/block_norm/5": 6288.19677734375, "attnres/final_alpha/block_6": 0.11386984586715698, "attnres/block_norm/6": 33717.7109375, "geo/tier1_time_s": 1.363612174987793, "geo/step": 47175.0, "geo/rankme_slope": -8.902752507252901e-05} {"step": 47180, "timestamp": 1778245537.3397894, "train/loss": 2.13167724609375, "train/z_loss": 0.0014016778790391982, "train/perplexity": 8.428992456709251, "train/grad_norm": 0.2109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1700401.0518960934, "perf/iters_per_sec": 0.8108144053917377, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.233327865600586, "data/tokens_consumed": 98945728512, "data/tokens_consumed_B": 98.945728512, "train/loss_slope": 1.706792947017794e-05} {"step": 47190, "timestamp": 1778245547.7172334, "train/loss": 2.158981275558472, "train/z_loss": 0.0013887284905649722, "train/perplexity": 8.662308656518523, "train/grad_norm": 0.13671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021709.106719025, "perf/iters_per_sec": 0.9640260251612782, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373163938522338, "data/tokens_consumed": 98966700032, "data/tokens_consumed_B": 98.966700032, "train/loss_slope": 1.6988164942936338e-05} {"step": 47200, "timestamp": 1778245558.605446, "grad/layer_0/attn": 0.0024803860578686, "grad/layer_0/mlp": 0.002781642833724618, "grad/layer_0/attn_mlp_ratio": 0.8916982218660843, "grad/layer_4/attn": 0.003002349054440856, "grad/layer_4/mlp": 0.0025619035586714745, "grad/layer_4/attn_mlp_ratio": 1.1719211392975577, "grad/layer_8/attn": 0.005075765307992697, "grad/layer_8/mlp": 0.003731051692739129, "grad/layer_8/attn_mlp_ratio": 1.3604113772610897, "grad/layer_12/attn": 0.003938400186598301, "grad/layer_12/mlp": 0.0063314116559922695, "grad/layer_12/attn_mlp_ratio": 0.6220413927226991, "grad/layer_16/attn": 0.006950931157916784, "grad/layer_16/mlp": 0.004634309560060501, "grad/layer_16/attn_mlp_ratio": 1.4998849165867005, "grad/layer_20/attn": 0.007676604203879833, "grad/layer_20/mlp": 0.005908160470426083, "grad/layer_20/attn_mlp_ratio": 1.2993222022952255, "grad/layer_24/attn": 0.006090136244893074, "grad/layer_24/mlp": 0.008100373670458794, "grad/layer_24/attn_mlp_ratio": 0.7518339792051518, "grad/layer_27/attn": 0.006827826611697674, "grad/layer_27/mlp": 0.006808166392147541, "grad/layer_27/attn_mlp_ratio": 1.0028877260232705} {"step": 47200, "timestamp": 1778245558.6200166, "train/loss": 2.1308928966522216, "train/z_loss": 0.001402485405560583, "train/perplexity": 8.422383773280568, "train/grad_norm": 0.10107421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1924666.9814766548, "perf/iters_per_sec": 0.9177527339347147, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0896181106567382, "data/tokens_consumed": 98987671552, "data/tokens_consumed_B": 98.987671552, "train/loss_slope": 1.6090031425551692e-05} {"step": 47210, "timestamp": 1778245569.0060306, "train/loss": 2.1700510501861574, "train/z_loss": 0.0013960515963844955, "train/perplexity": 8.758731164184304, "train/grad_norm": 0.1728515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020895.9771816698, "perf/iters_per_sec": 0.9636382947834348, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0377337694168092, "data/tokens_consumed": 99008643072, "data/tokens_consumed_B": 99.008643072, "train/loss_slope": 1.707640400957219e-05} {"step": 47220, "timestamp": 1778245579.390421, "train/loss": 2.1586838006973266, "train/z_loss": 0.0013908458524383605, "train/perplexity": 8.659732220685171, "train/grad_norm": 0.0947265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020412.6178734982, "perf/iters_per_sec": 0.9634078111045352, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0379820346832276, "data/tokens_consumed": 99029614592, "data/tokens_consumed_B": 99.029614592, "train/loss_slope": 1.7182727386527335e-05} {"step": 47230, "timestamp": 1778245589.777868, "train/loss": 2.170448088645935, "train/z_loss": 0.0013975952053442598, "train/perplexity": 8.762209407767882, "train/grad_norm": 0.1748046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020503.0705896262, "perf/iters_per_sec": 0.9634509423206454, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0379355669021606, "data/tokens_consumed": 99050586112, "data/tokens_consumed_B": 99.050586112, "train/loss_slope": 1.943082323741033e-05} {"step": 47240, "timestamp": 1778245600.163278, "train/loss": 2.112461602687836, "train/z_loss": 0.0013897723634727298, "train/perplexity": 8.268570188119329, "train/grad_norm": 0.24609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020715.6593924505, "perf/iters_per_sec": 0.96355231256125, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037826371192932, "data/tokens_consumed": 99071557632, "data/tokens_consumed_B": 99.071557632, "train/loss_slope": 1.4208880938676817e-05} {"step": 47250, "timestamp": 1778245610.5359516, "grad/layer_0/attn": 0.002683889353647828, "grad/layer_0/mlp": 0.0029401255305856466, "grad/layer_0/attn_mlp_ratio": 0.912848527875056, "grad/layer_4/attn": 0.002191360341385007, "grad/layer_4/mlp": 0.0024913086090236902, "grad/layer_4/attn_mlp_ratio": 0.8796020876287832, "grad/layer_8/attn": 0.0031290389597415924, "grad/layer_8/mlp": 0.0035550284665077925, "grad/layer_8/attn_mlp_ratio": 0.8801726628079779, "grad/layer_12/attn": 0.006121027749031782, "grad/layer_12/mlp": 0.006780944298952818, "grad/layer_12/attn_mlp_ratio": 0.9026807165646504, "grad/layer_16/attn": 0.0036788315046578646, "grad/layer_16/mlp": 0.004934921860694885, "grad/layer_16/attn_mlp_ratio": 0.7454690335447269, "grad/layer_20/attn": 0.004527534823864698, "grad/layer_20/mlp": 0.006629414390772581, "grad/layer_20/attn_mlp_ratio": 0.6829464095459005, "grad/layer_24/attn": 0.013544606044888496, "grad/layer_24/mlp": 0.010943657718598843, "grad/layer_24/attn_mlp_ratio": 1.2376671739378875, "grad/layer_27/attn": 0.009775799699127674, "grad/layer_27/mlp": 0.00984899140894413, "grad/layer_27/attn_mlp_ratio": 0.99256859854636} {"step": 47250, "timestamp": 1778245611.1248596, "eos/sharpness": 80.2481174468994, "eos/L0_probe": 1.9860490560531616, "eos/L_plus": 2.4954564571380615, "eos/L_minus": 2.279122829437256, "eos/grad_norm": 0.22918438911437988, "eos/embed_grad_frac": 0.039697930216789246, "eos/time_s": 0.5860562324523926} {"step": 47250, "timestamp": 1778245611.144394, "train/loss": 2.131828212738037, "train/z_loss": 0.0013860490871593356, "train/perplexity": 8.430265049472272, "train/grad_norm": 0.2294921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1910499.8541528194, "perf/iters_per_sec": 0.9109973212017152, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0976980686187745, "data/tokens_consumed": 99092529152, "data/tokens_consumed_B": 99.092529152, "train/loss_slope": 1.042580404261593e-05} {"step": 47250, "timestamp": 1778245612.5041645, "geo/rankme_last": 439.1917724609375, "geo/layer_0/stable_rank_q_proj": 19.472692489624023, "geo/layer_0/stable_rank_k_proj": 16.423154830932617, "geo/layer_0/stable_rank_o_proj": 48.07181167602539, "geo/layer_0/stable_rank_gate_proj": 134.713623046875, "geo/layer_0/stable_rank_down_proj": 53.39008331298828, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05907413363456726, "geo/layer_0/attn_entropy_mean": 6.184001922607422, "geo/layer_0/attn_entropy_std": 0.39098086953163147, "geo/layer_7/stable_rank_q_proj": 42.72935104370117, "geo/layer_7/stable_rank_k_proj": 42.00074768066406, "geo/layer_7/stable_rank_o_proj": 94.22232055664062, "geo/layer_7/stable_rank_gate_proj": 87.84459686279297, "geo/layer_7/stable_rank_down_proj": 144.7383575439453, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.47404351830482483, "geo/layer_7/attn_entropy_mean": 4.638720989227295, "geo/layer_7/attn_entropy_std": 0.8132753968238831, "geo/layer_14/stable_rank_q_proj": 53.14357376098633, "geo/layer_14/stable_rank_k_proj": 38.38804626464844, "geo/layer_14/stable_rank_o_proj": 46.33563995361328, "geo/layer_14/stable_rank_gate_proj": 74.62061309814453, "geo/layer_14/stable_rank_down_proj": 131.53504943847656, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39056146144866943, "geo/layer_14/attn_entropy_mean": 5.529838562011719, "geo/layer_14/attn_entropy_std": 0.377715528011322, "geo/layer_21/stable_rank_q_proj": 42.10235595703125, "geo/layer_21/stable_rank_k_proj": 30.57168960571289, "geo/layer_21/stable_rank_o_proj": 74.18638610839844, "geo/layer_21/stable_rank_gate_proj": 70.20309448242188, "geo/layer_21/stable_rank_down_proj": 53.90542221069336, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1436510533094406, "geo/layer_21/attn_entropy_mean": 5.703035354614258, "geo/layer_21/attn_entropy_std": 0.28848958015441895, "geo/layer_27/stable_rank_q_proj": 42.70329284667969, "geo/layer_27/stable_rank_k_proj": 31.534887313842773, "geo/layer_27/stable_rank_o_proj": 115.56385040283203, "geo/layer_27/stable_rank_gate_proj": 83.41244506835938, "geo/layer_27/stable_rank_down_proj": 130.2582244873047, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09097862988710403, "geo/layer_27/attn_entropy_mean": 4.275509834289551, "geo/layer_27/attn_entropy_std": 0.6843472719192505, "attnres/final_alpha/block_0": 0.23976123332977295, "attnres/block_norm/0": 1.7373406887054443, "attnres/final_alpha/block_1": 0.004961742088198662, "attnres/block_norm/1": 42354.7265625, "attnres/final_alpha/block_2": 0.010866701602935791, "attnres/block_norm/2": 27229.1953125, "attnres/final_alpha/block_3": 0.012465447187423706, "attnres/block_norm/3": 51085.546875, "attnres/final_alpha/block_4": 0.015318961814045906, "attnres/block_norm/4": 13677.583984375, "attnres/final_alpha/block_5": 0.6036719679832458, "attnres/block_norm/5": 6216.3740234375, "attnres/final_alpha/block_6": 0.11295393854379654, "attnres/block_norm/6": 33800.390625, "geo/tier1_time_s": 1.3564927577972412, "geo/step": 47250.0, "geo/rankme_slope": -8.506420146183473e-05} {"step": 47260, "timestamp": 1778245622.8914347, "train/loss": 2.154991793632507, "train/z_loss": 0.0013876152341254055, "train/perplexity": 8.627819375621776, "train/grad_norm": 0.1513671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1785735.7807740576, "perf/iters_per_sec": 0.8515051750059403, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.174390983581543, "data/tokens_consumed": 99113500672, "data/tokens_consumed_B": 99.113500672, "train/loss_slope": 1.0906665646346776e-05} {"step": 47270, "timestamp": 1778245633.7874572, "train/loss": 2.1687820672988893, "train/z_loss": 0.0013891295180656015, "train/perplexity": 8.747623533410026, "train/grad_norm": 0.10302734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1925494.1933934882, "perf/iters_per_sec": 0.9181471793143693, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0891499996185303, "data/tokens_consumed": 99134472192, "data/tokens_consumed_B": 99.134472192, "train/loss_slope": 1.2709999141698887e-05} {"step": 47280, "timestamp": 1778245644.6786826, "train/loss": 2.177250528335571, "train/z_loss": 0.0013827111572027207, "train/perplexity": 8.82201699694758, "train/grad_norm": 0.1279296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1927001.634737329, "perf/iters_per_sec": 0.9188659833609243, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0882979869842528, "data/tokens_consumed": 99155443712, "data/tokens_consumed_B": 99.155443712, "train/loss_slope": 1.5059233226827661e-05} {"step": 47290, "timestamp": 1778245655.06387, "train/loss": 2.147918176651001, "train/z_loss": 0.0013792652054689825, "train/perplexity": 8.567004828954492, "train/grad_norm": 0.1298828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020974.678991694, "perf/iters_per_sec": 0.9636758227308722, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0376933574676515, "data/tokens_consumed": 99176415232, "data/tokens_consumed_B": 99.176415232, "train/loss_slope": 1.4817889202402937e-05} {"step": 47300, "timestamp": 1778245665.430614, "grad/layer_0/attn": 0.002714176895096898, "grad/layer_0/mlp": 0.0028757296968251467, "grad/layer_0/attn_mlp_ratio": 0.9438219467257996, "grad/layer_4/attn": 0.002291623502969742, "grad/layer_4/mlp": 0.0024487951304763556, "grad/layer_4/attn_mlp_ratio": 0.9358167128265601, "grad/layer_8/attn": 0.003444378264248371, "grad/layer_8/mlp": 0.0035951940808445215, "grad/layer_8/attn_mlp_ratio": 0.9580506896123415, "grad/layer_12/attn": 0.004065591376274824, "grad/layer_12/mlp": 0.006327263545244932, "grad/layer_12/attn_mlp_ratio": 0.6425512834968082, "grad/layer_16/attn": 0.004697273951023817, "grad/layer_16/mlp": 0.004512684419751167, "grad/layer_16/attn_mlp_ratio": 1.0409045725365327, "grad/layer_20/attn": 0.004539185203611851, "grad/layer_20/mlp": 0.0065923044458031654, "grad/layer_20/attn_mlp_ratio": 0.6885581775043472, "grad/layer_24/attn": 0.02573060430586338, "grad/layer_24/mlp": 0.012552213855087757, "grad/layer_24/attn_mlp_ratio": 2.0498857331406515, "grad/layer_27/attn": 0.007486224640160799, "grad/layer_27/mlp": 0.010517614893615246, "grad/layer_27/attn_mlp_ratio": 0.7117796805364465} {"step": 47300, "timestamp": 1778245665.4452693, "train/loss": 2.1196069717407227, "train/z_loss": 0.0013939824537374079, "train/perplexity": 8.32786375859927, "train/grad_norm": 0.2314453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021000.867855834, "perf/iters_per_sec": 0.9636883105544253, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03767991065979, "data/tokens_consumed": 99197386752, "data/tokens_consumed_B": 99.197386752, "train/loss_slope": 1.2381313900814557e-05} {"step": 47310, "timestamp": 1778245675.8276823, "train/loss": 2.135710871219635, "train/z_loss": 0.0013953927671536803, "train/perplexity": 8.463060515165848, "train/grad_norm": 0.2099609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021386.210501891, "perf/iters_per_sec": 0.9638720562467056, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0374820947647094, "data/tokens_consumed": 99218358272, "data/tokens_consumed_B": 99.218358272, "train/loss_slope": 7.439372212615752e-06} {"step": 47320, "timestamp": 1778245686.2080708, "train/loss": 2.1639107942581175, "train/z_loss": 0.00139511835295707, "train/perplexity": 8.705115089896289, "train/grad_norm": 0.1533203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022122.843355115, "perf/iters_per_sec": 0.9642233101630759, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0371041536331176, "data/tokens_consumed": 99239329792, "data/tokens_consumed_B": 99.239329792, "train/loss_slope": 6.523883146027958e-06} {"step": 47325, "timestamp": 1778245692.4924455, "eos/sharpness": 74.28643703460692, "eos/L0_probe": 1.9829192161560059, "eos/L_plus": 2.272813558578491, "eos/L_minus": 2.43588924407959, "eos/grad_norm": 0.213711217045784, "eos/embed_grad_frac": 0.05222318321466446, "eos/time_s": 0.5971672534942627} {"step": 47325, "timestamp": 1778245693.8712633, "geo/rankme_last": 440.0176696777344, "geo/layer_0/stable_rank_q_proj": 19.46697235107422, "geo/layer_0/stable_rank_k_proj": 16.456331253051758, "geo/layer_0/stable_rank_o_proj": 48.167747497558594, "geo/layer_0/stable_rank_gate_proj": 134.64515686035156, "geo/layer_0/stable_rank_down_proj": 53.37770080566406, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0652724876999855, "geo/layer_0/attn_entropy_mean": 6.182063102722168, "geo/layer_0/attn_entropy_std": 0.39074692130088806, "geo/layer_7/stable_rank_q_proj": 42.76232147216797, "geo/layer_7/stable_rank_k_proj": 41.990318298339844, "geo/layer_7/stable_rank_o_proj": 94.23684692382812, "geo/layer_7/stable_rank_gate_proj": 87.71598815917969, "geo/layer_7/stable_rank_down_proj": 144.9407958984375, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.47378358244895935, "geo/layer_7/attn_entropy_mean": 4.656800746917725, "geo/layer_7/attn_entropy_std": 0.7987280488014221, "geo/layer_14/stable_rank_q_proj": 53.094085693359375, "geo/layer_14/stable_rank_k_proj": 38.38658905029297, "geo/layer_14/stable_rank_o_proj": 46.32338333129883, "geo/layer_14/stable_rank_gate_proj": 74.5394058227539, "geo/layer_14/stable_rank_down_proj": 131.136474609375, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3890765309333801, "geo/layer_14/attn_entropy_mean": 5.522420883178711, "geo/layer_14/attn_entropy_std": 0.3842192590236664, "geo/layer_21/stable_rank_q_proj": 42.10468292236328, "geo/layer_21/stable_rank_k_proj": 30.57537078857422, "geo/layer_21/stable_rank_o_proj": 74.15811157226562, "geo/layer_21/stable_rank_gate_proj": 70.11820220947266, "geo/layer_21/stable_rank_down_proj": 53.87640380859375, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15299057960510254, "geo/layer_21/attn_entropy_mean": 5.698765754699707, "geo/layer_21/attn_entropy_std": 0.3043613135814667, "geo/layer_27/stable_rank_q_proj": 42.6446418762207, "geo/layer_27/stable_rank_k_proj": 31.546974182128906, "geo/layer_27/stable_rank_o_proj": 115.50548553466797, "geo/layer_27/stable_rank_gate_proj": 83.30705261230469, "geo/layer_27/stable_rank_down_proj": 130.25294494628906, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08774620294570923, "geo/layer_27/attn_entropy_mean": 4.277838230133057, "geo/layer_27/attn_entropy_std": 0.6791123151779175, "attnres/final_alpha/block_0": 0.24269640445709229, "attnres/block_norm/0": 1.7376794815063477, "attnres/final_alpha/block_1": 0.005078610498458147, "attnres/block_norm/1": 42539.8046875, "attnres/final_alpha/block_2": 0.011074612848460674, "attnres/block_norm/2": 27175.81640625, "attnres/final_alpha/block_3": 0.012690998613834381, "attnres/block_norm/3": 50726.546875, "attnres/final_alpha/block_4": 0.015578213147819042, "attnres/block_norm/4": 13719.6376953125, "attnres/final_alpha/block_5": 0.5966488122940063, "attnres/block_norm/5": 6315.85400390625, "attnres/final_alpha/block_6": 0.11623238027095795, "attnres/block_norm/6": 33423.45703125, "geo/tier1_time_s": 1.3578057289123535, "geo/step": 47325.0, "geo/rankme_slope": -4.923897293292317e-05} {"step": 47330, "timestamp": 1778245699.061278, "train/loss": 2.1645636677742006, "train/z_loss": 0.001400090614333749, "train/perplexity": 8.710800284647542, "train/grad_norm": 0.1630859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1632364.7636391069, "perf/iters_per_sec": 0.7783721750445876, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.284732460975647, "data/tokens_consumed": 99260301312, "data/tokens_consumed_B": 99.260301312, "train/loss_slope": 4.749468422279443e-06} {"step": 47340, "timestamp": 1778245709.8608744, "train/loss": 2.172309231758118, "train/z_loss": 0.0013887232402339579, "train/perplexity": 8.7785323183793, "train/grad_norm": 0.12158203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1942733.6120100305, "perf/iters_per_sec": 0.9263675746965554, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0794851064682007, "data/tokens_consumed": 99281272832, "data/tokens_consumed_B": 99.281272832, "train/loss_slope": 5.683865849048343e-06} {"step": 47350, "timestamp": 1778245720.2311368, "grad/layer_0/attn": 0.0024790416937321424, "grad/layer_0/mlp": 0.002783676143735647, "grad/layer_0/attn_mlp_ratio": 0.8905639437455951, "grad/layer_4/attn": 0.0020205716136842966, "grad/layer_4/mlp": 0.0024748388677835464, "grad/layer_4/attn_mlp_ratio": 0.816445692017655, "grad/layer_8/attn": 0.0036465665325522423, "grad/layer_8/mlp": 0.0036227379459887743, "grad/layer_8/attn_mlp_ratio": 1.0065774798677072, "grad/layer_12/attn": 0.003935643006116152, "grad/layer_12/mlp": 0.006229854188859463, "grad/layer_12/attn_mlp_ratio": 0.6317391745669021, "grad/layer_16/attn": 0.004067733883857727, "grad/layer_16/mlp": 0.004330654628574848, "grad/layer_16/attn_mlp_ratio": 0.9392884306887139, "grad/layer_20/attn": 0.0034650336019694805, "grad/layer_20/mlp": 0.005576497409492731, "grad/layer_20/attn_mlp_ratio": 0.6213637854353995, "grad/layer_24/attn": 0.004961173050105572, "grad/layer_24/mlp": 0.007488173432648182, "grad/layer_24/attn_mlp_ratio": 0.662534465644396, "grad/layer_27/attn": 0.0040480936877429485, "grad/layer_27/mlp": 0.006359952501952648, "grad/layer_27/attn_mlp_ratio": 0.6364974617106569} {"step": 47350, "timestamp": 1778245720.245707, "train/loss": 2.1758907318115233, "train/z_loss": 0.0013956282171420753, "train/perplexity": 8.810029001364597, "train/grad_norm": 0.10400390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020879.4482588766, "perf/iters_per_sec": 0.9636304131788619, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037742257118225, "data/tokens_consumed": 99302244352, "data/tokens_consumed_B": 99.302244352, "train/loss_slope": 4.73908495337912e-06} {"step": 47360, "timestamp": 1778245730.6226223, "train/loss": 2.173582577705383, "train/z_loss": 0.0013853147276677192, "train/perplexity": 8.789717546751135, "train/grad_norm": 0.244140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021922.3676929462, "perf/iters_per_sec": 0.9641277159180385, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0372069835662843, "data/tokens_consumed": 99323215872, "data/tokens_consumed_B": 99.323215872, "train/loss_slope": 4.6101015750760615e-06} {"step": 47370, "timestamp": 1778245741.000204, "train/loss": 2.1633881330490112, "train/z_loss": 0.00139696488622576, "train/perplexity": 8.700566452719645, "train/grad_norm": 0.171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022143.1115979692, "perf/iters_per_sec": 0.9642329748144003, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0370937585830688, "data/tokens_consumed": 99344187392, "data/tokens_consumed_B": 99.344187392, "train/loss_slope": 6.2722313271747985e-06} {"step": 47380, "timestamp": 1778245751.3806732, "train/loss": 2.1464086055755613, "train/z_loss": 0.001392503106035292, "train/perplexity": 8.554082082617626, "train/grad_norm": 0.140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021581.283234562, "perf/iters_per_sec": 0.9639650741741953, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373819828033448, "data/tokens_consumed": 99365158912, "data/tokens_consumed_B": 99.365158912, "train/loss_slope": 8.216266005453413e-06} {"step": 47390, "timestamp": 1778245761.7575457, "train/loss": 2.157253384590149, "train/z_loss": 0.0013885195134207607, "train/perplexity": 8.647354055306996, "train/grad_norm": 0.146484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021892.5763027088, "perf/iters_per_sec": 0.9641135102761788, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0372222661972046, "data/tokens_consumed": 99386130432, "data/tokens_consumed_B": 99.386130432, "train/loss_slope": 6.572385289237728e-06} {"step": 47400, "timestamp": 1778245772.1313322, "grad/layer_0/attn": 0.00314805144444108, "grad/layer_0/mlp": 0.0029689883813261986, "grad/layer_0/attn_mlp_ratio": 1.0603111006462702, "grad/layer_4/attn": 0.0024172074627131224, "grad/layer_4/mlp": 0.00257473299279809, "grad/layer_4/attn_mlp_ratio": 0.9388186563781736, "grad/layer_8/attn": 0.003483694978058338, "grad/layer_8/mlp": 0.003554046619683504, "grad/layer_8/attn_mlp_ratio": 0.9802051725331762, "grad/layer_12/attn": 0.005239445250481367, "grad/layer_12/mlp": 0.0069312178529798985, "grad/layer_12/attn_mlp_ratio": 0.7559198521854017, "grad/layer_16/attn": 0.0042239162139594555, "grad/layer_16/mlp": 0.0044916351325809956, "grad/layer_16/attn_mlp_ratio": 0.9403960907868063, "grad/layer_20/attn": 0.008022461086511612, "grad/layer_20/mlp": 0.0070009115152060986, "grad/layer_20/attn_mlp_ratio": 1.1459166359259116, "grad/layer_24/attn": 0.02200089767575264, "grad/layer_24/mlp": 0.01422067079693079, "grad/layer_24/attn_mlp_ratio": 1.5471068724683754, "grad/layer_27/attn": 0.012394263409078121, "grad/layer_27/mlp": 0.015046452172100544, "grad/layer_27/attn_mlp_ratio": 0.8237332751229226} {"step": 47400, "timestamp": 1778245772.7185066, "eos/sharpness": 78.14707756042479, "eos/L0_probe": 1.9869478940963745, "eos/L_plus": 2.4593698978424072, "eos/L_minus": 2.29599666595459, "eos/grad_norm": 0.2881838083267212, "eos/embed_grad_frac": 0.027869122102856636, "eos/time_s": 0.5843009948730469} {"step": 47400, "timestamp": 1778245772.7380826, "train/loss": 2.1746861219406126, "train/z_loss": 0.001386368798557669, "train/perplexity": 8.799422742950883, "train/grad_norm": 0.2890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1911220.8252831937, "perf/iters_per_sec": 0.9113411070266694, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0972839832305907, "data/tokens_consumed": 99407101952, "data/tokens_consumed_B": 99.407101952, "train/loss_slope": 6.544525280679791e-06} {"step": 47400, "timestamp": 1778245774.0999079, "geo/rankme_last": 438.87335205078125, "geo/layer_0/stable_rank_q_proj": 19.46495819091797, "geo/layer_0/stable_rank_k_proj": 16.456331253051758, "geo/layer_0/stable_rank_o_proj": 48.22731399536133, "geo/layer_0/stable_rank_gate_proj": 134.88357543945312, "geo/layer_0/stable_rank_down_proj": 53.466976165771484, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06262819468975067, "geo/layer_0/attn_entropy_mean": 6.1876912117004395, "geo/layer_0/attn_entropy_std": 0.386949747800827, "geo/layer_7/stable_rank_q_proj": 42.84922790527344, "geo/layer_7/stable_rank_k_proj": 41.973777770996094, "geo/layer_7/stable_rank_o_proj": 94.44877624511719, "geo/layer_7/stable_rank_gate_proj": 87.52549743652344, "geo/layer_7/stable_rank_down_proj": 144.91249084472656, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4856744110584259, "geo/layer_7/attn_entropy_mean": 4.638821601867676, "geo/layer_7/attn_entropy_std": 0.8202469944953918, "geo/layer_14/stable_rank_q_proj": 53.03650665283203, "geo/layer_14/stable_rank_k_proj": 38.40096664428711, "geo/layer_14/stable_rank_o_proj": 46.219703674316406, "geo/layer_14/stable_rank_gate_proj": 74.37913513183594, "geo/layer_14/stable_rank_down_proj": 131.14376831054688, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3865237832069397, "geo/layer_14/attn_entropy_mean": 5.515413284301758, "geo/layer_14/attn_entropy_std": 0.3660341203212738, "geo/layer_21/stable_rank_q_proj": 42.125972747802734, "geo/layer_21/stable_rank_k_proj": 30.517282485961914, "geo/layer_21/stable_rank_o_proj": 74.07208251953125, "geo/layer_21/stable_rank_gate_proj": 70.29566192626953, "geo/layer_21/stable_rank_down_proj": 53.827369689941406, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14259018003940582, "geo/layer_21/attn_entropy_mean": 5.727280139923096, "geo/layer_21/attn_entropy_std": 0.28689512610435486, "geo/layer_27/stable_rank_q_proj": 42.6954345703125, "geo/layer_27/stable_rank_k_proj": 31.475587844848633, "geo/layer_27/stable_rank_o_proj": 115.62347412109375, "geo/layer_27/stable_rank_gate_proj": 83.24080657958984, "geo/layer_27/stable_rank_down_proj": 130.18936157226562, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08941523730754852, "geo/layer_27/attn_entropy_mean": 4.278225898742676, "geo/layer_27/attn_entropy_std": 0.6830723285675049, "attnres/final_alpha/block_0": 0.23848678171634674, "attnres/block_norm/0": 1.7375175952911377, "attnres/final_alpha/block_1": 0.00485227070748806, "attnres/block_norm/1": 42730.0546875, "attnres/final_alpha/block_2": 0.01083311252295971, "attnres/block_norm/2": 27256.310546875, "attnres/final_alpha/block_3": 0.012500384822487831, "attnres/block_norm/3": 51475.6640625, "attnres/final_alpha/block_4": 0.01505594328045845, "attnres/block_norm/4": 13755.34765625, "attnres/final_alpha/block_5": 0.6081474423408508, "attnres/block_norm/5": 6198.60595703125, "attnres/final_alpha/block_6": 0.11012406647205353, "attnres/block_norm/6": 33779.8984375, "geo/tier1_time_s": 1.358022689819336, "geo/step": 47400.0, "geo/rankme_slope": -6.681172468987595e-05} {"step": 47410, "timestamp": 1778245784.4781666, "train/loss": 2.162126564979553, "train/z_loss": 0.0013779368484392763, "train/perplexity": 8.68959701669668, "train/grad_norm": 0.1787109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1786877.2122215014, "perf/iters_per_sec": 0.852049451933623, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1736407995223999, "data/tokens_consumed": 99428073472, "data/tokens_consumed_B": 99.428073472, "train/loss_slope": 6.495325679075188e-06} {"step": 47420, "timestamp": 1778245794.8551447, "train/loss": 2.160233211517334, "train/z_loss": 0.001401890255510807, "train/perplexity": 8.673160103453364, "train/grad_norm": 0.171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022393.8484855841, "perf/iters_per_sec": 0.9643525354793473, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0369651794433594, "data/tokens_consumed": 99449044992, "data/tokens_consumed_B": 99.449044992, "train/loss_slope": 4.570174796639225e-06} {"step": 47430, "timestamp": 1778245805.239279, "train/loss": 2.129767632484436, "train/z_loss": 0.001397946709766984, "train/perplexity": 8.412911696906704, "train/grad_norm": 0.119140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021021.9030201435, "perf/iters_per_sec": 0.9636983409023969, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0376691102981568, "data/tokens_consumed": 99470016512, "data/tokens_consumed_B": 99.470016512, "train/loss_slope": 5.168124215223165e-07} {"step": 47440, "timestamp": 1778245815.6208272, "train/loss": 2.171291160583496, "train/z_loss": 0.001383280917070806, "train/perplexity": 8.769599695464954, "train/grad_norm": 0.1015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021494.0324288707, "perf/iters_per_sec": 0.9639234697479585, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0374267578125, "data/tokens_consumed": 99490988032, "data/tokens_consumed_B": 99.490988032, "train/loss_slope": 1.721074350095537e-06} {"step": 47450, "timestamp": 1778245825.9934855, "grad/layer_0/attn": 0.0025995660107582808, "grad/layer_0/mlp": 0.002805046271532774, "grad/layer_0/attn_mlp_ratio": 0.9267461804340112, "grad/layer_4/attn": 0.0022232146002352238, "grad/layer_4/mlp": 0.0024239083286374807, "grad/layer_4/attn_mlp_ratio": 0.9172023884932524, "grad/layer_8/attn": 0.004344529006630182, "grad/layer_8/mlp": 0.0034382236190140247, "grad/layer_8/attn_mlp_ratio": 1.2635969505428395, "grad/layer_12/attn": 0.005364840384572744, "grad/layer_12/mlp": 0.007055077236145735, "grad/layer_12/attn_mlp_ratio": 0.7604226189111649, "grad/layer_16/attn": 0.0037245419807732105, "grad/layer_16/mlp": 0.004759147763252258, "grad/layer_16/attn_mlp_ratio": 0.7826069052260904, "grad/layer_20/attn": 0.003779734019190073, "grad/layer_20/mlp": 0.006320394575595856, "grad/layer_20/attn_mlp_ratio": 0.5980218345832554, "grad/layer_24/attn": 0.012334681116044521, "grad/layer_24/mlp": 0.010266406461596489, "grad/layer_24/attn_mlp_ratio": 1.2014604177263757, "grad/layer_27/attn": 0.005092845298349857, "grad/layer_27/mlp": 0.009056828916072845, "grad/layer_27/attn_mlp_ratio": 0.5623210164740616} {"step": 47450, "timestamp": 1778245826.007552, "train/loss": 2.1573105096817016, "train/z_loss": 0.0013985888916067779, "train/perplexity": 8.647848050308713, "train/grad_norm": 0.12890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020737.9884542557, "perf/iters_per_sec": 0.9635629598876265, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0378149032592774, "data/tokens_consumed": 99511959552, "data/tokens_consumed_B": 99.511959552, "train/loss_slope": 8.366958107134179e-07} {"step": 47460, "timestamp": 1778245836.3874948, "train/loss": 2.166995120048523, "train/z_loss": 0.001377679267898202, "train/perplexity": 8.732005949644757, "train/grad_norm": 0.0966796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022044.4239583707, "perf/iters_per_sec": 0.9641859168807844, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037144374847412, "data/tokens_consumed": 99532931072, "data/tokens_consumed_B": 99.532931072, "train/loss_slope": 1.7472053840286418e-07} {"step": 47470, "timestamp": 1778245846.7612164, "train/loss": 2.201213312149048, "train/z_loss": 0.0013711614650674165, "train/perplexity": 9.035970303625977, "train/grad_norm": 0.09326171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022943.613597754, "perf/iters_per_sec": 0.9646146839131136, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0366833686828614, "data/tokens_consumed": 99553902592, "data/tokens_consumed_B": 99.553902592, "train/loss_slope": 8.839847946395923e-07} {"step": 47475, "timestamp": 1778245852.534722, "eos/sharpness": 72.318959236145, "eos/L0_probe": 1.984985113143921, "eos/L_plus": 2.297837495803833, "eos/L_minus": 2.395322322845459, "eos/grad_norm": 0.22141742706298828, "eos/embed_grad_frac": 0.05298926308751106, "eos/time_s": 0.5904715061187744} {"step": 47475, "timestamp": 1778245853.9076762, "geo/rankme_last": 439.5613708496094, "geo/layer_0/stable_rank_q_proj": 19.440095901489258, "geo/layer_0/stable_rank_k_proj": 16.457033157348633, "geo/layer_0/stable_rank_o_proj": 48.21804428100586, "geo/layer_0/stable_rank_gate_proj": 134.82122802734375, "geo/layer_0/stable_rank_down_proj": 53.5198974609375, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.058443833142519, "geo/layer_0/attn_entropy_mean": 6.18840217590332, "geo/layer_0/attn_entropy_std": 0.38644275069236755, "geo/layer_7/stable_rank_q_proj": 42.864532470703125, "geo/layer_7/stable_rank_k_proj": 41.879669189453125, "geo/layer_7/stable_rank_o_proj": 94.69237518310547, "geo/layer_7/stable_rank_gate_proj": 87.65967559814453, "geo/layer_7/stable_rank_down_proj": 145.25445556640625, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4867399334907532, "geo/layer_7/attn_entropy_mean": 4.615887641906738, "geo/layer_7/attn_entropy_std": 0.8032941222190857, "geo/layer_14/stable_rank_q_proj": 53.0493278503418, "geo/layer_14/stable_rank_k_proj": 38.461830139160156, "geo/layer_14/stable_rank_o_proj": 46.20027542114258, "geo/layer_14/stable_rank_gate_proj": 74.33706665039062, "geo/layer_14/stable_rank_down_proj": 131.52195739746094, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39616844058036804, "geo/layer_14/attn_entropy_mean": 5.525826454162598, "geo/layer_14/attn_entropy_std": 0.38184812664985657, "geo/layer_21/stable_rank_q_proj": 42.11412811279297, "geo/layer_21/stable_rank_k_proj": 30.48448371887207, "geo/layer_21/stable_rank_o_proj": 74.07195281982422, "geo/layer_21/stable_rank_gate_proj": 70.3064193725586, "geo/layer_21/stable_rank_down_proj": 53.880615234375, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14012737572193146, "geo/layer_21/attn_entropy_mean": 5.7113800048828125, "geo/layer_21/attn_entropy_std": 0.28577256202697754, "geo/layer_27/stable_rank_q_proj": 42.66263961791992, "geo/layer_27/stable_rank_k_proj": 31.506669998168945, "geo/layer_27/stable_rank_o_proj": 115.6825942993164, "geo/layer_27/stable_rank_gate_proj": 83.13558197021484, "geo/layer_27/stable_rank_down_proj": 130.24693298339844, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08412669599056244, "geo/layer_27/attn_entropy_mean": 4.273709297180176, "geo/layer_27/attn_entropy_std": 0.6811118721961975, "attnres/final_alpha/block_0": 0.24336445331573486, "attnres/block_norm/0": 1.7377456426620483, "attnres/final_alpha/block_1": 0.0050078583881258965, "attnres/block_norm/1": 42549.08984375, "attnres/final_alpha/block_2": 0.011015711352229118, "attnres/block_norm/2": 27385.76171875, "attnres/final_alpha/block_3": 0.013065568171441555, "attnres/block_norm/3": 51016.5078125, "attnres/final_alpha/block_4": 0.01582188718020916, "attnres/block_norm/4": 13732.1640625, "attnres/final_alpha/block_5": 0.5961416959762573, "attnres/block_norm/5": 6272.6796875, "attnres/final_alpha/block_6": 0.11558276414871216, "attnres/block_norm/6": 33662.46484375, "geo/tier1_time_s": 1.3544585704803467, "geo/step": 47475.0, "geo/rankme_slope": -7.18284774847439e-05} {"step": 47480, "timestamp": 1778245859.0960746, "train/loss": 2.157528591156006, "train/z_loss": 0.0013977967319078743, "train/perplexity": 8.649734191419828, "train/grad_norm": 0.08935546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1701513.6409730057, "perf/iters_per_sec": 0.8113449292054203, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.232521414756775, "data/tokens_consumed": 99574874112, "data/tokens_consumed_B": 99.574874112, "train/loss_slope": -1.4171032157346133e-06} {"step": 47490, "timestamp": 1778245869.4725761, "train/loss": 2.1374781847000124, "train/z_loss": 0.0013969969120807945, "train/perplexity": 8.478030620637686, "train/grad_norm": 0.1484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022354.2322920111, "perf/iters_per_sec": 0.9643336450061851, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036985492706299, "data/tokens_consumed": 99595845632, "data/tokens_consumed_B": 99.595845632, "train/loss_slope": -1.5376826574449555e-06} {"step": 47500, "timestamp": 1778245879.8464434, "grad/layer_0/attn": 0.0026897082570940256, "grad/layer_0/mlp": 0.0029424086678773165, "grad/layer_0/attn_mlp_ratio": 0.9141178093465268, "grad/layer_4/attn": 0.0021516955457627773, "grad/layer_4/mlp": 0.0025176997296512127, "grad/layer_4/attn_mlp_ratio": 0.8546275137417237, "grad/layer_8/attn": 0.004822534043341875, "grad/layer_8/mlp": 0.0035457927733659744, "grad/layer_8/attn_mlp_ratio": 1.360072123661275, "grad/layer_12/attn": 0.004940510727465153, "grad/layer_12/mlp": 0.006220512092113495, "grad/layer_12/attn_mlp_ratio": 0.7942289276000204, "grad/layer_16/attn": 0.0041883946396410465, "grad/layer_16/mlp": 0.0043514929711818695, "grad/layer_16/attn_mlp_ratio": 0.9625189724830413, "grad/layer_20/attn": 0.005272380076348782, "grad/layer_20/mlp": 0.006030169315636158, "grad/layer_20/attn_mlp_ratio": 0.8743336568086401, "grad/layer_24/attn": 0.018032487481832504, "grad/layer_24/mlp": 0.00961834192276001, "grad/layer_24/attn_mlp_ratio": 1.8748020645514571, "grad/layer_27/attn": 0.009143001399934292, "grad/layer_27/mlp": 0.009038696065545082, "grad/layer_27/attn_mlp_ratio": 1.0115398540319138} {"step": 47500, "timestamp": 1778245879.860669, "train/loss": 2.1795615196228026, "train/z_loss": 0.0013889066176488996, "train/perplexity": 8.84242817730919, "train/grad_norm": 0.1884765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019738.074752572, "perf/iters_per_sec": 0.9630861638796673, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0383286952972413, "data/tokens_consumed": 99616817152, "data/tokens_consumed_B": 99.616817152, "train/loss_slope": -3.6011773331759704e-06} {"step": 47500, "timestamp": 1778245887.065823, "geo/ww_alpha_mean": 7.574061140645992, "geo/ww_alpha_std": 4.972187225010657, "geo/ww_alpha_min": 1.340812350532448, "geo/ww_alpha_max": 49.39510321765643, "geo/ww_alpha_healthy_frac": 0.15736040609137056, "geo/ww_alpha_by_type/q_proj": 4.029446490312187, "geo/ww_alpha_by_type/k_proj": 4.455643930307224, "geo/ww_alpha_by_type/v_proj": 7.98992285126922, "geo/ww_alpha_by_type/o_proj": 7.142139206314691, "geo/ww_alpha_by_type/gate_proj": 8.037721318257352, "geo/ww_alpha_by_type/up_proj": 13.29405256990249, "geo/ww_alpha_by_type/down_proj": 8.170660321633592, "geo/twonn_id/layer_0": 0.7126762270927429, "geo/twonn_id/layer_7": 3.5890254974365234, "geo/twonn_id/layer_14": 5.460987567901611, "geo/twonn_id/layer_21": 6.978787899017334, "geo/twonn_id/layer_27": 6.542952537536621, "geo/tier2_time_s": 7.196156740188599} {"step": 47500, "timestamp": 1778245887.721891, "eoc/jacobian_sigma/layer_0/attn": 1068.47119140625, "eoc/jacobian_sigma/layer_0/mlp": 7982.36328125, "eoc/jacobian_sigma/layer_0": 7982.36328125, "eoc/jacobian_sigma/layer_7/attn": 1.1733475923538208, "eoc/jacobian_sigma/layer_7/mlp": 1.7670820951461792, "eoc/jacobian_sigma/layer_7": 1.7670820951461792, "eoc/jacobian_sigma/layer_14/attn": 1.5784958600997925, "eoc/jacobian_sigma/layer_14/mlp": 5.856992244720459, "eoc/jacobian_sigma/layer_14": 5.856992244720459, "eoc/jacobian_sigma/layer_21/attn": 1.0944262742996216, "eoc/jacobian_sigma/layer_21/mlp": 4.0700483322143555, "eoc/jacobian_sigma/layer_21": 4.0700483322143555, "eoc/jacobian_sigma/layer_27/attn": 3.3450186252593994, "eoc/jacobian_sigma/layer_27/mlp": 29.19816017150879, "eoc/jacobian_sigma/layer_27": 29.19816017150879, "eoc/layer0_sigma": 7982.36328125, "eoc/sigma_max": 29.19816017150879, "eoc/sigma_min": 1.7670820951461792, "eoc/sigma_mean": 10.223070710897446, "eoc/time_s": 0.6477236747741699} {"step": 47510, "timestamp": 1778245898.1251256, "train/loss": 2.1298144459724426, "train/z_loss": 0.0013968881452456118, "train/perplexity": 8.41330554386613, "train/grad_norm": 0.126953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1148696.9801940278, "perf/iters_per_sec": 0.5477414036722316, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.8256790399551392, "data/tokens_consumed": 99637788672, "data/tokens_consumed_B": 99.637788672, "train/loss_slope": -5.998958426840914e-06} {"step": 47520, "timestamp": 1778245908.4994142, "train/loss": 2.170804500579834, "train/z_loss": 0.0014053620514459908, "train/perplexity": 8.765332920363651, "train/grad_norm": 0.19921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022478.4798259675, "perf/iters_per_sec": 0.9643928908471906, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036921787261963, "data/tokens_consumed": 99658760192, "data/tokens_consumed_B": 99.658760192, "train/loss_slope": -8.29583165740737e-06} {"step": 47530, "timestamp": 1778245918.8753405, "train/loss": 2.1049232959747313, "train/z_loss": 0.0014020868926309048, "train/perplexity": 8.206473515901836, "train/grad_norm": 0.1337890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022791.863451311, "perf/iters_per_sec": 0.9645423238045268, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0367611408233643, "data/tokens_consumed": 99679731712, "data/tokens_consumed_B": 99.679731712, "train/loss_slope": -1.25005715607953e-05} {"step": 47540, "timestamp": 1778245929.251526, "train/loss": 2.192633533477783, "train/z_loss": 0.0013851542491465807, "train/perplexity": 8.958775309864361, "train/grad_norm": 0.09326171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022551.3056488323, "perf/iters_per_sec": 0.9644276169056093, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0368844509124755, "data/tokens_consumed": 99700703232, "data/tokens_consumed_B": 99.700703232, "train/loss_slope": -9.599648243022541e-06} {"step": 47550, "timestamp": 1778245939.6222312, "grad/layer_0/attn": 0.003321722848340869, "grad/layer_0/mlp": 0.003123900853097439, "grad/layer_0/attn_mlp_ratio": 1.0633252776619835, "grad/layer_4/attn": 0.0019423245685175061, "grad/layer_4/mlp": 0.002462772186845541, "grad/layer_4/attn_mlp_ratio": 0.7886740397770782, "grad/layer_8/attn": 0.006067154463380575, "grad/layer_8/mlp": 0.003473631339147687, "grad/layer_8/attn_mlp_ratio": 1.746631607200461, "grad/layer_12/attn": 0.005544829647988081, "grad/layer_12/mlp": 0.006670236121863127, "grad/layer_12/attn_mlp_ratio": 0.8312793525683116, "grad/layer_16/attn": 0.003521823091432452, "grad/layer_16/mlp": 0.00447738217189908, "grad/layer_16/attn_mlp_ratio": 0.7865808362033543, "grad/layer_20/attn": 0.0033796823117882013, "grad/layer_20/mlp": 0.006223408039659262, "grad/layer_20/attn_mlp_ratio": 0.5430597248235824, "grad/layer_24/attn": 0.007662205025553703, "grad/layer_24/mlp": 0.00883904006332159, "grad/layer_24/attn_mlp_ratio": 0.8668593969454657, "grad/layer_27/attn": 0.0056898994371294975, "grad/layer_27/mlp": 0.007937304675579071, "grad/layer_27/attn_mlp_ratio": 0.7168553555655027} {"step": 47550, "timestamp": 1778245940.2347715, "eos/sharpness": 48.143267631530755, "eos/L0_probe": 1.9876331090927124, "eos/L_plus": 2.2651915550231934, "eos/L_minus": 2.191507339477539, "eos/grad_norm": 0.12902764976024628, "eos/embed_grad_frac": 0.13895758986473083, "eos/time_s": 0.6096842288970947} {"step": 47550, "timestamp": 1778245940.2553272, "train/loss": 2.158958911895752, "train/z_loss": 0.0013930172310210765, "train/perplexity": 8.66211493773549, "train/grad_norm": 0.12890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1906655.0949666116, "perf/iters_per_sec": 0.9091639971573885, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0999115705490112, "data/tokens_consumed": 99721674752, "data/tokens_consumed_B": 99.721674752, "train/loss_slope": -9.026369832494124e-06} {"step": 47550, "timestamp": 1778245941.6163075, "geo/rankme_last": 439.4982604980469, "geo/layer_0/stable_rank_q_proj": 19.414514541625977, "geo/layer_0/stable_rank_k_proj": 16.463050842285156, "geo/layer_0/stable_rank_o_proj": 48.17930221557617, "geo/layer_0/stable_rank_gate_proj": 134.85447692871094, "geo/layer_0/stable_rank_down_proj": 53.52935028076172, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06451326608657837, "geo/layer_0/attn_entropy_mean": 6.183230876922607, "geo/layer_0/attn_entropy_std": 0.3856217563152313, "geo/layer_7/stable_rank_q_proj": 42.86163330078125, "geo/layer_7/stable_rank_k_proj": 41.96756362915039, "geo/layer_7/stable_rank_o_proj": 94.49517059326172, "geo/layer_7/stable_rank_gate_proj": 87.5557632446289, "geo/layer_7/stable_rank_down_proj": 145.70643615722656, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.470655620098114, "geo/layer_7/attn_entropy_mean": 4.632735252380371, "geo/layer_7/attn_entropy_std": 0.8130343556404114, "geo/layer_14/stable_rank_q_proj": 53.02138900756836, "geo/layer_14/stable_rank_k_proj": 38.496822357177734, "geo/layer_14/stable_rank_o_proj": 46.26268005371094, "geo/layer_14/stable_rank_gate_proj": 74.31743621826172, "geo/layer_14/stable_rank_down_proj": 131.28729248046875, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39150676131248474, "geo/layer_14/attn_entropy_mean": 5.534274101257324, "geo/layer_14/attn_entropy_std": 0.37026455998420715, "geo/layer_21/stable_rank_q_proj": 42.11091232299805, "geo/layer_21/stable_rank_k_proj": 30.44532012939453, "geo/layer_21/stable_rank_o_proj": 74.08231353759766, "geo/layer_21/stable_rank_gate_proj": 70.27969360351562, "geo/layer_21/stable_rank_down_proj": 53.81953811645508, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14297670125961304, "geo/layer_21/attn_entropy_mean": 5.712650299072266, "geo/layer_21/attn_entropy_std": 0.2953149974346161, "geo/layer_27/stable_rank_q_proj": 42.69606018066406, "geo/layer_27/stable_rank_k_proj": 31.395998001098633, "geo/layer_27/stable_rank_o_proj": 115.6885986328125, "geo/layer_27/stable_rank_gate_proj": 83.06891632080078, "geo/layer_27/stable_rank_down_proj": 130.30548095703125, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09482865035533905, "geo/layer_27/attn_entropy_mean": 4.250062942504883, "geo/layer_27/attn_entropy_std": 0.6865306496620178, "attnres/final_alpha/block_0": 0.2396872341632843, "attnres/block_norm/0": 1.737908124923706, "attnres/final_alpha/block_1": 0.004819168709218502, "attnres/block_norm/1": 42713.79296875, "attnres/final_alpha/block_2": 0.010805260390043259, "attnres/block_norm/2": 27461.48046875, "attnres/final_alpha/block_3": 0.012761772610247135, "attnres/block_norm/3": 51167.9921875, "attnres/final_alpha/block_4": 0.015476925298571587, "attnres/block_norm/4": 13715.935546875, "attnres/final_alpha/block_5": 0.6038837432861328, "attnres/block_norm/5": 6248.037109375, "attnres/final_alpha/block_6": 0.11256592720746994, "attnres/block_norm/6": 33885.24609375, "geo/tier1_time_s": 1.3576478958129883, "geo/step": 47550.0, "geo/rankme_slope": -3.021437090461184e-05} {"step": 47560, "timestamp": 1778245951.9995925, "train/loss": 2.145881712436676, "train/z_loss": 0.0013963519129902124, "train/perplexity": 8.54957618262697, "train/grad_norm": 0.2119140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1786167.0424613268, "perf/iters_per_sec": 0.8517108166033396, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1741074323654175, "data/tokens_consumed": 99742646272, "data/tokens_consumed_B": 99.742646272, "train/loss_slope": -8.542379194145787e-06} {"step": 47570, "timestamp": 1778245962.3708034, "train/loss": 2.14610960483551, "train/z_loss": 0.0013948379550129176, "train/perplexity": 8.551524788079996, "train/grad_norm": 0.10546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023129.6804839028, "perf/iters_per_sec": 0.9647034075183405, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0365880250930786, "data/tokens_consumed": 99763617792, "data/tokens_consumed_B": 99.763617792, "train/loss_slope": -5.723892193887853e-06} {"step": 47580, "timestamp": 1778245972.7621756, "train/loss": 2.1498672485351564, "train/z_loss": 0.001389589544851333, "train/perplexity": 8.583718820292118, "train/grad_norm": 0.173828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021029.4720716027, "perf/iters_per_sec": 0.963701950107385, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0376652240753175, "data/tokens_consumed": 99784589312, "data/tokens_consumed_B": 99.784589312, "train/loss_slope": -3.597522025132677e-06} {"step": 47590, "timestamp": 1778245983.139861, "train/loss": 2.152617645263672, "train/z_loss": 0.001389085757546127, "train/perplexity": 8.607359948792057, "train/grad_norm": 0.193359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022234.6491593332, "perf/iters_per_sec": 0.96427662332503, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0370468139648437, "data/tokens_consumed": 99805560832, "data/tokens_consumed_B": 99.805560832, "train/loss_slope": -5.089704353030375e-06} {"step": 47600, "timestamp": 1778245993.505392, "grad/layer_0/attn": 0.002820027759298682, "grad/layer_0/mlp": 0.0028570189606398344, "grad/layer_0/attn_mlp_ratio": 0.9870524835305551, "grad/layer_4/attn": 0.0021478927228599787, "grad/layer_4/mlp": 0.002650131704285741, "grad/layer_4/attn_mlp_ratio": 0.8104852443136786, "grad/layer_8/attn": 0.003303298493847251, "grad/layer_8/mlp": 0.0035958481021225452, "grad/layer_8/attn_mlp_ratio": 0.9186423642403455, "grad/layer_12/attn": 0.0036327431444078684, "grad/layer_12/mlp": 0.006272838916629553, "grad/layer_12/attn_mlp_ratio": 0.5791226484176173, "grad/layer_16/attn": 0.00890870951116085, "grad/layer_16/mlp": 0.004891036543995142, "grad/layer_16/attn_mlp_ratio": 1.8214358549323721, "grad/layer_20/attn": 0.00404405128210783, "grad/layer_20/mlp": 0.006454612128436565, "grad/layer_20/attn_mlp_ratio": 0.6265366747039085, "grad/layer_24/attn": 0.02050412818789482, "grad/layer_24/mlp": 0.011966443620622158, "grad/layer_24/attn_mlp_ratio": 1.7134688188571343, "grad/layer_27/attn": 0.010884852148592472, "grad/layer_27/mlp": 0.012218260206282139, "grad/layer_27/attn_mlp_ratio": 0.8908675929089446} {"step": 47600, "timestamp": 1778245993.5193703, "train/loss": 2.1479676246643065, "train/z_loss": 0.001389363023918122, "train/perplexity": 8.56742846079705, "train/grad_norm": 0.2431640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021925.8534824029, "perf/iters_per_sec": 0.9641293780719771, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037205195426941, "data/tokens_consumed": 99826532352, "data/tokens_consumed_B": 99.826532352, "train/loss_slope": -4.327261022286799e-06} {"step": 47610, "timestamp": 1778246003.8946838, "train/loss": 2.1731454133987427, "train/z_loss": 0.0013946404214948416, "train/perplexity": 8.785875835764887, "train/grad_norm": 0.0927734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022831.217679082, "perf/iters_per_sec": 0.9645610893626604, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0367409706115722, "data/tokens_consumed": 99847503872, "data/tokens_consumed_B": 99.847503872, "train/loss_slope": -3.482618147354782e-06} {"step": 47620, "timestamp": 1778246014.2759652, "train/loss": 2.1499642372131347, "train/z_loss": 0.001391126634553075, "train/perplexity": 8.58455138420662, "train/grad_norm": 0.2490234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021028.357606479, "perf/iters_per_sec": 0.9637014186890025, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0376657962799072, "data/tokens_consumed": 99868475392, "data/tokens_consumed_B": 99.868475392, "train/loss_slope": -3.5020688686720402e-06} {"step": 47625, "timestamp": 1778246020.03726, "eos/sharpness": 34.086275100708, "eos/L0_probe": 1.9853506088256836, "eos/L_plus": 2.1456120014190674, "eos/L_minus": 2.16595196723938, "eos/grad_norm": 0.1087607592344284, "eos/embed_grad_frac": 0.2339528352022171, "eos/time_s": 0.5842170715332031} {"step": 47625, "timestamp": 1778246021.412295, "geo/rankme_last": 439.7525634765625, "geo/layer_0/stable_rank_q_proj": 19.40802574157715, "geo/layer_0/stable_rank_k_proj": 16.4376220703125, "geo/layer_0/stable_rank_o_proj": 48.21668243408203, "geo/layer_0/stable_rank_gate_proj": 134.70462036132812, "geo/layer_0/stable_rank_down_proj": 53.54668426513672, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06372348219156265, "geo/layer_0/attn_entropy_mean": 6.183206558227539, "geo/layer_0/attn_entropy_std": 0.3892063796520233, "geo/layer_7/stable_rank_q_proj": 42.850154876708984, "geo/layer_7/stable_rank_k_proj": 41.97753143310547, "geo/layer_7/stable_rank_o_proj": 94.49007415771484, "geo/layer_7/stable_rank_gate_proj": 87.59419250488281, "geo/layer_7/stable_rank_down_proj": 145.52444458007812, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4815828502178192, "geo/layer_7/attn_entropy_mean": 4.636085033416748, "geo/layer_7/attn_entropy_std": 0.8153373003005981, "geo/layer_14/stable_rank_q_proj": 53.051944732666016, "geo/layer_14/stable_rank_k_proj": 38.423553466796875, "geo/layer_14/stable_rank_o_proj": 46.323909759521484, "geo/layer_14/stable_rank_gate_proj": 74.20973205566406, "geo/layer_14/stable_rank_down_proj": 131.23068237304688, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39194267988204956, "geo/layer_14/attn_entropy_mean": 5.482436180114746, "geo/layer_14/attn_entropy_std": 0.36382025480270386, "geo/layer_21/stable_rank_q_proj": 42.01945495605469, "geo/layer_21/stable_rank_k_proj": 30.448888778686523, "geo/layer_21/stable_rank_o_proj": 73.9648666381836, "geo/layer_21/stable_rank_gate_proj": 70.25421142578125, "geo/layer_21/stable_rank_down_proj": 53.77933120727539, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1413567066192627, "geo/layer_21/attn_entropy_mean": 5.700035095214844, "geo/layer_21/attn_entropy_std": 0.2993846535682678, "geo/layer_27/stable_rank_q_proj": 42.711341857910156, "geo/layer_27/stable_rank_k_proj": 31.399761199951172, "geo/layer_27/stable_rank_o_proj": 115.642822265625, "geo/layer_27/stable_rank_gate_proj": 83.16923522949219, "geo/layer_27/stable_rank_down_proj": 130.30421447753906, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08974546194076538, "geo/layer_27/attn_entropy_mean": 4.244997024536133, "geo/layer_27/attn_entropy_std": 0.6729462146759033, "attnres/final_alpha/block_0": 0.24004265666007996, "attnres/block_norm/0": 1.7381693124771118, "attnres/final_alpha/block_1": 0.004939249716699123, "attnres/block_norm/1": 42708.6953125, "attnres/final_alpha/block_2": 0.010716801509261131, "attnres/block_norm/2": 27422.41015625, "attnres/final_alpha/block_3": 0.012610971927642822, "attnres/block_norm/3": 50928.7734375, "attnres/final_alpha/block_4": 0.015221982263028622, "attnres/block_norm/4": 13800.84375, "attnres/final_alpha/block_5": 0.603350818157196, "attnres/block_norm/5": 6277.390625, "attnres/final_alpha/block_6": 0.11311748623847961, "attnres/block_norm/6": 33862.9375, "geo/tier1_time_s": 1.3568859100341797, "geo/step": 47625.0, "geo/rankme_slope": -4.2047189969737904e-05} {"step": 47630, "timestamp": 1778246026.5989885, "train/loss": 2.1803030252456663, "train/z_loss": 0.001387755526229739, "train/perplexity": 8.848987319042136, "train/grad_norm": 0.142578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1702680.2552266708, "perf/iters_per_sec": 0.8119012142308573, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2316769361495972, "data/tokens_consumed": 99889446912, "data/tokens_consumed_B": 99.889446912, "train/loss_slope": -4.9975372955959236e-06} {"step": 47640, "timestamp": 1778246036.9746203, "train/loss": 2.1226491689682008, "train/z_loss": 0.001392963936086744, "train/perplexity": 8.353237338784847, "train/grad_norm": 0.130859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022586.4648197615, "perf/iters_per_sec": 0.9644443821047599, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0368664264678955, "data/tokens_consumed": 99910418432, "data/tokens_consumed_B": 99.910418432, "train/loss_slope": -1.0183280650967824e-05} {"step": 47650, "timestamp": 1778246047.3466747, "grad/layer_0/attn": 0.00398434279486537, "grad/layer_0/mlp": 0.0034359802957624197, "grad/layer_0/attn_mlp_ratio": 1.1595941582726266, "grad/layer_4/attn": 0.0021735995542258024, "grad/layer_4/mlp": 0.002670967485755682, "grad/layer_4/attn_mlp_ratio": 0.8137873203020687, "grad/layer_8/attn": 0.004044530913233757, "grad/layer_8/mlp": 0.003623438533395529, "grad/layer_8/attn_mlp_ratio": 1.1162134432075705, "grad/layer_12/attn": 0.006384292617440224, "grad/layer_12/mlp": 0.006275319494307041, "grad/layer_12/attn_mlp_ratio": 1.0173653343858444, "grad/layer_16/attn": 0.003917450550943613, "grad/layer_16/mlp": 0.004867363255470991, "grad/layer_16/attn_mlp_ratio": 0.8048403755475413, "grad/layer_20/attn": 0.003988946788012981, "grad/layer_20/mlp": 0.005964767653495073, "grad/layer_20/attn_mlp_ratio": 0.6687514003668703, "grad/layer_24/attn": 0.007667618803679943, "grad/layer_24/mlp": 0.010476229712367058, "grad/layer_24/attn_mlp_ratio": 0.7319063194498097, "grad/layer_27/attn": 0.006419793702661991, "grad/layer_27/mlp": 0.01028474885970354, "grad/layer_27/attn_mlp_ratio": 0.6242051923498814} {"step": 47650, "timestamp": 1778246047.3607638, "train/loss": 2.2020405769348144, "train/z_loss": 0.0013828196912072598, "train/perplexity": 9.043448536476257, "train/grad_norm": 0.166015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020087.723197812, "perf/iters_per_sec": 0.9632528892506657, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0381489753723145, "data/tokens_consumed": 99931389952, "data/tokens_consumed_B": 99.931389952, "train/loss_slope": -9.492439269447336e-06} {"step": 47660, "timestamp": 1778246057.7378848, "train/loss": 2.1611135959625245, "train/z_loss": 0.0013926746556535362, "train/perplexity": 8.680799180868151, "train/grad_norm": 0.09912109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022423.1896481537, "perf/iters_per_sec": 0.9643665264359254, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036950135231018, "data/tokens_consumed": 99952361472, "data/tokens_consumed_B": 99.952361472, "train/loss_slope": -8.793737213782623e-06} {"step": 47670, "timestamp": 1778246068.1128762, "train/loss": 2.1822283267974854, "train/z_loss": 0.0013709637685678899, "train/perplexity": 8.866040699241427, "train/grad_norm": 0.283203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022771.024007573, "perf/iters_per_sec": 0.9645323867833963, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036771821975708, "data/tokens_consumed": 99973332992, "data/tokens_consumed_B": 99.973332992, "train/loss_slope": -1.0525425961881075e-05} {"step": 47680, "timestamp": 1778246078.490987, "train/loss": 2.169756901264191, "train/z_loss": 0.0013935772818513214, "train/perplexity": 8.75615517173584, "train/grad_norm": 0.2080078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021680.9943880788, "perf/iters_per_sec": 0.9640126201572794, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373308181762695, "data/tokens_consumed": 99994304512, "data/tokens_consumed_B": 99.994304512, "train/loss_slope": -9.240943163034307e-06} {"step": 47690, "timestamp": 1778246088.8709178, "train/loss": 2.1760947585105894, "train/z_loss": 0.0013988217920996248, "train/perplexity": 8.811826665879966, "train/grad_norm": 0.130859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021696.235356418, "perf/iters_per_sec": 0.9640198876173105, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037322998046875, "data/tokens_consumed": 100015276032, "data/tokens_consumed_B": 100.015276032, "train/loss_slope": -7.182707494706588e-06} {"step": 47700, "timestamp": 1778246099.2368681, "grad/layer_0/attn": 0.0030445002485066652, "grad/layer_0/mlp": 0.003064583521336317, "grad/layer_0/attn_mlp_ratio": 0.9934466226700989, "grad/layer_4/attn": 0.002009625779464841, "grad/layer_4/mlp": 0.0026363609358668327, "grad/layer_4/attn_mlp_ratio": 0.7622725992853547, "grad/layer_8/attn": 0.0068600974045693874, "grad/layer_8/mlp": 0.0037512662820518017, "grad/layer_8/attn_mlp_ratio": 1.8287417383612123, "grad/layer_12/attn": 0.004879513289779425, "grad/layer_12/mlp": 0.006542804650962353, "grad/layer_12/attn_mlp_ratio": 0.7457831122137213, "grad/layer_16/attn": 0.007885372266173363, "grad/layer_16/mlp": 0.004646969493478537, "grad/layer_16/attn_mlp_ratio": 1.6968848423797598, "grad/layer_20/attn": 0.004451883491128683, "grad/layer_20/mlp": 0.005832367110997438, "grad/layer_20/attn_mlp_ratio": 0.7633064466061512, "grad/layer_24/attn": 0.008435092866420746, "grad/layer_24/mlp": 0.008744148537516594, "grad/layer_24/attn_mlp_ratio": 0.9646557047566815, "grad/layer_27/attn": 0.005938827525824308, "grad/layer_27/mlp": 0.007228406146168709, "grad/layer_27/attn_mlp_ratio": 0.8215957049968077} {"step": 47700, "timestamp": 1778246099.8242517, "eos/sharpness": 20.9660530090332, "eos/L0_probe": 1.9870021343231201, "eos/L_plus": 2.1016225814819336, "eos/L_minus": 2.0820422172546387, "eos/grad_norm": 0.11260925233364105, "eos/embed_grad_frac": 0.1986878216266632, "eos/time_s": 0.5846397876739502} {"step": 47700, "timestamp": 1778246099.841939, "train/loss": 2.0909383058547975, "train/z_loss": 0.0014132115989923476, "train/perplexity": 8.092504847704612, "train/grad_norm": 0.11279296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1912780.485645828, "perf/iters_per_sec": 0.91208481104175, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0963892698287965, "data/tokens_consumed": 100036247552, "data/tokens_consumed_B": 100.036247552, "train/loss_slope": -1.231638708523796e-05} {"step": 47700, "timestamp": 1778246101.2003908, "geo/rankme_last": 439.58441162109375, "geo/layer_0/stable_rank_q_proj": 19.405031204223633, "geo/layer_0/stable_rank_k_proj": 16.42889404296875, "geo/layer_0/stable_rank_o_proj": 48.20270538330078, "geo/layer_0/stable_rank_gate_proj": 134.67442321777344, "geo/layer_0/stable_rank_down_proj": 53.526161193847656, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06373893469572067, "geo/layer_0/attn_entropy_mean": 6.182404518127441, "geo/layer_0/attn_entropy_std": 0.38875362277030945, "geo/layer_7/stable_rank_q_proj": 42.82119369506836, "geo/layer_7/stable_rank_k_proj": 41.955291748046875, "geo/layer_7/stable_rank_o_proj": 94.50540924072266, "geo/layer_7/stable_rank_gate_proj": 87.7395248413086, "geo/layer_7/stable_rank_down_proj": 145.35433959960938, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.48120659589767456, "geo/layer_7/attn_entropy_mean": 4.6555280685424805, "geo/layer_7/attn_entropy_std": 0.8055896759033203, "geo/layer_14/stable_rank_q_proj": 53.08488845825195, "geo/layer_14/stable_rank_k_proj": 38.46183776855469, "geo/layer_14/stable_rank_o_proj": 46.31524658203125, "geo/layer_14/stable_rank_gate_proj": 74.38931274414062, "geo/layer_14/stable_rank_down_proj": 130.81358337402344, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3748033046722412, "geo/layer_14/attn_entropy_mean": 5.50834846496582, "geo/layer_14/attn_entropy_std": 0.3645579516887665, "geo/layer_21/stable_rank_q_proj": 42.0393180847168, "geo/layer_21/stable_rank_k_proj": 30.426231384277344, "geo/layer_21/stable_rank_o_proj": 73.86701202392578, "geo/layer_21/stable_rank_gate_proj": 70.22570037841797, "geo/layer_21/stable_rank_down_proj": 53.74764633178711, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14508259296417236, "geo/layer_21/attn_entropy_mean": 5.711472988128662, "geo/layer_21/attn_entropy_std": 0.3008655607700348, "geo/layer_27/stable_rank_q_proj": 42.81340789794922, "geo/layer_27/stable_rank_k_proj": 31.492958068847656, "geo/layer_27/stable_rank_o_proj": 115.72738647460938, "geo/layer_27/stable_rank_gate_proj": 83.09700012207031, "geo/layer_27/stable_rank_down_proj": 130.3177490234375, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09646442532539368, "geo/layer_27/attn_entropy_mean": 4.253355979919434, "geo/layer_27/attn_entropy_std": 0.6709272265434265, "attnres/final_alpha/block_0": 0.2412375807762146, "attnres/block_norm/0": 1.7384147644042969, "attnres/final_alpha/block_1": 0.0049147955141961575, "attnres/block_norm/1": 42811.859375, "attnres/final_alpha/block_2": 0.010783955454826355, "attnres/block_norm/2": 27358.60546875, "attnres/final_alpha/block_3": 0.01253382209688425, "attnres/block_norm/3": 51384.5078125, "attnres/final_alpha/block_4": 0.015273834578692913, "attnres/block_norm/4": 13775.646484375, "attnres/final_alpha/block_5": 0.6028658747673035, "attnres/block_norm/5": 6261.3671875, "attnres/final_alpha/block_6": 0.11239011585712433, "attnres/block_norm/6": 33914.15625, "geo/tier1_time_s": 1.35471510887146, "geo/step": 47700.0, "geo/rankme_slope": -2.1183707858143252e-05} {"step": 47710, "timestamp": 1778246111.575221, "train/loss": 2.115322303771973, "train/z_loss": 0.0013970376923680306, "train/perplexity": 8.292257961485928, "train/grad_norm": 0.1318359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1787876.1857630054, "perf/iters_per_sec": 0.8525257996382739, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.172985029220581, "data/tokens_consumed": 100057219072, "data/tokens_consumed_B": 100.057219072, "train/loss_slope": -1.6235539915323522e-05} {"step": 47720, "timestamp": 1778246121.9506443, "train/loss": 2.150044059753418, "train/z_loss": 0.0013933901907876133, "train/perplexity": 8.585236652254855, "train/grad_norm": 0.12451171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022589.5343292083, "perf/iters_per_sec": 0.9644458457609216, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0368648529052735, "data/tokens_consumed": 100078190592, "data/tokens_consumed_B": 100.078190592, "train/loss_slope": -1.5329612192004608e-05} {"step": 47730, "timestamp": 1778246132.3248239, "train/loss": 2.2352591276168825, "train/z_loss": 0.0013826492708176375, "train/perplexity": 9.348904097587011, "train/grad_norm": 0.1416015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022825.3563140666, "perf/iters_per_sec": 0.9645582944460233, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036743974685669, "data/tokens_consumed": 100099162112, "data/tokens_consumed_B": 100.099162112, "train/loss_slope": -1.4205362451280395e-05} {"step": 47740, "timestamp": 1778246142.6955166, "train/loss": 2.115351986885071, "train/z_loss": 0.0014083478832617401, "train/perplexity": 8.292504105169975, "train/grad_norm": 0.0888671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023087.3366739375, "perf/iters_per_sec": 0.9646832164163291, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0366097211837768, "data/tokens_consumed": 100120133632, "data/tokens_consumed_B": 100.120133632, "train/loss_slope": -1.3708848285131405e-05} {"step": 47750, "timestamp": 1778246153.0292273, "grad/layer_0/attn": 0.0027327609714120626, "grad/layer_0/mlp": 0.002910129725933075, "grad/layer_0/attn_mlp_ratio": 0.9390512227528741, "grad/layer_4/attn": 0.0023805834352970123, "grad/layer_4/mlp": 0.0026124862488359213, "grad/layer_4/attn_mlp_ratio": 0.9112328706933757, "grad/layer_8/attn": 0.0037156324833631516, "grad/layer_8/mlp": 0.0037912591360509396, "grad/layer_8/attn_mlp_ratio": 0.980052339347134, "grad/layer_12/attn": 0.0044886283576488495, "grad/layer_12/mlp": 0.006331213749945164, "grad/layer_12/attn_mlp_ratio": 0.7089680532095322, "grad/layer_16/attn": 0.0038024450186640024, "grad/layer_16/mlp": 0.004654306452721357, "grad/layer_16/attn_mlp_ratio": 0.8169734794199849, "grad/layer_20/attn": 0.004552660975605249, "grad/layer_20/mlp": 0.005854360293596983, "grad/layer_20/attn_mlp_ratio": 0.7776530089580025, "grad/layer_24/attn": 0.0056691826321184635, "grad/layer_24/mlp": 0.008129904046654701, "grad/layer_24/attn_mlp_ratio": 0.6973246584280115, "grad/layer_27/attn": 0.005797365680336952, "grad/layer_27/mlp": 0.007288170047104359, "grad/layer_27/attn_mlp_ratio": 0.7954487290119436} {"step": 47750, "timestamp": 1778246153.043373, "train/loss": 2.168398714065552, "train/z_loss": 0.0013913069968111813, "train/perplexity": 8.744270746336417, "train/grad_norm": 0.11865234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028082.517877937, "perf/iters_per_sec": 0.967065104426354, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034056544303894, "data/tokens_consumed": 100141105152, "data/tokens_consumed_B": 100.141105152, "train/loss_slope": -1.1561060488277673e-05} {"step": 47760, "timestamp": 1778246163.3892677, "train/loss": 2.180259943008423, "train/z_loss": 0.0013918716460466385, "train/perplexity": 8.848606093083186, "train/grad_norm": 0.310546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028283.2816317186, "perf/iters_per_sec": 0.9671608360441774, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0339541912078858, "data/tokens_consumed": 100162076672, "data/tokens_consumed_B": 100.162076672, "train/loss_slope": -7.203023161145614e-06} {"step": 47770, "timestamp": 1778246173.7341504, "train/loss": 2.1480852723121644, "train/z_loss": 0.0013936827424913644, "train/perplexity": 8.568436457896736, "train/grad_norm": 0.1357421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028408.0249042334, "perf/iters_per_sec": 0.9672203182717483, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0338906049728394, "data/tokens_consumed": 100183048192, "data/tokens_consumed_B": 100.183048192, "train/loss_slope": -6.427376555709773e-06} {"step": 47775, "timestamp": 1778246179.4903467, "eos/sharpness": 44.09584999084472, "eos/L0_probe": 1.9849679470062256, "eos/L_plus": 2.185675859451294, "eos/L_minus": 2.2252185344696045, "eos/grad_norm": 0.11578288674354553, "eos/embed_grad_frac": 0.1754404753446579, "eos/time_s": 0.591480016708374} {"step": 47775, "timestamp": 1778246180.8642354, "geo/rankme_last": 438.9007873535156, "geo/layer_0/stable_rank_q_proj": 19.41098976135254, "geo/layer_0/stable_rank_k_proj": 16.441312789916992, "geo/layer_0/stable_rank_o_proj": 48.250648498535156, "geo/layer_0/stable_rank_gate_proj": 135.12033081054688, "geo/layer_0/stable_rank_down_proj": 53.667808532714844, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06364797800779343, "geo/layer_0/attn_entropy_mean": 6.186017990112305, "geo/layer_0/attn_entropy_std": 0.38639748096466064, "geo/layer_7/stable_rank_q_proj": 42.80677795410156, "geo/layer_7/stable_rank_k_proj": 42.101654052734375, "geo/layer_7/stable_rank_o_proj": 94.39139556884766, "geo/layer_7/stable_rank_gate_proj": 87.66454315185547, "geo/layer_7/stable_rank_down_proj": 145.5692901611328, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.48422813415527344, "geo/layer_7/attn_entropy_mean": 4.638973236083984, "geo/layer_7/attn_entropy_std": 0.8119316101074219, "geo/layer_14/stable_rank_q_proj": 53.088050842285156, "geo/layer_14/stable_rank_k_proj": 38.495723724365234, "geo/layer_14/stable_rank_o_proj": 46.288761138916016, "geo/layer_14/stable_rank_gate_proj": 74.33241271972656, "geo/layer_14/stable_rank_down_proj": 130.90151977539062, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3774111866950989, "geo/layer_14/attn_entropy_mean": 5.509219169616699, "geo/layer_14/attn_entropy_std": 0.37305745482444763, "geo/layer_21/stable_rank_q_proj": 42.032989501953125, "geo/layer_21/stable_rank_k_proj": 30.456022262573242, "geo/layer_21/stable_rank_o_proj": 73.72639465332031, "geo/layer_21/stable_rank_gate_proj": 70.27062225341797, "geo/layer_21/stable_rank_down_proj": 53.77742385864258, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14386022090911865, "geo/layer_21/attn_entropy_mean": 5.700030326843262, "geo/layer_21/attn_entropy_std": 0.2969340682029724, "geo/layer_27/stable_rank_q_proj": 42.81437683105469, "geo/layer_27/stable_rank_k_proj": 31.49484634399414, "geo/layer_27/stable_rank_o_proj": 115.9046630859375, "geo/layer_27/stable_rank_gate_proj": 83.10708618164062, "geo/layer_27/stable_rank_down_proj": 130.3746337890625, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10145866125822067, "geo/layer_27/attn_entropy_mean": 4.264148712158203, "geo/layer_27/attn_entropy_std": 0.6814196109771729, "attnres/final_alpha/block_0": 0.2429353892803192, "attnres/block_norm/0": 1.738523006439209, "attnres/final_alpha/block_1": 0.004952558781951666, "attnres/block_norm/1": 42802.3046875, "attnres/final_alpha/block_2": 0.010924968868494034, "attnres/block_norm/2": 27428.74609375, "attnres/final_alpha/block_3": 0.012868759222328663, "attnres/block_norm/3": 51488.04296875, "attnres/final_alpha/block_4": 0.015584006905555725, "attnres/block_norm/4": 13760.673828125, "attnres/final_alpha/block_5": 0.5984112024307251, "attnres/block_norm/5": 6306.1455078125, "attnres/final_alpha/block_6": 0.11432305723428726, "attnres/block_norm/6": 34023.47265625, "geo/tier1_time_s": 1.3560972213745117, "geo/step": 47775.0, "geo/rankme_slope": -4.206649456657663e-05} {"step": 47780, "timestamp": 1778246186.0383918, "train/loss": 2.1334524154663086, "train/z_loss": 0.0013978857081383468, "train/perplexity": 8.443968634653354, "train/grad_norm": 0.1005859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1705485.0798697013, "perf/iters_per_sec": 0.813238658842898, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2296513319015503, "data/tokens_consumed": 100204019712, "data/tokens_consumed_B": 100.204019712, "train/loss_slope": -6.865529973979761e-06} {"step": 47790, "timestamp": 1778246196.3808055, "train/loss": 2.150808942317963, "train/z_loss": 0.001393058116082102, "train/perplexity": 8.591805862099417, "train/grad_norm": 0.17578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028661.394049724, "perf/iters_per_sec": 0.9673411340950604, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033761477470398, "data/tokens_consumed": 100224991232, "data/tokens_consumed_B": 100.224991232, "train/loss_slope": -9.470710037636589e-06} {"step": 47800, "timestamp": 1778246206.7145061, "grad/layer_0/attn": 0.002773071639239788, "grad/layer_0/mlp": 0.0030676890164613724, "grad/layer_0/attn_mlp_ratio": 0.9039611036070597, "grad/layer_4/attn": 0.002033120719715953, "grad/layer_4/mlp": 0.002675109077244997, "grad/layer_4/attn_mlp_ratio": 0.7600141097081481, "grad/layer_8/attn": 0.0034026470966637135, "grad/layer_8/mlp": 0.0036068963818252087, "grad/layer_8/attn_mlp_ratio": 0.9433725403014233, "grad/layer_12/attn": 0.005001287441700697, "grad/layer_12/mlp": 0.006679099053144455, "grad/layer_12/attn_mlp_ratio": 0.7487967055177104, "grad/layer_16/attn": 0.005487832706421614, "grad/layer_16/mlp": 0.004521551076322794, "grad/layer_16/attn_mlp_ratio": 1.213705759907966, "grad/layer_20/attn": 0.003911841195076704, "grad/layer_20/mlp": 0.0074128806591033936, "grad/layer_20/attn_mlp_ratio": 0.5277086361159614, "grad/layer_24/attn": 0.020121349021792412, "grad/layer_24/mlp": 0.014828496612608433, "grad/layer_24/attn_mlp_ratio": 1.3569378886993684, "grad/layer_27/attn": 0.013147078454494476, "grad/layer_27/mlp": 0.013798311352729797, "grad/layer_27/attn_mlp_ratio": 0.9528034281247881} {"step": 47800, "timestamp": 1778246206.7288468, "train/loss": 2.1828115224838256, "train/z_loss": 0.0013914014678448438, "train/perplexity": 8.871212843971795, "train/grad_norm": 0.27734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027774.1776450635, "perf/iters_per_sec": 0.9669180763459508, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342137813568115, "data/tokens_consumed": 100245962752, "data/tokens_consumed_B": 100.245962752, "train/loss_slope": -7.240192867992094e-06} {"step": 47810, "timestamp": 1778246217.0777593, "train/loss": 2.1680132865905763, "train/z_loss": 0.0013817599159665407, "train/perplexity": 8.740901113558396, "train/grad_norm": 0.2392578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027769.1757788714, "perf/iters_per_sec": 0.9669156912702901, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034216332435608, "data/tokens_consumed": 100266934272, "data/tokens_consumed_B": 100.266934272, "train/loss_slope": -6.860158374064167e-06} {"step": 47820, "timestamp": 1778246227.4183428, "train/loss": 2.1589682579040526, "train/z_loss": 0.0013918353361077607, "train/perplexity": 8.662195894311909, "train/grad_norm": 0.1640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029048.680853879, "perf/iters_per_sec": 0.9675258068341632, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0335641622543335, "data/tokens_consumed": 100287905792, "data/tokens_consumed_B": 100.287905792, "train/loss_slope": -6.677921756122913e-06} {"step": 47830, "timestamp": 1778246237.7604222, "train/loss": 2.205456185340881, "train/z_loss": 0.0013897133292630314, "train/perplexity": 9.074390227585948, "train/grad_norm": 0.1279296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029203.6650703275, "perf/iters_per_sec": 0.9675997090675008, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033485221862793, "data/tokens_consumed": 100308877312, "data/tokens_consumed_B": 100.308877312, "train/loss_slope": -3.994785204972399e-06} {"step": 47840, "timestamp": 1778246248.1105611, "train/loss": 2.125962829589844, "train/z_loss": 0.0013972088228911162, "train/perplexity": 8.380963043836319, "train/grad_norm": 0.44921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027560.3815682603, "perf/iters_per_sec": 0.966816130432253, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343228340148927, "data/tokens_consumed": 100329848832, "data/tokens_consumed_B": 100.329848832, "train/loss_slope": -4.905051388184048e-06} {"step": 47850, "timestamp": 1778246258.4474049, "grad/layer_0/attn": 0.0028282776474952698, "grad/layer_0/mlp": 0.002815351588651538, "grad/layer_0/attn_mlp_ratio": 1.0045912412633333, "grad/layer_4/attn": 0.001981466542929411, "grad/layer_4/mlp": 0.00262839300557971, "grad/layer_4/attn_mlp_ratio": 0.7538699362447099, "grad/layer_8/attn": 0.0038371612317860126, "grad/layer_8/mlp": 0.0037513920105993748, "grad/layer_8/attn_mlp_ratio": 1.022863278126619, "grad/layer_12/attn": 0.006167737301439047, "grad/layer_12/mlp": 0.0064091188833117485, "grad/layer_12/attn_mlp_ratio": 0.9623377748952363, "grad/layer_16/attn": 0.0035693515092134476, "grad/layer_16/mlp": 0.00469451816752553, "grad/layer_16/attn_mlp_ratio": 0.7603232761718158, "grad/layer_20/attn": 0.0043182712979614735, "grad/layer_20/mlp": 0.005913084372878075, "grad/layer_20/attn_mlp_ratio": 0.7302908182300399, "grad/layer_24/attn": 0.006334638223052025, "grad/layer_24/mlp": 0.008404669351875782, "grad/layer_24/attn_mlp_ratio": 0.7537046232839341, "grad/layer_27/attn": 0.007750401739031076, "grad/layer_27/mlp": 0.0067886351607739925, "grad/layer_27/attn_mlp_ratio": 1.1416730228260092} {"step": 47850, "timestamp": 1778246259.0455894, "eos/sharpness": 16.16852283477783, "eos/L0_probe": 1.9848647117614746, "eos/L_plus": 2.081892490386963, "eos/L_minus": 2.0495221614837646, "eos/grad_norm": 0.095527283847332, "eos/embed_grad_frac": 0.24894462525844574, "eos/time_s": 0.5952823162078857} {"step": 47850, "timestamp": 1778246259.0653667, "train/loss": 2.1748222589492796, "train/z_loss": 0.0013764264760538936, "train/perplexity": 8.80062075158591, "train/grad_norm": 0.095703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1915344.0017493116, "perf/iters_per_sec": 0.913307190775543, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.094921851158142, "data/tokens_consumed": 100350820352, "data/tokens_consumed_B": 100.350820352, "train/loss_slope": -7.54012295407937e-06} {"step": 47850, "timestamp": 1778246260.4247873, "geo/rankme_last": 439.2284240722656, "geo/layer_0/stable_rank_q_proj": 19.40020179748535, "geo/layer_0/stable_rank_k_proj": 16.449451446533203, "geo/layer_0/stable_rank_o_proj": 48.22648620605469, "geo/layer_0/stable_rank_gate_proj": 134.69668579101562, "geo/layer_0/stable_rank_down_proj": 53.64665222167969, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06008167192339897, "geo/layer_0/attn_entropy_mean": 6.184606552124023, "geo/layer_0/attn_entropy_std": 0.390116423368454, "geo/layer_7/stable_rank_q_proj": 42.806373596191406, "geo/layer_7/stable_rank_k_proj": 42.03116989135742, "geo/layer_7/stable_rank_o_proj": 94.22400665283203, "geo/layer_7/stable_rank_gate_proj": 87.59620666503906, "geo/layer_7/stable_rank_down_proj": 145.7710723876953, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.48922187089920044, "geo/layer_7/attn_entropy_mean": 4.648357391357422, "geo/layer_7/attn_entropy_std": 0.8216078281402588, "geo/layer_14/stable_rank_q_proj": 53.073760986328125, "geo/layer_14/stable_rank_k_proj": 38.5010986328125, "geo/layer_14/stable_rank_o_proj": 46.27766418457031, "geo/layer_14/stable_rank_gate_proj": 74.27958679199219, "geo/layer_14/stable_rank_down_proj": 131.0895538330078, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38629764318466187, "geo/layer_14/attn_entropy_mean": 5.518022060394287, "geo/layer_14/attn_entropy_std": 0.3574228584766388, "geo/layer_21/stable_rank_q_proj": 41.923919677734375, "geo/layer_21/stable_rank_k_proj": 30.4857177734375, "geo/layer_21/stable_rank_o_proj": 73.77367401123047, "geo/layer_21/stable_rank_gate_proj": 70.359375, "geo/layer_21/stable_rank_down_proj": 53.782344818115234, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14195096492767334, "geo/layer_21/attn_entropy_mean": 5.693119525909424, "geo/layer_21/attn_entropy_std": 0.29903644323349, "geo/layer_27/stable_rank_q_proj": 42.732452392578125, "geo/layer_27/stable_rank_k_proj": 31.468204498291016, "geo/layer_27/stable_rank_o_proj": 115.91464233398438, "geo/layer_27/stable_rank_gate_proj": 83.1292953491211, "geo/layer_27/stable_rank_down_proj": 130.35145568847656, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08389392495155334, "geo/layer_27/attn_entropy_mean": 4.270096302032471, "geo/layer_27/attn_entropy_std": 0.682834267616272, "attnres/final_alpha/block_0": 0.24064719676971436, "attnres/block_norm/0": 1.738551378250122, "attnres/final_alpha/block_1": 0.00493065919727087, "attnres/block_norm/1": 42821.49609375, "attnres/final_alpha/block_2": 0.010626357048749924, "attnres/block_norm/2": 27416.3125, "attnres/final_alpha/block_3": 0.012712646275758743, "attnres/block_norm/3": 51223.4453125, "attnres/final_alpha/block_4": 0.015270535834133625, "attnres/block_norm/4": 13801.3837890625, "attnres/final_alpha/block_5": 0.6022062301635742, "attnres/block_norm/5": 6278.509765625, "attnres/final_alpha/block_6": 0.11360639333724976, "attnres/block_norm/6": 33673.5390625, "geo/tier1_time_s": 1.3555772304534912, "geo/step": 47850.0, "geo/rankme_slope": -3.657664237570028e-05} {"step": 47860, "timestamp": 1778246270.7813258, "train/loss": 2.1444853782653808, "train/z_loss": 0.001387709064874798, "train/perplexity": 8.537646448139368, "train/grad_norm": 0.12890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1790515.490229346, "perf/iters_per_sec": 0.8537843180796366, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1712559938430787, "data/tokens_consumed": 100371791872, "data/tokens_consumed_B": 100.371791872, "train/loss_slope": -8.255666038348432e-06} {"step": 47870, "timestamp": 1778246281.1349297, "train/loss": 2.1358912587165833, "train/z_loss": 0.001400232093874365, "train/perplexity": 8.46458728316949, "train/grad_norm": 0.1083984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026930.847180386, "perf/iters_per_sec": 0.9665159450437479, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034644079208374, "data/tokens_consumed": 100392763392, "data/tokens_consumed_B": 100.392763392, "train/loss_slope": -7.847224775463744e-06} {"step": 47880, "timestamp": 1778246291.4814105, "train/loss": 2.205682396888733, "train/z_loss": 0.0013907909044064582, "train/perplexity": 9.076443191638475, "train/grad_norm": 0.20703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027950.9884307496, "perf/iters_per_sec": 0.9670023862985371, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341236114501953, "data/tokens_consumed": 100413734912, "data/tokens_consumed_B": 100.413734912, "train/loss_slope": -4.830891013753518e-06} {"step": 47890, "timestamp": 1778246301.8273957, "train/loss": 2.166133260726929, "train/z_loss": 0.0013831002986989915, "train/perplexity": 8.724483431062865, "train/grad_norm": 0.302734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028235.2967427198, "perf/iters_per_sec": 0.9671379550660705, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0339786529541015, "data/tokens_consumed": 100434706432, "data/tokens_consumed_B": 100.434706432, "train/loss_slope": -2.350591661834998e-06} {"step": 47900, "timestamp": 1778246312.16561, "grad/layer_0/attn": 0.003324873512610793, "grad/layer_0/mlp": 0.0032833146397024393, "grad/layer_0/attn_mlp_ratio": 1.0126575659670447, "grad/layer_4/attn": 0.0019419918535277247, "grad/layer_4/mlp": 0.002719780197367072, "grad/layer_4/attn_mlp_ratio": 0.7140252671907734, "grad/layer_8/attn": 0.005927727557718754, "grad/layer_8/mlp": 0.0036873375065624714, "grad/layer_8/attn_mlp_ratio": 1.6075901341848897, "grad/layer_12/attn": 0.004326896276324987, "grad/layer_12/mlp": 0.006420999765396118, "grad/layer_12/attn_mlp_ratio": 0.6738664331147836, "grad/layer_16/attn": 0.0061601088382303715, "grad/layer_16/mlp": 0.004801768809556961, "grad/layer_16/attn_mlp_ratio": 1.2828832362111193, "grad/layer_20/attn": 0.003602362237870693, "grad/layer_20/mlp": 0.0069382209330797195, "grad/layer_20/attn_mlp_ratio": 0.5192054592518057, "grad/layer_24/attn": 0.01573840342462063, "grad/layer_24/mlp": 0.011618147604167461, "grad/layer_24/attn_mlp_ratio": 1.354639640101599, "grad/layer_27/attn": 0.009636531583964825, "grad/layer_27/mlp": 0.011315159499645233, "grad/layer_27/attn_mlp_ratio": 0.851647871079694} {"step": 47900, "timestamp": 1778246312.1800175, "train/loss": 2.176699995994568, "train/z_loss": 0.0013785684946924447, "train/perplexity": 8.817161527946872, "train/grad_norm": 0.203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026948.8297865598, "perf/iters_per_sec": 0.9665245198185729, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346349000930786, "data/tokens_consumed": 100455677952, "data/tokens_consumed_B": 100.455677952, "train/loss_slope": -2.54988026554583e-06} {"step": 47910, "timestamp": 1778246322.5238633, "train/loss": 2.1625141382217405, "train/z_loss": 0.001391408103518188, "train/perplexity": 8.692965524715364, "train/grad_norm": 0.15625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028486.8451178118, "perf/iters_per_sec": 0.9672579026784, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0338504314422607, "data/tokens_consumed": 100476649472, "data/tokens_consumed_B": 100.476649472, "train/loss_slope": -5.887413539938031e-06} {"step": 47920, "timestamp": 1778246332.8676486, "train/loss": 2.2498111009597777, "train/z_loss": 0.001378788659349084, "train/perplexity": 9.485943781429212, "train/grad_norm": 0.1767578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028472.5775034486, "perf/iters_per_sec": 0.9672510993497127, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0338577032089233, "data/tokens_consumed": 100497620992, "data/tokens_consumed_B": 100.497620992, "train/loss_slope": 1.4092807615264047e-06} {"step": 47925, "timestamp": 1778246338.636932, "eos/sharpness": 67.07890033721922, "eos/L0_probe": 1.9881408214569092, "eos/L_plus": 2.3805994987487793, "eos/L_minus": 2.2664711475372314, "eos/grad_norm": 0.1843090057373047, "eos/embed_grad_frac": 0.06596209108829498, "eos/time_s": 0.6051061153411865} {"step": 47925, "timestamp": 1778246340.0170147, "geo/rankme_last": 439.81964111328125, "geo/layer_0/stable_rank_q_proj": 19.384069442749023, "geo/layer_0/stable_rank_k_proj": 16.462160110473633, "geo/layer_0/stable_rank_o_proj": 48.2111701965332, "geo/layer_0/stable_rank_gate_proj": 134.37759399414062, "geo/layer_0/stable_rank_down_proj": 53.644630432128906, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06239994987845421, "geo/layer_0/attn_entropy_mean": 6.180550575256348, "geo/layer_0/attn_entropy_std": 0.3874480128288269, "geo/layer_7/stable_rank_q_proj": 42.78628158569336, "geo/layer_7/stable_rank_k_proj": 42.13066482543945, "geo/layer_7/stable_rank_o_proj": 93.95250701904297, "geo/layer_7/stable_rank_gate_proj": 87.38618469238281, "geo/layer_7/stable_rank_down_proj": 145.8695831298828, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4687563180923462, "geo/layer_7/attn_entropy_mean": 4.652377128601074, "geo/layer_7/attn_entropy_std": 0.7903503775596619, "geo/layer_14/stable_rank_q_proj": 52.98719024658203, "geo/layer_14/stable_rank_k_proj": 38.502220153808594, "geo/layer_14/stable_rank_o_proj": 46.21377944946289, "geo/layer_14/stable_rank_gate_proj": 74.1872329711914, "geo/layer_14/stable_rank_down_proj": 131.0556182861328, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37478095293045044, "geo/layer_14/attn_entropy_mean": 5.51961088180542, "geo/layer_14/attn_entropy_std": 0.3643955886363983, "geo/layer_21/stable_rank_q_proj": 41.9785270690918, "geo/layer_21/stable_rank_k_proj": 30.483531951904297, "geo/layer_21/stable_rank_o_proj": 73.66044616699219, "geo/layer_21/stable_rank_gate_proj": 70.39971923828125, "geo/layer_21/stable_rank_down_proj": 53.73530578613281, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14907483756542206, "geo/layer_21/attn_entropy_mean": 5.705193519592285, "geo/layer_21/attn_entropy_std": 0.3027728497982025, "geo/layer_27/stable_rank_q_proj": 42.82030487060547, "geo/layer_27/stable_rank_k_proj": 31.398515701293945, "geo/layer_27/stable_rank_o_proj": 115.79479217529297, "geo/layer_27/stable_rank_gate_proj": 82.96469116210938, "geo/layer_27/stable_rank_down_proj": 130.2477569580078, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09014690667390823, "geo/layer_27/attn_entropy_mean": 4.253534317016602, "geo/layer_27/attn_entropy_std": 0.7056450247764587, "attnres/final_alpha/block_0": 0.2386743575334549, "attnres/block_norm/0": 1.738548994064331, "attnres/final_alpha/block_1": 0.00480478722602129, "attnres/block_norm/1": 43098.08203125, "attnres/final_alpha/block_2": 0.010693676769733429, "attnres/block_norm/2": 27405.25, "attnres/final_alpha/block_3": 0.012690063565969467, "attnres/block_norm/3": 51591.9375, "attnres/final_alpha/block_4": 0.015024008229374886, "attnres/block_norm/4": 13778.447265625, "attnres/final_alpha/block_5": 0.6074462532997131, "attnres/block_norm/5": 6227.15625, "attnres/final_alpha/block_6": 0.11066687107086182, "attnres/block_norm/6": 33653.12109375, "geo/tier1_time_s": 1.361971378326416, "geo/step": 47925.0, "geo/rankme_slope": -3.999910511079432e-05} {"step": 47930, "timestamp": 1778246345.1965394, "train/loss": 2.1889434099197387, "train/z_loss": 0.0013700596522539855, "train/perplexity": 8.925777242958814, "train/grad_norm": 0.1767578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1701996.4292523498, "perf/iters_per_sec": 0.8115751405965566, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2321717977523803, "data/tokens_consumed": 100518592512, "data/tokens_consumed_B": 100.518592512, "train/loss_slope": 5.919785604010463e-06} {"step": 47940, "timestamp": 1778246355.5353239, "train/loss": 2.1942948579788206, "train/z_loss": 0.0013837135396897794, "train/perplexity": 8.973671112741036, "train/grad_norm": 0.1123046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029253.4279354694, "perf/iters_per_sec": 0.9676234378506992, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0334598779678346, "data/tokens_consumed": 100539564032, "data/tokens_consumed_B": 100.539564032, "train/loss_slope": 6.386542255871931e-06} {"step": 47950, "timestamp": 1778246365.873233, "grad/layer_0/attn": 0.0025150910951197147, "grad/layer_0/mlp": 0.0026391029823571444, "grad/layer_0/attn_mlp_ratio": 0.9530097978868385, "grad/layer_4/attn": 0.0017783136572688818, "grad/layer_4/mlp": 0.002321580657735467, "grad/layer_4/attn_mlp_ratio": 0.7659925898952993, "grad/layer_8/attn": 0.004311754833906889, "grad/layer_8/mlp": 0.0035022711381316185, "grad/layer_8/attn_mlp_ratio": 1.2311310406121232, "grad/layer_12/attn": 0.004911801777780056, "grad/layer_12/mlp": 0.006351389456540346, "grad/layer_12/attn_mlp_ratio": 0.7733428620705726, "grad/layer_16/attn": 0.0035470754373818636, "grad/layer_16/mlp": 0.004431212320923805, "grad/layer_16/attn_mlp_ratio": 0.8004751522704887, "grad/layer_20/attn": 0.005539079662412405, "grad/layer_20/mlp": 0.005755867809057236, "grad/layer_20/attn_mlp_ratio": 0.9623361324356141, "grad/layer_24/attn": 0.005897863302379847, "grad/layer_24/mlp": 0.007771429605782032, "grad/layer_24/attn_mlp_ratio": 0.7589161229872246, "grad/layer_27/attn": 0.006824207026511431, "grad/layer_27/mlp": 0.006659314967691898, "grad/layer_27/attn_mlp_ratio": 1.0247610988732936} {"step": 47950, "timestamp": 1778246365.8876138, "train/loss": 2.18707594871521, "train/z_loss": 0.0013810872798785568, "train/perplexity": 8.909124254481934, "train/grad_norm": 0.08056640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027309.9974061954, "perf/iters_per_sec": 0.9666967379599549, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344505786895752, "data/tokens_consumed": 100560535552, "data/tokens_consumed_B": 100.560535552, "train/loss_slope": 8.235063489907579e-06} {"step": 47960, "timestamp": 1778246376.2360008, "train/loss": 2.1458017468452453, "train/z_loss": 0.0013868800015188754, "train/perplexity": 8.54889253804543, "train/grad_norm": 0.1533203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027645.3523978542, "perf/iters_per_sec": 0.9668566476811667, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342794895172118, "data/tokens_consumed": 100581507072, "data/tokens_consumed_B": 100.581507072, "train/loss_slope": 8.64740903049191e-06} {"step": 47970, "timestamp": 1778246386.5758405, "train/loss": 2.2153173685073853, "train/z_loss": 0.0013780966051854192, "train/perplexity": 9.164317115774164, "train/grad_norm": 0.1884765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029302.584619881, "perf/iters_per_sec": 0.9676468775844006, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033434844017029, "data/tokens_consumed": 100602478592, "data/tokens_consumed_B": 100.602478592, "train/loss_slope": 1.4206155370099308e-05} {"step": 47980, "timestamp": 1778246396.9203026, "train/loss": 2.1833691120147707, "train/z_loss": 0.001382018136791885, "train/perplexity": 8.876160718693757, "train/grad_norm": 0.2119140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029123.6657304638, "perf/iters_per_sec": 0.9675615624096221, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0335259675979613, "data/tokens_consumed": 100623450112, "data/tokens_consumed_B": 100.623450112, "train/loss_slope": 1.6657830887000993e-05} {"step": 47990, "timestamp": 1778246407.264468, "train/loss": 2.1550208806991575, "train/z_loss": 0.0013943051337264479, "train/perplexity": 8.628070337228852, "train/grad_norm": 0.27734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028732.092340561, "perf/iters_per_sec": 0.9673748456671529, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0337254524230957, "data/tokens_consumed": 100644421632, "data/tokens_consumed_B": 100.644421632, "train/loss_slope": 1.4821182411782059e-05} {"step": 48000, "timestamp": 1778246417.5941546, "grad/layer_0/attn": 0.0026065169367939234, "grad/layer_0/mlp": 0.002632946241647005, "grad/layer_0/attn_mlp_ratio": 0.9899620419774491, "grad/layer_4/attn": 0.002887007314711809, "grad/layer_4/mlp": 0.0025080335326492786, "grad/layer_4/attn_mlp_ratio": 1.1511039075110863, "grad/layer_8/attn": 0.003618549555540085, "grad/layer_8/mlp": 0.003547805128619075, "grad/layer_8/attn_mlp_ratio": 1.0199403074188902, "grad/layer_12/attn": 0.004574045073240995, "grad/layer_12/mlp": 0.006368185393512249, "grad/layer_12/attn_mlp_ratio": 0.7182650502095017, "grad/layer_16/attn": 0.004040525294840336, "grad/layer_16/mlp": 0.00449863588437438, "grad/layer_16/attn_mlp_ratio": 0.8981667574070782, "grad/layer_20/attn": 0.0030124415643513203, "grad/layer_20/mlp": 0.005446102470159531, "grad/layer_20/attn_mlp_ratio": 0.5531371334901388, "grad/layer_24/attn": 0.0062230960465967655, "grad/layer_24/mlp": 0.007609841879457235, "grad/layer_24/attn_mlp_ratio": 0.8177694179979046, "grad/layer_27/attn": 0.0038760188035666943, "grad/layer_27/mlp": 0.0065313419327139854, "grad/layer_27/attn_mlp_ratio": 0.5934490620997355} {"step": 48000, "timestamp": 1778246418.1761222, "eos/sharpness": 20.27974128723144, "eos/L0_probe": 1.9858508110046387, "eos/L_plus": 2.1032392978668213, "eos/L_minus": 2.0712597370147705, "eos/grad_norm": 0.094635970890522, "eos/embed_grad_frac": 0.24480289220809937, "eos/time_s": 0.5790576934814453} {"step": 48000, "timestamp": 1778246418.1967165, "train/loss": 2.1602190256118776, "train/z_loss": 0.0013919420773163438, "train/perplexity": 8.673037067696816, "train/grad_norm": 0.0947265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1919238.3240278023, "perf/iters_per_sec": 0.9151641483439457, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.092700147628784, "data/tokens_consumed": 100665393152, "data/tokens_consumed_B": 100.665393152, "train/loss_slope": 1.4378216772368402e-05} {"step": 48000, "timestamp": 1778246419.5602324, "geo/rankme_last": 438.72332763671875, "geo/layer_0/stable_rank_q_proj": 19.405244827270508, "geo/layer_0/stable_rank_k_proj": 16.50117301940918, "geo/layer_0/stable_rank_o_proj": 48.25236892700195, "geo/layer_0/stable_rank_gate_proj": 134.3472137451172, "geo/layer_0/stable_rank_down_proj": 53.507972717285156, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05990363657474518, "geo/layer_0/attn_entropy_mean": 6.180027961730957, "geo/layer_0/attn_entropy_std": 0.3887852728366852, "geo/layer_7/stable_rank_q_proj": 42.88631820678711, "geo/layer_7/stable_rank_k_proj": 42.160179138183594, "geo/layer_7/stable_rank_o_proj": 93.89752197265625, "geo/layer_7/stable_rank_gate_proj": 87.49393463134766, "geo/layer_7/stable_rank_down_proj": 145.4424591064453, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4885930120944977, "geo/layer_7/attn_entropy_mean": 4.637078285217285, "geo/layer_7/attn_entropy_std": 0.8200830221176147, "geo/layer_14/stable_rank_q_proj": 53.038475036621094, "geo/layer_14/stable_rank_k_proj": 38.57212829589844, "geo/layer_14/stable_rank_o_proj": 46.17851638793945, "geo/layer_14/stable_rank_gate_proj": 73.95852661132812, "geo/layer_14/stable_rank_down_proj": 131.121826171875, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39009901881217957, "geo/layer_14/attn_entropy_mean": 5.506204605102539, "geo/layer_14/attn_entropy_std": 0.3951437771320343, "geo/layer_21/stable_rank_q_proj": 41.875186920166016, "geo/layer_21/stable_rank_k_proj": 30.392629623413086, "geo/layer_21/stable_rank_o_proj": 73.58544158935547, "geo/layer_21/stable_rank_gate_proj": 70.34821319580078, "geo/layer_21/stable_rank_down_proj": 53.72771453857422, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1457313597202301, "geo/layer_21/attn_entropy_mean": 5.69245719909668, "geo/layer_21/attn_entropy_std": 0.3037987947463989, "geo/layer_27/stable_rank_q_proj": 42.97037887573242, "geo/layer_27/stable_rank_k_proj": 31.453901290893555, "geo/layer_27/stable_rank_o_proj": 115.70669555664062, "geo/layer_27/stable_rank_gate_proj": 82.91554260253906, "geo/layer_27/stable_rank_down_proj": 129.9917449951172, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09116300940513611, "geo/layer_27/attn_entropy_mean": 4.258415222167969, "geo/layer_27/attn_entropy_std": 0.6763453483581543, "attnres/final_alpha/block_0": 0.24015802145004272, "attnres/block_norm/0": 1.7388558387756348, "attnres/final_alpha/block_1": 0.004842008464038372, "attnres/block_norm/1": 42921.77734375, "attnres/final_alpha/block_2": 0.010628299787640572, "attnres/block_norm/2": 27336.849609375, "attnres/final_alpha/block_3": 0.012492639943957329, "attnres/block_norm/3": 51611.671875, "attnres/final_alpha/block_4": 0.015052140690386295, "attnres/block_norm/4": 13877.8671875, "attnres/final_alpha/block_5": 0.603240966796875, "attnres/block_norm/5": 6283.009765625, "attnres/final_alpha/block_6": 0.11358588933944702, "attnres/block_norm/6": 33979.1484375, "geo/tier1_time_s": 1.3598601818084717, "geo/step": 48000.0, "geo/rankme_slope": -4.9035571259753904e-05} {"step": 48000, "timestamp": 1778246426.738651, "geo/ww_alpha_mean": 7.521161291250743, "geo/ww_alpha_std": 4.224084886124558, "geo/ww_alpha_min": 1.3428430820108321, "geo/ww_alpha_max": 30.496712489344812, "geo/ww_alpha_healthy_frac": 0.16243654822335024, "geo/ww_alpha_by_type/q_proj": 4.024434917789294, "geo/ww_alpha_by_type/k_proj": 4.455944531571005, "geo/ww_alpha_by_type/v_proj": 7.880206596715871, "geo/ww_alpha_by_type/o_proj": 8.546225758050742, "geo/ww_alpha_by_type/gate_proj": 8.056007921027433, "geo/ww_alpha_by_type/up_proj": 11.565998748705193, "geo/ww_alpha_by_type/down_proj": 8.218440108934491, "geo/twonn_id/layer_0": 0.6874271631240845, "geo/twonn_id/layer_7": 3.1611006259918213, "geo/twonn_id/layer_14": 4.587153434753418, "geo/twonn_id/layer_21": 7.321225643157959, "geo/twonn_id/layer_27": 6.095029354095459, "geo/tier2_time_s": 7.171395778656006} {"step": 48000, "timestamp": 1778246427.4191313, "eoc/jacobian_sigma/layer_0/attn": 1085.8792724609375, "eoc/jacobian_sigma/layer_0/mlp": 7827.66357421875, "eoc/jacobian_sigma/layer_0": 7827.66357421875, "eoc/jacobian_sigma/layer_7/attn": 1.1763161420822144, "eoc/jacobian_sigma/layer_7/mlp": 1.8102188110351562, "eoc/jacobian_sigma/layer_7": 1.8102188110351562, "eoc/jacobian_sigma/layer_14/attn": 1.6194398403167725, "eoc/jacobian_sigma/layer_14/mlp": 5.865821838378906, "eoc/jacobian_sigma/layer_14": 5.865821838378906, "eoc/jacobian_sigma/layer_21/attn": 1.0887255668640137, "eoc/jacobian_sigma/layer_21/mlp": 3.874229669570923, "eoc/jacobian_sigma/layer_21": 3.874229669570923, "eoc/jacobian_sigma/layer_27/attn": 3.49180006980896, "eoc/jacobian_sigma/layer_27/mlp": 33.497615814208984, "eoc/jacobian_sigma/layer_27": 33.497615814208984, "eoc/layer0_sigma": 7827.66357421875, "eoc/sigma_max": 33.497615814208984, "eoc/sigma_min": 1.8102188110351562, "eoc/sigma_mean": 11.261971533298492, "eoc/time_s": 0.6730883121490479} {"step": 48010, "timestamp": 1778246438.4237273, "train/loss": 2.183501148223877, "train/z_loss": 0.0013855218887329102, "train/perplexity": 8.87733277068142, "train/grad_norm": 0.1552734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1036996.6967133164, "perf/iters_per_sec": 0.4944785579268057, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 2.0223323822021486, "data/tokens_consumed": 100686364672, "data/tokens_consumed_B": 100.686364672, "train/loss_slope": 1.6055196968957554e-05} {"step": 48020, "timestamp": 1778246448.7679822, "train/loss": 2.1777029752731325, "train/z_loss": 0.0013977177441120148, "train/perplexity": 8.826009394626908, "train/grad_norm": 0.13671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028629.5790871074, "perf/iters_per_sec": 0.9673259635386979, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033777689933777, "data/tokens_consumed": 100707336192, "data/tokens_consumed_B": 100.707336192, "train/loss_slope": 1.662165091411862e-05} {"step": 48030, "timestamp": 1778246459.1148734, "train/loss": 2.171416139602661, "train/z_loss": 0.0013833877630531788, "train/perplexity": 8.77069577992572, "train/grad_norm": 0.134765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028518.2345759121, "perf/iters_per_sec": 0.967272870338398, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0338344335556031, "data/tokens_consumed": 100728307712, "data/tokens_consumed_B": 100.728307712, "train/loss_slope": 1.7564577280920647e-05} {"step": 48040, "timestamp": 1778246469.454784, "train/loss": 2.1575629472732545, "train/z_loss": 0.001392543932888657, "train/perplexity": 8.650031367806761, "train/grad_norm": 0.11279296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029101.197770199, "perf/iters_per_sec": 0.9675508488512988, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0335374116897582, "data/tokens_consumed": 100749279232, "data/tokens_consumed_B": 100.749279232, "train/loss_slope": 1.7008943690790093e-05} {"step": 48050, "timestamp": 1778246479.7890928, "grad/layer_0/attn": 0.0028698062524199486, "grad/layer_0/mlp": 0.003071759594604373, "grad/layer_0/attn_mlp_ratio": 0.9342548043262756, "grad/layer_4/attn": 0.002449877094477415, "grad/layer_4/mlp": 0.002510279882699251, "grad/layer_4/attn_mlp_ratio": 0.9759377883590156, "grad/layer_8/attn": 0.003972482867538929, "grad/layer_8/mlp": 0.003734796540811658, "grad/layer_8/attn_mlp_ratio": 1.0636410090257595, "grad/layer_12/attn": 0.007108826655894518, "grad/layer_12/mlp": 0.006501584779471159, "grad/layer_12/attn_mlp_ratio": 1.0933990384930194, "grad/layer_16/attn": 0.0034176756162196398, "grad/layer_16/mlp": 0.004476242698729038, "grad/layer_16/attn_mlp_ratio": 0.7635143511853373, "grad/layer_20/attn": 0.00455525191500783, "grad/layer_20/mlp": 0.005724018439650536, "grad/layer_20/attn_mlp_ratio": 0.7958136200037428, "grad/layer_24/attn": 0.010537467896938324, "grad/layer_24/mlp": 0.00926453247666359, "grad/layer_24/attn_mlp_ratio": 1.1373987634822644, "grad/layer_27/attn": 0.004461650270968676, "grad/layer_27/mlp": 0.0076728337444365025, "grad/layer_27/attn_mlp_ratio": 0.5814866269004085} {"step": 48050, "timestamp": 1778246479.803796, "train/loss": 2.156471621990204, "train/z_loss": 0.0013978601549752057, "train/perplexity": 8.640596519056785, "train/grad_norm": 0.12060546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027242.5752871465, "perf/iters_per_sec": 0.9666645885883076, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344849824905396, "data/tokens_consumed": 100770250752, "data/tokens_consumed_B": 100.770250752, "train/loss_slope": 1.693633069800355e-05} {"step": 48060, "timestamp": 1778246490.1529715, "train/loss": 2.203856611251831, "train/z_loss": 0.0013973253080621362, "train/perplexity": 9.05988667095311, "train/grad_norm": 0.16015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027956.1314586757, "perf/iters_per_sec": 0.9670048386853579, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341209888458252, "data/tokens_consumed": 100791222272, "data/tokens_consumed_B": 100.791222272, "train/loss_slope": 1.9971692451227497e-05} {"step": 48070, "timestamp": 1778246500.4942727, "train/loss": 2.181381106376648, "train/z_loss": 0.001388119044713676, "train/perplexity": 8.858532389554812, "train/grad_norm": 0.373046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028744.2579967105, "perf/iters_per_sec": 0.9673806467040589, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033719253540039, "data/tokens_consumed": 100812193792, "data/tokens_consumed_B": 100.812193792, "train/loss_slope": 1.949317676327869e-05} {"step": 48075, "timestamp": 1778246506.2678955, "eos/sharpness": 36.34393215179443, "eos/L0_probe": 1.9869580268859863, "eos/L_plus": 2.149690628051758, "eos/L_minus": 2.187664747238159, "eos/grad_norm": 0.1213296577334404, "eos/embed_grad_frac": 0.16104869544506073, "eos/time_s": 0.6077456474304199} {"step": 48075, "timestamp": 1778246507.6446683, "geo/rankme_last": 438.405029296875, "geo/layer_0/stable_rank_q_proj": 19.40882110595703, "geo/layer_0/stable_rank_k_proj": 16.52641487121582, "geo/layer_0/stable_rank_o_proj": 48.34657287597656, "geo/layer_0/stable_rank_gate_proj": 134.22300720214844, "geo/layer_0/stable_rank_down_proj": 53.58540725708008, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06259793788194656, "geo/layer_0/attn_entropy_mean": 6.186966896057129, "geo/layer_0/attn_entropy_std": 0.3941923677921295, "geo/layer_7/stable_rank_q_proj": 42.81588363647461, "geo/layer_7/stable_rank_k_proj": 42.0574836730957, "geo/layer_7/stable_rank_o_proj": 93.98109436035156, "geo/layer_7/stable_rank_gate_proj": 87.6036148071289, "geo/layer_7/stable_rank_down_proj": 145.4590606689453, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.49063658714294434, "geo/layer_7/attn_entropy_mean": 4.636928558349609, "geo/layer_7/attn_entropy_std": 0.7854920029640198, "geo/layer_14/stable_rank_q_proj": 52.93094253540039, "geo/layer_14/stable_rank_k_proj": 38.59454345703125, "geo/layer_14/stable_rank_o_proj": 46.076759338378906, "geo/layer_14/stable_rank_gate_proj": 73.96505737304688, "geo/layer_14/stable_rank_down_proj": 131.2742156982422, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3813934624195099, "geo/layer_14/attn_entropy_mean": 5.489968299865723, "geo/layer_14/attn_entropy_std": 0.3851342797279358, "geo/layer_21/stable_rank_q_proj": 41.9237174987793, "geo/layer_21/stable_rank_k_proj": 30.466272354125977, "geo/layer_21/stable_rank_o_proj": 73.5368423461914, "geo/layer_21/stable_rank_gate_proj": 70.31122589111328, "geo/layer_21/stable_rank_down_proj": 53.79709243774414, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14636418223381042, "geo/layer_21/attn_entropy_mean": 5.716377258300781, "geo/layer_21/attn_entropy_std": 0.30005910992622375, "geo/layer_27/stable_rank_q_proj": 43.01896667480469, "geo/layer_27/stable_rank_k_proj": 31.50905990600586, "geo/layer_27/stable_rank_o_proj": 115.73115539550781, "geo/layer_27/stable_rank_gate_proj": 82.78655242919922, "geo/layer_27/stable_rank_down_proj": 129.8435821533203, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08934946358203888, "geo/layer_27/attn_entropy_mean": 4.260382652282715, "geo/layer_27/attn_entropy_std": 0.6867657899856567, "attnres/final_alpha/block_0": 0.24102574586868286, "attnres/block_norm/0": 1.7390165328979492, "attnres/final_alpha/block_1": 0.004913213662803173, "attnres/block_norm/1": 42950.62109375, "attnres/final_alpha/block_2": 0.0106250811368227, "attnres/block_norm/2": 27427.109375, "attnres/final_alpha/block_3": 0.012592008337378502, "attnres/block_norm/3": 52226.671875, "attnres/final_alpha/block_4": 0.015270251780748367, "attnres/block_norm/4": 13788.896484375, "attnres/final_alpha/block_5": 0.6021944284439087, "attnres/block_norm/5": 6225.0908203125, "attnres/final_alpha/block_6": 0.11337919533252716, "attnres/block_norm/6": 33833.48828125, "geo/tier1_time_s": 1.3565037250518799, "geo/step": 48075.0, "geo/rankme_slope": -8.442818533663466e-05} {"step": 48080, "timestamp": 1778246512.818814, "train/loss": 2.177814078330994, "train/z_loss": 0.0013733786530792714, "train/perplexity": 8.826990045735023, "train/grad_norm": 0.099609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1702315.7717865084, "perf/iters_per_sec": 0.8117274149830381, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2319406509399413, "data/tokens_consumed": 100833165312, "data/tokens_consumed_B": 100.833165312, "train/loss_slope": 2.1484718786285937e-05} {"step": 48090, "timestamp": 1778246523.159622, "train/loss": 2.1313339471817017, "train/z_loss": 0.001390495733357966, "train/perplexity": 8.426099289408207, "train/grad_norm": 0.1025390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028970.0041796893, "perf/iters_per_sec": 0.9674882908724257, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0336042404174806, "data/tokens_consumed": 100854136832, "data/tokens_consumed_B": 100.854136832, "train/loss_slope": 1.9665504839554054e-05} {"step": 48100, "timestamp": 1778246533.4995832, "grad/layer_0/attn": 0.0029283263720571995, "grad/layer_0/mlp": 0.0030113067477941513, "grad/layer_0/attn_mlp_ratio": 0.9724436997186996, "grad/layer_4/attn": 0.0019263590220361948, "grad/layer_4/mlp": 0.0025698787067085505, "grad/layer_4/attn_mlp_ratio": 0.7495913881259783, "grad/layer_8/attn": 0.0074924263171851635, "grad/layer_8/mlp": 0.003757303347811103, "grad/layer_8/attn_mlp_ratio": 1.9940966763145085, "grad/layer_12/attn": 0.004899412393569946, "grad/layer_12/mlp": 0.006476243957877159, "grad/layer_12/attn_mlp_ratio": 0.7565206545313425, "grad/layer_16/attn": 0.0034036587458103895, "grad/layer_16/mlp": 0.004708708729594946, "grad/layer_16/attn_mlp_ratio": 0.7228433247810696, "grad/layer_20/attn": 0.008436229079961777, "grad/layer_20/mlp": 0.006060748361051083, "grad/layer_20/attn_mlp_ratio": 1.3919450929495807, "grad/layer_24/attn": 0.010284398682415485, "grad/layer_24/mlp": 0.01124906912446022, "grad/layer_24/attn_mlp_ratio": 0.9142444123334992, "grad/layer_27/attn": 0.004281085450202227, "grad/layer_27/mlp": 0.011655250564217567, "grad/layer_27/attn_mlp_ratio": 0.36730959921312184} {"step": 48100, "timestamp": 1778246533.5273962, "train/loss": 2.141914892196655, "train/z_loss": 0.0013851076480932533, "train/perplexity": 8.51572872854941, "train/grad_norm": 0.1328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023722.959353853, "perf/iters_per_sec": 0.9649863049287095, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0362841367721558, "data/tokens_consumed": 100875108352, "data/tokens_consumed_B": 100.875108352, "train/loss_slope": 1.739820950459756e-05} {"step": 48110, "timestamp": 1778246543.8782847, "train/loss": 2.1331852197647097, "train/z_loss": 0.0013823518762364983, "train/perplexity": 8.441712743925311, "train/grad_norm": 0.1982421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027551.1744857836, "perf/iters_per_sec": 0.9668117401532095, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343275308609008, "data/tokens_consumed": 100896079872, "data/tokens_consumed_B": 100.896079872, "train/loss_slope": 1.591153787200311e-05} {"step": 48120, "timestamp": 1778246554.2239063, "train/loss": 2.163184976577759, "train/z_loss": 0.0013766863150522113, "train/perplexity": 8.698799055876346, "train/grad_norm": 0.171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028546.6777256192, "perf/iters_per_sec": 0.9672864330890747, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0338199377059936, "data/tokens_consumed": 100917051392, "data/tokens_consumed_B": 100.917051392, "train/loss_slope": 1.6934856923535648e-05} {"step": 48130, "timestamp": 1778246564.9629235, "train/loss": 2.1584654569625856, "train/z_loss": 0.0013896081829443574, "train/perplexity": 8.657841628817206, "train/grad_norm": 0.1474609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1953694.1235914356, "perf/iters_per_sec": 0.931593953891485, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.073429036140442, "data/tokens_consumed": 100938022912, "data/tokens_consumed_B": 100.938022912, "train/loss_slope": 1.753562510353359e-05} {"step": 48140, "timestamp": 1778246575.3143623, "train/loss": 2.2026875257492065, "train/z_loss": 0.0013920257217250765, "train/perplexity": 9.049301077728131, "train/grad_norm": 0.197265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027329.0147121544, "perf/iters_per_sec": 0.9667058061180851, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344408750534058, "data/tokens_consumed": 100958994432, "data/tokens_consumed_B": 100.958994432, "train/loss_slope": 1.9398866714102123e-05} {"step": 48150, "timestamp": 1778246585.646876, "grad/layer_0/attn": 0.0028322353027760983, "grad/layer_0/mlp": 0.00296592409722507, "grad/layer_0/attn_mlp_ratio": 0.954925046778319, "grad/layer_4/attn": 0.0032606697641313076, "grad/layer_4/mlp": 0.0025859095621854067, "grad/layer_4/attn_mlp_ratio": 1.2609372290970315, "grad/layer_8/attn": 0.006479619536548853, "grad/layer_8/mlp": 0.0036485041491687298, "grad/layer_8/attn_mlp_ratio": 1.7759660107358133, "grad/layer_12/attn": 0.005420440807938576, "grad/layer_12/mlp": 0.007132149767130613, "grad/layer_12/attn_mlp_ratio": 0.7600009687008039, "grad/layer_16/attn": 0.004443157464265823, "grad/layer_16/mlp": 0.004914953839033842, "grad/layer_16/attn_mlp_ratio": 0.904007956001158, "grad/layer_20/attn": 0.003848332678899169, "grad/layer_20/mlp": 0.006422095932066441, "grad/layer_20/attn_mlp_ratio": 0.599233125709098, "grad/layer_24/attn": 0.017876815050840378, "grad/layer_24/mlp": 0.0141200702637434, "grad/layer_24/attn_mlp_ratio": 1.2660570797680515, "grad/layer_27/attn": 0.004912240896373987, "grad/layer_27/mlp": 0.013622505590319633, "grad/layer_27/attn_mlp_ratio": 0.3605974560072823} {"step": 48150, "timestamp": 1778246586.2435405, "eos/sharpness": 56.51352405548094, "eos/L0_probe": 1.989134669303894, "eos/L_plus": 2.298525810241699, "eos/L_minus": 2.2448787689208984, "eos/grad_norm": 0.19959768652915955, "eos/embed_grad_frac": 0.06763795018196106, "eos/time_s": 0.5937972068786621} {"step": 48150, "timestamp": 1778246586.2613492, "train/loss": 2.16714608669281, "train/z_loss": 0.001384940231218934, "train/perplexity": 8.733324290791133, "train/grad_norm": 0.2001953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1916606.914356075, "perf/iters_per_sec": 0.913909394434011, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0942003726959229, "data/tokens_consumed": 100979965952, "data/tokens_consumed_B": 100.979965952, "train/loss_slope": 2.0305500236531823e-05} {"step": 48150, "timestamp": 1778246587.6237829, "geo/rankme_last": 439.3319091796875, "geo/layer_0/stable_rank_q_proj": 19.4047794342041, "geo/layer_0/stable_rank_k_proj": 16.504671096801758, "geo/layer_0/stable_rank_o_proj": 48.34819793701172, "geo/layer_0/stable_rank_gate_proj": 134.1439971923828, "geo/layer_0/stable_rank_down_proj": 53.533111572265625, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06222135201096535, "geo/layer_0/attn_entropy_mean": 6.187647819519043, "geo/layer_0/attn_entropy_std": 0.3932388126850128, "geo/layer_7/stable_rank_q_proj": 42.759490966796875, "geo/layer_7/stable_rank_k_proj": 41.99824905395508, "geo/layer_7/stable_rank_o_proj": 94.0827407836914, "geo/layer_7/stable_rank_gate_proj": 87.48681640625, "geo/layer_7/stable_rank_down_proj": 145.41111755371094, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4837954640388489, "geo/layer_7/attn_entropy_mean": 4.6556901931762695, "geo/layer_7/attn_entropy_std": 0.8056070804595947, "geo/layer_14/stable_rank_q_proj": 52.92167663574219, "geo/layer_14/stable_rank_k_proj": 38.578895568847656, "geo/layer_14/stable_rank_o_proj": 45.95043182373047, "geo/layer_14/stable_rank_gate_proj": 73.8575210571289, "geo/layer_14/stable_rank_down_proj": 131.1917266845703, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39003312587738037, "geo/layer_14/attn_entropy_mean": 5.5004353523254395, "geo/layer_14/attn_entropy_std": 0.3689224421977997, "geo/layer_21/stable_rank_q_proj": 41.926971435546875, "geo/layer_21/stable_rank_k_proj": 30.47832489013672, "geo/layer_21/stable_rank_o_proj": 73.48005676269531, "geo/layer_21/stable_rank_gate_proj": 70.24773406982422, "geo/layer_21/stable_rank_down_proj": 53.82598114013672, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14584924280643463, "geo/layer_21/attn_entropy_mean": 5.701205253601074, "geo/layer_21/attn_entropy_std": 0.2959645390510559, "geo/layer_27/stable_rank_q_proj": 43.04762649536133, "geo/layer_27/stable_rank_k_proj": 31.543270111083984, "geo/layer_27/stable_rank_o_proj": 115.9305648803711, "geo/layer_27/stable_rank_gate_proj": 82.77286529541016, "geo/layer_27/stable_rank_down_proj": 130.091552734375, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09231005609035492, "geo/layer_27/attn_entropy_mean": 4.260612487792969, "geo/layer_27/attn_entropy_std": 0.6942952871322632, "attnres/final_alpha/block_0": 0.24126213788986206, "attnres/block_norm/0": 1.7392520904541016, "attnres/final_alpha/block_1": 0.004893752746284008, "attnres/block_norm/1": 42850.84375, "attnres/final_alpha/block_2": 0.010726641863584518, "attnres/block_norm/2": 27447.970703125, "attnres/final_alpha/block_3": 0.012586697936058044, "attnres/block_norm/3": 51707.82421875, "attnres/final_alpha/block_4": 0.015334194526076317, "attnres/block_norm/4": 13842.2021484375, "attnres/final_alpha/block_5": 0.6018078327178955, "attnres/block_norm/5": 6200.54052734375, "attnres/final_alpha/block_6": 0.11338871717453003, "attnres/block_norm/6": 33800.12109375, "geo/tier1_time_s": 1.3585352897644043, "geo/step": 48150.0, "geo/rankme_slope": -9.996096094687875e-05} {"step": 48160, "timestamp": 1778246597.9702485, "train/loss": 2.160327410697937, "train/z_loss": 0.001383426261600107, "train/perplexity": 8.673977146510136, "train/grad_norm": 0.1396484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1791657.8964842914, "perf/iters_per_sec": 0.8543290598317582, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1705091714859008, "data/tokens_consumed": 101000937472, "data/tokens_consumed_B": 101.000937472, "train/loss_slope": 2.1239460219215287e-05} {"step": 48170, "timestamp": 1778246608.3173106, "train/loss": 2.1717772006988527, "train/z_loss": 0.001383873599115759, "train/perplexity": 8.773863108723578, "train/grad_norm": 0.10107421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028192.0374585306, "perf/iters_per_sec": 0.9671173274319318, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340007066726684, "data/tokens_consumed": 101021908992, "data/tokens_consumed_B": 101.021908992, "train/loss_slope": 2.2162079889782217e-05} {"step": 48180, "timestamp": 1778246618.657931, "train/loss": 2.1930513858795164, "train/z_loss": 0.001386298961006105, "train/perplexity": 8.962519537857029, "train/grad_norm": 0.2041015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028985.3552467586, "perf/iters_per_sec": 0.9674956108316224, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0335964202880858, "data/tokens_consumed": 101042880512, "data/tokens_consumed_B": 101.042880512, "train/loss_slope": 2.2149048658451682e-05} {"step": 48190, "timestamp": 1778246629.0060303, "train/loss": 2.1465326070785524, "train/z_loss": 0.0013971832697279752, "train/perplexity": 8.555142867420676, "train/grad_norm": 0.09814453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027580.4785362037, "perf/iters_per_sec": 0.9668257134133357, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343125820159913, "data/tokens_consumed": 101063852032, "data/tokens_consumed_B": 101.063852032, "train/loss_slope": 2.0953658948315222e-05} {"step": 48200, "timestamp": 1778246639.3463502, "grad/layer_0/attn": 0.0029282565228641033, "grad/layer_0/mlp": 0.0029058661311864853, "grad/layer_0/attn_mlp_ratio": 1.007705203851891, "grad/layer_4/attn": 0.0021446794271469116, "grad/layer_4/mlp": 0.0025128619745373726, "grad/layer_4/attn_mlp_ratio": 0.8534807576105247, "grad/layer_8/attn": 0.004935740027576685, "grad/layer_8/mlp": 0.0037246847059577703, "grad/layer_8/attn_mlp_ratio": 1.3251430079886994, "grad/layer_12/attn": 0.007725301664322615, "grad/layer_12/mlp": 0.006804951466619968, "grad/layer_12/attn_mlp_ratio": 1.1352471194971026, "grad/layer_16/attn": 0.004499414004385471, "grad/layer_16/mlp": 0.0050384774804115295, "grad/layer_16/attn_mlp_ratio": 0.8930106232640954, "grad/layer_20/attn": 0.0048998561687767506, "grad/layer_20/mlp": 0.0072565204463899136, "grad/layer_20/attn_mlp_ratio": 0.6752349335267034, "grad/layer_24/attn": 0.015821337699890137, "grad/layer_24/mlp": 0.011485290713608265, "grad/layer_24/attn_mlp_ratio": 1.3775304392940872, "grad/layer_27/attn": 0.008230126462876797, "grad/layer_27/mlp": 0.011840690858662128, "grad/layer_27/attn_mlp_ratio": 0.6950714693601557} {"step": 48200, "timestamp": 1778246639.3605454, "train/loss": 2.194745111465454, "train/z_loss": 0.0013861619285307825, "train/perplexity": 8.977712449192088, "train/grad_norm": 0.25390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026784.6627683502, "perf/iters_per_sec": 0.9664462388841392, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347187042236328, "data/tokens_consumed": 101084823552, "data/tokens_consumed_B": 101.084823552, "train/loss_slope": 2.0934992311525183e-05} {"step": 48210, "timestamp": 1778246649.703307, "train/loss": 2.108111357688904, "train/z_loss": 0.001405817363411188, "train/perplexity": 8.232678008501425, "train/grad_norm": 0.1201171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028635.7548547385, "perf/iters_per_sec": 0.9673289083741848, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0337745428085328, "data/tokens_consumed": 101105795072, "data/tokens_consumed_B": 101.105795072, "train/loss_slope": 1.8066357149936222e-05} {"step": 48220, "timestamp": 1778246660.0450377, "train/loss": 2.1759085178375246, "train/z_loss": 0.0013764889910817147, "train/perplexity": 8.810185698162991, "train/grad_norm": 0.1162109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028792.4073519283, "perf/iters_per_sec": 0.9674036061057703, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0336947202682496, "data/tokens_consumed": 101126766592, "data/tokens_consumed_B": 101.126766592, "train/loss_slope": 1.8610686909641018e-05} {"step": 48225, "timestamp": 1778246665.8021884, "eos/sharpness": 42.07277297973632, "eos/L0_probe": 1.9856778383255005, "eos/L_plus": 2.2061104774475098, "eos/L_minus": 2.1859729290008545, "eos/grad_norm": 0.13206210732460022, "eos/embed_grad_frac": 0.13735488057136536, "eos/time_s": 0.5939540863037109} {"step": 48225, "timestamp": 1778246667.1810012, "geo/rankme_last": 439.49072265625, "geo/layer_0/stable_rank_q_proj": 19.376150131225586, "geo/layer_0/stable_rank_k_proj": 16.476818084716797, "geo/layer_0/stable_rank_o_proj": 48.310523986816406, "geo/layer_0/stable_rank_gate_proj": 134.33038330078125, "geo/layer_0/stable_rank_down_proj": 53.521202087402344, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06268937140703201, "geo/layer_0/attn_entropy_mean": 6.188501358032227, "geo/layer_0/attn_entropy_std": 0.3946002423763275, "geo/layer_7/stable_rank_q_proj": 42.764747619628906, "geo/layer_7/stable_rank_k_proj": 42.05774688720703, "geo/layer_7/stable_rank_o_proj": 94.14784240722656, "geo/layer_7/stable_rank_gate_proj": 87.38240814208984, "geo/layer_7/stable_rank_down_proj": 145.58226013183594, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4771730899810791, "geo/layer_7/attn_entropy_mean": 4.634568214416504, "geo/layer_7/attn_entropy_std": 0.802273690700531, "geo/layer_14/stable_rank_q_proj": 52.939762115478516, "geo/layer_14/stable_rank_k_proj": 38.59923553466797, "geo/layer_14/stable_rank_o_proj": 45.97955322265625, "geo/layer_14/stable_rank_gate_proj": 73.90669250488281, "geo/layer_14/stable_rank_down_proj": 130.97061157226562, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39332568645477295, "geo/layer_14/attn_entropy_mean": 5.500585556030273, "geo/layer_14/attn_entropy_std": 0.3842984437942505, "geo/layer_21/stable_rank_q_proj": 41.94969177246094, "geo/layer_21/stable_rank_k_proj": 30.465824127197266, "geo/layer_21/stable_rank_o_proj": 73.44732666015625, "geo/layer_21/stable_rank_gate_proj": 70.34752655029297, "geo/layer_21/stable_rank_down_proj": 53.79275131225586, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14168182015419006, "geo/layer_21/attn_entropy_mean": 5.705740928649902, "geo/layer_21/attn_entropy_std": 0.30484676361083984, "geo/layer_27/stable_rank_q_proj": 42.977630615234375, "geo/layer_27/stable_rank_k_proj": 31.556793212890625, "geo/layer_27/stable_rank_o_proj": 115.8523178100586, "geo/layer_27/stable_rank_gate_proj": 82.83879852294922, "geo/layer_27/stable_rank_down_proj": 130.25445556640625, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09142251312732697, "geo/layer_27/attn_entropy_mean": 4.268946170806885, "geo/layer_27/attn_entropy_std": 0.7081704139709473, "attnres/final_alpha/block_0": 0.2421034276485443, "attnres/block_norm/0": 1.7395391464233398, "attnres/final_alpha/block_1": 0.004905209876596928, "attnres/block_norm/1": 43028.76953125, "attnres/final_alpha/block_2": 0.010723888874053955, "attnres/block_norm/2": 27443.583984375, "attnres/final_alpha/block_3": 0.012830008752644062, "attnres/block_norm/3": 51169.41015625, "attnres/final_alpha/block_4": 0.015521824359893799, "attnres/block_norm/4": 13839.728515625, "attnres/final_alpha/block_5": 0.5985107421875, "attnres/block_norm/5": 6302.080078125, "attnres/final_alpha/block_6": 0.11540482193231583, "attnres/block_norm/6": 33629.72265625, "geo/tier1_time_s": 1.3584282398223877, "geo/step": 48225.0, "geo/rankme_slope": -8.597118534913966e-05} {"step": 48230, "timestamp": 1778246672.3542562, "train/loss": 2.1211066484451293, "train/z_loss": 0.0013858222751878201, "train/perplexity": 8.340362231367285, "train/grad_norm": 0.2177734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1704710.6518206983, "perf/iters_per_sec": 0.8128693827727786, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2302099466323853, "data/tokens_consumed": 101147738112, "data/tokens_consumed_B": 101.147738112, "train/loss_slope": 1.6591775492437275e-05} {"step": 48240, "timestamp": 1778246682.6968796, "train/loss": 2.2223262786865234, "train/z_loss": 0.0013805040391162038, "train/perplexity": 9.228774615834192, "train/grad_norm": 0.1025390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028553.7418551985, "perf/iters_per_sec": 0.9672898015285485, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0338163375854492, "data/tokens_consumed": 101168709632, "data/tokens_consumed_B": 101.168709632, "train/loss_slope": 1.7130794583803353e-05} {"step": 48250, "timestamp": 1778246693.0468535, "grad/layer_0/attn": 0.0025705727748572826, "grad/layer_0/mlp": 0.0028659554664045572, "grad/layer_0/attn_mlp_ratio": 0.896933924932464, "grad/layer_4/attn": 0.004302395507693291, "grad/layer_4/mlp": 0.0025816995184868574, "grad/layer_4/attn_mlp_ratio": 1.6664973248185002, "grad/layer_8/attn": 0.005840874742716551, "grad/layer_8/mlp": 0.0035823795478791, "grad/layer_8/attn_mlp_ratio": 1.630445490659976, "grad/layer_12/attn": 0.005027953069657087, "grad/layer_12/mlp": 0.007057995069772005, "grad/layer_12/attn_mlp_ratio": 0.7123769496458164, "grad/layer_16/attn": 0.005887584760785103, "grad/layer_16/mlp": 0.004705754108726978, "grad/layer_16/attn_mlp_ratio": 1.2511458311754533, "grad/layer_20/attn": 0.003280567703768611, "grad/layer_20/mlp": 0.006108309607952833, "grad/layer_20/attn_mlp_ratio": 0.537066367066721, "grad/layer_24/attn": 0.010908860713243484, "grad/layer_24/mlp": 0.010969894006848335, "grad/layer_24/attn_mlp_ratio": 0.994436282336878, "grad/layer_27/attn": 0.004894407000392675, "grad/layer_27/mlp": 0.011464950628578663, "grad/layer_27/attn_mlp_ratio": 0.42690170383309156} {"step": 48250, "timestamp": 1778246693.061273, "train/loss": 2.1779624223709106, "train/z_loss": 0.0013944166945293545, "train/perplexity": 8.828299574226685, "train/grad_norm": 0.1748046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024270.931808589, "perf/iters_per_sec": 0.9652475985567994, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0360036134719848, "data/tokens_consumed": 101189681152, "data/tokens_consumed_B": 101.189681152, "train/loss_slope": 1.607622020851914e-05} {"step": 48260, "timestamp": 1778246703.4104118, "train/loss": 2.1575275897979735, "train/z_loss": 0.001403760234825313, "train/perplexity": 8.649725529943353, "train/grad_norm": 0.2001953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027774.9255897924, "perf/iters_per_sec": 0.9669184329937899, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034213399887085, "data/tokens_consumed": 101210652672, "data/tokens_consumed_B": 101.210652672, "train/loss_slope": 1.5156182182682848e-05} {"step": 48270, "timestamp": 1778246713.7526236, "train/loss": 2.13778076171875, "train/z_loss": 0.0013995153130963445, "train/perplexity": 8.480596266000735, "train/grad_norm": 0.216796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028807.896114325, "perf/iters_per_sec": 0.9674109917232155, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0336868286132812, "data/tokens_consumed": 101231624192, "data/tokens_consumed_B": 101.231624192, "train/loss_slope": 1.389579616292549e-05} {"step": 48280, "timestamp": 1778246724.7393098, "train/loss": 2.182799506187439, "train/z_loss": 0.0013790981261990964, "train/perplexity": 8.871106245489411, "train/grad_norm": 0.0859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1909621.1698173021, "perf/iters_per_sec": 0.9105783318602095, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0982031583786012, "data/tokens_consumed": 101252595712, "data/tokens_consumed_B": 101.252595712, "train/loss_slope": 1.5860236352748287e-05} {"step": 48290, "timestamp": 1778246735.0854404, "train/loss": 2.162340188026428, "train/z_loss": 0.0013841728679835797, "train/perplexity": 8.69145351317566, "train/grad_norm": 0.1708984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028567.776695702, "perf/iters_per_sec": 0.96729649386201, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0338091850280762, "data/tokens_consumed": 101273567232, "data/tokens_consumed_B": 101.273567232, "train/loss_slope": 1.4824893780023614e-05} {"step": 48300, "timestamp": 1778246745.4178236, "grad/layer_0/attn": 0.00285017816349864, "grad/layer_0/mlp": 0.002958595519885421, "grad/layer_0/attn_mlp_ratio": 0.9633550946746215, "grad/layer_4/attn": 0.00390901044011116, "grad/layer_4/mlp": 0.0025523535441607237, "grad/layer_4/attn_mlp_ratio": 1.531531670407115, "grad/layer_8/attn": 0.003865540260449052, "grad/layer_8/mlp": 0.003563778242096305, "grad/layer_8/attn_mlp_ratio": 1.0846747158172703, "grad/layer_12/attn": 0.004439653363078833, "grad/layer_12/mlp": 0.006283479277044535, "grad/layer_12/attn_mlp_ratio": 0.7065597094657842, "grad/layer_16/attn": 0.003621205221861601, "grad/layer_16/mlp": 0.004540232475847006, "grad/layer_16/attn_mlp_ratio": 0.7975814369346585, "grad/layer_20/attn": 0.0034860530868172646, "grad/layer_20/mlp": 0.005878708325326443, "grad/layer_20/attn_mlp_ratio": 0.5929964261875575, "grad/layer_24/attn": 0.009296989068388939, "grad/layer_24/mlp": 0.01101593766361475, "grad/layer_24/attn_mlp_ratio": 0.8439580240818502, "grad/layer_27/attn": 0.0038294021505862474, "grad/layer_27/mlp": 0.011759892106056213, "grad/layer_27/attn_mlp_ratio": 0.3256324193698092} {"step": 48300, "timestamp": 1778246746.0140927, "eos/sharpness": 47.07403182983398, "eos/L0_probe": 1.9821451902389526, "eos/L_plus": 2.221341609954834, "eos/L_minus": 2.213689088821411, "eos/grad_norm": 0.16403454542160034, "eos/embed_grad_frac": 0.09684285521507263, "eos/time_s": 0.5934441089630127} {"step": 48300, "timestamp": 1778246746.034753, "train/loss": 2.1305991888046263, "train/z_loss": 0.0014016805216670036, "train/perplexity": 8.419910416310852, "train/grad_norm": 0.1640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1916285.9890667412, "perf/iters_per_sec": 0.9137563653310495, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0943836212158202, "data/tokens_consumed": 101294538752, "data/tokens_consumed_B": 101.294538752, "train/loss_slope": 1.0170808548044594e-05} {"step": 48300, "timestamp": 1778246747.3968606, "geo/rankme_last": 439.7374267578125, "geo/layer_0/stable_rank_q_proj": 19.37559700012207, "geo/layer_0/stable_rank_k_proj": 16.465246200561523, "geo/layer_0/stable_rank_o_proj": 48.30337905883789, "geo/layer_0/stable_rank_gate_proj": 134.16563415527344, "geo/layer_0/stable_rank_down_proj": 53.527679443359375, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06383506953716278, "geo/layer_0/attn_entropy_mean": 6.185791015625, "geo/layer_0/attn_entropy_std": 0.3924318253993988, "geo/layer_7/stable_rank_q_proj": 42.74114990234375, "geo/layer_7/stable_rank_k_proj": 42.1826286315918, "geo/layer_7/stable_rank_o_proj": 94.19424438476562, "geo/layer_7/stable_rank_gate_proj": 87.52961730957031, "geo/layer_7/stable_rank_down_proj": 145.77345275878906, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4807131886482239, "geo/layer_7/attn_entropy_mean": 4.627930641174316, "geo/layer_7/attn_entropy_std": 0.8043804168701172, "geo/layer_14/stable_rank_q_proj": 52.86173629760742, "geo/layer_14/stable_rank_k_proj": 38.45316696166992, "geo/layer_14/stable_rank_o_proj": 46.019508361816406, "geo/layer_14/stable_rank_gate_proj": 73.79558563232422, "geo/layer_14/stable_rank_down_proj": 130.66476440429688, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3837873935699463, "geo/layer_14/attn_entropy_mean": 5.5129170417785645, "geo/layer_14/attn_entropy_std": 0.3866763710975647, "geo/layer_21/stable_rank_q_proj": 41.76576232910156, "geo/layer_21/stable_rank_k_proj": 30.519500732421875, "geo/layer_21/stable_rank_o_proj": 73.41004943847656, "geo/layer_21/stable_rank_gate_proj": 70.37494659423828, "geo/layer_21/stable_rank_down_proj": 53.79184341430664, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14055772125720978, "geo/layer_21/attn_entropy_mean": 5.696041107177734, "geo/layer_21/attn_entropy_std": 0.3011642098426819, "geo/layer_27/stable_rank_q_proj": 42.92793273925781, "geo/layer_27/stable_rank_k_proj": 31.477157592773438, "geo/layer_27/stable_rank_o_proj": 115.7613296508789, "geo/layer_27/stable_rank_gate_proj": 82.8331527709961, "geo/layer_27/stable_rank_down_proj": 130.40081787109375, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08704618364572525, "geo/layer_27/attn_entropy_mean": 4.247627258300781, "geo/layer_27/attn_entropy_std": 0.6897240877151489, "attnres/final_alpha/block_0": 0.24189642071723938, "attnres/block_norm/0": 1.7397005558013916, "attnres/final_alpha/block_1": 0.004887651186436415, "attnres/block_norm/1": 42927.8515625, "attnres/final_alpha/block_2": 0.010959373787045479, "attnres/block_norm/2": 27384.388671875, "attnres/final_alpha/block_3": 0.012861201539635658, "attnres/block_norm/3": 51593.140625, "attnres/final_alpha/block_4": 0.015536787919700146, "attnres/block_norm/4": 13821.875, "attnres/final_alpha/block_5": 0.598612904548645, "attnres/block_norm/5": 6298.359375, "attnres/final_alpha/block_6": 0.11524567008018494, "attnres/block_norm/6": 33737.5546875, "geo/tier1_time_s": 1.3583686351776123, "geo/step": 48300.0, "geo/rankme_slope": -6.886963769882953e-05} {"step": 48310, "timestamp": 1778246758.3249602, "train/loss": 2.162833058834076, "train/z_loss": 0.0013948796084150672, "train/perplexity": 8.69573833273283, "train/grad_norm": 0.119140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1706884.2696400757, "perf/iters_per_sec": 0.8139058445167903, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2286433458328248, "data/tokens_consumed": 101315510272, "data/tokens_consumed_B": 101.315510272, "train/loss_slope": 8.39440861228991e-06} {"step": 48320, "timestamp": 1778246769.1754482, "train/loss": 2.1487196922302245, "train/z_loss": 0.0013934477930888534, "train/perplexity": 8.573874169366105, "train/grad_norm": 0.1796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1934115.6968931495, "perf/iters_per_sec": 0.9222582325425861, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0842950105667115, "data/tokens_consumed": 101336481792, "data/tokens_consumed_B": 101.336481792, "train/loss_slope": 7.45612722311582e-06} {"step": 48330, "timestamp": 1778246779.5220666, "train/loss": 2.1314975023269653, "train/z_loss": 0.0013995597488246857, "train/perplexity": 8.427477534007917, "train/grad_norm": 0.2734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028210.837482886, "perf/iters_per_sec": 0.9671262919821195, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0339911222457885, "data/tokens_consumed": 101357453312, "data/tokens_consumed_B": 101.357453312, "train/loss_slope": 5.552544681080956e-06} {"step": 48340, "timestamp": 1778246789.8650239, "train/loss": 2.1249311923980714, "train/z_loss": 0.0013948410050943494, "train/perplexity": 8.37232138895014, "train/grad_norm": 0.326171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028846.0341053347, "perf/iters_per_sec": 0.9674291773344682, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0336673974990844, "data/tokens_consumed": 101378424832, "data/tokens_consumed_B": 101.378424832, "train/loss_slope": 3.7679958085988127e-06} {"step": 48350, "timestamp": 1778246800.208121, "grad/layer_0/attn": 0.0029514613561332226, "grad/layer_0/mlp": 0.0030645951628684998, "grad/layer_0/attn_mlp_ratio": 0.9630835731863063, "grad/layer_4/attn": 0.0033855619840323925, "grad/layer_4/mlp": 0.0024624085053801537, "grad/layer_4/attn_mlp_ratio": 1.3748985349690652, "grad/layer_8/attn": 0.006802606862038374, "grad/layer_8/mlp": 0.0036860033869743347, "grad/layer_8/attn_mlp_ratio": 1.8455237185959124, "grad/layer_12/attn": 0.005017942748963833, "grad/layer_12/mlp": 0.007113994099199772, "grad/layer_12/attn_mlp_ratio": 0.7053622210611704, "grad/layer_16/attn": 0.0035993088968098164, "grad/layer_16/mlp": 0.004459012765437365, "grad/layer_16/attn_mlp_ratio": 0.8071985897840142, "grad/layer_20/attn": 0.00359978131018579, "grad/layer_20/mlp": 0.006677288096398115, "grad/layer_20/attn_mlp_ratio": 0.5391082733447983, "grad/layer_24/attn": 0.010409197770059109, "grad/layer_24/mlp": 0.010164575651288033, "grad/layer_24/attn_mlp_ratio": 1.0240661317064885, "grad/layer_27/attn": 0.009817336685955524, "grad/layer_27/mlp": 0.009296487085521221, "grad/layer_27/attn_mlp_ratio": 1.056026485062605} {"step": 48350, "timestamp": 1778246800.2228525, "train/loss": 2.180970478057861, "train/z_loss": 0.0013875822070986033, "train/perplexity": 8.854895572034005, "train/grad_norm": 0.1630859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025840.2967497606, "perf/iters_per_sec": 0.9659959300755313, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352010488510133, "data/tokens_consumed": 101399396352, "data/tokens_consumed_B": 101.399396352, "train/loss_slope": 5.586433353418336e-06} {"step": 48360, "timestamp": 1778246810.5683894, "train/loss": 2.1788148403167726, "train/z_loss": 0.001388350606430322, "train/perplexity": 8.835828183520169, "train/grad_norm": 0.28515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028303.346161978, "perf/iters_per_sec": 0.967170403557767, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0339439630508422, "data/tokens_consumed": 101420367872, "data/tokens_consumed_B": 101.420367872, "train/loss_slope": 7.130828777877474e-06} {"step": 48370, "timestamp": 1778246821.4200084, "train/loss": 2.136116123199463, "train/z_loss": 0.0013937806477770209, "train/perplexity": 8.466490882229598, "train/grad_norm": 0.09375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1933636.6099505567, "perf/iters_per_sec": 0.9220297860863479, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.084563660621643, "data/tokens_consumed": 101441339392, "data/tokens_consumed_B": 101.441339392, "train/loss_slope": 5.514542276065982e-06} {"step": 48375, "timestamp": 1778246827.179457, "eos/sharpness": 72.17507362365721, "eos/L0_probe": 1.98797607421875, "eos/L_plus": 2.2652599811553955, "eos/L_minus": 2.4324429035186768, "eos/grad_norm": 0.15701737999916077, "eos/embed_grad_frac": 0.09431777149438858, "eos/time_s": 0.5957844257354736} {"step": 48375, "timestamp": 1778246828.5590734, "geo/rankme_last": 439.69866943359375, "geo/layer_0/stable_rank_q_proj": 19.401348114013672, "geo/layer_0/stable_rank_k_proj": 16.466405868530273, "geo/layer_0/stable_rank_o_proj": 48.17583084106445, "geo/layer_0/stable_rank_gate_proj": 134.0030059814453, "geo/layer_0/stable_rank_down_proj": 53.55693435668945, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.062014784663915634, "geo/layer_0/attn_entropy_mean": 6.187190055847168, "geo/layer_0/attn_entropy_std": 0.39168885350227356, "geo/layer_7/stable_rank_q_proj": 42.73075485229492, "geo/layer_7/stable_rank_k_proj": 42.229278564453125, "geo/layer_7/stable_rank_o_proj": 94.21546936035156, "geo/layer_7/stable_rank_gate_proj": 87.42779541015625, "geo/layer_7/stable_rank_down_proj": 145.70574951171875, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.47606390714645386, "geo/layer_7/attn_entropy_mean": 4.650509834289551, "geo/layer_7/attn_entropy_std": 0.8217871189117432, "geo/layer_14/stable_rank_q_proj": 52.830780029296875, "geo/layer_14/stable_rank_k_proj": 38.384765625, "geo/layer_14/stable_rank_o_proj": 46.01066207885742, "geo/layer_14/stable_rank_gate_proj": 73.8938217163086, "geo/layer_14/stable_rank_down_proj": 130.58447265625, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3846067488193512, "geo/layer_14/attn_entropy_mean": 5.530214786529541, "geo/layer_14/attn_entropy_std": 0.3810519874095917, "geo/layer_21/stable_rank_q_proj": 41.77058029174805, "geo/layer_21/stable_rank_k_proj": 30.512130737304688, "geo/layer_21/stable_rank_o_proj": 73.42999267578125, "geo/layer_21/stable_rank_gate_proj": 70.23432922363281, "geo/layer_21/stable_rank_down_proj": 53.7918586730957, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14513356983661652, "geo/layer_21/attn_entropy_mean": 5.707840919494629, "geo/layer_21/attn_entropy_std": 0.29364103078842163, "geo/layer_27/stable_rank_q_proj": 42.90300750732422, "geo/layer_27/stable_rank_k_proj": 31.609146118164062, "geo/layer_27/stable_rank_o_proj": 115.73377227783203, "geo/layer_27/stable_rank_gate_proj": 82.7843017578125, "geo/layer_27/stable_rank_down_proj": 130.42872619628906, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08385438472032547, "geo/layer_27/attn_entropy_mean": 4.277922630310059, "geo/layer_27/attn_entropy_std": 0.6888230443000793, "attnres/final_alpha/block_0": 0.24214383959770203, "attnres/block_norm/0": 1.7397735118865967, "attnres/final_alpha/block_1": 0.0049104103818535805, "attnres/block_norm/1": 42992.3828125, "attnres/final_alpha/block_2": 0.01059390977025032, "attnres/block_norm/2": 27429.640625, "attnres/final_alpha/block_3": 0.012510981410741806, "attnres/block_norm/3": 51586.66796875, "attnres/final_alpha/block_4": 0.015213231556117535, "attnres/block_norm/4": 13857.408203125, "attnres/final_alpha/block_5": 0.5998353958129883, "attnres/block_norm/5": 6257.5166015625, "attnres/final_alpha/block_6": 0.11479221284389496, "attnres/block_norm/6": 33666.23828125, "geo/tier1_time_s": 1.3611652851104736, "geo/step": 48375.0, "geo/rankme_slope": -4.707117221888757e-06} {"step": 48380, "timestamp": 1778246834.0804462, "train/loss": 2.1746629953384398, "train/z_loss": 0.001390394859481603, "train/perplexity": 8.799219244554878, "train/grad_norm": 0.1884765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1657391.4566496664, "perf/iters_per_sec": 0.7903058322189648, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2653329372406006, "data/tokens_consumed": 101462310912, "data/tokens_consumed_B": 101.462310912, "train/loss_slope": 5.191836384299111e-06} {"step": 48390, "timestamp": 1778246844.4396348, "train/loss": 2.131843900680542, "train/z_loss": 0.001380800805054605, "train/perplexity": 8.430397304023069, "train/grad_norm": 0.1513671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025320.8064578222, "perf/iters_per_sec": 0.9657482178010093, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354665756225585, "data/tokens_consumed": 101483282432, "data/tokens_consumed_B": 101.483282432, "train/loss_slope": 2.948772574629878e-06} {"step": 48400, "timestamp": 1778246854.7820454, "grad/layer_0/attn": 0.0029469411820173264, "grad/layer_0/mlp": 0.0028993485029786825, "grad/layer_0/attn_mlp_ratio": 1.0164149212653315, "grad/layer_4/attn": 0.002097902586683631, "grad/layer_4/mlp": 0.0025237002409994602, "grad/layer_4/attn_mlp_ratio": 0.8312803832537421, "grad/layer_8/attn": 0.004122702404856682, "grad/layer_8/mlp": 0.003825309220701456, "grad/layer_8/attn_mlp_ratio": 1.0777435389462027, "grad/layer_12/attn": 0.0041849371045827866, "grad/layer_12/mlp": 0.006501917727291584, "grad/layer_12/attn_mlp_ratio": 0.6436465694808772, "grad/layer_16/attn": 0.0036762277595698833, "grad/layer_16/mlp": 0.004444092512130737, "grad/layer_16/attn_mlp_ratio": 0.8272167302578562, "grad/layer_20/attn": 0.004483694210648537, "grad/layer_20/mlp": 0.005856485106050968, "grad/layer_20/attn_mlp_ratio": 0.765594730097832, "grad/layer_24/attn": 0.015728993341326714, "grad/layer_24/mlp": 0.009003782644867897, "grad/layer_24/attn_mlp_ratio": 1.746931682718816, "grad/layer_27/attn": 0.008693388663232327, "grad/layer_27/mlp": 0.00810142420232296, "grad/layer_27/attn_mlp_ratio": 1.073069171397384} {"step": 48400, "timestamp": 1778246854.7962544, "train/loss": 2.1047107100486757, "train/z_loss": 0.0013957317918539048, "train/perplexity": 8.204729120553328, "train/grad_norm": 0.1845703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026316.2674451177, "perf/iters_per_sec": 0.9662228905892933, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349578857421875, "data/tokens_consumed": 101504253952, "data/tokens_consumed_B": 101.504253952, "train/loss_slope": 1.8086002068298399e-07} {"step": 48410, "timestamp": 1778246865.1426136, "train/loss": 2.1224857211112975, "train/z_loss": 0.001401060284115374, "train/perplexity": 8.351872131616751, "train/grad_norm": 0.10986328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028032.1578052405, "perf/iters_per_sec": 0.9670410908724024, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340822219848633, "data/tokens_consumed": 101525225472, "data/tokens_consumed_B": 101.525225472, "train/loss_slope": -2.20831766022675e-06} {"step": 48420, "timestamp": 1778246875.559771, "train/loss": 2.1451924085617065, "train/z_loss": 0.0013950033695437013, "train/perplexity": 8.543684957290427, "train/grad_norm": 0.1640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2017359.8497775435, "perf/iters_per_sec": 0.961952137841007, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0395527601242065, "data/tokens_consumed": 101546196992, "data/tokens_consumed_B": 101.546196992, "train/loss_slope": -3.3157581400306468e-06} {"step": 48430, "timestamp": 1778246885.9129562, "train/loss": 2.143292260169983, "train/z_loss": 0.001385813974775374, "train/perplexity": 8.52746610205522, "train/grad_norm": 0.09765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026733.8534448212, "perf/iters_per_sec": 0.9664220111106974, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034744644165039, "data/tokens_consumed": 101567168512, "data/tokens_consumed_B": 101.567168512, "train/loss_slope": -6.36442668295609e-06} {"step": 48440, "timestamp": 1778246896.2577295, "train/loss": 2.149118757247925, "train/z_loss": 0.001392060681246221, "train/perplexity": 8.5772963854112, "train/grad_norm": 0.103515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028353.1584251586, "perf/iters_per_sec": 0.9671941558957856, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033918571472168, "data/tokens_consumed": 101588140032, "data/tokens_consumed_B": 101.588140032, "train/loss_slope": -6.566620402389077e-06} {"step": 48450, "timestamp": 1778246906.5981448, "grad/layer_0/attn": 0.002862318651750684, "grad/layer_0/mlp": 0.0027806153520941734, "grad/layer_0/attn_mlp_ratio": 1.0293831351598715, "grad/layer_4/attn": 0.0019727130420506, "grad/layer_4/mlp": 0.0025305633898824453, "grad/layer_4/attn_mlp_ratio": 0.7795548501105737, "grad/layer_8/attn": 0.006255445536226034, "grad/layer_8/mlp": 0.003455853322520852, "grad/layer_8/attn_mlp_ratio": 1.8101015209328628, "grad/layer_12/attn": 0.004867861978709698, "grad/layer_12/mlp": 0.006527905818074942, "grad/layer_12/attn_mlp_ratio": 0.7457003884249016, "grad/layer_16/attn": 0.005604209844022989, "grad/layer_16/mlp": 0.004808587487787008, "grad/layer_16/attn_mlp_ratio": 1.1654586178812103, "grad/layer_20/attn": 0.005428682547062635, "grad/layer_20/mlp": 0.005870889872312546, "grad/layer_20/attn_mlp_ratio": 0.9246779572883522, "grad/layer_24/attn": 0.008800186216831207, "grad/layer_24/mlp": 0.009599619545042515, "grad/layer_24/attn_mlp_ratio": 0.9167223850765635, "grad/layer_27/attn": 0.005524531938135624, "grad/layer_27/mlp": 0.009199658408761024, "grad/layer_27/attn_mlp_ratio": 0.600514892251109} {"step": 48450, "timestamp": 1778246907.1971447, "eos/sharpness": 24.916911125183102, "eos/L0_probe": 1.988136649131775, "eos/L_plus": 2.1264848709106445, "eos/L_minus": 2.0989575386047363, "eos/grad_norm": 0.11721866577863693, "eos/embed_grad_frac": 0.16719146072864532, "eos/time_s": 0.5961489677429199} {"step": 48450, "timestamp": 1778246907.2172487, "train/loss": 2.192551851272583, "train/z_loss": 0.001380939409136772, "train/perplexity": 8.958043567226742, "train/grad_norm": 0.1171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1914818.6856567534, "perf/iters_per_sec": 0.9130567005428092, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0952222347259521, "data/tokens_consumed": 101609111552, "data/tokens_consumed_B": 101.609111552, "train/loss_slope": -5.009332949286168e-06} {"step": 48450, "timestamp": 1778246908.5775259, "geo/rankme_last": 439.93670654296875, "geo/layer_0/stable_rank_q_proj": 19.398189544677734, "geo/layer_0/stable_rank_k_proj": 16.44852638244629, "geo/layer_0/stable_rank_o_proj": 48.203460693359375, "geo/layer_0/stable_rank_gate_proj": 134.1458740234375, "geo/layer_0/stable_rank_down_proj": 53.58420181274414, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06364645063877106, "geo/layer_0/attn_entropy_mean": 6.189538478851318, "geo/layer_0/attn_entropy_std": 0.3923439085483551, "geo/layer_7/stable_rank_q_proj": 42.70247268676758, "geo/layer_7/stable_rank_k_proj": 42.04726028442383, "geo/layer_7/stable_rank_o_proj": 93.77637481689453, "geo/layer_7/stable_rank_gate_proj": 87.42591857910156, "geo/layer_7/stable_rank_down_proj": 145.62400817871094, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4842289388179779, "geo/layer_7/attn_entropy_mean": 4.646733283996582, "geo/layer_7/attn_entropy_std": 0.7908194065093994, "geo/layer_14/stable_rank_q_proj": 52.67750930786133, "geo/layer_14/stable_rank_k_proj": 38.36956787109375, "geo/layer_14/stable_rank_o_proj": 45.95858383178711, "geo/layer_14/stable_rank_gate_proj": 73.756103515625, "geo/layer_14/stable_rank_down_proj": 130.933837890625, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.377994179725647, "geo/layer_14/attn_entropy_mean": 5.543597221374512, "geo/layer_14/attn_entropy_std": 0.37665054202079773, "geo/layer_21/stable_rank_q_proj": 41.81364059448242, "geo/layer_21/stable_rank_k_proj": 30.539159774780273, "geo/layer_21/stable_rank_o_proj": 73.31291198730469, "geo/layer_21/stable_rank_gate_proj": 70.05864715576172, "geo/layer_21/stable_rank_down_proj": 53.78506851196289, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14612165093421936, "geo/layer_21/attn_entropy_mean": 5.692112922668457, "geo/layer_21/attn_entropy_std": 0.3032122850418091, "geo/layer_27/stable_rank_q_proj": 42.86809539794922, "geo/layer_27/stable_rank_k_proj": 31.680477142333984, "geo/layer_27/stable_rank_o_proj": 115.75421142578125, "geo/layer_27/stable_rank_gate_proj": 82.80560302734375, "geo/layer_27/stable_rank_down_proj": 130.67860412597656, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0849253311753273, "geo/layer_27/attn_entropy_mean": 4.2264485359191895, "geo/layer_27/attn_entropy_std": 0.7245128154754639, "attnres/final_alpha/block_0": 0.2408718764781952, "attnres/block_norm/0": 1.7399656772613525, "attnres/final_alpha/block_1": 0.004903013352304697, "attnres/block_norm/1": 42977.796875, "attnres/final_alpha/block_2": 0.01061912439763546, "attnres/block_norm/2": 27496.29296875, "attnres/final_alpha/block_3": 0.012388540431857109, "attnres/block_norm/3": 51899.484375, "attnres/final_alpha/block_4": 0.015163012780249119, "attnres/block_norm/4": 13820.697265625, "attnres/final_alpha/block_5": 0.6020746231079102, "attnres/block_norm/5": 6324.56640625, "attnres/final_alpha/block_6": 0.11397979408502579, "attnres/block_norm/6": 33737.09375, "geo/tier1_time_s": 1.3566529750823975, "geo/step": 48450.0, "geo/rankme_slope": 2.4042273159263707e-05} {"step": 48460, "timestamp": 1778246918.9194922, "train/loss": 2.122169041633606, "train/z_loss": 0.001402331585995853, "train/perplexity": 8.34922768385563, "train/grad_norm": 0.11669921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1792587.6961081612, "perf/iters_per_sec": 0.8547724228421026, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1699020385742187, "data/tokens_consumed": 101630083072, "data/tokens_consumed_B": 101.630083072, "train/loss_slope": -7.0885502608946444e-06} {"step": 48470, "timestamp": 1778246929.2618823, "train/loss": 2.1746341228485107, "train/z_loss": 0.0013975973473861814, "train/perplexity": 8.798965192853426, "train/grad_norm": 0.09814453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028722.2663401864, "perf/iters_per_sec": 0.9673701602650577, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0337304592132568, "data/tokens_consumed": 101651054592, "data/tokens_consumed_B": 101.651054592, "train/loss_slope": -3.9234035765484715e-06} {"step": 48480, "timestamp": 1778246939.6094065, "train/loss": 2.19339120388031, "train/z_loss": 0.0013859918108209968, "train/perplexity": 8.965565680866257, "train/grad_norm": 0.26953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028146.6288870324, "perf/iters_per_sec": 0.9670956749377405, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340238571166993, "data/tokens_consumed": 101672026112, "data/tokens_consumed_B": 101.672026112, "train/loss_slope": -2.2596354817900706e-06} {"step": 48490, "timestamp": 1778246949.9491386, "train/loss": 2.1302194118499758, "train/z_loss": 0.0013921410078182816, "train/perplexity": 8.416713335501749, "train/grad_norm": 0.1005859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029523.1167663254, "perf/iters_per_sec": 0.9677520355064037, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033322548866272, "data/tokens_consumed": 101692997632, "data/tokens_consumed_B": 101.692997632, "train/loss_slope": -5.606862699667172e-06} {"step": 48500, "timestamp": 1778246960.277984, "grad/layer_0/attn": 0.0032665557228028774, "grad/layer_0/mlp": 0.0032160731498152018, "grad/layer_0/attn_mlp_ratio": 1.0156969288527793, "grad/layer_4/attn": 0.0031716187950223684, "grad/layer_4/mlp": 0.0024908320046961308, "grad/layer_4/attn_mlp_ratio": 1.2733169726866398, "grad/layer_8/attn": 0.005848832428455353, "grad/layer_8/mlp": 0.0035784458741545677, "grad/layer_8/attn_mlp_ratio": 1.6344615709441246, "grad/layer_12/attn": 0.005709006451070309, "grad/layer_12/mlp": 0.007248961832374334, "grad/layer_12/attn_mlp_ratio": 0.7875619301535459, "grad/layer_16/attn": 0.00686753261834383, "grad/layer_16/mlp": 0.005096880253404379, "grad/layer_16/attn_mlp_ratio": 1.3473992211248933, "grad/layer_20/attn": 0.005661102011799812, "grad/layer_20/mlp": 0.007479676511138678, "grad/layer_20/attn_mlp_ratio": 0.7568645418933411, "grad/layer_24/attn": 0.010519723407924175, "grad/layer_24/mlp": 0.01077349204570055, "grad/layer_24/attn_mlp_ratio": 0.9764450807273621, "grad/layer_27/attn": 0.012844362296164036, "grad/layer_27/mlp": 0.011347856372594833, "grad/layer_27/attn_mlp_ratio": 1.1318756389969573} {"step": 48500, "timestamp": 1778246960.292216, "train/loss": 2.186523365974426, "train/z_loss": 0.0013777403975836933, "train/perplexity": 8.904202586123139, "train/grad_norm": 0.2294921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028556.689155595, "perf/iters_per_sec": 0.9672912069108939, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033814835548401, "data/tokens_consumed": 101713969152, "data/tokens_consumed_B": 101.713969152, "train/loss_slope": -3.0500840182208584e-06} {"step": 48500, "timestamp": 1778246967.5045266, "geo/ww_alpha_mean": 7.574279989314275, "geo/ww_alpha_std": 4.459371972252673, "geo/ww_alpha_min": 1.3284078516195872, "geo/ww_alpha_max": 32.408534878143115, "geo/ww_alpha_healthy_frac": 0.16243654822335024, "geo/ww_alpha_by_type/q_proj": 3.892122371289845, "geo/ww_alpha_by_type/k_proj": 4.438882684934233, "geo/ww_alpha_by_type/v_proj": 8.381754403076828, "geo/ww_alpha_by_type/o_proj": 7.737181178921004, "geo/ww_alpha_by_type/gate_proj": 8.126870249200048, "geo/ww_alpha_by_type/up_proj": 12.537195568899048, "geo/ww_alpha_by_type/down_proj": 8.00715820025836, "geo/twonn_id/layer_0": 0.7098641991615295, "geo/twonn_id/layer_7": 3.0614659786224365, "geo/twonn_id/layer_14": 4.046902179718018, "geo/twonn_id/layer_21": 7.958909034729004, "geo/twonn_id/layer_27": 6.203063011169434, "geo/tier2_time_s": 7.203097820281982} {"step": 48500, "timestamp": 1778246968.1893718, "eoc/jacobian_sigma/layer_0/attn": 1070.82763671875, "eoc/jacobian_sigma/layer_0/mlp": 8330.19140625, "eoc/jacobian_sigma/layer_0": 8330.19140625, "eoc/jacobian_sigma/layer_7/attn": 1.1659448146820068, "eoc/jacobian_sigma/layer_7/mlp": 1.7680963277816772, "eoc/jacobian_sigma/layer_7": 1.7680963277816772, "eoc/jacobian_sigma/layer_14/attn": 1.5993479490280151, "eoc/jacobian_sigma/layer_14/mlp": 5.792941570281982, "eoc/jacobian_sigma/layer_14": 5.792941570281982, "eoc/jacobian_sigma/layer_21/attn": 1.0877158641815186, "eoc/jacobian_sigma/layer_21/mlp": 3.6232070922851562, "eoc/jacobian_sigma/layer_21": 3.6232070922851562, "eoc/jacobian_sigma/layer_27/attn": 3.973278522491455, "eoc/jacobian_sigma/layer_27/mlp": 35.9256706237793, "eoc/jacobian_sigma/layer_27": 35.9256706237793, "eoc/layer0_sigma": 8330.19140625, "eoc/sigma_max": 35.9256706237793, "eoc/sigma_min": 1.7680963277816772, "eoc/sigma_mean": 11.777478903532028, "eoc/time_s": 0.6778185367584229} {"step": 48510, "timestamp": 1778246978.5567234, "train/loss": 2.1288712501525877, "train/z_loss": 0.0014014258747920394, "train/perplexity": 8.40537389038521, "train/grad_norm": 0.0966796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1148697.6102377283, "perf/iters_per_sec": 0.5477417041004793, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.825678038597107, "data/tokens_consumed": 101734940672, "data/tokens_consumed_B": 101.734940672, "train/loss_slope": -6.941512716163448e-06} {"step": 48520, "timestamp": 1778246988.9067895, "train/loss": 2.14804722070694, "train/z_loss": 0.0013968374347314238, "train/perplexity": 8.568110421338394, "train/grad_norm": 0.1806640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027128.8135494476, "perf/iters_per_sec": 0.966610342764591, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345430374145508, "data/tokens_consumed": 101755912192, "data/tokens_consumed_B": 101.755912192, "train/loss_slope": -7.208397164560824e-06} {"step": 48525, "timestamp": 1778246994.6812797, "eos/sharpness": 46.695184707641594, "eos/L0_probe": 1.984641432762146, "eos/L_plus": 2.231076240539551, "eos/L_minus": 2.2051584720611572, "eos/grad_norm": 0.1512449085712433, "eos/embed_grad_frac": 0.10118518024682999, "eos/time_s": 0.6071109771728516} {"step": 48525, "timestamp": 1778246996.0586953, "geo/rankme_last": 439.7846374511719, "geo/layer_0/stable_rank_q_proj": 19.424423217773438, "geo/layer_0/stable_rank_k_proj": 16.43169403076172, "geo/layer_0/stable_rank_o_proj": 48.17000198364258, "geo/layer_0/stable_rank_gate_proj": 134.41473388671875, "geo/layer_0/stable_rank_down_proj": 53.54127883911133, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06465653330087662, "geo/layer_0/attn_entropy_mean": 6.188604354858398, "geo/layer_0/attn_entropy_std": 0.39083781838417053, "geo/layer_7/stable_rank_q_proj": 42.72065353393555, "geo/layer_7/stable_rank_k_proj": 41.97012710571289, "geo/layer_7/stable_rank_o_proj": 93.85489654541016, "geo/layer_7/stable_rank_gate_proj": 87.25909423828125, "geo/layer_7/stable_rank_down_proj": 145.3103790283203, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4886106252670288, "geo/layer_7/attn_entropy_mean": 4.640602111816406, "geo/layer_7/attn_entropy_std": 0.8143446445465088, "geo/layer_14/stable_rank_q_proj": 52.706966400146484, "geo/layer_14/stable_rank_k_proj": 38.434261322021484, "geo/layer_14/stable_rank_o_proj": 45.8451042175293, "geo/layer_14/stable_rank_gate_proj": 73.64900970458984, "geo/layer_14/stable_rank_down_proj": 131.28062438964844, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38959482312202454, "geo/layer_14/attn_entropy_mean": 5.524655818939209, "geo/layer_14/attn_entropy_std": 0.3686327040195465, "geo/layer_21/stable_rank_q_proj": 41.782073974609375, "geo/layer_21/stable_rank_k_proj": 30.61280632019043, "geo/layer_21/stable_rank_o_proj": 73.37374877929688, "geo/layer_21/stable_rank_gate_proj": 69.9882583618164, "geo/layer_21/stable_rank_down_proj": 53.76604461669922, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14152584969997406, "geo/layer_21/attn_entropy_mean": 5.717923641204834, "geo/layer_21/attn_entropy_std": 0.3009549677371979, "geo/layer_27/stable_rank_q_proj": 42.817474365234375, "geo/layer_27/stable_rank_k_proj": 31.801166534423828, "geo/layer_27/stable_rank_o_proj": 115.95643615722656, "geo/layer_27/stable_rank_gate_proj": 82.77618408203125, "geo/layer_27/stable_rank_down_proj": 130.58302307128906, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08161549270153046, "geo/layer_27/attn_entropy_mean": 4.264408111572266, "geo/layer_27/attn_entropy_std": 0.6798834800720215, "attnres/final_alpha/block_0": 0.24050037562847137, "attnres/block_norm/0": 1.7400716543197632, "attnres/final_alpha/block_1": 0.004864143207669258, "attnres/block_norm/1": 43047.45703125, "attnres/final_alpha/block_2": 0.01061256043612957, "attnres/block_norm/2": 27502.658203125, "attnres/final_alpha/block_3": 0.01247350312769413, "attnres/block_norm/3": 51699.125, "attnres/final_alpha/block_4": 0.01508035883307457, "attnres/block_norm/4": 13938.14453125, "attnres/final_alpha/block_5": 0.6035611033439636, "attnres/block_norm/5": 6270.00146484375, "attnres/final_alpha/block_6": 0.11290793120861053, "attnres/block_norm/6": 33813.54296875, "geo/tier1_time_s": 1.3567883968353271, "geo/step": 48525.0, "geo/rankme_slope": 3.5644296781212486e-05} {"step": 48530, "timestamp": 1778247001.2343624, "train/loss": 2.1215802788734437, "train/z_loss": 0.0014016608824022115, "train/perplexity": 8.344313416330076, "train/grad_norm": 0.09228515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1702212.2646711594, "perf/iters_per_sec": 0.8116780589443013, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.232015562057495, "data/tokens_consumed": 101776883712, "data/tokens_consumed_B": 101.776883712, "train/loss_slope": -1.301306399932826e-05} {"step": 48540, "timestamp": 1778247011.5821369, "train/loss": 2.1644057273864745, "train/z_loss": 0.0013985703117214143, "train/perplexity": 8.70942460611364, "train/grad_norm": 0.091796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027731.6391966802, "perf/iters_per_sec": 0.9668977924331094, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342354774475098, "data/tokens_consumed": 101797855232, "data/tokens_consumed_B": 101.797855232, "train/loss_slope": -1.0977863192451013e-05} {"step": 48550, "timestamp": 1778247021.9138868, "grad/layer_0/attn": 0.0030722026713192463, "grad/layer_0/mlp": 0.003129061544314027, "grad/layer_0/attn_mlp_ratio": 0.9818287462958418, "grad/layer_4/attn": 0.0038204470183700323, "grad/layer_4/mlp": 0.002540981164202094, "grad/layer_4/attn_mlp_ratio": 1.5035321480694623, "grad/layer_8/attn": 0.0046270606108009815, "grad/layer_8/mlp": 0.0037659122608602047, "grad/layer_8/attn_mlp_ratio": 1.2286692220697548, "grad/layer_12/attn": 0.004222846124321222, "grad/layer_12/mlp": 0.006790834944695234, "grad/layer_12/attn_mlp_ratio": 0.6218448977964741, "grad/layer_16/attn": 0.0037070626858621836, "grad/layer_16/mlp": 0.004330499097704887, "grad/layer_16/attn_mlp_ratio": 0.8560358786873538, "grad/layer_20/attn": 0.005529147572815418, "grad/layer_20/mlp": 0.005228396505117416, "grad/layer_20/attn_mlp_ratio": 1.057522600218131, "grad/layer_24/attn": 0.004634038545191288, "grad/layer_24/mlp": 0.00785483792424202, "grad/layer_24/attn_mlp_ratio": 0.5899597841342459, "grad/layer_27/attn": 0.0037067292723804712, "grad/layer_27/mlp": 0.007253951393067837, "grad/layer_27/attn_mlp_ratio": 0.5109944939558486} {"step": 48550, "timestamp": 1778247021.9281654, "train/loss": 2.1971609830856322, "train/z_loss": 0.0013849465292878449, "train/perplexity": 8.999427669943358, "train/grad_norm": 0.1142578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028035.7582102474, "perf/iters_per_sec": 0.9670428076792943, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340803861618042, "data/tokens_consumed": 101818826752, "data/tokens_consumed_B": 101.818826752, "train/loss_slope": -9.003815012868048e-06} {"step": 48560, "timestamp": 1778247032.2788734, "train/loss": 2.175935459136963, "train/z_loss": 0.0013939185300841928, "train/perplexity": 8.810423059211386, "train/grad_norm": 0.11376953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027547.248648633, "perf/iters_per_sec": 0.966809868168179, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343295335769653, "data/tokens_consumed": 101839798272, "data/tokens_consumed_B": 101.839798272, "train/loss_slope": -9.129093661166674e-06} {"step": 48570, "timestamp": 1778247042.6204214, "train/loss": 2.174678635597229, "train/z_loss": 0.0013920793891884387, "train/perplexity": 8.799356867697236, "train/grad_norm": 0.2099609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029107.7508738786, "perf/iters_per_sec": 0.9675539736146348, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0335340738296508, "data/tokens_consumed": 101860769792, "data/tokens_consumed_B": 101.860769792, "train/loss_slope": -9.351291536319175e-06} {"step": 48580, "timestamp": 1778247052.960656, "train/loss": 2.1534842252731323, "train/z_loss": 0.0014068380696699023, "train/perplexity": 8.614822147687118, "train/grad_norm": 0.1552734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029047.5575270767, "perf/iters_per_sec": 0.967525271190203, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0335647344589234, "data/tokens_consumed": 101881741312, "data/tokens_consumed_B": 101.881741312, "train/loss_slope": -1.0639113575854968e-05} {"step": 48590, "timestamp": 1778247063.304406, "train/loss": 2.1897186756134035, "train/z_loss": 0.0013919885270297528, "train/perplexity": 8.932699774898612, "train/grad_norm": 0.1552734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028482.0736309988, "perf/iters_per_sec": 0.9672556274561876, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0338528633117676, "data/tokens_consumed": 101902712832, "data/tokens_consumed_B": 101.902712832, "train/loss_slope": -9.612044106365274e-06} {"step": 48600, "timestamp": 1778247073.6378396, "grad/layer_0/attn": 0.0029006567783653736, "grad/layer_0/mlp": 0.002948665525764227, "grad/layer_0/attn_mlp_ratio": 0.9837184498034054, "grad/layer_4/attn": 0.002296891063451767, "grad/layer_4/mlp": 0.0025935189332813025, "grad/layer_4/attn_mlp_ratio": 0.8856272246229696, "grad/layer_8/attn": 0.0038931749295443296, "grad/layer_8/mlp": 0.0037502197083085775, "grad/layer_8/attn_mlp_ratio": 1.038119131289061, "grad/layer_12/attn": 0.004721143748611212, "grad/layer_12/mlp": 0.006893750745803118, "grad/layer_12/attn_mlp_ratio": 0.6848439774241944, "grad/layer_16/attn": 0.00478172954171896, "grad/layer_16/mlp": 0.005108141340315342, "grad/layer_16/attn_mlp_ratio": 0.9360996749189013, "grad/layer_20/attn": 0.007146509364247322, "grad/layer_20/mlp": 0.006509579252451658, "grad/layer_20/attn_mlp_ratio": 1.0978450338048005, "grad/layer_24/attn": 0.010237460024654865, "grad/layer_24/mlp": 0.010568177327513695, "grad/layer_24/attn_mlp_ratio": 0.9687062972658064, "grad/layer_27/attn": 0.004155113827437162, "grad/layer_27/mlp": 0.010344248265028, "grad/layer_27/attn_mlp_ratio": 0.4016834941323371} {"step": 48600, "timestamp": 1778247074.2382195, "eos/sharpness": 67.88842678070067, "eos/L0_probe": 1.9844015836715698, "eos/L_plus": 2.2838780879974365, "eos/L_minus": 2.36380934715271, "eos/grad_norm": 0.16803468763828278, "eos/embed_grad_frac": 0.0827128067612648, "eos/time_s": 0.5975587368011475} {"step": 48600, "timestamp": 1778247074.2563035, "train/loss": 2.177694630622864, "train/z_loss": 0.0013933893758803606, "train/perplexity": 8.825935744972533, "train/grad_norm": 0.16796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1916104.4462384921, "perf/iters_per_sec": 0.9136697989647351, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0944873094558716, "data/tokens_consumed": 101923684352, "data/tokens_consumed_B": 101.923684352, "train/loss_slope": -9.625619458537029e-06} {"step": 48600, "timestamp": 1778247075.6207411, "geo/rankme_last": 439.0805969238281, "geo/layer_0/stable_rank_q_proj": 19.414453506469727, "geo/layer_0/stable_rank_k_proj": 16.435440063476562, "geo/layer_0/stable_rank_o_proj": 48.20819854736328, "geo/layer_0/stable_rank_gate_proj": 134.61505126953125, "geo/layer_0/stable_rank_down_proj": 53.52726745605469, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06320205330848694, "geo/layer_0/attn_entropy_mean": 6.188277244567871, "geo/layer_0/attn_entropy_std": 0.3932192921638489, "geo/layer_7/stable_rank_q_proj": 42.858497619628906, "geo/layer_7/stable_rank_k_proj": 42.086082458496094, "geo/layer_7/stable_rank_o_proj": 94.12238311767578, "geo/layer_7/stable_rank_gate_proj": 87.25408935546875, "geo/layer_7/stable_rank_down_proj": 145.1473846435547, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4850069284439087, "geo/layer_7/attn_entropy_mean": 4.635156631469727, "geo/layer_7/attn_entropy_std": 0.8003437519073486, "geo/layer_14/stable_rank_q_proj": 52.685157775878906, "geo/layer_14/stable_rank_k_proj": 38.461402893066406, "geo/layer_14/stable_rank_o_proj": 45.835601806640625, "geo/layer_14/stable_rank_gate_proj": 73.60145568847656, "geo/layer_14/stable_rank_down_proj": 131.2256317138672, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3974963426589966, "geo/layer_14/attn_entropy_mean": 5.506017208099365, "geo/layer_14/attn_entropy_std": 0.3862207233905792, "geo/layer_21/stable_rank_q_proj": 41.72189712524414, "geo/layer_21/stable_rank_k_proj": 30.5570068359375, "geo/layer_21/stable_rank_o_proj": 73.328125, "geo/layer_21/stable_rank_gate_proj": 70.00482177734375, "geo/layer_21/stable_rank_down_proj": 53.71169662475586, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14469723403453827, "geo/layer_21/attn_entropy_mean": 5.69097375869751, "geo/layer_21/attn_entropy_std": 0.3019409477710724, "geo/layer_27/stable_rank_q_proj": 42.822574615478516, "geo/layer_27/stable_rank_k_proj": 31.739791870117188, "geo/layer_27/stable_rank_o_proj": 115.94972229003906, "geo/layer_27/stable_rank_gate_proj": 82.72600555419922, "geo/layer_27/stable_rank_down_proj": 130.5970001220703, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09101961553096771, "geo/layer_27/attn_entropy_mean": 4.244343280792236, "geo/layer_27/attn_entropy_std": 0.6844910383224487, "attnres/final_alpha/block_0": 0.2424214482307434, "attnres/block_norm/0": 1.740307092666626, "attnres/final_alpha/block_1": 0.004918909631669521, "attnres/block_norm/1": 43112.109375, "attnres/final_alpha/block_2": 0.010831414721906185, "attnres/block_norm/2": 27501.2578125, "attnres/final_alpha/block_3": 0.012745432555675507, "attnres/block_norm/3": 51380.08984375, "attnres/final_alpha/block_4": 0.01506865955889225, "attnres/block_norm/4": 13912.228515625, "attnres/final_alpha/block_5": 0.5995575189590454, "attnres/block_norm/5": 6333.9736328125, "attnres/final_alpha/block_6": 0.11445657908916473, "attnres/block_norm/6": 34016.3984375, "geo/tier1_time_s": 1.3607525825500488, "geo/step": 48600.0, "geo/rankme_slope": 3.823337928921569e-05} {"step": 48610, "timestamp": 1778247085.9641142, "train/loss": 2.198804998397827, "train/z_loss": 0.0013780415873043239, "train/perplexity": 9.014235035266303, "train/grad_norm": 0.14453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1791774.8670926956, "perf/iters_per_sec": 0.8543848357642629, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1704327583312988, "data/tokens_consumed": 101944655872, "data/tokens_consumed_B": 101.944655872, "train/loss_slope": -6.894862941532491e-06} {"step": 48620, "timestamp": 1778247096.3138404, "train/loss": 2.171950626373291, "train/z_loss": 0.0013895763084292411, "train/perplexity": 8.775384853801654, "train/grad_norm": 0.1611328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027979.4156751283, "perf/iters_per_sec": 0.9670159414649622, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034109115600586, "data/tokens_consumed": 101965627392, "data/tokens_consumed_B": 101.965627392, "train/loss_slope": -7.1951300039900964e-06} {"step": 48630, "timestamp": 1778247106.6640728, "train/loss": 2.137789583206177, "train/z_loss": 0.0013812750112265348, "train/perplexity": 8.480671077804042, "train/grad_norm": 0.12109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027385.788399032, "perf/iters_per_sec": 0.9667328779215966, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034411907196045, "data/tokens_consumed": 101986598912, "data/tokens_consumed_B": 101.986598912, "train/loss_slope": -7.712437846873085e-06} {"step": 48640, "timestamp": 1778247117.0057573, "train/loss": 2.1802624464035034, "train/z_loss": 0.0013880068669095635, "train/perplexity": 8.848628244667877, "train/grad_norm": 0.09375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029098.857386286, "perf/iters_per_sec": 0.9675497328692846, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0335386037826537, "data/tokens_consumed": 102007570432, "data/tokens_consumed_B": 102.007570432, "train/loss_slope": -9.149757176950621e-06} {"step": 48650, "timestamp": 1778247127.3391666, "grad/layer_0/attn": 0.002686944091692567, "grad/layer_0/mlp": 0.0028426332864910364, "grad/layer_0/attn_mlp_ratio": 0.9452306106238155, "grad/layer_4/attn": 0.0022552080918103456, "grad/layer_4/mlp": 0.0024811155162751675, "grad/layer_4/attn_mlp_ratio": 0.908949215029338, "grad/layer_8/attn": 0.0058920737355947495, "grad/layer_8/mlp": 0.003669510828331113, "grad/layer_8/attn_mlp_ratio": 1.6056836593955734, "grad/layer_12/attn": 0.004983616527169943, "grad/layer_12/mlp": 0.006518678739666939, "grad/layer_12/attn_mlp_ratio": 0.7645132778814078, "grad/layer_16/attn": 0.004551421385258436, "grad/layer_16/mlp": 0.004627103917300701, "grad/layer_16/attn_mlp_ratio": 0.9836436285505388, "grad/layer_20/attn": 0.004956717137247324, "grad/layer_20/mlp": 0.00595026696100831, "grad/layer_20/attn_mlp_ratio": 0.8330243141065634, "grad/layer_24/attn": 0.011710405349731445, "grad/layer_24/mlp": 0.010982452891767025, "grad/layer_24/attn_mlp_ratio": 1.0662832209261563, "grad/layer_27/attn": 0.0043440936133265495, "grad/layer_27/mlp": 0.01131776999682188, "grad/layer_27/attn_mlp_ratio": 0.38382946253223554} {"step": 48650, "timestamp": 1778247127.3534205, "train/loss": 2.1163002252578735, "train/z_loss": 0.0013979318318888545, "train/perplexity": 8.300371105075243, "train/grad_norm": 0.1923828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027746.597568482, "perf/iters_per_sec": 0.9669049251406107, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342278480529785, "data/tokens_consumed": 102028541952, "data/tokens_consumed_B": 102.028541952, "train/loss_slope": -9.644354781051552e-06} {"step": 48660, "timestamp": 1778247137.70066, "train/loss": 2.2045629739761354, "train/z_loss": 0.0013705375255085527, "train/perplexity": 9.066288497923683, "train/grad_norm": 0.1376953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028182.3102195626, "perf/iters_per_sec": 0.9671126891229451, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340056657791137, "data/tokens_consumed": 102049513472, "data/tokens_consumed_B": 102.049513472, "train/loss_slope": -7.273145155473197e-06} {"step": 48670, "timestamp": 1778247148.0450583, "train/loss": 2.173951768875122, "train/z_loss": 0.0013925223611295223, "train/perplexity": 8.792963231956186, "train/grad_norm": 0.29296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028725.962776778, "perf/iters_per_sec": 0.9673719228633776, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033728575706482, "data/tokens_consumed": 102070484992, "data/tokens_consumed_B": 102.070484992, "train/loss_slope": -5.492886773037059e-06} {"step": 48675, "timestamp": 1778247153.8030226, "eos/sharpness": 42.119336128234856, "eos/L0_probe": 1.9867008924484253, "eos/L_plus": 2.1770546436309814, "eos/L_minus": 2.2175405025482178, "eos/grad_norm": 0.09848202019929886, "eos/embed_grad_frac": 0.20592229068279266, "eos/time_s": 0.5928521156311035} {"step": 48675, "timestamp": 1778247155.1826885, "geo/rankme_last": 439.3488464355469, "geo/layer_0/stable_rank_q_proj": 19.406538009643555, "geo/layer_0/stable_rank_k_proj": 16.443822860717773, "geo/layer_0/stable_rank_o_proj": 48.19380187988281, "geo/layer_0/stable_rank_gate_proj": 134.5967559814453, "geo/layer_0/stable_rank_down_proj": 53.554725646972656, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.060889992862939835, "geo/layer_0/attn_entropy_mean": 6.190530776977539, "geo/layer_0/attn_entropy_std": 0.3899688720703125, "geo/layer_7/stable_rank_q_proj": 42.962425231933594, "geo/layer_7/stable_rank_k_proj": 42.05070495605469, "geo/layer_7/stable_rank_o_proj": 94.05022430419922, "geo/layer_7/stable_rank_gate_proj": 87.21702575683594, "geo/layer_7/stable_rank_down_proj": 145.18003845214844, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4822622239589691, "geo/layer_7/attn_entropy_mean": 4.617550373077393, "geo/layer_7/attn_entropy_std": 0.8074284195899963, "geo/layer_14/stable_rank_q_proj": 52.76576614379883, "geo/layer_14/stable_rank_k_proj": 38.43941116333008, "geo/layer_14/stable_rank_o_proj": 45.79618453979492, "geo/layer_14/stable_rank_gate_proj": 73.6506576538086, "geo/layer_14/stable_rank_down_proj": 131.18069458007812, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39062994718551636, "geo/layer_14/attn_entropy_mean": 5.505242347717285, "geo/layer_14/attn_entropy_std": 0.3653300106525421, "geo/layer_21/stable_rank_q_proj": 41.597984313964844, "geo/layer_21/stable_rank_k_proj": 30.61808967590332, "geo/layer_21/stable_rank_o_proj": 73.229736328125, "geo/layer_21/stable_rank_gate_proj": 70.08125305175781, "geo/layer_21/stable_rank_down_proj": 53.65892028808594, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14732146263122559, "geo/layer_21/attn_entropy_mean": 5.715417861938477, "geo/layer_21/attn_entropy_std": 0.29941970109939575, "geo/layer_27/stable_rank_q_proj": 42.74224090576172, "geo/layer_27/stable_rank_k_proj": 31.67325210571289, "geo/layer_27/stable_rank_o_proj": 115.95209503173828, "geo/layer_27/stable_rank_gate_proj": 82.78561401367188, "geo/layer_27/stable_rank_down_proj": 130.45465087890625, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08977144956588745, "geo/layer_27/attn_entropy_mean": 4.269557476043701, "geo/layer_27/attn_entropy_std": 0.6874887943267822, "attnres/final_alpha/block_0": 0.24133378267288208, "attnres/block_norm/0": 1.740456223487854, "attnres/final_alpha/block_1": 0.004922737833112478, "attnres/block_norm/1": 43022.4140625, "attnres/final_alpha/block_2": 0.010572933591902256, "attnres/block_norm/2": 27525.587890625, "attnres/final_alpha/block_3": 0.012482447549700737, "attnres/block_norm/3": 51660.8515625, "attnres/final_alpha/block_4": 0.015159538015723228, "attnres/block_norm/4": 13911.689453125, "attnres/final_alpha/block_5": 0.601192831993103, "attnres/block_norm/5": 6282.9755859375, "attnres/final_alpha/block_6": 0.11433576047420502, "attnres/block_norm/6": 34085.203125, "geo/tier1_time_s": 1.3595504760742188, "geo/step": 48675.0, "geo/rankme_slope": 4.4837270845838336e-05} {"step": 48680, "timestamp": 1778247160.361305, "train/loss": 2.1678898096084596, "train/z_loss": 0.0013770615099929274, "train/perplexity": 8.739821880099528, "train/grad_norm": 0.09716796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1703435.0582531446, "perf/iters_per_sec": 0.8122611323610042, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2311311721801759, "data/tokens_consumed": 102091456512, "data/tokens_consumed_B": 102.091456512, "train/loss_slope": -4.818655891601581e-06} {"step": 48690, "timestamp": 1778247170.7025666, "train/loss": 2.1326757192611696, "train/z_loss": 0.0013882751925848424, "train/perplexity": 8.437412782540806, "train/grad_norm": 0.1298828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028855.2997478605, "perf/iters_per_sec": 0.9674335955371192, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0336626768112183, "data/tokens_consumed": 102112428032, "data/tokens_consumed_B": 102.112428032, "train/loss_slope": -5.849997874962975e-06} {"step": 48700, "timestamp": 1778247181.0438333, "grad/layer_0/attn": 0.0027733538299798965, "grad/layer_0/mlp": 0.002803283743560314, "grad/layer_0/attn_mlp_ratio": 0.9893232311636324, "grad/layer_4/attn": 0.0023210584186017513, "grad/layer_4/mlp": 0.0026347991079092026, "grad/layer_4/attn_mlp_ratio": 0.8809242129853172, "grad/layer_8/attn": 0.006289768032729626, "grad/layer_8/mlp": 0.003607322694733739, "grad/layer_8/attn_mlp_ratio": 1.7436110907268785, "grad/layer_12/attn": 0.006562226917594671, "grad/layer_12/mlp": 0.006775656249374151, "grad/layer_12/attn_mlp_ratio": 0.9685005524521335, "grad/layer_16/attn": 0.004238748922944069, "grad/layer_16/mlp": 0.00470938254147768, "grad/layer_16/attn_mlp_ratio": 0.9000646678423356, "grad/layer_20/attn": 0.004698285833001137, "grad/layer_20/mlp": 0.005856535397469997, "grad/layer_20/attn_mlp_ratio": 0.8022295493693806, "grad/layer_24/attn": 0.01017408724874258, "grad/layer_24/mlp": 0.009599084034562111, "grad/layer_24/attn_mlp_ratio": 1.059901872524498, "grad/layer_27/attn": 0.006437323056161404, "grad/layer_27/mlp": 0.00925972405821085, "grad/layer_27/attn_mlp_ratio": 0.6951959849099019} {"step": 48700, "timestamp": 1778247181.0584865, "train/loss": 2.1702542543411254, "train/z_loss": 0.0013807782321237027, "train/perplexity": 8.760511155593814, "train/grad_norm": 0.1455078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026538.95233652, "perf/iters_per_sec": 0.9663290750200844, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034844160079956, "data/tokens_consumed": 102133399552, "data/tokens_consumed_B": 102.133399552, "train/loss_slope": -9.757840708501701e-06} {"step": 48710, "timestamp": 1778247191.4075034, "train/loss": 2.1353606462478636, "train/z_loss": 0.0013912606053054332, "train/perplexity": 8.46009705900429, "train/grad_norm": 0.1064453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027889.227378456, "perf/iters_per_sec": 0.9669729363338737, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341551065444947, "data/tokens_consumed": 102154371072, "data/tokens_consumed_B": 102.154371072, "train/loss_slope": -1.4355941753004041e-05} {"step": 48720, "timestamp": 1778247201.7497363, "train/loss": 2.1461280822753905, "train/z_loss": 0.0013867271831259132, "train/perplexity": 8.551682799824976, "train/grad_norm": 0.1279296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028795.5893140251, "perf/iters_per_sec": 0.967405123383534, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0336930990219115, "data/tokens_consumed": 102175342592, "data/tokens_consumed_B": 102.175342592, "train/loss_slope": -1.6234092700957114e-05} {"step": 48730, "timestamp": 1778247212.0951831, "train/loss": 2.123884892463684, "train/z_loss": 0.0013905781903304159, "train/perplexity": 8.363566010804744, "train/grad_norm": 0.2236328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028632.6201801053, "perf/iters_per_sec": 0.9673274136448409, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0337761402130128, "data/tokens_consumed": 102196314112, "data/tokens_consumed_B": 102.196314112, "train/loss_slope": -1.4264372051066182e-05} {"step": 48740, "timestamp": 1778247222.438933, "train/loss": 2.1679470777511596, "train/z_loss": 0.0013889537192881107, "train/perplexity": 8.74032240779814, "train/grad_norm": 0.203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028870.3214996578, "perf/iters_per_sec": 0.9674407584665574, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0336550235748292, "data/tokens_consumed": 102217285632, "data/tokens_consumed_B": 102.217285632, "train/loss_slope": -1.6810532829405973e-05} {"step": 48750, "timestamp": 1778247232.7774928, "grad/layer_0/attn": 0.002803827403113246, "grad/layer_0/mlp": 0.003014607587829232, "grad/layer_0/attn_mlp_ratio": 0.9300803598534688, "grad/layer_4/attn": 0.003156746504828334, "grad/layer_4/mlp": 0.0025414482224732637, "grad/layer_4/attn_mlp_ratio": 1.2421053290417803, "grad/layer_8/attn": 0.003982647322118282, "grad/layer_8/mlp": 0.0036113152746111155, "grad/layer_8/attn_mlp_ratio": 1.1028245691632939, "grad/layer_12/attn": 0.005268822889775038, "grad/layer_12/mlp": 0.006485047284513712, "grad/layer_12/attn_mlp_ratio": 0.8124571151718927, "grad/layer_16/attn": 0.003752542892470956, "grad/layer_16/mlp": 0.004497963469475508, "grad/layer_16/attn_mlp_ratio": 0.8342759638910386, "grad/layer_20/attn": 0.004839941393584013, "grad/layer_20/mlp": 0.005688873119652271, "grad/layer_20/attn_mlp_ratio": 0.8507732914251606, "grad/layer_24/attn": 0.013572677038609982, "grad/layer_24/mlp": 0.011210087686777115, "grad/layer_24/attn_mlp_ratio": 1.2107556423081418, "grad/layer_27/attn": 0.0038434588350355625, "grad/layer_27/mlp": 0.011280611157417297, "grad/layer_27/attn_mlp_ratio": 0.3407137031256518} {"step": 48750, "timestamp": 1778247233.3716557, "eos/sharpness": 55.19802570343016, "eos/L0_probe": 1.9851868152618408, "eos/L_plus": 2.2559781074523926, "eos/L_minus": 2.266375780105591, "eos/grad_norm": 0.17034021019935608, "eos/embed_grad_frac": 0.08304008841514587, "eos/time_s": 0.5913131237030029} {"step": 48750, "timestamp": 1778247233.3917556, "train/loss": 2.1195034742355348, "train/z_loss": 0.0014031794155016542, "train/perplexity": 8.3270018900781, "train/grad_norm": 0.1708984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1915576.2517068451, "perf/iters_per_sec": 0.9134179361852861, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0947890996932983, "data/tokens_consumed": 102238257152, "data/tokens_consumed_B": 102.238257152, "train/loss_slope": -1.9082698698985172e-05} {"step": 48750, "timestamp": 1778247234.7526205, "geo/rankme_last": 438.67523193359375, "geo/layer_0/stable_rank_q_proj": 19.418554306030273, "geo/layer_0/stable_rank_k_proj": 16.387508392333984, "geo/layer_0/stable_rank_o_proj": 48.17811584472656, "geo/layer_0/stable_rank_gate_proj": 134.5614471435547, "geo/layer_0/stable_rank_down_proj": 53.634361267089844, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06197535619139671, "geo/layer_0/attn_entropy_mean": 6.186070442199707, "geo/layer_0/attn_entropy_std": 0.38923609256744385, "geo/layer_7/stable_rank_q_proj": 42.982418060302734, "geo/layer_7/stable_rank_k_proj": 42.05771255493164, "geo/layer_7/stable_rank_o_proj": 93.91476440429688, "geo/layer_7/stable_rank_gate_proj": 87.1398696899414, "geo/layer_7/stable_rank_down_proj": 145.35182189941406, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.48100632429122925, "geo/layer_7/attn_entropy_mean": 4.622610092163086, "geo/layer_7/attn_entropy_std": 0.7961847186088562, "geo/layer_14/stable_rank_q_proj": 52.613590240478516, "geo/layer_14/stable_rank_k_proj": 38.462158203125, "geo/layer_14/stable_rank_o_proj": 45.78856658935547, "geo/layer_14/stable_rank_gate_proj": 73.61555480957031, "geo/layer_14/stable_rank_down_proj": 131.3854217529297, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39132359623908997, "geo/layer_14/attn_entropy_mean": 5.555362701416016, "geo/layer_14/attn_entropy_std": 0.3731182813644409, "geo/layer_21/stable_rank_q_proj": 41.598777770996094, "geo/layer_21/stable_rank_k_proj": 30.54466438293457, "geo/layer_21/stable_rank_o_proj": 73.32794189453125, "geo/layer_21/stable_rank_gate_proj": 70.05562591552734, "geo/layer_21/stable_rank_down_proj": 53.70503234863281, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13960570096969604, "geo/layer_21/attn_entropy_mean": 5.707434177398682, "geo/layer_21/attn_entropy_std": 0.30740538239479065, "geo/layer_27/stable_rank_q_proj": 42.810752868652344, "geo/layer_27/stable_rank_k_proj": 31.67196273803711, "geo/layer_27/stable_rank_o_proj": 115.93253326416016, "geo/layer_27/stable_rank_gate_proj": 82.82150268554688, "geo/layer_27/stable_rank_down_proj": 130.27577209472656, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09611160308122635, "geo/layer_27/attn_entropy_mean": 4.231437683105469, "geo/layer_27/attn_entropy_std": 0.6775433421134949, "attnres/final_alpha/block_0": 0.24228768050670624, "attnres/block_norm/0": 1.7407363653182983, "attnres/final_alpha/block_1": 0.005000201985239983, "attnres/block_norm/1": 43054.7109375, "attnres/final_alpha/block_2": 0.010749952867627144, "attnres/block_norm/2": 27509.783203125, "attnres/final_alpha/block_3": 0.012483969330787659, "attnres/block_norm/3": 51719.30078125, "attnres/final_alpha/block_4": 0.015380174852907658, "attnres/block_norm/4": 13883.029296875, "attnres/final_alpha/block_5": 0.600405216217041, "attnres/block_norm/5": 6328.32666015625, "attnres/final_alpha/block_6": 0.11369280517101288, "attnres/block_norm/6": 34301.09765625, "geo/tier1_time_s": 1.3564867973327637, "geo/step": 48750.0, "geo/rankme_slope": 3.694319524684874e-05} {"step": 48760, "timestamp": 1778247245.1011689, "train/loss": 2.1337993502616883, "train/z_loss": 0.0013861684827134013, "train/perplexity": 8.446898649416456, "train/grad_norm": 0.21875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1791639.4672507616, "perf/iters_per_sec": 0.854320272088414, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1705212116241455, "data/tokens_consumed": 102259228672, "data/tokens_consumed_B": 102.259228672, "train/loss_slope": -1.972806209062905e-05} {"step": 48770, "timestamp": 1778247255.4471552, "train/loss": 2.129217767715454, "train/z_loss": 0.001391735696233809, "train/perplexity": 8.408287004754182, "train/grad_norm": 0.08935546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028441.704008609, "perf/iters_per_sec": 0.9672363777201696, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033873438835144, "data/tokens_consumed": 102280200192, "data/tokens_consumed_B": 102.280200192, "train/loss_slope": -2.253982045791399e-05} {"step": 48780, "timestamp": 1778247265.7931414, "train/loss": 2.1740521669387816, "train/z_loss": 0.0013901107478886843, "train/perplexity": 8.793846072755516, "train/grad_norm": 0.17578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027929.013968908, "perf/iters_per_sec": 0.9669919080585995, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034134817123413, "data/tokens_consumed": 102301171712, "data/tokens_consumed_B": 102.301171712, "train/loss_slope": -2.35523457073643e-05} {"step": 48790, "timestamp": 1778247276.1389744, "train/loss": 2.1098680019378664, "train/z_loss": 0.0013895643991418184, "train/perplexity": 8.247152604614486, "train/grad_norm": 0.1328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028471.3612570819, "perf/iters_per_sec": 0.9672505193982515, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033858323097229, "data/tokens_consumed": 102322143232, "data/tokens_consumed_B": 102.322143232, "train/loss_slope": -2.73746055058806e-05} {"step": 48800, "timestamp": 1778247286.4723089, "grad/layer_0/attn": 0.003105465555563569, "grad/layer_0/mlp": 0.003118742723017931, "grad/layer_0/attn_mlp_ratio": 0.9957427501375333, "grad/layer_4/attn": 0.0022299690172076225, "grad/layer_4/mlp": 0.0025665664579719305, "grad/layer_4/attn_mlp_ratio": 0.8688529858230978, "grad/layer_8/attn": 0.0053570386953651905, "grad/layer_8/mlp": 0.0035541679244488478, "grad/layer_8/attn_mlp_ratio": 1.5072553290993937, "grad/layer_12/attn": 0.004536164924502373, "grad/layer_12/mlp": 0.006416627671569586, "grad/layer_12/attn_mlp_ratio": 0.7069390785921764, "grad/layer_16/attn": 0.00338662788271904, "grad/layer_16/mlp": 0.004507858771830797, "grad/layer_16/attn_mlp_ratio": 0.7512719406283483, "grad/layer_20/attn": 0.0032240443397313356, "grad/layer_20/mlp": 0.005546849686652422, "grad/layer_20/attn_mlp_ratio": 0.5812388046796338, "grad/layer_24/attn": 0.005755349528044462, "grad/layer_24/mlp": 0.008042378351092339, "grad/layer_24/attn_mlp_ratio": 0.7156277913361258, "grad/layer_27/attn": 0.004123745020478964, "grad/layer_27/mlp": 0.007031932473182678, "grad/layer_27/attn_mlp_ratio": 0.5864312516598182} {"step": 48800, "timestamp": 1778247286.4865532, "train/loss": 2.1517166614532472, "train/z_loss": 0.0013966508209705352, "train/perplexity": 8.599608349384859, "train/grad_norm": 0.10888671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028099.9130245778, "perf/iters_per_sec": 0.9670733990786446, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340476751327514, "data/tokens_consumed": 102343114752, "data/tokens_consumed_B": 102.343114752, "train/loss_slope": -2.672212196834227e-05} {"step": 48810, "timestamp": 1778247296.8400464, "train/loss": 2.138075661659241, "train/z_loss": 0.001402406918350607, "train/perplexity": 8.483097562132818, "train/grad_norm": 0.126953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026870.736212914, "perf/iters_per_sec": 0.9664872819008418, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346747636795044, "data/tokens_consumed": 102364086272, "data/tokens_consumed_B": 102.364086272, "train/loss_slope": -2.7739540070626112e-05} {"step": 48820, "timestamp": 1778247307.1822715, "train/loss": 2.2015491485595704, "train/z_loss": 0.0013787593226879836, "train/perplexity": 9.039005421081283, "train/grad_norm": 0.12060546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028670.7983694605, "perf/iters_per_sec": 0.9673456184241583, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033756685256958, "data/tokens_consumed": 102385057792, "data/tokens_consumed_B": 102.385057792, "train/loss_slope": -2.5498510794778303e-05} {"step": 48825, "timestamp": 1778247312.935845, "eos/sharpness": 79.6842336654663, "eos/L0_probe": 1.9832097291946411, "eos/L_plus": 2.32049822807312, "eos/L_minus": 2.442763566970825, "eos/grad_norm": 0.2523513436317444, "eos/embed_grad_frac": 0.036345500499010086, "eos/time_s": 0.5923583507537842} {"step": 48825, "timestamp": 1778247314.310414, "geo/rankme_last": 439.53662109375, "geo/layer_0/stable_rank_q_proj": 19.437219619750977, "geo/layer_0/stable_rank_k_proj": 16.38208770751953, "geo/layer_0/stable_rank_o_proj": 48.167457580566406, "geo/layer_0/stable_rank_gate_proj": 134.52731323242188, "geo/layer_0/stable_rank_down_proj": 53.61068344116211, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05902039259672165, "geo/layer_0/attn_entropy_mean": 6.187917709350586, "geo/layer_0/attn_entropy_std": 0.3932681679725647, "geo/layer_7/stable_rank_q_proj": 42.95816421508789, "geo/layer_7/stable_rank_k_proj": 42.09575653076172, "geo/layer_7/stable_rank_o_proj": 94.19829559326172, "geo/layer_7/stable_rank_gate_proj": 87.1739501953125, "geo/layer_7/stable_rank_down_proj": 145.3516387939453, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4623275697231293, "geo/layer_7/attn_entropy_mean": 4.646078109741211, "geo/layer_7/attn_entropy_std": 0.7875264286994934, "geo/layer_14/stable_rank_q_proj": 52.61675262451172, "geo/layer_14/stable_rank_k_proj": 38.46791458129883, "geo/layer_14/stable_rank_o_proj": 45.784095764160156, "geo/layer_14/stable_rank_gate_proj": 73.6068344116211, "geo/layer_14/stable_rank_down_proj": 131.30477905273438, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38569650053977966, "geo/layer_14/attn_entropy_mean": 5.505577564239502, "geo/layer_14/attn_entropy_std": 0.36784622073173523, "geo/layer_21/stable_rank_q_proj": 41.601318359375, "geo/layer_21/stable_rank_k_proj": 30.500959396362305, "geo/layer_21/stable_rank_o_proj": 73.37295532226562, "geo/layer_21/stable_rank_gate_proj": 70.09990692138672, "geo/layer_21/stable_rank_down_proj": 53.71802520751953, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14712683856487274, "geo/layer_21/attn_entropy_mean": 5.6949687004089355, "geo/layer_21/attn_entropy_std": 0.2986554801464081, "geo/layer_27/stable_rank_q_proj": 42.874881744384766, "geo/layer_27/stable_rank_k_proj": 31.844390869140625, "geo/layer_27/stable_rank_o_proj": 116.01931762695312, "geo/layer_27/stable_rank_gate_proj": 82.84285736083984, "geo/layer_27/stable_rank_down_proj": 130.25575256347656, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09410904347896576, "geo/layer_27/attn_entropy_mean": 4.238163948059082, "geo/layer_27/attn_entropy_std": 0.6932249665260315, "attnres/final_alpha/block_0": 0.24232374131679535, "attnres/block_norm/0": 1.7409836053848267, "attnres/final_alpha/block_1": 0.004993709269911051, "attnres/block_norm/1": 43216.4609375, "attnres/final_alpha/block_2": 0.010610142722725868, "attnres/block_norm/2": 27493.517578125, "attnres/final_alpha/block_3": 0.012340638786554337, "attnres/block_norm/3": 52406.09375, "attnres/final_alpha/block_4": 0.015266729518771172, "attnres/block_norm/4": 13894.7734375, "attnres/final_alpha/block_5": 0.5997781157493591, "attnres/block_norm/5": 6312.56640625, "attnres/final_alpha/block_6": 0.11468692868947983, "attnres/block_norm/6": 33922.3515625, "geo/tier1_time_s": 1.3564083576202393, "geo/step": 48825.0, "geo/rankme_slope": 5.18996074992497e-05} {"step": 48830, "timestamp": 1778247319.4832754, "train/loss": 2.1517958760261537, "train/z_loss": 0.0014005811419337988, "train/perplexity": 8.600289590669181, "train/grad_norm": 0.18359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1705906.0714327618, "perf/iters_per_sec": 0.8134394032634553, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2293478727340699, "data/tokens_consumed": 102406029312, "data/tokens_consumed_B": 102.406029312, "train/loss_slope": -2.3446773502728637e-05} {"step": 48840, "timestamp": 1778247329.820659, "train/loss": 2.1455071210861205, "train/z_loss": 0.0014025225187651812, "train/perplexity": 8.546374185095774, "train/grad_norm": 0.1904296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2030101.7849423478, "perf/iters_per_sec": 0.9680279659950007, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03302800655365, "data/tokens_consumed": 102427000832, "data/tokens_consumed_B": 102.427000832, "train/loss_slope": -2.6522005938424904e-05} {"step": 48850, "timestamp": 1778247340.1504242, "grad/layer_0/attn": 0.002515100874006748, "grad/layer_0/mlp": 0.002784101292490959, "grad/layer_0/attn_mlp_ratio": 0.9033797694258778, "grad/layer_4/attn": 0.0021113655529916286, "grad/layer_4/mlp": 0.0024769906885921955, "grad/layer_4/attn_mlp_ratio": 0.8523913624207012, "grad/layer_8/attn": 0.004245022311806679, "grad/layer_8/mlp": 0.0034929339308291674, "grad/layer_8/attn_mlp_ratio": 1.2153170584784785, "grad/layer_12/attn": 0.005170547869056463, "grad/layer_12/mlp": 0.0068833292461931705, "grad/layer_12/attn_mlp_ratio": 0.7511696170569025, "grad/layer_16/attn": 0.004054743330925703, "grad/layer_16/mlp": 0.004610701464116573, "grad/layer_16/attn_mlp_ratio": 0.8794200350077541, "grad/layer_20/attn": 0.005503664258867502, "grad/layer_20/mlp": 0.006342959590256214, "grad/layer_20/attn_mlp_ratio": 0.8676807874598349, "grad/layer_24/attn": 0.006140948738902807, "grad/layer_24/mlp": 0.008434380404651165, "grad/layer_24/attn_mlp_ratio": 0.7280853330622638, "grad/layer_27/attn": 0.005702735856175423, "grad/layer_27/mlp": 0.007112137507647276, "grad/layer_27/attn_mlp_ratio": 0.8018314845375878} {"step": 48850, "timestamp": 1778247340.1646125, "train/loss": 2.1433252096176147, "train/z_loss": 0.00140330798458308, "train/perplexity": 8.527747081982023, "train/grad_norm": 0.111328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028362.0921508174, "perf/iters_per_sec": 0.9671984158281409, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0339140176773072, "data/tokens_consumed": 102447972352, "data/tokens_consumed_B": 102.447972352, "train/loss_slope": -2.678913483561029e-05} {"step": 48860, "timestamp": 1778247350.5060925, "train/loss": 2.1391252994537355, "train/z_loss": 0.0013972694985568523, "train/perplexity": 8.492006416665692, "train/grad_norm": 0.140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029081.1174517816, "perf/iters_per_sec": 0.9675412738093289, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0335476398468018, "data/tokens_consumed": 102468943872, "data/tokens_consumed_B": 102.468943872, "train/loss_slope": -2.9106561936596748e-05} {"step": 48870, "timestamp": 1778247360.848148, "train/loss": 2.112969529628754, "train/z_loss": 0.0014037965098395943, "train/perplexity": 8.272771084464644, "train/grad_norm": 0.20703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029205.2566952107, "perf/iters_per_sec": 0.9676004680133871, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033484411239624, "data/tokens_consumed": 102489915392, "data/tokens_consumed_B": 102.489915392, "train/loss_slope": -3.349222013838036e-05} {"step": 48880, "timestamp": 1778247371.1916134, "train/loss": 2.1334262609481813, "train/z_loss": 0.0013901272788643837, "train/perplexity": 8.443747789610692, "train/grad_norm": 0.154296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028426.501496629, "perf/iters_per_sec": 0.9672291285975595, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0338811874389648, "data/tokens_consumed": 102510886912, "data/tokens_consumed_B": 102.510886912, "train/loss_slope": -3.24053521990382e-05} {"step": 48890, "timestamp": 1778247381.5311823, "train/loss": 2.1559624910354613, "train/z_loss": 0.0013906627078540622, "train/perplexity": 8.636198443594676, "train/grad_norm": 0.1630859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029611.6240341594, "perf/iters_per_sec": 0.9677942390604779, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0332774877548219, "data/tokens_consumed": 102531858432, "data/tokens_consumed_B": 102.531858432, "train/loss_slope": -3.228989962232454e-05} {"step": 48900, "timestamp": 1778247391.8690999, "grad/layer_0/attn": 0.002519908593967557, "grad/layer_0/mlp": 0.0027969207148998976, "grad/layer_0/attn_mlp_ratio": 0.9009581467388611, "grad/layer_4/attn": 0.0030944293830543756, "grad/layer_4/mlp": 0.0024546263739466667, "grad/layer_4/attn_mlp_ratio": 1.260651840880296, "grad/layer_8/attn": 0.010776977986097336, "grad/layer_8/mlp": 0.003703781170770526, "grad/layer_8/attn_mlp_ratio": 2.9097231176006564, "grad/layer_12/attn": 0.004452280700206757, "grad/layer_12/mlp": 0.006821182556450367, "grad/layer_12/attn_mlp_ratio": 0.6527138950012594, "grad/layer_16/attn": 0.003946961835026741, "grad/layer_16/mlp": 0.004999011289328337, "grad/layer_16/attn_mlp_ratio": 0.7895484782156201, "grad/layer_20/attn": 0.005274011753499508, "grad/layer_20/mlp": 0.006151377223432064, "grad/layer_20/attn_mlp_ratio": 0.8573708742283682, "grad/layer_24/attn": 0.015572023577988148, "grad/layer_24/mlp": 0.01116180419921875, "grad/layer_24/attn_mlp_ratio": 1.3951170581872756, "grad/layer_27/attn": 0.008371016010642052, "grad/layer_27/mlp": 0.010590998455882072, "grad/layer_27/attn_mlp_ratio": 0.7903896848322128} {"step": 48900, "timestamp": 1778247392.461858, "eos/sharpness": 74.48372840881346, "eos/L0_probe": 1.9849672317504883, "eos/L_plus": 2.301365375518799, "eos/L_minus": 2.4134063720703125, "eos/grad_norm": 0.20563830435276031, "eos/embed_grad_frac": 0.05218469351530075, "eos/time_s": 0.59002685546875} {"step": 48900, "timestamp": 1778247392.4796908, "train/loss": 2.172772932052612, "train/z_loss": 0.0013904845109209418, "train/perplexity": 8.782603870317523, "train/grad_norm": 0.205078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1916729.0328182117, "perf/iters_per_sec": 0.9139676250544604, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0941306591033935, "data/tokens_consumed": 102552829952, "data/tokens_consumed_B": 102.552829952, "train/loss_slope": -3.052319264290316e-05} {"step": 48900, "timestamp": 1778247393.8423028, "geo/rankme_last": 439.1494445800781, "geo/layer_0/stable_rank_q_proj": 19.440221786499023, "geo/layer_0/stable_rank_k_proj": 16.372278213500977, "geo/layer_0/stable_rank_o_proj": 48.10511779785156, "geo/layer_0/stable_rank_gate_proj": 134.03790283203125, "geo/layer_0/stable_rank_down_proj": 53.61552429199219, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06295587867498398, "geo/layer_0/attn_entropy_mean": 6.191600322723389, "geo/layer_0/attn_entropy_std": 0.38503751158714294, "geo/layer_7/stable_rank_q_proj": 42.894771575927734, "geo/layer_7/stable_rank_k_proj": 41.95458984375, "geo/layer_7/stable_rank_o_proj": 94.28271484375, "geo/layer_7/stable_rank_gate_proj": 87.29287719726562, "geo/layer_7/stable_rank_down_proj": 145.3003387451172, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4886244833469391, "geo/layer_7/attn_entropy_mean": 4.669891357421875, "geo/layer_7/attn_entropy_std": 0.8067498207092285, "geo/layer_14/stable_rank_q_proj": 52.54664611816406, "geo/layer_14/stable_rank_k_proj": 38.56572723388672, "geo/layer_14/stable_rank_o_proj": 45.69672775268555, "geo/layer_14/stable_rank_gate_proj": 73.57290649414062, "geo/layer_14/stable_rank_down_proj": 131.43260192871094, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3988279700279236, "geo/layer_14/attn_entropy_mean": 5.53566312789917, "geo/layer_14/attn_entropy_std": 0.37435007095336914, "geo/layer_21/stable_rank_q_proj": 41.61883544921875, "geo/layer_21/stable_rank_k_proj": 30.425460815429688, "geo/layer_21/stable_rank_o_proj": 73.40391540527344, "geo/layer_21/stable_rank_gate_proj": 70.02694702148438, "geo/layer_21/stable_rank_down_proj": 53.65892028808594, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.146560400724411, "geo/layer_21/attn_entropy_mean": 5.713683605194092, "geo/layer_21/attn_entropy_std": 0.2947591543197632, "geo/layer_27/stable_rank_q_proj": 42.836917877197266, "geo/layer_27/stable_rank_k_proj": 31.801158905029297, "geo/layer_27/stable_rank_o_proj": 116.00747680664062, "geo/layer_27/stable_rank_gate_proj": 82.7873306274414, "geo/layer_27/stable_rank_down_proj": 130.13706970214844, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08635794371366501, "geo/layer_27/attn_entropy_mean": 4.256467342376709, "geo/layer_27/attn_entropy_std": 0.6999090909957886, "attnres/final_alpha/block_0": 0.24200822412967682, "attnres/block_norm/0": 1.7410902976989746, "attnres/final_alpha/block_1": 0.004992153495550156, "attnres/block_norm/1": 43160.05859375, "attnres/final_alpha/block_2": 0.010712074115872383, "attnres/block_norm/2": 27585.0, "attnres/final_alpha/block_3": 0.012310167774558067, "attnres/block_norm/3": 52135.45703125, "attnres/final_alpha/block_4": 0.015179071575403214, "attnres/block_norm/4": 13928.740234375, "attnres/final_alpha/block_5": 0.5992729663848877, "attnres/block_norm/5": 6308.09765625, "attnres/final_alpha/block_6": 0.11552534997463226, "attnres/block_norm/6": 33939.83203125, "geo/tier1_time_s": 1.3582561016082764, "geo/step": 48900.0, "geo/rankme_slope": 4.8667709271208484e-05} {"step": 48910, "timestamp": 1778247404.2037127, "train/loss": 2.1511203408241273, "train/z_loss": 0.0013930529938079416, "train/perplexity": 8.594481754222912, "train/grad_norm": 0.1953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1789401.8774297088, "perf/iters_per_sec": 0.853253306116919, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1719849109649658, "data/tokens_consumed": 102573801472, "data/tokens_consumed_B": 102.573801472, "train/loss_slope": -3.089781421961238e-05} {"step": 48920, "timestamp": 1778247414.5490687, "train/loss": 2.1719254732131956, "train/z_loss": 0.001392606575973332, "train/perplexity": 8.775164127917519, "train/grad_norm": 0.12109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028601.6014593232, "perf/iters_per_sec": 0.967312622766172, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0337919473648072, "data/tokens_consumed": 102594772992, "data/tokens_consumed_B": 102.594772992, "train/loss_slope": -2.4732088398869015e-05} {"step": 48930, "timestamp": 1778247424.8940115, "train/loss": 2.1318673849105836, "train/z_loss": 0.0013887560227885843, "train/perplexity": 8.430595287737436, "train/grad_norm": 0.14453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028661.2536874034, "perf/iters_per_sec": 0.9673410671650903, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0337615489959717, "data/tokens_consumed": 102615744512, "data/tokens_consumed_B": 102.615744512, "train/loss_slope": -2.4541529863759827e-05} {"step": 48940, "timestamp": 1778247435.2566285, "train/loss": 2.1926130771636965, "train/z_loss": 0.001386472606100142, "train/perplexity": 8.958592048217227, "train/grad_norm": 0.12158203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024790.9089893734, "perf/iters_per_sec": 0.9654955429980151, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0357375621795655, "data/tokens_consumed": 102636716032, "data/tokens_consumed_B": 102.636716032, "train/loss_slope": -2.034948807809933e-05} {"step": 48950, "timestamp": 1778247445.586412, "grad/layer_0/attn": 0.00311396736651659, "grad/layer_0/mlp": 0.0034632442984730005, "grad/layer_0/attn_mlp_ratio": 0.8991474491056963, "grad/layer_4/attn": 0.0028096348978579044, "grad/layer_4/mlp": 0.002526764525100589, "grad/layer_4/attn_mlp_ratio": 1.111949593542396, "grad/layer_8/attn": 0.00401638587936759, "grad/layer_8/mlp": 0.0037609993014484644, "grad/layer_8/attn_mlp_ratio": 1.0679038868819732, "grad/layer_12/attn": 0.005422526970505714, "grad/layer_12/mlp": 0.006456397008150816, "grad/layer_12/attn_mlp_ratio": 0.8398688741837298, "grad/layer_16/attn": 0.006677321158349514, "grad/layer_16/mlp": 0.004998817108571529, "grad/layer_16/attn_mlp_ratio": 1.3357802215491767, "grad/layer_20/attn": 0.005844517145305872, "grad/layer_20/mlp": 0.007926599122583866, "grad/layer_20/attn_mlp_ratio": 0.7373297149493968, "grad/layer_24/attn": 0.0230299923568964, "grad/layer_24/mlp": 0.015840424224734306, "grad/layer_24/attn_mlp_ratio": 1.453874712240872, "grad/layer_27/attn": 0.011367464438080788, "grad/layer_27/mlp": 0.017223360016942024, "grad/layer_27/attn_mlp_ratio": 0.6600027149695955} {"step": 48950, "timestamp": 1778247445.6006446, "train/loss": 2.182556939125061, "train/z_loss": 0.0013797307969070972, "train/perplexity": 8.868954668268836, "train/grad_norm": 0.322265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028780.1007395375, "perf/iters_per_sec": 0.9673977378556907, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03370099067688, "data/tokens_consumed": 102657687552, "data/tokens_consumed_B": 102.657687552, "train/loss_slope": -1.7190331944895663e-05} {"step": 48960, "timestamp": 1778247455.9506958, "train/loss": 2.1374353170394897, "train/z_loss": 0.0013873470248654484, "train/perplexity": 8.477667195088797, "train/grad_norm": 0.232421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027633.7607980845, "perf/iters_per_sec": 0.9668511203756736, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342854022979737, "data/tokens_consumed": 102678659072, "data/tokens_consumed_B": 102.678659072, "train/loss_slope": -1.920771163896936e-05} {"step": 48970, "timestamp": 1778247466.2867165, "train/loss": 2.1606119751930235, "train/z_loss": 0.0013889356516301632, "train/perplexity": 8.676445803666654, "train/grad_norm": 0.130859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029976.0836932103, "perf/iters_per_sec": 0.9679680269685794, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0330919742584228, "data/tokens_consumed": 102699630592, "data/tokens_consumed_B": 102.699630592, "train/loss_slope": -1.5625151456242876e-05} {"step": 48975, "timestamp": 1778247472.0400157, "eos/sharpness": 90.34276008605956, "eos/L0_probe": 1.9875246286392212, "eos/L_plus": 2.568341016769409, "eos/L_minus": 2.310135841369629, "eos/grad_norm": 0.3569929599761963, "eos/embed_grad_frac": 0.018226109445095062, "eos/time_s": 0.5868985652923584} {"step": 48975, "timestamp": 1778247473.418631, "geo/rankme_last": 439.0571594238281, "geo/layer_0/stable_rank_q_proj": 19.41013526916504, "geo/layer_0/stable_rank_k_proj": 16.393667221069336, "geo/layer_0/stable_rank_o_proj": 48.05699157714844, "geo/layer_0/stable_rank_gate_proj": 133.9056396484375, "geo/layer_0/stable_rank_down_proj": 53.65589141845703, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.061382222920656204, "geo/layer_0/attn_entropy_mean": 6.193943977355957, "geo/layer_0/attn_entropy_std": 0.38849589228630066, "geo/layer_7/stable_rank_q_proj": 42.82093048095703, "geo/layer_7/stable_rank_k_proj": 41.8758544921875, "geo/layer_7/stable_rank_o_proj": 94.15482330322266, "geo/layer_7/stable_rank_gate_proj": 87.1912612915039, "geo/layer_7/stable_rank_down_proj": 145.21946716308594, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.475628525018692, "geo/layer_7/attn_entropy_mean": 4.642190933227539, "geo/layer_7/attn_entropy_std": 0.8005601763725281, "geo/layer_14/stable_rank_q_proj": 52.59004592895508, "geo/layer_14/stable_rank_k_proj": 38.58253479003906, "geo/layer_14/stable_rank_o_proj": 45.698490142822266, "geo/layer_14/stable_rank_gate_proj": 73.58470916748047, "geo/layer_14/stable_rank_down_proj": 130.9734649658203, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.398596853017807, "geo/layer_14/attn_entropy_mean": 5.5256805419921875, "geo/layer_14/attn_entropy_std": 0.3657819628715515, "geo/layer_21/stable_rank_q_proj": 41.65058898925781, "geo/layer_21/stable_rank_k_proj": 30.412147521972656, "geo/layer_21/stable_rank_o_proj": 73.25202178955078, "geo/layer_21/stable_rank_gate_proj": 69.95075225830078, "geo/layer_21/stable_rank_down_proj": 53.658729553222656, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14314663410186768, "geo/layer_21/attn_entropy_mean": 5.727081298828125, "geo/layer_21/attn_entropy_std": 0.3046836256980896, "geo/layer_27/stable_rank_q_proj": 42.79690933227539, "geo/layer_27/stable_rank_k_proj": 31.852924346923828, "geo/layer_27/stable_rank_o_proj": 115.96826171875, "geo/layer_27/stable_rank_gate_proj": 82.79914855957031, "geo/layer_27/stable_rank_down_proj": 130.10104370117188, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09097109735012054, "geo/layer_27/attn_entropy_mean": 4.267421245574951, "geo/layer_27/attn_entropy_std": 0.6862412095069885, "attnres/final_alpha/block_0": 0.23790234327316284, "attnres/block_norm/0": 1.741129755973816, "attnres/final_alpha/block_1": 0.004830050282180309, "attnres/block_norm/1": 43300.8125, "attnres/final_alpha/block_2": 0.01040974073112011, "attnres/block_norm/2": 27401.265625, "attnres/final_alpha/block_3": 0.01227286085486412, "attnres/block_norm/3": 51874.6171875, "attnres/final_alpha/block_4": 0.014859957620501518, "attnres/block_norm/4": 13917.3671875, "attnres/final_alpha/block_5": 0.6078426241874695, "attnres/block_norm/5": 6233.35986328125, "attnres/final_alpha/block_6": 0.11188247054815292, "attnres/block_norm/6": 34274.390625, "geo/tier1_time_s": 1.3585598468780518, "geo/step": 48975.0, "geo/rankme_slope": 4.3584797200130055e-05} {"step": 48980, "timestamp": 1778247478.5900605, "train/loss": 2.167733883857727, "train/z_loss": 0.0014039989095181228, "train/perplexity": 8.738459223051029, "train/grad_norm": 0.2294921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1705492.5863060087, "perf/iters_per_sec": 0.8132422381906551, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2296459197998046, "data/tokens_consumed": 102720602112, "data/tokens_consumed_B": 102.720602112, "train/loss_slope": -1.3490112522433102e-05} {"step": 48990, "timestamp": 1778247488.9301476, "train/loss": 2.1395710945129394, "train/z_loss": 0.0013858538586646318, "train/perplexity": 8.495792955116327, "train/grad_norm": 0.177734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029174.5481381044, "perf/iters_per_sec": 0.967585825032284, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0335000514984132, "data/tokens_consumed": 102741573632, "data/tokens_consumed_B": 102.741573632, "train/loss_slope": -1.4727421054388e-05} {"step": 49000, "timestamp": 1778247499.2582839, "grad/layer_0/attn": 0.002573662903159857, "grad/layer_0/mlp": 0.002859539818018675, "grad/layer_0/attn_mlp_ratio": 0.9000269193455088, "grad/layer_4/attn": 0.001861509052105248, "grad/layer_4/mlp": 0.0026073637418448925, "grad/layer_4/attn_mlp_ratio": 0.713942957338896, "grad/layer_8/attn": 0.007041033823043108, "grad/layer_8/mlp": 0.003685890231281519, "grad/layer_8/attn_mlp_ratio": 1.9102667714465245, "grad/layer_12/attn": 0.004568429663777351, "grad/layer_12/mlp": 0.006507056765258312, "grad/layer_12/attn_mlp_ratio": 0.7020731120652344, "grad/layer_16/attn": 0.004804614465683699, "grad/layer_16/mlp": 0.004553286358714104, "grad/layer_16/attn_mlp_ratio": 1.0551970558515162, "grad/layer_20/attn": 0.004062366206198931, "grad/layer_20/mlp": 0.005826849490404129, "grad/layer_20/attn_mlp_ratio": 0.6971805506853971, "grad/layer_24/attn": 0.01119224913418293, "grad/layer_24/mlp": 0.01067524403333664, "grad/layer_24/attn_mlp_ratio": 1.0484302742296814, "grad/layer_27/attn": 0.007561301346868277, "grad/layer_27/mlp": 0.009492933750152588, "grad/layer_27/attn_mlp_ratio": 0.7965189125116191} {"step": 49000, "timestamp": 1778247499.2725766, "train/loss": 2.163583207130432, "train/z_loss": 0.0013689429382793606, "train/perplexity": 8.702263873284245, "train/grad_norm": 0.185546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029005.3868665227, "perf/iters_per_sec": 0.9675051626522649, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0335862159729003, "data/tokens_consumed": 102762545152, "data/tokens_consumed_B": 102.762545152, "train/loss_slope": -1.420468676744765e-05} {"step": 49000, "timestamp": 1778247506.5127661, "geo/ww_alpha_mean": 7.527917521081951, "geo/ww_alpha_std": 4.48414317350431, "geo/ww_alpha_min": 1.3501355525449752, "geo/ww_alpha_max": 38.4730524667015, "geo/ww_alpha_healthy_frac": 0.17766497461928935, "geo/ww_alpha_by_type/q_proj": 3.9312713374206636, "geo/ww_alpha_by_type/k_proj": 4.489584776247261, "geo/ww_alpha_by_type/v_proj": 7.315133762595186, "geo/ww_alpha_by_type/o_proj": 8.213360633643555, "geo/ww_alpha_by_type/gate_proj": 7.767622402536758, "geo/ww_alpha_by_type/up_proj": 13.003070109980966, "geo/ww_alpha_by_type/down_proj": 8.072607098352378, "geo/twonn_id/layer_0": 0.6989201903343201, "geo/twonn_id/layer_7": 3.260753870010376, "geo/twonn_id/layer_14": 3.749396800994873, "geo/twonn_id/layer_21": 7.493835926055908, "geo/twonn_id/layer_27": 5.883780002593994, "geo/tier2_time_s": 7.232291221618652} {"step": 49000, "timestamp": 1778247507.266815, "eoc/jacobian_sigma/layer_0/attn": 1067.878173828125, "eoc/jacobian_sigma/layer_0/mlp": 8362.5078125, "eoc/jacobian_sigma/layer_0": 8362.5078125, "eoc/jacobian_sigma/layer_7/attn": 1.1652195453643799, "eoc/jacobian_sigma/layer_7/mlp": 1.737341284751892, "eoc/jacobian_sigma/layer_7": 1.737341284751892, "eoc/jacobian_sigma/layer_14/attn": 1.579390287399292, "eoc/jacobian_sigma/layer_14/mlp": 6.563382625579834, "eoc/jacobian_sigma/layer_14": 6.563382625579834, "eoc/jacobian_sigma/layer_21/attn": 1.0931148529052734, "eoc/jacobian_sigma/layer_21/mlp": 3.7548208236694336, "eoc/jacobian_sigma/layer_21": 3.7548208236694336, "eoc/jacobian_sigma/layer_27/attn": 3.5412063598632812, "eoc/jacobian_sigma/layer_27/mlp": 30.01293182373047, "eoc/jacobian_sigma/layer_27": 30.01293182373047, "eoc/layer0_sigma": 8362.5078125, "eoc/sigma_max": 30.01293182373047, "eoc/sigma_min": 1.737341284751892, "eoc/sigma_mean": 10.517119139432907, "eoc/time_s": 0.7472162246704102} {"step": 49010, "timestamp": 1778247517.62472, "train/loss": 2.1642854690551756, "train/z_loss": 0.0013829793315380812, "train/perplexity": 8.708377288219548, "train/grad_norm": 0.0986328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1143144.3323771101, "perf/iters_per_sec": 0.5450936948667098, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.8345469951629638, "data/tokens_consumed": 102783516672, "data/tokens_consumed_B": 102.783516672, "train/loss_slope": -1.2233233637828373e-05} {"step": 49020, "timestamp": 1778247527.9704366, "train/loss": 2.1050444841384888, "train/z_loss": 0.001406153046991676, "train/perplexity": 8.207468103623066, "train/grad_norm": 0.095703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028136.0135821882, "perf/iters_per_sec": 0.9670906131659451, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340292692184447, "data/tokens_consumed": 102804488192, "data/tokens_consumed_B": 102.804488192, "train/loss_slope": -1.4109390239046042e-05} {"step": 49030, "timestamp": 1778247538.3107138, "train/loss": 2.1148818135261536, "train/z_loss": 0.0013965879799798131, "train/perplexity": 8.288606107100243, "train/grad_norm": 0.10205078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029690.5846515433, "perf/iters_per_sec": 0.9678318904168812, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0332372903823852, "data/tokens_consumed": 102825459712, "data/tokens_consumed_B": 102.825459712, "train/loss_slope": -1.5694972469468328e-05} {"step": 49040, "timestamp": 1778247548.6554585, "train/loss": 2.167775535583496, "train/z_loss": 0.0013905453961342573, "train/perplexity": 8.738823202538365, "train/grad_norm": 0.1787109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028245.7260137864, "perf/iters_per_sec": 0.967142928130048, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0339733362197876, "data/tokens_consumed": 102846431232, "data/tokens_consumed_B": 102.846431232, "train/loss_slope": -1.4910092827367462e-05} {"step": 49050, "timestamp": 1778247558.9927907, "grad/layer_0/attn": 0.0028143133968114853, "grad/layer_0/mlp": 0.002879129024222493, "grad/layer_0/attn_mlp_ratio": 0.9774877316665985, "grad/layer_4/attn": 0.0024590399116277695, "grad/layer_4/mlp": 0.0024081217125058174, "grad/layer_4/attn_mlp_ratio": 1.02114432038177, "grad/layer_8/attn": 0.00309631135314703, "grad/layer_8/mlp": 0.003432599361985922, "grad/layer_8/attn_mlp_ratio": 0.9020310663789671, "grad/layer_12/attn": 0.003562720026820898, "grad/layer_12/mlp": 0.006620804313570261, "grad/layer_12/attn_mlp_ratio": 0.5381098434985645, "grad/layer_16/attn": 0.00443795882165432, "grad/layer_16/mlp": 0.0044209593906998634, "grad/layer_16/attn_mlp_ratio": 1.003845167771887, "grad/layer_20/attn": 0.0045310575515031815, "grad/layer_20/mlp": 0.005780672188848257, "grad/layer_20/attn_mlp_ratio": 0.7838288221673176, "grad/layer_24/attn": 0.008377556689083576, "grad/layer_24/mlp": 0.007854885421693325, "grad/layer_24/attn_mlp_ratio": 1.0665409019579926, "grad/layer_27/attn": 0.004739259369671345, "grad/layer_27/mlp": 0.006784926168620586, "grad/layer_27/attn_mlp_ratio": 0.6984982860594686} {"step": 49050, "timestamp": 1778247559.6004565, "eos/sharpness": 33.24561119079589, "eos/L0_probe": 1.9842325448989868, "eos/L_plus": 2.171043872833252, "eos/L_minus": 2.1298773288726807, "eos/grad_norm": 0.102897509932518, "eos/embed_grad_frac": 0.19157099723815918, "eos/time_s": 0.604790449142456} {"step": 49050, "timestamp": 1778247559.6212828, "train/loss": 2.1827293634414673, "train/z_loss": 0.001378746796399355, "train/perplexity": 8.870484023559982, "train/grad_norm": 0.10302734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1914500.4007127108, "perf/iters_per_sec": 0.9129049304545931, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.095404314994812, "data/tokens_consumed": 102867402752, "data/tokens_consumed_B": 102.867402752, "train/loss_slope": -1.3315264233733096e-05} {"step": 49050, "timestamp": 1778247560.9836223, "geo/rankme_last": 439.1461181640625, "geo/layer_0/stable_rank_q_proj": 19.416318893432617, "geo/layer_0/stable_rank_k_proj": 16.411758422851562, "geo/layer_0/stable_rank_o_proj": 47.96255874633789, "geo/layer_0/stable_rank_gate_proj": 134.0005340576172, "geo/layer_0/stable_rank_down_proj": 53.60960006713867, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.059818267822265625, "geo/layer_0/attn_entropy_mean": 6.189984321594238, "geo/layer_0/attn_entropy_std": 0.39070919156074524, "geo/layer_7/stable_rank_q_proj": 42.82084274291992, "geo/layer_7/stable_rank_k_proj": 41.90407943725586, "geo/layer_7/stable_rank_o_proj": 94.31946563720703, "geo/layer_7/stable_rank_gate_proj": 87.25892639160156, "geo/layer_7/stable_rank_down_proj": 145.01373291015625, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.47096624970436096, "geo/layer_7/attn_entropy_mean": 4.639248371124268, "geo/layer_7/attn_entropy_std": 0.8042042255401611, "geo/layer_14/stable_rank_q_proj": 52.56758117675781, "geo/layer_14/stable_rank_k_proj": 38.61888885498047, "geo/layer_14/stable_rank_o_proj": 45.62066650390625, "geo/layer_14/stable_rank_gate_proj": 73.61246490478516, "geo/layer_14/stable_rank_down_proj": 130.70718383789062, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3823305666446686, "geo/layer_14/attn_entropy_mean": 5.511545181274414, "geo/layer_14/attn_entropy_std": 0.3793579041957855, "geo/layer_21/stable_rank_q_proj": 41.58903884887695, "geo/layer_21/stable_rank_k_proj": 30.344646453857422, "geo/layer_21/stable_rank_o_proj": 73.23133087158203, "geo/layer_21/stable_rank_gate_proj": 69.96424865722656, "geo/layer_21/stable_rank_down_proj": 53.64252471923828, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15008589625358582, "geo/layer_21/attn_entropy_mean": 5.711190223693848, "geo/layer_21/attn_entropy_std": 0.2960090637207031, "geo/layer_27/stable_rank_q_proj": 42.779296875, "geo/layer_27/stable_rank_k_proj": 31.787273406982422, "geo/layer_27/stable_rank_o_proj": 116.10287475585938, "geo/layer_27/stable_rank_gate_proj": 82.82416534423828, "geo/layer_27/stable_rank_down_proj": 130.02737426757812, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09418502449989319, "geo/layer_27/attn_entropy_mean": 4.25098180770874, "geo/layer_27/attn_entropy_std": 0.6943208575248718, "attnres/final_alpha/block_0": 0.2394566833972931, "attnres/block_norm/0": 1.7412705421447754, "attnres/final_alpha/block_1": 0.004897288512438536, "attnres/block_norm/1": 43211.3671875, "attnres/final_alpha/block_2": 0.010601047426462173, "attnres/block_norm/2": 27422.673828125, "attnres/final_alpha/block_3": 0.012351813726127148, "attnres/block_norm/3": 51877.22265625, "attnres/final_alpha/block_4": 0.014877772890031338, "attnres/block_norm/4": 13954.169921875, "attnres/final_alpha/block_5": 0.6048872470855713, "attnres/block_norm/5": 6267.58935546875, "attnres/final_alpha/block_6": 0.11292806267738342, "attnres/block_norm/6": 34067.1484375, "geo/tier1_time_s": 1.3589363098144531, "geo/step": 49050.0, "geo/rankme_slope": 2.500508015706283e-05} {"step": 49060, "timestamp": 1778247571.3216739, "train/loss": 2.153817522525787, "train/z_loss": 0.0013809103053063153, "train/perplexity": 8.617693922791844, "train/grad_norm": 0.2158203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1792881.9714840206, "perf/iters_per_sec": 0.8549127442760566, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1697100162506104, "data/tokens_consumed": 102888374272, "data/tokens_consumed_B": 102.888374272, "train/loss_slope": -1.0597665653978439e-05} {"step": 49070, "timestamp": 1778247581.6697059, "train/loss": 2.151005721092224, "train/z_loss": 0.0013877694960683583, "train/perplexity": 8.593496713482027, "train/grad_norm": 0.3203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027571.9255918479, "perf/iters_per_sec": 0.9668216350516547, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343169450759888, "data/tokens_consumed": 102909345792, "data/tokens_consumed_B": 102.909345792, "train/loss_slope": -9.34920368200304e-06} {"step": 49080, "timestamp": 1778247592.0277948, "train/loss": 2.1895792961120604, "train/z_loss": 0.0013719593291170896, "train/perplexity": 8.931454826420504, "train/grad_norm": 0.1025390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025884.6689155377, "perf/iters_per_sec": 0.9660170883729637, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351783752441406, "data/tokens_consumed": 102930317312, "data/tokens_consumed_B": 102.930317312, "train/loss_slope": -5.9889716140651075e-06} {"step": 49090, "timestamp": 1778247602.3715186, "train/loss": 2.209568452835083, "train/z_loss": 0.0013674285844899714, "train/perplexity": 9.111783380176018, "train/grad_norm": 0.1923828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028889.321292493, "perf/iters_per_sec": 0.9674498182737794, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0336453437805175, "data/tokens_consumed": 102951288832, "data/tokens_consumed_B": 102.951288832, "train/loss_slope": -4.272362208030196e-06} {"step": 49100, "timestamp": 1778247612.7080328, "grad/layer_0/attn": 0.0026985055301338434, "grad/layer_0/mlp": 0.002882226835936308, "grad/layer_0/attn_mlp_ratio": 0.9362571338461309, "grad/layer_4/attn": 0.002733492059633136, "grad/layer_4/mlp": 0.0025740633718669415, "grad/layer_4/attn_mlp_ratio": 1.0619365410016726, "grad/layer_8/attn": 0.0055099898017942905, "grad/layer_8/mlp": 0.0035778305027633905, "grad/layer_8/attn_mlp_ratio": 1.540036523120622, "grad/layer_12/attn": 0.003968695178627968, "grad/layer_12/mlp": 0.006604832597076893, "grad/layer_12/attn_mlp_ratio": 0.6008774727003149, "grad/layer_16/attn": 0.0065192063339054585, "grad/layer_16/mlp": 0.0046105277724564075, "grad/layer_16/attn_mlp_ratio": 1.4139826315445614, "grad/layer_20/attn": 0.0042300657369196415, "grad/layer_20/mlp": 0.007026081904768944, "grad/layer_20/attn_mlp_ratio": 0.6020518596350698, "grad/layer_24/attn": 0.014294528402388096, "grad/layer_24/mlp": 0.01274846401065588, "grad/layer_24/attn_mlp_ratio": 1.1212745534138444, "grad/layer_27/attn": 0.008550336584448814, "grad/layer_27/mlp": 0.012336699292063713, "grad/layer_27/attn_mlp_ratio": 0.6930813755540892} {"step": 49100, "timestamp": 1778247612.7225282, "train/loss": 2.149210262298584, "train/z_loss": 0.0013905822299420834, "train/perplexity": 8.578081287262158, "train/grad_norm": 0.173828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027070.279117205, "perf/iters_per_sec": 0.9665824313722634, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345729112625122, "data/tokens_consumed": 102972260352, "data/tokens_consumed_B": 102.972260352, "train/loss_slope": -5.594009284389464e-06} {"step": 49110, "timestamp": 1778247623.0741756, "train/loss": 2.1219801902770996, "train/z_loss": 0.0013980741030536592, "train/perplexity": 8.347651069759296, "train/grad_norm": 0.09521484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027372.7979251132, "perf/iters_per_sec": 0.9667266835809294, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034418535232544, "data/tokens_consumed": 102993231872, "data/tokens_consumed_B": 102.993231872, "train/loss_slope": -9.071110777764805e-06} {"step": 49120, "timestamp": 1778247633.422251, "train/loss": 2.197887587547302, "train/z_loss": 0.0013940286240540444, "train/perplexity": 9.005969070458418, "train/grad_norm": 0.10595703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027851.9201020962, "perf/iters_per_sec": 0.966955146838234, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034174132347107, "data/tokens_consumed": 103014203392, "data/tokens_consumed_B": 103.014203392, "train/loss_slope": -6.207247478554948e-06} {"step": 49125, "timestamp": 1778247639.168353, "eos/sharpness": 27.26798057556152, "eos/L0_probe": 1.9871774911880493, "eos/L_plus": 2.1596076488494873, "eos/L_minus": 2.0874271392822266, "eos/grad_norm": 0.10400304943323135, "eos/embed_grad_frac": 0.20572157204151154, "eos/time_s": 0.5836241245269775} {"step": 49125, "timestamp": 1778247640.5453358, "geo/rankme_last": 438.872314453125, "geo/layer_0/stable_rank_q_proj": 19.38924789428711, "geo/layer_0/stable_rank_k_proj": 16.44289779663086, "geo/layer_0/stable_rank_o_proj": 47.96714782714844, "geo/layer_0/stable_rank_gate_proj": 133.91615295410156, "geo/layer_0/stable_rank_down_proj": 53.612064361572266, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06173412501811981, "geo/layer_0/attn_entropy_mean": 6.186982154846191, "geo/layer_0/attn_entropy_std": 0.38810622692108154, "geo/layer_7/stable_rank_q_proj": 42.853126525878906, "geo/layer_7/stable_rank_k_proj": 41.97321701049805, "geo/layer_7/stable_rank_o_proj": 94.28569030761719, "geo/layer_7/stable_rank_gate_proj": 87.31045532226562, "geo/layer_7/stable_rank_down_proj": 144.6536102294922, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4904453754425049, "geo/layer_7/attn_entropy_mean": 4.657662868499756, "geo/layer_7/attn_entropy_std": 0.8206248879432678, "geo/layer_14/stable_rank_q_proj": 52.53118133544922, "geo/layer_14/stable_rank_k_proj": 38.643646240234375, "geo/layer_14/stable_rank_o_proj": 45.638607025146484, "geo/layer_14/stable_rank_gate_proj": 73.56765747070312, "geo/layer_14/stable_rank_down_proj": 130.95692443847656, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3554132282733917, "geo/layer_14/attn_entropy_mean": 5.51344108581543, "geo/layer_14/attn_entropy_std": 0.3734680414199829, "geo/layer_21/stable_rank_q_proj": 41.551361083984375, "geo/layer_21/stable_rank_k_proj": 30.44691276550293, "geo/layer_21/stable_rank_o_proj": 73.35250854492188, "geo/layer_21/stable_rank_gate_proj": 69.97343444824219, "geo/layer_21/stable_rank_down_proj": 53.568809509277344, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1440410614013672, "geo/layer_21/attn_entropy_mean": 5.704362392425537, "geo/layer_21/attn_entropy_std": 0.2961203455924988, "geo/layer_27/stable_rank_q_proj": 42.736846923828125, "geo/layer_27/stable_rank_k_proj": 31.767757415771484, "geo/layer_27/stable_rank_o_proj": 116.03892517089844, "geo/layer_27/stable_rank_gate_proj": 82.81546020507812, "geo/layer_27/stable_rank_down_proj": 129.88690185546875, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09271366894245148, "geo/layer_27/attn_entropy_mean": 4.2609453201293945, "geo/layer_27/attn_entropy_std": 0.7089998126029968, "attnres/final_alpha/block_0": 0.23822681605815887, "attnres/block_norm/0": 1.7412371635437012, "attnres/final_alpha/block_1": 0.004871658980846405, "attnres/block_norm/1": 43123.4375, "attnres/final_alpha/block_2": 0.010502403602004051, "attnres/block_norm/2": 27482.126953125, "attnres/final_alpha/block_3": 0.012318180873990059, "attnres/block_norm/3": 52078.953125, "attnres/final_alpha/block_4": 0.014979735016822815, "attnres/block_norm/4": 13940.28515625, "attnres/final_alpha/block_5": 0.6064308285713196, "attnres/block_norm/5": 6278.59326171875, "attnres/final_alpha/block_6": 0.11267037689685822, "attnres/block_norm/6": 34256.62890625, "geo/tier1_time_s": 1.3592069149017334, "geo/step": 49125.0, "geo/rankme_slope": 2.38488168704982e-05} {"step": 49130, "timestamp": 1778247645.7217834, "train/loss": 2.165232515335083, "train/z_loss": 0.0013918882119469344, "train/perplexity": 8.716628431024734, "train/grad_norm": 0.1533203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1705792.7656317498, "perf/iters_per_sec": 0.8133853748472928, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2294295310974122, "data/tokens_consumed": 103035174912, "data/tokens_consumed_B": 103.035174912, "train/loss_slope": -5.610968249000812e-06} {"step": 49140, "timestamp": 1778247656.0763435, "train/loss": 2.256880187988281, "train/z_loss": 0.001371160231065005, "train/perplexity": 9.553238318801636, "train/grad_norm": 0.11083984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026478.4444293005, "perf/iters_per_sec": 0.966300222601557, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348750591278075, "data/tokens_consumed": 103056146432, "data/tokens_consumed_B": 103.056146432, "train/loss_slope": 3.101731276605118e-06} {"step": 49150, "timestamp": 1778247666.4100482, "grad/layer_0/attn": 0.0027862731367349625, "grad/layer_0/mlp": 0.002894449746236205, "grad/layer_0/attn_mlp_ratio": 0.9626261585973203, "grad/layer_4/attn": 0.0018758975202217698, "grad/layer_4/mlp": 0.0024442647118121386, "grad/layer_4/attn_mlp_ratio": 0.7674690201963058, "grad/layer_8/attn": 0.004073170013725758, "grad/layer_8/mlp": 0.0035314804408699274, "grad/layer_8/attn_mlp_ratio": 1.1533887746475853, "grad/layer_12/attn": 0.005026495549827814, "grad/layer_12/mlp": 0.0064320070669054985, "grad/layer_12/attn_mlp_ratio": 0.7814816463032815, "grad/layer_16/attn": 0.003909930121153593, "grad/layer_16/mlp": 0.004760181996971369, "grad/layer_16/attn_mlp_ratio": 0.8213824684650737, "grad/layer_20/attn": 0.0037089434918016195, "grad/layer_20/mlp": 0.006410392001271248, "grad/layer_20/attn_mlp_ratio": 0.5785829373941256, "grad/layer_24/attn": 0.0112506914883852, "grad/layer_24/mlp": 0.010260523296892643, "grad/layer_24/attn_mlp_ratio": 1.0965026883319058, "grad/layer_27/attn": 0.01010793074965477, "grad/layer_27/mlp": 0.010418262332677841, "grad/layer_27/attn_mlp_ratio": 0.9702127216483156} {"step": 49150, "timestamp": 1778247666.4244845, "train/loss": 2.1944703578948976, "train/z_loss": 0.0013822393608279525, "train/perplexity": 8.975246129471834, "train/grad_norm": 0.2060546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027914.660694572, "perf/iters_per_sec": 0.9669850638840541, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341421365737915, "data/tokens_consumed": 103077117952, "data/tokens_consumed_B": 103.077117952, "train/loss_slope": 5.887851835262863e-06} {"step": 49160, "timestamp": 1778247676.7700663, "train/loss": 2.2268717288970947, "train/z_loss": 0.001367656036745757, "train/perplexity": 9.270819034370229, "train/grad_norm": 0.171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028029.6796118196, "perf/iters_per_sec": 0.9670399091776941, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340834856033325, "data/tokens_consumed": 103098089472, "data/tokens_consumed_B": 103.098089472, "train/loss_slope": 1.0152760819561869e-05} {"step": 49170, "timestamp": 1778247687.1099076, "train/loss": 2.153999900817871, "train/z_loss": 0.0013921585865318774, "train/perplexity": 8.619265746420083, "train/grad_norm": 0.091796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029581.3246454513, "perf/iters_per_sec": 0.967779791186071, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0332929134368896, "data/tokens_consumed": 103119060992, "data/tokens_consumed_B": 103.119060992, "train/loss_slope": 1.0702715812772387e-05} {"step": 49180, "timestamp": 1778247697.971366, "train/loss": 2.15820791721344, "train/z_loss": 0.0013851053896360098, "train/perplexity": 8.65561217755446, "train/grad_norm": 0.1728515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1932034.364151064, "perf/iters_per_sec": 0.921265775752575, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0854630947113038, "data/tokens_consumed": 103140032512, "data/tokens_consumed_B": 103.140032512, "train/loss_slope": 1.281333142774726e-05} {"step": 49190, "timestamp": 1778247708.317573, "train/loss": 2.1188523054122923, "train/z_loss": 0.0013959298143163324, "train/perplexity": 8.321581371084221, "train/grad_norm": 0.1337890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028069.2379130854, "perf/iters_per_sec": 0.967058772045653, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340633153915406, "data/tokens_consumed": 103161004032, "data/tokens_consumed_B": 103.161004032, "train/loss_slope": 9.808486322722842e-06} {"step": 49200, "timestamp": 1778247718.6538563, "grad/layer_0/attn": 0.0030246363021433353, "grad/layer_0/mlp": 0.0030799065716564655, "grad/layer_0/attn_mlp_ratio": 0.982054531060383, "grad/layer_4/attn": 0.002197784138843417, "grad/layer_4/mlp": 0.0023835040628910065, "grad/layer_4/attn_mlp_ratio": 0.9220810993582133, "grad/layer_8/attn": 0.003828081302344799, "grad/layer_8/mlp": 0.003500222461298108, "grad/layer_8/attn_mlp_ratio": 1.093667969766214, "grad/layer_12/attn": 0.004604517947882414, "grad/layer_12/mlp": 0.006755390204489231, "grad/layer_12/attn_mlp_ratio": 0.6816065009334139, "grad/layer_16/attn": 0.004717458039522171, "grad/layer_16/mlp": 0.004617753904312849, "grad/layer_16/attn_mlp_ratio": 1.0215914566077364, "grad/layer_20/attn": 0.005706308409571648, "grad/layer_20/mlp": 0.0064412378706038, "grad/layer_20/attn_mlp_ratio": 0.8859024360866363, "grad/layer_24/attn": 0.016220105811953545, "grad/layer_24/mlp": 0.011663221754133701, "grad/layer_24/attn_mlp_ratio": 1.3907054169774524, "grad/layer_27/attn": 0.0069882734678685665, "grad/layer_27/mlp": 0.011180413886904716, "grad/layer_27/attn_mlp_ratio": 0.6250460381926575} {"step": 49200, "timestamp": 1778247719.2460828, "eos/sharpness": 77.96905040740965, "eos/L0_probe": 1.984247088432312, "eos/L_plus": 2.467241048812866, "eos/L_minus": 2.2809436321258545, "eos/grad_norm": 0.21747414767742157, "eos/embed_grad_frac": 0.05275014787912369, "eos/time_s": 0.589383602142334} {"step": 49200, "timestamp": 1778247719.2644775, "train/loss": 2.155877113342285, "train/z_loss": 0.0013934645685367286, "train/perplexity": 8.635461136368992, "train/grad_norm": 0.2177734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1917053.7832529906, "perf/iters_per_sec": 0.9141224781289056, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0939453125, "data/tokens_consumed": 103181975552, "data/tokens_consumed_B": 103.181975552, "train/loss_slope": 1.1958324404904462e-05} {"step": 49200, "timestamp": 1778247720.6228755, "geo/rankme_last": 439.0246887207031, "geo/layer_0/stable_rank_q_proj": 19.381427764892578, "geo/layer_0/stable_rank_k_proj": 16.414751052856445, "geo/layer_0/stable_rank_o_proj": 47.96147155761719, "geo/layer_0/stable_rank_gate_proj": 134.01239013671875, "geo/layer_0/stable_rank_down_proj": 53.55976867675781, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06189689412713051, "geo/layer_0/attn_entropy_mean": 6.183810710906982, "geo/layer_0/attn_entropy_std": 0.3903433084487915, "geo/layer_7/stable_rank_q_proj": 42.816322326660156, "geo/layer_7/stable_rank_k_proj": 41.72985076904297, "geo/layer_7/stable_rank_o_proj": 94.22846221923828, "geo/layer_7/stable_rank_gate_proj": 87.2298583984375, "geo/layer_7/stable_rank_down_proj": 144.65908813476562, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4925529360771179, "geo/layer_7/attn_entropy_mean": 4.632791519165039, "geo/layer_7/attn_entropy_std": 0.8082625269889832, "geo/layer_14/stable_rank_q_proj": 52.506473541259766, "geo/layer_14/stable_rank_k_proj": 38.70013427734375, "geo/layer_14/stable_rank_o_proj": 45.62361526489258, "geo/layer_14/stable_rank_gate_proj": 73.52606201171875, "geo/layer_14/stable_rank_down_proj": 130.9425506591797, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3794557452201843, "geo/layer_14/attn_entropy_mean": 5.525059700012207, "geo/layer_14/attn_entropy_std": 0.3813270032405853, "geo/layer_21/stable_rank_q_proj": 41.688629150390625, "geo/layer_21/stable_rank_k_proj": 30.45783805847168, "geo/layer_21/stable_rank_o_proj": 73.29204559326172, "geo/layer_21/stable_rank_gate_proj": 70.0656509399414, "geo/layer_21/stable_rank_down_proj": 53.54774475097656, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14607055485248566, "geo/layer_21/attn_entropy_mean": 5.717104911804199, "geo/layer_21/attn_entropy_std": 0.2962077260017395, "geo/layer_27/stable_rank_q_proj": 42.692420959472656, "geo/layer_27/stable_rank_k_proj": 31.76903533935547, "geo/layer_27/stable_rank_o_proj": 115.96308898925781, "geo/layer_27/stable_rank_gate_proj": 82.85147094726562, "geo/layer_27/stable_rank_down_proj": 129.9875030517578, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08323487639427185, "geo/layer_27/attn_entropy_mean": 4.253079414367676, "geo/layer_27/attn_entropy_std": 0.709552526473999, "attnres/final_alpha/block_0": 0.2382063865661621, "attnres/block_norm/0": 1.7413620948791504, "attnres/final_alpha/block_1": 0.0048469193279743195, "attnres/block_norm/1": 43166.265625, "attnres/final_alpha/block_2": 0.010531187057495117, "attnres/block_norm/2": 27484.46484375, "attnres/final_alpha/block_3": 0.012445890344679356, "attnres/block_norm/3": 52360.96875, "attnres/final_alpha/block_4": 0.015076294541358948, "attnres/block_norm/4": 13981.9970703125, "attnres/final_alpha/block_5": 0.606482744216919, "attnres/block_norm/5": 6271.7353515625, "attnres/final_alpha/block_6": 0.11241056025028229, "attnres/block_norm/6": 34320.1796875, "geo/tier1_time_s": 1.3542513847351074, "geo/step": 49200.0, "geo/rankme_slope": 2.9315104948229292e-05} {"step": 49210, "timestamp": 1778247730.9643855, "train/loss": 2.187363862991333, "train/z_loss": 0.001383359613828361, "train/perplexity": 8.911689687837276, "train/grad_norm": 0.16796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1793060.3594705537, "perf/iters_per_sec": 0.8549978062966126, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1695936441421508, "data/tokens_consumed": 103202947072, "data/tokens_consumed_B": 103.202947072, "train/loss_slope": 1.0774778196699357e-05} {"step": 49220, "timestamp": 1778247741.3073418, "train/loss": 2.188722848892212, "train/z_loss": 0.0013857209007255733, "train/perplexity": 8.923808781449559, "train/grad_norm": 0.3203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028639.1702575306, "perf/iters_per_sec": 0.9673305369651464, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0337728023529054, "data/tokens_consumed": 103223918592, "data/tokens_consumed_B": 103.223918592, "train/loss_slope": 1.3685770103461454e-05} {"step": 49230, "timestamp": 1778247751.6517658, "train/loss": 2.155182957649231, "train/z_loss": 0.001391921832691878, "train/perplexity": 8.629468861885373, "train/grad_norm": 0.154296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028858.8094830385, "perf/iters_per_sec": 0.9674352691092675, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033660888671875, "data/tokens_consumed": 103244890112, "data/tokens_consumed_B": 103.244890112, "train/loss_slope": 1.1267589070651747e-05} {"step": 49240, "timestamp": 1778247762.027018, "train/loss": 2.160919165611267, "train/z_loss": 0.0013776724226772785, "train/perplexity": 8.679111534104413, "train/grad_norm": 0.1787109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022226.4666936663, "perf/iters_per_sec": 0.9642727216213542, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037051010131836, "data/tokens_consumed": 103265861632, "data/tokens_consumed_B": 103.265861632, "train/loss_slope": 1.5283800251591927e-05} {"step": 49250, "timestamp": 1778247772.3986444, "grad/layer_0/attn": 0.002739003160968423, "grad/layer_0/mlp": 0.0029813824221491814, "grad/layer_0/attn_mlp_ratio": 0.9187023606061676, "grad/layer_4/attn": 0.003243467304855585, "grad/layer_4/mlp": 0.0025508992839604616, "grad/layer_4/attn_mlp_ratio": 1.2714995053312765, "grad/layer_8/attn": 0.00334948324598372, "grad/layer_8/mlp": 0.003766125999391079, "grad/layer_8/attn_mlp_ratio": 0.8893709763263838, "grad/layer_12/attn": 0.0039032120257616043, "grad/layer_12/mlp": 0.006249942351132631, "grad/layer_12/attn_mlp_ratio": 0.6245196745858442, "grad/layer_16/attn": 0.00415336387231946, "grad/layer_16/mlp": 0.004527101758867502, "grad/layer_16/attn_mlp_ratio": 0.9174443168721774, "grad/layer_20/attn": 0.004736955277621746, "grad/layer_20/mlp": 0.005288410000503063, "grad/layer_20/attn_mlp_ratio": 0.8957238919824203, "grad/layer_24/attn": 0.011607619002461433, "grad/layer_24/mlp": 0.007888712920248508, "grad/layer_24/attn_mlp_ratio": 1.4714211269528186, "grad/layer_27/attn": 0.0040105972439050674, "grad/layer_27/mlp": 0.0065481215715408325, "grad/layer_27/attn_mlp_ratio": 0.6124805623780258} {"step": 49250, "timestamp": 1778247772.4129326, "train/loss": 2.129077661037445, "train/z_loss": 0.0013916521100327372, "train/perplexity": 8.407109030117184, "train/grad_norm": 0.0927734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020155.4125704656, "perf/iters_per_sec": 0.9632851660587624, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0381141901016235, "data/tokens_consumed": 103286833152, "data/tokens_consumed_B": 103.286833152, "train/loss_slope": 1.4793414244092394e-05} {"step": 49260, "timestamp": 1778247782.7980227, "train/loss": 2.173210382461548, "train/z_loss": 0.0013739159796386958, "train/perplexity": 8.78644666442676, "train/grad_norm": 0.1376953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020729.214606042, "perf/iters_per_sec": 0.9635587761907778, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0378194093704223, "data/tokens_consumed": 103307804672, "data/tokens_consumed_B": 103.307804672, "train/loss_slope": 1.574496694273536e-05} {"step": 49270, "timestamp": 1778247793.173479, "train/loss": 2.1036635398864747, "train/z_loss": 0.0013828104711137712, "train/perplexity": 8.196141869970342, "train/grad_norm": 0.103515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022201.6872441599, "perf/iters_per_sec": 0.9642609058590698, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037063717842102, "data/tokens_consumed": 103328776192, "data/tokens_consumed_B": 103.328776192, "train/loss_slope": 1.1349425350192635e-05} {"step": 49275, "timestamp": 1778247798.9395738, "eos/sharpness": 58.056592941284165, "eos/L0_probe": 1.9837874174118042, "eos/L_plus": 2.2579081058502197, "eos/L_minus": 2.2902326583862305, "eos/grad_norm": 0.16833382844924927, "eos/embed_grad_frac": 0.07994648814201355, "eos/time_s": 0.5792882442474365} {"step": 49275, "timestamp": 1778247800.3166714, "geo/rankme_last": 437.75872802734375, "geo/layer_0/stable_rank_q_proj": 19.40227699279785, "geo/layer_0/stable_rank_k_proj": 16.40532112121582, "geo/layer_0/stable_rank_o_proj": 47.857032775878906, "geo/layer_0/stable_rank_gate_proj": 133.94015502929688, "geo/layer_0/stable_rank_down_proj": 53.64447021484375, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0648227110505104, "geo/layer_0/attn_entropy_mean": 6.1816911697387695, "geo/layer_0/attn_entropy_std": 0.3937167227268219, "geo/layer_7/stable_rank_q_proj": 42.852325439453125, "geo/layer_7/stable_rank_k_proj": 41.81962203979492, "geo/layer_7/stable_rank_o_proj": 93.96454620361328, "geo/layer_7/stable_rank_gate_proj": 87.23104095458984, "geo/layer_7/stable_rank_down_proj": 144.32620239257812, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.49366918206214905, "geo/layer_7/attn_entropy_mean": 4.628202438354492, "geo/layer_7/attn_entropy_std": 0.7960672378540039, "geo/layer_14/stable_rank_q_proj": 52.55322265625, "geo/layer_14/stable_rank_k_proj": 38.698795318603516, "geo/layer_14/stable_rank_o_proj": 45.596961975097656, "geo/layer_14/stable_rank_gate_proj": 73.52800750732422, "geo/layer_14/stable_rank_down_proj": 130.67962646484375, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3987579643726349, "geo/layer_14/attn_entropy_mean": 5.562506675720215, "geo/layer_14/attn_entropy_std": 0.3813328742980957, "geo/layer_21/stable_rank_q_proj": 41.82695770263672, "geo/layer_21/stable_rank_k_proj": 30.4577693939209, "geo/layer_21/stable_rank_o_proj": 73.322265625, "geo/layer_21/stable_rank_gate_proj": 70.1402359008789, "geo/layer_21/stable_rank_down_proj": 53.64297866821289, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15064945816993713, "geo/layer_21/attn_entropy_mean": 5.724575996398926, "geo/layer_21/attn_entropy_std": 0.2986745536327362, "geo/layer_27/stable_rank_q_proj": 42.70532989501953, "geo/layer_27/stable_rank_k_proj": 31.782508850097656, "geo/layer_27/stable_rank_o_proj": 116.01313781738281, "geo/layer_27/stable_rank_gate_proj": 82.91059875488281, "geo/layer_27/stable_rank_down_proj": 130.161376953125, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09607447683811188, "geo/layer_27/attn_entropy_mean": 4.262625694274902, "geo/layer_27/attn_entropy_std": 0.7119941711425781, "attnres/final_alpha/block_0": 0.24023008346557617, "attnres/block_norm/0": 1.7414591312408447, "attnres/final_alpha/block_1": 0.0048803361132740974, "attnres/block_norm/1": 43352.484375, "attnres/final_alpha/block_2": 0.010628129355609417, "attnres/block_norm/2": 27593.6015625, "attnres/final_alpha/block_3": 0.012604285031557083, "attnres/block_norm/3": 52312.78125, "attnres/final_alpha/block_4": 0.015203471295535564, "attnres/block_norm/4": 13889.7421875, "attnres/final_alpha/block_5": 0.6022124290466309, "attnres/block_norm/5": 6307.56396484375, "attnres/final_alpha/block_6": 0.11424122750759125, "attnres/block_norm/6": 34476.8671875, "geo/tier1_time_s": 1.3575639724731445, "geo/step": 49275.0, "geo/rankme_slope": 2.096053265056022e-05} {"step": 49280, "timestamp": 1778247805.5095582, "train/loss": 2.203818106651306, "train/z_loss": 0.0013799781096167862, "train/perplexity": 9.059537830352072, "train/grad_norm": 0.10302734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1700613.3921981144, "perf/iters_per_sec": 0.8109156571379253, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2331738710403441, "data/tokens_consumed": 103349747712, "data/tokens_consumed_B": 103.349747712, "train/loss_slope": 1.5673013517458136e-05} {"step": 49290, "timestamp": 1778247815.8888097, "train/loss": 2.136248195171356, "train/z_loss": 0.001399492984637618, "train/perplexity": 8.467609142219205, "train/grad_norm": 0.287109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021493.985971449, "perf/iters_per_sec": 0.9639234475953335, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0374267816543579, "data/tokens_consumed": 103370719232, "data/tokens_consumed_B": 103.370719232, "train/loss_slope": 1.4717364017934045e-05} {"step": 49300, "timestamp": 1778247826.2159, "grad/layer_0/attn": 0.002819086192175746, "grad/layer_0/mlp": 0.002908953931182623, "grad/layer_0/attn_mlp_ratio": 0.9691064767460964, "grad/layer_4/attn": 0.0029590120539069176, "grad/layer_4/mlp": 0.0024574261624366045, "grad/layer_4/attn_mlp_ratio": 1.20411021040077, "grad/layer_8/attn": 0.005841944366693497, "grad/layer_8/mlp": 0.003544582985341549, "grad/layer_8/attn_mlp_ratio": 1.648133003526585, "grad/layer_12/attn": 0.004460795316845179, "grad/layer_12/mlp": 0.006905094254761934, "grad/layer_12/attn_mlp_ratio": 0.6460151140105562, "grad/layer_16/attn": 0.0036326893605291843, "grad/layer_16/mlp": 0.004895119462162256, "grad/layer_16/attn_mlp_ratio": 0.7421043172487016, "grad/layer_20/attn": 0.003531029913574457, "grad/layer_20/mlp": 0.0062479437328875065, "grad/layer_20/attn_mlp_ratio": 0.5651507132615469, "grad/layer_24/attn": 0.014067024923861027, "grad/layer_24/mlp": 0.010109348222613335, "grad/layer_24/attn_mlp_ratio": 1.3914868174435013, "grad/layer_27/attn": 0.010643397457897663, "grad/layer_27/mlp": 0.008108574897050858, "grad/layer_27/attn_mlp_ratio": 1.3126101024864094} {"step": 49300, "timestamp": 1778247826.2300403, "train/loss": 2.1698727130889894, "train/z_loss": 0.0014065461000427604, "train/perplexity": 8.757169296767199, "train/grad_norm": 0.173828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028811.780040472, "perf/iters_per_sec": 0.9674128437235222, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0336848497390747, "data/tokens_consumed": 103391690752, "data/tokens_consumed_B": 103.391690752, "train/loss_slope": 1.3866827232573683e-05} {"step": 49310, "timestamp": 1778247836.5864146, "train/loss": 2.1719226360321047, "train/z_loss": 0.0014037447283044458, "train/perplexity": 8.775139231223102, "train/grad_norm": 0.09521484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026617.4872770964, "perf/iters_per_sec": 0.9663665233979685, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348040580749511, "data/tokens_consumed": 103412662272, "data/tokens_consumed_B": 103.412662272, "train/loss_slope": 1.5044502985931648e-05} {"step": 49320, "timestamp": 1778247846.9305408, "train/loss": 2.1487754583358765, "train/z_loss": 0.001385838119313121, "train/perplexity": 8.574352314270897, "train/grad_norm": 0.154296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028754.271377011, "perf/iters_per_sec": 0.9673854214558654, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0337141513824464, "data/tokens_consumed": 103433633792, "data/tokens_consumed_B": 103.433633792, "train/loss_slope": 1.3980834740902792e-05} {"step": 49330, "timestamp": 1778247857.277895, "train/loss": 2.169577217102051, "train/z_loss": 0.001392396108713001, "train/perplexity": 8.754581970674128, "train/grad_norm": 0.11572265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027851.3591008976, "perf/iters_per_sec": 0.9669548793320167, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034174418449402, "data/tokens_consumed": 103454605312, "data/tokens_consumed_B": 103.454605312, "train/loss_slope": 1.3109078530323855e-05} {"step": 49340, "timestamp": 1778247867.6187158, "train/loss": 2.1712914228439333, "train/z_loss": 0.0013698599068447948, "train/perplexity": 8.769601995384308, "train/grad_norm": 0.2314453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029204.8353824986, "perf/iters_per_sec": 0.9676002671158307, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0334846258163453, "data/tokens_consumed": 103475576832, "data/tokens_consumed_B": 103.475576832, "train/loss_slope": 1.1895497918951574e-05} {"step": 49350, "timestamp": 1778247878.3804455, "grad/layer_0/attn": 0.0026744557544589043, "grad/layer_0/mlp": 0.0028312301728874445, "grad/layer_0/attn_mlp_ratio": 0.9446267158380399, "grad/layer_4/attn": 0.003232925897464156, "grad/layer_4/mlp": 0.0025492538698017597, "grad/layer_4/attn_mlp_ratio": 1.2681850987626633, "grad/layer_8/attn": 0.003913941327482462, "grad/layer_8/mlp": 0.0035319868475198746, "grad/layer_8/attn_mlp_ratio": 1.1081415038158031, "grad/layer_12/attn": 0.004951075185090303, "grad/layer_12/mlp": 0.006762756034731865, "grad/layer_12/attn_mlp_ratio": 0.732109081926345, "grad/layer_16/attn": 0.006540616974234581, "grad/layer_16/mlp": 0.00432179169729352, "grad/layer_16/attn_mlp_ratio": 1.5134039956137126, "grad/layer_20/attn": 0.00623830733820796, "grad/layer_20/mlp": 0.00593009265139699, "grad/layer_20/attn_mlp_ratio": 1.0519746654448807, "grad/layer_24/attn": 0.01447859313338995, "grad/layer_24/mlp": 0.011097991839051247, "grad/layer_24/attn_mlp_ratio": 1.3046137727351519, "grad/layer_27/attn": 0.00920981727540493, "grad/layer_27/mlp": 0.011603126302361488, "grad/layer_27/attn_mlp_ratio": 0.7937358394656919} {"step": 49350, "timestamp": 1778247878.965219, "eos/sharpness": 80.41653633117674, "eos/L0_probe": 1.981377124786377, "eos/L_plus": 2.3166821002960205, "eos/L_minus": 2.450237512588501, "eos/grad_norm": 0.23066094517707825, "eos/embed_grad_frac": 0.04003104194998741, "eos/time_s": 0.5818657875061035} {"step": 49350, "timestamp": 1778247878.9829412, "train/loss": 2.143956923484802, "train/z_loss": 0.0013923867139965297, "train/perplexity": 8.533135879979582, "train/grad_norm": 0.23046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1846814.7259636016, "perf/iters_per_sec": 0.8806298856561668, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1355508327484132, "data/tokens_consumed": 103496548352, "data/tokens_consumed_B": 103.496548352, "train/loss_slope": 1.2398768165180891e-05} {"step": 49350, "timestamp": 1778247880.343522, "geo/rankme_last": 439.2403564453125, "geo/layer_0/stable_rank_q_proj": 19.425996780395508, "geo/layer_0/stable_rank_k_proj": 16.441532135009766, "geo/layer_0/stable_rank_o_proj": 47.83583068847656, "geo/layer_0/stable_rank_gate_proj": 134.24395751953125, "geo/layer_0/stable_rank_down_proj": 53.6378173828125, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05907488986849785, "geo/layer_0/attn_entropy_mean": 6.182817459106445, "geo/layer_0/attn_entropy_std": 0.3930388391017914, "geo/layer_7/stable_rank_q_proj": 42.86115264892578, "geo/layer_7/stable_rank_k_proj": 41.82744216918945, "geo/layer_7/stable_rank_o_proj": 93.93209075927734, "geo/layer_7/stable_rank_gate_proj": 87.37171936035156, "geo/layer_7/stable_rank_down_proj": 144.27435302734375, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.47799235582351685, "geo/layer_7/attn_entropy_mean": 4.657894134521484, "geo/layer_7/attn_entropy_std": 0.7811757922172546, "geo/layer_14/stable_rank_q_proj": 52.571678161621094, "geo/layer_14/stable_rank_k_proj": 38.69355392456055, "geo/layer_14/stable_rank_o_proj": 45.57036209106445, "geo/layer_14/stable_rank_gate_proj": 73.4030990600586, "geo/layer_14/stable_rank_down_proj": 130.64002990722656, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38203164935112, "geo/layer_14/attn_entropy_mean": 5.50341796875, "geo/layer_14/attn_entropy_std": 0.37507718801498413, "geo/layer_21/stable_rank_q_proj": 41.66787338256836, "geo/layer_21/stable_rank_k_proj": 30.544763565063477, "geo/layer_21/stable_rank_o_proj": 73.3109359741211, "geo/layer_21/stable_rank_gate_proj": 70.12580108642578, "geo/layer_21/stable_rank_down_proj": 53.67457580566406, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1403590738773346, "geo/layer_21/attn_entropy_mean": 5.688241958618164, "geo/layer_21/attn_entropy_std": 0.29917892813682556, "geo/layer_27/stable_rank_q_proj": 42.72218704223633, "geo/layer_27/stable_rank_k_proj": 31.88033676147461, "geo/layer_27/stable_rank_o_proj": 116.08983612060547, "geo/layer_27/stable_rank_gate_proj": 82.92811584472656, "geo/layer_27/stable_rank_down_proj": 130.37461853027344, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08852747827768326, "geo/layer_27/attn_entropy_mean": 4.248305320739746, "geo/layer_27/attn_entropy_std": 0.6857951283454895, "attnres/final_alpha/block_0": 0.2402319610118866, "attnres/block_norm/0": 1.7416062355041504, "attnres/final_alpha/block_1": 0.004927048459649086, "attnres/block_norm/1": 43312.359375, "attnres/final_alpha/block_2": 0.01061055064201355, "attnres/block_norm/2": 27615.1640625, "attnres/final_alpha/block_3": 0.012544604018330574, "attnres/block_norm/3": 51996.01171875, "attnres/final_alpha/block_4": 0.015195731073617935, "attnres/block_norm/4": 13923.92578125, "attnres/final_alpha/block_5": 0.6020418405532837, "attnres/block_norm/5": 6292.890625, "attnres/final_alpha/block_6": 0.11444827914237976, "attnres/block_norm/6": 34127.7109375, "geo/tier1_time_s": 1.3567111492156982, "geo/step": 49350.0, "geo/rankme_slope": 4.5386689832182874e-05} {"step": 49360, "timestamp": 1778247890.7010937, "train/loss": 2.17125027179718, "train/z_loss": 0.001382675173226744, "train/perplexity": 8.769241124507747, "train/grad_norm": 0.1630859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1790230.4077489537, "perf/iters_per_sec": 0.8536483801598328, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1714425086975098, "data/tokens_consumed": 103517519872, "data/tokens_consumed_B": 103.517519872, "train/loss_slope": 1.4437201435845603e-05} {"step": 49370, "timestamp": 1778247901.5738933, "train/loss": 2.1317431688308717, "train/z_loss": 0.0013932906440459192, "train/perplexity": 8.429548137278966, "train/grad_norm": 0.208984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1930099.8393066248, "perf/iters_per_sec": 0.9203433224232792, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.08655104637146, "data/tokens_consumed": 103538491392, "data/tokens_consumed_B": 103.538491392, "train/loss_slope": 1.1549955447300225e-05} {"step": 49380, "timestamp": 1778247911.9544003, "train/loss": 2.147248554229736, "train/z_loss": 0.0013883180450648069, "train/perplexity": 8.561270090706879, "train/grad_norm": 0.1962890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021791.3572018188, "perf/iters_per_sec": 0.9640652452477545, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0372741937637329, "data/tokens_consumed": 103559462912, "data/tokens_consumed_B": 103.559462912, "train/loss_slope": 1.1925243601725997e-05} {"step": 49390, "timestamp": 1778247922.8975933, "train/loss": 2.190395975112915, "train/z_loss": 0.0013880250160582363, "train/perplexity": 8.938751937317365, "train/grad_norm": 0.279296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1917370.0339874004, "perf/iters_per_sec": 0.9142732782303812, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.093764877319336, "data/tokens_consumed": 103580434432, "data/tokens_consumed_B": 103.580434432, "train/loss_slope": 1.2301548851383956e-05} {"step": 49400, "timestamp": 1778247933.2678373, "grad/layer_0/attn": 0.0028804864268749952, "grad/layer_0/mlp": 0.003024247009307146, "grad/layer_0/attn_mlp_ratio": 0.952463976243963, "grad/layer_4/attn": 0.002569730393588543, "grad/layer_4/mlp": 0.0025896590668708086, "grad/layer_4/attn_mlp_ratio": 0.9923044802431099, "grad/layer_8/attn": 0.0036330108996480703, "grad/layer_8/mlp": 0.0035807760432362556, "grad/layer_8/attn_mlp_ratio": 1.0145875515034586, "grad/layer_12/attn": 0.004813683219254017, "grad/layer_12/mlp": 0.006395894102752209, "grad/layer_12/attn_mlp_ratio": 0.7526208324682181, "grad/layer_16/attn": 0.003888034028932452, "grad/layer_16/mlp": 0.004669313319027424, "grad/layer_16/attn_mlp_ratio": 0.8326778864508725, "grad/layer_20/attn": 0.006454059854149818, "grad/layer_20/mlp": 0.005673644132912159, "grad/layer_20/attn_mlp_ratio": 1.1375510323172113, "grad/layer_24/attn": 0.008481320925056934, "grad/layer_24/mlp": 0.009492842480540276, "grad/layer_24/attn_mlp_ratio": 0.8934437554503541, "grad/layer_27/attn": 0.006425275467336178, "grad/layer_27/mlp": 0.007955923676490784, "grad/layer_27/attn_mlp_ratio": 0.8076089776428518} {"step": 49400, "timestamp": 1778247933.2819064, "train/loss": 2.1591377735137938, "train/z_loss": 0.0013820380321703852, "train/perplexity": 8.663664396194111, "train/grad_norm": 0.146484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021053.3869318867, "perf/iters_per_sec": 0.9637133536014016, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0376529455184937, "data/tokens_consumed": 103601405952, "data/tokens_consumed_B": 103.601405952, "train/loss_slope": 9.106225921626229e-06} {"step": 49410, "timestamp": 1778247944.2264307, "train/loss": 2.210612678527832, "train/z_loss": 0.0013803955167531966, "train/perplexity": 9.121303107995516, "train/grad_norm": 0.1240234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1917290.042113236, "perf/iters_per_sec": 0.9142351351324253, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.093810510635376, "data/tokens_consumed": 103622377472, "data/tokens_consumed_B": 103.622377472, "train/loss_slope": 9.980772530893009e-06} {"step": 49420, "timestamp": 1778247954.6216466, "train/loss": 2.189630317687988, "train/z_loss": 0.0013782134279608726, "train/perplexity": 8.931910534946459, "train/grad_norm": 0.224609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2018701.5301503977, "perf/iters_per_sec": 0.9625919008972157, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0388618469238282, "data/tokens_consumed": 103643348992, "data/tokens_consumed_B": 103.643348992, "train/loss_slope": 1.0879242209174907e-05} {"step": 49425, "timestamp": 1778247960.3844275, "eos/sharpness": 79.54614162445067, "eos/L0_probe": 1.9849528074264526, "eos/L_plus": 2.339909553527832, "eos/L_minus": 2.42545747756958, "eos/grad_norm": 0.2424330860376358, "eos/embed_grad_frac": 0.03932329639792442, "eos/time_s": 0.5836286544799805} {"step": 49425, "timestamp": 1778247961.7588649, "geo/rankme_last": 439.017578125, "geo/layer_0/stable_rank_q_proj": 19.41168975830078, "geo/layer_0/stable_rank_k_proj": 16.439353942871094, "geo/layer_0/stable_rank_o_proj": 47.833831787109375, "geo/layer_0/stable_rank_gate_proj": 134.10519409179688, "geo/layer_0/stable_rank_down_proj": 53.68082809448242, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06518594920635223, "geo/layer_0/attn_entropy_mean": 6.182610511779785, "geo/layer_0/attn_entropy_std": 0.3882177472114563, "geo/layer_7/stable_rank_q_proj": 42.78458786010742, "geo/layer_7/stable_rank_k_proj": 41.84529113769531, "geo/layer_7/stable_rank_o_proj": 94.0504379272461, "geo/layer_7/stable_rank_gate_proj": 87.28800201416016, "geo/layer_7/stable_rank_down_proj": 143.57737731933594, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4802360534667969, "geo/layer_7/attn_entropy_mean": 4.654351234436035, "geo/layer_7/attn_entropy_std": 0.7945974469184875, "geo/layer_14/stable_rank_q_proj": 52.60203170776367, "geo/layer_14/stable_rank_k_proj": 38.671268463134766, "geo/layer_14/stable_rank_o_proj": 45.487937927246094, "geo/layer_14/stable_rank_gate_proj": 73.34014129638672, "geo/layer_14/stable_rank_down_proj": 130.61256408691406, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3842361271381378, "geo/layer_14/attn_entropy_mean": 5.533268451690674, "geo/layer_14/attn_entropy_std": 0.3888493478298187, "geo/layer_21/stable_rank_q_proj": 41.70341110229492, "geo/layer_21/stable_rank_k_proj": 30.568159103393555, "geo/layer_21/stable_rank_o_proj": 73.32960510253906, "geo/layer_21/stable_rank_gate_proj": 70.09381103515625, "geo/layer_21/stable_rank_down_proj": 53.65010070800781, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1441253423690796, "geo/layer_21/attn_entropy_mean": 5.713510036468506, "geo/layer_21/attn_entropy_std": 0.2961105406284332, "geo/layer_27/stable_rank_q_proj": 42.71780776977539, "geo/layer_27/stable_rank_k_proj": 31.903688430786133, "geo/layer_27/stable_rank_o_proj": 116.06938171386719, "geo/layer_27/stable_rank_gate_proj": 82.88395690917969, "geo/layer_27/stable_rank_down_proj": 130.44383239746094, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09626041352748871, "geo/layer_27/attn_entropy_mean": 4.223776340484619, "geo/layer_27/attn_entropy_std": 0.6995672583580017, "attnres/final_alpha/block_0": 0.23876219987869263, "attnres/block_norm/0": 1.7416205406188965, "attnres/final_alpha/block_1": 0.004726123996078968, "attnres/block_norm/1": 43441.8671875, "attnres/final_alpha/block_2": 0.010606305673718452, "attnres/block_norm/2": 27537.94921875, "attnres/final_alpha/block_3": 0.01255839504301548, "attnres/block_norm/3": 51985.83203125, "attnres/final_alpha/block_4": 0.015089765191078186, "attnres/block_norm/4": 13964.150390625, "attnres/final_alpha/block_5": 0.6048611998558044, "attnres/block_norm/5": 6330.7001953125, "attnres/final_alpha/block_6": 0.11339600384235382, "attnres/block_norm/6": 34203.859375, "geo/tier1_time_s": 1.3565616607666016, "geo/step": 49425.0, "geo/rankme_slope": 7.762427236519608e-05} {"step": 49430, "timestamp": 1778247967.339736, "train/loss": 2.1314413785934447, "train/z_loss": 0.0014125354122370482, "train/perplexity": 8.427004565777043, "train/grad_norm": 0.267578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1649692.7633901413, "perf/iters_per_sec": 0.7866348092032153, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2712379217147827, "data/tokens_consumed": 103664320512, "data/tokens_consumed_B": 103.664320512, "train/loss_slope": 8.152451895751787e-06} {"step": 49440, "timestamp": 1778247977.7171154, "train/loss": 2.1630679845809935, "train/z_loss": 0.0013910746551118792, "train/perplexity": 8.697781425533801, "train/grad_norm": 0.162109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021894.4353341395, "perf/iters_per_sec": 0.9641143967314432, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0372213125228882, "data/tokens_consumed": 103685292032, "data/tokens_consumed_B": 103.685292032, "train/loss_slope": 7.671813373983346e-06} {"step": 49450, "timestamp": 1778247988.081121, "grad/layer_0/attn": 0.002923877676948905, "grad/layer_0/mlp": 0.0030730965081602335, "grad/layer_0/attn_mlp_ratio": 0.9514434623320673, "grad/layer_4/attn": 0.0053288876079022884, "grad/layer_4/mlp": 0.0025534736923873425, "grad/layer_4/attn_mlp_ratio": 2.0869168987711064, "grad/layer_8/attn": 0.003760182997211814, "grad/layer_8/mlp": 0.003916857298463583, "grad/layer_8/attn_mlp_ratio": 0.9599999731128263, "grad/layer_12/attn": 0.004358521196991205, "grad/layer_12/mlp": 0.006931733340024948, "grad/layer_12/attn_mlp_ratio": 0.6287779578805498, "grad/layer_16/attn": 0.00678733317181468, "grad/layer_16/mlp": 0.004575502593070269, "grad/layer_16/attn_mlp_ratio": 1.4834070979991538, "grad/layer_20/attn": 0.003511680057272315, "grad/layer_20/mlp": 0.005888278596103191, "grad/layer_20/attn_mlp_ratio": 0.5963848245831694, "grad/layer_24/attn": 0.013097889721393585, "grad/layer_24/mlp": 0.011598586104810238, "grad/layer_24/attn_mlp_ratio": 1.1292660579581284, "grad/layer_27/attn": 0.004312433302402496, "grad/layer_27/mlp": 0.01070292666554451, "grad/layer_27/attn_mlp_ratio": 0.4029209389982313} {"step": 49450, "timestamp": 1778247988.0951552, "train/loss": 2.1765504121780395, "train/z_loss": 0.0013823096407577396, "train/perplexity": 8.815842721913054, "train/grad_norm": 0.1767578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021670.5860539486, "perf/iters_per_sec": 0.9640076570768111, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373361587524415, "data/tokens_consumed": 103706263552, "data/tokens_consumed_B": 103.706263552, "train/loss_slope": 1.060767912700151e-05} {"step": 49460, "timestamp": 1778247998.4769907, "train/loss": 2.18329758644104, "train/z_loss": 0.0013792037731036543, "train/perplexity": 8.875525868910097, "train/grad_norm": 0.11083984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021788.7083531676, "perf/iters_per_sec": 0.9640639821782911, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0372755527496338, "data/tokens_consumed": 103727235072, "data/tokens_consumed_B": 103.727235072, "train/loss_slope": 9.697945934138775e-06} {"step": 49470, "timestamp": 1778248008.8564827, "train/loss": 2.1392318964004517, "train/z_loss": 0.001393246848601848, "train/perplexity": 8.492911686869865, "train/grad_norm": 0.09033203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021448.2264479757, "perf/iters_per_sec": 0.9639016277541999, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0374502658843994, "data/tokens_consumed": 103748206592, "data/tokens_consumed_B": 103.748206592, "train/loss_slope": 9.276789278373101e-06} {"step": 49480, "timestamp": 1778248019.2449298, "train/loss": 2.143996262550354, "train/z_loss": 0.0013980563264340163, "train/perplexity": 8.533471572174193, "train/grad_norm": 0.1796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020301.059849385, "perf/iters_per_sec": 0.9633546160933423, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0380393505096435, "data/tokens_consumed": 103769178112, "data/tokens_consumed_B": 103.769178112, "train/loss_slope": 1.03179439853126e-05} {"step": 49490, "timestamp": 1778248029.6313045, "train/loss": 2.183869457244873, "train/z_loss": 0.0013798833009786905, "train/perplexity": 8.880602974609083, "train/grad_norm": 0.2294921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020604.5784835087, "perf/iters_per_sec": 0.9634993450562995, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037883424758911, "data/tokens_consumed": 103790149632, "data/tokens_consumed_B": 103.790149632, "train/loss_slope": 9.95848972876788e-06} {"step": 49500, "timestamp": 1778248039.9919543, "grad/layer_0/attn": 0.002965029329061508, "grad/layer_0/mlp": 0.0030334563925862312, "grad/layer_0/attn_mlp_ratio": 0.977442510320501, "grad/layer_4/attn": 0.0017215895932167768, "grad/layer_4/mlp": 0.002507056575268507, "grad/layer_4/attn_mlp_ratio": 0.6866975167333996, "grad/layer_8/attn": 0.0035109405871480703, "grad/layer_8/mlp": 0.0034631916787475348, "grad/layer_8/attn_mlp_ratio": 1.0137875149431672, "grad/layer_12/attn": 0.0054003288969397545, "grad/layer_12/mlp": 0.006360032130032778, "grad/layer_12/attn_mlp_ratio": 0.8491040141964694, "grad/layer_16/attn": 0.008478877134621143, "grad/layer_16/mlp": 0.004219439346343279, "grad/layer_16/attn_mlp_ratio": 2.0094795155714937, "grad/layer_20/attn": 0.003634302644059062, "grad/layer_20/mlp": 0.005798446945846081, "grad/layer_20/attn_mlp_ratio": 0.6267717227257635, "grad/layer_24/attn": 0.007636617869138718, "grad/layer_24/mlp": 0.009875874035060406, "grad/layer_24/attn_mlp_ratio": 0.7732599428366458, "grad/layer_27/attn": 0.008698977530002594, "grad/layer_27/mlp": 0.009182767011225224, "grad/layer_27/attn_mlp_ratio": 0.947315490487477} {"step": 49500, "timestamp": 1778248040.5803485, "eos/sharpness": 23.71137142181396, "eos/L0_probe": 1.9857369661331177, "eos/L_plus": 2.114642858505249, "eos/L_minus": 2.093944787979126, "eos/grad_norm": 0.11230028420686722, "eos/embed_grad_frac": 0.1998358964920044, "eos/time_s": 0.5856013298034668} {"step": 49500, "timestamp": 1778248040.599952, "train/loss": 2.134488010406494, "train/z_loss": 0.001389274769462645, "train/perplexity": 8.45271769530604, "train/grad_norm": 0.1123046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1912797.3733385873, "perf/iters_per_sec": 0.9120928637211739, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0963795900344848, "data/tokens_consumed": 103811121152, "data/tokens_consumed_B": 103.811121152, "train/loss_slope": 1.0013458537797817e-05} {"step": 49500, "timestamp": 1778248041.9636316, "geo/rankme_last": 439.4507141113281, "geo/layer_0/stable_rank_q_proj": 19.39846420288086, "geo/layer_0/stable_rank_k_proj": 16.435949325561523, "geo/layer_0/stable_rank_o_proj": 47.807220458984375, "geo/layer_0/stable_rank_gate_proj": 134.0994873046875, "geo/layer_0/stable_rank_down_proj": 53.65205001831055, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0635681003332138, "geo/layer_0/attn_entropy_mean": 6.183035850524902, "geo/layer_0/attn_entropy_std": 0.3888760805130005, "geo/layer_7/stable_rank_q_proj": 42.7752685546875, "geo/layer_7/stable_rank_k_proj": 41.80865478515625, "geo/layer_7/stable_rank_o_proj": 94.09640502929688, "geo/layer_7/stable_rank_gate_proj": 87.08293914794922, "geo/layer_7/stable_rank_down_proj": 143.4010467529297, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4830830991268158, "geo/layer_7/attn_entropy_mean": 4.687197685241699, "geo/layer_7/attn_entropy_std": 0.8100759983062744, "geo/layer_14/stable_rank_q_proj": 52.575904846191406, "geo/layer_14/stable_rank_k_proj": 38.525390625, "geo/layer_14/stable_rank_o_proj": 45.509769439697266, "geo/layer_14/stable_rank_gate_proj": 73.37924194335938, "geo/layer_14/stable_rank_down_proj": 130.73101806640625, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3885922133922577, "geo/layer_14/attn_entropy_mean": 5.524876117706299, "geo/layer_14/attn_entropy_std": 0.36952102184295654, "geo/layer_21/stable_rank_q_proj": 41.70707702636719, "geo/layer_21/stable_rank_k_proj": 30.49888038635254, "geo/layer_21/stable_rank_o_proj": 73.27385711669922, "geo/layer_21/stable_rank_gate_proj": 70.07816314697266, "geo/layer_21/stable_rank_down_proj": 53.681034088134766, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14448106288909912, "geo/layer_21/attn_entropy_mean": 5.722742557525635, "geo/layer_21/attn_entropy_std": 0.29912272095680237, "geo/layer_27/stable_rank_q_proj": 42.738525390625, "geo/layer_27/stable_rank_k_proj": 31.888898849487305, "geo/layer_27/stable_rank_o_proj": 116.03556060791016, "geo/layer_27/stable_rank_gate_proj": 82.80823516845703, "geo/layer_27/stable_rank_down_proj": 130.47877502441406, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09624704718589783, "geo/layer_27/attn_entropy_mean": 4.242032527923584, "geo/layer_27/attn_entropy_std": 0.6999143362045288, "attnres/final_alpha/block_0": 0.2407224476337433, "attnres/block_norm/0": 1.74189293384552, "attnres/final_alpha/block_1": 0.004895254969596863, "attnres/block_norm/1": 43022.17578125, "attnres/final_alpha/block_2": 0.010781385004520416, "attnres/block_norm/2": 27629.197265625, "attnres/final_alpha/block_3": 0.012544970959424973, "attnres/block_norm/3": 52237.1328125, "attnres/final_alpha/block_4": 0.01528286561369896, "attnres/block_norm/4": 13988.724609375, "attnres/final_alpha/block_5": 0.601619303226471, "attnres/block_norm/5": 6282.87548828125, "attnres/final_alpha/block_6": 0.11415375769138336, "attnres/block_norm/6": 34141.109375, "geo/tier1_time_s": 1.359553575515747, "geo/step": 49500.0, "geo/rankme_slope": 6.798268135379152e-05} {"step": 49500, "timestamp": 1778248049.04176, "geo/ww_alpha_mean": 7.5273886741137614, "geo/ww_alpha_std": 4.434894530833768, "geo/ww_alpha_min": 1.3429323020463448, "geo/ww_alpha_max": 34.32100475943201, "geo/ww_alpha_healthy_frac": 0.18274111675126903, "geo/ww_alpha_by_type/q_proj": 3.9178998301361645, "geo/ww_alpha_by_type/k_proj": 4.476980266789515, "geo/ww_alpha_by_type/v_proj": 7.980753559964591, "geo/ww_alpha_by_type/o_proj": 6.999021297897855, "geo/ww_alpha_by_type/gate_proj": 8.048383894989966, "geo/ww_alpha_by_type/up_proj": 13.07137705243022, "geo/ww_alpha_by_type/down_proj": 8.293311345549839, "geo/twonn_id/layer_0": 0.7411806583404541, "geo/twonn_id/layer_7": 3.020124673843384, "geo/twonn_id/layer_14": 4.170085430145264, "geo/twonn_id/layer_21": 6.22632360458374, "geo/twonn_id/layer_27": 5.055374622344971, "geo/tier2_time_s": 7.070464372634888} {"step": 49500, "timestamp": 1778248049.7229028, "eoc/jacobian_sigma/layer_0/attn": 1083.2777099609375, "eoc/jacobian_sigma/layer_0/mlp": 7739.34228515625, "eoc/jacobian_sigma/layer_0": 7739.34228515625, "eoc/jacobian_sigma/layer_7/attn": 1.1541603803634644, "eoc/jacobian_sigma/layer_7/mlp": 1.6830567121505737, "eoc/jacobian_sigma/layer_7": 1.6830567121505737, "eoc/jacobian_sigma/layer_14/attn": 1.5869290828704834, "eoc/jacobian_sigma/layer_14/mlp": 6.151006698608398, "eoc/jacobian_sigma/layer_14": 6.151006698608398, "eoc/jacobian_sigma/layer_21/attn": 1.0878182649612427, "eoc/jacobian_sigma/layer_21/mlp": 4.433737754821777, "eoc/jacobian_sigma/layer_21": 4.433737754821777, "eoc/jacobian_sigma/layer_27/attn": 3.645939588546753, "eoc/jacobian_sigma/layer_27/mlp": 33.1412467956543, "eoc/jacobian_sigma/layer_27": 33.1412467956543, "eoc/layer0_sigma": 7739.34228515625, "eoc/sigma_max": 33.1412467956543, "eoc/sigma_min": 1.6830567121505737, "eoc/sigma_mean": 11.352261990308762, "eoc/time_s": 0.6742551326751709} {"step": 49510, "timestamp": 1778248060.12759, "train/loss": 2.175630843639374, "train/z_loss": 0.0013905143598094583, "train/perplexity": 8.807739676527959, "train/grad_norm": 0.10888671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1074147.251649182, "perf/iters_per_sec": 0.512193322968093, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.952387809753418, "data/tokens_consumed": 103832092672, "data/tokens_consumed_B": 103.832092672, "train/loss_slope": 9.080937099714214e-06} {"step": 49520, "timestamp": 1778248070.5136576, "train/loss": 2.1778979539871215, "train/z_loss": 0.001378565188497305, "train/perplexity": 8.827730446367102, "train/grad_norm": 0.1826171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020155.9693216057, "perf/iters_per_sec": 0.9632854315383939, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0381139039993286, "data/tokens_consumed": 103853064192, "data/tokens_consumed_B": 103.853064192, "train/loss_slope": 9.389158393969652e-06} {"step": 49530, "timestamp": 1778248080.8999803, "train/loss": 2.1747080087661743, "train/z_loss": 0.0013733384548686445, "train/perplexity": 8.799615336489124, "train/grad_norm": 0.10888671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020467.0091978973, "perf/iters_per_sec": 0.9634337469090926, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0379540920257568, "data/tokens_consumed": 103874035712, "data/tokens_consumed_B": 103.874035712, "train/loss_slope": 7.867996448730792e-06} {"step": 49540, "timestamp": 1778248091.2876067, "train/loss": 2.117231321334839, "train/z_loss": 0.00137566541088745, "train/perplexity": 8.308103147126946, "train/grad_norm": 0.2373046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020105.8165543762, "perf/iters_per_sec": 0.9632615168353921, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0381396770477296, "data/tokens_consumed": 103895007232, "data/tokens_consumed_B": 103.895007232, "train/loss_slope": 5.46409990062872e-06} {"step": 49550, "timestamp": 1778248101.65135, "grad/layer_0/attn": 0.009949837811291218, "grad/layer_0/mlp": 0.012170220725238323, "grad/layer_0/attn_mlp_ratio": 0.8175560619785529, "grad/layer_4/attn": 0.027169961482286453, "grad/layer_4/mlp": 0.010986756533384323, "grad/layer_4/attn_mlp_ratio": 2.4729738164699033, "grad/layer_8/attn": 0.026232996955513954, "grad/layer_8/mlp": 0.010160722769796848, "grad/layer_8/attn_mlp_ratio": 2.581804197562811, "grad/layer_12/attn": 0.02107459492981434, "grad/layer_12/mlp": 0.02297911047935486, "grad/layer_12/attn_mlp_ratio": 0.9171196969106543, "grad/layer_16/attn": 0.01435480173677206, "grad/layer_16/mlp": 0.01605888456106186, "grad/layer_16/attn_mlp_ratio": 0.8938853500566134, "grad/layer_20/attn": 0.0394647940993309, "grad/layer_20/mlp": 0.03393334895372391, "grad/layer_20/attn_mlp_ratio": 1.1630091105021643, "grad/layer_24/attn": 0.04568592458963394, "grad/layer_24/mlp": 0.036859434098005295, "grad/layer_24/attn_mlp_ratio": 1.2394635344702687, "grad/layer_27/attn": 0.03024098090827465, "grad/layer_27/mlp": 0.031158404424786568, "grad/layer_27/attn_mlp_ratio": 0.9705561427003714} {"step": 49550, "timestamp": 1778248101.6654842, "train/loss": 2.1672589063644407, "train/z_loss": 0.0013874899945221842, "train/perplexity": 8.734309637152046, "train/grad_norm": 0.41015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021746.420656716, "perf/iters_per_sec": 0.9640438178332882, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037297248840332, "data/tokens_consumed": 103915978752, "data/tokens_consumed_B": 103.915978752, "train/loss_slope": 8.073921057686427e-06} {"step": 49560, "timestamp": 1778248112.0472715, "train/loss": 2.188891363143921, "train/z_loss": 0.0013772904057987034, "train/perplexity": 8.925312697120813, "train/grad_norm": 0.1767578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021417.148318444, "perf/iters_per_sec": 0.9638868085472316, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0374662160873414, "data/tokens_consumed": 103936950272, "data/tokens_consumed_B": 103.936950272, "train/loss_slope": 1.0718329346934563e-05} {"step": 49570, "timestamp": 1778248122.427947, "train/loss": 2.146059274673462, "train/z_loss": 0.0014047546661458909, "train/perplexity": 8.551094399282512, "train/grad_norm": 0.091796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021623.3781695666, "perf/iters_per_sec": 0.9639851466033776, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037360382080078, "data/tokens_consumed": 103957921792, "data/tokens_consumed_B": 103.957921792, "train/loss_slope": 1.0726537369694553e-05} {"step": 49575, "timestamp": 1778248128.2040715, "eos/sharpness": 64.1188621520996, "eos/L0_probe": 1.9867044687271118, "eos/L_plus": 2.3537566661834717, "eos/L_minus": 2.260840892791748, "eos/grad_norm": 0.1743483990430832, "eos/embed_grad_frac": 0.07245968282222748, "eos/time_s": 0.5991413593292236} {"step": 49575, "timestamp": 1778248129.578641, "geo/rankme_last": 439.9163513183594, "geo/layer_0/stable_rank_q_proj": 19.410680770874023, "geo/layer_0/stable_rank_k_proj": 16.431621551513672, "geo/layer_0/stable_rank_o_proj": 47.73139953613281, "geo/layer_0/stable_rank_gate_proj": 134.05946350097656, "geo/layer_0/stable_rank_down_proj": 53.75428009033203, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.061485107988119125, "geo/layer_0/attn_entropy_mean": 6.181015968322754, "geo/layer_0/attn_entropy_std": 0.389729380607605, "geo/layer_7/stable_rank_q_proj": 42.84507751464844, "geo/layer_7/stable_rank_k_proj": 41.829036712646484, "geo/layer_7/stable_rank_o_proj": 94.24466705322266, "geo/layer_7/stable_rank_gate_proj": 86.93002319335938, "geo/layer_7/stable_rank_down_proj": 143.40447998046875, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.48153796792030334, "geo/layer_7/attn_entropy_mean": 4.6743035316467285, "geo/layer_7/attn_entropy_std": 0.7937216758728027, "geo/layer_14/stable_rank_q_proj": 52.57665252685547, "geo/layer_14/stable_rank_k_proj": 38.50627899169922, "geo/layer_14/stable_rank_o_proj": 45.49494934082031, "geo/layer_14/stable_rank_gate_proj": 73.40917205810547, "geo/layer_14/stable_rank_down_proj": 130.6572265625, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38271430134773254, "geo/layer_14/attn_entropy_mean": 5.553244590759277, "geo/layer_14/attn_entropy_std": 0.3744237720966339, "geo/layer_21/stable_rank_q_proj": 41.646156311035156, "geo/layer_21/stable_rank_k_proj": 30.318159103393555, "geo/layer_21/stable_rank_o_proj": 73.25991821289062, "geo/layer_21/stable_rank_gate_proj": 70.0268783569336, "geo/layer_21/stable_rank_down_proj": 53.61257553100586, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15145425498485565, "geo/layer_21/attn_entropy_mean": 5.717986106872559, "geo/layer_21/attn_entropy_std": 0.29136350750923157, "geo/layer_27/stable_rank_q_proj": 42.7671012878418, "geo/layer_27/stable_rank_k_proj": 31.85062599182129, "geo/layer_27/stable_rank_o_proj": 116.02923583984375, "geo/layer_27/stable_rank_gate_proj": 82.89263153076172, "geo/layer_27/stable_rank_down_proj": 130.3802032470703, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08778408169746399, "geo/layer_27/attn_entropy_mean": 4.247194290161133, "geo/layer_27/attn_entropy_std": 0.7113634943962097, "attnres/final_alpha/block_0": 0.2375493049621582, "attnres/block_norm/0": 1.7419495582580566, "attnres/final_alpha/block_1": 0.004777010530233383, "attnres/block_norm/1": 43325.3359375, "attnres/final_alpha/block_2": 0.010645939037203789, "attnres/block_norm/2": 27459.53515625, "attnres/final_alpha/block_3": 0.012251983396708965, "attnres/block_norm/3": 51979.140625, "attnres/final_alpha/block_4": 0.014828506857156754, "attnres/block_norm/4": 14041.087890625, "attnres/final_alpha/block_5": 0.6076794862747192, "attnres/block_norm/5": 6288.0966796875, "attnres/final_alpha/block_6": 0.11226779222488403, "attnres/block_norm/6": 34182.8203125, "geo/tier1_time_s": 1.3568089008331299, "geo/step": 49575.0, "geo/rankme_slope": 6.303443252300921e-05} {"step": 49580, "timestamp": 1778248134.769961, "train/loss": 2.2208715677261353, "train/z_loss": 0.001377201871946454, "train/perplexity": 9.215359176408766, "train/grad_norm": 0.2333984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1699889.4044261978, "perf/iters_per_sec": 0.8105704328661908, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.233699083328247, "data/tokens_consumed": 103978893312, "data/tokens_consumed_B": 103.978893312, "train/loss_slope": 1.3928876980410383e-05} {"step": 49590, "timestamp": 1778248145.1461382, "train/loss": 2.1818951964378357, "train/z_loss": 0.0013720007031224669, "train/perplexity": 8.863087643818137, "train/grad_norm": 0.1455078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022191.7384427942, "perf/iters_per_sec": 0.964256161900899, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037068819999695, "data/tokens_consumed": 103999864832, "data/tokens_consumed_B": 103.999864832, "train/loss_slope": 1.6930942988917833e-05} {"step": 49600, "timestamp": 1778248155.5150566, "grad/layer_0/attn": 0.0031132930889725685, "grad/layer_0/mlp": 0.0031102499924600124, "grad/layer_0/attn_mlp_ratio": 1.000978376793536, "grad/layer_4/attn": 0.0022597203496843576, "grad/layer_4/mlp": 0.002604591427370906, "grad/layer_4/attn_mlp_ratio": 0.8675910698232733, "grad/layer_8/attn": 0.007935676723718643, "grad/layer_8/mlp": 0.0038135070353746414, "grad/layer_8/attn_mlp_ratio": 2.0809392619476603, "grad/layer_12/attn": 0.00444336561486125, "grad/layer_12/mlp": 0.0068638999946415424, "grad/layer_12/attn_mlp_ratio": 0.6473528975647626, "grad/layer_16/attn": 0.003961916081607342, "grad/layer_16/mlp": 0.004595436621457338, "grad/layer_16/attn_mlp_ratio": 0.8621413636506152, "grad/layer_20/attn": 0.0037149996496737003, "grad/layer_20/mlp": 0.005651520565152168, "grad/layer_20/attn_mlp_ratio": 0.6573451412078795, "grad/layer_24/attn": 0.006593420635908842, "grad/layer_24/mlp": 0.00832041073590517, "grad/layer_24/attn_mlp_ratio": 0.7924393117051594, "grad/layer_27/attn": 0.006183367222547531, "grad/layer_27/mlp": 0.006871971767395735, "grad/layer_27/attn_mlp_ratio": 0.8997951886102288} {"step": 49600, "timestamp": 1778248155.5292912, "train/loss": 2.1863934278488157, "train/z_loss": 0.0013739020680077374, "train/perplexity": 8.903045665894695, "train/grad_norm": 0.1171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020605.7853145138, "perf/iters_per_sec": 0.9634999205181665, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0378828048706055, "data/tokens_consumed": 104020836352, "data/tokens_consumed_B": 104.020836352, "train/loss_slope": 1.9480889779899726e-05} {"step": 49610, "timestamp": 1778248165.9043312, "train/loss": 2.128965699672699, "train/z_loss": 0.0013988403137773276, "train/perplexity": 8.40616781140765, "train/grad_norm": 0.10009765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022262.3584553169, "perf/iters_per_sec": 0.9642898361469826, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0370326042175293, "data/tokens_consumed": 104041807872, "data/tokens_consumed_B": 104.041807872, "train/loss_slope": 1.9888264990553866e-05} {"step": 49620, "timestamp": 1778248176.2802048, "train/loss": 2.1489973783493044, "train/z_loss": 0.0013771473430097103, "train/perplexity": 8.576255345804197, "train/grad_norm": 0.1435546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022467.1332351204, "perf/iters_per_sec": 0.9643874803710558, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0369276046752929, "data/tokens_consumed": 104062779392, "data/tokens_consumed_B": 104.062779392, "train/loss_slope": 1.994191875623714e-05} {"step": 49630, "timestamp": 1778248186.6531699, "train/loss": 2.1453590631484984, "train/z_loss": 0.0013772769016213715, "train/perplexity": 8.545108920228348, "train/grad_norm": 0.291015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022657.0193833385, "perf/iters_per_sec": 0.9644780251423543, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0368302583694458, "data/tokens_consumed": 104083750912, "data/tokens_consumed_B": 104.083750912, "train/loss_slope": 1.7736615384980516e-05} {"step": 49640, "timestamp": 1778248197.028896, "train/loss": 2.1918619155883787, "train/z_loss": 0.001381369703449309, "train/perplexity": 8.951865224883562, "train/grad_norm": 0.1494140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022068.1769094164, "perf/iters_per_sec": 0.96419724317046, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03713219165802, "data/tokens_consumed": 104104722432, "data/tokens_consumed_B": 104.104722432, "train/loss_slope": 2.085888633991261e-05} {"step": 49650, "timestamp": 1778248207.4004965, "grad/layer_0/attn": 0.003114556660875678, "grad/layer_0/mlp": 0.003039032919332385, "grad/layer_0/attn_mlp_ratio": 1.0248512079542604, "grad/layer_4/attn": 0.002160274423658848, "grad/layer_4/mlp": 0.002747032092884183, "grad/layer_4/attn_mlp_ratio": 0.7864030240543769, "grad/layer_8/attn": 0.006254784297198057, "grad/layer_8/mlp": 0.003780610393732786, "grad/layer_8/attn_mlp_ratio": 1.6544376384625636, "grad/layer_12/attn": 0.005458928644657135, "grad/layer_12/mlp": 0.006725347135215998, "grad/layer_12/attn_mlp_ratio": 0.8116946908068177, "grad/layer_16/attn": 0.005144735798239708, "grad/layer_16/mlp": 0.0049405875615775585, "grad/layer_16/attn_mlp_ratio": 1.0413206182434103, "grad/layer_20/attn": 0.003055598121136427, "grad/layer_20/mlp": 0.006206961814314127, "grad/layer_20/attn_mlp_ratio": 0.49228562432287337, "grad/layer_24/attn": 0.01131109893321991, "grad/layer_24/mlp": 0.008984345011413097, "grad/layer_24/attn_mlp_ratio": 1.2589786782401162, "grad/layer_27/attn": 0.006577194202691317, "grad/layer_27/mlp": 0.0073893750086426735, "grad/layer_27/attn_mlp_ratio": 0.8900880123135956} {"step": 49650, "timestamp": 1778248207.989468, "eos/sharpness": 17.794346809387203, "eos/L0_probe": 1.9847713708877563, "eos/L_plus": 2.093615770339966, "eos/L_minus": 2.053870439529419, "eos/grad_norm": 0.10611408948898315, "eos/embed_grad_frac": 0.22842876613140106, "eos/time_s": 0.5862007141113281} {"step": 49650, "timestamp": 1778248208.007311, "train/loss": 2.183589768409729, "train/z_loss": 0.001380295364651829, "train/perplexity": 8.878119516421693, "train/grad_norm": 0.10595703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1911384.9961315289, "perf/iters_per_sec": 0.9114193897874493, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.097189736366272, "data/tokens_consumed": 104125693952, "data/tokens_consumed_B": 104.125693952, "train/loss_slope": 1.9599323604616878e-05} {"step": 49650, "timestamp": 1778248209.3694026, "geo/rankme_last": 437.9917297363281, "geo/layer_0/stable_rank_q_proj": 19.4219913482666, "geo/layer_0/stable_rank_k_proj": 16.397584915161133, "geo/layer_0/stable_rank_o_proj": 47.70729064941406, "geo/layer_0/stable_rank_gate_proj": 134.62741088867188, "geo/layer_0/stable_rank_down_proj": 53.6967658996582, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.061678480356931686, "geo/layer_0/attn_entropy_mean": 6.181705474853516, "geo/layer_0/attn_entropy_std": 0.39097708463668823, "geo/layer_7/stable_rank_q_proj": 42.784481048583984, "geo/layer_7/stable_rank_k_proj": 41.83256912231445, "geo/layer_7/stable_rank_o_proj": 94.1730728149414, "geo/layer_7/stable_rank_gate_proj": 87.01948547363281, "geo/layer_7/stable_rank_down_proj": 143.259521484375, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.488757461309433, "geo/layer_7/attn_entropy_mean": 4.652486801147461, "geo/layer_7/attn_entropy_std": 0.8128526210784912, "geo/layer_14/stable_rank_q_proj": 52.3956298828125, "geo/layer_14/stable_rank_k_proj": 38.54926300048828, "geo/layer_14/stable_rank_o_proj": 45.47306823730469, "geo/layer_14/stable_rank_gate_proj": 73.36579132080078, "geo/layer_14/stable_rank_down_proj": 130.3590087890625, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3904811143875122, "geo/layer_14/attn_entropy_mean": 5.529651641845703, "geo/layer_14/attn_entropy_std": 0.3684869706630707, "geo/layer_21/stable_rank_q_proj": 41.63975143432617, "geo/layer_21/stable_rank_k_proj": 30.27608299255371, "geo/layer_21/stable_rank_o_proj": 73.18868255615234, "geo/layer_21/stable_rank_gate_proj": 69.80135345458984, "geo/layer_21/stable_rank_down_proj": 53.568885803222656, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14807386696338654, "geo/layer_21/attn_entropy_mean": 5.707272529602051, "geo/layer_21/attn_entropy_std": 0.2935301661491394, "geo/layer_27/stable_rank_q_proj": 42.72265625, "geo/layer_27/stable_rank_k_proj": 31.86264419555664, "geo/layer_27/stable_rank_o_proj": 115.89579772949219, "geo/layer_27/stable_rank_gate_proj": 82.8742446899414, "geo/layer_27/stable_rank_down_proj": 130.431396484375, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09349772334098816, "geo/layer_27/attn_entropy_mean": 4.21278190612793, "geo/layer_27/attn_entropy_std": 0.6926354765892029, "attnres/final_alpha/block_0": 0.23863936960697174, "attnres/block_norm/0": 1.7420529127120972, "attnres/final_alpha/block_1": 0.004874092526733875, "attnres/block_norm/1": 43335.41796875, "attnres/final_alpha/block_2": 0.010606802999973297, "attnres/block_norm/2": 27504.69140625, "attnres/final_alpha/block_3": 0.012440507300198078, "attnres/block_norm/3": 52450.24609375, "attnres/final_alpha/block_4": 0.014985751360654831, "attnres/block_norm/4": 13971.3388671875, "attnres/final_alpha/block_5": 0.6060101985931396, "attnres/block_norm/5": 6367.1982421875, "attnres/final_alpha/block_6": 0.11244329065084457, "attnres/block_norm/6": 34476.421875, "geo/tier1_time_s": 1.3580288887023926, "geo/step": 49650.0, "geo/rankme_slope": 7.748724489795914e-06} {"step": 49660, "timestamp": 1778248219.7450778, "train/loss": 2.1577389240264893, "train/z_loss": 0.001377651107031852, "train/perplexity": 8.6515537061864, "train/grad_norm": 0.1923828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1787243.512077603, "perf/iters_per_sec": 0.8522241173160567, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1734002590179444, "data/tokens_consumed": 104146665472, "data/tokens_consumed_B": 104.146665472, "train/loss_slope": 2.2072569114325634e-05} {"step": 49670, "timestamp": 1778248230.1310298, "train/loss": 2.2273972034454346, "train/z_loss": 0.0013824252411723137, "train/perplexity": 9.275691893984783, "train/grad_norm": 0.1484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020859.1589115064, "perf/iters_per_sec": 0.9636207384641201, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0377526760101319, "data/tokens_consumed": 104167636992, "data/tokens_consumed_B": 104.167636992, "train/loss_slope": 2.688490016089159e-05} {"step": 49680, "timestamp": 1778248240.5108247, "train/loss": 2.186340308189392, "train/z_loss": 0.0013781097950413823, "train/perplexity": 8.90257275170172, "train/grad_norm": 0.1513671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021656.64648862, "perf/iters_per_sec": 0.9640010101740932, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373433113098145, "data/tokens_consumed": 104188608512, "data/tokens_consumed_B": 104.188608512, "train/loss_slope": 2.8826675320615826e-05} {"step": 49690, "timestamp": 1778248250.8890414, "train/loss": 2.1589166641235353, "train/z_loss": 0.0014052661368623375, "train/perplexity": 8.661748990406972, "train/grad_norm": 0.12109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021802.9750410418, "perf/iters_per_sec": 0.9640707850651941, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0372682332992553, "data/tokens_consumed": 104209580032, "data/tokens_consumed_B": 104.209580032, "train/loss_slope": 2.6982993003737617e-05} {"step": 49700, "timestamp": 1778248261.2623842, "grad/layer_0/attn": 0.0028422169853001833, "grad/layer_0/mlp": 0.0030565557535737753, "grad/layer_0/attn_mlp_ratio": 0.9298756906329771, "grad/layer_4/attn": 0.002662552520632744, "grad/layer_4/mlp": 0.0026446173433214426, "grad/layer_4/attn_mlp_ratio": 1.006781728433575, "grad/layer_8/attn": 0.008103247731924057, "grad/layer_8/mlp": 0.0038790435064584017, "grad/layer_8/attn_mlp_ratio": 2.0889808298191284, "grad/layer_12/attn": 0.005795602221041918, "grad/layer_12/mlp": 0.007059446070343256, "grad/layer_12/attn_mlp_ratio": 0.8209712321894672, "grad/layer_16/attn": 0.004782417789101601, "grad/layer_16/mlp": 0.004513654857873917, "grad/layer_16/attn_mlp_ratio": 1.0595443900200738, "grad/layer_20/attn": 0.0032892089802771807, "grad/layer_20/mlp": 0.005750374402850866, "grad/layer_20/attn_mlp_ratio": 0.5719990895630342, "grad/layer_24/attn": 0.00635905284434557, "grad/layer_24/mlp": 0.008204301819205284, "grad/layer_24/attn_mlp_ratio": 0.7750876195157801, "grad/layer_27/attn": 0.005867934785783291, "grad/layer_27/mlp": 0.006676652934402227, "grad/layer_27/attn_mlp_ratio": 0.878873704466606} {"step": 49700, "timestamp": 1778248261.2767758, "train/loss": 2.163639235496521, "train/z_loss": 0.0013879510457627474, "train/perplexity": 8.702751460569571, "train/grad_norm": 0.09521484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020070.3260452265, "perf/iters_per_sec": 0.963244593641866, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0381579160690309, "data/tokens_consumed": 104230551552, "data/tokens_consumed_B": 104.230551552, "train/loss_slope": 2.7665854172773813e-05} {"step": 49710, "timestamp": 1778248271.653095, "train/loss": 2.1788349628448485, "train/z_loss": 0.0013787887524813413, "train/perplexity": 8.836005984509761, "train/grad_norm": 0.091796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022398.4518763353, "perf/iters_per_sec": 0.9643547305471112, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0369628190994262, "data/tokens_consumed": 104251523072, "data/tokens_consumed_B": 104.251523072, "train/loss_slope": 2.7144606507579114e-05} {"step": 49720, "timestamp": 1778248282.0298455, "train/loss": 2.1933704137802126, "train/z_loss": 0.0013700520968995989, "train/perplexity": 8.965379287795894, "train/grad_norm": 0.2314453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021966.1036019556, "perf/iters_per_sec": 0.9641485708246019, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0371845483779907, "data/tokens_consumed": 104272494592, "data/tokens_consumed_B": 104.272494592, "train/loss_slope": 2.808724846025858e-05} {"step": 49725, "timestamp": 1778248287.7952251, "eos/sharpness": 46.09687328338622, "eos/L0_probe": 1.9812465906143188, "eos/L_plus": 2.180600166320801, "eos/L_minus": 2.242861747741699, "eos/grad_norm": 0.11294405907392502, "eos/embed_grad_frac": 0.19016936421394348, "eos/time_s": 0.5872550010681152} {"step": 49725, "timestamp": 1778248289.173159, "geo/rankme_last": 438.0719299316406, "geo/layer_0/stable_rank_q_proj": 19.41008186340332, "geo/layer_0/stable_rank_k_proj": 16.395357131958008, "geo/layer_0/stable_rank_o_proj": 47.69882583618164, "geo/layer_0/stable_rank_gate_proj": 134.76451110839844, "geo/layer_0/stable_rank_down_proj": 53.71333694458008, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.064412422478199, "geo/layer_0/attn_entropy_mean": 6.17903995513916, "geo/layer_0/attn_entropy_std": 0.3971041142940521, "geo/layer_7/stable_rank_q_proj": 42.753475189208984, "geo/layer_7/stable_rank_k_proj": 41.782291412353516, "geo/layer_7/stable_rank_o_proj": 94.12310028076172, "geo/layer_7/stable_rank_gate_proj": 86.91964721679688, "geo/layer_7/stable_rank_down_proj": 143.50389099121094, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.49930697679519653, "geo/layer_7/attn_entropy_mean": 4.665651321411133, "geo/layer_7/attn_entropy_std": 0.8179789185523987, "geo/layer_14/stable_rank_q_proj": 52.40665817260742, "geo/layer_14/stable_rank_k_proj": 38.54365539550781, "geo/layer_14/stable_rank_o_proj": 45.504146575927734, "geo/layer_14/stable_rank_gate_proj": 73.43132019042969, "geo/layer_14/stable_rank_down_proj": 130.37940979003906, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3929045498371124, "geo/layer_14/attn_entropy_mean": 5.5109052658081055, "geo/layer_14/attn_entropy_std": 0.37609443068504333, "geo/layer_21/stable_rank_q_proj": 41.58625411987305, "geo/layer_21/stable_rank_k_proj": 30.39706802368164, "geo/layer_21/stable_rank_o_proj": 73.22724914550781, "geo/layer_21/stable_rank_gate_proj": 69.78339385986328, "geo/layer_21/stable_rank_down_proj": 53.56096649169922, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14614591002464294, "geo/layer_21/attn_entropy_mean": 5.712040901184082, "geo/layer_21/attn_entropy_std": 0.28697338700294495, "geo/layer_27/stable_rank_q_proj": 42.6925048828125, "geo/layer_27/stable_rank_k_proj": 31.757434844970703, "geo/layer_27/stable_rank_o_proj": 115.96513366699219, "geo/layer_27/stable_rank_gate_proj": 82.8599624633789, "geo/layer_27/stable_rank_down_proj": 130.61074829101562, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09410829842090607, "geo/layer_27/attn_entropy_mean": 4.230947971343994, "geo/layer_27/attn_entropy_std": 0.7089900374412537, "attnres/final_alpha/block_0": 0.24109019339084625, "attnres/block_norm/0": 1.7421035766601562, "attnres/final_alpha/block_1": 0.004899345338344574, "attnres/block_norm/1": 43314.1953125, "attnres/final_alpha/block_2": 0.010589426383376122, "attnres/block_norm/2": 27643.04296875, "attnres/final_alpha/block_3": 0.012464161962270737, "attnres/block_norm/3": 52400.50390625, "attnres/final_alpha/block_4": 0.015180808492004871, "attnres/block_norm/4": 14056.0615234375, "attnres/final_alpha/block_5": 0.6005099415779114, "attnres/block_norm/5": 6373.654296875, "attnres/final_alpha/block_6": 0.1152660995721817, "attnres/block_norm/6": 34369.2890625, "geo/tier1_time_s": 1.3579504489898682, "geo/step": 49725.0, "geo/rankme_slope": -2.554252169617847e-05} {"step": 49730, "timestamp": 1778248294.3630235, "train/loss": 2.171864557266235, "train/z_loss": 0.0013756412197835743, "train/perplexity": 8.774629596765834, "train/grad_norm": 0.1943359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1701337.602043007, "perf/iters_per_sec": 0.8112609873023067, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2326489448547364, "data/tokens_consumed": 104293466112, "data/tokens_consumed_B": 104.293466112, "train/loss_slope": 2.6347546201191417e-05} {"step": 49740, "timestamp": 1778248304.7432144, "train/loss": 2.184353291988373, "train/z_loss": 0.0013733568601310253, "train/perplexity": 8.884900758496167, "train/grad_norm": 0.12353515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021387.4182667558, "perf/iters_per_sec": 0.9638726321538714, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0374814748764039, "data/tokens_consumed": 104314437632, "data/tokens_consumed_B": 104.314437632, "train/loss_slope": 2.796260309119209e-05} {"step": 49750, "timestamp": 1778248315.1197178, "grad/layer_0/attn": 0.002855851547792554, "grad/layer_0/mlp": 0.002954067662358284, "grad/layer_0/attn_mlp_ratio": 0.9667522133996945, "grad/layer_4/attn": 0.0018310323357582092, "grad/layer_4/mlp": 0.0026063465047627687, "grad/layer_4/attn_mlp_ratio": 0.7025283331128059, "grad/layer_8/attn": 0.004260918591171503, "grad/layer_8/mlp": 0.0036697520408779383, "grad/layer_8/attn_mlp_ratio": 1.1610916562207212, "grad/layer_12/attn": 0.006328057497739792, "grad/layer_12/mlp": 0.007324122823774815, "grad/layer_12/attn_mlp_ratio": 0.8640020878402113, "grad/layer_16/attn": 0.006287652533501387, "grad/layer_16/mlp": 0.00474848598241806, "grad/layer_16/attn_mlp_ratio": 1.3241383515437284, "grad/layer_20/attn": 0.0035718625877052546, "grad/layer_20/mlp": 0.006612301338464022, "grad/layer_20/attn_mlp_ratio": 0.5401844760021961, "grad/layer_24/attn": 0.017574641853570938, "grad/layer_24/mlp": 0.010471978224813938, "grad/layer_24/attn_mlp_ratio": 1.6782542236481566, "grad/layer_27/attn": 0.009604062885046005, "grad/layer_27/mlp": 0.010393135249614716, "grad/layer_27/attn_mlp_ratio": 0.9240775340621382} {"step": 49750, "timestamp": 1778248315.1340528, "train/loss": 2.17022248506546, "train/z_loss": 0.0013933471171185374, "train/perplexity": 8.76023284492083, "train/grad_norm": 0.201171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019130.9074409972, "perf/iters_per_sec": 0.9627966439442621, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0386409282684326, "data/tokens_consumed": 104335409152, "data/tokens_consumed_B": 104.335409152, "train/loss_slope": 2.57825407079129e-05} {"step": 49760, "timestamp": 1778248325.507806, "train/loss": 2.1819746494293213, "train/z_loss": 0.0013904307037591935, "train/perplexity": 8.86379187062133, "train/grad_norm": 0.1318359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022887.7397276578, "perf/iters_per_sec": 0.9645880411756791, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0367120027542114, "data/tokens_consumed": 104356380672, "data/tokens_consumed_B": 104.356380672, "train/loss_slope": 2.5106174531656763e-05} {"step": 49770, "timestamp": 1778248335.8828495, "train/loss": 2.1431050062179566, "train/z_loss": 0.0014039413188584148, "train/perplexity": 8.525869449821224, "train/grad_norm": 0.251953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022750.0919628683, "perf/iters_per_sec": 0.9645224056066839, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0367825508117676, "data/tokens_consumed": 104377352192, "data/tokens_consumed_B": 104.377352192, "train/loss_slope": 2.1785232829790044e-05} {"step": 49780, "timestamp": 1778248346.2562253, "train/loss": 2.1384801626205445, "train/z_loss": 0.0013968984130769968, "train/perplexity": 8.486529677351689, "train/grad_norm": 0.10546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022590.3249619382, "perf/iters_per_sec": 0.9644462227639857, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036864447593689, "data/tokens_consumed": 104398323712, "data/tokens_consumed_B": 104.398323712, "train/loss_slope": 2.0890117907645664e-05} {"step": 49790, "timestamp": 1778248356.6338096, "train/loss": 2.1904635548591616, "train/z_loss": 0.0013753788894973694, "train/perplexity": 8.939356036317248, "train/grad_norm": 0.12158203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022132.1406637805, "perf/iters_per_sec": 0.9642277434653189, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0370993852615356, "data/tokens_consumed": 104419295232, "data/tokens_consumed_B": 104.419295232, "train/loss_slope": 1.9235866576960704e-05} {"step": 49800, "timestamp": 1778248366.9989648, "grad/layer_0/attn": 0.0028329326305538416, "grad/layer_0/mlp": 0.0029600292909890413, "grad/layer_0/attn_mlp_ratio": 0.9570623316031559, "grad/layer_4/attn": 0.0027337835635989904, "grad/layer_4/mlp": 0.002491843653842807, "grad/layer_4/attn_mlp_ratio": 1.0970926886499506, "grad/layer_8/attn": 0.004972128663212061, "grad/layer_8/mlp": 0.00357524654828012, "grad/layer_8/attn_mlp_ratio": 1.3907092719334242, "grad/layer_12/attn": 0.005967767909169197, "grad/layer_12/mlp": 0.0069793532602488995, "grad/layer_12/attn_mlp_ratio": 0.8550602901350124, "grad/layer_16/attn": 0.004645044915378094, "grad/layer_16/mlp": 0.005073207430541515, "grad/layer_16/attn_mlp_ratio": 0.9156031736163336, "grad/layer_20/attn": 0.0038924673572182655, "grad/layer_20/mlp": 0.006828273646533489, "grad/layer_20/attn_mlp_ratio": 0.5700514510266017, "grad/layer_24/attn": 0.02013159915804863, "grad/layer_24/mlp": 0.013319404795765877, "grad/layer_24/attn_mlp_ratio": 1.5114488459201576, "grad/layer_27/attn": 0.017061395570635796, "grad/layer_27/mlp": 0.012844369746744633, "grad/layer_27/attn_mlp_ratio": 1.328317058307065} {"step": 49800, "timestamp": 1778248367.5895178, "eos/sharpness": 89.9109125137329, "eos/L0_probe": 1.9871506690979004, "eos/L_plus": 2.564361333847046, "eos/L_minus": 2.309049129486084, "eos/grad_norm": 0.30197271704673767, "eos/embed_grad_frac": 0.023746542632579803, "eos/time_s": 0.58770751953125} {"step": 49800, "timestamp": 1778248367.607822, "train/loss": 2.1186503887176515, "train/z_loss": 0.0013919671066105366, "train/perplexity": 8.31990127450507, "train/grad_norm": 0.302734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1912220.324367051, "perf/iters_per_sec": 0.9118177053294425, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.096710443496704, "data/tokens_consumed": 104440266752, "data/tokens_consumed_B": 104.440266752, "train/loss_slope": 1.5755044623534306e-05} {"step": 49800, "timestamp": 1778248368.9726553, "geo/rankme_last": 439.6737060546875, "geo/layer_0/stable_rank_q_proj": 19.44919204711914, "geo/layer_0/stable_rank_k_proj": 16.37784767150879, "geo/layer_0/stable_rank_o_proj": 47.665103912353516, "geo/layer_0/stable_rank_gate_proj": 134.40567016601562, "geo/layer_0/stable_rank_down_proj": 53.71754455566406, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05852651968598366, "geo/layer_0/attn_entropy_mean": 6.1857805252075195, "geo/layer_0/attn_entropy_std": 0.39110323786735535, "geo/layer_7/stable_rank_q_proj": 42.74137496948242, "geo/layer_7/stable_rank_k_proj": 41.811119079589844, "geo/layer_7/stable_rank_o_proj": 94.18592834472656, "geo/layer_7/stable_rank_gate_proj": 86.7671890258789, "geo/layer_7/stable_rank_down_proj": 143.3399658203125, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4775048494338989, "geo/layer_7/attn_entropy_mean": 4.633439540863037, "geo/layer_7/attn_entropy_std": 0.8122023344039917, "geo/layer_14/stable_rank_q_proj": 52.59220504760742, "geo/layer_14/stable_rank_k_proj": 38.65443801879883, "geo/layer_14/stable_rank_o_proj": 45.50859069824219, "geo/layer_14/stable_rank_gate_proj": 73.47632598876953, "geo/layer_14/stable_rank_down_proj": 130.41139221191406, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3931364417076111, "geo/layer_14/attn_entropy_mean": 5.521791458129883, "geo/layer_14/attn_entropy_std": 0.3758447468280792, "geo/layer_21/stable_rank_q_proj": 41.64985656738281, "geo/layer_21/stable_rank_k_proj": 30.499061584472656, "geo/layer_21/stable_rank_o_proj": 73.30834197998047, "geo/layer_21/stable_rank_gate_proj": 69.86712646484375, "geo/layer_21/stable_rank_down_proj": 53.55126190185547, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1407783329486847, "geo/layer_21/attn_entropy_mean": 5.717563629150391, "geo/layer_21/attn_entropy_std": 0.29317009449005127, "geo/layer_27/stable_rank_q_proj": 42.73564147949219, "geo/layer_27/stable_rank_k_proj": 31.73796272277832, "geo/layer_27/stable_rank_o_proj": 115.92134094238281, "geo/layer_27/stable_rank_gate_proj": 82.85250854492188, "geo/layer_27/stable_rank_down_proj": 130.56011962890625, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08926457166671753, "geo/layer_27/attn_entropy_mean": 4.221684455871582, "geo/layer_27/attn_entropy_std": 0.7141628265380859, "attnres/final_alpha/block_0": 0.23641270399093628, "attnres/block_norm/0": 1.742016077041626, "attnres/final_alpha/block_1": 0.004705904051661491, "attnres/block_norm/1": 43608.3984375, "attnres/final_alpha/block_2": 0.01037396490573883, "attnres/block_norm/2": 27593.02734375, "attnres/final_alpha/block_3": 0.012252728454768658, "attnres/block_norm/3": 52667.4765625, "attnres/final_alpha/block_4": 0.014689428731799126, "attnres/block_norm/4": 13997.2197265625, "attnres/final_alpha/block_5": 0.6111279129981995, "attnres/block_norm/5": 6324.24755859375, "attnres/final_alpha/block_6": 0.11043733358383179, "attnres/block_norm/6": 34554.19140625, "geo/tier1_time_s": 1.3606350421905518, "geo/step": 49800.0, "geo/rankme_slope": -3.3882889093137233e-06} {"step": 49810, "timestamp": 1778248379.3447413, "train/loss": 2.1552354097366333, "train/z_loss": 0.0013826217735186219, "train/perplexity": 8.62992150741135, "train/grad_norm": 0.1318359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1787403.0187056372, "perf/iters_per_sec": 0.8523001760032831, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.173295545578003, "data/tokens_consumed": 104461238272, "data/tokens_consumed_B": 104.461238272, "train/loss_slope": 1.366054649078342e-05} {"step": 49820, "timestamp": 1778248389.723219, "train/loss": 2.142921507358551, "train/z_loss": 0.001397113606799394, "train/perplexity": 8.52430510603378, "train/grad_norm": 0.0947265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021943.9333032207, "perf/iters_per_sec": 0.9641379992023567, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0371959209442139, "data/tokens_consumed": 104482209792, "data/tokens_consumed_B": 104.482209792, "train/loss_slope": 1.4660813641769918e-05} {"step": 49830, "timestamp": 1778248400.1027265, "train/loss": 2.1950628757476807, "train/z_loss": 0.001371944393031299, "train/perplexity": 8.980565698850949, "train/grad_norm": 0.2314453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021872.452506268, "perf/iters_per_sec": 0.9641039145022716, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0372325897216796, "data/tokens_consumed": 104503181312, "data/tokens_consumed_B": 104.503181312, "train/loss_slope": 1.5813598180725686e-05} {"step": 49840, "timestamp": 1778248410.4784374, "train/loss": 2.1318015694618224, "train/z_loss": 0.0013894698466174304, "train/perplexity": 8.430040442584142, "train/grad_norm": 0.1357421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022607.346969447, "perf/iters_per_sec": 0.9644543394896732, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0368557214736938, "data/tokens_consumed": 104524152832, "data/tokens_consumed_B": 104.524152832, "train/loss_slope": 1.2775223061303792e-05} {"step": 49850, "timestamp": 1778248420.8434114, "grad/layer_0/attn": 0.0032192645594477654, "grad/layer_0/mlp": 0.0030954661779105663, "grad/layer_0/attn_mlp_ratio": 1.039993419544135, "grad/layer_4/attn": 0.0023918498773127794, "grad/layer_4/mlp": 0.0025117509067058563, "grad/layer_4/attn_mlp_ratio": 0.9522639270083039, "grad/layer_8/attn": 0.004479119088500738, "grad/layer_8/mlp": 0.0034738860558718443, "grad/layer_8/attn_mlp_ratio": 1.2893684155221286, "grad/layer_12/attn": 0.005801151040941477, "grad/layer_12/mlp": 0.007100346032530069, "grad/layer_12/attn_mlp_ratio": 0.8170236961214665, "grad/layer_16/attn": 0.0066392142325639725, "grad/layer_16/mlp": 0.006041896529495716, "grad/layer_16/attn_mlp_ratio": 1.0988625988985368, "grad/layer_20/attn": 0.004489382263273001, "grad/layer_20/mlp": 0.008486706763505936, "grad/layer_20/attn_mlp_ratio": 0.5289899056815539, "grad/layer_24/attn": 0.027101656422019005, "grad/layer_24/mlp": 0.01647348888218403, "grad/layer_24/attn_mlp_ratio": 1.6451679696589634, "grad/layer_27/attn": 0.016663948073983192, "grad/layer_27/mlp": 0.017173390835523605, "grad/layer_27/attn_mlp_ratio": 0.9703353365999129} {"step": 49850, "timestamp": 1778248420.8577852, "train/loss": 2.131942093372345, "train/z_loss": 0.001397412654478103, "train/perplexity": 8.431225148070771, "train/grad_norm": 0.36328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021381.2401001395, "perf/iters_per_sec": 0.9638696861744592, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0374846458435059, "data/tokens_consumed": 104545124352, "data/tokens_consumed_B": 104.545124352, "train/loss_slope": 9.62940714504496e-06} {"step": 49860, "timestamp": 1778248431.2325082, "train/loss": 2.149799418449402, "train/z_loss": 0.0013991115964017808, "train/perplexity": 8.583136605654504, "train/grad_norm": 0.126953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022680.9727769245, "perf/iters_per_sec": 0.9644894470104811, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036817979812622, "data/tokens_consumed": 104566095872, "data/tokens_consumed_B": 104.566095872, "train/loss_slope": 7.3035435171554e-06} {"step": 49870, "timestamp": 1778248441.616199, "train/loss": 2.140478563308716, "train/z_loss": 0.0013922761310823262, "train/perplexity": 8.50350612131798, "train/grad_norm": 0.23828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020956.059307862, "perf/iters_per_sec": 0.9636669441737471, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0377029180526733, "data/tokens_consumed": 104587067392, "data/tokens_consumed_B": 104.587067392, "train/loss_slope": 2.825957327464218e-06} {"step": 49875, "timestamp": 1778248447.3928742, "eos/sharpness": 60.49852371215819, "eos/L0_probe": 1.9849849939346313, "eos/L_plus": 2.240138292312622, "eos/L_minus": 2.3348169326782227, "eos/grad_norm": 0.14588700234889984, "eos/embed_grad_frac": 0.10502039641141891, "eos/time_s": 0.5885705947875977} {"step": 49875, "timestamp": 1778248448.7747438, "geo/rankme_last": 439.4624328613281, "geo/layer_0/stable_rank_q_proj": 19.45413589477539, "geo/layer_0/stable_rank_k_proj": 16.37801170349121, "geo/layer_0/stable_rank_o_proj": 47.725868225097656, "geo/layer_0/stable_rank_gate_proj": 134.5260772705078, "geo/layer_0/stable_rank_down_proj": 53.677188873291016, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06434774398803711, "geo/layer_0/attn_entropy_mean": 6.190258502960205, "geo/layer_0/attn_entropy_std": 0.3880193829536438, "geo/layer_7/stable_rank_q_proj": 42.63262176513672, "geo/layer_7/stable_rank_k_proj": 41.82756805419922, "geo/layer_7/stable_rank_o_proj": 94.17970275878906, "geo/layer_7/stable_rank_gate_proj": 86.91992950439453, "geo/layer_7/stable_rank_down_proj": 143.35614013671875, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.48440197110176086, "geo/layer_7/attn_entropy_mean": 4.6666154861450195, "geo/layer_7/attn_entropy_std": 0.8136122822761536, "geo/layer_14/stable_rank_q_proj": 52.60143280029297, "geo/layer_14/stable_rank_k_proj": 38.67503356933594, "geo/layer_14/stable_rank_o_proj": 45.533180236816406, "geo/layer_14/stable_rank_gate_proj": 73.41803741455078, "geo/layer_14/stable_rank_down_proj": 130.87481689453125, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4005090296268463, "geo/layer_14/attn_entropy_mean": 5.541228294372559, "geo/layer_14/attn_entropy_std": 0.3792369067668915, "geo/layer_21/stable_rank_q_proj": 41.575439453125, "geo/layer_21/stable_rank_k_proj": 30.448753356933594, "geo/layer_21/stable_rank_o_proj": 73.34381866455078, "geo/layer_21/stable_rank_gate_proj": 69.87664031982422, "geo/layer_21/stable_rank_down_proj": 53.50300216674805, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14950861036777496, "geo/layer_21/attn_entropy_mean": 5.715103626251221, "geo/layer_21/attn_entropy_std": 0.2862011790275574, "geo/layer_27/stable_rank_q_proj": 42.80657958984375, "geo/layer_27/stable_rank_k_proj": 31.79335594177246, "geo/layer_27/stable_rank_o_proj": 116.06988525390625, "geo/layer_27/stable_rank_gate_proj": 82.84245300292969, "geo/layer_27/stable_rank_down_proj": 130.6475372314453, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09540504962205887, "geo/layer_27/attn_entropy_mean": 4.253300666809082, "geo/layer_27/attn_entropy_std": 0.7188234925270081, "attnres/final_alpha/block_0": 0.2403520792722702, "attnres/block_norm/0": 1.7421894073486328, "attnres/final_alpha/block_1": 0.004808600060641766, "attnres/block_norm/1": 43471.75, "attnres/final_alpha/block_2": 0.010549800470471382, "attnres/block_norm/2": 27644.927734375, "attnres/final_alpha/block_3": 0.01270248368382454, "attnres/block_norm/3": 51991.37890625, "attnres/final_alpha/block_4": 0.015063011087477207, "attnres/block_norm/4": 14068.994140625, "attnres/final_alpha/block_5": 0.6023080945014954, "attnres/block_norm/5": 6330.37939453125, "attnres/final_alpha/block_6": 0.11421594768762589, "attnres/block_norm/6": 34326.390625, "geo/tier1_time_s": 1.3612780570983887, "geo/step": 49875.0, "geo/rankme_slope": -1.2989043273559424e-05} {"step": 49880, "timestamp": 1778248453.960193, "train/loss": 2.1698602437973022, "train/z_loss": 0.001387320866342634, "train/perplexity": 8.757060101749676, "train/grad_norm": 0.09375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1700104.410009512, "perf/iters_per_sec": 0.8106729555175362, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.233543062210083, "data/tokens_consumed": 104608038912, "data/tokens_consumed_B": 104.608038912, "train/loss_slope": 1.3006052597486077e-06} {"step": 49890, "timestamp": 1778248464.331488, "train/loss": 2.1443386554718016, "train/z_loss": 0.0013792031444609166, "train/perplexity": 8.536393872694838, "train/grad_norm": 0.087890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022960.0367293535, "perf/iters_per_sec": 0.9646225150725143, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0366749525070191, "data/tokens_consumed": 104629010432, "data/tokens_consumed_B": 104.629010432, "train/loss_slope": -4.18773793330055e-07} {"step": 49900, "timestamp": 1778248474.7165053, "grad/layer_0/attn": 0.0030318317003548145, "grad/layer_0/mlp": 0.003049698891118169, "grad/layer_0/attn_mlp_ratio": 0.9941412936767234, "grad/layer_4/attn": 0.0018202924402430654, "grad/layer_4/mlp": 0.002390242414548993, "grad/layer_4/attn_mlp_ratio": 0.7615513610703767, "grad/layer_8/attn": 0.003816277254372835, "grad/layer_8/mlp": 0.0035067570861428976, "grad/layer_8/attn_mlp_ratio": 1.088263900749393, "grad/layer_12/attn": 0.0052036745473742485, "grad/layer_12/mlp": 0.007322836667299271, "grad/layer_12/attn_mlp_ratio": 0.7106091140269144, "grad/layer_16/attn": 0.0036745634861290455, "grad/layer_16/mlp": 0.005102739669382572, "grad/layer_16/attn_mlp_ratio": 0.7201157911632372, "grad/layer_20/attn": 0.005604211241006851, "grad/layer_20/mlp": 0.006061924155801535, "grad/layer_20/attn_mlp_ratio": 0.9244937753294045, "grad/layer_24/attn": 0.009572410024702549, "grad/layer_24/mlp": 0.00852185394614935, "grad/layer_24/attn_mlp_ratio": 1.1232778656926072, "grad/layer_27/attn": 0.0071299513801932335, "grad/layer_27/mlp": 0.008024848066270351, "grad/layer_27/attn_mlp_ratio": 0.8884842719095292} {"step": 49900, "timestamp": 1778248474.7309194, "train/loss": 2.1262343406677244, "train/z_loss": 0.0013985619996674358, "train/perplexity": 8.383238877089017, "train/grad_norm": 0.142578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2017794.534947093, "perf/iters_per_sec": 0.9621594118819681, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0393288135528564, "data/tokens_consumed": 104649981952, "data/tokens_consumed_B": 104.649981952, "train/loss_slope": -2.180892091379668e-06} {"step": 49910, "timestamp": 1778248485.1138265, "train/loss": 2.1586501836776733, "train/z_loss": 0.0013819269952364266, "train/perplexity": 8.65944111119006, "train/grad_norm": 0.11669921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021032.5368570308, "perf/iters_per_sec": 0.9637034115109591, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0376636505126953, "data/tokens_consumed": 104670953472, "data/tokens_consumed_B": 104.670953472, "train/loss_slope": -3.2737431066943633e-06} {"step": 49920, "timestamp": 1778248495.4906895, "train/loss": 2.156329369544983, "train/z_loss": 0.0013849795213900506, "train/perplexity": 8.639367460494148, "train/grad_norm": 0.10888671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021952.7177030547, "perf/iters_per_sec": 0.96414218793061, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0371914148330688, "data/tokens_consumed": 104691924992, "data/tokens_consumed_B": 104.691924992, "train/loss_slope": -3.2525838691122838e-06} {"step": 49930, "timestamp": 1778248505.8669684, "train/loss": 2.104702877998352, "train/z_loss": 0.0013858325546607375, "train/perplexity": 8.204664860953605, "train/grad_norm": 0.1689453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022415.7961529442, "perf/iters_per_sec": 0.9643630009426805, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0369539260864258, "data/tokens_consumed": 104712896512, "data/tokens_consumed_B": 104.712896512, "train/loss_slope": -8.707390652738648e-06} {"step": 49940, "timestamp": 1778248516.2425027, "train/loss": 2.1452261209487915, "train/z_loss": 0.0013947232393547893, "train/perplexity": 8.54397299015995, "train/grad_norm": 0.302734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022563.4437958763, "perf/iters_per_sec": 0.9644334048251516, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036878228187561, "data/tokens_consumed": 104733868032, "data/tokens_consumed_B": 104.733868032, "train/loss_slope": -8.040718298361779e-06} {"step": 49950, "timestamp": 1778248526.6079717, "grad/layer_0/attn": 0.002715324517339468, "grad/layer_0/mlp": 0.0027395556680858135, "grad/layer_0/attn_mlp_ratio": 0.9911550438108889, "grad/layer_4/attn": 0.0020057775545865297, "grad/layer_4/mlp": 0.0025074544828385115, "grad/layer_4/attn_mlp_ratio": 0.7999257766479388, "grad/layer_8/attn": 0.0032270748633891344, "grad/layer_8/mlp": 0.003521158592775464, "grad/layer_8/attn_mlp_ratio": 0.9164809498675206, "grad/layer_12/attn": 0.0035928157158195972, "grad/layer_12/mlp": 0.0059645469300448895, "grad/layer_12/attn_mlp_ratio": 0.6023618721961117, "grad/layer_16/attn": 0.0032857409678399563, "grad/layer_16/mlp": 0.004376296419650316, "grad/layer_16/attn_mlp_ratio": 0.7508040081576787, "grad/layer_20/attn": 0.00399815384298563, "grad/layer_20/mlp": 0.005742138717323542, "grad/layer_20/attn_mlp_ratio": 0.6962830349770616, "grad/layer_24/attn": 0.012515629641711712, "grad/layer_24/mlp": 0.010886832140386105, "grad/layer_24/attn_mlp_ratio": 1.1496116928561988, "grad/layer_27/attn": 0.005144969560205936, "grad/layer_27/mlp": 0.011011471971869469, "grad/layer_27/attn_mlp_ratio": 0.46723721647985356} {"step": 49950, "timestamp": 1778248527.2023635, "eos/sharpness": 77.0073890686035, "eos/L0_probe": 1.983366847038269, "eos/L_plus": 2.453402042388916, "eos/L_minus": 2.2834055423736572, "eos/grad_norm": 0.19298774003982544, "eos/embed_grad_frac": 0.060497112572193146, "eos/time_s": 0.591317892074585} {"step": 49950, "timestamp": 1778248527.222508, "train/loss": 2.129386818408966, "train/z_loss": 0.0013934358721598983, "train/perplexity": 8.409708551656944, "train/grad_norm": 0.193359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1910830.300905232, "perf/iters_per_sec": 0.9111548904920731, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0975082397460938, "data/tokens_consumed": 104754839552, "data/tokens_consumed_B": 104.754839552, "train/loss_slope": -8.867587486688219e-06} {"step": 49950, "timestamp": 1778248528.5893717, "geo/rankme_last": 439.22442626953125, "geo/layer_0/stable_rank_q_proj": 19.47347640991211, "geo/layer_0/stable_rank_k_proj": 16.374309539794922, "geo/layer_0/stable_rank_o_proj": 47.80385208129883, "geo/layer_0/stable_rank_gate_proj": 134.5564727783203, "geo/layer_0/stable_rank_down_proj": 53.64494323730469, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0635017454624176, "geo/layer_0/attn_entropy_mean": 6.193174839019775, "geo/layer_0/attn_entropy_std": 0.39067861437797546, "geo/layer_7/stable_rank_q_proj": 42.653717041015625, "geo/layer_7/stable_rank_k_proj": 41.84846496582031, "geo/layer_7/stable_rank_o_proj": 94.21800994873047, "geo/layer_7/stable_rank_gate_proj": 86.9625473022461, "geo/layer_7/stable_rank_down_proj": 143.29383850097656, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4932412803173065, "geo/layer_7/attn_entropy_mean": 4.661778450012207, "geo/layer_7/attn_entropy_std": 0.7866173982620239, "geo/layer_14/stable_rank_q_proj": 52.62739944458008, "geo/layer_14/stable_rank_k_proj": 38.667423248291016, "geo/layer_14/stable_rank_o_proj": 45.49102783203125, "geo/layer_14/stable_rank_gate_proj": 73.388427734375, "geo/layer_14/stable_rank_down_proj": 130.9677276611328, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4002339541912079, "geo/layer_14/attn_entropy_mean": 5.492506980895996, "geo/layer_14/attn_entropy_std": 0.39184698462486267, "geo/layer_21/stable_rank_q_proj": 41.580204010009766, "geo/layer_21/stable_rank_k_proj": 30.379186630249023, "geo/layer_21/stable_rank_o_proj": 73.37661743164062, "geo/layer_21/stable_rank_gate_proj": 69.87890625, "geo/layer_21/stable_rank_down_proj": 53.563995361328125, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1449993997812271, "geo/layer_21/attn_entropy_mean": 5.698788166046143, "geo/layer_21/attn_entropy_std": 0.30104008316993713, "geo/layer_27/stable_rank_q_proj": 42.76033401489258, "geo/layer_27/stable_rank_k_proj": 31.771770477294922, "geo/layer_27/stable_rank_o_proj": 115.7877197265625, "geo/layer_27/stable_rank_gate_proj": 82.80467224121094, "geo/layer_27/stable_rank_down_proj": 130.29681396484375, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09121933579444885, "geo/layer_27/attn_entropy_mean": 4.266587257385254, "geo/layer_27/attn_entropy_std": 0.7143002152442932, "attnres/final_alpha/block_0": 0.23844879865646362, "attnres/block_norm/0": 1.7424147129058838, "attnres/final_alpha/block_1": 0.00485618319362402, "attnres/block_norm/1": 43405.8515625, "attnres/final_alpha/block_2": 0.010577614419162273, "attnres/block_norm/2": 27572.35546875, "attnres/final_alpha/block_3": 0.01236832793802023, "attnres/block_norm/3": 52314.03515625, "attnres/final_alpha/block_4": 0.014769250527024269, "attnres/block_norm/4": 14017.669921875, "attnres/final_alpha/block_5": 0.6056185364723206, "attnres/block_norm/5": 6331.0205078125, "attnres/final_alpha/block_6": 0.11336129903793335, "attnres/block_norm/6": 34431.5, "geo/tier1_time_s": 1.3631761074066162, "geo/step": 49950.0, "geo/rankme_slope": 2.076201574379752e-06} {"step": 49960, "timestamp": 1778248538.9684856, "train/loss": 2.112004745006561, "train/z_loss": 0.001396398339420557, "train/perplexity": 8.26479349108794, "train/grad_norm": 0.2314453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1785957.2418841934, "perf/iters_per_sec": 0.8516107758923499, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1742453575134277, "data/tokens_consumed": 104775811072, "data/tokens_consumed_B": 104.775811072, "train/loss_slope": -1.3397888331809512e-05} {"step": 49970, "timestamp": 1778248549.3438396, "train/loss": 2.180934798717499, "train/z_loss": 0.0013881398481316864, "train/perplexity": 8.85457964083716, "train/grad_norm": 0.0869140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023040.7140016234, "perf/iters_per_sec": 0.9646609849937551, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0366336107254028, "data/tokens_consumed": 104796782592, "data/tokens_consumed_B": 104.796782592, "train/loss_slope": -1.2398168813921137e-05} {"step": 49980, "timestamp": 1778248559.7184067, "train/loss": 2.168465423583984, "train/z_loss": 0.0013647737447172402, "train/perplexity": 8.744854091884081, "train/grad_norm": 0.08740234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022420.306643592, "perf/iters_per_sec": 0.964365151712223, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0369516134262085, "data/tokens_consumed": 104817754112, "data/tokens_consumed_B": 104.817754112, "train/loss_slope": -1.1731963253507616e-05} {"step": 49990, "timestamp": 1778248570.0977502, "train/loss": 2.1229427099227904, "train/z_loss": 0.0014016727218404412, "train/perplexity": 8.355689715966149, "train/grad_norm": 0.263671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022003.42702693, "perf/iters_per_sec": 0.9641663680205012, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0371654033660889, "data/tokens_consumed": 104838725632, "data/tokens_consumed_B": 104.838725632, "train/loss_slope": -1.547779096986233e-05} {"step": 50000, "timestamp": 1778248580.4636135, "grad/layer_0/attn": 0.0030854290816932917, "grad/layer_0/mlp": 0.002979896031320095, "grad/layer_0/attn_mlp_ratio": 1.0354149761342337, "grad/layer_4/attn": 0.0021640420891344547, "grad/layer_4/mlp": 0.002499323571100831, "grad/layer_4/attn_mlp_ratio": 0.8658510756957297, "grad/layer_8/attn": 0.0067631215788424015, "grad/layer_8/mlp": 0.0039239381439983845, "grad/layer_8/attn_mlp_ratio": 1.7235545409478639, "grad/layer_12/attn": 0.004217791836708784, "grad/layer_12/mlp": 0.006286747753620148, "grad/layer_12/attn_mlp_ratio": 0.6709020203951745, "grad/layer_16/attn": 0.006121048238128424, "grad/layer_16/mlp": 0.004414628259837627, "grad/layer_16/attn_mlp_ratio": 1.38653760615844, "grad/layer_20/attn": 0.0037068123929202557, "grad/layer_20/mlp": 0.005858287680894136, "grad/layer_20/attn_mlp_ratio": 0.6327467225166761, "grad/layer_24/attn": 0.009449036791920662, "grad/layer_24/mlp": 0.009183519519865513, "grad/layer_24/attn_mlp_ratio": 1.0289123542003211, "grad/layer_27/attn": 0.003780473256483674, "grad/layer_27/mlp": 0.009067410603165627, "grad/layer_27/attn_mlp_ratio": 0.41692974766918084} {"step": 50000, "timestamp": 1778248580.4780152, "train/loss": 2.1232274055480955, "train/z_loss": 0.0014017281238920987, "train/perplexity": 8.358068882927833, "train/grad_norm": 0.1484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021606.2333321236, "perf/iters_per_sec": 0.9639769713078135, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037369179725647, "data/tokens_consumed": 104859697152, "data/tokens_consumed_B": 104.859697152, "train/loss_slope": -1.773147046512358e-05} {"step": 50000, "timestamp": 1778248587.7385, "geo/ww_alpha_mean": 7.547870468065055, "geo/ww_alpha_std": 4.281431861370923, "geo/ww_alpha_min": 1.3505524457083704, "geo/ww_alpha_max": 31.60581636075214, "geo/ww_alpha_healthy_frac": 0.17258883248730963, "geo/ww_alpha_by_type/q_proj": 3.9328802646046346, "geo/ww_alpha_by_type/k_proj": 4.461321838986026, "geo/ww_alpha_by_type/v_proj": 8.81509899448019, "geo/ww_alpha_by_type/o_proj": 7.956736755913528, "geo/ww_alpha_by_type/gate_proj": 7.932767166986812, "geo/ww_alpha_by_type/up_proj": 11.246829707740616, "geo/ww_alpha_by_type/down_proj": 8.588120326338544, "geo/twonn_id/layer_0": 0.7640170454978943, "geo/twonn_id/layer_7": 3.2181692123413086, "geo/twonn_id/layer_14": 4.449441909790039, "geo/twonn_id/layer_21": 7.354526519775391, "geo/twonn_id/layer_27": 5.414948463439941, "geo/tier2_time_s": 7.252551317214966} {"step": 50000, "timestamp": 1778248588.4562533, "eoc/jacobian_sigma/layer_0/attn": 1048.7325439453125, "eoc/jacobian_sigma/layer_0/mlp": 8456.2177734375, "eoc/jacobian_sigma/layer_0": 8456.2177734375, "eoc/jacobian_sigma/layer_7/attn": 1.1466914415359497, "eoc/jacobian_sigma/layer_7/mlp": 1.7083176374435425, "eoc/jacobian_sigma/layer_7": 1.7083176374435425, "eoc/jacobian_sigma/layer_14/attn": 1.6001538038253784, "eoc/jacobian_sigma/layer_14/mlp": 5.29021692276001, "eoc/jacobian_sigma/layer_14": 5.29021692276001, "eoc/jacobian_sigma/layer_21/attn": 1.0874903202056885, "eoc/jacobian_sigma/layer_21/mlp": 4.2010016441345215, "eoc/jacobian_sigma/layer_21": 4.2010016441345215, "eoc/jacobian_sigma/layer_27/attn": 3.48085618019104, "eoc/jacobian_sigma/layer_27/mlp": 29.43850326538086, "eoc/jacobian_sigma/layer_27": 29.43850326538086, "eoc/layer0_sigma": 8456.2177734375, "eoc/sigma_max": 29.43850326538086, "eoc/sigma_min": 1.7083176374435425, "eoc/sigma_mean": 10.159509867429733, "eoc/time_s": 0.7114143371582031} {"step": 50010, "timestamp": 1778248598.8627677, "train/loss": 2.1030024766921995, "train/z_loss": 0.0013949999469332398, "train/perplexity": 8.190725492726116, "train/grad_norm": 0.095703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1141113.0581622003, "perf/iters_per_sec": 0.5441251078425409, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.837812638282776, "data/tokens_consumed": 104880668672, "data/tokens_consumed_B": 104.880668672, "train/loss_slope": -2.1095637748665615e-05} {"step": 50020, "timestamp": 1778248609.2363281, "train/loss": 2.1445206880569456, "train/z_loss": 0.0013953077606856823, "train/perplexity": 8.537947915978258, "train/grad_norm": 0.125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023047.507191391, "perf/iters_per_sec": 0.964664224239059, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036630129814148, "data/tokens_consumed": 104901640192, "data/tokens_consumed_B": 104.901640192, "train/loss_slope": -2.551019241099526e-05} {"step": 50025, "timestamp": 1778248615.0235572, "eos/sharpness": 71.09453678131102, "eos/L0_probe": 1.9819371700286865, "eos/L_plus": 2.285139560699463, "eos/L_minus": 2.3896801471710205, "eos/grad_norm": 0.1722261756658554, "eos/embed_grad_frac": 0.07443509250879288, "eos/time_s": 0.6074502468109131} {"step": 50025, "timestamp": 1778248616.4017785, "geo/rankme_last": 439.0305480957031, "geo/layer_0/stable_rank_q_proj": 19.44020652770996, "geo/layer_0/stable_rank_k_proj": 16.429481506347656, "geo/layer_0/stable_rank_o_proj": 47.746665954589844, "geo/layer_0/stable_rank_gate_proj": 134.33360290527344, "geo/layer_0/stable_rank_down_proj": 53.735694885253906, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05944405496120453, "geo/layer_0/attn_entropy_mean": 6.1844482421875, "geo/layer_0/attn_entropy_std": 0.39079785346984863, "geo/layer_7/stable_rank_q_proj": 42.6936149597168, "geo/layer_7/stable_rank_k_proj": 41.959320068359375, "geo/layer_7/stable_rank_o_proj": 94.00950622558594, "geo/layer_7/stable_rank_gate_proj": 86.94308471679688, "geo/layer_7/stable_rank_down_proj": 143.345947265625, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4819428622722626, "geo/layer_7/attn_entropy_mean": 4.633185386657715, "geo/layer_7/attn_entropy_std": 0.813183069229126, "geo/layer_14/stable_rank_q_proj": 52.63330841064453, "geo/layer_14/stable_rank_k_proj": 38.68316650390625, "geo/layer_14/stable_rank_o_proj": 45.511226654052734, "geo/layer_14/stable_rank_gate_proj": 73.39921569824219, "geo/layer_14/stable_rank_down_proj": 131.03494262695312, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3954016864299774, "geo/layer_14/attn_entropy_mean": 5.5213704109191895, "geo/layer_14/attn_entropy_std": 0.3753291964530945, "geo/layer_21/stable_rank_q_proj": 41.45443344116211, "geo/layer_21/stable_rank_k_proj": 30.368932723999023, "geo/layer_21/stable_rank_o_proj": 73.24420166015625, "geo/layer_21/stable_rank_gate_proj": 69.8740234375, "geo/layer_21/stable_rank_down_proj": 53.49707794189453, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14471162855625153, "geo/layer_21/attn_entropy_mean": 5.7049665451049805, "geo/layer_21/attn_entropy_std": 0.28100234270095825, "geo/layer_27/stable_rank_q_proj": 42.807464599609375, "geo/layer_27/stable_rank_k_proj": 31.746967315673828, "geo/layer_27/stable_rank_o_proj": 115.67646789550781, "geo/layer_27/stable_rank_gate_proj": 82.71260833740234, "geo/layer_27/stable_rank_down_proj": 130.04635620117188, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09069167077541351, "geo/layer_27/attn_entropy_mean": 4.2467122077941895, "geo/layer_27/attn_entropy_std": 0.6995981931686401, "attnres/final_alpha/block_0": 0.2406189739704132, "attnres/block_norm/0": 1.7426154613494873, "attnres/final_alpha/block_1": 0.004901716485619545, "attnres/block_norm/1": 43538.09375, "attnres/final_alpha/block_2": 0.01075359620153904, "attnres/block_norm/2": 27677.20703125, "attnres/final_alpha/block_3": 0.01261257566511631, "attnres/block_norm/3": 52765.421875, "attnres/final_alpha/block_4": 0.01501364354044199, "attnres/block_norm/4": 14069.171875, "attnres/final_alpha/block_5": 0.6022891998291016, "attnres/block_norm/5": 6360.4345703125, "attnres/final_alpha/block_6": 0.11381027102470398, "attnres/block_norm/6": 34461.80078125, "geo/tier1_time_s": 1.3581690788269043, "geo/step": 50025.0, "geo/rankme_slope": -1.7725254164165666e-06} {"step": 50030, "timestamp": 1778248621.6006026, "train/loss": 2.1860011339187624, "train/z_loss": 0.001389494480099529, "train/perplexity": 8.899553740096406, "train/grad_norm": 0.1748046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1696897.4657285702, "perf/iters_per_sec": 0.8091437653200961, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.235874319076538, "data/tokens_consumed": 104922611712, "data/tokens_consumed_B": 104.922611712, "train/loss_slope": -2.6911736571892506e-05} {"step": 50040, "timestamp": 1778248631.9596322, "train/loss": 2.183374834060669, "train/z_loss": 0.0013901702710427343, "train/perplexity": 8.876211508638104, "train/grad_norm": 0.09375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025385.5824346126, "perf/iters_per_sec": 0.9657791053937018, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354334592819214, "data/tokens_consumed": 104943583232, "data/tokens_consumed_B": 104.943583232, "train/loss_slope": -2.5348970141097504e-05} {"step": 50050, "timestamp": 1778248642.2954123, "grad/layer_0/attn": 0.002652904251590371, "grad/layer_0/mlp": 0.0029811887070536613, "grad/layer_0/attn_mlp_ratio": 0.8898813269771608, "grad/layer_4/attn": 0.002685386687517166, "grad/layer_4/mlp": 0.002598165301606059, "grad/layer_4/attn_mlp_ratio": 1.0335703361522677, "grad/layer_8/attn": 0.013295287266373634, "grad/layer_8/mlp": 0.0038450206629931927, "grad/layer_8/attn_mlp_ratio": 3.4577933607889784, "grad/layer_12/attn": 0.007500702980905771, "grad/layer_12/mlp": 0.006745608989149332, "grad/layer_12/attn_mlp_ratio": 1.1119385783814613, "grad/layer_16/attn": 0.0035609877668321133, "grad/layer_16/mlp": 0.004704147111624479, "grad/layer_16/attn_mlp_ratio": 0.75698901557173, "grad/layer_20/attn": 0.005240544676780701, "grad/layer_20/mlp": 0.006033793091773987, "grad/layer_20/attn_mlp_ratio": 0.8685323659957819, "grad/layer_24/attn": 0.010500315576791763, "grad/layer_24/mlp": 0.010841695591807365, "grad/layer_24/attn_mlp_ratio": 0.968512295057906, "grad/layer_27/attn": 0.0035193043295294046, "grad/layer_27/mlp": 0.009666224010288715, "grad/layer_27/attn_mlp_ratio": 0.36408263344354513} {"step": 50050, "timestamp": 1778248642.3098133, "train/loss": 2.1168304085731506, "train/z_loss": 0.0013921473524533211, "train/perplexity": 8.30477299014566, "train/grad_norm": 0.1494140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027528.8815410568, "perf/iters_per_sec": 0.966801110048798, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034338903427124, "data/tokens_consumed": 104964554752, "data/tokens_consumed_B": 104.964554752, "train/loss_slope": -2.685176633050267e-05} {"step": 50060, "timestamp": 1778248652.6586826, "train/loss": 2.142167401313782, "train/z_loss": 0.0013892340939491987, "train/perplexity": 8.517879299200269, "train/grad_norm": 0.359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027958.235432164, "perf/iters_per_sec": 0.967005841938097, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341199159622192, "data/tokens_consumed": 104985526272, "data/tokens_consumed_B": 104.985526272, "train/loss_slope": -2.8522540762586835e-05} {"step": 50070, "timestamp": 1778248663.0065622, "train/loss": 2.1496903896331787, "train/z_loss": 0.0013843200518749653, "train/perplexity": 8.582200847444138, "train/grad_norm": 0.1103515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027678.8192118462, "perf/iters_per_sec": 0.9668726059016448, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342624187469482, "data/tokens_consumed": 105006497792, "data/tokens_consumed_B": 105.006497792, "train/loss_slope": -2.9902835661965075e-05} {"step": 50080, "timestamp": 1778248673.3550234, "train/loss": 2.2137747764587403, "train/z_loss": 0.0013756266329437495, "train/perplexity": 9.150191211115368, "train/grad_norm": 0.1640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027917.6061382084, "perf/iters_per_sec": 0.9669864683810274, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341406345367432, "data/tokens_consumed": 105027469312, "data/tokens_consumed_B": 105.027469312, "train/loss_slope": -2.513676630352624e-05} {"step": 50090, "timestamp": 1778248683.702552, "train/loss": 2.1493740558624266, "train/z_loss": 0.001396367361303419, "train/perplexity": 8.57948643684122, "train/grad_norm": 0.3046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028108.2366142755, "perf/iters_per_sec": 0.9670773680755022, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340434312820435, "data/tokens_consumed": 105048440832, "data/tokens_consumed_B": 105.048440832, "train/loss_slope": -2.301405576339111e-05} {"step": 50100, "timestamp": 1778248694.0420873, "grad/layer_0/attn": 0.0023700373712927103, "grad/layer_0/mlp": 0.0026014475151896477, "grad/layer_0/attn_mlp_ratio": 0.9110455876390702, "grad/layer_4/attn": 0.0017326808301731944, "grad/layer_4/mlp": 0.002573599573224783, "grad/layer_4/attn_mlp_ratio": 0.6732518845878245, "grad/layer_8/attn": 0.005983614828437567, "grad/layer_8/mlp": 0.003626396879553795, "grad/layer_8/attn_mlp_ratio": 1.650016493553834, "grad/layer_12/attn": 0.004470451269298792, "grad/layer_12/mlp": 0.006420471705496311, "grad/layer_12/attn_mlp_ratio": 0.6962808037676945, "grad/layer_16/attn": 0.0035979149397462606, "grad/layer_16/mlp": 0.004386505577713251, "grad/layer_16/attn_mlp_ratio": 0.8202234772033639, "grad/layer_20/attn": 0.004413323011249304, "grad/layer_20/mlp": 0.0055272639729082584, "grad/layer_20/attn_mlp_ratio": 0.7984642949992371, "grad/layer_24/attn": 0.00692747300490737, "grad/layer_24/mlp": 0.007712665945291519, "grad/layer_24/attn_mlp_ratio": 0.8981943421673885, "grad/layer_27/attn": 0.004739599768072367, "grad/layer_27/mlp": 0.006478550378233194, "grad/layer_27/attn_mlp_ratio": 0.7315833663713203} {"step": 50100, "timestamp": 1778248694.6442158, "eos/sharpness": 27.507805824279778, "eos/L0_probe": 1.9810099601745605, "eos/L_plus": 2.1340999603271484, "eos/L_minus": 2.1029980182647705, "eos/grad_norm": 0.09655797481536865, "eos/embed_grad_frac": 0.2406737357378006, "eos/time_s": 0.5992119312286377} {"step": 50100, "timestamp": 1778248694.6640332, "train/loss": 2.160281753540039, "train/z_loss": 0.0013896607211790978, "train/perplexity": 8.673581126406601, "train/grad_norm": 0.0966796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1914024.5252379396, "perf/iters_per_sec": 0.9126780153455446, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0956766605377197, "data/tokens_consumed": 105069412352, "data/tokens_consumed_B": 105.069412352, "train/loss_slope": -2.3829194836311772e-05} {"step": 50100, "timestamp": 1778248696.0232372, "geo/rankme_last": 438.5417785644531, "geo/layer_0/stable_rank_q_proj": 19.425790786743164, "geo/layer_0/stable_rank_k_proj": 16.430442810058594, "geo/layer_0/stable_rank_o_proj": 47.686702728271484, "geo/layer_0/stable_rank_gate_proj": 134.17996215820312, "geo/layer_0/stable_rank_down_proj": 53.757205963134766, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06520366668701172, "geo/layer_0/attn_entropy_mean": 6.178496837615967, "geo/layer_0/attn_entropy_std": 0.3956628739833832, "geo/layer_7/stable_rank_q_proj": 42.6502799987793, "geo/layer_7/stable_rank_k_proj": 41.968563079833984, "geo/layer_7/stable_rank_o_proj": 93.80390167236328, "geo/layer_7/stable_rank_gate_proj": 86.93769836425781, "geo/layer_7/stable_rank_down_proj": 143.2193145751953, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.48232096433639526, "geo/layer_7/attn_entropy_mean": 4.662724494934082, "geo/layer_7/attn_entropy_std": 0.8113810420036316, "geo/layer_14/stable_rank_q_proj": 52.670284271240234, "geo/layer_14/stable_rank_k_proj": 38.614017486572266, "geo/layer_14/stable_rank_o_proj": 45.533905029296875, "geo/layer_14/stable_rank_gate_proj": 73.50985717773438, "geo/layer_14/stable_rank_down_proj": 130.6019744873047, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4031282365322113, "geo/layer_14/attn_entropy_mean": 5.510569095611572, "geo/layer_14/attn_entropy_std": 0.38475552201271057, "geo/layer_21/stable_rank_q_proj": 41.466796875, "geo/layer_21/stable_rank_k_proj": 30.392690658569336, "geo/layer_21/stable_rank_o_proj": 73.22108459472656, "geo/layer_21/stable_rank_gate_proj": 69.90691375732422, "geo/layer_21/stable_rank_down_proj": 53.510894775390625, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14332343637943268, "geo/layer_21/attn_entropy_mean": 5.711939811706543, "geo/layer_21/attn_entropy_std": 0.3035084009170532, "geo/layer_27/stable_rank_q_proj": 42.82744216918945, "geo/layer_27/stable_rank_k_proj": 31.676118850708008, "geo/layer_27/stable_rank_o_proj": 115.78111267089844, "geo/layer_27/stable_rank_gate_proj": 82.59776306152344, "geo/layer_27/stable_rank_down_proj": 130.17066955566406, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09397260844707489, "geo/layer_27/attn_entropy_mean": 4.2519426345825195, "geo/layer_27/attn_entropy_std": 0.6972973346710205, "attnres/final_alpha/block_0": 0.23891091346740723, "attnres/block_norm/0": 1.7428929805755615, "attnres/final_alpha/block_1": 0.004885940812528133, "attnres/block_norm/1": 43520.0859375, "attnres/final_alpha/block_2": 0.01060915645211935, "attnres/block_norm/2": 27519.84375, "attnres/final_alpha/block_3": 0.012755005620419979, "attnres/block_norm/3": 52066.9453125, "attnres/final_alpha/block_4": 0.015078835189342499, "attnres/block_norm/4": 14065.3046875, "attnres/final_alpha/block_5": 0.6034122705459595, "attnres/block_norm/5": 6423.044921875, "attnres/final_alpha/block_6": 0.11434783786535263, "attnres/block_norm/6": 34725.75, "geo/tier1_time_s": 1.3557686805725098, "geo/step": 50100.0, "geo/rankme_slope": -5.182717618297321e-06} {"step": 50110, "timestamp": 1778248706.3785446, "train/loss": 2.153923738002777, "train/z_loss": 0.0013865269022062421, "train/perplexity": 8.618609303875363, "train/grad_norm": 0.1796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1790751.5546012311, "perf/iters_per_sec": 0.8538968823438793, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1711015939712524, "data/tokens_consumed": 105090383872, "data/tokens_consumed_B": 105.090383872, "train/loss_slope": -2.6685632315978175e-05} {"step": 50120, "timestamp": 1778248716.7252073, "train/loss": 2.1416104078292846, "train/z_loss": 0.001381756062619388, "train/perplexity": 8.513136216984451, "train/grad_norm": 0.0986328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028230.8538122878, "perf/iters_per_sec": 0.9671358365117492, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033980917930603, "data/tokens_consumed": 105111355392, "data/tokens_consumed_B": 105.111355392, "train/loss_slope": -2.5711442527919797e-05} {"step": 50130, "timestamp": 1778248727.0779784, "train/loss": 2.139568543434143, "train/z_loss": 0.0013862755498848856, "train/perplexity": 8.495771281706707, "train/grad_norm": 0.16015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026655.5429601849, "perf/iters_per_sec": 0.9663846697617459, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347846269607544, "data/tokens_consumed": 105132326912, "data/tokens_consumed_B": 105.132326912, "train/loss_slope": -2.677010753081076e-05} {"step": 50140, "timestamp": 1778248737.4285696, "train/loss": 2.158557653427124, "train/z_loss": 0.0013824955211021006, "train/perplexity": 8.65863988800369, "train/grad_norm": 0.166015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027402.0968195961, "perf/iters_per_sec": 0.9667406543825131, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344035863876342, "data/tokens_consumed": 105153298432, "data/tokens_consumed_B": 105.153298432, "train/loss_slope": -2.111550234403476e-05} {"step": 50150, "timestamp": 1778248747.766013, "grad/layer_0/attn": 0.003691833233460784, "grad/layer_0/mlp": 0.0029267705976963043, "grad/layer_0/attn_mlp_ratio": 1.261401597455745, "grad/layer_4/attn": 0.002468046033754945, "grad/layer_4/mlp": 0.0025061534252017736, "grad/layer_4/attn_mlp_ratio": 0.9847944305631632, "grad/layer_8/attn": 0.005640535149723291, "grad/layer_8/mlp": 0.003646403318271041, "grad/layer_8/attn_mlp_ratio": 1.5468763333920352, "grad/layer_12/attn": 0.004920474253594875, "grad/layer_12/mlp": 0.007074396591633558, "grad/layer_12/attn_mlp_ratio": 0.6955327030803926, "grad/layer_16/attn": 0.0035740388557314873, "grad/layer_16/mlp": 0.0049959030002355576, "grad/layer_16/attn_mlp_ratio": 0.7153939506078433, "grad/layer_20/attn": 0.00686455424875021, "grad/layer_20/mlp": 0.0063574607484042645, "grad/layer_20/attn_mlp_ratio": 1.0797635113196533, "grad/layer_24/attn": 0.005594203248620033, "grad/layer_24/mlp": 0.008502611890435219, "grad/layer_24/attn_mlp_ratio": 0.6579393785007571, "grad/layer_27/attn": 0.010873066261410713, "grad/layer_27/mlp": 0.007034319452941418, "grad/layer_27/attn_mlp_ratio": 1.5457168500205702} {"step": 50150, "timestamp": 1778248747.7805512, "train/loss": 2.1421860814094544, "train/z_loss": 0.0013767481432296336, "train/perplexity": 8.518038415486656, "train/grad_norm": 0.11279296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027212.7203551852, "perf/iters_per_sec": 0.9666503526473929, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345002174377442, "data/tokens_consumed": 105174269952, "data/tokens_consumed_B": 105.174269952, "train/loss_slope": -2.0097880895667832e-05} {"step": 50160, "timestamp": 1778248758.1374183, "train/loss": 2.2472901582717895, "train/z_loss": 0.0013769217417575418, "train/perplexity": 9.462060377809165, "train/grad_norm": 0.11767578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026300.9567393467, "perf/iters_per_sec": 0.9662155898758634, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034965705871582, "data/tokens_consumed": 105195241472, "data/tokens_consumed_B": 105.195241472, "train/loss_slope": -1.0809986039821778e-05} {"step": 50170, "timestamp": 1778248768.4970052, "train/loss": 2.1130358457565306, "train/z_loss": 0.0014080883236601948, "train/perplexity": 8.273319720800467, "train/grad_norm": 0.1640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025925.590080403, "perf/iters_per_sec": 0.9660366011049285, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351574659347533, "data/tokens_consumed": 105216212992, "data/tokens_consumed_B": 105.216212992, "train/loss_slope": -1.3938573892503534e-05} {"step": 50175, "timestamp": 1778248774.258943, "eos/sharpness": 3.7448644638061515, "eos/L0_probe": 1.9824070930480957, "eos/L_plus": 2.0020968914031982, "eos/L_minus": 2.0001659393310547, "eos/grad_norm": 0.08543205261230469, "eos/embed_grad_frac": 0.3175635039806366, "eos/time_s": 0.5904979705810547} {"step": 50175, "timestamp": 1778248775.6334486, "geo/rankme_last": 439.2965087890625, "geo/layer_0/stable_rank_q_proj": 19.447219848632812, "geo/layer_0/stable_rank_k_proj": 16.405723571777344, "geo/layer_0/stable_rank_o_proj": 47.748558044433594, "geo/layer_0/stable_rank_gate_proj": 134.14198303222656, "geo/layer_0/stable_rank_down_proj": 53.724788665771484, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.062165483832359314, "geo/layer_0/attn_entropy_mean": 6.1852521896362305, "geo/layer_0/attn_entropy_std": 0.39435669779777527, "geo/layer_7/stable_rank_q_proj": 42.660888671875, "geo/layer_7/stable_rank_k_proj": 42.044219970703125, "geo/layer_7/stable_rank_o_proj": 93.64542388916016, "geo/layer_7/stable_rank_gate_proj": 86.77622985839844, "geo/layer_7/stable_rank_down_proj": 143.1525115966797, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.47496750950813293, "geo/layer_7/attn_entropy_mean": 4.646600723266602, "geo/layer_7/attn_entropy_std": 0.7945024371147156, "geo/layer_14/stable_rank_q_proj": 52.705501556396484, "geo/layer_14/stable_rank_k_proj": 38.5389404296875, "geo/layer_14/stable_rank_o_proj": 45.504600524902344, "geo/layer_14/stable_rank_gate_proj": 73.56688690185547, "geo/layer_14/stable_rank_down_proj": 130.86680603027344, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38444215059280396, "geo/layer_14/attn_entropy_mean": 5.517249584197998, "geo/layer_14/attn_entropy_std": 0.3888968527317047, "geo/layer_21/stable_rank_q_proj": 41.512535095214844, "geo/layer_21/stable_rank_k_proj": 30.419593811035156, "geo/layer_21/stable_rank_o_proj": 73.27236938476562, "geo/layer_21/stable_rank_gate_proj": 69.94606018066406, "geo/layer_21/stable_rank_down_proj": 53.470420837402344, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14214278757572174, "geo/layer_21/attn_entropy_mean": 5.691256046295166, "geo/layer_21/attn_entropy_std": 0.29610997438430786, "geo/layer_27/stable_rank_q_proj": 42.84892654418945, "geo/layer_27/stable_rank_k_proj": 31.611072540283203, "geo/layer_27/stable_rank_o_proj": 115.78214263916016, "geo/layer_27/stable_rank_gate_proj": 82.6507797241211, "geo/layer_27/stable_rank_down_proj": 130.0382080078125, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08952076733112335, "geo/layer_27/attn_entropy_mean": 4.271629333496094, "geo/layer_27/attn_entropy_std": 0.7066930532455444, "attnres/final_alpha/block_0": 0.2400752454996109, "attnres/block_norm/0": 1.7428951263427734, "attnres/final_alpha/block_1": 0.00489500816911459, "attnres/block_norm/1": 43551.21875, "attnres/final_alpha/block_2": 0.010582809336483479, "attnres/block_norm/2": 27650.73046875, "attnres/final_alpha/block_3": 0.012501008808612823, "attnres/block_norm/3": 52815.1640625, "attnres/final_alpha/block_4": 0.014896165579557419, "attnres/block_norm/4": 14059.734375, "attnres/final_alpha/block_5": 0.6030032634735107, "attnres/block_norm/5": 6354.125, "attnres/final_alpha/block_6": 0.11404649913311005, "attnres/block_norm/6": 34429.47265625, "geo/tier1_time_s": 1.3569667339324951, "geo/step": 50175.0, "geo/rankme_slope": -5.259506146208485e-06} {"step": 50180, "timestamp": 1778248780.8158917, "train/loss": 2.1973387479782103, "train/z_loss": 0.001373762427829206, "train/perplexity": 9.00102759443736, "train/grad_norm": 0.2255859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1703685.411076968, "perf/iters_per_sec": 0.8123805098900643, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2309502601623534, "data/tokens_consumed": 105237184512, "data/tokens_consumed_B": 105.237184512, "train/loss_slope": -1.1754875464944421e-05} {"step": 50190, "timestamp": 1778248791.1655624, "train/loss": 2.1987542152404784, "train/z_loss": 0.0013809432392008602, "train/perplexity": 9.013777275573469, "train/grad_norm": 0.1767578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027722.6642795303, "perf/iters_per_sec": 0.9668935128591205, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342400550842286, "data/tokens_consumed": 105258156032, "data/tokens_consumed_B": 105.258156032, "train/loss_slope": -1.191924016038615e-05} {"step": 50200, "timestamp": 1778248801.5098605, "grad/layer_0/attn": 0.0028297577518969774, "grad/layer_0/mlp": 0.002787970704957843, "grad/layer_0/attn_mlp_ratio": 1.0149883014789196, "grad/layer_4/attn": 0.0025007545482367277, "grad/layer_4/mlp": 0.0026398838963359594, "grad/layer_4/attn_mlp_ratio": 0.9472971356725003, "grad/layer_8/attn": 0.005151082761585712, "grad/layer_8/mlp": 0.0036919284611940384, "grad/layer_8/attn_mlp_ratio": 1.3952281784996827, "grad/layer_12/attn": 0.004993125796318054, "grad/layer_12/mlp": 0.006997482851147652, "grad/layer_12/attn_mlp_ratio": 0.7135602660524004, "grad/layer_16/attn": 0.003737047780305147, "grad/layer_16/mlp": 0.004456747323274612, "grad/layer_16/attn_mlp_ratio": 0.8385145994114553, "grad/layer_20/attn": 0.0043267132714390755, "grad/layer_20/mlp": 0.005669956095516682, "grad/layer_20/attn_mlp_ratio": 0.763094655803562, "grad/layer_24/attn": 0.0075033302418887615, "grad/layer_24/mlp": 0.008097411133348942, "grad/layer_24/attn_mlp_ratio": 0.9266332196377188, "grad/layer_27/attn": 0.004273965489119291, "grad/layer_27/mlp": 0.007564642932265997, "grad/layer_27/attn_mlp_ratio": 0.564992356002689} {"step": 50200, "timestamp": 1778248801.5240054, "train/loss": 2.1081577062606813, "train/z_loss": 0.0014000863884575665, "train/perplexity": 8.233059590211838, "train/grad_norm": 0.1005859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026030.9101900584, "perf/iters_per_sec": 0.9660868216467182, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351036548614503, "data/tokens_consumed": 105279127552, "data/tokens_consumed_B": 105.279127552, "train/loss_slope": -1.5317539813959892e-05} {"step": 50210, "timestamp": 1778248811.8731463, "train/loss": 2.157314324378967, "train/z_loss": 0.0013747119111940265, "train/perplexity": 8.647881039293946, "train/grad_norm": 0.26953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027401.1155032036, "perf/iters_per_sec": 0.9667401864543932, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344040870666504, "data/tokens_consumed": 105300099072, "data/tokens_consumed_B": 105.300099072, "train/loss_slope": -1.3830087458876365e-05} {"step": 50220, "timestamp": 1778248822.2244895, "train/loss": 2.1457387447357177, "train/z_loss": 0.0013936039875261485, "train/perplexity": 8.548353956747466, "train/grad_norm": 0.271484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027345.836213808, "perf/iters_per_sec": 0.9667138272351303, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034432291984558, "data/tokens_consumed": 105321070592, "data/tokens_consumed_B": 105.321070592, "train/loss_slope": -1.2911867465909972e-05} {"step": 50230, "timestamp": 1778248833.08665, "train/loss": 2.192788863182068, "train/z_loss": 0.0013869827962480485, "train/perplexity": 8.960166981865198, "train/grad_norm": 0.09814453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1931946.9064188302, "perf/iters_per_sec": 0.9212240726560736, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0855122327804565, "data/tokens_consumed": 105342042112, "data/tokens_consumed_B": 105.342042112, "train/loss_slope": -1.1179725529372877e-05} {"step": 50240, "timestamp": 1778248843.4343464, "train/loss": 2.1807565689086914, "train/z_loss": 0.001388872426468879, "train/perplexity": 8.853001631429036, "train/grad_norm": 0.1484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027711.5859757334, "perf/iters_per_sec": 0.9668882303122203, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342457056045533, "data/tokens_consumed": 105363013632, "data/tokens_consumed_B": 105.363013632, "train/loss_slope": -9.859855984053468e-06} {"step": 50250, "timestamp": 1778248853.7691002, "grad/layer_0/attn": 0.004267469979822636, "grad/layer_0/mlp": 0.00372705003246665, "grad/layer_0/attn_mlp_ratio": 1.1449993501961089, "grad/layer_4/attn": 0.0040406049229204655, "grad/layer_4/mlp": 0.0025841672904789448, "grad/layer_4/attn_mlp_ratio": 1.563600306159574, "grad/layer_8/attn": 0.006328040733933449, "grad/layer_8/mlp": 0.003548086853697896, "grad/layer_8/attn_mlp_ratio": 1.7835077934993173, "grad/layer_12/attn": 0.010160157456994057, "grad/layer_12/mlp": 0.006758110132068396, "grad/layer_12/attn_mlp_ratio": 1.5034021506163606, "grad/layer_16/attn": 0.0035244415048509836, "grad/layer_16/mlp": 0.004570960532873869, "grad/layer_16/attn_mlp_ratio": 0.7710505051178018, "grad/layer_20/attn": 0.0037177640479058027, "grad/layer_20/mlp": 0.006107977591454983, "grad/layer_20/attn_mlp_ratio": 0.6086734817494387, "grad/layer_24/attn": 0.007558026351034641, "grad/layer_24/mlp": 0.008294692263007164, "grad/layer_24/attn_mlp_ratio": 0.9111882659738026, "grad/layer_27/attn": 0.005614493507891893, "grad/layer_27/mlp": 0.006988438777625561, "grad/layer_27/attn_mlp_ratio": 0.8033973833365645} {"step": 50250, "timestamp": 1778248854.3613293, "eos/sharpness": 28.095936775207512, "eos/L0_probe": 1.9835220575332642, "eos/L_plus": 2.1295974254608154, "eos/L_minus": 2.118406057357788, "eos/grad_norm": 0.10968756675720215, "eos/embed_grad_frac": 0.17551247775554657, "eos/time_s": 0.5895087718963623} {"step": 50250, "timestamp": 1778248854.3812177, "train/loss": 2.164735198020935, "train/z_loss": 0.0013985845376737415, "train/perplexity": 8.712294578524256, "train/grad_norm": 0.10986328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1916893.9424935284, "perf/iters_per_sec": 0.9140462601153986, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0940365314483642, "data/tokens_consumed": 105383985152, "data/tokens_consumed_B": 105.383985152, "train/loss_slope": -1.1445346466600118e-05} {"step": 50250, "timestamp": 1778248855.7471704, "geo/rankme_last": 438.99432373046875, "geo/layer_0/stable_rank_q_proj": 19.445707321166992, "geo/layer_0/stable_rank_k_proj": 16.37885856628418, "geo/layer_0/stable_rank_o_proj": 47.813419342041016, "geo/layer_0/stable_rank_gate_proj": 133.80224609375, "geo/layer_0/stable_rank_down_proj": 53.70242691040039, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.059908825904130936, "geo/layer_0/attn_entropy_mean": 6.1803436279296875, "geo/layer_0/attn_entropy_std": 0.39599475264549255, "geo/layer_7/stable_rank_q_proj": 42.701805114746094, "geo/layer_7/stable_rank_k_proj": 41.994293212890625, "geo/layer_7/stable_rank_o_proj": 93.81694030761719, "geo/layer_7/stable_rank_gate_proj": 86.68472290039062, "geo/layer_7/stable_rank_down_proj": 143.4154815673828, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.46672943234443665, "geo/layer_7/attn_entropy_mean": 4.644659996032715, "geo/layer_7/attn_entropy_std": 0.807262659072876, "geo/layer_14/stable_rank_q_proj": 52.739105224609375, "geo/layer_14/stable_rank_k_proj": 38.7276611328125, "geo/layer_14/stable_rank_o_proj": 45.50910949707031, "geo/layer_14/stable_rank_gate_proj": 73.63031768798828, "geo/layer_14/stable_rank_down_proj": 130.47891235351562, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3997490406036377, "geo/layer_14/attn_entropy_mean": 5.528022289276123, "geo/layer_14/attn_entropy_std": 0.38312089443206787, "geo/layer_21/stable_rank_q_proj": 41.54421615600586, "geo/layer_21/stable_rank_k_proj": 30.432289123535156, "geo/layer_21/stable_rank_o_proj": 73.09683227539062, "geo/layer_21/stable_rank_gate_proj": 69.8355712890625, "geo/layer_21/stable_rank_down_proj": 53.48925018310547, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14317011833190918, "geo/layer_21/attn_entropy_mean": 5.70982027053833, "geo/layer_21/attn_entropy_std": 0.2952665090560913, "geo/layer_27/stable_rank_q_proj": 42.752418518066406, "geo/layer_27/stable_rank_k_proj": 31.601877212524414, "geo/layer_27/stable_rank_o_proj": 115.61769104003906, "geo/layer_27/stable_rank_gate_proj": 82.6507797241211, "geo/layer_27/stable_rank_down_proj": 130.2202911376953, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09672006219625473, "geo/layer_27/attn_entropy_mean": 4.236703872680664, "geo/layer_27/attn_entropy_std": 0.7074499726295471, "attnres/final_alpha/block_0": 0.2382020205259323, "attnres/block_norm/0": 1.7429120540618896, "attnres/final_alpha/block_1": 0.004791192710399628, "attnres/block_norm/1": 43669.921875, "attnres/final_alpha/block_2": 0.010678968392312527, "attnres/block_norm/2": 27665.09765625, "attnres/final_alpha/block_3": 0.012378244660794735, "attnres/block_norm/3": 52487.375, "attnres/final_alpha/block_4": 0.014841580763459206, "attnres/block_norm/4": 14052.701171875, "attnres/final_alpha/block_5": 0.6060553193092346, "attnres/block_norm/5": 6312.4736328125, "attnres/final_alpha/block_6": 0.11305268853902817, "attnres/block_norm/6": 34626.828125, "geo/tier1_time_s": 1.3619933128356934, "geo/step": 50250.0, "geo/rankme_slope": -2.7313659838935573e-05} {"step": 50260, "timestamp": 1778248866.0953302, "train/loss": 2.1667787075042724, "train/z_loss": 0.001387194893322885, "train/perplexity": 8.730116438485114, "train/grad_norm": 0.09033203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1790857.577817446, "perf/iters_per_sec": 0.8539474381530028, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1710322618484497, "data/tokens_consumed": 105404956672, "data/tokens_consumed_B": 105.404956672, "train/loss_slope": -1.0277523280549214e-05} {"step": 50270, "timestamp": 1778248876.4431293, "train/loss": 2.1611493825912476, "train/z_loss": 0.0013787885429337621, "train/perplexity": 8.681109842964197, "train/grad_norm": 0.1650390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027692.6549772667, "perf/iters_per_sec": 0.9668792033087095, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034255361557007, "data/tokens_consumed": 105425928192, "data/tokens_consumed_B": 105.425928192, "train/loss_slope": -1.365135681010425e-05} {"step": 50280, "timestamp": 1778248886.7869637, "train/loss": 2.1575264930725098, "train/z_loss": 0.0013841093168593943, "train/perplexity": 8.649716043574312, "train/grad_norm": 0.1083984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028678.9862810306, "perf/iters_per_sec": 0.967349522724643, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0337525129318237, "data/tokens_consumed": 105446899712, "data/tokens_consumed_B": 105.446899712, "train/loss_slope": -1.1239427666101902e-05} {"step": 50290, "timestamp": 1778248897.1335921, "train/loss": 2.145844745635986, "train/z_loss": 0.001381993922404945, "train/perplexity": 8.54926013798986, "train/grad_norm": 0.189453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028209.200650994, "perf/iters_per_sec": 0.9671255114798517, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0339919567108153, "data/tokens_consumed": 105467871232, "data/tokens_consumed_B": 105.467871232, "train/loss_slope": -1.3561054374804738e-05} {"step": 50300, "timestamp": 1778248907.473612, "grad/layer_0/attn": 0.0032174591906368732, "grad/layer_0/mlp": 0.003232113551348448, "grad/layer_0/attn_mlp_ratio": 0.9954659822356616, "grad/layer_4/attn": 0.0021701673977077007, "grad/layer_4/mlp": 0.0024487744085490704, "grad/layer_4/attn_mlp_ratio": 0.8862259020302985, "grad/layer_8/attn": 0.004235898610204458, "grad/layer_8/mlp": 0.0036204333882778883, "grad/layer_8/attn_mlp_ratio": 1.169997632581651, "grad/layer_12/attn": 0.006509509403258562, "grad/layer_12/mlp": 0.006703007034957409, "grad/layer_12/attn_mlp_ratio": 0.971132697936465, "grad/layer_16/attn": 0.0039503187872469425, "grad/layer_16/mlp": 0.005050695035606623, "grad/layer_16/attn_mlp_ratio": 0.7821336828267071, "grad/layer_20/attn": 0.004224104341119528, "grad/layer_20/mlp": 0.00831520650535822, "grad/layer_20/attn_mlp_ratio": 0.5079975208791043, "grad/layer_24/attn": 0.026358217000961304, "grad/layer_24/mlp": 0.01613304391503334, "grad/layer_24/attn_mlp_ratio": 1.6338030799643133, "grad/layer_27/attn": 0.013833841308951378, "grad/layer_27/mlp": 0.015240154229104519, "grad/layer_27/attn_mlp_ratio": 0.9077231772208849} {"step": 50300, "timestamp": 1778248907.4877076, "train/loss": 2.175584888458252, "train/z_loss": 0.0013831038144417108, "train/perplexity": 8.807334924556146, "train/grad_norm": 0.32421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026347.0762737908, "perf/iters_per_sec": 0.9662375813836054, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349421501159668, "data/tokens_consumed": 105488842752, "data/tokens_consumed_B": 105.488842752, "train/loss_slope": -1.2089607367242331e-05} {"step": 50310, "timestamp": 1778248917.8328786, "train/loss": 2.13070262670517, "train/z_loss": 0.0013935897499322892, "train/perplexity": 8.420781399212625, "train/grad_norm": 0.1875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028771.16333135, "perf/iters_per_sec": 0.9673934761673689, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0337055444717407, "data/tokens_consumed": 105509814272, "data/tokens_consumed_B": 105.509814272, "train/loss_slope": -1.3167050876955065e-05} {"step": 50320, "timestamp": 1778248928.1878872, "train/loss": 2.1648327589035032, "train/z_loss": 0.0013698023394681513, "train/perplexity": 8.713144599136237, "train/grad_norm": 0.15234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026751.3655618262, "perf/iters_per_sec": 0.9664303615388041, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347357034683227, "data/tokens_consumed": 105530785792, "data/tokens_consumed_B": 105.530785792, "train/loss_slope": -1.3570352077054941e-05} {"step": 50325, "timestamp": 1778248933.9396608, "eos/sharpness": 81.71458244323729, "eos/L0_probe": 1.9786266088485718, "eos/L_plus": 2.3166589736938477, "eos/L_minus": 2.457740068435669, "eos/grad_norm": 0.21317961812019348, "eos/embed_grad_frac": 0.04521622508764267, "eos/time_s": 0.5900073051452637} {"step": 50325, "timestamp": 1778248935.3164723, "geo/rankme_last": 439.1846618652344, "geo/layer_0/stable_rank_q_proj": 19.42884063720703, "geo/layer_0/stable_rank_k_proj": 16.397111892700195, "geo/layer_0/stable_rank_o_proj": 47.75013732910156, "geo/layer_0/stable_rank_gate_proj": 133.53570556640625, "geo/layer_0/stable_rank_down_proj": 53.76960372924805, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.062401991337537766, "geo/layer_0/attn_entropy_mean": 6.180627822875977, "geo/layer_0/attn_entropy_std": 0.3959295153617859, "geo/layer_7/stable_rank_q_proj": 42.70920944213867, "geo/layer_7/stable_rank_k_proj": 41.97428894042969, "geo/layer_7/stable_rank_o_proj": 93.66616821289062, "geo/layer_7/stable_rank_gate_proj": 86.70538330078125, "geo/layer_7/stable_rank_down_proj": 143.3332977294922, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4690151512622833, "geo/layer_7/attn_entropy_mean": 4.645925521850586, "geo/layer_7/attn_entropy_std": 0.8051629662513733, "geo/layer_14/stable_rank_q_proj": 52.76497268676758, "geo/layer_14/stable_rank_k_proj": 38.72568893432617, "geo/layer_14/stable_rank_o_proj": 45.459510803222656, "geo/layer_14/stable_rank_gate_proj": 73.56108856201172, "geo/layer_14/stable_rank_down_proj": 130.32139587402344, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38607993721961975, "geo/layer_14/attn_entropy_mean": 5.520786285400391, "geo/layer_14/attn_entropy_std": 0.37870728969573975, "geo/layer_21/stable_rank_q_proj": 41.5234375, "geo/layer_21/stable_rank_k_proj": 30.493453979492188, "geo/layer_21/stable_rank_o_proj": 73.14257049560547, "geo/layer_21/stable_rank_gate_proj": 69.77952575683594, "geo/layer_21/stable_rank_down_proj": 53.41270065307617, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14775849878787994, "geo/layer_21/attn_entropy_mean": 5.714428424835205, "geo/layer_21/attn_entropy_std": 0.296317994594574, "geo/layer_27/stable_rank_q_proj": 42.722747802734375, "geo/layer_27/stable_rank_k_proj": 31.679855346679688, "geo/layer_27/stable_rank_o_proj": 115.48011779785156, "geo/layer_27/stable_rank_gate_proj": 82.63213348388672, "geo/layer_27/stable_rank_down_proj": 130.1515350341797, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09349551796913147, "geo/layer_27/attn_entropy_mean": 4.257353782653809, "geo/layer_27/attn_entropy_std": 0.7015283703804016, "attnres/final_alpha/block_0": 0.2386646866798401, "attnres/block_norm/0": 1.743062973022461, "attnres/final_alpha/block_1": 0.004843724891543388, "attnres/block_norm/1": 43519.4296875, "attnres/final_alpha/block_2": 0.01076625008136034, "attnres/block_norm/2": 27709.4296875, "attnres/final_alpha/block_3": 0.012727975845336914, "attnres/block_norm/3": 52603.59375, "attnres/final_alpha/block_4": 0.0150980893522501, "attnres/block_norm/4": 14057.935546875, "attnres/final_alpha/block_5": 0.6036562919616699, "attnres/block_norm/5": 6315.791015625, "attnres/final_alpha/block_6": 0.11424297839403152, "attnres/block_norm/6": 34606.9921875, "geo/tier1_time_s": 1.358717441558838, "geo/step": 50325.0, "geo/rankme_slope": -2.8187466392807122e-05} {"step": 50330, "timestamp": 1778248940.494006, "train/loss": 2.143345594406128, "train/z_loss": 0.0013761806418187915, "train/perplexity": 8.527920920074603, "train/grad_norm": 0.1064453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1704892.8088490367, "perf/iters_per_sec": 0.8129562420125183, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2300785064697266, "data/tokens_consumed": 105551757312, "data/tokens_consumed_B": 105.551757312, "train/loss_slope": -1.4008676530075188e-05} {"step": 50340, "timestamp": 1778248950.8343914, "train/loss": 2.1824622631072996, "train/z_loss": 0.0013887127861380577, "train/perplexity": 8.868115030706536, "train/grad_norm": 0.2392578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029069.5094303444, "perf/iters_per_sec": 0.9675357386733744, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0335535526275634, "data/tokens_consumed": 105572728832, "data/tokens_consumed_B": 105.572728832, "train/loss_slope": -1.1987866133567698e-05} {"step": 50350, "timestamp": 1778248961.1795838, "grad/layer_0/attn": 0.002962309867143631, "grad/layer_0/mlp": 0.0031430842354893684, "grad/layer_0/attn_mlp_ratio": 0.9424850086570803, "grad/layer_4/attn": 0.0033643813803792, "grad/layer_4/mlp": 0.002710530534386635, "grad/layer_4/attn_mlp_ratio": 1.2412261044747517, "grad/layer_8/attn": 0.004083859734237194, "grad/layer_8/mlp": 0.003826010739430785, "grad/layer_8/attn_mlp_ratio": 1.0673936655246827, "grad/layer_12/attn": 0.004618325736373663, "grad/layer_12/mlp": 0.007159476634114981, "grad/layer_12/attn_mlp_ratio": 0.6450647034534372, "grad/layer_16/attn": 0.004832083359360695, "grad/layer_16/mlp": 0.004510699305683374, "grad/layer_16/attn_mlp_ratio": 1.0712492508971856, "grad/layer_20/attn": 0.008377527818083763, "grad/layer_20/mlp": 0.006345066241919994, "grad/layer_20/attn_mlp_ratio": 1.3203215485291133, "grad/layer_24/attn": 0.021023519337177277, "grad/layer_24/mlp": 0.013892343267798424, "grad/layer_24/attn_mlp_ratio": 1.513316996318164, "grad/layer_27/attn": 0.010234827175736427, "grad/layer_27/mlp": 0.012418358586728573, "grad/layer_27/attn_mlp_ratio": 0.8241690737016903} {"step": 50350, "timestamp": 1778248961.193632, "train/loss": 2.1546454548835756, "train/z_loss": 0.0013810258242301643, "train/perplexity": 8.624831744849232, "train/grad_norm": 0.234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025643.6548691597, "perf/iters_per_sec": 0.9659021639200018, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0353015422821046, "data/tokens_consumed": 105593700352, "data/tokens_consumed_B": 105.593700352, "train/loss_slope": -1.3289582067661862e-05} {"step": 50360, "timestamp": 1778248971.5397253, "train/loss": 2.1398701906204223, "train/z_loss": 0.0013829280738718807, "train/perplexity": 8.49833439376744, "train/grad_norm": 0.16015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027934.9049481272, "perf/iters_per_sec": 0.9669947170963894, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341318130493165, "data/tokens_consumed": 105614671872, "data/tokens_consumed_B": 105.614671872, "train/loss_slope": -1.3827721671302724e-05} {"step": 50370, "timestamp": 1778248981.884996, "train/loss": 2.14205379486084, "train/z_loss": 0.001391576079186052, "train/perplexity": 8.516911668112108, "train/grad_norm": 0.21875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028645.018576689, "perf/iters_per_sec": 0.9673333256610341, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0337698221206666, "data/tokens_consumed": 105635643392, "data/tokens_consumed_B": 105.635643392, "train/loss_slope": -1.6592852224503898e-05} {"step": 50380, "timestamp": 1778248992.2295938, "train/loss": 2.1158514738082888, "train/z_loss": 0.001384302752558142, "train/perplexity": 8.296647137140248, "train/grad_norm": 0.1845703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028254.7990991748, "perf/iters_per_sec": 0.9671472545143007, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033968710899353, "data/tokens_consumed": 105656614912, "data/tokens_consumed_B": 105.656614912, "train/loss_slope": -1.9987209967010325e-05} {"step": 50390, "timestamp": 1778249002.5923674, "train/loss": 2.145360803604126, "train/z_loss": 0.0014007674413733183, "train/perplexity": 8.545123792624198, "train/grad_norm": 0.240234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024666.4237847636, "perf/iters_per_sec": 0.9654361838268106, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0358012437820434, "data/tokens_consumed": 105677586432, "data/tokens_consumed_B": 105.677586432, "train/loss_slope": -1.8975862747121405e-05} {"step": 50400, "timestamp": 1778249013.3273838, "grad/layer_0/attn": 0.0026898421347141266, "grad/layer_0/mlp": 0.0028155490290373564, "grad/layer_0/attn_mlp_ratio": 0.9553525836126293, "grad/layer_4/attn": 0.002745527308434248, "grad/layer_4/mlp": 0.002455385634675622, "grad/layer_4/attn_mlp_ratio": 1.11816537404334, "grad/layer_8/attn": 0.0037248041480779648, "grad/layer_8/mlp": 0.0034781573340296745, "grad/layer_8/attn_mlp_ratio": 1.0709130390807335, "grad/layer_12/attn": 0.005890053231269121, "grad/layer_12/mlp": 0.006853675935417414, "grad/layer_12/attn_mlp_ratio": 0.8594005904030735, "grad/layer_16/attn": 0.004919119644910097, "grad/layer_16/mlp": 0.004440756980329752, "grad/layer_16/attn_mlp_ratio": 1.1077209484615231, "grad/layer_20/attn": 0.003566679311916232, "grad/layer_20/mlp": 0.005812712013721466, "grad/layer_20/attn_mlp_ratio": 0.613599855306225, "grad/layer_24/attn": 0.016444336622953415, "grad/layer_24/mlp": 0.009007691405713558, "grad/layer_24/attn_mlp_ratio": 1.825588344419079, "grad/layer_27/attn": 0.011570828035473824, "grad/layer_27/mlp": 0.0076051680371165276, "grad/layer_27/attn_mlp_ratio": 1.5214427645594253} {"step": 50400, "timestamp": 1778249013.9062552, "eos/sharpness": 56.84428215026854, "eos/L0_probe": 1.9776384830474854, "eos/L_plus": 2.2237887382507324, "eos/L_minus": 2.299931049346924, "eos/grad_norm": 0.13860400021076202, "eos/embed_grad_frac": 0.11112894117832184, "eos/time_s": 0.5761034488677979} {"step": 50400, "timestamp": 1778249013.9238353, "train/loss": 2.130287194252014, "train/z_loss": 0.0013865016517229378, "train/perplexity": 8.417283859884435, "train/grad_norm": 0.138671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1851918.651903379, "perf/iters_per_sec": 0.8830636271969695, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1324212312698365, "data/tokens_consumed": 105698557952, "data/tokens_consumed_B": 105.698557952, "train/loss_slope": -2.07003663189233e-05} {"step": 50400, "timestamp": 1778249015.286839, "geo/rankme_last": 438.7011413574219, "geo/layer_0/stable_rank_q_proj": 19.46086311340332, "geo/layer_0/stable_rank_k_proj": 16.427703857421875, "geo/layer_0/stable_rank_o_proj": 47.68158721923828, "geo/layer_0/stable_rank_gate_proj": 133.4249267578125, "geo/layer_0/stable_rank_down_proj": 53.68886184692383, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0599612258374691, "geo/layer_0/attn_entropy_mean": 6.183167457580566, "geo/layer_0/attn_entropy_std": 0.3892481327056885, "geo/layer_7/stable_rank_q_proj": 42.66439437866211, "geo/layer_7/stable_rank_k_proj": 42.010986328125, "geo/layer_7/stable_rank_o_proj": 93.67852783203125, "geo/layer_7/stable_rank_gate_proj": 86.66171264648438, "geo/layer_7/stable_rank_down_proj": 143.12095642089844, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4755583107471466, "geo/layer_7/attn_entropy_mean": 4.666910648345947, "geo/layer_7/attn_entropy_std": 0.810973584651947, "geo/layer_14/stable_rank_q_proj": 52.64776611328125, "geo/layer_14/stable_rank_k_proj": 38.67290115356445, "geo/layer_14/stable_rank_o_proj": 45.4537353515625, "geo/layer_14/stable_rank_gate_proj": 73.54486846923828, "geo/layer_14/stable_rank_down_proj": 130.73121643066406, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3876679539680481, "geo/layer_14/attn_entropy_mean": 5.5024333000183105, "geo/layer_14/attn_entropy_std": 0.35353392362594604, "geo/layer_21/stable_rank_q_proj": 41.532230377197266, "geo/layer_21/stable_rank_k_proj": 30.607276916503906, "geo/layer_21/stable_rank_o_proj": 73.08642578125, "geo/layer_21/stable_rank_gate_proj": 69.68196105957031, "geo/layer_21/stable_rank_down_proj": 53.42646026611328, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1392599493265152, "geo/layer_21/attn_entropy_mean": 5.702512741088867, "geo/layer_21/attn_entropy_std": 0.29266294836997986, "geo/layer_27/stable_rank_q_proj": 42.668338775634766, "geo/layer_27/stable_rank_k_proj": 31.71991539001465, "geo/layer_27/stable_rank_o_proj": 115.72979736328125, "geo/layer_27/stable_rank_gate_proj": 82.68119812011719, "geo/layer_27/stable_rank_down_proj": 130.19590759277344, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09916169196367264, "geo/layer_27/attn_entropy_mean": 4.247646808624268, "geo/layer_27/attn_entropy_std": 0.7001282572746277, "attnres/final_alpha/block_0": 0.23931074142456055, "attnres/block_norm/0": 1.743109941482544, "attnres/final_alpha/block_1": 0.004895655903965235, "attnres/block_norm/1": 43647.0078125, "attnres/final_alpha/block_2": 0.01067466102540493, "attnres/block_norm/2": 27639.921875, "attnres/final_alpha/block_3": 0.012650403194129467, "attnres/block_norm/3": 52876.51953125, "attnres/final_alpha/block_4": 0.015118625946342945, "attnres/block_norm/4": 14031.5546875, "attnres/final_alpha/block_5": 0.6022357940673828, "attnres/block_norm/5": 6351.474609375, "attnres/final_alpha/block_6": 0.11511413753032684, "attnres/block_norm/6": 34412.921875, "geo/tier1_time_s": 1.3590021133422852, "geo/step": 50400.0, "geo/rankme_slope": -2.9936017375700277e-05} {"step": 50410, "timestamp": 1778249026.722944, "train/loss": 2.147654724121094, "train/z_loss": 0.001398379635065794, "train/perplexity": 8.56474812713853, "train/grad_norm": 0.1015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1639024.1146061516, "perf/iters_per_sec": 0.7815476010351904, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.279512596130371, "data/tokens_consumed": 105719529472, "data/tokens_consumed_B": 105.719529472, "train/loss_slope": -1.823882007255519e-05} {"step": 50420, "timestamp": 1778249037.0716345, "train/loss": 2.1614325046539307, "train/z_loss": 0.0013902959413826465, "train/perplexity": 8.683568004652798, "train/grad_norm": 0.17578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028219.9570232206, "perf/iters_per_sec": 0.9671306405178168, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033986473083496, "data/tokens_consumed": 105740500992, "data/tokens_consumed_B": 105.740500992, "train/loss_slope": -1.6154892946055014e-05} {"step": 50430, "timestamp": 1778249047.4179285, "train/loss": 2.1472219944000246, "train/z_loss": 0.0013962630182504654, "train/perplexity": 8.561042707850794, "train/grad_norm": 0.1162109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028487.1257942056, "perf/iters_per_sec": 0.9672580365153339, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0338502883911134, "data/tokens_consumed": 105761472512, "data/tokens_consumed_B": 105.761472512, "train/loss_slope": -1.8407916157159073e-05} {"step": 50440, "timestamp": 1778249057.7597911, "train/loss": 2.151522135734558, "train/z_loss": 0.001388024282641709, "train/perplexity": 8.597935667085387, "train/grad_norm": 0.11669921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028885.7178561622, "perf/iters_per_sec": 0.9674481000214397, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0336471796035767, "data/tokens_consumed": 105782444032, "data/tokens_consumed_B": 105.782444032, "train/loss_slope": -1.8507660061183575e-05} {"step": 50450, "timestamp": 1778249068.095118, "grad/layer_0/attn": 0.00285422895103693, "grad/layer_0/mlp": 0.002865241840481758, "grad/layer_0/attn_mlp_ratio": 0.9961563492111328, "grad/layer_4/attn": 0.002572716446593404, "grad/layer_4/mlp": 0.002639718586578965, "grad/layer_4/attn_mlp_ratio": 0.9746176589474406, "grad/layer_8/attn": 0.007450484670698643, "grad/layer_8/mlp": 0.0036916781682521105, "grad/layer_8/attn_mlp_ratio": 2.018183636090857, "grad/layer_12/attn": 0.004302592948079109, "grad/layer_12/mlp": 0.006818744353950024, "grad/layer_12/attn_mlp_ratio": 0.6309948960745501, "grad/layer_16/attn": 0.005421388428658247, "grad/layer_16/mlp": 0.004715483635663986, "grad/layer_16/attn_mlp_ratio": 1.1496993166693348, "grad/layer_20/attn": 0.0036252408754080534, "grad/layer_20/mlp": 0.007174322847276926, "grad/layer_20/attn_mlp_ratio": 0.5053077345485324, "grad/layer_24/attn": 0.020686713978648186, "grad/layer_24/mlp": 0.01409764401614666, "grad/layer_24/attn_mlp_ratio": 1.467388012366887, "grad/layer_27/attn": 0.005803690291941166, "grad/layer_27/mlp": 0.013501270674169064, "grad/layer_27/attn_mlp_ratio": 0.42986252101875605} {"step": 50450, "timestamp": 1778249068.109284, "train/loss": 2.1934945821762084, "train/z_loss": 0.0013614388648420573, "train/perplexity": 8.966492573677584, "train/grad_norm": 0.248046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027762.2573504164, "perf/iters_per_sec": 0.9669123923065264, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342198610305786, "data/tokens_consumed": 105803415552, "data/tokens_consumed_B": 105.803415552, "train/loss_slope": -1.5283018076988524e-05} {"step": 50460, "timestamp": 1778249078.9924948, "train/loss": 2.1339101791381836, "train/z_loss": 0.0013951367698609829, "train/perplexity": 8.447834861582354, "train/grad_norm": 0.154296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1928373.3976749051, "perf/iters_per_sec": 0.9195200909018064, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0875238180160522, "data/tokens_consumed": 105824387072, "data/tokens_consumed_B": 105.824387072, "train/loss_slope": -1.5209458985201344e-05} {"step": 50470, "timestamp": 1778249089.8621461, "train/loss": 2.1195805430412293, "train/z_loss": 0.0013966857106424869, "train/perplexity": 8.327643666898954, "train/grad_norm": 0.126953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1930600.6497355043, "perf/iters_per_sec": 0.9205821274449846, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0862691879272461, "data/tokens_consumed": 105845358592, "data/tokens_consumed_B": 105.845358592, "train/loss_slope": -1.859854246952709e-05} {"step": 50475, "timestamp": 1778249095.9917548, "eos/sharpness": 45.24612426757812, "eos/L0_probe": 1.9825252294540405, "eos/L_plus": 2.20896577835083, "eos/L_minus": 2.2085459232330322, "eos/grad_norm": 0.1464693695306778, "eos/embed_grad_frac": 0.10858432948589325, "eos/time_s": 0.5838608741760254} {"step": 50475, "timestamp": 1778249097.3682666, "geo/rankme_last": 439.23114013671875, "geo/layer_0/stable_rank_q_proj": 19.470979690551758, "geo/layer_0/stable_rank_k_proj": 16.428556442260742, "geo/layer_0/stable_rank_o_proj": 47.66806411743164, "geo/layer_0/stable_rank_gate_proj": 133.39500427246094, "geo/layer_0/stable_rank_down_proj": 53.71982955932617, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06251061707735062, "geo/layer_0/attn_entropy_mean": 6.188409805297852, "geo/layer_0/attn_entropy_std": 0.3873867392539978, "geo/layer_7/stable_rank_q_proj": 42.71485900878906, "geo/layer_7/stable_rank_k_proj": 42.01882553100586, "geo/layer_7/stable_rank_o_proj": 93.71949768066406, "geo/layer_7/stable_rank_gate_proj": 86.60655212402344, "geo/layer_7/stable_rank_down_proj": 142.91802978515625, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4798111617565155, "geo/layer_7/attn_entropy_mean": 4.663348197937012, "geo/layer_7/attn_entropy_std": 0.7934635877609253, "geo/layer_14/stable_rank_q_proj": 52.62110137939453, "geo/layer_14/stable_rank_k_proj": 38.731475830078125, "geo/layer_14/stable_rank_o_proj": 45.4470100402832, "geo/layer_14/stable_rank_gate_proj": 73.60016632080078, "geo/layer_14/stable_rank_down_proj": 130.65513610839844, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39535385370254517, "geo/layer_14/attn_entropy_mean": 5.522968292236328, "geo/layer_14/attn_entropy_std": 0.374155193567276, "geo/layer_21/stable_rank_q_proj": 41.49007034301758, "geo/layer_21/stable_rank_k_proj": 30.53839111328125, "geo/layer_21/stable_rank_o_proj": 72.99522399902344, "geo/layer_21/stable_rank_gate_proj": 69.63591003417969, "geo/layer_21/stable_rank_down_proj": 53.435302734375, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14699402451515198, "geo/layer_21/attn_entropy_mean": 5.7193145751953125, "geo/layer_21/attn_entropy_std": 0.2943422794342041, "geo/layer_27/stable_rank_q_proj": 42.68791961669922, "geo/layer_27/stable_rank_k_proj": 31.669904708862305, "geo/layer_27/stable_rank_o_proj": 115.73539733886719, "geo/layer_27/stable_rank_gate_proj": 82.59996032714844, "geo/layer_27/stable_rank_down_proj": 130.11622619628906, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09047892689704895, "geo/layer_27/attn_entropy_mean": 4.268303871154785, "geo/layer_27/attn_entropy_std": 0.7174399495124817, "attnres/final_alpha/block_0": 0.23991671204566956, "attnres/block_norm/0": 1.7433617115020752, "attnres/final_alpha/block_1": 0.004829864017665386, "attnres/block_norm/1": 43705.0546875, "attnres/final_alpha/block_2": 0.010655026882886887, "attnres/block_norm/2": 27611.794921875, "attnres/final_alpha/block_3": 0.012619640678167343, "attnres/block_norm/3": 52805.65625, "attnres/final_alpha/block_4": 0.015042795799672604, "attnres/block_norm/4": 14112.2666015625, "attnres/final_alpha/block_5": 0.6006994843482971, "attnres/block_norm/5": 6366.328125, "attnres/final_alpha/block_6": 0.11623647809028625, "attnres/block_norm/6": 34340.83984375, "geo/tier1_time_s": 1.3568334579467773, "geo/step": 50475.0, "geo/rankme_slope": -4.3482646965036016e-05} {"step": 50480, "timestamp": 1778249102.5422103, "train/loss": 2.175164246559143, "train/z_loss": 0.0013943746569566428, "train/perplexity": 8.803630969541341, "train/grad_norm": 0.162109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1654800.0276790408, "perf/iters_per_sec": 0.7890701425929264, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2673144578933715, "data/tokens_consumed": 105866330112, "data/tokens_consumed_B": 105.866330112, "train/loss_slope": -1.8373290318610103e-05} {"step": 50490, "timestamp": 1778249112.883512, "train/loss": 2.162371802330017, "train/z_loss": 0.0013972908025607466, "train/perplexity": 8.691728291769097, "train/grad_norm": 0.2001953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028901.9100211817, "perf/iters_per_sec": 0.9674558210473927, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0336389303207398, "data/tokens_consumed": 105887301632, "data/tokens_consumed_B": 105.887301632, "train/loss_slope": -1.6528833416750795e-05} {"step": 50500, "timestamp": 1778249123.2640388, "grad/layer_0/attn": 0.002601580461487174, "grad/layer_0/mlp": 0.002801202703267336, "grad/layer_0/attn_mlp_ratio": 0.9287369191736773, "grad/layer_4/attn": 0.0024725887924432755, "grad/layer_4/mlp": 0.0024621188640594482, "grad/layer_4/attn_mlp_ratio": 1.004252364949322, "grad/layer_8/attn": 0.00493386946618557, "grad/layer_8/mlp": 0.0035528670996427536, "grad/layer_8/attn_mlp_ratio": 1.3887007841671308, "grad/layer_12/attn": 0.00417232746258378, "grad/layer_12/mlp": 0.006703503895550966, "grad/layer_12/attn_mlp_ratio": 0.6224099314855192, "grad/layer_16/attn": 0.004522274248301983, "grad/layer_16/mlp": 0.00447375513613224, "grad/layer_16/attn_mlp_ratio": 1.010845254067071, "grad/layer_20/attn": 0.0032074127811938524, "grad/layer_20/mlp": 0.005800916813313961, "grad/layer_20/attn_mlp_ratio": 0.5529147941823416, "grad/layer_24/attn": 0.008878836408257484, "grad/layer_24/mlp": 0.008491397835314274, "grad/layer_24/attn_mlp_ratio": 1.0456271718619994, "grad/layer_27/attn": 0.01088737789541483, "grad/layer_27/mlp": 0.007037639617919922, "grad/layer_27/attn_mlp_ratio": 1.5470212076489693} {"step": 50500, "timestamp": 1778249123.2782474, "train/loss": 2.13190598487854, "train/z_loss": 0.0013987046084366738, "train/perplexity": 8.430920714726094, "train/grad_norm": 0.1162109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2018444.8992417464, "perf/iters_per_sec": 0.9624695297440273, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0389939308166505, "data/tokens_consumed": 105908273152, "data/tokens_consumed_B": 105.908273152, "train/loss_slope": -1.9461242250590717e-05} {"step": 50500, "timestamp": 1778249130.2083225, "geo/ww_alpha_mean": 7.566169243368331, "geo/ww_alpha_std": 4.849697904297539, "geo/ww_alpha_min": 1.3632534059413266, "geo/ww_alpha_max": 36.30320176913776, "geo/ww_alpha_healthy_frac": 0.17258883248730963, "geo/ww_alpha_by_type/q_proj": 4.00488160066008, "geo/ww_alpha_by_type/k_proj": 4.444786193445294, "geo/ww_alpha_by_type/v_proj": 7.933584374303874, "geo/ww_alpha_by_type/o_proj": 8.192810019105185, "geo/ww_alpha_by_type/gate_proj": 8.000503616191363, "geo/ww_alpha_by_type/up_proj": 12.063849944294835, "geo/ww_alpha_by_type/down_proj": 8.421808530792113, "geo/twonn_id/layer_0": 0.7496802806854248, "geo/twonn_id/layer_7": 3.0391178131103516, "geo/twonn_id/layer_14": 4.121096611022949, "geo/twonn_id/layer_21": 7.360365867614746, "geo/twonn_id/layer_27": 5.858638286590576, "geo/tier2_time_s": 6.9222400188446045} {"step": 50500, "timestamp": 1778249130.9025867, "eoc/jacobian_sigma/layer_0/attn": 1031.5421142578125, "eoc/jacobian_sigma/layer_0/mlp": 6897.50439453125, "eoc/jacobian_sigma/layer_0": 6897.50439453125, "eoc/jacobian_sigma/layer_7/attn": 1.1534781455993652, "eoc/jacobian_sigma/layer_7/mlp": 1.6834089756011963, "eoc/jacobian_sigma/layer_7": 1.6834089756011963, "eoc/jacobian_sigma/layer_14/attn": 1.5966192483901978, "eoc/jacobian_sigma/layer_14/mlp": 6.048382759094238, "eoc/jacobian_sigma/layer_14": 6.048382759094238, "eoc/jacobian_sigma/layer_21/attn": 1.0891824960708618, "eoc/jacobian_sigma/layer_21/mlp": 4.213293075561523, "eoc/jacobian_sigma/layer_21": 4.213293075561523, "eoc/jacobian_sigma/layer_27/attn": 3.2884950637817383, "eoc/jacobian_sigma/layer_27/mlp": 35.81092071533203, "eoc/jacobian_sigma/layer_27": 35.81092071533203, "eoc/layer0_sigma": 6897.50439453125, "eoc/sigma_max": 35.81092071533203, "eoc/sigma_min": 1.6834089756011963, "eoc/sigma_mean": 11.939001381397247, "eoc/time_s": 0.6879744529724121} {"step": 50510, "timestamp": 1778249141.2920675, "train/loss": 2.110614061355591, "train/z_loss": 0.0013961527147330344, "train/perplexity": 8.253307766257226, "train/grad_norm": 0.283203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1164617.0085997768, "perf/iters_per_sec": 0.555332664775742, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.8007224559783936, "data/tokens_consumed": 105929244672, "data/tokens_consumed_B": 105.929244672, "train/loss_slope": -2.1161914043443672e-05} {"step": 50520, "timestamp": 1778249151.6766713, "train/loss": 2.2167842864990233, "train/z_loss": 0.0013906613690778612, "train/perplexity": 9.177770282365998, "train/grad_norm": 0.10888671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020875.8267894308, "perf/iters_per_sec": 0.9636286863276629, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037744116783142, "data/tokens_consumed": 105950216192, "data/tokens_consumed_B": 105.950216192, "train/loss_slope": -1.6340015179897052e-05} {"step": 50530, "timestamp": 1778249162.0505133, "train/loss": 2.168199563026428, "train/z_loss": 0.0013828455354087054, "train/perplexity": 8.742529489123251, "train/grad_norm": 0.1923828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022986.695808161, "perf/iters_per_sec": 0.9646352271118932, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0366612911224364, "data/tokens_consumed": 105971187712, "data/tokens_consumed_B": 105.971187712, "train/loss_slope": -1.4644335791973831e-05} {"step": 50540, "timestamp": 1778249172.4228435, "train/loss": 2.2137046575546266, "train/z_loss": 0.0013872081297449767, "train/perplexity": 9.149549632228881, "train/grad_norm": 0.302734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023232.6159056097, "perf/iters_per_sec": 0.9647524909523056, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0365352869033813, "data/tokens_consumed": 105992159232, "data/tokens_consumed_B": 105.992159232, "train/loss_slope": -1.3721008147701202e-05} {"step": 50550, "timestamp": 1778249182.7867036, "grad/layer_0/attn": 0.0030517736449837685, "grad/layer_0/mlp": 0.0028956164605915546, "grad/layer_0/attn_mlp_ratio": 1.0539287855020114, "grad/layer_4/attn": 0.0018066212069243193, "grad/layer_4/mlp": 0.0025217540096491575, "grad/layer_4/attn_mlp_ratio": 0.716414498944018, "grad/layer_8/attn": 0.004634297918528318, "grad/layer_8/mlp": 0.003610956948250532, "grad/layer_8/attn_mlp_ratio": 1.2833987933402817, "grad/layer_12/attn": 0.0036376977805048227, "grad/layer_12/mlp": 0.006437449250370264, "grad/layer_12/attn_mlp_ratio": 0.5650837128987419, "grad/layer_16/attn": 0.003355553839355707, "grad/layer_16/mlp": 0.004561159759759903, "grad/layer_16/attn_mlp_ratio": 0.7356799460066156, "grad/layer_20/attn": 0.00603707879781723, "grad/layer_20/mlp": 0.005889087915420532, "grad/layer_20/attn_mlp_ratio": 1.025129660485492, "grad/layer_24/attn": 0.012224026024341583, "grad/layer_24/mlp": 0.00916164182126522, "grad/layer_24/attn_mlp_ratio": 1.3342614925789906, "grad/layer_27/attn": 0.0037933546118438244, "grad/layer_27/mlp": 0.009861336089670658, "grad/layer_27/attn_mlp_ratio": 0.38466943412974874} {"step": 50550, "timestamp": 1778249183.386722, "eos/sharpness": 56.512522697448716, "eos/L0_probe": 1.9832826852798462, "eos/L_plus": 2.3072469234466553, "eos/L_minus": 2.2244436740875244, "eos/grad_norm": 0.1472303718328476, "eos/embed_grad_frac": 0.11388599127531052, "eos/time_s": 0.5972015857696533} {"step": 50550, "timestamp": 1778249183.4076748, "train/loss": 2.167395091056824, "train/z_loss": 0.0013839120394550264, "train/perplexity": 8.735499197421271, "train/grad_norm": 0.1474609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1909988.3893233472, "perf/iters_per_sec": 0.9107534357659088, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0979920148849487, "data/tokens_consumed": 106013130752, "data/tokens_consumed_B": 106.013130752, "train/loss_slope": -1.2632548590876688e-05} {"step": 50550, "timestamp": 1778249184.7696657, "geo/rankme_last": 439.0564880371094, "geo/layer_0/stable_rank_q_proj": 19.431283950805664, "geo/layer_0/stable_rank_k_proj": 16.413908004760742, "geo/layer_0/stable_rank_o_proj": 47.64745330810547, "geo/layer_0/stable_rank_gate_proj": 133.44508361816406, "geo/layer_0/stable_rank_down_proj": 53.73015594482422, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06335044652223587, "geo/layer_0/attn_entropy_mean": 6.17986536026001, "geo/layer_0/attn_entropy_std": 0.3974471986293793, "geo/layer_7/stable_rank_q_proj": 42.65165710449219, "geo/layer_7/stable_rank_k_proj": 41.90311813354492, "geo/layer_7/stable_rank_o_proj": 93.57758331298828, "geo/layer_7/stable_rank_gate_proj": 86.71514892578125, "geo/layer_7/stable_rank_down_proj": 142.8438720703125, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4795845150947571, "geo/layer_7/attn_entropy_mean": 4.654000282287598, "geo/layer_7/attn_entropy_std": 0.7812946438789368, "geo/layer_14/stable_rank_q_proj": 52.662071228027344, "geo/layer_14/stable_rank_k_proj": 38.79343032836914, "geo/layer_14/stable_rank_o_proj": 45.430267333984375, "geo/layer_14/stable_rank_gate_proj": 73.62604522705078, "geo/layer_14/stable_rank_down_proj": 130.58660888671875, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3807855546474457, "geo/layer_14/attn_entropy_mean": 5.5166401863098145, "geo/layer_14/attn_entropy_std": 0.37024933099746704, "geo/layer_21/stable_rank_q_proj": 41.55242919921875, "geo/layer_21/stable_rank_k_proj": 30.636306762695312, "geo/layer_21/stable_rank_o_proj": 73.07769012451172, "geo/layer_21/stable_rank_gate_proj": 69.63899230957031, "geo/layer_21/stable_rank_down_proj": 53.441551208496094, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1403711438179016, "geo/layer_21/attn_entropy_mean": 5.705480575561523, "geo/layer_21/attn_entropy_std": 0.28999626636505127, "geo/layer_27/stable_rank_q_proj": 42.6135368347168, "geo/layer_27/stable_rank_k_proj": 31.578323364257812, "geo/layer_27/stable_rank_o_proj": 115.67337036132812, "geo/layer_27/stable_rank_gate_proj": 82.52257537841797, "geo/layer_27/stable_rank_down_proj": 130.04742431640625, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09486349672079086, "geo/layer_27/attn_entropy_mean": 4.237982273101807, "geo/layer_27/attn_entropy_std": 0.7109951972961426, "attnres/final_alpha/block_0": 0.23777258396148682, "attnres/block_norm/0": 1.7435060739517212, "attnres/final_alpha/block_1": 0.00484654912725091, "attnres/block_norm/1": 43700.28125, "attnres/final_alpha/block_2": 0.010627774521708488, "attnres/block_norm/2": 27717.0703125, "attnres/final_alpha/block_3": 0.0125881377607584, "attnres/block_norm/3": 52661.8125, "attnres/final_alpha/block_4": 0.014919421635568142, "attnres/block_norm/4": 14100.6015625, "attnres/final_alpha/block_5": 0.6057209372520447, "attnres/block_norm/5": 6367.6044921875, "attnres/final_alpha/block_6": 0.11352460086345673, "attnres/block_norm/6": 34671.22265625, "geo/tier1_time_s": 1.3573718070983887, "geo/step": 50550.0, "geo/rankme_slope": -6.0051383834783915e-05} {"step": 50560, "timestamp": 1778249195.1513252, "train/loss": 2.1413911104202272, "train/z_loss": 0.0013864576583728194, "train/perplexity": 8.511269512958274, "train/grad_norm": 0.10107421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1786405.9523586656, "perf/iters_per_sec": 0.8518247377198532, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1739504098892213, "data/tokens_consumed": 106034102272, "data/tokens_consumed_B": 106.034102272, "train/loss_slope": -1.1777985414298389e-05} {"step": 50570, "timestamp": 1778249205.5281842, "train/loss": 2.20889835357666, "train/z_loss": 0.0013804057613015174, "train/perplexity": 9.105679626178961, "train/grad_norm": 0.1318359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021935.1025020147, "perf/iters_per_sec": 0.964133788348205, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0372004508972168, "data/tokens_consumed": 106055073792, "data/tokens_consumed_B": 106.055073792, "train/loss_slope": -9.4519691069563e-06} {"step": 50580, "timestamp": 1778249215.8985245, "train/loss": 2.1882997035980223, "train/z_loss": 0.0013942130375653506, "train/perplexity": 8.920033512557403, "train/grad_norm": 0.1181640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023286.181834827, "perf/iters_per_sec": 0.9647780331777701, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0365078449249268, "data/tokens_consumed": 106076045312, "data/tokens_consumed_B": 106.076045312, "train/loss_slope": -3.890971045861748e-06} {"step": 50590, "timestamp": 1778249226.275739, "train/loss": 2.1824445486068726, "train/z_loss": 0.0013890160946175456, "train/perplexity": 8.867957937870452, "train/grad_norm": 0.13671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022275.701961005, "perf/iters_per_sec": 0.9642961988263155, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037025761604309, "data/tokens_consumed": 106097016832, "data/tokens_consumed_B": 106.097016832, "train/loss_slope": -1.000918148875114e-06} {"step": 50600, "timestamp": 1778249236.639879, "grad/layer_0/attn": 0.003073387546464801, "grad/layer_0/mlp": 0.003058100352063775, "grad/layer_0/attn_mlp_ratio": 1.0049988856287273, "grad/layer_4/attn": 0.001937482738867402, "grad/layer_4/mlp": 0.002507092198356986, "grad/layer_4/attn_mlp_ratio": 0.7728007222299412, "grad/layer_8/attn": 0.0042188698425889015, "grad/layer_8/mlp": 0.003727071452885866, "grad/layer_8/attn_mlp_ratio": 1.1319530045840511, "grad/layer_12/attn": 0.0040360670536756516, "grad/layer_12/mlp": 0.0059546222910285, "grad/layer_12/attn_mlp_ratio": 0.6778040299846, "grad/layer_16/attn": 0.00339023070409894, "grad/layer_16/mlp": 0.004213040694594383, "grad/layer_16/attn_mlp_ratio": 0.8046992349204963, "grad/layer_20/attn": 0.0038672154769301414, "grad/layer_20/mlp": 0.005381463095545769, "grad/layer_20/attn_mlp_ratio": 0.718617843587787, "grad/layer_24/attn": 0.004530047066509724, "grad/layer_24/mlp": 0.007443441078066826, "grad/layer_24/attn_mlp_ratio": 0.6085958037605189, "grad/layer_27/attn": 0.004677252843976021, "grad/layer_27/mlp": 0.006281624548137188, "grad/layer_27/attn_mlp_ratio": 0.7445928570983714} {"step": 50600, "timestamp": 1778249236.6540296, "train/loss": 2.1413852691650392, "train/z_loss": 0.001383319532033056, "train/perplexity": 8.511219796606277, "train/grad_norm": 0.08544921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021679.6004085422, "perf/iters_per_sec": 0.9640119554560386, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373315334320068, "data/tokens_consumed": 106117988352, "data/tokens_consumed_B": 106.117988352, "train/loss_slope": -2.7806957026267405e-07} {"step": 50610, "timestamp": 1778249247.0282767, "train/loss": 2.1679747819900514, "train/z_loss": 0.0013795223901979625, "train/perplexity": 8.740564555132355, "train/grad_norm": 0.138671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022490.8031118896, "perf/iters_per_sec": 0.9643987670478295, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0369154691696167, "data/tokens_consumed": 106138959872, "data/tokens_consumed_B": 106.138959872, "train/loss_slope": -1.4020991547129654e-06} {"step": 50620, "timestamp": 1778249257.4063013, "train/loss": 2.1815531969070436, "train/z_loss": 0.001383546378929168, "train/perplexity": 8.860056990273169, "train/grad_norm": 0.2060546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021980.0474351135, "perf/iters_per_sec": 0.9641552197623794, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0371773958206176, "data/tokens_consumed": 106159931392, "data/tokens_consumed_B": 106.159931392, "train/loss_slope": -5.522646776663027e-07} {"step": 50625, "timestamp": 1778249263.1655962, "eos/sharpness": 45.13776302337646, "eos/L0_probe": 1.9837980270385742, "eos/L_plus": 2.2313332557678223, "eos/L_minus": 2.187640428543091, "eos/grad_norm": 0.15337231755256653, "eos/embed_grad_frac": 0.10074499994516373, "eos/time_s": 0.582033634185791} {"step": 50625, "timestamp": 1778249264.5406203, "geo/rankme_last": 439.77142333984375, "geo/layer_0/stable_rank_q_proj": 19.45707893371582, "geo/layer_0/stable_rank_k_proj": 16.409130096435547, "geo/layer_0/stable_rank_o_proj": 47.72856140136719, "geo/layer_0/stable_rank_gate_proj": 133.3436279296875, "geo/layer_0/stable_rank_down_proj": 53.68495178222656, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06406796723604202, "geo/layer_0/attn_entropy_mean": 6.182425498962402, "geo/layer_0/attn_entropy_std": 0.3907683193683624, "geo/layer_7/stable_rank_q_proj": 42.60123825073242, "geo/layer_7/stable_rank_k_proj": 41.89957809448242, "geo/layer_7/stable_rank_o_proj": 93.53398132324219, "geo/layer_7/stable_rank_gate_proj": 86.65592956542969, "geo/layer_7/stable_rank_down_proj": 142.563232421875, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4646291732788086, "geo/layer_7/attn_entropy_mean": 4.691701889038086, "geo/layer_7/attn_entropy_std": 0.7834354043006897, "geo/layer_14/stable_rank_q_proj": 52.723388671875, "geo/layer_14/stable_rank_k_proj": 38.73845291137695, "geo/layer_14/stable_rank_o_proj": 45.34103012084961, "geo/layer_14/stable_rank_gate_proj": 73.63854217529297, "geo/layer_14/stable_rank_down_proj": 130.8297576904297, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3817645013332367, "geo/layer_14/attn_entropy_mean": 5.515357494354248, "geo/layer_14/attn_entropy_std": 0.3714994192123413, "geo/layer_21/stable_rank_q_proj": 41.570709228515625, "geo/layer_21/stable_rank_k_proj": 30.59716033935547, "geo/layer_21/stable_rank_o_proj": 73.07750701904297, "geo/layer_21/stable_rank_gate_proj": 69.53699493408203, "geo/layer_21/stable_rank_down_proj": 53.432456970214844, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14342351257801056, "geo/layer_21/attn_entropy_mean": 5.715964317321777, "geo/layer_21/attn_entropy_std": 0.2977338433265686, "geo/layer_27/stable_rank_q_proj": 42.61589050292969, "geo/layer_27/stable_rank_k_proj": 31.60957908630371, "geo/layer_27/stable_rank_o_proj": 115.70170593261719, "geo/layer_27/stable_rank_gate_proj": 82.50889587402344, "geo/layer_27/stable_rank_down_proj": 129.92982482910156, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07751073688268661, "geo/layer_27/attn_entropy_mean": 4.2532806396484375, "geo/layer_27/attn_entropy_std": 0.7279409766197205, "attnres/final_alpha/block_0": 0.23704080283641815, "attnres/block_norm/0": 1.7436827421188354, "attnres/final_alpha/block_1": 0.004853163845837116, "attnres/block_norm/1": 43877.2734375, "attnres/final_alpha/block_2": 0.010527539998292923, "attnres/block_norm/2": 27636.82421875, "attnres/final_alpha/block_3": 0.012461001053452492, "attnres/block_norm/3": 52475.265625, "attnres/final_alpha/block_4": 0.014647850766777992, "attnres/block_norm/4": 14104.220703125, "attnres/final_alpha/block_5": 0.6064419746398926, "attnres/block_norm/5": 6356.373046875, "attnres/final_alpha/block_6": 0.11402763426303864, "attnres/block_norm/6": 34493.20703125, "geo/tier1_time_s": 1.3574023246765137, "geo/step": 50625.0, "geo/rankme_slope": -4.8376030099539816e-05} {"step": 50630, "timestamp": 1778249269.7293146, "train/loss": 2.129891300201416, "train/z_loss": 0.0013962184777483345, "train/perplexity": 8.413952166824357, "train/grad_norm": 0.2001953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1702442.8833565284, "perf/iters_per_sec": 0.8117880265028612, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.231848669052124, "data/tokens_consumed": 106180902912, "data/tokens_consumed_B": 106.180902912, "train/loss_slope": -3.0310286082128093e-06} {"step": 50640, "timestamp": 1778249280.1053143, "train/loss": 2.1330119729042054, "train/z_loss": 0.0013902303064242006, "train/perplexity": 8.440250370374619, "train/grad_norm": 0.1953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022056.6489887238, "perf/iters_per_sec": 0.9641917462295169, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0371381044387817, "data/tokens_consumed": 106201874432, "data/tokens_consumed_B": 106.201874432, "train/loss_slope": -2.4874881763841394e-06} {"step": 50650, "timestamp": 1778249290.4714775, "grad/layer_0/attn": 0.0027339565567672253, "grad/layer_0/mlp": 0.0027309374418109655, "grad/layer_0/attn_mlp_ratio": 1.0011054866360136, "grad/layer_4/attn": 0.0028385750483721495, "grad/layer_4/mlp": 0.0025376514531672, "grad/layer_4/attn_mlp_ratio": 1.1185834575394609, "grad/layer_8/attn": 0.003731753909960389, "grad/layer_8/mlp": 0.003537497716024518, "grad/layer_8/attn_mlp_ratio": 1.0549134173471175, "grad/layer_12/attn": 0.00484763877466321, "grad/layer_12/mlp": 0.006583104841411114, "grad/layer_12/attn_mlp_ratio": 0.736375740293774, "grad/layer_16/attn": 0.003147607669234276, "grad/layer_16/mlp": 0.00411025108769536, "grad/layer_16/attn_mlp_ratio": 0.7657944795824404, "grad/layer_20/attn": 0.0033495179377496243, "grad/layer_20/mlp": 0.006030345801264048, "grad/layer_20/attn_mlp_ratio": 0.5554437494286218, "grad/layer_24/attn": 0.011138973757624626, "grad/layer_24/mlp": 0.010344338603317738, "grad/layer_24/attn_mlp_ratio": 1.0768183522502048, "grad/layer_27/attn": 0.011583889834582806, "grad/layer_27/mlp": 0.008801039308309555, "grad/layer_27/attn_mlp_ratio": 1.31619565566833} {"step": 50650, "timestamp": 1778249290.4856167, "train/loss": 2.164947986602783, "train/z_loss": 0.001375564350746572, "train/perplexity": 8.714148652588163, "train/grad_norm": 0.171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021345.5189599216, "perf/iters_per_sec": 0.9638526530074699, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0375029802322389, "data/tokens_consumed": 106222845952, "data/tokens_consumed_B": 106.222845952, "train/loss_slope": -4.774740712977781e-07} {"step": 50660, "timestamp": 1778249300.8677952, "train/loss": 2.0968298435211183, "train/z_loss": 0.001403694087639451, "train/perplexity": 8.140322867347273, "train/grad_norm": 0.08642578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020897.1379307904, "perf/iters_per_sec": 0.9636388482717468, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0377331733703614, "data/tokens_consumed": 106243817472, "data/tokens_consumed_B": 106.243817472, "train/loss_slope": -4.058427793501043e-06} {"step": 50670, "timestamp": 1778249311.2213392, "train/loss": 2.121835970878601, "train/z_loss": 0.0013864669599570334, "train/perplexity": 8.346447263351344, "train/grad_norm": 0.1943359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026453.3739246358, "perf/iters_per_sec": 0.966288268053358, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348878622055053, "data/tokens_consumed": 106264788992, "data/tokens_consumed_B": 106.264788992, "train/loss_slope": -1.8590568888126184e-06} {"step": 50680, "timestamp": 1778249321.574705, "train/loss": 2.159424662590027, "train/z_loss": 0.0013851374620571732, "train/perplexity": 8.666150263436556, "train/grad_norm": 0.103515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026927.7644799368, "perf/iters_per_sec": 0.9665144750976261, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346456527709962, "data/tokens_consumed": 106285760512, "data/tokens_consumed_B": 106.285760512, "train/loss_slope": 2.1169506820376685e-07} {"step": 50690, "timestamp": 1778249331.9238997, "train/loss": 2.16162109375, "train/z_loss": 0.0013906001928262412, "train/perplexity": 8.685205785322387, "train/grad_norm": 0.2890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027341.2569995949, "perf/iters_per_sec": 0.9667116436956381, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344346284866333, "data/tokens_consumed": 106306732032, "data/tokens_consumed_B": 106.306732032, "train/loss_slope": 7.83191054853834e-07} {"step": 50700, "timestamp": 1778249342.2581763, "grad/layer_0/attn": 0.002619463950395584, "grad/layer_0/mlp": 0.0029449588619172573, "grad/layer_0/attn_mlp_ratio": 0.8894738379275187, "grad/layer_4/attn": 0.0024465748574584723, "grad/layer_4/mlp": 0.002587149851024151, "grad/layer_4/attn_mlp_ratio": 0.9456641106132907, "grad/layer_8/attn": 0.008275232277810574, "grad/layer_8/mlp": 0.003982136491686106, "grad/layer_8/attn_mlp_ratio": 2.078088505323394, "grad/layer_12/attn": 0.005069278180599213, "grad/layer_12/mlp": 0.00619103666394949, "grad/layer_12/attn_mlp_ratio": 0.8188092518070162, "grad/layer_16/attn": 0.00543080223724246, "grad/layer_16/mlp": 0.004758370574563742, "grad/layer_16/attn_mlp_ratio": 1.1413155066445868, "grad/layer_20/attn": 0.0049682375974953175, "grad/layer_20/mlp": 0.006039275322109461, "grad/layer_20/attn_mlp_ratio": 0.8226545819233329, "grad/layer_24/attn": 0.010077864862978458, "grad/layer_24/mlp": 0.00907133612781763, "grad/layer_24/attn_mlp_ratio": 1.1109570420369015, "grad/layer_27/attn": 0.004617986734956503, "grad/layer_27/mlp": 0.008120737038552761, "grad/layer_27/attn_mlp_ratio": 0.5686659543544219} {"step": 50700, "timestamp": 1778249342.8536377, "eos/sharpness": 45.78313827514648, "eos/L0_probe": 1.9888417720794678, "eos/L_plus": 2.2060534954071045, "eos/L_minus": 2.229461431503296, "eos/grad_norm": 0.1294100433588028, "eos/embed_grad_frac": 0.1461358368396759, "eos/time_s": 0.5926144123077393} {"step": 50700, "timestamp": 1778249342.8737516, "train/loss": 2.1276098251342774, "train/z_loss": 0.0014098329818807542, "train/perplexity": 8.394777825947047, "train/grad_norm": 0.1298828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1915968.676859007, "perf/iters_per_sec": 0.9136050590796504, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0945648670196533, "data/tokens_consumed": 106327703552, "data/tokens_consumed_B": 106.327703552, "train/loss_slope": -3.828135546784739e-07} {"step": 50700, "timestamp": 1778249344.2336416, "geo/rankme_last": 438.8013916015625, "geo/layer_0/stable_rank_q_proj": 19.484575271606445, "geo/layer_0/stable_rank_k_proj": 16.375410079956055, "geo/layer_0/stable_rank_o_proj": 47.63922882080078, "geo/layer_0/stable_rank_gate_proj": 133.4669647216797, "geo/layer_0/stable_rank_down_proj": 53.72633361816406, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.061071526259183884, "geo/layer_0/attn_entropy_mean": 6.185002326965332, "geo/layer_0/attn_entropy_std": 0.393891841173172, "geo/layer_7/stable_rank_q_proj": 42.50672912597656, "geo/layer_7/stable_rank_k_proj": 41.98721694946289, "geo/layer_7/stable_rank_o_proj": 93.56375885009766, "geo/layer_7/stable_rank_gate_proj": 86.62175750732422, "geo/layer_7/stable_rank_down_proj": 142.6426544189453, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.5025345683097839, "geo/layer_7/attn_entropy_mean": 4.66484260559082, "geo/layer_7/attn_entropy_std": 0.7871843576431274, "geo/layer_14/stable_rank_q_proj": 52.602027893066406, "geo/layer_14/stable_rank_k_proj": 38.870086669921875, "geo/layer_14/stable_rank_o_proj": 45.326114654541016, "geo/layer_14/stable_rank_gate_proj": 73.6280288696289, "geo/layer_14/stable_rank_down_proj": 130.87185668945312, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39299917221069336, "geo/layer_14/attn_entropy_mean": 5.4947309494018555, "geo/layer_14/attn_entropy_std": 0.3822375237941742, "geo/layer_21/stable_rank_q_proj": 41.56955337524414, "geo/layer_21/stable_rank_k_proj": 30.615015029907227, "geo/layer_21/stable_rank_o_proj": 73.1396713256836, "geo/layer_21/stable_rank_gate_proj": 69.55061340332031, "geo/layer_21/stable_rank_down_proj": 53.39677429199219, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14546982944011688, "geo/layer_21/attn_entropy_mean": 5.697315692901611, "geo/layer_21/attn_entropy_std": 0.29372575879096985, "geo/layer_27/stable_rank_q_proj": 42.55613708496094, "geo/layer_27/stable_rank_k_proj": 31.612667083740234, "geo/layer_27/stable_rank_o_proj": 115.51496124267578, "geo/layer_27/stable_rank_gate_proj": 82.46376037597656, "geo/layer_27/stable_rank_down_proj": 129.90176391601562, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09245636314153671, "geo/layer_27/attn_entropy_mean": 4.2289958000183105, "geo/layer_27/attn_entropy_std": 0.7025052309036255, "attnres/final_alpha/block_0": 0.23906120657920837, "attnres/block_norm/0": 1.7438329458236694, "attnres/final_alpha/block_1": 0.00482331495732069, "attnres/block_norm/1": 43757.6796875, "attnres/final_alpha/block_2": 0.010525848716497421, "attnres/block_norm/2": 27689.185546875, "attnres/final_alpha/block_3": 0.012465395033359528, "attnres/block_norm/3": 53243.37890625, "attnres/final_alpha/block_4": 0.014883146621286869, "attnres/block_norm/4": 14095.4873046875, "attnres/final_alpha/block_5": 0.6035124659538269, "attnres/block_norm/5": 6359.7646484375, "attnres/final_alpha/block_6": 0.1147286593914032, "attnres/block_norm/6": 34600.8828125, "geo/tier1_time_s": 1.3553733825683594, "geo/step": 50700.0, "geo/rankme_slope": -7.822023340586234e-05} {"step": 50710, "timestamp": 1778249354.583421, "train/loss": 2.1782960414886476, "train/z_loss": 0.0013942457851953804, "train/perplexity": 8.831245355098842, "train/grad_norm": 0.296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1791583.2695527473, "perf/iters_per_sec": 0.8542934749377953, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.170557928085327, "data/tokens_consumed": 106348675072, "data/tokens_consumed_B": 106.348675072, "train/loss_slope": 2.4264369824968245e-06} {"step": 50720, "timestamp": 1778249364.9332228, "train/loss": 2.1291839122772216, "train/z_loss": 0.001392583770211786, "train/perplexity": 8.408002343331548, "train/grad_norm": 0.197265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027309.2498044046, "perf/iters_per_sec": 0.9666963814756415, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344509601593017, "data/tokens_consumed": 106369646592, "data/tokens_consumed_B": 106.369646592, "train/loss_slope": 3.1997184799199484e-06} {"step": 50730, "timestamp": 1778249375.3011992, "train/loss": 2.143693721294403, "train/z_loss": 0.0013794694212265312, "train/perplexity": 8.530890235467242, "train/grad_norm": 0.28515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024311.7878064995, "perf/iters_per_sec": 0.9652670802147386, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0359827041625977, "data/tokens_consumed": 106390618112, "data/tokens_consumed_B": 106.390618112, "train/loss_slope": 3.608615091531635e-06} {"step": 50740, "timestamp": 1778249385.649604, "train/loss": 2.115066146850586, "train/z_loss": 0.0013970764237456023, "train/perplexity": 8.290134114245868, "train/grad_norm": 0.091796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028116.0458869545, "perf/iters_per_sec": 0.9670810918268941, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340394496917724, "data/tokens_consumed": 106411589632, "data/tokens_consumed_B": 106.411589632, "train/loss_slope": 3.107565113849393e-06} {"step": 50750, "timestamp": 1778249396.0117488, "grad/layer_0/attn": 0.00284159486182034, "grad/layer_0/mlp": 0.0029174319934099913, "grad/layer_0/attn_mlp_ratio": 0.9740054852481553, "grad/layer_4/attn": 0.002184509765356779, "grad/layer_4/mlp": 0.0023801473435014486, "grad/layer_4/attn_mlp_ratio": 0.9178043870018129, "grad/layer_8/attn": 0.0050605228170752525, "grad/layer_8/mlp": 0.0036230962723493576, "grad/layer_8/attn_mlp_ratio": 1.3967397764232838, "grad/layer_12/attn": 0.004981177859008312, "grad/layer_12/mlp": 0.007345203775912523, "grad/layer_12/attn_mlp_ratio": 0.678153791665787, "grad/layer_16/attn": 0.005094035994261503, "grad/layer_16/mlp": 0.004347896669059992, "grad/layer_16/attn_mlp_ratio": 1.1716092319650049, "grad/layer_20/attn": 0.0038075533229857683, "grad/layer_20/mlp": 0.006247812416404486, "grad/layer_20/attn_mlp_ratio": 0.6094218277178639, "grad/layer_24/attn": 0.02490951120853424, "grad/layer_24/mlp": 0.012160707265138626, "grad/layer_24/attn_mlp_ratio": 2.0483603840302003, "grad/layer_27/attn": 0.005809508264064789, "grad/layer_27/mlp": 0.011172821745276451, "grad/layer_27/attn_mlp_ratio": 0.5199678599118523} {"step": 50750, "timestamp": 1778249396.0265656, "train/loss": 2.1305471062660217, "train/z_loss": 0.0014051648206077517, "train/perplexity": 8.419471897421236, "train/grad_norm": 0.2177734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022285.7910700152, "perf/iters_per_sec": 0.9643010096883846, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0370205879211425, "data/tokens_consumed": 106432561152, "data/tokens_consumed_B": 106.432561152, "train/loss_slope": 2.7529164402112706e-06} {"step": 50760, "timestamp": 1778249406.3846216, "train/loss": 2.1521746158599853, "train/z_loss": 0.0013877074350602924, "train/perplexity": 8.60354747982691, "train/grad_norm": 0.1611328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026462.384295313, "perf/iters_per_sec": 0.966292564532906, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348832607269287, "data/tokens_consumed": 106453532672, "data/tokens_consumed_B": 106.453532672, "train/loss_slope": 4.442937918479087e-06} {"step": 50770, "timestamp": 1778249416.749085, "train/loss": 2.1717981815338137, "train/z_loss": 0.0013809674186632038, "train/perplexity": 8.774047193628553, "train/grad_norm": 0.1259765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024806.150273971, "perf/iters_per_sec": 0.96550281060885, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0357297658920288, "data/tokens_consumed": 106474504192, "data/tokens_consumed_B": 106.474504192, "train/loss_slope": 4.9787434378509236e-06} {"step": 50775, "timestamp": 1778249422.5157173, "eos/sharpness": 6.321954727172851, "eos/L0_probe": 1.985991358757019, "eos/L_plus": 2.0165112018585205, "eos/L_minus": 2.018691062927246, "eos/grad_norm": 0.09178788214921951, "eos/embed_grad_frac": 0.30367860198020935, "eos/time_s": 0.589036226272583} {"step": 50775, "timestamp": 1778249423.8921878, "geo/rankme_last": 440.4118957519531, "geo/layer_0/stable_rank_q_proj": 19.489234924316406, "geo/layer_0/stable_rank_k_proj": 16.359947204589844, "geo/layer_0/stable_rank_o_proj": 47.528079986572266, "geo/layer_0/stable_rank_gate_proj": 133.72201538085938, "geo/layer_0/stable_rank_down_proj": 53.70879364013672, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06177465245127678, "geo/layer_0/attn_entropy_mean": 6.18309211730957, "geo/layer_0/attn_entropy_std": 0.39652174711227417, "geo/layer_7/stable_rank_q_proj": 42.51292419433594, "geo/layer_7/stable_rank_k_proj": 41.91899108886719, "geo/layer_7/stable_rank_o_proj": 93.35389709472656, "geo/layer_7/stable_rank_gate_proj": 86.7046890258789, "geo/layer_7/stable_rank_down_proj": 142.8002166748047, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.47059139609336853, "geo/layer_7/attn_entropy_mean": 4.675825595855713, "geo/layer_7/attn_entropy_std": 0.7958691716194153, "geo/layer_14/stable_rank_q_proj": 52.56766891479492, "geo/layer_14/stable_rank_k_proj": 38.83808898925781, "geo/layer_14/stable_rank_o_proj": 45.30818176269531, "geo/layer_14/stable_rank_gate_proj": 73.61479187011719, "geo/layer_14/stable_rank_down_proj": 131.088623046875, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3695814907550812, "geo/layer_14/attn_entropy_mean": 5.478724479675293, "geo/layer_14/attn_entropy_std": 0.3782947063446045, "geo/layer_21/stable_rank_q_proj": 41.49571990966797, "geo/layer_21/stable_rank_k_proj": 30.69068717956543, "geo/layer_21/stable_rank_o_proj": 73.1363525390625, "geo/layer_21/stable_rank_gate_proj": 69.40753936767578, "geo/layer_21/stable_rank_down_proj": 53.39889907836914, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1457458734512329, "geo/layer_21/attn_entropy_mean": 5.688002586364746, "geo/layer_21/attn_entropy_std": 0.3005831241607666, "geo/layer_27/stable_rank_q_proj": 42.620338439941406, "geo/layer_27/stable_rank_k_proj": 31.564992904663086, "geo/layer_27/stable_rank_o_proj": 115.69628143310547, "geo/layer_27/stable_rank_gate_proj": 82.49893951416016, "geo/layer_27/stable_rank_down_proj": 129.565185546875, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09505470097064972, "geo/layer_27/attn_entropy_mean": 4.248376369476318, "geo/layer_27/attn_entropy_std": 0.7119751572608948, "attnres/final_alpha/block_0": 0.23836871981620789, "attnres/block_norm/0": 1.744342565536499, "attnres/final_alpha/block_1": 0.004754100926220417, "attnres/block_norm/1": 44000.9921875, "attnres/final_alpha/block_2": 0.010630179196596146, "attnres/block_norm/2": 27525.33984375, "attnres/final_alpha/block_3": 0.012364014983177185, "attnres/block_norm/3": 52684.171875, "attnres/final_alpha/block_4": 0.01502852514386177, "attnres/block_norm/4": 14114.2890625, "attnres/final_alpha/block_5": 0.6041669845581055, "attnres/block_norm/5": 6366.97412109375, "attnres/final_alpha/block_6": 0.11468745768070221, "attnres/block_norm/6": 34404.17578125, "geo/tier1_time_s": 1.3582744598388672, "geo/step": 50775.0, "geo/rankme_slope": -6.737241771708684e-05} {"step": 50780, "timestamp": 1778249429.0758908, "train/loss": 2.2000481843948365, "train/z_loss": 0.001393187779467553, "train/perplexity": 9.025448374725, "train/grad_norm": 0.197265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1702264.6425738775, "perf/iters_per_sec": 0.811703034674586, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.231977653503418, "data/tokens_consumed": 106495475712, "data/tokens_consumed_B": 106.495475712, "train/loss_slope": 6.878038143703965e-06} {"step": 50790, "timestamp": 1778249439.4238997, "train/loss": 2.162402558326721, "train/z_loss": 0.0013964051380753518, "train/perplexity": 8.691995618646724, "train/grad_norm": 0.1767578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027992.600950472, "perf/iters_per_sec": 0.9670222286941872, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341023921966552, "data/tokens_consumed": 106516447232, "data/tokens_consumed_B": 106.516447232, "train/loss_slope": 9.617578629219954e-06} {"step": 50800, "timestamp": 1778249449.7571166, "grad/layer_0/attn": 0.0028241893742233515, "grad/layer_0/mlp": 0.0028299540281295776, "grad/layer_0/attn_mlp_ratio": 0.9979629514666245, "grad/layer_4/attn": 0.0026028254069387913, "grad/layer_4/mlp": 0.0026053478941321373, "grad/layer_4/attn_mlp_ratio": 0.9990317657376183, "grad/layer_8/attn": 0.0054383729584515095, "grad/layer_8/mlp": 0.003744203597307205, "grad/layer_8/attn_mlp_ratio": 1.4524778559357585, "grad/layer_12/attn": 0.005178683903068304, "grad/layer_12/mlp": 0.00658596633002162, "grad/layer_12/attn_mlp_ratio": 0.7863210294333833, "grad/layer_16/attn": 0.0039342110976576805, "grad/layer_16/mlp": 0.004483082331717014, "grad/layer_16/attn_mlp_ratio": 0.8775683154572028, "grad/layer_20/attn": 0.003458628198131919, "grad/layer_20/mlp": 0.006198725197464228, "grad/layer_20/attn_mlp_ratio": 0.5579579723506659, "grad/layer_24/attn": 0.005610073450952768, "grad/layer_24/mlp": 0.00800168514251709, "grad/layer_24/attn_mlp_ratio": 0.7011114885078894, "grad/layer_27/attn": 0.004335571546107531, "grad/layer_27/mlp": 0.006553455721586943, "grad/layer_27/attn_mlp_ratio": 0.6615702713408476} {"step": 50800, "timestamp": 1778249449.771223, "train/loss": 2.164840650558472, "train/z_loss": 0.0013988434220664203, "train/perplexity": 8.713213360538425, "train/grad_norm": 0.09375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027722.9914886558, "perf/iters_per_sec": 0.96689366888459, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342398881912231, "data/tokens_consumed": 106537418752, "data/tokens_consumed_B": 106.537418752, "train/loss_slope": 8.183319741027994e-06} {"step": 50810, "timestamp": 1778249460.1276813, "train/loss": 2.1724005699157716, "train/z_loss": 0.0013901062426157295, "train/perplexity": 8.77933416996743, "train/grad_norm": 0.1083984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025823.5002225596, "perf/iters_per_sec": 0.9659879208672331, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035209631919861, "data/tokens_consumed": 106558390272, "data/tokens_consumed_B": 106.558390272, "train/loss_slope": 9.360005116627327e-06} {"step": 50820, "timestamp": 1778249470.473817, "train/loss": 2.2045542001724243, "train/z_loss": 0.0013766883756034076, "train/perplexity": 9.066208952436972, "train/grad_norm": 0.1279296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028367.1437163863, "perf/iters_per_sec": 0.9672008246023113, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033911442756653, "data/tokens_consumed": 106579361792, "data/tokens_consumed_B": 106.579361792, "train/loss_slope": 1.1679909775073366e-05} {"step": 50830, "timestamp": 1778249480.818044, "train/loss": 2.1257330417633056, "train/z_loss": 0.0013904861290939153, "train/perplexity": 8.379037421804902, "train/grad_norm": 0.16796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028316.769507573, "perf/iters_per_sec": 0.967176804307734, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033937120437622, "data/tokens_consumed": 106600333312, "data/tokens_consumed_B": 106.600333312, "train/loss_slope": 1.240348568414742e-05} {"step": 50840, "timestamp": 1778249491.1683977, "train/loss": 2.1666675090789793, "train/z_loss": 0.001394172862637788, "train/perplexity": 8.729145717256866, "train/grad_norm": 0.259765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027147.6873033773, "perf/iters_per_sec": 0.9666193424717795, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034533405303955, "data/tokens_consumed": 106621304832, "data/tokens_consumed_B": 106.621304832, "train/loss_slope": 1.180799768523994e-05} {"step": 50850, "timestamp": 1778249501.5023165, "grad/layer_0/attn": 0.0027909898199141026, "grad/layer_0/mlp": 0.002816795837134123, "grad/layer_0/attn_mlp_ratio": 0.9908384853585538, "grad/layer_4/attn": 0.0021111781243234873, "grad/layer_4/mlp": 0.002622629050165415, "grad/layer_4/attn_mlp_ratio": 0.8049853804874894, "grad/layer_8/attn": 0.007546652108430862, "grad/layer_8/mlp": 0.0037810944486409426, "grad/layer_8/attn_mlp_ratio": 1.9958908753401559, "grad/layer_12/attn": 0.004325005691498518, "grad/layer_12/mlp": 0.006578328553587198, "grad/layer_12/attn_mlp_ratio": 0.6574626959600246, "grad/layer_16/attn": 0.00446340162307024, "grad/layer_16/mlp": 0.004475897178053856, "grad/layer_16/attn_mlp_ratio": 0.9972082346382515, "grad/layer_20/attn": 0.0030138245783746243, "grad/layer_20/mlp": 0.00574303325265646, "grad/layer_20/attn_mlp_ratio": 0.5247792226351201, "grad/layer_24/attn": 0.01056987140327692, "grad/layer_24/mlp": 0.009301351383328438, "grad/layer_24/attn_mlp_ratio": 1.1363801725181715, "grad/layer_27/attn": 0.009375792928040028, "grad/layer_27/mlp": 0.009118927642703056, "grad/layer_27/attn_mlp_ratio": 1.028168354063614} {"step": 50850, "timestamp": 1778249502.0964684, "eos/sharpness": 68.6972141265869, "eos/L0_probe": 1.9872955083847046, "eos/L_plus": 2.389166831970215, "eos/L_minus": 2.2723963260650635, "eos/grad_norm": 0.17337436974048615, "eos/embed_grad_frac": 0.07572317123413086, "eos/time_s": 0.5912461280822754} {"step": 50850, "timestamp": 1778249502.1143003, "train/loss": 2.131246197223663, "train/z_loss": 0.0013910160167142748, "train/perplexity": 8.425359931988897, "train/grad_norm": 0.173828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1916817.081363492, "perf/iters_per_sec": 0.9140096098725757, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.094080400466919, "data/tokens_consumed": 106642276352, "data/tokens_consumed_B": 106.642276352, "train/loss_slope": 9.074946627257848e-06} {"step": 50850, "timestamp": 1778249503.4782896, "geo/rankme_last": 439.1302185058594, "geo/layer_0/stable_rank_q_proj": 19.475536346435547, "geo/layer_0/stable_rank_k_proj": 16.31377601623535, "geo/layer_0/stable_rank_o_proj": 47.523109436035156, "geo/layer_0/stable_rank_gate_proj": 133.4615020751953, "geo/layer_0/stable_rank_down_proj": 53.6882438659668, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06155596300959587, "geo/layer_0/attn_entropy_mean": 6.182284832000732, "geo/layer_0/attn_entropy_std": 0.3968411684036255, "geo/layer_7/stable_rank_q_proj": 42.43155288696289, "geo/layer_7/stable_rank_k_proj": 41.955177307128906, "geo/layer_7/stable_rank_o_proj": 93.41877746582031, "geo/layer_7/stable_rank_gate_proj": 86.74920654296875, "geo/layer_7/stable_rank_down_proj": 142.49374389648438, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.47771745920181274, "geo/layer_7/attn_entropy_mean": 4.626172065734863, "geo/layer_7/attn_entropy_std": 0.7917437553405762, "geo/layer_14/stable_rank_q_proj": 52.43590545654297, "geo/layer_14/stable_rank_k_proj": 38.798770904541016, "geo/layer_14/stable_rank_o_proj": 45.33998489379883, "geo/layer_14/stable_rank_gate_proj": 73.61109924316406, "geo/layer_14/stable_rank_down_proj": 131.47872924804688, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.40186452865600586, "geo/layer_14/attn_entropy_mean": 5.500696659088135, "geo/layer_14/attn_entropy_std": 0.37468403577804565, "geo/layer_21/stable_rank_q_proj": 41.507423400878906, "geo/layer_21/stable_rank_k_proj": 30.6237850189209, "geo/layer_21/stable_rank_o_proj": 73.1029281616211, "geo/layer_21/stable_rank_gate_proj": 69.42140197753906, "geo/layer_21/stable_rank_down_proj": 53.42898941040039, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14113864302635193, "geo/layer_21/attn_entropy_mean": 5.697751045227051, "geo/layer_21/attn_entropy_std": 0.2999950647354126, "geo/layer_27/stable_rank_q_proj": 42.55707550048828, "geo/layer_27/stable_rank_k_proj": 31.484609603881836, "geo/layer_27/stable_rank_o_proj": 115.90872955322266, "geo/layer_27/stable_rank_gate_proj": 82.51132202148438, "geo/layer_27/stable_rank_down_proj": 129.50685119628906, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08674763143062592, "geo/layer_27/attn_entropy_mean": 4.251736640930176, "geo/layer_27/attn_entropy_std": 0.6914501190185547, "attnres/final_alpha/block_0": 0.23882225155830383, "attnres/block_norm/0": 1.7443352937698364, "attnres/final_alpha/block_1": 0.00481203431263566, "attnres/block_norm/1": 43812.5, "attnres/final_alpha/block_2": 0.010523885488510132, "attnres/block_norm/2": 27613.90625, "attnres/final_alpha/block_3": 0.012440908700227737, "attnres/block_norm/3": 52779.1171875, "attnres/final_alpha/block_4": 0.014821448363363743, "attnres/block_norm/4": 14147.6923828125, "attnres/final_alpha/block_5": 0.6065816879272461, "attnres/block_norm/5": 6360.5615234375, "attnres/final_alpha/block_6": 0.11199779808521271, "attnres/block_norm/6": 34676.76953125, "geo/tier1_time_s": 1.3597931861877441, "geo/step": 50850.0, "geo/rankme_slope": -7.342853156887755e-05} {"step": 50860, "timestamp": 1778249513.8246047, "train/loss": 2.106127882003784, "train/z_loss": 0.0013959196978248656, "train/perplexity": 8.216364875547649, "train/grad_norm": 0.201171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1791443.3015225844, "perf/iters_per_sec": 0.8542267329800531, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1706493854522706, "data/tokens_consumed": 106663247872, "data/tokens_consumed_B": 106.663247872, "train/loss_slope": 5.932815712277317e-06} {"step": 50870, "timestamp": 1778249524.1768618, "train/loss": 2.148295187950134, "train/z_loss": 0.0013845546753145754, "train/perplexity": 8.570235295497664, "train/grad_norm": 0.154296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027226.503039229, "perf/iters_per_sec": 0.9666569247432847, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344931840896607, "data/tokens_consumed": 106684219392, "data/tokens_consumed_B": 106.684219392, "train/loss_slope": 4.783184042643685e-06} {"step": 50880, "timestamp": 1778249534.521771, "train/loss": 2.1613038659095762, "train/z_loss": 0.0013915422605350614, "train/perplexity": 8.682451033212606, "train/grad_norm": 0.1455078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028700.8366314364, "perf/iters_per_sec": 0.9673599417836363, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0337413787841796, "data/tokens_consumed": 106705190912, "data/tokens_consumed_B": 106.705190912, "train/loss_slope": 6.177672143816615e-06} {"step": 50890, "timestamp": 1778249544.8674288, "train/loss": 2.147307050228119, "train/z_loss": 0.0013866914785467088, "train/perplexity": 8.561770905395946, "train/grad_norm": 0.2001953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028145.4130315036, "perf/iters_per_sec": 0.9670950951726454, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340244770050049, "data/tokens_consumed": 106726162432, "data/tokens_consumed_B": 106.726162432, "train/loss_slope": 5.204171995625985e-06} {"step": 50900, "timestamp": 1778249555.2110007, "grad/layer_0/attn": 0.003003171645104885, "grad/layer_0/mlp": 0.003154044272378087, "grad/layer_0/attn_mlp_ratio": 0.952165312386063, "grad/layer_4/attn": 0.0018039718270301819, "grad/layer_4/mlp": 0.0025458999443799257, "grad/layer_4/attn_mlp_ratio": 0.7085792040470912, "grad/layer_8/attn": 0.008612853474915028, "grad/layer_8/mlp": 0.0038582973647862673, "grad/layer_8/attn_mlp_ratio": 2.232293791114455, "grad/layer_12/attn": 0.004601852037012577, "grad/layer_12/mlp": 0.00701303081586957, "grad/layer_12/attn_mlp_ratio": 0.6561859048131656, "grad/layer_16/attn": 0.003371939994394779, "grad/layer_16/mlp": 0.004598443396389484, "grad/layer_16/attn_mlp_ratio": 0.7332785532848872, "grad/layer_20/attn": 0.004040331579744816, "grad/layer_20/mlp": 0.006412433460354805, "grad/layer_20/attn_mlp_ratio": 0.6300777297287523, "grad/layer_24/attn": 0.018843835219740868, "grad/layer_24/mlp": 0.011549956165254116, "grad/layer_24/attn_mlp_ratio": 1.6315070626223083, "grad/layer_27/attn": 0.01252493355423212, "grad/layer_27/mlp": 0.01182005275040865, "grad/layer_27/attn_mlp_ratio": 1.05963431067054} {"step": 50900, "timestamp": 1778249555.2250912, "train/loss": 2.175998020172119, "train/z_loss": 0.0013897335738874972, "train/perplexity": 8.810974265639976, "train/grad_norm": 0.2294921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026172.832006595, "perf/iters_per_sec": 0.9661544952424026, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350311517715454, "data/tokens_consumed": 106747133952, "data/tokens_consumed_B": 106.747133952, "train/loss_slope": 4.834292273316881e-06} {"step": 50910, "timestamp": 1778249565.5935526, "train/loss": 2.1415990829467773, "train/z_loss": 0.0014017607551068067, "train/perplexity": 8.51303980726294, "train/grad_norm": 0.1884765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024042.3643068369, "perf/iters_per_sec": 0.9651386090788063, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03612060546875, "data/tokens_consumed": 106768105472, "data/tokens_consumed_B": 106.768105472, "train/loss_slope": 4.325785488113814e-06} {"step": 50920, "timestamp": 1778249575.95652, "train/loss": 2.0952454566955567, "train/z_loss": 0.00138303724816069, "train/perplexity": 8.127435658897948, "train/grad_norm": 0.265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024674.9988377118, "perf/iters_per_sec": 0.9654402727306899, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035796856880188, "data/tokens_consumed": 106789076992, "data/tokens_consumed_B": 106.789076992, "train/loss_slope": 9.434059007440771e-07} {"step": 50925, "timestamp": 1778249581.7577841, "eos/sharpness": 75.7568836212158, "eos/L0_probe": 1.985732913017273, "eos/L_plus": 2.317600965499878, "eos/L_minus": 2.411433696746826, "eos/grad_norm": 0.24165579676628113, "eos/embed_grad_frac": 0.04013694450259209, "eos/time_s": 0.6240801811218262} {"step": 50925, "timestamp": 1778249583.140891, "geo/rankme_last": 438.6369323730469, "geo/layer_0/stable_rank_q_proj": 19.48224449157715, "geo/layer_0/stable_rank_k_proj": 16.333972930908203, "geo/layer_0/stable_rank_o_proj": 47.55893325805664, "geo/layer_0/stable_rank_gate_proj": 133.40892028808594, "geo/layer_0/stable_rank_down_proj": 53.624610900878906, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06354787200689316, "geo/layer_0/attn_entropy_mean": 6.179580211639404, "geo/layer_0/attn_entropy_std": 0.39713770151138306, "geo/layer_7/stable_rank_q_proj": 42.406585693359375, "geo/layer_7/stable_rank_k_proj": 41.84187698364258, "geo/layer_7/stable_rank_o_proj": 93.23925018310547, "geo/layer_7/stable_rank_gate_proj": 86.68318939208984, "geo/layer_7/stable_rank_down_proj": 142.77842712402344, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.49078547954559326, "geo/layer_7/attn_entropy_mean": 4.649945259094238, "geo/layer_7/attn_entropy_std": 0.8076044321060181, "geo/layer_14/stable_rank_q_proj": 52.37173080444336, "geo/layer_14/stable_rank_k_proj": 38.74500274658203, "geo/layer_14/stable_rank_o_proj": 45.284339904785156, "geo/layer_14/stable_rank_gate_proj": 73.44241333007812, "geo/layer_14/stable_rank_down_proj": 130.99322509765625, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38189589977264404, "geo/layer_14/attn_entropy_mean": 5.475579261779785, "geo/layer_14/attn_entropy_std": 0.38234496116638184, "geo/layer_21/stable_rank_q_proj": 41.47180938720703, "geo/layer_21/stable_rank_k_proj": 30.57765769958496, "geo/layer_21/stable_rank_o_proj": 73.16035461425781, "geo/layer_21/stable_rank_gate_proj": 69.3252182006836, "geo/layer_21/stable_rank_down_proj": 53.3753662109375, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1443118005990982, "geo/layer_21/attn_entropy_mean": 5.707096099853516, "geo/layer_21/attn_entropy_std": 0.2943643033504486, "geo/layer_27/stable_rank_q_proj": 42.66378402709961, "geo/layer_27/stable_rank_k_proj": 31.587642669677734, "geo/layer_27/stable_rank_o_proj": 115.86320495605469, "geo/layer_27/stable_rank_gate_proj": 82.69259643554688, "geo/layer_27/stable_rank_down_proj": 129.3905487060547, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08884330093860626, "geo/layer_27/attn_entropy_mean": 4.241824150085449, "geo/layer_27/attn_entropy_std": 0.686211109161377, "attnres/final_alpha/block_0": 0.23977486789226532, "attnres/block_norm/0": 1.7445811033248901, "attnres/final_alpha/block_1": 0.004883375950157642, "attnres/block_norm/1": 43796.5, "attnres/final_alpha/block_2": 0.010595395229756832, "attnres/block_norm/2": 27646.8671875, "attnres/final_alpha/block_3": 0.012500733137130737, "attnres/block_norm/3": 52283.09375, "attnres/final_alpha/block_4": 0.014833526685833931, "attnres/block_norm/4": 14209.224609375, "attnres/final_alpha/block_5": 0.6022793054580688, "attnres/block_norm/5": 6400.58837890625, "attnres/final_alpha/block_6": 0.11513283103704453, "attnres/block_norm/6": 34802.078125, "geo/tier1_time_s": 1.3631582260131836, "geo/step": 50925.0, "geo/rankme_slope": -9.469225190076031e-05} {"step": 50930, "timestamp": 1778249588.32285, "train/loss": 2.1121735095977785, "train/z_loss": 0.0014025590848177672, "train/perplexity": 8.266188413286386, "train/grad_norm": 0.08251953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1697072.8161575985, "perf/iters_per_sec": 0.8092273789203637, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2357466220855713, "data/tokens_consumed": 106810048512, "data/tokens_consumed_B": 106.810048512, "train/loss_slope": -4.488917030874647e-06} {"step": 50940, "timestamp": 1778249598.682674, "train/loss": 2.153440272808075, "train/z_loss": 0.0013932467671111226, "train/perplexity": 8.614443513338715, "train/grad_norm": 0.248046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025233.139308411, "perf/iters_per_sec": 0.9657064148466162, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0355113983154296, "data/tokens_consumed": 106831020032, "data/tokens_consumed_B": 106.831020032, "train/loss_slope": -5.022760614989675e-06} {"step": 50950, "timestamp": 1778249609.0327353, "grad/layer_0/attn": 0.002771811792626977, "grad/layer_0/mlp": 0.0028325931634753942, "grad/layer_0/attn_mlp_ratio": 0.9785421113464619, "grad/layer_4/attn": 0.003177169244736433, "grad/layer_4/mlp": 0.00262449961155653, "grad/layer_4/attn_mlp_ratio": 1.210580908333241, "grad/layer_8/attn": 0.0037247375585138798, "grad/layer_8/mlp": 0.003658814588561654, "grad/layer_8/attn_mlp_ratio": 1.0180175481852953, "grad/layer_12/attn": 0.004648138303309679, "grad/layer_12/mlp": 0.007106253411620855, "grad/layer_12/attn_mlp_ratio": 0.6540912585947827, "grad/layer_16/attn": 0.0037988675758242607, "grad/layer_16/mlp": 0.004999392665922642, "grad/layer_16/attn_mlp_ratio": 0.7598657984462593, "grad/layer_20/attn": 0.0029993813950568438, "grad/layer_20/mlp": 0.006348595954477787, "grad/layer_20/attn_mlp_ratio": 0.472447982092249, "grad/layer_24/attn": 0.012764615938067436, "grad/layer_24/mlp": 0.009500382468104362, "grad/layer_24/attn_mlp_ratio": 1.3435896761591566, "grad/layer_27/attn": 0.009627615101635456, "grad/layer_27/mlp": 0.00909088272601366, "grad/layer_27/attn_mlp_ratio": 1.0590407208952173} {"step": 50950, "timestamp": 1778249609.0467968, "train/loss": 2.106800937652588, "train/z_loss": 0.0013888270943425596, "train/perplexity": 8.221896807780055, "train/grad_norm": 0.18359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024418.2913528748, "perf/iters_per_sec": 0.9653178650631308, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035928201675415, "data/tokens_consumed": 106851991552, "data/tokens_consumed_B": 106.851991552, "train/loss_slope": -9.297073470412333e-06} {"step": 50960, "timestamp": 1778249619.4122515, "train/loss": 2.1002279043197634, "train/z_loss": 0.0013830295647494495, "train/perplexity": 8.168031230059738, "train/grad_norm": 0.265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024553.7431529043, "perf/iters_per_sec": 0.9653824535145303, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0358588933944701, "data/tokens_consumed": 106872963072, "data/tokens_consumed_B": 106.872963072, "train/loss_slope": -1.4988216780128283e-05} {"step": 50970, "timestamp": 1778249629.774653, "train/loss": 2.2024460315704344, "train/z_loss": 0.001381056767422706, "train/perplexity": 9.047115988049741, "train/grad_norm": 0.11328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025211.783248268, "perf/iters_per_sec": 0.9656962314835873, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0355223178863526, "data/tokens_consumed": 106893934592, "data/tokens_consumed_B": 106.893934592, "train/loss_slope": -1.0415283459784378e-05} {"step": 50980, "timestamp": 1778249640.1430585, "train/loss": 2.154524600505829, "train/z_loss": 0.0013884304789826274, "train/perplexity": 8.6237894591592, "train/grad_norm": 0.271484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023604.3781648679, "perf/iters_per_sec": 0.9649297610115375, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036344861984253, "data/tokens_consumed": 106914906112, "data/tokens_consumed_B": 106.914906112, "train/loss_slope": -9.470703592537864e-06} {"step": 50990, "timestamp": 1778249650.5141695, "train/loss": 2.1503344535827638, "train/z_loss": 0.0013750263606198132, "train/perplexity": 8.587730114027575, "train/grad_norm": 0.10546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023513.1820639032, "perf/iters_per_sec": 0.9648862753219143, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0363915681838989, "data/tokens_consumed": 106935877632, "data/tokens_consumed_B": 106.935877632, "train/loss_slope": -1.151726502682613e-05} {"step": 51000, "timestamp": 1778249660.8752604, "grad/layer_0/attn": 0.0033597606234252453, "grad/layer_0/mlp": 0.003229116089642048, "grad/layer_0/attn_mlp_ratio": 1.0404582635342334, "grad/layer_4/attn": 0.0024458663538098335, "grad/layer_4/mlp": 0.0025024977512657642, "grad/layer_4/attn_mlp_ratio": 0.9773700115557395, "grad/layer_8/attn": 0.006533773150295019, "grad/layer_8/mlp": 0.0035817453172057867, "grad/layer_8/attn_mlp_ratio": 1.8241869226406902, "grad/layer_12/attn": 0.00611988827586174, "grad/layer_12/mlp": 0.007047997321933508, "grad/layer_12/attn_mlp_ratio": 0.8683159072698475, "grad/layer_16/attn": 0.0035694700200110674, "grad/layer_16/mlp": 0.004674154799431562, "grad/layer_16/attn_mlp_ratio": 0.7636610460738392, "grad/layer_20/attn": 0.004202117212116718, "grad/layer_20/mlp": 0.006734213791787624, "grad/layer_20/attn_mlp_ratio": 0.6239952100780763, "grad/layer_24/attn": 0.022313540801405907, "grad/layer_24/mlp": 0.012338032945990562, "grad/layer_24/attn_mlp_ratio": 1.8085168614990088, "grad/layer_27/attn": 0.00717468885704875, "grad/layer_27/mlp": 0.012496025301516056, "grad/layer_27/attn_mlp_ratio": 0.5741576722609971} {"step": 51000, "timestamp": 1778249661.4676268, "eos/sharpness": 72.92270660400389, "eos/L0_probe": 1.9832555055618286, "eos/L_plus": 2.3236501216888428, "eos/L_minus": 2.3720879554748535, "eos/grad_norm": 0.2276548594236374, "eos/embed_grad_frac": 0.04483890160918236, "eos/time_s": 0.5894694328308105} {"step": 51000, "timestamp": 1778249661.4945312, "train/loss": 2.146059501171112, "train/z_loss": 0.0013770581339485944, "train/perplexity": 8.551096336085518, "train/grad_norm": 0.2275390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1911059.8796792391, "perf/iters_per_sec": 0.9112643621822544, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0973763942718506, "data/tokens_consumed": 106956849152, "data/tokens_consumed_B": 106.956849152, "train/loss_slope": -1.3833403129531838e-05} {"step": 51000, "timestamp": 1778249662.8578446, "geo/rankme_last": 439.35223388671875, "geo/layer_0/stable_rank_q_proj": 19.462562561035156, "geo/layer_0/stable_rank_k_proj": 16.358285903930664, "geo/layer_0/stable_rank_o_proj": 47.64467239379883, "geo/layer_0/stable_rank_gate_proj": 133.31362915039062, "geo/layer_0/stable_rank_down_proj": 53.685447692871094, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0601436123251915, "geo/layer_0/attn_entropy_mean": 6.179512977600098, "geo/layer_0/attn_entropy_std": 0.39301732182502747, "geo/layer_7/stable_rank_q_proj": 42.42087936401367, "geo/layer_7/stable_rank_k_proj": 41.931121826171875, "geo/layer_7/stable_rank_o_proj": 93.270751953125, "geo/layer_7/stable_rank_gate_proj": 86.71360778808594, "geo/layer_7/stable_rank_down_proj": 143.02398681640625, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4932697117328644, "geo/layer_7/attn_entropy_mean": 4.65987491607666, "geo/layer_7/attn_entropy_std": 0.7884610295295715, "geo/layer_14/stable_rank_q_proj": 52.365726470947266, "geo/layer_14/stable_rank_k_proj": 38.798927307128906, "geo/layer_14/stable_rank_o_proj": 45.27815246582031, "geo/layer_14/stable_rank_gate_proj": 73.4221420288086, "geo/layer_14/stable_rank_down_proj": 130.95614624023438, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3944414258003235, "geo/layer_14/attn_entropy_mean": 5.516762733459473, "geo/layer_14/attn_entropy_std": 0.3781053125858307, "geo/layer_21/stable_rank_q_proj": 41.34378433227539, "geo/layer_21/stable_rank_k_proj": 30.612504959106445, "geo/layer_21/stable_rank_o_proj": 73.08902740478516, "geo/layer_21/stable_rank_gate_proj": 69.22724151611328, "geo/layer_21/stable_rank_down_proj": 53.2598762512207, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14268454909324646, "geo/layer_21/attn_entropy_mean": 5.701422691345215, "geo/layer_21/attn_entropy_std": 0.2885824143886566, "geo/layer_27/stable_rank_q_proj": 42.769371032714844, "geo/layer_27/stable_rank_k_proj": 31.5762996673584, "geo/layer_27/stable_rank_o_proj": 116.20683288574219, "geo/layer_27/stable_rank_gate_proj": 82.65812683105469, "geo/layer_27/stable_rank_down_proj": 129.35142517089844, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09297040104866028, "geo/layer_27/attn_entropy_mean": 4.230099678039551, "geo/layer_27/attn_entropy_std": 0.7107634544372559, "attnres/final_alpha/block_0": 0.24014721810817719, "attnres/block_norm/0": 1.7447142601013184, "attnres/final_alpha/block_1": 0.004903334192931652, "attnres/block_norm/1": 43822.68359375, "attnres/final_alpha/block_2": 0.010695112869143486, "attnres/block_norm/2": 27699.31640625, "attnres/final_alpha/block_3": 0.012702004984021187, "attnres/block_norm/3": 52991.421875, "attnres/final_alpha/block_4": 0.015021626837551594, "attnres/block_norm/4": 14153.130859375, "attnres/final_alpha/block_5": 0.6014101505279541, "attnres/block_norm/5": 6479.712890625, "attnres/final_alpha/block_6": 0.115120530128479, "attnres/block_norm/6": 34761.5, "geo/tier1_time_s": 1.3589146137237549, "geo/step": 51000.0, "geo/rankme_slope": -9.021581288765506e-05} {"step": 51000, "timestamp": 1778249669.983336, "geo/ww_alpha_mean": 7.514559825314177, "geo/ww_alpha_std": 4.361889915808964, "geo/ww_alpha_min": 1.3420140421726812, "geo/ww_alpha_max": 27.226689370505625, "geo/ww_alpha_healthy_frac": 0.15228426395939088, "geo/ww_alpha_by_type/q_proj": 4.010586660636052, "geo/ww_alpha_by_type/k_proj": 4.464129856250965, "geo/ww_alpha_by_type/v_proj": 8.445271817303183, "geo/ww_alpha_by_type/o_proj": 7.211515520636462, "geo/ww_alpha_by_type/gate_proj": 8.159263312060812, "geo/ww_alpha_by_type/up_proj": 11.995756310367435, "geo/ww_alpha_by_type/down_proj": 8.41191590888479, "geo/twonn_id/layer_0": 0.7214984893798828, "geo/twonn_id/layer_7": 3.1337108612060547, "geo/twonn_id/layer_14": 4.564043045043945, "geo/twonn_id/layer_21": 7.992386817932129, "geo/twonn_id/layer_27": 5.2915778160095215, "geo/tier2_time_s": 7.118743181228638} {"step": 51000, "timestamp": 1778249670.633782, "eoc/jacobian_sigma/layer_0/attn": 969.8292846679688, "eoc/jacobian_sigma/layer_0/mlp": 7531.87744140625, "eoc/jacobian_sigma/layer_0": 7531.87744140625, "eoc/jacobian_sigma/layer_7/attn": 1.1616714000701904, "eoc/jacobian_sigma/layer_7/mlp": 1.707295536994934, "eoc/jacobian_sigma/layer_7": 1.707295536994934, "eoc/jacobian_sigma/layer_14/attn": 1.581680417060852, "eoc/jacobian_sigma/layer_14/mlp": 7.650575637817383, "eoc/jacobian_sigma/layer_14": 7.650575637817383, "eoc/jacobian_sigma/layer_21/attn": 1.0836870670318604, "eoc/jacobian_sigma/layer_21/mlp": 4.222978591918945, "eoc/jacobian_sigma/layer_21": 4.222978591918945, "eoc/jacobian_sigma/layer_27/attn": 3.5752477645874023, "eoc/jacobian_sigma/layer_27/mlp": 32.564735412597656, "eoc/jacobian_sigma/layer_27": 32.564735412597656, "eoc/layer0_sigma": 7531.87744140625, "eoc/sigma_max": 32.564735412597656, "eoc/sigma_min": 1.707295536994934, "eoc/sigma_mean": 11.53639629483223, "eoc/time_s": 0.6404983997344971} {"step": 51010, "timestamp": 1778249681.0269477, "train/loss": 2.1471244931221007, "train/z_loss": 0.0013820328400470316, "train/perplexity": 8.560208035937867, "train/grad_norm": 0.08984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1073908.6914772193, "perf/iters_per_sec": 0.5120795686136338, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.9528215169906615, "data/tokens_consumed": 106977820672, "data/tokens_consumed_B": 106.977820672, "train/loss_slope": -1.7339428902530755e-05} {"step": 51020, "timestamp": 1778249691.3925521, "train/loss": 2.138836121559143, "train/z_loss": 0.0013900918420404196, "train/perplexity": 8.489551071162188, "train/grad_norm": 0.1630859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024562.224054293, "perf/iters_per_sec": 0.9653864975234475, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0358545541763307, "data/tokens_consumed": 106998792192, "data/tokens_consumed_B": 106.998792192, "train/loss_slope": -1.8874529664880555e-05} {"step": 51030, "timestamp": 1778249701.75953, "train/loss": 2.184524655342102, "train/z_loss": 0.0013883323059417307, "train/perplexity": 8.886423435349474, "train/grad_norm": 0.181640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023894.1746950098, "perf/iters_per_sec": 0.9650679467654275, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0361964702606201, "data/tokens_consumed": 107019763712, "data/tokens_consumed_B": 107.019763712, "train/loss_slope": -1.5174671525608977e-05} {"step": 51040, "timestamp": 1778249712.1227577, "train/loss": 2.1430482387542726, "train/z_loss": 0.0013763637631200253, "train/perplexity": 8.525385471574097, "train/grad_norm": 0.1455078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025030.0418197894, "perf/iters_per_sec": 0.9656095704173038, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0356152534484864, "data/tokens_consumed": 107040735232, "data/tokens_consumed_B": 107.040735232, "train/loss_slope": -1.40961566630906e-05} {"step": 51050, "timestamp": 1778249722.4826672, "grad/layer_0/attn": 0.0029997460078448057, "grad/layer_0/mlp": 0.002963007427752018, "grad/layer_0/attn_mlp_ratio": 1.012399050541954, "grad/layer_4/attn": 0.0020929269958287477, "grad/layer_4/mlp": 0.00265138759277761, "grad/layer_4/attn_mlp_ratio": 0.7893704121543178, "grad/layer_8/attn": 0.0049707768484950066, "grad/layer_8/mlp": 0.0037611653096973896, "grad/layer_8/attn_mlp_ratio": 1.3216054884687825, "grad/layer_12/attn": 0.005493640899658203, "grad/layer_12/mlp": 0.006794570945203304, "grad/layer_12/attn_mlp_ratio": 0.8085338814046981, "grad/layer_16/attn": 0.003414855571463704, "grad/layer_16/mlp": 0.004579154308885336, "grad/layer_16/attn_mlp_ratio": 0.7457393366857339, "grad/layer_20/attn": 0.004776407964527607, "grad/layer_20/mlp": 0.006841842085123062, "grad/layer_20/attn_mlp_ratio": 0.6981172373302403, "grad/layer_24/attn": 0.021960992366075516, "grad/layer_24/mlp": 0.01496792770922184, "grad/layer_24/attn_mlp_ratio": 1.4672032525801737, "grad/layer_27/attn": 0.011208823882043362, "grad/layer_27/mlp": 0.014875403605401516, "grad/layer_27/attn_mlp_ratio": 0.7535139283630496} {"step": 51050, "timestamp": 1778249722.497192, "train/loss": 2.1258183121681213, "train/z_loss": 0.0013799419393762946, "train/perplexity": 8.379751936180858, "train/grad_norm": 0.27734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023172.7240854003, "perf/iters_per_sec": 0.9647239323069574, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0365659713745117, "data/tokens_consumed": 107061706752, "data/tokens_consumed_B": 107.061706752, "train/loss_slope": -1.8025800525838084e-05} {"step": 51060, "timestamp": 1778249732.8643787, "train/loss": 2.1723576307296755, "train/z_loss": 0.0013880667509511113, "train/perplexity": 8.778957200597144, "train/grad_norm": 0.1845703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024368.9517025796, "perf/iters_per_sec": 0.9652943380844973, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035953450202942, "data/tokens_consumed": 107082678272, "data/tokens_consumed_B": 107.082678272, "train/loss_slope": -1.7665943640186116e-05} {"step": 51070, "timestamp": 1778249743.2506425, "train/loss": 2.195357418060303, "train/z_loss": 0.0013760983594693244, "train/perplexity": 8.983211245034063, "train/grad_norm": 0.177734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020595.6201291922, "perf/iters_per_sec": 0.9634950733800851, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0378880262374879, "data/tokens_consumed": 107103649792, "data/tokens_consumed_B": 107.103649792, "train/loss_slope": -1.5520055967159774e-05} {"step": 51075, "timestamp": 1778249749.0513077, "eos/sharpness": 7.6929092407226545, "eos/L0_probe": 1.9840260744094849, "eos/L_plus": 2.022447109222412, "eos/L_minus": 2.022534132003784, "eos/grad_norm": 0.09239966422319412, "eos/embed_grad_frac": 0.27039700746536255, "eos/time_s": 0.6251680850982666} {"step": 51075, "timestamp": 1778249750.4307146, "geo/rankme_last": 438.0594177246094, "geo/layer_0/stable_rank_q_proj": 19.421865463256836, "geo/layer_0/stable_rank_k_proj": 16.331615447998047, "geo/layer_0/stable_rank_o_proj": 47.66183853149414, "geo/layer_0/stable_rank_gate_proj": 133.5354766845703, "geo/layer_0/stable_rank_down_proj": 53.72223663330078, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06374233961105347, "geo/layer_0/attn_entropy_mean": 6.172050952911377, "geo/layer_0/attn_entropy_std": 0.39621543884277344, "geo/layer_7/stable_rank_q_proj": 42.60346221923828, "geo/layer_7/stable_rank_k_proj": 41.983253479003906, "geo/layer_7/stable_rank_o_proj": 93.11251831054688, "geo/layer_7/stable_rank_gate_proj": 86.63674926757812, "geo/layer_7/stable_rank_down_proj": 143.20457458496094, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.47216811776161194, "geo/layer_7/attn_entropy_mean": 4.649557113647461, "geo/layer_7/attn_entropy_std": 0.8034835457801819, "geo/layer_14/stable_rank_q_proj": 52.29643249511719, "geo/layer_14/stable_rank_k_proj": 38.91117858886719, "geo/layer_14/stable_rank_o_proj": 45.22890853881836, "geo/layer_14/stable_rank_gate_proj": 73.37837982177734, "geo/layer_14/stable_rank_down_proj": 130.82571411132812, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37780436873435974, "geo/layer_14/attn_entropy_mean": 5.50650691986084, "geo/layer_14/attn_entropy_std": 0.37957435846328735, "geo/layer_21/stable_rank_q_proj": 41.29133987426758, "geo/layer_21/stable_rank_k_proj": 30.599592208862305, "geo/layer_21/stable_rank_o_proj": 73.13479614257812, "geo/layer_21/stable_rank_gate_proj": 69.18599700927734, "geo/layer_21/stable_rank_down_proj": 53.19771957397461, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14467863738536835, "geo/layer_21/attn_entropy_mean": 5.705031394958496, "geo/layer_21/attn_entropy_std": 0.299318790435791, "geo/layer_27/stable_rank_q_proj": 42.741172790527344, "geo/layer_27/stable_rank_k_proj": 31.529598236083984, "geo/layer_27/stable_rank_o_proj": 116.20549774169922, "geo/layer_27/stable_rank_gate_proj": 82.64972686767578, "geo/layer_27/stable_rank_down_proj": 129.3092041015625, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0930776298046112, "geo/layer_27/attn_entropy_mean": 4.23224401473999, "geo/layer_27/attn_entropy_std": 0.7090933918952942, "attnres/final_alpha/block_0": 0.23968857526779175, "attnres/block_norm/0": 1.7445286512374878, "attnres/final_alpha/block_1": 0.004870153497904539, "attnres/block_norm/1": 43858.546875, "attnres/final_alpha/block_2": 0.010608233511447906, "attnres/block_norm/2": 27660.3828125, "attnres/final_alpha/block_3": 0.012455208227038383, "attnres/block_norm/3": 52893.6953125, "attnres/final_alpha/block_4": 0.014921535737812519, "attnres/block_norm/4": 14188.892578125, "attnres/final_alpha/block_5": 0.6024104356765747, "attnres/block_norm/5": 6421.7060546875, "attnres/final_alpha/block_6": 0.11504584550857544, "attnres/block_norm/6": 34691.1015625, "geo/tier1_time_s": 1.361086368560791, "geo/step": 51075.0, "geo/rankme_slope": -9.953571272258904e-05} {"step": 51080, "timestamp": 1778249755.614042, "train/loss": 2.1304612874984743, "train/z_loss": 0.0013889621710404753, "train/perplexity": 8.418749379722833, "train/grad_norm": 0.1748046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1697322.1537810094, "perf/iters_per_sec": 0.809346272364144, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2355650901794433, "data/tokens_consumed": 107124621312, "data/tokens_consumed_B": 107.124621312, "train/loss_slope": -1.3400287653925545e-05} {"step": 51090, "timestamp": 1778249765.9833457, "train/loss": 2.088862109184265, "train/z_loss": 0.0013919463963247836, "train/perplexity": 8.075720645764786, "train/grad_norm": 0.09326171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024019.8690121395, "perf/iters_per_sec": 0.9651278824864099, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0361321210861205, "data/tokens_consumed": 107145592832, "data/tokens_consumed_B": 107.145592832, "train/loss_slope": -1.755484544655023e-05} {"step": 51100, "timestamp": 1778249776.3399014, "grad/layer_0/attn": 0.002686857245862484, "grad/layer_0/mlp": 0.0027793121989816427, "grad/layer_0/attn_mlp_ratio": 0.9667345576267068, "grad/layer_4/attn": 0.002730810083448887, "grad/layer_4/mlp": 0.002543706214055419, "grad/layer_4/attn_mlp_ratio": 1.0735555706095525, "grad/layer_8/attn": 0.006410025060176849, "grad/layer_8/mlp": 0.0034423868637531996, "grad/layer_8/attn_mlp_ratio": 1.8620873038596637, "grad/layer_12/attn": 0.006313748192042112, "grad/layer_12/mlp": 0.006646187510341406, "grad/layer_12/attn_mlp_ratio": 0.9499804342293888, "grad/layer_16/attn": 0.004999612458050251, "grad/layer_16/mlp": 0.004624180495738983, "grad/layer_16/attn_mlp_ratio": 1.081188840820191, "grad/layer_20/attn": 0.0032667573541402817, "grad/layer_20/mlp": 0.0062890672124922276, "grad/layer_20/attn_mlp_ratio": 0.5194343122471261, "grad/layer_24/attn": 0.022515391930937767, "grad/layer_24/mlp": 0.012123223394155502, "grad/layer_24/attn_mlp_ratio": 1.8572116518178714, "grad/layer_27/attn": 0.012155800126492977, "grad/layer_27/mlp": 0.01214749738574028, "grad/layer_27/attn_mlp_ratio": 1.000683485694271} {"step": 51100, "timestamp": 1778249776.3543737, "train/loss": 2.178244471549988, "train/z_loss": 0.0013942282763309776, "train/perplexity": 8.830789940060553, "train/grad_norm": 0.265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023679.7526627523, "perf/iters_per_sec": 0.9649657023729097, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0363062620162964, "data/tokens_consumed": 107166564352, "data/tokens_consumed_B": 107.166564352, "train/loss_slope": -1.5665865745624502e-05} {"step": 51110, "timestamp": 1778249786.7172573, "train/loss": 2.2038655519485473, "train/z_loss": 0.0013799761421978475, "train/perplexity": 9.059967673014226, "train/grad_norm": 0.0927734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025137.460209785, "perf/iters_per_sec": 0.965660791497128, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0355603218078613, "data/tokens_consumed": 107187535872, "data/tokens_consumed_B": 107.187535872, "train/loss_slope": -1.26617333569256e-05} {"step": 51120, "timestamp": 1778249797.0776896, "train/loss": 2.191612434387207, "train/z_loss": 0.0013813183410093187, "train/perplexity": 8.949632181357305, "train/grad_norm": 0.2412109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025490.9863283797, "perf/iters_per_sec": 0.9658293658868693, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0353795766830445, "data/tokens_consumed": 107208507392, "data/tokens_consumed_B": 107.208507392, "train/loss_slope": -1.1191707547276758e-05} {"step": 51130, "timestamp": 1778249807.4445846, "train/loss": 2.1771732330322267, "train/z_loss": 0.0013744624448008835, "train/perplexity": 8.821335122820864, "train/grad_norm": 0.140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024239.1612374359, "perf/iters_per_sec": 0.9652324491679363, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0360198736190795, "data/tokens_consumed": 107229478912, "data/tokens_consumed_B": 107.229478912, "train/loss_slope": -1.0763213865541631e-05} {"step": 51140, "timestamp": 1778249817.8094532, "train/loss": 2.151981973648071, "train/z_loss": 0.00137137541314587, "train/perplexity": 8.601890233043056, "train/grad_norm": 0.171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024381.670759115, "perf/iters_per_sec": 0.9653004030032706, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0359469413757325, "data/tokens_consumed": 107250450432, "data/tokens_consumed_B": 107.250450432, "train/loss_slope": -1.0725505631713189e-05} {"step": 51150, "timestamp": 1778249828.1629257, "grad/layer_0/attn": 0.00261084851808846, "grad/layer_0/mlp": 0.002813467988744378, "grad/layer_0/attn_mlp_ratio": 0.9279822751619177, "grad/layer_4/attn": 0.0017637779237702489, "grad/layer_4/mlp": 0.0025198960211127996, "grad/layer_4/attn_mlp_ratio": 0.6999407273151225, "grad/layer_8/attn": 0.0038041130173951387, "grad/layer_8/mlp": 0.0036221216432750225, "grad/layer_8/attn_mlp_ratio": 1.0502443835461932, "grad/layer_12/attn": 0.0057365442626178265, "grad/layer_12/mlp": 0.006379623431712389, "grad/layer_12/attn_mlp_ratio": 0.8991979282323028, "grad/layer_16/attn": 0.003572888905182481, "grad/layer_16/mlp": 0.004377515520900488, "grad/layer_16/attn_mlp_ratio": 0.8161910121173962, "grad/layer_20/attn": 0.0031434856355190277, "grad/layer_20/mlp": 0.005621209274977446, "grad/layer_20/attn_mlp_ratio": 0.5592187420579118, "grad/layer_24/attn": 0.004629972856491804, "grad/layer_24/mlp": 0.007901758886873722, "grad/layer_24/attn_mlp_ratio": 0.5859420496351563, "grad/layer_27/attn": 0.004750501364469528, "grad/layer_27/mlp": 0.007059830706566572, "grad/layer_27/attn_mlp_ratio": 0.6728916732750786} {"step": 51150, "timestamp": 1778249828.762343, "eos/sharpness": 34.562897682189934, "eos/L0_probe": 1.9824310541152954, "eos/L_plus": 2.1733579635620117, "eos/L_minus": 2.1371331214904785, "eos/grad_norm": 0.09620416164398193, "eos/embed_grad_frac": 0.2453628033399582, "eos/time_s": 0.5963304042816162} {"step": 51150, "timestamp": 1778249828.7822282, "train/loss": 2.1154556035995484, "train/z_loss": 0.0013853385811671616, "train/perplexity": 8.2933633917176, "train/grad_norm": 0.09619140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1912520.0120420519, "perf/iters_per_sec": 0.9119606075487384, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0965385913848877, "data/tokens_consumed": 107271421952, "data/tokens_consumed_B": 107.271421952, "train/loss_slope": -1.3842005648127937e-05} {"step": 51150, "timestamp": 1778249830.1482189, "geo/rankme_last": 438.35174560546875, "geo/layer_0/stable_rank_q_proj": 19.439220428466797, "geo/layer_0/stable_rank_k_proj": 16.339771270751953, "geo/layer_0/stable_rank_o_proj": 47.645668029785156, "geo/layer_0/stable_rank_gate_proj": 133.39443969726562, "geo/layer_0/stable_rank_down_proj": 53.721988677978516, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.062441568821668625, "geo/layer_0/attn_entropy_mean": 6.175493240356445, "geo/layer_0/attn_entropy_std": 0.40066009759902954, "geo/layer_7/stable_rank_q_proj": 42.58687973022461, "geo/layer_7/stable_rank_k_proj": 42.114036560058594, "geo/layer_7/stable_rank_o_proj": 93.28954315185547, "geo/layer_7/stable_rank_gate_proj": 86.49339294433594, "geo/layer_7/stable_rank_down_proj": 143.30274963378906, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.471285343170166, "geo/layer_7/attn_entropy_mean": 4.662384510040283, "geo/layer_7/attn_entropy_std": 0.8147751688957214, "geo/layer_14/stable_rank_q_proj": 52.19015884399414, "geo/layer_14/stable_rank_k_proj": 38.90305709838867, "geo/layer_14/stable_rank_o_proj": 45.16475296020508, "geo/layer_14/stable_rank_gate_proj": 73.36273193359375, "geo/layer_14/stable_rank_down_proj": 130.38552856445312, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3816334307193756, "geo/layer_14/attn_entropy_mean": 5.465497016906738, "geo/layer_14/attn_entropy_std": 0.3680795431137085, "geo/layer_21/stable_rank_q_proj": 41.23924255371094, "geo/layer_21/stable_rank_k_proj": 30.5062198638916, "geo/layer_21/stable_rank_o_proj": 73.12420654296875, "geo/layer_21/stable_rank_gate_proj": 69.12532806396484, "geo/layer_21/stable_rank_down_proj": 53.18214797973633, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1511213481426239, "geo/layer_21/attn_entropy_mean": 5.710456371307373, "geo/layer_21/attn_entropy_std": 0.2947746515274048, "geo/layer_27/stable_rank_q_proj": 42.75065231323242, "geo/layer_27/stable_rank_k_proj": 31.583145141601562, "geo/layer_27/stable_rank_o_proj": 116.04852294921875, "geo/layer_27/stable_rank_gate_proj": 82.62842559814453, "geo/layer_27/stable_rank_down_proj": 129.41578674316406, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09104538708925247, "geo/layer_27/attn_entropy_mean": 4.26841926574707, "geo/layer_27/attn_entropy_std": 0.7002314329147339, "attnres/final_alpha/block_0": 0.23695839941501617, "attnres/block_norm/0": 1.7447054386138916, "attnres/final_alpha/block_1": 0.004845646675676107, "attnres/block_norm/1": 43941.05859375, "attnres/final_alpha/block_2": 0.01052540261298418, "attnres/block_norm/2": 27757.37890625, "attnres/final_alpha/block_3": 0.012328974902629852, "attnres/block_norm/3": 53043.109375, "attnres/final_alpha/block_4": 0.01446581445634365, "attnres/block_norm/4": 14185.8271484375, "attnres/final_alpha/block_5": 0.6084268093109131, "attnres/block_norm/5": 6338.1689453125, "attnres/final_alpha/block_6": 0.11244890838861465, "attnres/block_norm/6": 34990.6484375, "geo/tier1_time_s": 1.3622112274169922, "geo/step": 51150.0, "geo/rankme_slope": -0.00013454244979241696} {"step": 51160, "timestamp": 1778249840.493893, "train/loss": 2.1603671312332153, "train/z_loss": 0.0013690584455616772, "train/perplexity": 8.674321688368035, "train/grad_norm": 0.138671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1791189.874933465, "perf/iters_per_sec": 0.8541058897654843, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1708150148391723, "data/tokens_consumed": 107292393472, "data/tokens_consumed_B": 107.292393472, "train/loss_slope": -7.88847028833586e-06} {"step": 51170, "timestamp": 1778249850.8486118, "train/loss": 2.111109137535095, "train/z_loss": 0.001391218404751271, "train/perplexity": 8.257394793945968, "train/grad_norm": 0.119140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026412.6648549095, "perf/iters_per_sec": 0.9662688564562366, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034908652305603, "data/tokens_consumed": 107313364992, "data/tokens_consumed_B": 107.313364992, "train/loss_slope": -1.2893459312628926e-05} {"step": 51180, "timestamp": 1778249861.1992903, "train/loss": 2.171477699279785, "train/z_loss": 0.0013907916145399213, "train/perplexity": 8.771235717745112, "train/grad_norm": 0.119140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027378.779129836, "perf/iters_per_sec": 0.966729535641592, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344154834747314, "data/tokens_consumed": 107334336512, "data/tokens_consumed_B": 107.334336512, "train/loss_slope": -9.20061806176039e-06} {"step": 51190, "timestamp": 1778249871.5405834, "train/loss": 2.2360562801361086, "train/z_loss": 0.0013751738471910358, "train/perplexity": 9.356359571220402, "train/grad_norm": 0.177734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029187.0467831786, "perf/iters_per_sec": 0.9675917848506825, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0334936857223511, "data/tokens_consumed": 107355308032, "data/tokens_consumed_B": 107.355308032, "train/loss_slope": -1.5546032161351972e-06} {"step": 51200, "timestamp": 1778249881.877364, "grad/layer_0/attn": 0.0024937610141932964, "grad/layer_0/mlp": 0.002792297163978219, "grad/layer_0/attn_mlp_ratio": 0.8930857922484963, "grad/layer_4/attn": 0.0019982578232884407, "grad/layer_4/mlp": 0.0026976419612765312, "grad/layer_4/attn_mlp_ratio": 0.7407423883148011, "grad/layer_8/attn": 0.003915809560567141, "grad/layer_8/mlp": 0.003772386349737644, "grad/layer_8/attn_mlp_ratio": 1.0380191989183587, "grad/layer_12/attn": 0.0058968267403542995, "grad/layer_12/mlp": 0.006941040046513081, "grad/layer_12/attn_mlp_ratio": 0.849559520746562, "grad/layer_16/attn": 0.0038595872465521097, "grad/layer_16/mlp": 0.004900948144495487, "grad/layer_16/attn_mlp_ratio": 0.7875184666328634, "grad/layer_20/attn": 0.0038771722465753555, "grad/layer_20/mlp": 0.00677720969542861, "grad/layer_20/attn_mlp_ratio": 0.5720897483785438, "grad/layer_24/attn": 0.013223312795162201, "grad/layer_24/mlp": 0.011786079965531826, "grad/layer_24/attn_mlp_ratio": 1.1219432348702207, "grad/layer_27/attn": 0.006997190415859222, "grad/layer_27/mlp": 0.012135700322687626, "grad/layer_27/attn_mlp_ratio": 0.5765790331127499} {"step": 51200, "timestamp": 1778249881.891726, "train/loss": 2.2200456619262696, "train/z_loss": 0.0013902731123380363, "train/perplexity": 9.207751299944231, "train/grad_norm": 0.1611328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026915.1535302862, "perf/iters_per_sec": 0.9665084617282325, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034652090072632, "data/tokens_consumed": 107376279552, "data/tokens_consumed_B": 107.376279552, "train/loss_slope": -3.9517867325517527e-07} {"step": 51210, "timestamp": 1778249892.2356515, "train/loss": 2.1529860496520996, "train/z_loss": 0.0013854295015335084, "train/perplexity": 8.610531522144866, "train/grad_norm": 0.1708984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028742.4799301692, "perf/iters_per_sec": 0.9673797988558622, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0337201595306396, "data/tokens_consumed": 107397251072, "data/tokens_consumed_B": 107.397251072, "train/loss_slope": -3.7458347122551755e-07} {"step": 51220, "timestamp": 1778249902.5868092, "train/loss": 2.107226550579071, "train/z_loss": 0.0013796140905469657, "train/perplexity": 8.225396898130672, "train/grad_norm": 0.296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027268.9268497152, "perf/iters_per_sec": 0.9666771539925171, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344715356826781, "data/tokens_consumed": 107418222592, "data/tokens_consumed_B": 107.418222592, "train/loss_slope": -3.7687301850340225e-06} {"step": 51225, "timestamp": 1778249908.3416398, "eos/sharpness": 15.654182434082028, "eos/L0_probe": 1.983578085899353, "eos/L_plus": 2.062180519104004, "eos/L_minus": 2.0615174770355225, "eos/grad_norm": 0.10899976640939713, "eos/embed_grad_frac": 0.24317802488803864, "eos/time_s": 0.5845496654510498} {"step": 51225, "timestamp": 1778249909.7203689, "geo/rankme_last": 438.815185546875, "geo/layer_0/stable_rank_q_proj": 19.47260284423828, "geo/layer_0/stable_rank_k_proj": 16.3576717376709, "geo/layer_0/stable_rank_o_proj": 47.668113708496094, "geo/layer_0/stable_rank_gate_proj": 133.27413940429688, "geo/layer_0/stable_rank_down_proj": 53.80390930175781, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.059305232018232346, "geo/layer_0/attn_entropy_mean": 6.177798748016357, "geo/layer_0/attn_entropy_std": 0.3942021131515503, "geo/layer_7/stable_rank_q_proj": 42.598506927490234, "geo/layer_7/stable_rank_k_proj": 42.070003509521484, "geo/layer_7/stable_rank_o_proj": 93.45658111572266, "geo/layer_7/stable_rank_gate_proj": 86.36518096923828, "geo/layer_7/stable_rank_down_proj": 143.28933715820312, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4841702878475189, "geo/layer_7/attn_entropy_mean": 4.67756462097168, "geo/layer_7/attn_entropy_std": 0.8009462356567383, "geo/layer_14/stable_rank_q_proj": 52.1771354675293, "geo/layer_14/stable_rank_k_proj": 38.86336898803711, "geo/layer_14/stable_rank_o_proj": 45.2797737121582, "geo/layer_14/stable_rank_gate_proj": 73.43183135986328, "geo/layer_14/stable_rank_down_proj": 130.2744903564453, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.396340012550354, "geo/layer_14/attn_entropy_mean": 5.504084587097168, "geo/layer_14/attn_entropy_std": 0.3820374310016632, "geo/layer_21/stable_rank_q_proj": 41.19507598876953, "geo/layer_21/stable_rank_k_proj": 30.54488754272461, "geo/layer_21/stable_rank_o_proj": 73.11273956298828, "geo/layer_21/stable_rank_gate_proj": 69.17760467529297, "geo/layer_21/stable_rank_down_proj": 53.2282600402832, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14510218799114227, "geo/layer_21/attn_entropy_mean": 5.7114176750183105, "geo/layer_21/attn_entropy_std": 0.29113534092903137, "geo/layer_27/stable_rank_q_proj": 42.764427185058594, "geo/layer_27/stable_rank_k_proj": 31.629199981689453, "geo/layer_27/stable_rank_o_proj": 116.05852508544922, "geo/layer_27/stable_rank_gate_proj": 82.71893310546875, "geo/layer_27/stable_rank_down_proj": 129.1985321044922, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08960177004337311, "geo/layer_27/attn_entropy_mean": 4.249275207519531, "geo/layer_27/attn_entropy_std": 0.67117840051651, "attnres/final_alpha/block_0": 0.23809240758419037, "attnres/block_norm/0": 1.7450379133224487, "attnres/final_alpha/block_1": 0.004823846742510796, "attnres/block_norm/1": 43796.890625, "attnres/final_alpha/block_2": 0.010537447407841682, "attnres/block_norm/2": 27720.7265625, "attnres/final_alpha/block_3": 0.012342467904090881, "attnres/block_norm/3": 53097.08984375, "attnres/final_alpha/block_4": 0.014776643365621567, "attnres/block_norm/4": 14190.587890625, "attnres/final_alpha/block_5": 0.6062012910842896, "attnres/block_norm/5": 6380.5068359375, "attnres/final_alpha/block_6": 0.11322592198848724, "attnres/block_norm/6": 34838.5078125, "geo/tier1_time_s": 1.358320951461792, "geo/step": 51225.0, "geo/rankme_slope": -0.0001318760512017307} {"step": 51230, "timestamp": 1778249914.8942132, "train/loss": 2.0933722138404844, "train/z_loss": 0.0013848372385837139, "train/perplexity": 8.11222524896355, "train/grad_norm": 0.1435546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1704693.1089126135, "perf/iters_per_sec": 0.8128610176623409, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2302226066589355, "data/tokens_consumed": 107439194112, "data/tokens_consumed_B": 107.439194112, "train/loss_slope": -5.088165219109805e-06} {"step": 51240, "timestamp": 1778249925.2455475, "train/loss": 2.1080351948738096, "train/z_loss": 0.0013832874945364892, "train/perplexity": 8.23205100844588, "train/grad_norm": 0.2041015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026943.4583253218, "perf/iters_per_sec": 0.9665219585062608, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346376419067382, "data/tokens_consumed": 107460165632, "data/tokens_consumed_B": 107.460165632, "train/loss_slope": -6.146450118549315e-06} {"step": 51250, "timestamp": 1778249935.59838, "grad/layer_0/attn": 0.002847489435225725, "grad/layer_0/mlp": 0.00274810497649014, "grad/layer_0/attn_mlp_ratio": 1.036164686563775, "grad/layer_4/attn": 0.0018213766161352396, "grad/layer_4/mlp": 0.0023173559457063675, "grad/layer_4/attn_mlp_ratio": 0.7859718490431861, "grad/layer_8/attn": 0.00933366920799017, "grad/layer_8/mlp": 0.00357071147300303, "grad/layer_8/attn_mlp_ratio": 2.613952154119352, "grad/layer_12/attn": 0.00616156542673707, "grad/layer_12/mlp": 0.006237045396119356, "grad/layer_12/attn_mlp_ratio": 0.9878981050516228, "grad/layer_16/attn": 0.0038905038964003325, "grad/layer_16/mlp": 0.0044919345527887344, "grad/layer_16/attn_mlp_ratio": 0.8661087475938648, "grad/layer_20/attn": 0.008815429173409939, "grad/layer_20/mlp": 0.005487135145813227, "grad/layer_20/attn_mlp_ratio": 1.6065631296652028, "grad/layer_24/attn": 0.008439330384135246, "grad/layer_24/mlp": 0.007895957678556442, "grad/layer_24/attn_mlp_ratio": 1.0688165540923327, "grad/layer_27/attn": 0.0052336156368255615, "grad/layer_27/mlp": 0.006651328410953283, "grad/layer_27/attn_mlp_ratio": 0.7868526758536947} {"step": 51250, "timestamp": 1778249935.6126606, "train/loss": 2.1664806842803954, "train/z_loss": 0.0013676949893124402, "train/perplexity": 8.727515048695835, "train/grad_norm": 0.1044921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024180.3743354776, "perf/iters_per_sec": 0.9652044173886669, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0360499620437622, "data/tokens_consumed": 107481137152, "data/tokens_consumed_B": 107.481137152, "train/loss_slope": -4.6164435450465125e-06} {"step": 51260, "timestamp": 1778249945.9600308, "train/loss": 2.2207720994949343, "train/z_loss": 0.0013725909637287259, "train/perplexity": 9.21444258651815, "train/grad_norm": 0.322265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028345.8618280322, "perf/iters_per_sec": 0.9671906766071473, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033922290802002, "data/tokens_consumed": 107502108672, "data/tokens_consumed_B": 107.502108672, "train/loss_slope": 2.6054988206414425e-07} {"step": 51270, "timestamp": 1778249956.7958076, "train/loss": 2.1660160779953004, "train/z_loss": 0.0013922487269155681, "train/perplexity": 8.723461132161423, "train/grad_norm": 0.09716796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1936264.7686412602, "perf/iters_per_sec": 0.9232829898077298, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0830915451049805, "data/tokens_consumed": 107523080192, "data/tokens_consumed_B": 107.523080192, "train/loss_slope": 1.4787405917067057e-06} {"step": 51280, "timestamp": 1778249967.1408036, "train/loss": 2.1444050788879396, "train/z_loss": 0.001389055885374546, "train/perplexity": 8.536960907969362, "train/grad_norm": 0.15625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028464.7187065063, "perf/iters_per_sec": 0.9672473519833118, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0338617086410522, "data/tokens_consumed": 107544051712, "data/tokens_consumed_B": 107.544051712, "train/loss_slope": 1.1876998919107218e-06} {"step": 51290, "timestamp": 1778249977.483926, "train/loss": 2.1823318004608154, "train/z_loss": 0.0013805570430122315, "train/perplexity": 8.866958148416906, "train/grad_norm": 0.1376953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028562.7709136417, "perf/iters_per_sec": 0.9672941069191178, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0338117361068726, "data/tokens_consumed": 107565023232, "data/tokens_consumed_B": 107.565023232, "train/loss_slope": 2.4574943394264443e-06} {"step": 51300, "timestamp": 1778249987.8208442, "grad/layer_0/attn": 0.00274173473007977, "grad/layer_0/mlp": 0.002773256041109562, "grad/layer_0/attn_mlp_ratio": 0.9886337902357695, "grad/layer_4/attn": 0.001781267812475562, "grad/layer_4/mlp": 0.002534406026825309, "grad/layer_4/attn_mlp_ratio": 0.7028344011726504, "grad/layer_8/attn": 0.007917346432805061, "grad/layer_8/mlp": 0.0037564102094620466, "grad/layer_8/attn_mlp_ratio": 2.1076894642904174, "grad/layer_12/attn": 0.004671808797866106, "grad/layer_12/mlp": 0.0062775034457445145, "grad/layer_12/attn_mlp_ratio": 0.7442144419071017, "grad/layer_16/attn": 0.003341832896694541, "grad/layer_16/mlp": 0.004237212240695953, "grad/layer_16/attn_mlp_ratio": 0.7886866713282659, "grad/layer_20/attn": 0.003415093757212162, "grad/layer_20/mlp": 0.0061040581203997135, "grad/layer_20/attn_mlp_ratio": 0.5594792241330442, "grad/layer_24/attn": 0.015192454680800438, "grad/layer_24/mlp": 0.010980972088873386, "grad/layer_24/attn_mlp_ratio": 1.3835254674621973, "grad/layer_27/attn": 0.004283292684704065, "grad/layer_27/mlp": 0.010226700454950333, "grad/layer_27/attn_mlp_ratio": 0.41883427227471687} {"step": 51300, "timestamp": 1778249988.417291, "eos/sharpness": 71.35550975799559, "eos/L0_probe": 1.9826925992965698, "eos/L_plus": 2.2983884811401367, "eos/L_minus": 2.380551815032959, "eos/grad_norm": 0.197948157787323, "eos/embed_grad_frac": 0.05965828523039818, "eos/time_s": 0.5933265686035156} {"step": 51300, "timestamp": 1778249988.437515, "train/loss": 2.1762928485870363, "train/z_loss": 0.0013831180636771022, "train/perplexity": 8.813572374195886, "train/grad_norm": 0.197265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1915794.4961888376, "perf/iters_per_sec": 0.9135220032638729, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0946643829345704, "data/tokens_consumed": 107585994752, "data/tokens_consumed_B": 107.585994752, "train/loss_slope": 5.127183224322944e-06} {"step": 51300, "timestamp": 1778249989.7997732, "geo/rankme_last": 438.7948913574219, "geo/layer_0/stable_rank_q_proj": 19.467388153076172, "geo/layer_0/stable_rank_k_proj": 16.360570907592773, "geo/layer_0/stable_rank_o_proj": 47.58524703979492, "geo/layer_0/stable_rank_gate_proj": 133.21438598632812, "geo/layer_0/stable_rank_down_proj": 53.812564849853516, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06273671984672546, "geo/layer_0/attn_entropy_mean": 6.179216384887695, "geo/layer_0/attn_entropy_std": 0.39382532238960266, "geo/layer_7/stable_rank_q_proj": 42.64194869995117, "geo/layer_7/stable_rank_k_proj": 42.057838439941406, "geo/layer_7/stable_rank_o_proj": 93.33091735839844, "geo/layer_7/stable_rank_gate_proj": 86.22188568115234, "geo/layer_7/stable_rank_down_proj": 143.15927124023438, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4663822054862976, "geo/layer_7/attn_entropy_mean": 4.650850296020508, "geo/layer_7/attn_entropy_std": 0.7880123853683472, "geo/layer_14/stable_rank_q_proj": 52.15866470336914, "geo/layer_14/stable_rank_k_proj": 38.81658935546875, "geo/layer_14/stable_rank_o_proj": 45.298004150390625, "geo/layer_14/stable_rank_gate_proj": 73.4735107421875, "geo/layer_14/stable_rank_down_proj": 130.40106201171875, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3863213062286377, "geo/layer_14/attn_entropy_mean": 5.513339042663574, "geo/layer_14/attn_entropy_std": 0.3632921874523163, "geo/layer_21/stable_rank_q_proj": 41.10381317138672, "geo/layer_21/stable_rank_k_proj": 30.431840896606445, "geo/layer_21/stable_rank_o_proj": 73.0645751953125, "geo/layer_21/stable_rank_gate_proj": 69.20453643798828, "geo/layer_21/stable_rank_down_proj": 53.19794845581055, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1458626091480255, "geo/layer_21/attn_entropy_mean": 5.706812858581543, "geo/layer_21/attn_entropy_std": 0.3015457093715668, "geo/layer_27/stable_rank_q_proj": 42.83417510986328, "geo/layer_27/stable_rank_k_proj": 31.65039825439453, "geo/layer_27/stable_rank_o_proj": 116.0572509765625, "geo/layer_27/stable_rank_gate_proj": 82.71792602539062, "geo/layer_27/stable_rank_down_proj": 129.18331909179688, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09704612195491791, "geo/layer_27/attn_entropy_mean": 4.248642921447754, "geo/layer_27/attn_entropy_std": 0.7071952223777771, "attnres/final_alpha/block_0": 0.23928740620613098, "attnres/block_norm/0": 1.7452826499938965, "attnres/final_alpha/block_1": 0.004925441928207874, "attnres/block_norm/1": 44003.10546875, "attnres/final_alpha/block_2": 0.010722948238253593, "attnres/block_norm/2": 27706.26171875, "attnres/final_alpha/block_3": 0.01260642521083355, "attnres/block_norm/3": 53086.953125, "attnres/final_alpha/block_4": 0.014818686991930008, "attnres/block_norm/4": 14215.6796875, "attnres/final_alpha/block_5": 0.6025934219360352, "attnres/block_norm/5": 6383.20263671875, "attnres/final_alpha/block_6": 0.11504565179347992, "attnres/block_norm/6": 34924.66796875, "geo/tier1_time_s": 1.3579587936401367, "geo/step": 51300.0, "geo/rankme_slope": -0.00013095157985069028} {"step": 51310, "timestamp": 1778250000.1389463, "train/loss": 2.1203611731529235, "train/z_loss": 0.0013906842563301326, "train/perplexity": 8.334147014328957, "train/grad_norm": 0.287109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1792838.7412228314, "perf/iters_per_sec": 0.8548921304811627, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.169738221168518, "data/tokens_consumed": 107606966272, "data/tokens_consumed_B": 107.606966272, "train/loss_slope": 1.7532116079916982e-06} {"step": 51320, "timestamp": 1778250010.483146, "train/loss": 2.1423580646514893, "train/z_loss": 0.001375130086671561, "train/perplexity": 8.519503501330624, "train/grad_norm": 0.166015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028748.4224279134, "perf/iters_per_sec": 0.9673826324595992, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0337171316146851, "data/tokens_consumed": 107627937792, "data/tokens_consumed_B": 107.627937792, "train/loss_slope": 1.7668898671445034e-06} {"step": 51330, "timestamp": 1778250020.819158, "train/loss": 2.172149157524109, "train/z_loss": 0.0013774632709100843, "train/perplexity": 8.777127214006217, "train/grad_norm": 0.1328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029885.296126117, "perf/iters_per_sec": 0.9679247360830865, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0331381797790526, "data/tokens_consumed": 107648909312, "data/tokens_consumed_B": 107.648909312, "train/loss_slope": 2.275055956275393e-06} {"step": 51340, "timestamp": 1778250031.1636431, "train/loss": 2.1599358320236206, "train/z_loss": 0.0013759092427790165, "train/perplexity": 8.670581266958454, "train/grad_norm": 0.1767578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028656.013507994, "perf/iters_per_sec": 0.9673385684528322, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0337642192840577, "data/tokens_consumed": 107669880832, "data/tokens_consumed_B": 107.669880832, "train/loss_slope": 4.393817417763936e-06} {"step": 51350, "timestamp": 1778250041.5027058, "grad/layer_0/attn": 0.002632530638948083, "grad/layer_0/mlp": 0.0027951844967901707, "grad/layer_0/attn_mlp_ratio": 0.9418092250404964, "grad/layer_4/attn": 0.002480556024238467, "grad/layer_4/mlp": 0.0025160755030810833, "grad/layer_4/attn_mlp_ratio": 0.9858829445350846, "grad/layer_8/attn": 0.004101612605154514, "grad/layer_8/mlp": 0.0036677480675280094, "grad/layer_8/attn_mlp_ratio": 1.1182917740829847, "grad/layer_12/attn": 0.004383052233606577, "grad/layer_12/mlp": 0.006714374292641878, "grad/layer_12/attn_mlp_ratio": 0.6527863918952537, "grad/layer_16/attn": 0.007196085527539253, "grad/layer_16/mlp": 0.004510841798037291, "grad/layer_16/attn_mlp_ratio": 1.5952865762531685, "grad/layer_20/attn": 0.003255358897149563, "grad/layer_20/mlp": 0.0059996978379786015, "grad/layer_20/attn_mlp_ratio": 0.5425871320192409, "grad/layer_24/attn": 0.013018080033361912, "grad/layer_24/mlp": 0.008844666182994843, "grad/layer_24/attn_mlp_ratio": 1.4718565536374237, "grad/layer_27/attn": 0.007511308416724205, "grad/layer_27/mlp": 0.007593709044158459, "grad/layer_27/attn_mlp_ratio": 0.9891488170181444} {"step": 51350, "timestamp": 1778250041.5171309, "train/loss": 2.1669920444488526, "train/z_loss": 0.0013811227516271174, "train/perplexity": 8.731979093531434, "train/grad_norm": 0.1640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026778.544966738, "perf/iters_per_sec": 0.9664433216890039, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347218275070191, "data/tokens_consumed": 107690852352, "data/tokens_consumed_B": 107.690852352, "train/loss_slope": 5.2729270710731815e-06} {"step": 51360, "timestamp": 1778250051.8594964, "train/loss": 2.1358256459236147, "train/z_loss": 0.0013779127970337869, "train/perplexity": 8.464031916176303, "train/grad_norm": 0.20703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028986.0572828192, "perf/iters_per_sec": 0.9674959455885025, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0335960626602172, "data/tokens_consumed": 107711823872, "data/tokens_consumed_B": 107.711823872, "train/loss_slope": 3.3902795544408753e-06} {"step": 51370, "timestamp": 1778250062.1985917, "train/loss": 2.1637868165969847, "train/z_loss": 0.0013847230351530016, "train/perplexity": 8.7040359169856, "train/grad_norm": 0.1572265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029612.5138289542, "perf/iters_per_sec": 0.9677946633476993, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0332770347595215, "data/tokens_consumed": 107732795392, "data/tokens_consumed_B": 107.732795392, "train/loss_slope": 3.305885271735626e-06} {"step": 51375, "timestamp": 1778250067.9562674, "eos/sharpness": 19.69056129455566, "eos/L0_probe": 1.9827539920806885, "eos/L_plus": 2.067309617996216, "eos/L_minus": 2.0951039791107178, "eos/grad_norm": 0.10207065939903259, "eos/embed_grad_frac": 0.2459595948457718, "eos/time_s": 0.5943291187286377} {"step": 51375, "timestamp": 1778250069.3326313, "geo/rankme_last": 438.7762451171875, "geo/layer_0/stable_rank_q_proj": 19.43674087524414, "geo/layer_0/stable_rank_k_proj": 16.346834182739258, "geo/layer_0/stable_rank_o_proj": 47.6254768371582, "geo/layer_0/stable_rank_gate_proj": 132.82545471191406, "geo/layer_0/stable_rank_down_proj": 53.85017013549805, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06056930124759674, "geo/layer_0/attn_entropy_mean": 6.1782636642456055, "geo/layer_0/attn_entropy_std": 0.39328908920288086, "geo/layer_7/stable_rank_q_proj": 42.61395263671875, "geo/layer_7/stable_rank_k_proj": 41.929901123046875, "geo/layer_7/stable_rank_o_proj": 93.21269226074219, "geo/layer_7/stable_rank_gate_proj": 86.20155334472656, "geo/layer_7/stable_rank_down_proj": 143.1497039794922, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4752935767173767, "geo/layer_7/attn_entropy_mean": 4.614974021911621, "geo/layer_7/attn_entropy_std": 0.8144122958183289, "geo/layer_14/stable_rank_q_proj": 52.26139831542969, "geo/layer_14/stable_rank_k_proj": 38.772010803222656, "geo/layer_14/stable_rank_o_proj": 45.26409912109375, "geo/layer_14/stable_rank_gate_proj": 73.60509490966797, "geo/layer_14/stable_rank_down_proj": 130.5745391845703, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39869698882102966, "geo/layer_14/attn_entropy_mean": 5.485692977905273, "geo/layer_14/attn_entropy_std": 0.37930014729499817, "geo/layer_21/stable_rank_q_proj": 41.23115158081055, "geo/layer_21/stable_rank_k_proj": 30.426910400390625, "geo/layer_21/stable_rank_o_proj": 73.09093475341797, "geo/layer_21/stable_rank_gate_proj": 69.11317443847656, "geo/layer_21/stable_rank_down_proj": 53.22701644897461, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1445002555847168, "geo/layer_21/attn_entropy_mean": 5.700301647186279, "geo/layer_21/attn_entropy_std": 0.2860499918460846, "geo/layer_27/stable_rank_q_proj": 42.823570251464844, "geo/layer_27/stable_rank_k_proj": 31.687334060668945, "geo/layer_27/stable_rank_o_proj": 115.91561889648438, "geo/layer_27/stable_rank_gate_proj": 82.62809753417969, "geo/layer_27/stable_rank_down_proj": 129.21820068359375, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09145829826593399, "geo/layer_27/attn_entropy_mean": 4.222498416900635, "geo/layer_27/attn_entropy_std": 0.7137219905853271, "attnres/final_alpha/block_0": 0.23880714178085327, "attnres/block_norm/0": 1.7456200122833252, "attnres/final_alpha/block_1": 0.004860566928982735, "attnres/block_norm/1": 44041.14453125, "attnres/final_alpha/block_2": 0.010631692595779896, "attnres/block_norm/2": 27749.7109375, "attnres/final_alpha/block_3": 0.012539688497781754, "attnres/block_norm/3": 53122.1875, "attnres/final_alpha/block_4": 0.014805000275373459, "attnres/block_norm/4": 14183.9267578125, "attnres/final_alpha/block_5": 0.6049880385398865, "attnres/block_norm/5": 6381.5927734375, "attnres/final_alpha/block_6": 0.11336791515350342, "attnres/block_norm/6": 35004.015625, "geo/tier1_time_s": 1.3582603931427002, "geo/step": 51375.0, "geo/rankme_slope": -0.00012140764508928571} {"step": 51380, "timestamp": 1778250074.5110347, "train/loss": 2.2058427333831787, "train/z_loss": 0.001367485651280731, "train/perplexity": 9.077898593395748, "train/grad_norm": 0.09521484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1703962.3101718694, "perf/iters_per_sec": 0.812512545667586, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2307502269744872, "data/tokens_consumed": 107753766912, "data/tokens_consumed_B": 107.753766912, "train/loss_slope": 4.105760591222536e-06} {"step": 51390, "timestamp": 1778250084.8958254, "train/loss": 2.1638694047927856, "train/z_loss": 0.0013756641536019742, "train/perplexity": 8.704754797293276, "train/grad_norm": 0.107421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020710.6458663705, "perf/iters_per_sec": 0.9635499219257214, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0378289461135863, "data/tokens_consumed": 107774738432, "data/tokens_consumed_B": 107.774738432, "train/loss_slope": 4.092614764463345e-06} {"step": 51400, "timestamp": 1778250095.2638593, "grad/layer_0/attn": 0.003566411091014743, "grad/layer_0/mlp": 0.0031138097401708364, "grad/layer_0/attn_mlp_ratio": 1.1453528873230983, "grad/layer_4/attn": 0.0021991513203829527, "grad/layer_4/mlp": 0.002419673139229417, "grad/layer_4/attn_mlp_ratio": 0.9088629343536101, "grad/layer_8/attn": 0.004903143737465143, "grad/layer_8/mlp": 0.003735465230420232, "grad/layer_8/attn_mlp_ratio": 1.3125924894914096, "grad/layer_12/attn": 0.005238526500761509, "grad/layer_12/mlp": 0.007181675639003515, "grad/layer_12/attn_mlp_ratio": 0.7294295497513474, "grad/layer_16/attn": 0.003826115047559142, "grad/layer_16/mlp": 0.004726878833025694, "grad/layer_16/attn_mlp_ratio": 0.8094379191366408, "grad/layer_20/attn": 0.003998070489615202, "grad/layer_20/mlp": 0.0064001185819506645, "grad/layer_20/attn_mlp_ratio": 0.6246869297737222, "grad/layer_24/attn": 0.014474786818027496, "grad/layer_24/mlp": 0.011942881159484386, "grad/layer_24/attn_mlp_ratio": 1.2120012334990276, "grad/layer_27/attn": 0.006711081136018038, "grad/layer_27/mlp": 0.013225642964243889, "grad/layer_27/attn_mlp_ratio": 0.5074294764661949} {"step": 51400, "timestamp": 1778250095.2782078, "train/loss": 2.1728556632995604, "train/z_loss": 0.0013670048909261822, "train/perplexity": 8.783330496144082, "train/grad_norm": 0.232421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021151.2347890218, "perf/iters_per_sec": 0.9637600110955342, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037602710723877, "data/tokens_consumed": 107795709952, "data/tokens_consumed_B": 107.795709952, "train/loss_slope": 3.6775414520936386e-06} {"step": 51410, "timestamp": 1778250105.6563978, "train/loss": 2.221498990058899, "train/z_loss": 0.0013627221691422165, "train/perplexity": 9.221142912793434, "train/grad_norm": 0.1474609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021709.524924891, "perf/iters_per_sec": 0.964026224577375, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373161792755128, "data/tokens_consumed": 107816681472, "data/tokens_consumed_B": 107.816681472, "train/loss_slope": 7.1536610157254206e-06} {"step": 51420, "timestamp": 1778250116.0328615, "train/loss": 2.140225863456726, "train/z_loss": 0.0013857420883141458, "train/perplexity": 8.50135755806197, "train/grad_norm": 0.0966796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022327.6828712886, "perf/iters_per_sec": 0.9643209852558559, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0369991064071655, "data/tokens_consumed": 107837652992, "data/tokens_consumed_B": 107.837652992, "train/loss_slope": 6.54806906920167e-06} {"step": 51430, "timestamp": 1778250126.4086864, "train/loss": 2.135829281806946, "train/z_loss": 0.0013950588647276164, "train/perplexity": 8.464062690464809, "train/grad_norm": 0.162109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022616.183643697, "perf/iters_per_sec": 0.9644585531443105, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0368511915206908, "data/tokens_consumed": 107858624512, "data/tokens_consumed_B": 107.858624512, "train/loss_slope": 4.845501523170963e-06} {"step": 51440, "timestamp": 1778250137.2003756, "train/loss": 2.1583781242370605, "train/z_loss": 0.001377954229246825, "train/perplexity": 8.657085548926338, "train/grad_norm": 0.146484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1944174.7963098378, "perf/iters_per_sec": 0.9270547849225225, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0786849021911622, "data/tokens_consumed": 107879596032, "data/tokens_consumed_B": 107.879596032, "train/loss_slope": 4.756756419717751e-06} {"step": 51450, "timestamp": 1778250148.0627887, "grad/layer_0/attn": 0.003660905174911022, "grad/layer_0/mlp": 0.0035106439609080553, "grad/layer_0/attn_mlp_ratio": 1.042801580392659, "grad/layer_4/attn": 0.00325995241291821, "grad/layer_4/mlp": 0.002813085215166211, "grad/layer_4/attn_mlp_ratio": 1.1588530199716305, "grad/layer_8/attn": 0.006273321341723204, "grad/layer_8/mlp": 0.0038052250165492296, "grad/layer_8/attn_mlp_ratio": 1.6486071518975383, "grad/layer_12/attn": 0.003958337940275669, "grad/layer_12/mlp": 0.006461042445152998, "grad/layer_12/attn_mlp_ratio": 0.6126469393465255, "grad/layer_16/attn": 0.0038659940473735332, "grad/layer_16/mlp": 0.00463265273720026, "grad/layer_16/attn_mlp_ratio": 0.834509768642613, "grad/layer_20/attn": 0.003258274868130684, "grad/layer_20/mlp": 0.0054738475009799, "grad/layer_20/attn_mlp_ratio": 0.595243986615083, "grad/layer_24/attn": 0.006639962084591389, "grad/layer_24/mlp": 0.007563690654933453, "grad/layer_24/attn_mlp_ratio": 0.877873289605401, "grad/layer_27/attn": 0.004140384495258331, "grad/layer_27/mlp": 0.006603428162634373, "grad/layer_27/attn_mlp_ratio": 0.6270052964286411} {"step": 51450, "timestamp": 1778250148.6607068, "eos/sharpness": 2.147591114044189, "eos/L0_probe": 1.9812021255493164, "eos/L_plus": 1.9970308542251587, "eos/L_minus": 1.986849308013916, "eos/grad_norm": 0.081572525203228, "eos/embed_grad_frac": 0.31983137130737305, "eos/time_s": 0.5946531295776367} {"step": 51450, "timestamp": 1778250148.6835427, "train/loss": 2.158436107635498, "train/z_loss": 0.0013879692880436779, "train/perplexity": 8.65758753072019, "train/grad_norm": 0.08154296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1827482.1155379068, "perf/iters_per_sec": 0.8714113786401304, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.147563624382019, "data/tokens_consumed": 107900567552, "data/tokens_consumed_B": 107.900567552, "train/loss_slope": 7.207012498410929e-06} {"step": 51450, "timestamp": 1778250150.0444393, "geo/rankme_last": 438.7091064453125, "geo/layer_0/stable_rank_q_proj": 19.4619140625, "geo/layer_0/stable_rank_k_proj": 16.385860443115234, "geo/layer_0/stable_rank_o_proj": 47.624210357666016, "geo/layer_0/stable_rank_gate_proj": 133.02200317382812, "geo/layer_0/stable_rank_down_proj": 53.94976806640625, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06294849514961243, "geo/layer_0/attn_entropy_mean": 6.179802894592285, "geo/layer_0/attn_entropy_std": 0.39760369062423706, "geo/layer_7/stable_rank_q_proj": 42.57386016845703, "geo/layer_7/stable_rank_k_proj": 41.960060119628906, "geo/layer_7/stable_rank_o_proj": 93.27348327636719, "geo/layer_7/stable_rank_gate_proj": 86.10485076904297, "geo/layer_7/stable_rank_down_proj": 143.50506591796875, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4776059687137604, "geo/layer_7/attn_entropy_mean": 4.659084796905518, "geo/layer_7/attn_entropy_std": 0.8024057149887085, "geo/layer_14/stable_rank_q_proj": 52.17647933959961, "geo/layer_14/stable_rank_k_proj": 38.7847900390625, "geo/layer_14/stable_rank_o_proj": 45.262786865234375, "geo/layer_14/stable_rank_gate_proj": 73.65219116210938, "geo/layer_14/stable_rank_down_proj": 130.66043090820312, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38364192843437195, "geo/layer_14/attn_entropy_mean": 5.491036415100098, "geo/layer_14/attn_entropy_std": 0.3803240954875946, "geo/layer_21/stable_rank_q_proj": 41.119850158691406, "geo/layer_21/stable_rank_k_proj": 30.33533477783203, "geo/layer_21/stable_rank_o_proj": 72.90860748291016, "geo/layer_21/stable_rank_gate_proj": 69.03970336914062, "geo/layer_21/stable_rank_down_proj": 53.25257110595703, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14381422102451324, "geo/layer_21/attn_entropy_mean": 5.704586982727051, "geo/layer_21/attn_entropy_std": 0.298335462808609, "geo/layer_27/stable_rank_q_proj": 42.86661911010742, "geo/layer_27/stable_rank_k_proj": 31.583568572998047, "geo/layer_27/stable_rank_o_proj": 115.88019561767578, "geo/layer_27/stable_rank_gate_proj": 82.5670394897461, "geo/layer_27/stable_rank_down_proj": 129.17848205566406, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09566876292228699, "geo/layer_27/attn_entropy_mean": 4.224693775177002, "geo/layer_27/attn_entropy_std": 0.6986558437347412, "attnres/final_alpha/block_0": 0.23832929134368896, "attnres/block_norm/0": 1.7457226514816284, "attnres/final_alpha/block_1": 0.004765587393194437, "attnres/block_norm/1": 44101.3671875, "attnres/final_alpha/block_2": 0.010431223548948765, "attnres/block_norm/2": 27668.875, "attnres/final_alpha/block_3": 0.012171432375907898, "attnres/block_norm/3": 53100.1640625, "attnres/final_alpha/block_4": 0.014608390629291534, "attnres/block_norm/4": 14160.51953125, "attnres/final_alpha/block_5": 0.6082271337509155, "attnres/block_norm/5": 6328.654296875, "attnres/final_alpha/block_6": 0.11146695911884308, "attnres/block_norm/6": 35085.046875, "geo/tier1_time_s": 1.3573827743530273, "geo/step": 51450.0, "geo/rankme_slope": -0.0001182105068589936} {"step": 51460, "timestamp": 1778250160.4197903, "train/loss": 2.1271129846572876, "train/z_loss": 0.0013868052163161337, "train/perplexity": 8.390607996483586, "train/grad_norm": 0.1962890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1787414.9320007167, "perf/iters_per_sec": 0.8523058567050537, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1732877254486085, "data/tokens_consumed": 107921539072, "data/tokens_consumed_B": 107.921539072, "train/loss_slope": 4.2273874294281984e-06} {"step": 51470, "timestamp": 1778250170.79716, "train/loss": 2.1825462579727173, "train/z_loss": 0.0013812487362883986, "train/perplexity": 8.868859938118808, "train/grad_norm": 0.10888671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021947.8374715014, "perf/iters_per_sec": 0.9641398608548648, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0371939182281493, "data/tokens_consumed": 107942510592, "data/tokens_consumed_B": 107.942510592, "train/loss_slope": 3.680522757323327e-06} {"step": 51480, "timestamp": 1778250181.1733627, "train/loss": 2.1525720834732054, "train/z_loss": 0.001378618460148573, "train/perplexity": 8.606967790995371, "train/grad_norm": 0.11328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022502.894028936, "perf/iters_per_sec": 0.9644045324463539, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03690927028656, "data/tokens_consumed": 107963482112, "data/tokens_consumed_B": 107.963482112, "train/loss_slope": 4.646156945101218e-06} {"step": 51490, "timestamp": 1778250191.54833, "train/loss": 2.1208500623703004, "train/z_loss": 0.0014036515029147267, "train/perplexity": 8.338222485085977, "train/grad_norm": 0.224609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022699.0195918726, "perf/iters_per_sec": 0.9644980524024356, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036808729171753, "data/tokens_consumed": 107984453632, "data/tokens_consumed_B": 107.984453632, "train/loss_slope": 2.9791282885002e-06} {"step": 51500, "timestamp": 1778250202.3979194, "grad/layer_0/attn": 0.002388688502833247, "grad/layer_0/mlp": 0.002650388516485691, "grad/layer_0/attn_mlp_ratio": 0.9012597201690944, "grad/layer_4/attn": 0.0019492111168801785, "grad/layer_4/mlp": 0.0022594889160245657, "grad/layer_4/attn_mlp_ratio": 0.8626778457678442, "grad/layer_8/attn": 0.006510570179671049, "grad/layer_8/mlp": 0.0033598896116018295, "grad/layer_8/attn_mlp_ratio": 1.9377333003490553, "grad/layer_12/attn": 0.0068919300101697445, "grad/layer_12/mlp": 0.0060132527723908424, "grad/layer_12/attn_mlp_ratio": 1.1461234304336754, "grad/layer_16/attn": 0.0032358383759856224, "grad/layer_16/mlp": 0.004107500426471233, "grad/layer_16/attn_mlp_ratio": 0.7877876959800518, "grad/layer_20/attn": 0.003030157648026943, "grad/layer_20/mlp": 0.005338758695870638, "grad/layer_20/attn_mlp_ratio": 0.5675771773713911, "grad/layer_24/attn": 0.006918151397258043, "grad/layer_24/mlp": 0.007621568161994219, "grad/layer_24/attn_mlp_ratio": 0.9077070701782152, "grad/layer_27/attn": 0.005277336109429598, "grad/layer_27/mlp": 0.007735431659966707, "grad/layer_27/attn_mlp_ratio": 0.6822290304132049} {"step": 51500, "timestamp": 1778250202.4122458, "train/loss": 2.1841801166534425, "train/z_loss": 0.0013770924182608725, "train/perplexity": 8.88336224605154, "train/grad_norm": 0.1171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1931270.9356761624, "perf/iters_per_sec": 0.9209017446881115, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0858921766281129, "data/tokens_consumed": 108005425152, "data/tokens_consumed_B": 108.005425152, "train/loss_slope": 3.2776989666434687e-06} {"step": 51500, "timestamp": 1778250209.690477, "geo/ww_alpha_mean": 7.552778811985942, "geo/ww_alpha_std": 4.301366298452009, "geo/ww_alpha_min": 1.3505907565437192, "geo/ww_alpha_max": 25.91689759693512, "geo/ww_alpha_healthy_frac": 0.15228426395939088, "geo/ww_alpha_by_type/q_proj": 3.961957891024778, "geo/ww_alpha_by_type/k_proj": 4.458296504816363, "geo/ww_alpha_by_type/v_proj": 8.837864943131324, "geo/ww_alpha_by_type/o_proj": 7.270126364076617, "geo/ww_alpha_by_type/gate_proj": 8.108544017895953, "geo/ww_alpha_by_type/up_proj": 11.792973178581367, "geo/ww_alpha_by_type/down_proj": 8.536891722077419, "geo/twonn_id/layer_0": 0.7357388138771057, "geo/twonn_id/layer_7": 3.2689507007598877, "geo/twonn_id/layer_14": 4.558320999145508, "geo/twonn_id/layer_21": 8.991559982299805, "geo/twonn_id/layer_27": 5.684291839599609, "geo/tier2_time_s": 7.26992130279541} {"step": 51500, "timestamp": 1778250210.4179115, "eoc/jacobian_sigma/layer_0/attn": 1159.29638671875, "eoc/jacobian_sigma/layer_0/mlp": 8408.7578125, "eoc/jacobian_sigma/layer_0": 8408.7578125, "eoc/jacobian_sigma/layer_7/attn": 1.1360055208206177, "eoc/jacobian_sigma/layer_7/mlp": 1.7251873016357422, "eoc/jacobian_sigma/layer_7": 1.7251873016357422, "eoc/jacobian_sigma/layer_14/attn": 1.5795140266418457, "eoc/jacobian_sigma/layer_14/mlp": 6.113299369812012, "eoc/jacobian_sigma/layer_14": 6.113299369812012, "eoc/jacobian_sigma/layer_21/attn": 1.0841721296310425, "eoc/jacobian_sigma/layer_21/mlp": 4.272726058959961, "eoc/jacobian_sigma/layer_21": 4.272726058959961, "eoc/jacobian_sigma/layer_27/attn": 3.3240184783935547, "eoc/jacobian_sigma/layer_27/mlp": 35.31047439575195, "eoc/jacobian_sigma/layer_27": 35.31047439575195, "eoc/layer0_sigma": 8408.7578125, "eoc/sigma_max": 35.31047439575195, "eoc/sigma_min": 1.7251873016357422, "eoc/sigma_mean": 11.855421781539917, "eoc/time_s": 0.7213060855865479} {"step": 51510, "timestamp": 1778250220.8213322, "train/loss": 2.117790126800537, "train/z_loss": 0.0013752613333053887, "train/perplexity": 8.312747057975681, "train/grad_norm": 0.267578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1139631.1456457558, "perf/iters_per_sec": 0.5434184768894939, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.8402024269104005, "data/tokens_consumed": 108026396672, "data/tokens_consumed_B": 108.026396672, "train/loss_slope": -1.7208449875596985e-06} {"step": 51520, "timestamp": 1778250231.754808, "train/loss": 2.1456803560256956, "train/z_loss": 0.0013931341585703195, "train/perplexity": 8.547854843958538, "train/grad_norm": 0.10302734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1919045.000760214, "perf/iters_per_sec": 0.9150719646264143, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0928102254867553, "data/tokens_consumed": 108047368192, "data/tokens_consumed_B": 108.047368192, "train/loss_slope": 1.3634033185956631e-06} {"step": 51525, "timestamp": 1778250237.5494215, "eos/sharpness": 27.631974220275872, "eos/L0_probe": 1.9819728136062622, "eos/L_plus": 2.1244993209838867, "eos/L_minus": 2.1157660484313965, "eos/grad_norm": 0.12861895561218262, "eos/embed_grad_frac": 0.14841637015342712, "eos/time_s": 0.6087555885314941} {"step": 51525, "timestamp": 1778250238.926929, "geo/rankme_last": 438.3092956542969, "geo/layer_0/stable_rank_q_proj": 19.463884353637695, "geo/layer_0/stable_rank_k_proj": 16.38913917541504, "geo/layer_0/stable_rank_o_proj": 47.684391021728516, "geo/layer_0/stable_rank_gate_proj": 133.02174377441406, "geo/layer_0/stable_rank_down_proj": 53.939918518066406, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06362587958574295, "geo/layer_0/attn_entropy_mean": 6.181700229644775, "geo/layer_0/attn_entropy_std": 0.3963884115219116, "geo/layer_7/stable_rank_q_proj": 42.561485290527344, "geo/layer_7/stable_rank_k_proj": 41.957984924316406, "geo/layer_7/stable_rank_o_proj": 93.44811248779297, "geo/layer_7/stable_rank_gate_proj": 85.94985961914062, "geo/layer_7/stable_rank_down_proj": 143.23385620117188, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.48014912009239197, "geo/layer_7/attn_entropy_mean": 4.620011329650879, "geo/layer_7/attn_entropy_std": 0.811939001083374, "geo/layer_14/stable_rank_q_proj": 52.10789108276367, "geo/layer_14/stable_rank_k_proj": 38.81586837768555, "geo/layer_14/stable_rank_o_proj": 45.29749298095703, "geo/layer_14/stable_rank_gate_proj": 73.66600036621094, "geo/layer_14/stable_rank_down_proj": 130.98558044433594, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39446282386779785, "geo/layer_14/attn_entropy_mean": 5.516207695007324, "geo/layer_14/attn_entropy_std": 0.37711742520332336, "geo/layer_21/stable_rank_q_proj": 41.20832061767578, "geo/layer_21/stable_rank_k_proj": 30.36785888671875, "geo/layer_21/stable_rank_o_proj": 72.9227066040039, "geo/layer_21/stable_rank_gate_proj": 69.09996795654297, "geo/layer_21/stable_rank_down_proj": 53.281455993652344, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14487287402153015, "geo/layer_21/attn_entropy_mean": 5.705787181854248, "geo/layer_21/attn_entropy_std": 0.27969658374786377, "geo/layer_27/stable_rank_q_proj": 42.83522033691406, "geo/layer_27/stable_rank_k_proj": 31.515296936035156, "geo/layer_27/stable_rank_o_proj": 116.11865997314453, "geo/layer_27/stable_rank_gate_proj": 82.55673217773438, "geo/layer_27/stable_rank_down_proj": 128.93801879882812, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09068694710731506, "geo/layer_27/attn_entropy_mean": 4.232975959777832, "geo/layer_27/attn_entropy_std": 0.7007595300674438, "attnres/final_alpha/block_0": 0.23923304677009583, "attnres/block_norm/0": 1.7456837892532349, "attnres/final_alpha/block_1": 0.004830248653888702, "attnres/block_norm/1": 44180.484375, "attnres/final_alpha/block_2": 0.01051147747784853, "attnres/block_norm/2": 27803.158203125, "attnres/final_alpha/block_3": 0.012310082092881203, "attnres/block_norm/3": 52933.5, "attnres/final_alpha/block_4": 0.014815399423241615, "attnres/block_norm/4": 14223.171875, "attnres/final_alpha/block_5": 0.6053135395050049, "attnres/block_norm/5": 6399.03955078125, "attnres/final_alpha/block_6": 0.11298621445894241, "attnres/block_norm/6": 34974.04296875, "geo/tier1_time_s": 1.356781005859375, "geo/step": 51525.0, "geo/rankme_slope": -0.00014875803837159865} {"step": 51530, "timestamp": 1778250244.1158996, "train/loss": 2.0863653421401978, "train/z_loss": 0.001386464957613498, "train/perplexity": 8.055582603061477, "train/grad_norm": 0.134765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1697618.7100708466, "perf/iters_per_sec": 0.8094876814226373, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2353492498397827, "data/tokens_consumed": 108068339712, "data/tokens_consumed_B": 108.068339712, "train/loss_slope": -1.935208011882484e-06} {"step": 51540, "timestamp": 1778250254.493352, "train/loss": 2.1693342447280886, "train/z_loss": 0.001380322512704879, "train/perplexity": 8.752455107504625, "train/grad_norm": 0.142578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021729.3666687147, "perf/iters_per_sec": 0.9640356858581136, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037305998802185, "data/tokens_consumed": 108089311232, "data/tokens_consumed_B": 108.089311232, "train/loss_slope": 2.551121758942518e-06} {"step": 51550, "timestamp": 1778250264.8668792, "grad/layer_0/attn": 0.002868075156584382, "grad/layer_0/mlp": 0.002960447920486331, "grad/layer_0/attn_mlp_ratio": 0.9687976741146178, "grad/layer_4/attn": 0.0023513392079621553, "grad/layer_4/mlp": 0.00251506338827312, "grad/layer_4/attn_mlp_ratio": 0.9349025258907558, "grad/layer_8/attn": 0.005659944377839565, "grad/layer_8/mlp": 0.0036736747715622187, "grad/layer_8/attn_mlp_ratio": 1.5406764549723726, "grad/layer_12/attn": 0.004097992088645697, "grad/layer_12/mlp": 0.006147298030555248, "grad/layer_12/attn_mlp_ratio": 0.6666330478225155, "grad/layer_16/attn": 0.00347148603759706, "grad/layer_16/mlp": 0.004315628204494715, "grad/layer_16/attn_mlp_ratio": 0.8043987555604633, "grad/layer_20/attn": 0.0054967934265732765, "grad/layer_20/mlp": 0.0057959663681685925, "grad/layer_20/attn_mlp_ratio": 0.9483825444404532, "grad/layer_24/attn": 0.011196134611964226, "grad/layer_24/mlp": 0.00876364205032587, "grad/layer_24/attn_mlp_ratio": 1.2775663839204006, "grad/layer_27/attn": 0.006887661758810282, "grad/layer_27/mlp": 0.008228417485952377, "grad/layer_27/attn_mlp_ratio": 0.8370578773941851} {"step": 51550, "timestamp": 1778250264.8812797, "train/loss": 2.095501184463501, "train/z_loss": 0.0013852275675162672, "train/perplexity": 8.129514335654465, "train/grad_norm": 0.189453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019857.2700287306, "perf/iters_per_sec": 0.963143000616422, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0382674217224122, "data/tokens_consumed": 108110282752, "data/tokens_consumed_B": 108.110282752, "train/loss_slope": -1.0206143562052578e-07} {"step": 51560, "timestamp": 1778250275.2615485, "train/loss": 2.1643300294876098, "train/z_loss": 0.001387040305417031, "train/perplexity": 8.708765345923256, "train/grad_norm": 0.1015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021654.2303168438, "perf/iters_per_sec": 0.9639998580536098, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373445510864259, "data/tokens_consumed": 108131254272, "data/tokens_consumed_B": 108.131254272, "train/loss_slope": -1.5611986194044885e-07} {"step": 51570, "timestamp": 1778250286.2033613, "train/loss": 2.1521968841552734, "train/z_loss": 0.0013843104476109146, "train/perplexity": 8.603739068295882, "train/grad_norm": 0.11865234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1917871.9953047084, "perf/iters_per_sec": 0.9145126320384542, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0934786081314087, "data/tokens_consumed": 108152225792, "data/tokens_consumed_B": 108.152225792, "train/loss_slope": 3.1328594151681584e-06} {"step": 51580, "timestamp": 1778250296.5867746, "train/loss": 2.1083298683166505, "train/z_loss": 0.0013882648549042643, "train/perplexity": 8.234477132697817, "train/grad_norm": 0.28125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020685.4857685955, "perf/iters_per_sec": 0.9635379246561983, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0378418684005737, "data/tokens_consumed": 108173197312, "data/tokens_consumed_B": 108.173197312, "train/loss_slope": 2.635522894483002e-06} {"step": 51590, "timestamp": 1778250306.9734333, "train/loss": 2.156387209892273, "train/z_loss": 0.0013757062260992825, "train/perplexity": 8.639867178960234, "train/grad_norm": 0.1572265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020852.6125499732, "perf/iters_per_sec": 0.96361761691569, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0377560377120971, "data/tokens_consumed": 108194168832, "data/tokens_consumed_B": 108.194168832, "train/loss_slope": 4.734193473973816e-06} {"step": 51600, "timestamp": 1778250317.3404648, "grad/layer_0/attn": 0.0028028832748532295, "grad/layer_0/mlp": 0.0027738288044929504, "grad/layer_0/attn_mlp_ratio": 1.0104744637685539, "grad/layer_4/attn": 0.0026783791836351156, "grad/layer_4/mlp": 0.0026314544957131147, "grad/layer_4/attn_mlp_ratio": 1.0178321860458643, "grad/layer_8/attn": 0.008836823515594006, "grad/layer_8/mlp": 0.0035912750754505396, "grad/layer_8/attn_mlp_ratio": 2.460636705313285, "grad/layer_12/attn": 0.005777296144515276, "grad/layer_12/mlp": 0.007144544273614883, "grad/layer_12/attn_mlp_ratio": 0.808630451768357, "grad/layer_16/attn": 0.004301520064473152, "grad/layer_16/mlp": 0.004721479490399361, "grad/layer_16/attn_mlp_ratio": 0.9110534064829694, "grad/layer_20/attn": 0.0038612112402915955, "grad/layer_20/mlp": 0.006081526167690754, "grad/layer_20/attn_mlp_ratio": 0.6349082566337011, "grad/layer_24/attn": 0.007279891520738602, "grad/layer_24/mlp": 0.00824674777686596, "grad/layer_24/attn_mlp_ratio": 0.8827590741752135, "grad/layer_27/attn": 0.004481473471969366, "grad/layer_27/mlp": 0.007208146620541811, "grad/layer_27/attn_mlp_ratio": 0.6217233979433356} {"step": 51600, "timestamp": 1778250317.938748, "eos/sharpness": 43.500089645385735, "eos/L0_probe": 1.9828392267227173, "eos/L_plus": 2.1822516918182373, "eos/L_minus": 2.2184276580810547, "eos/grad_norm": 0.12137779593467712, "eos/embed_grad_frac": 0.1603071391582489, "eos/time_s": 0.595355749130249} {"step": 51600, "timestamp": 1778250317.9570239, "train/loss": 2.1863351106643676, "train/z_loss": 0.001369621162302792, "train/perplexity": 8.90252648047731, "train/grad_norm": 0.12158203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1910399.2321231929, "perf/iters_per_sec": 0.9109493408790554, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0977558851242066, "data/tokens_consumed": 108215140352, "data/tokens_consumed_B": 108.215140352, "train/loss_slope": 6.154778023006421e-06} {"step": 51600, "timestamp": 1778250319.3204737, "geo/rankme_last": 438.7528381347656, "geo/layer_0/stable_rank_q_proj": 19.43242073059082, "geo/layer_0/stable_rank_k_proj": 16.390846252441406, "geo/layer_0/stable_rank_o_proj": 47.7436408996582, "geo/layer_0/stable_rank_gate_proj": 133.1141357421875, "geo/layer_0/stable_rank_down_proj": 53.984554290771484, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06355798989534378, "geo/layer_0/attn_entropy_mean": 6.180076599121094, "geo/layer_0/attn_entropy_std": 0.39772045612335205, "geo/layer_7/stable_rank_q_proj": 42.58740997314453, "geo/layer_7/stable_rank_k_proj": 41.86492156982422, "geo/layer_7/stable_rank_o_proj": 93.43917083740234, "geo/layer_7/stable_rank_gate_proj": 85.86710357666016, "geo/layer_7/stable_rank_down_proj": 142.9484100341797, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.47315889596939087, "geo/layer_7/attn_entropy_mean": 4.658949375152588, "geo/layer_7/attn_entropy_std": 0.8010668754577637, "geo/layer_14/stable_rank_q_proj": 52.09307098388672, "geo/layer_14/stable_rank_k_proj": 38.85063171386719, "geo/layer_14/stable_rank_o_proj": 45.252159118652344, "geo/layer_14/stable_rank_gate_proj": 73.60106658935547, "geo/layer_14/stable_rank_down_proj": 131.2055206298828, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.40446093678474426, "geo/layer_14/attn_entropy_mean": 5.510339736938477, "geo/layer_14/attn_entropy_std": 0.35637012124061584, "geo/layer_21/stable_rank_q_proj": 41.24245071411133, "geo/layer_21/stable_rank_k_proj": 30.419776916503906, "geo/layer_21/stable_rank_o_proj": 72.97647857666016, "geo/layer_21/stable_rank_gate_proj": 69.21216583251953, "geo/layer_21/stable_rank_down_proj": 53.339630126953125, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14388355612754822, "geo/layer_21/attn_entropy_mean": 5.704926490783691, "geo/layer_21/attn_entropy_std": 0.29668349027633667, "geo/layer_27/stable_rank_q_proj": 42.8156852722168, "geo/layer_27/stable_rank_k_proj": 31.475910186767578, "geo/layer_27/stable_rank_o_proj": 116.2287368774414, "geo/layer_27/stable_rank_gate_proj": 82.4139175415039, "geo/layer_27/stable_rank_down_proj": 129.0316619873047, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09282005578279495, "geo/layer_27/attn_entropy_mean": 4.254748344421387, "geo/layer_27/attn_entropy_std": 0.6980365514755249, "attnres/final_alpha/block_0": 0.23896664381027222, "attnres/block_norm/0": 1.745955467224121, "attnres/final_alpha/block_1": 0.004875883460044861, "attnres/block_norm/1": 44039.64453125, "attnres/final_alpha/block_2": 0.010359557345509529, "attnres/block_norm/2": 27821.091796875, "attnres/final_alpha/block_3": 0.012322237715125084, "attnres/block_norm/3": 53183.6640625, "attnres/final_alpha/block_4": 0.014685382135212421, "attnres/block_norm/4": 14182.294921875, "attnres/final_alpha/block_5": 0.6057462692260742, "attnres/block_norm/5": 6373.009765625, "attnres/final_alpha/block_6": 0.11304407566785812, "attnres/block_norm/6": 34882.8828125, "geo/tier1_time_s": 1.3593907356262207, "geo/step": 51600.0, "geo/rankme_slope": -0.0001539347574967487} {"step": 51610, "timestamp": 1778250329.7005215, "train/loss": 2.156490993499756, "train/z_loss": 0.0013635864714160561, "train/perplexity": 8.640763902076015, "train/grad_norm": 0.166015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1786395.6488216443, "perf/iters_per_sec": 0.8518198246105405, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1739571809768676, "data/tokens_consumed": 108236111872, "data/tokens_consumed_B": 108.236111872, "train/loss_slope": 7.359985142114653e-06} {"step": 51620, "timestamp": 1778250340.0768502, "train/loss": 2.163132643699646, "train/z_loss": 0.001375128817744553, "train/perplexity": 8.698343834597251, "train/grad_norm": 0.1494140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022788.328158291, "perf/iters_per_sec": 0.9645406380454498, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0367629528045654, "data/tokens_consumed": 108257083392, "data/tokens_consumed_B": 108.257083392, "train/loss_slope": 9.796461900218334e-06} {"step": 51630, "timestamp": 1778250350.4504795, "train/loss": 2.111990189552307, "train/z_loss": 0.0013899451121687889, "train/perplexity": 8.264673194139853, "train/grad_norm": 0.09814453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022626.3691692825, "perf/iters_per_sec": 0.9644634099813855, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0368459701538086, "data/tokens_consumed": 108278054912, "data/tokens_consumed_B": 108.278054912, "train/loss_slope": 6.0858558971818114e-06} {"step": 51640, "timestamp": 1778250360.8260179, "train/loss": 2.1315677165985107, "train/z_loss": 0.001385545078665018, "train/perplexity": 8.428069283978337, "train/grad_norm": 0.08935546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022328.0548373421, "perf/iters_per_sec": 0.9643211626230918, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0369989156723023, "data/tokens_consumed": 108299026432, "data/tokens_consumed_B": 108.299026432, "train/loss_slope": 3.748886462914627e-06} {"step": 51650, "timestamp": 1778250371.1967032, "grad/layer_0/attn": 0.0032739245798438787, "grad/layer_0/mlp": 0.0030380217358469963, "grad/layer_0/attn_mlp_ratio": 1.0776501146941602, "grad/layer_4/attn": 0.0026135940570384264, "grad/layer_4/mlp": 0.002439158735796809, "grad/layer_4/attn_mlp_ratio": 1.071514498638476, "grad/layer_8/attn": 0.004810990300029516, "grad/layer_8/mlp": 0.0037553159054368734, "grad/layer_8/attn_mlp_ratio": 1.281114636708138, "grad/layer_12/attn": 0.004974790383130312, "grad/layer_12/mlp": 0.006762258242815733, "grad/layer_12/attn_mlp_ratio": 0.7356699686600353, "grad/layer_16/attn": 0.005496853031218052, "grad/layer_16/mlp": 0.005092053208500147, "grad/layer_16/attn_mlp_ratio": 1.0794963638817714, "grad/layer_20/attn": 0.003919584676623344, "grad/layer_20/mlp": 0.00628310302272439, "grad/layer_20/attn_mlp_ratio": 0.6238294358797329, "grad/layer_24/attn": 0.0155336307361722, "grad/layer_24/mlp": 0.011048085987567902, "grad/layer_24/attn_mlp_ratio": 1.4060019638742456, "grad/layer_27/attn": 0.0065951235592365265, "grad/layer_27/mlp": 0.010805335827171803, "grad/layer_27/attn_mlp_ratio": 0.6103580308551073} {"step": 51650, "timestamp": 1778250371.2109857, "train/loss": 2.1433786153793335, "train/z_loss": 0.0013776650070212782, "train/perplexity": 8.528202524972212, "train/grad_norm": 0.224609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020742.6307328087, "perf/iters_per_sec": 0.9635651734985393, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0378125190734864, "data/tokens_consumed": 108319997952, "data/tokens_consumed_B": 108.319997952, "train/loss_slope": 4.050803842610359e-06} {"step": 51660, "timestamp": 1778250381.5879064, "train/loss": 2.1434481143951416, "train/z_loss": 0.0013786704745143651, "train/perplexity": 8.528795247250878, "train/grad_norm": 0.1328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021945.0952564902, "perf/iters_per_sec": 0.9641385532648516, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0371953248977661, "data/tokens_consumed": 108340969472, "data/tokens_consumed_B": 108.340969472, "train/loss_slope": 2.5436340278238905e-07} {"step": 51670, "timestamp": 1778250391.9534264, "train/loss": 2.183674764633179, "train/z_loss": 0.0013802853762172162, "train/perplexity": 8.878874155122189, "train/grad_norm": 0.146484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024485.339359282, "perf/iters_per_sec": 0.9653498360439692, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0358938932418824, "data/tokens_consumed": 108361940992, "data/tokens_consumed_B": 108.361940992, "train/loss_slope": 3.0720030764292336e-07} {"step": 51675, "timestamp": 1778250397.7450013, "eos/sharpness": 38.44571113586425, "eos/L0_probe": 1.979886770248413, "eos/L_plus": 2.1647682189941406, "eos/L_minus": 2.179462432861328, "eos/grad_norm": 0.11640182137489319, "eos/embed_grad_frac": 0.1668560951948166, "eos/time_s": 0.613792896270752} {"step": 51675, "timestamp": 1778250399.1265018, "geo/rankme_last": 438.6040954589844, "geo/layer_0/stable_rank_q_proj": 19.42784881591797, "geo/layer_0/stable_rank_k_proj": 16.396949768066406, "geo/layer_0/stable_rank_o_proj": 47.73050308227539, "geo/layer_0/stable_rank_gate_proj": 133.03768920898438, "geo/layer_0/stable_rank_down_proj": 54.05434036254883, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06090158224105835, "geo/layer_0/attn_entropy_mean": 6.182245254516602, "geo/layer_0/attn_entropy_std": 0.39654219150543213, "geo/layer_7/stable_rank_q_proj": 42.60819625854492, "geo/layer_7/stable_rank_k_proj": 41.79523849487305, "geo/layer_7/stable_rank_o_proj": 93.59952545166016, "geo/layer_7/stable_rank_gate_proj": 85.73155212402344, "geo/layer_7/stable_rank_down_proj": 143.0941925048828, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4791794717311859, "geo/layer_7/attn_entropy_mean": 4.662130832672119, "geo/layer_7/attn_entropy_std": 0.8039793968200684, "geo/layer_14/stable_rank_q_proj": 52.16844940185547, "geo/layer_14/stable_rank_k_proj": 38.82413101196289, "geo/layer_14/stable_rank_o_proj": 45.23512649536133, "geo/layer_14/stable_rank_gate_proj": 73.50506591796875, "geo/layer_14/stable_rank_down_proj": 131.47625732421875, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3965069353580475, "geo/layer_14/attn_entropy_mean": 5.524259567260742, "geo/layer_14/attn_entropy_std": 0.3863343298435211, "geo/layer_21/stable_rank_q_proj": 41.18946075439453, "geo/layer_21/stable_rank_k_proj": 30.448932647705078, "geo/layer_21/stable_rank_o_proj": 72.87154388427734, "geo/layer_21/stable_rank_gate_proj": 69.18379211425781, "geo/layer_21/stable_rank_down_proj": 53.39521026611328, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14585541188716888, "geo/layer_21/attn_entropy_mean": 5.697108745574951, "geo/layer_21/attn_entropy_std": 0.3059200644493103, "geo/layer_27/stable_rank_q_proj": 42.87163162231445, "geo/layer_27/stable_rank_k_proj": 31.573116302490234, "geo/layer_27/stable_rank_o_proj": 116.29083251953125, "geo/layer_27/stable_rank_gate_proj": 82.41045379638672, "geo/layer_27/stable_rank_down_proj": 129.0069122314453, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08338946104049683, "geo/layer_27/attn_entropy_mean": 4.251203536987305, "geo/layer_27/attn_entropy_std": 0.7107377052307129, "attnres/final_alpha/block_0": 0.24069808423519135, "attnres/block_norm/0": 1.7462654113769531, "attnres/final_alpha/block_1": 0.004843852482736111, "attnres/block_norm/1": 44232.25, "attnres/final_alpha/block_2": 0.010640295222401619, "attnres/block_norm/2": 27830.34765625, "attnres/final_alpha/block_3": 0.01263829879462719, "attnres/block_norm/3": 53244.55859375, "attnres/final_alpha/block_4": 0.014766229316592216, "attnres/block_norm/4": 14228.7841796875, "attnres/final_alpha/block_5": 0.6034984588623047, "attnres/block_norm/5": 6377.806640625, "attnres/final_alpha/block_6": 0.11291477084159851, "attnres/block_norm/6": 34923.703125, "geo/tier1_time_s": 1.36307692527771, "geo/step": 51675.0, "geo/rankme_slope": -0.00014386387367446978} {"step": 51680, "timestamp": 1778250404.3205287, "train/loss": 2.1350812911987305, "train/z_loss": 0.001386784901842475, "train/perplexity": 8.45773401825375, "train/grad_norm": 0.10009765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1696702.0889426288, "perf/iters_per_sec": 0.809050602408709, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2360166311264038, "data/tokens_consumed": 108382912512, "data/tokens_consumed_B": 108.382912512, "train/loss_slope": -3.2281522954006824e-07} {"step": 51690, "timestamp": 1778250414.700543, "train/loss": 2.177630400657654, "train/z_loss": 0.0013811520882882177, "train/perplexity": 8.825368873631946, "train/grad_norm": 0.11767578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021275.613229371, "perf/iters_per_sec": 0.963819319357572, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0375388622283936, "data/tokens_consumed": 108403884032, "data/tokens_consumed_B": 108.403884032, "train/loss_slope": 1.73717110928852e-06} {"step": 51700, "timestamp": 1778250425.0620718, "grad/layer_0/attn": 0.002939558122307062, "grad/layer_0/mlp": 0.0030018214602023363, "grad/layer_0/attn_mlp_ratio": 0.979258114899049, "grad/layer_4/attn": 0.002660423284396529, "grad/layer_4/mlp": 0.0024877418763935566, "grad/layer_4/attn_mlp_ratio": 1.0694128690360831, "grad/layer_8/attn": 0.0034404771868139505, "grad/layer_8/mlp": 0.003569871885702014, "grad/layer_8/attn_mlp_ratio": 0.9637536585607794, "grad/layer_12/attn": 0.005774144548922777, "grad/layer_12/mlp": 0.006622934713959694, "grad/layer_12/attn_mlp_ratio": 0.8718407641204844, "grad/layer_16/attn": 0.004389235749840736, "grad/layer_16/mlp": 0.004481412936002016, "grad/layer_16/attn_mlp_ratio": 0.9794312005118115, "grad/layer_20/attn": 0.0036104086320847273, "grad/layer_20/mlp": 0.00610272865742445, "grad/layer_20/attn_mlp_ratio": 0.5916056203042583, "grad/layer_24/attn": 0.008890632539987564, "grad/layer_24/mlp": 0.010452802293002605, "grad/layer_24/attn_mlp_ratio": 0.8505501401173717, "grad/layer_27/attn": 0.0072495099157094955, "grad/layer_27/mlp": 0.0097661092877388, "grad/layer_27/attn_mlp_ratio": 0.7423129956757544} {"step": 51700, "timestamp": 1778250425.0763853, "train/loss": 2.1557145595550535, "train/z_loss": 0.0013735563261434436, "train/perplexity": 8.634057523541166, "train/grad_norm": 0.1123046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022273.2843092817, "perf/iters_per_sec": 0.9642950460001382, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0370270013809204, "data/tokens_consumed": 108424855552, "data/tokens_consumed_B": 108.424855552, "train/loss_slope": 4.1472419689555335e-07} {"step": 51710, "timestamp": 1778250435.4533322, "train/loss": 2.1951541185379027, "train/z_loss": 0.0013914215844124556, "train/perplexity": 8.981385148106934, "train/grad_norm": 0.189453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022206.5222110117, "perf/iters_per_sec": 0.9642632113509234, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0370612382888793, "data/tokens_consumed": 108445827072, "data/tokens_consumed_B": 108.445827072, "train/loss_slope": 4.473384228548131e-06} {"step": 51720, "timestamp": 1778250445.8356237, "train/loss": 2.162011432647705, "train/z_loss": 0.0013711869250983, "train/perplexity": 8.688596620719393, "train/grad_norm": 0.2890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020853.216113446, "perf/iters_per_sec": 0.9636179047171812, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0377557277679443, "data/tokens_consumed": 108466798592, "data/tokens_consumed_B": 108.466798592, "train/loss_slope": 3.5664474359690155e-06} {"step": 51730, "timestamp": 1778250456.2095914, "train/loss": 2.1312241911888123, "train/z_loss": 0.0013918800512328744, "train/perplexity": 8.425174525264644, "train/grad_norm": 0.1513671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022731.3930359632, "perf/iters_per_sec": 0.964513489263517, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0367921352386475, "data/tokens_consumed": 108487770112, "data/tokens_consumed_B": 108.487770112, "train/loss_slope": 1.6705510067646432e-06} {"step": 51740, "timestamp": 1778250466.5917852, "train/loss": 2.1520307302474975, "train/z_loss": 0.001383156538940966, "train/perplexity": 8.602309642183856, "train/grad_norm": 0.1474609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020924.9963097142, "perf/iters_per_sec": 0.963652132181985, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0377188682556153, "data/tokens_consumed": 108508741632, "data/tokens_consumed_B": 108.508741632, "train/loss_slope": -7.093529782780271e-07} {"step": 51750, "timestamp": 1778250476.9489982, "grad/layer_0/attn": 0.002663739724084735, "grad/layer_0/mlp": 0.002823296468704939, "grad/layer_0/attn_mlp_ratio": 0.943485623724824, "grad/layer_4/attn": 0.0038480141665786505, "grad/layer_4/mlp": 0.002415320836007595, "grad/layer_4/attn_mlp_ratio": 1.593168886673597, "grad/layer_8/attn": 0.009195188991725445, "grad/layer_8/mlp": 0.0034814677201211452, "grad/layer_8/attn_mlp_ratio": 2.6411816701512634, "grad/layer_12/attn": 0.005952410865575075, "grad/layer_12/mlp": 0.0065748970955610275, "grad/layer_12/attn_mlp_ratio": 0.9053237926813187, "grad/layer_16/attn": 0.003274128772318363, "grad/layer_16/mlp": 0.004345808178186417, "grad/layer_16/attn_mlp_ratio": 0.7533992672324504, "grad/layer_20/attn": 0.00390630541369319, "grad/layer_20/mlp": 0.005480543710291386, "grad/layer_20/attn_mlp_ratio": 0.7127587241174708, "grad/layer_24/attn": 0.008587130345404148, "grad/layer_24/mlp": 0.009395452216267586, "grad/layer_24/attn_mlp_ratio": 0.9139666783829148, "grad/layer_27/attn": 0.008516419678926468, "grad/layer_27/mlp": 0.009004148654639721, "grad/layer_27/attn_mlp_ratio": 0.945832850055711} {"step": 51750, "timestamp": 1778250477.5643375, "eos/sharpness": 57.25791454315184, "eos/L0_probe": 1.9854573011398315, "eos/L_plus": 2.3142948150634766, "eos/L_minus": 2.229198932647705, "eos/grad_norm": 0.146336629986763, "eos/embed_grad_frac": 0.11664523184299469, "eos/time_s": 0.6124210357666016} {"step": 51750, "timestamp": 1778250477.584747, "train/loss": 2.189718317985535, "train/z_loss": 0.0013775809202343225, "train/perplexity": 8.9326965803168, "train/grad_norm": 0.146484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1908420.027221818, "perf/iters_per_sec": 0.9100055824383821, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0988943576812744, "data/tokens_consumed": 108529713152, "data/tokens_consumed_B": 108.529713152, "train/loss_slope": 4.348766328049359e-08} {"step": 51750, "timestamp": 1778250478.9481423, "geo/rankme_last": 439.0154724121094, "geo/layer_0/stable_rank_q_proj": 19.41132354736328, "geo/layer_0/stable_rank_k_proj": 16.340694427490234, "geo/layer_0/stable_rank_o_proj": 47.71223068237305, "geo/layer_0/stable_rank_gate_proj": 133.51507568359375, "geo/layer_0/stable_rank_down_proj": 54.00769805908203, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06471362709999084, "geo/layer_0/attn_entropy_mean": 6.178862571716309, "geo/layer_0/attn_entropy_std": 0.39835840463638306, "geo/layer_7/stable_rank_q_proj": 42.66860580444336, "geo/layer_7/stable_rank_k_proj": 41.86278533935547, "geo/layer_7/stable_rank_o_proj": 93.74215698242188, "geo/layer_7/stable_rank_gate_proj": 85.7771224975586, "geo/layer_7/stable_rank_down_proj": 142.99072265625, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4701319932937622, "geo/layer_7/attn_entropy_mean": 4.654058933258057, "geo/layer_7/attn_entropy_std": 0.7934154272079468, "geo/layer_14/stable_rank_q_proj": 52.213951110839844, "geo/layer_14/stable_rank_k_proj": 38.77272415161133, "geo/layer_14/stable_rank_o_proj": 45.15590286254883, "geo/layer_14/stable_rank_gate_proj": 73.43998718261719, "geo/layer_14/stable_rank_down_proj": 131.69119262695312, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39793944358825684, "geo/layer_14/attn_entropy_mean": 5.4902801513671875, "geo/layer_14/attn_entropy_std": 0.3865252435207367, "geo/layer_21/stable_rank_q_proj": 41.18081283569336, "geo/layer_21/stable_rank_k_proj": 30.4976863861084, "geo/layer_21/stable_rank_o_proj": 72.83191680908203, "geo/layer_21/stable_rank_gate_proj": 69.01435089111328, "geo/layer_21/stable_rank_down_proj": 53.33344268798828, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14507484436035156, "geo/layer_21/attn_entropy_mean": 5.708719253540039, "geo/layer_21/attn_entropy_std": 0.2963629961013794, "geo/layer_27/stable_rank_q_proj": 42.81664276123047, "geo/layer_27/stable_rank_k_proj": 31.505321502685547, "geo/layer_27/stable_rank_o_proj": 116.28392028808594, "geo/layer_27/stable_rank_gate_proj": 82.3879165649414, "geo/layer_27/stable_rank_down_proj": 129.03436279296875, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09173328429460526, "geo/layer_27/attn_entropy_mean": 4.251859664916992, "geo/layer_27/attn_entropy_std": 0.6993446946144104, "attnres/final_alpha/block_0": 0.23726452887058258, "attnres/block_norm/0": 1.7463444471359253, "attnres/final_alpha/block_1": 0.00475418521091342, "attnres/block_norm/1": 43950.30078125, "attnres/final_alpha/block_2": 0.010377115570008755, "attnres/block_norm/2": 27920.08984375, "attnres/final_alpha/block_3": 0.012156601995229721, "attnres/block_norm/3": 53362.703125, "attnres/final_alpha/block_4": 0.01439884677529335, "attnres/block_norm/4": 14145.380859375, "attnres/final_alpha/block_5": 0.6104930639266968, "attnres/block_norm/5": 6315.19921875, "attnres/final_alpha/block_6": 0.11055569350719452, "attnres/block_norm/6": 35031.3828125, "geo/tier1_time_s": 1.360093355178833, "geo/step": 51750.0, "geo/rankme_slope": -0.0001551285162502501} {"step": 51760, "timestamp": 1778250489.3319793, "train/loss": 2.181730127334595, "train/z_loss": 0.00137512416113168, "train/perplexity": 8.861624742632042, "train/grad_norm": 0.2216796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1785722.512255193, "perf/iters_per_sec": 0.8514988480831113, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1743997097015382, "data/tokens_consumed": 108550684672, "data/tokens_consumed_B": 108.550684672, "train/loss_slope": 1.561527927466213e-06} {"step": 51770, "timestamp": 1778250499.7090063, "train/loss": 2.138254928588867, "train/z_loss": 0.0013947680708952248, "train/perplexity": 8.484618437303737, "train/grad_norm": 0.1357421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021870.6399899947, "perf/iters_per_sec": 0.9641030502271627, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0372335195541382, "data/tokens_consumed": 108571656192, "data/tokens_consumed_B": 108.571656192, "train/loss_slope": 1.6507199673977445e-06} {"step": 51780, "timestamp": 1778250510.0937617, "train/loss": 2.146460437774658, "train/z_loss": 0.0013749600620940328, "train/perplexity": 8.55452547099402, "train/grad_norm": 0.1083984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020243.0118574102, "perf/iters_per_sec": 0.9633269366538096, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0380691766738892, "data/tokens_consumed": 108592627712, "data/tokens_consumed_B": 108.592627712, "train/loss_slope": 3.979745313207778e-06} {"step": 51790, "timestamp": 1778250520.4740512, "train/loss": 2.125274658203125, "train/z_loss": 0.0013818641076795758, "train/perplexity": 8.375197488948892, "train/grad_norm": 0.1396484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021786.5706908957, "perf/iters_per_sec": 0.9640629628614882, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0372766494750976, "data/tokens_consumed": 108613599232, "data/tokens_consumed_B": 108.613599232, "train/loss_slope": 2.8329681284320984e-06} {"step": 51800, "timestamp": 1778250530.8411655, "grad/layer_0/attn": 0.002977265976369381, "grad/layer_0/mlp": 0.002698057796806097, "grad/layer_0/attn_mlp_ratio": 1.1034848362200842, "grad/layer_4/attn": 0.0022077825851738453, "grad/layer_4/mlp": 0.0024606208316981792, "grad/layer_4/attn_mlp_ratio": 0.8972461205758177, "grad/layer_8/attn": 0.007457385305315256, "grad/layer_8/mlp": 0.0034234055783599615, "grad/layer_8/attn_mlp_ratio": 2.1783527884103524, "grad/layer_12/attn": 0.0037272064946591854, "grad/layer_12/mlp": 0.005906959995627403, "grad/layer_12/attn_mlp_ratio": 0.6309855550604161, "grad/layer_16/attn": 0.0037123304791748524, "grad/layer_16/mlp": 0.004323007073253393, "grad/layer_16/attn_mlp_ratio": 0.8587379873304815, "grad/layer_20/attn": 0.004288002382963896, "grad/layer_20/mlp": 0.005525712855160236, "grad/layer_20/attn_mlp_ratio": 0.776008891116848, "grad/layer_24/attn": 0.005853281822055578, "grad/layer_24/mlp": 0.007565128616988659, "grad/layer_24/attn_mlp_ratio": 0.7737187351368041, "grad/layer_27/attn": 0.005207314621657133, "grad/layer_27/mlp": 0.0070438990369439125, "grad/layer_27/attn_mlp_ratio": 0.7392659265016669} {"step": 51800, "timestamp": 1778250530.8554895, "train/loss": 2.151436281204224, "train/z_loss": 0.0013687290716916322, "train/perplexity": 8.597197527043631, "train/grad_norm": 0.10400390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021104.7477486439, "perf/iters_per_sec": 0.9637378443473071, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037626576423645, "data/tokens_consumed": 108634570752, "data/tokens_consumed_B": 108.634570752, "train/loss_slope": 3.432667867006441e-06} {"step": 51810, "timestamp": 1778250541.2336948, "train/loss": 2.162454390525818, "train/z_loss": 0.0013708379818126559, "train/perplexity": 8.692446155570236, "train/grad_norm": 0.2451171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021604.1889806683, "perf/iters_per_sec": 0.9639759964850751, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037370228767395, "data/tokens_consumed": 108655542272, "data/tokens_consumed_B": 108.655542272, "train/loss_slope": 5.161172536053972e-06} {"step": 51820, "timestamp": 1778250551.6171212, "train/loss": 2.135008227825165, "train/z_loss": 0.0013891326147131623, "train/perplexity": 8.457116090247885, "train/grad_norm": 0.1572265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020837.6164348326, "perf/iters_per_sec": 0.9636104662107623, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037763738632202, "data/tokens_consumed": 108676513792, "data/tokens_consumed_B": 108.676513792, "train/loss_slope": 7.219853567140031e-06} {"step": 51825, "timestamp": 1778250557.3936229, "eos/sharpness": 79.58362102508544, "eos/L0_probe": 1.9863625764846802, "eos/L_plus": 2.3217825889587402, "eos/L_minus": 2.4467787742614746, "eos/grad_norm": 0.25868600606918335, "eos/embed_grad_frac": 0.03852005675435066, "eos/time_s": 0.5970795154571533} {"step": 51825, "timestamp": 1778250558.770425, "geo/rankme_last": 438.2083740234375, "geo/layer_0/stable_rank_q_proj": 19.425888061523438, "geo/layer_0/stable_rank_k_proj": 16.379674911499023, "geo/layer_0/stable_rank_o_proj": 47.69828796386719, "geo/layer_0/stable_rank_gate_proj": 133.32711791992188, "geo/layer_0/stable_rank_down_proj": 53.881954193115234, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06842859834432602, "geo/layer_0/attn_entropy_mean": 6.181920528411865, "geo/layer_0/attn_entropy_std": 0.3939361572265625, "geo/layer_7/stable_rank_q_proj": 42.73805236816406, "geo/layer_7/stable_rank_k_proj": 41.93431854248047, "geo/layer_7/stable_rank_o_proj": 93.71937561035156, "geo/layer_7/stable_rank_gate_proj": 86.0337905883789, "geo/layer_7/stable_rank_down_proj": 143.2767791748047, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4832916855812073, "geo/layer_7/attn_entropy_mean": 4.656457424163818, "geo/layer_7/attn_entropy_std": 0.7956250309944153, "geo/layer_14/stable_rank_q_proj": 52.28663635253906, "geo/layer_14/stable_rank_k_proj": 38.871089935302734, "geo/layer_14/stable_rank_o_proj": 45.17810821533203, "geo/layer_14/stable_rank_gate_proj": 73.56663513183594, "geo/layer_14/stable_rank_down_proj": 131.68655395507812, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3856907784938812, "geo/layer_14/attn_entropy_mean": 5.506926536560059, "geo/layer_14/attn_entropy_std": 0.37743479013442993, "geo/layer_21/stable_rank_q_proj": 41.176143646240234, "geo/layer_21/stable_rank_k_proj": 30.46318244934082, "geo/layer_21/stable_rank_o_proj": 72.8792495727539, "geo/layer_21/stable_rank_gate_proj": 68.92232513427734, "geo/layer_21/stable_rank_down_proj": 53.29989242553711, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1447651982307434, "geo/layer_21/attn_entropy_mean": 5.696825981140137, "geo/layer_21/attn_entropy_std": 0.29871705174446106, "geo/layer_27/stable_rank_q_proj": 42.802738189697266, "geo/layer_27/stable_rank_k_proj": 31.487255096435547, "geo/layer_27/stable_rank_o_proj": 116.1621322631836, "geo/layer_27/stable_rank_gate_proj": 82.29724884033203, "geo/layer_27/stable_rank_down_proj": 128.83401489257812, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09505944699048996, "geo/layer_27/attn_entropy_mean": 4.243834018707275, "geo/layer_27/attn_entropy_std": 0.7110668420791626, "attnres/final_alpha/block_0": 0.24036645889282227, "attnres/block_norm/0": 1.7465708255767822, "attnres/final_alpha/block_1": 0.004831045866012573, "attnres/block_norm/1": 44002.9765625, "attnres/final_alpha/block_2": 0.010558324865996838, "attnres/block_norm/2": 27936.0703125, "attnres/final_alpha/block_3": 0.012419099919497967, "attnres/block_norm/3": 53153.4140625, "attnres/final_alpha/block_4": 0.014798357151448727, "attnres/block_norm/4": 14197.0498046875, "attnres/final_alpha/block_5": 0.6020106673240662, "attnres/block_norm/5": 6410.3583984375, "attnres/final_alpha/block_6": 0.11501604318618774, "attnres/block_norm/6": 35252.9921875, "geo/tier1_time_s": 1.3585655689239502, "geo/step": 51825.0, "geo/rankme_slope": -0.00020248140271733692} {"step": 51830, "timestamp": 1778250563.9642437, "train/loss": 2.1815964460372923, "train/z_loss": 0.0013550278847105802, "train/perplexity": 8.860440188318382, "train/grad_norm": 0.173828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1699165.4420077356, "perf/iters_per_sec": 0.8102252206839254, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2342247247695923, "data/tokens_consumed": 108697485312, "data/tokens_consumed_B": 108.697485312, "train/loss_slope": 7.352575131303878e-06} {"step": 51840, "timestamp": 1778250574.3493867, "train/loss": 2.1286526918411255, "train/z_loss": 0.001359676825813949, "train/perplexity": 8.403537026798732, "train/grad_norm": 0.18359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020235.5414947995, "perf/iters_per_sec": 0.9633233745073316, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0380730152130127, "data/tokens_consumed": 108718456832, "data/tokens_consumed_B": 108.718456832, "train/loss_slope": 6.75395719646656e-06} {"step": 51850, "timestamp": 1778250584.7163548, "grad/layer_0/attn": 0.0029710603412240744, "grad/layer_0/mlp": 0.003035092493519187, "grad/layer_0/attn_mlp_ratio": 0.9789027022003084, "grad/layer_4/attn": 0.0038810621481388807, "grad/layer_4/mlp": 0.002474200911819935, "grad/layer_4/attn_mlp_ratio": 1.5686123033649997, "grad/layer_8/attn": 0.0032701336313039064, "grad/layer_8/mlp": 0.0036437485832720995, "grad/layer_8/attn_mlp_ratio": 0.89746409962811, "grad/layer_12/attn": 0.005370361264795065, "grad/layer_12/mlp": 0.0066863554529845715, "grad/layer_12/attn_mlp_ratio": 0.8031821254850725, "grad/layer_16/attn": 0.003860214026644826, "grad/layer_16/mlp": 0.004690335597842932, "grad/layer_16/attn_mlp_ratio": 0.823014444023724, "grad/layer_20/attn": 0.0038111198227852583, "grad/layer_20/mlp": 0.006829114630818367, "grad/layer_20/attn_mlp_ratio": 0.5580693798548253, "grad/layer_24/attn": 0.02291053719818592, "grad/layer_24/mlp": 0.014137107878923416, "grad/layer_24/attn_mlp_ratio": 1.6205957563840174, "grad/layer_27/attn": 0.005342238117009401, "grad/layer_27/mlp": 0.014700782485306263, "grad/layer_27/attn_mlp_ratio": 0.3633982127148169} {"step": 51850, "timestamp": 1778250584.7307844, "train/loss": 2.1822051286697386, "train/z_loss": 0.0013736113207414747, "train/perplexity": 8.865835026082303, "train/grad_norm": 0.21875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021506.343720884, "perf/iters_per_sec": 0.963929340229456, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0374204397201539, "data/tokens_consumed": 108739428352, "data/tokens_consumed_B": 108.739428352, "train/loss_slope": 7.235548312407204e-06} {"step": 51860, "timestamp": 1778250595.1049256, "train/loss": 2.143327760696411, "train/z_loss": 0.0013941817916929722, "train/perplexity": 8.527768836964531, "train/grad_norm": 0.4375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022536.516838979, "perf/iters_per_sec": 0.9644205650515456, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036892032623291, "data/tokens_consumed": 108760399872, "data/tokens_consumed_B": 108.760399872, "train/loss_slope": 3.824113848114199e-06} {"step": 51870, "timestamp": 1778250605.4901276, "train/loss": 2.145684313774109, "train/z_loss": 0.0013761086040176452, "train/perplexity": 8.54788867428443, "train/grad_norm": 0.09130859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020292.7538170922, "perf/iters_per_sec": 0.9633506554685078, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0380436182022095, "data/tokens_consumed": 108781371392, "data/tokens_consumed_B": 108.781371392, "train/loss_slope": 3.063622616877434e-06} {"step": 51880, "timestamp": 1778250615.8722641, "train/loss": 2.1462822198867797, "train/z_loss": 0.001371327240485698, "train/perplexity": 8.553001037377483, "train/grad_norm": 0.09912109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021594.153133494, "perf/iters_per_sec": 0.9639712110202284, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373753786087037, "data/tokens_consumed": 108802342912, "data/tokens_consumed_B": 108.802342912, "train/loss_slope": 3.1301886478129375e-06} {"step": 51890, "timestamp": 1778250626.2468758, "train/loss": 2.165753412246704, "train/z_loss": 0.0013773708371445537, "train/perplexity": 8.721170078616613, "train/grad_norm": 0.12255859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022269.7508287, "perf/iters_per_sec": 0.964293361105299, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0370288133621215, "data/tokens_consumed": 108823314432, "data/tokens_consumed_B": 108.823314432, "train/loss_slope": 3.52319509390051e-06} {"step": 51900, "timestamp": 1778250636.6096654, "grad/layer_0/attn": 0.0027500640135258436, "grad/layer_0/mlp": 0.0029501072131097317, "grad/layer_0/attn_mlp_ratio": 0.9321911787090142, "grad/layer_4/attn": 0.0019295900128781796, "grad/layer_4/mlp": 0.0025139914359897375, "grad/layer_4/attn_mlp_ratio": 0.7675403776244283, "grad/layer_8/attn": 0.003944956231862307, "grad/layer_8/mlp": 0.003597748000174761, "grad/layer_8/attn_mlp_ratio": 1.0965070710955778, "grad/layer_12/attn": 0.005252786446362734, "grad/layer_12/mlp": 0.006683420389890671, "grad/layer_12/attn_mlp_ratio": 0.7859428348565063, "grad/layer_16/attn": 0.004461163189262152, "grad/layer_16/mlp": 0.004974567797034979, "grad/layer_16/attn_mlp_ratio": 0.896794109880612, "grad/layer_20/attn": 0.0034978589974343777, "grad/layer_20/mlp": 0.005797705613076687, "grad/layer_20/attn_mlp_ratio": 0.6033177899224826, "grad/layer_24/attn": 0.008908404037356377, "grad/layer_24/mlp": 0.008173806592822075, "grad/layer_24/attn_mlp_ratio": 1.0898721210497182, "grad/layer_27/attn": 0.005390380974858999, "grad/layer_27/mlp": 0.007207799237221479, "grad/layer_27/attn_mlp_ratio": 0.7478539180499616} {"step": 51900, "timestamp": 1778250637.2072194, "eos/sharpness": 12.838387489318846, "eos/L0_probe": 1.9862381219863892, "eos/L_plus": 2.052497386932373, "eos/L_minus": 2.0483627319335938, "eos/grad_norm": 0.09649652242660522, "eos/embed_grad_frac": 0.20782606303691864, "eos/time_s": 0.5946991443634033} {"step": 51900, "timestamp": 1778250637.226962, "train/loss": 2.1321425914764403, "train/z_loss": 0.001378724875394255, "train/perplexity": 8.432915762204962, "train/grad_norm": 0.0966796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1911161.650954177, "perf/iters_per_sec": 0.9113128905077824, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0973179578781127, "data/tokens_consumed": 108844285952, "data/tokens_consumed_B": 108.844285952, "train/loss_slope": 3.6362279330578164e-06} {"step": 51900, "timestamp": 1778250638.588646, "geo/rankme_last": 438.41131591796875, "geo/layer_0/stable_rank_q_proj": 19.42393684387207, "geo/layer_0/stable_rank_k_proj": 16.375831604003906, "geo/layer_0/stable_rank_o_proj": 47.666664123535156, "geo/layer_0/stable_rank_gate_proj": 133.36117553710938, "geo/layer_0/stable_rank_down_proj": 53.90012741088867, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06114571541547775, "geo/layer_0/attn_entropy_mean": 6.1744561195373535, "geo/layer_0/attn_entropy_std": 0.39831411838531494, "geo/layer_7/stable_rank_q_proj": 42.77895736694336, "geo/layer_7/stable_rank_k_proj": 42.019615173339844, "geo/layer_7/stable_rank_o_proj": 93.66910552978516, "geo/layer_7/stable_rank_gate_proj": 86.04096984863281, "geo/layer_7/stable_rank_down_proj": 143.1606903076172, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.48734140396118164, "geo/layer_7/attn_entropy_mean": 4.643383026123047, "geo/layer_7/attn_entropy_std": 0.8047823905944824, "geo/layer_14/stable_rank_q_proj": 52.307003021240234, "geo/layer_14/stable_rank_k_proj": 38.84160614013672, "geo/layer_14/stable_rank_o_proj": 45.252113342285156, "geo/layer_14/stable_rank_gate_proj": 73.4649429321289, "geo/layer_14/stable_rank_down_proj": 131.40487670898438, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.407010555267334, "geo/layer_14/attn_entropy_mean": 5.485209941864014, "geo/layer_14/attn_entropy_std": 0.3906889259815216, "geo/layer_21/stable_rank_q_proj": 41.05328369140625, "geo/layer_21/stable_rank_k_proj": 30.473196029663086, "geo/layer_21/stable_rank_o_proj": 72.86419677734375, "geo/layer_21/stable_rank_gate_proj": 68.92742919921875, "geo/layer_21/stable_rank_down_proj": 53.256622314453125, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14491122961044312, "geo/layer_21/attn_entropy_mean": 5.701332092285156, "geo/layer_21/attn_entropy_std": 0.28669872879981995, "geo/layer_27/stable_rank_q_proj": 42.81598663330078, "geo/layer_27/stable_rank_k_proj": 31.550357818603516, "geo/layer_27/stable_rank_o_proj": 116.1131362915039, "geo/layer_27/stable_rank_gate_proj": 82.33406066894531, "geo/layer_27/stable_rank_down_proj": 128.67520141601562, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08491490036249161, "geo/layer_27/attn_entropy_mean": 4.252035140991211, "geo/layer_27/attn_entropy_std": 0.689084529876709, "attnres/final_alpha/block_0": 0.2377704679965973, "attnres/block_norm/0": 1.7464336156845093, "attnres/final_alpha/block_1": 0.004790198523551226, "attnres/block_norm/1": 44219.1171875, "attnres/final_alpha/block_2": 0.010443262755870819, "attnres/block_norm/2": 27847.765625, "attnres/final_alpha/block_3": 0.012335737235844135, "attnres/block_norm/3": 52899.8046875, "attnres/final_alpha/block_4": 0.014512097463011742, "attnres/block_norm/4": 14210.484375, "attnres/final_alpha/block_5": 0.6085312366485596, "attnres/block_norm/5": 6354.3505859375, "attnres/final_alpha/block_6": 0.11161699146032333, "attnres/block_norm/6": 34979.7109375, "geo/tier1_time_s": 1.3581831455230713, "geo/step": 51900.0, "geo/rankme_slope": -0.00021295019961109444} {"step": 51910, "timestamp": 1778250648.9636095, "train/loss": 2.122350549697876, "train/z_loss": 0.001392479962669313, "train/perplexity": 8.350743273552395, "train/grad_norm": 0.1474609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1787353.9869983767, "perf/iters_per_sec": 0.8522767958633312, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1733277320861817, "data/tokens_consumed": 108865257472, "data/tokens_consumed_B": 108.865257472, "train/loss_slope": 1.135403023372634e-06} {"step": 51920, "timestamp": 1778250659.334612, "train/loss": 2.1004310131072996, "train/z_loss": 0.0013956342125311493, "train/perplexity": 8.16969039746947, "train/grad_norm": 0.25, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023058.3484723011, "perf/iters_per_sec": 0.9646693937646394, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0366245746612548, "data/tokens_consumed": 108886228992, "data/tokens_consumed_B": 108.886228992, "train/loss_slope": -5.453782706800121e-06} {"step": 51930, "timestamp": 1778250669.7044547, "train/loss": 2.1849690437316895, "train/z_loss": 0.0013630952686071395, "train/perplexity": 8.890373336329167, "train/grad_norm": 0.14453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023470.5429093945, "perf/iters_per_sec": 0.9648659433886502, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0364134073257447, "data/tokens_consumed": 108907200512, "data/tokens_consumed_B": 108.907200512, "train/loss_slope": -6.001187901650034e-06} {"step": 51940, "timestamp": 1778250680.0772629, "train/loss": 2.197150182723999, "train/z_loss": 0.0013720535207539797, "train/perplexity": 8.999330473394913, "train/grad_norm": 0.19140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022762.1394645711, "perf/iters_per_sec": 0.9645281503031593, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0367763757705688, "data/tokens_consumed": 108928172032, "data/tokens_consumed_B": 108.928172032, "train/loss_slope": -3.411308502314058e-06} {"step": 51950, "timestamp": 1778250690.4470592, "grad/layer_0/attn": 0.003218644065782428, "grad/layer_0/mlp": 0.003187032649293542, "grad/layer_0/attn_mlp_ratio": 1.0099187297325052, "grad/layer_4/attn": 0.002075496595352888, "grad/layer_4/mlp": 0.0026300724130123854, "grad/layer_4/attn_mlp_ratio": 0.7891404457802165, "grad/layer_8/attn": 0.008469993248581886, "grad/layer_8/mlp": 0.003949211910367012, "grad/layer_8/attn_mlp_ratio": 2.144729942668929, "grad/layer_12/attn": 0.004983270075172186, "grad/layer_12/mlp": 0.007618394680321217, "grad/layer_12/attn_mlp_ratio": 0.6541102448568668, "grad/layer_16/attn": 0.005125884432345629, "grad/layer_16/mlp": 0.005105066113173962, "grad/layer_16/attn_mlp_ratio": 1.0040779528222268, "grad/layer_20/attn": 0.004431471694260836, "grad/layer_20/mlp": 0.006757020018994808, "grad/layer_20/attn_mlp_ratio": 0.655832247976209, "grad/layer_24/attn": 0.006806573364883661, "grad/layer_24/mlp": 0.008807072415947914, "grad/layer_24/attn_mlp_ratio": 0.7728531078355795, "grad/layer_27/attn": 0.007686338387429714, "grad/layer_27/mlp": 0.008248664438724518, "grad/layer_27/attn_mlp_ratio": 0.9318282191432463} {"step": 51950, "timestamp": 1778250690.4615939, "train/loss": 2.150162672996521, "train/z_loss": 0.0014014704851433636, "train/perplexity": 8.586255035412655, "train/grad_norm": 0.1279296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020503.3954728798, "perf/iters_per_sec": 0.9634510972370528, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0379354000091552, "data/tokens_consumed": 108949143552, "data/tokens_consumed_B": 108.949143552, "train/loss_slope": -6.491849825184139e-06} {"step": 51960, "timestamp": 1778250700.8474593, "train/loss": 2.2191415786743165, "train/z_loss": 0.001379922078922391, "train/perplexity": 9.199430488125868, "train/grad_norm": 0.12353515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020946.029894659, "perf/iters_per_sec": 0.9636621617768569, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0377080678939818, "data/tokens_consumed": 108970115072, "data/tokens_consumed_B": 108.970115072, "train/loss_slope": -5.925039317992047e-06} {"step": 51970, "timestamp": 1778250711.2277133, "train/loss": 2.1233652353286745, "train/z_loss": 0.0013848894159309566, "train/perplexity": 8.359220953121, "train/grad_norm": 0.16796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021393.1319431798, "perf/iters_per_sec": 0.9638753566471003, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037478542327881, "data/tokens_consumed": 108991086592, "data/tokens_consumed_B": 108.991086592, "train/loss_slope": -4.99558523900871e-06} {"step": 51975, "timestamp": 1778250716.9961765, "eos/sharpness": 7.7402114868164045, "eos/L0_probe": 1.983891248703003, "eos/L_plus": 2.0320942401885986, "eos/L_minus": 2.0130903720855713, "eos/grad_norm": 0.10557214915752411, "eos/embed_grad_frac": 0.26782849431037903, "eos/time_s": 0.5951790809631348} {"step": 51975, "timestamp": 1778250718.3755095, "geo/rankme_last": 439.3214111328125, "geo/layer_0/stable_rank_q_proj": 19.435035705566406, "geo/layer_0/stable_rank_k_proj": 16.359956741333008, "geo/layer_0/stable_rank_o_proj": 47.65238571166992, "geo/layer_0/stable_rank_gate_proj": 133.52691650390625, "geo/layer_0/stable_rank_down_proj": 53.8426628112793, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06072396785020828, "geo/layer_0/attn_entropy_mean": 6.178256034851074, "geo/layer_0/attn_entropy_std": 0.40040960907936096, "geo/layer_7/stable_rank_q_proj": 42.93393325805664, "geo/layer_7/stable_rank_k_proj": 41.99725341796875, "geo/layer_7/stable_rank_o_proj": 93.73242950439453, "geo/layer_7/stable_rank_gate_proj": 86.00570678710938, "geo/layer_7/stable_rank_down_proj": 142.97402954101562, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.46209603548049927, "geo/layer_7/attn_entropy_mean": 4.6792097091674805, "geo/layer_7/attn_entropy_std": 0.796758770942688, "geo/layer_14/stable_rank_q_proj": 52.28422927856445, "geo/layer_14/stable_rank_k_proj": 38.853050231933594, "geo/layer_14/stable_rank_o_proj": 45.20964813232422, "geo/layer_14/stable_rank_gate_proj": 73.37590789794922, "geo/layer_14/stable_rank_down_proj": 131.62664794921875, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3979357182979584, "geo/layer_14/attn_entropy_mean": 5.517857551574707, "geo/layer_14/attn_entropy_std": 0.388002872467041, "geo/layer_21/stable_rank_q_proj": 41.0113525390625, "geo/layer_21/stable_rank_k_proj": 30.486289978027344, "geo/layer_21/stable_rank_o_proj": 72.80220031738281, "geo/layer_21/stable_rank_gate_proj": 68.97310638427734, "geo/layer_21/stable_rank_down_proj": 53.21337127685547, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14911657571792603, "geo/layer_21/attn_entropy_mean": 5.68159294128418, "geo/layer_21/attn_entropy_std": 0.2968330979347229, "geo/layer_27/stable_rank_q_proj": 42.87739562988281, "geo/layer_27/stable_rank_k_proj": 31.590709686279297, "geo/layer_27/stable_rank_o_proj": 115.76586151123047, "geo/layer_27/stable_rank_gate_proj": 82.22356414794922, "geo/layer_27/stable_rank_down_proj": 128.44735717773438, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09016599506139755, "geo/layer_27/attn_entropy_mean": 4.244969367980957, "geo/layer_27/attn_entropy_std": 0.6992610096931458, "attnres/final_alpha/block_0": 0.23820722103118896, "attnres/block_norm/0": 1.7467067241668701, "attnres/final_alpha/block_1": 0.004740268923342228, "attnres/block_norm/1": 44177.9453125, "attnres/final_alpha/block_2": 0.010445574298501015, "attnres/block_norm/2": 27868.8984375, "attnres/final_alpha/block_3": 0.012416277080774307, "attnres/block_norm/3": 53126.078125, "attnres/final_alpha/block_4": 0.014441794715821743, "attnres/block_norm/4": 14237.982421875, "attnres/final_alpha/block_5": 0.6074883937835693, "attnres/block_norm/5": 6371.1025390625, "attnres/final_alpha/block_6": 0.11226049065589905, "attnres/block_norm/6": 35101.7421875, "geo/tier1_time_s": 1.359292984008789, "geo/step": 51975.0, "geo/rankme_slope": -0.00018850241268382353} {"step": 51980, "timestamp": 1778250723.565606, "train/loss": 2.186460542678833, "train/z_loss": 0.0013855352415703237, "train/perplexity": 8.903643212343084, "train/grad_norm": 0.16015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1700432.6086417253, "perf/iters_per_sec": 0.8108294528206469, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2333049774169922, "data/tokens_consumed": 109012058112, "data/tokens_consumed_B": 109.012058112, "train/loss_slope": -3.127317789113855e-06} {"step": 51990, "timestamp": 1778250733.9383242, "train/loss": 2.1482258081436156, "train/z_loss": 0.0013804342714138329, "train/perplexity": 8.569640714857227, "train/grad_norm": 0.1181640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023149.9689154436, "perf/iters_per_sec": 0.9647130817963808, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0365776300430298, "data/tokens_consumed": 109033029632, "data/tokens_consumed_B": 109.033029632, "train/loss_slope": -3.822695969319265e-06} {"step": 52000, "timestamp": 1778250744.3029766, "grad/layer_0/attn": 0.00291746249422431, "grad/layer_0/mlp": 0.002958070719614625, "grad/layer_0/attn_mlp_ratio": 0.9862720239417373, "grad/layer_4/attn": 0.0029667147900909185, "grad/layer_4/mlp": 0.0025462813209742308, "grad/layer_4/attn_mlp_ratio": 1.1651166150188639, "grad/layer_8/attn": 0.005101813469082117, "grad/layer_8/mlp": 0.003713030833750963, "grad/layer_8/attn_mlp_ratio": 1.3740293469433027, "grad/layer_12/attn": 0.005866658873856068, "grad/layer_12/mlp": 0.00691412016749382, "grad/layer_12/attn_mlp_ratio": 0.8485040246461567, "grad/layer_16/attn": 0.003572241635993123, "grad/layer_16/mlp": 0.004351452458649874, "grad/layer_16/attn_mlp_ratio": 0.8209308473080271, "grad/layer_20/attn": 0.0036574590485543013, "grad/layer_20/mlp": 0.0059121460653841496, "grad/layer_20/attn_mlp_ratio": 0.618634747220708, "grad/layer_24/attn": 0.008633149787783623, "grad/layer_24/mlp": 0.009944990277290344, "grad/layer_24/attn_mlp_ratio": 0.868090310826007, "grad/layer_27/attn": 0.004122226499021053, "grad/layer_27/mlp": 0.010053744539618492, "grad/layer_27/attn_mlp_ratio": 0.4100190174690451} {"step": 52000, "timestamp": 1778250744.3172467, "train/loss": 2.1813191890716555, "train/z_loss": 0.0013771681347861885, "train/perplexity": 8.857983910083423, "train/grad_norm": 0.1298828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021642.6141870096, "perf/iters_per_sec": 0.9639943190512703, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373505115509034, "data/tokens_consumed": 109054001152, "data/tokens_consumed_B": 109.054001152, "train/loss_slope": -2.808688125415827e-06} {"step": 52000, "timestamp": 1778250751.622107, "geo/ww_alpha_mean": 7.425414876361849, "geo/ww_alpha_std": 3.9851675653166123, "geo/ww_alpha_min": 1.3439846054720948, "geo/ww_alpha_max": 26.506678262864405, "geo/ww_alpha_healthy_frac": 0.14213197969543148, "geo/ww_alpha_by_type/q_proj": 4.026376434045541, "geo/ww_alpha_by_type/k_proj": 4.368449748937254, "geo/ww_alpha_by_type/v_proj": 8.469786378839656, "geo/ww_alpha_by_type/o_proj": 7.112098178962492, "geo/ww_alpha_by_type/gate_proj": 8.100252103231949, "geo/ww_alpha_by_type/up_proj": 11.496769892975644, "geo/ww_alpha_by_type/down_proj": 8.496875772151226, "geo/twonn_id/layer_0": 0.6748447418212891, "geo/twonn_id/layer_7": 3.158618688583374, "geo/twonn_id/layer_14": 4.7513017654418945, "geo/twonn_id/layer_21": 7.418199062347412, "geo/twonn_id/layer_27": 5.804811000823975, "geo/tier2_time_s": 7.298128604888916} {"step": 52000, "timestamp": 1778250752.3977475, "eoc/jacobian_sigma/layer_0/attn": 1094.23583984375, "eoc/jacobian_sigma/layer_0/mlp": 7888.001953125, "eoc/jacobian_sigma/layer_0": 7888.001953125, "eoc/jacobian_sigma/layer_7/attn": 1.1544054746627808, "eoc/jacobian_sigma/layer_7/mlp": 1.7200051546096802, "eoc/jacobian_sigma/layer_7": 1.7200051546096802, "eoc/jacobian_sigma/layer_14/attn": 1.604271650314331, "eoc/jacobian_sigma/layer_14/mlp": 5.627504825592041, "eoc/jacobian_sigma/layer_14": 5.627504825592041, "eoc/jacobian_sigma/layer_21/attn": 1.0889800786972046, "eoc/jacobian_sigma/layer_21/mlp": 3.9050350189208984, "eoc/jacobian_sigma/layer_21": 3.9050350189208984, "eoc/jacobian_sigma/layer_27/attn": 3.343592405319214, "eoc/jacobian_sigma/layer_27/mlp": 32.511478424072266, "eoc/jacobian_sigma/layer_27": 32.511478424072266, "eoc/layer0_sigma": 7888.001953125, "eoc/sigma_max": 32.511478424072266, "eoc/sigma_min": 1.7200051546096802, "eoc/sigma_mean": 10.941005855798721, "eoc/time_s": 0.7689666748046875} {"step": 52010, "timestamp": 1778250762.797592, "train/loss": 2.1811886072158813, "train/z_loss": 0.0013702801428735257, "train/perplexity": 8.85682729362423, "train/grad_norm": 0.09912109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1135105.7635105709, "perf/iters_per_sec": 0.5412606065323691, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.8475388526916503, "data/tokens_consumed": 109074972672, "data/tokens_consumed_B": 109.074972672, "train/loss_slope": -1.7802085098189334e-06} {"step": 52020, "timestamp": 1778250773.1715403, "train/loss": 2.11511344909668, "train/z_loss": 0.0013868295820429922, "train/perplexity": 8.290526265484635, "train/grad_norm": 0.31640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022605.1610672032, "perf/iters_per_sec": 0.9644532971702591, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0368568420410156, "data/tokens_consumed": 109095944192, "data/tokens_consumed_B": 109.095944192, "train/loss_slope": -5.220192386479594e-06} {"step": 52030, "timestamp": 1778250783.5496073, "train/loss": 2.1440598487854006, "train/z_loss": 0.0013870560564100743, "train/perplexity": 8.534014200755017, "train/grad_norm": 0.1103515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021654.6020352028, "perf/iters_per_sec": 0.9640000353027357, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373443603515624, "data/tokens_consumed": 109116915712, "data/tokens_consumed_B": 109.116915712, "train/loss_slope": -4.143116061407808e-06} {"step": 52040, "timestamp": 1778250793.933354, "train/loss": 2.1690058946609496, "train/z_loss": 0.001382954337168485, "train/perplexity": 8.749581710048385, "train/grad_norm": 0.11181640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021100.846835516, "perf/iters_per_sec": 0.9637359842469768, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0376285791397095, "data/tokens_consumed": 109137887232, "data/tokens_consumed_B": 109.137887232, "train/loss_slope": -4.049256010310225e-06} {"step": 52050, "timestamp": 1778250804.3017929, "grad/layer_0/attn": 0.002740283729508519, "grad/layer_0/mlp": 0.0027068331837654114, "grad/layer_0/attn_mlp_ratio": 1.0123577783470195, "grad/layer_4/attn": 0.002683585276827216, "grad/layer_4/mlp": 0.002552183112129569, "grad/layer_4/attn_mlp_ratio": 1.0514861409922072, "grad/layer_8/attn": 0.004816936794668436, "grad/layer_8/mlp": 0.0036122328601777554, "grad/layer_8/attn_mlp_ratio": 1.3335066834757519, "grad/layer_12/attn": 0.005137018859386444, "grad/layer_12/mlp": 0.007819458842277527, "grad/layer_12/attn_mlp_ratio": 0.6569532364460776, "grad/layer_16/attn": 0.003960090223699808, "grad/layer_16/mlp": 0.0045390864834189415, "grad/layer_16/attn_mlp_ratio": 0.8724420984093633, "grad/layer_20/attn": 0.003430447541177273, "grad/layer_20/mlp": 0.005736833438277245, "grad/layer_20/attn_mlp_ratio": 0.5979688129851902, "grad/layer_24/attn": 0.012077813036739826, "grad/layer_24/mlp": 0.00966469943523407, "grad/layer_24/attn_mlp_ratio": 1.249683240819686, "grad/layer_27/attn": 0.005103688221424818, "grad/layer_27/mlp": 0.00880366563796997, "grad/layer_27/attn_mlp_ratio": 0.5797230805132401} {"step": 52050, "timestamp": 1778250804.943911, "eos/sharpness": 28.39903831481933, "eos/L0_probe": 1.979985237121582, "eos/L_plus": 2.1179587841033936, "eos/L_minus": 2.126002073287964, "eos/grad_norm": 0.118258535861969, "eos/embed_grad_frac": 0.14560124278068542, "eos/time_s": 0.6393284797668457} {"step": 52050, "timestamp": 1778250804.9624217, "train/loss": 2.1667654752731322, "train/z_loss": 0.0013838708866387606, "train/perplexity": 8.730000920330804, "train/grad_norm": 0.1181640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1902413.7434936666, "perf/iters_per_sec": 0.9071415631740888, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1023637771606445, "data/tokens_consumed": 109158858752, "data/tokens_consumed_B": 109.158858752, "train/loss_slope": -5.163880457507606e-06} {"step": 52050, "timestamp": 1778250806.3250043, "geo/rankme_last": 439.33380126953125, "geo/layer_0/stable_rank_q_proj": 19.420801162719727, "geo/layer_0/stable_rank_k_proj": 16.353914260864258, "geo/layer_0/stable_rank_o_proj": 47.60629653930664, "geo/layer_0/stable_rank_gate_proj": 133.14443969726562, "geo/layer_0/stable_rank_down_proj": 53.8834114074707, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.07160638272762299, "geo/layer_0/attn_entropy_mean": 6.170281410217285, "geo/layer_0/attn_entropy_std": 0.3980695605278015, "geo/layer_7/stable_rank_q_proj": 43.04362487792969, "geo/layer_7/stable_rank_k_proj": 41.87739944458008, "geo/layer_7/stable_rank_o_proj": 93.62725067138672, "geo/layer_7/stable_rank_gate_proj": 85.95304870605469, "geo/layer_7/stable_rank_down_proj": 143.32984924316406, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.491411954164505, "geo/layer_7/attn_entropy_mean": 4.6633620262146, "geo/layer_7/attn_entropy_std": 0.7903672456741333, "geo/layer_14/stable_rank_q_proj": 52.27927017211914, "geo/layer_14/stable_rank_k_proj": 38.78867721557617, "geo/layer_14/stable_rank_o_proj": 45.12300491333008, "geo/layer_14/stable_rank_gate_proj": 73.46722412109375, "geo/layer_14/stable_rank_down_proj": 131.6369171142578, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3986424207687378, "geo/layer_14/attn_entropy_mean": 5.532718658447266, "geo/layer_14/attn_entropy_std": 0.3923196792602539, "geo/layer_21/stable_rank_q_proj": 40.94328308105469, "geo/layer_21/stable_rank_k_proj": 30.497833251953125, "geo/layer_21/stable_rank_o_proj": 72.83817291259766, "geo/layer_21/stable_rank_gate_proj": 68.86427307128906, "geo/layer_21/stable_rank_down_proj": 53.23894500732422, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14009268581867218, "geo/layer_21/attn_entropy_mean": 5.695991516113281, "geo/layer_21/attn_entropy_std": 0.29504233598709106, "geo/layer_27/stable_rank_q_proj": 42.88663101196289, "geo/layer_27/stable_rank_k_proj": 31.532512664794922, "geo/layer_27/stable_rank_o_proj": 115.54582977294922, "geo/layer_27/stable_rank_gate_proj": 82.20476531982422, "geo/layer_27/stable_rank_down_proj": 128.49452209472656, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08736064285039902, "geo/layer_27/attn_entropy_mean": 4.234027862548828, "geo/layer_27/attn_entropy_std": 0.6931300759315491, "attnres/final_alpha/block_0": 0.23826181888580322, "attnres/block_norm/0": 1.7470107078552246, "attnres/final_alpha/block_1": 0.004770700819790363, "attnres/block_norm/1": 44157.3359375, "attnres/final_alpha/block_2": 0.010294446721673012, "attnres/block_norm/2": 27932.330078125, "attnres/final_alpha/block_3": 0.012098332867026329, "attnres/block_norm/3": 53442.34765625, "attnres/final_alpha/block_4": 0.014568457379937172, "attnres/block_norm/4": 14219.751953125, "attnres/final_alpha/block_5": 0.6086183786392212, "attnres/block_norm/5": 6337.115234375, "attnres/final_alpha/block_6": 0.11138781905174255, "attnres/block_norm/6": 35019.1015625, "geo/tier1_time_s": 1.3583452701568604, "geo/step": 52050.0, "geo/rankme_slope": -0.00015539346598014207} {"step": 52060, "timestamp": 1778250816.6817503, "train/loss": 2.2230183124542235, "train/z_loss": 0.0013824672554619611, "train/perplexity": 9.23516344989232, "train/grad_norm": 0.2392578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1790076.4798237458, "perf/iters_per_sec": 0.8535749816054086, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1715432405471802, "data/tokens_consumed": 109179830272, "data/tokens_consumed_B": 109.179830272, "train/loss_slope": -1.6532894944846838e-07} {"step": 52070, "timestamp": 1778250827.0239573, "train/loss": 2.1571023941040037, "train/z_loss": 0.001376950496342033, "train/perplexity": 8.646048485681092, "train/grad_norm": 0.15625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029154.8876596899, "perf/iters_per_sec": 0.9675764501856279, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0335100650787354, "data/tokens_consumed": 109200801792, "data/tokens_consumed_B": 109.200801792, "train/loss_slope": 2.2505530358218914e-06} {"step": 52080, "timestamp": 1778250837.369395, "train/loss": 2.165903902053833, "train/z_loss": 0.0013721064548008143, "train/perplexity": 8.722482624579602, "train/grad_norm": 0.10888671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028501.2064929893, "perf/iters_per_sec": 0.9672647507157275, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0338431119918823, "data/tokens_consumed": 109221773312, "data/tokens_consumed_B": 109.221773312, "train/loss_slope": 1.302108677378494e-06} {"step": 52090, "timestamp": 1778250847.7220411, "train/loss": 2.2177113771438597, "train/z_loss": 0.001381530670914799, "train/perplexity": 9.18628285268768, "train/grad_norm": 0.1298828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027237.3424342636, "perf/iters_per_sec": 0.9666620933696096, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344876527786255, "data/tokens_consumed": 109242744832, "data/tokens_consumed_B": 109.242744832, "train/loss_slope": 8.676383933826051e-07} {"step": 52100, "timestamp": 1778250858.0655167, "grad/layer_0/attn": 0.0029850988648831844, "grad/layer_0/mlp": 0.0028449520468711853, "grad/layer_0/attn_mlp_ratio": 1.049261537901834, "grad/layer_4/attn": 0.002385698491707444, "grad/layer_4/mlp": 0.0026108347810804844, "grad/layer_4/attn_mlp_ratio": 0.9137684305489789, "grad/layer_8/attn": 0.006235230714082718, "grad/layer_8/mlp": 0.0036486927419900894, "grad/layer_8/attn_mlp_ratio": 1.7088943860458983, "grad/layer_12/attn": 0.004078139550983906, "grad/layer_12/mlp": 0.006599021144211292, "grad/layer_12/attn_mlp_ratio": 0.6179915778512274, "grad/layer_16/attn": 0.004692775662988424, "grad/layer_16/mlp": 0.004674009513109922, "grad/layer_16/attn_mlp_ratio": 1.0040149788793473, "grad/layer_20/attn": 0.006022011395543814, "grad/layer_20/mlp": 0.006395996548235416, "grad/layer_20/attn_mlp_ratio": 0.94152822878749, "grad/layer_24/attn": 0.012762326747179031, "grad/layer_24/mlp": 0.01211965549737215, "grad/layer_24/attn_mlp_ratio": 1.0530271792497328, "grad/layer_27/attn": 0.007260222919285297, "grad/layer_27/mlp": 0.011260580271482468, "grad/layer_27/attn_mlp_ratio": 0.6447467785649739} {"step": 52100, "timestamp": 1778250858.0798786, "train/loss": 2.1147874712944033, "train/z_loss": 0.0013793104444630444, "train/perplexity": 8.28782417838703, "train/grad_norm": 0.1611328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026054.5702233429, "perf/iters_per_sec": 0.9660981036297526, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350915670394898, "data/tokens_consumed": 109263716352, "data/tokens_consumed_B": 109.263716352, "train/loss_slope": -4.1864504157480145e-07} {"step": 52110, "timestamp": 1778250868.4324312, "train/loss": 2.2341299772262575, "train/z_loss": 0.0013717218651436269, "train/perplexity": 9.338353736466456, "train/grad_norm": 0.1083984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026767.1033758265, "perf/iters_per_sec": 0.9664378659133084, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347276687622071, "data/tokens_consumed": 109284687872, "data/tokens_consumed_B": 109.284687872, "train/loss_slope": 7.013674108537845e-06} {"step": 52120, "timestamp": 1778250878.780996, "train/loss": 2.140320670604706, "train/z_loss": 0.0013845445937477052, "train/perplexity": 8.502163585733996, "train/grad_norm": 0.1962890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028168.7951241457, "perf/iters_per_sec": 0.9671062446232537, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340125560760498, "data/tokens_consumed": 109305659392, "data/tokens_consumed_B": 109.305659392, "train/loss_slope": 8.094229034357446e-06} {"step": 52125, "timestamp": 1778250884.532184, "eos/sharpness": 6.233501434326171, "eos/L0_probe": 1.9852426052093506, "eos/L_plus": 2.0168750286102295, "eos/L_minus": 2.0159451961517334, "eos/grad_norm": 0.08409281820058823, "eos/embed_grad_frac": 0.31276994943618774, "eos/time_s": 0.5880677700042725} {"step": 52125, "timestamp": 1778250885.9087043, "geo/rankme_last": 438.7794189453125, "geo/layer_0/stable_rank_q_proj": 19.440656661987305, "geo/layer_0/stable_rank_k_proj": 16.365947723388672, "geo/layer_0/stable_rank_o_proj": 47.69705581665039, "geo/layer_0/stable_rank_gate_proj": 132.7416534423828, "geo/layer_0/stable_rank_down_proj": 53.83086395263672, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06482809036970139, "geo/layer_0/attn_entropy_mean": 6.17608642578125, "geo/layer_0/attn_entropy_std": 0.3998717963695526, "geo/layer_7/stable_rank_q_proj": 43.04208755493164, "geo/layer_7/stable_rank_k_proj": 41.945167541503906, "geo/layer_7/stable_rank_o_proj": 93.6760482788086, "geo/layer_7/stable_rank_gate_proj": 85.95310974121094, "geo/layer_7/stable_rank_down_proj": 143.07650756835938, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4820336103439331, "geo/layer_7/attn_entropy_mean": 4.650039196014404, "geo/layer_7/attn_entropy_std": 0.8046905994415283, "geo/layer_14/stable_rank_q_proj": 52.231082916259766, "geo/layer_14/stable_rank_k_proj": 38.66091537475586, "geo/layer_14/stable_rank_o_proj": 45.09653091430664, "geo/layer_14/stable_rank_gate_proj": 73.47077178955078, "geo/layer_14/stable_rank_down_proj": 131.7600555419922, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3751513957977295, "geo/layer_14/attn_entropy_mean": 5.531116962432861, "geo/layer_14/attn_entropy_std": 0.3878425359725952, "geo/layer_21/stable_rank_q_proj": 40.96028137207031, "geo/layer_21/stable_rank_k_proj": 30.39332389831543, "geo/layer_21/stable_rank_o_proj": 72.85222625732422, "geo/layer_21/stable_rank_gate_proj": 68.74372100830078, "geo/layer_21/stable_rank_down_proj": 53.238746643066406, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1468740850687027, "geo/layer_21/attn_entropy_mean": 5.688936233520508, "geo/layer_21/attn_entropy_std": 0.3021787703037262, "geo/layer_27/stable_rank_q_proj": 42.96735763549805, "geo/layer_27/stable_rank_k_proj": 31.584936141967773, "geo/layer_27/stable_rank_o_proj": 115.58384704589844, "geo/layer_27/stable_rank_gate_proj": 82.29499816894531, "geo/layer_27/stable_rank_down_proj": 128.7012481689453, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0858837217092514, "geo/layer_27/attn_entropy_mean": 4.226351261138916, "geo/layer_27/attn_entropy_std": 0.7212002277374268, "attnres/final_alpha/block_0": 0.2398584932088852, "attnres/block_norm/0": 1.7470550537109375, "attnres/final_alpha/block_1": 0.004770629107952118, "attnres/block_norm/1": 44216.7890625, "attnres/final_alpha/block_2": 0.010562373325228691, "attnres/block_norm/2": 27896.32421875, "attnres/final_alpha/block_3": 0.012456133961677551, "attnres/block_norm/3": 53371.51171875, "attnres/final_alpha/block_4": 0.014498625881969929, "attnres/block_norm/4": 14186.53125, "attnres/final_alpha/block_5": 0.6052114963531494, "attnres/block_norm/5": 6364.77734375, "attnres/final_alpha/block_6": 0.11264222860336304, "attnres/block_norm/6": 35162.5234375, "geo/tier1_time_s": 1.358097791671753, "geo/step": 52125.0, "geo/rankme_slope": -0.0001404256038352841} {"step": 52130, "timestamp": 1778250891.0882466, "train/loss": 2.188177704811096, "train/z_loss": 0.0013790664030238985, "train/perplexity": 8.918945345668401, "train/grad_norm": 0.10107421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1705055.306072315, "perf/iters_per_sec": 0.8130337267266822, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2299612760543823, "data/tokens_consumed": 109326630912, "data/tokens_consumed_B": 109.326630912, "train/loss_slope": 1.1204229250992882e-05} {"step": 52140, "timestamp": 1778250901.431204, "train/loss": 2.1514545679092407, "train/z_loss": 0.0013772144331596792, "train/perplexity": 8.597354742896256, "train/grad_norm": 0.2451171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028511.1239131144, "perf/iters_per_sec": 0.9672694797101566, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0338380575180053, "data/tokens_consumed": 109347602432, "data/tokens_consumed_B": 109.347602432, "train/loss_slope": 1.0592707298626743e-05} {"step": 52150, "timestamp": 1778250911.772255, "grad/layer_0/attn": 0.0025372665841132402, "grad/layer_0/mlp": 0.002639397280290723, "grad/layer_0/attn_mlp_ratio": 0.961305259700518, "grad/layer_4/attn": 0.0017858739010989666, "grad/layer_4/mlp": 0.0026420296635478735, "grad/layer_4/attn_mlp_ratio": 0.6759476845184325, "grad/layer_8/attn": 0.0037378498818725348, "grad/layer_8/mlp": 0.003595046466216445, "grad/layer_8/attn_mlp_ratio": 1.039722243655492, "grad/layer_12/attn": 0.004153035581111908, "grad/layer_12/mlp": 0.006528042256832123, "grad/layer_12/attn_mlp_ratio": 0.636183920707166, "grad/layer_16/attn": 0.009062914177775383, "grad/layer_16/mlp": 0.004335039760917425, "grad/layer_16/attn_mlp_ratio": 2.0906184184099748, "grad/layer_20/attn": 0.003774027107283473, "grad/layer_20/mlp": 0.005321130622178316, "grad/layer_20/attn_mlp_ratio": 0.7092528457445031, "grad/layer_24/attn": 0.011161868460476398, "grad/layer_24/mlp": 0.008058246225118637, "grad/layer_24/attn_mlp_ratio": 1.3851485807381383, "grad/layer_27/attn": 0.006004572380334139, "grad/layer_27/mlp": 0.007032758556306362, "grad/layer_27/attn_mlp_ratio": 0.8538004322030537} {"step": 52150, "timestamp": 1778250911.7866745, "train/loss": 2.154383087158203, "train/z_loss": 0.0013754956191405654, "train/perplexity": 8.622569164189663, "train/grad_norm": 0.091796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026524.4786657335, "perf/iters_per_sec": 0.9663221734360378, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348515510559082, "data/tokens_consumed": 109368573952, "data/tokens_consumed_B": 109.368573952, "train/loss_slope": 7.942070340094461e-06} {"step": 52160, "timestamp": 1778250922.1320634, "train/loss": 2.1918602705001833, "train/z_loss": 0.0013794527389109136, "train/perplexity": 8.951850498287866, "train/grad_norm": 0.1162109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028522.3045171828, "perf/iters_per_sec": 0.9672748110376276, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0338323593139649, "data/tokens_consumed": 109389545472, "data/tokens_consumed_B": 109.389545472, "train/loss_slope": 1.0192993822449728e-05} {"step": 52170, "timestamp": 1778250932.474972, "train/loss": 2.1179598569869995, "train/z_loss": 0.0013716126908548177, "train/perplexity": 8.314158101828829, "train/grad_norm": 0.2119140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028699.0586410065, "perf/iters_per_sec": 0.9673590939717324, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0337422847747804, "data/tokens_consumed": 109410516992, "data/tokens_consumed_B": 109.410516992, "train/loss_slope": 5.030665231688124e-06} {"step": 52180, "timestamp": 1778250942.8245142, "train/loss": 2.1479122638702393, "train/z_loss": 0.0013841330539435149, "train/perplexity": 8.566954174282907, "train/grad_norm": 0.1875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027876.5577108753, "perf/iters_per_sec": 0.9669668949655892, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341615676879883, "data/tokens_consumed": 109431488512, "data/tokens_consumed_B": 109.431488512, "train/loss_slope": 5.298166571169906e-06} {"step": 52190, "timestamp": 1778250953.166228, "train/loss": 2.1232938528060914, "train/z_loss": 0.0013736286200582981, "train/perplexity": 8.358624272139085, "train/grad_norm": 0.1875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028880.9444926786, "perf/iters_per_sec": 0.9674458239043611, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0336496114730835, "data/tokens_consumed": 109452460032, "data/tokens_consumed_B": 109.452460032, "train/loss_slope": 8.04532278131303e-06} {"step": 52200, "timestamp": 1778250963.4955838, "grad/layer_0/attn": 0.002423856407403946, "grad/layer_0/mlp": 0.0026621955912560225, "grad/layer_0/attn_mlp_ratio": 0.9104726656139887, "grad/layer_4/attn": 0.0022933254949748516, "grad/layer_4/mlp": 0.0024219690822064877, "grad/layer_4/attn_mlp_ratio": 0.9468846721185619, "grad/layer_8/attn": 0.00471790274605155, "grad/layer_8/mlp": 0.0034960240591317415, "grad/layer_8/attn_mlp_ratio": 1.3495051896961916, "grad/layer_12/attn": 0.00542408274486661, "grad/layer_12/mlp": 0.006589878350496292, "grad/layer_12/attn_mlp_ratio": 0.8230929880744788, "grad/layer_16/attn": 0.003578216303139925, "grad/layer_16/mlp": 0.004668019711971283, "grad/layer_16/attn_mlp_ratio": 0.7665383711447576, "grad/layer_20/attn": 0.0036414156202226877, "grad/layer_20/mlp": 0.006051270756870508, "grad/layer_20/attn_mlp_ratio": 0.6017604741801117, "grad/layer_24/attn": 0.013248797506093979, "grad/layer_24/mlp": 0.009598557837307453, "grad/layer_24/attn_mlp_ratio": 1.3802904136879623, "grad/layer_27/attn": 0.007506308611482382, "grad/layer_27/mlp": 0.008626652881503105, "grad/layer_27/attn_mlp_ratio": 0.8701298901876641} {"step": 52200, "timestamp": 1778250964.0878868, "eos/sharpness": 69.05279159545897, "eos/L0_probe": 1.981046438217163, "eos/L_plus": 2.2600276470184326, "eos/L_minus": 2.3925931453704834, "eos/grad_norm": 0.1721271425485611, "eos/embed_grad_frac": 0.07421072572469711, "eos/time_s": 0.5894107818603516} {"step": 52200, "timestamp": 1778250964.1078377, "train/loss": 2.1603580951690673, "train/z_loss": 0.001371310034301132, "train/perplexity": 8.67424330699495, "train/grad_norm": 0.171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1917494.9667287841, "perf/iters_per_sec": 0.9143328508037492, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0936936140060425, "data/tokens_consumed": 109473431552, "data/tokens_consumed_B": 109.473431552, "train/loss_slope": 1.2159303119509509e-05} {"step": 52200, "timestamp": 1778250965.4699974, "geo/rankme_last": 439.40985107421875, "geo/layer_0/stable_rank_q_proj": 19.44809913635254, "geo/layer_0/stable_rank_k_proj": 16.362743377685547, "geo/layer_0/stable_rank_o_proj": 47.706077575683594, "geo/layer_0/stable_rank_gate_proj": 132.33302307128906, "geo/layer_0/stable_rank_down_proj": 53.84268569946289, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06466200202703476, "geo/layer_0/attn_entropy_mean": 6.172865867614746, "geo/layer_0/attn_entropy_std": 0.40181344747543335, "geo/layer_7/stable_rank_q_proj": 42.98997497558594, "geo/layer_7/stable_rank_k_proj": 41.84589767456055, "geo/layer_7/stable_rank_o_proj": 93.6341552734375, "geo/layer_7/stable_rank_gate_proj": 85.97079467773438, "geo/layer_7/stable_rank_down_proj": 142.94187927246094, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4730999171733856, "geo/layer_7/attn_entropy_mean": 4.646941184997559, "geo/layer_7/attn_entropy_std": 0.804722785949707, "geo/layer_14/stable_rank_q_proj": 52.21724319458008, "geo/layer_14/stable_rank_k_proj": 38.63975524902344, "geo/layer_14/stable_rank_o_proj": 45.12238693237305, "geo/layer_14/stable_rank_gate_proj": 73.51225280761719, "geo/layer_14/stable_rank_down_proj": 131.86460876464844, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39523065090179443, "geo/layer_14/attn_entropy_mean": 5.552407741546631, "geo/layer_14/attn_entropy_std": 0.3749290108680725, "geo/layer_21/stable_rank_q_proj": 40.97901153564453, "geo/layer_21/stable_rank_k_proj": 30.374011993408203, "geo/layer_21/stable_rank_o_proj": 72.83853149414062, "geo/layer_21/stable_rank_gate_proj": 68.75878143310547, "geo/layer_21/stable_rank_down_proj": 53.24748229980469, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13967296481132507, "geo/layer_21/attn_entropy_mean": 5.697154998779297, "geo/layer_21/attn_entropy_std": 0.2978779673576355, "geo/layer_27/stable_rank_q_proj": 43.06557846069336, "geo/layer_27/stable_rank_k_proj": 31.523868560791016, "geo/layer_27/stable_rank_o_proj": 115.82244110107422, "geo/layer_27/stable_rank_gate_proj": 82.2739028930664, "geo/layer_27/stable_rank_down_proj": 128.6915283203125, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09453631937503815, "geo/layer_27/attn_entropy_mean": 4.24012565612793, "geo/layer_27/attn_entropy_std": 0.6988585591316223, "attnres/final_alpha/block_0": 0.2394060343503952, "attnres/block_norm/0": 1.7473794221878052, "attnres/final_alpha/block_1": 0.004780459683388472, "attnres/block_norm/1": 44243.09375, "attnres/final_alpha/block_2": 0.01059570349752903, "attnres/block_norm/2": 27888.06640625, "attnres/final_alpha/block_3": 0.012437139637768269, "attnres/block_norm/3": 53234.6015625, "attnres/final_alpha/block_4": 0.01477254368364811, "attnres/block_norm/4": 14251.3388671875, "attnres/final_alpha/block_5": 0.6044211387634277, "attnres/block_norm/5": 6387.1669921875, "attnres/final_alpha/block_6": 0.11358703672885895, "attnres/block_norm/6": 35119.2890625, "geo/tier1_time_s": 1.3580055236816406, "geo/step": 52200.0, "geo/rankme_slope": -9.672972704706883e-05} {"step": 52210, "timestamp": 1778250975.8148322, "train/loss": 2.1706787109375, "train/z_loss": 0.001376608689315617, "train/perplexity": 8.764230401614833, "train/grad_norm": 0.1962890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1791960.2995369718, "perf/iters_per_sec": 0.8544732568440303, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1703116416931152, "data/tokens_consumed": 109494403072, "data/tokens_consumed_B": 109.494403072, "train/loss_slope": 1.289380266256532e-05} {"step": 52220, "timestamp": 1778250986.1562264, "train/loss": 2.139925754070282, "train/z_loss": 0.0013884952059015632, "train/perplexity": 8.498806603663104, "train/grad_norm": 0.17578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029313.3525925346, "perf/iters_per_sec": 0.9676520121538804, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0334293603897096, "data/tokens_consumed": 109515374592, "data/tokens_consumed_B": 109.515374592, "train/loss_slope": 9.006857585878396e-06} {"step": 52230, "timestamp": 1778250996.500074, "train/loss": 2.1353844165802003, "train/z_loss": 0.001385248126462102, "train/perplexity": 8.460298160713101, "train/grad_norm": 0.09619140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028416.1171425132, "perf/iters_per_sec": 0.9672241769516531, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033886480331421, "data/tokens_consumed": 109536346112, "data/tokens_consumed_B": 109.536346112, "train/loss_slope": 3.971230343516245e-06} {"step": 52240, "timestamp": 1778251006.8500166, "train/loss": 2.176018166542053, "train/z_loss": 0.001371402887161821, "train/perplexity": 8.811151776575105, "train/grad_norm": 0.10009765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027409.8071957352, "perf/iters_per_sec": 0.9667443309763599, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034399652481079, "data/tokens_consumed": 109557317632, "data/tokens_consumed_B": 109.557317632, "train/loss_slope": 2.187735069416822e-06} {"step": 52250, "timestamp": 1778251017.1907833, "grad/layer_0/attn": 0.0029060118831694126, "grad/layer_0/mlp": 0.0031272664200514555, "grad/layer_0/attn_mlp_ratio": 0.9292498303347672, "grad/layer_4/attn": 0.00257036997936666, "grad/layer_4/mlp": 0.00260941032320261, "grad/layer_4/attn_mlp_ratio": 0.9850385958878656, "grad/layer_8/attn": 0.0034946973901242018, "grad/layer_8/mlp": 0.004067088011652231, "grad/layer_8/attn_mlp_ratio": 0.8592627683948802, "grad/layer_12/attn": 0.004472849424928427, "grad/layer_12/mlp": 0.006638149730861187, "grad/layer_12/attn_mlp_ratio": 0.6738096516191697, "grad/layer_16/attn": 0.0035747617948800325, "grad/layer_16/mlp": 0.0045841168612241745, "grad/layer_16/attn_mlp_ratio": 0.7798147004358728, "grad/layer_20/attn": 0.00419371435418725, "grad/layer_20/mlp": 0.006750928238034248, "grad/layer_20/attn_mlp_ratio": 0.6212055800622506, "grad/layer_24/attn": 0.02099231630563736, "grad/layer_24/mlp": 0.01406711433082819, "grad/layer_24/attn_mlp_ratio": 1.4922972588914565, "grad/layer_27/attn": 0.005791500210762024, "grad/layer_27/mlp": 0.014694617129862309, "grad/layer_27/attn_mlp_ratio": 0.3941239244389826} {"step": 52250, "timestamp": 1778251017.2050662, "train/loss": 2.1664934158325195, "train/z_loss": 0.0013688051025383175, "train/perplexity": 8.727626164215927, "train/grad_norm": 0.234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026306.0447226078, "perf/iters_per_sec": 0.9662180160153426, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03496310710907, "data/tokens_consumed": 109578289152, "data/tokens_consumed_B": 109.578289152, "train/loss_slope": 3.298976168845667e-06} {"step": 52260, "timestamp": 1778251027.556073, "train/loss": 2.0779562115669252, "train/z_loss": 0.0013937084819190203, "train/perplexity": 7.988126179551256, "train/grad_norm": 0.2412109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027096.0189020326, "perf/iters_per_sec": 0.9665947050581134, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345597743988038, "data/tokens_consumed": 109599260672, "data/tokens_consumed_B": 109.599260672, "train/loss_slope": 2.4409548927991682e-06} {"step": 52270, "timestamp": 1778251037.9024022, "train/loss": 2.1508774280548097, "train/z_loss": 0.0013807130977511405, "train/perplexity": 8.592394298404246, "train/grad_norm": 0.228515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028103.7942404859, "perf/iters_per_sec": 0.9670752497866086, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034045696258545, "data/tokens_consumed": 109620232192, "data/tokens_consumed_B": 109.620232192, "train/loss_slope": 2.7677365190590384e-06} {"step": 52275, "timestamp": 1778251043.6673224, "eos/sharpness": 13.09690475463867, "eos/L0_probe": 1.9785406589508057, "eos/L_plus": 2.0534040927886963, "eos/L_minus": 2.0346462726593018, "eos/grad_norm": 0.11853047460317612, "eos/embed_grad_frac": 0.18742263317108154, "eos/time_s": 0.5971503257751465} {"step": 52275, "timestamp": 1778251045.0467622, "geo/rankme_last": 439.2709655761719, "geo/layer_0/stable_rank_q_proj": 19.457752227783203, "geo/layer_0/stable_rank_k_proj": 16.388813018798828, "geo/layer_0/stable_rank_o_proj": 47.74365234375, "geo/layer_0/stable_rank_gate_proj": 132.23480224609375, "geo/layer_0/stable_rank_down_proj": 53.88385772705078, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05852188169956207, "geo/layer_0/attn_entropy_mean": 6.173705577850342, "geo/layer_0/attn_entropy_std": 0.40284135937690735, "geo/layer_7/stable_rank_q_proj": 42.981361389160156, "geo/layer_7/stable_rank_k_proj": 41.86902618408203, "geo/layer_7/stable_rank_o_proj": 93.67709350585938, "geo/layer_7/stable_rank_gate_proj": 85.96044921875, "geo/layer_7/stable_rank_down_proj": 143.06021118164062, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.47574707865715027, "geo/layer_7/attn_entropy_mean": 4.656661033630371, "geo/layer_7/attn_entropy_std": 0.7648288607597351, "geo/layer_14/stable_rank_q_proj": 52.27915954589844, "geo/layer_14/stable_rank_k_proj": 38.74596405029297, "geo/layer_14/stable_rank_o_proj": 45.08556365966797, "geo/layer_14/stable_rank_gate_proj": 73.50559997558594, "geo/layer_14/stable_rank_down_proj": 131.8975830078125, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3995942175388336, "geo/layer_14/attn_entropy_mean": 5.501903533935547, "geo/layer_14/attn_entropy_std": 0.3713397979736328, "geo/layer_21/stable_rank_q_proj": 40.963321685791016, "geo/layer_21/stable_rank_k_proj": 30.375232696533203, "geo/layer_21/stable_rank_o_proj": 72.86064910888672, "geo/layer_21/stable_rank_gate_proj": 68.70072174072266, "geo/layer_21/stable_rank_down_proj": 53.30414581298828, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14721208810806274, "geo/layer_21/attn_entropy_mean": 5.704951286315918, "geo/layer_21/attn_entropy_std": 0.30735883116722107, "geo/layer_27/stable_rank_q_proj": 43.04253387451172, "geo/layer_27/stable_rank_k_proj": 31.476177215576172, "geo/layer_27/stable_rank_o_proj": 115.83726501464844, "geo/layer_27/stable_rank_gate_proj": 82.30742645263672, "geo/layer_27/stable_rank_down_proj": 128.8539276123047, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08702672272920609, "geo/layer_27/attn_entropy_mean": 4.24894905090332, "geo/layer_27/attn_entropy_std": 0.6925103664398193, "attnres/final_alpha/block_0": 0.23911447823047638, "attnres/block_norm/0": 1.7472848892211914, "attnres/final_alpha/block_1": 0.004813671112060547, "attnres/block_norm/1": 44348.63671875, "attnres/final_alpha/block_2": 0.010541900061070919, "attnres/block_norm/2": 27789.671875, "attnres/final_alpha/block_3": 0.012280222959816456, "attnres/block_norm/3": 53349.0078125, "attnres/final_alpha/block_4": 0.01456754095852375, "attnres/block_norm/4": 14279.734375, "attnres/final_alpha/block_5": 0.605074942111969, "attnres/block_norm/5": 6389.7373046875, "attnres/final_alpha/block_6": 0.1136072427034378, "attnres/block_norm/6": 34893.328125, "geo/tier1_time_s": 1.360724687576294, "geo/step": 52275.0, "geo/rankme_slope": -6.168191886129451e-05} {"step": 52280, "timestamp": 1778251050.2202392, "train/loss": 2.183436918258667, "train/z_loss": 0.0013712337822653353, "train/perplexity": 8.876762598217676, "train/grad_norm": 0.2578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1703541.4855042277, "perf/iters_per_sec": 0.812311880828966, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2310542583465576, "data/tokens_consumed": 109641203712, "data/tokens_consumed_B": 109.641203712, "train/loss_slope": 3.737155961232136e-06} {"step": 52290, "timestamp": 1778251060.57461, "train/loss": 2.1994522333145143, "train/z_loss": 0.001374941226094961, "train/perplexity": 9.020071251426053, "train/grad_norm": 0.263671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026803.9504304666, "perf/iters_per_sec": 0.9664554359581311, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034708857536316, "data/tokens_consumed": 109662175232, "data/tokens_consumed_B": 109.662175232, "train/loss_slope": 7.909726552908928e-06} {"step": 52300, "timestamp": 1778251070.9070325, "grad/layer_0/attn": 0.003339641261845827, "grad/layer_0/mlp": 0.002920720959082246, "grad/layer_0/attn_mlp_ratio": 1.1434304044410235, "grad/layer_4/attn": 0.0026720846071839333, "grad/layer_4/mlp": 0.00249902019277215, "grad/layer_4/attn_mlp_ratio": 1.069252864777582, "grad/layer_8/attn": 0.002969181863591075, "grad/layer_8/mlp": 0.0036349575966596603, "grad/layer_8/attn_mlp_ratio": 0.8168408304502692, "grad/layer_12/attn": 0.0043558417819440365, "grad/layer_12/mlp": 0.007026843726634979, "grad/layer_12/attn_mlp_ratio": 0.6198859529840961, "grad/layer_16/attn": 0.0046773203648626804, "grad/layer_16/mlp": 0.004455155692994595, "grad/layer_16/attn_mlp_ratio": 1.0498668469052022, "grad/layer_20/attn": 0.003591774730011821, "grad/layer_20/mlp": 0.006832316517829895, "grad/layer_20/attn_mlp_ratio": 0.5257037884688449, "grad/layer_24/attn": 0.012241121381521225, "grad/layer_24/mlp": 0.00912920106202364, "grad/layer_24/attn_mlp_ratio": 1.3408754133321974, "grad/layer_27/attn": 0.00460426602512598, "grad/layer_27/mlp": 0.008823085576295853, "grad/layer_27/attn_mlp_ratio": 0.52184305967875} {"step": 52300, "timestamp": 1778251070.921546, "train/loss": 2.0938350200653075, "train/z_loss": 0.0013835021178238093, "train/perplexity": 8.115980506217127, "train/grad_norm": 0.134765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028241.9845589146, "perf/iters_per_sec": 0.9671411440653394, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0339752435684204, "data/tokens_consumed": 109683146752, "data/tokens_consumed_B": 109.683146752, "train/loss_slope": 5.42146359125205e-06} {"step": 52310, "timestamp": 1778251081.64704, "train/loss": 2.1403947353363035, "train/z_loss": 0.0013811437413096428, "train/perplexity": 8.502793319518215, "train/grad_norm": 0.11279296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1956290.4530850658, "perf/iters_per_sec": 0.9328319802689866, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0720044136047364, "data/tokens_consumed": 109704118272, "data/tokens_consumed_B": 109.704118272, "train/loss_slope": 2.4082850880570184e-06} {"step": 52320, "timestamp": 1778251092.000452, "train/loss": 2.1284767508506777, "train/z_loss": 0.0013899745885282756, "train/perplexity": 8.402058630230064, "train/grad_norm": 0.2275390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026670.2519813855, "perf/iters_per_sec": 0.9663916835696151, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347771167755127, "data/tokens_consumed": 109725089792, "data/tokens_consumed_B": 109.725089792, "train/loss_slope": -3.790233072954048e-09} {"step": 52330, "timestamp": 1778251102.3484073, "train/loss": 2.189206290245056, "train/z_loss": 0.0013710589497350155, "train/perplexity": 8.928123962623852, "train/grad_norm": 0.2236328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028058.2961008963, "perf/iters_per_sec": 0.9670535545830232, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340688943862915, "data/tokens_consumed": 109746061312, "data/tokens_consumed_B": 109.746061312, "train/loss_slope": 3.0140098780080946e-06} {"step": 52340, "timestamp": 1778251112.6967125, "train/loss": 2.1466387271881104, "train/z_loss": 0.0013845462934114039, "train/perplexity": 8.556050788292529, "train/grad_norm": 0.11376953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027454.5285338734, "perf/iters_per_sec": 0.9667656557721488, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343768358230592, "data/tokens_consumed": 109767032832, "data/tokens_consumed_B": 109.767032832, "train/loss_slope": 2.7423717150844464e-06} {"step": 52350, "timestamp": 1778251123.038404, "grad/layer_0/attn": 0.003371782135218382, "grad/layer_0/mlp": 0.003018411109223962, "grad/layer_0/attn_mlp_ratio": 1.1170718306752088, "grad/layer_4/attn": 0.0035534948110580444, "grad/layer_4/mlp": 0.0025737725663930178, "grad/layer_4/attn_mlp_ratio": 1.3806560530608347, "grad/layer_8/attn": 0.003736465470865369, "grad/layer_8/mlp": 0.0037699437234550714, "grad/layer_8/attn_mlp_ratio": 0.9911196680487879, "grad/layer_12/attn": 0.006340629421174526, "grad/layer_12/mlp": 0.006539430934935808, "grad/layer_12/attn_mlp_ratio": 0.9695995549613999, "grad/layer_16/attn": 0.0037839384749531746, "grad/layer_16/mlp": 0.004945080261677504, "grad/layer_16/attn_mlp_ratio": 0.765192514216202, "grad/layer_20/attn": 0.011184723116457462, "grad/layer_20/mlp": 0.006121664308011532, "grad/layer_20/attn_mlp_ratio": 1.8270722422842742, "grad/layer_24/attn": 0.006755907088518143, "grad/layer_24/mlp": 0.008819986134767532, "grad/layer_24/attn_mlp_ratio": 0.7659770558243063, "grad/layer_27/attn": 0.007542010396718979, "grad/layer_27/mlp": 0.008064142428338528, "grad/layer_27/attn_mlp_ratio": 0.9352526161604036} {"step": 52350, "timestamp": 1778251123.6442778, "eos/sharpness": 72.2156524658203, "eos/L0_probe": 1.9799644947052002, "eos/L_plus": 2.4202404022216797, "eos/L_minus": 2.261845111846924, "eos/grad_norm": 0.1665629744529724, "eos/embed_grad_frac": 0.0803891122341156, "eos/time_s": 0.6029696464538574} {"step": 52350, "timestamp": 1778251123.6644366, "train/loss": 2.1267889738082886, "train/z_loss": 0.001388342143036425, "train/perplexity": 8.38788978885119, "train/grad_norm": 0.1669921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1913417.349697447, "perf/iters_per_sec": 0.9123884914862856, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0960243463516235, "data/tokens_consumed": 109788004352, "data/tokens_consumed_B": 109.788004352, "train/loss_slope": 1.7351476415799944e-06} {"step": 52350, "timestamp": 1778251125.0294487, "geo/rankme_last": 439.4712219238281, "geo/layer_0/stable_rank_q_proj": 19.505157470703125, "geo/layer_0/stable_rank_k_proj": 16.398305892944336, "geo/layer_0/stable_rank_o_proj": 47.76802062988281, "geo/layer_0/stable_rank_gate_proj": 132.36688232421875, "geo/layer_0/stable_rank_down_proj": 53.917694091796875, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.061343368142843246, "geo/layer_0/attn_entropy_mean": 6.173657417297363, "geo/layer_0/attn_entropy_std": 0.40099695324897766, "geo/layer_7/stable_rank_q_proj": 43.006683349609375, "geo/layer_7/stable_rank_k_proj": 41.86201477050781, "geo/layer_7/stable_rank_o_proj": 93.62228393554688, "geo/layer_7/stable_rank_gate_proj": 86.07323455810547, "geo/layer_7/stable_rank_down_proj": 143.2434539794922, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4728822410106659, "geo/layer_7/attn_entropy_mean": 4.672918319702148, "geo/layer_7/attn_entropy_std": 0.7971332669258118, "geo/layer_14/stable_rank_q_proj": 52.26156997680664, "geo/layer_14/stable_rank_k_proj": 38.76289367675781, "geo/layer_14/stable_rank_o_proj": 45.09604263305664, "geo/layer_14/stable_rank_gate_proj": 73.54238891601562, "geo/layer_14/stable_rank_down_proj": 131.7249755859375, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3810572028160095, "geo/layer_14/attn_entropy_mean": 5.5115203857421875, "geo/layer_14/attn_entropy_std": 0.37466859817504883, "geo/layer_21/stable_rank_q_proj": 40.993858337402344, "geo/layer_21/stable_rank_k_proj": 30.382904052734375, "geo/layer_21/stable_rank_o_proj": 72.86980438232422, "geo/layer_21/stable_rank_gate_proj": 68.84822845458984, "geo/layer_21/stable_rank_down_proj": 53.32077407836914, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14786355197429657, "geo/layer_21/attn_entropy_mean": 5.709074974060059, "geo/layer_21/attn_entropy_std": 0.29028618335723877, "geo/layer_27/stable_rank_q_proj": 42.87711715698242, "geo/layer_27/stable_rank_k_proj": 31.402009963989258, "geo/layer_27/stable_rank_o_proj": 115.92039489746094, "geo/layer_27/stable_rank_gate_proj": 82.27870178222656, "geo/layer_27/stable_rank_down_proj": 128.91387939453125, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0881827250123024, "geo/layer_27/attn_entropy_mean": 4.254825592041016, "geo/layer_27/attn_entropy_std": 0.6883759498596191, "attnres/final_alpha/block_0": 0.23749610781669617, "attnres/block_norm/0": 1.7474174499511719, "attnres/final_alpha/block_1": 0.0047380621545016766, "attnres/block_norm/1": 44196.453125, "attnres/final_alpha/block_2": 0.010374213568866253, "attnres/block_norm/2": 27760.599609375, "attnres/final_alpha/block_3": 0.012136646546423435, "attnres/block_norm/3": 53616.1953125, "attnres/final_alpha/block_4": 0.014349071308970451, "attnres/block_norm/4": 14211.7763671875, "attnres/final_alpha/block_5": 0.6105342507362366, "attnres/block_norm/5": 6308.81640625, "attnres/final_alpha/block_6": 0.11037163436412811, "attnres/block_norm/6": 35376.0859375, "geo/tier1_time_s": 1.360898733139038, "geo/step": 52350.0, "geo/rankme_slope": -4.268287393082233e-05} {"step": 52360, "timestamp": 1778251135.3764129, "train/loss": 2.1772740840911866, "train/z_loss": 0.0013780913897790015, "train/perplexity": 8.822224808671567, "train/grad_norm": 0.119140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1791219.675376431, "perf/iters_per_sec": 0.8541200997240215, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1707955360412599, "data/tokens_consumed": 109808975872, "data/tokens_consumed_B": 109.808975872, "train/loss_slope": 1.8864149021523504e-06} {"step": 52370, "timestamp": 1778251145.7182453, "train/loss": 2.1630576729774473, "train/z_loss": 0.00137934572994709, "train/perplexity": 8.697691737922423, "train/grad_norm": 0.2216796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028774.1112635673, "perf/iters_per_sec": 0.96739488185099, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0337040424346924, "data/tokens_consumed": 109829947392, "data/tokens_consumed_B": 109.829947392, "train/loss_slope": 2.838016192976754e-06} {"step": 52380, "timestamp": 1778251156.0762227, "train/loss": 2.154080867767334, "train/z_loss": 0.0013917787582613528, "train/perplexity": 8.619963650327374, "train/grad_norm": 0.126953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026037.1634784637, "perf/iters_per_sec": 0.9660898034469908, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351004600524902, "data/tokens_consumed": 109850918912, "data/tokens_consumed_B": 109.850918912, "train/loss_slope": 5.806060428201669e-06} {"step": 52390, "timestamp": 1778251166.4196508, "train/loss": 2.1830050230026243, "train/z_loss": 0.0013974492670968176, "train/perplexity": 8.872929594350165, "train/grad_norm": 0.1396484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028890.538040119, "perf/iters_per_sec": 0.9674503984642596, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033644723892212, "data/tokens_consumed": 109871890432, "data/tokens_consumed_B": 109.871890432, "train/loss_slope": 8.01065366546422e-06} {"step": 52400, "timestamp": 1778251176.7524514, "grad/layer_0/attn": 0.003107694676145911, "grad/layer_0/mlp": 0.0029960027895867825, "grad/layer_0/attn_mlp_ratio": 1.0372802666336993, "grad/layer_4/attn": 0.002076846780255437, "grad/layer_4/mlp": 0.002657307079061866, "grad/layer_4/attn_mlp_ratio": 0.7815606703733234, "grad/layer_8/attn": 0.007581774145364761, "grad/layer_8/mlp": 0.003820948302745819, "grad/layer_8/attn_mlp_ratio": 1.984264990314009, "grad/layer_12/attn": 0.0058521246537566185, "grad/layer_12/mlp": 0.006912723183631897, "grad/layer_12/attn_mlp_ratio": 0.8465729660571562, "grad/layer_16/attn": 0.004886365961283445, "grad/layer_16/mlp": 0.004674261901527643, "grad/layer_16/attn_mlp_ratio": 1.0453769942049638, "grad/layer_20/attn": 0.0039246780797839165, "grad/layer_20/mlp": 0.006186178419739008, "grad/layer_20/attn_mlp_ratio": 0.6344269030162901, "grad/layer_24/attn": 0.01521514542400837, "grad/layer_24/mlp": 0.012521058320999146, "grad/layer_24/attn_mlp_ratio": 1.2151644783072775, "grad/layer_27/attn": 0.005936615169048309, "grad/layer_27/mlp": 0.012367118149995804, "grad/layer_27/attn_mlp_ratio": 0.4800322151888799} {"step": 52400, "timestamp": 1778251176.766972, "train/loss": 2.170561647415161, "train/z_loss": 0.0013926794636063278, "train/perplexity": 8.763204489983018, "train/grad_norm": 0.2099609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027793.7646296246, "perf/iters_per_sec": 0.9669274161480067, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034203791618347, "data/tokens_consumed": 109892861952, "data/tokens_consumed_B": 109.892861952, "train/loss_slope": 9.997693161116051e-06} {"step": 52410, "timestamp": 1778251187.122638, "train/loss": 2.1412919759750366, "train/z_loss": 0.0013889302616007626, "train/perplexity": 8.510425794798696, "train/grad_norm": 0.138671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027073.6425365338, "perf/iters_per_sec": 0.966584035175578, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345711946487426, "data/tokens_consumed": 109913833472, "data/tokens_consumed_B": 109.913833472, "train/loss_slope": 1.3196773798015794e-05} {"step": 52420, "timestamp": 1778251197.4698608, "train/loss": 2.1403329253196715, "train/z_loss": 0.0013924361555837095, "train/perplexity": 8.50226777796375, "train/grad_norm": 0.271484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028140.2222800653, "perf/iters_per_sec": 0.9670926200294806, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340271234512328, "data/tokens_consumed": 109934804992, "data/tokens_consumed_B": 109.934804992, "train/loss_slope": 1.1509495134865814e-05} {"step": 52425, "timestamp": 1778251203.2321858, "eos/sharpness": 73.79806041717528, "eos/L0_probe": 1.9794974327087402, "eos/L_plus": 2.4263975620269775, "eos/L_minus": 2.270577907562256, "eos/grad_norm": 0.25288307666778564, "eos/embed_grad_frac": 0.04003565013408661, "eos/time_s": 0.5934958457946777} {"step": 52425, "timestamp": 1778251204.613567, "geo/rankme_last": 438.42742919921875, "geo/layer_0/stable_rank_q_proj": 19.50815773010254, "geo/layer_0/stable_rank_k_proj": 16.422107696533203, "geo/layer_0/stable_rank_o_proj": 47.77821350097656, "geo/layer_0/stable_rank_gate_proj": 132.53392028808594, "geo/layer_0/stable_rank_down_proj": 54.022308349609375, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06333799660205841, "geo/layer_0/attn_entropy_mean": 6.178901672363281, "geo/layer_0/attn_entropy_std": 0.399685263633728, "geo/layer_7/stable_rank_q_proj": 43.099456787109375, "geo/layer_7/stable_rank_k_proj": 41.924232482910156, "geo/layer_7/stable_rank_o_proj": 93.59105682373047, "geo/layer_7/stable_rank_gate_proj": 85.98939514160156, "geo/layer_7/stable_rank_down_proj": 143.59925842285156, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4934587776660919, "geo/layer_7/attn_entropy_mean": 4.649114608764648, "geo/layer_7/attn_entropy_std": 0.7886701226234436, "geo/layer_14/stable_rank_q_proj": 52.32441329956055, "geo/layer_14/stable_rank_k_proj": 38.80908203125, "geo/layer_14/stable_rank_o_proj": 45.070899963378906, "geo/layer_14/stable_rank_gate_proj": 73.404541015625, "geo/layer_14/stable_rank_down_proj": 131.61155700683594, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3993443548679352, "geo/layer_14/attn_entropy_mean": 5.542372703552246, "geo/layer_14/attn_entropy_std": 0.3879946768283844, "geo/layer_21/stable_rank_q_proj": 40.99566650390625, "geo/layer_21/stable_rank_k_proj": 30.322498321533203, "geo/layer_21/stable_rank_o_proj": 72.81599426269531, "geo/layer_21/stable_rank_gate_proj": 68.8501205444336, "geo/layer_21/stable_rank_down_proj": 53.35075378417969, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14035655558109283, "geo/layer_21/attn_entropy_mean": 5.698050498962402, "geo/layer_21/attn_entropy_std": 0.30789339542388916, "geo/layer_27/stable_rank_q_proj": 42.85736083984375, "geo/layer_27/stable_rank_k_proj": 31.337135314941406, "geo/layer_27/stable_rank_o_proj": 116.00926208496094, "geo/layer_27/stable_rank_gate_proj": 82.19637298583984, "geo/layer_27/stable_rank_down_proj": 128.8152313232422, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08915944397449493, "geo/layer_27/attn_entropy_mean": 4.242528915405273, "geo/layer_27/attn_entropy_std": 0.7038701772689819, "attnres/final_alpha/block_0": 0.23827116191387177, "attnres/block_norm/0": 1.7474665641784668, "attnres/final_alpha/block_1": 0.004758864175528288, "attnres/block_norm/1": 44318.9296875, "attnres/final_alpha/block_2": 0.010312553495168686, "attnres/block_norm/2": 27833.796875, "attnres/final_alpha/block_3": 0.012071722187101841, "attnres/block_norm/3": 53729.5859375, "attnres/final_alpha/block_4": 0.014183509163558483, "attnres/block_norm/4": 14248.5439453125, "attnres/final_alpha/block_5": 0.6109752058982849, "attnres/block_norm/5": 6314.6044921875, "attnres/final_alpha/block_6": 0.10942700505256653, "attnres/block_norm/6": 35368.0625, "geo/tier1_time_s": 1.3610646724700928, "geo/step": 52425.0, "geo/rankme_slope": -4.817129976990796e-05} {"step": 52430, "timestamp": 1778251209.8003607, "train/loss": 2.1240928888320925, "train/z_loss": 0.001387428236193955, "train/perplexity": 8.365305783088822, "train/grad_norm": 0.13671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1701479.4768961512, "perf/iters_per_sec": 0.8113286385041004, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2325461626052856, "data/tokens_consumed": 109955776512, "data/tokens_consumed_B": 109.955776512, "train/loss_slope": 8.590873847879044e-06} {"step": 52440, "timestamp": 1778251220.1448908, "train/loss": 2.207316851615906, "train/z_loss": 0.0013680083444342016, "train/perplexity": 9.091290357323755, "train/grad_norm": 0.265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028331.689741309, "perf/iters_per_sec": 0.9671839188295884, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0339295148849488, "data/tokens_consumed": 109976748032, "data/tokens_consumed_B": 109.976748032, "train/loss_slope": 1.1996931941023567e-05} {"step": 52450, "timestamp": 1778251230.4849348, "grad/layer_0/attn": 0.002681104000657797, "grad/layer_0/mlp": 0.0028497681487351656, "grad/layer_0/attn_mlp_ratio": 0.9408147493564685, "grad/layer_4/attn": 0.00211720890365541, "grad/layer_4/mlp": 0.002512677339836955, "grad/layer_4/attn_mlp_ratio": 0.842610702865543, "grad/layer_8/attn": 0.009089473634958267, "grad/layer_8/mlp": 0.0034575634635984898, "grad/layer_8/attn_mlp_ratio": 2.628866676712175, "grad/layer_12/attn": 0.004537275526672602, "grad/layer_12/mlp": 0.006678893230855465, "grad/layer_12/attn_mlp_ratio": 0.679345409771868, "grad/layer_16/attn": 0.0034307497553527355, "grad/layer_16/mlp": 0.0043885363265872, "grad/layer_16/attn_mlp_ratio": 0.7817525985583993, "grad/layer_20/attn": 0.00437880028039217, "grad/layer_20/mlp": 0.006181837059557438, "grad/layer_20/attn_mlp_ratio": 0.7083331649430978, "grad/layer_24/attn": 0.012835536152124405, "grad/layer_24/mlp": 0.010825373232364655, "grad/layer_24/attn_mlp_ratio": 1.1856899303186117, "grad/layer_27/attn": 0.004063447006046772, "grad/layer_27/mlp": 0.010182375088334084, "grad/layer_27/attn_mlp_ratio": 0.39906671389424453} {"step": 52450, "timestamp": 1778251230.499258, "train/loss": 2.179712414741516, "train/z_loss": 0.0013713115360587836, "train/perplexity": 8.843762557231898, "train/grad_norm": 0.1435546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026755.6619140948, "perf/iters_per_sec": 0.9664324101992106, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034733510017395, "data/tokens_consumed": 109997719552, "data/tokens_consumed_B": 109.997719552, "train/loss_slope": 1.3707904296346248e-05} {"step": 52460, "timestamp": 1778251240.8595526, "train/loss": 2.129693019390106, "train/z_loss": 0.0013905726722441615, "train/perplexity": 8.412284006949909, "train/grad_norm": 0.2138671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026082.5243081264, "perf/iters_per_sec": 0.9661114331761009, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350772857666015, "data/tokens_consumed": 110018691072, "data/tokens_consumed_B": 110.018691072, "train/loss_slope": 1.052352221134437e-05} {"step": 52470, "timestamp": 1778251251.2057602, "train/loss": 2.1792173862457274, "train/z_loss": 0.0013782154070213436, "train/perplexity": 8.839385726173505, "train/grad_norm": 0.2265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028427.1563693716, "perf/iters_per_sec": 0.967229440865217, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033880853652954, "data/tokens_consumed": 110039662592, "data/tokens_consumed_B": 110.039662592, "train/loss_slope": 1.363767769255868e-05} {"step": 52480, "timestamp": 1778251261.549847, "train/loss": 2.147002387046814, "train/z_loss": 0.001380758872255683, "train/perplexity": 8.559162846344258, "train/grad_norm": 0.1005859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028761.102992257, "perf/iters_per_sec": 0.9673886790238653, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0337106704711914, "data/tokens_consumed": 110060634112, "data/tokens_consumed_B": 110.060634112, "train/loss_slope": 1.302544925436755e-05} {"step": 52490, "timestamp": 1778251272.4732273, "train/loss": 2.1342734813690187, "train/z_loss": 0.001391288754530251, "train/perplexity": 8.450904536409386, "train/grad_norm": 0.0966796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1921244.6174612204, "perf/iters_per_sec": 0.9161208236032583, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.091559076309204, "data/tokens_consumed": 110081605632, "data/tokens_consumed_B": 110.081605632, "train/loss_slope": 9.74118577228663e-06} {"step": 52500, "timestamp": 1778251283.3570004, "grad/layer_0/attn": 0.0029512722976505756, "grad/layer_0/mlp": 0.0028027361258864403, "grad/layer_0/attn_mlp_ratio": 1.0529968073314344, "grad/layer_4/attn": 0.0025059168692678213, "grad/layer_4/mlp": 0.002412150613963604, "grad/layer_4/attn_mlp_ratio": 1.0388724281453137, "grad/layer_8/attn": 0.003902117721736431, "grad/layer_8/mlp": 0.003525632666423917, "grad/layer_8/attn_mlp_ratio": 1.1067850738448821, "grad/layer_12/attn": 0.004067709669470787, "grad/layer_12/mlp": 0.006575397215783596, "grad/layer_12/attn_mlp_ratio": 0.6186256851288133, "grad/layer_16/attn": 0.0034580184146761894, "grad/layer_16/mlp": 0.004315465688705444, "grad/layer_16/attn_mlp_ratio": 0.8013082675169412, "grad/layer_20/attn": 0.0035190198104828596, "grad/layer_20/mlp": 0.005819851532578468, "grad/layer_20/attn_mlp_ratio": 0.6046579934759898, "grad/layer_24/attn": 0.013391333632171154, "grad/layer_24/mlp": 0.011191390454769135, "grad/layer_24/attn_mlp_ratio": 1.1965745960375342, "grad/layer_27/attn": 0.011551323346793652, "grad/layer_27/mlp": 0.010122349485754967, "grad/layer_27/attn_mlp_ratio": 1.1411701649832031} {"step": 52500, "timestamp": 1778251283.957502, "eos/sharpness": 80.05206584930419, "eos/L0_probe": 1.982193946838379, "eos/L_plus": 2.4732322692871094, "eos/L_minus": 2.2916762828826904, "eos/grad_norm": 0.21062923967838287, "eos/embed_grad_frac": 0.04539154842495918, "eos/time_s": 0.5975832939147949} {"step": 52500, "timestamp": 1778251283.978369, "train/loss": 2.1241971015930177, "train/z_loss": 0.0013814827427268029, "train/perplexity": 8.366177600126901, "train/grad_norm": 0.2109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1823806.9777022258, "perf/iters_per_sec": 0.8696589363585595, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.149876070022583, "data/tokens_consumed": 110102577152, "data/tokens_consumed_B": 110.102577152, "train/loss_slope": 9.680400866128788e-06} {"step": 52500, "timestamp": 1778251285.3435714, "geo/rankme_last": 438.9325866699219, "geo/layer_0/stable_rank_q_proj": 19.440444946289062, "geo/layer_0/stable_rank_k_proj": 16.406723022460938, "geo/layer_0/stable_rank_o_proj": 47.73670959472656, "geo/layer_0/stable_rank_gate_proj": 132.5216827392578, "geo/layer_0/stable_rank_down_proj": 53.93925857543945, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05967468395829201, "geo/layer_0/attn_entropy_mean": 6.17800235748291, "geo/layer_0/attn_entropy_std": 0.3997642695903778, "geo/layer_7/stable_rank_q_proj": 43.124305725097656, "geo/layer_7/stable_rank_k_proj": 41.911537170410156, "geo/layer_7/stable_rank_o_proj": 93.56015014648438, "geo/layer_7/stable_rank_gate_proj": 86.05937194824219, "geo/layer_7/stable_rank_down_proj": 143.7649688720703, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.47829386591911316, "geo/layer_7/attn_entropy_mean": 4.640872955322266, "geo/layer_7/attn_entropy_std": 0.7968817949295044, "geo/layer_14/stable_rank_q_proj": 52.259769439697266, "geo/layer_14/stable_rank_k_proj": 38.7335090637207, "geo/layer_14/stable_rank_o_proj": 45.17830276489258, "geo/layer_14/stable_rank_gate_proj": 73.32561492919922, "geo/layer_14/stable_rank_down_proj": 131.15194702148438, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39372631907463074, "geo/layer_14/attn_entropy_mean": 5.528019905090332, "geo/layer_14/attn_entropy_std": 0.3888257145881653, "geo/layer_21/stable_rank_q_proj": 40.97980880737305, "geo/layer_21/stable_rank_k_proj": 30.2573184967041, "geo/layer_21/stable_rank_o_proj": 72.67437744140625, "geo/layer_21/stable_rank_gate_proj": 68.84204864501953, "geo/layer_21/stable_rank_down_proj": 53.337623596191406, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15084639191627502, "geo/layer_21/attn_entropy_mean": 5.710206031799316, "geo/layer_21/attn_entropy_std": 0.29462283849716187, "geo/layer_27/stable_rank_q_proj": 42.85835266113281, "geo/layer_27/stable_rank_k_proj": 31.328510284423828, "geo/layer_27/stable_rank_o_proj": 116.1710205078125, "geo/layer_27/stable_rank_gate_proj": 82.16143035888672, "geo/layer_27/stable_rank_down_proj": 128.87767028808594, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08977175503969193, "geo/layer_27/attn_entropy_mean": 4.202887535095215, "geo/layer_27/attn_entropy_std": 0.6857964396476746, "attnres/final_alpha/block_0": 0.23646287620067596, "attnres/block_norm/0": 1.7477126121520996, "attnres/final_alpha/block_1": 0.004703528713434935, "attnres/block_norm/1": 44361.76171875, "attnres/final_alpha/block_2": 0.01041646208614111, "attnres/block_norm/2": 27786.685546875, "attnres/final_alpha/block_3": 0.012228268198668957, "attnres/block_norm/3": 53037.140625, "attnres/final_alpha/block_4": 0.014346524141728878, "attnres/block_norm/4": 14225.888671875, "attnres/final_alpha/block_5": 0.6108720302581787, "attnres/block_norm/5": 6368.693359375, "attnres/final_alpha/block_6": 0.11097031831741333, "attnres/block_norm/6": 35452.7890625, "geo/tier1_time_s": 1.360863208770752, "geo/step": 52500.0, "geo/rankme_slope": -5.862733765381153e-05} {"step": 52500, "timestamp": 1778251292.526002, "geo/ww_alpha_mean": 7.5898308377725865, "geo/ww_alpha_std": 4.326667003757614, "geo/ww_alpha_min": 1.358077210046515, "geo/ww_alpha_max": 27.963180168848368, "geo/ww_alpha_healthy_frac": 0.15736040609137056, "geo/ww_alpha_by_type/q_proj": 4.021540557770319, "geo/ww_alpha_by_type/k_proj": 4.446054351787131, "geo/ww_alpha_by_type/v_proj": 8.223342128002503, "geo/ww_alpha_by_type/o_proj": 7.476653930885143, "geo/ww_alpha_by_type/gate_proj": 8.279078980706116, "geo/ww_alpha_by_type/up_proj": 12.2953775518204, "geo/ww_alpha_by_type/down_proj": 8.485366898968968, "geo/twonn_id/layer_0": 0.711949348449707, "geo/twonn_id/layer_7": 3.441235065460205, "geo/twonn_id/layer_14": 4.632368087768555, "geo/twonn_id/layer_21": 7.704052448272705, "geo/twonn_id/layer_27": 6.4302754402160645, "geo/tier2_time_s": 7.174936056137085} {"step": 52500, "timestamp": 1778251293.2470927, "eoc/jacobian_sigma/layer_0/attn": 943.4522705078125, "eoc/jacobian_sigma/layer_0/mlp": 8609.6171875, "eoc/jacobian_sigma/layer_0": 8609.6171875, "eoc/jacobian_sigma/layer_7/attn": 1.1501988172531128, "eoc/jacobian_sigma/layer_7/mlp": 1.7139805555343628, "eoc/jacobian_sigma/layer_7": 1.7139805555343628, "eoc/jacobian_sigma/layer_14/attn": 1.593775749206543, "eoc/jacobian_sigma/layer_14/mlp": 6.362198352813721, "eoc/jacobian_sigma/layer_14": 6.362198352813721, "eoc/jacobian_sigma/layer_21/attn": 1.0883078575134277, "eoc/jacobian_sigma/layer_21/mlp": 4.375021457672119, "eoc/jacobian_sigma/layer_21": 4.375021457672119, "eoc/jacobian_sigma/layer_27/attn": 3.5128719806671143, "eoc/jacobian_sigma/layer_27/mlp": 31.490440368652344, "eoc/jacobian_sigma/layer_27": 31.490440368652344, "eoc/layer0_sigma": 8609.6171875, "eoc/sigma_max": 31.490440368652344, "eoc/sigma_min": 1.7139805555343628, "eoc/sigma_mean": 10.985410183668137, "eoc/time_s": 0.7113542556762695} {"step": 52510, "timestamp": 1778251303.6101954, "train/loss": 2.1030945658683775, "train/z_loss": 0.0013777778833173216, "train/perplexity": 8.191479804620489, "train/grad_norm": 0.154296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1068471.0090364167, "perf/iters_per_sec": 0.5094866795713504, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.9627598524093628, "data/tokens_consumed": 110123548672, "data/tokens_consumed_B": 110.123548672, "train/loss_slope": 4.414351044422207e-06} {"step": 52520, "timestamp": 1778251313.9513628, "train/loss": 2.219189190864563, "train/z_loss": 0.0013663712656125427, "train/perplexity": 9.199868503587782, "train/grad_norm": 0.216796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029004.310390918, "perf/iters_per_sec": 0.9675046493486967, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0335867643356322, "data/tokens_consumed": 110144520192, "data/tokens_consumed_B": 110.144520192, "train/loss_slope": 7.752964978027772e-06} {"step": 52530, "timestamp": 1778251324.2990627, "train/loss": 2.136313009262085, "train/z_loss": 0.0013892555725760758, "train/perplexity": 8.468157980392434, "train/grad_norm": 0.275390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028256.669848524, "perf/iters_per_sec": 0.9671481465571041, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0339677572250365, "data/tokens_consumed": 110165491712, "data/tokens_consumed_B": 110.165491712, "train/loss_slope": 2.485172640551214e-06} {"step": 52540, "timestamp": 1778251334.6455176, "train/loss": 2.173011803627014, "train/z_loss": 0.0013913949020206927, "train/perplexity": 8.784702035317386, "train/grad_norm": 0.1787109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028408.6797650456, "perf/iters_per_sec": 0.967220630533717, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0338902711868285, "data/tokens_consumed": 110186463232, "data/tokens_consumed_B": 110.186463232, "train/loss_slope": 4.365981844785546e-06} {"step": 52550, "timestamp": 1778251345.5065656, "grad/layer_0/attn": 0.0029746422078460455, "grad/layer_0/mlp": 0.002947459230199456, "grad/layer_0/attn_mlp_ratio": 1.009222477598953, "grad/layer_4/attn": 0.0020036385394632816, "grad/layer_4/mlp": 0.002471629763022065, "grad/layer_4/attn_mlp_ratio": 0.8106547705381056, "grad/layer_8/attn": 0.004364532418549061, "grad/layer_8/mlp": 0.003452260047197342, "grad/layer_8/attn_mlp_ratio": 1.2642536287690636, "grad/layer_12/attn": 0.005560849793255329, "grad/layer_12/mlp": 0.006588120944797993, "grad/layer_12/attn_mlp_ratio": 0.8440721953107099, "grad/layer_16/attn": 0.0037236521020531654, "grad/layer_16/mlp": 0.0045932745561003685, "grad/layer_16/attn_mlp_ratio": 0.8106748193487081, "grad/layer_20/attn": 0.0030785510316491127, "grad/layer_20/mlp": 0.006172534544020891, "grad/layer_20/attn_mlp_ratio": 0.49874989922190116, "grad/layer_24/attn": 0.005699727218598127, "grad/layer_24/mlp": 0.007417070213705301, "grad/layer_24/attn_mlp_ratio": 0.7684607233756625, "grad/layer_27/attn": 0.007388636935502291, "grad/layer_27/mlp": 0.006639906205236912, "grad/layer_27/attn_mlp_ratio": 1.112762228237295} {"step": 52550, "timestamp": 1778251345.521092, "train/loss": 2.135354161262512, "train/z_loss": 0.0013887491077184677, "train/perplexity": 8.460042195576685, "train/grad_norm": 0.09375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1929240.6963555305, "perf/iters_per_sec": 0.9199336511399892, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0870349168777467, "data/tokens_consumed": 110207434752, "data/tokens_consumed_B": 110.207434752, "train/loss_slope": -4.6944105335448716e-07} {"step": 52560, "timestamp": 1778251356.307809, "train/loss": 2.1564791440963744, "train/z_loss": 0.0013649423024617135, "train/perplexity": 8.64066151478563, "train/grad_norm": 0.09619140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1946022.4785803177, "perf/iters_per_sec": 0.9279358284856404, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0776607275009156, "data/tokens_consumed": 110228406272, "data/tokens_consumed_B": 110.228406272, "train/loss_slope": 7.37023217664012e-08} {"step": 52570, "timestamp": 1778251366.6575809, "train/loss": 2.2035505294799806, "train/z_loss": 0.0013667447143234312, "train/perplexity": 9.057114029137308, "train/grad_norm": 0.1484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027835.6044440023, "perf/iters_per_sec": 0.9669473669261943, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341824531555175, "data/tokens_consumed": 110249377792, "data/tokens_consumed_B": 110.249377792, "train/loss_slope": 2.687245483982109e-06} {"step": 52575, "timestamp": 1778251372.435753, "eos/sharpness": 52.105903625488274, "eos/L0_probe": 1.9817715883255005, "eos/L_plus": 2.26888108253479, "eos/L_minus": 2.2157211303710938, "eos/grad_norm": 0.14749212563037872, "eos/embed_grad_frac": 0.10876656323671341, "eos/time_s": 0.613842248916626} {"step": 52575, "timestamp": 1778251373.8160126, "geo/rankme_last": 439.42156982421875, "geo/layer_0/stable_rank_q_proj": 19.481849670410156, "geo/layer_0/stable_rank_k_proj": 16.3818416595459, "geo/layer_0/stable_rank_o_proj": 47.7552604675293, "geo/layer_0/stable_rank_gate_proj": 132.40745544433594, "geo/layer_0/stable_rank_down_proj": 53.90658187866211, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06566832214593887, "geo/layer_0/attn_entropy_mean": 6.1820526123046875, "geo/layer_0/attn_entropy_std": 0.39779162406921387, "geo/layer_7/stable_rank_q_proj": 43.14253234863281, "geo/layer_7/stable_rank_k_proj": 41.91702651977539, "geo/layer_7/stable_rank_o_proj": 93.41282653808594, "geo/layer_7/stable_rank_gate_proj": 85.88848876953125, "geo/layer_7/stable_rank_down_proj": 143.6285858154297, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.46542608737945557, "geo/layer_7/attn_entropy_mean": 4.637978553771973, "geo/layer_7/attn_entropy_std": 0.7867340445518494, "geo/layer_14/stable_rank_q_proj": 52.22921371459961, "geo/layer_14/stable_rank_k_proj": 38.83024597167969, "geo/layer_14/stable_rank_o_proj": 45.19019317626953, "geo/layer_14/stable_rank_gate_proj": 73.31094360351562, "geo/layer_14/stable_rank_down_proj": 131.09426879882812, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3914894163608551, "geo/layer_14/attn_entropy_mean": 5.521309852600098, "geo/layer_14/attn_entropy_std": 0.37960901856422424, "geo/layer_21/stable_rank_q_proj": 40.991146087646484, "geo/layer_21/stable_rank_k_proj": 30.332843780517578, "geo/layer_21/stable_rank_o_proj": 72.63683319091797, "geo/layer_21/stable_rank_gate_proj": 68.78780364990234, "geo/layer_21/stable_rank_down_proj": 53.341209411621094, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14133603870868683, "geo/layer_21/attn_entropy_mean": 5.71742057800293, "geo/layer_21/attn_entropy_std": 0.29805803298950195, "geo/layer_27/stable_rank_q_proj": 42.89537048339844, "geo/layer_27/stable_rank_k_proj": 31.38982582092285, "geo/layer_27/stable_rank_o_proj": 116.11631774902344, "geo/layer_27/stable_rank_gate_proj": 82.19331359863281, "geo/layer_27/stable_rank_down_proj": 129.09983825683594, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08227323740720749, "geo/layer_27/attn_entropy_mean": 4.232011318206787, "geo/layer_27/attn_entropy_std": 0.6754570603370667, "attnres/final_alpha/block_0": 0.2377856969833374, "attnres/block_norm/0": 1.7478065490722656, "attnres/final_alpha/block_1": 0.004702843725681305, "attnres/block_norm/1": 44206.27734375, "attnres/final_alpha/block_2": 0.010336853563785553, "attnres/block_norm/2": 27966.64453125, "attnres/final_alpha/block_3": 0.012197619304060936, "attnres/block_norm/3": 53449.03515625, "attnres/final_alpha/block_4": 0.014343520626425743, "attnres/block_norm/4": 14254.982421875, "attnres/final_alpha/block_5": 0.6106034517288208, "attnres/block_norm/5": 6356.5751953125, "attnres/final_alpha/block_6": 0.11003004759550095, "attnres/block_norm/6": 35668.80078125, "geo/tier1_time_s": 1.3608739376068115, "geo/step": 52575.0, "geo/rankme_slope": -2.5945280455932376e-05} {"step": 52580, "timestamp": 1778251378.9912744, "train/loss": 2.2184414148330687, "train/z_loss": 0.0013712774962186813, "train/perplexity": 9.192991633927496, "train/grad_norm": 0.109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1701440.1800304295, "perf/iters_per_sec": 0.8113099002983234, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2325746297836304, "data/tokens_consumed": 110270349312, "data/tokens_consumed_B": 110.270349312, "train/loss_slope": 3.465158138910317e-06} {"step": 52590, "timestamp": 1778251389.3424997, "train/loss": 2.1235315561294557, "train/z_loss": 0.001396534580271691, "train/perplexity": 8.360611381069168, "train/grad_norm": 0.1630859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027520.6561758257, "perf/iters_per_sec": 0.966797187889016, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343430995941163, "data/tokens_consumed": 110291320832, "data/tokens_consumed_B": 110.291320832, "train/loss_slope": 1.3852804514726946e-06} {"step": 52600, "timestamp": 1778251399.6767378, "grad/layer_0/attn": 0.0025927932001650333, "grad/layer_0/mlp": 0.002781872171908617, "grad/layer_0/attn_mlp_ratio": 0.932031720631854, "grad/layer_4/attn": 0.002902116160839796, "grad/layer_4/mlp": 0.0024655601009726524, "grad/layer_4/attn_mlp_ratio": 1.1770615698999862, "grad/layer_8/attn": 0.004569394048303366, "grad/layer_8/mlp": 0.0034454904962331057, "grad/layer_8/attn_mlp_ratio": 1.3261954780253946, "grad/layer_12/attn": 0.0035939551889896393, "grad/layer_12/mlp": 0.0063218530267477036, "grad/layer_12/attn_mlp_ratio": 0.5684971031332011, "grad/layer_16/attn": 0.0032549158204346895, "grad/layer_16/mlp": 0.004247273784130812, "grad/layer_16/attn_mlp_ratio": 0.7663541154235679, "grad/layer_20/attn": 0.004962415900081396, "grad/layer_20/mlp": 0.005625877063721418, "grad/layer_20/attn_mlp_ratio": 0.8820697209817582, "grad/layer_24/attn": 0.014832921326160431, "grad/layer_24/mlp": 0.009835918433964252, "grad/layer_24/attn_mlp_ratio": 1.508036211863804, "grad/layer_27/attn": 0.005121687892824411, "grad/layer_27/mlp": 0.010066087357699871, "grad/layer_27/attn_mlp_ratio": 0.5088062183392486} {"step": 52600, "timestamp": 1778251399.6910594, "train/loss": 2.1489356994628905, "train/z_loss": 0.001386441511567682, "train/perplexity": 8.575726388237792, "train/grad_norm": 0.18359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027482.9885944915, "perf/iters_per_sec": 0.9667792265865762, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343623161315918, "data/tokens_consumed": 110312292352, "data/tokens_consumed_B": 110.312292352, "train/loss_slope": 2.669014810550071e-06} {"step": 52610, "timestamp": 1778251410.0324612, "train/loss": 2.222728800773621, "train/z_loss": 0.0013619939447380603, "train/perplexity": 9.232490149195863, "train/grad_norm": 0.173828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029247.0143341853, "perf/iters_per_sec": 0.967620379607289, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033463144302368, "data/tokens_consumed": 110333263872, "data/tokens_consumed_B": 110.333263872, "train/loss_slope": 6.572647170551231e-06} {"step": 52620, "timestamp": 1778251420.3787603, "train/loss": 2.16592515707016, "train/z_loss": 0.001376660296227783, "train/perplexity": 8.722668023060516, "train/grad_norm": 0.11083984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028489.6986647693, "perf/iters_per_sec": 0.967259263355622, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0338489770889283, "data/tokens_consumed": 110354235392, "data/tokens_consumed_B": 110.354235392, "train/loss_slope": 7.424836583656833e-06} {"step": 52630, "timestamp": 1778251430.727573, "train/loss": 2.155848228931427, "train/z_loss": 0.0013765383046120405, "train/perplexity": 8.635211709763867, "train/grad_norm": 0.2197265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027523.507005082, "perf/iters_per_sec": 0.9667985472703371, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343416452407836, "data/tokens_consumed": 110375206912, "data/tokens_consumed_B": 110.375206912, "train/loss_slope": 4.5755025756062174e-06} {"step": 52640, "timestamp": 1778251441.0738375, "train/loss": 2.147094559669495, "train/z_loss": 0.0013864230597391725, "train/perplexity": 8.55995180319131, "train/grad_norm": 0.15625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028437.5408365172, "perf/iters_per_sec": 0.9672343925650202, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0338755607604981, "data/tokens_consumed": 110396178432, "data/tokens_consumed_B": 110.396178432, "train/loss_slope": 2.3400304913342034e-06} {"step": 52650, "timestamp": 1778251451.4182372, "grad/layer_0/attn": 0.002785193268209696, "grad/layer_0/mlp": 0.0028733632061630487, "grad/layer_0/attn_mlp_ratio": 0.9693146920320734, "grad/layer_4/attn": 0.002179845003411174, "grad/layer_4/mlp": 0.002487063640728593, "grad/layer_4/attn_mlp_ratio": 0.8764733157874681, "grad/layer_8/attn": 0.004176042042672634, "grad/layer_8/mlp": 0.0035133555065840483, "grad/layer_8/attn_mlp_ratio": 1.188619231952128, "grad/layer_12/attn": 0.005053275264799595, "grad/layer_12/mlp": 0.006488153710961342, "grad/layer_12/attn_mlp_ratio": 0.7788464040822204, "grad/layer_16/attn": 0.0041834507137537, "grad/layer_16/mlp": 0.004588644951581955, "grad/layer_16/attn_mlp_ratio": 0.9116962996105873, "grad/layer_20/attn": 0.0034820050932466984, "grad/layer_20/mlp": 0.006390641909092665, "grad/layer_20/attn_mlp_ratio": 0.544859982501362, "grad/layer_24/attn": 0.011417759582400322, "grad/layer_24/mlp": 0.012069230899214745, "grad/layer_24/attn_mlp_ratio": 0.9460221271051313, "grad/layer_27/attn": 0.007270395290106535, "grad/layer_27/mlp": 0.011543625965714455, "grad/layer_27/attn_mlp_ratio": 0.6298190229584986} {"step": 52650, "timestamp": 1778251452.0130126, "eos/sharpness": 51.17323398590087, "eos/L0_probe": 1.981183648109436, "eos/L_plus": 2.24051570892334, "eos/L_minus": 2.233583927154541, "eos/grad_norm": 0.1589195728302002, "eos/embed_grad_frac": 0.10087278485298157, "eos/time_s": 0.5919601917266846} {"step": 52650, "timestamp": 1778251452.0309546, "train/loss": 2.1663124322891236, "train/z_loss": 0.0013851686730049551, "train/perplexity": 8.726046750435556, "train/grad_norm": 0.1591796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1915139.2864174168, "perf/iters_per_sec": 0.9132095748984417, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.095038890838623, "data/tokens_consumed": 110417149952, "data/tokens_consumed_B": 110.417149952, "train/loss_slope": 1.94339217847796e-06} {"step": 52650, "timestamp": 1778251453.3968182, "geo/rankme_last": 438.9313049316406, "geo/layer_0/stable_rank_q_proj": 19.465932846069336, "geo/layer_0/stable_rank_k_proj": 16.364574432373047, "geo/layer_0/stable_rank_o_proj": 47.76179885864258, "geo/layer_0/stable_rank_gate_proj": 132.45858764648438, "geo/layer_0/stable_rank_down_proj": 53.90765380859375, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06180790811777115, "geo/layer_0/attn_entropy_mean": 6.181894779205322, "geo/layer_0/attn_entropy_std": 0.3997950255870819, "geo/layer_7/stable_rank_q_proj": 43.119842529296875, "geo/layer_7/stable_rank_k_proj": 41.95213317871094, "geo/layer_7/stable_rank_o_proj": 93.4051742553711, "geo/layer_7/stable_rank_gate_proj": 85.79934692382812, "geo/layer_7/stable_rank_down_proj": 143.23728942871094, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.47231924533843994, "geo/layer_7/attn_entropy_mean": 4.6466064453125, "geo/layer_7/attn_entropy_std": 0.7830352783203125, "geo/layer_14/stable_rank_q_proj": 52.15790939331055, "geo/layer_14/stable_rank_k_proj": 38.88418960571289, "geo/layer_14/stable_rank_o_proj": 45.201351165771484, "geo/layer_14/stable_rank_gate_proj": 73.23384094238281, "geo/layer_14/stable_rank_down_proj": 131.14932250976562, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3898715674877167, "geo/layer_14/attn_entropy_mean": 5.498325347900391, "geo/layer_14/attn_entropy_std": 0.3760918974876404, "geo/layer_21/stable_rank_q_proj": 40.93776321411133, "geo/layer_21/stable_rank_k_proj": 30.304302215576172, "geo/layer_21/stable_rank_o_proj": 72.65106964111328, "geo/layer_21/stable_rank_gate_proj": 68.67869567871094, "geo/layer_21/stable_rank_down_proj": 53.403594970703125, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1438903957605362, "geo/layer_21/attn_entropy_mean": 5.683167934417725, "geo/layer_21/attn_entropy_std": 0.30768054723739624, "geo/layer_27/stable_rank_q_proj": 42.90202331542969, "geo/layer_27/stable_rank_k_proj": 31.401947021484375, "geo/layer_27/stable_rank_o_proj": 116.0560073852539, "geo/layer_27/stable_rank_gate_proj": 82.17924499511719, "geo/layer_27/stable_rank_down_proj": 129.10997009277344, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09442903101444244, "geo/layer_27/attn_entropy_mean": 4.207622528076172, "geo/layer_27/attn_entropy_std": 0.7108971476554871, "attnres/final_alpha/block_0": 0.2391071766614914, "attnres/block_norm/0": 1.747878909111023, "attnres/final_alpha/block_1": 0.004777279682457447, "attnres/block_norm/1": 44397.5703125, "attnres/final_alpha/block_2": 0.010561846196651459, "attnres/block_norm/2": 27882.67578125, "attnres/final_alpha/block_3": 0.012373698875308037, "attnres/block_norm/3": 53527.01171875, "attnres/final_alpha/block_4": 0.01445951871573925, "attnres/block_norm/4": 14288.021484375, "attnres/final_alpha/block_5": 0.604720950126648, "attnres/block_norm/5": 6402.85107421875, "attnres/final_alpha/block_6": 0.11399957537651062, "attnres/block_norm/6": 35231.359375, "geo/tier1_time_s": 1.3622708320617676, "geo/step": 52650.0, "geo/rankme_slope": -2.1139256483843538e-05} {"step": 52660, "timestamp": 1778251463.748805, "train/loss": 2.1428102016448975, "train/z_loss": 0.0013851523981429636, "train/perplexity": 8.523356354972242, "train/grad_norm": 0.1279296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1790224.0315105405, "perf/iters_per_sec": 0.8536453397324278, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.171446681022644, "data/tokens_consumed": 110438121472, "data/tokens_consumed_B": 110.438121472, "train/loss_slope": 1.2727047708205614e-07} {"step": 52670, "timestamp": 1778251474.0952399, "train/loss": 2.158521842956543, "train/z_loss": 0.0013831786345690489, "train/perplexity": 8.658329823586518, "train/grad_norm": 0.0849609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028384.9647180971, "perf/iters_per_sec": 0.9672093223181234, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033902359008789, "data/tokens_consumed": 110459092992, "data/tokens_consumed_B": 110.459092992, "train/loss_slope": 1.6832579802437097e-06} {"step": 52680, "timestamp": 1778251484.9943757, "train/loss": 2.2020982980728148, "train/z_loss": 0.0013687567901797594, "train/perplexity": 9.043970549682685, "train/grad_norm": 0.119140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1925461.1907345494, "perf/iters_per_sec": 0.9181314424202678, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.089168667793274, "data/tokens_consumed": 110480064512, "data/tokens_consumed_B": 110.480064512, "train/loss_slope": 2.913073315979544e-06} {"step": 52690, "timestamp": 1778251495.3374138, "train/loss": 2.133397901058197, "train/z_loss": 0.0013858731603249908, "train/perplexity": 8.443508329247873, "train/grad_norm": 0.0869140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028836.4409787597, "perf/iters_per_sec": 0.9674246029752539, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033672285079956, "data/tokens_consumed": 110501036032, "data/tokens_consumed_B": 110.501036032, "train/loss_slope": 2.559982403384104e-06} {"step": 52700, "timestamp": 1778251505.6687093, "grad/layer_0/attn": 0.004003124311566353, "grad/layer_0/mlp": 0.003317579161375761, "grad/layer_0/attn_mlp_ratio": 1.2066401421578408, "grad/layer_4/attn": 0.002526157535612583, "grad/layer_4/mlp": 0.002549164230003953, "grad/layer_4/attn_mlp_ratio": 0.9909747699979254, "grad/layer_8/attn": 0.005349143408238888, "grad/layer_8/mlp": 0.0036696013994514942, "grad/layer_8/attn_mlp_ratio": 1.4576905446104813, "grad/layer_12/attn": 0.004721534438431263, "grad/layer_12/mlp": 0.007188676856458187, "grad/layer_12/attn_mlp_ratio": 0.6568015876954264, "grad/layer_16/attn": 0.0037706736475229263, "grad/layer_16/mlp": 0.005178827326744795, "grad/layer_16/attn_mlp_ratio": 0.7280940909616335, "grad/layer_20/attn": 0.003874169895425439, "grad/layer_20/mlp": 0.006331114564090967, "grad/layer_20/attn_mlp_ratio": 0.6119254034994956, "grad/layer_24/attn": 0.01196029968559742, "grad/layer_24/mlp": 0.011051907204091549, "grad/layer_24/attn_mlp_ratio": 1.0821932682306856, "grad/layer_27/attn": 0.01084150467067957, "grad/layer_27/mlp": 0.010014721192419529, "grad/layer_27/attn_mlp_ratio": 1.0825568035413886} {"step": 52700, "timestamp": 1778251505.6829484, "train/loss": 2.1410149574279784, "train/z_loss": 0.0013701678602956236, "train/perplexity": 8.508068575521976, "train/grad_norm": 0.177734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028555.2388955995, "perf/iters_per_sec": 0.967290515373039, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033815574645996, "data/tokens_consumed": 110522007552, "data/tokens_consumed_B": 110.522007552, "train/loss_slope": 1.384241402846656e-06} {"step": 52710, "timestamp": 1778251516.0503247, "train/loss": 2.1844075202941893, "train/z_loss": 0.0013782482827082276, "train/perplexity": 8.885382584675838, "train/grad_norm": 0.09814453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024287.8889438987, "perf/iters_per_sec": 0.9652556843490118, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0359949350357056, "data/tokens_consumed": 110542979072, "data/tokens_consumed_B": 110.542979072, "train/loss_slope": 5.194194002835334e-06} {"step": 52720, "timestamp": 1778251526.424969, "train/loss": 2.1720643758773805, "train/z_loss": 0.0013804131536744534, "train/perplexity": 8.776383106251256, "train/grad_norm": 0.09130859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022393.0580064524, "perf/iters_per_sec": 0.9643521585495245, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036965584754944, "data/tokens_consumed": 110563950592, "data/tokens_consumed_B": 110.563950592, "train/loss_slope": 6.2751400767594374e-06} {"step": 52725, "timestamp": 1778251532.1847165, "eos/sharpness": 70.53198814392088, "eos/L0_probe": 1.9804933071136475, "eos/L_plus": 2.402524948120117, "eos/L_minus": 2.2637815475463867, "eos/grad_norm": 0.1849222630262375, "eos/embed_grad_frac": 0.07015533000230789, "eos/time_s": 0.5842840671539307} {"step": 52725, "timestamp": 1778251533.5618818, "geo/rankme_last": 438.9748840332031, "geo/layer_0/stable_rank_q_proj": 19.454036712646484, "geo/layer_0/stable_rank_k_proj": 16.342082977294922, "geo/layer_0/stable_rank_o_proj": 47.65647506713867, "geo/layer_0/stable_rank_gate_proj": 132.44427490234375, "geo/layer_0/stable_rank_down_proj": 53.9339714050293, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06424061208963394, "geo/layer_0/attn_entropy_mean": 6.183670520782471, "geo/layer_0/attn_entropy_std": 0.40270453691482544, "geo/layer_7/stable_rank_q_proj": 43.09135055541992, "geo/layer_7/stable_rank_k_proj": 42.02250671386719, "geo/layer_7/stable_rank_o_proj": 93.3405532836914, "geo/layer_7/stable_rank_gate_proj": 85.59761810302734, "geo/layer_7/stable_rank_down_proj": 143.13304138183594, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4780512750148773, "geo/layer_7/attn_entropy_mean": 4.594399452209473, "geo/layer_7/attn_entropy_std": 0.7770854830741882, "geo/layer_14/stable_rank_q_proj": 52.10901641845703, "geo/layer_14/stable_rank_k_proj": 38.9686279296875, "geo/layer_14/stable_rank_o_proj": 45.12724304199219, "geo/layer_14/stable_rank_gate_proj": 73.22330474853516, "geo/layer_14/stable_rank_down_proj": 131.46514892578125, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38955309987068176, "geo/layer_14/attn_entropy_mean": 5.534977912902832, "geo/layer_14/attn_entropy_std": 0.37872207164764404, "geo/layer_21/stable_rank_q_proj": 40.970211029052734, "geo/layer_21/stable_rank_k_proj": 30.34903335571289, "geo/layer_21/stable_rank_o_proj": 72.63826751708984, "geo/layer_21/stable_rank_gate_proj": 68.74810028076172, "geo/layer_21/stable_rank_down_proj": 53.33208465576172, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.143220454454422, "geo/layer_21/attn_entropy_mean": 5.690749645233154, "geo/layer_21/attn_entropy_std": 0.29471948742866516, "geo/layer_27/stable_rank_q_proj": 42.8104362487793, "geo/layer_27/stable_rank_k_proj": 31.449241638183594, "geo/layer_27/stable_rank_o_proj": 116.15789794921875, "geo/layer_27/stable_rank_gate_proj": 82.1279296875, "geo/layer_27/stable_rank_down_proj": 128.8120574951172, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09559033811092377, "geo/layer_27/attn_entropy_mean": 4.236011028289795, "geo/layer_27/attn_entropy_std": 0.697341799736023, "attnres/final_alpha/block_0": 0.23879289627075195, "attnres/block_norm/0": 1.748098611831665, "attnres/final_alpha/block_1": 0.004698295146226883, "attnres/block_norm/1": 44374.87109375, "attnres/final_alpha/block_2": 0.010502681136131287, "attnres/block_norm/2": 27973.787109375, "attnres/final_alpha/block_3": 0.012593342922627926, "attnres/block_norm/3": 53577.73046875, "attnres/final_alpha/block_4": 0.01441209390759468, "attnres/block_norm/4": 14253.193359375, "attnres/final_alpha/block_5": 0.6087154150009155, "attnres/block_norm/5": 6304.69677734375, "attnres/final_alpha/block_6": 0.11028525978326797, "attnres/block_norm/6": 35373.70703125, "geo/tier1_time_s": 1.3572545051574707, "geo/step": 52725.0, "geo/rankme_slope": -1.770010738670468e-05} {"step": 52730, "timestamp": 1778251538.7496672, "train/loss": 2.1700180411338805, "train/z_loss": 0.0013779833680018783, "train/perplexity": 8.75844205154112, "train/grad_norm": 0.185546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1702522.7908327123, "perf/iters_per_sec": 0.8118261293567239, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2317908525466919, "data/tokens_consumed": 110584922112, "data/tokens_consumed_B": 110.584922112, "train/loss_slope": 5.35656355037511e-06} {"step": 52740, "timestamp": 1778251549.1221445, "train/loss": 2.1303509950637816, "train/z_loss": 0.0013818563078530134, "train/perplexity": 8.417820906559399, "train/grad_norm": 0.13671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022826.5657993043, "perf/iters_per_sec": 0.9645588711735269, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0367433547973632, "data/tokens_consumed": 110605893632, "data/tokens_consumed_B": 110.605893632, "train/loss_slope": 3.2959739331400553e-06} {"step": 52750, "timestamp": 1778251559.4933422, "grad/layer_0/attn": 0.0029171560890972614, "grad/layer_0/mlp": 0.003100759582594037, "grad/layer_0/attn_mlp_ratio": 0.9407875448950702, "grad/layer_4/attn": 0.0023644701577723026, "grad/layer_4/mlp": 0.0025353231467306614, "grad/layer_4/attn_mlp_ratio": 0.9326109247889095, "grad/layer_8/attn": 0.003598898882046342, "grad/layer_8/mlp": 0.0036494419910013676, "grad/layer_8/attn_mlp_ratio": 0.9861504285601208, "grad/layer_12/attn": 0.004795067943632603, "grad/layer_12/mlp": 0.006302348803728819, "grad/layer_12/attn_mlp_ratio": 0.7608382234749927, "grad/layer_16/attn": 0.0042489259503781796, "grad/layer_16/mlp": 0.0052360412664711475, "grad/layer_16/attn_mlp_ratio": 0.8114767728127713, "grad/layer_20/attn": 0.004316573031246662, "grad/layer_20/mlp": 0.007360547315329313, "grad/layer_20/attn_mlp_ratio": 0.5864472827465017, "grad/layer_24/attn": 0.021343659609556198, "grad/layer_24/mlp": 0.01548262033611536, "grad/layer_24/attn_mlp_ratio": 1.37855602012752, "grad/layer_27/attn": 0.0055832755751907825, "grad/layer_27/mlp": 0.014891313388943672, "grad/layer_27/attn_mlp_ratio": 0.37493506394423753} {"step": 52750, "timestamp": 1778251559.5077794, "train/loss": 2.1860023021697996, "train/z_loss": 0.0013722182251513005, "train/perplexity": 8.899564137015366, "train/grad_norm": 0.2578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020513.9774416916, "perf/iters_per_sec": 0.9634561431129892, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0379299640655517, "data/tokens_consumed": 110626865152, "data/tokens_consumed_B": 110.626865152, "train/loss_slope": 6.851517070423449e-06} {"step": 52760, "timestamp": 1778251569.8855555, "train/loss": 2.2060001850128175, "train/z_loss": 0.0013782098540104926, "train/perplexity": 9.079328035854044, "train/grad_norm": 0.08642578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022479.6888964325, "perf/iters_per_sec": 0.9643934673769152, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0369211673736571, "data/tokens_consumed": 110647836672, "data/tokens_consumed_B": 110.647836672, "train/loss_slope": 1.1115380146107861e-05} {"step": 52770, "timestamp": 1778251580.2705557, "train/loss": 2.1310933113098143, "train/z_loss": 0.001379228034056723, "train/perplexity": 8.424071911598785, "train/grad_norm": 0.1142578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020707.0714231532, "perf/iters_per_sec": 0.9635482174983755, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0378307819366455, "data/tokens_consumed": 110668808192, "data/tokens_consumed_B": 110.668808192, "train/loss_slope": 8.265342420548785e-06} {"step": 52780, "timestamp": 1778251590.649697, "train/loss": 2.1742602586746216, "train/z_loss": 0.0013741505332291125, "train/perplexity": 8.795676189859025, "train/grad_norm": 0.0986328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021401.4005727612, "perf/iters_per_sec": 0.9638792994369322, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0374742984771728, "data/tokens_consumed": 110689779712, "data/tokens_consumed_B": 110.689779712, "train/loss_slope": 8.485576193002869e-06} {"step": 52790, "timestamp": 1778251601.0269113, "train/loss": 2.171387219429016, "train/z_loss": 0.00138447901699692, "train/perplexity": 8.770442133548542, "train/grad_norm": 0.1279296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022643.1592321864, "perf/iters_per_sec": 0.9644714161072666, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036837363243103, "data/tokens_consumed": 110710751232, "data/tokens_consumed_B": 110.710751232, "train/loss_slope": 7.217784604617034e-06} {"step": 52800, "timestamp": 1778251611.3910816, "grad/layer_0/attn": 0.0027790237218141556, "grad/layer_0/mlp": 0.003000045893713832, "grad/layer_0/attn_mlp_ratio": 0.9263270388644718, "grad/layer_4/attn": 0.002199603011831641, "grad/layer_4/mlp": 0.0025624281261116266, "grad/layer_4/attn_mlp_ratio": 0.8584057065159023, "grad/layer_8/attn": 0.0059014963917434216, "grad/layer_8/mlp": 0.003908875398337841, "grad/layer_8/attn_mlp_ratio": 1.5097683193677816, "grad/layer_12/attn": 0.005031462758779526, "grad/layer_12/mlp": 0.006281209643930197, "grad/layer_12/attn_mlp_ratio": 0.801034030688379, "grad/layer_16/attn": 0.003935786429792643, "grad/layer_16/mlp": 0.004288726020604372, "grad/layer_16/attn_mlp_ratio": 0.9177052390647896, "grad/layer_20/attn": 0.003113876096904278, "grad/layer_20/mlp": 0.00581868551671505, "grad/layer_20/attn_mlp_ratio": 0.5351511152207984, "grad/layer_24/attn": 0.006236480548977852, "grad/layer_24/mlp": 0.008059052750468254, "grad/layer_24/attn_mlp_ratio": 0.7738478286087297, "grad/layer_27/attn": 0.006205277517437935, "grad/layer_27/mlp": 0.006913525052368641, "grad/layer_27/attn_mlp_ratio": 0.8975562221411673} {"step": 52800, "timestamp": 1778251611.9813254, "eos/sharpness": 17.820453643798825, "eos/L0_probe": 1.98147714138031, "eos/L_plus": 2.0606908798217773, "eos/L_minus": 2.080467939376831, "eos/grad_norm": 0.11137163639068604, "eos/embed_grad_frac": 0.17910824716091156, "eos/time_s": 0.5873494148254395} {"step": 52800, "timestamp": 1778251611.999282, "train/loss": 2.1461408376693725, "train/z_loss": 0.0013721438706852495, "train/perplexity": 8.551791880603979, "train/grad_norm": 0.111328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1912168.279421961, "perf/iters_per_sec": 0.9117928883657269, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0967402935028077, "data/tokens_consumed": 110731722752, "data/tokens_consumed_B": 110.731722752, "train/loss_slope": 5.9804202628285545e-06} {"step": 52800, "timestamp": 1778251613.3602855, "geo/rankme_last": 438.78692626953125, "geo/layer_0/stable_rank_q_proj": 19.499406814575195, "geo/layer_0/stable_rank_k_proj": 16.35669708251953, "geo/layer_0/stable_rank_o_proj": 47.68737030029297, "geo/layer_0/stable_rank_gate_proj": 132.46209716796875, "geo/layer_0/stable_rank_down_proj": 53.992496490478516, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06516322493553162, "geo/layer_0/attn_entropy_mean": 6.177889823913574, "geo/layer_0/attn_entropy_std": 0.40236684679985046, "geo/layer_7/stable_rank_q_proj": 43.1221809387207, "geo/layer_7/stable_rank_k_proj": 41.936279296875, "geo/layer_7/stable_rank_o_proj": 93.33515930175781, "geo/layer_7/stable_rank_gate_proj": 85.6957015991211, "geo/layer_7/stable_rank_down_proj": 142.77993774414062, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.47130337357521057, "geo/layer_7/attn_entropy_mean": 4.641484260559082, "geo/layer_7/attn_entropy_std": 0.792803943157196, "geo/layer_14/stable_rank_q_proj": 52.22931671142578, "geo/layer_14/stable_rank_k_proj": 39.01176452636719, "geo/layer_14/stable_rank_o_proj": 45.024688720703125, "geo/layer_14/stable_rank_gate_proj": 73.14202880859375, "geo/layer_14/stable_rank_down_proj": 131.867919921875, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37842655181884766, "geo/layer_14/attn_entropy_mean": 5.498902320861816, "geo/layer_14/attn_entropy_std": 0.38007527589797974, "geo/layer_21/stable_rank_q_proj": 40.906654357910156, "geo/layer_21/stable_rank_k_proj": 30.430618286132812, "geo/layer_21/stable_rank_o_proj": 72.60295104980469, "geo/layer_21/stable_rank_gate_proj": 68.71709442138672, "geo/layer_21/stable_rank_down_proj": 53.360443115234375, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14416348934173584, "geo/layer_21/attn_entropy_mean": 5.698976039886475, "geo/layer_21/attn_entropy_std": 0.29475706815719604, "geo/layer_27/stable_rank_q_proj": 42.81028747558594, "geo/layer_27/stable_rank_k_proj": 31.44015884399414, "geo/layer_27/stable_rank_o_proj": 116.2042007446289, "geo/layer_27/stable_rank_gate_proj": 82.19209289550781, "geo/layer_27/stable_rank_down_proj": 128.88912963867188, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09572330862283707, "geo/layer_27/attn_entropy_mean": 4.221122741699219, "geo/layer_27/attn_entropy_std": 0.7074279189109802, "attnres/final_alpha/block_0": 0.23915907740592957, "attnres/block_norm/0": 1.7480987310409546, "attnres/final_alpha/block_1": 0.004794888198375702, "attnres/block_norm/1": 44338.75, "attnres/final_alpha/block_2": 0.010515199974179268, "attnres/block_norm/2": 27947.841796875, "attnres/final_alpha/block_3": 0.012447100132703781, "attnres/block_norm/3": 53742.6328125, "attnres/final_alpha/block_4": 0.01429232582449913, "attnres/block_norm/4": 14306.431640625, "attnres/final_alpha/block_5": 0.6067155599594116, "attnres/block_norm/5": 6334.40869140625, "attnres/final_alpha/block_6": 0.11207584291696548, "attnres/block_norm/6": 35349.078125, "geo/tier1_time_s": 1.3567306995391846, "geo/step": 52800.0, "geo/rankme_slope": -1.7147542610794316e-05} {"step": 52810, "timestamp": 1778251624.395736, "train/loss": 2.1671079635620116, "train/z_loss": 0.0013779146480374037, "train/perplexity": 8.7329913554732, "train/grad_norm": 0.10888671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1692274.525698329, "perf/iters_per_sec": 0.8069393757335325, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2392504692077637, "data/tokens_consumed": 110752694272, "data/tokens_consumed_B": 110.752694272, "train/loss_slope": 6.662747125313682e-06} {"step": 52820, "timestamp": 1778251634.7733507, "train/loss": 2.155916118621826, "train/z_loss": 0.0013793235062621533, "train/perplexity": 8.635797971513714, "train/grad_norm": 0.185546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022312.385885862, "perf/iters_per_sec": 0.9643136910847959, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037006950378418, "data/tokens_consumed": 110773665792, "data/tokens_consumed_B": 110.773665792, "train/loss_slope": 5.011223265976613e-06} {"step": 52830, "timestamp": 1778251645.14628, "train/loss": 2.1654749155044555, "train/z_loss": 0.001398856204468757, "train/perplexity": 8.718741599338603, "train/grad_norm": 0.2265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022681.3448729503, "perf/iters_per_sec": 0.9644896244396927, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0368177890777588, "data/tokens_consumed": 110794637312, "data/tokens_consumed_B": 110.794637312, "train/loss_slope": 6.725985124738464e-06} {"step": 52840, "timestamp": 1778251655.538124, "train/loss": 2.189786458015442, "train/z_loss": 0.0013702195254154503, "train/perplexity": 8.933305275266944, "train/grad_norm": 0.107421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019422.8084616777, "perf/iters_per_sec": 0.96293583319744, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0384907960891723, "data/tokens_consumed": 110815608832, "data/tokens_consumed_B": 110.815608832, "train/loss_slope": 6.6956324295492015e-06} {"step": 52850, "timestamp": 1778251665.9056747, "grad/layer_0/attn": 0.0028874606359750032, "grad/layer_0/mlp": 0.0028727867174893618, "grad/layer_0/attn_mlp_ratio": 1.0051078689154058, "grad/layer_4/attn": 0.002098197117447853, "grad/layer_4/mlp": 0.0026192576624453068, "grad/layer_4/attn_mlp_ratio": 0.8010655337292967, "grad/layer_8/attn": 0.00643583619967103, "grad/layer_8/mlp": 0.0037657390348613262, "grad/layer_8/attn_mlp_ratio": 1.7090499286292253, "grad/layer_12/attn": 0.004671646282076836, "grad/layer_12/mlp": 0.006958435755223036, "grad/layer_12/attn_mlp_ratio": 0.6713644243153115, "grad/layer_16/attn": 0.0036313228774815798, "grad/layer_16/mlp": 0.004785895347595215, "grad/layer_16/attn_mlp_ratio": 0.7587551623816235, "grad/layer_20/attn": 0.0038615090306848288, "grad/layer_20/mlp": 0.006402794737368822, "grad/layer_20/attn_mlp_ratio": 0.6030974174196226, "grad/layer_24/attn": 0.013925443403422832, "grad/layer_24/mlp": 0.010530389845371246, "grad/layer_24/attn_mlp_ratio": 1.322405293219357, "grad/layer_27/attn": 0.006028812378644943, "grad/layer_27/mlp": 0.007429952267557383, "grad/layer_27/attn_mlp_ratio": 0.8114200576802529} {"step": 52850, "timestamp": 1778251665.920027, "train/loss": 2.112581706047058, "train/z_loss": 0.0013882492436096071, "train/perplexity": 8.26956333081358, "train/grad_norm": 0.1376953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021857.2088809253, "perf/iters_per_sec": 0.9640966457752825, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0372404098510741, "data/tokens_consumed": 110836580352, "data/tokens_consumed_B": 110.836580352, "train/loss_slope": 5.251093201189411e-06} {"step": 52860, "timestamp": 1778251676.2995708, "train/loss": 2.1416630148887634, "train/z_loss": 0.0013959266245365143, "train/perplexity": 8.513584079828039, "train/grad_norm": 0.236328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021569.2962558707, "perf/iters_per_sec": 0.9639593583373407, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373881340026856, "data/tokens_consumed": 110857551872, "data/tokens_consumed_B": 110.857551872, "train/loss_slope": 3.2615088214753377e-06} {"step": 52870, "timestamp": 1778251686.667219, "train/loss": 2.1365663766860963, "train/z_loss": 0.001395983505062759, "train/perplexity": 8.470303807595926, "train/grad_norm": 0.10205078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024087.5892798773, "perf/iters_per_sec": 0.9651601740264307, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0360974550247193, "data/tokens_consumed": 110878523392, "data/tokens_consumed_B": 110.878523392, "train/loss_slope": 1.1139731488712287e-06} {"step": 52875, "timestamp": 1778251692.4309337, "eos/sharpness": 40.614748001098626, "eos/L0_probe": 1.9784897565841675, "eos/L_plus": 2.2125518321990967, "eos/L_minus": 2.1505751609802246, "eos/grad_norm": 0.1550815850496292, "eos/embed_grad_frac": 0.14980052411556244, "eos/time_s": 0.6000292301177979} {"step": 52875, "timestamp": 1778251693.8063066, "geo/rankme_last": 439.5194396972656, "geo/layer_0/stable_rank_q_proj": 19.52327537536621, "geo/layer_0/stable_rank_k_proj": 16.378175735473633, "geo/layer_0/stable_rank_o_proj": 47.70764923095703, "geo/layer_0/stable_rank_gate_proj": 132.71566772460938, "geo/layer_0/stable_rank_down_proj": 53.969139099121094, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.061980947852134705, "geo/layer_0/attn_entropy_mean": 6.178883075714111, "geo/layer_0/attn_entropy_std": 0.4049336016178131, "geo/layer_7/stable_rank_q_proj": 43.01533889770508, "geo/layer_7/stable_rank_k_proj": 41.79005432128906, "geo/layer_7/stable_rank_o_proj": 93.37883758544922, "geo/layer_7/stable_rank_gate_proj": 85.54701232910156, "geo/layer_7/stable_rank_down_proj": 142.94888305664062, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4746060073375702, "geo/layer_7/attn_entropy_mean": 4.626959800720215, "geo/layer_7/attn_entropy_std": 0.7991794347763062, "geo/layer_14/stable_rank_q_proj": 52.27629470825195, "geo/layer_14/stable_rank_k_proj": 39.04623031616211, "geo/layer_14/stable_rank_o_proj": 44.95735549926758, "geo/layer_14/stable_rank_gate_proj": 73.05479431152344, "geo/layer_14/stable_rank_down_proj": 131.78126525878906, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4028831720352173, "geo/layer_14/attn_entropy_mean": 5.500565528869629, "geo/layer_14/attn_entropy_std": 0.3672996163368225, "geo/layer_21/stable_rank_q_proj": 40.85987091064453, "geo/layer_21/stable_rank_k_proj": 30.429019927978516, "geo/layer_21/stable_rank_o_proj": 72.65097045898438, "geo/layer_21/stable_rank_gate_proj": 68.66162109375, "geo/layer_21/stable_rank_down_proj": 53.30453872680664, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14406637847423553, "geo/layer_21/attn_entropy_mean": 5.697160720825195, "geo/layer_21/attn_entropy_std": 0.30379849672317505, "geo/layer_27/stable_rank_q_proj": 42.847267150878906, "geo/layer_27/stable_rank_k_proj": 31.473934173583984, "geo/layer_27/stable_rank_o_proj": 116.27066802978516, "geo/layer_27/stable_rank_gate_proj": 82.11414337158203, "geo/layer_27/stable_rank_down_proj": 129.13525390625, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09433172643184662, "geo/layer_27/attn_entropy_mean": 4.221991062164307, "geo/layer_27/attn_entropy_std": 0.6967160701751709, "attnres/final_alpha/block_0": 0.23884427547454834, "attnres/block_norm/0": 1.748165249824524, "attnres/final_alpha/block_1": 0.0047265286557376385, "attnres/block_norm/1": 44434.2890625, "attnres/final_alpha/block_2": 0.010533429682254791, "attnres/block_norm/2": 27884.35546875, "attnres/final_alpha/block_3": 0.012437321245670319, "attnres/block_norm/3": 53786.40234375, "attnres/final_alpha/block_4": 0.014493271708488464, "attnres/block_norm/4": 14231.173828125, "attnres/final_alpha/block_5": 0.6073925495147705, "attnres/block_norm/5": 6374.99609375, "attnres/final_alpha/block_6": 0.11157264560461044, "attnres/block_norm/6": 35306.80859375, "geo/tier1_time_s": 1.3570516109466553, "geo/step": 52875.0, "geo/rankme_slope": -2.094744147659062e-06} {"step": 52880, "timestamp": 1778251698.986149, "train/loss": 2.1085156559944154, "train/z_loss": 0.0013817060622386633, "train/perplexity": 8.236007139205688, "train/grad_norm": 0.11181640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1703321.7835952947, "perf/iters_per_sec": 0.8122071187950586, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2312130451202392, "data/tokens_consumed": 110899494912, "data/tokens_consumed_B": 110.899494912, "train/loss_slope": -2.6527626178946867e-06} {"step": 52890, "timestamp": 1778251709.3344524, "train/loss": 2.089889645576477, "train/z_loss": 0.0013794281869195402, "train/perplexity": 8.084023007380509, "train/grad_norm": 0.08740234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027543.3695625716, "perf/iters_per_sec": 0.9668080184758051, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343315124511718, "data/tokens_consumed": 110920466432, "data/tokens_consumed_B": 110.920466432, "train/loss_slope": -6.300597379703397e-06} {"step": 52900, "timestamp": 1778251719.6752129, "grad/layer_0/attn": 0.0026855652686208487, "grad/layer_0/mlp": 0.0027477077674120665, "grad/layer_0/attn_mlp_ratio": 0.9773838407174825, "grad/layer_4/attn": 0.002810561330989003, "grad/layer_4/mlp": 0.00247907149605453, "grad/layer_4/attn_mlp_ratio": 1.1337152728715227, "grad/layer_8/attn": 0.004436787683516741, "grad/layer_8/mlp": 0.0035076907370239496, "grad/layer_8/attn_mlp_ratio": 1.2648742120274974, "grad/layer_12/attn": 0.005888290703296661, "grad/layer_12/mlp": 0.006646281108260155, "grad/layer_12/attn_mlp_ratio": 0.8859526882459555, "grad/layer_16/attn": 0.0038914859760552645, "grad/layer_16/mlp": 0.0048914155922830105, "grad/layer_16/attn_mlp_ratio": 0.7955745781726762, "grad/layer_20/attn": 0.0036280101630836725, "grad/layer_20/mlp": 0.006990449037402868, "grad/layer_20/attn_mlp_ratio": 0.5189952879668004, "grad/layer_24/attn": 0.025890544056892395, "grad/layer_24/mlp": 0.012909200973808765, "grad/layer_24/attn_mlp_ratio": 2.0055884100698713, "grad/layer_27/attn": 0.012648842297494411, "grad/layer_27/mlp": 0.012760289944708347, "grad/layer_27/attn_mlp_ratio": 0.9912660490613101} {"step": 52900, "timestamp": 1778251719.6897185, "train/loss": 2.1780927419662475, "train/z_loss": 0.0013681757729500533, "train/perplexity": 8.829450149624343, "train/grad_norm": 0.287109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026776.863745283, "perf/iters_per_sec": 0.9664425200201431, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347226858139038, "data/tokens_consumed": 110941437952, "data/tokens_consumed_B": 110.941437952, "train/loss_slope": -6.654618053224622e-06} {"step": 52910, "timestamp": 1778251730.0338047, "train/loss": 2.182885193824768, "train/z_loss": 0.0013719103997573256, "train/perplexity": 8.871866422192491, "train/grad_norm": 0.1103515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028947.0248152406, "perf/iters_per_sec": 0.9674773334575847, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0336159467697144, "data/tokens_consumed": 110962409472, "data/tokens_consumed_B": 110.962409472, "train/loss_slope": -7.3725413889847105e-06} {"step": 52920, "timestamp": 1778251740.373792, "train/loss": 2.1572337985038756, "train/z_loss": 0.0013758433400653304, "train/perplexity": 8.647184689143048, "train/grad_norm": 0.1044921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029203.196945837, "perf/iters_per_sec": 0.9675994858483491, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0334854602813721, "data/tokens_consumed": 110983380992, "data/tokens_consumed_B": 110.983380992, "train/loss_slope": -1.1015415592233755e-05} {"step": 52930, "timestamp": 1778251750.7177823, "train/loss": 2.1417972326278685, "train/z_loss": 0.0013840583735145629, "train/perplexity": 8.514726830521907, "train/grad_norm": 0.1884765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028896.4346069791, "perf/iters_per_sec": 0.9674532101664444, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0336417198181151, "data/tokens_consumed": 111004352512, "data/tokens_consumed_B": 111.004352512, "train/loss_slope": -1.0519966662842226e-05} {"step": 52940, "timestamp": 1778251761.0597634, "train/loss": 2.158931386470795, "train/z_loss": 0.001391584996599704, "train/perplexity": 8.661876512622193, "train/grad_norm": 0.2001953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028902.8459923791, "perf/iters_per_sec": 0.9674562673532386, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0336384534835816, "data/tokens_consumed": 111025324032, "data/tokens_consumed_B": 111.025324032, "train/loss_slope": -8.216585005649536e-06} {"step": 52950, "timestamp": 1778251771.3984802, "grad/layer_0/attn": 0.003394637955352664, "grad/layer_0/mlp": 0.0033434901852160692, "grad/layer_0/attn_mlp_ratio": 1.0152976876776807, "grad/layer_4/attn": 0.0023061924148350954, "grad/layer_4/mlp": 0.0024685387033969164, "grad/layer_4/attn_mlp_ratio": 0.9342338113792585, "grad/layer_8/attn": 0.004106480628252029, "grad/layer_8/mlp": 0.0036341098602861166, "grad/layer_8/attn_mlp_ratio": 1.129982491759474, "grad/layer_12/attn": 0.004370653070509434, "grad/layer_12/mlp": 0.006765315774828196, "grad/layer_12/attn_mlp_ratio": 0.646038285776335, "grad/layer_16/attn": 0.003736440557986498, "grad/layer_16/mlp": 0.004701939411461353, "grad/layer_16/attn_mlp_ratio": 0.7946594270042449, "grad/layer_20/attn": 0.0033325799740850925, "grad/layer_20/mlp": 0.006919969338923693, "grad/layer_20/attn_mlp_ratio": 0.4815888283176335, "grad/layer_24/attn": 0.014999032951891422, "grad/layer_24/mlp": 0.014027922414243221, "grad/layer_24/attn_mlp_ratio": 1.0692269604898506, "grad/layer_27/attn": 0.009711054153740406, "grad/layer_27/mlp": 0.014867816120386124, "grad/layer_27/attn_mlp_ratio": 0.6531594155989779} {"step": 52950, "timestamp": 1778251772.001406, "eos/sharpness": 69.95513439178465, "eos/L0_probe": 1.9762240648269653, "eos/L_plus": 2.3076560497283936, "eos/L_minus": 2.344343423843384, "eos/grad_norm": 0.23213821649551392, "eos/embed_grad_frac": 0.054054033011198044, "eos/time_s": 0.6000015735626221} {"step": 52950, "timestamp": 1778251772.021462, "train/loss": 2.13517165184021, "train/z_loss": 0.0013814778067171574, "train/perplexity": 8.458498299055035, "train/grad_norm": 0.232421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1914137.10926354, "perf/iters_per_sec": 0.912731699592371, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0956122159957886, "data/tokens_consumed": 111046295552, "data/tokens_consumed_B": 111.046295552, "train/loss_slope": -1.0126533452505325e-05} {"step": 52950, "timestamp": 1778251773.3824382, "geo/rankme_last": 439.0703125, "geo/layer_0/stable_rank_q_proj": 19.49405288696289, "geo/layer_0/stable_rank_k_proj": 16.35748863220215, "geo/layer_0/stable_rank_o_proj": 47.691139221191406, "geo/layer_0/stable_rank_gate_proj": 132.30682373046875, "geo/layer_0/stable_rank_down_proj": 54.080814361572266, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0609777495265007, "geo/layer_0/attn_entropy_mean": 6.169807434082031, "geo/layer_0/attn_entropy_std": 0.40340179204940796, "geo/layer_7/stable_rank_q_proj": 43.0295524597168, "geo/layer_7/stable_rank_k_proj": 41.82914352416992, "geo/layer_7/stable_rank_o_proj": 93.27355194091797, "geo/layer_7/stable_rank_gate_proj": 85.49008178710938, "geo/layer_7/stable_rank_down_proj": 143.15061950683594, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4744533896446228, "geo/layer_7/attn_entropy_mean": 4.663641929626465, "geo/layer_7/attn_entropy_std": 0.7999019622802734, "geo/layer_14/stable_rank_q_proj": 52.26984405517578, "geo/layer_14/stable_rank_k_proj": 39.08391189575195, "geo/layer_14/stable_rank_o_proj": 45.03007125854492, "geo/layer_14/stable_rank_gate_proj": 73.06348419189453, "geo/layer_14/stable_rank_down_proj": 131.82601928710938, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4026474952697754, "geo/layer_14/attn_entropy_mean": 5.498388290405273, "geo/layer_14/attn_entropy_std": 0.38136425614356995, "geo/layer_21/stable_rank_q_proj": 40.920616149902344, "geo/layer_21/stable_rank_k_proj": 30.43781280517578, "geo/layer_21/stable_rank_o_proj": 72.5389633178711, "geo/layer_21/stable_rank_gate_proj": 68.6536865234375, "geo/layer_21/stable_rank_down_proj": 53.25938415527344, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14411109685897827, "geo/layer_21/attn_entropy_mean": 5.686590194702148, "geo/layer_21/attn_entropy_std": 0.30538952350616455, "geo/layer_27/stable_rank_q_proj": 42.781673431396484, "geo/layer_27/stable_rank_k_proj": 31.454082489013672, "geo/layer_27/stable_rank_o_proj": 116.14085388183594, "geo/layer_27/stable_rank_gate_proj": 82.05785369873047, "geo/layer_27/stable_rank_down_proj": 129.19029235839844, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08769587427377701, "geo/layer_27/attn_entropy_mean": 4.222115993499756, "geo/layer_27/attn_entropy_std": 0.724425733089447, "attnres/final_alpha/block_0": 0.23828712105751038, "attnres/block_norm/0": 1.7484626770019531, "attnres/final_alpha/block_1": 0.004762861877679825, "attnres/block_norm/1": 44260.3515625, "attnres/final_alpha/block_2": 0.01060536503791809, "attnres/block_norm/2": 27898.71484375, "attnres/final_alpha/block_3": 0.012413122691214085, "attnres/block_norm/3": 53651.296875, "attnres/final_alpha/block_4": 0.01438576728105545, "attnres/block_norm/4": 14330.3037109375, "attnres/final_alpha/block_5": 0.6061784625053406, "attnres/block_norm/5": 6447.6455078125, "attnres/final_alpha/block_6": 0.11336728930473328, "attnres/block_norm/6": 35562.609375, "geo/tier1_time_s": 1.357386589050293, "geo/step": 52950.0, "geo/rankme_slope": 3.014623818277311e-06} {"step": 52960, "timestamp": 1778251783.7363987, "train/loss": 2.1196767568588255, "train/z_loss": 0.001388739643152803, "train/perplexity": 8.328444939833872, "train/grad_norm": 0.201171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1790669.4572140486, "perf/iters_per_sec": 0.8538577352590793, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.171155285835266, "data/tokens_consumed": 111067267072, "data/tokens_consumed_B": 111.067267072, "train/loss_slope": -8.758439947598198e-06} {"step": 52970, "timestamp": 1778251794.0763245, "train/loss": 2.1675548791885375, "train/z_loss": 0.0013838603277690708, "train/perplexity": 8.736895138042016, "train/grad_norm": 0.130859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029090.7596996017, "perf/iters_per_sec": 0.9675458715913781, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0335427284240724, "data/tokens_consumed": 111088238592, "data/tokens_consumed_B": 111.088238592, "train/loss_slope": -1.0231358698099034e-05} {"step": 52980, "timestamp": 1778251804.4343617, "train/loss": 2.199495792388916, "train/z_loss": 0.0013728937366977334, "train/perplexity": 9.020464165938238, "train/grad_norm": 0.1044921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025886.3019976304, "perf/iters_per_sec": 0.966017867087188, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351775407791137, "data/tokens_consumed": 111109210112, "data/tokens_consumed_B": 111.109210112, "train/loss_slope": -6.0358723111958875e-06} {"step": 52990, "timestamp": 1778251814.777885, "train/loss": 2.152573299407959, "train/z_loss": 0.0013896212331019342, "train/perplexity": 8.606978256512996, "train/grad_norm": 0.294921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029075.7814899497, "perf/iters_per_sec": 0.9675387294244526, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0335503578186036, "data/tokens_consumed": 111130181632, "data/tokens_consumed_B": 111.130181632, "train/loss_slope": -6.9607612573811495e-06} {"step": 53000, "timestamp": 1778251825.1102798, "grad/layer_0/attn": 0.002527100732550025, "grad/layer_0/mlp": 0.002785252407193184, "grad/layer_0/attn_mlp_ratio": 0.9073147680591053, "grad/layer_4/attn": 0.0020097894594073296, "grad/layer_4/mlp": 0.002644447609782219, "grad/layer_4/attn_mlp_ratio": 0.7600034789770315, "grad/layer_8/attn": 0.005775695201009512, "grad/layer_8/mlp": 0.0036574085243046284, "grad/layer_8/attn_mlp_ratio": 1.5791768966224349, "grad/layer_12/attn": 0.0036810641176998615, "grad/layer_12/mlp": 0.006251643877476454, "grad/layer_12/attn_mlp_ratio": 0.5888153789566508, "grad/layer_16/attn": 0.0039835297502577305, "grad/layer_16/mlp": 0.004463074263185263, "grad/layer_16/attn_mlp_ratio": 0.892552851710657, "grad/layer_20/attn": 0.003979687113314867, "grad/layer_20/mlp": 0.006010617129504681, "grad/layer_20/attn_mlp_ratio": 0.6621095573645143, "grad/layer_24/attn": 0.014370052143931389, "grad/layer_24/mlp": 0.009429347701370716, "grad/layer_24/attn_mlp_ratio": 1.523970951823673, "grad/layer_27/attn": 0.007044908124953508, "grad/layer_27/mlp": 0.009029733948409557, "grad/layer_27/attn_mlp_ratio": 0.78018999088842} {"step": 53000, "timestamp": 1778251825.1249802, "train/loss": 2.1173593044281005, "train/z_loss": 0.0013825999572873115, "train/perplexity": 8.309166511911757, "train/grad_norm": 0.1689453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027764.7816364097, "perf/iters_per_sec": 0.9669135959798859, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342185735702514, "data/tokens_consumed": 111151153152, "data/tokens_consumed_B": 111.151153152, "train/loss_slope": -7.977128758503507e-06} {"step": 53000, "timestamp": 1778251832.3784602, "geo/ww_alpha_mean": 7.471193360969775, "geo/ww_alpha_std": 4.018258253485461, "geo/ww_alpha_min": 1.358154370299434, "geo/ww_alpha_max": 26.951375392228282, "geo/ww_alpha_healthy_frac": 0.17258883248730963, "geo/ww_alpha_by_type/q_proj": 4.028544500805039, "geo/ww_alpha_by_type/k_proj": 4.460943749904074, "geo/ww_alpha_by_type/v_proj": 7.938734423088639, "geo/ww_alpha_by_type/o_proj": 8.180359732585424, "geo/ww_alpha_by_type/gate_proj": 8.223010376159133, "geo/ww_alpha_by_type/up_proj": 11.046242585021005, "geo/ww_alpha_by_type/down_proj": 8.513129220891436, "geo/twonn_id/layer_0": 0.6660218834877014, "geo/twonn_id/layer_7": 3.293581485748291, "geo/twonn_id/layer_14": 4.284578800201416, "geo/twonn_id/layer_21": 6.995284080505371, "geo/twonn_id/layer_27": 5.914855480194092, "geo/tier2_time_s": 7.246462106704712} {"step": 53000, "timestamp": 1778251833.0394733, "eoc/jacobian_sigma/layer_0/attn": 997.8546142578125, "eoc/jacobian_sigma/layer_0/mlp": 8034.14501953125, "eoc/jacobian_sigma/layer_0": 8034.14501953125, "eoc/jacobian_sigma/layer_7/attn": 1.1497687101364136, "eoc/jacobian_sigma/layer_7/mlp": 1.7744050025939941, "eoc/jacobian_sigma/layer_7": 1.7744050025939941, "eoc/jacobian_sigma/layer_14/attn": 1.5540001392364502, "eoc/jacobian_sigma/layer_14/mlp": 7.829148292541504, "eoc/jacobian_sigma/layer_14": 7.829148292541504, "eoc/jacobian_sigma/layer_21/attn": 1.098144769668579, "eoc/jacobian_sigma/layer_21/mlp": 3.8379900455474854, "eoc/jacobian_sigma/layer_21": 3.8379900455474854, "eoc/jacobian_sigma/layer_27/attn": 3.6382040977478027, "eoc/jacobian_sigma/layer_27/mlp": 34.01960754394531, "eoc/jacobian_sigma/layer_27": 34.01960754394531, "eoc/layer0_sigma": 8034.14501953125, "eoc/sigma_max": 34.01960754394531, "eoc/sigma_min": 1.7744050025939941, "eoc/sigma_mean": 11.865287721157074, "eoc/time_s": 0.6541287899017334} {"step": 53010, "timestamp": 1778251843.4139051, "train/loss": 2.1755236864089964, "train/z_loss": 0.001373845071066171, "train/perplexity": 8.806795914104724, "train/grad_norm": 0.1728515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1146992.8147460197, "perf/iters_per_sec": 0.5469287942628954, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.8283915758132934, "data/tokens_consumed": 111172124672, "data/tokens_consumed_B": 111.172124672, "train/loss_slope": -5.4693409533176705e-06} {"step": 53020, "timestamp": 1778251853.775935, "train/loss": 2.1935672521591187, "train/z_loss": 0.0013737752800807358, "train/perplexity": 8.967144192215946, "train/grad_norm": 0.095703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025460.4833656442, "perf/iters_per_sec": 0.9658148209408017, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0353951692581176, "data/tokens_consumed": 111193096192, "data/tokens_consumed_B": 111.193096192, "train/loss_slope": -5.887414605775672e-06} {"step": 53025, "timestamp": 1778251859.5671098, "eos/sharpness": 6.36298656463623, "eos/L0_probe": 1.9777050018310547, "eos/L_plus": 2.02042555809021, "eos/L_minus": 1.9986143112182617, "eos/grad_norm": 0.09396328032016754, "eos/embed_grad_frac": 0.2738122344017029, "eos/time_s": 0.6289196014404297} {"step": 53025, "timestamp": 1778251860.9488733, "geo/rankme_last": 438.94390869140625, "geo/layer_0/stable_rank_q_proj": 19.510517120361328, "geo/layer_0/stable_rank_k_proj": 16.395671844482422, "geo/layer_0/stable_rank_o_proj": 47.65888595581055, "geo/layer_0/stable_rank_gate_proj": 132.47813415527344, "geo/layer_0/stable_rank_down_proj": 54.12055587768555, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06531316787004471, "geo/layer_0/attn_entropy_mean": 6.176870346069336, "geo/layer_0/attn_entropy_std": 0.39221030473709106, "geo/layer_7/stable_rank_q_proj": 42.98529052734375, "geo/layer_7/stable_rank_k_proj": 41.806549072265625, "geo/layer_7/stable_rank_o_proj": 93.33830261230469, "geo/layer_7/stable_rank_gate_proj": 85.54485321044922, "geo/layer_7/stable_rank_down_proj": 143.1478729248047, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4648568630218506, "geo/layer_7/attn_entropy_mean": 4.648874759674072, "geo/layer_7/attn_entropy_std": 0.7921075820922852, "geo/layer_14/stable_rank_q_proj": 52.233272552490234, "geo/layer_14/stable_rank_k_proj": 39.168212890625, "geo/layer_14/stable_rank_o_proj": 45.02376937866211, "geo/layer_14/stable_rank_gate_proj": 73.06575012207031, "geo/layer_14/stable_rank_down_proj": 131.5346221923828, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3809587359428406, "geo/layer_14/attn_entropy_mean": 5.530479431152344, "geo/layer_14/attn_entropy_std": 0.37586793303489685, "geo/layer_21/stable_rank_q_proj": 40.988121032714844, "geo/layer_21/stable_rank_k_proj": 30.423418045043945, "geo/layer_21/stable_rank_o_proj": 72.62751007080078, "geo/layer_21/stable_rank_gate_proj": 68.55992889404297, "geo/layer_21/stable_rank_down_proj": 53.23287582397461, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14472895860671997, "geo/layer_21/attn_entropy_mean": 5.708666801452637, "geo/layer_21/attn_entropy_std": 0.2995232939720154, "geo/layer_27/stable_rank_q_proj": 42.80023193359375, "geo/layer_27/stable_rank_k_proj": 31.480777740478516, "geo/layer_27/stable_rank_o_proj": 116.14507293701172, "geo/layer_27/stable_rank_gate_proj": 81.98571014404297, "geo/layer_27/stable_rank_down_proj": 129.44491577148438, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10388985276222229, "geo/layer_27/attn_entropy_mean": 4.243428707122803, "geo/layer_27/attn_entropy_std": 0.6941943168640137, "attnres/final_alpha/block_0": 0.24074983596801758, "attnres/block_norm/0": 1.7487022876739502, "attnres/final_alpha/block_1": 0.004902570974081755, "attnres/block_norm/1": 44452.046875, "attnres/final_alpha/block_2": 0.010711893439292908, "attnres/block_norm/2": 27939.650390625, "attnres/final_alpha/block_3": 0.012772532179951668, "attnres/block_norm/3": 53889.140625, "attnres/final_alpha/block_4": 0.014779292978346348, "attnres/block_norm/4": 14326.4921875, "attnres/final_alpha/block_5": 0.6032760739326477, "attnres/block_norm/5": 6405.984375, "attnres/final_alpha/block_6": 0.11280778050422668, "attnres/block_norm/6": 35307.5625, "geo/tier1_time_s": 1.3611664772033691, "geo/step": 53025.0, "geo/rankme_slope": -3.723753173144258e-05} {"step": 53030, "timestamp": 1778251866.138388, "train/loss": 2.1594027042388917, "train/z_loss": 0.0013844982138834893, "train/perplexity": 8.665959971155342, "train/grad_norm": 0.181640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1697036.2436318647, "perf/iters_per_sec": 0.8092099397811244, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.235773253440857, "data/tokens_consumed": 111214067712, "data/tokens_consumed_B": 111.214067712, "train/loss_slope": -6.67489208761655e-06} {"step": 53040, "timestamp": 1778251876.4885013, "train/loss": 2.1564053535461425, "train/z_loss": 0.001397060533054173, "train/perplexity": 8.640023939141905, "train/grad_norm": 0.134765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027289.4852820362, "perf/iters_per_sec": 0.9666869570169622, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344610452651977, "data/tokens_consumed": 111235039232, "data/tokens_consumed_B": 111.235039232, "train/loss_slope": -6.146961706306335e-06} {"step": 53050, "timestamp": 1778251886.8314736, "grad/layer_0/attn": 0.0026955450884997845, "grad/layer_0/mlp": 0.0027044308371841908, "grad/layer_0/attn_mlp_ratio": 0.9967143369933277, "grad/layer_4/attn": 0.004566062707453966, "grad/layer_4/mlp": 0.0025278914254158735, "grad/layer_4/attn_mlp_ratio": 1.8062731970679735, "grad/layer_8/attn": 0.005547829903662205, "grad/layer_8/mlp": 0.003560012439265847, "grad/layer_8/attn_mlp_ratio": 1.558373697415765, "grad/layer_12/attn": 0.0041498481296002865, "grad/layer_12/mlp": 0.006366610527038574, "grad/layer_12/attn_mlp_ratio": 0.6518143440367087, "grad/layer_16/attn": 0.004523965995758772, "grad/layer_16/mlp": 0.004441388882696629, "grad/layer_16/attn_mlp_ratio": 1.0185926099659044, "grad/layer_20/attn": 0.0036072940565645695, "grad/layer_20/mlp": 0.005933288484811783, "grad/layer_20/attn_mlp_ratio": 0.6079754936914131, "grad/layer_24/attn": 0.011887074448168278, "grad/layer_24/mlp": 0.009551339782774448, "grad/layer_24/attn_mlp_ratio": 1.244545225493049, "grad/layer_27/attn": 0.005497341975569725, "grad/layer_27/mlp": 0.0085861561819911, "grad/layer_27/attn_mlp_ratio": 0.6402564541132381} {"step": 53050, "timestamp": 1778251886.8462088, "train/loss": 2.158668076992035, "train/z_loss": 0.001374620629940182, "train/perplexity": 8.659596058678318, "train/grad_norm": 0.1611328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025548.589994522, "perf/iters_per_sec": 0.9658568334553347, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0353501319885254, "data/tokens_consumed": 111256010752, "data/tokens_consumed_B": 111.256010752, "train/loss_slope": -5.605272942035771e-06} {"step": 53060, "timestamp": 1778251897.2189014, "train/loss": 2.1529300928115847, "train/z_loss": 0.0013800269458442926, "train/perplexity": 8.610049717485992, "train/grad_norm": 0.09716796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022899.323648602, "perf/iters_per_sec": 0.964593564819623, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0367060661315919, "data/tokens_consumed": 111276982272, "data/tokens_consumed_B": 111.276982272, "train/loss_slope": -1.985473815936325e-06} {"step": 53070, "timestamp": 1778251907.5925212, "train/loss": 2.1440615892410277, "train/z_loss": 0.0013805466471239925, "train/perplexity": 8.53402905384098, "train/grad_norm": 0.14453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022703.4383198677, "perf/iters_per_sec": 0.9645001594161356, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0368064641952515, "data/tokens_consumed": 111297953792, "data/tokens_consumed_B": 111.297953792, "train/loss_slope": -2.803306353546486e-06} {"step": 53080, "timestamp": 1778251917.9680471, "train/loss": 2.147665464878082, "train/z_loss": 0.0013775477069430052, "train/perplexity": 8.564840119510862, "train/grad_norm": 0.099609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022149.201447564, "perf/iters_per_sec": 0.9642358786809749, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0370906352996827, "data/tokens_consumed": 111318925312, "data/tokens_consumed_B": 111.318925312, "train/loss_slope": -2.857972014032668e-06} {"step": 53090, "timestamp": 1778251928.3424473, "train/loss": 2.1114913702011107, "train/z_loss": 0.0013871918199583888, "train/perplexity": 8.26055164325943, "train/grad_norm": 0.1357421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022447.044501274, "perf/iters_per_sec": 0.9643779013162965, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0369379043579101, "data/tokens_consumed": 111339896832, "data/tokens_consumed_B": 111.339896832, "train/loss_slope": -1.899858505347531e-06} {"step": 53100, "timestamp": 1778251938.7076929, "grad/layer_0/attn": 0.002826484153047204, "grad/layer_0/mlp": 0.0028970199637115, "grad/layer_0/attn_mlp_ratio": 0.9756522533109662, "grad/layer_4/attn": 0.001856053015217185, "grad/layer_4/mlp": 0.002562588546425104, "grad/layer_4/attn_mlp_ratio": 0.7242883159598948, "grad/layer_8/attn": 0.005985697731375694, "grad/layer_8/mlp": 0.0036146657075732946, "grad/layer_8/attn_mlp_ratio": 1.655947755622308, "grad/layer_12/attn": 0.006566280499100685, "grad/layer_12/mlp": 0.006608584895730019, "grad/layer_12/attn_mlp_ratio": 0.9935985545080124, "grad/layer_16/attn": 0.00474903779104352, "grad/layer_16/mlp": 0.004588117823004723, "grad/layer_16/attn_mlp_ratio": 1.0350731761343686, "grad/layer_20/attn": 0.007459978573024273, "grad/layer_20/mlp": 0.00644536130130291, "grad/layer_20/attn_mlp_ratio": 1.1574181971419417, "grad/layer_24/attn": 0.017223525792360306, "grad/layer_24/mlp": 0.01276295818388462, "grad/layer_24/attn_mlp_ratio": 1.349493229489585, "grad/layer_27/attn": 0.006240163464099169, "grad/layer_27/mlp": 0.01232721097767353, "grad/layer_27/attn_mlp_ratio": 0.5062104822234335} {"step": 53100, "timestamp": 1778251939.3093328, "eos/sharpness": 69.26786899566649, "eos/L0_probe": 1.9768015146255493, "eos/L_plus": 2.373460054397583, "eos/L_minus": 2.2728216648101807, "eos/grad_norm": 0.21916934847831726, "eos/embed_grad_frac": 0.04864931479096413, "eos/time_s": 0.5986125469207764} {"step": 53100, "timestamp": 1778251939.3277128, "train/loss": 2.1947040796279906, "train/z_loss": 0.0013680480071343482, "train/perplexity": 8.977344084711468, "train/grad_norm": 0.21875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1910297.2931069576, "perf/iters_per_sec": 0.9109007325682438, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0978144645690917, "data/tokens_consumed": 111360868352, "data/tokens_consumed_B": 111.360868352, "train/loss_slope": -2.108751443496165e-06} {"step": 53100, "timestamp": 1778251940.687208, "geo/rankme_last": 438.6706237792969, "geo/layer_0/stable_rank_q_proj": 19.50262451171875, "geo/layer_0/stable_rank_k_proj": 16.367586135864258, "geo/layer_0/stable_rank_o_proj": 47.644371032714844, "geo/layer_0/stable_rank_gate_proj": 132.27806091308594, "geo/layer_0/stable_rank_down_proj": 54.12216567993164, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0617188885807991, "geo/layer_0/attn_entropy_mean": 6.1775922775268555, "geo/layer_0/attn_entropy_std": 0.3993349075317383, "geo/layer_7/stable_rank_q_proj": 42.90969467163086, "geo/layer_7/stable_rank_k_proj": 41.76567459106445, "geo/layer_7/stable_rank_o_proj": 93.43638610839844, "geo/layer_7/stable_rank_gate_proj": 85.5994644165039, "geo/layer_7/stable_rank_down_proj": 142.84677124023438, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4718201756477356, "geo/layer_7/attn_entropy_mean": 4.677255153656006, "geo/layer_7/attn_entropy_std": 0.8077722191810608, "geo/layer_14/stable_rank_q_proj": 52.21531677246094, "geo/layer_14/stable_rank_k_proj": 39.08074188232422, "geo/layer_14/stable_rank_o_proj": 45.051273345947266, "geo/layer_14/stable_rank_gate_proj": 73.12107849121094, "geo/layer_14/stable_rank_down_proj": 131.50975036621094, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3982500433921814, "geo/layer_14/attn_entropy_mean": 5.5531110763549805, "geo/layer_14/attn_entropy_std": 0.3916478753089905, "geo/layer_21/stable_rank_q_proj": 41.045963287353516, "geo/layer_21/stable_rank_k_proj": 30.43347930908203, "geo/layer_21/stable_rank_o_proj": 72.57747650146484, "geo/layer_21/stable_rank_gate_proj": 68.54793548583984, "geo/layer_21/stable_rank_down_proj": 53.29914474487305, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13821828365325928, "geo/layer_21/attn_entropy_mean": 5.723602294921875, "geo/layer_21/attn_entropy_std": 0.2863352596759796, "geo/layer_27/stable_rank_q_proj": 42.90343475341797, "geo/layer_27/stable_rank_k_proj": 31.492252349853516, "geo/layer_27/stable_rank_o_proj": 116.33114624023438, "geo/layer_27/stable_rank_gate_proj": 82.0285873413086, "geo/layer_27/stable_rank_down_proj": 129.64761352539062, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09106071293354034, "geo/layer_27/attn_entropy_mean": 4.253196716308594, "geo/layer_27/attn_entropy_std": 0.7074735164642334, "attnres/final_alpha/block_0": 0.2387361228466034, "attnres/block_norm/0": 1.7486960887908936, "attnres/final_alpha/block_1": 0.004816037602722645, "attnres/block_norm/1": 44540.90625, "attnres/final_alpha/block_2": 0.010514235123991966, "attnres/block_norm/2": 28018.689453125, "attnres/final_alpha/block_3": 0.01246186438947916, "attnres/block_norm/3": 54124.64453125, "attnres/final_alpha/block_4": 0.014399077743291855, "attnres/block_norm/4": 14304.966796875, "attnres/final_alpha/block_5": 0.6087099313735962, "attnres/block_norm/5": 6389.826171875, "attnres/final_alpha/block_6": 0.11036273837089539, "attnres/block_norm/6": 35755.7890625, "geo/tier1_time_s": 1.3554258346557617, "geo/step": 53100.0, "geo/rankme_slope": -3.920126644407763e-05} {"step": 53110, "timestamp": 1778251951.0598152, "train/loss": 2.179284119606018, "train/z_loss": 0.001383432629518211, "train/perplexity": 8.839975627768757, "train/grad_norm": 0.1259765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1788130.4566437635, "perf/iters_per_sec": 0.8526470454424684, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1728182315826416, "data/tokens_consumed": 111381839872, "data/tokens_consumed_B": 111.381839872, "train/loss_slope": 3.903288015759825e-06} {"step": 53120, "timestamp": 1778251961.433648, "train/loss": 2.152764987945557, "train/z_loss": 0.0013738964800722897, "train/perplexity": 8.608628273727764, "train/grad_norm": 0.134765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023049.8336367302, "perf/iters_per_sec": 0.9646653335746432, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0366289377212525, "data/tokens_consumed": 111402811392, "data/tokens_consumed_B": 111.402811392, "train/loss_slope": 2.720342603775565e-06} {"step": 53130, "timestamp": 1778251971.804694, "train/loss": 2.134289038181305, "train/z_loss": 0.0013693752698600292, "train/perplexity": 8.451036006567534, "train/grad_norm": 0.2890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023501.0325147596, "perf/iters_per_sec": 0.9648804819654272, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0363977909088136, "data/tokens_consumed": 111423782912, "data/tokens_consumed_B": 111.423782912, "train/loss_slope": 3.3253076589396947e-06} {"step": 53140, "timestamp": 1778251982.181032, "train/loss": 2.1253100633621216, "train/z_loss": 0.0013767399359494448, "train/perplexity": 8.375494019396937, "train/grad_norm": 0.1396484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022007.7497542226, "perf/iters_per_sec": 0.964168429257499, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0371631860733033, "data/tokens_consumed": 111444754432, "data/tokens_consumed_B": 111.444754432, "train/loss_slope": 1.2358964759238115e-06} {"step": 53150, "timestamp": 1778251992.59083, "grad/layer_0/attn": 0.0027623747009783983, "grad/layer_0/mlp": 0.0030349453445523977, "grad/layer_0/attn_mlp_ratio": 0.9101892444019867, "grad/layer_4/attn": 0.0018284881953150034, "grad/layer_4/mlp": 0.0025531896390020847, "grad/layer_4/attn_mlp_ratio": 0.7161583674661284, "grad/layer_8/attn": 0.005692953709512949, "grad/layer_8/mlp": 0.003636061679571867, "grad/layer_8/attn_mlp_ratio": 1.565692239196022, "grad/layer_12/attn": 0.004489155951887369, "grad/layer_12/mlp": 0.006236078217625618, "grad/layer_12/attn_mlp_ratio": 0.7198684370591119, "grad/layer_16/attn": 0.004619384650141001, "grad/layer_16/mlp": 0.0044871848076581955, "grad/layer_16/attn_mlp_ratio": 1.0294616213067536, "grad/layer_20/attn": 0.0036297945771366358, "grad/layer_20/mlp": 0.005936746951192617, "grad/layer_20/attn_mlp_ratio": 0.6114113580782351, "grad/layer_24/attn": 0.006717137526720762, "grad/layer_24/mlp": 0.009100663475692272, "grad/layer_24/attn_mlp_ratio": 0.7380931589057012, "grad/layer_27/attn": 0.004243765491992235, "grad/layer_27/mlp": 0.008447973988950253, "grad/layer_27/attn_mlp_ratio": 0.5023412059872412} {"step": 53150, "timestamp": 1778251992.6079457, "train/loss": 2.145250344276428, "train/z_loss": 0.0013804594054818153, "train/perplexity": 8.544179956123703, "train/grad_norm": 0.134765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2018268.76961685, "perf/iters_per_sec": 0.9623855445942163, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0390846014022828, "data/tokens_consumed": 111465725952, "data/tokens_consumed_B": 111.465725952, "train/loss_slope": 5.399189945315589e-07} {"step": 53160, "timestamp": 1778252002.9967606, "train/loss": 2.176353168487549, "train/z_loss": 0.0013831668067723513, "train/perplexity": 8.814104024039029, "train/grad_norm": 0.12109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019834.3574709864, "perf/iters_per_sec": 0.9631320750575001, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0382791996002196, "data/tokens_consumed": 111486697472, "data/tokens_consumed_B": 111.486697472, "train/loss_slope": 3.973938872998005e-06} {"step": 53170, "timestamp": 1778252013.3707986, "train/loss": 2.2101055145263673, "train/z_loss": 0.0013732307706959546, "train/perplexity": 9.116678284283873, "train/grad_norm": 0.140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022753.8596989436, "perf/iters_per_sec": 0.9645242022032469, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0367806196212768, "data/tokens_consumed": 111507668992, "data/tokens_consumed_B": 111.507668992, "train/loss_slope": 4.952846058417177e-06} {"step": 53175, "timestamp": 1778252019.1324227, "eos/sharpness": 71.13587856292723, "eos/L0_probe": 1.9734368324279785, "eos/L_plus": 2.3086135387420654, "eos/L_minus": 2.349618911743164, "eos/grad_norm": 0.252867192029953, "eos/embed_grad_frac": 0.04556879773736, "eos/time_s": 0.5851976871490479} {"step": 53175, "timestamp": 1778252020.5065303, "geo/rankme_last": 438.39898681640625, "geo/layer_0/stable_rank_q_proj": 19.512165069580078, "geo/layer_0/stable_rank_k_proj": 16.322866439819336, "geo/layer_0/stable_rank_o_proj": 47.56150817871094, "geo/layer_0/stable_rank_gate_proj": 132.26116943359375, "geo/layer_0/stable_rank_down_proj": 54.13656234741211, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06437741965055466, "geo/layer_0/attn_entropy_mean": 6.176631927490234, "geo/layer_0/attn_entropy_std": 0.39954662322998047, "geo/layer_7/stable_rank_q_proj": 42.93141174316406, "geo/layer_7/stable_rank_k_proj": 41.65031051635742, "geo/layer_7/stable_rank_o_proj": 93.31763458251953, "geo/layer_7/stable_rank_gate_proj": 85.77827453613281, "geo/layer_7/stable_rank_down_proj": 143.16297912597656, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4808840751647949, "geo/layer_7/attn_entropy_mean": 4.675501823425293, "geo/layer_7/attn_entropy_std": 0.8030399680137634, "geo/layer_14/stable_rank_q_proj": 52.14753341674805, "geo/layer_14/stable_rank_k_proj": 38.905757904052734, "geo/layer_14/stable_rank_o_proj": 45.03119659423828, "geo/layer_14/stable_rank_gate_proj": 73.04094696044922, "geo/layer_14/stable_rank_down_proj": 131.6943359375, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3939039409160614, "geo/layer_14/attn_entropy_mean": 5.549939155578613, "geo/layer_14/attn_entropy_std": 0.3780238926410675, "geo/layer_21/stable_rank_q_proj": 40.992271423339844, "geo/layer_21/stable_rank_k_proj": 30.324565887451172, "geo/layer_21/stable_rank_o_proj": 72.5687484741211, "geo/layer_21/stable_rank_gate_proj": 68.6260986328125, "geo/layer_21/stable_rank_down_proj": 53.23269271850586, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13987895846366882, "geo/layer_21/attn_entropy_mean": 5.716007232666016, "geo/layer_21/attn_entropy_std": 0.2930423319339752, "geo/layer_27/stable_rank_q_proj": 42.877906799316406, "geo/layer_27/stable_rank_k_proj": 31.442310333251953, "geo/layer_27/stable_rank_o_proj": 116.41446685791016, "geo/layer_27/stable_rank_gate_proj": 81.98329162597656, "geo/layer_27/stable_rank_down_proj": 129.64068603515625, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0940634161233902, "geo/layer_27/attn_entropy_mean": 4.251579284667969, "geo/layer_27/attn_entropy_std": 0.6981107592582703, "attnres/final_alpha/block_0": 0.23977211117744446, "attnres/block_norm/0": 1.7487640380859375, "attnres/final_alpha/block_1": 0.004829557612538338, "attnres/block_norm/1": 44516.8671875, "attnres/final_alpha/block_2": 0.010613911785185337, "attnres/block_norm/2": 27987.017578125, "attnres/final_alpha/block_3": 0.01261964999139309, "attnres/block_norm/3": 53917.4296875, "attnres/final_alpha/block_4": 0.0145537368953228, "attnres/block_norm/4": 14349.365234375, "attnres/final_alpha/block_5": 0.6033721566200256, "attnres/block_norm/5": 6468.1591796875, "attnres/final_alpha/block_6": 0.11423888802528381, "attnres/block_norm/6": 35558.2890625, "geo/tier1_time_s": 1.3563547134399414, "geo/step": 53175.0, "geo/rankme_slope": -5.6231789590836336e-05} {"step": 53180, "timestamp": 1778252025.693943, "train/loss": 2.1171767354011535, "train/z_loss": 0.0013864549691788852, "train/perplexity": 8.30764965393679, "train/grad_norm": 0.2158203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1702780.6548059324, "perf/iters_per_sec": 0.8119490884809172, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.231604313850403, "data/tokens_consumed": 111528640512, "data/tokens_consumed_B": 111.528640512, "train/loss_slope": 2.1159432508764406e-06} {"step": 53190, "timestamp": 1778252036.064654, "train/loss": 2.164858412742615, "train/z_loss": 0.0013798677711747587, "train/perplexity": 8.71336812761311, "train/grad_norm": 0.234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023171.932997284, "perf/iters_per_sec": 0.9647235550867481, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0365663766860962, "data/tokens_consumed": 111549612032, "data/tokens_consumed_B": 111.549612032, "train/loss_slope": 6.564767447241943e-07} {"step": 53200, "timestamp": 1778252046.4291098, "grad/layer_0/attn": 0.0026741926558315754, "grad/layer_0/mlp": 0.0027644759975373745, "grad/layer_0/attn_mlp_ratio": 0.967341572681267, "grad/layer_4/attn": 0.0027822102420032024, "grad/layer_4/mlp": 0.00260829646140337, "grad/layer_4/attn_mlp_ratio": 1.0666771114808586, "grad/layer_8/attn": 0.004077647812664509, "grad/layer_8/mlp": 0.0035895793698728085, "grad/layer_8/attn_mlp_ratio": 1.1359681118326082, "grad/layer_12/attn": 0.004954692907631397, "grad/layer_12/mlp": 0.006135673262178898, "grad/layer_12/attn_mlp_ratio": 0.8075222742743738, "grad/layer_16/attn": 0.004482703749090433, "grad/layer_16/mlp": 0.004548986908048391, "grad/layer_16/attn_mlp_ratio": 0.9854290067567384, "grad/layer_20/attn": 0.003812612034380436, "grad/layer_20/mlp": 0.005625532008707523, "grad/layer_20/attn_mlp_ratio": 0.6777335833669947, "grad/layer_24/attn": 0.009911805391311646, "grad/layer_24/mlp": 0.008968519978225231, "grad/layer_24/attn_mlp_ratio": 1.1051773653689674, "grad/layer_27/attn": 0.004017490427941084, "grad/layer_27/mlp": 0.008685396984219551, "grad/layer_27/attn_mlp_ratio": 0.4625569089109857} {"step": 53200, "timestamp": 1778252046.4433246, "train/loss": 2.1626095294952394, "train/z_loss": 0.0013643627637065947, "train/perplexity": 8.693794797319304, "train/grad_norm": 0.1552734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021604.932380719, "perf/iters_per_sec": 0.9639763509658428, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373698472976685, "data/tokens_consumed": 111570583552, "data/tokens_consumed_B": 111.570583552, "train/loss_slope": 1.2598484727737767e-06} {"step": 53210, "timestamp": 1778252056.8131967, "train/loss": 2.2219111204147337, "train/z_loss": 0.0013600452686659992, "train/perplexity": 9.224944008923037, "train/grad_norm": 0.19140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023649.0248887104, "perf/iters_per_sec": 0.9649510502284576, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036321997642517, "data/tokens_consumed": 111591555072, "data/tokens_consumed_B": 111.591555072, "train/loss_slope": 6.0088768614830044e-06} {"step": 53220, "timestamp": 1778252067.1852582, "train/loss": 2.1170900464057922, "train/z_loss": 0.0013790528289973735, "train/perplexity": 8.306929503349496, "train/grad_norm": 0.11669921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022958.9201340538, "perf/iters_per_sec": 0.9646219826383847, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0366755247116088, "data/tokens_consumed": 111612526592, "data/tokens_consumed_B": 111.612526592, "train/loss_slope": 2.6056101470247005e-06} {"step": 53230, "timestamp": 1778252077.565968, "train/loss": 2.133562684059143, "train/z_loss": 0.0013799894368276, "train/perplexity": 8.444899790530314, "train/grad_norm": 0.1708984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021144.9187419182, "perf/iters_per_sec": 0.9637569993695823, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0376059532165527, "data/tokens_consumed": 111633498112, "data/tokens_consumed_B": 111.633498112, "train/loss_slope": -6.691101897128763e-08} {"step": 53240, "timestamp": 1778252087.9431207, "train/loss": 2.171125817298889, "train/z_loss": 0.0013638725504279137, "train/perplexity": 8.768149820913433, "train/grad_norm": 0.13671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022313.2692920978, "perf/iters_per_sec": 0.964314112325715, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0370064973831177, "data/tokens_consumed": 111654469632, "data/tokens_consumed_B": 111.654469632, "train/loss_slope": 1.9568788872944054e-06} {"step": 53250, "timestamp": 1778252098.310008, "grad/layer_0/attn": 0.002869861200451851, "grad/layer_0/mlp": 0.002992741297930479, "grad/layer_0/attn_mlp_ratio": 0.9589405895331947, "grad/layer_4/attn": 0.0027114644180983305, "grad/layer_4/mlp": 0.002485099947080016, "grad/layer_4/attn_mlp_ratio": 1.0910886349562834, "grad/layer_8/attn": 0.0074002849869430065, "grad/layer_8/mlp": 0.003654856001958251, "grad/layer_8/attn_mlp_ratio": 2.024781490843915, "grad/layer_12/attn": 0.004696172196418047, "grad/layer_12/mlp": 0.006746766157448292, "grad/layer_12/attn_mlp_ratio": 0.6960626790995715, "grad/layer_16/attn": 0.003903292352333665, "grad/layer_16/mlp": 0.0046844761818647385, "grad/layer_16/attn_mlp_ratio": 0.8332398580914344, "grad/layer_20/attn": 0.0035091787576675415, "grad/layer_20/mlp": 0.005560952704399824, "grad/layer_20/attn_mlp_ratio": 0.631039118852273, "grad/layer_24/attn": 0.008512605912983418, "grad/layer_24/mlp": 0.007699592970311642, "grad/layer_24/attn_mlp_ratio": 1.1055916637733259, "grad/layer_27/attn": 0.0040321568958461285, "grad/layer_27/mlp": 0.006490711122751236, "grad/layer_27/attn_mlp_ratio": 0.6212195794064317} {"step": 53250, "timestamp": 1778252098.8958392, "eos/sharpness": 26.626110076904293, "eos/L0_probe": 1.9783896207809448, "eos/L_plus": 2.1337273120880127, "eos/L_minus": 2.08931303024292, "eos/grad_norm": 0.08865554630756378, "eos/embed_grad_frac": 0.25611013174057007, "eos/time_s": 0.5830140113830566} {"step": 53250, "timestamp": 1778252098.913651, "train/loss": 2.135979402065277, "train/z_loss": 0.0013901468133553862, "train/perplexity": 8.46533341312065, "train/grad_norm": 0.0888671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1912404.2502861074, "perf/iters_per_sec": 0.9119054080420053, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0966049671173095, "data/tokens_consumed": 111675441152, "data/tokens_consumed_B": 111.675441152, "train/loss_slope": 1.3213767255708089e-06} {"step": 53250, "timestamp": 1778252100.2733934, "geo/rankme_last": 439.464111328125, "geo/layer_0/stable_rank_q_proj": 19.525653839111328, "geo/layer_0/stable_rank_k_proj": 16.347742080688477, "geo/layer_0/stable_rank_o_proj": 47.5629768371582, "geo/layer_0/stable_rank_gate_proj": 132.29421997070312, "geo/layer_0/stable_rank_down_proj": 54.132991790771484, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06170441582798958, "geo/layer_0/attn_entropy_mean": 6.175634860992432, "geo/layer_0/attn_entropy_std": 0.39762815833091736, "geo/layer_7/stable_rank_q_proj": 42.97813415527344, "geo/layer_7/stable_rank_k_proj": 41.66973114013672, "geo/layer_7/stable_rank_o_proj": 93.09751892089844, "geo/layer_7/stable_rank_gate_proj": 85.97479248046875, "geo/layer_7/stable_rank_down_proj": 143.17405700683594, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4577316641807556, "geo/layer_7/attn_entropy_mean": 4.679413318634033, "geo/layer_7/attn_entropy_std": 0.7847273945808411, "geo/layer_14/stable_rank_q_proj": 52.08002471923828, "geo/layer_14/stable_rank_k_proj": 38.94283676147461, "geo/layer_14/stable_rank_o_proj": 44.99600601196289, "geo/layer_14/stable_rank_gate_proj": 72.97740173339844, "geo/layer_14/stable_rank_down_proj": 131.56900024414062, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.391320139169693, "geo/layer_14/attn_entropy_mean": 5.531466484069824, "geo/layer_14/attn_entropy_std": 0.3934749662876129, "geo/layer_21/stable_rank_q_proj": 40.92394256591797, "geo/layer_21/stable_rank_k_proj": 30.305622100830078, "geo/layer_21/stable_rank_o_proj": 72.4510498046875, "geo/layer_21/stable_rank_gate_proj": 68.60900115966797, "geo/layer_21/stable_rank_down_proj": 53.16546630859375, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14547990262508392, "geo/layer_21/attn_entropy_mean": 5.700117111206055, "geo/layer_21/attn_entropy_std": 0.2910766005516052, "geo/layer_27/stable_rank_q_proj": 42.88898849487305, "geo/layer_27/stable_rank_k_proj": 31.43971824645996, "geo/layer_27/stable_rank_o_proj": 116.23635864257812, "geo/layer_27/stable_rank_gate_proj": 81.95906829833984, "geo/layer_27/stable_rank_down_proj": 129.70303344726562, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08701097965240479, "geo/layer_27/attn_entropy_mean": 4.239751815795898, "geo/layer_27/attn_entropy_std": 0.6909563541412354, "attnres/final_alpha/block_0": 0.23873800039291382, "attnres/block_norm/0": 1.7486965656280518, "attnres/final_alpha/block_1": 0.0048500364646315575, "attnres/block_norm/1": 44531.1953125, "attnres/final_alpha/block_2": 0.010571526363492012, "attnres/block_norm/2": 27985.765625, "attnres/final_alpha/block_3": 0.012652399018406868, "attnres/block_norm/3": 53899.7421875, "attnres/final_alpha/block_4": 0.01449181605130434, "attnres/block_norm/4": 14329.70703125, "attnres/final_alpha/block_5": 0.6051093339920044, "attnres/block_norm/5": 6387.16552734375, "attnres/final_alpha/block_6": 0.11358688771724701, "attnres/block_norm/6": 35376.03125, "geo/tier1_time_s": 1.3558847904205322, "geo/step": 53250.0, "geo/rankme_slope": -2.491131218112245e-05} {"step": 53260, "timestamp": 1778252110.6515534, "train/loss": 2.172518181800842, "train/z_loss": 0.0013737717061303556, "train/perplexity": 8.78036678473152, "train/grad_norm": 0.1484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1787217.5114189922, "perf/iters_per_sec": 0.8522117192358933, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.173417329788208, "data/tokens_consumed": 111696412672, "data/tokens_consumed_B": 111.696412672, "train/loss_slope": -2.4727755468456085e-06} {"step": 53270, "timestamp": 1778252121.0245473, "train/loss": 2.126266825199127, "train/z_loss": 0.0013893366674892604, "train/perplexity": 8.383511207098808, "train/grad_norm": 0.1708984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023057.3713563818, "perf/iters_per_sec": 0.9646689278394612, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036625075340271, "data/tokens_consumed": 111717384192, "data/tokens_consumed_B": 111.717384192, "train/loss_slope": -4.708551208857761e-06} {"step": 53280, "timestamp": 1778252131.3958192, "train/loss": 2.1144375681877134, "train/z_loss": 0.0013826575828716158, "train/perplexity": 8.284924750248353, "train/grad_norm": 0.1787109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022981.1592226415, "perf/iters_per_sec": 0.9646325870621879, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0366641283035278, "data/tokens_consumed": 111738355712, "data/tokens_consumed_B": 111.738355712, "train/loss_slope": -5.644216896569892e-06} {"step": 53290, "timestamp": 1778252141.7831614, "train/loss": 2.1923163175582885, "train/z_loss": 0.0013563896995037795, "train/perplexity": 8.955933894411832, "train/grad_norm": 0.130859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020284.7726660473, "perf/iters_per_sec": 0.9633468497591244, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03804771900177, "data/tokens_consumed": 111759327232, "data/tokens_consumed_B": 111.759327232, "train/loss_slope": -8.999895698989228e-07} {"step": 53300, "timestamp": 1778252152.1500664, "grad/layer_0/attn": 0.002922410611063242, "grad/layer_0/mlp": 0.0028734449297189713, "grad/layer_0/attn_mlp_ratio": 1.0170407231869194, "grad/layer_4/attn": 0.002542971633374691, "grad/layer_4/mlp": 0.0025127343833446503, "grad/layer_4/attn_mlp_ratio": 1.0120335635262954, "grad/layer_8/attn": 0.004171847831457853, "grad/layer_8/mlp": 0.003575874725356698, "grad/layer_8/attn_mlp_ratio": 1.166664951993029, "grad/layer_12/attn": 0.004653246607631445, "grad/layer_12/mlp": 0.0067274561151862144, "grad/layer_12/attn_mlp_ratio": 0.6916799543232173, "grad/layer_16/attn": 0.004029379226267338, "grad/layer_16/mlp": 0.00426257262006402, "grad/layer_16/attn_mlp_ratio": 0.9452927822910715, "grad/layer_20/attn": 0.0028698144014924765, "grad/layer_20/mlp": 0.005346613936126232, "grad/layer_20/attn_mlp_ratio": 0.5367536130533439, "grad/layer_24/attn": 0.006970148533582687, "grad/layer_24/mlp": 0.007367388810962439, "grad/layer_24/attn_mlp_ratio": 0.9460812531847382, "grad/layer_27/attn": 0.005998753476887941, "grad/layer_27/mlp": 0.006231216713786125, "grad/layer_27/attn_mlp_ratio": 0.9626937492555427} {"step": 53300, "timestamp": 1778252152.164568, "train/loss": 2.157396674156189, "train/z_loss": 0.001387137349229306, "train/perplexity": 8.648593219694519, "train/grad_norm": 0.087890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021333.8134773052, "perf/iters_per_sec": 0.9638470713984038, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037508988380432, "data/tokens_consumed": 111780298752, "data/tokens_consumed_B": 111.780298752, "train/loss_slope": -4.6226757766604706e-06} {"step": 53310, "timestamp": 1778252162.5372045, "train/loss": 2.150155186653137, "train/z_loss": 0.001384401333052665, "train/perplexity": 8.586190755999688, "train/grad_norm": 0.09912109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022750.1384780428, "perf/iters_per_sec": 0.9645224277868475, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0367825269699096, "data/tokens_consumed": 111801270272, "data/tokens_consumed_B": 111.801270272, "train/loss_slope": -6.0300300473963895e-06} {"step": 53320, "timestamp": 1778252172.9156797, "train/loss": 2.135640835762024, "train/z_loss": 0.0013866568100638688, "train/perplexity": 8.462467821604903, "train/grad_norm": 0.2177734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022069.1065861676, "perf/iters_per_sec": 0.96419768647488, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0371317148208619, "data/tokens_consumed": 111822241792, "data/tokens_consumed_B": 111.822241792, "train/loss_slope": -9.033638787920539e-06} {"step": 53325, "timestamp": 1778252178.6888654, "eos/sharpness": 60.08856296539305, "eos/L0_probe": 1.9809941053390503, "eos/L_plus": 2.3217244148254395, "eos/L_minus": 2.241149425506592, "eos/grad_norm": 0.1655900776386261, "eos/embed_grad_frac": 0.08295595645904541, "eos/time_s": 0.5957028865814209} {"step": 53325, "timestamp": 1778252180.0668736, "geo/rankme_last": 439.42681884765625, "geo/layer_0/stable_rank_q_proj": 19.52617835998535, "geo/layer_0/stable_rank_k_proj": 16.35009765625, "geo/layer_0/stable_rank_o_proj": 47.565284729003906, "geo/layer_0/stable_rank_gate_proj": 132.1350555419922, "geo/layer_0/stable_rank_down_proj": 54.1630859375, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06560645997524261, "geo/layer_0/attn_entropy_mean": 6.173197269439697, "geo/layer_0/attn_entropy_std": 0.40513846278190613, "geo/layer_7/stable_rank_q_proj": 43.03315353393555, "geo/layer_7/stable_rank_k_proj": 41.628700256347656, "geo/layer_7/stable_rank_o_proj": 93.19780731201172, "geo/layer_7/stable_rank_gate_proj": 85.90819549560547, "geo/layer_7/stable_rank_down_proj": 143.0005340576172, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.476208359003067, "geo/layer_7/attn_entropy_mean": 4.638472557067871, "geo/layer_7/attn_entropy_std": 0.8060283064842224, "geo/layer_14/stable_rank_q_proj": 52.200382232666016, "geo/layer_14/stable_rank_k_proj": 38.92988204956055, "geo/layer_14/stable_rank_o_proj": 45.066444396972656, "geo/layer_14/stable_rank_gate_proj": 72.80890655517578, "geo/layer_14/stable_rank_down_proj": 131.62942504882812, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38141873478889465, "geo/layer_14/attn_entropy_mean": 5.516819000244141, "geo/layer_14/attn_entropy_std": 0.38306131958961487, "geo/layer_21/stable_rank_q_proj": 40.97718811035156, "geo/layer_21/stable_rank_k_proj": 30.272430419921875, "geo/layer_21/stable_rank_o_proj": 72.47721099853516, "geo/layer_21/stable_rank_gate_proj": 68.83585357666016, "geo/layer_21/stable_rank_down_proj": 53.141239166259766, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14109963178634644, "geo/layer_21/attn_entropy_mean": 5.703813552856445, "geo/layer_21/attn_entropy_std": 0.30024242401123047, "geo/layer_27/stable_rank_q_proj": 42.79535675048828, "geo/layer_27/stable_rank_k_proj": 31.420974731445312, "geo/layer_27/stable_rank_o_proj": 115.82927703857422, "geo/layer_27/stable_rank_gate_proj": 81.90676879882812, "geo/layer_27/stable_rank_down_proj": 129.8399200439453, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09088729321956635, "geo/layer_27/attn_entropy_mean": 4.2292561531066895, "geo/layer_27/attn_entropy_std": 0.689515233039856, "attnres/final_alpha/block_0": 0.2384577989578247, "attnres/block_norm/0": 1.748940110206604, "attnres/final_alpha/block_1": 0.004802128300070763, "attnres/block_norm/1": 44531.046875, "attnres/final_alpha/block_2": 0.010412123054265976, "attnres/block_norm/2": 27867.71875, "attnres/final_alpha/block_3": 0.012496182695031166, "attnres/block_norm/3": 53391.1796875, "attnres/final_alpha/block_4": 0.014294546097517014, "attnres/block_norm/4": 14321.8251953125, "attnres/final_alpha/block_5": 0.6083227396011353, "attnres/block_norm/5": 6405.498046875, "attnres/final_alpha/block_6": 0.11121445894241333, "attnres/block_norm/6": 35578.96875, "geo/tier1_time_s": 1.3592393398284912, "geo/step": 53325.0, "geo/rankme_slope": 2.04265299869948e-05} {"step": 53330, "timestamp": 1778252185.256208, "train/loss": 2.156908929347992, "train/z_loss": 0.0013843529159203172, "train/perplexity": 8.644375941814706, "train/grad_norm": 0.1083984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1700376.1360431893, "perf/iters_per_sec": 0.8108025245872447, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2333459377288818, "data/tokens_consumed": 111843213312, "data/tokens_consumed_B": 111.843213312, "train/loss_slope": -7.1018160408837265e-06} {"step": 53340, "timestamp": 1778252195.6304631, "train/loss": 2.0773897886276247, "train/z_loss": 0.0013959952513687313, "train/perplexity": 7.983602802834237, "train/grad_norm": 0.08544921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022407.4727230216, "perf/iters_per_sec": 0.9643590320220097, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0369581937789918, "data/tokens_consumed": 111864184832, "data/tokens_consumed_B": 111.864184832, "train/loss_slope": -1.243499424090586e-05} {"step": 53350, "timestamp": 1778252205.9983113, "grad/layer_0/attn": 0.0029040409717708826, "grad/layer_0/mlp": 0.0028423776384443045, "grad/layer_0/attn_mlp_ratio": 1.021694242989789, "grad/layer_4/attn": 0.0017793935257941484, "grad/layer_4/mlp": 0.002435017842799425, "grad/layer_4/attn_mlp_ratio": 0.7307517100873859, "grad/layer_8/attn": 0.0033190681133419275, "grad/layer_8/mlp": 0.0035777625162154436, "grad/layer_8/attn_mlp_ratio": 0.9276937766354205, "grad/layer_12/attn": 0.00494672916829586, "grad/layer_12/mlp": 0.006958790123462677, "grad/layer_12/attn_mlp_ratio": 0.7108605101526368, "grad/layer_16/attn": 0.004674023948609829, "grad/layer_16/mlp": 0.004853521939367056, "grad/layer_16/attn_mlp_ratio": 0.9630169412436346, "grad/layer_20/attn": 0.003407686483114958, "grad/layer_20/mlp": 0.005525698885321617, "grad/layer_20/attn_mlp_ratio": 0.6166978136462165, "grad/layer_24/attn": 0.00677594356238842, "grad/layer_24/mlp": 0.008135193027555943, "grad/layer_24/attn_mlp_ratio": 0.8329173574793938, "grad/layer_27/attn": 0.004149504471570253, "grad/layer_27/mlp": 0.009024325758218765, "grad/layer_27/attn_mlp_ratio": 0.4598132355550034} {"step": 53350, "timestamp": 1778252206.0128355, "train/loss": 2.170511817932129, "train/z_loss": 0.001368265924975276, "train/perplexity": 8.762767834912815, "train/grad_norm": 0.1484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021395.6868495927, "perf/iters_per_sec": 0.9638765749214138, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037477231025696, "data/tokens_consumed": 111885156352, "data/tokens_consumed_B": 111.885156352, "train/loss_slope": -1.3356079019442911e-05} {"step": 53360, "timestamp": 1778252216.8246248, "train/loss": 2.220882272720337, "train/z_loss": 0.0013647082028910517, "train/perplexity": 9.215457827303345, "train/grad_norm": 0.1064453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1941083.192014438, "perf/iters_per_sec": 0.9255805931160155, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.080402946472168, "data/tokens_consumed": 111906127872, "data/tokens_consumed_B": 111.906127872, "train/loss_slope": -8.277628559841576e-06} {"step": 53370, "timestamp": 1778252227.1833582, "train/loss": 2.191436219215393, "train/z_loss": 0.001386815484147519, "train/perplexity": 8.948055259327667, "train/grad_norm": 0.130859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025416.2230550074, "perf/iters_per_sec": 0.9657937159800565, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354177951812744, "data/tokens_consumed": 111927099392, "data/tokens_consumed_B": 111.927099392, "train/loss_slope": -5.862384338905373e-06} {"step": 53380, "timestamp": 1778252237.5391705, "train/loss": 2.1337777614593505, "train/z_loss": 0.0013862108462490141, "train/perplexity": 8.446716292959582, "train/grad_norm": 0.1357421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026480.0317819482, "perf/iters_per_sec": 0.9663009795102826, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348742485046387, "data/tokens_consumed": 111948070912, "data/tokens_consumed_B": 111.948070912, "train/loss_slope": -7.450501496034296e-06} {"step": 53390, "timestamp": 1778252247.909413, "train/loss": 2.110055136680603, "train/z_loss": 0.0013866589055396617, "train/perplexity": 8.248696077809685, "train/grad_norm": 0.162109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023649.9560196681, "perf/iters_per_sec": 0.9649514942262974, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0363215208053589, "data/tokens_consumed": 111969042432, "data/tokens_consumed_B": 111.969042432, "train/loss_slope": -8.670538222149554e-06} {"step": 53400, "timestamp": 1778252258.272387, "grad/layer_0/attn": 0.003471145872026682, "grad/layer_0/mlp": 0.00318591995164752, "grad/layer_0/attn_mlp_ratio": 1.0895269861626515, "grad/layer_4/attn": 0.0019291265634819865, "grad/layer_4/mlp": 0.0025443925987929106, "grad/layer_4/attn_mlp_ratio": 0.7581874308935042, "grad/layer_8/attn": 0.003447231836616993, "grad/layer_8/mlp": 0.003662656992673874, "grad/layer_8/attn_mlp_ratio": 0.9411833402346675, "grad/layer_12/attn": 0.004301434848457575, "grad/layer_12/mlp": 0.007157659158110619, "grad/layer_12/attn_mlp_ratio": 0.6009555209803337, "grad/layer_16/attn": 0.003928609192371368, "grad/layer_16/mlp": 0.0047259158454835415, "grad/layer_16/attn_mlp_ratio": 0.8312905345102166, "grad/layer_20/attn": 0.0030653721187263727, "grad/layer_20/mlp": 0.006026849150657654, "grad/layer_20/attn_mlp_ratio": 0.508619345073527, "grad/layer_24/attn": 0.005729610100388527, "grad/layer_24/mlp": 0.00858536921441555, "grad/layer_24/attn_mlp_ratio": 0.6673690892677191, "grad/layer_27/attn": 0.005522505380213261, "grad/layer_27/mlp": 0.007761965971440077, "grad/layer_27/attn_mlp_ratio": 0.7114828033754431} {"step": 53400, "timestamp": 1778252258.8710098, "eos/sharpness": 6.493210792541503, "eos/L0_probe": 1.9762340784072876, "eos/L_plus": 2.015418529510498, "eos/L_minus": 2.001981735229492, "eos/grad_norm": 0.09745719283819199, "eos/embed_grad_frac": 0.2598934769630432, "eos/time_s": 0.5957794189453125} {"step": 53400, "timestamp": 1778252258.8925583, "train/loss": 2.1002934336662293, "train/z_loss": 0.0013698261929675937, "train/perplexity": 8.168566493345692, "train/grad_norm": 0.09765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1910148.9056093919, "perf/iters_per_sec": 0.9108299758955917, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0978997468948364, "data/tokens_consumed": 111990013952, "data/tokens_consumed_B": 111.990013952, "train/loss_slope": -1.1137073454183027e-05} {"step": 53400, "timestamp": 1778252260.2577596, "geo/rankme_last": 438.338134765625, "geo/layer_0/stable_rank_q_proj": 19.51544761657715, "geo/layer_0/stable_rank_k_proj": 16.340686798095703, "geo/layer_0/stable_rank_o_proj": 47.52621078491211, "geo/layer_0/stable_rank_gate_proj": 132.46511840820312, "geo/layer_0/stable_rank_down_proj": 54.17195510864258, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06105189397931099, "geo/layer_0/attn_entropy_mean": 6.17575216293335, "geo/layer_0/attn_entropy_std": 0.4008029103279114, "geo/layer_7/stable_rank_q_proj": 42.98285675048828, "geo/layer_7/stable_rank_k_proj": 41.57071304321289, "geo/layer_7/stable_rank_o_proj": 93.36054992675781, "geo/layer_7/stable_rank_gate_proj": 85.87163543701172, "geo/layer_7/stable_rank_down_proj": 142.8192901611328, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4702528119087219, "geo/layer_7/attn_entropy_mean": 4.60965633392334, "geo/layer_7/attn_entropy_std": 0.7949386835098267, "geo/layer_14/stable_rank_q_proj": 52.122772216796875, "geo/layer_14/stable_rank_k_proj": 38.948143005371094, "geo/layer_14/stable_rank_o_proj": 44.987632751464844, "geo/layer_14/stable_rank_gate_proj": 72.70183563232422, "geo/layer_14/stable_rank_down_proj": 131.45077514648438, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38813379406929016, "geo/layer_14/attn_entropy_mean": 5.520713806152344, "geo/layer_14/attn_entropy_std": 0.4018045961856842, "geo/layer_21/stable_rank_q_proj": 40.93751907348633, "geo/layer_21/stable_rank_k_proj": 30.34119987487793, "geo/layer_21/stable_rank_o_proj": 72.51190948486328, "geo/layer_21/stable_rank_gate_proj": 68.66817474365234, "geo/layer_21/stable_rank_down_proj": 53.18815231323242, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1451244056224823, "geo/layer_21/attn_entropy_mean": 5.69611120223999, "geo/layer_21/attn_entropy_std": 0.299215704202652, "geo/layer_27/stable_rank_q_proj": 42.77820587158203, "geo/layer_27/stable_rank_k_proj": 31.448854446411133, "geo/layer_27/stable_rank_o_proj": 115.83942413330078, "geo/layer_27/stable_rank_gate_proj": 81.9101791381836, "geo/layer_27/stable_rank_down_proj": 129.69808959960938, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09340119361877441, "geo/layer_27/attn_entropy_mean": 4.235734462738037, "geo/layer_27/attn_entropy_std": 0.6760202646255493, "attnres/final_alpha/block_0": 0.2380523681640625, "attnres/block_norm/0": 1.7490878105163574, "attnres/final_alpha/block_1": 0.004815520718693733, "attnres/block_norm/1": 44622.6171875, "attnres/final_alpha/block_2": 0.010319650173187256, "attnres/block_norm/2": 27824.93359375, "attnres/final_alpha/block_3": 0.012287680059671402, "attnres/block_norm/3": 54131.72265625, "attnres/final_alpha/block_4": 0.01402481459081173, "attnres/block_norm/4": 14338.4853515625, "attnres/final_alpha/block_5": 0.6076356172561646, "attnres/block_norm/5": 6367.9296875, "attnres/final_alpha/block_6": 0.11286433786153793, "attnres/block_norm/6": 35831.2109375, "geo/tier1_time_s": 1.360968828201294, "geo/step": 53400.0, "geo/rankme_slope": -3.060835662389956e-05} {"step": 53410, "timestamp": 1778252270.6436331, "train/loss": 2.1693090558052064, "train/z_loss": 0.001364397071301937, "train/perplexity": 8.752234645364506, "train/grad_norm": 0.1650390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1785257.6947797332, "perf/iters_per_sec": 0.8512772058390299, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.174705481529236, "data/tokens_consumed": 112010985472, "data/tokens_consumed_B": 112.010985472, "train/loss_slope": -1.1193259929058382e-05} {"step": 53420, "timestamp": 1778252281.0155866, "train/loss": 2.1158361554145815, "train/z_loss": 0.0013727915240451694, "train/perplexity": 8.296520046806362, "train/grad_norm": 0.123046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022897.881466475, "perf/iters_per_sec": 0.964592877133596, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036706805229187, "data/tokens_consumed": 112031956992, "data/tokens_consumed_B": 112.031956992, "train/loss_slope": -1.4517802488256619e-05} {"step": 53430, "timestamp": 1778252291.3848548, "train/loss": 2.1576489210128784, "train/z_loss": 0.001389811048284173, "train/perplexity": 8.650775075320515, "train/grad_norm": 0.0986328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023440.240410499, "perf/iters_per_sec": 0.9648514940311904, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0364289283752441, "data/tokens_consumed": 112052928512, "data/tokens_consumed_B": 112.052928512, "train/loss_slope": -1.6313263950067462e-05} {"step": 53440, "timestamp": 1778252301.726324, "train/loss": 2.150452768802643, "train/z_loss": 0.001364576444029808, "train/perplexity": 8.588746233314282, "train/grad_norm": 0.1005859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029227.7269600078, "perf/iters_per_sec": 0.967611182670597, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033472967147827, "data/tokens_consumed": 112073900032, "data/tokens_consumed_B": 112.073900032, "train/loss_slope": -1.3532614335976555e-05} {"step": 53450, "timestamp": 1778252312.060627, "grad/layer_0/attn": 0.0026677821297198534, "grad/layer_0/mlp": 0.0031165259424597025, "grad/layer_0/attn_mlp_ratio": 0.8560114991416276, "grad/layer_4/attn": 0.001979108899831772, "grad/layer_4/mlp": 0.0025753588415682316, "grad/layer_4/attn_mlp_ratio": 0.7684788585728632, "grad/layer_8/attn": 0.0038663202431052923, "grad/layer_8/mlp": 0.0036553735844790936, "grad/layer_8/attn_mlp_ratio": 1.0577086166379894, "grad/layer_12/attn": 0.00663255387917161, "grad/layer_12/mlp": 0.006570526864379644, "grad/layer_12/attn_mlp_ratio": 1.0094401735398437, "grad/layer_16/attn": 0.0033796578645706177, "grad/layer_16/mlp": 0.004751503933221102, "grad/layer_16/attn_mlp_ratio": 0.7112816996346959, "grad/layer_20/attn": 0.004851391538977623, "grad/layer_20/mlp": 0.006169610191136599, "grad/layer_20/attn_mlp_ratio": 0.7863367879081831, "grad/layer_24/attn": 0.009502113796770573, "grad/layer_24/mlp": 0.009693175554275513, "grad/layer_24/attn_mlp_ratio": 0.9802890338193072, "grad/layer_27/attn": 0.0062726084142923355, "grad/layer_27/mlp": 0.008137192577123642, "grad/layer_27/attn_mlp_ratio": 0.7708565672687986} {"step": 53450, "timestamp": 1778252312.0749528, "train/loss": 2.119261372089386, "train/z_loss": 0.0013941799639724194, "train/perplexity": 8.324986149066984, "train/grad_norm": 0.1171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027427.237534906, "perf/iters_per_sec": 0.9667526424097567, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343907594680786, "data/tokens_consumed": 112094871552, "data/tokens_consumed_B": 112.094871552, "train/loss_slope": -1.4209671432536286e-05} {"step": 53460, "timestamp": 1778252322.42348, "train/loss": 2.1642410278320314, "train/z_loss": 0.001375005708541721, "train/perplexity": 8.70799028588075, "train/grad_norm": 0.10595703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028031.2226372003, "perf/iters_per_sec": 0.9670406449495317, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340826988220215, "data/tokens_consumed": 112115843072, "data/tokens_consumed_B": 112.115843072, "train/loss_slope": -1.5173599772220074e-05} {"step": 53470, "timestamp": 1778252332.7665071, "train/loss": 2.1076364040374758, "train/z_loss": 0.0013905175030231476, "train/perplexity": 8.228768796441013, "train/grad_norm": 0.333984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029053.9230620693, "perf/iters_per_sec": 0.9675283065138194, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0335614919662475, "data/tokens_consumed": 112136814592, "data/tokens_consumed_B": 112.136814592, "train/loss_slope": -1.6540164007569696e-05} {"step": 53475, "timestamp": 1778252338.5275276, "eos/sharpness": 74.69823360443114, "eos/L0_probe": 1.9800057411193848, "eos/L_plus": 2.3215086460113525, "eos/L_minus": 2.3854851722717285, "eos/grad_norm": 0.2551531195640564, "eos/embed_grad_frac": 0.034718889743089676, "eos/time_s": 0.5927815437316895} {"step": 53475, "timestamp": 1778252339.9066734, "geo/rankme_last": 438.0594177246094, "geo/layer_0/stable_rank_q_proj": 19.48354148864746, "geo/layer_0/stable_rank_k_proj": 16.35483169555664, "geo/layer_0/stable_rank_o_proj": 47.52540588378906, "geo/layer_0/stable_rank_gate_proj": 132.61163330078125, "geo/layer_0/stable_rank_down_proj": 54.1773567199707, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06216948851943016, "geo/layer_0/attn_entropy_mean": 6.1762614250183105, "geo/layer_0/attn_entropy_std": 0.40142709016799927, "geo/layer_7/stable_rank_q_proj": 42.95699691772461, "geo/layer_7/stable_rank_k_proj": 41.55169677734375, "geo/layer_7/stable_rank_o_proj": 93.23656463623047, "geo/layer_7/stable_rank_gate_proj": 85.79391479492188, "geo/layer_7/stable_rank_down_proj": 142.88034057617188, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4741571545600891, "geo/layer_7/attn_entropy_mean": 4.645988464355469, "geo/layer_7/attn_entropy_std": 0.7983213663101196, "geo/layer_14/stable_rank_q_proj": 52.066375732421875, "geo/layer_14/stable_rank_k_proj": 38.93824768066406, "geo/layer_14/stable_rank_o_proj": 44.984188079833984, "geo/layer_14/stable_rank_gate_proj": 72.70562744140625, "geo/layer_14/stable_rank_down_proj": 131.4265594482422, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3846433162689209, "geo/layer_14/attn_entropy_mean": 5.514317512512207, "geo/layer_14/attn_entropy_std": 0.3921278417110443, "geo/layer_21/stable_rank_q_proj": 40.897125244140625, "geo/layer_21/stable_rank_k_proj": 30.263235092163086, "geo/layer_21/stable_rank_o_proj": 72.46564483642578, "geo/layer_21/stable_rank_gate_proj": 68.57344055175781, "geo/layer_21/stable_rank_down_proj": 53.25048065185547, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1437261402606964, "geo/layer_21/attn_entropy_mean": 5.697266578674316, "geo/layer_21/attn_entropy_std": 0.30841273069381714, "geo/layer_27/stable_rank_q_proj": 42.8022346496582, "geo/layer_27/stable_rank_k_proj": 31.446502685546875, "geo/layer_27/stable_rank_o_proj": 115.91468048095703, "geo/layer_27/stable_rank_gate_proj": 81.8890380859375, "geo/layer_27/stable_rank_down_proj": 129.65771484375, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09685786813497543, "geo/layer_27/attn_entropy_mean": 4.21966552734375, "geo/layer_27/attn_entropy_std": 0.7032341361045837, "attnres/final_alpha/block_0": 0.2398187816143036, "attnres/block_norm/0": 1.7493484020233154, "attnres/final_alpha/block_1": 0.004901024047285318, "attnres/block_norm/1": 44490.3359375, "attnres/final_alpha/block_2": 0.010406764224171638, "attnres/block_norm/2": 27920.8125, "attnres/final_alpha/block_3": 0.012373451143503189, "attnres/block_norm/3": 54208.1171875, "attnres/final_alpha/block_4": 0.0143259447067976, "attnres/block_norm/4": 14312.064453125, "attnres/final_alpha/block_5": 0.6035086512565613, "attnres/block_norm/5": 6438.51025390625, "attnres/final_alpha/block_6": 0.11466541886329651, "attnres/block_norm/6": 35597.65234375, "geo/tier1_time_s": 1.3614428043365479, "geo/step": 53475.0, "geo/rankme_slope": -8.82121012467487e-05} {"step": 53480, "timestamp": 1778252345.0803082, "train/loss": 2.1123321294784545, "train/z_loss": 0.0013907334650866687, "train/perplexity": 8.267499699101398, "train/grad_norm": 0.0947265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1703852.100810958, "perf/iters_per_sec": 0.8124599937491217, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2308298349380493, "data/tokens_consumed": 112157786112, "data/tokens_consumed_B": 112.157786112, "train/loss_slope": -1.9494292680973245e-05} {"step": 53490, "timestamp": 1778252355.4234974, "train/loss": 2.130732536315918, "train/z_loss": 0.0013818467152304948, "train/perplexity": 8.42103326527306, "train/grad_norm": 0.103515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029005.7144897594, "perf/iters_per_sec": 0.9675053188751981, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033586049079895, "data/tokens_consumed": 112178757632, "data/tokens_consumed_B": 112.178757632, "train/loss_slope": -2.2085168237912164e-05} {"step": 53500, "timestamp": 1778252365.7636044, "grad/layer_0/attn": 0.0027029020711779594, "grad/layer_0/mlp": 0.002744414610788226, "grad/layer_0/attn_mlp_ratio": 0.9848737731046691, "grad/layer_4/attn": 0.0019286854658275843, "grad/layer_4/mlp": 0.0024891700595617294, "grad/layer_4/attn_mlp_ratio": 0.7748307034852008, "grad/layer_8/attn": 0.010741038247942924, "grad/layer_8/mlp": 0.0035697859711945057, "grad/layer_8/attn_mlp_ratio": 3.0088744909996263, "grad/layer_12/attn": 0.0037466888315975666, "grad/layer_12/mlp": 0.006519552320241928, "grad/layer_12/attn_mlp_ratio": 0.5746849768344273, "grad/layer_16/attn": 0.0033526222687214613, "grad/layer_16/mlp": 0.004450216889381409, "grad/layer_16/attn_mlp_ratio": 0.753361527476323, "grad/layer_20/attn": 0.006761380936950445, "grad/layer_20/mlp": 0.00606071762740612, "grad/layer_20/attn_mlp_ratio": 1.1156072995077757, "grad/layer_24/attn": 0.012042896822094917, "grad/layer_24/mlp": 0.01045406237244606, "grad/layer_24/attn_mlp_ratio": 1.1519824808620165, "grad/layer_27/attn": 0.009155641309916973, "grad/layer_27/mlp": 0.009890549816191196, "grad/layer_27/attn_mlp_ratio": 0.9256958801581749} {"step": 53500, "timestamp": 1778252365.7777786, "train/loss": 2.1041109442710875, "train/z_loss": 0.0013832172378897666, "train/perplexity": 8.199809680215886, "train/grad_norm": 0.1875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026679.6378493581, "perf/iters_per_sec": 0.9663961591002265, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347723245620728, "data/tokens_consumed": 112199729152, "data/tokens_consumed_B": 112.199729152, "train/loss_slope": -2.6863964634759952e-05} {"step": 53500, "timestamp": 1778252373.0265527, "geo/ww_alpha_mean": 7.40767988244253, "geo/ww_alpha_std": 3.937559066200139, "geo/ww_alpha_min": 1.3506238293166613, "geo/ww_alpha_max": 24.118416347946923, "geo/ww_alpha_healthy_frac": 0.18781725888324874, "geo/ww_alpha_by_type/q_proj": 3.986149114915192, "geo/ww_alpha_by_type/k_proj": 4.428379257285422, "geo/ww_alpha_by_type/v_proj": 7.983101208498853, "geo/ww_alpha_by_type/o_proj": 7.724820078014566, "geo/ww_alpha_by_type/gate_proj": 8.093233599731386, "geo/ww_alpha_by_type/up_proj": 11.318380082282918, "geo/ww_alpha_by_type/down_proj": 8.410854110986069, "geo/twonn_id/layer_0": 0.726689338684082, "geo/twonn_id/layer_7": 3.3640942573547363, "geo/twonn_id/layer_14": 4.353879928588867, "geo/twonn_id/layer_21": 7.6160430908203125, "geo/twonn_id/layer_27": 5.882579803466797, "geo/tier2_time_s": 7.241858005523682} {"step": 53500, "timestamp": 1778252373.7440033, "eoc/jacobian_sigma/layer_0/attn": 1101.3291015625, "eoc/jacobian_sigma/layer_0/mlp": 9127.3564453125, "eoc/jacobian_sigma/layer_0": 9127.3564453125, "eoc/jacobian_sigma/layer_7/attn": 1.1533631086349487, "eoc/jacobian_sigma/layer_7/mlp": 1.7625274658203125, "eoc/jacobian_sigma/layer_7": 1.7625274658203125, "eoc/jacobian_sigma/layer_14/attn": 1.5490748882293701, "eoc/jacobian_sigma/layer_14/mlp": 5.943543910980225, "eoc/jacobian_sigma/layer_14": 5.943543910980225, "eoc/jacobian_sigma/layer_21/attn": 1.0918550491333008, "eoc/jacobian_sigma/layer_21/mlp": 3.922395944595337, "eoc/jacobian_sigma/layer_21": 3.922395944595337, "eoc/jacobian_sigma/layer_27/attn": 3.272108554840088, "eoc/jacobian_sigma/layer_27/mlp": 31.395275115966797, "eoc/jacobian_sigma/layer_27": 31.395275115966797, "eoc/layer0_sigma": 9127.3564453125, "eoc/sigma_max": 31.395275115966797, "eoc/sigma_min": 1.7625274658203125, "eoc/sigma_mean": 10.755935609340668, "eoc/time_s": 0.7083990573883057} {"step": 53510, "timestamp": 1778252384.108719, "train/loss": 2.1707029581069945, "train/z_loss": 0.0013775165774859489, "train/perplexity": 8.764442911971248, "train/grad_norm": 0.09619140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1144378.6819608302, "perf/iters_per_sec": 0.5456822786144401, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.8325682163238526, "data/tokens_consumed": 112220700672, "data/tokens_consumed_B": 112.220700672, "train/loss_slope": -2.8941635573335907e-05} {"step": 53520, "timestamp": 1778252394.4544172, "train/loss": 2.1763891220092773, "train/z_loss": 0.0013651297776959836, "train/perplexity": 8.814420927816442, "train/grad_norm": 0.2421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028572.1743190184, "perf/iters_per_sec": 0.967298590812215, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0338069438934325, "data/tokens_consumed": 112241672192, "data/tokens_consumed_B": 112.241672192, "train/loss_slope": -2.3726614902872875e-05} {"step": 53530, "timestamp": 1778252405.5041761, "train/loss": 2.160738945007324, "train/z_loss": 0.0013758522458374501, "train/perplexity": 8.677547520320037, "train/grad_norm": 0.1708984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1898768.3979738483, "perf/iters_per_sec": 0.9054033269757501, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1044801473617554, "data/tokens_consumed": 112262643712, "data/tokens_consumed_B": 112.262643712, "train/loss_slope": -2.4412741383524792e-05} {"step": 53540, "timestamp": 1778252415.8616998, "train/loss": 2.1059197664260862, "train/z_loss": 0.001389127504080534, "train/perplexity": 8.214655099946638, "train/grad_norm": 0.17578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026469.9474662344, "perf/iters_per_sec": 0.9662961709338352, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348793983459472, "data/tokens_consumed": 112283615232, "data/tokens_consumed_B": 112.283615232, "train/loss_slope": -2.6160597429238327e-05} {"step": 53550, "timestamp": 1778252426.7171628, "grad/layer_0/attn": 0.0027457557152956724, "grad/layer_0/mlp": 0.0029294912237674, "grad/layer_0/attn_mlp_ratio": 0.9372807125315399, "grad/layer_4/attn": 0.0023971498012542725, "grad/layer_4/mlp": 0.002463705837726593, "grad/layer_4/attn_mlp_ratio": 0.9729853569563031, "grad/layer_8/attn": 0.003865365171805024, "grad/layer_8/mlp": 0.0035275171976536512, "grad/layer_8/attn_mlp_ratio": 1.0957749730599746, "grad/layer_12/attn": 0.00516461580991745, "grad/layer_12/mlp": 0.006788331083953381, "grad/layer_12/attn_mlp_ratio": 0.7608078730934408, "grad/layer_16/attn": 0.0041163163259625435, "grad/layer_16/mlp": 0.0049356441013514996, "grad/layer_16/attn_mlp_ratio": 0.8339977838830839, "grad/layer_20/attn": 0.004161057062447071, "grad/layer_20/mlp": 0.006461434066295624, "grad/layer_20/attn_mlp_ratio": 0.6439835113003447, "grad/layer_24/attn": 0.01665533520281315, "grad/layer_24/mlp": 0.010157525539398193, "grad/layer_24/attn_mlp_ratio": 1.6397039785173444, "grad/layer_27/attn": 0.004224608186632395, "grad/layer_27/mlp": 0.008956382051110268, "grad/layer_27/attn_mlp_ratio": 0.4716869060917299} {"step": 53550, "timestamp": 1778252427.3286293, "eos/sharpness": 61.60457134246825, "eos/L0_probe": 1.979905605316162, "eos/L_plus": 2.3398666381835938, "eos/L_minus": 2.235990285873413, "eos/grad_norm": 0.15902505815029144, "eos/embed_grad_frac": 0.08669246733188629, "eos/time_s": 0.6084067821502686} {"step": 53550, "timestamp": 1778252427.3569617, "train/loss": 2.2441914081573486, "train/z_loss": 0.0013712915591895581, "train/perplexity": 9.432785198788386, "train/grad_norm": 0.1591796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1825542.801585416, "perf/iters_per_sec": 0.8704866416861611, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1487827062606812, "data/tokens_consumed": 112304586752, "data/tokens_consumed_B": 112.304586752, "train/loss_slope": -2.1896059399355014e-05} {"step": 53550, "timestamp": 1778252428.7176924, "geo/rankme_last": 438.7647705078125, "geo/layer_0/stable_rank_q_proj": 19.480382919311523, "geo/layer_0/stable_rank_k_proj": 16.370315551757812, "geo/layer_0/stable_rank_o_proj": 47.580867767333984, "geo/layer_0/stable_rank_gate_proj": 132.82212829589844, "geo/layer_0/stable_rank_down_proj": 54.23520278930664, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06152881309390068, "geo/layer_0/attn_entropy_mean": 6.170420169830322, "geo/layer_0/attn_entropy_std": 0.40157610177993774, "geo/layer_7/stable_rank_q_proj": 42.99699783325195, "geo/layer_7/stable_rank_k_proj": 41.60338592529297, "geo/layer_7/stable_rank_o_proj": 93.08207702636719, "geo/layer_7/stable_rank_gate_proj": 85.69851684570312, "geo/layer_7/stable_rank_down_proj": 142.6637420654297, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.46535420417785645, "geo/layer_7/attn_entropy_mean": 4.668623447418213, "geo/layer_7/attn_entropy_std": 0.7864106297492981, "geo/layer_14/stable_rank_q_proj": 51.982215881347656, "geo/layer_14/stable_rank_k_proj": 38.91498947143555, "geo/layer_14/stable_rank_o_proj": 44.95076370239258, "geo/layer_14/stable_rank_gate_proj": 72.68022155761719, "geo/layer_14/stable_rank_down_proj": 131.27337646484375, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38464581966400146, "geo/layer_14/attn_entropy_mean": 5.51525354385376, "geo/layer_14/attn_entropy_std": 0.3842567801475525, "geo/layer_21/stable_rank_q_proj": 40.88949966430664, "geo/layer_21/stable_rank_k_proj": 30.26827621459961, "geo/layer_21/stable_rank_o_proj": 72.44148254394531, "geo/layer_21/stable_rank_gate_proj": 68.50347900390625, "geo/layer_21/stable_rank_down_proj": 53.239646911621094, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14703841507434845, "geo/layer_21/attn_entropy_mean": 5.71284818649292, "geo/layer_21/attn_entropy_std": 0.29565539956092834, "geo/layer_27/stable_rank_q_proj": 42.827980041503906, "geo/layer_27/stable_rank_k_proj": 31.50262451171875, "geo/layer_27/stable_rank_o_proj": 115.64430236816406, "geo/layer_27/stable_rank_gate_proj": 81.84784698486328, "geo/layer_27/stable_rank_down_proj": 129.45347595214844, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09031268209218979, "geo/layer_27/attn_entropy_mean": 4.227721691131592, "geo/layer_27/attn_entropy_std": 0.7076760530471802, "attnres/final_alpha/block_0": 0.2374405860900879, "attnres/block_norm/0": 1.7496436834335327, "attnres/final_alpha/block_1": 0.0047461120411753654, "attnres/block_norm/1": 44539.66015625, "attnres/final_alpha/block_2": 0.010215699672698975, "attnres/block_norm/2": 28017.982421875, "attnres/final_alpha/block_3": 0.012107491493225098, "attnres/block_norm/3": 54071.23828125, "attnres/final_alpha/block_4": 0.014225825667381287, "attnres/block_norm/4": 14380.1806640625, "attnres/final_alpha/block_5": 0.6106365919113159, "attnres/block_norm/5": 6356.158203125, "attnres/final_alpha/block_6": 0.1106276661157608, "attnres/block_norm/6": 35811.3515625, "geo/tier1_time_s": 1.3562793731689453, "geo/step": 53550.0, "geo/rankme_slope": -7.136821525485195e-05} {"step": 53560, "timestamp": 1778252439.0662801, "train/loss": 2.1414719343185427, "train/z_loss": 0.0013743560295552016, "train/perplexity": 8.511957454740617, "train/grad_norm": 0.2080078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1791636.3288448118, "perf/iters_per_sec": 0.8543187755798396, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1705232620239259, "data/tokens_consumed": 112325558272, "data/tokens_consumed_B": 112.325558272, "train/loss_slope": -2.2583984098311417e-05} {"step": 53570, "timestamp": 1778252449.4119127, "train/loss": 2.1684247732162474, "train/z_loss": 0.0013879104168154298, "train/perplexity": 8.744498617574571, "train/grad_norm": 0.2021484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028600.6189814033, "perf/iters_per_sec": 0.9673121542841927, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0337924480438232, "data/tokens_consumed": 112346529792, "data/tokens_consumed_B": 112.346529792, "train/loss_slope": -1.8799928362720764e-05} {"step": 53580, "timestamp": 1778252459.7529888, "train/loss": 2.146888017654419, "train/z_loss": 0.0013896137825213373, "train/perplexity": 8.558183996066434, "train/grad_norm": 0.306640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029388.310212025, "perf/iters_per_sec": 0.9676877547321439, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0333911895751953, "data/tokens_consumed": 112367501312, "data/tokens_consumed_B": 112.367501312, "train/loss_slope": -1.5350650825408937e-05} {"step": 53590, "timestamp": 1778252470.6515586, "train/loss": 2.202994203567505, "train/z_loss": 0.0013850278104655445, "train/perplexity": 9.052076723232474, "train/grad_norm": 0.1748046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1925125.7068421738, "perf/iters_per_sec": 0.9179714712344045, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0893584728240966, "data/tokens_consumed": 112388472832, "data/tokens_consumed_B": 112.388472832, "train/loss_slope": -1.4234572525608119e-05} {"step": 53600, "timestamp": 1778252480.99793, "grad/layer_0/attn": 0.0026494679041206837, "grad/layer_0/mlp": 0.002861763583496213, "grad/layer_0/attn_mlp_ratio": 0.9258164534689434, "grad/layer_4/attn": 0.0023126329760998487, "grad/layer_4/mlp": 0.0024504566099494696, "grad/layer_4/attn_mlp_ratio": 0.9437558993431625, "grad/layer_8/attn": 0.0072248224169015884, "grad/layer_8/mlp": 0.0036655343137681484, "grad/layer_8/attn_mlp_ratio": 1.9710147556559188, "grad/layer_12/attn": 0.0047873882576823235, "grad/layer_12/mlp": 0.006345235276967287, "grad/layer_12/attn_mlp_ratio": 0.7544855270554931, "grad/layer_16/attn": 0.005497226491570473, "grad/layer_16/mlp": 0.004232860170304775, "grad/layer_16/attn_mlp_ratio": 1.298702565292726, "grad/layer_20/attn": 0.003283332334831357, "grad/layer_20/mlp": 0.005513009615242481, "grad/layer_20/attn_mlp_ratio": 0.5955607743177984, "grad/layer_24/attn": 0.009688064455986023, "grad/layer_24/mlp": 0.007411790080368519, "grad/layer_24/attn_mlp_ratio": 1.3071153149540893, "grad/layer_27/attn": 0.0036346311680972576, "grad/layer_27/mlp": 0.0062466394156217575, "grad/layer_27/attn_mlp_ratio": 0.5818538366120981} {"step": 53600, "timestamp": 1778252481.012026, "train/loss": 2.1902477979660033, "train/z_loss": 0.0013768129865638912, "train/perplexity": 8.9374275166852, "train/grad_norm": 0.08935546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025576.436828118, "perf/iters_per_sec": 0.9658701118603316, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035335898399353, "data/tokens_consumed": 112409444352, "data/tokens_consumed_B": 112.409444352, "train/loss_slope": -1.2431426102643688e-05} {"step": 53610, "timestamp": 1778252491.6783507, "train/loss": 2.123704934120178, "train/z_loss": 0.0013799353153444826, "train/perplexity": 8.362061052738579, "train/grad_norm": 0.197265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1967533.5396406283, "perf/iters_per_sec": 0.9381931017115728, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.065878653526306, "data/tokens_consumed": 112430415872, "data/tokens_consumed_B": 112.430415872, "train/loss_slope": -1.0158591402066994e-05} {"step": 53620, "timestamp": 1778252502.0218468, "train/loss": 2.1786388158798218, "train/z_loss": 0.0013733613188378513, "train/perplexity": 8.83427299871845, "train/grad_norm": 0.1689453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028413.1702505776, "perf/iters_per_sec": 0.967222771764077, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0338879823684692, "data/tokens_consumed": 112451387392, "data/tokens_consumed_B": 112.451387392, "train/loss_slope": -7.946162333976304e-06} {"step": 53625, "timestamp": 1778252507.781182, "eos/sharpness": 32.46521949768066, "eos/L0_probe": 1.9790176153182983, "eos/L_plus": 2.117588996887207, "eos/L_minus": 2.1650984287261963, "eos/grad_norm": 0.101547472178936, "eos/embed_grad_frac": 0.2250431627035141, "eos/time_s": 0.5923027992248535} {"step": 53625, "timestamp": 1778252509.1606288, "geo/rankme_last": 438.6839904785156, "geo/layer_0/stable_rank_q_proj": 19.479446411132812, "geo/layer_0/stable_rank_k_proj": 16.372026443481445, "geo/layer_0/stable_rank_o_proj": 47.59996795654297, "geo/layer_0/stable_rank_gate_proj": 132.684326171875, "geo/layer_0/stable_rank_down_proj": 54.28766632080078, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06386318057775497, "geo/layer_0/attn_entropy_mean": 6.1722307205200195, "geo/layer_0/attn_entropy_std": 0.40074047446250916, "geo/layer_7/stable_rank_q_proj": 43.0162353515625, "geo/layer_7/stable_rank_k_proj": 41.685813903808594, "geo/layer_7/stable_rank_o_proj": 93.3406753540039, "geo/layer_7/stable_rank_gate_proj": 85.5139389038086, "geo/layer_7/stable_rank_down_proj": 142.99966430664062, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4678109288215637, "geo/layer_7/attn_entropy_mean": 4.67800235748291, "geo/layer_7/attn_entropy_std": 0.7990819811820984, "geo/layer_14/stable_rank_q_proj": 51.96932601928711, "geo/layer_14/stable_rank_k_proj": 38.9068489074707, "geo/layer_14/stable_rank_o_proj": 44.88694763183594, "geo/layer_14/stable_rank_gate_proj": 72.83700561523438, "geo/layer_14/stable_rank_down_proj": 131.01939392089844, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38534921407699585, "geo/layer_14/attn_entropy_mean": 5.537221908569336, "geo/layer_14/attn_entropy_std": 0.3866707682609558, "geo/layer_21/stable_rank_q_proj": 40.85094451904297, "geo/layer_21/stable_rank_k_proj": 30.22957992553711, "geo/layer_21/stable_rank_o_proj": 72.3711929321289, "geo/layer_21/stable_rank_gate_proj": 68.44186401367188, "geo/layer_21/stable_rank_down_proj": 53.230384826660156, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14346130192279816, "geo/layer_21/attn_entropy_mean": 5.699275493621826, "geo/layer_21/attn_entropy_std": 0.3089013993740082, "geo/layer_27/stable_rank_q_proj": 42.831336975097656, "geo/layer_27/stable_rank_k_proj": 31.434785842895508, "geo/layer_27/stable_rank_o_proj": 115.6632308959961, "geo/layer_27/stable_rank_gate_proj": 81.68025970458984, "geo/layer_27/stable_rank_down_proj": 129.3438720703125, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08399862051010132, "geo/layer_27/attn_entropy_mean": 4.204195976257324, "geo/layer_27/attn_entropy_std": 0.6912746429443359, "attnres/final_alpha/block_0": 0.23930993676185608, "attnres/block_norm/0": 1.7498574256896973, "attnres/final_alpha/block_1": 0.004812944680452347, "attnres/block_norm/1": 44603.84375, "attnres/final_alpha/block_2": 0.010347722098231316, "attnres/block_norm/2": 28005.9609375, "attnres/final_alpha/block_3": 0.01235109567642212, "attnres/block_norm/3": 53674.375, "attnres/final_alpha/block_4": 0.01450104359537363, "attnres/block_norm/4": 14399.34765625, "attnres/final_alpha/block_5": 0.6046460270881653, "attnres/block_norm/5": 6446.0732421875, "attnres/final_alpha/block_6": 0.11403122544288635, "attnres/block_norm/6": 35585.0703125, "geo/tier1_time_s": 1.359292984008789, "geo/step": 53625.0, "geo/rankme_slope": -6.279341814850941e-05} {"step": 53630, "timestamp": 1778252514.3345823, "train/loss": 2.147795820236206, "train/z_loss": 0.0013745173811912537, "train/perplexity": 8.565956665084181, "train/grad_norm": 0.26953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1704190.5638067697, "perf/iters_per_sec": 0.8126213854822014, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2305853843688965, "data/tokens_consumed": 112472358912, "data/tokens_consumed_B": 112.472358912, "train/loss_slope": -8.191971266695299e-06} {"step": 53640, "timestamp": 1778252524.6723907, "train/loss": 2.1952744483947755, "train/z_loss": 0.0013766189105808735, "train/perplexity": 8.982465941920905, "train/grad_norm": 0.10595703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029429.1857504477, "perf/iters_per_sec": 0.9677072457077254, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0333703756332397, "data/tokens_consumed": 112493330432, "data/tokens_consumed_B": 112.493330432, "train/loss_slope": -6.138129188532996e-06} {"step": 53650, "timestamp": 1778252535.0063877, "grad/layer_0/attn": 0.0025595538318157196, "grad/layer_0/mlp": 0.002770561957731843, "grad/layer_0/attn_mlp_ratio": 0.9238391988631837, "grad/layer_4/attn": 0.001828388893045485, "grad/layer_4/mlp": 0.0023701509926468134, "grad/layer_4/attn_mlp_ratio": 0.7714229268833964, "grad/layer_8/attn": 0.004110944457352161, "grad/layer_8/mlp": 0.003558933502063155, "grad/layer_8/attn_mlp_ratio": 1.1551056909207296, "grad/layer_12/attn": 0.003835137002170086, "grad/layer_12/mlp": 0.006583025678992271, "grad/layer_12/attn_mlp_ratio": 0.5825796724674482, "grad/layer_16/attn": 0.004754264373332262, "grad/layer_16/mlp": 0.0044862437061965466, "grad/layer_16/attn_mlp_ratio": 1.0597427555688161, "grad/layer_20/attn": 0.007626906968653202, "grad/layer_20/mlp": 0.00611482746899128, "grad/layer_20/attn_mlp_ratio": 1.2472807912572685, "grad/layer_24/attn": 0.006980244070291519, "grad/layer_24/mlp": 0.010503308847546577, "grad/layer_24/attn_mlp_ratio": 0.6645757165813926, "grad/layer_27/attn": 0.00807627197355032, "grad/layer_27/mlp": 0.009087740443646908, "grad/layer_27/attn_mlp_ratio": 0.8886996646483609} {"step": 53650, "timestamp": 1778252535.0204082, "train/loss": 2.13036994934082, "train/z_loss": 0.0013906633015722037, "train/perplexity": 8.417980461781045, "train/grad_norm": 0.10107421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028067.1804693018, "perf/iters_per_sec": 0.9670577909800061, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340643644332885, "data/tokens_consumed": 112514301952, "data/tokens_consumed_B": 112.514301952, "train/loss_slope": -6.833102021387572e-06} {"step": 53660, "timestamp": 1778252545.3695462, "train/loss": 2.1258992552757263, "train/z_loss": 0.0013870635302737355, "train/perplexity": 8.380430246795445, "train/grad_norm": 0.1708984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027440.2287064844, "perf/iters_per_sec": 0.9667588370830938, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343841314315796, "data/tokens_consumed": 112535273472, "data/tokens_consumed_B": 112.535273472, "train/loss_slope": -9.174902962021637e-06} {"step": 53670, "timestamp": 1778252555.7149606, "train/loss": 2.1384177207946777, "train/z_loss": 0.0013769429409876467, "train/perplexity": 8.485999779487429, "train/grad_norm": 0.111328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028430.1968555006, "perf/iters_per_sec": 0.9672308906819823, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0338793039321899, "data/tokens_consumed": 112556244992, "data/tokens_consumed_B": 112.556244992, "train/loss_slope": -9.80051683299429e-06} {"step": 53680, "timestamp": 1778252566.0627582, "train/loss": 2.155059027671814, "train/z_loss": 0.0013791484176181256, "train/perplexity": 8.62839947826991, "train/grad_norm": 0.185546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028057.968783547, "perf/iters_per_sec": 0.9670533985059485, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340690612792969, "data/tokens_consumed": 112577216512, "data/tokens_consumed_B": 112.577216512, "train/loss_slope": -6.77241357711208e-06} {"step": 53690, "timestamp": 1778252576.4023225, "train/loss": 2.120663058757782, "train/z_loss": 0.001381537108682096, "train/perplexity": 8.33666335314548, "train/grad_norm": 0.1171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029275.1033989629, "perf/iters_per_sec": 0.9676337735171141, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033448839187622, "data/tokens_consumed": 112598188032, "data/tokens_consumed_B": 112.598188032, "train/loss_slope": -9.894842862582125e-06} {"step": 53700, "timestamp": 1778252586.7328856, "grad/layer_0/attn": 0.0027045798487961292, "grad/layer_0/mlp": 0.0029185479506850243, "grad/layer_0/attn_mlp_ratio": 0.9266867640439658, "grad/layer_4/attn": 0.002448047511279583, "grad/layer_4/mlp": 0.002518363995477557, "grad/layer_4/attn_mlp_ratio": 0.9720784677941335, "grad/layer_8/attn": 0.004529960919171572, "grad/layer_8/mlp": 0.0037615899927914143, "grad/layer_8/attn_mlp_ratio": 1.2042675590444152, "grad/layer_12/attn": 0.004365045577287674, "grad/layer_12/mlp": 0.006769617088139057, "grad/layer_12/attn_mlp_ratio": 0.6447994703357235, "grad/layer_16/attn": 0.0033350507728755474, "grad/layer_16/mlp": 0.004539359826594591, "grad/layer_16/attn_mlp_ratio": 0.7346962626463261, "grad/layer_20/attn": 0.007346608210355043, "grad/layer_20/mlp": 0.005634750239551067, "grad/layer_20/attn_mlp_ratio": 1.3038036767642067, "grad/layer_24/attn": 0.01813565008342266, "grad/layer_24/mlp": 0.010829404927790165, "grad/layer_24/attn_mlp_ratio": 1.6746672635184834, "grad/layer_27/attn": 0.007104219403117895, "grad/layer_27/mlp": 0.010200603865087032, "grad/layer_27/attn_mlp_ratio": 0.6964508599131053} {"step": 53700, "timestamp": 1778252587.3268685, "eos/sharpness": 76.15756988525389, "eos/L0_probe": 1.9824039936065674, "eos/L_plus": 2.316572666168213, "eos/L_minus": 2.409811019897461, "eos/grad_norm": 0.20739500224590302, "eos/embed_grad_frac": 0.05165752023458481, "eos/time_s": 0.5911643505096436} {"step": 53700, "timestamp": 1778252587.3471975, "train/loss": 2.1398912906646728, "train/z_loss": 0.0013747835415415465, "train/perplexity": 8.498513710890997, "train/grad_norm": 0.20703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1917052.3627002046, "perf/iters_per_sec": 0.914121800756552, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0939461231231689, "data/tokens_consumed": 112619159552, "data/tokens_consumed_B": 112.619159552, "train/loss_slope": -1.1398077826581461e-05} {"step": 53700, "timestamp": 1778252588.7101798, "geo/rankme_last": 439.0406188964844, "geo/layer_0/stable_rank_q_proj": 19.4931583404541, "geo/layer_0/stable_rank_k_proj": 16.350818634033203, "geo/layer_0/stable_rank_o_proj": 47.5838737487793, "geo/layer_0/stable_rank_gate_proj": 132.8428955078125, "geo/layer_0/stable_rank_down_proj": 54.21802520751953, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06504228711128235, "geo/layer_0/attn_entropy_mean": 6.175273895263672, "geo/layer_0/attn_entropy_std": 0.39830338954925537, "geo/layer_7/stable_rank_q_proj": 42.954837799072266, "geo/layer_7/stable_rank_k_proj": 41.81169509887695, "geo/layer_7/stable_rank_o_proj": 93.27014923095703, "geo/layer_7/stable_rank_gate_proj": 85.46923065185547, "geo/layer_7/stable_rank_down_proj": 142.8568115234375, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4771007299423218, "geo/layer_7/attn_entropy_mean": 4.648612976074219, "geo/layer_7/attn_entropy_std": 0.8088438510894775, "geo/layer_14/stable_rank_q_proj": 52.04045486450195, "geo/layer_14/stable_rank_k_proj": 38.858768463134766, "geo/layer_14/stable_rank_o_proj": 44.88848876953125, "geo/layer_14/stable_rank_gate_proj": 72.74961853027344, "geo/layer_14/stable_rank_down_proj": 130.92498779296875, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3821325898170471, "geo/layer_14/attn_entropy_mean": 5.5294575691223145, "geo/layer_14/attn_entropy_std": 0.39719632267951965, "geo/layer_21/stable_rank_q_proj": 40.833595275878906, "geo/layer_21/stable_rank_k_proj": 30.172157287597656, "geo/layer_21/stable_rank_o_proj": 72.42597961425781, "geo/layer_21/stable_rank_gate_proj": 68.50199127197266, "geo/layer_21/stable_rank_down_proj": 53.21457290649414, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1441752165555954, "geo/layer_21/attn_entropy_mean": 5.690580368041992, "geo/layer_21/attn_entropy_std": 0.30933743715286255, "geo/layer_27/stable_rank_q_proj": 42.78248977661133, "geo/layer_27/stable_rank_k_proj": 31.510101318359375, "geo/layer_27/stable_rank_o_proj": 115.79420471191406, "geo/layer_27/stable_rank_gate_proj": 81.59044647216797, "geo/layer_27/stable_rank_down_proj": 129.4168701171875, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08917228877544403, "geo/layer_27/attn_entropy_mean": 4.230563163757324, "geo/layer_27/attn_entropy_std": 0.6924580335617065, "attnres/final_alpha/block_0": 0.23746183514595032, "attnres/block_norm/0": 1.7499208450317383, "attnres/final_alpha/block_1": 0.004812616854906082, "attnres/block_norm/1": 44636.53125, "attnres/final_alpha/block_2": 0.010179860517382622, "attnres/block_norm/2": 27981.234375, "attnres/final_alpha/block_3": 0.012213989160954952, "attnres/block_norm/3": 54091.28515625, "attnres/final_alpha/block_4": 0.01422896794974804, "attnres/block_norm/4": 14364.3515625, "attnres/final_alpha/block_5": 0.608611524105072, "attnres/block_norm/5": 6407.23876953125, "attnres/final_alpha/block_6": 0.11249122023582458, "attnres/block_norm/6": 35736.22265625, "geo/tier1_time_s": 1.3597230911254883, "geo/step": 53700.0, "geo/rankme_slope": -4.980511735944378e-05} {"step": 53710, "timestamp": 1778252599.0664222, "train/loss": 2.138677144050598, "train/z_loss": 0.0014012790517881513, "train/perplexity": 8.488201530760357, "train/grad_norm": 0.1259765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1789943.8133722895, "perf/iters_per_sec": 0.8535117213117073, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.171630072593689, "data/tokens_consumed": 112640131072, "data/tokens_consumed_B": 112.640131072, "train/loss_slope": -1.0342239482317666e-05} {"step": 53720, "timestamp": 1778252609.409465, "train/loss": 2.1077102422714233, "train/z_loss": 0.0013824403868056834, "train/perplexity": 8.22937641662903, "train/grad_norm": 0.1796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028964.387993677, "perf/iters_per_sec": 0.9674856128662477, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0336071014404298, "data/tokens_consumed": 112661102592, "data/tokens_consumed_B": 112.661102592, "train/loss_slope": -1.1819206484437785e-05} {"step": 53730, "timestamp": 1778252619.7566822, "train/loss": 2.1415157318115234, "train/z_loss": 0.0013847553636878729, "train/perplexity": 8.512330265301518, "train/grad_norm": 0.1259765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027755.712934517, "perf/iters_per_sec": 0.9669092716858468, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034223198890686, "data/tokens_consumed": 112682074112, "data/tokens_consumed_B": 112.682074112, "train/loss_slope": -1.1334714187074967e-05} {"step": 53740, "timestamp": 1778252630.108571, "train/loss": 2.1520864963531494, "train/z_loss": 0.0013750725658610464, "train/perplexity": 8.602789372868445, "train/grad_norm": 0.091796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027862.8596875062, "perf/iters_per_sec": 0.9669603632390529, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034168553352356, "data/tokens_consumed": 112703045632, "data/tokens_consumed_B": 112.703045632, "train/loss_slope": -1.2592112885700329e-05} {"step": 53750, "timestamp": 1778252640.4413393, "grad/layer_0/attn": 0.002691724570468068, "grad/layer_0/mlp": 0.0028687575832009315, "grad/layer_0/attn_mlp_ratio": 0.9382892763060658, "grad/layer_4/attn": 0.0025791211519390345, "grad/layer_4/mlp": 0.0024830226320773363, "grad/layer_4/attn_mlp_ratio": 1.0387021909305278, "grad/layer_8/attn": 0.004294249694794416, "grad/layer_8/mlp": 0.00359358754940331, "grad/layer_8/attn_mlp_ratio": 1.19497563820586, "grad/layer_12/attn": 0.00730290450155735, "grad/layer_12/mlp": 0.00683254562318325, "grad/layer_12/attn_mlp_ratio": 1.0688409265638932, "grad/layer_16/attn": 0.003444402478635311, "grad/layer_16/mlp": 0.004530199803411961, "grad/layer_16/attn_mlp_ratio": 0.7603201960339827, "grad/layer_20/attn": 0.002820937894284725, "grad/layer_20/mlp": 0.0052092839032411575, "grad/layer_20/attn_mlp_ratio": 0.5415212325781373, "grad/layer_24/attn": 0.00763347651809454, "grad/layer_24/mlp": 0.0072443727403879166, "grad/layer_24/attn_mlp_ratio": 1.053711161239154, "grad/layer_27/attn": 0.0043693603947758675, "grad/layer_27/mlp": 0.006055108737200499, "grad/layer_27/attn_mlp_ratio": 0.7215989856254978} {"step": 53750, "timestamp": 1778252640.4557865, "train/loss": 2.188012170791626, "train/z_loss": 0.0013774181134067476, "train/perplexity": 8.917469078985446, "train/grad_norm": 0.103515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028182.6375770492, "perf/iters_per_sec": 0.9671128452191587, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340054988861085, "data/tokens_consumed": 112724017152, "data/tokens_consumed_B": 112.724017152, "train/loss_slope": -8.368591821626472e-06} {"step": 53760, "timestamp": 1778252651.3760786, "train/loss": 2.14906907081604, "train/z_loss": 0.0013864510226994752, "train/perplexity": 8.57687022074598, "train/grad_norm": 0.1318359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1921285.6590010014, "perf/iters_per_sec": 0.9161403937344558, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.091535758972168, "data/tokens_consumed": 112744988672, "data/tokens_consumed_B": 112.744988672, "train/loss_slope": -5.248941425228487e-06} {"step": 53770, "timestamp": 1778252661.724615, "train/loss": 2.165091848373413, "train/z_loss": 0.0013832967379130423, "train/perplexity": 8.715402375622103, "train/grad_norm": 0.09716796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028276.7806219803, "perf/iters_per_sec": 0.9671577361211683, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0339575052261352, "data/tokens_consumed": 112765960192, "data/tokens_consumed_B": 112.765960192, "train/loss_slope": -5.648929215107025e-06} {"step": 53775, "timestamp": 1778252667.4792535, "eos/sharpness": 12.069392204284666, "eos/L0_probe": 1.9803236722946167, "eos/L_plus": 2.0535390377044678, "eos/L_minus": 2.0278022289276123, "eos/grad_norm": 0.12120663374662399, "eos/embed_grad_frac": 0.16441836953163147, "eos/time_s": 0.5905439853668213} {"step": 53775, "timestamp": 1778252668.8535445, "geo/rankme_last": 439.33251953125, "geo/layer_0/stable_rank_q_proj": 19.458621978759766, "geo/layer_0/stable_rank_k_proj": 16.33631706237793, "geo/layer_0/stable_rank_o_proj": 47.591064453125, "geo/layer_0/stable_rank_gate_proj": 132.59808349609375, "geo/layer_0/stable_rank_down_proj": 54.25096130371094, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06333760917186737, "geo/layer_0/attn_entropy_mean": 6.177703380584717, "geo/layer_0/attn_entropy_std": 0.4002220034599304, "geo/layer_7/stable_rank_q_proj": 42.89439392089844, "geo/layer_7/stable_rank_k_proj": 41.82402420043945, "geo/layer_7/stable_rank_o_proj": 93.28811645507812, "geo/layer_7/stable_rank_gate_proj": 85.2341537475586, "geo/layer_7/stable_rank_down_proj": 143.2026824951172, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4686072766780853, "geo/layer_7/attn_entropy_mean": 4.642232894897461, "geo/layer_7/attn_entropy_std": 0.7823251485824585, "geo/layer_14/stable_rank_q_proj": 52.13153076171875, "geo/layer_14/stable_rank_k_proj": 38.83729553222656, "geo/layer_14/stable_rank_o_proj": 44.8857536315918, "geo/layer_14/stable_rank_gate_proj": 72.67925262451172, "geo/layer_14/stable_rank_down_proj": 130.5667724609375, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3737025558948517, "geo/layer_14/attn_entropy_mean": 5.510900497436523, "geo/layer_14/attn_entropy_std": 0.37980973720550537, "geo/layer_21/stable_rank_q_proj": 40.865760803222656, "geo/layer_21/stable_rank_k_proj": 30.177330017089844, "geo/layer_21/stable_rank_o_proj": 72.41322326660156, "geo/layer_21/stable_rank_gate_proj": 68.44832611083984, "geo/layer_21/stable_rank_down_proj": 53.118675231933594, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14146587252616882, "geo/layer_21/attn_entropy_mean": 5.719080448150635, "geo/layer_21/attn_entropy_std": 0.2932761311531067, "geo/layer_27/stable_rank_q_proj": 42.79530715942383, "geo/layer_27/stable_rank_k_proj": 31.52603530883789, "geo/layer_27/stable_rank_o_proj": 115.7100601196289, "geo/layer_27/stable_rank_gate_proj": 81.6269302368164, "geo/layer_27/stable_rank_down_proj": 129.48219299316406, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0988745465874672, "geo/layer_27/attn_entropy_mean": 4.219963073730469, "geo/layer_27/attn_entropy_std": 0.6895443201065063, "attnres/final_alpha/block_0": 0.23769249022006989, "attnres/block_norm/0": 1.7500076293945312, "attnres/final_alpha/block_1": 0.004752677399665117, "attnres/block_norm/1": 44615.83984375, "attnres/final_alpha/block_2": 0.010355764999985695, "attnres/block_norm/2": 27895.275390625, "attnres/final_alpha/block_3": 0.0122575918212533, "attnres/block_norm/3": 54096.15234375, "attnres/final_alpha/block_4": 0.014018770307302475, "attnres/block_norm/4": 14436.013671875, "attnres/final_alpha/block_5": 0.6096823215484619, "attnres/block_norm/5": 6403.15673828125, "attnres/final_alpha/block_6": 0.11124040931463242, "attnres/block_norm/6": 35520.609375, "geo/tier1_time_s": 1.3554987907409668, "geo/step": 53775.0, "geo/rankme_slope": -3.375449789290717e-05} {"step": 53780, "timestamp": 1778252674.026478, "train/loss": 2.2230878829956056, "train/z_loss": 0.0013732467778027058, "train/perplexity": 9.235805967563174, "train/grad_norm": 0.09326171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1705804.7075189645, "perf/iters_per_sec": 0.8133910691828559, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2294209241867065, "data/tokens_consumed": 112786931712, "data/tokens_consumed_B": 112.786931712, "train/loss_slope": -2.8231714067240906e-08} {"step": 53790, "timestamp": 1778252684.3742087, "train/loss": 2.1939350605010985, "train/z_loss": 0.0013838398037478328, "train/perplexity": 8.97044298927893, "train/grad_norm": 0.09765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028014.717063237, "perf/iters_per_sec": 0.9670327744785485, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340911149978638, "data/tokens_consumed": 112807903232, "data/tokens_consumed_B": 112.807903232, "train/loss_slope": 3.62789234598585e-06} {"step": 53800, "timestamp": 1778252694.7183695, "grad/layer_0/attn": 0.002563979011029005, "grad/layer_0/mlp": 0.0027253448497503996, "grad/layer_0/attn_mlp_ratio": 0.9407906368930754, "grad/layer_4/attn": 0.0027061186265200377, "grad/layer_4/mlp": 0.0025229942984879017, "grad/layer_4/attn_mlp_ratio": 1.0725820985341414, "grad/layer_8/attn": 0.004686564207077026, "grad/layer_8/mlp": 0.0036316169425845146, "grad/layer_8/attn_mlp_ratio": 1.2904896502362861, "grad/layer_12/attn": 0.008510028012096882, "grad/layer_12/mlp": 0.006801626645028591, "grad/layer_12/attn_mlp_ratio": 1.251175392462838, "grad/layer_16/attn": 0.0031261281110346317, "grad/layer_16/mlp": 0.004300865810364485, "grad/layer_16/attn_mlp_ratio": 0.7268601663448984, "grad/layer_20/attn": 0.0029907505959272385, "grad/layer_20/mlp": 0.005338572897017002, "grad/layer_20/attn_mlp_ratio": 0.5602153604714892, "grad/layer_24/attn": 0.00812130980193615, "grad/layer_24/mlp": 0.00858570821583271, "grad/layer_24/attn_mlp_ratio": 0.9459102852306113, "grad/layer_27/attn": 0.003773997537791729, "grad/layer_27/mlp": 0.0071620820090174675, "grad/layer_27/attn_mlp_ratio": 0.5269413950225524} {"step": 53800, "timestamp": 1778252694.732816, "train/loss": 2.14753999710083, "train/z_loss": 0.00138204573886469, "train/perplexity": 8.563765575470283, "train/grad_norm": 0.1240234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025659.0955905465, "perf/iters_per_sec": 0.9659095266297085, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352936506271362, "data/tokens_consumed": 112828874752, "data/tokens_consumed_B": 112.828874752, "train/loss_slope": 2.9707301675182385e-06} {"step": 53810, "timestamp": 1778252705.0754993, "train/loss": 2.170256233215332, "train/z_loss": 0.0013768108212389052, "train/perplexity": 8.76052849156053, "train/grad_norm": 0.1728515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029152.4067216278, "perf/iters_per_sec": 0.9675752671821727, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0335113286972046, "data/tokens_consumed": 112849846272, "data/tokens_consumed_B": 112.849846272, "train/loss_slope": 4.932103105539888e-06} {"step": 53820, "timestamp": 1778252715.4174178, "train/loss": 2.1246992349624634, "train/z_loss": 0.001390096067916602, "train/perplexity": 8.370379591966506, "train/grad_norm": 0.2216796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029243.6905129731, "perf/iters_per_sec": 0.9676187946858278, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0334648370742798, "data/tokens_consumed": 112870817792, "data/tokens_consumed_B": 112.870817792, "train/loss_slope": 3.5050477775553037e-06} {"step": 53830, "timestamp": 1778252725.7625885, "train/loss": 2.1390963554382325, "train/z_loss": 0.001383081078529358, "train/perplexity": 8.491760627457397, "train/grad_norm": 0.162109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028129.13941322, "perf/iters_per_sec": 0.9670873353067494, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340327739715576, "data/tokens_consumed": 112891789312, "data/tokens_consumed_B": 112.891789312, "train/loss_slope": 3.550051963261352e-06} {"step": 53840, "timestamp": 1778252736.1085372, "train/loss": 2.156882095336914, "train/z_loss": 0.0013975001289509236, "train/perplexity": 8.644143981647145, "train/grad_norm": 0.388671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028397.1729866196, "perf/iters_per_sec": 0.9672151436741923, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0338961362838746, "data/tokens_consumed": 112912760832, "data/tokens_consumed_B": 112.912760832, "train/loss_slope": 6.156719013480764e-06} {"step": 53850, "timestamp": 1778252746.4430876, "grad/layer_0/attn": 0.0030340200755745173, "grad/layer_0/mlp": 0.0030656384769827127, "grad/layer_0/attn_mlp_ratio": 0.9896861614263364, "grad/layer_4/attn": 0.0020313148852437735, "grad/layer_4/mlp": 0.0026019541546702385, "grad/layer_4/attn_mlp_ratio": 0.780688162214141, "grad/layer_8/attn": 0.003874641377478838, "grad/layer_8/mlp": 0.0037402561865746975, "grad/layer_8/attn_mlp_ratio": 1.0359293803974083, "grad/layer_12/attn": 0.004867654759436846, "grad/layer_12/mlp": 0.007160881068557501, "grad/layer_12/attn_mlp_ratio": 0.6797563937815483, "grad/layer_16/attn": 0.0034033574629575014, "grad/layer_16/mlp": 0.00481621827930212, "grad/layer_16/attn_mlp_ratio": 0.7066451715693703, "grad/layer_20/attn": 0.006073948927223682, "grad/layer_20/mlp": 0.005828978959470987, "grad/layer_20/attn_mlp_ratio": 1.0420262048041957, "grad/layer_24/attn": 0.00485020037740469, "grad/layer_24/mlp": 0.008827714249491692, "grad/layer_24/attn_mlp_ratio": 0.5494287859103607, "grad/layer_27/attn": 0.0039415243081748486, "grad/layer_27/mlp": 0.006959578488022089, "grad/layer_27/attn_mlp_ratio": 0.5663452547196581} {"step": 53850, "timestamp": 1778252747.029915, "eos/sharpness": 25.346422195434567, "eos/L0_probe": 1.9819457530975342, "eos/L_plus": 2.123222827911377, "eos/L_minus": 2.094132900238037, "eos/grad_norm": 0.11191525310277939, "eos/embed_grad_frac": 0.21076324582099915, "eos/time_s": 0.5840504169464111} {"step": 53850, "timestamp": 1778252747.0483787, "train/loss": 2.1536941528320312, "train/z_loss": 0.001378170493990183, "train/perplexity": 8.616630826110011, "train/grad_norm": 0.11181640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1918313.0534736975, "perf/iters_per_sec": 0.9147229449623573, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0932271957397461, "data/tokens_consumed": 112933732352, "data/tokens_consumed_B": 112.933732352, "train/loss_slope": 3.934416647898754e-06} {"step": 53850, "timestamp": 1778252748.4116838, "geo/rankme_last": 438.95245361328125, "geo/layer_0/stable_rank_q_proj": 19.462961196899414, "geo/layer_0/stable_rank_k_proj": 16.351707458496094, "geo/layer_0/stable_rank_o_proj": 47.571510314941406, "geo/layer_0/stable_rank_gate_proj": 132.30612182617188, "geo/layer_0/stable_rank_down_proj": 54.1971321105957, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.060275763273239136, "geo/layer_0/attn_entropy_mean": 6.1706342697143555, "geo/layer_0/attn_entropy_std": 0.4045978784561157, "geo/layer_7/stable_rank_q_proj": 42.88547897338867, "geo/layer_7/stable_rank_k_proj": 41.65821075439453, "geo/layer_7/stable_rank_o_proj": 93.38727569580078, "geo/layer_7/stable_rank_gate_proj": 85.15497589111328, "geo/layer_7/stable_rank_down_proj": 143.58900451660156, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4695730209350586, "geo/layer_7/attn_entropy_mean": 4.644300937652588, "geo/layer_7/attn_entropy_std": 0.7898227572441101, "geo/layer_14/stable_rank_q_proj": 52.177738189697266, "geo/layer_14/stable_rank_k_proj": 38.907615661621094, "geo/layer_14/stable_rank_o_proj": 44.813621520996094, "geo/layer_14/stable_rank_gate_proj": 72.66421508789062, "geo/layer_14/stable_rank_down_proj": 130.4788055419922, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.36802881956100464, "geo/layer_14/attn_entropy_mean": 5.523386001586914, "geo/layer_14/attn_entropy_std": 0.4029967784881592, "geo/layer_21/stable_rank_q_proj": 40.94746780395508, "geo/layer_21/stable_rank_k_proj": 30.240388870239258, "geo/layer_21/stable_rank_o_proj": 72.47000122070312, "geo/layer_21/stable_rank_gate_proj": 68.4001235961914, "geo/layer_21/stable_rank_down_proj": 53.183868408203125, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14389190077781677, "geo/layer_21/attn_entropy_mean": 5.718238830566406, "geo/layer_21/attn_entropy_std": 0.30312785506248474, "geo/layer_27/stable_rank_q_proj": 42.7102165222168, "geo/layer_27/stable_rank_k_proj": 31.539236068725586, "geo/layer_27/stable_rank_o_proj": 115.89570617675781, "geo/layer_27/stable_rank_gate_proj": 81.58082580566406, "geo/layer_27/stable_rank_down_proj": 129.50665283203125, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0905412808060646, "geo/layer_27/attn_entropy_mean": 4.226839542388916, "geo/layer_27/attn_entropy_std": 0.7197183966636658, "attnres/final_alpha/block_0": 0.2385232150554657, "attnres/block_norm/0": 1.7504417896270752, "attnres/final_alpha/block_1": 0.004769813269376755, "attnres/block_norm/1": 44809.53125, "attnres/final_alpha/block_2": 0.010449549183249474, "attnres/block_norm/2": 28106.30859375, "attnres/final_alpha/block_3": 0.012375231832265854, "attnres/block_norm/3": 54134.5, "attnres/final_alpha/block_4": 0.014346478506922722, "attnres/block_norm/4": 14405.60546875, "attnres/final_alpha/block_5": 0.6082886457443237, "attnres/block_norm/5": 6406.328125, "attnres/final_alpha/block_6": 0.11124702543020248, "attnres/block_norm/6": 35594.125, "geo/tier1_time_s": 1.3590645790100098, "geo/step": 53850.0, "geo/rankme_slope": -4.5974014605842344e-05} {"step": 53860, "timestamp": 1778252758.7560587, "train/loss": 2.1740362644195557, "train/z_loss": 0.0013951106113381684, "train/perplexity": 8.793706229561208, "train/grad_norm": 0.265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1791887.5090765802, "perf/iters_per_sec": 0.8544385476477528, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1703591823577881, "data/tokens_consumed": 112954703872, "data/tokens_consumed_B": 112.954703872, "train/loss_slope": 4.63372025087792e-06} {"step": 53870, "timestamp": 1778252769.1019216, "train/loss": 2.175239896774292, "train/z_loss": 0.00139282870804891, "train/perplexity": 8.80429699131031, "train/grad_norm": 0.1826171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028592.9463294318, "perf/iters_per_sec": 0.9673084956786308, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0337963581085206, "data/tokens_consumed": 112975675392, "data/tokens_consumed_B": 112.975675392, "train/loss_slope": 5.056787817606224e-06} {"step": 53880, "timestamp": 1778252779.4437, "train/loss": 2.2140564918518066, "train/z_loss": 0.0013685130281373859, "train/perplexity": 9.152769323959026, "train/grad_norm": 0.1640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029285.9178962277, "perf/iters_per_sec": 0.9676389302712572, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0334433317184448, "data/tokens_consumed": 112996646912, "data/tokens_consumed_B": 112.996646912, "train/loss_slope": 6.039335181896951e-06} {"step": 53890, "timestamp": 1778252789.7854195, "train/loss": 2.142767333984375, "train/z_loss": 0.0013581730192527175, "train/perplexity": 8.522990986456804, "train/grad_norm": 0.1943359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028823.010753963, "perf/iters_per_sec": 0.9674181989450278, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0336791276931763, "data/tokens_consumed": 113017618432, "data/tokens_consumed_B": 113.017618432, "train/loss_slope": 1.5313722834800303e-06} {"step": 53900, "timestamp": 1778252800.1161163, "grad/layer_0/attn": 0.00235366215929389, "grad/layer_0/mlp": 0.002690583234652877, "grad/layer_0/attn_mlp_ratio": 0.8747776472783162, "grad/layer_4/attn": 0.0029634584207087755, "grad/layer_4/mlp": 0.002580700907856226, "grad/layer_4/attn_mlp_ratio": 1.1483152878568075, "grad/layer_8/attn": 0.0035348436795175076, "grad/layer_8/mlp": 0.0036082870792597532, "grad/layer_8/attn_mlp_ratio": 0.9796458829096545, "grad/layer_12/attn": 0.004095214419066906, "grad/layer_12/mlp": 0.006280491128563881, "grad/layer_12/attn_mlp_ratio": 0.6520532025332249, "grad/layer_16/attn": 0.0033582369796931744, "grad/layer_16/mlp": 0.004405790474265814, "grad/layer_16/attn_mlp_ratio": 0.7622325489796562, "grad/layer_20/attn": 0.006309269927442074, "grad/layer_20/mlp": 0.005821217317134142, "grad/layer_20/attn_mlp_ratio": 1.0838402820810988, "grad/layer_24/attn": 0.008418776094913483, "grad/layer_24/mlp": 0.00819482747465372, "grad/layer_24/attn_mlp_ratio": 1.0273280332282313, "grad/layer_27/attn": 0.005244035739451647, "grad/layer_27/mlp": 0.00843878649175167, "grad/layer_27/attn_mlp_ratio": 0.6214205896114648} {"step": 53900, "timestamp": 1778252800.1301737, "train/loss": 2.167883539199829, "train/z_loss": 0.0013914395007304848, "train/perplexity": 8.739767078016797, "train/grad_norm": 0.1435546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028553.3675954435, "perf/iters_per_sec": 0.9672896230675905, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0338165283203125, "data/tokens_consumed": 113038589952, "data/tokens_consumed_B": 113.038589952, "train/loss_slope": 3.7976438134345307e-06} {"step": 53910, "timestamp": 1778252810.471788, "train/loss": 2.11867516040802, "train/z_loss": 0.0013910968904383481, "train/perplexity": 8.320107375076057, "train/grad_norm": 0.123046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029322.3884142882, "perf/iters_per_sec": 0.9676563207694474, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0334247589111327, "data/tokens_consumed": 113059561472, "data/tokens_consumed_B": 113.059561472, "train/loss_slope": 3.4433492124885224e-06} {"step": 53920, "timestamp": 1778252820.8113904, "train/loss": 2.1981126189231874, "train/z_loss": 0.0013834293349646032, "train/perplexity": 9.007995924113805, "train/grad_norm": 0.330078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029229.9272001854, "perf/iters_per_sec": 0.9676122318268706, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0334718465805053, "data/tokens_consumed": 113080532992, "data/tokens_consumed_B": 113.080532992, "train/loss_slope": 6.330541279664303e-06} {"step": 53925, "timestamp": 1778252826.5857258, "eos/sharpness": 72.77350425720213, "eos/L0_probe": 1.9828803539276123, "eos/L_plus": 2.3056724071502686, "eos/L_minus": 2.3878233432769775, "eos/grad_norm": 0.19792170822620392, "eos/embed_grad_frac": 0.056165024638175964, "eos/time_s": 0.5956194400787354} {"step": 53925, "timestamp": 1778252827.9639513, "geo/rankme_last": 437.3685607910156, "geo/layer_0/stable_rank_q_proj": 19.442834854125977, "geo/layer_0/stable_rank_k_proj": 16.34316635131836, "geo/layer_0/stable_rank_o_proj": 47.55200958251953, "geo/layer_0/stable_rank_gate_proj": 132.3790740966797, "geo/layer_0/stable_rank_down_proj": 54.083824157714844, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06713680177927017, "geo/layer_0/attn_entropy_mean": 6.169805526733398, "geo/layer_0/attn_entropy_std": 0.40184253454208374, "geo/layer_7/stable_rank_q_proj": 42.929649353027344, "geo/layer_7/stable_rank_k_proj": 41.69342803955078, "geo/layer_7/stable_rank_o_proj": 93.29352569580078, "geo/layer_7/stable_rank_gate_proj": 85.25697326660156, "geo/layer_7/stable_rank_down_proj": 143.35235595703125, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4677615463733673, "geo/layer_7/attn_entropy_mean": 4.655240535736084, "geo/layer_7/attn_entropy_std": 0.7880622148513794, "geo/layer_14/stable_rank_q_proj": 52.189212799072266, "geo/layer_14/stable_rank_k_proj": 38.91908264160156, "geo/layer_14/stable_rank_o_proj": 44.847232818603516, "geo/layer_14/stable_rank_gate_proj": 72.72247314453125, "geo/layer_14/stable_rank_down_proj": 130.3153076171875, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3800489008426666, "geo/layer_14/attn_entropy_mean": 5.522218704223633, "geo/layer_14/attn_entropy_std": 0.3867369592189789, "geo/layer_21/stable_rank_q_proj": 40.78220748901367, "geo/layer_21/stable_rank_k_proj": 30.247142791748047, "geo/layer_21/stable_rank_o_proj": 72.35160064697266, "geo/layer_21/stable_rank_gate_proj": 68.54019165039062, "geo/layer_21/stable_rank_down_proj": 53.129817962646484, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14404359459877014, "geo/layer_21/attn_entropy_mean": 5.723405838012695, "geo/layer_21/attn_entropy_std": 0.2973547577857971, "geo/layer_27/stable_rank_q_proj": 42.7512092590332, "geo/layer_27/stable_rank_k_proj": 31.477869033813477, "geo/layer_27/stable_rank_o_proj": 115.96562194824219, "geo/layer_27/stable_rank_gate_proj": 81.5865707397461, "geo/layer_27/stable_rank_down_proj": 129.3262939453125, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09761200845241547, "geo/layer_27/attn_entropy_mean": 4.217446804046631, "geo/layer_27/attn_entropy_std": 0.7021426558494568, "attnres/final_alpha/block_0": 0.23984751105308533, "attnres/block_norm/0": 1.7505887746810913, "attnres/final_alpha/block_1": 0.004844240844249725, "attnres/block_norm/1": 44558.625, "attnres/final_alpha/block_2": 0.010623417794704437, "attnres/block_norm/2": 27988.33984375, "attnres/final_alpha/block_3": 0.012493040412664413, "attnres/block_norm/3": 54307.6171875, "attnres/final_alpha/block_4": 0.014405503869056702, "attnres/block_norm/4": 14433.7578125, "attnres/final_alpha/block_5": 0.6038034558296204, "attnres/block_norm/5": 6572.71484375, "attnres/final_alpha/block_6": 0.11398284137248993, "attnres/block_norm/6": 36068.51171875, "geo/tier1_time_s": 1.3580958843231201, "geo/step": 53925.0, "geo/rankme_slope": -8.37659087072329e-05} {"step": 53930, "timestamp": 1778252833.1375, "train/loss": 2.1489526271820067, "train/z_loss": 0.0013711633277125657, "train/perplexity": 8.575871556953993, "train/grad_norm": 0.09130859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1702352.5392785245, "perf/iters_per_sec": 0.8117449470894453, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2319140434265137, "data/tokens_consumed": 113101504512, "data/tokens_consumed_B": 113.101504512, "train/loss_slope": 5.3127288603760945e-06} {"step": 53940, "timestamp": 1778252843.4729784, "train/loss": 2.1525360107421876, "train/z_loss": 0.001381628552917391, "train/perplexity": 8.606657319761174, "train/grad_norm": 0.1123046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029972.1016120915, "perf/iters_per_sec": 0.967966128164335, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0330940008163452, "data/tokens_consumed": 113122476032, "data/tokens_consumed_B": 113.122476032, "train/loss_slope": 5.5376369460294955e-06} {"step": 53950, "timestamp": 1778252853.8024433, "grad/layer_0/attn": 0.0025676407385617495, "grad/layer_0/mlp": 0.002633404918015003, "grad/layer_0/attn_mlp_ratio": 0.9750269028108615, "grad/layer_4/attn": 0.0018430027412250638, "grad/layer_4/mlp": 0.0025677045341581106, "grad/layer_4/attn_mlp_ratio": 0.7177627507103599, "grad/layer_8/attn": 0.0036748023703694344, "grad/layer_8/mlp": 0.0037835806142538786, "grad/layer_8/attn_mlp_ratio": 0.971249894716229, "grad/layer_12/attn": 0.005296403542160988, "grad/layer_12/mlp": 0.006597727071493864, "grad/layer_12/attn_mlp_ratio": 0.8027618306262535, "grad/layer_16/attn": 0.0057726455852389336, "grad/layer_16/mlp": 0.004625556990504265, "grad/layer_16/attn_mlp_ratio": 1.2479892631937264, "grad/layer_20/attn": 0.0037908160593360662, "grad/layer_20/mlp": 0.005767746362835169, "grad/layer_20/attn_mlp_ratio": 0.657243879175762, "grad/layer_24/attn": 0.008810227736830711, "grad/layer_24/mlp": 0.008706405758857727, "grad/layer_24/attn_mlp_ratio": 1.0119247689179753, "grad/layer_27/attn": 0.007856645621359348, "grad/layer_27/mlp": 0.008417977951467037, "grad/layer_27/attn_mlp_ratio": 0.9333174276915747} {"step": 53950, "timestamp": 1778252853.8165908, "train/loss": 2.116888678073883, "train/z_loss": 0.0013830639654770494, "train/perplexity": 8.30525691922051, "train/grad_norm": 0.1474609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028954.2789372045, "perf/iters_per_sec": 0.9674807924924872, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0336122512817383, "data/tokens_consumed": 113143447552, "data/tokens_consumed_B": 113.143447552, "train/loss_slope": 2.212573006243953e-06} {"step": 53960, "timestamp": 1778252864.1657438, "train/loss": 2.1139413475990296, "train/z_loss": 0.0013974024564959108, "train/perplexity": 8.28081461986158, "train/grad_norm": 0.1240234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027855.473116894, "perf/iters_per_sec": 0.9669568410477133, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341723203659057, "data/tokens_consumed": 113164419072, "data/tokens_consumed_B": 113.164419072, "train/loss_slope": -2.2047226542722674e-06} {"step": 53970, "timestamp": 1778252874.5048459, "train/loss": 2.1241567492485047, "train/z_loss": 0.0013918673852458597, "train/perplexity": 8.365840012057406, "train/grad_norm": 0.2041015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029333.999329284, "perf/iters_per_sec": 0.9676618572851582, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033418846130371, "data/tokens_consumed": 113185390592, "data/tokens_consumed_B": 113.185390592, "train/loss_slope": -3.1065752082067064e-06} {"step": 53980, "timestamp": 1778252884.9209666, "train/loss": 2.149827742576599, "train/z_loss": 0.0013810886186547578, "train/perplexity": 8.583379718950441, "train/grad_norm": 0.1298828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019335.836587228, "perf/iters_per_sec": 0.9628943617759839, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0385355234146119, "data/tokens_consumed": 113206362112, "data/tokens_consumed_B": 113.206362112, "train/loss_slope": -4.95522372042355e-07} {"step": 53990, "timestamp": 1778252895.3002374, "train/loss": 2.1759103536605835, "train/z_loss": 0.0013793036225251854, "train/perplexity": 8.810201872119896, "train/grad_norm": 0.1337890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022077.520199642, "perf/iters_per_sec": 0.9642016983984194, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03712739944458, "data/tokens_consumed": 113227333632, "data/tokens_consumed_B": 113.227333632, "train/loss_slope": 8.80812675860641e-07} {"step": 54000, "timestamp": 1778252905.671142, "grad/layer_0/attn": 0.004201738629490137, "grad/layer_0/mlp": 0.003426077077165246, "grad/layer_0/attn_mlp_ratio": 1.2263992934819634, "grad/layer_4/attn": 0.0022896421141922474, "grad/layer_4/mlp": 0.002706801751628518, "grad/layer_4/attn_mlp_ratio": 0.8458846416167142, "grad/layer_8/attn": 0.004124288447201252, "grad/layer_8/mlp": 0.003537652315571904, "grad/layer_8/attn_mlp_ratio": 1.1658263624337755, "grad/layer_12/attn": 0.008002718910574913, "grad/layer_12/mlp": 0.0068348608911037445, "grad/layer_12/attn_mlp_ratio": 1.1708678378377049, "grad/layer_16/attn": 0.003959536086767912, "grad/layer_16/mlp": 0.004668314941227436, "grad/layer_16/attn_mlp_ratio": 0.8481724244829104, "grad/layer_20/attn": 0.004812565166503191, "grad/layer_20/mlp": 0.007291074842214584, "grad/layer_20/attn_mlp_ratio": 0.6600625017086201, "grad/layer_24/attn": 0.014344188384711742, "grad/layer_24/mlp": 0.010786893777549267, "grad/layer_24/attn_mlp_ratio": 1.3297793180821278, "grad/layer_27/attn": 0.014378237538039684, "grad/layer_27/mlp": 0.010089144110679626, "grad/layer_27/attn_mlp_ratio": 1.425119637284988} {"step": 54000, "timestamp": 1778252906.2936983, "eos/sharpness": 74.1577386856079, "eos/L0_probe": 1.9818671941757202, "eos/L_plus": 2.444676399230957, "eos/L_minus": 2.2606353759765625, "eos/grad_norm": 0.20447523891925812, "eos/embed_grad_frac": 0.05272211134433746, "eos/time_s": 0.6193196773529053} {"step": 54000, "timestamp": 1778252906.3155997, "train/loss": 2.1974597692489626, "train/z_loss": 0.0013637578813359141, "train/perplexity": 9.002116976152765, "train/grad_norm": 0.2041015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1905076.4383712125, "perf/iters_per_sec": 0.9084112350326597, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1008230209350587, "data/tokens_consumed": 113248305152, "data/tokens_consumed_B": 113.248305152, "train/loss_slope": 1.37512224628302e-06} {"step": 54000, "timestamp": 1778252907.6829317, "geo/rankme_last": 439.0121154785156, "geo/layer_0/stable_rank_q_proj": 19.453012466430664, "geo/layer_0/stable_rank_k_proj": 16.327789306640625, "geo/layer_0/stable_rank_o_proj": 47.582096099853516, "geo/layer_0/stable_rank_gate_proj": 132.48052978515625, "geo/layer_0/stable_rank_down_proj": 54.20030212402344, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06462058424949646, "geo/layer_0/attn_entropy_mean": 6.1693243980407715, "geo/layer_0/attn_entropy_std": 0.401254802942276, "geo/layer_7/stable_rank_q_proj": 42.806884765625, "geo/layer_7/stable_rank_k_proj": 41.53370666503906, "geo/layer_7/stable_rank_o_proj": 93.33993530273438, "geo/layer_7/stable_rank_gate_proj": 85.28205108642578, "geo/layer_7/stable_rank_down_proj": 143.18008422851562, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4637063443660736, "geo/layer_7/attn_entropy_mean": 4.629729270935059, "geo/layer_7/attn_entropy_std": 0.78054279088974, "geo/layer_14/stable_rank_q_proj": 52.1903190612793, "geo/layer_14/stable_rank_k_proj": 38.97964096069336, "geo/layer_14/stable_rank_o_proj": 44.75761032104492, "geo/layer_14/stable_rank_gate_proj": 72.81409454345703, "geo/layer_14/stable_rank_down_proj": 130.43487548828125, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.40124714374542236, "geo/layer_14/attn_entropy_mean": 5.53748083114624, "geo/layer_14/attn_entropy_std": 0.38365691900253296, "geo/layer_21/stable_rank_q_proj": 40.824283599853516, "geo/layer_21/stable_rank_k_proj": 30.177776336669922, "geo/layer_21/stable_rank_o_proj": 72.35140991210938, "geo/layer_21/stable_rank_gate_proj": 68.50071716308594, "geo/layer_21/stable_rank_down_proj": 53.0913200378418, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14648933708667755, "geo/layer_21/attn_entropy_mean": 5.70922327041626, "geo/layer_21/attn_entropy_std": 0.2900892496109009, "geo/layer_27/stable_rank_q_proj": 42.71713638305664, "geo/layer_27/stable_rank_k_proj": 31.42681121826172, "geo/layer_27/stable_rank_o_proj": 116.06436157226562, "geo/layer_27/stable_rank_gate_proj": 81.56086730957031, "geo/layer_27/stable_rank_down_proj": 129.33465576171875, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08862912654876709, "geo/layer_27/attn_entropy_mean": 4.235759735107422, "geo/layer_27/attn_entropy_std": 0.7074810862541199, "attnres/final_alpha/block_0": 0.23731869459152222, "attnres/block_norm/0": 1.7508217096328735, "attnres/final_alpha/block_1": 0.004803305957466364, "attnres/block_norm/1": 44690.3515625, "attnres/final_alpha/block_2": 0.010188011452555656, "attnres/block_norm/2": 28174.0625, "attnres/final_alpha/block_3": 0.012021759524941444, "attnres/block_norm/3": 54369.15625, "attnres/final_alpha/block_4": 0.014024941250681877, "attnres/block_norm/4": 14396.060546875, "attnres/final_alpha/block_5": 0.610316276550293, "attnres/block_norm/5": 6381.005859375, "attnres/final_alpha/block_6": 0.11132702231407166, "attnres/block_norm/6": 35810.84375, "geo/tier1_time_s": 1.3633201122283936, "geo/step": 54000.0, "geo/rankme_slope": -7.739380908613446e-05} {"step": 54000, "timestamp": 1778252914.5599132, "geo/ww_alpha_mean": 7.579761086650886, "geo/ww_alpha_std": 4.4200547378776065, "geo/ww_alpha_min": 1.3391445912833784, "geo/ww_alpha_max": 28.058524893093097, "geo/ww_alpha_healthy_frac": 0.16751269035532995, "geo/ww_alpha_by_type/q_proj": 4.003174661047714, "geo/ww_alpha_by_type/k_proj": 4.434083649511846, "geo/ww_alpha_by_type/v_proj": 9.10979568324335, "geo/ww_alpha_by_type/o_proj": 7.967345083734643, "geo/ww_alpha_by_type/gate_proj": 8.237604188243179, "geo/ww_alpha_by_type/up_proj": 11.115166755753636, "geo/ww_alpha_by_type/down_proj": 8.288865518477808, "geo/twonn_id/layer_0": 0.7228795886039734, "geo/twonn_id/layer_7": 2.984792709350586, "geo/twonn_id/layer_14": 5.576702117919922, "geo/twonn_id/layer_21": 8.245563507080078, "geo/twonn_id/layer_27": 5.157238483428955, "geo/tier2_time_s": 6.868135929107666} {"step": 54000, "timestamp": 1778252915.2374814, "eoc/jacobian_sigma/layer_0/attn": 1030.88623046875, "eoc/jacobian_sigma/layer_0/mlp": 8175.10693359375, "eoc/jacobian_sigma/layer_0": 8175.10693359375, "eoc/jacobian_sigma/layer_7/attn": 1.1561704874038696, "eoc/jacobian_sigma/layer_7/mlp": 1.7026368379592896, "eoc/jacobian_sigma/layer_7": 1.7026368379592896, "eoc/jacobian_sigma/layer_14/attn": 1.5293766260147095, "eoc/jacobian_sigma/layer_14/mlp": 6.29241943359375, "eoc/jacobian_sigma/layer_14": 6.29241943359375, "eoc/jacobian_sigma/layer_21/attn": 1.1017078161239624, "eoc/jacobian_sigma/layer_21/mlp": 4.168071746826172, "eoc/jacobian_sigma/layer_21": 4.168071746826172, "eoc/jacobian_sigma/layer_27/attn": 3.305023670196533, "eoc/jacobian_sigma/layer_27/mlp": 26.7077693939209, "eoc/jacobian_sigma/layer_27": 26.7077693939209, "eoc/layer0_sigma": 8175.10693359375, "eoc/sigma_max": 26.7077693939209, "eoc/sigma_min": 1.7026368379592896, "eoc/sigma_mean": 9.717724353075027, "eoc/time_s": 0.6696412563323975} {"step": 54010, "timestamp": 1778252925.6328351, "train/loss": 2.172629642486572, "train/z_loss": 0.0013833737815730275, "train/perplexity": 8.781345504977708, "train/grad_norm": 0.095703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1085863.2497108963, "perf/iters_per_sec": 0.5177799461893541, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.9313223838806153, "data/tokens_consumed": 113269276672, "data/tokens_consumed_B": 113.269276672, "train/loss_slope": 3.823358669961991e-06} {"step": 54020, "timestamp": 1778252936.0055897, "train/loss": 2.124196267127991, "train/z_loss": 0.0013907214510254563, "train/perplexity": 8.3661706188472, "train/grad_norm": 0.09716796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022785.6766966355, "perf/iters_per_sec": 0.9645393737300089, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0367643117904664, "data/tokens_consumed": 113290248192, "data/tokens_consumed_B": 113.290248192, "train/loss_slope": 4.491387513747565e-06} {"step": 54030, "timestamp": 1778252946.3851304, "train/loss": 2.157466733455658, "train/z_loss": 0.0013842913904227316, "train/perplexity": 8.649199155302346, "train/grad_norm": 0.1806640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022219.4000720598, "perf/iters_per_sec": 0.9642693519935893, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0370546340942384, "data/tokens_consumed": 113311219712, "data/tokens_consumed_B": 113.311219712, "train/loss_slope": 5.148554549287332e-06} {"step": 54040, "timestamp": 1778252956.7649353, "train/loss": 2.1488480091094972, "train/z_loss": 0.0013871850911527871, "train/perplexity": 8.574974412731184, "train/grad_norm": 0.10498046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021881.096859355, "perf/iters_per_sec": 0.9641080364510322, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0372281551361084, "data/tokens_consumed": 113332191232, "data/tokens_consumed_B": 113.332191232, "train/loss_slope": 5.114383937859742e-06} {"step": 54050, "timestamp": 1778252967.1261137, "grad/layer_0/attn": 0.003265195991843939, "grad/layer_0/mlp": 0.0033208467066287994, "grad/layer_0/attn_mlp_ratio": 0.9832419807279953, "grad/layer_4/attn": 0.0021738819777965546, "grad/layer_4/mlp": 0.002554517239332199, "grad/layer_4/attn_mlp_ratio": 0.8509951936223106, "grad/layer_8/attn": 0.004887202754616737, "grad/layer_8/mlp": 0.003563760779798031, "grad/layer_8/attn_mlp_ratio": 1.3713610198487027, "grad/layer_12/attn": 0.005993778817355633, "grad/layer_12/mlp": 0.006493046879768372, "grad/layer_12/attn_mlp_ratio": 0.9231072616649157, "grad/layer_16/attn": 0.0033571976236999035, "grad/layer_16/mlp": 0.004412936978042126, "grad/layer_16/attn_mlp_ratio": 0.7607626314013481, "grad/layer_20/attn": 0.0031546535901725292, "grad/layer_20/mlp": 0.006496166344732046, "grad/layer_20/attn_mlp_ratio": 0.48561772808803894, "grad/layer_24/attn": 0.010023807175457478, "grad/layer_24/mlp": 0.010916197672486305, "grad/layer_24/attn_mlp_ratio": 0.9182507851517632, "grad/layer_27/attn": 0.005446564871817827, "grad/layer_27/mlp": 0.011362655088305473, "grad/layer_27/attn_mlp_ratio": 0.4793390965012712} {"step": 54050, "timestamp": 1778252967.1429732, "train/loss": 2.2126846075057984, "train/z_loss": 0.00138581971405074, "train/perplexity": 9.140221392124495, "train/grad_norm": 0.1572265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021726.206833321, "perf/iters_per_sec": 0.964034179131184, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037307620048523, "data/tokens_consumed": 113353162752, "data/tokens_consumed_B": 113.353162752, "train/loss_slope": 9.018690973082166e-06} {"step": 54060, "timestamp": 1778252977.4912302, "train/loss": 2.1387518882751464, "train/z_loss": 0.0013825126341544091, "train/perplexity": 8.488835998512688, "train/grad_norm": 0.126953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027779.9274843505, "perf/iters_per_sec": 0.9669208180829766, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342108488082886, "data/tokens_consumed": 113374134272, "data/tokens_consumed_B": 113.374134272, "train/loss_slope": 8.11837234548577e-06} {"step": 54070, "timestamp": 1778252987.8424625, "train/loss": 2.145772385597229, "train/z_loss": 0.0013890979578718543, "train/perplexity": 8.548641535576246, "train/grad_norm": 0.1416015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027700.5077929867, "perf/iters_per_sec": 0.9668829478230413, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342513561248778, "data/tokens_consumed": 113395105792, "data/tokens_consumed_B": 113.395105792, "train/loss_slope": 7.114643446861945e-06} {"step": 54075, "timestamp": 1778252993.619897, "eos/sharpness": 77.03032493591307, "eos/L0_probe": 1.982712745666504, "eos/L_plus": 2.4604949951171875, "eos/L_minus": 2.275233745574951, "eos/grad_norm": 0.2454025149345398, "eos/embed_grad_frac": 0.07124296575784683, "eos/time_s": 0.6099917888641357} {"step": 54075, "timestamp": 1778252995.0041857, "geo/rankme_last": 438.8189392089844, "geo/layer_0/stable_rank_q_proj": 19.45335578918457, "geo/layer_0/stable_rank_k_proj": 16.308528900146484, "geo/layer_0/stable_rank_o_proj": 47.65782165527344, "geo/layer_0/stable_rank_gate_proj": 132.5086669921875, "geo/layer_0/stable_rank_down_proj": 54.252559661865234, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05999287590384483, "geo/layer_0/attn_entropy_mean": 6.171448707580566, "geo/layer_0/attn_entropy_std": 0.39750438928604126, "geo/layer_7/stable_rank_q_proj": 42.808624267578125, "geo/layer_7/stable_rank_k_proj": 41.61503219604492, "geo/layer_7/stable_rank_o_proj": 93.26024627685547, "geo/layer_7/stable_rank_gate_proj": 85.3095703125, "geo/layer_7/stable_rank_down_proj": 143.1389617919922, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.47055259346961975, "geo/layer_7/attn_entropy_mean": 4.664929389953613, "geo/layer_7/attn_entropy_std": 0.7837822437286377, "geo/layer_14/stable_rank_q_proj": 52.119266510009766, "geo/layer_14/stable_rank_k_proj": 38.93391036987305, "geo/layer_14/stable_rank_o_proj": 44.77582931518555, "geo/layer_14/stable_rank_gate_proj": 72.72486877441406, "geo/layer_14/stable_rank_down_proj": 130.43252563476562, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3691381812095642, "geo/layer_14/attn_entropy_mean": 5.505499839782715, "geo/layer_14/attn_entropy_std": 0.37773454189300537, "geo/layer_21/stable_rank_q_proj": 40.84101104736328, "geo/layer_21/stable_rank_k_proj": 30.184295654296875, "geo/layer_21/stable_rank_o_proj": 72.31596374511719, "geo/layer_21/stable_rank_gate_proj": 68.48628997802734, "geo/layer_21/stable_rank_down_proj": 53.12023162841797, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14649757742881775, "geo/layer_21/attn_entropy_mean": 5.712897300720215, "geo/layer_21/attn_entropy_std": 0.2956227958202362, "geo/layer_27/stable_rank_q_proj": 42.745540618896484, "geo/layer_27/stable_rank_k_proj": 31.436870574951172, "geo/layer_27/stable_rank_o_proj": 115.89266204833984, "geo/layer_27/stable_rank_gate_proj": 81.5754623413086, "geo/layer_27/stable_rank_down_proj": 129.3451385498047, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08212316036224365, "geo/layer_27/attn_entropy_mean": 4.231040000915527, "geo/layer_27/attn_entropy_std": 0.7164201736450195, "attnres/final_alpha/block_0": 0.23811298608779907, "attnres/block_norm/0": 1.7510013580322266, "attnres/final_alpha/block_1": 0.004809725098311901, "attnres/block_norm/1": 44632.9453125, "attnres/final_alpha/block_2": 0.010385608300566673, "attnres/block_norm/2": 28087.0625, "attnres/final_alpha/block_3": 0.01231830008327961, "attnres/block_norm/3": 54106.0703125, "attnres/final_alpha/block_4": 0.014280878007411957, "attnres/block_norm/4": 14511.720703125, "attnres/final_alpha/block_5": 0.609979510307312, "attnres/block_norm/5": 6412.83203125, "attnres/final_alpha/block_6": 0.11011300981044769, "attnres/block_norm/6": 35732.796875, "geo/tier1_time_s": 1.3663740158081055, "geo/step": 54075.0, "geo/rankme_slope": -7.088980123299319e-05} {"step": 54080, "timestamp": 1778253000.1889586, "train/loss": 2.1371703147888184, "train/z_loss": 0.0013904753490351141, "train/perplexity": 8.475420891852508, "train/grad_norm": 0.208984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1699351.5373652077, "perf/iters_per_sec": 0.8103139578653372, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2340895652770996, "data/tokens_consumed": 113416077312, "data/tokens_consumed_B": 113.416077312, "train/loss_slope": 5.816263985140287e-06} {"step": 54090, "timestamp": 1778253010.5400364, "train/loss": 2.196446251869202, "train/z_loss": 0.001369853736832738, "train/perplexity": 8.992997796147122, "train/grad_norm": 0.1953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027506.4021497979, "perf/iters_per_sec": 0.966790391039752, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034350371360779, "data/tokens_consumed": 113437048832, "data/tokens_consumed_B": 113.437048832, "train/loss_slope": 5.859453385562265e-06} {"step": 54100, "timestamp": 1778253020.8746238, "grad/layer_0/attn": 0.0031347244512289762, "grad/layer_0/mlp": 0.0031723843421787024, "grad/layer_0/attn_mlp_ratio": 0.9881288060649225, "grad/layer_4/attn": 0.0038822221104055643, "grad/layer_4/mlp": 0.0024936748668551445, "grad/layer_4/attn_mlp_ratio": 1.5568276387285398, "grad/layer_8/attn": 0.005060569383203983, "grad/layer_8/mlp": 0.003583855228498578, "grad/layer_8/attn_mlp_ratio": 1.4120462237866227, "grad/layer_12/attn": 0.008507037535309792, "grad/layer_12/mlp": 0.006516209337860346, "grad/layer_12/attn_mlp_ratio": 1.3055193539179968, "grad/layer_16/attn": 0.0033669969998300076, "grad/layer_16/mlp": 0.004689407534897327, "grad/layer_16/attn_mlp_ratio": 0.7180004942999004, "grad/layer_20/attn": 0.003749589901417494, "grad/layer_20/mlp": 0.0062702554278075695, "grad/layer_20/attn_mlp_ratio": 0.5979963471645883, "grad/layer_24/attn": 0.013929699547588825, "grad/layer_24/mlp": 0.00915707927197218, "grad/layer_24/attn_mlp_ratio": 1.521194584184188, "grad/layer_27/attn": 0.0100939916446805, "grad/layer_27/mlp": 0.008551094681024551, "grad/layer_27/attn_mlp_ratio": 1.180432670104387} {"step": 54100, "timestamp": 1778253020.888925, "train/loss": 2.1997714757919313, "train/z_loss": 0.001370620308443904, "train/perplexity": 9.022951301011458, "train/grad_norm": 0.16015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028006.0201833344, "perf/iters_per_sec": 0.9670286274830505, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034095549583435, "data/tokens_consumed": 113458020352, "data/tokens_consumed_B": 113.458020352, "train/loss_slope": 1.1041419297912783e-05} {"step": 54110, "timestamp": 1778253031.2451532, "train/loss": 2.124234664440155, "train/z_loss": 0.0013798811472952367, "train/perplexity": 8.366491863479494, "train/grad_norm": 0.1884765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026030.863523872, "perf/iters_per_sec": 0.9660867993945466, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035103678703308, "data/tokens_consumed": 113478991872, "data/tokens_consumed_B": 113.478991872, "train/loss_slope": 1.0795425367255205e-05} {"step": 54120, "timestamp": 1778253041.5896559, "train/loss": 2.1444693565368653, "train/z_loss": 0.0013717656373046338, "train/perplexity": 8.537509661381598, "train/grad_norm": 0.142578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028623.8244283854, "perf/iters_per_sec": 0.9673232195035865, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0337806224822998, "data/tokens_consumed": 113499963392, "data/tokens_consumed_B": 113.499963392, "train/loss_slope": 1.02103382053942e-05} {"step": 54130, "timestamp": 1778253051.9316928, "train/loss": 2.17814781665802, "train/z_loss": 0.0013727934448979795, "train/perplexity": 8.829936442260939, "train/grad_norm": 0.1572265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029061.0843366769, "perf/iters_per_sec": 0.9675317212756523, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0335578441619873, "data/tokens_consumed": 113520934912, "data/tokens_consumed_B": 113.520934912, "train/loss_slope": 1.051615286688889e-05} {"step": 54140, "timestamp": 1778253062.285053, "train/loss": 2.133143734931946, "train/z_loss": 0.0013867464731447399, "train/perplexity": 8.441362548147847, "train/grad_norm": 0.09326171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026731.3317248994, "perf/iters_per_sec": 0.966420808660936, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347459316253662, "data/tokens_consumed": 113541906432, "data/tokens_consumed_B": 113.541906432, "train/loss_slope": 7.551641633050676e-06} {"step": 54150, "timestamp": 1778253072.6214552, "grad/layer_0/attn": 0.00272964034229517, "grad/layer_0/mlp": 0.0031263108830899, "grad/layer_0/attn_mlp_ratio": 0.8731186235341567, "grad/layer_4/attn": 0.003888887818902731, "grad/layer_4/mlp": 0.0024404956493526697, "grad/layer_4/attn_mlp_ratio": 1.593482725767599, "grad/layer_8/attn": 0.006977361626923084, "grad/layer_8/mlp": 0.003569783177226782, "grad/layer_8/attn_mlp_ratio": 1.9545616876617535, "grad/layer_12/attn": 0.005474684294313192, "grad/layer_12/mlp": 0.006642891559749842, "grad/layer_12/attn_mlp_ratio": 0.8241417404840468, "grad/layer_16/attn": 0.0062516918405890465, "grad/layer_16/mlp": 0.004708264954388142, "grad/layer_16/attn_mlp_ratio": 1.3278122128579866, "grad/layer_20/attn": 0.0040618013590574265, "grad/layer_20/mlp": 0.005921646952629089, "grad/layer_20/attn_mlp_ratio": 0.6859242577205054, "grad/layer_24/attn": 0.008337804116308689, "grad/layer_24/mlp": 0.008819743059575558, "grad/layer_24/attn_mlp_ratio": 0.9453567938944312, "grad/layer_27/attn": 0.004789578262716532, "grad/layer_27/mlp": 0.00872061587870121, "grad/layer_27/attn_mlp_ratio": 0.5492247651329166} {"step": 54150, "timestamp": 1778253073.2222073, "eos/sharpness": 54.648351669311516, "eos/L0_probe": 1.982724905014038, "eos/L_plus": 2.236586093902588, "eos/L_minus": 2.2753472328186035, "eos/grad_norm": 0.1521814465522766, "eos/embed_grad_frac": 0.09493625164031982, "eos/time_s": 0.5978431701660156} {"step": 54150, "timestamp": 1778253073.2426567, "train/loss": 2.176453423500061, "train/z_loss": 0.001376379281282425, "train/perplexity": 8.8149877264453, "train/grad_norm": 0.15234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1915078.5349612548, "perf/iters_per_sec": 0.9131806063467287, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0950736284255982, "data/tokens_consumed": 113562877952, "data/tokens_consumed_B": 113.562877952, "train/loss_slope": 8.359083713013073e-06} {"step": 54150, "timestamp": 1778253074.6075928, "geo/rankme_last": 438.78192138671875, "geo/layer_0/stable_rank_q_proj": 19.46586799621582, "geo/layer_0/stable_rank_k_proj": 16.35933494567871, "geo/layer_0/stable_rank_o_proj": 47.65509033203125, "geo/layer_0/stable_rank_gate_proj": 132.544677734375, "geo/layer_0/stable_rank_down_proj": 54.21997833251953, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.07026804983615875, "geo/layer_0/attn_entropy_mean": 6.175210952758789, "geo/layer_0/attn_entropy_std": 0.39778533577919006, "geo/layer_7/stable_rank_q_proj": 42.92619705200195, "geo/layer_7/stable_rank_k_proj": 41.65650177001953, "geo/layer_7/stable_rank_o_proj": 93.30738830566406, "geo/layer_7/stable_rank_gate_proj": 85.13137817382812, "geo/layer_7/stable_rank_down_proj": 143.20046997070312, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.46854186058044434, "geo/layer_7/attn_entropy_mean": 4.6461262702941895, "geo/layer_7/attn_entropy_std": 0.7924274206161499, "geo/layer_14/stable_rank_q_proj": 52.117149353027344, "geo/layer_14/stable_rank_k_proj": 38.97658157348633, "geo/layer_14/stable_rank_o_proj": 44.703678131103516, "geo/layer_14/stable_rank_gate_proj": 72.75254821777344, "geo/layer_14/stable_rank_down_proj": 130.09690856933594, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39844125509262085, "geo/layer_14/attn_entropy_mean": 5.515045166015625, "geo/layer_14/attn_entropy_std": 0.3849249482154846, "geo/layer_21/stable_rank_q_proj": 40.837135314941406, "geo/layer_21/stable_rank_k_proj": 30.26224136352539, "geo/layer_21/stable_rank_o_proj": 72.21622467041016, "geo/layer_21/stable_rank_gate_proj": 68.44853973388672, "geo/layer_21/stable_rank_down_proj": 53.0670051574707, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14129109680652618, "geo/layer_21/attn_entropy_mean": 5.718631744384766, "geo/layer_21/attn_entropy_std": 0.2947230935096741, "geo/layer_27/stable_rank_q_proj": 42.77901077270508, "geo/layer_27/stable_rank_k_proj": 31.483646392822266, "geo/layer_27/stable_rank_o_proj": 116.1764907836914, "geo/layer_27/stable_rank_gate_proj": 81.58457946777344, "geo/layer_27/stable_rank_down_proj": 129.4123077392578, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09362347424030304, "geo/layer_27/attn_entropy_mean": 4.230414867401123, "geo/layer_27/attn_entropy_std": 0.7196843028068542, "attnres/final_alpha/block_0": 0.2394687831401825, "attnres/block_norm/0": 1.751172423362732, "attnres/final_alpha/block_1": 0.004866696894168854, "attnres/block_norm/1": 44822.4609375, "attnres/final_alpha/block_2": 0.010521694086492062, "attnres/block_norm/2": 28078.263671875, "attnres/final_alpha/block_3": 0.012241948395967484, "attnres/block_norm/3": 54692.1328125, "attnres/final_alpha/block_4": 0.014328330755233765, "attnres/block_norm/4": 14457.5712890625, "attnres/final_alpha/block_5": 0.6058283448219299, "attnres/block_norm/5": 6410.2724609375, "attnres/final_alpha/block_6": 0.11274414509534836, "attnres/block_norm/6": 35670.67578125, "geo/tier1_time_s": 1.3617150783538818, "geo/step": 54150.0, "geo/rankme_slope": -8.086724924344738e-05} {"step": 54160, "timestamp": 1778253084.9574256, "train/loss": 2.1606536865234376, "train/z_loss": 0.0013901135651394724, "train/perplexity": 8.676807717312288, "train/grad_norm": 0.2890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1790656.8807925743, "perf/iters_per_sec": 0.8538517383540031, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.171163511276245, "data/tokens_consumed": 113583849472, "data/tokens_consumed_B": 113.583849472, "train/loss_slope": 1.0075499766086902e-05} {"step": 54170, "timestamp": 1778253095.3077042, "train/loss": 2.167836368083954, "train/z_loss": 0.0013800565851852299, "train/perplexity": 8.739354823174576, "train/grad_norm": 0.12060546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027186.1833760876, "perf/iters_per_sec": 0.9666376988296926, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345137596130372, "data/tokens_consumed": 113604820992, "data/tokens_consumed_B": 113.604820992, "train/loss_slope": 1.4283047765359337e-05} {"step": 54180, "timestamp": 1778253105.6487243, "train/loss": 2.177399682998657, "train/z_loss": 0.001362542191054672, "train/perplexity": 8.823332940057583, "train/grad_norm": 0.185546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029487.762846111, "perf/iters_per_sec": 0.9677351774435573, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0333405494689942, "data/tokens_consumed": 113625792512, "data/tokens_consumed_B": 113.625792512, "train/loss_slope": 1.3477394311162738e-05} {"step": 54190, "timestamp": 1778253115.9847674, "train/loss": 2.1747413992881777, "train/z_loss": 0.001382294890936464, "train/perplexity": 8.799909165144157, "train/grad_norm": 0.10498046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029950.4112563115, "perf/iters_per_sec": 0.9679557853967244, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0331050395965575, "data/tokens_consumed": 113646764032, "data/tokens_consumed_B": 113.646764032, "train/loss_slope": 1.533134687494102e-05} {"step": 54200, "timestamp": 1778253126.3180873, "grad/layer_0/attn": 0.0033731230068951845, "grad/layer_0/mlp": 0.003295136382803321, "grad/layer_0/attn_mlp_ratio": 1.0236671605254775, "grad/layer_4/attn": 0.0029879040084779263, "grad/layer_4/mlp": 0.002485191449522972, "grad/layer_4/attn_mlp_ratio": 1.2022831837857524, "grad/layer_8/attn": 0.003918297588825226, "grad/layer_8/mlp": 0.003600501921027899, "grad/layer_8/attn_mlp_ratio": 1.0882642381371586, "grad/layer_12/attn": 0.005793056916445494, "grad/layer_12/mlp": 0.006595277227461338, "grad/layer_12/attn_mlp_ratio": 0.8783644157501053, "grad/layer_16/attn": 0.003656102577224374, "grad/layer_16/mlp": 0.004582704044878483, "grad/layer_16/attn_mlp_ratio": 0.7978046283678042, "grad/layer_20/attn": 0.0035312895197421312, "grad/layer_20/mlp": 0.007094917818903923, "grad/layer_20/attn_mlp_ratio": 0.4977209828366375, "grad/layer_24/attn": 0.01801181025803089, "grad/layer_24/mlp": 0.014225715771317482, "grad/layer_24/attn_mlp_ratio": 1.2661443839425404, "grad/layer_27/attn": 0.010252604261040688, "grad/layer_27/mlp": 0.013835926540195942, "grad/layer_27/attn_mlp_ratio": 0.741013199018775} {"step": 54200, "timestamp": 1778253126.332608, "train/loss": 2.149704432487488, "train/z_loss": 0.0013909573201090098, "train/perplexity": 8.582321366886507, "train/grad_norm": 0.27734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027666.4793641474, "perf/iters_per_sec": 0.9668667218037355, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342687129974366, "data/tokens_consumed": 113667735552, "data/tokens_consumed_B": 113.667735552, "train/loss_slope": 1.5549798151984022e-05} {"step": 54210, "timestamp": 1778253136.6705241, "train/loss": 2.114220654964447, "train/z_loss": 0.0013882845756597816, "train/perplexity": 8.283127835410596, "train/grad_norm": 0.10302734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2030196.3873891332, "perf/iters_per_sec": 0.968073075956885, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0329798698425292, "data/tokens_consumed": 113688707072, "data/tokens_consumed_B": 113.688707072, "train/loss_slope": 1.726982572791895e-05} {"step": 54220, "timestamp": 1778253147.0286891, "train/loss": 2.170128655433655, "train/z_loss": 0.0013712672982364893, "train/perplexity": 8.759410914059803, "train/grad_norm": 0.123046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025690.4908949677, "perf/iters_per_sec": 0.9659244970774497, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352776050567627, "data/tokens_consumed": 113709678592, "data/tokens_consumed_B": 113.709678592, "train/loss_slope": 1.6087569776970627e-05} {"step": 54225, "timestamp": 1778253152.8004506, "eos/sharpness": 69.55561637878417, "eos/L0_probe": 1.9846327304840088, "eos/L_plus": 2.3167104721069336, "eos/L_minus": 2.348111152648926, "eos/grad_norm": 0.21944572031497955, "eos/embed_grad_frac": 0.049672603607177734, "eos/time_s": 0.5998547077178955} {"step": 54225, "timestamp": 1778253154.183889, "geo/rankme_last": 439.46746826171875, "geo/layer_0/stable_rank_q_proj": 19.444807052612305, "geo/layer_0/stable_rank_k_proj": 16.287778854370117, "geo/layer_0/stable_rank_o_proj": 47.628807067871094, "geo/layer_0/stable_rank_gate_proj": 132.69154357910156, "geo/layer_0/stable_rank_down_proj": 54.20346450805664, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06192430108785629, "geo/layer_0/attn_entropy_mean": 6.172957420349121, "geo/layer_0/attn_entropy_std": 0.3942113220691681, "geo/layer_7/stable_rank_q_proj": 42.90934371948242, "geo/layer_7/stable_rank_k_proj": 41.64917755126953, "geo/layer_7/stable_rank_o_proj": 93.38656616210938, "geo/layer_7/stable_rank_gate_proj": 85.2714614868164, "geo/layer_7/stable_rank_down_proj": 142.91493225097656, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4626217484474182, "geo/layer_7/attn_entropy_mean": 4.660660743713379, "geo/layer_7/attn_entropy_std": 0.7995708584785461, "geo/layer_14/stable_rank_q_proj": 52.16965103149414, "geo/layer_14/stable_rank_k_proj": 39.09479522705078, "geo/layer_14/stable_rank_o_proj": 44.65223693847656, "geo/layer_14/stable_rank_gate_proj": 72.7872314453125, "geo/layer_14/stable_rank_down_proj": 130.4523468017578, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3972708582878113, "geo/layer_14/attn_entropy_mean": 5.525333404541016, "geo/layer_14/attn_entropy_std": 0.3834581673145294, "geo/layer_21/stable_rank_q_proj": 40.80512237548828, "geo/layer_21/stable_rank_k_proj": 30.337263107299805, "geo/layer_21/stable_rank_o_proj": 72.31315612792969, "geo/layer_21/stable_rank_gate_proj": 68.41124725341797, "geo/layer_21/stable_rank_down_proj": 53.06201171875, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14345021545886993, "geo/layer_21/attn_entropy_mean": 5.706340312957764, "geo/layer_21/attn_entropy_std": 0.2982191741466522, "geo/layer_27/stable_rank_q_proj": 42.83052444458008, "geo/layer_27/stable_rank_k_proj": 31.466691970825195, "geo/layer_27/stable_rank_o_proj": 116.24909210205078, "geo/layer_27/stable_rank_gate_proj": 81.43871307373047, "geo/layer_27/stable_rank_down_proj": 129.2940673828125, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08604032546281815, "geo/layer_27/attn_entropy_mean": 4.23736572265625, "geo/layer_27/attn_entropy_std": 0.7095102071762085, "attnres/final_alpha/block_0": 0.24053384363651276, "attnres/block_norm/0": 1.7514009475708008, "attnres/final_alpha/block_1": 0.004912061616778374, "attnres/block_norm/1": 44672.7421875, "attnres/final_alpha/block_2": 0.01062304712831974, "attnres/block_norm/2": 28120.42578125, "attnres/final_alpha/block_3": 0.012389843352138996, "attnres/block_norm/3": 53973.484375, "attnres/final_alpha/block_4": 0.01455891877412796, "attnres/block_norm/4": 14509.6376953125, "attnres/final_alpha/block_5": 0.6045695543289185, "attnres/block_norm/5": 6470.5126953125, "attnres/final_alpha/block_6": 0.11241275072097778, "attnres/block_norm/6": 35698.234375, "geo/tier1_time_s": 1.3632497787475586, "geo/step": 54225.0, "geo/rankme_slope": -5.2134701536864746e-05} {"step": 54230, "timestamp": 1778253159.3574727, "train/loss": 2.1193060159683226, "train/z_loss": 0.0013773284037597478, "train/perplexity": 8.325357817037062, "train/grad_norm": 0.1484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1701702.39095518, "perf/iters_per_sec": 0.8114349322105312, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.232384705543518, "data/tokens_consumed": 113730650112, "data/tokens_consumed_B": 113.730650112, "train/loss_slope": 1.282083610496908e-05} {"step": 54240, "timestamp": 1778253169.705354, "train/loss": 2.1693044781684874, "train/z_loss": 0.001386964146513492, "train/perplexity": 8.75219458090552, "train/grad_norm": 0.126953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027609.4095221718, "perf/iters_per_sec": 0.9668395087824687, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342978239059448, "data/tokens_consumed": 113751621632, "data/tokens_consumed_B": 113.751621632, "train/loss_slope": 1.4817971350586666e-05} {"step": 54250, "timestamp": 1778253180.0319407, "grad/layer_0/attn": 0.0027871564961969852, "grad/layer_0/mlp": 0.002947162603959441, "grad/layer_0/attn_mlp_ratio": 0.9457083901246811, "grad/layer_4/attn": 0.0019556765910238028, "grad/layer_4/mlp": 0.0025520084891468287, "grad/layer_4/attn_mlp_ratio": 0.7663283734000331, "grad/layer_8/attn": 0.005290588364005089, "grad/layer_8/mlp": 0.0037897657603025436, "grad/layer_8/attn_mlp_ratio": 1.396019848989492, "grad/layer_12/attn": 0.0038474183529615402, "grad/layer_12/mlp": 0.006465144455432892, "grad/layer_12/attn_mlp_ratio": 0.5951016748308308, "grad/layer_16/attn": 0.008774781599640846, "grad/layer_16/mlp": 0.004132406786084175, "grad/layer_16/attn_mlp_ratio": 2.123406973594446, "grad/layer_20/attn": 0.0052472129464149475, "grad/layer_20/mlp": 0.0063046058639883995, "grad/layer_20/attn_mlp_ratio": 0.8322824576804279, "grad/layer_24/attn": 0.010673190467059612, "grad/layer_24/mlp": 0.01130504161119461, "grad/layer_24/attn_mlp_ratio": 0.9441088975806857, "grad/layer_27/attn": 0.005367874167859554, "grad/layer_27/mlp": 0.010027995333075523, "grad/layer_27/attn_mlp_ratio": 0.5352888524614396} {"step": 54250, "timestamp": 1778253180.0464385, "train/loss": 2.1252078652381896, "train/z_loss": 0.001390142913442105, "train/perplexity": 8.374638103358404, "train/grad_norm": 0.1826171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028871.3978330933, "perf/iters_per_sec": 0.9674412717023341, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033654475212097, "data/tokens_consumed": 113772593152, "data/tokens_consumed_B": 113.772593152, "train/loss_slope": 1.2067605881872728e-05} {"step": 54260, "timestamp": 1778253190.3911622, "train/loss": 2.1400277853012084, "train/z_loss": 0.0013819153187796473, "train/perplexity": 8.499673791601655, "train/grad_norm": 0.1494140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028266.117185456, "perf/iters_per_sec": 0.9671526513983993, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0339629411697389, "data/tokens_consumed": 113793564672, "data/tokens_consumed_B": 113.793564672, "train/loss_slope": 1.2425030340062548e-05} {"step": 54270, "timestamp": 1778253200.731334, "train/loss": 2.0996034383773803, "train/z_loss": 0.001390835363417864, "train/perplexity": 8.162932165002179, "train/grad_norm": 0.1865234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029386.39055368, "perf/iters_per_sec": 0.9676868393677139, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0333921670913697, "data/tokens_consumed": 113814536192, "data/tokens_consumed_B": 113.814536192, "train/loss_slope": 7.616888300062788e-06} {"step": 54280, "timestamp": 1778253211.0680242, "train/loss": 2.1657360792160034, "train/z_loss": 0.0013698883121833205, "train/perplexity": 8.721018915617956, "train/grad_norm": 0.1259765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029843.699582248, "perf/iters_per_sec": 0.9679049013053169, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033159351348877, "data/tokens_consumed": 113835507712, "data/tokens_consumed_B": 113.835507712, "train/loss_slope": 6.052492589804649e-06} {"step": 54290, "timestamp": 1778253221.4216194, "train/loss": 2.126235008239746, "train/z_loss": 0.0013891704031266273, "train/perplexity": 8.383244473506611, "train/grad_norm": 0.1162109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026541.4268880268, "perf/iters_per_sec": 0.9663302549781927, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348428964614869, "data/tokens_consumed": 113856479232, "data/tokens_consumed_B": 113.856479232, "train/loss_slope": 6.799858438335604e-06} {"step": 54300, "timestamp": 1778253231.7530048, "grad/layer_0/attn": 0.0027004126459360123, "grad/layer_0/mlp": 0.002814824692904949, "grad/layer_0/attn_mlp_ratio": 0.9593537234511632, "grad/layer_4/attn": 0.0017390905413776636, "grad/layer_4/mlp": 0.002539920387789607, "grad/layer_4/attn_mlp_ratio": 0.6847027494514704, "grad/layer_8/attn": 0.004165359307080507, "grad/layer_8/mlp": 0.0034710715990513563, "grad/layer_8/attn_mlp_ratio": 1.2000211082412673, "grad/layer_12/attn": 0.004697316326200962, "grad/layer_12/mlp": 0.0064032478258013725, "grad/layer_12/attn_mlp_ratio": 0.7335833909028423, "grad/layer_16/attn": 0.0043238066136837006, "grad/layer_16/mlp": 0.004341060295701027, "grad/layer_16/attn_mlp_ratio": 0.9960254452957134, "grad/layer_20/attn": 0.004772668704390526, "grad/layer_20/mlp": 0.006116299424320459, "grad/layer_20/attn_mlp_ratio": 0.78031965004539, "grad/layer_24/attn": 0.00974302738904953, "grad/layer_24/mlp": 0.008467067033052444, "grad/layer_24/attn_mlp_ratio": 1.1506968394069048, "grad/layer_27/attn": 0.005629724822938442, "grad/layer_27/mlp": 0.007172802463173866, "grad/layer_27/attn_mlp_ratio": 0.7848710142730273} {"step": 54300, "timestamp": 1778253232.3582466, "eos/sharpness": 62.89703845977782, "eos/L0_probe": 1.9821926355361938, "eos/L_plus": 2.2522621154785156, "eos/L_minus": 2.3410935401916504, "eos/grad_norm": 0.1432795375585556, "eos/embed_grad_frac": 0.10704368352890015, "eos/time_s": 0.6025233268737793} {"step": 54300, "timestamp": 1778253232.3763447, "train/loss": 2.1396358251571654, "train/z_loss": 0.001403156842570752, "train/perplexity": 8.496342911066831, "train/grad_norm": 0.1435546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1915742.2982317924, "perf/iters_per_sec": 0.9134971133383715, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0946942090988159, "data/tokens_consumed": 113877450752, "data/tokens_consumed_B": 113.877450752, "train/loss_slope": 6.306275897937394e-06} {"step": 54300, "timestamp": 1778253233.7390215, "geo/rankme_last": 438.81414794921875, "geo/layer_0/stable_rank_q_proj": 19.485172271728516, "geo/layer_0/stable_rank_k_proj": 16.33359718322754, "geo/layer_0/stable_rank_o_proj": 47.559993743896484, "geo/layer_0/stable_rank_gate_proj": 132.6500701904297, "geo/layer_0/stable_rank_down_proj": 54.20077896118164, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06171458959579468, "geo/layer_0/attn_entropy_mean": 6.170863628387451, "geo/layer_0/attn_entropy_std": 0.39851370453834534, "geo/layer_7/stable_rank_q_proj": 42.87482833862305, "geo/layer_7/stable_rank_k_proj": 41.69236755371094, "geo/layer_7/stable_rank_o_proj": 93.30953979492188, "geo/layer_7/stable_rank_gate_proj": 85.17574310302734, "geo/layer_7/stable_rank_down_proj": 142.7582550048828, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4642521142959595, "geo/layer_7/attn_entropy_mean": 4.656316757202148, "geo/layer_7/attn_entropy_std": 0.7994443774223328, "geo/layer_14/stable_rank_q_proj": 52.188812255859375, "geo/layer_14/stable_rank_k_proj": 39.11200714111328, "geo/layer_14/stable_rank_o_proj": 44.63554382324219, "geo/layer_14/stable_rank_gate_proj": 72.71344757080078, "geo/layer_14/stable_rank_down_proj": 130.39259338378906, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.40328848361968994, "geo/layer_14/attn_entropy_mean": 5.518641948699951, "geo/layer_14/attn_entropy_std": 0.3971899449825287, "geo/layer_21/stable_rank_q_proj": 40.762001037597656, "geo/layer_21/stable_rank_k_proj": 30.379735946655273, "geo/layer_21/stable_rank_o_proj": 72.38873291015625, "geo/layer_21/stable_rank_gate_proj": 68.34523010253906, "geo/layer_21/stable_rank_down_proj": 53.02787399291992, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14151185750961304, "geo/layer_21/attn_entropy_mean": 5.705162048339844, "geo/layer_21/attn_entropy_std": 0.29892274737358093, "geo/layer_27/stable_rank_q_proj": 42.92240905761719, "geo/layer_27/stable_rank_k_proj": 31.48072624206543, "geo/layer_27/stable_rank_o_proj": 116.23976135253906, "geo/layer_27/stable_rank_gate_proj": 81.47892761230469, "geo/layer_27/stable_rank_down_proj": 129.22962951660156, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09568613767623901, "geo/layer_27/attn_entropy_mean": 4.242491245269775, "geo/layer_27/attn_entropy_std": 0.7235371470451355, "attnres/final_alpha/block_0": 0.23961378633975983, "attnres/block_norm/0": 1.751496434211731, "attnres/final_alpha/block_1": 0.0048681688494980335, "attnres/block_norm/1": 44633.33203125, "attnres/final_alpha/block_2": 0.010555300861597061, "attnres/block_norm/2": 28226.87890625, "attnres/final_alpha/block_3": 0.01241979654878378, "attnres/block_norm/3": 54201.4375, "attnres/final_alpha/block_4": 0.01433692965656519, "attnres/block_norm/4": 14401.435546875, "attnres/final_alpha/block_5": 0.6052994132041931, "attnres/block_norm/5": 6452.998046875, "attnres/final_alpha/block_6": 0.11290661990642548, "attnres/block_norm/6": 35749.078125, "geo/tier1_time_s": 1.358705759048462, "geo/step": 54300.0, "geo/rankme_slope": -4.9904024109643856e-05} {"step": 54310, "timestamp": 1778253244.088436, "train/loss": 2.123561882972717, "train/z_loss": 0.0013902199920266868, "train/perplexity": 8.360864935864832, "train/grad_norm": 0.146484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1791162.9204297117, "perf/iters_per_sec": 0.8540930368565138, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.170832633972168, "data/tokens_consumed": 113898422272, "data/tokens_consumed_B": 113.898422272, "train/loss_slope": 4.4402428228433805e-06} {"step": 54320, "timestamp": 1778253254.4377923, "train/loss": 2.092275357246399, "train/z_loss": 0.001388214819598943, "train/perplexity": 8.103332179309165, "train/grad_norm": 0.10791015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027877.118726016, "perf/iters_per_sec": 0.9669671624784546, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341612815856933, "data/tokens_consumed": 113919393792, "data/tokens_consumed_B": 113.919393792, "train/loss_slope": -1.3213821477514166e-07} {"step": 54330, "timestamp": 1778253264.782214, "train/loss": 2.1349694967269897, "train/z_loss": 0.0013882541214115917, "train/perplexity": 8.456788543197483, "train/grad_norm": 0.2197265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028374.066273032, "perf/iters_per_sec": 0.9672041255345497, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033907914161682, "data/tokens_consumed": 113940365312, "data/tokens_consumed_B": 113.940365312, "train/loss_slope": -8.272145912520586e-07} {"step": 54340, "timestamp": 1778253275.1378152, "train/loss": 2.1020590543746946, "train/z_loss": 0.001394412072841078, "train/perplexity": 8.183001823413592, "train/grad_norm": 0.119140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026192.948150971, "perf/iters_per_sec": 0.9661640873675208, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350208759307862, "data/tokens_consumed": 113961336832, "data/tokens_consumed_B": 113.961336832, "train/loss_slope": -8.27037866073365e-06} {"step": 54350, "timestamp": 1778253285.4756536, "grad/layer_0/attn": 0.002965399529784918, "grad/layer_0/mlp": 0.002971186302602291, "grad/layer_0/attn_mlp_ratio": 0.9980523359920115, "grad/layer_4/attn": 0.002175235887989402, "grad/layer_4/mlp": 0.0026358882896602154, "grad/layer_4/attn_mlp_ratio": 0.8252382371431914, "grad/layer_8/attn": 0.004189734347164631, "grad/layer_8/mlp": 0.003690378973260522, "grad/layer_8/attn_mlp_ratio": 1.135312731833513, "grad/layer_12/attn": 0.0046158949844539165, "grad/layer_12/mlp": 0.006453297100961208, "grad/layer_12/attn_mlp_ratio": 0.7152769879816455, "grad/layer_16/attn": 0.0038708399515599012, "grad/layer_16/mlp": 0.004674621392041445, "grad/layer_16/attn_mlp_ratio": 0.8280541982169073, "grad/layer_20/attn": 0.003711021738126874, "grad/layer_20/mlp": 0.006884550675749779, "grad/layer_20/attn_mlp_ratio": 0.5390361490539979, "grad/layer_24/attn": 0.015336700715124607, "grad/layer_24/mlp": 0.011234075762331486, "grad/layer_24/attn_mlp_ratio": 1.365194690072324, "grad/layer_27/attn": 0.007286423351615667, "grad/layer_27/mlp": 0.010584989562630653, "grad/layer_27/attn_mlp_ratio": 0.6883732137537861} {"step": 54350, "timestamp": 1778253285.4902885, "train/loss": 2.149602770805359, "train/z_loss": 0.0013848711852915585, "train/perplexity": 8.581448918007839, "train/grad_norm": 0.173828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027112.7431053084, "perf/iters_per_sec": 0.9666026797796766, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034551239013672, "data/tokens_consumed": 113982308352, "data/tokens_consumed_B": 113.982308352, "train/loss_slope": -7.275010264030692e-06} {"step": 54360, "timestamp": 1778253295.8301451, "train/loss": 2.167868447303772, "train/z_loss": 0.0013830201234668494, "train/perplexity": 8.739635179355798, "train/grad_norm": 0.1630859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029262.5568539326, "perf/iters_per_sec": 0.9676277908582366, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033455228805542, "data/tokens_consumed": 114003279872, "data/tokens_consumed_B": 114.003279872, "train/loss_slope": -2.1167039728150066e-06} {"step": 54370, "timestamp": 1778253306.1747699, "train/loss": 2.212302875518799, "train/z_loss": 0.0013653599540702998, "train/perplexity": 9.136732943119512, "train/grad_norm": 0.263671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028272.1504318218, "perf/iters_per_sec": 0.9671555282744512, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0339598655700684, "data/tokens_consumed": 114024251392, "data/tokens_consumed_B": 114.024251392, "train/loss_slope": 3.960284970738773e-06} {"step": 54375, "timestamp": 1778253311.9358308, "eos/sharpness": 26.882648468017575, "eos/L0_probe": 1.979738473892212, "eos/L_plus": 2.125736713409424, "eos/L_minus": 2.102566719055176, "eos/grad_norm": 0.1113877072930336, "eos/embed_grad_frac": 0.19334810972213745, "eos/time_s": 0.5976872444152832} {"step": 54375, "timestamp": 1778253313.3175354, "geo/rankme_last": 438.6639099121094, "geo/layer_0/stable_rank_q_proj": 19.476058959960938, "geo/layer_0/stable_rank_k_proj": 16.337352752685547, "geo/layer_0/stable_rank_o_proj": 47.48429489135742, "geo/layer_0/stable_rank_gate_proj": 132.5230712890625, "geo/layer_0/stable_rank_down_proj": 54.32685470581055, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06529492139816284, "geo/layer_0/attn_entropy_mean": 6.16942834854126, "geo/layer_0/attn_entropy_std": 0.39647358655929565, "geo/layer_7/stable_rank_q_proj": 42.96516036987305, "geo/layer_7/stable_rank_k_proj": 41.74105453491211, "geo/layer_7/stable_rank_o_proj": 93.28687286376953, "geo/layer_7/stable_rank_gate_proj": 85.12166595458984, "geo/layer_7/stable_rank_down_proj": 142.58447265625, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.46327656507492065, "geo/layer_7/attn_entropy_mean": 4.665346145629883, "geo/layer_7/attn_entropy_std": 0.7987495064735413, "geo/layer_14/stable_rank_q_proj": 52.274993896484375, "geo/layer_14/stable_rank_k_proj": 39.102935791015625, "geo/layer_14/stable_rank_o_proj": 44.635440826416016, "geo/layer_14/stable_rank_gate_proj": 72.76475524902344, "geo/layer_14/stable_rank_down_proj": 130.5343780517578, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3987547755241394, "geo/layer_14/attn_entropy_mean": 5.543721675872803, "geo/layer_14/attn_entropy_std": 0.3885570466518402, "geo/layer_21/stable_rank_q_proj": 40.80787658691406, "geo/layer_21/stable_rank_k_proj": 30.354427337646484, "geo/layer_21/stable_rank_o_proj": 72.25179290771484, "geo/layer_21/stable_rank_gate_proj": 68.38941192626953, "geo/layer_21/stable_rank_down_proj": 53.091148376464844, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14287061989307404, "geo/layer_21/attn_entropy_mean": 5.706168174743652, "geo/layer_21/attn_entropy_std": 0.29464223980903625, "geo/layer_27/stable_rank_q_proj": 42.928131103515625, "geo/layer_27/stable_rank_k_proj": 31.534107208251953, "geo/layer_27/stable_rank_o_proj": 116.12248992919922, "geo/layer_27/stable_rank_gate_proj": 81.48245239257812, "geo/layer_27/stable_rank_down_proj": 129.48341369628906, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08718958497047424, "geo/layer_27/attn_entropy_mean": 4.239020824432373, "geo/layer_27/attn_entropy_std": 0.721017062664032, "attnres/final_alpha/block_0": 0.24033701419830322, "attnres/block_norm/0": 1.751563549041748, "attnres/final_alpha/block_1": 0.004891091026365757, "attnres/block_norm/1": 44686.8671875, "attnres/final_alpha/block_2": 0.010567383840680122, "attnres/block_norm/2": 28173.72265625, "attnres/final_alpha/block_3": 0.012488759122788906, "attnres/block_norm/3": 54388.15234375, "attnres/final_alpha/block_4": 0.014473127201199532, "attnres/block_norm/4": 14452.6416015625, "attnres/final_alpha/block_5": 0.6033024787902832, "attnres/block_norm/5": 6458.177734375, "attnres/final_alpha/block_6": 0.11394011229276657, "attnres/block_norm/6": 35808.1171875, "geo/tier1_time_s": 1.3634099960327148, "geo/step": 54375.0, "geo/rankme_slope": -2.8731551214235697e-05} {"step": 54380, "timestamp": 1778253318.5009067, "train/loss": 2.120161437988281, "train/z_loss": 0.0013815762591548263, "train/perplexity": 8.332482558333627, "train/grad_norm": 0.10888671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1702524.1748633326, "perf/iters_per_sec": 0.8118267893139518, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2317898511886596, "data/tokens_consumed": 114045222912, "data/tokens_consumed_B": 114.045222912, "train/loss_slope": 1.0440306730754152e-06} {"step": 54390, "timestamp": 1778253328.8404026, "train/loss": 2.1193387508392334, "train/z_loss": 0.0013922647223807872, "train/perplexity": 8.325630351011146, "train/grad_norm": 0.2470703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029311.386258565, "perf/iters_per_sec": 0.9676510745327782, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0334303617477416, "data/tokens_consumed": 114066194432, "data/tokens_consumed_B": 114.066194432, "train/loss_slope": -3.3424897412798742e-06} {"step": 54400, "timestamp": 1778253339.1754768, "grad/layer_0/attn": 0.0030344505794346333, "grad/layer_0/mlp": 0.003113984828814864, "grad/layer_0/attn_mlp_ratio": 0.9744589806314508, "grad/layer_4/attn": 0.002606314141303301, "grad/layer_4/mlp": 0.002635230775922537, "grad/layer_4/attn_mlp_ratio": 0.9890268686195804, "grad/layer_8/attn": 0.004507777281105518, "grad/layer_8/mlp": 0.003532467409968376, "grad/layer_8/attn_mlp_ratio": 1.2760987237348642, "grad/layer_12/attn": 0.004271812736988068, "grad/layer_12/mlp": 0.006927087903022766, "grad/layer_12/attn_mlp_ratio": 0.616682325260482, "grad/layer_16/attn": 0.004200404509902, "grad/layer_16/mlp": 0.004613762721419334, "grad/layer_16/attn_mlp_ratio": 0.9104075507309732, "grad/layer_20/attn": 0.003981198649853468, "grad/layer_20/mlp": 0.005750100128352642, "grad/layer_20/attn_mlp_ratio": 0.6923703051684109, "grad/layer_24/attn": 0.009525627829134464, "grad/layer_24/mlp": 0.009035232476890087, "grad/layer_24/attn_mlp_ratio": 1.0542758858802028, "grad/layer_27/attn": 0.0042750840075314045, "grad/layer_27/mlp": 0.008718006312847137, "grad/layer_27/attn_mlp_ratio": 0.49037403794880235} {"step": 54400, "timestamp": 1778253339.1899467, "train/loss": 2.141026473045349, "train/z_loss": 0.0013859049649909139, "train/perplexity": 8.508166551748385, "train/grad_norm": 0.12890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027196.0879419523, "perf/iters_per_sec": 0.9666424216947328, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345087051391602, "data/tokens_consumed": 114087165952, "data/tokens_consumed_B": 114.087165952, "train/loss_slope": -7.043390434281139e-06} {"step": 54410, "timestamp": 1778253350.0012233, "train/loss": 2.1709981203079223, "train/z_loss": 0.0013866044930182398, "train/perplexity": 8.767030226050926, "train/grad_norm": 0.1728515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1940980.3078658786, "perf/iters_per_sec": 0.9255315341309922, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.080460214614868, "data/tokens_consumed": 114108137472, "data/tokens_consumed_B": 114.108137472, "train/loss_slope": -4.829916788084579e-06} {"step": 54420, "timestamp": 1778253360.3792658, "train/loss": 2.1316503524780273, "train/z_loss": 0.001398773118853569, "train/perplexity": 8.428765773673366, "train/grad_norm": 0.0947265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021766.54194442, "perf/iters_per_sec": 0.9640534124109363, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037286925315857, "data/tokens_consumed": 114129108992, "data/tokens_consumed_B": 114.129108992, "train/loss_slope": -8.196743222066931e-06} {"step": 54430, "timestamp": 1778253370.75417, "train/loss": 2.168032944202423, "train/z_loss": 0.001393275975715369, "train/perplexity": 8.741072940488525, "train/grad_norm": 0.193359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022263.9857026804, "perf/iters_per_sec": 0.9642906120789911, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0370317697525024, "data/tokens_consumed": 114150080512, "data/tokens_consumed_B": 114.150080512, "train/loss_slope": -6.8870995018240854e-06} {"step": 54440, "timestamp": 1778253381.1331701, "train/loss": 2.190212059020996, "train/z_loss": 0.0013627791544422508, "train/perplexity": 8.93710810816237, "train/grad_norm": 0.099609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021590.4361783455, "perf/iters_per_sec": 0.9639694386378982, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373772859573365, "data/tokens_consumed": 114171052032, "data/tokens_consumed_B": 114.171052032, "train/loss_slope": -4.708477115259109e-06} {"step": 54450, "timestamp": 1778253391.4959965, "grad/layer_0/attn": 0.0025910865515470505, "grad/layer_0/mlp": 0.0027295658364892006, "grad/layer_0/attn_mlp_ratio": 0.9492668841258094, "grad/layer_4/attn": 0.001987188821658492, "grad/layer_4/mlp": 0.0024946569465100765, "grad/layer_4/attn_mlp_ratio": 0.7965779602605049, "grad/layer_8/attn": 0.003353523090481758, "grad/layer_8/mlp": 0.0034796632826328278, "grad/layer_8/attn_mlp_ratio": 0.9637492831115095, "grad/layer_12/attn": 0.00423301151022315, "grad/layer_12/mlp": 0.006680362392216921, "grad/layer_12/attn_mlp_ratio": 0.6336499726107525, "grad/layer_16/attn": 0.00427791103720665, "grad/layer_16/mlp": 0.004544014576822519, "grad/layer_16/attn_mlp_ratio": 0.9414386487409089, "grad/layer_20/attn": 0.005960885901004076, "grad/layer_20/mlp": 0.005604949314147234, "grad/layer_20/attn_mlp_ratio": 1.0635039606170997, "grad/layer_24/attn": 0.008735135197639465, "grad/layer_24/mlp": 0.00924654584378004, "grad/layer_24/attn_mlp_ratio": 0.9446916990138798, "grad/layer_27/attn": 0.005759465973824263, "grad/layer_27/mlp": 0.007589404005557299, "grad/layer_27/attn_mlp_ratio": 0.7588825016718936} {"step": 54450, "timestamp": 1778253392.0850601, "eos/sharpness": 53.97620201110839, "eos/L0_probe": 1.985248327255249, "eos/L_plus": 2.1960151195526123, "eos/L_minus": 2.3142435550689697, "eos/grad_norm": 0.145905539393425, "eos/embed_grad_frac": 0.11101774126291275, "eos/time_s": 0.5861797332763672} {"step": 54450, "timestamp": 1778253392.103691, "train/loss": 2.148737096786499, "train/z_loss": 0.0013707647449336946, "train/perplexity": 8.574023395140177, "train/grad_norm": 0.1455078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1912520.2615437293, "perf/iters_per_sec": 0.9119607265204093, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0965384483337401, "data/tokens_consumed": 114192023552, "data/tokens_consumed_B": 114.192023552, "train/loss_slope": -6.931817470782866e-06} {"step": 54450, "timestamp": 1778253393.47371, "geo/rankme_last": 439.104248046875, "geo/layer_0/stable_rank_q_proj": 19.496082305908203, "geo/layer_0/stable_rank_k_proj": 16.3797664642334, "geo/layer_0/stable_rank_o_proj": 47.62640380859375, "geo/layer_0/stable_rank_gate_proj": 132.70640563964844, "geo/layer_0/stable_rank_down_proj": 54.285526275634766, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.061686087399721146, "geo/layer_0/attn_entropy_mean": 6.173379898071289, "geo/layer_0/attn_entropy_std": 0.3963093161582947, "geo/layer_7/stable_rank_q_proj": 42.87092971801758, "geo/layer_7/stable_rank_k_proj": 41.79798126220703, "geo/layer_7/stable_rank_o_proj": 93.328369140625, "geo/layer_7/stable_rank_gate_proj": 85.17015075683594, "geo/layer_7/stable_rank_down_proj": 142.59400939941406, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4720417559146881, "geo/layer_7/attn_entropy_mean": 4.6593523025512695, "geo/layer_7/attn_entropy_std": 0.7901496887207031, "geo/layer_14/stable_rank_q_proj": 52.22500228881836, "geo/layer_14/stable_rank_k_proj": 39.08170700073242, "geo/layer_14/stable_rank_o_proj": 44.626121520996094, "geo/layer_14/stable_rank_gate_proj": 72.69258880615234, "geo/layer_14/stable_rank_down_proj": 130.8926239013672, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4005337357521057, "geo/layer_14/attn_entropy_mean": 5.537727355957031, "geo/layer_14/attn_entropy_std": 0.38226208090782166, "geo/layer_21/stable_rank_q_proj": 40.77299499511719, "geo/layer_21/stable_rank_k_proj": 30.31339454650879, "geo/layer_21/stable_rank_o_proj": 72.30870056152344, "geo/layer_21/stable_rank_gate_proj": 68.31748962402344, "geo/layer_21/stable_rank_down_proj": 53.049957275390625, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14039547741413116, "geo/layer_21/attn_entropy_mean": 5.706564903259277, "geo/layer_21/attn_entropy_std": 0.29614201188087463, "geo/layer_27/stable_rank_q_proj": 42.93045425415039, "geo/layer_27/stable_rank_k_proj": 31.581302642822266, "geo/layer_27/stable_rank_o_proj": 116.27556610107422, "geo/layer_27/stable_rank_gate_proj": 81.43138885498047, "geo/layer_27/stable_rank_down_proj": 129.5084228515625, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09767584502696991, "geo/layer_27/attn_entropy_mean": 4.2451934814453125, "geo/layer_27/attn_entropy_std": 0.7322826385498047, "attnres/final_alpha/block_0": 0.23774084448814392, "attnres/block_norm/0": 1.7518097162246704, "attnres/final_alpha/block_1": 0.004814604297280312, "attnres/block_norm/1": 44559.1875, "attnres/final_alpha/block_2": 0.010425763204693794, "attnres/block_norm/2": 28090.076171875, "attnres/final_alpha/block_3": 0.012226363644003868, "attnres/block_norm/3": 54628.2890625, "attnres/final_alpha/block_4": 0.014324847608804703, "attnres/block_norm/4": 14494.349609375, "attnres/final_alpha/block_5": 0.6081664562225342, "attnres/block_norm/5": 6393.2861328125, "attnres/final_alpha/block_6": 0.11230114847421646, "attnres/block_norm/6": 35951.3671875, "geo/tier1_time_s": 1.359313726425171, "geo/step": 54450.0, "geo/rankme_slope": -2.400086987920168e-05} {"step": 54460, "timestamp": 1778253403.8555055, "train/loss": 2.11906316280365, "train/z_loss": 0.0013885533437132836, "train/perplexity": 8.323336223029347, "train/grad_norm": 0.1337890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1785156.3550301546, "perf/iters_per_sec": 0.8512288832808278, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1747721672058105, "data/tokens_consumed": 114212995072, "data/tokens_consumed_B": 114.212995072, "train/loss_slope": -8.22730045078252e-06} {"step": 54470, "timestamp": 1778253414.229944, "train/loss": 2.157922601699829, "train/z_loss": 0.0013887626817449927, "train/perplexity": 8.653142949391707, "train/grad_norm": 0.2216796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022718.8807630178, "perf/iters_per_sec": 0.964507522946843, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0367985486984252, "data/tokens_consumed": 114233966592, "data/tokens_consumed_B": 114.233966592, "train/loss_slope": -1.0590666312553311e-05} {"step": 54480, "timestamp": 1778253424.6028588, "train/loss": 2.1830604791641237, "train/z_loss": 0.0013798539526760577, "train/perplexity": 8.873421666610815, "train/grad_norm": 0.0966796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022743.7659190504, "perf/iters_per_sec": 0.9645193891139271, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0367857933044433, "data/tokens_consumed": 114254938112, "data/tokens_consumed_B": 114.254938112, "train/loss_slope": -1.1236452970496136e-05} {"step": 54490, "timestamp": 1778253434.9783916, "train/loss": 2.1671714544296266, "train/z_loss": 0.001367911696434021, "train/perplexity": 8.733545838273344, "train/grad_norm": 0.1484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022595.6733760517, "perf/iters_per_sec": 0.9644487730865725, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0368617057800293, "data/tokens_consumed": 114275909632, "data/tokens_consumed_B": 114.275909632, "train/loss_slope": -1.1795848423820148e-05} {"step": 54500, "timestamp": 1778253445.347035, "grad/layer_0/attn": 0.0028362893499433994, "grad/layer_0/mlp": 0.002953881397843361, "grad/layer_0/attn_mlp_ratio": 0.9601906345986405, "grad/layer_4/attn": 0.004065536428242922, "grad/layer_4/mlp": 0.0025046844966709614, "grad/layer_4/attn_mlp_ratio": 1.6231730069512653, "grad/layer_8/attn": 0.0033713248558342457, "grad/layer_8/mlp": 0.0036259107291698456, "grad/layer_8/attn_mlp_ratio": 0.9297870286032689, "grad/layer_12/attn": 0.005583133082836866, "grad/layer_12/mlp": 0.006879202090203762, "grad/layer_12/attn_mlp_ratio": 0.8115960148383855, "grad/layer_16/attn": 0.0032940951641649008, "grad/layer_16/mlp": 0.004552246071398258, "grad/layer_16/attn_mlp_ratio": 0.7236197341131694, "grad/layer_20/attn": 0.0033596795983612537, "grad/layer_20/mlp": 0.006285410840064287, "grad/layer_20/attn_mlp_ratio": 0.5345202772576221, "grad/layer_24/attn": 0.010613007470965385, "grad/layer_24/mlp": 0.009720662608742714, "grad/layer_24/attn_mlp_ratio": 1.091798757858361, "grad/layer_27/attn": 0.005527926608920097, "grad/layer_27/mlp": 0.008796006441116333, "grad/layer_27/attn_mlp_ratio": 0.6284586741813096} {"step": 54500, "timestamp": 1778253445.3613086, "train/loss": 2.1454248785972596, "train/z_loss": 0.0013825389673002065, "train/perplexity": 8.545671338914362, "train/grad_norm": 0.146484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021250.7177381718, "perf/iters_per_sec": 0.9638074482622966, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0375516414642334, "data/tokens_consumed": 114296881152, "data/tokens_consumed_B": 114.296881152, "train/loss_slope": -1.5304280564908713e-05} {"step": 54500, "timestamp": 1778253452.6598244, "geo/ww_alpha_mean": 7.505264220930142, "geo/ww_alpha_std": 4.088791597588715, "geo/ww_alpha_min": 1.345474041288308, "geo/ww_alpha_max": 25.12959757893457, "geo/ww_alpha_healthy_frac": 0.16751269035532995, "geo/ww_alpha_by_type/q_proj": 3.9971069113159152, "geo/ww_alpha_by_type/k_proj": 4.485950038326704, "geo/ww_alpha_by_type/v_proj": 8.298736059117967, "geo/ww_alpha_by_type/o_proj": 7.8936761795102415, "geo/ww_alpha_by_type/gate_proj": 7.8260916395546145, "geo/ww_alpha_by_type/up_proj": 11.572256101268616, "geo/ww_alpha_by_type/down_proj": 8.556957426945214, "geo/twonn_id/layer_0": 0.6963714957237244, "geo/twonn_id/layer_7": 3.026019811630249, "geo/twonn_id/layer_14": 4.735194683074951, "geo/twonn_id/layer_21": 6.854605197906494, "geo/twonn_id/layer_27": 6.431380748748779, "geo/tier2_time_s": 7.292195796966553} {"step": 54500, "timestamp": 1778253453.345435, "eoc/jacobian_sigma/layer_0/attn": 1101.889892578125, "eoc/jacobian_sigma/layer_0/mlp": 8889.3349609375, "eoc/jacobian_sigma/layer_0": 8889.3349609375, "eoc/jacobian_sigma/layer_7/attn": 1.156623125076294, "eoc/jacobian_sigma/layer_7/mlp": 1.7975327968597412, "eoc/jacobian_sigma/layer_7": 1.7975327968597412, "eoc/jacobian_sigma/layer_14/attn": 1.5097066164016724, "eoc/jacobian_sigma/layer_14/mlp": 7.1303911209106445, "eoc/jacobian_sigma/layer_14": 7.1303911209106445, "eoc/jacobian_sigma/layer_21/attn": 1.0985788106918335, "eoc/jacobian_sigma/layer_21/mlp": 4.098277568817139, "eoc/jacobian_sigma/layer_21": 4.098277568817139, "eoc/jacobian_sigma/layer_27/attn": 3.4333300590515137, "eoc/jacobian_sigma/layer_27/mlp": 32.87830352783203, "eoc/jacobian_sigma/layer_27": 32.87830352783203, "eoc/layer0_sigma": 8889.3349609375, "eoc/sigma_max": 32.87830352783203, "eoc/sigma_min": 1.7975327968597412, "eoc/sigma_mean": 11.476126253604889, "eoc/time_s": 0.6794114112854004} {"step": 54510, "timestamp": 1778253463.7442966, "train/loss": 2.123039448261261, "train/z_loss": 0.0013751493999734521, "train/perplexity": 8.356498070604857, "train/grad_norm": 0.08984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1141129.1943216217, "perf/iters_per_sec": 0.5441328021629437, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.837786650657654, "data/tokens_consumed": 114317852672, "data/tokens_consumed_B": 114.317852672, "train/loss_slope": -1.6156242303650806e-05} {"step": 54520, "timestamp": 1778253474.1196856, "train/loss": 2.187658095359802, "train/z_loss": 0.0013561887200921774, "train/perplexity": 8.914312181193486, "train/grad_norm": 0.0849609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022781.3506425414, "perf/iters_per_sec": 0.9645373109066684, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036766529083252, "data/tokens_consumed": 114338824192, "data/tokens_consumed_B": 114.338824192, "train/loss_slope": -1.2767654605502643e-05} {"step": 54525, "timestamp": 1778253479.9004803, "eos/sharpness": 33.130383491516106, "eos/L0_probe": 1.9793169498443604, "eos/L_plus": 2.1444029808044434, "eos/L_minus": 2.1455347537994385, "eos/grad_norm": 0.14686596393585205, "eos/embed_grad_frac": 0.13565202057361603, "eos/time_s": 0.6020767688751221} {"step": 54525, "timestamp": 1778253481.2782788, "geo/rankme_last": 439.01190185546875, "geo/layer_0/stable_rank_q_proj": 19.5103816986084, "geo/layer_0/stable_rank_k_proj": 16.396875381469727, "geo/layer_0/stable_rank_o_proj": 47.63059616088867, "geo/layer_0/stable_rank_gate_proj": 132.79434204101562, "geo/layer_0/stable_rank_down_proj": 54.21236801147461, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.061414606869220734, "geo/layer_0/attn_entropy_mean": 6.173480033874512, "geo/layer_0/attn_entropy_std": 0.3957853615283966, "geo/layer_7/stable_rank_q_proj": 42.913597106933594, "geo/layer_7/stable_rank_k_proj": 41.86025619506836, "geo/layer_7/stable_rank_o_proj": 93.43502044677734, "geo/layer_7/stable_rank_gate_proj": 85.13377380371094, "geo/layer_7/stable_rank_down_proj": 142.7607421875, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4648849666118622, "geo/layer_7/attn_entropy_mean": 4.659456253051758, "geo/layer_7/attn_entropy_std": 0.8054166436195374, "geo/layer_14/stable_rank_q_proj": 52.13844680786133, "geo/layer_14/stable_rank_k_proj": 39.10317611694336, "geo/layer_14/stable_rank_o_proj": 44.63766098022461, "geo/layer_14/stable_rank_gate_proj": 72.726806640625, "geo/layer_14/stable_rank_down_proj": 130.8767852783203, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.380891352891922, "geo/layer_14/attn_entropy_mean": 5.50367546081543, "geo/layer_14/attn_entropy_std": 0.38188982009887695, "geo/layer_21/stable_rank_q_proj": 40.6940803527832, "geo/layer_21/stable_rank_k_proj": 30.38399314880371, "geo/layer_21/stable_rank_o_proj": 72.27410888671875, "geo/layer_21/stable_rank_gate_proj": 68.23116302490234, "geo/layer_21/stable_rank_down_proj": 53.057064056396484, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14114022254943848, "geo/layer_21/attn_entropy_mean": 5.719814300537109, "geo/layer_21/attn_entropy_std": 0.2956526279449463, "geo/layer_27/stable_rank_q_proj": 42.960994720458984, "geo/layer_27/stable_rank_k_proj": 31.61087989807129, "geo/layer_27/stable_rank_o_proj": 116.09605407714844, "geo/layer_27/stable_rank_gate_proj": 81.32847595214844, "geo/layer_27/stable_rank_down_proj": 129.55076599121094, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09312638640403748, "geo/layer_27/attn_entropy_mean": 4.244318008422852, "geo/layer_27/attn_entropy_std": 0.7041493058204651, "attnres/final_alpha/block_0": 0.23670417070388794, "attnres/block_norm/0": 1.751932978630066, "attnres/final_alpha/block_1": 0.004766559228301048, "attnres/block_norm/1": 44672.9296875, "attnres/final_alpha/block_2": 0.010187744162976742, "attnres/block_norm/2": 28044.8671875, "attnres/final_alpha/block_3": 0.012183072045445442, "attnres/block_norm/3": 54726.0625, "attnres/final_alpha/block_4": 0.014145229011774063, "attnres/block_norm/4": 14463.4306640625, "attnres/final_alpha/block_5": 0.6115971207618713, "attnres/block_norm/5": 6389.9423828125, "attnres/final_alpha/block_6": 0.11041608452796936, "attnres/block_norm/6": 36028.703125, "geo/tier1_time_s": 1.3588173389434814, "geo/step": 54525.0, "geo/rankme_slope": 3.0032755289615847e-05} {"step": 54530, "timestamp": 1778253486.4658165, "train/loss": 2.1720314264297484, "train/z_loss": 0.0013666017097420991, "train/perplexity": 8.776093934039755, "train/grad_norm": 0.1298828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1699310.0407080522, "perf/iters_per_sec": 0.8102941707172643, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.234119701385498, "data/tokens_consumed": 114359795712, "data/tokens_consumed_B": 114.359795712, "train/loss_slope": -1.12694035698049e-05} {"step": 54540, "timestamp": 1778253496.8533607, "train/loss": 2.149672198295593, "train/z_loss": 0.0013822722132317722, "train/perplexity": 8.582044727151317, "train/grad_norm": 0.1708984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020515.5554640815, "perf/iters_per_sec": 0.9634568955727012, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037929153442383, "data/tokens_consumed": 114380767232, "data/tokens_consumed_B": 114.380767232, "train/loss_slope": -1.4435350304782984e-05} {"step": 54550, "timestamp": 1778253507.2259138, "grad/layer_0/attn": 0.0030432529747486115, "grad/layer_0/mlp": 0.0031433114781975746, "grad/layer_0/attn_mlp_ratio": 0.96816777434888, "grad/layer_4/attn": 0.0018894373206421733, "grad/layer_4/mlp": 0.0025769302155822515, "grad/layer_4/attn_mlp_ratio": 0.733212423020162, "grad/layer_8/attn": 0.006460287142544985, "grad/layer_8/mlp": 0.0039132521487772465, "grad/layer_8/attn_mlp_ratio": 1.6508741915534828, "grad/layer_12/attn": 0.006491290405392647, "grad/layer_12/mlp": 0.0070925867184996605, "grad/layer_12/attn_mlp_ratio": 0.9152218466274327, "grad/layer_16/attn": 0.0033829971216619015, "grad/layer_16/mlp": 0.004713470581918955, "grad/layer_16/attn_mlp_ratio": 0.7177295351892613, "grad/layer_20/attn": 0.004754465539008379, "grad/layer_20/mlp": 0.006782023701816797, "grad/layer_20/attn_mlp_ratio": 0.701039347242446, "grad/layer_24/attn": 0.011849960312247276, "grad/layer_24/mlp": 0.011424056254327297, "grad/layer_24/attn_mlp_ratio": 1.0372813250136543, "grad/layer_27/attn": 0.007555901538580656, "grad/layer_27/mlp": 0.012005039490759373, "grad/layer_27/attn_mlp_ratio": 0.6293941374751194} {"step": 54550, "timestamp": 1778253507.2402976, "train/loss": 2.131523370742798, "train/z_loss": 0.0013773471000604331, "train/perplexity": 8.427695542320938, "train/grad_norm": 0.1943359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020333.3101752358, "perf/iters_per_sec": 0.9633699942470721, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.038022780418396, "data/tokens_consumed": 114401738752, "data/tokens_consumed_B": 114.401738752, "train/loss_slope": -1.0351853881410177e-05} {"step": 54560, "timestamp": 1778253517.625432, "train/loss": 2.136902379989624, "train/z_loss": 0.0013836629106663168, "train/perplexity": 8.473150335851827, "train/grad_norm": 0.12158203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020913.9457275008, "perf/iters_per_sec": 0.9636468628537659, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037724542617798, "data/tokens_consumed": 114422710272, "data/tokens_consumed_B": 114.422710272, "train/loss_slope": -1.2039019844748757e-05} {"step": 54570, "timestamp": 1778253528.0060265, "train/loss": 2.114219093322754, "train/z_loss": 0.0013847544556483625, "train/perplexity": 8.283114900142918, "train/grad_norm": 0.267578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021632.4850307466, "perf/iters_per_sec": 0.9639894890931828, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373557090759278, "data/tokens_consumed": 114443681792, "data/tokens_consumed_B": 114.443681792, "train/loss_slope": -1.3434718386961697e-05} {"step": 54580, "timestamp": 1778253538.9828794, "train/loss": 2.139255154132843, "train/z_loss": 0.0013827681890688837, "train/perplexity": 8.49310921503412, "train/grad_norm": 0.1865234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1911711.3034816554, "perf/iters_per_sec": 0.9115749852569844, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0970024585723877, "data/tokens_consumed": 114464653312, "data/tokens_consumed_B": 114.464653312, "train/loss_slope": -1.4583330779615003e-05} {"step": 54590, "timestamp": 1778253549.8496716, "train/loss": 2.1606268286705017, "train/z_loss": 0.0013827435090206564, "train/perplexity": 8.676574680016119, "train/grad_norm": 0.26171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1931066.4900495694, "perf/iters_per_sec": 0.9208042574165198, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0860071420669555, "data/tokens_consumed": 114485624832, "data/tokens_consumed_B": 114.485624832, "train/loss_slope": -1.1052803488204061e-05} {"step": 54600, "timestamp": 1778253560.2000873, "grad/layer_0/attn": 0.002650349633768201, "grad/layer_0/mlp": 0.0026730524841696024, "grad/layer_0/attn_mlp_ratio": 0.9915067325888561, "grad/layer_4/attn": 0.002263830741867423, "grad/layer_4/mlp": 0.002573374891653657, "grad/layer_4/attn_mlp_ratio": 0.879712731028244, "grad/layer_8/attn": 0.005429084412753582, "grad/layer_8/mlp": 0.0037033434491604567, "grad/layer_8/attn_mlp_ratio": 1.4659953473623446, "grad/layer_12/attn": 0.004780575167387724, "grad/layer_12/mlp": 0.007076031994074583, "grad/layer_12/attn_mlp_ratio": 0.6756011142729189, "grad/layer_16/attn": 0.0040301308035850525, "grad/layer_16/mlp": 0.00497808400541544, "grad/layer_16/attn_mlp_ratio": 0.8095746713481297, "grad/layer_20/attn": 0.0038661155849695206, "grad/layer_20/mlp": 0.005794365890324116, "grad/layer_20/attn_mlp_ratio": 0.6672197771810517, "grad/layer_24/attn": 0.010297507978975773, "grad/layer_24/mlp": 0.009927792474627495, "grad/layer_24/attn_mlp_ratio": 1.0372404440936005, "grad/layer_27/attn": 0.004960708320140839, "grad/layer_27/mlp": 0.010510499589145184, "grad/layer_27/attn_mlp_ratio": 0.47197644896598556} {"step": 54600, "timestamp": 1778253560.7956421, "eos/sharpness": 52.599024772644036, "eos/L0_probe": 1.9791340827941895, "eos/L_plus": 2.240239143371582, "eos/L_minus": 2.2440192699432373, "eos/grad_norm": 0.14698906242847443, "eos/embed_grad_frac": 0.10069932043552399, "eos/time_s": 0.5927834510803223} {"step": 54600, "timestamp": 1778253560.8151777, "train/loss": 2.1737279176712034, "train/z_loss": 0.0013766871299594641, "train/perplexity": 8.79099513683915, "train/grad_norm": 0.1474609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1913997.4537331478, "perf/iters_per_sec": 0.9126651066461314, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0956921577453613, "data/tokens_consumed": 114506596352, "data/tokens_consumed_B": 114.506596352, "train/loss_slope": -7.465657177824139e-06} {"step": 54600, "timestamp": 1778253562.1773403, "geo/rankme_last": 438.1605224609375, "geo/layer_0/stable_rank_q_proj": 19.483753204345703, "geo/layer_0/stable_rank_k_proj": 16.357404708862305, "geo/layer_0/stable_rank_o_proj": 47.60230255126953, "geo/layer_0/stable_rank_gate_proj": 133.0594482421875, "geo/layer_0/stable_rank_down_proj": 54.25322341918945, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06360036134719849, "geo/layer_0/attn_entropy_mean": 6.170406818389893, "geo/layer_0/attn_entropy_std": 0.40092360973358154, "geo/layer_7/stable_rank_q_proj": 42.85932159423828, "geo/layer_7/stable_rank_k_proj": 41.83421325683594, "geo/layer_7/stable_rank_o_proj": 93.275634765625, "geo/layer_7/stable_rank_gate_proj": 84.94286346435547, "geo/layer_7/stable_rank_down_proj": 142.66156005859375, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4697874188423157, "geo/layer_7/attn_entropy_mean": 4.632928371429443, "geo/layer_7/attn_entropy_std": 0.8005045056343079, "geo/layer_14/stable_rank_q_proj": 52.213279724121094, "geo/layer_14/stable_rank_k_proj": 39.12846374511719, "geo/layer_14/stable_rank_o_proj": 44.531917572021484, "geo/layer_14/stable_rank_gate_proj": 72.82463836669922, "geo/layer_14/stable_rank_down_proj": 130.51414489746094, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3921828866004944, "geo/layer_14/attn_entropy_mean": 5.535752296447754, "geo/layer_14/attn_entropy_std": 0.3977124094963074, "geo/layer_21/stable_rank_q_proj": 40.67967987060547, "geo/layer_21/stable_rank_k_proj": 30.389406204223633, "geo/layer_21/stable_rank_o_proj": 72.36943817138672, "geo/layer_21/stable_rank_gate_proj": 68.24089813232422, "geo/layer_21/stable_rank_down_proj": 53.036556243896484, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14630168676376343, "geo/layer_21/attn_entropy_mean": 5.719446659088135, "geo/layer_21/attn_entropy_std": 0.29925960302352905, "geo/layer_27/stable_rank_q_proj": 43.0569953918457, "geo/layer_27/stable_rank_k_proj": 31.470430374145508, "geo/layer_27/stable_rank_o_proj": 116.14306640625, "geo/layer_27/stable_rank_gate_proj": 81.25961303710938, "geo/layer_27/stable_rank_down_proj": 129.68814086914062, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0897139310836792, "geo/layer_27/attn_entropy_mean": 4.237153053283691, "geo/layer_27/attn_entropy_std": 0.7063611745834351, "attnres/final_alpha/block_0": 0.23758186399936676, "attnres/block_norm/0": 1.7521620988845825, "attnres/final_alpha/block_1": 0.004785815253853798, "attnres/block_norm/1": 44557.91015625, "attnres/final_alpha/block_2": 0.010298988781869411, "attnres/block_norm/2": 27982.4765625, "attnres/final_alpha/block_3": 0.012224199250340462, "attnres/block_norm/3": 54644.109375, "attnres/final_alpha/block_4": 0.014375844970345497, "attnres/block_norm/4": 14431.626953125, "attnres/final_alpha/block_5": 0.6079216003417969, "attnres/block_norm/5": 6450.669921875, "attnres/final_alpha/block_6": 0.11281171441078186, "attnres/block_norm/6": 36235.6796875, "geo/tier1_time_s": 1.3585243225097656, "geo/step": 54600.0, "geo/rankme_slope": 1.7298462353691475e-05} {"step": 54610, "timestamp": 1778253572.521923, "train/loss": 2.10101557970047, "train/z_loss": 0.0013756082160398364, "train/perplexity": 8.174467521689957, "train/grad_norm": 0.2255859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1791919.5595135093, "perf/iters_per_sec": 0.8544538304870173, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.170338249206543, "data/tokens_consumed": 114527567872, "data/tokens_consumed_B": 114.527567872, "train/loss_slope": -1.2211130697592032e-05} {"step": 54620, "timestamp": 1778253582.8682091, "train/loss": 2.138639235496521, "train/z_loss": 0.0013733518426306546, "train/perplexity": 8.487879761412554, "train/grad_norm": 0.1513671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027968.6150986014, "perf/iters_per_sec": 0.9670107913487441, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034114623069763, "data/tokens_consumed": 114548539392, "data/tokens_consumed_B": 114.548539392, "train/loss_slope": -1.1364979440658787e-05} {"step": 54630, "timestamp": 1778253593.2090933, "train/loss": 2.1647456169128416, "train/z_loss": 0.0013752898667007685, "train/perplexity": 8.712385351452603, "train/grad_norm": 0.1953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029467.6748759795, "perf/iters_per_sec": 0.9677255987529657, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0333507776260376, "data/tokens_consumed": 114569510912, "data/tokens_consumed_B": 114.569510912, "train/loss_slope": -1.078922220654153e-05} {"step": 54640, "timestamp": 1778253604.0731678, "train/loss": 2.106271195411682, "train/z_loss": 0.0013860978186130523, "train/perplexity": 8.217542475179389, "train/grad_norm": 0.1865234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1931717.8819135076, "perf/iters_per_sec": 0.9211148652617968, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0856409311294555, "data/tokens_consumed": 114590482432, "data/tokens_consumed_B": 114.590482432, "train/loss_slope": -1.0830042156914256e-05} {"step": 54650, "timestamp": 1778253614.7591074, "grad/layer_0/attn": 0.0026052570901811123, "grad/layer_0/mlp": 0.002850984688848257, "grad/layer_0/attn_mlp_ratio": 0.9138095371015943, "grad/layer_4/attn": 0.0021086365450173616, "grad/layer_4/mlp": 0.002467845566570759, "grad/layer_4/attn_mlp_ratio": 0.8544442521591948, "grad/layer_8/attn": 0.004819076973944902, "grad/layer_8/mlp": 0.0036286208778619766, "grad/layer_8/attn_mlp_ratio": 1.3280739441638667, "grad/layer_12/attn": 0.007437222637236118, "grad/layer_12/mlp": 0.007527360692620277, "grad/layer_12/attn_mlp_ratio": 0.9880252643831648, "grad/layer_16/attn": 0.0036761718802154064, "grad/layer_16/mlp": 0.004617178812623024, "grad/layer_16/attn_mlp_ratio": 0.7961943753500705, "grad/layer_20/attn": 0.008166252635419369, "grad/layer_20/mlp": 0.005943358410149813, "grad/layer_20/attn_mlp_ratio": 1.3740131310391916, "grad/layer_24/attn": 0.010934652760624886, "grad/layer_24/mlp": 0.00887218676507473, "grad/layer_24/attn_mlp_ratio": 1.2324642083079913, "grad/layer_27/attn": 0.010935242287814617, "grad/layer_27/mlp": 0.007118843030184507, "grad/layer_27/attn_mlp_ratio": 1.5360982238038439} {"step": 54650, "timestamp": 1778253614.7732306, "train/loss": 2.1595701456069945, "train/z_loss": 0.0013620062032714485, "train/perplexity": 8.667411132837694, "train/grad_norm": 0.130859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1961284.357918837, "perf/iters_per_sec": 0.935213259658259, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0692748308181763, "data/tokens_consumed": 114611453952, "data/tokens_consumed_B": 114.611453952, "train/loss_slope": -1.1531379256490322e-05} {"step": 54660, "timestamp": 1778253625.1497748, "train/loss": 2.1663984298706054, "train/z_loss": 0.0013745415024459362, "train/perplexity": 8.72679720162002, "train/grad_norm": 0.1669921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022455.275253689, "perf/iters_per_sec": 0.964381826044888, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03693368434906, "data/tokens_consumed": 114632425472, "data/tokens_consumed_B": 114.632425472, "train/loss_slope": -1.2133070602096596e-05} {"step": 54670, "timestamp": 1778253635.522821, "train/loss": 2.0976687550544737, "train/z_loss": 0.0013982756063342094, "train/perplexity": 8.14715474335504, "train/grad_norm": 0.21875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022729.5789793616, "perf/iters_per_sec": 0.9645126242539223, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036793065071106, "data/tokens_consumed": 114653396992, "data/tokens_consumed_B": 114.653396992, "train/loss_slope": -1.610762191684341e-05} {"step": 54675, "timestamp": 1778253641.2868505, "eos/sharpness": 23.910474777221676, "eos/L0_probe": 1.9765135049819946, "eos/L_plus": 2.0981640815734863, "eos/L_minus": 2.0939676761627197, "eos/grad_norm": 0.11811822652816772, "eos/embed_grad_frac": 0.1691204458475113, "eos/time_s": 0.5848805904388428} {"step": 54675, "timestamp": 1778253642.6615696, "geo/rankme_last": 438.79278564453125, "geo/layer_0/stable_rank_q_proj": 19.46486473083496, "geo/layer_0/stable_rank_k_proj": 16.33379364013672, "geo/layer_0/stable_rank_o_proj": 47.58562469482422, "geo/layer_0/stable_rank_gate_proj": 132.81190490722656, "geo/layer_0/stable_rank_down_proj": 54.3210334777832, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06821952760219574, "geo/layer_0/attn_entropy_mean": 6.170168876647949, "geo/layer_0/attn_entropy_std": 0.39891767501831055, "geo/layer_7/stable_rank_q_proj": 42.78314208984375, "geo/layer_7/stable_rank_k_proj": 41.854774475097656, "geo/layer_7/stable_rank_o_proj": 93.18029022216797, "geo/layer_7/stable_rank_gate_proj": 84.74497985839844, "geo/layer_7/stable_rank_down_proj": 142.54364013671875, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.46142470836639404, "geo/layer_7/attn_entropy_mean": 4.641717910766602, "geo/layer_7/attn_entropy_std": 0.8099457621574402, "geo/layer_14/stable_rank_q_proj": 52.14834213256836, "geo/layer_14/stable_rank_k_proj": 39.206512451171875, "geo/layer_14/stable_rank_o_proj": 44.55329513549805, "geo/layer_14/stable_rank_gate_proj": 72.72322082519531, "geo/layer_14/stable_rank_down_proj": 130.52317810058594, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38574376702308655, "geo/layer_14/attn_entropy_mean": 5.495821475982666, "geo/layer_14/attn_entropy_std": 0.38030579686164856, "geo/layer_21/stable_rank_q_proj": 40.64497756958008, "geo/layer_21/stable_rank_k_proj": 30.379547119140625, "geo/layer_21/stable_rank_o_proj": 72.344482421875, "geo/layer_21/stable_rank_gate_proj": 68.19303894042969, "geo/layer_21/stable_rank_down_proj": 53.04359436035156, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14513200521469116, "geo/layer_21/attn_entropy_mean": 5.688716888427734, "geo/layer_21/attn_entropy_std": 0.3050735294818878, "geo/layer_27/stable_rank_q_proj": 43.076847076416016, "geo/layer_27/stable_rank_k_proj": 31.4694881439209, "geo/layer_27/stable_rank_o_proj": 116.03225708007812, "geo/layer_27/stable_rank_gate_proj": 81.27082824707031, "geo/layer_27/stable_rank_down_proj": 129.91390991210938, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08868110924959183, "geo/layer_27/attn_entropy_mean": 4.237531661987305, "geo/layer_27/attn_entropy_std": 0.7259860038757324, "attnres/final_alpha/block_0": 0.23859889805316925, "attnres/block_norm/0": 1.752197027206421, "attnres/final_alpha/block_1": 0.00484872329980135, "attnres/block_norm/1": 44822.05078125, "attnres/final_alpha/block_2": 0.010550221428275108, "attnres/block_norm/2": 27874.6953125, "attnres/final_alpha/block_3": 0.012422339990735054, "attnres/block_norm/3": 54504.07421875, "attnres/final_alpha/block_4": 0.014274916611611843, "attnres/block_norm/4": 14510.732421875, "attnres/final_alpha/block_5": 0.6068894863128662, "attnres/block_norm/5": 6498.1806640625, "attnres/final_alpha/block_6": 0.11241541802883148, "attnres/block_norm/6": 35971.765625, "geo/tier1_time_s": 1.3561315536499023, "geo/step": 54675.0, "geo/rankme_slope": 9.534282462985194e-06} {"step": 54680, "timestamp": 1778253647.8535438, "train/loss": 2.1985159397125242, "train/z_loss": 0.0013794003869406879, "train/perplexity": 9.011629768893574, "train/grad_norm": 0.2080078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1701732.613275119, "perf/iters_per_sec": 0.8114493433356853, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2323628187179565, "data/tokens_consumed": 114674368512, "data/tokens_consumed_B": 114.674368512, "train/loss_slope": -1.3033783668302996e-05} {"step": 54690, "timestamp": 1778253658.2270818, "train/loss": 2.192948412895203, "train/z_loss": 0.0013604959356598556, "train/perplexity": 8.961596687988363, "train/grad_norm": 0.1376953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022878.1563431404, "perf/iters_per_sec": 0.9645834714618399, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036716914176941, "data/tokens_consumed": 114695340032, "data/tokens_consumed_B": 114.695340032, "train/loss_slope": -1.2427447273536022e-05} {"step": 54700, "timestamp": 1778253668.5916674, "grad/layer_0/attn": 0.002960123587399721, "grad/layer_0/mlp": 0.0030515759717673063, "grad/layer_0/attn_mlp_ratio": 0.97003106518835, "grad/layer_4/attn": 0.0020387370605021715, "grad/layer_4/mlp": 0.0025507782120257616, "grad/layer_4/attn_mlp_ratio": 0.7992607789122449, "grad/layer_8/attn": 0.006763918790966272, "grad/layer_8/mlp": 0.003494357457384467, "grad/layer_8/attn_mlp_ratio": 1.9356687688334555, "grad/layer_12/attn": 0.004288734868168831, "grad/layer_12/mlp": 0.007076446898281574, "grad/layer_12/attn_mlp_ratio": 0.6060576542451737, "grad/layer_16/attn": 0.003488929010927677, "grad/layer_16/mlp": 0.004335690755397081, "grad/layer_16/attn_mlp_ratio": 0.8046996723912286, "grad/layer_20/attn": 0.004714013077318668, "grad/layer_20/mlp": 0.006325313821434975, "grad/layer_20/attn_mlp_ratio": 0.7452615215418806, "grad/layer_24/attn": 0.006430438254028559, "grad/layer_24/mlp": 0.008303085342049599, "grad/layer_24/attn_mlp_ratio": 0.7744636977312881, "grad/layer_27/attn": 0.009973431937396526, "grad/layer_27/mlp": 0.007080948445945978, "grad/layer_27/attn_mlp_ratio": 1.4084881245333394} {"step": 54700, "timestamp": 1778253668.6056786, "train/loss": 2.1340420961380007, "train/z_loss": 0.0013780305278487503, "train/perplexity": 8.448949348120488, "train/grad_norm": 0.09814453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021569.6679429968, "perf/iters_per_sec": 0.9639595355715737, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373879432678224, "data/tokens_consumed": 114716311552, "data/tokens_consumed_B": 114.716311552, "train/loss_slope": -1.424189974444076e-05} {"step": 54710, "timestamp": 1778253678.988326, "train/loss": 2.182571005821228, "train/z_loss": 0.0013826131471432745, "train/perplexity": 8.869079426036937, "train/grad_norm": 0.08642578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021218.902564653, "perf/iters_per_sec": 0.9637922776053681, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0375679731369019, "data/tokens_consumed": 114737283072, "data/tokens_consumed_B": 114.737283072, "train/loss_slope": -1.3240011597480916e-05} {"step": 54720, "timestamp": 1778253689.3381872, "train/loss": 2.1724826216697695, "train/z_loss": 0.0013785791234113276, "train/perplexity": 8.78005455928921, "train/grad_norm": 0.138671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027629.5542019112, "perf/iters_per_sec": 0.9668491145143085, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342875480651856, "data/tokens_consumed": 114758254592, "data/tokens_consumed_B": 114.758254592, "train/loss_slope": -1.4766893228038215e-05} {"step": 54730, "timestamp": 1778253699.6757412, "train/loss": 2.146527886390686, "train/z_loss": 0.0013747778139077127, "train/perplexity": 8.55510248135687, "train/grad_norm": 0.1259765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029648.0594275405, "perf/iters_per_sec": 0.9678116128099158, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0332589387893676, "data/tokens_consumed": 114779226112, "data/tokens_consumed_B": 114.779226112, "train/loss_slope": -1.5864557401575318e-05} {"step": 54740, "timestamp": 1778253710.0312855, "train/loss": 2.1174147605895994, "train/z_loss": 0.001374635531101376, "train/perplexity": 8.309627319168944, "train/grad_norm": 0.1484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026652.6945712317, "perf/iters_per_sec": 0.966383311544052, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034786081314087, "data/tokens_consumed": 114800197632, "data/tokens_consumed_B": 114.800197632, "train/loss_slope": -1.8057076987510666e-05} {"step": 54750, "timestamp": 1778253720.369029, "grad/layer_0/attn": 0.0025568010751158, "grad/layer_0/mlp": 0.002840739907696843, "grad/layer_0/attn_mlp_ratio": 0.9000475468322603, "grad/layer_4/attn": 0.0019432378467172384, "grad/layer_4/mlp": 0.002440570155158639, "grad/layer_4/attn_mlp_ratio": 0.7962228674260912, "grad/layer_8/attn": 0.006422821898013353, "grad/layer_8/mlp": 0.0036319971550256014, "grad/layer_8/attn_mlp_ratio": 1.7683994361851685, "grad/layer_12/attn": 0.0042813923209905624, "grad/layer_12/mlp": 0.0066667236387729645, "grad/layer_12/attn_mlp_ratio": 0.6422033503639029, "grad/layer_16/attn": 0.00361826759763062, "grad/layer_16/mlp": 0.004490097519010305, "grad/layer_16/attn_mlp_ratio": 0.8058327245072566, "grad/layer_20/attn": 0.004722002428025007, "grad/layer_20/mlp": 0.005710948258638382, "grad/layer_20/attn_mlp_ratio": 0.8268333263568223, "grad/layer_24/attn": 0.02189633995294571, "grad/layer_24/mlp": 0.010079962201416492, "grad/layer_24/attn_mlp_ratio": 2.1722640718476414, "grad/layer_27/attn": 0.005229753442108631, "grad/layer_27/mlp": 0.009489196352660656, "grad/layer_27/attn_mlp_ratio": 0.5511271126273576} {"step": 54750, "timestamp": 1778253720.9621878, "eos/sharpness": 50.120854377746575, "eos/L0_probe": 1.9755809307098389, "eos/L_plus": 2.2163965702056885, "eos/L_minus": 2.235973834991455, "eos/grad_norm": 0.15193425118923187, "eos/embed_grad_frac": 0.16412778198719025, "eos/time_s": 0.5904390811920166} {"step": 54750, "timestamp": 1778253720.9797373, "train/loss": 2.1830524682998655, "train/z_loss": 0.0013755502761341632, "train/perplexity": 8.873350583119059, "train/grad_norm": 0.15234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1916728.8657508623, "perf/iters_per_sec": 0.9139675453905403, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0941307544708252, "data/tokens_consumed": 114821169152, "data/tokens_consumed_B": 114.821169152, "train/loss_slope": -1.4131402962207155e-05} {"step": 54750, "timestamp": 1778253722.3411698, "geo/rankme_last": 438.91375732421875, "geo/layer_0/stable_rank_q_proj": 19.505416870117188, "geo/layer_0/stable_rank_k_proj": 16.327024459838867, "geo/layer_0/stable_rank_o_proj": 47.629669189453125, "geo/layer_0/stable_rank_gate_proj": 132.66786193847656, "geo/layer_0/stable_rank_down_proj": 54.3549919128418, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0631457194685936, "geo/layer_0/attn_entropy_mean": 6.169764518737793, "geo/layer_0/attn_entropy_std": 0.40004560351371765, "geo/layer_7/stable_rank_q_proj": 42.85700607299805, "geo/layer_7/stable_rank_k_proj": 41.92176055908203, "geo/layer_7/stable_rank_o_proj": 93.14122772216797, "geo/layer_7/stable_rank_gate_proj": 84.54315185546875, "geo/layer_7/stable_rank_down_proj": 142.8173065185547, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4532451927661896, "geo/layer_7/attn_entropy_mean": 4.612336158752441, "geo/layer_7/attn_entropy_std": 0.7802103161811829, "geo/layer_14/stable_rank_q_proj": 52.16899490356445, "geo/layer_14/stable_rank_k_proj": 39.2544059753418, "geo/layer_14/stable_rank_o_proj": 44.63920974731445, "geo/layer_14/stable_rank_gate_proj": 72.64856719970703, "geo/layer_14/stable_rank_down_proj": 130.166015625, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39058852195739746, "geo/layer_14/attn_entropy_mean": 5.544310569763184, "geo/layer_14/attn_entropy_std": 0.3796117305755615, "geo/layer_21/stable_rank_q_proj": 40.69264221191406, "geo/layer_21/stable_rank_k_proj": 30.382488250732422, "geo/layer_21/stable_rank_o_proj": 72.29315948486328, "geo/layer_21/stable_rank_gate_proj": 67.99256134033203, "geo/layer_21/stable_rank_down_proj": 53.03432846069336, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14830844104290009, "geo/layer_21/attn_entropy_mean": 5.694708824157715, "geo/layer_21/attn_entropy_std": 0.3000431954860687, "geo/layer_27/stable_rank_q_proj": 43.14226150512695, "geo/layer_27/stable_rank_k_proj": 31.387210845947266, "geo/layer_27/stable_rank_o_proj": 116.00577545166016, "geo/layer_27/stable_rank_gate_proj": 81.3119125366211, "geo/layer_27/stable_rank_down_proj": 129.7342071533203, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0905478447675705, "geo/layer_27/attn_entropy_mean": 4.266046524047852, "geo/layer_27/attn_entropy_std": 0.7242785096168518, "attnres/final_alpha/block_0": 0.23823919892311096, "attnres/block_norm/0": 1.7523325681686401, "attnres/final_alpha/block_1": 0.004821125417947769, "attnres/block_norm/1": 44800.1484375, "attnres/final_alpha/block_2": 0.010475791990756989, "attnres/block_norm/2": 27980.947265625, "attnres/final_alpha/block_3": 0.01241310779005289, "attnres/block_norm/3": 54171.39453125, "attnres/final_alpha/block_4": 0.014437972567975521, "attnres/block_norm/4": 14542.1875, "attnres/final_alpha/block_5": 0.6066359281539917, "attnres/block_norm/5": 6455.73046875, "attnres/final_alpha/block_6": 0.11297684162855148, "attnres/block_norm/6": 36075.1328125, "geo/tier1_time_s": 1.3579230308532715, "geo/step": 54750.0, "geo/rankme_slope": 2.872250462685074e-05} {"step": 54760, "timestamp": 1778253732.6974075, "train/loss": 2.1277588963508607, "train/z_loss": 0.0013700089417397975, "train/perplexity": 8.396029338970473, "train/grad_norm": 0.134765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1790237.330573515, "perf/iters_per_sec": 0.8536516812198234, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1714379787445068, "data/tokens_consumed": 114842140672, "data/tokens_consumed_B": 114.842140672, "train/loss_slope": -1.584473122643387e-05} {"step": 54770, "timestamp": 1778253743.0620303, "train/loss": 2.1224822044372558, "train/z_loss": 0.0013878788100555538, "train/perplexity": 8.351842760856469, "train/grad_norm": 0.123046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024874.5291360158, "perf/iters_per_sec": 0.9655354161911086, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0356947898864746, "data/tokens_consumed": 114863112192, "data/tokens_consumed_B": 114.863112192, "train/loss_slope": -1.687487414245887e-05} {"step": 54780, "timestamp": 1778253753.4125674, "train/loss": 2.1554529905319213, "train/z_loss": 0.00137236334849149, "train/perplexity": 8.631799416887317, "train/grad_norm": 0.1767578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027392.8444456419, "perf/iters_per_sec": 0.9667362425068101, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344083070755006, "data/tokens_consumed": 114884083712, "data/tokens_consumed_B": 114.884083712, "train/loss_slope": -1.238030862278892e-05} {"step": 54790, "timestamp": 1778253763.7574086, "train/loss": 2.176336431503296, "train/z_loss": 0.0013741269242018462, "train/perplexity": 8.8139565037533, "train/grad_norm": 0.130859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028117.7293308605, "perf/iters_per_sec": 0.9670818945555022, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340385913848877, "data/tokens_consumed": 114905055232, "data/tokens_consumed_B": 114.905055232, "train/loss_slope": -8.330810388358552e-06} {"step": 54800, "timestamp": 1778253774.0959713, "grad/layer_0/attn": 0.002699004951864481, "grad/layer_0/mlp": 0.002934136660769582, "grad/layer_0/attn_mlp_ratio": 0.9198633778599232, "grad/layer_4/attn": 0.0019315477693453431, "grad/layer_4/mlp": 0.00268897064961493, "grad/layer_4/attn_mlp_ratio": 0.7183223430831099, "grad/layer_8/attn": 0.003404096933081746, "grad/layer_8/mlp": 0.0036969378124922514, "grad/layer_8/attn_mlp_ratio": 0.9207882343869038, "grad/layer_12/attn": 0.0039419070817530155, "grad/layer_12/mlp": 0.006356667727231979, "grad/layer_12/attn_mlp_ratio": 0.620121609133936, "grad/layer_16/attn": 0.0035129166208207607, "grad/layer_16/mlp": 0.004668121691793203, "grad/layer_16/attn_mlp_ratio": 0.7525331980405155, "grad/layer_20/attn": 0.006219100207090378, "grad/layer_20/mlp": 0.006228163372725248, "grad/layer_20/attn_mlp_ratio": 0.9985447932324576, "grad/layer_24/attn": 0.015306585468351841, "grad/layer_24/mlp": 0.011831622570753098, "grad/layer_24/attn_mlp_ratio": 1.2937012863154094, "grad/layer_27/attn": 0.0049649132415652275, "grad/layer_27/mlp": 0.01240561157464981, "grad/layer_27/attn_mlp_ratio": 0.40021511004658933} {"step": 54800, "timestamp": 1778253774.11001, "train/loss": 2.20244460105896, "train/z_loss": 0.0013858150457963347, "train/perplexity": 9.047103046055765, "train/grad_norm": 0.2158203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027116.8541249041, "perf/iters_per_sec": 0.966604640066578, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345491409301757, "data/tokens_consumed": 114926026752, "data/tokens_consumed_B": 114.926026752, "train/loss_slope": -5.521033341699434e-06} {"step": 54810, "timestamp": 1778253784.4539607, "train/loss": 2.140912353992462, "train/z_loss": 0.001396804954856634, "train/perplexity": 8.507195663239186, "train/grad_norm": 0.2421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028894.5626772784, "perf/iters_per_sec": 0.9674523175608055, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0336426734924316, "data/tokens_consumed": 114946998272, "data/tokens_consumed_B": 114.946998272, "train/loss_slope": -5.055787840155711e-06} {"step": 54820, "timestamp": 1778253794.798648, "train/loss": 2.152601790428162, "train/z_loss": 0.0013895492884330451, "train/perplexity": 8.607223481597726, "train/grad_norm": 0.1064453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028386.4147346371, "perf/iters_per_sec": 0.9672100137398897, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033901619911194, "data/tokens_consumed": 114967969792, "data/tokens_consumed_B": 114.967969792, "train/loss_slope": -6.621934392593229e-06} {"step": 54825, "timestamp": 1778253800.5594008, "eos/sharpness": 16.821146011352536, "eos/L0_probe": 1.9801440238952637, "eos/L_plus": 2.0788161754608154, "eos/L_minus": 2.0496833324432373, "eos/grad_norm": 0.11220754683017731, "eos/embed_grad_frac": 0.27578818798065186, "eos/time_s": 0.5912034511566162} {"step": 54825, "timestamp": 1778253801.9383252, "geo/rankme_last": 438.4581604003906, "geo/layer_0/stable_rank_q_proj": 19.5114688873291, "geo/layer_0/stable_rank_k_proj": 16.323345184326172, "geo/layer_0/stable_rank_o_proj": 47.64908981323242, "geo/layer_0/stable_rank_gate_proj": 132.47421264648438, "geo/layer_0/stable_rank_down_proj": 54.30162811279297, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06522752344608307, "geo/layer_0/attn_entropy_mean": 6.1730146408081055, "geo/layer_0/attn_entropy_std": 0.40139326453208923, "geo/layer_7/stable_rank_q_proj": 42.85432052612305, "geo/layer_7/stable_rank_k_proj": 41.85011672973633, "geo/layer_7/stable_rank_o_proj": 93.22138977050781, "geo/layer_7/stable_rank_gate_proj": 84.42134094238281, "geo/layer_7/stable_rank_down_proj": 143.17626953125, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4687459468841553, "geo/layer_7/attn_entropy_mean": 4.643350601196289, "geo/layer_7/attn_entropy_std": 0.8043063879013062, "geo/layer_14/stable_rank_q_proj": 52.18363571166992, "geo/layer_14/stable_rank_k_proj": 39.29005432128906, "geo/layer_14/stable_rank_o_proj": 44.62079620361328, "geo/layer_14/stable_rank_gate_proj": 72.62805938720703, "geo/layer_14/stable_rank_down_proj": 129.87936401367188, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3977562487125397, "geo/layer_14/attn_entropy_mean": 5.543308734893799, "geo/layer_14/attn_entropy_std": 0.399990439414978, "geo/layer_21/stable_rank_q_proj": 40.72283172607422, "geo/layer_21/stable_rank_k_proj": 30.41689682006836, "geo/layer_21/stable_rank_o_proj": 72.25013732910156, "geo/layer_21/stable_rank_gate_proj": 67.93878936767578, "geo/layer_21/stable_rank_down_proj": 52.99103546142578, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14652328193187714, "geo/layer_21/attn_entropy_mean": 5.710543632507324, "geo/layer_21/attn_entropy_std": 0.2910824120044708, "geo/layer_27/stable_rank_q_proj": 43.133060455322266, "geo/layer_27/stable_rank_k_proj": 31.394481658935547, "geo/layer_27/stable_rank_o_proj": 116.01692199707031, "geo/layer_27/stable_rank_gate_proj": 81.24864196777344, "geo/layer_27/stable_rank_down_proj": 129.85362243652344, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09235206991434097, "geo/layer_27/attn_entropy_mean": 4.215883255004883, "geo/layer_27/attn_entropy_std": 0.7207937836647034, "attnres/final_alpha/block_0": 0.2373965084552765, "attnres/block_norm/0": 1.7525653839111328, "attnres/final_alpha/block_1": 0.00486255157738924, "attnres/block_norm/1": 44603.7734375, "attnres/final_alpha/block_2": 0.010220483876764774, "attnres/block_norm/2": 28036.546875, "attnres/final_alpha/block_3": 0.012140605598688126, "attnres/block_norm/3": 54744.1953125, "attnres/final_alpha/block_4": 0.014250279404222965, "attnres/block_norm/4": 14430.6044921875, "attnres/final_alpha/block_5": 0.6102715730667114, "attnres/block_norm/5": 6389.5205078125, "attnres/final_alpha/block_6": 0.11085803806781769, "attnres/block_norm/6": 35942.05859375, "geo/tier1_time_s": 1.3594849109649658, "geo/step": 54825.0, "geo/rankme_slope": -8.035440738795515e-06} {"step": 54830, "timestamp": 1778253807.1157506, "train/loss": 2.1607392549514772, "train/z_loss": 0.0013743617688305675, "train/perplexity": 8.67755020987557, "train/grad_norm": 0.140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1703680.6263652325, "perf/iters_per_sec": 0.8123782283617175, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2309537172317504, "data/tokens_consumed": 114988941312, "data/tokens_consumed_B": 114.988941312, "train/loss_slope": -6.865600870065543e-06} {"step": 54840, "timestamp": 1778253817.4560416, "train/loss": 2.162811040878296, "train/z_loss": 0.0013729486032389103, "train/perplexity": 8.695546872458536, "train/grad_norm": 0.095703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029021.3469214505, "perf/iters_per_sec": 0.9675127729995014, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033578085899353, "data/tokens_consumed": 115009912832, "data/tokens_consumed_B": 115.009912832, "train/loss_slope": -5.934241402446123e-06} {"step": 54850, "timestamp": 1778253827.796332, "grad/layer_0/attn": 0.0026402396615594625, "grad/layer_0/mlp": 0.0029628814663738012, "grad/layer_0/attn_mlp_ratio": 0.8911053656426732, "grad/layer_4/attn": 0.0023158846888691187, "grad/layer_4/mlp": 0.0026176359970122576, "grad/layer_4/attn_mlp_ratio": 0.8847236984210468, "grad/layer_8/attn": 0.007477285340428352, "grad/layer_8/mlp": 0.0036037915851920843, "grad/layer_8/attn_mlp_ratio": 2.074838390674018, "grad/layer_12/attn": 0.004860471002757549, "grad/layer_12/mlp": 0.006239660549908876, "grad/layer_12/attn_mlp_ratio": 0.7789639974777365, "grad/layer_16/attn": 0.0035373272839933634, "grad/layer_16/mlp": 0.004415796138346195, "grad/layer_16/attn_mlp_ratio": 0.8010621625326997, "grad/layer_20/attn": 0.004379132762551308, "grad/layer_20/mlp": 0.005930238403379917, "grad/layer_20/attn_mlp_ratio": 0.7384412549436987, "grad/layer_24/attn": 0.013611221686005592, "grad/layer_24/mlp": 0.011672676540911198, "grad/layer_24/attn_mlp_ratio": 1.1660754516492, "grad/layer_27/attn": 0.005274864379316568, "grad/layer_27/mlp": 0.012223963625729084, "grad/layer_27/attn_mlp_ratio": 0.4315183272520678} {"step": 54850, "timestamp": 1778253827.810426, "train/loss": 2.142403447628021, "train/z_loss": 0.001364421076141298, "train/perplexity": 8.519890150531667, "train/grad_norm": 0.2001953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026860.3677818605, "perf/iters_per_sec": 0.9664823378476431, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346800565719605, "data/tokens_consumed": 115030884352, "data/tokens_consumed_B": 115.030884352, "train/loss_slope": -6.415538289974083e-06} {"step": 54860, "timestamp": 1778253838.7109804, "train/loss": 2.101935029029846, "train/z_loss": 0.0013885985361412167, "train/perplexity": 8.181986986724583, "train/grad_norm": 0.1640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1925693.6243353433, "perf/iters_per_sec": 0.9182422753979412, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0890372037887572, "data/tokens_consumed": 115051855872, "data/tokens_consumed_B": 115.051855872, "train/loss_slope": -8.054494199687061e-06} {"step": 54870, "timestamp": 1778253849.0579603, "train/loss": 2.1508870840072634, "train/z_loss": 0.0013975581037811934, "train/perplexity": 8.59247726655562, "train/grad_norm": 0.2119140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028615.4498207695, "perf/iters_per_sec": 0.9673192261794898, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0337848901748656, "data/tokens_consumed": 115072827392, "data/tokens_consumed_B": 115.072827392, "train/loss_slope": -6.625929688534695e-06} {"step": 54880, "timestamp": 1778253859.404268, "train/loss": 2.1944560170173646, "train/z_loss": 0.0013747605727985502, "train/perplexity": 8.975117417489187, "train/grad_norm": 0.2265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028463.1282431055, "perf/iters_per_sec": 0.9672465935912635, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0338625192642212, "data/tokens_consumed": 115093798912, "data/tokens_consumed_B": 115.093798912, "train/loss_slope": -2.2736451949871177e-07} {"step": 54890, "timestamp": 1778253869.7439148, "train/loss": 2.1670340299606323, "train/z_loss": 0.0013810054631903767, "train/perplexity": 8.732345717838927, "train/grad_norm": 0.2314453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029623.0509319012, "perf/iters_per_sec": 0.9677996878299242, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0332716703414917, "data/tokens_consumed": 115114770432, "data/tokens_consumed_B": 115.114770432, "train/loss_slope": 2.451396152989194e-07} {"step": 54900, "timestamp": 1778253880.084006, "grad/layer_0/attn": 0.0030753014143556356, "grad/layer_0/mlp": 0.003286258317530155, "grad/layer_0/attn_mlp_ratio": 0.9358063255010031, "grad/layer_4/attn": 0.002410183660686016, "grad/layer_4/mlp": 0.002564434427767992, "grad/layer_4/attn_mlp_ratio": 0.9398499492142498, "grad/layer_8/attn": 0.00880122184753418, "grad/layer_8/mlp": 0.0036857910454273224, "grad/layer_8/attn_mlp_ratio": 2.387878612832737, "grad/layer_12/attn": 0.004801939241588116, "grad/layer_12/mlp": 0.006515424232929945, "grad/layer_12/attn_mlp_ratio": 0.7370109752205064, "grad/layer_16/attn": 0.004441606812179089, "grad/layer_16/mlp": 0.004339876119047403, "grad/layer_16/attn_mlp_ratio": 1.023440897389008, "grad/layer_20/attn": 0.0033883261494338512, "grad/layer_20/mlp": 0.005866542924195528, "grad/layer_20/attn_mlp_ratio": 0.57756776613745, "grad/layer_24/attn": 0.012684880755841732, "grad/layer_24/mlp": 0.012353317812085152, "grad/layer_24/attn_mlp_ratio": 1.026839983081162, "grad/layer_27/attn": 0.005532491020858288, "grad/layer_27/mlp": 0.011646000668406487, "grad/layer_27/attn_mlp_ratio": 0.47505501080396156} {"step": 54900, "timestamp": 1778253880.6912394, "eos/sharpness": 53.09376716613769, "eos/L0_probe": 1.9790467023849487, "eos/L_plus": 2.2647175788879395, "eos/L_minus": 2.224313497543335, "eos/grad_norm": 0.16876231133937836, "eos/embed_grad_frac": 0.09738939255475998, "eos/time_s": 0.604407548904419} {"step": 54900, "timestamp": 1778253880.7120264, "train/loss": 2.175263023376465, "train/z_loss": 0.001381643582135439, "train/perplexity": 8.804500607138701, "train/grad_norm": 0.1689453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1912919.9636331578, "perf/iters_per_sec": 0.9121513193288602, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0963093280792235, "data/tokens_consumed": 115135741952, "data/tokens_consumed_B": 115.135741952, "train/loss_slope": 2.69956615212222e-06} {"step": 54900, "timestamp": 1778253882.073854, "geo/rankme_last": 439.0050354003906, "geo/layer_0/stable_rank_q_proj": 19.525136947631836, "geo/layer_0/stable_rank_k_proj": 16.344194412231445, "geo/layer_0/stable_rank_o_proj": 47.70729064941406, "geo/layer_0/stable_rank_gate_proj": 132.7045440673828, "geo/layer_0/stable_rank_down_proj": 54.31879806518555, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.07213828712701797, "geo/layer_0/attn_entropy_mean": 6.172341823577881, "geo/layer_0/attn_entropy_std": 0.4000566005706787, "geo/layer_7/stable_rank_q_proj": 42.90216827392578, "geo/layer_7/stable_rank_k_proj": 41.776512145996094, "geo/layer_7/stable_rank_o_proj": 93.40313720703125, "geo/layer_7/stable_rank_gate_proj": 84.5678939819336, "geo/layer_7/stable_rank_down_proj": 143.10372924804688, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4636843800544739, "geo/layer_7/attn_entropy_mean": 4.647952079772949, "geo/layer_7/attn_entropy_std": 0.7821292877197266, "geo/layer_14/stable_rank_q_proj": 52.24544143676758, "geo/layer_14/stable_rank_k_proj": 39.365352630615234, "geo/layer_14/stable_rank_o_proj": 44.60810089111328, "geo/layer_14/stable_rank_gate_proj": 72.53389739990234, "geo/layer_14/stable_rank_down_proj": 129.70797729492188, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38576170802116394, "geo/layer_14/attn_entropy_mean": 5.541816711425781, "geo/layer_14/attn_entropy_std": 0.3919772207736969, "geo/layer_21/stable_rank_q_proj": 40.71168518066406, "geo/layer_21/stable_rank_k_proj": 30.48040008544922, "geo/layer_21/stable_rank_o_proj": 72.21318817138672, "geo/layer_21/stable_rank_gate_proj": 67.94314575195312, "geo/layer_21/stable_rank_down_proj": 53.02633285522461, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14202797412872314, "geo/layer_21/attn_entropy_mean": 5.720272064208984, "geo/layer_21/attn_entropy_std": 0.29445087909698486, "geo/layer_27/stable_rank_q_proj": 43.154022216796875, "geo/layer_27/stable_rank_k_proj": 31.444103240966797, "geo/layer_27/stable_rank_o_proj": 116.15829467773438, "geo/layer_27/stable_rank_gate_proj": 81.19457244873047, "geo/layer_27/stable_rank_down_proj": 129.76551818847656, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09651878476142883, "geo/layer_27/attn_entropy_mean": 4.256608009338379, "geo/layer_27/attn_entropy_std": 0.7182690501213074, "attnres/final_alpha/block_0": 0.23673012852668762, "attnres/block_norm/0": 1.752463459968567, "attnres/final_alpha/block_1": 0.004840974695980549, "attnres/block_norm/1": 44838.640625, "attnres/final_alpha/block_2": 0.010199984535574913, "attnres/block_norm/2": 28013.61328125, "attnres/final_alpha/block_3": 0.01195180881768465, "attnres/block_norm/3": 54748.5625, "attnres/final_alpha/block_4": 0.014235680922865868, "attnres/block_norm/4": 14483.791015625, "attnres/final_alpha/block_5": 0.611909031867981, "attnres/block_norm/5": 6370.33251953125, "attnres/final_alpha/block_6": 0.1101323813199997, "attnres/block_norm/6": 36029.7890625, "geo/tier1_time_s": 1.3582556247711182, "geo/step": 54900.0, "geo/rankme_slope": -1.860171803096239e-05} {"step": 54910, "timestamp": 1778253892.4288697, "train/loss": 2.1248670339584352, "train/z_loss": 0.0013717580121010541, "train/perplexity": 8.371784251104842, "train/grad_norm": 0.203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1790357.541205622, "perf/iters_per_sec": 0.8537090021160231, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1713593244552611, "data/tokens_consumed": 115156713472, "data/tokens_consumed_B": 115.156713472, "train/loss_slope": -8.310107186218e-07} {"step": 54920, "timestamp": 1778253902.7739432, "train/loss": 2.0901408910751345, "train/z_loss": 0.0014052854501642286, "train/perplexity": 8.086054336942674, "train/grad_norm": 0.1103515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028687.0806814341, "perf/iters_per_sec": 0.9673533824355288, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0337483882904053, "data/tokens_consumed": 115177684992, "data/tokens_consumed_B": 115.177684992, "train/loss_slope": -1.6175663367499662e-06} {"step": 54930, "timestamp": 1778253913.1169572, "train/loss": 2.1622116327285767, "train/z_loss": 0.0013768319389782847, "train/perplexity": 8.690336252596934, "train/grad_norm": 0.11279296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028468.9287687237, "perf/iters_per_sec": 0.9672493594974154, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0338595628738403, "data/tokens_consumed": 115198656512, "data/tokens_consumed_B": 115.198656512, "train/loss_slope": -9.725061651825767e-07} {"step": 54940, "timestamp": 1778253923.4677374, "train/loss": 2.1487636804580688, "train/z_loss": 0.0013759367517195641, "train/perplexity": 8.574251327191767, "train/grad_norm": 0.11962890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027555.1470745658, "perf/iters_per_sec": 0.9668136344311551, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343255043029784, "data/tokens_consumed": 115219628032, "data/tokens_consumed_B": 115.219628032, "train/loss_slope": -9.250718839813238e-07} {"step": 54950, "timestamp": 1778253933.8009188, "grad/layer_0/attn": 0.002938151126727462, "grad/layer_0/mlp": 0.0031423892360180616, "grad/layer_0/attn_mlp_ratio": 0.9350054409395991, "grad/layer_4/attn": 0.002264405135065317, "grad/layer_4/mlp": 0.0026162308640778065, "grad/layer_4/attn_mlp_ratio": 0.8655218771418738, "grad/layer_8/attn": 0.004490228369832039, "grad/layer_8/mlp": 0.00378859369084239, "grad/layer_8/attn_mlp_ratio": 1.1851965710036285, "grad/layer_12/attn": 0.004007006995379925, "grad/layer_12/mlp": 0.006811844650655985, "grad/layer_12/attn_mlp_ratio": 0.5882410921056367, "grad/layer_16/attn": 0.0056187850423157215, "grad/layer_16/mlp": 0.004725591279566288, "grad/layer_16/attn_mlp_ratio": 1.1890120391305206, "grad/layer_20/attn": 0.004860467743128538, "grad/layer_20/mlp": 0.006512144114822149, "grad/layer_20/attn_mlp_ratio": 0.7463697950769784, "grad/layer_24/attn": 0.019626367837190628, "grad/layer_24/mlp": 0.01429726742208004, "grad/layer_24/attn_mlp_ratio": 1.3727355808989776, "grad/layer_27/attn": 0.011234773322939873, "grad/layer_27/mlp": 0.01228333730250597, "grad/layer_27/attn_mlp_ratio": 0.9146352456823196} {"step": 54950, "timestamp": 1778253933.8151913, "train/loss": 2.1388227462768556, "train/z_loss": 0.001377432222943753, "train/perplexity": 8.489437521779497, "train/grad_norm": 0.265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028150.977920665, "perf/iters_per_sec": 0.9670977487185788, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340216398239135, "data/tokens_consumed": 115240599552, "data/tokens_consumed_B": 115.240599552, "train/loss_slope": -3.624105303272486e-06} {"step": 54960, "timestamp": 1778253944.156066, "train/loss": 2.146083045005798, "train/z_loss": 0.0013873122283257545, "train/perplexity": 8.551297664054049, "train/grad_norm": 0.162109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029068.7605302965, "perf/iters_per_sec": 0.9675353815700037, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0335539340972901, "data/tokens_consumed": 115261571072, "data/tokens_consumed_B": 115.261571072, "train/loss_slope": -6.096783451157011e-06} {"step": 54970, "timestamp": 1778253954.5003123, "train/loss": 2.1888931512832643, "train/z_loss": 0.0013795441831462086, "train/perplexity": 8.925328656837866, "train/grad_norm": 0.10302734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028813.1838728588, "perf/iters_per_sec": 0.9674135131229681, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0336841344833374, "data/tokens_consumed": 115282542592, "data/tokens_consumed_B": 115.282542592, "train/loss_slope": -5.445745611491223e-06} {"step": 54975, "timestamp": 1778253960.265031, "eos/sharpness": 28.10609340667724, "eos/L0_probe": 1.977374792098999, "eos/L_plus": 2.1020267009735107, "eos/L_minus": 2.1337838172912598, "eos/grad_norm": 0.11634387075901031, "eos/embed_grad_frac": 0.20407213270664215, "eos/time_s": 0.593510627746582} {"step": 54975, "timestamp": 1778253961.6401896, "geo/rankme_last": 439.59381103515625, "geo/layer_0/stable_rank_q_proj": 19.556020736694336, "geo/layer_0/stable_rank_k_proj": 16.316665649414062, "geo/layer_0/stable_rank_o_proj": 47.7271842956543, "geo/layer_0/stable_rank_gate_proj": 132.76678466796875, "geo/layer_0/stable_rank_down_proj": 54.3931884765625, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06867744028568268, "geo/layer_0/attn_entropy_mean": 6.175934791564941, "geo/layer_0/attn_entropy_std": 0.4000947177410126, "geo/layer_7/stable_rank_q_proj": 42.927268981933594, "geo/layer_7/stable_rank_k_proj": 41.63516616821289, "geo/layer_7/stable_rank_o_proj": 93.22615051269531, "geo/layer_7/stable_rank_gate_proj": 84.6261215209961, "geo/layer_7/stable_rank_down_proj": 142.82630920410156, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4669208526611328, "geo/layer_7/attn_entropy_mean": 4.676136016845703, "geo/layer_7/attn_entropy_std": 0.7860819101333618, "geo/layer_14/stable_rank_q_proj": 52.24639129638672, "geo/layer_14/stable_rank_k_proj": 39.35953903198242, "geo/layer_14/stable_rank_o_proj": 44.59653854370117, "geo/layer_14/stable_rank_gate_proj": 72.57920837402344, "geo/layer_14/stable_rank_down_proj": 129.64584350585938, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3952246606349945, "geo/layer_14/attn_entropy_mean": 5.518002510070801, "geo/layer_14/attn_entropy_std": 0.38296815752983093, "geo/layer_21/stable_rank_q_proj": 40.598114013671875, "geo/layer_21/stable_rank_k_proj": 30.470077514648438, "geo/layer_21/stable_rank_o_proj": 72.18328094482422, "geo/layer_21/stable_rank_gate_proj": 67.91131591796875, "geo/layer_21/stable_rank_down_proj": 53.03184509277344, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1458185464143753, "geo/layer_21/attn_entropy_mean": 5.708492279052734, "geo/layer_21/attn_entropy_std": 0.30092281103134155, "geo/layer_27/stable_rank_q_proj": 43.23478317260742, "geo/layer_27/stable_rank_k_proj": 31.496292114257812, "geo/layer_27/stable_rank_o_proj": 116.04568481445312, "geo/layer_27/stable_rank_gate_proj": 81.25027465820312, "geo/layer_27/stable_rank_down_proj": 129.7096710205078, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08387459069490433, "geo/layer_27/attn_entropy_mean": 4.240148544311523, "geo/layer_27/attn_entropy_std": 0.6996644735336304, "attnres/final_alpha/block_0": 0.23738379776477814, "attnres/block_norm/0": 1.7525913715362549, "attnres/final_alpha/block_1": 0.004816242028027773, "attnres/block_norm/1": 44734.875, "attnres/final_alpha/block_2": 0.010506609454751015, "attnres/block_norm/2": 27958.287109375, "attnres/final_alpha/block_3": 0.012068210169672966, "attnres/block_norm/3": 54746.12890625, "attnres/final_alpha/block_4": 0.014009306207299232, "attnres/block_norm/4": 14517.708984375, "attnres/final_alpha/block_5": 0.6097977161407471, "attnres/block_norm/5": 6413.205078125, "attnres/final_alpha/block_6": 0.11141815781593323, "attnres/block_norm/6": 36019.9765625, "geo/tier1_time_s": 1.356950044631958, "geo/step": 54975.0, "geo/rankme_slope": 3.5995843649959975e-06} {"step": 54980, "timestamp": 1778253966.8141909, "train/loss": 2.179806685447693, "train/z_loss": 0.0013760707923211157, "train/perplexity": 8.844596304271771, "train/grad_norm": 0.1962890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1704114.329403715, "perf/iters_per_sec": 0.8125850340860915, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.230640435218811, "data/tokens_consumed": 115303514112, "data/tokens_consumed_B": 115.303514112, "train/loss_slope": -3.856371495589947e-06} {"step": 54990, "timestamp": 1778253977.1664188, "train/loss": 2.174657440185547, "train/z_loss": 0.0013763513066805898, "train/perplexity": 8.799170363682407, "train/grad_norm": 0.236328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026954.3881112041, "perf/iters_per_sec": 0.9665271702343007, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346320629119874, "data/tokens_consumed": 115324485632, "data/tokens_consumed_B": 115.324485632, "train/loss_slope": -1.0281071589939278e-06} {"step": 55000, "timestamp": 1778253987.4994805, "grad/layer_0/attn": 0.00266505079343915, "grad/layer_0/mlp": 0.0028501402121037245, "grad/layer_0/attn_mlp_ratio": 0.9350595064114728, "grad/layer_4/attn": 0.002142130397260189, "grad/layer_4/mlp": 0.0024449871852993965, "grad/layer_4/attn_mlp_ratio": 0.8761315079795511, "grad/layer_8/attn": 0.007792843505740166, "grad/layer_8/mlp": 0.003488284070044756, "grad/layer_8/attn_mlp_ratio": 2.2340047788137, "grad/layer_12/attn": 0.003956514876335859, "grad/layer_12/mlp": 0.006860476918518543, "grad/layer_12/attn_mlp_ratio": 0.5767113373685249, "grad/layer_16/attn": 0.003652539337053895, "grad/layer_16/mlp": 0.004678493365645409, "grad/layer_16/attn_mlp_ratio": 0.7807084404145939, "grad/layer_20/attn": 0.0036817521322518587, "grad/layer_20/mlp": 0.006000340450555086, "grad/layer_20/attn_mlp_ratio": 0.613590528942772, "grad/layer_24/attn": 0.005482831504195929, "grad/layer_24/mlp": 0.007659937255084515, "grad/layer_24/attn_mlp_ratio": 0.715780201590883, "grad/layer_27/attn": 0.006799529772251844, "grad/layer_27/mlp": 0.006552600767463446, "grad/layer_27/attn_mlp_ratio": 1.0376841058661928} {"step": 55000, "timestamp": 1778253987.5137289, "train/loss": 2.1531171083450316, "train/z_loss": 0.0013920108205638826, "train/perplexity": 8.611660081103723, "train/grad_norm": 0.10302734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027762.7248103113, "perf/iters_per_sec": 0.9669126152087743, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342196226119995, "data/tokens_consumed": 115345457152, "data/tokens_consumed_B": 115.345457152, "train/loss_slope": 1.8280623376172943e-06} {"step": 55000, "timestamp": 1778253994.7759748, "geo/ww_alpha_mean": 7.568064480997466, "geo/ww_alpha_std": 4.27564160187557, "geo/ww_alpha_min": 1.3488698801893215, "geo/ww_alpha_max": 25.733210552182825, "geo/ww_alpha_healthy_frac": 0.16751269035532995, "geo/ww_alpha_by_type/q_proj": 4.009364677689535, "geo/ww_alpha_by_type/k_proj": 4.472240513683471, "geo/ww_alpha_by_type/v_proj": 8.879182232765992, "geo/ww_alpha_by_type/o_proj": 7.614106297725406, "geo/ww_alpha_by_type/gate_proj": 8.037921156727359, "geo/ww_alpha_by_type/up_proj": 11.69020139452997, "geo/ww_alpha_by_type/down_proj": 8.368524543351368, "geo/twonn_id/layer_0": 0.671698808670044, "geo/twonn_id/layer_7": 3.5543277263641357, "geo/twonn_id/layer_14": 4.466948509216309, "geo/twonn_id/layer_21": 6.695029258728027, "geo/twonn_id/layer_27": 5.576419353485107, "geo/tier2_time_s": 7.253276109695435} {"step": 55000, "timestamp": 1778253995.4503834, "eoc/jacobian_sigma/layer_0/attn": 1110.1341552734375, "eoc/jacobian_sigma/layer_0/mlp": 8219.7197265625, "eoc/jacobian_sigma/layer_0": 8219.7197265625, "eoc/jacobian_sigma/layer_7/attn": 1.15997314453125, "eoc/jacobian_sigma/layer_7/mlp": 1.772403597831726, "eoc/jacobian_sigma/layer_7": 1.772403597831726, "eoc/jacobian_sigma/layer_14/attn": 1.5055729150772095, "eoc/jacobian_sigma/layer_14/mlp": 6.9323554039001465, "eoc/jacobian_sigma/layer_14": 6.9323554039001465, "eoc/jacobian_sigma/layer_21/attn": 1.0921093225479126, "eoc/jacobian_sigma/layer_21/mlp": 4.15205192565918, "eoc/jacobian_sigma/layer_21": 4.15205192565918, "eoc/jacobian_sigma/layer_27/attn": 3.4552931785583496, "eoc/jacobian_sigma/layer_27/mlp": 28.107126235961914, "eoc/jacobian_sigma/layer_27": 28.107126235961914, "eoc/layer0_sigma": 8219.7197265625, "eoc/sigma_max": 28.107126235961914, "eoc/sigma_min": 1.772403597831726, "eoc/sigma_mean": 10.240984290838242, "eoc/time_s": 0.6670324802398682} {"step": 55010, "timestamp": 1778254005.8244882, "train/loss": 2.1799466371536256, "train/z_loss": 0.0013671718072146178, "train/perplexity": 8.845834207234134, "train/grad_norm": 0.09814453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1145776.7166627832, "perf/iters_per_sec": 0.5463489135087887, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.8303321838378905, "data/tokens_consumed": 115366428672, "data/tokens_consumed_B": 115.366428672, "train/loss_slope": 4.826425576117042e-06} {"step": 55020, "timestamp": 1778254016.1750221, "train/loss": 2.135769045352936, "train/z_loss": 0.001377378823235631, "train/perplexity": 8.463552860697138, "train/grad_norm": 0.1279296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027617.8693041217, "perf/iters_per_sec": 0.9668435427208527, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342935085296632, "data/tokens_consumed": 115387400192, "data/tokens_consumed_B": 115.387400192, "train/loss_slope": 2.2562400664505165e-06} {"step": 55030, "timestamp": 1778254026.516358, "train/loss": 2.150756502151489, "train/z_loss": 0.0013680983800441026, "train/perplexity": 8.591355318183101, "train/grad_norm": 0.154296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029316.5361889456, "perf/iters_per_sec": 0.9676535302109459, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0334277391433715, "data/tokens_consumed": 115408371712, "data/tokens_consumed_B": 115.408371712, "train/loss_slope": 2.5789017009191606e-06} {"step": 55040, "timestamp": 1778254036.8729324, "train/loss": 2.126922535896301, "train/z_loss": 0.001392040669452399, "train/perplexity": 8.389010167743814, "train/grad_norm": 0.169921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026334.659269533, "perf/iters_per_sec": 0.9662316604945816, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349484920501708, "data/tokens_consumed": 115429343232, "data/tokens_consumed_B": 115.429343232, "train/loss_slope": 9.713903047142323e-07} {"step": 55050, "timestamp": 1778254047.2434306, "grad/layer_0/attn": 0.003052640240639448, "grad/layer_0/mlp": 0.0030155142303556204, "grad/layer_0/attn_mlp_ratio": 1.0123116345062932, "grad/layer_4/attn": 0.0021985957864671946, "grad/layer_4/mlp": 0.002599232830107212, "grad/layer_4/attn_mlp_ratio": 0.8458633164425582, "grad/layer_8/attn": 0.005384194664657116, "grad/layer_8/mlp": 0.0037541957572102547, "grad/layer_8/attn_mlp_ratio": 1.434180546099189, "grad/layer_12/attn": 0.0048615471459925175, "grad/layer_12/mlp": 0.006793212611228228, "grad/layer_12/attn_mlp_ratio": 0.7156477137771738, "grad/layer_16/attn": 0.003655480919405818, "grad/layer_16/mlp": 0.004814318381249905, "grad/layer_16/attn_mlp_ratio": 0.7592935393955852, "grad/layer_20/attn": 0.005665292497724295, "grad/layer_20/mlp": 0.006754550151526928, "grad/layer_20/attn_mlp_ratio": 0.8387371900066335, "grad/layer_24/attn": 0.012162911705672741, "grad/layer_24/mlp": 0.010210604406893253, "grad/layer_24/attn_mlp_ratio": 1.1912038799917744, "grad/layer_27/attn": 0.005180478096008301, "grad/layer_27/mlp": 0.009471151977777481, "grad/layer_27/attn_mlp_ratio": 0.546974439167063} {"step": 55050, "timestamp": 1778254047.844036, "eos/sharpness": 52.6332139968872, "eos/L0_probe": 1.9778332710266113, "eos/L_plus": 2.2664003372192383, "eos/L_minus": 2.2155983448028564, "eos/grad_norm": 0.16363516449928284, "eos/embed_grad_frac": 0.08376236259937286, "eos/time_s": 0.5978209972381592} {"step": 55050, "timestamp": 1778254047.8635597, "train/loss": 2.1267769575119018, "train/z_loss": 0.0013840506202541293, "train/perplexity": 8.387788998086993, "train/grad_norm": 0.1630859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1908981.4854210352, "perf/iters_per_sec": 0.9102733065705467, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0985711574554444, "data/tokens_consumed": 115450314752, "data/tokens_consumed_B": 115.450314752, "train/loss_slope": 3.2504286381206337e-06} {"step": 55050, "timestamp": 1778254049.2243636, "geo/rankme_last": 438.9131164550781, "geo/layer_0/stable_rank_q_proj": 19.547256469726562, "geo/layer_0/stable_rank_k_proj": 16.260622024536133, "geo/layer_0/stable_rank_o_proj": 47.664554595947266, "geo/layer_0/stable_rank_gate_proj": 132.67784118652344, "geo/layer_0/stable_rank_down_proj": 54.391021728515625, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06390486657619476, "geo/layer_0/attn_entropy_mean": 6.168561935424805, "geo/layer_0/attn_entropy_std": 0.40178197622299194, "geo/layer_7/stable_rank_q_proj": 42.954097747802734, "geo/layer_7/stable_rank_k_proj": 41.59590530395508, "geo/layer_7/stable_rank_o_proj": 93.12616729736328, "geo/layer_7/stable_rank_gate_proj": 84.8393783569336, "geo/layer_7/stable_rank_down_proj": 142.50167846679688, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.471333384513855, "geo/layer_7/attn_entropy_mean": 4.670313835144043, "geo/layer_7/attn_entropy_std": 0.8166694641113281, "geo/layer_14/stable_rank_q_proj": 52.301734924316406, "geo/layer_14/stable_rank_k_proj": 39.38811492919922, "geo/layer_14/stable_rank_o_proj": 44.596439361572266, "geo/layer_14/stable_rank_gate_proj": 72.5887451171875, "geo/layer_14/stable_rank_down_proj": 129.94728088378906, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39642196893692017, "geo/layer_14/attn_entropy_mean": 5.5721940994262695, "geo/layer_14/attn_entropy_std": 0.38665828108787537, "geo/layer_21/stable_rank_q_proj": 40.634029388427734, "geo/layer_21/stable_rank_k_proj": 30.431076049804688, "geo/layer_21/stable_rank_o_proj": 72.09888458251953, "geo/layer_21/stable_rank_gate_proj": 67.85467529296875, "geo/layer_21/stable_rank_down_proj": 52.93531799316406, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14267072081565857, "geo/layer_21/attn_entropy_mean": 5.71409797668457, "geo/layer_21/attn_entropy_std": 0.30568167567253113, "geo/layer_27/stable_rank_q_proj": 43.18714904785156, "geo/layer_27/stable_rank_k_proj": 31.4752254486084, "geo/layer_27/stable_rank_o_proj": 116.1933364868164, "geo/layer_27/stable_rank_gate_proj": 81.23749542236328, "geo/layer_27/stable_rank_down_proj": 129.8673095703125, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09639356285333633, "geo/layer_27/attn_entropy_mean": 4.2511796951293945, "geo/layer_27/attn_entropy_std": 0.6857260465621948, "attnres/final_alpha/block_0": 0.23809637129306793, "attnres/block_norm/0": 1.7527074813842773, "attnres/final_alpha/block_1": 0.004800692200660706, "attnres/block_norm/1": 44567.01953125, "attnres/final_alpha/block_2": 0.010505249723792076, "attnres/block_norm/2": 28028.046875, "attnres/final_alpha/block_3": 0.012216105125844479, "attnres/block_norm/3": 54410.2578125, "attnres/final_alpha/block_4": 0.014359247870743275, "attnres/block_norm/4": 14548.25390625, "attnres/final_alpha/block_5": 0.6101523637771606, "attnres/block_norm/5": 6456.81689453125, "attnres/final_alpha/block_6": 0.10986995697021484, "attnres/block_norm/6": 36327.70703125, "geo/tier1_time_s": 1.357349157333374, "geo/step": 55050.0, "geo/rankme_slope": 2.786055828581432e-06} {"step": 55060, "timestamp": 1778254059.5983071, "train/loss": 2.1801191568374634, "train/z_loss": 0.001368056843057275, "train/perplexity": 8.847360419401884, "train/grad_norm": 0.10693359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1787623.5856935533, "perf/iters_per_sec": 0.8524053505389945, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1731507778167725, "data/tokens_consumed": 115471286272, "data/tokens_consumed_B": 115.471286272, "train/loss_slope": 4.320639123295735e-06} {"step": 55070, "timestamp": 1778254069.97351, "train/loss": 2.1914114713668824, "train/z_loss": 0.0013700447627343238, "train/perplexity": 8.947833816951768, "train/grad_norm": 0.10595703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022664.8332435568, "perf/iters_per_sec": 0.9644817510812553, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036826252937317, "data/tokens_consumed": 115492257792, "data/tokens_consumed_B": 115.492257792, "train/loss_slope": 6.4375191739182834e-06} {"step": 55080, "timestamp": 1778254080.355553, "train/loss": 2.1263550758361816, "train/z_loss": 0.0013748728320933878, "train/perplexity": 8.384251089950673, "train/grad_norm": 0.1376953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020935.2112410965, "perf/iters_per_sec": 0.9636570030408366, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037713623046875, "data/tokens_consumed": 115513229312, "data/tokens_consumed_B": 115.513229312, "train/loss_slope": 4.113552851466637e-06} {"step": 55090, "timestamp": 1778254090.7301455, "train/loss": 2.209357404708862, "train/z_loss": 0.0013663035701029004, "train/perplexity": 9.109860558278216, "train/grad_norm": 0.126953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022861.1762974323, "perf/iters_per_sec": 0.9645753747450982, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0367256164550782, "data/tokens_consumed": 115534200832, "data/tokens_consumed_B": 115.534200832, "train/loss_slope": 1.032587856707043e-05} {"step": 55100, "timestamp": 1778254101.0944095, "grad/layer_0/attn": 0.0027724597603082657, "grad/layer_0/mlp": 0.0029726463835686445, "grad/layer_0/attn_mlp_ratio": 0.9326570702682224, "grad/layer_4/attn": 0.0028726444579660892, "grad/layer_4/mlp": 0.0024851260241121054, "grad/layer_4/attn_mlp_ratio": 1.1559350771351444, "grad/layer_8/attn": 0.0031812568195164204, "grad/layer_8/mlp": 0.003541996469721198, "grad/layer_8/attn_mlp_ratio": 0.8981535574346498, "grad/layer_12/attn": 0.006443413440138102, "grad/layer_12/mlp": 0.006607228424400091, "grad/layer_12/attn_mlp_ratio": 0.9752066870917163, "grad/layer_16/attn": 0.0040297554805874825, "grad/layer_16/mlp": 0.004758282098919153, "grad/layer_16/attn_mlp_ratio": 0.8468929147377691, "grad/layer_20/attn": 0.0031542666256427765, "grad/layer_20/mlp": 0.006297668907791376, "grad/layer_20/attn_mlp_ratio": 0.5008625606935467, "grad/layer_24/attn": 0.009051283821463585, "grad/layer_24/mlp": 0.01087617315351963, "grad/layer_24/attn_mlp_ratio": 0.8322121770664609, "grad/layer_27/attn": 0.006091142538934946, "grad/layer_27/mlp": 0.01244454737752676, "grad/layer_27/attn_mlp_ratio": 0.4894627586848586} {"step": 55100, "timestamp": 1778254101.1085606, "train/loss": 2.17747939825058, "train/z_loss": 0.0013739099260419608, "train/perplexity": 8.824036322300474, "train/grad_norm": 0.169921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022114.708280164, "perf/iters_per_sec": 0.9642194310570545, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0371083259582519, "data/tokens_consumed": 115555172352, "data/tokens_consumed_B": 115.555172352, "train/loss_slope": 1.4830495114445252e-05} {"step": 55110, "timestamp": 1778254111.4856343, "train/loss": 2.182442545890808, "train/z_loss": 0.0013692239648662508, "train/perplexity": 8.867940177886414, "train/grad_norm": 0.1025390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021838.8052980627, "perf/iters_per_sec": 0.9640878702631296, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0372498512268067, "data/tokens_consumed": 115576143872, "data/tokens_consumed_B": 115.576143872, "train/loss_slope": 1.5078716717287138e-05} {"step": 55120, "timestamp": 1778254121.8643553, "train/loss": 2.139822220802307, "train/z_loss": 0.0013844011700712144, "train/perplexity": 8.497926739989907, "train/grad_norm": 0.1806640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022249.759000083, "perf/iters_per_sec": 0.964283828258554, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0370390653610229, "data/tokens_consumed": 115597115392, "data/tokens_consumed_B": 115.597115392, "train/loss_slope": 1.3951526351995127e-05} {"step": 55125, "timestamp": 1778254127.6209571, "eos/sharpness": 61.53209209442137, "eos/L0_probe": 1.9750462770462036, "eos/L_plus": 2.3382670879364014, "eos/L_minus": 2.2271463871002197, "eos/grad_norm": 0.1653127670288086, "eos/embed_grad_frac": 0.08640138059854507, "eos/time_s": 0.5805361270904541} {"step": 55125, "timestamp": 1778254128.9963753, "geo/rankme_last": 438.8424072265625, "geo/layer_0/stable_rank_q_proj": 19.57910919189453, "geo/layer_0/stable_rank_k_proj": 16.262651443481445, "geo/layer_0/stable_rank_o_proj": 47.6111946105957, "geo/layer_0/stable_rank_gate_proj": 132.67843627929688, "geo/layer_0/stable_rank_down_proj": 54.244815826416016, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06109030917286873, "geo/layer_0/attn_entropy_mean": 6.174077033996582, "geo/layer_0/attn_entropy_std": 0.39883583784103394, "geo/layer_7/stable_rank_q_proj": 42.98434066772461, "geo/layer_7/stable_rank_k_proj": 41.52907943725586, "geo/layer_7/stable_rank_o_proj": 93.16034698486328, "geo/layer_7/stable_rank_gate_proj": 84.66602325439453, "geo/layer_7/stable_rank_down_proj": 142.6355438232422, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4693371057510376, "geo/layer_7/attn_entropy_mean": 4.634767532348633, "geo/layer_7/attn_entropy_std": 0.8045172095298767, "geo/layer_14/stable_rank_q_proj": 52.2834358215332, "geo/layer_14/stable_rank_k_proj": 39.38704299926758, "geo/layer_14/stable_rank_o_proj": 44.57424545288086, "geo/layer_14/stable_rank_gate_proj": 72.60649871826172, "geo/layer_14/stable_rank_down_proj": 129.90773010253906, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3822747468948364, "geo/layer_14/attn_entropy_mean": 5.529295921325684, "geo/layer_14/attn_entropy_std": 0.39798787236213684, "geo/layer_21/stable_rank_q_proj": 40.71220397949219, "geo/layer_21/stable_rank_k_proj": 30.405887603759766, "geo/layer_21/stable_rank_o_proj": 72.12482452392578, "geo/layer_21/stable_rank_gate_proj": 67.72505187988281, "geo/layer_21/stable_rank_down_proj": 52.88690948486328, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1420658528804779, "geo/layer_21/attn_entropy_mean": 5.719348907470703, "geo/layer_21/attn_entropy_std": 0.3108034133911133, "geo/layer_27/stable_rank_q_proj": 43.11054611206055, "geo/layer_27/stable_rank_k_proj": 31.543725967407227, "geo/layer_27/stable_rank_o_proj": 116.00141906738281, "geo/layer_27/stable_rank_gate_proj": 81.26043701171875, "geo/layer_27/stable_rank_down_proj": 129.71949768066406, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09210306406021118, "geo/layer_27/attn_entropy_mean": 4.24705171585083, "geo/layer_27/attn_entropy_std": 0.7025367021560669, "attnres/final_alpha/block_0": 0.23707041144371033, "attnres/block_norm/0": 1.7527942657470703, "attnres/final_alpha/block_1": 0.004754720255732536, "attnres/block_norm/1": 44579.59375, "attnres/final_alpha/block_2": 0.010396858677268028, "attnres/block_norm/2": 27985.15234375, "attnres/final_alpha/block_3": 0.012122120708227158, "attnres/block_norm/3": 54813.171875, "attnres/final_alpha/block_4": 0.014289039187133312, "attnres/block_norm/4": 14558.767578125, "attnres/final_alpha/block_5": 0.6117035746574402, "attnres/block_norm/5": 6476.0205078125, "attnres/final_alpha/block_6": 0.10966326296329498, "attnres/block_norm/6": 36199.4765625, "geo/tier1_time_s": 1.3567633628845215, "geo/step": 55125.0, "geo/rankme_slope": -1.00604695003001e-06} {"step": 55130, "timestamp": 1778254134.1885111, "train/loss": 2.1671859502792357, "train/z_loss": 0.0013702776981517673, "train/perplexity": 8.733672439357962, "train/grad_norm": 0.11767578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1702554.1957003025, "perf/iters_per_sec": 0.8118411043645394, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2317681312561035, "data/tokens_consumed": 115618086912, "data/tokens_consumed_B": 115.618086912, "train/loss_slope": 1.649659999025646e-05} {"step": 55140, "timestamp": 1778254144.5722287, "train/loss": 2.200557768344879, "train/z_loss": 0.001378331333398819, "train/perplexity": 9.030048770403047, "train/grad_norm": 0.2158203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020489.3791760588, "perf/iters_per_sec": 0.9634444137459082, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037942600250244, "data/tokens_consumed": 115639058432, "data/tokens_consumed_B": 115.639058432, "train/loss_slope": 1.830979332064069e-05} {"step": 55150, "timestamp": 1778254154.9488895, "grad/layer_0/attn": 0.0029766096267849207, "grad/layer_0/mlp": 0.0031604666728526354, "grad/layer_0/attn_mlp_ratio": 0.9418259518983123, "grad/layer_4/attn": 0.0018027746118605137, "grad/layer_4/mlp": 0.002576625207439065, "grad/layer_4/attn_mlp_ratio": 0.699665025665804, "grad/layer_8/attn": 0.0036253321450203657, "grad/layer_8/mlp": 0.003572786459699273, "grad/layer_8/attn_mlp_ratio": 1.0147071716832465, "grad/layer_12/attn": 0.005310378037393093, "grad/layer_12/mlp": 0.006746265571564436, "grad/layer_12/attn_mlp_ratio": 0.7871581547368316, "grad/layer_16/attn": 0.003847701707854867, "grad/layer_16/mlp": 0.004654772114008665, "grad/layer_16/attn_mlp_ratio": 0.8266143929181117, "grad/layer_20/attn": 0.0033889696933329105, "grad/layer_20/mlp": 0.006232311949133873, "grad/layer_20/attn_mlp_ratio": 0.5437740707806645, "grad/layer_24/attn": 0.014117822051048279, "grad/layer_24/mlp": 0.010407594963908195, "grad/layer_24/attn_mlp_ratio": 1.35649225054946, "grad/layer_27/attn": 0.005328478757292032, "grad/layer_27/mlp": 0.00942987110465765, "grad/layer_27/attn_mlp_ratio": 0.5650637894884676} {"step": 55150, "timestamp": 1778254154.9629884, "train/loss": 2.1497374057769774, "train/z_loss": 0.001371656300034374, "train/perplexity": 8.582604358918992, "train/grad_norm": 0.1474609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019580.1279017152, "perf/iters_per_sec": 0.9630108489521576, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0384099006652832, "data/tokens_consumed": 115660029952, "data/tokens_consumed_B": 115.660029952, "train/loss_slope": 1.9647879890947805e-05} {"step": 55160, "timestamp": 1778254165.3422372, "train/loss": 2.145685875415802, "train/z_loss": 0.0013878525816835463, "train/perplexity": 8.547902023034192, "train/grad_norm": 0.19921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022034.2907752949, "perf/iters_per_sec": 0.9641810850025629, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0371495723724364, "data/tokens_consumed": 115681001472, "data/tokens_consumed_B": 115.681001472, "train/loss_slope": 1.981978410004929e-05} {"step": 55170, "timestamp": 1778254175.7239249, "train/loss": 2.1808099269866945, "train/z_loss": 0.0013767050229944288, "train/perplexity": 8.853474023183495, "train/grad_norm": 0.1044921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020959.5417662852, "perf/iters_per_sec": 0.9636686047393251, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03770112991333, "data/tokens_consumed": 115701972992, "data/tokens_consumed_B": 115.701972992, "train/loss_slope": 2.2531542833810713e-05} {"step": 55180, "timestamp": 1778254186.1051269, "train/loss": 2.108654427528381, "train/z_loss": 0.0013994718319736421, "train/perplexity": 8.237150141856429, "train/grad_norm": 0.140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021571.8051466248, "perf/iters_per_sec": 0.9639605546696781, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373868465423584, "data/tokens_consumed": 115722944512, "data/tokens_consumed_B": 115.722944512, "train/loss_slope": 2.1520861268866173e-05} {"step": 55190, "timestamp": 1778254196.4842427, "train/loss": 2.148923707008362, "train/z_loss": 0.001369665318634361, "train/perplexity": 8.575623544845705, "train/grad_norm": 0.28515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021954.065580683, "perf/iters_per_sec": 0.964142830648748, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0371907234191895, "data/tokens_consumed": 115743916032, "data/tokens_consumed_B": 115.743916032, "train/loss_slope": 2.2823808581629548e-05} {"step": 55200, "timestamp": 1778254206.8566933, "grad/layer_0/attn": 0.003021802520379424, "grad/layer_0/mlp": 0.002955317497253418, "grad/layer_0/attn_mlp_ratio": 1.0224967100618205, "grad/layer_4/attn": 0.002231253543868661, "grad/layer_4/mlp": 0.002610850613564253, "grad/layer_4/attn_mlp_ratio": 0.8546078610609729, "grad/layer_8/attn": 0.0042340406216681, "grad/layer_8/mlp": 0.0037561776116490364, "grad/layer_8/attn_mlp_ratio": 1.1272205275424183, "grad/layer_12/attn": 0.004712333437055349, "grad/layer_12/mlp": 0.00660895137116313, "grad/layer_12/attn_mlp_ratio": 0.7130228535671198, "grad/layer_16/attn": 0.006018893793225288, "grad/layer_16/mlp": 0.004494519904255867, "grad/layer_16/attn_mlp_ratio": 1.3391627554279413, "grad/layer_20/attn": 0.0033665141090750694, "grad/layer_20/mlp": 0.005744229536503553, "grad/layer_20/attn_mlp_ratio": 0.5860688590305433, "grad/layer_24/attn": 0.0084272725507617, "grad/layer_24/mlp": 0.009920586831867695, "grad/layer_24/attn_mlp_ratio": 0.8494731822459968, "grad/layer_27/attn": 0.0063461558893322945, "grad/layer_27/mlp": 0.008823301643133163, "grad/layer_27/attn_mlp_ratio": 0.7192495591881196} {"step": 55200, "timestamp": 1778254207.438743, "eos/sharpness": 11.681675910949705, "eos/L0_probe": 1.97689688205719, "eos/L_plus": 2.040576457977295, "eos/L_minus": 2.030034065246582, "eos/grad_norm": 0.11265460401773453, "eos/embed_grad_frac": 0.19151180982589722, "eos/time_s": 0.5793802738189697} {"step": 55200, "timestamp": 1778254207.458408, "train/loss": 2.114681911468506, "train/z_loss": 0.0013709362014196814, "train/perplexity": 8.28694936328317, "train/grad_norm": 0.11279296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1911886.6540133394, "perf/iters_per_sec": 0.911658598906202, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0969018459320068, "data/tokens_consumed": 115764887552, "data/tokens_consumed_B": 115.764887552, "train/loss_slope": 2.060618222457247e-05} {"step": 55200, "timestamp": 1778254208.8215406, "geo/rankme_last": 439.0556640625, "geo/layer_0/stable_rank_q_proj": 19.59873390197754, "geo/layer_0/stable_rank_k_proj": 16.29297637939453, "geo/layer_0/stable_rank_o_proj": 47.56134033203125, "geo/layer_0/stable_rank_gate_proj": 132.96351623535156, "geo/layer_0/stable_rank_down_proj": 54.31896209716797, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06097035855054855, "geo/layer_0/attn_entropy_mean": 6.1714324951171875, "geo/layer_0/attn_entropy_std": 0.40069711208343506, "geo/layer_7/stable_rank_q_proj": 43.0046272277832, "geo/layer_7/stable_rank_k_proj": 41.58662033081055, "geo/layer_7/stable_rank_o_proj": 93.3077163696289, "geo/layer_7/stable_rank_gate_proj": 84.84640502929688, "geo/layer_7/stable_rank_down_proj": 142.52780151367188, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4696510434150696, "geo/layer_7/attn_entropy_mean": 4.631949424743652, "geo/layer_7/attn_entropy_std": 0.8079197406768799, "geo/layer_14/stable_rank_q_proj": 52.281185150146484, "geo/layer_14/stable_rank_k_proj": 39.40777587890625, "geo/layer_14/stable_rank_o_proj": 44.5860481262207, "geo/layer_14/stable_rank_gate_proj": 72.57124328613281, "geo/layer_14/stable_rank_down_proj": 129.70465087890625, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3925476372241974, "geo/layer_14/attn_entropy_mean": 5.502671241760254, "geo/layer_14/attn_entropy_std": 0.3830018937587738, "geo/layer_21/stable_rank_q_proj": 40.83726501464844, "geo/layer_21/stable_rank_k_proj": 30.355571746826172, "geo/layer_21/stable_rank_o_proj": 72.12419891357422, "geo/layer_21/stable_rank_gate_proj": 67.7488784790039, "geo/layer_21/stable_rank_down_proj": 52.90358352661133, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14721111953258514, "geo/layer_21/attn_entropy_mean": 5.705438613891602, "geo/layer_21/attn_entropy_std": 0.2899700999259949, "geo/layer_27/stable_rank_q_proj": 43.16231918334961, "geo/layer_27/stable_rank_k_proj": 31.524066925048828, "geo/layer_27/stable_rank_o_proj": 116.16165924072266, "geo/layer_27/stable_rank_gate_proj": 81.21074676513672, "geo/layer_27/stable_rank_down_proj": 129.6475830078125, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08435501158237457, "geo/layer_27/attn_entropy_mean": 4.228640079498291, "geo/layer_27/attn_entropy_std": 0.7006722688674927, "attnres/final_alpha/block_0": 0.23873764276504517, "attnres/block_norm/0": 1.7528873682022095, "attnres/final_alpha/block_1": 0.004804943688213825, "attnres/block_norm/1": 44854.8984375, "attnres/final_alpha/block_2": 0.010481656529009342, "attnres/block_norm/2": 28131.916015625, "attnres/final_alpha/block_3": 0.01217108964920044, "attnres/block_norm/3": 55050.390625, "attnres/final_alpha/block_4": 0.014320729300379753, "attnres/block_norm/4": 14514.9375, "attnres/final_alpha/block_5": 0.6080746650695801, "attnres/block_norm/5": 6455.19482421875, "attnres/final_alpha/block_6": 0.1114092692732811, "attnres/block_norm/6": 36279.7109375, "geo/tier1_time_s": 1.3596892356872559, "geo/step": 55200.0, "geo/rankme_slope": -3.8474374124649957e-07} {"step": 55210, "timestamp": 1778254219.697955, "train/loss": 2.1068447828292847, "train/z_loss": 0.0013838313985615968, "train/perplexity": 8.222257306201378, "train/grad_norm": 0.1552734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1713899.2018543896, "perf/iters_per_sec": 0.8172508248588513, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.223614549636841, "data/tokens_consumed": 115785859072, "data/tokens_consumed_B": 115.785859072, "train/loss_slope": 1.5814483126398044e-05} {"step": 55220, "timestamp": 1778254230.0584521, "train/loss": 2.12172589302063, "train/z_loss": 0.0013802102534100414, "train/perplexity": 8.345528554880584, "train/grad_norm": 0.11328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025366.0885802773, "perf/iters_per_sec": 0.9657698099995982, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354434251785278, "data/tokens_consumed": 115806830592, "data/tokens_consumed_B": 115.806830592, "train/loss_slope": 1.530402597278017e-05} {"step": 55230, "timestamp": 1778254240.4183679, "train/loss": 2.183806800842285, "train/z_loss": 0.0013727253302931786, "train/perplexity": 8.880046565405364, "train/grad_norm": 0.12353515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025581.0547187573, "perf/iters_per_sec": 0.9658723138421809, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0353335380554198, "data/tokens_consumed": 115827802112, "data/tokens_consumed_B": 115.827802112, "train/loss_slope": 1.545947269745293e-05} {"step": 55240, "timestamp": 1778254250.7726088, "train/loss": 2.189192271232605, "train/z_loss": 0.0013639971613883973, "train/perplexity": 8.927998800020188, "train/grad_norm": 0.1015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027067.7565600332, "perf/iters_per_sec": 0.9665812285232702, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345741987228394, "data/tokens_consumed": 115848773632, "data/tokens_consumed_B": 115.848773632, "train/loss_slope": 1.8887649503800095e-05} {"step": 55250, "timestamp": 1778254261.1067157, "grad/layer_0/attn": 0.002546880627050996, "grad/layer_0/mlp": 0.0027549592778086662, "grad/layer_0/attn_mlp_ratio": 0.9244712091097405, "grad/layer_4/attn": 0.0018257341580465436, "grad/layer_4/mlp": 0.002561841392889619, "grad/layer_4/attn_mlp_ratio": 0.7126647620915898, "grad/layer_8/attn": 0.01300803478807211, "grad/layer_8/mlp": 0.0035519907251000404, "grad/layer_8/attn_mlp_ratio": 3.6621814156025425, "grad/layer_12/attn": 0.004669716581702232, "grad/layer_12/mlp": 0.0064442879520356655, "grad/layer_12/attn_mlp_ratio": 0.724628778849687, "grad/layer_16/attn": 0.008561809547245502, "grad/layer_16/mlp": 0.004362814594060183, "grad/layer_16/attn_mlp_ratio": 1.9624508826611644, "grad/layer_20/attn": 0.0028356555849313736, "grad/layer_20/mlp": 0.005742180626839399, "grad/layer_20/attn_mlp_ratio": 0.4938290380999851, "grad/layer_24/attn": 0.009809765964746475, "grad/layer_24/mlp": 0.009222173132002354, "grad/layer_24/attn_mlp_ratio": 1.063715213102383, "grad/layer_27/attn": 0.008456723764538765, "grad/layer_27/mlp": 0.008840925060212612, "grad/layer_27/attn_mlp_ratio": 0.9565428517138802} {"step": 55250, "timestamp": 1778254261.1212041, "train/loss": 2.12394495010376, "train/z_loss": 0.001380292954854667, "train/perplexity": 8.36406832192563, "train/grad_norm": 0.125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027692.5614917316, "perf/iters_per_sec": 0.9668791587313326, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342554092407226, "data/tokens_consumed": 115869745152, "data/tokens_consumed_B": 115.869745152, "train/loss_slope": 1.574335809301916e-05} {"step": 55260, "timestamp": 1778254271.4625595, "train/loss": 2.1802660465240478, "train/z_loss": 0.0013980170944705606, "train/perplexity": 8.848660100853552, "train/grad_norm": 0.12451171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029381.3807307153, "perf/iters_per_sec": 0.9676844504979684, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033394718170166, "data/tokens_consumed": 115890716672, "data/tokens_consumed_B": 115.890716672, "train/loss_slope": 1.6844567013616716e-05} {"step": 55270, "timestamp": 1778254281.812184, "train/loss": 2.1785255312919616, "train/z_loss": 0.0013787226169370114, "train/perplexity": 8.833272268427475, "train/grad_norm": 0.09375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027842.242874928, "perf/iters_per_sec": 0.9669505323767319, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341790676116944, "data/tokens_consumed": 115911688192, "data/tokens_consumed_B": 115.911688192, "train/loss_slope": 1.5344127827566295e-05} {"step": 55275, "timestamp": 1778254287.5726137, "eos/sharpness": 82.25307464599608, "eos/L0_probe": 1.97419011592865, "eos/L_plus": 2.482508420944214, "eos/L_minus": 2.288402557373047, "eos/grad_norm": 0.25452178716659546, "eos/embed_grad_frac": 0.031094692647457123, "eos/time_s": 0.5983636379241943} {"step": 55275, "timestamp": 1778254288.9488692, "geo/rankme_last": 438.8848571777344, "geo/layer_0/stable_rank_q_proj": 19.60219383239746, "geo/layer_0/stable_rank_k_proj": 16.255414962768555, "geo/layer_0/stable_rank_o_proj": 47.55733871459961, "geo/layer_0/stable_rank_gate_proj": 133.1394500732422, "geo/layer_0/stable_rank_down_proj": 54.21067428588867, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06459449231624603, "geo/layer_0/attn_entropy_mean": 6.1648359298706055, "geo/layer_0/attn_entropy_std": 0.4034121632575989, "geo/layer_7/stable_rank_q_proj": 42.92537307739258, "geo/layer_7/stable_rank_k_proj": 41.723819732666016, "geo/layer_7/stable_rank_o_proj": 93.1766586303711, "geo/layer_7/stable_rank_gate_proj": 84.80909729003906, "geo/layer_7/stable_rank_down_proj": 142.67689514160156, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.464960515499115, "geo/layer_7/attn_entropy_mean": 4.643156051635742, "geo/layer_7/attn_entropy_std": 0.8060348629951477, "geo/layer_14/stable_rank_q_proj": 52.28748321533203, "geo/layer_14/stable_rank_k_proj": 39.46678161621094, "geo/layer_14/stable_rank_o_proj": 44.5568962097168, "geo/layer_14/stable_rank_gate_proj": 72.33407592773438, "geo/layer_14/stable_rank_down_proj": 129.5795440673828, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38850030303001404, "geo/layer_14/attn_entropy_mean": 5.515407562255859, "geo/layer_14/attn_entropy_std": 0.3834345042705536, "geo/layer_21/stable_rank_q_proj": 40.770450592041016, "geo/layer_21/stable_rank_k_proj": 30.37529754638672, "geo/layer_21/stable_rank_o_proj": 72.18133544921875, "geo/layer_21/stable_rank_gate_proj": 67.87169647216797, "geo/layer_21/stable_rank_down_proj": 52.92252731323242, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14471566677093506, "geo/layer_21/attn_entropy_mean": 5.6957106590271, "geo/layer_21/attn_entropy_std": 0.29414430260658264, "geo/layer_27/stable_rank_q_proj": 43.29723358154297, "geo/layer_27/stable_rank_k_proj": 31.561460494995117, "geo/layer_27/stable_rank_o_proj": 115.98915100097656, "geo/layer_27/stable_rank_gate_proj": 81.22355651855469, "geo/layer_27/stable_rank_down_proj": 129.583984375, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0973585993051529, "geo/layer_27/attn_entropy_mean": 4.219549179077148, "geo/layer_27/attn_entropy_std": 0.6988124251365662, "attnres/final_alpha/block_0": 0.236552894115448, "attnres/block_norm/0": 1.7531852722167969, "attnres/final_alpha/block_1": 0.004671011120080948, "attnres/block_norm/1": 44746.3359375, "attnres/final_alpha/block_2": 0.01040519867092371, "attnres/block_norm/2": 27884.44140625, "attnres/final_alpha/block_3": 0.01214640587568283, "attnres/block_norm/3": 54962.890625, "attnres/final_alpha/block_4": 0.013963114470243454, "attnres/block_norm/4": 14520.4267578125, "attnres/final_alpha/block_5": 0.6123868823051453, "attnres/block_norm/5": 6451.5830078125, "attnres/final_alpha/block_6": 0.10987450927495956, "attnres/block_norm/6": 36377.171875, "geo/tier1_time_s": 1.3585844039916992, "geo/step": 55275.0, "geo/rankme_slope": -1.8620905393407363e-05} {"step": 55280, "timestamp": 1778254294.1290147, "train/loss": 2.123159408569336, "train/z_loss": 0.0013879722682759166, "train/perplexity": 8.35750057881723, "train/grad_norm": 0.1748046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1703365.8843720758, "perf/iters_per_sec": 0.8122281476841334, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2311811685562133, "data/tokens_consumed": 115932659712, "data/tokens_consumed_B": 115.932659712, "train/loss_slope": 1.4467934899740798e-05} {"step": 55290, "timestamp": 1778254304.4726746, "train/loss": 2.1237813115119932, "train/z_loss": 0.001376645581331104, "train/perplexity": 8.362699749542672, "train/grad_norm": 0.0888671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028856.1888796238, "perf/iters_per_sec": 0.9674340195081824, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0336622238159179, "data/tokens_consumed": 115953631232, "data/tokens_consumed_B": 115.953631232, "train/loss_slope": 1.1285779514078169e-05} {"step": 55300, "timestamp": 1778254314.8190582, "grad/layer_0/attn": 0.0028189229778945446, "grad/layer_0/mlp": 0.003089427249506116, "grad/layer_0/attn_mlp_ratio": 0.9124419055671223, "grad/layer_4/attn": 0.003199569648131728, "grad/layer_4/mlp": 0.0028581381775438786, "grad/layer_4/attn_mlp_ratio": 1.1194593604061929, "grad/layer_8/attn": 0.005196292418986559, "grad/layer_8/mlp": 0.003843520535156131, "grad/layer_8/attn_mlp_ratio": 1.3519616289962955, "grad/layer_12/attn": 0.006441475357860327, "grad/layer_12/mlp": 0.006741810590028763, "grad/layer_12/attn_mlp_ratio": 0.9554518294895707, "grad/layer_16/attn": 0.00541685800999403, "grad/layer_16/mlp": 0.004530222155153751, "grad/layer_16/attn_mlp_ratio": 1.1957157298036778, "grad/layer_20/attn": 0.004572410136461258, "grad/layer_20/mlp": 0.007899405434727669, "grad/layer_20/attn_mlp_ratio": 0.5788296494413223, "grad/layer_24/attn": 0.01054872665554285, "grad/layer_24/mlp": 0.008602211251854897, "grad/layer_24/attn_mlp_ratio": 1.2262808043269278, "grad/layer_27/attn": 0.0035002068616449833, "grad/layer_27/mlp": 0.007815011776983738, "grad/layer_27/attn_mlp_ratio": 0.4478824750034691} {"step": 55300, "timestamp": 1778254314.833837, "train/loss": 2.1212700486183165, "train/z_loss": 0.0013844194472767413, "train/perplexity": 8.341725159348636, "train/grad_norm": 0.154296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025487.0684652291, "perf/iters_per_sec": 0.9658274977041383, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0353815793991088, "data/tokens_consumed": 115974602752, "data/tokens_consumed_B": 115.974602752, "train/loss_slope": 8.76955564695191e-06} {"step": 55310, "timestamp": 1778254325.174091, "train/loss": 2.2032180070877074, "train/z_loss": 0.0013601232320070266, "train/perplexity": 9.05410283658547, "train/grad_norm": 0.1650390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029170.0074699703, "perf/iters_per_sec": 0.9675836598729946, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0335023641586303, "data/tokens_consumed": 115995574272, "data/tokens_consumed_B": 115.995574272, "train/loss_slope": 1.0169390154213967e-05} {"step": 55320, "timestamp": 1778254335.5295928, "train/loss": 2.1308997631073, "train/z_loss": 0.0013881956692785025, "train/perplexity": 8.422441605398951, "train/grad_norm": 0.10205078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026609.7362334037, "perf/iters_per_sec": 0.9663628274123209, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348080158233643, "data/tokens_consumed": 116016545792, "data/tokens_consumed_B": 116.016545792, "train/loss_slope": 5.2813413727580745e-06} {"step": 55330, "timestamp": 1778254345.8713624, "train/loss": 2.1583728313446047, "train/z_loss": 0.0013754834420979024, "train/perplexity": 8.657039728024808, "train/grad_norm": 0.248046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029318.2216263835, "perf/iters_per_sec": 0.9676543338901441, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0334268808364868, "data/tokens_consumed": 116037517312, "data/tokens_consumed_B": 116.037517312, "train/loss_slope": 4.566525733403009e-06} {"step": 55340, "timestamp": 1778254356.2284384, "train/loss": 2.164684844017029, "train/z_loss": 0.001376685977447778, "train/perplexity": 8.711855890653956, "train/grad_norm": 0.1171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025997.871068204, "perf/iters_per_sec": 0.9660710673657437, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351205348968506, "data/tokens_consumed": 116058488832, "data/tokens_consumed_B": 116.058488832, "train/loss_slope": 2.204022069897316e-06} {"step": 55350, "timestamp": 1778254366.5744753, "grad/layer_0/attn": 0.002962003229185939, "grad/layer_0/mlp": 0.0033355846535414457, "grad/layer_0/attn_mlp_ratio": 0.8880011896088477, "grad/layer_4/attn": 0.0023693193215876818, "grad/layer_4/mlp": 0.0024567937944084406, "grad/layer_4/attn_mlp_ratio": 0.964394826517663, "grad/layer_8/attn": 0.006427299696952105, "grad/layer_8/mlp": 0.0037472345866262913, "grad/layer_8/attn_mlp_ratio": 1.7152114117353905, "grad/layer_12/attn": 0.004530579317361116, "grad/layer_12/mlp": 0.006308880168944597, "grad/layer_12/attn_mlp_ratio": 0.7181273259635074, "grad/layer_16/attn": 0.0038688199128955603, "grad/layer_16/mlp": 0.004580996464937925, "grad/layer_16/attn_mlp_ratio": 0.8445367417445302, "grad/layer_20/attn": 0.0034720799885690212, "grad/layer_20/mlp": 0.006087832152843475, "grad/layer_20/attn_mlp_ratio": 0.5703310873829185, "grad/layer_24/attn": 0.006266022566705942, "grad/layer_24/mlp": 0.00817134976387024, "grad/layer_24/attn_mlp_ratio": 0.7668283296020975, "grad/layer_27/attn": 0.004238996189087629, "grad/layer_27/mlp": 0.007625953294336796, "grad/layer_27/attn_mlp_ratio": 0.5558644237500328} {"step": 55350, "timestamp": 1778254367.1646726, "eos/sharpness": 31.949138641357415, "eos/L0_probe": 1.9771666526794434, "eos/L_plus": 2.1571860313415527, "eos/L_minus": 2.116638660430908, "eos/grad_norm": 0.10821174085140228, "eos/embed_grad_frac": 0.19585031270980835, "eos/time_s": 0.5874199867248535} {"step": 55350, "timestamp": 1778254367.1844196, "train/loss": 2.1626327991485597, "train/z_loss": 0.0013688528328202666, "train/perplexity": 8.693997101264038, "train/grad_norm": 0.1083984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1915389.0458299452, "perf/iters_per_sec": 0.9133286694669462, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.094896101951599, "data/tokens_consumed": 116079460352, "data/tokens_consumed_B": 116.079460352, "train/loss_slope": 2.5258936468560427e-06} {"step": 55350, "timestamp": 1778254368.5484784, "geo/rankme_last": 438.5821533203125, "geo/layer_0/stable_rank_q_proj": 19.60433006286621, "geo/layer_0/stable_rank_k_proj": 16.23357582092285, "geo/layer_0/stable_rank_o_proj": 47.52867889404297, "geo/layer_0/stable_rank_gate_proj": 133.02005004882812, "geo/layer_0/stable_rank_down_proj": 54.17694854736328, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.057916056364774704, "geo/layer_0/attn_entropy_mean": 6.160758018493652, "geo/layer_0/attn_entropy_std": 0.40968748927116394, "geo/layer_7/stable_rank_q_proj": 42.96635055541992, "geo/layer_7/stable_rank_k_proj": 41.69326400756836, "geo/layer_7/stable_rank_o_proj": 93.1429214477539, "geo/layer_7/stable_rank_gate_proj": 84.68633270263672, "geo/layer_7/stable_rank_down_proj": 142.70867919921875, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4700104594230652, "geo/layer_7/attn_entropy_mean": 4.6518754959106445, "geo/layer_7/attn_entropy_std": 0.8000310063362122, "geo/layer_14/stable_rank_q_proj": 52.21608352661133, "geo/layer_14/stable_rank_k_proj": 39.52546310424805, "geo/layer_14/stable_rank_o_proj": 44.60430145263672, "geo/layer_14/stable_rank_gate_proj": 72.18477630615234, "geo/layer_14/stable_rank_down_proj": 130.14759826660156, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38309043645858765, "geo/layer_14/attn_entropy_mean": 5.5397539138793945, "geo/layer_14/attn_entropy_std": 0.38617774844169617, "geo/layer_21/stable_rank_q_proj": 40.755126953125, "geo/layer_21/stable_rank_k_proj": 30.31716537475586, "geo/layer_21/stable_rank_o_proj": 72.1790542602539, "geo/layer_21/stable_rank_gate_proj": 67.7879867553711, "geo/layer_21/stable_rank_down_proj": 52.856651306152344, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14498627185821533, "geo/layer_21/attn_entropy_mean": 5.7051239013671875, "geo/layer_21/attn_entropy_std": 0.29785746335983276, "geo/layer_27/stable_rank_q_proj": 43.3189697265625, "geo/layer_27/stable_rank_k_proj": 31.537029266357422, "geo/layer_27/stable_rank_o_proj": 115.861572265625, "geo/layer_27/stable_rank_gate_proj": 81.2212905883789, "geo/layer_27/stable_rank_down_proj": 129.5440216064453, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08920513838529587, "geo/layer_27/attn_entropy_mean": 4.239798545837402, "geo/layer_27/attn_entropy_std": 0.6938713192939758, "attnres/final_alpha/block_0": 0.2376551628112793, "attnres/block_norm/0": 1.7533347606658936, "attnres/final_alpha/block_1": 0.00473751500248909, "attnres/block_norm/1": 44808.3046875, "attnres/final_alpha/block_2": 0.010578076355159283, "attnres/block_norm/2": 27967.232421875, "attnres/final_alpha/block_3": 0.012409779243171215, "attnres/block_norm/3": 54748.8203125, "attnres/final_alpha/block_4": 0.014191437512636185, "attnres/block_norm/4": 14544.970703125, "attnres/final_alpha/block_5": 0.6084287166595459, "attnres/block_norm/5": 6455.28076171875, "attnres/final_alpha/block_6": 0.11199935525655746, "attnres/block_norm/6": 36268.609375, "geo/tier1_time_s": 1.3599905967712402, "geo/step": 55350.0, "geo/rankme_slope": -3.260765243597439e-05} {"step": 55360, "timestamp": 1778254378.9250205, "train/loss": 2.152054452896118, "train/z_loss": 0.0013775553554296494, "train/perplexity": 8.602513714173378, "train/grad_norm": 0.1044921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1786815.360162081, "perf/iters_per_sec": 0.85201995857338, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1736814260482789, "data/tokens_consumed": 116100431872, "data/tokens_consumed_B": 116.100431872, "train/loss_slope": 3.310721713860746e-06} {"step": 55370, "timestamp": 1778254389.3006191, "train/loss": 2.1848363876342773, "train/z_loss": 0.0013686193386092782, "train/perplexity": 8.88919405231917, "train/grad_norm": 0.11767578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022114.987195936, "perf/iters_per_sec": 0.9642195640544586, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0371081829071045, "data/tokens_consumed": 116121403392, "data/tokens_consumed_B": 116.121403392, "train/loss_slope": 8.754965789032743e-06} {"step": 55380, "timestamp": 1778254399.6819408, "train/loss": 2.1779327630996703, "train/z_loss": 0.001360452699009329, "train/perplexity": 8.82803773717799, "train/grad_norm": 0.12890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021625.7942675978, "perf/iters_per_sec": 0.9639862986886968, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373591423034667, "data/tokens_consumed": 116142374912, "data/tokens_consumed_B": 116.142374912, "train/loss_slope": 8.237727089683672e-06} {"step": 55390, "timestamp": 1778254410.068683, "train/loss": 2.1764775037765505, "train/z_loss": 0.0013616285752505064, "train/perplexity": 8.815199996342752, "train/grad_norm": 0.259765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019992.2977029306, "perf/iters_per_sec": 0.9632073868288663, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0381980180740356, "data/tokens_consumed": 116163346432, "data/tokens_consumed_B": 116.163346432, "train/loss_slope": 7.514844990835369e-06} {"step": 55400, "timestamp": 1778254420.4378505, "grad/layer_0/attn": 0.0027922417502850294, "grad/layer_0/mlp": 0.003038648283109069, "grad/layer_0/attn_mlp_ratio": 0.9189091326940821, "grad/layer_4/attn": 0.0022024111822247505, "grad/layer_4/mlp": 0.002578764222562313, "grad/layer_4/attn_mlp_ratio": 0.8540567910589013, "grad/layer_8/attn": 0.0032161716371774673, "grad/layer_8/mlp": 0.003609417239204049, "grad/layer_8/attn_mlp_ratio": 0.8910500878478905, "grad/layer_12/attn": 0.005360373295843601, "grad/layer_12/mlp": 0.006839904002845287, "grad/layer_12/attn_mlp_ratio": 0.7836912938024635, "grad/layer_16/attn": 0.006684594321995974, "grad/layer_16/mlp": 0.0047937799245119095, "grad/layer_16/attn_mlp_ratio": 1.3944307598212295, "grad/layer_20/attn": 0.004660453647375107, "grad/layer_20/mlp": 0.007244552485644817, "grad/layer_20/attn_mlp_ratio": 0.6433045508717627, "grad/layer_24/attn": 0.011640356853604317, "grad/layer_24/mlp": 0.012733192183077335, "grad/layer_24/attn_mlp_ratio": 0.9141742773392798, "grad/layer_27/attn": 0.010128890164196491, "grad/layer_27/mlp": 0.012992232106626034, "grad/layer_27/attn_mlp_ratio": 0.7796112325509977} {"step": 55400, "timestamp": 1778254420.4520686, "train/loss": 2.1945140600204467, "train/z_loss": 0.0013760641915723682, "train/perplexity": 8.975638375375945, "train/grad_norm": 0.1669921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020703.7290980786, "perf/iters_per_sec": 0.9635466237535851, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037832498550415, "data/tokens_consumed": 116184317952, "data/tokens_consumed_B": 116.184317952, "train/loss_slope": 9.109275526303774e-06} {"step": 55410, "timestamp": 1778254430.8280406, "train/loss": 2.145783805847168, "train/z_loss": 0.0013733578496612608, "train/perplexity": 8.54873916375669, "train/grad_norm": 0.11376953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022759.1159467942, "perf/iters_per_sec": 0.9645267085775348, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036777925491333, "data/tokens_consumed": 116205289472, "data/tokens_consumed_B": 116.205289472, "train/loss_slope": 9.561111424157465e-06} {"step": 55420, "timestamp": 1778254441.2018542, "train/loss": 2.1276241898536683, "train/z_loss": 0.001378407923039049, "train/perplexity": 8.39489841544098, "train/grad_norm": 0.1162109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022516.4732311992, "perf/iters_per_sec": 0.9644110075145718, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0369023084640503, "data/tokens_consumed": 116226260992, "data/tokens_consumed_B": 116.226260992, "train/loss_slope": 6.5797052546518135e-06} {"step": 55425, "timestamp": 1778254446.9636407, "eos/sharpness": 26.095557212829586, "eos/L0_probe": 1.9757664203643799, "eos/L_plus": 2.088371992111206, "eos/L_minus": 2.1241164207458496, "eos/grad_norm": 0.10027948021888733, "eos/embed_grad_frac": 0.24098491668701172, "eos/time_s": 0.5839173793792725} {"step": 55425, "timestamp": 1778254448.3396945, "geo/rankme_last": 438.9539489746094, "geo/layer_0/stable_rank_q_proj": 19.63511085510254, "geo/layer_0/stable_rank_k_proj": 16.271089553833008, "geo/layer_0/stable_rank_o_proj": 47.4802360534668, "geo/layer_0/stable_rank_gate_proj": 132.99276733398438, "geo/layer_0/stable_rank_down_proj": 54.291656494140625, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06453293561935425, "geo/layer_0/attn_entropy_mean": 6.165243625640869, "geo/layer_0/attn_entropy_std": 0.40200990438461304, "geo/layer_7/stable_rank_q_proj": 42.99260330200195, "geo/layer_7/stable_rank_k_proj": 41.713008880615234, "geo/layer_7/stable_rank_o_proj": 93.22310638427734, "geo/layer_7/stable_rank_gate_proj": 84.64907836914062, "geo/layer_7/stable_rank_down_proj": 142.3856201171875, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.46342042088508606, "geo/layer_7/attn_entropy_mean": 4.662722110748291, "geo/layer_7/attn_entropy_std": 0.7968990206718445, "geo/layer_14/stable_rank_q_proj": 52.28245162963867, "geo/layer_14/stable_rank_k_proj": 39.4483528137207, "geo/layer_14/stable_rank_o_proj": 44.56298828125, "geo/layer_14/stable_rank_gate_proj": 72.16251373291016, "geo/layer_14/stable_rank_down_proj": 130.08607482910156, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3841935396194458, "geo/layer_14/attn_entropy_mean": 5.524590015411377, "geo/layer_14/attn_entropy_std": 0.3891441822052002, "geo/layer_21/stable_rank_q_proj": 40.79566955566406, "geo/layer_21/stable_rank_k_proj": 30.361900329589844, "geo/layer_21/stable_rank_o_proj": 72.23814392089844, "geo/layer_21/stable_rank_gate_proj": 67.84349822998047, "geo/layer_21/stable_rank_down_proj": 52.772361755371094, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14228512346744537, "geo/layer_21/attn_entropy_mean": 5.686074256896973, "geo/layer_21/attn_entropy_std": 0.2969295084476471, "geo/layer_27/stable_rank_q_proj": 43.232398986816406, "geo/layer_27/stable_rank_k_proj": 31.491870880126953, "geo/layer_27/stable_rank_o_proj": 115.76934814453125, "geo/layer_27/stable_rank_gate_proj": 81.24790954589844, "geo/layer_27/stable_rank_down_proj": 129.3568572998047, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09231597185134888, "geo/layer_27/attn_entropy_mean": 4.245877265930176, "geo/layer_27/attn_entropy_std": 0.7133747339248657, "attnres/final_alpha/block_0": 0.23715490102767944, "attnres/block_norm/0": 1.753272533416748, "attnres/final_alpha/block_1": 0.0046340422704815865, "attnres/block_norm/1": 44944.671875, "attnres/final_alpha/block_2": 0.010479294694960117, "attnres/block_norm/2": 28111.572265625, "attnres/final_alpha/block_3": 0.012194513343274593, "attnres/block_norm/3": 55411.875, "attnres/final_alpha/block_4": 0.014195894822478294, "attnres/block_norm/4": 14488.51171875, "attnres/final_alpha/block_5": 0.6113214492797852, "attnres/block_norm/5": 6442.32958984375, "attnres/final_alpha/block_6": 0.11001988500356674, "attnres/block_norm/6": 36517.6015625, "geo/tier1_time_s": 1.3566265106201172, "geo/step": 55425.0, "geo/rankme_slope": -3.956856179971989e-05} {"step": 55430, "timestamp": 1778254453.5302029, "train/loss": 2.1692732334136964, "train/z_loss": 0.0013744855532422662, "train/perplexity": 8.751921125004012, "train/grad_norm": 0.19921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1701841.3632479673, "perf/iters_per_sec": 0.8115011993636929, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2322840690612793, "data/tokens_consumed": 116247232512, "data/tokens_consumed_B": 116.247232512, "train/loss_slope": 8.28233713006394e-06} {"step": 55440, "timestamp": 1778254463.9038012, "train/loss": 2.157502567768097, "train/z_loss": 0.0013892400078475476, "train/perplexity": 8.649509098960502, "train/grad_norm": 0.107421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022467.2727416118, "perf/iters_per_sec": 0.9643875468929347, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0369275331497192, "data/tokens_consumed": 116268204032, "data/tokens_consumed_B": 116.268204032, "train/loss_slope": 1.0628421822361138e-05} {"step": 55450, "timestamp": 1778254474.659269, "grad/layer_0/attn": 0.002801193855702877, "grad/layer_0/mlp": 0.00299302046187222, "grad/layer_0/attn_mlp_ratio": 0.9359086574235395, "grad/layer_4/attn": 0.0022402547765523195, "grad/layer_4/mlp": 0.002492237836122513, "grad/layer_4/attn_mlp_ratio": 0.8988928160036617, "grad/layer_8/attn": 0.0063940538093447685, "grad/layer_8/mlp": 0.0035892119631171227, "grad/layer_8/attn_mlp_ratio": 1.7814644821492451, "grad/layer_12/attn": 0.005020713899284601, "grad/layer_12/mlp": 0.007121877279132605, "grad/layer_12/attn_mlp_ratio": 0.7049705621154755, "grad/layer_16/attn": 0.004246140364557505, "grad/layer_16/mlp": 0.005043473560363054, "grad/layer_16/attn_mlp_ratio": 0.8419079092110985, "grad/layer_20/attn": 0.006944755557924509, "grad/layer_20/mlp": 0.006912762299180031, "grad/layer_20/attn_mlp_ratio": 1.004628129378246, "grad/layer_24/attn": 0.018455248326063156, "grad/layer_24/mlp": 0.012882460840046406, "grad/layer_24/attn_mlp_ratio": 1.4325871750709671, "grad/layer_27/attn": 0.010861882008612156, "grad/layer_27/mlp": 0.012862712144851685, "grad/layer_27/attn_mlp_ratio": 0.844447252014025} {"step": 55450, "timestamp": 1778254474.673846, "train/loss": 2.1299094438552855, "train/z_loss": 0.0013775120140053331, "train/perplexity": 8.414104828045058, "train/grad_norm": 0.26171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1948069.7453435126, "perf/iters_per_sec": 0.928912041351086, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0765281915664673, "data/tokens_consumed": 116289175552, "data/tokens_consumed_B": 116.289175552, "train/loss_slope": 8.860932274906316e-06} {"step": 55460, "timestamp": 1778254485.0512862, "train/loss": 2.1052743434906005, "train/z_loss": 0.0013956836657598615, "train/perplexity": 8.209354883762565, "train/grad_norm": 0.2138671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022495.1279233035, "perf/iters_per_sec": 0.9644008292786138, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0369132518768311, "data/tokens_consumed": 116310147072, "data/tokens_consumed_B": 116.310147072, "train/loss_slope": 3.854146620335293e-06} {"step": 55470, "timestamp": 1778254495.427194, "train/loss": 2.1096461415290833, "train/z_loss": 0.001386822562199086, "train/perplexity": 8.245323090922161, "train/grad_norm": 0.2470703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022625.2064328643, "perf/iters_per_sec": 0.9644628555454561, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0368465662002564, "data/tokens_consumed": 116331118592, "data/tokens_consumed_B": 116.331118592, "train/loss_slope": 1.4787374871864195e-06} {"step": 55480, "timestamp": 1778254505.8140764, "train/loss": 2.1433675408363344, "train/z_loss": 0.001381667610257864, "train/perplexity": 8.528108079549614, "train/grad_norm": 0.09033203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019937.8391619688, "perf/iters_per_sec": 0.9631814189729542, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0382260084152222, "data/tokens_consumed": 116352090112, "data/tokens_consumed_B": 116.352090112, "train/loss_slope": 2.688025069100913e-06} {"step": 55490, "timestamp": 1778254516.1924784, "train/loss": 2.172420835494995, "train/z_loss": 0.0013684400706551968, "train/perplexity": 8.779512090062404, "train/grad_norm": 0.205078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022039.5897666202, "perf/iters_per_sec": 0.9641836117585278, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0371468544006348, "data/tokens_consumed": 116373061632, "data/tokens_consumed_B": 116.373061632, "train/loss_slope": 4.707916041161693e-06} {"step": 55500, "timestamp": 1778254526.5628855, "grad/layer_0/attn": 0.0026211924850940704, "grad/layer_0/mlp": 0.002964205341413617, "grad/layer_0/attn_mlp_ratio": 0.8842816521664688, "grad/layer_4/attn": 0.0019789766520261765, "grad/layer_4/mlp": 0.0024305202532559633, "grad/layer_4/attn_mlp_ratio": 0.8142193293609363, "grad/layer_8/attn": 0.008541282266378403, "grad/layer_8/mlp": 0.0035190985072404146, "grad/layer_8/attn_mlp_ratio": 2.427122175208456, "grad/layer_12/attn": 0.004545486532151699, "grad/layer_12/mlp": 0.006639269646257162, "grad/layer_12/attn_mlp_ratio": 0.6846365196585337, "grad/layer_16/attn": 0.0039081573486328125, "grad/layer_16/mlp": 0.004756287205964327, "grad/layer_16/attn_mlp_ratio": 0.8216823537409169, "grad/layer_20/attn": 0.006736016366630793, "grad/layer_20/mlp": 0.006298542488366365, "grad/layer_20/attn_mlp_ratio": 1.0694563499614114, "grad/layer_24/attn": 0.006975479889661074, "grad/layer_24/mlp": 0.00825995672494173, "grad/layer_24/attn_mlp_ratio": 0.8444935049294622, "grad/layer_27/attn": 0.004288325086236, "grad/layer_27/mlp": 0.006722094025462866, "grad/layer_27/attn_mlp_ratio": 0.6379448139519643} {"step": 55500, "timestamp": 1778254527.16217, "eos/sharpness": 11.677455902099608, "eos/L0_probe": 1.9779945611953735, "eos/L_plus": 2.0406723022460938, "eos/L_minus": 2.0320913791656494, "eos/grad_norm": 0.10099513083696365, "eos/embed_grad_frac": 0.20060133934020996, "eos/time_s": 0.596501350402832} {"step": 55500, "timestamp": 1778254527.1808991, "train/loss": 2.1271636724472045, "train/z_loss": 0.0013905443483963609, "train/perplexity": 8.391033308637963, "train/grad_norm": 0.10107421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1909691.3187292994, "perf/iters_per_sec": 0.9106117814680573, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0981628179550171, "data/tokens_consumed": 116394033152, "data/tokens_consumed_B": 116.394033152, "train/loss_slope": 2.7149884936118635e-06} {"step": 55500, "timestamp": 1778254528.5413053, "geo/rankme_last": 439.0225830078125, "geo/layer_0/stable_rank_q_proj": 19.628265380859375, "geo/layer_0/stable_rank_k_proj": 16.285388946533203, "geo/layer_0/stable_rank_o_proj": 47.47502136230469, "geo/layer_0/stable_rank_gate_proj": 132.85398864746094, "geo/layer_0/stable_rank_down_proj": 54.25977325439453, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06289079040288925, "geo/layer_0/attn_entropy_mean": 6.1635661125183105, "geo/layer_0/attn_entropy_std": 0.403652161359787, "geo/layer_7/stable_rank_q_proj": 43.09219741821289, "geo/layer_7/stable_rank_k_proj": 41.685455322265625, "geo/layer_7/stable_rank_o_proj": 93.29434204101562, "geo/layer_7/stable_rank_gate_proj": 84.5962142944336, "geo/layer_7/stable_rank_down_proj": 142.35897827148438, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.44967222213745117, "geo/layer_7/attn_entropy_mean": 4.63233757019043, "geo/layer_7/attn_entropy_std": 0.808891773223877, "geo/layer_14/stable_rank_q_proj": 52.28681945800781, "geo/layer_14/stable_rank_k_proj": 39.45370864868164, "geo/layer_14/stable_rank_o_proj": 44.51825714111328, "geo/layer_14/stable_rank_gate_proj": 72.13168334960938, "geo/layer_14/stable_rank_down_proj": 129.8526153564453, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3820413649082184, "geo/layer_14/attn_entropy_mean": 5.510988235473633, "geo/layer_14/attn_entropy_std": 0.3808976709842682, "geo/layer_21/stable_rank_q_proj": 40.71460723876953, "geo/layer_21/stable_rank_k_proj": 30.345468521118164, "geo/layer_21/stable_rank_o_proj": 72.10419464111328, "geo/layer_21/stable_rank_gate_proj": 67.77682495117188, "geo/layer_21/stable_rank_down_proj": 52.78446578979492, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1430041790008545, "geo/layer_21/attn_entropy_mean": 5.6979594230651855, "geo/layer_21/attn_entropy_std": 0.3022211492061615, "geo/layer_27/stable_rank_q_proj": 43.23579406738281, "geo/layer_27/stable_rank_k_proj": 31.440908432006836, "geo/layer_27/stable_rank_o_proj": 115.67414093017578, "geo/layer_27/stable_rank_gate_proj": 81.19091796875, "geo/layer_27/stable_rank_down_proj": 129.41769409179688, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09194765239953995, "geo/layer_27/attn_entropy_mean": 4.243603229522705, "geo/layer_27/attn_entropy_std": 0.7168898582458496, "attnres/final_alpha/block_0": 0.23679938912391663, "attnres/block_norm/0": 1.7533820867538452, "attnres/final_alpha/block_1": 0.004707961808890104, "attnres/block_norm/1": 44938.8125, "attnres/final_alpha/block_2": 0.010451946407556534, "attnres/block_norm/2": 28024.40234375, "attnres/final_alpha/block_3": 0.012237370945513248, "attnres/block_norm/3": 54974.9375, "attnres/final_alpha/block_4": 0.014084376394748688, "attnres/block_norm/4": 14522.625, "attnres/final_alpha/block_5": 0.6107887029647827, "attnres/block_norm/5": 6458.4091796875, "attnres/final_alpha/block_6": 0.11093024909496307, "attnres/block_norm/6": 36378.1328125, "geo/tier1_time_s": 1.356074571609497, "geo/step": 55500.0, "geo/rankme_slope": -3.139152145233093e-05} {"step": 55500, "timestamp": 1778254535.7130518, "geo/ww_alpha_mean": 7.55815464372795, "geo/ww_alpha_std": 4.394654053859056, "geo/ww_alpha_min": 1.3535111397611388, "geo/ww_alpha_max": 26.956272988843974, "geo/ww_alpha_healthy_frac": 0.17766497461928935, "geo/ww_alpha_by_type/q_proj": 4.036226335969702, "geo/ww_alpha_by_type/k_proj": 4.4795669064928445, "geo/ww_alpha_by_type/v_proj": 8.697114159965533, "geo/ww_alpha_by_type/o_proj": 7.381206580561812, "geo/ww_alpha_by_type/gate_proj": 7.875153245925158, "geo/ww_alpha_by_type/up_proj": 12.150122372866795, "geo/ww_alpha_by_type/down_proj": 8.384261072892624, "geo/twonn_id/layer_0": 0.7033639550209045, "geo/twonn_id/layer_7": 3.070896863937378, "geo/twonn_id/layer_14": 4.851078510284424, "geo/twonn_id/layer_21": 6.893019199371338, "geo/twonn_id/layer_27": 5.920097351074219, "geo/tier2_time_s": 7.1659626960754395} {"step": 55500, "timestamp": 1778254536.4837525, "eoc/jacobian_sigma/layer_0/attn": 1186.0478515625, "eoc/jacobian_sigma/layer_0/mlp": 9015.9775390625, "eoc/jacobian_sigma/layer_0": 9015.9775390625, "eoc/jacobian_sigma/layer_7/attn": 1.1624579429626465, "eoc/jacobian_sigma/layer_7/mlp": 1.7010358572006226, "eoc/jacobian_sigma/layer_7": 1.7010358572006226, "eoc/jacobian_sigma/layer_14/attn": 1.5270521640777588, "eoc/jacobian_sigma/layer_14/mlp": 4.860001564025879, "eoc/jacobian_sigma/layer_14": 4.860001564025879, "eoc/jacobian_sigma/layer_21/attn": 1.095920443534851, "eoc/jacobian_sigma/layer_21/mlp": 3.9627606868743896, "eoc/jacobian_sigma/layer_21": 3.9627606868743896, "eoc/jacobian_sigma/layer_27/attn": 2.8545689582824707, "eoc/jacobian_sigma/layer_27/mlp": 29.517213821411133, "eoc/jacobian_sigma/layer_27": 29.517213821411133, "eoc/layer0_sigma": 9015.9775390625, "eoc/sigma_max": 29.517213821411133, "eoc/sigma_min": 1.7010358572006226, "eoc/sigma_mean": 10.010252982378006, "eoc/time_s": 0.7638812065124512} {"step": 55510, "timestamp": 1778254546.899043, "train/loss": 2.1407521963119507, "train/z_loss": 0.0013775333878584205, "train/perplexity": 8.505833279615118, "train/grad_norm": 0.2119140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1063781.6874831358, "perf/iters_per_sec": 0.5072506368079833, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.9714120149612426, "data/tokens_consumed": 116415004672, "data/tokens_consumed_B": 116.415004672, "train/loss_slope": 1.9452287883978143e-07} {"step": 55520, "timestamp": 1778254557.275087, "train/loss": 2.185916709899902, "train/z_loss": 0.0013735225307755171, "train/perplexity": 8.898802435718117, "train/grad_norm": 0.10595703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022512.3343454371, "perf/iters_per_sec": 0.9644090339400468, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0369044303894044, "data/tokens_consumed": 116435976192, "data/tokens_consumed_B": 116.435976192, "train/loss_slope": 4.252121894165796e-06} {"step": 55530, "timestamp": 1778254567.6583014, "train/loss": 2.1779268622398376, "train/z_loss": 0.001374630897771567, "train/perplexity": 8.827985644318401, "train/grad_norm": 0.2353515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020781.8124135158, "perf/iters_per_sec": 0.9635838567798213, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0377923965454101, "data/tokens_consumed": 116456947712, "data/tokens_consumed_B": 116.456947712, "train/loss_slope": 6.890095521335785e-06} {"step": 55540, "timestamp": 1778254578.0373645, "train/loss": 2.1888136863708496, "train/z_loss": 0.00137706397799775, "train/perplexity": 8.924619434557394, "train/grad_norm": 0.09765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021837.2716812803, "perf/iters_per_sec": 0.9640871389776613, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0372506380081177, "data/tokens_consumed": 116477919232, "data/tokens_consumed_B": 116.477919232, "train/loss_slope": 8.81263121019214e-06} {"step": 55550, "timestamp": 1778254588.408705, "grad/layer_0/attn": 0.0030529811047017574, "grad/layer_0/mlp": 0.0030582572799175978, "grad/layer_0/attn_mlp_ratio": 0.9982747445488115, "grad/layer_4/attn": 0.0026555496733635664, "grad/layer_4/mlp": 0.0025087171234190464, "grad/layer_4/attn_mlp_ratio": 1.0585288961919777, "grad/layer_8/attn": 0.004103282932192087, "grad/layer_8/mlp": 0.0036206471268087626, "grad/layer_8/attn_mlp_ratio": 1.133300947358166, "grad/layer_12/attn": 0.0048492541536688805, "grad/layer_12/mlp": 0.006964142434298992, "grad/layer_12/attn_mlp_ratio": 0.6963174762414602, "grad/layer_16/attn": 0.0037516229785978794, "grad/layer_16/mlp": 0.005082540214061737, "grad/layer_16/attn_mlp_ratio": 0.7381393450472706, "grad/layer_20/attn": 0.004818887449800968, "grad/layer_20/mlp": 0.0072786747477948666, "grad/layer_20/attn_mlp_ratio": 0.6620556008571931, "grad/layer_24/attn": 0.01744958944618702, "grad/layer_24/mlp": 0.013423631899058819, "grad/layer_24/attn_mlp_ratio": 1.2999156597417507, "grad/layer_27/attn": 0.004519851878285408, "grad/layer_27/mlp": 0.013286213390529156, "grad/layer_27/attn_mlp_ratio": 0.34019112228682047} {"step": 55550, "timestamp": 1778254588.423133, "train/loss": 2.1262171030044557, "train/z_loss": 0.0013856004341505467, "train/perplexity": 8.38309437088563, "train/grad_norm": 0.23046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020689.106555879, "perf/iters_per_sec": 0.963539651182117, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0378400087356567, "data/tokens_consumed": 116498890752, "data/tokens_consumed_B": 116.498890752, "train/loss_slope": 5.86965456046113e-06} {"step": 55560, "timestamp": 1778254598.8051918, "train/loss": 2.117910599708557, "train/z_loss": 0.0013927184510976077, "train/perplexity": 8.313748579114263, "train/grad_norm": 0.2158203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021032.9083468053, "perf/iters_per_sec": 0.9637035886510874, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037663459777832, "data/tokens_consumed": 116519862272, "data/tokens_consumed_B": 116.519862272, "train/loss_slope": 2.765590987905139e-06} {"step": 55570, "timestamp": 1778254609.1819553, "train/loss": 2.182760977745056, "train/z_loss": 0.0013770366553217173, "train/perplexity": 8.870764462167793, "train/grad_norm": 0.11572265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022404.6827618426, "perf/iters_per_sec": 0.9643577016648496, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0369596242904664, "data/tokens_consumed": 116540833792, "data/tokens_consumed_B": 116.540833792, "train/loss_slope": 2.162072889589559e-06} {"step": 55575, "timestamp": 1778254614.9636345, "eos/sharpness": 78.24254035949706, "eos/L0_probe": 1.980526328086853, "eos/L_plus": 2.3328747749328613, "eos/L_minus": 2.4106032848358154, "eos/grad_norm": 0.3217144310474396, "eos/embed_grad_frac": 0.025741828605532646, "eos/time_s": 0.6058750152587891} {"step": 55575, "timestamp": 1778254616.3424978, "geo/rankme_last": 438.699462890625, "geo/layer_0/stable_rank_q_proj": 19.660747528076172, "geo/layer_0/stable_rank_k_proj": 16.30101203918457, "geo/layer_0/stable_rank_o_proj": 47.45962142944336, "geo/layer_0/stable_rank_gate_proj": 133.0235137939453, "geo/layer_0/stable_rank_down_proj": 54.21215057373047, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06342136859893799, "geo/layer_0/attn_entropy_mean": 6.166193962097168, "geo/layer_0/attn_entropy_std": 0.40367573499679565, "geo/layer_7/stable_rank_q_proj": 43.16891098022461, "geo/layer_7/stable_rank_k_proj": 41.652496337890625, "geo/layer_7/stable_rank_o_proj": 93.24525451660156, "geo/layer_7/stable_rank_gate_proj": 84.6519775390625, "geo/layer_7/stable_rank_down_proj": 142.2859344482422, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.46206140518188477, "geo/layer_7/attn_entropy_mean": 4.640390396118164, "geo/layer_7/attn_entropy_std": 0.7890945076942444, "geo/layer_14/stable_rank_q_proj": 52.24872589111328, "geo/layer_14/stable_rank_k_proj": 39.457157135009766, "geo/layer_14/stable_rank_o_proj": 44.49241638183594, "geo/layer_14/stable_rank_gate_proj": 72.13632202148438, "geo/layer_14/stable_rank_down_proj": 129.86279296875, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38660821318626404, "geo/layer_14/attn_entropy_mean": 5.529214859008789, "geo/layer_14/attn_entropy_std": 0.39711716771125793, "geo/layer_21/stable_rank_q_proj": 40.7017936706543, "geo/layer_21/stable_rank_k_proj": 30.33600425720215, "geo/layer_21/stable_rank_o_proj": 72.091796875, "geo/layer_21/stable_rank_gate_proj": 67.6793212890625, "geo/layer_21/stable_rank_down_proj": 52.787445068359375, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1442331075668335, "geo/layer_21/attn_entropy_mean": 5.705374717712402, "geo/layer_21/attn_entropy_std": 0.30828073620796204, "geo/layer_27/stable_rank_q_proj": 43.19847869873047, "geo/layer_27/stable_rank_k_proj": 31.49785804748535, "geo/layer_27/stable_rank_o_proj": 115.53788757324219, "geo/layer_27/stable_rank_gate_proj": 81.14469146728516, "geo/layer_27/stable_rank_down_proj": 129.3075714111328, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08636382967233658, "geo/layer_27/attn_entropy_mean": 4.2215657234191895, "geo/layer_27/attn_entropy_std": 0.7110112309455872, "attnres/final_alpha/block_0": 0.23910203576087952, "attnres/block_norm/0": 1.7536139488220215, "attnres/final_alpha/block_1": 0.004772867076098919, "attnres/block_norm/1": 44969.5234375, "attnres/final_alpha/block_2": 0.010631773620843887, "attnres/block_norm/2": 27975.0078125, "attnres/final_alpha/block_3": 0.012436370365321636, "attnres/block_norm/3": 55064.31640625, "attnres/final_alpha/block_4": 0.014400698244571686, "attnres/block_norm/4": 14594.42578125, "attnres/final_alpha/block_5": 0.6036539077758789, "attnres/block_norm/5": 6518.9306640625, "attnres/final_alpha/block_6": 0.11500232666730881, "attnres/block_norm/6": 36294.765625, "geo/tier1_time_s": 1.360342264175415, "geo/step": 55575.0, "geo/rankme_slope": -5.971492112469988e-05} {"step": 55580, "timestamp": 1778254621.5328784, "train/loss": 2.157182049751282, "train/z_loss": 0.0013694456894882024, "train/perplexity": 8.64673721970003, "train/grad_norm": 0.16015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1698651.3886102303, "perf/iters_per_sec": 0.8099801009226943, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2345982313156127, "data/tokens_consumed": 116561805312, "data/tokens_consumed_B": 116.561805312, "train/loss_slope": 1.4740930460539062e-06} {"step": 55590, "timestamp": 1778254631.9097574, "train/loss": 2.1704957485198975, "train/z_loss": 0.0013683520257472992, "train/perplexity": 8.762627023515568, "train/grad_norm": 0.1416015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021912.8864064298, "perf/iters_per_sec": 0.9641231948883199, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0372118473052978, "data/tokens_consumed": 116582776832, "data/tokens_consumed_B": 116.582776832, "train/loss_slope": 2.85076457675144e-06} {"step": 55600, "timestamp": 1778254642.2803168, "grad/layer_0/attn": 0.0027779012452811003, "grad/layer_0/mlp": 0.003173074685037136, "grad/layer_0/attn_mlp_ratio": 0.8754603762824925, "grad/layer_4/attn": 0.002508922480046749, "grad/layer_4/mlp": 0.0026645315811038017, "grad/layer_4/attn_mlp_ratio": 0.9415997932542538, "grad/layer_8/attn": 0.004571700002998114, "grad/layer_8/mlp": 0.003762771375477314, "grad/layer_8/attn_mlp_ratio": 1.2149821036947734, "grad/layer_12/attn": 0.006561344023793936, "grad/layer_12/mlp": 0.00728509621694684, "grad/layer_12/attn_mlp_ratio": 0.9006530234240985, "grad/layer_16/attn": 0.0038847888354212046, "grad/layer_16/mlp": 0.004906078334897757, "grad/layer_16/attn_mlp_ratio": 0.7918317831586328, "grad/layer_20/attn": 0.0036744410172104836, "grad/layer_20/mlp": 0.007114807143807411, "grad/layer_20/attn_mlp_ratio": 0.5164498336070377, "grad/layer_24/attn": 0.011832593008875847, "grad/layer_24/mlp": 0.01001004595309496, "grad/layer_24/attn_mlp_ratio": 1.182071785295871, "grad/layer_27/attn": 0.010359414853155613, "grad/layer_27/mlp": 0.0088533079251647, "grad/layer_27/attn_mlp_ratio": 1.1701179744012007} {"step": 55600, "timestamp": 1778254642.2945561, "train/loss": 2.204665923118591, "train/z_loss": 0.0013724324991926551, "train/perplexity": 9.067221912596096, "train/grad_norm": 0.1572265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020730.6072752746, "perf/iters_per_sec": 0.963559440267217, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037818694114685, "data/tokens_consumed": 116603748352, "data/tokens_consumed_B": 116.603748352, "train/loss_slope": 7.0395089755214396e-06} {"step": 55610, "timestamp": 1778254652.6699123, "train/loss": 2.184353232383728, "train/z_loss": 0.0013726791134104134, "train/perplexity": 8.884900228914828, "train/grad_norm": 0.140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022284.6287251366, "perf/iters_per_sec": 0.9643004554391559, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0370211839675902, "data/tokens_consumed": 116624719872, "data/tokens_consumed_B": 116.624719872, "train/loss_slope": 5.57762119385922e-06} {"step": 55620, "timestamp": 1778254663.6518955, "train/loss": 2.1229745388031005, "train/z_loss": 0.0013681473094038666, "train/perplexity": 8.355955672446552, "train/grad_norm": 0.09375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1910479.3554446173, "perf/iters_per_sec": 0.9109875466559493, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.097709846496582, "data/tokens_consumed": 116645691392, "data/tokens_consumed_B": 116.645691392, "train/loss_slope": 2.6496807662638983e-06} {"step": 55630, "timestamp": 1778254674.5328133, "train/loss": 2.1327707290649416, "train/z_loss": 0.0013844946282915772, "train/perplexity": 8.438214457556509, "train/grad_norm": 0.13671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1928857.1125410877, "perf/iters_per_sec": 0.9197507441239775, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0872510910034179, "data/tokens_consumed": 116666662912, "data/tokens_consumed_B": 116.666662912, "train/loss_slope": 1.9046966857177815e-06} {"step": 55640, "timestamp": 1778254684.9103537, "train/loss": 2.17840256690979, "train/z_loss": 0.0013759893365204335, "train/perplexity": 8.832186157338294, "train/grad_norm": 0.0869140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022065.573819061, "perf/iters_per_sec": 0.9641960019202523, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037133526802063, "data/tokens_consumed": 116687634432, "data/tokens_consumed_B": 116.687634432, "train/loss_slope": 3.649842227646216e-07} {"step": 55650, "timestamp": 1778254695.2829733, "grad/layer_0/attn": 0.0026008121203631163, "grad/layer_0/mlp": 0.0027851054910570383, "grad/layer_0/attn_mlp_ratio": 0.9338289107293858, "grad/layer_4/attn": 0.001762801199220121, "grad/layer_4/mlp": 0.00246787047944963, "grad/layer_4/attn_mlp_ratio": 0.7143005042076601, "grad/layer_8/attn": 0.003848175285384059, "grad/layer_8/mlp": 0.003761610249057412, "grad/layer_8/attn_mlp_ratio": 1.0230127334555899, "grad/layer_12/attn": 0.004383647348731756, "grad/layer_12/mlp": 0.0065900241024792194, "grad/layer_12/attn_mlp_ratio": 0.6651944232742868, "grad/layer_16/attn": 0.004343430977314711, "grad/layer_16/mlp": 0.004469636827707291, "grad/layer_16/attn_mlp_ratio": 0.971763713153024, "grad/layer_20/attn": 0.0044057490304112434, "grad/layer_20/mlp": 0.005749104078859091, "grad/layer_20/attn_mlp_ratio": 0.7663366140784674, "grad/layer_24/attn": 0.00930631160736084, "grad/layer_24/mlp": 0.008865121752023697, "grad/layer_24/attn_mlp_ratio": 1.0497669138339514, "grad/layer_27/attn": 0.004221852403134108, "grad/layer_27/mlp": 0.00815888773649931, "grad/layer_27/attn_mlp_ratio": 0.5174543991458468} {"step": 55650, "timestamp": 1778254695.878105, "eos/sharpness": 57.80286788940428, "eos/L0_probe": 1.9796903133392334, "eos/L_plus": 2.2369461059570312, "eos/L_minus": 2.3004631996154785, "eos/grad_norm": 0.14159218966960907, "eos/embed_grad_frac": 0.10825951397418976, "eos/time_s": 0.592207670211792} {"step": 55650, "timestamp": 1778254695.8983624, "train/loss": 2.1628772974014283, "train/z_loss": 0.001371309906244278, "train/perplexity": 8.696123028247868, "train/grad_norm": 0.1416015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1909422.193856445, "perf/iters_per_sec": 0.9104834527284836, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.09831759929657, "data/tokens_consumed": 116708605952, "data/tokens_consumed_B": 116.708605952, "train/loss_slope": 1.0466516250395112e-06} {"step": 55650, "timestamp": 1778254697.258984, "geo/rankme_last": 439.2702941894531, "geo/layer_0/stable_rank_q_proj": 19.66702651977539, "geo/layer_0/stable_rank_k_proj": 16.280967712402344, "geo/layer_0/stable_rank_o_proj": 47.51129913330078, "geo/layer_0/stable_rank_gate_proj": 133.1078338623047, "geo/layer_0/stable_rank_down_proj": 54.206050872802734, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06321099400520325, "geo/layer_0/attn_entropy_mean": 6.170838356018066, "geo/layer_0/attn_entropy_std": 0.4049932658672333, "geo/layer_7/stable_rank_q_proj": 43.20796585083008, "geo/layer_7/stable_rank_k_proj": 41.654232025146484, "geo/layer_7/stable_rank_o_proj": 93.16966247558594, "geo/layer_7/stable_rank_gate_proj": 84.50713348388672, "geo/layer_7/stable_rank_down_proj": 141.8704376220703, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4749111533164978, "geo/layer_7/attn_entropy_mean": 4.662021160125732, "geo/layer_7/attn_entropy_std": 0.7945717573165894, "geo/layer_14/stable_rank_q_proj": 52.13718795776367, "geo/layer_14/stable_rank_k_proj": 39.384361267089844, "geo/layer_14/stable_rank_o_proj": 44.52912902832031, "geo/layer_14/stable_rank_gate_proj": 72.16610717773438, "geo/layer_14/stable_rank_down_proj": 129.70962524414062, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3829881548881531, "geo/layer_14/attn_entropy_mean": 5.523674011230469, "geo/layer_14/attn_entropy_std": 0.40475785732269287, "geo/layer_21/stable_rank_q_proj": 40.758914947509766, "geo/layer_21/stable_rank_k_proj": 30.41455078125, "geo/layer_21/stable_rank_o_proj": 71.99579620361328, "geo/layer_21/stable_rank_gate_proj": 67.62935638427734, "geo/layer_21/stable_rank_down_proj": 52.75395584106445, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14288686215877533, "geo/layer_21/attn_entropy_mean": 5.696549892425537, "geo/layer_21/attn_entropy_std": 0.29372185468673706, "geo/layer_27/stable_rank_q_proj": 43.253257751464844, "geo/layer_27/stable_rank_k_proj": 31.500215530395508, "geo/layer_27/stable_rank_o_proj": 115.37757873535156, "geo/layer_27/stable_rank_gate_proj": 81.32987213134766, "geo/layer_27/stable_rank_down_proj": 129.15687561035156, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08723896741867065, "geo/layer_27/attn_entropy_mean": 4.2479472160339355, "geo/layer_27/attn_entropy_std": 0.6977996826171875, "attnres/final_alpha/block_0": 0.23744912445545197, "attnres/block_norm/0": 1.753586769104004, "attnres/final_alpha/block_1": 0.004741795361042023, "attnres/block_norm/1": 45036.1484375, "attnres/final_alpha/block_2": 0.010207042098045349, "attnres/block_norm/2": 28110.6953125, "attnres/final_alpha/block_3": 0.012208601459860802, "attnres/block_norm/3": 54924.64453125, "attnres/final_alpha/block_4": 0.014185823500156403, "attnres/block_norm/4": 14514.4453125, "attnres/final_alpha/block_5": 0.6095108985900879, "attnres/block_norm/5": 6469.7333984375, "attnres/final_alpha/block_6": 0.11169673502445221, "attnres/block_norm/6": 36140.3046875, "geo/tier1_time_s": 1.3571813106536865, "geo/step": 55650.0, "geo/rankme_slope": -6.413205907362945e-05} {"step": 55660, "timestamp": 1778254707.635205, "train/loss": 2.1537049531936647, "train/z_loss": 0.0013712074025534094, "train/perplexity": 8.616723889341552, "train/grad_norm": 0.140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1787327.1841356275, "perf/iters_per_sec": 0.8522640152624261, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1733453273773193, "data/tokens_consumed": 116729577472, "data/tokens_consumed_B": 116.729577472, "train/loss_slope": 1.5932937230643206e-06} {"step": 55670, "timestamp": 1778254718.0075138, "train/loss": 2.16363525390625, "train/z_loss": 0.0013865680433809758, "train/perplexity": 8.70271680984801, "train/grad_norm": 0.09228515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022690.3217310642, "perf/iters_per_sec": 0.9644939049392053, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0368131875991822, "data/tokens_consumed": 116750548992, "data/tokens_consumed_B": 116.750548992, "train/loss_slope": -1.4203464666573304e-06} {"step": 55680, "timestamp": 1778254728.8814595, "train/loss": 2.160371708869934, "train/z_loss": 0.0013913208735175432, "train/perplexity": 8.67436139635239, "train/grad_norm": 0.25, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1929326.3858010098, "perf/iters_per_sec": 0.9199745110516595, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0869866371154786, "data/tokens_consumed": 116771520512, "data/tokens_consumed_B": 116.771520512, "train/loss_slope": 1.4049222510103122e-06} {"step": 55690, "timestamp": 1778254739.6511939, "train/loss": 2.156253480911255, "train/z_loss": 0.0013843860127963127, "train/perplexity": 8.63871185557809, "train/grad_norm": 0.326171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1948299.7725209089, "perf/iters_per_sec": 0.9290217268566651, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0764010906219483, "data/tokens_consumed": 116792492032, "data/tokens_consumed_B": 116.792492032, "train/loss_slope": 3.693895550272004e-06} {"step": 55700, "timestamp": 1778254750.0134785, "grad/layer_0/attn": 0.00302459136582911, "grad/layer_0/mlp": 0.0030685425736010075, "grad/layer_0/attn_mlp_ratio": 0.9856768138993096, "grad/layer_4/attn": 0.0020649079233407974, "grad/layer_4/mlp": 0.0025986682157963514, "grad/layer_4/attn_mlp_ratio": 0.7946023395094247, "grad/layer_8/attn": 0.0037814597599208355, "grad/layer_8/mlp": 0.0036483232397586107, "grad/layer_8/attn_mlp_ratio": 1.0364924947060843, "grad/layer_12/attn": 0.004086269065737724, "grad/layer_12/mlp": 0.006417896132916212, "grad/layer_12/attn_mlp_ratio": 0.636699148356435, "grad/layer_16/attn": 0.0034164292737841606, "grad/layer_16/mlp": 0.004566233605146408, "grad/layer_16/attn_mlp_ratio": 0.7481941342453954, "grad/layer_20/attn": 0.0034060822799801826, "grad/layer_20/mlp": 0.006136561743915081, "grad/layer_20/attn_mlp_ratio": 0.5550473321404885, "grad/layer_24/attn": 0.004534171894192696, "grad/layer_24/mlp": 0.007940003648400307, "grad/layer_24/attn_mlp_ratio": 0.5710541251452441, "grad/layer_27/attn": 0.005202101543545723, "grad/layer_27/mlp": 0.006793120410293341, "grad/layer_27/attn_mlp_ratio": 0.7657896743717691} {"step": 55700, "timestamp": 1778254750.0278025, "train/loss": 2.1175915122032167, "train/z_loss": 0.0013820336083881558, "train/perplexity": 8.3110961890149, "train/grad_norm": 0.08642578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022010.073808794, "perf/iters_per_sec": 0.9641695374530763, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0371619939804078, "data/tokens_consumed": 116813463552, "data/tokens_consumed_B": 116.813463552, "train/loss_slope": 1.6007675196076645e-07} {"step": 55710, "timestamp": 1778254760.3923385, "train/loss": 2.173590159416199, "train/z_loss": 0.0013883919920772315, "train/perplexity": 8.789784188100352, "train/grad_norm": 0.1962890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024885.9493687642, "perf/iters_per_sec": 0.9655408617824384, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0356889486312866, "data/tokens_consumed": 116834435072, "data/tokens_consumed_B": 116.834435072, "train/loss_slope": 2.913798979728602e-06} {"step": 55720, "timestamp": 1778254770.7441869, "train/loss": 2.1490615367889405, "train/z_loss": 0.0013777181855402886, "train/perplexity": 8.576805602616725, "train/grad_norm": 0.1630859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027464.1085964306, "perf/iters_per_sec": 0.966770223901954, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343719482421876, "data/tokens_consumed": 116855406592, "data/tokens_consumed_B": 116.855406592, "train/loss_slope": 3.609736181042285e-06} {"step": 55725, "timestamp": 1778254776.5003085, "eos/sharpness": 72.29292392730711, "eos/L0_probe": 1.9779936075210571, "eos/L_plus": 2.406611442565918, "eos/L_minus": 2.2723050117492676, "eos/grad_norm": 0.20372308790683746, "eos/embed_grad_frac": 0.05899374186992645, "eos/time_s": 0.592573881149292} {"step": 55725, "timestamp": 1778254777.8791544, "geo/rankme_last": 438.67919921875, "geo/layer_0/stable_rank_q_proj": 19.648523330688477, "geo/layer_0/stable_rank_k_proj": 16.227354049682617, "geo/layer_0/stable_rank_o_proj": 47.575660705566406, "geo/layer_0/stable_rank_gate_proj": 133.06668090820312, "geo/layer_0/stable_rank_down_proj": 54.30430221557617, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06383483111858368, "geo/layer_0/attn_entropy_mean": 6.169096946716309, "geo/layer_0/attn_entropy_std": 0.40574872493743896, "geo/layer_7/stable_rank_q_proj": 43.22089385986328, "geo/layer_7/stable_rank_k_proj": 41.592857360839844, "geo/layer_7/stable_rank_o_proj": 93.09049987792969, "geo/layer_7/stable_rank_gate_proj": 84.52576446533203, "geo/layer_7/stable_rank_down_proj": 141.7615203857422, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.46044886112213135, "geo/layer_7/attn_entropy_mean": 4.643702030181885, "geo/layer_7/attn_entropy_std": 0.8030181527137756, "geo/layer_14/stable_rank_q_proj": 52.27782440185547, "geo/layer_14/stable_rank_k_proj": 39.41299057006836, "geo/layer_14/stable_rank_o_proj": 44.55073165893555, "geo/layer_14/stable_rank_gate_proj": 72.2559814453125, "geo/layer_14/stable_rank_down_proj": 129.4624786376953, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38160744309425354, "geo/layer_14/attn_entropy_mean": 5.501293182373047, "geo/layer_14/attn_entropy_std": 0.39560821652412415, "geo/layer_21/stable_rank_q_proj": 40.748104095458984, "geo/layer_21/stable_rank_k_proj": 30.373624801635742, "geo/layer_21/stable_rank_o_proj": 71.87737274169922, "geo/layer_21/stable_rank_gate_proj": 67.61414337158203, "geo/layer_21/stable_rank_down_proj": 52.736576080322266, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14423981308937073, "geo/layer_21/attn_entropy_mean": 5.714062213897705, "geo/layer_21/attn_entropy_std": 0.3003202974796295, "geo/layer_27/stable_rank_q_proj": 43.20742416381836, "geo/layer_27/stable_rank_k_proj": 31.567995071411133, "geo/layer_27/stable_rank_o_proj": 115.42219543457031, "geo/layer_27/stable_rank_gate_proj": 81.25049591064453, "geo/layer_27/stable_rank_down_proj": 129.24778747558594, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09206552803516388, "geo/layer_27/attn_entropy_mean": 4.232885360717773, "geo/layer_27/attn_entropy_std": 0.6977640390396118, "attnres/final_alpha/block_0": 0.23510415852069855, "attnres/block_norm/0": 1.753764033317566, "attnres/final_alpha/block_1": 0.0046599688939750195, "attnres/block_norm/1": 44980.203125, "attnres/final_alpha/block_2": 0.010261710733175278, "attnres/block_norm/2": 28190.47265625, "attnres/final_alpha/block_3": 0.012264477089047432, "attnres/block_norm/3": 54827.5, "attnres/final_alpha/block_4": 0.014195084571838379, "attnres/block_norm/4": 14607.1171875, "attnres/final_alpha/block_5": 0.613781750202179, "attnres/block_norm/5": 6495.87060546875, "attnres/final_alpha/block_6": 0.10973285138607025, "attnres/block_norm/6": 36719.5625, "geo/tier1_time_s": 1.3603949546813965, "geo/step": 55725.0, "geo/rankme_slope": -5.8475890356142456e-05} {"step": 55730, "timestamp": 1778254783.0688899, "train/loss": 2.137109708786011, "train/z_loss": 0.001373571646399796, "train/perplexity": 8.47490724603531, "train/grad_norm": 0.22265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1702284.738096509, "perf/iters_per_sec": 0.8117126169664902, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2319631099700927, "data/tokens_consumed": 116876378112, "data/tokens_consumed_B": 116.876378112, "train/loss_slope": 2.050757651353388e-06} {"step": 55740, "timestamp": 1778254793.428632, "train/loss": 2.147042179107666, "train/z_loss": 0.001378602534532547, "train/perplexity": 8.559503439849497, "train/grad_norm": 0.27734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025668.23884771, "perf/iters_per_sec": 0.9659138864744711, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352889776229859, "data/tokens_consumed": 116897349632, "data/tokens_consumed_B": 116.897349632, "train/loss_slope": -6.713020597675824e-07} {"step": 55750, "timestamp": 1778254803.7656605, "grad/layer_0/attn": 0.0037657693028450012, "grad/layer_0/mlp": 0.0031258671078830957, "grad/layer_0/attn_mlp_ratio": 1.2047118615109889, "grad/layer_4/attn": 0.0024378111120313406, "grad/layer_4/mlp": 0.0025452151894569397, "grad/layer_4/attn_mlp_ratio": 0.957801535347324, "grad/layer_8/attn": 0.004627440590411425, "grad/layer_8/mlp": 0.003648053389042616, "grad/layer_8/attn_mlp_ratio": 1.2684684049481512, "grad/layer_12/attn": 0.008272492326796055, "grad/layer_12/mlp": 0.007113044150173664, "grad/layer_12/attn_mlp_ratio": 1.1630030737674777, "grad/layer_16/attn": 0.004465911071747541, "grad/layer_16/mlp": 0.004637829028069973, "grad/layer_16/attn_mlp_ratio": 0.9629313518081304, "grad/layer_20/attn": 0.007578113581985235, "grad/layer_20/mlp": 0.006202794145792723, "grad/layer_20/attn_mlp_ratio": 1.2217257709499836, "grad/layer_24/attn": 0.006962192244827747, "grad/layer_24/mlp": 0.008657061494886875, "grad/layer_24/attn_mlp_ratio": 0.8042211746465834, "grad/layer_27/attn": 0.012050330638885498, "grad/layer_27/mlp": 0.006944534368813038, "grad/layer_27/attn_mlp_ratio": 1.7352251173929512} {"step": 55750, "timestamp": 1778254803.7797575, "train/loss": 2.1753270626068115, "train/z_loss": 0.0013803955749608575, "train/perplexity": 8.805064458635282, "train/grad_norm": 0.11669921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026961.627991537, "perf/iters_per_sec": 0.9665306224782644, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346283674240113, "data/tokens_consumed": 116918321152, "data/tokens_consumed_B": 116.918321152, "train/loss_slope": 2.2294147716353934e-06} {"step": 55760, "timestamp": 1778254814.1764853, "train/loss": 2.1899744272232056, "train/z_loss": 0.001378734118770808, "train/perplexity": 8.934984619409798, "train/grad_norm": 0.11376953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2018981.9995078382, "perf/iters_per_sec": 0.9627256391085807, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0387175321578979, "data/tokens_consumed": 116939292672, "data/tokens_consumed_B": 116.939292672, "train/loss_slope": 2.658417916605504e-06} {"step": 55770, "timestamp": 1778254824.574205, "train/loss": 2.1770861387252807, "train/z_loss": 0.0013883953331969678, "train/perplexity": 8.82056686820779, "train/grad_norm": 0.2294921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2018628.9812957505, "perf/iters_per_sec": 0.9625573069075348, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0388991832733154, "data/tokens_consumed": 116960264192, "data/tokens_consumed_B": 116.960264192, "train/loss_slope": 1.9273145471361602e-06} {"step": 55780, "timestamp": 1778254834.9526837, "train/loss": 2.1264107704162596, "train/z_loss": 0.0013883489300496876, "train/perplexity": 8.384718060298136, "train/grad_norm": 0.134765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022064.3652436743, "perf/iters_per_sec": 0.9641954256265994, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0371341466903687, "data/tokens_consumed": 116981235712, "data/tokens_consumed_B": 116.981235712, "train/loss_slope": 1.1849144432873064e-07} {"step": 55790, "timestamp": 1778254845.2962668, "train/loss": 2.173195219039917, "train/z_loss": 0.0013612337177619338, "train/perplexity": 8.786313432841476, "train/grad_norm": 0.26953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028641.6499407361, "perf/iters_per_sec": 0.9673317193702393, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033771538734436, "data/tokens_consumed": 117002207232, "data/tokens_consumed_B": 117.002207232, "train/loss_slope": 2.3894597559120846e-06} {"step": 55800, "timestamp": 1778254855.6649303, "grad/layer_0/attn": 0.0026827272959053516, "grad/layer_0/mlp": 0.003008101135492325, "grad/layer_0/attn_mlp_ratio": 0.8918341125797515, "grad/layer_4/attn": 0.0021338826045393944, "grad/layer_4/mlp": 0.0025499227922409773, "grad/layer_4/attn_mlp_ratio": 0.8368420123731861, "grad/layer_8/attn": 0.007608095183968544, "grad/layer_8/mlp": 0.00369554222561419, "grad/layer_8/attn_mlp_ratio": 2.058722242534212, "grad/layer_12/attn": 0.004371316637843847, "grad/layer_12/mlp": 0.006486184429377317, "grad/layer_12/attn_mlp_ratio": 0.6739426881929151, "grad/layer_16/attn": 0.0035482575185596943, "grad/layer_16/mlp": 0.004687866661697626, "grad/layer_16/attn_mlp_ratio": 0.7569023820281866, "grad/layer_20/attn": 0.0036287400871515274, "grad/layer_20/mlp": 0.006838757079094648, "grad/layer_20/attn_mlp_ratio": 0.5306139686088284, "grad/layer_24/attn": 0.025125006213784218, "grad/layer_24/mlp": 0.013528740033507347, "grad/layer_24/attn_mlp_ratio": 1.8571578702702543, "grad/layer_27/attn": 0.008007318712770939, "grad/layer_27/mlp": 0.012560532428324223, "grad/layer_27/attn_mlp_ratio": 0.6374983460863856} {"step": 55800, "timestamp": 1778254856.2900636, "eos/sharpness": 81.36103153228758, "eos/L0_probe": 1.9822531938552856, "eos/L_plus": 2.3291964530944824, "eos/L_minus": 2.448920249938965, "eos/grad_norm": 0.27971041202545166, "eos/embed_grad_frac": 0.03122798167169094, "eos/time_s": 0.6221668720245361} {"step": 55800, "timestamp": 1778254856.3107378, "train/loss": 2.2173732995986937, "train/z_loss": 0.0013865953078493476, "train/perplexity": 9.183177701652138, "train/grad_norm": 0.279296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1905435.0187585668, "perf/iters_per_sec": 0.9085822194855532, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1006158590316772, "data/tokens_consumed": 117023178752, "data/tokens_consumed_B": 117.023178752, "train/loss_slope": 8.870951635072818e-06} {"step": 55800, "timestamp": 1778254857.6793618, "geo/rankme_last": 439.11846923828125, "geo/layer_0/stable_rank_q_proj": 19.63657569885254, "geo/layer_0/stable_rank_k_proj": 16.24250030517578, "geo/layer_0/stable_rank_o_proj": 47.574462890625, "geo/layer_0/stable_rank_gate_proj": 132.76705932617188, "geo/layer_0/stable_rank_down_proj": 54.280250549316406, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.060969170182943344, "geo/layer_0/attn_entropy_mean": 6.169290542602539, "geo/layer_0/attn_entropy_std": 0.4049305021762848, "geo/layer_7/stable_rank_q_proj": 43.27934646606445, "geo/layer_7/stable_rank_k_proj": 41.669193267822266, "geo/layer_7/stable_rank_o_proj": 93.32878875732422, "geo/layer_7/stable_rank_gate_proj": 84.55506896972656, "geo/layer_7/stable_rank_down_proj": 141.9304962158203, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.45967063307762146, "geo/layer_7/attn_entropy_mean": 4.663862228393555, "geo/layer_7/attn_entropy_std": 0.7925683259963989, "geo/layer_14/stable_rank_q_proj": 52.28205108642578, "geo/layer_14/stable_rank_k_proj": 39.501583099365234, "geo/layer_14/stable_rank_o_proj": 44.5076789855957, "geo/layer_14/stable_rank_gate_proj": 72.23432922363281, "geo/layer_14/stable_rank_down_proj": 129.36376953125, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39878392219543457, "geo/layer_14/attn_entropy_mean": 5.483462333679199, "geo/layer_14/attn_entropy_std": 0.39156752824783325, "geo/layer_21/stable_rank_q_proj": 40.75382995605469, "geo/layer_21/stable_rank_k_proj": 30.34589958190918, "geo/layer_21/stable_rank_o_proj": 71.9687728881836, "geo/layer_21/stable_rank_gate_proj": 67.62407684326172, "geo/layer_21/stable_rank_down_proj": 52.71271514892578, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14518696069717407, "geo/layer_21/attn_entropy_mean": 5.6787567138671875, "geo/layer_21/attn_entropy_std": 0.31084051728248596, "geo/layer_27/stable_rank_q_proj": 43.1664924621582, "geo/layer_27/stable_rank_k_proj": 31.598939895629883, "geo/layer_27/stable_rank_o_proj": 115.61038208007812, "geo/layer_27/stable_rank_gate_proj": 81.3382797241211, "geo/layer_27/stable_rank_down_proj": 129.0689239501953, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09139823168516159, "geo/layer_27/attn_entropy_mean": 4.210127830505371, "geo/layer_27/attn_entropy_std": 0.705062747001648, "attnres/final_alpha/block_0": 0.2375611811876297, "attnres/block_norm/0": 1.754027009010315, "attnres/final_alpha/block_1": 0.004803522955626249, "attnres/block_norm/1": 45046.1640625, "attnres/final_alpha/block_2": 0.010634074918925762, "attnres/block_norm/2": 28095.73046875, "attnres/final_alpha/block_3": 0.012429876253008842, "attnres/block_norm/3": 54889.9296875, "attnres/final_alpha/block_4": 0.014397088438272476, "attnres/block_norm/4": 14531.6669921875, "attnres/final_alpha/block_5": 0.607964277267456, "attnres/block_norm/5": 6501.966796875, "attnres/final_alpha/block_6": 0.11220996081829071, "attnres/block_norm/6": 36557.38671875, "geo/tier1_time_s": 1.364642858505249, "geo/step": 55800.0, "geo/rankme_slope": -3.7811765331132454e-05} {"step": 55810, "timestamp": 1778254868.0627894, "train/loss": 2.178459715843201, "train/z_loss": 0.0013823901303112508, "train/perplexity": 8.832690921780108, "train/grad_norm": 0.150390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1785080.4211641247, "perf/iters_per_sec": 0.8511926751919388, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1748221397399903, "data/tokens_consumed": 117044150272, "data/tokens_consumed_B": 117.044150272, "train/loss_slope": 9.293602144542873e-06} {"step": 55820, "timestamp": 1778254878.4400673, "train/loss": 2.1103248238563537, "train/z_loss": 0.001382601901423186, "train/perplexity": 8.250920945354174, "train/grad_norm": 0.2353515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021710.779543528, "perf/iters_per_sec": 0.9640268228261605, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373155355453492, "data/tokens_consumed": 117065121792, "data/tokens_consumed_B": 117.065121792, "train/loss_slope": 6.332024660977411e-06} {"step": 55830, "timestamp": 1778254888.821198, "train/loss": 2.1865744829177856, "train/z_loss": 0.0013718910631723702, "train/perplexity": 8.904657753375675, "train/grad_norm": 0.2060546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021430.9916721648, "perf/iters_per_sec": 0.9638934095726799, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0374591112136842, "data/tokens_consumed": 117086093312, "data/tokens_consumed_B": 117.086093312, "train/loss_slope": 8.444046995641912e-06} {"step": 55840, "timestamp": 1778254899.1814892, "train/loss": 2.162613368034363, "train/z_loss": 0.0014011579798534513, "train/perplexity": 8.693828168854814, "train/grad_norm": 0.375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025200.2660840033, "perf/iters_per_sec": 0.9656907396717087, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0355282068252563, "data/tokens_consumed": 117107064832, "data/tokens_consumed_B": 117.107064832, "train/loss_slope": 9.227194172320948e-06} {"step": 55850, "timestamp": 1778254909.547509, "grad/layer_0/attn": 0.0027467634063214064, "grad/layer_0/mlp": 0.002727411687374115, "grad/layer_0/attn_mlp_ratio": 1.007095231837332, "grad/layer_4/attn": 0.002074622781947255, "grad/layer_4/mlp": 0.0026331786066293716, "grad/layer_4/attn_mlp_ratio": 0.7878776995743285, "grad/layer_8/attn": 0.005687156692147255, "grad/layer_8/mlp": 0.0035785320214927197, "grad/layer_8/attn_mlp_ratio": 1.5892428792213722, "grad/layer_12/attn": 0.00473325839266181, "grad/layer_12/mlp": 0.0066961562260985374, "grad/layer_12/attn_mlp_ratio": 0.706861990992317, "grad/layer_16/attn": 0.004242435563355684, "grad/layer_16/mlp": 0.004171035718172789, "grad/layer_16/attn_mlp_ratio": 1.0171179889829318, "grad/layer_20/attn": 0.003413981990888715, "grad/layer_20/mlp": 0.005776321981102228, "grad/layer_20/attn_mlp_ratio": 0.5910304070574376, "grad/layer_24/attn": 0.007671792060136795, "grad/layer_24/mlp": 0.008271295577287674, "grad/layer_24/attn_mlp_ratio": 0.9275199871288523, "grad/layer_27/attn": 0.006723106373101473, "grad/layer_27/mlp": 0.006567362695932388, "grad/layer_27/attn_mlp_ratio": 1.0237147820226329} {"step": 55850, "timestamp": 1778254909.5618682, "train/loss": 2.1177307844161986, "train/z_loss": 0.001391750981565565, "train/perplexity": 8.312253774381317, "train/grad_norm": 0.1083984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021651.6747568268, "perf/iters_per_sec": 0.9639986394676336, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037345862388611, "data/tokens_consumed": 117128036352, "data/tokens_consumed_B": 117.128036352, "train/loss_slope": 6.10746265065726e-06} {"step": 55860, "timestamp": 1778254919.914669, "train/loss": 2.1615407943725584, "train/z_loss": 0.0013738552457652985, "train/perplexity": 8.684508396705183, "train/grad_norm": 0.119140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027032.8151175964, "perf/iters_per_sec": 0.9665645671451552, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345920324325562, "data/tokens_consumed": 117149007872, "data/tokens_consumed_B": 117.149007872, "train/loss_slope": 3.1672847045636883e-06} {"step": 55870, "timestamp": 1778254930.259258, "train/loss": 2.135360860824585, "train/z_loss": 0.001374100090470165, "train/perplexity": 8.460098874344373, "train/grad_norm": 0.1982421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028311.344022455, "perf/iters_per_sec": 0.9671742172348284, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0339398860931397, "data/tokens_consumed": 117169979392, "data/tokens_consumed_B": 117.169979392, "train/loss_slope": 1.5671203202968978e-06} {"step": 55875, "timestamp": 1778254936.0125947, "eos/sharpness": 60.729742050170884, "eos/L0_probe": 1.9814540147781372, "eos/L_plus": 2.265089511871338, "eos/L_minus": 2.3051159381866455, "eos/grad_norm": 0.164037823677063, "eos/embed_grad_frac": 0.08048065751791, "eos/time_s": 0.5880112648010254} {"step": 55875, "timestamp": 1778254937.393906, "geo/rankme_last": 437.1358642578125, "geo/layer_0/stable_rank_q_proj": 19.59299087524414, "geo/layer_0/stable_rank_k_proj": 16.220251083374023, "geo/layer_0/stable_rank_o_proj": 47.55495834350586, "geo/layer_0/stable_rank_gate_proj": 132.81863403320312, "geo/layer_0/stable_rank_down_proj": 54.300716400146484, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05992754548788071, "geo/layer_0/attn_entropy_mean": 6.163244247436523, "geo/layer_0/attn_entropy_std": 0.4066120982170105, "geo/layer_7/stable_rank_q_proj": 43.31623077392578, "geo/layer_7/stable_rank_k_proj": 41.66048812866211, "geo/layer_7/stable_rank_o_proj": 93.44481658935547, "geo/layer_7/stable_rank_gate_proj": 84.52584838867188, "geo/layer_7/stable_rank_down_proj": 141.84352111816406, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.47129231691360474, "geo/layer_7/attn_entropy_mean": 4.6581621170043945, "geo/layer_7/attn_entropy_std": 0.7928863167762756, "geo/layer_14/stable_rank_q_proj": 52.218292236328125, "geo/layer_14/stable_rank_k_proj": 39.50092315673828, "geo/layer_14/stable_rank_o_proj": 44.56608200073242, "geo/layer_14/stable_rank_gate_proj": 72.27843475341797, "geo/layer_14/stable_rank_down_proj": 129.38009643554688, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38851842284202576, "geo/layer_14/attn_entropy_mean": 5.506174087524414, "geo/layer_14/attn_entropy_std": 0.386928528547287, "geo/layer_21/stable_rank_q_proj": 40.71638488769531, "geo/layer_21/stable_rank_k_proj": 30.440502166748047, "geo/layer_21/stable_rank_o_proj": 72.01262664794922, "geo/layer_21/stable_rank_gate_proj": 67.66064453125, "geo/layer_21/stable_rank_down_proj": 52.66011428833008, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14473597705364227, "geo/layer_21/attn_entropy_mean": 5.705081939697266, "geo/layer_21/attn_entropy_std": 0.2997671663761139, "geo/layer_27/stable_rank_q_proj": 43.17647933959961, "geo/layer_27/stable_rank_k_proj": 31.59324073791504, "geo/layer_27/stable_rank_o_proj": 115.51665496826172, "geo/layer_27/stable_rank_gate_proj": 81.34622955322266, "geo/layer_27/stable_rank_down_proj": 129.34722900390625, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09487516433000565, "geo/layer_27/attn_entropy_mean": 4.214646339416504, "geo/layer_27/attn_entropy_std": 0.7077510952949524, "attnres/final_alpha/block_0": 0.23668025434017181, "attnres/block_norm/0": 1.7540416717529297, "attnres/final_alpha/block_1": 0.004733023699373007, "attnres/block_norm/1": 44945.69140625, "attnres/final_alpha/block_2": 0.010369773954153061, "attnres/block_norm/2": 28108.53125, "attnres/final_alpha/block_3": 0.012151286005973816, "attnres/block_norm/3": 55367.421875, "attnres/final_alpha/block_4": 0.014040188863873482, "attnres/block_norm/4": 14556.423828125, "attnres/final_alpha/block_5": 0.6101489067077637, "attnres/block_norm/5": 6505.7822265625, "attnres/final_alpha/block_6": 0.11187662184238434, "attnres/block_norm/6": 36669.40625, "geo/tier1_time_s": 1.3609168529510498, "geo/step": 55875.0, "geo/rankme_slope": -9.71735569227691e-05} {"step": 55880, "timestamp": 1778254942.5695822, "train/loss": 2.1230441451072695, "train/z_loss": 0.0013788925716653466, "train/perplexity": 8.35653731988164, "train/grad_norm": 0.2216796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1704691.1927599686, "perf/iters_per_sec": 0.812860103969559, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2302239894866944, "data/tokens_consumed": 117190950912, "data/tokens_consumed_B": 117.190950912, "train/loss_slope": 1.8944445795172689e-06} {"step": 55890, "timestamp": 1778254952.912284, "train/loss": 2.125991106033325, "train/z_loss": 0.0013977526919916271, "train/perplexity": 8.381200031014709, "train/grad_norm": 0.130859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028647.0771930565, "perf/iters_per_sec": 0.9673343072858126, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0337687730789185, "data/tokens_consumed": 117211922432, "data/tokens_consumed_B": 117.211922432, "train/loss_slope": 8.206000315187664e-07} {"step": 55900, "timestamp": 1778254963.249661, "grad/layer_0/attn": 0.003553630318492651, "grad/layer_0/mlp": 0.0032862313091754913, "grad/layer_0/attn_mlp_ratio": 1.0813694703819563, "grad/layer_4/attn": 0.0029347173403948545, "grad/layer_4/mlp": 0.0027648047544062138, "grad/layer_4/attn_mlp_ratio": 1.0614555076890344, "grad/layer_8/attn": 0.0038020452484488487, "grad/layer_8/mlp": 0.0040108757093548775, "grad/layer_8/attn_mlp_ratio": 0.947933924950018, "grad/layer_12/attn": 0.007875803858041763, "grad/layer_12/mlp": 0.007584117352962494, "grad/layer_12/attn_mlp_ratio": 1.0384601645330969, "grad/layer_16/attn": 0.004764848854392767, "grad/layer_16/mlp": 0.005190838128328323, "grad/layer_16/attn_mlp_ratio": 0.9179343768390287, "grad/layer_20/attn": 0.004099641926586628, "grad/layer_20/mlp": 0.00662444019690156, "grad/layer_20/attn_mlp_ratio": 0.6188661596820713, "grad/layer_24/attn": 0.014614935964345932, "grad/layer_24/mlp": 0.011128517799079418, "grad/layer_24/attn_mlp_ratio": 1.3132868273101237, "grad/layer_27/attn": 0.008126429282128811, "grad/layer_27/mlp": 0.010671239346265793, "grad/layer_27/attn_mlp_ratio": 0.7615262803396758} {"step": 55900, "timestamp": 1778254963.2637177, "train/loss": 2.158785343170166, "train/z_loss": 0.0013843429507687688, "train/perplexity": 8.660611595955201, "train/grad_norm": 0.193359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026829.309828138, "perf/iters_per_sec": 0.9664675282612505, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346959114074707, "data/tokens_consumed": 117232893952, "data/tokens_consumed_B": 117.232893952, "train/loss_slope": 2.2429112947419567e-06} {"step": 55910, "timestamp": 1778254973.6135097, "train/loss": 2.1525453090667725, "train/z_loss": 0.0013769628829322755, "train/perplexity": 8.606737347626588, "train/grad_norm": 0.11767578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027553.5580371849, "perf/iters_per_sec": 0.9668128767190861, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343263149261475, "data/tokens_consumed": 117253865472, "data/tokens_consumed_B": 117.253865472, "train/loss_slope": 2.6000026750948774e-07} {"step": 55920, "timestamp": 1778254983.971593, "train/loss": 2.1047598481178285, "train/z_loss": 0.0013972720131278037, "train/perplexity": 8.20513229500576, "train/grad_norm": 0.1455078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026112.1126104705, "perf/iters_per_sec": 0.9661255419781067, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350621700286866, "data/tokens_consumed": 117274836992, "data/tokens_consumed_B": 117.274836992, "train/loss_slope": -6.699482986170937e-06} {"step": 55930, "timestamp": 1778254994.3163123, "train/loss": 2.2096247911453246, "train/z_loss": 0.0013722865143790842, "train/perplexity": 9.112296737115638, "train/grad_norm": 0.087890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028195.6384312354, "perf/iters_per_sec": 0.9671190445095231, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0339988708496093, "data/tokens_consumed": 117295808512, "data/tokens_consumed_B": 117.295808512, "train/loss_slope": -3.0789864302897625e-06} {"step": 55940, "timestamp": 1778255004.6646256, "train/loss": 2.186674213409424, "train/z_loss": 0.0013741892646066845, "train/perplexity": 8.905545863556387, "train/grad_norm": 0.087890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027914.9412126448, "perf/iters_per_sec": 0.9669851976454948, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341419935226441, "data/tokens_consumed": 117316780032, "data/tokens_consumed_B": 117.316780032, "train/loss_slope": -1.6938194225687878e-06} {"step": 55950, "timestamp": 1778255015.5539148, "grad/layer_0/attn": 0.002960686571896076, "grad/layer_0/mlp": 0.0031017609871923923, "grad/layer_0/attn_mlp_ratio": 0.9545179298693144, "grad/layer_4/attn": 0.0032231751829385757, "grad/layer_4/mlp": 0.0025482866913080215, "grad/layer_4/attn_mlp_ratio": 1.2648400462351954, "grad/layer_8/attn": 0.004063584841787815, "grad/layer_8/mlp": 0.003543221391737461, "grad/layer_8/attn_mlp_ratio": 1.1468616487182086, "grad/layer_12/attn": 0.004331616219133139, "grad/layer_12/mlp": 0.0066125718876719475, "grad/layer_12/attn_mlp_ratio": 0.6550577032974045, "grad/layer_16/attn": 0.0033102354500442743, "grad/layer_16/mlp": 0.004530031234025955, "grad/layer_16/attn_mlp_ratio": 0.7307312479674138, "grad/layer_20/attn": 0.0047436063177883625, "grad/layer_20/mlp": 0.00611775740981102, "grad/layer_20/attn_mlp_ratio": 0.7753831874148432, "grad/layer_24/attn": 0.01167672872543335, "grad/layer_24/mlp": 0.009463168680667877, "grad/layer_24/attn_mlp_ratio": 1.2339131844808169, "grad/layer_27/attn": 0.006941063795238733, "grad/layer_27/mlp": 0.008307467214763165, "grad/layer_27/attn_mlp_ratio": 0.835521047781169} {"step": 55950, "timestamp": 1778255016.1443875, "eos/sharpness": 43.28420162200927, "eos/L0_probe": 1.9831647872924805, "eos/L_plus": 2.2154669761657715, "eos/L_minus": 2.1837046146392822, "eos/grad_norm": 0.14689092338085175, "eos/embed_grad_frac": 0.10459450632333755, "eos/time_s": 0.5875322818756104} {"step": 55950, "timestamp": 1778255016.1632352, "train/loss": 2.138966464996338, "train/z_loss": 0.0013758620829321445, "train/perplexity": 8.490657700548415, "train/grad_norm": 0.1474609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1824951.0415187946, "perf/iters_per_sec": 0.8702044684976552, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1491552114486694, "data/tokens_consumed": 117337751552, "data/tokens_consumed_B": 117.337751552, "train/loss_slope": -3.7907541316799522e-06} {"step": 55950, "timestamp": 1778255017.524694, "geo/rankme_last": 439.31744384765625, "geo/layer_0/stable_rank_q_proj": 19.603628158569336, "geo/layer_0/stable_rank_k_proj": 16.209381103515625, "geo/layer_0/stable_rank_o_proj": 47.635955810546875, "geo/layer_0/stable_rank_gate_proj": 132.7496337890625, "geo/layer_0/stable_rank_down_proj": 54.23284912109375, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06173508241772652, "geo/layer_0/attn_entropy_mean": 6.164104461669922, "geo/layer_0/attn_entropy_std": 0.41000470519065857, "geo/layer_7/stable_rank_q_proj": 43.242530822753906, "geo/layer_7/stable_rank_k_proj": 41.67981719970703, "geo/layer_7/stable_rank_o_proj": 93.03739929199219, "geo/layer_7/stable_rank_gate_proj": 84.54517364501953, "geo/layer_7/stable_rank_down_proj": 141.77294921875, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4732832610607147, "geo/layer_7/attn_entropy_mean": 4.651993751525879, "geo/layer_7/attn_entropy_std": 0.8024584650993347, "geo/layer_14/stable_rank_q_proj": 52.278167724609375, "geo/layer_14/stable_rank_k_proj": 39.47111511230469, "geo/layer_14/stable_rank_o_proj": 44.50820541381836, "geo/layer_14/stable_rank_gate_proj": 72.22685241699219, "geo/layer_14/stable_rank_down_proj": 129.18319702148438, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3892674148082733, "geo/layer_14/attn_entropy_mean": 5.51872444152832, "geo/layer_14/attn_entropy_std": 0.3862253427505493, "geo/layer_21/stable_rank_q_proj": 40.77939224243164, "geo/layer_21/stable_rank_k_proj": 30.486534118652344, "geo/layer_21/stable_rank_o_proj": 71.98809814453125, "geo/layer_21/stable_rank_gate_proj": 67.70865631103516, "geo/layer_21/stable_rank_down_proj": 52.705257415771484, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14462806284427643, "geo/layer_21/attn_entropy_mean": 5.705081939697266, "geo/layer_21/attn_entropy_std": 0.2988647222518921, "geo/layer_27/stable_rank_q_proj": 43.234580993652344, "geo/layer_27/stable_rank_k_proj": 31.613513946533203, "geo/layer_27/stable_rank_o_proj": 115.67527770996094, "geo/layer_27/stable_rank_gate_proj": 81.31175994873047, "geo/layer_27/stable_rank_down_proj": 129.51779174804688, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09337660670280457, "geo/layer_27/attn_entropy_mean": 4.221868991851807, "geo/layer_27/attn_entropy_std": 0.7060002088546753, "attnres/final_alpha/block_0": 0.23689121007919312, "attnres/block_norm/0": 1.754145622253418, "attnres/final_alpha/block_1": 0.0047084069810807705, "attnres/block_norm/1": 45173.51953125, "attnres/final_alpha/block_2": 0.010344184935092926, "attnres/block_norm/2": 28063.296875, "attnres/final_alpha/block_3": 0.012260260060429573, "attnres/block_norm/3": 55111.1171875, "attnres/final_alpha/block_4": 0.014381837099790573, "attnres/block_norm/4": 14568.4931640625, "attnres/final_alpha/block_5": 0.6110653877258301, "attnres/block_norm/5": 6460.5244140625, "attnres/final_alpha/block_6": 0.11034867912530899, "attnres/block_norm/6": 36442.9453125, "geo/tier1_time_s": 1.357856273651123, "geo/step": 55950.0, "geo/rankme_slope": -6.540350515206083e-05} {"step": 55960, "timestamp": 1778255027.8736167, "train/loss": 2.129043364524841, "train/z_loss": 0.0013874187367036938, "train/perplexity": 8.406820700540749, "train/grad_norm": 0.1083984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1791365.7009370213, "perf/iters_per_sec": 0.8541897301373583, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1707000970840453, "data/tokens_consumed": 117358723072, "data/tokens_consumed_B": 117.358723072, "train/loss_slope": -6.037334332836682e-06} {"step": 55970, "timestamp": 1778255038.2297645, "train/loss": 2.1912079215049745, "train/z_loss": 0.0013632168411277235, "train/perplexity": 8.946012671967145, "train/grad_norm": 0.1513671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026742.6328149848, "perf/iters_per_sec": 0.966426197440617, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034740161895752, "data/tokens_consumed": 117379694592, "data/tokens_consumed_B": 117.379694592, "train/loss_slope": -1.9759689978282795e-06} {"step": 55980, "timestamp": 1778255048.5729344, "train/loss": 2.1744964122772217, "train/z_loss": 0.0013754165265709162, "train/perplexity": 8.79775356575881, "train/grad_norm": 0.11767578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028943.9359790534, "perf/iters_per_sec": 0.967475860585715, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0336175203323363, "data/tokens_consumed": 117400666112, "data/tokens_consumed_B": 117.400666112, "train/loss_slope": 5.391605115673665e-07} {"step": 55990, "timestamp": 1778255058.9127166, "train/loss": 2.1593086957931518, "train/z_loss": 0.0013744085910730064, "train/perplexity": 8.665145336019501, "train/grad_norm": 0.1767578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029178.7611473312, "perf/iters_per_sec": 0.9675878339516312, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0334979057312013, "data/tokens_consumed": 117421637632, "data/tokens_consumed_B": 117.421637632, "train/loss_slope": 1.8463469324189245e-06} {"step": 56000, "timestamp": 1778255069.2508683, "grad/layer_0/attn": 0.0026541920378804207, "grad/layer_0/mlp": 0.0029663885943591595, "grad/layer_0/attn_mlp_ratio": 0.8947553107007159, "grad/layer_4/attn": 0.0019889965187758207, "grad/layer_4/mlp": 0.002596077974885702, "grad/layer_4/attn_mlp_ratio": 0.7661543533752891, "grad/layer_8/attn": 0.009248600341379642, "grad/layer_8/mlp": 0.0037134906742721796, "grad/layer_8/attn_mlp_ratio": 2.490540815519411, "grad/layer_12/attn": 0.006464709993451834, "grad/layer_12/mlp": 0.00643891841173172, "grad/layer_12/attn_mlp_ratio": 1.0040055611315972, "grad/layer_16/attn": 0.004370441194623709, "grad/layer_16/mlp": 0.004409988410770893, "grad/layer_16/attn_mlp_ratio": 0.9910323312519762, "grad/layer_20/attn": 0.008154206909239292, "grad/layer_20/mlp": 0.006022490095347166, "grad/layer_20/attn_mlp_ratio": 1.3539593498282556, "grad/layer_24/attn": 0.009134916588664055, "grad/layer_24/mlp": 0.01018934790045023, "grad/layer_24/attn_mlp_ratio": 0.896516301951844, "grad/layer_27/attn": 0.004201334901154041, "grad/layer_27/mlp": 0.0099166976287961, "grad/layer_27/attn_mlp_ratio": 0.42366269660052336} {"step": 56000, "timestamp": 1778255069.2651682, "train/loss": 2.129991149902344, "train/z_loss": 0.001387408934533596, "train/perplexity": 8.414792339376627, "train/grad_norm": 0.17578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027315.3240849152, "perf/iters_per_sec": 0.9666992779182983, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344478607177734, "data/tokens_consumed": 117442609152, "data/tokens_consumed_B": 117.442609152, "train/loss_slope": 1.2484264202100748e-07} {"step": 56000, "timestamp": 1778255076.546765, "geo/ww_alpha_mean": 7.5055579752589345, "geo/ww_alpha_std": 4.024050578172456, "geo/ww_alpha_min": 1.349580020138891, "geo/ww_alpha_max": 22.81931033673944, "geo/ww_alpha_healthy_frac": 0.16243654822335024, "geo/ww_alpha_by_type/q_proj": 4.024889209015581, "geo/ww_alpha_by_type/k_proj": 4.472544744542373, "geo/ww_alpha_by_type/v_proj": 9.167045597798952, "geo/ww_alpha_by_type/o_proj": 7.754598233687199, "geo/ww_alpha_by_type/gate_proj": 8.043664184451597, "geo/ww_alpha_by_type/up_proj": 10.807803636699004, "geo/ww_alpha_by_type/down_proj": 8.360654168642574, "geo/twonn_id/layer_0": 0.782241702079773, "geo/twonn_id/layer_7": 2.952169895172119, "geo/twonn_id/layer_14": 4.1195902824401855, "geo/twonn_id/layer_21": 7.163875579833984, "geo/twonn_id/layer_27": 4.851899147033691, "geo/tier2_time_s": 7.27306056022644} {"step": 56000, "timestamp": 1778255077.3020933, "eoc/jacobian_sigma/layer_0/attn": 1157.382080078125, "eoc/jacobian_sigma/layer_0/mlp": 8832.3447265625, "eoc/jacobian_sigma/layer_0": 8832.3447265625, "eoc/jacobian_sigma/layer_7/attn": 1.1497746706008911, "eoc/jacobian_sigma/layer_7/mlp": 1.809735655784607, "eoc/jacobian_sigma/layer_7": 1.809735655784607, "eoc/jacobian_sigma/layer_14/attn": 1.4977234601974487, "eoc/jacobian_sigma/layer_14/mlp": 4.851278305053711, "eoc/jacobian_sigma/layer_14": 4.851278305053711, "eoc/jacobian_sigma/layer_21/attn": 1.0727356672286987, "eoc/jacobian_sigma/layer_21/mlp": 4.2406086921691895, "eoc/jacobian_sigma/layer_21": 4.2406086921691895, "eoc/jacobian_sigma/layer_27/attn": 2.8754773139953613, "eoc/jacobian_sigma/layer_27/mlp": 31.23587989807129, "eoc/jacobian_sigma/layer_27": 31.23587989807129, "eoc/layer0_sigma": 8832.3447265625, "eoc/sigma_max": 31.23587989807129, "eoc/sigma_min": 1.809735655784607, "eoc/sigma_mean": 10.534375637769699, "eoc/time_s": 0.7487990856170654} {"step": 56010, "timestamp": 1778255087.6698747, "train/loss": 2.1499485969543457, "train/z_loss": 0.0013856444391421973, "train/perplexity": 8.584417120651343, "train/grad_norm": 0.10009765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1139922.1678560667, "perf/iters_per_sec": 0.5435572470932325, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.8397326231002809, "data/tokens_consumed": 117463580672, "data/tokens_consumed_B": 117.463580672, "train/loss_slope": 1.2427152401328751e-06} {"step": 56020, "timestamp": 1778255098.0235922, "train/loss": 2.1191635608673094, "train/z_loss": 0.0013887510634958745, "train/perplexity": 8.324171911819393, "train/grad_norm": 0.134765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027011.8414880598, "perf/iters_per_sec": 0.9665545661392497, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034602737426758, "data/tokens_consumed": 117484552192, "data/tokens_consumed_B": 117.484552192, "train/loss_slope": -2.109654744466145e-06} {"step": 56025, "timestamp": 1778255103.7859256, "eos/sharpness": 53.47406864166259, "eos/L0_probe": 1.978588581085205, "eos/L_plus": 2.2143971920013428, "eos/L_minus": 2.2775206565856934, "eos/grad_norm": 0.13150052726268768, "eos/embed_grad_frac": 0.11755570024251938, "eos/time_s": 0.5997867584228516} {"step": 56025, "timestamp": 1778255105.166678, "geo/rankme_last": 439.255859375, "geo/layer_0/stable_rank_q_proj": 19.620311737060547, "geo/layer_0/stable_rank_k_proj": 16.209091186523438, "geo/layer_0/stable_rank_o_proj": 47.62528991699219, "geo/layer_0/stable_rank_gate_proj": 132.67005920410156, "geo/layer_0/stable_rank_down_proj": 54.27594757080078, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.062373917549848557, "geo/layer_0/attn_entropy_mean": 6.16822624206543, "geo/layer_0/attn_entropy_std": 0.40603259205818176, "geo/layer_7/stable_rank_q_proj": 43.1699104309082, "geo/layer_7/stable_rank_k_proj": 41.64849090576172, "geo/layer_7/stable_rank_o_proj": 92.97273254394531, "geo/layer_7/stable_rank_gate_proj": 84.43338775634766, "geo/layer_7/stable_rank_down_proj": 141.53111267089844, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4710457921028137, "geo/layer_7/attn_entropy_mean": 4.676083564758301, "geo/layer_7/attn_entropy_std": 0.7819517254829407, "geo/layer_14/stable_rank_q_proj": 52.158729553222656, "geo/layer_14/stable_rank_k_proj": 39.43840026855469, "geo/layer_14/stable_rank_o_proj": 44.53278350830078, "geo/layer_14/stable_rank_gate_proj": 72.16268157958984, "geo/layer_14/stable_rank_down_proj": 129.2228240966797, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3870198130607605, "geo/layer_14/attn_entropy_mean": 5.520874977111816, "geo/layer_14/attn_entropy_std": 0.38127124309539795, "geo/layer_21/stable_rank_q_proj": 40.85487365722656, "geo/layer_21/stable_rank_k_proj": 30.50261116027832, "geo/layer_21/stable_rank_o_proj": 71.92098236083984, "geo/layer_21/stable_rank_gate_proj": 67.57669830322266, "geo/layer_21/stable_rank_down_proj": 52.6610107421875, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14256343245506287, "geo/layer_21/attn_entropy_mean": 5.7051568031311035, "geo/layer_21/attn_entropy_std": 0.3013051748275757, "geo/layer_27/stable_rank_q_proj": 43.3618049621582, "geo/layer_27/stable_rank_k_proj": 31.579933166503906, "geo/layer_27/stable_rank_o_proj": 115.67190551757812, "geo/layer_27/stable_rank_gate_proj": 81.26731872558594, "geo/layer_27/stable_rank_down_proj": 129.5489501953125, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09331652522087097, "geo/layer_27/attn_entropy_mean": 4.211746692657471, "geo/layer_27/attn_entropy_std": 0.7150477766990662, "attnres/final_alpha/block_0": 0.23745101690292358, "attnres/block_norm/0": 1.754143238067627, "attnres/final_alpha/block_1": 0.004764714278280735, "attnres/block_norm/1": 44964.9296875, "attnres/final_alpha/block_2": 0.010492473840713501, "attnres/block_norm/2": 28069.0625, "attnres/final_alpha/block_3": 0.012315453961491585, "attnres/block_norm/3": 55124.28125, "attnres/final_alpha/block_4": 0.014356309548020363, "attnres/block_norm/4": 14504.6748046875, "attnres/final_alpha/block_5": 0.6085522174835205, "attnres/block_norm/5": 6515.87744140625, "attnres/final_alpha/block_6": 0.11206784844398499, "attnres/block_norm/6": 36169.80859375, "geo/tier1_time_s": 1.3627879619598389, "geo/step": 56025.0, "geo/rankme_slope": -3.9982223358093236e-05} {"step": 56030, "timestamp": 1778255110.3436337, "train/loss": 2.238483428955078, "train/z_loss": 0.0013657239032909274, "train/perplexity": 9.379096430011526, "train/grad_norm": 0.1923828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1702892.9673796452, "perf/iters_per_sec": 0.8120026432893969, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.231523084640503, "data/tokens_consumed": 117505523712, "data/tokens_consumed_B": 117.505523712, "train/loss_slope": 2.55454355555184e-06} {"step": 56040, "timestamp": 1778255120.692369, "train/loss": 2.1175742387771606, "train/z_loss": 0.0013957970775663852, "train/perplexity": 8.310952629149323, "train/grad_norm": 0.146484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028681.5128499446, "perf/iters_per_sec": 0.967350727486584, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0337512254714967, "data/tokens_consumed": 117526495232, "data/tokens_consumed_B": 117.526495232, "train/loss_slope": -1.5137479572084603e-06} {"step": 56050, "timestamp": 1778255131.0252616, "grad/layer_0/attn": 0.0028067752718925476, "grad/layer_0/mlp": 0.00294059538282454, "grad/layer_0/attn_mlp_ratio": 0.9544921388495592, "grad/layer_4/attn": 0.0038088487926870584, "grad/layer_4/mlp": 0.0025056672748178244, "grad/layer_4/attn_mlp_ratio": 1.5200935411325225, "grad/layer_8/attn": 0.009280937723815441, "grad/layer_8/mlp": 0.003956048283725977, "grad/layer_8/attn_mlp_ratio": 2.3460121878171405, "grad/layer_12/attn": 0.0044267126359045506, "grad/layer_12/mlp": 0.006471552886068821, "grad/layer_12/attn_mlp_ratio": 0.6840263296049409, "grad/layer_16/attn": 0.00476433988660574, "grad/layer_16/mlp": 0.004537077154964209, "grad/layer_16/attn_mlp_ratio": 1.0500900952023406, "grad/layer_20/attn": 0.004432325251400471, "grad/layer_20/mlp": 0.005747347604483366, "grad/layer_20/attn_mlp_ratio": 0.7711949022925692, "grad/layer_24/attn": 0.008302850648760796, "grad/layer_24/mlp": 0.008730038069188595, "grad/layer_24/attn_mlp_ratio": 0.9510669355449676, "grad/layer_27/attn": 0.006231635808944702, "grad/layer_27/mlp": 0.007624107412993908, "grad/layer_27/attn_mlp_ratio": 0.817359382501363} {"step": 56050, "timestamp": 1778255131.0394404, "train/loss": 2.208426594734192, "train/z_loss": 0.0013715588836930692, "train/perplexity": 9.101384954402974, "train/grad_norm": 0.12255859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028272.6181268375, "perf/iters_per_sec": 0.9671557512888134, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0339596271514893, "data/tokens_consumed": 117547466752, "data/tokens_consumed_B": 117.547466752, "train/loss_slope": -1.824736630920416e-07} {"step": 56060, "timestamp": 1778255141.3864756, "train/loss": 2.144256889820099, "train/z_loss": 0.001373952312860638, "train/perplexity": 8.535695917421418, "train/grad_norm": 0.10009765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028178.475468304, "perf/iters_per_sec": 0.9671108605710526, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340076208114624, "data/tokens_consumed": 117568438272, "data/tokens_consumed_B": 117.568438272, "train/loss_slope": 4.716101545418881e-07} {"step": 56070, "timestamp": 1778255151.733298, "train/loss": 2.1273082971572874, "train/z_loss": 0.0013821543310768903, "train/perplexity": 8.392246947156465, "train/grad_norm": 0.16015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028316.4888783207, "perf/iters_per_sec": 0.9671766704932788, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0339372634887696, "data/tokens_consumed": 117589409792, "data/tokens_consumed_B": 117.589409792, "train/loss_slope": 8.462686063718678e-07} {"step": 56080, "timestamp": 1778255162.0822496, "train/loss": 2.196986699104309, "train/z_loss": 0.0013742708717472852, "train/perplexity": 8.997859350529856, "train/grad_norm": 0.09423828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028004.8980107156, "perf/iters_per_sec": 0.967028092389448, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034096121788025, "data/tokens_consumed": 117610381312, "data/tokens_consumed_B": 117.610381312, "train/loss_slope": 1.4943577168118323e-06} {"step": 56090, "timestamp": 1778255172.4323876, "train/loss": 2.1544764757156374, "train/z_loss": 0.0013910433277487754, "train/perplexity": 8.62337445108699, "train/grad_norm": 0.123046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027912.790576069, "perf/iters_per_sec": 0.9669841721420617, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034143090248108, "data/tokens_consumed": 117631352832, "data/tokens_consumed_B": 117.631352832, "train/loss_slope": 4.5627647715218265e-06} {"step": 56100, "timestamp": 1778255182.7695289, "grad/layer_0/attn": 0.002265767427161336, "grad/layer_0/mlp": 0.0026658172719180584, "grad/layer_0/attn_mlp_ratio": 0.8499334766999114, "grad/layer_4/attn": 0.001978124026209116, "grad/layer_4/mlp": 0.002516412176191807, "grad/layer_4/attn_mlp_ratio": 0.7860890065290473, "grad/layer_8/attn": 0.005048566497862339, "grad/layer_8/mlp": 0.0035525737330317497, "grad/layer_8/attn_mlp_ratio": 1.4211010763297538, "grad/layer_12/attn": 0.004152448382228613, "grad/layer_12/mlp": 0.0068886298686265945, "grad/layer_12/attn_mlp_ratio": 0.6027974214234791, "grad/layer_16/attn": 0.006406890228390694, "grad/layer_16/mlp": 0.004374044947326183, "grad/layer_16/attn_mlp_ratio": 1.4647517707452904, "grad/layer_20/attn": 0.004352951887995005, "grad/layer_20/mlp": 0.005512932315468788, "grad/layer_20/attn_mlp_ratio": 0.7895891986234072, "grad/layer_24/attn": 0.007897191680967808, "grad/layer_24/mlp": 0.008273757062852383, "grad/layer_24/attn_mlp_ratio": 0.9544867616401309, "grad/layer_27/attn": 0.007953832857310772, "grad/layer_27/mlp": 0.00685040932148695, "grad/layer_27/attn_mlp_ratio": 1.1610740859316864} {"step": 56100, "timestamp": 1778255183.3574436, "eos/sharpness": 55.480766296386705, "eos/L0_probe": 1.9792689085006714, "eos/L_plus": 2.313183307647705, "eos/L_minus": 2.200162172317505, "eos/grad_norm": 0.12162893265485764, "eos/embed_grad_frac": 0.13838306069374084, "eos/time_s": 0.5849134922027588} {"step": 56100, "timestamp": 1778255183.376841, "train/loss": 2.1645331382751465, "train/z_loss": 0.0013863599509932102, "train/perplexity": 8.7105343523379, "train/grad_norm": 0.12158203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1917476.198610067, "perf/iters_per_sec": 0.9143239014673552, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0937043190002442, "data/tokens_consumed": 117652324352, "data/tokens_consumed_B": 117.652324352, "train/loss_slope": 6.362460634567461e-06} {"step": 56100, "timestamp": 1778255184.739692, "geo/rankme_last": 438.98870849609375, "geo/layer_0/stable_rank_q_proj": 19.5712890625, "geo/layer_0/stable_rank_k_proj": 16.211454391479492, "geo/layer_0/stable_rank_o_proj": 47.535987854003906, "geo/layer_0/stable_rank_gate_proj": 132.91590881347656, "geo/layer_0/stable_rank_down_proj": 54.206687927246094, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06145942211151123, "geo/layer_0/attn_entropy_mean": 6.170933723449707, "geo/layer_0/attn_entropy_std": 0.40536749362945557, "geo/layer_7/stable_rank_q_proj": 43.256805419921875, "geo/layer_7/stable_rank_k_proj": 41.62260055541992, "geo/layer_7/stable_rank_o_proj": 92.78846740722656, "geo/layer_7/stable_rank_gate_proj": 84.42890930175781, "geo/layer_7/stable_rank_down_proj": 141.68736267089844, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4686761498451233, "geo/layer_7/attn_entropy_mean": 4.657467842102051, "geo/layer_7/attn_entropy_std": 0.8255187273025513, "geo/layer_14/stable_rank_q_proj": 52.066654205322266, "geo/layer_14/stable_rank_k_proj": 39.34129333496094, "geo/layer_14/stable_rank_o_proj": 44.49335479736328, "geo/layer_14/stable_rank_gate_proj": 72.12613677978516, "geo/layer_14/stable_rank_down_proj": 129.24551391601562, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3938123881816864, "geo/layer_14/attn_entropy_mean": 5.555876731872559, "geo/layer_14/attn_entropy_std": 0.3942449390888214, "geo/layer_21/stable_rank_q_proj": 40.79301834106445, "geo/layer_21/stable_rank_k_proj": 30.365367889404297, "geo/layer_21/stable_rank_o_proj": 71.98208618164062, "geo/layer_21/stable_rank_gate_proj": 67.52993774414062, "geo/layer_21/stable_rank_down_proj": 52.67274856567383, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14324603974819183, "geo/layer_21/attn_entropy_mean": 5.728969573974609, "geo/layer_21/attn_entropy_std": 0.3009616732597351, "geo/layer_27/stable_rank_q_proj": 43.37873840332031, "geo/layer_27/stable_rank_k_proj": 31.626441955566406, "geo/layer_27/stable_rank_o_proj": 115.54798126220703, "geo/layer_27/stable_rank_gate_proj": 81.36275482177734, "geo/layer_27/stable_rank_down_proj": 129.4379425048828, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09919934719800949, "geo/layer_27/attn_entropy_mean": 4.230254650115967, "geo/layer_27/attn_entropy_std": 0.7110362648963928, "attnres/final_alpha/block_0": 0.2383217066526413, "attnres/block_norm/0": 1.7542825937271118, "attnres/final_alpha/block_1": 0.00472384924069047, "attnres/block_norm/1": 45169.8984375, "attnres/final_alpha/block_2": 0.010473016649484634, "attnres/block_norm/2": 28028.73828125, "attnres/final_alpha/block_3": 0.01243292074650526, "attnres/block_norm/3": 55701.56640625, "attnres/final_alpha/block_4": 0.014117958024144173, "attnres/block_norm/4": 14618.0712890625, "attnres/final_alpha/block_5": 0.60870361328125, "attnres/block_norm/5": 6490.27783203125, "attnres/final_alpha/block_6": 0.11122696846723557, "attnres/block_norm/6": 36307.01953125, "geo/tier1_time_s": 1.3590073585510254, "geo/step": 56100.0, "geo/rankme_slope": -1.638381915266107e-05} {"step": 56110, "timestamp": 1778255195.0883508, "train/loss": 2.1788397550582888, "train/z_loss": 0.0013824513531289994, "train/perplexity": 8.83604832863786, "train/grad_norm": 0.1591796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1791233.4999088203, "perf/iters_per_sec": 0.8541266917747594, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1707864999771118, "data/tokens_consumed": 117673295872, "data/tokens_consumed_B": 117.673295872, "train/loss_slope": 9.328388416691998e-06} {"step": 56120, "timestamp": 1778255205.429252, "train/loss": 2.109540581703186, "train/z_loss": 0.0013817595434375107, "train/perplexity": 8.244452761988912, "train/grad_norm": 0.09912109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028992.001207602, "perf/iters_per_sec": 0.9674987798727045, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0335930347442628, "data/tokens_consumed": 117694267392, "data/tokens_consumed_B": 117.694267392, "train/loss_slope": 5.59880739927935e-06} {"step": 56130, "timestamp": 1778255215.7797086, "train/loss": 2.1519084930419923, "train/z_loss": 0.0013840196887031197, "train/perplexity": 8.60125818415726, "train/grad_norm": 0.10009765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027501.8222102085, "perf/iters_per_sec": 0.9667882071543734, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034352707862854, "data/tokens_consumed": 117715238912, "data/tokens_consumed_B": 117.715238912, "train/loss_slope": 6.080881454119826e-06} {"step": 56140, "timestamp": 1778255226.1259427, "train/loss": 2.086369526386261, "train/z_loss": 0.0013908854802139104, "train/perplexity": 8.055616309671791, "train/grad_norm": 0.1318359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028321.212814418, "perf/iters_per_sec": 0.967178923041543, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0339348554611205, "data/tokens_consumed": 117736210432, "data/tokens_consumed_B": 117.736210432, "train/loss_slope": 4.710420759597626e-06} {"step": 56150, "timestamp": 1778255236.4594615, "grad/layer_0/attn": 0.00263651879504323, "grad/layer_0/mlp": 0.0029003266245126724, "grad/layer_0/attn_mlp_ratio": 0.9090419961172586, "grad/layer_4/attn": 0.0022385420743376017, "grad/layer_4/mlp": 0.002529030665755272, "grad/layer_4/attn_mlp_ratio": 0.885138332300628, "grad/layer_8/attn": 0.0034594659227877855, "grad/layer_8/mlp": 0.003630751511082053, "grad/layer_8/attn_mlp_ratio": 0.9528236280963268, "grad/layer_12/attn": 0.0051116785034537315, "grad/layer_12/mlp": 0.006734158843755722, "grad/layer_12/attn_mlp_ratio": 0.7590671004570746, "grad/layer_16/attn": 0.004098437260836363, "grad/layer_16/mlp": 0.004981146659702063, "grad/layer_16/attn_mlp_ratio": 0.8227899033196732, "grad/layer_20/attn": 0.003469026181846857, "grad/layer_20/mlp": 0.006603419315069914, "grad/layer_20/attn_mlp_ratio": 0.5253378535868949, "grad/layer_24/attn": 0.01312599889934063, "grad/layer_24/mlp": 0.01099016610532999, "grad/layer_24/attn_mlp_ratio": 1.194340345187392, "grad/layer_27/attn": 0.006336419843137264, "grad/layer_27/mlp": 0.010313398204743862, "grad/layer_27/attn_mlp_ratio": 0.614387193813963} {"step": 56150, "timestamp": 1778255236.4736936, "train/loss": 2.147033715248108, "train/z_loss": 0.0013687475235201418, "train/perplexity": 8.559430993721081, "train/grad_norm": 0.1953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027638.995697698, "perf/iters_per_sec": 0.9668536165703288, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342827320098877, "data/tokens_consumed": 117757181952, "data/tokens_consumed_B": 117.757181952, "train/loss_slope": 4.00079088528663e-06} {"step": 56160, "timestamp": 1778255246.8282065, "train/loss": 2.1493026852607726, "train/z_loss": 0.0013802704750560224, "train/perplexity": 8.578874135582755, "train/grad_norm": 0.2421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026754.77462246, "perf/iters_per_sec": 0.9664319871055889, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347339630126953, "data/tokens_consumed": 117778153472, "data/tokens_consumed_B": 117.778153472, "train/loss_slope": 3.1836487672509626e-06} {"step": 56170, "timestamp": 1778255257.1765583, "train/loss": 2.190620756149292, "train/z_loss": 0.0013722206465899944, "train/perplexity": 8.940761425080662, "train/grad_norm": 0.19921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027939.4400903664, "perf/iters_per_sec": 0.9669968796207268, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341295003890991, "data/tokens_consumed": 117799124992, "data/tokens_consumed_B": 117.799124992, "train/loss_slope": 6.94543530624118e-06} {"step": 56175, "timestamp": 1778255262.927936, "eos/sharpness": 72.5687265396118, "eos/L0_probe": 1.9849765300750732, "eos/L_plus": 2.4100873470306396, "eos/L_minus": 2.285552978515625, "eos/grad_norm": 0.2233421504497528, "eos/embed_grad_frac": 0.04513513296842575, "eos/time_s": 0.5861456394195557} {"step": 56175, "timestamp": 1778255264.3062627, "geo/rankme_last": 439.5060119628906, "geo/layer_0/stable_rank_q_proj": 19.544666290283203, "geo/layer_0/stable_rank_k_proj": 16.192668914794922, "geo/layer_0/stable_rank_o_proj": 47.4497184753418, "geo/layer_0/stable_rank_gate_proj": 132.77110290527344, "geo/layer_0/stable_rank_down_proj": 54.19662094116211, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06323439627885818, "geo/layer_0/attn_entropy_mean": 6.17010498046875, "geo/layer_0/attn_entropy_std": 0.40926411747932434, "geo/layer_7/stable_rank_q_proj": 43.33889389038086, "geo/layer_7/stable_rank_k_proj": 41.571937561035156, "geo/layer_7/stable_rank_o_proj": 92.86029052734375, "geo/layer_7/stable_rank_gate_proj": 84.39727020263672, "geo/layer_7/stable_rank_down_proj": 141.3224639892578, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4726197123527527, "geo/layer_7/attn_entropy_mean": 4.656156539916992, "geo/layer_7/attn_entropy_std": 0.8004802465438843, "geo/layer_14/stable_rank_q_proj": 52.08778381347656, "geo/layer_14/stable_rank_k_proj": 39.423885345458984, "geo/layer_14/stable_rank_o_proj": 44.44328308105469, "geo/layer_14/stable_rank_gate_proj": 72.19661712646484, "geo/layer_14/stable_rank_down_proj": 129.06834411621094, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3864521384239197, "geo/layer_14/attn_entropy_mean": 5.534969329833984, "geo/layer_14/attn_entropy_std": 0.3974236249923706, "geo/layer_21/stable_rank_q_proj": 40.85737228393555, "geo/layer_21/stable_rank_k_proj": 30.383573532104492, "geo/layer_21/stable_rank_o_proj": 71.98400115966797, "geo/layer_21/stable_rank_gate_proj": 67.46227264404297, "geo/layer_21/stable_rank_down_proj": 52.61056900024414, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14884939789772034, "geo/layer_21/attn_entropy_mean": 5.699717998504639, "geo/layer_21/attn_entropy_std": 0.29646989703178406, "geo/layer_27/stable_rank_q_proj": 43.329097747802734, "geo/layer_27/stable_rank_k_proj": 31.632400512695312, "geo/layer_27/stable_rank_o_proj": 115.57451629638672, "geo/layer_27/stable_rank_gate_proj": 81.3074722290039, "geo/layer_27/stable_rank_down_proj": 129.95452880859375, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09569292515516281, "geo/layer_27/attn_entropy_mean": 4.216439247131348, "geo/layer_27/attn_entropy_std": 0.7289914488792419, "attnres/final_alpha/block_0": 0.23563258349895477, "attnres/block_norm/0": 1.7543185949325562, "attnres/final_alpha/block_1": 0.004680377896875143, "attnres/block_norm/1": 45102.26953125, "attnres/final_alpha/block_2": 0.010241711512207985, "attnres/block_norm/2": 28045.4140625, "attnres/final_alpha/block_3": 0.012094368226826191, "attnres/block_norm/3": 55611.00390625, "attnres/final_alpha/block_4": 0.013977734372019768, "attnres/block_norm/4": 14616.1689453125, "attnres/final_alpha/block_5": 0.6135894656181335, "attnres/block_norm/5": 6419.0625, "attnres/final_alpha/block_6": 0.10978374630212784, "attnres/block_norm/6": 36496.19140625, "geo/tier1_time_s": 1.3590455055236816, "geo/step": 56175.0, "geo/rankme_slope": -1.0021449986244492e-05} {"step": 56180, "timestamp": 1778255269.4797359, "train/loss": 2.0884886145591737, "train/z_loss": 0.0013914453564211726, "train/perplexity": 8.072704970714122, "train/grad_norm": 0.259765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1705259.5538517444, "perf/iters_per_sec": 0.8131311196573946, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2298139572143554, "data/tokens_consumed": 117820096512, "data/tokens_consumed_B": 117.820096512, "train/loss_slope": 2.5513116354129155e-07} {"step": 56190, "timestamp": 1778255279.8251746, "train/loss": 2.1129143953323366, "train/z_loss": 0.0013633311842568218, "train/perplexity": 8.272314983624996, "train/grad_norm": 0.11181640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028098.0425643844, "perf/iters_per_sec": 0.9670725071737215, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340486288070678, "data/tokens_consumed": 117841068032, "data/tokens_consumed_B": 117.841068032, "train/loss_slope": -2.519372709155833e-06} {"step": 56200, "timestamp": 1778255290.164983, "grad/layer_0/attn": 0.002866957103833556, "grad/layer_0/mlp": 0.003156704129651189, "grad/layer_0/attn_mlp_ratio": 0.9082121400237585, "grad/layer_4/attn": 0.0028639035299420357, "grad/layer_4/mlp": 0.0025997976772487164, "grad/layer_4/attn_mlp_ratio": 1.1015870368859282, "grad/layer_8/attn": 0.007742204237729311, "grad/layer_8/mlp": 0.0039156898856163025, "grad/layer_8/attn_mlp_ratio": 1.9772260485812547, "grad/layer_12/attn": 0.006684636231511831, "grad/layer_12/mlp": 0.0065367380157113075, "grad/layer_12/attn_mlp_ratio": 1.0226256755559848, "grad/layer_16/attn": 0.003945566713809967, "grad/layer_16/mlp": 0.005328765604645014, "grad/layer_16/attn_mlp_ratio": 0.7404278837725348, "grad/layer_20/attn": 0.006827977951616049, "grad/layer_20/mlp": 0.008072099648416042, "grad/layer_20/attn_mlp_ratio": 0.8458738326364064, "grad/layer_24/attn": 0.020738381892442703, "grad/layer_24/mlp": 0.014961685054004192, "grad/layer_24/attn_mlp_ratio": 1.386099338341744, "grad/layer_27/attn": 0.015382949262857437, "grad/layer_27/mlp": 0.015358353964984417, "grad/layer_27/attn_mlp_ratio": 1.0016014214654092} {"step": 56200, "timestamp": 1778255290.179307, "train/loss": 2.12883460521698, "train/z_loss": 0.0013739933841861785, "train/perplexity": 8.405065881643752, "train/grad_norm": 0.3515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026688.6969287535, "perf/iters_per_sec": 0.9664004788059013, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347676992416381, "data/tokens_consumed": 117862039552, "data/tokens_consumed_B": 117.862039552, "train/loss_slope": -6.380166379388936e-06} {"step": 56210, "timestamp": 1778255300.5231776, "train/loss": 2.115606737136841, "train/z_loss": 0.0013897526543587447, "train/perplexity": 8.294616891783607, "train/grad_norm": 0.09619140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028934.3419266266, "perf/iters_per_sec": 0.9674712857850202, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033622407913208, "data/tokens_consumed": 117883011072, "data/tokens_consumed_B": 117.883011072, "train/loss_slope": -1.151873641925424e-05} {"step": 56220, "timestamp": 1778255310.8748379, "train/loss": 2.125391948223114, "train/z_loss": 0.0013839543214999139, "train/perplexity": 8.376179873640615, "train/grad_norm": 0.083984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026858.826537652, "perf/iters_per_sec": 0.9664816029251346, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346808433532715, "data/tokens_consumed": 117903982592, "data/tokens_consumed_B": 117.903982592, "train/loss_slope": -1.5184636721194687e-05} {"step": 56230, "timestamp": 1778255321.2212245, "train/loss": 2.152370476722717, "train/z_loss": 0.0013836900703608989, "train/perplexity": 8.605232743092035, "train/grad_norm": 0.2333984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028332.0171470183, "perf/iters_per_sec": 0.9671840749487964, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0339293479919434, "data/tokens_consumed": 117924954112, "data/tokens_consumed_B": 117.924954112, "train/loss_slope": -1.3489772606067672e-05} {"step": 56240, "timestamp": 1778255331.5652425, "train/loss": 2.144785833358765, "train/z_loss": 0.0013992607942782342, "train/perplexity": 8.540212012899424, "train/grad_norm": 0.1689453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028710.522055086, "perf/iters_per_sec": 0.9673645601535253, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0337364435195924, "data/tokens_consumed": 117945925632, "data/tokens_consumed_B": 117.945925632, "train/loss_slope": -1.1881361857499206e-05} {"step": 56250, "timestamp": 1778255341.8974512, "grad/layer_0/attn": 0.002601754618808627, "grad/layer_0/mlp": 0.0026410361751914024, "grad/layer_0/attn_mlp_ratio": 0.9851264230060874, "grad/layer_4/attn": 0.002140900818631053, "grad/layer_4/mlp": 0.002600159728899598, "grad/layer_4/attn_mlp_ratio": 0.8233727768716025, "grad/layer_8/attn": 0.00788495410233736, "grad/layer_8/mlp": 0.0036758892238140106, "grad/layer_8/attn_mlp_ratio": 2.1450466561262322, "grad/layer_12/attn": 0.006347482092678547, "grad/layer_12/mlp": 0.006829031277447939, "grad/layer_12/attn_mlp_ratio": 0.929484979910965, "grad/layer_16/attn": 0.005146407522261143, "grad/layer_16/mlp": 0.004587889648973942, "grad/layer_16/attn_mlp_ratio": 1.1217374008196495, "grad/layer_20/attn": 0.0026722627226263285, "grad/layer_20/mlp": 0.00515281967818737, "grad/layer_20/attn_mlp_ratio": 0.5186020155291288, "grad/layer_24/attn": 0.010954229161143303, "grad/layer_24/mlp": 0.007070621009916067, "grad/layer_24/attn_mlp_ratio": 1.5492598161964497, "grad/layer_27/attn": 0.004878375679254532, "grad/layer_27/mlp": 0.006368544884026051, "grad/layer_27/attn_mlp_ratio": 0.7660110263004742} {"step": 56250, "timestamp": 1778255342.4853144, "eos/sharpness": 11.998534202575682, "eos/L0_probe": 1.9820854663848877, "eos/L_plus": 2.048271656036377, "eos/L_minus": 2.0358846187591553, "eos/grad_norm": 0.08392733335494995, "eos/embed_grad_frac": 0.282177597284317, "eos/time_s": 0.5850799083709717} {"step": 56250, "timestamp": 1778255342.5050428, "train/loss": 2.1766127824783323, "train/z_loss": 0.0013769149663858116, "train/perplexity": 8.816392585818365, "train/grad_norm": 0.083984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1917724.0593463478, "perf/iters_per_sec": 0.9144420906764735, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0935629606246948, "data/tokens_consumed": 117966897152, "data/tokens_consumed_B": 117.966897152, "train/loss_slope": -1.2283331304207368e-05} {"step": 56250, "timestamp": 1778255343.869264, "geo/rankme_last": 438.913330078125, "geo/layer_0/stable_rank_q_proj": 19.546443939208984, "geo/layer_0/stable_rank_k_proj": 16.21013069152832, "geo/layer_0/stable_rank_o_proj": 47.452308654785156, "geo/layer_0/stable_rank_gate_proj": 132.59617614746094, "geo/layer_0/stable_rank_down_proj": 54.23967742919922, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06109831482172012, "geo/layer_0/attn_entropy_mean": 6.171109199523926, "geo/layer_0/attn_entropy_std": 0.40759000182151794, "geo/layer_7/stable_rank_q_proj": 43.41642761230469, "geo/layer_7/stable_rank_k_proj": 41.41444396972656, "geo/layer_7/stable_rank_o_proj": 92.78402709960938, "geo/layer_7/stable_rank_gate_proj": 84.30786895751953, "geo/layer_7/stable_rank_down_proj": 141.15928649902344, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4689813554286957, "geo/layer_7/attn_entropy_mean": 4.641364097595215, "geo/layer_7/attn_entropy_std": 0.8093754649162292, "geo/layer_14/stable_rank_q_proj": 52.013916015625, "geo/layer_14/stable_rank_k_proj": 39.55364990234375, "geo/layer_14/stable_rank_o_proj": 44.38113784790039, "geo/layer_14/stable_rank_gate_proj": 72.27201080322266, "geo/layer_14/stable_rank_down_proj": 128.99609375, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39967212080955505, "geo/layer_14/attn_entropy_mean": 5.518444538116455, "geo/layer_14/attn_entropy_std": 0.3927273750305176, "geo/layer_21/stable_rank_q_proj": 40.770328521728516, "geo/layer_21/stable_rank_k_proj": 30.34041976928711, "geo/layer_21/stable_rank_o_proj": 71.81160736083984, "geo/layer_21/stable_rank_gate_proj": 67.36846160888672, "geo/layer_21/stable_rank_down_proj": 52.60735321044922, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14584985375404358, "geo/layer_21/attn_entropy_mean": 5.694753170013428, "geo/layer_21/attn_entropy_std": 0.31166306138038635, "geo/layer_27/stable_rank_q_proj": 43.26731872558594, "geo/layer_27/stable_rank_k_proj": 31.631139755249023, "geo/layer_27/stable_rank_o_proj": 115.38290405273438, "geo/layer_27/stable_rank_gate_proj": 81.34302520751953, "geo/layer_27/stable_rank_down_proj": 130.0255126953125, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09106569737195969, "geo/layer_27/attn_entropy_mean": 4.187335014343262, "geo/layer_27/attn_entropy_std": 0.7108429074287415, "attnres/final_alpha/block_0": 0.23650899529457092, "attnres/block_norm/0": 1.7544069290161133, "attnres/final_alpha/block_1": 0.004759840667247772, "attnres/block_norm/1": 45131.5859375, "attnres/final_alpha/block_2": 0.010305779054760933, "attnres/block_norm/2": 28020.5859375, "attnres/final_alpha/block_3": 0.01230589859187603, "attnres/block_norm/3": 55271.3515625, "attnres/final_alpha/block_4": 0.014349982142448425, "attnres/block_norm/4": 14605.33984375, "attnres/final_alpha/block_5": 0.6108608841896057, "attnres/block_norm/5": 6559.66162109375, "attnres/final_alpha/block_6": 0.1109086275100708, "attnres/block_norm/6": 36529.5625, "geo/tier1_time_s": 1.360386610031128, "geo/step": 56250.0, "geo/rankme_slope": -7.139222876650658e-06} {"step": 56260, "timestamp": 1778255354.2123513, "train/loss": 2.1618033170700075, "train/z_loss": 0.0013864334556274115, "train/perplexity": 8.686788576561895, "train/grad_norm": 0.1171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1791882.4351375874, "perf/iters_per_sec": 0.8544361282051026, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1703624963760375, "data/tokens_consumed": 117987868672, "data/tokens_consumed_B": 117.987868672, "train/loss_slope": -1.0214878909765951e-05} {"step": 56270, "timestamp": 1778255364.5586717, "train/loss": 2.204301118850708, "train/z_loss": 0.001360199530608952, "train/perplexity": 9.063914754613887, "train/grad_norm": 0.146484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028263.1239602696, "perf/iters_per_sec": 0.9671512241174076, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033964467048645, "data/tokens_consumed": 118008840192, "data/tokens_consumed_B": 118.008840192, "train/loss_slope": -5.705132907909761e-06} {"step": 56280, "timestamp": 1778255374.908267, "train/loss": 2.1834412813186646, "train/z_loss": 0.0013618045952171088, "train/perplexity": 8.876801328149968, "train/grad_norm": 0.201171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027241.5474031982, "perf/iters_per_sec": 0.9666640984550467, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344855070114136, "data/tokens_consumed": 118029811712, "data/tokens_consumed_B": 118.029811712, "train/loss_slope": -5.821041562984252e-06} {"step": 56290, "timestamp": 1778255385.257272, "train/loss": 2.1483684301376345, "train/z_loss": 0.0013731900951825082, "train/perplexity": 8.570863021265822, "train/grad_norm": 0.25390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027754.7312757757, "perf/iters_per_sec": 0.9669088035944823, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034223699569702, "data/tokens_consumed": 118050783232, "data/tokens_consumed_B": 118.050783232, "train/loss_slope": -8.055140326196909e-06} {"step": 56300, "timestamp": 1778255395.5922885, "grad/layer_0/attn": 0.0030003837309777737, "grad/layer_0/mlp": 0.0030456576496362686, "grad/layer_0/attn_mlp_ratio": 0.9851348961767283, "grad/layer_4/attn": 0.002218330977484584, "grad/layer_4/mlp": 0.0025675103534013033, "grad/layer_4/attn_mlp_ratio": 0.8640007578336654, "grad/layer_8/attn": 0.005354461260139942, "grad/layer_8/mlp": 0.0035504710394889116, "grad/layer_8/attn_mlp_ratio": 1.5080987986599192, "grad/layer_12/attn": 0.004434230737388134, "grad/layer_12/mlp": 0.006468384526669979, "grad/layer_12/attn_mlp_ratio": 0.6855236652293731, "grad/layer_16/attn": 0.004941969644278288, "grad/layer_16/mlp": 0.004682633094489574, "grad/layer_16/attn_mlp_ratio": 1.0553826103855188, "grad/layer_20/attn": 0.0029030274599790573, "grad/layer_20/mlp": 0.006040425039827824, "grad/layer_20/attn_mlp_ratio": 0.4805998572580283, "grad/layer_24/attn": 0.0060040391981601715, "grad/layer_24/mlp": 0.0075437333434820175, "grad/layer_24/attn_mlp_ratio": 0.7958975808388111, "grad/layer_27/attn": 0.0060658929869532585, "grad/layer_27/mlp": 0.006402337457984686, "grad/layer_27/attn_mlp_ratio": 0.9474497294176828} {"step": 56300, "timestamp": 1778255395.606296, "train/loss": 2.098279869556427, "train/z_loss": 0.0013903772924095392, "train/perplexity": 8.152135109400323, "train/grad_norm": 0.0986328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027947.1545541757, "perf/iters_per_sec": 0.9670005581637267, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034125566482544, "data/tokens_consumed": 118071754752, "data/tokens_consumed_B": 118.071754752, "train/loss_slope": -1.3446502387970719e-05} {"step": 56310, "timestamp": 1778255405.9520156, "train/loss": 2.1795737504959107, "train/z_loss": 0.001380650035571307, "train/perplexity": 8.842536328587585, "train/grad_norm": 0.109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028050.3469951453, "perf/iters_per_sec": 0.9670497641540267, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340729475021362, "data/tokens_consumed": 118092726272, "data/tokens_consumed_B": 118.092726272, "train/loss_slope": -9.014391076482518e-06} {"step": 56320, "timestamp": 1778255416.30307, "train/loss": 2.1561777114868166, "train/z_loss": 0.0013772222795523704, "train/perplexity": 8.638057330149726, "train/grad_norm": 0.2177734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027474.3897398894, "perf/iters_per_sec": 0.966775126333184, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343667030334474, "data/tokens_consumed": 118113697792, "data/tokens_consumed_B": 118.113697792, "train/loss_slope": -1.0326691400600606e-05} {"step": 56325, "timestamp": 1778255422.0495756, "eos/sharpness": 59.327745437622056, "eos/L0_probe": 1.9811491966247559, "eos/L_plus": 2.218740701675415, "eos/L_minus": 2.3368351459503174, "eos/grad_norm": 0.14797571301460266, "eos/embed_grad_frac": 0.13022427260875702, "eos/time_s": 0.5842595100402832} {"step": 56325, "timestamp": 1778255423.4228678, "geo/rankme_last": 438.512939453125, "geo/layer_0/stable_rank_q_proj": 19.549102783203125, "geo/layer_0/stable_rank_k_proj": 16.207443237304688, "geo/layer_0/stable_rank_o_proj": 47.51628494262695, "geo/layer_0/stable_rank_gate_proj": 132.6610107421875, "geo/layer_0/stable_rank_down_proj": 54.24827575683594, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06607150286436081, "geo/layer_0/attn_entropy_mean": 6.166765213012695, "geo/layer_0/attn_entropy_std": 0.4064084589481354, "geo/layer_7/stable_rank_q_proj": 43.365806579589844, "geo/layer_7/stable_rank_k_proj": 41.44548797607422, "geo/layer_7/stable_rank_o_proj": 92.68754577636719, "geo/layer_7/stable_rank_gate_proj": 84.3555679321289, "geo/layer_7/stable_rank_down_proj": 141.38063049316406, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4613911807537079, "geo/layer_7/attn_entropy_mean": 4.6520795822143555, "geo/layer_7/attn_entropy_std": 0.8040692210197449, "geo/layer_14/stable_rank_q_proj": 51.92231369018555, "geo/layer_14/stable_rank_k_proj": 39.548439025878906, "geo/layer_14/stable_rank_o_proj": 44.33234405517578, "geo/layer_14/stable_rank_gate_proj": 72.30445861816406, "geo/layer_14/stable_rank_down_proj": 128.77870178222656, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4168078899383545, "geo/layer_14/attn_entropy_mean": 5.535416603088379, "geo/layer_14/attn_entropy_std": 0.3976181149482727, "geo/layer_21/stable_rank_q_proj": 40.85795211791992, "geo/layer_21/stable_rank_k_proj": 30.349870681762695, "geo/layer_21/stable_rank_o_proj": 71.77078247070312, "geo/layer_21/stable_rank_gate_proj": 67.3006820678711, "geo/layer_21/stable_rank_down_proj": 52.49673843383789, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14500278234481812, "geo/layer_21/attn_entropy_mean": 5.703503608703613, "geo/layer_21/attn_entropy_std": 0.29497891664505005, "geo/layer_27/stable_rank_q_proj": 43.19400405883789, "geo/layer_27/stable_rank_k_proj": 31.57392120361328, "geo/layer_27/stable_rank_o_proj": 115.45480346679688, "geo/layer_27/stable_rank_gate_proj": 81.3705062866211, "geo/layer_27/stable_rank_down_proj": 129.62342834472656, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08756080269813538, "geo/layer_27/attn_entropy_mean": 4.2244720458984375, "geo/layer_27/attn_entropy_std": 0.7150501012802124, "attnres/final_alpha/block_0": 0.23850952088832855, "attnres/block_norm/0": 1.7544152736663818, "attnres/final_alpha/block_1": 0.004706704523414373, "attnres/block_norm/1": 45234.28515625, "attnres/final_alpha/block_2": 0.010401965118944645, "attnres/block_norm/2": 28116.9375, "attnres/final_alpha/block_3": 0.012480257079005241, "attnres/block_norm/3": 55144.9140625, "attnres/final_alpha/block_4": 0.014566732570528984, "attnres/block_norm/4": 14625.2578125, "attnres/final_alpha/block_5": 0.6067919135093689, "attnres/block_norm/5": 6481.48876953125, "attnres/final_alpha/block_6": 0.11254291236400604, "attnres/block_norm/6": 36557.12890625, "geo/tier1_time_s": 1.355787754058838, "geo/step": 56325.0, "geo/rankme_slope": -8.268151010404162e-07} {"step": 56330, "timestamp": 1778255428.5964963, "train/loss": 2.10045530796051, "train/z_loss": 0.0013838789658620954, "train/perplexity": 8.169888881309511, "train/grad_norm": 0.16796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1706660.1946864252, "perf/iters_per_sec": 0.813798997252667, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2288046598434448, "data/tokens_consumed": 118134669312, "data/tokens_consumed_B": 118.134669312, "train/loss_slope": -1.3314535651448723e-05} {"step": 56340, "timestamp": 1778255438.9447234, "train/loss": 2.1460176348686217, "train/z_loss": 0.0013747358112595975, "train/perplexity": 8.55073834079371, "train/grad_norm": 0.1474609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027939.2998279312, "perf/iters_per_sec": 0.9669968127383858, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341295719146728, "data/tokens_consumed": 118155640832, "data/tokens_consumed_B": 118.155640832, "train/loss_slope": -1.3143652815236479e-05} {"step": 56350, "timestamp": 1778255449.2832189, "grad/layer_0/attn": 0.003276859875768423, "grad/layer_0/mlp": 0.0029228085186332464, "grad/layer_0/attn_mlp_ratio": 1.121133917177491, "grad/layer_4/attn": 0.002826363779604435, "grad/layer_4/mlp": 0.0026293699629604816, "grad/layer_4/attn_mlp_ratio": 1.0749204987989225, "grad/layer_8/attn": 0.003384896321222186, "grad/layer_8/mlp": 0.0035914857871830463, "grad/layer_8/attn_mlp_ratio": 0.9424779680471201, "grad/layer_12/attn": 0.004511212930083275, "grad/layer_12/mlp": 0.00643859663978219, "grad/layer_12/attn_mlp_ratio": 0.700651572447428, "grad/layer_16/attn": 0.003629207843914628, "grad/layer_16/mlp": 0.004930183291435242, "grad/layer_16/attn_mlp_ratio": 0.7361202526906647, "grad/layer_20/attn": 0.0037085404619574547, "grad/layer_20/mlp": 0.006267874035984278, "grad/layer_20/attn_mlp_ratio": 0.591674367018074, "grad/layer_24/attn": 0.010457358323037624, "grad/layer_24/mlp": 0.008723241277039051, "grad/layer_24/attn_mlp_ratio": 1.1987927275018484, "grad/layer_27/attn": 0.003998253960162401, "grad/layer_27/mlp": 0.007950188592076302, "grad/layer_27/attn_mlp_ratio": 0.5029130898675818} {"step": 56350, "timestamp": 1778255449.2973483, "train/loss": 2.151893448829651, "train/z_loss": 0.0013891852577216923, "train/perplexity": 8.601128785976085, "train/grad_norm": 0.125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027039.0278796037, "perf/iters_per_sec": 0.9665675296209353, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345888614654541, "data/tokens_consumed": 118176612352, "data/tokens_consumed_B": 118.176612352, "train/loss_slope": -1.2725675188311173e-05} {"step": 56360, "timestamp": 1778255459.6421857, "train/loss": 2.112796461582184, "train/z_loss": 0.001379427162464708, "train/perplexity": 8.27133945602148, "train/grad_norm": 0.1484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028735.7888129598, "perf/iters_per_sec": 0.9673766082825469, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033723568916321, "data/tokens_consumed": 118197583872, "data/tokens_consumed_B": 118.197583872, "train/loss_slope": -1.5258514248069351e-05} {"step": 56370, "timestamp": 1778255469.9862902, "train/loss": 2.1261431217193603, "train/z_loss": 0.001388087193481624, "train/perplexity": 8.382474201731734, "train/grad_norm": 0.1494140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028662.2362240565, "perf/iters_per_sec": 0.9673415356750757, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0337610483169555, "data/tokens_consumed": 118218555392, "data/tokens_consumed_B": 118.218555392, "train/loss_slope": -1.4964584184057708e-05} {"step": 56380, "timestamp": 1778255480.3313084, "train/loss": 2.13758704662323, "train/z_loss": 0.001383530523162335, "train/perplexity": 8.478953605594194, "train/grad_norm": 0.12109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028205.7866958347, "perf/iters_per_sec": 0.9671238835791753, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033993697166443, "data/tokens_consumed": 118239526912, "data/tokens_consumed_B": 118.239526912, "train/loss_slope": -1.4338779520995578e-05} {"step": 56390, "timestamp": 1778255490.6805973, "train/loss": 2.126563882827759, "train/z_loss": 0.0013870975002646447, "train/perplexity": 8.386001962988301, "train/grad_norm": 0.2333984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027783.386752965, "perf/iters_per_sec": 0.9669224675907921, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342090845108032, "data/tokens_consumed": 118260498432, "data/tokens_consumed_B": 118.260498432, "train/loss_slope": -1.4407594104518429e-05} {"step": 56400, "timestamp": 1778255501.0279067, "grad/layer_0/attn": 0.002688503125682473, "grad/layer_0/mlp": 0.003028169507160783, "grad/layer_0/attn_mlp_ratio": 0.8878310908757909, "grad/layer_4/attn": 0.001826972933486104, "grad/layer_4/mlp": 0.0024828780442476273, "grad/layer_4/attn_mlp_ratio": 0.7358286743628014, "grad/layer_8/attn": 0.0062200105749070644, "grad/layer_8/mlp": 0.0034765806049108505, "grad/layer_8/attn_mlp_ratio": 1.7891172686199912, "grad/layer_12/attn": 0.004318746272474527, "grad/layer_12/mlp": 0.006626864429563284, "grad/layer_12/attn_mlp_ratio": 0.6517028155937173, "grad/layer_16/attn": 0.0033939590211957693, "grad/layer_16/mlp": 0.004791480954736471, "grad/layer_16/attn_mlp_ratio": 0.7083319296109448, "grad/layer_20/attn": 0.003578905016183853, "grad/layer_20/mlp": 0.006190653424710035, "grad/layer_20/attn_mlp_ratio": 0.5781142494727948, "grad/layer_24/attn": 0.009792998433113098, "grad/layer_24/mlp": 0.011179416440427303, "grad/layer_24/attn_mlp_ratio": 0.8759847526657046, "grad/layer_27/attn": 0.004500587470829487, "grad/layer_27/mlp": 0.010908283293247223, "grad/layer_27/attn_mlp_ratio": 0.4125843919324261} {"step": 56400, "timestamp": 1778255501.6149158, "eos/sharpness": 40.57245254516601, "eos/L0_probe": 1.9814783334732056, "eos/L_plus": 2.1925790309906006, "eos/L_minus": 2.1761021614074707, "eos/grad_norm": 0.14115823805332184, "eos/embed_grad_frac": 0.13705512881278992, "eos/time_s": 0.5841948986053467} {"step": 56400, "timestamp": 1778255501.6342854, "train/loss": 2.2200642108917235, "train/z_loss": 0.0013683974277228117, "train/perplexity": 9.207922095789039, "train/grad_norm": 0.140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1916215.1043748888, "perf/iters_per_sec": 0.9137225648760265, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0944241046905518, "data/tokens_consumed": 118281469952, "data/tokens_consumed_B": 118.281469952, "train/loss_slope": -7.768906797334056e-06} {"step": 56400, "timestamp": 1778255502.997786, "geo/rankme_last": 439.95556640625, "geo/layer_0/stable_rank_q_proj": 19.542577743530273, "geo/layer_0/stable_rank_k_proj": 16.18893814086914, "geo/layer_0/stable_rank_o_proj": 47.49665832519531, "geo/layer_0/stable_rank_gate_proj": 132.6322479248047, "geo/layer_0/stable_rank_down_proj": 54.299835205078125, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06174175441265106, "geo/layer_0/attn_entropy_mean": 6.1643171310424805, "geo/layer_0/attn_entropy_std": 0.40601953864097595, "geo/layer_7/stable_rank_q_proj": 43.46783447265625, "geo/layer_7/stable_rank_k_proj": 41.41115951538086, "geo/layer_7/stable_rank_o_proj": 92.75630950927734, "geo/layer_7/stable_rank_gate_proj": 84.43159484863281, "geo/layer_7/stable_rank_down_proj": 141.36883544921875, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.47492989897727966, "geo/layer_7/attn_entropy_mean": 4.656670570373535, "geo/layer_7/attn_entropy_std": 0.8092476725578308, "geo/layer_14/stable_rank_q_proj": 51.99909973144531, "geo/layer_14/stable_rank_k_proj": 39.662052154541016, "geo/layer_14/stable_rank_o_proj": 44.36348342895508, "geo/layer_14/stable_rank_gate_proj": 72.2701644897461, "geo/layer_14/stable_rank_down_proj": 128.8473663330078, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4028569757938385, "geo/layer_14/attn_entropy_mean": 5.510009765625, "geo/layer_14/attn_entropy_std": 0.3937050402164459, "geo/layer_21/stable_rank_q_proj": 40.84482955932617, "geo/layer_21/stable_rank_k_proj": 30.420686721801758, "geo/layer_21/stable_rank_o_proj": 71.83015441894531, "geo/layer_21/stable_rank_gate_proj": 67.37610626220703, "geo/layer_21/stable_rank_down_proj": 52.58250427246094, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14082658290863037, "geo/layer_21/attn_entropy_mean": 5.700872421264648, "geo/layer_21/attn_entropy_std": 0.3036033809185028, "geo/layer_27/stable_rank_q_proj": 43.236839294433594, "geo/layer_27/stable_rank_k_proj": 31.65901756286621, "geo/layer_27/stable_rank_o_proj": 115.68742370605469, "geo/layer_27/stable_rank_gate_proj": 81.319091796875, "geo/layer_27/stable_rank_down_proj": 129.30487060546875, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0881384015083313, "geo/layer_27/attn_entropy_mean": 4.212017059326172, "geo/layer_27/attn_entropy_std": 0.704301118850708, "attnres/final_alpha/block_0": 0.23731380701065063, "attnres/block_norm/0": 1.7542048692703247, "attnres/final_alpha/block_1": 0.004668671637773514, "attnres/block_norm/1": 45140.09765625, "attnres/final_alpha/block_2": 0.010358888655900955, "attnres/block_norm/2": 28060.990234375, "attnres/final_alpha/block_3": 0.012212265282869339, "attnres/block_norm/3": 55621.2109375, "attnres/final_alpha/block_4": 0.01435613352805376, "attnres/block_norm/4": 14598.0546875, "attnres/final_alpha/block_5": 0.609148383140564, "attnres/block_norm/5": 6539.49951171875, "attnres/final_alpha/block_6": 0.11194184422492981, "attnres/block_norm/6": 36614.703125, "geo/tier1_time_s": 1.3594834804534912, "geo/step": 56400.0, "geo/rankme_slope": 3.589934020483193e-05} {"step": 56410, "timestamp": 1778255513.3452475, "train/loss": 2.0988994598388673, "train/z_loss": 0.001379230059683323, "train/perplexity": 8.157187658188677, "train/grad_norm": 0.1279296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1791347.241232842, "perf/iters_per_sec": 0.8541809278644762, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.170712161064148, "data/tokens_consumed": 118302441472, "data/tokens_consumed_B": 118.302441472, "train/loss_slope": -1.1312137478434393e-05} {"step": 56420, "timestamp": 1778255523.6932669, "train/loss": 2.172289323806763, "train/z_loss": 0.0013836007216013967, "train/perplexity": 8.77835755752451, "train/grad_norm": 0.1201171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027535.3310218384, "perf/iters_per_sec": 0.9668041854008858, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343356132507324, "data/tokens_consumed": 118323412992, "data/tokens_consumed_B": 118.323412992, "train/loss_slope": -1.1539890206042494e-05} {"step": 56430, "timestamp": 1778255534.0390062, "train/loss": 2.177102100849152, "train/z_loss": 0.0013785612070932984, "train/perplexity": 8.820707664312453, "train/grad_norm": 0.1240234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028549.7185700694, "perf/iters_per_sec": 0.9672878830767009, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0338183879852294, "data/tokens_consumed": 118344384512, "data/tokens_consumed_B": 118.344384512, "train/loss_slope": -9.011154449490294e-06} {"step": 56440, "timestamp": 1778255544.3817835, "train/loss": 2.122808003425598, "train/z_loss": 0.0013806872186250985, "train/perplexity": 8.354564226079987, "train/grad_norm": 0.1708984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029196.7368499245, "perf/iters_per_sec": 0.9675964054345725, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0334887504577637, "data/tokens_consumed": 118365356032, "data/tokens_consumed_B": 118.365356032, "train/loss_slope": -1.0430579875061265e-05} {"step": 56450, "timestamp": 1778255554.7181065, "grad/layer_0/attn": 0.002840179717168212, "grad/layer_0/mlp": 0.0029849777929484844, "grad/layer_0/attn_mlp_ratio": 0.9514910391389048, "grad/layer_4/attn": 0.0022374915424734354, "grad/layer_4/mlp": 0.002558311680331826, "grad/layer_4/attn_mlp_ratio": 0.8745968961543934, "grad/layer_8/attn": 0.0037355481181293726, "grad/layer_8/mlp": 0.0037414112593978643, "grad/layer_8/attn_mlp_ratio": 0.9984328798131848, "grad/layer_12/attn": 0.0044824364595115185, "grad/layer_12/mlp": 0.0063526444137096405, "grad/layer_12/attn_mlp_ratio": 0.7056016513812435, "grad/layer_16/attn": 0.003717670449987054, "grad/layer_16/mlp": 0.004244187846779823, "grad/layer_16/attn_mlp_ratio": 0.8759438782176808, "grad/layer_20/attn": 0.0084374463185668, "grad/layer_20/mlp": 0.005610902793705463, "grad/layer_20/attn_mlp_ratio": 1.503759105870869, "grad/layer_24/attn": 0.0050120106898248196, "grad/layer_24/mlp": 0.007695838343352079, "grad/layer_24/attn_mlp_ratio": 0.6512624617470187, "grad/layer_27/attn": 0.004016547463834286, "grad/layer_27/mlp": 0.006446121260523796, "grad/layer_27/attn_mlp_ratio": 0.6230952287730294} {"step": 56450, "timestamp": 1778255554.7322922, "train/loss": 2.1273003578186036, "train/z_loss": 0.0013762309914454817, "train/perplexity": 8.392180318530126, "train/grad_norm": 0.091796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027125.9638300117, "perf/iters_per_sec": 0.9666089839124735, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345444917678832, "data/tokens_consumed": 118386327552, "data/tokens_consumed_B": 118.386327552, "train/loss_slope": -1.3213805661629186e-05} {"step": 56460, "timestamp": 1778255565.0792668, "train/loss": 2.168833017349243, "train/z_loss": 0.0013720434857532383, "train/perplexity": 8.748069236623733, "train/grad_norm": 0.134765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028338.7991461973, "perf/iters_per_sec": 0.9671873088580119, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0339258909225464, "data/tokens_consumed": 118407299072, "data/tokens_consumed_B": 118.407299072, "train/loss_slope": -1.5019649912779467e-05} {"step": 56470, "timestamp": 1778255575.4217925, "train/loss": 2.186601185798645, "train/z_loss": 0.0013664221274666487, "train/perplexity": 8.90489553656549, "train/grad_norm": 0.279296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029153.0152530437, "perf/iters_per_sec": 0.9675755573525637, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0335110187530518, "data/tokens_consumed": 118428270592, "data/tokens_consumed_B": 118.428270592, "train/loss_slope": -1.5581279984115616e-05} {"step": 56475, "timestamp": 1778255581.1887224, "eos/sharpness": 47.2240924835205, "eos/L0_probe": 1.9849809408187866, "eos/L_plus": 2.180602550506592, "eos/L_minus": 2.2616002559661865, "eos/grad_norm": 0.12418048083782196, "eos/embed_grad_frac": 0.14449341595172882, "eos/time_s": 0.5977888107299805} {"step": 56475, "timestamp": 1778255582.5667818, "geo/rankme_last": 439.69049072265625, "geo/layer_0/stable_rank_q_proj": 19.504169464111328, "geo/layer_0/stable_rank_k_proj": 16.180187225341797, "geo/layer_0/stable_rank_o_proj": 47.458621978759766, "geo/layer_0/stable_rank_gate_proj": 132.3843231201172, "geo/layer_0/stable_rank_down_proj": 54.362091064453125, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06343720853328705, "geo/layer_0/attn_entropy_mean": 6.162942409515381, "geo/layer_0/attn_entropy_std": 0.40381988883018494, "geo/layer_7/stable_rank_q_proj": 43.50466537475586, "geo/layer_7/stable_rank_k_proj": 41.4896354675293, "geo/layer_7/stable_rank_o_proj": 92.68270874023438, "geo/layer_7/stable_rank_gate_proj": 84.43707275390625, "geo/layer_7/stable_rank_down_proj": 141.7727813720703, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.453244686126709, "geo/layer_7/attn_entropy_mean": 4.63733434677124, "geo/layer_7/attn_entropy_std": 0.8034049868583679, "geo/layer_14/stable_rank_q_proj": 52.01312255859375, "geo/layer_14/stable_rank_k_proj": 39.592193603515625, "geo/layer_14/stable_rank_o_proj": 44.35285568237305, "geo/layer_14/stable_rank_gate_proj": 72.20799255371094, "geo/layer_14/stable_rank_down_proj": 128.95863342285156, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3875085711479187, "geo/layer_14/attn_entropy_mean": 5.504889488220215, "geo/layer_14/attn_entropy_std": 0.3846528232097626, "geo/layer_21/stable_rank_q_proj": 40.83820343017578, "geo/layer_21/stable_rank_k_proj": 30.2746524810791, "geo/layer_21/stable_rank_o_proj": 71.83399963378906, "geo/layer_21/stable_rank_gate_proj": 67.36737823486328, "geo/layer_21/stable_rank_down_proj": 52.53182601928711, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14380702376365662, "geo/layer_21/attn_entropy_mean": 5.708497047424316, "geo/layer_21/attn_entropy_std": 0.3000553846359253, "geo/layer_27/stable_rank_q_proj": 43.24835968017578, "geo/layer_27/stable_rank_k_proj": 31.62193489074707, "geo/layer_27/stable_rank_o_proj": 115.66008758544922, "geo/layer_27/stable_rank_gate_proj": 81.239501953125, "geo/layer_27/stable_rank_down_proj": 129.4604034423828, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09607139229774475, "geo/layer_27/attn_entropy_mean": 4.20668363571167, "geo/layer_27/attn_entropy_std": 0.6953544616699219, "attnres/final_alpha/block_0": 0.23801246285438538, "attnres/block_norm/0": 1.7543387413024902, "attnres/final_alpha/block_1": 0.004738290794193745, "attnres/block_norm/1": 45209.83203125, "attnres/final_alpha/block_2": 0.010417955927550793, "attnres/block_norm/2": 28181.5859375, "attnres/final_alpha/block_3": 0.012416034005582333, "attnres/block_norm/3": 55152.23828125, "attnres/final_alpha/block_4": 0.014156229794025421, "attnres/block_norm/4": 14635.04296875, "attnres/final_alpha/block_5": 0.609226644039154, "attnres/block_norm/5": 6480.51220703125, "attnres/final_alpha/block_6": 0.11103237420320511, "attnres/block_norm/6": 36528.98828125, "geo/tier1_time_s": 1.3596961498260498, "geo/step": 56475.0, "geo/rankme_slope": 6.442080738545418e-05} {"step": 56480, "timestamp": 1778255587.7474413, "train/loss": 2.1799908876419067, "train/z_loss": 0.0013774322578683496, "train/perplexity": 8.846225648377725, "train/grad_norm": 0.16015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1702013.4227006058, "perf/iters_per_sec": 0.811583243704131, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2321594953536987, "data/tokens_consumed": 118449242112, "data/tokens_consumed_B": 118.449242112, "train/loss_slope": -1.4584235153575882e-05} {"step": 56490, "timestamp": 1778255598.534399, "train/loss": 2.1245459794998167, "train/z_loss": 0.0013788765529170633, "train/perplexity": 8.369096883863133, "train/grad_norm": 0.125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1945446.639658656, "perf/iters_per_sec": 0.9276612470906525, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0779797077178954, "data/tokens_consumed": 118470213632, "data/tokens_consumed_B": 118.470213632, "train/loss_slope": -1.5164093916887523e-05} {"step": 56500, "timestamp": 1778255608.871251, "grad/layer_0/attn": 0.0027084248140454292, "grad/layer_0/mlp": 0.002910209121182561, "grad/layer_0/attn_mlp_ratio": 0.9306632644593378, "grad/layer_4/attn": 0.0023386359680444, "grad/layer_4/mlp": 0.002579405438154936, "grad/layer_4/attn_mlp_ratio": 0.906656953879862, "grad/layer_8/attn": 0.00669678533449769, "grad/layer_8/mlp": 0.003720778739079833, "grad/layer_8/attn_mlp_ratio": 1.7998342884990817, "grad/layer_12/attn": 0.005005835555493832, "grad/layer_12/mlp": 0.006485998164862394, "grad/layer_12/attn_mlp_ratio": 0.7717910722567908, "grad/layer_16/attn": 0.00337645853869617, "grad/layer_16/mlp": 0.0044737509451806545, "grad/layer_16/attn_mlp_ratio": 0.754726515757612, "grad/layer_20/attn": 0.00442223297432065, "grad/layer_20/mlp": 0.005909386090934277, "grad/layer_20/attn_mlp_ratio": 0.7483404928086944, "grad/layer_24/attn": 0.006479399278759956, "grad/layer_24/mlp": 0.008118017576634884, "grad/layer_24/attn_mlp_ratio": 0.7981504274632013, "grad/layer_27/attn": 0.003603436751291156, "grad/layer_27/mlp": 0.007973045110702515, "grad/layer_27/attn_mlp_ratio": 0.4519523790551356} {"step": 56500, "timestamp": 1778255608.8855276, "train/loss": 2.1131786823272707, "train/z_loss": 0.0013811016455292703, "train/perplexity": 8.27450153781936, "train/grad_norm": 0.1328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027466.21154911, "perf/iters_per_sec": 0.9667712266679335, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343708753585816, "data/tokens_consumed": 118491185152, "data/tokens_consumed_B": 118.491185152, "train/loss_slope": -1.910464045452773e-05} {"step": 56500, "timestamp": 1778255616.1490867, "geo/ww_alpha_mean": 7.984895481180734, "geo/ww_alpha_std": 5.452586798444871, "geo/ww_alpha_min": 1.341334299604335, "geo/ww_alpha_max": 51.8220752677268, "geo/ww_alpha_healthy_frac": 0.15736040609137056, "geo/ww_alpha_by_type/q_proj": 4.04684452056499, "geo/ww_alpha_by_type/k_proj": 4.50951763308338, "geo/ww_alpha_by_type/v_proj": 9.456594677944112, "geo/ww_alpha_by_type/o_proj": 8.446003233032677, "geo/ww_alpha_by_type/gate_proj": 8.06941929436559, "geo/ww_alpha_by_type/up_proj": 12.836161775234048, "geo/ww_alpha_by_type/down_proj": 8.639787959713367, "geo/twonn_id/layer_0": 0.6962093710899353, "geo/twonn_id/layer_7": 3.123422622680664, "geo/twonn_id/layer_14": 4.4560394287109375, "geo/twonn_id/layer_21": 7.460721015930176, "geo/twonn_id/layer_27": 5.7909770011901855, "geo/tier2_time_s": 7.2566070556640625} {"step": 56500, "timestamp": 1778255616.9337592, "eoc/jacobian_sigma/layer_0/attn": 1023.3828125, "eoc/jacobian_sigma/layer_0/mlp": 8480.7275390625, "eoc/jacobian_sigma/layer_0": 8480.7275390625, "eoc/jacobian_sigma/layer_7/attn": 1.1771882772445679, "eoc/jacobian_sigma/layer_7/mlp": 1.7357386350631714, "eoc/jacobian_sigma/layer_7": 1.7357386350631714, "eoc/jacobian_sigma/layer_14/attn": 1.5254696607589722, "eoc/jacobian_sigma/layer_14/mlp": 7.123600959777832, "eoc/jacobian_sigma/layer_14": 7.123600959777832, "eoc/jacobian_sigma/layer_21/attn": 1.0821384191513062, "eoc/jacobian_sigma/layer_21/mlp": 4.396938800811768, "eoc/jacobian_sigma/layer_21": 4.396938800811768, "eoc/jacobian_sigma/layer_27/attn": 3.1624226570129395, "eoc/jacobian_sigma/layer_27/mlp": 29.549421310424805, "eoc/jacobian_sigma/layer_27": 29.549421310424805, "eoc/layer0_sigma": 8480.7275390625, "eoc/sigma_max": 29.549421310424805, "eoc/sigma_min": 1.7357386350631714, "eoc/sigma_mean": 10.701424926519394, "eoc/time_s": 0.77545166015625} {"step": 56510, "timestamp": 1778255627.3108447, "train/loss": 2.1716909408569336, "train/z_loss": 0.001364745176397264, "train/perplexity": 8.773106309319967, "train/grad_norm": 0.1865234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1138517.2132712065, "perf/iters_per_sec": 0.5428873125415833, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.8420028924942016, "data/tokens_consumed": 118512156672, "data/tokens_consumed_B": 118.512156672, "train/loss_slope": -1.8728880670526265e-05} {"step": 56520, "timestamp": 1778255637.6588728, "train/loss": 2.140162968635559, "train/z_loss": 0.0013882484403438867, "train/perplexity": 8.500822883512983, "train/grad_norm": 0.13671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028406.5280811074, "perf/iters_per_sec": 0.9672196045308625, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0338913679122925, "data/tokens_consumed": 118533128192, "data/tokens_consumed_B": 118.533128192, "train/loss_slope": -1.752595669007892e-05} {"step": 56530, "timestamp": 1778255648.013913, "train/loss": 2.1825937271118163, "train/z_loss": 0.001387525990139693, "train/perplexity": 8.869280945257204, "train/grad_norm": 0.1171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026448.285201472, "perf/iters_per_sec": 0.9662858415610657, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348904609680176, "data/tokens_consumed": 118554099712, "data/tokens_consumed_B": 118.554099712, "train/loss_slope": -1.4231716803233954e-05} {"step": 56540, "timestamp": 1778255658.3587961, "train/loss": 2.1676401615142824, "train/z_loss": 0.0013826691429130733, "train/perplexity": 8.737640272552127, "train/grad_norm": 0.1357421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029023.2658892456, "perf/iters_per_sec": 0.9675136880346515, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0335771083831786, "data/tokens_consumed": 118575071232, "data/tokens_consumed_B": 118.575071232, "train/loss_slope": -1.1171600820779988e-05} {"step": 56550, "timestamp": 1778255668.6911404, "grad/layer_0/attn": 0.0030280165374279022, "grad/layer_0/mlp": 0.0031304724980145693, "grad/layer_0/attn_mlp_ratio": 0.9672713760051286, "grad/layer_4/attn": 0.002338449005037546, "grad/layer_4/mlp": 0.0025932760909199715, "grad/layer_4/attn_mlp_ratio": 0.9017354237953248, "grad/layer_8/attn": 0.005135883577167988, "grad/layer_8/mlp": 0.0037033860571682453, "grad/layer_8/attn_mlp_ratio": 1.3868074673301383, "grad/layer_12/attn": 0.004953418392688036, "grad/layer_12/mlp": 0.006879710126668215, "grad/layer_12/attn_mlp_ratio": 0.7200039288699714, "grad/layer_16/attn": 0.003936051856726408, "grad/layer_16/mlp": 0.0049070618115365505, "grad/layer_16/attn_mlp_ratio": 0.8021198688104407, "grad/layer_20/attn": 0.006509622558951378, "grad/layer_20/mlp": 0.00656593032181263, "grad/layer_20/attn_mlp_ratio": 0.9914242370473206, "grad/layer_24/attn": 0.022080110386013985, "grad/layer_24/mlp": 0.012789984233677387, "grad/layer_24/attn_mlp_ratio": 1.7263594551773382, "grad/layer_27/attn": 0.008556266315281391, "grad/layer_27/mlp": 0.01339559257030487, "grad/layer_27/attn_mlp_ratio": 0.6387374210212275} {"step": 56550, "timestamp": 1778255669.306253, "eos/sharpness": 86.56404018402098, "eos/L0_probe": 1.984125018119812, "eos/L_plus": 2.3444278240203857, "eos/L_minus": 2.4894626140594482, "eos/grad_norm": 0.28773778676986694, "eos/embed_grad_frac": 0.03087145835161209, "eos/time_s": 0.6124167442321777} {"step": 56550, "timestamp": 1778255669.3246684, "train/loss": 2.1540550708770754, "train/z_loss": 0.0013759323512203992, "train/perplexity": 8.619741284939233, "train/grad_norm": 0.287109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1913423.926096849, "perf/iters_per_sec": 0.9123916273578877, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0960205793380737, "data/tokens_consumed": 118596042752, "data/tokens_consumed_B": 118.596042752, "train/loss_slope": -1.268684148001588e-05} {"step": 56550, "timestamp": 1778255670.6874619, "geo/rankme_last": 439.6657409667969, "geo/layer_0/stable_rank_q_proj": 19.52178192138672, "geo/layer_0/stable_rank_k_proj": 16.21668815612793, "geo/layer_0/stable_rank_o_proj": 47.46868133544922, "geo/layer_0/stable_rank_gate_proj": 132.2947235107422, "geo/layer_0/stable_rank_down_proj": 54.431758880615234, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06432736665010452, "geo/layer_0/attn_entropy_mean": 6.1645426750183105, "geo/layer_0/attn_entropy_std": 0.4038112461566925, "geo/layer_7/stable_rank_q_proj": 43.45924377441406, "geo/layer_7/stable_rank_k_proj": 41.52266311645508, "geo/layer_7/stable_rank_o_proj": 92.5501480102539, "geo/layer_7/stable_rank_gate_proj": 84.34455108642578, "geo/layer_7/stable_rank_down_proj": 141.50830078125, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.45893341302871704, "geo/layer_7/attn_entropy_mean": 4.628567695617676, "geo/layer_7/attn_entropy_std": 0.8033924102783203, "geo/layer_14/stable_rank_q_proj": 52.05586242675781, "geo/layer_14/stable_rank_k_proj": 39.5301399230957, "geo/layer_14/stable_rank_o_proj": 44.34286117553711, "geo/layer_14/stable_rank_gate_proj": 72.27527618408203, "geo/layer_14/stable_rank_down_proj": 128.78578186035156, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.382309228181839, "geo/layer_14/attn_entropy_mean": 5.52692985534668, "geo/layer_14/attn_entropy_std": 0.3822188079357147, "geo/layer_21/stable_rank_q_proj": 40.76761245727539, "geo/layer_21/stable_rank_k_proj": 30.272050857543945, "geo/layer_21/stable_rank_o_proj": 71.91655731201172, "geo/layer_21/stable_rank_gate_proj": 67.30323028564453, "geo/layer_21/stable_rank_down_proj": 52.5811882019043, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14164069294929504, "geo/layer_21/attn_entropy_mean": 5.691935062408447, "geo/layer_21/attn_entropy_std": 0.3045421242713928, "geo/layer_27/stable_rank_q_proj": 43.20310974121094, "geo/layer_27/stable_rank_k_proj": 31.717086791992188, "geo/layer_27/stable_rank_o_proj": 115.76602172851562, "geo/layer_27/stable_rank_gate_proj": 81.22051239013672, "geo/layer_27/stable_rank_down_proj": 129.547119140625, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09449554234743118, "geo/layer_27/attn_entropy_mean": 4.224108695983887, "geo/layer_27/attn_entropy_std": 0.6984615325927734, "attnres/final_alpha/block_0": 0.23850692808628082, "attnres/block_norm/0": 1.7546308040618896, "attnres/final_alpha/block_1": 0.00485399691388011, "attnres/block_norm/1": 45232.4765625, "attnres/final_alpha/block_2": 0.010574829764664173, "attnres/block_norm/2": 28046.96875, "attnres/final_alpha/block_3": 0.012421457096934319, "attnres/block_norm/3": 55396.9921875, "attnres/final_alpha/block_4": 0.014610012993216515, "attnres/block_norm/4": 14581.83203125, "attnres/final_alpha/block_5": 0.606569766998291, "attnres/block_norm/5": 6549.92041015625, "attnres/final_alpha/block_6": 0.11246297508478165, "attnres/block_norm/6": 36562.734375, "geo/tier1_time_s": 1.3588650226593018, "geo/step": 56550.0, "geo/rankme_slope": 8.511207217261904e-05} {"step": 56560, "timestamp": 1778255681.045755, "train/loss": 2.1834502696990965, "train/z_loss": 0.0013702002237550915, "train/perplexity": 8.876881116575907, "train/grad_norm": 0.1015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1789802.717481876, "perf/iters_per_sec": 0.8534444415482884, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1717224359512328, "data/tokens_consumed": 118617014272, "data/tokens_consumed_B": 118.617014272, "train/loss_slope": -1.2992666048793259e-05} {"step": 56570, "timestamp": 1778255691.401392, "train/loss": 2.1485770106315614, "train/z_loss": 0.0013816989725455643, "train/perplexity": 8.572650922562364, "train/grad_norm": 0.16796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027630.2553000613, "perf/iters_per_sec": 0.9668494488239581, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034287190437317, "data/tokens_consumed": 118637985792, "data/tokens_consumed_B": 118.637985792, "train/loss_slope": -1.1518498901987487e-05} {"step": 56580, "timestamp": 1778255701.743125, "train/loss": 2.1114285945892335, "train/z_loss": 0.0013835157500579954, "train/perplexity": 8.260033098351737, "train/grad_norm": 0.150390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029120.4827391747, "perf/iters_per_sec": 0.9675600446411012, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0335275888442994, "data/tokens_consumed": 118658957312, "data/tokens_consumed_B": 118.658957312, "train/loss_slope": -1.3760381548961886e-05} {"step": 56590, "timestamp": 1778255712.0862942, "train/loss": 2.195332384109497, "train/z_loss": 0.0013755566207692028, "train/perplexity": 8.982986362580538, "train/grad_norm": 0.162109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028385.6195640198, "perf/iters_per_sec": 0.9672096345729922, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0339020252227784, "data/tokens_consumed": 118679928832, "data/tokens_consumed_B": 118.679928832, "train/loss_slope": -1.0156080188459316e-05} {"step": 56600, "timestamp": 1778255722.4251332, "grad/layer_0/attn": 0.002655724296346307, "grad/layer_0/mlp": 0.0028013798873871565, "grad/layer_0/attn_mlp_ratio": 0.9480057358528136, "grad/layer_4/attn": 0.0026796564925462008, "grad/layer_4/mlp": 0.002530677942559123, "grad/layer_4/attn_mlp_ratio": 1.0588689858929763, "grad/layer_8/attn": 0.0033819833770394325, "grad/layer_8/mlp": 0.0035991864278912544, "grad/layer_8/attn_mlp_ratio": 0.9396521549609489, "grad/layer_12/attn": 0.005481994245201349, "grad/layer_12/mlp": 0.006764552555978298, "grad/layer_12/attn_mlp_ratio": 0.8104001142420757, "grad/layer_16/attn": 0.004043347667902708, "grad/layer_16/mlp": 0.004618115723133087, "grad/layer_16/attn_mlp_ratio": 0.8755405500331439, "grad/layer_20/attn": 0.0038883767556399107, "grad/layer_20/mlp": 0.005909766536206007, "grad/layer_20/attn_mlp_ratio": 0.6579577494342819, "grad/layer_24/attn": 0.006508558057248592, "grad/layer_24/mlp": 0.007735833525657654, "grad/layer_24/attn_mlp_ratio": 0.8413518661597734, "grad/layer_27/attn": 0.004533293657004833, "grad/layer_27/mlp": 0.00728150550276041, "grad/layer_27/attn_mlp_ratio": 0.6225764154169252} {"step": 56600, "timestamp": 1778255722.4398115, "train/loss": 2.174133133888245, "train/z_loss": 0.0013826814712956548, "train/perplexity": 8.79455811247153, "train/grad_norm": 0.103515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026773.5013107413, "perf/iters_per_sec": 0.9664409166864115, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347244024276734, "data/tokens_consumed": 118700900352, "data/tokens_consumed_B": 118.700900352, "train/loss_slope": -5.770027497992817e-06} {"step": 56610, "timestamp": 1778255732.7813096, "train/loss": 2.131263017654419, "train/z_loss": 0.0013895994168706238, "train/perplexity": 8.425501651364113, "train/grad_norm": 0.10400390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029044.4683847835, "perf/iters_per_sec": 0.9675237981723707, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0335663080215454, "data/tokens_consumed": 118721871872, "data/tokens_consumed_B": 118.721871872, "train/loss_slope": -5.12514354014513e-06} {"step": 56620, "timestamp": 1778255743.1243987, "train/loss": 2.1640809297561647, "train/z_loss": 0.0013762221089564264, "train/perplexity": 8.706596264984325, "train/grad_norm": 0.1572265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028720.0672007569, "perf/iters_per_sec": 0.9673691116336617, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0337315797805786, "data/tokens_consumed": 118742843392, "data/tokens_consumed_B": 118.742843392, "train/loss_slope": -6.1868868394903205e-06} {"step": 56625, "timestamp": 1778255748.8925927, "eos/sharpness": 69.88768577575682, "eos/L0_probe": 1.983969807624817, "eos/L_plus": 2.3114771842956543, "eos/L_minus": 2.355339288711548, "eos/grad_norm": 0.2031773030757904, "eos/embed_grad_frac": 0.055874552577733994, "eos/time_s": 0.6034893989562988} {"step": 56625, "timestamp": 1778255750.275086, "geo/rankme_last": 438.4351501464844, "geo/layer_0/stable_rank_q_proj": 19.52189826965332, "geo/layer_0/stable_rank_k_proj": 16.20864486694336, "geo/layer_0/stable_rank_o_proj": 47.49785614013672, "geo/layer_0/stable_rank_gate_proj": 132.23004150390625, "geo/layer_0/stable_rank_down_proj": 54.3968391418457, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.061210162937641144, "geo/layer_0/attn_entropy_mean": 6.16513729095459, "geo/layer_0/attn_entropy_std": 0.4061310887336731, "geo/layer_7/stable_rank_q_proj": 43.47039794921875, "geo/layer_7/stable_rank_k_proj": 41.46854019165039, "geo/layer_7/stable_rank_o_proj": 92.47222137451172, "geo/layer_7/stable_rank_gate_proj": 84.24463653564453, "geo/layer_7/stable_rank_down_proj": 141.61415100097656, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.46765607595443726, "geo/layer_7/attn_entropy_mean": 4.6706743240356445, "geo/layer_7/attn_entropy_std": 0.8165395259857178, "geo/layer_14/stable_rank_q_proj": 51.962833404541016, "geo/layer_14/stable_rank_k_proj": 39.45362091064453, "geo/layer_14/stable_rank_o_proj": 44.33650207519531, "geo/layer_14/stable_rank_gate_proj": 72.20891571044922, "geo/layer_14/stable_rank_down_proj": 128.97874450683594, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39718037843704224, "geo/layer_14/attn_entropy_mean": 5.53927755355835, "geo/layer_14/attn_entropy_std": 0.3836902678012848, "geo/layer_21/stable_rank_q_proj": 40.715354919433594, "geo/layer_21/stable_rank_k_proj": 30.272544860839844, "geo/layer_21/stable_rank_o_proj": 71.90070343017578, "geo/layer_21/stable_rank_gate_proj": 67.29029846191406, "geo/layer_21/stable_rank_down_proj": 52.48929214477539, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14259862899780273, "geo/layer_21/attn_entropy_mean": 5.714809417724609, "geo/layer_21/attn_entropy_std": 0.2926121950149536, "geo/layer_27/stable_rank_q_proj": 43.24217224121094, "geo/layer_27/stable_rank_k_proj": 31.70067596435547, "geo/layer_27/stable_rank_o_proj": 115.91580963134766, "geo/layer_27/stable_rank_gate_proj": 81.26556396484375, "geo/layer_27/stable_rank_down_proj": 129.54525756835938, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09278175979852676, "geo/layer_27/attn_entropy_mean": 4.210636138916016, "geo/layer_27/attn_entropy_std": 0.7079522013664246, "attnres/final_alpha/block_0": 0.23656532168388367, "attnres/block_norm/0": 1.7549593448638916, "attnres/final_alpha/block_1": 0.004670628346502781, "attnres/block_norm/1": 45268.359375, "attnres/final_alpha/block_2": 0.010331784375011921, "attnres/block_norm/2": 28077.015625, "attnres/final_alpha/block_3": 0.012403630651533604, "attnres/block_norm/3": 55564.046875, "attnres/final_alpha/block_4": 0.014195049181580544, "attnres/block_norm/4": 14589.103515625, "attnres/final_alpha/block_5": 0.6099026799201965, "attnres/block_norm/5": 6539.470703125, "attnres/final_alpha/block_6": 0.11193089187145233, "attnres/block_norm/6": 36775.8203125, "geo/tier1_time_s": 1.3640563488006592, "geo/step": 56625.0, "geo/rankme_slope": 8.999003898434373e-05} {"step": 56630, "timestamp": 1778255755.4526765, "train/loss": 2.1487855195999144, "train/z_loss": 0.001382143038790673, "train/perplexity": 8.574438583527472, "train/grad_norm": 0.09130859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1702189.2721193635, "perf/iters_per_sec": 0.8116670952412431, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2320322036743163, "data/tokens_consumed": 118763814912, "data/tokens_consumed_B": 118.763814912, "train/loss_slope": -7.612892467625767e-06} {"step": 56640, "timestamp": 1778255765.7912748, "train/loss": 2.123004102706909, "train/z_loss": 0.001388304028660059, "train/perplexity": 8.356202710767972, "train/grad_norm": 0.2060546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029403.7144312172, "perf/iters_per_sec": 0.9676951000362478, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033383345603943, "data/tokens_consumed": 118784786432, "data/tokens_consumed_B": 118.784786432, "train/loss_slope": -7.824121156755986e-06} {"step": 56650, "timestamp": 1778255776.125021, "grad/layer_0/attn": 0.0027099582366645336, "grad/layer_0/mlp": 0.002856890670955181, "grad/layer_0/attn_mlp_ratio": 0.9485690752392603, "grad/layer_4/attn": 0.002022872678935528, "grad/layer_4/mlp": 0.0026407106779515743, "grad/layer_4/attn_mlp_ratio": 0.7660334088175648, "grad/layer_8/attn": 0.004395101685076952, "grad/layer_8/mlp": 0.003923955373466015, "grad/layer_8/attn_mlp_ratio": 1.1200692043517955, "grad/layer_12/attn": 0.004772540181875229, "grad/layer_12/mlp": 0.006494762375950813, "grad/layer_12/attn_mlp_ratio": 0.7348290564200416, "grad/layer_16/attn": 0.003333842381834984, "grad/layer_16/mlp": 0.00431008730083704, "grad/layer_16/attn_mlp_ratio": 0.7734976281890561, "grad/layer_20/attn": 0.004173988942056894, "grad/layer_20/mlp": 0.00570998340845108, "grad/layer_20/attn_mlp_ratio": 0.7309984233543176, "grad/layer_24/attn": 0.009788239374756813, "grad/layer_24/mlp": 0.011095577850937843, "grad/layer_24/attn_mlp_ratio": 0.8821748103648329, "grad/layer_27/attn": 0.0061395904049277306, "grad/layer_27/mlp": 0.010084877721965313, "grad/layer_27/attn_mlp_ratio": 0.6087917487265369} {"step": 56650, "timestamp": 1778255776.1394992, "train/loss": 2.153856301307678, "train/z_loss": 0.0013699760660529138, "train/perplexity": 8.61802811294458, "train/grad_norm": 0.189453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027609.362783241, "perf/iters_per_sec": 0.9668394864956098, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342978477478026, "data/tokens_consumed": 118805757952, "data/tokens_consumed_B": 118.805757952, "train/loss_slope": -7.076986587837974e-06} {"step": 56660, "timestamp": 1778255787.0910587, "train/loss": 2.095123219490051, "train/z_loss": 0.0013779158820398153, "train/perplexity": 8.12644224459241, "train/grad_norm": 0.1962890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1916444.0582709613, "perf/iters_per_sec": 0.9138317386011893, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0942933559417725, "data/tokens_consumed": 118826729472, "data/tokens_consumed_B": 118.826729472, "train/loss_slope": -1.0364019359299021e-05} {"step": 56670, "timestamp": 1778255797.4321795, "train/loss": 2.1894498109817504, "train/z_loss": 0.0013682337244972586, "train/perplexity": 8.930298410699383, "train/grad_norm": 0.134765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029261.620550823, "perf/iters_per_sec": 0.9676273443941226, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0334557056427003, "data/tokens_consumed": 118847700992, "data/tokens_consumed_B": 118.847700992, "train/loss_slope": -7.375350722385287e-06} {"step": 56680, "timestamp": 1778255808.2979808, "train/loss": 2.1367068767547606, "train/z_loss": 0.0013801231514662504, "train/perplexity": 8.471493969469455, "train/grad_norm": 0.1875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1931423.3440888936, "perf/iters_per_sec": 0.9209744186825245, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0858064889907837, "data/tokens_consumed": 118868672512, "data/tokens_consumed_B": 118.868672512, "train/loss_slope": -7.748696877725332e-06} {"step": 56690, "timestamp": 1778255818.6412935, "train/loss": 2.150945520401001, "train/z_loss": 0.001371799735352397, "train/perplexity": 8.592979394611486, "train/grad_norm": 0.1689453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029208.5335778324, "perf/iters_per_sec": 0.9676020305527842, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0334827423095703, "data/tokens_consumed": 118889644032, "data/tokens_consumed_B": 118.889644032, "train/loss_slope": -7.497371949128041e-06} {"step": 56700, "timestamp": 1778255828.975317, "grad/layer_0/attn": 0.0026663984172046185, "grad/layer_0/mlp": 0.002755188848823309, "grad/layer_0/attn_mlp_ratio": 0.9677733421308141, "grad/layer_4/attn": 0.00204691500402987, "grad/layer_4/mlp": 0.0025832841638475657, "grad/layer_4/attn_mlp_ratio": 0.7923692458766335, "grad/layer_8/attn": 0.0037795815151184797, "grad/layer_8/mlp": 0.0036577547434717417, "grad/layer_8/attn_mlp_ratio": 1.0333064070338587, "grad/layer_12/attn": 0.005418011918663979, "grad/layer_12/mlp": 0.006579485256224871, "grad/layer_12/attn_mlp_ratio": 0.8234704730420871, "grad/layer_16/attn": 0.005379184614866972, "grad/layer_16/mlp": 0.004504159092903137, "grad/layer_16/attn_mlp_ratio": 1.1942705362951085, "grad/layer_20/attn": 0.003940057475119829, "grad/layer_20/mlp": 0.006081244442611933, "grad/layer_20/attn_mlp_ratio": 0.6479031467179825, "grad/layer_24/attn": 0.004256971646100283, "grad/layer_24/mlp": 0.006943345069885254, "grad/layer_24/attn_mlp_ratio": 0.6131009681851426, "grad/layer_27/attn": 0.004783739801496267, "grad/layer_27/mlp": 0.0060395956970751286, "grad/layer_27/attn_mlp_ratio": 0.7920629065628779} {"step": 56700, "timestamp": 1778255829.5865407, "eos/sharpness": 4.1550278663635245, "eos/L0_probe": 1.9800373315811157, "eos/L_plus": 2.001945734024048, "eos/L_minus": 1.9996792078018188, "eos/grad_norm": 0.08002752810716629, "eos/embed_grad_frac": 0.3332507312297821, "eos/time_s": 0.6082763671875} {"step": 56700, "timestamp": 1778255829.6069708, "train/loss": 2.1350714325904847, "train/z_loss": 0.0013856120640411973, "train/perplexity": 8.45765063717843, "train/grad_norm": 0.080078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1913484.447742707, "perf/iters_per_sec": 0.9124204863275085, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0959859132766723, "data/tokens_consumed": 118910615552, "data/tokens_consumed_B": 118.910615552, "train/loss_slope": -1.0525841567024875e-05} {"step": 56700, "timestamp": 1778255830.9710467, "geo/rankme_last": 438.8731384277344, "geo/layer_0/stable_rank_q_proj": 19.520511627197266, "geo/layer_0/stable_rank_k_proj": 16.212631225585938, "geo/layer_0/stable_rank_o_proj": 47.423526763916016, "geo/layer_0/stable_rank_gate_proj": 132.2300567626953, "geo/layer_0/stable_rank_down_proj": 54.38446807861328, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06490227580070496, "geo/layer_0/attn_entropy_mean": 6.165724754333496, "geo/layer_0/attn_entropy_std": 0.4063005745410919, "geo/layer_7/stable_rank_q_proj": 43.43696212768555, "geo/layer_7/stable_rank_k_proj": 41.425331115722656, "geo/layer_7/stable_rank_o_proj": 92.58750915527344, "geo/layer_7/stable_rank_gate_proj": 84.3215103149414, "geo/layer_7/stable_rank_down_proj": 141.68954467773438, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.46493083238601685, "geo/layer_7/attn_entropy_mean": 4.651999473571777, "geo/layer_7/attn_entropy_std": 0.8095921874046326, "geo/layer_14/stable_rank_q_proj": 52.00300216674805, "geo/layer_14/stable_rank_k_proj": 39.44501495361328, "geo/layer_14/stable_rank_o_proj": 44.25224685668945, "geo/layer_14/stable_rank_gate_proj": 72.23722839355469, "geo/layer_14/stable_rank_down_proj": 129.1275177001953, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3889065086841583, "geo/layer_14/attn_entropy_mean": 5.525543212890625, "geo/layer_14/attn_entropy_std": 0.3745039105415344, "geo/layer_21/stable_rank_q_proj": 40.735679626464844, "geo/layer_21/stable_rank_k_proj": 30.21194839477539, "geo/layer_21/stable_rank_o_proj": 71.82511138916016, "geo/layer_21/stable_rank_gate_proj": 67.2968521118164, "geo/layer_21/stable_rank_down_proj": 52.471683502197266, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14244377613067627, "geo/layer_21/attn_entropy_mean": 5.742031097412109, "geo/layer_21/attn_entropy_std": 0.2945307791233063, "geo/layer_27/stable_rank_q_proj": 43.241920471191406, "geo/layer_27/stable_rank_k_proj": 31.705671310424805, "geo/layer_27/stable_rank_o_proj": 116.02942657470703, "geo/layer_27/stable_rank_gate_proj": 81.22347259521484, "geo/layer_27/stable_rank_down_proj": 129.54714965820312, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08028598129749298, "geo/layer_27/attn_entropy_mean": 4.236698150634766, "geo/layer_27/attn_entropy_std": 0.7167194485664368, "attnres/final_alpha/block_0": 0.23675526678562164, "attnres/block_norm/0": 1.754892110824585, "attnres/final_alpha/block_1": 0.004743498284369707, "attnres/block_norm/1": 45060.59375, "attnres/final_alpha/block_2": 0.010343261063098907, "attnres/block_norm/2": 28080.36328125, "attnres/final_alpha/block_3": 0.012387964874505997, "attnres/block_norm/3": 55571.41796875, "attnres/final_alpha/block_4": 0.014334812760353088, "attnres/block_norm/4": 14643.69140625, "attnres/final_alpha/block_5": 0.6108176112174988, "attnres/block_norm/5": 6481.54052734375, "attnres/final_alpha/block_6": 0.11061759293079376, "attnres/block_norm/6": 36630.328125, "geo/tier1_time_s": 1.3602852821350098, "geo/step": 56700.0, "geo/rankme_slope": 9.533188275310123e-05} {"step": 56710, "timestamp": 1778255841.3237057, "train/loss": 2.1848628282547, "train/z_loss": 0.0013815937330946327, "train/perplexity": 8.889429091232241, "train/grad_norm": 0.1630859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1790427.1825756396, "perf/iters_per_sec": 0.8537422097089956, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.171313762664795, "data/tokens_consumed": 118931587072, "data/tokens_consumed_B": 118.931587072, "train/loss_slope": -7.223527087415588e-06} {"step": 56720, "timestamp": 1778255852.1935449, "train/loss": 2.147016930580139, "train/z_loss": 0.0013851335737854243, "train/perplexity": 8.559287327719547, "train/grad_norm": 0.2353515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1930779.4827760751, "perf/iters_per_sec": 0.9206674016838432, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0861685752868653, "data/tokens_consumed": 118952558592, "data/tokens_consumed_B": 118.952558592, "train/loss_slope": -7.669595501782667e-06} {"step": 56730, "timestamp": 1778255862.5529442, "train/loss": 2.1384785056114195, "train/z_loss": 0.001384545606561005, "train/perplexity": 8.486515615106223, "train/grad_norm": 0.185546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025757.9964276932, "perf/iters_per_sec": 0.9659566862238375, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352431058883667, "data/tokens_consumed": 118973530112, "data/tokens_consumed_B": 118.973530112, "train/loss_slope": -9.344796522079246e-06} {"step": 56740, "timestamp": 1778255873.3300655, "train/loss": 2.113328051567078, "train/z_loss": 0.0013810343807563186, "train/perplexity": 8.275737586135346, "train/grad_norm": 0.107421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1947481.8286120757, "perf/iters_per_sec": 0.9286317008076075, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0768531799316405, "data/tokens_consumed": 118994501632, "data/tokens_consumed_B": 118.994501632, "train/loss_slope": -1.1913758743428025e-05} {"step": 56750, "timestamp": 1778255883.6595728, "grad/layer_0/attn": 0.0027948799543082714, "grad/layer_0/mlp": 0.0029109823517501354, "grad/layer_0/attn_mlp_ratio": 0.9601156999857346, "grad/layer_4/attn": 0.0021544864866882563, "grad/layer_4/mlp": 0.0024533444084227085, "grad/layer_4/attn_mlp_ratio": 0.8781834264578714, "grad/layer_8/attn": 0.008384046144783497, "grad/layer_8/mlp": 0.0035980918910354376, "grad/layer_8/attn_mlp_ratio": 2.330136685129828, "grad/layer_12/attn": 0.004595475271344185, "grad/layer_12/mlp": 0.006745111662894487, "grad/layer_12/attn_mlp_ratio": 0.6813045406637054, "grad/layer_16/attn": 0.003552723675966263, "grad/layer_16/mlp": 0.004410313442349434, "grad/layer_16/attn_mlp_ratio": 0.8055490027753626, "grad/layer_20/attn": 0.00374893588013947, "grad/layer_20/mlp": 0.005771887954324484, "grad/layer_20/attn_mlp_ratio": 0.6495163878534767, "grad/layer_24/attn": 0.00923970714211464, "grad/layer_24/mlp": 0.010596871376037598, "grad/layer_24/attn_mlp_ratio": 0.8719278291718575, "grad/layer_27/attn": 0.004894253797829151, "grad/layer_27/mlp": 0.009557956829667091, "grad/layer_27/attn_mlp_ratio": 0.512060666714013} {"step": 56750, "timestamp": 1778255883.674289, "train/loss": 2.15522780418396, "train/z_loss": 0.001375477807596326, "train/perplexity": 8.629855872338354, "train/grad_norm": 0.1484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028381.4098476092, "perf/iters_per_sec": 0.9672076272237822, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0339041709899903, "data/tokens_consumed": 119015473152, "data/tokens_consumed_B": 119.015473152, "train/loss_slope": -1.0238930387179319e-05} {"step": 56760, "timestamp": 1778255894.025149, "train/loss": 2.1684687614440916, "train/z_loss": 0.0013967532198876143, "train/perplexity": 8.744883281032415, "train/grad_norm": 0.10498046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027061.7771903344, "perf/iters_per_sec": 0.9665783773376152, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345772504806519, "data/tokens_consumed": 119036444672, "data/tokens_consumed_B": 119.036444672, "train/loss_slope": -6.865669920606856e-06} {"step": 56770, "timestamp": 1778255904.3809254, "train/loss": 2.1216750144958496, "train/z_loss": 0.0013764123897999526, "train/perplexity": 8.345103957500735, "train/grad_norm": 0.1591796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027071.8206830118, "perf/iters_per_sec": 0.9665831664481219, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345721244812012, "data/tokens_consumed": 119057416192, "data/tokens_consumed_B": 119.057416192, "train/loss_slope": -7.0275350193558116e-06} {"step": 56775, "timestamp": 1778255910.1406538, "eos/sharpness": 47.682380676269524, "eos/L0_probe": 1.98148512840271, "eos/L_plus": 2.2642345428466797, "eos/L_minus": 2.1755595207214355, "eos/grad_norm": 0.11370743811130524, "eos/embed_grad_frac": 0.16669119894504547, "eos/time_s": 0.5935683250427246} {"step": 56775, "timestamp": 1778255911.515366, "geo/rankme_last": 437.9370422363281, "geo/layer_0/stable_rank_q_proj": 19.521865844726562, "geo/layer_0/stable_rank_k_proj": 16.229541778564453, "geo/layer_0/stable_rank_o_proj": 47.398109436035156, "geo/layer_0/stable_rank_gate_proj": 132.46510314941406, "geo/layer_0/stable_rank_down_proj": 54.45083236694336, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06315405666828156, "geo/layer_0/attn_entropy_mean": 6.165676593780518, "geo/layer_0/attn_entropy_std": 0.40553560853004456, "geo/layer_7/stable_rank_q_proj": 43.522159576416016, "geo/layer_7/stable_rank_k_proj": 41.485477447509766, "geo/layer_7/stable_rank_o_proj": 92.52640533447266, "geo/layer_7/stable_rank_gate_proj": 84.38736724853516, "geo/layer_7/stable_rank_down_proj": 141.65728759765625, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.46640491485595703, "geo/layer_7/attn_entropy_mean": 4.680093288421631, "geo/layer_7/attn_entropy_std": 0.7980921864509583, "geo/layer_14/stable_rank_q_proj": 52.08287811279297, "geo/layer_14/stable_rank_k_proj": 39.45661544799805, "geo/layer_14/stable_rank_o_proj": 44.257118225097656, "geo/layer_14/stable_rank_gate_proj": 72.21233367919922, "geo/layer_14/stable_rank_down_proj": 129.20899963378906, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3794151544570923, "geo/layer_14/attn_entropy_mean": 5.476316928863525, "geo/layer_14/attn_entropy_std": 0.3923317790031433, "geo/layer_21/stable_rank_q_proj": 40.73757553100586, "geo/layer_21/stable_rank_k_proj": 30.22038459777832, "geo/layer_21/stable_rank_o_proj": 71.78746032714844, "geo/layer_21/stable_rank_gate_proj": 67.33338928222656, "geo/layer_21/stable_rank_down_proj": 52.5256462097168, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1435207724571228, "geo/layer_21/attn_entropy_mean": 5.711235523223877, "geo/layer_21/attn_entropy_std": 0.30681031942367554, "geo/layer_27/stable_rank_q_proj": 43.24277877807617, "geo/layer_27/stable_rank_k_proj": 31.691431045532227, "geo/layer_27/stable_rank_o_proj": 116.11508178710938, "geo/layer_27/stable_rank_gate_proj": 81.21973419189453, "geo/layer_27/stable_rank_down_proj": 129.41824340820312, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09865735471248627, "geo/layer_27/attn_entropy_mean": 4.212764739990234, "geo/layer_27/attn_entropy_std": 0.689141035079956, "attnres/final_alpha/block_0": 0.23497603833675385, "attnres/block_norm/0": 1.7550883293151855, "attnres/final_alpha/block_1": 0.004617128521203995, "attnres/block_norm/1": 45176.53125, "attnres/final_alpha/block_2": 0.010465530678629875, "attnres/block_norm/2": 28217.37890625, "attnres/final_alpha/block_3": 0.012253298424184322, "attnres/block_norm/3": 55570.3203125, "attnres/final_alpha/block_4": 0.014213170856237411, "attnres/block_norm/4": 14681.8505859375, "attnres/final_alpha/block_5": 0.614265501499176, "attnres/block_norm/5": 6448.2216796875, "attnres/final_alpha/block_6": 0.1092093288898468, "attnres/block_norm/6": 36829.9921875, "geo/tier1_time_s": 1.3553879261016846, "geo/step": 56775.0, "geo/rankme_slope": 6.743101537489996e-05} {"step": 56780, "timestamp": 1778255916.695839, "train/loss": 2.188663673400879, "train/z_loss": 0.0013757020351476967, "train/perplexity": 8.923280726304471, "train/grad_norm": 0.09521484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1704023.4777329436, "perf/iters_per_sec": 0.8125417126335829, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2307060480117797, "data/tokens_consumed": 119078387712, "data/tokens_consumed_B": 119.078387712, "train/loss_slope": -6.214610270612628e-06} {"step": 56790, "timestamp": 1778255927.0445852, "train/loss": 2.0955687761306763, "train/z_loss": 0.001386191661003977, "train/perplexity": 8.130063841652541, "train/grad_norm": 0.1923828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027376.4427175408, "perf/iters_per_sec": 0.9667284215533928, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034416675567627, "data/tokens_consumed": 119099359232, "data/tokens_consumed_B": 119.099359232, "train/loss_slope": -8.171365284683673e-06} {"step": 56800, "timestamp": 1778255937.3798287, "grad/layer_0/attn": 0.0028145734686404467, "grad/layer_0/mlp": 0.0029098924715071917, "grad/layer_0/attn_mlp_ratio": 0.9672430852602321, "grad/layer_4/attn": 0.0020803443621844053, "grad/layer_4/mlp": 0.0024231388233602047, "grad/layer_4/attn_mlp_ratio": 0.8585328485003076, "grad/layer_8/attn": 0.008412751369178295, "grad/layer_8/mlp": 0.0034133943263441324, "grad/layer_8/attn_mlp_ratio": 2.4646291399112212, "grad/layer_12/attn": 0.003946063108742237, "grad/layer_12/mlp": 0.006979559548199177, "grad/layer_12/attn_mlp_ratio": 0.5653742223924365, "grad/layer_16/attn": 0.003762034233659506, "grad/layer_16/mlp": 0.004323373083025217, "grad/layer_16/attn_mlp_ratio": 0.8701618098641852, "grad/layer_20/attn": 0.003691780148074031, "grad/layer_20/mlp": 0.005552003160119057, "grad/layer_20/attn_mlp_ratio": 0.6649456016340425, "grad/layer_24/attn": 0.012265809811651707, "grad/layer_24/mlp": 0.008435276336967945, "grad/layer_24/attn_mlp_ratio": 1.4541088135412246, "grad/layer_27/attn": 0.00618325499817729, "grad/layer_27/mlp": 0.007707397453486919, "grad/layer_27/attn_mlp_ratio": 0.8022493916094819} {"step": 56800, "timestamp": 1778255937.394271, "train/loss": 2.1600396633148193, "train/z_loss": 0.0013810328557156027, "train/perplexity": 8.671481591346959, "train/grad_norm": 0.1328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027837.8016663275, "perf/iters_per_sec": 0.9669484146434438, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034181332588196, "data/tokens_consumed": 119120330752, "data/tokens_consumed_B": 119.120330752, "train/loss_slope": -3.5275461292467158e-06} {"step": 56810, "timestamp": 1778255947.7377706, "train/loss": 2.164574980735779, "train/z_loss": 0.0013875903096050024, "train/perplexity": 8.710898830153896, "train/grad_norm": 0.123046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028522.4448602763, "perf/iters_per_sec": 0.9672748779584295, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0338322877883912, "data/tokens_consumed": 119141302272, "data/tokens_consumed_B": 119.141302272, "train/loss_slope": -9.03893947744377e-07} {"step": 56820, "timestamp": 1778255958.0844717, "train/loss": 2.1238300085067747, "train/z_loss": 0.0013967606122605503, "train/perplexity": 8.363106997804538, "train/grad_norm": 0.09765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027825.4598835192, "perf/iters_per_sec": 0.9669425296228024, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034187626838684, "data/tokens_consumed": 119162273792, "data/tokens_consumed_B": 119.162273792, "train/loss_slope": -4.8134592106633545e-06} {"step": 56830, "timestamp": 1778255968.432153, "train/loss": 2.1875450372695924, "train/z_loss": 0.0013711097300983966, "train/perplexity": 8.913304403052557, "train/grad_norm": 0.08544921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028185.256440746, "perf/iters_per_sec": 0.9671140939906816, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340041637420654, "data/tokens_consumed": 119183245312, "data/tokens_consumed_B": 119.183245312, "train/loss_slope": -3.32989644045745e-07} {"step": 56840, "timestamp": 1778255978.7766764, "train/loss": 2.178054690361023, "train/z_loss": 0.0013739815796725452, "train/perplexity": 8.829114181265007, "train/grad_norm": 0.1767578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028104.2150961156, "perf/iters_per_sec": 0.9670754504662111, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340454816818236, "data/tokens_consumed": 119204216832, "data/tokens_consumed_B": 119.204216832, "train/loss_slope": 2.1303433753905416e-06} {"step": 56850, "timestamp": 1778255989.121643, "grad/layer_0/attn": 0.002985724713653326, "grad/layer_0/mlp": 0.0030813231132924557, "grad/layer_0/attn_mlp_ratio": 0.9689748549497407, "grad/layer_4/attn": 0.0019935322925448418, "grad/layer_4/mlp": 0.002766483696177602, "grad/layer_4/attn_mlp_ratio": 0.720601470827072, "grad/layer_8/attn": 0.004261042457073927, "grad/layer_8/mlp": 0.0036667510867118835, "grad/layer_8/attn_mlp_ratio": 1.1620757013772094, "grad/layer_12/attn": 0.004096610471606255, "grad/layer_12/mlp": 0.006973205599933863, "grad/layer_12/attn_mlp_ratio": 0.5874787935260692, "grad/layer_16/attn": 0.00347836478613317, "grad/layer_16/mlp": 0.0044962577521800995, "grad/layer_16/attn_mlp_ratio": 0.7736132803074487, "grad/layer_20/attn": 0.003178160870447755, "grad/layer_20/mlp": 0.005850448738783598, "grad/layer_20/attn_mlp_ratio": 0.5432336830943976, "grad/layer_24/attn": 0.006509353872388601, "grad/layer_24/mlp": 0.008349538780748844, "grad/layer_24/attn_mlp_ratio": 0.7796063908866783, "grad/layer_27/attn": 0.005366197321563959, "grad/layer_27/mlp": 0.007125138770788908, "grad/layer_27/attn_mlp_ratio": 0.753135822175183} {"step": 56850, "timestamp": 1778255989.7174463, "eos/sharpness": 7.266402244567869, "eos/L0_probe": 1.9836690425872803, "eos/L_plus": 2.020193338394165, "eos/L_minus": 2.019808769226074, "eos/grad_norm": 0.09508483856916428, "eos/embed_grad_frac": 0.24629808962345123, "eos/time_s": 0.593008279800415} {"step": 56850, "timestamp": 1778255989.7373188, "train/loss": 2.1637934923171995, "train/z_loss": 0.0013876270037144422, "train/perplexity": 8.704094022888071, "train/grad_norm": 0.09521484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1914181.638380268, "perf/iters_per_sec": 0.9127529327298488, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0955867290496826, "data/tokens_consumed": 119225188352, "data/tokens_consumed_B": 119.225188352, "train/loss_slope": 1.0077884881803016e-06} {"step": 56850, "timestamp": 1778255991.0972173, "geo/rankme_last": 439.64080810546875, "geo/layer_0/stable_rank_q_proj": 19.5090274810791, "geo/layer_0/stable_rank_k_proj": 16.250097274780273, "geo/layer_0/stable_rank_o_proj": 47.340904235839844, "geo/layer_0/stable_rank_gate_proj": 132.69725036621094, "geo/layer_0/stable_rank_down_proj": 54.4060173034668, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06853373348712921, "geo/layer_0/attn_entropy_mean": 6.165511608123779, "geo/layer_0/attn_entropy_std": 0.40294164419174194, "geo/layer_7/stable_rank_q_proj": 43.46712875366211, "geo/layer_7/stable_rank_k_proj": 41.51547622680664, "geo/layer_7/stable_rank_o_proj": 92.52880096435547, "geo/layer_7/stable_rank_gate_proj": 84.17213439941406, "geo/layer_7/stable_rank_down_proj": 141.34083557128906, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.46034112572669983, "geo/layer_7/attn_entropy_mean": 4.636209011077881, "geo/layer_7/attn_entropy_std": 0.8003604412078857, "geo/layer_14/stable_rank_q_proj": 52.104312896728516, "geo/layer_14/stable_rank_k_proj": 39.42741394042969, "geo/layer_14/stable_rank_o_proj": 44.27886199951172, "geo/layer_14/stable_rank_gate_proj": 72.23567962646484, "geo/layer_14/stable_rank_down_proj": 129.38217163085938, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4062449336051941, "geo/layer_14/attn_entropy_mean": 5.545319557189941, "geo/layer_14/attn_entropy_std": 0.3731132745742798, "geo/layer_21/stable_rank_q_proj": 40.76722717285156, "geo/layer_21/stable_rank_k_proj": 30.13492774963379, "geo/layer_21/stable_rank_o_proj": 71.74699401855469, "geo/layer_21/stable_rank_gate_proj": 67.25703430175781, "geo/layer_21/stable_rank_down_proj": 52.49147033691406, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.145978644490242, "geo/layer_21/attn_entropy_mean": 5.716458320617676, "geo/layer_21/attn_entropy_std": 0.3018715977668762, "geo/layer_27/stable_rank_q_proj": 43.1597785949707, "geo/layer_27/stable_rank_k_proj": 31.659454345703125, "geo/layer_27/stable_rank_o_proj": 116.18250274658203, "geo/layer_27/stable_rank_gate_proj": 81.19450378417969, "geo/layer_27/stable_rank_down_proj": 129.53436279296875, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08973274379968643, "geo/layer_27/attn_entropy_mean": 4.1970415115356445, "geo/layer_27/attn_entropy_std": 0.7214011549949646, "attnres/final_alpha/block_0": 0.23693670332431793, "attnres/block_norm/0": 1.755293846130371, "attnres/final_alpha/block_1": 0.004667093977332115, "attnres/block_norm/1": 45210.76171875, "attnres/final_alpha/block_2": 0.010282030329108238, "attnres/block_norm/2": 28199.9609375, "attnres/final_alpha/block_3": 0.012295188382267952, "attnres/block_norm/3": 55221.8203125, "attnres/final_alpha/block_4": 0.014286499470472336, "attnres/block_norm/4": 14610.564453125, "attnres/final_alpha/block_5": 0.6103746294975281, "attnres/block_norm/5": 6469.8212890625, "attnres/final_alpha/block_6": 0.11115783452987671, "attnres/block_norm/6": 36640.828125, "geo/tier1_time_s": 1.3562145233154297, "geo/step": 56850.0, "geo/rankme_slope": 8.534734206182473e-05} {"step": 56860, "timestamp": 1778256001.4398217, "train/loss": 2.1508833289146425, "train/z_loss": 0.0013855666620656848, "train/perplexity": 8.592445001068223, "train/grad_norm": 0.1044921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1792536.0781685822, "perf/iters_per_sec": 0.8547478094904815, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1699357271194457, "data/tokens_consumed": 119246159872, "data/tokens_consumed_B": 119.246159872, "train/loss_slope": 1.7181645418264678e-06} {"step": 56870, "timestamp": 1778256011.7860742, "train/loss": 2.1352780938148497, "train/z_loss": 0.0013945987448096275, "train/perplexity": 8.459398686235117, "train/grad_norm": 0.10107421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027999.614464658, "perf/iters_per_sec": 0.9670255729983607, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340988159179687, "data/tokens_consumed": 119267131392, "data/tokens_consumed_B": 119.267131392, "train/loss_slope": -7.237547551985931e-08} {"step": 56880, "timestamp": 1778256022.1314802, "train/loss": 2.1675943613052366, "train/z_loss": 0.0013765475596301258, "train/perplexity": 8.737240095965234, "train/grad_norm": 0.283203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028069.7522746834, "perf/iters_per_sec": 0.9670590173123758, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340630531311035, "data/tokens_consumed": 119288102912, "data/tokens_consumed_B": 119.288102912, "train/loss_slope": -6.895055149970441e-07} {"step": 56890, "timestamp": 1778256032.4746056, "train/loss": 2.1701696515083313, "train/z_loss": 0.0013762613176368176, "train/perplexity": 8.759770022884732, "train/grad_norm": 0.26171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029091.6022304639, "perf/iters_per_sec": 0.9675462733414001, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0335422992706298, "data/tokens_consumed": 119309074432, "data/tokens_consumed_B": 119.309074432, "train/loss_slope": -1.0285099311666583e-06} {"step": 56900, "timestamp": 1778256042.8183005, "grad/layer_0/attn": 0.002659730613231659, "grad/layer_0/mlp": 0.002659379504621029, "grad/layer_0/attn_mlp_ratio": 1.0001319889082476, "grad/layer_4/attn": 0.0020798167679458857, "grad/layer_4/mlp": 0.002326820744201541, "grad/layer_4/attn_mlp_ratio": 0.8938448239918463, "grad/layer_8/attn": 0.006558455992490053, "grad/layer_8/mlp": 0.003710538148880005, "grad/layer_8/attn_mlp_ratio": 1.7675214625451356, "grad/layer_12/attn": 0.004204104654490948, "grad/layer_12/mlp": 0.006567014846950769, "grad/layer_12/attn_mlp_ratio": 0.6401850290356077, "grad/layer_16/attn": 0.0034403884783387184, "grad/layer_16/mlp": 0.004715810064226389, "grad/layer_16/attn_mlp_ratio": 0.7295434630590355, "grad/layer_20/attn": 0.003331799292936921, "grad/layer_20/mlp": 0.00620306096971035, "grad/layer_20/attn_mlp_ratio": 0.5371217944646962, "grad/layer_24/attn": 0.0047236159443855286, "grad/layer_24/mlp": 0.007888825610280037, "grad/layer_24/attn_mlp_ratio": 0.5987730136096326, "grad/layer_27/attn": 0.00403604144230485, "grad/layer_27/mlp": 0.007228719536215067, "grad/layer_27/attn_mlp_ratio": 0.5583342065287384} {"step": 56900, "timestamp": 1778256042.8326356, "train/loss": 2.1172534465789794, "train/z_loss": 0.0013910160167142748, "train/perplexity": 8.308286967970952, "train/grad_norm": 0.08154296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025771.2927955205, "perf/iters_per_sec": 0.9659630264260867, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352363109588623, "data/tokens_consumed": 119330045952, "data/tokens_consumed_B": 119.330045952, "train/loss_slope": -2.576541306913187e-06} {"step": 56910, "timestamp": 1778256053.1818829, "train/loss": 2.159283709526062, "train/z_loss": 0.001385465485509485, "train/perplexity": 8.664928829088627, "train/grad_norm": 0.2470703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027793.063418396, "perf/iters_per_sec": 0.9669270817844372, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342041492462157, "data/tokens_consumed": 119351017472, "data/tokens_consumed_B": 119.351017472, "train/loss_slope": -1.956066003691321e-06} {"step": 56920, "timestamp": 1778256063.5238454, "train/loss": 2.10713506937027, "train/z_loss": 0.0013772435253486038, "train/perplexity": 8.22464446329691, "train/grad_norm": 0.248046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029283.8579831, "perf/iters_per_sec": 0.9676379480281353, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033444380760193, "data/tokens_consumed": 119371988992, "data/tokens_consumed_B": 119.371988992, "train/loss_slope": -7.337705151225749e-06} {"step": 56925, "timestamp": 1778256069.279846, "eos/sharpness": 76.08976364135741, "eos/L0_probe": 1.9852454662322998, "eos/L_plus": 2.2875473499298096, "eos/L_minus": 2.4438412189483643, "eos/grad_norm": 0.18198595941066742, "eos/embed_grad_frac": 0.0637681856751442, "eos/time_s": 0.5959351062774658} {"step": 56925, "timestamp": 1778256070.6527367, "geo/rankme_last": 439.55902099609375, "geo/layer_0/stable_rank_q_proj": 19.48126792907715, "geo/layer_0/stable_rank_k_proj": 16.222370147705078, "geo/layer_0/stable_rank_o_proj": 47.362709045410156, "geo/layer_0/stable_rank_gate_proj": 132.8183135986328, "geo/layer_0/stable_rank_down_proj": 54.37822723388672, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.061133407056331635, "geo/layer_0/attn_entropy_mean": 6.168268203735352, "geo/layer_0/attn_entropy_std": 0.4061734080314636, "geo/layer_7/stable_rank_q_proj": 43.467018127441406, "geo/layer_7/stable_rank_k_proj": 41.51255416870117, "geo/layer_7/stable_rank_o_proj": 92.39330291748047, "geo/layer_7/stable_rank_gate_proj": 84.08343505859375, "geo/layer_7/stable_rank_down_proj": 141.33096313476562, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.471164345741272, "geo/layer_7/attn_entropy_mean": 4.654516220092773, "geo/layer_7/attn_entropy_std": 0.8040234446525574, "geo/layer_14/stable_rank_q_proj": 52.17427062988281, "geo/layer_14/stable_rank_k_proj": 39.44794845581055, "geo/layer_14/stable_rank_o_proj": 44.25825500488281, "geo/layer_14/stable_rank_gate_proj": 72.2121810913086, "geo/layer_14/stable_rank_down_proj": 129.6407470703125, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3919617831707001, "geo/layer_14/attn_entropy_mean": 5.529036045074463, "geo/layer_14/attn_entropy_std": 0.37899884581565857, "geo/layer_21/stable_rank_q_proj": 40.78212356567383, "geo/layer_21/stable_rank_k_proj": 30.143436431884766, "geo/layer_21/stable_rank_o_proj": 71.74169158935547, "geo/layer_21/stable_rank_gate_proj": 67.29590606689453, "geo/layer_21/stable_rank_down_proj": 52.45415115356445, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15038713812828064, "geo/layer_21/attn_entropy_mean": 5.698131561279297, "geo/layer_21/attn_entropy_std": 0.30661943554878235, "geo/layer_27/stable_rank_q_proj": 43.237125396728516, "geo/layer_27/stable_rank_k_proj": 31.693403244018555, "geo/layer_27/stable_rank_o_proj": 115.9209976196289, "geo/layer_27/stable_rank_gate_proj": 81.1268310546875, "geo/layer_27/stable_rank_down_proj": 129.3311004638672, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09003081172704697, "geo/layer_27/attn_entropy_mean": 4.238382339477539, "geo/layer_27/attn_entropy_std": 0.7254595756530762, "attnres/final_alpha/block_0": 0.2382568120956421, "attnres/block_norm/0": 1.75557541847229, "attnres/final_alpha/block_1": 0.0046860165894031525, "attnres/block_norm/1": 45348.953125, "attnres/final_alpha/block_2": 0.010288156569004059, "attnres/block_norm/2": 28159.6328125, "attnres/final_alpha/block_3": 0.012257356196641922, "attnres/block_norm/3": 55506.25390625, "attnres/final_alpha/block_4": 0.014247880317270756, "attnres/block_norm/4": 14693.9951171875, "attnres/final_alpha/block_5": 0.608708918094635, "attnres/block_norm/5": 6507.37548828125, "attnres/final_alpha/block_6": 0.11155484616756439, "attnres/block_norm/6": 36808.12890625, "geo/tier1_time_s": 1.3537466526031494, "geo/step": 56925.0, "geo/rankme_slope": 9.058568739995998e-05} {"step": 56930, "timestamp": 1778256075.8354921, "train/loss": 2.1898162603378295, "train/z_loss": 0.0013719473499804735, "train/perplexity": 8.93357151247797, "train/grad_norm": 0.10498046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1704041.8981826361, "perf/iters_per_sec": 0.8125504961884671, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.230692744255066, "data/tokens_consumed": 119392960512, "data/tokens_consumed_B": 119.392960512, "train/loss_slope": -1.4549898378777147e-06} {"step": 56940, "timestamp": 1778256086.1787288, "train/loss": 2.153201127052307, "train/z_loss": 0.0013984602293930948, "train/perplexity": 8.612383652047557, "train/grad_norm": 0.0859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028939.1155413708, "perf/iters_per_sec": 0.9674735620219092, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0336199760437013, "data/tokens_consumed": 119413932032, "data/tokens_consumed_B": 119.413932032, "train/loss_slope": 8.853975516913189e-07} {"step": 56950, "timestamp": 1778256096.5208793, "grad/layer_0/attn": 0.002645609201863408, "grad/layer_0/mlp": 0.002881275722756982, "grad/layer_0/attn_mlp_ratio": 0.9182075457572564, "grad/layer_4/attn": 0.002312456024810672, "grad/layer_4/mlp": 0.002617920283228159, "grad/layer_4/attn_mlp_ratio": 0.8833179341990458, "grad/layer_8/attn": 0.004322309046983719, "grad/layer_8/mlp": 0.003454175777733326, "grad/layer_8/attn_mlp_ratio": 1.2513285947153545, "grad/layer_12/attn": 0.006677198689430952, "grad/layer_12/mlp": 0.006963699124753475, "grad/layer_12/attn_mlp_ratio": 0.958857996866936, "grad/layer_16/attn": 0.0035713892430067062, "grad/layer_16/mlp": 0.004665578715503216, "grad/layer_16/attn_mlp_ratio": 0.7654761358096374, "grad/layer_20/attn": 0.004091878887265921, "grad/layer_20/mlp": 0.00596651341766119, "grad/layer_20/attn_mlp_ratio": 0.6858073605554981, "grad/layer_24/attn": 0.00954350084066391, "grad/layer_24/mlp": 0.010665233246982098, "grad/layer_24/attn_mlp_ratio": 0.8948234445676146, "grad/layer_27/attn": 0.005709688179194927, "grad/layer_27/mlp": 0.010635432787239552, "grad/layer_27/attn_mlp_ratio": 0.5368552685848303} {"step": 56950, "timestamp": 1778256096.5350444, "train/loss": 2.139327621459961, "train/z_loss": 0.0013742525363340974, "train/perplexity": 8.493724710259235, "train/grad_norm": 0.1767578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025987.32490873, "perf/iters_per_sec": 0.9660660385650301, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351259231567382, "data/tokens_consumed": 119434903552, "data/tokens_consumed_B": 119.434903552, "train/loss_slope": -4.4959067869147346e-07} {"step": 56960, "timestamp": 1778256106.877346, "train/loss": 2.149111008644104, "train/z_loss": 0.0013833861565217375, "train/perplexity": 8.57722992359715, "train/grad_norm": 0.1728515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028908.6022342301, "perf/iters_per_sec": 0.9674590121432448, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0336355209350585, "data/tokens_consumed": 119455875072, "data/tokens_consumed_B": 119.455875072, "train/loss_slope": -1.805221050879389e-06} {"step": 56970, "timestamp": 1778256117.225245, "train/loss": 2.1333595275878907, "train/z_loss": 0.0013842083397321403, "train/perplexity": 8.443184328748274, "train/grad_norm": 0.10302734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028081.5359027504, "perf/iters_per_sec": 0.9670646361840965, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03405704498291, "data/tokens_consumed": 119476846592, "data/tokens_consumed_B": 119.476846592, "train/loss_slope": -3.531176443755523e-07} {"step": 56980, "timestamp": 1778256127.5651228, "train/loss": 2.1604501724243166, "train/z_loss": 0.0013708115322515369, "train/perplexity": 8.675042044282225, "train/grad_norm": 0.2080078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029292.5658262118, "perf/iters_per_sec": 0.9676421002512988, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0334399461746215, "data/tokens_consumed": 119497818112, "data/tokens_consumed_B": 119.497818112, "train/loss_slope": 1.7649372740618587e-06} {"step": 56990, "timestamp": 1778256137.923363, "train/loss": 2.174410605430603, "train/z_loss": 0.0013802361208945513, "train/perplexity": 8.796998690655194, "train/grad_norm": 0.28515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025597.5207022238, "perf/iters_per_sec": 0.965880165434944, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0353251218795776, "data/tokens_consumed": 119518789632, "data/tokens_consumed_B": 119.518789632, "train/loss_slope": 3.808714354177694e-06} {"step": 57000, "timestamp": 1778256148.2880561, "grad/layer_0/attn": 0.0025287186726927757, "grad/layer_0/mlp": 0.0029678712598979473, "grad/layer_0/attn_mlp_ratio": 0.852031090990321, "grad/layer_4/attn": 0.0023674024268984795, "grad/layer_4/mlp": 0.002508905716240406, "grad/layer_4/attn_mlp_ratio": 0.943599561041327, "grad/layer_8/attn": 0.005727770272642374, "grad/layer_8/mlp": 0.0037903632037341595, "grad/layer_8/attn_mlp_ratio": 1.511140176721196, "grad/layer_12/attn": 0.0061393436044454575, "grad/layer_12/mlp": 0.00712995370849967, "grad/layer_12/attn_mlp_ratio": 0.8610635874143675, "grad/layer_16/attn": 0.0037054901476949453, "grad/layer_16/mlp": 0.004720595665276051, "grad/layer_16/attn_mlp_ratio": 0.7849623928725141, "grad/layer_20/attn": 0.004307153634727001, "grad/layer_20/mlp": 0.007373078726232052, "grad/layer_20/attn_mlp_ratio": 0.5841730078082096, "grad/layer_24/attn": 0.02110925130546093, "grad/layer_24/mlp": 0.014740421436727047, "grad/layer_24/attn_mlp_ratio": 1.4320656470281659, "grad/layer_27/attn": 0.013867351226508617, "grad/layer_27/mlp": 0.015358303673565388, "grad/layer_27/attn_mlp_ratio": 0.9029220564302819} {"step": 57000, "timestamp": 1778256148.8740385, "eos/sharpness": 81.4697504043579, "eos/L0_probe": 1.983306884765625, "eos/L_plus": 2.339884042739868, "eos/L_minus": 2.441427230834961, "eos/grad_norm": 0.2873150408267975, "eos/embed_grad_frac": 0.035704355686903, "eos/time_s": 0.5832035541534424} {"step": 57000, "timestamp": 1778256148.8914614, "train/loss": 2.1555372714996337, "train/z_loss": 0.0013858893420547248, "train/perplexity": 8.632526943953183, "train/grad_norm": 0.287109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1913278.0072537179, "perf/iters_per_sec": 0.9123220478314008, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0961041688919066, "data/tokens_consumed": 119539761152, "data/tokens_consumed_B": 119.539761152, "train/loss_slope": 2.93635821292392e-06} {"step": 57000, "timestamp": 1778256150.250998, "geo/rankme_last": 438.3841552734375, "geo/layer_0/stable_rank_q_proj": 19.475461959838867, "geo/layer_0/stable_rank_k_proj": 16.17722511291504, "geo/layer_0/stable_rank_o_proj": 47.38780212402344, "geo/layer_0/stable_rank_gate_proj": 132.83128356933594, "geo/layer_0/stable_rank_down_proj": 54.530269622802734, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06330963969230652, "geo/layer_0/attn_entropy_mean": 6.164575576782227, "geo/layer_0/attn_entropy_std": 0.4067389667034149, "geo/layer_7/stable_rank_q_proj": 43.48218536376953, "geo/layer_7/stable_rank_k_proj": 41.511688232421875, "geo/layer_7/stable_rank_o_proj": 92.41978454589844, "geo/layer_7/stable_rank_gate_proj": 83.96517944335938, "geo/layer_7/stable_rank_down_proj": 141.16098022460938, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.46759602427482605, "geo/layer_7/attn_entropy_mean": 4.61541748046875, "geo/layer_7/attn_entropy_std": 0.8011847138404846, "geo/layer_14/stable_rank_q_proj": 52.16719436645508, "geo/layer_14/stable_rank_k_proj": 39.43838119506836, "geo/layer_14/stable_rank_o_proj": 44.23600769042969, "geo/layer_14/stable_rank_gate_proj": 72.13407135009766, "geo/layer_14/stable_rank_down_proj": 129.76478576660156, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.40646618604660034, "geo/layer_14/attn_entropy_mean": 5.510769844055176, "geo/layer_14/attn_entropy_std": 0.3946540057659149, "geo/layer_21/stable_rank_q_proj": 40.813289642333984, "geo/layer_21/stable_rank_k_proj": 30.150169372558594, "geo/layer_21/stable_rank_o_proj": 71.66887664794922, "geo/layer_21/stable_rank_gate_proj": 67.15391540527344, "geo/layer_21/stable_rank_down_proj": 52.37358856201172, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1448844075202942, "geo/layer_21/attn_entropy_mean": 5.696533203125, "geo/layer_21/attn_entropy_std": 0.30671170353889465, "geo/layer_27/stable_rank_q_proj": 43.31832504272461, "geo/layer_27/stable_rank_k_proj": 31.702804565429688, "geo/layer_27/stable_rank_o_proj": 115.96601104736328, "geo/layer_27/stable_rank_gate_proj": 81.10984802246094, "geo/layer_27/stable_rank_down_proj": 129.2113037109375, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09272678196430206, "geo/layer_27/attn_entropy_mean": 4.205482006072998, "geo/layer_27/attn_entropy_std": 0.7095069885253906, "attnres/final_alpha/block_0": 0.23846882581710815, "attnres/block_norm/0": 1.7557361125946045, "attnres/final_alpha/block_1": 0.004716966301202774, "attnres/block_norm/1": 45506.94921875, "attnres/final_alpha/block_2": 0.010280409827828407, "attnres/block_norm/2": 28252.1328125, "attnres/final_alpha/block_3": 0.012130897492170334, "attnres/block_norm/3": 55497.3828125, "attnres/final_alpha/block_4": 0.014298031106591225, "attnres/block_norm/4": 14616.4619140625, "attnres/final_alpha/block_5": 0.6082441210746765, "attnres/block_norm/5": 6502.234375, "attnres/final_alpha/block_6": 0.11186078190803528, "attnres/block_norm/6": 36892.515625, "geo/tier1_time_s": 1.356093168258667, "geo/step": 57000.0, "geo/rankme_slope": 9.225963823029211e-05} {"step": 57000, "timestamp": 1778256157.3580558, "geo/ww_alpha_mean": 7.96481023626828, "geo/ww_alpha_std": 5.688507677493571, "geo/ww_alpha_min": 1.352893716279336, "geo/ww_alpha_max": 56.03348960857024, "geo/ww_alpha_healthy_frac": 0.16751269035532995, "geo/ww_alpha_by_type/q_proj": 4.006488495125459, "geo/ww_alpha_by_type/k_proj": 4.505919181798785, "geo/ww_alpha_by_type/v_proj": 9.203537939161508, "geo/ww_alpha_by_type/o_proj": 7.604721644027959, "geo/ww_alpha_by_type/gate_proj": 8.10911910140848, "geo/ww_alpha_by_type/up_proj": 13.977266769735364, "geo/ww_alpha_by_type/down_proj": 8.455748315851045, "geo/twonn_id/layer_0": 0.6833426356315613, "geo/twonn_id/layer_7": 3.068035364151001, "geo/twonn_id/layer_14": 4.942211151123047, "geo/twonn_id/layer_21": 6.545904159545898, "geo/twonn_id/layer_27": 6.053886413574219, "geo/tier2_time_s": 7.100295782089233} {"step": 57000, "timestamp": 1778256158.1221783, "eoc/jacobian_sigma/layer_0/attn": 1103.87646484375, "eoc/jacobian_sigma/layer_0/mlp": 8960.0205078125, "eoc/jacobian_sigma/layer_0": 8960.0205078125, "eoc/jacobian_sigma/layer_7/attn": 1.164021372795105, "eoc/jacobian_sigma/layer_7/mlp": 1.7673144340515137, "eoc/jacobian_sigma/layer_7": 1.7673144340515137, "eoc/jacobian_sigma/layer_14/attn": 1.4976379871368408, "eoc/jacobian_sigma/layer_14/mlp": 6.712307929992676, "eoc/jacobian_sigma/layer_14": 6.712307929992676, "eoc/jacobian_sigma/layer_21/attn": 1.07569420337677, "eoc/jacobian_sigma/layer_21/mlp": 4.57255220413208, "eoc/jacobian_sigma/layer_21": 4.57255220413208, "eoc/jacobian_sigma/layer_27/attn": 2.845977306365967, "eoc/jacobian_sigma/layer_27/mlp": 33.315555572509766, "eoc/jacobian_sigma/layer_27": 33.315555572509766, "eoc/layer0_sigma": 8960.0205078125, "eoc/sigma_max": 33.315555572509766, "eoc/sigma_min": 1.7673144340515137, "eoc/sigma_mean": 11.591932535171509, "eoc/time_s": 0.7570900917053223} {"step": 57010, "timestamp": 1778256168.5217867, "train/loss": 2.1145785450935364, "train/z_loss": 0.0013767906348221004, "train/perplexity": 8.286092815637804, "train/grad_norm": 0.2080078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1068501.9384927421, "perf/iters_per_sec": 0.5095014278854094, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.962703037261963, "data/tokens_consumed": 119560732672, "data/tokens_consumed_B": 119.560732672, "train/loss_slope": 8.096942377991411e-07} {"step": 57020, "timestamp": 1778256178.89499, "train/loss": 2.170828342437744, "train/z_loss": 0.0013904901104979217, "train/perplexity": 8.765541904676951, "train/grad_norm": 0.171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022914.071241929, "perf/iters_per_sec": 0.9646005970201154, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0366985082626343, "data/tokens_consumed": 119581704192, "data/tokens_consumed_B": 119.581704192, "train/loss_slope": 2.012909263452427e-07} {"step": 57030, "timestamp": 1778256189.272809, "train/loss": 2.1600380897521974, "train/z_loss": 0.001381326629780233, "train/perplexity": 8.671467946238385, "train/grad_norm": 0.18359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021768.4936841663, "perf/iters_per_sec": 0.9640543430729706, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0372859239578247, "data/tokens_consumed": 119602675712, "data/tokens_consumed_B": 119.602675712, "train/loss_slope": 6.1213857234149804e-06} {"step": 57040, "timestamp": 1778256199.6562474, "train/loss": 2.1649708271026613, "train/z_loss": 0.0013664621743373572, "train/perplexity": 8.714347690372453, "train/grad_norm": 0.2216796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020643.1977907133, "perf/iters_per_sec": 0.9635177601769987, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03786358833313, "data/tokens_consumed": 119623647232, "data/tokens_consumed_B": 119.623647232, "train/loss_slope": 5.100828536165497e-06} {"step": 57050, "timestamp": 1778256210.023918, "grad/layer_0/attn": 0.0026981087867170572, "grad/layer_0/mlp": 0.0028225132264196873, "grad/layer_0/attn_mlp_ratio": 0.9559241975801633, "grad/layer_4/attn": 0.002200481714680791, "grad/layer_4/mlp": 0.0025299275293946266, "grad/layer_4/attn_mlp_ratio": 0.8697804985067232, "grad/layer_8/attn": 0.0033316805493086576, "grad/layer_8/mlp": 0.003720540553331375, "grad/layer_8/attn_mlp_ratio": 0.8954829041648741, "grad/layer_12/attn": 0.005041669588536024, "grad/layer_12/mlp": 0.006772561930119991, "grad/layer_12/attn_mlp_ratio": 0.7444257529298258, "grad/layer_16/attn": 0.0036783195100724697, "grad/layer_16/mlp": 0.004738207440823317, "grad/layer_16/attn_mlp_ratio": 0.7763103406469445, "grad/layer_20/attn": 0.004329908639192581, "grad/layer_20/mlp": 0.006493041757494211, "grad/layer_20/attn_mlp_ratio": 0.6668536464453928, "grad/layer_24/attn": 0.007681249640882015, "grad/layer_24/mlp": 0.008444629609584808, "grad/layer_24/attn_mlp_ratio": 0.9096017119807701, "grad/layer_27/attn": 0.003894287161529064, "grad/layer_27/mlp": 0.006994244642555714, "grad/layer_27/attn_mlp_ratio": 0.5567845142499378} {"step": 57050, "timestamp": 1778256210.0379522, "train/loss": 2.201081395149231, "train/z_loss": 0.0013664136989973486, "train/perplexity": 9.034778384152036, "train/grad_norm": 0.09521484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021205.9445547436, "perf/iters_per_sec": 0.963786098744747, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0375746250152589, "data/tokens_consumed": 119644618752, "data/tokens_consumed_B": 119.644618752, "train/loss_slope": 1.1674775406293487e-05} {"step": 57060, "timestamp": 1778256220.9654217, "train/loss": 2.2145355701446534, "train/z_loss": 0.0013823570217937231, "train/perplexity": 9.157155267582873, "train/grad_norm": 0.10498046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1920044.1942105473, "perf/iters_per_sec": 0.9155484171917664, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0922415256500244, "data/tokens_consumed": 119665590272, "data/tokens_consumed_B": 119.665590272, "train/loss_slope": 1.5167722292859132e-05} {"step": 57070, "timestamp": 1778256231.3424895, "train/loss": 2.1717301845550536, "train/z_loss": 0.0013864451204426587, "train/perplexity": 8.773450605211222, "train/grad_norm": 0.09326171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022124.5168643654, "perf/iters_per_sec": 0.9642241081544711, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037103295326233, "data/tokens_consumed": 119686561792, "data/tokens_consumed_B": 119.686561792, "train/loss_slope": 1.5006244701198884e-05} {"step": 57075, "timestamp": 1778256237.127197, "eos/sharpness": 74.82061386108397, "eos/L0_probe": 1.9833747148513794, "eos/L_plus": 2.467686891555786, "eos/L_minus": 2.2472686767578125, "eos/grad_norm": 0.16599790751934052, "eos/embed_grad_frac": 0.10530858486890793, "eos/time_s": 0.6012313365936279} {"step": 57075, "timestamp": 1778256238.5030205, "geo/rankme_last": 438.75177001953125, "geo/layer_0/stable_rank_q_proj": 19.464033126831055, "geo/layer_0/stable_rank_k_proj": 16.221248626708984, "geo/layer_0/stable_rank_o_proj": 47.3927001953125, "geo/layer_0/stable_rank_gate_proj": 132.65875244140625, "geo/layer_0/stable_rank_down_proj": 54.517555236816406, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06524897366762161, "geo/layer_0/attn_entropy_mean": 6.169171333312988, "geo/layer_0/attn_entropy_std": 0.40533751249313354, "geo/layer_7/stable_rank_q_proj": 43.567256927490234, "geo/layer_7/stable_rank_k_proj": 41.43592071533203, "geo/layer_7/stable_rank_o_proj": 92.52732849121094, "geo/layer_7/stable_rank_gate_proj": 84.02814483642578, "geo/layer_7/stable_rank_down_proj": 141.25062561035156, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.46204861998558044, "geo/layer_7/attn_entropy_mean": 4.625875473022461, "geo/layer_7/attn_entropy_std": 0.8102590441703796, "geo/layer_14/stable_rank_q_proj": 52.167659759521484, "geo/layer_14/stable_rank_k_proj": 39.394325256347656, "geo/layer_14/stable_rank_o_proj": 44.2428092956543, "geo/layer_14/stable_rank_gate_proj": 72.1224594116211, "geo/layer_14/stable_rank_down_proj": 129.92098999023438, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3901430368423462, "geo/layer_14/attn_entropy_mean": 5.531773567199707, "geo/layer_14/attn_entropy_std": 0.4084774851799011, "geo/layer_21/stable_rank_q_proj": 40.81829071044922, "geo/layer_21/stable_rank_k_proj": 30.17691421508789, "geo/layer_21/stable_rank_o_proj": 71.64229583740234, "geo/layer_21/stable_rank_gate_proj": 67.21001434326172, "geo/layer_21/stable_rank_down_proj": 52.3649787902832, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14235590398311615, "geo/layer_21/attn_entropy_mean": 5.7168354988098145, "geo/layer_21/attn_entropy_std": 0.30906105041503906, "geo/layer_27/stable_rank_q_proj": 43.35911560058594, "geo/layer_27/stable_rank_k_proj": 31.71979522705078, "geo/layer_27/stable_rank_o_proj": 115.78947448730469, "geo/layer_27/stable_rank_gate_proj": 81.13917541503906, "geo/layer_27/stable_rank_down_proj": 129.03158569335938, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09458745270967484, "geo/layer_27/attn_entropy_mean": 4.2203474044799805, "geo/layer_27/attn_entropy_std": 0.7152872681617737, "attnres/final_alpha/block_0": 0.23592446744441986, "attnres/block_norm/0": 1.7557941675186157, "attnres/final_alpha/block_1": 0.004619495011866093, "attnres/block_norm/1": 45383.125, "attnres/final_alpha/block_2": 0.010093053802847862, "attnres/block_norm/2": 28292.84375, "attnres/final_alpha/block_3": 0.01195974089205265, "attnres/block_norm/3": 55339.12890625, "attnres/final_alpha/block_4": 0.014005810022354126, "attnres/block_norm/4": 14738.666015625, "attnres/final_alpha/block_5": 0.6131751537322998, "attnres/block_norm/5": 6478.7529296875, "attnres/final_alpha/block_6": 0.11022229492664337, "attnres/block_norm/6": 36853.2421875, "geo/tier1_time_s": 1.3577919006347656, "geo/step": 57075.0, "geo/rankme_slope": 0.00010563178396358543} {"step": 57080, "timestamp": 1778256243.6903582, "train/loss": 2.1796653032302857, "train/z_loss": 0.00136488052085042, "train/perplexity": 8.84334592402705, "train/grad_norm": 0.1484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1699298.2552390466, "perf/iters_per_sec": 0.8102885509677156, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2341282606124877, "data/tokens_consumed": 119707533312, "data/tokens_consumed_B": 119.707533312, "train/loss_slope": 1.9485782153464006e-05} {"step": 57090, "timestamp": 1778256254.0649486, "train/loss": 2.139139747619629, "train/z_loss": 0.0013787893112748861, "train/perplexity": 8.492129111469524, "train/grad_norm": 0.20703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022248.0852835258, "perf/iters_per_sec": 0.9642830301683072, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0370399236679078, "data/tokens_consumed": 119728504832, "data/tokens_consumed_B": 119.728504832, "train/loss_slope": 1.9002271435334406e-05} {"step": 57100, "timestamp": 1778256264.4452803, "grad/layer_0/attn": 0.0031657065264880657, "grad/layer_0/mlp": 0.0031641630921512842, "grad/layer_0/attn_mlp_ratio": 1.00048775434232, "grad/layer_4/attn": 0.0018224116647616029, "grad/layer_4/mlp": 0.002565850969403982, "grad/layer_4/attn_mlp_ratio": 0.7102562134227566, "grad/layer_8/attn": 0.006989530753344297, "grad/layer_8/mlp": 0.003575184615328908, "grad/layer_8/attn_mlp_ratio": 1.955012484635015, "grad/layer_12/attn": 0.0038182882126420736, "grad/layer_12/mlp": 0.006743282079696655, "grad/layer_12/attn_mlp_ratio": 0.5662358642114305, "grad/layer_16/attn": 0.005146735813468695, "grad/layer_16/mlp": 0.004940895829349756, "grad/layer_16/attn_mlp_ratio": 1.041660437107411, "grad/layer_20/attn": 0.004815615247935057, "grad/layer_20/mlp": 0.006195823661983013, "grad/layer_20/attn_mlp_ratio": 0.7772356724352322, "grad/layer_24/attn": 0.014114179648458958, "grad/layer_24/mlp": 0.009701098315417767, "grad/layer_24/attn_mlp_ratio": 1.454905315260751, "grad/layer_27/attn": 0.006543368566781282, "grad/layer_27/mlp": 0.010129029862582684, "grad/layer_27/attn_mlp_ratio": 0.6460015017186173} {"step": 57100, "timestamp": 1778256264.4593768, "train/loss": 2.1459330558776855, "train/z_loss": 0.0013811934972181917, "train/perplexity": 8.55001515855653, "train/grad_norm": 0.185546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2018642.184243214, "perf/iters_per_sec": 0.9625636025634833, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.038892388343811, "data/tokens_consumed": 119749476352, "data/tokens_consumed_B": 119.749476352, "train/loss_slope": 1.9550224198902994e-05} {"step": 57110, "timestamp": 1778256274.8299012, "train/loss": 2.137017107009888, "train/z_loss": 0.0013749112142249942, "train/perplexity": 8.474122490907268, "train/grad_norm": 0.14453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023529.8937510808, "perf/iters_per_sec": 0.9648942440753369, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0363830089569093, "data/tokens_consumed": 119770447872, "data/tokens_consumed_B": 119.770447872, "train/loss_slope": 2.0457906648628387e-05} {"step": 57120, "timestamp": 1778256285.1980047, "train/loss": 2.188191032409668, "train/z_loss": 0.001372639834880829, "train/perplexity": 8.91906421458377, "train/grad_norm": 0.125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023493.3983899364, "perf/iters_per_sec": 0.9648768417310412, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0364017009735107, "data/tokens_consumed": 119791419392, "data/tokens_consumed_B": 119.791419392, "train/loss_slope": 2.025586656718647e-05} {"step": 57130, "timestamp": 1778256295.5724502, "train/loss": 2.186257004737854, "train/z_loss": 0.0013635418144986033, "train/perplexity": 8.901831167552624, "train/grad_norm": 0.263671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022586.4183121142, "perf/iters_per_sec": 0.9644443599281856, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0368664503097533, "data/tokens_consumed": 119812390912, "data/tokens_consumed_B": 119.812390912, "train/loss_slope": 2.241229597527163e-05} {"step": 57140, "timestamp": 1778256305.9433599, "train/loss": 2.130141353607178, "train/z_loss": 0.0013706171419471502, "train/perplexity": 8.416056367289872, "train/grad_norm": 0.271484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023278.502791582, "perf/iters_per_sec": 0.9647743715246114, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036511778831482, "data/tokens_consumed": 119833362432, "data/tokens_consumed_B": 119.833362432, "train/loss_slope": 1.722184142442639e-05} {"step": 57150, "timestamp": 1778256316.3164136, "grad/layer_0/attn": 0.003289531683549285, "grad/layer_0/mlp": 0.0030649404507130384, "grad/layer_0/attn_mlp_ratio": 1.0732774842187385, "grad/layer_4/attn": 0.002417485462501645, "grad/layer_4/mlp": 0.0025454433634877205, "grad/layer_4/attn_mlp_ratio": 0.9497305664723938, "grad/layer_8/attn": 0.004704578313976526, "grad/layer_8/mlp": 0.0036930046044290066, "grad/layer_8/attn_mlp_ratio": 1.273916144307735, "grad/layer_12/attn": 0.004445808008313179, "grad/layer_12/mlp": 0.006316839251667261, "grad/layer_12/attn_mlp_ratio": 0.7038026077297275, "grad/layer_16/attn": 0.005172336474061012, "grad/layer_16/mlp": 0.0042173611000180244, "grad/layer_16/attn_mlp_ratio": 1.2264390524669566, "grad/layer_20/attn": 0.0031192824244499207, "grad/layer_20/mlp": 0.005839860066771507, "grad/layer_20/attn_mlp_ratio": 0.5341364922054934, "grad/layer_24/attn": 0.012369249016046524, "grad/layer_24/mlp": 0.010330576449632645, "grad/layer_24/attn_mlp_ratio": 1.1973435322432584, "grad/layer_27/attn": 0.005593898240476847, "grad/layer_27/mlp": 0.00876640249043703, "grad/layer_27/attn_mlp_ratio": 0.6381064732960177} {"step": 57150, "timestamp": 1778256316.9034424, "eos/sharpness": 51.95002555847167, "eos/L0_probe": 1.9809845685958862, "eos/L_plus": 2.1819238662719727, "eos/L_minus": 2.2995455265045166, "eos/grad_norm": 0.14146481454372406, "eos/embed_grad_frac": 0.1591292917728424, "eos/time_s": 0.5841624736785889} {"step": 57150, "timestamp": 1778256316.9231825, "train/loss": 2.1305108785629274, "train/z_loss": 0.001364874152932316, "train/perplexity": 8.419166884818113, "train/grad_norm": 0.1416015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1910801.4932873053, "perf/iters_per_sec": 0.9111411539494063, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0975247859954833, "data/tokens_consumed": 119854333952, "data/tokens_consumed_B": 119.854333952, "train/loss_slope": 1.567742491211934e-05} {"step": 57150, "timestamp": 1778256318.2832618, "geo/rankme_last": 439.4012451171875, "geo/layer_0/stable_rank_q_proj": 19.464088439941406, "geo/layer_0/stable_rank_k_proj": 16.242494583129883, "geo/layer_0/stable_rank_o_proj": 47.409358978271484, "geo/layer_0/stable_rank_gate_proj": 132.57205200195312, "geo/layer_0/stable_rank_down_proj": 54.548728942871094, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06149355694651604, "geo/layer_0/attn_entropy_mean": 6.168939113616943, "geo/layer_0/attn_entropy_std": 0.4012210965156555, "geo/layer_7/stable_rank_q_proj": 43.55672073364258, "geo/layer_7/stable_rank_k_proj": 41.43329620361328, "geo/layer_7/stable_rank_o_proj": 92.40103912353516, "geo/layer_7/stable_rank_gate_proj": 83.73361206054688, "geo/layer_7/stable_rank_down_proj": 141.20321655273438, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4664178490638733, "geo/layer_7/attn_entropy_mean": 4.626328468322754, "geo/layer_7/attn_entropy_std": 0.8171426653862, "geo/layer_14/stable_rank_q_proj": 52.00218200683594, "geo/layer_14/stable_rank_k_proj": 39.40148162841797, "geo/layer_14/stable_rank_o_proj": 44.282875061035156, "geo/layer_14/stable_rank_gate_proj": 72.13532257080078, "geo/layer_14/stable_rank_down_proj": 130.0464630126953, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4086262583732605, "geo/layer_14/attn_entropy_mean": 5.536800861358643, "geo/layer_14/attn_entropy_std": 0.37956172227859497, "geo/layer_21/stable_rank_q_proj": 40.73842239379883, "geo/layer_21/stable_rank_k_proj": 30.102699279785156, "geo/layer_21/stable_rank_o_proj": 71.72784423828125, "geo/layer_21/stable_rank_gate_proj": 67.22032165527344, "geo/layer_21/stable_rank_down_proj": 52.33879852294922, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1404060274362564, "geo/layer_21/attn_entropy_mean": 5.704968452453613, "geo/layer_21/attn_entropy_std": 0.3025560975074768, "geo/layer_27/stable_rank_q_proj": 43.448612213134766, "geo/layer_27/stable_rank_k_proj": 31.777854919433594, "geo/layer_27/stable_rank_o_proj": 115.95492553710938, "geo/layer_27/stable_rank_gate_proj": 81.13148498535156, "geo/layer_27/stable_rank_down_proj": 129.24998474121094, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09323112666606903, "geo/layer_27/attn_entropy_mean": 4.195728302001953, "geo/layer_27/attn_entropy_std": 0.7216140627861023, "attnres/final_alpha/block_0": 0.23898066580295563, "attnres/block_norm/0": 1.755958080291748, "attnres/final_alpha/block_1": 0.004640690051019192, "attnres/block_norm/1": 45430.2265625, "attnres/final_alpha/block_2": 0.010518975555896759, "attnres/block_norm/2": 28177.251953125, "attnres/final_alpha/block_3": 0.012438583187758923, "attnres/block_norm/3": 55554.1171875, "attnres/final_alpha/block_4": 0.014224514365196228, "attnres/block_norm/4": 14713.9287109375, "attnres/final_alpha/block_5": 0.6072095036506653, "attnres/block_norm/5": 6499.33544921875, "attnres/final_alpha/block_6": 0.11198706924915314, "attnres/block_norm/6": 36923.6640625, "geo/tier1_time_s": 1.3566715717315674, "geo/step": 57150.0, "geo/rankme_slope": 0.00010469515931372549} {"step": 57160, "timestamp": 1778256328.6579206, "train/loss": 2.0943825960159304, "train/z_loss": 0.0013933382695540785, "train/perplexity": 8.120425838925623, "train/grad_norm": 0.09814453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1787610.180118518, "perf/iters_per_sec": 0.8523989582626905, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1731595754623414, "data/tokens_consumed": 119875305472, "data/tokens_consumed_B": 119.875305472, "train/loss_slope": 1.21441165141218e-05} {"step": 57170, "timestamp": 1778256339.0314455, "train/loss": 2.168578290939331, "train/z_loss": 0.0013785454910248518, "train/perplexity": 8.745841156140944, "train/grad_norm": 0.255859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022719.9040657736, "perf/iters_per_sec": 0.9645080108956211, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0367980241775512, "data/tokens_consumed": 119896276992, "data/tokens_consumed_B": 119.896276992, "train/loss_slope": 1.5588509367637496e-05} {"step": 57180, "timestamp": 1778256349.4094036, "train/loss": 2.1382755756378176, "train/z_loss": 0.001389184291474521, "train/perplexity": 8.484793621444451, "train/grad_norm": 0.1376953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021651.3495041972, "perf/iters_per_sec": 0.9639984843750941, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373460292816161, "data/tokens_consumed": 119917248512, "data/tokens_consumed_B": 119.917248512, "train/loss_slope": 1.1069367756687626e-05} {"step": 57190, "timestamp": 1778256359.7833443, "train/loss": 2.150114190578461, "train/z_loss": 0.0013904253486543895, "train/perplexity": 8.585838763097483, "train/grad_norm": 0.10595703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022712.3688606494, "perf/iters_per_sec": 0.9645044178298232, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0368018865585327, "data/tokens_consumed": 119938220032, "data/tokens_consumed_B": 119.938220032, "train/loss_slope": 8.674110223179015e-06} {"step": 57200, "timestamp": 1778256370.149265, "grad/layer_0/attn": 0.0028993552550673485, "grad/layer_0/mlp": 0.0029055627528578043, "grad/layer_0/attn_mlp_ratio": 0.9978635472351423, "grad/layer_4/attn": 0.00197034003213048, "grad/layer_4/mlp": 0.0025816368870437145, "grad/layer_4/attn_mlp_ratio": 0.7632134347388463, "grad/layer_8/attn": 0.005644379183650017, "grad/layer_8/mlp": 0.0035310217645019293, "grad/layer_8/attn_mlp_ratio": 1.5985115358231259, "grad/layer_12/attn": 0.004642312880605459, "grad/layer_12/mlp": 0.006724445149302483, "grad/layer_12/attn_mlp_ratio": 0.6903636967059849, "grad/layer_16/attn": 0.0034532700665295124, "grad/layer_16/mlp": 0.004811880178749561, "grad/layer_16/attn_mlp_ratio": 0.7176550259947233, "grad/layer_20/attn": 0.004023939836770296, "grad/layer_20/mlp": 0.007128879893571138, "grad/layer_20/attn_mlp_ratio": 0.564456105362849, "grad/layer_24/attn": 0.021099315956234932, "grad/layer_24/mlp": 0.01314814854413271, "grad/layer_24/attn_mlp_ratio": 1.6047366459954335, "grad/layer_27/attn": 0.010790146887302399, "grad/layer_27/mlp": 0.012675166130065918, "grad/layer_27/attn_mlp_ratio": 0.851282475626064} {"step": 57200, "timestamp": 1778256370.163338, "train/loss": 2.1164980292320252, "train/z_loss": 0.0013907798216678201, "train/perplexity": 8.302013113859338, "train/grad_norm": 0.248046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021552.1987958148, "perf/iters_per_sec": 0.9639512056330751, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373969078063965, "data/tokens_consumed": 119959191552, "data/tokens_consumed_B": 119.959191552, "train/loss_slope": 5.202069992136414e-06} {"step": 57210, "timestamp": 1778256380.5365496, "train/loss": 2.1596778869628905, "train/z_loss": 0.0013860433944500982, "train/perplexity": 8.668345021773582, "train/grad_norm": 0.11767578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022565.3970745266, "perf/iters_per_sec": 0.9644343362209924, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0368772268295288, "data/tokens_consumed": 119980163072, "data/tokens_consumed_B": 119.980163072, "train/loss_slope": 3.5082862214787712e-06} {"step": 57220, "timestamp": 1778256390.910466, "train/loss": 2.1447024583816527, "train/z_loss": 0.0013807808863930404, "train/perplexity": 8.539500002600649, "train/grad_norm": 0.095703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022618.2765515395, "perf/iters_per_sec": 0.9644595511205385, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0368501186370849, "data/tokens_consumed": 120001134592, "data/tokens_consumed_B": 120.001134592, "train/loss_slope": 1.465025457433141e-06} {"step": 57225, "timestamp": 1778256396.6705945, "eos/sharpness": 5.1187753677368155, "eos/L0_probe": 1.98288893699646, "eos/L_plus": 2.014455556869507, "eos/L_minus": 2.0025100708007812, "eos/grad_norm": 0.09537005424499512, "eos/embed_grad_frac": 0.28673914074897766, "eos/time_s": 0.5843765735626221} {"step": 57225, "timestamp": 1778256398.0463295, "geo/rankme_last": 438.9302673339844, "geo/layer_0/stable_rank_q_proj": 19.491567611694336, "geo/layer_0/stable_rank_k_proj": 16.284500122070312, "geo/layer_0/stable_rank_o_proj": 47.4425048828125, "geo/layer_0/stable_rank_gate_proj": 132.610107421875, "geo/layer_0/stable_rank_down_proj": 54.430145263671875, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06263649463653564, "geo/layer_0/attn_entropy_mean": 6.169239521026611, "geo/layer_0/attn_entropy_std": 0.40223148465156555, "geo/layer_7/stable_rank_q_proj": 43.62684631347656, "geo/layer_7/stable_rank_k_proj": 41.36036682128906, "geo/layer_7/stable_rank_o_proj": 92.55354309082031, "geo/layer_7/stable_rank_gate_proj": 83.73334503173828, "geo/layer_7/stable_rank_down_proj": 140.96351623535156, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4638391137123108, "geo/layer_7/attn_entropy_mean": 4.645481586456299, "geo/layer_7/attn_entropy_std": 0.7998023629188538, "geo/layer_14/stable_rank_q_proj": 51.948055267333984, "geo/layer_14/stable_rank_k_proj": 39.32036209106445, "geo/layer_14/stable_rank_o_proj": 44.30488967895508, "geo/layer_14/stable_rank_gate_proj": 72.114501953125, "geo/layer_14/stable_rank_down_proj": 129.96749877929688, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.40164557099342346, "geo/layer_14/attn_entropy_mean": 5.548033237457275, "geo/layer_14/attn_entropy_std": 0.3764922320842743, "geo/layer_21/stable_rank_q_proj": 40.79023361206055, "geo/layer_21/stable_rank_k_proj": 30.042726516723633, "geo/layer_21/stable_rank_o_proj": 71.6903076171875, "geo/layer_21/stable_rank_gate_proj": 67.09905242919922, "geo/layer_21/stable_rank_down_proj": 52.32601547241211, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14423860609531403, "geo/layer_21/attn_entropy_mean": 5.718717575073242, "geo/layer_21/attn_entropy_std": 0.29589784145355225, "geo/layer_27/stable_rank_q_proj": 43.39748001098633, "geo/layer_27/stable_rank_k_proj": 31.810304641723633, "geo/layer_27/stable_rank_o_proj": 115.87793731689453, "geo/layer_27/stable_rank_gate_proj": 81.2813949584961, "geo/layer_27/stable_rank_down_proj": 129.17381286621094, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10182491689920425, "geo/layer_27/attn_entropy_mean": 4.2284111976623535, "geo/layer_27/attn_entropy_std": 0.6956108808517456, "attnres/final_alpha/block_0": 0.2383086085319519, "attnres/block_norm/0": 1.7559258937835693, "attnres/final_alpha/block_1": 0.0046200682409107685, "attnres/block_norm/1": 45461.2265625, "attnres/final_alpha/block_2": 0.010419251397252083, "attnres/block_norm/2": 28220.890625, "attnres/final_alpha/block_3": 0.01228959672152996, "attnres/block_norm/3": 55729.1171875, "attnres/final_alpha/block_4": 0.014401644468307495, "attnres/block_norm/4": 14624.0771484375, "attnres/final_alpha/block_5": 0.609393298625946, "attnres/block_norm/5": 6484.87109375, "attnres/final_alpha/block_6": 0.11056755483150482, "attnres/block_norm/6": 36697.2734375, "geo/tier1_time_s": 1.3576347827911377, "geo/step": 57225.0, "geo/rankme_slope": 7.852047068827532e-05} {"step": 57230, "timestamp": 1778256403.2326334, "train/loss": 2.1907270431518553, "train/z_loss": 0.001370861881878227, "train/perplexity": 8.94171176231652, "train/grad_norm": 0.28125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1702771.1285331207, "perf/iters_per_sec": 0.8119445460000614, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2316112041473388, "data/tokens_consumed": 120022106112, "data/tokens_consumed_B": 120.022106112, "train/loss_slope": 3.7677858457384937e-06} {"step": 57240, "timestamp": 1778256413.6086779, "train/loss": 2.1736014604568483, "train/z_loss": 0.0013767568278126418, "train/perplexity": 8.789883522370053, "train/grad_norm": 0.1025390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022099.3680312044, "perf/iters_per_sec": 0.9642121162563345, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0371161937713622, "data/tokens_consumed": 120043077632, "data/tokens_consumed_B": 120.043077632, "train/loss_slope": 4.547477047471297e-06} {"step": 57250, "timestamp": 1778256423.9735794, "grad/layer_0/attn": 0.0026648128405213356, "grad/layer_0/mlp": 0.002949009882286191, "grad/layer_0/attn_mlp_ratio": 0.9036296440256419, "grad/layer_4/attn": 0.0029180364217609167, "grad/layer_4/mlp": 0.0024720707442611456, "grad/layer_4/attn_mlp_ratio": 1.1804016169419536, "grad/layer_8/attn": 0.003333429340273142, "grad/layer_8/mlp": 0.0033816902432590723, "grad/layer_8/attn_mlp_ratio": 0.9857287338321404, "grad/layer_12/attn": 0.005536399781703949, "grad/layer_12/mlp": 0.006836982443928719, "grad/layer_12/attn_mlp_ratio": 0.8097724026837403, "grad/layer_16/attn": 0.0057407948188483715, "grad/layer_16/mlp": 0.005051366984844208, "grad/layer_16/attn_mlp_ratio": 1.1364833959647633, "grad/layer_20/attn": 0.0052294800989329815, "grad/layer_20/mlp": 0.005709446966648102, "grad/layer_20/attn_mlp_ratio": 0.9159345971488446, "grad/layer_24/attn": 0.004512376617640257, "grad/layer_24/mlp": 0.007965161465108395, "grad/layer_24/attn_mlp_ratio": 0.5665141354328386, "grad/layer_27/attn": 0.009570245631039143, "grad/layer_27/mlp": 0.00713015254586935, "grad/layer_27/attn_mlp_ratio": 1.3422217035680704} {"step": 57250, "timestamp": 1778256423.9878561, "train/loss": 2.1595539331436155, "train/z_loss": 0.0013656954397447407, "train/perplexity": 8.667270613891196, "train/grad_norm": 0.1171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021859.2072699368, "perf/iters_per_sec": 0.9640975986814198, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037239384651184, "data/tokens_consumed": 120064049152, "data/tokens_consumed_B": 120.064049152, "train/loss_slope": 6.386985472648507e-06} {"step": 57260, "timestamp": 1778256434.358947, "train/loss": 2.1014522314071655, "train/z_loss": 0.0013879593461751938, "train/perplexity": 8.17803769628932, "train/grad_norm": 0.1865234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023031.873617503, "perf/iters_per_sec": 0.9646567695701137, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0366381406784058, "data/tokens_consumed": 120085020672, "data/tokens_consumed_B": 120.085020672, "train/loss_slope": 3.8978369644444255e-06} {"step": 57270, "timestamp": 1778256444.7468164, "train/loss": 2.1459408521652223, "train/z_loss": 0.001391240325756371, "train/perplexity": 8.550081817192995, "train/grad_norm": 0.2578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020393.2196283857, "perf/iters_per_sec": 0.9633985613004616, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037992000579834, "data/tokens_consumed": 120105992192, "data/tokens_consumed_B": 120.105992192, "train/loss_slope": 6.699629714100563e-06} {"step": 57280, "timestamp": 1778256455.1206577, "train/loss": 2.1619221210479735, "train/z_loss": 0.0013794166152365506, "train/perplexity": 8.687820662907308, "train/grad_norm": 0.134765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022851.9188035459, "perf/iters_per_sec": 0.9645709604280214, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0367303609848022, "data/tokens_consumed": 120126963712, "data/tokens_consumed_B": 120.126963712, "train/loss_slope": 9.256611519878486e-06} {"step": 57290, "timestamp": 1778256465.501329, "train/loss": 2.159305548667908, "train/z_loss": 0.0013918141252361239, "train/perplexity": 8.665118065764783, "train/grad_norm": 0.1669921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021003.978989409, "perf/iters_per_sec": 0.963689794058518, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03767831325531, "data/tokens_consumed": 120147935232, "data/tokens_consumed_B": 120.147935232, "train/loss_slope": 9.558351612863629e-06} {"step": 57300, "timestamp": 1778256475.866441, "grad/layer_0/attn": 0.00325123337097466, "grad/layer_0/mlp": 0.0031421135645359755, "grad/layer_0/attn_mlp_ratio": 1.0347281219232403, "grad/layer_4/attn": 0.0022244208957999945, "grad/layer_4/mlp": 0.0025915063451975584, "grad/layer_4/attn_mlp_ratio": 0.8583505165199078, "grad/layer_8/attn": 0.005463001783937216, "grad/layer_8/mlp": 0.0036269642878323793, "grad/layer_8/attn_mlp_ratio": 1.5062187547979005, "grad/layer_12/attn": 0.004243114963173866, "grad/layer_12/mlp": 0.0067675551399588585, "grad/layer_12/attn_mlp_ratio": 0.6269789920768584, "grad/layer_16/attn": 0.0033774934709072113, "grad/layer_16/mlp": 0.0044578369706869125, "grad/layer_16/attn_mlp_ratio": 0.7576529642853839, "grad/layer_20/attn": 0.004064091015607119, "grad/layer_20/mlp": 0.006179173942655325, "grad/layer_20/attn_mlp_ratio": 0.6577078081232829, "grad/layer_24/attn": 0.011900226585566998, "grad/layer_24/mlp": 0.011439688503742218, "grad/layer_24/attn_mlp_ratio": 1.040257912411543, "grad/layer_27/attn": 0.0052518886514008045, "grad/layer_27/mlp": 0.012232148088514805, "grad/layer_27/attn_mlp_ratio": 0.42935129385793314} {"step": 57300, "timestamp": 1778256476.4561896, "eos/sharpness": 70.10366916656493, "eos/L0_probe": 1.9759784936904907, "eos/L_plus": 2.2963943481445312, "eos/L_minus": 2.3565993309020996, "eos/grad_norm": 0.20260171592235565, "eos/embed_grad_frac": 0.0563754141330719, "eos/time_s": 0.586944580078125} {"step": 57300, "timestamp": 1778256476.4762151, "train/loss": 2.1522673606872558, "train/z_loss": 0.0013734272681176663, "train/perplexity": 8.604345451355133, "train/grad_norm": 0.2021484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1912224.813999842, "perf/iters_per_sec": 0.9118198461531839, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0967078685760498, "data/tokens_consumed": 120168906752, "data/tokens_consumed_B": 120.168906752, "train/loss_slope": 6.393185335703531e-06} {"step": 57300, "timestamp": 1778256477.8363507, "geo/rankme_last": 438.3590393066406, "geo/layer_0/stable_rank_q_proj": 19.47124481201172, "geo/layer_0/stable_rank_k_proj": 16.25157928466797, "geo/layer_0/stable_rank_o_proj": 47.399757385253906, "geo/layer_0/stable_rank_gate_proj": 132.32101440429688, "geo/layer_0/stable_rank_down_proj": 54.46760940551758, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0639180913567543, "geo/layer_0/attn_entropy_mean": 6.162197589874268, "geo/layer_0/attn_entropy_std": 0.40925276279449463, "geo/layer_7/stable_rank_q_proj": 43.693603515625, "geo/layer_7/stable_rank_k_proj": 41.449737548828125, "geo/layer_7/stable_rank_o_proj": 92.29031372070312, "geo/layer_7/stable_rank_gate_proj": 83.61087036132812, "geo/layer_7/stable_rank_down_proj": 141.1866912841797, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.452079713344574, "geo/layer_7/attn_entropy_mean": 4.666624069213867, "geo/layer_7/attn_entropy_std": 0.8019204139709473, "geo/layer_14/stable_rank_q_proj": 52.042240142822266, "geo/layer_14/stable_rank_k_proj": 39.25770568847656, "geo/layer_14/stable_rank_o_proj": 44.23532485961914, "geo/layer_14/stable_rank_gate_proj": 72.21011352539062, "geo/layer_14/stable_rank_down_proj": 129.77439880371094, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39420074224472046, "geo/layer_14/attn_entropy_mean": 5.5378336906433105, "geo/layer_14/attn_entropy_std": 0.38711783289909363, "geo/layer_21/stable_rank_q_proj": 40.77779006958008, "geo/layer_21/stable_rank_k_proj": 30.08279037475586, "geo/layer_21/stable_rank_o_proj": 71.57341003417969, "geo/layer_21/stable_rank_gate_proj": 67.14926147460938, "geo/layer_21/stable_rank_down_proj": 52.30073547363281, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14075058698654175, "geo/layer_21/attn_entropy_mean": 5.709573268890381, "geo/layer_21/attn_entropy_std": 0.29878824949264526, "geo/layer_27/stable_rank_q_proj": 43.329830169677734, "geo/layer_27/stable_rank_k_proj": 31.781719207763672, "geo/layer_27/stable_rank_o_proj": 115.802490234375, "geo/layer_27/stable_rank_gate_proj": 81.2060546875, "geo/layer_27/stable_rank_down_proj": 129.03163146972656, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0956713855266571, "geo/layer_27/attn_entropy_mean": 4.213554382324219, "geo/layer_27/attn_entropy_std": 0.7179259657859802, "attnres/final_alpha/block_0": 0.23819881677627563, "attnres/block_norm/0": 1.7561819553375244, "attnres/final_alpha/block_1": 0.004665719345211983, "attnres/block_norm/1": 45408.29296875, "attnres/final_alpha/block_2": 0.010395091958343983, "attnres/block_norm/2": 28034.380859375, "attnres/final_alpha/block_3": 0.012268539518117905, "attnres/block_norm/3": 55773.02734375, "attnres/final_alpha/block_4": 0.014608805999159813, "attnres/block_norm/4": 14668.6142578125, "attnres/final_alpha/block_5": 0.6068719625473022, "attnres/block_norm/5": 6531.69189453125, "attnres/final_alpha/block_6": 0.112991102039814, "attnres/block_norm/6": 36892.7421875, "geo/tier1_time_s": 1.355954885482788, "geo/step": 57300.0, "geo/rankme_slope": 5.634169683498399e-05} {"step": 57310, "timestamp": 1778256488.2104545, "train/loss": 2.1786944031715394, "train/z_loss": 0.0013712861575186253, "train/perplexity": 8.834764085677714, "train/grad_norm": 0.10302734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1787804.9258399084, "perf/iters_per_sec": 0.8524918202590506, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.173031783103943, "data/tokens_consumed": 120189878272, "data/tokens_consumed_B": 120.189878272, "train/loss_slope": 9.660052781057847e-06} {"step": 57320, "timestamp": 1778256498.5866206, "train/loss": 2.172999620437622, "train/z_loss": 0.0013772834558039905, "train/perplexity": 8.784595010280691, "train/grad_norm": 0.2314453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022110.4780670574, "perf/iters_per_sec": 0.9642174139342582, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0371104955673218, "data/tokens_consumed": 120210849792, "data/tokens_consumed_B": 120.210849792, "train/loss_slope": 1.1171729865819555e-05} {"step": 57330, "timestamp": 1778256508.9488714, "train/loss": 2.136983036994934, "train/z_loss": 0.0013866116292774676, "train/perplexity": 8.473833782345464, "train/grad_norm": 0.0986328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025553.720844344, "perf/iters_per_sec": 0.965859280035183, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0353475093841553, "data/tokens_consumed": 120231821312, "data/tokens_consumed_B": 120.231821312, "train/loss_slope": 7.1465042593813406e-06} {"step": 57340, "timestamp": 1778256519.2910872, "train/loss": 2.140153431892395, "train/z_loss": 0.0013612852431833743, "train/perplexity": 8.500741813735033, "train/grad_norm": 0.322265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029095.955317731, "perf/iters_per_sec": 0.967548349055162, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0335400819778442, "data/tokens_consumed": 120252792832, "data/tokens_consumed_B": 120.252792832, "train/loss_slope": 6.027134431220381e-06} {"step": 57350, "timestamp": 1778256529.6223931, "grad/layer_0/attn": 0.002940708538517356, "grad/layer_0/mlp": 0.0030094755347818136, "grad/layer_0/attn_mlp_ratio": 0.9771498079367427, "grad/layer_4/attn": 0.002009285381063819, "grad/layer_4/mlp": 0.0026062936522066593, "grad/layer_4/attn_mlp_ratio": 0.770935885244181, "grad/layer_8/attn": 0.004458708688616753, "grad/layer_8/mlp": 0.003605466103181243, "grad/layer_8/attn_mlp_ratio": 1.23665247081852, "grad/layer_12/attn": 0.00636310875415802, "grad/layer_12/mlp": 0.007059455383569002, "grad/layer_12/attn_mlp_ratio": 0.9013597109533816, "grad/layer_16/attn": 0.003495084587484598, "grad/layer_16/mlp": 0.004593587014824152, "grad/layer_16/attn_mlp_ratio": 0.7608617187655958, "grad/layer_20/attn": 0.004242070950567722, "grad/layer_20/mlp": 0.006263525225222111, "grad/layer_20/attn_mlp_ratio": 0.6772657138441912, "grad/layer_24/attn": 0.0052213906310498714, "grad/layer_24/mlp": 0.00801007729023695, "grad/layer_24/attn_mlp_ratio": 0.651852707117904, "grad/layer_27/attn": 0.006129114422947168, "grad/layer_27/mlp": 0.007101958151906729, "grad/layer_27/attn_mlp_ratio": 0.8630175235543842} {"step": 57350, "timestamp": 1778256529.6367054, "train/loss": 2.175147771835327, "train/z_loss": 0.001372218143660575, "train/perplexity": 8.803485933347261, "train/grad_norm": 0.10107421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028189.792702795, "perf/iters_per_sec": 0.967116257048986, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034001851081848, "data/tokens_consumed": 120273764352, "data/tokens_consumed_B": 120.273764352, "train/loss_slope": 7.349783967215859e-06} {"step": 57360, "timestamp": 1778256539.9808404, "train/loss": 2.1754418849945067, "train/z_loss": 0.0013807422714307905, "train/perplexity": 8.806075535206237, "train/grad_norm": 0.2578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028818.4248642677, "perf/iters_per_sec": 0.9674160122224177, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0336814641952514, "data/tokens_consumed": 120294735872, "data/tokens_consumed_B": 120.294735872, "train/loss_slope": 6.292483215034469e-06} {"step": 57370, "timestamp": 1778256550.3234804, "train/loss": 2.1369696855545044, "train/z_loss": 0.001368442177772522, "train/perplexity": 8.473720645213781, "train/grad_norm": 0.1474609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029146.0873784989, "perf/iters_per_sec": 0.9675722538845534, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0335145473480225, "data/tokens_consumed": 120315707392, "data/tokens_consumed_B": 120.315707392, "train/loss_slope": 3.6834117352622796e-06} {"step": 57375, "timestamp": 1778256556.09232, "eos/sharpness": 81.37946128845213, "eos/L0_probe": 1.9767447710037231, "eos/L_plus": 2.320592164993286, "eos/L_minus": 2.4466919898986816, "eos/grad_norm": 0.23565728962421417, "eos/embed_grad_frac": 0.04347294196486473, "eos/time_s": 0.5953187942504883} {"step": 57375, "timestamp": 1778256557.4726584, "geo/rankme_last": 438.9173278808594, "geo/layer_0/stable_rank_q_proj": 19.49219512939453, "geo/layer_0/stable_rank_k_proj": 16.22431755065918, "geo/layer_0/stable_rank_o_proj": 47.458560943603516, "geo/layer_0/stable_rank_gate_proj": 132.17271423339844, "geo/layer_0/stable_rank_down_proj": 54.60832595825195, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06358210742473602, "geo/layer_0/attn_entropy_mean": 6.165888786315918, "geo/layer_0/attn_entropy_std": 0.40906262397766113, "geo/layer_7/stable_rank_q_proj": 43.73051834106445, "geo/layer_7/stable_rank_k_proj": 41.436885833740234, "geo/layer_7/stable_rank_o_proj": 92.32714080810547, "geo/layer_7/stable_rank_gate_proj": 83.73851776123047, "geo/layer_7/stable_rank_down_proj": 141.39854431152344, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.47252288460731506, "geo/layer_7/attn_entropy_mean": 4.6734724044799805, "geo/layer_7/attn_entropy_std": 0.8033357858657837, "geo/layer_14/stable_rank_q_proj": 52.037818908691406, "geo/layer_14/stable_rank_k_proj": 39.14278030395508, "geo/layer_14/stable_rank_o_proj": 44.28413391113281, "geo/layer_14/stable_rank_gate_proj": 72.1773452758789, "geo/layer_14/stable_rank_down_proj": 129.71420288085938, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3929695188999176, "geo/layer_14/attn_entropy_mean": 5.534872531890869, "geo/layer_14/attn_entropy_std": 0.3861371874809265, "geo/layer_21/stable_rank_q_proj": 40.84208679199219, "geo/layer_21/stable_rank_k_proj": 30.07167625427246, "geo/layer_21/stable_rank_o_proj": 71.6956558227539, "geo/layer_21/stable_rank_gate_proj": 67.24310302734375, "geo/layer_21/stable_rank_down_proj": 52.36375045776367, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14498163759708405, "geo/layer_21/attn_entropy_mean": 5.720420837402344, "geo/layer_21/attn_entropy_std": 0.29805612564086914, "geo/layer_27/stable_rank_q_proj": 43.402217864990234, "geo/layer_27/stable_rank_k_proj": 31.839550018310547, "geo/layer_27/stable_rank_o_proj": 115.74256134033203, "geo/layer_27/stable_rank_gate_proj": 81.0909652709961, "geo/layer_27/stable_rank_down_proj": 128.75701904296875, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09112568199634552, "geo/layer_27/attn_entropy_mean": 4.185429573059082, "geo/layer_27/attn_entropy_std": 0.7118574380874634, "attnres/final_alpha/block_0": 0.2393527626991272, "attnres/block_norm/0": 1.7562494277954102, "attnres/final_alpha/block_1": 0.00466008298099041, "attnres/block_norm/1": 45526.671875, "attnres/final_alpha/block_2": 0.010368496179580688, "attnres/block_norm/2": 28139.826171875, "attnres/final_alpha/block_3": 0.012298155575990677, "attnres/block_norm/3": 55687.9296875, "attnres/final_alpha/block_4": 0.014276530593633652, "attnres/block_norm/4": 14722.9609375, "attnres/final_alpha/block_5": 0.6066591739654541, "attnres/block_norm/5": 6568.96240234375, "attnres/final_alpha/block_6": 0.1123848408460617, "attnres/block_norm/6": 36747.9765625, "geo/tier1_time_s": 1.3596546649932861, "geo/step": 57375.0, "geo/rankme_slope": 4.9559687156112445e-05} {"step": 57380, "timestamp": 1778256562.6700628, "train/loss": 2.1581801533699037, "train/z_loss": 0.0013957362272776664, "train/perplexity": 8.655371867828228, "train/grad_norm": 0.09130859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1699396.6804132287, "perf/iters_per_sec": 0.8103354837480682, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.234056782722473, "data/tokens_consumed": 120336678912, "data/tokens_consumed_B": 120.336678912, "train/loss_slope": 3.01494607687928e-06} {"step": 57390, "timestamp": 1778256573.0174263, "train/loss": 2.1719980478286742, "train/z_loss": 0.0013777797459624708, "train/perplexity": 8.775801005190145, "train/grad_norm": 0.1171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027782.5453081618, "perf/iters_per_sec": 0.9669220663586434, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342095136642455, "data/tokens_consumed": 120357650432, "data/tokens_consumed_B": 120.357650432, "train/loss_slope": 2.4745607032741704e-06} {"step": 57400, "timestamp": 1778256583.3547328, "grad/layer_0/attn": 0.0035313523840159178, "grad/layer_0/mlp": 0.0032380628399550915, "grad/layer_0/attn_mlp_ratio": 1.0905755846935123, "grad/layer_4/attn": 0.002545089926570654, "grad/layer_4/mlp": 0.002572395373135805, "grad/layer_4/attn_mlp_ratio": 0.9893851676966818, "grad/layer_8/attn": 0.00819477904587984, "grad/layer_8/mlp": 0.003641701303422451, "grad/layer_8/attn_mlp_ratio": 2.250261110968195, "grad/layer_12/attn": 0.0048627350479364395, "grad/layer_12/mlp": 0.006627954542636871, "grad/layer_12/attn_mlp_ratio": 0.733670537914521, "grad/layer_16/attn": 0.00509374774992466, "grad/layer_16/mlp": 0.005214897450059652, "grad/layer_16/attn_mlp_ratio": 0.9767685176991432, "grad/layer_20/attn": 0.0041244300082325935, "grad/layer_20/mlp": 0.008105999790132046, "grad/layer_20/attn_mlp_ratio": 0.5088119990296974, "grad/layer_24/attn": 0.021763741970062256, "grad/layer_24/mlp": 0.01577880047261715, "grad/layer_24/attn_mlp_ratio": 1.3793026833631128, "grad/layer_27/attn": 0.015336468815803528, "grad/layer_27/mlp": 0.015998946502804756, "grad/layer_27/attn_mlp_ratio": 0.9585924121476166} {"step": 57400, "timestamp": 1778256583.3690453, "train/loss": 2.1426899909973143, "train/z_loss": 0.0013694016844965518, "train/perplexity": 8.522331818366565, "train/grad_norm": 0.298828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027296.3537603023, "perf/iters_per_sec": 0.9666902321626197, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034457540512085, "data/tokens_consumed": 120378621952, "data/tokens_consumed_B": 120.378621952, "train/loss_slope": 5.805262745302998e-06} {"step": 57410, "timestamp": 1778256593.7134593, "train/loss": 2.178476881980896, "train/z_loss": 0.0013679281808435916, "train/perplexity": 8.832842546270092, "train/grad_norm": 0.1416015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028791.8926237035, "perf/iters_per_sec": 0.9674033606642263, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0336949825286865, "data/tokens_consumed": 120399593472, "data/tokens_consumed_B": 120.399593472, "train/loss_slope": 4.0114588112291845e-06} {"step": 57420, "timestamp": 1778256604.0726585, "train/loss": 2.1661744713783264, "train/z_loss": 0.0013732880703173577, "train/perplexity": 8.72484298011674, "train/grad_norm": 0.251953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026171.338480482, "perf/iters_per_sec": 0.9661537830736552, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350319147109985, "data/tokens_consumed": 120420564992, "data/tokens_consumed_B": 120.420564992, "train/loss_slope": 5.8391866904280936e-06} {"step": 57430, "timestamp": 1778256614.4184117, "train/loss": 2.2141818523406984, "train/z_loss": 0.0013630072469823062, "train/perplexity": 9.153916791518235, "train/grad_norm": 0.08837890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028408.9136440095, "perf/iters_per_sec": 0.9672207420558975, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033890151977539, "data/tokens_consumed": 120441536512, "data/tokens_consumed_B": 120.441536512, "train/loss_slope": 1.0817860205038336e-05} {"step": 57440, "timestamp": 1778256624.7611978, "train/loss": 2.215094780921936, "train/z_loss": 0.0013667101389728486, "train/perplexity": 9.162277479562345, "train/grad_norm": 0.23828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029127.5976746664, "perf/iters_per_sec": 0.9675634373067219, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033523964881897, "data/tokens_consumed": 120462508032, "data/tokens_consumed_B": 120.462508032, "train/loss_slope": 1.2515715589903908e-05} {"step": 57450, "timestamp": 1778256635.093773, "grad/layer_0/attn": 0.0030442140996456146, "grad/layer_0/mlp": 0.0031178114004433155, "grad/layer_0/attn_mlp_ratio": 0.9763945316170537, "grad/layer_4/attn": 0.001952580874785781, "grad/layer_4/mlp": 0.002668785396963358, "grad/layer_4/attn_mlp_ratio": 0.7316364979529074, "grad/layer_8/attn": 0.0034642291720956564, "grad/layer_8/mlp": 0.0038679095450788736, "grad/layer_8/attn_mlp_ratio": 0.8956334273483305, "grad/layer_12/attn": 0.0048670717515051365, "grad/layer_12/mlp": 0.0075210025534033775, "grad/layer_12/attn_mlp_ratio": 0.6471307052793961, "grad/layer_16/attn": 0.0037270011380314827, "grad/layer_16/mlp": 0.004844599403440952, "grad/layer_16/attn_mlp_ratio": 0.769310473525071, "grad/layer_20/attn": 0.002986673731356859, "grad/layer_20/mlp": 0.005552496761083603, "grad/layer_20/attn_mlp_ratio": 0.5378974191394666, "grad/layer_24/attn": 0.006965530104935169, "grad/layer_24/mlp": 0.00949922390282154, "grad/layer_24/attn_mlp_ratio": 0.7332735919130035, "grad/layer_27/attn": 0.00586358830332756, "grad/layer_27/mlp": 0.00865940097719431, "grad/layer_27/attn_mlp_ratio": 0.6771355491051344} {"step": 57450, "timestamp": 1778256635.698838, "eos/sharpness": 15.247368812561032, "eos/L0_probe": 1.9797120094299316, "eos/L_plus": 2.062431812286377, "eos/L_minus": 2.0494658946990967, "eos/grad_norm": 0.10866209119558334, "eos/embed_grad_frac": 0.20632165670394897, "eos/time_s": 0.6022739410400391} {"step": 57450, "timestamp": 1778256635.7188306, "train/loss": 2.1360721826553344, "train/z_loss": 0.0013823693501763047, "train/perplexity": 8.466118868186681, "train/grad_norm": 0.1083984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1914716.607896416, "perf/iters_per_sec": 0.9130080260736542, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0952806234359742, "data/tokens_consumed": 120483479552, "data/tokens_consumed_B": 120.483479552, "train/loss_slope": 9.680667890645456e-06} {"step": 57450, "timestamp": 1778256637.0852652, "geo/rankme_last": 438.7524108886719, "geo/layer_0/stable_rank_q_proj": 19.490549087524414, "geo/layer_0/stable_rank_k_proj": 16.22473907470703, "geo/layer_0/stable_rank_o_proj": 47.508567810058594, "geo/layer_0/stable_rank_gate_proj": 131.83604431152344, "geo/layer_0/stable_rank_down_proj": 54.51051712036133, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05961799994111061, "geo/layer_0/attn_entropy_mean": 6.163918495178223, "geo/layer_0/attn_entropy_std": 0.4089408814907074, "geo/layer_7/stable_rank_q_proj": 43.69135284423828, "geo/layer_7/stable_rank_k_proj": 41.40681838989258, "geo/layer_7/stable_rank_o_proj": 92.32157135009766, "geo/layer_7/stable_rank_gate_proj": 83.74800872802734, "geo/layer_7/stable_rank_down_proj": 141.3668670654297, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.46543100476264954, "geo/layer_7/attn_entropy_mean": 4.647619247436523, "geo/layer_7/attn_entropy_std": 0.8076607584953308, "geo/layer_14/stable_rank_q_proj": 52.0020866394043, "geo/layer_14/stable_rank_k_proj": 39.15031814575195, "geo/layer_14/stable_rank_o_proj": 44.278404235839844, "geo/layer_14/stable_rank_gate_proj": 72.12830352783203, "geo/layer_14/stable_rank_down_proj": 129.5777587890625, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3989775776863098, "geo/layer_14/attn_entropy_mean": 5.542742729187012, "geo/layer_14/attn_entropy_std": 0.38400760293006897, "geo/layer_21/stable_rank_q_proj": 40.7696647644043, "geo/layer_21/stable_rank_k_proj": 30.041963577270508, "geo/layer_21/stable_rank_o_proj": 71.63500213623047, "geo/layer_21/stable_rank_gate_proj": 67.35528564453125, "geo/layer_21/stable_rank_down_proj": 52.39262771606445, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.140610009431839, "geo/layer_21/attn_entropy_mean": 5.715366363525391, "geo/layer_21/attn_entropy_std": 0.28653451800346375, "geo/layer_27/stable_rank_q_proj": 43.33343505859375, "geo/layer_27/stable_rank_k_proj": 31.86029624938965, "geo/layer_27/stable_rank_o_proj": 115.74734497070312, "geo/layer_27/stable_rank_gate_proj": 81.02511596679688, "geo/layer_27/stable_rank_down_proj": 129.01519775390625, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09203732758760452, "geo/layer_27/attn_entropy_mean": 4.223647594451904, "geo/layer_27/attn_entropy_std": 0.6951887607574463, "attnres/final_alpha/block_0": 0.23783215880393982, "attnres/block_norm/0": 1.756404995918274, "attnres/final_alpha/block_1": 0.004615774378180504, "attnres/block_norm/1": 45369.7265625, "attnres/final_alpha/block_2": 0.01037323847413063, "attnres/block_norm/2": 28253.08203125, "attnres/final_alpha/block_3": 0.012434253469109535, "attnres/block_norm/3": 55881.96484375, "attnres/final_alpha/block_4": 0.014208920300006866, "attnres/block_norm/4": 14736.8232421875, "attnres/final_alpha/block_5": 0.6102722883224487, "attnres/block_norm/5": 6519.57568359375, "attnres/final_alpha/block_6": 0.1102633848786354, "attnres/block_norm/6": 37157.83984375, "geo/tier1_time_s": 1.3620970249176025, "geo/step": 57450.0, "geo/rankme_slope": 4.8950029230442174e-05} {"step": 57460, "timestamp": 1778256647.4325814, "train/loss": 2.2010467767715456, "train/z_loss": 0.001373043528292328, "train/perplexity": 9.034465620195352, "train/grad_norm": 0.31640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1790945.5997491889, "perf/iters_per_sec": 0.8539894102807946, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1709747076034547, "data/tokens_consumed": 120504451072, "data/tokens_consumed_B": 120.504451072, "train/loss_slope": 1.3212100707694259e-05} {"step": 57470, "timestamp": 1778256657.779903, "train/loss": 2.1873888254165648, "train/z_loss": 0.001377639826387167, "train/perplexity": 8.911912148001358, "train/grad_norm": 0.216796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028202.8871813007, "perf/iters_per_sec": 0.9671225009829048, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0339951753616332, "data/tokens_consumed": 120525422592, "data/tokens_consumed_B": 120.525422592, "train/loss_slope": 1.6970368394471157e-05} {"step": 57480, "timestamp": 1778256668.1244414, "train/loss": 2.1154333472251894, "train/z_loss": 0.0013935302151367069, "train/perplexity": 8.293178813571286, "train/grad_norm": 0.1474609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028283.2348617157, "perf/iters_per_sec": 0.9671608137425021, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0339542150497436, "data/tokens_consumed": 120546394112, "data/tokens_consumed_B": 120.546394112, "train/loss_slope": 1.605248382561496e-05} {"step": 57490, "timestamp": 1778256678.4645424, "train/loss": 2.1462050318717956, "train/z_loss": 0.0013835052377544343, "train/perplexity": 8.55234087368394, "train/grad_norm": 0.146484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029567.7440693225, "perf/iters_per_sec": 0.967773315462743, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0332998275756835, "data/tokens_consumed": 120567365632, "data/tokens_consumed_B": 120.567365632, "train/loss_slope": 1.3679799459876885e-05} {"step": 57500, "timestamp": 1778256688.7976465, "grad/layer_0/attn": 0.0029061404056847095, "grad/layer_0/mlp": 0.0029020614456385374, "grad/layer_0/attn_mlp_ratio": 1.0014055043223677, "grad/layer_4/attn": 0.0020671228412538767, "grad/layer_4/mlp": 0.002599782543256879, "grad/layer_4/attn_mlp_ratio": 0.7951137171468641, "grad/layer_8/attn": 0.00683189369738102, "grad/layer_8/mlp": 0.003780556144192815, "grad/layer_8/attn_mlp_ratio": 1.8071133600710905, "grad/layer_12/attn": 0.004560021683573723, "grad/layer_12/mlp": 0.00643640011548996, "grad/layer_12/attn_mlp_ratio": 0.7084739187907382, "grad/layer_16/attn": 0.0032003719825297594, "grad/layer_16/mlp": 0.004694519564509392, "grad/layer_16/attn_mlp_ratio": 0.6817251201916564, "grad/layer_20/attn": 0.002997489646077156, "grad/layer_20/mlp": 0.006380010396242142, "grad/layer_20/attn_mlp_ratio": 0.4698251903884316, "grad/layer_24/attn": 0.009501667693257332, "grad/layer_24/mlp": 0.009136863984167576, "grad/layer_24/attn_mlp_ratio": 1.039926566240806, "grad/layer_27/attn": 0.005855547729879618, "grad/layer_27/mlp": 0.008386868052184582, "grad/layer_27/attn_mlp_ratio": 0.6981804916480516} {"step": 57500, "timestamp": 1778256688.8121812, "train/loss": 2.1204851269721985, "train/z_loss": 0.001384308363776654, "train/perplexity": 8.335180127709634, "train/grad_norm": 0.1337890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028171.6945412045, "perf/iters_per_sec": 0.9671076271730444, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340110778808593, "data/tokens_consumed": 120588337152, "data/tokens_consumed_B": 120.588337152, "train/loss_slope": 9.06427938802947e-06} {"step": 57500, "timestamp": 1778256696.0974505, "geo/ww_alpha_mean": 7.548520779256176, "geo/ww_alpha_std": 4.347768409055262, "geo/ww_alpha_min": 1.3525020179689267, "geo/ww_alpha_max": 28.586190718043092, "geo/ww_alpha_healthy_frac": 0.16751269035532995, "geo/ww_alpha_by_type/q_proj": 4.005468690861365, "geo/ww_alpha_by_type/k_proj": 4.5361204769477625, "geo/ww_alpha_by_type/v_proj": 8.109966971564475, "geo/ww_alpha_by_type/o_proj": 7.5216448432258645, "geo/ww_alpha_by_type/gate_proj": 8.257495273468443, "geo/ww_alpha_by_type/up_proj": 12.13620337683885, "geo/ww_alpha_by_type/down_proj": 8.366862651544063, "geo/twonn_id/layer_0": 0.7241795659065247, "geo/twonn_id/layer_7": 3.1748642921447754, "geo/twonn_id/layer_14": 5.176297664642334, "geo/twonn_id/layer_21": 6.889680862426758, "geo/twonn_id/layer_27": 6.474043846130371, "geo/tier2_time_s": 7.278820514678955} {"step": 57500, "timestamp": 1778256696.7757962, "eoc/jacobian_sigma/layer_0/attn": 1074.2305908203125, "eoc/jacobian_sigma/layer_0/mlp": 8858.1748046875, "eoc/jacobian_sigma/layer_0": 8858.1748046875, "eoc/jacobian_sigma/layer_7/attn": 1.1591501235961914, "eoc/jacobian_sigma/layer_7/mlp": 1.7740058898925781, "eoc/jacobian_sigma/layer_7": 1.7740058898925781, "eoc/jacobian_sigma/layer_14/attn": 1.512503981590271, "eoc/jacobian_sigma/layer_14/mlp": 7.4592766761779785, "eoc/jacobian_sigma/layer_14": 7.4592766761779785, "eoc/jacobian_sigma/layer_21/attn": 1.0900516510009766, "eoc/jacobian_sigma/layer_21/mlp": 4.637076377868652, "eoc/jacobian_sigma/layer_21": 4.637076377868652, "eoc/jacobian_sigma/layer_27/attn": 3.233823776245117, "eoc/jacobian_sigma/layer_27/mlp": 28.4241943359375, "eoc/jacobian_sigma/layer_27": 28.4241943359375, "eoc/layer0_sigma": 8858.1748046875, "eoc/sigma_max": 28.4241943359375, "eoc/sigma_min": 1.7740058898925781, "eoc/sigma_mean": 10.573638319969177, "eoc/time_s": 0.6722157001495361} {"step": 57510, "timestamp": 1778256707.1399355, "train/loss": 2.126950812339783, "train/z_loss": 0.001369776180945337, "train/perplexity": 8.389247382469469, "train/grad_norm": 0.11474609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1144565.9647349631, "perf/iters_per_sec": 0.545771582000238, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.8322683572769165, "data/tokens_consumed": 120609308672, "data/tokens_consumed_B": 120.609308672, "train/loss_slope": 8.370288313239934e-06} {"step": 57520, "timestamp": 1778256717.4855423, "train/loss": 2.2162919521331785, "train/z_loss": 0.0013739693677052855, "train/perplexity": 9.173252862785862, "train/grad_norm": 0.14453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028698.076068685, "perf/iters_per_sec": 0.9673586254447388, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0337427854537964, "data/tokens_consumed": 120630280192, "data/tokens_consumed_B": 120.630280192, "train/loss_slope": 1.1126599009960528e-05} {"step": 57525, "timestamp": 1778256723.266184, "eos/sharpness": 34.52911376953124, "eos/L0_probe": 1.9763296842575073, "eos/L_plus": 2.1667351722717285, "eos/L_minus": 2.1312153339385986, "eos/grad_norm": 0.17420963943004608, "eos/embed_grad_frac": 0.0962306335568428, "eos/time_s": 0.6183807849884033} {"step": 57525, "timestamp": 1778256724.64785, "geo/rankme_last": 438.1446533203125, "geo/layer_0/stable_rank_q_proj": 19.493000030517578, "geo/layer_0/stable_rank_k_proj": 16.193323135375977, "geo/layer_0/stable_rank_o_proj": 47.56510925292969, "geo/layer_0/stable_rank_gate_proj": 131.80821228027344, "geo/layer_0/stable_rank_down_proj": 54.542728424072266, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06647808104753494, "geo/layer_0/attn_entropy_mean": 6.161370277404785, "geo/layer_0/attn_entropy_std": 0.4054049849510193, "geo/layer_7/stable_rank_q_proj": 43.75185012817383, "geo/layer_7/stable_rank_k_proj": 41.41059112548828, "geo/layer_7/stable_rank_o_proj": 92.22114562988281, "geo/layer_7/stable_rank_gate_proj": 83.72235870361328, "geo/layer_7/stable_rank_down_proj": 141.5176544189453, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.47768914699554443, "geo/layer_7/attn_entropy_mean": 4.659344673156738, "geo/layer_7/attn_entropy_std": 0.7843237519264221, "geo/layer_14/stable_rank_q_proj": 51.91044998168945, "geo/layer_14/stable_rank_k_proj": 39.15644073486328, "geo/layer_14/stable_rank_o_proj": 44.27267074584961, "geo/layer_14/stable_rank_gate_proj": 72.2099380493164, "geo/layer_14/stable_rank_down_proj": 129.83226013183594, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3822784721851349, "geo/layer_14/attn_entropy_mean": 5.515348434448242, "geo/layer_14/attn_entropy_std": 0.38798630237579346, "geo/layer_21/stable_rank_q_proj": 40.720340728759766, "geo/layer_21/stable_rank_k_proj": 30.005409240722656, "geo/layer_21/stable_rank_o_proj": 71.57524871826172, "geo/layer_21/stable_rank_gate_proj": 67.35653686523438, "geo/layer_21/stable_rank_down_proj": 52.44609069824219, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1420268714427948, "geo/layer_21/attn_entropy_mean": 5.7191057205200195, "geo/layer_21/attn_entropy_std": 0.30037811398506165, "geo/layer_27/stable_rank_q_proj": 43.35931396484375, "geo/layer_27/stable_rank_k_proj": 31.74361801147461, "geo/layer_27/stable_rank_o_proj": 115.73628234863281, "geo/layer_27/stable_rank_gate_proj": 81.00263214111328, "geo/layer_27/stable_rank_down_proj": 129.02647399902344, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09939257055521011, "geo/layer_27/attn_entropy_mean": 4.216095447540283, "geo/layer_27/attn_entropy_std": 0.688992977142334, "attnres/final_alpha/block_0": 0.23901373147964478, "attnres/block_norm/0": 1.7566958665847778, "attnres/final_alpha/block_1": 0.004636120051145554, "attnres/block_norm/1": 45482.71484375, "attnres/final_alpha/block_2": 0.010472909547388554, "attnres/block_norm/2": 28342.3515625, "attnres/final_alpha/block_3": 0.012469412758946419, "attnres/block_norm/3": 55629.5234375, "attnres/final_alpha/block_4": 0.014488417655229568, "attnres/block_norm/4": 14660.734375, "attnres/final_alpha/block_5": 0.6075142621994019, "attnres/block_norm/5": 6520.7099609375, "attnres/final_alpha/block_6": 0.1114051565527916, "attnres/block_norm/6": 36706.1953125, "geo/tier1_time_s": 1.361147165298462, "geo/step": 57525.0, "geo/rankme_slope": 3.9173970369397756e-05} {"step": 57530, "timestamp": 1778256730.3618498, "train/loss": 2.1586370706558227, "train/z_loss": 0.0013720409595407545, "train/perplexity": 8.659327560494054, "train/grad_norm": 0.146484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1629368.0108813068, "perf/iters_per_sec": 0.7769432119757208, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2870953559875489, "data/tokens_consumed": 120651251712, "data/tokens_consumed_B": 120.651251712, "train/loss_slope": 1.2938064445864153e-05} {"step": 57540, "timestamp": 1778256740.7010171, "train/loss": 2.1371455669403074, "train/z_loss": 0.0013711779145523905, "train/perplexity": 8.4752111460156, "train/grad_norm": 0.1259765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029355.067861091, "perf/iters_per_sec": 0.9676719035439926, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0334081172943115, "data/tokens_consumed": 120672223232, "data/tokens_consumed_B": 120.672223232, "train/loss_slope": 1.2595281048719505e-05} {"step": 57550, "timestamp": 1778256751.0332496, "grad/layer_0/attn": 0.003493713680654764, "grad/layer_0/mlp": 0.003408480668440461, "grad/layer_0/attn_mlp_ratio": 1.0250061297113615, "grad/layer_4/attn": 0.002441630233079195, "grad/layer_4/mlp": 0.002899687737226486, "grad/layer_4/attn_mlp_ratio": 0.8420320979842377, "grad/layer_8/attn": 0.011039075441658497, "grad/layer_8/mlp": 0.0038848966360092163, "grad/layer_8/attn_mlp_ratio": 2.841536388686216, "grad/layer_12/attn": 0.006063488312065601, "grad/layer_12/mlp": 0.006990453694015741, "grad/layer_12/attn_mlp_ratio": 0.8673955211972534, "grad/layer_16/attn": 0.0066605848260223866, "grad/layer_16/mlp": 0.005486323032528162, "grad/layer_16/attn_mlp_ratio": 1.2140343660277828, "grad/layer_20/attn": 0.003528009634464979, "grad/layer_20/mlp": 0.0069604432210326195, "grad/layer_20/attn_mlp_ratio": 0.506865650899601, "grad/layer_24/attn": 0.017034539952874184, "grad/layer_24/mlp": 0.01300625316798687, "grad/layer_24/attn_mlp_ratio": 1.3097192251978051, "grad/layer_27/attn": 0.005185381509363651, "grad/layer_27/mlp": 0.012338330037891865, "grad/layer_27/attn_mlp_ratio": 0.4202660693475032} {"step": 57550, "timestamp": 1778256751.0477946, "train/loss": 2.1549177885055544, "train/z_loss": 0.0013860665261745454, "train/perplexity": 8.627180896379217, "train/grad_norm": 0.236328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028412.983146617, "perf/iters_per_sec": 0.9672226825459561, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033888077735901, "data/tokens_consumed": 120693194752, "data/tokens_consumed_B": 120.693194752, "train/loss_slope": 1.2521531536813073e-05} {"step": 57560, "timestamp": 1778256761.4133248, "train/loss": 2.0983623027801515, "train/z_loss": 0.0014001160976476966, "train/perplexity": 8.152807143876231, "train/grad_norm": 0.255859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024395.6013377532, "perf/iters_per_sec": 0.9653070456208006, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0359398126602173, "data/tokens_consumed": 120714166272, "data/tokens_consumed_B": 120.714166272, "train/loss_slope": 1.0868542036279152e-05} {"step": 57570, "timestamp": 1778256771.7617326, "train/loss": 2.1051900744438172, "train/z_loss": 0.001401405967772007, "train/perplexity": 8.208663118399418, "train/grad_norm": 0.2138671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027944.7700772767, "perf/iters_per_sec": 0.9669994211565384, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341267824172973, "data/tokens_consumed": 120735137792, "data/tokens_consumed_B": 120.735137792, "train/loss_slope": 7.6097476528887966e-06} {"step": 57580, "timestamp": 1778256782.1072571, "train/loss": 2.1053991079330445, "train/z_loss": 0.0013789998600259423, "train/perplexity": 8.21037918324421, "train/grad_norm": 0.16015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028471.3612570819, "perf/iters_per_sec": 0.9672505193982515, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033858323097229, "data/tokens_consumed": 120756109312, "data/tokens_consumed_B": 120.756109312, "train/loss_slope": 2.1640214768394985e-06} {"step": 57590, "timestamp": 1778256792.4699497, "train/loss": 2.1095531463623045, "train/z_loss": 0.0013995975605212152, "train/perplexity": 8.244556351378266, "train/grad_norm": 0.10986328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025235.2376355643, "perf/iters_per_sec": 0.965707415406973, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0355103254318236, "data/tokens_consumed": 120777080832, "data/tokens_consumed_B": 120.777080832, "train/loss_slope": 2.0573841212857e-06} {"step": 57600, "timestamp": 1778256802.8072093, "grad/layer_0/attn": 0.0027140039019286633, "grad/layer_0/mlp": 0.003065470838919282, "grad/layer_0/attn_mlp_ratio": 0.8853464788954982, "grad/layer_4/attn": 0.002069483743980527, "grad/layer_4/mlp": 0.002619577571749687, "grad/layer_4/attn_mlp_ratio": 0.7900066359163392, "grad/layer_8/attn": 0.007339689880609512, "grad/layer_8/mlp": 0.003723584348335862, "grad/layer_8/attn_mlp_ratio": 1.9711356039983876, "grad/layer_12/attn": 0.004510050173848867, "grad/layer_12/mlp": 0.006707563064992428, "grad/layer_12/attn_mlp_ratio": 0.6723828107034993, "grad/layer_16/attn": 0.00482736574485898, "grad/layer_16/mlp": 0.005021338816732168, "grad/layer_16/attn_mlp_ratio": 0.9613702291182084, "grad/layer_20/attn": 0.0031540587078779936, "grad/layer_20/mlp": 0.006392219103872776, "grad/layer_20/attn_mlp_ratio": 0.49342155005683824, "grad/layer_24/attn": 0.010663076303899288, "grad/layer_24/mlp": 0.009406795725226402, "grad/layer_24/attn_mlp_ratio": 1.133550307885273, "grad/layer_27/attn": 0.007659806404262781, "grad/layer_27/mlp": 0.007664576172828674, "grad/layer_27/attn_mlp_ratio": 0.999377673546964} {"step": 57600, "timestamp": 1778256803.38952, "eos/sharpness": 66.2994861602783, "eos/L0_probe": 1.978874683380127, "eos/L_plus": 2.382798910140991, "eos/L_minus": 2.237945318222046, "eos/grad_norm": 0.14835307002067566, "eos/embed_grad_frac": 0.09721087664365768, "eos/time_s": 0.5794732570648193} {"step": 57600, "timestamp": 1778256803.4096987, "train/loss": 2.116711688041687, "train/z_loss": 0.0013895135023631155, "train/perplexity": 8.30378710160635, "train/grad_norm": 0.1484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1918155.888862316, "perf/iters_per_sec": 0.914648003035696, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0933167695999146, "data/tokens_consumed": 120798052352, "data/tokens_consumed_B": 120.798052352, "train/loss_slope": 1.1941490035997581e-06} {"step": 57600, "timestamp": 1778256804.771382, "geo/rankme_last": 438.93572998046875, "geo/layer_0/stable_rank_q_proj": 19.50221061706543, "geo/layer_0/stable_rank_k_proj": 16.199338912963867, "geo/layer_0/stable_rank_o_proj": 47.56331253051758, "geo/layer_0/stable_rank_gate_proj": 131.6849365234375, "geo/layer_0/stable_rank_down_proj": 54.489749908447266, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0655289888381958, "geo/layer_0/attn_entropy_mean": 6.16242790222168, "geo/layer_0/attn_entropy_std": 0.4136330187320709, "geo/layer_7/stable_rank_q_proj": 43.587467193603516, "geo/layer_7/stable_rank_k_proj": 41.399749755859375, "geo/layer_7/stable_rank_o_proj": 92.36941528320312, "geo/layer_7/stable_rank_gate_proj": 83.56991577148438, "geo/layer_7/stable_rank_down_proj": 141.3645477294922, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.46358391642570496, "geo/layer_7/attn_entropy_mean": 4.671369552612305, "geo/layer_7/attn_entropy_std": 0.8170812726020813, "geo/layer_14/stable_rank_q_proj": 51.88041687011719, "geo/layer_14/stable_rank_k_proj": 39.10901641845703, "geo/layer_14/stable_rank_o_proj": 44.252010345458984, "geo/layer_14/stable_rank_gate_proj": 72.35809326171875, "geo/layer_14/stable_rank_down_proj": 129.42022705078125, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38513725996017456, "geo/layer_14/attn_entropy_mean": 5.553529739379883, "geo/layer_14/attn_entropy_std": 0.38928037881851196, "geo/layer_21/stable_rank_q_proj": 40.72425842285156, "geo/layer_21/stable_rank_k_proj": 30.015565872192383, "geo/layer_21/stable_rank_o_proj": 71.43058013916016, "geo/layer_21/stable_rank_gate_proj": 67.26538848876953, "geo/layer_21/stable_rank_down_proj": 52.50935745239258, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1433350294828415, "geo/layer_21/attn_entropy_mean": 5.718690395355225, "geo/layer_21/attn_entropy_std": 0.29858335852622986, "geo/layer_27/stable_rank_q_proj": 43.397396087646484, "geo/layer_27/stable_rank_k_proj": 31.757287979125977, "geo/layer_27/stable_rank_o_proj": 115.73013305664062, "geo/layer_27/stable_rank_gate_proj": 81.08692932128906, "geo/layer_27/stable_rank_down_proj": 129.09474182128906, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09806878119707108, "geo/layer_27/attn_entropy_mean": 4.192505836486816, "geo/layer_27/attn_entropy_std": 0.7283602356910706, "attnres/final_alpha/block_0": 0.23622311651706696, "attnres/block_norm/0": 1.7569432258605957, "attnres/final_alpha/block_1": 0.004553896375000477, "attnres/block_norm/1": 45488.796875, "attnres/final_alpha/block_2": 0.010247004218399525, "attnres/block_norm/2": 28199.96875, "attnres/final_alpha/block_3": 0.012275082990527153, "attnres/block_norm/3": 55869.9453125, "attnres/final_alpha/block_4": 0.014187615364789963, "attnres/block_norm/4": 14730.833984375, "attnres/final_alpha/block_5": 0.6145890951156616, "attnres/block_norm/5": 6436.22705078125, "attnres/final_alpha/block_6": 0.1079242080450058, "attnres/block_norm/6": 37124.0078125, "geo/tier1_time_s": 1.3573687076568604, "geo/step": 57600.0, "geo/rankme_slope": 4.332676429946979e-05} {"step": 57610, "timestamp": 1778256815.7212105, "train/loss": 2.2351215362548826, "train/z_loss": 0.0013728852616623044, "train/perplexity": 9.347617857628801, "train/grad_norm": 0.111328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1703970.7274655693, "perf/iters_per_sec": 0.8125165593459936, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2307441473007201, "data/tokens_consumed": 120819023872, "data/tokens_consumed_B": 120.819023872, "train/loss_slope": 4.835886058240879e-06} {"step": 57620, "timestamp": 1778256826.0807354, "train/loss": 2.1477089405059813, "train/z_loss": 0.001366969512309879, "train/perplexity": 8.565212489407362, "train/grad_norm": 0.20703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028377.3404716887, "perf/iters_per_sec": 0.9672056867941325, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0339062452316283, "data/tokens_consumed": 120839995392, "data/tokens_consumed_B": 120.839995392, "train/loss_slope": 5.149117325863795e-06} {"step": 57630, "timestamp": 1778256836.427851, "train/loss": 2.1595093846321105, "train/z_loss": 0.001371298509184271, "train/perplexity": 8.66688450848681, "train/grad_norm": 0.08984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028007.3761435722, "perf/iters_per_sec": 0.9670292740552769, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340948581695557, "data/tokens_consumed": 120860966912, "data/tokens_consumed_B": 120.860966912, "train/loss_slope": 5.256018873238212e-06} {"step": 57640, "timestamp": 1778256846.7757068, "train/loss": 2.1399763226509094, "train/z_loss": 0.0013882860075682402, "train/perplexity": 8.499236387116756, "train/grad_norm": 0.1259765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027682.3716200867, "perf/iters_per_sec": 0.966874299821895, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342606067657472, "data/tokens_consumed": 120881938432, "data/tokens_consumed_B": 120.881938432, "train/loss_slope": 2.627160461178587e-06} {"step": 57650, "timestamp": 1778256857.1174004, "grad/layer_0/attn": 0.0028425026684999466, "grad/layer_0/mlp": 0.0029814601875841618, "grad/layer_0/attn_mlp_ratio": 0.9533927653965799, "grad/layer_4/attn": 0.0033125781919807196, "grad/layer_4/mlp": 0.002607478993013501, "grad/layer_4/attn_mlp_ratio": 1.270414094922742, "grad/layer_8/attn": 0.0059793367981910706, "grad/layer_8/mlp": 0.003842802718281746, "grad/layer_8/attn_mlp_ratio": 1.5559832447673299, "grad/layer_12/attn": 0.004146133083850145, "grad/layer_12/mlp": 0.00654975650832057, "grad/layer_12/attn_mlp_ratio": 0.6330209398289779, "grad/layer_16/attn": 0.0036703902296721935, "grad/layer_16/mlp": 0.0042619830928742886, "grad/layer_16/attn_mlp_ratio": 0.8611930323443806, "grad/layer_20/attn": 0.0030378554947674274, "grad/layer_20/mlp": 0.0058116246946156025, "grad/layer_20/attn_mlp_ratio": 0.5227205131311923, "grad/layer_24/attn": 0.011035095900297165, "grad/layer_24/mlp": 0.009476451203227043, "grad/layer_24/attn_mlp_ratio": 1.1644755560068516, "grad/layer_27/attn": 0.014585929922759533, "grad/layer_27/mlp": 0.008451537229120731, "grad/layer_27/attn_mlp_ratio": 1.725831568240497} {"step": 57650, "timestamp": 1778256857.1318424, "train/loss": 2.192348837852478, "train/z_loss": 0.0013773952261544764, "train/perplexity": 8.956225148752633, "train/grad_norm": 0.1630859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026543.7146863055, "perf/iters_per_sec": 0.9663313458854225, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348417282104492, "data/tokens_consumed": 120902909952, "data/tokens_consumed_B": 120.902909952, "train/loss_slope": 4.9590020933703645e-06} {"step": 57660, "timestamp": 1778256867.4793475, "train/loss": 2.1540417194366457, "train/z_loss": 0.0013836379745043813, "train/perplexity": 8.619626199745227, "train/grad_norm": 0.1103515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028304.3283519726, "perf/iters_per_sec": 0.9671708719024528, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0339434623718262, "data/tokens_consumed": 120923881472, "data/tokens_consumed_B": 120.923881472, "train/loss_slope": 1.4093969449816844e-06} {"step": 57670, "timestamp": 1778256877.8343465, "train/loss": 2.1695214748382567, "train/z_loss": 0.00137213789857924, "train/perplexity": 8.754093984057377, "train/grad_norm": 0.205078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026654.7024511364, "perf/iters_per_sec": 0.9663842689757998, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347850561141967, "data/tokens_consumed": 120944852992, "data/tokens_consumed_B": 120.944852992, "train/loss_slope": 4.4254350762377314e-06} {"step": 57675, "timestamp": 1778256883.595756, "eos/sharpness": 83.18865299224852, "eos/L0_probe": 1.978603720664978, "eos/L_plus": 2.333719253540039, "eos/L_minus": 2.4553747177124023, "eos/grad_norm": 0.26875412464141846, "eos/embed_grad_frac": 0.031769633293151855, "eos/time_s": 0.5985381603240967} {"step": 57675, "timestamp": 1778256884.977567, "geo/rankme_last": 438.7632751464844, "geo/layer_0/stable_rank_q_proj": 19.50785255432129, "geo/layer_0/stable_rank_k_proj": 16.21654510498047, "geo/layer_0/stable_rank_o_proj": 47.51405715942383, "geo/layer_0/stable_rank_gate_proj": 131.6456298828125, "geo/layer_0/stable_rank_down_proj": 54.54673767089844, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06452018022537231, "geo/layer_0/attn_entropy_mean": 6.164770126342773, "geo/layer_0/attn_entropy_std": 0.409229576587677, "geo/layer_7/stable_rank_q_proj": 43.50669860839844, "geo/layer_7/stable_rank_k_proj": 41.368865966796875, "geo/layer_7/stable_rank_o_proj": 92.42581176757812, "geo/layer_7/stable_rank_gate_proj": 83.61614990234375, "geo/layer_7/stable_rank_down_proj": 141.56625366210938, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.46857842803001404, "geo/layer_7/attn_entropy_mean": 4.63350772857666, "geo/layer_7/attn_entropy_std": 0.8056432604789734, "geo/layer_14/stable_rank_q_proj": 51.94734191894531, "geo/layer_14/stable_rank_k_proj": 39.15435028076172, "geo/layer_14/stable_rank_o_proj": 44.24306869506836, "geo/layer_14/stable_rank_gate_proj": 72.27630615234375, "geo/layer_14/stable_rank_down_proj": 129.300537109375, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38932543992996216, "geo/layer_14/attn_entropy_mean": 5.523589134216309, "geo/layer_14/attn_entropy_std": 0.3822782039642334, "geo/layer_21/stable_rank_q_proj": 40.738948822021484, "geo/layer_21/stable_rank_k_proj": 30.115007400512695, "geo/layer_21/stable_rank_o_proj": 71.43205261230469, "geo/layer_21/stable_rank_gate_proj": 67.33740234375, "geo/layer_21/stable_rank_down_proj": 52.56382751464844, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1454196274280548, "geo/layer_21/attn_entropy_mean": 5.710582256317139, "geo/layer_21/attn_entropy_std": 0.2895941436290741, "geo/layer_27/stable_rank_q_proj": 43.34345245361328, "geo/layer_27/stable_rank_k_proj": 31.837879180908203, "geo/layer_27/stable_rank_o_proj": 115.89986419677734, "geo/layer_27/stable_rank_gate_proj": 81.06159973144531, "geo/layer_27/stable_rank_down_proj": 129.0477752685547, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08574801683425903, "geo/layer_27/attn_entropy_mean": 4.2203779220581055, "geo/layer_27/attn_entropy_std": 0.7162074446678162, "attnres/final_alpha/block_0": 0.2394610345363617, "attnres/block_norm/0": 1.7570313215255737, "attnres/final_alpha/block_1": 0.004642934538424015, "attnres/block_norm/1": 45420.703125, "attnres/final_alpha/block_2": 0.01042184792459011, "attnres/block_norm/2": 28286.9609375, "attnres/final_alpha/block_3": 0.01242454256862402, "attnres/block_norm/3": 55449.96875, "attnres/final_alpha/block_4": 0.0144168920814991, "attnres/block_norm/4": 14694.46875, "attnres/final_alpha/block_5": 0.6066781282424927, "attnres/block_norm/5": 6520.63916015625, "attnres/final_alpha/block_6": 0.11195462942123413, "attnres/block_norm/6": 36865.04296875, "geo/tier1_time_s": 1.3639512062072754, "geo/step": 57675.0, "geo/rankme_slope": -9.628323985844336e-06} {"step": 57680, "timestamp": 1778256890.1591864, "train/loss": 2.1246328949928284, "train/z_loss": 0.0013821568340063096, "train/perplexity": 8.369824319657118, "train/grad_norm": 0.201171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1702599.0476827023, "perf/iters_per_sec": 0.8118624914563667, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2317356824874879, "data/tokens_consumed": 120965824512, "data/tokens_consumed_B": 120.965824512, "train/loss_slope": 1.6021998265538466e-06} {"step": 57690, "timestamp": 1778256900.5009766, "train/loss": 2.157801103591919, "train/z_loss": 0.001386760699097067, "train/perplexity": 8.652091672761234, "train/grad_norm": 0.2041015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028666.8214076885, "perf/iters_per_sec": 0.9673437220610087, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0337587118148803, "data/tokens_consumed": 120986796032, "data/tokens_consumed_B": 120.986796032, "train/loss_slope": 1.6267915334280995e-06} {"step": 57700, "timestamp": 1778256910.838277, "grad/layer_0/attn": 0.002797011286020279, "grad/layer_0/mlp": 0.003036815207451582, "grad/layer_0/attn_mlp_ratio": 0.9210343741211775, "grad/layer_4/attn": 0.002417526440694928, "grad/layer_4/mlp": 0.00262240506708622, "grad/layer_4/attn_mlp_ratio": 0.9218737329522064, "grad/layer_8/attn": 0.006517461501061916, "grad/layer_8/mlp": 0.003631229279562831, "grad/layer_8/attn_mlp_ratio": 1.7948360788616893, "grad/layer_12/attn": 0.005089244805276394, "grad/layer_12/mlp": 0.007030726410448551, "grad/layer_12/attn_mlp_ratio": 0.7238575981746881, "grad/layer_16/attn": 0.0035507038701325655, "grad/layer_16/mlp": 0.004794933833181858, "grad/layer_16/attn_mlp_ratio": 0.7405115314647022, "grad/layer_20/attn": 0.008764098398387432, "grad/layer_20/mlp": 0.007634895853698254, "grad/layer_20/attn_mlp_ratio": 1.147900174611051, "grad/layer_24/attn": 0.011555461212992668, "grad/layer_24/mlp": 0.010845792479813099, "grad/layer_24/attn_mlp_ratio": 1.0654326208026927, "grad/layer_27/attn": 0.009116002358496189, "grad/layer_27/mlp": 0.009984872303903103, "grad/layer_27/attn_mlp_ratio": 0.9129813571711471} {"step": 57700, "timestamp": 1778256910.8525457, "train/loss": 2.128803277015686, "train/z_loss": 0.0013939141179434955, "train/perplexity": 8.404802570172482, "train/grad_norm": 0.173828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026912.7247728193, "perf/iters_per_sec": 0.966507303606424, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034653329849243, "data/tokens_consumed": 121007767552, "data/tokens_consumed_B": 121.007767552, "train/loss_slope": -1.041553373133633e-06} {"step": 57710, "timestamp": 1778256921.9365287, "train/loss": 2.174889898300171, "train/z_loss": 0.0013633957132697105, "train/perplexity": 8.801216039993221, "train/grad_norm": 0.09814453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1893571.4364890887, "perf/iters_per_sec": 0.9029252226300662, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1075114250183105, "data/tokens_consumed": 121028739072, "data/tokens_consumed_B": 121.028739072, "train/loss_slope": 2.053103688741083e-06} {"step": 57720, "timestamp": 1778256932.2815647, "train/loss": 2.1540648221969603, "train/z_loss": 0.0013924770755693316, "train/perplexity": 8.619825339203647, "train/grad_norm": 0.2265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029112.7593459424, "perf/iters_per_sec": 0.9675563618402206, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0335315227508546, "data/tokens_consumed": 121049710592, "data/tokens_consumed_B": 121.049710592, "train/loss_slope": 1.6289054614232113e-06} {"step": 57730, "timestamp": 1778256943.1522505, "train/loss": 2.1087443828582764, "train/z_loss": 0.0013870092108845711, "train/perplexity": 8.237891150743186, "train/grad_norm": 0.2001953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1930350.8898399943, "perf/iters_per_sec": 0.9204630326461765, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0864097356796265, "data/tokens_consumed": 121070682112, "data/tokens_consumed_B": 121.070682112, "train/loss_slope": -2.013534709851466e-06} {"step": 57740, "timestamp": 1778256953.5107744, "train/loss": 2.163526773452759, "train/z_loss": 0.001382204471156001, "train/perplexity": 8.701772786386844, "train/grad_norm": 0.09130859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026483.393242823, "perf/iters_per_sec": 0.9663025823797335, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348725318908691, "data/tokens_consumed": 121091653632, "data/tokens_consumed_B": 121.091653632, "train/loss_slope": -3.890160866195729e-06} {"step": 57750, "timestamp": 1778256963.851159, "grad/layer_0/attn": 0.002668835921213031, "grad/layer_0/mlp": 0.0029101697728037834, "grad/layer_0/attn_mlp_ratio": 0.9170722115413005, "grad/layer_4/attn": 0.0026338647585362196, "grad/layer_4/mlp": 0.0025245673023164272, "grad/layer_4/attn_mlp_ratio": 1.0432934989652112, "grad/layer_8/attn": 0.0038203776348382235, "grad/layer_8/mlp": 0.0037311522755771875, "grad/layer_8/attn_mlp_ratio": 1.023913592981373, "grad/layer_12/attn": 0.004606734495609999, "grad/layer_12/mlp": 0.006653704214841127, "grad/layer_12/attn_mlp_ratio": 0.6923563593492801, "grad/layer_16/attn": 0.004337613936513662, "grad/layer_16/mlp": 0.004880521446466446, "grad/layer_16/attn_mlp_ratio": 0.8887603292427109, "grad/layer_20/attn": 0.0033876497764140368, "grad/layer_20/mlp": 0.006745791062712669, "grad/layer_20/attn_mlp_ratio": 0.5021871704447739, "grad/layer_24/attn": 0.014445317909121513, "grad/layer_24/mlp": 0.010484656319022179, "grad/layer_24/attn_mlp_ratio": 1.377757871294052, "grad/layer_27/attn": 0.007089646067470312, "grad/layer_27/mlp": 0.009311768226325512, "grad/layer_27/attn_mlp_ratio": 0.7613640953058307} {"step": 57750, "timestamp": 1778256964.462857, "eos/sharpness": 60.24742126464842, "eos/L0_probe": 1.984723448753357, "eos/L_plus": 2.3275201320648193, "eos/L_minus": 2.244400978088379, "eos/grad_norm": 0.15825968980789185, "eos/embed_grad_frac": 0.08854177594184875, "eos/time_s": 0.6086885929107666} {"step": 57750, "timestamp": 1778256964.4813576, "train/loss": 2.165446734428406, "train/z_loss": 0.0013773028273135423, "train/perplexity": 8.718495899280597, "train/grad_norm": 0.158203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1913228.7344967523, "perf/iters_per_sec": 0.9122985527499925, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0961323976516724, "data/tokens_consumed": 121112625152, "data/tokens_consumed_B": 121.112625152, "train/loss_slope": -3.173595479111978e-06} {"step": 57750, "timestamp": 1778256965.8495991, "geo/rankme_last": 439.2198181152344, "geo/layer_0/stable_rank_q_proj": 19.520048141479492, "geo/layer_0/stable_rank_k_proj": 16.19072151184082, "geo/layer_0/stable_rank_o_proj": 47.46205139160156, "geo/layer_0/stable_rank_gate_proj": 131.5536651611328, "geo/layer_0/stable_rank_down_proj": 54.553619384765625, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0641852393746376, "geo/layer_0/attn_entropy_mean": 6.163219451904297, "geo/layer_0/attn_entropy_std": 0.40886369347572327, "geo/layer_7/stable_rank_q_proj": 43.52039337158203, "geo/layer_7/stable_rank_k_proj": 41.17636489868164, "geo/layer_7/stable_rank_o_proj": 92.4861068725586, "geo/layer_7/stable_rank_gate_proj": 83.56539916992188, "geo/layer_7/stable_rank_down_proj": 141.28277587890625, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4682070314884186, "geo/layer_7/attn_entropy_mean": 4.627603054046631, "geo/layer_7/attn_entropy_std": 0.8032242059707642, "geo/layer_14/stable_rank_q_proj": 52.004615783691406, "geo/layer_14/stable_rank_k_proj": 39.22945022583008, "geo/layer_14/stable_rank_o_proj": 44.249820709228516, "geo/layer_14/stable_rank_gate_proj": 72.2488021850586, "geo/layer_14/stable_rank_down_proj": 129.4122772216797, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3866938650608063, "geo/layer_14/attn_entropy_mean": 5.523527145385742, "geo/layer_14/attn_entropy_std": 0.39376145601272583, "geo/layer_21/stable_rank_q_proj": 40.75068283081055, "geo/layer_21/stable_rank_k_proj": 30.07917594909668, "geo/layer_21/stable_rank_o_proj": 71.47087097167969, "geo/layer_21/stable_rank_gate_proj": 67.39275360107422, "geo/layer_21/stable_rank_down_proj": 52.52790069580078, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14478842914104462, "geo/layer_21/attn_entropy_mean": 5.704743385314941, "geo/layer_21/attn_entropy_std": 0.3113965094089508, "geo/layer_27/stable_rank_q_proj": 43.258670806884766, "geo/layer_27/stable_rank_k_proj": 31.799718856811523, "geo/layer_27/stable_rank_o_proj": 116.03399658203125, "geo/layer_27/stable_rank_gate_proj": 81.02544403076172, "geo/layer_27/stable_rank_down_proj": 128.78611755371094, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08378062397241592, "geo/layer_27/attn_entropy_mean": 4.20294189453125, "geo/layer_27/attn_entropy_std": 0.721294641494751, "attnres/final_alpha/block_0": 0.23641782999038696, "attnres/block_norm/0": 1.7571401596069336, "attnres/final_alpha/block_1": 0.004559633322060108, "attnres/block_norm/1": 45305.9609375, "attnres/final_alpha/block_2": 0.010301962494850159, "attnres/block_norm/2": 28111.51171875, "attnres/final_alpha/block_3": 0.012217111885547638, "attnres/block_norm/3": 55733.07421875, "attnres/final_alpha/block_4": 0.014213917776942253, "attnres/block_norm/4": 14724.599609375, "attnres/final_alpha/block_5": 0.6130362749099731, "attnres/block_norm/5": 6481.71826171875, "attnres/final_alpha/block_6": 0.10925333201885223, "attnres/block_norm/6": 37019.4921875, "geo/tier1_time_s": 1.364091157913208, "geo/step": 57750.0, "geo/rankme_slope": 3.6206474777410957e-06} {"step": 57760, "timestamp": 1778256976.193351, "train/loss": 2.1265242099761963, "train/z_loss": 0.0013977263821288942, "train/perplexity": 8.385669272976644, "train/grad_norm": 0.13671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1791202.860071902, "perf/iters_per_sec": 0.8541120815619955, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1708065271377563, "data/tokens_consumed": 121133596672, "data/tokens_consumed_B": 121.133596672, "train/loss_slope": -3.979040951904813e-06} {"step": 57770, "timestamp": 1778256987.1488457, "train/loss": 2.1635356903076173, "train/z_loss": 0.001372070680372417, "train/perplexity": 8.701850379177733, "train/grad_norm": 0.27734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1915668.0326600086, "perf/iters_per_sec": 0.9134617007541698, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.094736647605896, "data/tokens_consumed": 121154568192, "data/tokens_consumed_B": 121.154568192, "train/loss_slope": -5.3714307907498385e-06} {"step": 57780, "timestamp": 1778256997.499469, "train/loss": 2.111454451084137, "train/z_loss": 0.0013871814822778105, "train/perplexity": 8.260246676616628, "train/grad_norm": 0.10595703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027355.7423398236, "perf/iters_per_sec": 0.9667185508441084, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034427237510681, "data/tokens_consumed": 121175539712, "data/tokens_consumed_B": 121.175539712, "train/loss_slope": -5.848074746210597e-06} {"step": 57790, "timestamp": 1778257008.2542548, "train/loss": 2.092773509025574, "train/z_loss": 0.0013778717257082462, "train/perplexity": 8.107369874260497, "train/grad_norm": 0.12890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1951632.9026669106, "perf/iters_per_sec": 0.9306110871634057, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0745627403259277, "data/tokens_consumed": 121196511232, "data/tokens_consumed_B": 121.196511232, "train/loss_slope": -1.2983932289102938e-05} {"step": 57800, "timestamp": 1778257018.5905912, "grad/layer_0/attn": 0.0028311247006058693, "grad/layer_0/mlp": 0.0030643849167972803, "grad/layer_0/attn_mlp_ratio": 0.9238802190609843, "grad/layer_4/attn": 0.0024597039446234703, "grad/layer_4/mlp": 0.0025210182648152113, "grad/layer_4/attn_mlp_ratio": 0.9756787094265236, "grad/layer_8/attn": 0.00355325685814023, "grad/layer_8/mlp": 0.003643171163275838, "grad/layer_8/attn_mlp_ratio": 0.9753197424337501, "grad/layer_12/attn": 0.004275511018931866, "grad/layer_12/mlp": 0.007116199936717749, "grad/layer_12/attn_mlp_ratio": 0.6008137765761694, "grad/layer_16/attn": 0.0032610176131129265, "grad/layer_16/mlp": 0.004428254906088114, "grad/layer_16/attn_mlp_ratio": 0.7364114326364607, "grad/layer_20/attn": 0.004415992647409439, "grad/layer_20/mlp": 0.006018210202455521, "grad/layer_20/attn_mlp_ratio": 0.7337717403473999, "grad/layer_24/attn": 0.017865948379039764, "grad/layer_24/mlp": 0.010856620967388153, "grad/layer_24/attn_mlp_ratio": 1.6456269651619968, "grad/layer_27/attn": 0.006675850600004196, "grad/layer_27/mlp": 0.010904882103204727, "grad/layer_27/attn_mlp_ratio": 0.612189153042139} {"step": 57800, "timestamp": 1778257018.6054788, "train/loss": 2.126521813869476, "train/z_loss": 0.0013837400474585593, "train/perplexity": 8.385649180042218, "train/grad_norm": 0.19140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027558.0447373784, "perf/iters_per_sec": 0.9668150161444561, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343240261077882, "data/tokens_consumed": 121217482752, "data/tokens_consumed_B": 121.217482752, "train/loss_slope": -1.4204258887287824e-05} {"step": 57810, "timestamp": 1778257028.9588356, "train/loss": 2.183750295639038, "train/z_loss": 0.001370901265181601, "train/perplexity": 8.87954481074535, "train/grad_norm": 0.1455078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026464.9987183828, "perf/iters_per_sec": 0.966293811186973, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348819255828858, "data/tokens_consumed": 121238454272, "data/tokens_consumed_B": 121.238454272, "train/loss_slope": -1.1709780532820915e-05} {"step": 57820, "timestamp": 1778257039.3020468, "train/loss": 2.177275228500366, "train/z_loss": 0.0013840182218700647, "train/perplexity": 8.822234904912397, "train/grad_norm": 0.2109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028988.303801478, "perf/iters_per_sec": 0.9674970168120756, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0335949182510376, "data/tokens_consumed": 121259425792, "data/tokens_consumed_B": 121.259425792, "train/loss_slope": -1.2092364567591602e-05} {"step": 57825, "timestamp": 1778257045.0649695, "eos/sharpness": 51.70056819915771, "eos/L0_probe": 1.978198528289795, "eos/L_plus": 2.240032434463501, "eos/L_minus": 2.233370304107666, "eos/grad_norm": 0.19249844551086426, "eos/embed_grad_frac": 0.08321201056241989, "eos/time_s": 0.5939619541168213} {"step": 57825, "timestamp": 1778257046.4468572, "geo/rankme_last": 438.4284362792969, "geo/layer_0/stable_rank_q_proj": 19.526458740234375, "geo/layer_0/stable_rank_k_proj": 16.153427124023438, "geo/layer_0/stable_rank_o_proj": 47.47734451293945, "geo/layer_0/stable_rank_gate_proj": 131.57835388183594, "geo/layer_0/stable_rank_down_proj": 54.59044647216797, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0627349317073822, "geo/layer_0/attn_entropy_mean": 6.166644096374512, "geo/layer_0/attn_entropy_std": 0.4073338806629181, "geo/layer_7/stable_rank_q_proj": 43.54363250732422, "geo/layer_7/stable_rank_k_proj": 41.15899658203125, "geo/layer_7/stable_rank_o_proj": 92.24750518798828, "geo/layer_7/stable_rank_gate_proj": 83.35919189453125, "geo/layer_7/stable_rank_down_proj": 141.3492889404297, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4686369299888611, "geo/layer_7/attn_entropy_mean": 4.637142658233643, "geo/layer_7/attn_entropy_std": 0.7877438068389893, "geo/layer_14/stable_rank_q_proj": 52.06254959106445, "geo/layer_14/stable_rank_k_proj": 39.27943801879883, "geo/layer_14/stable_rank_o_proj": 44.29444885253906, "geo/layer_14/stable_rank_gate_proj": 72.17508697509766, "geo/layer_14/stable_rank_down_proj": 129.5084991455078, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4043871760368347, "geo/layer_14/attn_entropy_mean": 5.552750587463379, "geo/layer_14/attn_entropy_std": 0.3924119770526886, "geo/layer_21/stable_rank_q_proj": 40.82579803466797, "geo/layer_21/stable_rank_k_proj": 30.095247268676758, "geo/layer_21/stable_rank_o_proj": 71.45914459228516, "geo/layer_21/stable_rank_gate_proj": 67.35417938232422, "geo/layer_21/stable_rank_down_proj": 52.55097198486328, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14685310423374176, "geo/layer_21/attn_entropy_mean": 5.709522247314453, "geo/layer_21/attn_entropy_std": 0.29855963587760925, "geo/layer_27/stable_rank_q_proj": 43.209381103515625, "geo/layer_27/stable_rank_k_proj": 31.74717140197754, "geo/layer_27/stable_rank_o_proj": 116.05717468261719, "geo/layer_27/stable_rank_gate_proj": 81.03749084472656, "geo/layer_27/stable_rank_down_proj": 128.79640197753906, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0914774090051651, "geo/layer_27/attn_entropy_mean": 4.23136043548584, "geo/layer_27/attn_entropy_std": 0.7221909165382385, "attnres/final_alpha/block_0": 0.23620249330997467, "attnres/block_norm/0": 1.7576348781585693, "attnres/final_alpha/block_1": 0.004630163311958313, "attnres/block_norm/1": 45403.0390625, "attnres/final_alpha/block_2": 0.010403990745544434, "attnres/block_norm/2": 28223.1953125, "attnres/final_alpha/block_3": 0.012258430942893028, "attnres/block_norm/3": 55945.0078125, "attnres/final_alpha/block_4": 0.014241352677345276, "attnres/block_norm/4": 14711.9765625, "attnres/final_alpha/block_5": 0.6100595593452454, "attnres/block_norm/5": 6573.92138671875, "attnres/final_alpha/block_6": 0.11220400035381317, "attnres/block_norm/6": 37145.109375, "geo/tier1_time_s": 1.363879919052124, "geo/step": 57825.0, "geo/rankme_slope": -1.4531808817276909e-05} {"step": 57830, "timestamp": 1778257051.6199076, "train/loss": 2.148763370513916, "train/z_loss": 0.0013883232255466282, "train/perplexity": 8.574248669653118, "train/grad_norm": 0.251953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1703755.5019346988, "perf/iters_per_sec": 0.8124139318154806, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2308996200561524, "data/tokens_consumed": 121280397312, "data/tokens_consumed_B": 121.280397312, "train/loss_slope": -1.037134612747548e-05} {"step": 57840, "timestamp": 1778257061.9618134, "train/loss": 2.1051691293716432, "train/z_loss": 0.0013958966243080794, "train/perplexity": 8.208491189158492, "train/grad_norm": 0.173828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028766.2501300257, "perf/iters_per_sec": 0.9673911333704117, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0337080478668212, "data/tokens_consumed": 121301368832, "data/tokens_consumed_B": 121.301368832, "train/loss_slope": -1.1768714472441376e-05} {"step": 57850, "timestamp": 1778257072.2953477, "grad/layer_0/attn": 0.00265333941206336, "grad/layer_0/mlp": 0.0027811441104859114, "grad/layer_0/attn_mlp_ratio": 0.9540459649878344, "grad/layer_4/attn": 0.002676095347851515, "grad/layer_4/mlp": 0.00249662296846509, "grad/layer_4/attn_mlp_ratio": 1.071886013412815, "grad/layer_8/attn": 0.006323899608105421, "grad/layer_8/mlp": 0.003653450170531869, "grad/layer_8/attn_mlp_ratio": 1.7309390137626777, "grad/layer_12/attn": 0.0048470874316990376, "grad/layer_12/mlp": 0.006905661430209875, "grad/layer_12/attn_mlp_ratio": 0.7019005218391765, "grad/layer_16/attn": 0.004084818996489048, "grad/layer_16/mlp": 0.004926457069814205, "grad/layer_16/attn_mlp_ratio": 0.829159547253935, "grad/layer_20/attn": 0.0032576979137957096, "grad/layer_20/mlp": 0.005997828207910061, "grad/layer_20/attn_mlp_ratio": 0.543146243366018, "grad/layer_24/attn": 0.00919799692928791, "grad/layer_24/mlp": 0.008750567212700844, "grad/layer_24/attn_mlp_ratio": 1.051131498175856, "grad/layer_27/attn": 0.0086930887773633, "grad/layer_27/mlp": 0.007729737088084221, "grad/layer_27/attn_mlp_ratio": 1.1246292811564325} {"step": 57850, "timestamp": 1778257072.3094962, "train/loss": 2.182080388069153, "train/z_loss": 0.0013769364566542208, "train/perplexity": 8.864729165470749, "train/grad_norm": 0.1650390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028093.2261452668, "perf/iters_per_sec": 0.9670702105261167, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340510845184325, "data/tokens_consumed": 121322340352, "data/tokens_consumed_B": 121.322340352, "train/loss_slope": -9.37394076483839e-06} {"step": 57860, "timestamp": 1778257082.6526322, "train/loss": 2.1186114311218263, "train/z_loss": 0.0013819920364767312, "train/perplexity": 8.319577157467364, "train/grad_norm": 0.158203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028961.299104623, "perf/iters_per_sec": 0.9674841399691692, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0336086750030518, "data/tokens_consumed": 121343311872, "data/tokens_consumed_B": 121.343311872, "train/loss_slope": -1.1553980758374012e-05} {"step": 57870, "timestamp": 1778257093.0041552, "train/loss": 2.1095751762390136, "train/z_loss": 0.0014017838751897217, "train/perplexity": 8.244737979938828, "train/grad_norm": 0.140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026787.838435657, "perf/iters_per_sec": 0.9664477531603132, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034717082977295, "data/tokens_consumed": 121364283392, "data/tokens_consumed_B": 121.364283392, "train/loss_slope": -1.517786964653424e-05} {"step": 57880, "timestamp": 1778257103.3528824, "train/loss": 2.1229821920394896, "train/z_loss": 0.0014002221403643488, "train/perplexity": 8.356019622795284, "train/grad_norm": 0.177734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027927.4710990537, "perf/iters_per_sec": 0.9669911723609227, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034135603904724, "data/tokens_consumed": 121385254912, "data/tokens_consumed_B": 121.385254912, "train/loss_slope": -1.6015893898674138e-05} {"step": 57890, "timestamp": 1778257113.706003, "train/loss": 2.1158360719680784, "train/z_loss": 0.0013794466736726462, "train/perplexity": 8.296519354490806, "train/grad_norm": 0.26953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026754.0274301067, "perf/iters_per_sec": 0.9664316308165105, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034734344482422, "data/tokens_consumed": 121406226432, "data/tokens_consumed_B": 121.406226432, "train/loss_slope": -1.7068821984012743e-05} {"step": 57900, "timestamp": 1778257124.04484, "grad/layer_0/attn": 0.0027713689487427473, "grad/layer_0/mlp": 0.0030153158586472273, "grad/layer_0/attn_mlp_ratio": 0.9190973638417903, "grad/layer_4/attn": 0.0020925034768879414, "grad/layer_4/mlp": 0.0024982586037367582, "grad/layer_4/attn_mlp_ratio": 0.8375847840570272, "grad/layer_8/attn": 0.004670603200793266, "grad/layer_8/mlp": 0.003670032136142254, "grad/layer_8/attn_mlp_ratio": 1.2726327455103665, "grad/layer_12/attn": 0.004590495023876429, "grad/layer_12/mlp": 0.006911454256623983, "grad/layer_12/attn_mlp_ratio": 0.6641865498940709, "grad/layer_16/attn": 0.00346161425113678, "grad/layer_16/mlp": 0.004583902191370726, "grad/layer_16/attn_mlp_ratio": 0.7551675474525991, "grad/layer_20/attn": 0.004696802701801062, "grad/layer_20/mlp": 0.005724289920181036, "grad/layer_20/attn_mlp_ratio": 0.8205039725874199, "grad/layer_24/attn": 0.006161957047879696, "grad/layer_24/mlp": 0.008298961445689201, "grad/layer_24/attn_mlp_ratio": 0.742497361140377, "grad/layer_27/attn": 0.00393310934305191, "grad/layer_27/mlp": 0.007397735491394997, "grad/layer_27/attn_mlp_ratio": 0.5316639523622446} {"step": 57900, "timestamp": 1778257124.6568286, "eos/sharpness": 55.54447174072264, "eos/L0_probe": 1.97682523727417, "eos/L_plus": 2.1986610889434814, "eos/L_minus": 2.310434103012085, "eos/grad_norm": 0.12695081532001495, "eos/embed_grad_frac": 0.13688421249389648, "eos/time_s": 0.6092350482940674} {"step": 57900, "timestamp": 1778257124.6771054, "train/loss": 2.142083263397217, "train/z_loss": 0.0013800413347780705, "train/perplexity": 8.517162652731475, "train/grad_norm": 0.126953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1912311.7424347685, "perf/iters_per_sec": 0.9118612968610613, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0966580152511596, "data/tokens_consumed": 121427197952, "data/tokens_consumed_B": 121.427197952, "train/loss_slope": -1.9704346881412154e-05} {"step": 57900, "timestamp": 1778257126.0440936, "geo/rankme_last": 438.7911376953125, "geo/layer_0/stable_rank_q_proj": 19.496295928955078, "geo/layer_0/stable_rank_k_proj": 16.098499298095703, "geo/layer_0/stable_rank_o_proj": 47.39422607421875, "geo/layer_0/stable_rank_gate_proj": 131.6051483154297, "geo/layer_0/stable_rank_down_proj": 54.66604995727539, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06270325928926468, "geo/layer_0/attn_entropy_mean": 6.164459228515625, "geo/layer_0/attn_entropy_std": 0.4083915948867798, "geo/layer_7/stable_rank_q_proj": 43.61512756347656, "geo/layer_7/stable_rank_k_proj": 41.22529602050781, "geo/layer_7/stable_rank_o_proj": 92.23948669433594, "geo/layer_7/stable_rank_gate_proj": 83.18984985351562, "geo/layer_7/stable_rank_down_proj": 141.34796142578125, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.46625298261642456, "geo/layer_7/attn_entropy_mean": 4.652276992797852, "geo/layer_7/attn_entropy_std": 0.794601559638977, "geo/layer_14/stable_rank_q_proj": 52.02273178100586, "geo/layer_14/stable_rank_k_proj": 39.3941650390625, "geo/layer_14/stable_rank_o_proj": 44.307090759277344, "geo/layer_14/stable_rank_gate_proj": 72.1583023071289, "geo/layer_14/stable_rank_down_proj": 129.4379119873047, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39329034090042114, "geo/layer_14/attn_entropy_mean": 5.536258697509766, "geo/layer_14/attn_entropy_std": 0.4238905608654022, "geo/layer_21/stable_rank_q_proj": 40.82742691040039, "geo/layer_21/stable_rank_k_proj": 30.099380493164062, "geo/layer_21/stable_rank_o_proj": 71.35570526123047, "geo/layer_21/stable_rank_gate_proj": 67.32574462890625, "geo/layer_21/stable_rank_down_proj": 52.53379440307617, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1426934152841568, "geo/layer_21/attn_entropy_mean": 5.694806098937988, "geo/layer_21/attn_entropy_std": 0.29807746410369873, "geo/layer_27/stable_rank_q_proj": 43.061397552490234, "geo/layer_27/stable_rank_k_proj": 31.756210327148438, "geo/layer_27/stable_rank_o_proj": 116.06497955322266, "geo/layer_27/stable_rank_gate_proj": 81.08720397949219, "geo/layer_27/stable_rank_down_proj": 128.91619873046875, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09101948887109756, "geo/layer_27/attn_entropy_mean": 4.214988708496094, "geo/layer_27/attn_entropy_std": 0.7127150893211365, "attnres/final_alpha/block_0": 0.23590585589408875, "attnres/block_norm/0": 1.7576760053634644, "attnres/final_alpha/block_1": 0.004625237546861172, "attnres/block_norm/1": 45555.53125, "attnres/final_alpha/block_2": 0.010196742601692677, "attnres/block_norm/2": 28106.375, "attnres/final_alpha/block_3": 0.012034034356474876, "attnres/block_norm/3": 55882.50390625, "attnres/final_alpha/block_4": 0.014180788770318031, "attnres/block_norm/4": 14748.7041015625, "attnres/final_alpha/block_5": 0.611340343952179, "attnres/block_norm/5": 6523.337890625, "attnres/final_alpha/block_6": 0.11171702295541763, "attnres/block_norm/6": 37192.390625, "geo/tier1_time_s": 1.3628315925598145, "geo/step": 57900.0, "geo/rankme_slope": -2.2014098608193276e-05} {"step": 57910, "timestamp": 1778257136.4272323, "train/loss": 2.1728796005249023, "train/z_loss": 0.0013892196817323565, "train/perplexity": 8.783540747221824, "train/grad_norm": 0.15234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1785443.447764447, "perf/iters_per_sec": 0.8513657797643885, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1745832681655883, "data/tokens_consumed": 121448169472, "data/tokens_consumed_B": 121.448169472, "train/loss_slope": -1.799289650148799e-05} {"step": 57920, "timestamp": 1778257146.807449, "train/loss": 2.0991387367248535, "train/z_loss": 0.0013801328954286874, "train/perplexity": 8.159139718182038, "train/grad_norm": 0.08642578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021835.831013089, "perf/iters_per_sec": 0.964086452013535, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0372513771057128, "data/tokens_consumed": 121469140992, "data/tokens_consumed_B": 121.469140992, "train/loss_slope": -2.383893184249843e-05} {"step": 57930, "timestamp": 1778257157.1594672, "train/loss": 2.141605186462402, "train/z_loss": 0.0013777715270407499, "train/perplexity": 8.513091766892988, "train/grad_norm": 0.15234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026868.167447588, "perf/iters_per_sec": 0.9664860570180835, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346760749816895, "data/tokens_consumed": 121490112512, "data/tokens_consumed_B": 121.490112512, "train/loss_slope": -2.2141629963567877e-05} {"step": 57940, "timestamp": 1778257167.5161855, "train/loss": 2.2002241373062135, "train/z_loss": 0.0013828993192873894, "train/perplexity": 9.027036568362565, "train/grad_norm": 0.1982421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026252.9255693713, "perf/iters_per_sec": 0.9661926868292672, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349902391433716, "data/tokens_consumed": 121511084032, "data/tokens_consumed_B": 121.511084032, "train/loss_slope": -1.9123254085567175e-05} {"step": 57950, "timestamp": 1778257177.8642855, "grad/layer_0/attn": 0.0029104112181812525, "grad/layer_0/mlp": 0.0031956927850842476, "grad/layer_0/attn_mlp_ratio": 0.9107293231353568, "grad/layer_4/attn": 0.0038489936850965023, "grad/layer_4/mlp": 0.002675179624930024, "grad/layer_4/attn_mlp_ratio": 1.4387794768432547, "grad/layer_8/attn": 0.0038491704035550356, "grad/layer_8/mlp": 0.00391876557841897, "grad/layer_8/attn_mlp_ratio": 0.9822405112795587, "grad/layer_12/attn": 0.005299982149153948, "grad/layer_12/mlp": 0.007009715773165226, "grad/layer_12/attn_mlp_ratio": 0.7560908665989553, "grad/layer_16/attn": 0.004992471542209387, "grad/layer_16/mlp": 0.004692905582487583, "grad/layer_16/attn_mlp_ratio": 1.0638337695214475, "grad/layer_20/attn": 0.0032227702904492617, "grad/layer_20/mlp": 0.006475977133959532, "grad/layer_20/attn_mlp_ratio": 0.4976500339669664, "grad/layer_24/attn": 0.012738754041492939, "grad/layer_24/mlp": 0.011999974958598614, "grad/layer_24/attn_mlp_ratio": 1.061565043201065, "grad/layer_27/attn": 0.003979756496846676, "grad/layer_27/mlp": 0.011191893368959427, "grad/layer_27/attn_mlp_ratio": 0.3555927786379025} {"step": 57950, "timestamp": 1778257177.8784854, "train/loss": 2.1622636795043944, "train/z_loss": 0.0013905938947573304, "train/perplexity": 8.690788568350339, "train/grad_norm": 0.154296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024967.2465030146, "perf/iters_per_sec": 0.9655796272769044, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0356473684310914, "data/tokens_consumed": 121532055552, "data/tokens_consumed_B": 121.532055552, "train/loss_slope": -1.9257206685043133e-05} {"step": 57960, "timestamp": 1778257188.224597, "train/loss": 2.137848234176636, "train/z_loss": 0.0013768653734587133, "train/perplexity": 8.481168491979668, "train/grad_norm": 0.095703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028103.6539553148, "perf/iters_per_sec": 0.9670751828934263, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340457677841186, "data/tokens_consumed": 121553027072, "data/tokens_consumed_B": 121.553027072, "train/loss_slope": -2.027617524487823e-05} {"step": 57970, "timestamp": 1778257198.5676153, "train/loss": 2.1714941263198853, "train/z_loss": 0.0013688595849089325, "train/perplexity": 8.771379804369431, "train/grad_norm": 0.1689453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029098.5297329687, "perf/iters_per_sec": 0.9675495766320079, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0335387706756591, "data/tokens_consumed": 121573998592, "data/tokens_consumed_B": 121.573998592, "train/loss_slope": -2.0237496469316344e-05} {"step": 57975, "timestamp": 1778257204.3209825, "eos/sharpness": 75.10902881622313, "eos/L0_probe": 1.9797438383102417, "eos/L_plus": 2.265709161758423, "eos/L_minus": 2.444868803024292, "eos/grad_norm": 0.1867641657590866, "eos/embed_grad_frac": 0.06555824726819992, "eos/time_s": 0.5938458442687988} {"step": 57975, "timestamp": 1778257205.7032394, "geo/rankme_last": 439.53790283203125, "geo/layer_0/stable_rank_q_proj": 19.4964656829834, "geo/layer_0/stable_rank_k_proj": 16.137929916381836, "geo/layer_0/stable_rank_o_proj": 47.3062629699707, "geo/layer_0/stable_rank_gate_proj": 131.57785034179688, "geo/layer_0/stable_rank_down_proj": 54.68831253051758, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06064533814787865, "geo/layer_0/attn_entropy_mean": 6.160246849060059, "geo/layer_0/attn_entropy_std": 0.40852001309394836, "geo/layer_7/stable_rank_q_proj": 43.740150451660156, "geo/layer_7/stable_rank_k_proj": 41.22621536254883, "geo/layer_7/stable_rank_o_proj": 92.36571502685547, "geo/layer_7/stable_rank_gate_proj": 83.05302429199219, "geo/layer_7/stable_rank_down_proj": 141.22940063476562, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4608347415924072, "geo/layer_7/attn_entropy_mean": 4.649706840515137, "geo/layer_7/attn_entropy_std": 0.8000733256340027, "geo/layer_14/stable_rank_q_proj": 52.001197814941406, "geo/layer_14/stable_rank_k_proj": 39.46182632446289, "geo/layer_14/stable_rank_o_proj": 44.25607681274414, "geo/layer_14/stable_rank_gate_proj": 71.95494842529297, "geo/layer_14/stable_rank_down_proj": 129.42416381835938, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38335275650024414, "geo/layer_14/attn_entropy_mean": 5.510714530944824, "geo/layer_14/attn_entropy_std": 0.3955739438533783, "geo/layer_21/stable_rank_q_proj": 40.777687072753906, "geo/layer_21/stable_rank_k_proj": 30.04021453857422, "geo/layer_21/stable_rank_o_proj": 71.3245849609375, "geo/layer_21/stable_rank_gate_proj": 67.25751495361328, "geo/layer_21/stable_rank_down_proj": 52.51275634765625, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1399947553873062, "geo/layer_21/attn_entropy_mean": 5.709078311920166, "geo/layer_21/attn_entropy_std": 0.3082464635372162, "geo/layer_27/stable_rank_q_proj": 43.01410675048828, "geo/layer_27/stable_rank_k_proj": 31.687881469726562, "geo/layer_27/stable_rank_o_proj": 116.0181655883789, "geo/layer_27/stable_rank_gate_proj": 81.05567932128906, "geo/layer_27/stable_rank_down_proj": 128.93800354003906, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08464480936527252, "geo/layer_27/attn_entropy_mean": 4.232585906982422, "geo/layer_27/attn_entropy_std": 0.7269495129585266, "attnres/final_alpha/block_0": 0.23761145770549774, "attnres/block_norm/0": 1.7576581239700317, "attnres/final_alpha/block_1": 0.004633048549294472, "attnres/block_norm/1": 45618.875, "attnres/final_alpha/block_2": 0.010496590286493301, "attnres/block_norm/2": 28222.53125, "attnres/final_alpha/block_3": 0.012224502861499786, "attnres/block_norm/3": 55931.0859375, "attnres/final_alpha/block_4": 0.014495083130896091, "attnres/block_norm/4": 14767.392578125, "attnres/final_alpha/block_5": 0.6077486872673035, "attnres/block_norm/5": 6544.24951171875, "attnres/final_alpha/block_6": 0.11279065161943436, "attnres/block_norm/6": 36763.0625, "geo/tier1_time_s": 1.3644380569458008, "geo/step": 57975.0, "geo/rankme_slope": 1.630493994472789e-05} {"step": 57980, "timestamp": 1778257210.8748772, "train/loss": 2.1603336334228516, "train/z_loss": 0.0013780892826616765, "train/perplexity": 8.674031122451773, "train/grad_norm": 0.1064453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1704741.8731434287, "perf/iters_per_sec": 0.8128842702595848, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2301874160766602, "data/tokens_consumed": 121594970112, "data/tokens_consumed_B": 121.594970112, "train/loss_slope": -1.9265726104070278e-05} {"step": 57990, "timestamp": 1778257221.2264485, "train/loss": 2.1082789421081545, "train/z_loss": 0.001373211876489222, "train/perplexity": 8.234057792676293, "train/grad_norm": 0.1416015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026915.2936510867, "perf/iters_per_sec": 0.9665085285430368, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034652018547058, "data/tokens_consumed": 121615941632, "data/tokens_consumed_B": 121.615941632, "train/loss_slope": -2.0540086929053528e-05} {"step": 58000, "timestamp": 1778257231.5637944, "grad/layer_0/attn": 0.002577365841716528, "grad/layer_0/mlp": 0.002737879753112793, "grad/layer_0/attn_mlp_ratio": 0.9413728797435099, "grad/layer_4/attn": 0.0022418953012675047, "grad/layer_4/mlp": 0.002525717718526721, "grad/layer_4/attn_mlp_ratio": 0.8876269885822902, "grad/layer_8/attn": 0.0037123640067875385, "grad/layer_8/mlp": 0.0037276644725352526, "grad/layer_8/attn_mlp_ratio": 0.9958954016784541, "grad/layer_12/attn": 0.0036733613815158606, "grad/layer_12/mlp": 0.006447207648307085, "grad/layer_12/attn_mlp_ratio": 0.5697600457314899, "grad/layer_16/attn": 0.0034724874421954155, "grad/layer_16/mlp": 0.004363524727523327, "grad/layer_16/attn_mlp_ratio": 0.7957987130707697, "grad/layer_20/attn": 0.003578456584364176, "grad/layer_20/mlp": 0.006311572622507811, "grad/layer_20/attn_mlp_ratio": 0.5669674963267046, "grad/layer_24/attn": 0.024276021867990494, "grad/layer_24/mlp": 0.01246552262455225, "grad/layer_24/attn_mlp_ratio": 1.9474531798154069, "grad/layer_27/attn": 0.008801115676760674, "grad/layer_27/mlp": 0.011628860607743263, "grad/layer_27/attn_mlp_ratio": 0.7568338720318758} {"step": 58000, "timestamp": 1778257231.5782547, "train/loss": 2.125860404968262, "train/z_loss": 0.0013683111174032092, "train/perplexity": 8.380104670828077, "train/grad_norm": 0.2314453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027423.2187169844, "perf/iters_per_sec": 0.9667507260880396, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034392809867859, "data/tokens_consumed": 121636913152, "data/tokens_consumed_B": 121.636913152, "train/loss_slope": -2.1834476903291117e-05} {"step": 58000, "timestamp": 1778257238.7630377, "geo/ww_alpha_mean": 7.656542490703078, "geo/ww_alpha_std": 4.31679534323063, "geo/ww_alpha_min": 1.3360585423359017, "geo/ww_alpha_max": 25.147047031664975, "geo/ww_alpha_healthy_frac": 0.15228426395939088, "geo/ww_alpha_by_type/q_proj": 4.055355567420436, "geo/ww_alpha_by_type/k_proj": 4.41490381110217, "geo/ww_alpha_by_type/v_proj": 9.373094434477986, "geo/ww_alpha_by_type/o_proj": 7.715265056954121, "geo/ww_alpha_by_type/gate_proj": 7.812509905357672, "geo/ww_alpha_by_type/up_proj": 11.975568037524202, "geo/ww_alpha_by_type/down_proj": 8.346440198490894, "geo/twonn_id/layer_0": 0.7004041075706482, "geo/twonn_id/layer_7": 3.3938448429107666, "geo/twonn_id/layer_14": 4.290434837341309, "geo/twonn_id/layer_21": 6.36958122253418, "geo/twonn_id/layer_27": 5.495238780975342, "geo/tier2_time_s": 7.175227165222168} {"step": 58000, "timestamp": 1778257239.530567, "eoc/jacobian_sigma/layer_0/attn": 1250.5452880859375, "eoc/jacobian_sigma/layer_0/mlp": 9295.5859375, "eoc/jacobian_sigma/layer_0": 9295.5859375, "eoc/jacobian_sigma/layer_7/attn": 1.1419641971588135, "eoc/jacobian_sigma/layer_7/mlp": 1.7906405925750732, "eoc/jacobian_sigma/layer_7": 1.7906405925750732, "eoc/jacobian_sigma/layer_14/attn": 1.515134572982788, "eoc/jacobian_sigma/layer_14/mlp": 5.59968900680542, "eoc/jacobian_sigma/layer_14": 5.59968900680542, "eoc/jacobian_sigma/layer_21/attn": 1.0702875852584839, "eoc/jacobian_sigma/layer_21/mlp": 4.452407360076904, "eoc/jacobian_sigma/layer_21": 4.452407360076904, "eoc/jacobian_sigma/layer_27/attn": 3.7951977252960205, "eoc/jacobian_sigma/layer_27/mlp": 31.01955795288086, "eoc/jacobian_sigma/layer_27": 31.01955795288086, "eoc/layer0_sigma": 9295.5859375, "eoc/sigma_max": 31.01955795288086, "eoc/sigma_min": 1.7906405925750732, "eoc/sigma_mean": 10.715573728084564, "eoc/time_s": 0.7601618766784668} {"step": 58010, "timestamp": 1778257249.912493, "train/loss": 2.250744843482971, "train/z_loss": 0.0013584915781393648, "train/perplexity": 9.49480534707704, "train/grad_norm": 0.1728515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1144342.2808402479, "perf/iters_per_sec": 0.5456649212075462, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.832626509666443, "data/tokens_consumed": 121657884672, "data/tokens_consumed_B": 121.657884672, "train/loss_slope": -1.8156720595498633e-05} {"step": 58020, "timestamp": 1778257260.2707477, "train/loss": 2.127742850780487, "train/z_loss": 0.00139357018051669, "train/perplexity": 8.395894620971672, "train/grad_norm": 0.1318359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026431.992122156, "perf/iters_per_sec": 0.9662780724154263, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348987817764281, "data/tokens_consumed": 121678856192, "data/tokens_consumed_B": 121.678856192, "train/loss_slope": -1.8540350631876458e-05} {"step": 58030, "timestamp": 1778257270.6248407, "train/loss": 2.16306608915329, "train/z_loss": 0.0013870302122086287, "train/perplexity": 8.69776493953355, "train/grad_norm": 0.1962890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026530.8283797204, "perf/iters_per_sec": 0.9663252012156107, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348483085632325, "data/tokens_consumed": 121699827712, "data/tokens_consumed_B": 121.699827712, "train/loss_slope": -1.742781741533992e-05} {"step": 58040, "timestamp": 1778257280.9894125, "train/loss": 2.1617711544036866, "train/z_loss": 0.0013683034223504365, "train/perplexity": 8.686509190772426, "train/grad_norm": 0.1767578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024508.497346785, "perf/iters_per_sec": 0.9653608786329197, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035882043838501, "data/tokens_consumed": 121720799232, "data/tokens_consumed_B": 121.720799232, "train/loss_slope": -1.6096891199473085e-05} {"step": 58050, "timestamp": 1778257291.3340845, "grad/layer_0/attn": 0.0028558429330587387, "grad/layer_0/mlp": 0.003149705706164241, "grad/layer_0/attn_mlp_ratio": 0.9067014854116255, "grad/layer_4/attn": 0.0021268546115607023, "grad/layer_4/mlp": 0.0026253582909703255, "grad/layer_4/attn_mlp_ratio": 0.8101197226541792, "grad/layer_8/attn": 0.004647715017199516, "grad/layer_8/mlp": 0.0036046216264367104, "grad/layer_8/attn_mlp_ratio": 1.2893766308715946, "grad/layer_12/attn": 0.0050802952609956264, "grad/layer_12/mlp": 0.00662696547806263, "grad/layer_12/attn_mlp_ratio": 0.7666095743447083, "grad/layer_16/attn": 0.003258238546550274, "grad/layer_16/mlp": 0.004400960635393858, "grad/layer_16/attn_mlp_ratio": 0.7403471065639222, "grad/layer_20/attn": 0.0034126273822039366, "grad/layer_20/mlp": 0.00607073912397027, "grad/layer_20/attn_mlp_ratio": 0.5621436296800894, "grad/layer_24/attn": 0.01417076587677002, "grad/layer_24/mlp": 0.012265510857105255, "grad/layer_24/attn_mlp_ratio": 1.1553343294321607, "grad/layer_27/attn": 0.010059656575322151, "grad/layer_27/mlp": 0.0124216228723526, "grad/layer_27/attn_mlp_ratio": 0.8098504195234721} {"step": 58050, "timestamp": 1778257291.9577837, "eos/sharpness": 78.76105308532713, "eos/L0_probe": 1.9802743196487427, "eos/L_plus": 2.334120750427246, "eos/L_minus": 2.4140384197235107, "eos/grad_norm": 0.22440849244594574, "eos/embed_grad_frac": 0.04547291621565819, "eos/time_s": 0.6209640502929688} {"step": 58050, "timestamp": 1778257291.9757338, "train/loss": 2.145413839817047, "train/z_loss": 0.0013807947863824666, "train/perplexity": 8.545577005647345, "train/grad_norm": 0.224609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1910308.3702021195, "perf/iters_per_sec": 0.9109060145388219, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0978080987930299, "data/tokens_consumed": 121741770752, "data/tokens_consumed_B": 121.741770752, "train/loss_slope": -1.3545327375430924e-05} {"step": 58050, "timestamp": 1778257293.3343039, "geo/rankme_last": 438.66644287109375, "geo/layer_0/stable_rank_q_proj": 19.47450065612793, "geo/layer_0/stable_rank_k_proj": 16.126686096191406, "geo/layer_0/stable_rank_o_proj": 47.24982452392578, "geo/layer_0/stable_rank_gate_proj": 131.65118408203125, "geo/layer_0/stable_rank_down_proj": 54.688114166259766, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06141376495361328, "geo/layer_0/attn_entropy_mean": 6.158994674682617, "geo/layer_0/attn_entropy_std": 0.4049171507358551, "geo/layer_7/stable_rank_q_proj": 43.779483795166016, "geo/layer_7/stable_rank_k_proj": 41.29193115234375, "geo/layer_7/stable_rank_o_proj": 92.18763732910156, "geo/layer_7/stable_rank_gate_proj": 83.25569915771484, "geo/layer_7/stable_rank_down_proj": 141.20802307128906, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.45748841762542725, "geo/layer_7/attn_entropy_mean": 4.645557880401611, "geo/layer_7/attn_entropy_std": 0.8019731640815735, "geo/layer_14/stable_rank_q_proj": 52.030609130859375, "geo/layer_14/stable_rank_k_proj": 39.55331039428711, "geo/layer_14/stable_rank_o_proj": 44.175025939941406, "geo/layer_14/stable_rank_gate_proj": 71.91422271728516, "geo/layer_14/stable_rank_down_proj": 129.56878662109375, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39296650886535645, "geo/layer_14/attn_entropy_mean": 5.513788223266602, "geo/layer_14/attn_entropy_std": 0.3855153024196625, "geo/layer_21/stable_rank_q_proj": 40.77604675292969, "geo/layer_21/stable_rank_k_proj": 30.04389190673828, "geo/layer_21/stable_rank_o_proj": 71.35603332519531, "geo/layer_21/stable_rank_gate_proj": 67.30770874023438, "geo/layer_21/stable_rank_down_proj": 52.465858459472656, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1432954967021942, "geo/layer_21/attn_entropy_mean": 5.717479228973389, "geo/layer_21/attn_entropy_std": 0.30840209126472473, "geo/layer_27/stable_rank_q_proj": 43.00756072998047, "geo/layer_27/stable_rank_k_proj": 31.690673828125, "geo/layer_27/stable_rank_o_proj": 115.95148468017578, "geo/layer_27/stable_rank_gate_proj": 81.03241729736328, "geo/layer_27/stable_rank_down_proj": 129.0690155029297, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09160391241312027, "geo/layer_27/attn_entropy_mean": 4.224888801574707, "geo/layer_27/attn_entropy_std": 0.7050548791885376, "attnres/final_alpha/block_0": 0.2394535392522812, "attnres/block_norm/0": 1.757824420928955, "attnres/final_alpha/block_1": 0.004608482588082552, "attnres/block_norm/1": 45704.4765625, "attnres/final_alpha/block_2": 0.010534578934311867, "attnres/block_norm/2": 28237.09765625, "attnres/final_alpha/block_3": 0.012380296364426613, "attnres/block_norm/3": 55909.96875, "attnres/final_alpha/block_4": 0.014407588168978691, "attnres/block_norm/4": 14760.33984375, "attnres/final_alpha/block_5": 0.6069828271865845, "attnres/block_norm/5": 6570.7548828125, "attnres/final_alpha/block_6": 0.11163268983364105, "attnres/block_norm/6": 37009.2109375, "geo/tier1_time_s": 1.354527473449707, "geo/step": 58050.0, "geo/rankme_slope": 5.860957664315728e-06} {"step": 58060, "timestamp": 1778257303.6960716, "train/loss": 2.1523228883743286, "train/z_loss": 0.001375658786855638, "train/perplexity": 8.604823244022061, "train/grad_norm": 0.1826171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1789923.16113855, "perf/iters_per_sec": 0.8535018735592603, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1716435909271241, "data/tokens_consumed": 121762742272, "data/tokens_consumed_B": 121.762742272, "train/loss_slope": -9.70111273088297e-06} {"step": 58070, "timestamp": 1778257314.0619986, "train/loss": 2.1537625074386595, "train/z_loss": 0.0013797402032651007, "train/perplexity": 8.617219832651017, "train/grad_norm": 0.1826171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024070.49577237, "perf/iters_per_sec": 0.9651520232068872, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0361062049865724, "data/tokens_consumed": 121783713792, "data/tokens_consumed_B": 121.783713792, "train/loss_slope": -8.290979270637547e-06} {"step": 58080, "timestamp": 1778257324.410896, "train/loss": 2.1389273405075073, "train/z_loss": 0.0013757205102592706, "train/perplexity": 8.490325514404395, "train/grad_norm": 0.1279296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027954.2145088539, "perf/iters_per_sec": 0.9670039246124524, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341219663619996, "data/tokens_consumed": 121804685312, "data/tokens_consumed_B": 121.804685312, "train/loss_slope": -7.259663203583144e-06} {"step": 58090, "timestamp": 1778257334.761662, "train/loss": 2.1561333417892454, "train/z_loss": 0.0013762085465714336, "train/perplexity": 8.637674070661001, "train/grad_norm": 0.1787109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027131.3362585846, "perf/iters_per_sec": 0.9666115456860469, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345417499542235, "data/tokens_consumed": 121825656832, "data/tokens_consumed_B": 121.825656832, "train/loss_slope": -7.613412281646475e-06} {"step": 58100, "timestamp": 1778257345.1088026, "grad/layer_0/attn": 0.0025111637078225613, "grad/layer_0/mlp": 0.002944219158962369, "grad/layer_0/attn_mlp_ratio": 0.85291328089049, "grad/layer_4/attn": 0.002199576934799552, "grad/layer_4/mlp": 0.002485729055479169, "grad/layer_4/attn_mlp_ratio": 0.8848819791774716, "grad/layer_8/attn": 0.004481122363358736, "grad/layer_8/mlp": 0.0034913092385977507, "grad/layer_8/attn_mlp_ratio": 1.283507684013624, "grad/layer_12/attn": 0.005494142882525921, "grad/layer_12/mlp": 0.006686225533485413, "grad/layer_12/attn_mlp_ratio": 0.8217106606469576, "grad/layer_16/attn": 0.004798030015081167, "grad/layer_16/mlp": 0.004449058324098587, "grad/layer_16/attn_mlp_ratio": 1.0784371787730085, "grad/layer_20/attn": 0.003718828083947301, "grad/layer_20/mlp": 0.00614259485155344, "grad/layer_20/attn_mlp_ratio": 0.6054164588871065, "grad/layer_24/attn": 0.016120225191116333, "grad/layer_24/mlp": 0.013781645335257053, "grad/layer_24/attn_mlp_ratio": 1.1696879931242885, "grad/layer_27/attn": 0.005393528379499912, "grad/layer_27/mlp": 0.01346504781395197, "grad/layer_27/attn_mlp_ratio": 0.4005576819308117} {"step": 58100, "timestamp": 1778257345.123455, "train/loss": 2.17337908744812, "train/z_loss": 0.0013661718927323817, "train/perplexity": 8.787929106837497, "train/grad_norm": 0.208984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024943.2855453154, "perf/iters_per_sec": 0.9655682018019273, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035659623146057, "data/tokens_consumed": 121846628352, "data/tokens_consumed_B": 121.846628352, "train/loss_slope": -6.551340318033804e-06} {"step": 58110, "timestamp": 1778257355.4799027, "train/loss": 2.151647686958313, "train/z_loss": 0.0013772053411230446, "train/perplexity": 8.599015216198094, "train/grad_norm": 0.234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026551.2317741625, "perf/iters_per_sec": 0.9663349303122342, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348378896713257, "data/tokens_consumed": 121867599872, "data/tokens_consumed_B": 121.867599872, "train/loss_slope": -7.3535417101243e-06} {"step": 58120, "timestamp": 1778257365.830677, "train/loss": 2.1143042922019957, "train/z_loss": 0.0013712043524719774, "train/perplexity": 8.283820642312836, "train/grad_norm": 0.236328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027150.1633417278, "perf/iters_per_sec": 0.9666205231388701, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345321416854858, "data/tokens_consumed": 121888571392, "data/tokens_consumed_B": 121.888571392, "train/loss_slope": -7.2902710202432744e-06} {"step": 58125, "timestamp": 1778257371.6003413, "eos/sharpness": 59.02268886566161, "eos/L0_probe": 1.9805333614349365, "eos/L_plus": 2.2668325901031494, "eos/L_minus": 2.28446102142334, "eos/grad_norm": 0.19427385926246643, "eos/embed_grad_frac": 0.05898072198033333, "eos/time_s": 0.5986096858978271} {"step": 58125, "timestamp": 1778257372.981324, "geo/rankme_last": 438.84716796875, "geo/layer_0/stable_rank_q_proj": 19.47071647644043, "geo/layer_0/stable_rank_k_proj": 16.124755859375, "geo/layer_0/stable_rank_o_proj": 47.24691390991211, "geo/layer_0/stable_rank_gate_proj": 131.74627685546875, "geo/layer_0/stable_rank_down_proj": 54.74644088745117, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0626833438873291, "geo/layer_0/attn_entropy_mean": 6.152394771575928, "geo/layer_0/attn_entropy_std": 0.4069484770298004, "geo/layer_7/stable_rank_q_proj": 43.68751525878906, "geo/layer_7/stable_rank_k_proj": 41.333499908447266, "geo/layer_7/stable_rank_o_proj": 92.2543716430664, "geo/layer_7/stable_rank_gate_proj": 83.21451568603516, "geo/layer_7/stable_rank_down_proj": 141.28729248046875, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4709732234477997, "geo/layer_7/attn_entropy_mean": 4.667446136474609, "geo/layer_7/attn_entropy_std": 0.815287172794342, "geo/layer_14/stable_rank_q_proj": 52.08341598510742, "geo/layer_14/stable_rank_k_proj": 39.52033233642578, "geo/layer_14/stable_rank_o_proj": 44.1501350402832, "geo/layer_14/stable_rank_gate_proj": 71.82940673828125, "geo/layer_14/stable_rank_down_proj": 129.47265625, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3982253074645996, "geo/layer_14/attn_entropy_mean": 5.53354549407959, "geo/layer_14/attn_entropy_std": 0.374350368976593, "geo/layer_21/stable_rank_q_proj": 40.793052673339844, "geo/layer_21/stable_rank_k_proj": 30.131120681762695, "geo/layer_21/stable_rank_o_proj": 71.36927032470703, "geo/layer_21/stable_rank_gate_proj": 67.25505828857422, "geo/layer_21/stable_rank_down_proj": 52.45052719116211, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14228187501430511, "geo/layer_21/attn_entropy_mean": 5.699243545532227, "geo/layer_21/attn_entropy_std": 0.29483503103256226, "geo/layer_27/stable_rank_q_proj": 43.10612106323242, "geo/layer_27/stable_rank_k_proj": 31.651369094848633, "geo/layer_27/stable_rank_o_proj": 115.88195037841797, "geo/layer_27/stable_rank_gate_proj": 81.01832580566406, "geo/layer_27/stable_rank_down_proj": 129.1568603515625, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09141316264867783, "geo/layer_27/attn_entropy_mean": 4.218677520751953, "geo/layer_27/attn_entropy_std": 0.7124731540679932, "attnres/final_alpha/block_0": 0.23830586671829224, "attnres/block_norm/0": 1.7579072713851929, "attnres/final_alpha/block_1": 0.004577145911753178, "attnres/block_norm/1": 45667.46875, "attnres/final_alpha/block_2": 0.0103842094540596, "attnres/block_norm/2": 28286.81640625, "attnres/final_alpha/block_3": 0.012193204835057259, "attnres/block_norm/3": 56170.54296875, "attnres/final_alpha/block_4": 0.014076417312026024, "attnres/block_norm/4": 14709.94140625, "attnres/final_alpha/block_5": 0.6103950142860413, "attnres/block_norm/5": 6504.796875, "attnres/final_alpha/block_6": 0.11006814241409302, "attnres/block_norm/6": 37143.78125, "geo/tier1_time_s": 1.3605625629425049, "geo/step": 58125.0, "geo/rankme_slope": -3.6298112995198084e-06} {"step": 58130, "timestamp": 1778257378.1665666, "train/loss": 2.141102743148804, "train/z_loss": 0.001370228361338377, "train/perplexity": 8.508815495238675, "train/grad_norm": 0.1455078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1700763.4003856145, "perf/iters_per_sec": 0.8109871866157601, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2330651044845582, "data/tokens_consumed": 121909542912, "data/tokens_consumed_B": 121.909542912, "train/loss_slope": -5.663553988150345e-06} {"step": 58140, "timestamp": 1778257388.534508, "train/loss": 2.1721668243408203, "train/z_loss": 0.001387665350921452, "train/perplexity": 8.77728227927371, "train/grad_norm": 0.265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023931.1967788634, "perf/iters_per_sec": 0.965085600270683, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0361775159835815, "data/tokens_consumed": 121930514432, "data/tokens_consumed_B": 121.930514432, "train/loss_slope": -5.538204014569608e-06} {"step": 58150, "timestamp": 1778257399.5240138, "grad/layer_0/attn": 0.0026257620193064213, "grad/layer_0/mlp": 0.00284784403629601, "grad/layer_0/attn_mlp_ratio": 0.922017460801617, "grad/layer_4/attn": 0.002220859983935952, "grad/layer_4/mlp": 0.0025955624878406525, "grad/layer_4/attn_mlp_ratio": 0.8556372303792413, "grad/layer_8/attn": 0.003730240510776639, "grad/layer_8/mlp": 0.0036648851819336414, "grad/layer_8/attn_mlp_ratio": 1.017832817077569, "grad/layer_12/attn": 0.003938879352062941, "grad/layer_12/mlp": 0.006676846649497747, "grad/layer_12/attn_mlp_ratio": 0.5899310707347349, "grad/layer_16/attn": 0.0037141505163162947, "grad/layer_16/mlp": 0.004399004392325878, "grad/layer_16/attn_mlp_ratio": 0.8443161453450844, "grad/layer_20/attn": 0.004515840206295252, "grad/layer_20/mlp": 0.006358493119478226, "grad/layer_20/attn_mlp_ratio": 0.7102060268715391, "grad/layer_24/attn": 0.014591525308787823, "grad/layer_24/mlp": 0.012616180814802647, "grad/layer_24/attn_mlp_ratio": 1.1565722945259522, "grad/layer_27/attn": 0.00820982363075018, "grad/layer_27/mlp": 0.012723488733172417, "grad/layer_27/attn_mlp_ratio": 0.6452494074852879} {"step": 58150, "timestamp": 1778257399.542981, "train/loss": 2.1699414014816285, "train/z_loss": 0.0013610474532470107, "train/perplexity": 8.757770833309314, "train/grad_norm": 0.228515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1905993.276181009, "perf/iters_per_sec": 0.9088484173684163, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1002934932708741, "data/tokens_consumed": 121951485952, "data/tokens_consumed_B": 121.951485952, "train/loss_slope": -5.573097535736283e-06} {"step": 58160, "timestamp": 1778257409.910769, "train/loss": 2.1252965688705445, "train/z_loss": 0.001381722860969603, "train/perplexity": 8.37538099712603, "train/grad_norm": 0.171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024720.9511741956, "perf/iters_per_sec": 0.9654621845122316, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0357733488082885, "data/tokens_consumed": 121972457472, "data/tokens_consumed_B": 121.972457472, "train/loss_slope": -1.0497073572103331e-05} {"step": 58170, "timestamp": 1778257420.2653782, "train/loss": 2.2036247491836547, "train/z_loss": 0.0013778107473626732, "train/perplexity": 9.057786270403158, "train/grad_norm": 0.0830078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026370.2768737548, "perf/iters_per_sec": 0.9662486442917608, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349303007125854, "data/tokens_consumed": 121993428992, "data/tokens_consumed_B": 121.993428992, "train/loss_slope": -6.308282076185866e-06} {"step": 58180, "timestamp": 1778257430.6308107, "train/loss": 2.0957528829574583, "train/z_loss": 0.0013829508912749588, "train/perplexity": 8.131560779701992, "train/grad_norm": 0.119140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024184.1008206075, "perf/iters_per_sec": 0.9652061943152463, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0360480546951294, "data/tokens_consumed": 122014400512, "data/tokens_consumed_B": 122.014400512, "train/loss_slope": -1.0406308413052646e-05} {"step": 58190, "timestamp": 1778257440.9857244, "train/loss": 2.1041637897491454, "train/z_loss": 0.001401228120084852, "train/perplexity": 8.2002430145282, "train/grad_norm": 0.11669921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026709.5237792195, "perf/iters_per_sec": 0.9664104098220918, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347570657730103, "data/tokens_consumed": 122035372032, "data/tokens_consumed_B": 122.035372032, "train/loss_slope": -1.323615277167597e-05} {"step": 58200, "timestamp": 1778257451.330535, "grad/layer_0/attn": 0.0026762092020362616, "grad/layer_0/mlp": 0.0027330531738698483, "grad/layer_0/attn_mlp_ratio": 0.9792012572981796, "grad/layer_4/attn": 0.002377444179728627, "grad/layer_4/mlp": 0.002474621869623661, "grad/layer_4/attn_mlp_ratio": 0.9607302484630359, "grad/layer_8/attn": 0.004319981671869755, "grad/layer_8/mlp": 0.003701556473970413, "grad/layer_8/attn_mlp_ratio": 1.1670716320393824, "grad/layer_12/attn": 0.0037675583735108376, "grad/layer_12/mlp": 0.006712769158184528, "grad/layer_12/attn_mlp_ratio": 0.5612524769739777, "grad/layer_16/attn": 0.0031858335714787245, "grad/layer_16/mlp": 0.004253978841006756, "grad/layer_16/attn_mlp_ratio": 0.7489067566293026, "grad/layer_20/attn": 0.0039907051250338554, "grad/layer_20/mlp": 0.005640164948999882, "grad/layer_20/attn_mlp_ratio": 0.7075511248986394, "grad/layer_24/attn": 0.006479340605437756, "grad/layer_24/mlp": 0.007754480931907892, "grad/layer_24/attn_mlp_ratio": 0.8355608297675591, "grad/layer_27/attn": 0.004890080541372299, "grad/layer_27/mlp": 0.0071823508478701115, "grad/layer_27/attn_mlp_ratio": 0.6808467835761252} {"step": 58200, "timestamp": 1778257451.9293084, "eos/sharpness": 50.46088695526122, "eos/L0_probe": 1.9799045324325562, "eos/L_plus": 2.1869733333587646, "eos/L_minus": 2.27744460105896, "eos/grad_norm": 0.11068712174892426, "eos/embed_grad_frac": 0.17559140920639038, "eos/time_s": 0.5959534645080566} {"step": 58200, "timestamp": 1778257451.9473355, "train/loss": 2.189799165725708, "train/z_loss": 0.0013666055165231228, "train/perplexity": 8.933418797843407, "train/grad_norm": 0.11083984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1914121.4058502906, "perf/iters_per_sec": 0.912724211621423, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0956212043762208, "data/tokens_consumed": 122056343552, "data/tokens_consumed_B": 122.056343552, "train/loss_slope": -1.2960944188596124e-05} {"step": 58200, "timestamp": 1778257453.309172, "geo/rankme_last": 438.4163513183594, "geo/layer_0/stable_rank_q_proj": 19.514972686767578, "geo/layer_0/stable_rank_k_proj": 16.164722442626953, "geo/layer_0/stable_rank_o_proj": 47.31515884399414, "geo/layer_0/stable_rank_gate_proj": 131.70140075683594, "geo/layer_0/stable_rank_down_proj": 54.75326156616211, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0641145333647728, "geo/layer_0/attn_entropy_mean": 6.157334327697754, "geo/layer_0/attn_entropy_std": 0.40636375546455383, "geo/layer_7/stable_rank_q_proj": 43.71699905395508, "geo/layer_7/stable_rank_k_proj": 41.256290435791016, "geo/layer_7/stable_rank_o_proj": 92.23692321777344, "geo/layer_7/stable_rank_gate_proj": 83.22093200683594, "geo/layer_7/stable_rank_down_proj": 141.377197265625, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4661419093608856, "geo/layer_7/attn_entropy_mean": 4.690485954284668, "geo/layer_7/attn_entropy_std": 0.8047810196876526, "geo/layer_14/stable_rank_q_proj": 52.002994537353516, "geo/layer_14/stable_rank_k_proj": 39.5529670715332, "geo/layer_14/stable_rank_o_proj": 44.13159942626953, "geo/layer_14/stable_rank_gate_proj": 71.91354370117188, "geo/layer_14/stable_rank_down_proj": 129.60073852539062, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3951995074748993, "geo/layer_14/attn_entropy_mean": 5.511165618896484, "geo/layer_14/attn_entropy_std": 0.39301979541778564, "geo/layer_21/stable_rank_q_proj": 40.76963806152344, "geo/layer_21/stable_rank_k_proj": 30.082172393798828, "geo/layer_21/stable_rank_o_proj": 71.37352752685547, "geo/layer_21/stable_rank_gate_proj": 67.22764587402344, "geo/layer_21/stable_rank_down_proj": 52.382049560546875, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1446497142314911, "geo/layer_21/attn_entropy_mean": 5.7113518714904785, "geo/layer_21/attn_entropy_std": 0.290872722864151, "geo/layer_27/stable_rank_q_proj": 43.09455871582031, "geo/layer_27/stable_rank_k_proj": 31.653926849365234, "geo/layer_27/stable_rank_o_proj": 115.83728790283203, "geo/layer_27/stable_rank_gate_proj": 80.873291015625, "geo/layer_27/stable_rank_down_proj": 129.01243591308594, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09288221597671509, "geo/layer_27/attn_entropy_mean": 4.250723838806152, "geo/layer_27/attn_entropy_std": 0.6991341710090637, "attnres/final_alpha/block_0": 0.2378871589899063, "attnres/block_norm/0": 1.7581372261047363, "attnres/final_alpha/block_1": 0.004581990651786327, "attnres/block_norm/1": 45760.625, "attnres/final_alpha/block_2": 0.01019304245710373, "attnres/block_norm/2": 28161.203125, "attnres/final_alpha/block_3": 0.012097920291125774, "attnres/block_norm/3": 56083.078125, "attnres/final_alpha/block_4": 0.014048044569790363, "attnres/block_norm/4": 14757.599609375, "attnres/final_alpha/block_5": 0.6095030307769775, "attnres/block_norm/5": 6567.607421875, "attnres/final_alpha/block_6": 0.11168882250785828, "attnres/block_norm/6": 37470.8203125, "geo/tier1_time_s": 1.3580479621887207, "geo/step": 58200.0, "geo/rankme_slope": -1.2492672850390155e-05} {"step": 58210, "timestamp": 1778257463.6772745, "train/loss": 2.1503258466720583, "train/z_loss": 0.001380875799804926, "train/perplexity": 8.587656200519405, "train/grad_norm": 0.119140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1788420.543132858, "perf/iters_per_sec": 0.8527853694595613, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1726279973983764, "data/tokens_consumed": 122077315072, "data/tokens_consumed_B": 122.077315072, "train/loss_slope": -1.2501694338478355e-05} {"step": 58220, "timestamp": 1778257474.03961, "train/loss": 2.141099047660828, "train/z_loss": 0.0013838953222148121, "train/perplexity": 8.508784051071423, "train/grad_norm": 0.306640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025509.5497431539, "perf/iters_per_sec": 0.9658382176128167, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0353700876235963, "data/tokens_consumed": 122098286592, "data/tokens_consumed_B": 122.098286592, "train/loss_slope": -1.3486949323785229e-05} {"step": 58230, "timestamp": 1778257484.3927464, "train/loss": 2.1327584981918335, "train/z_loss": 0.0013865989865735173, "train/perplexity": 8.43811125145737, "train/grad_norm": 0.11474609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026561.8771865342, "perf/iters_per_sec": 0.9663400064404174, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348324537277223, "data/tokens_consumed": 122119258112, "data/tokens_consumed_B": 122.119258112, "train/loss_slope": -1.2173989196099803e-05} {"step": 58240, "timestamp": 1778257494.7501504, "train/loss": 2.1697083473205567, "train/z_loss": 0.0013771477038972079, "train/perplexity": 8.755730036192267, "train/grad_norm": 0.1953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025612.7274856367, "perf/iters_per_sec": 0.9658874165943321, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035317349433899, "data/tokens_consumed": 122140229632, "data/tokens_consumed_B": 122.140229632, "train/loss_slope": -9.634333417968568e-06} {"step": 58250, "timestamp": 1778257505.0852094, "grad/layer_0/attn": 0.00263105146586895, "grad/layer_0/mlp": 0.0029403113294392824, "grad/layer_0/attn_mlp_ratio": 0.8948206776758649, "grad/layer_4/attn": 0.0032415224704891443, "grad/layer_4/mlp": 0.0027135019190609455, "grad/layer_4/attn_mlp_ratio": 1.1945900344717375, "grad/layer_8/attn": 0.0033091155346482992, "grad/layer_8/mlp": 0.0038614878430962563, "grad/layer_8/attn_mlp_ratio": 0.856953480992861, "grad/layer_12/attn": 0.005520529113709927, "grad/layer_12/mlp": 0.007123981602489948, "grad/layer_12/attn_mlp_ratio": 0.7749218546954453, "grad/layer_16/attn": 0.003892339300364256, "grad/layer_16/mlp": 0.004785513039678335, "grad/layer_16/attn_mlp_ratio": 0.8133588158167475, "grad/layer_20/attn": 0.004397881682962179, "grad/layer_20/mlp": 0.0059135607443749905, "grad/layer_20/attn_mlp_ratio": 0.7436943321798181, "grad/layer_24/attn": 0.008913278579711914, "grad/layer_24/mlp": 0.009219957515597343, "grad/layer_24/attn_mlp_ratio": 0.9667374787746722, "grad/layer_27/attn": 0.006954484153538942, "grad/layer_27/mlp": 0.007413228042423725, "grad/layer_27/attn_mlp_ratio": 0.9381181881804601} {"step": 58250, "timestamp": 1778257505.0994174, "train/loss": 2.2104717969894407, "train/z_loss": 0.0013739731744863092, "train/perplexity": 9.120018175295302, "train/grad_norm": 0.130859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027741.7828184727, "perf/iters_per_sec": 0.9669026292888988, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342303037643432, "data/tokens_consumed": 122161201152, "data/tokens_consumed_B": 122.161201152, "train/loss_slope": -5.5197796591259065e-06} {"step": 58260, "timestamp": 1778257515.4572885, "train/loss": 2.17768257856369, "train/z_loss": 0.0013675231137312948, "train/perplexity": 8.825829374913662, "train/grad_norm": 0.1748046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025700.5674545397, "perf/iters_per_sec": 0.9659293019554804, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352724552154542, "data/tokens_consumed": 122182172672, "data/tokens_consumed_B": 122.182172672, "train/loss_slope": -6.935523066333297e-06} {"step": 58270, "timestamp": 1778257525.8102949, "train/loss": 2.184261846542358, "train/z_loss": 0.0013731089304201305, "train/perplexity": 8.884088311931347, "train/grad_norm": 0.1572265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026646.2973989965, "perf/iters_per_sec": 0.9663802611346228, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347893476486205, "data/tokens_consumed": 122203144192, "data/tokens_consumed_B": 122.203144192, "train/loss_slope": -5.35562439004904e-06} {"step": 58275, "timestamp": 1778257531.5643232, "eos/sharpness": 34.300565719604485, "eos/L0_probe": 1.9822282791137695, "eos/L_plus": 2.1874423027038574, "eos/L_minus": 2.1200199127197266, "eos/grad_norm": 0.1089041531085968, "eos/embed_grad_frac": 0.19892391562461853, "eos/time_s": 0.5867795944213867} {"step": 58275, "timestamp": 1778257532.9428375, "geo/rankme_last": 438.16387939453125, "geo/layer_0/stable_rank_q_proj": 19.49518394470215, "geo/layer_0/stable_rank_k_proj": 16.137922286987305, "geo/layer_0/stable_rank_o_proj": 47.36069869995117, "geo/layer_0/stable_rank_gate_proj": 131.84210205078125, "geo/layer_0/stable_rank_down_proj": 54.687904357910156, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06157393380999565, "geo/layer_0/attn_entropy_mean": 6.159093379974365, "geo/layer_0/attn_entropy_std": 0.40543651580810547, "geo/layer_7/stable_rank_q_proj": 43.80561828613281, "geo/layer_7/stable_rank_k_proj": 41.27022171020508, "geo/layer_7/stable_rank_o_proj": 92.31159973144531, "geo/layer_7/stable_rank_gate_proj": 83.30490112304688, "geo/layer_7/stable_rank_down_proj": 141.57032775878906, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.46508827805519104, "geo/layer_7/attn_entropy_mean": 4.657676696777344, "geo/layer_7/attn_entropy_std": 0.796374499797821, "geo/layer_14/stable_rank_q_proj": 51.93660354614258, "geo/layer_14/stable_rank_k_proj": 39.454734802246094, "geo/layer_14/stable_rank_o_proj": 44.11850357055664, "geo/layer_14/stable_rank_gate_proj": 72.02304077148438, "geo/layer_14/stable_rank_down_proj": 129.5233612060547, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4067246615886688, "geo/layer_14/attn_entropy_mean": 5.526524543762207, "geo/layer_14/attn_entropy_std": 0.39794686436653137, "geo/layer_21/stable_rank_q_proj": 40.79588317871094, "geo/layer_21/stable_rank_k_proj": 29.99152946472168, "geo/layer_21/stable_rank_o_proj": 71.31149291992188, "geo/layer_21/stable_rank_gate_proj": 67.21639251708984, "geo/layer_21/stable_rank_down_proj": 52.346099853515625, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1451677680015564, "geo/layer_21/attn_entropy_mean": 5.712027549743652, "geo/layer_21/attn_entropy_std": 0.29693716764450073, "geo/layer_27/stable_rank_q_proj": 43.076141357421875, "geo/layer_27/stable_rank_k_proj": 31.588048934936523, "geo/layer_27/stable_rank_o_proj": 115.84259033203125, "geo/layer_27/stable_rank_gate_proj": 80.88227081298828, "geo/layer_27/stable_rank_down_proj": 128.79286193847656, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09096711128950119, "geo/layer_27/attn_entropy_mean": 4.227799415588379, "geo/layer_27/attn_entropy_std": 0.7195982933044434, "attnres/final_alpha/block_0": 0.23524121940135956, "attnres/block_norm/0": 1.758108139038086, "attnres/final_alpha/block_1": 0.004535837564617395, "attnres/block_norm/1": 45594.9296875, "attnres/final_alpha/block_2": 0.010152429342269897, "attnres/block_norm/2": 28309.107421875, "attnres/final_alpha/block_3": 0.01209636777639389, "attnres/block_norm/3": 56175.41796875, "attnres/final_alpha/block_4": 0.013985011726617813, "attnres/block_norm/4": 14715.33203125, "attnres/final_alpha/block_5": 0.6144847869873047, "attnres/block_norm/5": 6575.5927734375, "attnres/final_alpha/block_6": 0.10950435698032379, "attnres/block_norm/6": 37305.30078125, "geo/tier1_time_s": 1.3603477478027344, "geo/step": 58275.0, "geo/rankme_slope": -3.141074789290717e-05} {"step": 58280, "timestamp": 1778257538.1174607, "train/loss": 2.154033875465393, "train/z_loss": 0.0013858288759365678, "train/perplexity": 8.61955858791028, "train/grad_norm": 0.123046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1705018.3225906298, "perf/iters_per_sec": 0.813016091628375, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2299879550933839, "data/tokens_consumed": 122224115712, "data/tokens_consumed_B": 122.224115712, "train/loss_slope": -4.648874809603935e-06} {"step": 58290, "timestamp": 1778257548.4619768, "train/loss": 2.1276728391647337, "train/z_loss": 0.0013875559787265956, "train/perplexity": 8.395306831399852, "train/grad_norm": 0.109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028168.7951241457, "perf/iters_per_sec": 0.9671062446232537, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340125560760498, "data/tokens_consumed": 122245087232, "data/tokens_consumed_B": 122.245087232, "train/loss_slope": -5.657240688496821e-06} {"step": 58300, "timestamp": 1778257558.8038166, "grad/layer_0/attn": 0.0025773767847567797, "grad/layer_0/mlp": 0.0029066926799714565, "grad/layer_0/attn_mlp_ratio": 0.8867042304973453, "grad/layer_4/attn": 0.0033592309337109327, "grad/layer_4/mlp": 0.0024011468049138784, "grad/layer_4/attn_mlp_ratio": 1.3990110004666365, "grad/layer_8/attn": 0.003921577241271734, "grad/layer_8/mlp": 0.003540902165696025, "grad/layer_8/attn_mlp_ratio": 1.1075079025093852, "grad/layer_12/attn": 0.00585064897313714, "grad/layer_12/mlp": 0.006766422186046839, "grad/layer_12/attn_mlp_ratio": 0.8646591545434386, "grad/layer_16/attn": 0.004860504996031523, "grad/layer_16/mlp": 0.004508142825216055, "grad/layer_16/attn_mlp_ratio": 1.0781612465843862, "grad/layer_20/attn": 0.0035841036587953568, "grad/layer_20/mlp": 0.0062988451682031155, "grad/layer_20/attn_mlp_ratio": 0.5690096368755223, "grad/layer_24/attn": 0.015557718463242054, "grad/layer_24/mlp": 0.011183753609657288, "grad/layer_24/attn_mlp_ratio": 1.3910998817693743, "grad/layer_27/attn": 0.009386222809553146, "grad/layer_27/mlp": 0.010676465928554535, "grad/layer_27/attn_mlp_ratio": 0.8791507212638907} {"step": 58300, "timestamp": 1778257558.8180726, "train/loss": 2.082943820953369, "train/z_loss": 0.0013898827484808863, "train/perplexity": 8.028067355357315, "train/grad_norm": 0.171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026474.5694318113, "perf/iters_per_sec": 0.9662983748587662, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348770380020142, "data/tokens_consumed": 122266058752, "data/tokens_consumed_B": 122.266058752, "train/loss_slope": -9.711369765688303e-06} {"step": 58310, "timestamp": 1778257569.161585, "train/loss": 2.1286808967590334, "train/z_loss": 0.0013748274184763431, "train/perplexity": 8.40377405121332, "train/grad_norm": 0.111328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028847.1572090122, "perf/iters_per_sec": 0.9674297128720342, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0336668252944947, "data/tokens_consumed": 122287030272, "data/tokens_consumed_B": 122.287030272, "train/loss_slope": -9.363609328843668e-06} {"step": 58320, "timestamp": 1778257579.5101495, "train/loss": 2.1125410437583922, "train/z_loss": 0.0013753924169577659, "train/perplexity": 8.269227078278726, "train/grad_norm": 0.158203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027583.7969110932, "perf/iters_per_sec": 0.9668272957377878, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343108892440795, "data/tokens_consumed": 122308001792, "data/tokens_consumed_B": 122.308001792, "train/loss_slope": -1.0259768178146113e-05} {"step": 58330, "timestamp": 1778257589.862818, "train/loss": 2.1863091945648194, "train/z_loss": 0.0013706977129913866, "train/perplexity": 8.90229576470445, "train/grad_norm": 0.1240234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027341.6775388129, "perf/iters_per_sec": 0.9667118442243637, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344344139099122, "data/tokens_consumed": 122328973312, "data/tokens_consumed_B": 122.328973312, "train/loss_slope": -8.883926334089264e-06} {"step": 58340, "timestamp": 1778257600.204175, "train/loss": 2.0725746512413026, "train/z_loss": 0.0013912186957895755, "train/perplexity": 7.945253062232245, "train/grad_norm": 0.173828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029287.5564664747, "perf/iters_per_sec": 0.9676397116024373, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033442497253418, "data/tokens_consumed": 122349944832, "data/tokens_consumed_B": 122.349944832, "train/loss_slope": -1.4131644177715819e-05} {"step": 58350, "timestamp": 1778257610.549156, "grad/layer_0/attn": 0.0028885751962661743, "grad/layer_0/mlp": 0.0029296467546373606, "grad/layer_0/attn_mlp_ratio": 0.9859806794439495, "grad/layer_4/attn": 0.0025857393629848957, "grad/layer_4/mlp": 0.0025247957091778517, "grad/layer_4/attn_mlp_ratio": 1.024138012898116, "grad/layer_8/attn": 0.0032339701429009438, "grad/layer_8/mlp": 0.0034739295952022076, "grad/layer_8/attn_mlp_ratio": 0.9309256164185854, "grad/layer_12/attn": 0.004881253466010094, "grad/layer_12/mlp": 0.006385233718901873, "grad/layer_12/attn_mlp_ratio": 0.764459627392246, "grad/layer_16/attn": 0.0045889331959187984, "grad/layer_16/mlp": 0.004642766434699297, "grad/layer_16/attn_mlp_ratio": 0.9884049007465363, "grad/layer_20/attn": 0.003542757360264659, "grad/layer_20/mlp": 0.005882328376173973, "grad/layer_20/attn_mlp_ratio": 0.6022712561215156, "grad/layer_24/attn": 0.010740824975073338, "grad/layer_24/mlp": 0.009155238047242165, "grad/layer_24/attn_mlp_ratio": 1.1731890314954614, "grad/layer_27/attn": 0.0048003909178078175, "grad/layer_27/mlp": 0.008773996494710445, "grad/layer_27/attn_mlp_ratio": 0.5471156577268124} {"step": 58350, "timestamp": 1778257611.1396108, "eos/sharpness": 23.223543167114254, "eos/L0_probe": 1.981831431388855, "eos/L_plus": 2.1103296279907227, "eos/L_minus": 2.08556866645813, "eos/grad_norm": 0.12188457697629929, "eos/embed_grad_frac": 0.17135943472385406, "eos/time_s": 0.5877563953399658} {"step": 58350, "timestamp": 1778257611.159493, "train/loss": 2.1731024026870727, "train/z_loss": 0.0013807641924358904, "train/perplexity": 8.78549795711902, "train/grad_norm": 0.12158203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1915124.5255856058, "perf/iters_per_sec": 0.9132025363853482, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0950473308563233, "data/tokens_consumed": 122370916352, "data/tokens_consumed_B": 122.370916352, "train/loss_slope": -1.1205444656404135e-05} {"step": 58350, "timestamp": 1778257612.523931, "geo/rankme_last": 439.03094482421875, "geo/layer_0/stable_rank_q_proj": 19.46922492980957, "geo/layer_0/stable_rank_k_proj": 16.1272029876709, "geo/layer_0/stable_rank_o_proj": 47.302703857421875, "geo/layer_0/stable_rank_gate_proj": 131.8832244873047, "geo/layer_0/stable_rank_down_proj": 54.658172607421875, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06190475821495056, "geo/layer_0/attn_entropy_mean": 6.163230895996094, "geo/layer_0/attn_entropy_std": 0.4069536030292511, "geo/layer_7/stable_rank_q_proj": 43.809471130371094, "geo/layer_7/stable_rank_k_proj": 41.2130241394043, "geo/layer_7/stable_rank_o_proj": 92.22859191894531, "geo/layer_7/stable_rank_gate_proj": 83.375732421875, "geo/layer_7/stable_rank_down_proj": 141.4458770751953, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4683249592781067, "geo/layer_7/attn_entropy_mean": 4.689221382141113, "geo/layer_7/attn_entropy_std": 0.7847449779510498, "geo/layer_14/stable_rank_q_proj": 51.94295883178711, "geo/layer_14/stable_rank_k_proj": 39.54912185668945, "geo/layer_14/stable_rank_o_proj": 44.089447021484375, "geo/layer_14/stable_rank_gate_proj": 72.14134979248047, "geo/layer_14/stable_rank_down_proj": 129.20901489257812, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3958291709423065, "geo/layer_14/attn_entropy_mean": 5.518138885498047, "geo/layer_14/attn_entropy_std": 0.41142281889915466, "geo/layer_21/stable_rank_q_proj": 40.779754638671875, "geo/layer_21/stable_rank_k_proj": 30.01760482788086, "geo/layer_21/stable_rank_o_proj": 71.24970245361328, "geo/layer_21/stable_rank_gate_proj": 67.21113586425781, "geo/layer_21/stable_rank_down_proj": 52.313575744628906, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1467539668083191, "geo/layer_21/attn_entropy_mean": 5.707712173461914, "geo/layer_21/attn_entropy_std": 0.2976263761520386, "geo/layer_27/stable_rank_q_proj": 43.05535888671875, "geo/layer_27/stable_rank_k_proj": 31.632902145385742, "geo/layer_27/stable_rank_o_proj": 116.02186584472656, "geo/layer_27/stable_rank_gate_proj": 80.94822692871094, "geo/layer_27/stable_rank_down_proj": 129.0531005859375, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09706290066242218, "geo/layer_27/attn_entropy_mean": 4.232329368591309, "geo/layer_27/attn_entropy_std": 0.7150751948356628, "attnres/final_alpha/block_0": 0.23620885610580444, "attnres/block_norm/0": 1.7583813667297363, "attnres/final_alpha/block_1": 0.004622457083314657, "attnres/block_norm/1": 45704.0234375, "attnres/final_alpha/block_2": 0.010227201506495476, "attnres/block_norm/2": 28215.91015625, "attnres/final_alpha/block_3": 0.011981043964624405, "attnres/block_norm/3": 55934.328125, "attnres/final_alpha/block_4": 0.014148960821330547, "attnres/block_norm/4": 14795.6953125, "attnres/final_alpha/block_5": 0.6117080450057983, "attnres/block_norm/5": 6544.7705078125, "attnres/final_alpha/block_6": 0.11110343039035797, "attnres/block_norm/6": 37247.3046875, "geo/tier1_time_s": 1.360264778137207, "geo/step": 58350.0, "geo/rankme_slope": -4.98410106229992e-05} {"step": 58360, "timestamp": 1778257622.8728752, "train/loss": 2.172108602523804, "train/z_loss": 0.0013578400947153568, "train/perplexity": 8.776771264827184, "train/grad_norm": 0.1318359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1791006.6440651186, "perf/iters_per_sec": 0.854018518478927, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.170934796333313, "data/tokens_consumed": 122391887872, "data/tokens_consumed_B": 122.391887872, "train/loss_slope": -8.318003040156933e-06} {"step": 58370, "timestamp": 1778257633.214184, "train/loss": 2.196175527572632, "train/z_loss": 0.0013687910279259086, "train/perplexity": 8.990563502670769, "train/grad_norm": 0.1630859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028976.1820202132, "perf/iters_per_sec": 0.967491236696345, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0336010932922364, "data/tokens_consumed": 122412859392, "data/tokens_consumed_B": 122.412859392, "train/loss_slope": -6.3284912160878584e-06} {"step": 58380, "timestamp": 1778257643.562762, "train/loss": 2.1759978532791138, "train/z_loss": 0.0013787370407953858, "train/perplexity": 8.810972795150123, "train/grad_norm": 0.259765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027423.7327509462, "perf/iters_per_sec": 0.9667509711985331, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343925476074218, "data/tokens_consumed": 122433830912, "data/tokens_consumed_B": 122.433830912, "train/loss_slope": -4.323224330356159e-06} {"step": 58390, "timestamp": 1778257653.906553, "train/loss": 2.1761039972305296, "train/z_loss": 0.0013683423516340555, "train/perplexity": 8.811908076254756, "train/grad_norm": 0.099609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028895.358246979, "perf/iters_per_sec": 0.9674526969180007, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0336422681808473, "data/tokens_consumed": 122454802432, "data/tokens_consumed_B": 122.454802432, "train/loss_slope": -1.4955870925647057e-06} {"step": 58400, "timestamp": 1778257664.2394753, "grad/layer_0/attn": 0.0025229325983673334, "grad/layer_0/mlp": 0.0027684587985277176, "grad/layer_0/attn_mlp_ratio": 0.9113130051195788, "grad/layer_4/attn": 0.002887209178879857, "grad/layer_4/mlp": 0.002539591630920768, "grad/layer_4/attn_mlp_ratio": 1.1368792643820176, "grad/layer_8/attn": 0.0036052109207957983, "grad/layer_8/mlp": 0.0037578949704766273, "grad/layer_8/attn_mlp_ratio": 0.9593697676977809, "grad/layer_12/attn": 0.00422914233058691, "grad/layer_12/mlp": 0.006690867245197296, "grad/layer_12/attn_mlp_ratio": 0.6320768463034301, "grad/layer_16/attn": 0.004545389674603939, "grad/layer_16/mlp": 0.004509332589805126, "grad/layer_16/attn_mlp_ratio": 1.0079960799699548, "grad/layer_20/attn": 0.00305737042799592, "grad/layer_20/mlp": 0.005721950437873602, "grad/layer_20/attn_mlp_ratio": 0.5343231137282959, "grad/layer_24/attn": 0.0106083694845438, "grad/layer_24/mlp": 0.01077897660434246, "grad/layer_24/attn_mlp_ratio": 0.9841722248337424, "grad/layer_27/attn": 0.005998605862259865, "grad/layer_27/mlp": 0.0109073081985116, "grad/layer_27/attn_mlp_ratio": 0.5499620711260567} {"step": 58400, "timestamp": 1778257664.253601, "train/loss": 2.1374796628952026, "train/z_loss": 0.0013894640374928712, "train/perplexity": 8.478043152831034, "train/grad_norm": 0.181640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028337.9104678608, "perf/iters_per_sec": 0.9671868851031593, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0339263439178468, "data/tokens_consumed": 122475773952, "data/tokens_consumed_B": 122.475773952, "train/loss_slope": -2.7436382663954792e-06} {"step": 58410, "timestamp": 1778257674.5986521, "train/loss": 2.119240927696228, "train/z_loss": 0.0013949059648439288, "train/perplexity": 8.324815951516916, "train/grad_norm": 0.2734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028211.351916312, "perf/iters_per_sec": 0.9671265372830925, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0339908599853516, "data/tokens_consumed": 122496745472, "data/tokens_consumed_B": 122.496745472, "train/loss_slope": -2.9000231570894435e-06} {"step": 58420, "timestamp": 1778257684.951258, "train/loss": 2.149072289466858, "train/z_loss": 0.0013729206286370755, "train/perplexity": 8.576897826740758, "train/grad_norm": 0.1728515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027141.2402885037, "perf/iters_per_sec": 0.9666162682955283, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345366954803468, "data/tokens_consumed": 122517716992, "data/tokens_consumed_B": 122.517716992, "train/loss_slope": -1.9587583310104303e-06} {"step": 58425, "timestamp": 1778257690.7149096, "eos/sharpness": 64.68207836151122, "eos/L0_probe": 1.980464220046997, "eos/L_plus": 2.272771120071411, "eos/L_minus": 2.3349781036376953, "eos/grad_norm": 0.17572021484375, "eos/embed_grad_frac": 0.06490329653024673, "eos/time_s": 0.5971405506134033} {"step": 58425, "timestamp": 1778257692.0915062, "geo/rankme_last": 438.49664306640625, "geo/layer_0/stable_rank_q_proj": 19.43831443786621, "geo/layer_0/stable_rank_k_proj": 16.13266944885254, "geo/layer_0/stable_rank_o_proj": 47.24198913574219, "geo/layer_0/stable_rank_gate_proj": 132.0842742919922, "geo/layer_0/stable_rank_down_proj": 54.71929168701172, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06283829361200333, "geo/layer_0/attn_entropy_mean": 6.1602582931518555, "geo/layer_0/attn_entropy_std": 0.40425723791122437, "geo/layer_7/stable_rank_q_proj": 43.72100830078125, "geo/layer_7/stable_rank_k_proj": 41.19310760498047, "geo/layer_7/stable_rank_o_proj": 92.19280242919922, "geo/layer_7/stable_rank_gate_proj": 83.26899719238281, "geo/layer_7/stable_rank_down_proj": 141.1672821044922, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4639851450920105, "geo/layer_7/attn_entropy_mean": 4.6574578285217285, "geo/layer_7/attn_entropy_std": 0.7843572497367859, "geo/layer_14/stable_rank_q_proj": 51.96993637084961, "geo/layer_14/stable_rank_k_proj": 39.49736404418945, "geo/layer_14/stable_rank_o_proj": 44.11552047729492, "geo/layer_14/stable_rank_gate_proj": 72.24552917480469, "geo/layer_14/stable_rank_down_proj": 129.16891479492188, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.40276166796684265, "geo/layer_14/attn_entropy_mean": 5.508457183837891, "geo/layer_14/attn_entropy_std": 0.41195666790008545, "geo/layer_21/stable_rank_q_proj": 40.7027702331543, "geo/layer_21/stable_rank_k_proj": 30.010488510131836, "geo/layer_21/stable_rank_o_proj": 71.07499694824219, "geo/layer_21/stable_rank_gate_proj": 67.21501159667969, "geo/layer_21/stable_rank_down_proj": 52.289852142333984, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14678619801998138, "geo/layer_21/attn_entropy_mean": 5.711978912353516, "geo/layer_21/attn_entropy_std": 0.3031250536441803, "geo/layer_27/stable_rank_q_proj": 43.024776458740234, "geo/layer_27/stable_rank_k_proj": 31.678760528564453, "geo/layer_27/stable_rank_o_proj": 115.95958709716797, "geo/layer_27/stable_rank_gate_proj": 80.92049407958984, "geo/layer_27/stable_rank_down_proj": 128.84471130371094, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09428740292787552, "geo/layer_27/attn_entropy_mean": 4.227116107940674, "geo/layer_27/attn_entropy_std": 0.7287711501121521, "attnres/final_alpha/block_0": 0.23878586292266846, "attnres/block_norm/0": 1.7586338520050049, "attnres/final_alpha/block_1": 0.0047130286693573, "attnres/block_norm/1": 45778.86328125, "attnres/final_alpha/block_2": 0.010368089191615582, "attnres/block_norm/2": 28241.0234375, "attnres/final_alpha/block_3": 0.012345784343779087, "attnres/block_norm/3": 56024.9296875, "attnres/final_alpha/block_4": 0.014512516558170319, "attnres/block_norm/4": 14765.361328125, "attnres/final_alpha/block_5": 0.6059370040893555, "attnres/block_norm/5": 6584.080078125, "attnres/final_alpha/block_6": 0.1133376955986023, "attnres/block_norm/6": 37145.4765625, "geo/tier1_time_s": 1.358445405960083, "geo/step": 58425.0, "geo/rankme_slope": -6.550293945703282e-05} {"step": 58430, "timestamp": 1778257697.2677412, "train/loss": 2.1029821038246155, "train/z_loss": 0.001391120976768434, "train/perplexity": 8.190558625860021, "train/grad_norm": 0.08984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1703940.4916134025, "perf/iters_per_sec": 0.8125021417681706, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.230765986442566, "data/tokens_consumed": 122538688512, "data/tokens_consumed_B": 122.538688512, "train/loss_slope": -8.254614242590427e-07} {"step": 58440, "timestamp": 1778257707.6172967, "train/loss": 2.1181238412857057, "train/z_loss": 0.0013990194303914905, "train/perplexity": 8.31552160500799, "train/grad_norm": 0.25, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027809.2848956387, "perf/iters_per_sec": 0.9669348167875474, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034195876121521, "data/tokens_consumed": 122559660032, "data/tokens_consumed_B": 122.559660032, "train/loss_slope": 1.3961266870438644e-06} {"step": 58450, "timestamp": 1778257717.9487288, "grad/layer_0/attn": 0.002952328883111477, "grad/layer_0/mlp": 0.0029978540260344744, "grad/layer_0/attn_mlp_ratio": 0.9848140566521768, "grad/layer_4/attn": 0.0021088900975883007, "grad/layer_4/mlp": 0.002613950753584504, "grad/layer_4/attn_mlp_ratio": 0.8067826121123834, "grad/layer_8/attn": 0.005507881753146648, "grad/layer_8/mlp": 0.003612740896642208, "grad/layer_8/attn_mlp_ratio": 1.5245714426430907, "grad/layer_12/attn": 0.0051416358910501, "grad/layer_12/mlp": 0.006189935375005007, "grad/layer_12/attn_mlp_ratio": 0.8306445053929972, "grad/layer_16/attn": 0.006893588230013847, "grad/layer_16/mlp": 0.004488218110054731, "grad/layer_16/attn_mlp_ratio": 1.535929829474531, "grad/layer_20/attn": 0.003724446287378669, "grad/layer_20/mlp": 0.0059071858413517475, "grad/layer_20/attn_mlp_ratio": 0.6304941683495408, "grad/layer_24/attn": 0.011935449205338955, "grad/layer_24/mlp": 0.010280969552695751, "grad/layer_24/attn_mlp_ratio": 1.1609264114702826, "grad/layer_27/attn": 0.004484088160097599, "grad/layer_27/mlp": 0.009178433567285538, "grad/layer_27/attn_mlp_ratio": 0.4885461204649899} {"step": 58450, "timestamp": 1778257717.9630618, "train/loss": 2.1576142072677613, "train/z_loss": 0.0013748824945650994, "train/perplexity": 8.650474779731708, "train/grad_norm": 0.158203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028483.8044618561, "perf/iters_per_sec": 0.967256452780655, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033851981163025, "data/tokens_consumed": 122580631552, "data/tokens_consumed_B": 122.580631552, "train/loss_slope": 1.2908055193603848e-06} {"step": 58460, "timestamp": 1778257728.3078597, "train/loss": 2.1736672282218934, "train/z_loss": 0.001374962623231113, "train/perplexity": 8.790461632374619, "train/grad_norm": 0.1201171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028245.1647946758, "perf/iters_per_sec": 0.9671426605199221, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0339736223220826, "data/tokens_consumed": 122601603072, "data/tokens_consumed_B": 122.601603072, "train/loss_slope": 6.051130334858156e-06} {"step": 58470, "timestamp": 1778257738.6604614, "train/loss": 2.188402700424194, "train/z_loss": 0.0013658192357979715, "train/perplexity": 8.92095229501357, "train/grad_norm": 0.1181640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027142.9688319464, "perf/iters_per_sec": 0.9666170925292713, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034535813331604, "data/tokens_consumed": 122622574592, "data/tokens_consumed_B": 122.622574592, "train/loss_slope": 1.0891933853190621e-05} {"step": 58480, "timestamp": 1778257749.0011764, "train/loss": 2.1312820672988892, "train/z_loss": 0.0013808505493216217, "train/perplexity": 8.425662155703826, "train/grad_norm": 0.203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029608.1116939033, "perf/iters_per_sec": 0.9677925642461316, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0332792758941651, "data/tokens_consumed": 122643546112, "data/tokens_consumed_B": 122.643546112, "train/loss_slope": 7.977277589495268e-06} {"step": 58490, "timestamp": 1778257759.338357, "train/loss": 2.0803687810897826, "train/z_loss": 0.0013877142569981515, "train/perplexity": 8.007421355434467, "train/grad_norm": 0.1611328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029678.9696557666, "perf/iters_per_sec": 0.9678263519553025, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0332432031631469, "data/tokens_consumed": 122664517632, "data/tokens_consumed_B": 122.664517632, "train/loss_slope": 3.883999888331129e-06} {"step": 58500, "timestamp": 1778257769.671467, "grad/layer_0/attn": 0.0024320913944393396, "grad/layer_0/mlp": 0.0027480735443532467, "grad/layer_0/attn_mlp_ratio": 0.8850168187584097, "grad/layer_4/attn": 0.0038682431913912296, "grad/layer_4/mlp": 0.0025074996519833803, "grad/layer_4/attn_mlp_ratio": 1.5426694213355472, "grad/layer_8/attn": 0.00364226964302361, "grad/layer_8/mlp": 0.0037485614884644747, "grad/layer_8/attn_mlp_ratio": 0.9716446047550722, "grad/layer_12/attn": 0.004732152912765741, "grad/layer_12/mlp": 0.006052091252058744, "grad/layer_12/attn_mlp_ratio": 0.7819037482234966, "grad/layer_16/attn": 0.0037760548293590546, "grad/layer_16/mlp": 0.004518229980021715, "grad/layer_16/attn_mlp_ratio": 0.8357376146149925, "grad/layer_20/attn": 0.0033587198704481125, "grad/layer_20/mlp": 0.006269214209169149, "grad/layer_20/attn_mlp_ratio": 0.535748134425036, "grad/layer_24/attn": 0.010593707673251629, "grad/layer_24/mlp": 0.009430627338588238, "grad/layer_24/attn_mlp_ratio": 1.1233301010179133, "grad/layer_27/attn": 0.007206200622022152, "grad/layer_27/mlp": 0.009920816868543625, "grad/layer_27/attn_mlp_ratio": 0.726371693467501} {"step": 58500, "timestamp": 1778257770.2712948, "eos/sharpness": 74.98748302459715, "eos/L0_probe": 1.9800763130187988, "eos/L_plus": 2.292402505874634, "eos/L_minus": 2.4176249504089355, "eos/grad_norm": 0.17138411104679108, "eos/embed_grad_frac": 0.07307986915111542, "eos/time_s": 0.5970039367675781} {"step": 58500, "timestamp": 1778257770.2906773, "train/loss": 2.143412148952484, "train/z_loss": 0.0013706681318581104, "train/perplexity": 8.528488510870465, "train/grad_norm": 0.171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1916084.6201217254, "perf/iters_per_sec": 0.9136603451355578, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.094498634338379, "data/tokens_consumed": 122685489152, "data/tokens_consumed_B": 122.685489152, "train/loss_slope": 2.056102040219603e-06} {"step": 58500, "timestamp": 1778257771.6556888, "geo/rankme_last": 438.711181640625, "geo/layer_0/stable_rank_q_proj": 19.41189956665039, "geo/layer_0/stable_rank_k_proj": 16.136802673339844, "geo/layer_0/stable_rank_o_proj": 47.235595703125, "geo/layer_0/stable_rank_gate_proj": 131.84506225585938, "geo/layer_0/stable_rank_down_proj": 54.709434509277344, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06537384539842606, "geo/layer_0/attn_entropy_mean": 6.1609787940979, "geo/layer_0/attn_entropy_std": 0.40454334020614624, "geo/layer_7/stable_rank_q_proj": 43.7336311340332, "geo/layer_7/stable_rank_k_proj": 41.221866607666016, "geo/layer_7/stable_rank_o_proj": 92.20858001708984, "geo/layer_7/stable_rank_gate_proj": 83.30790710449219, "geo/layer_7/stable_rank_down_proj": 141.4567108154297, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4608117341995239, "geo/layer_7/attn_entropy_mean": 4.63873291015625, "geo/layer_7/attn_entropy_std": 0.8152216076850891, "geo/layer_14/stable_rank_q_proj": 51.912532806396484, "geo/layer_14/stable_rank_k_proj": 39.63861083984375, "geo/layer_14/stable_rank_o_proj": 44.04104232788086, "geo/layer_14/stable_rank_gate_proj": 72.15184020996094, "geo/layer_14/stable_rank_down_proj": 129.57777404785156, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4114447832107544, "geo/layer_14/attn_entropy_mean": 5.5480875968933105, "geo/layer_14/attn_entropy_std": 0.3954883813858032, "geo/layer_21/stable_rank_q_proj": 40.712467193603516, "geo/layer_21/stable_rank_k_proj": 29.963653564453125, "geo/layer_21/stable_rank_o_proj": 70.95487213134766, "geo/layer_21/stable_rank_gate_proj": 67.1870346069336, "geo/layer_21/stable_rank_down_proj": 52.32204818725586, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14360114932060242, "geo/layer_21/attn_entropy_mean": 5.718905448913574, "geo/layer_21/attn_entropy_std": 0.3018816113471985, "geo/layer_27/stable_rank_q_proj": 43.081233978271484, "geo/layer_27/stable_rank_k_proj": 31.6962890625, "geo/layer_27/stable_rank_o_proj": 116.17061614990234, "geo/layer_27/stable_rank_gate_proj": 80.81604766845703, "geo/layer_27/stable_rank_down_proj": 128.95567321777344, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09175241738557816, "geo/layer_27/attn_entropy_mean": 4.210102558135986, "geo/layer_27/attn_entropy_std": 0.7121191024780273, "attnres/final_alpha/block_0": 0.23687593638896942, "attnres/block_norm/0": 1.7586867809295654, "attnres/final_alpha/block_1": 0.00469569768756628, "attnres/block_norm/1": 45533.984375, "attnres/final_alpha/block_2": 0.010376815684139729, "attnres/block_norm/2": 28312.81640625, "attnres/final_alpha/block_3": 0.012277185916900635, "attnres/block_norm/3": 55766.72265625, "attnres/final_alpha/block_4": 0.014416271820664406, "attnres/block_norm/4": 14817.00390625, "attnres/final_alpha/block_5": 0.6091975569725037, "attnres/block_norm/5": 6567.296875, "attnres/final_alpha/block_6": 0.1121605783700943, "attnres/block_norm/6": 37242.3515625, "geo/tier1_time_s": 1.3603055477142334, "geo/step": 58500.0, "geo/rankme_slope": -7.010493650585234e-05} {"step": 58500, "timestamp": 1778257778.8655295, "geo/ww_alpha_mean": 7.504853416422903, "geo/ww_alpha_std": 4.151726299439139, "geo/ww_alpha_min": 1.3478754623697442, "geo/ww_alpha_max": 26.90411990916787, "geo/ww_alpha_healthy_frac": 0.17766497461928935, "geo/ww_alpha_by_type/q_proj": 4.024045503530528, "geo/ww_alpha_by_type/k_proj": 4.456236768013823, "geo/ww_alpha_by_type/v_proj": 9.729465433452377, "geo/ww_alpha_by_type/o_proj": 7.268123071375277, "geo/ww_alpha_by_type/gate_proj": 7.9823117409584, "geo/ww_alpha_by_type/up_proj": 11.072237048385015, "geo/ww_alpha_by_type/down_proj": 8.093613624745544, "geo/twonn_id/layer_0": 0.7294926047325134, "geo/twonn_id/layer_7": 3.1852030754089355, "geo/twonn_id/layer_14": 4.621210098266602, "geo/twonn_id/layer_21": 6.441099166870117, "geo/twonn_id/layer_27": 5.169350624084473, "geo/tier2_time_s": 7.201911449432373} {"step": 58500, "timestamp": 1778257779.6550019, "eoc/jacobian_sigma/layer_0/attn": 1213.7379150390625, "eoc/jacobian_sigma/layer_0/mlp": 10460.2294921875, "eoc/jacobian_sigma/layer_0": 10460.2294921875, "eoc/jacobian_sigma/layer_7/attn": 1.1454874277114868, "eoc/jacobian_sigma/layer_7/mlp": 1.7909091711044312, "eoc/jacobian_sigma/layer_7": 1.7909091711044312, "eoc/jacobian_sigma/layer_14/attn": 1.4995204210281372, "eoc/jacobian_sigma/layer_14/mlp": 8.202264785766602, "eoc/jacobian_sigma/layer_14": 8.202264785766602, "eoc/jacobian_sigma/layer_21/attn": 1.0668812990188599, "eoc/jacobian_sigma/layer_21/mlp": 4.293036460876465, "eoc/jacobian_sigma/layer_21": 4.293036460876465, "eoc/jacobian_sigma/layer_27/attn": 3.131967067718506, "eoc/jacobian_sigma/layer_27/mlp": 29.6470947265625, "eoc/jacobian_sigma/layer_27": 29.6470947265625, "eoc/layer0_sigma": 10460.2294921875, "eoc/sigma_max": 29.6470947265625, "eoc/sigma_min": 1.7909091711044312, "eoc/sigma_mean": 10.9833262860775, "eoc/time_s": 0.7817196846008301} {"step": 58510, "timestamp": 1778257790.0261986, "train/loss": 2.133554685115814, "train/z_loss": 0.0013691888307221234, "train/perplexity": 8.444832240525637, "train/grad_norm": 0.12255859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1062867.5549453276, "perf/iters_per_sec": 0.5068147444464338, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.9731075525283814, "data/tokens_consumed": 122706460672, "data/tokens_consumed_B": 122.706460672, "train/loss_slope": 6.956822372166651e-09} {"step": 58520, "timestamp": 1778257800.3770254, "train/loss": 2.179706859588623, "train/z_loss": 0.0013801459572277963, "train/perplexity": 8.843713428915198, "train/grad_norm": 0.11181640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027688.9623051859, "perf/iters_per_sec": 0.9668774425054483, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342572450637817, "data/tokens_consumed": 122727432192, "data/tokens_consumed_B": 122.727432192, "train/loss_slope": 6.106214032600918e-06} {"step": 58530, "timestamp": 1778257810.7243178, "train/loss": 2.1546503782272337, "train/z_loss": 0.001378329924773425, "train/perplexity": 8.624874207964435, "train/grad_norm": 0.181640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028126.3804061268, "perf/iters_per_sec": 0.9670860197096476, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340341806411744, "data/tokens_consumed": 122748403712, "data/tokens_consumed_B": 122.748403712, "train/loss_slope": 7.266638660230598e-06} {"step": 58540, "timestamp": 1778257821.0643425, "train/loss": 2.131556451320648, "train/z_loss": 0.0013887849054299295, "train/perplexity": 8.427974339970794, "train/grad_norm": 0.1494140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029212.8403539448, "perf/iters_per_sec": 0.9676040841836666, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0334805488586425, "data/tokens_consumed": 122769375232, "data/tokens_consumed_B": 122.769375232, "train/loss_slope": 5.757415970631459e-06} {"step": 58550, "timestamp": 1778257831.40812, "grad/layer_0/attn": 0.003653895342722535, "grad/layer_0/mlp": 0.003351357765495777, "grad/layer_0/attn_mlp_ratio": 1.0902731040279408, "grad/layer_4/attn": 0.0030954533722251654, "grad/layer_4/mlp": 0.0026773943100124598, "grad/layer_4/attn_mlp_ratio": 1.1561439587119926, "grad/layer_8/attn": 0.004171926528215408, "grad/layer_8/mlp": 0.0036110649816691875, "grad/layer_8/attn_mlp_ratio": 1.1553174572768892, "grad/layer_12/attn": 0.0051428647711873055, "grad/layer_12/mlp": 0.006689072120934725, "grad/layer_12/attn_mlp_ratio": 0.7688457533903927, "grad/layer_16/attn": 0.005934759974479675, "grad/layer_16/mlp": 0.004628654569387436, "grad/layer_16/attn_mlp_ratio": 1.2821781701993102, "grad/layer_20/attn": 0.003550884546712041, "grad/layer_20/mlp": 0.005608217790722847, "grad/layer_20/attn_mlp_ratio": 0.6331573800985763, "grad/layer_24/attn": 0.013339188881218433, "grad/layer_24/mlp": 0.009293623268604279, "grad/layer_24/attn_mlp_ratio": 1.435305515637839, "grad/layer_27/attn": 0.004033376462757587, "grad/layer_27/mlp": 0.009015959687530994, "grad/layer_27/attn_mlp_ratio": 0.4473596331181199} {"step": 58550, "timestamp": 1778257831.4228666, "train/loss": 2.152986741065979, "train/z_loss": 0.0013919532182626426, "train/perplexity": 8.610537475587929, "train/grad_norm": 0.15625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026161.443925596, "perf/iters_per_sec": 0.9661490649822216, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350369691848755, "data/tokens_consumed": 122790346752, "data/tokens_consumed_B": 122.790346752, "train/loss_slope": 6.605091747349164e-06} {"step": 58560, "timestamp": 1778257841.7675273, "train/loss": 2.1837610721588137, "train/z_loss": 0.0013821768923662602, "train/perplexity": 8.87964050185121, "train/grad_norm": 0.08984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028704.392621646, "perf/iters_per_sec": 0.9673616374119024, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0337395668029785, "data/tokens_consumed": 122811318272, "data/tokens_consumed_B": 122.811318272, "train/loss_slope": 5.855657901987105e-06} {"step": 58570, "timestamp": 1778257852.1136363, "train/loss": 2.1256256461143495, "train/z_loss": 0.0014002930256538093, "train/perplexity": 8.378137597962807, "train/grad_norm": 0.23046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027979.7897231297, "perf/iters_per_sec": 0.9670161198249482, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341089248657227, "data/tokens_consumed": 122832289792, "data/tokens_consumed_B": 122.832289792, "train/loss_slope": 1.9639499617381352e-06} {"step": 58575, "timestamp": 1778257857.903212, "eos/sharpness": 5.50856590270996, "eos/L0_probe": 1.979636788368225, "eos/L_plus": 2.0043323040008545, "eos/L_minus": 2.0100269317626953, "eos/grad_norm": 0.08820149302482605, "eos/embed_grad_frac": 0.26651450991630554, "eos/time_s": 0.6227624416351318} {"step": 58575, "timestamp": 1778257859.286572, "geo/rankme_last": 438.38037109375, "geo/layer_0/stable_rank_q_proj": 19.45585823059082, "geo/layer_0/stable_rank_k_proj": 16.1591739654541, "geo/layer_0/stable_rank_o_proj": 47.360843658447266, "geo/layer_0/stable_rank_gate_proj": 131.4932403564453, "geo/layer_0/stable_rank_down_proj": 54.792930603027344, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.062098097056150436, "geo/layer_0/attn_entropy_mean": 6.162163257598877, "geo/layer_0/attn_entropy_std": 0.4057753384113312, "geo/layer_7/stable_rank_q_proj": 43.694976806640625, "geo/layer_7/stable_rank_k_proj": 41.252410888671875, "geo/layer_7/stable_rank_o_proj": 92.14476013183594, "geo/layer_7/stable_rank_gate_proj": 83.12053680419922, "geo/layer_7/stable_rank_down_proj": 141.5950164794922, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.48343655467033386, "geo/layer_7/attn_entropy_mean": 4.661202430725098, "geo/layer_7/attn_entropy_std": 0.7975635528564453, "geo/layer_14/stable_rank_q_proj": 51.967010498046875, "geo/layer_14/stable_rank_k_proj": 39.70847702026367, "geo/layer_14/stable_rank_o_proj": 44.10916519165039, "geo/layer_14/stable_rank_gate_proj": 72.19827270507812, "geo/layer_14/stable_rank_down_proj": 129.72276306152344, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3911058008670807, "geo/layer_14/attn_entropy_mean": 5.513818264007568, "geo/layer_14/attn_entropy_std": 0.40278586745262146, "geo/layer_21/stable_rank_q_proj": 40.6619987487793, "geo/layer_21/stable_rank_k_proj": 29.903583526611328, "geo/layer_21/stable_rank_o_proj": 70.8288345336914, "geo/layer_21/stable_rank_gate_proj": 67.13935852050781, "geo/layer_21/stable_rank_down_proj": 52.303497314453125, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14392417669296265, "geo/layer_21/attn_entropy_mean": 5.714605331420898, "geo/layer_21/attn_entropy_std": 0.29796066880226135, "geo/layer_27/stable_rank_q_proj": 43.130977630615234, "geo/layer_27/stable_rank_k_proj": 31.69812774658203, "geo/layer_27/stable_rank_o_proj": 116.25382232666016, "geo/layer_27/stable_rank_gate_proj": 80.85173034667969, "geo/layer_27/stable_rank_down_proj": 128.8984832763672, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10317470133304596, "geo/layer_27/attn_entropy_mean": 4.2011260986328125, "geo/layer_27/attn_entropy_std": 0.7207280993461609, "attnres/final_alpha/block_0": 0.2365700751543045, "attnres/block_norm/0": 1.7585753202438354, "attnres/final_alpha/block_1": 0.00460306741297245, "attnres/block_norm/1": 45674.04296875, "attnres/final_alpha/block_2": 0.010093056596815586, "attnres/block_norm/2": 28453.697265625, "attnres/final_alpha/block_3": 0.012057343497872353, "attnres/block_norm/3": 56347.3671875, "attnres/final_alpha/block_4": 0.013974667526781559, "attnres/block_norm/4": 14751.41796875, "attnres/final_alpha/block_5": 0.6115694046020508, "attnres/block_norm/5": 6523.4140625, "attnres/final_alpha/block_6": 0.11113241314888, "attnres/block_norm/6": 37308.015625, "geo/tier1_time_s": 1.3647518157958984, "geo/step": 58575.0, "geo/rankme_slope": -9.970253726490596e-05} {"step": 58580, "timestamp": 1778257865.0076613, "train/loss": 2.190344715118408, "train/z_loss": 0.0013714301283471286, "train/perplexity": 8.938293748685632, "train/grad_norm": 0.07568359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1627406.309623616, "perf/iters_per_sec": 0.77600779992276, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2886468410491942, "data/tokens_consumed": 122853261312, "data/tokens_consumed_B": 122.853261312, "train/loss_slope": 1.9050827478454206e-06} {"step": 58590, "timestamp": 1778257875.3538666, "train/loss": 2.1706591606140138, "train/z_loss": 0.0013791016186587512, "train/perplexity": 8.764059059750274, "train/grad_norm": 0.115234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028317.7984821629, "perf/iters_per_sec": 0.9671772949610533, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0339365959167481, "data/tokens_consumed": 122874232832, "data/tokens_consumed_B": 122.874232832, "train/loss_slope": 8.265916341924257e-07} {"step": 58600, "timestamp": 1778257885.6947649, "grad/layer_0/attn": 0.0027305367402732372, "grad/layer_0/mlp": 0.002853945130482316, "grad/layer_0/attn_mlp_ratio": 0.9567586340161746, "grad/layer_4/attn": 0.002133392496034503, "grad/layer_4/mlp": 0.0025754738599061966, "grad/layer_4/attn_mlp_ratio": 0.8283494724645576, "grad/layer_8/attn": 0.004395801108330488, "grad/layer_8/mlp": 0.0037075227592140436, "grad/layer_8/attn_mlp_ratio": 1.1856436966817117, "grad/layer_12/attn": 0.01165201049298048, "grad/layer_12/mlp": 0.007366960868239403, "grad/layer_12/attn_mlp_ratio": 1.581657693479696, "grad/layer_16/attn": 0.003277527168393135, "grad/layer_16/mlp": 0.0044563268311321735, "grad/layer_16/attn_mlp_ratio": 0.7354772706410136, "grad/layer_20/attn": 0.005256029777228832, "grad/layer_20/mlp": 0.005937454756349325, "grad/layer_20/attn_mlp_ratio": 0.8852327982937338, "grad/layer_24/attn": 0.007406856864690781, "grad/layer_24/mlp": 0.008874560706317425, "grad/layer_24/attn_mlp_ratio": 0.834616723727686, "grad/layer_27/attn": 0.005605580750852823, "grad/layer_27/mlp": 0.008372029289603233, "grad/layer_27/attn_mlp_ratio": 0.669560567693908} {"step": 58600, "timestamp": 1778257885.7095733, "train/loss": 2.1422107100486754, "train/z_loss": 0.0013757269247435034, "train/perplexity": 8.518248205765074, "train/grad_norm": 0.09912109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026066.1437742233, "perf/iters_per_sec": 0.9661036223288647, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350856542587281, "data/tokens_consumed": 122895204352, "data/tokens_consumed_B": 122.895204352, "train/loss_slope": -1.581389983423527e-06} {"step": 58610, "timestamp": 1778257896.0519977, "train/loss": 2.1178091764450073, "train/z_loss": 0.0013852893142029643, "train/perplexity": 8.312905414360019, "train/grad_norm": 0.1337890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029137.8020771407, "perf/iters_per_sec": 0.9675683031449989, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0335187673568726, "data/tokens_consumed": 122916175872, "data/tokens_consumed_B": 122.916175872, "train/loss_slope": 1.7067848938633852e-06} {"step": 58620, "timestamp": 1778257906.3975208, "train/loss": 2.1551339626312256, "train/z_loss": 0.001384077954571694, "train/perplexity": 8.629046071260511, "train/grad_norm": 0.328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028535.3097263458, "perf/iters_per_sec": 0.9672810124046067, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033825731277466, "data/tokens_consumed": 122937147392, "data/tokens_consumed_B": 122.937147392, "train/loss_slope": 2.0553296369866756e-06} {"step": 58630, "timestamp": 1778257916.740306, "train/loss": 2.1365593314170837, "train/z_loss": 0.0013778807944618166, "train/perplexity": 8.470244132237196, "train/grad_norm": 0.1767578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028508.0831843717, "perf/iters_per_sec": 0.9672680297777041, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0338396072387694, "data/tokens_consumed": 122958118912, "data/tokens_consumed_B": 122.958118912, "train/loss_slope": 2.0066984558906503e-06} {"step": 58640, "timestamp": 1778257927.094091, "train/loss": 2.128562319278717, "train/z_loss": 0.0013638508738949896, "train/perplexity": 8.402777611939975, "train/grad_norm": 0.09326171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026942.6175774438, "perf/iters_per_sec": 0.9665215576064319, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346380710601806, "data/tokens_consumed": 122979090432, "data/tokens_consumed_B": 122.979090432, "train/loss_slope": 3.26718126658072e-07} {"step": 58650, "timestamp": 1778257937.4373095, "grad/layer_0/attn": 0.0026734163984656334, "grad/layer_0/mlp": 0.0028874597046524286, "grad/layer_0/attn_mlp_ratio": 0.9258713815368383, "grad/layer_4/attn": 0.0025453215930610895, "grad/layer_4/mlp": 0.0025487863458693027, "grad/layer_4/attn_mlp_ratio": 0.998640587243459, "grad/layer_8/attn": 0.0037977274041622877, "grad/layer_8/mlp": 0.0036076505202800035, "grad/layer_8/attn_mlp_ratio": 1.0526871373890223, "grad/layer_12/attn": 0.005808010697364807, "grad/layer_12/mlp": 0.006820587441325188, "grad/layer_12/attn_mlp_ratio": 0.851541111696714, "grad/layer_16/attn": 0.0034665586426854134, "grad/layer_16/mlp": 0.0046050064265728, "grad/layer_16/attn_mlp_ratio": 0.7527803973093048, "grad/layer_20/attn": 0.0030581681057810783, "grad/layer_20/mlp": 0.005854320712387562, "grad/layer_20/attn_mlp_ratio": 0.5223779501987813, "grad/layer_24/attn": 0.01232705544680357, "grad/layer_24/mlp": 0.009828349575400352, "grad/layer_24/attn_mlp_ratio": 1.2542345209447825, "grad/layer_27/attn": 0.006016760598868132, "grad/layer_27/mlp": 0.009998850524425507, "grad/layer_27/attn_mlp_ratio": 0.6017452230129531} {"step": 58650, "timestamp": 1778257938.04227, "eos/sharpness": 75.11086463928221, "eos/L0_probe": 1.9810234308242798, "eos/L_plus": 2.3043205738067627, "eos/L_minus": 2.408834934234619, "eos/grad_norm": 0.17911553382873535, "eos/embed_grad_frac": 0.06634905934333801, "eos/time_s": 0.6000723838806152} {"step": 58650, "timestamp": 1778257938.0626128, "train/loss": 2.113940382003784, "train/z_loss": 0.0013802764122374355, "train/perplexity": 8.280806623950214, "train/grad_norm": 0.1787109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1913023.5557364263, "perf/iters_per_sec": 0.912200715892995, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0962499618530273, "data/tokens_consumed": 123000061952, "data/tokens_consumed_B": 123.000061952, "train/loss_slope": 9.658978645628881e-07} {"step": 58650, "timestamp": 1778257939.4254854, "geo/rankme_last": 437.9336853027344, "geo/layer_0/stable_rank_q_proj": 19.441743850708008, "geo/layer_0/stable_rank_k_proj": 16.146194458007812, "geo/layer_0/stable_rank_o_proj": 47.35799026489258, "geo/layer_0/stable_rank_gate_proj": 131.64183044433594, "geo/layer_0/stable_rank_down_proj": 54.76332092285156, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06318461149930954, "geo/layer_0/attn_entropy_mean": 6.1606645584106445, "geo/layer_0/attn_entropy_std": 0.4085700511932373, "geo/layer_7/stable_rank_q_proj": 43.58723831176758, "geo/layer_7/stable_rank_k_proj": 41.20123291015625, "geo/layer_7/stable_rank_o_proj": 92.46334838867188, "geo/layer_7/stable_rank_gate_proj": 83.0207290649414, "geo/layer_7/stable_rank_down_proj": 141.4910888671875, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4663311541080475, "geo/layer_7/attn_entropy_mean": 4.660027503967285, "geo/layer_7/attn_entropy_std": 0.8158537149429321, "geo/layer_14/stable_rank_q_proj": 51.88228988647461, "geo/layer_14/stable_rank_k_proj": 39.629276275634766, "geo/layer_14/stable_rank_o_proj": 44.05665969848633, "geo/layer_14/stable_rank_gate_proj": 72.21761322021484, "geo/layer_14/stable_rank_down_proj": 129.6847381591797, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3985617160797119, "geo/layer_14/attn_entropy_mean": 5.519160270690918, "geo/layer_14/attn_entropy_std": 0.3959294557571411, "geo/layer_21/stable_rank_q_proj": 40.6235237121582, "geo/layer_21/stable_rank_k_proj": 30.04045867919922, "geo/layer_21/stable_rank_o_proj": 70.80604553222656, "geo/layer_21/stable_rank_gate_proj": 67.06401824951172, "geo/layer_21/stable_rank_down_proj": 52.31363296508789, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14326387643814087, "geo/layer_21/attn_entropy_mean": 5.707867622375488, "geo/layer_21/attn_entropy_std": 0.29984718561172485, "geo/layer_27/stable_rank_q_proj": 43.10068130493164, "geo/layer_27/stable_rank_k_proj": 31.67770004272461, "geo/layer_27/stable_rank_o_proj": 116.19158172607422, "geo/layer_27/stable_rank_gate_proj": 80.90251922607422, "geo/layer_27/stable_rank_down_proj": 128.85816955566406, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08854882419109344, "geo/layer_27/attn_entropy_mean": 4.217775344848633, "geo/layer_27/attn_entropy_std": 0.7143838405609131, "attnres/final_alpha/block_0": 0.23702943325042725, "attnres/block_norm/0": 1.7586517333984375, "attnres/final_alpha/block_1": 0.004644437227398157, "attnres/block_norm/1": 45717.4609375, "attnres/final_alpha/block_2": 0.010228387080132961, "attnres/block_norm/2": 28253.51953125, "attnres/final_alpha/block_3": 0.012234210968017578, "attnres/block_norm/3": 56277.6171875, "attnres/final_alpha/block_4": 0.014245226047933102, "attnres/block_norm/4": 14826.740234375, "attnres/final_alpha/block_5": 0.6083962321281433, "attnres/block_norm/5": 6545.45458984375, "attnres/final_alpha/block_6": 0.11322207003831863, "attnres/block_norm/6": 37407.82421875, "geo/tier1_time_s": 1.3593347072601318, "geo/step": 58650.0, "geo/rankme_slope": -0.0001253570568852541} {"step": 58660, "timestamp": 1778257949.7683258, "train/loss": 2.185815858840942, "train/z_loss": 0.0013804538291878998, "train/perplexity": 8.897905027322054, "train/grad_norm": 0.1376953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1792076.9810366195, "perf/iters_per_sec": 0.8545288949187372, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1702354431152344, "data/tokens_consumed": 123021033472, "data/tokens_consumed_B": 123.021033472, "train/loss_slope": 3.647363926246268e-06} {"step": 58670, "timestamp": 1778257960.1194077, "train/loss": 2.1581824660301208, "train/z_loss": 0.001369417121168226, "train/perplexity": 8.655391884785557, "train/grad_norm": 0.158203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026993.9044743828, "perf/iters_per_sec": 0.9665460131046213, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346118927001953, "data/tokens_consumed": 123042004992, "data/tokens_consumed_B": 123.042004992, "train/loss_slope": 5.587276509671501e-06} {"step": 58680, "timestamp": 1778257970.468304, "train/loss": 2.1796024322509764, "train/z_loss": 0.0013872685842216015, "train/perplexity": 8.842789951685882, "train/grad_norm": 0.1376953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027862.953188745, "perf/iters_per_sec": 0.9669604078239179, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03416850566864, "data/tokens_consumed": 123062976512, "data/tokens_consumed_B": 123.062976512, "train/loss_slope": 6.092750514694523e-06} {"step": 58690, "timestamp": 1778257980.8201602, "train/loss": 2.187790584564209, "train/z_loss": 0.001385108393151313, "train/perplexity": 8.915493309563859, "train/grad_norm": 0.10009765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027303.8764327564, "perf/iters_per_sec": 0.9666938192523749, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344537019729614, "data/tokens_consumed": 123083948032, "data/tokens_consumed_B": 123.083948032, "train/loss_slope": 9.028873828449567e-06} {"step": 58700, "timestamp": 1778257991.1591191, "grad/layer_0/attn": 0.0033124429173767567, "grad/layer_0/mlp": 0.0031637181527912617, "grad/layer_0/attn_mlp_ratio": 1.0470094530239156, "grad/layer_4/attn": 0.002119982149451971, "grad/layer_4/mlp": 0.002513682236894965, "grad/layer_4/attn_mlp_ratio": 0.843377111871139, "grad/layer_8/attn": 0.003321662312373519, "grad/layer_8/mlp": 0.003753270022571087, "grad/layer_8/attn_mlp_ratio": 0.8850048634650614, "grad/layer_12/attn": 0.006955722346901894, "grad/layer_12/mlp": 0.007026743609458208, "grad/layer_12/attn_mlp_ratio": 0.9898927062814719, "grad/layer_16/attn": 0.003960933070629835, "grad/layer_16/mlp": 0.00529110711067915, "grad/layer_16/attn_mlp_ratio": 0.7486019301660347, "grad/layer_20/attn": 0.004321384709328413, "grad/layer_20/mlp": 0.0061255027540028095, "grad/layer_20/attn_mlp_ratio": 0.7054742789817712, "grad/layer_24/attn": 0.015462578274309635, "grad/layer_24/mlp": 0.01068332139402628, "grad/layer_24/attn_mlp_ratio": 1.4473568246500623, "grad/layer_27/attn": 0.005915581714361906, "grad/layer_27/mlp": 0.010425209067761898, "grad/layer_27/attn_mlp_ratio": 0.5674305061096317} {"step": 58700, "timestamp": 1778257991.1733878, "train/loss": 2.1821829795837404, "train/z_loss": 0.0013703254982829094, "train/perplexity": 8.865638658114559, "train/grad_norm": 0.1591796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026606.4677387753, "perf/iters_per_sec": 0.9663612688726307, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034809684753418, "data/tokens_consumed": 123104919552, "data/tokens_consumed_B": 123.104919552, "train/loss_slope": 9.838437082672335e-06} {"step": 58710, "timestamp": 1778258001.5296605, "train/loss": 2.1510385513305663, "train/z_loss": 0.0013815487967804075, "train/perplexity": 8.593778844658516, "train/grad_norm": 0.1123046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026433.1592394698, "perf/iters_per_sec": 0.9662786289403295, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348981857299804, "data/tokens_consumed": 123125891072, "data/tokens_consumed_B": 123.125891072, "train/loss_slope": 1.1526902795660567e-05} {"step": 58720, "timestamp": 1778258011.9059978, "train/loss": 2.172732102870941, "train/z_loss": 0.001377229334320873, "train/perplexity": 8.782245291108852, "train/grad_norm": 0.185546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023096.8755094202, "perf/iters_per_sec": 0.9646877648875333, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0366048336029052, "data/tokens_consumed": 123146862592, "data/tokens_consumed_B": 123.146862592, "train/loss_slope": 1.3270592997104855e-05} {"step": 58725, "timestamp": 1778258017.6958525, "eos/sharpness": 57.049345970153794, "eos/L0_probe": 1.9801745414733887, "eos/L_plus": 2.235307455062866, "eos/L_minus": 2.295535087585449, "eos/grad_norm": 0.1360495388507843, "eos/embed_grad_frac": 0.11942657083272934, "eos/time_s": 0.6125376224517822} {"step": 58725, "timestamp": 1778258019.083586, "geo/rankme_last": 437.59466552734375, "geo/layer_0/stable_rank_q_proj": 19.430137634277344, "geo/layer_0/stable_rank_k_proj": 16.159759521484375, "geo/layer_0/stable_rank_o_proj": 47.45348358154297, "geo/layer_0/stable_rank_gate_proj": 131.60989379882812, "geo/layer_0/stable_rank_down_proj": 54.776512145996094, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06226230040192604, "geo/layer_0/attn_entropy_mean": 6.159785270690918, "geo/layer_0/attn_entropy_std": 0.4034308195114136, "geo/layer_7/stable_rank_q_proj": 43.561153411865234, "geo/layer_7/stable_rank_k_proj": 41.21915817260742, "geo/layer_7/stable_rank_o_proj": 92.43048858642578, "geo/layer_7/stable_rank_gate_proj": 82.94904327392578, "geo/layer_7/stable_rank_down_proj": 141.46278381347656, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.463631272315979, "geo/layer_7/attn_entropy_mean": 4.683823585510254, "geo/layer_7/attn_entropy_std": 0.7831539511680603, "geo/layer_14/stable_rank_q_proj": 51.91084289550781, "geo/layer_14/stable_rank_k_proj": 39.61345672607422, "geo/layer_14/stable_rank_o_proj": 44.06993865966797, "geo/layer_14/stable_rank_gate_proj": 72.07054901123047, "geo/layer_14/stable_rank_down_proj": 129.98321533203125, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4121285378932953, "geo/layer_14/attn_entropy_mean": 5.528782844543457, "geo/layer_14/attn_entropy_std": 0.40330028533935547, "geo/layer_21/stable_rank_q_proj": 40.6325798034668, "geo/layer_21/stable_rank_k_proj": 30.09510040283203, "geo/layer_21/stable_rank_o_proj": 70.80972290039062, "geo/layer_21/stable_rank_gate_proj": 67.02050018310547, "geo/layer_21/stable_rank_down_proj": 52.29338455200195, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14032378792762756, "geo/layer_21/attn_entropy_mean": 5.704847812652588, "geo/layer_21/attn_entropy_std": 0.3057399392127991, "geo/layer_27/stable_rank_q_proj": 43.069156646728516, "geo/layer_27/stable_rank_k_proj": 31.705562591552734, "geo/layer_27/stable_rank_o_proj": 116.38306427001953, "geo/layer_27/stable_rank_gate_proj": 80.88395690917969, "geo/layer_27/stable_rank_down_proj": 128.75326538085938, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09876424074172974, "geo/layer_27/attn_entropy_mean": 4.212745666503906, "geo/layer_27/attn_entropy_std": 0.7057763338088989, "attnres/final_alpha/block_0": 0.23776081204414368, "attnres/block_norm/0": 1.7587335109710693, "attnres/final_alpha/block_1": 0.004646636545658112, "attnres/block_norm/1": 45754.12890625, "attnres/final_alpha/block_2": 0.010284507647156715, "attnres/block_norm/2": 28318.015625, "attnres/final_alpha/block_3": 0.012356367893517017, "attnres/block_norm/3": 56277.671875, "attnres/final_alpha/block_4": 0.014465391635894775, "attnres/block_norm/4": 14760.767578125, "attnres/final_alpha/block_5": 0.6089158654212952, "attnres/block_norm/5": 6589.2265625, "attnres/final_alpha/block_6": 0.11157041788101196, "attnres/block_norm/6": 37479.72265625, "geo/tier1_time_s": 1.3660390377044678, "geo/step": 58725.0, "geo/rankme_slope": -0.00014105028730242096} {"step": 58730, "timestamp": 1778258024.2737758, "train/loss": 2.1508187770843508, "train/z_loss": 0.001383205607999116, "train/perplexity": 8.591890360918434, "train/grad_norm": 0.1708984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1696475.5095362188, "perf/iters_per_sec": 0.8089425609284491, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2361817121505738, "data/tokens_consumed": 123167834112, "data/tokens_consumed_B": 123.167834112, "train/loss_slope": 1.0943405198292635e-05} {"step": 58740, "timestamp": 1778258034.6465073, "train/loss": 2.1604816198348997, "train/z_loss": 0.0013856658595614135, "train/perplexity": 8.67531485618081, "train/grad_norm": 0.08740234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022633.7176943673, "perf/iters_per_sec": 0.9644669140312039, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0368422031402589, "data/tokens_consumed": 123188805632, "data/tokens_consumed_B": 123.188805632, "train/loss_slope": 1.2459898223899814e-05} {"step": 58750, "timestamp": 1778258045.0149367, "grad/layer_0/attn": 0.0027027337346225977, "grad/layer_0/mlp": 0.0028458985034376383, "grad/layer_0/attn_mlp_ratio": 0.9496943184686525, "grad/layer_4/attn": 0.0025554297026246786, "grad/layer_4/mlp": 0.002476254478096962, "grad/layer_4/attn_mlp_ratio": 1.031973741806694, "grad/layer_8/attn": 0.005801783874630928, "grad/layer_8/mlp": 0.003470673458650708, "grad/layer_8/attn_mlp_ratio": 1.6716593412163154, "grad/layer_12/attn": 0.0040161279030144215, "grad/layer_12/mlp": 0.007149785757064819, "grad/layer_12/attn_mlp_ratio": 0.5617130335513504, "grad/layer_16/attn": 0.004423487465828657, "grad/layer_16/mlp": 0.004276807885617018, "grad/layer_16/attn_mlp_ratio": 1.03429648483283, "grad/layer_20/attn": 0.00411289744079113, "grad/layer_20/mlp": 0.005418989807367325, "grad/layer_20/attn_mlp_ratio": 0.7589786124531231, "grad/layer_24/attn": 0.007967473939061165, "grad/layer_24/mlp": 0.009172533638775349, "grad/layer_24/attn_mlp_ratio": 0.8686230180195473, "grad/layer_27/attn": 0.009422766976058483, "grad/layer_27/mlp": 0.008646650239825249, "grad/layer_27/attn_mlp_ratio": 1.0897592253336013} {"step": 58750, "timestamp": 1778258045.0320005, "train/loss": 2.195591926574707, "train/z_loss": 0.0013807536684907972, "train/perplexity": 8.985318131589478, "train/grad_norm": 0.1494140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020657.216221727, "perf/iters_per_sec": 0.9635244446858058, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037856388092041, "data/tokens_consumed": 123209777152, "data/tokens_consumed_B": 123.209777152, "train/loss_slope": 1.618216786697894e-05} {"step": 58760, "timestamp": 1778258056.0651147, "train/loss": 2.1795647382736205, "train/z_loss": 0.0013833557954058052, "train/perplexity": 8.842456638043677, "train/grad_norm": 0.09619140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1901936.0020889074, "perf/iters_per_sec": 0.9069137583202874, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1026406764984131, "data/tokens_consumed": 123230748672, "data/tokens_consumed_B": 123.230748672, "train/loss_slope": 1.6557208589702024e-05} {"step": 58770, "timestamp": 1778258066.9767172, "train/loss": 2.151676344871521, "train/z_loss": 0.001391666871495545, "train/perplexity": 8.599261649560948, "train/grad_norm": 0.1943359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1922880.3337084616, "perf/iters_per_sec": 0.9169007938902195, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0906305313110352, "data/tokens_consumed": 123251720192, "data/tokens_consumed_B": 123.251720192, "train/loss_slope": 1.7454978091345484e-05} {"step": 58780, "timestamp": 1778258077.3529422, "train/loss": 2.1348088502883913, "train/z_loss": 0.001385041861794889, "train/perplexity": 8.455430099353542, "train/grad_norm": 0.18359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022430.9087008329, "perf/iters_per_sec": 0.9643702071670689, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036946177482605, "data/tokens_consumed": 123272691712, "data/tokens_consumed_B": 123.272691712, "train/loss_slope": 1.4208512108783033e-05} {"step": 58790, "timestamp": 1778258087.7287827, "train/loss": 2.1087733268737794, "train/z_loss": 0.0013868141220882535, "train/perplexity": 8.23812959184307, "train/grad_norm": 0.1640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022556.5143377644, "perf/iters_per_sec": 0.9644301006020376, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0368817806243897, "data/tokens_consumed": 123293663232, "data/tokens_consumed_B": 123.293663232, "train/loss_slope": 8.255174980484037e-06} {"step": 58800, "timestamp": 1778258098.105021, "grad/layer_0/attn": 0.003137474413961172, "grad/layer_0/mlp": 0.0032409594859927893, "grad/layer_0/attn_mlp_ratio": 0.9680695888714956, "grad/layer_4/attn": 0.00190332334022969, "grad/layer_4/mlp": 0.002626072382554412, "grad/layer_4/attn_mlp_ratio": 0.724779438828857, "grad/layer_8/attn": 0.004038976971060038, "grad/layer_8/mlp": 0.003674772335216403, "grad/layer_8/attn_mlp_ratio": 1.0991094121511737, "grad/layer_12/attn": 0.005433030426502228, "grad/layer_12/mlp": 0.007218671962618828, "grad/layer_12/attn_mlp_ratio": 0.7526357173969204, "grad/layer_16/attn": 0.008467775769531727, "grad/layer_16/mlp": 0.004716539289802313, "grad/layer_16/attn_mlp_ratio": 1.7953365952672862, "grad/layer_20/attn": 0.00536168459802866, "grad/layer_20/mlp": 0.0067293415777385235, "grad/layer_20/attn_mlp_ratio": 0.7967621284212335, "grad/layer_24/attn": 0.01619303971529007, "grad/layer_24/mlp": 0.014559885486960411, "grad/layer_24/attn_mlp_ratio": 1.1121680605644515, "grad/layer_27/attn": 0.006176786031574011, "grad/layer_27/mlp": 0.014472046867012978, "grad/layer_27/attn_mlp_ratio": 0.426808042127913} {"step": 58800, "timestamp": 1778258098.729744, "eos/sharpness": 69.67718601226805, "eos/L0_probe": 1.9788151979446411, "eos/L_plus": 2.3069040775299072, "eos/L_minus": 2.3474981784820557, "eos/grad_norm": 0.22720064222812653, "eos/embed_grad_frac": 0.056943513453006744, "eos/time_s": 0.6218798160552979} {"step": 58800, "timestamp": 1778258098.7512858, "train/loss": 2.14080810546875, "train/z_loss": 0.0013896108139306307, "train/perplexity": 8.506308846875815, "train/grad_norm": 0.2275390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1903619.1174161106, "perf/iters_per_sec": 0.9077163302498391, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1016657590866088, "data/tokens_consumed": 123314634752, "data/tokens_consumed_B": 123.314634752, "train/loss_slope": 6.231044113475057e-06} {"step": 58800, "timestamp": 1778258100.1217663, "geo/rankme_last": 439.2801513671875, "geo/layer_0/stable_rank_q_proj": 19.420242309570312, "geo/layer_0/stable_rank_k_proj": 16.177579879760742, "geo/layer_0/stable_rank_o_proj": 47.44353485107422, "geo/layer_0/stable_rank_gate_proj": 131.83079528808594, "geo/layer_0/stable_rank_down_proj": 54.794769287109375, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06148701161146164, "geo/layer_0/attn_entropy_mean": 6.158625602722168, "geo/layer_0/attn_entropy_std": 0.40324491262435913, "geo/layer_7/stable_rank_q_proj": 43.57047653198242, "geo/layer_7/stable_rank_k_proj": 41.14825439453125, "geo/layer_7/stable_rank_o_proj": 92.43937683105469, "geo/layer_7/stable_rank_gate_proj": 82.78782653808594, "geo/layer_7/stable_rank_down_proj": 141.5815887451172, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4582301676273346, "geo/layer_7/attn_entropy_mean": 4.6727776527404785, "geo/layer_7/attn_entropy_std": 0.8156391382217407, "geo/layer_14/stable_rank_q_proj": 51.82613754272461, "geo/layer_14/stable_rank_k_proj": 39.609188079833984, "geo/layer_14/stable_rank_o_proj": 44.165340423583984, "geo/layer_14/stable_rank_gate_proj": 72.04190826416016, "geo/layer_14/stable_rank_down_proj": 129.91661071777344, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3944399952888489, "geo/layer_14/attn_entropy_mean": 5.516007900238037, "geo/layer_14/attn_entropy_std": 0.3837014138698578, "geo/layer_21/stable_rank_q_proj": 40.68801498413086, "geo/layer_21/stable_rank_k_proj": 30.171964645385742, "geo/layer_21/stable_rank_o_proj": 70.7592544555664, "geo/layer_21/stable_rank_gate_proj": 67.0771255493164, "geo/layer_21/stable_rank_down_proj": 52.32897186279297, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14691630005836487, "geo/layer_21/attn_entropy_mean": 5.714723587036133, "geo/layer_21/attn_entropy_std": 0.30354413390159607, "geo/layer_27/stable_rank_q_proj": 43.023189544677734, "geo/layer_27/stable_rank_k_proj": 31.697704315185547, "geo/layer_27/stable_rank_o_proj": 116.26533508300781, "geo/layer_27/stable_rank_gate_proj": 80.90977478027344, "geo/layer_27/stable_rank_down_proj": 128.87322998046875, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09262265264987946, "geo/layer_27/attn_entropy_mean": 4.242959976196289, "geo/layer_27/attn_entropy_std": 0.7086341977119446, "attnres/final_alpha/block_0": 0.23661339282989502, "attnres/block_norm/0": 1.758873462677002, "attnres/final_alpha/block_1": 0.004521002992987633, "attnres/block_norm/1": 45718.33984375, "attnres/final_alpha/block_2": 0.010228490456938744, "attnres/block_norm/2": 28197.8671875, "attnres/final_alpha/block_3": 0.012166885659098625, "attnres/block_norm/3": 56295.1015625, "attnres/final_alpha/block_4": 0.01430610753595829, "attnres/block_norm/4": 14805.3671875, "attnres/final_alpha/block_5": 0.6106398105621338, "attnres/block_norm/5": 6537.7548828125, "attnres/final_alpha/block_6": 0.11152428388595581, "attnres/block_norm/6": 37225.109375, "geo/tier1_time_s": 1.3668313026428223, "geo/step": 58800.0, "geo/rankme_slope": -0.00012353236216361546} {"step": 58810, "timestamp": 1778258110.483142, "train/loss": 2.1775878429412843, "train/z_loss": 0.0013702776050195098, "train/perplexity": 8.824993294078526, "train/grad_norm": 0.19921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1788121.8053025098, "perf/iters_per_sec": 0.8526429201614903, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1728239059448242, "data/tokens_consumed": 123335606272, "data/tokens_consumed_B": 123.335606272, "train/loss_slope": 9.843095715897873e-06} {"step": 58820, "timestamp": 1778258121.304976, "train/loss": 2.1346784830093384, "train/z_loss": 0.0013785232207737864, "train/perplexity": 8.454327859787815, "train/grad_norm": 0.10986328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1939711.6436571283, "perf/iters_per_sec": 0.9249265878949777, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.081166887283325, "data/tokens_consumed": 123356577792, "data/tokens_consumed_B": 123.356577792, "train/loss_slope": 1.052104380264532e-05} {"step": 58830, "timestamp": 1778258132.0417626, "train/loss": 2.1773497819900514, "train/z_loss": 0.0013730882667005063, "train/perplexity": 8.822892657829966, "train/grad_norm": 0.271484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1954768.0050281186, "perf/iters_per_sec": 0.932106020464, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.072839331626892, "data/tokens_consumed": 123377549312, "data/tokens_consumed_B": 123.377549312, "train/loss_slope": 1.2057050355304084e-05} {"step": 58840, "timestamp": 1778258142.3908787, "train/loss": 2.1421003103256226, "train/z_loss": 0.0013797184568829835, "train/perplexity": 8.517307845430977, "train/grad_norm": 0.09423828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027485.0448531376, "perf/iters_per_sec": 0.9667802070871055, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343612670898437, "data/tokens_consumed": 123398520832, "data/tokens_consumed_B": 123.398520832, "train/loss_slope": 8.822646543066187e-06} {"step": 58850, "timestamp": 1778258152.7246597, "grad/layer_0/attn": 0.002958156866952777, "grad/layer_0/mlp": 0.003011140273883939, "grad/layer_0/attn_mlp_ratio": 0.9824041723890737, "grad/layer_4/attn": 0.002934135729447007, "grad/layer_4/mlp": 0.002639938145875931, "grad/layer_4/attn_mlp_ratio": 1.1114410475436942, "grad/layer_8/attn": 0.004131909925490618, "grad/layer_8/mlp": 0.003821017686277628, "grad/layer_8/attn_mlp_ratio": 1.0813636985228106, "grad/layer_12/attn": 0.0056189741007983685, "grad/layer_12/mlp": 0.00741640804335475, "grad/layer_12/attn_mlp_ratio": 0.7576408946469704, "grad/layer_16/attn": 0.0040703751146793365, "grad/layer_16/mlp": 0.004742779769003391, "grad/layer_16/attn_mlp_ratio": 0.8582256033600505, "grad/layer_20/attn": 0.005096453242003918, "grad/layer_20/mlp": 0.006262228824198246, "grad/layer_20/attn_mlp_ratio": 0.8138401364265707, "grad/layer_24/attn": 0.005396617576479912, "grad/layer_24/mlp": 0.008709043264389038, "grad/layer_24/attn_mlp_ratio": 0.6196567580024329, "grad/layer_27/attn": 0.005326207727193832, "grad/layer_27/mlp": 0.00810988713055849, "grad/layer_27/attn_mlp_ratio": 0.6567548445216841} {"step": 58850, "timestamp": 1778258152.738974, "train/loss": 2.132192015647888, "train/z_loss": 0.0013916721916757524, "train/perplexity": 8.433332562379315, "train/grad_norm": 0.1201171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027826.2546150081, "perf/iters_per_sec": 0.9669429085803071, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341872215270995, "data/tokens_consumed": 123419492352, "data/tokens_consumed_B": 123.419492352, "train/loss_slope": 9.616597722394328e-06} {"step": 58860, "timestamp": 1778258163.0854144, "train/loss": 2.1227245569229125, "train/z_loss": 0.0013792165671475231, "train/perplexity": 8.353867096000796, "train/grad_norm": 0.1826171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028519.3573167003, "perf/iters_per_sec": 0.9672734057029249, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0338338613510132, "data/tokens_consumed": 123440463872, "data/tokens_consumed_B": 123.440463872, "train/loss_slope": 6.061394193885656e-06} {"step": 58870, "timestamp": 1778258173.4254453, "train/loss": 2.1725250720977782, "train/z_loss": 0.0013853608164936305, "train/perplexity": 8.780427284274303, "train/grad_norm": 0.14453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029211.2487171649, "perf/iters_per_sec": 0.9676033252321076, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0334813594818115, "data/tokens_consumed": 123461435392, "data/tokens_consumed_B": 123.461435392, "train/loss_slope": 4.912049055743293e-06} {"step": 58875, "timestamp": 1778258179.1842754, "eos/sharpness": 28.311848640441887, "eos/L0_probe": 1.9752699136734009, "eos/L_plus": 2.1283531188964844, "eos/L_minus": 2.1053051948547363, "eos/grad_norm": 0.128895103931427, "eos/embed_grad_frac": 0.14053738117218018, "eos/time_s": 0.5956859588623047} {"step": 58875, "timestamp": 1778258180.5687757, "geo/rankme_last": 439.3570251464844, "geo/layer_0/stable_rank_q_proj": 19.419382095336914, "geo/layer_0/stable_rank_k_proj": 16.19025421142578, "geo/layer_0/stable_rank_o_proj": 47.34737014770508, "geo/layer_0/stable_rank_gate_proj": 131.70211791992188, "geo/layer_0/stable_rank_down_proj": 54.77298355102539, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06106964871287346, "geo/layer_0/attn_entropy_mean": 6.158115386962891, "geo/layer_0/attn_entropy_std": 0.4089018702507019, "geo/layer_7/stable_rank_q_proj": 43.541725158691406, "geo/layer_7/stable_rank_k_proj": 41.19688034057617, "geo/layer_7/stable_rank_o_proj": 92.32720947265625, "geo/layer_7/stable_rank_gate_proj": 82.74085998535156, "geo/layer_7/stable_rank_down_proj": 141.319091796875, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4598252773284912, "geo/layer_7/attn_entropy_mean": 4.6455488204956055, "geo/layer_7/attn_entropy_std": 0.7956056594848633, "geo/layer_14/stable_rank_q_proj": 51.77055358886719, "geo/layer_14/stable_rank_k_proj": 39.63511276245117, "geo/layer_14/stable_rank_o_proj": 44.149009704589844, "geo/layer_14/stable_rank_gate_proj": 72.09138488769531, "geo/layer_14/stable_rank_down_proj": 130.06996154785156, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.383396178483963, "geo/layer_14/attn_entropy_mean": 5.526430606842041, "geo/layer_14/attn_entropy_std": 0.40101587772369385, "geo/layer_21/stable_rank_q_proj": 40.676414489746094, "geo/layer_21/stable_rank_k_proj": 30.132070541381836, "geo/layer_21/stable_rank_o_proj": 70.7339096069336, "geo/layer_21/stable_rank_gate_proj": 67.08326721191406, "geo/layer_21/stable_rank_down_proj": 52.298606872558594, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14576812088489532, "geo/layer_21/attn_entropy_mean": 5.702744960784912, "geo/layer_21/attn_entropy_std": 0.28720080852508545, "geo/layer_27/stable_rank_q_proj": 43.04582595825195, "geo/layer_27/stable_rank_k_proj": 31.667308807373047, "geo/layer_27/stable_rank_o_proj": 116.25552368164062, "geo/layer_27/stable_rank_gate_proj": 80.87747955322266, "geo/layer_27/stable_rank_down_proj": 128.91868591308594, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09221962094306946, "geo/layer_27/attn_entropy_mean": 4.216278553009033, "geo/layer_27/attn_entropy_std": 0.6850648522377014, "attnres/final_alpha/block_0": 0.23776617646217346, "attnres/block_norm/0": 1.7590292692184448, "attnres/final_alpha/block_1": 0.004677693359553814, "attnres/block_norm/1": 45809.8984375, "attnres/final_alpha/block_2": 0.010433517396450043, "attnres/block_norm/2": 28221.95703125, "attnres/final_alpha/block_3": 0.012491757050156593, "attnres/block_norm/3": 56629.6953125, "attnres/final_alpha/block_4": 0.014595581218600273, "attnres/block_norm/4": 14810.1484375, "attnres/final_alpha/block_5": 0.6089907884597778, "attnres/block_norm/5": 6541.23828125, "attnres/final_alpha/block_6": 0.11104452610015869, "attnres/block_norm/6": 37282.10546875, "geo/tier1_time_s": 1.3644537925720215, "geo/step": 58875.0, "geo/rankme_slope": -0.00010638157606792718} {"step": 58880, "timestamp": 1778258185.7399318, "train/loss": 2.1037945628166197, "train/z_loss": 0.001384268212132156, "train/perplexity": 8.19721582284872, "train/grad_norm": 0.30078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1703935.2103519284, "perf/iters_per_sec": 0.8124996234664575, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2307698011398316, "data/tokens_consumed": 123482406912, "data/tokens_consumed_B": 123.482406912, "train/loss_slope": 4.1670234862154146e-07} {"step": 58890, "timestamp": 1778258196.078272, "train/loss": 2.174907612800598, "train/z_loss": 0.0013747198157943786, "train/perplexity": 8.801371950519455, "train/grad_norm": 0.1455078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029288.7268754141, "perf/iters_per_sec": 0.96764026969691, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0334419012069702, "data/tokens_consumed": 123503378432, "data/tokens_consumed_B": 123.503378432, "train/loss_slope": -2.641774640225229e-07} {"step": 58900, "timestamp": 1778258206.4160461, "grad/layer_0/attn": 0.003035448957234621, "grad/layer_0/mlp": 0.0033973269164562225, "grad/layer_0/attn_mlp_ratio": 0.8934815349041437, "grad/layer_4/attn": 0.003569057211279869, "grad/layer_4/mlp": 0.002601850777864456, "grad/layer_4/attn_mlp_ratio": 1.371737804669757, "grad/layer_8/attn": 0.005621032789349556, "grad/layer_8/mlp": 0.00385912973433733, "grad/layer_8/attn_mlp_ratio": 1.456554464515644, "grad/layer_12/attn": 0.007235963828861713, "grad/layer_12/mlp": 0.007093661930412054, "grad/layer_12/attn_mlp_ratio": 1.0200604141893963, "grad/layer_16/attn": 0.005534898955374956, "grad/layer_16/mlp": 0.005078756716102362, "grad/layer_16/attn_mlp_ratio": 1.0898137390288862, "grad/layer_20/attn": 0.004128494765609503, "grad/layer_20/mlp": 0.007605785969644785, "grad/layer_20/attn_mlp_ratio": 0.5428097408743338, "grad/layer_24/attn": 0.014227770268917084, "grad/layer_24/mlp": 0.013851561583578587, "grad/layer_24/attn_mlp_ratio": 1.027160012273887, "grad/layer_27/attn": 0.018374616280198097, "grad/layer_27/mlp": 0.012292953208088875, "grad/layer_27/attn_mlp_ratio": 1.4947275743825883} {"step": 58900, "timestamp": 1778258206.4323208, "train/loss": 2.1319546699523926, "train/z_loss": 0.0013805869035422801, "train/perplexity": 8.431331184715534, "train/grad_norm": 0.26171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026448.8921120665, "perf/iters_per_sec": 0.9662861309585888, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348901510238648, "data/tokens_consumed": 123524349952, "data/tokens_consumed_B": 123.524349952, "train/loss_slope": -1.9768713116467197e-06} {"step": 58910, "timestamp": 1778258216.7807906, "train/loss": 2.1933332204818727, "train/z_loss": 0.0013627196429297328, "train/perplexity": 8.965045841970326, "train/grad_norm": 0.10400390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028027.435215457, "perf/iters_per_sec": 0.9670388389661108, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340846300125122, "data/tokens_consumed": 123545321472, "data/tokens_consumed_B": 123.545321472, "train/loss_slope": 1.8352855693245878e-06} {"step": 58920, "timestamp": 1778258227.1267786, "train/loss": 2.1518921494483947, "train/z_loss": 0.001379550015553832, "train/perplexity": 8.601117609837818, "train/grad_norm": 0.1591796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028446.5220830366, "perf/iters_per_sec": 0.9672386751570876, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0338709831237793, "data/tokens_consumed": 123566292992, "data/tokens_consumed_B": 123.566292992, "train/loss_slope": -1.3080934129103394e-06} {"step": 58930, "timestamp": 1778258237.4714122, "train/loss": 2.144745409488678, "train/z_loss": 0.0013812640332616865, "train/perplexity": 8.53986679145614, "train/grad_norm": 0.126953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028305.5911690777, "perf/iters_per_sec": 0.9671714740605725, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0339428186416626, "data/tokens_consumed": 123587264512, "data/tokens_consumed_B": 123.587264512, "train/loss_slope": -2.3656174044261693e-06} {"step": 58940, "timestamp": 1778258247.816154, "train/loss": 2.186332654953003, "train/z_loss": 0.001372477028053254, "train/perplexity": 8.902504618468702, "train/grad_norm": 0.173828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028670.0029758585, "perf/iters_per_sec": 0.9673452391509335, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0337570905685425, "data/tokens_consumed": 123608236032, "data/tokens_consumed_B": 123.608236032, "train/loss_slope": 2.5962830805900013e-06} {"step": 58950, "timestamp": 1778258258.1511645, "grad/layer_0/attn": 0.0026940973475575447, "grad/layer_0/mlp": 0.002779546659439802, "grad/layer_0/attn_mlp_ratio": 0.969257789388843, "grad/layer_4/attn": 0.0019263464491814375, "grad/layer_4/mlp": 0.0025004600174725056, "grad/layer_4/attn_mlp_ratio": 0.7703967904629533, "grad/layer_8/attn": 0.004142770543694496, "grad/layer_8/mlp": 0.003502658801153302, "grad/layer_8/attn_mlp_ratio": 1.182750207943579, "grad/layer_12/attn": 0.004742954391986132, "grad/layer_12/mlp": 0.006783758290112019, "grad/layer_12/attn_mlp_ratio": 0.6991632247545025, "grad/layer_16/attn": 0.00409146910533309, "grad/layer_16/mlp": 0.004530555102974176, "grad/layer_16/attn_mlp_ratio": 0.9030833798575417, "grad/layer_20/attn": 0.0039044860750436783, "grad/layer_20/mlp": 0.005400226917117834, "grad/layer_20/attn_mlp_ratio": 0.7230225808409717, "grad/layer_24/attn": 0.005495915655046701, "grad/layer_24/mlp": 0.007321526296436787, "grad/layer_24/attn_mlp_ratio": 0.7506516206403946, "grad/layer_27/attn": 0.004539747256785631, "grad/layer_27/mlp": 0.006215918809175491, "grad/layer_27/attn_mlp_ratio": 0.7303420979453873} {"step": 58950, "timestamp": 1778258258.74784, "eos/sharpness": 1.8350124359130855, "eos/L0_probe": 1.977550745010376, "eos/L_plus": 1.989519476890564, "eos/L_minus": 1.9839321374893188, "eos/grad_norm": 0.08197248727083206, "eos/embed_grad_frac": 0.3018490970134735, "eos/time_s": 0.5937643051147461} {"step": 58950, "timestamp": 1778258258.7657924, "train/loss": 2.1332409143447877, "train/z_loss": 0.0013795444625429808, "train/perplexity": 8.442182914664583, "train/grad_norm": 0.08203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1916621.4474749085, "perf/iters_per_sec": 0.9139163243650954, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0941920757293702, "data/tokens_consumed": 123629207552, "data/tokens_consumed_B": 123.629207552, "train/loss_slope": 2.120256416796909e-06} {"step": 58950, "timestamp": 1778258260.1319795, "geo/rankme_last": 438.75555419921875, "geo/layer_0/stable_rank_q_proj": 19.41238784790039, "geo/layer_0/stable_rank_k_proj": 16.184276580810547, "geo/layer_0/stable_rank_o_proj": 47.32583236694336, "geo/layer_0/stable_rank_gate_proj": 131.2926483154297, "geo/layer_0/stable_rank_down_proj": 54.74631118774414, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0595770999789238, "geo/layer_0/attn_entropy_mean": 6.160755157470703, "geo/layer_0/attn_entropy_std": 0.40320104360580444, "geo/layer_7/stable_rank_q_proj": 43.51664733886719, "geo/layer_7/stable_rank_k_proj": 41.172691345214844, "geo/layer_7/stable_rank_o_proj": 92.2791748046875, "geo/layer_7/stable_rank_gate_proj": 82.86644744873047, "geo/layer_7/stable_rank_down_proj": 141.1918182373047, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4756103456020355, "geo/layer_7/attn_entropy_mean": 4.640597820281982, "geo/layer_7/attn_entropy_std": 0.8040608763694763, "geo/layer_14/stable_rank_q_proj": 51.84080505371094, "geo/layer_14/stable_rank_k_proj": 39.57740020751953, "geo/layer_14/stable_rank_o_proj": 44.13664627075195, "geo/layer_14/stable_rank_gate_proj": 71.97063446044922, "geo/layer_14/stable_rank_down_proj": 129.9370880126953, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.40148359537124634, "geo/layer_14/attn_entropy_mean": 5.540707588195801, "geo/layer_14/attn_entropy_std": 0.3984450399875641, "geo/layer_21/stable_rank_q_proj": 40.62827682495117, "geo/layer_21/stable_rank_k_proj": 30.25678825378418, "geo/layer_21/stable_rank_o_proj": 70.78923034667969, "geo/layer_21/stable_rank_gate_proj": 67.0780029296875, "geo/layer_21/stable_rank_down_proj": 52.254722595214844, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14573997259140015, "geo/layer_21/attn_entropy_mean": 5.719646453857422, "geo/layer_21/attn_entropy_std": 0.2909919023513794, "geo/layer_27/stable_rank_q_proj": 42.98927688598633, "geo/layer_27/stable_rank_k_proj": 31.656286239624023, "geo/layer_27/stable_rank_o_proj": 116.50084686279297, "geo/layer_27/stable_rank_gate_proj": 80.85034942626953, "geo/layer_27/stable_rank_down_proj": 128.92198181152344, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09055092185735703, "geo/layer_27/attn_entropy_mean": 4.226158142089844, "geo/layer_27/attn_entropy_std": 0.6965343356132507, "attnres/final_alpha/block_0": 0.23643843829631805, "attnres/block_norm/0": 1.7589845657348633, "attnres/final_alpha/block_1": 0.00457370188087225, "attnres/block_norm/1": 45874.3671875, "attnres/final_alpha/block_2": 0.010227241553366184, "attnres/block_norm/2": 28277.716796875, "attnres/final_alpha/block_3": 0.01234742533415556, "attnres/block_norm/3": 56652.2890625, "attnres/final_alpha/block_4": 0.014432346448302269, "attnres/block_norm/4": 14813.39453125, "attnres/final_alpha/block_5": 0.6113831400871277, "attnres/block_norm/5": 6547.193359375, "attnres/final_alpha/block_6": 0.11059769243001938, "attnres/block_norm/6": 37423.671875, "geo/tier1_time_s": 1.362163782119751, "geo/step": 58950.0, "geo/rankme_slope": -0.00010179595275610244} {"step": 58960, "timestamp": 1778258270.4743822, "train/loss": 2.1443318367004394, "train/z_loss": 0.0013770930701866746, "train/perplexity": 8.536335665175214, "train/grad_norm": 0.2060546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1791683.1506239832, "perf/iters_per_sec": 0.8543411019439617, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.170492672920227, "data/tokens_consumed": 123650179072, "data/tokens_consumed_B": 123.650179072, "train/loss_slope": 8.582032696105483e-07} {"step": 58970, "timestamp": 1778258280.8166356, "train/loss": 2.1039018392562867, "train/z_loss": 0.0013900011312216521, "train/perplexity": 8.198095238146806, "train/grad_norm": 0.2158203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029228.2887230038, "perf/iters_per_sec": 0.9676114505400676, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0334726810455321, "data/tokens_consumed": 123671150592, "data/tokens_consumed_B": 123.671150592, "train/loss_slope": -7.742680267210439e-07} {"step": 58980, "timestamp": 1778258291.1603618, "train/loss": 2.229924440383911, "train/z_loss": 0.0013694818830117584, "train/perplexity": 9.29916341171993, "train/grad_norm": 0.11865234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028511.404596227, "perf/iters_per_sec": 0.9672696135502944, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033837914466858, "data/tokens_consumed": 123692122112, "data/tokens_consumed_B": 123.692122112, "train/loss_slope": 4.484477180494687e-06} {"step": 58990, "timestamp": 1778258301.5067213, "train/loss": 2.178578782081604, "train/z_loss": 0.0013651809538714587, "train/perplexity": 8.833742659675137, "train/grad_norm": 0.333984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028343.5699589998, "perf/iters_per_sec": 0.9671895837588309, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0339234590530395, "data/tokens_consumed": 123713093632, "data/tokens_consumed_B": 123.713093632, "train/loss_slope": 3.4546381641070873e-06} {"step": 59000, "timestamp": 1778258311.839718, "grad/layer_0/attn": 0.0029052579775452614, "grad/layer_0/mlp": 0.002946697175502777, "grad/layer_0/attn_mlp_ratio": 0.9859370359140658, "grad/layer_4/attn": 0.002205873606726527, "grad/layer_4/mlp": 0.002365756081417203, "grad/layer_4/attn_mlp_ratio": 0.9324179829068867, "grad/layer_8/attn": 0.003985453397035599, "grad/layer_8/mlp": 0.003393256338313222, "grad/layer_8/attn_mlp_ratio": 1.1745217225659361, "grad/layer_12/attn": 0.004800355993211269, "grad/layer_12/mlp": 0.00683510210365057, "grad/layer_12/attn_mlp_ratio": 0.7023093218192759, "grad/layer_16/attn": 0.006550910882651806, "grad/layer_16/mlp": 0.004809671547263861, "grad/layer_16/attn_mlp_ratio": 1.362028712787183, "grad/layer_20/attn": 0.00354398088529706, "grad/layer_20/mlp": 0.006517541129142046, "grad/layer_20/attn_mlp_ratio": 0.5437604091326296, "grad/layer_24/attn": 0.010850679129362106, "grad/layer_24/mlp": 0.010218766517937183, "grad/layer_24/attn_mlp_ratio": 1.0618384326652217, "grad/layer_27/attn": 0.009213859215378761, "grad/layer_27/mlp": 0.009456265717744827, "grad/layer_27/attn_mlp_ratio": 0.9743655046253896} {"step": 59000, "timestamp": 1778258311.8539357, "train/loss": 2.0904430866241457, "train/z_loss": 0.0013795455335639418, "train/perplexity": 8.088498275827488, "train/grad_norm": 0.1728515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028248.2047352407, "perf/iters_per_sec": 0.9671441100765422, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0339720726013184, "data/tokens_consumed": 123734065152, "data/tokens_consumed_B": 123.734065152, "train/loss_slope": -1.8298097819921253e-06} {"step": 59000, "timestamp": 1778258319.1247103, "geo/ww_alpha_mean": 7.704915394890164, "geo/ww_alpha_std": 4.9474027238688505, "geo/ww_alpha_min": 1.3460979418216652, "geo/ww_alpha_max": 40.495462088161254, "geo/ww_alpha_healthy_frac": 0.17258883248730963, "geo/ww_alpha_by_type/q_proj": 3.982993008543427, "geo/ww_alpha_by_type/k_proj": 4.518524001857807, "geo/ww_alpha_by_type/v_proj": 8.880818928449445, "geo/ww_alpha_by_type/o_proj": 9.091219477669204, "geo/ww_alpha_by_type/gate_proj": 7.929293545141652, "geo/ww_alpha_by_type/up_proj": 11.240422470240466, "geo/ww_alpha_by_type/down_proj": 8.387409658272874, "geo/twonn_id/layer_0": 0.7016535401344299, "geo/twonn_id/layer_7": 2.9903066158294678, "geo/twonn_id/layer_14": 4.2745466232299805, "geo/twonn_id/layer_21": 8.215046882629395, "geo/twonn_id/layer_27": 5.726386547088623, "geo/tier2_time_s": 7.264239311218262} {"step": 59000, "timestamp": 1778258319.9555767, "eoc/jacobian_sigma/layer_0/attn": 1121.9776611328125, "eoc/jacobian_sigma/layer_0/mlp": 11531.404296875, "eoc/jacobian_sigma/layer_0": 11531.404296875, "eoc/jacobian_sigma/layer_7/attn": 1.1347650289535522, "eoc/jacobian_sigma/layer_7/mlp": 1.8173398971557617, "eoc/jacobian_sigma/layer_7": 1.8173398971557617, "eoc/jacobian_sigma/layer_14/attn": 1.4947168827056885, "eoc/jacobian_sigma/layer_14/mlp": 7.200114727020264, "eoc/jacobian_sigma/layer_14": 7.200114727020264, "eoc/jacobian_sigma/layer_21/attn": 1.080604076385498, "eoc/jacobian_sigma/layer_21/mlp": 4.414518356323242, "eoc/jacobian_sigma/layer_21": 4.414518356323242, "eoc/jacobian_sigma/layer_27/attn": 3.952596426010132, "eoc/jacobian_sigma/layer_27/mlp": 29.04483413696289, "eoc/jacobian_sigma/layer_27": 29.04483413696289, "eoc/layer0_sigma": 11531.404296875, "eoc/sigma_max": 29.04483413696289, "eoc/sigma_min": 1.8173398971557617, "eoc/sigma_mean": 10.61920177936554, "eoc/time_s": 0.8238043785095215} {"step": 59010, "timestamp": 1778258330.323398, "train/loss": 2.137398433685303, "train/z_loss": 0.0013602609978988767, "train/perplexity": 8.477354516053325, "train/grad_norm": 0.1318359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1135767.4984829156, "perf/iters_per_sec": 0.5415761463560655, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.8464624166488648, "data/tokens_consumed": 123755036672, "data/tokens_consumed_B": 123.755036672, "train/loss_slope": 3.286427711889715e-06} {"step": 59020, "timestamp": 1778258340.6669226, "train/loss": 2.110529351234436, "train/z_loss": 0.0013876121258363128, "train/perplexity": 8.252608657167643, "train/grad_norm": 0.1611328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029026.3081627009, "perf/iters_per_sec": 0.9675151387036804, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0335755586624145, "data/tokens_consumed": 123776008192, "data/tokens_consumed_B": 123.776008192, "train/loss_slope": -5.12154839827843e-07} {"step": 59025, "timestamp": 1778258346.4419994, "eos/sharpness": 82.40222930908202, "eos/L0_probe": 1.979084849357605, "eos/L_plus": 2.3485653400421143, "eos/L_minus": 2.433626651763916, "eos/grad_norm": 0.27248507738113403, "eos/embed_grad_frac": 0.033565010875463486, "eos/time_s": 0.612170934677124} {"step": 59025, "timestamp": 1778258347.8221304, "geo/rankme_last": 438.6426086425781, "geo/layer_0/stable_rank_q_proj": 19.395532608032227, "geo/layer_0/stable_rank_k_proj": 16.138378143310547, "geo/layer_0/stable_rank_o_proj": 47.26202392578125, "geo/layer_0/stable_rank_gate_proj": 131.37355041503906, "geo/layer_0/stable_rank_down_proj": 54.75962829589844, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06133450195193291, "geo/layer_0/attn_entropy_mean": 6.15897274017334, "geo/layer_0/attn_entropy_std": 0.40517786145210266, "geo/layer_7/stable_rank_q_proj": 43.527889251708984, "geo/layer_7/stable_rank_k_proj": 41.23790740966797, "geo/layer_7/stable_rank_o_proj": 92.21248626708984, "geo/layer_7/stable_rank_gate_proj": 82.86199951171875, "geo/layer_7/stable_rank_down_proj": 141.062744140625, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4707816541194916, "geo/layer_7/attn_entropy_mean": 4.651055335998535, "geo/layer_7/attn_entropy_std": 0.7992416620254517, "geo/layer_14/stable_rank_q_proj": 51.89434051513672, "geo/layer_14/stable_rank_k_proj": 39.62823486328125, "geo/layer_14/stable_rank_o_proj": 44.03213119506836, "geo/layer_14/stable_rank_gate_proj": 72.07302856445312, "geo/layer_14/stable_rank_down_proj": 129.822998046875, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3940419554710388, "geo/layer_14/attn_entropy_mean": 5.540253639221191, "geo/layer_14/attn_entropy_std": 0.396779865026474, "geo/layer_21/stable_rank_q_proj": 40.53515625, "geo/layer_21/stable_rank_k_proj": 30.17874526977539, "geo/layer_21/stable_rank_o_proj": 70.8261947631836, "geo/layer_21/stable_rank_gate_proj": 67.12068176269531, "geo/layer_21/stable_rank_down_proj": 52.15736389160156, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13931934535503387, "geo/layer_21/attn_entropy_mean": 5.685479640960693, "geo/layer_21/attn_entropy_std": 0.30838361382484436, "geo/layer_27/stable_rank_q_proj": 43.03110885620117, "geo/layer_27/stable_rank_k_proj": 31.728107452392578, "geo/layer_27/stable_rank_o_proj": 116.35363006591797, "geo/layer_27/stable_rank_gate_proj": 80.82080078125, "geo/layer_27/stable_rank_down_proj": 128.9586181640625, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0939689353108406, "geo/layer_27/attn_entropy_mean": 4.201467514038086, "geo/layer_27/attn_entropy_std": 0.7246897220611572, "attnres/final_alpha/block_0": 0.23819702863693237, "attnres/block_norm/0": 1.759093999862671, "attnres/final_alpha/block_1": 0.004630850628018379, "attnres/block_norm/1": 46034.32421875, "attnres/final_alpha/block_2": 0.010326680727303028, "attnres/block_norm/2": 28305.35546875, "attnres/final_alpha/block_3": 0.012333175167441368, "attnres/block_norm/3": 56593.4453125, "attnres/final_alpha/block_4": 0.014511138200759888, "attnres/block_norm/4": 14815.056640625, "attnres/final_alpha/block_5": 0.6074546575546265, "attnres/block_norm/5": 6575.1796875, "attnres/final_alpha/block_6": 0.11254645884037018, "attnres/block_norm/6": 37288.05078125, "geo/tier1_time_s": 1.3621718883514404, "geo/step": 59025.0, "geo/rankme_slope": -0.00010594679277961185} {"step": 59030, "timestamp": 1778258352.9982557, "train/loss": 2.153915858268738, "train/z_loss": 0.0013768418459221721, "train/perplexity": 8.618541391793828, "train/grad_norm": 0.1982421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1701418.03108943, "perf/iters_per_sec": 0.81129933886024, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.232590675354004, "data/tokens_consumed": 123796979712, "data/tokens_consumed_B": 123.796979712, "train/loss_slope": 4.281394588242882e-07} {"step": 59040, "timestamp": 1778258363.3404949, "train/loss": 2.1972278118133546, "train/z_loss": 0.0013570377719588577, "train/perplexity": 9.000029110341295, "train/grad_norm": 0.150390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028989.2866549555, "perf/iters_per_sec": 0.9674974854731347, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0335944175720215, "data/tokens_consumed": 123817951232, "data/tokens_consumed_B": 123.817951232, "train/loss_slope": 3.873921578044218e-06} {"step": 59050, "timestamp": 1778258373.6797433, "grad/layer_0/attn": 0.002511360449716449, "grad/layer_0/mlp": 0.002746932441368699, "grad/layer_0/attn_mlp_ratio": 0.9142417630922698, "grad/layer_4/attn": 0.0029073075857013464, "grad/layer_4/mlp": 0.002593422308564186, "grad/layer_4/attn_mlp_ratio": 1.1210312581940474, "grad/layer_8/attn": 0.006786648649722338, "grad/layer_8/mlp": 0.0037848162464797497, "grad/layer_8/attn_mlp_ratio": 1.7931249573138697, "grad/layer_12/attn": 0.005665235221385956, "grad/layer_12/mlp": 0.007147904951125383, "grad/layer_12/attn_mlp_ratio": 0.7925728141134174, "grad/layer_16/attn": 0.003333126427605748, "grad/layer_16/mlp": 0.0044453865848481655, "grad/layer_16/attn_mlp_ratio": 0.7497944867128231, "grad/layer_20/attn": 0.003128985408693552, "grad/layer_20/mlp": 0.00599968247115612, "grad/layer_20/attn_mlp_ratio": 0.5215251593036541, "grad/layer_24/attn": 0.010193314403295517, "grad/layer_24/mlp": 0.008765969425439835, "grad/layer_24/attn_mlp_ratio": 1.162827953452652, "grad/layer_27/attn": 0.004558703862130642, "grad/layer_27/mlp": 0.008443689905107021, "grad/layer_27/attn_mlp_ratio": 0.5398947449957765} {"step": 59050, "timestamp": 1778258373.694435, "train/loss": 2.1426963806152344, "train/z_loss": 0.0013811165350489318, "train/perplexity": 8.522386272984644, "train/grad_norm": 0.12109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026925.8027663513, "perf/iters_per_sec": 0.9665135396796948, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346466541290282, "data/tokens_consumed": 123838922752, "data/tokens_consumed_B": 123.838922752, "train/loss_slope": 3.0463080916932907e-06} {"step": 59060, "timestamp": 1778258384.0344708, "train/loss": 2.1447184681892395, "train/z_loss": 0.0013808021787554026, "train/perplexity": 8.53963671944698, "train/grad_norm": 0.11474609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029184.050839243, "perf/iters_per_sec": 0.9675903562732902, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0334952116012572, "data/tokens_consumed": 123859894272, "data/tokens_consumed_B": 123.859894272, "train/loss_slope": 2.760810112402411e-06} {"step": 59070, "timestamp": 1778258394.378675, "train/loss": 2.1895425796508787, "train/z_loss": 0.0013779862434603273, "train/perplexity": 8.931126901026241, "train/grad_norm": 0.333984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028449.6094048962, "perf/iters_per_sec": 0.9672401473068696, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0338694095611571, "data/tokens_consumed": 123880865792, "data/tokens_consumed_B": 123.880865792, "train/loss_slope": 5.2345064928894395e-06} {"step": 59080, "timestamp": 1778258404.722566, "train/loss": 2.159549021720886, "train/z_loss": 0.0013786347117275, "train/perplexity": 8.667228045365837, "train/grad_norm": 0.255859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028527.1229745126, "perf/iters_per_sec": 0.9672771086571277, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0338299036026002, "data/tokens_consumed": 123901837312, "data/tokens_consumed_B": 123.901837312, "train/loss_slope": 4.984365945959682e-06} {"step": 59090, "timestamp": 1778258415.0784402, "train/loss": 2.1700335502624513, "train/z_loss": 0.0013911082176491618, "train/perplexity": 8.75857788839833, "train/grad_norm": 0.1357421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026051.5368548606, "perf/iters_per_sec": 0.9660966572069457, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035093116760254, "data/tokens_consumed": 123922808832, "data/tokens_consumed_B": 123.922808832, "train/loss_slope": 6.37510814289055e-06} {"step": 59100, "timestamp": 1778258425.4331017, "grad/layer_0/attn": 0.0029930227901786566, "grad/layer_0/mlp": 0.0031429543159902096, "grad/layer_0/attn_mlp_ratio": 0.9522959591622597, "grad/layer_4/attn": 0.0027389279566705227, "grad/layer_4/mlp": 0.0027146146167069674, "grad/layer_4/attn_mlp_ratio": 1.0089564238394202, "grad/layer_8/attn": 0.004954224918037653, "grad/layer_8/mlp": 0.003896381938830018, "grad/layer_8/attn_mlp_ratio": 1.271493623742624, "grad/layer_12/attn": 0.007984806783497334, "grad/layer_12/mlp": 0.0071875350549817085, "grad/layer_12/attn_mlp_ratio": 1.110924205770741, "grad/layer_16/attn": 0.004102102946490049, "grad/layer_16/mlp": 0.005398752633482218, "grad/layer_16/attn_mlp_ratio": 0.7598241944013199, "grad/layer_20/attn": 0.007993335835635662, "grad/layer_20/mlp": 0.007154444232583046, "grad/layer_20/attn_mlp_ratio": 1.1172545992470868, "grad/layer_24/attn": 0.015011312440037727, "grad/layer_24/mlp": 0.013316909782588482, "grad/layer_24/attn_mlp_ratio": 1.1272369170016412, "grad/layer_27/attn": 0.0054932935163378716, "grad/layer_27/mlp": 0.012138647958636284, "grad/layer_27/attn_mlp_ratio": 0.45254574395783376} {"step": 59100, "timestamp": 1778258426.0322814, "eos/sharpness": 64.12558555603026, "eos/L0_probe": 1.978356122970581, "eos/L_plus": 2.3264503479003906, "eos/L_minus": 2.271517753601074, "eos/grad_norm": 0.2170407623052597, "eos/embed_grad_frac": 0.04945409670472145, "eos/time_s": 0.5962250232696533} {"step": 59100, "timestamp": 1778258426.0556555, "train/loss": 2.166970658302307, "train/z_loss": 0.0013674435555003583, "train/perplexity": 8.731792352143753, "train/grad_norm": 0.2177734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1912096.0779876527, "perf/iters_per_sec": 0.9117584600389732, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0967817068099976, "data/tokens_consumed": 123943780352, "data/tokens_consumed_B": 123.943780352, "train/loss_slope": 8.612411150229846e-06} {"step": 59100, "timestamp": 1778258427.4167283, "geo/rankme_last": 438.5533142089844, "geo/layer_0/stable_rank_q_proj": 19.40582847595215, "geo/layer_0/stable_rank_k_proj": 16.158855438232422, "geo/layer_0/stable_rank_o_proj": 47.26860809326172, "geo/layer_0/stable_rank_gate_proj": 131.21714782714844, "geo/layer_0/stable_rank_down_proj": 54.77813720703125, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06549055129289627, "geo/layer_0/attn_entropy_mean": 6.159454345703125, "geo/layer_0/attn_entropy_std": 0.4065483808517456, "geo/layer_7/stable_rank_q_proj": 43.558685302734375, "geo/layer_7/stable_rank_k_proj": 41.276451110839844, "geo/layer_7/stable_rank_o_proj": 92.09465789794922, "geo/layer_7/stable_rank_gate_proj": 82.77639770507812, "geo/layer_7/stable_rank_down_proj": 141.21726989746094, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4685622453689575, "geo/layer_7/attn_entropy_mean": 4.684847831726074, "geo/layer_7/attn_entropy_std": 0.7926738262176514, "geo/layer_14/stable_rank_q_proj": 51.843955993652344, "geo/layer_14/stable_rank_k_proj": 39.5666389465332, "geo/layer_14/stable_rank_o_proj": 44.03761291503906, "geo/layer_14/stable_rank_gate_proj": 72.11656951904297, "geo/layer_14/stable_rank_down_proj": 129.63902282714844, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38833677768707275, "geo/layer_14/attn_entropy_mean": 5.534135818481445, "geo/layer_14/attn_entropy_std": 0.3959076404571533, "geo/layer_21/stable_rank_q_proj": 40.49863815307617, "geo/layer_21/stable_rank_k_proj": 30.260913848876953, "geo/layer_21/stable_rank_o_proj": 70.76863098144531, "geo/layer_21/stable_rank_gate_proj": 67.14307403564453, "geo/layer_21/stable_rank_down_proj": 52.17249298095703, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14470131695270538, "geo/layer_21/attn_entropy_mean": 5.724458694458008, "geo/layer_21/attn_entropy_std": 0.2886531352996826, "geo/layer_27/stable_rank_q_proj": 43.12848663330078, "geo/layer_27/stable_rank_k_proj": 31.69129180908203, "geo/layer_27/stable_rank_o_proj": 116.54886627197266, "geo/layer_27/stable_rank_gate_proj": 80.89263153076172, "geo/layer_27/stable_rank_down_proj": 129.0316619873047, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09285926818847656, "geo/layer_27/attn_entropy_mean": 4.2591657638549805, "geo/layer_27/attn_entropy_std": 0.7156330347061157, "attnres/final_alpha/block_0": 0.23538844287395477, "attnres/block_norm/0": 1.7591248750686646, "attnres/final_alpha/block_1": 0.004540616180747747, "attnres/block_norm/1": 45845.5859375, "attnres/final_alpha/block_2": 0.010307923890650272, "attnres/block_norm/2": 28365.88671875, "attnres/final_alpha/block_3": 0.01234623696655035, "attnres/block_norm/3": 56708.0546875, "attnres/final_alpha/block_4": 0.014298020862042904, "attnres/block_norm/4": 14862.533203125, "attnres/final_alpha/block_5": 0.6132872104644775, "attnres/block_norm/5": 6542.876953125, "attnres/final_alpha/block_6": 0.10983157157897949, "attnres/block_norm/6": 37337.02734375, "geo/tier1_time_s": 1.3573906421661377, "geo/step": 59100.0, "geo/rankme_slope": -0.00012247303218162266} {"step": 59110, "timestamp": 1778258437.7761388, "train/loss": 2.1493361234664916, "train/z_loss": 0.0013858976191841066, "train/perplexity": 8.579161002537068, "train/grad_norm": 0.330078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1789885.900906526, "perf/iters_per_sec": 0.8534841064961081, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1716679811477662, "data/tokens_consumed": 123964751872, "data/tokens_consumed_B": 123.964751872, "train/loss_slope": 8.492754337632381e-06} {"step": 59120, "timestamp": 1778258448.130509, "train/loss": 2.1576860666275026, "train/z_loss": 0.0013788552372716368, "train/perplexity": 8.651096419645892, "train/grad_norm": 0.2734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026327.2838277663, "perf/iters_per_sec": 0.9662281436098892, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349522590637208, "data/tokens_consumed": 123985723392, "data/tokens_consumed_B": 123.985723392, "train/loss_slope": 6.608671860189845e-06} {"step": 59130, "timestamp": 1778258458.4793353, "train/loss": 2.157599174976349, "train/z_loss": 0.0013918479438871146, "train/perplexity": 8.65034474425133, "train/grad_norm": 0.1748046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027819.7565346411, "perf/iters_per_sec": 0.9669398100541311, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341905355453491, "data/tokens_consumed": 124006694912, "data/tokens_consumed_B": 124.006694912, "train/loss_slope": 6.291512709067372e-06} {"step": 59140, "timestamp": 1778258468.82525, "train/loss": 2.1817741870880125, "train/z_loss": 0.0013634636998176575, "train/perplexity": 8.862015192234576, "train/grad_norm": 0.09521484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028579.332193488, "perf/iters_per_sec": 0.9673020039527359, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0338032960891723, "data/tokens_consumed": 124027666432, "data/tokens_consumed_B": 124.027666432, "train/loss_slope": 9.273366780743146e-06} {"step": 59150, "timestamp": 1778258479.1616905, "grad/layer_0/attn": 0.0029008230194449425, "grad/layer_0/mlp": 0.0029678279533982277, "grad/layer_0/attn_mlp_ratio": 0.9774228719630293, "grad/layer_4/attn": 0.002861778251826763, "grad/layer_4/mlp": 0.00252489629201591, "grad/layer_4/attn_mlp_ratio": 1.1334240331112684, "grad/layer_8/attn": 0.003411274403333664, "grad/layer_8/mlp": 0.003681095317006111, "grad/layer_8/attn_mlp_ratio": 0.926700891146175, "grad/layer_12/attn": 0.006062798202037811, "grad/layer_12/mlp": 0.00673038512468338, "grad/layer_12/attn_mlp_ratio": 0.9008099833279641, "grad/layer_16/attn": 0.004486847668886185, "grad/layer_16/mlp": 0.004838308785110712, "grad/layer_16/attn_mlp_ratio": 0.9273586650686763, "grad/layer_20/attn": 0.00379853299818933, "grad/layer_20/mlp": 0.006764118559658527, "grad/layer_20/attn_mlp_ratio": 0.5615710174991358, "grad/layer_24/attn": 0.019268183037638664, "grad/layer_24/mlp": 0.013396235182881355, "grad/layer_24/attn_mlp_ratio": 1.4383282042127832, "grad/layer_27/attn": 0.012096971273422241, "grad/layer_27/mlp": 0.01438072882592678, "grad/layer_27/attn_mlp_ratio": 0.8411931923431788} {"step": 59150, "timestamp": 1778258479.1760068, "train/loss": 2.1718416452407836, "train/z_loss": 0.001371635647956282, "train/perplexity": 8.774428554532335, "train/grad_norm": 0.271484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027131.1961079123, "perf/iters_per_sec": 0.9666114788569986, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345418214797975, "data/tokens_consumed": 124048637952, "data/tokens_consumed_B": 124.048637952, "train/loss_slope": 1.1518764760520679e-05} {"step": 59160, "timestamp": 1778258489.5211148, "train/loss": 2.111698019504547, "train/z_loss": 0.0013705825200304388, "train/perplexity": 8.262258856893688, "train/grad_norm": 0.28125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028495.078324311, "perf/iters_per_sec": 0.9672618285771899, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0338462352752686, "data/tokens_consumed": 124069609472, "data/tokens_consumed_B": 124.069609472, "train/loss_slope": 7.483246131162165e-06} {"step": 59170, "timestamp": 1778258499.8685155, "train/loss": 2.162091064453125, "train/z_loss": 0.0013928911183029413, "train/perplexity": 8.689288536903767, "train/grad_norm": 0.140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028146.4885959355, "perf/iters_per_sec": 0.9670956080417326, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034023928642273, "data/tokens_consumed": 124090580992, "data/tokens_consumed_B": 124.090580992, "train/loss_slope": 1.120485607081975e-05} {"step": 59175, "timestamp": 1778258505.643008, "eos/sharpness": 80.2791118621826, "eos/L0_probe": 1.9774805307388306, "eos/L_plus": 2.449354887008667, "eos/L_minus": 2.3083972930908203, "eos/grad_norm": 0.2269190549850464, "eos/embed_grad_frac": 0.044996295124292374, "eos/time_s": 0.6131062507629395} {"step": 59175, "timestamp": 1778258507.0222106, "geo/rankme_last": 438.746337890625, "geo/layer_0/stable_rank_q_proj": 19.380828857421875, "geo/layer_0/stable_rank_k_proj": 16.147876739501953, "geo/layer_0/stable_rank_o_proj": 47.264347076416016, "geo/layer_0/stable_rank_gate_proj": 131.17105102539062, "geo/layer_0/stable_rank_down_proj": 54.69197463989258, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06253127753734589, "geo/layer_0/attn_entropy_mean": 6.156047344207764, "geo/layer_0/attn_entropy_std": 0.40725475549697876, "geo/layer_7/stable_rank_q_proj": 43.51278305053711, "geo/layer_7/stable_rank_k_proj": 41.173789978027344, "geo/layer_7/stable_rank_o_proj": 92.24140930175781, "geo/layer_7/stable_rank_gate_proj": 82.82440948486328, "geo/layer_7/stable_rank_down_proj": 141.6951141357422, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4712688624858856, "geo/layer_7/attn_entropy_mean": 4.639342784881592, "geo/layer_7/attn_entropy_std": 0.8189955353736877, "geo/layer_14/stable_rank_q_proj": 51.890052795410156, "geo/layer_14/stable_rank_k_proj": 39.639408111572266, "geo/layer_14/stable_rank_o_proj": 44.00841522216797, "geo/layer_14/stable_rank_gate_proj": 72.08536529541016, "geo/layer_14/stable_rank_down_proj": 129.67755126953125, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37946343421936035, "geo/layer_14/attn_entropy_mean": 5.531600475311279, "geo/layer_14/attn_entropy_std": 0.3865049183368683, "geo/layer_21/stable_rank_q_proj": 40.46599578857422, "geo/layer_21/stable_rank_k_proj": 30.217771530151367, "geo/layer_21/stable_rank_o_proj": 70.7065658569336, "geo/layer_21/stable_rank_gate_proj": 67.20085144042969, "geo/layer_21/stable_rank_down_proj": 52.16156005859375, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.145097017288208, "geo/layer_21/attn_entropy_mean": 5.723018646240234, "geo/layer_21/attn_entropy_std": 0.29521745443344116, "geo/layer_27/stable_rank_q_proj": 43.1544189453125, "geo/layer_27/stable_rank_k_proj": 31.705297470092773, "geo/layer_27/stable_rank_o_proj": 116.39537048339844, "geo/layer_27/stable_rank_gate_proj": 80.91122436523438, "geo/layer_27/stable_rank_down_proj": 128.96151733398438, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0915544256567955, "geo/layer_27/attn_entropy_mean": 4.228466510772705, "geo/layer_27/attn_entropy_std": 0.7235733270645142, "attnres/final_alpha/block_0": 0.23578085005283356, "attnres/block_norm/0": 1.7594246864318848, "attnres/final_alpha/block_1": 0.0045959739945828915, "attnres/block_norm/1": 45775.796875, "attnres/final_alpha/block_2": 0.01019661407917738, "attnres/block_norm/2": 28294.67578125, "attnres/final_alpha/block_3": 0.012159311212599277, "attnres/block_norm/3": 56343.48046875, "attnres/final_alpha/block_4": 0.014070735312998295, "attnres/block_norm/4": 14921.466796875, "attnres/final_alpha/block_5": 0.6140487194061279, "attnres/block_norm/5": 6542.0234375, "attnres/final_alpha/block_6": 0.1091478168964386, "attnres/block_norm/6": 37373.5546875, "geo/tier1_time_s": 1.358816385269165, "geo/step": 59175.0, "geo/rankme_slope": -0.00012076643157262905} {"step": 59180, "timestamp": 1778258512.199538, "train/loss": 2.1468315362930297, "train/z_loss": 0.0013826828100718558, "train/perplexity": 8.557700631833981, "train/grad_norm": 0.10693359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1701391.0450461532, "perf/iters_per_sec": 0.8112864709120527, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2326102256774902, "data/tokens_consumed": 124111552512, "data/tokens_consumed_B": 124.111552512, "train/loss_slope": 7.532115931128888e-06} {"step": 59190, "timestamp": 1778258522.5481765, "train/loss": 2.1848260879516603, "train/z_loss": 0.0013743966934271156, "train/perplexity": 8.889102496913205, "train/grad_norm": 0.115234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028000.8769023633, "perf/iters_per_sec": 0.9670261749755684, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034098172187805, "data/tokens_consumed": 124132524032, "data/tokens_consumed_B": 124.132524032, "train/loss_slope": 6.564929282883381e-06} {"step": 59200, "timestamp": 1778258532.8805146, "grad/layer_0/attn": 0.002594167832285166, "grad/layer_0/mlp": 0.002738616429269314, "grad/layer_0/attn_mlp_ratio": 0.9472548655716001, "grad/layer_4/attn": 0.0021170801483094692, "grad/layer_4/mlp": 0.002547529526054859, "grad/layer_4/attn_mlp_ratio": 0.8310325919891302, "grad/layer_8/attn": 0.005583821330219507, "grad/layer_8/mlp": 0.0037869825027883053, "grad/layer_8/attn_mlp_ratio": 1.4744776820754901, "grad/layer_12/attn": 0.004465183708816767, "grad/layer_12/mlp": 0.0066296979784965515, "grad/layer_12/attn_mlp_ratio": 0.6735123765740714, "grad/layer_16/attn": 0.0040652574971318245, "grad/layer_16/mlp": 0.0052148401737213135, "grad/layer_16/attn_mlp_ratio": 0.7795555153659298, "grad/layer_20/attn": 0.003358903806656599, "grad/layer_20/mlp": 0.0060777137987315655, "grad/layer_20/attn_mlp_ratio": 0.5526590857390656, "grad/layer_24/attn": 0.010297979228198528, "grad/layer_24/mlp": 0.010126665234565735, "grad/layer_24/attn_mlp_ratio": 1.0169171082456967, "grad/layer_27/attn": 0.00415703933686018, "grad/layer_27/mlp": 0.010053041391074657, "grad/layer_27/attn_mlp_ratio": 0.4135106117438095} {"step": 59200, "timestamp": 1778258532.89481, "train/loss": 2.1808238983154298, "train/z_loss": 0.0013776709092780948, "train/perplexity": 8.853597718843616, "train/grad_norm": 0.154296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027884.4587027058, "perf/iters_per_sec": 0.9669706624520806, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341575384140014, "data/tokens_consumed": 124153495552, "data/tokens_consumed_B": 124.153495552, "train/loss_slope": 1.04532071430333e-05} {"step": 59210, "timestamp": 1778258543.2405705, "train/loss": 2.1616302013397215, "train/z_loss": 0.0013665345846675335, "train/perplexity": 8.68528488697354, "train/grad_norm": 0.2041015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028086.586070983, "perf/iters_per_sec": 0.967067044291965, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340544700622558, "data/tokens_consumed": 124174467072, "data/tokens_consumed_B": 124.174467072, "train/loss_slope": 1.0819714412008185e-05} {"step": 59220, "timestamp": 1778258553.5853386, "train/loss": 2.1257341146469115, "train/z_loss": 0.0013857992831617593, "train/perplexity": 8.379046411541609, "train/grad_norm": 0.08837890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028668.5993416123, "perf/iters_per_sec": 0.9673445698459684, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0337578058242798, "data/tokens_consumed": 124195438592, "data/tokens_consumed_B": 124.195438592, "train/loss_slope": 8.4810143650168e-06} {"step": 59230, "timestamp": 1778258563.9260497, "train/loss": 2.169986069202423, "train/z_loss": 0.001363584364298731, "train/perplexity": 8.758162031708585, "train/grad_norm": 0.18359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029126.8487317387, "perf/iters_per_sec": 0.9675630801829046, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0335243463516235, "data/tokens_consumed": 124216410112, "data/tokens_consumed_B": 124.216410112, "train/loss_slope": 8.284095218508492e-06} {"step": 59240, "timestamp": 1778258574.273659, "train/loss": 2.072971510887146, "train/z_loss": 0.0013870354392565786, "train/perplexity": 7.948406838310493, "train/grad_norm": 0.111328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027776.2812410202, "perf/iters_per_sec": 0.9669190794186688, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342127084732056, "data/tokens_consumed": 124237381632, "data/tokens_consumed_B": 124.237381632, "train/loss_slope": 4.518642226676103e-06} {"step": 59250, "timestamp": 1778258585.043975, "grad/layer_0/attn": 0.0027298086788505316, "grad/layer_0/mlp": 0.0030395863577723503, "grad/layer_0/attn_mlp_ratio": 0.8980855510361602, "grad/layer_4/attn": 0.002112897578626871, "grad/layer_4/mlp": 0.0024941302835941315, "grad/layer_4/attn_mlp_ratio": 0.847148004982045, "grad/layer_8/attn": 0.005417514592409134, "grad/layer_8/mlp": 0.0035460423678159714, "grad/layer_8/attn_mlp_ratio": 1.5277635960591902, "grad/layer_12/attn": 0.004719329532235861, "grad/layer_12/mlp": 0.0068807718344032764, "grad/layer_12/attn_mlp_ratio": 0.6858721052269751, "grad/layer_16/attn": 0.0035211751237511635, "grad/layer_16/mlp": 0.004626748152077198, "grad/layer_16/attn_mlp_ratio": 0.7610474856007818, "grad/layer_20/attn": 0.006223258096724749, "grad/layer_20/mlp": 0.00577973946928978, "grad/layer_20/attn_mlp_ratio": 1.076736766789904, "grad/layer_24/attn": 0.005263500846922398, "grad/layer_24/mlp": 0.007513000164180994, "grad/layer_24/attn_mlp_ratio": 0.7005857396301026, "grad/layer_27/attn": 0.007844612933695316, "grad/layer_27/mlp": 0.006546683609485626, "grad/layer_27/attn_mlp_ratio": 1.1982575120177401} {"step": 59250, "timestamp": 1778258585.641785, "eos/sharpness": 16.145038604736325, "eos/L0_probe": 1.9770219326019287, "eos/L_plus": 2.064974784851074, "eos/L_minus": 2.0505194664001465, "eos/grad_norm": 0.08852257579565048, "eos/embed_grad_frac": 0.2923282980918884, "eos/time_s": 0.5950214862823486} {"step": 59250, "timestamp": 1778258585.6613443, "train/loss": 2.1479294538497924, "train/z_loss": 0.0013935671420767903, "train/perplexity": 8.567101441315751, "train/grad_norm": 0.08837890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1842861.7887357292, "perf/iters_per_sec": 0.8787449783018728, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1379865884780884, "data/tokens_consumed": 124258353152, "data/tokens_consumed_B": 124.258353152, "train/loss_slope": 7.7927442559815e-06} {"step": 59250, "timestamp": 1778258587.0293124, "geo/rankme_last": 438.8061828613281, "geo/layer_0/stable_rank_q_proj": 19.370418548583984, "geo/layer_0/stable_rank_k_proj": 16.157217025756836, "geo/layer_0/stable_rank_o_proj": 47.28558349609375, "geo/layer_0/stable_rank_gate_proj": 131.3324432373047, "geo/layer_0/stable_rank_down_proj": 54.757022857666016, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06355811655521393, "geo/layer_0/attn_entropy_mean": 6.16232967376709, "geo/layer_0/attn_entropy_std": 0.4042034447193146, "geo/layer_7/stable_rank_q_proj": 43.47505569458008, "geo/layer_7/stable_rank_k_proj": 41.328277587890625, "geo/layer_7/stable_rank_o_proj": 92.3774642944336, "geo/layer_7/stable_rank_gate_proj": 82.82575988769531, "geo/layer_7/stable_rank_down_proj": 141.69564819335938, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.46792370080947876, "geo/layer_7/attn_entropy_mean": 4.686772346496582, "geo/layer_7/attn_entropy_std": 0.8021125197410583, "geo/layer_14/stable_rank_q_proj": 51.7765007019043, "geo/layer_14/stable_rank_k_proj": 39.54337692260742, "geo/layer_14/stable_rank_o_proj": 44.039451599121094, "geo/layer_14/stable_rank_gate_proj": 72.13529968261719, "geo/layer_14/stable_rank_down_proj": 129.5489044189453, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37979280948638916, "geo/layer_14/attn_entropy_mean": 5.517201900482178, "geo/layer_14/attn_entropy_std": 0.3998144268989563, "geo/layer_21/stable_rank_q_proj": 40.43632888793945, "geo/layer_21/stable_rank_k_proj": 30.27056312561035, "geo/layer_21/stable_rank_o_proj": 70.68437957763672, "geo/layer_21/stable_rank_gate_proj": 67.12898254394531, "geo/layer_21/stable_rank_down_proj": 52.097328186035156, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15013548731803894, "geo/layer_21/attn_entropy_mean": 5.721357345581055, "geo/layer_21/attn_entropy_std": 0.29443565011024475, "geo/layer_27/stable_rank_q_proj": 43.20954513549805, "geo/layer_27/stable_rank_k_proj": 31.679187774658203, "geo/layer_27/stable_rank_o_proj": 116.0870132446289, "geo/layer_27/stable_rank_gate_proj": 80.81039428710938, "geo/layer_27/stable_rank_down_proj": 129.06365966796875, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09302414953708649, "geo/layer_27/attn_entropy_mean": 4.236867904663086, "geo/layer_27/attn_entropy_std": 0.7135182619094849, "attnres/final_alpha/block_0": 0.23535466194152832, "attnres/block_norm/0": 1.759655475616455, "attnres/final_alpha/block_1": 0.00451031094416976, "attnres/block_norm/1": 45903.1953125, "attnres/final_alpha/block_2": 0.010133279487490654, "attnres/block_norm/2": 28447.8984375, "attnres/final_alpha/block_3": 0.0121736079454422, "attnres/block_norm/3": 56673.7421875, "attnres/final_alpha/block_4": 0.014177803881466389, "attnres/block_norm/4": 14881.451171875, "attnres/final_alpha/block_5": 0.6148438453674316, "attnres/block_norm/5": 6522.4150390625, "attnres/final_alpha/block_6": 0.10880646854639053, "attnres/block_norm/6": 37287.1875, "geo/tier1_time_s": 1.3634743690490723, "geo/step": 59250.0, "geo/rankme_slope": -0.0001146753818714986} {"step": 59260, "timestamp": 1778258597.3757591, "train/loss": 2.167326283454895, "train/z_loss": 0.0013662238139659165, "train/perplexity": 8.734898149348423, "train/grad_norm": 0.138671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1790880.6216627362, "perf/iters_per_sec": 0.853958426314705, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1710171937942504, "data/tokens_consumed": 124279324672, "data/tokens_consumed_B": 124.279324672, "train/loss_slope": 1.0306966150983664e-05} {"step": 59270, "timestamp": 1778258607.7328968, "train/loss": 2.143048310279846, "train/z_loss": 0.0013839457649737596, "train/perplexity": 8.525386081357203, "train/grad_norm": 0.0849609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025873.284073534, "perf/iters_per_sec": 0.9660116596572561, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351841926574707, "data/tokens_consumed": 124300296192, "data/tokens_consumed_B": 124.300296192, "train/loss_slope": 1.1790104524673617e-05} {"step": 59280, "timestamp": 1778258618.0806665, "train/loss": 2.1212641477584837, "train/z_loss": 0.0013733893167227507, "train/perplexity": 8.341675936142938, "train/grad_norm": 0.220703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028054.415059127, "perf/iters_per_sec": 0.967051703958095, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034070873260498, "data/tokens_consumed": 124321267712, "data/tokens_consumed_B": 124.321267712, "train/loss_slope": 1.019659717627337e-05} {"step": 59290, "timestamp": 1778258628.4262397, "train/loss": 2.1805436849594115, "train/z_loss": 0.001367419515736401, "train/perplexity": 8.851117170071662, "train/grad_norm": 0.12158203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028201.6244920788, "perf/iters_per_sec": 0.9671218988857645, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0339958190917968, "data/tokens_consumed": 124342239232, "data/tokens_consumed_B": 124.342239232, "train/loss_slope": 1.0566335535607339e-05} {"step": 59300, "timestamp": 1778258638.7633736, "grad/layer_0/attn": 0.0029922346584498882, "grad/layer_0/mlp": 0.0030132948886603117, "grad/layer_0/attn_mlp_ratio": 0.9930108634270198, "grad/layer_4/attn": 0.0018563109915703535, "grad/layer_4/mlp": 0.0025302038993686438, "grad/layer_4/attn_mlp_ratio": 0.7336606028737425, "grad/layer_8/attn": 0.0038694271352142096, "grad/layer_8/mlp": 0.0035457336343824863, "grad/layer_8/attn_mlp_ratio": 1.0912909499359507, "grad/layer_12/attn": 0.004674069117754698, "grad/layer_12/mlp": 0.006533565931022167, "grad/layer_12/attn_mlp_ratio": 0.7153932623565215, "grad/layer_16/attn": 0.004238982684910297, "grad/layer_16/mlp": 0.0041686007753014565, "grad/layer_16/attn_mlp_ratio": 1.0168837966776447, "grad/layer_20/attn": 0.0042103854939341545, "grad/layer_20/mlp": 0.005694705527275801, "grad/layer_20/attn_mlp_ratio": 0.7393508584127279, "grad/layer_24/attn": 0.009602939710021019, "grad/layer_24/mlp": 0.0089867627248168, "grad/layer_24/attn_mlp_ratio": 1.0685649434858409, "grad/layer_27/attn": 0.004197924863547087, "grad/layer_27/mlp": 0.008209412917494774, "grad/layer_27/attn_mlp_ratio": 0.511355057249653} {"step": 59300, "timestamp": 1778258638.7777758, "train/loss": 2.1315505027771, "train/z_loss": 0.0013970202184282244, "train/perplexity": 8.42792420594752, "train/grad_norm": 0.1259765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027464.6693833852, "perf/iters_per_sec": 0.9667704913060118, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343716621398926, "data/tokens_consumed": 124363210752, "data/tokens_consumed_B": 124.363210752, "train/loss_slope": 5.251286876047786e-06} {"step": 59310, "timestamp": 1778258649.122304, "train/loss": 2.1124401330947875, "train/z_loss": 0.0013887041015550495, "train/perplexity": 8.268392667187955, "train/grad_norm": 0.08935546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028776.1233492065, "perf/iters_per_sec": 0.9673958412881882, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0337030172348023, "data/tokens_consumed": 124384182272, "data/tokens_consumed_B": 124.384182272, "train/loss_slope": 1.5145788670587226e-06} {"step": 59320, "timestamp": 1778258659.4737875, "train/loss": 2.127615487575531, "train/z_loss": 0.0013815878774039447, "train/perplexity": 8.394825361017906, "train/grad_norm": 0.11474609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026933.8831824176, "perf/iters_per_sec": 0.966517392722329, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346425294876098, "data/tokens_consumed": 124405153792, "data/tokens_consumed_B": 124.405153792, "train/loss_slope": -2.2793049787996156e-06} {"step": 59325, "timestamp": 1778258665.244014, "eos/sharpness": 75.79047679901122, "eos/L0_probe": 1.976291298866272, "eos/L_plus": 2.4218664169311523, "eos/L_minus": 2.288620948791504, "eos/grad_norm": 0.2238425463438034, "eos/embed_grad_frac": 0.04465794935822487, "eos/time_s": 0.5963401794433594} {"step": 59325, "timestamp": 1778258666.6246197, "geo/rankme_last": 438.5762634277344, "geo/layer_0/stable_rank_q_proj": 19.355876922607422, "geo/layer_0/stable_rank_k_proj": 16.154634475708008, "geo/layer_0/stable_rank_o_proj": 47.27253723144531, "geo/layer_0/stable_rank_gate_proj": 131.28123474121094, "geo/layer_0/stable_rank_down_proj": 54.7615966796875, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0654735118150711, "geo/layer_0/attn_entropy_mean": 6.161509990692139, "geo/layer_0/attn_entropy_std": 0.4097886085510254, "geo/layer_7/stable_rank_q_proj": 43.42338943481445, "geo/layer_7/stable_rank_k_proj": 41.332847595214844, "geo/layer_7/stable_rank_o_proj": 92.34595489501953, "geo/layer_7/stable_rank_gate_proj": 82.88450622558594, "geo/layer_7/stable_rank_down_proj": 141.36656188964844, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4592757523059845, "geo/layer_7/attn_entropy_mean": 4.653202056884766, "geo/layer_7/attn_entropy_std": 0.7985822558403015, "geo/layer_14/stable_rank_q_proj": 51.80381393432617, "geo/layer_14/stable_rank_k_proj": 39.54196548461914, "geo/layer_14/stable_rank_o_proj": 44.0604133605957, "geo/layer_14/stable_rank_gate_proj": 72.06539154052734, "geo/layer_14/stable_rank_down_proj": 129.539794921875, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3917931616306305, "geo/layer_14/attn_entropy_mean": 5.54134464263916, "geo/layer_14/attn_entropy_std": 0.390963077545166, "geo/layer_21/stable_rank_q_proj": 40.43802261352539, "geo/layer_21/stable_rank_k_proj": 30.284456253051758, "geo/layer_21/stable_rank_o_proj": 70.7909927368164, "geo/layer_21/stable_rank_gate_proj": 67.0841293334961, "geo/layer_21/stable_rank_down_proj": 52.04808044433594, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13938292860984802, "geo/layer_21/attn_entropy_mean": 5.7009735107421875, "geo/layer_21/attn_entropy_std": 0.2986548840999603, "geo/layer_27/stable_rank_q_proj": 43.10869598388672, "geo/layer_27/stable_rank_k_proj": 31.643644332885742, "geo/layer_27/stable_rank_o_proj": 116.0082015991211, "geo/layer_27/stable_rank_gate_proj": 80.77721405029297, "geo/layer_27/stable_rank_down_proj": 129.46304321289062, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08957642316818237, "geo/layer_27/attn_entropy_mean": 4.21768856048584, "geo/layer_27/attn_entropy_std": 0.7422035932540894, "attnres/final_alpha/block_0": 0.23600220680236816, "attnres/block_norm/0": 1.7597734928131104, "attnres/final_alpha/block_1": 0.004540507681667805, "attnres/block_norm/1": 46039.6796875, "attnres/final_alpha/block_2": 0.010215416550636292, "attnres/block_norm/2": 28389.140625, "attnres/final_alpha/block_3": 0.012394705787301064, "attnres/block_norm/3": 56369.84375, "attnres/final_alpha/block_4": 0.0140430498868227, "attnres/block_norm/4": 14879.2568359375, "attnres/final_alpha/block_5": 0.6124886274337769, "attnres/block_norm/5": 6540.294921875, "attnres/final_alpha/block_6": 0.11031551659107208, "attnres/block_norm/6": 37166.5703125, "geo/tier1_time_s": 1.359299898147583, "geo/step": 59325.0, "geo/rankme_slope": -0.0001260712488120248} {"step": 59330, "timestamp": 1778258671.803912, "train/loss": 2.151480567455292, "train/z_loss": 0.0013700969866476953, "train/perplexity": 8.597578273122643, "train/grad_norm": 0.16015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1701849.1339869685, "perf/iters_per_sec": 0.8115049047407954, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2322784423828126, "data/tokens_consumed": 124426125312, "data/tokens_consumed_B": 124.426125312, "train/loss_slope": -2.0275543017179156e-07} {"step": 59340, "timestamp": 1778258682.148849, "train/loss": 2.136477828025818, "train/z_loss": 0.0013626801082864404, "train/perplexity": 8.469553806747886, "train/grad_norm": 0.1103515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028236.7933131428, "perf/iters_per_sec": 0.967138668686458, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0339778900146483, "data/tokens_consumed": 124447096832, "data/tokens_consumed_B": 124.447096832, "train/loss_slope": -5.8686618221224725e-06} {"step": 59350, "timestamp": 1778258692.499802, "grad/layer_0/attn": 0.003177658887580037, "grad/layer_0/mlp": 0.002970164641737938, "grad/layer_0/attn_mlp_ratio": 1.0698594737612728, "grad/layer_4/attn": 0.0027137433644384146, "grad/layer_4/mlp": 0.0025773379020392895, "grad/layer_4/attn_mlp_ratio": 1.0529249024734808, "grad/layer_8/attn": 0.006577531341463327, "grad/layer_8/mlp": 0.0035541707184165716, "grad/layer_8/attn_mlp_ratio": 1.8506514395370848, "grad/layer_12/attn": 0.004281261004507542, "grad/layer_12/mlp": 0.007000627461820841, "grad/layer_12/attn_mlp_ratio": 0.6115538880908556, "grad/layer_16/attn": 0.005254773888736963, "grad/layer_16/mlp": 0.004716510884463787, "grad/layer_16/attn_mlp_ratio": 1.114123110503976, "grad/layer_20/attn": 0.0031288706231862307, "grad/layer_20/mlp": 0.006096812896430492, "grad/layer_20/attn_mlp_ratio": 0.5131977354427787, "grad/layer_24/attn": 0.009411598555743694, "grad/layer_24/mlp": 0.010625304654240608, "grad/layer_24/attn_mlp_ratio": 0.8857721047471586, "grad/layer_27/attn": 0.014943215064704418, "grad/layer_27/mlp": 0.00997998658567667, "grad/layer_27/attn_mlp_ratio": 1.497318136320863} {"step": 59350, "timestamp": 1778258692.5143232, "train/loss": 2.1365973949432373, "train/z_loss": 0.0013704904937185346, "train/perplexity": 8.47056654573231, "train/grad_norm": 0.1982421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024233.4780439383, "perf/iters_per_sec": 0.9652297392100994, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0360227823257446, "data/tokens_consumed": 124468068352, "data/tokens_consumed_B": 124.468068352, "train/loss_slope": -5.5115657134561284e-06} {"step": 59360, "timestamp": 1778258702.8823643, "train/loss": 2.1572973012924193, "train/z_loss": 0.0013763399561867117, "train/perplexity": 8.647733826919566, "train/grad_norm": 0.1455078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024250.900393894, "perf/iters_per_sec": 0.9652380468339414, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0360138654708861, "data/tokens_consumed": 124489039872, "data/tokens_consumed_B": 124.489039872, "train/loss_slope": -3.941192120501432e-06} {"step": 59370, "timestamp": 1778258713.2285206, "train/loss": 2.2127012252807616, "train/z_loss": 0.001383418485056609, "train/perplexity": 9.14037328352875, "train/grad_norm": 0.2236328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027946.7337637271, "perf/iters_per_sec": 0.967000357515205, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341257810592652, "data/tokens_consumed": 124510011392, "data/tokens_consumed_B": 124.510011392, "train/loss_slope": 2.3968805467048714e-06} {"step": 59380, "timestamp": 1778258723.576214, "train/loss": 2.128438663482666, "train/z_loss": 0.0013814950943924487, "train/perplexity": 8.401738624025096, "train/grad_norm": 0.1474609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027600.01504041, "perf/iters_per_sec": 0.9668350291444826, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343026161193847, "data/tokens_consumed": 124530982912, "data/tokens_consumed_B": 124.530982912, "train/loss_slope": 2.4865343017761304e-06} {"step": 59390, "timestamp": 1778258733.9221272, "train/loss": 2.1253032445907594, "train/z_loss": 0.001372166257351637, "train/perplexity": 8.375436909012885, "train/grad_norm": 0.1962890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028351.3342709555, "perf/iters_per_sec": 0.9671932860712793, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0339195013046265, "data/tokens_consumed": 124551954432, "data/tokens_consumed_B": 124.551954432, "train/loss_slope": 2.4534352505990385e-06} {"step": 59400, "timestamp": 1778258744.2688956, "grad/layer_0/attn": 0.0027336631901562214, "grad/layer_0/mlp": 0.003048464423045516, "grad/layer_0/attn_mlp_ratio": 0.8967344607393372, "grad/layer_4/attn": 0.0019535082392394543, "grad/layer_4/mlp": 0.0025581871159374714, "grad/layer_4/attn_mlp_ratio": 0.7636298966194204, "grad/layer_8/attn": 0.0037697323132306337, "grad/layer_8/mlp": 0.0036012849304825068, "grad/layer_8/attn_mlp_ratio": 1.0467742156819946, "grad/layer_12/attn": 0.007976616732776165, "grad/layer_12/mlp": 0.00713562173768878, "grad/layer_12/attn_mlp_ratio": 1.1178586693938062, "grad/layer_16/attn": 0.003420236986130476, "grad/layer_16/mlp": 0.0045244451612234116, "grad/layer_16/attn_mlp_ratio": 0.7559461522152758, "grad/layer_20/attn": 0.0037413970567286015, "grad/layer_20/mlp": 0.0064957160502672195, "grad/layer_20/attn_mlp_ratio": 0.5759791484384195, "grad/layer_24/attn": 0.012550764717161655, "grad/layer_24/mlp": 0.010864452458918095, "grad/layer_24/attn_mlp_ratio": 1.1552137255971862, "grad/layer_27/attn": 0.00612886156886816, "grad/layer_27/mlp": 0.009726283140480518, "grad/layer_27/attn_mlp_ratio": 0.6301339799935098} {"step": 59400, "timestamp": 1778258744.8685827, "eos/sharpness": 32.78419971466064, "eos/L0_probe": 1.9770616292953491, "eos/L_plus": 2.1409783363342285, "eos/L_minus": 2.140986919403076, "eos/grad_norm": 0.14114360511302948, "eos/embed_grad_frac": 0.15078029036521912, "eos/time_s": 0.5966658592224121} {"step": 59400, "timestamp": 1778258744.8937762, "train/loss": 2.213050103187561, "train/z_loss": 0.0013652461464516818, "train/perplexity": 9.143562714155863, "train/grad_norm": 0.140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1912258.9442124788, "perf/iters_per_sec": 0.9118361207067865, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0966882944107055, "data/tokens_consumed": 124572925952, "data/tokens_consumed_B": 124.572925952, "train/loss_slope": 5.353119120811018e-06} {"step": 59400, "timestamp": 1778258746.2588217, "geo/rankme_last": 439.10089111328125, "geo/layer_0/stable_rank_q_proj": 19.36908531188965, "geo/layer_0/stable_rank_k_proj": 16.144493103027344, "geo/layer_0/stable_rank_o_proj": 47.2055778503418, "geo/layer_0/stable_rank_gate_proj": 131.52142333984375, "geo/layer_0/stable_rank_down_proj": 54.719482421875, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06258713454008102, "geo/layer_0/attn_entropy_mean": 6.162842750549316, "geo/layer_0/attn_entropy_std": 0.40963292121887207, "geo/layer_7/stable_rank_q_proj": 43.52832794189453, "geo/layer_7/stable_rank_k_proj": 41.385005950927734, "geo/layer_7/stable_rank_o_proj": 92.23136901855469, "geo/layer_7/stable_rank_gate_proj": 82.91663360595703, "geo/layer_7/stable_rank_down_proj": 141.6598663330078, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.46427398920059204, "geo/layer_7/attn_entropy_mean": 4.673046112060547, "geo/layer_7/attn_entropy_std": 0.8075878024101257, "geo/layer_14/stable_rank_q_proj": 51.76850128173828, "geo/layer_14/stable_rank_k_proj": 39.56062316894531, "geo/layer_14/stable_rank_o_proj": 44.01786804199219, "geo/layer_14/stable_rank_gate_proj": 72.06413269042969, "geo/layer_14/stable_rank_down_proj": 129.5670623779297, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3828636705875397, "geo/layer_14/attn_entropy_mean": 5.532003879547119, "geo/layer_14/attn_entropy_std": 0.3945971429347992, "geo/layer_21/stable_rank_q_proj": 40.43378829956055, "geo/layer_21/stable_rank_k_proj": 30.25566864013672, "geo/layer_21/stable_rank_o_proj": 70.85449981689453, "geo/layer_21/stable_rank_gate_proj": 66.94813537597656, "geo/layer_21/stable_rank_down_proj": 52.05983352661133, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14306417107582092, "geo/layer_21/attn_entropy_mean": 5.709527492523193, "geo/layer_21/attn_entropy_std": 0.29928457736968994, "geo/layer_27/stable_rank_q_proj": 43.1268424987793, "geo/layer_27/stable_rank_k_proj": 31.598058700561523, "geo/layer_27/stable_rank_o_proj": 116.00881958007812, "geo/layer_27/stable_rank_gate_proj": 80.78921508789062, "geo/layer_27/stable_rank_down_proj": 129.8399658203125, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08719917386770248, "geo/layer_27/attn_entropy_mean": 4.209249973297119, "geo/layer_27/attn_entropy_std": 0.7174486517906189, "attnres/final_alpha/block_0": 0.23585554957389832, "attnres/block_norm/0": 1.7597436904907227, "attnres/final_alpha/block_1": 0.004536434076726437, "attnres/block_norm/1": 45943.3828125, "attnres/final_alpha/block_2": 0.010172655805945396, "attnres/block_norm/2": 28365.212890625, "attnres/final_alpha/block_3": 0.012131568044424057, "attnres/block_norm/3": 56831.84375, "attnres/final_alpha/block_4": 0.0142043586820364, "attnres/block_norm/4": 14831.95703125, "attnres/final_alpha/block_5": 0.6137253642082214, "attnres/block_norm/5": 6544.15869140625, "attnres/final_alpha/block_6": 0.10937405377626419, "attnres/block_norm/6": 37317.9140625, "geo/tier1_time_s": 1.3612821102142334, "geo/step": 59400.0, "geo/rankme_slope": -0.00010221106020533214} {"step": 59410, "timestamp": 1778258756.6129334, "train/loss": 2.1420176029205322, "train/z_loss": 0.0013882254250347614, "train/perplexity": 8.516603430131308, "train/grad_norm": 0.14453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1790062.7824318714, "perf/iters_per_sec": 0.8535684501799924, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1715522050857543, "data/tokens_consumed": 124593897472, "data/tokens_consumed_B": 124.593897472, "train/loss_slope": 2.8369790113548127e-06} {"step": 59420, "timestamp": 1778258766.9610825, "train/loss": 2.1697824478149412, "train/z_loss": 0.0013688830425962807, "train/perplexity": 8.756378864155584, "train/grad_norm": 0.248046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028021.543698641, "perf/iters_per_sec": 0.9670360296719747, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034087634086609, "data/tokens_consumed": 124614868992, "data/tokens_consumed_B": 124.614868992, "train/loss_slope": 3.750862203987727e-06} {"step": 59430, "timestamp": 1778258777.3076267, "train/loss": 2.165507674217224, "train/z_loss": 0.0013633468304760754, "train/perplexity": 8.719027218768597, "train/grad_norm": 0.1552734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027885.9080037477, "perf/iters_per_sec": 0.9669713535326708, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341567993164062, "data/tokens_consumed": 124635840512, "data/tokens_consumed_B": 124.635840512, "train/loss_slope": 1.5925991903580269e-06} {"step": 59440, "timestamp": 1778258787.6568031, "train/loss": 2.1902305364608763, "train/z_loss": 0.0013659236137755215, "train/perplexity": 8.937273244565787, "train/grad_norm": 0.087890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027311.9598635198, "perf/iters_per_sec": 0.9666976737325286, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034449577331543, "data/tokens_consumed": 124656812032, "data/tokens_consumed_B": 124.656812032, "train/loss_slope": 1.7456639324477775e-06} {"step": 59450, "timestamp": 1778258797.9890788, "grad/layer_0/attn": 0.0032615356612950563, "grad/layer_0/mlp": 0.0033389630261808634, "grad/layer_0/attn_mlp_ratio": 0.9768109254400873, "grad/layer_4/attn": 0.0022980947978794575, "grad/layer_4/mlp": 0.0026299795135855675, "grad/layer_4/attn_mlp_ratio": 0.8738070766816185, "grad/layer_8/attn": 0.011383861303329468, "grad/layer_8/mlp": 0.0037767798639833927, "grad/layer_8/attn_mlp_ratio": 3.014171175416542, "grad/layer_12/attn": 0.004805456846952438, "grad/layer_12/mlp": 0.006721277721226215, "grad/layer_12/attn_mlp_ratio": 0.7149617936899596, "grad/layer_16/attn": 0.0036330504808574915, "grad/layer_16/mlp": 0.0048337457701563835, "grad/layer_16/attn_mlp_ratio": 0.7516014657055096, "grad/layer_20/attn": 0.004246458411216736, "grad/layer_20/mlp": 0.006867892574518919, "grad/layer_20/attn_mlp_ratio": 0.618305878158498, "grad/layer_24/attn": 0.024760441854596138, "grad/layer_24/mlp": 0.01249491423368454, "grad/layer_24/attn_mlp_ratio": 1.9816415857966672, "grad/layer_27/attn": 0.009522516280412674, "grad/layer_27/mlp": 0.011999367736279964, "grad/layer_27/attn_mlp_ratio": 0.7935848296625632} {"step": 59450, "timestamp": 1778258798.003247, "train/loss": 2.153092312812805, "train/z_loss": 0.0013760034693405033, "train/perplexity": 8.61144655305594, "train/grad_norm": 0.244140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028431.5066061805, "perf/iters_per_sec": 0.9672315152197745, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0338786363601684, "data/tokens_consumed": 124677783552, "data/tokens_consumed_B": 124.677783552, "train/loss_slope": 1.9993163929162955e-06} {"step": 59460, "timestamp": 1778258808.342186, "train/loss": 2.1495092630386354, "train/z_loss": 0.001362210197839886, "train/perplexity": 8.580646523399913, "train/grad_norm": 0.181640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029349.8240967286, "perf/iters_per_sec": 0.9676694031222957, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0334107875823975, "data/tokens_consumed": 124698755072, "data/tokens_consumed_B": 124.698755072, "train/loss_slope": 3.018451588238979e-06} {"step": 59470, "timestamp": 1778258818.6862981, "train/loss": 2.1785035133361816, "train/z_loss": 0.0013738100533373654, "train/perplexity": 8.833077779970404, "train/grad_norm": 0.09228515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028373.0372413516, "perf/iters_per_sec": 0.9672036348540075, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033908438682556, "data/tokens_consumed": 124719726592, "data/tokens_consumed_B": 124.719726592, "train/loss_slope": 6.682068873124197e-06} {"step": 59475, "timestamp": 1778258824.4330149, "eos/sharpness": 48.745894432067864, "eos/L0_probe": 1.9757506847381592, "eos/L_plus": 2.2036006450653076, "eos/L_minus": 2.2353596687316895, "eos/grad_norm": 0.12015015631914139, "eos/embed_grad_frac": 0.17017224431037903, "eos/time_s": 0.5841770172119141} {"step": 59475, "timestamp": 1778258825.8098948, "geo/rankme_last": 438.5077209472656, "geo/layer_0/stable_rank_q_proj": 19.374664306640625, "geo/layer_0/stable_rank_k_proj": 16.17722511291504, "geo/layer_0/stable_rank_o_proj": 47.211856842041016, "geo/layer_0/stable_rank_gate_proj": 131.9386444091797, "geo/layer_0/stable_rank_down_proj": 54.803802490234375, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06389356404542923, "geo/layer_0/attn_entropy_mean": 6.159829616546631, "geo/layer_0/attn_entropy_std": 0.41086044907569885, "geo/layer_7/stable_rank_q_proj": 43.55015563964844, "geo/layer_7/stable_rank_k_proj": 41.38132858276367, "geo/layer_7/stable_rank_o_proj": 92.08802795410156, "geo/layer_7/stable_rank_gate_proj": 82.96125793457031, "geo/layer_7/stable_rank_down_proj": 141.46206665039062, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4799562394618988, "geo/layer_7/attn_entropy_mean": 4.646003723144531, "geo/layer_7/attn_entropy_std": 0.8076269030570984, "geo/layer_14/stable_rank_q_proj": 51.80420684814453, "geo/layer_14/stable_rank_k_proj": 39.64827346801758, "geo/layer_14/stable_rank_o_proj": 43.994903564453125, "geo/layer_14/stable_rank_gate_proj": 72.17435455322266, "geo/layer_14/stable_rank_down_proj": 129.95596313476562, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38548359274864197, "geo/layer_14/attn_entropy_mean": 5.510459899902344, "geo/layer_14/attn_entropy_std": 0.3995024263858795, "geo/layer_21/stable_rank_q_proj": 40.417205810546875, "geo/layer_21/stable_rank_k_proj": 30.31348419189453, "geo/layer_21/stable_rank_o_proj": 70.9582290649414, "geo/layer_21/stable_rank_gate_proj": 66.94706726074219, "geo/layer_21/stable_rank_down_proj": 52.11804962158203, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14185453951358795, "geo/layer_21/attn_entropy_mean": 5.688204288482666, "geo/layer_21/attn_entropy_std": 0.30468955636024475, "geo/layer_27/stable_rank_q_proj": 43.140785217285156, "geo/layer_27/stable_rank_k_proj": 31.628450393676758, "geo/layer_27/stable_rank_o_proj": 115.94733428955078, "geo/layer_27/stable_rank_gate_proj": 80.76776885986328, "geo/layer_27/stable_rank_down_proj": 129.8152313232422, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09466614574193954, "geo/layer_27/attn_entropy_mean": 4.201465129852295, "geo/layer_27/attn_entropy_std": 0.7234588265419006, "attnres/final_alpha/block_0": 0.2369670271873474, "attnres/block_norm/0": 1.759684443473816, "attnres/final_alpha/block_1": 0.0045171137899160385, "attnres/block_norm/1": 45862.2578125, "attnres/final_alpha/block_2": 0.010238032788038254, "attnres/block_norm/2": 28465.080078125, "attnres/final_alpha/block_3": 0.012252841144800186, "attnres/block_norm/3": 56610.4296875, "attnres/final_alpha/block_4": 0.014233469031751156, "attnres/block_norm/4": 14859.4755859375, "attnres/final_alpha/block_5": 0.6117407083511353, "attnres/block_norm/5": 6552.2978515625, "attnres/final_alpha/block_6": 0.11005084216594696, "attnres/block_norm/6": 37439.3046875, "geo/tier1_time_s": 1.3587095737457275, "geo/step": 59475.0, "geo/rankme_slope": -0.00011604426927020809} {"step": 59480, "timestamp": 1778258830.9817116, "train/loss": 2.1354029417037963, "train/z_loss": 0.0013748855097219348, "train/perplexity": 8.4604548902339, "train/grad_norm": 0.1630859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1706651.353443641, "perf/iters_per_sec": 0.8137947814195828, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2288110256195068, "data/tokens_consumed": 124740698112, "data/tokens_consumed_B": 124.740698112, "train/loss_slope": 4.335279819524014e-06} {"step": 59490, "timestamp": 1778258841.3201854, "train/loss": 2.166614294052124, "train/z_loss": 0.0013750424957834184, "train/perplexity": 8.728681207892647, "train/grad_norm": 0.115234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029348.7472556152, "perf/iters_per_sec": 0.9676688896444393, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0334113359451294, "data/tokens_consumed": 124761669632, "data/tokens_consumed_B": 124.761669632, "train/loss_slope": 7.520312559057485e-07} {"step": 59500, "timestamp": 1778258851.6583154, "grad/layer_0/attn": 0.003281651297584176, "grad/layer_0/mlp": 0.0032583700958639383, "grad/layer_0/attn_mlp_ratio": 1.007145014323354, "grad/layer_4/attn": 0.002655882854014635, "grad/layer_4/mlp": 0.0026667769998311996, "grad/layer_4/attn_mlp_ratio": 0.9959148269957567, "grad/layer_8/attn": 0.009762691333889961, "grad/layer_8/mlp": 0.003707647090777755, "grad/layer_8/attn_mlp_ratio": 2.6331230647223576, "grad/layer_12/attn": 0.006633215583860874, "grad/layer_12/mlp": 0.007116989698261023, "grad/layer_12/attn_mlp_ratio": 0.9320254450107046, "grad/layer_16/attn": 0.0034919569734483957, "grad/layer_16/mlp": 0.004700027871876955, "grad/layer_16/attn_mlp_ratio": 0.7429651470890893, "grad/layer_20/attn": 0.004003256559371948, "grad/layer_20/mlp": 0.005954748019576073, "grad/layer_20/attn_mlp_ratio": 0.67227974701589, "grad/layer_24/attn": 0.006115958094596863, "grad/layer_24/mlp": 0.008595324121415615, "grad/layer_24/attn_mlp_ratio": 0.711544781447417, "grad/layer_27/attn": 0.009132900275290012, "grad/layer_27/mlp": 0.007093057967722416, "grad/layer_27/attn_mlp_ratio": 1.28758289980031} {"step": 59500, "timestamp": 1778258851.6724112, "train/loss": 2.1637943983078003, "train/z_loss": 0.001373097556643188, "train/perplexity": 8.704101908719016, "train/grad_norm": 0.11328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027321.351676172, "perf/iters_per_sec": 0.9667021520977841, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344447851181031, "data/tokens_consumed": 124782641152, "data/tokens_consumed_B": 124.782641152, "train/loss_slope": 7.185693406643976e-07} {"step": 59500, "timestamp": 1778258858.9192655, "geo/ww_alpha_mean": 7.658881629361085, "geo/ww_alpha_std": 4.604558468749484, "geo/ww_alpha_min": 1.3295137577453404, "geo/ww_alpha_max": 40.365146421159906, "geo/ww_alpha_healthy_frac": 0.16751269035532995, "geo/ww_alpha_by_type/q_proj": 4.028091338496882, "geo/ww_alpha_by_type/k_proj": 4.48519022599679, "geo/ww_alpha_by_type/v_proj": 9.065992970777398, "geo/ww_alpha_by_type/o_proj": 7.554644773055917, "geo/ww_alpha_by_type/gate_proj": 8.198547867556513, "geo/ww_alpha_by_type/up_proj": 12.097123168692363, "geo/ww_alpha_by_type/down_proj": 8.276805699545026, "geo/twonn_id/layer_0": 0.7073232531547546, "geo/twonn_id/layer_7": 3.238839626312256, "geo/twonn_id/layer_14": 4.715884208679199, "geo/twonn_id/layer_21": 7.610687732696533, "geo/twonn_id/layer_27": 4.897127628326416, "geo/tier2_time_s": 7.2384421825408936} {"step": 59500, "timestamp": 1778258859.701829, "eoc/jacobian_sigma/layer_0/attn": 1279.3807373046875, "eoc/jacobian_sigma/layer_0/mlp": 11569.44140625, "eoc/jacobian_sigma/layer_0": 11569.44140625, "eoc/jacobian_sigma/layer_7/attn": 1.1415539979934692, "eoc/jacobian_sigma/layer_7/mlp": 1.7906184196472168, "eoc/jacobian_sigma/layer_7": 1.7906184196472168, "eoc/jacobian_sigma/layer_14/attn": 1.5049980878829956, "eoc/jacobian_sigma/layer_14/mlp": 7.526747703552246, "eoc/jacobian_sigma/layer_14": 7.526747703552246, "eoc/jacobian_sigma/layer_21/attn": 1.0733081102371216, "eoc/jacobian_sigma/layer_21/mlp": 4.472484588623047, "eoc/jacobian_sigma/layer_21": 4.472484588623047, "eoc/jacobian_sigma/layer_27/attn": 3.5910518169403076, "eoc/jacobian_sigma/layer_27/mlp": 27.669532775878906, "eoc/jacobian_sigma/layer_27": 27.669532775878906, "eoc/layer0_sigma": 11569.44140625, "eoc/sigma_max": 27.669532775878906, "eoc/sigma_min": 1.7906184196472168, "eoc/sigma_mean": 10.364845871925354, "eoc/time_s": 0.7763612270355225} {"step": 59510, "timestamp": 1778258870.0646732, "train/loss": 2.1172924280166625, "train/z_loss": 0.0013732525520026683, "train/perplexity": 8.308610843254169, "train/grad_norm": 0.1279296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1140685.054041393, "perf/iters_per_sec": 0.543921019573876, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.8385022163391114, "data/tokens_consumed": 124803612672, "data/tokens_consumed_B": 124.803612672, "train/loss_slope": -2.6992690552471987e-06} {"step": 59520, "timestamp": 1778258880.4042473, "train/loss": 2.1897695660591125, "train/z_loss": 0.0013752656406722963, "train/perplexity": 8.933154375538859, "train/grad_norm": 0.166015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029237.3706012662, "perf/iters_per_sec": 0.9676157811170893, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0334680557250977, "data/tokens_consumed": 124824584192, "data/tokens_consumed_B": 124.824584192, "train/loss_slope": 1.0050832492039732e-06} {"step": 59530, "timestamp": 1778258890.7488985, "train/loss": 2.1627672910690308, "train/z_loss": 0.001385593379382044, "train/perplexity": 8.695166452263125, "train/grad_norm": 0.1728515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028706.0302529086, "perf/iters_per_sec": 0.9673624182953399, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0337387323379517, "data/tokens_consumed": 124845555712, "data/tokens_consumed_B": 124.845555712, "train/loss_slope": 1.5746889263167797e-06} {"step": 59540, "timestamp": 1778258901.0880506, "train/loss": 2.132817840576172, "train/z_loss": 0.0013867080328054727, "train/perplexity": 8.438612003956122, "train/grad_norm": 0.1025390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029350.2454696493, "perf/iters_per_sec": 0.9676696040485617, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0334105730056762, "data/tokens_consumed": 124866527232, "data/tokens_consumed_B": 124.866527232, "train/loss_slope": -1.0442538766434713e-06} {"step": 59550, "timestamp": 1778258911.4281895, "grad/layer_0/attn": 0.002467617392539978, "grad/layer_0/mlp": 0.0027305101975798607, "grad/layer_0/attn_mlp_ratio": 0.9037202294117356, "grad/layer_4/attn": 0.0035157634411007166, "grad/layer_4/mlp": 0.0025272490456700325, "grad/layer_4/attn_mlp_ratio": 1.3911423996816135, "grad/layer_8/attn": 0.005233594682067633, "grad/layer_8/mlp": 0.0034779615234583616, "grad/layer_8/attn_mlp_ratio": 1.5047879328994815, "grad/layer_12/attn": 0.005743983667343855, "grad/layer_12/mlp": 0.0068264794535934925, "grad/layer_12/attn_mlp_ratio": 0.8414269203106591, "grad/layer_16/attn": 0.003379295114427805, "grad/layer_16/mlp": 0.0046918755397200584, "grad/layer_16/attn_mlp_ratio": 0.720243965082892, "grad/layer_20/attn": 0.0030829953029751778, "grad/layer_20/mlp": 0.006008548196405172, "grad/layer_20/attn_mlp_ratio": 0.5131015265067753, "grad/layer_24/attn": 0.007205562200397253, "grad/layer_24/mlp": 0.008379543200135231, "grad/layer_24/attn_mlp_ratio": 0.8598991546807769, "grad/layer_27/attn": 0.005702272057533264, "grad/layer_27/mlp": 0.007073094602674246, "grad/layer_27/attn_mlp_ratio": 0.8061919565953654} {"step": 59550, "timestamp": 1778258912.0325382, "eos/sharpness": 55.8002471923828, "eos/L0_probe": 1.980285406112671, "eos/L_plus": 2.2074341773986816, "eos/L_minus": 2.3111391067504883, "eos/grad_norm": 0.12740348279476166, "eos/embed_grad_frac": 0.12853871285915375, "eos/time_s": 0.6014859676361084} {"step": 59550, "timestamp": 1778258912.0525343, "train/loss": 2.1714931607246397, "train/z_loss": 0.0013808816904202104, "train/perplexity": 8.771371334770885, "train/grad_norm": 0.1279296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1913966.8846775778, "perf/iters_per_sec": 0.9126505301845445, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0957096576690675, "data/tokens_consumed": 124887498752, "data/tokens_consumed_B": 124.887498752, "train/loss_slope": -6.836128778512955e-08} {"step": 59550, "timestamp": 1778258913.4172058, "geo/rankme_last": 439.3865661621094, "geo/layer_0/stable_rank_q_proj": 19.372705459594727, "geo/layer_0/stable_rank_k_proj": 16.193225860595703, "geo/layer_0/stable_rank_o_proj": 47.23345947265625, "geo/layer_0/stable_rank_gate_proj": 132.10638427734375, "geo/layer_0/stable_rank_down_proj": 54.83974838256836, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06562221050262451, "geo/layer_0/attn_entropy_mean": 6.159123420715332, "geo/layer_0/attn_entropy_std": 0.41063162684440613, "geo/layer_7/stable_rank_q_proj": 43.5272102355957, "geo/layer_7/stable_rank_k_proj": 41.467472076416016, "geo/layer_7/stable_rank_o_proj": 91.99112701416016, "geo/layer_7/stable_rank_gate_proj": 83.00872802734375, "geo/layer_7/stable_rank_down_proj": 141.55455017089844, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4606145918369293, "geo/layer_7/attn_entropy_mean": 4.654562950134277, "geo/layer_7/attn_entropy_std": 0.7854697704315186, "geo/layer_14/stable_rank_q_proj": 51.82569885253906, "geo/layer_14/stable_rank_k_proj": 39.65657424926758, "geo/layer_14/stable_rank_o_proj": 44.03231430053711, "geo/layer_14/stable_rank_gate_proj": 72.14360809326172, "geo/layer_14/stable_rank_down_proj": 130.0901641845703, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3889133632183075, "geo/layer_14/attn_entropy_mean": 5.514312744140625, "geo/layer_14/attn_entropy_std": 0.391635537147522, "geo/layer_21/stable_rank_q_proj": 40.56310272216797, "geo/layer_21/stable_rank_k_proj": 30.317920684814453, "geo/layer_21/stable_rank_o_proj": 71.01299285888672, "geo/layer_21/stable_rank_gate_proj": 67.00668334960938, "geo/layer_21/stable_rank_down_proj": 52.111289978027344, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13695473968982697, "geo/layer_21/attn_entropy_mean": 5.6983323097229, "geo/layer_21/attn_entropy_std": 0.29851263761520386, "geo/layer_27/stable_rank_q_proj": 43.213905334472656, "geo/layer_27/stable_rank_k_proj": 31.591880798339844, "geo/layer_27/stable_rank_o_proj": 116.08539581298828, "geo/layer_27/stable_rank_gate_proj": 80.82396697998047, "geo/layer_27/stable_rank_down_proj": 129.814208984375, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09320858865976334, "geo/layer_27/attn_entropy_mean": 4.208133697509766, "geo/layer_27/attn_entropy_std": 0.7316518425941467, "attnres/final_alpha/block_0": 0.2387721687555313, "attnres/block_norm/0": 1.759844183921814, "attnres/final_alpha/block_1": 0.004587358795106411, "attnres/block_norm/1": 46012.90234375, "attnres/final_alpha/block_2": 0.010502488352358341, "attnres/block_norm/2": 28307.96484375, "attnres/final_alpha/block_3": 0.012488621287047863, "attnres/block_norm/3": 56528.05859375, "attnres/final_alpha/block_4": 0.014578614383935928, "attnres/block_norm/4": 14837.974609375, "attnres/final_alpha/block_5": 0.608031153678894, "attnres/block_norm/5": 6602.5654296875, "attnres/final_alpha/block_6": 0.1110396534204483, "attnres/block_norm/6": 37444.328125, "geo/tier1_time_s": 1.360438346862793, "geo/step": 59550.0, "geo/rankme_slope": -8.774277289040616e-05} {"step": 59560, "timestamp": 1778258923.7599676, "train/loss": 2.1826544761657716, "train/z_loss": 0.0013752222759649158, "train/perplexity": 8.869819762050032, "train/grad_norm": 0.11572265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1791913.3537585393, "perf/iters_per_sec": 0.8544508713524529, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1703423023223878, "data/tokens_consumed": 124908470272, "data/tokens_consumed_B": 124.908470272, "train/loss_slope": 3.413480793861101e-06} {"step": 59570, "timestamp": 1778258934.10017, "train/loss": 2.174090051651001, "train/z_loss": 0.0013733970699831843, "train/perplexity": 8.794179231394054, "train/grad_norm": 0.271484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028973.8887265373, "perf/iters_per_sec": 0.9674901431687056, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0336022615432738, "data/tokens_consumed": 124929441792, "data/tokens_consumed_B": 124.929441792, "train/loss_slope": 2.8645140753470584e-06} {"step": 59580, "timestamp": 1778258944.4473412, "train/loss": 2.1520320534706117, "train/z_loss": 0.0013809960335493087, "train/perplexity": 8.60232102496634, "train/grad_norm": 0.3125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028022.291825863, "perf/iters_per_sec": 0.9670363864068332, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340872526168823, "data/tokens_consumed": 124950413312, "data/tokens_consumed_B": 124.950413312, "train/loss_slope": 4.869375942778465e-06} {"step": 59590, "timestamp": 1778258954.7994244, "train/loss": 2.120704698562622, "train/z_loss": 0.0013839739141985774, "train/perplexity": 8.33701049740798, "train/grad_norm": 0.158203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027129.60773498, "perf/iters_per_sec": 0.9666107214617634, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345426321029663, "data/tokens_consumed": 124971384832, "data/tokens_consumed_B": 124.971384832, "train/loss_slope": 3.866122708176133e-06} {"step": 59600, "timestamp": 1778258965.1338792, "grad/layer_0/attn": 0.004610887728631496, "grad/layer_0/mlp": 0.004810697864741087, "grad/layer_0/attn_mlp_ratio": 0.958465437328626, "grad/layer_4/attn": 0.005228642374277115, "grad/layer_4/mlp": 0.003663117066025734, "grad/layer_4/attn_mlp_ratio": 1.4273751390676601, "grad/layer_8/attn": 0.0068502724170684814, "grad/layer_8/mlp": 0.004344434477388859, "grad/layer_8/attn_mlp_ratio": 1.576792628601559, "grad/layer_12/attn": 0.005103910807520151, "grad/layer_12/mlp": 0.008105766028165817, "grad/layer_12/attn_mlp_ratio": 0.6296642077773679, "grad/layer_16/attn": 0.0052523622289299965, "grad/layer_16/mlp": 0.005815543234348297, "grad/layer_16/attn_mlp_ratio": 0.9031593312886196, "grad/layer_20/attn": 0.012751715257763863, "grad/layer_20/mlp": 0.00850873626768589, "grad/layer_20/attn_mlp_ratio": 1.498661458849727, "grad/layer_24/attn": 0.006212944630533457, "grad/layer_24/mlp": 0.009598447009921074, "grad/layer_24/attn_mlp_ratio": 0.6472864370020521, "grad/layer_27/attn": 0.006054907105863094, "grad/layer_27/mlp": 0.008502019569277763, "grad/layer_27/attn_mlp_ratio": 0.7121727943940939} {"step": 59600, "timestamp": 1778258965.1479876, "train/loss": 2.149238181114197, "train/z_loss": 0.001374507916625589, "train/perplexity": 8.578320780475101, "train/grad_norm": 0.1240234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027457.3791771727, "perf/iters_per_sec": 0.9667670150647987, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343753814697265, "data/tokens_consumed": 124992356352, "data/tokens_consumed_B": 124.992356352, "train/loss_slope": 2.8937306734595104e-06} {"step": 59610, "timestamp": 1778258975.5010567, "train/loss": 2.165035533905029, "train/z_loss": 0.001381001842673868, "train/perplexity": 8.714911586189968, "train/grad_norm": 0.10693359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027069.9988327648, "perf/iters_per_sec": 0.9665822977222275, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345730543136598, "data/tokens_consumed": 125013327872, "data/tokens_consumed_B": 125.013327872, "train/loss_slope": 1.3724806070542417e-06} {"step": 59620, "timestamp": 1778258986.2551668, "train/loss": 2.159242033958435, "train/z_loss": 0.0013737349188886582, "train/perplexity": 8.664567720785975, "train/grad_norm": 0.1142578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1951311.3528157007, "perf/iters_per_sec": 0.9304577602461341, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0747398138046265, "data/tokens_consumed": 125034299392, "data/tokens_consumed_B": 125.034299392, "train/loss_slope": 1.7124931840184225e-06} {"step": 59625, "timestamp": 1778258992.0037584, "eos/sharpness": 64.73193168640135, "eos/L0_probe": 1.977607250213623, "eos/L_plus": 2.356428861618042, "eos/L_minus": 2.2461049556732178, "eos/grad_norm": 0.15160545706748962, "eos/embed_grad_frac": 0.09674447774887085, "eos/time_s": 0.5863401889801025} {"step": 59625, "timestamp": 1778258993.377186, "geo/rankme_last": 438.9459533691406, "geo/layer_0/stable_rank_q_proj": 19.360387802124023, "geo/layer_0/stable_rank_k_proj": 16.17530632019043, "geo/layer_0/stable_rank_o_proj": 47.208335876464844, "geo/layer_0/stable_rank_gate_proj": 132.31491088867188, "geo/layer_0/stable_rank_down_proj": 54.9044303894043, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.061824534088373184, "geo/layer_0/attn_entropy_mean": 6.161722183227539, "geo/layer_0/attn_entropy_std": 0.4074065387248993, "geo/layer_7/stable_rank_q_proj": 43.556827545166016, "geo/layer_7/stable_rank_k_proj": 41.461612701416016, "geo/layer_7/stable_rank_o_proj": 91.97663116455078, "geo/layer_7/stable_rank_gate_proj": 82.96269226074219, "geo/layer_7/stable_rank_down_proj": 141.07077026367188, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4618794918060303, "geo/layer_7/attn_entropy_mean": 4.638463497161865, "geo/layer_7/attn_entropy_std": 0.7978945970535278, "geo/layer_14/stable_rank_q_proj": 51.8121337890625, "geo/layer_14/stable_rank_k_proj": 39.66746520996094, "geo/layer_14/stable_rank_o_proj": 44.0595817565918, "geo/layer_14/stable_rank_gate_proj": 72.11156463623047, "geo/layer_14/stable_rank_down_proj": 130.37820434570312, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3903504014015198, "geo/layer_14/attn_entropy_mean": 5.498508453369141, "geo/layer_14/attn_entropy_std": 0.422964483499527, "geo/layer_21/stable_rank_q_proj": 40.62958908081055, "geo/layer_21/stable_rank_k_proj": 30.324363708496094, "geo/layer_21/stable_rank_o_proj": 70.97245788574219, "geo/layer_21/stable_rank_gate_proj": 66.96387481689453, "geo/layer_21/stable_rank_down_proj": 52.12288284301758, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14504320919513702, "geo/layer_21/attn_entropy_mean": 5.70924186706543, "geo/layer_21/attn_entropy_std": 0.30123043060302734, "geo/layer_27/stable_rank_q_proj": 43.0833625793457, "geo/layer_27/stable_rank_k_proj": 31.59011459350586, "geo/layer_27/stable_rank_o_proj": 116.24518585205078, "geo/layer_27/stable_rank_gate_proj": 80.82274627685547, "geo/layer_27/stable_rank_down_proj": 129.98895263671875, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09315645694732666, "geo/layer_27/attn_entropy_mean": 4.198814868927002, "geo/layer_27/attn_entropy_std": 0.7160549759864807, "attnres/final_alpha/block_0": 0.23641185462474823, "attnres/block_norm/0": 1.7599196434020996, "attnres/final_alpha/block_1": 0.004441608674824238, "attnres/block_norm/1": 46200.109375, "attnres/final_alpha/block_2": 0.010132929310202599, "attnres/block_norm/2": 28308.1015625, "attnres/final_alpha/block_3": 0.012274427339434624, "attnres/block_norm/3": 56591.453125, "attnres/final_alpha/block_4": 0.014050371944904327, "attnres/block_norm/4": 14802.869140625, "attnres/final_alpha/block_5": 0.6144996881484985, "attnres/block_norm/5": 6503.75390625, "attnres/final_alpha/block_6": 0.10818910598754883, "attnres/block_norm/6": 37862.640625, "geo/tier1_time_s": 1.353701114654541, "geo/step": 59625.0, "geo/rankme_slope": -0.0001383458461509604} {"step": 59630, "timestamp": 1778258998.5552225, "train/loss": 2.15906183719635, "train/z_loss": 0.0013749997597187757, "train/perplexity": 8.663006534402411, "train/grad_norm": 0.1337890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1705654.437175086, "perf/iters_per_sec": 0.8133194146991186, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2295292377471925, "data/tokens_consumed": 125055270912, "data/tokens_consumed_B": 125.055270912, "train/loss_slope": 9.111355991289077e-07} {"step": 59640, "timestamp": 1778259008.9011538, "train/loss": 2.133375954627991, "train/z_loss": 0.0013830722658894957, "train/perplexity": 8.443323026415007, "train/grad_norm": 0.1357421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028353.0648787094, "perf/iters_per_sec": 0.9671941112893626, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0339186191558838, "data/tokens_consumed": 125076242432, "data/tokens_consumed_B": 125.076242432, "train/loss_slope": -1.9277891095536586e-06} {"step": 59650, "timestamp": 1778259019.2398372, "grad/layer_0/attn": 0.003048537066206336, "grad/layer_0/mlp": 0.0032085361890494823, "grad/layer_0/attn_mlp_ratio": 0.9501332668764841, "grad/layer_4/attn": 0.002500440226867795, "grad/layer_4/mlp": 0.0026644740719348192, "grad/layer_4/attn_mlp_ratio": 0.9384366541080372, "grad/layer_8/attn": 0.004589966032654047, "grad/layer_8/mlp": 0.0035164097789674997, "grad/layer_8/attn_mlp_ratio": 1.3052989243682163, "grad/layer_12/attn": 0.004599923267960548, "grad/layer_12/mlp": 0.007019483484327793, "grad/layer_12/attn_mlp_ratio": 0.6553079315165962, "grad/layer_16/attn": 0.0058336202055215836, "grad/layer_16/mlp": 0.0048820762895047665, "grad/layer_16/attn_mlp_ratio": 1.1949055565911253, "grad/layer_20/attn": 0.006182671524584293, "grad/layer_20/mlp": 0.006818374618887901, "grad/layer_20/attn_mlp_ratio": 0.9067661692833314, "grad/layer_24/attn": 0.01580275222659111, "grad/layer_24/mlp": 0.01241164281964302, "grad/layer_24/attn_mlp_ratio": 1.2732200184055589, "grad/layer_27/attn": 0.008138617500662804, "grad/layer_27/mlp": 0.012690465897321701, "grad/layer_27/attn_mlp_ratio": 0.6413174663862181} {"step": 59650, "timestamp": 1778259019.2543645, "train/loss": 2.1697796106338503, "train/z_loss": 0.001369027723558247, "train/perplexity": 8.75635402075829, "train/grad_norm": 0.224609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026597.0358562418, "perf/iters_per_sec": 0.966356771400567, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348145008087157, "data/tokens_consumed": 125097213952, "data/tokens_consumed_B": 125.097213952, "train/loss_slope": -3.4960753489213006e-06} {"step": 59660, "timestamp": 1778259029.606549, "train/loss": 2.1789196252822878, "train/z_loss": 0.0013745083939284085, "train/perplexity": 8.836754093981577, "train/grad_norm": 0.142578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026933.41610459, "perf/iters_per_sec": 0.966517170002265, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034642767906189, "data/tokens_consumed": 125118185472, "data/tokens_consumed_B": 125.118185472, "train/loss_slope": -2.3231469150637548e-07} {"step": 59670, "timestamp": 1778259039.9527278, "train/loss": 2.119474935531616, "train/z_loss": 0.0013668331899680196, "train/perplexity": 8.32676425162759, "train/grad_norm": 0.22265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028406.153875671, "perf/iters_per_sec": 0.9672194260958057, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0338915586471558, "data/tokens_consumed": 125139156992, "data/tokens_consumed_B": 125.139156992, "train/loss_slope": -2.166396513594696e-06} {"step": 59680, "timestamp": 1778259050.2988503, "train/loss": 2.1495598793029784, "train/z_loss": 0.0013823005021549762, "train/perplexity": 8.581080854664597, "train/grad_norm": 0.2119140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028192.972774887, "perf/iters_per_sec": 0.9671177734255252, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340002298355102, "data/tokens_consumed": 125160128512, "data/tokens_consumed_B": 125.160128512, "train/loss_slope": -9.686205002984956e-07} {"step": 59690, "timestamp": 1778259060.6418152, "train/loss": 2.1177454471588133, "train/z_loss": 0.0013776448438875377, "train/perplexity": 8.312375655712513, "train/grad_norm": 0.1484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028596.314803638, "perf/iters_per_sec": 0.9673101018922987, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033794641494751, "data/tokens_consumed": 125181100032, "data/tokens_consumed_B": 125.181100032, "train/loss_slope": -1.1285044286403633e-06} {"step": 59700, "timestamp": 1778259070.9719646, "grad/layer_0/attn": 0.0025662193074822426, "grad/layer_0/mlp": 0.0029838704504072666, "grad/layer_0/attn_mlp_ratio": 0.8600303746863217, "grad/layer_4/attn": 0.002980687888339162, "grad/layer_4/mlp": 0.002531329868361354, "grad/layer_4/attn_mlp_ratio": 1.1775185082917885, "grad/layer_8/attn": 0.007137353997677565, "grad/layer_8/mlp": 0.0037984070368111134, "grad/layer_8/attn_mlp_ratio": 1.879038697170732, "grad/layer_12/attn": 0.005231440998613834, "grad/layer_12/mlp": 0.007336168549954891, "grad/layer_12/attn_mlp_ratio": 0.7131026082185286, "grad/layer_16/attn": 0.003968336619436741, "grad/layer_16/mlp": 0.004786782432347536, "grad/layer_16/attn_mlp_ratio": 0.8290196165420091, "grad/layer_20/attn": 0.0035158824175596237, "grad/layer_20/mlp": 0.006562806200236082, "grad/layer_20/attn_mlp_ratio": 0.5357285064825315, "grad/layer_24/attn": 0.008906353265047073, "grad/layer_24/mlp": 0.009500859305262566, "grad/layer_24/attn_mlp_ratio": 0.9374260669633536, "grad/layer_27/attn": 0.004652087576687336, "grad/layer_27/mlp": 0.009637795388698578, "grad/layer_27/attn_mlp_ratio": 0.48269208265961266} {"step": 59700, "timestamp": 1778259071.5698473, "eos/sharpness": 72.96373844146727, "eos/L0_probe": 1.9784636497497559, "eos/L_plus": 2.2897074222564697, "eos/L_minus": 2.396857261657715, "eos/grad_norm": 0.18375423550605774, "eos/embed_grad_frac": 0.057981450110673904, "eos/time_s": 0.5950157642364502} {"step": 59700, "timestamp": 1778259071.5881999, "train/loss": 2.1563982486724855, "train/z_loss": 0.0013738994486629963, "train/perplexity": 8.639962553081494, "train/grad_norm": 0.18359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1917087.2921968878, "perf/iters_per_sec": 0.9141384564384879, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.093926191329956, "data/tokens_consumed": 125202071552, "data/tokens_consumed_B": 125.202071552, "train/loss_slope": 7.520253973754034e-07} {"step": 59700, "timestamp": 1778259072.9528115, "geo/rankme_last": 438.772705078125, "geo/layer_0/stable_rank_q_proj": 19.34527015686035, "geo/layer_0/stable_rank_k_proj": 16.15241241455078, "geo/layer_0/stable_rank_o_proj": 47.164939880371094, "geo/layer_0/stable_rank_gate_proj": 131.9320068359375, "geo/layer_0/stable_rank_down_proj": 54.98374557495117, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06969659775495529, "geo/layer_0/attn_entropy_mean": 6.15928316116333, "geo/layer_0/attn_entropy_std": 0.4069492518901825, "geo/layer_7/stable_rank_q_proj": 43.53317642211914, "geo/layer_7/stable_rank_k_proj": 41.433895111083984, "geo/layer_7/stable_rank_o_proj": 92.00436401367188, "geo/layer_7/stable_rank_gate_proj": 82.8978042602539, "geo/layer_7/stable_rank_down_proj": 141.09303283691406, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4588073194026947, "geo/layer_7/attn_entropy_mean": 4.637843132019043, "geo/layer_7/attn_entropy_std": 0.8038478493690491, "geo/layer_14/stable_rank_q_proj": 51.73343276977539, "geo/layer_14/stable_rank_k_proj": 39.684043884277344, "geo/layer_14/stable_rank_o_proj": 44.054534912109375, "geo/layer_14/stable_rank_gate_proj": 72.23199462890625, "geo/layer_14/stable_rank_down_proj": 130.60641479492188, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3868841528892517, "geo/layer_14/attn_entropy_mean": 5.508061408996582, "geo/layer_14/attn_entropy_std": 0.4059836268424988, "geo/layer_21/stable_rank_q_proj": 40.64491653442383, "geo/layer_21/stable_rank_k_proj": 30.248153686523438, "geo/layer_21/stable_rank_o_proj": 70.97937774658203, "geo/layer_21/stable_rank_gate_proj": 66.80194854736328, "geo/layer_21/stable_rank_down_proj": 52.13168716430664, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1411859095096588, "geo/layer_21/attn_entropy_mean": 5.692288398742676, "geo/layer_21/attn_entropy_std": 0.29807984828948975, "geo/layer_27/stable_rank_q_proj": 42.955657958984375, "geo/layer_27/stable_rank_k_proj": 31.650447845458984, "geo/layer_27/stable_rank_o_proj": 116.09467315673828, "geo/layer_27/stable_rank_gate_proj": 80.72274780273438, "geo/layer_27/stable_rank_down_proj": 129.75198364257812, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09378716349601746, "geo/layer_27/attn_entropy_mean": 4.210722923278809, "geo/layer_27/attn_entropy_std": 0.7317851185798645, "attnres/final_alpha/block_0": 0.238242506980896, "attnres/block_norm/0": 1.7599855661392212, "attnres/final_alpha/block_1": 0.004578737076371908, "attnres/block_norm/1": 45952.46875, "attnres/final_alpha/block_2": 0.010347098112106323, "attnres/block_norm/2": 28281.33203125, "attnres/final_alpha/block_3": 0.01225484162569046, "attnres/block_norm/3": 56415.03125, "attnres/final_alpha/block_4": 0.014530971646308899, "attnres/block_norm/4": 14917.490234375, "attnres/final_alpha/block_5": 0.6075576543807983, "attnres/block_norm/5": 6595.42578125, "attnres/final_alpha/block_6": 0.11248819530010223, "attnres/block_norm/6": 37437.953125, "geo/tier1_time_s": 1.3601789474487305, "geo/step": 59700.0, "geo/rankme_slope": -0.00012546674919967988} {"step": 59710, "timestamp": 1778259083.296285, "train/loss": 2.0799915075302122, "train/z_loss": 0.0013894595205783844, "train/perplexity": 8.004400936874578, "train/grad_norm": 0.1240234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1791827.390221839, "perf/iters_per_sec": 0.8544098807439037, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1703984498977662, "data/tokens_consumed": 125223043072, "data/tokens_consumed_B": 125.223043072, "train/loss_slope": -3.763055436574705e-06} {"step": 59720, "timestamp": 1778259093.636288, "train/loss": 2.1856844663619994, "train/z_loss": 0.0013709121383726596, "train/perplexity": 8.896735986326396, "train/grad_norm": 0.12353515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029614.7617350689, "perf/iters_per_sec": 0.9677957352328629, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0332758903503418, "data/tokens_consumed": 125244014592, "data/tokens_consumed_B": 125.244014592, "train/loss_slope": -5.993209584783194e-07} {"step": 59730, "timestamp": 1778259103.981205, "train/loss": 2.1414677619934084, "train/z_loss": 0.001388758164830506, "train/perplexity": 8.511921940160676, "train/grad_norm": 0.1728515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028688.0164544324, "perf/iters_per_sec": 0.967353828646866, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033747911453247, "data/tokens_consumed": 125264986112, "data/tokens_consumed_B": 125.264986112, "train/loss_slope": -1.40594613469831e-06} {"step": 59740, "timestamp": 1778259114.3327818, "train/loss": 2.1508926868438722, "train/z_loss": 0.0013747588149271906, "train/perplexity": 8.592525408936678, "train/grad_norm": 0.1630859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027422.3775710636, "perf/iters_per_sec": 0.9667503249984091, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343932390213013, "data/tokens_consumed": 125285957632, "data/tokens_consumed_B": 125.285957632, "train/loss_slope": -1.0558256114384016e-06} {"step": 59750, "timestamp": 1778259124.669007, "grad/layer_0/attn": 0.0025999988429248333, "grad/layer_0/mlp": 0.002864346606656909, "grad/layer_0/attn_mlp_ratio": 0.9077109404676061, "grad/layer_4/attn": 0.0022539072670042515, "grad/layer_4/mlp": 0.0025199572555720806, "grad/layer_4/attn_mlp_ratio": 0.8944227814095567, "grad/layer_8/attn": 0.010674811899662018, "grad/layer_8/mlp": 0.0034595211036503315, "grad/layer_8/attn_mlp_ratio": 3.0856327425883197, "grad/layer_12/attn": 0.005473449360579252, "grad/layer_12/mlp": 0.006943896412849426, "grad/layer_12/attn_mlp_ratio": 0.7882389016672173, "grad/layer_16/attn": 0.003552157897502184, "grad/layer_16/mlp": 0.004457242786884308, "grad/layer_16/attn_mlp_ratio": 0.7969406172489751, "grad/layer_20/attn": 0.003931150306016207, "grad/layer_20/mlp": 0.005850270390510559, "grad/layer_20/attn_mlp_ratio": 0.6719604354008478, "grad/layer_24/attn": 0.0069489674642682076, "grad/layer_24/mlp": 0.00899290107190609, "grad/layer_24/attn_mlp_ratio": 0.7727169832552869, "grad/layer_27/attn": 0.01014706864953041, "grad/layer_27/mlp": 0.007333246059715748, "grad/layer_27/attn_mlp_ratio": 1.3837076280450038} {"step": 59750, "timestamp": 1778259124.683546, "train/loss": 2.1799480438232424, "train/z_loss": 0.0013779690838418902, "train/perplexity": 8.8458466504091, "train/grad_norm": 0.09228515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027680.6888988423, "perf/iters_per_sec": 0.9668734974378788, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342614650726318, "data/tokens_consumed": 125306929152, "data/tokens_consumed_B": 125.306929152, "train/loss_slope": 3.159760971023568e-06} {"step": 59760, "timestamp": 1778259135.0315492, "train/loss": 2.2038164138793945, "train/z_loss": 0.0013795767095871269, "train/perplexity": 9.059522494633882, "train/grad_norm": 0.306640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028056.0516411944, "perf/iters_per_sec": 0.9670524843412373, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034070038795471, "data/tokens_consumed": 125327900672, "data/tokens_consumed_B": 125.327900672, "train/loss_slope": 7.840700318353379e-06} {"step": 59770, "timestamp": 1778259145.372823, "train/loss": 2.128736209869385, "train/z_loss": 0.0013792222714982926, "train/perplexity": 8.404238902950864, "train/grad_norm": 0.08349609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028928.304740148, "perf/iters_per_sec": 0.9674684070301761, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0336254835128784, "data/tokens_consumed": 125348872192, "data/tokens_consumed_B": 125.348872192, "train/loss_slope": 6.342118944522331e-06} {"step": 59775, "timestamp": 1778259151.145926, "eos/sharpness": 70.27647495269774, "eos/L0_probe": 1.9760030508041382, "eos/L_plus": 2.386363983154297, "eos/L_minus": 2.268406867980957, "eos/grad_norm": 0.1825553923845291, "eos/embed_grad_frac": 0.07474515587091446, "eos/time_s": 0.6080677509307861} {"step": 59775, "timestamp": 1778259152.521901, "geo/rankme_last": 438.4939270019531, "geo/layer_0/stable_rank_q_proj": 19.318416595458984, "geo/layer_0/stable_rank_k_proj": 16.17811393737793, "geo/layer_0/stable_rank_o_proj": 47.24794387817383, "geo/layer_0/stable_rank_gate_proj": 131.82484436035156, "geo/layer_0/stable_rank_down_proj": 54.90583038330078, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.062353167682886124, "geo/layer_0/attn_entropy_mean": 6.163660049438477, "geo/layer_0/attn_entropy_std": 0.40964949131011963, "geo/layer_7/stable_rank_q_proj": 43.48951721191406, "geo/layer_7/stable_rank_k_proj": 41.364871978759766, "geo/layer_7/stable_rank_o_proj": 91.84722900390625, "geo/layer_7/stable_rank_gate_proj": 82.69134521484375, "geo/layer_7/stable_rank_down_proj": 141.02279663085938, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.47485315799713135, "geo/layer_7/attn_entropy_mean": 4.635329246520996, "geo/layer_7/attn_entropy_std": 0.8196037411689758, "geo/layer_14/stable_rank_q_proj": 51.78468704223633, "geo/layer_14/stable_rank_k_proj": 39.68804168701172, "geo/layer_14/stable_rank_o_proj": 44.05475997924805, "geo/layer_14/stable_rank_gate_proj": 72.15019226074219, "geo/layer_14/stable_rank_down_proj": 130.70082092285156, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39515915513038635, "geo/layer_14/attn_entropy_mean": 5.514414310455322, "geo/layer_14/attn_entropy_std": 0.39337462186813354, "geo/layer_21/stable_rank_q_proj": 40.65845489501953, "geo/layer_21/stable_rank_k_proj": 30.282567977905273, "geo/layer_21/stable_rank_o_proj": 70.9446029663086, "geo/layer_21/stable_rank_gate_proj": 66.7656478881836, "geo/layer_21/stable_rank_down_proj": 52.188865661621094, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1398339420557022, "geo/layer_21/attn_entropy_mean": 5.701587677001953, "geo/layer_21/attn_entropy_std": 0.2987237870693207, "geo/layer_27/stable_rank_q_proj": 42.89264678955078, "geo/layer_27/stable_rank_k_proj": 31.7658748626709, "geo/layer_27/stable_rank_o_proj": 116.07635498046875, "geo/layer_27/stable_rank_gate_proj": 80.6619873046875, "geo/layer_27/stable_rank_down_proj": 129.64886474609375, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08696053177118301, "geo/layer_27/attn_entropy_mean": 4.218104362487793, "geo/layer_27/attn_entropy_std": 0.7350801229476929, "attnres/final_alpha/block_0": 0.23685763776302338, "attnres/block_norm/0": 1.759826421737671, "attnres/final_alpha/block_1": 0.004515494219958782, "attnres/block_norm/1": 45996.421875, "attnres/final_alpha/block_2": 0.010085159912705421, "attnres/block_norm/2": 28232.65625, "attnres/final_alpha/block_3": 0.012097110971808434, "attnres/block_norm/3": 56856.21875, "attnres/final_alpha/block_4": 0.014361819252371788, "attnres/block_norm/4": 14847.271484375, "attnres/final_alpha/block_5": 0.6132103800773621, "attnres/block_norm/5": 6599.244140625, "attnres/final_alpha/block_6": 0.10887239128351212, "attnres/block_norm/6": 37709.60546875, "geo/tier1_time_s": 1.3578672409057617, "geo/step": 59775.0, "geo/rankme_slope": -0.00012264700802195878} {"step": 59780, "timestamp": 1778259157.6995037, "train/loss": 2.1236622810363768, "train/z_loss": 0.001376213238108903, "train/perplexity": 8.361704392654124, "train/grad_norm": 0.2109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1702290.4703489528, "perf/iters_per_sec": 0.8117153503174557, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2319589614868165, "data/tokens_consumed": 125369843712, "data/tokens_consumed_B": 125.369843712, "train/loss_slope": 3.5473745719756404e-06} {"step": 59790, "timestamp": 1778259168.0391877, "train/loss": 2.1563764810562134, "train/z_loss": 0.0013793484074994921, "train/perplexity": 8.63977448373895, "train/grad_norm": 0.3046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029279.1763779083, "perf/iters_per_sec": 0.9676357156648199, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033446764945984, "data/tokens_consumed": 125390815232, "data/tokens_consumed_B": 125.390815232, "train/loss_slope": 1.131514733237563e-06} {"step": 59800, "timestamp": 1778259178.9823008, "grad/layer_0/attn": 0.00327493017539382, "grad/layer_0/mlp": 0.0030803082045167685, "grad/layer_0/attn_mlp_ratio": 1.0631825946096598, "grad/layer_4/attn": 0.0019256558734923601, "grad/layer_4/mlp": 0.002667851746082306, "grad/layer_4/attn_mlp_ratio": 0.7218001540528393, "grad/layer_8/attn": 0.004687140230089426, "grad/layer_8/mlp": 0.0035170121118426323, "grad/layer_8/attn_mlp_ratio": 1.3327051337230758, "grad/layer_12/attn": 0.005225642118602991, "grad/layer_12/mlp": 0.007560305763036013, "grad/layer_12/attn_mlp_ratio": 0.6911945380612572, "grad/layer_16/attn": 0.004604591056704521, "grad/layer_16/mlp": 0.005795199424028397, "grad/layer_16/attn_mlp_ratio": 0.7945526357828916, "grad/layer_20/attn": 0.004755824338644743, "grad/layer_20/mlp": 0.008150883950293064, "grad/layer_20/attn_mlp_ratio": 0.5834734378872373, "grad/layer_24/attn": 0.018603121861815453, "grad/layer_24/mlp": 0.013289588503539562, "grad/layer_24/attn_mlp_ratio": 1.3998267679151994, "grad/layer_27/attn": 0.011407101526856422, "grad/layer_27/mlp": 0.0137031814083457, "grad/layer_27/attn_mlp_ratio": 0.8324418325707146} {"step": 59800, "timestamp": 1778259178.9968607, "train/loss": 2.1747774481773376, "train/z_loss": 0.001383836381137371, "train/perplexity": 8.800226397812178, "train/grad_norm": 0.30078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1914706.7716598096, "perf/iters_per_sec": 0.9130033357905434, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.095286250114441, "data/tokens_consumed": 125411786752, "data/tokens_consumed_B": 125.411786752, "train/loss_slope": 1.6931538892300369e-06} {"step": 59810, "timestamp": 1778259189.893597, "train/loss": 2.129475450515747, "train/z_loss": 0.0013770335935987532, "train/perplexity": 8.410453954876134, "train/grad_norm": 0.1455078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1926206.407522234, "perf/iters_per_sec": 0.9184867894755525, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0887472867965697, "data/tokens_consumed": 125432758272, "data/tokens_consumed_B": 125.432758272, "train/loss_slope": 1.751892973702156e-06} {"step": 59820, "timestamp": 1778259200.2448113, "train/loss": 2.128328275680542, "train/z_loss": 0.001380020473152399, "train/perplexity": 8.40081122575204, "train/grad_norm": 0.1806640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027670.4991465248, "perf/iters_per_sec": 0.9668686385853409, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342666625976562, "data/tokens_consumed": 125453729792, "data/tokens_consumed_B": 125.453729792, "train/loss_slope": -8.003435655645516e-07} {"step": 59830, "timestamp": 1778259210.584012, "train/loss": 2.15595737695694, "train/z_loss": 0.0013729013269767165, "train/perplexity": 8.636154277510641, "train/grad_norm": 0.095703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029525.5517836527, "perf/iters_per_sec": 0.9677531966131462, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0333213090896607, "data/tokens_consumed": 125474701312, "data/tokens_consumed_B": 125.474701312, "train/loss_slope": 8.82512987321736e-07} {"step": 59840, "timestamp": 1778259220.9306197, "train/loss": 2.120958709716797, "train/z_loss": 0.0013759174733422696, "train/perplexity": 8.339128460048475, "train/grad_norm": 0.19921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028382.298564063, "perf/iters_per_sec": 0.9672080509968104, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03390371799469, "data/tokens_consumed": 125495672832, "data/tokens_consumed_B": 125.495672832, "train/loss_slope": -1.624417369372074e-06} {"step": 59850, "timestamp": 1778259231.2600422, "grad/layer_0/attn": 0.0028930320404469967, "grad/layer_0/mlp": 0.0028964097145944834, "grad/layer_0/attn_mlp_ratio": 0.9988338065523509, "grad/layer_4/attn": 0.0025131681468337774, "grad/layer_4/mlp": 0.002526502590626478, "grad/layer_4/attn_mlp_ratio": 0.9947221335476217, "grad/layer_8/attn": 0.005142989568412304, "grad/layer_8/mlp": 0.003675927175208926, "grad/layer_8/attn_mlp_ratio": 1.3990999231942083, "grad/layer_12/attn": 0.004460998345166445, "grad/layer_12/mlp": 0.006805171258747578, "grad/layer_12/attn_mlp_ratio": 0.6555306413308961, "grad/layer_16/attn": 0.0034670941531658173, "grad/layer_16/mlp": 0.004646850284188986, "grad/layer_16/attn_mlp_ratio": 0.746117018306138, "grad/layer_20/attn": 0.002842215821146965, "grad/layer_20/mlp": 0.0057595898397266865, "grad/layer_20/attn_mlp_ratio": 0.49347537774222844, "grad/layer_24/attn": 0.005644866731017828, "grad/layer_24/mlp": 0.009325803257524967, "grad/layer_24/attn_mlp_ratio": 0.6052954919388258, "grad/layer_27/attn": 0.006476141978055239, "grad/layer_27/mlp": 0.007481723558157682, "grad/layer_27/attn_mlp_ratio": 0.8655949182236891} {"step": 59850, "timestamp": 1778259231.854671, "eos/sharpness": 9.893608093261717, "eos/L0_probe": 1.9791860580444336, "eos/L_plus": 2.039329767227173, "eos/L_minus": 2.0179784297943115, "eos/grad_norm": 0.10294115543365479, "eos/embed_grad_frac": 0.22637587785720825, "eos/time_s": 0.591843843460083} {"step": 59850, "timestamp": 1778259231.8748505, "train/loss": 2.110754692554474, "train/z_loss": 0.0013805606053210796, "train/perplexity": 8.254468520440359, "train/grad_norm": 0.10302734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1917422.446075898, "perf/iters_per_sec": 0.9142982702617158, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0937349796295166, "data/tokens_consumed": 125516644352, "data/tokens_consumed_B": 125.516644352, "train/loss_slope": -5.312657213196701e-06} {"step": 59850, "timestamp": 1778259233.237998, "geo/rankme_last": 439.62548828125, "geo/layer_0/stable_rank_q_proj": 19.30657196044922, "geo/layer_0/stable_rank_k_proj": 16.14014434814453, "geo/layer_0/stable_rank_o_proj": 47.090126037597656, "geo/layer_0/stable_rank_gate_proj": 131.723876953125, "geo/layer_0/stable_rank_down_proj": 54.835655212402344, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06359316408634186, "geo/layer_0/attn_entropy_mean": 6.160161972045898, "geo/layer_0/attn_entropy_std": 0.4079604744911194, "geo/layer_7/stable_rank_q_proj": 43.488548278808594, "geo/layer_7/stable_rank_k_proj": 41.33848190307617, "geo/layer_7/stable_rank_o_proj": 91.96195983886719, "geo/layer_7/stable_rank_gate_proj": 82.64805603027344, "geo/layer_7/stable_rank_down_proj": 141.25453186035156, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4703734815120697, "geo/layer_7/attn_entropy_mean": 4.647434711456299, "geo/layer_7/attn_entropy_std": 0.8172023296356201, "geo/layer_14/stable_rank_q_proj": 51.89550018310547, "geo/layer_14/stable_rank_k_proj": 39.6058464050293, "geo/layer_14/stable_rank_o_proj": 44.073543548583984, "geo/layer_14/stable_rank_gate_proj": 72.22777557373047, "geo/layer_14/stable_rank_down_proj": 130.40904235839844, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3955601155757904, "geo/layer_14/attn_entropy_mean": 5.505922794342041, "geo/layer_14/attn_entropy_std": 0.39572039246559143, "geo/layer_21/stable_rank_q_proj": 40.56478500366211, "geo/layer_21/stable_rank_k_proj": 30.32456398010254, "geo/layer_21/stable_rank_o_proj": 70.9483413696289, "geo/layer_21/stable_rank_gate_proj": 66.76008605957031, "geo/layer_21/stable_rank_down_proj": 52.134456634521484, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14625199139118195, "geo/layer_21/attn_entropy_mean": 5.6816277503967285, "geo/layer_21/attn_entropy_std": 0.296379417181015, "geo/layer_27/stable_rank_q_proj": 42.85513687133789, "geo/layer_27/stable_rank_k_proj": 31.690608978271484, "geo/layer_27/stable_rank_o_proj": 115.98611450195312, "geo/layer_27/stable_rank_gate_proj": 80.69347381591797, "geo/layer_27/stable_rank_down_proj": 129.50479125976562, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08502095937728882, "geo/layer_27/attn_entropy_mean": 4.211275100708008, "geo/layer_27/attn_entropy_std": 0.728496789932251, "attnres/final_alpha/block_0": 0.2372312843799591, "attnres/block_norm/0": 1.7598676681518555, "attnres/final_alpha/block_1": 0.004541625268757343, "attnres/block_norm/1": 45948.8984375, "attnres/final_alpha/block_2": 0.01018121000379324, "attnres/block_norm/2": 28449.5390625, "attnres/final_alpha/block_3": 0.01210024580359459, "attnres/block_norm/3": 56704.87109375, "attnres/final_alpha/block_4": 0.014423778280615807, "attnres/block_norm/4": 14863.568359375, "attnres/final_alpha/block_5": 0.611009955406189, "attnres/block_norm/5": 6558.875, "attnres/final_alpha/block_6": 0.11051185429096222, "attnres/block_norm/6": 37472.8828125, "geo/tier1_time_s": 1.3596577644348145, "geo/step": 59850.0, "geo/rankme_slope": -9.207493544292717e-05} {"step": 59860, "timestamp": 1778259244.0557287, "train/loss": 2.183767795562744, "train/z_loss": 0.0013741146773099898, "train/perplexity": 8.87970020346176, "train/grad_norm": 0.2138671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1722136.5948840631, "perf/iters_per_sec": 0.821178719942123, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2177617073059082, "data/tokens_consumed": 125537615872, "data/tokens_consumed_B": 125.537615872, "train/loss_slope": -5.21154301394719e-06} {"step": 59870, "timestamp": 1778259254.833038, "train/loss": 2.1214207768440247, "train/z_loss": 0.0013841979322023689, "train/perplexity": 8.342982587543831, "train/grad_norm": 0.240234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1947440.3501317254, "perf/iters_per_sec": 0.9286119223269107, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0768761157989502, "data/tokens_consumed": 125558587392, "data/tokens_consumed_B": 125.558587392, "train/loss_slope": -5.869258276306707e-06} {"step": 59880, "timestamp": 1778259265.204149, "train/loss": 2.1256787300109865, "train/z_loss": 0.0013770515797659755, "train/perplexity": 8.378582353957654, "train/grad_norm": 0.09423828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023457.230179053, "perf/iters_per_sec": 0.9648595953841462, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036420226097107, "data/tokens_consumed": 125579558912, "data/tokens_consumed_B": 125.579558912, "train/loss_slope": -1.0378179949323466e-05} {"step": 59890, "timestamp": 1778259275.581777, "train/loss": 2.182844138145447, "train/z_loss": 0.001366642292123288, "train/perplexity": 8.871502189166652, "train/grad_norm": 0.142578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022191.7384427942, "perf/iters_per_sec": 0.964256161900899, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037068819999695, "data/tokens_consumed": 125600530432, "data/tokens_consumed_B": 125.600530432, "train/loss_slope": -7.207518592454445e-06} {"step": 59900, "timestamp": 1778259285.9455066, "grad/layer_0/attn": 0.004307757597416639, "grad/layer_0/mlp": 0.0035559111274778843, "grad/layer_0/attn_mlp_ratio": 1.2114356410612703, "grad/layer_4/attn": 0.002562409033998847, "grad/layer_4/mlp": 0.0025961657520383596, "grad/layer_4/attn_mlp_ratio": 0.9869974339224096, "grad/layer_8/attn": 0.004693493712693453, "grad/layer_8/mlp": 0.0038280144799500704, "grad/layer_8/attn_mlp_ratio": 1.2260908663400836, "grad/layer_12/attn": 0.004890972748398781, "grad/layer_12/mlp": 0.007288549095392227, "grad/layer_12/attn_mlp_ratio": 0.6710488764335736, "grad/layer_16/attn": 0.007956826128065586, "grad/layer_16/mlp": 0.005488345865160227, "grad/layer_16/attn_mlp_ratio": 1.4497675945676822, "grad/layer_20/attn": 0.0030113921966403723, "grad/layer_20/mlp": 0.006052590440958738, "grad/layer_20/attn_mlp_ratio": 0.49753773632329074, "grad/layer_24/attn": 0.009101654402911663, "grad/layer_24/mlp": 0.008110954426229, "grad/layer_24/attn_mlp_ratio": 1.122143438664212, "grad/layer_27/attn": 0.005808169487863779, "grad/layer_27/mlp": 0.007099191192537546, "grad/layer_27/attn_mlp_ratio": 0.81814522929804} {"step": 59900, "timestamp": 1778259285.9595602, "train/loss": 2.0446548104286193, "train/z_loss": 0.0013844938715919853, "train/perplexity": 7.726490973421256, "train/grad_norm": 0.10986328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022150.3636367691, "perf/iters_per_sec": 0.9642364328559728, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037090039253235, "data/tokens_consumed": 125621501952, "data/tokens_consumed_B": 125.621501952, "train/loss_slope": -1.4858857669023391e-05} {"step": 59910, "timestamp": 1778259296.3422284, "train/loss": 2.1797587394714357, "train/z_loss": 0.0013618670986033977, "train/perplexity": 8.844172251633251, "train/grad_norm": 0.1728515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020776.2414657797, "perf/iters_per_sec": 0.9635812003449343, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0377952575683593, "data/tokens_consumed": 125642473472, "data/tokens_consumed_B": 125.642473472, "train/loss_slope": -1.065953830108957e-05} {"step": 59920, "timestamp": 1778259306.7219293, "train/loss": 2.1052262663841246, "train/z_loss": 0.001384504116140306, "train/perplexity": 8.208960211221152, "train/grad_norm": 0.1376953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021793.1231047758, "perf/iters_per_sec": 0.9640660872959022, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0372732877731323, "data/tokens_consumed": 125663444992, "data/tokens_consumed_B": 125.663444992, "train/loss_slope": -1.3383179658030757e-05} {"step": 59925, "timestamp": 1778259312.4850442, "eos/sharpness": 14.666438102722164, "eos/L0_probe": 1.980464220046997, "eos/L_plus": 2.060910224914551, "eos/L_minus": 2.046682596206665, "eos/grad_norm": 0.10322919487953186, "eos/embed_grad_frac": 0.27301743626594543, "eos/time_s": 0.5855047702789307} {"step": 59925, "timestamp": 1778259313.8633385, "geo/rankme_last": 438.9581298828125, "geo/layer_0/stable_rank_q_proj": 19.338424682617188, "geo/layer_0/stable_rank_k_proj": 16.16230583190918, "geo/layer_0/stable_rank_o_proj": 47.066551208496094, "geo/layer_0/stable_rank_gate_proj": 131.80850219726562, "geo/layer_0/stable_rank_down_proj": 54.74220657348633, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.061635371297597885, "geo/layer_0/attn_entropy_mean": 6.162754058837891, "geo/layer_0/attn_entropy_std": 0.404876708984375, "geo/layer_7/stable_rank_q_proj": 43.514217376708984, "geo/layer_7/stable_rank_k_proj": 41.40778350830078, "geo/layer_7/stable_rank_o_proj": 92.0615234375, "geo/layer_7/stable_rank_gate_proj": 82.5151138305664, "geo/layer_7/stable_rank_down_proj": 141.22341918945312, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.461071640253067, "geo/layer_7/attn_entropy_mean": 4.654810905456543, "geo/layer_7/attn_entropy_std": 0.8145785331726074, "geo/layer_14/stable_rank_q_proj": 51.86902618408203, "geo/layer_14/stable_rank_k_proj": 39.59757614135742, "geo/layer_14/stable_rank_o_proj": 44.00986862182617, "geo/layer_14/stable_rank_gate_proj": 72.17965698242188, "geo/layer_14/stable_rank_down_proj": 130.06109619140625, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3889719247817993, "geo/layer_14/attn_entropy_mean": 5.503974437713623, "geo/layer_14/attn_entropy_std": 0.3957729637622833, "geo/layer_21/stable_rank_q_proj": 40.51445388793945, "geo/layer_21/stable_rank_k_proj": 30.30623435974121, "geo/layer_21/stable_rank_o_proj": 70.92076110839844, "geo/layer_21/stable_rank_gate_proj": 66.702880859375, "geo/layer_21/stable_rank_down_proj": 52.02048873901367, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14416247606277466, "geo/layer_21/attn_entropy_mean": 5.705621719360352, "geo/layer_21/attn_entropy_std": 0.30389171838760376, "geo/layer_27/stable_rank_q_proj": 42.86611557006836, "geo/layer_27/stable_rank_k_proj": 31.618066787719727, "geo/layer_27/stable_rank_o_proj": 115.9491958618164, "geo/layer_27/stable_rank_gate_proj": 80.62662506103516, "geo/layer_27/stable_rank_down_proj": 129.21131896972656, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09192010760307312, "geo/layer_27/attn_entropy_mean": 4.192787170410156, "geo/layer_27/attn_entropy_std": 0.728084921836853, "attnres/final_alpha/block_0": 0.23686105012893677, "attnres/block_norm/0": 1.7599493265151978, "attnres/final_alpha/block_1": 0.004516290500760078, "attnres/block_norm/1": 46032.22265625, "attnres/final_alpha/block_2": 0.010205775499343872, "attnres/block_norm/2": 28467.84375, "attnres/final_alpha/block_3": 0.012235255911946297, "attnres/block_norm/3": 56737.0078125, "attnres/final_alpha/block_4": 0.014307240955531597, "attnres/block_norm/4": 14880.6044921875, "attnres/final_alpha/block_5": 0.611724853515625, "attnres/block_norm/5": 6543.2216796875, "attnres/final_alpha/block_6": 0.11014948785305023, "attnres/block_norm/6": 37691.3828125, "geo/tier1_time_s": 1.3600561618804932, "geo/step": 59925.0, "geo/rankme_slope": -6.636345944627851e-05} {"step": 59930, "timestamp": 1778259319.0499175, "train/loss": 2.1358192920684815, "train/z_loss": 0.0013867256580851971, "train/perplexity": 8.463978137114518, "train/grad_norm": 0.173828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1702109.1653870668, "perf/iters_per_sec": 0.8116288973746618, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2320901870727539, "data/tokens_consumed": 125684416512, "data/tokens_consumed_B": 125.684416512, "train/loss_slope": -1.4666544643565714e-05} {"step": 59940, "timestamp": 1778259329.4233294, "train/loss": 2.095831298828125, "train/z_loss": 0.0013879643171094357, "train/perplexity": 8.132198448121747, "train/grad_norm": 0.126953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022559.9558076703, "perf/iters_per_sec": 0.9644317416227676, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0368800163269043, "data/tokens_consumed": 125705388032, "data/tokens_consumed_B": 125.705388032, "train/loss_slope": -1.5794282448817486e-05} {"step": 59950, "timestamp": 1778259339.7962232, "grad/layer_0/attn": 0.0024906271137297153, "grad/layer_0/mlp": 0.0027548468206077814, "grad/layer_0/attn_mlp_ratio": 0.9040891147520556, "grad/layer_4/attn": 0.0021143986377865076, "grad/layer_4/mlp": 0.0024509408976882696, "grad/layer_4/attn_mlp_ratio": 0.8626885101603061, "grad/layer_8/attn": 0.005734643898904324, "grad/layer_8/mlp": 0.003513561561703682, "grad/layer_8/attn_mlp_ratio": 1.632145512460899, "grad/layer_12/attn": 0.004550520330667496, "grad/layer_12/mlp": 0.007299711462110281, "grad/layer_12/attn_mlp_ratio": 0.623383580563282, "grad/layer_16/attn": 0.0033880111295729876, "grad/layer_16/mlp": 0.004587616305798292, "grad/layer_16/attn_mlp_ratio": 0.7385122969938981, "grad/layer_20/attn": 0.0065118661150336266, "grad/layer_20/mlp": 0.006515298504382372, "grad/layer_20/attn_mlp_ratio": 0.9994731646917248, "grad/layer_24/attn": 0.01714271306991577, "grad/layer_24/mlp": 0.012196882627904415, "grad/layer_24/attn_mlp_ratio": 1.4054995405257222, "grad/layer_27/attn": 0.008807657286524773, "grad/layer_27/mlp": 0.013084125705063343, "grad/layer_27/attn_mlp_ratio": 0.6731559614870375} {"step": 59950, "timestamp": 1778259339.8102865, "train/loss": 2.0795138001441957, "train/z_loss": 0.001388056855648756, "train/perplexity": 8.000578088600523, "train/grad_norm": 0.2578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020362.1739298906, "perf/iters_per_sec": 0.9633837575578168, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.038007950782776, "data/tokens_consumed": 125726359552, "data/tokens_consumed_B": 125.726359552, "train/loss_slope": -2.1000445371914437e-05} {"step": 59960, "timestamp": 1778259350.1836572, "train/loss": 2.1480796337127686, "train/z_loss": 0.0013751751626841723, "train/perplexity": 8.568388144052312, "train/grad_norm": 0.1259765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022938.6355340409, "perf/iters_per_sec": 0.9646123101873593, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0366859197616578, "data/tokens_consumed": 125747331072, "data/tokens_consumed_B": 125.747331072, "train/loss_slope": -2.139673436185164e-05} {"step": 59970, "timestamp": 1778259360.5636024, "train/loss": 2.1468430757522583, "train/z_loss": 0.0013833534438163042, "train/perplexity": 8.557799383641283, "train/grad_norm": 0.1787109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021707.8521024643, "perf/iters_per_sec": 0.9640254269134828, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373170375823975, "data/tokens_consumed": 125768302592, "data/tokens_consumed_B": 125.768302592, "train/loss_slope": -2.4321282922130635e-05} {"step": 59980, "timestamp": 1778259370.9375005, "train/loss": 2.144839572906494, "train/z_loss": 0.001382739411201328, "train/perplexity": 8.540670972362534, "train/grad_norm": 0.09619140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022699.3451816975, "perf/iters_per_sec": 0.9644982076557624, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0368085622787475, "data/tokens_consumed": 125789274112, "data/tokens_consumed_B": 125.789274112, "train/loss_slope": -1.977865268902982e-05} {"step": 59990, "timestamp": 1778259381.3250031, "train/loss": 2.1852825403213503, "train/z_loss": 0.0013629290624521673, "train/perplexity": 8.893160874970016, "train/grad_norm": 0.1259765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020214.8010882312, "perf/iters_per_sec": 0.9633134847108036, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0380836725234985, "data/tokens_consumed": 125810245632, "data/tokens_consumed_B": 125.810245632, "train/loss_slope": -1.584321596298417e-05} {"step": 60000, "timestamp": 1778259391.6717603, "grad/layer_0/attn": 0.0028240566607564688, "grad/layer_0/mlp": 0.0028651407919824123, "grad/layer_0/attn_mlp_ratio": 0.9856606593620193, "grad/layer_4/attn": 0.0022841545287519693, "grad/layer_4/mlp": 0.0026533990167081356, "grad/layer_4/attn_mlp_ratio": 0.8608409169841515, "grad/layer_8/attn": 0.0035504966508597136, "grad/layer_8/mlp": 0.0037136967293918133, "grad/layer_8/attn_mlp_ratio": 0.956054523018556, "grad/layer_12/attn": 0.004324833396822214, "grad/layer_12/mlp": 0.006671315990388393, "grad/layer_12/attn_mlp_ratio": 0.6482728952167561, "grad/layer_16/attn": 0.003787980880588293, "grad/layer_16/mlp": 0.004518161527812481, "grad/layer_16/attn_mlp_ratio": 0.8383898568990077, "grad/layer_20/attn": 0.0032289153896272182, "grad/layer_20/mlp": 0.00558239221572876, "grad/layer_20/attn_mlp_ratio": 0.5784106897198062, "grad/layer_24/attn": 0.007303673308342695, "grad/layer_24/mlp": 0.008353540673851967, "grad/layer_24/attn_mlp_ratio": 0.8743206630659494, "grad/layer_27/attn": 0.009367221966385841, "grad/layer_27/mlp": 0.00673870462924242, "grad/layer_27/attn_mlp_ratio": 1.3900626815917672} {"step": 60000, "timestamp": 1778259392.2572124, "eos/sharpness": 33.76731872558593, "eos/L0_probe": 1.9831122159957886, "eos/L_plus": 2.1190476417541504, "eos/L_minus": 2.184849977493286, "eos/grad_norm": 0.11473901569843292, "eos/embed_grad_frac": 0.16096454858779907, "eos/time_s": 0.5825259685516357} {"step": 60000, "timestamp": 1778259392.277541, "train/loss": 2.187393856048584, "train/z_loss": 0.0013898335280828177, "train/perplexity": 8.911956980664732, "train/grad_norm": 0.11474609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1915618.302979184, "perf/iters_per_sec": 0.913437987794487, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.094765067100525, "data/tokens_consumed": 125831217152, "data/tokens_consumed_B": 125.831217152, "train/loss_slope": -1.7131957155142304e-05} {"step": 60000, "timestamp": 1778259393.6391993, "geo/rankme_last": 438.08111572265625, "geo/layer_0/stable_rank_q_proj": 19.33490562438965, "geo/layer_0/stable_rank_k_proj": 16.175642013549805, "geo/layer_0/stable_rank_o_proj": 47.049224853515625, "geo/layer_0/stable_rank_gate_proj": 131.9489288330078, "geo/layer_0/stable_rank_down_proj": 54.77449035644531, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06239519640803337, "geo/layer_0/attn_entropy_mean": 6.163424015045166, "geo/layer_0/attn_entropy_std": 0.40749555826187134, "geo/layer_7/stable_rank_q_proj": 43.56654739379883, "geo/layer_7/stable_rank_k_proj": 41.398948669433594, "geo/layer_7/stable_rank_o_proj": 92.04034423828125, "geo/layer_7/stable_rank_gate_proj": 82.43417358398438, "geo/layer_7/stable_rank_down_proj": 141.3566436767578, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4563427269458771, "geo/layer_7/attn_entropy_mean": 4.671931266784668, "geo/layer_7/attn_entropy_std": 0.8044164180755615, "geo/layer_14/stable_rank_q_proj": 51.9456901550293, "geo/layer_14/stable_rank_k_proj": 39.696807861328125, "geo/layer_14/stable_rank_o_proj": 43.99247360229492, "geo/layer_14/stable_rank_gate_proj": 72.13871765136719, "geo/layer_14/stable_rank_down_proj": 129.8414764404297, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.40934085845947266, "geo/layer_14/attn_entropy_mean": 5.537071228027344, "geo/layer_14/attn_entropy_std": 0.40809059143066406, "geo/layer_21/stable_rank_q_proj": 40.45829391479492, "geo/layer_21/stable_rank_k_proj": 30.220605850219727, "geo/layer_21/stable_rank_o_proj": 70.93762969970703, "geo/layer_21/stable_rank_gate_proj": 66.76223754882812, "geo/layer_21/stable_rank_down_proj": 51.98698806762695, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1491909921169281, "geo/layer_21/attn_entropy_mean": 5.7230939865112305, "geo/layer_21/attn_entropy_std": 0.297567754983902, "geo/layer_27/stable_rank_q_proj": 43.0133056640625, "geo/layer_27/stable_rank_k_proj": 31.69802474975586, "geo/layer_27/stable_rank_o_proj": 115.907958984375, "geo/layer_27/stable_rank_gate_proj": 80.58390045166016, "geo/layer_27/stable_rank_down_proj": 129.00979614257812, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09659116715192795, "geo/layer_27/attn_entropy_mean": 4.213845252990723, "geo/layer_27/attn_entropy_std": 0.716903030872345, "attnres/final_alpha/block_0": 0.23882989585399628, "attnres/block_norm/0": 1.7601385116577148, "attnres/final_alpha/block_1": 0.004588743671774864, "attnres/block_norm/1": 46123.55078125, "attnres/final_alpha/block_2": 0.010371631011366844, "attnres/block_norm/2": 28450.8671875, "attnres/final_alpha/block_3": 0.012333669699728489, "attnres/block_norm/3": 57135.32421875, "attnres/final_alpha/block_4": 0.014402134343981743, "attnres/block_norm/4": 14889.595703125, "attnres/final_alpha/block_5": 0.6086505651473999, "attnres/block_norm/5": 6563.92138671875, "attnres/final_alpha/block_6": 0.1108233854174614, "attnres/block_norm/6": 37533.90625, "geo/tier1_time_s": 1.3574638366699219, "geo/step": 60000.0, "geo/rankme_slope": -8.681746135954382e-05} {"step": 60000, "timestamp": 1778259400.827251, "geo/ww_alpha_mean": 7.940638673507406, "geo/ww_alpha_std": 5.476707793057323, "geo/ww_alpha_min": 1.3333726169553843, "geo/ww_alpha_max": 42.857975498826406, "geo/ww_alpha_healthy_frac": 0.16751269035532995, "geo/ww_alpha_by_type/q_proj": 4.01962183378327, "geo/ww_alpha_by_type/k_proj": 4.467365765012333, "geo/ww_alpha_by_type/v_proj": 8.323439128058395, "geo/ww_alpha_by_type/o_proj": 9.979989210158069, "geo/ww_alpha_by_type/gate_proj": 8.10861646428447, "geo/ww_alpha_by_type/up_proj": 12.54243104099854, "geo/ww_alpha_by_type/down_proj": 8.248003630450208, "geo/twonn_id/layer_0": 0.7582321763038635, "geo/twonn_id/layer_7": 3.074077606201172, "geo/twonn_id/layer_14": 4.554981231689453, "geo/twonn_id/layer_21": 7.807227611541748, "geo/twonn_id/layer_27": 5.472780227661133, "geo/tier2_time_s": 7.1807849407196045} {"step": 60000, "timestamp": 1778259401.6585507, "eoc/jacobian_sigma/layer_0/attn": 1233.431640625, "eoc/jacobian_sigma/layer_0/mlp": 11607.69921875, "eoc/jacobian_sigma/layer_0": 11607.69921875, "eoc/jacobian_sigma/layer_7/attn": 1.155043125152588, "eoc/jacobian_sigma/layer_7/mlp": 1.8161064386367798, "eoc/jacobian_sigma/layer_7": 1.8161064386367798, "eoc/jacobian_sigma/layer_14/attn": 1.4988908767700195, "eoc/jacobian_sigma/layer_14/mlp": 6.128586292266846, "eoc/jacobian_sigma/layer_14": 6.128586292266846, "eoc/jacobian_sigma/layer_21/attn": 1.0829434394836426, "eoc/jacobian_sigma/layer_21/mlp": 4.504029273986816, "eoc/jacobian_sigma/layer_21": 4.504029273986816, "eoc/jacobian_sigma/layer_27/attn": 3.232503890991211, "eoc/jacobian_sigma/layer_27/mlp": 26.74746322631836, "eoc/jacobian_sigma/layer_27": 26.74746322631836, "eoc/layer0_sigma": 11607.69921875, "eoc/sigma_max": 26.74746322631836, "eoc/sigma_min": 1.8161064386367798, "eoc/sigma_mean": 9.7990463078022, "eoc/time_s": 0.8240954875946045} {"step": 60010, "timestamp": 1778259412.6694467, "train/loss": 2.1171011686325074, "train/z_loss": 0.0013849779847078026, "train/perplexity": 8.30702189541654, "train/grad_norm": 0.1748046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1028631.7391032992, "perf/iters_per_sec": 0.4904898353115555, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 2.038778233528137, "data/tokens_consumed": 125852188672, "data/tokens_consumed_B": 125.852188672, "train/loss_slope": -1.986707550893476e-05} {"step": 60020, "timestamp": 1778259423.0178332, "train/loss": 2.126597547531128, "train/z_loss": 0.001385480584576726, "train/perplexity": 8.386284280008867, "train/grad_norm": 0.296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028141.7654736633, "perf/iters_per_sec": 0.9670933558815304, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340263366699218, "data/tokens_consumed": 125873160192, "data/tokens_consumed_B": 125.873160192, "train/loss_slope": -2.3642122627484825e-05} {"step": 60030, "timestamp": 1778259433.360216, "train/loss": 2.0901869177818297, "train/z_loss": 0.0014029678888618947, "train/perplexity": 8.086426519959076, "train/grad_norm": 0.15234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028772.6606926755, "perf/iters_per_sec": 0.9673941901648881, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0337047815322875, "data/tokens_consumed": 125894131712, "data/tokens_consumed_B": 125.894131712, "train/loss_slope": -2.6969975943040228e-05} {"step": 60040, "timestamp": 1778259443.701429, "train/loss": 2.1622169971466065, "train/z_loss": 0.001377913635224104, "train/perplexity": 8.690382871318453, "train/grad_norm": 0.0986328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029349.9645543494, "perf/iters_per_sec": 0.9676694700977084, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0334107160568238, "data/tokens_consumed": 125915103232, "data/tokens_consumed_B": 125.915103232, "train/loss_slope": -2.3317365384552422e-05} {"step": 60050, "timestamp": 1778259454.049052, "grad/layer_0/attn": 0.00316039239987731, "grad/layer_0/mlp": 0.0032768186647444963, "grad/layer_0/attn_mlp_ratio": 0.9644696966094599, "grad/layer_4/attn": 0.0022589610889554024, "grad/layer_4/mlp": 0.0025647259317338467, "grad/layer_4/attn_mlp_ratio": 0.8807806607820265, "grad/layer_8/attn": 0.005611634813249111, "grad/layer_8/mlp": 0.0035830095876008272, "grad/layer_8/attn_mlp_ratio": 1.5661790791882133, "grad/layer_12/attn": 0.006063712760806084, "grad/layer_12/mlp": 0.007178229279816151, "grad/layer_12/attn_mlp_ratio": 0.8447365554876415, "grad/layer_16/attn": 0.003806917229667306, "grad/layer_16/mlp": 0.004865063354372978, "grad/layer_16/attn_mlp_ratio": 0.7825010434849409, "grad/layer_20/attn": 0.005171956494450569, "grad/layer_20/mlp": 0.006741643417626619, "grad/layer_20/attn_mlp_ratio": 0.7671655258733326, "grad/layer_24/attn": 0.016009103506803513, "grad/layer_24/mlp": 0.011657972820103168, "grad/layer_24/attn_mlp_ratio": 1.3732321747974894, "grad/layer_27/attn": 0.00667672511190176, "grad/layer_27/mlp": 0.012531545013189316, "grad/layer_27/attn_mlp_ratio": 0.5327934465858148} {"step": 60050, "timestamp": 1778259454.063629, "train/loss": 2.199833393096924, "train/z_loss": 0.0013676953501999378, "train/perplexity": 9.023509995135335, "train/grad_norm": 0.2578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025336.8951365743, "perf/iters_per_sec": 0.9657558894808647, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354583501815795, "data/tokens_consumed": 125936074752, "data/tokens_consumed_B": 125.936074752, "train/loss_slope": -2.0693035487687916e-05} {"step": 60060, "timestamp": 1778259464.4098015, "train/loss": 2.126204490661621, "train/z_loss": 0.0013733968255110085, "train/perplexity": 8.382988641092162, "train/grad_norm": 0.3828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028080.8344924853, "perf/iters_per_sec": 0.967064301725619, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340574026107787, "data/tokens_consumed": 125957046272, "data/tokens_consumed_B": 125.957046272, "train/loss_slope": -2.238872030494512e-05} {"step": 60070, "timestamp": 1778259474.7556126, "train/loss": 2.219193196296692, "train/z_loss": 0.0013643321115523576, "train/perplexity": 9.199905353110468, "train/grad_norm": 0.2275390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028442.546000548, "perf/iters_per_sec": 0.9672367792132129, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0338730096817017, "data/tokens_consumed": 125978017792, "data/tokens_consumed_B": 125.978017792, "train/loss_slope": -1.5821491788537275e-05} {"step": 60075, "timestamp": 1778259480.5404856, "eos/sharpness": 76.20019912719725, "eos/L0_probe": 1.9824588298797607, "eos/L_plus": 2.4412713050842285, "eos/L_minus": 2.2856483459472656, "eos/grad_norm": 0.2260332703590393, "eos/embed_grad_frac": 0.044171568006277084, "eos/time_s": 0.620607852935791} {"step": 60075, "timestamp": 1778259481.9195528, "geo/rankme_last": 438.70721435546875, "geo/layer_0/stable_rank_q_proj": 19.36152458190918, "geo/layer_0/stable_rank_k_proj": 16.190404891967773, "geo/layer_0/stable_rank_o_proj": 47.07252502441406, "geo/layer_0/stable_rank_gate_proj": 131.80465698242188, "geo/layer_0/stable_rank_down_proj": 54.815303802490234, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06337778270244598, "geo/layer_0/attn_entropy_mean": 6.164737701416016, "geo/layer_0/attn_entropy_std": 0.4054701328277588, "geo/layer_7/stable_rank_q_proj": 43.578800201416016, "geo/layer_7/stable_rank_k_proj": 41.27680969238281, "geo/layer_7/stable_rank_o_proj": 92.19465637207031, "geo/layer_7/stable_rank_gate_proj": 82.50048065185547, "geo/layer_7/stable_rank_down_proj": 141.5563201904297, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4463682472705841, "geo/layer_7/attn_entropy_mean": 4.645879745483398, "geo/layer_7/attn_entropy_std": 0.8097348213195801, "geo/layer_14/stable_rank_q_proj": 51.97442626953125, "geo/layer_14/stable_rank_k_proj": 39.77591323852539, "geo/layer_14/stable_rank_o_proj": 44.043514251708984, "geo/layer_14/stable_rank_gate_proj": 72.09564208984375, "geo/layer_14/stable_rank_down_proj": 129.88389587402344, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.401604562997818, "geo/layer_14/attn_entropy_mean": 5.512059688568115, "geo/layer_14/attn_entropy_std": 0.4018590450286865, "geo/layer_21/stable_rank_q_proj": 40.45749282836914, "geo/layer_21/stable_rank_k_proj": 30.187463760375977, "geo/layer_21/stable_rank_o_proj": 70.93689727783203, "geo/layer_21/stable_rank_gate_proj": 66.75679779052734, "geo/layer_21/stable_rank_down_proj": 52.0472412109375, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14157980680465698, "geo/layer_21/attn_entropy_mean": 5.715784072875977, "geo/layer_21/attn_entropy_std": 0.30305030941963196, "geo/layer_27/stable_rank_q_proj": 43.07374572753906, "geo/layer_27/stable_rank_k_proj": 31.68642807006836, "geo/layer_27/stable_rank_o_proj": 115.99803161621094, "geo/layer_27/stable_rank_gate_proj": 80.59642028808594, "geo/layer_27/stable_rank_down_proj": 128.87484741210938, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08633779734373093, "geo/layer_27/attn_entropy_mean": 4.20327091217041, "geo/layer_27/attn_entropy_std": 0.7195234298706055, "attnres/final_alpha/block_0": 0.23474542796611786, "attnres/block_norm/0": 1.7602956295013428, "attnres/final_alpha/block_1": 0.00452147051692009, "attnres/block_norm/1": 46066.3515625, "attnres/final_alpha/block_2": 0.010085724294185638, "attnres/block_norm/2": 28375.7578125, "attnres/final_alpha/block_3": 0.012155740521848202, "attnres/block_norm/3": 57097.72265625, "attnres/final_alpha/block_4": 0.014147552661597729, "attnres/block_norm/4": 14917.1005859375, "attnres/final_alpha/block_5": 0.6163666844367981, "attnres/block_norm/5": 6524.849609375, "attnres/final_alpha/block_6": 0.10797741264104843, "attnres/block_norm/6": 37838.73046875, "geo/tier1_time_s": 1.3608729839324951, "geo/step": 60075.0, "geo/rankme_slope": -9.963747217637055e-05} {"step": 60080, "timestamp": 1778259487.0987527, "train/loss": 2.113247585296631, "train/z_loss": 0.001377184852026403, "train/perplexity": 8.275071695187833, "train/grad_norm": 0.1318359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1700043.4577629191, "perf/iters_per_sec": 0.8106438912214847, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2335872888565063, "data/tokens_consumed": 125998989312, "data/tokens_consumed_B": 125.998989312, "train/loss_slope": -1.7401437633501708e-05} {"step": 60090, "timestamp": 1778259497.4419553, "train/loss": 2.0748173236846923, "train/z_loss": 0.0013939086114987732, "train/perplexity": 7.963091657917438, "train/grad_norm": 0.251953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028573.9053036384, "perf/iters_per_sec": 0.9672994162100021, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03380606174469, "data/tokens_consumed": 126019960832, "data/tokens_consumed_B": 126.019960832, "train/loss_slope": -2.057337606414592e-05} {"step": 60100, "timestamp": 1778259507.776976, "grad/layer_0/attn": 0.003077105386182666, "grad/layer_0/mlp": 0.003261794801801443, "grad/layer_0/attn_mlp_ratio": 0.9433779495097083, "grad/layer_4/attn": 0.0019276762614026666, "grad/layer_4/mlp": 0.002429607091471553, "grad/layer_4/attn_mlp_ratio": 0.7934106666169032, "grad/layer_8/attn": 0.005898256786167622, "grad/layer_8/mlp": 0.003590440610423684, "grad/layer_8/attn_mlp_ratio": 1.6427667971354991, "grad/layer_12/attn": 0.004701240453869104, "grad/layer_12/mlp": 0.006894417107105255, "grad/layer_12/attn_mlp_ratio": 0.6818909144378579, "grad/layer_16/attn": 0.005135780666023493, "grad/layer_16/mlp": 0.004477148875594139, "grad/layer_16/attn_mlp_ratio": 1.1471096213281469, "grad/layer_20/attn": 0.003403609851375222, "grad/layer_20/mlp": 0.0057382406666874886, "grad/layer_20/attn_mlp_ratio": 0.593145179814409, "grad/layer_24/attn": 0.011196066625416279, "grad/layer_24/mlp": 0.010020394809544086, "grad/layer_24/attn_mlp_ratio": 1.1173278824322987, "grad/layer_27/attn": 0.006570864003151655, "grad/layer_27/mlp": 0.009373759850859642, "grad/layer_27/attn_mlp_ratio": 0.7009848809440719} {"step": 60100, "timestamp": 1778259507.791094, "train/loss": 2.1893037080764772, "train/z_loss": 0.0013650331646203994, "train/perplexity": 8.92899376346532, "train/grad_norm": 0.126953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028244.6971123207, "perf/iters_per_sec": 0.967142437511597, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0339738607406617, "data/tokens_consumed": 126040932352, "data/tokens_consumed_B": 126.040932352, "train/loss_slope": -1.7015502061566216e-05} {"step": 60110, "timestamp": 1778259518.1333387, "train/loss": 2.1409011602401735, "train/z_loss": 0.0013864621403627097, "train/perplexity": 8.507100436331235, "train/grad_norm": 0.1953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029176.092906123, "perf/iters_per_sec": 0.9675865616350761, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033499264717102, "data/tokens_consumed": 126061903872, "data/tokens_consumed_B": 126.061903872, "train/loss_slope": -1.7428588967333206e-05} {"step": 60120, "timestamp": 1778259528.4829898, "train/loss": 2.1573194026947022, "train/z_loss": 0.0013796972460113465, "train/perplexity": 8.647924956075814, "train/grad_norm": 0.1259765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027692.9821767071, "perf/iters_per_sec": 0.9668793593295608, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342551946640015, "data/tokens_consumed": 126082875392, "data/tokens_consumed_B": 126.082875392, "train/loss_slope": -1.6350154607745803e-05} {"step": 60130, "timestamp": 1778259538.829456, "train/loss": 2.1405420303344727, "train/z_loss": 0.0013788193115033208, "train/perplexity": 8.504045830686696, "train/grad_norm": 0.09912109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028451.5740691544, "perf/iters_per_sec": 0.9672410841317913, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033868408203125, "data/tokens_consumed": 126103846912, "data/tokens_consumed_B": 126.103846912, "train/loss_slope": -1.6273221953867173e-05} {"step": 60140, "timestamp": 1778259549.171299, "train/loss": 2.1838232278823853, "train/z_loss": 0.0013702374300919474, "train/perplexity": 8.88019243948452, "train/grad_norm": 0.08544921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029128.8147081032, "perf/iters_per_sec": 0.9675640176334873, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0335233449935912, "data/tokens_consumed": 126124818432, "data/tokens_consumed_B": 126.124818432, "train/loss_slope": -1.2139506179793437e-05} {"step": 60150, "timestamp": 1778259559.5396369, "grad/layer_0/attn": 0.003195525612682104, "grad/layer_0/mlp": 0.003039458766579628, "grad/layer_0/attn_mlp_ratio": 1.051346885400723, "grad/layer_4/attn": 0.0025250574108213186, "grad/layer_4/mlp": 0.0025895291473716497, "grad/layer_4/attn_mlp_ratio": 0.9751028737691341, "grad/layer_8/attn": 0.004877581261098385, "grad/layer_8/mlp": 0.003614512737840414, "grad/layer_8/attn_mlp_ratio": 1.3494436124378588, "grad/layer_12/attn": 0.005235319957137108, "grad/layer_12/mlp": 0.007141829933971167, "grad/layer_12/attn_mlp_ratio": 0.7330502031320456, "grad/layer_16/attn": 0.00405005132779479, "grad/layer_16/mlp": 0.0048447404988110065, "grad/layer_16/attn_mlp_ratio": 0.8359686644087305, "grad/layer_20/attn": 0.006975085940212011, "grad/layer_20/mlp": 0.006607393268495798, "grad/layer_20/attn_mlp_ratio": 1.0556486576793476, "grad/layer_24/attn": 0.013489995151758194, "grad/layer_24/mlp": 0.011065248399972916, "grad/layer_24/attn_mlp_ratio": 1.2191316943122619, "grad/layer_27/attn": 0.005751867312937975, "grad/layer_27/mlp": 0.010227406397461891, "grad/layer_27/attn_mlp_ratio": 0.562397447912665} {"step": 60150, "timestamp": 1778259560.1280081, "eos/sharpness": 46.95227146148681, "eos/L0_probe": 1.978651523590088, "eos/L_plus": 2.220036029815674, "eos/L_minus": 2.20678973197937, "eos/grad_norm": 0.16787129640579224, "eos/embed_grad_frac": 0.08449618518352509, "eos/time_s": 0.5855295658111572} {"step": 60150, "timestamp": 1778259560.1480045, "train/loss": 2.191362977027893, "train/z_loss": 0.001361422729678452, "train/perplexity": 8.947399908186572, "train/grad_norm": 0.16796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1911424.288354123, "perf/iters_per_sec": 0.9114381257792106, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.097167181968689, "data/tokens_consumed": 126145789952, "data/tokens_consumed_B": 126.145789952, "train/loss_slope": -8.162315843915677e-06} {"step": 60150, "timestamp": 1778259561.5097346, "geo/rankme_last": 438.30682373046875, "geo/layer_0/stable_rank_q_proj": 19.351022720336914, "geo/layer_0/stable_rank_k_proj": 16.160715103149414, "geo/layer_0/stable_rank_o_proj": 47.1047477722168, "geo/layer_0/stable_rank_gate_proj": 131.61920166015625, "geo/layer_0/stable_rank_down_proj": 54.89817428588867, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0626085177063942, "geo/layer_0/attn_entropy_mean": 6.158999443054199, "geo/layer_0/attn_entropy_std": 0.4109202027320862, "geo/layer_7/stable_rank_q_proj": 43.49443435668945, "geo/layer_7/stable_rank_k_proj": 41.31473922729492, "geo/layer_7/stable_rank_o_proj": 92.17300415039062, "geo/layer_7/stable_rank_gate_proj": 82.33991241455078, "geo/layer_7/stable_rank_down_proj": 141.3537139892578, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4672999083995819, "geo/layer_7/attn_entropy_mean": 4.64082145690918, "geo/layer_7/attn_entropy_std": 0.8026211857795715, "geo/layer_14/stable_rank_q_proj": 51.852691650390625, "geo/layer_14/stable_rank_k_proj": 39.75823211669922, "geo/layer_14/stable_rank_o_proj": 44.06175994873047, "geo/layer_14/stable_rank_gate_proj": 72.13008880615234, "geo/layer_14/stable_rank_down_proj": 129.74070739746094, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37561288475990295, "geo/layer_14/attn_entropy_mean": 5.4813127517700195, "geo/layer_14/attn_entropy_std": 0.3797309696674347, "geo/layer_21/stable_rank_q_proj": 40.48101043701172, "geo/layer_21/stable_rank_k_proj": 30.21678924560547, "geo/layer_21/stable_rank_o_proj": 70.93663787841797, "geo/layer_21/stable_rank_gate_proj": 66.81543731689453, "geo/layer_21/stable_rank_down_proj": 52.01991271972656, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14478524029254913, "geo/layer_21/attn_entropy_mean": 5.71748161315918, "geo/layer_21/attn_entropy_std": 0.294386625289917, "geo/layer_27/stable_rank_q_proj": 43.0948600769043, "geo/layer_27/stable_rank_k_proj": 31.61714744567871, "geo/layer_27/stable_rank_o_proj": 116.07097625732422, "geo/layer_27/stable_rank_gate_proj": 80.52535247802734, "geo/layer_27/stable_rank_down_proj": 128.73068237304688, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09224645793437958, "geo/layer_27/attn_entropy_mean": 4.233304023742676, "geo/layer_27/attn_entropy_std": 0.7178704738616943, "attnres/final_alpha/block_0": 0.23532821238040924, "attnres/block_norm/0": 1.760393500328064, "attnres/final_alpha/block_1": 0.004476385191082954, "attnres/block_norm/1": 46195.8828125, "attnres/final_alpha/block_2": 0.010131600312888622, "attnres/block_norm/2": 28450.23046875, "attnres/final_alpha/block_3": 0.012317046523094177, "attnres/block_norm/3": 57445.41796875, "attnres/final_alpha/block_4": 0.014281232841312885, "attnres/block_norm/4": 14905.0390625, "attnres/final_alpha/block_5": 0.6161416172981262, "attnres/block_norm/5": 6502.8134765625, "attnres/final_alpha/block_6": 0.10732389986515045, "attnres/block_norm/6": 37816.37109375, "geo/tier1_time_s": 1.3575289249420166, "geo/step": 60150.0, "geo/rankme_slope": -7.816134266206483e-05} {"step": 60160, "timestamp": 1778259571.8857424, "train/loss": 2.1739837288856507, "train/z_loss": 0.0013718550093472005, "train/perplexity": 8.793244259644458, "train/grad_norm": 0.2216796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1787258.4736933412, "perf/iters_per_sec": 0.8522312515703875, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1733904361724854, "data/tokens_consumed": 126166761472, "data/tokens_consumed_B": 126.166761472, "train/loss_slope": -8.886052258122596e-06} {"step": 60170, "timestamp": 1778259582.2669895, "train/loss": 2.142047071456909, "train/z_loss": 0.001387964782770723, "train/perplexity": 8.51685440566722, "train/grad_norm": 0.1484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021445.9501399864, "perf/iters_per_sec": 0.963900542325967, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037451434135437, "data/tokens_consumed": 126187732992, "data/tokens_consumed_B": 126.187732992, "train/loss_slope": -8.52764223632667e-06} {"step": 60180, "timestamp": 1778259592.6431773, "train/loss": 2.1403170585632325, "train/z_loss": 0.0013860070379450918, "train/perplexity": 8.502132875621973, "train/grad_norm": 0.13671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022022.1589786846, "perf/iters_per_sec": 0.9641753001111434, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0371557950973511, "data/tokens_consumed": 126208704512, "data/tokens_consumed_B": 126.208704512, "train/loss_slope": -9.172769949094898e-06} {"step": 60190, "timestamp": 1778259603.03777, "train/loss": 2.1140791177749634, "train/z_loss": 0.00139768939698115, "train/perplexity": 8.281955547739747, "train/grad_norm": 0.3125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020973.7503210295, "perf/iters_per_sec": 0.9636753799061916, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0376938343048097, "data/tokens_consumed": 126229676032, "data/tokens_consumed_B": 126.229676032, "train/loss_slope": -9.066068958027154e-06} {"step": 60200, "timestamp": 1778259613.4076612, "grad/layer_0/attn": 0.002349491696804762, "grad/layer_0/mlp": 0.0025666956789791584, "grad/layer_0/attn_mlp_ratio": 0.9153759927634336, "grad/layer_4/attn": 0.0023884461261332035, "grad/layer_4/mlp": 0.0023402234073728323, "grad/layer_4/attn_mlp_ratio": 1.020605988534192, "grad/layer_8/attn": 0.0032893263269215822, "grad/layer_8/mlp": 0.0034341788850724697, "grad/layer_8/attn_mlp_ratio": 0.9578202945214777, "grad/layer_12/attn": 0.004335496108978987, "grad/layer_12/mlp": 0.006934413220733404, "grad/layer_12/attn_mlp_ratio": 0.6252145507416123, "grad/layer_16/attn": 0.0031664289999753237, "grad/layer_16/mlp": 0.00421001436188817, "grad/layer_16/attn_mlp_ratio": 0.7521183189843953, "grad/layer_20/attn": 0.006159916520118713, "grad/layer_20/mlp": 0.005533385556191206, "grad/layer_20/attn_mlp_ratio": 1.113227398713208, "grad/layer_24/attn": 0.011550016701221466, "grad/layer_24/mlp": 0.008499214425683022, "grad/layer_24/attn_mlp_ratio": 1.3589510732220604, "grad/layer_27/attn": 0.005988024640828371, "grad/layer_27/mlp": 0.008051379583775997, "grad/layer_27/attn_mlp_ratio": 0.7437265258890464} {"step": 60200, "timestamp": 1778259613.42204, "train/loss": 2.1475669145584106, "train/z_loss": 0.0013786423136480153, "train/perplexity": 8.563996093369354, "train/grad_norm": 0.12158203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020911.1598855124, "perf/iters_per_sec": 0.9636455344607889, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0377259731292725, "data/tokens_consumed": 126250647552, "data/tokens_consumed_B": 126.250647552, "train/loss_slope": -7.127645993568838e-06} {"step": 60210, "timestamp": 1778259623.807895, "train/loss": 2.1755443572998048, "train/z_loss": 0.0013818432111293078, "train/perplexity": 8.806977960302957, "train/grad_norm": 0.20703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020983.8264433471, "perf/iters_per_sec": 0.9636801845757232, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037688660621643, "data/tokens_consumed": 126271619072, "data/tokens_consumed_B": 126.271619072, "train/loss_slope": -4.650538718060105e-06} {"step": 60220, "timestamp": 1778259634.184055, "train/loss": 2.187889838218689, "train/z_loss": 0.0013647182378917933, "train/perplexity": 8.916378248772322, "train/grad_norm": 0.1572265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022482.0605388514, "perf/iters_per_sec": 0.9643945982641465, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0369199514389038, "data/tokens_consumed": 126292590592, "data/tokens_consumed_B": 126.292590592, "train/loss_slope": -3.632255599121696e-06} {"step": 60225, "timestamp": 1778259639.9485455, "eos/sharpness": 56.25655651092528, "eos/L0_probe": 1.9780007600784302, "eos/L_plus": 2.294555187225342, "eos/L_minus": 2.2240118980407715, "eos/grad_norm": 0.14702510833740234, "eos/embed_grad_frac": 0.1031317487359047, "eos/time_s": 0.5877857208251953} {"step": 60225, "timestamp": 1778259641.326909, "geo/rankme_last": 438.565185546875, "geo/layer_0/stable_rank_q_proj": 19.330249786376953, "geo/layer_0/stable_rank_k_proj": 16.158193588256836, "geo/layer_0/stable_rank_o_proj": 47.11587905883789, "geo/layer_0/stable_rank_gate_proj": 131.6551055908203, "geo/layer_0/stable_rank_down_proj": 54.8523063659668, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06205318123102188, "geo/layer_0/attn_entropy_mean": 6.160381317138672, "geo/layer_0/attn_entropy_std": 0.4107842445373535, "geo/layer_7/stable_rank_q_proj": 43.529319763183594, "geo/layer_7/stable_rank_k_proj": 41.27045822143555, "geo/layer_7/stable_rank_o_proj": 92.3605728149414, "geo/layer_7/stable_rank_gate_proj": 82.42667388916016, "geo/layer_7/stable_rank_down_proj": 141.33480834960938, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4636159837245941, "geo/layer_7/attn_entropy_mean": 4.639591217041016, "geo/layer_7/attn_entropy_std": 0.7914534211158752, "geo/layer_14/stable_rank_q_proj": 51.82402801513672, "geo/layer_14/stable_rank_k_proj": 39.77503204345703, "geo/layer_14/stable_rank_o_proj": 44.1041145324707, "geo/layer_14/stable_rank_gate_proj": 72.20799255371094, "geo/layer_14/stable_rank_down_proj": 129.7096405029297, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3962216079235077, "geo/layer_14/attn_entropy_mean": 5.528913497924805, "geo/layer_14/attn_entropy_std": 0.4032633602619171, "geo/layer_21/stable_rank_q_proj": 40.56387710571289, "geo/layer_21/stable_rank_k_proj": 30.138154983520508, "geo/layer_21/stable_rank_o_proj": 70.94649505615234, "geo/layer_21/stable_rank_gate_proj": 66.83921813964844, "geo/layer_21/stable_rank_down_proj": 52.031192779541016, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14335650205612183, "geo/layer_21/attn_entropy_mean": 5.717185974121094, "geo/layer_21/attn_entropy_std": 0.3022215962409973, "geo/layer_27/stable_rank_q_proj": 43.02349090576172, "geo/layer_27/stable_rank_k_proj": 31.601537704467773, "geo/layer_27/stable_rank_o_proj": 116.19747924804688, "geo/layer_27/stable_rank_gate_proj": 80.5594711303711, "geo/layer_27/stable_rank_down_proj": 128.85804748535156, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08755054324865341, "geo/layer_27/attn_entropy_mean": 4.228480339050293, "geo/layer_27/attn_entropy_std": 0.7100637555122375, "attnres/final_alpha/block_0": 0.23827078938484192, "attnres/block_norm/0": 1.7604637145996094, "attnres/final_alpha/block_1": 0.004527213983237743, "attnres/block_norm/1": 46141.52734375, "attnres/final_alpha/block_2": 0.010299204848706722, "attnres/block_norm/2": 28392.42578125, "attnres/final_alpha/block_3": 0.012577841989696026, "attnres/block_norm/3": 56744.9921875, "attnres/final_alpha/block_4": 0.014393636956810951, "attnres/block_norm/4": 14919.46484375, "attnres/final_alpha/block_5": 0.6089558005332947, "attnres/block_norm/5": 6572.10888671875, "attnres/final_alpha/block_6": 0.1109754890203476, "attnres/block_norm/6": 37509.890625, "geo/tier1_time_s": 1.3602356910705566, "geo/step": 60225.0, "geo/rankme_slope": -5.512404180422169e-05} {"step": 60230, "timestamp": 1778259646.5192566, "train/loss": 2.1522449970245363, "train/z_loss": 0.0013912224676460028, "train/perplexity": 8.60415302882718, "train/grad_norm": 0.1591796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1700821.4115233687, "perf/iters_per_sec": 0.8110148484818309, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2330230474472046, "data/tokens_consumed": 126313562112, "data/tokens_consumed_B": 126.313562112, "train/loss_slope": -2.1241454866387295e-06} {"step": 60240, "timestamp": 1778259656.8990035, "train/loss": 2.1476325511932375, "train/z_loss": 0.0013717964058741928, "train/perplexity": 8.564558223701564, "train/grad_norm": 0.162109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021806.9716086, "perf/iters_per_sec": 0.9640726907771111, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0372661828994751, "data/tokens_consumed": 126334533632, "data/tokens_consumed_B": 126.334533632, "train/loss_slope": -6.748420842374072e-06} {"step": 60250, "timestamp": 1778259667.265689, "grad/layer_0/attn": 0.002987871179357171, "grad/layer_0/mlp": 0.003083091927692294, "grad/layer_0/attn_mlp_ratio": 0.9691151456136078, "grad/layer_4/attn": 0.0028378718998283148, "grad/layer_4/mlp": 0.0025103630032390356, "grad/layer_4/attn_mlp_ratio": 1.1304627191846095, "grad/layer_8/attn": 0.006688766181468964, "grad/layer_8/mlp": 0.0037471603136509657, "grad/layer_8/attn_mlp_ratio": 1.78502264197222, "grad/layer_12/attn": 0.004758283030241728, "grad/layer_12/mlp": 0.0073209116235375404, "grad/layer_12/attn_mlp_ratio": 0.6499577115434028, "grad/layer_16/attn": 0.004386059939861298, "grad/layer_16/mlp": 0.004388167057186365, "grad/layer_16/attn_mlp_ratio": 0.9995197955662158, "grad/layer_20/attn": 0.003025624668225646, "grad/layer_20/mlp": 0.005754499696195126, "grad/layer_20/attn_mlp_ratio": 0.525784129878011, "grad/layer_24/attn": 0.010817557573318481, "grad/layer_24/mlp": 0.009998718276619911, "grad/layer_24/attn_mlp_ratio": 1.081894415449611, "grad/layer_27/attn": 0.008616640232503414, "grad/layer_27/mlp": 0.010614386759698391, "grad/layer_27/attn_mlp_ratio": 0.8117887868982625} {"step": 60250, "timestamp": 1778259667.2799642, "train/loss": 2.104354178905487, "train/z_loss": 0.001385324541479349, "train/perplexity": 8.201804400508296, "train/grad_norm": 0.1884765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021582.584155529, "perf/iters_per_sec": 0.9639656945016523, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373813152313232, "data/tokens_consumed": 126355505152, "data/tokens_consumed_B": 126.355505152, "train/loss_slope": -9.490385185731475e-06} {"step": 60260, "timestamp": 1778259677.657803, "train/loss": 2.1285717368125914, "train/z_loss": 0.0013677648385055363, "train/perplexity": 8.402856745755397, "train/grad_norm": 0.2578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021778.949496974, "perf/iters_per_sec": 0.9640593287930365, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037280559539795, "data/tokens_consumed": 126376476672, "data/tokens_consumed_B": 126.376476672, "train/loss_slope": -9.565821732624846e-06} {"step": 60270, "timestamp": 1778259688.045687, "train/loss": 2.157356071472168, "train/z_loss": 0.0013774701510556042, "train/perplexity": 8.648242070725635, "train/grad_norm": 0.169921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020031.0327793187, "perf/iters_per_sec": 0.963225857152614, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0381781101226806, "data/tokens_consumed": 126397448192, "data/tokens_consumed_B": 126.397448192, "train/loss_slope": -9.35618001134977e-06} {"step": 60280, "timestamp": 1778259698.4250293, "train/loss": 2.176315450668335, "train/z_loss": 0.0013739667134359478, "train/perplexity": 8.81377158152646, "train/grad_norm": 0.1123046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022042.4716858428, "perf/iters_per_sec": 0.9641849859647001, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0371453762054443, "data/tokens_consumed": 126418419712, "data/tokens_consumed_B": 126.418419712, "train/loss_slope": -9.35766184040275e-06} {"step": 60290, "timestamp": 1778259708.802869, "train/loss": 2.1442766189575195, "train/z_loss": 0.0013843217981047927, "train/perplexity": 8.535864321000377, "train/grad_norm": 0.26953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021690.0553019238, "perf/iters_per_sec": 0.964016940737688, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037326169013977, "data/tokens_consumed": 126439391232, "data/tokens_consumed_B": 126.439391232, "train/loss_slope": -7.735809548781687e-06} {"step": 60300, "timestamp": 1778259719.1751454, "grad/layer_0/attn": 0.002785278484225273, "grad/layer_0/mlp": 0.003013302804902196, "grad/layer_0/attn_mlp_ratio": 0.9243274148423775, "grad/layer_4/attn": 0.0026601962745189667, "grad/layer_4/mlp": 0.0026290433015674353, "grad/layer_4/attn_mlp_ratio": 1.0118495088110595, "grad/layer_8/attn": 0.0057673146948218346, "grad/layer_8/mlp": 0.0038628485053777695, "grad/layer_8/attn_mlp_ratio": 1.493021157182477, "grad/layer_12/attn": 0.004744119942188263, "grad/layer_12/mlp": 0.006499028764665127, "grad/layer_12/attn_mlp_ratio": 0.7299736685248146, "grad/layer_16/attn": 0.003574534086510539, "grad/layer_16/mlp": 0.004613034892827272, "grad/layer_16/attn_mlp_ratio": 0.7748768635114452, "grad/layer_20/attn": 0.0033427877351641655, "grad/layer_20/mlp": 0.006045843940228224, "grad/layer_20/attn_mlp_ratio": 0.5529067096209744, "grad/layer_24/attn": 0.010912125930190086, "grad/layer_24/mlp": 0.01102528441697359, "grad/layer_24/attn_mlp_ratio": 0.9897364474713287, "grad/layer_27/attn": 0.004167233128100634, "grad/layer_27/mlp": 0.01103474572300911, "grad/layer_27/attn_mlp_ratio": 0.37764649906220077} {"step": 60300, "timestamp": 1778259719.7636719, "eos/sharpness": 65.81490039825438, "eos/L0_probe": 1.9781639575958252, "eos/L_plus": 2.3487653732299805, "eos/L_minus": 2.265711545944214, "eos/grad_norm": 0.17694689333438873, "eos/embed_grad_frac": 0.0769706591963768, "eos/time_s": 0.585613489151001} {"step": 60300, "timestamp": 1778259719.783488, "train/loss": 2.120017170906067, "train/z_loss": 0.001397984684444964, "train/perplexity": 8.331280542095106, "train/grad_norm": 0.177734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1911245.7003582157, "perf/iters_per_sec": 0.911352968386753, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0972697019577027, "data/tokens_consumed": 126460362752, "data/tokens_consumed_B": 126.460362752, "train/loss_slope": -1.0480871523889406e-05} {"step": 60300, "timestamp": 1778259721.1456873, "geo/rankme_last": 438.29595947265625, "geo/layer_0/stable_rank_q_proj": 19.333555221557617, "geo/layer_0/stable_rank_k_proj": 16.16282844543457, "geo/layer_0/stable_rank_o_proj": 47.12849426269531, "geo/layer_0/stable_rank_gate_proj": 131.748291015625, "geo/layer_0/stable_rank_down_proj": 54.797977447509766, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06228766590356827, "geo/layer_0/attn_entropy_mean": 6.1601996421813965, "geo/layer_0/attn_entropy_std": 0.41266363859176636, "geo/layer_7/stable_rank_q_proj": 43.45950698852539, "geo/layer_7/stable_rank_k_proj": 41.21165466308594, "geo/layer_7/stable_rank_o_proj": 92.41236877441406, "geo/layer_7/stable_rank_gate_proj": 82.3660888671875, "geo/layer_7/stable_rank_down_proj": 141.15699768066406, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.44323423504829407, "geo/layer_7/attn_entropy_mean": 4.651035785675049, "geo/layer_7/attn_entropy_std": 0.793035089969635, "geo/layer_14/stable_rank_q_proj": 51.833309173583984, "geo/layer_14/stable_rank_k_proj": 39.6914176940918, "geo/layer_14/stable_rank_o_proj": 44.12894821166992, "geo/layer_14/stable_rank_gate_proj": 72.2190933227539, "geo/layer_14/stable_rank_down_proj": 129.87188720703125, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4073716402053833, "geo/layer_14/attn_entropy_mean": 5.532550811767578, "geo/layer_14/attn_entropy_std": 0.3978549540042877, "geo/layer_21/stable_rank_q_proj": 40.6420783996582, "geo/layer_21/stable_rank_k_proj": 30.17877197265625, "geo/layer_21/stable_rank_o_proj": 70.81615447998047, "geo/layer_21/stable_rank_gate_proj": 66.76861572265625, "geo/layer_21/stable_rank_down_proj": 51.98417282104492, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14895617961883545, "geo/layer_21/attn_entropy_mean": 5.698151111602783, "geo/layer_21/attn_entropy_std": 0.3063388466835022, "geo/layer_27/stable_rank_q_proj": 43.082794189453125, "geo/layer_27/stable_rank_k_proj": 31.644420623779297, "geo/layer_27/stable_rank_o_proj": 116.09124755859375, "geo/layer_27/stable_rank_gate_proj": 80.6097412109375, "geo/layer_27/stable_rank_down_proj": 128.63705444335938, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09601468592882156, "geo/layer_27/attn_entropy_mean": 4.229155540466309, "geo/layer_27/attn_entropy_std": 0.7133985161781311, "attnres/final_alpha/block_0": 0.23610571026802063, "attnres/block_norm/0": 1.760423183441162, "attnres/final_alpha/block_1": 0.004470967222005129, "attnres/block_norm/1": 46210.53125, "attnres/final_alpha/block_2": 0.01015321072191, "attnres/block_norm/2": 28364.294921875, "attnres/final_alpha/block_3": 0.012386886402964592, "attnres/block_norm/3": 56844.21875, "attnres/final_alpha/block_4": 0.014138365164399147, "attnres/block_norm/4": 14903.951171875, "attnres/final_alpha/block_5": 0.6141752004623413, "attnres/block_norm/5": 6563.6015625, "attnres/final_alpha/block_6": 0.10856962203979492, "attnres/block_norm/6": 37627.28125, "geo/tier1_time_s": 1.3587250709533691, "geo/step": 60300.0, "geo/rankme_slope": -3.990023743872549e-05} {"step": 60310, "timestamp": 1778259731.5211968, "train/loss": 2.1964670419692993, "train/z_loss": 0.0013730727019719779, "train/perplexity": 8.993184763415009, "train/grad_norm": 0.306640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1787176.5873338105, "perf/iters_per_sec": 0.8521922051114132, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1734441995620728, "data/tokens_consumed": 126481334272, "data/tokens_consumed_B": 126.481334272, "train/loss_slope": -9.828719840978201e-06} {"step": 60320, "timestamp": 1778259741.8992903, "train/loss": 2.1637131452560423, "train/z_loss": 0.001366790826432407, "train/perplexity": 8.703394702607836, "train/grad_norm": 0.1796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021506.7618428685, "perf/iters_per_sec": 0.9639295396055548, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0374202251434326, "data/tokens_consumed": 126502305792, "data/tokens_consumed_B": 126.502305792, "train/loss_slope": -1.0303468121947206e-05} {"step": 60330, "timestamp": 1778259752.867626, "train/loss": 2.142519497871399, "train/z_loss": 0.001386445853859186, "train/perplexity": 8.520878943230894, "train/grad_norm": 0.140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1913218.830313096, "perf/iters_per_sec": 0.9122938300672035, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.096138072013855, "data/tokens_consumed": 126523277312, "data/tokens_consumed_B": 126.523277312, "train/loss_slope": -1.063419800422727e-05} {"step": 60340, "timestamp": 1778259763.2473047, "train/loss": 2.2054715156555176, "train/z_loss": 0.0013698709779419004, "train/perplexity": 9.074529341909601, "train/grad_norm": 0.123046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022136.4174544758, "perf/iters_per_sec": 0.9642297827980403, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037097191810608, "data/tokens_consumed": 126544248832, "data/tokens_consumed_B": 126.544248832, "train/loss_slope": -8.123706633930877e-06} {"step": 60350, "timestamp": 1778259773.610972, "grad/layer_0/attn": 0.0026310391258448362, "grad/layer_0/mlp": 0.002838962944224477, "grad/layer_0/attn_mlp_ratio": 0.9267606111313649, "grad/layer_4/attn": 0.002188905142247677, "grad/layer_4/mlp": 0.002520068548619747, "grad/layer_4/attn_mlp_ratio": 0.8685894899912945, "grad/layer_8/attn": 0.0030528989154845476, "grad/layer_8/mlp": 0.003781181061640382, "grad/layer_8/attn_mlp_ratio": 0.8073929243210645, "grad/layer_12/attn": 0.004197971895337105, "grad/layer_12/mlp": 0.006766883656382561, "grad/layer_12/attn_mlp_ratio": 0.6203700324211356, "grad/layer_16/attn": 0.003504023654386401, "grad/layer_16/mlp": 0.004336372949182987, "grad/layer_16/attn_mlp_ratio": 0.8080540153358328, "grad/layer_20/attn": 0.004581737797707319, "grad/layer_20/mlp": 0.005415997933596373, "grad/layer_20/attn_mlp_ratio": 0.8459637114500426, "grad/layer_24/attn": 0.012334014289081097, "grad/layer_24/mlp": 0.00922979786992073, "grad/layer_24/attn_mlp_ratio": 1.3363254893852272, "grad/layer_27/attn": 0.009178330190479755, "grad/layer_27/mlp": 0.008722474798560143, "grad/layer_27/attn_mlp_ratio": 1.052262150045839} {"step": 60350, "timestamp": 1778259773.6252587, "train/loss": 2.1603293657302856, "train/z_loss": 0.0013849090319126844, "train/perplexity": 8.673994104432625, "train/grad_norm": 0.18359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021785.2230365693, "perf/iters_per_sec": 0.964062320249829, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037277340888977, "data/tokens_consumed": 126565220352, "data/tokens_consumed_B": 126.565220352, "train/loss_slope": -8.370481365763504e-06} {"step": 60360, "timestamp": 1778259784.005992, "train/loss": 2.1741958022117616, "train/z_loss": 0.001370183448307216, "train/perplexity": 8.795109269954386, "train/grad_norm": 0.1474609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021738.2421415525, "perf/iters_per_sec": 0.9640399180133593, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373014450073241, "data/tokens_consumed": 126586191872, "data/tokens_consumed_B": 126.586191872, "train/loss_slope": -6.567448830053733e-06} {"step": 60370, "timestamp": 1778259794.377285, "train/loss": 2.125389337539673, "train/z_loss": 0.0013866292079910635, "train/perplexity": 8.376158006115064, "train/grad_norm": 0.1416015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023054.0677809783, "perf/iters_per_sec": 0.9646673525719539, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0366267681121826, "data/tokens_consumed": 126607163392, "data/tokens_consumed_B": 126.607163392, "train/loss_slope": -4.326276889335191e-06} {"step": 60375, "timestamp": 1778259800.1401632, "eos/sharpness": 75.19927024841307, "eos/L0_probe": 1.9797757863998413, "eos/L_plus": 2.440680503845215, "eos/L_minus": 2.2708637714385986, "eos/grad_norm": 0.22107543051242828, "eos/embed_grad_frac": 0.08430730551481247, "eos/time_s": 0.5874252319335938} {"step": 60375, "timestamp": 1778259801.5176752, "geo/rankme_last": 438.92901611328125, "geo/layer_0/stable_rank_q_proj": 19.32658576965332, "geo/layer_0/stable_rank_k_proj": 16.131969451904297, "geo/layer_0/stable_rank_o_proj": 47.18582534790039, "geo/layer_0/stable_rank_gate_proj": 132.0164337158203, "geo/layer_0/stable_rank_down_proj": 54.818729400634766, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06285347789525986, "geo/layer_0/attn_entropy_mean": 6.158434867858887, "geo/layer_0/attn_entropy_std": 0.4154852330684662, "geo/layer_7/stable_rank_q_proj": 43.48143768310547, "geo/layer_7/stable_rank_k_proj": 41.190120697021484, "geo/layer_7/stable_rank_o_proj": 92.2587890625, "geo/layer_7/stable_rank_gate_proj": 82.21053314208984, "geo/layer_7/stable_rank_down_proj": 141.20138549804688, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4619201421737671, "geo/layer_7/attn_entropy_mean": 4.663980007171631, "geo/layer_7/attn_entropy_std": 0.7969136238098145, "geo/layer_14/stable_rank_q_proj": 51.880462646484375, "geo/layer_14/stable_rank_k_proj": 39.600799560546875, "geo/layer_14/stable_rank_o_proj": 44.07445526123047, "geo/layer_14/stable_rank_gate_proj": 72.20770263671875, "geo/layer_14/stable_rank_down_proj": 129.9459228515625, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4116579294204712, "geo/layer_14/attn_entropy_mean": 5.528646945953369, "geo/layer_14/attn_entropy_std": 0.39987173676490784, "geo/layer_21/stable_rank_q_proj": 40.69071578979492, "geo/layer_21/stable_rank_k_proj": 30.25150489807129, "geo/layer_21/stable_rank_o_proj": 70.78443908691406, "geo/layer_21/stable_rank_gate_proj": 66.91421508789062, "geo/layer_21/stable_rank_down_proj": 51.943729400634766, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1417005956172943, "geo/layer_21/attn_entropy_mean": 5.715784072875977, "geo/layer_21/attn_entropy_std": 0.29809069633483887, "geo/layer_27/stable_rank_q_proj": 43.10646057128906, "geo/layer_27/stable_rank_k_proj": 31.649066925048828, "geo/layer_27/stable_rank_o_proj": 116.06729125976562, "geo/layer_27/stable_rank_gate_proj": 80.67472839355469, "geo/layer_27/stable_rank_down_proj": 128.71783447265625, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09658214449882507, "geo/layer_27/attn_entropy_mean": 4.241680145263672, "geo/layer_27/attn_entropy_std": 0.7353594899177551, "attnres/final_alpha/block_0": 0.23636503517627716, "attnres/block_norm/0": 1.760400652885437, "attnres/final_alpha/block_1": 0.004498717375099659, "attnres/block_norm/1": 46054.4140625, "attnres/final_alpha/block_2": 0.010225353762507439, "attnres/block_norm/2": 28508.3125, "attnres/final_alpha/block_3": 0.01247376762330532, "attnres/block_norm/3": 57245.46875, "attnres/final_alpha/block_4": 0.01449689082801342, "attnres/block_norm/4": 14926.2177734375, "attnres/final_alpha/block_5": 0.6130275130271912, "attnres/block_norm/5": 6523.1123046875, "attnres/final_alpha/block_6": 0.10891272872686386, "attnres/block_norm/6": 37665.1640625, "geo/tier1_time_s": 1.3574297428131104, "geo/step": 60375.0, "geo/rankme_slope": -4.3244368059723895e-05} {"step": 60380, "timestamp": 1778259806.7052534, "train/loss": 2.173968291282654, "train/z_loss": 0.001383957010693848, "train/perplexity": 8.79310851407832, "train/grad_norm": 0.1396484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1701820.6525995277, "perf/iters_per_sec": 0.8114913237569464, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2322990655899049, "data/tokens_consumed": 126628134912, "data/tokens_consumed_B": 126.628134912, "train/loss_slope": -4.201263691833176e-06} {"step": 60390, "timestamp": 1778259817.0852551, "train/loss": 2.1590394496917726, "train/z_loss": 0.0013789487769827248, "train/perplexity": 8.662812593474905, "train/grad_norm": 0.1455078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021584.9072328492, "perf/iters_per_sec": 0.9639668022312399, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373801231384276, "data/tokens_consumed": 126649106432, "data/tokens_consumed_B": 126.649106432, "train/loss_slope": -5.207778775867199e-06} {"step": 60400, "timestamp": 1778259827.4468737, "grad/layer_0/attn": 0.0032026683911681175, "grad/layer_0/mlp": 0.00307823671028018, "grad/layer_0/attn_mlp_ratio": 1.0404230046474598, "grad/layer_4/attn": 0.0030769642908126116, "grad/layer_4/mlp": 0.002451674547046423, "grad/layer_4/attn_mlp_ratio": 1.25504593136756, "grad/layer_8/attn": 0.0046966965310275555, "grad/layer_8/mlp": 0.003829136723652482, "grad/layer_8/attn_mlp_ratio": 1.2265679570435244, "grad/layer_12/attn": 0.004691726993769407, "grad/layer_12/mlp": 0.006590339820832014, "grad/layer_12/attn_mlp_ratio": 0.7119097118099924, "grad/layer_16/attn": 0.0034479747992008924, "grad/layer_16/mlp": 0.004858558531850576, "grad/layer_16/attn_mlp_ratio": 0.7096703076911502, "grad/layer_20/attn": 0.003354110987856984, "grad/layer_20/mlp": 0.006191794294863939, "grad/layer_20/attn_mlp_ratio": 0.541702578276695, "grad/layer_24/attn": 0.009272606112062931, "grad/layer_24/mlp": 0.010285464115440845, "grad/layer_24/attn_mlp_ratio": 0.9015252902384919, "grad/layer_27/attn": 0.0037120827473700047, "grad/layer_27/mlp": 0.008711199276149273, "grad/layer_27/attn_mlp_ratio": 0.426127630316149} {"step": 60400, "timestamp": 1778259827.4611766, "train/loss": 2.1650574088096617, "train/z_loss": 0.0013692256878130137, "train/perplexity": 8.715102226134906, "train/grad_norm": 0.12890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022175.5137384178, "perf/iters_per_sec": 0.9642484253589715, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0370771408081054, "data/tokens_consumed": 126670077952, "data/tokens_consumed_B": 126.670077952, "train/loss_slope": -5.792874028079338e-07} {"step": 60410, "timestamp": 1778259837.8356354, "train/loss": 2.171283411979675, "train/z_loss": 0.001376655511558056, "train/perplexity": 8.769531743574515, "train/grad_norm": 0.09619140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022770.000653093, "perf/iters_per_sec": 0.9645318988099542, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0367723464965821, "data/tokens_consumed": 126691049472, "data/tokens_consumed_B": 126.691049472, "train/loss_slope": 1.7166252147678018e-07} {"step": 60420, "timestamp": 1778259848.1869326, "train/loss": 2.1466082096099854, "train/z_loss": 0.0013864343869499861, "train/perplexity": 8.555789682328337, "train/grad_norm": 0.1494140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027383.9192558408, "perf/iters_per_sec": 0.966731986644669, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344128608703613, "data/tokens_consumed": 126712020992, "data/tokens_consumed_B": 126.712020992, "train/loss_slope": 1.1043542289104568e-06} {"step": 60430, "timestamp": 1778259858.5273798, "train/loss": 2.1155030488967896, "train/z_loss": 0.0013805967988446354, "train/perplexity": 8.293756882143411, "train/grad_norm": 0.1884765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029014.7943758501, "perf/iters_per_sec": 0.9675096485022784, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0335814237594605, "data/tokens_consumed": 126732992512, "data/tokens_consumed_B": 126.732992512, "train/loss_slope": -4.2050713860427974e-08} {"step": 60440, "timestamp": 1778259868.8842683, "train/loss": 2.092088222503662, "train/z_loss": 0.0013773865764960646, "train/perplexity": 8.101815906204592, "train/grad_norm": 0.1796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026235.5620916344, "perf/iters_per_sec": 0.9661844072778866, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349991083145142, "data/tokens_consumed": 126753964032, "data/tokens_consumed_B": 126.753964032, "train/loss_slope": -1.0210686033279138e-06} {"step": 60450, "timestamp": 1778259879.2307231, "grad/layer_0/attn": 0.0027833334170281887, "grad/layer_0/mlp": 0.002966291503980756, "grad/layer_0/attn_mlp_ratio": 0.9383209032088964, "grad/layer_4/attn": 0.0030648531392216682, "grad/layer_4/mlp": 0.0026133458595722914, "grad/layer_4/attn_mlp_ratio": 1.172769769725885, "grad/layer_8/attn": 0.0033288050908595324, "grad/layer_8/mlp": 0.003929095342755318, "grad/layer_8/attn_mlp_ratio": 0.8472191982501638, "grad/layer_12/attn": 0.004112736787647009, "grad/layer_12/mlp": 0.007104768883436918, "grad/layer_12/attn_mlp_ratio": 0.578869882642895, "grad/layer_16/attn": 0.004258392378687859, "grad/layer_16/mlp": 0.004889882169663906, "grad/layer_16/attn_mlp_ratio": 0.8708578537987068, "grad/layer_20/attn": 0.0032538725063204765, "grad/layer_20/mlp": 0.006138172000646591, "grad/layer_20/attn_mlp_ratio": 0.5301044762133202, "grad/layer_24/attn": 0.010841816663742065, "grad/layer_24/mlp": 0.009286245331168175, "grad/layer_24/attn_mlp_ratio": 1.1675134739980921, "grad/layer_27/attn": 0.010414193384349346, "grad/layer_27/mlp": 0.009203429333865643, "grad/layer_27/attn_mlp_ratio": 1.1315557379109666} {"step": 60450, "timestamp": 1778259879.825258, "eos/sharpness": 70.67646980285643, "eos/L0_probe": 1.9758108854293823, "eos/L_plus": 2.2723066806793213, "eos/L_minus": 2.386079788208008, "eos/grad_norm": 0.1716771274805069, "eos/embed_grad_frac": 0.07935858517885208, "eos/time_s": 0.5917420387268066} {"step": 60450, "timestamp": 1778259879.8430393, "train/loss": 2.149378204345703, "train/z_loss": 0.001372657297179103, "train/perplexity": 8.57952202877105, "train/grad_norm": 0.171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1915050.8916144038, "perf/iters_per_sec": 0.9131674249717731, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0950894355773926, "data/tokens_consumed": 126774935552, "data/tokens_consumed_B": 126.774935552, "train/loss_slope": -7.297401130646604e-07} {"step": 60450, "timestamp": 1778259881.2088356, "geo/rankme_last": 437.8280334472656, "geo/layer_0/stable_rank_q_proj": 19.349502563476562, "geo/layer_0/stable_rank_k_proj": 16.161874771118164, "geo/layer_0/stable_rank_o_proj": 47.23063659667969, "geo/layer_0/stable_rank_gate_proj": 132.2210693359375, "geo/layer_0/stable_rank_down_proj": 54.82954788208008, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06176476925611496, "geo/layer_0/attn_entropy_mean": 6.163758277893066, "geo/layer_0/attn_entropy_std": 0.41596055030822754, "geo/layer_7/stable_rank_q_proj": 43.56686782836914, "geo/layer_7/stable_rank_k_proj": 41.147701263427734, "geo/layer_7/stable_rank_o_proj": 92.51167297363281, "geo/layer_7/stable_rank_gate_proj": 82.2834701538086, "geo/layer_7/stable_rank_down_proj": 141.24606323242188, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4576290547847748, "geo/layer_7/attn_entropy_mean": 4.647130012512207, "geo/layer_7/attn_entropy_std": 0.7917737364768982, "geo/layer_14/stable_rank_q_proj": 51.829891204833984, "geo/layer_14/stable_rank_k_proj": 39.66936492919922, "geo/layer_14/stable_rank_o_proj": 44.00212860107422, "geo/layer_14/stable_rank_gate_proj": 72.27066040039062, "geo/layer_14/stable_rank_down_proj": 129.86732482910156, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39916661381721497, "geo/layer_14/attn_entropy_mean": 5.505738258361816, "geo/layer_14/attn_entropy_std": 0.3916023075580597, "geo/layer_21/stable_rank_q_proj": 40.783443450927734, "geo/layer_21/stable_rank_k_proj": 30.26056480407715, "geo/layer_21/stable_rank_o_proj": 70.6939697265625, "geo/layer_21/stable_rank_gate_proj": 66.93656921386719, "geo/layer_21/stable_rank_down_proj": 51.868228912353516, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14059413969516754, "geo/layer_21/attn_entropy_mean": 5.711944103240967, "geo/layer_21/attn_entropy_std": 0.2974427342414856, "geo/layer_27/stable_rank_q_proj": 43.06998825073242, "geo/layer_27/stable_rank_k_proj": 31.612674713134766, "geo/layer_27/stable_rank_o_proj": 116.12454986572266, "geo/layer_27/stable_rank_gate_proj": 80.64472198486328, "geo/layer_27/stable_rank_down_proj": 128.91995239257812, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0947570651769638, "geo/layer_27/attn_entropy_mean": 4.229275226593018, "geo/layer_27/attn_entropy_std": 0.7556951642036438, "attnres/final_alpha/block_0": 0.2388145625591278, "attnres/block_norm/0": 1.7605715990066528, "attnres/final_alpha/block_1": 0.004630270879715681, "attnres/block_norm/1": 45894.734375, "attnres/final_alpha/block_2": 0.010418221354484558, "attnres/block_norm/2": 28414.548828125, "attnres/final_alpha/block_3": 0.012623393908143044, "attnres/block_norm/3": 56981.26171875, "attnres/final_alpha/block_4": 0.014551528729498386, "attnres/block_norm/4": 14984.05859375, "attnres/final_alpha/block_5": 0.6079292297363281, "attnres/block_norm/5": 6612.556640625, "attnres/final_alpha/block_6": 0.11103280633687973, "attnres/block_norm/6": 37642.83984375, "geo/tier1_time_s": 1.3615388870239258, "geo/step": 60450.0, "geo/rankme_slope": -6.745987457482993e-05} {"step": 60460, "timestamp": 1778259891.5562, "train/loss": 2.138907814025879, "train/z_loss": 0.0013710183324292303, "train/perplexity": 8.490159729837819, "train/grad_norm": 0.267578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1791018.4960425345, "perf/iters_per_sec": 0.854024169942157, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1709270477294922, "data/tokens_consumed": 126795907072, "data/tokens_consumed_B": 126.795907072, "train/loss_slope": -1.2731121640072396e-06} {"step": 60470, "timestamp": 1778259901.9099848, "train/loss": 2.184484839439392, "train/z_loss": 0.0013755941414274274, "train/perplexity": 8.88606962142229, "train/grad_norm": 0.1015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026878.395842722, "perf/iters_per_sec": 0.9664909342969522, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034670853614807, "data/tokens_consumed": 126816878592, "data/tokens_consumed_B": 126.816878592, "train/loss_slope": 2.6610121510960855e-06} {"step": 60480, "timestamp": 1778259912.2576225, "train/loss": 2.1564197301864625, "train/z_loss": 0.0013765976880677044, "train/perplexity": 8.640148154551332, "train/grad_norm": 0.12109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028128.3444443047, "perf/iters_per_sec": 0.9670869562360309, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034033179283142, "data/tokens_consumed": 126837850112, "data/tokens_consumed_B": 126.837850112, "train/loss_slope": 2.3085680874911137e-06} {"step": 60490, "timestamp": 1778259922.5953374, "train/loss": 2.154404616355896, "train/z_loss": 0.0013690044987015426, "train/perplexity": 8.622754803184144, "train/grad_norm": 0.08544921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029644.781125223, "perf/iters_per_sec": 0.967810049593555, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0332606077194213, "data/tokens_consumed": 126858821632, "data/tokens_consumed_B": 126.858821632, "train/loss_slope": 3.7027887349510663e-06} {"step": 60500, "timestamp": 1778259932.9403152, "grad/layer_0/attn": 0.002906389767304063, "grad/layer_0/mlp": 0.0030875809025019407, "grad/layer_0/attn_mlp_ratio": 0.941316119301468, "grad/layer_4/attn": 0.0036782941315323114, "grad/layer_4/mlp": 0.0025343052111566067, "grad/layer_4/attn_mlp_ratio": 1.4514013427425645, "grad/layer_8/attn": 0.0064249481074512005, "grad/layer_8/mlp": 0.003888391423970461, "grad/layer_8/attn_mlp_ratio": 1.6523408375529647, "grad/layer_12/attn": 0.00507474597543478, "grad/layer_12/mlp": 0.006421033293008804, "grad/layer_12/attn_mlp_ratio": 0.7903316592248444, "grad/layer_16/attn": 0.0035696793347597122, "grad/layer_16/mlp": 0.004417025949805975, "grad/layer_16/attn_mlp_ratio": 0.8081635232639202, "grad/layer_20/attn": 0.0041287620551884174, "grad/layer_20/mlp": 0.005788573529571295, "grad/layer_20/attn_mlp_ratio": 0.7132606958813438, "grad/layer_24/attn": 0.007964751683175564, "grad/layer_24/mlp": 0.009735958650708199, "grad/layer_24/attn_mlp_ratio": 0.8180757424219987, "grad/layer_27/attn": 0.006261029746383429, "grad/layer_27/mlp": 0.008576770313084126, "grad/layer_27/attn_mlp_ratio": 0.7299985244832993} {"step": 60500, "timestamp": 1778259932.9544554, "train/loss": 2.1577258825302126, "train/z_loss": 0.0013839369756169617, "train/perplexity": 8.65144087771668, "train/grad_norm": 0.123046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025897.7336460007, "perf/iters_per_sec": 0.9660233181219104, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351716995239257, "data/tokens_consumed": 126879793152, "data/tokens_consumed_B": 126.879793152, "train/loss_slope": 5.138062629619594e-06} {"step": 60500, "timestamp": 1778259940.0746126, "geo/ww_alpha_mean": 7.509879907953927, "geo/ww_alpha_std": 4.401742688395053, "geo/ww_alpha_min": 1.3593833445750558, "geo/ww_alpha_max": 27.49872218361842, "geo/ww_alpha_healthy_frac": 0.18274111675126903, "geo/ww_alpha_by_type/q_proj": 3.986261178898025, "geo/ww_alpha_by_type/k_proj": 4.4679703349284425, "geo/ww_alpha_by_type/v_proj": 7.988608293550011, "geo/ww_alpha_by_type/o_proj": 8.299218156520286, "geo/ww_alpha_by_type/gate_proj": 8.061235130499421, "geo/ww_alpha_by_type/up_proj": 11.815478777588902, "geo/ww_alpha_by_type/down_proj": 8.043888186982864, "geo/twonn_id/layer_0": 0.674928605556488, "geo/twonn_id/layer_7": 3.4724082946777344, "geo/twonn_id/layer_14": 4.798386573791504, "geo/twonn_id/layer_21": 7.141915321350098, "geo/twonn_id/layer_27": 5.8582072257995605, "geo/tier2_time_s": 7.110251426696777} {"step": 60500, "timestamp": 1778259940.736042, "eoc/jacobian_sigma/layer_0/attn": 1187.4522705078125, "eoc/jacobian_sigma/layer_0/mlp": 10392.9599609375, "eoc/jacobian_sigma/layer_0": 10392.9599609375, "eoc/jacobian_sigma/layer_7/attn": 1.145391821861267, "eoc/jacobian_sigma/layer_7/mlp": 1.8436503410339355, "eoc/jacobian_sigma/layer_7": 1.8436503410339355, "eoc/jacobian_sigma/layer_14/attn": 1.5105211734771729, "eoc/jacobian_sigma/layer_14/mlp": 7.347196102142334, "eoc/jacobian_sigma/layer_14": 7.347196102142334, "eoc/jacobian_sigma/layer_21/attn": 1.0816071033477783, "eoc/jacobian_sigma/layer_21/mlp": 4.314075946807861, "eoc/jacobian_sigma/layer_21": 4.314075946807861, "eoc/jacobian_sigma/layer_27/attn": 3.379995584487915, "eoc/jacobian_sigma/layer_27/mlp": 27.47878074645996, "eoc/jacobian_sigma/layer_27": 27.47878074645996, "eoc/layer0_sigma": 10392.9599609375, "eoc/sigma_max": 27.47878074645996, "eoc/sigma_min": 1.8436503410339355, "eoc/sigma_mean": 10.245925784111023, "eoc/time_s": 0.6539742946624756} {"step": 60510, "timestamp": 1778259951.0967333, "train/loss": 2.134460139274597, "train/z_loss": 0.0013748829951509833, "train/perplexity": 8.452482111779272, "train/grad_norm": 0.111328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1156481.906989572, "perf/iters_per_sec": 0.5514535460422383, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.8133893728256225, "data/tokens_consumed": 126900764672, "data/tokens_consumed_B": 126.900764672, "train/loss_slope": 2.3801948800302867e-06} {"step": 60520, "timestamp": 1778259961.436598, "train/loss": 2.149563193321228, "train/z_loss": 0.001375340484082699, "train/perplexity": 8.581109292570272, "train/grad_norm": 0.1865234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029179.2760718807, "perf/iters_per_sec": 0.96758807948679, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0334976434707641, "data/tokens_consumed": 126921736192, "data/tokens_consumed_B": 126.921736192, "train/loss_slope": 4.891488768837654e-06} {"step": 60525, "timestamp": 1778259967.2052634, "eos/sharpness": 39.993691444396966, "eos/L0_probe": 1.9798219203948975, "eos/L_plus": 2.2069849967956543, "eos/L_minus": 2.1525957584381104, "eos/grad_norm": 0.10838695615530014, "eos/embed_grad_frac": 0.18843623995780945, "eos/time_s": 0.6061000823974609} {"step": 60525, "timestamp": 1778259968.5819106, "geo/rankme_last": 438.3678283691406, "geo/layer_0/stable_rank_q_proj": 19.3400821685791, "geo/layer_0/stable_rank_k_proj": 16.111244201660156, "geo/layer_0/stable_rank_o_proj": 47.184242248535156, "geo/layer_0/stable_rank_gate_proj": 132.1708221435547, "geo/layer_0/stable_rank_down_proj": 54.84731674194336, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06305410712957382, "geo/layer_0/attn_entropy_mean": 6.160456657409668, "geo/layer_0/attn_entropy_std": 0.41971126198768616, "geo/layer_7/stable_rank_q_proj": 43.5421257019043, "geo/layer_7/stable_rank_k_proj": 41.160152435302734, "geo/layer_7/stable_rank_o_proj": 92.61012268066406, "geo/layer_7/stable_rank_gate_proj": 82.24781799316406, "geo/layer_7/stable_rank_down_proj": 141.32557678222656, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.44838109612464905, "geo/layer_7/attn_entropy_mean": 4.6590352058410645, "geo/layer_7/attn_entropy_std": 0.8065445423126221, "geo/layer_14/stable_rank_q_proj": 51.87661361694336, "geo/layer_14/stable_rank_k_proj": 39.6353645324707, "geo/layer_14/stable_rank_o_proj": 43.9385871887207, "geo/layer_14/stable_rank_gate_proj": 72.22599029541016, "geo/layer_14/stable_rank_down_proj": 129.63900756835938, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39731574058532715, "geo/layer_14/attn_entropy_mean": 5.507070064544678, "geo/layer_14/attn_entropy_std": 0.40335965156555176, "geo/layer_21/stable_rank_q_proj": 40.80729293823242, "geo/layer_21/stable_rank_k_proj": 30.298288345336914, "geo/layer_21/stable_rank_o_proj": 70.70101165771484, "geo/layer_21/stable_rank_gate_proj": 66.87479400634766, "geo/layer_21/stable_rank_down_proj": 51.908103942871094, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14669516682624817, "geo/layer_21/attn_entropy_mean": 5.701450347900391, "geo/layer_21/attn_entropy_std": 0.2946304976940155, "geo/layer_27/stable_rank_q_proj": 43.094425201416016, "geo/layer_27/stable_rank_k_proj": 31.6426944732666, "geo/layer_27/stable_rank_o_proj": 116.10668182373047, "geo/layer_27/stable_rank_gate_proj": 80.63729095458984, "geo/layer_27/stable_rank_down_proj": 128.79637145996094, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09050223231315613, "geo/layer_27/attn_entropy_mean": 4.21098518371582, "geo/layer_27/attn_entropy_std": 0.7334940433502197, "attnres/final_alpha/block_0": 0.23582355678081512, "attnres/block_norm/0": 1.7605412006378174, "attnres/final_alpha/block_1": 0.004473647568374872, "attnres/block_norm/1": 46092.6640625, "attnres/final_alpha/block_2": 0.010337949730455875, "attnres/block_norm/2": 28372.3203125, "attnres/final_alpha/block_3": 0.012349853292107582, "attnres/block_norm/3": 57064.9921875, "attnres/final_alpha/block_4": 0.014378394931554794, "attnres/block_norm/4": 14916.560546875, "attnres/final_alpha/block_5": 0.6123650670051575, "attnres/block_norm/5": 6602.8876953125, "attnres/final_alpha/block_6": 0.11027152836322784, "attnres/block_norm/6": 37591.0703125, "geo/tier1_time_s": 1.3584589958190918, "geo/step": 60525.0, "geo/rankme_slope": -0.000103968775010004} {"step": 60530, "timestamp": 1778259973.7557762, "train/loss": 2.1283947706222532, "train/z_loss": 0.0013838859973475336, "train/perplexity": 8.401369855777652, "train/grad_norm": 0.1123046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1703125.6172376794, "perf/iters_per_sec": 0.8121135793865583, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2313548564910888, "data/tokens_consumed": 126942707712, "data/tokens_consumed_B": 126.942707712, "train/loss_slope": 4.557003554302088e-06} {"step": 60540, "timestamp": 1778259984.0940018, "train/loss": 2.151571846008301, "train/z_loss": 0.0013790174620226026, "train/perplexity": 8.598363083444426, "train/grad_norm": 0.203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029283.5770861793, "perf/iters_per_sec": 0.9676378140860459, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0334445238113403, "data/tokens_consumed": 126963679232, "data/tokens_consumed_B": 126.963679232, "train/loss_slope": 3.825507241256792e-06} {"step": 60550, "timestamp": 1778259994.4350314, "grad/layer_0/attn": 0.0025552050210535526, "grad/layer_0/mlp": 0.0027758495416492224, "grad/layer_0/attn_mlp_ratio": 0.9205127621881646, "grad/layer_4/attn": 0.0033780818339437246, "grad/layer_4/mlp": 0.002671194728463888, "grad/layer_4/attn_mlp_ratio": 1.2646332637168, "grad/layer_8/attn": 0.003932105842977762, "grad/layer_8/mlp": 0.003705758834257722, "grad/layer_8/attn_mlp_ratio": 1.0610797714410365, "grad/layer_12/attn": 0.004325103480368853, "grad/layer_12/mlp": 0.006544502452015877, "grad/layer_12/attn_mlp_ratio": 0.6608758184434673, "grad/layer_16/attn": 0.003396423067897558, "grad/layer_16/mlp": 0.004640894941985607, "grad/layer_16/attn_mlp_ratio": 0.7318465591595018, "grad/layer_20/attn": 0.0028629302978515625, "grad/layer_20/mlp": 0.005655262153595686, "grad/layer_20/attn_mlp_ratio": 0.5062418274291836, "grad/layer_24/attn": 0.013501576147973537, "grad/layer_24/mlp": 0.007824690081179142, "grad/layer_24/attn_mlp_ratio": 1.7255093601596017, "grad/layer_27/attn": 0.004104799125343561, "grad/layer_27/mlp": 0.006939032580703497, "grad/layer_27/attn_mlp_ratio": 0.5915520670133818} {"step": 60550, "timestamp": 1778259994.449203, "train/loss": 2.1290241479873657, "train/z_loss": 0.0013883864390663802, "train/perplexity": 8.406659152107913, "train/grad_norm": 0.08935546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026563.9782679807, "perf/iters_per_sec": 0.9663410083141235, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348313808441163, "data/tokens_consumed": 126984650752, "data/tokens_consumed_B": 126.984650752, "train/loss_slope": 4.075995458699586e-06} {"step": 60560, "timestamp": 1778260004.7902398, "train/loss": 2.1414445638656616, "train/z_loss": 0.0013934344868175685, "train/perplexity": 8.51172448179848, "train/grad_norm": 0.1953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029114.7252950051, "perf/iters_per_sec": 0.9675572992777849, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0335305213928223, "data/tokens_consumed": 127005622272, "data/tokens_consumed_B": 127.005622272, "train/loss_slope": 5.791741436106474e-06} {"step": 60570, "timestamp": 1778260015.1315243, "train/loss": 2.1700422763824463, "train/z_loss": 0.0013786625466309487, "train/perplexity": 8.758654317133432, "train/grad_norm": 0.1376953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029071.6625210622, "perf/iters_per_sec": 0.9675367653470336, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0335524559020997, "data/tokens_consumed": 127026593792, "data/tokens_consumed_B": 127.026593792, "train/loss_slope": 8.736762233657413e-06} {"step": 60580, "timestamp": 1778260025.4810061, "train/loss": 2.170698642730713, "train/z_loss": 0.0013835866120643914, "train/perplexity": 8.764405090183791, "train/grad_norm": 0.1611328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027776.094254536, "perf/iters_per_sec": 0.9669189902565651, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342128038406373, "data/tokens_consumed": 127047565312, "data/tokens_consumed_B": 127.047565312, "train/loss_slope": 1.038878453303625e-05} {"step": 60590, "timestamp": 1778260035.8258522, "train/loss": 2.176706576347351, "train/z_loss": 0.00137122618034482, "train/perplexity": 8.817219548171169, "train/grad_norm": 0.1171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028318.35974147, "perf/iters_per_sec": 0.9671775625903464, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0339363098144532, "data/tokens_consumed": 127068536832, "data/tokens_consumed_B": 127.068536832, "train/loss_slope": 1.0476684062430091e-05} {"step": 60600, "timestamp": 1778260046.156083, "grad/layer_0/attn": 0.0029007012490183115, "grad/layer_0/mlp": 0.002960021374747157, "grad/layer_0/attn_mlp_ratio": 0.9799595285929766, "grad/layer_4/attn": 0.0030945076141506433, "grad/layer_4/mlp": 0.002476206049323082, "grad/layer_4/attn_mlp_ratio": 1.249697088021765, "grad/layer_8/attn": 0.004069814924150705, "grad/layer_8/mlp": 0.00362618756480515, "grad/layer_8/attn_mlp_ratio": 1.1223398512027627, "grad/layer_12/attn": 0.004602737724781036, "grad/layer_12/mlp": 0.0068336776457726955, "grad/layer_12/attn_mlp_ratio": 0.6735374268457837, "grad/layer_16/attn": 0.005868405103683472, "grad/layer_16/mlp": 0.004650864750146866, "grad/layer_16/attn_mlp_ratio": 1.2617879239165934, "grad/layer_20/attn": 0.009399976581335068, "grad/layer_20/mlp": 0.006014049518853426, "grad/layer_20/attn_mlp_ratio": 1.5630028312149455, "grad/layer_24/attn": 0.005168030504137278, "grad/layer_24/mlp": 0.008165804669260979, "grad/layer_24/attn_mlp_ratio": 0.6328868556338256, "grad/layer_27/attn": 0.0050772144459187984, "grad/layer_27/mlp": 0.006686101201921701, "grad/layer_27/attn_mlp_ratio": 0.759368459532541} {"step": 60600, "timestamp": 1778260046.7549765, "eos/sharpness": 41.4046287536621, "eos/L0_probe": 1.9801900386810303, "eos/L_plus": 2.1440556049346924, "eos/L_minus": 2.2303707599639893, "eos/grad_norm": 0.11665213108062744, "eos/embed_grad_frac": 0.15793514251708984, "eos/time_s": 0.5957973003387451} {"step": 60600, "timestamp": 1778260046.7748325, "train/loss": 2.166969394683838, "train/z_loss": 0.0013900968013331294, "train/perplexity": 8.73178131849664, "train/grad_norm": 0.11669921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1916320.723645248, "perf/iters_per_sec": 0.9137729280687561, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0943637847900392, "data/tokens_consumed": 127089508352, "data/tokens_consumed_B": 127.089508352, "train/loss_slope": 1.1648230154951344e-05} {"step": 60600, "timestamp": 1778260048.1371467, "geo/rankme_last": 438.6710205078125, "geo/layer_0/stable_rank_q_proj": 19.35079574584961, "geo/layer_0/stable_rank_k_proj": 16.150123596191406, "geo/layer_0/stable_rank_o_proj": 47.14995193481445, "geo/layer_0/stable_rank_gate_proj": 132.14559936523438, "geo/layer_0/stable_rank_down_proj": 54.8591194152832, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0650959238409996, "geo/layer_0/attn_entropy_mean": 6.16330623626709, "geo/layer_0/attn_entropy_std": 0.4208647310733795, "geo/layer_7/stable_rank_q_proj": 43.547935485839844, "geo/layer_7/stable_rank_k_proj": 41.17799377441406, "geo/layer_7/stable_rank_o_proj": 92.47252655029297, "geo/layer_7/stable_rank_gate_proj": 82.27979278564453, "geo/layer_7/stable_rank_down_proj": 141.0272674560547, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4545409381389618, "geo/layer_7/attn_entropy_mean": 4.6441473960876465, "geo/layer_7/attn_entropy_std": 0.7936713099479675, "geo/layer_14/stable_rank_q_proj": 51.79133987426758, "geo/layer_14/stable_rank_k_proj": 39.769371032714844, "geo/layer_14/stable_rank_o_proj": 43.993614196777344, "geo/layer_14/stable_rank_gate_proj": 72.19544219970703, "geo/layer_14/stable_rank_down_proj": 129.7823028564453, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3901997208595276, "geo/layer_14/attn_entropy_mean": 5.550910949707031, "geo/layer_14/attn_entropy_std": 0.39703014492988586, "geo/layer_21/stable_rank_q_proj": 40.78370666503906, "geo/layer_21/stable_rank_k_proj": 30.31797218322754, "geo/layer_21/stable_rank_o_proj": 70.74247741699219, "geo/layer_21/stable_rank_gate_proj": 66.85185241699219, "geo/layer_21/stable_rank_down_proj": 51.84794235229492, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1431730091571808, "geo/layer_21/attn_entropy_mean": 5.702972412109375, "geo/layer_21/attn_entropy_std": 0.29819783568382263, "geo/layer_27/stable_rank_q_proj": 43.125606536865234, "geo/layer_27/stable_rank_k_proj": 31.663799285888672, "geo/layer_27/stable_rank_o_proj": 115.99608612060547, "geo/layer_27/stable_rank_gate_proj": 80.62242889404297, "geo/layer_27/stable_rank_down_proj": 128.80337524414062, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09174007177352905, "geo/layer_27/attn_entropy_mean": 4.214110374450684, "geo/layer_27/attn_entropy_std": 0.748224675655365, "attnres/final_alpha/block_0": 0.23696643114089966, "attnres/block_norm/0": 1.7607524394989014, "attnres/final_alpha/block_1": 0.004586858209222555, "attnres/block_norm/1": 46000.171875, "attnres/final_alpha/block_2": 0.01034761592745781, "attnres/block_norm/2": 28307.0234375, "attnres/final_alpha/block_3": 0.0123701561242342, "attnres/block_norm/3": 57245.7734375, "attnres/final_alpha/block_4": 0.014383504167199135, "attnres/block_norm/4": 14916.984375, "attnres/final_alpha/block_5": 0.6098748445510864, "attnres/block_norm/5": 6606.78466796875, "attnres/final_alpha/block_6": 0.11147059500217438, "attnres/block_norm/6": 37767.8671875, "geo/tier1_time_s": 1.358341932296753, "geo/step": 60600.0, "geo/rankme_slope": -7.588422087585035e-05} {"step": 60610, "timestamp": 1778260058.516374, "train/loss": 2.1767763614654543, "train/z_loss": 0.0013665844919160009, "train/perplexity": 8.817834880348945, "train/grad_norm": 0.201171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1786683.43067053, "perf/iters_per_sec": 0.8519570496895457, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1737680912017823, "data/tokens_consumed": 127110479872, "data/tokens_consumed_B": 127.110479872, "train/loss_slope": 1.4338504069923711e-05} {"step": 60620, "timestamp": 1778260068.8641546, "train/loss": 2.1318644523620605, "train/z_loss": 0.0013896757271140813, "train/perplexity": 8.430570564643928, "train/grad_norm": 0.2421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028092.010353766, "perf/iters_per_sec": 0.9670696307915525, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340517044067383, "data/tokens_consumed": 127131451392, "data/tokens_consumed_B": 127.131451392, "train/loss_slope": 1.3995532169736857e-05} {"step": 60630, "timestamp": 1778260079.2078292, "train/loss": 2.1464520931243896, "train/z_loss": 0.001376762439031154, "train/perplexity": 8.55445408676859, "train/grad_norm": 0.158203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028474.0744240559, "perf/iters_per_sec": 0.9672518131370811, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0338569402694702, "data/tokens_consumed": 127152422912, "data/tokens_consumed_B": 127.152422912, "train/loss_slope": 1.4541088157754465e-05} {"step": 60640, "timestamp": 1778260089.5552082, "train/loss": 2.1491445302963257, "train/z_loss": 0.0013880757964216174, "train/perplexity": 8.577517451334849, "train/grad_norm": 0.11669921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028250.2625463337, "perf/iters_per_sec": 0.967145091317336, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0339710235595703, "data/tokens_consumed": 127173394432, "data/tokens_consumed_B": 127.173394432, "train/loss_slope": 1.3705003959010198e-05} {"step": 60650, "timestamp": 1778260099.8851612, "grad/layer_0/attn": 0.003474196419119835, "grad/layer_0/mlp": 0.003353093285113573, "grad/layer_0/attn_mlp_ratio": 1.036116809195924, "grad/layer_4/attn": 0.0025184100959450006, "grad/layer_4/mlp": 0.002559434389695525, "grad/layer_4/attn_mlp_ratio": 0.9839713054130934, "grad/layer_8/attn": 0.005417476873844862, "grad/layer_8/mlp": 0.0038443112280219793, "grad/layer_8/attn_mlp_ratio": 1.4092190802435156, "grad/layer_12/attn": 0.00720279710367322, "grad/layer_12/mlp": 0.007296784780919552, "grad/layer_12/attn_mlp_ratio": 0.9871192890046544, "grad/layer_16/attn": 0.003522060578688979, "grad/layer_16/mlp": 0.004975603427737951, "grad/layer_16/attn_mlp_ratio": 0.7078660023963378, "grad/layer_20/attn": 0.0032834301237016916, "grad/layer_20/mlp": 0.0061645531095564365, "grad/layer_20/attn_mlp_ratio": 0.5326306728298883, "grad/layer_24/attn": 0.008103937841951847, "grad/layer_24/mlp": 0.00792753230780363, "grad/layer_24/attn_mlp_ratio": 1.022252250142121, "grad/layer_27/attn": 0.004200149793177843, "grad/layer_27/mlp": 0.006572554353624582, "grad/layer_27/attn_mlp_ratio": 0.6390437420965872} {"step": 60650, "timestamp": 1778260099.899637, "train/loss": 2.1505625367164614, "train/z_loss": 0.0013772905105724931, "train/perplexity": 8.589689053815402, "train/grad_norm": 0.1142578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028345.4876449793, "perf/iters_per_sec": 0.9671904981827637, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0339224815368653, "data/tokens_consumed": 127194365952, "data/tokens_consumed_B": 127.194365952, "train/loss_slope": 1.5140515764375475e-05} {"step": 60660, "timestamp": 1778260110.2432256, "train/loss": 2.126087653636932, "train/z_loss": 0.0013722729869186877, "train/perplexity": 8.382009254856737, "train/grad_norm": 0.201171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028496.2478192917, "perf/iters_per_sec": 0.967262386235853, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033845639228821, "data/tokens_consumed": 127215337472, "data/tokens_consumed_B": 127.215337472, "train/loss_slope": 1.5699077196175545e-05} {"step": 60670, "timestamp": 1778260121.023899, "train/loss": 2.17296986579895, "train/z_loss": 0.0013721638126298786, "train/perplexity": 8.784333631718912, "train/grad_norm": 0.2734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1946490.8383694785, "perf/iters_per_sec": 0.9281591598365204, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0774014234542846, "data/tokens_consumed": 127236308992, "data/tokens_consumed_B": 127.236308992, "train/loss_slope": 1.5503416775297925e-05} {"step": 60675, "timestamp": 1778260126.7856658, "eos/sharpness": 13.128733634948729, "eos/L0_probe": 1.9795325994491577, "eos/L_plus": 2.0545129776000977, "eos/L_minus": 2.035839557647705, "eos/grad_norm": 0.09887354075908661, "eos/embed_grad_frac": 0.23204825818538666, "eos/time_s": 0.6017882823944092} {"step": 60675, "timestamp": 1778260128.1616156, "geo/rankme_last": 438.78863525390625, "geo/layer_0/stable_rank_q_proj": 19.369075775146484, "geo/layer_0/stable_rank_k_proj": 16.169544219970703, "geo/layer_0/stable_rank_o_proj": 47.05544662475586, "geo/layer_0/stable_rank_gate_proj": 131.9679412841797, "geo/layer_0/stable_rank_down_proj": 54.8626708984375, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.061540085822343826, "geo/layer_0/attn_entropy_mean": 6.163365840911865, "geo/layer_0/attn_entropy_std": 0.4149056375026703, "geo/layer_7/stable_rank_q_proj": 43.604286193847656, "geo/layer_7/stable_rank_k_proj": 41.12718200683594, "geo/layer_7/stable_rank_o_proj": 92.38697052001953, "geo/layer_7/stable_rank_gate_proj": 82.21553802490234, "geo/layer_7/stable_rank_down_proj": 140.5836944580078, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.46964508295059204, "geo/layer_7/attn_entropy_mean": 4.690045356750488, "geo/layer_7/attn_entropy_std": 0.796832263469696, "geo/layer_14/stable_rank_q_proj": 51.73186492919922, "geo/layer_14/stable_rank_k_proj": 39.785614013671875, "geo/layer_14/stable_rank_o_proj": 43.97700119018555, "geo/layer_14/stable_rank_gate_proj": 72.01058197021484, "geo/layer_14/stable_rank_down_proj": 129.6475372314453, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3856898546218872, "geo/layer_14/attn_entropy_mean": 5.506742477416992, "geo/layer_14/attn_entropy_std": 0.3845534920692444, "geo/layer_21/stable_rank_q_proj": 40.801734924316406, "geo/layer_21/stable_rank_k_proj": 30.322662353515625, "geo/layer_21/stable_rank_o_proj": 70.68523406982422, "geo/layer_21/stable_rank_gate_proj": 66.81581115722656, "geo/layer_21/stable_rank_down_proj": 51.78594207763672, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13841496407985687, "geo/layer_21/attn_entropy_mean": 5.704350471496582, "geo/layer_21/attn_entropy_std": 0.29890334606170654, "geo/layer_27/stable_rank_q_proj": 43.053436279296875, "geo/layer_27/stable_rank_k_proj": 31.718894958496094, "geo/layer_27/stable_rank_o_proj": 116.13636779785156, "geo/layer_27/stable_rank_gate_proj": 80.70083618164062, "geo/layer_27/stable_rank_down_proj": 128.97305297851562, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09457537531852722, "geo/layer_27/attn_entropy_mean": 4.1974287033081055, "geo/layer_27/attn_entropy_std": 0.7293622493743896, "attnres/final_alpha/block_0": 0.23688971996307373, "attnres/block_norm/0": 1.760628581047058, "attnres/final_alpha/block_1": 0.004529002122581005, "attnres/block_norm/1": 46233.21875, "attnres/final_alpha/block_2": 0.010320565663278103, "attnres/block_norm/2": 28451.40625, "attnres/final_alpha/block_3": 0.012266505509614944, "attnres/block_norm/3": 57835.15625, "attnres/final_alpha/block_4": 0.014296951703727245, "attnres/block_norm/4": 14927.0126953125, "attnres/final_alpha/block_5": 0.6112455129623413, "attnres/block_norm/5": 6575.9873046875, "attnres/final_alpha/block_6": 0.11045172810554504, "attnres/block_norm/6": 37663.5703125, "geo/tier1_time_s": 1.3578119277954102, "geo/step": 60675.0, "geo/rankme_slope": -4.5538566989295714e-05} {"step": 60680, "timestamp": 1778260133.3396833, "train/loss": 2.120698022842407, "train/z_loss": 0.0013914412935264408, "train/perplexity": 8.336954842044241, "train/grad_norm": 0.2060546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1703489.3589025491, "perf/iters_per_sec": 0.8122870249283548, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2310919284820556, "data/tokens_consumed": 127257280512, "data/tokens_consumed_B": 127.257280512, "train/loss_slope": 1.3961627946184589e-05} {"step": 60690, "timestamp": 1778260143.6828957, "train/loss": 2.161749339103699, "train/z_loss": 0.0013689857441931963, "train/perplexity": 8.686319694035555, "train/grad_norm": 0.20703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028563.7533549012, "perf/iters_per_sec": 0.9672945753836161, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0338112354278564, "data/tokens_consumed": 127278252032, "data/tokens_consumed_B": 127.278252032, "train/loss_slope": 1.2965021461042658e-05} {"step": 60700, "timestamp": 1778260154.0178666, "grad/layer_0/attn": 0.003080895636230707, "grad/layer_0/mlp": 0.0031382706947624683, "grad/layer_0/attn_mlp_ratio": 0.981717588352313, "grad/layer_4/attn": 0.002068549394607544, "grad/layer_4/mlp": 0.002530875150114298, "grad/layer_4/attn_mlp_ratio": 0.8173257036332097, "grad/layer_8/attn": 0.005570830777287483, "grad/layer_8/mlp": 0.0034695318900048733, "grad/layer_8/attn_mlp_ratio": 1.6056432952156348, "grad/layer_12/attn": 0.006269984412938356, "grad/layer_12/mlp": 0.006554772146046162, "grad/layer_12/attn_mlp_ratio": 0.9565525967313985, "grad/layer_16/attn": 0.004400609526783228, "grad/layer_16/mlp": 0.004680114332586527, "grad/layer_16/attn_mlp_ratio": 0.9402781898115199, "grad/layer_20/attn": 0.0036000311374664307, "grad/layer_20/mlp": 0.00573588814586401, "grad/layer_20/attn_mlp_ratio": 0.6276327193198566, "grad/layer_24/attn": 0.008253434672951698, "grad/layer_24/mlp": 0.01045287400484085, "grad/layer_24/attn_mlp_ratio": 0.7895851983072708, "grad/layer_27/attn": 0.004022093489766121, "grad/layer_27/mlp": 0.010120963677763939, "grad/layer_27/attn_mlp_ratio": 0.39740222157525956} {"step": 60700, "timestamp": 1778260154.0322077, "train/loss": 2.167559361457825, "train/z_loss": 0.00137131258379668, "train/perplexity": 8.736934299246524, "train/grad_norm": 0.1328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027741.8763085464, "perf/iters_per_sec": 0.9669026738684399, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342302560806274, "data/tokens_consumed": 127299223552, "data/tokens_consumed_B": 127.299223552, "train/loss_slope": 1.460334889852275e-05} {"step": 60710, "timestamp": 1778260164.3733053, "train/loss": 2.1266148567199705, "train/z_loss": 0.0013825819361954928, "train/perplexity": 8.386429441043466, "train/grad_norm": 0.23046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029004.123177886, "perf/iters_per_sec": 0.9675045600785666, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033586859703064, "data/tokens_consumed": 127320195072, "data/tokens_consumed_B": 127.320195072, "train/loss_slope": 9.165223260702595e-06} {"step": 60720, "timestamp": 1778260174.725182, "train/loss": 2.108460307121277, "train/z_loss": 0.0013651010813191533, "train/perplexity": 8.235551298106634, "train/grad_norm": 0.1845703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027833.7812205811, "perf/iters_per_sec": 0.9669464975455194, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034183382987976, "data/tokens_consumed": 127341166592, "data/tokens_consumed_B": 127.341166592, "train/loss_slope": 8.998289782114556e-06} {"step": 60730, "timestamp": 1778260185.0693908, "train/loss": 2.17877938747406, "train/z_loss": 0.001364753267262131, "train/perplexity": 8.835514933846172, "train/grad_norm": 0.1357421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028819.0799900978, "perf/iters_per_sec": 0.9674163246107568, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0336811304092408, "data/tokens_consumed": 127362138112, "data/tokens_consumed_B": 127.362138112, "train/loss_slope": 1.0421605407744507e-05} {"step": 60740, "timestamp": 1778260195.4107058, "train/loss": 2.115571141242981, "train/z_loss": 0.001375793432816863, "train/perplexity": 8.294321642735975, "train/grad_norm": 0.18359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029014.1859273345, "perf/iters_per_sec": 0.9675093583714173, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0335817337036133, "data/tokens_consumed": 127383109632, "data/tokens_consumed_B": 127.383109632, "train/loss_slope": 8.616404850991453e-06} {"step": 60750, "timestamp": 1778260205.7476525, "grad/layer_0/attn": 0.003690199926495552, "grad/layer_0/mlp": 0.003212562995031476, "grad/layer_0/attn_mlp_ratio": 1.1486778056445912, "grad/layer_4/attn": 0.0020650143269449472, "grad/layer_4/mlp": 0.002452729269862175, "grad/layer_4/attn_mlp_ratio": 0.841925062062997, "grad/layer_8/attn": 0.008548881858587265, "grad/layer_8/mlp": 0.003657275578007102, "grad/layer_8/attn_mlp_ratio": 2.3374999894034962, "grad/layer_12/attn": 0.004255283158272505, "grad/layer_12/mlp": 0.007182833272963762, "grad/layer_12/attn_mlp_ratio": 0.5924240389996268, "grad/layer_16/attn": 0.006242972798645496, "grad/layer_16/mlp": 0.004760518204420805, "grad/layer_16/attn_mlp_ratio": 1.311406111567311, "grad/layer_20/attn": 0.0035583756398409605, "grad/layer_20/mlp": 0.00566620659083128, "grad/layer_20/attn_mlp_ratio": 0.627999618439425, "grad/layer_24/attn": 0.013940874487161636, "grad/layer_24/mlp": 0.00936776865273714, "grad/layer_24/attn_mlp_ratio": 1.4881744901194636, "grad/layer_27/attn": 0.008441155776381493, "grad/layer_27/mlp": 0.008343622088432312, "grad/layer_27/attn_mlp_ratio": 1.0116895978444953} {"step": 60750, "timestamp": 1778260206.3464842, "eos/sharpness": 75.60091018676756, "eos/L0_probe": 1.9784857034683228, "eos/L_plus": 2.290861129760742, "eos/L_minus": 2.422119379043579, "eos/grad_norm": 0.182151660323143, "eos/embed_grad_frac": 0.06394579261541367, "eos/time_s": 0.5959751605987549} {"step": 60750, "timestamp": 1778260206.366392, "train/loss": 2.139351415634155, "train/z_loss": 0.0013793112710118293, "train/perplexity": 8.493926813828981, "train/grad_norm": 0.181640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1915495.1997509794, "perf/iters_per_sec": 0.9133792876009843, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0948354244232177, "data/tokens_consumed": 127404081152, "data/tokens_consumed_B": 127.404081152, "train/loss_slope": 1.0027214681783797e-05} {"step": 60750, "timestamp": 1778260207.7275283, "geo/rankme_last": 439.0380554199219, "geo/layer_0/stable_rank_q_proj": 19.34774398803711, "geo/layer_0/stable_rank_k_proj": 16.171363830566406, "geo/layer_0/stable_rank_o_proj": 47.065162658691406, "geo/layer_0/stable_rank_gate_proj": 131.9191436767578, "geo/layer_0/stable_rank_down_proj": 54.83498001098633, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.061563123017549515, "geo/layer_0/attn_entropy_mean": 6.165007591247559, "geo/layer_0/attn_entropy_std": 0.4161633551120758, "geo/layer_7/stable_rank_q_proj": 43.66107940673828, "geo/layer_7/stable_rank_k_proj": 41.194149017333984, "geo/layer_7/stable_rank_o_proj": 92.45199584960938, "geo/layer_7/stable_rank_gate_proj": 82.19799041748047, "geo/layer_7/stable_rank_down_proj": 140.74928283691406, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4492526054382324, "geo/layer_7/attn_entropy_mean": 4.647762298583984, "geo/layer_7/attn_entropy_std": 0.8210492134094238, "geo/layer_14/stable_rank_q_proj": 51.67314529418945, "geo/layer_14/stable_rank_k_proj": 39.84326171875, "geo/layer_14/stable_rank_o_proj": 43.97249221801758, "geo/layer_14/stable_rank_gate_proj": 72.0237045288086, "geo/layer_14/stable_rank_down_proj": 129.59014892578125, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38298383355140686, "geo/layer_14/attn_entropy_mean": 5.513136863708496, "geo/layer_14/attn_entropy_std": 0.39659014344215393, "geo/layer_21/stable_rank_q_proj": 40.81675720214844, "geo/layer_21/stable_rank_k_proj": 30.237388610839844, "geo/layer_21/stable_rank_o_proj": 70.70342254638672, "geo/layer_21/stable_rank_gate_proj": 66.6423110961914, "geo/layer_21/stable_rank_down_proj": 51.893489837646484, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14001218974590302, "geo/layer_21/attn_entropy_mean": 5.695026874542236, "geo/layer_21/attn_entropy_std": 0.29507386684417725, "geo/layer_27/stable_rank_q_proj": 43.005897521972656, "geo/layer_27/stable_rank_k_proj": 31.71186065673828, "geo/layer_27/stable_rank_o_proj": 116.1349868774414, "geo/layer_27/stable_rank_gate_proj": 80.83847045898438, "geo/layer_27/stable_rank_down_proj": 129.0013885498047, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10214299708604813, "geo/layer_27/attn_entropy_mean": 4.208414077758789, "geo/layer_27/attn_entropy_std": 0.7400456666946411, "attnres/final_alpha/block_0": 0.23692050576210022, "attnres/block_norm/0": 1.7609689235687256, "attnres/final_alpha/block_1": 0.004563440568745136, "attnres/block_norm/1": 46129.34375, "attnres/final_alpha/block_2": 0.010329073294997215, "attnres/block_norm/2": 28393.96875, "attnres/final_alpha/block_3": 0.012445561587810516, "attnres/block_norm/3": 57033.8125, "attnres/final_alpha/block_4": 0.014570362865924835, "attnres/block_norm/4": 14937.173828125, "attnres/final_alpha/block_5": 0.6112293004989624, "attnres/block_norm/5": 6581.734375, "attnres/final_alpha/block_6": 0.10994171351194382, "attnres/block_norm/6": 37846.84375, "geo/tier1_time_s": 1.3578133583068848, "geo/step": 60750.0, "geo/rankme_slope": -4.474444074504801e-05} {"step": 60760, "timestamp": 1778260218.0777931, "train/loss": 2.0763171195983885, "train/z_loss": 0.0013955736765637993, "train/perplexity": 7.975043630766005, "train/grad_norm": 0.197265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1791181.3033862498, "perf/iters_per_sec": 0.8541018025332688, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1708206176757812, "data/tokens_consumed": 127425052672, "data/tokens_consumed_B": 127.425052672, "train/loss_slope": 9.18870157498757e-06} {"step": 60770, "timestamp": 1778260228.4246273, "train/loss": 2.1389582633972166, "train/z_loss": 0.0013825180707499386, "train/perplexity": 8.490588063863246, "train/grad_norm": 0.11865234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027730.751050295, "perf/iters_per_sec": 0.966897368931911, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342359304428101, "data/tokens_consumed": 127446024192, "data/tokens_consumed_B": 127.446024192, "train/loss_slope": 7.674143595485091e-06} {"step": 60780, "timestamp": 1778260238.7679543, "train/loss": 2.120429587364197, "train/z_loss": 0.0013919699122197926, "train/perplexity": 8.33471720792803, "train/grad_norm": 0.3046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028637.1116572106, "perf/iters_per_sec": 0.9673295553480199, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0337738513946533, "data/tokens_consumed": 127466995712, "data/tokens_consumed_B": 127.466995712, "train/loss_slope": 4.739093658911844e-06} {"step": 60790, "timestamp": 1778260249.122967, "train/loss": 2.1335708856582642, "train/z_loss": 0.0013753641163930298, "train/perplexity": 8.444969052497045, "train/grad_norm": 0.162109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026640.2271269192, "perf/iters_per_sec": 0.966377366603336, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347924470901488, "data/tokens_consumed": 127487967232, "data/tokens_consumed_B": 127.487967232, "train/loss_slope": 4.571273317574431e-06} {"step": 60800, "timestamp": 1778260259.4583335, "grad/layer_0/attn": 0.0029805409722030163, "grad/layer_0/mlp": 0.0032016432378441095, "grad/layer_0/attn_mlp_ratio": 0.9309409755210345, "grad/layer_4/attn": 0.002642317907884717, "grad/layer_4/mlp": 0.002538821194320917, "grad/layer_4/attn_mlp_ratio": 1.0407656158372829, "grad/layer_8/attn": 0.006267504766583443, "grad/layer_8/mlp": 0.003634158056229353, "grad/layer_8/attn_mlp_ratio": 1.7246097987893674, "grad/layer_12/attn": 0.00533066364005208, "grad/layer_12/mlp": 0.007187090814113617, "grad/layer_12/attn_mlp_ratio": 0.7416997647245583, "grad/layer_16/attn": 0.004015327896922827, "grad/layer_16/mlp": 0.005166660528630018, "grad/layer_16/attn_mlp_ratio": 0.7771611463452214, "grad/layer_20/attn": 0.004216791596263647, "grad/layer_20/mlp": 0.0066461749374866486, "grad/layer_20/attn_mlp_ratio": 0.634468934760149, "grad/layer_24/attn": 0.016185281798243523, "grad/layer_24/mlp": 0.01308520045131445, "grad/layer_24/attn_mlp_ratio": 1.2369150732365093, "grad/layer_27/attn": 0.008605475537478924, "grad/layer_27/mlp": 0.012845332734286785, "grad/layer_27/attn_mlp_ratio": 0.6699301332628123} {"step": 60800, "timestamp": 1778260259.4728692, "train/loss": 2.099132037162781, "train/z_loss": 0.001383025897666812, "train/perplexity": 8.159085055702144, "train/grad_norm": 0.2333984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027675.50052592, "perf/iters_per_sec": 0.9668710234288788, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342641115188598, "data/tokens_consumed": 127508938752, "data/tokens_consumed_B": 127.508938752, "train/loss_slope": 3.5001603683621872e-06} {"step": 60810, "timestamp": 1778260269.8418045, "train/loss": 2.1903483867645264, "train/z_loss": 0.0013683681492693723, "train/perplexity": 8.938326566997429, "train/grad_norm": 0.18359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024169.4744452343, "perf/iters_per_sec": 0.9651992199159786, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0360555410385133, "data/tokens_consumed": 127529910272, "data/tokens_consumed_B": 127.529910272, "train/loss_slope": 5.193048420518848e-06} {"step": 60820, "timestamp": 1778260280.1912777, "train/loss": 2.1517637729644776, "train/z_loss": 0.0013759275549091398, "train/perplexity": 8.600013499473729, "train/grad_norm": 0.173828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027530.9846281076, "perf/iters_per_sec": 0.9668021128788508, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034337830543518, "data/tokens_consumed": 127550881792, "data/tokens_consumed_B": 127.550881792, "train/loss_slope": 4.451200620855474e-06} {"step": 60825, "timestamp": 1778260285.9425492, "eos/sharpness": 62.01508045196532, "eos/L0_probe": 1.9791676998138428, "eos/L_plus": 2.322556257247925, "eos/L_minus": 2.255929946899414, "eos/grad_norm": 0.20484663546085358, "eos/embed_grad_frac": 0.05908501520752907, "eos/time_s": 0.5845999717712402} {"step": 60825, "timestamp": 1778260287.3199928, "geo/rankme_last": 438.10870361328125, "geo/layer_0/stable_rank_q_proj": 19.361080169677734, "geo/layer_0/stable_rank_k_proj": 16.210575103759766, "geo/layer_0/stable_rank_o_proj": 47.041603088378906, "geo/layer_0/stable_rank_gate_proj": 131.99327087402344, "geo/layer_0/stable_rank_down_proj": 54.844364166259766, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06789031624794006, "geo/layer_0/attn_entropy_mean": 6.167359352111816, "geo/layer_0/attn_entropy_std": 0.4150159955024719, "geo/layer_7/stable_rank_q_proj": 43.595638275146484, "geo/layer_7/stable_rank_k_proj": 41.196292877197266, "geo/layer_7/stable_rank_o_proj": 92.25817108154297, "geo/layer_7/stable_rank_gate_proj": 82.19727325439453, "geo/layer_7/stable_rank_down_proj": 141.107177734375, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.44549280405044556, "geo/layer_7/attn_entropy_mean": 4.624241828918457, "geo/layer_7/attn_entropy_std": 0.7873952984809875, "geo/layer_14/stable_rank_q_proj": 51.64374923706055, "geo/layer_14/stable_rank_k_proj": 39.932342529296875, "geo/layer_14/stable_rank_o_proj": 43.986572265625, "geo/layer_14/stable_rank_gate_proj": 72.01627349853516, "geo/layer_14/stable_rank_down_proj": 129.85372924804688, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.40426456928253174, "geo/layer_14/attn_entropy_mean": 5.534214973449707, "geo/layer_14/attn_entropy_std": 0.4154514670372009, "geo/layer_21/stable_rank_q_proj": 40.8660888671875, "geo/layer_21/stable_rank_k_proj": 30.324687957763672, "geo/layer_21/stable_rank_o_proj": 70.68988800048828, "geo/layer_21/stable_rank_gate_proj": 66.62909698486328, "geo/layer_21/stable_rank_down_proj": 51.8935661315918, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14281754195690155, "geo/layer_21/attn_entropy_mean": 5.71813440322876, "geo/layer_21/attn_entropy_std": 0.2836577892303467, "geo/layer_27/stable_rank_q_proj": 43.06377410888672, "geo/layer_27/stable_rank_k_proj": 31.692827224731445, "geo/layer_27/stable_rank_o_proj": 115.9703369140625, "geo/layer_27/stable_rank_gate_proj": 80.84280395507812, "geo/layer_27/stable_rank_down_proj": 128.8529510498047, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09601043909788132, "geo/layer_27/attn_entropy_mean": 4.211170673370361, "geo/layer_27/attn_entropy_std": 0.7313808798789978, "attnres/final_alpha/block_0": 0.23459486663341522, "attnres/block_norm/0": 1.7609102725982666, "attnres/final_alpha/block_1": 0.004481284413486719, "attnres/block_norm/1": 45904.3125, "attnres/final_alpha/block_2": 0.010244471952319145, "attnres/block_norm/2": 28434.857421875, "attnres/final_alpha/block_3": 0.01234174519777298, "attnres/block_norm/3": 57436.69140625, "attnres/final_alpha/block_4": 0.013990789651870728, "attnres/block_norm/4": 14932.1796875, "attnres/final_alpha/block_5": 0.6160225868225098, "attnres/block_norm/5": 6546.322265625, "attnres/final_alpha/block_6": 0.10832421481609344, "attnres/block_norm/6": 38037.8125, "geo/tier1_time_s": 1.3575000762939453, "geo/step": 60825.0, "geo/rankme_slope": -6.194002991821728e-05} {"step": 60830, "timestamp": 1778260292.4955254, "train/loss": 2.1425904035568237, "train/z_loss": 0.0013814822072163225, "train/perplexity": 8.521483143413146, "train/grad_norm": 0.259765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1705041.8543432807, "perf/iters_per_sec": 0.8130273124424365, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2299709796905518, "data/tokens_consumed": 127571853312, "data/tokens_consumed_B": 127.571853312, "train/loss_slope": 4.81076576028034e-06} {"step": 60840, "timestamp": 1778260303.217175, "train/loss": 2.127421498298645, "train/z_loss": 0.0013893127907067537, "train/perplexity": 8.393197012862684, "train/grad_norm": 0.142578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1957151.7402475032, "perf/iters_per_sec": 0.9332426739919201, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.071532654762268, "data/tokens_consumed": 127592824832, "data/tokens_consumed_B": 127.592824832, "train/loss_slope": 2.1641184394508395e-06} {"step": 60850, "timestamp": 1778260314.0498998, "grad/layer_0/attn": 0.0024507304187864065, "grad/layer_0/mlp": 0.002836803672835231, "grad/layer_0/attn_mlp_ratio": 0.8639055130475363, "grad/layer_4/attn": 0.002287720562890172, "grad/layer_4/mlp": 0.002558180596679449, "grad/layer_4/attn_mlp_ratio": 0.8942763761213828, "grad/layer_8/attn": 0.005156642757356167, "grad/layer_8/mlp": 0.0037159412167966366, "grad/layer_8/attn_mlp_ratio": 1.3877083402924943, "grad/layer_12/attn": 0.007097618654370308, "grad/layer_12/mlp": 0.006795682478696108, "grad/layer_12/attn_mlp_ratio": 1.0444305737028896, "grad/layer_16/attn": 0.0035890343133360147, "grad/layer_16/mlp": 0.004328656475991011, "grad/layer_16/attn_mlp_ratio": 0.8291335314616256, "grad/layer_20/attn": 0.004797774832695723, "grad/layer_20/mlp": 0.006210522726178169, "grad/layer_20/attn_mlp_ratio": 0.7725235003520915, "grad/layer_24/attn": 0.004540926311165094, "grad/layer_24/mlp": 0.007833895273506641, "grad/layer_24/attn_mlp_ratio": 0.579651130716144, "grad/layer_27/attn": 0.007370998617261648, "grad/layer_27/mlp": 0.0065898653119802475, "grad/layer_27/attn_mlp_ratio": 1.1185355324345951} {"step": 60850, "timestamp": 1778260314.064726, "train/loss": 2.202902340888977, "train/z_loss": 0.0013768870616331697, "train/perplexity": 9.05124521341138, "train/grad_norm": 0.08544921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1934243.9273514813, "perf/iters_per_sec": 0.9223193775899321, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0842231273651124, "data/tokens_consumed": 127613796352, "data/tokens_consumed_B": 127.613796352, "train/loss_slope": 3.3753001650567078e-06} {"step": 60860, "timestamp": 1778260324.4164155, "train/loss": 2.0907175540924072, "train/z_loss": 0.0013834544457495212, "train/perplexity": 8.090718610162126, "train/grad_norm": 0.1962890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027239.9121354285, "perf/iters_per_sec": 0.9666633186986106, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344863414764405, "data/tokens_consumed": 127634767872, "data/tokens_consumed_B": 127.634767872, "train/loss_slope": 2.236487419799027e-06} {"step": 60870, "timestamp": 1778260334.7606115, "train/loss": 2.131040167808533, "train/z_loss": 0.0013809351017698645, "train/perplexity": 8.423624238817911, "train/grad_norm": 0.12890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028873.7376923272, "perf/iters_per_sec": 0.9674423874341618, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0336532831192016, "data/tokens_consumed": 127655739392, "data/tokens_consumed_B": 127.655739392, "train/loss_slope": -1.7385826860509462e-07} {"step": 60880, "timestamp": 1778260345.10407, "train/loss": 2.1410589456558227, "train/z_loss": 0.0013786464929580688, "train/perplexity": 8.508442838612517, "train/grad_norm": 0.1201171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028521.4624590282, "perf/iters_per_sec": 0.9672744095130101, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0338327884674072, "data/tokens_consumed": 127676710912, "data/tokens_consumed_B": 127.676710912, "train/loss_slope": -1.7425156912931584e-06} {"step": 60890, "timestamp": 1778260355.4530606, "train/loss": 2.162740969657898, "train/z_loss": 0.001376795652322471, "train/perplexity": 8.694937586224118, "train/grad_norm": 0.2314453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027800.636525329, "perf/iters_per_sec": 0.9669306929232259, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342002868652345, "data/tokens_consumed": 127697682432, "data/tokens_consumed_B": 127.697682432, "train/loss_slope": 1.4229800012280593e-06} {"step": 60900, "timestamp": 1778260366.2871306, "grad/layer_0/attn": 0.0029385362286120653, "grad/layer_0/mlp": 0.0027503676246851683, "grad/layer_0/attn_mlp_ratio": 1.0684157620953891, "grad/layer_4/attn": 0.002487804275006056, "grad/layer_4/mlp": 0.0025139828212559223, "grad/layer_4/attn_mlp_ratio": 0.989586784369725, "grad/layer_8/attn": 0.004223296418786049, "grad/layer_8/mlp": 0.003549426095560193, "grad/layer_8/attn_mlp_ratio": 1.1898532850376682, "grad/layer_12/attn": 0.004362059291452169, "grad/layer_12/mlp": 0.006849133409559727, "grad/layer_12/attn_mlp_ratio": 0.636877538649786, "grad/layer_16/attn": 0.0030613981653004885, "grad/layer_16/mlp": 0.004584086127579212, "grad/layer_16/attn_mlp_ratio": 0.6678317146135289, "grad/layer_20/attn": 0.0031340494751930237, "grad/layer_20/mlp": 0.006022950168699026, "grad/layer_20/attn_mlp_ratio": 0.5203512125080169, "grad/layer_24/attn": 0.011811049655079842, "grad/layer_24/mlp": 0.009202592074871063, "grad/layer_24/attn_mlp_ratio": 1.2834481231637678, "grad/layer_27/attn": 0.007148378062993288, "grad/layer_27/mlp": 0.008999067358672619, "grad/layer_27/attn_mlp_ratio": 0.794346535996263} {"step": 60900, "timestamp": 1778260366.8748338, "eos/sharpness": 72.18229770660399, "eos/L0_probe": 1.9786193370819092, "eos/L_plus": 2.4016165733337402, "eos/L_minus": 2.277445077896118, "eos/grad_norm": 0.165451318025589, "eos/embed_grad_frac": 0.07644092291593552, "eos/time_s": 0.5847880840301514} {"step": 60900, "timestamp": 1778260366.8940194, "train/loss": 2.1761913299560547, "train/z_loss": 0.0013731386745348573, "train/perplexity": 8.812677677809344, "train/grad_norm": 0.1650390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1833859.3959694633, "perf/iters_per_sec": 0.8744523029181782, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.143572950363159, "data/tokens_consumed": 127718653952, "data/tokens_consumed_B": 127.718653952, "train/loss_slope": -2.9634775573200085e-06} {"step": 60900, "timestamp": 1778260368.2547581, "geo/rankme_last": 438.8304748535156, "geo/layer_0/stable_rank_q_proj": 19.353134155273438, "geo/layer_0/stable_rank_k_proj": 16.19524383544922, "geo/layer_0/stable_rank_o_proj": 47.05972671508789, "geo/layer_0/stable_rank_gate_proj": 132.06509399414062, "geo/layer_0/stable_rank_down_proj": 54.94063949584961, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06711383908987045, "geo/layer_0/attn_entropy_mean": 6.169076442718506, "geo/layer_0/attn_entropy_std": 0.41110339760780334, "geo/layer_7/stable_rank_q_proj": 43.5325813293457, "geo/layer_7/stable_rank_k_proj": 41.21180725097656, "geo/layer_7/stable_rank_o_proj": 92.24525451660156, "geo/layer_7/stable_rank_gate_proj": 82.07305908203125, "geo/layer_7/stable_rank_down_proj": 141.093994140625, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4602404534816742, "geo/layer_7/attn_entropy_mean": 4.642581939697266, "geo/layer_7/attn_entropy_std": 0.7984313368797302, "geo/layer_14/stable_rank_q_proj": 51.686561584472656, "geo/layer_14/stable_rank_k_proj": 39.896568298339844, "geo/layer_14/stable_rank_o_proj": 43.945274353027344, "geo/layer_14/stable_rank_gate_proj": 72.02714538574219, "geo/layer_14/stable_rank_down_proj": 129.90628051757812, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39039337635040283, "geo/layer_14/attn_entropy_mean": 5.505580425262451, "geo/layer_14/attn_entropy_std": 0.3853593170642853, "geo/layer_21/stable_rank_q_proj": 40.879364013671875, "geo/layer_21/stable_rank_k_proj": 30.377382278442383, "geo/layer_21/stable_rank_o_proj": 70.69914245605469, "geo/layer_21/stable_rank_gate_proj": 66.6047134399414, "geo/layer_21/stable_rank_down_proj": 51.81371307373047, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1460493952035904, "geo/layer_21/attn_entropy_mean": 5.694404125213623, "geo/layer_21/attn_entropy_std": 0.29724135994911194, "geo/layer_27/stable_rank_q_proj": 43.081478118896484, "geo/layer_27/stable_rank_k_proj": 31.66680335998535, "geo/layer_27/stable_rank_o_proj": 115.99745178222656, "geo/layer_27/stable_rank_gate_proj": 80.80098724365234, "geo/layer_27/stable_rank_down_proj": 128.77149963378906, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0919739156961441, "geo/layer_27/attn_entropy_mean": 4.20334529876709, "geo/layer_27/attn_entropy_std": 0.7232329845428467, "attnres/final_alpha/block_0": 0.23655839264392853, "attnres/block_norm/0": 1.761064887046814, "attnres/final_alpha/block_1": 0.004479949362576008, "attnres/block_norm/1": 46095.65234375, "attnres/final_alpha/block_2": 0.01039054337888956, "attnres/block_norm/2": 28351.90625, "attnres/final_alpha/block_3": 0.012579037807881832, "attnres/block_norm/3": 57175.2578125, "attnres/final_alpha/block_4": 0.014389737509191036, "attnres/block_norm/4": 14982.373046875, "attnres/final_alpha/block_5": 0.6124424934387207, "attnres/block_norm/5": 6643.0771484375, "attnres/final_alpha/block_6": 0.10915984213352203, "attnres/block_norm/6": 37907.078125, "geo/tier1_time_s": 1.3569416999816895, "geo/step": 60900.0, "geo/rankme_slope": -3.446101096688675e-05} {"step": 60910, "timestamp": 1778260378.5989425, "train/loss": 2.1472615957260133, "train/z_loss": 0.0013840040424838661, "train/perplexity": 8.561381743206951, "train/grad_norm": 0.09130859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1792228.368353885, "perf/iters_per_sec": 0.85460108201689, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1701365947723388, "data/tokens_consumed": 127739625472, "data/tokens_consumed_B": 127.739625472, "train/loss_slope": -1.0382758866478577e-06} {"step": 60920, "timestamp": 1778260389.3504198, "train/loss": 2.188673257827759, "train/z_loss": 0.001378601777832955, "train/perplexity": 8.923366251245973, "train/grad_norm": 0.10009765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1951827.6505167508, "perf/iters_per_sec": 0.9307039501746897, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0744555234909057, "data/tokens_consumed": 127760596992, "data/tokens_consumed_B": 127.760596992, "train/loss_slope": -1.131094585765554e-06} {"step": 60930, "timestamp": 1778260399.6962385, "train/loss": 2.1652018070220946, "train/z_loss": 0.0013713908498175441, "train/perplexity": 8.716360762180521, "train/grad_norm": 0.095703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028435.856861649, "perf/iters_per_sec": 0.9672335895832296, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0338764190673828, "data/tokens_consumed": 127781568512, "data/tokens_consumed_B": 127.781568512, "train/loss_slope": -8.642805327248122e-07} {"step": 60940, "timestamp": 1778260410.046438, "train/loss": 2.116276502609253, "train/z_loss": 0.0013771765865385533, "train/perplexity": 8.300174200623653, "train/grad_norm": 0.10205078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027224.3071414237, "perf/iters_per_sec": 0.9666558776576155, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344943046569823, "data/tokens_consumed": 127802540032, "data/tokens_consumed_B": 127.802540032, "train/loss_slope": -5.96269749321817e-06} {"step": 60950, "timestamp": 1778260420.385473, "grad/layer_0/attn": 0.002905425848439336, "grad/layer_0/mlp": 0.0030854674987494946, "grad/layer_0/attn_mlp_ratio": 0.9416484715693895, "grad/layer_4/attn": 0.0023065826389938593, "grad/layer_4/mlp": 0.0025222927797585726, "grad/layer_4/attn_mlp_ratio": 0.914478511795442, "grad/layer_8/attn": 0.005622388329356909, "grad/layer_8/mlp": 0.00362019264139235, "grad/layer_8/attn_mlp_ratio": 1.5530632568459604, "grad/layer_12/attn": 0.0039025445003062487, "grad/layer_12/mlp": 0.0069556329399347305, "grad/layer_12/attn_mlp_ratio": 0.561062447932542, "grad/layer_16/attn": 0.003400539280846715, "grad/layer_16/mlp": 0.004632866010069847, "grad/layer_16/attn_mlp_ratio": 0.7340033577606341, "grad/layer_20/attn": 0.002882255706936121, "grad/layer_20/mlp": 0.0057587698101997375, "grad/layer_20/attn_mlp_ratio": 0.5004985008744954, "grad/layer_24/attn": 0.005281062331050634, "grad/layer_24/mlp": 0.0071800812147557735, "grad/layer_24/attn_mlp_ratio": 0.7355156716954628, "grad/layer_27/attn": 0.005733904428780079, "grad/layer_27/mlp": 0.006753114052116871, "grad/layer_27/attn_mlp_ratio": 0.8490755967722974} {"step": 60950, "timestamp": 1778260420.3995779, "train/loss": 2.1413525581359862, "train/z_loss": 0.0013754604733549058, "train/perplexity": 8.510941390401735, "train/grad_norm": 0.1044921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026702.846065298, "perf/iters_per_sec": 0.9664072256399622, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347604751586914, "data/tokens_consumed": 127823511552, "data/tokens_consumed_B": 127.823511552, "train/loss_slope": -1.0584923801141772e-05} {"step": 60960, "timestamp": 1778260430.7456834, "train/loss": 2.177969288825989, "train/z_loss": 0.0013765830779448151, "train/perplexity": 8.828360193557248, "train/grad_norm": 0.130859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028354.7019428164, "perf/iters_per_sec": 0.9671948919023592, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033917784690857, "data/tokens_consumed": 127844483072, "data/tokens_consumed_B": 127.844483072, "train/loss_slope": -8.95060764716766e-06} {"step": 60970, "timestamp": 1778260441.0925932, "train/loss": 2.0888373494148254, "train/z_loss": 0.0013852484757080675, "train/perplexity": 8.075520695258913, "train/grad_norm": 0.1416015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027799.4678322964, "perf/iters_per_sec": 0.9669301356469614, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342008829116822, "data/tokens_consumed": 127865454592, "data/tokens_consumed_B": 127.865454592, "train/loss_slope": -1.2722072270837218e-05} {"step": 60975, "timestamp": 1778260446.8455691, "eos/sharpness": 80.90620040893553, "eos/L0_probe": 1.9801768064498901, "eos/L_plus": 2.3227930068969727, "eos/L_minus": 2.446622610092163, "eos/grad_norm": 0.19933034479618073, "eos/embed_grad_frac": 0.058272022753953934, "eos/time_s": 0.5856528282165527} {"step": 60975, "timestamp": 1778260448.222072, "geo/rankme_last": 439.3782043457031, "geo/layer_0/stable_rank_q_proj": 19.321496963500977, "geo/layer_0/stable_rank_k_proj": 16.189098358154297, "geo/layer_0/stable_rank_o_proj": 47.0369987487793, "geo/layer_0/stable_rank_gate_proj": 131.89674377441406, "geo/layer_0/stable_rank_down_proj": 55.01253890991211, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06363416463136673, "geo/layer_0/attn_entropy_mean": 6.164085865020752, "geo/layer_0/attn_entropy_std": 0.4112083315849304, "geo/layer_7/stable_rank_q_proj": 43.518272399902344, "geo/layer_7/stable_rank_k_proj": 41.079437255859375, "geo/layer_7/stable_rank_o_proj": 92.114990234375, "geo/layer_7/stable_rank_gate_proj": 82.03237915039062, "geo/layer_7/stable_rank_down_proj": 140.67861938476562, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4616966247558594, "geo/layer_7/attn_entropy_mean": 4.639866828918457, "geo/layer_7/attn_entropy_std": 0.8221544027328491, "geo/layer_14/stable_rank_q_proj": 51.587677001953125, "geo/layer_14/stable_rank_k_proj": 39.89783477783203, "geo/layer_14/stable_rank_o_proj": 43.9327392578125, "geo/layer_14/stable_rank_gate_proj": 71.98361206054688, "geo/layer_14/stable_rank_down_proj": 129.8778839111328, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3834742605686188, "geo/layer_14/attn_entropy_mean": 5.468636512756348, "geo/layer_14/attn_entropy_std": 0.4183768033981323, "geo/layer_21/stable_rank_q_proj": 40.8077392578125, "geo/layer_21/stable_rank_k_proj": 30.396577835083008, "geo/layer_21/stable_rank_o_proj": 70.63284301757812, "geo/layer_21/stable_rank_gate_proj": 66.67780303955078, "geo/layer_21/stable_rank_down_proj": 51.87013244628906, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1391502320766449, "geo/layer_21/attn_entropy_mean": 5.691819667816162, "geo/layer_21/attn_entropy_std": 0.30643177032470703, "geo/layer_27/stable_rank_q_proj": 43.194366455078125, "geo/layer_27/stable_rank_k_proj": 31.678749084472656, "geo/layer_27/stable_rank_o_proj": 116.02137756347656, "geo/layer_27/stable_rank_gate_proj": 80.8094711303711, "geo/layer_27/stable_rank_down_proj": 128.79989624023438, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08891567587852478, "geo/layer_27/attn_entropy_mean": 4.18832540512085, "geo/layer_27/attn_entropy_std": 0.7265167832374573, "attnres/final_alpha/block_0": 0.23875539004802704, "attnres/block_norm/0": 1.7612520456314087, "attnres/final_alpha/block_1": 0.004616246558725834, "attnres/block_norm/1": 46223.56640625, "attnres/final_alpha/block_2": 0.010462464764714241, "attnres/block_norm/2": 28424.9609375, "attnres/final_alpha/block_3": 0.012474508956074715, "attnres/block_norm/3": 57112.48046875, "attnres/final_alpha/block_4": 0.01439967192709446, "attnres/block_norm/4": 15015.5439453125, "attnres/final_alpha/block_5": 0.6076164245605469, "attnres/block_norm/5": 6647.27294921875, "attnres/final_alpha/block_6": 0.11167523264884949, "attnres/block_norm/6": 37737.56640625, "geo/tier1_time_s": 1.3567447662353516, "geo/step": 60975.0, "geo/rankme_slope": -4.446309773909564e-06} {"step": 60980, "timestamp": 1778260453.4023745, "train/loss": 2.146705174446106, "train/z_loss": 0.0013749774312600493, "train/perplexity": 8.5566193332956, "train/grad_norm": 0.154296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1704537.5511616012, "perf/iters_per_sec": 0.8127868419464117, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2303348779678345, "data/tokens_consumed": 127886426112, "data/tokens_consumed_B": 127.886426112, "train/loss_slope": -1.310765490029769e-05} {"step": 60990, "timestamp": 1778260463.7502615, "train/loss": 2.140980064868927, "train/z_loss": 0.0013800118817016483, "train/perplexity": 8.507771712415979, "train/grad_norm": 0.1572265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027595.6683693512, "perf/iters_per_sec": 0.9668329564902073, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343048334121705, "data/tokens_consumed": 127907397632, "data/tokens_consumed_B": 127.907397632, "train/loss_slope": -1.1384493060702923e-05} {"step": 61000, "timestamp": 1778260474.093166, "grad/layer_0/attn": 0.0027093144599348307, "grad/layer_0/mlp": 0.002941054292023182, "grad/layer_0/attn_mlp_ratio": 0.9212051525749121, "grad/layer_4/attn": 0.0021973897237330675, "grad/layer_4/mlp": 0.002489848528057337, "grad/layer_4/attn_mlp_ratio": 0.8825394841161667, "grad/layer_8/attn": 0.003249756060540676, "grad/layer_8/mlp": 0.0035717941354960203, "grad/layer_8/attn_mlp_ratio": 0.9098385422780038, "grad/layer_12/attn": 0.00713761243969202, "grad/layer_12/mlp": 0.006588171701878309, "grad/layer_12/attn_mlp_ratio": 1.083398043393019, "grad/layer_16/attn": 0.0032039780635386705, "grad/layer_16/mlp": 0.0045562065206468105, "grad/layer_16/attn_mlp_ratio": 0.7032117571269905, "grad/layer_20/attn": 0.005628880113363266, "grad/layer_20/mlp": 0.005538736470043659, "grad/layer_20/attn_mlp_ratio": 1.0162751093466242, "grad/layer_24/attn": 0.012398567982017994, "grad/layer_24/mlp": 0.010600565932691097, "grad/layer_24/attn_mlp_ratio": 1.1696137681499306, "grad/layer_27/attn": 0.007653424516320229, "grad/layer_27/mlp": 0.009991936385631561, "grad/layer_27/attn_mlp_ratio": 0.765960084646843} {"step": 61000, "timestamp": 1778260474.107319, "train/loss": 2.2040744543075563, "train/z_loss": 0.0013720761868171393, "train/perplexity": 9.061860519336813, "train/grad_norm": 0.1669921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025943.2282188705, "perf/iters_per_sec": 0.9660450116247513, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351484537124633, "data/tokens_consumed": 127928369152, "data/tokens_consumed_B": 127.928369152, "train/loss_slope": -5.732022854003581e-06} {"step": 61000, "timestamp": 1778260481.3976283, "geo/ww_alpha_mean": 7.716196125203658, "geo/ww_alpha_std": 5.06533392036962, "geo/ww_alpha_min": 1.3394931303595907, "geo/ww_alpha_max": 44.81648605039959, "geo/ww_alpha_healthy_frac": 0.16751269035532995, "geo/ww_alpha_by_type/q_proj": 3.9559503897181942, "geo/ww_alpha_by_type/k_proj": 4.435364629745449, "geo/ww_alpha_by_type/v_proj": 8.747696195931779, "geo/ww_alpha_by_type/o_proj": 8.82835231404178, "geo/ww_alpha_by_type/gate_proj": 8.116775733744607, "geo/ww_alpha_by_type/up_proj": 12.027741126488076, "geo/ww_alpha_by_type/down_proj": 7.999504129828608, "geo/twonn_id/layer_0": 0.7394253611564636, "geo/twonn_id/layer_7": 2.9415156841278076, "geo/twonn_id/layer_14": 4.3966264724731445, "geo/twonn_id/layer_21": 6.064256191253662, "geo/twonn_id/layer_27": 5.69334602355957, "geo/tier2_time_s": 7.282851219177246} {"step": 61000, "timestamp": 1778260482.1508503, "eoc/jacobian_sigma/layer_0/attn": 1276.1173095703125, "eoc/jacobian_sigma/layer_0/mlp": 9516.548828125, "eoc/jacobian_sigma/layer_0": 9516.548828125, "eoc/jacobian_sigma/layer_7/attn": 1.1447612047195435, "eoc/jacobian_sigma/layer_7/mlp": 1.8068222999572754, "eoc/jacobian_sigma/layer_7": 1.8068222999572754, "eoc/jacobian_sigma/layer_14/attn": 1.5133973360061646, "eoc/jacobian_sigma/layer_14/mlp": 6.30716609954834, "eoc/jacobian_sigma/layer_14": 6.30716609954834, "eoc/jacobian_sigma/layer_21/attn": 1.0631749629974365, "eoc/jacobian_sigma/layer_21/mlp": 3.8054749965667725, "eoc/jacobian_sigma/layer_21": 3.8054749965667725, "eoc/jacobian_sigma/layer_27/attn": 3.122067928314209, "eoc/jacobian_sigma/layer_27/mlp": 24.49034881591797, "eoc/jacobian_sigma/layer_27": 24.49034881591797, "eoc/layer0_sigma": 9516.548828125, "eoc/sigma_max": 24.49034881591797, "eoc/sigma_min": 1.8068222999572754, "eoc/sigma_mean": 9.102453052997589, "eoc/time_s": 0.7455048561096191} {"step": 61010, "timestamp": 1778260492.5160656, "train/loss": 2.133528161048889, "train/z_loss": 0.0013770485878922046, "train/perplexity": 8.444608252200672, "train/grad_norm": 0.1181640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1139610.090885146, "perf/iters_per_sec": 0.543408437197278, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.8402364253997803, "data/tokens_consumed": 127949340672, "data/tokens_consumed_B": 127.949340672, "train/loss_slope": -8.550603147244092e-06} {"step": 61020, "timestamp": 1778260502.8647988, "train/loss": 2.1863723278045653, "train/z_loss": 0.0013828777009621263, "train/perplexity": 8.902857813219036, "train/grad_norm": 0.1650390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027520.3757668238, "perf/iters_per_sec": 0.9667970541795844, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343432426452637, "data/tokens_consumed": 127970312192, "data/tokens_consumed_B": 127.970312192, "train/loss_slope": -7.674102256722314e-06} {"step": 61030, "timestamp": 1778260513.2110445, "train/loss": 2.112906444072723, "train/z_loss": 0.0013807818992063402, "train/perplexity": 8.272249208562457, "train/grad_norm": 0.1865234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027964.360357617, "perf/iters_per_sec": 0.9670087625301442, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034116792678833, "data/tokens_consumed": 127991283712, "data/tokens_consumed_B": 127.991283712, "train/loss_slope": -1.3440353034174345e-05} {"step": 61040, "timestamp": 1778260523.55474, "train/loss": 2.146915054321289, "train/z_loss": 0.0013825685484334827, "train/perplexity": 8.558415383964116, "train/grad_norm": 0.169921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028850.198954383, "perf/iters_per_sec": 0.9674311632892527, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0336652755737306, "data/tokens_consumed": 128012255232, "data/tokens_consumed_B": 128.012255232, "train/loss_slope": -1.2848097134237414e-05} {"step": 61050, "timestamp": 1778260533.8925557, "grad/layer_0/attn": 0.002738746115937829, "grad/layer_0/mlp": 0.0029148077592253685, "grad/layer_0/attn_mlp_ratio": 0.9395974788765907, "grad/layer_4/attn": 0.002392358612269163, "grad/layer_4/mlp": 0.0025790389627218246, "grad/layer_4/attn_mlp_ratio": 0.9276162764841391, "grad/layer_8/attn": 0.004173670895397663, "grad/layer_8/mlp": 0.003663021372631192, "grad/layer_8/attn_mlp_ratio": 1.1394066146163389, "grad/layer_12/attn": 0.004497670568525791, "grad/layer_12/mlp": 0.00628174003213644, "grad/layer_12/attn_mlp_ratio": 0.7159911861868312, "grad/layer_16/attn": 0.00715444004163146, "grad/layer_16/mlp": 0.004799410700798035, "grad/layer_16/attn_mlp_ratio": 1.4906913241187516, "grad/layer_20/attn": 0.007119722198694944, "grad/layer_20/mlp": 0.007135323248803616, "grad/layer_20/attn_mlp_ratio": 0.997813532849736, "grad/layer_24/attn": 0.02519817091524601, "grad/layer_24/mlp": 0.015010164119303226, "grad/layer_24/attn_mlp_ratio": 1.678740521895217, "grad/layer_27/attn": 0.009535340592265129, "grad/layer_27/mlp": 0.01600598730146885, "grad/layer_27/attn_mlp_ratio": 0.5957358551581817} {"step": 61050, "timestamp": 1778260534.4934318, "eos/sharpness": 85.66434383392333, "eos/L0_probe": 1.9755642414093018, "eos/L_plus": 2.347994089126587, "eos/L_minus": 2.45977783203125, "eos/grad_norm": 0.30590423941612244, "eos/embed_grad_frac": 0.028083831071853638, "eos/time_s": 0.5980367660522461} {"step": 61050, "timestamp": 1778260534.512992, "train/loss": 2.13382328748703, "train/z_loss": 0.001375340111553669, "train/perplexity": 8.447100847152884, "train/grad_norm": 0.306640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1914643.6721049359, "perf/iters_per_sec": 0.9129732475781135, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.095322346687317, "data/tokens_consumed": 128033226752, "data/tokens_consumed_B": 128.033226752, "train/loss_slope": -1.0735424216574086e-05} {"step": 61050, "timestamp": 1778260535.8754525, "geo/rankme_last": 437.8869323730469, "geo/layer_0/stable_rank_q_proj": 19.3029842376709, "geo/layer_0/stable_rank_k_proj": 16.16490936279297, "geo/layer_0/stable_rank_o_proj": 46.914878845214844, "geo/layer_0/stable_rank_gate_proj": 131.7945098876953, "geo/layer_0/stable_rank_down_proj": 55.026153564453125, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06358492374420166, "geo/layer_0/attn_entropy_mean": 6.166655540466309, "geo/layer_0/attn_entropy_std": 0.41046279668807983, "geo/layer_7/stable_rank_q_proj": 43.522911071777344, "geo/layer_7/stable_rank_k_proj": 40.93645095825195, "geo/layer_7/stable_rank_o_proj": 91.8631820678711, "geo/layer_7/stable_rank_gate_proj": 82.1777572631836, "geo/layer_7/stable_rank_down_proj": 140.74429321289062, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.46589019894599915, "geo/layer_7/attn_entropy_mean": 4.660905838012695, "geo/layer_7/attn_entropy_std": 0.7986394166946411, "geo/layer_14/stable_rank_q_proj": 51.65845489501953, "geo/layer_14/stable_rank_k_proj": 39.866275787353516, "geo/layer_14/stable_rank_o_proj": 44.0167236328125, "geo/layer_14/stable_rank_gate_proj": 72.03160858154297, "geo/layer_14/stable_rank_down_proj": 129.66629028320312, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3924928307533264, "geo/layer_14/attn_entropy_mean": 5.506036281585693, "geo/layer_14/attn_entropy_std": 0.40744638442993164, "geo/layer_21/stable_rank_q_proj": 40.74964904785156, "geo/layer_21/stable_rank_k_proj": 30.315460205078125, "geo/layer_21/stable_rank_o_proj": 70.5689468383789, "geo/layer_21/stable_rank_gate_proj": 66.69813537597656, "geo/layer_21/stable_rank_down_proj": 51.877742767333984, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14106564223766327, "geo/layer_21/attn_entropy_mean": 5.683015823364258, "geo/layer_21/attn_entropy_std": 0.3004469871520996, "geo/layer_27/stable_rank_q_proj": 43.16847229003906, "geo/layer_27/stable_rank_k_proj": 31.603126525878906, "geo/layer_27/stable_rank_o_proj": 115.87293243408203, "geo/layer_27/stable_rank_gate_proj": 80.740478515625, "geo/layer_27/stable_rank_down_proj": 128.7074737548828, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08946190774440765, "geo/layer_27/attn_entropy_mean": 4.198838233947754, "geo/layer_27/attn_entropy_std": 0.7266332507133484, "attnres/final_alpha/block_0": 0.23909559845924377, "attnres/block_norm/0": 1.7615704536437988, "attnres/final_alpha/block_1": 0.004438076633960009, "attnres/block_norm/1": 46271.3359375, "attnres/final_alpha/block_2": 0.010563693009316921, "attnres/block_norm/2": 28381.03515625, "attnres/final_alpha/block_3": 0.012808531522750854, "attnres/block_norm/3": 57331.375, "attnres/final_alpha/block_4": 0.014645451679825783, "attnres/block_norm/4": 14973.02734375, "attnres/final_alpha/block_5": 0.6082010865211487, "attnres/block_norm/5": 6604.5859375, "attnres/final_alpha/block_6": 0.11024754494428635, "attnres/block_norm/6": 38068.3671875, "geo/tier1_time_s": 1.3584108352661133, "geo/step": 61050.0, "geo/rankme_slope": -4.0442544205182074e-05} {"step": 61060, "timestamp": 1778260546.2284758, "train/loss": 2.208633828163147, "train/z_loss": 0.0013700645649805665, "train/perplexity": 9.10327126106146, "train/grad_norm": 0.17578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1790696.6155963873, "perf/iters_per_sec": 0.853870685384935, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.171137523651123, "data/tokens_consumed": 128054198272, "data/tokens_consumed_B": 128.054198272, "train/loss_slope": -8.561698438310843e-06} {"step": 61070, "timestamp": 1778260556.5716805, "train/loss": 2.1076101899147033, "train/z_loss": 0.0013897192198783158, "train/perplexity": 8.228553089312815, "train/grad_norm": 0.2265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028794.3726802801, "perf/iters_per_sec": 0.9674045432473565, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0336937189102173, "data/tokens_consumed": 128075169792, "data/tokens_consumed_B": 128.075169792, "train/loss_slope": -6.852622880543733e-06} {"step": 61080, "timestamp": 1778260566.9212995, "train/loss": 2.1612419366836546, "train/z_loss": 0.00137736601755023, "train/perplexity": 8.681913352390268, "train/grad_norm": 0.103515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027735.3787689407, "perf/iters_per_sec": 0.966899575600119, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034233570098877, "data/tokens_consumed": 128096141312, "data/tokens_consumed_B": 128.096141312, "train/loss_slope": -8.244536082522194e-06} {"step": 61090, "timestamp": 1778260577.2767715, "train/loss": 2.1539806962013244, "train/z_loss": 0.0013774127583019435, "train/perplexity": 8.619100218315966, "train/grad_norm": 0.27734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026891.660216637, "perf/iters_per_sec": 0.9664972592433152, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346640825271607, "data/tokens_consumed": 128117112832, "data/tokens_consumed_B": 128.117112832, "train/loss_slope": -1.2454515851155653e-05} {"step": 61100, "timestamp": 1778260587.6132624, "grad/layer_0/attn": 0.0026927897706627846, "grad/layer_0/mlp": 0.0028986744582653046, "grad/layer_0/attn_mlp_ratio": 0.9289727827446311, "grad/layer_4/attn": 0.0022007294464856386, "grad/layer_4/mlp": 0.002611594507470727, "grad/layer_4/attn_mlp_ratio": 0.8426765165582102, "grad/layer_8/attn": 0.0038126397412270308, "grad/layer_8/mlp": 0.003871932392939925, "grad/layer_8/attn_mlp_ratio": 0.9846865223448475, "grad/layer_12/attn": 0.0054002562537789345, "grad/layer_12/mlp": 0.006783027667552233, "grad/layer_12/attn_mlp_ratio": 0.7961424365107245, "grad/layer_16/attn": 0.004629196133464575, "grad/layer_16/mlp": 0.00457631703466177, "grad/layer_16/attn_mlp_ratio": 1.0115549244614825, "grad/layer_20/attn": 0.004726414568722248, "grad/layer_20/mlp": 0.0062636034563183784, "grad/layer_20/attn_mlp_ratio": 0.7545839270038892, "grad/layer_24/attn": 0.013395484536886215, "grad/layer_24/mlp": 0.00966276228427887, "grad/layer_24/attn_mlp_ratio": 1.3862996940378471, "grad/layer_27/attn": 0.005201335996389389, "grad/layer_27/mlp": 0.008188942447304726, "grad/layer_27/attn_mlp_ratio": 0.6351657697368185} {"step": 61100, "timestamp": 1778260587.6273897, "train/loss": 2.1396423101425173, "train/z_loss": 0.0013791728066280483, "train/perplexity": 8.496398009904812, "train/grad_norm": 0.1455078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027487.1478492487, "perf/iters_per_sec": 0.9667812098737949, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343601942062377, "data/tokens_consumed": 128138084352, "data/tokens_consumed_B": 128.138084352, "train/loss_slope": -1.0672717726770744e-05} {"step": 61110, "timestamp": 1778260597.9706638, "train/loss": 2.158790397644043, "train/z_loss": 0.001398620812688023, "train/perplexity": 8.6606553709009, "train/grad_norm": 0.1904296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028926.8539487515, "perf/iters_per_sec": 0.9674677152389295, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0336262226104735, "data/tokens_consumed": 128159055872, "data/tokens_consumed_B": 128.159055872, "train/loss_slope": -1.0627297563950626e-05} {"step": 61120, "timestamp": 1778260608.3147855, "train/loss": 2.122586989402771, "train/z_loss": 0.0013850422692485154, "train/perplexity": 8.352717954264909, "train/grad_norm": 0.09912109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028877.0603017127, "perf/iters_per_sec": 0.967443971777779, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03365159034729, "data/tokens_consumed": 128180027392, "data/tokens_consumed_B": 128.180027392, "train/loss_slope": -1.1758999154977647e-05} {"step": 61125, "timestamp": 1778260614.0742993, "eos/sharpness": 20.98422050476074, "eos/L0_probe": 1.9751404523849487, "eos/L_plus": 2.088552713394165, "eos/L_minus": 2.07157039642334, "eos/grad_norm": 0.13013841211795807, "eos/embed_grad_frac": 0.16331012547016144, "eos/time_s": 0.584632396697998} {"step": 61125, "timestamp": 1778260615.4485111, "geo/rankme_last": 438.0260009765625, "geo/layer_0/stable_rank_q_proj": 19.29680061340332, "geo/layer_0/stable_rank_k_proj": 16.188114166259766, "geo/layer_0/stable_rank_o_proj": 46.98094940185547, "geo/layer_0/stable_rank_gate_proj": 131.30125427246094, "geo/layer_0/stable_rank_down_proj": 55.011451721191406, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06060346961021423, "geo/layer_0/attn_entropy_mean": 6.1645379066467285, "geo/layer_0/attn_entropy_std": 0.41757598519325256, "geo/layer_7/stable_rank_q_proj": 43.53474426269531, "geo/layer_7/stable_rank_k_proj": 40.88045120239258, "geo/layer_7/stable_rank_o_proj": 91.95518493652344, "geo/layer_7/stable_rank_gate_proj": 82.32174682617188, "geo/layer_7/stable_rank_down_proj": 140.9748992919922, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.45038437843322754, "geo/layer_7/attn_entropy_mean": 4.656365394592285, "geo/layer_7/attn_entropy_std": 0.7889075875282288, "geo/layer_14/stable_rank_q_proj": 51.57864761352539, "geo/layer_14/stable_rank_k_proj": 39.91421890258789, "geo/layer_14/stable_rank_o_proj": 43.95982360839844, "geo/layer_14/stable_rank_gate_proj": 71.94583129882812, "geo/layer_14/stable_rank_down_proj": 129.7584686279297, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39361536502838135, "geo/layer_14/attn_entropy_mean": 5.538825035095215, "geo/layer_14/attn_entropy_std": 0.40347057580947876, "geo/layer_21/stable_rank_q_proj": 40.8192024230957, "geo/layer_21/stable_rank_k_proj": 30.300458908081055, "geo/layer_21/stable_rank_o_proj": 70.55976867675781, "geo/layer_21/stable_rank_gate_proj": 66.70960235595703, "geo/layer_21/stable_rank_down_proj": 51.88310623168945, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.140238419175148, "geo/layer_21/attn_entropy_mean": 5.696295738220215, "geo/layer_21/attn_entropy_std": 0.29861247539520264, "geo/layer_27/stable_rank_q_proj": 43.190834045410156, "geo/layer_27/stable_rank_k_proj": 31.59307098388672, "geo/layer_27/stable_rank_o_proj": 115.83971405029297, "geo/layer_27/stable_rank_gate_proj": 80.82412719726562, "geo/layer_27/stable_rank_down_proj": 128.89219665527344, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09897232800722122, "geo/layer_27/attn_entropy_mean": 4.200572967529297, "geo/layer_27/attn_entropy_std": 0.7175886631011963, "attnres/final_alpha/block_0": 0.2377484291791916, "attnres/block_norm/0": 1.7614524364471436, "attnres/final_alpha/block_1": 0.0044704945757985115, "attnres/block_norm/1": 46186.140625, "attnres/final_alpha/block_2": 0.010328691452741623, "attnres/block_norm/2": 28494.74609375, "attnres/final_alpha/block_3": 0.012270193547010422, "attnres/block_norm/3": 57561.69921875, "attnres/final_alpha/block_4": 0.01444484107196331, "attnres/block_norm/4": 15000.822265625, "attnres/final_alpha/block_5": 0.6112495064735413, "attnres/block_norm/5": 6582.43212890625, "attnres/final_alpha/block_6": 0.1094878539443016, "attnres/block_norm/6": 37981.46484375, "geo/tier1_time_s": 1.3563387393951416, "geo/step": 61125.0, "geo/rankme_slope": -5.324147237019808e-05} {"step": 61130, "timestamp": 1778260620.620433, "train/loss": 2.1086666107177736, "train/z_loss": 0.001385960797779262, "train/perplexity": 8.237250497227985, "train/grad_norm": 0.2890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1704993.9322179426, "perf/iters_per_sec": 0.8130044613923753, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2300055503845215, "data/tokens_consumed": 128200998912, "data/tokens_consumed_B": 128.200998912, "train/loss_slope": -1.4692781317030203e-05} {"step": 61140, "timestamp": 1778260630.9754307, "train/loss": 2.213689613342285, "train/z_loss": 0.0013767435448244214, "train/perplexity": 9.14941198549678, "train/grad_norm": 0.1806640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026673.7074649557, "perf/iters_per_sec": 0.966393331272581, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347753524780274, "data/tokens_consumed": 128221970432, "data/tokens_consumed_B": 128.221970432, "train/loss_slope": -8.726216013496686e-06} {"step": 61150, "timestamp": 1778260641.3215961, "grad/layer_0/attn": 0.0029187335167080164, "grad/layer_0/mlp": 0.0029790857806801796, "grad/layer_0/attn_mlp_ratio": 0.9797413144872534, "grad/layer_4/attn": 0.0020800717175006866, "grad/layer_4/mlp": 0.0025206233840435743, "grad/layer_4/attn_mlp_ratio": 0.8252211132159428, "grad/layer_8/attn": 0.0045636785216629505, "grad/layer_8/mlp": 0.0035806912928819656, "grad/layer_8/attn_mlp_ratio": 1.2745243923380436, "grad/layer_12/attn": 0.004749684128910303, "grad/layer_12/mlp": 0.0070328488945961, "grad/layer_12/attn_mlp_ratio": 0.6753570469890461, "grad/layer_16/attn": 0.0037153589073568583, "grad/layer_16/mlp": 0.004558487795293331, "grad/layer_16/attn_mlp_ratio": 0.8150419596798735, "grad/layer_20/attn": 0.005976281128823757, "grad/layer_20/mlp": 0.005933055188506842, "grad/layer_20/attn_mlp_ratio": 1.0072855953998356, "grad/layer_24/attn": 0.01217299047857523, "grad/layer_24/mlp": 0.009284000843763351, "grad/layer_24/attn_mlp_ratio": 1.3111793667742564, "grad/layer_27/attn": 0.0046519506722688675, "grad/layer_27/mlp": 0.008988060988485813, "grad/layer_27/attn_mlp_ratio": 0.5175699882845995} {"step": 61150, "timestamp": 1778260641.3360107, "train/loss": 2.115137279033661, "train/z_loss": 0.0013896865537390112, "train/perplexity": 8.290723830557052, "train/grad_norm": 0.123046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025696.415526647, "perf/iters_per_sec": 0.965927322161983, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035274577140808, "data/tokens_consumed": 128242941952, "data/tokens_consumed_B": 128.242941952, "train/loss_slope": -8.1931335805166e-06} {"step": 61160, "timestamp": 1778260651.690203, "train/loss": 2.1370970726013185, "train/z_loss": 0.001386304956395179, "train/perplexity": 8.474800156218702, "train/grad_norm": 0.09375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026611.9307999983, "perf/iters_per_sec": 0.9663638738632194, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348068952560425, "data/tokens_consumed": 128263913472, "data/tokens_consumed_B": 128.263913472, "train/loss_slope": -7.317316736003305e-06} {"step": 61170, "timestamp": 1778260662.0333529, "train/loss": 2.1397448062896727, "train/z_loss": 0.0013852731557562948, "train/perplexity": 8.497268902596335, "train/grad_norm": 0.0986328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028811.5928629672, "perf/iters_per_sec": 0.9674127544703327, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0336849451065064, "data/tokens_consumed": 128284884992, "data/tokens_consumed_B": 128.284884992, "train/loss_slope": -8.175495362589427e-06} {"step": 61180, "timestamp": 1778260672.3777218, "train/loss": 2.1130450487136843, "train/z_loss": 0.001386670535430312, "train/perplexity": 8.273395860157729, "train/grad_norm": 0.1328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028673.8395860423, "perf/iters_per_sec": 0.9673470685892307, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0337551355361938, "data/tokens_consumed": 128305856512, "data/tokens_consumed_B": 128.305856512, "train/loss_slope": -1.07218844733938e-05} {"step": 61190, "timestamp": 1778260682.7273562, "train/loss": 2.1540314197540282, "train/z_loss": 0.0013854261254891754, "train/perplexity": 8.619537420788285, "train/grad_norm": 0.19921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027123.8148665559, "perf/iters_per_sec": 0.9666079592068462, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345455884933472, "data/tokens_consumed": 128326828032, "data/tokens_consumed_B": 128.326828032, "train/loss_slope": -1.2390888205813554e-05} {"step": 61200, "timestamp": 1778260693.072002, "grad/layer_0/attn": 0.0033996435813605785, "grad/layer_0/mlp": 0.003143319860100746, "grad/layer_0/attn_mlp_ratio": 1.0815455074613585, "grad/layer_4/attn": 0.0019199573434889317, "grad/layer_4/mlp": 0.0024068059865385294, "grad/layer_4/attn_mlp_ratio": 0.7977199967323566, "grad/layer_8/attn": 0.01017176266759634, "grad/layer_8/mlp": 0.003788543399423361, "grad/layer_8/attn_mlp_ratio": 2.684874192191417, "grad/layer_12/attn": 0.005548667628318071, "grad/layer_12/mlp": 0.007214999757707119, "grad/layer_12/attn_mlp_ratio": 0.7690461174979708, "grad/layer_16/attn": 0.004541187547147274, "grad/layer_16/mlp": 0.004718541633337736, "grad/layer_16/attn_mlp_ratio": 0.9624133479762598, "grad/layer_20/attn": 0.007352032698690891, "grad/layer_20/mlp": 0.006118794437497854, "grad/layer_20/attn_mlp_ratio": 1.2015492028103534, "grad/layer_24/attn": 0.00568602466955781, "grad/layer_24/mlp": 0.008586716838181019, "grad/layer_24/attn_mlp_ratio": 0.662188437151664, "grad/layer_27/attn": 0.010957133024930954, "grad/layer_27/mlp": 0.006633578799664974, "grad/layer_27/attn_mlp_ratio": 1.6517679507037053} {"step": 61200, "timestamp": 1778260693.6569648, "eos/sharpness": 19.941425323486325, "eos/L0_probe": 1.978188157081604, "eos/L_plus": 2.075394868850708, "eos/L_minus": 2.0803956985473633, "eos/grad_norm": 0.09835796803236008, "eos/embed_grad_frac": 0.23165157437324524, "eos/time_s": 0.582190990447998} {"step": 61200, "timestamp": 1778260693.677574, "train/loss": 2.149135637283325, "train/z_loss": 0.0013668981730006636, "train/perplexity": 8.57744117169982, "train/grad_norm": 0.09814453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1916368.5692490411, "perf/iters_per_sec": 0.9137957426305013, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.094336462020874, "data/tokens_consumed": 128347799552, "data/tokens_consumed_B": 128.347799552, "train/loss_slope": -1.2369114616558885e-05} {"step": 61200, "timestamp": 1778260695.040222, "geo/rankme_last": 438.26708984375, "geo/layer_0/stable_rank_q_proj": 19.312026977539062, "geo/layer_0/stable_rank_k_proj": 16.168264389038086, "geo/layer_0/stable_rank_o_proj": 46.973388671875, "geo/layer_0/stable_rank_gate_proj": 131.12960815429688, "geo/layer_0/stable_rank_down_proj": 55.055606842041016, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06117018684744835, "geo/layer_0/attn_entropy_mean": 6.161600112915039, "geo/layer_0/attn_entropy_std": 0.4229956865310669, "geo/layer_7/stable_rank_q_proj": 43.574302673339844, "geo/layer_7/stable_rank_k_proj": 40.836631774902344, "geo/layer_7/stable_rank_o_proj": 92.03805541992188, "geo/layer_7/stable_rank_gate_proj": 82.16133117675781, "geo/layer_7/stable_rank_down_proj": 140.86276245117188, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4617341160774231, "geo/layer_7/attn_entropy_mean": 4.639449596405029, "geo/layer_7/attn_entropy_std": 0.7998392581939697, "geo/layer_14/stable_rank_q_proj": 51.543907165527344, "geo/layer_14/stable_rank_k_proj": 39.93873977661133, "geo/layer_14/stable_rank_o_proj": 44.001644134521484, "geo/layer_14/stable_rank_gate_proj": 71.93557739257812, "geo/layer_14/stable_rank_down_proj": 129.4003143310547, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3981953263282776, "geo/layer_14/attn_entropy_mean": 5.5536603927612305, "geo/layer_14/attn_entropy_std": 0.40716415643692017, "geo/layer_21/stable_rank_q_proj": 40.83680725097656, "geo/layer_21/stable_rank_k_proj": 30.345373153686523, "geo/layer_21/stable_rank_o_proj": 70.57981872558594, "geo/layer_21/stable_rank_gate_proj": 66.65130615234375, "geo/layer_21/stable_rank_down_proj": 51.759063720703125, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13750052452087402, "geo/layer_21/attn_entropy_mean": 5.707856178283691, "geo/layer_21/attn_entropy_std": 0.29462432861328125, "geo/layer_27/stable_rank_q_proj": 43.29235076904297, "geo/layer_27/stable_rank_k_proj": 31.63912010192871, "geo/layer_27/stable_rank_o_proj": 115.95426177978516, "geo/layer_27/stable_rank_gate_proj": 80.76868438720703, "geo/layer_27/stable_rank_down_proj": 128.79991149902344, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09060165286064148, "geo/layer_27/attn_entropy_mean": 4.191699028015137, "geo/layer_27/attn_entropy_std": 0.7242302894592285, "attnres/final_alpha/block_0": 0.2352955937385559, "attnres/block_norm/0": 1.7618281841278076, "attnres/final_alpha/block_1": 0.0044863102957606316, "attnres/block_norm/1": 46305.0234375, "attnres/final_alpha/block_2": 0.010201221331954002, "attnres/block_norm/2": 28431.91796875, "attnres/final_alpha/block_3": 0.012273833155632019, "attnres/block_norm/3": 56850.19140625, "attnres/final_alpha/block_4": 0.014289615675807, "attnres/block_norm/4": 15026.2744140625, "attnres/final_alpha/block_5": 0.6139048337936401, "attnres/block_norm/5": 6578.82470703125, "attnres/final_alpha/block_6": 0.10954861342906952, "attnres/block_norm/6": 38041.3359375, "geo/tier1_time_s": 1.3585972785949707, "geo/step": 61200.0, "geo/rankme_slope": -6.272051789465786e-05} {"step": 61210, "timestamp": 1778260705.3786273, "train/loss": 2.1471408724784853, "train/z_loss": 0.0013764877803623676, "train/perplexity": 8.560348247784301, "train/grad_norm": 0.2490234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1792887.9281415835, "perf/iters_per_sec": 0.8549155846317212, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.169706130027771, "data/tokens_consumed": 128368771072, "data/tokens_consumed_B": 128.368771072, "train/loss_slope": -1.0772121971947575e-05} {"step": 61220, "timestamp": 1778260715.7151957, "train/loss": 2.179468905925751, "train/z_loss": 0.0013724575866945088, "train/perplexity": 8.841609285265669, "train/grad_norm": 0.177734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029770.7223779764, "perf/iters_per_sec": 0.9678701030626184, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033196496963501, "data/tokens_consumed": 128389742592, "data/tokens_consumed_B": 128.389742592, "train/loss_slope": -6.472353537519748e-06} {"step": 61230, "timestamp": 1778260726.0559275, "train/loss": 2.1720629215240477, "train/z_loss": 0.0013680459233000875, "train/perplexity": 8.776370342298515, "train/grad_norm": 0.09375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029330.9093147381, "perf/iters_per_sec": 0.9676603838514033, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0334204196929933, "data/tokens_consumed": 128410714112, "data/tokens_consumed_B": 128.410714112, "train/loss_slope": -4.762731848841784e-06} {"step": 61240, "timestamp": 1778260736.394607, "train/loss": 2.13908429145813, "train/z_loss": 0.0013743711984716355, "train/perplexity": 8.491658183644095, "train/grad_norm": 0.19140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029824.2134994343, "perf/iters_per_sec": 0.9678956096169635, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0331692695617676, "data/tokens_consumed": 128431685632, "data/tokens_consumed_B": 128.431685632, "train/loss_slope": -5.315562758115244e-06} {"step": 61250, "timestamp": 1778260746.7362115, "grad/layer_0/attn": 0.0024981312453746796, "grad/layer_0/mlp": 0.0028163918759673834, "grad/layer_0/attn_mlp_ratio": 0.8869970042137382, "grad/layer_4/attn": 0.003381110494956374, "grad/layer_4/mlp": 0.0025004190392792225, "grad/layer_4/attn_mlp_ratio": 1.3522174909967384, "grad/layer_8/attn": 0.0030078673735260963, "grad/layer_8/mlp": 0.0035465783439576626, "grad/layer_8/attn_mlp_ratio": 0.8481040025071565, "grad/layer_12/attn": 0.00596281373873353, "grad/layer_12/mlp": 0.007088217418640852, "grad/layer_12/attn_mlp_ratio": 0.8412289440966373, "grad/layer_16/attn": 0.00381447933614254, "grad/layer_16/mlp": 0.005009918473660946, "grad/layer_16/attn_mlp_ratio": 0.7613854956040033, "grad/layer_20/attn": 0.0035764239728450775, "grad/layer_20/mlp": 0.006539770867675543, "grad/layer_20/attn_mlp_ratio": 0.5468729700967281, "grad/layer_24/attn": 0.011327994056046009, "grad/layer_24/mlp": 0.01024694461375475, "grad/layer_24/attn_mlp_ratio": 1.1054996755120712, "grad/layer_27/attn": 0.0045910547487437725, "grad/layer_27/mlp": 0.008472052402794361, "grad/layer_27/attn_mlp_ratio": 0.5419058424424886} {"step": 61250, "timestamp": 1778260746.750204, "train/loss": 2.196029210090637, "train/z_loss": 0.0013795085600577296, "train/perplexity": 8.989248122291263, "train/grad_norm": 0.154296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026446.0443039637, "perf/iters_per_sec": 0.966284773017866, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348916053771973, "data/tokens_consumed": 128452657152, "data/tokens_consumed_B": 128.452657152, "train/loss_slope": -5.098199930199607e-06} {"step": 61260, "timestamp": 1778260757.0891232, "train/loss": 2.2019116401672365, "train/z_loss": 0.0013635834795422852, "train/perplexity": 9.042282578623242, "train/grad_norm": 0.267578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029309.1858417362, "perf/iters_per_sec": 0.9676500252922707, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0334314823150634, "data/tokens_consumed": 128473628672, "data/tokens_consumed_B": 128.473628672, "train/loss_slope": -3.1736760678821906e-06} {"step": 61270, "timestamp": 1778260767.427509, "train/loss": 2.1208115100860594, "train/z_loss": 0.0013849272159859537, "train/perplexity": 8.337901033759048, "train/grad_norm": 0.1728515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029903.7996411058, "perf/iters_per_sec": 0.9679335592465905, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0331287622451781, "data/tokens_consumed": 128494600192, "data/tokens_consumed_B": 128.494600192, "train/loss_slope": -4.410493270148396e-06} {"step": 61275, "timestamp": 1778260773.1716723, "eos/sharpness": 70.94173431396483, "eos/L0_probe": 1.9787644147872925, "eos/L_plus": 2.438040018081665, "eos/L_minus": 2.2289061546325684, "eos/grad_norm": 0.16011711955070496, "eos/embed_grad_frac": 0.10179711878299713, "eos/time_s": 0.5822367668151855} {"step": 61275, "timestamp": 1778260774.5467825, "geo/rankme_last": 438.6877746582031, "geo/layer_0/stable_rank_q_proj": 19.296279907226562, "geo/layer_0/stable_rank_k_proj": 16.13149642944336, "geo/layer_0/stable_rank_o_proj": 46.92823791503906, "geo/layer_0/stable_rank_gate_proj": 131.4990692138672, "geo/layer_0/stable_rank_down_proj": 54.99504852294922, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06617522984743118, "geo/layer_0/attn_entropy_mean": 6.158428192138672, "geo/layer_0/attn_entropy_std": 0.4189761281013489, "geo/layer_7/stable_rank_q_proj": 43.62809753417969, "geo/layer_7/stable_rank_k_proj": 40.88371658325195, "geo/layer_7/stable_rank_o_proj": 91.91757202148438, "geo/layer_7/stable_rank_gate_proj": 82.07568359375, "geo/layer_7/stable_rank_down_proj": 140.92532348632812, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.45388925075531006, "geo/layer_7/attn_entropy_mean": 4.6856489181518555, "geo/layer_7/attn_entropy_std": 0.7914664149284363, "geo/layer_14/stable_rank_q_proj": 51.57238006591797, "geo/layer_14/stable_rank_k_proj": 40.07550811767578, "geo/layer_14/stable_rank_o_proj": 44.018672943115234, "geo/layer_14/stable_rank_gate_proj": 72.05415344238281, "geo/layer_14/stable_rank_down_proj": 129.4182586669922, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37791094183921814, "geo/layer_14/attn_entropy_mean": 5.508557319641113, "geo/layer_14/attn_entropy_std": 0.40393224358558655, "geo/layer_21/stable_rank_q_proj": 40.84978485107422, "geo/layer_21/stable_rank_k_proj": 30.307388305664062, "geo/layer_21/stable_rank_o_proj": 70.5354232788086, "geo/layer_21/stable_rank_gate_proj": 66.60948944091797, "geo/layer_21/stable_rank_down_proj": 51.77305603027344, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14341533184051514, "geo/layer_21/attn_entropy_mean": 5.691418647766113, "geo/layer_21/attn_entropy_std": 0.3050147294998169, "geo/layer_27/stable_rank_q_proj": 43.3812141418457, "geo/layer_27/stable_rank_k_proj": 31.741657257080078, "geo/layer_27/stable_rank_o_proj": 116.012939453125, "geo/layer_27/stable_rank_gate_proj": 80.79391479492188, "geo/layer_27/stable_rank_down_proj": 128.65432739257812, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09532289952039719, "geo/layer_27/attn_entropy_mean": 4.1885666847229, "geo/layer_27/attn_entropy_std": 0.717491090297699, "attnres/final_alpha/block_0": 0.23537619411945343, "attnres/block_norm/0": 1.7619280815124512, "attnres/final_alpha/block_1": 0.004444870166480541, "attnres/block_norm/1": 46206.69921875, "attnres/final_alpha/block_2": 0.010424479842185974, "attnres/block_norm/2": 28489.978515625, "attnres/final_alpha/block_3": 0.01245950348675251, "attnres/block_norm/3": 57468.61328125, "attnres/final_alpha/block_4": 0.014454011805355549, "attnres/block_norm/4": 15053.556640625, "attnres/final_alpha/block_5": 0.6132113337516785, "attnres/block_norm/5": 6580.5498046875, "attnres/final_alpha/block_6": 0.10962963849306107, "attnres/block_norm/6": 38198.55078125, "geo/tier1_time_s": 1.3574411869049072, "geo/step": 61275.0, "geo/rankme_slope": -7.822529402385954e-05} {"step": 61280, "timestamp": 1778260779.7179513, "train/loss": 2.1687639594078063, "train/z_loss": 0.00137690658448264, "train/perplexity": 8.747465133829992, "train/grad_norm": 0.09423828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1707361.925107466, "perf/iters_per_sec": 0.8141336083924609, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.228299617767334, "data/tokens_consumed": 128515571712, "data/tokens_consumed_B": 128.515571712, "train/loss_slope": -1.6057389726017975e-06} {"step": 61290, "timestamp": 1778260790.0604854, "train/loss": 2.1441789627075196, "train/z_loss": 0.0013743302901275457, "train/perplexity": 8.535030781201128, "train/grad_norm": 0.12060546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028590.7942545456, "perf/iters_per_sec": 0.9673074694893578, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0337974548339843, "data/tokens_consumed": 128536543232, "data/tokens_consumed_B": 128.536543232, "train/loss_slope": -2.194164209168416e-06} {"step": 61300, "timestamp": 1778260800.3922124, "grad/layer_0/attn": 0.0033340330701321363, "grad/layer_0/mlp": 0.003367102239280939, "grad/layer_0/attn_mlp_ratio": 0.990178715757162, "grad/layer_4/attn": 0.0023960659746080637, "grad/layer_4/mlp": 0.0026027897838503122, "grad/layer_4/attn_mlp_ratio": 0.9205760286203197, "grad/layer_8/attn": 0.004333327990025282, "grad/layer_8/mlp": 0.0035725776106119156, "grad/layer_8/attn_mlp_ratio": 1.2129415623776771, "grad/layer_12/attn": 0.004146468359977007, "grad/layer_12/mlp": 0.0070447600446641445, "grad/layer_12/attn_mlp_ratio": 0.5885890044273023, "grad/layer_16/attn": 0.004146774299442768, "grad/layer_16/mlp": 0.004232420586049557, "grad/layer_16/attn_mlp_ratio": 0.9797642075398869, "grad/layer_20/attn": 0.0031938799656927586, "grad/layer_20/mlp": 0.005658983718603849, "grad/layer_20/attn_mlp_ratio": 0.5643910758664678, "grad/layer_24/attn": 0.006939208600670099, "grad/layer_24/mlp": 0.00861326139420271, "grad/layer_24/attn_mlp_ratio": 0.8056423928776159, "grad/layer_27/attn": 0.004898051265627146, "grad/layer_27/mlp": 0.007552167400717735, "grad/layer_27/attn_mlp_ratio": 0.6485623187199767} {"step": 61300, "timestamp": 1778260800.4062054, "train/loss": 2.1429391384124754, "train/z_loss": 0.0013789892196655273, "train/perplexity": 8.524455399841688, "train/grad_norm": 0.09912109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028438.8973738612, "perf/iters_per_sec": 0.9672350394124323, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0338748693466187, "data/tokens_consumed": 128557514752, "data/tokens_consumed_B": 128.557514752, "train/loss_slope": -4.326394753809498e-06} {"step": 61310, "timestamp": 1778260810.7485795, "train/loss": 2.213097167015076, "train/z_loss": 0.0013759499765001238, "train/perplexity": 9.143993055340983, "train/grad_norm": 0.1484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029022.6574356493, "perf/iters_per_sec": 0.9675133979013678, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0335774183273316, "data/tokens_consumed": 128578486272, "data/tokens_consumed_B": 128.578486272, "train/loss_slope": 2.3149947879767252e-06} {"step": 61320, "timestamp": 1778260821.0868547, "train/loss": 2.1128067851066588, "train/z_loss": 0.0013842982123605907, "train/perplexity": 8.271424845837554, "train/grad_norm": 0.103515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029524.0533107598, "perf/iters_per_sec": 0.9677524820855903, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0333220720291139, "data/tokens_consumed": 128599457792, "data/tokens_consumed_B": 128.599457792, "train/loss_slope": 9.93497064798168e-07} {"step": 61330, "timestamp": 1778260831.4316146, "train/loss": 2.149290978908539, "train/z_loss": 0.0013665011385455728, "train/perplexity": 8.578773708848171, "train/grad_norm": 0.3046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028303.346161978, "perf/iters_per_sec": 0.967170403557767, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0339439630508422, "data/tokens_consumed": 128620429312, "data/tokens_consumed_B": 128.620429312, "train/loss_slope": 6.160074549324846e-07} {"step": 61340, "timestamp": 1778260841.773232, "train/loss": 2.1761962890625, "train/z_loss": 0.0013907356187701225, "train/perplexity": 8.812721380924382, "train/grad_norm": 0.1123046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029131.7168706516, "perf/iters_per_sec": 0.9675654014924295, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033521866798401, "data/tokens_consumed": 128641400832, "data/tokens_consumed_B": 128.641400832, "train/loss_slope": 5.64400031693232e-06} {"step": 61350, "timestamp": 1778260852.1006246, "grad/layer_0/attn": 0.0028343992307782173, "grad/layer_0/mlp": 0.0030976508278399706, "grad/layer_0/attn_mlp_ratio": 0.9150156995755097, "grad/layer_4/attn": 0.003301518503576517, "grad/layer_4/mlp": 0.0026471351739019156, "grad/layer_4/attn_mlp_ratio": 1.2472043027517936, "grad/layer_8/attn": 0.003800493199378252, "grad/layer_8/mlp": 0.003574106842279434, "grad/layer_8/attn_mlp_ratio": 1.0633406500574483, "grad/layer_12/attn": 0.004609591327607632, "grad/layer_12/mlp": 0.006820865906774998, "grad/layer_12/attn_mlp_ratio": 0.6758073422097779, "grad/layer_16/attn": 0.0036400321405380964, "grad/layer_16/mlp": 0.004421045072376728, "grad/layer_16/attn_mlp_ratio": 0.8233419923599733, "grad/layer_20/attn": 0.0028761932626366615, "grad/layer_20/mlp": 0.006003853399306536, "grad/layer_20/attn_mlp_ratio": 0.4790578689118366, "grad/layer_24/attn": 0.013427156955003738, "grad/layer_24/mlp": 0.010025559924542904, "grad/layer_24/attn_mlp_ratio": 1.3392924606838534, "grad/layer_27/attn": 0.008815511129796505, "grad/layer_27/mlp": 0.01013271789997816, "grad/layer_27/attn_mlp_ratio": 0.8700045861155423} {"step": 61350, "timestamp": 1778260852.6848052, "eos/sharpness": 80.91197013854979, "eos/L0_probe": 1.976257085800171, "eos/L_plus": 2.465643882751465, "eos/L_minus": 2.295989990234375, "eos/grad_norm": 0.19799967110157013, "eos/embed_grad_frac": 0.06012331321835518, "eos/time_s": 0.5813953876495361} {"step": 61350, "timestamp": 1778260852.704169, "train/loss": 2.146859550476074, "train/z_loss": 0.0013854593853466213, "train/perplexity": 8.55794037218397, "train/grad_norm": 0.197265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1919484.9231919553, "perf/iters_per_sec": 0.9152817359885956, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0925597667694091, "data/tokens_consumed": 128662372352, "data/tokens_consumed_B": 128.662372352, "train/loss_slope": 6.228462540277143e-06} {"step": 61350, "timestamp": 1778260854.0648162, "geo/rankme_last": 439.1197509765625, "geo/layer_0/stable_rank_q_proj": 19.295129776000977, "geo/layer_0/stable_rank_k_proj": 16.142553329467773, "geo/layer_0/stable_rank_o_proj": 46.9688835144043, "geo/layer_0/stable_rank_gate_proj": 131.35304260253906, "geo/layer_0/stable_rank_down_proj": 55.0137825012207, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06354454159736633, "geo/layer_0/attn_entropy_mean": 6.162210941314697, "geo/layer_0/attn_entropy_std": 0.4160121977329254, "geo/layer_7/stable_rank_q_proj": 43.5526123046875, "geo/layer_7/stable_rank_k_proj": 40.93098831176758, "geo/layer_7/stable_rank_o_proj": 91.6009292602539, "geo/layer_7/stable_rank_gate_proj": 82.01132202148438, "geo/layer_7/stable_rank_down_proj": 140.84188842773438, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4513883590698242, "geo/layer_7/attn_entropy_mean": 4.656981468200684, "geo/layer_7/attn_entropy_std": 0.8053539991378784, "geo/layer_14/stable_rank_q_proj": 51.52439880371094, "geo/layer_14/stable_rank_k_proj": 39.97996139526367, "geo/layer_14/stable_rank_o_proj": 44.08328628540039, "geo/layer_14/stable_rank_gate_proj": 72.04553985595703, "geo/layer_14/stable_rank_down_proj": 129.43984985351562, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3904274106025696, "geo/layer_14/attn_entropy_mean": 5.520633697509766, "geo/layer_14/attn_entropy_std": 0.4080502390861511, "geo/layer_21/stable_rank_q_proj": 40.85211944580078, "geo/layer_21/stable_rank_k_proj": 30.417787551879883, "geo/layer_21/stable_rank_o_proj": 70.45166015625, "geo/layer_21/stable_rank_gate_proj": 66.51590728759766, "geo/layer_21/stable_rank_down_proj": 51.77830123901367, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14678169786930084, "geo/layer_21/attn_entropy_mean": 5.711296081542969, "geo/layer_21/attn_entropy_std": 0.3011915683746338, "geo/layer_27/stable_rank_q_proj": 43.38410186767578, "geo/layer_27/stable_rank_k_proj": 31.7780704498291, "geo/layer_27/stable_rank_o_proj": 116.02288818359375, "geo/layer_27/stable_rank_gate_proj": 80.7922134399414, "geo/layer_27/stable_rank_down_proj": 128.67108154296875, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09601307660341263, "geo/layer_27/attn_entropy_mean": 4.214715957641602, "geo/layer_27/attn_entropy_std": 0.7266819477081299, "attnres/final_alpha/block_0": 0.2341885268688202, "attnres/block_norm/0": 1.7618982791900635, "attnres/final_alpha/block_1": 0.004463680554181337, "attnres/block_norm/1": 46161.2421875, "attnres/final_alpha/block_2": 0.010379211977124214, "attnres/block_norm/2": 28419.78515625, "attnres/final_alpha/block_3": 0.012204010039567947, "attnres/block_norm/3": 57097.0078125, "attnres/final_alpha/block_4": 0.014327877201139927, "attnres/block_norm/4": 15041.7060546875, "attnres/final_alpha/block_5": 0.6146795749664307, "attnres/block_norm/5": 6572.8603515625, "attnres/final_alpha/block_6": 0.10975709557533264, "attnres/block_norm/6": 37875.05859375, "geo/tier1_time_s": 1.356801986694336, "geo/step": 61350.0, "geo/rankme_slope": -5.504227081457583e-05} {"step": 61360, "timestamp": 1778260864.4103818, "train/loss": 2.1489155292510986, "train/z_loss": 0.0013739582267589868, "train/perplexity": 8.575553415764723, "train/grad_norm": 0.11376953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1792054.2349837844, "perf/iters_per_sec": 0.8545180487555429, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1702502965927124, "data/tokens_consumed": 128683343872, "data/tokens_consumed_B": 128.683343872, "train/loss_slope": 7.791617601224682e-06} {"step": 61370, "timestamp": 1778260874.7520123, "train/loss": 2.155438709259033, "train/z_loss": 0.001384239294566214, "train/perplexity": 8.63167614468457, "train/grad_norm": 0.1953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029180.5867900955, "perf/iters_per_sec": 0.9675887044859388, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0334969758987427, "data/tokens_consumed": 128704315392, "data/tokens_consumed_B": 128.704315392, "train/loss_slope": 6.814660111812707e-06} {"step": 61380, "timestamp": 1778260885.0975373, "train/loss": 2.1552435159683228, "train/z_loss": 0.0013817485887557268, "train/perplexity": 8.629991463838092, "train/grad_norm": 0.171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028898.4937336394, "perf/iters_per_sec": 0.9674541920345494, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0336406707763672, "data/tokens_consumed": 128725286912, "data/tokens_consumed_B": 128.725286912, "train/loss_slope": 8.734223143746095e-06} {"step": 61390, "timestamp": 1778260895.4383276, "train/loss": 2.1195462942123413, "train/z_loss": 0.0013957121409475803, "train/perplexity": 8.32735845974, "train/grad_norm": 0.09716796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029441.9684732894, "perf/iters_per_sec": 0.9677133409849593, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0333638668060303, "data/tokens_consumed": 128746258432, "data/tokens_consumed_B": 128.746258432, "train/loss_slope": 7.650852840010412e-06} {"step": 61400, "timestamp": 1778260905.770489, "grad/layer_0/attn": 0.002828532364219427, "grad/layer_0/mlp": 0.0030473624356091022, "grad/layer_0/attn_mlp_ratio": 0.9281903059341984, "grad/layer_4/attn": 0.0020940727554261684, "grad/layer_4/mlp": 0.002626435598358512, "grad/layer_4/attn_mlp_ratio": 0.79730592937605, "grad/layer_8/attn": 0.004067692440003157, "grad/layer_8/mlp": 0.003859610529616475, "grad/layer_8/attn_mlp_ratio": 1.053912643101866, "grad/layer_12/attn": 0.005268634296953678, "grad/layer_12/mlp": 0.007063813507556915, "grad/layer_12/attn_mlp_ratio": 0.7458625877836383, "grad/layer_16/attn": 0.0030820004176348448, "grad/layer_16/mlp": 0.004303913563489914, "grad/layer_16/attn_mlp_ratio": 0.7160925284769174, "grad/layer_20/attn": 0.002834685379639268, "grad/layer_20/mlp": 0.005856646690517664, "grad/layer_20/attn_mlp_ratio": 0.48401166760283876, "grad/layer_24/attn": 0.0044671702198684216, "grad/layer_24/mlp": 0.007186954375356436, "grad/layer_24/attn_mlp_ratio": 0.6215665112651034, "grad/layer_27/attn": 0.003595325630158186, "grad/layer_27/mlp": 0.006549407262355089, "grad/layer_27/attn_mlp_ratio": 0.5489543452165647} {"step": 61400, "timestamp": 1778260905.7846622, "train/loss": 2.1550877571105955, "train/z_loss": 0.0013727905810810626, "train/perplexity": 8.628647370905396, "train/grad_norm": 0.103515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028203.448377015, "perf/iters_per_sec": 0.9671227685818744, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0339948892593385, "data/tokens_consumed": 128767229952, "data/tokens_consumed_B": 128.767229952, "train/loss_slope": 9.090977891085906e-06} {"step": 61410, "timestamp": 1778260916.127269, "train/loss": 2.144819474220276, "train/z_loss": 0.0013825461152009665, "train/perplexity": 8.54049931782159, "train/grad_norm": 0.11474609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029018.351466782, "perf/iters_per_sec": 0.9675113446554098, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0335796117782592, "data/tokens_consumed": 128788201472, "data/tokens_consumed_B": 128.788201472, "train/loss_slope": 1.0310404242748167e-05} {"step": 61420, "timestamp": 1778260926.4824405, "train/loss": 2.079307961463928, "train/z_loss": 0.0013775608618743717, "train/perplexity": 7.998931429644255, "train/grad_norm": 0.169921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026449.0321684093, "perf/iters_per_sec": 0.9662861977426573, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034890079498291, "data/tokens_consumed": 128809172992, "data/tokens_consumed_B": 128.809172992, "train/loss_slope": 6.174350650397046e-06} {"step": 61425, "timestamp": 1778260932.236741, "eos/sharpness": 68.52018833160399, "eos/L0_probe": 1.9774377346038818, "eos/L_plus": 2.28365421295166, "eos/L_minus": 2.3564231395721436, "eos/grad_norm": 0.15690244734287262, "eos/embed_grad_frac": 0.09548236429691315, "eos/time_s": 0.5860569477081299} {"step": 61425, "timestamp": 1778260933.6143188, "geo/rankme_last": 438.6231384277344, "geo/layer_0/stable_rank_q_proj": 19.333168029785156, "geo/layer_0/stable_rank_k_proj": 16.164525985717773, "geo/layer_0/stable_rank_o_proj": 47.03360366821289, "geo/layer_0/stable_rank_gate_proj": 131.53421020507812, "geo/layer_0/stable_rank_down_proj": 55.01144027709961, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06112150847911835, "geo/layer_0/attn_entropy_mean": 6.163536071777344, "geo/layer_0/attn_entropy_std": 0.41201597452163696, "geo/layer_7/stable_rank_q_proj": 43.56597137451172, "geo/layer_7/stable_rank_k_proj": 40.98442459106445, "geo/layer_7/stable_rank_o_proj": 91.79169464111328, "geo/layer_7/stable_rank_gate_proj": 82.0621337890625, "geo/layer_7/stable_rank_down_proj": 140.88641357421875, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.46173223853111267, "geo/layer_7/attn_entropy_mean": 4.62618350982666, "geo/layer_7/attn_entropy_std": 0.7935465574264526, "geo/layer_14/stable_rank_q_proj": 51.449249267578125, "geo/layer_14/stable_rank_k_proj": 40.059085845947266, "geo/layer_14/stable_rank_o_proj": 44.01217269897461, "geo/layer_14/stable_rank_gate_proj": 72.0233383178711, "geo/layer_14/stable_rank_down_proj": 129.83375549316406, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.383758544921875, "geo/layer_14/attn_entropy_mean": 5.514472007751465, "geo/layer_14/attn_entropy_std": 0.40293753147125244, "geo/layer_21/stable_rank_q_proj": 40.86442565917969, "geo/layer_21/stable_rank_k_proj": 30.371761322021484, "geo/layer_21/stable_rank_o_proj": 70.47948455810547, "geo/layer_21/stable_rank_gate_proj": 66.31013488769531, "geo/layer_21/stable_rank_down_proj": 51.74405288696289, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14007039368152618, "geo/layer_21/attn_entropy_mean": 5.702054977416992, "geo/layer_21/attn_entropy_std": 0.29390770196914673, "geo/layer_27/stable_rank_q_proj": 43.3955192565918, "geo/layer_27/stable_rank_k_proj": 31.712366104125977, "geo/layer_27/stable_rank_o_proj": 115.9762954711914, "geo/layer_27/stable_rank_gate_proj": 80.72481536865234, "geo/layer_27/stable_rank_down_proj": 128.79013061523438, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09446938335895538, "geo/layer_27/attn_entropy_mean": 4.203932762145996, "geo/layer_27/attn_entropy_std": 0.746300220489502, "attnres/final_alpha/block_0": 0.2382279634475708, "attnres/block_norm/0": 1.7621750831604004, "attnres/final_alpha/block_1": 0.004513864405453205, "attnres/block_norm/1": 46163.1328125, "attnres/final_alpha/block_2": 0.01058793906122446, "attnres/block_norm/2": 28399.8046875, "attnres/final_alpha/block_3": 0.012766928412020206, "attnres/block_norm/3": 57209.37109375, "attnres/final_alpha/block_4": 0.014687500894069672, "attnres/block_norm/4": 15010.0126953125, "attnres/final_alpha/block_5": 0.6084659099578857, "attnres/block_norm/5": 6605.90576171875, "attnres/final_alpha/block_6": 0.11074987053871155, "attnres/block_norm/6": 37851.22265625, "geo/tier1_time_s": 1.3596792221069336, "geo/step": 61425.0, "geo/rankme_slope": -5.330606852115846e-05} {"step": 61430, "timestamp": 1778260939.3091655, "train/loss": 2.1142525553703306, "train/z_loss": 0.001382592658046633, "train/perplexity": 8.28339207476518, "train/grad_norm": 0.1796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1635781.9124468707, "perf/iters_per_sec": 0.7800015985712389, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2820486545562744, "data/tokens_consumed": 128830144512, "data/tokens_consumed_B": 128.830144512, "train/loss_slope": 2.309820647000739e-06} {"step": 61440, "timestamp": 1778260949.6534884, "train/loss": 2.146983098983765, "train/z_loss": 0.001373815443366766, "train/perplexity": 8.55899775826375, "train/grad_norm": 0.25390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028866.0629742201, "perf/iters_per_sec": 0.9674387278433896, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033657193183899, "data/tokens_consumed": 128851116032, "data/tokens_consumed_B": 128.851116032, "train/loss_slope": -1.0279002720409207e-06} {"step": 61450, "timestamp": 1778260959.9839718, "grad/layer_0/attn": 0.003005828009918332, "grad/layer_0/mlp": 0.0032261102460324764, "grad/layer_0/attn_mlp_ratio": 0.931718908379854, "grad/layer_4/attn": 0.0024999112356454134, "grad/layer_4/mlp": 0.0026801559142768383, "grad/layer_4/attn_mlp_ratio": 0.9327484005888893, "grad/layer_8/attn": 0.00896394718438387, "grad/layer_8/mlp": 0.0038210575003176928, "grad/layer_8/attn_mlp_ratio": 2.3459335403995434, "grad/layer_12/attn": 0.005122657865285873, "grad/layer_12/mlp": 0.007873328402638435, "grad/layer_12/attn_mlp_ratio": 0.6506343363634856, "grad/layer_16/attn": 0.004354802891612053, "grad/layer_16/mlp": 0.005441983230412006, "grad/layer_16/attn_mlp_ratio": 0.8002234897111227, "grad/layer_20/attn": 0.003839921671897173, "grad/layer_20/mlp": 0.006740098353475332, "grad/layer_20/attn_mlp_ratio": 0.5697129943134928, "grad/layer_24/attn": 0.010906385257840157, "grad/layer_24/mlp": 0.010219535790383816, "grad/layer_24/attn_mlp_ratio": 1.0672094481416363, "grad/layer_27/attn": 0.006492805201560259, "grad/layer_27/mlp": 0.008213212713599205, "grad/layer_27/attn_mlp_ratio": 0.790531713826976} {"step": 61450, "timestamp": 1778260959.9981837, "train/loss": 2.1950843334198, "train/z_loss": 0.0013621970778331161, "train/perplexity": 8.980758402952643, "train/grad_norm": 0.169921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028540.175027311, "perf/iters_per_sec": 0.9672833323608928, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0338232517242432, "data/tokens_consumed": 128872087552, "data/tokens_consumed_B": 128.872087552, "train/loss_slope": 1.8981175561441777e-06} {"step": 61460, "timestamp": 1778260970.3461545, "train/loss": 2.162415099143982, "train/z_loss": 0.0013765499112196266, "train/perplexity": 8.692104624058917, "train/grad_norm": 0.09765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027966.3708373958, "perf/iters_per_sec": 0.9670097212016086, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034115767478943, "data/tokens_consumed": 128893059072, "data/tokens_consumed_B": 128.893059072, "train/loss_slope": 2.1939668432213307e-06} {"step": 61470, "timestamp": 1778260980.6853468, "train/loss": 2.1308040857315063, "train/z_loss": 0.0013701758813112973, "train/perplexity": 8.421635806837331, "train/grad_norm": 0.189453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029672.4596961061, "perf/iters_per_sec": 0.967823247764638, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0332465171813965, "data/tokens_consumed": 128914030592, "data/tokens_consumed_B": 128.914030592, "train/loss_slope": 3.3459665465562235e-06} {"step": 61480, "timestamp": 1778260991.0262446, "train/loss": 2.12148631811142, "train/z_loss": 0.0013672319008037447, "train/perplexity": 8.343529415116164, "train/grad_norm": 0.17578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029311.4798934294, "perf/iters_per_sec": 0.9676511191813609, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033430314064026, "data/tokens_consumed": 128935002112, "data/tokens_consumed_B": 128.935002112, "train/loss_slope": 2.3079431346683907e-06} {"step": 61490, "timestamp": 1778261001.3693867, "train/loss": 2.078326404094696, "train/z_loss": 0.0013857272802852093, "train/perplexity": 7.991083871597786, "train/grad_norm": 0.12890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028554.864635292, "perf/iters_per_sec": 0.9672903369118175, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0338157653808593, "data/tokens_consumed": 128955973632, "data/tokens_consumed_B": 128.955973632, "train/loss_slope": -1.3742393893663226e-06} {"step": 61500, "timestamp": 1778261011.7026975, "grad/layer_0/attn": 0.003061482682824135, "grad/layer_0/mlp": 0.003180799540132284, "grad/layer_0/attn_mlp_ratio": 0.9624883768840005, "grad/layer_4/attn": 0.0020166265312582254, "grad/layer_4/mlp": 0.002631119918078184, "grad/layer_4/attn_mlp_ratio": 0.7664517457972915, "grad/layer_8/attn": 0.005025009624660015, "grad/layer_8/mlp": 0.003828939748927951, "grad/layer_8/attn_mlp_ratio": 1.312376224992656, "grad/layer_12/attn": 0.005191899836063385, "grad/layer_12/mlp": 0.007083370350301266, "grad/layer_12/attn_mlp_ratio": 0.7329702537077624, "grad/layer_16/attn": 0.0038152446504682302, "grad/layer_16/mlp": 0.005214368458837271, "grad/layer_16/attn_mlp_ratio": 0.7316791299690897, "grad/layer_20/attn": 0.0033236665185540915, "grad/layer_20/mlp": 0.006843080278486013, "grad/layer_20/attn_mlp_ratio": 0.48569742494964396, "grad/layer_24/attn": 0.014690060168504715, "grad/layer_24/mlp": 0.012361192144453526, "grad/layer_24/attn_mlp_ratio": 1.1884015617584263, "grad/layer_27/attn": 0.014488723129034042, "grad/layer_27/mlp": 0.012056136503815651, "grad/layer_27/attn_mlp_ratio": 1.2017716458561445} {"step": 61500, "timestamp": 1778261012.2916412, "eos/sharpness": 86.77101135253905, "eos/L0_probe": 1.974568247795105, "eos/L_plus": 2.514465093612671, "eos/L_minus": 2.3023815155029297, "eos/grad_norm": 0.2565101087093353, "eos/embed_grad_frac": 0.03637823089957237, "eos/time_s": 0.5860936641693115} {"step": 61500, "timestamp": 1778261012.312075, "train/loss": 2.166189205646515, "train/z_loss": 0.0013720849761739372, "train/perplexity": 8.724971535240194, "train/grad_norm": 0.255859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1917606.4962240055, "perf/iters_per_sec": 0.9143860322113063, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0936300039291382, "data/tokens_consumed": 128976945152, "data/tokens_consumed_B": 128.976945152, "train/loss_slope": 4.557423000753794e-07} {"step": 61500, "timestamp": 1778261013.6768792, "geo/rankme_last": 438.7465515136719, "geo/layer_0/stable_rank_q_proj": 19.326452255249023, "geo/layer_0/stable_rank_k_proj": 16.18069839477539, "geo/layer_0/stable_rank_o_proj": 47.0262336730957, "geo/layer_0/stable_rank_gate_proj": 131.61314392089844, "geo/layer_0/stable_rank_down_proj": 55.06675338745117, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06145536154508591, "geo/layer_0/attn_entropy_mean": 6.162153244018555, "geo/layer_0/attn_entropy_std": 0.41454145312309265, "geo/layer_7/stable_rank_q_proj": 43.6063117980957, "geo/layer_7/stable_rank_k_proj": 41.080848693847656, "geo/layer_7/stable_rank_o_proj": 91.89253997802734, "geo/layer_7/stable_rank_gate_proj": 82.13603210449219, "geo/layer_7/stable_rank_down_proj": 140.90235900878906, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.46659204363822937, "geo/layer_7/attn_entropy_mean": 4.638278961181641, "geo/layer_7/attn_entropy_std": 0.7965824604034424, "geo/layer_14/stable_rank_q_proj": 51.40372848510742, "geo/layer_14/stable_rank_k_proj": 40.042633056640625, "geo/layer_14/stable_rank_o_proj": 44.04799270629883, "geo/layer_14/stable_rank_gate_proj": 71.99020385742188, "geo/layer_14/stable_rank_down_proj": 129.63136291503906, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39185914397239685, "geo/layer_14/attn_entropy_mean": 5.570982933044434, "geo/layer_14/attn_entropy_std": 0.40558281540870667, "geo/layer_21/stable_rank_q_proj": 40.88197708129883, "geo/layer_21/stable_rank_k_proj": 30.36082649230957, "geo/layer_21/stable_rank_o_proj": 70.48722839355469, "geo/layer_21/stable_rank_gate_proj": 66.29022216796875, "geo/layer_21/stable_rank_down_proj": 51.719295501708984, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13920827209949493, "geo/layer_21/attn_entropy_mean": 5.696885108947754, "geo/layer_21/attn_entropy_std": 0.29210203886032104, "geo/layer_27/stable_rank_q_proj": 43.425228118896484, "geo/layer_27/stable_rank_k_proj": 31.748092651367188, "geo/layer_27/stable_rank_o_proj": 115.91144561767578, "geo/layer_27/stable_rank_gate_proj": 80.79346466064453, "geo/layer_27/stable_rank_down_proj": 128.6585235595703, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10587023943662643, "geo/layer_27/attn_entropy_mean": 4.2018280029296875, "geo/layer_27/attn_entropy_std": 0.7425765991210938, "attnres/final_alpha/block_0": 0.2354489266872406, "attnres/block_norm/0": 1.762159824371338, "attnres/final_alpha/block_1": 0.004424666054546833, "attnres/block_norm/1": 46341.9609375, "attnres/final_alpha/block_2": 0.010385261848568916, "attnres/block_norm/2": 28440.501953125, "attnres/final_alpha/block_3": 0.012628493830561638, "attnres/block_norm/3": 57200.875, "attnres/final_alpha/block_4": 0.014705542474985123, "attnres/block_norm/4": 15004.189453125, "attnres/final_alpha/block_5": 0.61515212059021, "attnres/block_norm/5": 6628.06201171875, "attnres/final_alpha/block_6": 0.10725497454404831, "attnres/block_norm/6": 38219.125, "geo/tier1_time_s": 1.3606328964233398, "geo/step": 61500.0, "geo/rankme_slope": -3.261111085059024e-05} {"step": 61500, "timestamp": 1778261020.726545, "geo/ww_alpha_mean": 7.850085401207589, "geo/ww_alpha_std": 5.239210301714498, "geo/ww_alpha_min": 1.3454685516222693, "geo/ww_alpha_max": 38.51750586565923, "geo/ww_alpha_healthy_frac": 0.17258883248730963, "geo/ww_alpha_by_type/q_proj": 3.9514603659687926, "geo/ww_alpha_by_type/k_proj": 4.449010085223175, "geo/ww_alpha_by_type/v_proj": 9.505291984851212, "geo/ww_alpha_by_type/o_proj": 9.35910666327654, "geo/ww_alpha_by_type/gate_proj": 7.880505253647024, "geo/ww_alpha_by_type/up_proj": 11.700915993855164, "geo/ww_alpha_by_type/down_proj": 8.204619557117294, "geo/twonn_id/layer_0": 0.7643455862998962, "geo/twonn_id/layer_7": 3.370340347290039, "geo/twonn_id/layer_14": 4.51844596862793, "geo/twonn_id/layer_21": 6.861448764801025, "geo/twonn_id/layer_27": 5.165396690368652, "geo/tier2_time_s": 7.043400049209595} {"step": 61500, "timestamp": 1778261021.415081, "eoc/jacobian_sigma/layer_0/attn": 1247.2860107421875, "eoc/jacobian_sigma/layer_0/mlp": 8830.4423828125, "eoc/jacobian_sigma/layer_0": 8830.4423828125, "eoc/jacobian_sigma/layer_7/attn": 1.1409958600997925, "eoc/jacobian_sigma/layer_7/mlp": 1.759040117263794, "eoc/jacobian_sigma/layer_7": 1.759040117263794, "eoc/jacobian_sigma/layer_14/attn": 1.5163800716400146, "eoc/jacobian_sigma/layer_14/mlp": 5.981347560882568, "eoc/jacobian_sigma/layer_14": 5.981347560882568, "eoc/jacobian_sigma/layer_21/attn": 1.0938405990600586, "eoc/jacobian_sigma/layer_21/mlp": 4.317046642303467, "eoc/jacobian_sigma/layer_21": 4.317046642303467, "eoc/jacobian_sigma/layer_27/attn": 3.1134023666381836, "eoc/jacobian_sigma/layer_27/mlp": 21.46491813659668, "eoc/jacobian_sigma/layer_27": 21.46491813659668, "eoc/layer0_sigma": 8830.4423828125, "eoc/sigma_max": 21.46491813659668, "eoc/sigma_min": 1.759040117263794, "eoc/sigma_mean": 8.380588114261627, "eoc/time_s": 0.6818721294403076} {"step": 61510, "timestamp": 1778261031.7788281, "train/loss": 2.1744576692581177, "train/z_loss": 0.0013629143475554883, "train/perplexity": 8.797412720827063, "train/grad_norm": 0.109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1077528.1806848159, "perf/iters_per_sec": 0.513805475561531, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.9462618589401246, "data/tokens_consumed": 128997916672, "data/tokens_consumed_B": 128.997916672, "train/loss_slope": 1.3567177983972985e-06} {"step": 61520, "timestamp": 1778261042.1199992, "train/loss": 2.1856901168823244, "train/z_loss": 0.0013614778639748693, "train/perplexity": 8.896786257655942, "train/grad_norm": 0.11962890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029387.607897575, "perf/iters_per_sec": 0.9676874198425174, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033391547203064, "data/tokens_consumed": 129018888192, "data/tokens_consumed_B": 129.018888192, "train/loss_slope": 3.7923021857316123e-06} {"step": 61530, "timestamp": 1778261052.4611278, "train/loss": 2.1334619998931883, "train/z_loss": 0.001374579407274723, "train/perplexity": 8.444049565641144, "train/grad_norm": 0.1025390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029212.231786646, "perf/iters_per_sec": 0.9676037939961653, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0334808588027955, "data/tokens_consumed": 129039859712, "data/tokens_consumed_B": 129.039859712, "train/loss_slope": 1.7989348049032388e-06} {"step": 61540, "timestamp": 1778261062.8039901, "train/loss": 2.1514877557754515, "train/z_loss": 0.0013674636138603091, "train/perplexity": 8.597640075489993, "train/grad_norm": 0.16015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028712.3000656096, "perf/iters_per_sec": 0.9673654079750107, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0337355375289916, "data/tokens_consumed": 129060831232, "data/tokens_consumed_B": 129.060831232, "train/loss_slope": 2.2749943594442004e-06} {"step": 61550, "timestamp": 1778261073.1490712, "grad/layer_0/attn": 0.0029431048315018415, "grad/layer_0/mlp": 0.0032887253910303116, "grad/layer_0/attn_mlp_ratio": 0.8949074161187616, "grad/layer_4/attn": 0.003140753600746393, "grad/layer_4/mlp": 0.0026536688674241304, "grad/layer_4/attn_mlp_ratio": 1.1835513921674514, "grad/layer_8/attn": 0.00502642011269927, "grad/layer_8/mlp": 0.003812983399257064, "grad/layer_8/attn_mlp_ratio": 1.3182380971957095, "grad/layer_12/attn": 0.00506715290248394, "grad/layer_12/mlp": 0.006224929820746183, "grad/layer_12/attn_mlp_ratio": 0.8140096301480194, "grad/layer_16/attn": 0.0035627016332000494, "grad/layer_16/mlp": 0.004563331604003906, "grad/layer_16/attn_mlp_ratio": 0.7807237922402408, "grad/layer_20/attn": 0.004776688292622566, "grad/layer_20/mlp": 0.0062808385118842125, "grad/layer_20/attn_mlp_ratio": 0.76051759769538, "grad/layer_24/attn": 0.016488516703248024, "grad/layer_24/mlp": 0.013999602757394314, "grad/layer_24/attn_mlp_ratio": 1.1777846036924622, "grad/layer_27/attn": 0.0057579148560762405, "grad/layer_27/mlp": 0.014114219695329666, "grad/layer_27/attn_mlp_ratio": 0.4079513384070659} {"step": 61550, "timestamp": 1778261073.1635396, "train/loss": 2.165525126457214, "train/z_loss": 0.0013916638446971775, "train/perplexity": 8.719179386651929, "train/grad_norm": 0.23828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026295.6353896945, "perf/iters_per_sec": 0.9662130524586174, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349684238433838, "data/tokens_consumed": 129081802752, "data/tokens_consumed_B": 129.081802752, "train/loss_slope": 2.218530888390055e-06} {"step": 61560, "timestamp": 1778261083.5096378, "train/loss": 2.109430170059204, "train/z_loss": 0.0013920488301664591, "train/perplexity": 8.243542528656834, "train/grad_norm": 0.1103515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028438.1489392403, "perf/iters_per_sec": 0.9672346825309945, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0338752508163451, "data/tokens_consumed": 129102774272, "data/tokens_consumed_B": 129.102774272, "train/loss_slope": -4.6135930731743536e-07} {"step": 61570, "timestamp": 1778261093.8584836, "train/loss": 2.1225106000900267, "train/z_loss": 0.0013825182686559856, "train/perplexity": 8.352079920250636, "train/grad_norm": 0.1943359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027454.8556564678, "perf/iters_per_sec": 0.9667658117563571, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343766689300538, "data/tokens_consumed": 129123745792, "data/tokens_consumed_B": 129.123745792, "train/loss_slope": -5.925784410029678e-07} {"step": 61575, "timestamp": 1778261099.6246057, "eos/sharpness": 86.7086887359619, "eos/L0_probe": 1.9786632061004639, "eos/L_plus": 2.332904815673828, "eos/L_minus": 2.4915084838867188, "eos/grad_norm": 0.21716265380382538, "eos/embed_grad_frac": 0.04416833817958832, "eos/time_s": 0.600456714630127} {"step": 61575, "timestamp": 1778261101.00409, "geo/rankme_last": 438.70367431640625, "geo/layer_0/stable_rank_q_proj": 19.343700408935547, "geo/layer_0/stable_rank_k_proj": 16.175832748413086, "geo/layer_0/stable_rank_o_proj": 47.14569854736328, "geo/layer_0/stable_rank_gate_proj": 131.47518920898438, "geo/layer_0/stable_rank_down_proj": 55.12298583984375, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06742896884679794, "geo/layer_0/attn_entropy_mean": 6.164157867431641, "geo/layer_0/attn_entropy_std": 0.4133760333061218, "geo/layer_7/stable_rank_q_proj": 43.61852264404297, "geo/layer_7/stable_rank_k_proj": 41.09070587158203, "geo/layer_7/stable_rank_o_proj": 91.98389434814453, "geo/layer_7/stable_rank_gate_proj": 82.09840393066406, "geo/layer_7/stable_rank_down_proj": 140.95521545410156, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.48144733905792236, "geo/layer_7/attn_entropy_mean": 4.629385948181152, "geo/layer_7/attn_entropy_std": 0.7951580882072449, "geo/layer_14/stable_rank_q_proj": 51.432769775390625, "geo/layer_14/stable_rank_k_proj": 40.06504821777344, "geo/layer_14/stable_rank_o_proj": 44.07012176513672, "geo/layer_14/stable_rank_gate_proj": 71.93743133544922, "geo/layer_14/stable_rank_down_proj": 129.49911499023438, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.40553104877471924, "geo/layer_14/attn_entropy_mean": 5.57972526550293, "geo/layer_14/attn_entropy_std": 0.3953694701194763, "geo/layer_21/stable_rank_q_proj": 40.81633758544922, "geo/layer_21/stable_rank_k_proj": 30.323469161987305, "geo/layer_21/stable_rank_o_proj": 70.53145599365234, "geo/layer_21/stable_rank_gate_proj": 66.30970764160156, "geo/layer_21/stable_rank_down_proj": 51.66156005859375, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14087821543216705, "geo/layer_21/attn_entropy_mean": 5.707230567932129, "geo/layer_21/attn_entropy_std": 0.3011816740036011, "geo/layer_27/stable_rank_q_proj": 43.359039306640625, "geo/layer_27/stable_rank_k_proj": 31.77817153930664, "geo/layer_27/stable_rank_o_proj": 115.84778594970703, "geo/layer_27/stable_rank_gate_proj": 80.83661651611328, "geo/layer_27/stable_rank_down_proj": 128.67111206054688, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09306399524211884, "geo/layer_27/attn_entropy_mean": 4.194032192230225, "geo/layer_27/attn_entropy_std": 0.7204336524009705, "attnres/final_alpha/block_0": 0.23695223033428192, "attnres/block_norm/0": 1.762330174446106, "attnres/final_alpha/block_1": 0.004514200612902641, "attnres/block_norm/1": 46246.4765625, "attnres/final_alpha/block_2": 0.01063260156661272, "attnres/block_norm/2": 28628.880859375, "attnres/final_alpha/block_3": 0.012751413509249687, "attnres/block_norm/3": 57329.44140625, "attnres/final_alpha/block_4": 0.014740980230271816, "attnres/block_norm/4": 15053.6953125, "attnres/final_alpha/block_5": 0.6099735498428345, "attnres/block_norm/5": 6608.9404296875, "attnres/final_alpha/block_6": 0.11043499410152435, "attnres/block_norm/6": 38288.6640625, "geo/tier1_time_s": 1.359555959701538, "geo/step": 61575.0, "geo/rankme_slope": -3.849633603441377e-05} {"step": 61580, "timestamp": 1778261106.1760607, "train/loss": 2.153373098373413, "train/z_loss": 0.0013811505283229052, "train/perplexity": 8.613864862401272, "train/grad_norm": 0.1748046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1703522.6139863615, "perf/iters_per_sec": 0.8123028821880157, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2310678958892822, "data/tokens_consumed": 129144717312, "data/tokens_consumed_B": 129.144717312, "train/loss_slope": 1.2064416595239367e-06} {"step": 61590, "timestamp": 1778261116.5187595, "train/loss": 2.113439202308655, "train/z_loss": 0.0013818338047713041, "train/perplexity": 8.276657491628283, "train/grad_norm": 0.1884765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028410.0830396386, "perf/iters_per_sec": 0.9672212996671861, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0338895559310912, "data/tokens_consumed": 129165688832, "data/tokens_consumed_B": 129.165688832, "train/loss_slope": 1.01806102412763e-06} {"step": 61600, "timestamp": 1778261126.8601632, "grad/layer_0/attn": 0.002777965972200036, "grad/layer_0/mlp": 0.0028887582011520863, "grad/layer_0/attn_mlp_ratio": 0.9616470755245032, "grad/layer_4/attn": 0.003180441912263632, "grad/layer_4/mlp": 0.002630426548421383, "grad/layer_4/attn_mlp_ratio": 1.2090973584731335, "grad/layer_8/attn": 0.011548012495040894, "grad/layer_8/mlp": 0.0038374746218323708, "grad/layer_8/attn_mlp_ratio": 3.0092738928914136, "grad/layer_12/attn": 0.006914450321346521, "grad/layer_12/mlp": 0.006865436676889658, "grad/layer_12/attn_mlp_ratio": 1.0071391735223392, "grad/layer_16/attn": 0.004710506182163954, "grad/layer_16/mlp": 0.0048374999314546585, "grad/layer_16/attn_mlp_ratio": 0.9737480416609906, "grad/layer_20/attn": 0.003965068142861128, "grad/layer_20/mlp": 0.006040187552571297, "grad/layer_20/attn_mlp_ratio": 0.6564478408503097, "grad/layer_24/attn": 0.005394320469349623, "grad/layer_24/mlp": 0.009756434708833694, "grad/layer_24/attn_mlp_ratio": 0.5528987355571201, "grad/layer_27/attn": 0.009938730858266354, "grad/layer_27/mlp": 0.0077566769905388355, "grad/layer_27/attn_mlp_ratio": 1.281312956857398} {"step": 61600, "timestamp": 1778261126.8744557, "train/loss": 2.1378830671310425, "train/z_loss": 0.0013839707942679524, "train/perplexity": 8.48146392128037, "train/grad_norm": 0.1328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026516.6816448995, "perf/iters_per_sec": 0.9663184555267809, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348555326461792, "data/tokens_consumed": 129186660352, "data/tokens_consumed_B": 129.186660352, "train/loss_slope": 1.76758738991882e-06} {"step": 61610, "timestamp": 1778261137.2166224, "train/loss": 2.135381746292114, "train/z_loss": 0.0013895694515667855, "train/perplexity": 8.460275569309879, "train/grad_norm": 0.177734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029112.6657294154, "perf/iters_per_sec": 0.9675563172003819, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0335315704345702, "data/tokens_consumed": 129207631872, "data/tokens_consumed_B": 129.207631872, "train/loss_slope": 2.997789145445937e-06} {"step": 61620, "timestamp": 1778261147.558451, "train/loss": 2.161020016670227, "train/z_loss": 0.0013932182919234037, "train/perplexity": 8.679986875832286, "train/grad_norm": 0.1318359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029286.2456100653, "perf/iters_per_sec": 0.9676390865373923, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0334431648254394, "data/tokens_consumed": 129228603392, "data/tokens_consumed_B": 129.228603392, "train/loss_slope": 3.0788010889941153e-06} {"step": 61630, "timestamp": 1778261157.903493, "train/loss": 2.100398766994476, "train/z_loss": 0.0013803338748402893, "train/perplexity": 8.169426960958605, "train/grad_norm": 0.1142578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028179.410772152, "perf/iters_per_sec": 0.9671113065586815, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340071439743042, "data/tokens_consumed": 129249574912, "data/tokens_consumed_B": 129.249574912, "train/loss_slope": 4.0765993952072486e-07} {"step": 61640, "timestamp": 1778261168.2424574, "train/loss": 2.1096620321273805, "train/z_loss": 0.0013809452066197991, "train/perplexity": 8.245454115080253, "train/grad_norm": 0.18359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029583.010522755, "perf/iters_per_sec": 0.9677805950750137, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0332920551300049, "data/tokens_consumed": 129270546432, "data/tokens_consumed_B": 129.270546432, "train/loss_slope": -1.4947407030322307e-06} {"step": 61650, "timestamp": 1778261178.5827162, "grad/layer_0/attn": 0.0029094370547682047, "grad/layer_0/mlp": 0.003196718404069543, "grad/layer_0/attn_mlp_ratio": 0.9101323907827298, "grad/layer_4/attn": 0.0021731583401560783, "grad/layer_4/mlp": 0.002700280863791704, "grad/layer_4/attn_mlp_ratio": 0.8047897123655435, "grad/layer_8/attn": 0.0033785426057875156, "grad/layer_8/mlp": 0.003915350418537855, "grad/layer_8/attn_mlp_ratio": 0.8628965886428978, "grad/layer_12/attn": 0.004033954814076424, "grad/layer_12/mlp": 0.007174998056143522, "grad/layer_12/attn_mlp_ratio": 0.5622238119493298, "grad/layer_16/attn": 0.0031372392550110817, "grad/layer_16/mlp": 0.004528476390987635, "grad/layer_16/attn_mlp_ratio": 0.6927802896304466, "grad/layer_20/attn": 0.0038161668926477432, "grad/layer_20/mlp": 0.0056698801927268505, "grad/layer_20/attn_mlp_ratio": 0.6730595172428959, "grad/layer_24/attn": 0.007778864819556475, "grad/layer_24/mlp": 0.008949222974479198, "grad/layer_24/attn_mlp_ratio": 0.8692223620774105, "grad/layer_27/attn": 0.0037624521646648645, "grad/layer_27/mlp": 0.007466380018740892, "grad/layer_27/attn_mlp_ratio": 0.5039191823653566} {"step": 61650, "timestamp": 1778261179.1834505, "eos/sharpness": 19.950556755065914, "eos/L0_probe": 1.9751421213150024, "eos/L_plus": 2.0767252445220947, "eos/L_minus": 2.0730645656585693, "eos/grad_norm": 0.09445838630199432, "eos/embed_grad_frac": 0.3098783493041992, "eos/time_s": 0.5977880954742432} {"step": 61650, "timestamp": 1778261179.201536, "train/loss": 2.118768334388733, "train/z_loss": 0.0013759842375293374, "train/perplexity": 8.320882628716326, "train/grad_norm": 0.09423828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1914407.8566258617, "perf/iters_per_sec": 0.9128608019952115, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0954572677612304, "data/tokens_consumed": 129291517952, "data/tokens_consumed_B": 129.291517952, "train/loss_slope": -2.7228493752008566e-06} {"step": 61650, "timestamp": 1778261180.569438, "geo/rankme_last": 438.6852722167969, "geo/layer_0/stable_rank_q_proj": 19.341203689575195, "geo/layer_0/stable_rank_k_proj": 16.166778564453125, "geo/layer_0/stable_rank_o_proj": 47.236839294433594, "geo/layer_0/stable_rank_gate_proj": 131.66558837890625, "geo/layer_0/stable_rank_down_proj": 55.06828689575195, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.060604169964790344, "geo/layer_0/attn_entropy_mean": 6.1657633781433105, "geo/layer_0/attn_entropy_std": 0.4107305109500885, "geo/layer_7/stable_rank_q_proj": 43.63151550292969, "geo/layer_7/stable_rank_k_proj": 41.14268493652344, "geo/layer_7/stable_rank_o_proj": 92.02583312988281, "geo/layer_7/stable_rank_gate_proj": 82.13426971435547, "geo/layer_7/stable_rank_down_proj": 140.68894958496094, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.45454177260398865, "geo/layer_7/attn_entropy_mean": 4.632631301879883, "geo/layer_7/attn_entropy_std": 0.7987761497497559, "geo/layer_14/stable_rank_q_proj": 51.47520446777344, "geo/layer_14/stable_rank_k_proj": 40.046592712402344, "geo/layer_14/stable_rank_o_proj": 44.09726333618164, "geo/layer_14/stable_rank_gate_proj": 71.84033203125, "geo/layer_14/stable_rank_down_proj": 129.3657989501953, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4007332921028137, "geo/layer_14/attn_entropy_mean": 5.560675621032715, "geo/layer_14/attn_entropy_std": 0.4003964066505432, "geo/layer_21/stable_rank_q_proj": 40.78833770751953, "geo/layer_21/stable_rank_k_proj": 30.34778594970703, "geo/layer_21/stable_rank_o_proj": 70.49260711669922, "geo/layer_21/stable_rank_gate_proj": 66.22689056396484, "geo/layer_21/stable_rank_down_proj": 51.6380500793457, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13941048085689545, "geo/layer_21/attn_entropy_mean": 5.702102184295654, "geo/layer_21/attn_entropy_std": 0.30507892370224, "geo/layer_27/stable_rank_q_proj": 43.29822540283203, "geo/layer_27/stable_rank_k_proj": 31.740108489990234, "geo/layer_27/stable_rank_o_proj": 116.01609802246094, "geo/layer_27/stable_rank_gate_proj": 80.74443817138672, "geo/layer_27/stable_rank_down_proj": 128.76063537597656, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0926826000213623, "geo/layer_27/attn_entropy_mean": 4.202508926391602, "geo/layer_27/attn_entropy_std": 0.7143393754959106, "attnres/final_alpha/block_0": 0.23706427216529846, "attnres/block_norm/0": 1.7624443769454956, "attnres/final_alpha/block_1": 0.004473987966775894, "attnres/block_norm/1": 45979.96875, "attnres/final_alpha/block_2": 0.01043416652828455, "attnres/block_norm/2": 28497.9140625, "attnres/final_alpha/block_3": 0.012554539367556572, "attnres/block_norm/3": 57581.47265625, "attnres/final_alpha/block_4": 0.014656522311270237, "attnres/block_norm/4": 15026.8525390625, "attnres/final_alpha/block_5": 0.6095858216285706, "attnres/block_norm/5": 6656.2216796875, "attnres/final_alpha/block_6": 0.11123068630695343, "attnres/block_norm/6": 38083.171875, "geo/tier1_time_s": 1.3640823364257812, "geo/step": 61650.0, "geo/rankme_slope": -3.346801611269508e-05} {"step": 61660, "timestamp": 1778261190.9138107, "train/loss": 2.1857285261154176, "train/z_loss": 0.0013676845119334757, "train/perplexity": 8.897127982955753, "train/grad_norm": 0.154296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1791141.474123981, "perf/iters_per_sec": 0.8540828104610353, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.170846652984619, "data/tokens_consumed": 129312489472, "data/tokens_consumed_B": 129.312489472, "train/loss_slope": -1.4182942642522964e-06} {"step": 61670, "timestamp": 1778261201.2563508, "train/loss": 2.1328879833221435, "train/z_loss": 0.001367214962374419, "train/perplexity": 8.43920393213376, "train/grad_norm": 0.150390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029207.5973245748, "perf/iters_per_sec": 0.9676015841124415, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0334832191467285, "data/tokens_consumed": 129333460992, "data/tokens_consumed_B": 129.333460992, "train/loss_slope": -4.830113123960276e-07} {"step": 61680, "timestamp": 1778261211.6019182, "train/loss": 2.2187522888183593, "train/z_loss": 0.0013466904521919787, "train/perplexity": 9.19584994013698, "train/grad_norm": 0.2080078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027952.1572984422, "perf/iters_per_sec": 0.9670029436580859, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341230154037475, "data/tokens_consumed": 129354432512, "data/tokens_consumed_B": 129.354432512, "train/loss_slope": 2.433234057982975e-06} {"step": 61690, "timestamp": 1778261221.943579, "train/loss": 2.1192906975746153, "train/z_loss": 0.0013929625740274787, "train/perplexity": 8.325230286905047, "train/grad_norm": 0.28125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028845.6597377185, "perf/iters_per_sec": 0.967428998822078, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0336675882339477, "data/tokens_consumed": 129375404032, "data/tokens_consumed_B": 129.375404032, "train/loss_slope": 1.8111517219760882e-06} {"step": 61700, "timestamp": 1778261232.275355, "grad/layer_0/attn": 0.002725400263443589, "grad/layer_0/mlp": 0.003005629638209939, "grad/layer_0/attn_mlp_ratio": 0.906765137700146, "grad/layer_4/attn": 0.0020854983013123274, "grad/layer_4/mlp": 0.002516868757084012, "grad/layer_4/attn_mlp_ratio": 0.8286082508599749, "grad/layer_8/attn": 0.00574844004586339, "grad/layer_8/mlp": 0.0036973287351429462, "grad/layer_8/attn_mlp_ratio": 1.5547548790426005, "grad/layer_12/attn": 0.004607454873621464, "grad/layer_12/mlp": 0.006753222085535526, "grad/layer_12/attn_mlp_ratio": 0.682260222903668, "grad/layer_16/attn": 0.0043411096557974815, "grad/layer_16/mlp": 0.004504116252064705, "grad/layer_16/attn_mlp_ratio": 0.9638093948899654, "grad/layer_20/attn": 0.0047011966817080975, "grad/layer_20/mlp": 0.005639851558953524, "grad/layer_20/attn_mlp_ratio": 0.833567435101725, "grad/layer_24/attn": 0.010643783956766129, "grad/layer_24/mlp": 0.009032812900841236, "grad/layer_24/attn_mlp_ratio": 1.1783465411909735, "grad/layer_27/attn": 0.004581792745739222, "grad/layer_27/mlp": 0.007455811370164156, "grad/layer_27/attn_mlp_ratio": 0.6145263683334986} {"step": 61700, "timestamp": 1778261232.289874, "train/loss": 2.151752758026123, "train/z_loss": 0.0013875901349820197, "train/perplexity": 8.599918771376895, "train/grad_norm": 0.13671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028467.572191212, "perf/iters_per_sec": 0.9672487126308499, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0338602542877198, "data/tokens_consumed": 129396375552, "data/tokens_consumed_B": 129.396375552, "train/loss_slope": 3.5205866530103876e-06} {"step": 61710, "timestamp": 1778261243.1311712, "train/loss": 2.1676709413528443, "train/z_loss": 0.001368246483616531, "train/perplexity": 8.737909219848184, "train/grad_norm": 0.2412109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1935363.337456975, "perf/iters_per_sec": 0.9228531539234995, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0835960149765014, "data/tokens_consumed": 129417347072, "data/tokens_consumed_B": 129.417347072, "train/loss_slope": 3.713140922589869e-06} {"step": 61720, "timestamp": 1778261253.472745, "train/loss": 2.163564586639404, "train/z_loss": 0.0013691047090105713, "train/perplexity": 8.702101834366498, "train/grad_norm": 0.279296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028894.9370629422, "perf/iters_per_sec": 0.9674524960818015, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0336424827575683, "data/tokens_consumed": 129438318592, "data/tokens_consumed_B": 129.438318592, "train/loss_slope": 2.512205365968103e-06} {"step": 61725, "timestamp": 1778261259.2355297, "eos/sharpness": 59.16523933410643, "eos/L0_probe": 1.977026343345642, "eos/L_plus": 2.3015644550323486, "eos/L_minus": 2.244140625, "eos/grad_norm": 0.19141817092895508, "eos/embed_grad_frac": 0.06105121970176697, "eos/time_s": 0.5926902294158936} {"step": 61725, "timestamp": 1778261260.6105585, "geo/rankme_last": 438.67523193359375, "geo/layer_0/stable_rank_q_proj": 19.322938919067383, "geo/layer_0/stable_rank_k_proj": 16.186471939086914, "geo/layer_0/stable_rank_o_proj": 47.203819274902344, "geo/layer_0/stable_rank_gate_proj": 131.7292938232422, "geo/layer_0/stable_rank_down_proj": 55.048667907714844, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06463412195444107, "geo/layer_0/attn_entropy_mean": 6.159011363983154, "geo/layer_0/attn_entropy_std": 0.4131661355495453, "geo/layer_7/stable_rank_q_proj": 43.68613815307617, "geo/layer_7/stable_rank_k_proj": 41.18745803833008, "geo/layer_7/stable_rank_o_proj": 91.94783020019531, "geo/layer_7/stable_rank_gate_proj": 82.0479507446289, "geo/layer_7/stable_rank_down_proj": 140.45030212402344, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4474606215953827, "geo/layer_7/attn_entropy_mean": 4.666428565979004, "geo/layer_7/attn_entropy_std": 0.7780961394309998, "geo/layer_14/stable_rank_q_proj": 51.45584487915039, "geo/layer_14/stable_rank_k_proj": 39.997657775878906, "geo/layer_14/stable_rank_o_proj": 44.1095085144043, "geo/layer_14/stable_rank_gate_proj": 71.88379669189453, "geo/layer_14/stable_rank_down_proj": 129.4241485595703, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3937978148460388, "geo/layer_14/attn_entropy_mean": 5.562880039215088, "geo/layer_14/attn_entropy_std": 0.3922209143638611, "geo/layer_21/stable_rank_q_proj": 40.79296112060547, "geo/layer_21/stable_rank_k_proj": 30.35547637939453, "geo/layer_21/stable_rank_o_proj": 70.44305419921875, "geo/layer_21/stable_rank_gate_proj": 66.26217651367188, "geo/layer_21/stable_rank_down_proj": 51.595794677734375, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14395031332969666, "geo/layer_21/attn_entropy_mean": 5.720170974731445, "geo/layer_21/attn_entropy_std": 0.2961874306201935, "geo/layer_27/stable_rank_q_proj": 43.27387237548828, "geo/layer_27/stable_rank_k_proj": 31.68899154663086, "geo/layer_27/stable_rank_o_proj": 116.06237030029297, "geo/layer_27/stable_rank_gate_proj": 80.69066619873047, "geo/layer_27/stable_rank_down_proj": 128.76426696777344, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09642805904150009, "geo/layer_27/attn_entropy_mean": 4.197296619415283, "geo/layer_27/attn_entropy_std": 0.7261533737182617, "attnres/final_alpha/block_0": 0.23550423979759216, "attnres/block_norm/0": 1.762510061264038, "attnres/final_alpha/block_1": 0.004436135292053223, "attnres/block_norm/1": 46348.90625, "attnres/final_alpha/block_2": 0.010435283184051514, "attnres/block_norm/2": 28515.4921875, "attnres/final_alpha/block_3": 0.012322863563895226, "attnres/block_norm/3": 57719.046875, "attnres/final_alpha/block_4": 0.014301665127277374, "attnres/block_norm/4": 14998.6943359375, "attnres/final_alpha/block_5": 0.615189790725708, "attnres/block_norm/5": 6560.03759765625, "attnres/final_alpha/block_6": 0.10781005769968033, "attnres/block_norm/6": 38075.30859375, "geo/tier1_time_s": 1.3568429946899414, "geo/step": 61725.0, "geo/rankme_slope": -4.234955701030412e-06} {"step": 61730, "timestamp": 1778261265.7835116, "train/loss": 2.1743470430374146, "train/z_loss": 0.0013702331110835076, "train/perplexity": 8.796439550135883, "train/grad_norm": 0.2412109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1704496.0650814518, "perf/iters_per_sec": 0.8127670598418483, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2303648233413695, "data/tokens_consumed": 129459290112, "data/tokens_consumed_B": 129.459290112, "train/loss_slope": 6.147442470611182e-06} {"step": 61740, "timestamp": 1778261276.1253433, "train/loss": 2.1109622716903687, "train/z_loss": 0.001387152716051787, "train/perplexity": 8.256182153734208, "train/grad_norm": 0.11669921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028792.7816998921, "perf/iters_per_sec": 0.9674037846087895, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0336945295333861, "data/tokens_consumed": 129480261632, "data/tokens_consumed_B": 129.480261632, "train/loss_slope": 2.191764157418536e-06} {"step": 61750, "timestamp": 1778261286.4598536, "grad/layer_0/attn": 0.0031324883457273245, "grad/layer_0/mlp": 0.00335719995200634, "grad/layer_0/attn_mlp_ratio": 0.9330657384731295, "grad/layer_4/attn": 0.0020666704513132572, "grad/layer_4/mlp": 0.0025592087768018246, "grad/layer_4/attn_mlp_ratio": 0.8075427019837164, "grad/layer_8/attn": 0.0036538769491016865, "grad/layer_8/mlp": 0.0038129575550556183, "grad/layer_8/attn_mlp_ratio": 0.958278921418651, "grad/layer_12/attn": 0.007173125632107258, "grad/layer_12/mlp": 0.006971087772399187, "grad/layer_12/attn_mlp_ratio": 1.028982242571924, "grad/layer_16/attn": 0.0034237136133015156, "grad/layer_16/mlp": 0.004733428359031677, "grad/layer_16/attn_mlp_ratio": 0.7233052412081683, "grad/layer_20/attn": 0.005060262978076935, "grad/layer_20/mlp": 0.005602252669632435, "grad/layer_20/attn_mlp_ratio": 0.9032550272466446, "grad/layer_24/attn": 0.013949562795460224, "grad/layer_24/mlp": 0.009411042556166649, "grad/layer_24/attn_mlp_ratio": 1.4822547623158078, "grad/layer_27/attn": 0.005822158418595791, "grad/layer_27/mlp": 0.007336015347391367, "grad/layer_27/attn_mlp_ratio": 0.7936404251528816} {"step": 61750, "timestamp": 1778261286.4743278, "train/loss": 2.155227780342102, "train/z_loss": 0.0013935130555182695, "train/perplexity": 8.62985566658656, "train/grad_norm": 0.13671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027608.1943106737, "perf/iters_per_sec": 0.9668389293244714, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342984437942504, "data/tokens_consumed": 129501233152, "data/tokens_consumed_B": 129.501233152, "train/loss_slope": 2.312479969119632e-06} {"step": 61760, "timestamp": 1778261296.815791, "train/loss": 2.1361798644065857, "train/z_loss": 0.0013791650999337435, "train/perplexity": 8.467030563778318, "train/grad_norm": 0.1435546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029295.6557239895, "perf/iters_per_sec": 0.9676435736293743, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0334383726119996, "data/tokens_consumed": 129522204672, "data/tokens_consumed_B": 129.522204672, "train/loss_slope": -2.5376774904930678e-06} {"step": 61770, "timestamp": 1778261307.1617544, "train/loss": 2.134160506725311, "train/z_loss": 0.0013672988628968597, "train/perplexity": 8.449949852408936, "train/grad_norm": 0.111328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028112.632245391, "perf/iters_per_sec": 0.9670794640757517, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340411901473998, "data/tokens_consumed": 129543176192, "data/tokens_consumed_B": 129.543176192, "train/loss_slope": -3.783206313070626e-06} {"step": 61780, "timestamp": 1778261317.5047634, "train/loss": 2.15506352186203, "train/z_loss": 0.001386533200275153, "train/perplexity": 8.628438256025566, "train/grad_norm": 0.169921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028946.744011562, "perf/iters_per_sec": 0.9674771995599566, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0336160898208617, "data/tokens_consumed": 129564147712, "data/tokens_consumed_B": 129.564147712, "train/loss_slope": -4.904164039012112e-06} {"step": 61790, "timestamp": 1778261327.8464534, "train/loss": 2.1749407291412353, "train/z_loss": 0.0013710861094295979, "train/perplexity": 8.801663424577294, "train/grad_norm": 0.1826171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029280.4872251847, "perf/iters_per_sec": 0.96763634072551, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0334460973739623, "data/tokens_consumed": 129585119232, "data/tokens_consumed_B": 129.585119232, "train/loss_slope": -4.089420127181963e-06} {"step": 61800, "timestamp": 1778261338.1799545, "grad/layer_0/attn": 0.0024959761649370193, "grad/layer_0/mlp": 0.0029245796613395214, "grad/layer_0/attn_mlp_ratio": 0.85344780058035, "grad/layer_4/attn": 0.00233096769079566, "grad/layer_4/mlp": 0.0025442843325436115, "grad/layer_4/attn_mlp_ratio": 0.9161584534262581, "grad/layer_8/attn": 0.007155583240091801, "grad/layer_8/mlp": 0.003702379297465086, "grad/layer_8/attn_mlp_ratio": 1.9326985357013993, "grad/layer_12/attn": 0.0047512114979326725, "grad/layer_12/mlp": 0.006574866361916065, "grad/layer_12/attn_mlp_ratio": 0.7226323949624465, "grad/layer_16/attn": 0.0033148073125630617, "grad/layer_16/mlp": 0.004386469256132841, "grad/layer_16/attn_mlp_ratio": 0.7556891530379759, "grad/layer_20/attn": 0.003339771879836917, "grad/layer_20/mlp": 0.005772588774561882, "grad/layer_20/attn_mlp_ratio": 0.5785570308937674, "grad/layer_24/attn": 0.006230069324374199, "grad/layer_24/mlp": 0.008377285674214363, "grad/layer_24/attn_mlp_ratio": 0.743685901649746, "grad/layer_27/attn": 0.006202905438840389, "grad/layer_27/mlp": 0.007106649223715067, "grad/layer_27/attn_mlp_ratio": 0.872831225559581} {"step": 61800, "timestamp": 1778261338.7791715, "eos/sharpness": 35.87141036987304, "eos/L0_probe": 1.973109483718872, "eos/L_plus": 2.151724338531494, "eos/L_minus": 2.1532087326049805, "eos/grad_norm": 0.11572443693876266, "eos/embed_grad_frac": 0.1690259575843811, "eos/time_s": 0.596238374710083} {"step": 61800, "timestamp": 1778261338.8009994, "train/loss": 2.1355615139007567, "train/z_loss": 0.001376182131934911, "train/perplexity": 8.461796589528419, "train/grad_norm": 0.11572265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1915342.667142202, "perf/iters_per_sec": 0.9133065543852815, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0949226140975952, "data/tokens_consumed": 129606090752, "data/tokens_consumed_B": 129.606090752, "train/loss_slope": -7.750887252745841e-06} {"step": 61800, "timestamp": 1778261340.1667545, "geo/rankme_last": 438.477783203125, "geo/layer_0/stable_rank_q_proj": 19.345762252807617, "geo/layer_0/stable_rank_k_proj": 16.193326950073242, "geo/layer_0/stable_rank_o_proj": 47.26817321777344, "geo/layer_0/stable_rank_gate_proj": 131.52281188964844, "geo/layer_0/stable_rank_down_proj": 55.13447570800781, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06762809306383133, "geo/layer_0/attn_entropy_mean": 6.1603288650512695, "geo/layer_0/attn_entropy_std": 0.41732826828956604, "geo/layer_7/stable_rank_q_proj": 43.70897674560547, "geo/layer_7/stable_rank_k_proj": 41.081905364990234, "geo/layer_7/stable_rank_o_proj": 91.90839385986328, "geo/layer_7/stable_rank_gate_proj": 82.08409881591797, "geo/layer_7/stable_rank_down_proj": 140.9362335205078, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.47014477849006653, "geo/layer_7/attn_entropy_mean": 4.673765659332275, "geo/layer_7/attn_entropy_std": 0.8047887682914734, "geo/layer_14/stable_rank_q_proj": 51.46625518798828, "geo/layer_14/stable_rank_k_proj": 40.0311393737793, "geo/layer_14/stable_rank_o_proj": 44.07304000854492, "geo/layer_14/stable_rank_gate_proj": 72.0060043334961, "geo/layer_14/stable_rank_down_proj": 129.49539184570312, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3970361053943634, "geo/layer_14/attn_entropy_mean": 5.5239176750183105, "geo/layer_14/attn_entropy_std": 0.38855698704719543, "geo/layer_21/stable_rank_q_proj": 40.762351989746094, "geo/layer_21/stable_rank_k_proj": 30.381521224975586, "geo/layer_21/stable_rank_o_proj": 70.48617553710938, "geo/layer_21/stable_rank_gate_proj": 66.27465057373047, "geo/layer_21/stable_rank_down_proj": 51.53955078125, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14171189069747925, "geo/layer_21/attn_entropy_mean": 5.686641216278076, "geo/layer_21/attn_entropy_std": 0.3064773380756378, "geo/layer_27/stable_rank_q_proj": 43.22566604614258, "geo/layer_27/stable_rank_k_proj": 31.625993728637695, "geo/layer_27/stable_rank_o_proj": 116.1728515625, "geo/layer_27/stable_rank_gate_proj": 80.594482421875, "geo/layer_27/stable_rank_down_proj": 128.52587890625, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09004497528076172, "geo/layer_27/attn_entropy_mean": 4.186589241027832, "geo/layer_27/attn_entropy_std": 0.7331252098083496, "attnres/final_alpha/block_0": 0.23832353949546814, "attnres/block_norm/0": 1.7626001834869385, "attnres/final_alpha/block_1": 0.004517097491770983, "attnres/block_norm/1": 46405.71875, "attnres/final_alpha/block_2": 0.010604113340377808, "attnres/block_norm/2": 28502.19140625, "attnres/final_alpha/block_3": 0.012533683329820633, "attnres/block_norm/3": 57408.6796875, "attnres/final_alpha/block_4": 0.014724922366440296, "attnres/block_norm/4": 15043.4755859375, "attnres/final_alpha/block_5": 0.6084319353103638, "attnres/block_norm/5": 6683.6396484375, "attnres/final_alpha/block_6": 0.11086472123861313, "attnres/block_norm/6": 38082.046875, "geo/tier1_time_s": 1.3612604141235352, "geo/step": 61800.0, "geo/rankme_slope": -8.547540109793917e-06} {"step": 61810, "timestamp": 1778261350.5069323, "train/loss": 2.139485716819763, "train/z_loss": 0.001394801726564765, "train/perplexity": 8.49506763487564, "train/grad_norm": 0.2412109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1792164.356075494, "perf/iters_per_sec": 0.854570558583972, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1701783895492555, "data/tokens_consumed": 129627062272, "data/tokens_consumed_B": 129.627062272, "train/loss_slope": -5.694689544657157e-06} {"step": 61820, "timestamp": 1778261360.8490362, "train/loss": 2.1624295473098756, "train/z_loss": 0.001381157839205116, "train/perplexity": 8.69223020993573, "train/grad_norm": 0.11962890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028716.7918955511, "perf/iters_per_sec": 0.9673675498464351, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0337332487106323, "data/tokens_consumed": 129648033792, "data/tokens_consumed_B": 129.648033792, "train/loss_slope": -4.552912139835364e-06} {"step": 61830, "timestamp": 1778261371.191672, "train/loss": 2.1514015197753906, "train/z_loss": 0.0013688413659110666, "train/perplexity": 8.596898681367811, "train/grad_norm": 0.130859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028646.1414578261, "perf/iters_per_sec": 0.9673338610924845, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0337692499160767, "data/tokens_consumed": 129669005312, "data/tokens_consumed_B": 129.669005312, "train/loss_slope": -4.63502707511429e-06} {"step": 61840, "timestamp": 1778261381.5339944, "train/loss": 2.1420240879058836, "train/z_loss": 0.0013716752524487673, "train/perplexity": 8.51665866035888, "train/grad_norm": 0.2216796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029083.6918293682, "perf/iters_per_sec": 0.9675425013682214, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0335463285446167, "data/tokens_consumed": 129689976832, "data/tokens_consumed_B": 129.689976832, "train/loss_slope": -6.20411915926473e-06} {"step": 61850, "timestamp": 1778261391.8643608, "grad/layer_0/attn": 0.0026756664738059044, "grad/layer_0/mlp": 0.0029766755178570747, "grad/layer_0/attn_mlp_ratio": 0.8988774113492862, "grad/layer_4/attn": 0.0016656476072967052, "grad/layer_4/mlp": 0.00237670517526567, "grad/layer_4/attn_mlp_ratio": 0.7008221106045712, "grad/layer_8/attn": 0.00489995488896966, "grad/layer_8/mlp": 0.0035229872446507215, "grad/layer_8/attn_mlp_ratio": 1.3908522539570616, "grad/layer_12/attn": 0.00480542192235589, "grad/layer_12/mlp": 0.007079330272972584, "grad/layer_12/attn_mlp_ratio": 0.6787961105335606, "grad/layer_16/attn": 0.006358789280056953, "grad/layer_16/mlp": 0.004250768106430769, "grad/layer_16/attn_mlp_ratio": 1.4959153195973067, "grad/layer_20/attn": 0.004022183828055859, "grad/layer_20/mlp": 0.005996680818498135, "grad/layer_20/attn_mlp_ratio": 0.6707350087026496, "grad/layer_24/attn": 0.005225952249020338, "grad/layer_24/mlp": 0.007757003419101238, "grad/layer_24/attn_mlp_ratio": 0.6737076032196826, "grad/layer_27/attn": 0.005393307656049728, "grad/layer_27/mlp": 0.006759215611964464, "grad/layer_27/attn_mlp_ratio": 0.7979191500728485} {"step": 61850, "timestamp": 1778261391.8789337, "train/loss": 2.1523380041122437, "train/z_loss": 0.0013764559989795089, "train/perplexity": 8.604953313258067, "train/grad_norm": 0.0849609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028170.3851260224, "perf/iters_per_sec": 0.9671070027952301, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340117454528808, "data/tokens_consumed": 129710948352, "data/tokens_consumed_B": 129.710948352, "train/loss_slope": -2.6034316893565424e-06} {"step": 61860, "timestamp": 1778261402.2241821, "train/loss": 2.1196988105773924, "train/z_loss": 0.0013769101817160845, "train/perplexity": 8.32862861504003, "train/grad_norm": 0.1298828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028082.8919639725, "perf/iters_per_sec": 0.967065282804476, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340563535690308, "data/tokens_consumed": 129731919872, "data/tokens_consumed_B": 129.731919872, "train/loss_slope": -7.680100926829308e-06} {"step": 61870, "timestamp": 1778261412.568949, "train/loss": 2.144424080848694, "train/z_loss": 0.0013751279679127038, "train/perplexity": 8.537123128506746, "train/grad_norm": 0.130859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028609.2741767666, "perf/iters_per_sec": 0.9673162814029534, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0337880373001098, "data/tokens_consumed": 129752891392, "data/tokens_consumed_B": 129.752891392, "train/loss_slope": -8.878928442599366e-06} {"step": 61875, "timestamp": 1778261418.3271494, "eos/sharpness": 20.66562175750732, "eos/L0_probe": 1.9754210710525513, "eos/L_plus": 2.0731310844421387, "eos/L_minus": 2.084367275238037, "eos/grad_norm": 0.1004374772310257, "eos/embed_grad_frac": 0.24868424236774445, "eos/time_s": 0.5959546566009521} {"step": 61875, "timestamp": 1778261419.706762, "geo/rankme_last": 438.54095458984375, "geo/layer_0/stable_rank_q_proj": 19.36322784423828, "geo/layer_0/stable_rank_k_proj": 16.175277709960938, "geo/layer_0/stable_rank_o_proj": 47.325042724609375, "geo/layer_0/stable_rank_gate_proj": 131.6226806640625, "geo/layer_0/stable_rank_down_proj": 55.165565490722656, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06235281005501747, "geo/layer_0/attn_entropy_mean": 6.161954879760742, "geo/layer_0/attn_entropy_std": 0.4221264719963074, "geo/layer_7/stable_rank_q_proj": 43.5847282409668, "geo/layer_7/stable_rank_k_proj": 41.156558990478516, "geo/layer_7/stable_rank_o_proj": 92.00984191894531, "geo/layer_7/stable_rank_gate_proj": 81.87052917480469, "geo/layer_7/stable_rank_down_proj": 140.93939208984375, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.45280203223228455, "geo/layer_7/attn_entropy_mean": 4.622122287750244, "geo/layer_7/attn_entropy_std": 0.7948971390724182, "geo/layer_14/stable_rank_q_proj": 51.57020950317383, "geo/layer_14/stable_rank_k_proj": 40.083900451660156, "geo/layer_14/stable_rank_o_proj": 44.06509017944336, "geo/layer_14/stable_rank_gate_proj": 72.01007080078125, "geo/layer_14/stable_rank_down_proj": 129.4495849609375, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4019756317138672, "geo/layer_14/attn_entropy_mean": 5.5316877365112305, "geo/layer_14/attn_entropy_std": 0.38977980613708496, "geo/layer_21/stable_rank_q_proj": 40.79468536376953, "geo/layer_21/stable_rank_k_proj": 30.445831298828125, "geo/layer_21/stable_rank_o_proj": 70.41027069091797, "geo/layer_21/stable_rank_gate_proj": 66.29000091552734, "geo/layer_21/stable_rank_down_proj": 51.52880096435547, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14258305728435516, "geo/layer_21/attn_entropy_mean": 5.691836357116699, "geo/layer_21/attn_entropy_std": 0.29334843158721924, "geo/layer_27/stable_rank_q_proj": 43.161590576171875, "geo/layer_27/stable_rank_k_proj": 31.50497055053711, "geo/layer_27/stable_rank_o_proj": 116.18243408203125, "geo/layer_27/stable_rank_gate_proj": 80.57318878173828, "geo/layer_27/stable_rank_down_proj": 128.6585693359375, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09991631656885147, "geo/layer_27/attn_entropy_mean": 4.185637474060059, "geo/layer_27/attn_entropy_std": 0.7259708046913147, "attnres/final_alpha/block_0": 0.23715579509735107, "attnres/block_norm/0": 1.7626287937164307, "attnres/final_alpha/block_1": 0.004446225706487894, "attnres/block_norm/1": 46226.4921875, "attnres/final_alpha/block_2": 0.01080771442502737, "attnres/block_norm/2": 28400.984375, "attnres/final_alpha/block_3": 0.012812614440917969, "attnres/block_norm/3": 57283.3828125, "attnres/final_alpha/block_4": 0.014648901298642159, "attnres/block_norm/4": 15045.36328125, "attnres/final_alpha/block_5": 0.611289918422699, "attnres/block_norm/5": 6623.40625, "attnres/final_alpha/block_6": 0.10883880406618118, "attnres/block_norm/6": 38177.1875, "geo/tier1_time_s": 1.3603758811950684, "geo/step": 61875.0, "geo/rankme_slope": -4.735468406112445e-06} {"step": 61880, "timestamp": 1778261425.3110473, "train/loss": 2.170194125175476, "train/z_loss": 0.0013765063951723278, "train/perplexity": 8.759984409203936, "train/grad_norm": 0.189453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1646553.9413505786, "perf/iters_per_sec": 0.7851381022217648, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2736612796783446, "data/tokens_consumed": 129773862912, "data/tokens_consumed_B": 129.773862912, "train/loss_slope": -7.955725880453195e-06} {"step": 61890, "timestamp": 1778261435.6622682, "train/loss": 2.140005040168762, "train/z_loss": 0.0013791475445032119, "train/perplexity": 8.499480467594113, "train/grad_norm": 0.1572265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026992.736711061, "perf/iters_per_sec": 0.9665454562716775, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034612488746643, "data/tokens_consumed": 129794834432, "data/tokens_consumed_B": 129.794834432, "train/loss_slope": -7.546837960067848e-06} {"step": 61900, "timestamp": 1778261446.5003514, "grad/layer_0/attn": 0.0029435441829264164, "grad/layer_0/mlp": 0.0029856967739760876, "grad/layer_0/attn_mlp_ratio": 0.9858817914782032, "grad/layer_4/attn": 0.0034139740746468306, "grad/layer_4/mlp": 0.0025954470038414, "grad/layer_4/attn_mlp_ratio": 1.3153703150389644, "grad/layer_8/attn": 0.0038732399698346853, "grad/layer_8/mlp": 0.00377351394854486, "grad/layer_8/attn_mlp_ratio": 1.0264278653814158, "grad/layer_12/attn": 0.0061820806004107, "grad/layer_12/mlp": 0.0066525815054774284, "grad/layer_12/attn_mlp_ratio": 0.9292754252455409, "grad/layer_16/attn": 0.003676998894661665, "grad/layer_16/mlp": 0.005039466544985771, "grad/layer_16/attn_mlp_ratio": 0.7296404865225667, "grad/layer_20/attn": 0.005793050862848759, "grad/layer_20/mlp": 0.006812357809394598, "grad/layer_20/attn_mlp_ratio": 0.8503738264925627, "grad/layer_24/attn": 0.007521019782871008, "grad/layer_24/mlp": 0.010093318298459053, "grad/layer_24/attn_mlp_ratio": 0.7451483730087463, "grad/layer_27/attn": 0.011514605022966862, "grad/layer_27/mlp": 0.007267622277140617, "grad/layer_27/attn_mlp_ratio": 1.5843702968366367} {"step": 61900, "timestamp": 1778261446.5154998, "train/loss": 2.142367219924927, "train/z_loss": 0.0013737541041336954, "train/perplexity": 8.51958150007178, "train/grad_norm": 0.125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1933424.3110650752, "perf/iters_per_sec": 0.9219285540891052, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0846827507019043, "data/tokens_consumed": 129815805952, "data/tokens_consumed_B": 129.815805952, "train/loss_slope": -6.155163334040478e-06} {"step": 61910, "timestamp": 1778261456.8566916, "train/loss": 2.1351680517196656, "train/z_loss": 0.0013762015150859952, "train/perplexity": 8.45846784749635, "train/grad_norm": 0.171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029304.8318395207, "perf/iters_per_sec": 0.9676479491422275, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033433699607849, "data/tokens_consumed": 129836777472, "data/tokens_consumed_B": 129.836777472, "train/loss_slope": -6.903886301468127e-06} {"step": 61920, "timestamp": 1778261467.2060065, "train/loss": 2.1987595081329347, "train/z_loss": 0.001364171819295734, "train/perplexity": 9.013824984653473, "train/grad_norm": 0.095703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027705.696293886, "perf/iters_per_sec": 0.9668854218930655, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03424870967865, "data/tokens_consumed": 129857748992, "data/tokens_consumed_B": 129.857748992, "train/loss_slope": -1.3505875796052666e-06} {"step": 61930, "timestamp": 1778261477.554254, "train/loss": 2.155313181877136, "train/z_loss": 0.0013776822248473763, "train/perplexity": 8.630592700979095, "train/grad_norm": 0.140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027944.95709487, "perf/iters_per_sec": 0.9669995103334761, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341266870498658, "data/tokens_consumed": 129878720512, "data/tokens_consumed_B": 129.878720512, "train/loss_slope": 1.8712438479794714e-07} {"step": 61940, "timestamp": 1778261488.467512, "train/loss": 2.1316224575042724, "train/z_loss": 0.0013842956046573817, "train/perplexity": 8.42853065675263, "train/grad_norm": 0.1884765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1922597.3093032485, "perf/iters_per_sec": 0.9167658373371356, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0907910823822022, "data/tokens_consumed": 129899692032, "data/tokens_consumed_B": 129.899692032, "train/loss_slope": -2.63583584301042e-06} {"step": 61950, "timestamp": 1778261498.8097186, "grad/layer_0/attn": 0.0028322478756308556, "grad/layer_0/mlp": 0.0028187257703393698, "grad/layer_0/attn_mlp_ratio": 1.0047972048058216, "grad/layer_4/attn": 0.004548271652311087, "grad/layer_4/mlp": 0.002513070357963443, "grad/layer_4/attn_mlp_ratio": 1.8098464521352666, "grad/layer_8/attn": 0.005268689710646868, "grad/layer_8/mlp": 0.0036310404539108276, "grad/layer_8/attn_mlp_ratio": 1.4510137335073834, "grad/layer_12/attn": 0.005330367479473352, "grad/layer_12/mlp": 0.0070463125593960285, "grad/layer_12/attn_mlp_ratio": 0.7564761510214113, "grad/layer_16/attn": 0.003758376697078347, "grad/layer_16/mlp": 0.004750282038003206, "grad/layer_16/attn_mlp_ratio": 0.7911902046850192, "grad/layer_20/attn": 0.004572732839733362, "grad/layer_20/mlp": 0.006334763951599598, "grad/layer_20/attn_mlp_ratio": 0.7218473809736761, "grad/layer_24/attn": 0.013257602229714394, "grad/layer_24/mlp": 0.011501753702759743, "grad/layer_24/attn_mlp_ratio": 1.1526591906821506, "grad/layer_27/attn": 0.0046039726585149765, "grad/layer_27/mlp": 0.011093299835920334, "grad/layer_27/attn_mlp_ratio": 0.41502282324551765} {"step": 61950, "timestamp": 1778261499.3942864, "eos/sharpness": 63.91243934631346, "eos/L0_probe": 1.9729804992675781, "eos/L_plus": 2.320056200027466, "eos/L_minus": 2.265029191970825, "eos/grad_norm": 0.1897020936012268, "eos/embed_grad_frac": 0.05862048268318176, "eos/time_s": 0.5818691253662109} {"step": 61950, "timestamp": 1778261499.414607, "train/loss": 2.110034692287445, "train/z_loss": 0.0013839100720360875, "train/perplexity": 8.248527439947885, "train/grad_norm": 0.1904296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1917030.6786410802, "perf/iters_per_sec": 0.9141114609914208, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0939584970474243, "data/tokens_consumed": 129920663552, "data/tokens_consumed_B": 129.920663552, "train/loss_slope": -5.239893615406782e-06} {"step": 61950, "timestamp": 1778261500.7762585, "geo/rankme_last": 438.84344482421875, "geo/layer_0/stable_rank_q_proj": 19.359586715698242, "geo/layer_0/stable_rank_k_proj": 16.180187225341797, "geo/layer_0/stable_rank_o_proj": 47.28489685058594, "geo/layer_0/stable_rank_gate_proj": 131.6085968017578, "geo/layer_0/stable_rank_down_proj": 55.17521286010742, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.060912035405635834, "geo/layer_0/attn_entropy_mean": 6.160090446472168, "geo/layer_0/attn_entropy_std": 0.42050281167030334, "geo/layer_7/stable_rank_q_proj": 43.51427459716797, "geo/layer_7/stable_rank_k_proj": 41.19587707519531, "geo/layer_7/stable_rank_o_proj": 92.0151596069336, "geo/layer_7/stable_rank_gate_proj": 81.71504211425781, "geo/layer_7/stable_rank_down_proj": 141.0438690185547, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4558753967285156, "geo/layer_7/attn_entropy_mean": 4.638578414916992, "geo/layer_7/attn_entropy_std": 0.8007650375366211, "geo/layer_14/stable_rank_q_proj": 51.56465148925781, "geo/layer_14/stable_rank_k_proj": 40.07370376586914, "geo/layer_14/stable_rank_o_proj": 44.09455490112305, "geo/layer_14/stable_rank_gate_proj": 71.94139099121094, "geo/layer_14/stable_rank_down_proj": 129.14968872070312, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3927276134490967, "geo/layer_14/attn_entropy_mean": 5.526090621948242, "geo/layer_14/attn_entropy_std": 0.3966123163700104, "geo/layer_21/stable_rank_q_proj": 40.765899658203125, "geo/layer_21/stable_rank_k_proj": 30.337318420410156, "geo/layer_21/stable_rank_o_proj": 70.44471740722656, "geo/layer_21/stable_rank_gate_proj": 66.24002838134766, "geo/layer_21/stable_rank_down_proj": 51.50875473022461, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1463048756122589, "geo/layer_21/attn_entropy_mean": 5.7057576179504395, "geo/layer_21/attn_entropy_std": 0.3048129975795746, "geo/layer_27/stable_rank_q_proj": 43.25901794433594, "geo/layer_27/stable_rank_k_proj": 31.484277725219727, "geo/layer_27/stable_rank_o_proj": 116.3731689453125, "geo/layer_27/stable_rank_gate_proj": 80.59695434570312, "geo/layer_27/stable_rank_down_proj": 128.69154357910156, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09421154111623764, "geo/layer_27/attn_entropy_mean": 4.202358245849609, "geo/layer_27/attn_entropy_std": 0.7430074214935303, "attnres/final_alpha/block_0": 0.23401743173599243, "attnres/block_norm/0": 1.7626668214797974, "attnres/final_alpha/block_1": 0.004353705793619156, "attnres/block_norm/1": 46624.921875, "attnres/final_alpha/block_2": 0.010236551985144615, "attnres/block_norm/2": 28585.23046875, "attnres/final_alpha/block_3": 0.012261755764484406, "attnres/block_norm/3": 57737.6171875, "attnres/final_alpha/block_4": 0.014411062002182007, "attnres/block_norm/4": 15001.8837890625, "attnres/final_alpha/block_5": 0.617260754108429, "attnres/block_norm/5": 6591.986328125, "attnres/final_alpha/block_6": 0.10745875537395477, "attnres/block_norm/6": 38254.02734375, "geo/tier1_time_s": 1.357299566268921, "geo/step": 61950.0, "geo/rankme_slope": -5.108879489295719e-06} {"step": 61960, "timestamp": 1778261511.1172428, "train/loss": 2.1965182542800905, "train/z_loss": 0.001368014106992632, "train/perplexity": 8.993645336981533, "train/grad_norm": 0.2431640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1792677.495757925, "perf/iters_per_sec": 0.8548152426519037, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1698434352874756, "data/tokens_consumed": 129941635072, "data/tokens_consumed_B": 129.941635072, "train/loss_slope": -4.495330447732793e-07} {"step": 61970, "timestamp": 1778261521.8900552, "train/loss": 2.1390775203704835, "train/z_loss": 0.001370524475350976, "train/perplexity": 8.491600686076929, "train/grad_norm": 0.2578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1947480.5350776813, "perf/iters_per_sec": 0.9286310840023428, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0768538951873778, "data/tokens_consumed": 129962606592, "data/tokens_consumed_B": 129.962606592, "train/loss_slope": -4.495690006985153e-06} {"step": 61980, "timestamp": 1778261532.2327075, "train/loss": 2.1562517881393433, "train/z_loss": 0.0013688599690794944, "train/perplexity": 8.638697232221686, "train/grad_norm": 0.1171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029087.951268448, "perf/iters_per_sec": 0.9675445324270477, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033544158935547, "data/tokens_consumed": 129983578112, "data/tokens_consumed_B": 129.983578112, "train/loss_slope": -4.074746761957223e-06} {"step": 61990, "timestamp": 1778261542.5761528, "train/loss": 2.1957074642181396, "train/z_loss": 0.0013870275695808231, "train/perplexity": 8.986356334046471, "train/grad_norm": 0.12255859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028851.8368210525, "perf/iters_per_sec": 0.967431944284941, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0336644411087037, "data/tokens_consumed": 130004549632, "data/tokens_consumed_B": 130.004549632, "train/loss_slope": -1.6683353878448088e-06} {"step": 62000, "timestamp": 1778261552.906759, "grad/layer_0/attn": 0.002949394518509507, "grad/layer_0/mlp": 0.0030468078330159187, "grad/layer_0/attn_mlp_ratio": 0.9680277140377582, "grad/layer_4/attn": 0.0020176731050014496, "grad/layer_4/mlp": 0.002623296109959483, "grad/layer_4/attn_mlp_ratio": 0.7691365913392649, "grad/layer_8/attn": 0.003479515668004751, "grad/layer_8/mlp": 0.003853993955999613, "grad/layer_8/attn_mlp_ratio": 0.9028336882326268, "grad/layer_12/attn": 0.005417590029537678, "grad/layer_12/mlp": 0.006898753345012665, "grad/layer_12/attn_mlp_ratio": 0.7852998476781674, "grad/layer_16/attn": 0.003652414074167609, "grad/layer_16/mlp": 0.004464880097657442, "grad/layer_16/attn_mlp_ratio": 0.818031819999089, "grad/layer_20/attn": 0.0036981028970330954, "grad/layer_20/mlp": 0.005847146734595299, "grad/layer_20/attn_mlp_ratio": 0.632462806501259, "grad/layer_24/attn": 0.004536610562354326, "grad/layer_24/mlp": 0.00775881065055728, "grad/layer_24/attn_mlp_ratio": 0.5847043713533653, "grad/layer_27/attn": 0.005215538665652275, "grad/layer_27/mlp": 0.007008648943156004, "grad/layer_27/attn_mlp_ratio": 0.744157487917773} {"step": 62000, "timestamp": 1778261552.9214623, "train/loss": 2.1689062833786013, "train/z_loss": 0.0013723294599913062, "train/perplexity": 8.748710196401253, "train/grad_norm": 0.09912109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028184.5081932854, "perf/iters_per_sec": 0.9671137371984889, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034004545211792, "data/tokens_consumed": 130025521152, "data/tokens_consumed_B": 130.025521152, "train/loss_slope": 2.9041495725195375e-06} {"step": 62000, "timestamp": 1778261560.0789344, "geo/ww_alpha_mean": 7.4687473765531704, "geo/ww_alpha_std": 4.372591857549489, "geo/ww_alpha_min": 1.3526252433152686, "geo/ww_alpha_max": 31.167496317269624, "geo/ww_alpha_healthy_frac": 0.17258883248730963, "geo/ww_alpha_by_type/q_proj": 3.9379536222725173, "geo/ww_alpha_by_type/k_proj": 4.467570499425089, "geo/ww_alpha_by_type/v_proj": 8.171748548288935, "geo/ww_alpha_by_type/o_proj": 7.923723656223251, "geo/ww_alpha_by_type/gate_proj": 8.084528223904234, "geo/ww_alpha_by_type/up_proj": 11.662356009943796, "geo/ww_alpha_by_type/down_proj": 8.12488207127635, "geo/twonn_id/layer_0": 0.7056179642677307, "geo/twonn_id/layer_7": 2.9570538997650146, "geo/twonn_id/layer_14": 4.76776647567749, "geo/twonn_id/layer_21": 6.8699727058410645, "geo/twonn_id/layer_27": 6.473241329193115, "geo/tier2_time_s": 7.151998043060303} {"step": 62000, "timestamp": 1778261560.674123, "eoc/jacobian_sigma/layer_0/attn": 1255.451416015625, "eoc/jacobian_sigma/layer_0/mlp": 9391.0107421875, "eoc/jacobian_sigma/layer_0": 9391.0107421875, "eoc/jacobian_sigma/layer_7/attn": 1.1456055641174316, "eoc/jacobian_sigma/layer_7/mlp": 1.779053807258606, "eoc/jacobian_sigma/layer_7": 1.779053807258606, "eoc/jacobian_sigma/layer_14/attn": 1.4867844581604004, "eoc/jacobian_sigma/layer_14/mlp": 8.758262634277344, "eoc/jacobian_sigma/layer_14": 8.758262634277344, "eoc/jacobian_sigma/layer_21/attn": 1.0819599628448486, "eoc/jacobian_sigma/layer_21/mlp": 4.050122261047363, "eoc/jacobian_sigma/layer_21": 4.050122261047363, "eoc/jacobian_sigma/layer_27/attn": 2.927706003189087, "eoc/jacobian_sigma/layer_27/mlp": 24.889841079711914, "eoc/jacobian_sigma/layer_27": 24.889841079711914, "eoc/layer0_sigma": 9391.0107421875, "eoc/sigma_max": 24.889841079711914, "eoc/sigma_min": 1.779053807258606, "eoc/sigma_mean": 9.869319945573807, "eoc/time_s": 0.5887272357940674} {"step": 62010, "timestamp": 1778261571.0333333, "train/loss": 2.120963621139526, "train/z_loss": 0.0013835969846695661, "train/perplexity": 8.339169417134116, "train/grad_norm": 0.24609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1158172.4240597954, "perf/iters_per_sec": 0.5522596473978975, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.810742473602295, "data/tokens_consumed": 130046492672, "data/tokens_consumed_B": 130.046492672, "train/loss_slope": 3.95228688937937e-07} {"step": 62020, "timestamp": 1778261581.3718863, "train/loss": 2.133340859413147, "train/z_loss": 0.001387394848279655, "train/perplexity": 8.443026711379048, "train/grad_norm": 0.298828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029476.477946808, "perf/iters_per_sec": 0.9677297963842431, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0333462953567505, "data/tokens_consumed": 130067464192, "data/tokens_consumed_B": 130.067464192, "train/loss_slope": 1.8393450158156654e-06} {"step": 62025, "timestamp": 1778261587.1341648, "eos/sharpness": 28.564763069152825, "eos/L0_probe": 1.974746823310852, "eos/L_plus": 2.0979580879211426, "eos/L_minus": 2.13718318939209, "eos/grad_norm": 0.09874896705150604, "eos/embed_grad_frac": 0.2330077886581421, "eos/time_s": 0.5992374420166016} {"step": 62025, "timestamp": 1778261588.5124295, "geo/rankme_last": 438.2165222167969, "geo/layer_0/stable_rank_q_proj": 19.35909652709961, "geo/layer_0/stable_rank_k_proj": 16.186954498291016, "geo/layer_0/stable_rank_o_proj": 47.26551818847656, "geo/layer_0/stable_rank_gate_proj": 131.36585998535156, "geo/layer_0/stable_rank_down_proj": 55.14918518066406, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06547302007675171, "geo/layer_0/attn_entropy_mean": 6.16077995300293, "geo/layer_0/attn_entropy_std": 0.41497090458869934, "geo/layer_7/stable_rank_q_proj": 43.43983459472656, "geo/layer_7/stable_rank_k_proj": 41.228431701660156, "geo/layer_7/stable_rank_o_proj": 92.16561126708984, "geo/layer_7/stable_rank_gate_proj": 81.6907730102539, "geo/layer_7/stable_rank_down_proj": 141.42970275878906, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.46518296003341675, "geo/layer_7/attn_entropy_mean": 4.665982723236084, "geo/layer_7/attn_entropy_std": 0.784359872341156, "geo/layer_14/stable_rank_q_proj": 51.52778625488281, "geo/layer_14/stable_rank_k_proj": 39.983436584472656, "geo/layer_14/stable_rank_o_proj": 44.119075775146484, "geo/layer_14/stable_rank_gate_proj": 71.83950805664062, "geo/layer_14/stable_rank_down_proj": 129.1593017578125, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4007098972797394, "geo/layer_14/attn_entropy_mean": 5.563296318054199, "geo/layer_14/attn_entropy_std": 0.39874815940856934, "geo/layer_21/stable_rank_q_proj": 40.72144317626953, "geo/layer_21/stable_rank_k_proj": 30.34976577758789, "geo/layer_21/stable_rank_o_proj": 70.42510986328125, "geo/layer_21/stable_rank_gate_proj": 66.27806854248047, "geo/layer_21/stable_rank_down_proj": 51.48664474487305, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14846819639205933, "geo/layer_21/attn_entropy_mean": 5.717278003692627, "geo/layer_21/attn_entropy_std": 0.3002467453479767, "geo/layer_27/stable_rank_q_proj": 43.30214309692383, "geo/layer_27/stable_rank_k_proj": 31.542898178100586, "geo/layer_27/stable_rank_o_proj": 116.64348602294922, "geo/layer_27/stable_rank_gate_proj": 80.53993225097656, "geo/layer_27/stable_rank_down_proj": 128.50765991210938, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0937461256980896, "geo/layer_27/attn_entropy_mean": 4.206765174865723, "geo/layer_27/attn_entropy_std": 0.7275338768959045, "attnres/final_alpha/block_0": 0.23666170239448547, "attnres/block_norm/0": 1.7625625133514404, "attnres/final_alpha/block_1": 0.004489604849368334, "attnres/block_norm/1": 46429.44921875, "attnres/final_alpha/block_2": 0.010346478782594204, "attnres/block_norm/2": 28592.45703125, "attnres/final_alpha/block_3": 0.012416239827871323, "attnres/block_norm/3": 57766.58984375, "attnres/final_alpha/block_4": 0.014476187527179718, "attnres/block_norm/4": 15014.27734375, "attnres/final_alpha/block_5": 0.611190915107727, "attnres/block_norm/5": 6627.451171875, "attnres/final_alpha/block_6": 0.11041886359453201, "attnres/block_norm/6": 38300.30859375, "geo/tier1_time_s": 1.358006477355957, "geo/step": 62025.0, "geo/rankme_slope": -3.3941330438425373e-05} {"step": 62030, "timestamp": 1778261593.6837711, "train/loss": 2.1598331451416017, "train/z_loss": 0.0013911114539951086, "train/perplexity": 8.669690957715181, "train/grad_norm": 0.09619140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1704058.3052625824, "perf/iters_per_sec": 0.8125583196938431, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2306808948516845, "data/tokens_consumed": 130088435712, "data/tokens_consumed_B": 130.088435712, "train/loss_slope": 4.684268194313633e-07} {"step": 62040, "timestamp": 1778261604.020908, "train/loss": 2.1774828910827635, "train/z_loss": 0.0013910253648646176, "train/perplexity": 8.824067143232355, "train/grad_norm": 0.158203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029733.5800265805, "perf/iters_per_sec": 0.9678523922093298, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0332154035568237, "data/tokens_consumed": 130109407232, "data/tokens_consumed_B": 130.109407232, "train/loss_slope": 2.1508186027304784e-06} {"step": 62050, "timestamp": 1778261614.3483071, "grad/layer_0/attn": 0.004077279008924961, "grad/layer_0/mlp": 0.00409386120736599, "grad/layer_0/attn_mlp_ratio": 0.9959494723450469, "grad/layer_4/attn": 0.003712528385221958, "grad/layer_4/mlp": 0.003390381345525384, "grad/layer_4/attn_mlp_ratio": 1.0950179042897201, "grad/layer_8/attn": 0.010542269796133041, "grad/layer_8/mlp": 0.004206523764878511, "grad/layer_8/attn_mlp_ratio": 2.506171398230618, "grad/layer_12/attn": 0.010579067282378674, "grad/layer_12/mlp": 0.007803361862897873, "grad/layer_12/attn_mlp_ratio": 1.3557063394826823, "grad/layer_16/attn": 0.006547549273818731, "grad/layer_16/mlp": 0.005796885117888451, "grad/layer_16/attn_mlp_ratio": 1.129494379777234, "grad/layer_20/attn": 0.010726609267294407, "grad/layer_20/mlp": 0.008814387023448944, "grad/layer_20/attn_mlp_ratio": 1.2169432902213215, "grad/layer_24/attn": 0.009824052453041077, "grad/layer_24/mlp": 0.009507556445896626, "grad/layer_24/attn_mlp_ratio": 1.033288879810139, "grad/layer_27/attn": 0.0071067106910049915, "grad/layer_27/mlp": 0.007688538637012243, "grad/layer_27/attn_mlp_ratio": 0.9243252761143856} {"step": 62050, "timestamp": 1778261614.362726, "train/loss": 2.1676204204559326, "train/z_loss": 0.001383983576670289, "train/perplexity": 8.737467783988226, "train/grad_norm": 0.130859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028713.8909197724, "perf/iters_per_sec": 0.9673661665533888, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0337347269058228, "data/tokens_consumed": 130130378752, "data/tokens_consumed_B": 130.130378752, "train/loss_slope": 2.41719555504288e-06} {"step": 62060, "timestamp": 1778261624.7051654, "train/loss": 2.147104525566101, "train/z_loss": 0.0013818284031003713, "train/perplexity": 8.56003711121102, "train/grad_norm": 0.10888671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029104.4743167479, "perf/iters_per_sec": 0.9675524112304439, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0335357427597045, "data/tokens_consumed": 130151350272, "data/tokens_consumed_B": 130.151350272, "train/loss_slope": 5.958218020860616e-06} {"step": 62070, "timestamp": 1778261635.0452166, "train/loss": 2.10631685256958, "train/z_loss": 0.0013978622970171272, "train/perplexity": 8.217917673378889, "train/grad_norm": 0.220703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029496.4724392632, "perf/iters_per_sec": 0.9677393305012051, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0333361148834228, "data/tokens_consumed": 130172321792, "data/tokens_consumed_B": 130.172321792, "train/loss_slope": 1.02740821271838e-06} {"step": 62080, "timestamp": 1778261645.3850346, "train/loss": 2.1399842262268067, "train/z_loss": 0.0013781825313344599, "train/perplexity": 8.49930356174207, "train/grad_norm": 0.1572265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029214.4319932216, "perf/iters_per_sec": 0.9676048431364163, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0334797382354737, "data/tokens_consumed": 130193293312, "data/tokens_consumed_B": 130.193293312, "train/loss_slope": 1.348601459133473e-06} {"step": 62090, "timestamp": 1778261655.734192, "train/loss": 2.136855959892273, "train/z_loss": 0.0013796578743495048, "train/perplexity": 8.472757020517307, "train/grad_norm": 0.1572265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027806.0125306298, "perf/iters_per_sec": 0.966933256402316, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341975450515748, "data/tokens_consumed": 130214264832, "data/tokens_consumed_B": 130.214264832, "train/loss_slope": 1.0693937244981597e-06} {"step": 62100, "timestamp": 1778261666.062119, "grad/layer_0/attn": 0.002387963468208909, "grad/layer_0/mlp": 0.002815471962094307, "grad/layer_0/attn_mlp_ratio": 0.8481574015096449, "grad/layer_4/attn": 0.0018290795851498842, "grad/layer_4/mlp": 0.0023847301490604877, "grad/layer_4/attn_mlp_ratio": 0.7669964289967331, "grad/layer_8/attn": 0.00847641658037901, "grad/layer_8/mlp": 0.003664689138531685, "grad/layer_8/attn_mlp_ratio": 2.312997372670873, "grad/layer_12/attn": 0.004810861311852932, "grad/layer_12/mlp": 0.006894116755574942, "grad/layer_12/attn_mlp_ratio": 0.6978212601607731, "grad/layer_16/attn": 0.003841166151687503, "grad/layer_16/mlp": 0.004500920884311199, "grad/layer_16/attn_mlp_ratio": 0.853417814948675, "grad/layer_20/attn": 0.0029483684338629246, "grad/layer_20/mlp": 0.006007739342749119, "grad/layer_20/attn_mlp_ratio": 0.49076170196118934, "grad/layer_24/attn": 0.009669201448559761, "grad/layer_24/mlp": 0.00814002938568592, "grad/layer_24/attn_mlp_ratio": 1.1878582830151732, "grad/layer_27/attn": 0.004968400113284588, "grad/layer_27/mlp": 0.006570978090167046, "grad/layer_27/attn_mlp_ratio": 0.7561127079556298} {"step": 62100, "timestamp": 1778261666.6475387, "eos/sharpness": 40.19997119903564, "eos/L0_probe": 1.977168083190918, "eos/L_plus": 2.152540683746338, "eos/L_minus": 2.2037951946258545, "eos/grad_norm": 0.11073480546474457, "eos/embed_grad_frac": 0.16411547362804413, "eos/time_s": 0.582453727722168} {"step": 62100, "timestamp": 1778261666.6658242, "train/loss": 2.1063916444778443, "train/z_loss": 0.0013862274470739066, "train/perplexity": 8.218532330109028, "train/grad_norm": 0.11083984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1919285.477867355, "perf/iters_per_sec": 0.9151866330467964, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0926733016967773, "data/tokens_consumed": 130235236352, "data/tokens_consumed_B": 130.235236352, "train/loss_slope": -1.8680166776138109e-06} {"step": 62100, "timestamp": 1778261668.0283084, "geo/rankme_last": 439.3524169921875, "geo/layer_0/stable_rank_q_proj": 19.315444946289062, "geo/layer_0/stable_rank_k_proj": 16.142772674560547, "geo/layer_0/stable_rank_o_proj": 47.320945739746094, "geo/layer_0/stable_rank_gate_proj": 131.32115173339844, "geo/layer_0/stable_rank_down_proj": 55.15583038330078, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06412722915410995, "geo/layer_0/attn_entropy_mean": 6.159372329711914, "geo/layer_0/attn_entropy_std": 0.4137588441371918, "geo/layer_7/stable_rank_q_proj": 43.392059326171875, "geo/layer_7/stable_rank_k_proj": 41.2253532409668, "geo/layer_7/stable_rank_o_proj": 92.00391387939453, "geo/layer_7/stable_rank_gate_proj": 81.70695495605469, "geo/layer_7/stable_rank_down_proj": 141.35208129882812, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4372749328613281, "geo/layer_7/attn_entropy_mean": 4.633546829223633, "geo/layer_7/attn_entropy_std": 0.7964547872543335, "geo/layer_14/stable_rank_q_proj": 51.45979690551758, "geo/layer_14/stable_rank_k_proj": 39.997886657714844, "geo/layer_14/stable_rank_o_proj": 44.206302642822266, "geo/layer_14/stable_rank_gate_proj": 71.8923110961914, "geo/layer_14/stable_rank_down_proj": 129.48403930664062, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.40838927030563354, "geo/layer_14/attn_entropy_mean": 5.549384593963623, "geo/layer_14/attn_entropy_std": 0.3994426429271698, "geo/layer_21/stable_rank_q_proj": 40.705291748046875, "geo/layer_21/stable_rank_k_proj": 30.385589599609375, "geo/layer_21/stable_rank_o_proj": 70.36837005615234, "geo/layer_21/stable_rank_gate_proj": 66.25718688964844, "geo/layer_21/stable_rank_down_proj": 51.48687744140625, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14343015849590302, "geo/layer_21/attn_entropy_mean": 5.690807819366455, "geo/layer_21/attn_entropy_std": 0.29732462763786316, "geo/layer_27/stable_rank_q_proj": 43.34049987792969, "geo/layer_27/stable_rank_k_proj": 31.609159469604492, "geo/layer_27/stable_rank_o_proj": 116.54048156738281, "geo/layer_27/stable_rank_gate_proj": 80.505126953125, "geo/layer_27/stable_rank_down_proj": 128.50010681152344, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09073420614004135, "geo/layer_27/attn_entropy_mean": 4.210862159729004, "geo/layer_27/attn_entropy_std": 0.7331404685974121, "attnres/final_alpha/block_0": 0.23694336414337158, "attnres/block_norm/0": 1.7627360820770264, "attnres/final_alpha/block_1": 0.0045337434858083725, "attnres/block_norm/1": 46241.7421875, "attnres/final_alpha/block_2": 0.010268721729516983, "attnres/block_norm/2": 28381.154296875, "attnres/final_alpha/block_3": 0.012414075434207916, "attnres/block_norm/3": 57744.3359375, "attnres/final_alpha/block_4": 0.014482654631137848, "attnres/block_norm/4": 15037.2099609375, "attnres/final_alpha/block_5": 0.6108322143554688, "attnres/block_norm/5": 6644.95556640625, "attnres/final_alpha/block_6": 0.11052525043487549, "attnres/block_norm/6": 38248.3203125, "geo/tier1_time_s": 1.3585114479064941, "geo/step": 62100.0, "geo/rankme_slope": 1.1070639193177265e-06} {"step": 62110, "timestamp": 1778261678.3769262, "train/loss": 2.1373560190200807, "train/z_loss": 0.0013791316887363791, "train/perplexity": 8.476994959524845, "train/grad_norm": 0.14453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1791312.1469551204, "perf/iters_per_sec": 0.8541641936088182, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1707350969314576, "data/tokens_consumed": 130256207872, "data/tokens_consumed_B": 130.256207872, "train/loss_slope": -1.7655643442533424e-06} {"step": 62120, "timestamp": 1778261688.7178617, "train/loss": 2.1729456186294556, "train/z_loss": 0.0013879906269721686, "train/perplexity": 8.784120639074692, "train/grad_norm": 0.2099609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029391.0726535777, "perf/iters_per_sec": 0.9676890719669236, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0333897829055787, "data/tokens_consumed": 130277179392, "data/tokens_consumed_B": 130.277179392, "train/loss_slope": -1.7173004980170382e-06} {"step": 62130, "timestamp": 1778261699.0559957, "train/loss": 2.149647259712219, "train/z_loss": 0.0013673917506821453, "train/perplexity": 8.581830705782076, "train/grad_norm": 0.236328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029512.2060873555, "perf/iters_per_sec": 0.9677468328892496, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033328104019165, "data/tokens_consumed": 130298150912, "data/tokens_consumed_B": 130.298150912, "train/loss_slope": -3.957193290511306e-06} {"step": 62140, "timestamp": 1778261709.3968852, "train/loss": 2.161257266998291, "train/z_loss": 0.0013803380425088108, "train/perplexity": 8.682046449873816, "train/grad_norm": 0.208984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029294.2980404736, "perf/iters_per_sec": 0.9676429262354248, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0334390640258788, "data/tokens_consumed": 130319122432, "data/tokens_consumed_B": 130.319122432, "train/loss_slope": 8.084660852559731e-07} {"step": 62150, "timestamp": 1778261719.7383304, "grad/layer_0/attn": 0.0029209500644356012, "grad/layer_0/mlp": 0.0033240600023418665, "grad/layer_0/attn_mlp_ratio": 0.8787296181491224, "grad/layer_4/attn": 0.0033584258053451777, "grad/layer_4/mlp": 0.0025029501412063837, "grad/layer_4/attn_mlp_ratio": 1.3417868841557425, "grad/layer_8/attn": 0.007981620728969574, "grad/layer_8/mlp": 0.0036272916477173567, "grad/layer_8/attn_mlp_ratio": 2.200435279018398, "grad/layer_12/attn": 0.005026921629905701, "grad/layer_12/mlp": 0.006396425887942314, "grad/layer_12/attn_mlp_ratio": 0.7858953796044511, "grad/layer_16/attn": 0.003585578640922904, "grad/layer_16/mlp": 0.0046895272098481655, "grad/layer_16/attn_mlp_ratio": 0.7645927625569144, "grad/layer_20/attn": 0.0038318904116749763, "grad/layer_20/mlp": 0.00597884738817811, "grad/layer_20/attn_mlp_ratio": 0.640907870497068, "grad/layer_24/attn": 0.005848824977874756, "grad/layer_24/mlp": 0.009503362700343132, "grad/layer_24/attn_mlp_ratio": 0.6154479315115252, "grad/layer_27/attn": 0.006384870503097773, "grad/layer_27/mlp": 0.009003287181258202, "grad/layer_27/attn_mlp_ratio": 0.7091710287184675} {"step": 62150, "timestamp": 1778261719.7524679, "train/loss": 2.1560290336608885, "train/z_loss": 0.0013760515139438212, "train/perplexity": 8.63677313803345, "train/grad_norm": 0.1015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026570.6083757766, "perf/iters_per_sec": 0.9663441697958834, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034827995300293, "data/tokens_consumed": 130340093952, "data/tokens_consumed_B": 130.340093952, "train/loss_slope": -6.464062958839741e-07} {"step": 62160, "timestamp": 1778261730.099749, "train/loss": 2.1469858407974245, "train/z_loss": 0.0013810677221044898, "train/perplexity": 8.559021225472888, "train/grad_norm": 0.263671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027775.7670282722, "perf/iters_per_sec": 0.9669188342229234, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342129707336425, "data/tokens_consumed": 130361065472, "data/tokens_consumed_B": 130.361065472, "train/loss_slope": -1.3566764882474252e-06} {"step": 62170, "timestamp": 1778261740.4408872, "train/loss": 2.1170897483825684, "train/z_loss": 0.0013928181491792203, "train/perplexity": 8.306927027691954, "train/grad_norm": 0.1298828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029236.3875074275, "perf/iters_per_sec": 0.9676153123414171, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0334685564041137, "data/tokens_consumed": 130382036992, "data/tokens_consumed_B": 130.382036992, "train/loss_slope": -3.6943511708234175e-06} {"step": 62175, "timestamp": 1778261746.1860077, "eos/sharpness": 38.23318481445312, "eos/L0_probe": 1.9774688482284546, "eos/L_plus": 2.169710159301758, "eos/L_minus": 2.1675593852996826, "eos/grad_norm": 0.12778949737548828, "eos/embed_grad_frac": 0.1271357536315918, "eos/time_s": 0.5837914943695068} {"step": 62175, "timestamp": 1778261747.56335, "geo/rankme_last": 439.0992126464844, "geo/layer_0/stable_rank_q_proj": 19.304243087768555, "geo/layer_0/stable_rank_k_proj": 16.15127182006836, "geo/layer_0/stable_rank_o_proj": 47.29228210449219, "geo/layer_0/stable_rank_gate_proj": 131.5267791748047, "geo/layer_0/stable_rank_down_proj": 55.05428695678711, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06773523986339569, "geo/layer_0/attn_entropy_mean": 6.1541242599487305, "geo/layer_0/attn_entropy_std": 0.41722723841667175, "geo/layer_7/stable_rank_q_proj": 43.34341049194336, "geo/layer_7/stable_rank_k_proj": 41.18852233886719, "geo/layer_7/stable_rank_o_proj": 91.86624908447266, "geo/layer_7/stable_rank_gate_proj": 81.79600524902344, "geo/layer_7/stable_rank_down_proj": 141.2376251220703, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4588221311569214, "geo/layer_7/attn_entropy_mean": 4.644577503204346, "geo/layer_7/attn_entropy_std": 0.7970860600471497, "geo/layer_14/stable_rank_q_proj": 51.40810775756836, "geo/layer_14/stable_rank_k_proj": 39.956268310546875, "geo/layer_14/stable_rank_o_proj": 44.18912887573242, "geo/layer_14/stable_rank_gate_proj": 71.84012603759766, "geo/layer_14/stable_rank_down_proj": 129.57774353027344, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.41081055998802185, "geo/layer_14/attn_entropy_mean": 5.536489009857178, "geo/layer_14/attn_entropy_std": 0.40311965346336365, "geo/layer_21/stable_rank_q_proj": 40.73733901977539, "geo/layer_21/stable_rank_k_proj": 30.382204055786133, "geo/layer_21/stable_rank_o_proj": 70.37322998046875, "geo/layer_21/stable_rank_gate_proj": 66.16233825683594, "geo/layer_21/stable_rank_down_proj": 51.466575622558594, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14015096426010132, "geo/layer_21/attn_entropy_mean": 5.70298957824707, "geo/layer_21/attn_entropy_std": 0.3026614487171173, "geo/layer_27/stable_rank_q_proj": 43.32511901855469, "geo/layer_27/stable_rank_k_proj": 31.596057891845703, "geo/layer_27/stable_rank_o_proj": 116.51892852783203, "geo/layer_27/stable_rank_gate_proj": 80.46741485595703, "geo/layer_27/stable_rank_down_proj": 128.32521057128906, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09639330953359604, "geo/layer_27/attn_entropy_mean": 4.198114395141602, "geo/layer_27/attn_entropy_std": 0.7310872673988342, "attnres/final_alpha/block_0": 0.23613783717155457, "attnres/block_norm/0": 1.7628296613693237, "attnres/final_alpha/block_1": 0.004482579883188009, "attnres/block_norm/1": 46404.2109375, "attnres/final_alpha/block_2": 0.010362830944359303, "attnres/block_norm/2": 28463.4453125, "attnres/final_alpha/block_3": 0.012483172118663788, "attnres/block_norm/3": 57659.51953125, "attnres/final_alpha/block_4": 0.014878390356898308, "attnres/block_norm/4": 15043.1318359375, "attnres/final_alpha/block_5": 0.61383056640625, "attnres/block_norm/5": 6633.220703125, "attnres/final_alpha/block_6": 0.10782463848590851, "attnres/block_norm/6": 38076.1328125, "geo/tier1_time_s": 1.359907865524292, "geo/step": 62175.0, "geo/rankme_slope": 1.0353555484693878e-05} {"step": 62180, "timestamp": 1778261752.73541, "train/loss": 2.10062917470932, "train/z_loss": 0.0013997866306453943, "train/perplexity": 8.171309476821024, "train/grad_norm": 0.11962890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1706527.055929727, "perf/iters_per_sec": 0.8137355117462763, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2289005279541017, "data/tokens_consumed": 130403008512, "data/tokens_consumed_B": 130.403008512, "train/loss_slope": -8.600860052626662e-06} {"step": 62190, "timestamp": 1778261763.0728233, "train/loss": 2.107642042636871, "train/z_loss": 0.0013791730161756277, "train/perplexity": 8.228815195302586, "train/grad_norm": 0.3359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029588.3959829065, "perf/iters_per_sec": 0.9677831630625279, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0332893133163452, "data/tokens_consumed": 130423980032, "data/tokens_consumed_B": 130.423980032, "train/loss_slope": -1.0591839892779117e-05} {"step": 62200, "timestamp": 1778261773.4198086, "grad/layer_0/attn": 0.002764283213764429, "grad/layer_0/mlp": 0.003048601793125272, "grad/layer_0/attn_mlp_ratio": 0.9067380099704085, "grad/layer_4/attn": 0.0037275953218340874, "grad/layer_4/mlp": 0.0025688353925943375, "grad/layer_4/attn_mlp_ratio": 1.4510837041065197, "grad/layer_8/attn": 0.00999073963612318, "grad/layer_8/mlp": 0.0037780897691845894, "grad/layer_8/attn_mlp_ratio": 2.6443890913265764, "grad/layer_12/attn": 0.005023700650781393, "grad/layer_12/mlp": 0.00720318453386426, "grad/layer_12/attn_mlp_ratio": 0.6974277220610895, "grad/layer_16/attn": 0.00440567871555686, "grad/layer_16/mlp": 0.0045846267603337765, "grad/layer_16/attn_mlp_ratio": 0.9609677842432114, "grad/layer_20/attn": 0.00787606742233038, "grad/layer_20/mlp": 0.006006272044032812, "grad/layer_20/attn_mlp_ratio": 1.3113071192012495, "grad/layer_24/attn": 0.007401102222502232, "grad/layer_24/mlp": 0.00927720032632351, "grad/layer_24/attn_mlp_ratio": 0.7977732378727144, "grad/layer_27/attn": 0.010004193522036076, "grad/layer_27/mlp": 0.007862771861255169, "grad/layer_27/attn_mlp_ratio": 1.2723494425799249} {"step": 62200, "timestamp": 1778261773.4339905, "train/loss": 2.1455766201019286, "train/z_loss": 0.001386630698107183, "train/perplexity": 8.546968170330821, "train/grad_norm": 0.09716796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025419.9074538418, "perf/iters_per_sec": 0.9657954728383263, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354159116744994, "data/tokens_consumed": 130444951552, "data/tokens_consumed_B": 130.444951552, "train/loss_slope": -1.057032174689258e-05} {"step": 62210, "timestamp": 1778261783.7857552, "train/loss": 2.1587416887283326, "train/z_loss": 0.001380749442614615, "train/perplexity": 8.660233530042232, "train/grad_norm": 0.3671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027054.3964545082, "perf/iters_per_sec": 0.9665748579285184, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345810174942016, "data/tokens_consumed": 130465923072, "data/tokens_consumed_B": 130.465923072, "train/loss_slope": -9.883343905183678e-06} {"step": 62220, "timestamp": 1778261794.1333928, "train/loss": 2.1717881679534914, "train/z_loss": 0.0013812926248647273, "train/perplexity": 8.77395933444212, "train/grad_norm": 0.2578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027742.0165436734, "perf/iters_per_sec": 0.9669027407377593, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342301845550537, "data/tokens_consumed": 130486894592, "data/tokens_consumed_B": 130.486894592, "train/loss_slope": -6.475975311020624e-06} {"step": 62230, "timestamp": 1778261804.477741, "train/loss": 2.081598770618439, "train/z_loss": 0.0013972512679174542, "train/perplexity": 8.0172764594481, "train/grad_norm": 0.109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028628.3158665423, "perf/iters_per_sec": 0.9673253611881935, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0337783336639403, "data/tokens_consumed": 130507866112, "data/tokens_consumed_B": 130.507866112, "train/loss_slope": -8.866022429307433e-06} {"step": 62240, "timestamp": 1778261814.8200867, "train/loss": 2.1349326372146606, "train/z_loss": 0.0013811237760819495, "train/perplexity": 8.456476835840636, "train/grad_norm": 0.1826171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028928.772737815, "perf/iters_per_sec": 0.9674686301888538, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0336252450942993, "data/tokens_consumed": 130528837632, "data/tokens_consumed_B": 130.528837632, "train/loss_slope": -9.977858051536942e-06} {"step": 62250, "timestamp": 1778261825.150873, "grad/layer_0/attn": 0.003502889536321163, "grad/layer_0/mlp": 0.003251350950449705, "grad/layer_0/attn_mlp_ratio": 1.0773642962474528, "grad/layer_4/attn": 0.0025851863902062178, "grad/layer_4/mlp": 0.0025967019610106945, "grad/layer_4/attn_mlp_ratio": 0.9955652706649007, "grad/layer_8/attn": 0.0052396757528185844, "grad/layer_8/mlp": 0.0035995591897517443, "grad/layer_8/attn_mlp_ratio": 1.455643686085792, "grad/layer_12/attn": 0.005695636384189129, "grad/layer_12/mlp": 0.006602696143090725, "grad/layer_12/attn_mlp_ratio": 0.862622809605883, "grad/layer_16/attn": 0.003922969102859497, "grad/layer_16/mlp": 0.0049116043373942375, "grad/layer_16/attn_mlp_ratio": 0.798714382003604, "grad/layer_20/attn": 0.004463287070393562, "grad/layer_20/mlp": 0.007653153035789728, "grad/layer_20/attn_mlp_ratio": 0.5831958398324923, "grad/layer_24/attn": 0.018957285210490227, "grad/layer_24/mlp": 0.012806625105440617, "grad/layer_24/attn_mlp_ratio": 1.4802717270461414, "grad/layer_27/attn": 0.009282653219997883, "grad/layer_27/mlp": 0.013568628579378128, "grad/layer_27/attn_mlp_ratio": 0.6841261146828967} {"step": 62250, "timestamp": 1778261825.7364347, "eos/sharpness": 78.24082374572752, "eos/L0_probe": 1.9783351421356201, "eos/L_plus": 2.3253438472747803, "eos/L_minus": 2.4137346744537354, "eos/grad_norm": 0.2466268539428711, "eos/embed_grad_frac": 0.042717527598142624, "eos/time_s": 0.5828800201416016} {"step": 62250, "timestamp": 1778261825.7555523, "train/loss": 2.195561385154724, "train/z_loss": 0.0013766635907813906, "train/perplexity": 8.985043711405355, "train/grad_norm": 0.2470703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1918750.7987256716, "perf/iters_per_sec": 0.9149316781643255, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0929777860641479, "data/tokens_consumed": 130549809152, "data/tokens_consumed_B": 130.549809152, "train/loss_slope": -4.031796202157421e-06} {"step": 62250, "timestamp": 1778261827.1188412, "geo/rankme_last": 438.2923889160156, "geo/layer_0/stable_rank_q_proj": 19.335735321044922, "geo/layer_0/stable_rank_k_proj": 16.17776107788086, "geo/layer_0/stable_rank_o_proj": 47.26020812988281, "geo/layer_0/stable_rank_gate_proj": 131.8278045654297, "geo/layer_0/stable_rank_down_proj": 55.08339309692383, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06380394846200943, "geo/layer_0/attn_entropy_mean": 6.160984516143799, "geo/layer_0/attn_entropy_std": 0.4159412682056427, "geo/layer_7/stable_rank_q_proj": 43.36071014404297, "geo/layer_7/stable_rank_k_proj": 41.19820022583008, "geo/layer_7/stable_rank_o_proj": 91.75194549560547, "geo/layer_7/stable_rank_gate_proj": 81.71610260009766, "geo/layer_7/stable_rank_down_proj": 140.90086364746094, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4471944272518158, "geo/layer_7/attn_entropy_mean": 4.670377731323242, "geo/layer_7/attn_entropy_std": 0.7962183952331543, "geo/layer_14/stable_rank_q_proj": 51.46628189086914, "geo/layer_14/stable_rank_k_proj": 40.08916091918945, "geo/layer_14/stable_rank_o_proj": 44.180747985839844, "geo/layer_14/stable_rank_gate_proj": 71.87146759033203, "geo/layer_14/stable_rank_down_proj": 129.7228546142578, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.40799397230148315, "geo/layer_14/attn_entropy_mean": 5.551164627075195, "geo/layer_14/attn_entropy_std": 0.4064313471317291, "geo/layer_21/stable_rank_q_proj": 40.71358871459961, "geo/layer_21/stable_rank_k_proj": 30.33572769165039, "geo/layer_21/stable_rank_o_proj": 70.53241729736328, "geo/layer_21/stable_rank_gate_proj": 66.09510803222656, "geo/layer_21/stable_rank_down_proj": 51.461971282958984, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14464505016803741, "geo/layer_21/attn_entropy_mean": 5.710455417633057, "geo/layer_21/attn_entropy_std": 0.3034329414367676, "geo/layer_27/stable_rank_q_proj": 43.293495178222656, "geo/layer_27/stable_rank_k_proj": 31.694774627685547, "geo/layer_27/stable_rank_o_proj": 116.33403015136719, "geo/layer_27/stable_rank_gate_proj": 80.41865539550781, "geo/layer_27/stable_rank_down_proj": 128.1984100341797, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08718140423297882, "geo/layer_27/attn_entropy_mean": 4.197642803192139, "geo/layer_27/attn_entropy_std": 0.718268632888794, "attnres/final_alpha/block_0": 0.2379983365535736, "attnres/block_norm/0": 1.762850284576416, "attnres/final_alpha/block_1": 0.0045870039612054825, "attnres/block_norm/1": 46240.0546875, "attnres/final_alpha/block_2": 0.010539715178310871, "attnres/block_norm/2": 28559.58984375, "attnres/final_alpha/block_3": 0.01247305516153574, "attnres/block_norm/3": 57714.8125, "attnres/final_alpha/block_4": 0.014707215130329132, "attnres/block_norm/4": 15109.7431640625, "attnres/final_alpha/block_5": 0.6082930564880371, "attnres/block_norm/5": 6670.26708984375, "attnres/final_alpha/block_6": 0.11140164732933044, "attnres/block_norm/6": 38018.171875, "geo/tier1_time_s": 1.3596022129058838, "geo/step": 62250.0, "geo/rankme_slope": 5.216735131552621e-07} {"step": 62260, "timestamp": 1778261837.4586782, "train/loss": 2.161862087249756, "train/z_loss": 0.0013716765213757753, "train/perplexity": 8.687299115690069, "train/grad_norm": 0.1591796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1792549.1558933312, "perf/iters_per_sec": 0.8547540454355865, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.169927191734314, "data/tokens_consumed": 130570780672, "data/tokens_consumed_B": 130.570780672, "train/loss_slope": 2.6939951761904726e-07} {"step": 62270, "timestamp": 1778261847.7984602, "train/loss": 2.204386281967163, "train/z_loss": 0.0013656246359460057, "train/perplexity": 9.064686698711787, "train/grad_norm": 0.10107421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029735.2661568369, "perf/iters_per_sec": 0.9678531962188897, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033214545249939, "data/tokens_consumed": 130591752192, "data/tokens_consumed_B": 130.591752192, "train/loss_slope": 2.229689893656754e-06} {"step": 62280, "timestamp": 1778261858.136202, "train/loss": 2.152575969696045, "train/z_loss": 0.0013801316381432116, "train/perplexity": 8.607001239655174, "train/grad_norm": 0.19921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029582.542223223, "perf/iters_per_sec": 0.9677803717723956, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033292293548584, "data/tokens_consumed": 130612723712, "data/tokens_consumed_B": 130.612723712, "train/loss_slope": 3.918049230326654e-06} {"step": 62290, "timestamp": 1778261868.485513, "train/loss": 2.1288578271865846, "train/z_loss": 0.0013812858844175935, "train/perplexity": 8.405261066094456, "train/grad_norm": 0.1943359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027756.086900002, "perf/iters_per_sec": 0.966909450006486, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342230081558228, "data/tokens_consumed": 130633695232, "data/tokens_consumed_B": 130.633695232, "train/loss_slope": 2.726837732467896e-06} {"step": 62300, "timestamp": 1778261878.8158386, "grad/layer_0/attn": 0.0028982083313167095, "grad/layer_0/mlp": 0.0030236837919801474, "grad/layer_0/attn_mlp_ratio": 0.9585024211703339, "grad/layer_4/attn": 0.002190427854657173, "grad/layer_4/mlp": 0.0027040347922593355, "grad/layer_4/attn_mlp_ratio": 0.8100590199215136, "grad/layer_8/attn": 0.004243847914040089, "grad/layer_8/mlp": 0.003712959587574005, "grad/layer_8/attn_mlp_ratio": 1.1429824913647146, "grad/layer_12/attn": 0.0046305726282298565, "grad/layer_12/mlp": 0.006507547572255135, "grad/layer_12/attn_mlp_ratio": 0.7115695284065697, "grad/layer_16/attn": 0.003495702752843499, "grad/layer_16/mlp": 0.004411828238517046, "grad/layer_16/attn_mlp_ratio": 0.7923478622965904, "grad/layer_20/attn": 0.004070049151778221, "grad/layer_20/mlp": 0.0058616893365979195, "grad/layer_20/attn_mlp_ratio": 0.6943474566166111, "grad/layer_24/attn": 0.0056242686696350574, "grad/layer_24/mlp": 0.00746318930760026, "grad/layer_24/attn_mlp_ratio": 0.7536012236145974, "grad/layer_27/attn": 0.0049978927709162235, "grad/layer_27/mlp": 0.00682374881580472, "grad/layer_27/attn_mlp_ratio": 0.732426241437523} {"step": 62300, "timestamp": 1778261878.8299088, "train/loss": 2.156889760494232, "train/z_loss": 0.0013679341878741979, "train/perplexity": 8.644210240624588, "train/grad_norm": 0.10400390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028584.8526801956, "perf/iters_per_sec": 0.9673046363259294, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033800482749939, "data/tokens_consumed": 130654666752, "data/tokens_consumed_B": 130.654666752, "train/loss_slope": 3.1441359343988227e-06} {"step": 62310, "timestamp": 1778261889.1683984, "train/loss": 2.127019226551056, "train/z_loss": 0.0013900533667765558, "train/perplexity": 8.389821345845718, "train/grad_norm": 0.1015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029461.3535690363, "perf/iters_per_sec": 0.9677225845189268, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0333539962768554, "data/tokens_consumed": 130675638272, "data/tokens_consumed_B": 130.675638272, "train/loss_slope": 6.022206284139346e-06} {"step": 62320, "timestamp": 1778261899.5083756, "train/loss": 2.0926162242889403, "train/z_loss": 0.0013879727572202682, "train/perplexity": 8.106094809001814, "train/grad_norm": 0.162109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029474.4644717441, "perf/iters_per_sec": 0.9677288362845154, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0333473205566406, "data/tokens_consumed": 130696609792, "data/tokens_consumed_B": 130.696609792, "train/loss_slope": 8.81632820035566e-07} {"step": 62325, "timestamp": 1778261905.2564304, "eos/sharpness": 14.920186996459957, "eos/L0_probe": 1.9773643016815186, "eos/L_plus": 2.0575613975524902, "eos/L_minus": 2.0463690757751465, "eos/grad_norm": 0.09540995955467224, "eos/embed_grad_frac": 0.22078783810138702, "eos/time_s": 0.5816113948822021} {"step": 62325, "timestamp": 1778261906.6333084, "geo/rankme_last": 439.05816650390625, "geo/layer_0/stable_rank_q_proj": 19.340301513671875, "geo/layer_0/stable_rank_k_proj": 16.158586502075195, "geo/layer_0/stable_rank_o_proj": 47.270992279052734, "geo/layer_0/stable_rank_gate_proj": 131.8651580810547, "geo/layer_0/stable_rank_down_proj": 55.05588912963867, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0640864372253418, "geo/layer_0/attn_entropy_mean": 6.166565895080566, "geo/layer_0/attn_entropy_std": 0.4170907139778137, "geo/layer_7/stable_rank_q_proj": 43.37331008911133, "geo/layer_7/stable_rank_k_proj": 41.331356048583984, "geo/layer_7/stable_rank_o_proj": 91.85150146484375, "geo/layer_7/stable_rank_gate_proj": 81.86129760742188, "geo/layer_7/stable_rank_down_proj": 140.94627380371094, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4491702616214752, "geo/layer_7/attn_entropy_mean": 4.648436069488525, "geo/layer_7/attn_entropy_std": 0.7834010720252991, "geo/layer_14/stable_rank_q_proj": 51.50347900390625, "geo/layer_14/stable_rank_k_proj": 40.21881103515625, "geo/layer_14/stable_rank_o_proj": 44.107234954833984, "geo/layer_14/stable_rank_gate_proj": 72.09065246582031, "geo/layer_14/stable_rank_down_proj": 129.91513061523438, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4025707542896271, "geo/layer_14/attn_entropy_mean": 5.532791614532471, "geo/layer_14/attn_entropy_std": 0.3944646716117859, "geo/layer_21/stable_rank_q_proj": 40.71288299560547, "geo/layer_21/stable_rank_k_proj": 30.317968368530273, "geo/layer_21/stable_rank_o_proj": 70.47349548339844, "geo/layer_21/stable_rank_gate_proj": 65.95143127441406, "geo/layer_21/stable_rank_down_proj": 51.47615051269531, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1476007103919983, "geo/layer_21/attn_entropy_mean": 5.684329509735107, "geo/layer_21/attn_entropy_std": 0.30380484461784363, "geo/layer_27/stable_rank_q_proj": 43.376564025878906, "geo/layer_27/stable_rank_k_proj": 31.802371978759766, "geo/layer_27/stable_rank_o_proj": 116.26126861572266, "geo/layer_27/stable_rank_gate_proj": 80.38668060302734, "geo/layer_27/stable_rank_down_proj": 128.35772705078125, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08840793371200562, "geo/layer_27/attn_entropy_mean": 4.199461460113525, "geo/layer_27/attn_entropy_std": 0.7203695774078369, "attnres/final_alpha/block_0": 0.23584333062171936, "attnres/block_norm/0": 1.7630306482315063, "attnres/final_alpha/block_1": 0.004457036033272743, "attnres/block_norm/1": 46335.0390625, "attnres/final_alpha/block_2": 0.010363137349486351, "attnres/block_norm/2": 28465.1328125, "attnres/final_alpha/block_3": 0.012462884187698364, "attnres/block_norm/3": 57547.984375, "attnres/final_alpha/block_4": 0.01453134510666132, "attnres/block_norm/4": 15076.9443359375, "attnres/final_alpha/block_5": 0.6123497486114502, "attnres/block_norm/5": 6637.5791015625, "attnres/final_alpha/block_6": 0.10999249666929245, "attnres/block_norm/6": 38151.7734375, "geo/tier1_time_s": 1.35872483253479, "geo/step": 62325.0, "geo/rankme_slope": 4.448478610194077e-06} {"step": 62330, "timestamp": 1778261911.8057232, "train/loss": 2.127191424369812, "train/z_loss": 0.0013937737559899688, "train/perplexity": 8.39126617917623, "train/grad_norm": 0.11474609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1706373.8445934323, "perf/iters_per_sec": 0.8136624548880731, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2290108680725098, "data/tokens_consumed": 130717581312, "data/tokens_consumed_B": 130.717581312, "train/loss_slope": 3.042599477464188e-08} {"step": 62340, "timestamp": 1778261922.1711423, "train/loss": 2.177856373786926, "train/z_loss": 0.0013813888654112816, "train/perplexity": 8.827363395198955, "train/grad_norm": 0.158203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024673.2278969123, "perf/iters_per_sec": 0.9654394282803117, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0357977628707886, "data/tokens_consumed": 130738552832, "data/tokens_consumed_B": 130.738552832, "train/loss_slope": 3.8461651202618994e-06} {"step": 62350, "timestamp": 1778261932.5270767, "grad/layer_0/attn": 0.0029986456502228975, "grad/layer_0/mlp": 0.002940633101388812, "grad/layer_0/attn_mlp_ratio": 1.0197278765698106, "grad/layer_4/attn": 0.0026357544120401144, "grad/layer_4/mlp": 0.0026591953355818987, "grad/layer_4/attn_mlp_ratio": 0.9911849188562348, "grad/layer_8/attn": 0.005115667823702097, "grad/layer_8/mlp": 0.0036243342328816652, "grad/layer_8/attn_mlp_ratio": 1.4114779029325062, "grad/layer_12/attn": 0.004638367798179388, "grad/layer_12/mlp": 0.006651346106082201, "grad/layer_12/attn_mlp_ratio": 0.6973577460060518, "grad/layer_16/attn": 0.004347169306129217, "grad/layer_16/mlp": 0.0044470583088696, "grad/layer_16/attn_mlp_ratio": 0.977538162633269, "grad/layer_20/attn": 0.00702300388365984, "grad/layer_20/mlp": 0.005815282929688692, "grad/layer_20/attn_mlp_ratio": 1.207680494277817, "grad/layer_24/attn": 0.010396444238722324, "grad/layer_24/mlp": 0.009892480447888374, "grad/layer_24/attn_mlp_ratio": 1.050944117442973, "grad/layer_27/attn": 0.004946023691445589, "grad/layer_27/mlp": 0.009308263659477234, "grad/layer_27/attn_mlp_ratio": 0.5313583520245418} {"step": 62350, "timestamp": 1778261932.5413458, "train/loss": 2.150681757926941, "train/z_loss": 0.0013815624988637865, "train/perplexity": 8.590713187990085, "train/grad_norm": 0.169921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023294.3728785363, "perf/iters_per_sec": 0.9647819389717752, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0365036487579347, "data/tokens_consumed": 130759524352, "data/tokens_consumed_B": 130.759524352, "train/loss_slope": 4.267594172651993e-06} {"step": 62360, "timestamp": 1778261942.9083538, "train/loss": 2.093375301361084, "train/z_loss": 0.0013845453155227005, "train/perplexity": 8.112250295664783, "train/grad_norm": 0.255859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024317.5180270174, "perf/iters_per_sec": 0.9652698125968062, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0359797716140746, "data/tokens_consumed": 130780495872, "data/tokens_consumed_B": 130.780495872, "train/loss_slope": 1.4046969372268584e-06} {"step": 62370, "timestamp": 1778261953.2777412, "train/loss": 2.1304585695266725, "train/z_loss": 0.0013785703922621906, "train/perplexity": 8.418726497830507, "train/grad_norm": 0.1357421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023735.9496572707, "perf/iters_per_sec": 0.9649924991880754, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0362774848937988, "data/tokens_consumed": 130801467392, "data/tokens_consumed_B": 130.801467392, "train/loss_slope": 1.206765309347272e-06} {"step": 62380, "timestamp": 1778261963.6441922, "train/loss": 2.1160791397094725, "train/z_loss": 0.001383730990346521, "train/perplexity": 8.298536215818768, "train/grad_norm": 0.1484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024395.9274739982, "perf/iters_per_sec": 0.9653072011346808, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0359396457672119, "data/tokens_consumed": 130822438912, "data/tokens_consumed_B": 130.822438912, "train/loss_slope": 1.72759397159364e-07} {"step": 62390, "timestamp": 1778261974.0073874, "train/loss": 2.1396186113357545, "train/z_loss": 0.0013753958977758885, "train/perplexity": 8.496196657796107, "train/grad_norm": 0.14453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024588.8787783785, "perf/iters_per_sec": 0.9653992074863331, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035840916633606, "data/tokens_consumed": 130843410432, "data/tokens_consumed_B": 130.843410432, "train/loss_slope": -1.579328095487294e-06} {"step": 62400, "timestamp": 1778261984.3703785, "grad/layer_0/attn": 0.002683785976842046, "grad/layer_0/mlp": 0.0028230466414242983, "grad/layer_0/attn_mlp_ratio": 0.9506700464647666, "grad/layer_4/attn": 0.002576550468802452, "grad/layer_4/mlp": 0.0026172276120632887, "grad/layer_4/attn_mlp_ratio": 0.9844578891346184, "grad/layer_8/attn": 0.008493311703205109, "grad/layer_8/mlp": 0.003893284359946847, "grad/layer_8/attn_mlp_ratio": 2.181528678570038, "grad/layer_12/attn": 0.004467654973268509, "grad/layer_12/mlp": 0.006792864296585321, "grad/layer_12/attn_mlp_ratio": 0.6576982422193408, "grad/layer_16/attn": 0.005152310244739056, "grad/layer_16/mlp": 0.004672659561038017, "grad/layer_16/attn_mlp_ratio": 1.1026504428945476, "grad/layer_20/attn": 0.0034942287020385265, "grad/layer_20/mlp": 0.006355280056595802, "grad/layer_20/attn_mlp_ratio": 0.5498150539299287, "grad/layer_24/attn": 0.010060235857963562, "grad/layer_24/mlp": 0.010014088824391365, "grad/layer_24/attn_mlp_ratio": 1.0046082008978168, "grad/layer_27/attn": 0.0039516109973192215, "grad/layer_27/mlp": 0.009930642321705818, "grad/layer_27/attn_mlp_ratio": 0.3979209833073862} {"step": 62400, "timestamp": 1778261984.9551656, "eos/sharpness": 73.97563457489012, "eos/L0_probe": 1.9757881164550781, "eos/L_plus": 2.397052526473999, "eos/L_minus": 2.2942800521850586, "eos/grad_norm": 0.1808096170425415, "eos/embed_grad_frac": 0.0670056864619255, "eos/time_s": 0.5819921493530273} {"step": 62400, "timestamp": 1778261984.9756067, "train/loss": 2.1585211873054506, "train/z_loss": 0.0013875715318135916, "train/perplexity": 8.65832414674497, "train/grad_norm": 0.1806640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1913177.175604404, "perf/iters_per_sec": 0.912273967554285, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0961619377136231, "data/tokens_consumed": 130864381952, "data/tokens_consumed_B": 130.864381952, "train/loss_slope": -7.855141803088132e-08} {"step": 62400, "timestamp": 1778261986.3361726, "geo/rankme_last": 439.1976318359375, "geo/layer_0/stable_rank_q_proj": 19.366004943847656, "geo/layer_0/stable_rank_k_proj": 16.158000946044922, "geo/layer_0/stable_rank_o_proj": 47.3940315246582, "geo/layer_0/stable_rank_gate_proj": 131.79940795898438, "geo/layer_0/stable_rank_down_proj": 55.06516647338867, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06354635208845139, "geo/layer_0/attn_entropy_mean": 6.166606426239014, "geo/layer_0/attn_entropy_std": 0.4205085337162018, "geo/layer_7/stable_rank_q_proj": 43.27968978881836, "geo/layer_7/stable_rank_k_proj": 41.222900390625, "geo/layer_7/stable_rank_o_proj": 91.66974639892578, "geo/layer_7/stable_rank_gate_proj": 81.82943725585938, "geo/layer_7/stable_rank_down_proj": 141.05250549316406, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.452979177236557, "geo/layer_7/attn_entropy_mean": 4.616888523101807, "geo/layer_7/attn_entropy_std": 0.8058378100395203, "geo/layer_14/stable_rank_q_proj": 51.5339469909668, "geo/layer_14/stable_rank_k_proj": 40.30616760253906, "geo/layer_14/stable_rank_o_proj": 44.1151008605957, "geo/layer_14/stable_rank_gate_proj": 72.13511657714844, "geo/layer_14/stable_rank_down_proj": 129.8536834716797, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38479551672935486, "geo/layer_14/attn_entropy_mean": 5.55226993560791, "geo/layer_14/attn_entropy_std": 0.40877169370651245, "geo/layer_21/stable_rank_q_proj": 40.615074157714844, "geo/layer_21/stable_rank_k_proj": 30.27012062072754, "geo/layer_21/stable_rank_o_proj": 70.44456481933594, "geo/layer_21/stable_rank_gate_proj": 65.95384216308594, "geo/layer_21/stable_rank_down_proj": 51.43928146362305, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13972827792167664, "geo/layer_21/attn_entropy_mean": 5.685696125030518, "geo/layer_21/attn_entropy_std": 0.3053045868873596, "geo/layer_27/stable_rank_q_proj": 43.3832893371582, "geo/layer_27/stable_rank_k_proj": 31.799882888793945, "geo/layer_27/stable_rank_o_proj": 116.33671569824219, "geo/layer_27/stable_rank_gate_proj": 80.48783874511719, "geo/layer_27/stable_rank_down_proj": 128.349853515625, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09687300771474838, "geo/layer_27/attn_entropy_mean": 4.2108564376831055, "geo/layer_27/attn_entropy_std": 0.7356253862380981, "attnres/final_alpha/block_0": 0.235963374376297, "attnres/block_norm/0": 1.763124942779541, "attnres/final_alpha/block_1": 0.004496856592595577, "attnres/block_norm/1": 46260.140625, "attnres/final_alpha/block_2": 0.010397219099104404, "attnres/block_norm/2": 28444.02734375, "attnres/final_alpha/block_3": 0.012439478188753128, "attnres/block_norm/3": 57794.75, "attnres/final_alpha/block_4": 0.014521751552820206, "attnres/block_norm/4": 15125.509765625, "attnres/final_alpha/block_5": 0.612941563129425, "attnres/block_norm/5": 6661.2412109375, "attnres/final_alpha/block_6": 0.10923977196216583, "attnres/block_norm/6": 38121.9296875, "geo/tier1_time_s": 1.3562278747558594, "geo/step": 62400.0, "geo/rankme_slope": -2.702917104341739e-06} {"step": 62410, "timestamp": 1778261997.3332734, "train/loss": 2.1393149375915526, "train/z_loss": 0.0013838708051480352, "train/perplexity": 8.493616977655947, "train/grad_norm": 0.09765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1697634.436703889, "perf/iters_per_sec": 0.8094951804656453, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2353378057479858, "data/tokens_consumed": 130885353472, "data/tokens_consumed_B": 130.885353472, "train/loss_slope": -3.451807735705044e-07} {"step": 62420, "timestamp": 1778262007.699464, "train/loss": 2.1924338579177856, "train/z_loss": 0.0013764021219685673, "train/perplexity": 8.956986639970246, "train/grad_norm": 0.2216796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024078.2739811332, "perf/iters_per_sec": 0.9651557321458498, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0361022233963013, "data/tokens_consumed": 130906324992, "data/tokens_consumed_B": 130.906324992, "train/loss_slope": -1.4200193117303637e-06} {"step": 62430, "timestamp": 1778262018.0703154, "train/loss": 2.1229625225067137, "train/z_loss": 0.001376098464243114, "train/perplexity": 8.355855265409858, "train/grad_norm": 0.171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023489.4882507497, "perf/iters_per_sec": 0.9648749772313832, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0364037036895752, "data/tokens_consumed": 130927296512, "data/tokens_consumed_B": 130.927296512, "train/loss_slope": -4.6397783503745745e-06} {"step": 62440, "timestamp": 1778262028.4335935, "train/loss": 2.133998763561249, "train/z_loss": 0.001368878793437034, "train/perplexity": 8.448583241306622, "train/grad_norm": 0.287109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024965.4284338146, "perf/iters_per_sec": 0.9655787603539536, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0356482982635498, "data/tokens_consumed": 130948268032, "data/tokens_consumed_B": 130.948268032, "train/loss_slope": -5.230702804510486e-06} {"step": 62450, "timestamp": 1778262038.7881367, "grad/layer_0/attn": 0.0031218519434332848, "grad/layer_0/mlp": 0.003495529294013977, "grad/layer_0/attn_mlp_ratio": 0.8930984670818082, "grad/layer_4/attn": 0.0024646662641316652, "grad/layer_4/mlp": 0.0027189028915017843, "grad/layer_4/attn_mlp_ratio": 0.9064929024077863, "grad/layer_8/attn": 0.004738031420856714, "grad/layer_8/mlp": 0.0036680984776467085, "grad/layer_8/attn_mlp_ratio": 1.291685956787025, "grad/layer_12/attn": 0.0058577219024300575, "grad/layer_12/mlp": 0.00783199816942215, "grad/layer_12/attn_mlp_ratio": 0.747921756481982, "grad/layer_16/attn": 0.00427281716838479, "grad/layer_16/mlp": 0.0052864449098706245, "grad/layer_16/attn_mlp_ratio": 0.808259077774718, "grad/layer_20/attn": 0.0097076166421175, "grad/layer_20/mlp": 0.008307233452796936, "grad/layer_20/attn_mlp_ratio": 1.1685739398586037, "grad/layer_24/attn": 0.00633596396073699, "grad/layer_24/mlp": 0.009720878675580025, "grad/layer_24/attn_mlp_ratio": 0.6517892164907628, "grad/layer_27/attn": 0.006367390975356102, "grad/layer_27/mlp": 0.011050892993807793, "grad/layer_27/attn_mlp_ratio": 0.5761879081903324} {"step": 62450, "timestamp": 1778262038.8024414, "train/loss": 2.1189095616340636, "train/z_loss": 0.0013842430780641735, "train/perplexity": 8.32205784703317, "train/grad_norm": 0.1337890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023499.6360242066, "perf/iters_per_sec": 0.9648798160668405, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0363985061645509, "data/tokens_consumed": 130969239552, "data/tokens_consumed_B": 130.969239552, "train/loss_slope": -3.7872064017047735e-06} {"step": 62460, "timestamp": 1778262049.1726334, "train/loss": 2.126125454902649, "train/z_loss": 0.001365842146333307, "train/perplexity": 8.382326111404572, "train/grad_norm": 0.2255859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023669.8824275418, "perf/iters_per_sec": 0.9649609958780011, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0363113164901734, "data/tokens_consumed": 130990211072, "data/tokens_consumed_B": 130.990211072, "train/loss_slope": -3.803577796496523e-06} {"step": 62470, "timestamp": 1778262059.5338376, "train/loss": 2.118247652053833, "train/z_loss": 0.001389369077514857, "train/perplexity": 8.316551219862673, "train/grad_norm": 0.12451171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025374.9960126139, "perf/iters_per_sec": 0.9657740573943204, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035438871383667, "data/tokens_consumed": 131011182592, "data/tokens_consumed_B": 131.011182592, "train/loss_slope": -6.1602045457498255e-06} {"step": 62475, "timestamp": 1778262065.2908306, "eos/sharpness": 10.66923141479492, "eos/L0_probe": 1.9769220352172852, "eos/L_plus": 2.0402119159698486, "eos/L_minus": 2.020324468612671, "eos/grad_norm": 0.09307041019201279, "eos/embed_grad_frac": 0.23343349993228912, "eos/time_s": 0.5868749618530273} {"step": 62475, "timestamp": 1778262066.670696, "geo/rankme_last": 438.7630920410156, "geo/layer_0/stable_rank_q_proj": 19.382347106933594, "geo/layer_0/stable_rank_k_proj": 16.1434268951416, "geo/layer_0/stable_rank_o_proj": 47.37479019165039, "geo/layer_0/stable_rank_gate_proj": 131.8966522216797, "geo/layer_0/stable_rank_down_proj": 55.008811950683594, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06439168751239777, "geo/layer_0/attn_entropy_mean": 6.164975166320801, "geo/layer_0/attn_entropy_std": 0.41739892959594727, "geo/layer_7/stable_rank_q_proj": 43.37361145019531, "geo/layer_7/stable_rank_k_proj": 41.09022521972656, "geo/layer_7/stable_rank_o_proj": 91.39107513427734, "geo/layer_7/stable_rank_gate_proj": 81.70305633544922, "geo/layer_7/stable_rank_down_proj": 141.08450317382812, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4742785096168518, "geo/layer_7/attn_entropy_mean": 4.6814188957214355, "geo/layer_7/attn_entropy_std": 0.8055678606033325, "geo/layer_14/stable_rank_q_proj": 51.62022018432617, "geo/layer_14/stable_rank_k_proj": 40.188682556152344, "geo/layer_14/stable_rank_o_proj": 44.0498161315918, "geo/layer_14/stable_rank_gate_proj": 72.12601470947266, "geo/layer_14/stable_rank_down_proj": 129.83018493652344, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37723296880722046, "geo/layer_14/attn_entropy_mean": 5.536281108856201, "geo/layer_14/attn_entropy_std": 0.3866739273071289, "geo/layer_21/stable_rank_q_proj": 40.51290512084961, "geo/layer_21/stable_rank_k_proj": 30.295589447021484, "geo/layer_21/stable_rank_o_proj": 70.44273376464844, "geo/layer_21/stable_rank_gate_proj": 66.08555603027344, "geo/layer_21/stable_rank_down_proj": 51.46261215209961, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14260879158973694, "geo/layer_21/attn_entropy_mean": 5.698580741882324, "geo/layer_21/attn_entropy_std": 0.2948335111141205, "geo/layer_27/stable_rank_q_proj": 43.381996154785156, "geo/layer_27/stable_rank_k_proj": 31.786060333251953, "geo/layer_27/stable_rank_o_proj": 116.46124267578125, "geo/layer_27/stable_rank_gate_proj": 80.38312530517578, "geo/layer_27/stable_rank_down_proj": 128.45352172851562, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09599259495735168, "geo/layer_27/attn_entropy_mean": 4.22390604019165, "geo/layer_27/attn_entropy_std": 0.736785888671875, "attnres/final_alpha/block_0": 0.23568645119667053, "attnres/block_norm/0": 1.7633956670761108, "attnres/final_alpha/block_1": 0.0044140201061964035, "attnres/block_norm/1": 46390.33203125, "attnres/final_alpha/block_2": 0.010448578745126724, "attnres/block_norm/2": 28485.814453125, "attnres/final_alpha/block_3": 0.012539655901491642, "attnres/block_norm/3": 57532.90234375, "attnres/final_alpha/block_4": 0.014689689502120018, "attnres/block_norm/4": 15039.2265625, "attnres/final_alpha/block_5": 0.6143794059753418, "attnres/block_norm/5": 6605.7158203125, "attnres/final_alpha/block_6": 0.10784216225147247, "attnres/block_norm/6": 38199.0546875, "geo/tier1_time_s": 1.359499454498291, "geo/step": 62475.0, "geo/rankme_slope": -3.617546628026211e-05} {"step": 62480, "timestamp": 1778262071.8534527, "train/loss": 2.1399896144866943, "train/z_loss": 0.0013708595652133226, "train/perplexity": 8.499349358321906, "train/grad_norm": 0.177734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1702989.8311533255, "perf/iters_per_sec": 0.8120488315359714, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.231453037261963, "data/tokens_consumed": 131032154112, "data/tokens_consumed_B": 131.032154112, "train/loss_slope": -7.774873528078497e-06} {"step": 62490, "timestamp": 1778262082.2165935, "train/loss": 2.158591365814209, "train/z_loss": 0.0013790899887681008, "train/perplexity": 8.658931796343659, "train/grad_norm": 0.1982421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024603.6044164342, "perf/iters_per_sec": 0.9654062292177363, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0358333826065063, "data/tokens_consumed": 131053125632, "data/tokens_consumed_B": 131.053125632, "train/loss_slope": -1.0922446517017244e-05} {"step": 62500, "timestamp": 1778262092.5697722, "grad/layer_0/attn": 0.002705130958929658, "grad/layer_0/mlp": 0.0029786271043121815, "grad/layer_0/attn_mlp_ratio": 0.9081804379592784, "grad/layer_4/attn": 0.001991266617551446, "grad/layer_4/mlp": 0.0026844502426683903, "grad/layer_4/attn_mlp_ratio": 0.7417781532036433, "grad/layer_8/attn": 0.004615600686520338, "grad/layer_8/mlp": 0.003712552133947611, "grad/layer_8/attn_mlp_ratio": 1.2432419520768607, "grad/layer_12/attn": 0.0054003228433430195, "grad/layer_12/mlp": 0.007062850054353476, "grad/layer_12/attn_mlp_ratio": 0.7646095733765936, "grad/layer_16/attn": 0.005697503685951233, "grad/layer_16/mlp": 0.004626312758773565, "grad/layer_16/attn_mlp_ratio": 1.2315431013590465, "grad/layer_20/attn": 0.0033775223419070244, "grad/layer_20/mlp": 0.005836778320372105, "grad/layer_20/attn_mlp_ratio": 0.5786620801157125, "grad/layer_24/attn": 0.012493148446083069, "grad/layer_24/mlp": 0.009631345048546791, "grad/layer_24/attn_mlp_ratio": 1.297134331020011, "grad/layer_27/attn": 0.00824361015111208, "grad/layer_27/mlp": 0.008419220335781574, "grad/layer_27/attn_mlp_ratio": 0.9791417404961684} {"step": 62500, "timestamp": 1778262092.584143, "train/loss": 2.194023323059082, "train/z_loss": 0.001372143323533237, "train/perplexity": 8.971234778465565, "train/grad_norm": 0.1669921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024051.772425168, "perf/iters_per_sec": 0.9651430952192154, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036115789413452, "data/tokens_consumed": 131074097152, "data/tokens_consumed_B": 131.074097152, "train/loss_slope": -6.736460069690085e-06} {"step": 62500, "timestamp": 1778262099.6974528, "geo/ww_alpha_mean": 7.558612173912712, "geo/ww_alpha_std": 4.507940050365604, "geo/ww_alpha_min": 1.3437945958376913, "geo/ww_alpha_max": 28.16529809716895, "geo/ww_alpha_healthy_frac": 0.17258883248730963, "geo/ww_alpha_by_type/q_proj": 3.9263225995731545, "geo/ww_alpha_by_type/k_proj": 4.4681579215745675, "geo/ww_alpha_by_type/v_proj": 8.098042141739938, "geo/ww_alpha_by_type/o_proj": 8.748681422748513, "geo/ww_alpha_by_type/gate_proj": 8.240761238109448, "geo/ww_alpha_by_type/up_proj": 11.529860646388832, "geo/ww_alpha_by_type/down_proj": 7.9946170678860415, "geo/twonn_id/layer_0": 0.7536762952804565, "geo/twonn_id/layer_7": 3.194981813430786, "geo/twonn_id/layer_14": 4.406914710998535, "geo/twonn_id/layer_21": 6.940673828125, "geo/twonn_id/layer_27": 6.420340538024902, "geo/tier2_time_s": 7.103872299194336} {"step": 62500, "timestamp": 1778262100.4534779, "eoc/jacobian_sigma/layer_0/attn": 1171.4873046875, "eoc/jacobian_sigma/layer_0/mlp": 9965.66015625, "eoc/jacobian_sigma/layer_0": 9965.66015625, "eoc/jacobian_sigma/layer_7/attn": 1.1532258987426758, "eoc/jacobian_sigma/layer_7/mlp": 1.7135041952133179, "eoc/jacobian_sigma/layer_7": 1.7135041952133179, "eoc/jacobian_sigma/layer_14/attn": 1.485120415687561, "eoc/jacobian_sigma/layer_14/mlp": 5.824930667877197, "eoc/jacobian_sigma/layer_14": 5.824930667877197, "eoc/jacobian_sigma/layer_21/attn": 1.081254243850708, "eoc/jacobian_sigma/layer_21/mlp": 4.3981614112854, "eoc/jacobian_sigma/layer_21": 4.3981614112854, "eoc/jacobian_sigma/layer_27/attn": 3.3519999980926514, "eoc/jacobian_sigma/layer_27/mlp": 25.775047302246094, "eoc/jacobian_sigma/layer_27": 25.775047302246094, "eoc/layer0_sigma": 9965.66015625, "eoc/sigma_max": 25.775047302246094, "eoc/sigma_min": 1.7135041952133179, "eoc/sigma_mean": 9.427910894155502, "eoc/time_s": 0.7493624687194824} {"step": 62510, "timestamp": 1778262111.2942164, "train/loss": 2.122527575492859, "train/z_loss": 0.0013786383555270732, "train/perplexity": 8.352221701375163, "train/grad_norm": 0.2314453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1121349.2884259876, "perf/iters_per_sec": 0.5347010080461443, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.8702040672302247, "data/tokens_consumed": 131095068672, "data/tokens_consumed_B": 131.095068672, "train/loss_slope": -6.33003102956927e-06} {"step": 62520, "timestamp": 1778262121.6572237, "train/loss": 2.1309355974197386, "train/z_loss": 0.001379225996788591, "train/perplexity": 8.422743423210619, "train/grad_norm": 0.10009765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025106.1286688545, "perf/iters_per_sec": 0.9656458514541886, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035576343536377, "data/tokens_consumed": 131116040192, "data/tokens_consumed_B": 131.116040192, "train/loss_slope": -4.681038792126834e-06} {"step": 62530, "timestamp": 1778262132.021605, "train/loss": 2.147492527961731, "train/z_loss": 0.0013738294714130462, "train/perplexity": 8.563359070539265, "train/grad_norm": 0.087890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024915.8756509456, "perf/iters_per_sec": 0.9655551317457893, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0356736421585082, "data/tokens_consumed": 131137011712, "data/tokens_consumed_B": 131.137011712, "train/loss_slope": -5.14809495151628e-06} {"step": 62540, "timestamp": 1778262142.3845692, "train/loss": 2.1702914476394652, "train/z_loss": 0.0013799965847283602, "train/perplexity": 8.760836993958298, "train/grad_norm": 0.08154296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024691.2636757288, "perf/iters_per_sec": 0.9654480284098286, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0357885360717773, "data/tokens_consumed": 131157983232, "data/tokens_consumed_B": 131.157983232, "train/loss_slope": -3.185128106965017e-06} {"step": 62550, "timestamp": 1778262152.738609, "grad/layer_0/attn": 0.0025543617084622383, "grad/layer_0/mlp": 0.002902559470385313, "grad/layer_0/attn_mlp_ratio": 0.880037651776135, "grad/layer_4/attn": 0.002862275578081608, "grad/layer_4/mlp": 0.002659094287082553, "grad/layer_4/attn_mlp_ratio": 1.0764099206053295, "grad/layer_8/attn": 0.00391219649463892, "grad/layer_8/mlp": 0.0036884478759020567, "grad/layer_8/attn_mlp_ratio": 1.0606619695326305, "grad/layer_12/attn": 0.004622649401426315, "grad/layer_12/mlp": 0.006802009884268045, "grad/layer_12/attn_mlp_ratio": 0.6796005022218079, "grad/layer_16/attn": 0.005884282290935516, "grad/layer_16/mlp": 0.004376566968858242, "grad/layer_16/attn_mlp_ratio": 1.344497227702855, "grad/layer_20/attn": 0.0036378735676407814, "grad/layer_20/mlp": 0.005189741495996714, "grad/layer_20/attn_mlp_ratio": 0.7009739310425361, "grad/layer_24/attn": 0.00801731925457716, "grad/layer_24/mlp": 0.009037001989781857, "grad/layer_24/attn_mlp_ratio": 0.8871658073026616, "grad/layer_27/attn": 0.0042553492821753025, "grad/layer_27/mlp": 0.00933479331433773, "grad/layer_27/attn_mlp_ratio": 0.4558589669096822} {"step": 62550, "timestamp": 1778262153.3458197, "eos/sharpness": 50.32722949981689, "eos/L0_probe": 1.976829171180725, "eos/L_plus": 2.2442219257354736, "eos/L_minus": 2.2127087116241455, "eos/grad_norm": 0.12318290024995804, "eos/embed_grad_frac": 0.14377236366271973, "eos/time_s": 0.6044037342071533} {"step": 62550, "timestamp": 1778262153.365421, "train/loss": 2.2050770044326784, "train/z_loss": 0.0013747476041316986, "train/perplexity": 9.07095004432519, "train/grad_norm": 0.123046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1911046.3857300312, "perf/iters_per_sec": 0.9112579277658611, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0973841428756714, "data/tokens_consumed": 131178954752, "data/tokens_consumed_B": 131.178954752, "train/loss_slope": 1.672490509358287e-06} {"step": 62550, "timestamp": 1778262154.7281911, "geo/rankme_last": 439.0682067871094, "geo/layer_0/stable_rank_q_proj": 19.392658233642578, "geo/layer_0/stable_rank_k_proj": 16.126955032348633, "geo/layer_0/stable_rank_o_proj": 47.310394287109375, "geo/layer_0/stable_rank_gate_proj": 131.90399169921875, "geo/layer_0/stable_rank_down_proj": 55.0372314453125, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06621604412794113, "geo/layer_0/attn_entropy_mean": 6.163629531860352, "geo/layer_0/attn_entropy_std": 0.41591179370880127, "geo/layer_7/stable_rank_q_proj": 43.43529510498047, "geo/layer_7/stable_rank_k_proj": 41.075843811035156, "geo/layer_7/stable_rank_o_proj": 91.41142272949219, "geo/layer_7/stable_rank_gate_proj": 81.873046875, "geo/layer_7/stable_rank_down_proj": 140.50999450683594, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4594341814517975, "geo/layer_7/attn_entropy_mean": 4.681840419769287, "geo/layer_7/attn_entropy_std": 0.8154869675636292, "geo/layer_14/stable_rank_q_proj": 51.591758728027344, "geo/layer_14/stable_rank_k_proj": 40.16443634033203, "geo/layer_14/stable_rank_o_proj": 43.98813247680664, "geo/layer_14/stable_rank_gate_proj": 72.2217788696289, "geo/layer_14/stable_rank_down_proj": 130.1396942138672, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4055067002773285, "geo/layer_14/attn_entropy_mean": 5.517550468444824, "geo/layer_14/attn_entropy_std": 0.410930871963501, "geo/layer_21/stable_rank_q_proj": 40.516700744628906, "geo/layer_21/stable_rank_k_proj": 30.253610610961914, "geo/layer_21/stable_rank_o_proj": 70.2720947265625, "geo/layer_21/stable_rank_gate_proj": 66.20121002197266, "geo/layer_21/stable_rank_down_proj": 51.40264892578125, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14561611413955688, "geo/layer_21/attn_entropy_mean": 5.691665172576904, "geo/layer_21/attn_entropy_std": 0.30046823620796204, "geo/layer_27/stable_rank_q_proj": 43.37786865234375, "geo/layer_27/stable_rank_k_proj": 31.826732635498047, "geo/layer_27/stable_rank_o_proj": 116.60640716552734, "geo/layer_27/stable_rank_gate_proj": 80.36972045898438, "geo/layer_27/stable_rank_down_proj": 128.26934814453125, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09941238164901733, "geo/layer_27/attn_entropy_mean": 4.223345756530762, "geo/layer_27/attn_entropy_std": 0.7307230830192566, "attnres/final_alpha/block_0": 0.23537912964820862, "attnres/block_norm/0": 1.763323426246643, "attnres/final_alpha/block_1": 0.004488293081521988, "attnres/block_norm/1": 46446.19921875, "attnres/final_alpha/block_2": 0.01027973834425211, "attnres/block_norm/2": 28552.732421875, "attnres/final_alpha/block_3": 0.01239142194390297, "attnres/block_norm/3": 57634.92578125, "attnres/final_alpha/block_4": 0.014599071815609932, "attnres/block_norm/4": 15051.947265625, "attnres/final_alpha/block_5": 0.613079845905304, "attnres/block_norm/5": 6660.19580078125, "attnres/final_alpha/block_6": 0.10978245735168457, "attnres/block_norm/6": 38397.3984375, "geo/tier1_time_s": 1.3584227561950684, "geo/step": 62550.0, "geo/rankme_slope": -6.535700217587031e-06} {"step": 62560, "timestamp": 1778262165.0897973, "train/loss": 2.1133544206619264, "train/z_loss": 0.001376134983729571, "train/perplexity": 8.275955812721902, "train/grad_norm": 0.24609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1789298.7199383297, "perf/iters_per_sec": 0.8532041167918823, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1720524787902833, "data/tokens_consumed": 131199926272, "data/tokens_consumed_B": 131.199926272, "train/loss_slope": -2.365918564360047e-06} {"step": 62570, "timestamp": 1778262175.455009, "train/loss": 2.191680979728699, "train/z_loss": 0.0013770034885965288, "train/perplexity": 8.950245657976643, "train/grad_norm": 0.2275390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024268.4162138135, "perf/iters_per_sec": 0.9652463990277355, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036004900932312, "data/tokens_consumed": 131220897792, "data/tokens_consumed_B": 131.220897792, "train/loss_slope": -9.632209596996772e-07} {"step": 62580, "timestamp": 1778262185.8187752, "train/loss": 2.128064978122711, "train/z_loss": 0.001382901263423264, "train/perplexity": 8.3985996038426, "train/grad_norm": 0.1650390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024893.5007770872, "perf/iters_per_sec": 0.9655444625745235, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035685086250305, "data/tokens_consumed": 131241869312, "data/tokens_consumed_B": 131.241869312, "train/loss_slope": -1.5522500827009396e-06} {"step": 62590, "timestamp": 1778262196.18686, "train/loss": 2.114078903198242, "train/z_loss": 0.0013881219900213182, "train/perplexity": 8.28195377062507, "train/grad_norm": 0.19140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023879.738777779, "perf/iters_per_sec": 0.9650610631836791, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0362038612365723, "data/tokens_consumed": 131262840832, "data/tokens_consumed_B": 131.262840832, "train/loss_slope": -5.361998485367138e-06} {"step": 62600, "timestamp": 1778262206.563325, "grad/layer_0/attn": 0.0035591856576502323, "grad/layer_0/mlp": 0.003227157285436988, "grad/layer_0/attn_mlp_ratio": 1.1028856769463955, "grad/layer_4/attn": 0.0028609486762434244, "grad/layer_4/mlp": 0.002538670552894473, "grad/layer_4/attn_mlp_ratio": 1.1269475514602514, "grad/layer_8/attn": 0.005370126571506262, "grad/layer_8/mlp": 0.003529883921146393, "grad/layer_8/attn_mlp_ratio": 1.5213322985501991, "grad/layer_12/attn": 0.005946406628936529, "grad/layer_12/mlp": 0.007491500116884708, "grad/layer_12/attn_mlp_ratio": 0.7937537818572344, "grad/layer_16/attn": 0.004413635469973087, "grad/layer_16/mlp": 0.005449992138892412, "grad/layer_16/attn_mlp_ratio": 0.8098425238987238, "grad/layer_20/attn": 0.0051103620789945126, "grad/layer_20/mlp": 0.0067398203536868095, "grad/layer_20/attn_mlp_ratio": 0.7582341568459806, "grad/layer_24/attn": 0.016095420345664024, "grad/layer_24/mlp": 0.010196913033723831, "grad/layer_24/attn_mlp_ratio": 1.5784600824373314, "grad/layer_27/attn": 0.010616777464747429, "grad/layer_27/mlp": 0.009468554519116879, "grad/layer_27/attn_mlp_ratio": 1.1212669612014812} {"step": 62600, "timestamp": 1778262206.577472, "train/loss": 2.121212422847748, "train/z_loss": 0.0013908127904869616, "train/perplexity": 8.341244474858497, "train/grad_norm": 0.20703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019249.9383529238, "perf/iters_per_sec": 0.9628534023060435, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0385797023773193, "data/tokens_consumed": 131283812352, "data/tokens_consumed_B": 131.283812352, "train/loss_slope": -7.267294808475553e-06} {"step": 62610, "timestamp": 1778262216.94517, "train/loss": 2.1756450176239013, "train/z_loss": 0.0013788115931674838, "train/perplexity": 8.807864518178604, "train/grad_norm": 0.24609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024501.088596971, "perf/iters_per_sec": 0.9653573458657126, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0358858346939086, "data/tokens_consumed": 131304783872, "data/tokens_consumed_B": 131.304783872, "train/loss_slope": -6.070560068473246e-06} {"step": 62620, "timestamp": 1778262227.3118885, "train/loss": 2.1955167770385744, "train/z_loss": 0.001374862191732973, "train/perplexity": 8.98464291447133, "train/grad_norm": 0.2314453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024179.8619448454, "perf/iters_per_sec": 0.965204173061774, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0360502243041991, "data/tokens_consumed": 131325755392, "data/tokens_consumed_B": 131.325755392, "train/loss_slope": -2.1878109584868523e-06} {"step": 62625, "timestamp": 1778262233.0624313, "eos/sharpness": 19.041013717651364, "eos/L0_probe": 1.9754656553268433, "eos/L_plus": 2.0745320320129395, "eos/L_minus": 2.0668094158172607, "eos/grad_norm": 0.13111351430416107, "eos/embed_grad_frac": 0.1298401951789856, "eos/time_s": 0.5806641578674316} {"step": 62625, "timestamp": 1778262234.4386916, "geo/rankme_last": 438.7973937988281, "geo/layer_0/stable_rank_q_proj": 19.401508331298828, "geo/layer_0/stable_rank_k_proj": 16.13657569885254, "geo/layer_0/stable_rank_o_proj": 47.27320098876953, "geo/layer_0/stable_rank_gate_proj": 131.9907684326172, "geo/layer_0/stable_rank_down_proj": 55.054420471191406, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06250923871994019, "geo/layer_0/attn_entropy_mean": 6.163786888122559, "geo/layer_0/attn_entropy_std": 0.41627371311187744, "geo/layer_7/stable_rank_q_proj": 43.39375686645508, "geo/layer_7/stable_rank_k_proj": 41.184444427490234, "geo/layer_7/stable_rank_o_proj": 91.47000122070312, "geo/layer_7/stable_rank_gate_proj": 81.76732635498047, "geo/layer_7/stable_rank_down_proj": 140.5708465576172, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4504527151584625, "geo/layer_7/attn_entropy_mean": 4.676367282867432, "geo/layer_7/attn_entropy_std": 0.7914363741874695, "geo/layer_14/stable_rank_q_proj": 51.63838195800781, "geo/layer_14/stable_rank_k_proj": 40.22675704956055, "geo/layer_14/stable_rank_o_proj": 44.031864166259766, "geo/layer_14/stable_rank_gate_proj": 72.1598129272461, "geo/layer_14/stable_rank_down_proj": 130.13441467285156, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38926854729652405, "geo/layer_14/attn_entropy_mean": 5.553175926208496, "geo/layer_14/attn_entropy_std": 0.3919413387775421, "geo/layer_21/stable_rank_q_proj": 40.45962142944336, "geo/layer_21/stable_rank_k_proj": 30.244544982910156, "geo/layer_21/stable_rank_o_proj": 70.32192993164062, "geo/layer_21/stable_rank_gate_proj": 65.99364471435547, "geo/layer_21/stable_rank_down_proj": 51.40055847167969, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14471176266670227, "geo/layer_21/attn_entropy_mean": 5.696563720703125, "geo/layer_21/attn_entropy_std": 0.306673526763916, "geo/layer_27/stable_rank_q_proj": 43.35300827026367, "geo/layer_27/stable_rank_k_proj": 31.830228805541992, "geo/layer_27/stable_rank_o_proj": 116.62455749511719, "geo/layer_27/stable_rank_gate_proj": 80.484619140625, "geo/layer_27/stable_rank_down_proj": 128.38027954101562, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09358138591051102, "geo/layer_27/attn_entropy_mean": 4.225538730621338, "geo/layer_27/attn_entropy_std": 0.7464669942855835, "attnres/final_alpha/block_0": 0.23648914694786072, "attnres/block_norm/0": 1.763408899307251, "attnres/final_alpha/block_1": 0.004533139057457447, "attnres/block_norm/1": 46461.4453125, "attnres/final_alpha/block_2": 0.010535226203501225, "attnres/block_norm/2": 28466.43359375, "attnres/final_alpha/block_3": 0.012563491240143776, "attnres/block_norm/3": 57697.53515625, "attnres/final_alpha/block_4": 0.014648647978901863, "attnres/block_norm/4": 15152.447265625, "attnres/final_alpha/block_5": 0.6120359897613525, "attnres/block_norm/5": 6656.734375, "attnres/final_alpha/block_6": 0.1091943308711052, "attnres/block_norm/6": 38268.234375, "geo/tier1_time_s": 1.3562500476837158, "geo/step": 62625.0, "geo/rankme_slope": 1.7389533938575434e-05} {"step": 62630, "timestamp": 1778262239.6219409, "train/loss": 2.1785110235214233, "train/z_loss": 0.001365465356502682, "train/perplexity": 8.833144118269892, "train/grad_norm": 0.236328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1704291.6699281535, "perf/iters_per_sec": 0.812669596637799, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2305123805999756, "data/tokens_consumed": 131346726912, "data/tokens_consumed_B": 131.346726912, "train/loss_slope": -3.0307200160285097e-06} {"step": 62640, "timestamp": 1778262249.9817376, "train/loss": 2.1207780122756956, "train/z_loss": 0.0013692358741536736, "train/perplexity": 8.337621737009329, "train/grad_norm": 0.12060546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025114.4277028672, "perf/iters_per_sec": 0.965649808741983, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035572099685669, "data/tokens_consumed": 131367698432, "data/tokens_consumed_B": 131.367698432, "train/loss_slope": -6.835646984135786e-06} {"step": 62650, "timestamp": 1778262260.3395119, "grad/layer_0/attn": 0.002762305550277233, "grad/layer_0/mlp": 0.0028047210071235895, "grad/layer_0/attn_mlp_ratio": 0.9848770857328285, "grad/layer_4/attn": 0.0023591930512338877, "grad/layer_4/mlp": 0.0026672433596104383, "grad/layer_4/attn_mlp_ratio": 0.8845060778885394, "grad/layer_8/attn": 0.0047742645256221294, "grad/layer_8/mlp": 0.003677431959658861, "grad/layer_8/attn_mlp_ratio": 1.2982604296066904, "grad/layer_12/attn": 0.005057451315224171, "grad/layer_12/mlp": 0.007610794622451067, "grad/layer_12/attn_mlp_ratio": 0.6645102777907288, "grad/layer_16/attn": 0.004550511483103037, "grad/layer_16/mlp": 0.004443215671926737, "grad/layer_16/attn_mlp_ratio": 1.0241482108193394, "grad/layer_20/attn": 0.0027969288639724255, "grad/layer_20/mlp": 0.005499734077602625, "grad/layer_20/attn_mlp_ratio": 0.5085570999708986, "grad/layer_24/attn": 0.005114427767693996, "grad/layer_24/mlp": 0.007104671094566584, "grad/layer_24/attn_mlp_ratio": 0.7198683271374108, "grad/layer_27/attn": 0.007184534799307585, "grad/layer_27/mlp": 0.006427980959415436, "grad/layer_27/attn_mlp_ratio": 1.117696946039376} {"step": 62650, "timestamp": 1778262260.3536692, "train/loss": 2.1677303194999693, "train/z_loss": 0.0013708455138839782, "train/perplexity": 8.738428076111616, "train/grad_norm": 0.08447265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023403.608647424, "perf/iters_per_sec": 0.9648340266453858, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0364476919174195, "data/tokens_consumed": 131388669952, "data/tokens_consumed_B": 131.388669952, "train/loss_slope": -7.3127713820042015e-06} {"step": 62660, "timestamp": 1778262270.7171571, "train/loss": 2.1625935792922975, "train/z_loss": 0.0013837886159308254, "train/perplexity": 8.693656130633835, "train/grad_norm": 0.09423828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024901.6582259056, "perf/iters_per_sec": 0.9655483523492363, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0356809139251708, "data/tokens_consumed": 131409641472, "data/tokens_consumed_B": 131.409641472, "train/loss_slope": -4.095615483675098e-06} {"step": 62670, "timestamp": 1778262281.0802264, "train/loss": 2.1078359127044677, "train/z_loss": 0.0013919002027250825, "train/perplexity": 8.230410670913223, "train/grad_norm": 0.2255859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024889.86490679, "perf/iters_per_sec": 0.9655427288564634, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0356869459152223, "data/tokens_consumed": 131430612992, "data/tokens_consumed_B": 131.430612992, "train/loss_slope": -7.306082700059221e-06} {"step": 62680, "timestamp": 1778262291.4449787, "train/loss": 2.169446516036987, "train/z_loss": 0.0013733371743001044, "train/perplexity": 8.753437812259364, "train/grad_norm": 0.255859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024279.7830250931, "perf/iters_per_sec": 0.9652518191457239, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0359990835189818, "data/tokens_consumed": 131451584512, "data/tokens_consumed_B": 131.451584512, "train/loss_slope": -1.6225512808162363e-06} {"step": 62690, "timestamp": 1778262301.8125172, "train/loss": 2.1951913356781008, "train/z_loss": 0.0013686737627722323, "train/perplexity": 8.98171941579737, "train/grad_norm": 0.138671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024437.5340024694, "perf/iters_per_sec": 0.9653270406734797, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0359183549880981, "data/tokens_consumed": 131472556032, "data/tokens_consumed_B": 131.472556032, "train/loss_slope": -3.7842701763994754e-07} {"step": 62700, "timestamp": 1778262312.176872, "grad/layer_0/attn": 0.00298426509834826, "grad/layer_0/mlp": 0.0032473180908709764, "grad/layer_0/attn_mlp_ratio": 0.9189937428176196, "grad/layer_4/attn": 0.0035981666296720505, "grad/layer_4/mlp": 0.0025776808615773916, "grad/layer_4/attn_mlp_ratio": 1.3958929298489215, "grad/layer_8/attn": 0.004683078732341528, "grad/layer_8/mlp": 0.003766138106584549, "grad/layer_8/attn_mlp_ratio": 1.2434696963998402, "grad/layer_12/attn": 0.004750029183924198, "grad/layer_12/mlp": 0.006979974918067455, "grad/layer_12/attn_mlp_ratio": 0.6805223760298412, "grad/layer_16/attn": 0.0033941734582185745, "grad/layer_16/mlp": 0.004777815192937851, "grad/layer_16/attn_mlp_ratio": 0.7104028201415709, "grad/layer_20/attn": 0.004851499572396278, "grad/layer_20/mlp": 0.006428261287510395, "grad/layer_20/attn_mlp_ratio": 0.7547141100737378, "grad/layer_24/attn": 0.016995565965771675, "grad/layer_24/mlp": 0.012429064139723778, "grad/layer_24/attn_mlp_ratio": 1.3674051109538221, "grad/layer_27/attn": 0.011115163564682007, "grad/layer_27/mlp": 0.012280507944524288, "grad/layer_27/attn_mlp_ratio": 0.9051061669747538} {"step": 62700, "timestamp": 1778262312.8360314, "eos/sharpness": 78.38056087493895, "eos/L0_probe": 1.9712209701538086, "eos/L_plus": 2.333362579345703, "eos/L_minus": 2.3928849697113037, "eos/grad_norm": 0.23627151548862457, "eos/embed_grad_frac": 0.039941202849149704, "eos/time_s": 0.6534326076507568} {"step": 62700, "timestamp": 1778262312.8573315, "train/loss": 2.120716691017151, "train/z_loss": 0.0013769086683169008, "train/perplexity": 8.33711047922679, "train/grad_norm": 0.236328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1900005.18893101, "perf/iters_per_sec": 0.9059930748610544, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1037611961364746, "data/tokens_consumed": 131493527552, "data/tokens_consumed_B": 131.493527552, "train/loss_slope": -1.6822313425457145e-06} {"step": 62700, "timestamp": 1778262314.219369, "geo/rankme_last": 438.72686767578125, "geo/layer_0/stable_rank_q_proj": 19.354595184326172, "geo/layer_0/stable_rank_k_proj": 16.136581420898438, "geo/layer_0/stable_rank_o_proj": 47.24340057373047, "geo/layer_0/stable_rank_gate_proj": 132.45535278320312, "geo/layer_0/stable_rank_down_proj": 54.97427749633789, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06513889133930206, "geo/layer_0/attn_entropy_mean": 6.165191173553467, "geo/layer_0/attn_entropy_std": 0.4150458872318268, "geo/layer_7/stable_rank_q_proj": 43.416873931884766, "geo/layer_7/stable_rank_k_proj": 41.139617919921875, "geo/layer_7/stable_rank_o_proj": 91.3913345336914, "geo/layer_7/stable_rank_gate_proj": 81.83306121826172, "geo/layer_7/stable_rank_down_proj": 140.36985778808594, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.452145516872406, "geo/layer_7/attn_entropy_mean": 4.654160499572754, "geo/layer_7/attn_entropy_std": 0.8135857582092285, "geo/layer_14/stable_rank_q_proj": 51.52104568481445, "geo/layer_14/stable_rank_k_proj": 40.267112731933594, "geo/layer_14/stable_rank_o_proj": 43.95362091064453, "geo/layer_14/stable_rank_gate_proj": 72.11808013916016, "geo/layer_14/stable_rank_down_proj": 129.96604919433594, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39197835326194763, "geo/layer_14/attn_entropy_mean": 5.539998531341553, "geo/layer_14/attn_entropy_std": 0.4021652638912201, "geo/layer_21/stable_rank_q_proj": 40.427391052246094, "geo/layer_21/stable_rank_k_proj": 30.246681213378906, "geo/layer_21/stable_rank_o_proj": 70.37333679199219, "geo/layer_21/stable_rank_gate_proj": 66.02932739257812, "geo/layer_21/stable_rank_down_proj": 51.4362907409668, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1453423798084259, "geo/layer_21/attn_entropy_mean": 5.687603950500488, "geo/layer_21/attn_entropy_std": 0.2978970408439636, "geo/layer_27/stable_rank_q_proj": 43.41384506225586, "geo/layer_27/stable_rank_k_proj": 31.728967666625977, "geo/layer_27/stable_rank_o_proj": 116.44403839111328, "geo/layer_27/stable_rank_gate_proj": 80.53384399414062, "geo/layer_27/stable_rank_down_proj": 128.3946075439453, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0952020063996315, "geo/layer_27/attn_entropy_mean": 4.211103439331055, "geo/layer_27/attn_entropy_std": 0.7414266467094421, "attnres/final_alpha/block_0": 0.23629288375377655, "attnres/block_norm/0": 1.763444185256958, "attnres/final_alpha/block_1": 0.004555069375783205, "attnres/block_norm/1": 46433.3984375, "attnres/final_alpha/block_2": 0.010447761043906212, "attnres/block_norm/2": 28601.7734375, "attnres/final_alpha/block_3": 0.012601166032254696, "attnres/block_norm/3": 57894.37109375, "attnres/final_alpha/block_4": 0.014784198254346848, "attnres/block_norm/4": 15084.798828125, "attnres/final_alpha/block_5": 0.6105269193649292, "attnres/block_norm/5": 6715.74609375, "attnres/final_alpha/block_6": 0.11079207062721252, "attnres/block_norm/6": 38467.2890625, "geo/tier1_time_s": 1.357877492904663, "geo/step": 62700.0, "geo/rankme_slope": 2.017893094737895e-05} {"step": 62710, "timestamp": 1778262324.582302, "train/loss": 2.1694133043289185, "train/z_loss": 0.001385953463613987, "train/perplexity": 8.753147100465691, "train/grad_norm": 0.1865234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1789245.9809889947, "perf/iters_per_sec": 0.8531789689011549, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.172087025642395, "data/tokens_consumed": 131514499072, "data/tokens_consumed_B": 131.514499072, "train/loss_slope": 9.088178386282123e-07} {"step": 62720, "timestamp": 1778262334.951544, "train/loss": 2.194350481033325, "train/z_loss": 0.0013768375967629254, "train/perplexity": 8.974170269620632, "train/grad_norm": 0.083984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023851.845443686, "perf/iters_per_sec": 0.9650477626055174, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0362181425094605, "data/tokens_consumed": 131535470592, "data/tokens_consumed_B": 131.535470592, "train/loss_slope": 4.730322430379192e-06} {"step": 62730, "timestamp": 1778262345.3179333, "train/loss": 2.195354127883911, "train/z_loss": 0.0013764517148956656, "train/perplexity": 8.983181688733126, "train/grad_norm": 0.224609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024317.1453287615, "perf/iters_per_sec": 0.965269634880429, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035979962348938, "data/tokens_consumed": 131556442112, "data/tokens_consumed_B": 131.556442112, "train/loss_slope": 9.22798504530401e-06} {"step": 62740, "timestamp": 1778262355.6829534, "train/loss": 2.117936706542969, "train/z_loss": 0.0013715808629058302, "train/perplexity": 8.31396562760497, "train/grad_norm": 0.115234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024267.7174385965, "perf/iters_per_sec": 0.9652460658257468, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0360052585601807, "data/tokens_consumed": 131577413632, "data/tokens_consumed_B": 131.577413632, "train/loss_slope": 5.2598806104251055e-06} {"step": 62750, "timestamp": 1778262366.0421853, "grad/layer_0/attn": 0.0030386880971491337, "grad/layer_0/mlp": 0.0030598987359553576, "grad/layer_0/attn_mlp_ratio": 0.993068156843296, "grad/layer_4/attn": 0.0022450664546340704, "grad/layer_4/mlp": 0.002616405952721834, "grad/layer_4/attn_mlp_ratio": 0.858072642164445, "grad/layer_8/attn": 0.0037486946675926447, "grad/layer_8/mlp": 0.003841776866465807, "grad/layer_8/attn_mlp_ratio": 0.9757710299984457, "grad/layer_12/attn": 0.005735147278755903, "grad/layer_12/mlp": 0.0065734670497477055, "grad/layer_12/attn_mlp_ratio": 0.8724691472712423, "grad/layer_16/attn": 0.00376723799854517, "grad/layer_16/mlp": 0.004657805431634188, "grad/layer_16/attn_mlp_ratio": 0.8088010486825622, "grad/layer_20/attn": 0.004318589344620705, "grad/layer_20/mlp": 0.006006193347275257, "grad/layer_20/attn_mlp_ratio": 0.7190226859209566, "grad/layer_24/attn": 0.01039168518036604, "grad/layer_24/mlp": 0.009267833083868027, "grad/layer_24/attn_mlp_ratio": 1.1212637273677126, "grad/layer_27/attn": 0.010420329868793488, "grad/layer_27/mlp": 0.007814531214535236, "grad/layer_27/attn_mlp_ratio": 1.3334555137569668} {"step": 62750, "timestamp": 1778262366.056772, "train/loss": 2.135930860042572, "train/z_loss": 0.0013743580784648657, "train/perplexity": 8.464922498687296, "train/grad_norm": 0.140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022937.7515813636, "perf/iters_per_sec": 0.9646118886858767, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036686372756958, "data/tokens_consumed": 131598385152, "data/tokens_consumed_B": 131.598385152, "train/loss_slope": 5.035123730650056e-06} {"step": 62760, "timestamp": 1778262376.8949487, "train/loss": 2.1110278367996216, "train/z_loss": 0.0013813786557875575, "train/perplexity": 8.256723488965289, "train/grad_norm": 0.1455078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1936190.2672486622, "perf/iters_per_sec": 0.9232474647754012, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0831332206726074, "data/tokens_consumed": 131619356672, "data/tokens_consumed_B": 131.619356672, "train/loss_slope": 2.1997188041062263e-06} {"step": 62770, "timestamp": 1778262387.2588885, "train/loss": 2.1397602558135986, "train/z_loss": 0.0013778678141534329, "train/perplexity": 8.497400182369654, "train/grad_norm": 0.224609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024509.7088456238, "perf/iters_per_sec": 0.9653614563205832, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0358814239501952, "data/tokens_consumed": 131640328192, "data/tokens_consumed_B": 131.640328192, "train/loss_slope": 9.789903922872195e-07} {"step": 62775, "timestamp": 1778262393.0205169, "eos/sharpness": 26.969933509826657, "eos/L0_probe": 1.9708408117294312, "eos/L_plus": 2.1281542778015137, "eos/L_minus": 2.0832266807556152, "eos/grad_norm": 0.10063965618610382, "eos/embed_grad_frac": 0.20074112713336945, "eos/time_s": 0.5800943374633789} {"step": 62775, "timestamp": 1778262394.3978524, "geo/rankme_last": 439.3907470703125, "geo/layer_0/stable_rank_q_proj": 19.334861755371094, "geo/layer_0/stable_rank_k_proj": 16.144716262817383, "geo/layer_0/stable_rank_o_proj": 47.23279571533203, "geo/layer_0/stable_rank_gate_proj": 132.05731201171875, "geo/layer_0/stable_rank_down_proj": 55.052879333496094, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0615379735827446, "geo/layer_0/attn_entropy_mean": 6.165756702423096, "geo/layer_0/attn_entropy_std": 0.4151518642902374, "geo/layer_7/stable_rank_q_proj": 43.38064193725586, "geo/layer_7/stable_rank_k_proj": 41.176544189453125, "geo/layer_7/stable_rank_o_proj": 91.30840301513672, "geo/layer_7/stable_rank_gate_proj": 81.78174591064453, "geo/layer_7/stable_rank_down_proj": 140.30093383789062, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.45378151535987854, "geo/layer_7/attn_entropy_mean": 4.650778770446777, "geo/layer_7/attn_entropy_std": 0.811964750289917, "geo/layer_14/stable_rank_q_proj": 51.518951416015625, "geo/layer_14/stable_rank_k_proj": 40.22444152832031, "geo/layer_14/stable_rank_o_proj": 43.857845306396484, "geo/layer_14/stable_rank_gate_proj": 72.13092041015625, "geo/layer_14/stable_rank_down_proj": 130.2532501220703, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3909713923931122, "geo/layer_14/attn_entropy_mean": 5.546558380126953, "geo/layer_14/attn_entropy_std": 0.4169325828552246, "geo/layer_21/stable_rank_q_proj": 40.39937973022461, "geo/layer_21/stable_rank_k_proj": 30.176841735839844, "geo/layer_21/stable_rank_o_proj": 70.33920288085938, "geo/layer_21/stable_rank_gate_proj": 66.04141235351562, "geo/layer_21/stable_rank_down_proj": 51.40577697753906, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14529560506343842, "geo/layer_21/attn_entropy_mean": 5.682270526885986, "geo/layer_21/attn_entropy_std": 0.3010088801383972, "geo/layer_27/stable_rank_q_proj": 43.38734436035156, "geo/layer_27/stable_rank_k_proj": 31.64091682434082, "geo/layer_27/stable_rank_o_proj": 116.46389770507812, "geo/layer_27/stable_rank_gate_proj": 80.524169921875, "geo/layer_27/stable_rank_down_proj": 128.5218505859375, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09677291661500931, "geo/layer_27/attn_entropy_mean": 4.1960625648498535, "geo/layer_27/attn_entropy_std": 0.7340153455734253, "attnres/final_alpha/block_0": 0.2360951453447342, "attnres/block_norm/0": 1.763736605644226, "attnres/final_alpha/block_1": 0.004400639329105616, "attnres/block_norm/1": 46447.1015625, "attnres/final_alpha/block_2": 0.010459925048053265, "attnres/block_norm/2": 28515.86328125, "attnres/final_alpha/block_3": 0.012469442561268806, "attnres/block_norm/3": 58079.953125, "attnres/final_alpha/block_4": 0.01472614798694849, "attnres/block_norm/4": 15033.87109375, "attnres/final_alpha/block_5": 0.6128287315368652, "attnres/block_norm/5": 6699.8623046875, "attnres/final_alpha/block_6": 0.10902000218629837, "attnres/block_norm/6": 38586.6875, "geo/tier1_time_s": 1.3582170009613037, "geo/step": 62775.0, "geo/rankme_slope": 4.014463207157864e-05} {"step": 62780, "timestamp": 1778262399.5813768, "train/loss": 2.159125542640686, "train/z_loss": 0.0013846401823684573, "train/perplexity": 8.66355843266225, "train/grad_norm": 0.26171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1702797.4002081936, "perf/iters_per_sec": 0.8119570733109444, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2315922021865844, "data/tokens_consumed": 131661299712, "data/tokens_consumed_B": 131.661299712, "train/loss_slope": 2.168804090587457e-06} {"step": 62790, "timestamp": 1778262409.9438167, "train/loss": 2.128476059436798, "train/z_loss": 0.0013893582159653305, "train/perplexity": 8.40205282093212, "train/grad_norm": 0.185546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024704.359623467, "perf/iters_per_sec": 0.9654542730443321, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0357818365097047, "data/tokens_consumed": 131682271232, "data/tokens_consumed_B": 131.682271232, "train/loss_slope": 2.737660727056005e-06} {"step": 62800, "timestamp": 1778262420.2955844, "grad/layer_0/attn": 0.0030020195990800858, "grad/layer_0/mlp": 0.0030475773382931948, "grad/layer_0/attn_mlp_ratio": 0.9850511299103777, "grad/layer_4/attn": 0.003216933226212859, "grad/layer_4/mlp": 0.002487176563590765, "grad/layer_4/attn_mlp_ratio": 1.2934076108484134, "grad/layer_8/attn": 0.0066960155963897705, "grad/layer_8/mlp": 0.0036000562831759453, "grad/layer_8/attn_mlp_ratio": 1.8599752014113153, "grad/layer_12/attn": 0.004674699157476425, "grad/layer_12/mlp": 0.00684950640425086, "grad/layer_12/attn_mlp_ratio": 0.6824870017387776, "grad/layer_16/attn": 0.003586956299841404, "grad/layer_16/mlp": 0.004678764846175909, "grad/layer_16/attn_mlp_ratio": 0.7666459719830825, "grad/layer_20/attn": 0.0034898831509053707, "grad/layer_20/mlp": 0.005568146239966154, "grad/layer_20/attn_mlp_ratio": 0.6267585185138264, "grad/layer_24/attn": 0.009312830865383148, "grad/layer_24/mlp": 0.009988060221076012, "grad/layer_24/attn_mlp_ratio": 0.9323963378286726, "grad/layer_27/attn": 0.01354855578392744, "grad/layer_27/mlp": 0.00868696253746748, "grad/layer_27/attn_mlp_ratio": 1.5596424607021526} {"step": 62800, "timestamp": 1778262420.3098903, "train/loss": 2.142375087738037, "train/z_loss": 0.0013860585051588715, "train/perplexity": 8.519648530810493, "train/grad_norm": 0.1669921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024014.4664899486, "perf/iters_per_sec": 0.9651253063630812, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0361348867416382, "data/tokens_consumed": 131703242752, "data/tokens_consumed_B": 131.703242752, "train/loss_slope": 1.801346287582879e-06} {"step": 62810, "timestamp": 1778262430.6822312, "train/loss": 2.1867027997970583, "train/z_loss": 0.001373157068155706, "train/perplexity": 8.905800444581297, "train/grad_norm": 0.11767578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023673.001783924, "perf/iters_per_sec": 0.9649624833030338, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0363097190856934, "data/tokens_consumed": 131724214272, "data/tokens_consumed_B": 131.724214272, "train/loss_slope": 3.7280146509829505e-06} {"step": 62820, "timestamp": 1778262441.0492964, "train/loss": 2.12948477268219, "train/z_loss": 0.0013755076797679067, "train/perplexity": 8.410532358893207, "train/grad_norm": 0.2216796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024188.8521090474, "perf/iters_per_sec": 0.9652084599061238, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0360456228256225, "data/tokens_consumed": 131745185792, "data/tokens_consumed_B": 131.745185792, "train/loss_slope": 3.5894613097174325e-06} {"step": 62830, "timestamp": 1778262451.415713, "train/loss": 2.1522579908370973, "train/z_loss": 0.0013828809605911374, "train/perplexity": 8.604264830305247, "train/grad_norm": 0.11865234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023976.323640976, "perf/iters_per_sec": 0.9651071184353714, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0361544132232665, "data/tokens_consumed": 131766157312, "data/tokens_consumed_B": 131.766157312, "train/loss_slope": 4.174944791499156e-06} {"step": 62840, "timestamp": 1778262461.7784312, "train/loss": 2.151217830181122, "train/z_loss": 0.001400163141079247, "train/perplexity": 8.595319665565889, "train/grad_norm": 0.11328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025287.5106997644, "perf/iters_per_sec": 0.9657323411463568, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354835987091064, "data/tokens_consumed": 131787128832, "data/tokens_consumed_B": 131.787128832, "train/loss_slope": 4.129279478870328e-06} {"step": 62850, "timestamp": 1778262472.1298053, "grad/layer_0/attn": 0.003107760800048709, "grad/layer_0/mlp": 0.0029644835740327835, "grad/layer_0/attn_mlp_ratio": 1.0483312245133791, "grad/layer_4/attn": 0.0019726913888007402, "grad/layer_4/mlp": 0.0026007150299847126, "grad/layer_4/attn_mlp_ratio": 0.7585188266322487, "grad/layer_8/attn": 0.005525731481611729, "grad/layer_8/mlp": 0.0034823683090507984, "grad/layer_8/attn_mlp_ratio": 1.58677395167328, "grad/layer_12/attn": 0.004669116344302893, "grad/layer_12/mlp": 0.0075977640226483345, "grad/layer_12/attn_mlp_ratio": 0.61453820741613, "grad/layer_16/attn": 0.0036796911153942347, "grad/layer_16/mlp": 0.00467020645737648, "grad/layer_16/attn_mlp_ratio": 0.7879075732918601, "grad/layer_20/attn": 0.0039366488344967365, "grad/layer_20/mlp": 0.005582401994615793, "grad/layer_20/attn_mlp_ratio": 0.7051890508377425, "grad/layer_24/attn": 0.00880614947527647, "grad/layer_24/mlp": 0.008892756886780262, "grad/layer_24/attn_mlp_ratio": 0.9902608930354737, "grad/layer_27/attn": 0.004063637927174568, "grad/layer_27/mlp": 0.00799122266471386, "grad/layer_27/attn_mlp_ratio": 0.50851265780226} {"step": 62850, "timestamp": 1778262472.7225597, "eos/sharpness": 35.88414192199706, "eos/L0_probe": 1.9690788984298706, "eos/L_plus": 2.149624824523926, "eos/L_minus": 2.147374391555786, "eos/grad_norm": 0.11101870983839035, "eos/embed_grad_frac": 0.18286988139152527, "eos/time_s": 0.5898988246917725} {"step": 62850, "timestamp": 1778262472.7407575, "train/loss": 2.1555867195129395, "train/z_loss": 0.0013769818237051369, "train/perplexity": 8.632953815814265, "train/grad_norm": 0.11083984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1913907.9982683624, "perf/iters_per_sec": 0.9126224509565174, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0957433700561523, "data/tokens_consumed": 131808100352, "data/tokens_consumed_B": 131.808100352, "train/loss_slope": 4.957204382948265e-06} {"step": 62850, "timestamp": 1778262474.1052263, "geo/rankme_last": 439.1180725097656, "geo/layer_0/stable_rank_q_proj": 19.35424041748047, "geo/layer_0/stable_rank_k_proj": 16.126953125, "geo/layer_0/stable_rank_o_proj": 47.19353485107422, "geo/layer_0/stable_rank_gate_proj": 132.02003479003906, "geo/layer_0/stable_rank_down_proj": 55.13390350341797, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06164563074707985, "geo/layer_0/attn_entropy_mean": 6.168159484863281, "geo/layer_0/attn_entropy_std": 0.4144984483718872, "geo/layer_7/stable_rank_q_proj": 43.35226058959961, "geo/layer_7/stable_rank_k_proj": 41.110111236572266, "geo/layer_7/stable_rank_o_proj": 91.38050079345703, "geo/layer_7/stable_rank_gate_proj": 81.80802154541016, "geo/layer_7/stable_rank_down_proj": 140.37991333007812, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.46030178666114807, "geo/layer_7/attn_entropy_mean": 4.676292419433594, "geo/layer_7/attn_entropy_std": 0.806101381778717, "geo/layer_14/stable_rank_q_proj": 51.50536346435547, "geo/layer_14/stable_rank_k_proj": 40.16861343383789, "geo/layer_14/stable_rank_o_proj": 43.82731246948242, "geo/layer_14/stable_rank_gate_proj": 72.06571197509766, "geo/layer_14/stable_rank_down_proj": 130.17359924316406, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3758000135421753, "geo/layer_14/attn_entropy_mean": 5.537640571594238, "geo/layer_14/attn_entropy_std": 0.39460453391075134, "geo/layer_21/stable_rank_q_proj": 40.34587097167969, "geo/layer_21/stable_rank_k_proj": 30.265892028808594, "geo/layer_21/stable_rank_o_proj": 70.20596313476562, "geo/layer_21/stable_rank_gate_proj": 66.06182098388672, "geo/layer_21/stable_rank_down_proj": 51.382240295410156, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14276841282844543, "geo/layer_21/attn_entropy_mean": 5.695570945739746, "geo/layer_21/attn_entropy_std": 0.29834386706352234, "geo/layer_27/stable_rank_q_proj": 43.32282638549805, "geo/layer_27/stable_rank_k_proj": 31.659934997558594, "geo/layer_27/stable_rank_o_proj": 116.06517028808594, "geo/layer_27/stable_rank_gate_proj": 80.50407409667969, "geo/layer_27/stable_rank_down_proj": 128.51649475097656, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08222271502017975, "geo/layer_27/attn_entropy_mean": 4.205480098724365, "geo/layer_27/attn_entropy_std": 0.7615428566932678, "attnres/final_alpha/block_0": 0.23908671736717224, "attnres/block_norm/0": 1.763786792755127, "attnres/final_alpha/block_1": 0.004538421053439379, "attnres/block_norm/1": 46482.203125, "attnres/final_alpha/block_2": 0.010626650415360928, "attnres/block_norm/2": 28537.537109375, "attnres/final_alpha/block_3": 0.01258764136582613, "attnres/block_norm/3": 57867.0703125, "attnres/final_alpha/block_4": 0.01484912820160389, "attnres/block_norm/4": 15074.5517578125, "attnres/final_alpha/block_5": 0.6080438494682312, "attnres/block_norm/5": 6648.63623046875, "attnres/final_alpha/block_6": 0.11026756465435028, "attnres/block_norm/6": 38221.53125, "geo/tier1_time_s": 1.3602485656738281, "geo/step": 62850.0, "geo/rankme_slope": 4.768210018382353e-05} {"step": 62860, "timestamp": 1778262484.466725, "train/loss": 2.159634327888489, "train/z_loss": 0.0013704515295103192, "train/perplexity": 8.667967444911348, "train/grad_norm": 0.0947265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1789063.0748116393, "perf/iters_per_sec": 0.8530917524393269, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1722068548202516, "data/tokens_consumed": 131829071872, "data/tokens_consumed_B": 131.829071872, "train/loss_slope": 4.043549479860294e-06} {"step": 62870, "timestamp": 1778262494.825857, "train/loss": 2.1005255699157717, "train/z_loss": 0.0013614286901429296, "train/perplexity": 8.170462933843371, "train/grad_norm": 0.283203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025513.6076203457, "perf/iters_per_sec": 0.9658401525594452, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035368013381958, "data/tokens_consumed": 131850043392, "data/tokens_consumed_B": 131.850043392, "train/loss_slope": 1.0690570271055072e-06} {"step": 62880, "timestamp": 1778262505.1900434, "train/loss": 2.1389338970184326, "train/z_loss": 0.0013637186144478618, "train/perplexity": 8.490381181498881, "train/grad_norm": 0.28515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024604.3966245535, "perf/iters_per_sec": 0.9654066069720046, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035832977294922, "data/tokens_consumed": 131871014912, "data/tokens_consumed_B": 131.871014912, "train/loss_slope": 1.9907517222860897e-06} {"step": 62890, "timestamp": 1778262515.5510178, "train/loss": 2.191720199584961, "train/z_loss": 0.0013678276096470653, "train/perplexity": 8.950596692208572, "train/grad_norm": 0.154296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024971.675146779, "perf/iters_per_sec": 0.9655817390188117, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03564510345459, "data/tokens_consumed": 131891986432, "data/tokens_consumed_B": 131.891986432, "train/loss_slope": 4.256140881746469e-06} {"step": 62900, "timestamp": 1778262525.9026134, "grad/layer_0/attn": 0.0026564463041722775, "grad/layer_0/mlp": 0.0028713266365230083, "grad/layer_0/attn_mlp_ratio": 0.925163364511091, "grad/layer_4/attn": 0.0030664238147437572, "grad/layer_4/mlp": 0.0025129953864961863, "grad/layer_4/attn_mlp_ratio": 1.2202265508320524, "grad/layer_8/attn": 0.005077564623206854, "grad/layer_8/mlp": 0.0034544302616268396, "grad/layer_8/attn_mlp_ratio": 1.4698703090415204, "grad/layer_12/attn": 0.006127002649009228, "grad/layer_12/mlp": 0.00775549840182066, "grad/layer_12/attn_mlp_ratio": 0.7900204799950473, "grad/layer_16/attn": 0.006123953033238649, "grad/layer_16/mlp": 0.004690107889473438, "grad/layer_16/attn_mlp_ratio": 1.3057168506531958, "grad/layer_20/attn": 0.0032556040678173304, "grad/layer_20/mlp": 0.00550105283036828, "grad/layer_20/attn_mlp_ratio": 0.5918147142058815, "grad/layer_24/attn": 0.006792864762246609, "grad/layer_24/mlp": 0.008340631611645222, "grad/layer_24/attn_mlp_ratio": 0.814430488851628, "grad/layer_27/attn": 0.010388057678937912, "grad/layer_27/mlp": 0.007232890464365482, "grad/layer_27/attn_mlp_ratio": 1.4362249209350817} {"step": 62900, "timestamp": 1778262525.9168317, "train/loss": 2.1699437856674195, "train/z_loss": 0.0013653626083396375, "train/perplexity": 8.757791713486988, "train/grad_norm": 0.1103515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024754.7409876694, "perf/iters_per_sec": 0.9654782967508647, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0357560634613037, "data/tokens_consumed": 131912957952, "data/tokens_consumed_B": 131.912957952, "train/loss_slope": 5.308979701395045e-06} {"step": 62910, "timestamp": 1778262536.275453, "train/loss": 2.1560832023620606, "train/z_loss": 0.0013797839172184467, "train/perplexity": 8.637240993488101, "train/grad_norm": 0.2373046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025678.5950856167, "perf/iters_per_sec": 0.9659188247135242, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352836847305298, "data/tokens_consumed": 131933929472, "data/tokens_consumed_B": 131.933929472, "train/loss_slope": 5.069009119635341e-06} {"step": 62920, "timestamp": 1778262546.647181, "train/loss": 2.1553351879119873, "train/z_loss": 0.0013804555870592593, "train/perplexity": 8.630782628192623, "train/grad_norm": 0.1279296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022922.5849347124, "perf/iters_per_sec": 0.964604656665188, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0366941452026368, "data/tokens_consumed": 131954900992, "data/tokens_consumed_B": 131.954900992, "train/loss_slope": 8.613529006461375e-06} {"step": 62925, "timestamp": 1778262552.4054635, "eos/sharpness": 46.79665565490722, "eos/L0_probe": 1.9736802577972412, "eos/L_plus": 2.236335515975952, "eos/L_minus": 2.1789915561676025, "eos/grad_norm": 0.12401535362005234, "eos/embed_grad_frac": 0.12997618317604065, "eos/time_s": 0.5864934921264648} {"step": 62925, "timestamp": 1778262553.7802851, "geo/rankme_last": 438.7988586425781, "geo/layer_0/stable_rank_q_proj": 19.40270233154297, "geo/layer_0/stable_rank_k_proj": 16.129846572875977, "geo/layer_0/stable_rank_o_proj": 47.001522064208984, "geo/layer_0/stable_rank_gate_proj": 131.81044006347656, "geo/layer_0/stable_rank_down_proj": 55.0755500793457, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05998879298567772, "geo/layer_0/attn_entropy_mean": 6.165996551513672, "geo/layer_0/attn_entropy_std": 0.41437390446662903, "geo/layer_7/stable_rank_q_proj": 43.375362396240234, "geo/layer_7/stable_rank_k_proj": 41.07286834716797, "geo/layer_7/stable_rank_o_proj": 91.49003601074219, "geo/layer_7/stable_rank_gate_proj": 81.61930847167969, "geo/layer_7/stable_rank_down_proj": 140.37979125976562, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.44523170590400696, "geo/layer_7/attn_entropy_mean": 4.629087924957275, "geo/layer_7/attn_entropy_std": 0.7900009155273438, "geo/layer_14/stable_rank_q_proj": 51.426082611083984, "geo/layer_14/stable_rank_k_proj": 40.217220306396484, "geo/layer_14/stable_rank_o_proj": 43.81827163696289, "geo/layer_14/stable_rank_gate_proj": 71.99806213378906, "geo/layer_14/stable_rank_down_proj": 129.73582458496094, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.40189704298973083, "geo/layer_14/attn_entropy_mean": 5.540733337402344, "geo/layer_14/attn_entropy_std": 0.40782997012138367, "geo/layer_21/stable_rank_q_proj": 40.40105056762695, "geo/layer_21/stable_rank_k_proj": 30.2333927154541, "geo/layer_21/stable_rank_o_proj": 70.24978637695312, "geo/layer_21/stable_rank_gate_proj": 66.10317993164062, "geo/layer_21/stable_rank_down_proj": 51.36815643310547, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1437653750181198, "geo/layer_21/attn_entropy_mean": 5.706680774688721, "geo/layer_21/attn_entropy_std": 0.30029386281967163, "geo/layer_27/stable_rank_q_proj": 43.370819091796875, "geo/layer_27/stable_rank_k_proj": 31.73708152770996, "geo/layer_27/stable_rank_o_proj": 116.22065734863281, "geo/layer_27/stable_rank_gate_proj": 80.54986572265625, "geo/layer_27/stable_rank_down_proj": 128.43475341796875, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08985905349254608, "geo/layer_27/attn_entropy_mean": 4.2173075675964355, "geo/layer_27/attn_entropy_std": 0.7249416708946228, "attnres/final_alpha/block_0": 0.23618125915527344, "attnres/block_norm/0": 1.7640115022659302, "attnres/final_alpha/block_1": 0.004446766339242458, "attnres/block_norm/1": 46511.484375, "attnres/final_alpha/block_2": 0.010277384892106056, "attnres/block_norm/2": 28480.474609375, "attnres/final_alpha/block_3": 0.012369031086564064, "attnres/block_norm/3": 57884.7890625, "attnres/final_alpha/block_4": 0.014534905552864075, "attnres/block_norm/4": 15158.94140625, "attnres/final_alpha/block_5": 0.6143757104873657, "attnres/block_norm/5": 6665.3642578125, "attnres/final_alpha/block_6": 0.10781493782997131, "attnres/block_norm/6": 38704.734375, "geo/tier1_time_s": 1.356905460357666, "geo/step": 62925.0, "geo/rankme_slope": 5.078464979741896e-05} {"step": 62930, "timestamp": 1778262559.3552642, "train/loss": 2.162746238708496, "train/z_loss": 0.001376953034196049, "train/perplexity": 8.694983400410905, "train/grad_norm": 0.09716796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1651170.932154224, "perf/iters_per_sec": 0.7873396549960251, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2700998783111572, "data/tokens_consumed": 131975872512, "data/tokens_consumed_B": 131.975872512, "train/loss_slope": 1.0017313050179445e-05} {"step": 62940, "timestamp": 1778262570.2107506, "train/loss": 2.1193937063217163, "train/z_loss": 0.0013879891484975815, "train/perplexity": 8.32608790261643, "train/grad_norm": 0.1826171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1932699.27629527, "perf/iters_per_sec": 0.9215828305698729, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.085089659690857, "data/tokens_consumed": 131996844032, "data/tokens_consumed_B": 131.996844032, "train/loss_slope": 7.400977092929539e-06} {"step": 62950, "timestamp": 1778262580.5665529, "grad/layer_0/attn": 0.002796425484120846, "grad/layer_0/mlp": 0.0029211007058620453, "grad/layer_0/attn_mlp_ratio": 0.9573190622209949, "grad/layer_4/attn": 0.0028908057138323784, "grad/layer_4/mlp": 0.002512178383767605, "grad/layer_4/attn_mlp_ratio": 1.1507166917124982, "grad/layer_8/attn": 0.005828202236443758, "grad/layer_8/mlp": 0.003621340962126851, "grad/layer_8/attn_mlp_ratio": 1.6094043992146916, "grad/layer_12/attn": 0.0052079977467656136, "grad/layer_12/mlp": 0.006846966687589884, "grad/layer_12/attn_mlp_ratio": 0.7606284517408645, "grad/layer_16/attn": 0.004013577941805124, "grad/layer_16/mlp": 0.00486292177811265, "grad/layer_16/attn_mlp_ratio": 0.8253428787062554, "grad/layer_20/attn": 0.003694795770570636, "grad/layer_20/mlp": 0.006597353611141443, "grad/layer_20/attn_mlp_ratio": 0.5600420914723669, "grad/layer_24/attn": 0.011005845852196217, "grad/layer_24/mlp": 0.010319036431610584, "grad/layer_24/attn_mlp_ratio": 1.0665575045191198, "grad/layer_27/attn": 0.009098866023123264, "grad/layer_27/mlp": 0.010601319372653961, "grad/layer_27/attn_mlp_ratio": 0.8582767500398166} {"step": 62950, "timestamp": 1778262580.5809283, "train/loss": 2.1790534257888794, "train/z_loss": 0.0013743723626248538, "train/perplexity": 8.837936535259836, "train/grad_norm": 0.1923828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023770.6844059627, "perf/iters_per_sec": 0.9650090620069326, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036259698867798, "data/tokens_consumed": 132017815552, "data/tokens_consumed_B": 132.017815552, "train/loss_slope": 7.035109448139688e-06} {"step": 62960, "timestamp": 1778262590.9429848, "train/loss": 2.1604068517684936, "train/z_loss": 0.0013689154060557485, "train/perplexity": 8.674666243911593, "train/grad_norm": 0.1484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025327.801504133, "perf/iters_per_sec": 0.9657515532990136, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035462999343872, "data/tokens_consumed": 132038787072, "data/tokens_consumed_B": 132.038787072, "train/loss_slope": 1.0720121771088297e-05} {"step": 62970, "timestamp": 1778262601.3095648, "train/loss": 2.121750545501709, "train/z_loss": 0.0013851135154254735, "train/perplexity": 8.345734295401376, "train/grad_norm": 0.1376953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023922.2554722922, "perf/iters_per_sec": 0.965081336723467, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0361820936203003, "data/tokens_consumed": 132059758592, "data/tokens_consumed_B": 132.059758592, "train/loss_slope": 8.670801276599898e-06} {"step": 62980, "timestamp": 1778262611.674427, "train/loss": 2.0955750942230225, "train/z_loss": 0.0013914029696024954, "train/perplexity": 8.130115208308942, "train/grad_norm": 0.2333984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024649.6933199144, "perf/iters_per_sec": 0.9654282061194965, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0358098030090332, "data/tokens_consumed": 132080730112, "data/tokens_consumed_B": 132.080730112, "train/loss_slope": 6.128162640978827e-06} {"step": 62990, "timestamp": 1778262622.5877852, "train/loss": 2.1447479486465455, "train/z_loss": 0.0013791945413686336, "train/perplexity": 8.53988847555362, "train/grad_norm": 0.1201171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1922854.9026560627, "perf/iters_per_sec": 0.9168886674194635, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0906449556350708, "data/tokens_consumed": 132101701632, "data/tokens_consumed_B": 132.101701632, "train/loss_slope": 8.97075608439757e-06} {"step": 63000, "timestamp": 1778262632.9453616, "grad/layer_0/attn": 0.0027439636178314686, "grad/layer_0/mlp": 0.0032443844247609377, "grad/layer_0/attn_mlp_ratio": 0.8457578307656543, "grad/layer_4/attn": 0.002880257088690996, "grad/layer_4/mlp": 0.0027379656676203012, "grad/layer_4/attn_mlp_ratio": 1.051969722468213, "grad/layer_8/attn": 0.006393081974238157, "grad/layer_8/mlp": 0.003931427374482155, "grad/layer_8/attn_mlp_ratio": 1.626147758221141, "grad/layer_12/attn": 0.006468628998845816, "grad/layer_12/mlp": 0.006847649812698364, "grad/layer_12/attn_mlp_ratio": 0.9446494901630869, "grad/layer_16/attn": 0.003986085299402475, "grad/layer_16/mlp": 0.005281582474708557, "grad/layer_16/attn_mlp_ratio": 0.7547141870867051, "grad/layer_20/attn": 0.004122127778828144, "grad/layer_20/mlp": 0.007354175206273794, "grad/layer_20/attn_mlp_ratio": 0.5605153001059937, "grad/layer_24/attn": 0.021227896213531494, "grad/layer_24/mlp": 0.013954033143818378, "grad/layer_24/attn_mlp_ratio": 1.5212731575607668, "grad/layer_27/attn": 0.007580089848488569, "grad/layer_27/mlp": 0.01348567008972168, "grad/layer_27/attn_mlp_ratio": 0.5620847716019228} {"step": 63000, "timestamp": 1778262633.5357444, "eos/sharpness": 79.62126731872557, "eos/L0_probe": 1.9769220352172852, "eos/L_plus": 2.434816598892212, "eos/L_minus": 2.3152401447296143, "eos/grad_norm": 0.25696077942848206, "eos/embed_grad_frac": 0.03741587698459625, "eos/time_s": 0.5872397422790527} {"step": 63000, "timestamp": 1778262633.5562634, "train/loss": 2.1060431003570557, "train/z_loss": 0.001386482734233141, "train/perplexity": 8.215668308131862, "train/grad_norm": 0.255859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1913210.0081854332, "perf/iters_per_sec": 0.9122896233489195, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.096143126487732, "data/tokens_consumed": 132122673152, "data/tokens_consumed_B": 132.122673152, "train/loss_slope": 7.950895153793106e-06} {"step": 63000, "timestamp": 1778262634.9227452, "geo/rankme_last": 438.22906494140625, "geo/layer_0/stable_rank_q_proj": 19.42509651184082, "geo/layer_0/stable_rank_k_proj": 16.127452850341797, "geo/layer_0/stable_rank_o_proj": 46.942745208740234, "geo/layer_0/stable_rank_gate_proj": 131.50552368164062, "geo/layer_0/stable_rank_down_proj": 55.04146957397461, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06226852536201477, "geo/layer_0/attn_entropy_mean": 6.164134979248047, "geo/layer_0/attn_entropy_std": 0.41278380155563354, "geo/layer_7/stable_rank_q_proj": 43.40873336791992, "geo/layer_7/stable_rank_k_proj": 41.096580505371094, "geo/layer_7/stable_rank_o_proj": 91.39373016357422, "geo/layer_7/stable_rank_gate_proj": 81.6048812866211, "geo/layer_7/stable_rank_down_proj": 140.3850555419922, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4494173228740692, "geo/layer_7/attn_entropy_mean": 4.637103080749512, "geo/layer_7/attn_entropy_std": 0.7916527986526489, "geo/layer_14/stable_rank_q_proj": 51.340675354003906, "geo/layer_14/stable_rank_k_proj": 40.15225601196289, "geo/layer_14/stable_rank_o_proj": 43.72391891479492, "geo/layer_14/stable_rank_gate_proj": 71.91127014160156, "geo/layer_14/stable_rank_down_proj": 129.68313598632812, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3918169438838959, "geo/layer_14/attn_entropy_mean": 5.531952857971191, "geo/layer_14/attn_entropy_std": 0.38588154315948486, "geo/layer_21/stable_rank_q_proj": 40.37014389038086, "geo/layer_21/stable_rank_k_proj": 30.214284896850586, "geo/layer_21/stable_rank_o_proj": 70.18921661376953, "geo/layer_21/stable_rank_gate_proj": 66.16777038574219, "geo/layer_21/stable_rank_down_proj": 51.363101959228516, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14555172622203827, "geo/layer_21/attn_entropy_mean": 5.720473289489746, "geo/layer_21/attn_entropy_std": 0.2889345586299896, "geo/layer_27/stable_rank_q_proj": 43.317054748535156, "geo/layer_27/stable_rank_k_proj": 31.71661376953125, "geo/layer_27/stable_rank_o_proj": 116.34331512451172, "geo/layer_27/stable_rank_gate_proj": 80.54379272460938, "geo/layer_27/stable_rank_down_proj": 128.25877380371094, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09497202187776566, "geo/layer_27/attn_entropy_mean": 4.200749397277832, "geo/layer_27/attn_entropy_std": 0.7293123006820679, "attnres/final_alpha/block_0": 0.23299361765384674, "attnres/block_norm/0": 1.7641165256500244, "attnres/final_alpha/block_1": 0.004339041654020548, "attnres/block_norm/1": 46660.62890625, "attnres/final_alpha/block_2": 0.010027174837887287, "attnres/block_norm/2": 28549.087890625, "attnres/final_alpha/block_3": 0.012181662023067474, "attnres/block_norm/3": 58080.04296875, "attnres/final_alpha/block_4": 0.014245720580220222, "attnres/block_norm/4": 15186.734375, "attnres/final_alpha/block_5": 0.6205928325653076, "attnres/block_norm/5": 6617.7841796875, "attnres/final_alpha/block_6": 0.10561993718147278, "attnres/block_norm/6": 38815.51171875, "geo/tier1_time_s": 1.3628787994384766, "geo/step": 63000.0, "geo/rankme_slope": 3.789818661839736e-05} {"step": 63000, "timestamp": 1778262641.957837, "geo/ww_alpha_mean": 7.683867966272371, "geo/ww_alpha_std": 4.675807467111498, "geo/ww_alpha_min": 1.347666385936384, "geo/ww_alpha_max": 33.67547316549126, "geo/ww_alpha_healthy_frac": 0.16751269035532995, "geo/ww_alpha_by_type/q_proj": 3.959114016107545, "geo/ww_alpha_by_type/k_proj": 4.487104605402348, "geo/ww_alpha_by_type/v_proj": 8.015898516646194, "geo/ww_alpha_by_type/o_proj": 9.21579499142206, "geo/ww_alpha_by_type/gate_proj": 8.281454216628061, "geo/ww_alpha_by_type/up_proj": 11.880198494160767, "geo/ww_alpha_by_type/down_proj": 8.046889814064482, "geo/twonn_id/layer_0": 0.7042528986930847, "geo/twonn_id/layer_7": 3.2415971755981445, "geo/twonn_id/layer_14": 4.725605487823486, "geo/twonn_id/layer_21": 6.764214038848877, "geo/twonn_id/layer_27": 5.959559917449951, "geo/tier2_time_s": 7.028726100921631} {"step": 63000, "timestamp": 1778262642.7612126, "eoc/jacobian_sigma/layer_0/attn": 1293.5108642578125, "eoc/jacobian_sigma/layer_0/mlp": 9402.2607421875, "eoc/jacobian_sigma/layer_0": 9402.2607421875, "eoc/jacobian_sigma/layer_7/attn": 1.1584761142730713, "eoc/jacobian_sigma/layer_7/mlp": 1.7143558263778687, "eoc/jacobian_sigma/layer_7": 1.7143558263778687, "eoc/jacobian_sigma/layer_14/attn": 1.5006709098815918, "eoc/jacobian_sigma/layer_14/mlp": 6.738681793212891, "eoc/jacobian_sigma/layer_14": 6.738681793212891, "eoc/jacobian_sigma/layer_21/attn": 1.09752357006073, "eoc/jacobian_sigma/layer_21/mlp": 4.418468952178955, "eoc/jacobian_sigma/layer_21": 4.418468952178955, "eoc/jacobian_sigma/layer_27/attn": 2.806729316711426, "eoc/jacobian_sigma/layer_27/mlp": 32.18584060668945, "eoc/jacobian_sigma/layer_27": 32.18584060668945, "eoc/layer0_sigma": 9402.2607421875, "eoc/sigma_max": 32.18584060668945, "eoc/sigma_min": 1.7143558263778687, "eoc/sigma_mean": 11.264336794614792, "eoc/time_s": 0.7958354949951172} {"step": 63010, "timestamp": 1778262653.5816758, "train/loss": 2.124014675617218, "train/z_loss": 0.0013960619340650736, "train/perplexity": 8.364651531216023, "train/grad_norm": 0.1962890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1047425.226818332, "perf/iters_per_sec": 0.49945126858631705, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 2.002197337150574, "data/tokens_consumed": 132143644672, "data/tokens_consumed_B": 132.143644672, "train/loss_slope": 5.168480026160488e-06} {"step": 63020, "timestamp": 1778262663.9480362, "train/loss": 2.1107234716415406, "train/z_loss": 0.0013958163443021476, "train/perplexity": 8.254210812420332, "train/grad_norm": 0.09375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024079.298659798, "perf/iters_per_sec": 0.9651562207507124, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0361016988754272, "data/tokens_consumed": 132164616192, "data/tokens_consumed_B": 132.164616192, "train/loss_slope": 2.3429624461355776e-06} {"step": 63030, "timestamp": 1778262674.3137436, "train/loss": 2.1501522064208984, "train/z_loss": 0.0013864212203770875, "train/perplexity": 8.586165167195318, "train/grad_norm": 0.12060546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024561.1988866136, "perf/iters_per_sec": 0.9653860086854046, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0358550786972045, "data/tokens_consumed": 132185587712, "data/tokens_consumed_B": 132.185587712, "train/loss_slope": 3.4924825962478274e-06} {"step": 63040, "timestamp": 1778262684.6817799, "train/loss": 2.1366379141807554, "train/z_loss": 0.0013764425180852412, "train/perplexity": 8.470909773583708, "train/grad_norm": 0.10107421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023818.0857589135, "perf/iters_per_sec": 0.9650316647333687, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0362354278564454, "data/tokens_consumed": 132206559232, "data/tokens_consumed_B": 132.206559232, "train/loss_slope": 4.92047336485668e-06} {"step": 63050, "timestamp": 1778262695.0360372, "grad/layer_0/attn": 0.003678038250654936, "grad/layer_0/mlp": 0.003173248842358589, "grad/layer_0/attn_mlp_ratio": 1.15907649146529, "grad/layer_4/attn": 0.0041181729175150394, "grad/layer_4/mlp": 0.002586548449471593, "grad/layer_4/attn_mlp_ratio": 1.5921498625480441, "grad/layer_8/attn": 0.004815318621695042, "grad/layer_8/mlp": 0.003667693119496107, "grad/layer_8/attn_mlp_ratio": 1.3129011434485798, "grad/layer_12/attn": 0.006797836162149906, "grad/layer_12/mlp": 0.007375901564955711, "grad/layer_12/attn_mlp_ratio": 0.9216278186635402, "grad/layer_16/attn": 0.004062828607857227, "grad/layer_16/mlp": 0.004982186481356621, "grad/layer_16/attn_mlp_ratio": 0.8154709867872798, "grad/layer_20/attn": 0.0033017946407198906, "grad/layer_20/mlp": 0.007031481247395277, "grad/layer_20/attn_mlp_ratio": 0.46957312088198866, "grad/layer_24/attn": 0.013840507715940475, "grad/layer_24/mlp": 0.011193112470209599, "grad/layer_24/attn_mlp_ratio": 1.2365200143503339, "grad/layer_27/attn": 0.008963141590356827, "grad/layer_27/mlp": 0.010880137793719769, "grad/layer_27/attn_mlp_ratio": 0.8238077198939299} {"step": 63050, "timestamp": 1778262695.050486, "train/loss": 2.1431183099746702, "train/z_loss": 0.0013758113607764244, "train/perplexity": 8.525982876668657, "train/grad_norm": 0.2119140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023551.0281462942, "perf/iters_per_sec": 0.9649043217402907, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036372184753418, "data/tokens_consumed": 132227530752, "data/tokens_consumed_B": 132.227530752, "train/loss_slope": 6.184731522659447e-06} {"step": 63060, "timestamp": 1778262705.4216738, "train/loss": 2.168527936935425, "train/z_loss": 0.0013714696513488888, "train/perplexity": 8.745400779108671, "train/grad_norm": 0.162109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023547.2108880596, "perf/iters_per_sec": 0.964902501529722, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0363741397857666, "data/tokens_consumed": 132248502272, "data/tokens_consumed_B": 132.248502272, "train/loss_slope": 7.74449037425414e-06} {"step": 63070, "timestamp": 1778262715.7840188, "train/loss": 2.1596948981285093, "train/z_loss": 0.0013726239092648029, "train/perplexity": 8.668492481680618, "train/grad_norm": 0.1708984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024817.7095663592, "perf/iters_per_sec": 0.9655083225089832, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0357238531112671, "data/tokens_consumed": 132269473792, "data/tokens_consumed_B": 132.269473792, "train/loss_slope": 6.281823400903187e-06} {"step": 63075, "timestamp": 1778262721.5411453, "eos/sharpness": 55.61885833740233, "eos/L0_probe": 1.9750654697418213, "eos/L_plus": 2.261381149291992, "eos/L_minus": 2.244938373565674, "eos/grad_norm": 0.18992172181606293, "eos/embed_grad_frac": 0.06923320144414902, "eos/time_s": 0.5977070331573486} {"step": 63075, "timestamp": 1778262722.9151263, "geo/rankme_last": 438.8003234863281, "geo/layer_0/stable_rank_q_proj": 19.415754318237305, "geo/layer_0/stable_rank_k_proj": 16.107479095458984, "geo/layer_0/stable_rank_o_proj": 46.95515441894531, "geo/layer_0/stable_rank_gate_proj": 131.3629913330078, "geo/layer_0/stable_rank_down_proj": 55.04594421386719, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06192784756422043, "geo/layer_0/attn_entropy_mean": 6.160901069641113, "geo/layer_0/attn_entropy_std": 0.42082324624061584, "geo/layer_7/stable_rank_q_proj": 43.442867279052734, "geo/layer_7/stable_rank_k_proj": 41.17434310913086, "geo/layer_7/stable_rank_o_proj": 91.24718475341797, "geo/layer_7/stable_rank_gate_proj": 81.60563659667969, "geo/layer_7/stable_rank_down_proj": 140.5548858642578, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.43629130721092224, "geo/layer_7/attn_entropy_mean": 4.642923355102539, "geo/layer_7/attn_entropy_std": 0.7944241166114807, "geo/layer_14/stable_rank_q_proj": 51.37952423095703, "geo/layer_14/stable_rank_k_proj": 40.076332092285156, "geo/layer_14/stable_rank_o_proj": 43.71894836425781, "geo/layer_14/stable_rank_gate_proj": 71.9973373413086, "geo/layer_14/stable_rank_down_proj": 129.6112823486328, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39875656366348267, "geo/layer_14/attn_entropy_mean": 5.5179877281188965, "geo/layer_14/attn_entropy_std": 0.4067991077899933, "geo/layer_21/stable_rank_q_proj": 40.25824737548828, "geo/layer_21/stable_rank_k_proj": 30.23238754272461, "geo/layer_21/stable_rank_o_proj": 70.17537689208984, "geo/layer_21/stable_rank_gate_proj": 66.18415832519531, "geo/layer_21/stable_rank_down_proj": 51.38315963745117, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14517362415790558, "geo/layer_21/attn_entropy_mean": 5.705388069152832, "geo/layer_21/attn_entropy_std": 0.298696905374527, "geo/layer_27/stable_rank_q_proj": 43.327850341796875, "geo/layer_27/stable_rank_k_proj": 31.701305389404297, "geo/layer_27/stable_rank_o_proj": 116.37040710449219, "geo/layer_27/stable_rank_gate_proj": 80.59781646728516, "geo/layer_27/stable_rank_down_proj": 128.1505584716797, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0931859016418457, "geo/layer_27/attn_entropy_mean": 4.196033477783203, "geo/layer_27/attn_entropy_std": 0.7424110770225525, "attnres/final_alpha/block_0": 0.2358381450176239, "attnres/block_norm/0": 1.7640432119369507, "attnres/final_alpha/block_1": 0.004444766789674759, "attnres/block_norm/1": 46395.890625, "attnres/final_alpha/block_2": 0.010221032425761223, "attnres/block_norm/2": 28421.18359375, "attnres/final_alpha/block_3": 0.012543849647045135, "attnres/block_norm/3": 58110.7265625, "attnres/final_alpha/block_4": 0.01442188210785389, "attnres/block_norm/4": 15092.08203125, "attnres/final_alpha/block_5": 0.6138900518417358, "attnres/block_norm/5": 6681.01708984375, "attnres/final_alpha/block_6": 0.10864025354385376, "attnres/block_norm/6": 38520.625, "geo/tier1_time_s": 1.3562662601470947, "geo/step": 63075.0, "geo/rankme_slope": 3.616499334108644e-05} {"step": 63080, "timestamp": 1778262728.090284, "train/loss": 2.1241267681121827, "train/z_loss": 0.0013933297945186497, "train/perplexity": 8.365589198427413, "train/grad_norm": 0.09619140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1704852.6272067104, "perf/iters_per_sec": 0.8129370819123795, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2301074981689453, "data/tokens_consumed": 132290445312, "data/tokens_consumed_B": 132.290445312, "train/loss_slope": 4.682585036400548e-06} {"step": 63090, "timestamp": 1778262738.4271712, "train/loss": 2.116440451145172, "train/z_loss": 0.0013768093311227858, "train/perplexity": 8.301535113588518, "train/grad_norm": 0.255859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029845.7606321163, "perf/iters_per_sec": 0.9679058840904791, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033158302307129, "data/tokens_consumed": 132311416832, "data/tokens_consumed_B": 132.311416832, "train/loss_slope": 2.4561727365287494e-06} {"step": 63100, "timestamp": 1778262748.7603297, "grad/layer_0/attn": 0.0025897554587572813, "grad/layer_0/mlp": 0.002785477787256241, "grad/layer_0/attn_mlp_ratio": 0.929734703910448, "grad/layer_4/attn": 0.0046322885900735855, "grad/layer_4/mlp": 0.0026235857512801886, "grad/layer_4/attn_mlp_ratio": 1.7656325550822904, "grad/layer_8/attn": 0.0040627820417284966, "grad/layer_8/mlp": 0.0037336242385208607, "grad/layer_8/attn_mlp_ratio": 1.0881603700221316, "grad/layer_12/attn": 0.005739174783229828, "grad/layer_12/mlp": 0.007103987969458103, "grad/layer_12/attn_mlp_ratio": 0.8078806899893367, "grad/layer_16/attn": 0.004509123042225838, "grad/layer_16/mlp": 0.004732144996523857, "grad/layer_16/attn_mlp_ratio": 0.9528708334700373, "grad/layer_20/attn": 0.0034260163083672523, "grad/layer_20/mlp": 0.006266893818974495, "grad/layer_20/attn_mlp_ratio": 0.5466849052597145, "grad/layer_24/attn": 0.014290533028542995, "grad/layer_24/mlp": 0.011312575079500675, "grad/layer_24/attn_mlp_ratio": 1.2632431432976134, "grad/layer_27/attn": 0.00469233887270093, "grad/layer_27/mlp": 0.01147894375026226, "grad/layer_27/attn_mlp_ratio": 0.4087779271255625} {"step": 63100, "timestamp": 1778262748.7743769, "train/loss": 2.1503785610198975, "train/z_loss": 0.0013874077470973135, "train/perplexity": 8.588108905147394, "train/grad_norm": 0.228515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027712.7545674692, "perf/iters_per_sec": 0.9668887875401827, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342451095581056, "data/tokens_consumed": 132332388352, "data/tokens_consumed_B": 132.332388352, "train/loss_slope": 4.2406469002311957e-07} {"step": 63110, "timestamp": 1778262759.120888, "train/loss": 2.1486074209213255, "train/z_loss": 0.0013658214826136827, "train/perplexity": 8.572911623324938, "train/grad_norm": 0.09326171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027712.1001559312, "perf/iters_per_sec": 0.9668884754924446, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342454433441162, "data/tokens_consumed": 132353359872, "data/tokens_consumed_B": 132.353359872, "train/loss_slope": 1.1057958136515224e-07} {"step": 63120, "timestamp": 1778262769.4636815, "train/loss": 2.238582158088684, "train/z_loss": 0.001365043269470334, "train/perplexity": 9.380022465788668, "train/grad_norm": 0.0869140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028848.607886436, "perf/iters_per_sec": 0.9674304046089344, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0336660861968994, "data/tokens_consumed": 132374331392, "data/tokens_consumed_B": 132.374331392, "train/loss_slope": 7.285570762123846e-06} {"step": 63130, "timestamp": 1778262779.8016453, "train/loss": 2.187359881401062, "train/z_loss": 0.0013722157105803489, "train/perplexity": 8.911654205210956, "train/grad_norm": 0.173828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029515.6244419669, "perf/iters_per_sec": 0.9677484628877482, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0333263635635377, "data/tokens_consumed": 132395302912, "data/tokens_consumed_B": 132.395302912, "train/loss_slope": 9.926860939801898e-06} {"step": 63140, "timestamp": 1778262790.1445315, "train/loss": 2.1600011825561523, "train/z_loss": 0.0013732455088756978, "train/perplexity": 8.671147912576703, "train/grad_norm": 0.103515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028590.887822924, "perf/iters_per_sec": 0.9673075141062374, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0337974071502685, "data/tokens_consumed": 132416274432, "data/tokens_consumed_B": 132.416274432, "train/loss_slope": 1.1601259012391133e-05} {"step": 63150, "timestamp": 1778262800.4766204, "grad/layer_0/attn": 0.0025649562012404203, "grad/layer_0/mlp": 0.0029028919525444508, "grad/layer_0/attn_mlp_ratio": 0.8835864905800331, "grad/layer_4/attn": 0.002033795230090618, "grad/layer_4/mlp": 0.0025638064835220575, "grad/layer_4/attn_mlp_ratio": 0.7932717090135051, "grad/layer_8/attn": 0.004400528036057949, "grad/layer_8/mlp": 0.0035766949877142906, "grad/layer_8/attn_mlp_ratio": 1.230333569996914, "grad/layer_12/attn": 0.004031000193208456, "grad/layer_12/mlp": 0.0063428194262087345, "grad/layer_12/attn_mlp_ratio": 0.6355218174744266, "grad/layer_16/attn": 0.007693406194448471, "grad/layer_16/mlp": 0.004411929287016392, "grad/layer_16/attn_mlp_ratio": 1.7437736463074287, "grad/layer_20/attn": 0.003557675052434206, "grad/layer_20/mlp": 0.005687477067112923, "grad/layer_20/attn_mlp_ratio": 0.6255277951718184, "grad/layer_24/attn": 0.007788579910993576, "grad/layer_24/mlp": 0.00974386464804411, "grad/layer_24/attn_mlp_ratio": 0.7993316935723047, "grad/layer_27/attn": 0.0051099564880132675, "grad/layer_27/mlp": 0.009209021925926208, "grad/layer_27/attn_mlp_ratio": 0.5548859014157183} {"step": 63150, "timestamp": 1778262801.0617065, "eos/sharpness": 47.618269920349114, "eos/L0_probe": 1.9720708131790161, "eos/L_plus": 2.2139315605163574, "eos/L_minus": 2.206392765045166, "eos/grad_norm": 0.12923866510391235, "eos/embed_grad_frac": 0.14539587497711182, "eos/time_s": 0.5824313163757324} {"step": 63150, "timestamp": 1778262801.0796685, "train/loss": 2.1907907009124754, "train/z_loss": 0.001380667311605066, "train/perplexity": 8.9422809897811, "train/grad_norm": 0.12890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1919115.2994303107, "perf/iters_per_sec": 0.9151054856444887, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0927701950073243, "data/tokens_consumed": 132437245952, "data/tokens_consumed_B": 132.437245952, "train/loss_slope": 1.4789382208942334e-05} {"step": 63150, "timestamp": 1778262802.438727, "geo/rankme_last": 438.897216796875, "geo/layer_0/stable_rank_q_proj": 19.397336959838867, "geo/layer_0/stable_rank_k_proj": 16.0953311920166, "geo/layer_0/stable_rank_o_proj": 47.029212951660156, "geo/layer_0/stable_rank_gate_proj": 131.36581420898438, "geo/layer_0/stable_rank_down_proj": 55.04261016845703, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.07070168852806091, "geo/layer_0/attn_entropy_mean": 6.160071849822998, "geo/layer_0/attn_entropy_std": 0.42253759503364563, "geo/layer_7/stable_rank_q_proj": 43.444007873535156, "geo/layer_7/stable_rank_k_proj": 41.13730239868164, "geo/layer_7/stable_rank_o_proj": 91.1622543334961, "geo/layer_7/stable_rank_gate_proj": 81.7491683959961, "geo/layer_7/stable_rank_down_proj": 140.79493713378906, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4594388008117676, "geo/layer_7/attn_entropy_mean": 4.623742580413818, "geo/layer_7/attn_entropy_std": 0.8145479559898376, "geo/layer_14/stable_rank_q_proj": 51.38719940185547, "geo/layer_14/stable_rank_k_proj": 40.10934066772461, "geo/layer_14/stable_rank_o_proj": 43.70603561401367, "geo/layer_14/stable_rank_gate_proj": 72.0230941772461, "geo/layer_14/stable_rank_down_proj": 129.6007537841797, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3954751789569855, "geo/layer_14/attn_entropy_mean": 5.533298969268799, "geo/layer_14/attn_entropy_std": 0.415657639503479, "geo/layer_21/stable_rank_q_proj": 40.21627426147461, "geo/layer_21/stable_rank_k_proj": 30.24915313720703, "geo/layer_21/stable_rank_o_proj": 70.1457748413086, "geo/layer_21/stable_rank_gate_proj": 66.19344329833984, "geo/layer_21/stable_rank_down_proj": 51.34029006958008, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14491501450538635, "geo/layer_21/attn_entropy_mean": 5.7012739181518555, "geo/layer_21/attn_entropy_std": 0.294329434633255, "geo/layer_27/stable_rank_q_proj": 43.38230895996094, "geo/layer_27/stable_rank_k_proj": 31.811126708984375, "geo/layer_27/stable_rank_o_proj": 116.14762115478516, "geo/layer_27/stable_rank_gate_proj": 80.57793426513672, "geo/layer_27/stable_rank_down_proj": 128.107666015625, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09244289249181747, "geo/layer_27/attn_entropy_mean": 4.1893534660339355, "geo/layer_27/attn_entropy_std": 0.7652717232704163, "attnres/final_alpha/block_0": 0.23602911829948425, "attnres/block_norm/0": 1.764122486114502, "attnres/final_alpha/block_1": 0.004423239268362522, "attnres/block_norm/1": 46670.734375, "attnres/final_alpha/block_2": 0.010073043406009674, "attnres/block_norm/2": 28638.869140625, "attnres/final_alpha/block_3": 0.012291768565773964, "attnres/block_norm/3": 58448.0859375, "attnres/final_alpha/block_4": 0.014481861144304276, "attnres/block_norm/4": 15191.4853515625, "attnres/final_alpha/block_5": 0.6135624647140503, "attnres/block_norm/5": 6647.45361328125, "attnres/final_alpha/block_6": 0.10913847386837006, "attnres/block_norm/6": 38837.78515625, "geo/tier1_time_s": 1.3551948070526123, "geo/step": 63150.0, "geo/rankme_slope": 5.431537849514807e-05} {"step": 63160, "timestamp": 1778262812.777845, "train/loss": 2.1837132215499877, "train/z_loss": 0.001365478418301791, "train/perplexity": 8.87921561581265, "train/grad_norm": 0.0927734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1793322.1398420914, "perf/iters_per_sec": 0.85512263290505, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1694229125976563, "data/tokens_consumed": 132458217472, "data/tokens_consumed_B": 132.458217472, "train/loss_slope": 1.6967270619655356e-05} {"step": 63170, "timestamp": 1778262823.1195934, "train/loss": 2.1382878541946413, "train/z_loss": 0.0013753502862527966, "train/perplexity": 8.48489780310467, "train/grad_norm": 0.1357421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029138.5042186994, "perf/iters_per_sec": 0.9675686379521844, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033518409729004, "data/tokens_consumed": 132479188992, "data/tokens_consumed_B": 132.479188992, "train/loss_slope": 1.4590660702384648e-05} {"step": 63180, "timestamp": 1778262833.4617631, "train/loss": 2.1618165254592894, "train/z_loss": 0.0013828078634105622, "train/perplexity": 8.686903315804782, "train/grad_norm": 0.09814453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029120.66997368, "perf/iters_per_sec": 0.9675601339214707, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0335274934768677, "data/tokens_consumed": 132500160512, "data/tokens_consumed_B": 132.500160512, "train/loss_slope": 1.258874283586577e-05} {"step": 63190, "timestamp": 1778262843.7999794, "train/loss": 2.1439491748809814, "train/z_loss": 0.001377972646150738, "train/perplexity": 8.533069760346468, "train/grad_norm": 0.314453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029552.9461312138, "perf/iters_per_sec": 0.967766259255988, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0333073616027832, "data/tokens_consumed": 132521132032, "data/tokens_consumed_B": 132.521132032, "train/loss_slope": 9.876988336841787e-06} {"step": 63200, "timestamp": 1778262854.1280558, "grad/layer_0/attn": 0.0032328194938600063, "grad/layer_0/mlp": 0.0032848757691681385, "grad/layer_0/attn_mlp_ratio": 0.9841527115844061, "grad/layer_4/attn": 0.0025932572316378355, "grad/layer_4/mlp": 0.0026108482852578163, "grad/layer_4/attn_mlp_ratio": 0.9932622845051786, "grad/layer_8/attn": 0.007274566683918238, "grad/layer_8/mlp": 0.00375933013856411, "grad/layer_8/attn_mlp_ratio": 1.935069872099554, "grad/layer_12/attn": 0.006174726411700249, "grad/layer_12/mlp": 0.006728666834533215, "grad/layer_12/attn_mlp_ratio": 0.9176745515534432, "grad/layer_16/attn": 0.004705324303358793, "grad/layer_16/mlp": 0.0046424116007983685, "grad/layer_16/attn_mlp_ratio": 1.0135517068745983, "grad/layer_20/attn": 0.003489915281534195, "grad/layer_20/mlp": 0.006504945922642946, "grad/layer_20/attn_mlp_ratio": 0.5365018048399193, "grad/layer_24/attn": 0.020526668056845665, "grad/layer_24/mlp": 0.013712650164961815, "grad/layer_24/attn_mlp_ratio": 1.4969147218240402, "grad/layer_27/attn": 0.011342081241309643, "grad/layer_27/mlp": 0.012397559359669685, "grad/layer_27/attn_mlp_ratio": 0.9148640325707973} {"step": 63200, "timestamp": 1778262854.1421454, "train/loss": 2.16065514087677, "train/z_loss": 0.0013605816988274456, "train/perplexity": 8.676820336465683, "train/grad_norm": 0.263671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029071.147651127, "perf/iters_per_sec": 0.9675365198379169, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0335527181625366, "data/tokens_consumed": 132542103552, "data/tokens_consumed_B": 132.542103552, "train/loss_slope": 1.0413159846258041e-05} {"step": 63210, "timestamp": 1778262864.489611, "train/loss": 2.1121534585952757, "train/z_loss": 0.0013766514253802597, "train/perplexity": 8.266022669583492, "train/grad_norm": 0.22265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028130.635826987, "perf/iters_per_sec": 0.9670880488524375, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340320110321044, "data/tokens_consumed": 132563075072, "data/tokens_consumed_B": 132.563075072, "train/loss_slope": 8.847830211869426e-06} {"step": 63220, "timestamp": 1778262874.8291292, "train/loss": 2.1359907746315003, "train/z_loss": 0.0013829078525304794, "train/perplexity": 8.46542968623293, "train/grad_norm": 0.1943359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029296.3579747966, "perf/iters_per_sec": 0.9676439084886534, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033438014984131, "data/tokens_consumed": 132584046592, "data/tokens_consumed_B": 132.584046592, "train/loss_slope": 9.545185935772671e-06} {"step": 63225, "timestamp": 1778262880.5752428, "eos/sharpness": 11.585593223571776, "eos/L0_probe": 1.9684865474700928, "eos/L_plus": 2.0324366092681885, "eos/L_minus": 2.020392417907715, "eos/grad_norm": 0.09515608847141266, "eos/embed_grad_frac": 0.2565363347530365, "eos/time_s": 0.581700325012207} {"step": 63225, "timestamp": 1778262881.9515123, "geo/rankme_last": 438.77142333984375, "geo/layer_0/stable_rank_q_proj": 19.381248474121094, "geo/layer_0/stable_rank_k_proj": 16.088979721069336, "geo/layer_0/stable_rank_o_proj": 47.09639358520508, "geo/layer_0/stable_rank_gate_proj": 131.31338500976562, "geo/layer_0/stable_rank_down_proj": 54.97124481201172, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06366341561079025, "geo/layer_0/attn_entropy_mean": 6.163966178894043, "geo/layer_0/attn_entropy_std": 0.4210048317909241, "geo/layer_7/stable_rank_q_proj": 43.39298629760742, "geo/layer_7/stable_rank_k_proj": 41.09758758544922, "geo/layer_7/stable_rank_o_proj": 91.08915710449219, "geo/layer_7/stable_rank_gate_proj": 81.6634750366211, "geo/layer_7/stable_rank_down_proj": 140.75111389160156, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.44212809205055237, "geo/layer_7/attn_entropy_mean": 4.650543212890625, "geo/layer_7/attn_entropy_std": 0.7753127813339233, "geo/layer_14/stable_rank_q_proj": 51.49005889892578, "geo/layer_14/stable_rank_k_proj": 40.06067657470703, "geo/layer_14/stable_rank_o_proj": 43.7542610168457, "geo/layer_14/stable_rank_gate_proj": 71.9311752319336, "geo/layer_14/stable_rank_down_proj": 129.607421875, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3935945928096771, "geo/layer_14/attn_entropy_mean": 5.553730487823486, "geo/layer_14/attn_entropy_std": 0.4003986120223999, "geo/layer_21/stable_rank_q_proj": 40.174259185791016, "geo/layer_21/stable_rank_k_proj": 30.299850463867188, "geo/layer_21/stable_rank_o_proj": 70.1053695678711, "geo/layer_21/stable_rank_gate_proj": 66.18876647949219, "geo/layer_21/stable_rank_down_proj": 51.441917419433594, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1474476456642151, "geo/layer_21/attn_entropy_mean": 5.706087589263916, "geo/layer_21/attn_entropy_std": 0.29238182306289673, "geo/layer_27/stable_rank_q_proj": 43.396240234375, "geo/layer_27/stable_rank_k_proj": 31.868959426879883, "geo/layer_27/stable_rank_o_proj": 115.78073120117188, "geo/layer_27/stable_rank_gate_proj": 80.55496978759766, "geo/layer_27/stable_rank_down_proj": 128.19815063476562, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09229974448680878, "geo/layer_27/attn_entropy_mean": 4.206599712371826, "geo/layer_27/attn_entropy_std": 0.7462457418441772, "attnres/final_alpha/block_0": 0.23688040673732758, "attnres/block_norm/0": 1.764307975769043, "attnres/final_alpha/block_1": 0.0044434890151023865, "attnres/block_norm/1": 46629.87109375, "attnres/final_alpha/block_2": 0.010175489820539951, "attnres/block_norm/2": 28472.76171875, "attnres/final_alpha/block_3": 0.012153206393122673, "attnres/block_norm/3": 58268.0625, "attnres/final_alpha/block_4": 0.014494603499770164, "attnres/block_norm/4": 15101.9267578125, "attnres/final_alpha/block_5": 0.6129947900772095, "attnres/block_norm/5": 6624.65576171875, "attnres/final_alpha/block_6": 0.10885801911354065, "attnres/block_norm/6": 38600.4140625, "geo/tier1_time_s": 1.3559327125549316, "geo/step": 63225.0, "geo/rankme_slope": 4.941128795268108e-05} {"step": 63230, "timestamp": 1778262887.123004, "train/loss": 2.1222668886184692, "train/z_loss": 0.0013722510426305234, "train/perplexity": 8.350044670579626, "train/grad_norm": 0.1650390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1706860.2895503785, "perf/iters_per_sec": 0.8138944099189656, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2286606073379516, "data/tokens_consumed": 132605018112, "data/tokens_consumed_B": 132.605018112, "train/loss_slope": 4.004198389657079e-06} {"step": 63240, "timestamp": 1778262897.4584255, "train/loss": 2.165117788314819, "train/z_loss": 0.0013807638664729894, "train/perplexity": 8.715628455581294, "train/grad_norm": 0.1298828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029947.4130578518, "perf/iters_per_sec": 0.9679543557442912, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0331065654754639, "data/tokens_consumed": 132625989632, "data/tokens_consumed_B": 132.625989632, "train/loss_slope": 4.19235777432875e-06} {"step": 63250, "timestamp": 1778262907.7924905, "grad/layer_0/attn": 0.0027083121240139008, "grad/layer_0/mlp": 0.0030236083548516035, "grad/layer_0/attn_mlp_ratio": 0.8957218384768747, "grad/layer_4/attn": 0.0019444703357294202, "grad/layer_4/mlp": 0.00255749118514359, "grad/layer_4/attn_mlp_ratio": 0.7603037973285794, "grad/layer_8/attn": 0.005867660511285067, "grad/layer_8/mlp": 0.0036180338356643915, "grad/layer_8/attn_mlp_ratio": 1.6217815022256192, "grad/layer_12/attn": 0.005801208317279816, "grad/layer_12/mlp": 0.006701461039483547, "grad/layer_12/attn_mlp_ratio": 0.8656632033722261, "grad/layer_16/attn": 0.004039186518639326, "grad/layer_16/mlp": 0.004534953739494085, "grad/layer_16/attn_mlp_ratio": 0.8906786400917223, "grad/layer_20/attn": 0.0032264944165945053, "grad/layer_20/mlp": 0.005851208232343197, "grad/layer_20/attn_mlp_ratio": 0.5514236091645726, "grad/layer_24/attn": 0.006880589295178652, "grad/layer_24/mlp": 0.008020416833460331, "grad/layer_24/attn_mlp_ratio": 0.8578842412136408, "grad/layer_27/attn": 0.004300630651414394, "grad/layer_27/mlp": 0.006734828930348158, "grad/layer_27/attn_mlp_ratio": 0.6385656758375161} {"step": 63250, "timestamp": 1778262907.806539, "train/loss": 2.145590841770172, "train/z_loss": 0.0013860693667083978, "train/perplexity": 8.547089723340969, "train/grad_norm": 0.095703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027558.3718933777, "perf/iters_per_sec": 0.9668151721445931, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343238592147828, "data/tokens_consumed": 132646961152, "data/tokens_consumed_B": 132.646961152, "train/loss_slope": 6.8587443008102305e-06} {"step": 63260, "timestamp": 1778262918.1444163, "train/loss": 2.1410795927047728, "train/z_loss": 0.001378150295931846, "train/perplexity": 8.508618514661885, "train/grad_norm": 0.0859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029937.153664502, "perf/iters_per_sec": 0.9679494636843214, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0331117868423463, "data/tokens_consumed": 132667932672, "data/tokens_consumed_B": 132.667932672, "train/loss_slope": 7.274724791223774e-06} {"step": 63270, "timestamp": 1778262928.484372, "train/loss": 2.136796164512634, "train/z_loss": 0.0013866923633031546, "train/perplexity": 8.472250403941494, "train/grad_norm": 0.21875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029167.058387119, "perf/iters_per_sec": 0.9675822536407085, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0335038661956788, "data/tokens_consumed": 132688904192, "data/tokens_consumed_B": 132.688904192, "train/loss_slope": 1.003840961078603e-05} {"step": 63280, "timestamp": 1778262938.827163, "train/loss": 2.108232295513153, "train/z_loss": 0.001374293549451977, "train/perplexity": 8.233673710875344, "train/grad_norm": 0.138671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028790.4888207815, "perf/iters_per_sec": 0.9674026912788303, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0336956977844238, "data/tokens_consumed": 132709875712, "data/tokens_consumed_B": 132.709875712, "train/loss_slope": 8.046328252477018e-06} {"step": 63290, "timestamp": 1778262949.1699903, "train/loss": 2.1336881160736083, "train/z_loss": 0.0013822478358633816, "train/perplexity": 8.445959117758385, "train/grad_norm": 0.1083984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028979.7389757626, "perf/iters_per_sec": 0.967492932784921, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033599281311035, "data/tokens_consumed": 132730847232, "data/tokens_consumed_B": 132.730847232, "train/loss_slope": 6.182228406556454e-06} {"step": 63300, "timestamp": 1778262959.4976916, "grad/layer_0/attn": 0.002936426317319274, "grad/layer_0/mlp": 0.003159136511385441, "grad/layer_0/attn_mlp_ratio": 0.9295027972948289, "grad/layer_4/attn": 0.002139297779649496, "grad/layer_4/mlp": 0.0025568094570189714, "grad/layer_4/attn_mlp_ratio": 0.8367059540186248, "grad/layer_8/attn": 0.0044225300662219524, "grad/layer_8/mlp": 0.003948044031858444, "grad/layer_8/attn_mlp_ratio": 1.120182530518005, "grad/layer_12/attn": 0.005118330474942923, "grad/layer_12/mlp": 0.0066120135597884655, "grad/layer_12/attn_mlp_ratio": 0.7740955688084091, "grad/layer_16/attn": 0.0043241530656814575, "grad/layer_16/mlp": 0.004466543439775705, "grad/layer_16/attn_mlp_ratio": 0.968120656873436, "grad/layer_20/attn": 0.004626693669706583, "grad/layer_20/mlp": 0.006776714231818914, "grad/layer_20/attn_mlp_ratio": 0.6827340571200894, "grad/layer_24/attn": 0.012944244779646397, "grad/layer_24/mlp": 0.011266746558248997, "grad/layer_24/attn_mlp_ratio": 1.1488893087135508, "grad/layer_27/attn": 0.007885048165917397, "grad/layer_27/mlp": 0.010144460946321487, "grad/layer_27/attn_mlp_ratio": 0.7772762032317742} {"step": 63300, "timestamp": 1778262960.083655, "eos/sharpness": 63.532900810241685, "eos/L0_probe": 1.9753259420394897, "eos/L_plus": 2.278459072113037, "eos/L_minus": 2.3075218200683594, "eos/grad_norm": 0.18128198385238647, "eos/embed_grad_frac": 0.06249300017952919, "eos/time_s": 0.5833339691162109} {"step": 63300, "timestamp": 1778262960.1014662, "train/loss": 2.1355568170547485, "train/z_loss": 0.0013790676603093743, "train/perplexity": 8.461756845866221, "train/grad_norm": 0.181640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1919354.2864281288, "perf/iters_per_sec": 0.9152194435253758, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.092634129524231, "data/tokens_consumed": 132751818752, "data/tokens_consumed_B": 132.751818752, "train/loss_slope": 6.122248622700834e-06} {"step": 63300, "timestamp": 1778262961.4640589, "geo/rankme_last": 438.6944580078125, "geo/layer_0/stable_rank_q_proj": 19.361108779907227, "geo/layer_0/stable_rank_k_proj": 16.100366592407227, "geo/layer_0/stable_rank_o_proj": 47.04426193237305, "geo/layer_0/stable_rank_gate_proj": 131.29612731933594, "geo/layer_0/stable_rank_down_proj": 55.03797149658203, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0657905787229538, "geo/layer_0/attn_entropy_mean": 6.166399002075195, "geo/layer_0/attn_entropy_std": 0.41837364435195923, "geo/layer_7/stable_rank_q_proj": 43.44245529174805, "geo/layer_7/stable_rank_k_proj": 41.05671691894531, "geo/layer_7/stable_rank_o_proj": 91.11066436767578, "geo/layer_7/stable_rank_gate_proj": 81.70214080810547, "geo/layer_7/stable_rank_down_proj": 141.0048828125, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.44507259130477905, "geo/layer_7/attn_entropy_mean": 4.669061183929443, "geo/layer_7/attn_entropy_std": 0.7879180312156677, "geo/layer_14/stable_rank_q_proj": 51.481773376464844, "geo/layer_14/stable_rank_k_proj": 40.006954193115234, "geo/layer_14/stable_rank_o_proj": 43.76487731933594, "geo/layer_14/stable_rank_gate_proj": 71.97335052490234, "geo/layer_14/stable_rank_down_proj": 129.6595001220703, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4001607596874237, "geo/layer_14/attn_entropy_mean": 5.546758651733398, "geo/layer_14/attn_entropy_std": 0.39546340703964233, "geo/layer_21/stable_rank_q_proj": 40.22821807861328, "geo/layer_21/stable_rank_k_proj": 30.374059677124023, "geo/layer_21/stable_rank_o_proj": 70.13468933105469, "geo/layer_21/stable_rank_gate_proj": 66.20931243896484, "geo/layer_21/stable_rank_down_proj": 51.42588424682617, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14197991788387299, "geo/layer_21/attn_entropy_mean": 5.711802005767822, "geo/layer_21/attn_entropy_std": 0.30559012293815613, "geo/layer_27/stable_rank_q_proj": 43.37789535522461, "geo/layer_27/stable_rank_k_proj": 31.810222625732422, "geo/layer_27/stable_rank_o_proj": 115.80599975585938, "geo/layer_27/stable_rank_gate_proj": 80.55452728271484, "geo/layer_27/stable_rank_down_proj": 128.4015350341797, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0956086590886116, "geo/layer_27/attn_entropy_mean": 4.214190483093262, "geo/layer_27/attn_entropy_std": 0.737765371799469, "attnres/final_alpha/block_0": 0.2385549545288086, "attnres/block_norm/0": 1.764587163925171, "attnres/final_alpha/block_1": 0.004509636666625738, "attnres/block_norm/1": 46612.38671875, "attnres/final_alpha/block_2": 0.01041427068412304, "attnres/block_norm/2": 28649.916015625, "attnres/final_alpha/block_3": 0.012617028318345547, "attnres/block_norm/3": 58156.578125, "attnres/final_alpha/block_4": 0.01487890724092722, "attnres/block_norm/4": 15140.431640625, "attnres/final_alpha/block_5": 0.6083474159240723, "attnres/block_norm/5": 6695.92724609375, "attnres/final_alpha/block_6": 0.1106778159737587, "attnres/block_norm/6": 38565.83984375, "geo/tier1_time_s": 1.3583838939666748, "geo/step": 63300.0, "geo/rankme_slope": 7.04519112332433e-05} {"step": 63310, "timestamp": 1778262971.8199477, "train/loss": 2.1596211433410644, "train/z_loss": 0.0013570924871601163, "train/perplexity": 8.667853162436879, "train/grad_norm": 0.111328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1790202.7534207602, "perf/iters_per_sec": 0.853635193548565, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1714606046676637, "data/tokens_consumed": 132772790272, "data/tokens_consumed_B": 132.772790272, "train/loss_slope": 5.7070994784872455e-06} {"step": 63320, "timestamp": 1778262982.164351, "train/loss": 2.0825860500335693, "train/z_loss": 0.0013729178463108838, "train/perplexity": 8.02519566005054, "train/grad_norm": 0.1357421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028604.82960776, "perf/iters_per_sec": 0.9673141620672989, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0337903022766113, "data/tokens_consumed": 132793761792, "data/tokens_consumed_B": 132.793761792, "train/loss_slope": -1.4085484953067204e-06} {"step": 63330, "timestamp": 1778262992.4997807, "train/loss": 2.1572646141052245, "train/z_loss": 0.001374046935234219, "train/perplexity": 8.64745116144495, "train/grad_norm": 0.1171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2030037.925075776, "perf/iters_per_sec": 0.9679975152377014, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0330605030059814, "data/tokens_consumed": 132814733312, "data/tokens_consumed_B": 132.814733312, "train/loss_slope": -1.9803420437945708e-06} {"step": 63340, "timestamp": 1778263002.8458462, "train/loss": 2.140070366859436, "train/z_loss": 0.001368936919607222, "train/perplexity": 8.500035728661993, "train/grad_norm": 0.3046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027885.7677487177, "perf/iters_per_sec": 0.9669712866538609, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03415687084198, "data/tokens_consumed": 132835704832, "data/tokens_consumed_B": 132.835704832, "train/loss_slope": -5.390644645748769e-07} {"step": 63350, "timestamp": 1778263013.174432, "grad/layer_0/attn": 0.003197809448465705, "grad/layer_0/mlp": 0.0033469644840806723, "grad/layer_0/attn_mlp_ratio": 0.9554356994620137, "grad/layer_4/attn": 0.0020552349742501974, "grad/layer_4/mlp": 0.0025951459538191557, "grad/layer_4/attn_mlp_ratio": 0.7919534899492858, "grad/layer_8/attn": 0.006593192927539349, "grad/layer_8/mlp": 0.0037761100102216005, "grad/layer_8/attn_mlp_ratio": 1.7460277203496133, "grad/layer_12/attn": 0.0049691880121827126, "grad/layer_12/mlp": 0.006829541176557541, "grad/layer_12/attn_mlp_ratio": 0.7276020176112696, "grad/layer_16/attn": 0.00472224410623312, "grad/layer_16/mlp": 0.004775946494191885, "grad/layer_16/attn_mlp_ratio": 0.988755634741797, "grad/layer_20/attn": 0.004754221998155117, "grad/layer_20/mlp": 0.007742062211036682, "grad/layer_20/attn_mlp_ratio": 0.6140769483833455, "grad/layer_24/attn": 0.01699122041463852, "grad/layer_24/mlp": 0.012225016951560974, "grad/layer_24/attn_mlp_ratio": 1.3898729419333582, "grad/layer_27/attn": 0.008495921269059181, "grad/layer_27/mlp": 0.012868317775428295, "grad/layer_27/attn_mlp_ratio": 0.660220034296939} {"step": 63350, "timestamp": 1778263013.1885617, "train/loss": 2.1668415784835817, "train/z_loss": 0.0013729611411690712, "train/perplexity": 8.730665326709476, "train/grad_norm": 0.251953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028929.8491332692, "perf/iters_per_sec": 0.9674691434542032, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0336246967315674, "data/tokens_consumed": 132856676352, "data/tokens_consumed_B": 132.856676352, "train/loss_slope": 8.90983449350432e-07} {"step": 63360, "timestamp": 1778263023.5275943, "train/loss": 2.146905851364136, "train/z_loss": 0.0013962360448203982, "train/perplexity": 8.55833662159646, "train/grad_norm": 0.1142578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029364.3381536764, "perf/iters_per_sec": 0.9676763239639646, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0334033966064453, "data/tokens_consumed": 132877647872, "data/tokens_consumed_B": 132.877647872, "train/loss_slope": -2.355781561470703e-06} {"step": 63370, "timestamp": 1778263033.8733451, "train/loss": 2.15047550201416, "train/z_loss": 0.0013806538889184594, "train/perplexity": 8.588941485318417, "train/grad_norm": 0.2041015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028479.1733264544, "perf/iters_per_sec": 0.9672542444832107, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033854341506958, "data/tokens_consumed": 132898619392, "data/tokens_consumed_B": 132.898619392, "train/loss_slope": -3.2072604042325975e-06} {"step": 63375, "timestamp": 1778263039.6176822, "eos/sharpness": 59.63308811187743, "eos/L0_probe": 1.9759689569473267, "eos/L_plus": 2.2983558177948, "eos/L_minus": 2.249912977218628, "eos/grad_norm": 0.13609173893928528, "eos/embed_grad_frac": 0.10832205414772034, "eos/time_s": 0.5825839042663574} {"step": 63375, "timestamp": 1778263041.0018322, "geo/rankme_last": 439.5223388671875, "geo/layer_0/stable_rank_q_proj": 19.36211585998535, "geo/layer_0/stable_rank_k_proj": 16.081012725830078, "geo/layer_0/stable_rank_o_proj": 47.13193130493164, "geo/layer_0/stable_rank_gate_proj": 131.1732940673828, "geo/layer_0/stable_rank_down_proj": 54.95512771606445, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06523224711418152, "geo/layer_0/attn_entropy_mean": 6.162143707275391, "geo/layer_0/attn_entropy_std": 0.4227745532989502, "geo/layer_7/stable_rank_q_proj": 43.42759323120117, "geo/layer_7/stable_rank_k_proj": 41.1567497253418, "geo/layer_7/stable_rank_o_proj": 90.86245727539062, "geo/layer_7/stable_rank_gate_proj": 81.88797760009766, "geo/layer_7/stable_rank_down_proj": 141.15203857421875, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.44245314598083496, "geo/layer_7/attn_entropy_mean": 4.664495944976807, "geo/layer_7/attn_entropy_std": 0.7819981575012207, "geo/layer_14/stable_rank_q_proj": 51.38943099975586, "geo/layer_14/stable_rank_k_proj": 40.007686614990234, "geo/layer_14/stable_rank_o_proj": 43.81803894042969, "geo/layer_14/stable_rank_gate_proj": 71.84822845458984, "geo/layer_14/stable_rank_down_proj": 129.58309936523438, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.391789972782135, "geo/layer_14/attn_entropy_mean": 5.573734760284424, "geo/layer_14/attn_entropy_std": 0.40602824091911316, "geo/layer_21/stable_rank_q_proj": 40.33039855957031, "geo/layer_21/stable_rank_k_proj": 30.353836059570312, "geo/layer_21/stable_rank_o_proj": 70.09624481201172, "geo/layer_21/stable_rank_gate_proj": 66.2572021484375, "geo/layer_21/stable_rank_down_proj": 51.50414276123047, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14586962759494781, "geo/layer_21/attn_entropy_mean": 5.704370498657227, "geo/layer_21/attn_entropy_std": 0.2974596917629242, "geo/layer_27/stable_rank_q_proj": 43.32559585571289, "geo/layer_27/stable_rank_k_proj": 31.70818519592285, "geo/layer_27/stable_rank_o_proj": 115.81257629394531, "geo/layer_27/stable_rank_gate_proj": 80.48350524902344, "geo/layer_27/stable_rank_down_proj": 128.4473876953125, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09128616750240326, "geo/layer_27/attn_entropy_mean": 4.206847190856934, "geo/layer_27/attn_entropy_std": 0.7403425574302673, "attnres/final_alpha/block_0": 0.23715318739414215, "attnres/block_norm/0": 1.7646188735961914, "attnres/final_alpha/block_1": 0.004430164583027363, "attnres/block_norm/1": 46791.109375, "attnres/final_alpha/block_2": 0.01024073176085949, "attnres/block_norm/2": 28495.38671875, "attnres/final_alpha/block_3": 0.012293040752410889, "attnres/block_norm/3": 57892.78515625, "attnres/final_alpha/block_4": 0.014552507549524307, "attnres/block_norm/4": 15152.3671875, "attnres/final_alpha/block_5": 0.6124834418296814, "attnres/block_norm/5": 6686.08056640625, "attnres/final_alpha/block_6": 0.10884693264961243, "attnres/block_norm/6": 38499.359375, "geo/tier1_time_s": 1.357182502746582, "geo/step": 63375.0, "geo/rankme_slope": 0.00010396416379051621} {"step": 63380, "timestamp": 1778263046.1738503, "train/loss": 2.1394402742385865, "train/z_loss": 0.0013754610437899828, "train/perplexity": 8.494681605846186, "train/grad_norm": 0.140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1705813.20918599, "perf/iters_per_sec": 0.8133951230936003, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2294147968292237, "data/tokens_consumed": 132919590912, "data/tokens_consumed_B": 132.919590912, "train/loss_slope": -5.6098006489110634e-06} {"step": 63390, "timestamp": 1778263056.5149648, "train/loss": 2.173757863044739, "train/z_loss": 0.0013656147755682468, "train/perplexity": 8.791258390413866, "train/grad_norm": 0.19921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028990.877943557, "perf/iters_per_sec": 0.9674982442586694, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0335936069488525, "data/tokens_consumed": 132940562432, "data/tokens_consumed_B": 132.940562432, "train/loss_slope": -4.575073772674229e-06} {"step": 63400, "timestamp": 1778263066.854657, "grad/layer_0/attn": 0.0029659229330718517, "grad/layer_0/mlp": 0.0031453147530555725, "grad/layer_0/attn_mlp_ratio": 0.9429653537516451, "grad/layer_4/attn": 0.004377166274935007, "grad/layer_4/mlp": 0.002596871694549918, "grad/layer_4/attn_mlp_ratio": 1.6855534740380373, "grad/layer_8/attn": 0.010421299375593662, "grad/layer_8/mlp": 0.0038173829670995474, "grad/layer_8/attn_mlp_ratio": 2.729959030156169, "grad/layer_12/attn": 0.007704876363277435, "grad/layer_12/mlp": 0.007334450259804726, "grad/layer_12/attn_mlp_ratio": 1.0505049438336607, "grad/layer_16/attn": 0.0034349861089140177, "grad/layer_16/mlp": 0.005177478305995464, "grad/layer_16/attn_mlp_ratio": 0.6634476939462154, "grad/layer_20/attn": 0.00658220611512661, "grad/layer_20/mlp": 0.006164141930639744, "grad/layer_20/attn_mlp_ratio": 1.0678219421954944, "grad/layer_24/attn": 0.008668165653944016, "grad/layer_24/mlp": 0.009254551492631435, "grad/layer_24/attn_mlp_ratio": 0.936638103659792, "grad/layer_27/attn": 0.009877340868115425, "grad/layer_27/mlp": 0.008073433302342892, "grad/layer_27/attn_mlp_ratio": 1.223437461594598} {"step": 63400, "timestamp": 1778263066.8688738, "train/loss": 2.1418037533760073, "train/z_loss": 0.0013784554088488222, "train/perplexity": 8.514782353092063, "train/grad_norm": 0.1650390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026715.3609777826, "perf/iters_per_sec": 0.9664131932152665, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347540855407715, "data/tokens_consumed": 132961533952, "data/tokens_consumed_B": 132.961533952, "train/loss_slope": -4.333971371494759e-06} {"step": 63410, "timestamp": 1778263077.208653, "train/loss": 2.145522713661194, "train/z_loss": 0.0013787986361421644, "train/perplexity": 8.546507446115799, "train/grad_norm": 0.224609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029519.2769431812, "perf/iters_per_sec": 0.9677502045360475, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0333245038986205, "data/tokens_consumed": 132982505472, "data/tokens_consumed_B": 132.982505472, "train/loss_slope": -5.015892843709434e-06} {"step": 63420, "timestamp": 1778263087.5507731, "train/loss": 2.1393715143203735, "train/z_loss": 0.001379404414910823, "train/perplexity": 8.494097532314377, "train/grad_norm": 0.134765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028788.897846485, "perf/iters_per_sec": 0.967401932643168, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0336965084075929, "data/tokens_consumed": 133003476992, "data/tokens_consumed_B": 133.003476992, "train/loss_slope": -2.8513536702181175e-06} {"step": 63430, "timestamp": 1778263097.899705, "train/loss": 2.1516271352767946, "train/z_loss": 0.0013809037278406321, "train/perplexity": 8.598838493791977, "train/grad_norm": 0.2041015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027781.3766359822, "perf/iters_per_sec": 0.9669215090923224, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342101097106933, "data/tokens_consumed": 133024448512, "data/tokens_consumed_B": 133.024448512, "train/loss_slope": -4.105460590118787e-06} {"step": 63440, "timestamp": 1778263108.2441902, "train/loss": 2.1421207427978515, "train/z_loss": 0.0013747705612331628, "train/perplexity": 8.517481876864935, "train/grad_norm": 0.1787109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028566.6539013148, "perf/iters_per_sec": 0.9672959584719252, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033809757232666, "data/tokens_consumed": 133045420032, "data/tokens_consumed_B": 133.045420032, "train/loss_slope": -5.289841578094746e-06} {"step": 63450, "timestamp": 1778263118.5917015, "grad/layer_0/attn": 0.0028115303721278906, "grad/layer_0/mlp": 0.002946958178654313, "grad/layer_0/attn_mlp_ratio": 0.9540448510902354, "grad/layer_4/attn": 0.0025489111430943012, "grad/layer_4/mlp": 0.0024273686576634645, "grad/layer_4/attn_mlp_ratio": 1.0500716609485536, "grad/layer_8/attn": 0.0033435318619012833, "grad/layer_8/mlp": 0.003628670470789075, "grad/layer_8/attn_mlp_ratio": 0.921420613052293, "grad/layer_12/attn": 0.0061116525903344154, "grad/layer_12/mlp": 0.0063879601657390594, "grad/layer_12/attn_mlp_ratio": 0.956745555089536, "grad/layer_16/attn": 0.0042067221365869045, "grad/layer_16/mlp": 0.00463609816506505, "grad/layer_16/attn_mlp_ratio": 0.9073841614372855, "grad/layer_20/attn": 0.0051025948487222195, "grad/layer_20/mlp": 0.006408357061445713, "grad/layer_20/attn_mlp_ratio": 0.7962407088388757, "grad/layer_24/attn": 0.011933010071516037, "grad/layer_24/mlp": 0.01231243833899498, "grad/layer_24/attn_mlp_ratio": 0.9691833287647355, "grad/layer_27/attn": 0.0075102620758116245, "grad/layer_27/mlp": 0.012444124557077885, "grad/layer_27/attn_mlp_ratio": 0.6035187112610599} {"step": 63450, "timestamp": 1778263119.1788588, "eos/sharpness": 70.72536945343016, "eos/L0_probe": 1.9760462045669556, "eos/L_plus": 2.3620080947875977, "eos/L_minus": 2.2973380088806152, "eos/grad_norm": 0.21971233189105988, "eos/embed_grad_frac": 0.05513935908675194, "eos/time_s": 0.5842859745025635} {"step": 63450, "timestamp": 1778263119.1983576, "train/loss": 2.1107883214950562, "train/z_loss": 0.0013860745937563479, "train/perplexity": 8.254746114139337, "train/grad_norm": 0.2197265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1915396.553382698, "perf/iters_per_sec": 0.9133322493470659, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0948918104171752, "data/tokens_consumed": 133066391552, "data/tokens_consumed_B": 133.066391552, "train/loss_slope": -9.259798960967588e-06} {"step": 63450, "timestamp": 1778263120.5630617, "geo/rankme_last": 439.03851318359375, "geo/layer_0/stable_rank_q_proj": 19.372709274291992, "geo/layer_0/stable_rank_k_proj": 16.105457305908203, "geo/layer_0/stable_rank_o_proj": 47.08601760864258, "geo/layer_0/stable_rank_gate_proj": 131.2443084716797, "geo/layer_0/stable_rank_down_proj": 55.03050994873047, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06338910758495331, "geo/layer_0/attn_entropy_mean": 6.163102149963379, "geo/layer_0/attn_entropy_std": 0.42059507966041565, "geo/layer_7/stable_rank_q_proj": 43.33357238769531, "geo/layer_7/stable_rank_k_proj": 41.02769088745117, "geo/layer_7/stable_rank_o_proj": 90.71835327148438, "geo/layer_7/stable_rank_gate_proj": 81.90801239013672, "geo/layer_7/stable_rank_down_proj": 141.0302276611328, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4602864682674408, "geo/layer_7/attn_entropy_mean": 4.637392044067383, "geo/layer_7/attn_entropy_std": 0.7912488579750061, "geo/layer_14/stable_rank_q_proj": 51.44271469116211, "geo/layer_14/stable_rank_k_proj": 40.174766540527344, "geo/layer_14/stable_rank_o_proj": 43.7366943359375, "geo/layer_14/stable_rank_gate_proj": 71.7874984741211, "geo/layer_14/stable_rank_down_proj": 129.46304321289062, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3934158384799957, "geo/layer_14/attn_entropy_mean": 5.533691883087158, "geo/layer_14/attn_entropy_std": 0.399291455745697, "geo/layer_21/stable_rank_q_proj": 40.477054595947266, "geo/layer_21/stable_rank_k_proj": 30.388084411621094, "geo/layer_21/stable_rank_o_proj": 70.08087158203125, "geo/layer_21/stable_rank_gate_proj": 66.16824340820312, "geo/layer_21/stable_rank_down_proj": 51.5145263671875, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14138194918632507, "geo/layer_21/attn_entropy_mean": 5.703108787536621, "geo/layer_21/attn_entropy_std": 0.29360172152519226, "geo/layer_27/stable_rank_q_proj": 43.364952087402344, "geo/layer_27/stable_rank_k_proj": 31.750354766845703, "geo/layer_27/stable_rank_o_proj": 116.15203857421875, "geo/layer_27/stable_rank_gate_proj": 80.52945709228516, "geo/layer_27/stable_rank_down_proj": 128.33360290527344, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09845927357673645, "geo/layer_27/attn_entropy_mean": 4.181581497192383, "geo/layer_27/attn_entropy_std": 0.7462005019187927, "attnres/final_alpha/block_0": 0.23514661192893982, "attnres/block_norm/0": 1.7645397186279297, "attnres/final_alpha/block_1": 0.004289836622774601, "attnres/block_norm/1": 46769.0390625, "attnres/final_alpha/block_2": 0.010028795339167118, "attnres/block_norm/2": 28609.931640625, "attnres/final_alpha/block_3": 0.012362662702798843, "attnres/block_norm/3": 58018.265625, "attnres/final_alpha/block_4": 0.014372272416949272, "attnres/block_norm/4": 15106.9677734375, "attnres/final_alpha/block_5": 0.6161941885948181, "attnres/block_norm/5": 6620.412109375, "attnres/final_alpha/block_6": 0.10760566592216492, "attnres/block_norm/6": 38617.2109375, "geo/tier1_time_s": 1.3611505031585693, "geo/step": 63450.0, "geo/rankme_slope": 0.00011590241565376151} {"step": 63460, "timestamp": 1778263130.9167907, "train/loss": 2.1573033571243285, "train/z_loss": 0.0013782019261270762, "train/perplexity": 8.647786196300588, "train/grad_norm": 0.142578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1790133.9310086912, "perf/iters_per_sec": 0.8536023764651733, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1715056419372558, "data/tokens_consumed": 133087363072, "data/tokens_consumed_B": 133.087363072, "train/loss_slope": -1.0019413572941486e-05} {"step": 63470, "timestamp": 1778263141.2643886, "train/loss": 2.14752197265625, "train/z_loss": 0.0013776199426501988, "train/perplexity": 8.563611219743363, "train/grad_norm": 0.1513671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028156.636368688, "perf/iters_per_sec": 0.9671004468768539, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340187549591064, "data/tokens_consumed": 133108334592, "data/tokens_consumed_B": 133.108334592, "train/loss_slope": -1.1874960350840106e-05} {"step": 63480, "timestamp": 1778263151.6107197, "train/loss": 2.151783013343811, "train/z_loss": 0.0013656729483045638, "train/perplexity": 8.600178968587569, "train/grad_norm": 0.15234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027968.5215876189, "perf/iters_per_sec": 0.9670107467592329, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034114670753479, "data/tokens_consumed": 133129306112, "data/tokens_consumed_B": 133.129306112, "train/loss_slope": -1.219481400387662e-05} {"step": 63490, "timestamp": 1778263161.953731, "train/loss": 2.1963337898254394, "train/z_loss": 0.001368154038209468, "train/perplexity": 8.991986482103705, "train/grad_norm": 0.12451171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028880.0085416883, "perf/iters_per_sec": 0.9674453776081506, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0336500883102417, "data/tokens_consumed": 133150277632, "data/tokens_consumed_B": 133.150277632, "train/loss_slope": -8.754861501470007e-06} {"step": 63500, "timestamp": 1778263172.2871337, "grad/layer_0/attn": 0.0034086222294718027, "grad/layer_0/mlp": 0.003645561868324876, "grad/layer_0/attn_mlp_ratio": 0.9350059768804462, "grad/layer_4/attn": 0.002844790695235133, "grad/layer_4/mlp": 0.0027012282516807318, "grad/layer_4/attn_mlp_ratio": 1.0531470593610035, "grad/layer_8/attn": 0.004638022277504206, "grad/layer_8/mlp": 0.004032034892588854, "grad/layer_8/attn_mlp_ratio": 1.150293161152916, "grad/layer_12/attn": 0.005026424769312143, "grad/layer_12/mlp": 0.007003965321928263, "grad/layer_12/attn_mlp_ratio": 0.7176541382650513, "grad/layer_16/attn": 0.005146292503923178, "grad/layer_16/mlp": 0.004826403688639402, "grad/layer_16/attn_mlp_ratio": 1.0662788961082668, "grad/layer_20/attn": 0.004745046142488718, "grad/layer_20/mlp": 0.006631331518292427, "grad/layer_20/attn_mlp_ratio": 0.7155495178976694, "grad/layer_24/attn": 0.010720642283558846, "grad/layer_24/mlp": 0.010443407110869884, "grad/layer_24/attn_mlp_ratio": 1.0265464198696002, "grad/layer_27/attn": 0.015153675340116024, "grad/layer_27/mlp": 0.009343170560896397, "grad/layer_27/attn_mlp_ratio": 1.621898592042004} {"step": 63500, "timestamp": 1778263172.3012898, "train/loss": 2.104283261299133, "train/z_loss": 0.0013793783145956695, "train/perplexity": 8.201222768796637, "train/grad_norm": 0.220703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027992.881490108, "perf/iters_per_sec": 0.96702236246591, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341022491455079, "data/tokens_consumed": 133171249152, "data/tokens_consumed_B": 133.171249152, "train/loss_slope": -8.681161318532099e-06} {"step": 63500, "timestamp": 1778263179.5879025, "geo/ww_alpha_mean": 7.869907317002154, "geo/ww_alpha_std": 5.2905754537978416, "geo/ww_alpha_min": 1.357967204059047, "geo/ww_alpha_max": 43.88931369594073, "geo/ww_alpha_healthy_frac": 0.15736040609137056, "geo/ww_alpha_by_type/q_proj": 3.9674609351845116, "geo/ww_alpha_by_type/k_proj": 4.499340778407761, "geo/ww_alpha_by_type/v_proj": 8.594228669627537, "geo/ww_alpha_by_type/o_proj": 10.288954383444855, "geo/ww_alpha_by_type/gate_proj": 8.137938742369474, "geo/ww_alpha_by_type/up_proj": 11.438437279483825, "geo/ww_alpha_by_type/down_proj": 8.269079945907182, "geo/twonn_id/layer_0": 0.7571680545806885, "geo/twonn_id/layer_7": 3.066230058670044, "geo/twonn_id/layer_14": 4.908211708068848, "geo/twonn_id/layer_21": 6.827984809875488, "geo/twonn_id/layer_27": 6.536372184753418, "geo/tier2_time_s": 7.27965784072876} {"step": 63500, "timestamp": 1778263180.3551497, "eoc/jacobian_sigma/layer_0/attn": 1205.9600830078125, "eoc/jacobian_sigma/layer_0/mlp": 10254.896484375, "eoc/jacobian_sigma/layer_0": 10254.896484375, "eoc/jacobian_sigma/layer_7/attn": 1.1531673669815063, "eoc/jacobian_sigma/layer_7/mlp": 1.8057587146759033, "eoc/jacobian_sigma/layer_7": 1.8057587146759033, "eoc/jacobian_sigma/layer_14/attn": 1.4804887771606445, "eoc/jacobian_sigma/layer_14/mlp": 7.103549003601074, "eoc/jacobian_sigma/layer_14": 7.103549003601074, "eoc/jacobian_sigma/layer_21/attn": 1.086111068725586, "eoc/jacobian_sigma/layer_21/mlp": 4.243290424346924, "eoc/jacobian_sigma/layer_21": 4.243290424346924, "eoc/jacobian_sigma/layer_27/attn": 2.915243148803711, "eoc/jacobian_sigma/layer_27/mlp": 31.486234664916992, "eoc/jacobian_sigma/layer_27": 31.486234664916992, "eoc/layer0_sigma": 10254.896484375, "eoc/sigma_max": 31.486234664916992, "eoc/sigma_min": 1.8057587146759033, "eoc/sigma_mean": 11.159708201885223, "eoc/time_s": 0.7578251361846924} {"step": 63510, "timestamp": 1778263190.7193818, "train/loss": 2.1063228368759157, "train/z_loss": 0.0013851716532371939, "train/perplexity": 8.217966852062839, "train/grad_norm": 0.146484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1138986.2133117851, "perf/iters_per_sec": 0.54311094918813, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.8412444114685058, "data/tokens_consumed": 133192220672, "data/tokens_consumed_B": 133.192220672, "train/loss_slope": -1.2711674996120544e-05} {"step": 63520, "timestamp": 1778263201.0618148, "train/loss": 2.175839400291443, "train/z_loss": 0.0013683490571565927, "train/perplexity": 8.80957678079069, "train/grad_norm": 0.09521484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028762.2260019064, "perf/iters_per_sec": 0.9673892145165951, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0337100982666017, "data/tokens_consumed": 133213192192, "data/tokens_consumed_B": 133.213192192, "train/loss_slope": -1.208346711956202e-05} {"step": 63525, "timestamp": 1778263206.8252072, "eos/sharpness": 45.00117301940917, "eos/L0_probe": 1.9724948406219482, "eos/L_plus": 2.18385910987854, "eos/L_minus": 2.2111423015594482, "eos/grad_norm": 0.13426388800144196, "eos/embed_grad_frac": 0.14510971307754517, "eos/time_s": 0.6034636497497559} {"step": 63525, "timestamp": 1778263208.2027113, "geo/rankme_last": 438.27166748046875, "geo/layer_0/stable_rank_q_proj": 19.390851974487305, "geo/layer_0/stable_rank_k_proj": 16.100055694580078, "geo/layer_0/stable_rank_o_proj": 47.14444351196289, "geo/layer_0/stable_rank_gate_proj": 131.4276123046875, "geo/layer_0/stable_rank_down_proj": 55.0809326171875, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.062324684113264084, "geo/layer_0/attn_entropy_mean": 6.162327289581299, "geo/layer_0/attn_entropy_std": 0.42173030972480774, "geo/layer_7/stable_rank_q_proj": 43.29684829711914, "geo/layer_7/stable_rank_k_proj": 40.88874053955078, "geo/layer_7/stable_rank_o_proj": 90.591796875, "geo/layer_7/stable_rank_gate_proj": 81.86112213134766, "geo/layer_7/stable_rank_down_proj": 140.86827087402344, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4500320255756378, "geo/layer_7/attn_entropy_mean": 4.652814865112305, "geo/layer_7/attn_entropy_std": 0.7967891097068787, "geo/layer_14/stable_rank_q_proj": 51.4223747253418, "geo/layer_14/stable_rank_k_proj": 40.22591018676758, "geo/layer_14/stable_rank_o_proj": 43.77883529663086, "geo/layer_14/stable_rank_gate_proj": 71.78490447998047, "geo/layer_14/stable_rank_down_proj": 129.99464416503906, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4070131480693817, "geo/layer_14/attn_entropy_mean": 5.555045127868652, "geo/layer_14/attn_entropy_std": 0.3891085684299469, "geo/layer_21/stable_rank_q_proj": 40.48017120361328, "geo/layer_21/stable_rank_k_proj": 30.409509658813477, "geo/layer_21/stable_rank_o_proj": 70.22008514404297, "geo/layer_21/stable_rank_gate_proj": 66.14803314208984, "geo/layer_21/stable_rank_down_proj": 51.43684387207031, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14044252038002014, "geo/layer_21/attn_entropy_mean": 5.703597068786621, "geo/layer_21/attn_entropy_std": 0.29722610116004944, "geo/layer_27/stable_rank_q_proj": 43.36725997924805, "geo/layer_27/stable_rank_k_proj": 31.736858367919922, "geo/layer_27/stable_rank_o_proj": 116.09185028076172, "geo/layer_27/stable_rank_gate_proj": 80.50342559814453, "geo/layer_27/stable_rank_down_proj": 128.3693389892578, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08880762755870819, "geo/layer_27/attn_entropy_mean": 4.222710609436035, "geo/layer_27/attn_entropy_std": 0.7393874526023865, "attnres/final_alpha/block_0": 0.23927736282348633, "attnres/block_norm/0": 1.764486312866211, "attnres/final_alpha/block_1": 0.004443855490535498, "attnres/block_norm/1": 46593.7421875, "attnres/final_alpha/block_2": 0.0102396160364151, "attnres/block_norm/2": 28629.25, "attnres/final_alpha/block_3": 0.012406369671225548, "attnres/block_norm/3": 58088.41796875, "attnres/final_alpha/block_4": 0.014789859764277935, "attnres/block_norm/4": 15143.111328125, "attnres/final_alpha/block_5": 0.6081483960151672, "attnres/block_norm/5": 6645.25048828125, "attnres/final_alpha/block_6": 0.11069457232952118, "attnres/block_norm/6": 38495.140625, "geo/tier1_time_s": 1.3581717014312744, "geo/step": 63525.0, "geo/rankme_slope": 9.43394349927471e-05} {"step": 63530, "timestamp": 1778263213.3828876, "train/loss": 2.1455546617507935, "train/z_loss": 0.001378927123732865, "train/perplexity": 8.546780495063123, "train/grad_norm": 0.125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1702789.6537705397, "perf/iters_per_sec": 0.8119533795216273, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2315978050231933, "data/tokens_consumed": 133234163712, "data/tokens_consumed_B": 133.234163712, "train/loss_slope": -1.2304792238218907e-05} {"step": 63540, "timestamp": 1778263223.740499, "train/loss": 2.1292898654937744, "train/z_loss": 0.001366643630899489, "train/perplexity": 8.408893245420746, "train/grad_norm": 0.1748046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025660.261817735, "perf/iters_per_sec": 0.9659100827301669, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352930545806884, "data/tokens_consumed": 133255135232, "data/tokens_consumed_B": 133.255135232, "train/loss_slope": -1.2108264497332946e-05} {"step": 63550, "timestamp": 1778263234.0814965, "grad/layer_0/attn": 0.002622579922899604, "grad/layer_0/mlp": 0.0029540967661887407, "grad/layer_0/attn_mlp_ratio": 0.887777226575225, "grad/layer_4/attn": 0.003797336481511593, "grad/layer_4/mlp": 0.0024688171688467264, "grad/layer_4/attn_mlp_ratio": 1.53811970186253, "grad/layer_8/attn": 0.005372307263314724, "grad/layer_8/mlp": 0.003523781895637512, "grad/layer_8/attn_mlp_ratio": 1.5245855929696304, "grad/layer_12/attn": 0.004307087045162916, "grad/layer_12/mlp": 0.006834213621914387, "grad/layer_12/attn_mlp_ratio": 0.6302242248222263, "grad/layer_16/attn": 0.0038547960575670004, "grad/layer_16/mlp": 0.0047590043395757675, "grad/layer_16/attn_mlp_ratio": 0.8100005172322615, "grad/layer_20/attn": 0.0031915807630866766, "grad/layer_20/mlp": 0.005729320924729109, "grad/layer_20/attn_mlp_ratio": 0.5570609063990405, "grad/layer_24/attn": 0.00707699079066515, "grad/layer_24/mlp": 0.009037449955940247, "grad/layer_24/attn_mlp_ratio": 0.7830738479172561, "grad/layer_27/attn": 0.003889348590746522, "grad/layer_27/mlp": 0.008419927209615707, "grad/layer_27/attn_mlp_ratio": 0.46192187268705} {"step": 63550, "timestamp": 1778263234.0958166, "train/loss": 2.1078867673873902, "train/z_loss": 0.0013776077656075358, "train/perplexity": 8.230829236481133, "train/grad_norm": 0.12890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026626.6391881488, "perf/iters_per_sec": 0.9663708873692268, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347993850708008, "data/tokens_consumed": 133276106752, "data/tokens_consumed_B": 133.276106752, "train/loss_slope": -1.1025785548601875e-05} {"step": 63560, "timestamp": 1778263244.437331, "train/loss": 2.126948046684265, "train/z_loss": 0.0013698984170332552, "train/perplexity": 8.389224180733239, "train/grad_norm": 0.265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029217.2875875942, "perf/iters_per_sec": 0.9676062047899219, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033478283882141, "data/tokens_consumed": 133297078272, "data/tokens_consumed_B": 133.297078272, "train/loss_slope": -1.4253257896819369e-05} {"step": 63570, "timestamp": 1778263254.7737336, "train/loss": 2.163506603240967, "train/z_loss": 0.0013848436647094786, "train/perplexity": 8.701597271556869, "train/grad_norm": 0.1171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029848.10273932, "perf/iters_per_sec": 0.9679070008942222, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0331571102142334, "data/tokens_consumed": 133318049792, "data/tokens_consumed_B": 133.318049792, "train/loss_slope": -1.0578184602784933e-05} {"step": 63580, "timestamp": 1778263265.1174474, "train/loss": 2.1596494197845457, "train/z_loss": 0.0013713237713091075, "train/perplexity": 8.668098261962188, "train/grad_norm": 0.2431640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028897.6981614798, "perf/iters_per_sec": 0.9674538126761817, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0336410760879517, "data/tokens_consumed": 133339021312, "data/tokens_consumed_B": 133.339021312, "train/loss_slope": -1.0953953535822209e-05} {"step": 63590, "timestamp": 1778263275.459709, "train/loss": 2.137765955924988, "train/z_loss": 0.0013738057110458612, "train/perplexity": 8.480470704970957, "train/grad_norm": 0.12890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028965.838838752, "perf/iters_per_sec": 0.9674863046830902, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0336063623428344, "data/tokens_consumed": 133359992832, "data/tokens_consumed_B": 133.359992832, "train/loss_slope": -1.3515276247912138e-05} {"step": 63600, "timestamp": 1778263285.7897432, "grad/layer_0/attn": 0.003491697832942009, "grad/layer_0/mlp": 0.0032255270052701235, "grad/layer_0/attn_mlp_ratio": 1.0825200715991488, "grad/layer_4/attn": 0.0027962892781943083, "grad/layer_4/mlp": 0.002497939160093665, "grad/layer_4/attn_mlp_ratio": 1.119438459880508, "grad/layer_8/attn": 0.005672306753695011, "grad/layer_8/mlp": 0.003606807440519333, "grad/layer_8/attn_mlp_ratio": 1.5726668778335406, "grad/layer_12/attn": 0.005521873477846384, "grad/layer_12/mlp": 0.006858199834823608, "grad/layer_12/attn_mlp_ratio": 0.8051490960198153, "grad/layer_16/attn": 0.0051898956298828125, "grad/layer_16/mlp": 0.005219592712819576, "grad/layer_16/attn_mlp_ratio": 0.9943104406796205, "grad/layer_20/attn": 0.0037232208997011185, "grad/layer_20/mlp": 0.007353317458182573, "grad/layer_20/attn_mlp_ratio": 0.5063321242747117, "grad/layer_24/attn": 0.01924693025648594, "grad/layer_24/mlp": 0.01460107322782278, "grad/layer_24/attn_mlp_ratio": 1.3181859870404418, "grad/layer_27/attn": 0.004913019016385078, "grad/layer_27/mlp": 0.014921758323907852, "grad/layer_27/attn_mlp_ratio": 0.32925201419380773} {"step": 63600, "timestamp": 1778263286.3766623, "eos/sharpness": 80.1321506500244, "eos/L0_probe": 1.9725890159606934, "eos/L_plus": 2.3333821296691895, "eos/L_minus": 2.4131174087524414, "eos/grad_norm": 0.26611870527267456, "eos/embed_grad_frac": 0.03251466155052185, "eos/time_s": 0.5840997695922852} {"step": 63600, "timestamp": 1778263286.3967981, "train/loss": 2.1850040435791014, "train/z_loss": 0.001380997139494866, "train/perplexity": 8.890684503484742, "train/grad_norm": 0.265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1918317.9901229914, "perf/iters_per_sec": 0.9147252989401776, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0932243824005128, "data/tokens_consumed": 133380964352, "data/tokens_consumed_B": 133.380964352, "train/loss_slope": -1.2866468712835065e-05} {"step": 63600, "timestamp": 1778263287.7617073, "geo/rankme_last": 439.2123107910156, "geo/layer_0/stable_rank_q_proj": 19.399194717407227, "geo/layer_0/stable_rank_k_proj": 16.04822540283203, "geo/layer_0/stable_rank_o_proj": 47.100067138671875, "geo/layer_0/stable_rank_gate_proj": 131.32638549804688, "geo/layer_0/stable_rank_down_proj": 54.93673324584961, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0647556260228157, "geo/layer_0/attn_entropy_mean": 6.158719062805176, "geo/layer_0/attn_entropy_std": 0.41596245765686035, "geo/layer_7/stable_rank_q_proj": 43.261539459228516, "geo/layer_7/stable_rank_k_proj": 40.83553695678711, "geo/layer_7/stable_rank_o_proj": 90.4338607788086, "geo/layer_7/stable_rank_gate_proj": 81.8106918334961, "geo/layer_7/stable_rank_down_proj": 140.8909454345703, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.45105797052383423, "geo/layer_7/attn_entropy_mean": 4.653197288513184, "geo/layer_7/attn_entropy_std": 0.7836480736732483, "geo/layer_14/stable_rank_q_proj": 51.3493537902832, "geo/layer_14/stable_rank_k_proj": 40.198211669921875, "geo/layer_14/stable_rank_o_proj": 43.82469177246094, "geo/layer_14/stable_rank_gate_proj": 71.85444641113281, "geo/layer_14/stable_rank_down_proj": 129.9588623046875, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3996841311454773, "geo/layer_14/attn_entropy_mean": 5.549504280090332, "geo/layer_14/attn_entropy_std": 0.3948122560977936, "geo/layer_21/stable_rank_q_proj": 40.47264099121094, "geo/layer_21/stable_rank_k_proj": 30.328048706054688, "geo/layer_21/stable_rank_o_proj": 70.2454605102539, "geo/layer_21/stable_rank_gate_proj": 66.24437713623047, "geo/layer_21/stable_rank_down_proj": 51.440467834472656, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1422172337770462, "geo/layer_21/attn_entropy_mean": 5.700888633728027, "geo/layer_21/attn_entropy_std": 0.3032754063606262, "geo/layer_27/stable_rank_q_proj": 43.39093017578125, "geo/layer_27/stable_rank_k_proj": 31.833498001098633, "geo/layer_27/stable_rank_o_proj": 116.3519058227539, "geo/layer_27/stable_rank_gate_proj": 80.56025695800781, "geo/layer_27/stable_rank_down_proj": 128.38023376464844, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08977708220481873, "geo/layer_27/attn_entropy_mean": 4.203615188598633, "geo/layer_27/attn_entropy_std": 0.742720901966095, "attnres/final_alpha/block_0": 0.2375652939081192, "attnres/block_norm/0": 1.7646915912628174, "attnres/final_alpha/block_1": 0.004399353172630072, "attnres/block_norm/1": 46690.0234375, "attnres/final_alpha/block_2": 0.010150639340281487, "attnres/block_norm/2": 28595.66796875, "attnres/final_alpha/block_3": 0.0122917415574193, "attnres/block_norm/3": 57731.5859375, "attnres/final_alpha/block_4": 0.014414945617318153, "attnres/block_norm/4": 15200.533203125, "attnres/final_alpha/block_5": 0.6100717782974243, "attnres/block_norm/5": 6738.37939453125, "attnres/final_alpha/block_6": 0.11110621690750122, "attnres/block_norm/6": 38825.12890625, "geo/tier1_time_s": 1.360661268234253, "geo/step": 63600.0, "geo/rankme_slope": 0.00013952037846388555} {"step": 63610, "timestamp": 1778263298.6556504, "train/loss": 2.1381136417388915, "train/z_loss": 0.0013746460550464689, "train/perplexity": 8.483419756972365, "train/grad_norm": 0.0888671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1711308.1706003672, "perf/iters_per_sec": 0.8160153248788677, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2254671812057496, "data/tokens_consumed": 133401935872, "data/tokens_consumed_B": 133.401935872, "train/loss_slope": -1.1780842066598063e-05} {"step": 63620, "timestamp": 1778263308.9996305, "train/loss": 2.138998794555664, "train/z_loss": 0.0013814272708259523, "train/perplexity": 8.49093220420753, "train/grad_norm": 0.1533203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028763.489389248, "perf/iters_per_sec": 0.9673898169466247, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033709454536438, "data/tokens_consumed": 133422907392, "data/tokens_consumed_B": 133.422907392, "train/loss_slope": -9.393240873998945e-06} {"step": 63630, "timestamp": 1778263319.3395808, "train/loss": 2.117511975765228, "train/z_loss": 0.0013770430465228855, "train/perplexity": 8.310435180315732, "train/grad_norm": 0.1328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029475.8223963487, "perf/iters_per_sec": 0.9677294837934249, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0333466291427613, "data/tokens_consumed": 133443878912, "data/tokens_consumed_B": 133.443878912, "train/loss_slope": -9.244907728516286e-06} {"step": 63640, "timestamp": 1778263329.6882758, "train/loss": 2.116403317451477, "train/z_loss": 0.0013777042971923948, "train/perplexity": 8.30122685264988, "train/grad_norm": 0.2001953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027480.3248110835, "perf/iters_per_sec": 0.9667779563956659, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343636751174927, "data/tokens_consumed": 133464850432, "data/tokens_consumed_B": 133.464850432, "train/loss_slope": -1.258819967356315e-05} {"step": 63650, "timestamp": 1778263340.0233788, "grad/layer_0/attn": 0.0033828902523964643, "grad/layer_0/mlp": 0.0034212004393339157, "grad/layer_0/attn_mlp_ratio": 0.9888020925704314, "grad/layer_4/attn": 0.002349593210965395, "grad/layer_4/mlp": 0.0026168390177190304, "grad/layer_4/attn_mlp_ratio": 0.8978745369006175, "grad/layer_8/attn": 0.004143297206610441, "grad/layer_8/mlp": 0.0036142843309789896, "grad/layer_8/attn_mlp_ratio": 1.1463672230932194, "grad/layer_12/attn": 0.0050609903410077095, "grad/layer_12/mlp": 0.0076620872132480145, "grad/layer_12/attn_mlp_ratio": 0.6605237103285265, "grad/layer_16/attn": 0.006021184381097555, "grad/layer_16/mlp": 0.005788807757198811, "grad/layer_16/attn_mlp_ratio": 1.040142379852834, "grad/layer_20/attn": 0.005800778977572918, "grad/layer_20/mlp": 0.008047704584896564, "grad/layer_20/attn_mlp_ratio": 0.7207991849476612, "grad/layer_24/attn": 0.02594810165464878, "grad/layer_24/mlp": 0.014041686430573463, "grad/layer_24/attn_mlp_ratio": 1.8479334087220263, "grad/layer_27/attn": 0.010229358449578285, "grad/layer_27/mlp": 0.014448607340455055, "grad/layer_27/attn_mlp_ratio": 0.7079823084497971} {"step": 63650, "timestamp": 1778263340.0376642, "train/loss": 2.140868902206421, "train/z_loss": 0.0013752730796113611, "train/perplexity": 8.506826018424338, "train/grad_norm": 0.298828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027541.1729782515, "perf/iters_per_sec": 0.9668069710627801, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343326330184937, "data/tokens_consumed": 133485821952, "data/tokens_consumed_B": 133.485821952, "train/loss_slope": -1.1627246008025251e-05} {"step": 63660, "timestamp": 1778263350.385453, "train/loss": 2.129958748817444, "train/z_loss": 0.0013813499943353235, "train/perplexity": 8.414519695392627, "train/grad_norm": 0.0869140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028539.09904528, "perf/iters_per_sec": 0.9672828192926788, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033823800086975, "data/tokens_consumed": 133506793472, "data/tokens_consumed_B": 133.506793472, "train/loss_slope": -1.1593500930484046e-05} {"step": 63670, "timestamp": 1778263360.7262034, "train/loss": 2.1876967430114744, "train/z_loss": 0.0013654106296598911, "train/perplexity": 8.91465670508304, "train/grad_norm": 0.09912109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028898.5873304051, "perf/iters_per_sec": 0.9674542366649652, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0336406230926514, "data/tokens_consumed": 133527764992, "data/tokens_consumed_B": 133.527764992, "train/loss_slope": -1.1409256746559445e-05} {"step": 63675, "timestamp": 1778263366.4747267, "eos/sharpness": 4.198324680328368, "eos/L0_probe": 1.9748719930648804, "eos/L_plus": 1.9972907304763794, "eos/L_minus": 1.994436502456665, "eos/grad_norm": 0.0895485132932663, "eos/embed_grad_frac": 0.2828030288219452, "eos/time_s": 0.587191104888916} {"step": 63675, "timestamp": 1778263367.8506591, "geo/rankme_last": 438.2909240722656, "geo/layer_0/stable_rank_q_proj": 19.40633201599121, "geo/layer_0/stable_rank_k_proj": 16.065969467163086, "geo/layer_0/stable_rank_o_proj": 47.09426498413086, "geo/layer_0/stable_rank_gate_proj": 131.64678955078125, "geo/layer_0/stable_rank_down_proj": 54.90806198120117, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06595741212368011, "geo/layer_0/attn_entropy_mean": 6.161690711975098, "geo/layer_0/attn_entropy_std": 0.4195761978626251, "geo/layer_7/stable_rank_q_proj": 43.37324523925781, "geo/layer_7/stable_rank_k_proj": 40.992496490478516, "geo/layer_7/stable_rank_o_proj": 90.5233383178711, "geo/layer_7/stable_rank_gate_proj": 81.6911849975586, "geo/layer_7/stable_rank_down_proj": 140.81405639648438, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.47360071539878845, "geo/layer_7/attn_entropy_mean": 4.667060375213623, "geo/layer_7/attn_entropy_std": 0.8198567628860474, "geo/layer_14/stable_rank_q_proj": 51.3647575378418, "geo/layer_14/stable_rank_k_proj": 40.313846588134766, "geo/layer_14/stable_rank_o_proj": 43.83092498779297, "geo/layer_14/stable_rank_gate_proj": 71.87195587158203, "geo/layer_14/stable_rank_down_proj": 129.83993530273438, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39213263988494873, "geo/layer_14/attn_entropy_mean": 5.561596870422363, "geo/layer_14/attn_entropy_std": 0.41424450278282166, "geo/layer_21/stable_rank_q_proj": 40.349403381347656, "geo/layer_21/stable_rank_k_proj": 30.31757926940918, "geo/layer_21/stable_rank_o_proj": 70.18061828613281, "geo/layer_21/stable_rank_gate_proj": 66.13294982910156, "geo/layer_21/stable_rank_down_proj": 51.32109832763672, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14464564621448517, "geo/layer_21/attn_entropy_mean": 5.706273555755615, "geo/layer_21/attn_entropy_std": 0.2911500930786133, "geo/layer_27/stable_rank_q_proj": 43.42588806152344, "geo/layer_27/stable_rank_k_proj": 31.93584632873535, "geo/layer_27/stable_rank_o_proj": 116.18299865722656, "geo/layer_27/stable_rank_gate_proj": 80.45279693603516, "geo/layer_27/stable_rank_down_proj": 128.43478393554688, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08483947813510895, "geo/layer_27/attn_entropy_mean": 4.200772285461426, "geo/layer_27/attn_entropy_std": 0.725005030632019, "attnres/final_alpha/block_0": 0.23784928023815155, "attnres/block_norm/0": 1.7649035453796387, "attnres/final_alpha/block_1": 0.004439977928996086, "attnres/block_norm/1": 46672.328125, "attnres/final_alpha/block_2": 0.010196983814239502, "attnres/block_norm/2": 28773.84765625, "attnres/final_alpha/block_3": 0.012169372290372849, "attnres/block_norm/3": 57898.1015625, "attnres/final_alpha/block_4": 0.014696321450173855, "attnres/block_norm/4": 15123.0244140625, "attnres/final_alpha/block_5": 0.6114962100982666, "attnres/block_norm/5": 6634.791015625, "attnres/final_alpha/block_6": 0.10915185511112213, "attnres/block_norm/6": 38742.3984375, "geo/tier1_time_s": 1.3578519821166992, "geo/step": 63675.0, "geo/rankme_slope": 0.00013453285611119447} {"step": 63680, "timestamp": 1778263373.0265415, "train/loss": 2.2057214260101317, "train/z_loss": 0.00137105796020478, "train/perplexity": 9.076797444154707, "train/grad_norm": 0.2265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1705843.3130462433, "perf/iters_per_sec": 0.8134094777327744, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2293931007385255, "data/tokens_consumed": 133548736512, "data/tokens_consumed_B": 133.548736512, "train/loss_slope": -6.516105936269694e-06} {"step": 63690, "timestamp": 1778263383.36681, "train/loss": 2.148773169517517, "train/z_loss": 0.001377987419255078, "train/perplexity": 8.574332689158359, "train/grad_norm": 0.12890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029557.3948557447, "perf/iters_per_sec": 0.967768380573151, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0333050966262818, "data/tokens_consumed": 133569708032, "data/tokens_consumed_B": 133.569708032, "train/loss_slope": -3.4892620140463414e-06} {"step": 63700, "timestamp": 1778263393.6964762, "grad/layer_0/attn": 0.002819171641021967, "grad/layer_0/mlp": 0.002949537942185998, "grad/layer_0/attn_mlp_ratio": 0.955801078236844, "grad/layer_4/attn": 0.003024835605174303, "grad/layer_4/mlp": 0.0026354824658483267, "grad/layer_4/attn_mlp_ratio": 1.1477350085223004, "grad/layer_8/attn": 0.006369623355567455, "grad/layer_8/mlp": 0.0037444429472088814, "grad/layer_8/attn_mlp_ratio": 1.7010869908451114, "grad/layer_12/attn": 0.006534299813210964, "grad/layer_12/mlp": 0.006781707983464003, "grad/layer_12/attn_mlp_ratio": 0.9635182955078383, "grad/layer_16/attn": 0.0035133049823343754, "grad/layer_16/mlp": 0.004895940888673067, "grad/layer_16/attn_mlp_ratio": 0.7175954511017455, "grad/layer_20/attn": 0.002772850915789604, "grad/layer_20/mlp": 0.005649152211844921, "grad/layer_20/attn_mlp_ratio": 0.49084371649452596, "grad/layer_24/attn": 0.012913186103105545, "grad/layer_24/mlp": 0.00916362926363945, "grad/layer_24/attn_mlp_ratio": 1.4091781313574332, "grad/layer_27/attn": 0.006025027949362993, "grad/layer_27/mlp": 0.009061357006430626, "grad/layer_27/attn_mlp_ratio": 0.6649145242369023} {"step": 63700, "timestamp": 1778263393.710672, "train/loss": 2.1798195600509644, "train/z_loss": 0.0013677411596290767, "train/perplexity": 8.844710175673308, "train/grad_norm": 0.1259765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028496.2945991189, "perf/iters_per_sec": 0.9672624085422129, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0338456153869628, "data/tokens_consumed": 133590679552, "data/tokens_consumed_B": 133.590679552, "train/loss_slope": -3.0759855179396934e-06} {"step": 63710, "timestamp": 1778263404.0517826, "train/loss": 2.143271064758301, "train/z_loss": 0.0013643179670907557, "train/perplexity": 8.52728536081603, "train/grad_norm": 0.1865234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029361.7162441534, "perf/iters_per_sec": 0.9676750737400787, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0334047317504882, "data/tokens_consumed": 133611651072, "data/tokens_consumed_B": 133.611651072, "train/loss_slope": -1.953527400202555e-06} {"step": 63720, "timestamp": 1778263414.3917174, "train/loss": 2.155187749862671, "train/z_loss": 0.0013655527727678418, "train/perplexity": 8.62951021624112, "train/grad_norm": 0.150390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029568.4465084581, "perf/iters_per_sec": 0.9677736504118243, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0332994699478149, "data/tokens_consumed": 133632622592, "data/tokens_consumed_B": 133.632622592, "train/loss_slope": 1.4195704724814361e-06} {"step": 63730, "timestamp": 1778263424.731163, "train/loss": 2.1741132974624633, "train/z_loss": 0.0013631287845782935, "train/perplexity": 8.794383661602497, "train/grad_norm": 0.11181640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029402.9184627938, "perf/iters_per_sec": 0.9676947204889268, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0333837509155273, "data/tokens_consumed": 133653594112, "data/tokens_consumed_B": 133.653594112, "train/loss_slope": 6.0247853608449574e-06} {"step": 63740, "timestamp": 1778263435.070549, "train/loss": 2.191331720352173, "train/z_loss": 0.001375688216648996, "train/perplexity": 8.947120246579773, "train/grad_norm": 0.091796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029357.408836074, "perf/iters_per_sec": 0.9676730198078508, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033406925201416, "data/tokens_consumed": 133674565632, "data/tokens_consumed_B": 133.674565632, "train/loss_slope": 6.986403472424192e-06} {"step": 63750, "timestamp": 1778263445.402267, "grad/layer_0/attn": 0.0033617718145251274, "grad/layer_0/mlp": 0.003344013122841716, "grad/layer_0/attn_mlp_ratio": 1.005310562638362, "grad/layer_4/attn": 0.00348646380007267, "grad/layer_4/mlp": 0.002661000704392791, "grad/layer_4/attn_mlp_ratio": 1.3102077212142145, "grad/layer_8/attn": 0.005262396764010191, "grad/layer_8/mlp": 0.0037100513000041246, "grad/layer_8/attn_mlp_ratio": 1.4184161340741372, "grad/layer_12/attn": 0.005083063151687384, "grad/layer_12/mlp": 0.00716995308175683, "grad/layer_12/attn_mlp_ratio": 0.7089395178507842, "grad/layer_16/attn": 0.005160820670425892, "grad/layer_16/mlp": 0.004465329460799694, "grad/layer_16/attn_mlp_ratio": 1.1557535899996692, "grad/layer_20/attn": 0.0036535433027893305, "grad/layer_20/mlp": 0.005963386967778206, "grad/layer_20/attn_mlp_ratio": 0.6126624452285537, "grad/layer_24/attn": 0.011262834072113037, "grad/layer_24/mlp": 0.01228547003120184, "grad/layer_24/attn_mlp_ratio": 0.9167605270154394, "grad/layer_27/attn": 0.005916359834372997, "grad/layer_27/mlp": 0.011233055032789707, "grad/layer_27/attn_mlp_ratio": 0.5266919608631602} {"step": 63750, "timestamp": 1778263445.9960926, "eos/sharpness": 59.48414802551268, "eos/L0_probe": 1.9719773530960083, "eos/L_plus": 2.2628157138824463, "eos/L_minus": 2.2759804725646973, "eos/grad_norm": 0.18309958279132843, "eos/embed_grad_frac": 0.0726366937160492, "eos/time_s": 0.5888988971710205} {"step": 63750, "timestamp": 1778263446.0252519, "train/loss": 2.193664479255676, "train/z_loss": 0.0013651167042553424, "train/perplexity": 8.968016083995233, "train/grad_norm": 0.18359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1915494.9494724954, "perf/iters_per_sec": 0.9133791682589032, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0948355674743653, "data/tokens_consumed": 133695537152, "data/tokens_consumed_B": 133.695537152, "train/loss_slope": 9.089073253543372e-06} {"step": 63750, "timestamp": 1778263447.3869753, "geo/rankme_last": 438.5601806640625, "geo/layer_0/stable_rank_q_proj": 19.41046714782715, "geo/layer_0/stable_rank_k_proj": 16.10804557800293, "geo/layer_0/stable_rank_o_proj": 47.00215530395508, "geo/layer_0/stable_rank_gate_proj": 131.58914184570312, "geo/layer_0/stable_rank_down_proj": 54.91339111328125, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06289354711771011, "geo/layer_0/attn_entropy_mean": 6.161437511444092, "geo/layer_0/attn_entropy_std": 0.41375288367271423, "geo/layer_7/stable_rank_q_proj": 43.370384216308594, "geo/layer_7/stable_rank_k_proj": 40.94097900390625, "geo/layer_7/stable_rank_o_proj": 90.41722869873047, "geo/layer_7/stable_rank_gate_proj": 81.66986083984375, "geo/layer_7/stable_rank_down_proj": 140.74374389648438, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.45071274042129517, "geo/layer_7/attn_entropy_mean": 4.6693925857543945, "geo/layer_7/attn_entropy_std": 0.7925078868865967, "geo/layer_14/stable_rank_q_proj": 51.15693664550781, "geo/layer_14/stable_rank_k_proj": 40.2885856628418, "geo/layer_14/stable_rank_o_proj": 43.78221130371094, "geo/layer_14/stable_rank_gate_proj": 71.9407730102539, "geo/layer_14/stable_rank_down_proj": 130.11709594726562, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3986477553844452, "geo/layer_14/attn_entropy_mean": 5.549345016479492, "geo/layer_14/attn_entropy_std": 0.41289252042770386, "geo/layer_21/stable_rank_q_proj": 40.33255386352539, "geo/layer_21/stable_rank_k_proj": 30.348234176635742, "geo/layer_21/stable_rank_o_proj": 70.06991577148438, "geo/layer_21/stable_rank_gate_proj": 66.02123260498047, "geo/layer_21/stable_rank_down_proj": 51.317893981933594, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13979770243167877, "geo/layer_21/attn_entropy_mean": 5.701794624328613, "geo/layer_21/attn_entropy_std": 0.29832857847213745, "geo/layer_27/stable_rank_q_proj": 43.358680725097656, "geo/layer_27/stable_rank_k_proj": 31.82644271850586, "geo/layer_27/stable_rank_o_proj": 116.03527069091797, "geo/layer_27/stable_rank_gate_proj": 80.45673370361328, "geo/layer_27/stable_rank_down_proj": 128.53762817382812, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09956292808055878, "geo/layer_27/attn_entropy_mean": 4.236972808837891, "geo/layer_27/attn_entropy_std": 0.7151991724967957, "attnres/final_alpha/block_0": 0.23865526914596558, "attnres/block_norm/0": 1.7648401260375977, "attnres/final_alpha/block_1": 0.004428572952747345, "attnres/block_norm/1": 46801.5, "attnres/final_alpha/block_2": 0.010322092100977898, "attnres/block_norm/2": 28763.42578125, "attnres/final_alpha/block_3": 0.012293374165892601, "attnres/block_norm/3": 58336.9609375, "attnres/final_alpha/block_4": 0.014569100923836231, "attnres/block_norm/4": 15194.341796875, "attnres/final_alpha/block_5": 0.6114035844802856, "attnres/block_norm/5": 6676.828125, "attnres/final_alpha/block_6": 0.10832805931568146, "attnres/block_norm/6": 38709.1875, "geo/tier1_time_s": 1.3578128814697266, "geo/step": 63750.0, "geo/rankme_slope": 0.0001102097675007503} {"step": 63760, "timestamp": 1778263457.7316856, "train/loss": 2.098053753376007, "train/z_loss": 0.0014010599930770694, "train/perplexity": 8.150291988134741, "train/grad_norm": 0.1572265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1792016.995501771, "perf/iters_per_sec": 0.8545002915867668, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1702746152877808, "data/tokens_consumed": 133716508672, "data/tokens_consumed_B": 133.716508672, "train/loss_slope": 3.933336525180747e-06} {"step": 63770, "timestamp": 1778263468.0735397, "train/loss": 2.1441830158233643, "train/z_loss": 0.0013686190010048448, "train/perplexity": 8.535065374739728, "train/grad_norm": 0.1513671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028650.1183385171, "perf/iters_per_sec": 0.9673357574169718, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0337672233581543, "data/tokens_consumed": 133737480192, "data/tokens_consumed_B": 133.737480192, "train/loss_slope": 3.2748812066874778e-06} {"step": 63780, "timestamp": 1778263478.411697, "train/loss": 2.1657788515090943, "train/z_loss": 0.0013923304039053618, "train/perplexity": 8.721391941572596, "train/grad_norm": 0.31640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029593.453658464, "perf/iters_per_sec": 0.9677855747501678, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033286738395691, "data/tokens_consumed": 133758451712, "data/tokens_consumed_B": 133.758451712, "train/loss_slope": 5.067692724319963e-06} {"step": 63790, "timestamp": 1778263488.7505343, "train/loss": 2.1652140736579897, "train/z_loss": 0.0013690480147488416, "train/perplexity": 8.7164676832601, "train/grad_norm": 0.1494140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029462.1027589133, "perf/iters_per_sec": 0.9677229417604987, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033353614807129, "data/tokens_consumed": 133779423232, "data/tokens_consumed_B": 133.779423232, "train/loss_slope": 4.961423876762616e-06} {"step": 63800, "timestamp": 1778263499.5668106, "grad/layer_0/attn": 0.0032789716497063637, "grad/layer_0/mlp": 0.0033472420182079077, "grad/layer_0/attn_mlp_ratio": 0.9796039646698472, "grad/layer_4/attn": 0.002479046117514372, "grad/layer_4/mlp": 0.0025379059370607138, "grad/layer_4/attn_mlp_ratio": 0.9768076837019108, "grad/layer_8/attn": 0.003951071295887232, "grad/layer_8/mlp": 0.0037489519454538822, "grad/layer_8/attn_mlp_ratio": 1.0539135331641403, "grad/layer_12/attn": 0.004325018264353275, "grad/layer_12/mlp": 0.006630835589021444, "grad/layer_12/attn_mlp_ratio": 0.6522583980649876, "grad/layer_16/attn": 0.004121862351894379, "grad/layer_16/mlp": 0.004413952119648457, "grad/layer_16/attn_mlp_ratio": 0.9338257749021713, "grad/layer_20/attn": 0.0033461260609328747, "grad/layer_20/mlp": 0.006466826424002647, "grad/layer_20/attn_mlp_ratio": 0.5174293834098069, "grad/layer_24/attn": 0.010842983610928059, "grad/layer_24/mlp": 0.00817203801125288, "grad/layer_24/attn_mlp_ratio": 1.3268395794676107, "grad/layer_27/attn": 0.007353468798100948, "grad/layer_27/mlp": 0.007088222540915012, "grad/layer_27/attn_mlp_ratio": 1.0374206864856175} {"step": 63800, "timestamp": 1778263499.5810506, "train/loss": 2.1443618535995483, "train/z_loss": 0.001386570988688618, "train/perplexity": 8.536591903347354, "train/grad_norm": 0.11474609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1937572.1605738678, "perf/iters_per_sec": 0.9239064028615321, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0823607206344605, "data/tokens_consumed": 133800394752, "data/tokens_consumed_B": 133.800394752, "train/loss_slope": 4.414684608681457e-06} {"step": 63810, "timestamp": 1778263509.9277434, "train/loss": 2.1333935260772705, "train/z_loss": 0.0013744618860073387, "train/perplexity": 8.443471389140788, "train/grad_norm": 0.0888671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028308.0232657106, "perf/iters_per_sec": 0.9671726337746194, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0339415788650512, "data/tokens_consumed": 133821366272, "data/tokens_consumed_B": 133.821366272, "train/loss_slope": 5.900505173503147e-06} {"step": 63820, "timestamp": 1778263520.269149, "train/loss": 2.18332154750824, "train/z_loss": 0.0013737877481617034, "train/perplexity": 8.875738538529761, "train/grad_norm": 0.166015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029234.608578141, "perf/iters_per_sec": 0.9676144640818315, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0334694623947143, "data/tokens_consumed": 133842337792, "data/tokens_consumed_B": 133.842337792, "train/loss_slope": 6.9485651205653695e-06} {"step": 63825, "timestamp": 1778263526.0166655, "eos/sharpness": 73.46441745758055, "eos/L0_probe": 1.9701483249664307, "eos/L_plus": 2.387981653213501, "eos/L_minus": 2.286959171295166, "eos/grad_norm": 0.1826707273721695, "eos/embed_grad_frac": 0.06894183903932571, "eos/time_s": 0.5856506824493408} {"step": 63825, "timestamp": 1778263527.3957705, "geo/rankme_last": 437.79608154296875, "geo/layer_0/stable_rank_q_proj": 19.405803680419922, "geo/layer_0/stable_rank_k_proj": 16.152416229248047, "geo/layer_0/stable_rank_o_proj": 47.022159576416016, "geo/layer_0/stable_rank_gate_proj": 131.28118896484375, "geo/layer_0/stable_rank_down_proj": 54.927696228027344, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06505438685417175, "geo/layer_0/attn_entropy_mean": 6.162118911743164, "geo/layer_0/attn_entropy_std": 0.40960657596588135, "geo/layer_7/stable_rank_q_proj": 43.41538619995117, "geo/layer_7/stable_rank_k_proj": 41.16825485229492, "geo/layer_7/stable_rank_o_proj": 90.34635162353516, "geo/layer_7/stable_rank_gate_proj": 81.46561431884766, "geo/layer_7/stable_rank_down_proj": 140.97096252441406, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.449032187461853, "geo/layer_7/attn_entropy_mean": 4.65548038482666, "geo/layer_7/attn_entropy_std": 0.8072458505630493, "geo/layer_14/stable_rank_q_proj": 51.12575912475586, "geo/layer_14/stable_rank_k_proj": 40.27065658569336, "geo/layer_14/stable_rank_o_proj": 43.725704193115234, "geo/layer_14/stable_rank_gate_proj": 72.03474426269531, "geo/layer_14/stable_rank_down_proj": 129.9032440185547, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4065244197845459, "geo/layer_14/attn_entropy_mean": 5.525864124298096, "geo/layer_14/attn_entropy_std": 0.39323773980140686, "geo/layer_21/stable_rank_q_proj": 40.25425338745117, "geo/layer_21/stable_rank_k_proj": 30.37252426147461, "geo/layer_21/stable_rank_o_proj": 70.07300567626953, "geo/layer_21/stable_rank_gate_proj": 65.96479797363281, "geo/layer_21/stable_rank_down_proj": 51.329505920410156, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14133320748806, "geo/layer_21/attn_entropy_mean": 5.705406188964844, "geo/layer_21/attn_entropy_std": 0.29359257221221924, "geo/layer_27/stable_rank_q_proj": 43.4033203125, "geo/layer_27/stable_rank_k_proj": 31.81863784790039, "geo/layer_27/stable_rank_o_proj": 115.87879180908203, "geo/layer_27/stable_rank_gate_proj": 80.37671661376953, "geo/layer_27/stable_rank_down_proj": 128.43478393554688, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10524009168148041, "geo/layer_27/attn_entropy_mean": 4.2299299240112305, "geo/layer_27/attn_entropy_std": 0.7302926778793335, "attnres/final_alpha/block_0": 0.23625272512435913, "attnres/block_norm/0": 1.7648828029632568, "attnres/final_alpha/block_1": 0.00431668758392334, "attnres/block_norm/1": 46822.34375, "attnres/final_alpha/block_2": 0.010165629908442497, "attnres/block_norm/2": 28595.01953125, "attnres/final_alpha/block_3": 0.012128196656703949, "attnres/block_norm/3": 58229.7734375, "attnres/final_alpha/block_4": 0.014193600974977016, "attnres/block_norm/4": 15157.185546875, "attnres/final_alpha/block_5": 0.6164196133613586, "attnres/block_norm/5": 6651.7119140625, "attnres/final_alpha/block_6": 0.10652356594800949, "attnres/block_norm/6": 38896.3125, "geo/tier1_time_s": 1.3582496643066406, "geo/step": 63825.0, "geo/rankme_slope": 8.174553024334734e-05} {"step": 63830, "timestamp": 1778263532.5669923, "train/loss": 2.1893383502960204, "train/z_loss": 0.0013582553016021847, "train/perplexity": 8.929303088985403, "train/grad_norm": 0.205078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1705953.7801590713, "perf/iters_per_sec": 0.8134621525569302, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2293134927749634, "data/tokens_consumed": 133863309312, "data/tokens_consumed_B": 133.863309312, "train/loss_slope": 9.669643345445664e-06} {"step": 63840, "timestamp": 1778263542.9042456, "train/loss": 2.1433565616607666, "train/z_loss": 0.0013778306543827056, "train/perplexity": 8.528014448467745, "train/grad_norm": 0.248046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029710.911214032, "perf/iters_per_sec": 0.9678415828771744, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0332269430160523, "data/tokens_consumed": 133884280832, "data/tokens_consumed_B": 133.884280832, "train/loss_slope": 9.55158924505654e-06} {"step": 63850, "timestamp": 1778263553.2295198, "grad/layer_0/attn": 0.0027398450765758753, "grad/layer_0/mlp": 0.0027626855298876762, "grad/layer_0/attn_mlp_ratio": 0.9917324819499171, "grad/layer_4/attn": 0.0029943794943392277, "grad/layer_4/mlp": 0.0025651101022958755, "grad/layer_4/attn_mlp_ratio": 1.1673492591698942, "grad/layer_8/attn": 0.0035212435759603977, "grad/layer_8/mlp": 0.0035878941416740417, "grad/layer_8/attn_mlp_ratio": 0.9814234586573136, "grad/layer_12/attn": 0.004437919240444899, "grad/layer_12/mlp": 0.006440666038542986, "grad/layer_12/attn_mlp_ratio": 0.6890466211075567, "grad/layer_16/attn": 0.003774346550926566, "grad/layer_16/mlp": 0.004805125296115875, "grad/layer_16/attn_mlp_ratio": 0.7854834660459604, "grad/layer_20/attn": 0.004401704296469688, "grad/layer_20/mlp": 0.006202514749020338, "grad/layer_20/attn_mlp_ratio": 0.7096644512128689, "grad/layer_24/attn": 0.01287183165550232, "grad/layer_24/mlp": 0.010252141393721104, "grad/layer_24/attn_mlp_ratio": 1.2555261418685684, "grad/layer_27/attn": 0.007777618244290352, "grad/layer_27/mlp": 0.01013596449047327, "grad/layer_27/attn_mlp_ratio": 0.7673288688873763} {"step": 63850, "timestamp": 1778263553.244033, "train/loss": 2.1776012420654296, "train/z_loss": 0.0013850907213054597, "train/perplexity": 8.825111542051463, "train/grad_norm": 0.1904296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029130.8274973754, "perf/iters_per_sec": 0.967564977406204, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0335223197937011, "data/tokens_consumed": 133905252352, "data/tokens_consumed_B": 133.905252352, "train/loss_slope": 1.1742088231745671e-05} {"step": 63860, "timestamp": 1778263563.5823104, "train/loss": 2.164965510368347, "train/z_loss": 0.0013708180165849626, "train/perplexity": 8.71430135862423, "train/grad_norm": 0.185546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029899.5367761012, "perf/iters_per_sec": 0.9679315265541559, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033130931854248, "data/tokens_consumed": 133926223872, "data/tokens_consumed_B": 133.926223872, "train/loss_slope": 1.3400839221323699e-05} {"step": 63870, "timestamp": 1778263573.9240463, "train/loss": 2.091910421848297, "train/z_loss": 0.0013766184449195862, "train/perplexity": 8.100375526080882, "train/grad_norm": 0.1103515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028938.5071382427, "perf/iters_per_sec": 0.9674732719126905, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033620285987854, "data/tokens_consumed": 133947195392, "data/tokens_consumed_B": 133.947195392, "train/loss_slope": 7.13093693536071e-06} {"step": 63880, "timestamp": 1778263584.2701669, "train/loss": 2.1317368745803833, "train/z_loss": 0.001364414719864726, "train/perplexity": 8.429495079758464, "train/grad_norm": 0.10009765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028111.603478954, "perf/iters_per_sec": 0.9670789735216875, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340417146682739, "data/tokens_consumed": 133968166912, "data/tokens_consumed_B": 133.968166912, "train/loss_slope": 5.565079146235547e-06} {"step": 63890, "timestamp": 1778263594.6179023, "train/loss": 2.1291765451431273, "train/z_loss": 0.0013590985094197095, "train/perplexity": 8.40794040067899, "train/grad_norm": 0.1328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028142.7942707, "perf/iters_per_sec": 0.9670938464501858, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340258121490478, "data/tokens_consumed": 133989138432, "data/tokens_consumed_B": 133.989138432, "train/loss_slope": 7.054929721831132e-06} {"step": 63900, "timestamp": 1778263604.9475315, "grad/layer_0/attn": 0.002817522268742323, "grad/layer_0/mlp": 0.0030066040344536304, "grad/layer_0/attn_mlp_ratio": 0.9371111535620674, "grad/layer_4/attn": 0.002141256583854556, "grad/layer_4/mlp": 0.002574556041508913, "grad/layer_4/attn_mlp_ratio": 0.8316993167605171, "grad/layer_8/attn": 0.005173696205019951, "grad/layer_8/mlp": 0.0037486725486814976, "grad/layer_8/attn_mlp_ratio": 1.3801408364743903, "grad/layer_12/attn": 0.005597071722149849, "grad/layer_12/mlp": 0.006281740497797728, "grad/layer_12/attn_mlp_ratio": 0.8910065028969978, "grad/layer_16/attn": 0.004861438646912575, "grad/layer_16/mlp": 0.0046279882080852985, "grad/layer_16/attn_mlp_ratio": 1.0504431565696544, "grad/layer_20/attn": 0.002975938841700554, "grad/layer_20/mlp": 0.0062466817907989025, "grad/layer_20/attn_mlp_ratio": 0.4764031358926706, "grad/layer_24/attn": 0.009154172614216805, "grad/layer_24/mlp": 0.007862146012485027, "grad/layer_24/attn_mlp_ratio": 1.1643350916208552, "grad/layer_27/attn": 0.0046393717639148235, "grad/layer_27/mlp": 0.006259570829570293, "grad/layer_27/attn_mlp_ratio": 0.7411645009082607} {"step": 63900, "timestamp": 1778263605.5322301, "eos/sharpness": 8.022999763488768, "eos/L0_probe": 1.971625804901123, "eos/L_plus": 2.0179331302642822, "eos/L_minus": 2.0055484771728516, "eos/grad_norm": 0.09788639843463898, "eos/embed_grad_frac": 0.23448556661605835, "eos/time_s": 0.5817551612854004} {"step": 63900, "timestamp": 1778263605.550415, "train/loss": 2.1495245695114136, "train/z_loss": 0.0013656944618560373, "train/perplexity": 8.58077786383752, "train/grad_norm": 0.09814453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1919511.7312673589, "perf/iters_per_sec": 0.915294519075088, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0925445079803466, "data/tokens_consumed": 134010109952, "data/tokens_consumed_B": 134.010109952, "train/loss_slope": 8.508851053905751e-06} {"step": 63900, "timestamp": 1778263606.912515, "geo/rankme_last": 438.9637756347656, "geo/layer_0/stable_rank_q_proj": 19.407217025756836, "geo/layer_0/stable_rank_k_proj": 16.16033935546875, "geo/layer_0/stable_rank_o_proj": 47.0595817565918, "geo/layer_0/stable_rank_gate_proj": 131.18724060058594, "geo/layer_0/stable_rank_down_proj": 54.90510940551758, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06342203170061111, "geo/layer_0/attn_entropy_mean": 6.163112163543701, "geo/layer_0/attn_entropy_std": 0.4121069610118866, "geo/layer_7/stable_rank_q_proj": 43.364192962646484, "geo/layer_7/stable_rank_k_proj": 41.177764892578125, "geo/layer_7/stable_rank_o_proj": 90.37036895751953, "geo/layer_7/stable_rank_gate_proj": 81.4267807006836, "geo/layer_7/stable_rank_down_proj": 140.9384307861328, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.46777430176734924, "geo/layer_7/attn_entropy_mean": 4.673028469085693, "geo/layer_7/attn_entropy_std": 0.81462162733078, "geo/layer_14/stable_rank_q_proj": 51.1210823059082, "geo/layer_14/stable_rank_k_proj": 40.245277404785156, "geo/layer_14/stable_rank_o_proj": 43.69791793823242, "geo/layer_14/stable_rank_gate_proj": 72.04000091552734, "geo/layer_14/stable_rank_down_proj": 130.06155395507812, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39289531111717224, "geo/layer_14/attn_entropy_mean": 5.503726005554199, "geo/layer_14/attn_entropy_std": 0.4123123288154602, "geo/layer_21/stable_rank_q_proj": 40.337642669677734, "geo/layer_21/stable_rank_k_proj": 30.282644271850586, "geo/layer_21/stable_rank_o_proj": 69.97798156738281, "geo/layer_21/stable_rank_gate_proj": 65.76493835449219, "geo/layer_21/stable_rank_down_proj": 51.27140808105469, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14254333078861237, "geo/layer_21/attn_entropy_mean": 5.682590484619141, "geo/layer_21/attn_entropy_std": 0.2984175980091095, "geo/layer_27/stable_rank_q_proj": 43.38132095336914, "geo/layer_27/stable_rank_k_proj": 31.828214645385742, "geo/layer_27/stable_rank_o_proj": 115.79901885986328, "geo/layer_27/stable_rank_gate_proj": 80.30188751220703, "geo/layer_27/stable_rank_down_proj": 128.2793731689453, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09512202441692352, "geo/layer_27/attn_entropy_mean": 4.196938991546631, "geo/layer_27/attn_entropy_std": 0.756144642829895, "attnres/final_alpha/block_0": 0.23735696077346802, "attnres/block_norm/0": 1.7649049758911133, "attnres/final_alpha/block_1": 0.00433953944593668, "attnres/block_norm/1": 46738.796875, "attnres/final_alpha/block_2": 0.010252796113491058, "attnres/block_norm/2": 28735.505859375, "attnres/final_alpha/block_3": 0.012126199901103973, "attnres/block_norm/3": 58279.6484375, "attnres/final_alpha/block_4": 0.014443900436162949, "attnres/block_norm/4": 15206.90625, "attnres/final_alpha/block_5": 0.6134176850318909, "attnres/block_norm/5": 6667.3154296875, "attnres/final_alpha/block_6": 0.10806293785572052, "attnres/block_norm/6": 38783.26171875, "geo/tier1_time_s": 1.357337474822998, "geo/step": 63900.0, "geo/rankme_slope": 7.800757021558623e-05} {"step": 63910, "timestamp": 1778263617.2691941, "train/loss": 2.1712493181228636, "train/z_loss": 0.0013670531334355474, "train/perplexity": 8.7692327615117, "train/grad_norm": 0.1298828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1790190.7300662915, "perf/iters_per_sec": 0.8536294603663881, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.171468472480774, "data/tokens_consumed": 134031081472, "data/tokens_consumed_B": 134.031081472, "train/loss_slope": 1.0437821669511317e-05} {"step": 63920, "timestamp": 1778263627.616061, "train/loss": 2.165817952156067, "train/z_loss": 0.0013777512940578163, "train/perplexity": 8.721732960306998, "train/grad_norm": 0.09375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028122.966729774, "perf/iters_per_sec": 0.9670843919419164, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340359210968018, "data/tokens_consumed": 134052052992, "data/tokens_consumed_B": 134.052052992, "train/loss_slope": 1.1980601513024544e-05} {"step": 63930, "timestamp": 1778263637.9581926, "train/loss": 2.1914883136749266, "train/z_loss": 0.0013629180495627225, "train/perplexity": 8.948521415572252, "train/grad_norm": 0.10595703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029241.3498003436, "perf/iters_per_sec": 0.9676176785470694, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0334660291671753, "data/tokens_consumed": 134073024512, "data/tokens_consumed_B": 134.073024512, "train/loss_slope": 1.5484927346532558e-05} {"step": 63940, "timestamp": 1778263648.301047, "train/loss": 2.148123061656952, "train/z_loss": 0.0013893442577682436, "train/perplexity": 8.568760259614422, "train/grad_norm": 0.1435546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028569.2269736247, "perf/iters_per_sec": 0.9672971854084133, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033808445930481, "data/tokens_consumed": 134093996032, "data/tokens_consumed_B": 134.093996032, "train/loss_slope": 1.3751179450201338e-05} {"step": 63950, "timestamp": 1778263658.6307278, "grad/layer_0/attn": 0.002899389248341322, "grad/layer_0/mlp": 0.003069654805585742, "grad/layer_0/attn_mlp_ratio": 0.9445326388531188, "grad/layer_4/attn": 0.0022387623321264982, "grad/layer_4/mlp": 0.002666894346475601, "grad/layer_4/attn_mlp_ratio": 0.8394641696768707, "grad/layer_8/attn": 0.003950376994907856, "grad/layer_8/mlp": 0.0038459021598100662, "grad/layer_8/attn_mlp_ratio": 1.027165208068223, "grad/layer_12/attn": 0.0051542953588068485, "grad/layer_12/mlp": 0.007127385586500168, "grad/layer_12/attn_mlp_ratio": 0.7231677343586852, "grad/layer_16/attn": 0.003907681442797184, "grad/layer_16/mlp": 0.004407161846756935, "grad/layer_16/attn_mlp_ratio": 0.8866661788257407, "grad/layer_20/attn": 0.006166623439639807, "grad/layer_20/mlp": 0.005508142523467541, "grad/layer_20/attn_mlp_ratio": 1.1195467984737355, "grad/layer_24/attn": 0.012212651781737804, "grad/layer_24/mlp": 0.009314033202826977, "grad/layer_24/attn_mlp_ratio": 1.3112098040309825, "grad/layer_27/attn": 0.003632661420851946, "grad/layer_27/mlp": 0.007468360010534525, "grad/layer_27/attn_mlp_ratio": 0.48640683725572914} {"step": 63950, "timestamp": 1778263658.6448262, "train/loss": 2.1568889737129213, "train/z_loss": 0.0013833431527018546, "train/perplexity": 8.6442034395242, "train/grad_norm": 0.1181640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028518.7023844228, "perf/iters_per_sec": 0.9672730934068788, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033834195137024, "data/tokens_consumed": 134114967552, "data/tokens_consumed_B": 134.114967552, "train/loss_slope": 1.6119440704217527e-05} {"step": 63960, "timestamp": 1778263668.9859116, "train/loss": 2.141757535934448, "train/z_loss": 0.0013856558362022042, "train/perplexity": 8.514388830730137, "train/grad_norm": 0.13671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029366.7727900138, "perf/iters_per_sec": 0.967677484889037, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033402156829834, "data/tokens_consumed": 134135939072, "data/tokens_consumed_B": 134.135939072, "train/loss_slope": 1.648530927177953e-05} {"step": 63970, "timestamp": 1778263679.323405, "train/loss": 2.1650843620300293, "train/z_loss": 0.001383161568082869, "train/perplexity": 8.715337129371418, "train/grad_norm": 0.1943359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029648.0594275405, "perf/iters_per_sec": 0.9678116128099158, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0332589387893676, "data/tokens_consumed": 134156910592, "data/tokens_consumed_B": 134.156910592, "train/loss_slope": 1.591650486135498e-05} {"step": 63975, "timestamp": 1778263685.4658124, "eos/sharpness": 45.72856426239013, "eos/L0_probe": 1.9713672399520874, "eos/L_plus": 2.1827149391174316, "eos/L_minus": 2.2173051834106445, "eos/grad_norm": 0.12842068076133728, "eos/embed_grad_frac": 0.11987733095884323, "eos/time_s": 0.5836453437805176} {"step": 63975, "timestamp": 1778263686.8421223, "geo/rankme_last": 439.0962829589844, "geo/layer_0/stable_rank_q_proj": 19.40459442138672, "geo/layer_0/stable_rank_k_proj": 16.136123657226562, "geo/layer_0/stable_rank_o_proj": 47.01350402832031, "geo/layer_0/stable_rank_gate_proj": 130.8892364501953, "geo/layer_0/stable_rank_down_proj": 54.80089569091797, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06256233155727386, "geo/layer_0/attn_entropy_mean": 6.161620140075684, "geo/layer_0/attn_entropy_std": 0.4112846255302429, "geo/layer_7/stable_rank_q_proj": 43.35300827026367, "geo/layer_7/stable_rank_k_proj": 41.16950225830078, "geo/layer_7/stable_rank_o_proj": 90.33574676513672, "geo/layer_7/stable_rank_gate_proj": 81.49116516113281, "geo/layer_7/stable_rank_down_proj": 140.63185119628906, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.45331883430480957, "geo/layer_7/attn_entropy_mean": 4.666004180908203, "geo/layer_7/attn_entropy_std": 0.7970272302627563, "geo/layer_14/stable_rank_q_proj": 51.14589309692383, "geo/layer_14/stable_rank_k_proj": 40.219024658203125, "geo/layer_14/stable_rank_o_proj": 43.6707878112793, "geo/layer_14/stable_rank_gate_proj": 72.06179809570312, "geo/layer_14/stable_rank_down_proj": 129.85977172851562, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4089573323726654, "geo/layer_14/attn_entropy_mean": 5.523214340209961, "geo/layer_14/attn_entropy_std": 0.404163122177124, "geo/layer_21/stable_rank_q_proj": 40.321136474609375, "geo/layer_21/stable_rank_k_proj": 30.297748565673828, "geo/layer_21/stable_rank_o_proj": 69.98406219482422, "geo/layer_21/stable_rank_gate_proj": 65.77528381347656, "geo/layer_21/stable_rank_down_proj": 51.35083770751953, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.145661860704422, "geo/layer_21/attn_entropy_mean": 5.696091651916504, "geo/layer_21/attn_entropy_std": 0.3056776523590088, "geo/layer_27/stable_rank_q_proj": 43.34488296508789, "geo/layer_27/stable_rank_k_proj": 31.852928161621094, "geo/layer_27/stable_rank_o_proj": 115.73046112060547, "geo/layer_27/stable_rank_gate_proj": 80.22711181640625, "geo/layer_27/stable_rank_down_proj": 128.5164794921875, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09018460661172867, "geo/layer_27/attn_entropy_mean": 4.192225456237793, "geo/layer_27/attn_entropy_std": 0.730772078037262, "attnres/final_alpha/block_0": 0.24008099734783173, "attnres/block_norm/0": 1.7650842666625977, "attnres/final_alpha/block_1": 0.004422971047461033, "attnres/block_norm/1": 46750.5546875, "attnres/final_alpha/block_2": 0.010270887054502964, "attnres/block_norm/2": 28593.85546875, "attnres/final_alpha/block_3": 0.012271815910935402, "attnres/block_norm/3": 58253.46875, "attnres/final_alpha/block_4": 0.014629114419221878, "attnres/block_norm/4": 15222.294921875, "attnres/final_alpha/block_5": 0.607853889465332, "attnres/block_norm/5": 6645.0185546875, "attnres/final_alpha/block_6": 0.11047036200761795, "attnres/block_norm/6": 38757.1171875, "geo/tier1_time_s": 1.3567345142364502, "geo/step": 63975.0, "geo/rankme_slope": 8.602177199004602e-05} {"step": 63980, "timestamp": 1778263692.5535104, "train/loss": 2.182615780830383, "train/z_loss": 0.001367034378927201, "train/perplexity": 8.86947654803994, "train/grad_norm": 0.25390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1586101.9524603966, "perf/iters_per_sec": 0.7563123476316436, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.3222050428390504, "data/tokens_consumed": 134177882112, "data/tokens_consumed_B": 134.177882112, "train/loss_slope": 1.4750774107714734e-05} {"step": 63990, "timestamp": 1778263702.9102552, "train/loss": 2.1547771692276, "train/z_loss": 0.0013843228691257537, "train/perplexity": 8.625967833722784, "train/grad_norm": 0.095703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026898.1990563963, "perf/iters_per_sec": 0.966500377205084, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346607446670533, "data/tokens_consumed": 134198853632, "data/tokens_consumed_B": 134.198853632, "train/loss_slope": 1.4806978172487672e-05} {"step": 64000, "timestamp": 1778263713.243916, "grad/layer_0/attn": 0.002667366759851575, "grad/layer_0/mlp": 0.0030168285593390465, "grad/layer_0/attn_mlp_ratio": 0.8841624967975361, "grad/layer_4/attn": 0.0019726352766156197, "grad/layer_4/mlp": 0.0026878397911787033, "grad/layer_4/attn_mlp_ratio": 0.7339110052982197, "grad/layer_8/attn": 0.003501706290990114, "grad/layer_8/mlp": 0.0036845372524112463, "grad/layer_8/attn_mlp_ratio": 0.9503788280768865, "grad/layer_12/attn": 0.004159216769039631, "grad/layer_12/mlp": 0.006931421812623739, "grad/layer_12/attn_mlp_ratio": 0.6000524598660955, "grad/layer_16/attn": 0.0035130674950778484, "grad/layer_16/mlp": 0.004805453587323427, "grad/layer_16/attn_mlp_ratio": 0.7310584439394691, "grad/layer_20/attn": 0.004075079690665007, "grad/layer_20/mlp": 0.005994410254061222, "grad/layer_20/attn_mlp_ratio": 0.6798132676893123, "grad/layer_24/attn": 0.004624919034540653, "grad/layer_24/mlp": 0.007809619419276714, "grad/layer_24/attn_mlp_ratio": 0.5922079844126631, "grad/layer_27/attn": 0.004583333153277636, "grad/layer_27/mlp": 0.006375872064381838, "grad/layer_27/attn_mlp_ratio": 0.718855873378698} {"step": 64000, "timestamp": 1778263713.2579215, "train/loss": 2.1715861320495606, "train/z_loss": 0.001378547353670001, "train/perplexity": 8.772186858694834, "train/grad_norm": 0.103515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027878.1472545804, "perf/iters_per_sec": 0.9669676529190924, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341607570648192, "data/tokens_consumed": 134219825152, "data/tokens_consumed_B": 134.219825152, "train/loss_slope": 1.3503949834604455e-05} {"step": 64000, "timestamp": 1778263720.5270534, "geo/ww_alpha_mean": 7.88751615058026, "geo/ww_alpha_std": 5.838216780440033, "geo/ww_alpha_min": 1.3464106691036493, "geo/ww_alpha_max": 49.86518364937001, "geo/ww_alpha_healthy_frac": 0.18781725888324874, "geo/ww_alpha_by_type/q_proj": 3.9830007716253566, "geo/ww_alpha_by_type/k_proj": 4.388254013352136, "geo/ww_alpha_by_type/v_proj": 8.184359473068062, "geo/ww_alpha_by_type/o_proj": 9.22473942944267, "geo/ww_alpha_by_type/gate_proj": 7.971673417186401, "geo/ww_alpha_by_type/up_proj": 13.347007489303115, "geo/ww_alpha_by_type/down_proj": 8.220376199954561, "geo/twonn_id/layer_0": 0.6600958704948425, "geo/twonn_id/layer_7": 3.3591344356536865, "geo/twonn_id/layer_14": 4.691245079040527, "geo/twonn_id/layer_21": 7.983675003051758, "geo/twonn_id/layer_27": 5.834877967834473, "geo/tier2_time_s": 7.260799169540405} {"step": 64000, "timestamp": 1778263721.314944, "eoc/jacobian_sigma/layer_0/attn": 1212.5987548828125, "eoc/jacobian_sigma/layer_0/mlp": 9349.4560546875, "eoc/jacobian_sigma/layer_0": 9349.4560546875, "eoc/jacobian_sigma/layer_7/attn": 1.1580147743225098, "eoc/jacobian_sigma/layer_7/mlp": 1.76527738571167, "eoc/jacobian_sigma/layer_7": 1.76527738571167, "eoc/jacobian_sigma/layer_14/attn": 1.4862726926803589, "eoc/jacobian_sigma/layer_14/mlp": 6.247786521911621, "eoc/jacobian_sigma/layer_14": 6.247786521911621, "eoc/jacobian_sigma/layer_21/attn": 1.0872689485549927, "eoc/jacobian_sigma/layer_21/mlp": 4.391845226287842, "eoc/jacobian_sigma/layer_21": 4.391845226287842, "eoc/jacobian_sigma/layer_27/attn": 3.2972185611724854, "eoc/jacobian_sigma/layer_27/mlp": 28.746103286743164, "eoc/jacobian_sigma/layer_27": 28.746103286743164, "eoc/layer0_sigma": 9349.4560546875, "eoc/sigma_max": 28.746103286743164, "eoc/sigma_min": 1.76527738571167, "eoc/sigma_mean": 10.287753105163574, "eoc/time_s": 0.778616189956665} {"step": 64010, "timestamp": 1778263731.693126, "train/loss": 2.073840320110321, "train/z_loss": 0.0013783377245999873, "train/perplexity": 7.955315488195966, "train/grad_norm": 0.1630859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1138039.087555889, "perf/iters_per_sec": 0.5426593244342275, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.8427767753601074, "data/tokens_consumed": 134240796672, "data/tokens_consumed_B": 134.240796672, "train/loss_slope": 7.40476647476304e-06} {"step": 64020, "timestamp": 1778263742.0546553, "train/loss": 2.1247186779975893, "train/z_loss": 0.0013867054134607316, "train/perplexity": 8.370542339133074, "train/grad_norm": 0.1865234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025019.505733788, "perf/iters_per_sec": 0.9656045464199963, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035620641708374, "data/tokens_consumed": 134261768192, "data/tokens_consumed_B": 134.261768192, "train/loss_slope": 3.582747555551648e-06} {"step": 64030, "timestamp": 1778263752.3999074, "train/loss": 2.1410288333892824, "train/z_loss": 0.001391883788164705, "train/perplexity": 8.508186633971388, "train/grad_norm": 0.2138671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028606.7477877846, "perf/iters_per_sec": 0.9673150767268107, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033789324760437, "data/tokens_consumed": 134282739712, "data/tokens_consumed_B": 134.282739712, "train/loss_slope": 3.102473121057125e-06} {"step": 64040, "timestamp": 1778263763.2579832, "train/loss": 2.1038002371788025, "train/z_loss": 0.001385128207039088, "train/perplexity": 8.197262336952159, "train/grad_norm": 0.08984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1932786.4624040313, "perf/iters_per_sec": 0.9216244041462094, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0850407123565673, "data/tokens_consumed": 134303711232, "data/tokens_consumed_B": 134.303711232, "train/loss_slope": -3.974999603193203e-07} {"step": 64050, "timestamp": 1778263774.0086524, "grad/layer_0/attn": 0.0063928887248039246, "grad/layer_0/mlp": 0.005714872386306524, "grad/layer_0/attn_mlp_ratio": 1.1186406591086684, "grad/layer_4/attn": 0.00664933817461133, "grad/layer_4/mlp": 0.003670142497867346, "grad/layer_4/attn_mlp_ratio": 1.811738371821068, "grad/layer_8/attn": 0.0046108197420835495, "grad/layer_8/mlp": 0.003921668976545334, "grad/layer_8/attn_mlp_ratio": 1.1757289185005129, "grad/layer_12/attn": 0.008097252808511257, "grad/layer_12/mlp": 0.009399949572980404, "grad/layer_12/attn_mlp_ratio": 0.8614144852058442, "grad/layer_16/attn": 0.005546264350414276, "grad/layer_16/mlp": 0.0061419131234288216, "grad/layer_16/attn_mlp_ratio": 0.9030190021665571, "grad/layer_20/attn": 0.004914147313684225, "grad/layer_20/mlp": 0.008020554669201374, "grad/layer_20/attn_mlp_ratio": 0.6126941907502913, "grad/layer_24/attn": 0.010331548750400543, "grad/layer_24/mlp": 0.010862010531127453, "grad/layer_24/attn_mlp_ratio": 0.9511635645792175, "grad/layer_27/attn": 0.0092018386349082, "grad/layer_27/mlp": 0.010292246006429195, "grad/layer_27/attn_mlp_ratio": 0.8940554413249157} {"step": 64050, "timestamp": 1778263774.613463, "eos/sharpness": 61.21573448181151, "eos/L0_probe": 1.9722282886505127, "eos/L_plus": 2.257143020629883, "eos/L_minus": 2.299470901489258, "eos/grad_norm": 0.1738092005252838, "eos/embed_grad_frac": 0.1361122876405716, "eos/time_s": 0.6018764972686768} {"step": 64050, "timestamp": 1778263774.6341043, "train/loss": 2.1868797540664673, "train/z_loss": 0.001371313282288611, "train/perplexity": 8.907376503433529, "train/grad_norm": 0.173828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1844772.3963157542, "perf/iters_per_sec": 0.8796560269907733, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1368079900741577, "data/tokens_consumed": 134324682752, "data/tokens_consumed_B": 134.324682752, "train/loss_slope": 1.4701042166709403e-06} {"step": 64050, "timestamp": 1778263775.9946375, "geo/rankme_last": 438.3820495605469, "geo/layer_0/stable_rank_q_proj": 19.410484313964844, "geo/layer_0/stable_rank_k_proj": 16.110868453979492, "geo/layer_0/stable_rank_o_proj": 47.054046630859375, "geo/layer_0/stable_rank_gate_proj": 131.08407592773438, "geo/layer_0/stable_rank_down_proj": 54.85215759277344, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06391879171133041, "geo/layer_0/attn_entropy_mean": 6.159019947052002, "geo/layer_0/attn_entropy_std": 0.4168010950088501, "geo/layer_7/stable_rank_q_proj": 43.34161376953125, "geo/layer_7/stable_rank_k_proj": 41.13648223876953, "geo/layer_7/stable_rank_o_proj": 90.21446990966797, "geo/layer_7/stable_rank_gate_proj": 81.39502716064453, "geo/layer_7/stable_rank_down_proj": 140.80577087402344, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4449896812438965, "geo/layer_7/attn_entropy_mean": 4.6397905349731445, "geo/layer_7/attn_entropy_std": 0.8045546412467957, "geo/layer_14/stable_rank_q_proj": 51.05366897583008, "geo/layer_14/stable_rank_k_proj": 40.19267272949219, "geo/layer_14/stable_rank_o_proj": 43.64897155761719, "geo/layer_14/stable_rank_gate_proj": 72.12256622314453, "geo/layer_14/stable_rank_down_proj": 130.06016540527344, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3933505117893219, "geo/layer_14/attn_entropy_mean": 5.5405592918396, "geo/layer_14/attn_entropy_std": 0.4154857397079468, "geo/layer_21/stable_rank_q_proj": 40.37012481689453, "geo/layer_21/stable_rank_k_proj": 30.243284225463867, "geo/layer_21/stable_rank_o_proj": 69.95610809326172, "geo/layer_21/stable_rank_gate_proj": 65.8798599243164, "geo/layer_21/stable_rank_down_proj": 51.295650482177734, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15003912150859833, "geo/layer_21/attn_entropy_mean": 5.671935081481934, "geo/layer_21/attn_entropy_std": 0.306571364402771, "geo/layer_27/stable_rank_q_proj": 43.37034606933594, "geo/layer_27/stable_rank_k_proj": 31.903684616088867, "geo/layer_27/stable_rank_o_proj": 115.7169418334961, "geo/layer_27/stable_rank_gate_proj": 80.19068908691406, "geo/layer_27/stable_rank_down_proj": 128.58236694335938, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09178795665502548, "geo/layer_27/attn_entropy_mean": 4.197325706481934, "geo/layer_27/attn_entropy_std": 0.7264896035194397, "attnres/final_alpha/block_0": 0.23828855156898499, "attnres/block_norm/0": 1.7651653289794922, "attnres/final_alpha/block_1": 0.004342114552855492, "attnres/block_norm/1": 46604.55859375, "attnres/final_alpha/block_2": 0.010332472622394562, "attnres/block_norm/2": 28592.44921875, "attnres/final_alpha/block_3": 0.01222163625061512, "attnres/block_norm/3": 58802.58984375, "attnres/final_alpha/block_4": 0.014626218006014824, "attnres/block_norm/4": 15237.2119140625, "attnres/final_alpha/block_5": 0.6109946370124817, "attnres/block_norm/5": 6708.5126953125, "attnres/final_alpha/block_6": 0.10919435322284698, "attnres/block_norm/6": 38927.640625, "geo/tier1_time_s": 1.3573215007781982, "geo/step": 64050.0, "geo/rankme_slope": 6.215751925770309e-05} {"step": 64060, "timestamp": 1778263786.342555, "train/loss": 2.1481428384780883, "train/z_loss": 0.00136573005001992, "train/perplexity": 8.568929724129166, "train/grad_norm": 0.1337890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1791651.6925415627, "perf/iters_per_sec": 0.8543261015613378, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1705132246017456, "data/tokens_consumed": 134345654272, "data/tokens_consumed_B": 134.345654272, "train/loss_slope": 2.5239638941731166e-06} {"step": 64070, "timestamp": 1778263796.683465, "train/loss": 2.1263543367385864, "train/z_loss": 0.0013776190811768175, "train/perplexity": 8.384244893173147, "train/grad_norm": 0.3125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029077.7473673595, "perf/iters_per_sec": 0.9675396668278501, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0335493564605713, "data/tokens_consumed": 134366625792, "data/tokens_consumed_B": 134.366625792, "train/loss_slope": 1.7725860110425388e-06} {"step": 64080, "timestamp": 1778263807.0253904, "train/loss": 2.137201118469238, "train/z_loss": 0.0013803940382786095, "train/perplexity": 8.475681970030147, "train/grad_norm": 0.162109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028720.722263067, "perf/iters_per_sec": 0.9673694239917121, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0337312459945678, "data/tokens_consumed": 134387597312, "data/tokens_consumed_B": 134.387597312, "train/loss_slope": -4.5006016943855054e-07} {"step": 64090, "timestamp": 1778263817.3669152, "train/loss": 2.1551928997039793, "train/z_loss": 0.0013648163760080934, "train/perplexity": 8.629554656963737, "train/grad_norm": 0.09521484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029325.4784028851, "perf/iters_per_sec": 0.9676577941908289, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0334231853485107, "data/tokens_consumed": 134408568832, "data/tokens_consumed_B": 134.408568832, "train/loss_slope": -2.085415844154738e-06} {"step": 64100, "timestamp": 1778263827.7092025, "grad/layer_0/attn": 0.00399016123265028, "grad/layer_0/mlp": 0.0035378297325223684, "grad/layer_0/attn_mlp_ratio": 1.1278556124915178, "grad/layer_4/attn": 0.0023478043731302023, "grad/layer_4/mlp": 0.002620868617668748, "grad/layer_4/attn_mlp_ratio": 0.8958115136795424, "grad/layer_8/attn": 0.0035914576146751642, "grad/layer_8/mlp": 0.003826517378911376, "grad/layer_8/attn_mlp_ratio": 0.9385708113103707, "grad/layer_12/attn": 0.006833635736256838, "grad/layer_12/mlp": 0.007632169406861067, "grad/layer_12/attn_mlp_ratio": 0.8953726368516353, "grad/layer_16/attn": 0.004969130270183086, "grad/layer_16/mlp": 0.005934200715273619, "grad/layer_16/attn_mlp_ratio": 0.8373714380196563, "grad/layer_20/attn": 0.006544820033013821, "grad/layer_20/mlp": 0.008333933539688587, "grad/layer_20/attn_mlp_ratio": 0.7853218319192641, "grad/layer_24/attn": 0.02847868576645851, "grad/layer_24/mlp": 0.01899411715567112, "grad/layer_24/attn_mlp_ratio": 1.499342421820396, "grad/layer_27/attn": 0.014046750031411648, "grad/layer_27/mlp": 0.018749529495835304, "grad/layer_27/attn_mlp_ratio": 0.7491787972393584} {"step": 64100, "timestamp": 1778263827.72352, "train/loss": 2.134771990776062, "train/z_loss": 0.001393207092769444, "train/perplexity": 8.455118442067349, "train/grad_norm": 0.416015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025987.4182370936, "perf/iters_per_sec": 0.9660660830674618, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351258754730224, "data/tokens_consumed": 134429540352, "data/tokens_consumed_B": 134.429540352, "train/loss_slope": -2.9235472666262112e-06} {"step": 64110, "timestamp": 1778263838.069364, "train/loss": 2.1281306982040404, "train/z_loss": 0.0013899000827223063, "train/perplexity": 8.399151578629333, "train/grad_norm": 0.2177734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027948.6974539806, "perf/iters_per_sec": 0.967001293875685, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034124779701233, "data/tokens_consumed": 134450511872, "data/tokens_consumed_B": 134.450511872, "train/loss_slope": -4.244822987986508e-06} {"step": 64120, "timestamp": 1778263848.4197571, "train/loss": 2.1333625078201295, "train/z_loss": 0.0013757884153164923, "train/perplexity": 8.443209491435903, "train/grad_norm": 0.12255859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027606.8388841825, "perf/iters_per_sec": 0.9668382830067551, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342991352081299, "data/tokens_consumed": 134471483392, "data/tokens_consumed_B": 134.471483392, "train/loss_slope": 2.222907603985246e-07} {"step": 64125, "timestamp": 1778263854.172043, "eos/sharpness": 81.22465610504149, "eos/L0_probe": 1.9792567491531372, "eos/L_plus": 2.462714672088623, "eos/L_minus": 2.3080453872680664, "eos/grad_norm": 0.21296744048595428, "eos/embed_grad_frac": 0.05552360787987709, "eos/time_s": 0.589202880859375} {"step": 64125, "timestamp": 1778263855.547553, "geo/rankme_last": 438.6950988769531, "geo/layer_0/stable_rank_q_proj": 19.36284637451172, "geo/layer_0/stable_rank_k_proj": 16.07713508605957, "geo/layer_0/stable_rank_o_proj": 47.0075569152832, "geo/layer_0/stable_rank_gate_proj": 131.32553100585938, "geo/layer_0/stable_rank_down_proj": 54.85808181762695, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06559795141220093, "geo/layer_0/attn_entropy_mean": 6.159600734710693, "geo/layer_0/attn_entropy_std": 0.4174177944660187, "geo/layer_7/stable_rank_q_proj": 43.344520568847656, "geo/layer_7/stable_rank_k_proj": 41.200225830078125, "geo/layer_7/stable_rank_o_proj": 90.25637817382812, "geo/layer_7/stable_rank_gate_proj": 81.39382934570312, "geo/layer_7/stable_rank_down_proj": 140.68521118164062, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4502694010734558, "geo/layer_7/attn_entropy_mean": 4.659060478210449, "geo/layer_7/attn_entropy_std": 0.799234926700592, "geo/layer_14/stable_rank_q_proj": 51.04225540161133, "geo/layer_14/stable_rank_k_proj": 40.19853591918945, "geo/layer_14/stable_rank_o_proj": 43.61970138549805, "geo/layer_14/stable_rank_gate_proj": 72.09712982177734, "geo/layer_14/stable_rank_down_proj": 129.9744110107422, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3784460723400116, "geo/layer_14/attn_entropy_mean": 5.556829452514648, "geo/layer_14/attn_entropy_std": 0.40918076038360596, "geo/layer_21/stable_rank_q_proj": 40.48183059692383, "geo/layer_21/stable_rank_k_proj": 30.18989372253418, "geo/layer_21/stable_rank_o_proj": 69.87568664550781, "geo/layer_21/stable_rank_gate_proj": 65.83758544921875, "geo/layer_21/stable_rank_down_proj": 51.19404220581055, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14295423030853271, "geo/layer_21/attn_entropy_mean": 5.701229095458984, "geo/layer_21/attn_entropy_std": 0.29670029878616333, "geo/layer_27/stable_rank_q_proj": 43.3394660949707, "geo/layer_27/stable_rank_k_proj": 31.890377044677734, "geo/layer_27/stable_rank_o_proj": 115.64918518066406, "geo/layer_27/stable_rank_gate_proj": 80.25106048583984, "geo/layer_27/stable_rank_down_proj": 128.817138671875, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09127132594585419, "geo/layer_27/attn_entropy_mean": 4.206584453582764, "geo/layer_27/attn_entropy_std": 0.723114550113678, "attnres/final_alpha/block_0": 0.23646874725818634, "attnres/block_norm/0": 1.7651705741882324, "attnres/final_alpha/block_1": 0.004312655422836542, "attnres/block_norm/1": 46679.58984375, "attnres/final_alpha/block_2": 0.010255202651023865, "attnres/block_norm/2": 28612.669921875, "attnres/final_alpha/block_3": 0.012216243892908096, "attnres/block_norm/3": 58484.75, "attnres/final_alpha/block_4": 0.014620019122958183, "attnres/block_norm/4": 15230.73046875, "attnres/final_alpha/block_5": 0.6153870820999146, "attnres/block_norm/5": 6592.64453125, "attnres/final_alpha/block_6": 0.1067400723695755, "attnres/block_norm/6": 38962.54296875, "geo/tier1_time_s": 1.357480764389038, "geo/step": 64125.0, "geo/rankme_slope": 6.86754193865046e-05} {"step": 64130, "timestamp": 1778263860.721683, "train/loss": 2.181785726547241, "train/z_loss": 0.0013697966001927852, "train/perplexity": 8.862117455687601, "train/grad_norm": 0.228515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1705415.243418017, "perf/iters_per_sec": 0.8132053582277379, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2297016859054566, "data/tokens_consumed": 134492454912, "data/tokens_consumed_B": 134.492454912, "train/loss_slope": 4.587927166492143e-06} {"step": 64140, "timestamp": 1778263871.0656302, "train/loss": 2.1525824546813963, "train/z_loss": 0.001384828460868448, "train/perplexity": 8.607057056113117, "train/grad_norm": 0.1005859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028413.2170265731, "perf/iters_per_sec": 0.9672227940686098, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0338879585266114, "data/tokens_consumed": 134513426432, "data/tokens_consumed_B": 134.513426432, "train/loss_slope": 5.567302433463713e-06} {"step": 64150, "timestamp": 1778263881.3983178, "grad/layer_0/attn": 0.0028257397934794426, "grad/layer_0/mlp": 0.0029817339964210987, "grad/layer_0/attn_mlp_ratio": 0.9476833621318238, "grad/layer_4/attn": 0.0021811388432979584, "grad/layer_4/mlp": 0.0025618746876716614, "grad/layer_4/attn_mlp_ratio": 0.851383859115249, "grad/layer_8/attn": 0.006828282494097948, "grad/layer_8/mlp": 0.0036361636593937874, "grad/layer_8/attn_mlp_ratio": 1.8778809057918622, "grad/layer_12/attn": 0.005095700267702341, "grad/layer_12/mlp": 0.006446673069149256, "grad/layer_12/attn_mlp_ratio": 0.7904387478626906, "grad/layer_16/attn": 0.004142029210925102, "grad/layer_16/mlp": 0.004663709085434675, "grad/layer_16/attn_mlp_ratio": 0.8881405435530068, "grad/layer_20/attn": 0.007260852959007025, "grad/layer_20/mlp": 0.006686025764793158, "grad/layer_20/attn_mlp_ratio": 1.0859744048016258, "grad/layer_24/attn": 0.010883492417633533, "grad/layer_24/mlp": 0.0129590705037117, "grad/layer_24/attn_mlp_ratio": 0.8398358763873325, "grad/layer_27/attn": 0.013753828592598438, "grad/layer_27/mlp": 0.011622235178947449, "grad/layer_27/attn_mlp_ratio": 1.1834064844231957} {"step": 64150, "timestamp": 1778263881.4125273, "train/loss": 2.132932209968567, "train/z_loss": 0.001381420879624784, "train/perplexity": 8.439577178075806, "train/grad_norm": 0.125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028275.9855373185, "perf/iters_per_sec": 0.9671573569952576, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0339579105377197, "data/tokens_consumed": 134534397952, "data/tokens_consumed_B": 134.534397952, "train/loss_slope": 7.254271209687025e-06} {"step": 64160, "timestamp": 1778263891.7563667, "train/loss": 2.1748158931732178, "train/z_loss": 0.001375265361275524, "train/perplexity": 8.800564728983314, "train/grad_norm": 0.2138671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028519.3105358095, "perf/iters_per_sec": 0.9672733833960578, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033833885192871, "data/tokens_consumed": 134555369472, "data/tokens_consumed_B": 134.555369472, "train/loss_slope": 1.1069878571891137e-05} {"step": 64170, "timestamp": 1778263902.097927, "train/loss": 2.140885257720947, "train/z_loss": 0.001394768722821027, "train/perplexity": 8.50696515307866, "train/grad_norm": 0.251953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028784.2653178915, "perf/iters_per_sec": 0.9673997236813981, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0336988687515258, "data/tokens_consumed": 134576340992, "data/tokens_consumed_B": 134.576340992, "train/loss_slope": 1.0127429914946594e-05} {"step": 64180, "timestamp": 1778263912.442621, "train/loss": 2.1515187501907347, "train/z_loss": 0.001367037883028388, "train/perplexity": 8.59790655844667, "train/grad_norm": 0.2138671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028669.2075828805, "perf/iters_per_sec": 0.9673448598780062, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033757495880127, "data/tokens_consumed": 134597312512, "data/tokens_consumed_B": 134.597312512, "train/loss_slope": 1.12395367606638e-05} {"step": 64190, "timestamp": 1778263922.7849827, "train/loss": 2.1280377984046934, "train/z_loss": 0.0013783171656541527, "train/perplexity": 8.398371335375774, "train/grad_norm": 0.11474609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028674.4010424083, "perf/iters_per_sec": 0.9673473363124887, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0337548494338988, "data/tokens_consumed": 134618284032, "data/tokens_consumed_B": 134.618284032, "train/loss_slope": 9.886224413647682e-06} {"step": 64200, "timestamp": 1778263933.1178002, "grad/layer_0/attn": 0.0028056998271495104, "grad/layer_0/mlp": 0.0031088257674127817, "grad/layer_0/attn_mlp_ratio": 0.902495008343604, "grad/layer_4/attn": 0.0025092673022300005, "grad/layer_4/mlp": 0.0024137680884450674, "grad/layer_4/attn_mlp_ratio": 1.039564326948253, "grad/layer_8/attn": 0.003621033625677228, "grad/layer_8/mlp": 0.0035896541085094213, "grad/layer_8/attn_mlp_ratio": 1.0087416267264464, "grad/layer_12/attn": 0.008645656518638134, "grad/layer_12/mlp": 0.007073131389915943, "grad/layer_12/attn_mlp_ratio": 1.222323737507795, "grad/layer_16/attn": 0.005016348324716091, "grad/layer_16/mlp": 0.004703060258179903, "grad/layer_16/attn_mlp_ratio": 1.0666136393489605, "grad/layer_20/attn": 0.0030210705008357763, "grad/layer_20/mlp": 0.006328972522169352, "grad/layer_20/attn_mlp_ratio": 0.47733979607582167, "grad/layer_24/attn": 0.005532294046133757, "grad/layer_24/mlp": 0.008358079940080643, "grad/layer_24/attn_mlp_ratio": 0.6619096753804692, "grad/layer_27/attn": 0.009126628749072552, "grad/layer_27/mlp": 0.007321731653064489, "grad/layer_27/attn_mlp_ratio": 1.246512308410183} {"step": 64200, "timestamp": 1778263933.7085686, "eos/sharpness": 35.80095767974853, "eos/L0_probe": 1.9748436212539673, "eos/L_plus": 2.122939109802246, "eos/L_minus": 2.184757709503174, "eos/grad_norm": 0.10613570362329483, "eos/embed_grad_frac": 0.21268540620803833, "eos/time_s": 0.5878503322601318} {"step": 64200, "timestamp": 1778263933.728809, "train/loss": 2.1170879125595095, "train/z_loss": 0.0013881811290048062, "train/perplexity": 8.306911777657765, "train/grad_norm": 0.10595703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1917502.8670044453, "perf/iters_per_sec": 0.9143366179487444, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0936891078948974, "data/tokens_consumed": 134639255552, "data/tokens_consumed_B": 134.639255552, "train/loss_slope": 8.914002147552067e-06} {"step": 64200, "timestamp": 1778263935.0882633, "geo/rankme_last": 439.1477966308594, "geo/layer_0/stable_rank_q_proj": 19.371376037597656, "geo/layer_0/stable_rank_k_proj": 16.06012535095215, "geo/layer_0/stable_rank_o_proj": 47.06383514404297, "geo/layer_0/stable_rank_gate_proj": 131.50579833984375, "geo/layer_0/stable_rank_down_proj": 54.936893463134766, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06605702638626099, "geo/layer_0/attn_entropy_mean": 6.161960124969482, "geo/layer_0/attn_entropy_std": 0.4187436103820801, "geo/layer_7/stable_rank_q_proj": 43.40036392211914, "geo/layer_7/stable_rank_k_proj": 40.98133087158203, "geo/layer_7/stable_rank_o_proj": 90.1609878540039, "geo/layer_7/stable_rank_gate_proj": 81.2951431274414, "geo/layer_7/stable_rank_down_proj": 140.58139038085938, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.44650188088417053, "geo/layer_7/attn_entropy_mean": 4.670547962188721, "geo/layer_7/attn_entropy_std": 0.7898067235946655, "geo/layer_14/stable_rank_q_proj": 50.95823287963867, "geo/layer_14/stable_rank_k_proj": 40.048213958740234, "geo/layer_14/stable_rank_o_proj": 43.65849685668945, "geo/layer_14/stable_rank_gate_proj": 71.86224365234375, "geo/layer_14/stable_rank_down_proj": 129.96897888183594, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39487630128860474, "geo/layer_14/attn_entropy_mean": 5.560185432434082, "geo/layer_14/attn_entropy_std": 0.38789471983909607, "geo/layer_21/stable_rank_q_proj": 40.416526794433594, "geo/layer_21/stable_rank_k_proj": 30.241729736328125, "geo/layer_21/stable_rank_o_proj": 69.93514251708984, "geo/layer_21/stable_rank_gate_proj": 65.77328491210938, "geo/layer_21/stable_rank_down_proj": 51.147918701171875, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14847348630428314, "geo/layer_21/attn_entropy_mean": 5.693360328674316, "geo/layer_21/attn_entropy_std": 0.2998078763484955, "geo/layer_27/stable_rank_q_proj": 43.39414978027344, "geo/layer_27/stable_rank_k_proj": 31.93068504333496, "geo/layer_27/stable_rank_o_proj": 115.71253204345703, "geo/layer_27/stable_rank_gate_proj": 80.21317291259766, "geo/layer_27/stable_rank_down_proj": 128.62823486328125, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09582101553678513, "geo/layer_27/attn_entropy_mean": 4.2179155349731445, "geo/layer_27/attn_entropy_std": 0.7284964919090271, "attnres/final_alpha/block_0": 0.2387145757675171, "attnres/block_norm/0": 1.7651546001434326, "attnres/final_alpha/block_1": 0.004358518403023481, "attnres/block_norm/1": 46818.78515625, "attnres/final_alpha/block_2": 0.010301797650754452, "attnres/block_norm/2": 28611.16015625, "attnres/final_alpha/block_3": 0.011976020410656929, "attnres/block_norm/3": 58577.83984375, "attnres/final_alpha/block_4": 0.014580737799406052, "attnres/block_norm/4": 15174.359375, "attnres/final_alpha/block_5": 0.6096929311752319, "attnres/block_norm/5": 6688.7900390625, "attnres/final_alpha/block_6": 0.11037544161081314, "attnres/block_norm/6": 38679.65625, "geo/tier1_time_s": 1.3559727668762207, "geo/step": 64200.0, "geo/rankme_slope": 5.374468146633653e-05} {"step": 64210, "timestamp": 1778263945.4315693, "train/loss": 2.1534141659736634, "train/z_loss": 0.0013783398317173123, "train/perplexity": 8.614218620423998, "train/grad_norm": 0.11279296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1792512.2611265772, "perf/iters_per_sec": 0.854736452639855, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1699512720108032, "data/tokens_consumed": 134660227072, "data/tokens_consumed_B": 134.660227072, "train/loss_slope": 7.21256514050526e-06} {"step": 64220, "timestamp": 1778263955.782132, "train/loss": 2.1669168353080748, "train/z_loss": 0.0013785697286948563, "train/perplexity": 8.731322393581749, "train/grad_norm": 0.18359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027207.6278176499, "perf/iters_per_sec": 0.9666479243362665, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345028162002563, "data/tokens_consumed": 134681198592, "data/tokens_consumed_B": 134.681198592, "train/loss_slope": 7.708434925542842e-06} {"step": 64230, "timestamp": 1778263966.1333513, "train/loss": 2.160623645782471, "train/z_loss": 0.0013730589300394059, "train/perplexity": 8.67654706349437, "train/grad_norm": 0.0986328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027683.680405208, "perf/iters_per_sec": 0.9668749238992729, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342599391937255, "data/tokens_consumed": 134702170112, "data/tokens_consumed_B": 134.702170112, "train/loss_slope": 6.961586215708488e-06} {"step": 64240, "timestamp": 1778263976.4815042, "train/loss": 2.197412347793579, "train/z_loss": 0.001385523611679673, "train/perplexity": 9.00169009278602, "train/grad_norm": 0.197265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027609.0356107864, "perf/iters_per_sec": 0.9668393304876263, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034298014640808, "data/tokens_consumed": 134723141632, "data/tokens_consumed_B": 134.723141632, "train/loss_slope": 1.0951196464231811e-05} {"step": 64250, "timestamp": 1778263986.8161936, "grad/layer_0/attn": 0.0028255810029804707, "grad/layer_0/mlp": 0.0030981041491031647, "grad/layer_0/attn_mlp_ratio": 0.9120354822786922, "grad/layer_4/attn": 0.0020429808646440506, "grad/layer_4/mlp": 0.002573388395830989, "grad/layer_4/attn_mlp_ratio": 0.793887463145878, "grad/layer_8/attn": 0.00497785909101367, "grad/layer_8/mlp": 0.003911640495061874, "grad/layer_8/attn_mlp_ratio": 1.2725757824729116, "grad/layer_12/attn": 0.004899803549051285, "grad/layer_12/mlp": 0.006661142688244581, "grad/layer_12/attn_mlp_ratio": 0.7355800205481763, "grad/layer_16/attn": 0.0050443182699382305, "grad/layer_16/mlp": 0.0044892774894833565, "grad/layer_16/attn_mlp_ratio": 1.1236369704014562, "grad/layer_20/attn": 0.008515965193510056, "grad/layer_20/mlp": 0.005724797956645489, "grad/layer_20/attn_mlp_ratio": 1.4875573093140833, "grad/layer_24/attn": 0.009267846122384071, "grad/layer_24/mlp": 0.008729412220418453, "grad/layer_24/attn_mlp_ratio": 1.0616804181314932, "grad/layer_27/attn": 0.004846729803830385, "grad/layer_27/mlp": 0.007220834493637085, "grad/layer_27/attn_mlp_ratio": 0.6712146277524854} {"step": 64250, "timestamp": 1778263986.8304846, "train/loss": 2.1173949003219605, "train/z_loss": 0.0013742357725277544, "train/perplexity": 8.30946228938518, "train/grad_norm": 0.1123046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027359.6674354004, "perf/iters_per_sec": 0.9667204224755289, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344252347946168, "data/tokens_consumed": 134744113152, "data/tokens_consumed_B": 134.744113152, "train/loss_slope": 8.96508632892248e-06} {"step": 64260, "timestamp": 1778263997.175807, "train/loss": 2.149044060707092, "train/z_loss": 0.0013667309074662625, "train/perplexity": 8.576655714969744, "train/grad_norm": 0.1162109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028706.7788852237, "perf/iters_per_sec": 0.9673627752710455, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0337383508682252, "data/tokens_consumed": 134765084672, "data/tokens_consumed_B": 134.765084672, "train/loss_slope": 8.619553814626725e-06} {"step": 64270, "timestamp": 1778264007.514552, "train/loss": 2.13003933429718, "train/z_loss": 0.0013844871544279158, "train/perplexity": 8.415197810821793, "train/grad_norm": 0.23046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029384.6581821935, "perf/iters_per_sec": 0.9676860133086174, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0333930492401122, "data/tokens_consumed": 134786056192, "data/tokens_consumed_B": 134.786056192, "train/loss_slope": 6.875867626168392e-06} {"step": 64275, "timestamp": 1778264013.2638226, "eos/sharpness": 82.23116397857665, "eos/L0_probe": 1.9740185737609863, "eos/L_plus": 2.4667484760284424, "eos/L_minus": 2.303600311279297, "eos/grad_norm": 0.23614658415317535, "eos/embed_grad_frac": 0.048769861459732056, "eos/time_s": 0.5893855094909668} {"step": 64275, "timestamp": 1778264014.640195, "geo/rankme_last": 438.56561279296875, "geo/layer_0/stable_rank_q_proj": 19.381574630737305, "geo/layer_0/stable_rank_k_proj": 16.105751037597656, "geo/layer_0/stable_rank_o_proj": 47.0318489074707, "geo/layer_0/stable_rank_gate_proj": 131.58273315429688, "geo/layer_0/stable_rank_down_proj": 54.95547103881836, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0629960373044014, "geo/layer_0/attn_entropy_mean": 6.162940502166748, "geo/layer_0/attn_entropy_std": 0.41839468479156494, "geo/layer_7/stable_rank_q_proj": 43.38472366333008, "geo/layer_7/stable_rank_k_proj": 41.0075569152832, "geo/layer_7/stable_rank_o_proj": 89.8775863647461, "geo/layer_7/stable_rank_gate_proj": 81.17314910888672, "geo/layer_7/stable_rank_down_proj": 140.5874786376953, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4516552686691284, "geo/layer_7/attn_entropy_mean": 4.628208160400391, "geo/layer_7/attn_entropy_std": 0.7928961515426636, "geo/layer_14/stable_rank_q_proj": 50.978267669677734, "geo/layer_14/stable_rank_k_proj": 40.087772369384766, "geo/layer_14/stable_rank_o_proj": 43.71837615966797, "geo/layer_14/stable_rank_gate_proj": 71.92757415771484, "geo/layer_14/stable_rank_down_proj": 129.63043212890625, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3765404522418976, "geo/layer_14/attn_entropy_mean": 5.542594909667969, "geo/layer_14/attn_entropy_std": 0.408570259809494, "geo/layer_21/stable_rank_q_proj": 40.39562225341797, "geo/layer_21/stable_rank_k_proj": 30.1724853515625, "geo/layer_21/stable_rank_o_proj": 69.96864318847656, "geo/layer_21/stable_rank_gate_proj": 65.760009765625, "geo/layer_21/stable_rank_down_proj": 51.18511199951172, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1424427479505539, "geo/layer_21/attn_entropy_mean": 5.712066650390625, "geo/layer_21/attn_entropy_std": 0.29589465260505676, "geo/layer_27/stable_rank_q_proj": 43.387454986572266, "geo/layer_27/stable_rank_k_proj": 31.96636390686035, "geo/layer_27/stable_rank_o_proj": 115.65983581542969, "geo/layer_27/stable_rank_gate_proj": 80.28121185302734, "geo/layer_27/stable_rank_down_proj": 128.4151611328125, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08556365966796875, "geo/layer_27/attn_entropy_mean": 4.19172477722168, "geo/layer_27/attn_entropy_std": 0.7381650805473328, "attnres/final_alpha/block_0": 0.23687076568603516, "attnres/block_norm/0": 1.76524817943573, "attnres/final_alpha/block_1": 0.0042605409398674965, "attnres/block_norm/1": 46864.58984375, "attnres/final_alpha/block_2": 0.010096469894051552, "attnres/block_norm/2": 28569.845703125, "attnres/final_alpha/block_3": 0.011821702122688293, "attnres/block_norm/3": 58583.484375, "attnres/final_alpha/block_4": 0.014215417206287384, "attnres/block_norm/4": 15215.2314453125, "attnres/final_alpha/block_5": 0.6164337396621704, "attnres/block_norm/5": 6672.97509765625, "attnres/final_alpha/block_6": 0.10630135238170624, "attnres/block_norm/6": 39121.98828125, "geo/tier1_time_s": 1.3581981658935547, "geo/step": 64275.0, "geo/rankme_slope": 3.6485336322028814e-05} {"step": 64280, "timestamp": 1778264019.8157785, "train/loss": 2.1168327212333677, "train/z_loss": 0.0013870394323021173, "train/perplexity": 8.304792196285975, "train/grad_norm": 0.0810546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1705499.6629188082, "perf/iters_per_sec": 0.8132456125825921, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.229640817642212, "data/tokens_consumed": 134807027712, "data/tokens_consumed_B": 134.807027712, "train/loss_slope": 2.6245955980256335e-06} {"step": 64290, "timestamp": 1778264030.156875, "train/loss": 2.1048628211021425, "train/z_loss": 0.001372182578779757, "train/perplexity": 8.205977245467656, "train/grad_norm": 0.0869140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029460.8385013167, "perf/iters_per_sec": 0.9677223389154991, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0333542585372926, "data/tokens_consumed": 134827999232, "data/tokens_consumed_B": 134.827999232, "train/loss_slope": -8.053041455841272e-07} {"step": 64300, "timestamp": 1778264040.484777, "grad/layer_0/attn": 0.00335211050696671, "grad/layer_0/mlp": 0.0032612327486276627, "grad/layer_0/attn_mlp_ratio": 1.027866044087373, "grad/layer_4/attn": 0.002132889349013567, "grad/layer_4/mlp": 0.002658647717908025, "grad/layer_4/attn_mlp_ratio": 0.8022459141248146, "grad/layer_8/attn": 0.004376426804810762, "grad/layer_8/mlp": 0.0037200492806732655, "grad/layer_8/attn_mlp_ratio": 1.1764432019498365, "grad/layer_12/attn": 0.004795128013938665, "grad/layer_12/mlp": 0.006533898413181305, "grad/layer_12/attn_mlp_ratio": 0.7338846791490727, "grad/layer_16/attn": 0.0038113652262836695, "grad/layer_16/mlp": 0.004896208178251982, "grad/layer_16/attn_mlp_ratio": 0.7784320048665052, "grad/layer_20/attn": 0.0036375802010297775, "grad/layer_20/mlp": 0.006108315195888281, "grad/layer_20/attn_mlp_ratio": 0.595512841892487, "grad/layer_24/attn": 0.014258690178394318, "grad/layer_24/mlp": 0.00967931468039751, "grad/layer_24/attn_mlp_ratio": 1.4731094609373518, "grad/layer_27/attn": 0.014285113662481308, "grad/layer_27/mlp": 0.00869455561041832, "grad/layer_27/attn_mlp_ratio": 1.6429952418803941} {"step": 64300, "timestamp": 1778264040.4990451, "train/loss": 2.1206512689590453, "train/z_loss": 0.001375263836234808, "train/perplexity": 8.336565066141803, "train/grad_norm": 0.1962890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028669.0204316822, "perf/iters_per_sec": 0.9673447706373607, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0337575912475585, "data/tokens_consumed": 134848970752, "data/tokens_consumed_B": 134.848970752, "train/loss_slope": -3.149427875469632e-06} {"step": 64310, "timestamp": 1778264050.8482258, "train/loss": 2.1347437620162966, "train/z_loss": 0.001377144898287952, "train/perplexity": 8.454879767928814, "train/grad_norm": 0.09375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027394.293044069, "perf/iters_per_sec": 0.9667369332523675, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344075679779052, "data/tokens_consumed": 134869942272, "data/tokens_consumed_B": 134.869942272, "train/loss_slope": -3.1800413360619e-06} {"step": 64320, "timestamp": 1778264061.1896777, "train/loss": 2.163361930847168, "train/z_loss": 0.001379937573801726, "train/perplexity": 8.700338481707984, "train/grad_norm": 0.1328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029174.2672714447, "perf/iters_per_sec": 0.9675856911046241, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0335001945495605, "data/tokens_consumed": 134890913792, "data/tokens_consumed_B": 134.890913792, "train/loss_slope": -6.149503249790203e-06} {"step": 64330, "timestamp": 1778264071.5319786, "train/loss": 2.087850332260132, "train/z_loss": 0.001385533099528402, "train/perplexity": 8.067553950103447, "train/grad_norm": 0.109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029000.70654612, "perf/iters_per_sec": 0.9675029309015847, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0335886001586914, "data/tokens_consumed": 134911885312, "data/tokens_consumed_B": 134.911885312, "train/loss_slope": -9.17576987429354e-06} {"step": 64340, "timestamp": 1778264081.8784683, "train/loss": 2.1540948390960692, "train/z_loss": 0.0013672231114469468, "train/perplexity": 8.620084083514522, "train/grad_norm": 0.134765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027941.8713089905, "perf/iters_per_sec": 0.9669980389161065, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341282606124877, "data/tokens_consumed": 134932856832, "data/tokens_consumed_B": 134.932856832, "train/loss_slope": -9.225489382911284e-06} {"step": 64350, "timestamp": 1778264092.2097244, "grad/layer_0/attn": 0.0026512478943914175, "grad/layer_0/mlp": 0.002949885092675686, "grad/layer_0/attn_mlp_ratio": 0.8987630776188309, "grad/layer_4/attn": 0.002099602948874235, "grad/layer_4/mlp": 0.002522888360545039, "grad/layer_4/attn_mlp_ratio": 0.8322218685881347, "grad/layer_8/attn": 0.003617385867983103, "grad/layer_8/mlp": 0.003439342835918069, "grad/layer_8/attn_mlp_ratio": 1.0517665540721977, "grad/layer_12/attn": 0.004471606574952602, "grad/layer_12/mlp": 0.006956152617931366, "grad/layer_12/attn_mlp_ratio": 0.6428275450919625, "grad/layer_16/attn": 0.003351658582687378, "grad/layer_16/mlp": 0.004445518832653761, "grad/layer_16/attn_mlp_ratio": 0.7539409084659098, "grad/layer_20/attn": 0.004350617993623018, "grad/layer_20/mlp": 0.005544175859540701, "grad/layer_20/attn_mlp_ratio": 0.7847185993684513, "grad/layer_24/attn": 0.008006037212908268, "grad/layer_24/mlp": 0.00900188647210598, "grad/layer_24/attn_mlp_ratio": 0.8893732606803183, "grad/layer_27/attn": 0.003587675979360938, "grad/layer_27/mlp": 0.009194320067763329, "grad/layer_27/attn_mlp_ratio": 0.3902056828453582} {"step": 64350, "timestamp": 1778264092.8025331, "eos/sharpness": 51.187634468078606, "eos/L0_probe": 1.9720757007598877, "eos/L_plus": 2.2290515899658203, "eos/L_minus": 2.226976156234741, "eos/grad_norm": 0.13092368841171265, "eos/embed_grad_frac": 0.11702743172645569, "eos/time_s": 0.5898993015289307} {"step": 64350, "timestamp": 1778264092.820603, "train/loss": 2.139284682273865, "train/z_loss": 0.001378620509058237, "train/perplexity": 8.493360004463392, "train/grad_norm": 0.130859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1917706.4992160618, "perf/iters_per_sec": 0.9144337173538503, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.093572974205017, "data/tokens_consumed": 134953828352, "data/tokens_consumed_B": 134.953828352, "train/loss_slope": -8.549353563495886e-06} {"step": 64350, "timestamp": 1778264094.1881979, "geo/rankme_last": 439.0213317871094, "geo/layer_0/stable_rank_q_proj": 19.375099182128906, "geo/layer_0/stable_rank_k_proj": 16.09435272216797, "geo/layer_0/stable_rank_o_proj": 46.9410285949707, "geo/layer_0/stable_rank_gate_proj": 131.57411193847656, "geo/layer_0/stable_rank_down_proj": 54.86382293701172, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06450548768043518, "geo/layer_0/attn_entropy_mean": 6.161397457122803, "geo/layer_0/attn_entropy_std": 0.4158285856246948, "geo/layer_7/stable_rank_q_proj": 43.45856475830078, "geo/layer_7/stable_rank_k_proj": 40.9310417175293, "geo/layer_7/stable_rank_o_proj": 89.92913055419922, "geo/layer_7/stable_rank_gate_proj": 81.20638275146484, "geo/layer_7/stable_rank_down_proj": 140.5892791748047, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4580352008342743, "geo/layer_7/attn_entropy_mean": 4.65173864364624, "geo/layer_7/attn_entropy_std": 0.8086323738098145, "geo/layer_14/stable_rank_q_proj": 51.02450180053711, "geo/layer_14/stable_rank_k_proj": 40.212589263916016, "geo/layer_14/stable_rank_o_proj": 43.73104476928711, "geo/layer_14/stable_rank_gate_proj": 71.87447357177734, "geo/layer_14/stable_rank_down_proj": 129.54714965820312, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3784481883049011, "geo/layer_14/attn_entropy_mean": 5.543550491333008, "geo/layer_14/attn_entropy_std": 0.41767042875289917, "geo/layer_21/stable_rank_q_proj": 40.50576400756836, "geo/layer_21/stable_rank_k_proj": 30.179197311401367, "geo/layer_21/stable_rank_o_proj": 69.89568328857422, "geo/layer_21/stable_rank_gate_proj": 65.769287109375, "geo/layer_21/stable_rank_down_proj": 51.20602798461914, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13896477222442627, "geo/layer_21/attn_entropy_mean": 5.692719459533691, "geo/layer_21/attn_entropy_std": 0.3042145371437073, "geo/layer_27/stable_rank_q_proj": 43.39613342285156, "geo/layer_27/stable_rank_k_proj": 31.984445571899414, "geo/layer_27/stable_rank_o_proj": 115.60125732421875, "geo/layer_27/stable_rank_gate_proj": 80.31809997558594, "geo/layer_27/stable_rank_down_proj": 128.31655883789062, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10839685052633286, "geo/layer_27/attn_entropy_mean": 4.211353302001953, "geo/layer_27/attn_entropy_std": 0.7351933121681213, "attnres/final_alpha/block_0": 0.23846019804477692, "attnres/block_norm/0": 1.7654829025268555, "attnres/final_alpha/block_1": 0.00423719547688961, "attnres/block_norm/1": 46947.0546875, "attnres/final_alpha/block_2": 0.010169953107833862, "attnres/block_norm/2": 28642.087890625, "attnres/final_alpha/block_3": 0.01219248492270708, "attnres/block_norm/3": 58042.1328125, "attnres/final_alpha/block_4": 0.014334121719002724, "attnres/block_norm/4": 15228.41015625, "attnres/final_alpha/block_5": 0.6122113466262817, "attnres/block_norm/5": 6697.40625, "attnres/final_alpha/block_6": 0.10839467495679855, "attnres/block_norm/6": 38920.15625, "geo/tier1_time_s": 1.363377332687378, "geo/step": 64350.0, "geo/rankme_slope": 4.31699437587535e-05} {"step": 64360, "timestamp": 1778264104.5268517, "train/loss": 2.098769021034241, "train/z_loss": 0.00139848121907562, "train/perplexity": 8.156123713772757, "train/grad_norm": 0.0830078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1792089.285321903, "perf/iters_per_sec": 0.8545347620591655, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1702274084091187, "data/tokens_consumed": 134974799872, "data/tokens_consumed_B": 134.974799872, "train/loss_slope": -1.1455243012227723e-05} {"step": 64370, "timestamp": 1778264114.8700876, "train/loss": 2.177880620956421, "train/z_loss": 0.0013809798518195748, "train/perplexity": 8.827577436370326, "train/grad_norm": 0.1416015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028597.9990449362, "perf/iters_per_sec": 0.967310905001133, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0337937831878663, "data/tokens_consumed": 134995771392, "data/tokens_consumed_B": 134.995771392, "train/loss_slope": -9.387321018650491e-06} {"step": 64380, "timestamp": 1778264125.2125115, "train/loss": 2.108610248565674, "train/z_loss": 0.001389025803655386, "train/perplexity": 8.236786241145936, "train/grad_norm": 0.09521484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028864.9866464452, "perf/iters_per_sec": 0.9674382146103121, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0336577415466308, "data/tokens_consumed": 135016742912, "data/tokens_consumed_B": 135.016742912, "train/loss_slope": -1.2136161767288333e-05} {"step": 64390, "timestamp": 1778264135.5537322, "train/loss": 2.1125244140625, "train/z_loss": 0.0013834165642037989, "train/perplexity": 8.269089564690558, "train/grad_norm": 0.15234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029210.6869636034, "perf/iters_per_sec": 0.9676030573671357, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0334816455841065, "data/tokens_consumed": 135037714432, "data/tokens_consumed_B": 135.037714432, "train/loss_slope": -1.2535624235126295e-05} {"step": 64400, "timestamp": 1778264145.8861535, "grad/layer_0/attn": 0.002581774489954114, "grad/layer_0/mlp": 0.0029239647556096315, "grad/layer_0/attn_mlp_ratio": 0.8829704245593017, "grad/layer_4/attn": 0.0020607064943760633, "grad/layer_4/mlp": 0.0025445998180657625, "grad/layer_4/attn_mlp_ratio": 0.8098351649490254, "grad/layer_8/attn": 0.0033720401115715504, "grad/layer_8/mlp": 0.0034377078991383314, "grad/layer_8/attn_mlp_ratio": 0.9808977703797876, "grad/layer_12/attn": 0.004408498760312796, "grad/layer_12/mlp": 0.006870062090456486, "grad/layer_12/attn_mlp_ratio": 0.6416970673768924, "grad/layer_16/attn": 0.003450942924246192, "grad/layer_16/mlp": 0.004624740220606327, "grad/layer_16/attn_mlp_ratio": 0.7461917178073592, "grad/layer_20/attn": 0.0033445872832089663, "grad/layer_20/mlp": 0.006038750521838665, "grad/layer_20/attn_mlp_ratio": 0.553854181544363, "grad/layer_24/attn": 0.007649473380297422, "grad/layer_24/mlp": 0.008701475337147713, "grad/layer_24/attn_mlp_ratio": 0.8791007267159362, "grad/layer_27/attn": 0.006942223757505417, "grad/layer_27/mlp": 0.00781070115044713, "grad/layer_27/attn_mlp_ratio": 0.8888092803585342} {"step": 64400, "timestamp": 1778264145.9005253, "train/loss": 2.1201258420944216, "train/z_loss": 0.0013791306293569506, "train/perplexity": 8.33218596144774, "train/grad_norm": 0.12255859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027670.5926300255, "perf/iters_per_sec": 0.9668686831617477, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342666149139403, "data/tokens_consumed": 135058685952, "data/tokens_consumed_B": 135.058685952, "train/loss_slope": -1.4346641940538323e-05} {"step": 64410, "timestamp": 1778264156.2387075, "train/loss": 2.1420937061309813, "train/z_loss": 0.0013758488115854562, "train/perplexity": 8.51725159565789, "train/grad_norm": 0.2431640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2030039.5180096482, "perf/iters_per_sec": 0.9679982748077622, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0330596923828126, "data/tokens_consumed": 135079657472, "data/tokens_consumed_B": 135.079657472, "train/loss_slope": -1.4601230392433143e-05} {"step": 64420, "timestamp": 1778264166.5747843, "train/loss": 2.1431288361549377, "train/z_loss": 0.0013738545472733676, "train/perplexity": 8.526072623173716, "train/grad_norm": 0.09375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2030003.490204679, "perf/iters_per_sec": 0.9679810954116245, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0330780267715454, "data/tokens_consumed": 135100628992, "data/tokens_consumed_B": 135.100628992, "train/loss_slope": -1.5163010711109088e-05} {"step": 64425, "timestamp": 1778264172.3258512, "eos/sharpness": 73.19672107696532, "eos/L0_probe": 1.974744439125061, "eos/L_plus": 2.271146059036255, "eos/L_minus": 2.4103100299835205, "eos/grad_norm": 0.16434259712696075, "eos/embed_grad_frac": 0.07915478199720383, "eos/time_s": 0.5897762775421143} {"step": 64425, "timestamp": 1778264173.7035785, "geo/rankme_last": 438.4338684082031, "geo/layer_0/stable_rank_q_proj": 19.360206604003906, "geo/layer_0/stable_rank_k_proj": 16.08177947998047, "geo/layer_0/stable_rank_o_proj": 46.98940658569336, "geo/layer_0/stable_rank_gate_proj": 131.53038024902344, "geo/layer_0/stable_rank_down_proj": 54.97808074951172, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06199176609516144, "geo/layer_0/attn_entropy_mean": 6.15806770324707, "geo/layer_0/attn_entropy_std": 0.4201013445854187, "geo/layer_7/stable_rank_q_proj": 43.5010986328125, "geo/layer_7/stable_rank_k_proj": 40.84151077270508, "geo/layer_7/stable_rank_o_proj": 89.92796325683594, "geo/layer_7/stable_rank_gate_proj": 81.16100311279297, "geo/layer_7/stable_rank_down_proj": 140.74363708496094, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.44360917806625366, "geo/layer_7/attn_entropy_mean": 4.6397786140441895, "geo/layer_7/attn_entropy_std": 0.8111775517463684, "geo/layer_14/stable_rank_q_proj": 50.989192962646484, "geo/layer_14/stable_rank_k_proj": 40.137840270996094, "geo/layer_14/stable_rank_o_proj": 43.71458435058594, "geo/layer_14/stable_rank_gate_proj": 71.8484115600586, "geo/layer_14/stable_rank_down_proj": 129.77120971679688, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39738067984580994, "geo/layer_14/attn_entropy_mean": 5.503050804138184, "geo/layer_14/attn_entropy_std": 0.3964182138442993, "geo/layer_21/stable_rank_q_proj": 40.403839111328125, "geo/layer_21/stable_rank_k_proj": 30.313199996948242, "geo/layer_21/stable_rank_o_proj": 69.82880401611328, "geo/layer_21/stable_rank_gate_proj": 65.79894256591797, "geo/layer_21/stable_rank_down_proj": 51.18587112426758, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14147888123989105, "geo/layer_21/attn_entropy_mean": 5.689478397369385, "geo/layer_21/attn_entropy_std": 0.29610776901245117, "geo/layer_27/stable_rank_q_proj": 43.44217300415039, "geo/layer_27/stable_rank_k_proj": 32.013126373291016, "geo/layer_27/stable_rank_o_proj": 115.46434020996094, "geo/layer_27/stable_rank_gate_proj": 80.27311706542969, "geo/layer_27/stable_rank_down_proj": 128.53762817382812, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0919627845287323, "geo/layer_27/attn_entropy_mean": 4.215773582458496, "geo/layer_27/attn_entropy_std": 0.7159683704376221, "attnres/final_alpha/block_0": 0.23981419205665588, "attnres/block_norm/0": 1.765620231628418, "attnres/final_alpha/block_1": 0.004307204857468605, "attnres/block_norm/1": 46827.375, "attnres/final_alpha/block_2": 0.01033569686114788, "attnres/block_norm/2": 28566.53515625, "attnres/final_alpha/block_3": 0.012187362648546696, "attnres/block_norm/3": 58263.140625, "attnres/final_alpha/block_4": 0.014604571275413036, "attnres/block_norm/4": 15246.451171875, "attnres/final_alpha/block_5": 0.6095447540283203, "attnres/block_norm/5": 6735.8828125, "attnres/final_alpha/block_6": 0.10920621454715729, "attnres/block_norm/6": 38985.609375, "geo/tier1_time_s": 1.3575270175933838, "geo/step": 64425.0, "geo/rankme_slope": 3.481623117997199e-05} {"step": 64430, "timestamp": 1778264178.876389, "train/loss": 2.1291645765304565, "train/z_loss": 0.0013815804617479444, "train/perplexity": 8.407839769899182, "train/grad_norm": 0.263671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1705676.299649252, "perf/iters_per_sec": 0.8133298395391713, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2295134782791137, "data/tokens_consumed": 135121600512, "data/tokens_consumed_B": 135.121600512, "train/loss_slope": -1.581609533815241e-05} {"step": 64440, "timestamp": 1778264189.21941, "train/loss": 2.169538688659668, "train/z_loss": 0.0013933419832028449, "train/perplexity": 8.754244676764833, "train/grad_norm": 0.2333984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028502.8905763638, "perf/iters_per_sec": 0.9672655537492579, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0338422536849976, "data/tokens_consumed": 135142572032, "data/tokens_consumed_B": 135.142572032, "train/loss_slope": -1.4619905090961483e-05} {"step": 64450, "timestamp": 1778264199.5539804, "grad/layer_0/attn": 0.003002862213179469, "grad/layer_0/mlp": 0.003292366862297058, "grad/layer_0/attn_mlp_ratio": 0.9120678975239127, "grad/layer_4/attn": 0.002625443274155259, "grad/layer_4/mlp": 0.002683017635717988, "grad/layer_4/attn_mlp_ratio": 0.978541155059744, "grad/layer_8/attn": 0.00696266395971179, "grad/layer_8/mlp": 0.0036972861271351576, "grad/layer_8/attn_mlp_ratio": 1.8831822942490473, "grad/layer_12/attn": 0.007211688440293074, "grad/layer_12/mlp": 0.006620192900300026, "grad/layer_12/attn_mlp_ratio": 1.0893471595112472, "grad/layer_16/attn": 0.0033507533371448517, "grad/layer_16/mlp": 0.004684559069573879, "grad/layer_16/attn_mlp_ratio": 0.715276126494021, "grad/layer_20/attn": 0.003792164381593466, "grad/layer_20/mlp": 0.005677384790033102, "grad/layer_20/attn_mlp_ratio": 0.667942099231422, "grad/layer_24/attn": 0.010835918597877026, "grad/layer_24/mlp": 0.00951992254704237, "grad/layer_24/attn_mlp_ratio": 1.1382359919955345, "grad/layer_27/attn": 0.0058402325958013535, "grad/layer_27/mlp": 0.009202618151903152, "grad/layer_27/attn_mlp_ratio": 0.6346272806213125} {"step": 64450, "timestamp": 1778264199.5679398, "train/loss": 2.127496910095215, "train/z_loss": 0.001388764614239335, "train/perplexity": 8.39382998279479, "train/grad_norm": 0.15625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027511.3559853008, "perf/iters_per_sec": 0.9667927532125954, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343478441238403, "data/tokens_consumed": 135163543552, "data/tokens_consumed_B": 135.163543552, "train/loss_slope": -1.7853085693567622e-05} {"step": 64460, "timestamp": 1778264209.911353, "train/loss": 2.1254225730895997, "train/z_loss": 0.0013781976187601686, "train/perplexity": 8.376436396958882, "train/grad_norm": 0.19921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028768.355784822, "perf/iters_per_sec": 0.967392137424861, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0337069749832153, "data/tokens_consumed": 135184515072, "data/tokens_consumed_B": 135.184515072, "train/loss_slope": -1.841045348021201e-05} {"step": 64470, "timestamp": 1778264220.2542949, "train/loss": 2.1681290745735167, "train/z_loss": 0.00136619663098827, "train/perplexity": 8.741913263463683, "train/grad_norm": 0.259765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028889.7892721766, "perf/iters_per_sec": 0.9674500414238818, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0336451053619384, "data/tokens_consumed": 135205486592, "data/tokens_consumed_B": 135.205486592, "train/loss_slope": -1.6985351789687896e-05} {"step": 64480, "timestamp": 1778264230.5947125, "train/loss": 2.1254664421081544, "train/z_loss": 0.0013858724269084632, "train/perplexity": 8.376803871062908, "train/grad_norm": 0.1923828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028998.7408179897, "perf/iters_per_sec": 0.9675019935693692, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0335896015167236, "data/tokens_consumed": 135226458112, "data/tokens_consumed_B": 135.226458112, "train/loss_slope": -1.7861150014232333e-05} {"step": 64490, "timestamp": 1778264240.9440491, "train/loss": 2.0963565587997435, "train/z_loss": 0.0013820795924402774, "train/perplexity": 8.136471088473044, "train/grad_norm": 0.1826171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027947.7156083786, "perf/iters_per_sec": 0.9670008256952184, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034125280380249, "data/tokens_consumed": 135247429632, "data/tokens_consumed_B": 135.247429632, "train/loss_slope": -1.7734618134016435e-05} {"step": 64500, "timestamp": 1778264251.2748473, "grad/layer_0/attn": 0.002339149359613657, "grad/layer_0/mlp": 0.0024146561045199633, "grad/layer_0/attn_mlp_ratio": 0.968729774133077, "grad/layer_4/attn": 0.002204484771937132, "grad/layer_4/mlp": 0.0023133622016757727, "grad/layer_4/attn_mlp_ratio": 0.9529353747747289, "grad/layer_8/attn": 0.00521459337323904, "grad/layer_8/mlp": 0.003549377666786313, "grad/layer_8/attn_mlp_ratio": 1.4691570511415164, "grad/layer_12/attn": 0.0041546812281012535, "grad/layer_12/mlp": 0.00680109579116106, "grad/layer_12/attn_mlp_ratio": 0.6108840831814798, "grad/layer_16/attn": 0.003643723903223872, "grad/layer_16/mlp": 0.004552855622023344, "grad/layer_16/attn_mlp_ratio": 0.8003161368804715, "grad/layer_20/attn": 0.002917541190981865, "grad/layer_20/mlp": 0.005937534384429455, "grad/layer_20/attn_mlp_ratio": 0.49137250463686616, "grad/layer_24/attn": 0.005064088385552168, "grad/layer_24/mlp": 0.007220577448606491, "grad/layer_24/attn_mlp_ratio": 0.7013411810152905, "grad/layer_27/attn": 0.0072371396236121655, "grad/layer_27/mlp": 0.006428965367376804, "grad/layer_27/attn_mlp_ratio": 1.1257082745795366} {"step": 64500, "timestamp": 1778264251.8592377, "eos/sharpness": 57.26525783538817, "eos/L0_probe": 1.9710583686828613, "eos/L_plus": 2.3376858234405518, "eos/L_minus": 2.1770834922790527, "eos/grad_norm": 0.12029048800468445, "eos/embed_grad_frac": 0.13751472532749176, "eos/time_s": 0.5816860198974609} {"step": 64500, "timestamp": 1778264251.8768024, "train/loss": 2.188955068588257, "train/z_loss": 0.0013698181486688555, "train/perplexity": 8.925881306243575, "train/grad_norm": 0.1201171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1919458.3252946865, "perf/iters_per_sec": 0.9152690531228478, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.092574906349182, "data/tokens_consumed": 135268401152, "data/tokens_consumed_B": 135.268401152, "train/loss_slope": -1.75660199171925e-05} {"step": 64500, "timestamp": 1778264253.2390413, "geo/rankme_last": 438.7750244140625, "geo/layer_0/stable_rank_q_proj": 19.34308433532715, "geo/layer_0/stable_rank_k_proj": 16.0790958404541, "geo/layer_0/stable_rank_o_proj": 46.995628356933594, "geo/layer_0/stable_rank_gate_proj": 131.37034606933594, "geo/layer_0/stable_rank_down_proj": 55.04316711425781, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05932879075407982, "geo/layer_0/attn_entropy_mean": 6.1604228019714355, "geo/layer_0/attn_entropy_std": 0.4220055341720581, "geo/layer_7/stable_rank_q_proj": 43.50283432006836, "geo/layer_7/stable_rank_k_proj": 40.85034942626953, "geo/layer_7/stable_rank_o_proj": 89.80316925048828, "geo/layer_7/stable_rank_gate_proj": 81.1629867553711, "geo/layer_7/stable_rank_down_proj": 140.3113250732422, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4572504162788391, "geo/layer_7/attn_entropy_mean": 4.663238048553467, "geo/layer_7/attn_entropy_std": 0.7955734133720398, "geo/layer_14/stable_rank_q_proj": 50.96933364868164, "geo/layer_14/stable_rank_k_proj": 40.211875915527344, "geo/layer_14/stable_rank_o_proj": 43.69120407104492, "geo/layer_14/stable_rank_gate_proj": 71.75253295898438, "geo/layer_14/stable_rank_down_proj": 129.5010223388672, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4019962251186371, "geo/layer_14/attn_entropy_mean": 5.5507493019104, "geo/layer_14/attn_entropy_std": 0.4085090756416321, "geo/layer_21/stable_rank_q_proj": 40.37098693847656, "geo/layer_21/stable_rank_k_proj": 30.38336753845215, "geo/layer_21/stable_rank_o_proj": 69.80135345458984, "geo/layer_21/stable_rank_gate_proj": 65.76021575927734, "geo/layer_21/stable_rank_down_proj": 51.13247299194336, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14427627623081207, "geo/layer_21/attn_entropy_mean": 5.690649509429932, "geo/layer_21/attn_entropy_std": 0.2943224608898163, "geo/layer_27/stable_rank_q_proj": 43.514591217041016, "geo/layer_27/stable_rank_k_proj": 32.00782012939453, "geo/layer_27/stable_rank_o_proj": 115.19518280029297, "geo/layer_27/stable_rank_gate_proj": 80.22692108154297, "geo/layer_27/stable_rank_down_proj": 128.59178161621094, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09605269134044647, "geo/layer_27/attn_entropy_mean": 4.198472023010254, "geo/layer_27/attn_entropy_std": 0.7420463562011719, "attnres/final_alpha/block_0": 0.23750954866409302, "attnres/block_norm/0": 1.7657469511032104, "attnres/final_alpha/block_1": 0.004341095220297575, "attnres/block_norm/1": 46907.328125, "attnres/final_alpha/block_2": 0.010163494385778904, "attnres/block_norm/2": 28683.16796875, "attnres/final_alpha/block_3": 0.01191030628979206, "attnres/block_norm/3": 58673.671875, "attnres/final_alpha/block_4": 0.014468647539615631, "attnres/block_norm/4": 15202.6923828125, "attnres/final_alpha/block_5": 0.6143786311149597, "attnres/block_norm/5": 6706.9912109375, "attnres/final_alpha/block_6": 0.10722824931144714, "attnres/block_norm/6": 39181.703125, "geo/tier1_time_s": 1.3585855960845947, "geo/step": 64500.0, "geo/rankme_slope": 4.5764067345688275e-05} {"step": 64500, "timestamp": 1778264260.3590508, "geo/ww_alpha_mean": 7.849035642260895, "geo/ww_alpha_std": 6.14580646260648, "geo/ww_alpha_min": 1.3440936439313387, "geo/ww_alpha_max": 63.457072949095746, "geo/ww_alpha_healthy_frac": 0.16243654822335024, "geo/ww_alpha_by_type/q_proj": 3.9209761494805986, "geo/ww_alpha_by_type/k_proj": 4.46410962681961, "geo/ww_alpha_by_type/v_proj": 8.049140988065012, "geo/ww_alpha_by_type/o_proj": 9.141272380921588, "geo/ww_alpha_by_type/gate_proj": 7.89711117776812, "geo/ww_alpha_by_type/up_proj": 13.285355702859132, "geo/ww_alpha_by_type/down_proj": 8.290908968058918, "geo/twonn_id/layer_0": 0.7262522578239441, "geo/twonn_id/layer_7": 3.6036009788513184, "geo/twonn_id/layer_14": 4.157680988311768, "geo/twonn_id/layer_21": 6.96504020690918, "geo/twonn_id/layer_27": 5.8483991622924805, "geo/tier2_time_s": 7.107358694076538} {"step": 64500, "timestamp": 1778264261.1669765, "eoc/jacobian_sigma/layer_0/attn": 1242.0908203125, "eoc/jacobian_sigma/layer_0/mlp": 8979.4931640625, "eoc/jacobian_sigma/layer_0": 8979.4931640625, "eoc/jacobian_sigma/layer_7/attn": 1.1415997743606567, "eoc/jacobian_sigma/layer_7/mlp": 1.7759692668914795, "eoc/jacobian_sigma/layer_7": 1.7759692668914795, "eoc/jacobian_sigma/layer_14/attn": 1.4849272966384888, "eoc/jacobian_sigma/layer_14/mlp": 5.7819600105285645, "eoc/jacobian_sigma/layer_14": 5.7819600105285645, "eoc/jacobian_sigma/layer_21/attn": 1.0753095149993896, "eoc/jacobian_sigma/layer_21/mlp": 3.815638780593872, "eoc/jacobian_sigma/layer_21": 3.815638780593872, "eoc/jacobian_sigma/layer_27/attn": 3.409050226211548, "eoc/jacobian_sigma/layer_27/mlp": 31.041494369506836, "eoc/jacobian_sigma/layer_27": 31.041494369506836, "eoc/layer0_sigma": 8979.4931640625, "eoc/sigma_max": 31.041494369506836, "eoc/sigma_min": 1.7759692668914795, "eoc/sigma_mean": 10.603765606880188, "eoc/time_s": 0.7989645004272461} {"step": 64510, "timestamp": 1778264271.564017, "train/loss": 2.093333673477173, "train/z_loss": 0.0013966457103379072, "train/perplexity": 8.1119126068799, "train/grad_norm": 0.1259765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1065431.3940972697, "perf/iters_per_sec": 0.5080372782217358, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.9683594942092895, "data/tokens_consumed": 135289372672, "data/tokens_consumed_B": 135.289372672, "train/loss_slope": -2.305590630722156e-05} {"step": 64520, "timestamp": 1778264281.9041982, "train/loss": 2.1656508922576903, "train/z_loss": 0.0013748643570579589, "train/perplexity": 8.720276030185666, "train/grad_norm": 0.10107421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029498.1581756047, "perf/iters_per_sec": 0.9677401343229316, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0333352565765381, "data/tokens_consumed": 135310344192, "data/tokens_consumed_B": 135.310344192, "train/loss_slope": -2.0021006684027125e-05} {"step": 64530, "timestamp": 1778264292.255284, "train/loss": 2.103720259666443, "train/z_loss": 0.0013725506025366486, "train/perplexity": 8.196606766518087, "train/grad_norm": 0.107421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026986.0571307214, "perf/iters_per_sec": 0.9665422711995704, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346158981323241, "data/tokens_consumed": 135331315712, "data/tokens_consumed_B": 135.331315712, "train/loss_slope": -2.2488365808550408e-05} {"step": 64540, "timestamp": 1778264302.5974007, "train/loss": 2.1476542234420775, "train/z_loss": 0.001380655134562403, "train/perplexity": 8.564743838949935, "train/grad_norm": 0.08642578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028705.0008843779, "perf/iters_per_sec": 0.9673619274541749, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0337392568588257, "data/tokens_consumed": 135352287232, "data/tokens_consumed_B": 135.352287232, "train/loss_slope": -2.328132541552338e-05} {"step": 64550, "timestamp": 1778264312.9273365, "grad/layer_0/attn": 0.0030326650012284517, "grad/layer_0/mlp": 0.003048241138458252, "grad/layer_0/attn_mlp_ratio": 0.9948900903795664, "grad/layer_4/attn": 0.003331321058794856, "grad/layer_4/mlp": 0.00266779656521976, "grad/layer_4/attn_mlp_ratio": 1.2487162542128893, "grad/layer_8/attn": 0.004530103411525488, "grad/layer_8/mlp": 0.0036795404739677906, "grad/layer_8/attn_mlp_ratio": 1.2311600648122498, "grad/layer_12/attn": 0.00479451147839427, "grad/layer_12/mlp": 0.0067174406722188, "grad/layer_12/attn_mlp_ratio": 0.7137407892337282, "grad/layer_16/attn": 0.004090527072548866, "grad/layer_16/mlp": 0.004541682079434395, "grad/layer_16/attn_mlp_ratio": 0.9006634350310894, "grad/layer_20/attn": 0.00342180160805583, "grad/layer_20/mlp": 0.006602612789720297, "grad/layer_20/attn_mlp_ratio": 0.5182496180236892, "grad/layer_24/attn": 0.018800335004925728, "grad/layer_24/mlp": 0.013125701807439327, "grad/layer_24/attn_mlp_ratio": 1.4323298774803166, "grad/layer_27/attn": 0.008760933764278889, "grad/layer_27/mlp": 0.012859871610999107, "grad/layer_27/attn_mlp_ratio": 0.6812613656779812} {"step": 64550, "timestamp": 1778264312.9415078, "train/loss": 2.0991889953613283, "train/z_loss": 0.0013798987492918968, "train/perplexity": 8.159549795723963, "train/grad_norm": 0.251953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028819.5011426404, "perf/iters_per_sec": 0.9674165254319383, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0336809158325195, "data/tokens_consumed": 135373258752, "data/tokens_consumed_B": 135.373258752, "train/loss_slope": -2.8272604377213172e-05} {"step": 64560, "timestamp": 1778264323.2876165, "train/loss": 2.1466696858406067, "train/z_loss": 0.0013805431430228055, "train/perplexity": 8.55631567619589, "train/grad_norm": 0.2255859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028422.9464805482, "perf/iters_per_sec": 0.9672274334337941, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033882999420166, "data/tokens_consumed": 135394230272, "data/tokens_consumed_B": 135.394230272, "train/loss_slope": -2.9277580842362293e-05} {"step": 64570, "timestamp": 1778264333.6314623, "train/loss": 2.0602747797966003, "train/z_loss": 0.0013941414188593626, "train/perplexity": 7.848126020534996, "train/grad_norm": 0.12451171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029079.1047611274, "perf/iters_per_sec": 0.9675403140836369, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0335486650466919, "data/tokens_consumed": 135415201792, "data/tokens_consumed_B": 135.415201792, "train/loss_slope": -3.322292620306642e-05} {"step": 64575, "timestamp": 1778264339.3952072, "eos/sharpness": 37.65037059783935, "eos/L0_probe": 1.970814824104309, "eos/L_plus": 2.1746268272399902, "eos/L_minus": 2.1435065269470215, "eos/grad_norm": 0.11473172903060913, "eos/embed_grad_frac": 0.1751531958580017, "eos/time_s": 0.5991268157958984} {"step": 64575, "timestamp": 1778264340.7724972, "geo/rankme_last": 438.6300354003906, "geo/layer_0/stable_rank_q_proj": 19.332927703857422, "geo/layer_0/stable_rank_k_proj": 16.06734848022461, "geo/layer_0/stable_rank_o_proj": 46.983184814453125, "geo/layer_0/stable_rank_gate_proj": 131.43850708007812, "geo/layer_0/stable_rank_down_proj": 55.03228759765625, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06309310346841812, "geo/layer_0/attn_entropy_mean": 6.157648086547852, "geo/layer_0/attn_entropy_std": 0.4197821617126465, "geo/layer_7/stable_rank_q_proj": 43.50238037109375, "geo/layer_7/stable_rank_k_proj": 40.814422607421875, "geo/layer_7/stable_rank_o_proj": 89.91078186035156, "geo/layer_7/stable_rank_gate_proj": 81.05609130859375, "geo/layer_7/stable_rank_down_proj": 140.27279663085938, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.44993600249290466, "geo/layer_7/attn_entropy_mean": 4.629112243652344, "geo/layer_7/attn_entropy_std": 0.8040769696235657, "geo/layer_14/stable_rank_q_proj": 50.978538513183594, "geo/layer_14/stable_rank_k_proj": 40.23634719848633, "geo/layer_14/stable_rank_o_proj": 43.757484436035156, "geo/layer_14/stable_rank_gate_proj": 71.69818878173828, "geo/layer_14/stable_rank_down_proj": 129.17141723632812, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3900047838687897, "geo/layer_14/attn_entropy_mean": 5.570412635803223, "geo/layer_14/attn_entropy_std": 0.42126038670539856, "geo/layer_21/stable_rank_q_proj": 40.38047409057617, "geo/layer_21/stable_rank_k_proj": 30.33706283569336, "geo/layer_21/stable_rank_o_proj": 69.8038101196289, "geo/layer_21/stable_rank_gate_proj": 65.71229553222656, "geo/layer_21/stable_rank_down_proj": 51.16746520996094, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14131735265254974, "geo/layer_21/attn_entropy_mean": 5.6957597732543945, "geo/layer_21/attn_entropy_std": 0.31473204493522644, "geo/layer_27/stable_rank_q_proj": 43.46481704711914, "geo/layer_27/stable_rank_k_proj": 32.00571060180664, "geo/layer_27/stable_rank_o_proj": 114.97844696044922, "geo/layer_27/stable_rank_gate_proj": 80.34131622314453, "geo/layer_27/stable_rank_down_proj": 128.605712890625, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.090626060962677, "geo/layer_27/attn_entropy_mean": 4.2130560874938965, "geo/layer_27/attn_entropy_std": 0.7688565254211426, "attnres/final_alpha/block_0": 0.23660512268543243, "attnres/block_norm/0": 1.7657228708267212, "attnres/final_alpha/block_1": 0.004351187497377396, "attnres/block_norm/1": 46880.4140625, "attnres/final_alpha/block_2": 0.01010599173605442, "attnres/block_norm/2": 28656.3828125, "attnres/final_alpha/block_3": 0.011966219171881676, "attnres/block_norm/3": 58210.2421875, "attnres/final_alpha/block_4": 0.01439282950013876, "attnres/block_norm/4": 15246.947265625, "attnres/final_alpha/block_5": 0.6141573190689087, "attnres/block_norm/5": 6694.86328125, "attnres/final_alpha/block_6": 0.10842130333185196, "attnres/block_norm/6": 39116.12890625, "geo/tier1_time_s": 1.3591773509979248, "geo/step": 64575.0, "geo/rankme_slope": 2.2153861544617846e-05} {"step": 64580, "timestamp": 1778264345.9443243, "train/loss": 2.155864691734314, "train/z_loss": 0.001381986343767494, "train/perplexity": 8.635353870722314, "train/grad_norm": 0.1630859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1704227.8745972796, "perf/iters_per_sec": 0.8126391766535185, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.230558443069458, "data/tokens_consumed": 135436173312, "data/tokens_consumed_B": 135.436173312, "train/loss_slope": -3.1599541050944946e-05} {"step": 64590, "timestamp": 1778264356.2872062, "train/loss": 2.0925700664520264, "train/z_loss": 0.0013816553866490722, "train/perplexity": 8.10572065783468, "train/grad_norm": 0.2255859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028946.744011562, "perf/iters_per_sec": 0.9674771995599566, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0336160898208617, "data/tokens_consumed": 135457144832, "data/tokens_consumed_B": 135.457144832, "train/loss_slope": -3.505796105733614e-05} {"step": 64600, "timestamp": 1778264366.6178727, "grad/layer_0/attn": 0.0023734986316412687, "grad/layer_0/mlp": 0.002660023747012019, "grad/layer_0/attn_mlp_ratio": 0.8922847192920463, "grad/layer_4/attn": 0.003029055427759886, "grad/layer_4/mlp": 0.0025884704664349556, "grad/layer_4/attn_mlp_ratio": 1.170210496900391, "grad/layer_8/attn": 0.007177280727773905, "grad/layer_8/mlp": 0.003768155351281166, "grad/layer_8/attn_mlp_ratio": 1.9047199141780742, "grad/layer_12/attn": 0.0047252485528588295, "grad/layer_12/mlp": 0.0062436508014798164, "grad/layer_12/attn_mlp_ratio": 0.7568085768118284, "grad/layer_16/attn": 0.003114019986242056, "grad/layer_16/mlp": 0.004642950836569071, "grad/layer_16/attn_mlp_ratio": 0.6706984477727811, "grad/layer_20/attn": 0.004568783566355705, "grad/layer_20/mlp": 0.00524093396961689, "grad/layer_20/attn_mlp_ratio": 0.8717498647506706, "grad/layer_24/attn": 0.0043054791167378426, "grad/layer_24/mlp": 0.007484007626771927, "grad/layer_24/attn_mlp_ratio": 0.5752905761088681, "grad/layer_27/attn": 0.005090267863124609, "grad/layer_27/mlp": 0.006494075991213322, "grad/layer_27/attn_mlp_ratio": 0.783832494665699} {"step": 64600, "timestamp": 1778264366.6319547, "train/loss": 2.17640061378479, "train/z_loss": 0.0013834552722983063, "train/perplexity": 8.814522221745044, "train/grad_norm": 0.08984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028645.7671639756, "perf/iters_per_sec": 0.9673336826152685, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03376944065094, "data/tokens_consumed": 135478116352, "data/tokens_consumed_B": 135.478116352, "train/loss_slope": -3.061919365421338e-05} {"step": 64610, "timestamp": 1778264376.9695356, "train/loss": 2.1272308111190794, "train/z_loss": 0.0013769883895292877, "train/perplexity": 8.391596690382102, "train/grad_norm": 0.1787109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029689.9757976567, "perf/iters_per_sec": 0.9678316000927242, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033237600326538, "data/tokens_consumed": 135499087872, "data/tokens_consumed_B": 135.499087872, "train/loss_slope": -3.193292201477376e-05} {"step": 64620, "timestamp": 1778264387.308051, "train/loss": 2.1207826375961303, "train/z_loss": 0.0013838457060046494, "train/perplexity": 8.337660301270711, "train/grad_norm": 0.224609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029743.322151172, "perf/iters_per_sec": 0.9678570376163349, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0332104444503785, "data/tokens_consumed": 135520059392, "data/tokens_consumed_B": 135.520059392, "train/loss_slope": -3.356300386050565e-05} {"step": 64630, "timestamp": 1778264397.6508758, "train/loss": 2.139443850517273, "train/z_loss": 0.001364523870870471, "train/perplexity": 8.494711985249284, "train/grad_norm": 0.1904296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029038.7113719727, "perf/iters_per_sec": 0.9675210530147422, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0335692405700683, "data/tokens_consumed": 135541030912, "data/tokens_consumed_B": 135.541030912, "train/loss_slope": -3.536486866736673e-05} {"step": 64640, "timestamp": 1778264407.992506, "train/loss": 2.123150134086609, "train/z_loss": 0.0013693091343156994, "train/perplexity": 8.357423067681909, "train/grad_norm": 0.2177734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028858.8094830385, "perf/iters_per_sec": 0.9674352691092675, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033660888671875, "data/tokens_consumed": 135562002432, "data/tokens_consumed_B": 135.562002432, "train/loss_slope": -3.822818931930955e-05} {"step": 64650, "timestamp": 1778264418.3306012, "grad/layer_0/attn": 0.0028296809177845716, "grad/layer_0/mlp": 0.0028496887534856796, "grad/layer_0/attn_mlp_ratio": 0.9929789051613004, "grad/layer_4/attn": 0.00455069076269865, "grad/layer_4/mlp": 0.002495288150385022, "grad/layer_4/attn_mlp_ratio": 1.8237134575520404, "grad/layer_8/attn": 0.004227639641612768, "grad/layer_8/mlp": 0.00354183092713356, "grad/layer_8/attn_mlp_ratio": 1.1936310934161758, "grad/layer_12/attn": 0.005120637360960245, "grad/layer_12/mlp": 0.0067656743340194225, "grad/layer_12/attn_mlp_ratio": 0.7568554193522025, "grad/layer_16/attn": 0.004997346084564924, "grad/layer_16/mlp": 0.004726558458060026, "grad/layer_16/attn_mlp_ratio": 1.0572906319002722, "grad/layer_20/attn": 0.005924233701080084, "grad/layer_20/mlp": 0.005711711943149567, "grad/layer_20/attn_mlp_ratio": 1.0372080483618582, "grad/layer_24/attn": 0.008923233486711979, "grad/layer_24/mlp": 0.00924072228372097, "grad/layer_24/attn_mlp_ratio": 0.9656424158387986, "grad/layer_27/attn": 0.005993988830596209, "grad/layer_27/mlp": 0.007994528859853745, "grad/layer_27/attn_mlp_ratio": 0.7497613506307025} {"step": 64650, "timestamp": 1778264418.916909, "eos/sharpness": 51.63629055023193, "eos/L0_probe": 1.9713743925094604, "eos/L_plus": 2.2452778816223145, "eos/L_minus": 2.213833808898926, "eos/grad_norm": 0.12793222069740295, "eos/embed_grad_frac": 0.12422888725996017, "eos/time_s": 0.583402156829834} {"step": 64650, "timestamp": 1778264418.9363973, "train/loss": 2.1167118549346924, "train/z_loss": 0.0013836675323545932, "train/perplexity": 8.30378848745045, "train/grad_norm": 0.1279296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1917628.4441478746, "perf/iters_per_sec": 0.9143964977969525, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0936174869537354, "data/tokens_consumed": 135582973952, "data/tokens_consumed_B": 135.582973952, "train/loss_slope": -3.999931627731938e-05} {"step": 64650, "timestamp": 1778264420.3002222, "geo/rankme_last": 438.84466552734375, "geo/layer_0/stable_rank_q_proj": 19.34028434753418, "geo/layer_0/stable_rank_k_proj": 16.068702697753906, "geo/layer_0/stable_rank_o_proj": 47.05216979980469, "geo/layer_0/stable_rank_gate_proj": 131.33712768554688, "geo/layer_0/stable_rank_down_proj": 55.01448059082031, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06219836324453354, "geo/layer_0/attn_entropy_mean": 6.1629743576049805, "geo/layer_0/attn_entropy_std": 0.4213745594024658, "geo/layer_7/stable_rank_q_proj": 43.527687072753906, "geo/layer_7/stable_rank_k_proj": 40.90211868286133, "geo/layer_7/stable_rank_o_proj": 89.92697143554688, "geo/layer_7/stable_rank_gate_proj": 81.08960723876953, "geo/layer_7/stable_rank_down_proj": 140.4460906982422, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.46268710494041443, "geo/layer_7/attn_entropy_mean": 4.655237674713135, "geo/layer_7/attn_entropy_std": 0.7939611077308655, "geo/layer_14/stable_rank_q_proj": 50.96304702758789, "geo/layer_14/stable_rank_k_proj": 40.27376937866211, "geo/layer_14/stable_rank_o_proj": 43.766422271728516, "geo/layer_14/stable_rank_gate_proj": 71.71150970458984, "geo/layer_14/stable_rank_down_proj": 129.004150390625, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3944953382015228, "geo/layer_14/attn_entropy_mean": 5.592926502227783, "geo/layer_14/attn_entropy_std": 0.4061639904975891, "geo/layer_21/stable_rank_q_proj": 40.29646682739258, "geo/layer_21/stable_rank_k_proj": 30.297698974609375, "geo/layer_21/stable_rank_o_proj": 69.81961059570312, "geo/layer_21/stable_rank_gate_proj": 65.69014739990234, "geo/layer_21/stable_rank_down_proj": 51.17697525024414, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13767321407794952, "geo/layer_21/attn_entropy_mean": 5.680729389190674, "geo/layer_21/attn_entropy_std": 0.309312641620636, "geo/layer_27/stable_rank_q_proj": 43.46276092529297, "geo/layer_27/stable_rank_k_proj": 31.896883010864258, "geo/layer_27/stable_rank_o_proj": 114.78609466552734, "geo/layer_27/stable_rank_gate_proj": 80.2852783203125, "geo/layer_27/stable_rank_down_proj": 128.51109313964844, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09191985428333282, "geo/layer_27/attn_entropy_mean": 4.171855449676514, "geo/layer_27/attn_entropy_std": 0.760629415512085, "attnres/final_alpha/block_0": 0.23740217089653015, "attnres/block_norm/0": 1.7656968832015991, "attnres/final_alpha/block_1": 0.004322705790400505, "attnres/block_norm/1": 46810.01171875, "attnres/final_alpha/block_2": 0.010244611650705338, "attnres/block_norm/2": 28701.505859375, "attnres/final_alpha/block_3": 0.012072339653968811, "attnres/block_norm/3": 58668.1484375, "attnres/final_alpha/block_4": 0.014490775763988495, "attnres/block_norm/4": 15229.115234375, "attnres/final_alpha/block_5": 0.6147324442863464, "attnres/block_norm/5": 6660.140625, "attnres/final_alpha/block_6": 0.10673495382070541, "attnres/block_norm/6": 38989.765625, "geo/tier1_time_s": 1.35960054397583, "geo/step": 64650.0, "geo/rankme_slope": 2.8177618703731492e-05} {"step": 64660, "timestamp": 1778264430.6424143, "train/loss": 2.1380154848098756, "train/z_loss": 0.0013812582823447884, "train/perplexity": 8.482587091408083, "train/grad_norm": 0.357421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1792133.7574117526, "perf/iters_per_sec": 0.8545559680041087, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1701983690261841, "data/tokens_consumed": 135603945472, "data/tokens_consumed_B": 135.603945472, "train/loss_slope": -4.1137111126178344e-05} {"step": 64670, "timestamp": 1778264440.9885588, "train/loss": 2.172988533973694, "train/z_loss": 0.0013721331371925772, "train/perplexity": 8.78449762072484, "train/grad_norm": 0.25, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027786.2850681883, "perf/iters_per_sec": 0.9669238496151868, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342076063156127, "data/tokens_consumed": 135624916992, "data/tokens_consumed_B": 135.624916992, "train/loss_slope": -3.670769576871854e-05} {"step": 64680, "timestamp": 1778264451.332883, "train/loss": 2.164152812957764, "train/z_loss": 0.0013882375438697635, "train/perplexity": 8.70722214549376, "train/grad_norm": 0.1396484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028346.9376050783, "perf/iters_per_sec": 0.9671911895776168, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03392174243927, "data/tokens_consumed": 135645888512, "data/tokens_consumed_B": 135.645888512, "train/loss_slope": -3.169311808280816e-05} {"step": 64690, "timestamp": 1778264461.6808796, "train/loss": 2.160219168663025, "train/z_loss": 0.0013695701025426387, "train/perplexity": 8.67303830838481, "train/grad_norm": 0.1357421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027989.000698699, "perf/iters_per_sec": 0.9670205119603629, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341042280197144, "data/tokens_consumed": 135666860032, "data/tokens_consumed_B": 135.666860032, "train/loss_slope": -3.03137443961948e-05} {"step": 64700, "timestamp": 1778264472.0116487, "grad/layer_0/attn": 0.0029272849205881357, "grad/layer_0/mlp": 0.003028872888535261, "grad/layer_0/attn_mlp_ratio": 0.9664601096408956, "grad/layer_4/attn": 0.0020420553628355265, "grad/layer_4/mlp": 0.0025740433484315872, "grad/layer_4/attn_mlp_ratio": 0.7933259106717057, "grad/layer_8/attn": 0.004823953378945589, "grad/layer_8/mlp": 0.0037044209893792868, "grad/layer_8/attn_mlp_ratio": 1.3022151808756415, "grad/layer_12/attn": 0.004163796082139015, "grad/layer_12/mlp": 0.006477620918303728, "grad/layer_12/attn_mlp_ratio": 0.6427971118368042, "grad/layer_16/attn": 0.004338915925472975, "grad/layer_16/mlp": 0.004655295517295599, "grad/layer_16/attn_mlp_ratio": 0.9320387537480556, "grad/layer_20/attn": 0.005273588467389345, "grad/layer_20/mlp": 0.006498619914054871, "grad/layer_20/attn_mlp_ratio": 0.8114935872514331, "grad/layer_24/attn": 0.01258845441043377, "grad/layer_24/mlp": 0.011518380604684353, "grad/layer_24/attn_mlp_ratio": 1.0929014010896718, "grad/layer_27/attn": 0.00841448176652193, "grad/layer_27/mlp": 0.011676223017275333, "grad/layer_27/attn_mlp_ratio": 0.7206509915070435} {"step": 64700, "timestamp": 1778264472.0257797, "train/loss": 2.167063522338867, "train/z_loss": 0.001377031672745943, "train/perplexity": 8.732603259279449, "train/grad_norm": 0.232421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028009.9010398814, "perf/iters_per_sec": 0.9670304780196578, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340935707092285, "data/tokens_consumed": 135687831552, "data/tokens_consumed_B": 135.687831552, "train/loss_slope": -2.6659912588071975e-05} {"step": 64710, "timestamp": 1778264482.3762906, "train/loss": 2.142190432548523, "train/z_loss": 0.0013852217583917082, "train/perplexity": 8.518075478737027, "train/grad_norm": 0.1416015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027204.4975549811, "perf/iters_per_sec": 0.9666464317107111, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345044136047363, "data/tokens_consumed": 135708803072, "data/tokens_consumed_B": 135.708803072, "train/loss_slope": -2.668344161906039e-05} {"step": 64720, "timestamp": 1778264493.3145103, "train/loss": 2.1552011251449583, "train/z_loss": 0.0013863135944120585, "train/perplexity": 8.629625639148172, "train/grad_norm": 0.234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1918471.7083395494, "perf/iters_per_sec": 0.9147985974977252, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0931367874145508, "data/tokens_consumed": 135729774592, "data/tokens_consumed_B": 135.729774592, "train/loss_slope": -2.5210537990578052e-05} {"step": 64725, "timestamp": 1778264499.0688834, "eos/sharpness": 18.691897392272946, "eos/L0_probe": 1.9702637195587158, "eos/L_plus": 2.070066213607788, "eos/L_minus": 2.057380199432373, "eos/grad_norm": 0.10329926759004593, "eos/embed_grad_frac": 0.20843826234340668, "eos/time_s": 0.5847156047821045} {"step": 64725, "timestamp": 1778264500.4488997, "geo/rankme_last": 437.86102294921875, "geo/layer_0/stable_rank_q_proj": 19.322734832763672, "geo/layer_0/stable_rank_k_proj": 16.083271026611328, "geo/layer_0/stable_rank_o_proj": 47.04073715209961, "geo/layer_0/stable_rank_gate_proj": 131.30795288085938, "geo/layer_0/stable_rank_down_proj": 54.93679428100586, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06518730521202087, "geo/layer_0/attn_entropy_mean": 6.161994934082031, "geo/layer_0/attn_entropy_std": 0.41619783639907837, "geo/layer_7/stable_rank_q_proj": 43.57453155517578, "geo/layer_7/stable_rank_k_proj": 40.80788040161133, "geo/layer_7/stable_rank_o_proj": 89.78779602050781, "geo/layer_7/stable_rank_gate_proj": 81.09632873535156, "geo/layer_7/stable_rank_down_proj": 140.44737243652344, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.47248125076293945, "geo/layer_7/attn_entropy_mean": 4.640741348266602, "geo/layer_7/attn_entropy_std": 0.8066238164901733, "geo/layer_14/stable_rank_q_proj": 51.03984069824219, "geo/layer_14/stable_rank_k_proj": 40.180912017822266, "geo/layer_14/stable_rank_o_proj": 43.69324493408203, "geo/layer_14/stable_rank_gate_proj": 71.67724609375, "geo/layer_14/stable_rank_down_proj": 128.9438934326172, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3991125226020813, "geo/layer_14/attn_entropy_mean": 5.559161186218262, "geo/layer_14/attn_entropy_std": 0.4162672162055969, "geo/layer_21/stable_rank_q_proj": 40.34107208251953, "geo/layer_21/stable_rank_k_proj": 30.2503604888916, "geo/layer_21/stable_rank_o_proj": 69.87874603271484, "geo/layer_21/stable_rank_gate_proj": 65.62469482421875, "geo/layer_21/stable_rank_down_proj": 51.17938995361328, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14538735151290894, "geo/layer_21/attn_entropy_mean": 5.691929817199707, "geo/layer_21/attn_entropy_std": 0.3050891160964966, "geo/layer_27/stable_rank_q_proj": 43.47748947143555, "geo/layer_27/stable_rank_k_proj": 31.918031692504883, "geo/layer_27/stable_rank_o_proj": 114.82220458984375, "geo/layer_27/stable_rank_gate_proj": 80.2156753540039, "geo/layer_27/stable_rank_down_proj": 128.4156036376953, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10033949464559555, "geo/layer_27/attn_entropy_mean": 4.20594596862793, "geo/layer_27/attn_entropy_std": 0.7461920976638794, "attnres/final_alpha/block_0": 0.237777441740036, "attnres/block_norm/0": 1.7656919956207275, "attnres/final_alpha/block_1": 0.004319451749324799, "attnres/block_norm/1": 46978.32421875, "attnres/final_alpha/block_2": 0.010048137977719307, "attnres/block_norm/2": 28716.580078125, "attnres/final_alpha/block_3": 0.012043696828186512, "attnres/block_norm/3": 58739.0234375, "attnres/final_alpha/block_4": 0.014360246248543262, "attnres/block_norm/4": 15227.5537109375, "attnres/final_alpha/block_5": 0.6131841540336609, "attnres/block_norm/5": 6671.833984375, "attnres/final_alpha/block_6": 0.10826683789491653, "attnres/block_norm/6": 39127.453125, "geo/tier1_time_s": 1.3596019744873047, "geo/step": 64725.0, "geo/rankme_slope": 2.1208874174669868e-05} {"step": 64730, "timestamp": 1778264505.6217854, "train/loss": 2.1198739767074586, "train/z_loss": 0.0013836683123372496, "train/perplexity": 8.330087636465121, "train/grad_norm": 0.13671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1705185.2732653602, "perf/iters_per_sec": 0.8130956999136735, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2298675298690795, "data/tokens_consumed": 135750746112, "data/tokens_consumed_B": 135.750746112, "train/loss_slope": -2.468929001063937e-05} {"step": 64740, "timestamp": 1778264515.962391, "train/loss": 2.1551498651504515, "train/z_loss": 0.0013804001151584088, "train/perplexity": 8.629183295922665, "train/grad_norm": 0.2373046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029051.816814452, "perf/iters_per_sec": 0.967527302176691, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0335625648498534, "data/tokens_consumed": 135771717632, "data/tokens_consumed_B": 135.771717632, "train/loss_slope": -2.096381022198363e-05} {"step": 64750, "timestamp": 1778264526.292353, "grad/layer_0/attn": 0.0026039688382297754, "grad/layer_0/mlp": 0.002992480993270874, "grad/layer_0/attn_mlp_ratio": 0.8701705230770755, "grad/layer_4/attn": 0.0024666127283126116, "grad/layer_4/mlp": 0.0024988630320876837, "grad/layer_4/attn_mlp_ratio": 0.987093969509195, "grad/layer_8/attn": 0.005967702716588974, "grad/layer_8/mlp": 0.0036067424807697535, "grad/layer_8/attn_mlp_ratio": 1.654596241053428, "grad/layer_12/attn": 0.0060491664335131645, "grad/layer_12/mlp": 0.006406648550182581, "grad/layer_12/attn_mlp_ratio": 0.9442013701408102, "grad/layer_16/attn": 0.0032820894848555326, "grad/layer_16/mlp": 0.004433372989296913, "grad/layer_16/attn_mlp_ratio": 0.7403142976572806, "grad/layer_20/attn": 0.003039600793272257, "grad/layer_20/mlp": 0.005768096540123224, "grad/layer_20/attn_mlp_ratio": 0.5269677300703689, "grad/layer_24/attn": 0.011603417806327343, "grad/layer_24/mlp": 0.01036645844578743, "grad/layer_24/attn_mlp_ratio": 1.119323224520352, "grad/layer_27/attn": 0.008492137305438519, "grad/layer_27/mlp": 0.0097999582067132, "grad/layer_27/attn_mlp_ratio": 0.8665483096618082} {"step": 64750, "timestamp": 1778264526.3065057, "train/loss": 2.151526117324829, "train/z_loss": 0.0013834903365932405, "train/perplexity": 8.597969900610542, "train/grad_norm": 0.173828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028610.67772892, "perf/iters_per_sec": 0.9673169506687737, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0337873220443725, "data/tokens_consumed": 135792689152, "data/tokens_consumed_B": 135.792689152, "train/loss_slope": -1.7268800685400714e-05} {"step": 64760, "timestamp": 1778264536.6482024, "train/loss": 2.1425506591796877, "train/z_loss": 0.0013724125805310906, "train/perplexity": 8.52114446910358, "train/grad_norm": 0.1396484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028934.1547265062, "perf/iters_per_sec": 0.9674711965210467, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0336225032806396, "data/tokens_consumed": 135813660672, "data/tokens_consumed_B": 135.813660672, "train/loss_slope": -1.985100503801904e-05} {"step": 64770, "timestamp": 1778264547.0040584, "train/loss": 2.1532727479934692, "train/z_loss": 0.0013789700460620224, "train/perplexity": 8.613000501159759, "train/grad_norm": 0.2041015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026376.392210909, "perf/iters_per_sec": 0.9662515603117509, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349271774291993, "data/tokens_consumed": 135834632192, "data/tokens_consumed_B": 135.834632192, "train/loss_slope": -1.9053942371051347e-05} {"step": 64780, "timestamp": 1778264557.3465834, "train/loss": 2.0821765065193176, "train/z_loss": 0.0013838924816809595, "train/perplexity": 8.021909666142042, "train/grad_norm": 0.1328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029020.1300169248, "perf/iters_per_sec": 0.9675121927342056, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0335787057876586, "data/tokens_consumed": 135855603712, "data/tokens_consumed_B": 135.855603712, "train/loss_slope": -2.118248904463121e-05} {"step": 64790, "timestamp": 1778264567.6848981, "train/loss": 2.0990998268127443, "train/z_loss": 0.0013924550265073776, "train/perplexity": 8.15882225294903, "train/grad_norm": 0.134765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029258.15623683, "perf/iters_per_sec": 0.967625692480483, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0334574699401855, "data/tokens_consumed": 135876575232, "data/tokens_consumed_B": 135.876575232, "train/loss_slope": -2.2239586071606073e-05} {"step": 64800, "timestamp": 1778264578.0222332, "grad/layer_0/attn": 0.0026586949825286865, "grad/layer_0/mlp": 0.00292890053242445, "grad/layer_0/attn_mlp_ratio": 0.9077450266135878, "grad/layer_4/attn": 0.0019490043632686138, "grad/layer_4/mlp": 0.0025600444059818983, "grad/layer_4/attn_mlp_ratio": 0.7613165937992465, "grad/layer_8/attn": 0.00314927170984447, "grad/layer_8/mlp": 0.0035236214753240347, "grad/layer_8/attn_mlp_ratio": 0.8937599122161297, "grad/layer_12/attn": 0.00439284835010767, "grad/layer_12/mlp": 0.006538330111652613, "grad/layer_12/attn_mlp_ratio": 0.6718608892341863, "grad/layer_16/attn": 0.003949010744690895, "grad/layer_16/mlp": 0.004460583440959454, "grad/layer_16/attn_mlp_ratio": 0.8853125848734757, "grad/layer_20/attn": 0.00320617132820189, "grad/layer_20/mlp": 0.005526657681912184, "grad/layer_20/attn_mlp_ratio": 0.5801284347106034, "grad/layer_24/attn": 0.00795039813965559, "grad/layer_24/mlp": 0.00814939383417368, "grad/layer_24/attn_mlp_ratio": 0.9755815222425781, "grad/layer_27/attn": 0.005839386954903603, "grad/layer_27/mlp": 0.0077892025001347065, "grad/layer_27/attn_mlp_ratio": 0.7496771177581921} {"step": 64800, "timestamp": 1778264578.6039228, "eos/sharpness": 46.953082084655755, "eos/L0_probe": 1.9746454954147339, "eos/L_plus": 2.2393596172332764, "eos/L_minus": 2.179462194442749, "eos/grad_norm": 0.1357414871454239, "eos/embed_grad_frac": 0.11237400770187378, "eos/time_s": 0.5787875652313232} {"step": 64800, "timestamp": 1778264578.6236668, "train/loss": 2.1769593238830565, "train/z_loss": 0.0013652916764840483, "train/perplexity": 8.819448360335269, "train/grad_norm": 0.1357421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1918296.653939803, "perf/iters_per_sec": 0.9147151250552191, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.093236541748047, "data/tokens_consumed": 135897546752, "data/tokens_consumed_B": 135.897546752, "train/loss_slope": -1.985579232285407e-05} {"step": 64800, "timestamp": 1778264579.990125, "geo/rankme_last": 439.1957702636719, "geo/layer_0/stable_rank_q_proj": 19.3211612701416, "geo/layer_0/stable_rank_k_proj": 16.070032119750977, "geo/layer_0/stable_rank_o_proj": 47.003173828125, "geo/layer_0/stable_rank_gate_proj": 131.30369567871094, "geo/layer_0/stable_rank_down_proj": 54.964134216308594, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.062416061758995056, "geo/layer_0/attn_entropy_mean": 6.159025192260742, "geo/layer_0/attn_entropy_std": 0.4167499542236328, "geo/layer_7/stable_rank_q_proj": 43.57912826538086, "geo/layer_7/stable_rank_k_proj": 40.84587478637695, "geo/layer_7/stable_rank_o_proj": 89.92615509033203, "geo/layer_7/stable_rank_gate_proj": 81.03973388671875, "geo/layer_7/stable_rank_down_proj": 140.39918518066406, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.45076751708984375, "geo/layer_7/attn_entropy_mean": 4.641308784484863, "geo/layer_7/attn_entropy_std": 0.7838069796562195, "geo/layer_14/stable_rank_q_proj": 51.07282638549805, "geo/layer_14/stable_rank_k_proj": 40.12965393066406, "geo/layer_14/stable_rank_o_proj": 43.70060729980469, "geo/layer_14/stable_rank_gate_proj": 71.6258773803711, "geo/layer_14/stable_rank_down_proj": 128.89845275878906, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3946197032928467, "geo/layer_14/attn_entropy_mean": 5.551470756530762, "geo/layer_14/attn_entropy_std": 0.42667797207832336, "geo/layer_21/stable_rank_q_proj": 40.331329345703125, "geo/layer_21/stable_rank_k_proj": 30.276752471923828, "geo/layer_21/stable_rank_o_proj": 69.77127075195312, "geo/layer_21/stable_rank_gate_proj": 65.59640502929688, "geo/layer_21/stable_rank_down_proj": 51.19660568237305, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13869783282279968, "geo/layer_21/attn_entropy_mean": 5.692163467407227, "geo/layer_21/attn_entropy_std": 0.29058703780174255, "geo/layer_27/stable_rank_q_proj": 43.45592498779297, "geo/layer_27/stable_rank_k_proj": 31.89462661743164, "geo/layer_27/stable_rank_o_proj": 114.95377349853516, "geo/layer_27/stable_rank_gate_proj": 80.14463806152344, "geo/layer_27/stable_rank_down_proj": 128.16783142089844, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10049832612276077, "geo/layer_27/attn_entropy_mean": 4.200108051300049, "geo/layer_27/attn_entropy_std": 0.7584481239318848, "attnres/final_alpha/block_0": 0.2352081835269928, "attnres/block_norm/0": 1.7655761241912842, "attnres/final_alpha/block_1": 0.00429381662979722, "attnres/block_norm/1": 47082.14453125, "attnres/final_alpha/block_2": 0.01000232994556427, "attnres/block_norm/2": 28652.130859375, "attnres/final_alpha/block_3": 0.011937528848648071, "attnres/block_norm/3": 58735.03125, "attnres/final_alpha/block_4": 0.014022612944245338, "attnres/block_norm/4": 15270.9765625, "attnres/final_alpha/block_5": 0.6181520223617554, "attnres/block_norm/5": 6611.6142578125, "attnres/final_alpha/block_6": 0.1063835397362709, "attnres/block_norm/6": 39139.0, "geo/tier1_time_s": 1.3621268272399902, "geo/step": 64800.0, "geo/rankme_slope": 9.362729466786712e-06} {"step": 64810, "timestamp": 1778264590.855753, "train/loss": 2.146668040752411, "train/z_loss": 0.001379386894404888, "train/perplexity": 8.556301600313548, "train/grad_norm": 0.205078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1715085.1751714994, "perf/iters_per_sec": 0.8178163410050866, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2227684259414673, "data/tokens_consumed": 135918518272, "data/tokens_consumed_B": 135.918518272, "train/loss_slope": -1.9975348739269184e-05} {"step": 64820, "timestamp": 1778264601.2025163, "train/loss": 2.125624680519104, "train/z_loss": 0.001379190234001726, "train/perplexity": 8.378129508076881, "train/grad_norm": 0.12890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028420.7012136544, "perf/iters_per_sec": 0.967226362807109, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0338841438293458, "data/tokens_consumed": 135939489792, "data/tokens_consumed_B": 135.939489792, "train/loss_slope": -1.8334996081528733e-05} {"step": 64830, "timestamp": 1778264611.544615, "train/loss": 2.1398246765136717, "train/z_loss": 0.0013746981392614543, "train/perplexity": 8.4979476084708, "train/grad_norm": 0.1806640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029358.7666040384, "perf/iters_per_sec": 0.9676736672420685, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0334062337875367, "data/tokens_consumed": 135960461312, "data/tokens_consumed_B": 135.960461312, "train/loss_slope": -1.5417181426184736e-05} {"step": 64840, "timestamp": 1778264622.3338196, "train/loss": 2.151823377609253, "train/z_loss": 0.0013564916560426354, "train/perplexity": 8.600526115500426, "train/grad_norm": 0.119140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1944717.6782668866, "perf/iters_per_sec": 0.9273136512121614, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.078383779525757, "data/tokens_consumed": 135981432832, "data/tokens_consumed_B": 135.981432832, "train/loss_slope": -1.4513925364380121e-05} {"step": 64850, "timestamp": 1778264632.6702929, "grad/layer_0/attn": 0.0024518962018191814, "grad/layer_0/mlp": 0.002734168665483594, "grad/layer_0/attn_mlp_ratio": 0.8967611044249176, "grad/layer_4/attn": 0.002466233680024743, "grad/layer_4/mlp": 0.0024847739841789007, "grad/layer_4/attn_mlp_ratio": 0.992538394426999, "grad/layer_8/attn": 0.0042631905525922775, "grad/layer_8/mlp": 0.003561897436156869, "grad/layer_8/attn_mlp_ratio": 1.1968874762164194, "grad/layer_12/attn": 0.006243248004466295, "grad/layer_12/mlp": 0.007029861211776733, "grad/layer_12/attn_mlp_ratio": 0.8881040076860879, "grad/layer_16/attn": 0.003756582736968994, "grad/layer_16/mlp": 0.004358518868684769, "grad/layer_16/attn_mlp_ratio": 0.861894318680133, "grad/layer_20/attn": 0.0046607800759375095, "grad/layer_20/mlp": 0.0060075074434280396, "grad/layer_20/attn_mlp_ratio": 0.7758259215231793, "grad/layer_24/attn": 0.0047597214579582214, "grad/layer_24/mlp": 0.007677803281694651, "grad/layer_24/attn_mlp_ratio": 0.6199327100907932, "grad/layer_27/attn": 0.00567310955375433, "grad/layer_27/mlp": 0.006548683624714613, "grad/layer_27/attn_mlp_ratio": 0.86629768549428} {"step": 64850, "timestamp": 1778264632.684576, "train/loss": 2.1270907878875733, "train/z_loss": 0.0013848108472302556, "train/perplexity": 8.390421754157122, "train/grad_norm": 0.09716796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027643.0621115873, "perf/iters_per_sec": 0.9668555555875717, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342806577682495, "data/tokens_consumed": 136002404352, "data/tokens_consumed_B": 136.002404352, "train/loss_slope": -1.3014658077536386e-05} {"step": 64860, "timestamp": 1778264643.0270784, "train/loss": 2.1267890691757203, "train/z_loss": 0.0013817539322189987, "train/perplexity": 8.387890588782735, "train/grad_norm": 0.220703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029135.7892740315, "perf/iters_per_sec": 0.9675673433656843, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0335197925567627, "data/tokens_consumed": 136023375872, "data/tokens_consumed_B": 136.023375872, "train/loss_slope": -1.2238497988726289e-05} {"step": 64870, "timestamp": 1778264653.3729556, "train/loss": 2.166610085964203, "train/z_loss": 0.0013807256240397693, "train/perplexity": 8.728644476911972, "train/grad_norm": 0.1533203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028091.823309049, "perf/iters_per_sec": 0.9670695416016812, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340517997741698, "data/tokens_consumed": 136044347392, "data/tokens_consumed_B": 136.044347392, "train/loss_slope": -1.3478497753549578e-05} {"step": 64875, "timestamp": 1778264659.1175725, "eos/sharpness": 74.52244758605956, "eos/L0_probe": 1.9738999605178833, "eos/L_plus": 2.3922457695007324, "eos/L_minus": 2.30077862739563, "eos/grad_norm": 0.18057553470134735, "eos/embed_grad_frac": 0.06733020395040512, "eos/time_s": 0.5840661525726318} {"step": 64875, "timestamp": 1778264660.4920058, "geo/rankme_last": 438.65911865234375, "geo/layer_0/stable_rank_q_proj": 19.326967239379883, "geo/layer_0/stable_rank_k_proj": 16.070472717285156, "geo/layer_0/stable_rank_o_proj": 47.02495193481445, "geo/layer_0/stable_rank_gate_proj": 131.18051147460938, "geo/layer_0/stable_rank_down_proj": 55.10362243652344, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06549540907144547, "geo/layer_0/attn_entropy_mean": 6.162420272827148, "geo/layer_0/attn_entropy_std": 0.41823118925094604, "geo/layer_7/stable_rank_q_proj": 43.54426193237305, "geo/layer_7/stable_rank_k_proj": 40.81341552734375, "geo/layer_7/stable_rank_o_proj": 89.75361633300781, "geo/layer_7/stable_rank_gate_proj": 80.79985046386719, "geo/layer_7/stable_rank_down_proj": 140.42388916015625, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4609620273113251, "geo/layer_7/attn_entropy_mean": 4.654144287109375, "geo/layer_7/attn_entropy_std": 0.8059975504875183, "geo/layer_14/stable_rank_q_proj": 51.172401428222656, "geo/layer_14/stable_rank_k_proj": 40.105655670166016, "geo/layer_14/stable_rank_o_proj": 43.67690658569336, "geo/layer_14/stable_rank_gate_proj": 71.572265625, "geo/layer_14/stable_rank_down_proj": 128.52183532714844, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3920920491218567, "geo/layer_14/attn_entropy_mean": 5.515295505523682, "geo/layer_14/attn_entropy_std": 0.4025121033191681, "geo/layer_21/stable_rank_q_proj": 40.30198669433594, "geo/layer_21/stable_rank_k_proj": 30.25513458251953, "geo/layer_21/stable_rank_o_proj": 69.87010955810547, "geo/layer_21/stable_rank_gate_proj": 65.5887451171875, "geo/layer_21/stable_rank_down_proj": 51.212196350097656, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14345715939998627, "geo/layer_21/attn_entropy_mean": 5.680056571960449, "geo/layer_21/attn_entropy_std": 0.30422425270080566, "geo/layer_27/stable_rank_q_proj": 43.36628341674805, "geo/layer_27/stable_rank_k_proj": 31.837909698486328, "geo/layer_27/stable_rank_o_proj": 114.8663101196289, "geo/layer_27/stable_rank_gate_proj": 80.00447082519531, "geo/layer_27/stable_rank_down_proj": 128.4347686767578, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08760204911231995, "geo/layer_27/attn_entropy_mean": 4.20300817489624, "geo/layer_27/attn_entropy_std": 0.7581939697265625, "attnres/final_alpha/block_0": 0.2368430495262146, "attnres/block_norm/0": 1.7655017375946045, "attnres/final_alpha/block_1": 0.004332429729402065, "attnres/block_norm/1": 46962.79296875, "attnres/final_alpha/block_2": 0.01023477129638195, "attnres/block_norm/2": 28787.05859375, "attnres/final_alpha/block_3": 0.012115254998207092, "attnres/block_norm/3": 58403.0390625, "attnres/final_alpha/block_4": 0.014319672249257565, "attnres/block_norm/4": 15240.447265625, "attnres/final_alpha/block_5": 0.6151116490364075, "attnres/block_norm/5": 6676.4599609375, "attnres/final_alpha/block_6": 0.10704316198825836, "attnres/block_norm/6": 38983.71875, "geo/tier1_time_s": 1.3564040660858154, "geo/step": 64875.0, "geo/rankme_slope": -1.645451149209684e-05} {"step": 64880, "timestamp": 1778264665.6719284, "train/loss": 2.1690372705459593, "train/z_loss": 0.0013770516379736364, "train/perplexity": 8.749856240224794, "train/grad_norm": 0.205078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1705810.6289045508, "perf/iters_per_sec": 0.8133938927195314, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2294166564941407, "data/tokens_consumed": 136065318912, "data/tokens_consumed_B": 136.065318912, "train/loss_slope": -1.2250232489088268e-05} {"step": 64890, "timestamp": 1778264676.014048, "train/loss": 2.1324464440345765, "train/z_loss": 0.0013707200647331773, "train/perplexity": 8.435478514561566, "train/grad_norm": 0.0908203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029122.6359380716, "perf/iters_per_sec": 0.9675610713663443, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0335264921188354, "data/tokens_consumed": 136086290432, "data/tokens_consumed_B": 136.086290432, "train/loss_slope": -1.3395616123349373e-05} {"step": 64900, "timestamp": 1778264686.345858, "grad/layer_0/attn": 0.002799177309498191, "grad/layer_0/mlp": 0.0030133023392409086, "grad/layer_0/attn_mlp_ratio": 0.9289400469882277, "grad/layer_4/attn": 0.0022018325980752707, "grad/layer_4/mlp": 0.0025111923459917307, "grad/layer_4/attn_mlp_ratio": 0.8768075905889856, "grad/layer_8/attn": 0.005284077953547239, "grad/layer_8/mlp": 0.0035450023133307695, "grad/layer_8/attn_mlp_ratio": 1.490571045502474, "grad/layer_12/attn": 0.005127452313899994, "grad/layer_12/mlp": 0.006565647665411234, "grad/layer_12/attn_mlp_ratio": 0.780951476092297, "grad/layer_16/attn": 0.003318206639960408, "grad/layer_16/mlp": 0.0048225088976323605, "grad/layer_16/attn_mlp_ratio": 0.6880664487281417, "grad/layer_20/attn": 0.009668788872659206, "grad/layer_20/mlp": 0.005511371418833733, "grad/layer_20/attn_mlp_ratio": 1.7543344410041206, "grad/layer_24/attn": 0.007002208381891251, "grad/layer_24/mlp": 0.00693280017003417, "grad/layer_24/attn_mlp_ratio": 1.0100115550937023, "grad/layer_27/attn": 0.003527421969920397, "grad/layer_27/mlp": 0.005955854430794716, "grad/layer_27/attn_mlp_ratio": 0.5922612702647252} {"step": 64900, "timestamp": 1778264686.360051, "train/loss": 2.124030327796936, "train/z_loss": 0.001374091557227075, "train/perplexity": 8.364782457269705, "train/grad_norm": 0.1103515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027937.4296639843, "perf/iters_per_sec": 0.966995920974724, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341305255889892, "data/tokens_consumed": 136107261952, "data/tokens_consumed_B": 136.107261952, "train/loss_slope": -1.381167773187533e-05} {"step": 64910, "timestamp": 1778264696.7108073, "train/loss": 2.148303246498108, "train/z_loss": 0.0013697183807380498, "train/perplexity": 8.570304359428217, "train/grad_norm": 0.0908203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027166.3278162004, "perf/iters_per_sec": 0.9666282309609415, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034523892402649, "data/tokens_consumed": 136128233472, "data/tokens_consumed_B": 136.128233472, "train/loss_slope": -1.143853619332573e-05} {"step": 64920, "timestamp": 1778264707.0585883, "train/loss": 2.142251396179199, "train/z_loss": 0.0013649731990881264, "train/perplexity": 8.518594787373893, "train/grad_norm": 0.166015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028692.7889101515, "perf/iters_per_sec": 0.9673561043310888, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0337454795837402, "data/tokens_consumed": 136149204992, "data/tokens_consumed_B": 136.149204992, "train/loss_slope": -9.726546170509055e-06} {"step": 64930, "timestamp": 1778264717.3993287, "train/loss": 2.1420129895210267, "train/z_loss": 0.0013756572734564543, "train/perplexity": 8.516564139727887, "train/grad_norm": 0.2490234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029016.7133312507, "perf/iters_per_sec": 0.9675105635315183, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033580446243286, "data/tokens_consumed": 136170176512, "data/tokens_consumed_B": 136.170176512, "train/loss_slope": -6.4446567380317255e-06} {"step": 64940, "timestamp": 1778264727.7419753, "train/loss": 2.1452890157699587, "train/z_loss": 0.0013781548477709293, "train/perplexity": 8.544510378712525, "train/grad_norm": 0.0947265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028891.193212522, "perf/iters_per_sec": 0.9674507108748064, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0336443901062011, "data/tokens_consumed": 136191148032, "data/tokens_consumed_B": 136.191148032, "train/loss_slope": -5.53697265020688e-06} {"step": 64950, "timestamp": 1778264738.078273, "grad/layer_0/attn": 0.0029232893139123917, "grad/layer_0/mlp": 0.003367156255990267, "grad/layer_0/attn_mlp_ratio": 0.8681774782189062, "grad/layer_4/attn": 0.00287151918746531, "grad/layer_4/mlp": 0.002554468344897032, "grad/layer_4/attn_mlp_ratio": 1.1241161319497366, "grad/layer_8/attn": 0.0028733231592923403, "grad/layer_8/mlp": 0.0038865392562001944, "grad/layer_8/attn_mlp_ratio": 0.7393011869823289, "grad/layer_12/attn": 0.003561132587492466, "grad/layer_12/mlp": 0.006093909963965416, "grad/layer_12/attn_mlp_ratio": 0.5843756389760655, "grad/layer_16/attn": 0.00340783828869462, "grad/layer_16/mlp": 0.004847525618970394, "grad/layer_16/attn_mlp_ratio": 0.7030057159590355, "grad/layer_20/attn": 0.0038457622285932302, "grad/layer_20/mlp": 0.006928399205207825, "grad/layer_20/attn_mlp_ratio": 0.555072255391301, "grad/layer_24/attn": 0.01783153787255287, "grad/layer_24/mlp": 0.012952963821589947, "grad/layer_24/attn_mlp_ratio": 1.376637654554981, "grad/layer_27/attn": 0.006739157252013683, "grad/layer_27/mlp": 0.012083655223250389, "grad/layer_27/attn_mlp_ratio": 0.5577084972828332} {"step": 64950, "timestamp": 1778264738.66385, "eos/sharpness": 78.5426616668701, "eos/L0_probe": 1.9702651500701904, "eos/L_plus": 2.4246652126312256, "eos/L_minus": 2.3012917041778564, "eos/grad_norm": 0.23667335510253906, "eos/embed_grad_frac": 0.04082886874675751, "eos/time_s": 0.5828080177307129} {"step": 64950, "timestamp": 1778264738.6834283, "train/loss": 2.1642361640930177, "train/z_loss": 0.0013773448765277863, "train/perplexity": 8.707947932591663, "train/grad_norm": 0.236328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1917964.0800281456, "perf/iters_per_sec": 0.914556541456292, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0934261083602905, "data/tokens_consumed": 136212119552, "data/tokens_consumed_B": 136.212119552, "train/loss_slope": -2.9690468021601752e-06} {"step": 64950, "timestamp": 1778264740.0441978, "geo/rankme_last": 439.2198181152344, "geo/layer_0/stable_rank_q_proj": 19.354206085205078, "geo/layer_0/stable_rank_k_proj": 16.082517623901367, "geo/layer_0/stable_rank_o_proj": 47.00644302368164, "geo/layer_0/stable_rank_gate_proj": 131.0762481689453, "geo/layer_0/stable_rank_down_proj": 55.047508239746094, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06392936408519745, "geo/layer_0/attn_entropy_mean": 6.168549537658691, "geo/layer_0/attn_entropy_std": 0.41527867317199707, "geo/layer_7/stable_rank_q_proj": 43.57453155517578, "geo/layer_7/stable_rank_k_proj": 40.80225372314453, "geo/layer_7/stable_rank_o_proj": 89.85578918457031, "geo/layer_7/stable_rank_gate_proj": 80.7109375, "geo/layer_7/stable_rank_down_proj": 140.3955078125, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4540677070617676, "geo/layer_7/attn_entropy_mean": 4.661756992340088, "geo/layer_7/attn_entropy_std": 0.8012259602546692, "geo/layer_14/stable_rank_q_proj": 51.12579345703125, "geo/layer_14/stable_rank_k_proj": 40.10507583618164, "geo/layer_14/stable_rank_o_proj": 43.678646087646484, "geo/layer_14/stable_rank_gate_proj": 71.6823501586914, "geo/layer_14/stable_rank_down_proj": 128.53244018554688, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3983580470085144, "geo/layer_14/attn_entropy_mean": 5.527669906616211, "geo/layer_14/attn_entropy_std": 0.42369312047958374, "geo/layer_21/stable_rank_q_proj": 40.19850158691406, "geo/layer_21/stable_rank_k_proj": 30.354280471801758, "geo/layer_21/stable_rank_o_proj": 69.9530258178711, "geo/layer_21/stable_rank_gate_proj": 65.56075286865234, "geo/layer_21/stable_rank_down_proj": 51.129173278808594, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14455962181091309, "geo/layer_21/attn_entropy_mean": 5.702951908111572, "geo/layer_21/attn_entropy_std": 0.2944512665271759, "geo/layer_27/stable_rank_q_proj": 43.47146224975586, "geo/layer_27/stable_rank_k_proj": 31.84136199951172, "geo/layer_27/stable_rank_o_proj": 114.74417877197266, "geo/layer_27/stable_rank_gate_proj": 79.98417663574219, "geo/layer_27/stable_rank_down_proj": 128.33534240722656, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10604283213615417, "geo/layer_27/attn_entropy_mean": 4.2091169357299805, "geo/layer_27/attn_entropy_std": 0.7535973191261292, "attnres/final_alpha/block_0": 0.2354128062725067, "attnres/block_norm/0": 1.7657020092010498, "attnres/final_alpha/block_1": 0.004317452199757099, "attnres/block_norm/1": 46937.5, "attnres/final_alpha/block_2": 0.010075808502733707, "attnres/block_norm/2": 28668.234375, "attnres/final_alpha/block_3": 0.011968129314482212, "attnres/block_norm/3": 58702.90625, "attnres/final_alpha/block_4": 0.014226102270185947, "attnres/block_norm/4": 15200.1865234375, "attnres/final_alpha/block_5": 0.6177060604095459, "attnres/block_norm/5": 6649.24755859375, "attnres/final_alpha/block_6": 0.10629363358020782, "attnres/block_norm/6": 39024.453125, "geo/tier1_time_s": 1.3571603298187256, "geo/step": 64950.0, "geo/rankme_slope": -1.761960643632453e-05} {"step": 64960, "timestamp": 1778264750.3835273, "train/loss": 2.1063947439193726, "train/z_loss": 0.0013817404978908598, "train/perplexity": 8.21855780300891, "train/grad_norm": 0.1796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1792919.4296513735, "perf/iters_per_sec": 0.8549306057221286, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1696855783462525, "data/tokens_consumed": 136233091072, "data/tokens_consumed_B": 136.233091072, "train/loss_slope": -4.763119270091178e-06} {"step": 64970, "timestamp": 1778264760.727615, "train/loss": 2.1375771045684813, "train/z_loss": 0.0013719754060730338, "train/perplexity": 8.478869307792282, "train/grad_norm": 0.134765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028181.468443654, "perf/iters_per_sec": 0.967112287732913, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340060949325562, "data/tokens_consumed": 136254062592, "data/tokens_consumed_B": 136.254062592, "train/loss_slope": -3.248587643245339e-06} {"step": 64980, "timestamp": 1778264771.071772, "train/loss": 2.1479045152664185, "train/z_loss": 0.001365594530943781, "train/perplexity": 8.566887792606243, "train/grad_norm": 0.1337890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028652.1769652348, "perf/iters_per_sec": 0.9673367390466856, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0337661743164062, "data/tokens_consumed": 136275034112, "data/tokens_consumed_B": 136.275034112, "train/loss_slope": -2.5024229508035178e-08} {"step": 64990, "timestamp": 1778264781.418467, "train/loss": 2.1730759143829346, "train/z_loss": 0.0013824890600517392, "train/perplexity": 8.785265247259185, "train/grad_norm": 0.333984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028378.790477328, "perf/iters_per_sec": 0.967206378210701, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0339055061340332, "data/tokens_consumed": 136296005632, "data/tokens_consumed_B": 136.296005632, "train/loss_slope": 3.048338927272732e-06} {"step": 65000, "timestamp": 1778264791.7543292, "grad/layer_0/attn": 0.003174097277224064, "grad/layer_0/mlp": 0.0033814951311796904, "grad/layer_0/attn_mlp_ratio": 0.93866679093814, "grad/layer_4/attn": 0.0028842876199632883, "grad/layer_4/mlp": 0.0025921326596289873, "grad/layer_4/attn_mlp_ratio": 1.1127082936817314, "grad/layer_8/attn": 0.003955490421503782, "grad/layer_8/mlp": 0.003749324008822441, "grad/layer_8/attn_mlp_ratio": 1.0549875942163058, "grad/layer_12/attn": 0.005029354244470596, "grad/layer_12/mlp": 0.006871743593364954, "grad/layer_12/attn_mlp_ratio": 0.7318890908761213, "grad/layer_16/attn": 0.004171039443463087, "grad/layer_16/mlp": 0.0049228486604988575, "grad/layer_16/attn_mlp_ratio": 0.8472816546654207, "grad/layer_20/attn": 0.004003364127129316, "grad/layer_20/mlp": 0.00557901943102479, "grad/layer_20/attn_mlp_ratio": 0.7175748543031096, "grad/layer_24/attn": 0.009030523709952831, "grad/layer_24/mlp": 0.00912655983120203, "grad/layer_24/attn_mlp_ratio": 0.9894772814759186, "grad/layer_27/attn": 0.007256395183503628, "grad/layer_27/mlp": 0.007769329939037561, "grad/layer_27/attn_mlp_ratio": 0.9339795255245105} {"step": 65000, "timestamp": 1778264791.7685146, "train/loss": 2.1694812297821047, "train/z_loss": 0.001382362109143287, "train/perplexity": 8.753741682142682, "train/grad_norm": 0.16796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027192.3036388524, "perf/iters_per_sec": 0.9666406171983969, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345106363296508, "data/tokens_consumed": 136316977152, "data/tokens_consumed_B": 136.316977152, "train/loss_slope": 6.9049207934595485e-06} {"step": 65000, "timestamp": 1778264799.0132535, "geo/ww_alpha_mean": 7.641198060138591, "geo/ww_alpha_std": 4.650910480361078, "geo/ww_alpha_min": 1.3566974127448952, "geo/ww_alpha_max": 35.684752046630656, "geo/ww_alpha_healthy_frac": 0.16751269035532995, "geo/ww_alpha_by_type/q_proj": 3.9236276556528833, "geo/ww_alpha_by_type/k_proj": 4.489030538126109, "geo/ww_alpha_by_type/v_proj": 8.250005002713875, "geo/ww_alpha_by_type/o_proj": 9.521238725053777, "geo/ww_alpha_by_type/gate_proj": 7.864885299825231, "geo/ww_alpha_by_type/up_proj": 11.379200864365346, "geo/ww_alpha_by_type/down_proj": 8.1573470397579, "geo/twonn_id/layer_0": 0.680932343006134, "geo/twonn_id/layer_7": 3.1517064571380615, "geo/twonn_id/layer_14": 4.222764492034912, "geo/twonn_id/layer_21": 7.49693489074707, "geo/twonn_id/layer_27": 5.276371955871582, "geo/tier2_time_s": 7.236006736755371} {"step": 65000, "timestamp": 1778264799.8149483, "eoc/jacobian_sigma/layer_0/attn": 1232.0826416015625, "eoc/jacobian_sigma/layer_0/mlp": 8199.13671875, "eoc/jacobian_sigma/layer_0": 8199.13671875, "eoc/jacobian_sigma/layer_7/attn": 1.146079182624817, "eoc/jacobian_sigma/layer_7/mlp": 1.7820961475372314, "eoc/jacobian_sigma/layer_7": 1.7820961475372314, "eoc/jacobian_sigma/layer_14/attn": 1.4977738857269287, "eoc/jacobian_sigma/layer_14/mlp": 6.084799289703369, "eoc/jacobian_sigma/layer_14": 6.084799289703369, "eoc/jacobian_sigma/layer_21/attn": 1.0869413614273071, "eoc/jacobian_sigma/layer_21/mlp": 4.213105201721191, "eoc/jacobian_sigma/layer_21": 4.213105201721191, "eoc/jacobian_sigma/layer_27/attn": 2.9372689723968506, "eoc/jacobian_sigma/layer_27/mlp": 30.86517906188965, "eoc/jacobian_sigma/layer_27": 30.86517906188965, "eoc/layer0_sigma": 8199.13671875, "eoc/sigma_max": 30.86517906188965, "eoc/sigma_min": 1.7820961475372314, "eoc/sigma_mean": 10.73629492521286, "eoc/time_s": 0.7931716442108154} {"step": 65010, "timestamp": 1778264810.1745007, "train/loss": 2.1141005158424377, "train/z_loss": 0.0013643601560033858, "train/perplexity": 8.28213276747945, "train/grad_norm": 0.1591796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1139834.912054684, "perf/iters_per_sec": 0.5435156402848644, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.839873456954956, "data/tokens_consumed": 136337948672, "data/tokens_consumed_B": 136.337948672, "train/loss_slope": 1.550096790246239e-06} {"step": 65020, "timestamp": 1778264820.8916805, "train/loss": 2.101652216911316, "train/z_loss": 0.0013889954891055823, "train/perplexity": 8.17967334882892, "train/grad_norm": 0.166015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1957608.6994818891, "perf/iters_per_sec": 0.9334605691346594, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0712825298309325, "data/tokens_consumed": 136358920192, "data/tokens_consumed_B": 136.358920192, "train/loss_slope": -1.5090103446989541e-06} {"step": 65025, "timestamp": 1778264827.164713, "eos/sharpness": 6.507730484008788, "eos/L0_probe": 1.972902774810791, "eos/L_plus": 2.008863687515259, "eos/L_minus": 2.002019166946411, "eos/grad_norm": 0.10027742385864258, "eos/embed_grad_frac": 0.27057069540023804, "eos/time_s": 0.6048357486724854} {"step": 65025, "timestamp": 1778264828.5560255, "geo/rankme_last": 437.9639587402344, "geo/layer_0/stable_rank_q_proj": 19.352758407592773, "geo/layer_0/stable_rank_k_proj": 16.1033878326416, "geo/layer_0/stable_rank_o_proj": 46.96123504638672, "geo/layer_0/stable_rank_gate_proj": 131.0904998779297, "geo/layer_0/stable_rank_down_proj": 54.980743408203125, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06258866935968399, "geo/layer_0/attn_entropy_mean": 6.1658735275268555, "geo/layer_0/attn_entropy_std": 0.4130557179450989, "geo/layer_7/stable_rank_q_proj": 43.59385299682617, "geo/layer_7/stable_rank_k_proj": 40.80272674560547, "geo/layer_7/stable_rank_o_proj": 89.91270446777344, "geo/layer_7/stable_rank_gate_proj": 80.6362075805664, "geo/layer_7/stable_rank_down_proj": 140.0643768310547, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4487183392047882, "geo/layer_7/attn_entropy_mean": 4.64154052734375, "geo/layer_7/attn_entropy_std": 0.8029401302337646, "geo/layer_14/stable_rank_q_proj": 51.094425201416016, "geo/layer_14/stable_rank_k_proj": 40.09061813354492, "geo/layer_14/stable_rank_o_proj": 43.625511169433594, "geo/layer_14/stable_rank_gate_proj": 71.69384002685547, "geo/layer_14/stable_rank_down_proj": 128.86688232421875, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.40595516562461853, "geo/layer_14/attn_entropy_mean": 5.543121814727783, "geo/layer_14/attn_entropy_std": 0.40017494559288025, "geo/layer_21/stable_rank_q_proj": 40.146385192871094, "geo/layer_21/stable_rank_k_proj": 30.301298141479492, "geo/layer_21/stable_rank_o_proj": 69.84101867675781, "geo/layer_21/stable_rank_gate_proj": 65.53263854980469, "geo/layer_21/stable_rank_down_proj": 51.15509796142578, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14181958138942719, "geo/layer_21/attn_entropy_mean": 5.69124698638916, "geo/layer_21/attn_entropy_std": 0.297292560338974, "geo/layer_27/stable_rank_q_proj": 43.50798797607422, "geo/layer_27/stable_rank_k_proj": 31.884382247924805, "geo/layer_27/stable_rank_o_proj": 114.83162689208984, "geo/layer_27/stable_rank_gate_proj": 79.95152282714844, "geo/layer_27/stable_rank_down_proj": 128.49453735351562, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10284778475761414, "geo/layer_27/attn_entropy_mean": 4.204363822937012, "geo/layer_27/attn_entropy_std": 0.7641640901565552, "attnres/final_alpha/block_0": 0.2348736971616745, "attnres/block_norm/0": 1.7657616138458252, "attnres/final_alpha/block_1": 0.004243302159011364, "attnres/block_norm/1": 47003.1015625, "attnres/final_alpha/block_2": 0.010105855762958527, "attnres/block_norm/2": 28731.7890625, "attnres/final_alpha/block_3": 0.011867244727909565, "attnres/block_norm/3": 58665.99609375, "attnres/final_alpha/block_4": 0.014212274923920631, "attnres/block_norm/4": 15300.39453125, "attnres/final_alpha/block_5": 0.6175839304924011, "attnres/block_norm/5": 6646.1318359375, "attnres/final_alpha/block_6": 0.10711369663476944, "attnres/block_norm/6": 39435.6328125, "geo/tier1_time_s": 1.3599021434783936, "geo/step": 65025.0, "geo/rankme_slope": -4.56675443614946e-05} {"step": 65030, "timestamp": 1778264833.7341561, "train/loss": 2.160149335861206, "train/z_loss": 0.001365572412032634, "train/perplexity": 8.672432666966516, "train/grad_norm": 0.12353515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1633710.9195459, "perf/iters_per_sec": 0.7790140722016812, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2836738586425782, "data/tokens_consumed": 136379891712, "data/tokens_consumed_B": 136.379891712, "train/loss_slope": -7.686431866445684e-08} {"step": 65040, "timestamp": 1778264844.0729654, "train/loss": 2.108158254623413, "train/z_loss": 0.0013776339357718826, "train/perplexity": 8.233064104916123, "train/grad_norm": 0.162109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029514.0323301672, "perf/iters_per_sec": 0.9677477037096821, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0333271741867065, "data/tokens_consumed": 136400863232, "data/tokens_consumed_B": 136.400863232, "train/loss_slope": -4.0125228319301875e-06} {"step": 65050, "timestamp": 1778264854.405767, "grad/layer_0/attn": 0.002523715840652585, "grad/layer_0/mlp": 0.0027516353875398636, "grad/layer_0/attn_mlp_ratio": 0.9171693896523143, "grad/layer_4/attn": 0.0019703509751707315, "grad/layer_4/mlp": 0.0025209777522832155, "grad/layer_4/attn_mlp_ratio": 0.7815820251598844, "grad/layer_8/attn": 0.004059257451444864, "grad/layer_8/mlp": 0.003803960746154189, "grad/layer_8/attn_mlp_ratio": 1.0671133630486183, "grad/layer_12/attn": 0.006840290501713753, "grad/layer_12/mlp": 0.007295985706150532, "grad/layer_12/attn_mlp_ratio": 0.9375416404932385, "grad/layer_16/attn": 0.0040670521557331085, "grad/layer_16/mlp": 0.005134860053658485, "grad/layer_16/attn_mlp_ratio": 0.7920473068454289, "grad/layer_20/attn": 0.004747740458697081, "grad/layer_20/mlp": 0.006799451541155577, "grad/layer_20/attn_mlp_ratio": 0.6982534341387264, "grad/layer_24/attn": 0.012620530091226101, "grad/layer_24/mlp": 0.008749701082706451, "grad/layer_24/attn_mlp_ratio": 1.4423955547385137, "grad/layer_27/attn": 0.004930640570819378, "grad/layer_27/mlp": 0.007323369383811951, "grad/layer_27/attn_mlp_ratio": 0.6732748609391326} {"step": 65050, "timestamp": 1778264854.419959, "train/loss": 2.1657280683517457, "train/z_loss": 0.001366322219837457, "train/perplexity": 8.720949052999064, "train/grad_norm": 0.11572265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027942.43236027, "perf/iters_per_sec": 0.9669983064462042, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341279745101928, "data/tokens_consumed": 136421834752, "data/tokens_consumed_B": 136.421834752, "train/loss_slope": 5.016996808286027e-07} {"step": 65060, "timestamp": 1778264864.765045, "train/loss": 2.1125892043113708, "train/z_loss": 0.0013813465251587332, "train/perplexity": 8.269625338417658, "train/grad_norm": 0.1083984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028622.186929838, "perf/iters_per_sec": 0.9673224386834326, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0337814569473267, "data/tokens_consumed": 136442806272, "data/tokens_consumed_B": 136.442806272, "train/loss_slope": -4.63149299358276e-07} {"step": 65070, "timestamp": 1778264875.1078286, "train/loss": 2.153892660140991, "train/z_loss": 0.0013770370976999402, "train/perplexity": 8.61834146008866, "train/grad_norm": 0.1884765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029031.0354057096, "perf/iters_per_sec": 0.9675173928288029, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0335731506347656, "data/tokens_consumed": 136463777792, "data/tokens_consumed_B": 136.463777792, "train/loss_slope": -2.521742652780291e-07} {"step": 65080, "timestamp": 1778264885.9595895, "train/loss": 2.119134712219238, "train/z_loss": 0.0013845272362232209, "train/perplexity": 8.323931774177264, "train/grad_norm": 0.296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1933670.8286478878, "perf/iters_per_sec": 0.9220461028327407, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0845444679260254, "data/tokens_consumed": 136484749312, "data/tokens_consumed_B": 136.484749312, "train/loss_slope": -1.4816964026724542e-06} {"step": 65090, "timestamp": 1778264896.7580278, "train/loss": 2.1159135580062864, "train/z_loss": 0.001382259209640324, "train/perplexity": 8.297162243813654, "train/grad_norm": 0.251953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1943472.9379686278, "perf/iters_per_sec": 0.9267201127856387, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.079074454307556, "data/tokens_consumed": 136505720832, "data/tokens_consumed_B": 136.505720832, "train/loss_slope": -1.7904813962765108e-06} {"step": 65100, "timestamp": 1778264907.0918868, "grad/layer_0/attn": 0.0025579871144145727, "grad/layer_0/mlp": 0.003071257844567299, "grad/layer_0/attn_mlp_ratio": 0.8328792828812561, "grad/layer_4/attn": 0.00204250100068748, "grad/layer_4/mlp": 0.0026617776602506638, "grad/layer_4/attn_mlp_ratio": 0.767344678879252, "grad/layer_8/attn": 0.004350576549768448, "grad/layer_8/mlp": 0.0037044247146695852, "grad/layer_8/attn_mlp_ratio": 1.1744270075451635, "grad/layer_12/attn": 0.005831173155456781, "grad/layer_12/mlp": 0.00664669182151556, "grad/layer_12/attn_mlp_ratio": 0.8773045635801296, "grad/layer_16/attn": 0.0035946578718721867, "grad/layer_16/mlp": 0.00480561563745141, "grad/layer_16/attn_mlp_ratio": 0.7480119236038965, "grad/layer_20/attn": 0.003907673992216587, "grad/layer_20/mlp": 0.006680232007056475, "grad/layer_20/attn_mlp_ratio": 0.5849608111803221, "grad/layer_24/attn": 0.012022418901324272, "grad/layer_24/mlp": 0.01051933690905571, "grad/layer_24/attn_mlp_ratio": 1.1428875119196784, "grad/layer_27/attn": 0.010950790718197823, "grad/layer_27/mlp": 0.009854272939264774, "grad/layer_27/attn_mlp_ratio": 1.1112733201692226} {"step": 65100, "timestamp": 1778264907.6790872, "eos/sharpness": 78.56934070587157, "eos/L0_probe": 1.9718631505966187, "eos/L_plus": 2.4370574951171875, "eos/L_minus": 2.2923622131347656, "eos/grad_norm": 0.20236599445343018, "eos/embed_grad_frac": 0.05581635236740112, "eos/time_s": 0.5842757225036621} {"step": 65100, "timestamp": 1778264907.6995828, "train/loss": 2.168162989616394, "train/z_loss": 0.0013767254306003452, "train/perplexity": 8.742209750854505, "train/grad_norm": 0.2021484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1917811.6557516516, "perf/iters_per_sec": 0.9144838598974474, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.093513011932373, "data/tokens_consumed": 136526692352, "data/tokens_consumed_B": 136.526692352, "train/loss_slope": -1.8583067966373395e-07} {"step": 65100, "timestamp": 1778264909.0613923, "geo/rankme_last": 438.41278076171875, "geo/layer_0/stable_rank_q_proj": 19.34007453918457, "geo/layer_0/stable_rank_k_proj": 16.070470809936523, "geo/layer_0/stable_rank_o_proj": 46.929222106933594, "geo/layer_0/stable_rank_gate_proj": 131.08970642089844, "geo/layer_0/stable_rank_down_proj": 54.99716567993164, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06789896637201309, "geo/layer_0/attn_entropy_mean": 6.15991735458374, "geo/layer_0/attn_entropy_std": 0.41415002942085266, "geo/layer_7/stable_rank_q_proj": 43.427005767822266, "geo/layer_7/stable_rank_k_proj": 40.640167236328125, "geo/layer_7/stable_rank_o_proj": 89.83573913574219, "geo/layer_7/stable_rank_gate_proj": 80.7841567993164, "geo/layer_7/stable_rank_down_proj": 140.17831420898438, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4549616277217865, "geo/layer_7/attn_entropy_mean": 4.628104209899902, "geo/layer_7/attn_entropy_std": 0.7902563810348511, "geo/layer_14/stable_rank_q_proj": 51.08359909057617, "geo/layer_14/stable_rank_k_proj": 40.09920883178711, "geo/layer_14/stable_rank_o_proj": 43.646881103515625, "geo/layer_14/stable_rank_gate_proj": 71.73977661132812, "geo/layer_14/stable_rank_down_proj": 129.20668029785156, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.408006876707077, "geo/layer_14/attn_entropy_mean": 5.534669876098633, "geo/layer_14/attn_entropy_std": 0.3996206223964691, "geo/layer_21/stable_rank_q_proj": 40.16291427612305, "geo/layer_21/stable_rank_k_proj": 30.28024673461914, "geo/layer_21/stable_rank_o_proj": 69.89967346191406, "geo/layer_21/stable_rank_gate_proj": 65.51774597167969, "geo/layer_21/stable_rank_down_proj": 51.13286590576172, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1426103115081787, "geo/layer_21/attn_entropy_mean": 5.701303958892822, "geo/layer_21/attn_entropy_std": 0.30340471863746643, "geo/layer_27/stable_rank_q_proj": 43.44903564453125, "geo/layer_27/stable_rank_k_proj": 31.885746002197266, "geo/layer_27/stable_rank_o_proj": 114.88382720947266, "geo/layer_27/stable_rank_gate_proj": 79.95215606689453, "geo/layer_27/stable_rank_down_proj": 128.33364868164062, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10121358931064606, "geo/layer_27/attn_entropy_mean": 4.184429168701172, "geo/layer_27/attn_entropy_std": 0.7462241053581238, "attnres/final_alpha/block_0": 0.2345680296421051, "attnres/block_norm/0": 1.7659285068511963, "attnres/final_alpha/block_1": 0.004299923777580261, "attnres/block_norm/1": 47074.52734375, "attnres/final_alpha/block_2": 0.010069029405713081, "attnres/block_norm/2": 28670.771484375, "attnres/final_alpha/block_3": 0.011818069964647293, "attnres/block_norm/3": 58544.1640625, "attnres/final_alpha/block_4": 0.014346098527312279, "attnres/block_norm/4": 15244.2734375, "attnres/final_alpha/block_5": 0.617824912071228, "attnres/block_norm/5": 6695.134765625, "attnres/final_alpha/block_6": 0.10707393288612366, "attnres/block_norm/6": 39226.765625, "geo/tier1_time_s": 1.358586311340332, "geo/step": 65100.0, "geo/rankme_slope": -4.460258712860144e-05} {"step": 65110, "timestamp": 1778264919.4050946, "train/loss": 2.125611972808838, "train/z_loss": 0.0013747239718213677, "train/perplexity": 8.378023041910996, "train/grad_norm": 0.23828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1792061.9021280918, "perf/iters_per_sec": 0.854521704734846, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1702452898025513, "data/tokens_consumed": 136547663872, "data/tokens_consumed_B": 136.547663872, "train/loss_slope": -1.5515389257889387e-06} {"step": 65120, "timestamp": 1778264929.7560048, "train/loss": 2.170141816139221, "train/z_loss": 0.0013633464463055135, "train/perplexity": 8.759526194846362, "train/grad_norm": 0.10400390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027558.0914739433, "perf/iters_per_sec": 0.9668150384301869, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343240022659301, "data/tokens_consumed": 136568635392, "data/tokens_consumed_B": 136.568635392, "train/loss_slope": 4.819219905694993e-08} {"step": 65130, "timestamp": 1778264940.1063747, "train/loss": 2.13413519859314, "train/z_loss": 0.0013829400180839003, "train/perplexity": 8.449736002667311, "train/grad_norm": 0.1904296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027395.8351027337, "perf/iters_per_sec": 0.966737668563239, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344067811965942, "data/tokens_consumed": 136589606912, "data/tokens_consumed_B": 136.589606912, "train/loss_slope": 2.3995172859418638e-06} {"step": 65140, "timestamp": 1778264950.4509952, "train/loss": 2.141045928001404, "train/z_loss": 0.001379805407486856, "train/perplexity": 8.508332079364916, "train/grad_norm": 0.11572265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028575.3088068503, "perf/iters_per_sec": 0.9673000854524852, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0338053464889527, "data/tokens_consumed": 136610578432, "data/tokens_consumed_B": 136.610578432, "train/loss_slope": 3.448671841099212e-06} {"step": 65150, "timestamp": 1778264960.7822676, "grad/layer_0/attn": 0.002898657228797674, "grad/layer_0/mlp": 0.0029347229283303022, "grad/layer_0/attn_mlp_ratio": 0.9877106632603938, "grad/layer_4/attn": 0.002355991629883647, "grad/layer_4/mlp": 0.002739031333476305, "grad/layer_4/attn_mlp_ratio": 0.8601550172403409, "grad/layer_8/attn": 0.007171766832470894, "grad/layer_8/mlp": 0.00392227852717042, "grad/layer_8/attn_mlp_ratio": 1.8284694980082778, "grad/layer_12/attn": 0.00457774568349123, "grad/layer_12/mlp": 0.006730061024427414, "grad/layer_12/attn_mlp_ratio": 0.6801937751911131, "grad/layer_16/attn": 0.0034326883032917976, "grad/layer_16/mlp": 0.004644221626222134, "grad/layer_16/attn_mlp_ratio": 0.739131011749548, "grad/layer_20/attn": 0.0040407972410321236, "grad/layer_20/mlp": 0.006058950442820787, "grad/layer_20/attn_mlp_ratio": 0.666913719211661, "grad/layer_24/attn": 0.005736690014600754, "grad/layer_24/mlp": 0.007415726315230131, "grad/layer_24/attn_mlp_ratio": 0.7735843656285597, "grad/layer_27/attn": 0.005420290399342775, "grad/layer_27/mlp": 0.006383540108799934, "grad/layer_27/attn_mlp_ratio": 0.8491041368973778} {"step": 65150, "timestamp": 1778264960.7964633, "train/loss": 2.131724643707275, "train/z_loss": 0.0013836085330694915, "train/perplexity": 8.429391980304278, "train/grad_norm": 0.09326171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028504.2472011158, "perf/iters_per_sec": 0.9672662006383494, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0338415622711181, "data/tokens_consumed": 136631549952, "data/tokens_consumed_B": 136.631549952, "train/loss_slope": 2.7670080297671956e-06} {"step": 65160, "timestamp": 1778264971.1340096, "train/loss": 2.128755235671997, "train/z_loss": 0.0013776365434750915, "train/perplexity": 8.404398801862433, "train/grad_norm": 0.11865234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029601.2743730708, "perf/iters_per_sec": 0.9677893039574961, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03328275680542, "data/tokens_consumed": 136652521472, "data/tokens_consumed_B": 136.652521472, "train/loss_slope": 4.44879800823403e-06} {"step": 65170, "timestamp": 1778264981.4778728, "train/loss": 2.0997140407562256, "train/z_loss": 0.0013885595020838083, "train/perplexity": 8.163835054647931, "train/grad_norm": 0.1904296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028650.0715515949, "perf/iters_per_sec": 0.9673357351072287, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0337672472000121, "data/tokens_consumed": 136673492992, "data/tokens_consumed_B": 136.673492992, "train/loss_slope": 2.404244654964737e-06} {"step": 65175, "timestamp": 1778264987.2249079, "eos/sharpness": 74.02703762054442, "eos/L0_probe": 1.9732494354248047, "eos/L_plus": 2.3967528343200684, "eos/L_minus": 2.2900164127349854, "eos/grad_norm": 0.21142402291297913, "eos/embed_grad_frac": 0.05306336283683777, "eos/time_s": 0.5848205089569092} {"step": 65175, "timestamp": 1778264988.598508, "geo/rankme_last": 438.21026611328125, "geo/layer_0/stable_rank_q_proj": 19.333410263061523, "geo/layer_0/stable_rank_k_proj": 16.06012535095215, "geo/layer_0/stable_rank_o_proj": 46.89390182495117, "geo/layer_0/stable_rank_gate_proj": 130.8896942138672, "geo/layer_0/stable_rank_down_proj": 55.004459381103516, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06269880384206772, "geo/layer_0/attn_entropy_mean": 6.16108512878418, "geo/layer_0/attn_entropy_std": 0.41715046763420105, "geo/layer_7/stable_rank_q_proj": 43.32272720336914, "geo/layer_7/stable_rank_k_proj": 40.635398864746094, "geo/layer_7/stable_rank_o_proj": 89.77721405029297, "geo/layer_7/stable_rank_gate_proj": 80.80892944335938, "geo/layer_7/stable_rank_down_proj": 140.1374969482422, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4673922657966614, "geo/layer_7/attn_entropy_mean": 4.650368690490723, "geo/layer_7/attn_entropy_std": 0.786461591720581, "geo/layer_14/stable_rank_q_proj": 51.10557174682617, "geo/layer_14/stable_rank_k_proj": 40.16651916503906, "geo/layer_14/stable_rank_o_proj": 43.61589050292969, "geo/layer_14/stable_rank_gate_proj": 71.68230438232422, "geo/layer_14/stable_rank_down_proj": 129.23880004882812, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.41305890679359436, "geo/layer_14/attn_entropy_mean": 5.550575256347656, "geo/layer_14/attn_entropy_std": 0.4189694821834564, "geo/layer_21/stable_rank_q_proj": 40.18104934692383, "geo/layer_21/stable_rank_k_proj": 30.27187156677246, "geo/layer_21/stable_rank_o_proj": 70.0095443725586, "geo/layer_21/stable_rank_gate_proj": 65.54219055175781, "geo/layer_21/stable_rank_down_proj": 51.131263732910156, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1474304497241974, "geo/layer_21/attn_entropy_mean": 5.712929725646973, "geo/layer_21/attn_entropy_std": 0.29898014664649963, "geo/layer_27/stable_rank_q_proj": 43.46856689453125, "geo/layer_27/stable_rank_k_proj": 31.8353328704834, "geo/layer_27/stable_rank_o_proj": 114.97025299072266, "geo/layer_27/stable_rank_gate_proj": 80.05340576171875, "geo/layer_27/stable_rank_down_proj": 128.42828369140625, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0847158282995224, "geo/layer_27/attn_entropy_mean": 4.196730613708496, "geo/layer_27/attn_entropy_std": 0.7348818778991699, "attnres/final_alpha/block_0": 0.23220205307006836, "attnres/block_norm/0": 1.7660152912139893, "attnres/final_alpha/block_1": 0.004295998718589544, "attnres/block_norm/1": 47046.7578125, "attnres/final_alpha/block_2": 0.009914025664329529, "attnres/block_norm/2": 28836.96484375, "attnres/final_alpha/block_3": 0.011724917218089104, "attnres/block_norm/3": 58915.77734375, "attnres/final_alpha/block_4": 0.014148423448204994, "attnres/block_norm/4": 15231.0419921875, "attnres/final_alpha/block_5": 0.6211302280426025, "attnres/block_norm/5": 6597.02685546875, "attnres/final_alpha/block_6": 0.10658439993858337, "attnres/block_norm/6": 39341.8359375, "geo/tier1_time_s": 1.3558688163757324, "geo/step": 65175.0, "geo/rankme_slope": -6.520166660414165e-05} {"step": 65180, "timestamp": 1778264993.770805, "train/loss": 2.149440360069275, "train/z_loss": 0.0013774703838862479, "train/perplexity": 8.580055311743774, "train/grad_norm": 0.111328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1707000.6025105026, "perf/iters_per_sec": 0.8139613163521302, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2285596132278442, "data/tokens_consumed": 136694464512, "data/tokens_consumed_B": 136.694464512, "train/loss_slope": 4.007594563243528e-06} {"step": 65190, "timestamp": 1778265004.110832, "train/loss": 2.135417675971985, "train/z_loss": 0.001381628680974245, "train/perplexity": 8.460579549763631, "train/grad_norm": 0.140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029227.0247567, "perf/iters_per_sec": 0.9676108478339672, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0334733247756958, "data/tokens_consumed": 136715436032, "data/tokens_consumed_B": 136.715436032, "train/loss_slope": 3.357320063614146e-06} {"step": 65200, "timestamp": 1778265014.4442687, "grad/layer_0/attn": 0.0027808640152215958, "grad/layer_0/mlp": 0.0028003363404423, "grad/layer_0/attn_mlp_ratio": 0.9930463979472307, "grad/layer_4/attn": 0.00225411681458354, "grad/layer_4/mlp": 0.002714809961616993, "grad/layer_4/attn_mlp_ratio": 0.8303036910217375, "grad/layer_8/attn": 0.005133605096489191, "grad/layer_8/mlp": 0.003872028086334467, "grad/layer_8/attn_mlp_ratio": 1.3258181111922693, "grad/layer_12/attn": 0.004258725792169571, "grad/layer_12/mlp": 0.006625310052186251, "grad/layer_12/attn_mlp_ratio": 0.6427964418789145, "grad/layer_16/attn": 0.003462966298684478, "grad/layer_16/mlp": 0.004652196541428566, "grad/layer_16/attn_mlp_ratio": 0.7443722967009179, "grad/layer_20/attn": 0.003211056115105748, "grad/layer_20/mlp": 0.005593598820269108, "grad/layer_20/attn_mlp_ratio": 0.5740590558736851, "grad/layer_24/attn": 0.008876633830368519, "grad/layer_24/mlp": 0.008177043870091438, "grad/layer_24/attn_mlp_ratio": 1.0855553501773891, "grad/layer_27/attn": 0.0059301904402673244, "grad/layer_27/mlp": 0.0077502732165157795, "grad/layer_27/attn_mlp_ratio": 0.7651588786720759} {"step": 65200, "timestamp": 1778265014.4582963, "train/loss": 2.063570511341095, "train/z_loss": 0.0013937264564447104, "train/perplexity": 7.874034006458565, "train/grad_norm": 0.1240234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028058.6234183512, "perf/iters_per_sec": 0.9670537106601482, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034068727493286, "data/tokens_consumed": 136736407552, "data/tokens_consumed_B": 136.736407552, "train/loss_slope": -2.233589013846773e-06} {"step": 65210, "timestamp": 1778265024.798917, "train/loss": 2.17675724029541, "train/z_loss": 0.0013697446789592504, "train/perplexity": 8.81766627464075, "train/grad_norm": 0.2060546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029384.6113613837, "perf/iters_per_sec": 0.9676859909827155, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0333930730819703, "data/tokens_consumed": 136757379072, "data/tokens_consumed_B": 136.757379072, "train/loss_slope": 1.1652844251900933e-06} {"step": 65220, "timestamp": 1778265035.1431491, "train/loss": 2.163736271858215, "train/z_loss": 0.0013744361465796828, "train/perplexity": 8.703595984882156, "train/grad_norm": 0.10009765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028222.0147769952, "perf/iters_per_sec": 0.967131621731279, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033985424041748, "data/tokens_consumed": 136778350592, "data/tokens_consumed_B": 136.778350592, "train/loss_slope": 4.580964092159606e-06} {"step": 65230, "timestamp": 1778265045.4885204, "train/loss": 2.154196834564209, "train/z_loss": 0.0013772581471130252, "train/perplexity": 8.620963337865243, "train/grad_norm": 0.12451171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028472.2032736419, "perf/iters_per_sec": 0.9672509209030351, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0338578939437866, "data/tokens_consumed": 136799322112, "data/tokens_consumed_B": 136.799322112, "train/loss_slope": 7.052356145992075e-06} {"step": 65240, "timestamp": 1778265055.831228, "train/loss": 2.1291354417800905, "train/z_loss": 0.001367409119848162, "train/perplexity": 8.407594813154763, "train/grad_norm": 0.1787109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029051.3019545882, "perf/iters_per_sec": 0.9675270566723767, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0335628271102906, "data/tokens_consumed": 136820293632, "data/tokens_consumed_B": 136.820293632, "train/loss_slope": 1.0272283837346734e-05} {"step": 65250, "timestamp": 1778265066.1813867, "grad/layer_0/attn": 0.003219967009499669, "grad/layer_0/mlp": 0.003189156064763665, "grad/layer_0/attn_mlp_ratio": 1.009661127628815, "grad/layer_4/attn": 0.0020818421617150307, "grad/layer_4/mlp": 0.0025797300040721893, "grad/layer_4/attn_mlp_ratio": 0.8069999874904642, "grad/layer_8/attn": 0.004699298180639744, "grad/layer_8/mlp": 0.0036568159703165293, "grad/layer_8/attn_mlp_ratio": 1.2850791755115478, "grad/layer_12/attn": 0.005887251812964678, "grad/layer_12/mlp": 0.006596054416149855, "grad/layer_12/attn_mlp_ratio": 0.8925414122260912, "grad/layer_16/attn": 0.00390790356323123, "grad/layer_16/mlp": 0.004947907291352749, "grad/layer_16/attn_mlp_ratio": 0.7898093586110585, "grad/layer_20/attn": 0.0029173188377171755, "grad/layer_20/mlp": 0.0060507128946483135, "grad/layer_20/attn_mlp_ratio": 0.4821446398626844, "grad/layer_24/attn": 0.006564903538674116, "grad/layer_24/mlp": 0.00946772750467062, "grad/layer_24/attn_mlp_ratio": 0.6933980161655177, "grad/layer_27/attn": 0.004137983079999685, "grad/layer_27/mlp": 0.008523941971361637, "grad/layer_27/attn_mlp_ratio": 0.4854541531789966} {"step": 65250, "timestamp": 1778265066.773058, "eos/sharpness": 22.476196289062496, "eos/L0_probe": 1.972636103630066, "eos/L_plus": 2.0900771617889404, "eos/L_minus": 2.0799570083618164, "eos/grad_norm": 0.1175718605518341, "eos/embed_grad_frac": 0.19693708419799805, "eos/time_s": 0.5888454914093018} {"step": 65250, "timestamp": 1778265066.7941113, "train/loss": 2.1513222455978394, "train/z_loss": 0.0013719177106395364, "train/perplexity": 8.596217196307796, "train/grad_norm": 0.11767578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1913748.0570817403, "perf/iters_per_sec": 0.9125461850556089, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0958349466323853, "data/tokens_consumed": 136841265152, "data/tokens_consumed_B": 136.841265152, "train/loss_slope": 1.0042637688527755e-05} {"step": 65250, "timestamp": 1778265068.1551788, "geo/rankme_last": 438.9633483886719, "geo/layer_0/stable_rank_q_proj": 19.31930160522461, "geo/layer_0/stable_rank_k_proj": 16.04063606262207, "geo/layer_0/stable_rank_o_proj": 46.91511535644531, "geo/layer_0/stable_rank_gate_proj": 130.47662353515625, "geo/layer_0/stable_rank_down_proj": 55.055912017822266, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0645822137594223, "geo/layer_0/attn_entropy_mean": 6.162103652954102, "geo/layer_0/attn_entropy_std": 0.42335060238838196, "geo/layer_7/stable_rank_q_proj": 43.35367965698242, "geo/layer_7/stable_rank_k_proj": 40.657562255859375, "geo/layer_7/stable_rank_o_proj": 89.57852172851562, "geo/layer_7/stable_rank_gate_proj": 80.65287017822266, "geo/layer_7/stable_rank_down_proj": 140.179931640625, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.45140159130096436, "geo/layer_7/attn_entropy_mean": 4.63381290435791, "geo/layer_7/attn_entropy_std": 0.7889070510864258, "geo/layer_14/stable_rank_q_proj": 51.18909454345703, "geo/layer_14/stable_rank_k_proj": 40.10302734375, "geo/layer_14/stable_rank_o_proj": 43.63725662231445, "geo/layer_14/stable_rank_gate_proj": 71.70735931396484, "geo/layer_14/stable_rank_down_proj": 129.4840545654297, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3781258165836334, "geo/layer_14/attn_entropy_mean": 5.546691417694092, "geo/layer_14/attn_entropy_std": 0.4152611494064331, "geo/layer_21/stable_rank_q_proj": 40.214717864990234, "geo/layer_21/stable_rank_k_proj": 30.250080108642578, "geo/layer_21/stable_rank_o_proj": 69.9774169921875, "geo/layer_21/stable_rank_gate_proj": 65.4905014038086, "geo/layer_21/stable_rank_down_proj": 51.07478713989258, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14322663843631744, "geo/layer_21/attn_entropy_mean": 5.695929050445557, "geo/layer_21/attn_entropy_std": 0.30292436480522156, "geo/layer_27/stable_rank_q_proj": 43.522430419921875, "geo/layer_27/stable_rank_k_proj": 31.889936447143555, "geo/layer_27/stable_rank_o_proj": 114.72947692871094, "geo/layer_27/stable_rank_gate_proj": 79.95995330810547, "geo/layer_27/stable_rank_down_proj": 128.3166046142578, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09445790201425552, "geo/layer_27/attn_entropy_mean": 4.202152252197266, "geo/layer_27/attn_entropy_std": 0.7554352283477783, "attnres/final_alpha/block_0": 0.23654112219810486, "attnres/block_norm/0": 1.7659947872161865, "attnres/final_alpha/block_1": 0.004395224153995514, "attnres/block_norm/1": 47004.453125, "attnres/final_alpha/block_2": 0.010198783129453659, "attnres/block_norm/2": 28818.80859375, "attnres/final_alpha/block_3": 0.012183748185634613, "attnres/block_norm/3": 58963.76953125, "attnres/final_alpha/block_4": 0.014509813860058784, "attnres/block_norm/4": 15294.498046875, "attnres/final_alpha/block_5": 0.6121087074279785, "attnres/block_norm/5": 6695.65576171875, "attnres/final_alpha/block_6": 0.11006257683038712, "attnres/block_norm/6": 39065.734375, "geo/tier1_time_s": 1.3567137718200684, "geo/step": 65250.0, "geo/rankme_slope": -5.761613239045618e-05} {"step": 65260, "timestamp": 1778265078.502323, "train/loss": 2.148871326446533, "train/z_loss": 0.0013669065083377063, "train/perplexity": 8.575174360630756, "train/grad_norm": 0.1396484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1791833.376375041, "perf/iters_per_sec": 0.854412735164185, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1703945398330688, "data/tokens_consumed": 136862236672, "data/tokens_consumed_B": 136.862236672, "train/loss_slope": 1.1544806428618e-05} {"step": 65270, "timestamp": 1778265088.8476026, "train/loss": 2.1422812700271607, "train/z_loss": 0.0013842409709468484, "train/perplexity": 8.518849274380651, "train/grad_norm": 0.09033203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028479.1733264544, "perf/iters_per_sec": 0.9672542444832107, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033854341506958, "data/tokens_consumed": 136883208192, "data/tokens_consumed_B": 136.883208192, "train/loss_slope": 1.1503892369789634e-05} {"step": 65280, "timestamp": 1778265099.1953382, "train/loss": 2.1127429366111756, "train/z_loss": 0.001377863110974431, "train/perplexity": 8.270896744665057, "train/grad_norm": 0.09814453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027669.6110536978, "perf/iters_per_sec": 0.966868215109681, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342671155929566, "data/tokens_consumed": 136904179712, "data/tokens_consumed_B": 136.904179712, "train/loss_slope": 8.893133247002351e-06} {"step": 65290, "timestamp": 1778265109.5412886, "train/loss": 2.1785635471343996, "train/z_loss": 0.0013712047832086683, "train/perplexity": 8.833608079097267, "train/grad_norm": 0.1591796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028563.5194402293, "perf/iters_per_sec": 0.9672944638444086, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033811354637146, "data/tokens_consumed": 136925151232, "data/tokens_consumed_B": 136.925151232, "train/loss_slope": 9.471969142390767e-06} {"step": 65300, "timestamp": 1778265119.8783503, "grad/layer_0/attn": 0.002617573831230402, "grad/layer_0/mlp": 0.002911793766543269, "grad/layer_0/attn_mlp_ratio": 0.898955747282293, "grad/layer_4/attn": 0.002859818749129772, "grad/layer_4/mlp": 0.002462616888806224, "grad/layer_4/attn_mlp_ratio": 1.1612925445284512, "grad/layer_8/attn": 0.005173853132873774, "grad/layer_8/mlp": 0.003649370977655053, "grad/layer_8/attn_mlp_ratio": 1.417738295936267, "grad/layer_12/attn": 0.003967327065765858, "grad/layer_12/mlp": 0.007048759143799543, "grad/layer_12/attn_mlp_ratio": 0.5628404841966657, "grad/layer_16/attn": 0.003690239740535617, "grad/layer_16/mlp": 0.004673388320952654, "grad/layer_16/attn_mlp_ratio": 0.7896282971025496, "grad/layer_20/attn": 0.003359326859936118, "grad/layer_20/mlp": 0.0062124961987137794, "grad/layer_20/attn_mlp_ratio": 0.5407370400577342, "grad/layer_24/attn": 0.005408962722867727, "grad/layer_24/mlp": 0.008175437338650227, "grad/layer_24/attn_mlp_ratio": 0.661611413879372, "grad/layer_27/attn": 0.004650449380278587, "grad/layer_27/mlp": 0.007073627784848213, "grad/layer_27/attn_mlp_ratio": 0.6574348348518445} {"step": 65300, "timestamp": 1778265119.892381, "train/loss": 2.178509163856506, "train/z_loss": 0.0013882056926377117, "train/perplexity": 8.83312769159694, "train/grad_norm": 0.1416015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027086.4423177058, "perf/iters_per_sec": 0.9665901385868577, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345646619796753, "data/tokens_consumed": 136946122752, "data/tokens_consumed_B": 136.946122752, "train/loss_slope": 1.0916000259484088e-05} {"step": 65310, "timestamp": 1778265130.2355967, "train/loss": 2.1425782203674317, "train/z_loss": 0.0013794678030535578, "train/perplexity": 8.52137932520253, "train/grad_norm": 0.259765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028573.5310364433, "perf/iters_per_sec": 0.9672992377454964, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0338062524795533, "data/tokens_consumed": 136967094272, "data/tokens_consumed_B": 136.967094272, "train/loss_slope": 1.1010173952976497e-05} {"step": 65320, "timestamp": 1778265140.575974, "train/loss": 2.1232476472854613, "train/z_loss": 0.0013805208611302078, "train/perplexity": 8.358238066475327, "train/grad_norm": 0.2001953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029588.3491526952, "perf/iters_per_sec": 0.967783140732143, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033289337158203, "data/tokens_consumed": 136988065792, "data/tokens_consumed_B": 136.988065792, "train/loss_slope": 1.1681028954660129e-05} {"step": 65325, "timestamp": 1778265146.3190014, "eos/sharpness": 53.792285919189446, "eos/L0_probe": 1.9733232259750366, "eos/L_plus": 2.275181293487549, "eos/L_minus": 2.209388017654419, "eos/grad_norm": 0.12781085073947906, "eos/embed_grad_frac": 0.15115372836589813, "eos/time_s": 0.5824563503265381} {"step": 65325, "timestamp": 1778265147.6945033, "geo/rankme_last": 439.3861389160156, "geo/layer_0/stable_rank_q_proj": 19.312740325927734, "geo/layer_0/stable_rank_k_proj": 16.023454666137695, "geo/layer_0/stable_rank_o_proj": 46.995357513427734, "geo/layer_0/stable_rank_gate_proj": 130.55006408691406, "geo/layer_0/stable_rank_down_proj": 55.06459426879883, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06175541877746582, "geo/layer_0/attn_entropy_mean": 6.1636962890625, "geo/layer_0/attn_entropy_std": 0.41797319054603577, "geo/layer_7/stable_rank_q_proj": 43.401920318603516, "geo/layer_7/stable_rank_k_proj": 40.64363479614258, "geo/layer_7/stable_rank_o_proj": 89.75933074951172, "geo/layer_7/stable_rank_gate_proj": 80.4659194946289, "geo/layer_7/stable_rank_down_proj": 140.26869201660156, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4519340991973877, "geo/layer_7/attn_entropy_mean": 4.658914089202881, "geo/layer_7/attn_entropy_std": 0.8003082871437073, "geo/layer_14/stable_rank_q_proj": 51.21274185180664, "geo/layer_14/stable_rank_k_proj": 40.10334014892578, "geo/layer_14/stable_rank_o_proj": 43.659236907958984, "geo/layer_14/stable_rank_gate_proj": 71.6792984008789, "geo/layer_14/stable_rank_down_proj": 129.80592346191406, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39714691042900085, "geo/layer_14/attn_entropy_mean": 5.540370941162109, "geo/layer_14/attn_entropy_std": 0.39393505454063416, "geo/layer_21/stable_rank_q_proj": 40.218502044677734, "geo/layer_21/stable_rank_k_proj": 30.26744842529297, "geo/layer_21/stable_rank_o_proj": 69.94774627685547, "geo/layer_21/stable_rank_gate_proj": 65.62968444824219, "geo/layer_21/stable_rank_down_proj": 51.10225296020508, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14080049097537994, "geo/layer_21/attn_entropy_mean": 5.686323165893555, "geo/layer_21/attn_entropy_std": 0.30378103256225586, "geo/layer_27/stable_rank_q_proj": 43.5162239074707, "geo/layer_27/stable_rank_k_proj": 31.9492130279541, "geo/layer_27/stable_rank_o_proj": 114.56513214111328, "geo/layer_27/stable_rank_gate_proj": 79.88552856445312, "geo/layer_27/stable_rank_down_proj": 128.27146911621094, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09694461524486542, "geo/layer_27/attn_entropy_mean": 4.185060501098633, "geo/layer_27/attn_entropy_std": 0.7438466548919678, "attnres/final_alpha/block_0": 0.23577550053596497, "attnres/block_norm/0": 1.7660531997680664, "attnres/final_alpha/block_1": 0.0043539926409721375, "attnres/block_norm/1": 47144.12109375, "attnres/final_alpha/block_2": 0.010238314978778362, "attnres/block_norm/2": 28692.66796875, "attnres/final_alpha/block_3": 0.01218376960605383, "attnres/block_norm/3": 58891.921875, "attnres/final_alpha/block_4": 0.014495876617729664, "attnres/block_norm/4": 15258.755859375, "attnres/final_alpha/block_5": 0.6156915426254272, "attnres/block_norm/5": 6680.578125, "attnres/final_alpha/block_6": 0.10726097226142883, "attnres/block_norm/6": 39144.71484375, "geo/tier1_time_s": 1.3570337295532227, "geo/step": 65325.0, "geo/rankme_slope": -3.844426442451982e-05} {"step": 65330, "timestamp": 1778265152.8655822, "train/loss": 2.1447928190231322, "train/z_loss": 0.0013757435022853314, "train/perplexity": 8.540271672162552, "train/grad_norm": 0.1611328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1707082.926208841, "perf/iters_per_sec": 0.8140005713504986, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2285003662109375, "data/tokens_consumed": 137009037312, "data/tokens_consumed_B": 137.009037312, "train/loss_slope": 9.103476582723708e-06} {"step": 65340, "timestamp": 1778265163.205329, "train/loss": 2.152265167236328, "train/z_loss": 0.0013818795676343143, "train/perplexity": 8.60432657816632, "train/grad_norm": 0.1865234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029118.4699704254, "perf/iters_per_sec": 0.9675590848781707, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0335286140441895, "data/tokens_consumed": 137030008832, "data/tokens_consumed_B": 137.030008832, "train/loss_slope": 1.0916306863058564e-05} {"step": 65350, "timestamp": 1778265173.5354145, "grad/layer_0/attn": 0.002581251785159111, "grad/layer_0/mlp": 0.0029724871274083853, "grad/layer_0/attn_mlp_ratio": 0.8683811191375985, "grad/layer_4/attn": 0.0024271863512694836, "grad/layer_4/mlp": 0.0027008377946913242, "grad/layer_4/attn_mlp_ratio": 0.898679019588802, "grad/layer_8/attn": 0.004124247469007969, "grad/layer_8/mlp": 0.003705937648192048, "grad/layer_8/attn_mlp_ratio": 1.112875538996842, "grad/layer_12/attn": 0.006597417406737804, "grad/layer_12/mlp": 0.0072702690958976746, "grad/layer_12/attn_mlp_ratio": 0.907451598966991, "grad/layer_16/attn": 0.0035983924753963947, "grad/layer_16/mlp": 0.004534592851996422, "grad/layer_16/attn_mlp_ratio": 0.7935425546436642, "grad/layer_20/attn": 0.002830923767760396, "grad/layer_20/mlp": 0.005736807361245155, "grad/layer_20/attn_mlp_ratio": 0.4934667560110085, "grad/layer_24/attn": 0.008413809351623058, "grad/layer_24/mlp": 0.008541994728147984, "grad/layer_24/attn_mlp_ratio": 0.9849934963549121, "grad/layer_27/attn": 0.006708561442792416, "grad/layer_27/mlp": 0.007266754750162363, "grad/layer_27/attn_mlp_ratio": 0.9231853256536547} {"step": 65350, "timestamp": 1778265173.5495486, "train/loss": 2.161046934127808, "train/z_loss": 0.0013734333799220622, "train/perplexity": 8.680220522155386, "train/grad_norm": 0.11181640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028680.5770829464, "perf/iters_per_sec": 0.9673502812781078, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0337517023086549, "data/tokens_consumed": 137050980352, "data/tokens_consumed_B": 137.050980352, "train/loss_slope": 1.2355436829522027e-05} {"step": 65360, "timestamp": 1778265183.8858004, "train/loss": 2.1683213233947756, "train/z_loss": 0.0013721828814595937, "train/perplexity": 8.743594047543333, "train/grad_norm": 0.150390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029820.372536867, "perf/iters_per_sec": 0.9678937781032881, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0331712245941163, "data/tokens_consumed": 137071951872, "data/tokens_consumed_B": 137.071951872, "train/loss_slope": 1.174509679809764e-05} {"step": 65370, "timestamp": 1778265194.2449353, "train/loss": 2.08936470746994, "train/z_loss": 0.001383984531275928, "train/perplexity": 8.079780509271698, "train/grad_norm": 0.08984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025319.7805217598, "perf/iters_per_sec": 0.9657477285965728, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354671001434326, "data/tokens_consumed": 137092923392, "data/tokens_consumed_B": 137.092923392, "train/loss_slope": 1.1155436041116948e-05} {"step": 65380, "timestamp": 1778265204.5955248, "train/loss": 2.1775341987609864, "train/z_loss": 0.0013657933450303972, "train/perplexity": 8.824519897244736, "train/grad_norm": 0.169921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027613.9899478343, "perf/iters_per_sec": 0.966841692899625, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342954874038697, "data/tokens_consumed": 137113894912, "data/tokens_consumed_B": 137.113894912, "train/loss_slope": 1.1711592177818273e-05} {"step": 65390, "timestamp": 1778265214.9327312, "train/loss": 2.154487419128418, "train/z_loss": 0.001373613183386624, "train/perplexity": 8.623468820749531, "train/grad_norm": 0.18359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029637.475232443, "perf/iters_per_sec": 0.9678065658724037, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0332643270492554, "data/tokens_consumed": 137134866432, "data/tokens_consumed_B": 137.134866432, "train/loss_slope": 1.1053137832170067e-05} {"step": 65400, "timestamp": 1778265225.2605348, "grad/layer_0/attn": 0.0031208896543830633, "grad/layer_0/mlp": 0.0030424101278185844, "grad/layer_0/attn_mlp_ratio": 1.0257951494663324, "grad/layer_4/attn": 0.004284810274839401, "grad/layer_4/mlp": 0.002441585995256901, "grad/layer_4/attn_mlp_ratio": 1.7549290124002606, "grad/layer_8/attn": 0.003974601626396179, "grad/layer_8/mlp": 0.003567907027900219, "grad/layer_8/attn_mlp_ratio": 1.113986851091415, "grad/layer_12/attn": 0.004932657349854708, "grad/layer_12/mlp": 0.007109116297215223, "grad/layer_12/attn_mlp_ratio": 0.6938495692357667, "grad/layer_16/attn": 0.004543141927570105, "grad/layer_16/mlp": 0.004836627282202244, "grad/layer_16/attn_mlp_ratio": 0.9393202263808655, "grad/layer_20/attn": 0.004960428923368454, "grad/layer_20/mlp": 0.00653096055611968, "grad/layer_20/attn_mlp_ratio": 0.7595251578679169, "grad/layer_24/attn": 0.012802069075405598, "grad/layer_24/mlp": 0.012230735272169113, "grad/layer_24/attn_mlp_ratio": 1.0467129478196828, "grad/layer_27/attn": 0.014433493837714195, "grad/layer_27/mlp": 0.011761236935853958, "grad/layer_27/attn_mlp_ratio": 1.2272088211226337} {"step": 65400, "timestamp": 1778265225.8448374, "eos/sharpness": 82.36472606658934, "eos/L0_probe": 1.9723784923553467, "eos/L_plus": 2.321831226348877, "eos/L_minus": 2.44657301902771, "eos/grad_norm": 0.2365817129611969, "eos/embed_grad_frac": 0.04031115397810936, "eos/time_s": 0.5815463066101074} {"step": 65400, "timestamp": 1778265225.864446, "train/loss": 2.150664043426514, "train/z_loss": 0.0013655935297720134, "train/perplexity": 8.590561009145539, "train/grad_norm": 0.236328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1919249.714441093, "perf/iters_per_sec": 0.9151695797162499, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0926936626434327, "data/tokens_consumed": 137155837952, "data/tokens_consumed_B": 137.155837952, "train/loss_slope": 1.0577884222557724e-05} {"step": 65400, "timestamp": 1778265227.2286038, "geo/rankme_last": 436.9168701171875, "geo/layer_0/stable_rank_q_proj": 19.334232330322266, "geo/layer_0/stable_rank_k_proj": 16.02345848083496, "geo/layer_0/stable_rank_o_proj": 47.05958557128906, "geo/layer_0/stable_rank_gate_proj": 130.6367645263672, "geo/layer_0/stable_rank_down_proj": 55.14979553222656, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06651776283979416, "geo/layer_0/attn_entropy_mean": 6.1627984046936035, "geo/layer_0/attn_entropy_std": 0.4174019396305084, "geo/layer_7/stable_rank_q_proj": 43.39379119873047, "geo/layer_7/stable_rank_k_proj": 40.75646209716797, "geo/layer_7/stable_rank_o_proj": 89.81901550292969, "geo/layer_7/stable_rank_gate_proj": 80.48278045654297, "geo/layer_7/stable_rank_down_proj": 140.2930450439453, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4490271508693695, "geo/layer_7/attn_entropy_mean": 4.650206565856934, "geo/layer_7/attn_entropy_std": 0.8124058246612549, "geo/layer_14/stable_rank_q_proj": 51.25390625, "geo/layer_14/stable_rank_k_proj": 40.105857849121094, "geo/layer_14/stable_rank_o_proj": 43.638694763183594, "geo/layer_14/stable_rank_gate_proj": 71.57955169677734, "geo/layer_14/stable_rank_down_proj": 129.86276245117188, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38627418875694275, "geo/layer_14/attn_entropy_mean": 5.570306777954102, "geo/layer_14/attn_entropy_std": 0.393065869808197, "geo/layer_21/stable_rank_q_proj": 40.266357421875, "geo/layer_21/stable_rank_k_proj": 30.257339477539062, "geo/layer_21/stable_rank_o_proj": 69.98471069335938, "geo/layer_21/stable_rank_gate_proj": 65.62442016601562, "geo/layer_21/stable_rank_down_proj": 51.000282287597656, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1366005837917328, "geo/layer_21/attn_entropy_mean": 5.707112789154053, "geo/layer_21/attn_entropy_std": 0.31431302428245544, "geo/layer_27/stable_rank_q_proj": 43.48883819580078, "geo/layer_27/stable_rank_k_proj": 31.91507339477539, "geo/layer_27/stable_rank_o_proj": 114.79622650146484, "geo/layer_27/stable_rank_gate_proj": 79.78523254394531, "geo/layer_27/stable_rank_down_proj": 128.3477783203125, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10214658826589584, "geo/layer_27/attn_entropy_mean": 4.188221454620361, "geo/layer_27/attn_entropy_std": 0.7505041360855103, "attnres/final_alpha/block_0": 0.2376326322555542, "attnres/block_norm/0": 1.766038179397583, "attnres/final_alpha/block_1": 0.004392352886497974, "attnres/block_norm/1": 47112.0078125, "attnres/final_alpha/block_2": 0.010287105105817318, "attnres/block_norm/2": 28834.947265625, "attnres/final_alpha/block_3": 0.01200224831700325, "attnres/block_norm/3": 59378.75, "attnres/final_alpha/block_4": 0.01473873108625412, "attnres/block_norm/4": 15280.212890625, "attnres/final_alpha/block_5": 0.6107558012008667, "attnres/block_norm/5": 6713.751953125, "attnres/final_alpha/block_6": 0.11019115149974823, "attnres/block_norm/6": 39534.171875, "geo/tier1_time_s": 1.3607640266418457, "geo/step": 65400.0, "geo/rankme_slope": -9.821444202681072e-05} {"step": 65410, "timestamp": 1778265237.5666962, "train/loss": 2.1351861000061034, "train/z_loss": 0.0013779078726656735, "train/perplexity": 8.458620509724527, "train/grad_norm": 0.1181640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1792574.6908835864, "perf/iters_per_sec": 0.8547662214677746, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1699105262756349, "data/tokens_consumed": 137176809472, "data/tokens_consumed_B": 137.176809472, "train/loss_slope": 1.0477885018230535e-05} {"step": 65420, "timestamp": 1778265247.9066114, "train/loss": 2.172040045261383, "train/z_loss": 0.0013629211927764118, "train/perplexity": 8.776169574041745, "train/grad_norm": 0.126953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029179.9782421424, "perf/iters_per_sec": 0.9675884143076622, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0334972858428955, "data/tokens_consumed": 137197780992, "data/tokens_consumed_B": 137.197780992, "train/loss_slope": 1.2638254205707748e-05} {"step": 65430, "timestamp": 1778265258.2430704, "train/loss": 2.1316524505615235, "train/z_loss": 0.0013717467430979014, "train/perplexity": 8.428783457946281, "train/grad_norm": 0.10302734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2030037.8313738613, "perf/iters_per_sec": 0.9679974705571467, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0330605506896973, "data/tokens_consumed": 137218752512, "data/tokens_consumed_B": 137.218752512, "train/loss_slope": 1.1518344655968293e-05} {"step": 65440, "timestamp": 1778265268.5849862, "train/loss": 2.1436885833740233, "train/z_loss": 0.0013623382896184922, "train/perplexity": 8.530846404545043, "train/grad_norm": 0.2177734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029280.1595132067, "perf/iters_per_sec": 0.9676361844602617, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0334462642669677, "data/tokens_consumed": 137239724032, "data/tokens_consumed_B": 137.239724032, "train/loss_slope": 1.3557383036277274e-05} {"step": 65450, "timestamp": 1778265278.9220853, "grad/layer_0/attn": 0.0028732505161315203, "grad/layer_0/mlp": 0.003165466943755746, "grad/layer_0/attn_mlp_ratio": 0.9076861254326887, "grad/layer_4/attn": 0.004623878747224808, "grad/layer_4/mlp": 0.0027629253454506397, "grad/layer_4/attn_mlp_ratio": 1.673544523193112, "grad/layer_8/attn": 0.0040677934885025024, "grad/layer_8/mlp": 0.0038650697097182274, "grad/layer_8/attn_mlp_ratio": 1.05245019851247, "grad/layer_12/attn": 0.003830091329291463, "grad/layer_12/mlp": 0.006744413170963526, "grad/layer_12/attn_mlp_ratio": 0.5678909603272704, "grad/layer_16/attn": 0.0042969281785190105, "grad/layer_16/mlp": 0.0047534918412566185, "grad/layer_16/attn_mlp_ratio": 0.9039519224225482, "grad/layer_20/attn": 0.0033068556804209948, "grad/layer_20/mlp": 0.006740029435604811, "grad/layer_20/attn_mlp_ratio": 0.4906292565859294, "grad/layer_24/attn": 0.013331226073205471, "grad/layer_24/mlp": 0.011552332900464535, "grad/layer_24/attn_mlp_ratio": 1.1539856124879195, "grad/layer_27/attn": 0.006378376390784979, "grad/layer_27/mlp": 0.011989894323050976, "grad/layer_27/attn_mlp_ratio": 0.5319793624306095} {"step": 65450, "timestamp": 1778265278.9366124, "train/loss": 2.12363977432251, "train/z_loss": 0.0013937539304606616, "train/perplexity": 8.361516200283724, "train/grad_norm": 0.20703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026830.4774031646, "perf/iters_per_sec": 0.9664680850044082, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034695315361023, "data/tokens_consumed": 137260695552, "data/tokens_consumed_B": 137.260695552, "train/loss_slope": 1.1888439706092226e-05} {"step": 65460, "timestamp": 1778265289.2795463, "train/loss": 2.136879825592041, "train/z_loss": 0.0013731863582506776, "train/perplexity": 8.472959231205506, "train/grad_norm": 0.08984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028889.2744945367, "perf/iters_per_sec": 0.9674497959587749, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0336453676223756, "data/tokens_consumed": 137281667072, "data/tokens_consumed_B": 137.281667072, "train/loss_slope": 1.0884945706637346e-05} {"step": 65470, "timestamp": 1778265299.6206133, "train/loss": 2.1483972549438475, "train/z_loss": 0.0013563638785853982, "train/perplexity": 8.571110078292158, "train/grad_norm": 0.193359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029409.239405718, "perf/iters_per_sec": 0.9676977345493879, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0333805322647094, "data/tokens_consumed": 137302638592, "data/tokens_consumed_B": 137.302638592, "train/loss_slope": 1.3140178170248525e-05} {"step": 65475, "timestamp": 1778265305.3603933, "eos/sharpness": 22.30138778686523, "eos/L0_probe": 1.9704653024673462, "eos/L_plus": 2.097562074661255, "eos/L_minus": 2.06638240814209, "eos/grad_norm": 0.10569508373737335, "eos/embed_grad_frac": 0.21251460909843445, "eos/time_s": 0.5808918476104736} {"step": 65475, "timestamp": 1778265306.7342305, "geo/rankme_last": 439.142333984375, "geo/layer_0/stable_rank_q_proj": 19.31612777709961, "geo/layer_0/stable_rank_k_proj": 16.018028259277344, "geo/layer_0/stable_rank_o_proj": 47.054744720458984, "geo/layer_0/stable_rank_gate_proj": 130.53890991210938, "geo/layer_0/stable_rank_down_proj": 55.19363784790039, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06525598466396332, "geo/layer_0/attn_entropy_mean": 6.15970516204834, "geo/layer_0/attn_entropy_std": 0.41957923769950867, "geo/layer_7/stable_rank_q_proj": 43.346160888671875, "geo/layer_7/stable_rank_k_proj": 40.747493743896484, "geo/layer_7/stable_rank_o_proj": 89.8038101196289, "geo/layer_7/stable_rank_gate_proj": 80.58241271972656, "geo/layer_7/stable_rank_down_proj": 140.02630615234375, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4549674987792969, "geo/layer_7/attn_entropy_mean": 4.635128021240234, "geo/layer_7/attn_entropy_std": 0.7944932579994202, "geo/layer_14/stable_rank_q_proj": 51.221920013427734, "geo/layer_14/stable_rank_k_proj": 40.01603698730469, "geo/layer_14/stable_rank_o_proj": 43.67892074584961, "geo/layer_14/stable_rank_gate_proj": 71.57008361816406, "geo/layer_14/stable_rank_down_proj": 129.94581604003906, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4087149500846863, "geo/layer_14/attn_entropy_mean": 5.537091255187988, "geo/layer_14/attn_entropy_std": 0.40860456228256226, "geo/layer_21/stable_rank_q_proj": 40.334354400634766, "geo/layer_21/stable_rank_k_proj": 30.1336727142334, "geo/layer_21/stable_rank_o_proj": 70.04442596435547, "geo/layer_21/stable_rank_gate_proj": 65.76378631591797, "geo/layer_21/stable_rank_down_proj": 50.97481155395508, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14185328781604767, "geo/layer_21/attn_entropy_mean": 5.689075469970703, "geo/layer_21/attn_entropy_std": 0.2940201163291931, "geo/layer_27/stable_rank_q_proj": 43.554805755615234, "geo/layer_27/stable_rank_k_proj": 31.89415168762207, "geo/layer_27/stable_rank_o_proj": 114.77794647216797, "geo/layer_27/stable_rank_gate_proj": 79.7467041015625, "geo/layer_27/stable_rank_down_proj": 128.4217529296875, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09675052762031555, "geo/layer_27/attn_entropy_mean": 4.188165664672852, "geo/layer_27/attn_entropy_std": 0.7520642280578613, "attnres/final_alpha/block_0": 0.2353140115737915, "attnres/block_norm/0": 1.766112208366394, "attnres/final_alpha/block_1": 0.004354782402515411, "attnres/block_norm/1": 47122.03125, "attnres/final_alpha/block_2": 0.01018567569553852, "attnres/block_norm/2": 28714.6953125, "attnres/final_alpha/block_3": 0.011980469338595867, "attnres/block_norm/3": 58667.5859375, "attnres/final_alpha/block_4": 0.014552883803844452, "attnres/block_norm/4": 15325.265625, "attnres/final_alpha/block_5": 0.6159517168998718, "attnres/block_norm/5": 6648.255859375, "attnres/final_alpha/block_6": 0.10766048729419708, "attnres/block_norm/6": 39260.34765625, "geo/tier1_time_s": 1.356414556503296, "geo/step": 65475.0, "geo/rankme_slope": -8.622953087484994e-05} {"step": 65480, "timestamp": 1778265311.9051309, "train/loss": 2.1547141790390016, "train/z_loss": 0.0013756350264884532, "train/perplexity": 8.625424499494635, "train/grad_norm": 0.10693359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1707836.2004766474, "perf/iters_per_sec": 0.814359760511707, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2279585123062133, "data/tokens_consumed": 137323610112, "data/tokens_consumed_B": 137.323610112, "train/loss_slope": 1.320873991419454e-05} {"step": 65490, "timestamp": 1778265322.2846215, "train/loss": 2.12111040353775, "train/z_loss": 0.0013831537100486458, "train/perplexity": 8.340393550258758, "train/grad_norm": 0.12890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021562.931135047, "perf/iters_per_sec": 0.9639563232112155, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373914003372193, "data/tokens_consumed": 137344581632, "data/tokens_consumed_B": 137.344581632, "train/loss_slope": 9.481701620555741e-06} {"step": 65500, "timestamp": 1778265332.620436, "grad/layer_0/attn": 0.0025010420940816402, "grad/layer_0/mlp": 0.002896300982683897, "grad/layer_0/attn_mlp_ratio": 0.8635297307433298, "grad/layer_4/attn": 0.0021062404848635197, "grad/layer_4/mlp": 0.0025234417989850044, "grad/layer_4/attn_mlp_ratio": 0.8346696968575761, "grad/layer_8/attn": 0.003635952016338706, "grad/layer_8/mlp": 0.003582390258088708, "grad/layer_8/attn_mlp_ratio": 1.0149513740536569, "grad/layer_12/attn": 0.004210665822029114, "grad/layer_12/mlp": 0.006799187045544386, "grad/layer_12/attn_mlp_ratio": 0.6192895903429323, "grad/layer_16/attn": 0.0033291971776634455, "grad/layer_16/mlp": 0.004684248939156532, "grad/layer_16/attn_mlp_ratio": 0.7107216439250029, "grad/layer_20/attn": 0.003149260301142931, "grad/layer_20/mlp": 0.005414826795458794, "grad/layer_20/attn_mlp_ratio": 0.5815994420401683, "grad/layer_24/attn": 0.007573534268885851, "grad/layer_24/mlp": 0.008818901143968105, "grad/layer_24/attn_mlp_ratio": 0.8587843382491609, "grad/layer_27/attn": 0.005830487702041864, "grad/layer_27/mlp": 0.008542313240468502, "grad/layer_27/attn_mlp_ratio": 0.6825420081958844} {"step": 65500, "timestamp": 1778265332.6345987, "train/loss": 2.1126566529273987, "train/z_loss": 0.0013863332220353185, "train/perplexity": 8.270183132012797, "train/grad_norm": 0.1298828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027459.5755799364, "perf/iters_per_sec": 0.9667680623912508, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343742609024047, "data/tokens_consumed": 137365553152, "data/tokens_consumed_B": 137.365553152, "train/loss_slope": 1.0834783632190894e-05} {"step": 65500, "timestamp": 1778265339.9148636, "geo/ww_alpha_mean": 7.550073380200876, "geo/ww_alpha_std": 4.302836589670328, "geo/ww_alpha_min": 1.3434678718855435, "geo/ww_alpha_max": 26.62168068844183, "geo/ww_alpha_healthy_frac": 0.16751269035532995, "geo/ww_alpha_by_type/q_proj": 3.798835166666959, "geo/ww_alpha_by_type/k_proj": 4.562692613972979, "geo/ww_alpha_by_type/v_proj": 7.76340570485361, "geo/ww_alpha_by_type/o_proj": 9.710713861585754, "geo/ww_alpha_by_type/gate_proj": 7.742625958536173, "geo/ww_alpha_by_type/up_proj": 10.992262405530363, "geo/ww_alpha_by_type/down_proj": 8.36792564209546, "geo/twonn_id/layer_0": 0.7118646502494812, "geo/twonn_id/layer_7": 3.010868549346924, "geo/twonn_id/layer_14": 4.805624008178711, "geo/twonn_id/layer_21": 8.721446990966797, "geo/twonn_id/layer_27": 5.1394805908203125, "geo/tier2_time_s": 7.27414608001709} {"step": 65500, "timestamp": 1778265340.7233388, "eoc/jacobian_sigma/layer_0/attn": 1237.43359375, "eoc/jacobian_sigma/layer_0/mlp": 8029.9248046875, "eoc/jacobian_sigma/layer_0": 8029.9248046875, "eoc/jacobian_sigma/layer_7/attn": 1.1428614854812622, "eoc/jacobian_sigma/layer_7/mlp": 1.7699439525604248, "eoc/jacobian_sigma/layer_7": 1.7699439525604248, "eoc/jacobian_sigma/layer_14/attn": 1.4756487607955933, "eoc/jacobian_sigma/layer_14/mlp": 5.37923526763916, "eoc/jacobian_sigma/layer_14": 5.37923526763916, "eoc/jacobian_sigma/layer_21/attn": 1.088579535484314, "eoc/jacobian_sigma/layer_21/mlp": 4.541765213012695, "eoc/jacobian_sigma/layer_21": 4.541765213012695, "eoc/jacobian_sigma/layer_27/attn": 2.837824583053589, "eoc/jacobian_sigma/layer_27/mlp": 25.3911075592041, "eoc/jacobian_sigma/layer_27": 25.3911075592041, "eoc/layer0_sigma": 8029.9248046875, "eoc/sigma_max": 25.3911075592041, "eoc/sigma_min": 1.7699439525604248, "eoc/sigma_mean": 9.270512998104095, "eoc/time_s": 0.7993412017822266} {"step": 65510, "timestamp": 1778265351.0829563, "train/loss": 2.1329802513122558, "train/z_loss": 0.001385567255783826, "train/perplexity": 8.439982636442913, "train/grad_norm": 0.18359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1137075.3820661504, "perf/iters_per_sec": 0.5421997938471558, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.844338583946228, "data/tokens_consumed": 137386524672, "data/tokens_consumed_B": 137.386524672, "train/loss_slope": 7.691539310314558e-06} {"step": 65520, "timestamp": 1778265361.4262397, "train/loss": 2.126845133304596, "train/z_loss": 0.0013847045134752988, "train/perplexity": 8.3883608617443, "train/grad_norm": 0.177734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028527.0294120163, "perf/iters_per_sec": 0.9672770640430528, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0338299512863158, "data/tokens_consumed": 137407496192, "data/tokens_consumed_B": 137.407496192, "train/loss_slope": 8.519113625344172e-06} {"step": 65530, "timestamp": 1778265371.7741704, "train/loss": 2.1329487919807435, "train/z_loss": 0.0013843685621395708, "train/perplexity": 8.439717124407633, "train/grad_norm": 0.1689453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027894.0895807438, "perf/iters_per_sec": 0.9669752548125953, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034152626991272, "data/tokens_consumed": 137428467712, "data/tokens_consumed_B": 137.428467712, "train/loss_slope": 6.002481420799119e-06} {"step": 65540, "timestamp": 1778265382.1138287, "train/loss": 2.127446436882019, "train/z_loss": 0.0013821186847053468, "train/perplexity": 8.393406329916187, "train/grad_norm": 0.1689453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029691.9897003681, "perf/iters_per_sec": 0.96783256039637, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033236575126648, "data/tokens_consumed": 137449439232, "data/tokens_consumed_B": 137.449439232, "train/loss_slope": 5.786563360830569e-06} {"step": 65550, "timestamp": 1778265392.4431465, "grad/layer_0/attn": 0.0027361686807125807, "grad/layer_0/mlp": 0.0031087470706552267, "grad/layer_0/attn_mlp_ratio": 0.880151562835482, "grad/layer_4/attn": 0.004299596883356571, "grad/layer_4/mlp": 0.002476767171174288, "grad/layer_4/attn_mlp_ratio": 1.7359712934667622, "grad/layer_8/attn": 0.005559518467634916, "grad/layer_8/mlp": 0.003768655937165022, "grad/layer_8/attn_mlp_ratio": 1.475199225614942, "grad/layer_12/attn": 0.005909358616918325, "grad/layer_12/mlp": 0.007309862412512302, "grad/layer_12/attn_mlp_ratio": 0.8084089963119372, "grad/layer_16/attn": 0.004184100776910782, "grad/layer_16/mlp": 0.004830941092222929, "grad/layer_16/attn_mlp_ratio": 0.8661046803149953, "grad/layer_20/attn": 0.0030123647302389145, "grad/layer_20/mlp": 0.006335927173495293, "grad/layer_20/attn_mlp_ratio": 0.4754418098895107, "grad/layer_24/attn": 0.011458240449428558, "grad/layer_24/mlp": 0.009859420359134674, "grad/layer_24/attn_mlp_ratio": 1.1621616602031197, "grad/layer_27/attn": 0.005170674528926611, "grad/layer_27/mlp": 0.008253589272499084, "grad/layer_27/attn_mlp_ratio": 0.6264758634776854} {"step": 65550, "timestamp": 1778265393.0439432, "eos/sharpness": 57.436847686767564, "eos/L0_probe": 1.9721113443374634, "eos/L_plus": 2.2537944316864014, "eos/L_minus": 2.264796733856201, "eos/grad_norm": 0.14156141877174377, "eos/embed_grad_frac": 0.10613974928855896, "eos/time_s": 0.597968339920044} {"step": 65550, "timestamp": 1778265393.0612364, "train/loss": 2.14257493019104, "train/z_loss": 0.0013713741791434586, "train/perplexity": 8.52135128840757, "train/grad_norm": 0.1416015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1916827.8164879489, "perf/iters_per_sec": 0.9140147287788147, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.094074273109436, "data/tokens_consumed": 137470410752, "data/tokens_consumed_B": 137.470410752, "train/loss_slope": 3.5563328943558854e-06} {"step": 65550, "timestamp": 1778265394.4255621, "geo/rankme_last": 439.49261474609375, "geo/layer_0/stable_rank_q_proj": 19.292566299438477, "geo/layer_0/stable_rank_k_proj": 15.994071960449219, "geo/layer_0/stable_rank_o_proj": 47.09561538696289, "geo/layer_0/stable_rank_gate_proj": 130.56793212890625, "geo/layer_0/stable_rank_down_proj": 55.18099594116211, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06352709233760834, "geo/layer_0/attn_entropy_mean": 6.160425662994385, "geo/layer_0/attn_entropy_std": 0.42453140020370483, "geo/layer_7/stable_rank_q_proj": 43.292938232421875, "geo/layer_7/stable_rank_k_proj": 40.79601287841797, "geo/layer_7/stable_rank_o_proj": 89.75975799560547, "geo/layer_7/stable_rank_gate_proj": 80.51465606689453, "geo/layer_7/stable_rank_down_proj": 139.93084716796875, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4550437331199646, "geo/layer_7/attn_entropy_mean": 4.66372013092041, "geo/layer_7/attn_entropy_std": 0.790598452091217, "geo/layer_14/stable_rank_q_proj": 51.32875061035156, "geo/layer_14/stable_rank_k_proj": 40.03916931152344, "geo/layer_14/stable_rank_o_proj": 43.66355514526367, "geo/layer_14/stable_rank_gate_proj": 71.34529113769531, "geo/layer_14/stable_rank_down_proj": 129.83328247070312, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39626047015190125, "geo/layer_14/attn_entropy_mean": 5.519437313079834, "geo/layer_14/attn_entropy_std": 0.39008283615112305, "geo/layer_21/stable_rank_q_proj": 40.431941986083984, "geo/layer_21/stable_rank_k_proj": 30.124561309814453, "geo/layer_21/stable_rank_o_proj": 69.89414978027344, "geo/layer_21/stable_rank_gate_proj": 65.6593246459961, "geo/layer_21/stable_rank_down_proj": 50.959556579589844, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14303886890411377, "geo/layer_21/attn_entropy_mean": 5.676105499267578, "geo/layer_21/attn_entropy_std": 0.31675851345062256, "geo/layer_27/stable_rank_q_proj": 43.515567779541016, "geo/layer_27/stable_rank_k_proj": 31.85145378112793, "geo/layer_27/stable_rank_o_proj": 114.60221862792969, "geo/layer_27/stable_rank_gate_proj": 79.8342056274414, "geo/layer_27/stable_rank_down_proj": 128.3577117919922, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08964864164590836, "geo/layer_27/attn_entropy_mean": 4.187640190124512, "geo/layer_27/attn_entropy_std": 0.7572247982025146, "attnres/final_alpha/block_0": 0.23692643642425537, "attnres/block_norm/0": 1.766425609588623, "attnres/final_alpha/block_1": 0.004352947697043419, "attnres/block_norm/1": 47273.16015625, "attnres/final_alpha/block_2": 0.010313387960195541, "attnres/block_norm/2": 28722.615234375, "attnres/final_alpha/block_3": 0.012052744626998901, "attnres/block_norm/3": 59044.1796875, "attnres/final_alpha/block_4": 0.014717057347297668, "attnres/block_norm/4": 15316.841796875, "attnres/final_alpha/block_5": 0.6130006313323975, "attnres/block_norm/5": 6684.4892578125, "attnres/final_alpha/block_6": 0.10863684117794037, "attnres/block_norm/6": 39050.67578125, "geo/tier1_time_s": 1.3604097366333008, "geo/step": 65550.0, "geo/rankme_slope": -7.030085471688676e-05} {"step": 65560, "timestamp": 1778265404.7690463, "train/loss": 2.1257935404777526, "train/z_loss": 0.0013817879953421652, "train/perplexity": 8.379544358131568, "train/grad_norm": 0.08056640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1791775.4875703407, "perf/iters_per_sec": 0.85438513163106, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1704323530197143, "data/tokens_consumed": 137491382272, "data/tokens_consumed_B": 137.491382272, "train/loss_slope": 3.1547374636641286e-06} {"step": 65570, "timestamp": 1778265415.1088498, "train/loss": 2.114009642601013, "train/z_loss": 0.0013756940024904907, "train/perplexity": 8.281380177424627, "train/grad_norm": 0.185546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029610.1254341633, "perf/iters_per_sec": 0.9677935244723145, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0332782506942748, "data/tokens_consumed": 137512353792, "data/tokens_consumed_B": 137.512353792, "train/loss_slope": -3.1578925361942318e-06} {"step": 65580, "timestamp": 1778265425.4512985, "train/loss": 2.111236333847046, "train/z_loss": 0.001376993244048208, "train/perplexity": 8.2584451709107, "train/grad_norm": 0.09228515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028712.8615432645, "perf/iters_per_sec": 0.96736567570842, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0337352514266969, "data/tokens_consumed": 137533325312, "data/tokens_consumed_B": 137.533325312, "train/loss_slope": -3.906433839108393e-06} {"step": 65590, "timestamp": 1778265435.7962365, "train/loss": 2.1486650705337524, "train/z_loss": 0.0013818274950608612, "train/perplexity": 8.573405862603607, "train/grad_norm": 0.1845703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028172.95719316, "perf/iters_per_sec": 0.9671082292524147, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340104341506957, "data/tokens_consumed": 137554296832, "data/tokens_consumed_B": 137.554296832, "train/loss_slope": -6.213964344394335e-06} {"step": 65600, "timestamp": 1778265446.1343594, "grad/layer_0/attn": 0.002383818384259939, "grad/layer_0/mlp": 0.002836065599694848, "grad/layer_0/attn_mlp_ratio": 0.8405370808287095, "grad/layer_4/attn": 0.004106577485799789, "grad/layer_4/mlp": 0.002536030486226082, "grad/layer_4/attn_mlp_ratio": 1.619293358725168, "grad/layer_8/attn": 0.009902087971568108, "grad/layer_8/mlp": 0.003747358685359359, "grad/layer_8/attn_mlp_ratio": 2.642417910517349, "grad/layer_12/attn": 0.0057292599231004715, "grad/layer_12/mlp": 0.007892902940511703, "grad/layer_12/attn_mlp_ratio": 0.7258748642538803, "grad/layer_16/attn": 0.0038798311725258827, "grad/layer_16/mlp": 0.004612790886312723, "grad/layer_16/attn_mlp_ratio": 0.8411027475639562, "grad/layer_20/attn": 0.003553906222805381, "grad/layer_20/mlp": 0.005892399698495865, "grad/layer_20/attn_mlp_ratio": 0.603133925792438, "grad/layer_24/attn": 0.011205554008483887, "grad/layer_24/mlp": 0.009492999874055386, "grad/layer_24/attn_mlp_ratio": 1.1804017738448285, "grad/layer_27/attn": 0.0036409073509275913, "grad/layer_27/mlp": 0.0088966004550457, "grad/layer_27/attn_mlp_ratio": 0.4092470296266874} {"step": 65600, "timestamp": 1778265446.1485806, "train/loss": 2.1651514768600464, "train/z_loss": 0.0013742036186158656, "train/perplexity": 8.715922077370521, "train/grad_norm": 0.158203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027218.887499115, "perf/iters_per_sec": 0.9666532933707785, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344970703125, "data/tokens_consumed": 137575268352, "data/tokens_consumed_B": 137.575268352, "train/loss_slope": -2.5287858795339835e-06} {"step": 65610, "timestamp": 1778265456.4876802, "train/loss": 2.128198266029358, "train/z_loss": 0.0013790862285532057, "train/perplexity": 8.399719110209235, "train/grad_norm": 0.11328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029297.0602260898, "perf/iters_per_sec": 0.9676442433481645, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0334376573562623, "data/tokens_consumed": 137596239872, "data/tokens_consumed_B": 137.596239872, "train/loss_slope": -4.005335388046248e-06} {"step": 65620, "timestamp": 1778265466.8288646, "train/loss": 2.1403937101364137, "train/z_loss": 0.0013700390933081507, "train/perplexity": 8.502784602459908, "train/grad_norm": 0.09228515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028912.8141392046, "perf/iters_per_sec": 0.9674610205360434, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0336333751678466, "data/tokens_consumed": 137617211392, "data/tokens_consumed_B": 137.617211392, "train/loss_slope": -5.149362525745369e-06} {"step": 65625, "timestamp": 1778265472.5780644, "eos/sharpness": 36.39171123504638, "eos/L0_probe": 1.9733489751815796, "eos/L_plus": 2.16264009475708, "eos/L_minus": 2.147974967956543, "eos/grad_norm": 0.11808306723833084, "eos/embed_grad_frac": 0.16216759383678436, "eos/time_s": 0.5862104892730713} {"step": 65625, "timestamp": 1778265473.9547372, "geo/rankme_last": 438.12188720703125, "geo/layer_0/stable_rank_q_proj": 19.306896209716797, "geo/layer_0/stable_rank_k_proj": 16.009220123291016, "geo/layer_0/stable_rank_o_proj": 47.10194396972656, "geo/layer_0/stable_rank_gate_proj": 130.43455505371094, "geo/layer_0/stable_rank_down_proj": 55.17811965942383, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06295986473560333, "geo/layer_0/attn_entropy_mean": 6.1598920822143555, "geo/layer_0/attn_entropy_std": 0.41730940341949463, "geo/layer_7/stable_rank_q_proj": 43.2711067199707, "geo/layer_7/stable_rank_k_proj": 40.637123107910156, "geo/layer_7/stable_rank_o_proj": 89.8411865234375, "geo/layer_7/stable_rank_gate_proj": 80.54911804199219, "geo/layer_7/stable_rank_down_proj": 139.8824920654297, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4577607810497284, "geo/layer_7/attn_entropy_mean": 4.6437177658081055, "geo/layer_7/attn_entropy_std": 0.7996242642402649, "geo/layer_14/stable_rank_q_proj": 51.42033767700195, "geo/layer_14/stable_rank_k_proj": 40.05796813964844, "geo/layer_14/stable_rank_o_proj": 43.6000862121582, "geo/layer_14/stable_rank_gate_proj": 71.381103515625, "geo/layer_14/stable_rank_down_proj": 129.76075744628906, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39773041009902954, "geo/layer_14/attn_entropy_mean": 5.557509422302246, "geo/layer_14/attn_entropy_std": 0.4127442538738251, "geo/layer_21/stable_rank_q_proj": 40.491119384765625, "geo/layer_21/stable_rank_k_proj": 30.191892623901367, "geo/layer_21/stable_rank_o_proj": 69.87248229980469, "geo/layer_21/stable_rank_gate_proj": 65.60684967041016, "geo/layer_21/stable_rank_down_proj": 50.87769317626953, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14506053924560547, "geo/layer_21/attn_entropy_mean": 5.688990592956543, "geo/layer_21/attn_entropy_std": 0.28937917947769165, "geo/layer_27/stable_rank_q_proj": 43.48529815673828, "geo/layer_27/stable_rank_k_proj": 31.85649299621582, "geo/layer_27/stable_rank_o_proj": 114.4758529663086, "geo/layer_27/stable_rank_gate_proj": 79.88636779785156, "geo/layer_27/stable_rank_down_proj": 128.298583984375, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09512574225664139, "geo/layer_27/attn_entropy_mean": 4.2125244140625, "geo/layer_27/attn_entropy_std": 0.7317059636116028, "attnres/final_alpha/block_0": 0.2354694902896881, "attnres/block_norm/0": 1.7664906978607178, "attnres/final_alpha/block_1": 0.004321450367569923, "attnres/block_norm/1": 46873.87890625, "attnres/final_alpha/block_2": 0.010173716582357883, "attnres/block_norm/2": 28756.2578125, "attnres/final_alpha/block_3": 0.011776643805205822, "attnres/block_norm/3": 58922.26953125, "attnres/final_alpha/block_4": 0.014400702901184559, "attnres/block_norm/4": 15296.875, "attnres/final_alpha/block_5": 0.6160933971405029, "attnres/block_norm/5": 6700.88427734375, "attnres/final_alpha/block_6": 0.10776455700397491, "attnres/block_norm/6": 39615.69140625, "geo/tier1_time_s": 1.3589229583740234, "geo/step": 65625.0, "geo/rankme_slope": -9.661216048919568e-05} {"step": 65630, "timestamp": 1778265479.1317685, "train/loss": 2.1492159843444822, "train/z_loss": 0.0013635001378133893, "train/perplexity": 8.578130371577437, "train/grad_norm": 0.11279296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1705709.2763323744, "perf/iters_per_sec": 0.8133455640470383, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2294897079467773, "data/tokens_consumed": 137638182912, "data/tokens_consumed_B": 137.638182912, "train/loss_slope": -4.6618472052664884e-06} {"step": 65640, "timestamp": 1778265489.471723, "train/loss": 2.1447296500205995, "train/z_loss": 0.0013670797925442457, "train/perplexity": 8.539732208758524, "train/grad_norm": 0.11279296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029035.435037805, "perf/iters_per_sec": 0.9675194907368684, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033570909500122, "data/tokens_consumed": 137659154432, "data/tokens_consumed_B": 137.659154432, "train/loss_slope": -5.4400724939780705e-06} {"step": 65650, "timestamp": 1778265499.8007998, "grad/layer_0/attn": 0.0027442825958132744, "grad/layer_0/mlp": 0.003243387211114168, "grad/layer_0/attn_mlp_ratio": 0.846116215109246, "grad/layer_4/attn": 0.0020439166110008955, "grad/layer_4/mlp": 0.0024803581181913614, "grad/layer_4/attn_mlp_ratio": 0.8240408970004695, "grad/layer_8/attn": 0.0045765964314341545, "grad/layer_8/mlp": 0.0036741802468895912, "grad/layer_8/attn_mlp_ratio": 1.2456101767863712, "grad/layer_12/attn": 0.004464768338948488, "grad/layer_12/mlp": 0.007236381061375141, "grad/layer_12/attn_mlp_ratio": 0.6169891053804083, "grad/layer_16/attn": 0.004008499439805746, "grad/layer_16/mlp": 0.004641969222575426, "grad/layer_16/attn_mlp_ratio": 0.8635342375726383, "grad/layer_20/attn": 0.004018859472125769, "grad/layer_20/mlp": 0.006779738236218691, "grad/layer_20/attn_mlp_ratio": 0.5927750117812415, "grad/layer_24/attn": 0.017059918493032455, "grad/layer_24/mlp": 0.013901430182158947, "grad/layer_24/attn_mlp_ratio": 1.2272059886475928, "grad/layer_27/attn": 0.013204321265220642, "grad/layer_27/mlp": 0.014341581612825394, "grad/layer_27/attn_mlp_ratio": 0.9207018813979408} {"step": 65650, "timestamp": 1778265499.8148897, "train/loss": 2.1586661458015444, "train/z_loss": 0.001363427669275552, "train/perplexity": 8.659579335364905, "train/grad_norm": 0.2333984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028524.5967901452, "perf/iters_per_sec": 0.9672759040785528, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0338311910629272, "data/tokens_consumed": 137680125952, "data/tokens_consumed_B": 137.680125952, "train/loss_slope": -5.806483887638941e-06} {"step": 65660, "timestamp": 1778265510.1558795, "train/loss": 2.1764958620071413, "train/z_loss": 0.0013743979390710593, "train/perplexity": 8.815361829302468, "train/grad_norm": 0.126953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029442.7644723454, "perf/iters_per_sec": 0.9677137205468871, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0333634614944458, "data/tokens_consumed": 137701097472, "data/tokens_consumed_B": 137.701097472, "train/loss_slope": -3.872925320295375e-06} {"step": 65670, "timestamp": 1778265520.494386, "train/loss": 2.1316495060920717, "train/z_loss": 0.0013690189458429813, "train/perplexity": 8.428758639687413, "train/grad_norm": 0.09033203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029493.897014149, "perf/iters_per_sec": 0.9677381024428124, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0333374261856079, "data/tokens_consumed": 137722068992, "data/tokens_consumed_B": 137.722068992, "train/loss_slope": -2.530109041845163e-06} {"step": 65680, "timestamp": 1778265530.840723, "train/loss": 2.1261831879615785, "train/z_loss": 0.001364114077296108, "train/perplexity": 8.382810062701788, "train/grad_norm": 0.1552734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028429.8694180949, "perf/iters_per_sec": 0.9672307345476603, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0338794708251953, "data/tokens_consumed": 137743040512, "data/tokens_consumed_B": 137.743040512, "train/loss_slope": -1.99791098227555e-06} {"step": 65690, "timestamp": 1778265541.1788752, "train/loss": 2.13883798122406, "train/z_loss": 0.0013781607965938746, "train/perplexity": 8.489566858897154, "train/grad_norm": 0.09716796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029860.3754694292, "perf/iters_per_sec": 0.9679128529879709, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033150863647461, "data/tokens_consumed": 137764012032, "data/tokens_consumed_B": 137.764012032, "train/loss_slope": -9.067776036483882e-07} {"step": 65700, "timestamp": 1778265551.5060997, "grad/layer_0/attn": 0.002746429294347763, "grad/layer_0/mlp": 0.003073338186368346, "grad/layer_0/attn_mlp_ratio": 0.8936306512463749, "grad/layer_4/attn": 0.0032878280617296696, "grad/layer_4/mlp": 0.0025367559865117073, "grad/layer_4/attn_mlp_ratio": 1.296075755651684, "grad/layer_8/attn": 0.0035313284024596214, "grad/layer_8/mlp": 0.003501123981550336, "grad/layer_8/attn_mlp_ratio": 1.0086270352623181, "grad/layer_12/attn": 0.005412854719907045, "grad/layer_12/mlp": 0.007051373366266489, "grad/layer_12/attn_mlp_ratio": 0.7676312630159136, "grad/layer_16/attn": 0.0034648634027689695, "grad/layer_16/mlp": 0.0044437251053750515, "grad/layer_16/attn_mlp_ratio": 0.7797204468399469, "grad/layer_20/attn": 0.0047622499987483025, "grad/layer_20/mlp": 0.005675864405930042, "grad/layer_20/attn_mlp_ratio": 0.8390351802395545, "grad/layer_24/attn": 0.008816148154437542, "grad/layer_24/mlp": 0.008666396141052246, "grad/layer_24/attn_mlp_ratio": 1.0172796061038532, "grad/layer_27/attn": 0.004510441794991493, "grad/layer_27/mlp": 0.008104919455945492, "grad/layer_27/attn_mlp_ratio": 0.556506670283086} {"step": 65700, "timestamp": 1778265552.0902884, "eos/sharpness": 58.3576202392578, "eos/L0_probe": 1.9748433828353882, "eos/L_plus": 2.2336626052856445, "eos/L_minus": 2.29960036277771, "eos/grad_norm": 0.13607101142406464, "eos/embed_grad_frac": 0.12171830981969833, "eos/time_s": 0.5814511775970459} {"step": 65700, "timestamp": 1778265552.1099675, "train/loss": 2.122467589378357, "train/z_loss": 0.0013639506185427308, "train/perplexity": 8.35172069907458, "train/grad_norm": 0.1357421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1919402.5349348406, "perf/iters_per_sec": 0.9152424502062038, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0926066637039185, "data/tokens_consumed": 137784983552, "data/tokens_consumed_B": 137.784983552, "train/loss_slope": -3.47673445895272e-07} {"step": 65700, "timestamp": 1778265553.4713104, "geo/rankme_last": 438.46942138671875, "geo/layer_0/stable_rank_q_proj": 19.304119110107422, "geo/layer_0/stable_rank_k_proj": 16.02593231201172, "geo/layer_0/stable_rank_o_proj": 47.13475799560547, "geo/layer_0/stable_rank_gate_proj": 130.14892578125, "geo/layer_0/stable_rank_down_proj": 55.14690017700195, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0626865029335022, "geo/layer_0/attn_entropy_mean": 6.1584014892578125, "geo/layer_0/attn_entropy_std": 0.42070311307907104, "geo/layer_7/stable_rank_q_proj": 43.221885681152344, "geo/layer_7/stable_rank_k_proj": 40.617088317871094, "geo/layer_7/stable_rank_o_proj": 89.91645812988281, "geo/layer_7/stable_rank_gate_proj": 80.38623046875, "geo/layer_7/stable_rank_down_proj": 139.9283905029297, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4555578827857971, "geo/layer_7/attn_entropy_mean": 4.660473823547363, "geo/layer_7/attn_entropy_std": 0.7861759066581726, "geo/layer_14/stable_rank_q_proj": 51.388465881347656, "geo/layer_14/stable_rank_k_proj": 39.98469924926758, "geo/layer_14/stable_rank_o_proj": 43.55082321166992, "geo/layer_14/stable_rank_gate_proj": 71.28667449951172, "geo/layer_14/stable_rank_down_proj": 129.59112548828125, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3880593180656433, "geo/layer_14/attn_entropy_mean": 5.52238130569458, "geo/layer_14/attn_entropy_std": 0.3924117088317871, "geo/layer_21/stable_rank_q_proj": 40.40697479248047, "geo/layer_21/stable_rank_k_proj": 30.21192741394043, "geo/layer_21/stable_rank_o_proj": 69.89884185791016, "geo/layer_21/stable_rank_gate_proj": 65.6340560913086, "geo/layer_21/stable_rank_down_proj": 50.88652420043945, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1415819376707077, "geo/layer_21/attn_entropy_mean": 5.690052032470703, "geo/layer_21/attn_entropy_std": 0.294583797454834, "geo/layer_27/stable_rank_q_proj": 43.569671630859375, "geo/layer_27/stable_rank_k_proj": 31.805028915405273, "geo/layer_27/stable_rank_o_proj": 114.13880920410156, "geo/layer_27/stable_rank_gate_proj": 79.90876007080078, "geo/layer_27/stable_rank_down_proj": 128.3336181640625, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09291137754917145, "geo/layer_27/attn_entropy_mean": 4.215468406677246, "geo/layer_27/attn_entropy_std": 0.7430816888809204, "attnres/final_alpha/block_0": 0.2370733618736267, "attnres/block_norm/0": 1.76654052734375, "attnres/final_alpha/block_1": 0.0042772009037435055, "attnres/block_norm/1": 47309.390625, "attnres/final_alpha/block_2": 0.010356608778238297, "attnres/block_norm/2": 28709.09375, "attnres/final_alpha/block_3": 0.012134636752307415, "attnres/block_norm/3": 59427.3671875, "attnres/final_alpha/block_4": 0.014588067308068275, "attnres/block_norm/4": 15259.095703125, "attnres/final_alpha/block_5": 0.6120460033416748, "attnres/block_norm/5": 6711.27490234375, "attnres/final_alpha/block_6": 0.10952415317296982, "attnres/block_norm/6": 39338.1875, "geo/tier1_time_s": 1.3574397563934326, "geo/step": 65700.0, "geo/rankme_slope": -0.00010160658013205282} {"step": 65710, "timestamp": 1778265563.8321595, "train/loss": 2.1257392168045044, "train/z_loss": 0.0013804007670842112, "train/perplexity": 8.37908916286594, "train/grad_norm": 0.0927734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1789604.3321899397, "perf/iters_per_sec": 0.8533498440694521, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1718523263931275, "data/tokens_consumed": 137805955072, "data/tokens_consumed_B": 137.805955072, "train/loss_slope": -1.0481546993123566e-06} {"step": 65720, "timestamp": 1778265574.193597, "train/loss": 2.122212624549866, "train/z_loss": 0.0013829192263074218, "train/perplexity": 8.349591575476284, "train/grad_norm": 0.1240234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025334.7033305168, "perf/iters_per_sec": 0.9657548443462929, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354594707489013, "data/tokens_consumed": 137826926592, "data/tokens_consumed_B": 137.826926592, "train/loss_slope": -1.1498662254454457e-06} {"step": 65730, "timestamp": 1778265584.5515883, "train/loss": 2.1795546293258665, "train/z_loss": 0.0013628856977447867, "train/perplexity": 8.842367250563314, "train/grad_norm": 0.1884765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025695.2959085556, "perf/iters_per_sec": 0.965926788286474, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035275149345398, "data/tokens_consumed": 137847898112, "data/tokens_consumed_B": 137.847898112, "train/loss_slope": 5.342883817650931e-08} {"step": 65740, "timestamp": 1778265594.917848, "train/loss": 2.1666934490203857, "train/z_loss": 0.0013876120559871198, "train/perplexity": 8.729372153722162, "train/grad_norm": 0.1611328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024470.3824698739, "perf/iters_per_sec": 0.9653427040433282, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0359015464782715, "data/tokens_consumed": 137868869632, "data/tokens_consumed_B": 137.868869632, "train/loss_slope": 2.559002075973606e-06} {"step": 65750, "timestamp": 1778265605.25179, "grad/layer_0/attn": 0.0029909212607890368, "grad/layer_0/mlp": 0.00313210510648787, "grad/layer_0/attn_mlp_ratio": 0.9549236259987743, "grad/layer_4/attn": 0.0023776048328727484, "grad/layer_4/mlp": 0.0025617817882448435, "grad/layer_4/attn_mlp_ratio": 0.9281058796546179, "grad/layer_8/attn": 0.003836610121652484, "grad/layer_8/mlp": 0.0037479624152183533, "grad/layer_8/attn_mlp_ratio": 1.023652212655325, "grad/layer_12/attn": 0.004365872126072645, "grad/layer_12/mlp": 0.00697473157197237, "grad/layer_12/attn_mlp_ratio": 0.6259555680997301, "grad/layer_16/attn": 0.004632079508155584, "grad/layer_16/mlp": 0.004829143639653921, "grad/layer_16/attn_mlp_ratio": 0.9591927177730976, "grad/layer_20/attn": 0.0031265749130398035, "grad/layer_20/mlp": 0.006364075932651758, "grad/layer_20/attn_mlp_ratio": 0.4912849716122946, "grad/layer_24/attn": 0.018711032345891, "grad/layer_24/mlp": 0.010523124597966671, "grad/layer_24/attn_mlp_ratio": 1.7780871065327617, "grad/layer_27/attn": 0.007956752553582191, "grad/layer_27/mlp": 0.009599702432751656, "grad/layer_27/attn_mlp_ratio": 0.8288540740127986} {"step": 65750, "timestamp": 1778265605.265835, "train/loss": 2.1739439249038695, "train/z_loss": 0.001375890802592039, "train/perplexity": 8.792894260475876, "train/grad_norm": 0.2197265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028089.5787751372, "perf/iters_per_sec": 0.9670684713245092, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340529441833497, "data/tokens_consumed": 137889841152, "data/tokens_consumed_B": 137.889841152, "train/loss_slope": 5.261821887030315e-06} {"step": 65760, "timestamp": 1778265615.610362, "train/loss": 2.1223395824432374, "train/z_loss": 0.0013823083252646029, "train/perplexity": 8.350651689326705, "train/grad_norm": 0.205078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028320.4176949207, "perf/iters_per_sec": 0.9671785438990215, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033935260772705, "data/tokens_consumed": 137910812672, "data/tokens_consumed_B": 137.910812672, "train/loss_slope": 4.328165975662826e-06} {"step": 65770, "timestamp": 1778265625.98025, "train/loss": 2.1121787309646605, "train/z_loss": 0.0013778304681181907, "train/perplexity": 8.266231574201488, "train/grad_norm": 0.1103515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023759.5561194024, "perf/iters_per_sec": 0.9650037556263935, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0362653970718383, "data/tokens_consumed": 137931784192, "data/tokens_consumed_B": 137.931784192, "train/loss_slope": 3.464974407101052e-06} {"step": 65775, "timestamp": 1778265631.7344456, "eos/sharpness": 26.086592674255368, "eos/L0_probe": 1.9757027626037598, "eos/L_plus": 2.1128571033477783, "eos/L_minus": 2.099414348602295, "eos/grad_norm": 0.10507293045520782, "eos/embed_grad_frac": 0.19727104902267456, "eos/time_s": 0.5824618339538574} {"step": 65775, "timestamp": 1778265633.1081533, "geo/rankme_last": 438.3226623535156, "geo/layer_0/stable_rank_q_proj": 19.32756233215332, "geo/layer_0/stable_rank_k_proj": 16.028196334838867, "geo/layer_0/stable_rank_o_proj": 47.152618408203125, "geo/layer_0/stable_rank_gate_proj": 130.23818969726562, "geo/layer_0/stable_rank_down_proj": 55.13789749145508, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06082944571971893, "geo/layer_0/attn_entropy_mean": 6.157459735870361, "geo/layer_0/attn_entropy_std": 0.42553648352622986, "geo/layer_7/stable_rank_q_proj": 43.23904800415039, "geo/layer_7/stable_rank_k_proj": 40.565399169921875, "geo/layer_7/stable_rank_o_proj": 89.83658599853516, "geo/layer_7/stable_rank_gate_proj": 80.42022705078125, "geo/layer_7/stable_rank_down_proj": 140.0103302001953, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.442576140165329, "geo/layer_7/attn_entropy_mean": 4.643165588378906, "geo/layer_7/attn_entropy_std": 0.8268754482269287, "geo/layer_14/stable_rank_q_proj": 51.28192138671875, "geo/layer_14/stable_rank_k_proj": 40.14035415649414, "geo/layer_14/stable_rank_o_proj": 43.55162811279297, "geo/layer_14/stable_rank_gate_proj": 71.41959381103516, "geo/layer_14/stable_rank_down_proj": 129.20433044433594, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3843863308429718, "geo/layer_14/attn_entropy_mean": 5.560148239135742, "geo/layer_14/attn_entropy_std": 0.39544478058815, "geo/layer_21/stable_rank_q_proj": 40.31563186645508, "geo/layer_21/stable_rank_k_proj": 30.241146087646484, "geo/layer_21/stable_rank_o_proj": 69.80364227294922, "geo/layer_21/stable_rank_gate_proj": 65.60589599609375, "geo/layer_21/stable_rank_down_proj": 50.852378845214844, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14416521787643433, "geo/layer_21/attn_entropy_mean": 5.7031636238098145, "geo/layer_21/attn_entropy_std": 0.29814162850379944, "geo/layer_27/stable_rank_q_proj": 43.538272857666016, "geo/layer_27/stable_rank_k_proj": 31.788894653320312, "geo/layer_27/stable_rank_o_proj": 114.15034484863281, "geo/layer_27/stable_rank_gate_proj": 79.91767883300781, "geo/layer_27/stable_rank_down_proj": 128.27947998046875, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09684035181999207, "geo/layer_27/attn_entropy_mean": 4.202165603637695, "geo/layer_27/attn_entropy_std": 0.7174867391586304, "attnres/final_alpha/block_0": 0.23542605340480804, "attnres/block_norm/0": 1.7666730880737305, "attnres/final_alpha/block_1": 0.00429414352402091, "attnres/block_norm/1": 47144.47265625, "attnres/final_alpha/block_2": 0.010230345651507378, "attnres/block_norm/2": 28625.216796875, "attnres/final_alpha/block_3": 0.011899230070412159, "attnres/block_norm/3": 58965.9140625, "attnres/final_alpha/block_4": 0.014337703585624695, "attnres/block_norm/4": 15331.134765625, "attnres/final_alpha/block_5": 0.6155463457107544, "attnres/block_norm/5": 6688.646484375, "attnres/final_alpha/block_6": 0.108266182243824, "attnres/block_norm/6": 39473.328125, "geo/tier1_time_s": 1.3559551239013672, "geo/step": 65775.0, "geo/rankme_slope": -0.00013119720544467787} {"step": 65780, "timestamp": 1778265638.2797592, "train/loss": 2.1362496614456177, "train/z_loss": 0.001370221795514226, "train/perplexity": 8.467621558065648, "train/grad_norm": 0.14453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1705943.3250365036, "perf/iters_per_sec": 0.8134571671659964, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.229321026802063, "data/tokens_consumed": 137952755712, "data/tokens_consumed_B": 137.952755712, "train/loss_slope": -2.2780626270100573e-07} {"step": 65790, "timestamp": 1778265648.618541, "train/loss": 2.1280739307403564, "train/z_loss": 0.0013825970119796694, "train/perplexity": 8.398674793630182, "train/grad_norm": 0.1298828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029224.4500153966, "perf/iters_per_sec": 0.967609620101641, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0334746360778808, "data/tokens_consumed": 137973727232, "data/tokens_consumed_B": 137.973727232, "train/loss_slope": -3.445512405072741e-06} {"step": 65800, "timestamp": 1778265658.9514074, "grad/layer_0/attn": 0.002538772765547037, "grad/layer_0/mlp": 0.002882810775190592, "grad/layer_0/attn_mlp_ratio": 0.8806587998524845, "grad/layer_4/attn": 0.0029820359777659178, "grad/layer_4/mlp": 0.002488331403583288, "grad/layer_4/attn_mlp_ratio": 1.19840783813237, "grad/layer_8/attn": 0.0061638373881578445, "grad/layer_8/mlp": 0.003585713915526867, "grad/layer_8/attn_mlp_ratio": 1.7189986043134455, "grad/layer_12/attn": 0.006264306139200926, "grad/layer_12/mlp": 0.007495385594666004, "grad/layer_12/attn_mlp_ratio": 0.83575500906629, "grad/layer_16/attn": 0.004089007619768381, "grad/layer_16/mlp": 0.004572187550365925, "grad/layer_16/attn_mlp_ratio": 0.8943219159959733, "grad/layer_20/attn": 0.003330809995532036, "grad/layer_20/mlp": 0.006380898412317038, "grad/layer_20/attn_mlp_ratio": 0.5219970179908957, "grad/layer_24/attn": 0.01751548796892166, "grad/layer_24/mlp": 0.00988616794347763, "grad/layer_24/attn_mlp_ratio": 1.771716593516479, "grad/layer_27/attn": 0.011216854676604271, "grad/layer_27/mlp": 0.008780273608863354, "grad/layer_27/attn_mlp_ratio": 1.2775062655827325} {"step": 65800, "timestamp": 1778265658.9654052, "train/loss": 2.1647245049476624, "train/z_loss": 0.001383116713259369, "train/perplexity": 8.712201417818044, "train/grad_norm": 0.1640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027956.365233292, "perf/iters_per_sec": 0.9670049501577817, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341208696365356, "data/tokens_consumed": 137994698752, "data/tokens_consumed_B": 137.994698752, "train/loss_slope": 1.9802828290031632e-07} {"step": 65810, "timestamp": 1778265669.3030336, "train/loss": 2.1182064771652223, "train/z_loss": 0.0013842831482179462, "train/perplexity": 8.316208793842295, "train/grad_norm": 0.17578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029607.362396222, "perf/iters_per_sec": 0.9677922069531546, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0332796573638916, "data/tokens_consumed": 138015670272, "data/tokens_consumed_B": 138.015670272, "train/loss_slope": -7.430303143267473e-07} {"step": 65820, "timestamp": 1778265680.1452606, "train/loss": 2.17238392829895, "train/z_loss": 0.0013719204580411315, "train/perplexity": 8.77918806886791, "train/grad_norm": 0.19140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1935048.147608734, "perf/iters_per_sec": 0.9227028596919699, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0837725162506104, "data/tokens_consumed": 138036641792, "data/tokens_consumed_B": 138.036641792, "train/loss_slope": 2.9317565841284847e-07} {"step": 65830, "timestamp": 1778265690.4858632, "train/loss": 2.119148778915405, "train/z_loss": 0.001365788804832846, "train/perplexity": 8.324048865219988, "train/grad_norm": 0.1064453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029656.0210629734, "perf/iters_per_sec": 0.9678154092135303, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0332548856735229, "data/tokens_consumed": 138057613312, "data/tokens_consumed_B": 138.057613312, "train/loss_slope": -1.0286133817964313e-06} {"step": 65840, "timestamp": 1778265700.8228848, "train/loss": 2.1553198099136353, "train/z_loss": 0.0013615634059533476, "train/perplexity": 8.6306499050521, "train/grad_norm": 0.126953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029839.4838114628, "perf/iters_per_sec": 0.9679028910691561, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033161497116089, "data/tokens_consumed": 138078584832, "data/tokens_consumed_B": 138.078584832, "train/loss_slope": 5.503792609676046e-07} {"step": 65850, "timestamp": 1778265711.1613085, "grad/layer_0/attn": 0.002367371693253517, "grad/layer_0/mlp": 0.002715315204113722, "grad/layer_0/attn_mlp_ratio": 0.8718588554584922, "grad/layer_4/attn": 0.002089601242914796, "grad/layer_4/mlp": 0.0025675047654658556, "grad/layer_4/attn_mlp_ratio": 0.8138645698479143, "grad/layer_8/attn": 0.00855923630297184, "grad/layer_8/mlp": 0.0036927172914147377, "grad/layer_8/attn_mlp_ratio": 2.3178693075379546, "grad/layer_12/attn": 0.004308902658522129, "grad/layer_12/mlp": 0.0070589520037174225, "grad/layer_12/attn_mlp_ratio": 0.610416758069934, "grad/layer_16/attn": 0.0035350483376532793, "grad/layer_16/mlp": 0.004906067159026861, "grad/layer_16/attn_mlp_ratio": 0.7205462442751898, "grad/layer_20/attn": 0.004550558980554342, "grad/layer_20/mlp": 0.006583391223102808, "grad/layer_20/attn_mlp_ratio": 0.6912180603005117, "grad/layer_24/attn": 0.01270698755979538, "grad/layer_24/mlp": 0.011728648096323013, "grad/layer_24/attn_mlp_ratio": 1.0834145032825762, "grad/layer_27/attn": 0.0060926866717636585, "grad/layer_27/mlp": 0.012624453753232956, "grad/layer_27/attn_mlp_ratio": 0.4826099206028942} {"step": 65850, "timestamp": 1778265711.7471423, "eos/sharpness": 64.95878696441649, "eos/L0_probe": 1.974278450012207, "eos/L_plus": 2.3323769569396973, "eos/L_minus": 2.265767812728882, "eos/grad_norm": 0.2032967209815979, "eos/embed_grad_frac": 0.05616790056228638, "eos/time_s": 0.5830602645874023} {"step": 65850, "timestamp": 1778265711.7676134, "train/loss": 2.1624732494354246, "train/z_loss": 0.0013599417172372342, "train/perplexity": 8.692610087172326, "train/grad_norm": 0.203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1917301.576579803, "perf/iters_per_sec": 0.9142406351946845, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0938039302825928, "data/tokens_consumed": 138099556352, "data/tokens_consumed_B": 138.099556352, "train/loss_slope": 1.0511877370102102e-06} {"step": 65850, "timestamp": 1778265713.1297698, "geo/rankme_last": 438.041015625, "geo/layer_0/stable_rank_q_proj": 19.34621238708496, "geo/layer_0/stable_rank_k_proj": 16.02068519592285, "geo/layer_0/stable_rank_o_proj": 47.182884216308594, "geo/layer_0/stable_rank_gate_proj": 130.3153076171875, "geo/layer_0/stable_rank_down_proj": 55.13655090332031, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06120259314775467, "geo/layer_0/attn_entropy_mean": 6.1564741134643555, "geo/layer_0/attn_entropy_std": 0.42199936509132385, "geo/layer_7/stable_rank_q_proj": 43.25183868408203, "geo/layer_7/stable_rank_k_proj": 40.70256805419922, "geo/layer_7/stable_rank_o_proj": 89.63794708251953, "geo/layer_7/stable_rank_gate_proj": 80.46279907226562, "geo/layer_7/stable_rank_down_proj": 140.02859497070312, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.45573291182518005, "geo/layer_7/attn_entropy_mean": 4.645257472991943, "geo/layer_7/attn_entropy_std": 0.8106527328491211, "geo/layer_14/stable_rank_q_proj": 51.31570816040039, "geo/layer_14/stable_rank_k_proj": 40.24949645996094, "geo/layer_14/stable_rank_o_proj": 43.543601989746094, "geo/layer_14/stable_rank_gate_proj": 71.40351104736328, "geo/layer_14/stable_rank_down_proj": 129.1049346923828, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.40038153529167175, "geo/layer_14/attn_entropy_mean": 5.5747971534729, "geo/layer_14/attn_entropy_std": 0.4045819938182831, "geo/layer_21/stable_rank_q_proj": 40.26296615600586, "geo/layer_21/stable_rank_k_proj": 30.197246551513672, "geo/layer_21/stable_rank_o_proj": 69.78397369384766, "geo/layer_21/stable_rank_gate_proj": 65.60440826416016, "geo/layer_21/stable_rank_down_proj": 50.89069366455078, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14272360503673553, "geo/layer_21/attn_entropy_mean": 5.711945533752441, "geo/layer_21/attn_entropy_std": 0.28475913405418396, "geo/layer_27/stable_rank_q_proj": 43.505767822265625, "geo/layer_27/stable_rank_k_proj": 31.782930374145508, "geo/layer_27/stable_rank_o_proj": 114.22396850585938, "geo/layer_27/stable_rank_gate_proj": 79.8955307006836, "geo/layer_27/stable_rank_down_proj": 128.41517639160156, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10259448736906052, "geo/layer_27/attn_entropy_mean": 4.211459636688232, "geo/layer_27/attn_entropy_std": 0.7260001301765442, "attnres/final_alpha/block_0": 0.23415344953536987, "attnres/block_norm/0": 1.7666581869125366, "attnres/final_alpha/block_1": 0.00419106287881732, "attnres/block_norm/1": 47342.0703125, "attnres/final_alpha/block_2": 0.010175680741667747, "attnres/block_norm/2": 28642.564453125, "attnres/final_alpha/block_3": 0.01206078752875328, "attnres/block_norm/3": 59406.0078125, "attnres/final_alpha/block_4": 0.014516116119921207, "attnres/block_norm/4": 15301.775390625, "attnres/final_alpha/block_5": 0.6182737350463867, "attnres/block_norm/5": 6673.490234375, "attnres/final_alpha/block_6": 0.1066291332244873, "attnres/block_norm/6": 39408.78125, "geo/tier1_time_s": 1.358701229095459, "geo/step": 65850.0, "geo/rankme_slope": -0.0001326692786489596} {"step": 65860, "timestamp": 1778265723.4693916, "train/loss": 2.128263330459595, "train/z_loss": 0.0013649782282300293, "train/perplexity": 8.400265650927278, "train/grad_norm": 0.16015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1792663.8316019243, "perf/iters_per_sec": 0.854808727074587, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.169852352142334, "data/tokens_consumed": 138120527872, "data/tokens_consumed_B": 138.120527872, "train/loss_slope": -5.410253840669027e-07} {"step": 65870, "timestamp": 1778265733.806145, "train/loss": 2.1356040716171263, "train/z_loss": 0.001380097959190607, "train/perplexity": 8.462156711930593, "train/grad_norm": 0.23046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029832.5512486915, "perf/iters_per_sec": 0.9678995853656251, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0331650257110596, "data/tokens_consumed": 138141499392, "data/tokens_consumed_B": 138.141499392, "train/loss_slope": 7.144707943846965e-07} {"step": 65880, "timestamp": 1778265744.151898, "train/loss": 2.1324084520339968, "train/z_loss": 0.0013678124640136957, "train/perplexity": 8.435158039944726, "train/grad_norm": 0.1923828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028252.0397501837, "perf/iters_per_sec": 0.9671459387541693, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0339701175689697, "data/tokens_consumed": 138162470912, "data/tokens_consumed_B": 138.162470912, "train/loss_slope": 1.964441219894043e-06} {"step": 65890, "timestamp": 1778265754.909735, "train/loss": 2.1604127883911133, "train/z_loss": 0.0013616861077025532, "train/perplexity": 8.6747177422843, "train/grad_norm": 0.1455078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1950650.40223933, "perf/iters_per_sec": 0.9301425944515849, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0751039743423463, "data/tokens_consumed": 138183442432, "data/tokens_consumed_B": 138.183442432, "train/loss_slope": 2.7043687163477637e-06} {"step": 65900, "timestamp": 1778265765.235479, "grad/layer_0/attn": 0.0024001169949769974, "grad/layer_0/mlp": 0.002662775805220008, "grad/layer_0/attn_mlp_ratio": 0.901358987916293, "grad/layer_4/attn": 0.004490172956138849, "grad/layer_4/mlp": 0.002429730026051402, "grad/layer_4/attn_mlp_ratio": 1.8480130398003953, "grad/layer_8/attn": 0.006899765692651272, "grad/layer_8/mlp": 0.003574380651116371, "grad/layer_8/attn_mlp_ratio": 1.9303387560198497, "grad/layer_12/attn": 0.0068110255524516106, "grad/layer_12/mlp": 0.007123945746570826, "grad/layer_12/attn_mlp_ratio": 0.9560748634452574, "grad/layer_16/attn": 0.0034197017084807158, "grad/layer_16/mlp": 0.0045280358754098415, "grad/layer_16/attn_mlp_ratio": 0.7552284758893048, "grad/layer_20/attn": 0.007719023153185844, "grad/layer_20/mlp": 0.005195156671106815, "grad/layer_20/attn_mlp_ratio": 1.485811399593496, "grad/layer_24/attn": 0.009809466078877449, "grad/layer_24/mlp": 0.00889282114803791, "grad/layer_24/attn_mlp_ratio": 1.1030769432188672, "grad/layer_27/attn": 0.004761890042573214, "grad/layer_27/mlp": 0.009314320050179958, "grad/layer_27/attn_mlp_ratio": 0.5112439733436918} {"step": 65900, "timestamp": 1778265765.24955, "train/loss": 2.1181204080581666, "train/z_loss": 0.0013745604315772652, "train/perplexity": 8.315493055979221, "train/grad_norm": 0.171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029214.946935875, "perf/iters_per_sec": 0.9676050886802078, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0334794759750365, "data/tokens_consumed": 138204413952, "data/tokens_consumed_B": 138.204413952, "train/loss_slope": 3.882469588702079e-07} {"step": 65910, "timestamp": 1778265775.5885007, "train/loss": 2.1713138580322267, "train/z_loss": 0.0013826021924614907, "train/perplexity": 8.769798745263385, "train/grad_norm": 0.2138671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029660.7980742147, "perf/iters_per_sec": 0.9678176870699953, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0332524538040162, "data/tokens_consumed": 138225385472, "data/tokens_consumed_B": 138.225385472, "train/loss_slope": 2.7103107253817556e-06} {"step": 65920, "timestamp": 1778265785.9295895, "train/loss": 2.096154344081879, "train/z_loss": 0.0013977314112707973, "train/perplexity": 8.134825940609636, "train/grad_norm": 0.1845703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029002.1106399735, "perf/iters_per_sec": 0.9675036004257076, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033587884902954, "data/tokens_consumed": 138246356992, "data/tokens_consumed_B": 138.246356992, "train/loss_slope": 1.7305856848348192e-07} {"step": 65925, "timestamp": 1778265791.6716783, "eos/sharpness": 76.4007806777954, "eos/L0_probe": 1.9687083959579468, "eos/L_plus": 2.405754804611206, "eos/L_minus": 2.2956697940826416, "eos/grad_norm": 0.2370474636554718, "eos/embed_grad_frac": 0.04788285121321678, "eos/time_s": 0.5830931663513184} {"step": 65925, "timestamp": 1778265793.046851, "geo/rankme_last": 438.53802490234375, "geo/layer_0/stable_rank_q_proj": 19.356674194335938, "geo/layer_0/stable_rank_k_proj": 16.048866271972656, "geo/layer_0/stable_rank_o_proj": 47.20269012451172, "geo/layer_0/stable_rank_gate_proj": 130.3362274169922, "geo/layer_0/stable_rank_down_proj": 55.25206756591797, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06128372997045517, "geo/layer_0/attn_entropy_mean": 6.159351825714111, "geo/layer_0/attn_entropy_std": 0.4233770966529846, "geo/layer_7/stable_rank_q_proj": 43.313175201416016, "geo/layer_7/stable_rank_k_proj": 40.74552917480469, "geo/layer_7/stable_rank_o_proj": 89.52332305908203, "geo/layer_7/stable_rank_gate_proj": 80.52079010009766, "geo/layer_7/stable_rank_down_proj": 139.9039764404297, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.438679575920105, "geo/layer_7/attn_entropy_mean": 4.678430557250977, "geo/layer_7/attn_entropy_std": 0.8092289566993713, "geo/layer_14/stable_rank_q_proj": 51.32105255126953, "geo/layer_14/stable_rank_k_proj": 40.27200698852539, "geo/layer_14/stable_rank_o_proj": 43.56633377075195, "geo/layer_14/stable_rank_gate_proj": 71.4834976196289, "geo/layer_14/stable_rank_down_proj": 129.0583953857422, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39693495631217957, "geo/layer_14/attn_entropy_mean": 5.565319538116455, "geo/layer_14/attn_entropy_std": 0.40221309661865234, "geo/layer_21/stable_rank_q_proj": 40.26838302612305, "geo/layer_21/stable_rank_k_proj": 30.201011657714844, "geo/layer_21/stable_rank_o_proj": 69.81303405761719, "geo/layer_21/stable_rank_gate_proj": 65.6011734008789, "geo/layer_21/stable_rank_down_proj": 50.85712814331055, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14058572053909302, "geo/layer_21/attn_entropy_mean": 5.694938659667969, "geo/layer_21/attn_entropy_std": 0.30495762825012207, "geo/layer_27/stable_rank_q_proj": 43.40681076049805, "geo/layer_27/stable_rank_k_proj": 31.780946731567383, "geo/layer_27/stable_rank_o_proj": 114.1878433227539, "geo/layer_27/stable_rank_gate_proj": 80.03636169433594, "geo/layer_27/stable_rank_down_proj": 128.6102752685547, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09471634775400162, "geo/layer_27/attn_entropy_mean": 4.202993392944336, "geo/layer_27/attn_entropy_std": 0.7344373464584351, "attnres/final_alpha/block_0": 0.2341841161251068, "attnres/block_norm/0": 1.76674222946167, "attnres/final_alpha/block_1": 0.004279798362404108, "attnres/block_norm/1": 47211.24609375, "attnres/final_alpha/block_2": 0.01013461034744978, "attnres/block_norm/2": 28786.798828125, "attnres/final_alpha/block_3": 0.011984551325440407, "attnres/block_norm/3": 59486.109375, "attnres/final_alpha/block_4": 0.014245056547224522, "attnres/block_norm/4": 15289.470703125, "attnres/final_alpha/block_5": 0.618559718132019, "attnres/block_norm/5": 6678.85400390625, "attnres/final_alpha/block_6": 0.10661210864782333, "attnres/block_norm/6": 39453.734375, "geo/tier1_time_s": 1.3552756309509277, "geo/step": 65925.0, "geo/rankme_slope": -0.0001251375354829432} {"step": 65930, "timestamp": 1778265798.2175379, "train/loss": 2.152946043014526, "train/z_loss": 0.0013687413185834885, "train/perplexity": 8.610187050621565, "train/grad_norm": 0.11474609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1707338.9257877152, "perf/iters_per_sec": 0.8141226414621903, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2283161640167237, "data/tokens_consumed": 138267328512, "data/tokens_consumed_B": 138.267328512, "train/loss_slope": 1.050443810955934e-06} {"step": 65940, "timestamp": 1778265808.588535, "train/loss": 2.147289848327637, "train/z_loss": 0.001380008680280298, "train/perplexity": 8.561623627931608, "train/grad_norm": 0.12109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023068.16621321, "perf/iters_per_sec": 0.9646740752283144, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0366195440292358, "data/tokens_consumed": 138288300032, "data/tokens_consumed_B": 138.288300032, "train/loss_slope": 1.7772435569705898e-06} {"step": 65950, "timestamp": 1778265818.9171555, "grad/layer_0/attn": 0.0026962824631482363, "grad/layer_0/mlp": 0.002974331146106124, "grad/layer_0/attn_mlp_ratio": 0.9065172101049949, "grad/layer_4/attn": 0.004099866840988398, "grad/layer_4/mlp": 0.0025527463294565678, "grad/layer_4/attn_mlp_ratio": 1.606061140142769, "grad/layer_8/attn": 0.0033881806302815676, "grad/layer_8/mlp": 0.0037111968267709017, "grad/layer_8/attn_mlp_ratio": 0.9129616932587866, "grad/layer_12/attn": 0.004079146310687065, "grad/layer_12/mlp": 0.006548945792019367, "grad/layer_12/attn_mlp_ratio": 0.6228706692565543, "grad/layer_16/attn": 0.003765803063288331, "grad/layer_16/mlp": 0.0044010356068611145, "grad/layer_16/attn_mlp_ratio": 0.8556629198480553, "grad/layer_20/attn": 0.0039247311651706696, "grad/layer_20/mlp": 0.00556737557053566, "grad/layer_20/attn_mlp_ratio": 0.7049517398191051, "grad/layer_24/attn": 0.004399068653583527, "grad/layer_24/mlp": 0.007407813332974911, "grad/layer_24/attn_mlp_ratio": 0.5938417177195161, "grad/layer_27/attn": 0.00458284979686141, "grad/layer_27/mlp": 0.006544430740177631, "grad/layer_27/attn_mlp_ratio": 0.7002671292247962} {"step": 65950, "timestamp": 1778265818.9312088, "train/loss": 2.100260078907013, "train/z_loss": 0.001378045161254704, "train/perplexity": 8.168294037321042, "train/grad_norm": 0.10693359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028945.0123905966, "perf/iters_per_sec": 0.9674763738587363, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0336169719696044, "data/tokens_consumed": 138309271552, "data/tokens_consumed_B": 138.309271552, "train/loss_slope": 8.561063950175529e-07} {"step": 65960, "timestamp": 1778265829.2678916, "train/loss": 2.1602835297584533, "train/z_loss": 0.001385013770777732, "train/perplexity": 8.673596532594797, "train/grad_norm": 0.0888671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029777.7481963183, "perf/iters_per_sec": 0.9678734532338706, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0331929206848145, "data/tokens_consumed": 138330243072, "data/tokens_consumed_B": 138.330243072, "train/loss_slope": 7.195715213229617e-08} {"step": 65970, "timestamp": 1778265839.6155064, "train/loss": 2.0951542377471926, "train/z_loss": 0.00139033425366506, "train/perplexity": 8.126694316576994, "train/grad_norm": 0.115234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028097.154097001, "perf/iters_per_sec": 0.9670720835194593, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340490818023682, "data/tokens_consumed": 138351214592, "data/tokens_consumed_B": 138.351214592, "train/loss_slope": -2.7560919138750756e-06} {"step": 65980, "timestamp": 1778265849.954748, "train/loss": 2.193701243400574, "train/z_loss": 0.00139402796048671, "train/perplexity": 8.96834579149866, "train/grad_norm": 0.1337890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029616.9159831011, "perf/iters_per_sec": 0.9677967624583727, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0332747936248778, "data/tokens_consumed": 138372186112, "data/tokens_consumed_B": 138.372186112, "train/loss_slope": 9.469527937863083e-07} {"step": 65990, "timestamp": 1778265860.2934806, "train/loss": 2.147154760360718, "train/z_loss": 0.0013724469114094972, "train/perplexity": 8.560467133718172, "train/grad_norm": 0.08837890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029365.9768505685, "perf/iters_per_sec": 0.9676771053555339, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0334025621414185, "data/tokens_consumed": 138393157632, "data/tokens_consumed_B": 138.393157632, "train/loss_slope": 3.3554376655965588e-06} {"step": 66000, "timestamp": 1778265870.6271608, "grad/layer_0/attn": 0.002784640761092305, "grad/layer_0/mlp": 0.003111362922936678, "grad/layer_0/attn_mlp_ratio": 0.8949906329040338, "grad/layer_4/attn": 0.0021930993534624577, "grad/layer_4/mlp": 0.002614792436361313, "grad/layer_4/attn_mlp_ratio": 0.8387278619489735, "grad/layer_8/attn": 0.003251565620303154, "grad/layer_8/mlp": 0.0036250378470867872, "grad/layer_8/attn_mlp_ratio": 0.8969742297225415, "grad/layer_12/attn": 0.005430541932582855, "grad/layer_12/mlp": 0.006306937895715237, "grad/layer_12/attn_mlp_ratio": 0.861042543350231, "grad/layer_16/attn": 0.0074347578920423985, "grad/layer_16/mlp": 0.004848100710660219, "grad/layer_16/attn_mlp_ratio": 1.5335402835878584, "grad/layer_20/attn": 0.008224571123719215, "grad/layer_20/mlp": 0.006951093208044767, "grad/layer_20/attn_mlp_ratio": 1.1832053979480615, "grad/layer_24/attn": 0.016242587938904762, "grad/layer_24/mlp": 0.01192732434719801, "grad/layer_24/attn_mlp_ratio": 1.3617964373158729, "grad/layer_27/attn": 0.012747547589242458, "grad/layer_27/mlp": 0.012895675376057625, "grad/layer_27/attn_mlp_ratio": 0.9885133673617809} {"step": 66000, "timestamp": 1778265871.2118099, "eos/sharpness": 88.14380168914794, "eos/L0_probe": 1.9723132848739624, "eos/L_plus": 2.3579554557800293, "eos/L_minus": 2.468109130859375, "eos/grad_norm": 0.2719833254814148, "eos/embed_grad_frac": 0.027646442875266075, "eos/time_s": 0.5817995071411133} {"step": 66000, "timestamp": 1778265871.2319183, "train/loss": 2.127745032310486, "train/z_loss": 0.0013832440134137868, "train/perplexity": 8.395912936887633, "train/grad_norm": 0.271484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1918494.3874180904, "perf/iters_per_sec": 0.9148094117250873, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0931238651275634, "data/tokens_consumed": 138414129152, "data/tokens_consumed_B": 138.414129152, "train/loss_slope": 4.4241182088065275e-06} {"step": 66000, "timestamp": 1778265872.5927813, "geo/rankme_last": 438.55914306640625, "geo/layer_0/stable_rank_q_proj": 19.343996047973633, "geo/layer_0/stable_rank_k_proj": 16.067808151245117, "geo/layer_0/stable_rank_o_proj": 47.2016487121582, "geo/layer_0/stable_rank_gate_proj": 130.5107421875, "geo/layer_0/stable_rank_down_proj": 55.32549285888672, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06737928837537766, "geo/layer_0/attn_entropy_mean": 6.158105850219727, "geo/layer_0/attn_entropy_std": 0.4218932092189789, "geo/layer_7/stable_rank_q_proj": 43.3388557434082, "geo/layer_7/stable_rank_k_proj": 40.71029281616211, "geo/layer_7/stable_rank_o_proj": 89.567138671875, "geo/layer_7/stable_rank_gate_proj": 80.53605651855469, "geo/layer_7/stable_rank_down_proj": 140.0338897705078, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4597879648208618, "geo/layer_7/attn_entropy_mean": 4.647871017456055, "geo/layer_7/attn_entropy_std": 0.7986235618591309, "geo/layer_14/stable_rank_q_proj": 51.20414733886719, "geo/layer_14/stable_rank_k_proj": 40.28404998779297, "geo/layer_14/stable_rank_o_proj": 43.530025482177734, "geo/layer_14/stable_rank_gate_proj": 71.47109985351562, "geo/layer_14/stable_rank_down_proj": 129.10238647460938, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.40935254096984863, "geo/layer_14/attn_entropy_mean": 5.5497727394104, "geo/layer_14/attn_entropy_std": 0.42425447702407837, "geo/layer_21/stable_rank_q_proj": 40.29147720336914, "geo/layer_21/stable_rank_k_proj": 30.176551818847656, "geo/layer_21/stable_rank_o_proj": 69.81392669677734, "geo/layer_21/stable_rank_gate_proj": 65.56952667236328, "geo/layer_21/stable_rank_down_proj": 50.8832893371582, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14157123863697052, "geo/layer_21/attn_entropy_mean": 5.675108432769775, "geo/layer_21/attn_entropy_std": 0.3014258146286011, "geo/layer_27/stable_rank_q_proj": 43.374664306640625, "geo/layer_27/stable_rank_k_proj": 31.756572723388672, "geo/layer_27/stable_rank_o_proj": 114.1845703125, "geo/layer_27/stable_rank_gate_proj": 79.98160552978516, "geo/layer_27/stable_rank_down_proj": 128.58026123046875, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0995393693447113, "geo/layer_27/attn_entropy_mean": 4.218076705932617, "geo/layer_27/attn_entropy_std": 0.7446310520172119, "attnres/final_alpha/block_0": 0.2367122918367386, "attnres/block_norm/0": 1.767106294631958, "attnres/final_alpha/block_1": 0.004369260743260384, "attnres/block_norm/1": 47183.51953125, "attnres/final_alpha/block_2": 0.010225366801023483, "attnres/block_norm/2": 28660.4921875, "attnres/final_alpha/block_3": 0.012178387492895126, "attnres/block_norm/3": 58822.76171875, "attnres/final_alpha/block_4": 0.014493025839328766, "attnres/block_norm/4": 15309.107421875, "attnres/final_alpha/block_5": 0.6111594438552856, "attnres/block_norm/5": 6749.0380859375, "attnres/final_alpha/block_6": 0.11086222529411316, "attnres/block_norm/6": 39321.91796875, "geo/tier1_time_s": 1.356708288192749, "geo/step": 66000.0, "geo/rankme_slope": -0.00014256991859243698} {"step": 66000, "timestamp": 1778265879.7184763, "geo/ww_alpha_mean": 7.7953942441287545, "geo/ww_alpha_std": 5.430075285058482, "geo/ww_alpha_min": 1.3347180641647758, "geo/ww_alpha_max": 52.97922566340403, "geo/ww_alpha_healthy_frac": 0.18781725888324874, "geo/ww_alpha_by_type/q_proj": 3.9372795798433495, "geo/ww_alpha_by_type/k_proj": 4.526651615047904, "geo/ww_alpha_by_type/v_proj": 8.098108600824423, "geo/ww_alpha_by_type/o_proj": 8.990547244599503, "geo/ww_alpha_by_type/gate_proj": 8.127755959813173, "geo/ww_alpha_by_type/up_proj": 12.851102623424044, "geo/ww_alpha_by_type/down_proj": 8.135982678080564, "geo/twonn_id/layer_0": 0.6742449402809143, "geo/twonn_id/layer_7": 3.269508123397827, "geo/twonn_id/layer_14": 5.386867523193359, "geo/twonn_id/layer_21": 6.897573471069336, "geo/twonn_id/layer_27": 5.462623596191406, "geo/tier2_time_s": 7.1197240352630615} {"step": 66000, "timestamp": 1778265880.4315393, "eoc/jacobian_sigma/layer_0/attn": 1247.2757568359375, "eoc/jacobian_sigma/layer_0/mlp": 8254.3837890625, "eoc/jacobian_sigma/layer_0": 8254.3837890625, "eoc/jacobian_sigma/layer_7/attn": 1.1533256769180298, "eoc/jacobian_sigma/layer_7/mlp": 1.755942702293396, "eoc/jacobian_sigma/layer_7": 1.755942702293396, "eoc/jacobian_sigma/layer_14/attn": 1.4666920900344849, "eoc/jacobian_sigma/layer_14/mlp": 5.700003623962402, "eoc/jacobian_sigma/layer_14": 5.700003623962402, "eoc/jacobian_sigma/layer_21/attn": 1.0824027061462402, "eoc/jacobian_sigma/layer_21/mlp": 4.252837181091309, "eoc/jacobian_sigma/layer_21": 4.252837181091309, "eoc/jacobian_sigma/layer_27/attn": 2.9042999744415283, "eoc/jacobian_sigma/layer_27/mlp": 27.703292846679688, "eoc/jacobian_sigma/layer_27": 27.703292846679688, "eoc/layer0_sigma": 8254.3837890625, "eoc/sigma_max": 27.703292846679688, "eoc/sigma_min": 1.755942702293396, "eoc/sigma_mean": 9.853019088506699, "eoc/time_s": 0.7060089111328125} {"step": 66010, "timestamp": 1778265890.790377, "train/loss": 2.1009621024131775, "train/z_loss": 0.0013636576593853533, "train/perplexity": 8.17403038503038, "train/grad_norm": 0.1728515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1072473.498407645, "perf/iters_per_sec": 0.5113952152288651, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.955434799194336, "data/tokens_consumed": 138435100672, "data/tokens_consumed_B": 138.435100672, "train/loss_slope": 5.954151499783043e-07} {"step": 66020, "timestamp": 1778265901.1327395, "train/loss": 2.1682302236557005, "train/z_loss": 0.00137499290285632, "train/perplexity": 8.742797544688175, "train/grad_norm": 0.30859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029001.736214756, "perf/iters_per_sec": 0.9675034218858509, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0335880756378173, "data/tokens_consumed": 138456072192, "data/tokens_consumed_B": 138.456072192, "train/loss_slope": 2.416341897787289e-08} {"step": 66030, "timestamp": 1778265911.4748466, "train/loss": 2.1270769119262694, "train/z_loss": 0.0013764351373538376, "train/perplexity": 8.39030532979729, "train/grad_norm": 0.212890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029139.3467892115, "perf/iters_per_sec": 0.967569039721113, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0335179805755614, "data/tokens_consumed": 138477043712, "data/tokens_consumed_B": 138.477043712, "train/loss_slope": 4.7353883423392514e-07} {"step": 66040, "timestamp": 1778265921.815967, "train/loss": 2.1427870273590086, "train/z_loss": 0.0013915158458985388, "train/perplexity": 8.523158834564043, "train/grad_norm": 0.130859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029212.184973792, "perf/iters_per_sec": 0.967603771674057, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0334808826446533, "data/tokens_consumed": 138498015232, "data/tokens_consumed_B": 138.498015232, "train/loss_slope": -1.255095309049499e-06} {"step": 66050, "timestamp": 1778265932.1487384, "grad/layer_0/attn": 0.0028552638832479715, "grad/layer_0/mlp": 0.0029769798275083303, "grad/layer_0/attn_mlp_ratio": 0.9591142543032756, "grad/layer_4/attn": 0.0021049187052994967, "grad/layer_4/mlp": 0.0024077410344034433, "grad/layer_4/attn_mlp_ratio": 0.8742296566781977, "grad/layer_8/attn": 0.004797295201569796, "grad/layer_8/mlp": 0.0035566731821745634, "grad/layer_8/attn_mlp_ratio": 1.348815261051111, "grad/layer_12/attn": 0.0050445133820176125, "grad/layer_12/mlp": 0.007300815545022488, "grad/layer_12/attn_mlp_ratio": 0.690952028826647, "grad/layer_16/attn": 0.0036612905096262693, "grad/layer_16/mlp": 0.004634432028979063, "grad/layer_16/attn_mlp_ratio": 0.7900192316405397, "grad/layer_20/attn": 0.004220100585371256, "grad/layer_20/mlp": 0.006091075949370861, "grad/layer_20/attn_mlp_ratio": 0.6928333435940509, "grad/layer_24/attn": 0.010286353528499603, "grad/layer_24/mlp": 0.009636042639613152, "grad/layer_24/attn_mlp_ratio": 1.0674873292345484, "grad/layer_27/attn": 0.004661731421947479, "grad/layer_27/mlp": 0.009008141234517097, "grad/layer_27/attn_mlp_ratio": 0.5175020294236294} {"step": 66050, "timestamp": 1778265932.1627967, "train/loss": 2.173201632499695, "train/z_loss": 0.001385065447539091, "train/perplexity": 8.786369783689976, "train/grad_norm": 0.1416015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027772.8219966486, "perf/iters_per_sec": 0.9669174299224131, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034214472770691, "data/tokens_consumed": 138518986752, "data/tokens_consumed_B": 138.518986752, "train/loss_slope": 2.270599710594388e-06} {"step": 66060, "timestamp": 1778265942.9400394, "train/loss": 2.1882709503173827, "train/z_loss": 0.0013838529121130705, "train/perplexity": 8.919777036017791, "train/grad_norm": 0.10302734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1947364.6413701156, "perf/iters_per_sec": 0.9285758215761736, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0769179821014405, "data/tokens_consumed": 138539958272, "data/tokens_consumed_B": 138.539958272, "train/loss_slope": 3.4619953408457256e-06} {"step": 66070, "timestamp": 1778265953.2873192, "train/loss": 2.119708228111267, "train/z_loss": 0.001387492974754423, "train/perplexity": 8.328707050551474, "train/grad_norm": 0.166015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028231.2747206518, "perf/iters_per_sec": 0.9671360372164973, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033980703353882, "data/tokens_consumed": 138560929792, "data/tokens_consumed_B": 138.560929792, "train/loss_slope": 2.9927705380782625e-06} {"step": 66075, "timestamp": 1778265959.5539496, "eos/sharpness": 67.50550270080565, "eos/L0_probe": 1.9749525785446167, "eos/L_plus": 2.2822258472442627, "eos/L_minus": 2.3427343368530273, "eos/grad_norm": 0.18992148339748383, "eos/embed_grad_frac": 0.06280593574047089, "eos/time_s": 0.6043577194213867} {"step": 66075, "timestamp": 1778265960.9365938, "geo/rankme_last": 438.0892639160156, "geo/layer_0/stable_rank_q_proj": 19.35847282409668, "geo/layer_0/stable_rank_k_proj": 16.058574676513672, "geo/layer_0/stable_rank_o_proj": 47.276336669921875, "geo/layer_0/stable_rank_gate_proj": 130.2367706298828, "geo/layer_0/stable_rank_down_proj": 55.31431198120117, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06544234603643417, "geo/layer_0/attn_entropy_mean": 6.161288261413574, "geo/layer_0/attn_entropy_std": 0.42367318272590637, "geo/layer_7/stable_rank_q_proj": 43.33806228637695, "geo/layer_7/stable_rank_k_proj": 40.76062774658203, "geo/layer_7/stable_rank_o_proj": 89.49478149414062, "geo/layer_7/stable_rank_gate_proj": 80.30712127685547, "geo/layer_7/stable_rank_down_proj": 139.79473876953125, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.44431638717651367, "geo/layer_7/attn_entropy_mean": 4.632539749145508, "geo/layer_7/attn_entropy_std": 0.807488739490509, "geo/layer_14/stable_rank_q_proj": 51.177345275878906, "geo/layer_14/stable_rank_k_proj": 40.31064987182617, "geo/layer_14/stable_rank_o_proj": 43.51061248779297, "geo/layer_14/stable_rank_gate_proj": 71.51824951171875, "geo/layer_14/stable_rank_down_proj": 128.9957733154297, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3954743444919586, "geo/layer_14/attn_entropy_mean": 5.54312801361084, "geo/layer_14/attn_entropy_std": 0.40657010674476624, "geo/layer_21/stable_rank_q_proj": 40.356292724609375, "geo/layer_21/stable_rank_k_proj": 30.09307861328125, "geo/layer_21/stable_rank_o_proj": 69.66122436523438, "geo/layer_21/stable_rank_gate_proj": 65.637451171875, "geo/layer_21/stable_rank_down_proj": 50.86544418334961, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.144178569316864, "geo/layer_21/attn_entropy_mean": 5.691436767578125, "geo/layer_21/attn_entropy_std": 0.29966607689857483, "geo/layer_27/stable_rank_q_proj": 43.34637451171875, "geo/layer_27/stable_rank_k_proj": 31.776119232177734, "geo/layer_27/stable_rank_o_proj": 114.30321502685547, "geo/layer_27/stable_rank_gate_proj": 79.99011993408203, "geo/layer_27/stable_rank_down_proj": 128.4534912109375, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09127552062273026, "geo/layer_27/attn_entropy_mean": 4.201930046081543, "geo/layer_27/attn_entropy_std": 0.7515307664871216, "attnres/final_alpha/block_0": 0.2364788055419922, "attnres/block_norm/0": 1.767207145690918, "attnres/final_alpha/block_1": 0.0042956313118338585, "attnres/block_norm/1": 47300.46875, "attnres/final_alpha/block_2": 0.010126926936209202, "attnres/block_norm/2": 28639.5859375, "attnres/final_alpha/block_3": 0.012030255049467087, "attnres/block_norm/3": 59250.82421875, "attnres/final_alpha/block_4": 0.014433469623327255, "attnres/block_norm/4": 15286.666015625, "attnres/final_alpha/block_5": 0.6137756705284119, "attnres/block_norm/5": 6680.86669921875, "attnres/final_alpha/block_6": 0.10885920375585556, "attnres/block_norm/6": 39615.4296875, "geo/tier1_time_s": 1.3593173027038574, "geo/step": 66075.0, "geo/rankme_slope": -0.00015008020786439575} {"step": 66080, "timestamp": 1778265966.1097963, "train/loss": 2.1591363906860352, "train/z_loss": 0.0013784163747914135, "train/perplexity": 8.663652415846776, "train/grad_norm": 0.2412109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1636085.5920784385, "perf/iters_per_sec": 0.7801464043037598, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2818106889724732, "data/tokens_consumed": 138581901312, "data/tokens_consumed_B": 138.581901312, "train/loss_slope": 2.800295941650107e-06} {"step": 66090, "timestamp": 1778265976.4495478, "train/loss": 2.156821835041046, "train/z_loss": 0.0013692511478438974, "train/perplexity": 8.643623098667724, "train/grad_norm": 0.11865234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029218.2706629264, "perf/iters_per_sec": 0.9676066735567695, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033477783203125, "data/tokens_consumed": 138602872832, "data/tokens_consumed_B": 138.602872832, "train/loss_slope": 2.227094707780817e-06} {"step": 66100, "timestamp": 1778265986.784739, "grad/layer_0/attn": 0.0030820064712315798, "grad/layer_0/mlp": 0.0028904746286571026, "grad/layer_0/attn_mlp_ratio": 1.0662630746000197, "grad/layer_4/attn": 0.002333409385755658, "grad/layer_4/mlp": 0.0025334490928798914, "grad/layer_4/attn_mlp_ratio": 0.9210405293753539, "grad/layer_8/attn": 0.004382340703159571, "grad/layer_8/mlp": 0.00392354978248477, "grad/layer_8/attn_mlp_ratio": 1.116932582588768, "grad/layer_12/attn": 0.0045435610227286816, "grad/layer_12/mlp": 0.006810465827584267, "grad/layer_12/attn_mlp_ratio": 0.6671439327412249, "grad/layer_16/attn": 0.004458845127373934, "grad/layer_16/mlp": 0.004438498057425022, "grad/layer_16/attn_mlp_ratio": 1.0045842015085382, "grad/layer_20/attn": 0.0036234152503311634, "grad/layer_20/mlp": 0.005541183520108461, "grad/layer_20/attn_mlp_ratio": 0.6539063670769026, "grad/layer_24/attn": 0.00427141273394227, "grad/layer_24/mlp": 0.007659642957150936, "grad/layer_24/attn_mlp_ratio": 0.5576516688926567, "grad/layer_27/attn": 0.0044800955802202225, "grad/layer_27/mlp": 0.006913971155881882, "grad/layer_27/attn_mlp_ratio": 0.6479771775748847} {"step": 66100, "timestamp": 1778265986.7991755, "train/loss": 2.097526025772095, "train/z_loss": 0.0013738004374317825, "train/perplexity": 8.14599198878663, "train/grad_norm": 0.1181640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027581.2730756812, "perf/iters_per_sec": 0.9668260922792822, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343121767044068, "data/tokens_consumed": 138623844352, "data/tokens_consumed_B": 138.623844352, "train/loss_slope": 1.24890752787203e-06} {"step": 66110, "timestamp": 1778265997.1397805, "train/loss": 2.155881142616272, "train/z_loss": 0.0013617908116430044, "train/perplexity": 8.635495931078013, "train/grad_norm": 0.09375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029391.1194746858, "perf/iters_per_sec": 0.9676890942929677, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0333897590637207, "data/tokens_consumed": 138644815872, "data/tokens_consumed_B": 138.644815872, "train/loss_slope": 1.2432842829761442e-06} {"step": 66120, "timestamp": 1778266008.0423748, "train/loss": 2.093703734874725, "train/z_loss": 0.0013798373518511653, "train/perplexity": 8.114915068109259, "train/grad_norm": 0.08544921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1924879.5939291043, "perf/iters_per_sec": 0.9178541154523393, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0894977569580078, "data/tokens_consumed": 138665787392, "data/tokens_consumed_B": 138.665787392, "train/loss_slope": 2.0640539472033334e-07} {"step": 66130, "timestamp": 1778266018.3887198, "train/loss": 2.16570041179657, "train/z_loss": 0.0013707576086744665, "train/perplexity": 8.720707864925625, "train/grad_norm": 0.12451171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028249.2336402654, "perf/iters_per_sec": 0.9671446006966903, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0339715480804443, "data/tokens_consumed": 138686758912, "data/tokens_consumed_B": 138.686758912, "train/loss_slope": 1.3560724444407625e-06} {"step": 66140, "timestamp": 1778266029.0640688, "train/loss": 2.150391697883606, "train/z_loss": 0.0013834113720804452, "train/perplexity": 8.588221726704651, "train/grad_norm": 0.08984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1965746.9802454351, "perf/iters_per_sec": 0.9373412038066078, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0668473720550538, "data/tokens_consumed": 138707730432, "data/tokens_consumed_B": 138.707730432, "train/loss_slope": 1.9772609861770145e-06} {"step": 66150, "timestamp": 1778266039.399469, "grad/layer_0/attn": 0.00303004402667284, "grad/layer_0/mlp": 0.0033432573545724154, "grad/layer_0/attn_mlp_ratio": 0.9063148943342045, "grad/layer_4/attn": 0.0026160432025790215, "grad/layer_4/mlp": 0.0025419541634619236, "grad/layer_4/attn_mlp_ratio": 1.0291464485345205, "grad/layer_8/attn": 0.008346408605575562, "grad/layer_8/mlp": 0.003743031295016408, "grad/layer_8/attn_mlp_ratio": 2.2298526848287294, "grad/layer_12/attn": 0.006958957761526108, "grad/layer_12/mlp": 0.006847039330750704, "grad/layer_12/attn_mlp_ratio": 1.0163455069752876, "grad/layer_16/attn": 0.005056978203356266, "grad/layer_16/mlp": 0.0047043683007359505, "grad/layer_16/attn_mlp_ratio": 1.0749536967736528, "grad/layer_20/attn": 0.004744014237076044, "grad/layer_20/mlp": 0.005652609281241894, "grad/layer_20/attn_mlp_ratio": 0.8392609354573465, "grad/layer_24/attn": 0.008348933421075344, "grad/layer_24/mlp": 0.007315630093216896, "grad/layer_24/attn_mlp_ratio": 1.1412459624895384, "grad/layer_27/attn": 0.003868411062285304, "grad/layer_27/mlp": 0.00661894865334034, "grad/layer_27/attn_mlp_ratio": 0.5844449332430712} {"step": 66150, "timestamp": 1778266039.9912066, "eos/sharpness": 4.157745838165282, "eos/L0_probe": 1.9693154096603394, "eos/L_plus": 1.9953536987304688, "eos/L_minus": 1.9848545789718628, "eos/grad_norm": 0.09309285879135132, "eos/embed_grad_frac": 0.26491281390190125, "eos/time_s": 0.588796854019165} {"step": 66150, "timestamp": 1778266040.011074, "train/loss": 2.141189706325531, "train/z_loss": 0.0013745409552939237, "train/perplexity": 8.50955548103962, "train/grad_norm": 0.09326171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1916488.0684427388, "perf/iters_per_sec": 0.9138527242864317, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.094268226623535, "data/tokens_consumed": 138728701952, "data/tokens_consumed_B": 138.728701952, "train/loss_slope": 1.4756541977477754e-06} {"step": 66150, "timestamp": 1778266041.3769443, "geo/rankme_last": 439.2192077636719, "geo/layer_0/stable_rank_q_proj": 19.357486724853516, "geo/layer_0/stable_rank_k_proj": 16.0406494140625, "geo/layer_0/stable_rank_o_proj": 47.2556037902832, "geo/layer_0/stable_rank_gate_proj": 130.09840393066406, "geo/layer_0/stable_rank_down_proj": 55.367156982421875, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06048571318387985, "geo/layer_0/attn_entropy_mean": 6.159250259399414, "geo/layer_0/attn_entropy_std": 0.4231214225292206, "geo/layer_7/stable_rank_q_proj": 43.40427017211914, "geo/layer_7/stable_rank_k_proj": 40.798980712890625, "geo/layer_7/stable_rank_o_proj": 89.60101318359375, "geo/layer_7/stable_rank_gate_proj": 80.30237579345703, "geo/layer_7/stable_rank_down_proj": 139.4962158203125, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.44514837861061096, "geo/layer_7/attn_entropy_mean": 4.646450042724609, "geo/layer_7/attn_entropy_std": 0.80409836769104, "geo/layer_14/stable_rank_q_proj": 51.159828186035156, "geo/layer_14/stable_rank_k_proj": 40.27663040161133, "geo/layer_14/stable_rank_o_proj": 43.50234603881836, "geo/layer_14/stable_rank_gate_proj": 71.50480651855469, "geo/layer_14/stable_rank_down_proj": 128.96450805664062, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38430359959602356, "geo/layer_14/attn_entropy_mean": 5.514153480529785, "geo/layer_14/attn_entropy_std": 0.4127922058105469, "geo/layer_21/stable_rank_q_proj": 40.340423583984375, "geo/layer_21/stable_rank_k_proj": 30.03697967529297, "geo/layer_21/stable_rank_o_proj": 69.60725402832031, "geo/layer_21/stable_rank_gate_proj": 65.65350341796875, "geo/layer_21/stable_rank_down_proj": 50.85497283935547, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1425389051437378, "geo/layer_21/attn_entropy_mean": 5.677868843078613, "geo/layer_21/attn_entropy_std": 0.29608359932899475, "geo/layer_27/stable_rank_q_proj": 43.283477783203125, "geo/layer_27/stable_rank_k_proj": 31.722530364990234, "geo/layer_27/stable_rank_o_proj": 114.44648742675781, "geo/layer_27/stable_rank_gate_proj": 79.99517822265625, "geo/layer_27/stable_rank_down_proj": 128.56776428222656, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09602224081754684, "geo/layer_27/attn_entropy_mean": 4.209988594055176, "geo/layer_27/attn_entropy_std": 0.763129472732544, "attnres/final_alpha/block_0": 0.23497605323791504, "attnres/block_norm/0": 1.7671515941619873, "attnres/final_alpha/block_1": 0.004210841376334429, "attnres/block_norm/1": 47378.1484375, "attnres/final_alpha/block_2": 0.010041464120149612, "attnres/block_norm/2": 28647.701171875, "attnres/final_alpha/block_3": 0.011869490146636963, "attnres/block_norm/3": 59006.375, "attnres/final_alpha/block_4": 0.014070864766836166, "attnres/block_norm/4": 15340.54296875, "attnres/final_alpha/block_5": 0.6173621416091919, "attnres/block_norm/5": 6617.4228515625, "attnres/final_alpha/block_6": 0.10746918618679047, "attnres/block_norm/6": 39410.02734375, "geo/tier1_time_s": 1.361267328262329, "geo/step": 66150.0, "geo/rankme_slope": -0.00011634661677170868} {"step": 66160, "timestamp": 1778266051.7163467, "train/loss": 2.1120661973953245, "train/z_loss": 0.0013815509155392647, "train/perplexity": 8.26530139799649, "train/grad_norm": 0.208984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1792252.9082150047, "perf/iters_per_sec": 0.8546127835345291, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1701205730438233, "data/tokens_consumed": 138749673472, "data/tokens_consumed_B": 138.749673472, "train/loss_slope": -9.473853664930904e-07} {"step": 66170, "timestamp": 1778266062.0590026, "train/loss": 2.1126956582069396, "train/z_loss": 0.0013811376877129078, "train/perplexity": 8.270505719108973, "train/grad_norm": 0.294921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029099.465885584, "perf/iters_per_sec": 0.9675500230243607, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033538293838501, "data/tokens_consumed": 138770644992, "data/tokens_consumed_B": 138.770644992, "train/loss_slope": -5.0730747894736326e-06} {"step": 66180, "timestamp": 1778266072.3986595, "train/loss": 2.110522222518921, "train/z_loss": 0.0013804667280055583, "train/perplexity": 8.252549826877964, "train/grad_norm": 0.2470703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029543.3931465473, "perf/iters_per_sec": 0.9677617040379273, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0333122253417968, "data/tokens_consumed": 138791616512, "data/tokens_consumed_B": 138.791616512, "train/loss_slope": -6.32974238070936e-06} {"step": 66190, "timestamp": 1778266082.7398133, "train/loss": 2.157245409488678, "train/z_loss": 0.0013751636259257793, "train/perplexity": 8.647285092055945, "train/grad_norm": 0.1337890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028927.4155451749, "perf/iters_per_sec": 0.9674679830289721, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0336259365081788, "data/tokens_consumed": 138812588032, "data/tokens_consumed_B": 138.812588032, "train/loss_slope": -5.61392830901058e-06} {"step": 66200, "timestamp": 1778266093.07793, "grad/layer_0/attn": 0.00363708077929914, "grad/layer_0/mlp": 0.0036159218288958073, "grad/layer_0/attn_mlp_ratio": 1.0058515783303412, "grad/layer_4/attn": 0.002732559572905302, "grad/layer_4/mlp": 0.0026234739925712347, "grad/layer_4/attn_mlp_ratio": 1.0415805441505823, "grad/layer_8/attn": 0.0045575937256217, "grad/layer_8/mlp": 0.003993947058916092, "grad/layer_8/attn_mlp_ratio": 1.1411251937691071, "grad/layer_12/attn": 0.004650043789297342, "grad/layer_12/mlp": 0.006781276315450668, "grad/layer_12/attn_mlp_ratio": 0.6857180719993274, "grad/layer_16/attn": 0.0037330787163227797, "grad/layer_16/mlp": 0.005036127287894487, "grad/layer_16/attn_mlp_ratio": 0.7412597872913441, "grad/layer_20/attn": 0.003977705724537373, "grad/layer_20/mlp": 0.00793975405395031, "grad/layer_20/attn_mlp_ratio": 0.5009860062931952, "grad/layer_24/attn": 0.020795408636331558, "grad/layer_24/mlp": 0.01499095093458891, "grad/layer_24/attn_mlp_ratio": 1.3871974225217543, "grad/layer_27/attn": 0.010233837179839611, "grad/layer_27/mlp": 0.01576842926442623, "grad/layer_27/attn_mlp_ratio": 0.6490080237748518} {"step": 66200, "timestamp": 1778266093.0924609, "train/loss": 2.148649299144745, "train/z_loss": 0.001367920485790819, "train/perplexity": 8.573270649150883, "train/grad_norm": 0.27734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027088.6379168497, "perf/iters_per_sec": 0.966591185530114, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345635414123535, "data/tokens_consumed": 138833559552, "data/tokens_consumed_B": 138.833559552, "train/loss_slope": -9.789343771546818e-06} {"step": 66210, "timestamp": 1778266103.4374692, "train/loss": 2.161067879199982, "train/z_loss": 0.0013916507014073431, "train/perplexity": 8.68040233190471, "train/grad_norm": 0.099609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028743.1350069533, "perf/iters_per_sec": 0.9673801112208144, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033719825744629, "data/tokens_consumed": 138854531072, "data/tokens_consumed_B": 138.854531072, "train/loss_slope": -6.469324806092077e-06} {"step": 66220, "timestamp": 1778266113.7791789, "train/loss": 2.18239848613739, "train/z_loss": 0.001375217130407691, "train/perplexity": 8.86754946723622, "train/grad_norm": 0.0888671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028761.2901604457, "perf/iters_per_sec": 0.9673887682726124, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0337105751037599, "data/tokens_consumed": 138875502592, "data/tokens_consumed_B": 138.875502592, "train/loss_slope": -2.652461558106534e-06} {"step": 66225, "timestamp": 1778266119.5299609, "eos/sharpness": 73.29139709472655, "eos/L0_probe": 1.972280502319336, "eos/L_plus": 2.3904528617858887, "eos/L_minus": 2.287022113800049, "eos/grad_norm": 0.20613306760787964, "eos/embed_grad_frac": 0.06223790720105171, "eos/time_s": 0.5847978591918945} {"step": 66225, "timestamp": 1778266120.910208, "geo/rankme_last": 439.4463195800781, "geo/layer_0/stable_rank_q_proj": 19.380823135375977, "geo/layer_0/stable_rank_k_proj": 16.062114715576172, "geo/layer_0/stable_rank_o_proj": 47.26758575439453, "geo/layer_0/stable_rank_gate_proj": 130.06239318847656, "geo/layer_0/stable_rank_down_proj": 55.33970642089844, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.060498300939798355, "geo/layer_0/attn_entropy_mean": 6.158669471740723, "geo/layer_0/attn_entropy_std": 0.4213652014732361, "geo/layer_7/stable_rank_q_proj": 43.30267333984375, "geo/layer_7/stable_rank_k_proj": 40.76515579223633, "geo/layer_7/stable_rank_o_proj": 89.59479522705078, "geo/layer_7/stable_rank_gate_proj": 80.51119995117188, "geo/layer_7/stable_rank_down_proj": 139.83863830566406, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4572184979915619, "geo/layer_7/attn_entropy_mean": 4.618221759796143, "geo/layer_7/attn_entropy_std": 0.7981520891189575, "geo/layer_14/stable_rank_q_proj": 51.17091369628906, "geo/layer_14/stable_rank_k_proj": 40.39413070678711, "geo/layer_14/stable_rank_o_proj": 43.520931243896484, "geo/layer_14/stable_rank_gate_proj": 71.45069885253906, "geo/layer_14/stable_rank_down_proj": 128.85386657714844, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3972164988517761, "geo/layer_14/attn_entropy_mean": 5.539391994476318, "geo/layer_14/attn_entropy_std": 0.41124624013900757, "geo/layer_21/stable_rank_q_proj": 40.310916900634766, "geo/layer_21/stable_rank_k_proj": 30.070270538330078, "geo/layer_21/stable_rank_o_proj": 69.6759262084961, "geo/layer_21/stable_rank_gate_proj": 65.5932388305664, "geo/layer_21/stable_rank_down_proj": 50.77731704711914, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.146929532289505, "geo/layer_21/attn_entropy_mean": 5.6923909187316895, "geo/layer_21/attn_entropy_std": 0.29394009709358215, "geo/layer_27/stable_rank_q_proj": 43.303741455078125, "geo/layer_27/stable_rank_k_proj": 31.778614044189453, "geo/layer_27/stable_rank_o_proj": 114.45010375976562, "geo/layer_27/stable_rank_gate_proj": 79.93417358398438, "geo/layer_27/stable_rank_down_proj": 128.34983825683594, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09749983996152878, "geo/layer_27/attn_entropy_mean": 4.220685958862305, "geo/layer_27/attn_entropy_std": 0.750786304473877, "attnres/final_alpha/block_0": 0.23507189750671387, "attnres/block_norm/0": 1.767379879951477, "attnres/final_alpha/block_1": 0.0041859205812215805, "attnres/block_norm/1": 47265.609375, "attnres/final_alpha/block_2": 0.010124476626515388, "attnres/block_norm/2": 28718.41796875, "attnres/final_alpha/block_3": 0.011884219944477081, "attnres/block_norm/3": 59202.6171875, "attnres/final_alpha/block_4": 0.014178281649947166, "attnres/block_norm/4": 15328.9453125, "attnres/final_alpha/block_5": 0.6184401512145996, "attnres/block_norm/5": 6629.087890625, "attnres/final_alpha/block_6": 0.10611507296562195, "attnres/block_norm/6": 39470.1953125, "geo/tier1_time_s": 1.3624374866485596, "geo/step": 66225.0, "geo/rankme_slope": -8.97046318527411e-05} {"step": 66230, "timestamp": 1778266126.0851846, "train/loss": 2.1456757545471192, "train/z_loss": 0.0013679257244803011, "train/perplexity": 8.547815511278094, "train/grad_norm": 0.16796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1705088.787587302, "perf/iters_per_sec": 0.8130496919571409, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2299371242523194, "data/tokens_consumed": 138896474112, "data/tokens_consumed_B": 138.896474112, "train/loss_slope": -1.6176913330371424e-06} {"step": 66240, "timestamp": 1778266136.4296381, "train/loss": 2.1436243772506716, "train/z_loss": 0.001381256792228669, "train/perplexity": 8.530298689552014, "train/grad_norm": 0.1591796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028558.7008099896, "perf/iters_per_sec": 0.9672921661424587, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0338138103485108, "data/tokens_consumed": 138917445632, "data/tokens_consumed_B": 138.917445632, "train/loss_slope": -2.2134310770707305e-06} {"step": 66250, "timestamp": 1778266146.767012, "grad/layer_0/attn": 0.002911284565925598, "grad/layer_0/mlp": 0.00316020380705595, "grad/layer_0/attn_mlp_ratio": 0.9212331392368142, "grad/layer_4/attn": 0.002315974561497569, "grad/layer_4/mlp": 0.002598122926428914, "grad/layer_4/attn_mlp_ratio": 0.8914029620378854, "grad/layer_8/attn": 0.005234511103481054, "grad/layer_8/mlp": 0.003831026377156377, "grad/layer_8/attn_mlp_ratio": 1.366346887627472, "grad/layer_12/attn": 0.006092363502830267, "grad/layer_12/mlp": 0.007643752731382847, "grad/layer_12/attn_mlp_ratio": 0.7970382660487055, "grad/layer_16/attn": 0.003272805828601122, "grad/layer_16/mlp": 0.004646373447030783, "grad/layer_16/attn_mlp_ratio": 0.7043785428514621, "grad/layer_20/attn": 0.003753197845071554, "grad/layer_20/mlp": 0.006041715387254953, "grad/layer_20/attn_mlp_ratio": 0.6212139338552031, "grad/layer_24/attn": 0.012964006513357162, "grad/layer_24/mlp": 0.011032192967832088, "grad/layer_24/attn_mlp_ratio": 1.1751069287536218, "grad/layer_27/attn": 0.00758745102211833, "grad/layer_27/mlp": 0.010925943031907082, "grad/layer_27/attn_mlp_ratio": 0.6944435762218698} {"step": 66250, "timestamp": 1778266146.7911835, "train/loss": 2.1224133014678954, "train/z_loss": 0.0013776825624518096, "train/perplexity": 8.351267313915844, "train/grad_norm": 0.201171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024977.59557449, "perf/iters_per_sec": 0.9655845620987368, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0356420755386353, "data/tokens_consumed": 138938417152, "data/tokens_consumed_B": 138.938417152, "train/loss_slope": -2.7419684135695838e-06} {"step": 66260, "timestamp": 1778266157.1342194, "train/loss": 2.101680064201355, "train/z_loss": 0.0013876767829060555, "train/perplexity": 8.17990113373667, "train/grad_norm": 0.08984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028624.6197857766, "perf/iters_per_sec": 0.9673235987595447, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0337802171707153, "data/tokens_consumed": 138959388672, "data/tokens_consumed_B": 138.959388672, "train/loss_slope": -4.616029563695534e-06} {"step": 66270, "timestamp": 1778266167.4776883, "train/loss": 2.119555878639221, "train/z_loss": 0.0014044276089407504, "train/perplexity": 8.32743827308074, "train/grad_norm": 0.1669921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028756.7513416095, "perf/iters_per_sec": 0.967386603995137, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033712887763977, "data/tokens_consumed": 138980360192, "data/tokens_consumed_B": 138.980360192, "train/loss_slope": -5.770923323792984e-06} {"step": 66280, "timestamp": 1778266177.819749, "train/loss": 2.096117901802063, "train/z_loss": 0.00138478628359735, "train/perplexity": 8.134529494408074, "train/grad_norm": 0.26171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028906.9174771367, "perf/iters_per_sec": 0.9674582087884601, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0336363792419434, "data/tokens_consumed": 139001331712, "data/tokens_consumed_B": 139.001331712, "train/loss_slope": -1.0081100971749588e-05} {"step": 66290, "timestamp": 1778266188.166719, "train/loss": 2.21920166015625, "train/z_loss": 0.0013707555015571415, "train/perplexity": 9.199983220146853, "train/grad_norm": 0.2578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028121.1429895826, "perf/iters_per_sec": 0.9670835223148263, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340368509292603, "data/tokens_consumed": 139022303232, "data/tokens_consumed_B": 139.022303232, "train/loss_slope": -3.0702922353507254e-06} {"step": 66300, "timestamp": 1778266198.4984567, "grad/layer_0/attn": 0.002980602905154228, "grad/layer_0/mlp": 0.00323107885196805, "grad/layer_0/attn_mlp_ratio": 0.9224791314179256, "grad/layer_4/attn": 0.0030012265779078007, "grad/layer_4/mlp": 0.0026062424294650555, "grad/layer_4/attn_mlp_ratio": 1.1515530669065621, "grad/layer_8/attn": 0.005378596018999815, "grad/layer_8/mlp": 0.0036994994152337313, "grad/layer_8/attn_mlp_ratio": 1.4538712593019492, "grad/layer_12/attn": 0.00499484920874238, "grad/layer_12/mlp": 0.00711409468203783, "grad/layer_12/attn_mlp_ratio": 0.7021060812056833, "grad/layer_16/attn": 0.005848964210599661, "grad/layer_16/mlp": 0.005011689383536577, "grad/layer_16/attn_mlp_ratio": 1.1670643661810125, "grad/layer_20/attn": 0.003583161858841777, "grad/layer_20/mlp": 0.006334791891276836, "grad/layer_20/attn_mlp_ratio": 0.5656321255340158, "grad/layer_24/attn": 0.007977530360221863, "grad/layer_24/mlp": 0.007982810959219933, "grad/layer_24/attn_mlp_ratio": 0.999338491295999, "grad/layer_27/attn": 0.007426201365888119, "grad/layer_27/mlp": 0.006674733012914658, "grad/layer_27/attn_mlp_ratio": 1.1125840150101998} {"step": 66300, "timestamp": 1778266199.083558, "eos/sharpness": 39.314651489257805, "eos/L0_probe": 1.9727931022644043, "eos/L_plus": 2.1432037353515625, "eos/L_minus": 2.195528984069824, "eos/grad_norm": 0.11130429804325104, "eos/embed_grad_frac": 0.17822159826755524, "eos/time_s": 0.5822398662567139} {"step": 66300, "timestamp": 1778266199.1015003, "train/loss": 2.1429814219474794, "train/z_loss": 0.0013693504384718834, "train/perplexity": 8.52481585157051, "train/grad_norm": 0.111328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1919135.6070329074, "perf/iters_per_sec": 0.9151151690640008, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0927586317062379, "data/tokens_consumed": 139043274752, "data/tokens_consumed_B": 139.043274752, "train/loss_slope": -6.394850109706037e-07} {"step": 66300, "timestamp": 1778266200.4641736, "geo/rankme_last": 439.233642578125, "geo/layer_0/stable_rank_q_proj": 19.386493682861328, "geo/layer_0/stable_rank_k_proj": 16.04285430908203, "geo/layer_0/stable_rank_o_proj": 47.183616638183594, "geo/layer_0/stable_rank_gate_proj": 130.08908081054688, "geo/layer_0/stable_rank_down_proj": 55.376129150390625, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06123707443475723, "geo/layer_0/attn_entropy_mean": 6.16145658493042, "geo/layer_0/attn_entropy_std": 0.4194079339504242, "geo/layer_7/stable_rank_q_proj": 43.27394485473633, "geo/layer_7/stable_rank_k_proj": 40.79438781738281, "geo/layer_7/stable_rank_o_proj": 89.69581604003906, "geo/layer_7/stable_rank_gate_proj": 80.49590301513672, "geo/layer_7/stable_rank_down_proj": 139.82386779785156, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4446554183959961, "geo/layer_7/attn_entropy_mean": 4.59230899810791, "geo/layer_7/attn_entropy_std": 0.7980741858482361, "geo/layer_14/stable_rank_q_proj": 51.16496658325195, "geo/layer_14/stable_rank_k_proj": 40.27060317993164, "geo/layer_14/stable_rank_o_proj": 43.48894119262695, "geo/layer_14/stable_rank_gate_proj": 71.44332122802734, "geo/layer_14/stable_rank_down_proj": 128.78582763671875, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37856242060661316, "geo/layer_14/attn_entropy_mean": 5.497445583343506, "geo/layer_14/attn_entropy_std": 0.41213729977607727, "geo/layer_21/stable_rank_q_proj": 40.28756332397461, "geo/layer_21/stable_rank_k_proj": 30.072233200073242, "geo/layer_21/stable_rank_o_proj": 69.77059173583984, "geo/layer_21/stable_rank_gate_proj": 65.48062133789062, "geo/layer_21/stable_rank_down_proj": 50.78947830200195, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1460033506155014, "geo/layer_21/attn_entropy_mean": 5.6834821701049805, "geo/layer_21/attn_entropy_std": 0.30457183718681335, "geo/layer_27/stable_rank_q_proj": 43.297691345214844, "geo/layer_27/stable_rank_k_proj": 31.79380226135254, "geo/layer_27/stable_rank_o_proj": 114.58811950683594, "geo/layer_27/stable_rank_gate_proj": 79.98533630371094, "geo/layer_27/stable_rank_down_proj": 128.15057373046875, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10190641134977341, "geo/layer_27/attn_entropy_mean": 4.188595294952393, "geo/layer_27/attn_entropy_std": 0.7505791783332825, "attnres/final_alpha/block_0": 0.23828300833702087, "attnres/block_norm/0": 1.7675561904907227, "attnres/final_alpha/block_1": 0.0043150740675628185, "attnres/block_norm/1": 47258.984375, "attnres/final_alpha/block_2": 0.010365037247538567, "attnres/block_norm/2": 28670.65234375, "attnres/final_alpha/block_3": 0.012306643649935722, "attnres/block_norm/3": 59186.7265625, "attnres/final_alpha/block_4": 0.014928224496543407, "attnres/block_norm/4": 15294.955078125, "attnres/final_alpha/block_5": 0.6121395826339722, "attnres/block_norm/5": 6729.8037109375, "attnres/final_alpha/block_6": 0.10766241699457169, "attnres/block_norm/6": 39406.1171875, "geo/tier1_time_s": 1.359229326248169, "geo/step": 66300.0, "geo/rankme_slope": -6.0646817320678275e-05} {"step": 66310, "timestamp": 1778266210.808356, "train/loss": 2.0445249319076537, "train/z_loss": 0.001392138993833214, "train/perplexity": 7.725487533365439, "train/grad_norm": 0.201171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1791930.9125059056, "perf/iters_per_sec": 0.8544592440156487, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1703308343887329, "data/tokens_consumed": 139064246272, "data/tokens_consumed_B": 139.064246272, "train/loss_slope": -6.192573569681518e-06} {"step": 66320, "timestamp": 1778266221.1512802, "train/loss": 2.1693092584609985, "train/z_loss": 0.001382009289227426, "train/perplexity": 8.752236419055732, "train/grad_norm": 0.1064453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028943.2339721408, "perf/iters_per_sec": 0.9674755258427338, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033617877960205, "data/tokens_consumed": 139085217792, "data/tokens_consumed_B": 139.085217792, "train/loss_slope": -5.386606008127394e-06} {"step": 66330, "timestamp": 1778266231.498215, "train/loss": 2.193008041381836, "train/z_loss": 0.0013655831688083708, "train/perplexity": 8.962131070368734, "train/grad_norm": 0.224609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028100.8950175603, "perf/iters_per_sec": 0.9670738673293878, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340471744537354, "data/tokens_consumed": 139106189312, "data/tokens_consumed_B": 139.106189312, "train/loss_slope": -1.9223014334820018e-06} {"step": 66340, "timestamp": 1778266241.8443553, "train/loss": 2.1443079471588136, "train/z_loss": 0.0013595874654129148, "train/perplexity": 8.536131738464876, "train/grad_norm": 0.2451171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028710.8027933824, "perf/iters_per_sec": 0.9673646940199768, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0337363004684448, "data/tokens_consumed": 139127160832, "data/tokens_consumed_B": 139.127160832, "train/loss_slope": -9.560662277318028e-07} {"step": 66350, "timestamp": 1778266252.1734092, "grad/layer_0/attn": 0.002734989393502474, "grad/layer_0/mlp": 0.003184282686561346, "grad/layer_0/attn_mlp_ratio": 0.8589027975294692, "grad/layer_4/attn": 0.002107087057083845, "grad/layer_4/mlp": 0.002467275597155094, "grad/layer_4/attn_mlp_ratio": 0.8540136230067169, "grad/layer_8/attn": 0.0036655468866229057, "grad/layer_8/mlp": 0.0036526373587548733, "grad/layer_8/attn_mlp_ratio": 1.0035342757155081, "grad/layer_12/attn": 0.004325345624238253, "grad/layer_12/mlp": 0.0064356061629951, "grad/layer_12/attn_mlp_ratio": 0.6720960617353334, "grad/layer_16/attn": 0.0035220927093178034, "grad/layer_16/mlp": 0.004409969784319401, "grad/layer_16/attn_mlp_ratio": 0.7986659323550858, "grad/layer_20/attn": 0.002739452291280031, "grad/layer_20/mlp": 0.005477346479892731, "grad/layer_20/attn_mlp_ratio": 0.5001422223922302, "grad/layer_24/attn": 0.008297000080347061, "grad/layer_24/mlp": 0.008572131395339966, "grad/layer_24/attn_mlp_ratio": 0.9679039670422143, "grad/layer_27/attn": 0.005465812981128693, "grad/layer_27/mlp": 0.007018443662673235, "grad/layer_27/attn_mlp_ratio": 0.7787784822324821} {"step": 66350, "timestamp": 1778266252.1873446, "train/loss": 2.1688230514526365, "train/z_loss": 0.0013674382586032151, "train/perplexity": 8.747982054704638, "train/grad_norm": 0.12158203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028560.1042921627, "perf/iters_per_sec": 0.9672928353749097, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0338130950927735, "data/tokens_consumed": 139148132352, "data/tokens_consumed_B": 139.148132352, "train/loss_slope": 2.008289722862646e-06} {"step": 66360, "timestamp": 1778266262.5408819, "train/loss": 2.083267843723297, "train/z_loss": 0.0013883083127439022, "train/perplexity": 8.030669053460917, "train/grad_norm": 0.09619140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027030.1992291973, "perf/iters_per_sec": 0.9665633197923648, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034593367576599, "data/tokens_consumed": 139169103872, "data/tokens_consumed_B": 139.169103872, "train/loss_slope": 3.2169788834427976e-07} {"step": 66370, "timestamp": 1778266272.888119, "train/loss": 2.176718306541443, "train/z_loss": 0.0013739028829149902, "train/perplexity": 8.817322976474436, "train/grad_norm": 0.1064453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028747.4865991874, "perf/iters_per_sec": 0.9673821862216889, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0337176084518434, "data/tokens_consumed": 139190075392, "data/tokens_consumed_B": 139.190075392, "train/loss_slope": -4.965563704579023e-07} {"step": 66375, "timestamp": 1778266278.6363504, "eos/sharpness": 19.332456588745114, "eos/L0_probe": 1.9725526571273804, "eos/L_plus": 2.0740981101989746, "eos/L_minus": 2.0643317699432373, "eos/grad_norm": 0.0960797369480133, "eos/embed_grad_frac": 0.24116401374340057, "eos/time_s": 0.584815263748169} {"step": 66375, "timestamp": 1778266280.0145876, "geo/rankme_last": 439.16748046875, "geo/layer_0/stable_rank_q_proj": 19.37972640991211, "geo/layer_0/stable_rank_k_proj": 16.063114166259766, "geo/layer_0/stable_rank_o_proj": 47.1966438293457, "geo/layer_0/stable_rank_gate_proj": 129.95594787597656, "geo/layer_0/stable_rank_down_proj": 55.322147369384766, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06257878243923187, "geo/layer_0/attn_entropy_mean": 6.155613899230957, "geo/layer_0/attn_entropy_std": 0.417063444852829, "geo/layer_7/stable_rank_q_proj": 43.249691009521484, "geo/layer_7/stable_rank_k_proj": 40.8078727722168, "geo/layer_7/stable_rank_o_proj": 89.88455200195312, "geo/layer_7/stable_rank_gate_proj": 80.45527648925781, "geo/layer_7/stable_rank_down_proj": 139.853271484375, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4381150007247925, "geo/layer_7/attn_entropy_mean": 4.640563011169434, "geo/layer_7/attn_entropy_std": 0.7900906801223755, "geo/layer_14/stable_rank_q_proj": 51.08173370361328, "geo/layer_14/stable_rank_k_proj": 40.210166931152344, "geo/layer_14/stable_rank_o_proj": 43.47699737548828, "geo/layer_14/stable_rank_gate_proj": 71.51792907714844, "geo/layer_14/stable_rank_down_proj": 128.56781005859375, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4021567106246948, "geo/layer_14/attn_entropy_mean": 5.5652756690979, "geo/layer_14/attn_entropy_std": 0.4246614873409271, "geo/layer_21/stable_rank_q_proj": 40.30103302001953, "geo/layer_21/stable_rank_k_proj": 30.0256404876709, "geo/layer_21/stable_rank_o_proj": 69.76464080810547, "geo/layer_21/stable_rank_gate_proj": 65.45793914794922, "geo/layer_21/stable_rank_down_proj": 50.770103454589844, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14247149229049683, "geo/layer_21/attn_entropy_mean": 5.682062149047852, "geo/layer_21/attn_entropy_std": 0.2871137261390686, "geo/layer_27/stable_rank_q_proj": 43.385955810546875, "geo/layer_27/stable_rank_k_proj": 31.690059661865234, "geo/layer_27/stable_rank_o_proj": 114.61795806884766, "geo/layer_27/stable_rank_gate_proj": 80.01934814453125, "geo/layer_27/stable_rank_down_proj": 128.0360565185547, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0983908474445343, "geo/layer_27/attn_entropy_mean": 4.17982292175293, "geo/layer_27/attn_entropy_std": 0.7563146948814392, "attnres/final_alpha/block_0": 0.23543408513069153, "attnres/block_norm/0": 1.7676397562026978, "attnres/final_alpha/block_1": 0.004261030349880457, "attnres/block_norm/1": 47392.390625, "attnres/final_alpha/block_2": 0.010178444907069206, "attnres/block_norm/2": 28781.88671875, "attnres/final_alpha/block_3": 0.012078981846570969, "attnres/block_norm/3": 59258.328125, "attnres/final_alpha/block_4": 0.014256780967116356, "attnres/block_norm/4": 15352.2490234375, "attnres/final_alpha/block_5": 0.6169013977050781, "attnres/block_norm/5": 6675.107421875, "attnres/final_alpha/block_6": 0.10688921809196472, "attnres/block_norm/6": 39622.37890625, "geo/tier1_time_s": 1.3601694107055664, "geo/step": 66375.0, "geo/rankme_slope": -4.271939244447779e-05} {"step": 66380, "timestamp": 1778266285.188138, "train/loss": 2.1597352027893066, "train/z_loss": 0.0013784747454337776, "train/perplexity": 8.668841869370643, "train/grad_norm": 0.1474609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1706017.7709484345, "perf/iters_per_sec": 0.8134926657430813, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.229267382621765, "data/tokens_consumed": 139211046912, "data/tokens_consumed_B": 139.211046912, "train/loss_slope": 2.915062848562393e-06} {"step": 66390, "timestamp": 1778266295.532615, "train/loss": 2.1824581623077393, "train/z_loss": 0.0013624837272800505, "train/perplexity": 8.868078664418881, "train/grad_norm": 0.1103515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028165.1942467585, "perf/iters_per_sec": 0.9671045275911133, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034014391899109, "data/tokens_consumed": 139232018432, "data/tokens_consumed_B": 139.232018432, "train/loss_slope": 6.301147258929078e-06} {"step": 66400, "timestamp": 1778266305.8684413, "grad/layer_0/attn": 0.002990598091855645, "grad/layer_0/mlp": 0.0031294580549001694, "grad/layer_0/attn_mlp_ratio": 0.955628081229623, "grad/layer_4/attn": 0.002091765869408846, "grad/layer_4/mlp": 0.002493595937266946, "grad/layer_4/attn_mlp_ratio": 0.8388551466024474, "grad/layer_8/attn": 0.0064836484380066395, "grad/layer_8/mlp": 0.003833496244624257, "grad/layer_8/attn_mlp_ratio": 1.6913146264241812, "grad/layer_12/attn": 0.006271676626056433, "grad/layer_12/mlp": 0.006626749411225319, "grad/layer_12/attn_mlp_ratio": 0.9464182425233648, "grad/layer_16/attn": 0.003958078101277351, "grad/layer_16/mlp": 0.004904290195554495, "grad/layer_16/attn_mlp_ratio": 0.8070643992801896, "grad/layer_20/attn": 0.0036368747241795063, "grad/layer_20/mlp": 0.0066117155365645885, "grad/layer_20/attn_mlp_ratio": 0.55006520608155, "grad/layer_24/attn": 0.016030332073569298, "grad/layer_24/mlp": 0.010227944701910019, "grad/layer_24/attn_mlp_ratio": 1.5673072532201886, "grad/layer_27/attn": 0.009362723678350449, "grad/layer_27/mlp": 0.010662565007805824, "grad/layer_27/attn_mlp_ratio": 0.8780929901657726} {"step": 66400, "timestamp": 1778266305.8825321, "train/loss": 2.153837037086487, "train/z_loss": 0.0013765912735834718, "train/perplexity": 8.61786209494389, "train/grad_norm": 0.197265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027536.2189970436, "perf/iters_per_sec": 0.9668046088204592, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034335160255432, "data/tokens_consumed": 139252989952, "data/tokens_consumed_B": 139.252989952, "train/loss_slope": 7.721678816517534e-06} {"step": 66410, "timestamp": 1778266316.221993, "train/loss": 2.168325686454773, "train/z_loss": 0.001358783314935863, "train/perplexity": 8.743632196451978, "train/grad_norm": 0.22265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029747.959060629, "perf/iters_per_sec": 0.9678592486670632, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0332080841064453, "data/tokens_consumed": 139273961472, "data/tokens_consumed_B": 139.273961472, "train/loss_slope": 9.061057067582683e-06} {"step": 66420, "timestamp": 1778266326.5622845, "train/loss": 2.141229498386383, "train/z_loss": 0.0013630090979859233, "train/perplexity": 8.50989410052628, "train/grad_norm": 0.11181640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028958.9590434192, "perf/iters_per_sec": 0.9674830241410347, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0336098670959473, "data/tokens_consumed": 139294932992, "data/tokens_consumed_B": 139.294932992, "train/loss_slope": 1.0984561738759017e-05} {"step": 66430, "timestamp": 1778266336.9009025, "train/loss": 2.2045437335968017, "train/z_loss": 0.0013425651472061873, "train/perplexity": 9.066114060771957, "train/grad_norm": 0.099609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029769.5045744097, "perf/iters_per_sec": 0.9678695223686264, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0331971168518066, "data/tokens_consumed": 139315904512, "data/tokens_consumed_B": 139.315904512, "train/loss_slope": 1.4258551447376486e-05} {"step": 66440, "timestamp": 1778266347.2398322, "train/loss": 2.1595284223556517, "train/z_loss": 0.0013681450858712197, "train/perplexity": 8.667049507808647, "train/grad_norm": 0.09228515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029673.2558759816, "perf/iters_per_sec": 0.9678236274127873, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033246111869812, "data/tokens_consumed": 139336876032, "data/tokens_consumed_B": 139.336876032, "train/loss_slope": 1.550034855780498e-05} {"step": 66450, "timestamp": 1778266357.5719059, "grad/layer_0/attn": 0.00283594592474401, "grad/layer_0/mlp": 0.0030203319620341063, "grad/layer_0/attn_mlp_ratio": 0.9389517001763318, "grad/layer_4/attn": 0.002295007463544607, "grad/layer_4/mlp": 0.002604146022349596, "grad/layer_4/attn_mlp_ratio": 0.8812898185121548, "grad/layer_8/attn": 0.007344040088355541, "grad/layer_8/mlp": 0.003775102086365223, "grad/layer_8/attn_mlp_ratio": 1.94538842283011, "grad/layer_12/attn": 0.0037247829604893923, "grad/layer_12/mlp": 0.00640465784817934, "grad/layer_12/attn_mlp_ratio": 0.5815740654109781, "grad/layer_16/attn": 0.0032629044726490974, "grad/layer_16/mlp": 0.004422393161803484, "grad/layer_16/attn_mlp_ratio": 0.7378141832909861, "grad/layer_20/attn": 0.0030018924735486507, "grad/layer_20/mlp": 0.005916583351790905, "grad/layer_20/attn_mlp_ratio": 0.5073692440930586, "grad/layer_24/attn": 0.015785207971930504, "grad/layer_24/mlp": 0.01275539305061102, "grad/layer_24/attn_mlp_ratio": 1.2375320607953466, "grad/layer_27/attn": 0.013126584701240063, "grad/layer_27/mlp": 0.012474588118493557, "grad/layer_27/attn_mlp_ratio": 1.052265972337261} {"step": 66450, "timestamp": 1778266358.1599314, "eos/sharpness": 78.13644409179686, "eos/L0_probe": 1.9688953161239624, "eos/L_plus": 2.326824903488159, "eos/L_minus": 2.3923301696777344, "eos/grad_norm": 0.21699905395507812, "eos/embed_grad_frac": 0.05177726969122887, "eos/time_s": 0.5851168632507324} {"step": 66450, "timestamp": 1778266358.1789618, "train/loss": 2.156829285621643, "train/z_loss": 0.0013782350230030715, "train/perplexity": 8.64368749891818, "train/grad_norm": 0.216796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1917993.2713094244, "perf/iters_per_sec": 0.9145704609439013, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0934094667434693, "data/tokens_consumed": 139357847552, "data/tokens_consumed_B": 139.357847552, "train/loss_slope": 1.534771186755362e-05} {"step": 66450, "timestamp": 1778266359.541653, "geo/rankme_last": 439.65484619140625, "geo/layer_0/stable_rank_q_proj": 19.349945068359375, "geo/layer_0/stable_rank_k_proj": 16.04561996459961, "geo/layer_0/stable_rank_o_proj": 47.163429260253906, "geo/layer_0/stable_rank_gate_proj": 130.22665405273438, "geo/layer_0/stable_rank_down_proj": 55.41912078857422, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06619882583618164, "geo/layer_0/attn_entropy_mean": 6.155937194824219, "geo/layer_0/attn_entropy_std": 0.4247744381427765, "geo/layer_7/stable_rank_q_proj": 43.30601119995117, "geo/layer_7/stable_rank_k_proj": 40.67113494873047, "geo/layer_7/stable_rank_o_proj": 89.9078369140625, "geo/layer_7/stable_rank_gate_proj": 80.30306243896484, "geo/layer_7/stable_rank_down_proj": 139.81040954589844, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4597473740577698, "geo/layer_7/attn_entropy_mean": 4.630782127380371, "geo/layer_7/attn_entropy_std": 0.808570921421051, "geo/layer_14/stable_rank_q_proj": 50.936767578125, "geo/layer_14/stable_rank_k_proj": 40.218589782714844, "geo/layer_14/stable_rank_o_proj": 43.431907653808594, "geo/layer_14/stable_rank_gate_proj": 71.38655090332031, "geo/layer_14/stable_rank_down_proj": 128.57266235351562, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3667888641357422, "geo/layer_14/attn_entropy_mean": 5.547979354858398, "geo/layer_14/attn_entropy_std": 0.4214898347854614, "geo/layer_21/stable_rank_q_proj": 40.25217819213867, "geo/layer_21/stable_rank_k_proj": 30.01656723022461, "geo/layer_21/stable_rank_o_proj": 69.9101333618164, "geo/layer_21/stable_rank_gate_proj": 65.37237548828125, "geo/layer_21/stable_rank_down_proj": 50.83567810058594, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14040018618106842, "geo/layer_21/attn_entropy_mean": 5.6799163818359375, "geo/layer_21/attn_entropy_std": 0.306306391954422, "geo/layer_27/stable_rank_q_proj": 43.38332748413086, "geo/layer_27/stable_rank_k_proj": 31.752578735351562, "geo/layer_27/stable_rank_o_proj": 114.71782684326172, "geo/layer_27/stable_rank_gate_proj": 80.0112075805664, "geo/layer_27/stable_rank_down_proj": 128.08184814453125, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08831851184368134, "geo/layer_27/attn_entropy_mean": 4.16261100769043, "geo/layer_27/attn_entropy_std": 0.7517117857933044, "attnres/final_alpha/block_0": 0.2368909865617752, "attnres/block_norm/0": 1.7677078247070312, "attnres/final_alpha/block_1": 0.00433112820610404, "attnres/block_norm/1": 47144.0390625, "attnres/final_alpha/block_2": 0.010224075987935066, "attnres/block_norm/2": 28769.359375, "attnres/final_alpha/block_3": 0.012082414701581001, "attnres/block_norm/3": 59098.078125, "attnres/final_alpha/block_4": 0.01458231545984745, "attnres/block_norm/4": 15356.865234375, "attnres/final_alpha/block_5": 0.613695502281189, "attnres/block_norm/5": 6744.66015625, "attnres/final_alpha/block_6": 0.10819362103939056, "attnres/block_norm/6": 39316.09765625, "geo/tier1_time_s": 1.3584043979644775, "geo/step": 66450.0, "geo/rankme_slope": -1.2278856855242096e-05} {"step": 66460, "timestamp": 1778266369.883054, "train/loss": 2.187804090976715, "train/z_loss": 0.0013626570696942508, "train/perplexity": 8.915613726707392, "train/grad_norm": 0.1591796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1792441.0694785279, "perf/iters_per_sec": 0.8547025058167114, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1699977397918702, "data/tokens_consumed": 139378819072, "data/tokens_consumed_B": 139.378819072, "train/loss_slope": 1.7797758577108134e-05} {"step": 66470, "timestamp": 1778266380.2277422, "train/loss": 2.128054177761078, "train/z_loss": 0.0013627965934574604, "train/perplexity": 8.398508896419504, "train/grad_norm": 0.177734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028618.678013281, "perf/iters_per_sec": 0.9673207655016332, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03378324508667, "data/tokens_consumed": 139399790592, "data/tokens_consumed_B": 139.399790592, "train/loss_slope": 1.733521628301132e-05} {"step": 66480, "timestamp": 1778266390.5668023, "train/loss": 2.148711395263672, "train/z_loss": 0.0013680997304618358, "train/perplexity": 8.573803032514007, "train/grad_norm": 0.11474609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029282.1726027427, "perf/iters_per_sec": 0.9676371443761552, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0334452390670776, "data/tokens_consumed": 139420762112, "data/tokens_consumed_B": 139.420762112, "train/loss_slope": 1.8507093443299696e-05} {"step": 66490, "timestamp": 1778266400.9069226, "train/loss": 2.12962429523468, "train/z_loss": 0.0013653390691615642, "train/perplexity": 8.411705899701522, "train/grad_norm": 0.318359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029468.0494731825, "perf/iters_per_sec": 0.9677257773748315, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0333505868911743, "data/tokens_consumed": 139441733632, "data/tokens_consumed_B": 139.441733632, "train/loss_slope": 1.651569507946809e-05} {"step": 66500, "timestamp": 1778266411.2367048, "grad/layer_0/attn": 0.002889328869059682, "grad/layer_0/mlp": 0.0031062744092196226, "grad/layer_0/attn_mlp_ratio": 0.9301588962868439, "grad/layer_4/attn": 0.0023796663153916597, "grad/layer_4/mlp": 0.0025642181281000376, "grad/layer_4/attn_mlp_ratio": 0.9280280006257026, "grad/layer_8/attn": 0.005037881899625063, "grad/layer_8/mlp": 0.0037755249068140984, "grad/layer_8/attn_mlp_ratio": 1.334352676921131, "grad/layer_12/attn": 0.005029721185564995, "grad/layer_12/mlp": 0.007586627267301083, "grad/layer_12/attn_mlp_ratio": 0.6629719560556593, "grad/layer_16/attn": 0.003584605176001787, "grad/layer_16/mlp": 0.004545688629150391, "grad/layer_16/attn_mlp_ratio": 0.7885725111388712, "grad/layer_20/attn": 0.005412176251411438, "grad/layer_20/mlp": 0.005857188254594803, "grad/layer_20/attn_mlp_ratio": 0.9240229140259311, "grad/layer_24/attn": 0.010833934880793095, "grad/layer_24/mlp": 0.009687910787761211, "grad/layer_24/attn_mlp_ratio": 1.1182942335359074, "grad/layer_27/attn": 0.0037326097954064608, "grad/layer_27/mlp": 0.009206929244101048, "grad/layer_27/attn_mlp_ratio": 0.4054131030991319} {"step": 66500, "timestamp": 1778266411.2509212, "train/loss": 2.1363346457481383, "train/z_loss": 0.0013667496852576733, "train/perplexity": 8.468341203556621, "train/grad_norm": 0.16796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028737.800822585, "perf/iters_per_sec": 0.9673775676834988, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0337225437164306, "data/tokens_consumed": 139462705152, "data/tokens_consumed_B": 139.462705152, "train/loss_slope": 1.4400365185959319e-05} {"step": 66500, "timestamp": 1778266418.5085146, "geo/ww_alpha_mean": 7.281381821290348, "geo/ww_alpha_std": 4.117842337156967, "geo/ww_alpha_min": 1.347366230043796, "geo/ww_alpha_max": 28.033196490271205, "geo/ww_alpha_healthy_frac": 0.19289340101522842, "geo/ww_alpha_by_type/q_proj": 3.8888141842840116, "geo/ww_alpha_by_type/k_proj": 4.529406715682411, "geo/ww_alpha_by_type/v_proj": 7.99465864971, "geo/ww_alpha_by_type/o_proj": 8.238537872126251, "geo/ww_alpha_by_type/gate_proj": 7.651445710402028, "geo/ww_alpha_by_type/up_proj": 10.864271051685973, "geo/ww_alpha_by_type/down_proj": 7.879704487529416, "geo/twonn_id/layer_0": 0.7003923058509827, "geo/twonn_id/layer_7": 3.3630614280700684, "geo/twonn_id/layer_14": 4.424942493438721, "geo/twonn_id/layer_21": 6.315548419952393, "geo/twonn_id/layer_27": 5.6708245277404785, "geo/tier2_time_s": 7.251694679260254} {"step": 66500, "timestamp": 1778266419.2120154, "eoc/jacobian_sigma/layer_0/attn": 1199.97021484375, "eoc/jacobian_sigma/layer_0/mlp": 7480.25537109375, "eoc/jacobian_sigma/layer_0": 7480.25537109375, "eoc/jacobian_sigma/layer_7/attn": 1.1528548002243042, "eoc/jacobian_sigma/layer_7/mlp": 1.8492047786712646, "eoc/jacobian_sigma/layer_7": 1.8492047786712646, "eoc/jacobian_sigma/layer_14/attn": 1.4488221406936646, "eoc/jacobian_sigma/layer_14/mlp": 6.053675174713135, "eoc/jacobian_sigma/layer_14": 6.053675174713135, "eoc/jacobian_sigma/layer_21/attn": 1.0802117586135864, "eoc/jacobian_sigma/layer_21/mlp": 4.20730447769165, "eoc/jacobian_sigma/layer_21": 4.20730447769165, "eoc/jacobian_sigma/layer_27/attn": 3.016977548599243, "eoc/jacobian_sigma/layer_27/mlp": 25.141254425048828, "eoc/jacobian_sigma/layer_27": 25.141254425048828, "eoc/layer0_sigma": 7480.25537109375, "eoc/sigma_max": 25.141254425048828, "eoc/sigma_min": 1.8492047786712646, "eoc/sigma_mean": 9.31285971403122, "eoc/time_s": 0.6971490383148193} {"step": 66510, "timestamp": 1778266429.5786216, "train/loss": 2.1472981214523315, "train/z_loss": 0.0013773442944511772, "train/perplexity": 8.56169445960447, "train/grad_norm": 0.19140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1144551.220527223, "perf/iters_per_sec": 0.5457645514141193, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.8322919607162476, "data/tokens_consumed": 139483676672, "data/tokens_consumed_B": 139.483676672, "train/loss_slope": 1.413964768125596e-05} {"step": 66520, "timestamp": 1778266439.923742, "train/loss": 2.1193721532821654, "train/z_loss": 0.0013871369068510831, "train/perplexity": 8.32590845204842, "train/grad_norm": 0.197265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028526.3276935702, "perf/iters_per_sec": 0.9672767294376231, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0338303089141845, "data/tokens_consumed": 139504648192, "data/tokens_consumed_B": 139.504648192, "train/loss_slope": 1.1830953274122227e-05} {"step": 66525, "timestamp": 1778266445.6816642, "eos/sharpness": 63.89806270599364, "eos/L0_probe": 1.9728641510009766, "eos/L_plus": 2.329869508743286, "eos/L_minus": 2.2548394203186035, "eos/grad_norm": 0.14991495013237, "eos/embed_grad_frac": 0.09581364691257477, "eos/time_s": 0.5983684062957764} {"step": 66525, "timestamp": 1778266447.059055, "geo/rankme_last": 438.1128845214844, "geo/layer_0/stable_rank_q_proj": 19.362770080566406, "geo/layer_0/stable_rank_k_proj": 16.05804443359375, "geo/layer_0/stable_rank_o_proj": 47.08473587036133, "geo/layer_0/stable_rank_gate_proj": 130.25823974609375, "geo/layer_0/stable_rank_down_proj": 55.42874526977539, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06481768190860748, "geo/layer_0/attn_entropy_mean": 6.154567241668701, "geo/layer_0/attn_entropy_std": 0.4223562180995941, "geo/layer_7/stable_rank_q_proj": 43.284664154052734, "geo/layer_7/stable_rank_k_proj": 40.62008285522461, "geo/layer_7/stable_rank_o_proj": 89.93467712402344, "geo/layer_7/stable_rank_gate_proj": 80.31126403808594, "geo/layer_7/stable_rank_down_proj": 139.8534393310547, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4483168423175812, "geo/layer_7/attn_entropy_mean": 4.634690284729004, "geo/layer_7/attn_entropy_std": 0.8023447394371033, "geo/layer_14/stable_rank_q_proj": 50.94670486450195, "geo/layer_14/stable_rank_k_proj": 40.26517105102539, "geo/layer_14/stable_rank_o_proj": 43.46546936035156, "geo/layer_14/stable_rank_gate_proj": 71.34510040283203, "geo/layer_14/stable_rank_down_proj": 128.5775146484375, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38899433612823486, "geo/layer_14/attn_entropy_mean": 5.546295166015625, "geo/layer_14/attn_entropy_std": 0.4131416380405426, "geo/layer_21/stable_rank_q_proj": 40.18204116821289, "geo/layer_21/stable_rank_k_proj": 29.915218353271484, "geo/layer_21/stable_rank_o_proj": 70.01199340820312, "geo/layer_21/stable_rank_gate_proj": 65.3264389038086, "geo/layer_21/stable_rank_down_proj": 50.76240158081055, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14215147495269775, "geo/layer_21/attn_entropy_mean": 5.703439712524414, "geo/layer_21/attn_entropy_std": 0.30045175552368164, "geo/layer_27/stable_rank_q_proj": 43.46830368041992, "geo/layer_27/stable_rank_k_proj": 31.82455825805664, "geo/layer_27/stable_rank_o_proj": 114.84516143798828, "geo/layer_27/stable_rank_gate_proj": 80.02799987792969, "geo/layer_27/stable_rank_down_proj": 128.04296875, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09595232456922531, "geo/layer_27/attn_entropy_mean": 4.174042701721191, "geo/layer_27/attn_entropy_std": 0.7602764964103699, "attnres/final_alpha/block_0": 0.2349260449409485, "attnres/block_norm/0": 1.7678768634796143, "attnres/final_alpha/block_1": 0.004298066720366478, "attnres/block_norm/1": 47163.23046875, "attnres/final_alpha/block_2": 0.010056761093437672, "attnres/block_norm/2": 28825.375, "attnres/final_alpha/block_3": 0.011883649975061417, "attnres/block_norm/3": 59650.41015625, "attnres/final_alpha/block_4": 0.014513568952679634, "attnres/block_norm/4": 15346.34375, "attnres/final_alpha/block_5": 0.6182968616485596, "attnres/block_norm/5": 6699.27978515625, "attnres/final_alpha/block_6": 0.10602502524852753, "attnres/block_norm/6": 39584.078125, "geo/tier1_time_s": 1.3595283031463623, "geo/step": 66525.0, "geo/rankme_slope": -9.72418264180672e-06} {"step": 66530, "timestamp": 1778266452.2750597, "train/loss": 2.125715267658234, "train/z_loss": 0.001369309553410858, "train/perplexity": 8.378888493236907, "train/grad_norm": 0.10205078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1698641.6460598148, "perf/iters_per_sec": 0.8099754553126406, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.234605312347412, "data/tokens_consumed": 139525619712, "data/tokens_consumed_B": 139.525619712, "train/loss_slope": 1.0277964723314038e-05} {"step": 66540, "timestamp": 1778266462.6129322, "train/loss": 2.184719371795654, "train/z_loss": 0.0013695929199457168, "train/perplexity": 8.888153936678641, "train/grad_norm": 0.1416015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029914.808221465, "perf/iters_per_sec": 0.9679388085467648, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0331231594085692, "data/tokens_consumed": 139546591232, "data/tokens_consumed_B": 139.546591232, "train/loss_slope": 1.190537550840651e-05} {"step": 66550, "timestamp": 1778266472.946571, "grad/layer_0/attn": 0.002443705452606082, "grad/layer_0/mlp": 0.002724284306168556, "grad/layer_0/attn_mlp_ratio": 0.8970082004187362, "grad/layer_4/attn": 0.0019219808746129274, "grad/layer_4/mlp": 0.002528037875890732, "grad/layer_4/attn_mlp_ratio": 0.7602658239086517, "grad/layer_8/attn": 0.006060190033167601, "grad/layer_8/mlp": 0.0033167514484375715, "grad/layer_8/attn_mlp_ratio": 1.8271462135963683, "grad/layer_12/attn": 0.004635735880583525, "grad/layer_12/mlp": 0.006871911697089672, "grad/layer_12/attn_mlp_ratio": 0.6745918774083812, "grad/layer_16/attn": 0.003898420138284564, "grad/layer_16/mlp": 0.004859834909439087, "grad/layer_16/attn_mlp_ratio": 0.8021712940280482, "grad/layer_20/attn": 0.003795861266553402, "grad/layer_20/mlp": 0.006045384798198938, "grad/layer_20/attn_mlp_ratio": 0.6278940597618984, "grad/layer_24/attn": 0.012110109440982342, "grad/layer_24/mlp": 0.009352211840450764, "grad/layer_24/attn_mlp_ratio": 1.2948925364493666, "grad/layer_27/attn": 0.008579459972679615, "grad/layer_27/mlp": 0.008827071636915207, "grad/layer_27/attn_mlp_ratio": 0.9719485950023417} {"step": 66550, "timestamp": 1778266472.960631, "train/loss": 2.1378543853759764, "train/z_loss": 0.0013823416898958384, "train/perplexity": 8.481220661498154, "train/grad_norm": 0.1748046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027925.8814780232, "perf/iters_per_sec": 0.9669904143705479, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034136414527893, "data/tokens_consumed": 139567562752, "data/tokens_consumed_B": 139.567562752, "train/loss_slope": 1.1596871681339218e-05} {"step": 66560, "timestamp": 1778266483.300243, "train/loss": 2.1777435779571532, "train/z_loss": 0.0013686406309716403, "train/perplexity": 8.826367761572802, "train/grad_norm": 0.1298828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029183.3954776865, "perf/iters_per_sec": 0.967590043772548, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033495545387268, "data/tokens_consumed": 139588534272, "data/tokens_consumed_B": 139.588534272, "train/loss_slope": 1.2646634157377004e-05} {"step": 66570, "timestamp": 1778266493.6381533, "train/loss": 2.1762025356292725, "train/z_loss": 0.0013653972768224777, "train/perplexity": 8.812776430348869, "train/grad_norm": 0.08740234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029885.6708775042, "perf/iters_per_sec": 0.967924914778473, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0331379890441894, "data/tokens_consumed": 139609505792, "data/tokens_consumed_B": 139.609505792, "train/loss_slope": 1.2828327655935229e-05} {"step": 66580, "timestamp": 1778266503.9786031, "train/loss": 2.124143862724304, "train/z_loss": 0.0013792819227091968, "train/perplexity": 8.365732206152256, "train/grad_norm": 0.17578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029089.4022902402, "perf/iters_per_sec": 0.9675452243281556, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0335434198379516, "data/tokens_consumed": 139630477312, "data/tokens_consumed_B": 139.630477312, "train/loss_slope": 9.674708470545616e-06} {"step": 66590, "timestamp": 1778266514.3217824, "train/loss": 2.142555856704712, "train/z_loss": 0.0013788739452138543, "train/perplexity": 8.52118875808029, "train/grad_norm": 0.08203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028480.9041523621, "perf/iters_per_sec": 0.967255069805318, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0338534593582154, "data/tokens_consumed": 139651448832, "data/tokens_consumed_B": 139.651448832, "train/loss_slope": 9.867788806106022e-06} {"step": 66600, "timestamp": 1778266524.6574388, "grad/layer_0/attn": 0.00294293905608356, "grad/layer_0/mlp": 0.0031496842857450247, "grad/layer_0/attn_mlp_ratio": 0.934359985210848, "grad/layer_4/attn": 0.002499354537576437, "grad/layer_4/mlp": 0.0026630947832018137, "grad/layer_4/attn_mlp_ratio": 0.9385150162473695, "grad/layer_8/attn": 0.006615620106458664, "grad/layer_8/mlp": 0.0036597438156604767, "grad/layer_8/attn_mlp_ratio": 1.8076729571568229, "grad/layer_12/attn": 0.004351431038230658, "grad/layer_12/mlp": 0.006823363713920116, "grad/layer_12/attn_mlp_ratio": 0.6377251978494022, "grad/layer_16/attn": 0.0034980513155460358, "grad/layer_16/mlp": 0.004800193011760712, "grad/layer_16/attn_mlp_ratio": 0.7287313726974102, "grad/layer_20/attn": 0.003201646963134408, "grad/layer_20/mlp": 0.006751116365194321, "grad/layer_20/attn_mlp_ratio": 0.474239628310464, "grad/layer_24/attn": 0.012058562599122524, "grad/layer_24/mlp": 0.011309999972581863, "grad/layer_24/attn_mlp_ratio": 1.0661858993578042, "grad/layer_27/attn": 0.005759317893534899, "grad/layer_27/mlp": 0.010280683636665344, "grad/layer_27/attn_mlp_ratio": 0.5602076711099178} {"step": 66600, "timestamp": 1778266525.2418463, "eos/sharpness": 65.62376022338866, "eos/L0_probe": 1.9717966318130493, "eos/L_plus": 2.3353166580200195, "eos/L_minus": 2.264514207839966, "eos/grad_norm": 0.19682691991329193, "eos/embed_grad_frac": 0.06548664718866348, "eos/time_s": 0.5816237926483154} {"step": 66600, "timestamp": 1778266525.2595594, "train/loss": 2.1299678802490236, "train/z_loss": 0.0013791469507850706, "train/perplexity": 8.414596532354318, "train/grad_norm": 0.197265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1918585.1090944903, "perf/iters_per_sec": 0.9148526711914493, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0930721759796143, "data/tokens_consumed": 139672420352, "data/tokens_consumed_B": 139.672420352, "train/loss_slope": 1.0319576490901736e-05} {"step": 66600, "timestamp": 1778266526.6198556, "geo/rankme_last": 439.0022888183594, "geo/layer_0/stable_rank_q_proj": 19.34221649169922, "geo/layer_0/stable_rank_k_proj": 16.0791015625, "geo/layer_0/stable_rank_o_proj": 47.032554626464844, "geo/layer_0/stable_rank_gate_proj": 130.3992156982422, "geo/layer_0/stable_rank_down_proj": 55.497798919677734, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06378239393234253, "geo/layer_0/attn_entropy_mean": 6.151343822479248, "geo/layer_0/attn_entropy_std": 0.41851556301116943, "geo/layer_7/stable_rank_q_proj": 43.276519775390625, "geo/layer_7/stable_rank_k_proj": 40.64705276489258, "geo/layer_7/stable_rank_o_proj": 89.87015533447266, "geo/layer_7/stable_rank_gate_proj": 80.24553680419922, "geo/layer_7/stable_rank_down_proj": 139.33815002441406, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.45017069578170776, "geo/layer_7/attn_entropy_mean": 4.653476715087891, "geo/layer_7/attn_entropy_std": 0.7904441952705383, "geo/layer_14/stable_rank_q_proj": 50.87205123901367, "geo/layer_14/stable_rank_k_proj": 40.37477493286133, "geo/layer_14/stable_rank_o_proj": 43.41997528076172, "geo/layer_14/stable_rank_gate_proj": 71.48646545410156, "geo/layer_14/stable_rank_down_proj": 128.3418731689453, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4037560224533081, "geo/layer_14/attn_entropy_mean": 5.561942100524902, "geo/layer_14/attn_entropy_std": 0.41686347126960754, "geo/layer_21/stable_rank_q_proj": 40.23135757446289, "geo/layer_21/stable_rank_k_proj": 29.955921173095703, "geo/layer_21/stable_rank_o_proj": 70.01040649414062, "geo/layer_21/stable_rank_gate_proj": 65.39024353027344, "geo/layer_21/stable_rank_down_proj": 50.761871337890625, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1394515037536621, "geo/layer_21/attn_entropy_mean": 5.693750381469727, "geo/layer_21/attn_entropy_std": 0.3014398515224457, "geo/layer_27/stable_rank_q_proj": 43.435585021972656, "geo/layer_27/stable_rank_k_proj": 31.861351013183594, "geo/layer_27/stable_rank_o_proj": 115.08000183105469, "geo/layer_27/stable_rank_gate_proj": 80.0729751586914, "geo/layer_27/stable_rank_down_proj": 128.0251922607422, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09668440371751785, "geo/layer_27/attn_entropy_mean": 4.180239677429199, "geo/layer_27/attn_entropy_std": 0.7592763900756836, "attnres/final_alpha/block_0": 0.23437225818634033, "attnres/block_norm/0": 1.7678766250610352, "attnres/final_alpha/block_1": 0.004306546412408352, "attnres/block_norm/1": 47215.25, "attnres/final_alpha/block_2": 0.009935205802321434, "attnres/block_norm/2": 28790.669921875, "attnres/final_alpha/block_3": 0.011920912191271782, "attnres/block_norm/3": 59175.98046875, "attnres/final_alpha/block_4": 0.014231623150408268, "attnres/block_norm/4": 15364.615234375, "attnres/final_alpha/block_5": 0.6193662881851196, "attnres/block_norm/5": 6693.79296875, "attnres/final_alpha/block_6": 0.10586713254451752, "attnres/block_norm/6": 39677.24609375, "geo/tier1_time_s": 1.3568334579467773, "geo/step": 66600.0, "geo/rankme_slope": 1.3466030943627451e-05} {"step": 66610, "timestamp": 1778266536.9630368, "train/loss": 2.116749107837677, "train/z_loss": 0.0013715922832489014, "train/perplexity": 8.30409783343936, "train/grad_norm": 0.158203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1792404.5444041523, "perf/iters_per_sec": 0.854685089304043, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1700215816497803, "data/tokens_consumed": 139693391872, "data/tokens_consumed_B": 139.693391872, "train/loss_slope": 7.788726570296426e-06} {"step": 66620, "timestamp": 1778266547.3098936, "train/loss": 2.1390300035476684, "train/z_loss": 0.0013636925374157726, "train/perplexity": 8.491197201777933, "train/grad_norm": 0.197265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027818.7748138905, "perf/iters_per_sec": 0.9669393419331982, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341910362243651, "data/tokens_consumed": 139714363392, "data/tokens_consumed_B": 139.714363392, "train/loss_slope": 7.334352405157745e-06} {"step": 66630, "timestamp": 1778266557.6526597, "train/loss": 2.1487987518310545, "train/z_loss": 0.0013700552517548203, "train/perplexity": 8.574552043231368, "train/grad_norm": 0.2373046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028929.5215345316, "perf/iters_per_sec": 0.9674689872429522, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0336248636245728, "data/tokens_consumed": 139735334912, "data/tokens_consumed_B": 139.735334912, "train/loss_slope": 7.996619817125363e-06} {"step": 66640, "timestamp": 1778266567.9931223, "train/loss": 2.14768123626709, "train/z_loss": 0.00138119135517627, "train/perplexity": 8.564975200001376, "train/grad_norm": 0.103515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029257.6880871975, "perf/iters_per_sec": 0.9676254692493427, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0334577083587646, "data/tokens_consumed": 139756306432, "data/tokens_consumed_B": 139.756306432, "train/loss_slope": 8.321101845043889e-06} {"step": 66650, "timestamp": 1778266578.3289406, "grad/layer_0/attn": 0.003340155817568302, "grad/layer_0/mlp": 0.0033910199999809265, "grad/layer_0/attn_mlp_ratio": 0.9850003005252282, "grad/layer_4/attn": 0.0027436139062047005, "grad/layer_4/mlp": 0.0026149211917072535, "grad/layer_4/attn_mlp_ratio": 1.0492147182041662, "grad/layer_8/attn": 0.006455023307353258, "grad/layer_8/mlp": 0.0038211417850106955, "grad/layer_8/attn_mlp_ratio": 1.6892917095474957, "grad/layer_12/attn": 0.004791794810444117, "grad/layer_12/mlp": 0.0070743253454566, "grad/layer_12/attn_mlp_ratio": 0.6773500664323535, "grad/layer_16/attn": 0.004155423026531935, "grad/layer_16/mlp": 0.005063893739134073, "grad/layer_16/attn_mlp_ratio": 0.8205983692664681, "grad/layer_20/attn": 0.005670576822012663, "grad/layer_20/mlp": 0.0063518560491502285, "grad/layer_20/attn_mlp_ratio": 0.8927432688744519, "grad/layer_24/attn": 0.011923183687031269, "grad/layer_24/mlp": 0.011573117226362228, "grad/layer_24/attn_mlp_ratio": 1.0302482339716397, "grad/layer_27/attn": 0.007863433100283146, "grad/layer_27/mlp": 0.010350293479859829, "grad/layer_27/attn_mlp_ratio": 0.7597304404567081} {"step": 66650, "timestamp": 1778266578.3432715, "train/loss": 2.200039005279541, "train/z_loss": 0.0013715919805690647, "train/perplexity": 9.025365529474, "train/grad_norm": 0.2119140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027004.414404825, "perf/iters_per_sec": 0.9665510246299863, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346065282821655, "data/tokens_consumed": 139777277952, "data/tokens_consumed_B": 139.777277952, "train/loss_slope": 1.2597040238768075e-05} {"step": 66660, "timestamp": 1778266588.6939259, "train/loss": 2.1166189432144167, "train/z_loss": 0.0013750212849117816, "train/perplexity": 8.303017004017757, "train/grad_norm": 0.2451171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027903.253025356, "perf/iters_per_sec": 0.9669796242834835, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341479539871217, "data/tokens_consumed": 139798249472, "data/tokens_consumed_B": 139.798249472, "train/loss_slope": 1.2948267845430741e-05} {"step": 66670, "timestamp": 1778266599.035008, "train/loss": 2.1388357400894167, "train/z_loss": 0.0013779023429378867, "train/perplexity": 8.489547832656081, "train/grad_norm": 0.0810546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028945.386794879, "perf/iters_per_sec": 0.9674765523886104, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0336167812347412, "data/tokens_consumed": 139819220992, "data/tokens_consumed_B": 139.819220992, "train/loss_slope": 1.1973203691390391e-05} {"step": 66675, "timestamp": 1778266604.7814384, "eos/sharpness": 8.548951148986815, "eos/L0_probe": 1.9690067768096924, "eos/L_plus": 2.014704704284668, "eos/L_minus": 2.008798360824585, "eos/grad_norm": 0.08713653683662415, "eos/embed_grad_frac": 0.28085827827453613, "eos/time_s": 0.5820729732513428} {"step": 66675, "timestamp": 1778266606.1598845, "geo/rankme_last": 438.9393005371094, "geo/layer_0/stable_rank_q_proj": 19.32288932800293, "geo/layer_0/stable_rank_k_proj": 16.076730728149414, "geo/layer_0/stable_rank_o_proj": 47.00236892700195, "geo/layer_0/stable_rank_gate_proj": 130.35540771484375, "geo/layer_0/stable_rank_down_proj": 55.48487091064453, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0671112909913063, "geo/layer_0/attn_entropy_mean": 6.149535179138184, "geo/layer_0/attn_entropy_std": 0.4231914281845093, "geo/layer_7/stable_rank_q_proj": 43.2643928527832, "geo/layer_7/stable_rank_k_proj": 40.635379791259766, "geo/layer_7/stable_rank_o_proj": 89.92557525634766, "geo/layer_7/stable_rank_gate_proj": 80.19075012207031, "geo/layer_7/stable_rank_down_proj": 139.4592742919922, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4627729058265686, "geo/layer_7/attn_entropy_mean": 4.636800765991211, "geo/layer_7/attn_entropy_std": 0.8091560006141663, "geo/layer_14/stable_rank_q_proj": 50.84129333496094, "geo/layer_14/stable_rank_k_proj": 40.30916976928711, "geo/layer_14/stable_rank_o_proj": 43.44633102416992, "geo/layer_14/stable_rank_gate_proj": 71.64389038085938, "geo/layer_14/stable_rank_down_proj": 128.50010681152344, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38882511854171753, "geo/layer_14/attn_entropy_mean": 5.511653900146484, "geo/layer_14/attn_entropy_std": 0.4234013557434082, "geo/layer_21/stable_rank_q_proj": 40.16661071777344, "geo/layer_21/stable_rank_k_proj": 29.899478912353516, "geo/layer_21/stable_rank_o_proj": 70.11750793457031, "geo/layer_21/stable_rank_gate_proj": 65.34046936035156, "geo/layer_21/stable_rank_down_proj": 50.71369552612305, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1463743895292282, "geo/layer_21/attn_entropy_mean": 5.685173511505127, "geo/layer_21/attn_entropy_std": 0.29617953300476074, "geo/layer_27/stable_rank_q_proj": 43.399105072021484, "geo/layer_27/stable_rank_k_proj": 31.89065933227539, "geo/layer_27/stable_rank_o_proj": 114.96633911132812, "geo/layer_27/stable_rank_gate_proj": 80.01945495605469, "geo/layer_27/stable_rank_down_proj": 128.1075897216797, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09553998708724976, "geo/layer_27/attn_entropy_mean": 4.183847904205322, "geo/layer_27/attn_entropy_std": 0.7419081926345825, "attnres/final_alpha/block_0": 0.23586928844451904, "attnres/block_norm/0": 1.7680556774139404, "attnres/final_alpha/block_1": 0.004344316199421883, "attnres/block_norm/1": 47308.5546875, "attnres/final_alpha/block_2": 0.0101076140999794, "attnres/block_norm/2": 28774.439453125, "attnres/final_alpha/block_3": 0.011816407553851604, "attnres/block_norm/3": 59894.8046875, "attnres/final_alpha/block_4": 0.014494696632027626, "attnres/block_norm/4": 15346.0, "attnres/final_alpha/block_5": 0.6152148842811584, "attnres/block_norm/5": 6688.61328125, "attnres/final_alpha/block_6": 0.10815280675888062, "attnres/block_norm/6": 39277.09375, "geo/tier1_time_s": 1.3581233024597168, "geo/step": 66675.0, "geo/rankme_slope": 2.440509406887755e-05} {"step": 66680, "timestamp": 1778266611.332277, "train/loss": 2.1911267757415773, "train/z_loss": 0.0013771793455816805, "train/perplexity": 8.945286770391835, "train/grad_norm": 0.337890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1706113.6006729729, "perf/iters_per_sec": 0.8135383609166016, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2291983366012573, "data/tokens_consumed": 139840192512, "data/tokens_consumed_B": 139.840192512, "train/loss_slope": 1.3764621343764318e-05} {"step": 66690, "timestamp": 1778266621.6797915, "train/loss": 2.1777645111083985, "train/z_loss": 0.0013640837976709008, "train/perplexity": 8.82655252719796, "train/grad_norm": 0.1142578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027644.6512892637, "perf/iters_per_sec": 0.9668563133665389, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342798471450805, "data/tokens_consumed": 139861164032, "data/tokens_consumed_B": 139.861164032, "train/loss_slope": 1.545125816521471e-05} {"step": 66700, "timestamp": 1778266632.0147119, "grad/layer_0/attn": 0.0034924685023725033, "grad/layer_0/mlp": 0.003624401753768325, "grad/layer_0/attn_mlp_ratio": 0.9635985862719253, "grad/layer_4/attn": 0.002392045222222805, "grad/layer_4/mlp": 0.0026189431082457304, "grad/layer_4/attn_mlp_ratio": 0.9133627696436722, "grad/layer_8/attn": 0.004438641015440226, "grad/layer_8/mlp": 0.003668986726552248, "grad/layer_8/attn_mlp_ratio": 1.2097729496650231, "grad/layer_12/attn": 0.00531036127358675, "grad/layer_12/mlp": 0.0070313867181539536, "grad/layer_12/attn_mlp_ratio": 0.7552366852974464, "grad/layer_16/attn": 0.005101052578538656, "grad/layer_16/mlp": 0.004612287040799856, "grad/layer_16/attn_mlp_ratio": 1.105970296908713, "grad/layer_20/attn": 0.002988946158438921, "grad/layer_20/mlp": 0.006416808348149061, "grad/layer_20/attn_mlp_ratio": 0.46579949870579157, "grad/layer_24/attn": 0.023565193638205528, "grad/layer_24/mlp": 0.01368847955018282, "grad/layer_24/attn_mlp_ratio": 1.721534767952904, "grad/layer_27/attn": 0.014221249148249626, "grad/layer_27/mlp": 0.011939785443246365, "grad/layer_27/attn_mlp_ratio": 1.1910807858934913} {"step": 66700, "timestamp": 1778266632.0291533, "train/loss": 2.135455060005188, "train/z_loss": 0.001377472304739058, "train/perplexity": 8.460895846262618, "train/grad_norm": 0.263671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027698.1706390935, "perf/iters_per_sec": 0.9668818333812206, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342525482177733, "data/tokens_consumed": 139882135552, "data/tokens_consumed_B": 139.882135552, "train/loss_slope": 1.3585600780002921e-05} {"step": 66710, "timestamp": 1778266642.3706295, "train/loss": 2.1398919343948366, "train/z_loss": 0.0013931904337368905, "train/perplexity": 8.498519181642381, "train/grad_norm": 0.15625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028847.4847811523, "perf/iters_per_sec": 0.9674298690706026, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0336666584014893, "data/tokens_consumed": 139903107072, "data/tokens_consumed_B": 139.903107072, "train/loss_slope": 1.2166214017870912e-05} {"step": 66720, "timestamp": 1778266652.723306, "train/loss": 2.1067285537719727, "train/z_loss": 0.0013881054474040866, "train/perplexity": 8.221301696521582, "train/grad_norm": 0.1396484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027322.7534467175, "perf/iters_per_sec": 0.9667028205140674, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344440698623658, "data/tokens_consumed": 139924078592, "data/tokens_consumed_B": 139.924078592, "train/loss_slope": 8.546007613037012e-06} {"step": 66730, "timestamp": 1778266663.062077, "train/loss": 2.1799285411834717, "train/z_loss": 0.001373461657203734, "train/perplexity": 8.845674134730672, "train/grad_norm": 0.1259765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029437.9416641048, "perf/iters_per_sec": 0.9677114208527111, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0333659172058105, "data/tokens_consumed": 139945050112, "data/tokens_consumed_B": 139.945050112, "train/loss_slope": 1.276817107179163e-05} {"step": 66740, "timestamp": 1778266673.4050171, "train/loss": 2.13209228515625, "train/z_loss": 0.0013789981370791794, "train/perplexity": 8.43249154391501, "train/grad_norm": 0.087890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029469.2200903331, "perf/iters_per_sec": 0.9677263355685869, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0333499908447266, "data/tokens_consumed": 139966021632, "data/tokens_consumed_B": 139.966021632, "train/loss_slope": 1.3368662534588415e-05} {"step": 66750, "timestamp": 1778266683.7344363, "grad/layer_0/attn": 0.002834176179021597, "grad/layer_0/mlp": 0.003154090838506818, "grad/layer_0/attn_mlp_ratio": 0.898571485184675, "grad/layer_4/attn": 0.0035218773409724236, "grad/layer_4/mlp": 0.002886299742385745, "grad/layer_4/attn_mlp_ratio": 1.2202049451873045, "grad/layer_8/attn": 0.006049082148820162, "grad/layer_8/mlp": 0.003921379800885916, "grad/layer_8/attn_mlp_ratio": 1.5425901855246307, "grad/layer_12/attn": 0.011142994277179241, "grad/layer_12/mlp": 0.0069891600869596004, "grad/layer_12/attn_mlp_ratio": 1.5943252091960747, "grad/layer_16/attn": 0.004023527726531029, "grad/layer_16/mlp": 0.004691773094236851, "grad/layer_16/attn_mlp_ratio": 0.8575708074451122, "grad/layer_20/attn": 0.003078221110627055, "grad/layer_20/mlp": 0.005955803673714399, "grad/layer_20/attn_mlp_ratio": 0.5168439437532527, "grad/layer_24/attn": 0.005757792387157679, "grad/layer_24/mlp": 0.008089372888207436, "grad/layer_24/attn_mlp_ratio": 0.7117723951598351, "grad/layer_27/attn": 0.006809363141655922, "grad/layer_27/mlp": 0.0067975157871842384, "grad/layer_27/attn_mlp_ratio": 1.0017428799973855} {"step": 66750, "timestamp": 1778266684.3195097, "eos/sharpness": 8.53593349456787, "eos/L0_probe": 1.96882164478302, "eos/L_plus": 2.011314868927002, "eos/L_minus": 2.011687755584717, "eos/grad_norm": 0.09650851041078568, "eos/embed_grad_frac": 0.25301873683929443, "eos/time_s": 0.5822761058807373} {"step": 66750, "timestamp": 1778266684.3374891, "train/loss": 2.142849493026733, "train/z_loss": 0.0013573120813816786, "train/perplexity": 8.523691256000625, "train/grad_norm": 0.0966796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1919391.561537048, "perf/iters_per_sec": 0.9152372176823845, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.092612910270691, "data/tokens_consumed": 139986993152, "data/tokens_consumed_B": 139.986993152, "train/loss_slope": 1.5089144379106161e-05} {"step": 66750, "timestamp": 1778266685.696968, "geo/rankme_last": 438.55767822265625, "geo/layer_0/stable_rank_q_proj": 19.309968948364258, "geo/layer_0/stable_rank_k_proj": 16.053651809692383, "geo/layer_0/stable_rank_o_proj": 47.05390548706055, "geo/layer_0/stable_rank_gate_proj": 130.03292846679688, "geo/layer_0/stable_rank_down_proj": 55.55070495605469, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06677289307117462, "geo/layer_0/attn_entropy_mean": 6.152609825134277, "geo/layer_0/attn_entropy_std": 0.42235156893730164, "geo/layer_7/stable_rank_q_proj": 43.34370803833008, "geo/layer_7/stable_rank_k_proj": 40.733219146728516, "geo/layer_7/stable_rank_o_proj": 89.82288360595703, "geo/layer_7/stable_rank_gate_proj": 80.13790130615234, "geo/layer_7/stable_rank_down_proj": 139.45706176757812, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.45218974351882935, "geo/layer_7/attn_entropy_mean": 4.642102241516113, "geo/layer_7/attn_entropy_std": 0.800329864025116, "geo/layer_14/stable_rank_q_proj": 50.946868896484375, "geo/layer_14/stable_rank_k_proj": 40.302555084228516, "geo/layer_14/stable_rank_o_proj": 43.39873123168945, "geo/layer_14/stable_rank_gate_proj": 71.82231903076172, "geo/layer_14/stable_rank_down_proj": 128.67111206054688, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3804670572280884, "geo/layer_14/attn_entropy_mean": 5.5511908531188965, "geo/layer_14/attn_entropy_std": 0.4032765328884125, "geo/layer_21/stable_rank_q_proj": 40.16292190551758, "geo/layer_21/stable_rank_k_proj": 29.983470916748047, "geo/layer_21/stable_rank_o_proj": 70.06584930419922, "geo/layer_21/stable_rank_gate_proj": 65.28909301757812, "geo/layer_21/stable_rank_down_proj": 50.70238494873047, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1423877626657486, "geo/layer_21/attn_entropy_mean": 5.701539993286133, "geo/layer_21/attn_entropy_std": 0.2954559922218323, "geo/layer_27/stable_rank_q_proj": 43.323299407958984, "geo/layer_27/stable_rank_k_proj": 31.838632583618164, "geo/layer_27/stable_rank_o_proj": 114.75157928466797, "geo/layer_27/stable_rank_gate_proj": 79.98784637451172, "geo/layer_27/stable_rank_down_proj": 128.0227813720703, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09929557144641876, "geo/layer_27/attn_entropy_mean": 4.184248447418213, "geo/layer_27/attn_entropy_std": 0.7420058846473694, "attnres/final_alpha/block_0": 0.23700331151485443, "attnres/block_norm/0": 1.767899990081787, "attnres/final_alpha/block_1": 0.0044478317722678185, "attnres/block_norm/1": 47202.171875, "attnres/final_alpha/block_2": 0.01026215497404337, "attnres/block_norm/2": 28800.421875, "attnres/final_alpha/block_3": 0.012065521441400051, "attnres/block_norm/3": 58781.3984375, "attnres/final_alpha/block_4": 0.014344168826937675, "attnres/block_norm/4": 15395.9345703125, "attnres/final_alpha/block_5": 0.6121247410774231, "attnres/block_norm/5": 6734.9541015625, "attnres/final_alpha/block_6": 0.10975226014852524, "attnres/block_norm/6": 39311.125, "geo/tier1_time_s": 1.3560385704040527, "geo/step": 66750.0, "geo/rankme_slope": 4.586346257252901e-06} {"step": 66760, "timestamp": 1778266696.0553422, "train/loss": 2.1328089833259583, "train/z_loss": 0.0013744630385190248, "train/perplexity": 8.438537261389154, "train/grad_norm": 0.21484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1790211.1698649833, "perf/iters_per_sec": 0.8536392068219105, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1714550971984863, "data/tokens_consumed": 140007964672, "data/tokens_consumed_B": 140.007964672, "train/loss_slope": 1.3122941444058174e-05} {"step": 66770, "timestamp": 1778266706.4326854, "train/loss": 2.1293728351593018, "train/z_loss": 0.001377025747206062, "train/perplexity": 8.40959095742484, "train/grad_norm": 0.19140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022248.5966966243, "perf/iters_per_sec": 0.9642832740290758, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0370396614074706, "data/tokens_consumed": 140028936192, "data/tokens_consumed_B": 140.028936192, "train/loss_slope": 1.0324237177116578e-05} {"step": 66780, "timestamp": 1778266716.8128176, "train/loss": 2.135943627357483, "train/z_loss": 0.001376362587325275, "train/perplexity": 8.465030573708447, "train/grad_norm": 0.201171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021133.7264318734, "perf/iters_per_sec": 0.9637516624602668, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0376116991043092, "data/tokens_consumed": 140049907712, "data/tokens_consumed_B": 140.049907712, "train/loss_slope": 9.354086284196787e-06} {"step": 66790, "timestamp": 1778266727.198226, "train/loss": 2.136812353134155, "train/z_loss": 0.0013808820745907723, "train/perplexity": 8.472387559106888, "train/grad_norm": 0.228515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020283.2414059057, "perf/iters_per_sec": 0.96334611959739, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.038048505783081, "data/tokens_consumed": 140070879232, "data/tokens_consumed_B": 140.070879232, "train/loss_slope": 7.940411310170146e-06} {"step": 66800, "timestamp": 1778266737.565774, "grad/layer_0/attn": 0.002711547538638115, "grad/layer_0/mlp": 0.0029097660444676876, "grad/layer_0/attn_mlp_ratio": 0.9318781661521343, "grad/layer_4/attn": 0.002266602125018835, "grad/layer_4/mlp": 0.0026401691138744354, "grad/layer_4/attn_mlp_ratio": 0.8585063840255176, "grad/layer_8/attn": 0.004776890389621258, "grad/layer_8/mlp": 0.0037161835934966803, "grad/layer_8/attn_mlp_ratio": 1.2854290270905644, "grad/layer_12/attn": 0.007923035882413387, "grad/layer_12/mlp": 0.0074025425128638744, "grad/layer_12/attn_mlp_ratio": 1.0703127691078764, "grad/layer_16/attn": 0.0031876079738140106, "grad/layer_16/mlp": 0.004523131996393204, "grad/layer_16/attn_mlp_ratio": 0.7047346630349013, "grad/layer_20/attn": 0.0027406015433371067, "grad/layer_20/mlp": 0.005674208980053663, "grad/layer_20/attn_mlp_ratio": 0.48299269636909253, "grad/layer_24/attn": 0.007757491432130337, "grad/layer_24/mlp": 0.007849067449569702, "grad/layer_24/attn_mlp_ratio": 0.9883328666926321, "grad/layer_27/attn": 0.00422819284722209, "grad/layer_27/mlp": 0.006902073509991169, "grad/layer_27/attn_mlp_ratio": 0.6125974723163666} {"step": 66800, "timestamp": 1778266737.579835, "train/loss": 2.111779475212097, "train/z_loss": 0.001392496912740171, "train/perplexity": 8.26293189244576, "train/grad_norm": 0.10498046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021131.404391374, "perf/iters_per_sec": 0.9637505552250738, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0376128911972047, "data/tokens_consumed": 140091850752, "data/tokens_consumed_B": 140.091850752, "train/loss_slope": 7.250394443474169e-06} {"step": 66810, "timestamp": 1778266747.9599602, "train/loss": 2.2112714290618896, "train/z_loss": 0.001352778624277562, "train/perplexity": 9.127313750828936, "train/grad_norm": 0.10009765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021295.9109400152, "perf/iters_per_sec": 0.9638289980602337, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0375284433364869, "data/tokens_consumed": 140112822272, "data/tokens_consumed_B": 140.112822272, "train/loss_slope": 9.715056655430535e-06} {"step": 66820, "timestamp": 1778266758.337495, "train/loss": 2.1419732570648193, "train/z_loss": 0.001361039315816015, "train/perplexity": 8.516225762438493, "train/grad_norm": 0.2216796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021749.4876169614, "perf/iters_per_sec": 0.9640452802738959, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03729567527771, "data/tokens_consumed": 140133793792, "data/tokens_consumed_B": 140.133793792, "train/loss_slope": 1.1234788554157514e-05} {"step": 66825, "timestamp": 1778266764.1000447, "eos/sharpness": 11.968088150024412, "eos/L0_probe": 1.9708633422851562, "eos/L_plus": 2.0345475673675537, "eos/L_minus": 2.026859998703003, "eos/grad_norm": 0.09466730058193207, "eos/embed_grad_frac": 0.2722928822040558, "eos/time_s": 0.5803723335266113} {"step": 66825, "timestamp": 1778266765.474082, "geo/rankme_last": 437.7843933105469, "geo/layer_0/stable_rank_q_proj": 19.323339462280273, "geo/layer_0/stable_rank_k_proj": 16.022117614746094, "geo/layer_0/stable_rank_o_proj": 46.972530364990234, "geo/layer_0/stable_rank_gate_proj": 129.6712646484375, "geo/layer_0/stable_rank_down_proj": 55.52135467529297, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06708110868930817, "geo/layer_0/attn_entropy_mean": 6.154390335083008, "geo/layer_0/attn_entropy_std": 0.4211214780807495, "geo/layer_7/stable_rank_q_proj": 43.27617645263672, "geo/layer_7/stable_rank_k_proj": 40.706974029541016, "geo/layer_7/stable_rank_o_proj": 89.93187713623047, "geo/layer_7/stable_rank_gate_proj": 80.1474609375, "geo/layer_7/stable_rank_down_proj": 139.71043395996094, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.44323188066482544, "geo/layer_7/attn_entropy_mean": 4.654930114746094, "geo/layer_7/attn_entropy_std": 0.7791053056716919, "geo/layer_14/stable_rank_q_proj": 50.99683380126953, "geo/layer_14/stable_rank_k_proj": 40.32838821411133, "geo/layer_14/stable_rank_o_proj": 43.40816116333008, "geo/layer_14/stable_rank_gate_proj": 71.85948181152344, "geo/layer_14/stable_rank_down_proj": 128.81373596191406, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39226391911506653, "geo/layer_14/attn_entropy_mean": 5.53562593460083, "geo/layer_14/attn_entropy_std": 0.41278886795043945, "geo/layer_21/stable_rank_q_proj": 40.18436813354492, "geo/layer_21/stable_rank_k_proj": 30.034793853759766, "geo/layer_21/stable_rank_o_proj": 69.992919921875, "geo/layer_21/stable_rank_gate_proj": 65.27851867675781, "geo/layer_21/stable_rank_down_proj": 50.69752502441406, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1419587880373001, "geo/layer_21/attn_entropy_mean": 5.680625915527344, "geo/layer_21/attn_entropy_std": 0.30426833033561707, "geo/layer_27/stable_rank_q_proj": 43.277713775634766, "geo/layer_27/stable_rank_k_proj": 31.92865562438965, "geo/layer_27/stable_rank_o_proj": 114.74098205566406, "geo/layer_27/stable_rank_gate_proj": 79.97493743896484, "geo/layer_27/stable_rank_down_proj": 127.7659683227539, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08690979331731796, "geo/layer_27/attn_entropy_mean": 4.178717613220215, "geo/layer_27/attn_entropy_std": 0.7528892755508423, "attnres/final_alpha/block_0": 0.23582586646080017, "attnres/block_norm/0": 1.7680847644805908, "attnres/final_alpha/block_1": 0.0042982082813978195, "attnres/block_norm/1": 47312.671875, "attnres/final_alpha/block_2": 0.010279886424541473, "attnres/block_norm/2": 28817.611328125, "attnres/final_alpha/block_3": 0.011895958334207535, "attnres/block_norm/3": 59617.85546875, "attnres/final_alpha/block_4": 0.01426316425204277, "attnres/block_norm/4": 15361.630859375, "attnres/final_alpha/block_5": 0.6149263978004456, "attnres/block_norm/5": 6714.0927734375, "attnres/final_alpha/block_6": 0.10851050913333893, "attnres/block_norm/6": 39606.4921875, "geo/tier1_time_s": 1.3558778762817383, "geo/step": 66825.0, "geo/rankme_slope": -2.125977344062625e-05} {"step": 66830, "timestamp": 1778266770.664014, "train/loss": 2.187303829193115, "train/z_loss": 0.0013597447308711708, "train/perplexity": 8.911154701315573, "train/grad_norm": 0.16796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1702053.2071391747, "perf/iters_per_sec": 0.8116022144027589, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2321306943893433, "data/tokens_consumed": 140154765312, "data/tokens_consumed_B": 140.154765312, "train/loss_slope": 1.2257549493047404e-05} {"step": 66840, "timestamp": 1778266781.0429022, "train/loss": 2.134605014324188, "train/z_loss": 0.001388530875556171, "train/perplexity": 8.45370675425231, "train/grad_norm": 0.255859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021400.4715098843, "perf/iters_per_sec": 0.9638788564252302, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037474775314331, "data/tokens_consumed": 140175736832, "data/tokens_consumed_B": 140.175736832, "train/loss_slope": 1.226007722356169e-05} {"step": 66850, "timestamp": 1778266791.41522, "grad/layer_0/attn": 0.002839218359440565, "grad/layer_0/mlp": 0.003147922456264496, "grad/layer_0/attn_mlp_ratio": 0.9019339925597608, "grad/layer_4/attn": 0.001942862058058381, "grad/layer_4/mlp": 0.00253590801730752, "grad/layer_4/attn_mlp_ratio": 0.7661405572222385, "grad/layer_8/attn": 0.003401661990210414, "grad/layer_8/mlp": 0.0036874159704893827, "grad/layer_8/attn_mlp_ratio": 0.9225056042452393, "grad/layer_12/attn": 0.004558446351438761, "grad/layer_12/mlp": 0.00747802946716547, "grad/layer_12/attn_mlp_ratio": 0.6095785407768359, "grad/layer_16/attn": 0.0036173854023218155, "grad/layer_16/mlp": 0.005140749271959066, "grad/layer_16/attn_mlp_ratio": 0.7036688896084583, "grad/layer_20/attn": 0.0032099124509841204, "grad/layer_20/mlp": 0.0061000338755548, "grad/layer_20/attn_mlp_ratio": 0.5262122250216119, "grad/layer_24/attn": 0.014927114360034466, "grad/layer_24/mlp": 0.009674062952399254, "grad/layer_24/attn_mlp_ratio": 1.5430036251761259, "grad/layer_27/attn": 0.006531279068440199, "grad/layer_27/mlp": 0.00846405141055584, "grad/layer_27/attn_mlp_ratio": 0.7716492580763236} {"step": 66850, "timestamp": 1778266791.4296393, "train/loss": 2.0464699029922486, "train/z_loss": 0.0013940029544755816, "train/perplexity": 7.740528005132177, "train/grad_norm": 0.1767578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019966.6916177527, "perf/iters_per_sec": 0.9631951768959773, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.038211178779602, "data/tokens_consumed": 140196708352, "data/tokens_consumed_B": 140.196708352, "train/loss_slope": 7.485257795970737e-06} {"step": 66860, "timestamp": 1778266801.8065133, "train/loss": 2.17450954914093, "train/z_loss": 0.0013732764055021107, "train/perplexity": 8.797869141407492, "train/grad_norm": 0.125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022291.1378736661, "perf/iters_per_sec": 0.964303559243043, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0370178461074828, "data/tokens_consumed": 140217679872, "data/tokens_consumed_B": 140.217679872, "train/loss_slope": 8.382643495920737e-06} {"step": 66870, "timestamp": 1778266812.182949, "train/loss": 2.172838830947876, "train/z_loss": 0.0013649617787450552, "train/perplexity": 8.783182653280486, "train/grad_norm": 0.126953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022077.1018415166, "perf/iters_per_sec": 0.9642014989097197, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0371276140213013, "data/tokens_consumed": 140238651392, "data/tokens_consumed_B": 140.238651392, "train/loss_slope": 9.570171001112647e-06} {"step": 66880, "timestamp": 1778266822.5633047, "train/loss": 2.0860324263572694, "train/z_loss": 0.001374929107259959, "train/perplexity": 8.052901218834565, "train/grad_norm": 0.1708984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021286.110392674, "perf/iters_per_sec": 0.9638243247950907, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0375334739685058, "data/tokens_consumed": 140259622912, "data/tokens_consumed_B": 140.259622912, "train/loss_slope": 5.36252229806716e-06} {"step": 66890, "timestamp": 1778266832.9355183, "train/loss": 2.183173322677612, "train/z_loss": 0.0013609216432087124, "train/perplexity": 8.874423031186026, "train/grad_norm": 0.2021484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023129.7735491735, "perf/iters_per_sec": 0.9647034518953197, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0365879774093627, "data/tokens_consumed": 140280594432, "data/tokens_consumed_B": 140.280594432, "train/loss_slope": 8.678509214065275e-06} {"step": 66900, "timestamp": 1778266843.306351, "grad/layer_0/attn": 0.002741763601079583, "grad/layer_0/mlp": 0.0027105375193059444, "grad/layer_0/attn_mlp_ratio": 1.0115202170784239, "grad/layer_4/attn": 0.002575006801635027, "grad/layer_4/mlp": 0.0024408057797700167, "grad/layer_4/attn_mlp_ratio": 1.0549822183637378, "grad/layer_8/attn": 0.002868601819500327, "grad/layer_8/mlp": 0.003609350649639964, "grad/layer_8/attn_mlp_ratio": 0.7947694802968297, "grad/layer_12/attn": 0.004002220928668976, "grad/layer_12/mlp": 0.007012374233454466, "grad/layer_12/attn_mlp_ratio": 0.5707369199580914, "grad/layer_16/attn": 0.0033625138457864523, "grad/layer_16/mlp": 0.0041254498064517975, "grad/layer_16/attn_mlp_ratio": 0.8150659739020978, "grad/layer_20/attn": 0.002899113344028592, "grad/layer_20/mlp": 0.0061161150224506855, "grad/layer_20/attn_mlp_ratio": 0.47401222605942994, "grad/layer_24/attn": 0.01075277291238308, "grad/layer_24/mlp": 0.010727401822805405, "grad/layer_24/attn_mlp_ratio": 1.002365063764763, "grad/layer_27/attn": 0.00838173646479845, "grad/layer_27/mlp": 0.00905062910169363, "grad/layer_27/attn_mlp_ratio": 0.9260943386378031} {"step": 66900, "timestamp": 1778266843.8910518, "eos/sharpness": 24.787354469299313, "eos/L0_probe": 1.9733446836471558, "eos/L_plus": 2.1196401119232178, "eos/L_minus": 2.074922800064087, "eos/grad_norm": 0.131532683968544, "eos/embed_grad_frac": 0.1431092917919159, "eos/time_s": 0.581899881362915} {"step": 66900, "timestamp": 1778266843.9088666, "train/loss": 2.1758294224739076, "train/z_loss": 0.0013740543159656226, "train/perplexity": 8.809488880879533, "train/grad_norm": 0.1318359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1911815.0140330868, "perf/iters_per_sec": 0.9116244383016047, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.096942949295044, "data/tokens_consumed": 140301565952, "data/tokens_consumed_B": 140.301565952, "train/loss_slope": 8.967734892519215e-06} {"step": 66900, "timestamp": 1778266845.271136, "geo/rankme_last": 437.1863098144531, "geo/layer_0/stable_rank_q_proj": 19.338518142700195, "geo/layer_0/stable_rank_k_proj": 16.042842864990234, "geo/layer_0/stable_rank_o_proj": 46.947265625, "geo/layer_0/stable_rank_gate_proj": 129.6458282470703, "geo/layer_0/stable_rank_down_proj": 55.48637771606445, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06157933548092842, "geo/layer_0/attn_entropy_mean": 6.156869411468506, "geo/layer_0/attn_entropy_std": 0.419974684715271, "geo/layer_7/stable_rank_q_proj": 43.209232330322266, "geo/layer_7/stable_rank_k_proj": 40.68654251098633, "geo/layer_7/stable_rank_o_proj": 89.94624328613281, "geo/layer_7/stable_rank_gate_proj": 80.1639175415039, "geo/layer_7/stable_rank_down_proj": 139.86898803710938, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4384249746799469, "geo/layer_7/attn_entropy_mean": 4.628806114196777, "geo/layer_7/attn_entropy_std": 0.7914294600486755, "geo/layer_14/stable_rank_q_proj": 50.96525573730469, "geo/layer_14/stable_rank_k_proj": 40.263031005859375, "geo/layer_14/stable_rank_o_proj": 43.3948860168457, "geo/layer_14/stable_rank_gate_proj": 71.7581787109375, "geo/layer_14/stable_rank_down_proj": 128.74203491210938, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38905981183052063, "geo/layer_14/attn_entropy_mean": 5.529715538024902, "geo/layer_14/attn_entropy_std": 0.4146934151649475, "geo/layer_21/stable_rank_q_proj": 40.2032356262207, "geo/layer_21/stable_rank_k_proj": 30.087549209594727, "geo/layer_21/stable_rank_o_proj": 69.90708923339844, "geo/layer_21/stable_rank_gate_proj": 65.3602066040039, "geo/layer_21/stable_rank_down_proj": 50.70402908325195, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1497480273246765, "geo/layer_21/attn_entropy_mean": 5.681814193725586, "geo/layer_21/attn_entropy_std": 0.2990933060646057, "geo/layer_27/stable_rank_q_proj": 43.27569580078125, "geo/layer_27/stable_rank_k_proj": 31.90506362915039, "geo/layer_27/stable_rank_o_proj": 114.77767944335938, "geo/layer_27/stable_rank_gate_proj": 79.85840606689453, "geo/layer_27/stable_rank_down_proj": 127.42110443115234, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09526098519563675, "geo/layer_27/attn_entropy_mean": 4.188501358032227, "geo/layer_27/attn_entropy_std": 0.7380367517471313, "attnres/final_alpha/block_0": 0.2354649007320404, "attnres/block_norm/0": 1.768349289894104, "attnres/final_alpha/block_1": 0.004308788571506739, "attnres/block_norm/1": 47256.4296875, "attnres/final_alpha/block_2": 0.010301167145371437, "attnres/block_norm/2": 28671.9375, "attnres/final_alpha/block_3": 0.01202406920492649, "attnres/block_norm/3": 59610.6484375, "attnres/final_alpha/block_4": 0.014463687315583229, "attnres/block_norm/4": 15365.2109375, "attnres/final_alpha/block_5": 0.6163011193275452, "attnres/block_norm/5": 6678.22998046875, "attnres/final_alpha/block_6": 0.10713624954223633, "attnres/block_norm/6": 39863.2421875, "geo/tier1_time_s": 1.3582353591918945, "geo/step": 66900.0, "geo/rankme_slope": -6.140452274659864e-05} {"step": 66910, "timestamp": 1778266855.6456668, "train/loss": 2.128106105327606, "train/z_loss": 0.0013765665935352445, "train/perplexity": 8.39894502187233, "train/grad_norm": 0.1279296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1787409.3385436158, "perf/iters_per_sec": 0.8523031895368651, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1732913970947265, "data/tokens_consumed": 140322537472, "data/tokens_consumed_B": 140.322537472, "train/loss_slope": 9.576499737528646e-06} {"step": 66920, "timestamp": 1778266866.0334551, "train/loss": 2.157863140106201, "train/z_loss": 0.0013863923260942102, "train/perplexity": 8.65262843501912, "train/grad_norm": 0.265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020611.6802405266, "perf/iters_per_sec": 0.9635027314379342, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0378797769546508, "data/tokens_consumed": 140343508992, "data/tokens_consumed_B": 140.343508992, "train/loss_slope": 7.449741653948235e-06} {"step": 66930, "timestamp": 1778266876.9879224, "train/loss": 2.1425623893737793, "train/z_loss": 0.0013797270716167987, "train/perplexity": 8.521244424368332, "train/grad_norm": 0.109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1915306.7585679023, "perf/iters_per_sec": 0.9132894318427573, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.094943141937256, "data/tokens_consumed": 140364480512, "data/tokens_consumed_B": 140.364480512, "train/loss_slope": 7.781891267720946e-06} {"step": 66940, "timestamp": 1778266887.8333428, "train/loss": 2.093953490257263, "train/z_loss": 0.0013652078807353974, "train/perplexity": 8.116942064942505, "train/grad_norm": 0.140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1934983.5725477308, "perf/iters_per_sec": 0.9226720679033903, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.08380868434906, "data/tokens_consumed": 140385452032, "data/tokens_consumed_B": 140.385452032, "train/loss_slope": 4.896045460773923e-06} {"step": 66950, "timestamp": 1778266898.1947, "grad/layer_0/attn": 0.002633002121001482, "grad/layer_0/mlp": 0.0029233084060251713, "grad/layer_0/attn_mlp_ratio": 0.9006925254637529, "grad/layer_4/attn": 0.0019555031321942806, "grad/layer_4/mlp": 0.002576973056420684, "grad/layer_4/attn_mlp_ratio": 0.7588372146298951, "grad/layer_8/attn": 0.0058877249248325825, "grad/layer_8/mlp": 0.003689701436087489, "grad/layer_8/attn_mlp_ratio": 1.5957184794615795, "grad/layer_12/attn": 0.0044157663360238075, "grad/layer_12/mlp": 0.007202010601758957, "grad/layer_12/attn_mlp_ratio": 0.613129654881704, "grad/layer_16/attn": 0.003529322799295187, "grad/layer_16/mlp": 0.004517118446528912, "grad/layer_16/attn_mlp_ratio": 0.7813217127115745, "grad/layer_20/attn": 0.0029085183050483465, "grad/layer_20/mlp": 0.005600241012871265, "grad/layer_20/attn_mlp_ratio": 0.5193559074382671, "grad/layer_24/attn": 0.005998330656439066, "grad/layer_24/mlp": 0.007325164508074522, "grad/layer_24/attn_mlp_ratio": 0.8188663296149157, "grad/layer_27/attn": 0.004352903924882412, "grad/layer_27/mlp": 0.006153765134513378, "grad/layer_27/attn_mlp_ratio": 0.7073561890969061} {"step": 66950, "timestamp": 1778266898.2088988, "train/loss": 2.130482864379883, "train/z_loss": 0.0013772571342997252, "train/perplexity": 8.418931032039547, "train/grad_norm": 0.083984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022730.6022929419, "perf/iters_per_sec": 0.9645131122078618, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0367925405502318, "data/tokens_consumed": 140406423552, "data/tokens_consumed_B": 140.406423552, "train/loss_slope": 1.3939824577855573e-06} {"step": 66960, "timestamp": 1778266908.5853486, "train/loss": 2.1045796155929564, "train/z_loss": 0.0013851913390681148, "train/perplexity": 8.203653596554101, "train/grad_norm": 0.197265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022057.2067883117, "perf/iters_per_sec": 0.9641920122090872, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0371378183364868, "data/tokens_consumed": 140427395072, "data/tokens_consumed_B": 140.427395072, "train/loss_slope": -4.537347340923027e-08} {"step": 66970, "timestamp": 1778266918.9688873, "train/loss": 2.125720191001892, "train/z_loss": 0.001379413076210767, "train/perplexity": 8.378929745485982, "train/grad_norm": 0.2392578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021026.314423611, "perf/iters_per_sec": 0.96370044442349, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0376668453216553, "data/tokens_consumed": 140448366592, "data/tokens_consumed_B": 140.448366592, "train/loss_slope": -4.109232074464036e-06} {"step": 66975, "timestamp": 1778266924.73136, "eos/sharpness": 9.385704994201658, "eos/L0_probe": 1.9693197011947632, "eos/L_plus": 2.017531156539917, "eos/L_minus": 2.014965295791626, "eos/grad_norm": 0.09251880645751953, "eos/embed_grad_frac": 0.23976252973079681, "eos/time_s": 0.5819995403289795} {"step": 66975, "timestamp": 1778266926.1095865, "geo/rankme_last": 438.2106628417969, "geo/layer_0/stable_rank_q_proj": 19.358463287353516, "geo/layer_0/stable_rank_k_proj": 16.090696334838867, "geo/layer_0/stable_rank_o_proj": 46.90827178955078, "geo/layer_0/stable_rank_gate_proj": 129.77911376953125, "geo/layer_0/stable_rank_down_proj": 55.59036636352539, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06813103705644608, "geo/layer_0/attn_entropy_mean": 6.156820297241211, "geo/layer_0/attn_entropy_std": 0.42314812541007996, "geo/layer_7/stable_rank_q_proj": 43.239356994628906, "geo/layer_7/stable_rank_k_proj": 40.7374382019043, "geo/layer_7/stable_rank_o_proj": 89.94419860839844, "geo/layer_7/stable_rank_gate_proj": 80.10010528564453, "geo/layer_7/stable_rank_down_proj": 139.97274780273438, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.447552353143692, "geo/layer_7/attn_entropy_mean": 4.64035701751709, "geo/layer_7/attn_entropy_std": 0.7987557053565979, "geo/layer_14/stable_rank_q_proj": 51.02632522583008, "geo/layer_14/stable_rank_k_proj": 40.195552825927734, "geo/layer_14/stable_rank_o_proj": 43.42276382446289, "geo/layer_14/stable_rank_gate_proj": 71.69941711425781, "geo/layer_14/stable_rank_down_proj": 128.8273468017578, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39463964104652405, "geo/layer_14/attn_entropy_mean": 5.555912017822266, "geo/layer_14/attn_entropy_std": 0.4049982726573944, "geo/layer_21/stable_rank_q_proj": 40.164955139160156, "geo/layer_21/stable_rank_k_proj": 30.10499382019043, "geo/layer_21/stable_rank_o_proj": 69.8568115234375, "geo/layer_21/stable_rank_gate_proj": 65.27816772460938, "geo/layer_21/stable_rank_down_proj": 50.69829177856445, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14014817774295807, "geo/layer_21/attn_entropy_mean": 5.701233386993408, "geo/layer_21/attn_entropy_std": 0.3030785620212555, "geo/layer_27/stable_rank_q_proj": 43.19895935058594, "geo/layer_27/stable_rank_k_proj": 31.854787826538086, "geo/layer_27/stable_rank_o_proj": 114.78875732421875, "geo/layer_27/stable_rank_gate_proj": 79.80618286132812, "geo/layer_27/stable_rank_down_proj": 127.49181365966797, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09670934081077576, "geo/layer_27/attn_entropy_mean": 4.189086437225342, "geo/layer_27/attn_entropy_std": 0.7353640198707581, "attnres/final_alpha/block_0": 0.2381245493888855, "attnres/block_norm/0": 1.7684059143066406, "attnres/final_alpha/block_1": 0.004325969144701958, "attnres/block_norm/1": 47291.6015625, "attnres/final_alpha/block_2": 0.010177082382142544, "attnres/block_norm/2": 28846.705078125, "attnres/final_alpha/block_3": 0.011960867792367935, "attnres/block_norm/3": 59675.484375, "attnres/final_alpha/block_4": 0.014633109793066978, "attnres/block_norm/4": 15339.470703125, "attnres/final_alpha/block_5": 0.6142184734344482, "attnres/block_norm/5": 6696.12890625, "attnres/final_alpha/block_6": 0.10655999183654785, "attnres/block_norm/6": 39664.34375, "geo/tier1_time_s": 1.3596367835998535, "geo/step": 66975.0, "geo/rankme_slope": -7.13294106705182e-05} {"step": 66980, "timestamp": 1778266931.295707, "train/loss": 2.1758216619491577, "train/z_loss": 0.0013725869939662516, "train/perplexity": 8.809420514888318, "train/grad_norm": 0.09130859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1702235.8837414603, "perf/iters_per_sec": 0.811689321394663, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2319984674453734, "data/tokens_consumed": 140469338112, "data/tokens_consumed_B": 140.469338112, "train/loss_slope": 7.390973162371874e-07} {"step": 66990, "timestamp": 1778266941.6828666, "train/loss": 2.1443169116973877, "train/z_loss": 0.0013644888182170688, "train/perplexity": 8.536208261290115, "train/grad_norm": 0.1064453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019906.2044584113, "perf/iters_per_sec": 0.9631663343708092, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.038242268562317, "data/tokens_consumed": 140490309632, "data/tokens_consumed_B": 140.490309632, "train/loss_slope": 9.16316056444657e-07} {"step": 67000, "timestamp": 1778266952.0532484, "grad/layer_0/attn": 0.0026436576154083014, "grad/layer_0/mlp": 0.0028208608273416758, "grad/layer_0/attn_mlp_ratio": 0.9371811243100285, "grad/layer_4/attn": 0.0029956635553389788, "grad/layer_4/mlp": 0.0026163223665207624, "grad/layer_4/attn_mlp_ratio": 1.1449901889665273, "grad/layer_8/attn": 0.0034432101529091597, "grad/layer_8/mlp": 0.0036026551388204098, "grad/layer_8/attn_mlp_ratio": 0.9557423413172735, "grad/layer_12/attn": 0.00451961625367403, "grad/layer_12/mlp": 0.006663761101663113, "grad/layer_12/attn_mlp_ratio": 0.6782380275790861, "grad/layer_16/attn": 0.003637481015175581, "grad/layer_16/mlp": 0.004613836295902729, "grad/layer_16/attn_mlp_ratio": 0.7883853485584815, "grad/layer_20/attn": 0.0052299462258815765, "grad/layer_20/mlp": 0.006274063605815172, "grad/layer_20/attn_mlp_ratio": 0.8335819448301353, "grad/layer_24/attn": 0.011476480402052402, "grad/layer_24/mlp": 0.009130117483437061, "grad/layer_24/attn_mlp_ratio": 1.256991522526706, "grad/layer_27/attn": 0.0035212000366300344, "grad/layer_27/mlp": 0.008333866484463215, "grad/layer_27/attn_mlp_ratio": 0.42251696747756795} {"step": 67000, "timestamp": 1778266952.0675454, "train/loss": 2.175357961654663, "train/z_loss": 0.0013694758643396198, "train/perplexity": 8.805336530946692, "train/grad_norm": 0.1328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020586.2441128592, "perf/iters_per_sec": 0.9634906025471016, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0378928422927856, "data/tokens_consumed": 140511281152, "data/tokens_consumed_B": 140.511281152, "train/loss_slope": 1.7646161708036248e-06} {"step": 67000, "timestamp": 1778266959.3012042, "geo/ww_alpha_mean": 8.029677648513747, "geo/ww_alpha_std": 5.628107937610372, "geo/ww_alpha_min": 1.346777686910074, "geo/ww_alpha_max": 40.262818138995456, "geo/ww_alpha_healthy_frac": 0.15736040609137056, "geo/ww_alpha_by_type/q_proj": 3.853344709099761, "geo/ww_alpha_by_type/k_proj": 4.527291967571096, "geo/ww_alpha_by_type/v_proj": 8.616770903810101, "geo/ww_alpha_by_type/o_proj": 9.057795247379932, "geo/ww_alpha_by_type/gate_proj": 7.957732072201904, "geo/ww_alpha_by_type/up_proj": 14.20515419052021, "geo/ww_alpha_by_type/down_proj": 8.099444373286902, "geo/twonn_id/layer_0": 0.7529626488685608, "geo/twonn_id/layer_7": 3.1338133811950684, "geo/twonn_id/layer_14": 4.8835601806640625, "geo/twonn_id/layer_21": 8.654093742370605, "geo/twonn_id/layer_27": 5.264651775360107, "geo/tier2_time_s": 7.226978302001953} {"step": 67000, "timestamp": 1778266960.1062055, "eoc/jacobian_sigma/layer_0/attn": 1159.6959228515625, "eoc/jacobian_sigma/layer_0/mlp": 7698.49267578125, "eoc/jacobian_sigma/layer_0": 7698.49267578125, "eoc/jacobian_sigma/layer_7/attn": 1.171406865119934, "eoc/jacobian_sigma/layer_7/mlp": 1.8428064584732056, "eoc/jacobian_sigma/layer_7": 1.8428064584732056, "eoc/jacobian_sigma/layer_14/attn": 1.443634033203125, "eoc/jacobian_sigma/layer_14/mlp": 7.632632732391357, "eoc/jacobian_sigma/layer_14": 7.632632732391357, "eoc/jacobian_sigma/layer_21/attn": 1.097264051437378, "eoc/jacobian_sigma/layer_21/mlp": 4.29762601852417, "eoc/jacobian_sigma/layer_21": 4.29762601852417, "eoc/jacobian_sigma/layer_27/attn": 2.9478209018707275, "eoc/jacobian_sigma/layer_27/mlp": 25.93561553955078, "eoc/jacobian_sigma/layer_27": 25.93561553955078, "eoc/layer0_sigma": 7698.49267578125, "eoc/sigma_max": 25.93561553955078, "eoc/sigma_min": 1.8428064584732056, "eoc/sigma_mean": 9.927170187234879, "eoc/time_s": 0.7986276149749756} {"step": 67010, "timestamp": 1778266970.5001693, "train/loss": 2.0884825229644775, "train/z_loss": 0.0013872873038053512, "train/perplexity": 8.072655795217118, "train/grad_norm": 0.1083984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1138080.3161293878, "perf/iters_per_sec": 0.5426789837500514, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.842710018157959, "data/tokens_consumed": 140532252672, "data/tokens_consumed_B": 140.532252672, "train/loss_slope": -4.228349966649511e-06} {"step": 67020, "timestamp": 1778266980.885711, "train/loss": 2.168948531150818, "train/z_loss": 0.0013660355238243938, "train/perplexity": 8.749079817724603, "train/grad_norm": 0.1171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020736.1779714013, "perf/iters_per_sec": 0.9635620965821272, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0378158330917358, "data/tokens_consumed": 140553224192, "data/tokens_consumed_B": 140.553224192, "train/loss_slope": -1.349324376025881e-06} {"step": 67030, "timestamp": 1778266991.2614772, "train/loss": 2.0949710607528687, "train/z_loss": 0.0013770541758276523, "train/perplexity": 8.125205829470753, "train/grad_norm": 0.10498046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022489.919550543, "perf/iters_per_sec": 0.9643983457329478, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036915922164917, "data/tokens_consumed": 140574195712, "data/tokens_consumed_B": 140.574195712, "train/loss_slope": -5.36000215717528e-06} {"step": 67040, "timestamp": 1778267001.639924, "train/loss": 2.1263402938842773, "train/z_loss": 0.0013774524792097508, "train/perplexity": 8.38412715527031, "train/grad_norm": 0.11376953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021659.2949912404, "perf/iters_per_sec": 0.9640022730785562, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373419523239136, "data/tokens_consumed": 140595167232, "data/tokens_consumed_B": 140.595167232, "train/loss_slope": -6.5165020558986864e-06} {"step": 67050, "timestamp": 1778267012.0089486, "grad/layer_0/attn": 0.003418086562305689, "grad/layer_0/mlp": 0.0033735334873199463, "grad/layer_0/attn_mlp_ratio": 1.0132066196563756, "grad/layer_4/attn": 0.002744230441749096, "grad/layer_4/mlp": 0.0025721534620970488, "grad/layer_4/attn_mlp_ratio": 1.06689992471202, "grad/layer_8/attn": 0.005878790281713009, "grad/layer_8/mlp": 0.0034961372148245573, "grad/layer_8/attn_mlp_ratio": 1.681510121694982, "grad/layer_12/attn": 0.006452558096498251, "grad/layer_12/mlp": 0.006678704638034105, "grad/layer_12/attn_mlp_ratio": 0.9661391466749553, "grad/layer_16/attn": 0.0036971985828131437, "grad/layer_16/mlp": 0.005142244044691324, "grad/layer_16/attn_mlp_ratio": 0.7189854232475532, "grad/layer_20/attn": 0.003731426550075412, "grad/layer_20/mlp": 0.007478209678083658, "grad/layer_20/attn_mlp_ratio": 0.4989732383559338, "grad/layer_24/attn": 0.022560926154255867, "grad/layer_24/mlp": 0.01550793182104826, "grad/layer_24/attn_mlp_ratio": 1.4547991485334597, "grad/layer_27/attn": 0.0068594906479120255, "grad/layer_27/mlp": 0.015312768518924713, "grad/layer_27/attn_mlp_ratio": 0.4479588778893016} {"step": 67050, "timestamp": 1778267012.6104002, "eos/sharpness": 81.73887729644774, "eos/L0_probe": 1.9708682298660278, "eos/L_plus": 2.4469735622406006, "eos/L_minus": 2.3121516704559326, "eos/grad_norm": 0.3041083812713623, "eos/embed_grad_frac": 0.024442316964268684, "eos/time_s": 0.598707914352417} {"step": 67050, "timestamp": 1778267012.6297998, "train/loss": 2.1398956060409544, "train/z_loss": 0.0013659557327628136, "train/perplexity": 8.498550385254626, "train/grad_norm": 0.3046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1909732.6144703357, "perf/iters_per_sec": 0.910631472811859, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0981390714645385, "data/tokens_consumed": 140616138752, "data/tokens_consumed_B": 140.616138752, "train/loss_slope": -5.0046884294676824e-06} {"step": 67050, "timestamp": 1778267013.9885805, "geo/rankme_last": 437.630126953125, "geo/layer_0/stable_rank_q_proj": 19.364797592163086, "geo/layer_0/stable_rank_k_proj": 16.094682693481445, "geo/layer_0/stable_rank_o_proj": 46.88779830932617, "geo/layer_0/stable_rank_gate_proj": 129.7292938232422, "geo/layer_0/stable_rank_down_proj": 55.48339080810547, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0653558000922203, "geo/layer_0/attn_entropy_mean": 6.154416084289551, "geo/layer_0/attn_entropy_std": 0.4257994592189789, "geo/layer_7/stable_rank_q_proj": 43.281429290771484, "geo/layer_7/stable_rank_k_proj": 40.57789611816406, "geo/layer_7/stable_rank_o_proj": 89.87718200683594, "geo/layer_7/stable_rank_gate_proj": 80.1126480102539, "geo/layer_7/stable_rank_down_proj": 139.91046142578125, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.45533517003059387, "geo/layer_7/attn_entropy_mean": 4.649748802185059, "geo/layer_7/attn_entropy_std": 0.8014656901359558, "geo/layer_14/stable_rank_q_proj": 51.013397216796875, "geo/layer_14/stable_rank_k_proj": 40.23463821411133, "geo/layer_14/stable_rank_o_proj": 43.48564147949219, "geo/layer_14/stable_rank_gate_proj": 71.72867584228516, "geo/layer_14/stable_rank_down_proj": 128.81369018554688, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4013366997241974, "geo/layer_14/attn_entropy_mean": 5.530794143676758, "geo/layer_14/attn_entropy_std": 0.4033986032009125, "geo/layer_21/stable_rank_q_proj": 40.165306091308594, "geo/layer_21/stable_rank_k_proj": 30.1591739654541, "geo/layer_21/stable_rank_o_proj": 69.87992858886719, "geo/layer_21/stable_rank_gate_proj": 65.21607208251953, "geo/layer_21/stable_rank_down_proj": 50.749900817871094, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14399124681949615, "geo/layer_21/attn_entropy_mean": 5.686659812927246, "geo/layer_21/attn_entropy_std": 0.306703120470047, "geo/layer_27/stable_rank_q_proj": 43.05293273925781, "geo/layer_27/stable_rank_k_proj": 31.762243270874023, "geo/layer_27/stable_rank_o_proj": 115.00385284423828, "geo/layer_27/stable_rank_gate_proj": 79.84121704101562, "geo/layer_27/stable_rank_down_proj": 127.44035339355469, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09958434849977493, "geo/layer_27/attn_entropy_mean": 4.174371719360352, "geo/layer_27/attn_entropy_std": 0.7392449378967285, "attnres/final_alpha/block_0": 0.23408032953739166, "attnres/block_norm/0": 1.7683453559875488, "attnres/final_alpha/block_1": 0.004274630919098854, "attnres/block_norm/1": 47365.00390625, "attnres/final_alpha/block_2": 0.00988698285073042, "attnres/block_norm/2": 28825.474609375, "attnres/final_alpha/block_3": 0.0116534773260355, "attnres/block_norm/3": 59508.9765625, "attnres/final_alpha/block_4": 0.01412290520966053, "attnres/block_norm/4": 15400.73046875, "attnres/final_alpha/block_5": 0.6208623647689819, "attnres/block_norm/5": 6656.3232421875, "attnres/final_alpha/block_6": 0.10511934757232666, "attnres/block_norm/6": 39885.546875, "geo/tier1_time_s": 1.3549211025238037, "geo/step": 67050.0, "geo/rankme_slope": -0.00010126228225665266} {"step": 67060, "timestamp": 1778267024.3614175, "train/loss": 2.161776328086853, "train/z_loss": 0.0013908136403188109, "train/perplexity": 8.686554132135058, "train/grad_norm": 0.16015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1788170.9518498387, "perf/iters_per_sec": 0.8526663550614542, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1727916717529296, "data/tokens_consumed": 140637110272, "data/tokens_consumed_B": 140.637110272, "train/loss_slope": -1.2397667111987555e-06} {"step": 67070, "timestamp": 1778267034.7491384, "train/loss": 2.126616287231445, "train/z_loss": 0.0013773155049420893, "train/perplexity": 8.386441437935593, "train/grad_norm": 0.16015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019775.9652664666, "perf/iters_per_sec": 0.9631042314846356, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0383092164993286, "data/tokens_consumed": 140658081792, "data/tokens_consumed_B": 140.658081792, "train/loss_slope": -3.6870800622619696e-06} {"step": 67080, "timestamp": 1778267045.1332242, "train/loss": 2.1234954357147218, "train/z_loss": 0.00137818866642192, "train/perplexity": 8.36030939777257, "train/grad_norm": 0.2119140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021117.6580209043, "perf/iters_per_sec": 0.9637440004448434, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037619948387146, "data/tokens_consumed": 140679053312, "data/tokens_consumed_B": 140.679053312, "train/loss_slope": -3.938495422532725e-06} {"step": 67090, "timestamp": 1778267055.5114021, "train/loss": 2.1311988115310667, "train/z_loss": 0.0013664920115843415, "train/perplexity": 8.424960699932164, "train/grad_norm": 0.1494140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021757.5268144938, "perf/iters_per_sec": 0.9640491136620015, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0372915506362914, "data/tokens_consumed": 140700024832, "data/tokens_consumed_B": 140.700024832, "train/loss_slope": -3.82978717307239e-06} {"step": 67100, "timestamp": 1778267065.88067, "grad/layer_0/attn": 0.00356841366738081, "grad/layer_0/mlp": 0.003287919797003269, "grad/layer_0/attn_mlp_ratio": 1.0853104026753184, "grad/layer_4/attn": 0.002521135378628969, "grad/layer_4/mlp": 0.0024639111943542957, "grad/layer_4/attn_mlp_ratio": 1.023224896288188, "grad/layer_8/attn": 0.0032948742154985666, "grad/layer_8/mlp": 0.0036030313931405544, "grad/layer_8/attn_mlp_ratio": 0.9144727770965485, "grad/layer_12/attn": 0.004332227166742086, "grad/layer_12/mlp": 0.007473438512533903, "grad/layer_12/attn_mlp_ratio": 0.5796832477457422, "grad/layer_16/attn": 0.004283501300960779, "grad/layer_16/mlp": 0.005530846770852804, "grad/layer_16/attn_mlp_ratio": 0.7744747596492948, "grad/layer_20/attn": 0.004755303263664246, "grad/layer_20/mlp": 0.007463852409273386, "grad/layer_20/attn_mlp_ratio": 0.6371110974869973, "grad/layer_24/attn": 0.01607137732207775, "grad/layer_24/mlp": 0.012558738701045513, "grad/layer_24/attn_mlp_ratio": 1.2796967575071958, "grad/layer_27/attn": 0.015516950748860836, "grad/layer_27/mlp": 0.013645248487591743, "grad/layer_27/attn_mlp_ratio": 1.1371687843760587} {"step": 67100, "timestamp": 1778267065.8948681, "train/loss": 2.1655320644378664, "train/z_loss": 0.0013777769985608757, "train/perplexity": 8.71923988035967, "train/grad_norm": 0.29296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021334.4173282506, "perf/iters_per_sec": 0.9638473593369725, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0375086784362793, "data/tokens_consumed": 140720996352, "data/tokens_consumed_B": 140.720996352, "train/loss_slope": -5.244414447987052e-06} {"step": 67110, "timestamp": 1778267076.933175, "train/loss": 2.114204835891724, "train/z_loss": 0.0013872322277165949, "train/perplexity": 8.282996805045384, "train/grad_norm": 0.1298828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1901166.4194466088, "perf/iters_per_sec": 0.9065467927201313, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.103087019920349, "data/tokens_consumed": 140741967872, "data/tokens_consumed_B": 140.741967872, "train/loss_slope": -6.253125672579342e-06} {"step": 67120, "timestamp": 1778267087.8155305, "train/loss": 2.1677354097366335, "train/z_loss": 0.0013768024160526692, "train/perplexity": 8.738472556891804, "train/grad_norm": 0.103515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1928418.6339061125, "perf/iters_per_sec": 0.9195416612177432, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0874983072280884, "data/tokens_consumed": 140762939392, "data/tokens_consumed_B": 140.762939392, "train/loss_slope": -7.80011397956052e-06} {"step": 67125, "timestamp": 1778267093.5918412, "eos/sharpness": 82.30047225952147, "eos/L0_probe": 1.9730030298233032, "eos/L_plus": 2.4565136432647705, "eos/L_minus": 2.312497138977051, "eos/grad_norm": 0.2502428889274597, "eos/embed_grad_frac": 0.036494094878435135, "eos/time_s": 0.5964376926422119} {"step": 67125, "timestamp": 1778267094.9721558, "geo/rankme_last": 438.28338623046875, "geo/layer_0/stable_rank_q_proj": 19.345731735229492, "geo/layer_0/stable_rank_k_proj": 16.127201080322266, "geo/layer_0/stable_rank_o_proj": 46.87794494628906, "geo/layer_0/stable_rank_gate_proj": 129.3589324951172, "geo/layer_0/stable_rank_down_proj": 55.455745697021484, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.059383850544691086, "geo/layer_0/attn_entropy_mean": 6.159761428833008, "geo/layer_0/attn_entropy_std": 0.42669564485549927, "geo/layer_7/stable_rank_q_proj": 43.28385543823242, "geo/layer_7/stable_rank_k_proj": 40.615936279296875, "geo/layer_7/stable_rank_o_proj": 89.89585876464844, "geo/layer_7/stable_rank_gate_proj": 80.12109375, "geo/layer_7/stable_rank_down_proj": 139.9149169921875, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.43689608573913574, "geo/layer_7/attn_entropy_mean": 4.642542839050293, "geo/layer_7/attn_entropy_std": 0.8195489048957825, "geo/layer_14/stable_rank_q_proj": 51.03012466430664, "geo/layer_14/stable_rank_k_proj": 40.2921028137207, "geo/layer_14/stable_rank_o_proj": 43.4870491027832, "geo/layer_14/stable_rank_gate_proj": 71.74808502197266, "geo/layer_14/stable_rank_down_proj": 129.08958435058594, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4016130268573761, "geo/layer_14/attn_entropy_mean": 5.53688907623291, "geo/layer_14/attn_entropy_std": 0.41111597418785095, "geo/layer_21/stable_rank_q_proj": 40.234622955322266, "geo/layer_21/stable_rank_k_proj": 30.1220703125, "geo/layer_21/stable_rank_o_proj": 69.81385040283203, "geo/layer_21/stable_rank_gate_proj": 65.18682098388672, "geo/layer_21/stable_rank_down_proj": 50.78084945678711, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1439245641231537, "geo/layer_21/attn_entropy_mean": 5.684693336486816, "geo/layer_21/attn_entropy_std": 0.2990756928920746, "geo/layer_27/stable_rank_q_proj": 43.17237091064453, "geo/layer_27/stable_rank_k_proj": 31.867652893066406, "geo/layer_27/stable_rank_o_proj": 115.18603515625, "geo/layer_27/stable_rank_gate_proj": 79.82489013671875, "geo/layer_27/stable_rank_down_proj": 127.4654541015625, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09290749579668045, "geo/layer_27/attn_entropy_mean": 4.186399459838867, "geo/layer_27/attn_entropy_std": 0.743428111076355, "attnres/final_alpha/block_0": 0.2354307323694229, "attnres/block_norm/0": 1.768560528755188, "attnres/final_alpha/block_1": 0.004260207526385784, "attnres/block_norm/1": 47347.859375, "attnres/final_alpha/block_2": 0.01010872796177864, "attnres/block_norm/2": 28667.1484375, "attnres/final_alpha/block_3": 0.011942360550165176, "attnres/block_norm/3": 59609.3515625, "attnres/final_alpha/block_4": 0.014405768364667892, "attnres/block_norm/4": 15362.9658203125, "attnres/final_alpha/block_5": 0.6193519830703735, "attnres/block_norm/5": 6666.27734375, "attnres/final_alpha/block_6": 0.1045001745223999, "attnres/block_norm/6": 39767.96484375, "geo/tier1_time_s": 1.3606023788452148, "geo/step": 67125.0, "geo/rankme_slope": -8.23050118484894e-05} {"step": 67130, "timestamp": 1778267100.1626382, "train/loss": 2.145721673965454, "train/z_loss": 0.0013676140690222383, "train/perplexity": 8.548208031006475, "train/grad_norm": 0.306640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1699598.6551532636, "perf/iters_per_sec": 0.8104317928091352, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2339101314544678, "data/tokens_consumed": 140783910912, "data/tokens_consumed_B": 140.783910912, "train/loss_slope": -6.380260895581155e-06} {"step": 67140, "timestamp": 1778267110.5425177, "train/loss": 2.12022579908371, "train/z_loss": 0.0013716824003495276, "train/perplexity": 8.333018863297124, "train/grad_norm": 0.1484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021450.363394835, "perf/iters_per_sec": 0.9639026467298675, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0374491691589356, "data/tokens_consumed": 140804882432, "data/tokens_consumed_B": 140.804882432, "train/loss_slope": -7.378838200344547e-06} {"step": 67150, "timestamp": 1778267120.9143484, "grad/layer_0/attn": 0.002603644970804453, "grad/layer_0/mlp": 0.0028646711725741625, "grad/layer_0/attn_mlp_ratio": 0.908880888265007, "grad/layer_4/attn": 0.0023782949429005384, "grad/layer_4/mlp": 0.002577080624178052, "grad/layer_4/attn_mlp_ratio": 0.9228639679725534, "grad/layer_8/attn": 0.0033216930460184813, "grad/layer_8/mlp": 0.0036624986678361893, "grad/layer_8/attn_mlp_ratio": 0.9069472118842359, "grad/layer_12/attn": 0.004741439130157232, "grad/layer_12/mlp": 0.007293293252587318, "grad/layer_12/attn_mlp_ratio": 0.6501094774249266, "grad/layer_16/attn": 0.0036206452641636133, "grad/layer_16/mlp": 0.004915343597531319, "grad/layer_16/attn_mlp_ratio": 0.7366006299787431, "grad/layer_20/attn": 0.004982186481356621, "grad/layer_20/mlp": 0.00646474864333868, "grad/layer_20/attn_mlp_ratio": 0.7706697783870253, "grad/layer_24/attn": 0.016032898798584938, "grad/layer_24/mlp": 0.012033902108669281, "grad/layer_24/attn_mlp_ratio": 1.3323108764366358, "grad/layer_27/attn": 0.007340261712670326, "grad/layer_27/mlp": 0.011399281211197376, "grad/layer_27/attn_mlp_ratio": 0.6439232011460299} {"step": 67150, "timestamp": 1778267120.928839, "train/loss": 2.117923903465271, "train/z_loss": 0.0013828860828652978, "train/perplexity": 8.313859183938465, "train/grad_norm": 0.23828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020318.1825560508, "perf/iters_per_sec": 0.9633627808361296, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0380305528640748, "data/tokens_consumed": 140825853952, "data/tokens_consumed_B": 140.825853952, "train/loss_slope": -9.035655538706037e-06} {"step": 67160, "timestamp": 1778267131.3055265, "train/loss": 2.164269506931305, "train/z_loss": 0.0013795518316328526, "train/perplexity": 8.708238285131955, "train/grad_norm": 0.15625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022435.0937540273, "perf/iters_per_sec": 0.964372202755941, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036944031715393, "data/tokens_consumed": 140846825472, "data/tokens_consumed_B": 140.846825472, "train/loss_slope": -9.67640810244585e-06} {"step": 67170, "timestamp": 1778267142.182201, "train/loss": 2.141146206855774, "train/z_loss": 0.0013805193011648953, "train/perplexity": 8.509185327939113, "train/grad_norm": 0.1611328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1929279.7105567092, "perf/iters_per_sec": 0.9199522545608088, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0870129346847535, "data/tokens_consumed": 140867796992, "data/tokens_consumed_B": 140.867796992, "train/loss_slope": -1.1715323153180207e-05} {"step": 67180, "timestamp": 1778267152.886408, "train/loss": 2.179857409000397, "train/z_loss": 0.001369879674166441, "train/perplexity": 8.845044944996785, "train/grad_norm": 0.11572265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1960020.947270156, "perf/iters_per_sec": 0.9346108185148982, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0699640750885009, "data/tokens_consumed": 140888768512, "data/tokens_consumed_B": 140.888768512, "train/loss_slope": -1.1620430276803776e-05} {"step": 67190, "timestamp": 1778267163.2740607, "train/loss": 2.1950787544250487, "train/z_loss": 0.0013562103849835694, "train/perplexity": 8.980708299488416, "train/grad_norm": 0.287109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020360.039283138, "perf/iters_per_sec": 0.9633827396789255, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0380090475082397, "data/tokens_consumed": 140909740032, "data/tokens_consumed_B": 140.909740032, "train/loss_slope": -7.87280130200367e-06} {"step": 67200, "timestamp": 1778267173.6425774, "grad/layer_0/attn": 0.0031340515706688166, "grad/layer_0/mlp": 0.003225131891667843, "grad/layer_0/attn_mlp_ratio": 0.971759164823538, "grad/layer_4/attn": 0.0025324432644993067, "grad/layer_4/mlp": 0.0025601075030863285, "grad/layer_4/attn_mlp_ratio": 0.989194071939136, "grad/layer_8/attn": 0.004766029771417379, "grad/layer_8/mlp": 0.0036812806501984596, "grad/layer_8/attn_mlp_ratio": 1.2946662031034826, "grad/layer_12/attn": 0.0066303289495408535, "grad/layer_12/mlp": 0.006986252963542938, "grad/layer_12/attn_mlp_ratio": 0.9490536471031319, "grad/layer_16/attn": 0.006240307353436947, "grad/layer_16/mlp": 0.004561941605061293, "grad/layer_16/attn_mlp_ratio": 1.3679059832162201, "grad/layer_20/attn": 0.005022798199206591, "grad/layer_20/mlp": 0.006299084983766079, "grad/layer_20/attn_mlp_ratio": 0.797385355557632, "grad/layer_24/attn": 0.006869450211524963, "grad/layer_24/mlp": 0.009019946679472923, "grad/layer_24/attn_mlp_ratio": 0.7615843396280413, "grad/layer_27/attn": 0.009915863163769245, "grad/layer_27/mlp": 0.007836943492293358, "grad/layer_27/attn_mlp_ratio": 1.2652717283202397} {"step": 67200, "timestamp": 1778267174.2279503, "eos/sharpness": 50.83684921264648, "eos/L0_probe": 1.971998691558838, "eos/L_plus": 2.2619009017944336, "eos/L_minus": 2.190464973449707, "eos/grad_norm": 0.12686388194561005, "eos/embed_grad_frac": 0.15728536248207092, "eos/time_s": 0.5824923515319824} {"step": 67200, "timestamp": 1778267174.2457688, "train/loss": 2.1736511468887327, "train/z_loss": 0.0013698337599635125, "train/perplexity": 8.790320271169113, "train/grad_norm": 0.126953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1912578.7298985121, "perf/iters_per_sec": 0.9119886064045487, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0965049266815186, "data/tokens_consumed": 140930711552, "data/tokens_consumed_B": 140.930711552, "train/loss_slope": -5.964480446438665e-06} {"step": 67200, "timestamp": 1778267175.604709, "geo/rankme_last": 438.5194091796875, "geo/layer_0/stable_rank_q_proj": 19.323497772216797, "geo/layer_0/stable_rank_k_proj": 16.116830825805664, "geo/layer_0/stable_rank_o_proj": 46.92356491088867, "geo/layer_0/stable_rank_gate_proj": 129.3345947265625, "geo/layer_0/stable_rank_down_proj": 55.32975769042969, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06210872903466225, "geo/layer_0/attn_entropy_mean": 6.155160903930664, "geo/layer_0/attn_entropy_std": 0.4208250045776367, "geo/layer_7/stable_rank_q_proj": 43.352745056152344, "geo/layer_7/stable_rank_k_proj": 40.501277923583984, "geo/layer_7/stable_rank_o_proj": 89.98077392578125, "geo/layer_7/stable_rank_gate_proj": 79.92400360107422, "geo/layer_7/stable_rank_down_proj": 140.0291290283203, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.44922226667404175, "geo/layer_7/attn_entropy_mean": 4.646056175231934, "geo/layer_7/attn_entropy_std": 0.7865170836448669, "geo/layer_14/stable_rank_q_proj": 51.004547119140625, "geo/layer_14/stable_rank_k_proj": 40.37222671508789, "geo/layer_14/stable_rank_o_proj": 43.52071762084961, "geo/layer_14/stable_rank_gate_proj": 71.80367279052734, "geo/layer_14/stable_rank_down_proj": 128.97019958496094, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3924221396446228, "geo/layer_14/attn_entropy_mean": 5.591217994689941, "geo/layer_14/attn_entropy_std": 0.40163031220436096, "geo/layer_21/stable_rank_q_proj": 40.235477447509766, "geo/layer_21/stable_rank_k_proj": 30.182682037353516, "geo/layer_21/stable_rank_o_proj": 69.75672912597656, "geo/layer_21/stable_rank_gate_proj": 65.15074920654297, "geo/layer_21/stable_rank_down_proj": 50.756534576416016, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14154966175556183, "geo/layer_21/attn_entropy_mean": 5.711001396179199, "geo/layer_21/attn_entropy_std": 0.2912347614765167, "geo/layer_27/stable_rank_q_proj": 43.14681625366211, "geo/layer_27/stable_rank_k_proj": 31.925899505615234, "geo/layer_27/stable_rank_o_proj": 115.1787338256836, "geo/layer_27/stable_rank_gate_proj": 79.79833984375, "geo/layer_27/stable_rank_down_proj": 127.65232849121094, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09001000970602036, "geo/layer_27/attn_entropy_mean": 4.204031944274902, "geo/layer_27/attn_entropy_std": 0.7466104626655579, "attnres/final_alpha/block_0": 0.23745408654212952, "attnres/block_norm/0": 1.7688461542129517, "attnres/final_alpha/block_1": 0.004278023727238178, "attnres/block_norm/1": 47119.8515625, "attnres/final_alpha/block_2": 0.010102309286594391, "attnres/block_norm/2": 28803.265625, "attnres/final_alpha/block_3": 0.011968713253736496, "attnres/block_norm/3": 59605.78515625, "attnres/final_alpha/block_4": 0.014520404860377312, "attnres/block_norm/4": 15339.353515625, "attnres/final_alpha/block_5": 0.616001546382904, "attnres/block_norm/5": 6694.9892578125, "attnres/final_alpha/block_6": 0.10567489266395569, "attnres/block_norm/6": 39545.69140625, "geo/tier1_time_s": 1.3554954528808594, "geo/step": 67200.0, "geo/rankme_slope": -7.015499559198679e-05} {"step": 67210, "timestamp": 1778267186.6070783, "train/loss": 2.1396585941314696, "train/z_loss": 0.001383314561098814, "train/perplexity": 8.496536366282633, "train/grad_norm": 0.1474609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1697012.2447916204, "perf/iters_per_sec": 0.8091984962423422, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.235790729522705, "data/tokens_consumed": 140951683072, "data/tokens_consumed_B": 140.951683072, "train/loss_slope": -5.35288316295772e-06} {"step": 67220, "timestamp": 1778267196.9877582, "train/loss": 2.169371747970581, "train/z_loss": 0.0013691599131561815, "train/perplexity": 8.752783359106138, "train/grad_norm": 0.1787109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021664.08089954, "perf/iters_per_sec": 0.9640045551774693, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373394966125489, "data/tokens_consumed": 140972654592, "data/tokens_consumed_B": 140.972654592, "train/loss_slope": -1.6576902593824025e-06} {"step": 67230, "timestamp": 1778267207.3736527, "train/loss": 2.2078032970428465, "train/z_loss": 0.0013593697804026306, "train/perplexity": 9.09571384974967, "train/grad_norm": 0.263671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020150.2626369672, "perf/iters_per_sec": 0.9632827103791081, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0381168365478515, "data/tokens_consumed": 140993626112, "data/tokens_consumed_B": 140.993626112, "train/loss_slope": 2.1105785181026434e-06} {"step": 67240, "timestamp": 1778267217.757531, "train/loss": 2.0882149815559385, "train/z_loss": 0.0013745169038884343, "train/perplexity": 8.070496314403165, "train/grad_norm": 0.26171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021016.2843130862, "perf/iters_per_sec": 0.9636956616940909, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0376719951629638, "data/tokens_consumed": 141014597632, "data/tokens_consumed_B": 141.014597632, "train/loss_slope": -1.4242954737711804e-06} {"step": 67250, "timestamp": 1778267228.1302445, "grad/layer_0/attn": 0.0026142096612602472, "grad/layer_0/mlp": 0.0029355299193412066, "grad/layer_0/attn_mlp_ratio": 0.8905409394678682, "grad/layer_4/attn": 0.00279659079387784, "grad/layer_4/mlp": 0.0025072693824768066, "grad/layer_4/attn_mlp_ratio": 1.1153929856455747, "grad/layer_8/attn": 0.0039474028162658215, "grad/layer_8/mlp": 0.003645550925284624, "grad/layer_8/attn_mlp_ratio": 1.0828000455589921, "grad/layer_12/attn": 0.005447969306260347, "grad/layer_12/mlp": 0.007291574962437153, "grad/layer_12/attn_mlp_ratio": 0.7471594627511666, "grad/layer_16/attn": 0.004437907598912716, "grad/layer_16/mlp": 0.004774858243763447, "grad/layer_16/attn_mlp_ratio": 0.9294323055068576, "grad/layer_20/attn": 0.00510172825306654, "grad/layer_20/mlp": 0.006441864650696516, "grad/layer_20/attn_mlp_ratio": 0.7919645088039025, "grad/layer_24/attn": 0.010854513384401798, "grad/layer_24/mlp": 0.010828167200088501, "grad/layer_24/attn_mlp_ratio": 1.002433106506683, "grad/layer_27/attn": 0.00486944941803813, "grad/layer_27/mlp": 0.010788344778120518, "grad/layer_27/attn_mlp_ratio": 0.4513620460830556} {"step": 67250, "timestamp": 1778267228.144582, "train/loss": 2.2111857891082765, "train/z_loss": 0.0013568135909736156, "train/perplexity": 9.126532121572527, "train/grad_norm": 0.1689453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020374.3786272928, "perf/iters_per_sec": 0.9633895772110428, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0380016803741454, "data/tokens_consumed": 141035569152, "data/tokens_consumed_B": 141.035569152, "train/loss_slope": 1.1270052028281602e-06} {"step": 67260, "timestamp": 1778267238.518416, "train/loss": 2.1896334648132325, "train/z_loss": 0.001380087563302368, "train/perplexity": 8.931938644831817, "train/grad_norm": 0.1748046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022558.9326667453, "perf/iters_per_sec": 0.9644312537511565, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0368805408477784, "data/tokens_consumed": 141056540672, "data/tokens_consumed_B": 141.056540672, "train/loss_slope": 1.0348723094240777e-06} {"step": 67270, "timestamp": 1778267248.8980277, "train/loss": 2.1647442102432253, "train/z_loss": 0.0013791143079288303, "train/perplexity": 8.712373096013463, "train/grad_norm": 0.177734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021506.7618428685, "perf/iters_per_sec": 0.9639295396055548, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0374202251434326, "data/tokens_consumed": 141077512192, "data/tokens_consumed_B": 141.077512192, "train/loss_slope": 4.419978946098563e-07} {"step": 67275, "timestamp": 1778267254.6638715, "eos/sharpness": 26.394653320312496, "eos/L0_probe": 1.969955563545227, "eos/L_plus": 2.121434450149536, "eos/L_minus": 2.082423210144043, "eos/grad_norm": 0.12052614986896515, "eos/embed_grad_frac": 0.17272889614105225, "eos/time_s": 0.5855276584625244} {"step": 67275, "timestamp": 1778267256.0399058, "geo/rankme_last": 437.9808654785156, "geo/layer_0/stable_rank_q_proj": 19.346311569213867, "geo/layer_0/stable_rank_k_proj": 16.104278564453125, "geo/layer_0/stable_rank_o_proj": 46.97084045410156, "geo/layer_0/stable_rank_gate_proj": 129.5526580810547, "geo/layer_0/stable_rank_down_proj": 55.29892349243164, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06413254886865616, "geo/layer_0/attn_entropy_mean": 6.155153274536133, "geo/layer_0/attn_entropy_std": 0.4228169023990631, "geo/layer_7/stable_rank_q_proj": 43.33740997314453, "geo/layer_7/stable_rank_k_proj": 40.620140075683594, "geo/layer_7/stable_rank_o_proj": 90.09906005859375, "geo/layer_7/stable_rank_gate_proj": 79.96529388427734, "geo/layer_7/stable_rank_down_proj": 140.59808349609375, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4529399871826172, "geo/layer_7/attn_entropy_mean": 4.679146766662598, "geo/layer_7/attn_entropy_std": 0.7854411602020264, "geo/layer_14/stable_rank_q_proj": 50.970947265625, "geo/layer_14/stable_rank_k_proj": 40.412681579589844, "geo/layer_14/stable_rank_o_proj": 43.5241584777832, "geo/layer_14/stable_rank_gate_proj": 71.75703430175781, "geo/layer_14/stable_rank_down_proj": 129.23875427246094, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4048994481563568, "geo/layer_14/attn_entropy_mean": 5.5304856300354, "geo/layer_14/attn_entropy_std": 0.3992704153060913, "geo/layer_21/stable_rank_q_proj": 40.17850875854492, "geo/layer_21/stable_rank_k_proj": 30.194339752197266, "geo/layer_21/stable_rank_o_proj": 69.57218933105469, "geo/layer_21/stable_rank_gate_proj": 65.11714172363281, "geo/layer_21/stable_rank_down_proj": 50.707427978515625, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1451542228460312, "geo/layer_21/attn_entropy_mean": 5.680332660675049, "geo/layer_21/attn_entropy_std": 0.3015015721321106, "geo/layer_27/stable_rank_q_proj": 43.16414260864258, "geo/layer_27/stable_rank_k_proj": 31.96310043334961, "geo/layer_27/stable_rank_o_proj": 115.09059143066406, "geo/layer_27/stable_rank_gate_proj": 79.68353271484375, "geo/layer_27/stable_rank_down_proj": 127.44041442871094, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09560026228427887, "geo/layer_27/attn_entropy_mean": 4.177532196044922, "geo/layer_27/attn_entropy_std": 0.7551552057266235, "attnres/final_alpha/block_0": 0.2374226301908493, "attnres/block_norm/0": 1.7689404487609863, "attnres/final_alpha/block_1": 0.004384086467325687, "attnres/block_norm/1": 47254.45703125, "attnres/final_alpha/block_2": 0.010144544765353203, "attnres/block_norm/2": 28690.853515625, "attnres/final_alpha/block_3": 0.011876439675688744, "attnres/block_norm/3": 59705.4140625, "attnres/final_alpha/block_4": 0.014384064823389053, "attnres/block_norm/4": 15388.8974609375, "attnres/final_alpha/block_5": 0.6137048006057739, "attnres/block_norm/5": 6750.40576171875, "attnres/final_alpha/block_6": 0.10808347165584564, "attnres/block_norm/6": 39679.14453125, "geo/tier1_time_s": 1.3586194515228271, "geo/step": 67275.0, "geo/rankme_slope": -9.927560868097239e-05} {"step": 67280, "timestamp": 1778267261.2300372, "train/loss": 2.1583115577697756, "train/z_loss": 0.0013673057314008475, "train/perplexity": 8.656509296504117, "train/grad_norm": 0.09765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1701569.563747403, "perf/iters_per_sec": 0.8113715952622428, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2324809074401855, "data/tokens_consumed": 141098483712, "data/tokens_consumed_B": 141.098483712, "train/loss_slope": -2.007729156647512e-06} {"step": 67290, "timestamp": 1778267271.6114788, "train/loss": 2.1545344829559325, "train/z_loss": 0.001370728062465787, "train/perplexity": 8.623874683749348, "train/grad_norm": 0.1318359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021238.6882336405, "perf/iters_per_sec": 0.9638017121475413, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0375578165054322, "data/tokens_consumed": 141119455232, "data/tokens_consumed_B": 141.119455232, "train/loss_slope": 2.703144977373864e-06} {"step": 67300, "timestamp": 1778267281.9812746, "grad/layer_0/attn": 0.0029331608675420284, "grad/layer_0/mlp": 0.003189853159710765, "grad/layer_0/attn_mlp_ratio": 0.9195284637663829, "grad/layer_4/attn": 0.003579042386263609, "grad/layer_4/mlp": 0.002611248753964901, "grad/layer_4/attn_mlp_ratio": 1.3706247801044382, "grad/layer_8/attn": 0.003740469692274928, "grad/layer_8/mlp": 0.003746585687622428, "grad/layer_8/attn_mlp_ratio": 0.9983675549702598, "grad/layer_12/attn": 0.0075820735655725, "grad/layer_12/mlp": 0.006840419489890337, "grad/layer_12/attn_mlp_ratio": 1.1084222928047098, "grad/layer_16/attn": 0.0040924036875367165, "grad/layer_16/mlp": 0.004723059944808483, "grad/layer_16/attn_mlp_ratio": 0.8664729325292033, "grad/layer_20/attn": 0.004753997083753347, "grad/layer_20/mlp": 0.006488954648375511, "grad/layer_20/attn_mlp_ratio": 0.7326291010032849, "grad/layer_24/attn": 0.012252331711351871, "grad/layer_24/mlp": 0.009421334601938725, "grad/layer_24/attn_mlp_ratio": 1.3004878925308305, "grad/layer_27/attn": 0.004412743728607893, "grad/layer_27/mlp": 0.007831137627363205, "grad/layer_27/attn_mlp_ratio": 0.5634869264511956} {"step": 67300, "timestamp": 1778267281.9956253, "train/loss": 2.1584533572196962, "train/z_loss": 0.0013782196794636548, "train/perplexity": 8.657736871793288, "train/grad_norm": 0.1279296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020888.5948484703, "perf/iters_per_sec": 0.963634774612651, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0377375602722168, "data/tokens_consumed": 141140426752, "data/tokens_consumed_B": 141.140426752, "train/loss_slope": 3.105023520292572e-06} {"step": 67310, "timestamp": 1778267292.3788357, "train/loss": 2.1226541638374328, "train/z_loss": 0.0013905350351706147, "train/perplexity": 8.35327906221722, "train/grad_norm": 0.1728515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021025.5714490467, "perf/iters_per_sec": 0.9637000901456102, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0376672267913818, "data/tokens_consumed": 141161398272, "data/tokens_consumed_B": 141.161398272, "train/loss_slope": -4.605410873728961e-06} {"step": 67320, "timestamp": 1778267302.7578986, "train/loss": 2.1105863213539124, "train/z_loss": 0.0013867837260477245, "train/perplexity": 8.2530788226614, "train/grad_norm": 0.095703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021599.2175068825, "perf/iters_per_sec": 0.963973625901643, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373727798461914, "data/tokens_consumed": 141182369792, "data/tokens_consumed_B": 141.182369792, "train/loss_slope": -5.563824798407708e-06} {"step": 67330, "timestamp": 1778267313.142233, "train/loss": 2.1167606115341187, "train/z_loss": 0.0013718580128625035, "train/perplexity": 8.30419336180952, "train/grad_norm": 0.2470703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020828.3774530226, "perf/iters_per_sec": 0.9636060607209314, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0377684831619263, "data/tokens_consumed": 141203341312, "data/tokens_consumed_B": 141.203341312, "train/loss_slope": -4.648684761454785e-06} {"step": 67340, "timestamp": 1778267323.4804204, "train/loss": 2.121733009815216, "train/z_loss": 0.0013771995087154209, "train/perplexity": 8.345587948504367, "train/grad_norm": 0.1005859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029910.779535651, "perf/iters_per_sec": 0.96793688751967, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0331252098083497, "data/tokens_consumed": 141224312832, "data/tokens_consumed_B": 141.224312832, "train/loss_slope": -6.298169523182445e-06} {"step": 67350, "timestamp": 1778267333.8074982, "grad/layer_0/attn": 0.0026263201143592596, "grad/layer_0/mlp": 0.0030096268747001886, "grad/layer_0/attn_mlp_ratio": 0.8726397445387354, "grad/layer_4/attn": 0.002640127670019865, "grad/layer_4/mlp": 0.0026260940358042717, "grad/layer_4/attn_mlp_ratio": 1.0053438808701711, "grad/layer_8/attn": 0.00371800665743649, "grad/layer_8/mlp": 0.003592314664274454, "grad/layer_8/attn_mlp_ratio": 1.034989109086999, "grad/layer_12/attn": 0.006388249807059765, "grad/layer_12/mlp": 0.007084370590746403, "grad/layer_12/attn_mlp_ratio": 0.9017385009799234, "grad/layer_16/attn": 0.0038287141360342503, "grad/layer_16/mlp": 0.004661789629608393, "grad/layer_16/attn_mlp_ratio": 0.8212970464362571, "grad/layer_20/attn": 0.00864887610077858, "grad/layer_20/mlp": 0.0069055515341460705, "grad/layer_20/attn_mlp_ratio": 1.2524526003125145, "grad/layer_24/attn": 0.01220285426825285, "grad/layer_24/mlp": 0.009937751106917858, "grad/layer_24/attn_mlp_ratio": 1.227929137505295, "grad/layer_27/attn": 0.009248143061995506, "grad/layer_27/mlp": 0.01037778239697218, "grad/layer_27/attn_mlp_ratio": 0.8911482838162915} {"step": 67350, "timestamp": 1778267334.3969536, "eos/sharpness": 47.668623924255364, "eos/L0_probe": 1.9695781469345093, "eos/L_plus": 2.2611773014068604, "eos/L_minus": 2.154665231704712, "eos/grad_norm": 0.1675981730222702, "eos/embed_grad_frac": 0.17869986593723297, "eos/time_s": 0.586559534072876} {"step": 67350, "timestamp": 1778267334.4171655, "train/loss": 2.142092490196228, "train/z_loss": 0.0013821797678247095, "train/perplexity": 8.517241239241967, "train/grad_norm": 0.16796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1918183.9147342683, "perf/iters_per_sec": 0.9146613668128339, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0933007955551148, "data/tokens_consumed": 141245284352, "data/tokens_consumed_B": 141.245284352, "train/loss_slope": -5.225323672198739e-06} {"step": 67350, "timestamp": 1778267335.781531, "geo/rankme_last": 439.11029052734375, "geo/layer_0/stable_rank_q_proj": 19.343448638916016, "geo/layer_0/stable_rank_k_proj": 16.096614837646484, "geo/layer_0/stable_rank_o_proj": 46.99313735961914, "geo/layer_0/stable_rank_gate_proj": 129.69142150878906, "geo/layer_0/stable_rank_down_proj": 55.24052810668945, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06243816763162613, "geo/layer_0/attn_entropy_mean": 6.15815544128418, "geo/layer_0/attn_entropy_std": 0.4211326837539673, "geo/layer_7/stable_rank_q_proj": 43.342628479003906, "geo/layer_7/stable_rank_k_proj": 40.55253601074219, "geo/layer_7/stable_rank_o_proj": 90.25047302246094, "geo/layer_7/stable_rank_gate_proj": 79.89167785644531, "geo/layer_7/stable_rank_down_proj": 140.4354705810547, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4548771381378174, "geo/layer_7/attn_entropy_mean": 4.641122817993164, "geo/layer_7/attn_entropy_std": 0.8129661679267883, "geo/layer_14/stable_rank_q_proj": 50.965789794921875, "geo/layer_14/stable_rank_k_proj": 40.40701675415039, "geo/layer_14/stable_rank_o_proj": 43.49292755126953, "geo/layer_14/stable_rank_gate_proj": 71.67898559570312, "geo/layer_14/stable_rank_down_proj": 129.33460998535156, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3867506682872772, "geo/layer_14/attn_entropy_mean": 5.518045425415039, "geo/layer_14/attn_entropy_std": 0.42036300897598267, "geo/layer_21/stable_rank_q_proj": 40.22692108154297, "geo/layer_21/stable_rank_k_proj": 30.13392448425293, "geo/layer_21/stable_rank_o_proj": 69.47868347167969, "geo/layer_21/stable_rank_gate_proj": 65.11175537109375, "geo/layer_21/stable_rank_down_proj": 50.708011627197266, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1465565413236618, "geo/layer_21/attn_entropy_mean": 5.691739082336426, "geo/layer_21/attn_entropy_std": 0.2940046787261963, "geo/layer_27/stable_rank_q_proj": 43.322879791259766, "geo/layer_27/stable_rank_k_proj": 31.960250854492188, "geo/layer_27/stable_rank_o_proj": 114.99818420410156, "geo/layer_27/stable_rank_gate_proj": 79.73463439941406, "geo/layer_27/stable_rank_down_proj": 127.53707885742188, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09440676867961884, "geo/layer_27/attn_entropy_mean": 4.188668251037598, "geo/layer_27/attn_entropy_std": 0.7496448755264282, "attnres/final_alpha/block_0": 0.23612774908542633, "attnres/block_norm/0": 1.7691978216171265, "attnres/final_alpha/block_1": 0.004301406443119049, "attnres/block_norm/1": 47244.0390625, "attnres/final_alpha/block_2": 0.009910011664032936, "attnres/block_norm/2": 28835.048828125, "attnres/final_alpha/block_3": 0.011715047992765903, "attnres/block_norm/3": 59708.421875, "attnres/final_alpha/block_4": 0.014325086027383804, "attnres/block_norm/4": 15374.873046875, "attnres/final_alpha/block_5": 0.6184340119361877, "attnres/block_norm/5": 6679.49951171875, "attnres/final_alpha/block_6": 0.10518666356801987, "attnres/block_norm/6": 39619.203125, "geo/tier1_time_s": 1.3611421585083008, "geo/step": 67350.0, "geo/rankme_slope": -6.187601993922569e-05} {"step": 67360, "timestamp": 1778267346.1363294, "train/loss": 2.179919385910034, "train/z_loss": 0.0013762056827545167, "train/perplexity": 8.845593150535946, "train/grad_norm": 0.115234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1790005.8093596818, "perf/iters_per_sec": 0.8535412833021554, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1715894937515259, "data/tokens_consumed": 141266255872, "data/tokens_consumed_B": 141.266255872, "train/loss_slope": -7.05841972489559e-06} {"step": 67370, "timestamp": 1778267356.4842556, "train/loss": 2.1291566371917723, "train/z_loss": 0.0013815924292430281, "train/perplexity": 8.407773017476632, "train/grad_norm": 0.10595703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028029.913403393, "perf/iters_per_sec": 0.9670400206582036, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034083366394043, "data/tokens_consumed": 141287227392, "data/tokens_consumed_B": 141.287227392, "train/loss_slope": -6.3594536490887874e-06} {"step": 67380, "timestamp": 1778267366.8322098, "train/loss": 2.17628960609436, "train/z_loss": 0.0013728835969232024, "train/perplexity": 8.813543796298344, "train/grad_norm": 0.2021484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028091.1218916678, "perf/iters_per_sec": 0.9670692071398105, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340521574020385, "data/tokens_consumed": 141308198912, "data/tokens_consumed_B": 141.308198912, "train/loss_slope": -3.832708526723471e-06} {"step": 67390, "timestamp": 1778267377.1889193, "train/loss": 2.1993227005004883, "train/z_loss": 0.00136433927109465, "train/perplexity": 9.018902931883634, "train/grad_norm": 0.2021484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025859.0065426999, "perf/iters_per_sec": 0.966004851600027, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351914882659912, "data/tokens_consumed": 141329170432, "data/tokens_consumed_B": 141.329170432, "train/loss_slope": 1.419621024659711e-06} {"step": 67400, "timestamp": 1778267387.5411005, "grad/layer_0/attn": 0.0029159674886614084, "grad/layer_0/mlp": 0.003120680572465062, "grad/layer_0/attn_mlp_ratio": 0.9344011113953712, "grad/layer_4/attn": 0.002936759265139699, "grad/layer_4/mlp": 0.0026271718088537455, "grad/layer_4/attn_mlp_ratio": 1.1178405399519626, "grad/layer_8/attn": 0.00623167073354125, "grad/layer_8/mlp": 0.003633245127275586, "grad/layer_8/attn_mlp_ratio": 1.7151803260508542, "grad/layer_12/attn": 0.004966091830283403, "grad/layer_12/mlp": 0.006547324359416962, "grad/layer_12/attn_mlp_ratio": 0.7584917871514242, "grad/layer_16/attn": 0.0033241622149944305, "grad/layer_16/mlp": 0.004600665997713804, "grad/layer_16/attn_mlp_ratio": 0.7225393333035617, "grad/layer_20/attn": 0.004646369256079197, "grad/layer_20/mlp": 0.006518680136650801, "grad/layer_20/attn_mlp_ratio": 0.7127776002810089, "grad/layer_24/attn": 0.011998089961707592, "grad/layer_24/mlp": 0.011443949304521084, "grad/layer_24/attn_mlp_ratio": 1.0484221432303422, "grad/layer_27/attn": 0.010391985066235065, "grad/layer_27/mlp": 0.011830126866698265, "grad/layer_27/attn_mlp_ratio": 0.8784339420437701} {"step": 67400, "timestamp": 1778267387.5554929, "train/loss": 2.1769705057144164, "train/z_loss": 0.0013653825619257987, "train/perplexity": 8.819546978470887, "train/grad_norm": 0.20703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024325.6242480346, "perf/iters_per_sec": 0.9652736779441998, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0359756231307984, "data/tokens_consumed": 141350141952, "data/tokens_consumed_B": 141.350141952, "train/loss_slope": 3.589244300883522e-06} {"step": 67410, "timestamp": 1778267397.9022994, "train/loss": 2.174547481536865, "train/z_loss": 0.0013630738365463912, "train/perplexity": 8.798202871992709, "train/grad_norm": 0.1953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028407.0893895207, "perf/iters_per_sec": 0.9672198721835712, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0338910818099976, "data/tokens_consumed": 141371113472, "data/tokens_consumed_B": 141.371113472, "train/loss_slope": 6.465262569824661e-06} {"step": 67420, "timestamp": 1778267408.2549744, "train/loss": 2.1868078470230103, "train/z_loss": 0.001370611391030252, "train/perplexity": 8.906736023352, "train/grad_norm": 0.2392578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026817.8209616186, "perf/iters_per_sec": 0.9664620499427884, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347017765045166, "data/tokens_consumed": 141392084992, "data/tokens_consumed_B": 141.392084992, "train/loss_slope": 8.419959265442557e-06} {"step": 67425, "timestamp": 1778267414.0236218, "eos/sharpness": 87.76102066040038, "eos/L0_probe": 1.9672086238861084, "eos/L_plus": 2.4939990043640137, "eos/L_minus": 2.318028450012207, "eos/grad_norm": 0.27830442786216736, "eos/embed_grad_frac": 0.031524233520030975, "eos/time_s": 0.5995244979858398} {"step": 67425, "timestamp": 1778267415.3997557, "geo/rankme_last": 438.88446044921875, "geo/layer_0/stable_rank_q_proj": 19.314678192138672, "geo/layer_0/stable_rank_k_proj": 16.121999740600586, "geo/layer_0/stable_rank_o_proj": 46.95766067504883, "geo/layer_0/stable_rank_gate_proj": 129.71456909179688, "geo/layer_0/stable_rank_down_proj": 55.20563888549805, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06177885830402374, "geo/layer_0/attn_entropy_mean": 6.156409740447998, "geo/layer_0/attn_entropy_std": 0.42313769459724426, "geo/layer_7/stable_rank_q_proj": 43.42567825317383, "geo/layer_7/stable_rank_k_proj": 40.52964401245117, "geo/layer_7/stable_rank_o_proj": 90.26802062988281, "geo/layer_7/stable_rank_gate_proj": 79.81676483154297, "geo/layer_7/stable_rank_down_proj": 140.22348022460938, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4582197070121765, "geo/layer_7/attn_entropy_mean": 4.635551929473877, "geo/layer_7/attn_entropy_std": 0.8010112047195435, "geo/layer_14/stable_rank_q_proj": 50.96146011352539, "geo/layer_14/stable_rank_k_proj": 40.3801155090332, "geo/layer_14/stable_rank_o_proj": 43.52341079711914, "geo/layer_14/stable_rank_gate_proj": 71.60717010498047, "geo/layer_14/stable_rank_down_proj": 129.24099731445312, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38101568818092346, "geo/layer_14/attn_entropy_mean": 5.52798318862915, "geo/layer_14/attn_entropy_std": 0.4027693569660187, "geo/layer_21/stable_rank_q_proj": 40.20042037963867, "geo/layer_21/stable_rank_k_proj": 30.12473487854004, "geo/layer_21/stable_rank_o_proj": 69.44821166992188, "geo/layer_21/stable_rank_gate_proj": 65.15509033203125, "geo/layer_21/stable_rank_down_proj": 50.7273063659668, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14381784200668335, "geo/layer_21/attn_entropy_mean": 5.690742492675781, "geo/layer_21/attn_entropy_std": 0.30869367718696594, "geo/layer_27/stable_rank_q_proj": 43.23381805419922, "geo/layer_27/stable_rank_k_proj": 32.00582504272461, "geo/layer_27/stable_rank_o_proj": 115.0516128540039, "geo/layer_27/stable_rank_gate_proj": 79.779052734375, "geo/layer_27/stable_rank_down_proj": 127.3711929321289, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09734241664409637, "geo/layer_27/attn_entropy_mean": 4.172357559204102, "geo/layer_27/attn_entropy_std": 0.7575105428695679, "attnres/final_alpha/block_0": 0.23572072386741638, "attnres/block_norm/0": 1.769026279449463, "attnres/final_alpha/block_1": 0.004233472980558872, "attnres/block_norm/1": 47496.32421875, "attnres/final_alpha/block_2": 0.010047445073723793, "attnres/block_norm/2": 28630.861328125, "attnres/final_alpha/block_3": 0.01181158609688282, "attnres/block_norm/3": 59531.390625, "attnres/final_alpha/block_4": 0.014220348559319973, "attnres/block_norm/4": 15473.64453125, "attnres/final_alpha/block_5": 0.617943525314331, "attnres/block_norm/5": 6717.8056640625, "attnres/final_alpha/block_6": 0.1060229241847992, "attnres/block_norm/6": 39691.04296875, "geo/tier1_time_s": 1.358147144317627, "geo/step": 67425.0, "geo/rankme_slope": -6.151675513955582e-05} {"step": 67430, "timestamp": 1778267420.579055, "train/loss": 2.1672689437866213, "train/z_loss": 0.0013759031193330885, "train/perplexity": 8.734397307545322, "train/grad_norm": 0.2451171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1702603.958146117, "perf/iters_per_sec": 0.8118648329477868, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2317321300506592, "data/tokens_consumed": 141413056512, "data/tokens_consumed_B": 141.413056512, "train/loss_slope": 1.2996455924202679e-05} {"step": 67440, "timestamp": 1778267430.9239671, "train/loss": 2.1397069931030273, "train/z_loss": 0.0013756603701040148, "train/perplexity": 8.496947599856124, "train/grad_norm": 0.1748046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028467.0108493243, "perf/iters_per_sec": 0.9672484449621793, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0338605403900147, "data/tokens_consumed": 141434028032, "data/tokens_consumed_B": 141.434028032, "train/loss_slope": 1.3252142517909339e-05} {"step": 67450, "timestamp": 1778267441.2531753, "grad/layer_0/attn": 0.002522936789318919, "grad/layer_0/mlp": 0.0027961470186710358, "grad/layer_0/attn_mlp_ratio": 0.9022904311694554, "grad/layer_4/attn": 0.0021542259491980076, "grad/layer_4/mlp": 0.0024647086393088102, "grad/layer_4/attn_mlp_ratio": 0.8740286082655458, "grad/layer_8/attn": 0.00314827891997993, "grad/layer_8/mlp": 0.003455628640949726, "grad/layer_8/attn_mlp_ratio": 0.9110582056088219, "grad/layer_12/attn": 0.004557132720947266, "grad/layer_12/mlp": 0.006855486426502466, "grad/layer_12/attn_mlp_ratio": 0.6647424224859829, "grad/layer_16/attn": 0.004002585541456938, "grad/layer_16/mlp": 0.004983633756637573, "grad/layer_16/attn_mlp_ratio": 0.8031459887700213, "grad/layer_20/attn": 0.002951699309051037, "grad/layer_20/mlp": 0.005440204869955778, "grad/layer_20/attn_mlp_ratio": 0.5425713415858724, "grad/layer_24/attn": 0.009182378649711609, "grad/layer_24/mlp": 0.008238025940954685, "grad/layer_24/attn_mlp_ratio": 1.1146333604752099, "grad/layer_27/attn": 0.005248662084341049, "grad/layer_27/mlp": 0.007622697856277227, "grad/layer_27/attn_mlp_ratio": 0.6885570062524674} {"step": 67450, "timestamp": 1778267441.2673929, "train/loss": 2.15708167552948, "train/z_loss": 0.0013814245001412928, "train/perplexity": 8.645869353736893, "train/grad_norm": 0.1025390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028493.1603554622, "perf/iters_per_sec": 0.9672609140183746, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0338472127914429, "data/tokens_consumed": 141454999552, "data/tokens_consumed_B": 141.454999552, "train/loss_slope": 1.4400192513109667e-05} {"step": 67460, "timestamp": 1778267451.6052954, "train/loss": 2.164096307754517, "train/z_loss": 0.0013862116145901382, "train/perplexity": 8.706730156036825, "train/grad_norm": 0.22265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029811.5664824334, "perf/iters_per_sec": 0.967889579049317, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0331757068634033, "data/tokens_consumed": 141475971072, "data/tokens_consumed_B": 141.475971072, "train/loss_slope": 1.7841911365990898e-05} {"step": 67470, "timestamp": 1778267461.9447994, "train/loss": 2.1739348649978636, "train/z_loss": 0.0013639832264743744, "train/perplexity": 8.792814598041224, "train/grad_norm": 0.123046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029650.6352439336, "perf/iters_per_sec": 0.9678128410548847, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0332576274871825, "data/tokens_consumed": 141496942592, "data/tokens_consumed_B": 141.496942592, "train/loss_slope": 1.8275344289533034e-05} {"step": 67480, "timestamp": 1778267472.2929444, "train/loss": 2.0957229137420654, "train/z_loss": 0.0013795222854241728, "train/perplexity": 8.131317086857164, "train/grad_norm": 0.109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027385.6482131733, "perf/iters_per_sec": 0.96673281107577, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344119787216186, "data/tokens_consumed": 141517914112, "data/tokens_consumed_B": 141.517914112, "train/loss_slope": 1.5259412928025317e-05} {"step": 67490, "timestamp": 1778267482.6334617, "train/loss": 2.165251588821411, "train/z_loss": 0.0013701177202165126, "train/perplexity": 8.716794689103496, "train/grad_norm": 0.123046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029519.7920405641, "perf/iters_per_sec": 0.9677504501536198, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0333242416381836, "data/tokens_consumed": 141538885632, "data/tokens_consumed_B": 141.538885632, "train/loss_slope": 1.5280696503793476e-05} {"step": 67500, "timestamp": 1778267492.9657972, "grad/layer_0/attn": 0.003821729449555278, "grad/layer_0/mlp": 0.003496121848002076, "grad/layer_0/attn_mlp_ratio": 1.0931339084837346, "grad/layer_4/attn": 0.002431001514196396, "grad/layer_4/mlp": 0.002628782531246543, "grad/layer_4/attn_mlp_ratio": 0.9247632289184895, "grad/layer_8/attn": 0.005054488778114319, "grad/layer_8/mlp": 0.003743580309674144, "grad/layer_8/attn_mlp_ratio": 1.3501750263070433, "grad/layer_12/attn": 0.008827676065266132, "grad/layer_12/mlp": 0.007106418255716562, "grad/layer_12/attn_mlp_ratio": 1.2422117054458175, "grad/layer_16/attn": 0.004003457259386778, "grad/layer_16/mlp": 0.005391762126237154, "grad/layer_16/attn_mlp_ratio": 0.7425136887352585, "grad/layer_20/attn": 0.0038462441880255938, "grad/layer_20/mlp": 0.007475064601749182, "grad/layer_20/attn_mlp_ratio": 0.5145432637025288, "grad/layer_24/attn": 0.017439337447285652, "grad/layer_24/mlp": 0.013314997777342796, "grad/layer_24/attn_mlp_ratio": 1.3097514252676643, "grad/layer_27/attn": 0.013917542062699795, "grad/layer_27/mlp": 0.012751474976539612, "grad/layer_27/attn_mlp_ratio": 1.091445654652577} {"step": 67500, "timestamp": 1778267493.5669355, "eos/sharpness": 78.65245342254637, "eos/L0_probe": 1.9695707559585571, "eos/L_plus": 2.324414014816284, "eos/L_minus": 2.401252031326294, "eos/grad_norm": 0.2649381458759308, "eos/embed_grad_frac": 0.03344709798693657, "eos/time_s": 0.5983209609985352} {"step": 67500, "timestamp": 1778267493.585884, "train/loss": 2.1488470792770387, "train/z_loss": 0.0013720445334911347, "train/perplexity": 8.57496643944535, "train/grad_norm": 0.263671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1916237.646641062, "perf/iters_per_sec": 0.9137333138661681, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0944112300872804, "data/tokens_consumed": 141559857152, "data/tokens_consumed_B": 141.559857152, "train/loss_slope": 1.4691385641993637e-05} {"step": 67500, "timestamp": 1778267494.9534812, "geo/rankme_last": 439.7957763671875, "geo/layer_0/stable_rank_q_proj": 19.34082794189453, "geo/layer_0/stable_rank_k_proj": 16.108903884887695, "geo/layer_0/stable_rank_o_proj": 46.912933349609375, "geo/layer_0/stable_rank_gate_proj": 129.65267944335938, "geo/layer_0/stable_rank_down_proj": 55.341487884521484, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06517031788825989, "geo/layer_0/attn_entropy_mean": 6.156533241271973, "geo/layer_0/attn_entropy_std": 0.41972798109054565, "geo/layer_7/stable_rank_q_proj": 43.40470886230469, "geo/layer_7/stable_rank_k_proj": 40.504554748535156, "geo/layer_7/stable_rank_o_proj": 90.10009765625, "geo/layer_7/stable_rank_gate_proj": 79.81735229492188, "geo/layer_7/stable_rank_down_proj": 140.22494506835938, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4474886953830719, "geo/layer_7/attn_entropy_mean": 4.659931659698486, "geo/layer_7/attn_entropy_std": 0.7921080589294434, "geo/layer_14/stable_rank_q_proj": 50.930931091308594, "geo/layer_14/stable_rank_k_proj": 40.47650146484375, "geo/layer_14/stable_rank_o_proj": 43.501285552978516, "geo/layer_14/stable_rank_gate_proj": 71.71511840820312, "geo/layer_14/stable_rank_down_proj": 128.87965393066406, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39861735701560974, "geo/layer_14/attn_entropy_mean": 5.552872657775879, "geo/layer_14/attn_entropy_std": 0.4105800688266754, "geo/layer_21/stable_rank_q_proj": 40.151527404785156, "geo/layer_21/stable_rank_k_proj": 30.070425033569336, "geo/layer_21/stable_rank_o_proj": 69.43036651611328, "geo/layer_21/stable_rank_gate_proj": 65.13230895996094, "geo/layer_21/stable_rank_down_proj": 50.6817512512207, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14029107987880707, "geo/layer_21/attn_entropy_mean": 5.692410469055176, "geo/layer_21/attn_entropy_std": 0.3148067593574524, "geo/layer_27/stable_rank_q_proj": 43.29227066040039, "geo/layer_27/stable_rank_k_proj": 32.02027893066406, "geo/layer_27/stable_rank_o_proj": 115.03275299072266, "geo/layer_27/stable_rank_gate_proj": 79.70955657958984, "geo/layer_27/stable_rank_down_proj": 127.14598846435547, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0841054767370224, "geo/layer_27/attn_entropy_mean": 4.1930742263793945, "geo/layer_27/attn_entropy_std": 0.7646136283874512, "attnres/final_alpha/block_0": 0.2396586388349533, "attnres/block_norm/0": 1.7691173553466797, "attnres/final_alpha/block_1": 0.004386598244309425, "attnres/block_norm/1": 47305.6328125, "attnres/final_alpha/block_2": 0.01030992716550827, "attnres/block_norm/2": 28717.26171875, "attnres/final_alpha/block_3": 0.012257711961865425, "attnres/block_norm/3": 59482.32421875, "attnres/final_alpha/block_4": 0.0147387171164155, "attnres/block_norm/4": 15442.853515625, "attnres/final_alpha/block_5": 0.6081486344337463, "attnres/block_norm/5": 6776.8115234375, "attnres/final_alpha/block_6": 0.11049975454807281, "attnres/block_norm/6": 39078.328125, "geo/tier1_time_s": 1.3635263442993164, "geo/step": 67500.0, "geo/rankme_slope": -2.45351656287515e-05} {"step": 67500, "timestamp": 1778267502.0198984, "geo/ww_alpha_mean": 7.4155243098928745, "geo/ww_alpha_std": 4.243803228857622, "geo/ww_alpha_min": 1.3443132668077393, "geo/ww_alpha_max": 25.812842615281053, "geo/ww_alpha_healthy_frac": 0.15736040609137056, "geo/ww_alpha_by_type/q_proj": 3.8720745320374594, "geo/ww_alpha_by_type/k_proj": 4.535459398243362, "geo/ww_alpha_by_type/v_proj": 7.061689580704912, "geo/ww_alpha_by_type/o_proj": 8.78276028025756, "geo/ww_alpha_by_type/gate_proj": 7.615771453178827, "geo/ww_alpha_by_type/up_proj": 11.927278576713459, "geo/ww_alpha_by_type/down_proj": 8.195183889985632, "geo/twonn_id/layer_0": 0.7087283730506897, "geo/twonn_id/layer_7": 3.3681628704071045, "geo/twonn_id/layer_14": 5.15295934677124, "geo/twonn_id/layer_21": 6.934997081756592, "geo/twonn_id/layer_27": 6.117454528808594, "geo/tier2_time_s": 7.0561909675598145} {"step": 67500, "timestamp": 1778267502.682602, "eoc/jacobian_sigma/layer_0/attn": 1150.51123046875, "eoc/jacobian_sigma/layer_0/mlp": 8767.693359375, "eoc/jacobian_sigma/layer_0": 8767.693359375, "eoc/jacobian_sigma/layer_7/attn": 1.1536020040512085, "eoc/jacobian_sigma/layer_7/mlp": 1.7916152477264404, "eoc/jacobian_sigma/layer_7": 1.7916152477264404, "eoc/jacobian_sigma/layer_14/attn": 1.4775131940841675, "eoc/jacobian_sigma/layer_14/mlp": 5.229883670806885, "eoc/jacobian_sigma/layer_14": 5.229883670806885, "eoc/jacobian_sigma/layer_21/attn": 1.0805481672286987, "eoc/jacobian_sigma/layer_21/mlp": 4.100949287414551, "eoc/jacobian_sigma/layer_21": 4.100949287414551, "eoc/jacobian_sigma/layer_27/attn": 3.178389072418213, "eoc/jacobian_sigma/layer_27/mlp": 32.44340896606445, "eoc/jacobian_sigma/layer_27": 32.44340896606445, "eoc/layer0_sigma": 8767.693359375, "eoc/sigma_max": 32.44340896606445, "eoc/sigma_min": 1.7916152477264404, "eoc/sigma_mean": 10.891464293003082, "eoc/time_s": 0.6554710865020752} {"step": 67510, "timestamp": 1778267513.0522313, "train/loss": 2.118485450744629, "train/z_loss": 0.001384649716783315, "train/perplexity": 8.318529120016406, "train/grad_norm": 0.1376953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1077539.4930350352, "perf/iters_per_sec": 0.5138108697104622, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.9462414264678956, "data/tokens_consumed": 141580828672, "data/tokens_consumed_B": 141.580828672, "train/loss_slope": 1.2947850330363105e-05} {"step": 67520, "timestamp": 1778267523.4081721, "train/loss": 2.137857437133789, "train/z_loss": 0.0013913701754063369, "train/perplexity": 8.481246544169064, "train/grad_norm": 0.0947265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026881.3382768806, "perf/iters_per_sec": 0.9664923373588946, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346693515777587, "data/tokens_consumed": 141601800192, "data/tokens_consumed_B": 141.601800192, "train/loss_slope": 1.0697221834667465e-05} {"step": 67530, "timestamp": 1778267533.7682667, "train/loss": 2.1012073516845704, "train/z_loss": 0.0013851807219907642, "train/perplexity": 8.176035305869291, "train/grad_norm": 0.1181640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025441.1279865385, "perf/iters_per_sec": 0.965805591576833, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354050636291503, "data/tokens_consumed": 141622771712, "data/tokens_consumed_B": 141.622771712, "train/loss_slope": 6.631607155237655e-06} {"step": 67540, "timestamp": 1778267544.1506689, "train/loss": 2.139322876930237, "train/z_loss": 0.0013801916269585491, "train/perplexity": 8.493684411625477, "train/grad_norm": 0.10986328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021251.4608782828, "perf/iters_per_sec": 0.9638078026191153, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037551259994507, "data/tokens_consumed": 141643743232, "data/tokens_consumed_B": 141.643743232, "train/loss_slope": 8.435699856034522e-06} {"step": 67550, "timestamp": 1778267554.5292315, "grad/layer_0/attn": 0.0035562976263463497, "grad/layer_0/mlp": 0.0032865877728909254, "grad/layer_0/attn_mlp_ratio": 1.082063758489495, "grad/layer_4/attn": 0.0023260528687387705, "grad/layer_4/mlp": 0.0026498932857066393, "grad/layer_4/attn_mlp_ratio": 0.8777911146483687, "grad/layer_8/attn": 0.003913176711648703, "grad/layer_8/mlp": 0.0037612970918416977, "grad/layer_8/attn_mlp_ratio": 1.0403795584503224, "grad/layer_12/attn": 0.004956601653248072, "grad/layer_12/mlp": 0.006630091927945614, "grad/layer_12/attn_mlp_ratio": 0.7475916823410842, "grad/layer_16/attn": 0.006464821752160788, "grad/layer_16/mlp": 0.004674668423831463, "grad/layer_16/attn_mlp_ratio": 1.382947629164105, "grad/layer_20/attn": 0.003453677985817194, "grad/layer_20/mlp": 0.006066285073757172, "grad/layer_20/attn_mlp_ratio": 0.569323381096202, "grad/layer_24/attn": 0.01238153874874115, "grad/layer_24/mlp": 0.009818331338465214, "grad/layer_24/attn_mlp_ratio": 1.2610634328592816, "grad/layer_27/attn": 0.007661471609026194, "grad/layer_27/mlp": 0.008625402115285397, "grad/layer_27/attn_mlp_ratio": 0.8882451412467485} {"step": 67550, "timestamp": 1778267554.5460434, "train/loss": 2.1378383159637453, "train/z_loss": 0.001366090285591781, "train/perplexity": 8.481084374362153, "train/grad_norm": 0.1630859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2018861.702758717, "perf/iters_per_sec": 0.9626682771485886, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0387794256210328, "data/tokens_consumed": 141664714752, "data/tokens_consumed_B": 141.664714752, "train/loss_slope": 7.365779850957232e-06} {"step": 67560, "timestamp": 1778267564.9295375, "train/loss": 2.132213091850281, "train/z_loss": 0.0013977409224025905, "train/perplexity": 8.433510306876327, "train/grad_norm": 0.177734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020550.22616623, "perf/iters_per_sec": 0.9634734278517866, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037911343574524, "data/tokens_consumed": 141685686272, "data/tokens_consumed_B": 141.685686272, "train/loss_slope": 8.37923422469203e-06} {"step": 67570, "timestamp": 1778267575.3082361, "train/loss": 2.1554732799530028, "train/z_loss": 0.001380880014039576, "train/perplexity": 8.631974552877073, "train/grad_norm": 0.095703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021915.210242938, "perf/iters_per_sec": 0.9641243029799166, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0372106552124023, "data/tokens_consumed": 141706657792, "data/tokens_consumed_B": 141.706657792, "train/loss_slope": 1.0735727493876729e-05} {"step": 67575, "timestamp": 1778267581.1102283, "eos/sharpness": 69.11230087280272, "eos/L0_probe": 1.9752973318099976, "eos/L_plus": 2.3708841800689697, "eos/L_minus": 2.2708334922790527, "eos/grad_norm": 0.1693200170993805, "eos/embed_grad_frac": 0.07885871082544327, "eos/time_s": 0.6220226287841797} {"step": 67575, "timestamp": 1778267582.4941247, "geo/rankme_last": 438.6827697753906, "geo/layer_0/stable_rank_q_proj": 19.355270385742188, "geo/layer_0/stable_rank_k_proj": 16.107194900512695, "geo/layer_0/stable_rank_o_proj": 46.894287109375, "geo/layer_0/stable_rank_gate_proj": 129.4221954345703, "geo/layer_0/stable_rank_down_proj": 55.311946868896484, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06950351595878601, "geo/layer_0/attn_entropy_mean": 6.160617351531982, "geo/layer_0/attn_entropy_std": 0.42380401492118835, "geo/layer_7/stable_rank_q_proj": 43.43059158325195, "geo/layer_7/stable_rank_k_proj": 40.57707977294922, "geo/layer_7/stable_rank_o_proj": 89.94387817382812, "geo/layer_7/stable_rank_gate_proj": 79.74658203125, "geo/layer_7/stable_rank_down_proj": 140.42929077148438, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4440482556819916, "geo/layer_7/attn_entropy_mean": 4.666973114013672, "geo/layer_7/attn_entropy_std": 0.8107017874717712, "geo/layer_14/stable_rank_q_proj": 50.8838996887207, "geo/layer_14/stable_rank_k_proj": 40.53136444091797, "geo/layer_14/stable_rank_o_proj": 43.487491607666016, "geo/layer_14/stable_rank_gate_proj": 71.7234115600586, "geo/layer_14/stable_rank_down_proj": 128.8407440185547, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39546987414360046, "geo/layer_14/attn_entropy_mean": 5.525171756744385, "geo/layer_14/attn_entropy_std": 0.413381427526474, "geo/layer_21/stable_rank_q_proj": 40.1368293762207, "geo/layer_21/stable_rank_k_proj": 30.00168800354004, "geo/layer_21/stable_rank_o_proj": 69.49874114990234, "geo/layer_21/stable_rank_gate_proj": 65.10319519042969, "geo/layer_21/stable_rank_down_proj": 50.69164276123047, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1449788361787796, "geo/layer_21/attn_entropy_mean": 5.695946216583252, "geo/layer_21/attn_entropy_std": 0.29655635356903076, "geo/layer_27/stable_rank_q_proj": 43.330467224121094, "geo/layer_27/stable_rank_k_proj": 32.083412170410156, "geo/layer_27/stable_rank_o_proj": 114.99089813232422, "geo/layer_27/stable_rank_gate_proj": 79.69161987304688, "geo/layer_27/stable_rank_down_proj": 127.10829162597656, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0887526422739029, "geo/layer_27/attn_entropy_mean": 4.190213203430176, "geo/layer_27/attn_entropy_std": 0.7285184264183044, "attnres/final_alpha/block_0": 0.2354966253042221, "attnres/block_norm/0": 1.7692198753356934, "attnres/final_alpha/block_1": 0.00427281903102994, "attnres/block_norm/1": 47204.6953125, "attnres/final_alpha/block_2": 0.01019665040075779, "attnres/block_norm/2": 28718.23046875, "attnres/final_alpha/block_3": 0.011872307397425175, "attnres/block_norm/3": 59584.51953125, "attnres/final_alpha/block_4": 0.014199936762452126, "attnres/block_norm/4": 15346.3828125, "attnres/final_alpha/block_5": 0.6177111268043518, "attnres/block_norm/5": 6680.94775390625, "attnres/final_alpha/block_6": 0.10625052452087402, "attnres/block_norm/6": 39577.58984375, "geo/tier1_time_s": 1.3640081882476807, "geo/step": 67575.0, "geo/rankme_slope": -4.9003800739045625e-05} {"step": 67580, "timestamp": 1778267587.6907356, "train/loss": 2.1466971397399903, "train/z_loss": 0.001363081217277795, "train/perplexity": 8.556550583650107, "train/grad_norm": 0.19921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1694538.7059598218, "perf/iters_per_sec": 0.8080190210150823, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2375946283340453, "data/tokens_consumed": 141727629312, "data/tokens_consumed_B": 141.727629312, "train/loss_slope": 9.440672408343457e-06} {"step": 67590, "timestamp": 1778267598.0718498, "train/loss": 2.121992826461792, "train/z_loss": 0.0013791545061394572, "train/perplexity": 8.347756552886413, "train/grad_norm": 0.0986328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021820.1232157154, "perf/iters_per_sec": 0.9640789619520738, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0372594356536866, "data/tokens_consumed": 141748600832, "data/tokens_consumed_B": 141.748600832, "train/loss_slope": 7.766846139760784e-06} {"step": 67600, "timestamp": 1778267608.4418056, "grad/layer_0/attn": 0.0025572176091372967, "grad/layer_0/mlp": 0.00287631549872458, "grad/layer_0/attn_mlp_ratio": 0.8890601609472972, "grad/layer_4/attn": 0.0027169836685061455, "grad/layer_4/mlp": 0.0025931012351065874, "grad/layer_4/attn_mlp_ratio": 1.0477738111204458, "grad/layer_8/attn": 0.0034369255881756544, "grad/layer_8/mlp": 0.003545364597812295, "grad/layer_8/attn_mlp_ratio": 0.9694138349988214, "grad/layer_12/attn": 0.0041967653669416904, "grad/layer_12/mlp": 0.006966510321944952, "grad/layer_12/attn_mlp_ratio": 0.6024200227594022, "grad/layer_16/attn": 0.0036901177372783422, "grad/layer_16/mlp": 0.005115107167512178, "grad/layer_16/attn_mlp_ratio": 0.7214155137499385, "grad/layer_20/attn": 0.004306308925151825, "grad/layer_20/mlp": 0.006894642487168312, "grad/layer_20/attn_mlp_ratio": 0.6245876955487641, "grad/layer_24/attn": 0.019665591418743134, "grad/layer_24/mlp": 0.014136251993477345, "grad/layer_24/attn_mlp_ratio": 1.3911460611131221, "grad/layer_27/attn": 0.01293272152543068, "grad/layer_27/mlp": 0.01318877749145031, "grad/layer_27/attn_mlp_ratio": 0.9805853071488885} {"step": 67600, "timestamp": 1778267608.4587967, "train/loss": 2.155047059059143, "train/z_loss": 0.0013637072639539837, "train/perplexity": 8.628296208916582, "train/grad_norm": 0.2734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020335.35195714, "perf/iters_per_sec": 0.963370967844553, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.038021731376648, "data/tokens_consumed": 141769572352, "data/tokens_consumed_B": 141.769572352, "train/loss_slope": 7.318408089359381e-06} {"step": 67610, "timestamp": 1778267618.8372076, "train/loss": 2.123049187660217, "train/z_loss": 0.001375645468942821, "train/perplexity": 8.356579458269776, "train/grad_norm": 0.2890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021648.5616288083, "perf/iters_per_sec": 0.9639971550125161, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373474597930907, "data/tokens_consumed": 141790543872, "data/tokens_consumed_B": 141.790543872, "train/loss_slope": 4.137870645222655e-06} {"step": 67620, "timestamp": 1778267629.2147055, "train/loss": 2.1213162422180174, "train/z_loss": 0.0013761923066340387, "train/perplexity": 8.342110502561589, "train/grad_norm": 0.19140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021981.1164702596, "perf/iters_per_sec": 0.9641557295180605, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0371768474578857, "data/tokens_consumed": 141811515392, "data/tokens_consumed_B": 141.811515392, "train/loss_slope": 2.1971824109786837e-06} {"step": 67630, "timestamp": 1778267639.6091735, "train/loss": 2.1184970259666445, "train/z_loss": 0.0013862458406947553, "train/perplexity": 8.318625409395098, "train/grad_norm": 0.1181640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019203.9559010153, "perf/iters_per_sec": 0.9628314761643483, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0386033535003663, "data/tokens_consumed": 141832486912, "data/tokens_consumed_B": 141.832486912, "train/loss_slope": 7.023199735516241e-07} {"step": 67640, "timestamp": 1778267649.9969323, "train/loss": 2.140841317176819, "train/z_loss": 0.0013843051972799003, "train/perplexity": 8.506591360613337, "train/grad_norm": 0.10546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019799.8968498458, "perf/iters_per_sec": 0.9631156429528455, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.038296914100647, "data/tokens_consumed": 141853458432, "data/tokens_consumed_B": 141.853458432, "train/loss_slope": 5.034786639827505e-07} {"step": 67650, "timestamp": 1778267660.3638167, "grad/layer_0/attn": 0.0030229962430894375, "grad/layer_0/mlp": 0.003101085079833865, "grad/layer_0/attn_mlp_ratio": 0.9748188352734602, "grad/layer_4/attn": 0.0022040195763111115, "grad/layer_4/mlp": 0.002673282055184245, "grad/layer_4/attn_mlp_ratio": 0.8244620090089998, "grad/layer_8/attn": 0.004249352030456066, "grad/layer_8/mlp": 0.0037978908512741327, "grad/layer_8/attn_mlp_ratio": 1.1188715223722985, "grad/layer_12/attn": 0.005843032151460648, "grad/layer_12/mlp": 0.007134042214602232, "grad/layer_12/attn_mlp_ratio": 0.819035252916976, "grad/layer_16/attn": 0.004724218975752592, "grad/layer_16/mlp": 0.004660434555262327, "grad/layer_16/attn_mlp_ratio": 1.0136863458472147, "grad/layer_20/attn": 0.004421922378242016, "grad/layer_20/mlp": 0.005608990788459778, "grad/layer_20/attn_mlp_ratio": 0.7883632664370882, "grad/layer_24/attn": 0.008251172490417957, "grad/layer_24/mlp": 0.00885153841227293, "grad/layer_24/attn_mlp_ratio": 0.9321738225481879, "grad/layer_27/attn": 0.004950588569045067, "grad/layer_27/mlp": 0.008223319426178932, "grad/layer_27/attn_mlp_ratio": 0.602018266867154} {"step": 67650, "timestamp": 1778267660.978646, "eos/sharpness": 40.626931190490716, "eos/L0_probe": 1.9684836864471436, "eos/L_plus": 2.175527572631836, "eos/L_minus": 2.1677091121673584, "eos/grad_norm": 0.1218864917755127, "eos/embed_grad_frac": 0.16982842981815338, "eos/time_s": 0.611947774887085} {"step": 67650, "timestamp": 1778267661.0003126, "train/loss": 2.2015158414840696, "train/z_loss": 0.0013649654807522892, "train/perplexity": 9.038704363258978, "train/grad_norm": 0.1220703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1907041.268298248, "perf/iters_per_sec": 0.9093481389514199, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0996888399124145, "data/tokens_consumed": 141874429952, "data/tokens_consumed_B": 141.874429952, "train/loss_slope": 7.0904713867306696e-06} {"step": 67650, "timestamp": 1778267662.371634, "geo/rankme_last": 438.8254089355469, "geo/layer_0/stable_rank_q_proj": 19.358051300048828, "geo/layer_0/stable_rank_k_proj": 16.094348907470703, "geo/layer_0/stable_rank_o_proj": 46.855125427246094, "geo/layer_0/stable_rank_gate_proj": 129.5890655517578, "geo/layer_0/stable_rank_down_proj": 55.37956619262695, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0626211017370224, "geo/layer_0/attn_entropy_mean": 6.159222602844238, "geo/layer_0/attn_entropy_std": 0.4207369089126587, "geo/layer_7/stable_rank_q_proj": 43.416751861572266, "geo/layer_7/stable_rank_k_proj": 40.50130081176758, "geo/layer_7/stable_rank_o_proj": 89.86357879638672, "geo/layer_7/stable_rank_gate_proj": 79.64048767089844, "geo/layer_7/stable_rank_down_proj": 140.26669311523438, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4527606666088104, "geo/layer_7/attn_entropy_mean": 4.650936603546143, "geo/layer_7/attn_entropy_std": 0.8024082779884338, "geo/layer_14/stable_rank_q_proj": 50.83855056762695, "geo/layer_14/stable_rank_k_proj": 40.573001861572266, "geo/layer_14/stable_rank_o_proj": 43.46408462524414, "geo/layer_14/stable_rank_gate_proj": 71.67728424072266, "geo/layer_14/stable_rank_down_proj": 128.85385131835938, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39844658970832825, "geo/layer_14/attn_entropy_mean": 5.533838272094727, "geo/layer_14/attn_entropy_std": 0.4298831820487976, "geo/layer_21/stable_rank_q_proj": 40.12394714355469, "geo/layer_21/stable_rank_k_proj": 30.060100555419922, "geo/layer_21/stable_rank_o_proj": 69.48177337646484, "geo/layer_21/stable_rank_gate_proj": 65.18807220458984, "geo/layer_21/stable_rank_down_proj": 50.63533401489258, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14131152629852295, "geo/layer_21/attn_entropy_mean": 5.695924758911133, "geo/layer_21/attn_entropy_std": 0.3124155104160309, "geo/layer_27/stable_rank_q_proj": 43.3493537902832, "geo/layer_27/stable_rank_k_proj": 32.04591751098633, "geo/layer_27/stable_rank_o_proj": 114.98562622070312, "geo/layer_27/stable_rank_gate_proj": 79.6736068725586, "geo/layer_27/stable_rank_down_proj": 127.37995147705078, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08079805225133896, "geo/layer_27/attn_entropy_mean": 4.204348564147949, "geo/layer_27/attn_entropy_std": 0.7237792015075684, "attnres/final_alpha/block_0": 0.2367582768201828, "attnres/block_norm/0": 1.769291639328003, "attnres/final_alpha/block_1": 0.004230046644806862, "attnres/block_norm/1": 47239.0703125, "attnres/final_alpha/block_2": 0.010241609998047352, "attnres/block_norm/2": 28826.0546875, "attnres/final_alpha/block_3": 0.01202864944934845, "attnres/block_norm/3": 59408.84375, "attnres/final_alpha/block_4": 0.014294909313321114, "attnres/block_norm/4": 15419.80078125, "attnres/final_alpha/block_5": 0.6142237186431885, "attnres/block_norm/5": 6722.75390625, "attnres/final_alpha/block_6": 0.10822278261184692, "attnres/block_norm/6": 39458.1171875, "geo/tier1_time_s": 1.3674380779266357, "geo/step": 67650.0, "geo/rankme_slope": -3.200411023784514e-05} {"step": 67660, "timestamp": 1778267672.7462692, "train/loss": 2.155740976333618, "train/z_loss": 0.0013761920272372663, "train/perplexity": 8.634285610539528, "train/grad_norm": 0.10498046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1786014.7916050802, "perf/iters_per_sec": 0.851638217737713, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1742075204849243, "data/tokens_consumed": 141895401472, "data/tokens_consumed_B": 141.895401472, "train/loss_slope": 5.900631471686015e-06} {"step": 67670, "timestamp": 1778267683.1203156, "train/loss": 2.1754118680953978, "train/z_loss": 0.001379618642386049, "train/perplexity": 8.805811208092509, "train/grad_norm": 0.1787109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022404.2642683296, "perf/iters_per_sec": 0.9643575021115921, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0369598388671875, "data/tokens_consumed": 141916372992, "data/tokens_consumed_B": 141.916372992, "train/loss_slope": 7.178880787096074e-06} {"step": 67680, "timestamp": 1778267693.5024385, "train/loss": 2.1375259876251222, "train/z_loss": 0.0013817029423080385, "train/perplexity": 8.478435904987336, "train/grad_norm": 0.1337890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022576.6982607772, "perf/iters_per_sec": 0.9644397250465284, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0368714332580566, "data/tokens_consumed": 141937344512, "data/tokens_consumed_B": 141.937344512, "train/loss_slope": 9.331741670642337e-06} {"step": 67690, "timestamp": 1778267703.87591, "train/loss": 2.16466600894928, "train/z_loss": 0.001375990267843008, "train/perplexity": 8.711691803803335, "train/grad_norm": 0.1376953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022985.951391538, "perf/iters_per_sec": 0.9646348721463861, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0366616725921631, "data/tokens_consumed": 141958316032, "data/tokens_consumed_B": 141.958316032, "train/loss_slope": 1.2351374206977958e-05} {"step": 67700, "timestamp": 1778267714.243693, "grad/layer_0/attn": 0.0028131704311817884, "grad/layer_0/mlp": 0.002986492356285453, "grad/layer_0/attn_mlp_ratio": 0.9419646867887156, "grad/layer_4/attn": 0.0023696606513112783, "grad/layer_4/mlp": 0.0027206474915146828, "grad/layer_4/attn_mlp_ratio": 0.8709913987764959, "grad/layer_8/attn": 0.003749147057533264, "grad/layer_8/mlp": 0.003810641122981906, "grad/layer_8/attn_mlp_ratio": 0.9838625150334879, "grad/layer_12/attn": 0.004146982915699482, "grad/layer_12/mlp": 0.0064588370732963085, "grad/layer_12/attn_mlp_ratio": 0.642063393832708, "grad/layer_16/attn": 0.004139528144150972, "grad/layer_16/mlp": 0.004848841577768326, "grad/layer_16/attn_mlp_ratio": 0.8537148496991525, "grad/layer_20/attn": 0.0036445760633796453, "grad/layer_20/mlp": 0.006213953252881765, "grad/layer_20/attn_mlp_ratio": 0.5865148732874615, "grad/layer_24/attn": 0.011549029499292374, "grad/layer_24/mlp": 0.011000660248100758, "grad/layer_24/attn_mlp_ratio": 1.049848748515019, "grad/layer_27/attn": 0.004746987484395504, "grad/layer_27/mlp": 0.010900665074586868, "grad/layer_27/attn_mlp_ratio": 0.43547686387637474} {"step": 67700, "timestamp": 1778267714.2603526, "train/loss": 2.128907346725464, "train/z_loss": 0.001368917606305331, "train/perplexity": 8.405677301052405, "train/grad_norm": 0.1962890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020513.2812561267, "perf/iters_per_sec": 0.9634558111458429, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0379303216934204, "data/tokens_consumed": 141979287552, "data/tokens_consumed_B": 141.979287552, "train/loss_slope": 1.0698240396320571e-05} {"step": 67710, "timestamp": 1778267724.632943, "train/loss": 2.113576555252075, "train/z_loss": 0.0013925445382483304, "train/perplexity": 8.277794392973027, "train/grad_norm": 0.1259765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023094.3163008273, "perf/iters_per_sec": 0.9646865445617806, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0366061449050903, "data/tokens_consumed": 142000259072, "data/tokens_consumed_B": 142.000259072, "train/loss_slope": 8.411126018035521e-06} {"step": 67720, "timestamp": 1778267735.0056658, "train/loss": 2.1295194387435914, "train/z_loss": 0.0013743991148658097, "train/perplexity": 8.410823923978057, "train/grad_norm": 0.1865234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023040.109130774, "perf/iters_per_sec": 0.9646606965688581, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0366339206695556, "data/tokens_consumed": 142021230592, "data/tokens_consumed_B": 142.021230592, "train/loss_slope": 5.0927933865469776e-06} {"step": 67725, "timestamp": 1778267740.797876, "eos/sharpness": 44.748353958129876, "eos/L0_probe": 1.9629520177841187, "eos/L_plus": 2.190429925918579, "eos/L_minus": 2.182957649230957, "eos/grad_norm": 0.14272071421146393, "eos/embed_grad_frac": 0.14710238575935364, "eos/time_s": 0.6177527904510498} {"step": 67725, "timestamp": 1778267742.1814103, "geo/rankme_last": 438.8135070800781, "geo/layer_0/stable_rank_q_proj": 19.3364315032959, "geo/layer_0/stable_rank_k_proj": 16.067346572875977, "geo/layer_0/stable_rank_o_proj": 46.85036087036133, "geo/layer_0/stable_rank_gate_proj": 129.71954345703125, "geo/layer_0/stable_rank_down_proj": 55.505027770996094, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06303807348012924, "geo/layer_0/attn_entropy_mean": 6.159012794494629, "geo/layer_0/attn_entropy_std": 0.42070871591567993, "geo/layer_7/stable_rank_q_proj": 43.34186935424805, "geo/layer_7/stable_rank_k_proj": 40.431671142578125, "geo/layer_7/stable_rank_o_proj": 89.87860107421875, "geo/layer_7/stable_rank_gate_proj": 79.74463653564453, "geo/layer_7/stable_rank_down_proj": 140.30410766601562, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.45210400223731995, "geo/layer_7/attn_entropy_mean": 4.637571334838867, "geo/layer_7/attn_entropy_std": 0.8090612888336182, "geo/layer_14/stable_rank_q_proj": 50.84834671020508, "geo/layer_14/stable_rank_k_proj": 40.51048278808594, "geo/layer_14/stable_rank_o_proj": 43.47145462036133, "geo/layer_14/stable_rank_gate_proj": 71.8006591796875, "geo/layer_14/stable_rank_down_proj": 128.75694274902344, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3834335505962372, "geo/layer_14/attn_entropy_mean": 5.557413101196289, "geo/layer_14/attn_entropy_std": 0.4141888916492462, "geo/layer_21/stable_rank_q_proj": 40.166778564453125, "geo/layer_21/stable_rank_k_proj": 30.167997360229492, "geo/layer_21/stable_rank_o_proj": 69.45225524902344, "geo/layer_21/stable_rank_gate_proj": 65.13935852050781, "geo/layer_21/stable_rank_down_proj": 50.607383728027344, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14398503303527832, "geo/layer_21/attn_entropy_mean": 5.696213245391846, "geo/layer_21/attn_entropy_std": 0.3067252039909363, "geo/layer_27/stable_rank_q_proj": 43.38379669189453, "geo/layer_27/stable_rank_k_proj": 32.01945114135742, "geo/layer_27/stable_rank_o_proj": 115.28812408447266, "geo/layer_27/stable_rank_gate_proj": 79.72187042236328, "geo/layer_27/stable_rank_down_proj": 127.49729919433594, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09638843685388565, "geo/layer_27/attn_entropy_mean": 4.183599948883057, "geo/layer_27/attn_entropy_std": 0.742991030216217, "attnres/final_alpha/block_0": 0.23487453162670135, "attnres/block_norm/0": 1.7692453861236572, "attnres/final_alpha/block_1": 0.004174478352069855, "attnres/block_norm/1": 47315.203125, "attnres/final_alpha/block_2": 0.010180409997701645, "attnres/block_norm/2": 28751.23828125, "attnres/final_alpha/block_3": 0.011841190978884697, "attnres/block_norm/3": 59880.921875, "attnres/final_alpha/block_4": 0.014099400490522385, "attnres/block_norm/4": 15448.9912109375, "attnres/final_alpha/block_5": 0.6189173460006714, "attnres/block_norm/5": 6675.36962890625, "attnres/final_alpha/block_6": 0.10591263324022293, "attnres/block_norm/6": 39698.703125, "geo/tier1_time_s": 1.362813949584961, "geo/step": 67725.0, "geo/rankme_slope": -1.0873861263255308e-05} {"step": 67730, "timestamp": 1778267747.3709, "train/loss": 2.095510494709015, "train/z_loss": 0.0013719569193199276, "train/perplexity": 8.129590023781176, "train/grad_norm": 0.1552734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1696666.612376481, "perf/iters_per_sec": 0.8090336858637243, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2360424757003785, "data/tokens_consumed": 142042202112, "data/tokens_consumed_B": 142.042202112, "train/loss_slope": 4.1631385223522266e-06} {"step": 67740, "timestamp": 1778267757.7494044, "train/loss": 2.1648028612136843, "train/z_loss": 0.001351300347596407, "train/perplexity": 8.712884100135843, "train/grad_norm": 0.0966796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021970.3332110264, "perf/iters_per_sec": 0.9641505876593716, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037182378768921, "data/tokens_consumed": 142063173632, "data/tokens_consumed_B": 142.063173632, "train/loss_slope": 4.552006621350849e-06} {"step": 67750, "timestamp": 1778267768.1147773, "grad/layer_0/attn": 0.0027081011794507504, "grad/layer_0/mlp": 0.0030391013715416193, "grad/layer_0/attn_mlp_ratio": 0.8910861334541208, "grad/layer_4/attn": 0.002277947263792157, "grad/layer_4/mlp": 0.002657237695530057, "grad/layer_4/attn_mlp_ratio": 0.8572613514771115, "grad/layer_8/attn": 0.008952101692557335, "grad/layer_8/mlp": 0.003642428433522582, "grad/layer_8/attn_mlp_ratio": 2.4577288504545574, "grad/layer_12/attn": 0.0037415658589452505, "grad/layer_12/mlp": 0.006894495338201523, "grad/layer_12/attn_mlp_ratio": 0.5426888584498454, "grad/layer_16/attn": 0.0031638562213629484, "grad/layer_16/mlp": 0.004474489018321037, "grad/layer_16/attn_mlp_ratio": 0.707087700450174, "grad/layer_20/attn": 0.010936900041997433, "grad/layer_20/mlp": 0.006292697973549366, "grad/layer_20/attn_mlp_ratio": 1.7380303192949043, "grad/layer_24/attn": 0.008941222913563251, "grad/layer_24/mlp": 0.01111129391938448, "grad/layer_24/attn_mlp_ratio": 0.8046968155072328, "grad/layer_27/attn": 0.008630774915218353, "grad/layer_27/mlp": 0.010337990708649158, "grad/layer_27/attn_mlp_ratio": 0.8348599911694173} {"step": 67750, "timestamp": 1778267768.1313424, "train/loss": 2.1804311513900756, "train/z_loss": 0.0013648493099026383, "train/perplexity": 8.850121178306209, "train/grad_norm": 0.1357421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021001.9822906405, "perf/iters_per_sec": 0.9636888419583514, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0376793384552, "data/tokens_consumed": 142084145152, "data/tokens_consumed_B": 142.084145152, "train/loss_slope": 6.481983373374791e-06} {"step": 67760, "timestamp": 1778267778.505184, "train/loss": 2.10752876996994, "train/z_loss": 0.0013852167059667409, "train/perplexity": 8.227883148248456, "train/grad_norm": 0.19921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022544.6553082073, "perf/iters_per_sec": 0.9644244457760845, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0368878602981568, "data/tokens_consumed": 142105116672, "data/tokens_consumed_B": 142.105116672, "train/loss_slope": 3.427507345861758e-06} {"step": 67770, "timestamp": 1778267788.8795774, "train/loss": 2.153108525276184, "train/z_loss": 0.0013765107025392354, "train/perplexity": 8.61158616694956, "train/grad_norm": 0.1298828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022894.1131938493, "perf/iters_per_sec": 0.9645910802811858, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0367087364196776, "data/tokens_consumed": 142126088192, "data/tokens_consumed_B": 142.126088192, "train/loss_slope": 2.9028274402796373e-06} {"step": 67780, "timestamp": 1778267799.2219079, "train/loss": 2.11933913230896, "train/z_loss": 0.001371779094915837, "train/perplexity": 8.325633526987685, "train/grad_norm": 0.12060546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029081.9599746363, "perf/iters_per_sec": 0.9675416755555326, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0335472106933594, "data/tokens_consumed": 142147059712, "data/tokens_consumed_B": 142.147059712, "train/loss_slope": 7.41789135674032e-07} {"step": 67790, "timestamp": 1778267809.5663798, "train/loss": 2.1428855895996093, "train/z_loss": 0.0013705561636015774, "train/perplexity": 8.523998937596312, "train/grad_norm": 0.208984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028406.153875671, "perf/iters_per_sec": 0.9672194260958057, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0338915586471558, "data/tokens_consumed": 142168031232, "data/tokens_consumed_B": 142.168031232, "train/loss_slope": 5.21277079452521e-08} {"step": 67800, "timestamp": 1778267819.9434197, "grad/layer_0/attn": 0.003264430910348892, "grad/layer_0/mlp": 0.0033168906811624765, "grad/layer_0/attn_mlp_ratio": 0.9841840222441094, "grad/layer_4/attn": 0.002241225913167, "grad/layer_4/mlp": 0.0026241247542202473, "grad/layer_4/attn_mlp_ratio": 0.8540850903349928, "grad/layer_8/attn": 0.007167572621256113, "grad/layer_8/mlp": 0.0036940432619303465, "grad/layer_8/attn_mlp_ratio": 1.9403054915713418, "grad/layer_12/attn": 0.00632368354126811, "grad/layer_12/mlp": 0.007533878553658724, "grad/layer_12/attn_mlp_ratio": 0.8393662589982255, "grad/layer_16/attn": 0.007655447814613581, "grad/layer_16/mlp": 0.004608715418726206, "grad/layer_16/attn_mlp_ratio": 1.6610805729943288, "grad/layer_20/attn": 0.0033538243733346462, "grad/layer_20/mlp": 0.006052894983440638, "grad/layer_20/attn_mlp_ratio": 0.5540859914307712, "grad/layer_24/attn": 0.010490492917597294, "grad/layer_24/mlp": 0.008644752204418182, "grad/layer_24/attn_mlp_ratio": 1.2135099477906135, "grad/layer_27/attn": 0.004407361149787903, "grad/layer_27/mlp": 0.008487184531986713, "grad/layer_27/attn_mlp_ratio": 0.519296014037132} {"step": 67800, "timestamp": 1778267820.5531466, "eos/sharpness": 8.993983268737791, "eos/L0_probe": 1.9656836986541748, "eos/L_plus": 2.0155487060546875, "eos/L_minus": 2.00575852394104, "eos/grad_norm": 0.10209585726261139, "eos/embed_grad_frac": 0.2721184194087982, "eos/time_s": 0.6066951751708984} {"step": 67800, "timestamp": 1778267820.5729442, "train/loss": 2.1721113681793214, "train/z_loss": 0.0013748612022027374, "train/perplexity": 8.776795538386624, "train/grad_norm": 0.10205078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1906982.3524932447, "perf/iters_per_sec": 0.9093200457063888, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0997228145599365, "data/tokens_consumed": 142189002752, "data/tokens_consumed_B": 142.189002752, "train/loss_slope": -4.257815827701951e-07} {"step": 67800, "timestamp": 1778267821.9393866, "geo/rankme_last": 439.1865539550781, "geo/layer_0/stable_rank_q_proj": 19.322731018066406, "geo/layer_0/stable_rank_k_proj": 16.08652687072754, "geo/layer_0/stable_rank_o_proj": 46.84343338012695, "geo/layer_0/stable_rank_gate_proj": 129.93504333496094, "geo/layer_0/stable_rank_down_proj": 55.50392532348633, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06650488823652267, "geo/layer_0/attn_entropy_mean": 6.156957626342773, "geo/layer_0/attn_entropy_std": 0.4210474491119385, "geo/layer_7/stable_rank_q_proj": 43.28362274169922, "geo/layer_7/stable_rank_k_proj": 40.584999084472656, "geo/layer_7/stable_rank_o_proj": 89.78424072265625, "geo/layer_7/stable_rank_gate_proj": 79.67750549316406, "geo/layer_7/stable_rank_down_proj": 140.49758911132812, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4444381296634674, "geo/layer_7/attn_entropy_mean": 4.638113975524902, "geo/layer_7/attn_entropy_std": 0.8043746948242188, "geo/layer_14/stable_rank_q_proj": 50.8387451171875, "geo/layer_14/stable_rank_k_proj": 40.63479995727539, "geo/layer_14/stable_rank_o_proj": 43.47914123535156, "geo/layer_14/stable_rank_gate_proj": 71.75486755371094, "geo/layer_14/stable_rank_down_proj": 128.78155517578125, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3920239210128784, "geo/layer_14/attn_entropy_mean": 5.526630401611328, "geo/layer_14/attn_entropy_std": 0.42337802052497864, "geo/layer_21/stable_rank_q_proj": 40.210113525390625, "geo/layer_21/stable_rank_k_proj": 30.199556350708008, "geo/layer_21/stable_rank_o_proj": 69.45773315429688, "geo/layer_21/stable_rank_gate_proj": 64.99188232421875, "geo/layer_21/stable_rank_down_proj": 50.6310920715332, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1426704227924347, "geo/layer_21/attn_entropy_mean": 5.680137634277344, "geo/layer_21/attn_entropy_std": 0.3114851117134094, "geo/layer_27/stable_rank_q_proj": 43.371768951416016, "geo/layer_27/stable_rank_k_proj": 32.07228469848633, "geo/layer_27/stable_rank_o_proj": 115.35480499267578, "geo/layer_27/stable_rank_gate_proj": 79.67304992675781, "geo/layer_27/stable_rank_down_proj": 126.96648406982422, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09306881576776505, "geo/layer_27/attn_entropy_mean": 4.195918083190918, "geo/layer_27/attn_entropy_std": 0.7332192659378052, "attnres/final_alpha/block_0": 0.23657070100307465, "attnres/block_norm/0": 1.769165277481079, "attnres/final_alpha/block_1": 0.004233559127897024, "attnres/block_norm/1": 47354.17578125, "attnres/final_alpha/block_2": 0.010153437033295631, "attnres/block_norm/2": 28872.666015625, "attnres/final_alpha/block_3": 0.011935821734368801, "attnres/block_norm/3": 59777.82421875, "attnres/final_alpha/block_4": 0.014361850917339325, "attnres/block_norm/4": 15449.7099609375, "attnres/final_alpha/block_5": 0.6155750751495361, "attnres/block_norm/5": 6660.9921875, "attnres/final_alpha/block_6": 0.10716959834098816, "attnres/block_norm/6": 39630.34765625, "geo/tier1_time_s": 1.3625106811523438, "geo/step": 67800.0, "geo/rankme_slope": -9.999898396858714e-07} {"step": 67810, "timestamp": 1778267832.313059, "train/loss": 2.1555055975914, "train/z_loss": 0.0013887486187741161, "train/perplexity": 8.632253522417123, "train/grad_norm": 0.1015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1786875.2520553267, "perf/iters_per_sec": 0.8520485172535547, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.173642086982727, "data/tokens_consumed": 142209974272, "data/tokens_consumed_B": 142.209974272, "train/loss_slope": 4.067237578173759e-06} {"step": 67820, "timestamp": 1778267842.6955607, "train/loss": 2.1430673837661742, "train/z_loss": 0.0013770102057605981, "train/perplexity": 8.525548691742836, "train/grad_norm": 0.1982421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021458.1679347912, "perf/iters_per_sec": 0.9639063682245212, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0374451637268067, "data/tokens_consumed": 142230945792, "data/tokens_consumed_B": 142.230945792, "train/loss_slope": 3.6883894259101024e-06} {"step": 67830, "timestamp": 1778267853.0739143, "train/loss": 2.1528006315231325, "train/z_loss": 0.0013862395775504411, "train/perplexity": 8.608935121506002, "train/grad_norm": 0.1826171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022198.200503567, "perf/iters_per_sec": 0.9642592432515941, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0370655059814453, "data/tokens_consumed": 142251917312, "data/tokens_consumed_B": 142.251917312, "train/loss_slope": 6.633748330048503e-06} {"step": 67840, "timestamp": 1778267863.4481738, "train/loss": 2.161497688293457, "train/z_loss": 0.0013614966068416835, "train/perplexity": 8.684134049667655, "train/grad_norm": 0.23046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022513.5434563803, "perf/iters_per_sec": 0.9644096104890729, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0369038105010986, "data/tokens_consumed": 142272888832, "data/tokens_consumed_B": 142.272888832, "train/loss_slope": 6.943304482215997e-06} {"step": 67850, "timestamp": 1778267873.8130379, "grad/layer_0/attn": 0.0030633292626589537, "grad/layer_0/mlp": 0.003449447685852647, "grad/layer_0/attn_mlp_ratio": 0.8880636707193261, "grad/layer_4/attn": 0.00394351314753294, "grad/layer_4/mlp": 0.0026132198981940746, "grad/layer_4/attn_mlp_ratio": 1.509062822976326, "grad/layer_8/attn": 0.003869697218760848, "grad/layer_8/mlp": 0.0037019988521933556, "grad/layer_8/attn_mlp_ratio": 1.0452993825047234, "grad/layer_12/attn": 0.005901869386434555, "grad/layer_12/mlp": 0.007034136448055506, "grad/layer_12/attn_mlp_ratio": 0.839032530306232, "grad/layer_16/attn": 0.0033818429801613092, "grad/layer_16/mlp": 0.004924415610730648, "grad/layer_16/attn_mlp_ratio": 0.6867500996701063, "grad/layer_20/attn": 0.0030478592962026596, "grad/layer_20/mlp": 0.005818651523441076, "grad/layer_20/attn_mlp_ratio": 0.523808520160242, "grad/layer_24/attn": 0.013523268513381481, "grad/layer_24/mlp": 0.011448465287685394, "grad/layer_24/attn_mlp_ratio": 1.1812298028981125, "grad/layer_27/attn": 0.004507280886173248, "grad/layer_27/mlp": 0.010960531421005726, "grad/layer_27/attn_mlp_ratio": 0.41122831292762574} {"step": 67850, "timestamp": 1778267873.8293977, "train/loss": 2.107734966278076, "train/z_loss": 0.0013872987707145511, "train/perplexity": 8.229579882301534, "train/grad_norm": 0.2041015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021522.604147704, "perf/iters_per_sec": 0.963937093805172, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0374120950698853, "data/tokens_consumed": 142293860352, "data/tokens_consumed_B": 142.293860352, "train/loss_slope": -1.3147607709493375e-06} {"step": 67860, "timestamp": 1778267884.2119234, "train/loss": 2.031462788581848, "train/z_loss": 0.0013974209548905491, "train/perplexity": 7.625232307462575, "train/grad_norm": 0.10986328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021327.2175904715, "perf/iters_per_sec": 0.9638439262344701, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0375123739242553, "data/tokens_consumed": 142314831872, "data/tokens_consumed_B": 142.314831872, "train/loss_slope": -6.4173933553366265e-06} {"step": 67870, "timestamp": 1778267894.5826108, "train/loss": 2.137544298171997, "train/z_loss": 0.0013836066820658744, "train/perplexity": 8.478591151206714, "train/grad_norm": 0.1787109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023203.5770044967, "perf/iters_per_sec": 0.9647386441252216, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0365501642227173, "data/tokens_consumed": 142335803392, "data/tokens_consumed_B": 142.335803392, "train/loss_slope": -5.147736450948381e-06} {"step": 67875, "timestamp": 1778267900.3691263, "eos/sharpness": 86.12608909606932, "eos/L0_probe": 1.9659887552261353, "eos/L_plus": 2.344712495803833, "eos/L_minus": 2.448525905609131, "eos/grad_norm": 0.3111763596534729, "eos/embed_grad_frac": 0.025787431746721268, "eos/time_s": 0.6075599193572998} {"step": 67875, "timestamp": 1778267901.7524378, "geo/rankme_last": 438.1020202636719, "geo/layer_0/stable_rank_q_proj": 19.316316604614258, "geo/layer_0/stable_rank_k_proj": 16.070903778076172, "geo/layer_0/stable_rank_o_proj": 46.890647888183594, "geo/layer_0/stable_rank_gate_proj": 130.0171661376953, "geo/layer_0/stable_rank_down_proj": 55.546424865722656, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06297773122787476, "geo/layer_0/attn_entropy_mean": 6.160553932189941, "geo/layer_0/attn_entropy_std": 0.4178181290626526, "geo/layer_7/stable_rank_q_proj": 43.26335525512695, "geo/layer_7/stable_rank_k_proj": 40.56557846069336, "geo/layer_7/stable_rank_o_proj": 89.80009460449219, "geo/layer_7/stable_rank_gate_proj": 79.56529998779297, "geo/layer_7/stable_rank_down_proj": 140.40045166015625, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.43916282057762146, "geo/layer_7/attn_entropy_mean": 4.642542839050293, "geo/layer_7/attn_entropy_std": 0.8141850233078003, "geo/layer_14/stable_rank_q_proj": 50.82968521118164, "geo/layer_14/stable_rank_k_proj": 40.645416259765625, "geo/layer_14/stable_rank_o_proj": 43.514808654785156, "geo/layer_14/stable_rank_gate_proj": 71.7610855102539, "geo/layer_14/stable_rank_down_proj": 128.7858428955078, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39206889271736145, "geo/layer_14/attn_entropy_mean": 5.550975799560547, "geo/layer_14/attn_entropy_std": 0.409005731344223, "geo/layer_21/stable_rank_q_proj": 40.25675964355469, "geo/layer_21/stable_rank_k_proj": 30.048206329345703, "geo/layer_21/stable_rank_o_proj": 69.42617797851562, "geo/layer_21/stable_rank_gate_proj": 64.8522720336914, "geo/layer_21/stable_rank_down_proj": 50.62520217895508, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13905830681324005, "geo/layer_21/attn_entropy_mean": 5.6892900466918945, "geo/layer_21/attn_entropy_std": 0.30926600098609924, "geo/layer_27/stable_rank_q_proj": 43.425209045410156, "geo/layer_27/stable_rank_k_proj": 32.09135437011719, "geo/layer_27/stable_rank_o_proj": 115.43706512451172, "geo/layer_27/stable_rank_gate_proj": 79.63871002197266, "geo/layer_27/stable_rank_down_proj": 127.21417236328125, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09698008000850677, "geo/layer_27/attn_entropy_mean": 4.219090938568115, "geo/layer_27/attn_entropy_std": 0.7378893494606018, "attnres/final_alpha/block_0": 0.23741626739501953, "attnres/block_norm/0": 1.7695138454437256, "attnres/final_alpha/block_1": 0.004211866296827793, "attnres/block_norm/1": 47345.29296875, "attnres/final_alpha/block_2": 0.010182667523622513, "attnres/block_norm/2": 28749.51953125, "attnres/final_alpha/block_3": 0.012091042473912239, "attnres/block_norm/3": 59882.09375, "attnres/final_alpha/block_4": 0.014595196582376957, "attnres/block_norm/4": 15440.21875, "attnres/final_alpha/block_5": 0.6127943992614746, "attnres/block_norm/5": 6739.3896484375, "attnres/final_alpha/block_6": 0.10870854556560516, "attnres/block_norm/6": 40040.9375, "geo/tier1_time_s": 1.361494541168213, "geo/step": 67875.0, "geo/rankme_slope": -1.5958864014355737e-05} {"step": 67880, "timestamp": 1778267906.942796, "train/loss": 2.1471675872802733, "train/z_loss": 0.0013746952172368765, "train/perplexity": 8.560576938845681, "train/grad_norm": 0.2421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1697578.8705711893, "perf/iters_per_sec": 0.8094686844688365, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2353782415390016, "data/tokens_consumed": 142356774912, "data/tokens_consumed_B": 142.356774912, "train/loss_slope": -8.525035547988365e-06} {"step": 67890, "timestamp": 1778267917.313698, "train/loss": 2.158596467971802, "train/z_loss": 0.0013794388971291483, "train/perplexity": 8.658975975690975, "train/grad_norm": 0.1865234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023078.3562915185, "perf/iters_per_sec": 0.9646789342362969, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0366143226623534, "data/tokens_consumed": 142377746432, "data/tokens_consumed_B": 142.377746432, "train/loss_slope": -5.40943371795367e-06} {"step": 67900, "timestamp": 1778267927.67532, "grad/layer_0/attn": 0.003006582846865058, "grad/layer_0/mlp": 0.0031938988249748945, "grad/layer_0/attn_mlp_ratio": 0.9413519079626764, "grad/layer_4/attn": 0.0035100013483315706, "grad/layer_4/mlp": 0.0026600754354149103, "grad/layer_4/attn_mlp_ratio": 1.3195119091924927, "grad/layer_8/attn": 0.00492499303072691, "grad/layer_8/mlp": 0.0037308717146515846, "grad/layer_8/attn_mlp_ratio": 1.3200649272874685, "grad/layer_12/attn": 0.004828127566725016, "grad/layer_12/mlp": 0.006797301582992077, "grad/layer_12/attn_mlp_ratio": 0.7103006151405277, "grad/layer_16/attn": 0.003913532942533493, "grad/layer_16/mlp": 0.00486889760941267, "grad/layer_16/attn_mlp_ratio": 0.8037821240252712, "grad/layer_20/attn": 0.004073820076882839, "grad/layer_20/mlp": 0.007099155802279711, "grad/layer_20/attn_mlp_ratio": 0.5738456984124883, "grad/layer_24/attn": 0.013833784498274326, "grad/layer_24/mlp": 0.01192486472427845, "grad/layer_24/attn_mlp_ratio": 1.1600789360823116, "grad/layer_27/attn": 0.00839255377650261, "grad/layer_27/mlp": 0.011057124473154545, "grad/layer_27/attn_mlp_ratio": 0.7590177465196319} {"step": 67900, "timestamp": 1778267927.6920323, "train/loss": 2.113005518913269, "train/z_loss": 0.0013824378955177962, "train/perplexity": 8.273068820934562, "train/grad_norm": 0.22265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021597.4984048142, "perf/iters_per_sec": 0.9639728061698981, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037373661994934, "data/tokens_consumed": 142398717952, "data/tokens_consumed_B": 142.398717952, "train/loss_slope": -5.41779478498306e-06} {"step": 67910, "timestamp": 1778267938.0698483, "train/loss": 2.1680519461631773, "train/z_loss": 0.0013618550379760563, "train/perplexity": 8.741239039591589, "train/grad_norm": 0.1474609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022189.6929069825, "perf/iters_per_sec": 0.9642551865134156, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0370698690414428, "data/tokens_consumed": 142419689472, "data/tokens_consumed_B": 142.419689472, "train/loss_slope": -4.972997092285038e-06} {"step": 67920, "timestamp": 1778267948.4415998, "train/loss": 2.1576612114906313, "train/z_loss": 0.0013705479796044528, "train/perplexity": 8.650881398132501, "train/grad_norm": 0.1279296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023036.9917252085, "perf/iters_per_sec": 0.9646592100740473, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0366355180740356, "data/tokens_consumed": 142440660992, "data/tokens_consumed_B": 142.440660992, "train/loss_slope": -3.3899539160077125e-06} {"step": 67930, "timestamp": 1778267958.8230016, "train/loss": 2.143805503845215, "train/z_loss": 0.0013788652257062494, "train/perplexity": 8.531843893438623, "train/grad_norm": 0.27734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021042.3349455318, "perf/iters_per_sec": 0.9637080836036357, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0376586198806763, "data/tokens_consumed": 142461632512, "data/tokens_consumed_B": 142.461632512, "train/loss_slope": -3.5570979726374957e-06} {"step": 67940, "timestamp": 1778267969.2014406, "train/loss": 2.153108835220337, "train/z_loss": 0.001379896537400782, "train/perplexity": 8.611588836060754, "train/grad_norm": 0.1474609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022111.3612969036, "perf/iters_per_sec": 0.9642178350910681, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0371100425720214, "data/tokens_consumed": 142482604032, "data/tokens_consumed_B": 142.482604032, "train/loss_slope": -6.119054650673767e-06} {"step": 67950, "timestamp": 1778267979.5832803, "grad/layer_0/attn": 0.002823022659868002, "grad/layer_0/mlp": 0.0031076264567673206, "grad/layer_0/attn_mlp_ratio": 0.9084176004740495, "grad/layer_4/attn": 0.0022683434654027224, "grad/layer_4/mlp": 0.002725510625168681, "grad/layer_4/attn_mlp_ratio": 0.8322636357493457, "grad/layer_8/attn": 0.006066395901143551, "grad/layer_8/mlp": 0.0038372394628822803, "grad/layer_8/attn_mlp_ratio": 1.5809270705493497, "grad/layer_12/attn": 0.004149952437728643, "grad/layer_12/mlp": 0.00624614953994751, "grad/layer_12/attn_mlp_ratio": 0.6644016997587522, "grad/layer_16/attn": 0.0038625025190413, "grad/layer_16/mlp": 0.004696039482951164, "grad/layer_16/attn_mlp_ratio": 0.8225021213756383, "grad/layer_20/attn": 0.003140972461551428, "grad/layer_20/mlp": 0.006375827360898256, "grad/layer_20/attn_mlp_ratio": 0.49263761932304756, "grad/layer_24/attn": 0.015316746197640896, "grad/layer_24/mlp": 0.012044541537761688, "grad/layer_24/attn_mlp_ratio": 1.2716753080599008, "grad/layer_27/attn": 0.0053797997534275055, "grad/layer_27/mlp": 0.011591135524213314, "grad/layer_27/attn_mlp_ratio": 0.46413051558031704} {"step": 67950, "timestamp": 1778267980.192514, "eos/sharpness": 58.30028057098387, "eos/L0_probe": 1.970686674118042, "eos/L_plus": 2.284620761871338, "eos/L_minus": 2.239755392074585, "eos/grad_norm": 0.1827416718006134, "eos/embed_grad_frac": 0.07012765854597092, "eos/time_s": 0.6064450740814209} {"step": 67950, "timestamp": 1778267980.2119281, "train/loss": 2.1558088064193726, "train/z_loss": 0.0013868023990653456, "train/perplexity": 8.634871294736199, "train/grad_norm": 0.1826171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1905538.0081506597, "perf/iters_per_sec": 0.9086313286546038, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1005563735961914, "data/tokens_consumed": 142503575552, "data/tokens_consumed_B": 142.503575552, "train/loss_slope": -6.377709051384517e-06} {"step": 67950, "timestamp": 1778267981.5793335, "geo/rankme_last": 438.7157897949219, "geo/layer_0/stable_rank_q_proj": 19.306081771850586, "geo/layer_0/stable_rank_k_proj": 16.04213523864746, "geo/layer_0/stable_rank_o_proj": 46.94034957885742, "geo/layer_0/stable_rank_gate_proj": 130.12828063964844, "geo/layer_0/stable_rank_down_proj": 55.580448150634766, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06003599613904953, "geo/layer_0/attn_entropy_mean": 6.161443710327148, "geo/layer_0/attn_entropy_std": 0.4222151041030884, "geo/layer_7/stable_rank_q_proj": 43.310359954833984, "geo/layer_7/stable_rank_k_proj": 40.37625503540039, "geo/layer_7/stable_rank_o_proj": 89.83296966552734, "geo/layer_7/stable_rank_gate_proj": 79.59520721435547, "geo/layer_7/stable_rank_down_proj": 140.5133056640625, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.44549497961997986, "geo/layer_7/attn_entropy_mean": 4.613229751586914, "geo/layer_7/attn_entropy_std": 0.8151302337646484, "geo/layer_14/stable_rank_q_proj": 50.85965347290039, "geo/layer_14/stable_rank_k_proj": 40.59896469116211, "geo/layer_14/stable_rank_o_proj": 43.523807525634766, "geo/layer_14/stable_rank_gate_proj": 71.74960327148438, "geo/layer_14/stable_rank_down_proj": 128.8340606689453, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3816841244697571, "geo/layer_14/attn_entropy_mean": 5.537840843200684, "geo/layer_14/attn_entropy_std": 0.42050331830978394, "geo/layer_21/stable_rank_q_proj": 40.23007583618164, "geo/layer_21/stable_rank_k_proj": 30.077049255371094, "geo/layer_21/stable_rank_o_proj": 69.5186996459961, "geo/layer_21/stable_rank_gate_proj": 64.89450073242188, "geo/layer_21/stable_rank_down_proj": 50.614646911621094, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14056435227394104, "geo/layer_21/attn_entropy_mean": 5.689388275146484, "geo/layer_21/attn_entropy_std": 0.30187830328941345, "geo/layer_27/stable_rank_q_proj": 43.43510818481445, "geo/layer_27/stable_rank_k_proj": 32.06984329223633, "geo/layer_27/stable_rank_o_proj": 115.4252700805664, "geo/layer_27/stable_rank_gate_proj": 79.63209533691406, "geo/layer_27/stable_rank_down_proj": 127.18869018554688, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08852353692054749, "geo/layer_27/attn_entropy_mean": 4.1934380531311035, "geo/layer_27/attn_entropy_std": 0.7375494837760925, "attnres/final_alpha/block_0": 0.2363872528076172, "attnres/block_norm/0": 1.769522786140442, "attnres/final_alpha/block_1": 0.004159855656325817, "attnres/block_norm/1": 47339.40625, "attnres/final_alpha/block_2": 0.010128338821232319, "attnres/block_norm/2": 28858.681640625, "attnres/final_alpha/block_3": 0.011999301612377167, "attnres/block_norm/3": 59620.86328125, "attnres/final_alpha/block_4": 0.014447184279561043, "attnres/block_norm/4": 15431.583984375, "attnres/final_alpha/block_5": 0.6167612671852112, "attnres/block_norm/5": 6687.69189453125, "attnres/final_alpha/block_6": 0.10611683130264282, "attnres/block_norm/6": 39881.03125, "geo/tier1_time_s": 1.3632333278656006, "geo/step": 67950.0, "geo/rankme_slope": 3.879188394107644e-06} {"step": 67960, "timestamp": 1778267991.9604042, "train/loss": 2.1790044784545897, "train/z_loss": 0.0013697220245376229, "train/perplexity": 8.837503952412787, "train/grad_norm": 0.1630859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1785645.4427359144, "perf/iters_per_sec": 0.8514620984725544, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.174450397491455, "data/tokens_consumed": 142524547072, "data/tokens_consumed_B": 142.524547072, "train/loss_slope": -6.8586907728705915e-06} {"step": 67970, "timestamp": 1778268002.3338728, "train/loss": 2.1668394088745115, "train/z_loss": 0.0013722368166781962, "train/perplexity": 8.730646384599343, "train/grad_norm": 0.173828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023000.7467746865, "perf/iters_per_sec": 0.9646419271348412, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0366540908813477, "data/tokens_consumed": 142545518592, "data/tokens_consumed_B": 142.545518592, "train/loss_slope": -6.870421667983132e-06} {"step": 67980, "timestamp": 1778268013.2446404, "train/loss": 2.160971975326538, "train/z_loss": 0.0013732525985687971, "train/perplexity": 8.679569887615987, "train/grad_norm": 0.1005859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1922937.629602188, "perf/iters_per_sec": 0.9169281147013607, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0905980348587037, "data/tokens_consumed": 142566490112, "data/tokens_consumed_B": 142.566490112, "train/loss_slope": -4.243608183450163e-06} {"step": 67990, "timestamp": 1778268023.621476, "train/loss": 2.125161361694336, "train/z_loss": 0.0013732204912230372, "train/perplexity": 8.374248662063378, "train/grad_norm": 0.11474609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021953.6007950911, "perf/iters_per_sec": 0.9641426090217071, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0371909618377686, "data/tokens_consumed": 142587461632, "data/tokens_consumed_B": 142.587461632, "train/loss_slope": -5.635715282038428e-06} {"step": 68000, "timestamp": 1778268033.9842675, "grad/layer_0/attn": 0.0027448569890111685, "grad/layer_0/mlp": 0.0030501806177198887, "grad/layer_0/attn_mlp_ratio": 0.8998997905484892, "grad/layer_4/attn": 0.0022788101341575384, "grad/layer_4/mlp": 0.0024365978315472603, "grad/layer_4/attn_mlp_ratio": 0.9352425792754715, "grad/layer_8/attn": 0.004205633886158466, "grad/layer_8/mlp": 0.0036808752920478582, "grad/layer_8/attn_mlp_ratio": 1.1425635041176057, "grad/layer_12/attn": 0.004172070883214474, "grad/layer_12/mlp": 0.007768176030367613, "grad/layer_12/attn_mlp_ratio": 0.537072127768174, "grad/layer_16/attn": 0.004202337935566902, "grad/layer_16/mlp": 0.004595134872943163, "grad/layer_16/attn_mlp_ratio": 0.9145189336789653, "grad/layer_20/attn": 0.005965115036815405, "grad/layer_20/mlp": 0.005331854801625013, "grad/layer_20/attn_mlp_ratio": 1.1187691988762467, "grad/layer_24/attn": 0.006317464634776115, "grad/layer_24/mlp": 0.007690528407692909, "grad/layer_24/attn_mlp_ratio": 0.8214604013829082, "grad/layer_27/attn": 0.005974427796900272, "grad/layer_27/mlp": 0.007204264402389526, "grad/layer_27/attn_mlp_ratio": 0.8292904563566011} {"step": 68000, "timestamp": 1778268033.9989033, "train/loss": 2.0936059594154357, "train/z_loss": 0.0013722294243052602, "train/perplexity": 8.114121667349572, "train/grad_norm": 0.09521484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022313.1763019315, "perf/iters_per_sec": 0.9643140679845483, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0370065450668335, "data/tokens_consumed": 142608433152, "data/tokens_consumed_B": 142.608433152, "train/loss_slope": -6.998136019942764e-06} {"step": 68000, "timestamp": 1778268041.060983, "geo/ww_alpha_mean": 7.436943856865838, "geo/ww_alpha_std": 4.468250061827755, "geo/ww_alpha_min": 1.3456222928743489, "geo/ww_alpha_max": 35.270553128051404, "geo/ww_alpha_healthy_frac": 0.17766497461928935, "geo/ww_alpha_by_type/q_proj": 3.9125326483220775, "geo/ww_alpha_by_type/k_proj": 4.447265365087648, "geo/ww_alpha_by_type/v_proj": 7.397585471074918, "geo/ww_alpha_by_type/o_proj": 8.718441221719992, "geo/ww_alpha_by_type/gate_proj": 8.101424024688672, "geo/ww_alpha_by_type/up_proj": 11.77110100859471, "geo/ww_alpha_by_type/down_proj": 7.796566646658922, "geo/twonn_id/layer_0": 0.7560438513755798, "geo/twonn_id/layer_7": 3.4859426021575928, "geo/twonn_id/layer_14": 4.678079605102539, "geo/twonn_id/layer_21": 7.35460901260376, "geo/twonn_id/layer_27": 5.2758402824401855, "geo/tier2_time_s": 7.054470062255859} {"step": 68000, "timestamp": 1778268041.888678, "eoc/jacobian_sigma/layer_0/attn": 1142.1673583984375, "eoc/jacobian_sigma/layer_0/mlp": 8854.4931640625, "eoc/jacobian_sigma/layer_0": 8854.4931640625, "eoc/jacobian_sigma/layer_7/attn": 1.1417601108551025, "eoc/jacobian_sigma/layer_7/mlp": 1.7376396656036377, "eoc/jacobian_sigma/layer_7": 1.7376396656036377, "eoc/jacobian_sigma/layer_14/attn": 1.4631521701812744, "eoc/jacobian_sigma/layer_14/mlp": 7.092895984649658, "eoc/jacobian_sigma/layer_14": 7.092895984649658, "eoc/jacobian_sigma/layer_21/attn": 1.0869159698486328, "eoc/jacobian_sigma/layer_21/mlp": 4.444485664367676, "eoc/jacobian_sigma/layer_21": 4.444485664367676, "eoc/jacobian_sigma/layer_27/attn": 3.063140392303467, "eoc/jacobian_sigma/layer_27/mlp": 29.89466094970703, "eoc/jacobian_sigma/layer_27": 29.89466094970703, "eoc/layer0_sigma": 8854.4931640625, "eoc/sigma_max": 29.89466094970703, "eoc/sigma_min": 1.7376396656036377, "eoc/sigma_mean": 10.792420566082, "eoc/time_s": 0.8202853202819824} {"step": 68010, "timestamp": 1778268052.9214318, "train/loss": 2.1327967166900637, "train/z_loss": 0.0013843779684975742, "train/perplexity": 8.438433749559957, "train/grad_norm": 0.169921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1108667.6286791738, "perf/iters_per_sec": 0.5286539214511746, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.8915966749191284, "data/tokens_consumed": 142629404672, "data/tokens_consumed_B": 142.629404672, "train/loss_slope": -1.1199458847881375e-05} {"step": 68020, "timestamp": 1778268063.2887328, "train/loss": 2.114154100418091, "train/z_loss": 0.0013786177383735775, "train/perplexity": 8.282576573939782, "train/grad_norm": 0.10400390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024311.7412194738, "perf/iters_per_sec": 0.9652670580003136, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0359827280044556, "data/tokens_consumed": 142650376192, "data/tokens_consumed_B": 142.650376192, "train/loss_slope": -1.1684718412427307e-05} {"step": 68025, "timestamp": 1778268069.0657687, "eos/sharpness": 58.33044052124022, "eos/L0_probe": 1.9675843715667725, "eos/L_plus": 2.3013200759887695, "eos/L_minus": 2.2171530723571777, "eos/grad_norm": 0.15690414607524872, "eos/embed_grad_frac": 0.09212592989206314, "eos/time_s": 0.6024086475372314} {"step": 68025, "timestamp": 1778268070.442753, "geo/rankme_last": 439.2290344238281, "geo/layer_0/stable_rank_q_proj": 19.312849044799805, "geo/layer_0/stable_rank_k_proj": 16.046222686767578, "geo/layer_0/stable_rank_o_proj": 46.92270278930664, "geo/layer_0/stable_rank_gate_proj": 130.00448608398438, "geo/layer_0/stable_rank_down_proj": 55.63013458251953, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06397460401058197, "geo/layer_0/attn_entropy_mean": 6.159642219543457, "geo/layer_0/attn_entropy_std": 0.4187769591808319, "geo/layer_7/stable_rank_q_proj": 43.26278305053711, "geo/layer_7/stable_rank_k_proj": 40.37244415283203, "geo/layer_7/stable_rank_o_proj": 89.63761138916016, "geo/layer_7/stable_rank_gate_proj": 79.61318969726562, "geo/layer_7/stable_rank_down_proj": 140.73272705078125, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.44435861706733704, "geo/layer_7/attn_entropy_mean": 4.63885498046875, "geo/layer_7/attn_entropy_std": 0.7855960726737976, "geo/layer_14/stable_rank_q_proj": 50.804866790771484, "geo/layer_14/stable_rank_k_proj": 40.55794906616211, "geo/layer_14/stable_rank_o_proj": 43.493080139160156, "geo/layer_14/stable_rank_gate_proj": 71.73450469970703, "geo/layer_14/stable_rank_down_proj": 128.7642364501953, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.375457227230072, "geo/layer_14/attn_entropy_mean": 5.519002914428711, "geo/layer_14/attn_entropy_std": 0.40937352180480957, "geo/layer_21/stable_rank_q_proj": 40.277435302734375, "geo/layer_21/stable_rank_k_proj": 30.035425186157227, "geo/layer_21/stable_rank_o_proj": 69.50768280029297, "geo/layer_21/stable_rank_gate_proj": 64.89672088623047, "geo/layer_21/stable_rank_down_proj": 50.580230712890625, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14290456473827362, "geo/layer_21/attn_entropy_mean": 5.690654754638672, "geo/layer_21/attn_entropy_std": 0.29559144377708435, "geo/layer_27/stable_rank_q_proj": 43.42558670043945, "geo/layer_27/stable_rank_k_proj": 31.953874588012695, "geo/layer_27/stable_rank_o_proj": 115.23614501953125, "geo/layer_27/stable_rank_gate_proj": 79.59203338623047, "geo/layer_27/stable_rank_down_proj": 127.2230453491211, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09266499429941177, "geo/layer_27/attn_entropy_mean": 4.169605731964111, "geo/layer_27/attn_entropy_std": 0.7294687032699585, "attnres/final_alpha/block_0": 0.2367336004972458, "attnres/block_norm/0": 1.769509196281433, "attnres/final_alpha/block_1": 0.004155436530709267, "attnres/block_norm/1": 47398.0546875, "attnres/final_alpha/block_2": 0.010167392902076244, "attnres/block_norm/2": 28726.9921875, "attnres/final_alpha/block_3": 0.01190219447016716, "attnres/block_norm/3": 59863.1328125, "attnres/final_alpha/block_4": 0.014375005848705769, "attnres/block_norm/4": 15522.0693359375, "attnres/final_alpha/block_5": 0.616812527179718, "attnres/block_norm/5": 6740.41845703125, "attnres/final_alpha/block_6": 0.1058538556098938, "attnres/block_norm/6": 39555.46875, "geo/tier1_time_s": 1.359266996383667, "geo/step": 68025.0, "geo/rankme_slope": 2.1362138605442173e-05} {"step": 68030, "timestamp": 1778268075.6327279, "train/loss": 2.1410470724105837, "train/z_loss": 0.001371477567590773, "train/perplexity": 8.508341816383824, "train/grad_norm": 0.14453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1699779.6884288085, "perf/iters_per_sec": 0.8105181162017863, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.233778715133667, "data/tokens_consumed": 142671347712, "data/tokens_consumed_B": 142.671347712, "train/loss_slope": -1.499009884194211e-05} {"step": 68040, "timestamp": 1778268085.9979389, "train/loss": 2.13562912940979, "train/z_loss": 0.0013803108595311642, "train/perplexity": 8.462368757555657, "train/grad_norm": 0.1220703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024191.6004135236, "perf/iters_per_sec": 0.9652097703998201, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036044216156006, "data/tokens_consumed": 142692319232, "data/tokens_consumed_B": 142.692319232, "train/loss_slope": -1.677146837083988e-05} {"step": 68050, "timestamp": 1778268096.3516624, "grad/layer_0/attn": 0.003041125601157546, "grad/layer_0/mlp": 0.0030365458223968744, "grad/layer_0/attn_mlp_ratio": 1.0015081868931712, "grad/layer_4/attn": 0.0021851095370948315, "grad/layer_4/mlp": 0.002608631271868944, "grad/layer_4/attn_mlp_ratio": 0.8376459628058976, "grad/layer_8/attn": 0.003574453992769122, "grad/layer_8/mlp": 0.0038155780639499426, "grad/layer_8/attn_mlp_ratio": 0.9368053383209414, "grad/layer_12/attn": 0.008965590968728065, "grad/layer_12/mlp": 0.007098088506609201, "grad/layer_12/attn_mlp_ratio": 1.2630993307663119, "grad/layer_16/attn": 0.0038542544934898615, "grad/layer_16/mlp": 0.00468054972589016, "grad/layer_16/attn_mlp_ratio": 0.8234619087207024, "grad/layer_20/attn": 0.003417006926611066, "grad/layer_20/mlp": 0.0056348517537117004, "grad/layer_20/attn_mlp_ratio": 0.6064058142647121, "grad/layer_24/attn": 0.008684193715453148, "grad/layer_24/mlp": 0.007597596850246191, "grad/layer_24/attn_mlp_ratio": 1.143018479701236, "grad/layer_27/attn": 0.0056557790376245975, "grad/layer_27/mlp": 0.0065735518001019955, "grad/layer_27/attn_mlp_ratio": 0.8603840242802133} {"step": 68050, "timestamp": 1778268096.3657842, "train/loss": 2.1637945652008055, "train/z_loss": 0.0013722608448006213, "train/perplexity": 8.704103361372862, "train/grad_norm": 0.09521484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024394.436566308, "perf/iters_per_sec": 0.9653064902144947, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035940408706665, "data/tokens_consumed": 142713290752, "data/tokens_consumed_B": 142.713290752, "train/loss_slope": -1.6069257341631513e-05} {"step": 68060, "timestamp": 1778268106.7270658, "train/loss": 2.1670737266540527, "train/z_loss": 0.0013708742102608086, "train/perplexity": 8.732692369970154, "train/grad_norm": 0.1953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025439.9620116588, "perf/iters_per_sec": 0.9658050355966848, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035405659675598, "data/tokens_consumed": 142734262272, "data/tokens_consumed_B": 142.734262272, "train/loss_slope": -1.387482189466885e-05} {"step": 68070, "timestamp": 1778268117.0938292, "train/loss": 2.162655305862427, "train/z_loss": 0.0013593691983260214, "train/perplexity": 8.694192776771157, "train/grad_norm": 0.099609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024197.749189547, "perf/iters_per_sec": 0.9652127023647056, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0360410690307618, "data/tokens_consumed": 142755233792, "data/tokens_consumed_B": 142.755233792, "train/loss_slope": -1.4080135971799525e-05} {"step": 68080, "timestamp": 1778268127.4562507, "train/loss": 2.154621624946594, "train/z_loss": 0.0013648564578033983, "train/perplexity": 8.624626218101131, "train/grad_norm": 0.08203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025476.947388908, "perf/iters_per_sec": 0.9658226715988674, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0353867530822753, "data/tokens_consumed": 142776205312, "data/tokens_consumed_B": 142.776205312, "train/loss_slope": -1.4995092083804769e-05} {"step": 68090, "timestamp": 1778268137.8151543, "train/loss": 2.1103917837142943, "train/z_loss": 0.001377501175738871, "train/perplexity": 8.251473444345974, "train/grad_norm": 0.13671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025459.8770469513, "perf/iters_per_sec": 0.9658145318255192, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0353954792022706, "data/tokens_consumed": 142797176832, "data/tokens_consumed_B": 142.797176832, "train/loss_slope": -1.8108047417538544e-05} {"step": 68100, "timestamp": 1778268148.1669953, "grad/layer_0/attn": 0.003544815815985203, "grad/layer_0/mlp": 0.0030728280544281006, "grad/layer_0/attn_mlp_ratio": 1.1536004090814334, "grad/layer_4/attn": 0.0025094689335674047, "grad/layer_4/mlp": 0.0025915722362697124, "grad/layer_4/attn_mlp_ratio": 0.9683190773596208, "grad/layer_8/attn": 0.009172786958515644, "grad/layer_8/mlp": 0.0036675818264484406, "grad/layer_8/attn_mlp_ratio": 2.501044868927646, "grad/layer_12/attn": 0.005352359730750322, "grad/layer_12/mlp": 0.0068147811107337475, "grad/layer_12/attn_mlp_ratio": 0.7854044855203846, "grad/layer_16/attn": 0.004166780970990658, "grad/layer_16/mlp": 0.004537761211395264, "grad/layer_16/attn_mlp_ratio": 0.9182459554509849, "grad/layer_20/attn": 0.0037478054873645306, "grad/layer_20/mlp": 0.005501555744558573, "grad/layer_20/attn_mlp_ratio": 0.6812264736113467, "grad/layer_24/attn": 0.013849625363945961, "grad/layer_24/mlp": 0.00868316926062107, "grad/layer_24/attn_mlp_ratio": 1.5949965719608348, "grad/layer_27/attn": 0.008587817661464214, "grad/layer_27/mlp": 0.008135128766298294, "grad/layer_27/attn_mlp_ratio": 1.0556461738475087} {"step": 68100, "timestamp": 1778268148.7604504, "eos/sharpness": 61.675882339477525, "eos/L0_probe": 1.9661017656326294, "eos/L_plus": 2.318633794784546, "eos/L_minus": 2.2303285598754883, "eos/grad_norm": 0.1467294842004776, "eos/embed_grad_frac": 0.10123450309038162, "eos/time_s": 0.5906012058258057} {"step": 68100, "timestamp": 1778268148.7808437, "train/loss": 2.1641132831573486, "train/z_loss": 0.0013601438142359257, "train/perplexity": 8.706877957543064, "train/grad_norm": 0.146484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1913319.0836960357, "perf/iters_per_sec": 0.9123416346054247, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0960806369781495, "data/tokens_consumed": 142818148352, "data/tokens_consumed_B": 142.818148352, "train/loss_slope": -1.5923852409788554e-05} {"step": 68100, "timestamp": 1778268150.1409693, "geo/rankme_last": 438.2685546875, "geo/layer_0/stable_rank_q_proj": 19.28171730041504, "geo/layer_0/stable_rank_k_proj": 16.09850311279297, "geo/layer_0/stable_rank_o_proj": 46.96683883666992, "geo/layer_0/stable_rank_gate_proj": 129.96029663085938, "geo/layer_0/stable_rank_down_proj": 55.67694091796875, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06581588089466095, "geo/layer_0/attn_entropy_mean": 6.158560276031494, "geo/layer_0/attn_entropy_std": 0.4195984899997711, "geo/layer_7/stable_rank_q_proj": 43.28141403198242, "geo/layer_7/stable_rank_k_proj": 40.32035827636719, "geo/layer_7/stable_rank_o_proj": 89.81356811523438, "geo/layer_7/stable_rank_gate_proj": 79.64224243164062, "geo/layer_7/stable_rank_down_proj": 140.5801544189453, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.44617703557014465, "geo/layer_7/attn_entropy_mean": 4.638286113739014, "geo/layer_7/attn_entropy_std": 0.7982832193374634, "geo/layer_14/stable_rank_q_proj": 50.781829833984375, "geo/layer_14/stable_rank_k_proj": 40.60793685913086, "geo/layer_14/stable_rank_o_proj": 43.52460479736328, "geo/layer_14/stable_rank_gate_proj": 71.85997009277344, "geo/layer_14/stable_rank_down_proj": 128.48318481445312, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3965326249599457, "geo/layer_14/attn_entropy_mean": 5.53705358505249, "geo/layer_14/attn_entropy_std": 0.41148489713668823, "geo/layer_21/stable_rank_q_proj": 40.1833381652832, "geo/layer_21/stable_rank_k_proj": 30.116668701171875, "geo/layer_21/stable_rank_o_proj": 69.50125122070312, "geo/layer_21/stable_rank_gate_proj": 64.90225219726562, "geo/layer_21/stable_rank_down_proj": 50.58119583129883, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1469728648662567, "geo/layer_21/attn_entropy_mean": 5.7018256187438965, "geo/layer_21/attn_entropy_std": 0.3018139600753784, "geo/layer_27/stable_rank_q_proj": 43.42521667480469, "geo/layer_27/stable_rank_k_proj": 31.87568473815918, "geo/layer_27/stable_rank_o_proj": 115.1760482788086, "geo/layer_27/stable_rank_gate_proj": 79.50855255126953, "geo/layer_27/stable_rank_down_proj": 127.06058502197266, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09090618044137955, "geo/layer_27/attn_entropy_mean": 4.182684898376465, "geo/layer_27/attn_entropy_std": 0.7477084994316101, "attnres/final_alpha/block_0": 0.2369689643383026, "attnres/block_norm/0": 1.7694982290267944, "attnres/final_alpha/block_1": 0.004151180852204561, "attnres/block_norm/1": 47517.984375, "attnres/final_alpha/block_2": 0.01015041209757328, "attnres/block_norm/2": 28814.25, "attnres/final_alpha/block_3": 0.011962801218032837, "attnres/block_norm/3": 60086.8046875, "attnres/final_alpha/block_4": 0.014235012233257294, "attnres/block_norm/4": 15437.4140625, "attnres/final_alpha/block_5": 0.6162036061286926, "attnres/block_norm/5": 6670.79345703125, "attnres/final_alpha/block_6": 0.10632799565792084, "attnres/block_norm/6": 39830.390625, "geo/tier1_time_s": 1.3570151329040527, "geo/step": 68100.0, "geo/rankme_slope": 2.274349192802121e-05} {"step": 68110, "timestamp": 1778268160.4999762, "train/loss": 2.0381200075149537, "train/z_loss": 0.0013982906588353218, "train/perplexity": 7.676164493602775, "train/grad_norm": 0.140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1789969.4927403016, "perf/iters_per_sec": 0.8535239661885746, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1716132640838623, "data/tokens_consumed": 142839119872, "data/tokens_consumed_B": 142.839119872, "train/loss_slope": -2.4333444866302862e-05} {"step": 68120, "timestamp": 1778268170.8601522, "train/loss": 2.1716897964477537, "train/z_loss": 0.0013655047514475882, "train/perplexity": 8.773096269302314, "train/grad_norm": 0.146484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025263.0758539573, "perf/iters_per_sec": 0.965720689703921, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354960918426515, "data/tokens_consumed": 142860091392, "data/tokens_consumed_B": 142.860091392, "train/loss_slope": -2.1472610248447766e-05} {"step": 68130, "timestamp": 1778268181.224061, "train/loss": 2.15490608215332, "train/z_loss": 0.00137780545046553, "train/perplexity": 8.627079904151982, "train/grad_norm": 0.1650390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024912.472773664, "perf/iters_per_sec": 0.9655535091274566, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0356753826141358, "data/tokens_consumed": 142881062912, "data/tokens_consumed_B": 142.881062912, "train/loss_slope": -2.0947739507856567e-05} {"step": 68140, "timestamp": 1778268191.5839717, "train/loss": 2.0819392085075377, "train/z_loss": 0.0013922444079071284, "train/perplexity": 8.02000630876798, "train/grad_norm": 0.2109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025634.1386505652, "perf/iters_per_sec": 0.9658976262333704, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0353064060211181, "data/tokens_consumed": 142902034432, "data/tokens_consumed_B": 142.902034432, "train/loss_slope": -2.6313761458753134e-05} {"step": 68150, "timestamp": 1778268202.316863, "grad/layer_0/attn": 0.002819937653839588, "grad/layer_0/mlp": 0.0031900820322334766, "grad/layer_0/attn_mlp_ratio": 0.8839702355453969, "grad/layer_4/attn": 0.0019937355536967516, "grad/layer_4/mlp": 0.0026453277096152306, "grad/layer_4/attn_mlp_ratio": 0.7536818485973371, "grad/layer_8/attn": 0.005994470790028572, "grad/layer_8/mlp": 0.0038124793209135532, "grad/layer_8/attn_mlp_ratio": 1.5723286943256907, "grad/layer_12/attn": 0.006664798595011234, "grad/layer_12/mlp": 0.007503672502934933, "grad/layer_12/attn_mlp_ratio": 0.888204876156832, "grad/layer_16/attn": 0.0033691893331706524, "grad/layer_16/mlp": 0.004394715651869774, "grad/layer_16/attn_mlp_ratio": 0.7666455633079793, "grad/layer_20/attn": 0.002944642910733819, "grad/layer_20/mlp": 0.005868103820830584, "grad/layer_20/attn_mlp_ratio": 0.5018048334626337, "grad/layer_24/attn": 0.009986898861825466, "grad/layer_24/mlp": 0.011771867983043194, "grad/layer_24/attn_mlp_ratio": 0.8483699266228706, "grad/layer_27/attn": 0.011820321902632713, "grad/layer_27/mlp": 0.009899429976940155, "grad/layer_27/attn_mlp_ratio": 1.1940406478719523} {"step": 68150, "timestamp": 1778268202.3313732, "train/loss": 2.1671066284179688, "train/z_loss": 0.0013774000573903323, "train/perplexity": 8.732979695679598, "train/grad_norm": 0.12060546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1952159.855213235, "perf/iters_per_sec": 0.9308623577181029, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0742726802825928, "data/tokens_consumed": 142923005952, "data/tokens_consumed_B": 142.923005952, "train/loss_slope": -2.6713893043719503e-05} {"step": 68160, "timestamp": 1778268213.170537, "train/loss": 2.163939666748047, "train/z_loss": 0.0013618013937957585, "train/perplexity": 8.705366431872473, "train/grad_norm": 0.0888671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1936125.317623326, "perf/iters_per_sec": 0.9232164943806296, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0831695556640626, "data/tokens_consumed": 142943977472, "data/tokens_consumed_B": 142.943977472, "train/loss_slope": -2.4552361685486467e-05} {"step": 68170, "timestamp": 1778268223.5392327, "train/loss": 2.149327802658081, "train/z_loss": 0.0013632908929139375, "train/perplexity": 8.579089617279035, "train/grad_norm": 0.1875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024225.2793947898, "perf/iters_per_sec": 0.9652258297895383, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0360269784927367, "data/tokens_consumed": 142964948992, "data/tokens_consumed_B": 142.964948992, "train/loss_slope": -2.465987813533551e-05} {"step": 68175, "timestamp": 1778268229.2943938, "eos/sharpness": 89.65020179748534, "eos/L0_probe": 1.974297285079956, "eos/L_plus": 2.527445077896118, "eos/L_minus": 2.3176515102386475, "eos/grad_norm": 0.2902911603450775, "eos/embed_grad_frac": 0.026118380948901176, "eos/time_s": 0.5852270126342773} {"step": 68175, "timestamp": 1778268230.6754487, "geo/rankme_last": 439.0270080566406, "geo/layer_0/stable_rank_q_proj": 19.27945899963379, "geo/layer_0/stable_rank_k_proj": 16.100364685058594, "geo/layer_0/stable_rank_o_proj": 46.96831512451172, "geo/layer_0/stable_rank_gate_proj": 129.53981018066406, "geo/layer_0/stable_rank_down_proj": 55.69843673706055, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06535904854536057, "geo/layer_0/attn_entropy_mean": 6.1564531326293945, "geo/layer_0/attn_entropy_std": 0.4193127453327179, "geo/layer_7/stable_rank_q_proj": 43.27495193481445, "geo/layer_7/stable_rank_k_proj": 40.41028594970703, "geo/layer_7/stable_rank_o_proj": 89.66726684570312, "geo/layer_7/stable_rank_gate_proj": 79.58648681640625, "geo/layer_7/stable_rank_down_proj": 140.37071228027344, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.45184725522994995, "geo/layer_7/attn_entropy_mean": 4.660330772399902, "geo/layer_7/attn_entropy_std": 0.8027824759483337, "geo/layer_14/stable_rank_q_proj": 50.669189453125, "geo/layer_14/stable_rank_k_proj": 40.67634963989258, "geo/layer_14/stable_rank_o_proj": 43.6052131652832, "geo/layer_14/stable_rank_gate_proj": 71.88166809082031, "geo/layer_14/stable_rank_down_proj": 128.38754272460938, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.384907066822052, "geo/layer_14/attn_entropy_mean": 5.524550437927246, "geo/layer_14/attn_entropy_std": 0.39825239777565, "geo/layer_21/stable_rank_q_proj": 40.258689880371094, "geo/layer_21/stable_rank_k_proj": 30.13826560974121, "geo/layer_21/stable_rank_o_proj": 69.61029052734375, "geo/layer_21/stable_rank_gate_proj": 64.96326446533203, "geo/layer_21/stable_rank_down_proj": 50.53542709350586, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14512695372104645, "geo/layer_21/attn_entropy_mean": 5.687694549560547, "geo/layer_21/attn_entropy_std": 0.30838295817375183, "geo/layer_27/stable_rank_q_proj": 43.42076873779297, "geo/layer_27/stable_rank_k_proj": 31.90081214904785, "geo/layer_27/stable_rank_o_proj": 115.09901428222656, "geo/layer_27/stable_rank_gate_proj": 79.51885223388672, "geo/layer_27/stable_rank_down_proj": 127.19635772705078, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09065172076225281, "geo/layer_27/attn_entropy_mean": 4.186344146728516, "geo/layer_27/attn_entropy_std": 0.7449649572372437, "attnres/final_alpha/block_0": 0.23516762256622314, "attnres/block_norm/0": 1.7694119215011597, "attnres/final_alpha/block_1": 0.004130009561777115, "attnres/block_norm/1": 47544.6796875, "attnres/final_alpha/block_2": 0.010073309764266014, "attnres/block_norm/2": 28873.619140625, "attnres/final_alpha/block_3": 0.01175447553396225, "attnres/block_norm/3": 60420.90625, "attnres/final_alpha/block_4": 0.014231182634830475, "attnres/block_norm/4": 15468.916015625, "attnres/final_alpha/block_5": 0.6198519468307495, "attnres/block_norm/5": 6714.4765625, "attnres/final_alpha/block_6": 0.10479143261909485, "attnres/block_norm/6": 40162.90625, "geo/tier1_time_s": 1.3613786697387695, "geo/step": 68175.0, "geo/rankme_slope": 2.970092333808523e-05} {"step": 68180, "timestamp": 1778268235.8542693, "train/loss": 2.1109994530677794, "train/z_loss": 0.001377221359871328, "train/perplexity": 8.256489135665808, "train/grad_norm": 0.150390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1703639.2809757604, "perf/iters_per_sec": 0.8123585133436968, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.230983591079712, "data/tokens_consumed": 142985920512, "data/tokens_consumed_B": 142.985920512, "train/loss_slope": -2.4708011677556313e-05} {"step": 68190, "timestamp": 1778268246.1967974, "train/loss": 2.1111072421073915, "train/z_loss": 0.0013901603640988469, "train/perplexity": 8.257379142665949, "train/grad_norm": 0.263671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029055.1400071303, "perf/iters_per_sec": 0.967528886798444, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033560872077942, "data/tokens_consumed": 143006892032, "data/tokens_consumed_B": 143.006892032, "train/loss_slope": -2.3744598316280755e-05} {"step": 68200, "timestamp": 1778268256.5327046, "grad/layer_0/attn": 0.002823422895744443, "grad/layer_0/mlp": 0.002945141401141882, "grad/layer_0/attn_mlp_ratio": 0.9586713896937561, "grad/layer_4/attn": 0.002215777290984988, "grad/layer_4/mlp": 0.0023557732347398996, "grad/layer_4/attn_mlp_ratio": 0.940573211484131, "grad/layer_8/attn": 0.002802743809297681, "grad/layer_8/mlp": 0.0033442561980336905, "grad/layer_8/attn_mlp_ratio": 0.838076857609747, "grad/layer_12/attn": 0.0052001881413161755, "grad/layer_12/mlp": 0.0067364233545959, "grad/layer_12/attn_mlp_ratio": 0.7719508989252104, "grad/layer_16/attn": 0.005032764747738838, "grad/layer_16/mlp": 0.00460009602829814, "grad/layer_16/attn_mlp_ratio": 1.0940564299904685, "grad/layer_20/attn": 0.004792002495378256, "grad/layer_20/mlp": 0.0053910366259515285, "grad/layer_20/attn_mlp_ratio": 0.8888832962888893, "grad/layer_24/attn": 0.00748717924579978, "grad/layer_24/mlp": 0.007831249386072159, "grad/layer_24/attn_mlp_ratio": 0.9560644516709236, "grad/layer_27/attn": 0.0055982330814003944, "grad/layer_27/mlp": 0.006653149612247944, "grad/layer_27/attn_mlp_ratio": 0.8414410201974671} {"step": 68200, "timestamp": 1778268256.5470648, "train/loss": 2.093758153915405, "train/z_loss": 0.0013822954846546055, "train/perplexity": 8.115356686018567, "train/grad_norm": 0.11279296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027181.9319175982, "perf/iters_per_sec": 0.9666356715763084, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034515929222107, "data/tokens_consumed": 143027863552, "data/tokens_consumed_B": 143.027863552, "train/loss_slope": -2.5009690846117865e-05} {"step": 68210, "timestamp": 1778268267.3516402, "train/loss": 2.129261302947998, "train/z_loss": 0.001378797017969191, "train/perplexity": 8.408653069452528, "train/grad_norm": 0.138671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1943292.9480283994, "perf/iters_per_sec": 0.926634286894035, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0791743993759155, "data/tokens_consumed": 143048835072, "data/tokens_consumed_B": 143.048835072, "train/loss_slope": -2.612995894649817e-05} {"step": 68220, "timestamp": 1778268278.0802612, "train/loss": 2.1156948447227477, "train/z_loss": 0.0013858962687663735, "train/perplexity": 8.295347742650243, "train/grad_norm": 0.2734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1956266.741090012, "perf/iters_per_sec": 0.9328206735086498, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0720174074172975, "data/tokens_consumed": 143069806592, "data/tokens_consumed_B": 143.069806592, "train/loss_slope": -2.6242880058689203e-05} {"step": 68230, "timestamp": 1778268288.4256725, "train/loss": 2.16322283744812, "train/z_loss": 0.0013807298964820802, "train/perplexity": 8.699128406214403, "train/grad_norm": 0.16796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028293.2436915294, "perf/iters_per_sec": 0.9671655863244674, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033949112892151, "data/tokens_consumed": 143090778112, "data/tokens_consumed_B": 143.090778112, "train/loss_slope": -2.113875254045339e-05} {"step": 68240, "timestamp": 1778268298.7683427, "train/loss": 2.209429979324341, "train/z_loss": 0.001361914142034948, "train/perplexity": 9.110521726897035, "train/grad_norm": 0.1845703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029084.8151961805, "perf/iters_per_sec": 0.9675430370312598, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0335457563400268, "data/tokens_consumed": 143111749632, "data/tokens_consumed_B": 143.111749632, "train/loss_slope": -2.0483921091369925e-05} {"step": 68250, "timestamp": 1778268309.1035454, "grad/layer_0/attn": 0.002687383908778429, "grad/layer_0/mlp": 0.0029831507708877325, "grad/layer_0/attn_mlp_ratio": 0.9008541723465402, "grad/layer_4/attn": 0.002495724707841873, "grad/layer_4/mlp": 0.002490425016731024, "grad/layer_4/attn_mlp_ratio": 1.0021279865334018, "grad/layer_8/attn": 0.0034239215310662985, "grad/layer_8/mlp": 0.003642806550487876, "grad/layer_8/attn_mlp_ratio": 0.9399130559421093, "grad/layer_12/attn": 0.00438598683103919, "grad/layer_12/mlp": 0.006761944852769375, "grad/layer_12/attn_mlp_ratio": 0.6486280000316915, "grad/layer_16/attn": 0.003388420445844531, "grad/layer_16/mlp": 0.004725969862192869, "grad/layer_16/attn_mlp_ratio": 0.7169788367153079, "grad/layer_20/attn": 0.0031030464451760054, "grad/layer_20/mlp": 0.005842274520546198, "grad/layer_20/attn_mlp_ratio": 0.5311366970431629, "grad/layer_24/attn": 0.01037102285772562, "grad/layer_24/mlp": 0.01161607913672924, "grad/layer_24/attn_mlp_ratio": 0.8928161255075777, "grad/layer_27/attn": 0.006877714768052101, "grad/layer_27/mlp": 0.01101090107113123, "grad/layer_27/attn_mlp_ratio": 0.6246277812468554} {"step": 68250, "timestamp": 1778268309.7053406, "eos/sharpness": 39.16683197021484, "eos/L0_probe": 1.9696069955825806, "eos/L_plus": 2.181462049484253, "eos/L_minus": 2.1494202613830566, "eos/grad_norm": 0.14211905002593994, "eos/embed_grad_frac": 0.1345539093017578, "eos/time_s": 0.5989606380462646} {"step": 68250, "timestamp": 1778268309.7231936, "train/loss": 2.147787642478943, "train/z_loss": 0.001387688668910414, "train/perplexity": 8.565886615056277, "train/grad_norm": 0.142578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1915618.6784452884, "perf/iters_per_sec": 0.9134381668306772, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0947648525238036, "data/tokens_consumed": 143132721152, "data/tokens_consumed_B": 143.132721152, "train/loss_slope": -1.618370697228649e-05} {"step": 68250, "timestamp": 1778268311.0892549, "geo/rankme_last": 438.2817077636719, "geo/layer_0/stable_rank_q_proj": 19.2852783203125, "geo/layer_0/stable_rank_k_proj": 16.09370231628418, "geo/layer_0/stable_rank_o_proj": 46.97159194946289, "geo/layer_0/stable_rank_gate_proj": 129.76637268066406, "geo/layer_0/stable_rank_down_proj": 55.658565521240234, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06582137197256088, "geo/layer_0/attn_entropy_mean": 6.156633377075195, "geo/layer_0/attn_entropy_std": 0.422011137008667, "geo/layer_7/stable_rank_q_proj": 43.25157928466797, "geo/layer_7/stable_rank_k_proj": 40.47924041748047, "geo/layer_7/stable_rank_o_proj": 89.61016845703125, "geo/layer_7/stable_rank_gate_proj": 79.72737121582031, "geo/layer_7/stable_rank_down_proj": 140.45718383789062, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.44762206077575684, "geo/layer_7/attn_entropy_mean": 4.609676361083984, "geo/layer_7/attn_entropy_std": 0.8092214465141296, "geo/layer_14/stable_rank_q_proj": 50.674713134765625, "geo/layer_14/stable_rank_k_proj": 40.70854949951172, "geo/layer_14/stable_rank_o_proj": 43.555721282958984, "geo/layer_14/stable_rank_gate_proj": 71.93136596679688, "geo/layer_14/stable_rank_down_proj": 128.4888916015625, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3761690557003021, "geo/layer_14/attn_entropy_mean": 5.539693832397461, "geo/layer_14/attn_entropy_std": 0.4027916491031647, "geo/layer_21/stable_rank_q_proj": 40.25761413574219, "geo/layer_21/stable_rank_k_proj": 30.116886138916016, "geo/layer_21/stable_rank_o_proj": 69.45985412597656, "geo/layer_21/stable_rank_gate_proj": 64.94808197021484, "geo/layer_21/stable_rank_down_proj": 50.46076965332031, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14183281362056732, "geo/layer_21/attn_entropy_mean": 5.674517631530762, "geo/layer_21/attn_entropy_std": 0.3031173348426819, "geo/layer_27/stable_rank_q_proj": 43.46474838256836, "geo/layer_27/stable_rank_k_proj": 31.855268478393555, "geo/layer_27/stable_rank_o_proj": 115.09584045410156, "geo/layer_27/stable_rank_gate_proj": 79.55130004882812, "geo/layer_27/stable_rank_down_proj": 127.26298522949219, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09570562094449997, "geo/layer_27/attn_entropy_mean": 4.182137489318848, "geo/layer_27/attn_entropy_std": 0.745107889175415, "attnres/final_alpha/block_0": 0.23812106251716614, "attnres/block_norm/0": 1.7695459127426147, "attnres/final_alpha/block_1": 0.004127162508666515, "attnres/block_norm/1": 47578.48046875, "attnres/final_alpha/block_2": 0.01026469562202692, "attnres/block_norm/2": 28870.9296875, "attnres/final_alpha/block_3": 0.012149933725595474, "attnres/block_norm/3": 59944.81640625, "attnres/final_alpha/block_4": 0.014593599364161491, "attnres/block_norm/4": 15469.71875, "attnres/final_alpha/block_5": 0.6139848232269287, "attnres/block_norm/5": 6690.2978515625, "attnres/final_alpha/block_6": 0.10675869882106781, "attnres/block_norm/6": 39829.48828125, "geo/tier1_time_s": 1.3620967864990234, "geo/step": 68250.0, "geo/rankme_slope": 2.3656650160064026e-05} {"step": 68260, "timestamp": 1778268321.4310193, "train/loss": 2.1266823768615724, "train/z_loss": 0.001372008363250643, "train/perplexity": 8.386995713064026, "train/grad_norm": 0.171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1791825.4191801476, "perf/iters_per_sec": 0.8544089408779848, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1703997373580932, "data/tokens_consumed": 143153692672, "data/tokens_consumed_B": 143.153692672, "train/loss_slope": -1.4367387096623683e-05} {"step": 68270, "timestamp": 1778268331.7733927, "train/loss": 2.113729202747345, "train/z_loss": 0.001384638319723308, "train/perplexity": 8.27905807399989, "train/grad_norm": 0.421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029204.6949449668, "perf/iters_per_sec": 0.9676002001499971, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033484697341919, "data/tokens_consumed": 143174664192, "data/tokens_consumed_B": 143.174664192, "train/loss_slope": -1.4753453523376673e-05} {"step": 68280, "timestamp": 1778268342.11287, "train/loss": 2.1610838890075685, "train/z_loss": 0.0013744900468736887, "train/perplexity": 8.680541304588282, "train/grad_norm": 0.1298828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029754.6100182515, "perf/iters_per_sec": 0.9678624200907953, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033204698562622, "data/tokens_consumed": 143195635712, "data/tokens_consumed_B": 143.195635712, "train/loss_slope": -1.2655003865559916e-05} {"step": 68290, "timestamp": 1778268352.449427, "train/loss": 2.1666094303131103, "train/z_loss": 0.0013724045478738845, "train/perplexity": 8.72863875396856, "train/grad_norm": 0.08642578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029844.1211602897, "perf/iters_per_sec": 0.9679051023293923, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0331591367721558, "data/tokens_consumed": 143216607232, "data/tokens_consumed_B": 143.216607232, "train/loss_slope": -1.0460544984475896e-05} {"step": 68300, "timestamp": 1778268362.7886844, "grad/layer_0/attn": 0.002728898311033845, "grad/layer_0/mlp": 0.0029804804362356663, "grad/layer_0/attn_mlp_ratio": 0.9155900459193843, "grad/layer_4/attn": 0.0023509569000452757, "grad/layer_4/mlp": 0.002582679269835353, "grad/layer_4/attn_mlp_ratio": 0.9102782666340616, "grad/layer_8/attn": 0.008220815099775791, "grad/layer_8/mlp": 0.0036009913310408592, "grad/layer_8/attn_mlp_ratio": 2.282931036411709, "grad/layer_12/attn": 0.00404667342081666, "grad/layer_12/mlp": 0.0071504320949316025, "grad/layer_12/attn_mlp_ratio": 0.5659341016736078, "grad/layer_16/attn": 0.003774862503632903, "grad/layer_16/mlp": 0.0047193109057843685, "grad/layer_16/attn_mlp_ratio": 0.799875765552668, "grad/layer_20/attn": 0.003955266438424587, "grad/layer_20/mlp": 0.005713940132409334, "grad/layer_20/attn_mlp_ratio": 0.6922134774862378, "grad/layer_24/attn": 0.006837864406406879, "grad/layer_24/mlp": 0.008004331961274147, "grad/layer_24/attn_mlp_ratio": 0.8542704568054131, "grad/layer_27/attn": 0.0052458965219557285, "grad/layer_27/mlp": 0.007589457090944052, "grad/layer_27/attn_mlp_ratio": 0.6912083947472919} {"step": 68300, "timestamp": 1778268362.8029742, "train/loss": 2.1299772024154664, "train/z_loss": 0.0013696658657863735, "train/perplexity": 8.414674974989369, "train/grad_norm": 0.09375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027007.4506272434, "perf/iters_per_sec": 0.9665524724136559, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346049785614013, "data/tokens_consumed": 143237578752, "data/tokens_consumed_B": 143.237578752, "train/loss_slope": -1.0219241912537367e-05} {"step": 68310, "timestamp": 1778268373.150264, "train/loss": 2.1731956958770753, "train/z_loss": 0.0013662310782819986, "train/perplexity": 8.786317622483203, "train/grad_norm": 0.08740234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028246.9887579211, "perf/iters_per_sec": 0.9671435302533727, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033972692489624, "data/tokens_consumed": 143258550272, "data/tokens_consumed_B": 143.258550272, "train/loss_slope": -9.545976865981898e-06} {"step": 68320, "timestamp": 1778268383.488974, "train/loss": 2.115728032588959, "train/z_loss": 0.001392836996819824, "train/perplexity": 8.295623052109743, "train/grad_norm": 0.0908203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029505.8376765975, "perf/iters_per_sec": 0.9677437961943615, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0333313465118408, "data/tokens_consumed": 143279521792, "data/tokens_consumed_B": 143.279521792, "train/loss_slope": -1.307867270777828e-05} {"step": 68325, "timestamp": 1778268389.2506502, "eos/sharpness": 67.86041259765624, "eos/L0_probe": 1.9696601629257202, "eos/L_plus": 2.3652851581573486, "eos/L_minus": 2.2526392936706543, "eos/grad_norm": 0.15552890300750732, "eos/embed_grad_frac": 0.0880119800567627, "eos/time_s": 0.5846796035766602} {"step": 68325, "timestamp": 1778268390.631056, "geo/rankme_last": 438.9263000488281, "geo/layer_0/stable_rank_q_proj": 19.281679153442383, "geo/layer_0/stable_rank_k_proj": 16.093366622924805, "geo/layer_0/stable_rank_o_proj": 46.92026138305664, "geo/layer_0/stable_rank_gate_proj": 129.83958435058594, "geo/layer_0/stable_rank_down_proj": 55.64308547973633, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.062181256711483, "geo/layer_0/attn_entropy_mean": 6.155069828033447, "geo/layer_0/attn_entropy_std": 0.42652061581611633, "geo/layer_7/stable_rank_q_proj": 43.24583435058594, "geo/layer_7/stable_rank_k_proj": 40.49751281738281, "geo/layer_7/stable_rank_o_proj": 89.57117462158203, "geo/layer_7/stable_rank_gate_proj": 79.81378173828125, "geo/layer_7/stable_rank_down_proj": 140.38809204101562, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.44368335604667664, "geo/layer_7/attn_entropy_mean": 4.6453857421875, "geo/layer_7/attn_entropy_std": 0.8006721138954163, "geo/layer_14/stable_rank_q_proj": 50.65678787231445, "geo/layer_14/stable_rank_k_proj": 40.6177864074707, "geo/layer_14/stable_rank_o_proj": 43.54329299926758, "geo/layer_14/stable_rank_gate_proj": 71.94564819335938, "geo/layer_14/stable_rank_down_proj": 128.224365234375, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39708033204078674, "geo/layer_14/attn_entropy_mean": 5.581756591796875, "geo/layer_14/attn_entropy_std": 0.42210522294044495, "geo/layer_21/stable_rank_q_proj": 40.227516174316406, "geo/layer_21/stable_rank_k_proj": 30.1348934173584, "geo/layer_21/stable_rank_o_proj": 69.55036926269531, "geo/layer_21/stable_rank_gate_proj": 64.90830993652344, "geo/layer_21/stable_rank_down_proj": 50.50239181518555, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14577533304691315, "geo/layer_21/attn_entropy_mean": 5.696784973144531, "geo/layer_21/attn_entropy_std": 0.3048156797885895, "geo/layer_27/stable_rank_q_proj": 43.61143112182617, "geo/layer_27/stable_rank_k_proj": 31.90921401977539, "geo/layer_27/stable_rank_o_proj": 115.29546356201172, "geo/layer_27/stable_rank_gate_proj": 79.56132507324219, "geo/layer_27/stable_rank_down_proj": 127.3638687133789, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09427851438522339, "geo/layer_27/attn_entropy_mean": 4.192384243011475, "geo/layer_27/attn_entropy_std": 0.7527086734771729, "attnres/final_alpha/block_0": 0.233981192111969, "attnres/block_norm/0": 1.7696247100830078, "attnres/final_alpha/block_1": 0.004091288428753614, "attnres/block_norm/1": 47492.5703125, "attnres/final_alpha/block_2": 0.009887056425213814, "attnres/block_norm/2": 28879.69921875, "attnres/final_alpha/block_3": 0.011833293363451958, "attnres/block_norm/3": 59705.47265625, "attnres/final_alpha/block_4": 0.01425289735198021, "attnres/block_norm/4": 15515.6240234375, "attnres/final_alpha/block_5": 0.6217714548110962, "attnres/block_norm/5": 6711.0322265625, "attnres/final_alpha/block_6": 0.10418276488780975, "attnres/block_norm/6": 40195.0, "geo/tier1_time_s": 1.3607308864593506, "geo/step": 68325.0, "geo/rankme_slope": 3.373224289715887e-05} {"step": 68330, "timestamp": 1778268395.8224323, "train/loss": 2.099356245994568, "train/z_loss": 0.0013758172630332411, "train/perplexity": 8.160914599723235, "train/grad_norm": 0.197265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1701476.7780583776, "perf/iters_per_sec": 0.811327351597966, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2325481176376343, "data/tokens_consumed": 143300493312, "data/tokens_consumed_B": 143.300493312, "train/loss_slope": -1.7215921197107795e-05} {"step": 68340, "timestamp": 1778268406.1971996, "train/loss": 2.0927629709243774, "train/z_loss": 0.001378517912235111, "train/perplexity": 8.107284438426491, "train/grad_norm": 0.1455078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022283.7918376515, "perf/iters_per_sec": 0.9643000563801057, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0370216131210328, "data/tokens_consumed": 143321464832, "data/tokens_consumed_B": 143.321464832, "train/loss_slope": -2.1422604594615487e-05} {"step": 68350, "timestamp": 1778268416.5407438, "grad/layer_0/attn": 0.0026322579942643642, "grad/layer_0/mlp": 0.0029922693502157927, "grad/layer_0/attn_mlp_ratio": 0.8796861506153915, "grad/layer_4/attn": 0.0034120252821594477, "grad/layer_4/mlp": 0.0026308174710720778, "grad/layer_4/attn_mlp_ratio": 1.2969448431838708, "grad/layer_8/attn": 0.012166503816843033, "grad/layer_8/mlp": 0.0035573348868638277, "grad/layer_8/attn_mlp_ratio": 3.4201175491682076, "grad/layer_12/attn": 0.0066002653911709785, "grad/layer_12/mlp": 0.006733359303325415, "grad/layer_12/attn_mlp_ratio": 0.9802336390823421, "grad/layer_16/attn": 0.0035434814635664225, "grad/layer_16/mlp": 0.004618210252374411, "grad/layer_16/attn_mlp_ratio": 0.7672845524986912, "grad/layer_20/attn": 0.004390588030219078, "grad/layer_20/mlp": 0.005966490600258112, "grad/layer_20/attn_mlp_ratio": 0.7358744445925542, "grad/layer_24/attn": 0.010750452056527138, "grad/layer_24/mlp": 0.009318528696894646, "grad/layer_24/attn_mlp_ratio": 1.153664091279052, "grad/layer_27/attn": 0.00458111334592104, "grad/layer_27/mlp": 0.008031985722482204, "grad/layer_27/attn_mlp_ratio": 0.570358744047844} {"step": 68350, "timestamp": 1778268416.5549197, "train/loss": 2.1132635354995726, "train/z_loss": 0.0013730401871725917, "train/perplexity": 8.27520368531336, "train/grad_norm": 0.1357421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025597.334117342, "perf/iters_per_sec": 0.9658800764643393, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0353252172470093, "data/tokens_consumed": 143342436352, "data/tokens_consumed_B": 143.342436352, "train/loss_slope": -2.3142757245523596e-05} {"step": 68360, "timestamp": 1778268426.8945966, "train/loss": 2.1774716854095457, "train/z_loss": 0.001366086769849062, "train/perplexity": 8.8239682641735, "train/grad_norm": 0.12255859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029702.8554769703, "perf/iters_per_sec": 0.9678377416024067, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0332310438156127, "data/tokens_consumed": 143363407872, "data/tokens_consumed_B": 143.363407872, "train/loss_slope": -1.8721427017598784e-05} {"step": 68370, "timestamp": 1778268437.2335942, "train/loss": 2.195647430419922, "train/z_loss": 0.0013738381210714578, "train/perplexity": 8.985816865136949, "train/grad_norm": 0.166015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029752.174451247, "perf/iters_per_sec": 0.9678612587219463, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0332059383392334, "data/tokens_consumed": 143384379392, "data/tokens_consumed_B": 143.384379392, "train/loss_slope": -1.6293942252329958e-05} {"step": 68380, "timestamp": 1778268447.5705297, "train/loss": 2.1629661083221436, "train/z_loss": 0.001374085561838001, "train/perplexity": 8.696895373236488, "train/grad_norm": 0.181640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029800.7463818712, "perf/iters_per_sec": 0.9678844196233135, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0331812143325805, "data/tokens_consumed": 143405350912, "data/tokens_consumed_B": 143.405350912, "train/loss_slope": -1.3031175527850204e-05} {"step": 68390, "timestamp": 1778268457.9774432, "train/loss": 2.1131420612335203, "train/z_loss": 0.001386901061050594, "train/perplexity": 8.274198522071222, "train/grad_norm": 0.255859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2017201.1181468805, "perf/iters_per_sec": 0.9618764487013247, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0396345615386964, "data/tokens_consumed": 143426322432, "data/tokens_consumed_B": 143.426322432, "train/loss_slope": -1.1316318273043618e-05} {"step": 68400, "timestamp": 1778268468.362493, "grad/layer_0/attn": 0.002757380483672023, "grad/layer_0/mlp": 0.0030689567793160677, "grad/layer_0/attn_mlp_ratio": 0.8984748212840705, "grad/layer_4/attn": 0.0026878067292273045, "grad/layer_4/mlp": 0.002651068614795804, "grad/layer_4/attn_mlp_ratio": 1.0138578129742406, "grad/layer_8/attn": 0.0050125750713050365, "grad/layer_8/mlp": 0.003565303748473525, "grad/layer_8/attn_mlp_ratio": 1.405932084428416, "grad/layer_12/attn": 0.006130462046712637, "grad/layer_12/mlp": 0.007954725064337254, "grad/layer_12/attn_mlp_ratio": 0.7706692462735003, "grad/layer_16/attn": 0.004390794318169355, "grad/layer_16/mlp": 0.005001552402973175, "grad/layer_16/attn_mlp_ratio": 0.8778862794220886, "grad/layer_20/attn": 0.007826045155525208, "grad/layer_20/mlp": 0.006496441084891558, "grad/layer_20/attn_mlp_ratio": 1.2046665139870492, "grad/layer_24/attn": 0.011645250022411346, "grad/layer_24/mlp": 0.010663573630154133, "grad/layer_24/attn_mlp_ratio": 1.0920588460396967, "grad/layer_27/attn": 0.004798299167305231, "grad/layer_27/mlp": 0.0104982266202569, "grad/layer_27/attn_mlp_ratio": 0.45705806277232053} {"step": 68400, "timestamp": 1778268468.982296, "eos/sharpness": 43.538284301757805, "eos/L0_probe": 1.9653151035308838, "eos/L_plus": 2.185309648513794, "eos/L_minus": 2.1807034015655518, "eos/grad_norm": 0.1530739814043045, "eos/embed_grad_frac": 0.10370021313428879, "eos/time_s": 0.6167709827423096} {"step": 68400, "timestamp": 1778268469.003756, "train/loss": 2.0990832328796385, "train/z_loss": 0.0013922686688601972, "train/perplexity": 8.158686867121636, "train/grad_norm": 0.15234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1903182.7305277104, "perf/iters_per_sec": 0.9075082447660973, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.101918363571167, "data/tokens_consumed": 143447293952, "data/tokens_consumed_B": 143.447293952, "train/loss_slope": -1.1687890302301613e-05} {"step": 68400, "timestamp": 1778268470.372396, "geo/rankme_last": 438.61602783203125, "geo/layer_0/stable_rank_q_proj": 19.286876678466797, "geo/layer_0/stable_rank_k_proj": 16.039113998413086, "geo/layer_0/stable_rank_o_proj": 46.862327575683594, "geo/layer_0/stable_rank_gate_proj": 129.79335021972656, "geo/layer_0/stable_rank_down_proj": 55.63510513305664, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.061898618936538696, "geo/layer_0/attn_entropy_mean": 6.149807929992676, "geo/layer_0/attn_entropy_std": 0.41908782720565796, "geo/layer_7/stable_rank_q_proj": 43.193382263183594, "geo/layer_7/stable_rank_k_proj": 40.367645263671875, "geo/layer_7/stable_rank_o_proj": 89.57161712646484, "geo/layer_7/stable_rank_gate_proj": 79.88508605957031, "geo/layer_7/stable_rank_down_proj": 140.5585174560547, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4466676414012909, "geo/layer_7/attn_entropy_mean": 4.631453990936279, "geo/layer_7/attn_entropy_std": 0.8062539100646973, "geo/layer_14/stable_rank_q_proj": 50.55514144897461, "geo/layer_14/stable_rank_k_proj": 40.659061431884766, "geo/layer_14/stable_rank_o_proj": 43.4949836730957, "geo/layer_14/stable_rank_gate_proj": 71.94046020507812, "geo/layer_14/stable_rank_down_proj": 128.0890350341797, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38641834259033203, "geo/layer_14/attn_entropy_mean": 5.530606269836426, "geo/layer_14/attn_entropy_std": 0.4079863131046295, "geo/layer_21/stable_rank_q_proj": 40.161834716796875, "geo/layer_21/stable_rank_k_proj": 30.10784339904785, "geo/layer_21/stable_rank_o_proj": 69.5154037475586, "geo/layer_21/stable_rank_gate_proj": 64.88723754882812, "geo/layer_21/stable_rank_down_proj": 50.497230529785156, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14443320035934448, "geo/layer_21/attn_entropy_mean": 5.68764591217041, "geo/layer_21/attn_entropy_std": 0.31172722578048706, "geo/layer_27/stable_rank_q_proj": 43.47166061401367, "geo/layer_27/stable_rank_k_proj": 31.863901138305664, "geo/layer_27/stable_rank_o_proj": 115.20965576171875, "geo/layer_27/stable_rank_gate_proj": 79.50596618652344, "geo/layer_27/stable_rank_down_proj": 127.47062683105469, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10249511152505875, "geo/layer_27/attn_entropy_mean": 4.164416313171387, "geo/layer_27/attn_entropy_std": 0.7464632391929626, "attnres/final_alpha/block_0": 0.23654134571552277, "attnres/block_norm/0": 1.7696502208709717, "attnres/final_alpha/block_1": 0.004174603149294853, "attnres/block_norm/1": 47597.2578125, "attnres/final_alpha/block_2": 0.010092191398143768, "attnres/block_norm/2": 28923.87109375, "attnres/final_alpha/block_3": 0.012031003832817078, "attnres/block_norm/3": 59898.4140625, "attnres/final_alpha/block_4": 0.014401067048311234, "attnres/block_norm/4": 15451.4619140625, "attnres/final_alpha/block_5": 0.6165838837623596, "attnres/block_norm/5": 6679.2802734375, "attnres/final_alpha/block_6": 0.10617586970329285, "attnres/block_norm/6": 40004.421875, "geo/tier1_time_s": 1.3648595809936523, "geo/step": 68400.0, "geo/rankme_slope": 4.0702765481192475e-05} {"step": 68410, "timestamp": 1778268480.7699366, "train/loss": 2.1228822469711304, "train/z_loss": 0.0013805803726427257, "train/perplexity": 8.35518452157569, "train/grad_norm": 0.10888671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1782915.47667533, "perf/iters_per_sec": 0.8501603492142343, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1762486934661864, "data/tokens_consumed": 143468265472, "data/tokens_consumed_B": 143.468265472, "train/loss_slope": -1.0699035389588673e-05} {"step": 68420, "timestamp": 1778268491.111399, "train/loss": 2.1218647241592405, "train/z_loss": 0.0013848504167981446, "train/perplexity": 8.3466872545421, "train/grad_norm": 0.1181640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029488.043799562, "perf/iters_per_sec": 0.9677353114126024, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0333404064178466, "data/tokens_consumed": 143489236992, "data/tokens_consumed_B": 143.489236992, "train/loss_slope": -8.96557044095911e-06} {"step": 68430, "timestamp": 1778268501.452257, "train/loss": 2.084432864189148, "train/z_loss": 0.0013876477489247919, "train/perplexity": 8.040030399283584, "train/grad_norm": 0.1337890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029082.6152706738, "perf/iters_per_sec": 0.9675419880250329, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0335468769073486, "data/tokens_consumed": 143510208512, "data/tokens_consumed_B": 143.510208512, "train/loss_slope": -1.056201675436791e-05} {"step": 68440, "timestamp": 1778268511.7932029, "train/loss": 2.12061083316803, "train/z_loss": 0.0013784010545350612, "train/perplexity": 8.336227977354273, "train/grad_norm": 0.08984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029539.2722789648, "perf/iters_per_sec": 0.9677597390551399, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033314323425293, "data/tokens_consumed": 143531180032, "data/tokens_consumed_B": 143.531180032, "train/loss_slope": -1.1580284808514029e-05} {"step": 68450, "timestamp": 1778268522.1255784, "grad/layer_0/attn": 0.0026387579273432493, "grad/layer_0/mlp": 0.002818419598042965, "grad/layer_0/attn_mlp_ratio": 0.9362544298053026, "grad/layer_4/attn": 0.0019484705990180373, "grad/layer_4/mlp": 0.0024628667160868645, "grad/layer_4/attn_mlp_ratio": 0.7911392472752028, "grad/layer_8/attn": 0.005185738205909729, "grad/layer_8/mlp": 0.0035493155010044575, "grad/layer_8/attn_mlp_ratio": 1.4610529997507584, "grad/layer_12/attn": 0.005088080186396837, "grad/layer_12/mlp": 0.006643927656114101, "grad/layer_12/attn_mlp_ratio": 0.765824128914483, "grad/layer_16/attn": 0.003386675613000989, "grad/layer_16/mlp": 0.004624256398528814, "grad/layer_16/attn_mlp_ratio": 0.732371920562461, "grad/layer_20/attn": 0.0030182397458702326, "grad/layer_20/mlp": 0.005349398124963045, "grad/layer_20/attn_mlp_ratio": 0.5642204261005608, "grad/layer_24/attn": 0.008684367872774601, "grad/layer_24/mlp": 0.007047621067613363, "grad/layer_24/attn_mlp_ratio": 1.2322410166827273, "grad/layer_27/attn": 0.004316576756536961, "grad/layer_27/mlp": 0.006211555562913418, "grad/layer_27/attn_mlp_ratio": 0.6949268413240539} {"step": 68450, "timestamp": 1778268522.139727, "train/loss": 2.230595564842224, "train/z_loss": 0.0013498388114385306, "train/perplexity": 9.30540640240493, "train/grad_norm": 0.09228515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028035.2906244593, "perf/iters_per_sec": 0.9670425847170159, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340806245803833, "data/tokens_consumed": 143552151552, "data/tokens_consumed_B": 143.552151552, "train/loss_slope": -4.988877686730361e-06} {"step": 68460, "timestamp": 1778268532.4754725, "train/loss": 2.1590996026992797, "train/z_loss": 0.0013639748678542674, "train/perplexity": 8.663333703378878, "train/grad_norm": 0.14453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2030100.3793261065, "perf/iters_per_sec": 0.9680272957449467, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0330287218093872, "data/tokens_consumed": 143573123072, "data/tokens_consumed_B": 143.573123072, "train/loss_slope": -2.307851708690046e-06} {"step": 68470, "timestamp": 1778268542.8158832, "train/loss": 2.1248565316200256, "train/z_loss": 0.0013851150637492538, "train/perplexity": 8.371696328255242, "train/grad_norm": 0.1982421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029579.638770948, "perf/iters_per_sec": 0.9677789872984638, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0332937717437745, "data/tokens_consumed": 143594094592, "data/tokens_consumed_B": 143.594094592, "train/loss_slope": -1.0587947346447199e-06} {"step": 68475, "timestamp": 1778268548.5702665, "eos/sharpness": 16.854476928710934, "eos/L0_probe": 1.972991943359375, "eos/L_plus": 2.064157724380493, "eos/L_minus": 2.050370931625366, "eos/grad_norm": 0.11609268933534622, "eos/embed_grad_frac": 0.18947233259677887, "eos/time_s": 0.5930125713348389} {"step": 68475, "timestamp": 1778268549.9508708, "geo/rankme_last": 438.3584289550781, "geo/layer_0/stable_rank_q_proj": 19.29665184020996, "geo/layer_0/stable_rank_k_proj": 16.05960464477539, "geo/layer_0/stable_rank_o_proj": 46.849098205566406, "geo/layer_0/stable_rank_gate_proj": 129.7727508544922, "geo/layer_0/stable_rank_down_proj": 55.66101837158203, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06247609481215477, "geo/layer_0/attn_entropy_mean": 6.154839992523193, "geo/layer_0/attn_entropy_std": 0.4167209565639496, "geo/layer_7/stable_rank_q_proj": 43.3011474609375, "geo/layer_7/stable_rank_k_proj": 40.33860397338867, "geo/layer_7/stable_rank_o_proj": 89.49178314208984, "geo/layer_7/stable_rank_gate_proj": 79.8935546875, "geo/layer_7/stable_rank_down_proj": 140.4752960205078, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.44848203659057617, "geo/layer_7/attn_entropy_mean": 4.641156196594238, "geo/layer_7/attn_entropy_std": 0.8079342246055603, "geo/layer_14/stable_rank_q_proj": 50.46839141845703, "geo/layer_14/stable_rank_k_proj": 40.6316032409668, "geo/layer_14/stable_rank_o_proj": 43.44606018066406, "geo/layer_14/stable_rank_gate_proj": 71.92794036865234, "geo/layer_14/stable_rank_down_proj": 128.25877380371094, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38644519448280334, "geo/layer_14/attn_entropy_mean": 5.56561279296875, "geo/layer_14/attn_entropy_std": 0.39555832743644714, "geo/layer_21/stable_rank_q_proj": 40.1908073425293, "geo/layer_21/stable_rank_k_proj": 30.053720474243164, "geo/layer_21/stable_rank_o_proj": 69.5042724609375, "geo/layer_21/stable_rank_gate_proj": 64.85338592529297, "geo/layer_21/stable_rank_down_proj": 50.52838897705078, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1456238180398941, "geo/layer_21/attn_entropy_mean": 5.702235221862793, "geo/layer_21/attn_entropy_std": 0.30646592378616333, "geo/layer_27/stable_rank_q_proj": 43.4178352355957, "geo/layer_27/stable_rank_k_proj": 31.92593002319336, "geo/layer_27/stable_rank_o_proj": 115.30126953125, "geo/layer_27/stable_rank_gate_proj": 79.44331359863281, "geo/layer_27/stable_rank_down_proj": 127.54912567138672, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09441911429166794, "geo/layer_27/attn_entropy_mean": 4.201199531555176, "geo/layer_27/attn_entropy_std": 0.7554037570953369, "attnres/final_alpha/block_0": 0.237541064620018, "attnres/block_norm/0": 1.7697374820709229, "attnres/final_alpha/block_1": 0.004198608919978142, "attnres/block_norm/1": 47537.0703125, "attnres/final_alpha/block_2": 0.010301383212208748, "attnres/block_norm/2": 28928.99609375, "attnres/final_alpha/block_3": 0.012293899431824684, "attnres/block_norm/3": 59455.03125, "attnres/final_alpha/block_4": 0.014343628659844398, "attnres/block_norm/4": 15510.09375, "attnres/final_alpha/block_5": 0.6144559979438782, "attnres/block_norm/5": 6684.64599609375, "attnres/final_alpha/block_6": 0.10686540603637695, "attnres/block_norm/6": 39841.8125, "geo/tier1_time_s": 1.3625802993774414, "geo/step": 68475.0, "geo/rankme_slope": 7.765586703431368e-06} {"step": 68480, "timestamp": 1778268555.1227715, "train/loss": 2.1013112306594848, "train/z_loss": 0.0013851875206455587, "train/perplexity": 8.176884668150405, "train/grad_norm": 0.1435546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1705025.7918687179, "perf/iters_per_sec": 0.8130196532577123, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.229982566833496, "data/tokens_consumed": 143615066112, "data/tokens_consumed_B": 143.615066112, "train/loss_slope": -5.889686878615287e-06} {"step": 68490, "timestamp": 1778268565.4614835, "train/loss": 2.190651869773865, "train/z_loss": 0.0013651650864630938, "train/perplexity": 8.941039608902669, "train/grad_norm": 0.1630859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029387.9824652213, "perf/iters_per_sec": 0.9676875984502894, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0333913564682007, "data/tokens_consumed": 143636037632, "data/tokens_consumed_B": 143.636037632, "train/loss_slope": -1.2060618779697634e-06} {"step": 68500, "timestamp": 1778268575.8269546, "grad/layer_0/attn": 0.0029293557163327932, "grad/layer_0/mlp": 0.002951010363176465, "grad/layer_0/attn_mlp_ratio": 0.9926619213608743, "grad/layer_4/attn": 0.004080174025148153, "grad/layer_4/mlp": 0.002593014156445861, "grad/layer_4/attn_mlp_ratio": 1.5735254887262677, "grad/layer_8/attn": 0.0030288537964224815, "grad/layer_8/mlp": 0.003655563108623028, "grad/layer_8/attn_mlp_ratio": 0.8285600941813247, "grad/layer_12/attn": 0.004105861764401197, "grad/layer_12/mlp": 0.007046049926429987, "grad/layer_12/attn_mlp_ratio": 0.5827182249629171, "grad/layer_16/attn": 0.0032986614387482405, "grad/layer_16/mlp": 0.004889004863798618, "grad/layer_16/attn_mlp_ratio": 0.6747101840095646, "grad/layer_20/attn": 0.0071234628558158875, "grad/layer_20/mlp": 0.006193116772919893, "grad/layer_20/attn_mlp_ratio": 1.1502225780630813, "grad/layer_24/attn": 0.01631191559135914, "grad/layer_24/mlp": 0.011741595342755318, "grad/layer_24/attn_mlp_ratio": 1.389241834373007, "grad/layer_27/attn": 0.012178802862763405, "grad/layer_27/mlp": 0.009983591735363007, "grad/layer_27/attn_mlp_ratio": 1.219881888562863} {"step": 68500, "timestamp": 1778268575.8440619, "train/loss": 2.0960702419281008, "train/z_loss": 0.0013807298964820802, "train/perplexity": 8.134141812996127, "train/grad_norm": 0.2216796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021863.6223294316, "perf/iters_per_sec": 0.9640997039458425, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0372371196746826, "data/tokens_consumed": 143657009152, "data/tokens_consumed_B": 143.657009152, "train/loss_slope": -3.165843527559571e-06} {"step": 68500, "timestamp": 1778268582.8066158, "geo/ww_alpha_mean": 7.531081241254018, "geo/ww_alpha_std": 4.558705024163603, "geo/ww_alpha_min": 1.3597408144861158, "geo/ww_alpha_max": 30.006142399548068, "geo/ww_alpha_healthy_frac": 0.18274111675126903, "geo/ww_alpha_by_type/q_proj": 3.889098174922263, "geo/ww_alpha_by_type/k_proj": 4.492541248952799, "geo/ww_alpha_by_type/v_proj": 7.588214816756163, "geo/ww_alpha_by_type/o_proj": 8.179626556816684, "geo/ww_alpha_by_type/gate_proj": 7.954138826645243, "geo/ww_alpha_by_type/up_proj": 12.362761432194342, "geo/ww_alpha_by_type/down_proj": 8.334590615862309, "geo/twonn_id/layer_0": 0.7245457768440247, "geo/twonn_id/layer_7": 3.5324559211730957, "geo/twonn_id/layer_14": 5.70554780960083, "geo/twonn_id/layer_21": 6.653957366943359, "geo/twonn_id/layer_27": 6.284852504730225, "geo/tier2_time_s": 6.953963756561279} {"step": 68500, "timestamp": 1778268583.5128593, "eoc/jacobian_sigma/layer_0/attn": 1144.5614013671875, "eoc/jacobian_sigma/layer_0/mlp": 10619.4775390625, "eoc/jacobian_sigma/layer_0": 10619.4775390625, "eoc/jacobian_sigma/layer_7/attn": 1.1467442512512207, "eoc/jacobian_sigma/layer_7/mlp": 1.789108395576477, "eoc/jacobian_sigma/layer_7": 1.789108395576477, "eoc/jacobian_sigma/layer_14/attn": 1.4652835130691528, "eoc/jacobian_sigma/layer_14/mlp": 7.03265905380249, "eoc/jacobian_sigma/layer_14": 7.03265905380249, "eoc/jacobian_sigma/layer_21/attn": 1.090369701385498, "eoc/jacobian_sigma/layer_21/mlp": 4.669402122497559, "eoc/jacobian_sigma/layer_21": 4.669402122497559, "eoc/jacobian_sigma/layer_27/attn": 2.7811708450317383, "eoc/jacobian_sigma/layer_27/mlp": 31.229385375976562, "eoc/jacobian_sigma/layer_27": 31.229385375976562, "eoc/layer0_sigma": 10619.4775390625, "eoc/sigma_max": 31.229385375976562, "eoc/sigma_min": 1.789108395576477, "eoc/sigma_mean": 11.180138736963272, "eoc/time_s": 0.7001330852508545} {"step": 68510, "timestamp": 1778268593.9057887, "train/loss": 2.133685803413391, "train/z_loss": 0.0013721789582632482, "train/perplexity": 8.445939585147324, "train/grad_norm": 0.083984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1161589.328214013, "perf/iters_per_sec": 0.553888954264647, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.8054160356521607, "data/tokens_consumed": 143677980672, "data/tokens_consumed_B": 143.677980672, "train/loss_slope": -4.667797526403358e-06} {"step": 68520, "timestamp": 1778268604.276903, "train/loss": 2.1562867283821108, "train/z_loss": 0.0013742398703470826, "train/perplexity": 8.638999075673386, "train/grad_norm": 0.1865234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023655.542823409, "perf/iters_per_sec": 0.9649541582219167, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0363186597824097, "data/tokens_consumed": 143698952192, "data/tokens_consumed_B": 143.698952192, "train/loss_slope": -3.6713047854506328e-06} {"step": 68530, "timestamp": 1778268614.6183648, "train/loss": 2.1317561864852905, "train/z_loss": 0.0013911534100770951, "train/perplexity": 8.42965787093776, "train/grad_norm": 0.2294921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029121.0912514434, "perf/iters_per_sec": 0.9675603348023621, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0335272789001464, "data/tokens_consumed": 143719923712, "data/tokens_consumed_B": 143.719923712, "train/loss_slope": -6.375406620347251e-06} {"step": 68540, "timestamp": 1778268624.9626262, "train/loss": 2.159246063232422, "train/z_loss": 0.0013668768689967692, "train/perplexity": 8.664602632773635, "train/grad_norm": 0.1845703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028224.6805096841, "perf/iters_per_sec": 0.9671328928516789, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0339840650558472, "data/tokens_consumed": 143740895232, "data/tokens_consumed_B": 143.740895232, "train/loss_slope": -5.173076905182735e-06} {"step": 68550, "timestamp": 1778268635.3223634, "grad/layer_0/attn": 0.0029989867471158504, "grad/layer_0/mlp": 0.003094941610470414, "grad/layer_0/attn_mlp_ratio": 0.9689961969138413, "grad/layer_4/attn": 0.0020100297406315804, "grad/layer_4/mlp": 0.002524111419916153, "grad/layer_4/attn_mlp_ratio": 0.7963315902533301, "grad/layer_8/attn": 0.0036691047716885805, "grad/layer_8/mlp": 0.0036298767663538456, "grad/layer_8/attn_mlp_ratio": 1.0108069520755227, "grad/layer_12/attn": 0.0034298805985599756, "grad/layer_12/mlp": 0.006755684036761522, "grad/layer_12/attn_mlp_ratio": 0.5077029252886534, "grad/layer_16/attn": 0.0030849752947688103, "grad/layer_16/mlp": 0.004495916422456503, "grad/layer_16/attn_mlp_ratio": 0.686172725707822, "grad/layer_20/attn": 0.003213509451597929, "grad/layer_20/mlp": 0.006026919465512037, "grad/layer_20/attn_mlp_ratio": 0.5331926893444304, "grad/layer_24/attn": 0.009468396194279194, "grad/layer_24/mlp": 0.01091876719146967, "grad/layer_24/attn_mlp_ratio": 0.8671671390667348, "grad/layer_27/attn": 0.007400931790471077, "grad/layer_27/mlp": 0.010760489851236343, "grad/layer_27/attn_mlp_ratio": 0.6877876215683595} {"step": 68550, "timestamp": 1778268635.957526, "eos/sharpness": 25.26626586914062, "eos/L0_probe": 1.9743369817733765, "eos/L_plus": 2.1032562255859375, "eos/L_minus": 2.0980803966522217, "eos/grad_norm": 0.13830265402793884, "eos/embed_grad_frac": 0.13336817920207977, "eos/time_s": 0.6320197582244873} {"step": 68550, "timestamp": 1778268635.9800456, "train/loss": 2.1734798669815065, "train/z_loss": 0.0013684729114174844, "train/perplexity": 8.78881479486118, "train/grad_norm": 0.138671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1904747.029858846, "perf/iters_per_sec": 0.908254160813735, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1010133981704713, "data/tokens_consumed": 143761866752, "data/tokens_consumed_B": 143.761866752, "train/loss_slope": -3.2390582965652593e-06} {"step": 68550, "timestamp": 1778268637.3493445, "geo/rankme_last": 438.45855712890625, "geo/layer_0/stable_rank_q_proj": 19.270721435546875, "geo/layer_0/stable_rank_k_proj": 16.033191680908203, "geo/layer_0/stable_rank_o_proj": 46.86182403564453, "geo/layer_0/stable_rank_gate_proj": 129.855224609375, "geo/layer_0/stable_rank_down_proj": 55.654659271240234, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0647423043847084, "geo/layer_0/attn_entropy_mean": 6.158543586730957, "geo/layer_0/attn_entropy_std": 0.41992342472076416, "geo/layer_7/stable_rank_q_proj": 43.34657287597656, "geo/layer_7/stable_rank_k_proj": 40.33316421508789, "geo/layer_7/stable_rank_o_proj": 89.38851928710938, "geo/layer_7/stable_rank_gate_proj": 79.63500213623047, "geo/layer_7/stable_rank_down_proj": 140.0703125, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.43184173107147217, "geo/layer_7/attn_entropy_mean": 4.621559143066406, "geo/layer_7/attn_entropy_std": 0.7885946035385132, "geo/layer_14/stable_rank_q_proj": 50.45687484741211, "geo/layer_14/stable_rank_k_proj": 40.59267807006836, "geo/layer_14/stable_rank_o_proj": 43.444541931152344, "geo/layer_14/stable_rank_gate_proj": 71.80098724365234, "geo/layer_14/stable_rank_down_proj": 127.71299743652344, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3828229606151581, "geo/layer_14/attn_entropy_mean": 5.525023460388184, "geo/layer_14/attn_entropy_std": 0.41668054461479187, "geo/layer_21/stable_rank_q_proj": 40.32012176513672, "geo/layer_21/stable_rank_k_proj": 30.112686157226562, "geo/layer_21/stable_rank_o_proj": 69.4686508178711, "geo/layer_21/stable_rank_gate_proj": 64.79234313964844, "geo/layer_21/stable_rank_down_proj": 50.514869689941406, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1418987661600113, "geo/layer_21/attn_entropy_mean": 5.681654930114746, "geo/layer_21/attn_entropy_std": 0.314582496881485, "geo/layer_27/stable_rank_q_proj": 43.42438888549805, "geo/layer_27/stable_rank_k_proj": 31.881338119506836, "geo/layer_27/stable_rank_o_proj": 115.08948516845703, "geo/layer_27/stable_rank_gate_proj": 79.42733001708984, "geo/layer_27/stable_rank_down_proj": 127.66034698486328, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0997442677617073, "geo/layer_27/attn_entropy_mean": 4.1841630935668945, "geo/layer_27/attn_entropy_std": 0.7502543330192566, "attnres/final_alpha/block_0": 0.2354719340801239, "attnres/block_norm/0": 1.7698414325714111, "attnres/final_alpha/block_1": 0.00415757205337286, "attnres/block_norm/1": 47582.53125, "attnres/final_alpha/block_2": 0.00994057022035122, "attnres/block_norm/2": 28842.28125, "attnres/final_alpha/block_3": 0.011807365342974663, "attnres/block_norm/3": 60043.46875, "attnres/final_alpha/block_4": 0.01423751562833786, "attnres/block_norm/4": 15486.1640625, "attnres/final_alpha/block_5": 0.6170536279678345, "attnres/block_norm/5": 6703.20849609375, "attnres/final_alpha/block_6": 0.10733139514923096, "attnres/block_norm/6": 39828.140625, "geo/tier1_time_s": 1.3655519485473633, "geo/step": 68550.0, "geo/rankme_slope": 2.0916335284113645e-05} {"step": 68560, "timestamp": 1778268647.7268581, "train/loss": 2.1367501258850097, "train/z_loss": 0.001378195604775101, "train/perplexity": 8.47186036213857, "train/grad_norm": 0.12255859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1785818.3327921727, "perf/iters_per_sec": 0.8515445388756622, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1743366956710815, "data/tokens_consumed": 143782838272, "data/tokens_consumed_B": 143.782838272, "train/loss_slope": -3.870701317739935e-06} {"step": 68570, "timestamp": 1778268658.1065865, "train/loss": 2.139148139953613, "train/z_loss": 0.0013846727786585688, "train/perplexity": 8.49220038055232, "train/grad_norm": 0.15234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021442.9770107416, "perf/iters_per_sec": 0.963899124627467, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0374529600143432, "data/tokens_consumed": 143803809792, "data/tokens_consumed_B": 143.803809792, "train/loss_slope": -2.9556246730897073e-06} {"step": 68580, "timestamp": 1778268668.4842427, "train/loss": 2.133626842498779, "train/z_loss": 0.0013890754897147417, "train/perplexity": 8.445441619505024, "train/grad_norm": 0.1474609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022126.6087548237, "perf/iters_per_sec": 0.9642251056455725, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037102222442627, "data/tokens_consumed": 143824781312, "data/tokens_consumed_B": 143.824781312, "train/loss_slope": -2.8808410554686057e-06} {"step": 68590, "timestamp": 1778268678.8652382, "train/loss": 2.1630096197128297, "train/z_loss": 0.0013897988363169133, "train/perplexity": 8.697273795481607, "train/grad_norm": 0.294921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021516.5180716058, "perf/iters_per_sec": 0.9639341917379407, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0374152183532714, "data/tokens_consumed": 143845752832, "data/tokens_consumed_B": 143.845752832, "train/loss_slope": -2.5420911026687536e-06} {"step": 68600, "timestamp": 1778268689.2299602, "grad/layer_0/attn": 0.002789779333397746, "grad/layer_0/mlp": 0.0030413686763495207, "grad/layer_0/attn_mlp_ratio": 0.9172775610415285, "grad/layer_4/attn": 0.00312105193734169, "grad/layer_4/mlp": 0.0026440073270350695, "grad/layer_4/attn_mlp_ratio": 1.1804247996540491, "grad/layer_8/attn": 0.0037587867118418217, "grad/layer_8/mlp": 0.003902280703186989, "grad/layer_8/attn_mlp_ratio": 0.9632281482081003, "grad/layer_12/attn": 0.00409062672406435, "grad/layer_12/mlp": 0.006808730307966471, "grad/layer_12/attn_mlp_ratio": 0.6007914073493295, "grad/layer_16/attn": 0.0039026096928864717, "grad/layer_16/mlp": 0.0047640735283494, "grad/layer_16/attn_mlp_ratio": 0.8191749324912517, "grad/layer_20/attn": 0.004177255090326071, "grad/layer_20/mlp": 0.006265195086598396, "grad/layer_20/attn_mlp_ratio": 0.6667398167037882, "grad/layer_24/attn": 0.01900857500731945, "grad/layer_24/mlp": 0.013644391670823097, "grad/layer_24/attn_mlp_ratio": 1.3931419829183607, "grad/layer_27/attn": 0.013093775138258934, "grad/layer_27/mlp": 0.013311170972883701, "grad/layer_27/attn_mlp_ratio": 0.9836681586139611} {"step": 68600, "timestamp": 1778268689.2468908, "train/loss": 2.1353210687637327, "train/z_loss": 0.001387486921157688, "train/perplexity": 8.459762236272956, "train/grad_norm": 0.275390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021058.309284043, "perf/iters_per_sec": 0.9637157007618156, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0376504182815551, "data/tokens_consumed": 143866724352, "data/tokens_consumed_B": 143.866724352, "train/loss_slope": -1.8941438130133456e-06} {"step": 68610, "timestamp": 1778268699.6368868, "train/loss": 2.105440878868103, "train/z_loss": 0.001380434585735202, "train/perplexity": 8.210722145622759, "train/grad_norm": 0.1630859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019809.6366101, "perf/iters_per_sec": 0.9631202872324467, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.038291907310486, "data/tokens_consumed": 143887695872, "data/tokens_consumed_B": 143.887695872, "train/loss_slope": -4.936848686795112e-06} {"step": 68620, "timestamp": 1778268710.0120249, "train/loss": 2.1402477979660035, "train/z_loss": 0.0013770642457529903, "train/perplexity": 8.50154403321331, "train/grad_norm": 0.1796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022359.3934687357, "perf/iters_per_sec": 0.9643361060470275, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0369828462600708, "data/tokens_consumed": 143908667392, "data/tokens_consumed_B": 143.908667392, "train/loss_slope": -5.995710705122802e-06} {"step": 68625, "timestamp": 1778268715.7957122, "eos/sharpness": 65.7971143722534, "eos/L0_probe": 1.9684722423553467, "eos/L_plus": 2.291881561279297, "eos/L_minus": 2.3030340671539307, "eos/grad_norm": 0.21493273973464966, "eos/embed_grad_frac": 0.05702658370137215, "eos/time_s": 0.606818437576294} {"step": 68625, "timestamp": 1778268717.176648, "geo/rankme_last": 438.2871398925781, "geo/layer_0/stable_rank_q_proj": 19.27580451965332, "geo/layer_0/stable_rank_k_proj": 16.07792854309082, "geo/layer_0/stable_rank_o_proj": 46.86406707763672, "geo/layer_0/stable_rank_gate_proj": 129.75218200683594, "geo/layer_0/stable_rank_down_proj": 55.70292663574219, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0636523962020874, "geo/layer_0/attn_entropy_mean": 6.157647609710693, "geo/layer_0/attn_entropy_std": 0.41607651114463806, "geo/layer_7/stable_rank_q_proj": 43.30869674682617, "geo/layer_7/stable_rank_k_proj": 40.379920959472656, "geo/layer_7/stable_rank_o_proj": 89.48413848876953, "geo/layer_7/stable_rank_gate_proj": 79.73648071289062, "geo/layer_7/stable_rank_down_proj": 139.70468139648438, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4517328441143036, "geo/layer_7/attn_entropy_mean": 4.655703544616699, "geo/layer_7/attn_entropy_std": 0.8077927231788635, "geo/layer_14/stable_rank_q_proj": 50.47862243652344, "geo/layer_14/stable_rank_k_proj": 40.628662109375, "geo/layer_14/stable_rank_o_proj": 43.4687385559082, "geo/layer_14/stable_rank_gate_proj": 71.64752197265625, "geo/layer_14/stable_rank_down_proj": 127.76737213134766, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37754908204078674, "geo/layer_14/attn_entropy_mean": 5.5467939376831055, "geo/layer_14/attn_entropy_std": 0.4047190845012665, "geo/layer_21/stable_rank_q_proj": 40.3493766784668, "geo/layer_21/stable_rank_k_proj": 30.17693519592285, "geo/layer_21/stable_rank_o_proj": 69.44882202148438, "geo/layer_21/stable_rank_gate_proj": 64.79108428955078, "geo/layer_21/stable_rank_down_proj": 50.532962799072266, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1451795995235443, "geo/layer_21/attn_entropy_mean": 5.695713996887207, "geo/layer_21/attn_entropy_std": 0.30399104952812195, "geo/layer_27/stable_rank_q_proj": 43.429874420166016, "geo/layer_27/stable_rank_k_proj": 31.81951141357422, "geo/layer_27/stable_rank_o_proj": 115.3473129272461, "geo/layer_27/stable_rank_gate_proj": 79.39920043945312, "geo/layer_27/stable_rank_down_proj": 127.55520629882812, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09287349134683609, "geo/layer_27/attn_entropy_mean": 4.185915946960449, "geo/layer_27/attn_entropy_std": 0.7641147971153259, "attnres/final_alpha/block_0": 0.2371440976858139, "attnres/block_norm/0": 1.7700695991516113, "attnres/final_alpha/block_1": 0.004218471236526966, "attnres/block_norm/1": 47642.6015625, "attnres/final_alpha/block_2": 0.010325007140636444, "attnres/block_norm/2": 28858.73828125, "attnres/final_alpha/block_3": 0.012258954346179962, "attnres/block_norm/3": 60158.43359375, "attnres/final_alpha/block_4": 0.014491116628050804, "attnres/block_norm/4": 15493.04296875, "attnres/final_alpha/block_5": 0.6130249500274658, "attnres/block_norm/5": 6768.271484375, "attnres/final_alpha/block_6": 0.10853737592697144, "attnres/block_norm/6": 40063.9453125, "geo/tier1_time_s": 1.3608288764953613, "geo/step": 68625.0, "geo/rankme_slope": 1.2109980711034413e-05} {"step": 68630, "timestamp": 1778268722.368184, "train/loss": 2.1627030611038207, "train/z_loss": 0.0013832857483066618, "train/perplexity": 8.69460797995992, "train/grad_norm": 0.2265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1697997.9997208256, "perf/iters_per_sec": 0.8096685408214691, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2350733041763307, "data/tokens_consumed": 143929638912, "data/tokens_consumed_B": 143.929638912, "train/loss_slope": -5.914178425365127e-06} {"step": 68640, "timestamp": 1778268732.7448661, "train/loss": 2.1471131086349486, "train/z_loss": 0.0013745420379564165, "train/perplexity": 8.56011058291419, "train/grad_norm": 0.13671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021827.2335197942, "perf/iters_per_sec": 0.9640823524092647, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0372557878494262, "data/tokens_consumed": 143950610432, "data/tokens_consumed_B": 143.950610432, "train/loss_slope": -5.457635015973523e-06} {"step": 68650, "timestamp": 1778268743.1126802, "grad/layer_0/attn": 0.002844264730811119, "grad/layer_0/mlp": 0.0032337536104023457, "grad/layer_0/attn_mlp_ratio": 0.8795551503077313, "grad/layer_4/attn": 0.0028888536617159843, "grad/layer_4/mlp": 0.0025768482591956854, "grad/layer_4/attn_mlp_ratio": 1.1210801952729905, "grad/layer_8/attn": 0.0033822122495621443, "grad/layer_8/mlp": 0.0037446876522153616, "grad/layer_8/attn_mlp_ratio": 0.9032027430220914, "grad/layer_12/attn": 0.006470163352787495, "grad/layer_12/mlp": 0.00667268130928278, "grad/layer_12/attn_mlp_ratio": 0.9696496739356462, "grad/layer_16/attn": 0.003918665926903486, "grad/layer_16/mlp": 0.004694964736700058, "grad/layer_16/attn_mlp_ratio": 0.8346528809484743, "grad/layer_20/attn": 0.0026534360367804766, "grad/layer_20/mlp": 0.005957881920039654, "grad/layer_20/attn_mlp_ratio": 0.44536565642882875, "grad/layer_24/attn": 0.006422285456210375, "grad/layer_24/mlp": 0.008358166553080082, "grad/layer_24/attn_mlp_ratio": 0.7683844702765873, "grad/layer_27/attn": 0.004017701372504234, "grad/layer_27/mlp": 0.008177144452929497, "grad/layer_27/attn_mlp_ratio": 0.4913330498804593} {"step": 68650, "timestamp": 1778268743.129617, "train/loss": 2.1597445964813233, "train/z_loss": 0.0013814455247484147, "train/perplexity": 8.668923302183783, "train/grad_norm": 0.1123046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020627.369266321, "perf/iters_per_sec": 0.963510212548409, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0378717184066772, "data/tokens_consumed": 143971581952, "data/tokens_consumed_B": 143.971581952, "train/loss_slope": -5.809891949964714e-07} {"step": 68660, "timestamp": 1778268753.5041935, "train/loss": 2.103255295753479, "train/z_loss": 0.0013711492880247532, "train/perplexity": 8.192796526043704, "train/grad_norm": 0.1318359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022442.5803923921, "perf/iters_per_sec": 0.9643757726633034, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0369401931762696, "data/tokens_consumed": 143992553472, "data/tokens_consumed_B": 143.992553472, "train/loss_slope": -1.784247161746215e-06} {"step": 68670, "timestamp": 1778268763.881539, "train/loss": 2.1434937477111817, "train/z_loss": 0.0013705017743632197, "train/perplexity": 8.529184453340152, "train/grad_norm": 0.23828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021767.703693815, "perf/iters_per_sec": 0.9640539663762164, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0372863292694092, "data/tokens_consumed": 144013524992, "data/tokens_consumed_B": 144.013524992, "train/loss_slope": 6.5806233199765e-07} {"step": 68680, "timestamp": 1778268774.2574205, "train/loss": 2.1129364728927613, "train/z_loss": 0.0013684089644812047, "train/perplexity": 8.272497618174956, "train/grad_norm": 0.2412109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022349.489612301, "perf/iters_per_sec": 0.9643313835202699, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0369879245758056, "data/tokens_consumed": 144034496512, "data/tokens_consumed_B": 144.034496512, "train/loss_slope": -9.727202197148414e-07} {"step": 68690, "timestamp": 1778268784.633849, "train/loss": 2.202184247970581, "train/z_loss": 0.001357841829303652, "train/perplexity": 9.044747911433433, "train/grad_norm": 0.2236328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022083.7955922948, "perf/iters_per_sec": 0.9642046907388185, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0371241807937621, "data/tokens_consumed": 144055468032, "data/tokens_consumed_B": 144.055468032, "train/loss_slope": 4.372705398934976e-06} {"step": 68700, "timestamp": 1778268794.989643, "grad/layer_0/attn": 0.002701053861528635, "grad/layer_0/mlp": 0.002890132600441575, "grad/layer_0/attn_mlp_ratio": 0.9345778002220967, "grad/layer_4/attn": 0.003065868280827999, "grad/layer_4/mlp": 0.00241627125069499, "grad/layer_4/attn_mlp_ratio": 1.2688427067374555, "grad/layer_8/attn": 0.00541037367656827, "grad/layer_8/mlp": 0.0034959346521645784, "grad/layer_8/attn_mlp_ratio": 1.547618608504729, "grad/layer_12/attn": 0.009039732627570629, "grad/layer_12/mlp": 0.006315266713500023, "grad/layer_12/attn_mlp_ratio": 1.4314094549808363, "grad/layer_16/attn": 0.005316933151334524, "grad/layer_16/mlp": 0.0043802885338664055, "grad/layer_16/attn_mlp_ratio": 1.2138316891326313, "grad/layer_20/attn": 0.003655014093965292, "grad/layer_20/mlp": 0.005221623461693525, "grad/layer_20/attn_mlp_ratio": 0.6999765591642658, "grad/layer_24/attn": 0.007218317128717899, "grad/layer_24/mlp": 0.007653266657143831, "grad/layer_24/attn_mlp_ratio": 0.9431681081781275, "grad/layer_27/attn": 0.009111985564231873, "grad/layer_27/mlp": 0.0066855004988610744, "grad/layer_27/attn_mlp_ratio": 1.36294738583737} {"step": 68700, "timestamp": 1778268795.5960639, "eos/sharpness": 14.025306701660153, "eos/L0_probe": 1.9680299758911133, "eos/L_plus": 2.0485334396362305, "eos/L_minus": 2.0277795791625977, "eos/grad_norm": 0.09278471767902374, "eos/embed_grad_frac": 0.2666243314743042, "eos/time_s": 0.6033987998962402} {"step": 68700, "timestamp": 1778268795.6170068, "train/loss": 2.1442723274230957, "train/z_loss": 0.0013583556516095995, "train/perplexity": 8.53582768912341, "train/grad_norm": 0.0927734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1910140.0702905003, "perf/iters_per_sec": 0.9108257628872396, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0979048252105712, "data/tokens_consumed": 144076439552, "data/tokens_consumed_B": 144.076439552, "train/loss_slope": 4.0656008664602405e-06} {"step": 68700, "timestamp": 1778268796.9913058, "geo/rankme_last": 438.7157897949219, "geo/layer_0/stable_rank_q_proj": 19.260848999023438, "geo/layer_0/stable_rank_k_proj": 16.06825065612793, "geo/layer_0/stable_rank_o_proj": 46.957210540771484, "geo/layer_0/stable_rank_gate_proj": 129.55467224121094, "geo/layer_0/stable_rank_down_proj": 55.6724967956543, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06718574464321136, "geo/layer_0/attn_entropy_mean": 6.1572160720825195, "geo/layer_0/attn_entropy_std": 0.4200182557106018, "geo/layer_7/stable_rank_q_proj": 43.24077224731445, "geo/layer_7/stable_rank_k_proj": 40.39772415161133, "geo/layer_7/stable_rank_o_proj": 89.30070495605469, "geo/layer_7/stable_rank_gate_proj": 79.5712661743164, "geo/layer_7/stable_rank_down_proj": 139.87875366210938, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4470047056674957, "geo/layer_7/attn_entropy_mean": 4.659297943115234, "geo/layer_7/attn_entropy_std": 0.7950934767723083, "geo/layer_14/stable_rank_q_proj": 50.474891662597656, "geo/layer_14/stable_rank_k_proj": 40.70188903808594, "geo/layer_14/stable_rank_o_proj": 43.50214767456055, "geo/layer_14/stable_rank_gate_proj": 71.68148040771484, "geo/layer_14/stable_rank_down_proj": 128.02284240722656, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.388723760843277, "geo/layer_14/attn_entropy_mean": 5.514641761779785, "geo/layer_14/attn_entropy_std": 0.40256404876708984, "geo/layer_21/stable_rank_q_proj": 40.22843933105469, "geo/layer_21/stable_rank_k_proj": 30.1901798248291, "geo/layer_21/stable_rank_o_proj": 69.52933502197266, "geo/layer_21/stable_rank_gate_proj": 64.83804321289062, "geo/layer_21/stable_rank_down_proj": 50.51584243774414, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1384463608264923, "geo/layer_21/attn_entropy_mean": 5.698914527893066, "geo/layer_21/attn_entropy_std": 0.2998792231082916, "geo/layer_27/stable_rank_q_proj": 43.47742462158203, "geo/layer_27/stable_rank_k_proj": 31.775802612304688, "geo/layer_27/stable_rank_o_proj": 115.29204559326172, "geo/layer_27/stable_rank_gate_proj": 79.35211944580078, "geo/layer_27/stable_rank_down_proj": 127.5311050415039, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09604949504137039, "geo/layer_27/attn_entropy_mean": 4.176409721374512, "geo/layer_27/attn_entropy_std": 0.7476384043693542, "attnres/final_alpha/block_0": 0.23711466789245605, "attnres/block_norm/0": 1.769946813583374, "attnres/final_alpha/block_1": 0.004157291725277901, "attnres/block_norm/1": 47703.40234375, "attnres/final_alpha/block_2": 0.01022487785667181, "attnres/block_norm/2": 28920.798828125, "attnres/final_alpha/block_3": 0.012149610556662083, "attnres/block_norm/3": 59926.9375, "attnres/final_alpha/block_4": 0.01449006237089634, "attnres/block_norm/4": 15531.587890625, "attnres/final_alpha/block_5": 0.6146584749221802, "attnres/block_norm/5": 6768.1005859375, "attnres/final_alpha/block_6": 0.10720501095056534, "attnres/block_norm/6": 40078.296875, "geo/tier1_time_s": 1.3698880672454834, "geo/step": 68700.0, "geo/rankme_slope": 3.5536343443627446e-05} {"step": 68710, "timestamp": 1778268807.342437, "train/loss": 2.130705690383911, "train/z_loss": 0.0013584719388745725, "train/perplexity": 8.420807197821103, "train/grad_norm": 0.2216796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1789211.3329367521, "perf/iters_per_sec": 0.8531624474223862, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1721097230911255, "data/tokens_consumed": 144097411072, "data/tokens_consumed_B": 144.097411072, "train/loss_slope": 2.004978801980234e-06} {"step": 68720, "timestamp": 1778268817.7303138, "train/loss": 2.068626272678375, "train/z_loss": 0.0013727242941968143, "train/perplexity": 7.913944045963444, "train/grad_norm": 0.12060546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020547.2556712492, "perf/iters_per_sec": 0.9634720114094015, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03791286945343, "data/tokens_consumed": 144118382592, "data/tokens_consumed_B": 144.118382592, "train/loss_slope": -2.7978511175664422e-06} {"step": 68730, "timestamp": 1778268828.1101315, "train/loss": 2.179467487335205, "train/z_loss": 0.0013764868839643896, "train/perplexity": 8.841596742651223, "train/grad_norm": 0.09375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021723.74402729, "perf/iters_per_sec": 0.9640330047737551, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373088836669921, "data/tokens_consumed": 144139354112, "data/tokens_consumed_B": 144.139354112, "train/loss_slope": -3.0041234399547783e-06} {"step": 68740, "timestamp": 1778268838.4952965, "train/loss": 2.175672769546509, "train/z_loss": 0.0013572446536272765, "train/perplexity": 8.808108956744855, "train/grad_norm": 0.173828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021952.0205256762, "perf/iters_per_sec": 0.9641418554905301, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0371917724609374, "data/tokens_consumed": 144160325632, "data/tokens_consumed_B": 144.160325632, "train/loss_slope": 6.629543550516028e-07} {"step": 68750, "timestamp": 1778268848.8641977, "grad/layer_0/attn": 0.002934293355792761, "grad/layer_0/mlp": 0.003284208942204714, "grad/layer_0/attn_mlp_ratio": 0.8934551114392365, "grad/layer_4/attn": 0.002218614099547267, "grad/layer_4/mlp": 0.002677634358406067, "grad/layer_4/attn_mlp_ratio": 0.8285724336203665, "grad/layer_8/attn": 0.00324307125993073, "grad/layer_8/mlp": 0.0037788173649460077, "grad/layer_8/attn_mlp_ratio": 0.8582238464850186, "grad/layer_12/attn": 0.0048883408308029175, "grad/layer_12/mlp": 0.007345583755522966, "grad/layer_12/attn_mlp_ratio": 0.6654802296113593, "grad/layer_16/attn": 0.004535236395895481, "grad/layer_16/mlp": 0.004615783225744963, "grad/layer_16/attn_mlp_ratio": 0.9825496726849756, "grad/layer_20/attn": 0.004020311404019594, "grad/layer_20/mlp": 0.005923699587583542, "grad/layer_20/attn_mlp_ratio": 0.6786825153284572, "grad/layer_24/attn": 0.009417778812348843, "grad/layer_24/mlp": 0.00999962817877531, "grad/layer_24/attn_mlp_ratio": 0.9418128904189897, "grad/layer_27/attn": 0.003779505379498005, "grad/layer_27/mlp": 0.01008329726755619, "grad/layer_27/attn_mlp_ratio": 0.37482831674278133} {"step": 68750, "timestamp": 1778268848.880939, "train/loss": 2.127650737762451, "train/z_loss": 0.0013664905680343508, "train/perplexity": 8.395121285396707, "train/grad_norm": 0.12255859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020630.7577565466, "perf/iters_per_sec": 0.9635118283064588, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0378699779510498, "data/tokens_consumed": 144181297152, "data/tokens_consumed_B": 144.181297152, "train/loss_slope": 2.411362089768858e-06} {"step": 68760, "timestamp": 1778268859.256218, "train/loss": 2.118615221977234, "train/z_loss": 0.0013455086504109205, "train/perplexity": 8.3196086958412, "train/grad_norm": 0.1201171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022692.8334051133, "perf/iters_per_sec": 0.9644951025987212, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036811900138855, "data/tokens_consumed": 144202268672, "data/tokens_consumed_B": 144.202268672, "train/loss_slope": -7.31976805525847e-07} {"step": 68770, "timestamp": 1778268869.6318684, "train/loss": 2.1896688461303713, "train/z_loss": 0.0013604366220533847, "train/perplexity": 8.93225467417641, "train/grad_norm": 0.2041015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022163.3337589426, "perf/iters_per_sec": 0.9642426174921715, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037083387374878, "data/tokens_consumed": 144223240192, "data/tokens_consumed_B": 144.223240192, "train/loss_slope": 3.094796014196922e-06} {"step": 68775, "timestamp": 1778268875.42014, "eos/sharpness": 75.34289360046385, "eos/L0_probe": 1.9684919118881226, "eos/L_plus": 2.4035489559173584, "eos/L_minus": 2.2868638038635254, "eos/grad_norm": 0.2185896337032318, "eos/embed_grad_frac": 0.04579945653676987, "eos/time_s": 0.6092193126678467} {"step": 68775, "timestamp": 1778268876.7978373, "geo/rankme_last": 438.6478271484375, "geo/layer_0/stable_rank_q_proj": 19.264463424682617, "geo/layer_0/stable_rank_k_proj": 16.07832145690918, "geo/layer_0/stable_rank_o_proj": 47.00804901123047, "geo/layer_0/stable_rank_gate_proj": 129.6611785888672, "geo/layer_0/stable_rank_down_proj": 55.71350860595703, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06570466607809067, "geo/layer_0/attn_entropy_mean": 6.1555962562561035, "geo/layer_0/attn_entropy_std": 0.42296352982521057, "geo/layer_7/stable_rank_q_proj": 43.18594741821289, "geo/layer_7/stable_rank_k_proj": 40.4609489440918, "geo/layer_7/stable_rank_o_proj": 89.2869644165039, "geo/layer_7/stable_rank_gate_proj": 79.55960845947266, "geo/layer_7/stable_rank_down_proj": 139.82681274414062, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4564540684223175, "geo/layer_7/attn_entropy_mean": 4.650974273681641, "geo/layer_7/attn_entropy_std": 0.7832239270210266, "geo/layer_14/stable_rank_q_proj": 50.40065383911133, "geo/layer_14/stable_rank_k_proj": 40.72624969482422, "geo/layer_14/stable_rank_o_proj": 43.49827194213867, "geo/layer_14/stable_rank_gate_proj": 71.62586212158203, "geo/layer_14/stable_rank_down_proj": 127.8083267211914, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3866983950138092, "geo/layer_14/attn_entropy_mean": 5.539403915405273, "geo/layer_14/attn_entropy_std": 0.4137583374977112, "geo/layer_21/stable_rank_q_proj": 40.25513458251953, "geo/layer_21/stable_rank_k_proj": 30.154016494750977, "geo/layer_21/stable_rank_o_proj": 69.51643371582031, "geo/layer_21/stable_rank_gate_proj": 64.82320404052734, "geo/layer_21/stable_rank_down_proj": 50.48820495605469, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14907580614089966, "geo/layer_21/attn_entropy_mean": 5.708783149719238, "geo/layer_21/attn_entropy_std": 0.2996569573879242, "geo/layer_27/stable_rank_q_proj": 43.558204650878906, "geo/layer_27/stable_rank_k_proj": 31.842790603637695, "geo/layer_27/stable_rank_o_proj": 115.45142364501953, "geo/layer_27/stable_rank_gate_proj": 79.27555084228516, "geo/layer_27/stable_rank_down_proj": 127.75447845458984, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09039900451898575, "geo/layer_27/attn_entropy_mean": 4.177813529968262, "geo/layer_27/attn_entropy_std": 0.7633870840072632, "attnres/final_alpha/block_0": 0.23476512730121613, "attnres/block_norm/0": 1.7701588869094849, "attnres/final_alpha/block_1": 0.0041281720623373985, "attnres/block_norm/1": 47735.625, "attnres/final_alpha/block_2": 0.00987955927848816, "attnres/block_norm/2": 28945.1015625, "attnres/final_alpha/block_3": 0.011811630800366402, "attnres/block_norm/3": 59971.94921875, "attnres/final_alpha/block_4": 0.014022288843989372, "attnres/block_norm/4": 15504.0166015625, "attnres/final_alpha/block_5": 0.62046217918396, "attnres/block_norm/5": 6681.51953125, "attnres/final_alpha/block_6": 0.1049310564994812, "attnres/block_norm/6": 40226.53125, "geo/tier1_time_s": 1.3581840991973877, "geo/step": 68775.0, "geo/rankme_slope": 1.6468306072428967e-05} {"step": 68780, "timestamp": 1778268881.988398, "train/loss": 2.140389013290405, "train/z_loss": 0.0013757838984020053, "train/perplexity": 8.502744666283775, "train/grad_norm": 0.13671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1698279.2828989236, "perf/iters_per_sec": 0.8098026670927637, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2348687410354615, "data/tokens_consumed": 144244211712, "data/tokens_consumed_B": 144.244211712, "train/loss_slope": 1.9035473622397314e-06} {"step": 68790, "timestamp": 1778268892.3668456, "train/loss": 2.1207017421722414, "train/z_loss": 0.001386903610546142, "train/perplexity": 8.336985849986776, "train/grad_norm": 0.1337890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022057.0673383859, "perf/iters_per_sec": 0.9641919457141809, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0371378898620605, "data/tokens_consumed": 144265183232, "data/tokens_consumed_B": 144.265183232, "train/loss_slope": 9.445534859768065e-07} {"step": 68800, "timestamp": 1778268902.732474, "grad/layer_0/attn": 0.002693663816899061, "grad/layer_0/mlp": 0.003177172038704157, "grad/layer_0/attn_mlp_ratio": 0.8478180278886936, "grad/layer_4/attn": 0.002265277551487088, "grad/layer_4/mlp": 0.0025483432691544294, "grad/layer_4/attn_mlp_ratio": 0.8889216339157364, "grad/layer_8/attn": 0.009067333303391933, "grad/layer_8/mlp": 0.00352343893609941, "grad/layer_8/attn_mlp_ratio": 2.5734327202748593, "grad/layer_12/attn": 0.005119101610034704, "grad/layer_12/mlp": 0.006724560633301735, "grad/layer_12/attn_mlp_ratio": 0.7612544243497738, "grad/layer_16/attn": 0.005428996402770281, "grad/layer_16/mlp": 0.00475953659042716, "grad/layer_16/attn_mlp_ratio": 1.140656487361386, "grad/layer_20/attn": 0.0034308815374970436, "grad/layer_20/mlp": 0.006378933321684599, "grad/layer_20/attn_mlp_ratio": 0.5378456413785535, "grad/layer_24/attn": 0.012406806461513042, "grad/layer_24/mlp": 0.010285019874572754, "grad/layer_24/attn_mlp_ratio": 1.2062987230152102, "grad/layer_27/attn": 0.005410873331129551, "grad/layer_27/mlp": 0.01036257017403841, "grad/layer_27/attn_mlp_ratio": 0.5221555259012852} {"step": 68800, "timestamp": 1778268902.7490892, "train/loss": 2.159026622772217, "train/z_loss": 0.0013780403300188483, "train/perplexity": 8.662701476987284, "train/grad_norm": 0.18359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021406.6497940593, "perf/iters_per_sec": 0.9638818024606988, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037471604347229, "data/tokens_consumed": 144286154752, "data/tokens_consumed_B": 144.286154752, "train/loss_slope": 4.060167777012545e-06} {"step": 68810, "timestamp": 1778268913.1266341, "train/loss": 2.1552470088005067, "train/z_loss": 0.0013748379657045006, "train/perplexity": 8.630021607002666, "train/grad_norm": 0.11669921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021818.6825719625, "perf/iters_per_sec": 0.9640782749996006, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0372601747512817, "data/tokens_consumed": 144307126272, "data/tokens_consumed_B": 144.307126272, "train/loss_slope": 5.9605434711295086e-06} {"step": 68820, "timestamp": 1778268923.510237, "train/loss": 2.1465316295623778, "train/z_loss": 0.0013739644782617688, "train/perplexity": 8.555134504634234, "train/grad_norm": 0.208984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021154.0212929402, "perf/iters_per_sec": 0.963761339804144, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0376012802124024, "data/tokens_consumed": 144328097792, "data/tokens_consumed_B": 144.328097792, "train/loss_slope": 6.589653060631535e-06} {"step": 68830, "timestamp": 1778268933.8878639, "train/loss": 2.145036506652832, "train/z_loss": 0.0013781892484985292, "train/perplexity": 8.54235308432023, "train/grad_norm": 0.1044921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022127.5384853054, "perf/iters_per_sec": 0.9642255489756133, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0371017456054688, "data/tokens_consumed": 144349069312, "data/tokens_consumed_B": 144.349069312, "train/loss_slope": 7.715679760133763e-06} {"step": 68840, "timestamp": 1778268944.2686853, "train/loss": 2.098008918762207, "train/z_loss": 0.0013923844206146895, "train/perplexity": 8.149926581132597, "train/grad_norm": 0.1494140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021121.2339168496, "perf/iters_per_sec": 0.963745705564904, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0376181125640869, "data/tokens_consumed": 144370040832, "data/tokens_consumed_B": 144.370040832, "train/loss_slope": 6.584400598472779e-06} {"step": 68850, "timestamp": 1778268954.6328204, "grad/layer_0/attn": 0.002573975594714284, "grad/layer_0/mlp": 0.0027756907511502504, "grad/layer_0/attn_mlp_ratio": 0.9273279096076656, "grad/layer_4/attn": 0.00354551849886775, "grad/layer_4/mlp": 0.002802138216793537, "grad/layer_4/attn_mlp_ratio": 1.2652903240425535, "grad/layer_8/attn": 0.006219134200364351, "grad/layer_8/mlp": 0.0038178882095962763, "grad/layer_8/attn_mlp_ratio": 1.6289460811969105, "grad/layer_12/attn": 0.00912072416394949, "grad/layer_12/mlp": 0.006938572973012924, "grad/layer_12/attn_mlp_ratio": 1.3144956560915795, "grad/layer_16/attn": 0.0034708178136497736, "grad/layer_16/mlp": 0.004547939635813236, "grad/layer_16/attn_mlp_ratio": 0.7631626660130189, "grad/layer_20/attn": 0.002966935746371746, "grad/layer_20/mlp": 0.0057938965037465096, "grad/layer_20/attn_mlp_ratio": 0.5120795121634093, "grad/layer_24/attn": 0.007662651129066944, "grad/layer_24/mlp": 0.008481197990477085, "grad/layer_24/attn_mlp_ratio": 0.9034868714681683, "grad/layer_27/attn": 0.0063573457300662994, "grad/layer_27/mlp": 0.00807798933237791, "grad/layer_27/attn_mlp_ratio": 0.7869960444098889} {"step": 68850, "timestamp": 1778268955.2440932, "eos/sharpness": 72.26722240447997, "eos/L0_probe": 1.9649134874343872, "eos/L_plus": 2.401235342025757, "eos/L_minus": 2.2512638568878174, "eos/grad_norm": 0.16200567781925201, "eos/embed_grad_frac": 0.0823025330901146, "eos/time_s": 0.6084613800048828} {"step": 68850, "timestamp": 1778268955.2656455, "train/loss": 2.1715705156326295, "train/z_loss": 0.0013746447162702679, "train/perplexity": 8.772049869637092, "train/grad_norm": 0.162109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1908095.6702040324, "perf/iters_per_sec": 0.909850916959778, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0990811586380005, "data/tokens_consumed": 144391012352, "data/tokens_consumed_B": 144.391012352, "train/loss_slope": 6.640964644541078e-06} {"step": 68850, "timestamp": 1778268956.6324563, "geo/rankme_last": 438.7936096191406, "geo/layer_0/stable_rank_q_proj": 19.285491943359375, "geo/layer_0/stable_rank_k_proj": 16.069595336914062, "geo/layer_0/stable_rank_o_proj": 46.945220947265625, "geo/layer_0/stable_rank_gate_proj": 129.62937927246094, "geo/layer_0/stable_rank_down_proj": 55.764564514160156, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06323106586933136, "geo/layer_0/attn_entropy_mean": 6.156937599182129, "geo/layer_0/attn_entropy_std": 0.4276759922504425, "geo/layer_7/stable_rank_q_proj": 43.19828414916992, "geo/layer_7/stable_rank_k_proj": 40.52582550048828, "geo/layer_7/stable_rank_o_proj": 89.227294921875, "geo/layer_7/stable_rank_gate_proj": 79.54864501953125, "geo/layer_7/stable_rank_down_proj": 139.73756408691406, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4651709198951721, "geo/layer_7/attn_entropy_mean": 4.665409088134766, "geo/layer_7/attn_entropy_std": 0.8246319890022278, "geo/layer_14/stable_rank_q_proj": 50.442771911621094, "geo/layer_14/stable_rank_k_proj": 40.71699905395508, "geo/layer_14/stable_rank_o_proj": 43.473140716552734, "geo/layer_14/stable_rank_gate_proj": 71.55889892578125, "geo/layer_14/stable_rank_down_proj": 127.9420394897461, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39719539880752563, "geo/layer_14/attn_entropy_mean": 5.515280723571777, "geo/layer_14/attn_entropy_std": 0.43566811084747314, "geo/layer_21/stable_rank_q_proj": 40.176734924316406, "geo/layer_21/stable_rank_k_proj": 30.178571701049805, "geo/layer_21/stable_rank_o_proj": 69.69578552246094, "geo/layer_21/stable_rank_gate_proj": 64.8827133178711, "geo/layer_21/stable_rank_down_proj": 50.481990814208984, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1367165744304657, "geo/layer_21/attn_entropy_mean": 5.699802398681641, "geo/layer_21/attn_entropy_std": 0.3037111759185791, "geo/layer_27/stable_rank_q_proj": 43.544334411621094, "geo/layer_27/stable_rank_k_proj": 31.81757354736328, "geo/layer_27/stable_rank_o_proj": 115.41386413574219, "geo/layer_27/stable_rank_gate_proj": 79.2048568725586, "geo/layer_27/stable_rank_down_proj": 127.62940216064453, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09347166866064072, "geo/layer_27/attn_entropy_mean": 4.1456685066223145, "geo/layer_27/attn_entropy_std": 0.7549520134925842, "attnres/final_alpha/block_0": 0.23583456873893738, "attnres/block_norm/0": 1.7702679634094238, "attnres/final_alpha/block_1": 0.004174722358584404, "attnres/block_norm/1": 47746.9921875, "attnres/final_alpha/block_2": 0.00993247702717781, "attnres/block_norm/2": 28868.03515625, "attnres/final_alpha/block_3": 0.011763944290578365, "attnres/block_norm/3": 60346.65625, "attnres/final_alpha/block_4": 0.014278119429945946, "attnres/block_norm/4": 15522.5146484375, "attnres/final_alpha/block_5": 0.6190192103385925, "attnres/block_norm/5": 6696.95361328125, "attnres/final_alpha/block_6": 0.1049969494342804, "attnres/block_norm/6": 40065.09375, "geo/tier1_time_s": 1.3630173206329346, "geo/step": 68850.0, "geo/rankme_slope": 1.5753547512755107e-05} {"step": 68860, "timestamp": 1778268967.0074542, "train/loss": 2.1547298669815063, "train/z_loss": 0.0013670952175743878, "train/perplexity": 8.625559815719676, "train/grad_norm": 0.154296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1786629.9022517006, "perf/iters_per_sec": 0.8519315253504279, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1738032579421998, "data/tokens_consumed": 144411983872, "data/tokens_consumed_B": 144.411983872, "train/loss_slope": 9.97927573004027e-07} {"step": 68870, "timestamp": 1778268977.3843074, "train/loss": 2.091285061836243, "train/z_loss": 0.0013970463653095067, "train/perplexity": 8.09531145874189, "train/grad_norm": 0.0947265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022013.4669380616, "perf/iters_per_sec": 0.9641711554231938, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0371602535247804, "data/tokens_consumed": 144432955392, "data/tokens_consumed_B": 144.432955392, "train/loss_slope": -2.1328607121996094e-06} {"step": 68880, "timestamp": 1778268987.7611156, "train/loss": 2.1507906913757324, "train/z_loss": 0.001372231380082667, "train/perplexity": 8.591649054977916, "train/grad_norm": 0.09521484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022403.8922742414, "perf/iters_per_sec": 0.9643573247309882, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0369600296020507, "data/tokens_consumed": 144453926912, "data/tokens_consumed_B": 144.453926912, "train/loss_slope": -1.0899148257759504e-06} {"step": 68890, "timestamp": 1778268998.1412618, "train/loss": 2.0993361592292787, "train/z_loss": 0.0013821319327689708, "train/perplexity": 8.160750674993489, "train/grad_norm": 0.115234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021445.5320431578, "perf/iters_per_sec": 0.9639003429618634, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0374516487121581, "data/tokens_consumed": 144474898432, "data/tokens_consumed_B": 144.474898432, "train/loss_slope": -2.415362543219944e-06} {"step": 68900, "timestamp": 1778269008.5069566, "grad/layer_0/attn": 0.002647832967340946, "grad/layer_0/mlp": 0.0030356510542333126, "grad/layer_0/attn_mlp_ratio": 0.8722454698552758, "grad/layer_4/attn": 0.003837914438918233, "grad/layer_4/mlp": 0.0024821751285344362, "grad/layer_4/attn_mlp_ratio": 1.5461899687010707, "grad/layer_8/attn": 0.0033999066799879074, "grad/layer_8/mlp": 0.003510858165100217, "grad/layer_8/attn_mlp_ratio": 0.9683975892119523, "grad/layer_12/attn": 0.004606913775205612, "grad/layer_12/mlp": 0.00757180480286479, "grad/layer_12/attn_mlp_ratio": 0.6084300684322423, "grad/layer_16/attn": 0.0043053110130131245, "grad/layer_16/mlp": 0.004705558530986309, "grad/layer_16/attn_mlp_ratio": 0.914941529930679, "grad/layer_20/attn": 0.003939211834222078, "grad/layer_20/mlp": 0.006690767593681812, "grad/layer_20/attn_mlp_ratio": 0.5887533411064223, "grad/layer_24/attn": 0.011367147788405418, "grad/layer_24/mlp": 0.01147504337131977, "grad/layer_24/attn_mlp_ratio": 0.9905973617281694, "grad/layer_27/attn": 0.004447345156222582, "grad/layer_27/mlp": 0.01095523126423359, "grad/layer_27/attn_mlp_ratio": 0.405956296892294} {"step": 68900, "timestamp": 1778269008.5238729, "train/loss": 2.1617021322250367, "train/z_loss": 0.0013747094315476715, "train/perplexity": 8.68590964967427, "train/grad_norm": 0.16796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020784.7371733733, "perf/iters_per_sec": 0.9635852514140002, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0377908945083618, "data/tokens_consumed": 144495869952, "data/tokens_consumed_B": 144.495869952, "train/loss_slope": -2.7278693130772216e-06} {"step": 68910, "timestamp": 1778269018.9034293, "train/loss": 2.112394106388092, "train/z_loss": 0.0013714901288039982, "train/perplexity": 8.268012109061807, "train/grad_norm": 0.1171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021441.536904408, "perf/iters_per_sec": 0.9638984379312553, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0374536991119385, "data/tokens_consumed": 144516841472, "data/tokens_consumed_B": 144.516841472, "train/loss_slope": -2.691860410711459e-06} {"step": 68920, "timestamp": 1778269029.2836487, "train/loss": 2.1746814966201784, "train/z_loss": 0.001361362321767956, "train/perplexity": 8.799382042895186, "train/grad_norm": 0.1162109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021653.9979929389, "perf/iters_per_sec": 0.9639997472729391, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373446702957154, "data/tokens_consumed": 144537812992, "data/tokens_consumed_B": 144.537812992, "train/loss_slope": 4.814440875736271e-07} {"step": 68925, "timestamp": 1778269035.0800464, "eos/sharpness": 84.77723598480223, "eos/L0_probe": 1.96865713596344, "eos/L_plus": 2.3291594982147217, "eos/L_minus": 2.4559271335601807, "eos/grad_norm": 0.27254095673561096, "eos/embed_grad_frac": 0.03155350312590599, "eos/time_s": 0.6101326942443848} {"step": 68925, "timestamp": 1778269036.4620087, "geo/rankme_last": 438.99957275390625, "geo/layer_0/stable_rank_q_proj": 19.305538177490234, "geo/layer_0/stable_rank_k_proj": 16.06597137451172, "geo/layer_0/stable_rank_o_proj": 46.932926177978516, "geo/layer_0/stable_rank_gate_proj": 129.82455444335938, "geo/layer_0/stable_rank_down_proj": 55.762752532958984, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06555861234664917, "geo/layer_0/attn_entropy_mean": 6.156055450439453, "geo/layer_0/attn_entropy_std": 0.4221011996269226, "geo/layer_7/stable_rank_q_proj": 43.22041702270508, "geo/layer_7/stable_rank_k_proj": 40.48446273803711, "geo/layer_7/stable_rank_o_proj": 89.18067169189453, "geo/layer_7/stable_rank_gate_proj": 79.56375885009766, "geo/layer_7/stable_rank_down_proj": 139.72952270507812, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.47604456543922424, "geo/layer_7/attn_entropy_mean": 4.65227746963501, "geo/layer_7/attn_entropy_std": 0.7870658040046692, "geo/layer_14/stable_rank_q_proj": 50.43178176879883, "geo/layer_14/stable_rank_k_proj": 40.687095642089844, "geo/layer_14/stable_rank_o_proj": 43.430274963378906, "geo/layer_14/stable_rank_gate_proj": 71.36768341064453, "geo/layer_14/stable_rank_down_proj": 127.59716033935547, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4010257124900818, "geo/layer_14/attn_entropy_mean": 5.557210445404053, "geo/layer_14/attn_entropy_std": 0.4210350215435028, "geo/layer_21/stable_rank_q_proj": 40.02834701538086, "geo/layer_21/stable_rank_k_proj": 30.14523696899414, "geo/layer_21/stable_rank_o_proj": 69.70172119140625, "geo/layer_21/stable_rank_gate_proj": 64.832763671875, "geo/layer_21/stable_rank_down_proj": 50.46633529663086, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14258119463920593, "geo/layer_21/attn_entropy_mean": 5.665790557861328, "geo/layer_21/attn_entropy_std": 0.31118243932724, "geo/layer_27/stable_rank_q_proj": 43.53343963623047, "geo/layer_27/stable_rank_k_proj": 31.801740646362305, "geo/layer_27/stable_rank_o_proj": 115.45574188232422, "geo/layer_27/stable_rank_gate_proj": 79.18975830078125, "geo/layer_27/stable_rank_down_proj": 127.52397918701172, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10093903541564941, "geo/layer_27/attn_entropy_mean": 4.2036285400390625, "geo/layer_27/attn_entropy_std": 0.7547658085823059, "attnres/final_alpha/block_0": 0.24005918204784393, "attnres/block_norm/0": 1.7705070972442627, "attnres/final_alpha/block_1": 0.004341124556958675, "attnres/block_norm/1": 47540.25, "attnres/final_alpha/block_2": 0.010251177474856377, "attnres/block_norm/2": 28952.39453125, "attnres/final_alpha/block_3": 0.012117085978388786, "attnres/block_norm/3": 60144.859375, "attnres/final_alpha/block_4": 0.014417305588722229, "attnres/block_norm/4": 15521.638671875, "attnres/final_alpha/block_5": 0.6089007258415222, "attnres/block_norm/5": 6739.1181640625, "attnres/final_alpha/block_6": 0.10991337150335312, "attnres/block_norm/6": 39864.0, "geo/tier1_time_s": 1.3608067035675049, "geo/step": 68925.0, "geo/rankme_slope": 1.4399998280562227e-05} {"step": 68930, "timestamp": 1778269041.6586673, "train/loss": 2.152444291114807, "train/z_loss": 0.0013843229506164789, "train/perplexity": 8.605867956559416, "train/grad_norm": 0.138671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1695284.0845167658, "perf/iters_per_sec": 0.808374445207961, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2370504856109619, "data/tokens_consumed": 144558784512, "data/tokens_consumed_B": 144.558784512, "train/loss_slope": 1.4735602345368246e-06} {"step": 68940, "timestamp": 1778269052.0394027, "train/loss": 2.172556471824646, "train/z_loss": 0.001377939188387245, "train/perplexity": 8.780702991621359, "train/grad_norm": 0.193359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021535.5197588413, "perf/iters_per_sec": 0.9639432524484831, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0374054670333863, "data/tokens_consumed": 144579756032, "data/tokens_consumed_B": 144.579756032, "train/loss_slope": 4.213930077642973e-06} {"step": 68950, "timestamp": 1778269062.3770418, "grad/layer_0/attn": 0.002713932655751705, "grad/layer_0/mlp": 0.0028540242929011583, "grad/layer_0/attn_mlp_ratio": 0.9509143168159653, "grad/layer_4/attn": 0.001971189398318529, "grad/layer_4/mlp": 0.0025771346408873796, "grad/layer_4/attn_mlp_ratio": 0.7648763438887147, "grad/layer_8/attn": 0.004170068074017763, "grad/layer_8/mlp": 0.003676476888358593, "grad/layer_8/attn_mlp_ratio": 1.1342565415810042, "grad/layer_12/attn": 0.006559442263096571, "grad/layer_12/mlp": 0.00674697570502758, "grad/layer_12/attn_mlp_ratio": 0.9722047999947967, "grad/layer_16/attn": 0.003989247605204582, "grad/layer_16/mlp": 0.0045452113263309, "grad/layer_16/attn_mlp_ratio": 0.8776814169951349, "grad/layer_20/attn": 0.004295097663998604, "grad/layer_20/mlp": 0.005745185539126396, "grad/layer_20/attn_mlp_ratio": 0.7475994569692808, "grad/layer_24/attn": 0.005506204906851053, "grad/layer_24/mlp": 0.008933077566325665, "grad/layer_24/attn_mlp_ratio": 0.6163838614778163, "grad/layer_27/attn": 0.0037963746581226587, "grad/layer_27/mlp": 0.007155084516853094, "grad/layer_27/attn_mlp_ratio": 0.5305841735513056} {"step": 68950, "timestamp": 1778269062.3923028, "train/loss": 2.1309755444526672, "train/z_loss": 0.0013838796759955585, "train/perplexity": 8.423079893539947, "train/grad_norm": 0.0927734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026760.2851830781, "perf/iters_per_sec": 0.9664346147456542, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034731149673462, "data/tokens_consumed": 144600727552, "data/tokens_consumed_B": 144.600727552, "train/loss_slope": 4.6244409396918545e-06} {"step": 68960, "timestamp": 1778269072.759245, "train/loss": 2.1338085174560546, "train/z_loss": 0.0013675482710823416, "train/perplexity": 8.446976084133096, "train/grad_norm": 0.1884765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024078.1342523047, "perf/iters_per_sec": 0.9651556655179523, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036102294921875, "data/tokens_consumed": 144621699072, "data/tokens_consumed_B": 144.621699072, "train/loss_slope": 6.638848427498726e-06} {"step": 68970, "timestamp": 1778269083.1034248, "train/loss": 2.140269422531128, "train/z_loss": 0.001376903976779431, "train/perplexity": 8.501727877393682, "train/grad_norm": 0.09765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2029148.474681277, "perf/iters_per_sec": 0.9675733922392259, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033513331413269, "data/tokens_consumed": 144642670592, "data/tokens_consumed_B": 144.642670592, "train/loss_slope": 8.35403569138806e-06} {"step": 68980, "timestamp": 1778269093.4521434, "train/loss": 2.1005170345306396, "train/z_loss": 0.0013604694744572044, "train/perplexity": 8.170393196093144, "train/grad_norm": 0.1259765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027565.0552461839, "perf/iters_per_sec": 0.9668183590155525, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343204498291017, "data/tokens_consumed": 144663642112, "data/tokens_consumed_B": 144.663642112, "train/loss_slope": 7.383980094367184e-06} {"step": 68990, "timestamp": 1778269103.8215137, "train/loss": 2.1364256143569946, "train/z_loss": 0.0013627468491904438, "train/perplexity": 8.469111591815272, "train/grad_norm": 0.1923828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023738.1380043672, "perf/iters_per_sec": 0.9649935426732861, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036276364326477, "data/tokens_consumed": 144684613632, "data/tokens_consumed_B": 144.684613632, "train/loss_slope": 6.4493204214678354e-06} {"step": 69000, "timestamp": 1778269114.1658795, "grad/layer_0/attn": 0.0029341378249228, "grad/layer_0/mlp": 0.0031629945151507854, "grad/layer_0/attn_mlp_ratio": 0.9276455327708247, "grad/layer_4/attn": 0.001947069657035172, "grad/layer_4/mlp": 0.002556430408731103, "grad/layer_4/attn_mlp_ratio": 0.7616360587096928, "grad/layer_8/attn": 0.004612843040376902, "grad/layer_8/mlp": 0.003558748634532094, "grad/layer_8/attn_mlp_ratio": 1.296198013536744, "grad/layer_12/attn": 0.005647706799209118, "grad/layer_12/mlp": 0.0068871090188622475, "grad/layer_12/attn_mlp_ratio": 0.8200402667849875, "grad/layer_16/attn": 0.0036381178069859743, "grad/layer_16/mlp": 0.004780799616128206, "grad/layer_16/attn_mlp_ratio": 0.7609851955756792, "grad/layer_20/attn": 0.0028903691563755274, "grad/layer_20/mlp": 0.0056471750140190125, "grad/layer_20/attn_mlp_ratio": 0.5118256646938816, "grad/layer_24/attn": 0.007309848442673683, "grad/layer_24/mlp": 0.008977647870779037, "grad/layer_24/attn_mlp_ratio": 0.8142275645543462, "grad/layer_27/attn": 0.007045044098049402, "grad/layer_27/mlp": 0.007920606061816216, "grad/layer_27/attn_mlp_ratio": 0.8894576947926364} {"step": 69000, "timestamp": 1778269114.7615967, "eos/sharpness": 28.513121604919426, "eos/L0_probe": 1.9633920192718506, "eos/L_plus": 2.102562665939331, "eos/L_minus": 2.1093525886535645, "eos/grad_norm": 0.10866061598062515, "eos/embed_grad_frac": 0.1987590342760086, "eos/time_s": 0.592930793762207} {"step": 69000, "timestamp": 1778269114.778982, "train/loss": 2.1647126197814943, "train/z_loss": 0.0013735695858485997, "train/perplexity": 8.71209787247183, "train/grad_norm": 0.1083984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1915353.8028272553, "perf/iters_per_sec": 0.913311864293697, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0949162483215331, "data/tokens_consumed": 144705585152, "data/tokens_consumed_B": 144.705585152, "train/loss_slope": 5.269109834395799e-06} {"step": 69000, "timestamp": 1778269116.1457896, "geo/rankme_last": 438.1995849609375, "geo/layer_0/stable_rank_q_proj": 19.336902618408203, "geo/layer_0/stable_rank_k_proj": 16.055339813232422, "geo/layer_0/stable_rank_o_proj": 46.892093658447266, "geo/layer_0/stable_rank_gate_proj": 129.802978515625, "geo/layer_0/stable_rank_down_proj": 55.669097900390625, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06502377986907959, "geo/layer_0/attn_entropy_mean": 6.157935619354248, "geo/layer_0/attn_entropy_std": 0.42266908288002014, "geo/layer_7/stable_rank_q_proj": 43.23846435546875, "geo/layer_7/stable_rank_k_proj": 40.31369400024414, "geo/layer_7/stable_rank_o_proj": 89.33731842041016, "geo/layer_7/stable_rank_gate_proj": 79.55538177490234, "geo/layer_7/stable_rank_down_proj": 139.47254943847656, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4340211749076843, "geo/layer_7/attn_entropy_mean": 4.616813659667969, "geo/layer_7/attn_entropy_std": 0.8017139434814453, "geo/layer_14/stable_rank_q_proj": 50.37776565551758, "geo/layer_14/stable_rank_k_proj": 40.702857971191406, "geo/layer_14/stable_rank_o_proj": 43.40849685668945, "geo/layer_14/stable_rank_gate_proj": 71.42727661132812, "geo/layer_14/stable_rank_down_proj": 127.38899993896484, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38628560304641724, "geo/layer_14/attn_entropy_mean": 5.536462783813477, "geo/layer_14/attn_entropy_std": 0.42515116930007935, "geo/layer_21/stable_rank_q_proj": 40.045101165771484, "geo/layer_21/stable_rank_k_proj": 30.07511329650879, "geo/layer_21/stable_rank_o_proj": 69.57159423828125, "geo/layer_21/stable_rank_gate_proj": 64.76099395751953, "geo/layer_21/stable_rank_down_proj": 50.469146728515625, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14543458819389343, "geo/layer_21/attn_entropy_mean": 5.68031120300293, "geo/layer_21/attn_entropy_std": 0.3064207434654236, "geo/layer_27/stable_rank_q_proj": 43.604026794433594, "geo/layer_27/stable_rank_k_proj": 31.789472579956055, "geo/layer_27/stable_rank_o_proj": 115.339111328125, "geo/layer_27/stable_rank_gate_proj": 79.25946807861328, "geo/layer_27/stable_rank_down_proj": 127.48643493652344, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10005629062652588, "geo/layer_27/attn_entropy_mean": 4.170034408569336, "geo/layer_27/attn_entropy_std": 0.7538148760795593, "attnres/final_alpha/block_0": 0.2390938252210617, "attnres/block_norm/0": 1.770680546760559, "attnres/final_alpha/block_1": 0.00420205295085907, "attnres/block_norm/1": 47689.265625, "attnres/final_alpha/block_2": 0.010003813542425632, "attnres/block_norm/2": 29003.638671875, "attnres/final_alpha/block_3": 0.012125993147492409, "attnres/block_norm/3": 60275.55859375, "attnres/final_alpha/block_4": 0.014364782720804214, "attnres/block_norm/4": 15512.03125, "attnres/final_alpha/block_5": 0.6128919124603271, "attnres/block_norm/5": 6766.81591796875, "attnres/final_alpha/block_6": 0.10731765627861023, "attnres/block_norm/6": 40331.45703125, "geo/tier1_time_s": 1.3631415367126465, "geo/step": 69000.0, "geo/rankme_slope": 1.1528517657062822e-05} {"step": 69000, "timestamp": 1778269123.0394642, "geo/ww_alpha_mean": 7.5129019000646675, "geo/ww_alpha_std": 4.832647043846905, "geo/ww_alpha_min": 1.3454736583819735, "geo/ww_alpha_max": 43.25866735426894, "geo/ww_alpha_healthy_frac": 0.18274111675126903, "geo/ww_alpha_by_type/q_proj": 3.901627349121866, "geo/ww_alpha_by_type/k_proj": 4.4492379598903415, "geo/ww_alpha_by_type/v_proj": 7.385600388056747, "geo/ww_alpha_by_type/o_proj": 7.839409320289191, "geo/ww_alpha_by_type/gate_proj": 7.9698312643872375, "geo/ww_alpha_by_type/up_proj": 13.479049133921624, "geo/ww_alpha_by_type/down_proj": 7.6550173333977245, "geo/twonn_id/layer_0": 0.7145963907241821, "geo/twonn_id/layer_7": 3.400663137435913, "geo/twonn_id/layer_14": 4.77600622177124, "geo/twonn_id/layer_21": 6.3186821937561035, "geo/twonn_id/layer_27": 5.101499557495117, "geo/tier2_time_s": 6.887626886367798} {"step": 69000, "timestamp": 1778269123.961399, "eoc/jacobian_sigma/layer_0/attn": 1370.7646484375, "eoc/jacobian_sigma/layer_0/mlp": 9322.490234375, "eoc/jacobian_sigma/layer_0": 9322.490234375, "eoc/jacobian_sigma/layer_7/attn": 1.1565546989440918, "eoc/jacobian_sigma/layer_7/mlp": 1.7593117952346802, "eoc/jacobian_sigma/layer_7": 1.7593117952346802, "eoc/jacobian_sigma/layer_14/attn": 1.4657537937164307, "eoc/jacobian_sigma/layer_14/mlp": 6.213070869445801, "eoc/jacobian_sigma/layer_14": 6.213070869445801, "eoc/jacobian_sigma/layer_21/attn": 1.0945155620574951, "eoc/jacobian_sigma/layer_21/mlp": 4.051895618438721, "eoc/jacobian_sigma/layer_21": 4.051895618438721, "eoc/jacobian_sigma/layer_27/attn": 3.0867409706115723, "eoc/jacobian_sigma/layer_27/mlp": 27.03896713256836, "eoc/jacobian_sigma/layer_27": 27.03896713256836, "eoc/layer0_sigma": 9322.490234375, "eoc/sigma_max": 27.03896713256836, "eoc/sigma_min": 1.7593117952346802, "eoc/sigma_mean": 9.76581135392189, "eoc/time_s": 0.9154503345489502} {"step": 69010, "timestamp": 1778269134.3232908, "train/loss": 2.1513110399246216, "train/z_loss": 0.0013759490451775492, "train/perplexity": 8.596120870446684, "train/grad_norm": 0.24609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1073221.641645225, "perf/iters_per_sec": 0.5117519577242017, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.9540716648101806, "data/tokens_consumed": 144726556672, "data/tokens_consumed_B": 144.726556672, "train/loss_slope": 5.582626674971469e-06} {"step": 69020, "timestamp": 1778269145.2420409, "train/loss": 2.1690993309020996, "train/z_loss": 0.0013589530833996832, "train/perplexity": 8.75039927626957, "train/grad_norm": 0.1572265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1922469.357844981, "perf/iters_per_sec": 0.9167048253273873, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0908636808395387, "data/tokens_consumed": 144747528192, "data/tokens_consumed_B": 144.747528192, "train/loss_slope": 5.800798731167873e-06} {"step": 69030, "timestamp": 1778269155.6229198, "train/loss": 2.0845534801483154, "train/z_loss": 0.0013649333850480616, "train/perplexity": 8.041000213748305, "train/grad_norm": 0.091796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021429.0405839873, "perf/iters_per_sec": 0.9638924792213379, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0374601125717162, "data/tokens_consumed": 144768499712, "data/tokens_consumed_B": 144.768499712, "train/loss_slope": 2.5603812197969956e-06} {"step": 69040, "timestamp": 1778269166.003888, "train/loss": 2.105192279815674, "train/z_loss": 0.0013869132264517248, "train/perplexity": 8.208681221574002, "train/grad_norm": 0.1669921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021661.0606635092, "perf/iters_per_sec": 0.9640031150167032, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037341046333313, "data/tokens_consumed": 144789471232, "data/tokens_consumed_B": 144.789471232, "train/loss_slope": 2.854699241554249e-07} {"step": 69050, "timestamp": 1778269176.3724477, "grad/layer_0/attn": 0.004062001593410969, "grad/layer_0/mlp": 0.0037400436121970415, "grad/layer_0/attn_mlp_ratio": 1.086083988848568, "grad/layer_4/attn": 0.0026768369134515524, "grad/layer_4/mlp": 0.0028748486656695604, "grad/layer_4/attn_mlp_ratio": 0.9311226891019115, "grad/layer_8/attn": 0.005085454788058996, "grad/layer_8/mlp": 0.0040371958166360855, "grad/layer_8/attn_mlp_ratio": 1.2596502357250852, "grad/layer_12/attn": 0.008421117439866066, "grad/layer_12/mlp": 0.00733238086104393, "grad/layer_12/attn_mlp_ratio": 1.1484833486702974, "grad/layer_16/attn": 0.0074533866718411446, "grad/layer_16/mlp": 0.00529901310801506, "grad/layer_16/attn_mlp_ratio": 1.4065612557008673, "grad/layer_20/attn": 0.006547337397933006, "grad/layer_20/mlp": 0.0063539836555719376, "grad/layer_20/attn_mlp_ratio": 1.030430301649971, "grad/layer_24/attn": 0.007043262477964163, "grad/layer_24/mlp": 0.008404071442782879, "grad/layer_24/attn_mlp_ratio": 0.8380774059464866, "grad/layer_27/attn": 0.004704191815108061, "grad/layer_27/mlp": 0.007063546683639288, "grad/layer_27/attn_mlp_ratio": 0.6659815471179427} {"step": 69050, "timestamp": 1778269176.3894472, "train/loss": 2.126007914543152, "train/z_loss": 0.001383466727565974, "train/perplexity": 8.38134090768175, "train/grad_norm": 0.10791015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020688.2245680375, "perf/iters_per_sec": 0.9635392306175411, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0378404617309571, "data/tokens_consumed": 144810442752, "data/tokens_consumed_B": 144.810442752, "train/loss_slope": 9.906549694085963e-07} {"step": 69060, "timestamp": 1778269186.7690582, "train/loss": 2.1611906766891478, "train/z_loss": 0.001366354408673942, "train/perplexity": 8.681468328965561, "train/grad_norm": 0.2177734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021498.771097102, "perf/iters_per_sec": 0.9639257293210516, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0374243259429932, "data/tokens_consumed": 144831414272, "data/tokens_consumed_B": 144.831414272, "train/loss_slope": 4.029990666055033e-06} {"step": 69070, "timestamp": 1778269197.148509, "train/loss": 2.1976857423782348, "train/z_loss": 0.0013623466598801314, "train/perplexity": 9.004151442554653, "train/grad_norm": 0.13671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021358.943271561, "perf/iters_per_sec": 0.9638590542180829, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0374960899353027, "data/tokens_consumed": 144852385792, "data/tokens_consumed_B": 144.852385792, "train/loss_slope": 8.976627354717802e-06} {"step": 69075, "timestamp": 1778269202.9632297, "eos/sharpness": 62.34502792358397, "eos/L0_probe": 1.9704841375350952, "eos/L_plus": 2.2930312156677246, "eos/L_minus": 2.2713873386383057, "eos/grad_norm": 0.24813313782215118, "eos/embed_grad_frac": 0.052527476102113724, "eos/time_s": 0.6340889930725098} {"step": 69075, "timestamp": 1778269204.3478782, "geo/rankme_last": 439.0856018066406, "geo/layer_0/stable_rank_q_proj": 19.320348739624023, "geo/layer_0/stable_rank_k_proj": 16.047595977783203, "geo/layer_0/stable_rank_o_proj": 46.78839874267578, "geo/layer_0/stable_rank_gate_proj": 129.86582946777344, "geo/layer_0/stable_rank_down_proj": 55.655147552490234, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0700809434056282, "geo/layer_0/attn_entropy_mean": 6.153112888336182, "geo/layer_0/attn_entropy_std": 0.426310271024704, "geo/layer_7/stable_rank_q_proj": 43.13893127441406, "geo/layer_7/stable_rank_k_proj": 40.25172424316406, "geo/layer_7/stable_rank_o_proj": 89.50150299072266, "geo/layer_7/stable_rank_gate_proj": 79.32471466064453, "geo/layer_7/stable_rank_down_proj": 139.34730529785156, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4519299864768982, "geo/layer_7/attn_entropy_mean": 4.651068687438965, "geo/layer_7/attn_entropy_std": 0.8106804490089417, "geo/layer_14/stable_rank_q_proj": 50.41932678222656, "geo/layer_14/stable_rank_k_proj": 40.68848419189453, "geo/layer_14/stable_rank_o_proj": 43.3939208984375, "geo/layer_14/stable_rank_gate_proj": 71.4661865234375, "geo/layer_14/stable_rank_down_proj": 127.2557144165039, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.385000079870224, "geo/layer_14/attn_entropy_mean": 5.555475234985352, "geo/layer_14/attn_entropy_std": 0.4130391776561737, "geo/layer_21/stable_rank_q_proj": 40.0504264831543, "geo/layer_21/stable_rank_k_proj": 30.127670288085938, "geo/layer_21/stable_rank_o_proj": 69.58099365234375, "geo/layer_21/stable_rank_gate_proj": 64.70404815673828, "geo/layer_21/stable_rank_down_proj": 50.45543670654297, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14316487312316895, "geo/layer_21/attn_entropy_mean": 5.6952433586120605, "geo/layer_21/attn_entropy_std": 0.3133540153503418, "geo/layer_27/stable_rank_q_proj": 43.6216926574707, "geo/layer_27/stable_rank_k_proj": 31.803138732910156, "geo/layer_27/stable_rank_o_proj": 115.52256774902344, "geo/layer_27/stable_rank_gate_proj": 79.18512725830078, "geo/layer_27/stable_rank_down_proj": 127.52534484863281, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08780565857887268, "geo/layer_27/attn_entropy_mean": 4.159872055053711, "geo/layer_27/attn_entropy_std": 0.7593665719032288, "attnres/final_alpha/block_0": 0.23653379082679749, "attnres/block_norm/0": 1.7706927061080933, "attnres/final_alpha/block_1": 0.004231748171150684, "attnres/block_norm/1": 47664.2578125, "attnres/final_alpha/block_2": 0.009954575449228287, "attnres/block_norm/2": 28828.216796875, "attnres/final_alpha/block_3": 0.011857856065034866, "attnres/block_norm/3": 59893.76171875, "attnres/final_alpha/block_4": 0.0140709662809968, "attnres/block_norm/4": 15542.0771484375, "attnres/final_alpha/block_5": 0.6180324554443359, "attnres/block_norm/5": 6727.5322265625, "attnres/final_alpha/block_6": 0.105318583548069, "attnres/block_norm/6": 40264.8828125, "geo/tier1_time_s": 1.3645806312561035, "geo/step": 69075.0, "geo/rankme_slope": 5.123705732292917e-05} {"step": 69080, "timestamp": 1778269209.543226, "train/loss": 2.152722191810608, "train/z_loss": 0.0013815017999149859, "train/perplexity": 8.608259865593716, "train/grad_norm": 0.1494140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1692907.3556062104, "perf/iters_per_sec": 0.8072411325484325, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2387872219085694, "data/tokens_consumed": 144873357312, "data/tokens_consumed_B": 144.873357312, "train/loss_slope": 1.0723231549095682e-05} {"step": 69090, "timestamp": 1778269219.9304838, "train/loss": 2.157891273498535, "train/z_loss": 0.001385451457463205, "train/perplexity": 8.652871866233859, "train/grad_norm": 0.10791015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019861.1197604924, "perf/iters_per_sec": 0.9631448363115751, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0382654428482057, "data/tokens_consumed": 144894328832, "data/tokens_consumed_B": 144.894328832, "train/loss_slope": 1.0098593012549147e-05} {"step": 69100, "timestamp": 1778269230.300853, "grad/layer_0/attn": 0.0026871799491345882, "grad/layer_0/mlp": 0.0031251024920493364, "grad/layer_0/attn_mlp_ratio": 0.8598693546801056, "grad/layer_4/attn": 0.001999578671529889, "grad/layer_4/mlp": 0.00269201654009521, "grad/layer_4/attn_mlp_ratio": 0.7427809478395976, "grad/layer_8/attn": 0.005103674717247486, "grad/layer_8/mlp": 0.003690899582579732, "grad/layer_8/attn_mlp_ratio": 1.3827725368250332, "grad/layer_12/attn": 0.00485970126464963, "grad/layer_12/mlp": 0.006574657279998064, "grad/layer_12/attn_mlp_ratio": 0.7391565801488291, "grad/layer_16/attn": 0.005622127093374729, "grad/layer_16/mlp": 0.004838242195546627, "grad/layer_16/attn_mlp_ratio": 1.1620185079506316, "grad/layer_20/attn": 0.0036452922504395247, "grad/layer_20/mlp": 0.006588623393326998, "grad/layer_20/attn_mlp_ratio": 0.5532706875922567, "grad/layer_24/attn": 0.017184078693389893, "grad/layer_24/mlp": 0.014638748951256275, "grad/layer_24/attn_mlp_ratio": 1.1738761716060144, "grad/layer_27/attn": 0.006280585657805204, "grad/layer_27/mlp": 0.015009894967079163, "grad/layer_27/attn_mlp_ratio": 0.41842968453392193} {"step": 69100, "timestamp": 1778269230.3178258, "train/loss": 2.165045714378357, "train/z_loss": 0.0013752279686741532, "train/perplexity": 8.715000308566545, "train/grad_norm": 0.25, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020181.0234405354, "perf/iters_per_sec": 0.9632973782732656, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.038101029396057, "data/tokens_consumed": 144915300352, "data/tokens_consumed_B": 144.915300352, "train/loss_slope": 1.309781412873731e-05} {"step": 69110, "timestamp": 1778269240.6962626, "train/loss": 2.1448230743408203, "train/z_loss": 0.00137750580906868, "train/perplexity": 8.54053006470399, "train/grad_norm": 0.162109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021987.4377446629, "perf/iters_per_sec": 0.9641587437365832, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03717360496521, "data/tokens_consumed": 144936271872, "data/tokens_consumed_B": 144.936271872, "train/loss_slope": 7.258615163293286e-06} {"step": 69120, "timestamp": 1778269251.6699452, "train/loss": 2.0835558772087097, "train/z_loss": 0.0013959677889943123, "train/perplexity": 8.032982488215923, "train/grad_norm": 0.15234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1911992.212258975, "perf/iters_per_sec": 0.9117089330000758, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.096841287612915, "data/tokens_consumed": 144957243392, "data/tokens_consumed_B": 144.957243392, "train/loss_slope": 5.74686296201495e-06} {"step": 69130, "timestamp": 1778269262.0193934, "train/loss": 2.117946982383728, "train/z_loss": 0.0013692602748051285, "train/perplexity": 8.314051061030787, "train/grad_norm": 0.10888671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027863.0934406193, "perf/iters_per_sec": 0.966960474701223, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341684341430664, "data/tokens_consumed": 144978214912, "data/tokens_consumed_B": 144.978214912, "train/loss_slope": 5.366723186219428e-06} {"step": 69140, "timestamp": 1778269272.3640614, "train/loss": 2.1419073820114134, "train/z_loss": 0.0013923478545621037, "train/perplexity": 8.515664774089347, "train/grad_norm": 0.18359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028668.7397049495, "perf/iters_per_sec": 0.9673446367764232, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033757734298706, "data/tokens_consumed": 144999186432, "data/tokens_consumed_B": 144.999186432, "train/loss_slope": 2.032094076164084e-06} {"step": 69150, "timestamp": 1778269282.736571, "grad/layer_0/attn": 0.0031595956534147263, "grad/layer_0/mlp": 0.0032749189995229244, "grad/layer_0/attn_mlp_ratio": 0.9647858641378353, "grad/layer_4/attn": 0.003449121955782175, "grad/layer_4/mlp": 0.0026084331329911947, "grad/layer_4/attn_mlp_ratio": 1.3222964314968986, "grad/layer_8/attn": 0.00438313465565443, "grad/layer_8/mlp": 0.0037600183859467506, "grad/layer_8/attn_mlp_ratio": 1.1657215707945603, "grad/layer_12/attn": 0.004998933058232069, "grad/layer_12/mlp": 0.0072566973976790905, "grad/layer_12/attn_mlp_ratio": 0.6888716333884495, "grad/layer_16/attn": 0.0037731651682406664, "grad/layer_16/mlp": 0.004724440164864063, "grad/layer_16/attn_mlp_ratio": 0.7986480845788051, "grad/layer_20/attn": 0.003724190406501293, "grad/layer_20/mlp": 0.005365245509892702, "grad/layer_20/attn_mlp_ratio": 0.694132324461428, "grad/layer_24/attn": 0.010174465365707874, "grad/layer_24/mlp": 0.009544315747916698, "grad/layer_24/attn_mlp_ratio": 1.0660235398568378, "grad/layer_27/attn": 0.00986730307340622, "grad/layer_27/mlp": 0.00921335443854332, "grad/layer_27/attn_mlp_ratio": 1.0709783317386907} {"step": 69150, "timestamp": 1778269283.3613527, "eos/sharpness": 82.68113136291502, "eos/L0_probe": 1.9710389375686646, "eos/L_plus": 2.476372003555298, "eos/L_minus": 2.2925171852111816, "eos/grad_norm": 0.19822487235069275, "eos/embed_grad_frac": 0.05541596934199333, "eos/time_s": 0.6218118667602539} {"step": 69150, "timestamp": 1778269283.383877, "train/loss": 2.103879725933075, "train/z_loss": 0.0013969960040412844, "train/perplexity": 8.1979139530215, "train/grad_norm": 0.1982421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1904059.867331551, "perf/iters_per_sec": 0.907926496186996, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1014107465744019, "data/tokens_consumed": 145020157952, "data/tokens_consumed_B": 145.020157952, "train/loss_slope": 1.528089095835887e-06} {"step": 69150, "timestamp": 1778269284.747888, "geo/rankme_last": 438.4056701660156, "geo/layer_0/stable_rank_q_proj": 19.301462173461914, "geo/layer_0/stable_rank_k_proj": 16.046283721923828, "geo/layer_0/stable_rank_o_proj": 46.7156982421875, "geo/layer_0/stable_rank_gate_proj": 129.9106903076172, "geo/layer_0/stable_rank_down_proj": 55.552490234375, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06394071877002716, "geo/layer_0/attn_entropy_mean": 6.154518127441406, "geo/layer_0/attn_entropy_std": 0.421468049287796, "geo/layer_7/stable_rank_q_proj": 43.15526580810547, "geo/layer_7/stable_rank_k_proj": 40.377342224121094, "geo/layer_7/stable_rank_o_proj": 89.51544189453125, "geo/layer_7/stable_rank_gate_proj": 79.2232894897461, "geo/layer_7/stable_rank_down_proj": 139.68760681152344, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.436581015586853, "geo/layer_7/attn_entropy_mean": 4.650165557861328, "geo/layer_7/attn_entropy_std": 0.7908165454864502, "geo/layer_14/stable_rank_q_proj": 50.33225631713867, "geo/layer_14/stable_rank_k_proj": 40.73210906982422, "geo/layer_14/stable_rank_o_proj": 43.312957763671875, "geo/layer_14/stable_rank_gate_proj": 71.39579010009766, "geo/layer_14/stable_rank_down_proj": 127.63687133789062, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3982428312301636, "geo/layer_14/attn_entropy_mean": 5.536016464233398, "geo/layer_14/attn_entropy_std": 0.41657841205596924, "geo/layer_21/stable_rank_q_proj": 40.078529357910156, "geo/layer_21/stable_rank_k_proj": 30.010892868041992, "geo/layer_21/stable_rank_o_proj": 69.50177764892578, "geo/layer_21/stable_rank_gate_proj": 64.69265747070312, "geo/layer_21/stable_rank_down_proj": 50.44468307495117, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14594511687755585, "geo/layer_21/attn_entropy_mean": 5.720860958099365, "geo/layer_21/attn_entropy_std": 0.2976815104484558, "geo/layer_27/stable_rank_q_proj": 43.5689811706543, "geo/layer_27/stable_rank_k_proj": 31.893558502197266, "geo/layer_27/stable_rank_o_proj": 115.49750518798828, "geo/layer_27/stable_rank_gate_proj": 79.21553039550781, "geo/layer_27/stable_rank_down_proj": 127.36692810058594, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09907585382461548, "geo/layer_27/attn_entropy_mean": 4.188704490661621, "geo/layer_27/attn_entropy_std": 0.7353545427322388, "attnres/final_alpha/block_0": 0.23447354137897491, "attnres/block_norm/0": 1.7707774639129639, "attnres/final_alpha/block_1": 0.004048878327012062, "attnres/block_norm/1": 47488.80859375, "attnres/final_alpha/block_2": 0.009833856485784054, "attnres/block_norm/2": 28840.3984375, "attnres/final_alpha/block_3": 0.011814301833510399, "attnres/block_norm/3": 60300.76171875, "attnres/final_alpha/block_4": 0.013763410970568657, "attnres/block_norm/4": 15560.8125, "attnres/final_alpha/block_5": 0.6223528981208801, "attnres/block_norm/5": 6662.271484375, "attnres/final_alpha/block_6": 0.10371311753988266, "attnres/block_norm/6": 40423.9375, "geo/tier1_time_s": 1.359710454940796, "geo/step": 69150.0, "geo/rankme_slope": -1.0629935567977188e-05} {"step": 69160, "timestamp": 1778269295.12781, "train/loss": 2.1237619400024412, "train/z_loss": 0.0013784440117888153, "train/perplexity": 8.362537752993658, "train/grad_norm": 0.19921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1786352.3680084164, "perf/iters_per_sec": 0.8517991867105562, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1739856243133544, "data/tokens_consumed": 145041129472, "data/tokens_consumed_B": 145.041129472, "train/loss_slope": 2.0891485434554526e-06} {"step": 69170, "timestamp": 1778269305.5088055, "train/loss": 2.1592583656311035, "train/z_loss": 0.001383378985337913, "train/perplexity": 8.664709228825332, "train/grad_norm": 0.2470703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021196.1913235441, "perf/iters_per_sec": 0.9637814480416985, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03757963180542, "data/tokens_consumed": 145062100992, "data/tokens_consumed_B": 145.062100992, "train/loss_slope": 3.921557116572887e-06} {"step": 69180, "timestamp": 1778269315.8935447, "train/loss": 2.110408663749695, "train/z_loss": 0.0013893818482756614, "train/perplexity": 8.251612730685398, "train/grad_norm": 0.140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020510.9606410419, "perf/iters_per_sec": 0.9634547045903405, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0379315137863159, "data/tokens_consumed": 145083072512, "data/tokens_consumed_B": 145.083072512, "train/loss_slope": 5.171550561313883e-07} {"step": 69190, "timestamp": 1778269326.5827289, "train/loss": 2.1377110958099363, "train/z_loss": 0.0013622769736684858, "train/perplexity": 8.480005478133707, "train/grad_norm": 0.212890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1963237.1487964687, "perf/iters_per_sec": 0.9361444229109138, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0682112455368042, "data/tokens_consumed": 145104044032, "data/tokens_consumed_B": 145.104044032, "train/loss_slope": -1.2580786601151446e-06} {"step": 69200, "timestamp": 1778269336.9512868, "grad/layer_0/attn": 0.002447891980409622, "grad/layer_0/mlp": 0.003018753370270133, "grad/layer_0/attn_mlp_ratio": 0.8108949619495005, "grad/layer_4/attn": 0.0038040094077587128, "grad/layer_4/mlp": 0.0024207455571740866, "grad/layer_4/attn_mlp_ratio": 1.571420523459458, "grad/layer_8/attn": 0.0040431758388876915, "grad/layer_8/mlp": 0.003673794213682413, "grad/layer_8/attn_mlp_ratio": 1.100544966230031, "grad/layer_12/attn": 0.004609298892319202, "grad/layer_12/mlp": 0.006725400220602751, "grad/layer_12/attn_mlp_ratio": 0.6853568074154586, "grad/layer_16/attn": 0.003315533511340618, "grad/layer_16/mlp": 0.004437790252268314, "grad/layer_16/attn_mlp_ratio": 0.7471135966677492, "grad/layer_20/attn": 0.0041224961169064045, "grad/layer_20/mlp": 0.005635528825223446, "grad/layer_20/attn_mlp_ratio": 0.7315189348873673, "grad/layer_24/attn": 0.013024994172155857, "grad/layer_24/mlp": 0.009681726805865765, "grad/layer_24/attn_mlp_ratio": 1.3453172454455975, "grad/layer_27/attn": 0.004348766058683395, "grad/layer_27/mlp": 0.008287470787763596, "grad/layer_27/attn_mlp_ratio": 0.52473982926496} {"step": 69200, "timestamp": 1778269336.9685016, "train/loss": 2.1237114667892456, "train/z_loss": 0.001369154849089682, "train/perplexity": 8.362115679494588, "train/grad_norm": 0.1435546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020412.6642812297, "perf/iters_per_sec": 0.963407833233466, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0379820108413695, "data/tokens_consumed": 145125015552, "data/tokens_consumed_B": 145.125015552, "train/loss_slope": -4.948361213951898e-06} {"step": 69210, "timestamp": 1778269347.8562188, "train/loss": 2.152144157886505, "train/z_loss": 0.0013675455935299396, "train/perplexity": 8.603285437196597, "train/grad_norm": 0.2138671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1927166.6701775668, "perf/iters_per_sec": 0.91894467839125, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0882047891616822, "data/tokens_consumed": 145145987072, "data/tokens_consumed_B": 145.145987072, "train/loss_slope": -4.833814577765925e-06} {"step": 69220, "timestamp": 1778269358.2392097, "train/loss": 2.1716291904449463, "train/z_loss": 0.001374727615620941, "train/perplexity": 8.772564583117038, "train/grad_norm": 0.173828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021125.6921944518, "perf/iters_per_sec": 0.9637478314373263, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0376158237457276, "data/tokens_consumed": 145166958592, "data/tokens_consumed_B": 145.166958592, "train/loss_slope": -4.4114130164924036e-06} {"step": 69225, "timestamp": 1778269364.0313575, "eos/sharpness": 64.82965946197508, "eos/L0_probe": 1.9706168174743652, "eos/L_plus": 2.2395944595336914, "eos/L_minus": 2.34993577003479, "eos/grad_norm": 0.13698966801166534, "eos/embed_grad_frac": 0.12243741005659103, "eos/time_s": 0.6067843437194824} {"step": 69225, "timestamp": 1778269365.4079645, "geo/rankme_last": 438.41485595703125, "geo/layer_0/stable_rank_q_proj": 19.271116256713867, "geo/layer_0/stable_rank_k_proj": 16.00282096862793, "geo/layer_0/stable_rank_o_proj": 46.76766586303711, "geo/layer_0/stable_rank_gate_proj": 129.77630615234375, "geo/layer_0/stable_rank_down_proj": 55.59467315673828, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06881503015756607, "geo/layer_0/attn_entropy_mean": 6.148550510406494, "geo/layer_0/attn_entropy_std": 0.4214988648891449, "geo/layer_7/stable_rank_q_proj": 43.1274528503418, "geo/layer_7/stable_rank_k_proj": 40.23863220214844, "geo/layer_7/stable_rank_o_proj": 89.45647430419922, "geo/layer_7/stable_rank_gate_proj": 79.17160034179688, "geo/layer_7/stable_rank_down_proj": 139.47183227539062, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.44466474652290344, "geo/layer_7/attn_entropy_mean": 4.636044502258301, "geo/layer_7/attn_entropy_std": 0.8063154220581055, "geo/layer_14/stable_rank_q_proj": 50.333221435546875, "geo/layer_14/stable_rank_k_proj": 40.751834869384766, "geo/layer_14/stable_rank_o_proj": 43.34587860107422, "geo/layer_14/stable_rank_gate_proj": 71.33944702148438, "geo/layer_14/stable_rank_down_proj": 127.54913330078125, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3888856768608093, "geo/layer_14/attn_entropy_mean": 5.534351348876953, "geo/layer_14/attn_entropy_std": 0.43041107058525085, "geo/layer_21/stable_rank_q_proj": 40.07927322387695, "geo/layer_21/stable_rank_k_proj": 30.052631378173828, "geo/layer_21/stable_rank_o_proj": 69.50787353515625, "geo/layer_21/stable_rank_gate_proj": 64.7103500366211, "geo/layer_21/stable_rank_down_proj": 50.35540771484375, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14299555122852325, "geo/layer_21/attn_entropy_mean": 5.693452835083008, "geo/layer_21/attn_entropy_std": 0.30634233355522156, "geo/layer_27/stable_rank_q_proj": 43.56361389160156, "geo/layer_27/stable_rank_k_proj": 31.88727378845215, "geo/layer_27/stable_rank_o_proj": 115.33475494384766, "geo/layer_27/stable_rank_gate_proj": 79.06841278076172, "geo/layer_27/stable_rank_down_proj": 127.36263275146484, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10184476524591446, "geo/layer_27/attn_entropy_mean": 4.180196285247803, "geo/layer_27/attn_entropy_std": 0.7473747730255127, "attnres/final_alpha/block_0": 0.2374749481678009, "attnres/block_norm/0": 1.7708146572113037, "attnres/final_alpha/block_1": 0.004106000531464815, "attnres/block_norm/1": 47776.015625, "attnres/final_alpha/block_2": 0.010194426402449608, "attnres/block_norm/2": 28927.94921875, "attnres/final_alpha/block_3": 0.012090984731912613, "attnres/block_norm/3": 59895.1015625, "attnres/final_alpha/block_4": 0.014231571927666664, "attnres/block_norm/4": 15569.931640625, "attnres/final_alpha/block_5": 0.6153451204299927, "attnres/block_norm/5": 6722.92626953125, "attnres/final_alpha/block_6": 0.10655692219734192, "attnres/block_norm/6": 40070.046875, "geo/tier1_time_s": 1.3567874431610107, "geo/step": 69225.0, "geo/rankme_slope": -1.446965504951979e-06} {"step": 69230, "timestamp": 1778269370.5984645, "train/loss": 2.129215121269226, "train/z_loss": 0.00137826333520934, "train/perplexity": 8.4082647527042, "train/grad_norm": 0.142578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1697515.707828678, "perf/iters_per_sec": 0.809438566126193, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2354242086410523, "data/tokens_consumed": 145187930112, "data/tokens_consumed_B": 145.187930112, "train/loss_slope": -3.695302649085128e-06} {"step": 69240, "timestamp": 1778269380.985292, "train/loss": 2.1352606534957888, "train/z_loss": 0.0013682312332093717, "train/perplexity": 8.459251152909484, "train/grad_norm": 0.1357421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019954.6773921237, "perf/iters_per_sec": 0.9631894480667704, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0382173538208008, "data/tokens_consumed": 145208901632, "data/tokens_consumed_B": 145.208901632, "train/loss_slope": 2.21194428364631e-07} {"step": 69250, "timestamp": 1778269391.8490696, "grad/layer_0/attn": 0.0033782990649342537, "grad/layer_0/mlp": 0.003183131106197834, "grad/layer_0/attn_mlp_ratio": 1.0613131681020282, "grad/layer_4/attn": 0.002302017994225025, "grad/layer_4/mlp": 0.0026254344265908003, "grad/layer_4/attn_mlp_ratio": 0.8768140934042867, "grad/layer_8/attn": 0.005124925170093775, "grad/layer_8/mlp": 0.0039336890913546085, "grad/layer_8/attn_mlp_ratio": 1.3028292070856142, "grad/layer_12/attn": 0.006339204963296652, "grad/layer_12/mlp": 0.007403900381177664, "grad/layer_12/attn_mlp_ratio": 0.8561980241917485, "grad/layer_16/attn": 0.004730600863695145, "grad/layer_16/mlp": 0.005323498044162989, "grad/layer_16/attn_mlp_ratio": 0.8886263760384827, "grad/layer_20/attn": 0.0035563847050070763, "grad/layer_20/mlp": 0.007222909480333328, "grad/layer_20/attn_mlp_ratio": 0.49237563691652275, "grad/layer_24/attn": 0.015292810276150703, "grad/layer_24/mlp": 0.010955176316201687, "grad/layer_24/attn_mlp_ratio": 1.395943770794422, "grad/layer_27/attn": 0.01308394968509674, "grad/layer_27/mlp": 0.010329289361834526, "grad/layer_27/attn_mlp_ratio": 1.2666843865147115} {"step": 69250, "timestamp": 1778269391.8665905, "train/loss": 2.091436731815338, "train/z_loss": 0.0013788947486318647, "train/perplexity": 8.096539367577705, "train/grad_norm": 0.2109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1928568.6896474669, "perf/iters_per_sec": 0.9196132133710226, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0874136924743651, "data/tokens_consumed": 145229873152, "data/tokens_consumed_B": 145.229873152, "train/loss_slope": -2.1125968926810777e-06} {"step": 69260, "timestamp": 1778269402.2519772, "train/loss": 2.165549111366272, "train/z_loss": 0.0013560650520958006, "train/perplexity": 8.719388517884566, "train/grad_norm": 0.2158203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021431.363308435, "perf/iters_per_sec": 0.9638935867826629, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0374589204788207, "data/tokens_consumed": 145250844672, "data/tokens_consumed_B": 145.250844672, "train/loss_slope": -1.2551517483710986e-06} {"step": 69270, "timestamp": 1778269412.9758856, "train/loss": 2.1683234691619875, "train/z_loss": 0.0013698010123334825, "train/perplexity": 8.743612809280883, "train/grad_norm": 0.1572265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1956923.4494633647, "perf/iters_per_sec": 0.9331338164631675, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.071657657623291, "data/tokens_consumed": 145271816192, "data/tokens_consumed_B": 145.271816192, "train/loss_slope": -1.064578870950143e-06} {"step": 69280, "timestamp": 1778269423.3550467, "train/loss": 2.1247234106063844, "train/z_loss": 0.0013792749610729515, "train/perplexity": 8.370581953729108, "train/grad_norm": 0.0859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021728.7625817726, "perf/iters_per_sec": 0.9640353978070129, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037306308746338, "data/tokens_consumed": 145292787712, "data/tokens_consumed_B": 145.292787712, "train/loss_slope": -6.596471681774936e-07} {"step": 69290, "timestamp": 1778269433.7359624, "train/loss": 2.129134106636047, "train/z_loss": 0.0013803369831293822, "train/perplexity": 8.407583587812123, "train/grad_norm": 0.11279296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021200.6963757814, "perf/iters_per_sec": 0.9637835962180049, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0375773191452027, "data/tokens_consumed": 145313759232, "data/tokens_consumed_B": 145.313759232, "train/loss_slope": 3.8582430993190974e-07} {"step": 69300, "timestamp": 1778269444.1020558, "grad/layer_0/attn": 0.002709602704271674, "grad/layer_0/mlp": 0.0030206511728465557, "grad/layer_0/attn_mlp_ratio": 0.8970259919206892, "grad/layer_4/attn": 0.0020958296954631805, "grad/layer_4/mlp": 0.0025893687270581722, "grad/layer_4/attn_mlp_ratio": 0.809397901744376, "grad/layer_8/attn": 0.005266678985208273, "grad/layer_8/mlp": 0.003567925188690424, "grad/layer_8/attn_mlp_ratio": 1.476118068363859, "grad/layer_12/attn": 0.005934277083724737, "grad/layer_12/mlp": 0.007153499871492386, "grad/layer_12/attn_mlp_ratio": 0.8295627465399583, "grad/layer_16/attn": 0.004648247268050909, "grad/layer_16/mlp": 0.004673854913562536, "grad/layer_16/attn_mlp_ratio": 0.9945210654936196, "grad/layer_20/attn": 0.0042174383997917175, "grad/layer_20/mlp": 0.006673881784081459, "grad/layer_20/attn_mlp_ratio": 0.6319318311358118, "grad/layer_24/attn": 0.02397211082279682, "grad/layer_24/mlp": 0.012310382910072803, "grad/layer_24/attn_mlp_ratio": 1.947308284655479, "grad/layer_27/attn": 0.0051985858008265495, "grad/layer_27/mlp": 0.012220817618072033, "grad/layer_27/attn_mlp_ratio": 0.4253877212438026} {"step": 69300, "timestamp": 1778269444.7124588, "eos/sharpness": 78.58908176422118, "eos/L0_probe": 1.9695847034454346, "eos/L_plus": 2.4283320903778076, "eos/L_minus": 2.2967281341552734, "eos/grad_norm": 0.22271114587783813, "eos/embed_grad_frac": 0.046116527169942856, "eos/time_s": 0.6076889038085938} {"step": 69300, "timestamp": 1778269444.732393, "train/loss": 2.199707341194153, "train/z_loss": 0.0013683368102647363, "train/perplexity": 9.02237263621541, "train/grad_norm": 0.22265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1908175.2692531352, "perf/iters_per_sec": 0.9098888727441479, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0990353107452393, "data/tokens_consumed": 145334730752, "data/tokens_consumed_B": 145.334730752, "train/loss_slope": 3.4486050295322063e-06} {"step": 69300, "timestamp": 1778269446.0968268, "geo/rankme_last": 439.1000671386719, "geo/layer_0/stable_rank_q_proj": 19.260921478271484, "geo/layer_0/stable_rank_k_proj": 16.000444412231445, "geo/layer_0/stable_rank_o_proj": 46.76325225830078, "geo/layer_0/stable_rank_gate_proj": 129.88238525390625, "geo/layer_0/stable_rank_down_proj": 55.700714111328125, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06270908564329147, "geo/layer_0/attn_entropy_mean": 6.151451587677002, "geo/layer_0/attn_entropy_std": 0.42403748631477356, "geo/layer_7/stable_rank_q_proj": 43.17432403564453, "geo/layer_7/stable_rank_k_proj": 40.27202606201172, "geo/layer_7/stable_rank_o_proj": 89.26387023925781, "geo/layer_7/stable_rank_gate_proj": 79.25017547607422, "geo/layer_7/stable_rank_down_proj": 139.49899291992188, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.44741326570510864, "geo/layer_7/attn_entropy_mean": 4.638071060180664, "geo/layer_7/attn_entropy_std": 0.802497148513794, "geo/layer_14/stable_rank_q_proj": 50.37409210205078, "geo/layer_14/stable_rank_k_proj": 40.705055236816406, "geo/layer_14/stable_rank_o_proj": 43.418216705322266, "geo/layer_14/stable_rank_gate_proj": 71.24588775634766, "geo/layer_14/stable_rank_down_proj": 127.5745849609375, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3954574763774872, "geo/layer_14/attn_entropy_mean": 5.539523601531982, "geo/layer_14/attn_entropy_std": 0.43855974078178406, "geo/layer_21/stable_rank_q_proj": 40.01100158691406, "geo/layer_21/stable_rank_k_proj": 30.027755737304688, "geo/layer_21/stable_rank_o_proj": 69.64710235595703, "geo/layer_21/stable_rank_gate_proj": 64.64752960205078, "geo/layer_21/stable_rank_down_proj": 50.301631927490234, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14586502313613892, "geo/layer_21/attn_entropy_mean": 5.697691440582275, "geo/layer_21/attn_entropy_std": 0.2895907759666443, "geo/layer_27/stable_rank_q_proj": 43.5969352722168, "geo/layer_27/stable_rank_k_proj": 31.872943878173828, "geo/layer_27/stable_rank_o_proj": 115.1891098022461, "geo/layer_27/stable_rank_gate_proj": 78.97264862060547, "geo/layer_27/stable_rank_down_proj": 127.16213989257812, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08924846351146698, "geo/layer_27/attn_entropy_mean": 4.1597795486450195, "geo/layer_27/attn_entropy_std": 0.7655500769615173, "attnres/final_alpha/block_0": 0.2362833172082901, "attnres/block_norm/0": 1.770725965499878, "attnres/final_alpha/block_1": 0.0040924279019236565, "attnres/block_norm/1": 47799.328125, "attnres/final_alpha/block_2": 0.010088445618748665, "attnres/block_norm/2": 28825.72265625, "attnres/final_alpha/block_3": 0.011971952393651009, "attnres/block_norm/3": 60233.34375, "attnres/final_alpha/block_4": 0.014142943546175957, "attnres/block_norm/4": 15529.6005859375, "attnres/final_alpha/block_5": 0.6171421408653259, "attnres/block_norm/5": 6718.322265625, "attnres/final_alpha/block_6": 0.10627877712249756, "attnres/block_norm/6": 40172.30078125, "geo/tier1_time_s": 1.3600950241088867, "geo/step": 69300.0, "geo/rankme_slope": 4.160210959383754e-05} {"step": 69310, "timestamp": 1778269456.4771717, "train/loss": 2.107743573188782, "train/z_loss": 0.001366127678193152, "train/perplexity": 8.22965071386555, "train/grad_norm": 0.181640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1786254.277432248, "perf/iters_per_sec": 0.8517524134789696, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1740500926971436, "data/tokens_consumed": 145355702272, "data/tokens_consumed_B": 145.355702272, "train/loss_slope": 3.583809709248545e-06} {"step": 69320, "timestamp": 1778269466.8560283, "train/loss": 2.1494831323623655, "train/z_loss": 0.0013733520405367016, "train/perplexity": 8.580422308232885, "train/grad_norm": 0.171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021512.6155688104, "perf/iters_per_sec": 0.9639323308795978, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037417221069336, "data/tokens_consumed": 145376673792, "data/tokens_consumed_B": 145.376673792, "train/loss_slope": 2.794253879790934e-06} {"step": 69330, "timestamp": 1778269477.2380602, "train/loss": 2.139720320701599, "train/z_loss": 0.001383355213329196, "train/perplexity": 8.49706084451746, "train/grad_norm": 0.1640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021121.0481557099, "perf/iters_per_sec": 0.96374561698709, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0376182079315186, "data/tokens_consumed": 145397645312, "data/tokens_consumed_B": 145.397645312, "train/loss_slope": 3.919893794685379e-07} {"step": 69340, "timestamp": 1778269487.6175346, "train/loss": 2.1562484383583067, "train/z_loss": 0.0013732940657064318, "train/perplexity": 8.638668294525985, "train/grad_norm": 0.1015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021487.2496679036, "perf/iters_per_sec": 0.9639202354754942, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0374302387237548, "data/tokens_consumed": 145418616832, "data/tokens_consumed_B": 145.418616832, "train/loss_slope": -1.4764409075260886e-06} {"step": 69350, "timestamp": 1778269497.9854395, "grad/layer_0/attn": 0.0029575363732874393, "grad/layer_0/mlp": 0.0031207194551825523, "grad/layer_0/attn_mlp_ratio": 0.9477097576345442, "grad/layer_4/attn": 0.0019374694675207138, "grad/layer_4/mlp": 0.0025790813378989697, "grad/layer_4/attn_mlp_ratio": 0.7512246178232587, "grad/layer_8/attn": 0.004903867375105619, "grad/layer_8/mlp": 0.0034226328134536743, "grad/layer_8/attn_mlp_ratio": 1.4327763155170727, "grad/layer_12/attn": 0.007686950266361237, "grad/layer_12/mlp": 0.006544934120029211, "grad/layer_12/attn_mlp_ratio": 1.174488544565835, "grad/layer_16/attn": 0.003354277927428484, "grad/layer_16/mlp": 0.004577623214572668, "grad/layer_16/attn_mlp_ratio": 0.7327553398179977, "grad/layer_20/attn": 0.002900988096371293, "grad/layer_20/mlp": 0.005856340751051903, "grad/layer_20/attn_mlp_ratio": 0.49535847898098767, "grad/layer_24/attn": 0.007275842130184174, "grad/layer_24/mlp": 0.008260277099907398, "grad/layer_24/attn_mlp_ratio": 0.8808229983208965, "grad/layer_27/attn": 0.004471828229725361, "grad/layer_27/mlp": 0.007695678621530533, "grad/layer_27/attn_mlp_ratio": 0.5810830196450814} {"step": 69350, "timestamp": 1778269498.002533, "train/loss": 2.1081245303153993, "train/z_loss": 0.001378210075199604, "train/perplexity": 8.23278645520815, "train/grad_norm": 0.12890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020293.310643937, "perf/iters_per_sec": 0.9633509209842381, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0380433320999145, "data/tokens_consumed": 145439588352, "data/tokens_consumed_B": 145.439588352, "train/loss_slope": -5.037448937707621e-06} {"step": 69360, "timestamp": 1778269508.3827274, "train/loss": 2.118604028224945, "train/z_loss": 0.0013841994339600205, "train/perplexity": 8.319515568723538, "train/grad_norm": 0.189453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021526.5531479819, "perf/iters_per_sec": 0.9639389768352422, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037410068511963, "data/tokens_consumed": 145460559872, "data/tokens_consumed_B": 145.460559872, "train/loss_slope": -4.0783420898089504e-06} {"step": 69370, "timestamp": 1778269518.7620316, "train/loss": 2.1075640439987184, "train/z_loss": 0.0013804034912027418, "train/perplexity": 8.228173383954273, "train/grad_norm": 0.220703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021566.090460081, "perf/iters_per_sec": 0.9639578296947865, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373897790908813, "data/tokens_consumed": 145481531392, "data/tokens_consumed_B": 145.481531392, "train/loss_slope": -2.602867331906808e-06} {"step": 69375, "timestamp": 1778269524.5605884, "eos/sharpness": 80.33833503723143, "eos/L0_probe": 1.9680242538452148, "eos/L_plus": 2.443861484527588, "eos/L_minus": 2.2955703735351562, "eos/grad_norm": 0.21164856851100922, "eos/embed_grad_frac": 0.06298981606960297, "eos/time_s": 0.620159387588501} {"step": 69375, "timestamp": 1778269525.9399657, "geo/rankme_last": 438.4709167480469, "geo/layer_0/stable_rank_q_proj": 19.29994773864746, "geo/layer_0/stable_rank_k_proj": 15.981951713562012, "geo/layer_0/stable_rank_o_proj": 46.807281494140625, "geo/layer_0/stable_rank_gate_proj": 129.97897338867188, "geo/layer_0/stable_rank_down_proj": 55.75029373168945, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0626210868358612, "geo/layer_0/attn_entropy_mean": 6.153736591339111, "geo/layer_0/attn_entropy_std": 0.4226566255092621, "geo/layer_7/stable_rank_q_proj": 43.144840240478516, "geo/layer_7/stable_rank_k_proj": 40.179351806640625, "geo/layer_7/stable_rank_o_proj": 89.33665466308594, "geo/layer_7/stable_rank_gate_proj": 79.24174499511719, "geo/layer_7/stable_rank_down_proj": 139.34365844726562, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.44778522849082947, "geo/layer_7/attn_entropy_mean": 4.640928268432617, "geo/layer_7/attn_entropy_std": 0.7914658188819885, "geo/layer_14/stable_rank_q_proj": 50.375572204589844, "geo/layer_14/stable_rank_k_proj": 40.61286163330078, "geo/layer_14/stable_rank_o_proj": 43.3956298828125, "geo/layer_14/stable_rank_gate_proj": 71.33077239990234, "geo/layer_14/stable_rank_down_proj": 127.46045684814453, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3880985379219055, "geo/layer_14/attn_entropy_mean": 5.544276714324951, "geo/layer_14/attn_entropy_std": 0.3849893808364868, "geo/layer_21/stable_rank_q_proj": 40.02216720581055, "geo/layer_21/stable_rank_k_proj": 30.04305076599121, "geo/layer_21/stable_rank_o_proj": 69.67903137207031, "geo/layer_21/stable_rank_gate_proj": 64.69624328613281, "geo/layer_21/stable_rank_down_proj": 50.299827575683594, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14725370705127716, "geo/layer_21/attn_entropy_mean": 5.703020095825195, "geo/layer_21/attn_entropy_std": 0.3004337251186371, "geo/layer_27/stable_rank_q_proj": 43.62228012084961, "geo/layer_27/stable_rank_k_proj": 31.806459426879883, "geo/layer_27/stable_rank_o_proj": 114.95438385009766, "geo/layer_27/stable_rank_gate_proj": 78.92766571044922, "geo/layer_27/stable_rank_down_proj": 127.05481719970703, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09708788245916367, "geo/layer_27/attn_entropy_mean": 4.159287452697754, "geo/layer_27/attn_entropy_std": 0.7630159854888916, "attnres/final_alpha/block_0": 0.23543909192085266, "attnres/block_norm/0": 1.7708942890167236, "attnres/final_alpha/block_1": 0.00410918053239584, "attnres/block_norm/1": 47832.890625, "attnres/final_alpha/block_2": 0.009882641024887562, "attnres/block_norm/2": 28702.947265625, "attnres/final_alpha/block_3": 0.011558568105101585, "attnres/block_norm/3": 60535.59375, "attnres/final_alpha/block_4": 0.013733474537730217, "attnres/block_norm/4": 15549.4541015625, "attnres/final_alpha/block_5": 0.6200376152992249, "attnres/block_norm/5": 6701.5615234375, "attnres/final_alpha/block_6": 0.10523944348096848, "attnres/block_norm/6": 40251.078125, "geo/tier1_time_s": 1.3597180843353271, "geo/step": 69375.0, "geo/rankme_slope": 2.0657344969237694e-05} {"step": 69380, "timestamp": 1778269531.134413, "train/loss": 2.1172718048095702, "train/z_loss": 0.0013720199698582292, "train/perplexity": 8.308439494818982, "train/grad_norm": 0.125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1695703.061823608, "perf/iters_per_sec": 0.8085742291563072, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2367448329925537, "data/tokens_consumed": 145502502912, "data/tokens_consumed_B": 145.502502912, "train/loss_slope": -2.425669464949093e-06} {"step": 69390, "timestamp": 1778269541.5114343, "train/loss": 2.130849552154541, "train/z_loss": 0.0013730178587138652, "train/perplexity": 8.422018717198288, "train/grad_norm": 0.1728515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022105.736530547, "perf/iters_per_sec": 0.964215152993463, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0371129274368287, "data/tokens_consumed": 145523474432, "data/tokens_consumed_B": 145.523474432, "train/loss_slope": -4.4066733152273085e-06} {"step": 69400, "timestamp": 1778269551.8810327, "grad/layer_0/attn": 0.0027467638719826937, "grad/layer_0/mlp": 0.002822709036991, "grad/layer_0/attn_mlp_ratio": 0.9730949023358234, "grad/layer_4/attn": 0.0024029058404266834, "grad/layer_4/mlp": 0.0024369133170694113, "grad/layer_4/attn_mlp_ratio": 0.986044815378125, "grad/layer_8/attn": 0.0036718444898724556, "grad/layer_8/mlp": 0.003538955468684435, "grad/layer_8/attn_mlp_ratio": 1.0375503220113669, "grad/layer_12/attn": 0.004668423905968666, "grad/layer_12/mlp": 0.006880445405840874, "grad/layer_12/attn_mlp_ratio": 0.6785060505174554, "grad/layer_16/attn": 0.010834949091076851, "grad/layer_16/mlp": 0.0047923363745212555, "grad/layer_16/attn_mlp_ratio": 2.2608907259916973, "grad/layer_20/attn": 0.0036864066496491432, "grad/layer_20/mlp": 0.006226152181625366, "grad/layer_20/attn_mlp_ratio": 0.5920842412622117, "grad/layer_24/attn": 0.006338811479508877, "grad/layer_24/mlp": 0.008586795069277287, "grad/layer_24/attn_mlp_ratio": 0.738204575111856, "grad/layer_27/attn": 0.008888591080904007, "grad/layer_27/mlp": 0.008894689381122589, "grad/layer_27/attn_mlp_ratio": 0.9993143773898432} {"step": 69400, "timestamp": 1778269551.8982434, "train/loss": 2.094058907032013, "train/z_loss": 0.0013783782138489186, "train/perplexity": 8.117797771897964, "train/grad_norm": 0.1357421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020088.419090148, "perf/iters_per_sec": 0.9632532210779896, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.038148617744446, "data/tokens_consumed": 145544445952, "data/tokens_consumed_B": 145.544445952, "train/loss_slope": -9.446561361553766e-06} {"step": 69410, "timestamp": 1778269562.277242, "train/loss": 2.147015023231506, "train/z_loss": 0.0013747856486588717, "train/perplexity": 8.559271002190133, "train/grad_norm": 0.1279296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021652.464656506, "perf/iters_per_sec": 0.9639990161211519, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373454570770264, "data/tokens_consumed": 145565417472, "data/tokens_consumed_B": 145.565417472, "train/loss_slope": -9.892147230451032e-06} {"step": 69420, "timestamp": 1778269572.651519, "train/loss": 2.168995475769043, "train/z_loss": 0.001368895173072815, "train/perplexity": 8.749490549577217, "train/grad_norm": 0.09619140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023101.3890385358, "perf/iters_per_sec": 0.9646899171059302, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036602520942688, "data/tokens_consumed": 145586388992, "data/tokens_consumed_B": 145.586388992, "train/loss_slope": -9.122593918613505e-06} {"step": 69430, "timestamp": 1778269583.0019267, "train/loss": 2.145600152015686, "train/z_loss": 0.0013672746950760483, "train/perplexity": 8.54716929921516, "train/grad_norm": 0.275390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027018.4745266435, "perf/iters_per_sec": 0.9665577290185182, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345993518829346, "data/tokens_consumed": 145607360512, "data/tokens_consumed_B": 145.607360512, "train/loss_slope": -1.2068021951979277e-05} {"step": 69440, "timestamp": 1778269593.3601506, "train/loss": 2.114669644832611, "train/z_loss": 0.001382860855665058, "train/perplexity": 8.286847710916117, "train/grad_norm": 0.1962890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025701.7337294817, "perf/iters_per_sec": 0.9659298580787095, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352718591690064, "data/tokens_consumed": 145628332032, "data/tokens_consumed_B": 145.628332032, "train/loss_slope": -1.4731709763269112e-05} {"step": 69450, "timestamp": 1778269603.7089777, "grad/layer_0/attn": 0.00292884255759418, "grad/layer_0/mlp": 0.0031173492316156626, "grad/layer_0/attn_mlp_ratio": 0.9395297883013377, "grad/layer_4/attn": 0.0020768949761986732, "grad/layer_4/mlp": 0.002489405917003751, "grad/layer_4/attn_mlp_ratio": 0.8342933864594833, "grad/layer_8/attn": 0.0060890354216098785, "grad/layer_8/mlp": 0.0036451390478760004, "grad/layer_8/attn_mlp_ratio": 1.670453492881862, "grad/layer_12/attn": 0.004060185048729181, "grad/layer_12/mlp": 0.006725495681166649, "grad/layer_12/attn_mlp_ratio": 0.6037004825873038, "grad/layer_16/attn": 0.0038572971243411303, "grad/layer_16/mlp": 0.004890225827693939, "grad/layer_16/attn_mlp_ratio": 0.7887768748058841, "grad/layer_20/attn": 0.004439835902303457, "grad/layer_20/mlp": 0.006346615497022867, "grad/layer_20/attn_mlp_ratio": 0.699559605340859, "grad/layer_24/attn": 0.015351485460996628, "grad/layer_24/mlp": 0.011159389279782772, "grad/layer_24/attn_mlp_ratio": 1.375656403638768, "grad/layer_27/attn": 0.0040411814115941525, "grad/layer_27/mlp": 0.010375083424150944, "grad/layer_27/attn_mlp_ratio": 0.38950832561368387} {"step": 69450, "timestamp": 1778269604.321144, "eos/sharpness": 65.88952541351317, "eos/L0_probe": 1.96651291847229, "eos/L_plus": 2.283902645111084, "eos/L_minus": 2.308018445968628, "eos/grad_norm": 0.1717565655708313, "eos/embed_grad_frac": 0.07791784405708313, "eos/time_s": 0.6093733310699463} {"step": 69450, "timestamp": 1778269604.3405502, "train/loss": 2.1114049911499024, "train/z_loss": 0.0013859404367394744, "train/perplexity": 8.259838135462536, "train/grad_norm": 0.171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1910996.937224849, "perf/iters_per_sec": 0.9112343488811726, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0974125385284423, "data/tokens_consumed": 145649303552, "data/tokens_consumed_B": 145.649303552, "train/loss_slope": -1.091646594469016e-05} {"step": 69450, "timestamp": 1778269605.7040124, "geo/rankme_last": 438.62939453125, "geo/layer_0/stable_rank_q_proj": 19.296642303466797, "geo/layer_0/stable_rank_k_proj": 15.932286262512207, "geo/layer_0/stable_rank_o_proj": 46.89087677001953, "geo/layer_0/stable_rank_gate_proj": 130.1597900390625, "geo/layer_0/stable_rank_down_proj": 55.7404899597168, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06621534377336502, "geo/layer_0/attn_entropy_mean": 6.15579080581665, "geo/layer_0/attn_entropy_std": 0.4279835820198059, "geo/layer_7/stable_rank_q_proj": 43.19949722290039, "geo/layer_7/stable_rank_k_proj": 40.25056457519531, "geo/layer_7/stable_rank_o_proj": 89.29730224609375, "geo/layer_7/stable_rank_gate_proj": 79.02228546142578, "geo/layer_7/stable_rank_down_proj": 138.97714233398438, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.44872933626174927, "geo/layer_7/attn_entropy_mean": 4.670026779174805, "geo/layer_7/attn_entropy_std": 0.7768890261650085, "geo/layer_14/stable_rank_q_proj": 50.38287353515625, "geo/layer_14/stable_rank_k_proj": 40.70185470581055, "geo/layer_14/stable_rank_o_proj": 43.401329040527344, "geo/layer_14/stable_rank_gate_proj": 71.26420593261719, "geo/layer_14/stable_rank_down_proj": 127.32898712158203, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4055781960487366, "geo/layer_14/attn_entropy_mean": 5.554498672485352, "geo/layer_14/attn_entropy_std": 0.40653273463249207, "geo/layer_21/stable_rank_q_proj": 40.03733825683594, "geo/layer_21/stable_rank_k_proj": 30.033994674682617, "geo/layer_21/stable_rank_o_proj": 69.65985107421875, "geo/layer_21/stable_rank_gate_proj": 64.65283966064453, "geo/layer_21/stable_rank_down_proj": 50.29123306274414, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1478501856327057, "geo/layer_21/attn_entropy_mean": 5.700058937072754, "geo/layer_21/attn_entropy_std": 0.31238996982574463, "geo/layer_27/stable_rank_q_proj": 43.61671829223633, "geo/layer_27/stable_rank_k_proj": 31.837982177734375, "geo/layer_27/stable_rank_o_proj": 115.14833068847656, "geo/layer_27/stable_rank_gate_proj": 78.84123229980469, "geo/layer_27/stable_rank_down_proj": 127.05484008789062, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09956661611795425, "geo/layer_27/attn_entropy_mean": 4.155948638916016, "geo/layer_27/attn_entropy_std": 0.780652642250061, "attnres/final_alpha/block_0": 0.2374698370695114, "attnres/block_norm/0": 1.7711269855499268, "attnres/final_alpha/block_1": 0.004139530006796122, "attnres/block_norm/1": 47947.93359375, "attnres/final_alpha/block_2": 0.009954959154129028, "attnres/block_norm/2": 28899.69921875, "attnres/final_alpha/block_3": 0.011768737807869911, "attnres/block_norm/3": 59907.4140625, "attnres/final_alpha/block_4": 0.0140450494363904, "attnres/block_norm/4": 15529.8154296875, "attnres/final_alpha/block_5": 0.6150720119476318, "attnres/block_norm/5": 6762.15625, "attnres/final_alpha/block_6": 0.10754988342523575, "attnres/block_norm/6": 40050.08984375, "geo/tier1_time_s": 1.359966516494751, "geo/step": 69450.0, "geo/rankme_slope": 1.55855506265006e-05} {"step": 69460, "timestamp": 1778269616.054492, "train/loss": 2.124748373031616, "train/z_loss": 0.001372437970712781, "train/perplexity": 8.370790906363245, "train/grad_norm": 0.1640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1790852.9837040007, "perf/iters_per_sec": 0.853945247509003, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1710352659225465, "data/tokens_consumed": 145670275072, "data/tokens_consumed_B": 145.670275072, "train/loss_slope": -1.049859159671613e-05} {"step": 69470, "timestamp": 1778269626.4070113, "train/loss": 2.1226458072662355, "train/z_loss": 0.0013745696167461574, "train/perplexity": 8.353209257737669, "train/grad_norm": 0.10546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026857.0517744974, "perf/iters_per_sec": 0.9664807566521155, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034681749343872, "data/tokens_consumed": 145691246592, "data/tokens_consumed_B": 145.691246592, "train/loss_slope": -1.22397341887013e-05} {"step": 69480, "timestamp": 1778269636.759602, "train/loss": 2.0763908505439757, "train/z_loss": 0.0013838911429047585, "train/perplexity": 7.975631659951707, "train/grad_norm": 0.126953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026937.6665207546, "perf/iters_per_sec": 0.9665191967586301, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346405982971192, "data/tokens_consumed": 145712218112, "data/tokens_consumed_B": 145.712218112, "train/loss_slope": -1.8153030794374304e-05} {"step": 69490, "timestamp": 1778269647.1137078, "train/loss": 2.1375136852264403, "train/z_loss": 0.0013856480713002385, "train/perplexity": 8.47833160053023, "train/grad_norm": 0.1396484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026349.643718469, "perf/iters_per_sec": 0.9662388056366296, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349408388137817, "data/tokens_consumed": 145733189632, "data/tokens_consumed_B": 145.733189632, "train/loss_slope": -1.4990776278326698e-05} {"step": 69500, "timestamp": 1778269657.4652286, "grad/layer_0/attn": 0.0027269432321190834, "grad/layer_0/mlp": 0.00308889034204185, "grad/layer_0/attn_mlp_ratio": 0.8828229046273623, "grad/layer_4/attn": 0.002138930605724454, "grad/layer_4/mlp": 0.0025829351507127285, "grad/layer_4/attn_mlp_ratio": 0.8281007451248509, "grad/layer_8/attn": 0.003764934604987502, "grad/layer_8/mlp": 0.0037437102291733027, "grad/layer_8/attn_mlp_ratio": 1.0056693157183683, "grad/layer_12/attn": 0.00709804380312562, "grad/layer_12/mlp": 0.00727291451767087, "grad/layer_12/attn_mlp_ratio": 0.975955882374809, "grad/layer_16/attn": 0.003937142435461283, "grad/layer_16/mlp": 0.005264416802674532, "grad/layer_16/attn_mlp_ratio": 0.7478781616746688, "grad/layer_20/attn": 0.003525640582665801, "grad/layer_20/mlp": 0.006497449241578579, "grad/layer_20/attn_mlp_ratio": 0.5426191721272019, "grad/layer_24/attn": 0.011176603846251965, "grad/layer_24/mlp": 0.009535236284136772, "grad/layer_24/attn_mlp_ratio": 1.1721370499892212, "grad/layer_27/attn": 0.0037137980107218027, "grad/layer_27/mlp": 0.009959150105714798, "grad/layer_27/attn_mlp_ratio": 0.3729031025750306} {"step": 69500, "timestamp": 1778269657.4813743, "train/loss": 2.123241639137268, "train/z_loss": 0.0013864891603589057, "train/perplexity": 8.358187849093246, "train/grad_norm": 0.162109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024235.6208836404, "perf/iters_per_sec": 0.9652307609956934, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0360216856002809, "data/tokens_consumed": 145754161152, "data/tokens_consumed_B": 145.754161152, "train/loss_slope": -1.8344813755171597e-05} {"step": 69500, "timestamp": 1778269664.5093322, "geo/ww_alpha_mean": 7.8800661680011945, "geo/ww_alpha_std": 5.141866904482845, "geo/ww_alpha_min": 1.3370655661170916, "geo/ww_alpha_max": 29.016559476020788, "geo/ww_alpha_healthy_frac": 0.17766497461928935, "geo/ww_alpha_by_type/q_proj": 3.936071704858086, "geo/ww_alpha_by_type/k_proj": 4.535897677839442, "geo/ww_alpha_by_type/v_proj": 9.07334796578836, "geo/ww_alpha_by_type/o_proj": 8.650761594330266, "geo/ww_alpha_by_type/gate_proj": 8.058069453474976, "geo/ww_alpha_by_type/up_proj": 12.944657170888638, "geo/ww_alpha_by_type/down_proj": 8.064496322636748, "geo/twonn_id/layer_0": 0.7464122772216797, "geo/twonn_id/layer_7": 2.7601113319396973, "geo/twonn_id/layer_14": 4.52067756652832, "geo/twonn_id/layer_21": 6.946937561035156, "geo/twonn_id/layer_27": 5.822625160217285, "geo/tier2_time_s": 7.0190935134887695} {"step": 69500, "timestamp": 1778269665.176432, "eoc/jacobian_sigma/layer_0/attn": 1249.8204345703125, "eoc/jacobian_sigma/layer_0/mlp": 8851.2421875, "eoc/jacobian_sigma/layer_0": 8851.2421875, "eoc/jacobian_sigma/layer_7/attn": 1.1535964012145996, "eoc/jacobian_sigma/layer_7/mlp": 1.762086033821106, "eoc/jacobian_sigma/layer_7": 1.762086033821106, "eoc/jacobian_sigma/layer_14/attn": 1.4633052349090576, "eoc/jacobian_sigma/layer_14/mlp": 6.252708435058594, "eoc/jacobian_sigma/layer_14": 6.252708435058594, "eoc/jacobian_sigma/layer_21/attn": 1.0857418775558472, "eoc/jacobian_sigma/layer_21/mlp": 4.175387859344482, "eoc/jacobian_sigma/layer_21": 4.175387859344482, "eoc/jacobian_sigma/layer_27/attn": 3.023371458053589, "eoc/jacobian_sigma/layer_27/mlp": 25.42853546142578, "eoc/jacobian_sigma/layer_27": 25.42853546142578, "eoc/layer0_sigma": 8851.2421875, "eoc/sigma_max": 25.42853546142578, "eoc/sigma_min": 1.762086033821106, "eoc/sigma_mean": 9.40467944741249, "eoc/time_s": 0.6610078811645508} {"step": 69510, "timestamp": 1778269675.5456977, "train/loss": 2.0939502716064453, "train/z_loss": 0.0013747203280217945, "train/perplexity": 8.116915939382334, "train/grad_norm": 0.1748046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1161414.0531605384, "perf/iters_per_sec": 0.5538053766062443, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.805688500404358, "data/tokens_consumed": 145775132672, "data/tokens_consumed_B": 145.775132672, "train/loss_slope": -2.119181040990756e-05} {"step": 69520, "timestamp": 1778269685.8998923, "train/loss": 2.1625592947006225, "train/z_loss": 0.0013598621357232333, "train/perplexity": 8.693358077292581, "train/grad_norm": 0.0966796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026690.14452111, "perf/iters_per_sec": 0.9664011690717268, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034766960144043, "data/tokens_consumed": 145796104192, "data/tokens_consumed_B": 145.796104192, "train/loss_slope": -1.8545583079178117e-05} {"step": 69525, "timestamp": 1778269691.6985939, "eos/sharpness": 70.88592052459715, "eos/L0_probe": 1.9662466049194336, "eos/L_plus": 2.2595906257629395, "eos/L_minus": 2.3817617893218994, "eos/grad_norm": 0.1544903814792633, "eos/embed_grad_frac": 0.09600276499986649, "eos/time_s": 0.6318550109863281} {"step": 69525, "timestamp": 1778269693.0804195, "geo/rankme_last": 438.1764221191406, "geo/layer_0/stable_rank_q_proj": 19.3182430267334, "geo/layer_0/stable_rank_k_proj": 15.988304138183594, "geo/layer_0/stable_rank_o_proj": 46.983238220214844, "geo/layer_0/stable_rank_gate_proj": 130.13970947265625, "geo/layer_0/stable_rank_down_proj": 55.79768753051758, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.060874857008457184, "geo/layer_0/attn_entropy_mean": 6.152750015258789, "geo/layer_0/attn_entropy_std": 0.42661821842193604, "geo/layer_7/stable_rank_q_proj": 43.244014739990234, "geo/layer_7/stable_rank_k_proj": 40.18205261230469, "geo/layer_7/stable_rank_o_proj": 89.2621078491211, "geo/layer_7/stable_rank_gate_proj": 79.04524993896484, "geo/layer_7/stable_rank_down_proj": 139.204833984375, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4518671929836273, "geo/layer_7/attn_entropy_mean": 4.66689395904541, "geo/layer_7/attn_entropy_std": 0.7982587814331055, "geo/layer_14/stable_rank_q_proj": 50.35906219482422, "geo/layer_14/stable_rank_k_proj": 40.73746109008789, "geo/layer_14/stable_rank_o_proj": 43.39369583129883, "geo/layer_14/stable_rank_gate_proj": 71.20590209960938, "geo/layer_14/stable_rank_down_proj": 127.2195816040039, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3836030960083008, "geo/layer_14/attn_entropy_mean": 5.514585018157959, "geo/layer_14/attn_entropy_std": 0.4178277552127838, "geo/layer_21/stable_rank_q_proj": 40.005584716796875, "geo/layer_21/stable_rank_k_proj": 30.054481506347656, "geo/layer_21/stable_rank_o_proj": 69.56014251708984, "geo/layer_21/stable_rank_gate_proj": 64.49304962158203, "geo/layer_21/stable_rank_down_proj": 50.21041488647461, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15079018473625183, "geo/layer_21/attn_entropy_mean": 5.673649311065674, "geo/layer_21/attn_entropy_std": 0.3044629991054535, "geo/layer_27/stable_rank_q_proj": 43.64940643310547, "geo/layer_27/stable_rank_k_proj": 31.90732765197754, "geo/layer_27/stable_rank_o_proj": 114.96698760986328, "geo/layer_27/stable_rank_gate_proj": 78.78400421142578, "geo/layer_27/stable_rank_down_proj": 127.15637969970703, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.1080155000090599, "geo/layer_27/attn_entropy_mean": 4.160900592803955, "geo/layer_27/attn_entropy_std": 0.7551627159118652, "attnres/final_alpha/block_0": 0.23823145031929016, "attnres/block_norm/0": 1.7712398767471313, "attnres/final_alpha/block_1": 0.0040830206125974655, "attnres/block_norm/1": 47856.4609375, "attnres/final_alpha/block_2": 0.00995718315243721, "attnres/block_norm/2": 28863.96484375, "attnres/final_alpha/block_3": 0.011710739694535732, "attnres/block_norm/3": 60909.875, "attnres/final_alpha/block_4": 0.014006421901285648, "attnres/block_norm/4": 15517.3369140625, "attnres/final_alpha/block_5": 0.6142317652702332, "attnres/block_norm/5": 6741.02685546875, "attnres/final_alpha/block_6": 0.10777945071458817, "attnres/block_norm/6": 40139.13671875, "geo/tier1_time_s": 1.3618848323822021, "geo/step": 69525.0, "geo/rankme_slope": -8.694122180122048e-06} {"step": 69530, "timestamp": 1778269698.260229, "train/loss": 2.1408113718032835, "train/z_loss": 0.0013706609955988825, "train/perplexity": 8.50633663137153, "train/grad_norm": 0.10546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1697545.4540118938, "perf/iters_per_sec": 0.8094527502116651, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.23540256023407, "data/tokens_consumed": 145817075712, "data/tokens_consumed_B": 145.817075712, "train/loss_slope": -1.8685538905395814e-05} {"step": 69540, "timestamp": 1778269708.6144927, "train/loss": 2.1749417066574095, "train/z_loss": 0.0013658134266734123, "train/perplexity": 8.801672028349858, "train/grad_norm": 0.1123046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026321.3088257017, "perf/iters_per_sec": 0.9662252945068844, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349553108215332, "data/tokens_consumed": 145838047232, "data/tokens_consumed_B": 145.838047232, "train/loss_slope": -1.5142764259736678e-05} {"step": 69550, "timestamp": 1778269718.9591794, "grad/layer_0/attn": 0.0028254641219973564, "grad/layer_0/mlp": 0.003168829483911395, "grad/layer_0/attn_mlp_ratio": 0.8916428123312927, "grad/layer_4/attn": 0.002108530141413212, "grad/layer_4/mlp": 0.002680652542039752, "grad/layer_4/attn_mlp_ratio": 0.7865734293007083, "grad/layer_8/attn": 0.0035116090439260006, "grad/layer_8/mlp": 0.0037228106521070004, "grad/layer_8/attn_mlp_ratio": 0.9432682125833375, "grad/layer_12/attn": 0.006045602262020111, "grad/layer_12/mlp": 0.0068980371579527855, "grad/layer_12/attn_mlp_ratio": 0.8764235442553029, "grad/layer_16/attn": 0.004902762360870838, "grad/layer_16/mlp": 0.004525505937635899, "grad/layer_16/attn_mlp_ratio": 1.0833622406196184, "grad/layer_20/attn": 0.002839866792783141, "grad/layer_20/mlp": 0.005602552089840174, "grad/layer_20/attn_mlp_ratio": 0.5068880568275715, "grad/layer_24/attn": 0.0054875994101166725, "grad/layer_24/mlp": 0.007939384318888187, "grad/layer_24/attn_mlp_ratio": 0.6911870140790013, "grad/layer_27/attn": 0.004123411141335964, "grad/layer_27/mlp": 0.0073646376840770245, "grad/layer_27/attn_mlp_ratio": 0.5598932713637501} {"step": 69550, "timestamp": 1778269718.975274, "train/loss": 2.1402320861816406, "train/z_loss": 0.0013651429675519467, "train/perplexity": 8.50141045983605, "train/grad_norm": 0.0869140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025383.7169777697, "perf/iters_per_sec": 0.9657782158745621, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354344129562378, "data/tokens_consumed": 145859018752, "data/tokens_consumed_B": 145.859018752, "train/loss_slope": -1.281812914920718e-05} {"step": 69560, "timestamp": 1778269729.3272135, "train/loss": 2.1185545563697814, "train/z_loss": 0.0013801750726997852, "train/perplexity": 8.319103997034983, "train/grad_norm": 0.11669921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026783.4018443855, "perf/iters_per_sec": 0.9664456376287391, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347193479537964, "data/tokens_consumed": 145879990272, "data/tokens_consumed_B": 145.879990272, "train/loss_slope": -1.39674116747536e-05} {"step": 69570, "timestamp": 1778269739.6789734, "train/loss": 2.148041808605194, "train/z_loss": 0.0013681928627192973, "train/perplexity": 8.568064049978508, "train/grad_norm": 0.09228515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026773.921614449, "perf/iters_per_sec": 0.9664411171028371, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034724187850952, "data/tokens_consumed": 145900961792, "data/tokens_consumed_B": 145.900961792, "train/loss_slope": -1.3197805192640098e-05} {"step": 69580, "timestamp": 1778269750.0415335, "train/loss": 2.1259932279586793, "train/z_loss": 0.001376488443929702, "train/perplexity": 8.38121781531442, "train/grad_norm": 0.1484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025139.838089526, "perf/iters_per_sec": 0.9656619253585462, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035559105873108, "data/tokens_consumed": 145921933312, "data/tokens_consumed_B": 145.921933312, "train/loss_slope": -1.4083312940497401e-05} {"step": 69590, "timestamp": 1778269760.402844, "train/loss": 2.112678289413452, "train/z_loss": 0.0013742086477577687, "train/perplexity": 8.270362071650597, "train/grad_norm": 0.1240234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025180.3561535315, "perf/iters_per_sec": 0.9656812458770425, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035538387298584, "data/tokens_consumed": 145942904832, "data/tokens_consumed_B": 145.942904832, "train/loss_slope": -1.3969871506403424e-05} {"step": 69600, "timestamp": 1778269770.7466688, "grad/layer_0/attn": 0.0031332846265286207, "grad/layer_0/mlp": 0.003227981273084879, "grad/layer_0/attn_mlp_ratio": 0.9706637877944879, "grad/layer_4/attn": 0.0020739175379276276, "grad/layer_4/mlp": 0.0025974935851991177, "grad/layer_4/attn_mlp_ratio": 0.7984302521099857, "grad/layer_8/attn": 0.006135720759630203, "grad/layer_8/mlp": 0.0034689567983150482, "grad/layer_8/attn_mlp_ratio": 1.7687509356517122, "grad/layer_12/attn": 0.005217689089477062, "grad/layer_12/mlp": 0.006738224532455206, "grad/layer_12/attn_mlp_ratio": 0.774341814659256, "grad/layer_16/attn": 0.008496839553117752, "grad/layer_16/mlp": 0.005224945954978466, "grad/layer_16/attn_mlp_ratio": 1.626206177769384, "grad/layer_20/attn": 0.003248637542128563, "grad/layer_20/mlp": 0.005949182435870171, "grad/layer_20/attn_mlp_ratio": 0.546064526099365, "grad/layer_24/attn": 0.0057006217539310455, "grad/layer_24/mlp": 0.007349071558564901, "grad/layer_24/attn_mlp_ratio": 0.7756927702953221, "grad/layer_27/attn": 0.00391676789149642, "grad/layer_27/mlp": 0.0068933009169995785, "grad/layer_27/attn_mlp_ratio": 0.5681991663844759} {"step": 69600, "timestamp": 1778269771.4465528, "eos/sharpness": 39.106941223144524, "eos/L0_probe": 1.9660249948501587, "eos/L_plus": 2.143496513366699, "eos/L_minus": 2.1796228885650635, "eos/grad_norm": 0.11133994162082672, "eos/embed_grad_frac": 0.16988663375377655, "eos/time_s": 0.6970891952514648} {"step": 69600, "timestamp": 1778269771.4809644, "train/loss": 2.1084699630737305, "train/z_loss": 0.0013928357395343483, "train/perplexity": 8.23563082058233, "train/grad_norm": 0.111328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1893895.1555828322, "perf/iters_per_sec": 0.903079583922783, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1073221206665038, "data/tokens_consumed": 145963876352, "data/tokens_consumed_B": 145.963876352, "train/loss_slope": -1.572412001703941e-05} {"step": 69600, "timestamp": 1778269772.8493016, "geo/rankme_last": 439.1431884765625, "geo/layer_0/stable_rank_q_proj": 19.31619644165039, "geo/layer_0/stable_rank_k_proj": 16.011051177978516, "geo/layer_0/stable_rank_o_proj": 46.972076416015625, "geo/layer_0/stable_rank_gate_proj": 130.16600036621094, "geo/layer_0/stable_rank_down_proj": 55.852317810058594, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06808987259864807, "geo/layer_0/attn_entropy_mean": 6.154459476470947, "geo/layer_0/attn_entropy_std": 0.4259586036205292, "geo/layer_7/stable_rank_q_proj": 43.17185592651367, "geo/layer_7/stable_rank_k_proj": 40.09455490112305, "geo/layer_7/stable_rank_o_proj": 89.23680114746094, "geo/layer_7/stable_rank_gate_proj": 79.12174987792969, "geo/layer_7/stable_rank_down_proj": 139.02850341796875, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.44578567147254944, "geo/layer_7/attn_entropy_mean": 4.628708839416504, "geo/layer_7/attn_entropy_std": 0.8127546310424805, "geo/layer_14/stable_rank_q_proj": 50.37559127807617, "geo/layer_14/stable_rank_k_proj": 40.89066696166992, "geo/layer_14/stable_rank_o_proj": 43.338043212890625, "geo/layer_14/stable_rank_gate_proj": 71.31716918945312, "geo/layer_14/stable_rank_down_proj": 127.33316802978516, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38298067450523376, "geo/layer_14/attn_entropy_mean": 5.530856132507324, "geo/layer_14/attn_entropy_std": 0.41412830352783203, "geo/layer_21/stable_rank_q_proj": 40.025657653808594, "geo/layer_21/stable_rank_k_proj": 30.024734497070312, "geo/layer_21/stable_rank_o_proj": 69.58666229248047, "geo/layer_21/stable_rank_gate_proj": 64.4544677734375, "geo/layer_21/stable_rank_down_proj": 50.25849151611328, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14079882204532623, "geo/layer_21/attn_entropy_mean": 5.672597408294678, "geo/layer_21/attn_entropy_std": 0.30648481845855713, "geo/layer_27/stable_rank_q_proj": 43.597110748291016, "geo/layer_27/stable_rank_k_proj": 31.982053756713867, "geo/layer_27/stable_rank_o_proj": 115.04151153564453, "geo/layer_27/stable_rank_gate_proj": 78.76648712158203, "geo/layer_27/stable_rank_down_proj": 127.13634490966797, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09509044885635376, "geo/layer_27/attn_entropy_mean": 4.18170166015625, "geo/layer_27/attn_entropy_std": 0.7428147196769714, "attnres/final_alpha/block_0": 0.2382769137620926, "attnres/block_norm/0": 1.7714478969573975, "attnres/final_alpha/block_1": 0.00411803275346756, "attnres/block_norm/1": 47783.3671875, "attnres/final_alpha/block_2": 0.010010247118771076, "attnres/block_norm/2": 28832.80078125, "attnres/final_alpha/block_3": 0.011863035149872303, "attnres/block_norm/3": 60016.3203125, "attnres/final_alpha/block_4": 0.014009494334459305, "attnres/block_norm/4": 15546.732421875, "attnres/final_alpha/block_5": 0.6141624450683594, "attnres/block_norm/5": 6754.7744140625, "attnres/final_alpha/block_6": 0.10755988210439682, "attnres/block_norm/6": 40128.96875, "geo/tier1_time_s": 1.3651483058929443, "geo/step": 69600.0, "geo/rankme_slope": -1.1653235512955178e-05} {"step": 69610, "timestamp": 1778269783.796389, "train/loss": 2.1330037832260134, "train/z_loss": 0.0013786543975584209, "train/perplexity": 8.440181247723272, "train/grad_norm": 0.20703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1703301.663580643, "perf/iters_per_sec": 0.8121975248244491, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2312275886535644, "data/tokens_consumed": 145984847872, "data/tokens_consumed_B": 145.984847872, "train/loss_slope": -1.779960991561575e-05} {"step": 69620, "timestamp": 1778269794.1843934, "train/loss": 2.1366049528121946, "train/z_loss": 0.0013831724179908633, "train/perplexity": 8.470630565406182, "train/grad_norm": 0.13671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019987.5660948167, "perf/iters_per_sec": 0.9632051306222995, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0382004499435424, "data/tokens_consumed": 146005819392, "data/tokens_consumed_B": 146.005819392, "train/loss_slope": -1.7584737492437463e-05} {"step": 69630, "timestamp": 1778269804.5763752, "train/loss": 2.167535090446472, "train/z_loss": 0.0013702743337489665, "train/perplexity": 8.736722247588323, "train/grad_norm": 0.08740234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019916.0380137933, "perf/iters_per_sec": 0.9631710233754126, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03823721408844, "data/tokens_consumed": 146026790912, "data/tokens_consumed_B": 146.026790912, "train/loss_slope": -1.4167134260365914e-05} {"step": 69640, "timestamp": 1778269814.9591815, "train/loss": 2.163743233680725, "train/z_loss": 0.0013631276087835431, "train/perplexity": 8.703656577983521, "train/grad_norm": 0.166015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021207.570102428, "perf/iters_per_sec": 0.9637868738662854, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037573790550232, "data/tokens_consumed": 146047762432, "data/tokens_consumed_B": 146.047762432, "train/loss_slope": -1.1925434465348265e-05} {"step": 69650, "timestamp": 1778269825.3310874, "grad/layer_0/attn": 0.0026539615355432034, "grad/layer_0/mlp": 0.002903127809986472, "grad/layer_0/attn_mlp_ratio": 0.9141731325078164, "grad/layer_4/attn": 0.0041475179605185986, "grad/layer_4/mlp": 0.0026131393387913704, "grad/layer_4/attn_mlp_ratio": 1.58717820371534, "grad/layer_8/attn": 0.004800828173756599, "grad/layer_8/mlp": 0.003937099128961563, "grad/layer_8/attn_mlp_ratio": 1.219382061402311, "grad/layer_12/attn": 0.008547862060368061, "grad/layer_12/mlp": 0.006389087066054344, "grad/layer_12/attn_mlp_ratio": 1.3378847147028194, "grad/layer_16/attn": 0.005309481639415026, "grad/layer_16/mlp": 0.004805146250873804, "grad/layer_16/attn_mlp_ratio": 1.1049573211125006, "grad/layer_20/attn": 0.003037419868633151, "grad/layer_20/mlp": 0.005706180352717638, "grad/layer_20/attn_mlp_ratio": 0.5323035073639396, "grad/layer_24/attn": 0.005251972004771233, "grad/layer_24/mlp": 0.00781851727515459, "grad/layer_24/attn_mlp_ratio": 0.6717350301555592, "grad/layer_27/attn": 0.004316600505262613, "grad/layer_27/mlp": 0.007536375429481268, "grad/layer_27/attn_mlp_ratio": 0.5727687650882927} {"step": 69650, "timestamp": 1778269825.3468814, "train/loss": 2.1203262329101564, "train/z_loss": 0.0013832037569954992, "train/perplexity": 8.33385582229621, "train/grad_norm": 0.095703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019828.3279369248, "perf/iters_per_sec": 0.9631291999516128, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.038282299041748, "data/tokens_consumed": 146068733952, "data/tokens_consumed_B": 146.068733952, "train/loss_slope": -1.1517375866786673e-05} {"step": 69660, "timestamp": 1778269835.7248628, "train/loss": 2.1317115783691407, "train/z_loss": 0.0013846185058355332, "train/perplexity": 8.429281848167244, "train/grad_norm": 0.107421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022261.7540498204, "perf/iters_per_sec": 0.9642895479439832, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037032914161682, "data/tokens_consumed": 146089705472, "data/tokens_consumed_B": 146.089705472, "train/loss_slope": -1.380924732878944e-05} {"step": 69670, "timestamp": 1778269846.1036127, "train/loss": 2.151476275920868, "train/z_loss": 0.0013638993492349981, "train/perplexity": 8.597541376398693, "train/grad_norm": 0.1953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021861.2056628985, "perf/iters_per_sec": 0.9640985515894406, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037238359451294, "data/tokens_consumed": 146110676992, "data/tokens_consumed_B": 146.110676992, "train/loss_slope": -1.2522435238366362e-05} {"step": 69675, "timestamp": 1778269851.9082167, "eos/sharpness": 68.7474489212036, "eos/L0_probe": 1.9647812843322754, "eos/L_plus": 2.282761812210083, "eos/L_minus": 2.334275245666504, "eos/grad_norm": 0.16238097846508026, "eos/embed_grad_frac": 0.07770632952451706, "eos/time_s": 0.6225748062133789} {"step": 69675, "timestamp": 1778269853.2902822, "geo/rankme_last": 438.2775573730469, "geo/layer_0/stable_rank_q_proj": 19.31045150756836, "geo/layer_0/stable_rank_k_proj": 15.986254692077637, "geo/layer_0/stable_rank_o_proj": 46.934837341308594, "geo/layer_0/stable_rank_gate_proj": 130.42315673828125, "geo/layer_0/stable_rank_down_proj": 55.89330291748047, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06328804045915604, "geo/layer_0/attn_entropy_mean": 6.157390594482422, "geo/layer_0/attn_entropy_std": 0.4260846972465515, "geo/layer_7/stable_rank_q_proj": 43.168861389160156, "geo/layer_7/stable_rank_k_proj": 40.19648742675781, "geo/layer_7/stable_rank_o_proj": 89.24034881591797, "geo/layer_7/stable_rank_gate_proj": 79.14531707763672, "geo/layer_7/stable_rank_down_proj": 139.1884002685547, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.44669437408447266, "geo/layer_7/attn_entropy_mean": 4.649425983428955, "geo/layer_7/attn_entropy_std": 0.7880717515945435, "geo/layer_14/stable_rank_q_proj": 50.38772964477539, "geo/layer_14/stable_rank_k_proj": 40.8748893737793, "geo/layer_14/stable_rank_o_proj": 43.31082534790039, "geo/layer_14/stable_rank_gate_proj": 71.27265167236328, "geo/layer_14/stable_rank_down_proj": 127.26658630371094, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3894135355949402, "geo/layer_14/attn_entropy_mean": 5.516918659210205, "geo/layer_14/attn_entropy_std": 0.4394679069519043, "geo/layer_21/stable_rank_q_proj": 40.01804733276367, "geo/layer_21/stable_rank_k_proj": 30.01835060119629, "geo/layer_21/stable_rank_o_proj": 69.4493179321289, "geo/layer_21/stable_rank_gate_proj": 64.53346252441406, "geo/layer_21/stable_rank_down_proj": 50.24745559692383, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.138291597366333, "geo/layer_21/attn_entropy_mean": 5.679928779602051, "geo/layer_21/attn_entropy_std": 0.3112433850765228, "geo/layer_27/stable_rank_q_proj": 43.61662292480469, "geo/layer_27/stable_rank_k_proj": 32.03270721435547, "geo/layer_27/stable_rank_o_proj": 115.00408935546875, "geo/layer_27/stable_rank_gate_proj": 78.697509765625, "geo/layer_27/stable_rank_down_proj": 127.15973663330078, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09222349524497986, "geo/layer_27/attn_entropy_mean": 4.161812782287598, "geo/layer_27/attn_entropy_std": 0.7601069808006287, "attnres/final_alpha/block_0": 0.23992374539375305, "attnres/block_norm/0": 1.7713840007781982, "attnres/final_alpha/block_1": 0.004215718246996403, "attnres/block_norm/1": 47782.7578125, "attnres/final_alpha/block_2": 0.010155723430216312, "attnres/block_norm/2": 28848.212890625, "attnres/final_alpha/block_3": 0.011931391432881355, "attnres/block_norm/3": 60137.484375, "attnres/final_alpha/block_4": 0.01435115560889244, "attnres/block_norm/4": 15560.736328125, "attnres/final_alpha/block_5": 0.6099231243133545, "attnres/block_norm/5": 6808.83984375, "attnres/final_alpha/block_6": 0.10949917882680893, "attnres/block_norm/6": 39873.12890625, "geo/tier1_time_s": 1.3628628253936768, "geo/step": 69675.0, "geo/rankme_slope": -2.6951874499799916e-05} {"step": 69680, "timestamp": 1778269858.4855597, "train/loss": 2.138667106628418, "train/z_loss": 0.0013655926566570997, "train/perplexity": 8.488116331525633, "train/grad_norm": 0.1767578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1694622.9335674185, "perf/iters_per_sec": 0.8080591838681309, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2375331163406371, "data/tokens_consumed": 146131648512, "data/tokens_consumed_B": 146.131648512, "train/loss_slope": -1.3858099939441873e-05} {"step": 69690, "timestamp": 1778269868.8614287, "train/loss": 2.0984405755996702, "train/z_loss": 0.00138817933620885, "train/perplexity": 8.15344531205366, "train/grad_norm": 0.12451171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022153.989675675, "perf/iters_per_sec": 0.9642381618860603, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0370881795883178, "data/tokens_consumed": 146152620032, "data/tokens_consumed_B": 146.152620032, "train/loss_slope": -1.2205383336261496e-05} {"step": 69700, "timestamp": 1778269879.2260723, "grad/layer_0/attn": 0.0030998566653579473, "grad/layer_0/mlp": 0.003329393919557333, "grad/layer_0/attn_mlp_ratio": 0.9310573176827223, "grad/layer_4/attn": 0.0031010182574391365, "grad/layer_4/mlp": 0.002536847721785307, "grad/layer_4/attn_mlp_ratio": 1.2223903344966094, "grad/layer_8/attn": 0.0034539527259767056, "grad/layer_8/mlp": 0.0035523814149200916, "grad/layer_8/attn_mlp_ratio": 0.9722921683580487, "grad/layer_12/attn": 0.004879670217633247, "grad/layer_12/mlp": 0.00652153417468071, "grad/layer_12/attn_mlp_ratio": 0.7482396031526114, "grad/layer_16/attn": 0.005862120073288679, "grad/layer_16/mlp": 0.0042974334210157394, "grad/layer_16/attn_mlp_ratio": 1.3640979074187312, "grad/layer_20/attn": 0.0038856423925608397, "grad/layer_20/mlp": 0.005346674472093582, "grad/layer_20/attn_mlp_ratio": 0.7267400213286873, "grad/layer_24/attn": 0.009587451815605164, "grad/layer_24/mlp": 0.009593965485692024, "grad/layer_24/attn_mlp_ratio": 0.9993210555084151, "grad/layer_27/attn": 0.006615158636122942, "grad/layer_27/mlp": 0.009300386533141136, "grad/layer_27/attn_mlp_ratio": 0.7112778099515119} {"step": 69700, "timestamp": 1778269879.2418566, "train/loss": 2.144632709026337, "train/z_loss": 0.0013781790505163372, "train/perplexity": 8.538904398752482, "train/grad_norm": 0.1328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021405.1632861458, "perf/iters_per_sec": 0.9638810936384896, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0374723672866821, "data/tokens_consumed": 146173591552, "data/tokens_consumed_B": 146.173591552, "train/loss_slope": -1.1193888108007125e-05} {"step": 69710, "timestamp": 1778269889.6339567, "train/loss": 2.111189031600952, "train/z_loss": 0.0013756834319792687, "train/perplexity": 8.258054537143874, "train/grad_norm": 0.1982421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019011.4271275876, "perf/iters_per_sec": 0.9627396712911547, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.038702392578125, "data/tokens_consumed": 146194563072, "data/tokens_consumed_B": 146.194563072, "train/loss_slope": -1.2991798921923294e-05} {"step": 69720, "timestamp": 1778269900.0208979, "train/loss": 2.16063129901886, "train/z_loss": 0.0013820748194120825, "train/perplexity": 8.67661346741419, "train/grad_norm": 0.130859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020379.4832973164, "perf/iters_per_sec": 0.9633920113073904, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0379990577697753, "data/tokens_consumed": 146215534592, "data/tokens_consumed_B": 146.215534592, "train/loss_slope": -1.559151195385348e-05} {"step": 69730, "timestamp": 1778269910.4038544, "train/loss": 2.1649691343307493, "train/z_loss": 0.0013786511379294097, "train/perplexity": 8.714332938981936, "train/grad_norm": 0.1455078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021141.7607331695, "perf/iters_per_sec": 0.963755493513665, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0376075744628905, "data/tokens_consumed": 146236506112, "data/tokens_consumed_B": 146.236506112, "train/loss_slope": -1.132629949911346e-05} {"step": 69740, "timestamp": 1778269920.7609506, "train/loss": 2.152137279510498, "train/z_loss": 0.001385813660454005, "train/perplexity": 8.603226260767983, "train/grad_norm": 0.126953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025746.9395810552, "perf/iters_per_sec": 0.9659514139085079, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352487564086914, "data/tokens_consumed": 146257477632, "data/tokens_consumed_B": 146.257477632, "train/loss_slope": -8.035958582239793e-06} {"step": 69750, "timestamp": 1778269931.1219075, "grad/layer_0/attn": 0.002642339561134577, "grad/layer_0/mlp": 0.0030060429126024246, "grad/layer_0/attn_mlp_ratio": 0.8790092324218014, "grad/layer_4/attn": 0.0041559855453670025, "grad/layer_4/mlp": 0.0024762728717178106, "grad/layer_4/attn_mlp_ratio": 1.678322863768917, "grad/layer_8/attn": 0.0035155434161424637, "grad/layer_8/mlp": 0.003460689913481474, "grad/layer_8/attn_mlp_ratio": 1.0158504235997163, "grad/layer_12/attn": 0.004606115631759167, "grad/layer_12/mlp": 0.00658687949180603, "grad/layer_12/attn_mlp_ratio": 0.6992864477876746, "grad/layer_16/attn": 0.004502637777477503, "grad/layer_16/mlp": 0.004385733045637608, "grad/layer_16/attn_mlp_ratio": 1.0266556646193072, "grad/layer_20/attn": 0.003299611620604992, "grad/layer_20/mlp": 0.006051873322576284, "grad/layer_20/attn_mlp_ratio": 0.5452215190582004, "grad/layer_24/attn": 0.005709987133741379, "grad/layer_24/mlp": 0.008049839176237583, "grad/layer_24/attn_mlp_ratio": 0.7093293341392242, "grad/layer_27/attn": 0.005386033095419407, "grad/layer_27/mlp": 0.006790827494114637, "grad/layer_27/attn_mlp_ratio": 0.7931335350182186} {"step": 69750, "timestamp": 1778269931.7341664, "eos/sharpness": 6.121659278869628, "eos/L0_probe": 1.969024658203125, "eos/L_plus": 2.0048115253448486, "eos/L_minus": 1.9944543838500977, "eos/grad_norm": 0.08506377786397934, "eos/embed_grad_frac": 0.325335830450058, "eos/time_s": 0.6094086170196533} {"step": 69750, "timestamp": 1778269931.7547538, "train/loss": 2.1213842749595644, "train/z_loss": 0.0013828210765495896, "train/perplexity": 8.342678058515341, "train/grad_norm": 0.0849609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1908933.3451207345, "perf/iters_per_sec": 0.9102503514865563, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0985988616943358, "data/tokens_consumed": 146278449152, "data/tokens_consumed_B": 146.278449152, "train/loss_slope": -9.45470958295402e-06} {"step": 69750, "timestamp": 1778269933.1172128, "geo/rankme_last": 439.1153564453125, "geo/layer_0/stable_rank_q_proj": 19.299819946289062, "geo/layer_0/stable_rank_k_proj": 15.976749420166016, "geo/layer_0/stable_rank_o_proj": 46.95477294921875, "geo/layer_0/stable_rank_gate_proj": 130.31654357910156, "geo/layer_0/stable_rank_down_proj": 55.95437240600586, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0638931542634964, "geo/layer_0/attn_entropy_mean": 6.159661293029785, "geo/layer_0/attn_entropy_std": 0.42807120084762573, "geo/layer_7/stable_rank_q_proj": 43.03595733642578, "geo/layer_7/stable_rank_k_proj": 40.14771270751953, "geo/layer_7/stable_rank_o_proj": 89.20612335205078, "geo/layer_7/stable_rank_gate_proj": 79.11984252929688, "geo/layer_7/stable_rank_down_proj": 139.5635986328125, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4535611867904663, "geo/layer_7/attn_entropy_mean": 4.6542134284973145, "geo/layer_7/attn_entropy_std": 0.7854307293891907, "geo/layer_14/stable_rank_q_proj": 50.180667877197266, "geo/layer_14/stable_rank_k_proj": 40.95783615112305, "geo/layer_14/stable_rank_o_proj": 43.329734802246094, "geo/layer_14/stable_rank_gate_proj": 71.313232421875, "geo/layer_14/stable_rank_down_proj": 127.13008117675781, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4083849787712097, "geo/layer_14/attn_entropy_mean": 5.572386741638184, "geo/layer_14/attn_entropy_std": 0.4316149353981018, "geo/layer_21/stable_rank_q_proj": 39.97352600097656, "geo/layer_21/stable_rank_k_proj": 30.04961395263672, "geo/layer_21/stable_rank_o_proj": 69.45549011230469, "geo/layer_21/stable_rank_gate_proj": 64.62804412841797, "geo/layer_21/stable_rank_down_proj": 50.247901916503906, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1416858285665512, "geo/layer_21/attn_entropy_mean": 5.703469753265381, "geo/layer_21/attn_entropy_std": 0.3075594902038574, "geo/layer_27/stable_rank_q_proj": 43.61098098754883, "geo/layer_27/stable_rank_k_proj": 32.0151252746582, "geo/layer_27/stable_rank_o_proj": 115.16320037841797, "geo/layer_27/stable_rank_gate_proj": 78.60957336425781, "geo/layer_27/stable_rank_down_proj": 127.02640533447266, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10183949023485184, "geo/layer_27/attn_entropy_mean": 4.177617073059082, "geo/layer_27/attn_entropy_std": 0.7645343542098999, "attnres/final_alpha/block_0": 0.23876360058784485, "attnres/block_norm/0": 1.771545648574829, "attnres/final_alpha/block_1": 0.004166137892752886, "attnres/block_norm/1": 47829.91015625, "attnres/final_alpha/block_2": 0.010201623663306236, "attnres/block_norm/2": 28866.927734375, "attnres/final_alpha/block_3": 0.011920643970370293, "attnres/block_norm/3": 60729.2734375, "attnres/final_alpha/block_4": 0.01401957031339407, "attnres/block_norm/4": 15475.349609375, "attnres/final_alpha/block_5": 0.6131051778793335, "attnres/block_norm/5": 6751.1923828125, "attnres/final_alpha/block_6": 0.10782328248023987, "attnres/block_norm/6": 40085.3671875, "geo/tier1_time_s": 1.358905553817749, "geo/step": 69750.0, "geo/rankme_slope": -1.4943516469087633e-05} {"step": 69760, "timestamp": 1778269943.4670885, "train/loss": 2.132866930961609, "train/z_loss": 0.0013832343742251397, "train/perplexity": 8.43902626884008, "train/grad_norm": 0.0869140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1791056.4963632124, "perf/iters_per_sec": 0.8540422899070799, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1709022045135498, "data/tokens_consumed": 146299420672, "data/tokens_consumed_B": 146.299420672, "train/loss_slope": -1.0731409113220441e-05} {"step": 69770, "timestamp": 1778269953.8191948, "train/loss": 2.2040435314178466, "train/z_loss": 0.0013667640858329832, "train/perplexity": 9.061580304755953, "train/grad_norm": 0.0927734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027112.416093102, "perf/iters_per_sec": 0.9666025238481054, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345514059066772, "data/tokens_consumed": 146320392192, "data/tokens_consumed_B": 146.320392192, "train/loss_slope": -3.4906192545485947e-06} {"step": 69780, "timestamp": 1778269964.1725197, "train/loss": 2.1617878079414368, "train/z_loss": 0.0013731591869145631, "train/perplexity": 8.68665385308572, "train/grad_norm": 0.2451171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026482.1326936882, "perf/iters_per_sec": 0.9663019813030663, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348731756210328, "data/tokens_consumed": 146341363712, "data/tokens_consumed_B": 146.341363712, "train/loss_slope": -1.763978277710596e-06} {"step": 69790, "timestamp": 1778269974.548637, "train/loss": 2.1257460474967957, "train/z_loss": 0.0013795171631500124, "train/perplexity": 8.37914639804117, "train/grad_norm": 0.09619140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022619.7648441957, "perf/iters_per_sec": 0.9644602607937792, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036849355697632, "data/tokens_consumed": 146362335232, "data/tokens_consumed_B": 146.362335232, "train/loss_slope": -3.3972810490010463e-06} {"step": 69800, "timestamp": 1778269984.8848805, "grad/layer_0/attn": 0.0025056013837456703, "grad/layer_0/mlp": 0.0029200008139014244, "grad/layer_0/attn_mlp_ratio": 0.8580823971037485, "grad/layer_4/attn": 0.0022030482068657875, "grad/layer_4/mlp": 0.0025098882615566254, "grad/layer_4/attn_mlp_ratio": 0.8777474889359077, "grad/layer_8/attn": 0.006316801067441702, "grad/layer_8/mlp": 0.0034556626342236996, "grad/layer_8/attn_mlp_ratio": 1.8279564741322318, "grad/layer_12/attn": 0.005275374744087458, "grad/layer_12/mlp": 0.0069743189960718155, "grad/layer_12/attn_mlp_ratio": 0.7563999684297119, "grad/layer_16/attn": 0.003412982914596796, "grad/layer_16/mlp": 0.004723468795418739, "grad/layer_16/attn_mlp_ratio": 0.7225585666303478, "grad/layer_20/attn": 0.002693707589060068, "grad/layer_20/mlp": 0.0053482940420508385, "grad/layer_20/attn_mlp_ratio": 0.5036573377445446, "grad/layer_24/attn": 0.00689124409109354, "grad/layer_24/mlp": 0.007728517986834049, "grad/layer_24/attn_mlp_ratio": 0.8916643544941881, "grad/layer_27/attn": 0.00840151309967041, "grad/layer_27/mlp": 0.007293347734957933, "grad/layer_27/attn_mlp_ratio": 1.1519419188264821} {"step": 69800, "timestamp": 1778269984.9005983, "train/loss": 2.142700362205505, "train/z_loss": 0.0013779827393591404, "train/perplexity": 8.522420205702465, "train/grad_norm": 0.1259765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027127.1784635456, "perf/iters_per_sec": 0.9666095630948761, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345438718795776, "data/tokens_consumed": 146383306752, "data/tokens_consumed_B": 146.383306752, "train/loss_slope": -1.7067305432974109e-06} {"step": 69810, "timestamp": 1778269995.2477639, "train/loss": 2.088774633407593, "train/z_loss": 0.0013749202713370322, "train/perplexity": 8.075014246725965, "train/grad_norm": 0.1435546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027612.1203838398, "perf/iters_per_sec": 0.9668408014220428, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034296441078186, "data/tokens_consumed": 146404278272, "data/tokens_consumed_B": 146.404278272, "train/loss_slope": -3.42916272046363e-06} {"step": 69820, "timestamp": 1778270005.5993693, "train/loss": 2.1544986844062803, "train/z_loss": 0.001364919065963477, "train/perplexity": 8.623565967069124, "train/grad_norm": 0.0888671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026837.3894748755, "perf/iters_per_sec": 0.9664713809370401, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346917867660523, "data/tokens_consumed": 146425249792, "data/tokens_consumed_B": 146.425249792, "train/loss_slope": -1.6956258027574451e-06} {"step": 69825, "timestamp": 1778270011.388516, "eos/sharpness": 56.00023269653319, "eos/L0_probe": 1.9705238342285156, "eos/L_plus": 2.281327486038208, "eos/L_minus": 2.2197225093841553, "eos/grad_norm": 0.12382224202156067, "eos/embed_grad_frac": 0.15474367141723633, "eos/time_s": 0.6203045845031738} {"step": 69825, "timestamp": 1778270012.7677052, "geo/rankme_last": 439.1440124511719, "geo/layer_0/stable_rank_q_proj": 19.31767463684082, "geo/layer_0/stable_rank_k_proj": 15.994422912597656, "geo/layer_0/stable_rank_o_proj": 46.94831848144531, "geo/layer_0/stable_rank_gate_proj": 130.19589233398438, "geo/layer_0/stable_rank_down_proj": 55.87177658081055, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0631818100810051, "geo/layer_0/attn_entropy_mean": 6.15439510345459, "geo/layer_0/attn_entropy_std": 0.427704781293869, "geo/layer_7/stable_rank_q_proj": 43.05045700073242, "geo/layer_7/stable_rank_k_proj": 40.209556579589844, "geo/layer_7/stable_rank_o_proj": 89.25187683105469, "geo/layer_7/stable_rank_gate_proj": 78.98856353759766, "geo/layer_7/stable_rank_down_proj": 139.35203552246094, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.46288129687309265, "geo/layer_7/attn_entropy_mean": 4.665372848510742, "geo/layer_7/attn_entropy_std": 0.7933372259140015, "geo/layer_14/stable_rank_q_proj": 50.14379119873047, "geo/layer_14/stable_rank_k_proj": 40.865535736083984, "geo/layer_14/stable_rank_o_proj": 43.309932708740234, "geo/layer_14/stable_rank_gate_proj": 71.34970092773438, "geo/layer_14/stable_rank_down_proj": 127.3413314819336, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.41094446182250977, "geo/layer_14/attn_entropy_mean": 5.508151531219482, "geo/layer_14/attn_entropy_std": 0.43119582533836365, "geo/layer_21/stable_rank_q_proj": 39.922088623046875, "geo/layer_21/stable_rank_k_proj": 30.09811019897461, "geo/layer_21/stable_rank_o_proj": 69.44274139404297, "geo/layer_21/stable_rank_gate_proj": 64.602294921875, "geo/layer_21/stable_rank_down_proj": 50.23235321044922, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14446353912353516, "geo/layer_21/attn_entropy_mean": 5.676839828491211, "geo/layer_21/attn_entropy_std": 0.30955052375793457, "geo/layer_27/stable_rank_q_proj": 43.614681243896484, "geo/layer_27/stable_rank_k_proj": 32.11027908325195, "geo/layer_27/stable_rank_o_proj": 115.0925064086914, "geo/layer_27/stable_rank_gate_proj": 78.60836791992188, "geo/layer_27/stable_rank_down_proj": 126.92254638671875, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09621258080005646, "geo/layer_27/attn_entropy_mean": 4.183927536010742, "geo/layer_27/attn_entropy_std": 0.7469450235366821, "attnres/final_alpha/block_0": 0.23738791048526764, "attnres/block_norm/0": 1.7715208530426025, "attnres/final_alpha/block_1": 0.004138054326176643, "attnres/block_norm/1": 47878.6328125, "attnres/final_alpha/block_2": 0.009988373145461082, "attnres/block_norm/2": 28939.720703125, "attnres/final_alpha/block_3": 0.011757789179682732, "attnres/block_norm/3": 60310.89453125, "attnres/final_alpha/block_4": 0.014013632200658321, "attnres/block_norm/4": 15552.08203125, "attnres/final_alpha/block_5": 0.6156619191169739, "attnres/block_norm/5": 6758.93701171875, "attnres/final_alpha/block_6": 0.10705232620239258, "attnres/block_norm/6": 40209.8046875, "geo/tier1_time_s": 1.3591868877410889, "geo/step": 69825.0, "geo/rankme_slope": -1.8091377175870347e-05} {"step": 69830, "timestamp": 1778270017.9468815, "train/loss": 2.105101537704468, "train/z_loss": 0.0013703474192880093, "train/perplexity": 8.207936382304394, "train/grad_norm": 0.1953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1699117.094742934, "perf/iters_per_sec": 0.8102021669115705, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.234259843826294, "data/tokens_consumed": 146446221312, "data/tokens_consumed_B": 146.446221312, "train/loss_slope": -2.996747779159493e-06} {"step": 69840, "timestamp": 1778270028.2983925, "train/loss": 2.177388858795166, "train/z_loss": 0.0013626245083287359, "train/perplexity": 8.823237435023257, "train/grad_norm": 0.12890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027222.905507014, "perf/iters_per_sec": 0.9666552093062467, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344950199127196, "data/tokens_consumed": 146467192832, "data/tokens_consumed_B": 146.467192832, "train/loss_slope": -2.8058035395863053e-06} {"step": 69850, "timestamp": 1778270038.6357336, "grad/layer_0/attn": 0.0026395018212497234, "grad/layer_0/mlp": 0.002872298937290907, "grad/layer_0/attn_mlp_ratio": 0.9189509124855074, "grad/layer_4/attn": 0.002345242304727435, "grad/layer_4/mlp": 0.0026181694120168686, "grad/layer_4/attn_mlp_ratio": 0.8957564794652322, "grad/layer_8/attn": 0.005927093327045441, "grad/layer_8/mlp": 0.0035986152943223715, "grad/layer_8/attn_mlp_ratio": 1.6470482887381543, "grad/layer_12/attn": 0.0056527466513216496, "grad/layer_12/mlp": 0.006902344059199095, "grad/layer_12/attn_mlp_ratio": 0.8189604170617826, "grad/layer_16/attn": 0.004156979266554117, "grad/layer_16/mlp": 0.00512739410623908, "grad/layer_16/attn_mlp_ratio": 0.8107391589856404, "grad/layer_20/attn": 0.0037349634803831577, "grad/layer_20/mlp": 0.007021971046924591, "grad/layer_20/attn_mlp_ratio": 0.5318967284590678, "grad/layer_24/attn": 0.02258295938372612, "grad/layer_24/mlp": 0.014841252006590366, "grad/layer_24/attn_mlp_ratio": 1.521634375693813, "grad/layer_27/attn": 0.017067868262529373, "grad/layer_27/mlp": 0.01317151915282011, "grad/layer_27/attn_mlp_ratio": 1.29581621792604} {"step": 69850, "timestamp": 1778270038.65143, "train/loss": 2.146531009674072, "train/z_loss": 0.0013800141168758273, "train/perplexity": 8.555129201408045, "train/grad_norm": 0.298828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026553.2394530263, "perf/iters_per_sec": 0.9663358876481182, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348368644714356, "data/tokens_consumed": 146488164352, "data/tokens_consumed_B": 146.488164352, "train/loss_slope": -8.498570813885532e-08} {"step": 69860, "timestamp": 1778270049.0049102, "train/loss": 2.123586869239807, "train/z_loss": 0.0013703664648346604, "train/perplexity": 8.361073845279146, "train/grad_norm": 0.2021484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026898.2924686985, "perf/iters_per_sec": 0.9665004217475407, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346606969833374, "data/tokens_consumed": 146509135872, "data/tokens_consumed_B": 146.509135872, "train/loss_slope": 2.822189190850349e-07} {"step": 69870, "timestamp": 1778270059.3534293, "train/loss": 2.125150728225708, "train/z_loss": 0.001370796258561313, "train/perplexity": 8.374159615226382, "train/grad_norm": 0.1845703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027949.39877285, "perf/iters_per_sec": 0.9670016282905817, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341244220733643, "data/tokens_consumed": 146530107392, "data/tokens_consumed_B": 146.530107392, "train/loss_slope": -3.0654383106271946e-06} {"step": 69880, "timestamp": 1778270069.699437, "train/loss": 2.126647174358368, "train/z_loss": 0.0013788633164949714, "train/perplexity": 8.386700475017152, "train/grad_norm": 0.08251953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027957.1600673913, "perf/iters_per_sec": 0.9670053291642148, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034120464324951, "data/tokens_consumed": 146551078912, "data/tokens_consumed_B": 146.551078912, "train/loss_slope": -2.7584388239620685e-06} {"step": 69890, "timestamp": 1778270080.0532703, "train/loss": 2.1254246711730955, "train/z_loss": 0.0013742263778112828, "train/perplexity": 8.376453971440277, "train/grad_norm": 0.212890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026739.7374823762, "perf/iters_per_sec": 0.9664248168384438, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347416400909424, "data/tokens_consumed": 146572050432, "data/tokens_consumed_B": 146.572050432, "train/loss_slope": -5.6135446646891154e-06} {"step": 69900, "timestamp": 1778270090.392899, "grad/layer_0/attn": 0.0030375055503100157, "grad/layer_0/mlp": 0.0033946826588362455, "grad/layer_0/attn_mlp_ratio": 0.8947833320811762, "grad/layer_4/attn": 0.002111824695020914, "grad/layer_4/mlp": 0.0025908839888870716, "grad/layer_4/attn_mlp_ratio": 0.8150980987837475, "grad/layer_8/attn": 0.006265752483159304, "grad/layer_8/mlp": 0.0037516511511057615, "grad/layer_8/attn_mlp_ratio": 1.6701319135973889, "grad/layer_12/attn": 0.006020399276167154, "grad/layer_12/mlp": 0.006553737446665764, "grad/layer_12/attn_mlp_ratio": 0.9186207462991943, "grad/layer_16/attn": 0.0033955934923142195, "grad/layer_16/mlp": 0.004443022422492504, "grad/layer_16/attn_mlp_ratio": 0.7642530451115778, "grad/layer_20/attn": 0.0034389267675578594, "grad/layer_20/mlp": 0.005743885412812233, "grad/layer_20/attn_mlp_ratio": 0.5987108830576509, "grad/layer_24/attn": 0.01182775292545557, "grad/layer_24/mlp": 0.00954578910022974, "grad/layer_24/attn_mlp_ratio": 1.2390544854238883, "grad/layer_27/attn": 0.007826344110071659, "grad/layer_27/mlp": 0.0084430743008852, "grad/layer_27/attn_mlp_ratio": 0.926954298691377} {"step": 69900, "timestamp": 1778270091.0004249, "eos/sharpness": 63.532996177673326, "eos/L0_probe": 1.964513897895813, "eos/L_plus": 2.3285648822784424, "eos/L_minus": 2.235792875289917, "eos/grad_norm": 0.14354373514652252, "eos/embed_grad_frac": 0.10490523278713226, "eos/time_s": 0.6047992706298828} {"step": 69900, "timestamp": 1778270091.0199277, "train/loss": 2.0663320899009703, "train/z_loss": 0.0013833188218995928, "train/perplexity": 7.895808822544686, "train/grad_norm": 0.1435546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1913386.715926355, "perf/iters_per_sec": 0.9123738841659331, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0960418939590455, "data/tokens_consumed": 146593021952, "data/tokens_consumed_B": 146.593021952, "train/loss_slope": -8.230654284147924e-06} {"step": 69900, "timestamp": 1778270092.3819149, "geo/rankme_last": 438.9687805175781, "geo/layer_0/stable_rank_q_proj": 19.287540435791016, "geo/layer_0/stable_rank_k_proj": 16.022127151489258, "geo/layer_0/stable_rank_o_proj": 46.91236877441406, "geo/layer_0/stable_rank_gate_proj": 130.25384521484375, "geo/layer_0/stable_rank_down_proj": 55.87061309814453, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06452184915542603, "geo/layer_0/attn_entropy_mean": 6.158160209655762, "geo/layer_0/attn_entropy_std": 0.42568597197532654, "geo/layer_7/stable_rank_q_proj": 43.00566101074219, "geo/layer_7/stable_rank_k_proj": 40.21004867553711, "geo/layer_7/stable_rank_o_proj": 89.44945526123047, "geo/layer_7/stable_rank_gate_proj": 79.14673614501953, "geo/layer_7/stable_rank_down_proj": 139.2266845703125, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.445928692817688, "geo/layer_7/attn_entropy_mean": 4.664243221282959, "geo/layer_7/attn_entropy_std": 0.7743581533432007, "geo/layer_14/stable_rank_q_proj": 50.146697998046875, "geo/layer_14/stable_rank_k_proj": 40.946685791015625, "geo/layer_14/stable_rank_o_proj": 43.32528305053711, "geo/layer_14/stable_rank_gate_proj": 71.42517852783203, "geo/layer_14/stable_rank_down_proj": 127.32906341552734, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38887515664100647, "geo/layer_14/attn_entropy_mean": 5.558968544006348, "geo/layer_14/attn_entropy_std": 0.42596063017845154, "geo/layer_21/stable_rank_q_proj": 39.94206619262695, "geo/layer_21/stable_rank_k_proj": 30.010278701782227, "geo/layer_21/stable_rank_o_proj": 69.39488220214844, "geo/layer_21/stable_rank_gate_proj": 64.5606689453125, "geo/layer_21/stable_rank_down_proj": 50.20964050292969, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14281992614269257, "geo/layer_21/attn_entropy_mean": 5.680450439453125, "geo/layer_21/attn_entropy_std": 0.3091747760772705, "geo/layer_27/stable_rank_q_proj": 43.62278366088867, "geo/layer_27/stable_rank_k_proj": 32.170860290527344, "geo/layer_27/stable_rank_o_proj": 115.1177749633789, "geo/layer_27/stable_rank_gate_proj": 78.6218032836914, "geo/layer_27/stable_rank_down_proj": 126.92760467529297, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10112493485212326, "geo/layer_27/attn_entropy_mean": 4.172147274017334, "geo/layer_27/attn_entropy_std": 0.7592014074325562, "attnres/final_alpha/block_0": 0.23665979504585266, "attnres/block_norm/0": 1.77165687084198, "attnres/final_alpha/block_1": 0.004133561626076698, "attnres/block_norm/1": 47918.63671875, "attnres/final_alpha/block_2": 0.010013105347752571, "attnres/block_norm/2": 28942.669921875, "attnres/final_alpha/block_3": 0.011683482676744461, "attnres/block_norm/3": 60224.71484375, "attnres/final_alpha/block_4": 0.013920491561293602, "attnres/block_norm/4": 15670.6640625, "attnres/final_alpha/block_5": 0.6167930364608765, "attnres/block_norm/5": 6804.0244140625, "attnres/final_alpha/block_6": 0.10679654777050018, "attnres/block_norm/6": 40217.88671875, "geo/tier1_time_s": 1.3577017784118652, "geo/step": 69900.0, "geo/rankme_slope": 8.808796956282513e-06} {"step": 69910, "timestamp": 1778270102.7329805, "train/loss": 2.13929545879364, "train/z_loss": 0.0013718278147280215, "train/perplexity": 8.493451533818622, "train/grad_norm": 0.267578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1791043.91450458, "perf/iters_per_sec": 0.8540362904093647, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1709104299545288, "data/tokens_consumed": 146613993472, "data/tokens_consumed_B": 146.613993472, "train/loss_slope": -9.387216051526892e-06} {"step": 69920, "timestamp": 1778270113.0833511, "train/loss": 2.0856151461601256, "train/z_loss": 0.0013847243157215417, "train/perplexity": 8.049541603625588, "train/grad_norm": 0.2470703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027201.8345033063, "perf/iters_per_sec": 0.9666451618687183, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345057725906373, "data/tokens_consumed": 146634964992, "data/tokens_consumed_B": 146.634964992, "train/loss_slope": -9.989998786255594e-06} {"step": 69930, "timestamp": 1778270123.4314191, "train/loss": 2.1562818765640257, "train/z_loss": 0.0013630008208565414, "train/perplexity": 8.638957160923114, "train/grad_norm": 0.1005859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027834.9967024976, "perf/iters_per_sec": 0.9669470771324623, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341827630996705, "data/tokens_consumed": 146655936512, "data/tokens_consumed_B": 146.655936512, "train/loss_slope": -7.63557704761874e-06} {"step": 69940, "timestamp": 1778270133.7907891, "train/loss": 2.13827782869339, "train/z_loss": 0.0013822911074385046, "train/perplexity": 8.484812738177535, "train/grad_norm": 0.255859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025394.4434015427, "perf/iters_per_sec": 0.9657833306319917, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354289293289185, "data/tokens_consumed": 146676908032, "data/tokens_consumed_B": 146.676908032, "train/loss_slope": -5.1363882273122894e-06} {"step": 69950, "timestamp": 1778270144.1404488, "grad/layer_0/attn": 0.0028900946490466595, "grad/layer_0/mlp": 0.003061133436858654, "grad/layer_0/attn_mlp_ratio": 0.944125636548507, "grad/layer_4/attn": 0.0031989356502890587, "grad/layer_4/mlp": 0.0026350996922701597, "grad/layer_4/attn_mlp_ratio": 1.2139713492721844, "grad/layer_8/attn": 0.003980264067649841, "grad/layer_8/mlp": 0.003659675130620599, "grad/layer_8/attn_mlp_ratio": 1.0876003516232444, "grad/layer_12/attn": 0.004835836123675108, "grad/layer_12/mlp": 0.0070171961560845375, "grad/layer_12/attn_mlp_ratio": 0.6891407831841678, "grad/layer_16/attn": 0.004032370168715715, "grad/layer_16/mlp": 0.004555670544505119, "grad/layer_16/attn_mlp_ratio": 0.885132065808886, "grad/layer_20/attn": 0.004805667791515589, "grad/layer_20/mlp": 0.00608502933755517, "grad/layer_20/attn_mlp_ratio": 0.7897525954198834, "grad/layer_24/attn": 0.009254049509763718, "grad/layer_24/mlp": 0.008363110013306141, "grad/layer_24/attn_mlp_ratio": 1.1065320657490862, "grad/layer_27/attn": 0.0046971808187663555, "grad/layer_27/mlp": 0.006960202939808369, "grad/layer_27/attn_mlp_ratio": 0.6748626141940366} {"step": 69950, "timestamp": 1778270144.1562722, "train/loss": 2.147987794876099, "train/z_loss": 0.0013715867884457112, "train/perplexity": 8.567601269386408, "train/grad_norm": 0.11474609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024479.5149991263, "perf/iters_per_sec": 0.9653470587726242, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0358968734741212, "data/tokens_consumed": 146697879552, "data/tokens_consumed_B": 146.697879552, "train/loss_slope": -4.5392874515418e-06} {"step": 69960, "timestamp": 1778270154.516215, "train/loss": 2.146957552433014, "train/z_loss": 0.001373054285068065, "train/perplexity": 8.558779108186032, "train/grad_norm": 0.1875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025692.1236640182, "perf/iters_per_sec": 0.9659252756424037, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352767705917358, "data/tokens_consumed": 146718851072, "data/tokens_consumed_B": 146.718851072, "train/loss_slope": -3.8521106057863915e-06} {"step": 69970, "timestamp": 1778270164.8959632, "train/loss": 2.1546195268630983, "train/z_loss": 0.001365480525419116, "train/perplexity": 8.624608122934188, "train/grad_norm": 0.11767578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021935.2419351186, "perf/iters_per_sec": 0.96413385483509, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037200379371643, "data/tokens_consumed": 146739822592, "data/tokens_consumed_B": 146.739822592, "train/loss_slope": -2.3339773776686525e-06} {"step": 69975, "timestamp": 1778270170.6939077, "eos/sharpness": 13.850545883178707, "eos/L0_probe": 1.9650369882583618, "eos/L_plus": 2.034855842590332, "eos/L_minus": 2.0337235927581787, "eos/grad_norm": 0.09868675470352173, "eos/embed_grad_frac": 0.21687492728233337, "eos/time_s": 0.6143701076507568} {"step": 69975, "timestamp": 1778270172.0824492, "geo/rankme_last": 438.3870849609375, "geo/layer_0/stable_rank_q_proj": 19.249523162841797, "geo/layer_0/stable_rank_k_proj": 15.990504264831543, "geo/layer_0/stable_rank_o_proj": 46.85319137573242, "geo/layer_0/stable_rank_gate_proj": 130.0147247314453, "geo/layer_0/stable_rank_down_proj": 55.954105377197266, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06258949637413025, "geo/layer_0/attn_entropy_mean": 6.158973217010498, "geo/layer_0/attn_entropy_std": 0.4278865158557892, "geo/layer_7/stable_rank_q_proj": 42.97320556640625, "geo/layer_7/stable_rank_k_proj": 40.21857452392578, "geo/layer_7/stable_rank_o_proj": 89.32788848876953, "geo/layer_7/stable_rank_gate_proj": 79.12594604492188, "geo/layer_7/stable_rank_down_proj": 139.65145874023438, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.43445050716400146, "geo/layer_7/attn_entropy_mean": 4.640224456787109, "geo/layer_7/attn_entropy_std": 0.7624709010124207, "geo/layer_14/stable_rank_q_proj": 50.09470748901367, "geo/layer_14/stable_rank_k_proj": 40.97803497314453, "geo/layer_14/stable_rank_o_proj": 43.367408752441406, "geo/layer_14/stable_rank_gate_proj": 71.50336456298828, "geo/layer_14/stable_rank_down_proj": 127.07521057128906, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39492732286453247, "geo/layer_14/attn_entropy_mean": 5.514211654663086, "geo/layer_14/attn_entropy_std": 0.3986739218235016, "geo/layer_21/stable_rank_q_proj": 39.939857482910156, "geo/layer_21/stable_rank_k_proj": 29.94231414794922, "geo/layer_21/stable_rank_o_proj": 69.3622817993164, "geo/layer_21/stable_rank_gate_proj": 64.58985900878906, "geo/layer_21/stable_rank_down_proj": 50.22179412841797, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13894522190093994, "geo/layer_21/attn_entropy_mean": 5.676630973815918, "geo/layer_21/attn_entropy_std": 0.3183138370513916, "geo/layer_27/stable_rank_q_proj": 43.6710090637207, "geo/layer_27/stable_rank_k_proj": 32.11497116088867, "geo/layer_27/stable_rank_o_proj": 115.0946044921875, "geo/layer_27/stable_rank_gate_proj": 78.5279541015625, "geo/layer_27/stable_rank_down_proj": 127.05189514160156, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09029242396354675, "geo/layer_27/attn_entropy_mean": 4.155483245849609, "geo/layer_27/attn_entropy_std": 0.7645928859710693, "attnres/final_alpha/block_0": 0.23770907521247864, "attnres/block_norm/0": 1.7716830968856812, "attnres/final_alpha/block_1": 0.004167765844613314, "attnres/block_norm/1": 47936.9140625, "attnres/final_alpha/block_2": 0.009946202859282494, "attnres/block_norm/2": 28809.072265625, "attnres/final_alpha/block_3": 0.011724606156349182, "attnres/block_norm/3": 60218.94140625, "attnres/final_alpha/block_4": 0.013975035399198532, "attnres/block_norm/4": 15580.203125, "attnres/final_alpha/block_5": 0.614261269569397, "attnres/block_norm/5": 6848.5849609375, "attnres/final_alpha/block_6": 0.10821601748466492, "attnres/block_norm/6": 40578.671875, "geo/tier1_time_s": 1.360334873199463, "geo/step": 69975.0, "geo/rankme_slope": 2.519626991421569e-05} {"step": 69980, "timestamp": 1778270177.2742338, "train/loss": 2.102170097827911, "train/z_loss": 0.0013727956684306264, "train/perplexity": 8.183910542646469, "train/grad_norm": 0.1806640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1695142.2608830654, "perf/iters_per_sec": 0.8083068184295013, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.23715398311615, "data/tokens_consumed": 146760794112, "data/tokens_consumed_B": 146.760794112, "train/loss_slope": -6.358109298783902e-06} {"step": 69990, "timestamp": 1778270187.6472049, "train/loss": 2.120279145240784, "train/z_loss": 0.0013759366003796459, "train/perplexity": 8.33346340968762, "train/grad_norm": 0.0908203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022740.1842794323, "perf/iters_per_sec": 0.9645176812550699, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0367876291275024, "data/tokens_consumed": 146781765632, "data/tokens_consumed_B": 146.781765632, "train/loss_slope": -7.132162534185555e-06} {"step": 70000, "timestamp": 1778270198.0141785, "grad/layer_0/attn": 0.0033980817534029484, "grad/layer_0/mlp": 0.0032031009905040264, "grad/layer_0/attn_mlp_ratio": 1.0608724662100004, "grad/layer_4/attn": 0.0025495325680822134, "grad/layer_4/mlp": 0.002582711633294821, "grad/layer_4/attn_mlp_ratio": 0.9871533610255915, "grad/layer_8/attn": 0.0031031605321913958, "grad/layer_8/mlp": 0.0036272075958549976, "grad/layer_8/attn_mlp_ratio": 0.8555232543583158, "grad/layer_12/attn": 0.006179793272167444, "grad/layer_12/mlp": 0.006321939639747143, "grad/layer_12/attn_mlp_ratio": 0.9775153713209257, "grad/layer_16/attn": 0.0034818428102880716, "grad/layer_16/mlp": 0.004788951016962528, "grad/layer_16/attn_mlp_ratio": 0.7270574965685782, "grad/layer_20/attn": 0.006876985542476177, "grad/layer_20/mlp": 0.006128544453531504, "grad/layer_20/attn_mlp_ratio": 1.1221237738270815, "grad/layer_24/attn": 0.009734267368912697, "grad/layer_24/mlp": 0.01028696820139885, "grad/layer_24/attn_mlp_ratio": 0.9462717375719925, "grad/layer_27/attn": 0.005790709983557463, "grad/layer_27/mlp": 0.009853130206465721, "grad/layer_27/attn_mlp_ratio": 0.5877025679603103} {"step": 70000, "timestamp": 1778270198.0300248, "train/loss": 2.1302362442016602, "train/z_loss": 0.0013785772724077105, "train/perplexity": 8.416855009772993, "train/grad_norm": 0.134765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021220.2494641042, "perf/iters_per_sec": 0.9637929198570748, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0375672817230224, "data/tokens_consumed": 146802737152, "data/tokens_consumed_B": 146.802737152, "train/loss_slope": -5.580963274397501e-06} {"step": 70000, "timestamp": 1778270204.9971268, "geo/ww_alpha_mean": 7.798000762701897, "geo/ww_alpha_std": 5.157970566125381, "geo/ww_alpha_min": 1.3547777292923309, "geo/ww_alpha_max": 33.04440692095971, "geo/ww_alpha_healthy_frac": 0.17258883248730963, "geo/ww_alpha_by_type/q_proj": 3.9226318941369205, "geo/ww_alpha_by_type/k_proj": 4.404715456833032, "geo/ww_alpha_by_type/v_proj": 8.668222561160475, "geo/ww_alpha_by_type/o_proj": 8.779831816563778, "geo/ww_alpha_by_type/gate_proj": 7.960833622525152, "geo/ww_alpha_by_type/up_proj": 12.87580551796272, "geo/ww_alpha_by_type/down_proj": 8.073512996883295, "geo/twonn_id/layer_0": 0.7089946866035461, "geo/twonn_id/layer_7": 3.33843994140625, "geo/twonn_id/layer_14": 4.638893127441406, "geo/twonn_id/layer_21": 7.52913761138916, "geo/twonn_id/layer_27": 6.241659164428711, "geo/tier2_time_s": 6.960520505905151} {"step": 70000, "timestamp": 1778270205.6108396, "eoc/jacobian_sigma/layer_0/attn": 1187.3353271484375, "eoc/jacobian_sigma/layer_0/mlp": 8480.25, "eoc/jacobian_sigma/layer_0": 8480.25, "eoc/jacobian_sigma/layer_7/attn": 1.153417944908142, "eoc/jacobian_sigma/layer_7/mlp": 1.713814616203308, "eoc/jacobian_sigma/layer_7": 1.713814616203308, "eoc/jacobian_sigma/layer_14/attn": 1.4536614418029785, "eoc/jacobian_sigma/layer_14/mlp": 7.706289768218994, "eoc/jacobian_sigma/layer_14": 7.706289768218994, "eoc/jacobian_sigma/layer_21/attn": 1.0903785228729248, "eoc/jacobian_sigma/layer_21/mlp": 4.183182716369629, "eoc/jacobian_sigma/layer_21": 4.183182716369629, "eoc/jacobian_sigma/layer_27/attn": 3.193453073501587, "eoc/jacobian_sigma/layer_27/mlp": 27.865230560302734, "eoc/jacobian_sigma/layer_27": 27.865230560302734, "eoc/layer0_sigma": 8480.25, "eoc/sigma_max": 27.865230560302734, "eoc/sigma_min": 1.713814616203308, "eoc/sigma_mean": 10.367129415273666, "eoc/time_s": 0.6055786609649658} {"step": 70010, "timestamp": 1778270216.001902, "train/loss": 2.1682056903839113, "train/z_loss": 0.0013604656676761806, "train/perplexity": 8.742583057890856, "train/grad_norm": 0.126953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1167243.2984253515, "perf/iters_per_sec": 0.5565849773527868, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.7966708421707154, "data/tokens_consumed": 146823708672, "data/tokens_consumed_B": 146.823708672, "train/loss_slope": -2.5449945230652642e-06} {"step": 70020, "timestamp": 1778270226.3848355, "train/loss": 2.1376101970672607, "train/z_loss": 0.0013780164066702128, "train/perplexity": 8.479149899407215, "train/grad_norm": 0.11328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021367.5367826517, "perf/iters_per_sec": 0.9638631519234904, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0374916791915894, "data/tokens_consumed": 146844680192, "data/tokens_consumed_B": 146.844680192, "train/loss_slope": -2.68777197200597e-07} {"step": 70030, "timestamp": 1778270236.765361, "train/loss": 2.170539712905884, "train/z_loss": 0.0013568816124461592, "train/perplexity": 8.763012275500905, "train/grad_norm": 0.1591796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021391.0880226996, "perf/iters_per_sec": 0.963874382029867, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0374795913696289, "data/tokens_consumed": 146865651712, "data/tokens_consumed_B": 146.865651712, "train/loss_slope": -1.1225512247345704e-06} {"step": 70040, "timestamp": 1778270247.1438296, "train/loss": 2.1998682379722596, "train/z_loss": 0.0013524145470000803, "train/perplexity": 9.023824423694286, "train/grad_norm": 0.201171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021927.6660976703, "perf/iters_per_sec": 0.9641302423942901, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0372042655944824, "data/tokens_consumed": 146886623232, "data/tokens_consumed_B": 146.886623232, "train/loss_slope": 9.136059103232744e-07} {"step": 70050, "timestamp": 1778270257.5196319, "grad/layer_0/attn": 0.003125499002635479, "grad/layer_0/mlp": 0.003185597248375416, "grad/layer_0/attn_mlp_ratio": 0.9811343559252441, "grad/layer_4/attn": 0.0026832253206521273, "grad/layer_4/mlp": 0.0027920876163989305, "grad/layer_4/attn_mlp_ratio": 0.9610103955160808, "grad/layer_8/attn": 0.007745536509901285, "grad/layer_8/mlp": 0.0037910621613264084, "grad/layer_8/attn_mlp_ratio": 2.0431045379854282, "grad/layer_12/attn": 0.005158124957233667, "grad/layer_12/mlp": 0.006753700319677591, "grad/layer_12/attn_mlp_ratio": 0.7637479658121861, "grad/layer_16/attn": 0.0033260551281273365, "grad/layer_16/mlp": 0.00475171348080039, "grad/layer_16/attn_mlp_ratio": 0.6999696155017593, "grad/layer_20/attn": 0.006694393698126078, "grad/layer_20/mlp": 0.005860228557139635, "grad/layer_20/attn_mlp_ratio": 1.1423434288643606, "grad/layer_24/attn": 0.006368202622979879, "grad/layer_24/mlp": 0.009409523569047451, "grad/layer_24/attn_mlp_ratio": 0.6767826775257528, "grad/layer_27/attn": 0.004200406838208437, "grad/layer_27/mlp": 0.00771620636805892, "grad/layer_27/attn_mlp_ratio": 0.544361643975694} {"step": 70050, "timestamp": 1778270258.1457057, "eos/sharpness": 29.709529876708977, "eos/L0_probe": 1.962856411933899, "eos/L_plus": 2.111952781677246, "eos/L_minus": 2.1108553409576416, "eos/grad_norm": 0.13087360560894012, "eos/embed_grad_frac": 0.14722195267677307, "eos/time_s": 0.6232104301452637} {"step": 70050, "timestamp": 1778270258.1646707, "train/loss": 2.2295493125915526, "train/z_loss": 0.001353717886377126, "train/perplexity": 9.295675691289842, "train/grad_norm": 0.130859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1904166.7065618134, "perf/iters_per_sec": 0.9079774411019389, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1013489484786987, "data/tokens_consumed": 146907594752, "data/tokens_consumed_B": 146.907594752, "train/loss_slope": 5.860926308790728e-06} {"step": 70050, "timestamp": 1778270259.5276296, "geo/rankme_last": 438.4121398925781, "geo/layer_0/stable_rank_q_proj": 19.23174285888672, "geo/layer_0/stable_rank_k_proj": 15.976981163024902, "geo/layer_0/stable_rank_o_proj": 46.83830261230469, "geo/layer_0/stable_rank_gate_proj": 129.83067321777344, "geo/layer_0/stable_rank_down_proj": 55.86101150512695, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06527914851903915, "geo/layer_0/attn_entropy_mean": 6.156364440917969, "geo/layer_0/attn_entropy_std": 0.4241718053817749, "geo/layer_7/stable_rank_q_proj": 43.010398864746094, "geo/layer_7/stable_rank_k_proj": 40.30644607543945, "geo/layer_7/stable_rank_o_proj": 89.39985656738281, "geo/layer_7/stable_rank_gate_proj": 79.10138702392578, "geo/layer_7/stable_rank_down_proj": 139.70379638671875, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4328528046607971, "geo/layer_7/attn_entropy_mean": 4.651805877685547, "geo/layer_7/attn_entropy_std": 0.7999718189239502, "geo/layer_14/stable_rank_q_proj": 50.10533905029297, "geo/layer_14/stable_rank_k_proj": 41.01056671142578, "geo/layer_14/stable_rank_o_proj": 43.381385803222656, "geo/layer_14/stable_rank_gate_proj": 71.37611389160156, "geo/layer_14/stable_rank_down_proj": 127.25184631347656, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3913152515888214, "geo/layer_14/attn_entropy_mean": 5.51625919342041, "geo/layer_14/attn_entropy_std": 0.4226069152355194, "geo/layer_21/stable_rank_q_proj": 39.93485641479492, "geo/layer_21/stable_rank_k_proj": 30.054540634155273, "geo/layer_21/stable_rank_o_proj": 69.36178588867188, "geo/layer_21/stable_rank_gate_proj": 64.6351547241211, "geo/layer_21/stable_rank_down_proj": 50.251182556152344, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14152954518795013, "geo/layer_21/attn_entropy_mean": 5.684828758239746, "geo/layer_21/attn_entropy_std": 0.30477359890937805, "geo/layer_27/stable_rank_q_proj": 43.61148452758789, "geo/layer_27/stable_rank_k_proj": 32.14165496826172, "geo/layer_27/stable_rank_o_proj": 115.0904541015625, "geo/layer_27/stable_rank_gate_proj": 78.52141571044922, "geo/layer_27/stable_rank_down_proj": 127.35833740234375, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09940247237682343, "geo/layer_27/attn_entropy_mean": 4.174369812011719, "geo/layer_27/attn_entropy_std": 0.7593085765838623, "attnres/final_alpha/block_0": 0.23846043646335602, "attnres/block_norm/0": 1.771512508392334, "attnres/final_alpha/block_1": 0.004228661768138409, "attnres/block_norm/1": 47762.36328125, "attnres/final_alpha/block_2": 0.010139919817447662, "attnres/block_norm/2": 28900.474609375, "attnres/final_alpha/block_3": 0.012132106348872185, "attnres/block_norm/3": 60058.7890625, "attnres/final_alpha/block_4": 0.014099366962909698, "attnres/block_norm/4": 15586.2265625, "attnres/final_alpha/block_5": 0.6137809753417969, "attnres/block_norm/5": 6782.91650390625, "attnres/final_alpha/block_6": 0.10715853422880173, "attnres/block_norm/6": 40335.4921875, "geo/tier1_time_s": 1.3587977886199951, "geo/step": 70050.0, "geo/rankme_slope": 3.678158763505402e-05} {"step": 70060, "timestamp": 1778270270.452081, "train/loss": 2.1732257604599, "train/z_loss": 0.0013615781208500266, "train/perplexity": 8.786581783428012, "train/grad_norm": 0.154296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1707279.8727356265, "perf/iters_per_sec": 0.8140944827726491, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2283586502075194, "data/tokens_consumed": 146928566272, "data/tokens_consumed_B": 146.928566272, "train/loss_slope": 9.470319626319187e-06} {"step": 70070, "timestamp": 1778270280.828518, "train/loss": 2.1788464546203614, "train/z_loss": 0.0013663613353855908, "train/perplexity": 8.836107526490414, "train/grad_norm": 0.0986328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022312.5253710058, "perf/iters_per_sec": 0.9643137575964955, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0370068788528441, "data/tokens_consumed": 146949537792, "data/tokens_consumed_B": 146.949537792, "train/loss_slope": 1.5610994185336976e-05} {"step": 70080, "timestamp": 1778270291.2037315, "train/loss": 2.153006744384766, "train/z_loss": 0.001372468937188387, "train/perplexity": 8.610709716636668, "train/grad_norm": 0.09326171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022590.231946291, "perf/iters_per_sec": 0.9644461784106688, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0368644952774049, "data/tokens_consumed": 146970509312, "data/tokens_consumed_B": 146.970509312, "train/loss_slope": 1.7514182188616087e-05} {"step": 70090, "timestamp": 1778270301.5849485, "train/loss": 2.1760963439941405, "train/z_loss": 0.0013754322892054916, "train/perplexity": 8.811840636897275, "train/grad_norm": 0.11962890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021118.215302557, "perf/iters_per_sec": 0.963744266177443, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037619662284851, "data/tokens_consumed": 146991480832, "data/tokens_consumed_B": 146.991480832, "train/loss_slope": 2.11019657554478e-05} {"step": 70100, "timestamp": 1778270311.9524462, "grad/layer_0/attn": 0.0026806932874023914, "grad/layer_0/mlp": 0.0030331918969750404, "grad/layer_0/attn_mlp_ratio": 0.8837862192949899, "grad/layer_4/attn": 0.0021944043692201376, "grad/layer_4/mlp": 0.0025744857266545296, "grad/layer_4/attn_mlp_ratio": 0.8523660711201907, "grad/layer_8/attn": 0.004754764959216118, "grad/layer_8/mlp": 0.0035107748117297888, "grad/layer_8/attn_mlp_ratio": 1.3543348915164157, "grad/layer_12/attn": 0.006381864659488201, "grad/layer_12/mlp": 0.0070545608177781105, "grad/layer_12/attn_mlp_ratio": 0.9046437806505213, "grad/layer_16/attn": 0.004501173738390207, "grad/layer_16/mlp": 0.004638720769435167, "grad/layer_16/attn_mlp_ratio": 0.970348047464708, "grad/layer_20/attn": 0.003983586095273495, "grad/layer_20/mlp": 0.006235094740986824, "grad/layer_20/attn_mlp_ratio": 0.6388974340994973, "grad/layer_24/attn": 0.010531091131269932, "grad/layer_24/mlp": 0.010698525235056877, "grad/layer_24/attn_mlp_ratio": 0.9843497866721597, "grad/layer_27/attn": 0.010925013571977615, "grad/layer_27/mlp": 0.011189017444849014, "grad/layer_27/attn_mlp_ratio": 0.9764050800873991} {"step": 70100, "timestamp": 1778270311.9680018, "train/loss": 2.1423147916793823, "train/z_loss": 0.0013862169347703458, "train/perplexity": 8.519134845069729, "train/grad_norm": 0.2314453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021224.1508381628, "perf/iters_per_sec": 0.963794780177194, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037565279006958, "data/tokens_consumed": 147012452352, "data/tokens_consumed_B": 147.012452352, "train/loss_slope": 2.309467864043717e-05} {"step": 70110, "timestamp": 1778270322.346637, "train/loss": 2.130858898162842, "train/z_loss": 0.0013911661226302385, "train/perplexity": 8.42209742982295, "train/grad_norm": 0.1953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022020.7645286322, "perf/iters_per_sec": 0.9641746351855431, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0371565103530884, "data/tokens_consumed": 147033423872, "data/tokens_consumed_B": 147.033423872, "train/loss_slope": 2.3208508683223858e-05} {"step": 70120, "timestamp": 1778270332.730527, "train/loss": 2.1609237432479858, "train/z_loss": 0.0013819097192026674, "train/perplexity": 8.67915126401499, "train/grad_norm": 0.1201171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020475.6879480428, "perf/iters_per_sec": 0.9634378852596487, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0379496335983276, "data/tokens_consumed": 147054395392, "data/tokens_consumed_B": 147.054395392, "train/loss_slope": 2.1411964351838353e-05} {"step": 70125, "timestamp": 1778270338.53149, "eos/sharpness": 26.118350028991696, "eos/L0_probe": 1.9669984579086304, "eos/L_plus": 2.1140618324279785, "eos/L_minus": 2.081118583679199, "eos/grad_norm": 0.11379266530275345, "eos/embed_grad_frac": 0.1902768462896347, "eos/time_s": 0.6159627437591553} {"step": 70125, "timestamp": 1778270339.910911, "geo/rankme_last": 438.9794921875, "geo/layer_0/stable_rank_q_proj": 19.223819732666016, "geo/layer_0/stable_rank_k_proj": 15.97400188446045, "geo/layer_0/stable_rank_o_proj": 46.867618560791016, "geo/layer_0/stable_rank_gate_proj": 129.7342071533203, "geo/layer_0/stable_rank_down_proj": 55.95466232299805, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.061540018767118454, "geo/layer_0/attn_entropy_mean": 6.159939765930176, "geo/layer_0/attn_entropy_std": 0.4242306649684906, "geo/layer_7/stable_rank_q_proj": 42.977535247802734, "geo/layer_7/stable_rank_k_proj": 40.325687408447266, "geo/layer_7/stable_rank_o_proj": 89.49053955078125, "geo/layer_7/stable_rank_gate_proj": 79.07320404052734, "geo/layer_7/stable_rank_down_proj": 139.80587768554688, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4470840096473694, "geo/layer_7/attn_entropy_mean": 4.622202396392822, "geo/layer_7/attn_entropy_std": 0.8090234398841858, "geo/layer_14/stable_rank_q_proj": 50.108158111572266, "geo/layer_14/stable_rank_k_proj": 41.0115852355957, "geo/layer_14/stable_rank_o_proj": 43.36482238769531, "geo/layer_14/stable_rank_gate_proj": 71.25460815429688, "geo/layer_14/stable_rank_down_proj": 126.95600891113281, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.386838436126709, "geo/layer_14/attn_entropy_mean": 5.508652687072754, "geo/layer_14/attn_entropy_std": 0.42164888978004456, "geo/layer_21/stable_rank_q_proj": 39.94578170776367, "geo/layer_21/stable_rank_k_proj": 30.049062728881836, "geo/layer_21/stable_rank_o_proj": 69.4294204711914, "geo/layer_21/stable_rank_gate_proj": 64.79109954833984, "geo/layer_21/stable_rank_down_proj": 50.222774505615234, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13848921656608582, "geo/layer_21/attn_entropy_mean": 5.657498836517334, "geo/layer_21/attn_entropy_std": 0.30089232325553894, "geo/layer_27/stable_rank_q_proj": 43.707176208496094, "geo/layer_27/stable_rank_k_proj": 32.106658935546875, "geo/layer_27/stable_rank_o_proj": 114.9903335571289, "geo/layer_27/stable_rank_gate_proj": 78.53093719482422, "geo/layer_27/stable_rank_down_proj": 127.68004608154297, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10034658759832382, "geo/layer_27/attn_entropy_mean": 4.157540321350098, "geo/layer_27/attn_entropy_std": 0.7510411143302917, "attnres/final_alpha/block_0": 0.23740863800048828, "attnres/block_norm/0": 1.7717827558517456, "attnres/final_alpha/block_1": 0.0040727839805185795, "attnres/block_norm/1": 48044.9296875, "attnres/final_alpha/block_2": 0.009845145046710968, "attnres/block_norm/2": 28828.599609375, "attnres/final_alpha/block_3": 0.011798982508480549, "attnres/block_norm/3": 60328.546875, "attnres/final_alpha/block_4": 0.013985195197165012, "attnres/block_norm/4": 15669.4345703125, "attnres/final_alpha/block_5": 0.616266131401062, "attnres/block_norm/5": 6814.0205078125, "attnres/final_alpha/block_6": 0.10662311315536499, "attnres/block_norm/6": 40535.85546875, "geo/tier1_time_s": 1.357767105102539, "geo/step": 70125.0, "geo/rankme_slope": 6.50577418467387e-05} {"step": 70130, "timestamp": 1778270345.0991964, "train/loss": 2.1624056458473206, "train/z_loss": 0.0013715476146899165, "train/perplexity": 8.692022455403677, "train/grad_norm": 0.2119140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1696445.9971478176, "perf/iters_per_sec": 0.8089284883250321, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2362032175064086, "data/tokens_consumed": 147075366912, "data/tokens_consumed_B": 147.075366912, "train/loss_slope": 2.1694912518462115e-05} {"step": 70140, "timestamp": 1778270355.4808965, "train/loss": 2.1348634481430055, "train/z_loss": 0.0013718661502934992, "train/perplexity": 8.455891760299542, "train/grad_norm": 0.126953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020924.9034471752, "perf/iters_per_sec": 0.9636520879016758, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037718915939331, "data/tokens_consumed": 147096338432, "data/tokens_consumed_B": 147.096338432, "train/loss_slope": 2.1740480224684947e-05} {"step": 70150, "timestamp": 1778270365.849102, "grad/layer_0/attn": 0.0024868277832865715, "grad/layer_0/mlp": 0.0029223517049103975, "grad/layer_0/attn_mlp_ratio": 0.8509679700808029, "grad/layer_4/attn": 0.0021548550575971603, "grad/layer_4/mlp": 0.0024783918634057045, "grad/layer_4/attn_mlp_ratio": 0.8694569258673865, "grad/layer_8/attn": 0.004748882260173559, "grad/layer_8/mlp": 0.003618958406150341, "grad/layer_8/attn_mlp_ratio": 1.312223462110145, "grad/layer_12/attn": 0.00455059576779604, "grad/layer_12/mlp": 0.00645779725164175, "grad/layer_12/attn_mlp_ratio": 0.7046668577543941, "grad/layer_16/attn": 0.0031740344129502773, "grad/layer_16/mlp": 0.0045127710327506065, "grad/layer_16/attn_mlp_ratio": 0.7033448671738096, "grad/layer_20/attn": 0.004843557719141245, "grad/layer_20/mlp": 0.006026201415807009, "grad/layer_20/attn_mlp_ratio": 0.8037497097361193, "grad/layer_24/attn": 0.01699868030846119, "grad/layer_24/mlp": 0.012053720653057098, "grad/layer_24/attn_mlp_ratio": 1.4102434141880993, "grad/layer_27/attn": 0.00429091090336442, "grad/layer_27/mlp": 0.012096035294234753, "grad/layer_27/attn_mlp_ratio": 0.35473696657745946} {"step": 70150, "timestamp": 1778270365.864831, "train/loss": 2.15005099773407, "train/z_loss": 0.0013662316370755434, "train/perplexity": 8.585296216667267, "train/grad_norm": 0.224609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020411.782534694, "perf/iters_per_sec": 0.9634074127839537, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03798246383667, "data/tokens_consumed": 147117309952, "data/tokens_consumed_B": 147.117309952, "train/loss_slope": 2.0392025736692363e-05} {"step": 70160, "timestamp": 1778270376.2399392, "train/loss": 2.141558837890625, "train/z_loss": 0.001364846364594996, "train/perplexity": 8.512697206391914, "train/grad_norm": 0.234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022779.4899798047, "perf/iters_per_sec": 0.9645364236735366, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0367674827575684, "data/tokens_consumed": 147138281472, "data/tokens_consumed_B": 147.138281472, "train/loss_slope": 1.96886581091276e-05} {"step": 70170, "timestamp": 1778270386.616804, "train/loss": 2.128276324272156, "train/z_loss": 0.0013692670851014554, "train/perplexity": 8.400374803113758, "train/grad_norm": 0.11962890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021959.364151536, "perf/iters_per_sec": 0.964145357204216, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0371880054473877, "data/tokens_consumed": 147159252992, "data/tokens_consumed_B": 147.159252992, "train/loss_slope": 2.032617037624635e-05} {"step": 70180, "timestamp": 1778270397.001477, "train/loss": 2.189287543296814, "train/z_loss": 0.0013686869526281953, "train/perplexity": 8.928849429415086, "train/grad_norm": 0.09619140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020814.7280566045, "perf/iters_per_sec": 0.9635995521815321, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037775492668152, "data/tokens_consumed": 147180224512, "data/tokens_consumed_B": 147.180224512, "train/loss_slope": 2.1664705671349952e-05} {"step": 70190, "timestamp": 1778270407.3775098, "train/loss": 2.1372117042541503, "train/z_loss": 0.0013774847961030901, "train/perplexity": 8.475771692251355, "train/grad_norm": 0.3671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022511.6832862974, "perf/iters_per_sec": 0.9644087234908568, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036904764175415, "data/tokens_consumed": 147201196032, "data/tokens_consumed_B": 147.201196032, "train/loss_slope": 2.1469655496166404e-05} {"step": 70200, "timestamp": 1778270418.24332, "grad/layer_0/attn": 0.002685259096324444, "grad/layer_0/mlp": 0.0031810852233320475, "grad/layer_0/attn_mlp_ratio": 0.8441329997121085, "grad/layer_4/attn": 0.0027280000504106283, "grad/layer_4/mlp": 0.002574823098257184, "grad/layer_4/attn_mlp_ratio": 1.059490240827846, "grad/layer_8/attn": 0.0034261744003742933, "grad/layer_8/mlp": 0.0035679908469319344, "grad/layer_8/attn_mlp_ratio": 0.9602531091959224, "grad/layer_12/attn": 0.006709035951644182, "grad/layer_12/mlp": 0.007065834943205118, "grad/layer_12/attn_mlp_ratio": 0.9495036199714212, "grad/layer_16/attn": 0.004164835903793573, "grad/layer_16/mlp": 0.00456983782351017, "grad/layer_16/attn_mlp_ratio": 0.9113749707329867, "grad/layer_20/attn": 0.004755121655762196, "grad/layer_20/mlp": 0.006406365893781185, "grad/layer_20/attn_mlp_ratio": 0.7422494531811182, "grad/layer_24/attn": 0.007961203344166279, "grad/layer_24/mlp": 0.00799572840332985, "grad/layer_24/attn_mlp_ratio": 0.9956820495907044, "grad/layer_27/attn": 0.0036976321134716272, "grad/layer_27/mlp": 0.0068143196403980255, "grad/layer_27/attn_mlp_ratio": 0.5426267410891477} {"step": 70200, "timestamp": 1778270418.859977, "eos/sharpness": 9.366869926452635, "eos/L0_probe": 1.9660367965698242, "eos/L_plus": 2.0175118446350098, "eos/L_minus": 2.008230447769165, "eos/grad_norm": 0.09197070449590683, "eos/embed_grad_frac": 0.2518603503704071, "eos/time_s": 0.61360764503479} {"step": 70200, "timestamp": 1778270418.881702, "train/loss": 2.1733006477355956, "train/z_loss": 0.0013780803652480244, "train/perplexity": 8.787239811239099, "train/grad_norm": 0.09228515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1823793.7802464115, "perf/iters_per_sec": 0.8696526433212335, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1498843908309937, "data/tokens_consumed": 147222167552, "data/tokens_consumed_B": 147.222167552, "train/loss_slope": 2.2570639918453536e-05} {"step": 70200, "timestamp": 1778270420.2466836, "geo/rankme_last": 438.7710266113281, "geo/layer_0/stable_rank_q_proj": 19.231544494628906, "geo/layer_0/stable_rank_k_proj": 15.989301681518555, "geo/layer_0/stable_rank_o_proj": 46.86663818359375, "geo/layer_0/stable_rank_gate_proj": 129.6943817138672, "geo/layer_0/stable_rank_down_proj": 56.0184326171875, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06579195708036423, "geo/layer_0/attn_entropy_mean": 6.158478736877441, "geo/layer_0/attn_entropy_std": 0.42203158140182495, "geo/layer_7/stable_rank_q_proj": 42.99894714355469, "geo/layer_7/stable_rank_k_proj": 40.25322341918945, "geo/layer_7/stable_rank_o_proj": 89.4937744140625, "geo/layer_7/stable_rank_gate_proj": 79.03510284423828, "geo/layer_7/stable_rank_down_proj": 140.0110321044922, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4535296559333801, "geo/layer_7/attn_entropy_mean": 4.654748916625977, "geo/layer_7/attn_entropy_std": 0.8023837804794312, "geo/layer_14/stable_rank_q_proj": 50.05850601196289, "geo/layer_14/stable_rank_k_proj": 41.040470123291016, "geo/layer_14/stable_rank_o_proj": 43.3698844909668, "geo/layer_14/stable_rank_gate_proj": 71.23866271972656, "geo/layer_14/stable_rank_down_proj": 127.21614837646484, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39906102418899536, "geo/layer_14/attn_entropy_mean": 5.543320178985596, "geo/layer_14/attn_entropy_std": 0.4298751652240753, "geo/layer_21/stable_rank_q_proj": 39.874942779541016, "geo/layer_21/stable_rank_k_proj": 30.125286102294922, "geo/layer_21/stable_rank_o_proj": 69.51581573486328, "geo/layer_21/stable_rank_gate_proj": 64.71243286132812, "geo/layer_21/stable_rank_down_proj": 50.15552520751953, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14584332704544067, "geo/layer_21/attn_entropy_mean": 5.681031227111816, "geo/layer_21/attn_entropy_std": 0.30258187651634216, "geo/layer_27/stable_rank_q_proj": 43.639041900634766, "geo/layer_27/stable_rank_k_proj": 32.053340911865234, "geo/layer_27/stable_rank_o_proj": 115.02351379394531, "geo/layer_27/stable_rank_gate_proj": 78.44679260253906, "geo/layer_27/stable_rank_down_proj": 127.83483123779297, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.11174505203962326, "geo/layer_27/attn_entropy_mean": 4.18204402923584, "geo/layer_27/attn_entropy_std": 0.7564738988876343, "attnres/final_alpha/block_0": 0.23681460320949554, "attnres/block_norm/0": 1.771941900253296, "attnres/final_alpha/block_1": 0.00409206748008728, "attnres/block_norm/1": 48057.4375, "attnres/final_alpha/block_2": 0.009910334832966328, "attnres/block_norm/2": 28813.849609375, "attnres/final_alpha/block_3": 0.011720548383891582, "attnres/block_norm/3": 60575.1484375, "attnres/final_alpha/block_4": 0.013679488562047482, "attnres/block_norm/4": 15632.7421875, "attnres/final_alpha/block_5": 0.6171314716339111, "attnres/block_norm/5": 6775.6787109375, "attnres/final_alpha/block_6": 0.10665145516395569, "attnres/block_norm/6": 40277.65625, "geo/tier1_time_s": 1.3613030910491943, "geo/step": 70200.0, "geo/rankme_slope": 0.00010294850362019808} {"step": 70210, "timestamp": 1778270430.6294053, "train/loss": 2.147351336479187, "train/z_loss": 0.0013759510358795524, "train/perplexity": 8.562150082527953, "train/grad_norm": 0.107421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1785649.538925845, "perf/iters_per_sec": 0.8514640516881203, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1744477033615113, "data/tokens_consumed": 147243139072, "data/tokens_consumed_B": 147.243139072, "train/loss_slope": 2.3793761529187153e-05} {"step": 70220, "timestamp": 1778270441.0126238, "train/loss": 2.160459542274475, "train/z_loss": 0.001366020052228123, "train/perplexity": 8.675123328507109, "train/grad_norm": 0.197265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021182.7691735572, "perf/iters_per_sec": 0.9637750478618418, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037586522102356, "data/tokens_consumed": 147264110592, "data/tokens_consumed_B": 147.264110592, "train/loss_slope": 2.6982251459723168e-05} {"step": 70230, "timestamp": 1778270451.391518, "train/loss": 2.1565990924835203, "train/z_loss": 0.001368061127141118, "train/perplexity": 8.641698010359946, "train/grad_norm": 0.2158203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021470.6646153415, "perf/iters_per_sec": 0.9639123271061618, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0374387502670288, "data/tokens_consumed": 147285082112, "data/tokens_consumed_B": 147.285082112, "train/loss_slope": 2.738426301059443e-05} {"step": 70240, "timestamp": 1778270462.1962452, "train/loss": 2.1250740766525267, "train/z_loss": 0.0013735397369600832, "train/perplexity": 8.373517747318212, "train/grad_norm": 0.1611328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1942166.57876844, "perf/iters_per_sec": 0.9260971921770287, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.079800271987915, "data/tokens_consumed": 147306053632, "data/tokens_consumed_B": 147.306053632, "train/loss_slope": 2.624703318443951e-05} {"step": 70250, "timestamp": 1778270472.5711296, "grad/layer_0/attn": 0.0023026454728096724, "grad/layer_0/mlp": 0.002638373291119933, "grad/layer_0/attn_mlp_ratio": 0.8727519313830893, "grad/layer_4/attn": 0.002117455005645752, "grad/layer_4/mlp": 0.002480583731085062, "grad/layer_4/attn_mlp_ratio": 0.8536115486649489, "grad/layer_8/attn": 0.006214975845068693, "grad/layer_8/mlp": 0.0035124190617352724, "grad/layer_8/attn_mlp_ratio": 1.769428863381514, "grad/layer_12/attn": 0.004049078561365604, "grad/layer_12/mlp": 0.00633595883846283, "grad/layer_12/attn_mlp_ratio": 0.6390632579364465, "grad/layer_16/attn": 0.003359501250088215, "grad/layer_16/mlp": 0.0043342881835997105, "grad/layer_16/attn_mlp_ratio": 0.7750987083162092, "grad/layer_20/attn": 0.004235957283526659, "grad/layer_20/mlp": 0.005828781519085169, "grad/layer_20/attn_mlp_ratio": 0.7267311696250331, "grad/layer_24/attn": 0.006794658023864031, "grad/layer_24/mlp": 0.007595098577439785, "grad/layer_24/attn_mlp_ratio": 0.8946108947928018, "grad/layer_27/attn": 0.004761481657624245, "grad/layer_27/mlp": 0.006257359869778156, "grad/layer_27/attn_mlp_ratio": 0.76094098479571} {"step": 70250, "timestamp": 1778270472.5868912, "train/loss": 2.1411629199981688, "train/z_loss": 0.0013861342216841877, "train/perplexity": 8.509327544353603, "train/grad_norm": 0.07958984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019833.4762291468, "perf/iters_per_sec": 0.9631316548486456, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03827965259552, "data/tokens_consumed": 147327025152, "data/tokens_consumed_B": 147.327025152, "train/loss_slope": 2.3421806091665668e-05} {"step": 70260, "timestamp": 1778270483.4592094, "train/loss": 2.107301187515259, "train/z_loss": 0.0013823027489706873, "train/perplexity": 8.226010839465143, "train/grad_norm": 0.1982421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1929875.1055804505, "perf/iters_per_sec": 0.9202361610319378, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.086677575111389, "data/tokens_consumed": 147347996672, "data/tokens_consumed_B": 147.347996672, "train/loss_slope": 2.301697290376468e-05} {"step": 70270, "timestamp": 1778270493.8448856, "train/loss": 2.1118401288986206, "train/z_loss": 0.0013707680511288345, "train/perplexity": 8.26343308492596, "train/grad_norm": 0.09375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020631.2683519493, "perf/iters_per_sec": 0.9635120717773196, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0378697156906127, "data/tokens_consumed": 147368968192, "data/tokens_consumed_B": 147.368968192, "train/loss_slope": 2.311982719382474e-05} {"step": 70275, "timestamp": 1778270499.6381075, "eos/sharpness": 38.617157936096184, "eos/L0_probe": 1.9674935340881348, "eos/L_plus": 2.171773672103882, "eos/L_minus": 2.1493849754333496, "eos/grad_norm": 0.10890784859657288, "eos/embed_grad_frac": 0.22518113255500793, "eos/time_s": 0.614426851272583} {"step": 70275, "timestamp": 1778270501.0134115, "geo/rankme_last": 438.70867919921875, "geo/layer_0/stable_rank_q_proj": 19.243183135986328, "geo/layer_0/stable_rank_k_proj": 15.984601974487305, "geo/layer_0/stable_rank_o_proj": 46.80143356323242, "geo/layer_0/stable_rank_gate_proj": 129.92552185058594, "geo/layer_0/stable_rank_down_proj": 56.00710678100586, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06918233633041382, "geo/layer_0/attn_entropy_mean": 6.157421588897705, "geo/layer_0/attn_entropy_std": 0.42152249813079834, "geo/layer_7/stable_rank_q_proj": 42.989566802978516, "geo/layer_7/stable_rank_k_proj": 40.30337905883789, "geo/layer_7/stable_rank_o_proj": 89.46025085449219, "geo/layer_7/stable_rank_gate_proj": 78.96500396728516, "geo/layer_7/stable_rank_down_proj": 139.8015594482422, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.44585949182510376, "geo/layer_7/attn_entropy_mean": 4.617654800415039, "geo/layer_7/attn_entropy_std": 0.8068561553955078, "geo/layer_14/stable_rank_q_proj": 49.98141098022461, "geo/layer_14/stable_rank_k_proj": 40.92485046386719, "geo/layer_14/stable_rank_o_proj": 43.424896240234375, "geo/layer_14/stable_rank_gate_proj": 71.19590759277344, "geo/layer_14/stable_rank_down_proj": 127.35821533203125, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38223177194595337, "geo/layer_14/attn_entropy_mean": 5.543338775634766, "geo/layer_14/attn_entropy_std": 0.41404661536216736, "geo/layer_21/stable_rank_q_proj": 39.83698272705078, "geo/layer_21/stable_rank_k_proj": 30.181114196777344, "geo/layer_21/stable_rank_o_proj": 69.40245819091797, "geo/layer_21/stable_rank_gate_proj": 64.71088409423828, "geo/layer_21/stable_rank_down_proj": 50.13871765136719, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1361074298620224, "geo/layer_21/attn_entropy_mean": 5.678929328918457, "geo/layer_21/attn_entropy_std": 0.31159308552742004, "geo/layer_27/stable_rank_q_proj": 43.63813781738281, "geo/layer_27/stable_rank_k_proj": 32.166751861572266, "geo/layer_27/stable_rank_o_proj": 115.29461669921875, "geo/layer_27/stable_rank_gate_proj": 78.41255187988281, "geo/layer_27/stable_rank_down_proj": 127.85245513916016, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09543057531118393, "geo/layer_27/attn_entropy_mean": 4.170526027679443, "geo/layer_27/attn_entropy_std": 0.767846941947937, "attnres/final_alpha/block_0": 0.23842470347881317, "attnres/block_norm/0": 1.7718230485916138, "attnres/final_alpha/block_1": 0.004099966958165169, "attnres/block_norm/1": 48165.5703125, "attnres/final_alpha/block_2": 0.010034294798970222, "attnres/block_norm/2": 28917.076171875, "attnres/final_alpha/block_3": 0.01189478486776352, "attnres/block_norm/3": 60636.734375, "attnres/final_alpha/block_4": 0.013843279331922531, "attnres/block_norm/4": 15578.8681640625, "attnres/final_alpha/block_5": 0.6139137744903564, "attnres/block_norm/5": 6777.689453125, "attnres/final_alpha/block_6": 0.10778923332691193, "attnres/block_norm/6": 40125.5625, "geo/tier1_time_s": 1.3558540344238281, "geo/step": 70275.0, "geo/rankme_slope": 8.966524109643858e-05} {"step": 70280, "timestamp": 1778270506.2073617, "train/loss": 2.168085980415344, "train/z_loss": 0.0013699669623747468, "train/perplexity": 8.74153654618799, "train/grad_norm": 0.26953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1697292.710181362, "perf/iters_per_sec": 0.8093322325617609, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2355865240097046, "data/tokens_consumed": 147389939712, "data/tokens_consumed_B": 147.389939712, "train/loss_slope": 2.3989378193495625e-05} {"step": 70290, "timestamp": 1778270516.608015, "train/loss": 2.1026267290115355, "train/z_loss": 0.0013829118222929537, "train/perplexity": 8.187648424756048, "train/grad_norm": 0.09619140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2017666.8357155093, "perf/iters_per_sec": 0.9620985201432749, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0393945932388307, "data/tokens_consumed": 147410911232, "data/tokens_consumed_B": 147.410911232, "train/loss_slope": 2.118553541602938e-05} {"step": 70300, "timestamp": 1778270527.5355213, "grad/layer_0/attn": 0.002870363648980856, "grad/layer_0/mlp": 0.0029912523459643126, "grad/layer_0/attn_mlp_ratio": 0.9595858928100317, "grad/layer_4/attn": 0.003842270001769066, "grad/layer_4/mlp": 0.0026254712138324976, "grad/layer_4/attn_mlp_ratio": 1.4634591440880618, "grad/layer_8/attn": 0.004074236378073692, "grad/layer_8/mlp": 0.0037627762649208307, "grad/layer_8/attn_mlp_ratio": 1.0827739899868896, "grad/layer_12/attn": 0.005347603000700474, "grad/layer_12/mlp": 0.007570971269160509, "grad/layer_12/attn_mlp_ratio": 0.7063298406441381, "grad/layer_16/attn": 0.0033807111904025078, "grad/layer_16/mlp": 0.004733042325824499, "grad/layer_16/attn_mlp_ratio": 0.7142786576255095, "grad/layer_20/attn": 0.005467855371534824, "grad/layer_20/mlp": 0.006241961382329464, "grad/layer_20/attn_mlp_ratio": 0.875983516882301, "grad/layer_24/attn": 0.014094130136072636, "grad/layer_24/mlp": 0.010675046592950821, "grad/layer_24/attn_mlp_ratio": 1.3202874461785332, "grad/layer_27/attn": 0.011539794504642487, "grad/layer_27/mlp": 0.010654870420694351, "grad/layer_27/attn_mlp_ratio": 1.0830534713893893} {"step": 70300, "timestamp": 1778270527.551659, "train/loss": 2.1595842361450197, "train/z_loss": 0.0013808102812618017, "train/perplexity": 8.667533262184275, "train/grad_norm": 0.2109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1917588.9382466371, "perf/iters_per_sec": 0.9143776599152742, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0936400175094605, "data/tokens_consumed": 147431882752, "data/tokens_consumed_B": 147.431882752, "train/loss_slope": 2.6074284688867757e-05} {"step": 70310, "timestamp": 1778270537.9336684, "train/loss": 2.1236498594284057, "train/z_loss": 0.0013846318121068179, "train/perplexity": 8.361600527485276, "train/grad_norm": 0.1640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021284.6240620406, "perf/iters_per_sec": 0.9638236160574153, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037534236907959, "data/tokens_consumed": 147452854272, "data/tokens_consumed_B": 147.452854272, "train/loss_slope": 2.330290935721228e-05} {"step": 70320, "timestamp": 1778270548.643478, "train/loss": 2.075819504261017, "train/z_loss": 0.0013878465048037469, "train/perplexity": 7.971076113969604, "train/grad_norm": 0.130859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1959291.237277995, "perf/iters_per_sec": 0.9342628656759239, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0703625679016113, "data/tokens_consumed": 147473825792, "data/tokens_consumed_B": 147.473825792, "train/loss_slope": 2.0200707588401825e-05} {"step": 70330, "timestamp": 1778270559.024717, "train/loss": 2.155529808998108, "train/z_loss": 0.0013665390550158917, "train/perplexity": 8.63246252394806, "train/grad_norm": 0.1279296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021759.989702832, "perf/iters_per_sec": 0.9640502880586777, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0372902870178222, "data/tokens_consumed": 147494797312, "data/tokens_consumed_B": 147.494797312, "train/loss_slope": 2.133049108658045e-05} {"step": 70340, "timestamp": 1778270569.401391, "train/loss": 2.099755954742432, "train/z_loss": 0.0013778317254036665, "train/perplexity": 8.164177240688943, "train/grad_norm": 0.216796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022092.5812074458, "perf/iters_per_sec": 0.9642088800465802, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0371196746826172, "data/tokens_consumed": 147515768832, "data/tokens_consumed_B": 147.515768832, "train/loss_slope": 2.0129707129267023e-05} {"step": 70350, "timestamp": 1778270579.7733228, "grad/layer_0/attn": 0.0029659175779670477, "grad/layer_0/mlp": 0.0031738481484353542, "grad/layer_0/attn_mlp_ratio": 0.934486259520816, "grad/layer_4/attn": 0.0020962089765816927, "grad/layer_4/mlp": 0.0026172955986112356, "grad/layer_4/attn_mlp_ratio": 0.8009064385403465, "grad/layer_8/attn": 0.010267399251461029, "grad/layer_8/mlp": 0.0037607084959745407, "grad/layer_8/attn_mlp_ratio": 2.730176771061508, "grad/layer_12/attn": 0.005786202382296324, "grad/layer_12/mlp": 0.007383796852082014, "grad/layer_12/attn_mlp_ratio": 0.7836350890803931, "grad/layer_16/attn": 0.003909667022526264, "grad/layer_16/mlp": 0.004833821672946215, "grad/layer_16/attn_mlp_ratio": 0.8088148893713395, "grad/layer_20/attn": 0.0042151473462581635, "grad/layer_20/mlp": 0.005921964067965746, "grad/layer_20/attn_mlp_ratio": 0.7117819741395208, "grad/layer_24/attn": 0.007680263835936785, "grad/layer_24/mlp": 0.00864323042333126, "grad/layer_24/attn_mlp_ratio": 0.8885871798981788, "grad/layer_27/attn": 0.004724358208477497, "grad/layer_27/mlp": 0.0076149022206664085, "grad/layer_27/attn_mlp_ratio": 0.6204095613486544} {"step": 70350, "timestamp": 1778270580.391336, "eos/sharpness": 42.88558959960937, "eos/L0_probe": 1.9683529138565063, "eos/L_plus": 2.173313856124878, "eos/L_minus": 2.1922478675842285, "eos/grad_norm": 0.12847855687141418, "eos/embed_grad_frac": 0.1381009817123413, "eos/time_s": 0.6153497695922852} {"step": 70350, "timestamp": 1778270580.4128377, "train/loss": 2.1364946603775024, "train/z_loss": 0.001363361650146544, "train/perplexity": 8.46969637045601, "train/grad_norm": 0.12890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1905401.9984648202, "perf/iters_per_sec": 0.90856647418252, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1006349325180054, "data/tokens_consumed": 147536740352, "data/tokens_consumed_B": 147.536740352, "train/loss_slope": 1.826261780907936e-05} {"step": 70350, "timestamp": 1778270581.7734, "geo/rankme_last": 438.594482421875, "geo/layer_0/stable_rank_q_proj": 19.240921020507812, "geo/layer_0/stable_rank_k_proj": 15.97139835357666, "geo/layer_0/stable_rank_o_proj": 46.78758239746094, "geo/layer_0/stable_rank_gate_proj": 129.75682067871094, "geo/layer_0/stable_rank_down_proj": 55.911800384521484, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06601250916719437, "geo/layer_0/attn_entropy_mean": 6.158688545227051, "geo/layer_0/attn_entropy_std": 0.42617231607437134, "geo/layer_7/stable_rank_q_proj": 43.102783203125, "geo/layer_7/stable_rank_k_proj": 40.317359924316406, "geo/layer_7/stable_rank_o_proj": 89.35273742675781, "geo/layer_7/stable_rank_gate_proj": 78.8917465209961, "geo/layer_7/stable_rank_down_proj": 139.7279815673828, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.46134090423583984, "geo/layer_7/attn_entropy_mean": 4.658013343811035, "geo/layer_7/attn_entropy_std": 0.7975581884384155, "geo/layer_14/stable_rank_q_proj": 50.07248306274414, "geo/layer_14/stable_rank_k_proj": 40.901371002197266, "geo/layer_14/stable_rank_o_proj": 43.48248291015625, "geo/layer_14/stable_rank_gate_proj": 71.15859985351562, "geo/layer_14/stable_rank_down_proj": 127.43065643310547, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37571612000465393, "geo/layer_14/attn_entropy_mean": 5.541962146759033, "geo/layer_14/attn_entropy_std": 0.425985187292099, "geo/layer_21/stable_rank_q_proj": 39.68746566772461, "geo/layer_21/stable_rank_k_proj": 30.200298309326172, "geo/layer_21/stable_rank_o_proj": 69.40355682373047, "geo/layer_21/stable_rank_gate_proj": 64.66191101074219, "geo/layer_21/stable_rank_down_proj": 50.07181930541992, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14181293547153473, "geo/layer_21/attn_entropy_mean": 5.684101104736328, "geo/layer_21/attn_entropy_std": 0.30580735206604004, "geo/layer_27/stable_rank_q_proj": 43.65861892700195, "geo/layer_27/stable_rank_k_proj": 32.09379196166992, "geo/layer_27/stable_rank_o_proj": 115.1981430053711, "geo/layer_27/stable_rank_gate_proj": 78.48149108886719, "geo/layer_27/stable_rank_down_proj": 127.90647888183594, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09458043426275253, "geo/layer_27/attn_entropy_mean": 4.163896560668945, "geo/layer_27/attn_entropy_std": 0.7681465148925781, "attnres/final_alpha/block_0": 0.2372073233127594, "attnres/block_norm/0": 1.7719311714172363, "attnres/final_alpha/block_1": 0.004099531099200249, "attnres/block_norm/1": 47717.7421875, "attnres/final_alpha/block_2": 0.009965511038899422, "attnres/block_norm/2": 28995.96875, "attnres/final_alpha/block_3": 0.011863172054290771, "attnres/block_norm/3": 60861.859375, "attnres/final_alpha/block_4": 0.013989206403493881, "attnres/block_norm/4": 15603.51953125, "attnres/final_alpha/block_5": 0.6156160831451416, "attnres/block_norm/5": 6804.8037109375, "attnres/final_alpha/block_6": 0.10725920647382736, "attnres/block_norm/6": 40381.5, "geo/tier1_time_s": 1.3570337295532227, "geo/step": 70350.0, "geo/rankme_slope": 0.00010107818518032212} {"step": 70360, "timestamp": 1778270592.161828, "train/loss": 2.135776233673096, "train/z_loss": 0.0013651416869834066, "train/perplexity": 8.463613699643457, "train/grad_norm": 0.140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1785445.223584256, "perf/iters_per_sec": 0.8513666265412597, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1745820999145509, "data/tokens_consumed": 147557711872, "data/tokens_consumed_B": 147.557711872, "train/loss_slope": 1.6953923199364942e-05} {"step": 70370, "timestamp": 1778270602.5416582, "train/loss": 2.06937860250473, "train/z_loss": 0.0013723841751925648, "train/perplexity": 7.9199001823219115, "train/grad_norm": 0.11474609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021322.2010293053, "perf/iters_per_sec": 0.9638415341516997, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0375149488449096, "data/tokens_consumed": 147578683392, "data/tokens_consumed_B": 147.578683392, "train/loss_slope": 1.101111619636316e-05} {"step": 70380, "timestamp": 1778270612.9243364, "train/loss": 2.185913586616516, "train/z_loss": 0.0013723675394430757, "train/perplexity": 8.898774642279717, "train/grad_norm": 0.1875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021202.9721314537, "perf/iters_per_sec": 0.9637846813828724, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037576150894165, "data/tokens_consumed": 147599654912, "data/tokens_consumed_B": 147.599654912, "train/loss_slope": 1.2625355789191449e-05} {"step": 70390, "timestamp": 1778270623.3008661, "train/loss": 2.1496305227279664, "train/z_loss": 0.0013696996495127678, "train/perplexity": 8.581687073018687, "train/grad_norm": 0.2275390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022059.902823992, "perf/iters_per_sec": 0.9641932977790795, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037136435508728, "data/tokens_consumed": 147620626432, "data/tokens_consumed_B": 147.620626432, "train/loss_slope": 1.2824681246563252e-05} {"step": 70400, "timestamp": 1778270633.680955, "grad/layer_0/attn": 0.002866902621462941, "grad/layer_0/mlp": 0.0033011394552886486, "grad/layer_0/attn_mlp_ratio": 0.8684584742471639, "grad/layer_4/attn": 0.002193900989368558, "grad/layer_4/mlp": 0.002700207754969597, "grad/layer_4/attn_mlp_ratio": 0.8124933735492967, "grad/layer_8/attn": 0.006105738691985607, "grad/layer_8/mlp": 0.003885876154527068, "grad/layer_8/attn_mlp_ratio": 1.5712643151907852, "grad/layer_12/attn": 0.007112410385161638, "grad/layer_12/mlp": 0.007520664483308792, "grad/layer_12/attn_mlp_ratio": 0.9457156753070434, "grad/layer_16/attn": 0.003482879139482975, "grad/layer_16/mlp": 0.004715908784419298, "grad/layer_16/attn_mlp_ratio": 0.7385382595049534, "grad/layer_20/attn": 0.004212455358356237, "grad/layer_20/mlp": 0.0062301368452608585, "grad/layer_20/attn_mlp_ratio": 0.6761416956589644, "grad/layer_24/attn": 0.008475047536194324, "grad/layer_24/mlp": 0.00989295169711113, "grad/layer_24/attn_mlp_ratio": 0.8566753088465617, "grad/layer_27/attn": 0.005204764194786549, "grad/layer_27/mlp": 0.009330328553915024, "grad/layer_27/attn_mlp_ratio": 0.5578328896916852} {"step": 70400, "timestamp": 1778270633.6970832, "train/loss": 2.1673758745193483, "train/z_loss": 0.0013706699246540666, "train/perplexity": 8.735331332986469, "train/grad_norm": 0.130859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2018347.1743020772, "perf/iters_per_sec": 0.9624229308615099, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.039044237136841, "data/tokens_consumed": 147641597952, "data/tokens_consumed_B": 147.641597952, "train/loss_slope": 1.1825910531612591e-05} {"step": 70410, "timestamp": 1778270644.0826557, "train/loss": 2.10502610206604, "train/z_loss": 0.0013769995304755867, "train/perplexity": 8.207317234736411, "train/grad_norm": 0.1103515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020679.4975719482, "perf/iters_per_sec": 0.9635350692615262, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0378449440002442, "data/tokens_consumed": 147662569472, "data/tokens_consumed_B": 147.662569472, "train/loss_slope": 1.0244665366194859e-05} {"step": 70420, "timestamp": 1778270654.465564, "train/loss": 2.1802768230438234, "train/z_loss": 0.0013635174487717449, "train/perplexity": 8.848755459127931, "train/grad_norm": 0.1884765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021223.547053097, "perf/iters_per_sec": 0.9637944922700391, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0375655889511108, "data/tokens_consumed": 147683540992, "data/tokens_consumed_B": 147.683540992, "train/loss_slope": 1.4516300444055076e-05} {"step": 70425, "timestamp": 1778270660.2614725, "eos/sharpness": 87.3244285583496, "eos/L0_probe": 1.9697188138961792, "eos/L_plus": 2.330901622772217, "eos/L_minus": 2.4817802906036377, "eos/grad_norm": 0.24490870535373688, "eos/embed_grad_frac": 0.0372055284678936, "eos/time_s": 0.6197941303253174} {"step": 70425, "timestamp": 1778270661.6411715, "geo/rankme_last": 438.927978515625, "geo/layer_0/stable_rank_q_proj": 19.217897415161133, "geo/layer_0/stable_rank_k_proj": 15.956267356872559, "geo/layer_0/stable_rank_o_proj": 46.7331428527832, "geo/layer_0/stable_rank_gate_proj": 129.3388671875, "geo/layer_0/stable_rank_down_proj": 55.874149322509766, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06121799722313881, "geo/layer_0/attn_entropy_mean": 6.155881881713867, "geo/layer_0/attn_entropy_std": 0.4261162579059601, "geo/layer_7/stable_rank_q_proj": 43.14577102661133, "geo/layer_7/stable_rank_k_proj": 40.37346267700195, "geo/layer_7/stable_rank_o_proj": 89.4722900390625, "geo/layer_7/stable_rank_gate_proj": 78.97016143798828, "geo/layer_7/stable_rank_down_proj": 139.7763671875, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.45557141304016113, "geo/layer_7/attn_entropy_mean": 4.654247283935547, "geo/layer_7/attn_entropy_std": 0.789847195148468, "geo/layer_14/stable_rank_q_proj": 50.13632583618164, "geo/layer_14/stable_rank_k_proj": 40.99272537231445, "geo/layer_14/stable_rank_o_proj": 43.39769744873047, "geo/layer_14/stable_rank_gate_proj": 71.2099380493164, "geo/layer_14/stable_rank_down_proj": 127.33735656738281, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38964539766311646, "geo/layer_14/attn_entropy_mean": 5.512226104736328, "geo/layer_14/attn_entropy_std": 0.4295786917209625, "geo/layer_21/stable_rank_q_proj": 39.64704132080078, "geo/layer_21/stable_rank_k_proj": 30.16498565673828, "geo/layer_21/stable_rank_o_proj": 69.30408477783203, "geo/layer_21/stable_rank_gate_proj": 64.64137268066406, "geo/layer_21/stable_rank_down_proj": 50.08047866821289, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14475001394748688, "geo/layer_21/attn_entropy_mean": 5.681488990783691, "geo/layer_21/attn_entropy_std": 0.30522236227989197, "geo/layer_27/stable_rank_q_proj": 43.690650939941406, "geo/layer_27/stable_rank_k_proj": 32.026912689208984, "geo/layer_27/stable_rank_o_proj": 115.19950103759766, "geo/layer_27/stable_rank_gate_proj": 78.52665710449219, "geo/layer_27/stable_rank_down_proj": 127.8963394165039, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.1047966480255127, "geo/layer_27/attn_entropy_mean": 4.172691822052002, "geo/layer_27/attn_entropy_std": 0.7583826184272766, "attnres/final_alpha/block_0": 0.24021051824092865, "attnres/block_norm/0": 1.772075891494751, "attnres/final_alpha/block_1": 0.004151251167058945, "attnres/block_norm/1": 47936.953125, "attnres/final_alpha/block_2": 0.010240964591503143, "attnres/block_norm/2": 28879.36328125, "attnres/final_alpha/block_3": 0.012029396370053291, "attnres/block_norm/3": 60498.390625, "attnres/final_alpha/block_4": 0.014153344556689262, "attnres/block_norm/4": 15639.390625, "attnres/final_alpha/block_5": 0.6104348301887512, "attnres/block_norm/5": 6827.2666015625, "attnres/final_alpha/block_6": 0.10877968370914459, "attnres/block_norm/6": 40333.22265625, "geo/tier1_time_s": 1.360353708267212, "geo/step": 70425.0, "geo/rankme_slope": 0.00012141919267707083} {"step": 70430, "timestamp": 1778270666.837304, "train/loss": 2.079374599456787, "train/z_loss": 0.0013933645910583436, "train/perplexity": 7.999464480140253, "train/grad_norm": 0.1708984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1695808.6558525756, "perf/iters_per_sec": 0.8086245803130033, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2366678237915039, "data/tokens_consumed": 147704512512, "data/tokens_consumed_B": 147.704512512, "train/loss_slope": 1.1362306641774151e-05} {"step": 70440, "timestamp": 1778270677.217423, "train/loss": 2.1247450590133665, "train/z_loss": 0.0013863736181519925, "train/perplexity": 8.370763165455385, "train/grad_norm": 0.107421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021732.7123875443, "perf/iters_per_sec": 0.9640372812211725, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373042821884155, "data/tokens_consumed": 147725484032, "data/tokens_consumed_B": 147.725484032, "train/loss_slope": 9.108490082654692e-06} {"step": 70450, "timestamp": 1778270687.5810087, "grad/layer_0/attn": 0.0023131759371608496, "grad/layer_0/mlp": 0.002765845973044634, "grad/layer_0/attn_mlp_ratio": 0.8363357453997841, "grad/layer_4/attn": 0.0024103892501443624, "grad/layer_4/mlp": 0.002577048959210515, "grad/layer_4/attn_mlp_ratio": 0.9353292059107291, "grad/layer_8/attn": 0.006016104482114315, "grad/layer_8/mlp": 0.0034679637756198645, "grad/layer_8/attn_mlp_ratio": 1.7347656140273362, "grad/layer_12/attn": 0.0053639039397239685, "grad/layer_12/mlp": 0.006640291307121515, "grad/layer_12/attn_mlp_ratio": 0.8077814076007781, "grad/layer_16/attn": 0.006405798718333244, "grad/layer_16/mlp": 0.004725415725260973, "grad/layer_16/attn_mlp_ratio": 1.355605295959212, "grad/layer_20/attn": 0.0029647124465554953, "grad/layer_20/mlp": 0.005675177089869976, "grad/layer_20/attn_mlp_ratio": 0.5223999793076805, "grad/layer_24/attn": 0.007482283748686314, "grad/layer_24/mlp": 0.008450225926935673, "grad/layer_24/attn_mlp_ratio": 0.8854536819294565, "grad/layer_27/attn": 0.004233154002577066, "grad/layer_27/mlp": 0.008050336502492428, "grad/layer_27/attn_mlp_ratio": 0.5258356527932583} {"step": 70450, "timestamp": 1778270687.5969365, "train/loss": 2.0677871704101562, "train/z_loss": 0.001381639251485467, "train/perplexity": 7.9073062228596, "train/grad_norm": 0.11083984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021416.8695951484, "perf/iters_per_sec": 0.9638866756416075, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0374663591384887, "data/tokens_consumed": 147746455552, "data/tokens_consumed_B": 147.746455552, "train/loss_slope": 3.2610870716703095e-06} {"step": 70460, "timestamp": 1778270697.969623, "train/loss": 2.121900236606598, "train/z_loss": 0.0013854033779352904, "train/perplexity": 8.346983671097044, "train/grad_norm": 0.115234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022784.5137770497, "perf/iters_per_sec": 0.9645388192067383, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036764907836914, "data/tokens_consumed": 147767427072, "data/tokens_consumed_B": 147.767427072, "train/loss_slope": 1.4893580870767513e-06} {"step": 70470, "timestamp": 1778270708.343915, "train/loss": 2.1654736518859865, "train/z_loss": 0.001363959745503962, "train/perplexity": 8.718730582182651, "train/grad_norm": 0.130859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022843.033549909, "perf/iters_per_sec": 0.9645667236089273, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0367349147796632, "data/tokens_consumed": 147788398592, "data/tokens_consumed_B": 147.788398592, "train/loss_slope": 2.182138699366517e-06} {"step": 70480, "timestamp": 1778270718.7175653, "train/loss": 2.1442035675048827, "train/z_loss": 0.001367373496759683, "train/perplexity": 8.535240786487545, "train/grad_norm": 0.09716796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022406.91473017, "perf/iters_per_sec": 0.964358765950284, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0369584798812865, "data/tokens_consumed": 147809370112, "data/tokens_consumed_B": 147.809370112, "train/loss_slope": -1.2433793213572335e-06} {"step": 70490, "timestamp": 1778270729.091393, "train/loss": 2.154658818244934, "train/z_loss": 0.0013585645821876824, "train/perplexity": 8.624947002362605, "train/grad_norm": 0.13671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022311.2700054038, "perf/iters_per_sec": 0.9643131589915294, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0370075225830078, "data/tokens_consumed": 147830341632, "data/tokens_consumed_B": 147.830341632, "train/loss_slope": -4.2476251323478844e-07} {"step": 70500, "timestamp": 1778270739.4601305, "grad/layer_0/attn": 0.0027022839058190584, "grad/layer_0/mlp": 0.0030107470229268074, "grad/layer_0/attn_mlp_ratio": 0.8975459563645166, "grad/layer_4/attn": 0.0027286293916404247, "grad/layer_4/mlp": 0.002846202114596963, "grad/layer_4/attn_mlp_ratio": 0.9586913317846648, "grad/layer_8/attn": 0.00491326255723834, "grad/layer_8/mlp": 0.0036989599466323853, "grad/layer_8/attn_mlp_ratio": 1.3282821374919913, "grad/layer_12/attn": 0.0046918755397200584, "grad/layer_12/mlp": 0.006790611892938614, "grad/layer_12/attn_mlp_ratio": 0.6909355952893527, "grad/layer_16/attn": 0.005674825515598059, "grad/layer_16/mlp": 0.004816507454961538, "grad/layer_16/attn_mlp_ratio": 1.178203387172591, "grad/layer_20/attn": 0.007240001577883959, "grad/layer_20/mlp": 0.005968082696199417, "grad/layer_20/attn_mlp_ratio": 1.213120163563167, "grad/layer_24/attn": 0.0059775784611701965, "grad/layer_24/mlp": 0.008608262985944748, "grad/layer_24/attn_mlp_ratio": 0.6944000667138258, "grad/layer_27/attn": 0.006172075401991606, "grad/layer_27/mlp": 0.007476955186575651, "grad/layer_27/attn_mlp_ratio": 0.8254797795933239} {"step": 70500, "timestamp": 1778270740.0766475, "eos/sharpness": 8.744645118713377, "eos/L0_probe": 1.968605399131775, "eos/L_plus": 2.0152976512908936, "eos/L_minus": 2.00935959815979, "eos/grad_norm": 0.09139459580183029, "eos/embed_grad_frac": 0.28428471088409424, "eos/time_s": 0.6137022972106934} {"step": 70500, "timestamp": 1778270740.0957358, "train/loss": 2.157959151268005, "train/z_loss": 0.0013893836410716176, "train/perplexity": 8.653459223809683, "train/grad_norm": 0.09130859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1906951.0975861715, "perf/iters_per_sec": 0.9093051422053201, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0997408390045167, "data/tokens_consumed": 147851313152, "data/tokens_consumed_B": 147.851313152, "train/loss_slope": -2.9563502509993287e-07} {"step": 70500, "timestamp": 1778270741.4586377, "geo/rankme_last": 438.40753173828125, "geo/layer_0/stable_rank_q_proj": 19.238985061645508, "geo/layer_0/stable_rank_k_proj": 15.968940734863281, "geo/layer_0/stable_rank_o_proj": 46.77316665649414, "geo/layer_0/stable_rank_gate_proj": 129.29844665527344, "geo/layer_0/stable_rank_down_proj": 55.872440338134766, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06430443376302719, "geo/layer_0/attn_entropy_mean": 6.155123710632324, "geo/layer_0/attn_entropy_std": 0.4258601665496826, "geo/layer_7/stable_rank_q_proj": 43.2280387878418, "geo/layer_7/stable_rank_k_proj": 40.315216064453125, "geo/layer_7/stable_rank_o_proj": 89.66766357421875, "geo/layer_7/stable_rank_gate_proj": 79.01861572265625, "geo/layer_7/stable_rank_down_proj": 139.8628692626953, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4367130696773529, "geo/layer_7/attn_entropy_mean": 4.662115097045898, "geo/layer_7/attn_entropy_std": 0.7836592197418213, "geo/layer_14/stable_rank_q_proj": 50.13289260864258, "geo/layer_14/stable_rank_k_proj": 41.014007568359375, "geo/layer_14/stable_rank_o_proj": 43.42974090576172, "geo/layer_14/stable_rank_gate_proj": 71.16090393066406, "geo/layer_14/stable_rank_down_proj": 127.53109741210938, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39241525530815125, "geo/layer_14/attn_entropy_mean": 5.542720317840576, "geo/layer_14/attn_entropy_std": 0.42677727341651917, "geo/layer_21/stable_rank_q_proj": 39.74535369873047, "geo/layer_21/stable_rank_k_proj": 30.154626846313477, "geo/layer_21/stable_rank_o_proj": 69.264404296875, "geo/layer_21/stable_rank_gate_proj": 64.6883544921875, "geo/layer_21/stable_rank_down_proj": 50.030479431152344, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14182163774967194, "geo/layer_21/attn_entropy_mean": 5.6860456466674805, "geo/layer_21/attn_entropy_std": 0.3052080571651459, "geo/layer_27/stable_rank_q_proj": 43.693153381347656, "geo/layer_27/stable_rank_k_proj": 32.11119842529297, "geo/layer_27/stable_rank_o_proj": 115.16836547851562, "geo/layer_27/stable_rank_gate_proj": 78.44937133789062, "geo/layer_27/stable_rank_down_proj": 127.54148864746094, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10035663843154907, "geo/layer_27/attn_entropy_mean": 4.161103248596191, "geo/layer_27/attn_entropy_std": 0.7618470191955566, "attnres/final_alpha/block_0": 0.23862504959106445, "attnres/block_norm/0": 1.7721357345581055, "attnres/final_alpha/block_1": 0.00411895802244544, "attnres/block_norm/1": 47928.6953125, "attnres/final_alpha/block_2": 0.009946320205926895, "attnres/block_norm/2": 28846.4453125, "attnres/final_alpha/block_3": 0.011781837791204453, "attnres/block_norm/3": 60847.9296875, "attnres/final_alpha/block_4": 0.013840766623616219, "attnres/block_norm/4": 15646.1474609375, "attnres/final_alpha/block_5": 0.6146608591079712, "attnres/block_norm/5": 6809.80419921875, "attnres/final_alpha/block_6": 0.10702624917030334, "attnres/block_norm/6": 40489.125, "geo/tier1_time_s": 1.3590359687805176, "geo/step": 70500.0, "geo/rankme_slope": 0.0001129858193277311} {"step": 70500, "timestamp": 1778270748.3947775, "geo/ww_alpha_mean": 7.762676460447233, "geo/ww_alpha_std": 4.78961672939014, "geo/ww_alpha_min": 1.3465796701815298, "geo/ww_alpha_max": 29.95116167892581, "geo/ww_alpha_healthy_frac": 0.16243654822335024, "geo/ww_alpha_by_type/q_proj": 3.95935278386832, "geo/ww_alpha_by_type/k_proj": 4.4479851521000935, "geo/ww_alpha_by_type/v_proj": 9.43717381000786, "geo/ww_alpha_by_type/o_proj": 8.205462183130601, "geo/ww_alpha_by_type/gate_proj": 8.168667713257184, "geo/ww_alpha_by_type/up_proj": 12.249737098795208, "geo/ww_alpha_by_type/down_proj": 7.968234968864247, "geo/twonn_id/layer_0": 0.7154154777526855, "geo/twonn_id/layer_7": 3.7968735694885254, "geo/twonn_id/layer_14": 4.687313079833984, "geo/twonn_id/layer_21": 7.535564422607422, "geo/twonn_id/layer_27": 5.2724289894104, "geo/tier2_time_s": 6.929714918136597} {"step": 70500, "timestamp": 1778270749.0157623, "eoc/jacobian_sigma/layer_0/attn": 1165.026123046875, "eoc/jacobian_sigma/layer_0/mlp": 8874.2060546875, "eoc/jacobian_sigma/layer_0": 8874.2060546875, "eoc/jacobian_sigma/layer_7/attn": 1.1496031284332275, "eoc/jacobian_sigma/layer_7/mlp": 1.7136805057525635, "eoc/jacobian_sigma/layer_7": 1.7136805057525635, "eoc/jacobian_sigma/layer_14/attn": 1.455126404762268, "eoc/jacobian_sigma/layer_14/mlp": 7.733077049255371, "eoc/jacobian_sigma/layer_14": 7.733077049255371, "eoc/jacobian_sigma/layer_21/attn": 1.1053593158721924, "eoc/jacobian_sigma/layer_21/mlp": 4.098773002624512, "eoc/jacobian_sigma/layer_21": 4.098773002624512, "eoc/jacobian_sigma/layer_27/attn": 3.0073091983795166, "eoc/jacobian_sigma/layer_27/mlp": 24.53995132446289, "eoc/jacobian_sigma/layer_27": 24.53995132446289, "eoc/layer0_sigma": 8874.2060546875, "eoc/sigma_max": 24.53995132446289, "eoc/sigma_min": 1.7136805057525635, "eoc/sigma_mean": 9.521370470523834, "eoc/time_s": 0.6143155097961426} {"step": 70510, "timestamp": 1778270759.4120507, "train/loss": 2.1711404800415037, "train/z_loss": 0.001379970065318048, "train/perplexity": 8.768278386980027, "train/grad_norm": 0.11767578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1085893.263895827, "perf/iters_per_sec": 0.5177942580680023, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.9312690019607544, "data/tokens_consumed": 147872284672, "data/tokens_consumed_B": 147.872284672, "train/loss_slope": -1.2003578797782953e-06} {"step": 70520, "timestamp": 1778270769.7868254, "train/loss": 2.0915740489959718, "train/z_loss": 0.001383214059751481, "train/perplexity": 8.097651237874253, "train/grad_norm": 0.0927734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022773.5823983029, "perf/iters_per_sec": 0.9645336067191614, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036770510673523, "data/tokens_consumed": 147893256192, "data/tokens_consumed_B": 147.893256192, "train/loss_slope": -2.7663143125339983e-06} {"step": 70530, "timestamp": 1778270780.1665385, "train/loss": 2.1133280754089356, "train/z_loss": 0.001394046435598284, "train/perplexity": 8.275737783444308, "train/grad_norm": 0.09765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021426.392684628, "perf/iters_per_sec": 0.9638912166045323, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0374614715576171, "data/tokens_consumed": 147914227712, "data/tokens_consumed_B": 147.914227712, "train/loss_slope": -4.272817463764654e-06} {"step": 70540, "timestamp": 1778270790.5199974, "train/loss": 2.115612101554871, "train/z_loss": 0.0013887180713936687, "train/perplexity": 8.294661387695358, "train/grad_norm": 0.09228515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026442.5429115288, "perf/iters_per_sec": 0.9662831034238476, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348933935165405, "data/tokens_consumed": 147935199232, "data/tokens_consumed_B": 147.935199232, "train/loss_slope": -3.54214749678169e-06} {"step": 70550, "timestamp": 1778270800.8608973, "grad/layer_0/attn": 0.0025534764863550663, "grad/layer_0/mlp": 0.002966450061649084, "grad/layer_0/attn_mlp_ratio": 0.8607852305651273, "grad/layer_4/attn": 0.002288047457113862, "grad/layer_4/mlp": 0.0025744601152837276, "grad/layer_4/attn_mlp_ratio": 0.8887484232735364, "grad/layer_8/attn": 0.008863505907356739, "grad/layer_8/mlp": 0.0035276683047413826, "grad/layer_8/attn_mlp_ratio": 2.51256776159678, "grad/layer_12/attn": 0.00450964504852891, "grad/layer_12/mlp": 0.006344020366668701, "grad/layer_12/attn_mlp_ratio": 0.7108497004734543, "grad/layer_16/attn": 0.005918556824326515, "grad/layer_16/mlp": 0.004935626406222582, "grad/layer_16/attn_mlp_ratio": 1.1991500606589065, "grad/layer_20/attn": 0.0033047772012650967, "grad/layer_20/mlp": 0.005811873357743025, "grad/layer_20/attn_mlp_ratio": 0.5686251129336305, "grad/layer_24/attn": 0.013236281462013721, "grad/layer_24/mlp": 0.009883888997137547, "grad/layer_24/attn_mlp_ratio": 1.3391774565587806, "grad/layer_27/attn": 0.004938932601362467, "grad/layer_27/mlp": 0.008558818139135838, "grad/layer_27/attn_mlp_ratio": 0.5770577740252536} {"step": 70550, "timestamp": 1778270800.8767872, "train/loss": 2.171140897274017, "train/z_loss": 0.0013823584304191172, "train/perplexity": 8.76828204539162, "train/grad_norm": 0.1513671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026191.1745537405, "perf/iters_per_sec": 0.9661632416504576, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350217819213867, "data/tokens_consumed": 147956170752, "data/tokens_consumed_B": 147.956170752, "train/loss_slope": -1.5451479058275803e-06} {"step": 70560, "timestamp": 1778270811.2252257, "train/loss": 2.1289092779159544, "train/z_loss": 0.0013841880019754172, "train/perplexity": 8.40569353403215, "train/grad_norm": 0.1875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027496.401083966, "perf/iters_per_sec": 0.9667856221599417, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343554735183715, "data/tokens_consumed": 147977142272, "data/tokens_consumed_B": 147.977142272, "train/loss_slope": -3.4078413539558273e-06} {"step": 70570, "timestamp": 1778270821.5780385, "train/loss": 2.1966631174087525, "train/z_loss": 0.0013531408389098943, "train/perplexity": 8.994948278954972, "train/grad_norm": 0.1513671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026601.4249391726, "perf/iters_per_sec": 0.9663588642783988, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348122596740723, "data/tokens_consumed": 147998113792, "data/tokens_consumed_B": 147.998113792, "train/loss_slope": 5.291250493838777e-07} {"step": 70575, "timestamp": 1778270827.372865, "eos/sharpness": 17.774939537048336, "eos/L0_probe": 1.9636346101760864, "eos/L_plus": 2.045405864715576, "eos/L_minus": 2.05961275100708, "eos/grad_norm": 0.09316068142652512, "eos/embed_grad_frac": 0.2573311924934387, "eos/time_s": 0.6220388412475586} {"step": 70575, "timestamp": 1778270828.7498214, "geo/rankme_last": 439.1662292480469, "geo/layer_0/stable_rank_q_proj": 19.24588394165039, "geo/layer_0/stable_rank_k_proj": 15.977961540222168, "geo/layer_0/stable_rank_o_proj": 46.798397064208984, "geo/layer_0/stable_rank_gate_proj": 129.17848205566406, "geo/layer_0/stable_rank_down_proj": 55.977054595947266, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06202683225274086, "geo/layer_0/attn_entropy_mean": 6.152494430541992, "geo/layer_0/attn_entropy_std": 0.4275732636451721, "geo/layer_7/stable_rank_q_proj": 43.1865348815918, "geo/layer_7/stable_rank_k_proj": 40.2138786315918, "geo/layer_7/stable_rank_o_proj": 89.58683776855469, "geo/layer_7/stable_rank_gate_proj": 79.04691314697266, "geo/layer_7/stable_rank_down_proj": 139.5248565673828, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.442660391330719, "geo/layer_7/attn_entropy_mean": 4.619941711425781, "geo/layer_7/attn_entropy_std": 0.7982147336006165, "geo/layer_14/stable_rank_q_proj": 50.20783615112305, "geo/layer_14/stable_rank_k_proj": 41.03333282470703, "geo/layer_14/stable_rank_o_proj": 43.40907669067383, "geo/layer_14/stable_rank_gate_proj": 71.19482421875, "geo/layer_14/stable_rank_down_proj": 127.35838317871094, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39355185627937317, "geo/layer_14/attn_entropy_mean": 5.530810356140137, "geo/layer_14/attn_entropy_std": 0.42053744196891785, "geo/layer_21/stable_rank_q_proj": 39.74892807006836, "geo/layer_21/stable_rank_k_proj": 30.14708137512207, "geo/layer_21/stable_rank_o_proj": 69.27069854736328, "geo/layer_21/stable_rank_gate_proj": 64.64445495605469, "geo/layer_21/stable_rank_down_proj": 50.04736328125, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1382278949022293, "geo/layer_21/attn_entropy_mean": 5.673731803894043, "geo/layer_21/attn_entropy_std": 0.2959446310997009, "geo/layer_27/stable_rank_q_proj": 43.68219757080078, "geo/layer_27/stable_rank_k_proj": 32.17389678955078, "geo/layer_27/stable_rank_o_proj": 115.25485229492188, "geo/layer_27/stable_rank_gate_proj": 78.40325164794922, "geo/layer_27/stable_rank_down_proj": 127.57415008544922, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09724190831184387, "geo/layer_27/attn_entropy_mean": 4.174251556396484, "geo/layer_27/attn_entropy_std": 0.7734902501106262, "attnres/final_alpha/block_0": 0.2379055917263031, "attnres/block_norm/0": 1.7722707986831665, "attnres/final_alpha/block_1": 0.004035188816487789, "attnres/block_norm/1": 47927.42578125, "attnres/final_alpha/block_2": 0.009990997612476349, "attnres/block_norm/2": 28748.005859375, "attnres/final_alpha/block_3": 0.011685953475534916, "attnres/block_norm/3": 60741.234375, "attnres/final_alpha/block_4": 0.013895593583583832, "attnres/block_norm/4": 15657.19921875, "attnres/final_alpha/block_5": 0.615051805973053, "attnres/block_norm/5": 6804.1591796875, "attnres/final_alpha/block_6": 0.10743483155965805, "attnres/block_norm/6": 40508.54296875, "geo/tier1_time_s": 1.357328176498413, "geo/step": 70575.0, "geo/rankme_slope": 0.0001032968656212485} {"step": 70580, "timestamp": 1778270833.9265459, "train/loss": 2.144344663619995, "train/z_loss": 0.0013732962193898856, "train/perplexity": 8.536445160768336, "train/grad_norm": 0.1416015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1699114.9285347764, "perf/iters_per_sec": 0.8102011339830286, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.234261417388916, "data/tokens_consumed": 148019085312, "data/tokens_consumed_B": 148.019085312, "train/loss_slope": -3.65645256695045e-08} {"step": 70590, "timestamp": 1778270844.291863, "train/loss": 2.127666139602661, "train/z_loss": 0.001388229732401669, "train/perplexity": 8.395250586709023, "train/grad_norm": 0.154296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024389.824084548, "perf/iters_per_sec": 0.9653042908118, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0359427690505982, "data/tokens_consumed": 148040056832, "data/tokens_consumed_B": 148.040056832, "train/loss_slope": -2.4220474053173394e-06} {"step": 70600, "timestamp": 1778270854.6322308, "grad/layer_0/attn": 0.003841608762741089, "grad/layer_0/mlp": 0.003696227679029107, "grad/layer_0/attn_mlp_ratio": 1.0393322577512172, "grad/layer_4/attn": 0.00243589049205184, "grad/layer_4/mlp": 0.0026946873404085636, "grad/layer_4/attn_mlp_ratio": 0.9039603092826657, "grad/layer_8/attn": 0.00495676277205348, "grad/layer_8/mlp": 0.00387902045622468, "grad/layer_8/attn_mlp_ratio": 1.2778387482632292, "grad/layer_12/attn": 0.004648847039788961, "grad/layer_12/mlp": 0.007133438717573881, "grad/layer_12/attn_mlp_ratio": 0.6516978919530508, "grad/layer_16/attn": 0.004299707245081663, "grad/layer_16/mlp": 0.004667914006859064, "grad/layer_16/attn_mlp_ratio": 0.9211196150254018, "grad/layer_20/attn": 0.0034589283168315887, "grad/layer_20/mlp": 0.006433041766285896, "grad/layer_20/attn_mlp_ratio": 0.5376816113942989, "grad/layer_24/attn": 0.013690004125237465, "grad/layer_24/mlp": 0.01048165000975132, "grad/layer_24/attn_mlp_ratio": 1.3060924550898088, "grad/layer_27/attn": 0.009220071136951447, "grad/layer_27/mlp": 0.0097309285774827, "grad/layer_27/attn_mlp_ratio": 0.947501666340092} {"step": 70600, "timestamp": 1778270854.647953, "train/loss": 2.136972725391388, "train/z_loss": 0.0013783449539914727, "train/perplexity": 8.47374640398149, "train/grad_norm": 0.1767578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026024.516942566, "perf/iters_per_sec": 0.9660837731087523, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351069211959838, "data/tokens_consumed": 148061028352, "data/tokens_consumed_B": 148.061028352, "train/loss_slope": -4.527701102610746e-06} {"step": 70610, "timestamp": 1778270864.9972064, "train/loss": 2.16393096446991, "train/z_loss": 0.001370228361338377, "train/perplexity": 8.705290675682123, "train/grad_norm": 0.09033203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027658.954000452, "perf/iters_per_sec": 0.9668631334306965, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03427255153656, "data/tokens_consumed": 148081999872, "data/tokens_consumed_B": 148.081999872, "train/loss_slope": -3.579183795092315e-06} {"step": 70620, "timestamp": 1778270875.3591642, "train/loss": 2.1237950563430785, "train/z_loss": 0.0013785311137326061, "train/perplexity": 8.362814694228094, "train/grad_norm": 0.263671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025063.329112731, "perf/iters_per_sec": 0.9656254430354743, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0355982303619384, "data/tokens_consumed": 148102971392, "data/tokens_consumed_B": 148.102971392, "train/loss_slope": -4.833841488377836e-06} {"step": 70630, "timestamp": 1778270885.7172148, "train/loss": 2.139369082450867, "train/z_loss": 0.0013698895811103285, "train/perplexity": 8.494076875802719, "train/grad_norm": 0.142578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025875.990294876, "perf/iters_per_sec": 0.9660129500841503, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035182809829712, "data/tokens_consumed": 148123942912, "data/tokens_consumed_B": 148.123942912, "train/loss_slope": -3.2733823027726853e-06} {"step": 70640, "timestamp": 1778270896.0650399, "train/loss": 2.1349776148796082, "train/z_loss": 0.0013797678053379058, "train/perplexity": 8.45685719697621, "train/grad_norm": 0.203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027743.699366708, "perf/iters_per_sec": 0.9669035431703129, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034229326248169, "data/tokens_consumed": 148144914432, "data/tokens_consumed_B": 148.144914432, "train/loss_slope": -2.169809289927111e-06} {"step": 70650, "timestamp": 1778270906.4009228, "grad/layer_0/attn": 0.010308049619197845, "grad/layer_0/mlp": 0.010283377952873707, "grad/layer_0/attn_mlp_ratio": 1.0023991694360828, "grad/layer_4/attn": 0.0045829094015061855, "grad/layer_4/mlp": 0.00416281633079052, "grad/layer_4/attn_mlp_ratio": 1.1009155646663695, "grad/layer_8/attn": 0.005616029258817434, "grad/layer_8/mlp": 0.003942372743040323, "grad/layer_8/attn_mlp_ratio": 1.4245302213695248, "grad/layer_12/attn": 0.005571556743234396, "grad/layer_12/mlp": 0.0074639334343373775, "grad/layer_12/attn_mlp_ratio": 0.7464638742564874, "grad/layer_16/attn": 0.004528645891696215, "grad/layer_16/mlp": 0.004445206839591265, "grad/layer_16/attn_mlp_ratio": 1.0187705439226689, "grad/layer_20/attn": 0.004182486329227686, "grad/layer_20/mlp": 0.006049163173884153, "grad/layer_20/attn_mlp_ratio": 0.6914156784764914, "grad/layer_24/attn": 0.01005080621689558, "grad/layer_24/mlp": 0.01079473178833723, "grad/layer_24/attn_mlp_ratio": 0.9310843771631422, "grad/layer_27/attn": 0.005869318265467882, "grad/layer_27/mlp": 0.00954781286418438, "grad/layer_27/attn_mlp_ratio": 0.6147290785318885} {"step": 70650, "timestamp": 1778270907.0117269, "eos/sharpness": 28.499484062194817, "eos/L0_probe": 1.9656124114990234, "eos/L_plus": 2.119293451309204, "eos/L_minus": 2.096926212310791, "eos/grad_norm": 0.16801267862319946, "eos/embed_grad_frac": 0.20619861781597137, "eos/time_s": 0.6080915927886963} {"step": 70650, "timestamp": 1778270907.033696, "train/loss": 2.1352667570114137, "train/z_loss": 0.0013675535330548883, "train/perplexity": 8.459302784238638, "train/grad_norm": 0.16796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1912677.335769543, "perf/iters_per_sec": 0.9120356253478732, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0964483976364137, "data/tokens_consumed": 148165885952, "data/tokens_consumed_B": 148.165885952, "train/loss_slope": -3.6458707020299154e-06} {"step": 70650, "timestamp": 1778270908.39547, "geo/rankme_last": 437.5508728027344, "geo/layer_0/stable_rank_q_proj": 19.24539566040039, "geo/layer_0/stable_rank_k_proj": 15.953739166259766, "geo/layer_0/stable_rank_o_proj": 46.75852584838867, "geo/layer_0/stable_rank_gate_proj": 129.14962768554688, "geo/layer_0/stable_rank_down_proj": 56.04563903808594, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06380477547645569, "geo/layer_0/attn_entropy_mean": 6.155018329620361, "geo/layer_0/attn_entropy_std": 0.4278167486190796, "geo/layer_7/stable_rank_q_proj": 43.18804168701172, "geo/layer_7/stable_rank_k_proj": 40.20123291015625, "geo/layer_7/stable_rank_o_proj": 89.59941101074219, "geo/layer_7/stable_rank_gate_proj": 78.98892974853516, "geo/layer_7/stable_rank_down_proj": 139.56057739257812, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4344105124473572, "geo/layer_7/attn_entropy_mean": 4.649921417236328, "geo/layer_7/attn_entropy_std": 0.8002851605415344, "geo/layer_14/stable_rank_q_proj": 50.347023010253906, "geo/layer_14/stable_rank_k_proj": 40.95791244506836, "geo/layer_14/stable_rank_o_proj": 43.355018615722656, "geo/layer_14/stable_rank_gate_proj": 71.24717712402344, "geo/layer_14/stable_rank_down_proj": 127.59394073486328, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4077315926551819, "geo/layer_14/attn_entropy_mean": 5.550264358520508, "geo/layer_14/attn_entropy_std": 0.42871996760368347, "geo/layer_21/stable_rank_q_proj": 39.76008605957031, "geo/layer_21/stable_rank_k_proj": 30.146474838256836, "geo/layer_21/stable_rank_o_proj": 69.12921905517578, "geo/layer_21/stable_rank_gate_proj": 64.56499481201172, "geo/layer_21/stable_rank_down_proj": 50.02192687988281, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14049489796161652, "geo/layer_21/attn_entropy_mean": 5.68196964263916, "geo/layer_21/attn_entropy_std": 0.30596229434013367, "geo/layer_27/stable_rank_q_proj": 43.72333908081055, "geo/layer_27/stable_rank_k_proj": 32.12029266357422, "geo/layer_27/stable_rank_o_proj": 115.24800872802734, "geo/layer_27/stable_rank_gate_proj": 78.41126251220703, "geo/layer_27/stable_rank_down_proj": 127.83484649658203, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09656810760498047, "geo/layer_27/attn_entropy_mean": 4.1490373611450195, "geo/layer_27/attn_entropy_std": 0.7760551571846008, "attnres/final_alpha/block_0": 0.23762573301792145, "attnres/block_norm/0": 1.7722742557525635, "attnres/final_alpha/block_1": 0.0040495977737009525, "attnres/block_norm/1": 47912.33984375, "attnres/final_alpha/block_2": 0.010044518858194351, "attnres/block_norm/2": 28779.599609375, "attnres/final_alpha/block_3": 0.011706574819982052, "attnres/block_norm/3": 60903.2109375, "attnres/final_alpha/block_4": 0.013994373381137848, "attnres/block_norm/4": 15624.2607421875, "attnres/final_alpha/block_5": 0.6140488982200623, "attnres/block_norm/5": 6816.2998046875, "attnres/final_alpha/block_6": 0.10853031277656555, "attnres/block_norm/6": 40847.1953125, "geo/tier1_time_s": 1.357853889465332, "geo/step": 70650.0, "geo/rankme_slope": 2.1631347851640657e-05} {"step": 70660, "timestamp": 1778270918.7506866, "train/loss": 2.0999680757522583, "train/z_loss": 0.0013699505827389658, "train/perplexity": 8.165909217897523, "train/grad_norm": 0.291015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1790437.8606692166, "perf/iters_per_sec": 0.8537473014207919, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1713067770004273, "data/tokens_consumed": 148186857472, "data/tokens_consumed_B": 148.186857472, "train/loss_slope": -6.546792961118308e-06} {"step": 70670, "timestamp": 1778270929.1070924, "train/loss": 2.1027570962905884, "train/z_loss": 0.0013839434832334518, "train/perplexity": 8.188715895783162, "train/grad_norm": 0.0869140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026495.8120699818, "perf/iters_per_sec": 0.9663085041379842, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348661899566651, "data/tokens_consumed": 148207828992, "data/tokens_consumed_B": 148.207828992, "train/loss_slope": -8.046074361369073e-06} {"step": 70680, "timestamp": 1778270939.4669657, "train/loss": 2.1683449745178223, "train/z_loss": 0.001362003607209772, "train/perplexity": 8.743800845807518, "train/grad_norm": 0.265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025255.987993739, "perf/iters_per_sec": 0.9657173099487968, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354997158050536, "data/tokens_consumed": 148228800512, "data/tokens_consumed_B": 148.228800512, "train/loss_slope": -6.366890591971321e-06} {"step": 70690, "timestamp": 1778270949.8192713, "train/loss": 2.137407636642456, "train/z_loss": 0.0013795353006571531, "train/perplexity": 8.477432533142702, "train/grad_norm": 0.1708984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027132.2238799597, "perf/iters_per_sec": 0.9666119689369009, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345412969589234, "data/tokens_consumed": 148249772032, "data/tokens_consumed_B": 148.249772032, "train/loss_slope": -8.999157059203377e-06} {"step": 70700, "timestamp": 1778270960.1644747, "grad/layer_0/attn": 0.003382675349712372, "grad/layer_0/mlp": 0.003332747146487236, "grad/layer_0/attn_mlp_ratio": 1.014981065029087, "grad/layer_4/attn": 0.003603740129619837, "grad/layer_4/mlp": 0.0025724275037646294, "grad/layer_4/attn_mlp_ratio": 1.4009102236136515, "grad/layer_8/attn": 0.0044348887167871, "grad/layer_8/mlp": 0.0036597582511603832, "grad/layer_8/attn_mlp_ratio": 1.2117982367281024, "grad/layer_12/attn": 0.00456976518034935, "grad/layer_12/mlp": 0.0068733165971934795, "grad/layer_12/attn_mlp_ratio": 0.664855903150117, "grad/layer_16/attn": 0.003955316264182329, "grad/layer_16/mlp": 0.004847255069762468, "grad/layer_16/attn_mlp_ratio": 0.8159909321167743, "grad/layer_20/attn": 0.005533877294510603, "grad/layer_20/mlp": 0.00652243010699749, "grad/layer_20/attn_mlp_ratio": 0.8484379470360093, "grad/layer_24/attn": 0.021694086492061615, "grad/layer_24/mlp": 0.011562399566173553, "grad/layer_24/attn_mlp_ratio": 1.876261599530146, "grad/layer_27/attn": 0.006887860130518675, "grad/layer_27/mlp": 0.010176166892051697, "grad/layer_27/attn_mlp_ratio": 0.6768619398540314} {"step": 70700, "timestamp": 1778270960.1803493, "train/loss": 2.224667859077454, "train/z_loss": 0.0013617049902677537, "train/perplexity": 9.250409853946106, "train/grad_norm": 0.21484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025029.6222390472, "perf/iters_per_sec": 0.965609370345615, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0356154680252074, "data/tokens_consumed": 148270743552, "data/tokens_consumed_B": 148.270743552, "train/loss_slope": -3.694889848024079e-06} {"step": 70710, "timestamp": 1778270970.5286791, "train/loss": 2.1367050051689147, "train/z_loss": 0.0013631084584631026, "train/perplexity": 8.471478114356085, "train/grad_norm": 0.169921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027541.5468646083, "perf/iters_per_sec": 0.966807149345688, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343324422836304, "data/tokens_consumed": 148291715072, "data/tokens_consumed_B": 148.291715072, "train/loss_slope": -5.7390800367916555e-06} {"step": 70720, "timestamp": 1778270980.884667, "train/loss": 2.111606252193451, "train/z_loss": 0.0013669095700606705, "train/perplexity": 8.261500686402979, "train/grad_norm": 0.11572265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026409.6304140687, "perf/iters_per_sec": 0.9662674095220893, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349102020263672, "data/tokens_consumed": 148312686592, "data/tokens_consumed_B": 148.312686592, "train/loss_slope": -6.308406428678424e-06} {"step": 70725, "timestamp": 1778270986.6699617, "eos/sharpness": 25.89311599731445, "eos/L0_probe": 1.967490792274475, "eos/L_plus": 2.1233952045440674, "eos/L_minus": 2.0705175399780273, "eos/grad_norm": 0.1093444898724556, "eos/embed_grad_frac": 0.1885225921869278, "eos/time_s": 0.6113457679748535} {"step": 70725, "timestamp": 1778270988.0537395, "geo/rankme_last": 439.0096130371094, "geo/layer_0/stable_rank_q_proj": 19.291383743286133, "geo/layer_0/stable_rank_k_proj": 15.962428092956543, "geo/layer_0/stable_rank_o_proj": 46.72304916381836, "geo/layer_0/stable_rank_gate_proj": 129.3389129638672, "geo/layer_0/stable_rank_down_proj": 56.08469772338867, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06444332748651505, "geo/layer_0/attn_entropy_mean": 6.160699367523193, "geo/layer_0/attn_entropy_std": 0.42666706442832947, "geo/layer_7/stable_rank_q_proj": 43.20199966430664, "geo/layer_7/stable_rank_k_proj": 40.315818786621094, "geo/layer_7/stable_rank_o_proj": 89.39405059814453, "geo/layer_7/stable_rank_gate_proj": 78.87841033935547, "geo/layer_7/stable_rank_down_proj": 139.7698211669922, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4304829239845276, "geo/layer_7/attn_entropy_mean": 4.661238670349121, "geo/layer_7/attn_entropy_std": 0.7867839932441711, "geo/layer_14/stable_rank_q_proj": 50.26018142700195, "geo/layer_14/stable_rank_k_proj": 40.95470428466797, "geo/layer_14/stable_rank_o_proj": 43.3475341796875, "geo/layer_14/stable_rank_gate_proj": 71.14913940429688, "geo/layer_14/stable_rank_down_proj": 127.19204711914062, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3963461220264435, "geo/layer_14/attn_entropy_mean": 5.542934417724609, "geo/layer_14/attn_entropy_std": 0.41364142298698425, "geo/layer_21/stable_rank_q_proj": 39.81302261352539, "geo/layer_21/stable_rank_k_proj": 30.12069320678711, "geo/layer_21/stable_rank_o_proj": 69.114013671875, "geo/layer_21/stable_rank_gate_proj": 64.49703216552734, "geo/layer_21/stable_rank_down_proj": 49.96150207519531, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14205415546894073, "geo/layer_21/attn_entropy_mean": 5.686164855957031, "geo/layer_21/attn_entropy_std": 0.3013935089111328, "geo/layer_27/stable_rank_q_proj": 43.75426483154297, "geo/layer_27/stable_rank_k_proj": 32.07819366455078, "geo/layer_27/stable_rank_o_proj": 115.4272689819336, "geo/layer_27/stable_rank_gate_proj": 78.38785552978516, "geo/layer_27/stable_rank_down_proj": 127.53705596923828, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09298931062221527, "geo/layer_27/attn_entropy_mean": 4.181766033172607, "geo/layer_27/attn_entropy_std": 0.7616642117500305, "attnres/final_alpha/block_0": 0.23752358555793762, "attnres/block_norm/0": 1.7725528478622437, "attnres/final_alpha/block_1": 0.00407264893874526, "attnres/block_norm/1": 47979.9453125, "attnres/final_alpha/block_2": 0.010191278532147408, "attnres/block_norm/2": 28920.21875, "attnres/final_alpha/block_3": 0.011729918420314789, "attnres/block_norm/3": 61276.71484375, "attnres/final_alpha/block_4": 0.013887021690607071, "attnres/block_norm/4": 15607.0654296875, "attnres/final_alpha/block_5": 0.616768479347229, "attnres/block_norm/5": 6787.10693359375, "attnres/final_alpha/block_6": 0.10582703351974487, "attnres/block_norm/6": 40287.58203125, "geo/tier1_time_s": 1.36423659324646, "geo/step": 70725.0, "geo/rankme_slope": 1.871166435324129e-05} {"step": 70730, "timestamp": 1778270993.231558, "train/loss": 2.0578872203826903, "train/z_loss": 0.0013845811714418232, "train/perplexity": 7.829410504466958, "train/grad_norm": 0.2119140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1699944.6619766522, "perf/iters_per_sec": 0.8105967817195188, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2336589813232421, "data/tokens_consumed": 148333658112, "data/tokens_consumed_B": 148.333658112, "train/loss_slope": -9.747227383489684e-06} {"step": 70740, "timestamp": 1778271003.5832353, "train/loss": 2.1585633754730225, "train/z_loss": 0.0013797952560707927, "train/perplexity": 8.658689433280296, "train/grad_norm": 0.09130859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026726.568493279, "perf/iters_per_sec": 0.9664185373751063, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034748363494873, "data/tokens_consumed": 148354629632, "data/tokens_consumed_B": 148.354629632, "train/loss_slope": -7.854463672838224e-06} {"step": 70750, "timestamp": 1778271013.9216201, "grad/layer_0/attn": 0.0027430779300630093, "grad/layer_0/mlp": 0.0031593444291502237, "grad/layer_0/attn_mlp_ratio": 0.8682427335016929, "grad/layer_4/attn": 0.002141309203580022, "grad/layer_4/mlp": 0.0027214118745177984, "grad/layer_4/attn_mlp_ratio": 0.7868375768279025, "grad/layer_8/attn": 0.005188923794776201, "grad/layer_8/mlp": 0.003841727040708065, "grad/layer_8/attn_mlp_ratio": 1.3506747368372016, "grad/layer_12/attn": 0.004809753969311714, "grad/layer_12/mlp": 0.006975602358579636, "grad/layer_12/attn_mlp_ratio": 0.6895109057420499, "grad/layer_16/attn": 0.003721916815266013, "grad/layer_16/mlp": 0.004875581711530685, "grad/layer_16/attn_mlp_ratio": 0.7633790097550471, "grad/layer_20/attn": 0.003272857517004013, "grad/layer_20/mlp": 0.006548910401761532, "grad/layer_20/attn_mlp_ratio": 0.499756030583053, "grad/layer_24/attn": 0.016096271574497223, "grad/layer_24/mlp": 0.011192875914275646, "grad/layer_24/attn_mlp_ratio": 1.4380818257941637, "grad/layer_27/attn": 0.01339388731867075, "grad/layer_27/mlp": 0.01108588557690382, "grad/layer_27/attn_mlp_ratio": 1.2081928056118592} {"step": 70750, "timestamp": 1778271013.937265, "train/loss": 2.1204092264175416, "train/z_loss": 0.0013760882662609219, "train/perplexity": 8.334547506923213, "train/grad_norm": 0.24609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026718.1628450362, "perf/iters_per_sec": 0.9664145292496854, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034752655029297, "data/tokens_consumed": 148375601152, "data/tokens_consumed_B": 148.375601152, "train/loss_slope": -1.009981361838956e-05} {"step": 70760, "timestamp": 1778271024.2864647, "train/loss": 2.1537126898765564, "train/z_loss": 0.0013743674149736761, "train/perplexity": 8.616790554459733, "train/grad_norm": 0.240234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027427.985223719, "perf/iters_per_sec": 0.9667529989355654, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034390377998352, "data/tokens_consumed": 148396572672, "data/tokens_consumed_B": 148.396572672, "train/loss_slope": -9.669651271271811e-06} {"step": 70770, "timestamp": 1778271034.634933, "train/loss": 2.150153636932373, "train/z_loss": 0.0013808029354549945, "train/perplexity": 8.586177449811899, "train/grad_norm": 0.1123046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027474.7636016165, "perf/iters_per_sec": 0.9667753046043475, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034366512298584, "data/tokens_consumed": 148417544192, "data/tokens_consumed_B": 148.417544192, "train/loss_slope": -5.162201863382421e-06} {"step": 70780, "timestamp": 1778271044.98892, "train/loss": 2.1168381929397584, "train/z_loss": 0.001371154107619077, "train/perplexity": 8.30483763779483, "train/grad_norm": 0.138671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026694.5807041132, "perf/iters_per_sec": 0.9664032844086233, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347646951675415, "data/tokens_consumed": 148438515712, "data/tokens_consumed_B": 148.438515712, "train/loss_slope": -5.13016633933059e-06} {"step": 70790, "timestamp": 1778271055.3525543, "train/loss": 2.165446901321411, "train/z_loss": 0.001349375897552818, "train/perplexity": 8.7184973543367, "train/grad_norm": 0.17578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024495.8232920154, "perf/iters_per_sec": 0.9653548351726605, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0358885288238526, "data/tokens_consumed": 148459487232, "data/tokens_consumed_B": 148.459487232, "train/loss_slope": -4.340888958166114e-06} {"step": 70800, "timestamp": 1778271065.698719, "grad/layer_0/attn": 0.0026993120554834604, "grad/layer_0/mlp": 0.0029318812303245068, "grad/layer_0/attn_mlp_ratio": 0.9206757543575936, "grad/layer_4/attn": 0.002247877884656191, "grad/layer_4/mlp": 0.002597481245175004, "grad/layer_4/attn_mlp_ratio": 0.8654067482839771, "grad/layer_8/attn": 0.004230299964547157, "grad/layer_8/mlp": 0.0035028685815632343, "grad/layer_8/attn_mlp_ratio": 1.2076673004650835, "grad/layer_12/attn": 0.004371238872408867, "grad/layer_12/mlp": 0.00642011035233736, "grad/layer_12/attn_mlp_ratio": 0.6808666151245788, "grad/layer_16/attn": 0.0038380571641027927, "grad/layer_16/mlp": 0.00432508485391736, "grad/layer_16/attn_mlp_ratio": 0.887394630393687, "grad/layer_20/attn": 0.003277741139754653, "grad/layer_20/mlp": 0.005750494543462992, "grad/layer_20/attn_mlp_ratio": 0.5699929037375422, "grad/layer_24/attn": 0.00439414894208312, "grad/layer_24/mlp": 0.007256780285388231, "grad/layer_24/attn_mlp_ratio": 0.6055232084645811, "grad/layer_27/attn": 0.004137119278311729, "grad/layer_27/mlp": 0.006135940086096525, "grad/layer_27/attn_mlp_ratio": 0.6742437430674537} {"step": 70800, "timestamp": 1778271066.3129435, "eos/sharpness": 5.258381366729735, "eos/L0_probe": 1.9642746448516846, "eos/L_plus": 1.9940505027770996, "eos/L_minus": 1.987082600593567, "eos/grad_norm": 0.08720467984676361, "eos/embed_grad_frac": 0.2786455750465393, "eos/time_s": 0.6114950180053711} {"step": 70800, "timestamp": 1778271066.3343096, "train/loss": 2.123653769493103, "train/z_loss": 0.0013767078053206205, "train/perplexity": 8.361633221948232, "train/grad_norm": 0.08740234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1910468.7742997666, "perf/iters_per_sec": 0.9109825011729081, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.097715926170349, "data/tokens_consumed": 148480458752, "data/tokens_consumed_B": 148.480458752, "train/loss_slope": -5.054483448031984e-06} {"step": 70800, "timestamp": 1778271067.6977272, "geo/rankme_last": 438.44976806640625, "geo/layer_0/stable_rank_q_proj": 19.298892974853516, "geo/layer_0/stable_rank_k_proj": 15.951006889343262, "geo/layer_0/stable_rank_o_proj": 46.677494049072266, "geo/layer_0/stable_rank_gate_proj": 129.29190063476562, "geo/layer_0/stable_rank_down_proj": 56.17244338989258, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06195001304149628, "geo/layer_0/attn_entropy_mean": 6.156559467315674, "geo/layer_0/attn_entropy_std": 0.42587852478027344, "geo/layer_7/stable_rank_q_proj": 43.202674865722656, "geo/layer_7/stable_rank_k_proj": 40.37793731689453, "geo/layer_7/stable_rank_o_proj": 89.3813705444336, "geo/layer_7/stable_rank_gate_proj": 78.86542510986328, "geo/layer_7/stable_rank_down_proj": 139.7010040283203, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4334211051464081, "geo/layer_7/attn_entropy_mean": 4.654310703277588, "geo/layer_7/attn_entropy_std": 0.7873316407203674, "geo/layer_14/stable_rank_q_proj": 50.20170211791992, "geo/layer_14/stable_rank_k_proj": 40.95579147338867, "geo/layer_14/stable_rank_o_proj": 43.34647750854492, "geo/layer_14/stable_rank_gate_proj": 71.13174438476562, "geo/layer_14/stable_rank_down_proj": 127.1686019897461, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.40788277983665466, "geo/layer_14/attn_entropy_mean": 5.556127548217773, "geo/layer_14/attn_entropy_std": 0.42780134081840515, "geo/layer_21/stable_rank_q_proj": 39.82577896118164, "geo/layer_21/stable_rank_k_proj": 30.12961769104004, "geo/layer_21/stable_rank_o_proj": 69.09818267822266, "geo/layer_21/stable_rank_gate_proj": 64.48699188232422, "geo/layer_21/stable_rank_down_proj": 49.96892166137695, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1417849361896515, "geo/layer_21/attn_entropy_mean": 5.691653251647949, "geo/layer_21/attn_entropy_std": 0.3112429082393646, "geo/layer_27/stable_rank_q_proj": 43.85378646850586, "geo/layer_27/stable_rank_k_proj": 32.08943176269531, "geo/layer_27/stable_rank_o_proj": 115.44971466064453, "geo/layer_27/stable_rank_gate_proj": 78.35060119628906, "geo/layer_27/stable_rank_down_proj": 127.41185760498047, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08880047500133514, "geo/layer_27/attn_entropy_mean": 4.146924018859863, "geo/layer_27/attn_entropy_std": 0.775681734085083, "attnres/final_alpha/block_0": 0.2345188856124878, "attnres/block_norm/0": 1.7726714611053467, "attnres/final_alpha/block_1": 0.00394354946911335, "attnres/block_norm/1": 48075.2265625, "attnres/final_alpha/block_2": 0.009788389317691326, "attnres/block_norm/2": 28971.4453125, "attnres/final_alpha/block_3": 0.011508188210427761, "attnres/block_norm/3": 61442.9765625, "attnres/final_alpha/block_4": 0.013918976299464703, "attnres/block_norm/4": 15574.2607421875, "attnres/final_alpha/block_5": 0.6210427284240723, "attnres/block_norm/5": 6702.3974609375, "attnres/final_alpha/block_6": 0.10527929663658142, "attnres/block_norm/6": 40551.94140625, "geo/tier1_time_s": 1.359203577041626, "geo/step": 70800.0, "geo/rankme_slope": -2.1751122323929573e-05} {"step": 70810, "timestamp": 1778271078.0816443, "train/loss": 2.1800515174865724, "train/z_loss": 0.0013726513367146253, "train/perplexity": 8.846762009924259, "train/grad_norm": 0.0966796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1785809.4137610672, "perf/iters_per_sec": 0.8515402859502159, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1743425607681275, "data/tokens_consumed": 148501430272, "data/tokens_consumed_B": 148.501430272, "train/loss_slope": -5.663084461636743e-06} {"step": 70820, "timestamp": 1778271088.4296331, "train/loss": 2.1542181968688965, "train/z_loss": 0.0013667669496499001, "train/perplexity": 8.621147503477859, "train/grad_norm": 0.1328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027574.3091920386, "perf/iters_per_sec": 0.966822771640796, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343157291412353, "data/tokens_consumed": 148522401792, "data/tokens_consumed_B": 148.522401792, "train/loss_slope": -3.932605582316481e-06} {"step": 70830, "timestamp": 1778271098.78322, "train/loss": 2.123176097869873, "train/z_loss": 0.0013814012287184595, "train/perplexity": 8.357640060820055, "train/grad_norm": 0.10498046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026926.8770376062, "perf/iters_per_sec": 0.9665140519321471, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346461057662963, "data/tokens_consumed": 148543373312, "data/tokens_consumed_B": 148.543373312, "train/loss_slope": -7.039641640116892e-06} {"step": 70840, "timestamp": 1778271109.1322749, "train/loss": 2.1523794651031496, "train/z_loss": 0.0013756932807154954, "train/perplexity": 8.605310090545252, "train/grad_norm": 0.296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027675.4070419667, "perf/iters_per_sec": 0.9668709788522561, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342641592025756, "data/tokens_consumed": 148564344832, "data/tokens_consumed_B": 148.564344832, "train/loss_slope": -4.052466077200779e-06} {"step": 70850, "timestamp": 1778271119.4731092, "grad/layer_0/attn": 0.003371122060343623, "grad/layer_0/mlp": 0.0031952764838933945, "grad/layer_0/attn_mlp_ratio": 1.0550329437322024, "grad/layer_4/attn": 0.004780390299856663, "grad/layer_4/mlp": 0.0026497659273445606, "grad/layer_4/attn_mlp_ratio": 1.8040801529361021, "grad/layer_8/attn": 0.00402574148029089, "grad/layer_8/mlp": 0.003909735940396786, "grad/layer_8/attn_mlp_ratio": 1.0296709135080975, "grad/layer_12/attn": 0.004907615482807159, "grad/layer_12/mlp": 0.007325857877731323, "grad/layer_12/attn_mlp_ratio": 0.6699031700757805, "grad/layer_16/attn": 0.003808852517977357, "grad/layer_16/mlp": 0.004691066220402718, "grad/layer_16/attn_mlp_ratio": 0.8119374696135986, "grad/layer_20/attn": 0.0039043365977704525, "grad/layer_20/mlp": 0.0058855400420725346, "grad/layer_20/attn_mlp_ratio": 0.6633777875135823, "grad/layer_24/attn": 0.010575152933597565, "grad/layer_24/mlp": 0.011616414412856102, "grad/layer_24/attn_mlp_ratio": 0.9103629111972413, "grad/layer_27/attn": 0.005511023569852114, "grad/layer_27/mlp": 0.010693020187318325, "grad/layer_27/attn_mlp_ratio": 0.5153851224230875} {"step": 70850, "timestamp": 1778271119.4887455, "train/loss": 2.1906328797340393, "train/z_loss": 0.0013668792322278022, "train/perplexity": 8.94086981981657, "train/grad_norm": 0.1337890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025768.3069293026, "perf/iters_per_sec": 0.9659616026541246, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352378368377686, "data/tokens_consumed": 148585316352, "data/tokens_consumed_B": 148.585316352, "train/loss_slope": -6.329688373499999e-07} {"step": 70860, "timestamp": 1778271129.8412645, "train/loss": 2.1212270259857178, "train/z_loss": 0.0013557148864492773, "train/perplexity": 8.341366284091801, "train/grad_norm": 0.154296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027238.7908104828, "perf/iters_per_sec": 0.96666278400921, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344869136810302, "data/tokens_consumed": 148606287872, "data/tokens_consumed_B": 148.606287872, "train/loss_slope": -2.7800731246906577e-06} {"step": 70870, "timestamp": 1778271140.1885767, "train/loss": 2.12680401802063, "train/z_loss": 0.0013698671013116837, "train/perplexity": 8.388015978995481, "train/grad_norm": 0.171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027927.1905775145, "perf/iters_per_sec": 0.9669910385978291, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341357469558716, "data/tokens_consumed": 148627259392, "data/tokens_consumed_B": 148.627259392, "train/loss_slope": -4.498259526918545e-06} {"step": 70875, "timestamp": 1778271145.9547555, "eos/sharpness": 69.3193197250366, "eos/L0_probe": 1.9670361280441284, "eos/L_plus": 2.360841989517212, "eos/L_minus": 2.266423463821411, "eos/grad_norm": 0.15769468247890472, "eos/embed_grad_frac": 0.08598830550909042, "eos/time_s": 0.6043984889984131} {"step": 70875, "timestamp": 1778271147.3353875, "geo/rankme_last": 438.6768798828125, "geo/layer_0/stable_rank_q_proj": 19.280929565429688, "geo/layer_0/stable_rank_k_proj": 15.960660934448242, "geo/layer_0/stable_rank_o_proj": 46.75102996826172, "geo/layer_0/stable_rank_gate_proj": 129.54798889160156, "geo/layer_0/stable_rank_down_proj": 56.204952239990234, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06278660148382187, "geo/layer_0/attn_entropy_mean": 6.156192779541016, "geo/layer_0/attn_entropy_std": 0.42868345975875854, "geo/layer_7/stable_rank_q_proj": 43.13661575317383, "geo/layer_7/stable_rank_k_proj": 40.33053207397461, "geo/layer_7/stable_rank_o_proj": 89.4471664428711, "geo/layer_7/stable_rank_gate_proj": 78.92205047607422, "geo/layer_7/stable_rank_down_proj": 139.86392211914062, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4398833215236664, "geo/layer_7/attn_entropy_mean": 4.656046390533447, "geo/layer_7/attn_entropy_std": 0.7922636270523071, "geo/layer_14/stable_rank_q_proj": 50.11906433105469, "geo/layer_14/stable_rank_k_proj": 40.96514892578125, "geo/layer_14/stable_rank_o_proj": 43.340911865234375, "geo/layer_14/stable_rank_gate_proj": 71.19276428222656, "geo/layer_14/stable_rank_down_proj": 127.08415222167969, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3977859914302826, "geo/layer_14/attn_entropy_mean": 5.558873653411865, "geo/layer_14/attn_entropy_std": 0.42374753952026367, "geo/layer_21/stable_rank_q_proj": 39.85655975341797, "geo/layer_21/stable_rank_k_proj": 30.106536865234375, "geo/layer_21/stable_rank_o_proj": 69.06006622314453, "geo/layer_21/stable_rank_gate_proj": 64.45771789550781, "geo/layer_21/stable_rank_down_proj": 49.924659729003906, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14038103818893433, "geo/layer_21/attn_entropy_mean": 5.694615364074707, "geo/layer_21/attn_entropy_std": 0.30790477991104126, "geo/layer_27/stable_rank_q_proj": 43.69455337524414, "geo/layer_27/stable_rank_k_proj": 32.10006332397461, "geo/layer_27/stable_rank_o_proj": 115.43052673339844, "geo/layer_27/stable_rank_gate_proj": 78.30314636230469, "geo/layer_27/stable_rank_down_proj": 127.45024871826172, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09351839870214462, "geo/layer_27/attn_entropy_mean": 4.151933193206787, "geo/layer_27/attn_entropy_std": 0.7780722379684448, "attnres/final_alpha/block_0": 0.2369781732559204, "attnres/block_norm/0": 1.772911548614502, "attnres/final_alpha/block_1": 0.004069151356816292, "attnres/block_norm/1": 48008.4375, "attnres/final_alpha/block_2": 0.010116578079760075, "attnres/block_norm/2": 28941.43359375, "attnres/final_alpha/block_3": 0.011816902086138725, "attnres/block_norm/3": 60604.4921875, "attnres/final_alpha/block_4": 0.014008958823978901, "attnres/block_norm/4": 15640.6181640625, "attnres/final_alpha/block_5": 0.6175658106803894, "attnres/block_norm/5": 6815.705078125, "attnres/final_alpha/block_6": 0.10544444620609283, "attnres/block_norm/6": 40733.953125, "geo/tier1_time_s": 1.3620197772979736, "geo/step": 70875.0, "geo/rankme_slope": -3.480747767857143e-05} {"step": 70880, "timestamp": 1778271152.5128765, "train/loss": 2.1545194149017335, "train/z_loss": 0.0013643543934449553, "train/perplexity": 8.623744739717212, "train/grad_norm": 0.361328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1702251.1689563761, "perf/iters_per_sec": 0.811696609953106, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2319874048233033, "data/tokens_consumed": 148648230912, "data/tokens_consumed_B": 148.648230912, "train/loss_slope": -4.481277149645626e-06} {"step": 70890, "timestamp": 1778271162.8653755, "train/loss": 2.149590277671814, "train/z_loss": 0.0013626951375044882, "train/perplexity": 8.581341709490184, "train/grad_norm": 0.1396484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026648.7722126627, "perf/iters_per_sec": 0.9663814412177385, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347880840301513, "data/tokens_consumed": 148669202432, "data/tokens_consumed_B": 148.669202432, "train/loss_slope": -4.864655941626405e-06} {"step": 70900, "timestamp": 1778271173.2568913, "grad/layer_0/attn": 0.003085286123678088, "grad/layer_0/mlp": 0.0029415530152618885, "grad/layer_0/attn_mlp_ratio": 1.0488629655097705, "grad/layer_4/attn": 0.0026060701347887516, "grad/layer_4/mlp": 0.002529151737689972, "grad/layer_4/attn_mlp_ratio": 1.0304126845816555, "grad/layer_8/attn": 0.005914631765335798, "grad/layer_8/mlp": 0.0037425814662128687, "grad/layer_8/attn_mlp_ratio": 1.5803614859677821, "grad/layer_12/attn": 0.005988750606775284, "grad/layer_12/mlp": 0.006797852460294962, "grad/layer_12/attn_mlp_ratio": 0.8809768310884679, "grad/layer_16/attn": 0.003602165961638093, "grad/layer_16/mlp": 0.004753140266984701, "grad/layer_16/attn_mlp_ratio": 0.7578496916814675, "grad/layer_20/attn": 0.003074861830100417, "grad/layer_20/mlp": 0.005665145814418793, "grad/layer_20/attn_mlp_ratio": 0.5427683375770346, "grad/layer_24/attn": 0.008405664004385471, "grad/layer_24/mlp": 0.009369907900691032, "grad/layer_24/attn_mlp_ratio": 0.8970914126121144, "grad/layer_27/attn": 0.005587756633758545, "grad/layer_27/mlp": 0.008619365282356739, "grad/layer_27/attn_mlp_ratio": 0.6482793553683555} {"step": 70900, "timestamp": 1778271173.2820408, "train/loss": 2.153955316543579, "train/z_loss": 0.0013735341839492322, "train/perplexity": 8.618881471278227, "train/grad_norm": 0.11962890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2014703.4282344964, "perf/iters_per_sec": 0.9606854573414308, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0409234285354614, "data/tokens_consumed": 148690173952, "data/tokens_consumed_B": 148.690173952, "train/loss_slope": -8.599095683608575e-06} {"step": 70910, "timestamp": 1778271183.6443434, "train/loss": 2.123916482925415, "train/z_loss": 0.00136978899827227, "train/perplexity": 8.363830223890028, "train/grad_norm": 0.22265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024988.7372606741, "perf/iters_per_sec": 0.9655898748687144, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0356363773345947, "data/tokens_consumed": 148711145472, "data/tokens_consumed_B": 148.711145472, "train/loss_slope": -9.801156621704641e-06} {"step": 70920, "timestamp": 1778271194.0000749, "train/loss": 2.158493995666504, "train/z_loss": 0.0013744788593612611, "train/perplexity": 8.658088715921778, "train/grad_norm": 0.1748046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026497.5862005728, "perf/iters_per_sec": 0.9663093501093735, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348652839660644, "data/tokens_consumed": 148732116992, "data/tokens_consumed_B": 148.732116992, "train/loss_slope": -1.2184003556128777e-05} {"step": 70930, "timestamp": 1778271204.3633292, "train/loss": 2.126319646835327, "train/z_loss": 0.001372970885131508, "train/perplexity": 8.383954049573598, "train/grad_norm": 0.09326171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024944.0780192849, "perf/iters_per_sec": 0.9655685796829628, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0356592178344726, "data/tokens_consumed": 148753088512, "data/tokens_consumed_B": 148.753088512, "train/loss_slope": -1.2282829165923657e-05} {"step": 70940, "timestamp": 1778271214.7076285, "train/loss": 2.170250344276428, "train/z_loss": 0.0013655274990014732, "train/perplexity": 8.760476901495382, "train/grad_norm": 0.09326171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028268.1750328962, "perf/iters_per_sec": 0.9671536326565248, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0339618921279907, "data/tokens_consumed": 148774060032, "data/tokens_consumed_B": 148.774060032, "train/loss_slope": -1.082710651817748e-05} {"step": 70950, "timestamp": 1778271225.0582416, "grad/layer_0/attn": 0.002828391967341304, "grad/layer_0/mlp": 0.0029763730708509684, "grad/layer_0/attn_mlp_ratio": 0.9502813676191835, "grad/layer_4/attn": 0.002096709096804261, "grad/layer_4/mlp": 0.002477288246154785, "grad/layer_4/attn_mlp_ratio": 0.8463726477617133, "grad/layer_8/attn": 0.007414609659463167, "grad/layer_8/mlp": 0.003583242418244481, "grad/layer_8/attn_mlp_ratio": 2.0692458357788652, "grad/layer_12/attn": 0.005997958593070507, "grad/layer_12/mlp": 0.006560061126947403, "grad/layer_12/attn_mlp_ratio": 0.914314422620342, "grad/layer_16/attn": 0.003206856083124876, "grad/layer_16/mlp": 0.004556596744805574, "grad/layer_16/attn_mlp_ratio": 0.7037831505283654, "grad/layer_20/attn": 0.004605337977409363, "grad/layer_20/mlp": 0.005592829082161188, "grad/layer_20/attn_mlp_ratio": 0.8234361943501666, "grad/layer_24/attn": 0.0071089137345552444, "grad/layer_24/mlp": 0.008010299876332283, "grad/layer_24/attn_mlp_ratio": 0.8874715997602676, "grad/layer_27/attn": 0.006922184024006128, "grad/layer_27/mlp": 0.007215880323201418, "grad/layer_27/attn_mlp_ratio": 0.9592986050252497} {"step": 70950, "timestamp": 1778271225.6682684, "eos/sharpness": 16.23768806457519, "eos/L0_probe": 1.9688060283660889, "eos/L_plus": 2.0581090450286865, "eos/L_minus": 2.041879892349243, "eos/grad_norm": 0.10868962854146957, "eos/embed_grad_frac": 0.18531043827533722, "eos/time_s": 0.6072273254394531} {"step": 70950, "timestamp": 1778271225.6887813, "train/loss": 2.2259604215621946, "train/z_loss": 0.0013663648860529065, "train/perplexity": 9.26237431743435, "train/grad_norm": 0.10888671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1910885.8431160196, "perf/iters_per_sec": 0.9111813750820253, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.09747633934021, "data/tokens_consumed": 148795031552, "data/tokens_consumed_B": 148.795031552, "train/loss_slope": -5.511762385535765e-06} {"step": 70950, "timestamp": 1778271227.0519104, "geo/rankme_last": 438.1726379394531, "geo/layer_0/stable_rank_q_proj": 19.28886604309082, "geo/layer_0/stable_rank_k_proj": 15.959789276123047, "geo/layer_0/stable_rank_o_proj": 46.72003173828125, "geo/layer_0/stable_rank_gate_proj": 129.8482208251953, "geo/layer_0/stable_rank_down_proj": 56.17338180541992, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06108962371945381, "geo/layer_0/attn_entropy_mean": 6.158442497253418, "geo/layer_0/attn_entropy_std": 0.42533570528030396, "geo/layer_7/stable_rank_q_proj": 43.21114730834961, "geo/layer_7/stable_rank_k_proj": 40.341949462890625, "geo/layer_7/stable_rank_o_proj": 89.57124328613281, "geo/layer_7/stable_rank_gate_proj": 78.98208618164062, "geo/layer_7/stable_rank_down_proj": 140.0101776123047, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4336808919906616, "geo/layer_7/attn_entropy_mean": 4.657192230224609, "geo/layer_7/attn_entropy_std": 0.7884565591812134, "geo/layer_14/stable_rank_q_proj": 50.17765808105469, "geo/layer_14/stable_rank_k_proj": 40.97109603881836, "geo/layer_14/stable_rank_o_proj": 43.36509704589844, "geo/layer_14/stable_rank_gate_proj": 71.22264862060547, "geo/layer_14/stable_rank_down_proj": 126.95077514648438, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38911888003349304, "geo/layer_14/attn_entropy_mean": 5.543132781982422, "geo/layer_14/attn_entropy_std": 0.4401276111602783, "geo/layer_21/stable_rank_q_proj": 39.770809173583984, "geo/layer_21/stable_rank_k_proj": 30.1518497467041, "geo/layer_21/stable_rank_o_proj": 69.16159057617188, "geo/layer_21/stable_rank_gate_proj": 64.44570922851562, "geo/layer_21/stable_rank_down_proj": 49.90745162963867, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14410129189491272, "geo/layer_21/attn_entropy_mean": 5.697673797607422, "geo/layer_21/attn_entropy_std": 0.3029012978076935, "geo/layer_27/stable_rank_q_proj": 43.67827224731445, "geo/layer_27/stable_rank_k_proj": 32.06470489501953, "geo/layer_27/stable_rank_o_proj": 115.73746490478516, "geo/layer_27/stable_rank_gate_proj": 78.36530303955078, "geo/layer_27/stable_rank_down_proj": 127.475830078125, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0965256467461586, "geo/layer_27/attn_entropy_mean": 4.171444892883301, "geo/layer_27/attn_entropy_std": 0.7535619139671326, "attnres/final_alpha/block_0": 0.23803433775901794, "attnres/block_norm/0": 1.7728829383850098, "attnres/final_alpha/block_1": 0.0040812320075929165, "attnres/block_norm/1": 48083.09765625, "attnres/final_alpha/block_2": 0.010150223970413208, "attnres/block_norm/2": 28880.2734375, "attnres/final_alpha/block_3": 0.011998197995126247, "attnres/block_norm/3": 60701.29296875, "attnres/final_alpha/block_4": 0.014099689200520515, "attnres/block_norm/4": 15735.84765625, "attnres/final_alpha/block_5": 0.6133583188056946, "attnres/block_norm/5": 6851.267578125, "attnres/final_alpha/block_6": 0.10827799141407013, "attnres/block_norm/6": 40859.50390625, "geo/tier1_time_s": 1.3593225479125977, "geo/step": 70950.0, "geo/rankme_slope": -5.648018973214286e-05} {"step": 70960, "timestamp": 1778271237.3998044, "train/loss": 2.1301962018013, "train/z_loss": 0.0013651882763952017, "train/perplexity": 8.416517985442592, "train/grad_norm": 0.0966796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1791269.174753953, "perf/iters_per_sec": 0.8541437028665319, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1707631826400757, "data/tokens_consumed": 148816003072, "data/tokens_consumed_B": 148.816003072, "train/loss_slope": -6.041397248665031e-06} {"step": 70970, "timestamp": 1778271247.7604556, "train/loss": 2.0651329517364503, "train/z_loss": 0.0013739212998189032, "train/perplexity": 7.886346331396821, "train/grad_norm": 0.1845703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025447.6574706729, "perf/iters_per_sec": 0.9658087050774922, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354017257690429, "data/tokens_consumed": 148836974592, "data/tokens_consumed_B": 148.836974592, "train/loss_slope": -9.951697961010114e-06} {"step": 70980, "timestamp": 1778271258.119594, "train/loss": 2.1053194403648376, "train/z_loss": 0.0013770093093626202, "train/perplexity": 8.209725108355249, "train/grad_norm": 0.12158203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025292.9666523475, "perf/iters_per_sec": 0.9657349427472818, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035480809211731, "data/tokens_consumed": 148857946112, "data/tokens_consumed_B": 148.857946112, "train/loss_slope": -1.4546041162458261e-05} {"step": 70990, "timestamp": 1778271268.4785275, "train/loss": 2.1342368006706236, "train/z_loss": 0.001378931722138077, "train/perplexity": 8.450594557014085, "train/grad_norm": 0.1533203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025718.5282375521, "perf/iters_per_sec": 0.9659378663242112, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352632761001588, "data/tokens_consumed": 148878917632, "data/tokens_consumed_B": 148.878917632, "train/loss_slope": -1.6328782939424505e-05} {"step": 71000, "timestamp": 1778271278.824137, "grad/layer_0/attn": 0.002829481614753604, "grad/layer_0/mlp": 0.003087786491960287, "grad/layer_0/attn_mlp_ratio": 0.9163462339401189, "grad/layer_4/attn": 0.0020795147866010666, "grad/layer_4/mlp": 0.0025670742616057396, "grad/layer_4/attn_mlp_ratio": 0.810071892619543, "grad/layer_8/attn": 0.0038367663510143757, "grad/layer_8/mlp": 0.003461918793618679, "grad/layer_8/attn_mlp_ratio": 1.1082773655057747, "grad/layer_12/attn": 0.004170367494225502, "grad/layer_12/mlp": 0.007271018344908953, "grad/layer_12/attn_mlp_ratio": 0.5735602963771224, "grad/layer_16/attn": 0.00364243658259511, "grad/layer_16/mlp": 0.004960529040545225, "grad/layer_16/attn_mlp_ratio": 0.7342838796819889, "grad/layer_20/attn": 0.004659895319491625, "grad/layer_20/mlp": 0.005887295119464397, "grad/layer_20/attn_mlp_ratio": 0.7915171816227629, "grad/layer_24/attn": 0.009059558622539043, "grad/layer_24/mlp": 0.009176824241876602, "grad/layer_24/attn_mlp_ratio": 0.987221536016284, "grad/layer_27/attn": 0.004117781762033701, "grad/layer_27/mlp": 0.008217258378863335, "grad/layer_27/attn_mlp_ratio": 0.5011138170504891} {"step": 71000, "timestamp": 1778271278.8401735, "train/loss": 2.101108956336975, "train/z_loss": 0.0013724397635087372, "train/perplexity": 8.175230861610851, "train/grad_norm": 0.12451171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025504.978820502, "perf/iters_per_sec": 0.9658360380270491, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0353724241256714, "data/tokens_consumed": 148899889152, "data/tokens_consumed_B": 148.899889152, "train/loss_slope": -1.949280578740328e-05} {"step": 71000, "timestamp": 1778271285.938199, "geo/ww_alpha_mean": 7.771411227861616, "geo/ww_alpha_std": 4.997992168378997, "geo/ww_alpha_min": 1.351627796773007, "geo/ww_alpha_max": 30.98490265689496, "geo/ww_alpha_healthy_frac": 0.16751269035532995, "geo/ww_alpha_by_type/q_proj": 3.9103374633064356, "geo/ww_alpha_by_type/k_proj": 4.503823621288377, "geo/ww_alpha_by_type/v_proj": 9.396579452265838, "geo/ww_alpha_by_type/o_proj": 8.609649247214893, "geo/ww_alpha_by_type/gate_proj": 7.965665198215929, "geo/ww_alpha_by_type/up_proj": 12.406723846743782, "geo/ww_alpha_by_type/down_proj": 7.704248724873085, "geo/twonn_id/layer_0": 0.7118664383888245, "geo/twonn_id/layer_7": 2.959505796432495, "geo/twonn_id/layer_14": 4.831295967102051, "geo/twonn_id/layer_21": 7.0636491775512695, "geo/twonn_id/layer_27": 5.030441761016846, "geo/tier2_time_s": 7.091217279434204} {"step": 71000, "timestamp": 1778271286.612428, "eoc/jacobian_sigma/layer_0/attn": 1170.9483642578125, "eoc/jacobian_sigma/layer_0/mlp": 8961.7158203125, "eoc/jacobian_sigma/layer_0": 8961.7158203125, "eoc/jacobian_sigma/layer_7/attn": 1.1460063457489014, "eoc/jacobian_sigma/layer_7/mlp": 1.6645607948303223, "eoc/jacobian_sigma/layer_7": 1.6645607948303223, "eoc/jacobian_sigma/layer_14/attn": 1.447922945022583, "eoc/jacobian_sigma/layer_14/mlp": 8.77992057800293, "eoc/jacobian_sigma/layer_14": 8.77992057800293, "eoc/jacobian_sigma/layer_21/attn": 1.0929133892059326, "eoc/jacobian_sigma/layer_21/mlp": 4.140532493591309, "eoc/jacobian_sigma/layer_21": 4.140532493591309, "eoc/jacobian_sigma/layer_27/attn": 3.0713891983032227, "eoc/jacobian_sigma/layer_27/mlp": 25.342254638671875, "eoc/jacobian_sigma/layer_27": 25.342254638671875, "eoc/layer0_sigma": 8961.7158203125, "eoc/sigma_max": 25.342254638671875, "eoc/sigma_min": 1.6645607948303223, "eoc/sigma_mean": 9.981817126274109, "eoc/time_s": 0.6682097911834717} {"step": 71010, "timestamp": 1778271296.984762, "train/loss": 2.1363014459609984, "train/z_loss": 0.0013738018460571766, "train/perplexity": 8.468060061098196, "train/grad_norm": 0.11279296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1156141.5053089743, "perf/iters_per_sec": 0.5512912298722145, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.8139232873916626, "data/tokens_consumed": 148920860672, "data/tokens_consumed_B": 148.920860672, "train/loss_slope": -1.8230050889858472e-05} {"step": 71020, "timestamp": 1778271307.3526883, "train/loss": 2.138326609134674, "train/z_loss": 0.0013716309564188122, "train/perplexity": 8.48522664118222, "train/grad_norm": 0.09130859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026948.8297865598, "perf/iters_per_sec": 0.9665245198185729, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346349000930786, "data/tokens_consumed": 148941832192, "data/tokens_consumed_B": 148.941832192, "train/loss_slope": -1.8662972669146542e-05} {"step": 71025, "timestamp": 1778271313.1507354, "eos/sharpness": 39.37959671020507, "eos/L0_probe": 1.96753990650177, "eos/L_plus": 2.146085023880005, "eos/L_minus": 2.182790756225586, "eos/grad_norm": 0.10897671431303024, "eos/embed_grad_frac": 0.17648886144161224, "eos/time_s": 0.6277816295623779} {"step": 71025, "timestamp": 1778271314.533649, "geo/rankme_last": 437.7182312011719, "geo/layer_0/stable_rank_q_proj": 19.26616096496582, "geo/layer_0/stable_rank_k_proj": 15.929492950439453, "geo/layer_0/stable_rank_o_proj": 46.7519416809082, "geo/layer_0/stable_rank_gate_proj": 129.87887573242188, "geo/layer_0/stable_rank_down_proj": 56.1405029296875, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06177467480301857, "geo/layer_0/attn_entropy_mean": 6.153412342071533, "geo/layer_0/attn_entropy_std": 0.425555557012558, "geo/layer_7/stable_rank_q_proj": 43.15864944458008, "geo/layer_7/stable_rank_k_proj": 40.37959289550781, "geo/layer_7/stable_rank_o_proj": 89.56245422363281, "geo/layer_7/stable_rank_gate_proj": 78.91671752929688, "geo/layer_7/stable_rank_down_proj": 139.9477081298828, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.44539904594421387, "geo/layer_7/attn_entropy_mean": 4.64192008972168, "geo/layer_7/attn_entropy_std": 0.7993724942207336, "geo/layer_14/stable_rank_q_proj": 50.23896789550781, "geo/layer_14/stable_rank_k_proj": 41.093257904052734, "geo/layer_14/stable_rank_o_proj": 43.35560989379883, "geo/layer_14/stable_rank_gate_proj": 71.22728729248047, "geo/layer_14/stable_rank_down_proj": 127.12692260742188, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3856777846813202, "geo/layer_14/attn_entropy_mean": 5.518953323364258, "geo/layer_14/attn_entropy_std": 0.4151515066623688, "geo/layer_21/stable_rank_q_proj": 39.78734588623047, "geo/layer_21/stable_rank_k_proj": 30.06549644470215, "geo/layer_21/stable_rank_o_proj": 69.18155670166016, "geo/layer_21/stable_rank_gate_proj": 64.39527130126953, "geo/layer_21/stable_rank_down_proj": 49.84907531738281, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13920825719833374, "geo/layer_21/attn_entropy_mean": 5.690525531768799, "geo/layer_21/attn_entropy_std": 0.3035530149936676, "geo/layer_27/stable_rank_q_proj": 43.71512985229492, "geo/layer_27/stable_rank_k_proj": 32.169830322265625, "geo/layer_27/stable_rank_o_proj": 115.42359924316406, "geo/layer_27/stable_rank_gate_proj": 78.34062957763672, "geo/layer_27/stable_rank_down_proj": 127.33718872070312, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10016901791095734, "geo/layer_27/attn_entropy_mean": 4.147741317749023, "geo/layer_27/attn_entropy_std": 0.7514543533325195, "attnres/final_alpha/block_0": 0.2377355694770813, "attnres/block_norm/0": 1.7727514505386353, "attnres/final_alpha/block_1": 0.004045382142066956, "attnres/block_norm/1": 48081.6328125, "attnres/final_alpha/block_2": 0.009992324747145176, "attnres/block_norm/2": 28890.31640625, "attnres/final_alpha/block_3": 0.011726738885045052, "attnres/block_norm/3": 60873.10546875, "attnres/final_alpha/block_4": 0.01409146562218666, "attnres/block_norm/4": 15700.208984375, "attnres/final_alpha/block_5": 0.6151716709136963, "attnres/block_norm/5": 6865.8388671875, "attnres/final_alpha/block_6": 0.10723689198493958, "attnres/block_norm/6": 40989.2578125, "geo/tier1_time_s": 1.3615596294403076, "geo/step": 71025.0, "geo/rankme_slope": -0.00010954985509828931} {"step": 71030, "timestamp": 1778271319.716712, "train/loss": 2.1588563442230226, "train/z_loss": 0.0013718552771024406, "train/perplexity": 8.661226530327133, "train/grad_norm": 0.12158203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1697111.9115405786, "perf/iters_per_sec": 0.8092460210516828, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2357181549072265, "data/tokens_consumed": 148962803712, "data/tokens_consumed_B": 148.962803712, "train/loss_slope": -1.5881437773179516e-05} {"step": 71040, "timestamp": 1778271330.0683544, "train/loss": 2.1128486156463624, "train/z_loss": 0.0013728943769820035, "train/perplexity": 8.271770851239719, "train/grad_norm": 0.150390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026853.3621452546, "perf/iters_per_sec": 0.9664789972997926, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034683632850647, "data/tokens_consumed": 148983775232, "data/tokens_consumed_B": 148.983775232, "train/loss_slope": -1.4041527455682148e-05} {"step": 71050, "timestamp": 1778271340.4182026, "grad/layer_0/attn": 0.002783918986096978, "grad/layer_0/mlp": 0.003039719769731164, "grad/layer_0/attn_mlp_ratio": 0.9158472179685396, "grad/layer_4/attn": 0.0020557153038680553, "grad/layer_4/mlp": 0.0025926001835614443, "grad/layer_4/attn_mlp_ratio": 0.7929164078637404, "grad/layer_8/attn": 0.005869422107934952, "grad/layer_8/mlp": 0.0037984626833349466, "grad/layer_8/attn_mlp_ratio": 1.5452098500703906, "grad/layer_12/attn": 0.004737330600619316, "grad/layer_12/mlp": 0.006326606031507254, "grad/layer_12/attn_mlp_ratio": 0.748794930828212, "grad/layer_16/attn": 0.0034436937421560287, "grad/layer_16/mlp": 0.004653210286051035, "grad/layer_16/attn_mlp_ratio": 0.7400683520519976, "grad/layer_20/attn": 0.0041093905456364155, "grad/layer_20/mlp": 0.005989702418446541, "grad/layer_20/attn_mlp_ratio": 0.6860758999266973, "grad/layer_24/attn": 0.0064552780240774155, "grad/layer_24/mlp": 0.008576997555792332, "grad/layer_24/attn_mlp_ratio": 0.7526267679131231, "grad/layer_27/attn": 0.00445794640108943, "grad/layer_27/mlp": 0.007276532705873251, "grad/layer_27/attn_mlp_ratio": 0.6126470559565402} {"step": 71050, "timestamp": 1778271340.4342947, "train/loss": 2.1626901984214784, "train/z_loss": 0.0013613955001346768, "train/perplexity": 8.694496144698636, "train/grad_norm": 0.1015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024618.842881342, "perf/iters_per_sec": 0.9654134954840383, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0358255863189698, "data/tokens_consumed": 149004746752, "data/tokens_consumed_B": 149.004746752, "train/loss_slope": -7.337444035312372e-06} {"step": 71060, "timestamp": 1778271350.8024743, "train/loss": 2.1513472318649294, "train/z_loss": 0.0013745716074481606, "train/perplexity": 8.596431986370016, "train/grad_norm": 0.1923828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026211.7578074152, "perf/iters_per_sec": 0.9661730565106464, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350112676620484, "data/tokens_consumed": 149025718272, "data/tokens_consumed_B": 149.025718272, "train/loss_slope": -4.640509865500727e-06} {"step": 71070, "timestamp": 1778271361.1744418, "train/loss": 2.0760104656219482, "train/z_loss": 0.0013868941576220095, "train/perplexity": 7.972598426859259, "train/grad_norm": 0.21875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023612.664894851, "perf/iters_per_sec": 0.9649337124323134, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0363406181335448, "data/tokens_consumed": 149046689792, "data/tokens_consumed_B": 149.046689792, "train/loss_slope": -6.052122164731137e-06} {"step": 71080, "timestamp": 1778271371.5399725, "train/loss": 2.12982292175293, "train/z_loss": 0.001375499984715134, "train/perplexity": 8.413376853499294, "train/grad_norm": 0.2236328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025163.9902040751, "perf/iters_per_sec": 0.9656734419842125, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0355467557907105, "data/tokens_consumed": 149067661312, "data/tokens_consumed_B": 149.067661312, "train/loss_slope": -5.709582408054642e-06} {"step": 71090, "timestamp": 1778271381.9144826, "train/loss": 2.117591941356659, "train/z_loss": 0.0013778102118521928, "train/perplexity": 8.311099755751203, "train/grad_norm": 0.2490234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023067.2821473284, "perf/iters_per_sec": 0.9646736536728517, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0366199970245362, "data/tokens_consumed": 149088632832, "data/tokens_consumed_B": 149.088632832, "train/loss_slope": -4.6664425034155396e-06} {"step": 71100, "timestamp": 1778271392.2818964, "grad/layer_0/attn": 0.002925520297139883, "grad/layer_0/mlp": 0.003067702753469348, "grad/layer_0/attn_mlp_ratio": 0.9536517833959482, "grad/layer_4/attn": 0.0022848451044410467, "grad/layer_4/mlp": 0.0026081358082592487, "grad/layer_4/attn_mlp_ratio": 0.8760452617540255, "grad/layer_8/attn": 0.003961638081818819, "grad/layer_8/mlp": 0.0037092019338160753, "grad/layer_8/attn_mlp_ratio": 1.0680566994467633, "grad/layer_12/attn": 0.005076814908534288, "grad/layer_12/mlp": 0.006811178755015135, "grad/layer_12/attn_mlp_ratio": 0.7453650853399896, "grad/layer_16/attn": 0.0033535552211105824, "grad/layer_16/mlp": 0.004704446531832218, "grad/layer_16/attn_mlp_ratio": 0.7128479677969032, "grad/layer_20/attn": 0.0028442912735044956, "grad/layer_20/mlp": 0.005579343996942043, "grad/layer_20/attn_mlp_ratio": 0.5097895422982444, "grad/layer_24/attn": 0.006927158683538437, "grad/layer_24/mlp": 0.007918030023574829, "grad/layer_24/attn_mlp_ratio": 0.8748588443625378, "grad/layer_27/attn": 0.006928853690624237, "grad/layer_27/mlp": 0.007169421296566725, "grad/layer_27/attn_mlp_ratio": 0.9664453109064436} {"step": 71100, "timestamp": 1778271392.8992188, "eos/sharpness": 19.596743583679196, "eos/L0_probe": 1.964939832687378, "eos/L_plus": 2.066890239715576, "eos/L_minus": 2.0589568614959717, "eos/grad_norm": 0.09749756008386612, "eos/embed_grad_frac": 0.21909041702747345, "eos/time_s": 0.6145265102386475} {"step": 71100, "timestamp": 1778271392.9204097, "train/loss": 2.143109679222107, "train/z_loss": 0.0013774094171822072, "train/perplexity": 8.525909291337639, "train/grad_norm": 0.09765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1906588.061752425, "perf/iters_per_sec": 0.9091320332300306, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0999502420425415, "data/tokens_consumed": 149109604352, "data/tokens_consumed_B": 149.109604352, "train/loss_slope": -4.084551881606472e-06} {"step": 71100, "timestamp": 1778271394.287495, "geo/rankme_last": 439.0453796386719, "geo/layer_0/stable_rank_q_proj": 19.249746322631836, "geo/layer_0/stable_rank_k_proj": 15.934588432312012, "geo/layer_0/stable_rank_o_proj": 46.722110748291016, "geo/layer_0/stable_rank_gate_proj": 129.55978393554688, "geo/layer_0/stable_rank_down_proj": 56.09580993652344, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06523896753787994, "geo/layer_0/attn_entropy_mean": 6.154363632202148, "geo/layer_0/attn_entropy_std": 0.42468196153640747, "geo/layer_7/stable_rank_q_proj": 43.14786911010742, "geo/layer_7/stable_rank_k_proj": 40.39908218383789, "geo/layer_7/stable_rank_o_proj": 89.57262420654297, "geo/layer_7/stable_rank_gate_proj": 78.88002014160156, "geo/layer_7/stable_rank_down_proj": 140.0452423095703, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4384302496910095, "geo/layer_7/attn_entropy_mean": 4.67265510559082, "geo/layer_7/attn_entropy_std": 0.783170223236084, "geo/layer_14/stable_rank_q_proj": 50.3787841796875, "geo/layer_14/stable_rank_k_proj": 41.15739059448242, "geo/layer_14/stable_rank_o_proj": 43.2939567565918, "geo/layer_14/stable_rank_gate_proj": 71.23533630371094, "geo/layer_14/stable_rank_down_proj": 127.23025512695312, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3932923674583435, "geo/layer_14/attn_entropy_mean": 5.537649154663086, "geo/layer_14/attn_entropy_std": 0.42711713910102844, "geo/layer_21/stable_rank_q_proj": 39.802215576171875, "geo/layer_21/stable_rank_k_proj": 30.147666931152344, "geo/layer_21/stable_rank_o_proj": 69.16989135742188, "geo/layer_21/stable_rank_gate_proj": 64.3978042602539, "geo/layer_21/stable_rank_down_proj": 49.76683807373047, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14443741738796234, "geo/layer_21/attn_entropy_mean": 5.684161186218262, "geo/layer_21/attn_entropy_std": 0.3047807514667511, "geo/layer_27/stable_rank_q_proj": 43.66525650024414, "geo/layer_27/stable_rank_k_proj": 32.20171356201172, "geo/layer_27/stable_rank_o_proj": 115.31787872314453, "geo/layer_27/stable_rank_gate_proj": 78.34298706054688, "geo/layer_27/stable_rank_down_proj": 127.22657775878906, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09297443926334381, "geo/layer_27/attn_entropy_mean": 4.156813621520996, "geo/layer_27/attn_entropy_std": 0.7550087571144104, "attnres/final_alpha/block_0": 0.23685432970523834, "attnres/block_norm/0": 1.7728943824768066, "attnres/final_alpha/block_1": 0.0040808385238051414, "attnres/block_norm/1": 48255.3203125, "attnres/final_alpha/block_2": 0.010037856176495552, "attnres/block_norm/2": 28843.55859375, "attnres/final_alpha/block_3": 0.011839829385280609, "attnres/block_norm/3": 60701.33203125, "attnres/final_alpha/block_4": 0.0138307586312294, "attnres/block_norm/4": 15679.333984375, "attnres/final_alpha/block_5": 0.6162134408950806, "attnres/block_norm/5": 6806.22509765625, "attnres/final_alpha/block_6": 0.10714291036128998, "attnres/block_norm/6": 40779.64453125, "geo/tier1_time_s": 1.3630125522613525, "geo/step": 71100.0, "geo/rankme_slope": -8.37679993872549e-05} {"step": 71110, "timestamp": 1778271405.2334116, "train/loss": 2.146901178359985, "train/z_loss": 0.001365734008140862, "train/perplexity": 8.55829662854735, "train/grad_norm": 0.2451171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1703766.4252760955, "perf/iters_per_sec": 0.8124191404705503, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.230891728401184, "data/tokens_consumed": 149130575872, "data/tokens_consumed_B": 149.130575872, "train/loss_slope": -3.972674224457528e-06} {"step": 71120, "timestamp": 1778271415.6108797, "train/loss": 2.164119291305542, "train/z_loss": 0.0013657190254889428, "train/perplexity": 8.706930269913283, "train/grad_norm": 0.1826171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022155.0124069485, "perf/iters_per_sec": 0.9642386495623343, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037087655067444, "data/tokens_consumed": 149151547392, "data/tokens_consumed_B": 149.151547392, "train/loss_slope": -1.0350792106837066e-06} {"step": 71130, "timestamp": 1778271425.9846606, "train/loss": 2.1225545167922975, "train/z_loss": 0.0013586986577138304, "train/perplexity": 8.352446724112184, "train/grad_norm": 0.0869140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022557.583982562, "perf/iters_per_sec": 0.9644306106484232, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0368812322616576, "data/tokens_consumed": 149172518912, "data/tokens_consumed_B": 149.172518912, "train/loss_slope": -4.8070148487477e-07} {"step": 71140, "timestamp": 1778271436.3774924, "train/loss": 2.1194337129592897, "train/z_loss": 0.0013622629223391414, "train/perplexity": 8.326421008060725, "train/grad_norm": 0.236328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019294.949414281, "perf/iters_per_sec": 0.9628748652526288, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0385565519332887, "data/tokens_consumed": 149193490432, "data/tokens_consumed_B": 149.193490432, "train/loss_slope": -1.733116011987251e-06} {"step": 71150, "timestamp": 1778271446.744536, "grad/layer_0/attn": 0.0026024465914815664, "grad/layer_0/mlp": 0.0029096249490976334, "grad/layer_0/attn_mlp_ratio": 0.8944267895578736, "grad/layer_4/attn": 0.0022930577397346497, "grad/layer_4/mlp": 0.0025756945833563805, "grad/layer_4/attn_mlp_ratio": 0.890267683725065, "grad/layer_8/attn": 0.006821966730058193, "grad/layer_8/mlp": 0.0037671735044568777, "grad/layer_8/attn_mlp_ratio": 1.810897889597454, "grad/layer_12/attn": 0.005641072988510132, "grad/layer_12/mlp": 0.0072792815044522285, "grad/layer_12/attn_mlp_ratio": 0.774949135785581, "grad/layer_16/attn": 0.004744502250105143, "grad/layer_16/mlp": 0.004981632810086012, "grad/layer_16/attn_mlp_ratio": 0.9523990096699488, "grad/layer_20/attn": 0.004286374431103468, "grad/layer_20/mlp": 0.007131021935492754, "grad/layer_20/attn_mlp_ratio": 0.6010883727142037, "grad/layer_24/attn": 0.0199182890355587, "grad/layer_24/mlp": 0.015611155889928341, "grad/layer_24/attn_mlp_ratio": 1.2759009677700448, "grad/layer_27/attn": 0.015619075857102871, "grad/layer_27/mlp": 0.015627527609467506, "grad/layer_27/attn_mlp_ratio": 0.9994591689407457} {"step": 71150, "timestamp": 1778271446.770823, "train/loss": 2.1313350439071654, "train/z_loss": 0.0013722709496505558, "train/perplexity": 8.426108530530925, "train/grad_norm": 0.298828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019069.7751338761, "perf/iters_per_sec": 0.9627674937886601, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0386723756790162, "data/tokens_consumed": 149214461952, "data/tokens_consumed_B": 149.214461952, "train/loss_slope": -1.3395456948725371e-06} {"step": 71160, "timestamp": 1778271457.1476865, "train/loss": 2.1272560119628907, "train/z_loss": 0.0013788290088996291, "train/perplexity": 8.391808168364324, "train/grad_norm": 0.09423828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021820.2626329619, "perf/iters_per_sec": 0.9640790284313974, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0372593641281127, "data/tokens_consumed": 149235433472, "data/tokens_consumed_B": 149.235433472, "train/loss_slope": -1.6805090705374817e-06} {"step": 71170, "timestamp": 1778271467.5268102, "train/loss": 2.141712462902069, "train/z_loss": 0.001366724947001785, "train/perplexity": 8.514005070055378, "train/grad_norm": 0.12158203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021642.242473059, "perf/iters_per_sec": 0.9639941418042465, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373507022857666, "data/tokens_consumed": 149256404992, "data/tokens_consumed_B": 149.256404992, "train/loss_slope": -1.950509098246388e-06} {"step": 71175, "timestamp": 1778271473.3222804, "eos/sharpness": 31.714200973510735, "eos/L0_probe": 1.966218113899231, "eos/L_plus": 2.116050958633423, "eos/L_minus": 2.1335272789001465, "eos/grad_norm": 0.12756498157978058, "eos/embed_grad_frac": 0.15185223519802094, "eos/time_s": 0.6117660999298096} {"step": 71175, "timestamp": 1778271474.7052927, "geo/rankme_last": 439.05987548828125, "geo/layer_0/stable_rank_q_proj": 19.24500274658203, "geo/layer_0/stable_rank_k_proj": 15.941243171691895, "geo/layer_0/stable_rank_o_proj": 46.73930740356445, "geo/layer_0/stable_rank_gate_proj": 129.5047607421875, "geo/layer_0/stable_rank_down_proj": 56.126224517822266, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06645897030830383, "geo/layer_0/attn_entropy_mean": 6.155613899230957, "geo/layer_0/attn_entropy_std": 0.42459622025489807, "geo/layer_7/stable_rank_q_proj": 43.083736419677734, "geo/layer_7/stable_rank_k_proj": 40.18058395385742, "geo/layer_7/stable_rank_o_proj": 89.61261749267578, "geo/layer_7/stable_rank_gate_proj": 78.74137115478516, "geo/layer_7/stable_rank_down_proj": 140.25570678710938, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4366149604320526, "geo/layer_7/attn_entropy_mean": 4.630788803100586, "geo/layer_7/attn_entropy_std": 0.7830474972724915, "geo/layer_14/stable_rank_q_proj": 50.35495376586914, "geo/layer_14/stable_rank_k_proj": 41.07954025268555, "geo/layer_14/stable_rank_o_proj": 43.26057815551758, "geo/layer_14/stable_rank_gate_proj": 71.14026641845703, "geo/layer_14/stable_rank_down_proj": 127.1819076538086, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3823332190513611, "geo/layer_14/attn_entropy_mean": 5.559451103210449, "geo/layer_14/attn_entropy_std": 0.41370290517807007, "geo/layer_21/stable_rank_q_proj": 39.87716293334961, "geo/layer_21/stable_rank_k_proj": 30.057594299316406, "geo/layer_21/stable_rank_o_proj": 69.12354278564453, "geo/layer_21/stable_rank_gate_proj": 64.39151763916016, "geo/layer_21/stable_rank_down_proj": 49.77073287963867, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1387023776769638, "geo/layer_21/attn_entropy_mean": 5.685374736785889, "geo/layer_21/attn_entropy_std": 0.30275091528892517, "geo/layer_27/stable_rank_q_proj": 43.68669509887695, "geo/layer_27/stable_rank_k_proj": 32.2265510559082, "geo/layer_27/stable_rank_o_proj": 115.2696304321289, "geo/layer_27/stable_rank_gate_proj": 78.33413696289062, "geo/layer_27/stable_rank_down_proj": 127.57215118408203, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09757435321807861, "geo/layer_27/attn_entropy_mean": 4.175748825073242, "geo/layer_27/attn_entropy_std": 0.7600263357162476, "attnres/final_alpha/block_0": 0.23637805879116058, "attnres/block_norm/0": 1.7728538513183594, "attnres/final_alpha/block_1": 0.00408204598352313, "attnres/block_norm/1": 48160.796875, "attnres/final_alpha/block_2": 0.010122818872332573, "attnres/block_norm/2": 28865.46484375, "attnres/final_alpha/block_3": 0.01184115745127201, "attnres/block_norm/3": 60781.61328125, "attnres/final_alpha/block_4": 0.014093302190303802, "attnres/block_norm/4": 15644.904296875, "attnres/final_alpha/block_5": 0.6169543266296387, "attnres/block_norm/5": 6808.001953125, "attnres/final_alpha/block_6": 0.10652823746204376, "attnres/block_norm/6": 40960.2578125, "geo/tier1_time_s": 1.3635647296905518, "geo/step": 71175.0, "geo/rankme_slope": -6.482229610594237e-05} {"step": 71180, "timestamp": 1778271479.897355, "train/loss": 2.1350494861602782, "train/z_loss": 0.0013764664996415376, "train/perplexity": 8.457465023975791, "train/grad_norm": 0.283203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1696165.9423214805, "perf/iters_per_sec": 0.8087949477775004, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2364073276519776, "data/tokens_consumed": 149277376512, "data/tokens_consumed_B": 149.277376512, "train/loss_slope": 1.0651951325465583e-06} {"step": 71190, "timestamp": 1778271490.2717705, "train/loss": 2.1377967834472655, "train/z_loss": 0.0013798000873066485, "train/perplexity": 8.48073214090023, "train/grad_norm": 0.14453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022824.0537931207, "perf/iters_per_sec": 0.9645576733556369, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0367446422576905, "data/tokens_consumed": 149298348032, "data/tokens_consumed_B": 149.298348032, "train/loss_slope": 1.153085863414983e-06} {"step": 71200, "timestamp": 1778271500.640357, "grad/layer_0/attn": 0.002735695568844676, "grad/layer_0/mlp": 0.002889807801693678, "grad/layer_0/attn_mlp_ratio": 0.9466703884508452, "grad/layer_4/attn": 0.0022642272524535656, "grad/layer_4/mlp": 0.0026118610985577106, "grad/layer_4/attn_mlp_ratio": 0.8669018298919899, "grad/layer_8/attn": 0.007126850076019764, "grad/layer_8/mlp": 0.0036750398576259613, "grad/layer_8/attn_mlp_ratio": 1.9392578470421957, "grad/layer_12/attn": 0.005439684726297855, "grad/layer_12/mlp": 0.007158955559134483, "grad/layer_12/attn_mlp_ratio": 0.7598433326454648, "grad/layer_16/attn": 0.006877265404909849, "grad/layer_16/mlp": 0.004402578808367252, "grad/layer_16/attn_mlp_ratio": 1.5620992940840586, "grad/layer_20/attn": 0.003592252731323242, "grad/layer_20/mlp": 0.00660207774490118, "grad/layer_20/attn_mlp_ratio": 0.5441094176279, "grad/layer_24/attn": 0.01644851826131344, "grad/layer_24/mlp": 0.01202435977756977, "grad/layer_24/attn_mlp_ratio": 1.367932965146568, "grad/layer_27/attn": 0.003914163913577795, "grad/layer_27/mlp": 0.011528117582201958, "grad/layer_27/attn_mlp_ratio": 0.33953191852133835} {"step": 71200, "timestamp": 1778271500.6572046, "train/loss": 2.1881606578826904, "train/z_loss": 0.0013675825204700232, "train/perplexity": 8.918793306341545, "train/grad_norm": 0.2294921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020237.2582813175, "perf/iters_per_sec": 0.9633241931349361, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03807213306427, "data/tokens_consumed": 149319319552, "data/tokens_consumed_B": 149.319319552, "train/loss_slope": 6.419396457677834e-06} {"step": 71210, "timestamp": 1778271511.0337827, "train/loss": 2.121401834487915, "train/z_loss": 0.001362728967797011, "train/perplexity": 8.342824553293415, "train/grad_norm": 0.208984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022315.268582745, "perf/iters_per_sec": 0.9643150656617856, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0370054721832276, "data/tokens_consumed": 149340291072, "data/tokens_consumed_B": 149.340291072, "train/loss_slope": 6.129317028973854e-06} {"step": 71220, "timestamp": 1778271521.4110518, "train/loss": 2.1802785396575928, "train/z_loss": 0.0013646280625835062, "train/perplexity": 8.848770649036432, "train/grad_norm": 0.0986328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022236.834260799, "perf/iters_per_sec": 0.9642776652626033, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037045693397522, "data/tokens_consumed": 149361262592, "data/tokens_consumed_B": 149.361262592, "train/loss_slope": 1.0162442881460831e-05} {"step": 71230, "timestamp": 1778271531.7830236, "train/loss": 2.1604453802108763, "train/z_loss": 0.0013715288718231023, "train/perplexity": 8.675000471728758, "train/grad_norm": 0.13671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022728.462638455, "perf/iters_per_sec": 0.9645120919410968, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0367936372756958, "data/tokens_consumed": 149382234112, "data/tokens_consumed_B": 149.382234112, "train/loss_slope": 1.275960941268913e-05} {"step": 71240, "timestamp": 1778271542.1557865, "train/loss": 2.158058500289917, "train/z_loss": 0.0013732167542912066, "train/perplexity": 8.654318979226947, "train/grad_norm": 0.2080078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023100.0861620842, "perf/iters_per_sec": 0.9646892958460256, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0366031885147096, "data/tokens_consumed": 149403205632, "data/tokens_consumed_B": 149.403205632, "train/loss_slope": 1.3299758070194033e-05} {"step": 71250, "timestamp": 1778271552.5176437, "grad/layer_0/attn": 0.0027785359416157007, "grad/layer_0/mlp": 0.0029677320271730423, "grad/layer_0/attn_mlp_ratio": 0.9362489006925422, "grad/layer_4/attn": 0.00445658341050148, "grad/layer_4/mlp": 0.0025375268887728453, "grad/layer_4/attn_mlp_ratio": 1.7562703491310236, "grad/layer_8/attn": 0.003718818072229624, "grad/layer_8/mlp": 0.003498602658510208, "grad/layer_8/attn_mlp_ratio": 1.0629437889693967, "grad/layer_12/attn": 0.004388345871120691, "grad/layer_12/mlp": 0.006829177960753441, "grad/layer_12/attn_mlp_ratio": 0.6425877070536568, "grad/layer_16/attn": 0.004613471683114767, "grad/layer_16/mlp": 0.0046043843030929565, "grad/layer_16/attn_mlp_ratio": 1.0019736145435005, "grad/layer_20/attn": 0.0032318856101483107, "grad/layer_20/mlp": 0.005934181157499552, "grad/layer_20/attn_mlp_ratio": 0.5446219907866633, "grad/layer_24/attn": 0.009776475839316845, "grad/layer_24/mlp": 0.010501759126782417, "grad/layer_24/attn_mlp_ratio": 0.9309369628646696, "grad/layer_27/attn": 0.005652412306517363, "grad/layer_27/mlp": 0.010834588669240475, "grad/layer_27/attn_mlp_ratio": 0.5217006779772415} {"step": 71250, "timestamp": 1778271553.1253517, "eos/sharpness": 71.39019966125487, "eos/L0_probe": 1.9659236669540405, "eos/L_plus": 2.3715906143188477, "eos/L_minus": 2.2741587162017822, "eos/grad_norm": 0.16752734780311584, "eos/embed_grad_frac": 0.08347882330417633, "eos/time_s": 0.6050028800964355} {"step": 71250, "timestamp": 1778271553.144884, "train/loss": 2.1081496238708497, "train/z_loss": 0.0013808743213303387, "train/perplexity": 8.232993047683633, "train/grad_norm": 0.1669921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1909603.301739531, "perf/iters_per_sec": 0.9105698116967826, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0982134342193604, "data/tokens_consumed": 149424177152, "data/tokens_consumed_B": 149.424177152, "train/loss_slope": 1.1810519128027409e-05} {"step": 71250, "timestamp": 1778271554.5084984, "geo/rankme_last": 438.434326171875, "geo/layer_0/stable_rank_q_proj": 19.229736328125, "geo/layer_0/stable_rank_k_proj": 15.938396453857422, "geo/layer_0/stable_rank_o_proj": 46.72162628173828, "geo/layer_0/stable_rank_gate_proj": 129.681884765625, "geo/layer_0/stable_rank_down_proj": 56.20981979370117, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06618797779083252, "geo/layer_0/attn_entropy_mean": 6.149681091308594, "geo/layer_0/attn_entropy_std": 0.4300270676612854, "geo/layer_7/stable_rank_q_proj": 42.97578430175781, "geo/layer_7/stable_rank_k_proj": 40.22112274169922, "geo/layer_7/stable_rank_o_proj": 89.52318572998047, "geo/layer_7/stable_rank_gate_proj": 78.70014190673828, "geo/layer_7/stable_rank_down_proj": 140.0601043701172, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.45526567101478577, "geo/layer_7/attn_entropy_mean": 4.636582374572754, "geo/layer_7/attn_entropy_std": 0.8019616007804871, "geo/layer_14/stable_rank_q_proj": 50.376895904541016, "geo/layer_14/stable_rank_k_proj": 40.91415023803711, "geo/layer_14/stable_rank_o_proj": 43.31105422973633, "geo/layer_14/stable_rank_gate_proj": 71.17268371582031, "geo/layer_14/stable_rank_down_proj": 127.18524932861328, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39271441102027893, "geo/layer_14/attn_entropy_mean": 5.530891418457031, "geo/layer_14/attn_entropy_std": 0.422000527381897, "geo/layer_21/stable_rank_q_proj": 39.83757781982422, "geo/layer_21/stable_rank_k_proj": 30.138572692871094, "geo/layer_21/stable_rank_o_proj": 69.1538314819336, "geo/layer_21/stable_rank_gate_proj": 64.27484130859375, "geo/layer_21/stable_rank_down_proj": 49.78545379638672, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13828635215759277, "geo/layer_21/attn_entropy_mean": 5.682187080383301, "geo/layer_21/attn_entropy_std": 0.3028566539287567, "geo/layer_27/stable_rank_q_proj": 43.6876106262207, "geo/layer_27/stable_rank_k_proj": 32.13825607299805, "geo/layer_27/stable_rank_o_proj": 115.6246109008789, "geo/layer_27/stable_rank_gate_proj": 78.29583740234375, "geo/layer_27/stable_rank_down_proj": 127.63683319091797, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09231822192668915, "geo/layer_27/attn_entropy_mean": 4.158236503601074, "geo/layer_27/attn_entropy_std": 0.7677741646766663, "attnres/final_alpha/block_0": 0.23610852658748627, "attnres/block_norm/0": 1.7729586362838745, "attnres/final_alpha/block_1": 0.0040538436733186245, "attnres/block_norm/1": 47917.2890625, "attnres/final_alpha/block_2": 0.010184720158576965, "attnres/block_norm/2": 28954.7265625, "attnres/final_alpha/block_3": 0.011699797585606575, "attnres/block_norm/3": 60957.8359375, "attnres/final_alpha/block_4": 0.014119450002908707, "attnres/block_norm/4": 15618.728515625, "attnres/final_alpha/block_5": 0.6187200546264648, "attnres/block_norm/5": 6832.0927734375, "attnres/final_alpha/block_6": 0.10511361807584763, "attnres/block_norm/6": 41023.1875, "geo/tier1_time_s": 1.3590960502624512, "geo/step": 71250.0, "geo/rankme_slope": -3.596903605192077e-05} {"step": 71260, "timestamp": 1778271564.8958967, "train/loss": 2.108473563194275, "train/z_loss": 0.001386472035665065, "train/perplexity": 8.235660469899413, "train/grad_norm": 0.0966796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1785314.583413649, "perf/iters_per_sec": 0.8513043324535603, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.174668049812317, "data/tokens_consumed": 149445148672, "data/tokens_consumed_B": 149.445148672, "train/loss_slope": 8.327917812323335e-06} {"step": 71270, "timestamp": 1778271575.276565, "train/loss": 2.1459258079528807, "train/z_loss": 0.001376392669044435, "train/perplexity": 8.549953188914156, "train/grad_norm": 0.1025390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021623.7963399666, "perf/iters_per_sec": 0.9639853460025628, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037360167503357, "data/tokens_consumed": 149466120192, "data/tokens_consumed_B": 149.466120192, "train/loss_slope": 7.343882688916574e-06} {"step": 71280, "timestamp": 1778271585.6565678, "train/loss": 2.083042228221893, "train/z_loss": 0.0013880869140848518, "train/perplexity": 8.028857414410423, "train/grad_norm": 0.0927734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021627.5134177902, "perf/iters_per_sec": 0.963987118443389, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373582601547242, "data/tokens_consumed": 149487091712, "data/tokens_consumed_B": 149.487091712, "train/loss_slope": 5.992121943975334e-06} {"step": 71290, "timestamp": 1778271596.4118679, "train/loss": 2.125215220451355, "train/z_loss": 0.0013810162665322423, "train/perplexity": 8.374699700833366, "train/grad_norm": 0.279296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1950870.7847232642, "perf/iters_per_sec": 0.9302476810089417, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0749825239181519, "data/tokens_consumed": 149508063232, "data/tokens_consumed_B": 149.508063232, "train/loss_slope": 3.280522821187667e-06} {"step": 71300, "timestamp": 1778271607.3069055, "grad/layer_0/attn": 0.011002280749380589, "grad/layer_0/mlp": 0.009503317065536976, "grad/layer_0/attn_mlp_ratio": 1.1577305647842087, "grad/layer_4/attn": 0.003357721259817481, "grad/layer_4/mlp": 0.0037714149802923203, "grad/layer_4/attn_mlp_ratio": 0.8903080643028096, "grad/layer_8/attn": 0.006235566455870867, "grad/layer_8/mlp": 0.0039950269274413586, "grad/layer_8/attn_mlp_ratio": 1.5608321077778733, "grad/layer_12/attn": 0.006202922202646732, "grad/layer_12/mlp": 0.007284441031515598, "grad/layer_12/attn_mlp_ratio": 0.8515302808626246, "grad/layer_16/attn": 0.003798495279625058, "grad/layer_16/mlp": 0.004548273514956236, "grad/layer_16/attn_mlp_ratio": 0.8351510047975881, "grad/layer_20/attn": 0.0033388242591172457, "grad/layer_20/mlp": 0.005781070329248905, "grad/layer_20/attn_mlp_ratio": 0.5775442973717648, "grad/layer_24/attn": 0.01136550772935152, "grad/layer_24/mlp": 0.01017035823315382, "grad/layer_24/attn_mlp_ratio": 1.1175130076097413, "grad/layer_27/attn": 0.013758884742856026, "grad/layer_27/mlp": 0.008520529605448246, "grad/layer_27/attn_mlp_ratio": 1.6147921805915706} {"step": 71300, "timestamp": 1778271607.3237767, "train/loss": 2.1573580741882323, "train/z_loss": 0.001373859925661236, "train/perplexity": 8.648259390716303, "train/grad_norm": 0.203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1923403.6060392398, "perf/iters_per_sec": 0.9171503095813941, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0903338193893433, "data/tokens_consumed": 149529034752, "data/tokens_consumed_B": 149.529034752, "train/loss_slope": 5.903261386700718e-06} {"step": 71310, "timestamp": 1778271618.302237, "train/loss": 2.180736470222473, "train/z_loss": 0.0013750284910202027, "train/perplexity": 8.85282369951526, "train/grad_norm": 0.25390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1911117.926593685, "perf/iters_per_sec": 0.9112920411079812, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0973430633544923, "data/tokens_consumed": 149550006272, "data/tokens_consumed_B": 149.550006272, "train/loss_slope": 7.739646206594435e-06} {"step": 71320, "timestamp": 1778271628.681298, "train/loss": 2.0847319602966308, "train/z_loss": 0.0013696225476451218, "train/perplexity": 8.042435500740366, "train/grad_norm": 0.1123046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021965.359936256, "perf/iters_per_sec": 0.9641482162171631, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0371849298477174, "data/tokens_consumed": 149570977792, "data/tokens_consumed_B": 149.570977792, "train/loss_slope": 9.054726249563391e-07} {"step": 71325, "timestamp": 1778271634.475447, "eos/sharpness": 38.074135780334466, "eos/L0_probe": 1.9666365385055542, "eos/L_plus": 2.148681879043579, "eos/L_minus": 2.165332555770874, "eos/grad_norm": 0.1624482423067093, "eos/embed_grad_frac": 0.1094161793589592, "eos/time_s": 0.6128251552581787} {"step": 71325, "timestamp": 1778271635.8571978, "geo/rankme_last": 438.0260009765625, "geo/layer_0/stable_rank_q_proj": 19.25960922241211, "geo/layer_0/stable_rank_k_proj": 15.94076919555664, "geo/layer_0/stable_rank_o_proj": 46.75545883178711, "geo/layer_0/stable_rank_gate_proj": 130.04510498046875, "geo/layer_0/stable_rank_down_proj": 56.22018051147461, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.062299273908138275, "geo/layer_0/attn_entropy_mean": 6.151369094848633, "geo/layer_0/attn_entropy_std": 0.42861422896385193, "geo/layer_7/stable_rank_q_proj": 43.080535888671875, "geo/layer_7/stable_rank_k_proj": 40.24318313598633, "geo/layer_7/stable_rank_o_proj": 89.57125091552734, "geo/layer_7/stable_rank_gate_proj": 78.73328399658203, "geo/layer_7/stable_rank_down_proj": 140.06483459472656, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.43296560645103455, "geo/layer_7/attn_entropy_mean": 4.653224945068359, "geo/layer_7/attn_entropy_std": 0.7900097370147705, "geo/layer_14/stable_rank_q_proj": 50.39632034301758, "geo/layer_14/stable_rank_k_proj": 40.93632888793945, "geo/layer_14/stable_rank_o_proj": 43.321048736572266, "geo/layer_14/stable_rank_gate_proj": 71.0357894897461, "geo/layer_14/stable_rank_down_proj": 127.40241241455078, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3881767988204956, "geo/layer_14/attn_entropy_mean": 5.527431011199951, "geo/layer_14/attn_entropy_std": 0.42224204540252686, "geo/layer_21/stable_rank_q_proj": 39.801612854003906, "geo/layer_21/stable_rank_k_proj": 30.19511604309082, "geo/layer_21/stable_rank_o_proj": 69.07705688476562, "geo/layer_21/stable_rank_gate_proj": 64.25440216064453, "geo/layer_21/stable_rank_down_proj": 49.8173828125, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14094573259353638, "geo/layer_21/attn_entropy_mean": 5.677559852600098, "geo/layer_21/attn_entropy_std": 0.31525182723999023, "geo/layer_27/stable_rank_q_proj": 43.67265701293945, "geo/layer_27/stable_rank_k_proj": 32.09589385986328, "geo/layer_27/stable_rank_o_proj": 115.69483184814453, "geo/layer_27/stable_rank_gate_proj": 78.19453430175781, "geo/layer_27/stable_rank_down_proj": 127.84098815917969, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09626565873622894, "geo/layer_27/attn_entropy_mean": 4.152279853820801, "geo/layer_27/attn_entropy_std": 0.7673937082290649, "attnres/final_alpha/block_0": 0.23684053122997284, "attnres/block_norm/0": 1.7729291915893555, "attnres/final_alpha/block_1": 0.004033043049275875, "attnres/block_norm/1": 48314.1875, "attnres/final_alpha/block_2": 0.010152958333492279, "attnres/block_norm/2": 28785.0859375, "attnres/final_alpha/block_3": 0.011916548013687134, "attnres/block_norm/3": 61156.15625, "attnres/final_alpha/block_4": 0.014074936509132385, "attnres/block_norm/4": 15670.52734375, "attnres/final_alpha/block_5": 0.6176217198371887, "attnres/block_norm/5": 6797.89404296875, "attnres/final_alpha/block_6": 0.1053602322936058, "attnres/block_norm/6": 41042.48046875, "geo/tier1_time_s": 1.3612382411956787, "geo/step": 71325.0, "geo/rankme_slope": -5.452569699754902e-05} {"step": 71330, "timestamp": 1778271641.0496686, "train/loss": 2.108211350440979, "train/z_loss": 0.0013899071258492769, "train/perplexity": 8.23350125779124, "train/grad_norm": 0.220703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1696608.884309182, "perf/iters_per_sec": 0.8090061589761648, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.236084532737732, "data/tokens_consumed": 149591949312, "data/tokens_consumed_B": 149.591949312, "train/loss_slope": 2.8634584239742857e-07} {"step": 71340, "timestamp": 1778271651.4491494, "train/loss": 2.107844150066376, "train/z_loss": 0.001386824285145849, "train/perplexity": 8.230478468063806, "train/grad_norm": 0.1748046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2017780.0933390057, "perf/iters_per_sec": 0.9621525255866078, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0393362522125245, "data/tokens_consumed": 149612920832, "data/tokens_consumed_B": 149.612920832, "train/loss_slope": -3.678040586002934e-06} {"step": 71350, "timestamp": 1778271662.3671381, "grad/layer_0/attn": 0.002847700146958232, "grad/layer_0/mlp": 0.0030870442278683186, "grad/layer_0/attn_mlp_ratio": 0.922468174898104, "grad/layer_4/attn": 0.0030380194075405598, "grad/layer_4/mlp": 0.0024984213523566723, "grad/layer_4/attn_mlp_ratio": 1.215975553153734, "grad/layer_8/attn": 0.0037321678828448057, "grad/layer_8/mlp": 0.003607921302318573, "grad/layer_8/attn_mlp_ratio": 1.0344371361433726, "grad/layer_12/attn": 0.008260732516646385, "grad/layer_12/mlp": 0.007518346421420574, "grad/layer_12/attn_mlp_ratio": 1.098743253335115, "grad/layer_16/attn": 0.004446067847311497, "grad/layer_16/mlp": 0.004612418822944164, "grad/layer_16/attn_mlp_ratio": 0.9639340921950595, "grad/layer_20/attn": 0.002874779049307108, "grad/layer_20/mlp": 0.0061237020418047905, "grad/layer_20/attn_mlp_ratio": 0.4694511559734104, "grad/layer_24/attn": 0.004842820111662149, "grad/layer_24/mlp": 0.007458476349711418, "grad/layer_24/attn_mlp_ratio": 0.6493042036553628, "grad/layer_27/attn": 0.0047166659496724606, "grad/layer_27/mlp": 0.006201338022947311, "grad/layer_27/attn_mlp_ratio": 0.760588417557656} {"step": 71350, "timestamp": 1778271662.3842814, "train/loss": 2.103748059272766, "train/z_loss": 0.001377895928453654, "train/perplexity": 8.196834632126654, "train/grad_norm": 0.0869140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1919051.8252415915, "perf/iters_per_sec": 0.9150752187927206, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.092806339263916, "data/tokens_consumed": 149633892352, "data/tokens_consumed_B": 149.633892352, "train/loss_slope": -5.668877725518278e-06} {"step": 71360, "timestamp": 1778271673.116733, "train/loss": 2.129143714904785, "train/z_loss": 0.0013880250859074294, "train/perplexity": 8.407664370522761, "train/grad_norm": 0.130859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1955430.3583506097, "perf/iters_per_sec": 0.9324218551400231, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0724759340286254, "data/tokens_consumed": 149654863872, "data/tokens_consumed_B": 149.654863872, "train/loss_slope": -6.155303216288167e-06} {"step": 71370, "timestamp": 1778271683.494991, "train/loss": 2.146349287033081, "train/z_loss": 0.0013685414800420404, "train/perplexity": 8.553574681985511, "train/grad_norm": 0.1220703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021964.7557082777, "perf/iters_per_sec": 0.964147928098811, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0371852397918702, "data/tokens_consumed": 149675835392, "data/tokens_consumed_B": 149.675835392, "train/loss_slope": -9.635754551502718e-06} {"step": 71380, "timestamp": 1778271693.8748934, "train/loss": 2.092478084564209, "train/z_loss": 0.0013840067549608648, "train/perplexity": 8.104975112635305, "train/grad_norm": 0.193359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021766.3095947013, "perf/iters_per_sec": 0.9640533016179568, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0372870445251465, "data/tokens_consumed": 149696806912, "data/tokens_consumed_B": 149.696806912, "train/loss_slope": -9.34612308934834e-06} {"step": 71390, "timestamp": 1778271704.2630675, "train/loss": 2.1423295974731444, "train/z_loss": 0.0013819132931530476, "train/perplexity": 8.519260978557027, "train/grad_norm": 0.119140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021254.2012121652, "perf/iters_per_sec": 0.9638091093121363, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0375498533248901, "data/tokens_consumed": 149717778432, "data/tokens_consumed_B": 149.717778432, "train/loss_slope": -8.1818553659604e-06} {"step": 71400, "timestamp": 1778271714.6044948, "grad/layer_0/attn": 0.003017457202076912, "grad/layer_0/mlp": 0.003281545592471957, "grad/layer_0/attn_mlp_ratio": 0.9195231408781299, "grad/layer_4/attn": 0.0027580370660871267, "grad/layer_4/mlp": 0.0025495465379208326, "grad/layer_4/attn_mlp_ratio": 1.0817754910089894, "grad/layer_8/attn": 0.004590617027133703, "grad/layer_8/mlp": 0.00358088337816298, "grad/layer_8/attn_mlp_ratio": 1.2819788901616882, "grad/layer_12/attn": 0.004369272850453854, "grad/layer_12/mlp": 0.007203008979558945, "grad/layer_12/attn_mlp_ratio": 0.606589940703142, "grad/layer_16/attn": 0.0037326491437852383, "grad/layer_16/mlp": 0.004615988116711378, "grad/layer_16/attn_mlp_ratio": 0.808634894316202, "grad/layer_20/attn": 0.002829406876116991, "grad/layer_20/mlp": 0.005687837488949299, "grad/layer_20/attn_mlp_ratio": 0.4974486053564798, "grad/layer_24/attn": 0.0076104868203401566, "grad/layer_24/mlp": 0.008892986923456192, "grad/layer_24/attn_mlp_ratio": 0.855785215953503, "grad/layer_27/attn": 0.006558774504810572, "grad/layer_27/mlp": 0.007120855152606964, "grad/layer_27/attn_mlp_ratio": 0.9210655563331929} {"step": 71400, "timestamp": 1778271715.2147648, "eos/sharpness": 18.309974670410153, "eos/L0_probe": 1.966955304145813, "eos/L_plus": 2.0618174076080322, "eos/L_minus": 2.0551929473876953, "eos/grad_norm": 0.09012509137392044, "eos/embed_grad_frac": 0.2785397469997406, "eos/time_s": 0.607203483581543} {"step": 71400, "timestamp": 1778271715.2349944, "train/loss": 2.1505913138389587, "train/z_loss": 0.0013636553427204489, "train/perplexity": 8.58993624390621, "train/grad_norm": 0.09033203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1912656.873487501, "perf/iters_per_sec": 0.9120258681714539, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0964601278305053, "data/tokens_consumed": 149738749952, "data/tokens_consumed_B": 149.738749952, "train/loss_slope": -5.4425547582911335e-06} {"step": 71400, "timestamp": 1778271716.6044693, "geo/rankme_last": 438.3904113769531, "geo/layer_0/stable_rank_q_proj": 19.276927947998047, "geo/layer_0/stable_rank_k_proj": 15.949370384216309, "geo/layer_0/stable_rank_o_proj": 46.802818298339844, "geo/layer_0/stable_rank_gate_proj": 129.9945526123047, "geo/layer_0/stable_rank_down_proj": 56.156837463378906, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.067165307700634, "geo/layer_0/attn_entropy_mean": 6.152728080749512, "geo/layer_0/attn_entropy_std": 0.4308927059173584, "geo/layer_7/stable_rank_q_proj": 43.072227478027344, "geo/layer_7/stable_rank_k_proj": 40.107425689697266, "geo/layer_7/stable_rank_o_proj": 89.53256225585938, "geo/layer_7/stable_rank_gate_proj": 78.7667465209961, "geo/layer_7/stable_rank_down_proj": 140.2874755859375, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.43886345624923706, "geo/layer_7/attn_entropy_mean": 4.64211368560791, "geo/layer_7/attn_entropy_std": 0.793828547000885, "geo/layer_14/stable_rank_q_proj": 50.39579772949219, "geo/layer_14/stable_rank_k_proj": 40.97813034057617, "geo/layer_14/stable_rank_o_proj": 43.295955657958984, "geo/layer_14/stable_rank_gate_proj": 70.89556121826172, "geo/layer_14/stable_rank_down_proj": 127.48129272460938, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39493268728256226, "geo/layer_14/attn_entropy_mean": 5.547711372375488, "geo/layer_14/attn_entropy_std": 0.4234274923801422, "geo/layer_21/stable_rank_q_proj": 39.801517486572266, "geo/layer_21/stable_rank_k_proj": 30.221731185913086, "geo/layer_21/stable_rank_o_proj": 69.10845184326172, "geo/layer_21/stable_rank_gate_proj": 64.20999908447266, "geo/layer_21/stable_rank_down_proj": 49.800411224365234, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14206096529960632, "geo/layer_21/attn_entropy_mean": 5.680515766143799, "geo/layer_21/attn_entropy_std": 0.30427345633506775, "geo/layer_27/stable_rank_q_proj": 43.62023162841797, "geo/layer_27/stable_rank_k_proj": 32.155113220214844, "geo/layer_27/stable_rank_o_proj": 115.59220886230469, "geo/layer_27/stable_rank_gate_proj": 78.25848388671875, "geo/layer_27/stable_rank_down_proj": 128.15370178222656, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10541601479053497, "geo/layer_27/attn_entropy_mean": 4.156099319458008, "geo/layer_27/attn_entropy_std": 0.7706170082092285, "attnres/final_alpha/block_0": 0.23526333272457123, "attnres/block_norm/0": 1.7729692459106445, "attnres/final_alpha/block_1": 0.004017841070890427, "attnres/block_norm/1": 48314.19921875, "attnres/final_alpha/block_2": 0.009911254048347473, "attnres/block_norm/2": 28741.990234375, "attnres/final_alpha/block_3": 0.011676719412207603, "attnres/block_norm/3": 60891.90625, "attnres/final_alpha/block_4": 0.013852067291736603, "attnres/block_norm/4": 15626.55859375, "attnres/final_alpha/block_5": 0.6183100938796997, "attnres/block_norm/5": 6838.1435546875, "attnres/final_alpha/block_6": 0.10696865618228912, "attnres/block_norm/6": 40985.9921875, "geo/tier1_time_s": 1.3659980297088623, "geo/step": 71400.0, "geo/rankme_slope": -5.6151269101390556e-05} {"step": 71410, "timestamp": 1778271726.942433, "train/loss": 2.161844563484192, "train/z_loss": 0.0013629608205519617, "train/perplexity": 8.687146882830831, "train/grad_norm": 0.271484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1791807.3515000362, "perf/iters_per_sec": 0.8544003255367452, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1704115390777587, "data/tokens_consumed": 149759721472, "data/tokens_consumed_B": 149.759721472, "train/loss_slope": -5.793374871621113e-06} {"step": 71420, "timestamp": 1778271737.331554, "train/loss": 2.1676609992980955, "train/z_loss": 0.0013754607411101461, "train/perplexity": 8.737822347508176, "train/grad_norm": 0.34765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020316.5584386233, "perf/iters_per_sec": 0.9633620063965909, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0380313873291016, "data/tokens_consumed": 149780692992, "data/tokens_consumed_B": 149.780692992, "train/loss_slope": -1.306203360890938e-06} {"step": 71430, "timestamp": 1778271747.7094107, "train/loss": 2.1076887369155886, "train/z_loss": 0.0013711895211599768, "train/perplexity": 8.229199442863841, "train/grad_norm": 0.18359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021890.1130913289, "perf/iters_per_sec": 0.9641123357254643, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0372235298156738, "data/tokens_consumed": 149801664512, "data/tokens_consumed_B": 149.801664512, "train/loss_slope": -6.481886282004399e-06} {"step": 71440, "timestamp": 1778271758.0848124, "train/loss": 2.1674351692199707, "train/z_loss": 0.0013716911082156003, "train/perplexity": 8.735849307199107, "train/grad_norm": 0.169921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022670.554143784, "perf/iters_per_sec": 0.9644844790190621, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036823320388794, "data/tokens_consumed": 149822636032, "data/tokens_consumed_B": 149.822636032, "train/loss_slope": -5.39253173631272e-06} {"step": 71450, "timestamp": 1778271768.4500198, "grad/layer_0/attn": 0.002479767194017768, "grad/layer_0/mlp": 0.0029269331134855747, "grad/layer_0/attn_mlp_ratio": 0.8472237024720856, "grad/layer_4/attn": 0.0026191030628979206, "grad/layer_4/mlp": 0.0023758921306580305, "grad/layer_4/attn_mlp_ratio": 1.1023661044476458, "grad/layer_8/attn": 0.0032403215300291777, "grad/layer_8/mlp": 0.0034482639748603106, "grad/layer_8/attn_mlp_ratio": 0.9396964558639388, "grad/layer_12/attn": 0.006566417869180441, "grad/layer_12/mlp": 0.006712545175105333, "grad/layer_12/attn_mlp_ratio": 0.9782307008837867, "grad/layer_16/attn": 0.00406290590763092, "grad/layer_16/mlp": 0.004763908684253693, "grad/layer_16/attn_mlp_ratio": 0.8528513226492035, "grad/layer_20/attn": 0.00899291317909956, "grad/layer_20/mlp": 0.006295794155448675, "grad/layer_20/attn_mlp_ratio": 1.428400105565183, "grad/layer_24/attn": 0.013667641207575798, "grad/layer_24/mlp": 0.011251953430473804, "grad/layer_24/attn_mlp_ratio": 1.2146905131237484, "grad/layer_27/attn": 0.006930471397936344, "grad/layer_27/mlp": 0.010568122379481792, "grad/layer_27/attn_mlp_ratio": 0.6557902230402783} {"step": 71450, "timestamp": 1778271768.466599, "train/loss": 2.125324821472168, "train/z_loss": 0.0013875359087251126, "train/perplexity": 8.375617626771474, "train/grad_norm": 0.15625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021337.6688395434, "perf/iters_per_sec": 0.9638489097783772, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0375070095062255, "data/tokens_consumed": 149843607552, "data/tokens_consumed_B": 149.843607552, "train/loss_slope": -1.0308008511575076e-05} {"step": 71460, "timestamp": 1778271778.8389075, "train/loss": 2.1379810094833376, "train/z_loss": 0.0013690529624000192, "train/perplexity": 8.482294656489143, "train/grad_norm": 0.171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022655.4845250326, "perf/iters_per_sec": 0.9644772932648814, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0368310451507567, "data/tokens_consumed": 149864579072, "data/tokens_consumed_B": 149.864579072, "train/loss_slope": -1.12611048507004e-05} {"step": 71470, "timestamp": 1778271789.2148633, "train/loss": 2.2023935079574586, "train/z_loss": 0.0013607505825348198, "train/perplexity": 9.046640813310095, "train/grad_norm": 0.2294921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022454.2057171022, "perf/iters_per_sec": 0.9643813160501014, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036934232711792, "data/tokens_consumed": 149885550592, "data/tokens_consumed_B": 149.885550592, "train/loss_slope": -5.766201942059299e-06} {"step": 71475, "timestamp": 1778271795.0022476, "eos/sharpness": 40.3738260269165, "eos/L0_probe": 1.9664238691329956, "eos/L_plus": 2.159879446029663, "eos/L_minus": 2.176706552505493, "eos/grad_norm": 0.12402709573507309, "eos/embed_grad_frac": 0.14803211390972137, "eos/time_s": 0.6078243255615234} {"step": 71475, "timestamp": 1778271796.3816445, "geo/rankme_last": 437.9599914550781, "geo/layer_0/stable_rank_q_proj": 19.267850875854492, "geo/layer_0/stable_rank_k_proj": 15.99100399017334, "geo/layer_0/stable_rank_o_proj": 46.77457046508789, "geo/layer_0/stable_rank_gate_proj": 130.109130859375, "geo/layer_0/stable_rank_down_proj": 56.23341751098633, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06821908801794052, "geo/layer_0/attn_entropy_mean": 6.148173809051514, "geo/layer_0/attn_entropy_std": 0.4299604892730713, "geo/layer_7/stable_rank_q_proj": 43.046085357666016, "geo/layer_7/stable_rank_k_proj": 40.23453903198242, "geo/layer_7/stable_rank_o_proj": 89.36083221435547, "geo/layer_7/stable_rank_gate_proj": 78.7012710571289, "geo/layer_7/stable_rank_down_proj": 140.2816162109375, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4411565661430359, "geo/layer_7/attn_entropy_mean": 4.671931266784668, "geo/layer_7/attn_entropy_std": 0.7908588647842407, "geo/layer_14/stable_rank_q_proj": 50.419288635253906, "geo/layer_14/stable_rank_k_proj": 41.05647659301758, "geo/layer_14/stable_rank_o_proj": 43.30329132080078, "geo/layer_14/stable_rank_gate_proj": 70.98474884033203, "geo/layer_14/stable_rank_down_proj": 127.67689514160156, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3965754508972168, "geo/layer_14/attn_entropy_mean": 5.531805038452148, "geo/layer_14/attn_entropy_std": 0.4203992784023285, "geo/layer_21/stable_rank_q_proj": 39.820919036865234, "geo/layer_21/stable_rank_k_proj": 30.215055465698242, "geo/layer_21/stable_rank_o_proj": 69.11115264892578, "geo/layer_21/stable_rank_gate_proj": 64.23218536376953, "geo/layer_21/stable_rank_down_proj": 49.851104736328125, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13762199878692627, "geo/layer_21/attn_entropy_mean": 5.689175605773926, "geo/layer_21/attn_entropy_std": 0.31120961904525757, "geo/layer_27/stable_rank_q_proj": 43.53889465332031, "geo/layer_27/stable_rank_k_proj": 32.100494384765625, "geo/layer_27/stable_rank_o_proj": 115.5880126953125, "geo/layer_27/stable_rank_gate_proj": 78.34197235107422, "geo/layer_27/stable_rank_down_proj": 128.10763549804688, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08602093160152435, "geo/layer_27/attn_entropy_mean": 4.161376476287842, "geo/layer_27/attn_entropy_std": 0.7653371691703796, "attnres/final_alpha/block_0": 0.23694181442260742, "attnres/block_norm/0": 1.772878646850586, "attnres/final_alpha/block_1": 0.004047686234116554, "attnres/block_norm/1": 48130.2890625, "attnres/final_alpha/block_2": 0.010122827254235744, "attnres/block_norm/2": 28827.26953125, "attnres/final_alpha/block_3": 0.01190886925905943, "attnres/block_norm/3": 61249.34375, "attnres/final_alpha/block_4": 0.014087671414017677, "attnres/block_norm/4": 15664.716796875, "attnres/final_alpha/block_5": 0.6153640151023865, "attnres/block_norm/5": 6804.7724609375, "attnres/final_alpha/block_6": 0.10752709209918976, "attnres/block_norm/6": 40807.8671875, "geo/tier1_time_s": 1.359839916229248, "geo/step": 71475.0, "geo/rankme_slope": -7.111182363570428e-05} {"step": 71480, "timestamp": 1778271801.5739217, "train/loss": 2.129185211658478, "train/z_loss": 0.001369360787793994, "train/perplexity": 8.408013268539294, "train/grad_norm": 0.09375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1697524.3236313267, "perf/iters_per_sec": 0.8094426744610437, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.235417938232422, "data/tokens_consumed": 149906522112, "data/tokens_consumed_B": 149.906522112, "train/loss_slope": -5.953711015556518e-06} {"step": 71490, "timestamp": 1778271811.9549625, "train/loss": 2.1297547698020933, "train/z_loss": 0.0013808391289785505, "train/perplexity": 8.412803484991919, "train/grad_norm": 0.1396484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021120.0729102853, "perf/iters_per_sec": 0.9637451519538333, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0376187086105346, "data/tokens_consumed": 149927493632, "data/tokens_consumed_B": 149.927493632, "train/loss_slope": -5.455709562407525e-06} {"step": 71500, "timestamp": 1778271822.3147347, "grad/layer_0/attn": 0.003145706607028842, "grad/layer_0/mlp": 0.003090054029598832, "grad/layer_0/attn_mlp_ratio": 1.0180101949985048, "grad/layer_4/attn": 0.0027782481629401445, "grad/layer_4/mlp": 0.0027970171067863703, "grad/layer_4/attn_mlp_ratio": 0.9932896216009374, "grad/layer_8/attn": 0.009377924725413322, "grad/layer_8/mlp": 0.0036473877262324095, "grad/layer_8/attn_mlp_ratio": 2.5711345138474906, "grad/layer_12/attn": 0.005680032540112734, "grad/layer_12/mlp": 0.007330830208957195, "grad/layer_12/attn_mlp_ratio": 0.7748143526351402, "grad/layer_16/attn": 0.0033812117762863636, "grad/layer_16/mlp": 0.00468668295070529, "grad/layer_16/attn_mlp_ratio": 0.7214509152219141, "grad/layer_20/attn": 0.004126299172639847, "grad/layer_20/mlp": 0.0062266141176223755, "grad/layer_20/attn_mlp_ratio": 0.6626874619856358, "grad/layer_24/attn": 0.009070838801562786, "grad/layer_24/mlp": 0.008546187542378902, "grad/layer_24/attn_mlp_ratio": 1.0613900818865993, "grad/layer_27/attn": 0.010485081933438778, "grad/layer_27/mlp": 0.007482450921088457, "grad/layer_27/attn_mlp_ratio": 1.4012897516987066} {"step": 71500, "timestamp": 1778271822.3317244, "train/loss": 2.136204147338867, "train/z_loss": 0.0013651836197823286, "train/perplexity": 8.467236170604481, "train/grad_norm": 0.11474609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022536.330817591, "perf/iters_per_sec": 0.9644204763496356, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0368921279907226, "data/tokens_consumed": 149948465152, "data/tokens_consumed_B": 149.948465152, "train/loss_slope": -4.344668740307755e-06} {"step": 71500, "timestamp": 1778271829.2934709, "geo/ww_alpha_mean": 7.522098779958442, "geo/ww_alpha_std": 4.179794276638163, "geo/ww_alpha_min": 1.332926577675399, "geo/ww_alpha_max": 31.764270127443687, "geo/ww_alpha_healthy_frac": 0.17258883248730963, "geo/ww_alpha_by_type/q_proj": 3.9409699677507897, "geo/ww_alpha_by_type/k_proj": 4.522939216518315, "geo/ww_alpha_by_type/v_proj": 8.763969079225841, "geo/ww_alpha_by_type/o_proj": 7.855590843065954, "geo/ww_alpha_by_type/gate_proj": 8.099745137649963, "geo/ww_alpha_by_type/up_proj": 11.505465568884777, "geo/ww_alpha_by_type/down_proj": 8.055931384897546, "geo/twonn_id/layer_0": 0.6767920255661011, "geo/twonn_id/layer_7": 3.461862564086914, "geo/twonn_id/layer_14": 4.876529693603516, "geo/twonn_id/layer_21": 6.569454193115234, "geo/twonn_id/layer_27": 5.493279933929443, "geo/tier2_time_s": 6.954887628555298} {"step": 71500, "timestamp": 1778271829.9422653, "eoc/jacobian_sigma/layer_0/attn": 1245.7576904296875, "eoc/jacobian_sigma/layer_0/mlp": 8153.82373046875, "eoc/jacobian_sigma/layer_0": 8153.82373046875, "eoc/jacobian_sigma/layer_7/attn": 1.1532812118530273, "eoc/jacobian_sigma/layer_7/mlp": 1.6844558715820312, "eoc/jacobian_sigma/layer_7": 1.6844558715820312, "eoc/jacobian_sigma/layer_14/attn": 1.4736546277999878, "eoc/jacobian_sigma/layer_14/mlp": 7.197477340698242, "eoc/jacobian_sigma/layer_14": 7.197477340698242, "eoc/jacobian_sigma/layer_21/attn": 1.1139711141586304, "eoc/jacobian_sigma/layer_21/mlp": 4.231622695922852, "eoc/jacobian_sigma/layer_21": 4.231622695922852, "eoc/jacobian_sigma/layer_27/attn": 3.2323074340820312, "eoc/jacobian_sigma/layer_27/mlp": 29.330551147460938, "eoc/jacobian_sigma/layer_27": 29.330551147460938, "eoc/layer0_sigma": 8153.82373046875, "eoc/sigma_max": 29.330551147460938, "eoc/sigma_min": 1.6844558715820312, "eoc/sigma_mean": 10.611026763916016, "eoc/time_s": 0.6425299644470215} {"step": 71510, "timestamp": 1778271840.331151, "train/loss": 2.1474099159240723, "train/z_loss": 0.0013684199424460531, "train/perplexity": 8.562651663217826, "train/grad_norm": 0.12890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1165505.1518609591, "perf/iters_per_sec": 0.5557561644844814, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.7993502616882324, "data/tokens_consumed": 149969436672, "data/tokens_consumed_B": 149.969436672, "train/loss_slope": -1.7429616692329204e-06} {"step": 71520, "timestamp": 1778271850.7049491, "train/loss": 2.181575131416321, "train/z_loss": 0.0013730306294746698, "train/perplexity": 8.860251333406822, "train/grad_norm": 0.1669921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022872.1551437306, "perf/iters_per_sec": 0.9645806098669675, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0367199897766113, "data/tokens_consumed": 149990408192, "data/tokens_consumed_B": 149.990408192, "train/loss_slope": -1.9053662535023673e-06} {"step": 71530, "timestamp": 1778271861.0825343, "train/loss": 2.135302257537842, "train/z_loss": 0.001371200301218778, "train/perplexity": 8.459603099271334, "train/grad_norm": 0.1572265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021842.802007276, "perf/iters_per_sec": 0.964089776042593, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0372478008270263, "data/tokens_consumed": 150011379712, "data/tokens_consumed_B": 150.011379712, "train/loss_slope": -3.606240691417641e-06} {"step": 71540, "timestamp": 1778271871.4556339, "train/loss": 2.1360689282417296, "train/z_loss": 0.0013762673595920205, "train/perplexity": 8.46609131597909, "train/grad_norm": 0.2236328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022715.345725101, "perf/iters_per_sec": 0.9645058373094086, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0368003606796266, "data/tokens_consumed": 150032351232, "data/tokens_consumed_B": 150.032351232, "train/loss_slope": -5.1495161637364476e-06} {"step": 71550, "timestamp": 1778271881.8169887, "grad/layer_0/attn": 0.00300515815615654, "grad/layer_0/mlp": 0.0031190854497253895, "grad/layer_0/attn_mlp_ratio": 0.9634741042678743, "grad/layer_4/attn": 0.0025328071787953377, "grad/layer_4/mlp": 0.0026315359864383936, "grad/layer_4/attn_mlp_ratio": 0.9624824040407978, "grad/layer_8/attn": 0.005214471835643053, "grad/layer_8/mlp": 0.0036893156357109547, "grad/layer_8/attn_mlp_ratio": 1.413398095795721, "grad/layer_12/attn": 0.006324321497231722, "grad/layer_12/mlp": 0.007927057333290577, "grad/layer_12/attn_mlp_ratio": 0.7978145169823063, "grad/layer_16/attn": 0.009435083717107773, "grad/layer_16/mlp": 0.005624357610940933, "grad/layer_16/attn_mlp_ratio": 1.6775397657858644, "grad/layer_20/attn": 0.004588444717228413, "grad/layer_20/mlp": 0.006015588995069265, "grad/layer_20/attn_mlp_ratio": 0.7627589991127178, "grad/layer_24/attn": 0.006775872781872749, "grad/layer_24/mlp": 0.007824942469596863, "grad/layer_24/attn_mlp_ratio": 0.8659325894863199, "grad/layer_27/attn": 0.005499381106346846, "grad/layer_27/mlp": 0.007102224975824356, "grad/layer_27/attn_mlp_ratio": 0.7743180549242917} {"step": 71550, "timestamp": 1778271882.435901, "eos/sharpness": 17.690443992614743, "eos/L0_probe": 1.9663562774658203, "eos/L_plus": 2.0625596046447754, "eos/L_minus": 2.0470573902130127, "eos/grad_norm": 0.10016551613807678, "eos/embed_grad_frac": 0.22542576491832733, "eos/time_s": 0.6162233352661133} {"step": 71550, "timestamp": 1778271882.4553928, "train/loss": 2.12766569852829, "train/z_loss": 0.0013604636769741774, "train/perplexity": 8.395246883779967, "train/grad_norm": 0.10009765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1907373.332095014, "perf/iters_per_sec": 0.9095064793086118, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.099497389793396, "data/tokens_consumed": 150053322752, "data/tokens_consumed_B": 150.053322752, "train/loss_slope": -3.851162489562888e-06} {"step": 71550, "timestamp": 1778271883.8204193, "geo/rankme_last": 439.15032958984375, "geo/layer_0/stable_rank_q_proj": 19.274433135986328, "geo/layer_0/stable_rank_k_proj": 15.985136032104492, "geo/layer_0/stable_rank_o_proj": 46.7819938659668, "geo/layer_0/stable_rank_gate_proj": 130.00177001953125, "geo/layer_0/stable_rank_down_proj": 56.24094772338867, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06785107403993607, "geo/layer_0/attn_entropy_mean": 6.1521711349487305, "geo/layer_0/attn_entropy_std": 0.42570263147354126, "geo/layer_7/stable_rank_q_proj": 43.033870697021484, "geo/layer_7/stable_rank_k_proj": 40.272037506103516, "geo/layer_7/stable_rank_o_proj": 89.3873291015625, "geo/layer_7/stable_rank_gate_proj": 78.6644287109375, "geo/layer_7/stable_rank_down_proj": 140.05223083496094, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4385862648487091, "geo/layer_7/attn_entropy_mean": 4.647098541259766, "geo/layer_7/attn_entropy_std": 0.8003401756286621, "geo/layer_14/stable_rank_q_proj": 50.43903732299805, "geo/layer_14/stable_rank_k_proj": 41.14609146118164, "geo/layer_14/stable_rank_o_proj": 43.30942916870117, "geo/layer_14/stable_rank_gate_proj": 71.03955078125, "geo/layer_14/stable_rank_down_proj": 127.51558685302734, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3821621537208557, "geo/layer_14/attn_entropy_mean": 5.561515808105469, "geo/layer_14/attn_entropy_std": 0.4205772876739502, "geo/layer_21/stable_rank_q_proj": 39.86528396606445, "geo/layer_21/stable_rank_k_proj": 30.211013793945312, "geo/layer_21/stable_rank_o_proj": 69.02426147460938, "geo/layer_21/stable_rank_gate_proj": 64.2004165649414, "geo/layer_21/stable_rank_down_proj": 49.82533645629883, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14267049729824066, "geo/layer_21/attn_entropy_mean": 5.691818714141846, "geo/layer_21/attn_entropy_std": 0.3081043064594269, "geo/layer_27/stable_rank_q_proj": 43.49003219604492, "geo/layer_27/stable_rank_k_proj": 32.10707092285156, "geo/layer_27/stable_rank_o_proj": 115.57130432128906, "geo/layer_27/stable_rank_gate_proj": 78.2361831665039, "geo/layer_27/stable_rank_down_proj": 128.2802734375, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10737454146146774, "geo/layer_27/attn_entropy_mean": 4.183063507080078, "geo/layer_27/attn_entropy_std": 0.7734370827674866, "attnres/final_alpha/block_0": 0.2364179491996765, "attnres/block_norm/0": 1.7730913162231445, "attnres/final_alpha/block_1": 0.004080777056515217, "attnres/block_norm/1": 48292.96484375, "attnres/final_alpha/block_2": 0.009923105128109455, "attnres/block_norm/2": 28889.654296875, "attnres/final_alpha/block_3": 0.011719546280801296, "attnres/block_norm/3": 60914.77734375, "attnres/final_alpha/block_4": 0.013909739442169666, "attnres/block_norm/4": 15681.1748046875, "attnres/final_alpha/block_5": 0.6170269250869751, "attnres/block_norm/5": 6815.49853515625, "attnres/final_alpha/block_6": 0.1069219782948494, "attnres/block_norm/6": 40919.0625, "geo/tier1_time_s": 1.361114263534546, "geo/step": 71550.0, "geo/rankme_slope": -3.545437706332532e-05} {"step": 71560, "timestamp": 1778271894.1969733, "train/loss": 2.134077858924866, "train/z_loss": 0.0013697879388928414, "train/perplexity": 8.44925151149833, "train/grad_norm": 0.2216796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1786657.155981442, "perf/iters_per_sec": 0.8519445209414682, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.173785352706909, "data/tokens_consumed": 150074294272, "data/tokens_consumed_B": 150.074294272, "train/loss_slope": -4.679205022057105e-06} {"step": 71570, "timestamp": 1778271904.5720003, "train/loss": 2.1831801414489744, "train/z_loss": 0.0013601522543467582, "train/perplexity": 8.874483544053959, "train/grad_norm": 0.1435546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022320.5225514288, "perf/iters_per_sec": 0.964317570949282, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0370027780532838, "data/tokens_consumed": 150095265792, "data/tokens_consumed_B": 150.095265792, "train/loss_slope": 1.509810109199972e-06} {"step": 71580, "timestamp": 1778271914.9493139, "train/loss": 2.100263702869415, "train/z_loss": 0.0013809225638397038, "train/perplexity": 8.16832363896516, "train/grad_norm": 0.2373046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022324.6141565687, "perf/iters_per_sec": 0.9643195219786495, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0370006799697875, "data/tokens_consumed": 150116237312, "data/tokens_consumed_B": 150.116237312, "train/loss_slope": -3.815379759373737e-07} {"step": 71590, "timestamp": 1778271925.320705, "train/loss": 2.107783007621765, "train/z_loss": 0.001388541213236749, "train/perplexity": 8.229975251874043, "train/grad_norm": 0.12109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022930.2147530676, "perf/iters_per_sec": 0.9646082948460901, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0366902351379395, "data/tokens_consumed": 150137208832, "data/tokens_consumed_B": 150.137208832, "train/loss_slope": -2.784111883917178e-06} {"step": 71600, "timestamp": 1778271935.6850722, "grad/layer_0/attn": 0.002756855683401227, "grad/layer_0/mlp": 0.0032463334500789642, "grad/layer_0/attn_mlp_ratio": 0.8492213264204379, "grad/layer_4/attn": 0.002237027045339346, "grad/layer_4/mlp": 0.0026623099111020565, "grad/layer_4/attn_mlp_ratio": 0.840257909864274, "grad/layer_8/attn": 0.0052650123834609985, "grad/layer_8/mlp": 0.0035032671876251698, "grad/layer_8/attn_mlp_ratio": 1.5028862919078316, "grad/layer_12/attn": 0.007737608160823584, "grad/layer_12/mlp": 0.007118189707398415, "grad/layer_12/attn_mlp_ratio": 1.0870190835289846, "grad/layer_16/attn": 0.0038805461954325438, "grad/layer_16/mlp": 0.004822975490242243, "grad/layer_16/attn_mlp_ratio": 0.8045958605479147, "grad/layer_20/attn": 0.0030831315089017153, "grad/layer_20/mlp": 0.006574145518243313, "grad/layer_20/attn_mlp_ratio": 0.46897828066752856, "grad/layer_24/attn": 0.018773222342133522, "grad/layer_24/mlp": 0.010630653239786625, "grad/layer_24/attn_mlp_ratio": 1.7659518885704095, "grad/layer_27/attn": 0.00591528695076704, "grad/layer_27/mlp": 0.010545863769948483, "grad/layer_27/attn_mlp_ratio": 0.5609106113746883} {"step": 71600, "timestamp": 1778271935.7016993, "train/loss": 2.1614962100982664, "train/z_loss": 0.001379607862327248, "train/perplexity": 8.684121212831954, "train/grad_norm": 0.205078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021574.267582494, "perf/iters_per_sec": 0.9639617288506003, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373855829238892, "data/tokens_consumed": 150158180352, "data/tokens_consumed_B": 150.158180352, "train/loss_slope": -1.4079048295226322e-06} {"step": 71610, "timestamp": 1778271946.0767033, "train/loss": 2.101351237297058, "train/z_loss": 0.001376660924870521, "train/perplexity": 8.17721180435557, "train/grad_norm": 0.08935546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022271.564060614, "perf/iters_per_sec": 0.964294225721652, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0370278835296631, "data/tokens_consumed": 150179151872, "data/tokens_consumed_B": 150.179151872, "train/loss_slope": -2.0002649097231406e-06} {"step": 71620, "timestamp": 1778271956.4392784, "train/loss": 2.14915075302124, "train/z_loss": 0.001375765132252127, "train/perplexity": 8.577570827032469, "train/grad_norm": 0.11474609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024791.0954257194, "perf/iters_per_sec": 0.9654956318977925, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0357374668121337, "data/tokens_consumed": 150200123392, "data/tokens_consumed_B": 150.200123392, "train/loss_slope": -2.110425895876392e-06} {"step": 71625, "timestamp": 1778271962.217465, "eos/sharpness": 88.76740932464598, "eos/L0_probe": 1.970319390296936, "eos/L_plus": 2.5087685585021973, "eos/L_minus": 2.3195443153381348, "eos/grad_norm": 0.2628336548805237, "eos/embed_grad_frac": 0.03053947724401951, "eos/time_s": 0.606757402420044} {"step": 71625, "timestamp": 1778271963.6003742, "geo/rankme_last": 438.5191955566406, "geo/layer_0/stable_rank_q_proj": 19.2733097076416, "geo/layer_0/stable_rank_k_proj": 15.969910621643066, "geo/layer_0/stable_rank_o_proj": 46.84203338623047, "geo/layer_0/stable_rank_gate_proj": 129.81680297851562, "geo/layer_0/stable_rank_down_proj": 56.26900863647461, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06370113044977188, "geo/layer_0/attn_entropy_mean": 6.148643970489502, "geo/layer_0/attn_entropy_std": 0.42414161562919617, "geo/layer_7/stable_rank_q_proj": 43.002044677734375, "geo/layer_7/stable_rank_k_proj": 40.370323181152344, "geo/layer_7/stable_rank_o_proj": 89.53486633300781, "geo/layer_7/stable_rank_gate_proj": 78.74300384521484, "geo/layer_7/stable_rank_down_proj": 140.256591796875, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.45069488883018494, "geo/layer_7/attn_entropy_mean": 4.646668910980225, "geo/layer_7/attn_entropy_std": 0.7972238659858704, "geo/layer_14/stable_rank_q_proj": 50.314659118652344, "geo/layer_14/stable_rank_k_proj": 41.12761306762695, "geo/layer_14/stable_rank_o_proj": 43.336978912353516, "geo/layer_14/stable_rank_gate_proj": 71.0374984741211, "geo/layer_14/stable_rank_down_proj": 127.39795684814453, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38750794529914856, "geo/layer_14/attn_entropy_mean": 5.5319085121154785, "geo/layer_14/attn_entropy_std": 0.41944921016693115, "geo/layer_21/stable_rank_q_proj": 39.81010437011719, "geo/layer_21/stable_rank_k_proj": 30.148691177368164, "geo/layer_21/stable_rank_o_proj": 68.95230102539062, "geo/layer_21/stable_rank_gate_proj": 64.26012420654297, "geo/layer_21/stable_rank_down_proj": 49.86307907104492, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14532983303070068, "geo/layer_21/attn_entropy_mean": 5.696629524230957, "geo/layer_21/attn_entropy_std": 0.2946512997150421, "geo/layer_27/stable_rank_q_proj": 43.45024490356445, "geo/layer_27/stable_rank_k_proj": 32.093536376953125, "geo/layer_27/stable_rank_o_proj": 115.78076171875, "geo/layer_27/stable_rank_gate_proj": 78.2247543334961, "geo/layer_27/stable_rank_down_proj": 127.85637664794922, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10083403438329697, "geo/layer_27/attn_entropy_mean": 4.179220199584961, "geo/layer_27/attn_entropy_std": 0.7539014220237732, "attnres/final_alpha/block_0": 0.23498106002807617, "attnres/block_norm/0": 1.7733368873596191, "attnres/final_alpha/block_1": 0.00403592549264431, "attnres/block_norm/1": 48330.13671875, "attnres/final_alpha/block_2": 0.009839856065809727, "attnres/block_norm/2": 29061.515625, "attnres/final_alpha/block_3": 0.011602983810007572, "attnres/block_norm/3": 61132.0625, "attnres/final_alpha/block_4": 0.013856331817805767, "attnres/block_norm/4": 15677.9736328125, "attnres/final_alpha/block_5": 0.6215470433235168, "attnres/block_norm/5": 6750.93359375, "attnres/final_alpha/block_6": 0.10413680970668793, "attnres/block_norm/6": 41194.74609375, "geo/tier1_time_s": 1.361588716506958, "geo/step": 71625.0, "geo/rankme_slope": -5.496417316926771e-05} {"step": 71630, "timestamp": 1778271968.7775927, "train/loss": 2.120288383960724, "train/z_loss": 0.001381825131829828, "train/perplexity": 8.333540400577842, "train/grad_norm": 0.134765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1700655.7745562159, "perf/iters_per_sec": 0.8109358666211204, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.233143138885498, "data/tokens_consumed": 150221094912, "data/tokens_consumed_B": 150.221094912, "train/loss_slope": -3.0217325726751846e-06} {"step": 71640, "timestamp": 1778271979.127438, "train/loss": 2.1628486633300783, "train/z_loss": 0.0013692173291929067, "train/perplexity": 8.695874026405594, "train/grad_norm": 0.08740234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027179.829554605, "perf/iters_per_sec": 0.9666346690915132, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345170021057128, "data/tokens_consumed": 150242066432, "data/tokens_consumed_B": 150.242066432, "train/loss_slope": -1.6479562432638164e-06} {"step": 71650, "timestamp": 1778271989.4629653, "grad/layer_0/attn": 0.003035120666027069, "grad/layer_0/mlp": 0.003079493995755911, "grad/layer_0/attn_mlp_ratio": 0.9855906754976422, "grad/layer_4/attn": 0.0041387625969946384, "grad/layer_4/mlp": 0.0026418452616780996, "grad/layer_4/attn_mlp_ratio": 1.5666180379178958, "grad/layer_8/attn": 0.0033556639682501554, "grad/layer_8/mlp": 0.0038396127056330442, "grad/layer_8/attn_mlp_ratio": 0.8739589479770216, "grad/layer_12/attn": 0.005068965721875429, "grad/layer_12/mlp": 0.00765604991465807, "grad/layer_12/attn_mlp_ratio": 0.6620862862926081, "grad/layer_16/attn": 0.003943941090255976, "grad/layer_16/mlp": 0.005224133376032114, "grad/layer_16/attn_mlp_ratio": 0.7549464630546763, "grad/layer_20/attn": 0.0038514467887580395, "grad/layer_20/mlp": 0.006464151665568352, "grad/layer_20/attn_mlp_ratio": 0.5958162692393727, "grad/layer_24/attn": 0.01114126667380333, "grad/layer_24/mlp": 0.011671645566821098, "grad/layer_24/attn_mlp_ratio": 0.9545583366598017, "grad/layer_27/attn": 0.01340292114764452, "grad/layer_27/mlp": 0.010660676285624504, "grad/layer_27/attn_mlp_ratio": 1.257229903884694} {"step": 71650, "timestamp": 1778271989.4786177, "train/loss": 2.1537800312042235, "train/z_loss": 0.0013890705304220319, "train/perplexity": 8.617370840114283, "train/grad_norm": 0.216796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027137.923362045, "perf/iters_per_sec": 0.9666146866617418, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345383882522583, "data/tokens_consumed": 150263037952, "data/tokens_consumed_B": 150.263037952, "train/loss_slope": -8.288353726272573e-07} {"step": 71660, "timestamp": 1778271999.8262188, "train/loss": 2.1414400577545165, "train/z_loss": 0.0013640947989188134, "train/perplexity": 8.511686127108343, "train/grad_norm": 0.181640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028170.5254204252, "perf/iters_per_sec": 0.9671070696928145, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034011673927307, "data/tokens_consumed": 150284009472, "data/tokens_consumed_B": 150.284009472, "train/loss_slope": -2.9043143982291015e-06} {"step": 71670, "timestamp": 1778272010.2012887, "train/loss": 2.1453848838806153, "train/z_loss": 0.0013727191486395895, "train/perplexity": 8.545329564045266, "train/grad_norm": 0.1162109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022337.6795065512, "perf/iters_per_sec": 0.9643257520230061, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0369939804077148, "data/tokens_consumed": 150304980992, "data/tokens_consumed_B": 150.304980992, "train/loss_slope": -4.62618712509071e-06} {"step": 71680, "timestamp": 1778272020.5833347, "train/loss": 2.1381956815719603, "train/z_loss": 0.001363132381811738, "train/perplexity": 8.484115763862834, "train/grad_norm": 0.2080078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020735.0174071754, "perf/iters_per_sec": 0.9635615431819798, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0378164291381835, "data/tokens_consumed": 150325952512, "data/tokens_consumed_B": 150.325952512, "train/loss_slope": -2.8512767153104232e-06} {"step": 71690, "timestamp": 1778272030.968626, "train/loss": 2.1054686665534974, "train/z_loss": 0.001380192837677896, "train/perplexity": 8.21095030575661, "train/grad_norm": 0.1337890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020852.380410272, "perf/iters_per_sec": 0.9636175062228546, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0377561569213867, "data/tokens_consumed": 150346924032, "data/tokens_consumed_B": 150.346924032, "train/loss_slope": -4.85935281999043e-06} {"step": 71700, "timestamp": 1778272041.333475, "grad/layer_0/attn": 0.0024700236972421408, "grad/layer_0/mlp": 0.0026845119427889585, "grad/layer_0/attn_mlp_ratio": 0.9201015521152275, "grad/layer_4/attn": 0.0023691614624112844, "grad/layer_4/mlp": 0.0024986863136291504, "grad/layer_4/attn_mlp_ratio": 0.9481627824478619, "grad/layer_8/attn": 0.005579317454248667, "grad/layer_8/mlp": 0.0036157204303890467, "grad/layer_8/attn_mlp_ratio": 1.5430720951346149, "grad/layer_12/attn": 0.003839176846668124, "grad/layer_12/mlp": 0.007002701982855797, "grad/layer_12/attn_mlp_ratio": 0.5482422072570101, "grad/layer_16/attn": 0.00487964553758502, "grad/layer_16/mlp": 0.004983517806977034, "grad/layer_16/attn_mlp_ratio": 0.9791568182695622, "grad/layer_20/attn": 0.003425074275583029, "grad/layer_20/mlp": 0.0061558871529996395, "grad/layer_20/attn_mlp_ratio": 0.5563900271100741, "grad/layer_24/attn": 0.01380350161343813, "grad/layer_24/mlp": 0.010470941662788391, "grad/layer_24/attn_mlp_ratio": 1.3182674420454694, "grad/layer_27/attn": 0.006574011407792568, "grad/layer_27/mlp": 0.011631961911916733, "grad/layer_27/attn_mlp_ratio": 0.5651678883629102} {"step": 71700, "timestamp": 1778272041.955139, "eos/sharpness": 87.76438236236571, "eos/L0_probe": 1.9662121534347534, "eos/L_plus": 2.3457186222076416, "eos/L_minus": 2.4643495082855225, "eos/grad_norm": 0.22393898665905, "eos/embed_grad_frac": 0.04100044071674347, "eos/time_s": 0.6189365386962891} {"step": 71700, "timestamp": 1778272041.9742842, "train/loss": 2.1778119087219237, "train/z_loss": 0.0013677109265699983, "train/perplexity": 8.826970894638134, "train/grad_norm": 0.2236328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1906472.4797903267, "perf/iters_per_sec": 0.909076919455684, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1000169277191163, "data/tokens_consumed": 150367895552, "data/tokens_consumed_B": 150.367895552, "train/loss_slope": 2.7570183485290116e-06} {"step": 71700, "timestamp": 1778272043.3382645, "geo/rankme_last": 438.6112060546875, "geo/layer_0/stable_rank_q_proj": 19.251033782958984, "geo/layer_0/stable_rank_k_proj": 15.960182189941406, "geo/layer_0/stable_rank_o_proj": 46.80881881713867, "geo/layer_0/stable_rank_gate_proj": 129.7892608642578, "geo/layer_0/stable_rank_down_proj": 56.30667495727539, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0648985356092453, "geo/layer_0/attn_entropy_mean": 6.151727676391602, "geo/layer_0/attn_entropy_std": 0.42532575130462646, "geo/layer_7/stable_rank_q_proj": 42.948707580566406, "geo/layer_7/stable_rank_k_proj": 40.3781623840332, "geo/layer_7/stable_rank_o_proj": 89.21192169189453, "geo/layer_7/stable_rank_gate_proj": 78.65161895751953, "geo/layer_7/stable_rank_down_proj": 140.05178833007812, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.43617233633995056, "geo/layer_7/attn_entropy_mean": 4.65197229385376, "geo/layer_7/attn_entropy_std": 0.7963069081306458, "geo/layer_14/stable_rank_q_proj": 50.35833740234375, "geo/layer_14/stable_rank_k_proj": 41.12641143798828, "geo/layer_14/stable_rank_o_proj": 43.3513298034668, "geo/layer_14/stable_rank_gate_proj": 71.115234375, "geo/layer_14/stable_rank_down_proj": 127.3332290649414, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39179787039756775, "geo/layer_14/attn_entropy_mean": 5.562690258026123, "geo/layer_14/attn_entropy_std": 0.42661890387535095, "geo/layer_21/stable_rank_q_proj": 39.803443908691406, "geo/layer_21/stable_rank_k_proj": 30.1698055267334, "geo/layer_21/stable_rank_o_proj": 68.8531723022461, "geo/layer_21/stable_rank_gate_proj": 64.22360229492188, "geo/layer_21/stable_rank_down_proj": 49.870296478271484, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14399082958698273, "geo/layer_21/attn_entropy_mean": 5.697049617767334, "geo/layer_21/attn_entropy_std": 0.30513960123062134, "geo/layer_27/stable_rank_q_proj": 43.49312210083008, "geo/layer_27/stable_rank_k_proj": 32.083587646484375, "geo/layer_27/stable_rank_o_proj": 115.6352767944336, "geo/layer_27/stable_rank_gate_proj": 78.23660278320312, "geo/layer_27/stable_rank_down_proj": 127.83484649658203, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09714321047067642, "geo/layer_27/attn_entropy_mean": 4.169872760772705, "geo/layer_27/attn_entropy_std": 0.763515293598175, "attnres/final_alpha/block_0": 0.2398625910282135, "attnres/block_norm/0": 1.773305892944336, "attnres/final_alpha/block_1": 0.004164331126958132, "attnres/block_norm/1": 48104.6171875, "attnres/final_alpha/block_2": 0.01017393171787262, "attnres/block_norm/2": 28983.673828125, "attnres/final_alpha/block_3": 0.012041788548231125, "attnres/block_norm/3": 61128.01171875, "attnres/final_alpha/block_4": 0.014346333220601082, "attnres/block_norm/4": 15719.3828125, "attnres/final_alpha/block_5": 0.6102146506309509, "attnres/block_norm/5": 6904.5166015625, "attnres/final_alpha/block_6": 0.10919636487960815, "attnres/block_norm/6": 40842.23046875, "geo/tier1_time_s": 1.3604936599731445, "geo/step": 71700.0, "geo/rankme_slope": -5.208010938750501e-05} {"step": 71710, "timestamp": 1778272053.7217238, "train/loss": 2.1626015663146974, "train/z_loss": 0.0013602086924947797, "train/perplexity": 8.693725567337383, "train/grad_norm": 0.10986328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1785697.3171131636, "perf/iters_per_sec": 0.8514868341031855, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1744162797927857, "data/tokens_consumed": 150388867072, "data/tokens_consumed_B": 150.388867072, "train/loss_slope": 4.194955528229496e-06} {"step": 71720, "timestamp": 1778272064.07751, "train/loss": 2.1288580656051637, "train/z_loss": 0.0014004985452629626, "train/perplexity": 8.405263070065095, "train/grad_norm": 0.22265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026608.5689126833, "perf/iters_per_sec": 0.9663622707904259, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034808611869812, "data/tokens_consumed": 150409838592, "data/tokens_consumed_B": 150.409838592, "train/loss_slope": 2.0761127841032628e-06} {"step": 71730, "timestamp": 1778272074.432376, "train/loss": 2.165656101703644, "train/z_loss": 0.0013768362114205956, "train/perplexity": 8.720321458110675, "train/grad_norm": 0.09765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026236.9156853864, "perf/iters_per_sec": 0.9661850527216846, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349984169006348, "data/tokens_consumed": 150430810112, "data/tokens_consumed_B": 150.430810112, "train/loss_slope": -1.1331111577191665e-06} {"step": 71740, "timestamp": 1778272084.7839055, "train/loss": 2.141846752166748, "train/z_loss": 0.0013687213533557952, "train/perplexity": 8.515148486308254, "train/grad_norm": 0.310546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026952.2862201494, "perf/iters_per_sec": 0.9665261679745433, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346331357955934, "data/tokens_consumed": 150451781632, "data/tokens_consumed_B": 150.451781632, "train/loss_slope": 2.1549766213194108e-07} {"step": 71750, "timestamp": 1778272095.1257796, "grad/layer_0/attn": 0.003034422639757395, "grad/layer_0/mlp": 0.0032138926908373833, "grad/layer_0/attn_mlp_ratio": 0.9441580156028704, "grad/layer_4/attn": 0.002168241422623396, "grad/layer_4/mlp": 0.0026601655408740044, "grad/layer_4/attn_mlp_ratio": 0.8150775986682598, "grad/layer_8/attn": 0.00661311112344265, "grad/layer_8/mlp": 0.003866432933136821, "grad/layer_8/attn_mlp_ratio": 1.7103907055329135, "grad/layer_12/attn": 0.004528947174549103, "grad/layer_12/mlp": 0.006735812406986952, "grad/layer_12/attn_mlp_ratio": 0.6723683549462367, "grad/layer_16/attn": 0.004012436140328646, "grad/layer_16/mlp": 0.004918180406093597, "grad/layer_16/attn_mlp_ratio": 0.8158375105096816, "grad/layer_20/attn": 0.003108947305008769, "grad/layer_20/mlp": 0.005701375659555197, "grad/layer_20/attn_mlp_ratio": 0.5452977379711103, "grad/layer_24/attn": 0.011319408193230629, "grad/layer_24/mlp": 0.010387909598648548, "grad/layer_24/attn_mlp_ratio": 1.0896714085513535, "grad/layer_27/attn": 0.006841947790235281, "grad/layer_27/mlp": 0.01026779692620039, "grad/layer_27/attn_mlp_ratio": 0.6663501209438255} {"step": 71750, "timestamp": 1778272095.141642, "train/loss": 2.128035306930542, "train/z_loss": 0.001385232270695269, "train/perplexity": 8.398350411076745, "train/grad_norm": 0.185546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026141.1883929505, "perf/iters_per_sec": 0.966139406391597, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350473165512084, "data/tokens_consumed": 150472753152, "data/tokens_consumed_B": 150.472753152, "train/loss_slope": -1.5486861291033165e-06} {"step": 71760, "timestamp": 1778272105.494261, "train/loss": 2.1330809235572814, "train/z_loss": 0.0013762793270871044, "train/perplexity": 8.44083235121353, "train/grad_norm": 0.20703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026691.5921155342, "perf/iters_per_sec": 0.9664018593385383, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347662210464477, "data/tokens_consumed": 150493724672, "data/tokens_consumed_B": 150.493724672, "train/loss_slope": -1.0038908057981024e-06} {"step": 71770, "timestamp": 1778272115.849471, "train/loss": 2.111915957927704, "train/z_loss": 0.0013875316129997373, "train/perplexity": 8.264059716791829, "train/grad_norm": 0.11865234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026104.7387886602, "perf/iters_per_sec": 0.9661220258658696, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350659370422364, "data/tokens_consumed": 150514696192, "data/tokens_consumed_B": 150.514696192, "train/loss_slope": -1.9073597561515307e-06} {"step": 71775, "timestamp": 1778272121.6269543, "eos/sharpness": 78.78360748291014, "eos/L0_probe": 1.9686784744262695, "eos/L_plus": 2.4348719120025635, "eos/L_minus": 2.290321111679077, "eos/grad_norm": 0.21098436415195465, "eos/embed_grad_frac": 0.052684903144836426, "eos/time_s": 0.6112380027770996} {"step": 71775, "timestamp": 1778272123.0028799, "geo/rankme_last": 439.7051696777344, "geo/layer_0/stable_rank_q_proj": 19.238178253173828, "geo/layer_0/stable_rank_k_proj": 15.956456184387207, "geo/layer_0/stable_rank_o_proj": 46.75927734375, "geo/layer_0/stable_rank_gate_proj": 129.2439422607422, "geo/layer_0/stable_rank_down_proj": 56.28828430175781, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.062246646732091904, "geo/layer_0/attn_entropy_mean": 6.150376319885254, "geo/layer_0/attn_entropy_std": 0.4256625175476074, "geo/layer_7/stable_rank_q_proj": 42.98388671875, "geo/layer_7/stable_rank_k_proj": 40.360599517822266, "geo/layer_7/stable_rank_o_proj": 89.24625396728516, "geo/layer_7/stable_rank_gate_proj": 78.69335174560547, "geo/layer_7/stable_rank_down_proj": 140.0401611328125, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.43667227029800415, "geo/layer_7/attn_entropy_mean": 4.641112804412842, "geo/layer_7/attn_entropy_std": 0.7982616424560547, "geo/layer_14/stable_rank_q_proj": 50.36720275878906, "geo/layer_14/stable_rank_k_proj": 41.07716369628906, "geo/layer_14/stable_rank_o_proj": 43.361106872558594, "geo/layer_14/stable_rank_gate_proj": 71.09381103515625, "geo/layer_14/stable_rank_down_proj": 127.0931625366211, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3938518166542053, "geo/layer_14/attn_entropy_mean": 5.544445991516113, "geo/layer_14/attn_entropy_std": 0.40664273500442505, "geo/layer_21/stable_rank_q_proj": 39.824703216552734, "geo/layer_21/stable_rank_k_proj": 30.19381332397461, "geo/layer_21/stable_rank_o_proj": 68.94766998291016, "geo/layer_21/stable_rank_gate_proj": 64.21913146972656, "geo/layer_21/stable_rank_down_proj": 49.80162811279297, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14440709352493286, "geo/layer_21/attn_entropy_mean": 5.684673309326172, "geo/layer_21/attn_entropy_std": 0.30877506732940674, "geo/layer_27/stable_rank_q_proj": 43.50550842285156, "geo/layer_27/stable_rank_k_proj": 32.10211181640625, "geo/layer_27/stable_rank_o_proj": 115.3595962524414, "geo/layer_27/stable_rank_gate_proj": 78.17467498779297, "geo/layer_27/stable_rank_down_proj": 127.89632415771484, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09472106397151947, "geo/layer_27/attn_entropy_mean": 4.177137851715088, "geo/layer_27/attn_entropy_std": 0.7547031044960022, "attnres/final_alpha/block_0": 0.23575985431671143, "attnres/block_norm/0": 1.7733300924301147, "attnres/final_alpha/block_1": 0.004063314758241177, "attnres/block_norm/1": 48189.5859375, "attnres/final_alpha/block_2": 0.0099860318005085, "attnres/block_norm/2": 29008.67578125, "attnres/final_alpha/block_3": 0.011673832312226295, "attnres/block_norm/3": 61691.57421875, "attnres/final_alpha/block_4": 0.013830797746777534, "attnres/block_norm/4": 15634.837890625, "attnres/final_alpha/block_5": 0.6186201572418213, "attnres/block_norm/5": 6812.833984375, "attnres/final_alpha/block_6": 0.10606596618890762, "attnres/block_norm/6": 40937.93359375, "geo/tier1_time_s": 1.3568129539489746, "geo/step": 71775.0, "geo/rankme_slope": 2.017310830582235e-06} {"step": 71780, "timestamp": 1778272128.1807036, "train/loss": 2.172086405754089, "train/z_loss": 0.0013641916099004447, "train/perplexity": 8.776576451018707, "train/grad_norm": 0.11474609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1701510.0862652236, "perf/iters_per_sec": 0.8113432341886633, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2325239896774292, "data/tokens_consumed": 150535667712, "data/tokens_consumed_B": 150.535667712, "train/loss_slope": -1.209574671361012e-06} {"step": 71790, "timestamp": 1778272138.5328681, "train/loss": 2.181761145591736, "train/z_loss": 0.001363052416127175, "train/perplexity": 8.861899619050067, "train/grad_norm": 0.2119140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026763.6008738216, "perf/iters_per_sec": 0.9664361957902058, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347294569015504, "data/tokens_consumed": 150556639232, "data/tokens_consumed_B": 150.556639232, "train/loss_slope": 2.94262527382271e-06} {"step": 71800, "timestamp": 1778272148.8931582, "grad/layer_0/attn": 0.0030632675625383854, "grad/layer_0/mlp": 0.003207107074558735, "grad/layer_0/attn_mlp_ratio": 0.955149733329339, "grad/layer_4/attn": 0.0032082044053822756, "grad/layer_4/mlp": 0.0026315036229789257, "grad/layer_4/attn_mlp_ratio": 1.2191525238469019, "grad/layer_8/attn": 0.008187315426766872, "grad/layer_8/mlp": 0.0037251508329063654, "grad/layer_8/attn_mlp_ratio": 2.1978479729354543, "grad/layer_12/attn": 0.004451936576515436, "grad/layer_12/mlp": 0.006731683854013681, "grad/layer_12/attn_mlp_ratio": 0.6613406997310124, "grad/layer_16/attn": 0.0032833493314683437, "grad/layer_16/mlp": 0.00478252163156867, "grad/layer_16/attn_mlp_ratio": 0.6865309800466716, "grad/layer_20/attn": 0.004479432012885809, "grad/layer_20/mlp": 0.005969741847366095, "grad/layer_20/attn_mlp_ratio": 0.7503560543118913, "grad/layer_24/attn": 0.010215824469923973, "grad/layer_24/mlp": 0.009618879295885563, "grad/layer_24/attn_mlp_ratio": 1.0620597316454297, "grad/layer_27/attn": 0.011180395260453224, "grad/layer_27/mlp": 0.010529063642024994, "grad/layer_27/attn_mlp_ratio": 1.0618603452677895} {"step": 71800, "timestamp": 1778272148.9090168, "train/loss": 2.1327337265014648, "train/z_loss": 0.0013570538139902055, "train/perplexity": 8.437902227767099, "train/grad_norm": 0.18359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022130.188221869, "perf/iters_per_sec": 0.9642268124684663, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0371003866195678, "data/tokens_consumed": 150577610752, "data/tokens_consumed_B": 150.577610752, "train/loss_slope": 1.6298091415167264e-06} {"step": 71810, "timestamp": 1778272159.2990937, "train/loss": 2.135527515411377, "train/z_loss": 0.0013786903698928654, "train/perplexity": 8.461508906117365, "train/grad_norm": 0.140625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019679.038878066, "perf/iters_per_sec": 0.9630580133810358, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0383590459823608, "data/tokens_consumed": 150598582272, "data/tokens_consumed_B": 150.598582272, "train/loss_slope": 3.890108962049017e-06} {"step": 71820, "timestamp": 1778272169.6777222, "train/loss": 2.1064441204071045, "train/z_loss": 0.0013832828728482128, "train/perplexity": 8.218963616546185, "train/grad_norm": 0.12255859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021560.050582593, "perf/iters_per_sec": 0.9639549496567693, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373928785324096, "data/tokens_consumed": 150619553792, "data/tokens_consumed_B": 150.619553792, "train/loss_slope": 2.910460697578102e-06} {"step": 71830, "timestamp": 1778272180.060728, "train/loss": 2.1317585468292237, "train/z_loss": 0.001380771689582616, "train/perplexity": 8.429677767853056, "train/grad_norm": 0.1337890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021083.0607200826, "perf/iters_per_sec": 0.9637275031662381, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037637710571289, "data/tokens_consumed": 150640525312, "data/tokens_consumed_B": 150.640525312, "train/loss_slope": 1.6106350372548127e-06} {"step": 71840, "timestamp": 1778272190.444452, "train/loss": 2.171992468833923, "train/z_loss": 0.00137854206841439, "train/perplexity": 8.775752045178976, "train/grad_norm": 0.1103515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021124.99558728, "perf/iters_per_sec": 0.9637474992691422, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0376161813735962, "data/tokens_consumed": 150661496832, "data/tokens_consumed_B": 150.661496832, "train/loss_slope": 4.460544433101625e-06} {"step": 71850, "timestamp": 1778272200.8135717, "grad/layer_0/attn": 0.0030852872878313065, "grad/layer_0/mlp": 0.0031230284366756678, "grad/layer_0/attn_mlp_ratio": 0.9879151764381457, "grad/layer_4/attn": 0.002693294547498226, "grad/layer_4/mlp": 0.002691837027668953, "grad/layer_4/attn_mlp_ratio": 1.0005414219955184, "grad/layer_8/attn": 0.004908704198896885, "grad/layer_8/mlp": 0.0036438137758523226, "grad/layer_8/attn_mlp_ratio": 1.3471336259590625, "grad/layer_12/attn": 0.0049382769502699375, "grad/layer_12/mlp": 0.006871853955090046, "grad/layer_12/attn_mlp_ratio": 0.718623665561132, "grad/layer_16/attn": 0.003913912456482649, "grad/layer_16/mlp": 0.004559260327368975, "grad/layer_16/attn_mlp_ratio": 0.8584533651527473, "grad/layer_20/attn": 0.004397780168801546, "grad/layer_20/mlp": 0.006188242230564356, "grad/layer_20/attn_mlp_ratio": 0.7106670899231701, "grad/layer_24/attn": 0.016371317207813263, "grad/layer_24/mlp": 0.0130925253033638, "grad/layer_24/attn_mlp_ratio": 1.2504323423811772, "grad/layer_27/attn": 0.004882559180259705, "grad/layer_27/mlp": 0.013402396813035011, "grad/layer_27/attn_mlp_ratio": 0.36430492336120757} {"step": 71850, "timestamp": 1778272201.4283643, "eos/sharpness": 55.95717430114745, "eos/L0_probe": 1.9694325923919678, "eos/L_plus": 2.2526931762695312, "eos/L_minus": 2.245743751525879, "eos/grad_norm": 0.20233511924743652, "eos/embed_grad_frac": 0.12118452042341232, "eos/time_s": 0.6120555400848389} {"step": 71850, "timestamp": 1778272201.4492552, "train/loss": 2.149269366264343, "train/z_loss": 0.0013849950977601112, "train/perplexity": 8.578588300867953, "train/grad_norm": 0.2021484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1906369.2654149435, "perf/iters_per_sec": 0.9090277030062406, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1000764846801758, "data/tokens_consumed": 150682468352, "data/tokens_consumed_B": 150.682468352, "train/loss_slope": 8.255417354822963e-06} {"step": 71850, "timestamp": 1778272202.8104284, "geo/rankme_last": 438.617919921875, "geo/layer_0/stable_rank_q_proj": 19.228473663330078, "geo/layer_0/stable_rank_k_proj": 15.945038795471191, "geo/layer_0/stable_rank_o_proj": 46.7465934753418, "geo/layer_0/stable_rank_gate_proj": 129.0872344970703, "geo/layer_0/stable_rank_down_proj": 56.2388916015625, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06109410896897316, "geo/layer_0/attn_entropy_mean": 6.147877216339111, "geo/layer_0/attn_entropy_std": 0.4257757067680359, "geo/layer_7/stable_rank_q_proj": 42.89466094970703, "geo/layer_7/stable_rank_k_proj": 40.29006576538086, "geo/layer_7/stable_rank_o_proj": 89.36109924316406, "geo/layer_7/stable_rank_gate_proj": 78.6866226196289, "geo/layer_7/stable_rank_down_proj": 139.982421875, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.43092212080955505, "geo/layer_7/attn_entropy_mean": 4.686504364013672, "geo/layer_7/attn_entropy_std": 0.8004017472267151, "geo/layer_14/stable_rank_q_proj": 50.2784423828125, "geo/layer_14/stable_rank_k_proj": 41.068660736083984, "geo/layer_14/stable_rank_o_proj": 43.35158920288086, "geo/layer_14/stable_rank_gate_proj": 71.03890228271484, "geo/layer_14/stable_rank_down_proj": 127.04928588867188, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38725733757019043, "geo/layer_14/attn_entropy_mean": 5.555471897125244, "geo/layer_14/attn_entropy_std": 0.4116628170013428, "geo/layer_21/stable_rank_q_proj": 39.85565185546875, "geo/layer_21/stable_rank_k_proj": 30.26272964477539, "geo/layer_21/stable_rank_o_proj": 68.86560821533203, "geo/layer_21/stable_rank_gate_proj": 64.26759338378906, "geo/layer_21/stable_rank_down_proj": 49.818260192871094, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14064842462539673, "geo/layer_21/attn_entropy_mean": 5.687596797943115, "geo/layer_21/attn_entropy_std": 0.30665189027786255, "geo/layer_27/stable_rank_q_proj": 43.51708221435547, "geo/layer_27/stable_rank_k_proj": 32.134761810302734, "geo/layer_27/stable_rank_o_proj": 115.25357818603516, "geo/layer_27/stable_rank_gate_proj": 78.13459777832031, "geo/layer_27/stable_rank_down_proj": 128.02284240722656, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09982436895370483, "geo/layer_27/attn_entropy_mean": 4.170869827270508, "geo/layer_27/attn_entropy_std": 0.7560389637947083, "attnres/final_alpha/block_0": 0.23528824746608734, "attnres/block_norm/0": 1.77347731590271, "attnres/final_alpha/block_1": 0.004091962240636349, "attnres/block_norm/1": 48252.515625, "attnres/final_alpha/block_2": 0.009829027578234673, "attnres/block_norm/2": 28993.02734375, "attnres/final_alpha/block_3": 0.011741519905626774, "attnres/block_norm/3": 61278.359375, "attnres/final_alpha/block_4": 0.013703620061278343, "attnres/block_norm/4": 15708.384765625, "attnres/final_alpha/block_5": 0.6204438805580139, "attnres/block_norm/5": 6800.54931640625, "attnres/final_alpha/block_6": 0.1049017608165741, "attnres/block_norm/6": 41214.9296875, "geo/tier1_time_s": 1.3570308685302734, "geo/step": 71850.0, "geo/rankme_slope": -9.967365852591038e-06} {"step": 71860, "timestamp": 1778272213.1992629, "train/loss": 2.1008413434028625, "train/z_loss": 0.0013729749363847077, "train/perplexity": 8.173043356808275, "train/grad_norm": 0.1943359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1785381.260057861, "perf/iters_per_sec": 0.8513361263551049, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1746241807937623, "data/tokens_consumed": 150703439872, "data/tokens_consumed_B": 150.703439872, "train/loss_slope": 5.01660382894294e-06} {"step": 71870, "timestamp": 1778272223.5840487, "train/loss": 2.2018236398696898, "train/z_loss": 0.0013580770348198712, "train/perplexity": 9.041486890076749, "train/grad_norm": 0.2060546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020681.4472134716, "perf/iters_per_sec": 0.9635359989230498, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037843942642212, "data/tokens_consumed": 150724411392, "data/tokens_consumed_B": 150.724411392, "train/loss_slope": 8.139203390916335e-06} {"step": 71880, "timestamp": 1778272233.9639435, "train/loss": 2.143426251411438, "train/z_loss": 0.001366131962276995, "train/perplexity": 8.528608784377703, "train/grad_norm": 0.2177734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021227.0304334378, "perf/iters_per_sec": 0.9637961532752217, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0375638008117676, "data/tokens_consumed": 150745382912, "data/tokens_consumed_B": 150.745382912, "train/loss_slope": 9.382339636914e-06} {"step": 71890, "timestamp": 1778272244.3463566, "train/loss": 2.1120370626449585, "train/z_loss": 0.0013806883245706559, "train/perplexity": 8.265060594011459, "train/grad_norm": 0.1484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021077.2094816102, "perf/iters_per_sec": 0.963724713078313, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0376407146453857, "data/tokens_consumed": 150766354432, "data/tokens_consumed_B": 150.766354432, "train/loss_slope": 8.475349137563462e-06} {"step": 71900, "timestamp": 1778272254.7027392, "grad/layer_0/attn": 0.00327088451012969, "grad/layer_0/mlp": 0.003371011232957244, "grad/layer_0/attn_mlp_ratio": 0.9702976902365626, "grad/layer_4/attn": 0.0021113778930157423, "grad/layer_4/mlp": 0.0025805975310504436, "grad/layer_4/attn_mlp_ratio": 0.8181740026461609, "grad/layer_8/attn": 0.0033932202495634556, "grad/layer_8/mlp": 0.0037919196765869856, "grad/layer_8/attn_mlp_ratio": 0.8948554952335015, "grad/layer_12/attn": 0.006251620128750801, "grad/layer_12/mlp": 0.007094753440469503, "grad/layer_12/attn_mlp_ratio": 0.8811610006028614, "grad/layer_16/attn": 0.003426017938181758, "grad/layer_16/mlp": 0.0046872529201209545, "grad/layer_16/attn_mlp_ratio": 0.7309223384090645, "grad/layer_20/attn": 0.003716949140653014, "grad/layer_20/mlp": 0.006459658499807119, "grad/layer_20/attn_mlp_ratio": 0.5754095333713146, "grad/layer_24/attn": 0.013188288547098637, "grad/layer_24/mlp": 0.010958069935441017, "grad/layer_24/attn_mlp_ratio": 1.2035229291695126, "grad/layer_27/attn": 0.010396435856819153, "grad/layer_27/mlp": 0.010652144439518452, "grad/layer_27/attn_mlp_ratio": 0.9759946288983741} {"step": 71900, "timestamp": 1778272254.718757, "train/loss": 2.091530108451843, "train/z_loss": 0.0013911398244090378, "train/perplexity": 8.097295430489938, "train/grad_norm": 0.1728515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022674.0425134636, "perf/iters_per_sec": 0.9644861424033468, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0368215322494507, "data/tokens_consumed": 150787325952, "data/tokens_consumed_B": 150.787325952, "train/loss_slope": 6.659739910930352e-06} {"step": 71910, "timestamp": 1778272265.077153, "train/loss": 2.1047497987747192, "train/z_loss": 0.0013731722254306078, "train/perplexity": 8.205049839230384, "train/grad_norm": 0.11376953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025639.316581946, "perf/iters_per_sec": 0.9659000952634554, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0353037595748902, "data/tokens_consumed": 150808297472, "data/tokens_consumed_B": 150.808297472, "train/loss_slope": 3.883841192976288e-06} {"step": 71920, "timestamp": 1778272275.431329, "train/loss": 2.174443483352661, "train/z_loss": 0.0013727322337217629, "train/perplexity": 8.797287922447135, "train/grad_norm": 0.1591796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026495.4852567906, "perf/iters_per_sec": 0.9663083483013108, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348663568496703, "data/tokens_consumed": 150829268992, "data/tokens_consumed_B": 150.829268992, "train/loss_slope": 7.366770514846772e-06} {"step": 71925, "timestamp": 1778272281.225417, "eos/sharpness": 57.76782035827635, "eos/L0_probe": 1.9661271572113037, "eos/L_plus": 2.2980079650878906, "eos/L_minus": 2.2119245529174805, "eos/grad_norm": 0.16106565296649933, "eos/embed_grad_frac": 0.0799291655421257, "eos/time_s": 0.6237983703613281} {"step": 71925, "timestamp": 1778272282.6035573, "geo/rankme_last": 438.09783935546875, "geo/layer_0/stable_rank_q_proj": 19.208621978759766, "geo/layer_0/stable_rank_k_proj": 15.895793914794922, "geo/layer_0/stable_rank_o_proj": 46.79233932495117, "geo/layer_0/stable_rank_gate_proj": 129.1793670654297, "geo/layer_0/stable_rank_down_proj": 56.252384185791016, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06757219135761261, "geo/layer_0/attn_entropy_mean": 6.148863792419434, "geo/layer_0/attn_entropy_std": 0.4292709529399872, "geo/layer_7/stable_rank_q_proj": 42.88367462158203, "geo/layer_7/stable_rank_k_proj": 40.28938674926758, "geo/layer_7/stable_rank_o_proj": 89.36628723144531, "geo/layer_7/stable_rank_gate_proj": 78.64077758789062, "geo/layer_7/stable_rank_down_proj": 140.01388549804688, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4428250193595886, "geo/layer_7/attn_entropy_mean": 4.680422782897949, "geo/layer_7/attn_entropy_std": 0.7871599793434143, "geo/layer_14/stable_rank_q_proj": 50.32712936401367, "geo/layer_14/stable_rank_k_proj": 41.07024002075195, "geo/layer_14/stable_rank_o_proj": 43.36729049682617, "geo/layer_14/stable_rank_gate_proj": 70.96730041503906, "geo/layer_14/stable_rank_down_proj": 126.93877410888672, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3839450478553772, "geo/layer_14/attn_entropy_mean": 5.535434246063232, "geo/layer_14/attn_entropy_std": 0.4213980436325073, "geo/layer_21/stable_rank_q_proj": 39.92481231689453, "geo/layer_21/stable_rank_k_proj": 30.162960052490234, "geo/layer_21/stable_rank_o_proj": 68.93994903564453, "geo/layer_21/stable_rank_gate_proj": 64.30048370361328, "geo/layer_21/stable_rank_down_proj": 49.8051872253418, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14793695509433746, "geo/layer_21/attn_entropy_mean": 5.702868461608887, "geo/layer_21/attn_entropy_std": 0.3034776449203491, "geo/layer_27/stable_rank_q_proj": 43.5643310546875, "geo/layer_27/stable_rank_k_proj": 32.11817169189453, "geo/layer_27/stable_rank_o_proj": 115.1043701171875, "geo/layer_27/stable_rank_gate_proj": 78.04232788085938, "geo/layer_27/stable_rank_down_proj": 127.81908416748047, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10200434178113937, "geo/layer_27/attn_entropy_mean": 4.1782050132751465, "geo/layer_27/attn_entropy_std": 0.7705073356628418, "attnres/final_alpha/block_0": 0.23478886485099792, "attnres/block_norm/0": 1.7734472751617432, "attnres/final_alpha/block_1": 0.0040290080942213535, "attnres/block_norm/1": 48282.640625, "attnres/final_alpha/block_2": 0.009869039058685303, "attnres/block_norm/2": 28975.921875, "attnres/final_alpha/block_3": 0.011691063642501831, "attnres/block_norm/3": 61363.0390625, "attnres/final_alpha/block_4": 0.013749603182077408, "attnres/block_norm/4": 15717.60546875, "attnres/final_alpha/block_5": 0.6212354898452759, "attnres/block_norm/5": 6822.36376953125, "attnres/final_alpha/block_6": 0.10463689267635345, "attnres/block_norm/6": 41153.828125, "geo/tier1_time_s": 1.3584685325622559, "geo/step": 71925.0, "geo/rankme_slope": -1.3949837747599039e-05} {"step": 71930, "timestamp": 1778272287.7890568, "train/loss": 2.117979109287262, "train/z_loss": 0.0013831673539243638, "train/perplexity": 8.31431817003787, "train/grad_norm": 0.1689453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1697663.7938651873, "perf/iters_per_sec": 0.8095091790510117, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2353164434432984, "data/tokens_consumed": 150850240512, "data/tokens_consumed_B": 150.850240512, "train/loss_slope": 5.526278750731243e-06} {"step": 71940, "timestamp": 1778272298.1447675, "train/loss": 2.17200129032135, "train/z_loss": 0.0013712645624764264, "train/perplexity": 8.775829460706762, "train/grad_norm": 0.111328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026174.1855165367, "perf/iters_per_sec": 0.9661551406462368, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035030460357666, "data/tokens_consumed": 150871212032, "data/tokens_consumed_B": 150.871212032, "train/loss_slope": 9.567501621492423e-06} {"step": 71950, "timestamp": 1778272308.492569, "grad/layer_0/attn": 0.0027834500651806593, "grad/layer_0/mlp": 0.0030601960606873035, "grad/layer_0/attn_mlp_ratio": 0.9095658967677129, "grad/layer_4/attn": 0.004392298404127359, "grad/layer_4/mlp": 0.002684894250705838, "grad/layer_4/attn_mlp_ratio": 1.6359296979311897, "grad/layer_8/attn": 0.004292380064725876, "grad/layer_8/mlp": 0.003703600727021694, "grad/layer_8/attn_mlp_ratio": 1.158974809976392, "grad/layer_12/attn": 0.00608661025762558, "grad/layer_12/mlp": 0.006602137815207243, "grad/layer_12/attn_mlp_ratio": 0.9219150426418379, "grad/layer_16/attn": 0.00401756027713418, "grad/layer_16/mlp": 0.004984330851584673, "grad/layer_16/attn_mlp_ratio": 0.8060380251951115, "grad/layer_20/attn": 0.003462505294010043, "grad/layer_20/mlp": 0.005859436932951212, "grad/layer_20/attn_mlp_ratio": 0.5909279807152548, "grad/layer_24/attn": 0.0060349018312990665, "grad/layer_24/mlp": 0.008389005437493324, "grad/layer_24/attn_mlp_ratio": 0.7193822681755345, "grad/layer_27/attn": 0.0073491851799190044, "grad/layer_27/mlp": 0.007010227534919977, "grad/layer_27/attn_mlp_ratio": 1.0483518599753567} {"step": 71950, "timestamp": 1778272308.508683, "train/loss": 2.1051400542259215, "train/z_loss": 0.0013739228947088123, "train/perplexity": 8.208252529550562, "train/grad_norm": 0.12255859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024743.8349108184, "perf/iters_per_sec": 0.9654730963281719, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0357616424560547, "data/tokens_consumed": 150892183552, "data/tokens_consumed_B": 150.892183552, "train/loss_slope": 1.3011036847684248e-05} {"step": 71960, "timestamp": 1778272318.8664634, "train/loss": 2.14924795627594, "train/z_loss": 0.0013867425266653299, "train/perplexity": 8.57840463535806, "train/grad_norm": 0.125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025912.3383477358, "perf/iters_per_sec": 0.9660302821863822, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351642370224, "data/tokens_consumed": 150913155072, "data/tokens_consumed_B": 150.913155072, "train/loss_slope": 1.341595031676477e-05} {"step": 71970, "timestamp": 1778272329.2190392, "train/loss": 2.102252995967865, "train/z_loss": 0.0013843107502907515, "train/perplexity": 8.184589001729112, "train/grad_norm": 0.251953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026830.5241061936, "perf/iters_per_sec": 0.9664681072741478, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034695291519165, "data/tokens_consumed": 150934126592, "data/tokens_consumed_B": 150.934126592, "train/loss_slope": 7.062992297097306e-06} {"step": 71980, "timestamp": 1778272339.5744846, "train/loss": 2.1186233162879944, "train/z_loss": 0.0013753933482803405, "train/perplexity": 8.31967603761193, "train/grad_norm": 0.15625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026150.8493914148, "perf/iters_per_sec": 0.9661440131146501, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350423812866212, "data/tokens_consumed": 150955098112, "data/tokens_consumed_B": 150.955098112, "train/loss_slope": 4.073524811301485e-06} {"step": 71990, "timestamp": 1778272349.9529042, "train/loss": 2.1868652582168577, "train/z_loss": 0.0013604861800558866, "train/perplexity": 8.907247384379167, "train/grad_norm": 0.1201171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021521.4426799703, "perf/iters_per_sec": 0.9639365399741985, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037412691116333, "data/tokens_consumed": 150976069632, "data/tokens_consumed_B": 150.976069632, "train/loss_slope": 6.8746351172821674e-06} {"step": 72000, "timestamp": 1778272360.3212128, "grad/layer_0/attn": 0.002901459112763405, "grad/layer_0/mlp": 0.0030568407382816076, "grad/layer_0/attn_mlp_ratio": 0.9491691802947941, "grad/layer_4/attn": 0.003359560389071703, "grad/layer_4/mlp": 0.0026031755842268467, "grad/layer_4/attn_mlp_ratio": 1.290562296439668, "grad/layer_8/attn": 0.004534163046628237, "grad/layer_8/mlp": 0.0037652079481631517, "grad/layer_8/attn_mlp_ratio": 1.2042264301544288, "grad/layer_12/attn": 0.004730957094579935, "grad/layer_12/mlp": 0.0071352520026266575, "grad/layer_12/attn_mlp_ratio": 0.6630399355950376, "grad/layer_16/attn": 0.007198088336735964, "grad/layer_16/mlp": 0.004995565861463547, "grad/layer_16/attn_mlp_ratio": 1.4408954645505565, "grad/layer_20/attn": 0.007230628281831741, "grad/layer_20/mlp": 0.006474524270743132, "grad/layer_20/attn_mlp_ratio": 1.1167813831244915, "grad/layer_24/attn": 0.014274988323450089, "grad/layer_24/mlp": 0.010980610735714436, "grad/layer_24/attn_mlp_ratio": 1.300017689090728, "grad/layer_27/attn": 0.009159603156149387, "grad/layer_27/mlp": 0.011905000545084476, "grad/layer_27/attn_mlp_ratio": 0.7693912356007598} {"step": 72000, "timestamp": 1778272360.929099, "eos/sharpness": 82.07929134368895, "eos/L0_probe": 1.9701002836227417, "eos/L_plus": 2.4579710960388184, "eos/L_minus": 2.3030223846435547, "eos/grad_norm": 0.20809373259544373, "eos/embed_grad_frac": 0.05260695889592171, "eos/time_s": 0.605165958404541} {"step": 72000, "timestamp": 1778272360.949653, "train/loss": 2.084067904949188, "train/z_loss": 0.0013876825803890824, "train/perplexity": 8.037096651281592, "train/grad_norm": 0.2080078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1907939.9683336504, "perf/iters_per_sec": 0.9097766725223781, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0991708517074585, "data/tokens_consumed": 150997041152, "data/tokens_consumed_B": 150.997041152, "train/loss_slope": 1.4980633552818871e-06} {"step": 72000, "timestamp": 1778272362.3093836, "geo/rankme_last": 438.2426452636719, "geo/layer_0/stable_rank_q_proj": 19.193517684936523, "geo/layer_0/stable_rank_k_proj": 15.866081237792969, "geo/layer_0/stable_rank_o_proj": 46.773712158203125, "geo/layer_0/stable_rank_gate_proj": 129.2205352783203, "geo/layer_0/stable_rank_down_proj": 56.18466567993164, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06337066739797592, "geo/layer_0/attn_entropy_mean": 6.149317741394043, "geo/layer_0/attn_entropy_std": 0.4307004511356354, "geo/layer_7/stable_rank_q_proj": 42.92424011230469, "geo/layer_7/stable_rank_k_proj": 40.220916748046875, "geo/layer_7/stable_rank_o_proj": 89.4277114868164, "geo/layer_7/stable_rank_gate_proj": 78.60066986083984, "geo/layer_7/stable_rank_down_proj": 139.80909729003906, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4457792341709137, "geo/layer_7/attn_entropy_mean": 4.648974418640137, "geo/layer_7/attn_entropy_std": 0.7977663278579712, "geo/layer_14/stable_rank_q_proj": 50.3326416015625, "geo/layer_14/stable_rank_k_proj": 41.079010009765625, "geo/layer_14/stable_rank_o_proj": 43.462711334228516, "geo/layer_14/stable_rank_gate_proj": 70.91533660888672, "geo/layer_14/stable_rank_down_proj": 126.74497985839844, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4043267071247101, "geo/layer_14/attn_entropy_mean": 5.554125785827637, "geo/layer_14/attn_entropy_std": 0.41633549332618713, "geo/layer_21/stable_rank_q_proj": 39.94115447998047, "geo/layer_21/stable_rank_k_proj": 30.13860511779785, "geo/layer_21/stable_rank_o_proj": 68.94635772705078, "geo/layer_21/stable_rank_gate_proj": 64.38644409179688, "geo/layer_21/stable_rank_down_proj": 49.863651275634766, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14433583617210388, "geo/layer_21/attn_entropy_mean": 5.678823471069336, "geo/layer_21/attn_entropy_std": 0.3014894425868988, "geo/layer_27/stable_rank_q_proj": 43.5870246887207, "geo/layer_27/stable_rank_k_proj": 32.187747955322266, "geo/layer_27/stable_rank_o_proj": 115.17091369628906, "geo/layer_27/stable_rank_gate_proj": 78.02351379394531, "geo/layer_27/stable_rank_down_proj": 127.82230377197266, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09593282639980316, "geo/layer_27/attn_entropy_mean": 4.161066055297852, "geo/layer_27/attn_entropy_std": 0.7654292583465576, "attnres/final_alpha/block_0": 0.23595626652240753, "attnres/block_norm/0": 1.7735356092453003, "attnres/final_alpha/block_1": 0.004008124116808176, "attnres/block_norm/1": 48305.53125, "attnres/final_alpha/block_2": 0.009817296639084816, "attnres/block_norm/2": 29040.3046875, "attnres/final_alpha/block_3": 0.011751976795494556, "attnres/block_norm/3": 61684.703125, "attnres/final_alpha/block_4": 0.01367274671792984, "attnres/block_norm/4": 15667.3017578125, "attnres/final_alpha/block_5": 0.6191886067390442, "attnres/block_norm/5": 6802.642578125, "attnres/final_alpha/block_6": 0.10560497641563416, "attnres/block_norm/6": 40966.9375, "geo/tier1_time_s": 1.3561360836029053, "geo/step": 72000.0, "geo/rankme_slope": -3.653590342386955e-05} {"step": 72000, "timestamp": 1778272369.2286339, "geo/ww_alpha_mean": 7.519795837918537, "geo/ww_alpha_std": 4.364312124773745, "geo/ww_alpha_min": 1.3433436572170578, "geo/ww_alpha_max": 31.29616500246727, "geo/ww_alpha_healthy_frac": 0.17258883248730963, "geo/ww_alpha_by_type/q_proj": 3.940537866639338, "geo/ww_alpha_by_type/k_proj": 4.492562898026785, "geo/ww_alpha_by_type/v_proj": 8.09719283095364, "geo/ww_alpha_by_type/o_proj": 8.744888004560458, "geo/ww_alpha_by_type/gate_proj": 7.735526502638656, "geo/ww_alpha_by_type/up_proj": 11.505091286021258, "geo/ww_alpha_by_type/down_proj": 8.213608842557438, "geo/twonn_id/layer_0": 0.6912036538124084, "geo/twonn_id/layer_7": 3.3187344074249268, "geo/twonn_id/layer_14": 4.603409290313721, "geo/twonn_id/layer_21": 8.400172233581543, "geo/twonn_id/layer_27": 5.481707572937012, "geo/tier2_time_s": 6.91138219833374} {"step": 72000, "timestamp": 1778272369.8746488, "eoc/jacobian_sigma/layer_0/attn": 1146.29931640625, "eoc/jacobian_sigma/layer_0/mlp": 7896.78564453125, "eoc/jacobian_sigma/layer_0": 7896.78564453125, "eoc/jacobian_sigma/layer_7/attn": 1.1442409753799438, "eoc/jacobian_sigma/layer_7/mlp": 1.6845887899398804, "eoc/jacobian_sigma/layer_7": 1.6845887899398804, "eoc/jacobian_sigma/layer_14/attn": 1.4544031620025635, "eoc/jacobian_sigma/layer_14/mlp": 7.048160552978516, "eoc/jacobian_sigma/layer_14": 7.048160552978516, "eoc/jacobian_sigma/layer_21/attn": 1.0815012454986572, "eoc/jacobian_sigma/layer_21/mlp": 4.294646263122559, "eoc/jacobian_sigma/layer_21": 4.294646263122559, "eoc/jacobian_sigma/layer_27/attn": 3.108808994293213, "eoc/jacobian_sigma/layer_27/mlp": 27.44809341430664, "eoc/jacobian_sigma/layer_27": 27.44809341430664, "eoc/layer0_sigma": 7896.78564453125, "eoc/sigma_max": 27.44809341430664, "eoc/sigma_min": 1.6845887899398804, "eoc/sigma_mean": 10.118872255086899, "eoc/time_s": 0.6393401622772217} {"step": 72010, "timestamp": 1778272380.5599008, "train/loss": 2.1185999870300294, "train/z_loss": 0.0013732436578720807, "train/perplexity": 8.319481948007457, "train/grad_norm": 0.1318359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1069616.6577739206, "perf/iters_per_sec": 0.5100329674596408, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.960657572746277, "data/tokens_consumed": 151018012672, "data/tokens_consumed_B": 151.018012672, "train/loss_slope": 3.262318757929914e-07} {"step": 72020, "timestamp": 1778272390.9418166, "train/loss": 2.16369206905365, "train/z_loss": 0.0013808872201479972, "train/perplexity": 8.703211270032623, "train/grad_norm": 0.1826171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021453.6152791597, "perf/iters_per_sec": 0.9639041973491477, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0374475002288819, "data/tokens_consumed": 151038984192, "data/tokens_consumed_B": 151.038984192, "train/loss_slope": 1.9771189794073818e-06} {"step": 72030, "timestamp": 1778272401.321658, "train/loss": 2.101951789855957, "train/z_loss": 0.0013735195272602141, "train/perplexity": 8.182124124734981, "train/grad_norm": 0.154296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021788.3830564432, "perf/iters_per_sec": 0.9640638270647255, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0372757196426392, "data/tokens_consumed": 151059955712, "data/tokens_consumed_B": 151.059955712, "train/loss_slope": 1.1740514976714359e-06} {"step": 72040, "timestamp": 1778272411.7233574, "train/loss": 2.145664119720459, "train/z_loss": 0.0013748934376053511, "train/perplexity": 8.547716059504848, "train/grad_norm": 0.10888671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2017064.3356706281, "perf/iters_per_sec": 0.9618112257340565, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0397050619125365, "data/tokens_consumed": 151080927232, "data/tokens_consumed_B": 151.080927232, "train/loss_slope": 2.477011950996768e-07} {"step": 72050, "timestamp": 1778272422.1004987, "grad/layer_0/attn": 0.00277303927578032, "grad/layer_0/mlp": 0.003096226369962096, "grad/layer_0/attn_mlp_ratio": 0.8956190067757761, "grad/layer_4/attn": 0.002490969840437174, "grad/layer_4/mlp": 0.0027444579172879457, "grad/layer_4/attn_mlp_ratio": 0.9076363437684256, "grad/layer_8/attn": 0.003290274878963828, "grad/layer_8/mlp": 0.003925108350813389, "grad/layer_8/attn_mlp_ratio": 0.8382634314937194, "grad/layer_12/attn": 0.0045698550529778, "grad/layer_12/mlp": 0.007295319344848394, "grad/layer_12/attn_mlp_ratio": 0.6264091774904818, "grad/layer_16/attn": 0.005365510005503893, "grad/layer_16/mlp": 0.004762132186442614, "grad/layer_16/attn_mlp_ratio": 1.1267032671013875, "grad/layer_20/attn": 0.007537551689893007, "grad/layer_20/mlp": 0.006000235211104155, "grad/layer_20/attn_mlp_ratio": 1.2562093483140344, "grad/layer_24/attn": 0.005171413999050856, "grad/layer_24/mlp": 0.007738510612398386, "grad/layer_24/attn_mlp_ratio": 0.6682699283164896, "grad/layer_27/attn": 0.005602422636002302, "grad/layer_27/mlp": 0.00725557841360569, "grad/layer_27/attn_mlp_ratio": 0.7721538159219995} {"step": 72050, "timestamp": 1778272422.116276, "train/loss": 2.135562014579773, "train/z_loss": 0.001376400259323418, "train/perplexity": 8.461800826173473, "train/grad_norm": 0.12109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2019168.4970734417, "perf/iters_per_sec": 0.9628145680777749, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0386215925216675, "data/tokens_consumed": 151101898752, "data/tokens_consumed_B": 151.101898752, "train/loss_slope": 1.7025452778928297e-06} {"step": 72060, "timestamp": 1778272432.5017781, "train/loss": 2.192703592777252, "train/z_loss": 0.001352899894118309, "train/perplexity": 8.959402977373387, "train/grad_norm": 0.11328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020674.484225306, "perf/iters_per_sec": 0.9635326787115602, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0378475189208984, "data/tokens_consumed": 151122870272, "data/tokens_consumed_B": 151.122870272, "train/loss_slope": 5.897043118275166e-06} {"step": 72070, "timestamp": 1778272442.8749151, "train/loss": 2.0940863609313967, "train/z_loss": 0.0013753923121839762, "train/perplexity": 8.118020640160498, "train/grad_norm": 0.0966796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2022664.7867323055, "perf/iters_per_sec": 0.9644817289029625, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0368262767791747, "data/tokens_consumed": 151143841792, "data/tokens_consumed_B": 151.143841792, "train/loss_slope": -3.8240567506912215e-07} {"step": 72075, "timestamp": 1778272448.6719415, "eos/sharpness": 64.95525836944579, "eos/L0_probe": 1.9689122438430786, "eos/L_plus": 2.2683887481689453, "eos/L_minus": 2.31898832321167, "eos/grad_norm": 0.15667277574539185, "eos/embed_grad_frac": 0.09353715181350708, "eos/time_s": 0.6219482421875} {"step": 72075, "timestamp": 1778272450.0523121, "geo/rankme_last": 438.5890197753906, "geo/layer_0/stable_rank_q_proj": 19.166898727416992, "geo/layer_0/stable_rank_k_proj": 15.920099258422852, "geo/layer_0/stable_rank_o_proj": 46.78181457519531, "geo/layer_0/stable_rank_gate_proj": 129.42726135253906, "geo/layer_0/stable_rank_down_proj": 56.22748565673828, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06499769538640976, "geo/layer_0/attn_entropy_mean": 6.146283149719238, "geo/layer_0/attn_entropy_std": 0.43061694502830505, "geo/layer_7/stable_rank_q_proj": 42.89565658569336, "geo/layer_7/stable_rank_k_proj": 40.242679595947266, "geo/layer_7/stable_rank_o_proj": 89.45294952392578, "geo/layer_7/stable_rank_gate_proj": 78.58621978759766, "geo/layer_7/stable_rank_down_proj": 139.7298126220703, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.43637189269065857, "geo/layer_7/attn_entropy_mean": 4.6660051345825195, "geo/layer_7/attn_entropy_std": 0.7883209586143494, "geo/layer_14/stable_rank_q_proj": 50.33968734741211, "geo/layer_14/stable_rank_k_proj": 41.04197311401367, "geo/layer_14/stable_rank_o_proj": 43.47748565673828, "geo/layer_14/stable_rank_gate_proj": 71.03199768066406, "geo/layer_14/stable_rank_down_proj": 126.7809066772461, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3830723464488983, "geo/layer_14/attn_entropy_mean": 5.546915054321289, "geo/layer_14/attn_entropy_std": 0.433633416891098, "geo/layer_21/stable_rank_q_proj": 39.92092514038086, "geo/layer_21/stable_rank_k_proj": 30.134851455688477, "geo/layer_21/stable_rank_o_proj": 68.79277038574219, "geo/layer_21/stable_rank_gate_proj": 64.39403533935547, "geo/layer_21/stable_rank_down_proj": 49.872196197509766, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14334255456924438, "geo/layer_21/attn_entropy_mean": 5.675910472869873, "geo/layer_21/attn_entropy_std": 0.30644944310188293, "geo/layer_27/stable_rank_q_proj": 43.52006912231445, "geo/layer_27/stable_rank_k_proj": 32.075950622558594, "geo/layer_27/stable_rank_o_proj": 115.0330810546875, "geo/layer_27/stable_rank_gate_proj": 77.99498748779297, "geo/layer_27/stable_rank_down_proj": 127.5195541381836, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09736034274101257, "geo/layer_27/attn_entropy_mean": 4.169096946716309, "geo/layer_27/attn_entropy_std": 0.7887089848518372, "attnres/final_alpha/block_0": 0.23528897762298584, "attnres/block_norm/0": 1.7737421989440918, "attnres/final_alpha/block_1": 0.004024064168334007, "attnres/block_norm/1": 48093.875, "attnres/final_alpha/block_2": 0.01014359574764967, "attnres/block_norm/2": 28938.99609375, "attnres/final_alpha/block_3": 0.01180996559560299, "attnres/block_norm/3": 61850.4375, "attnres/final_alpha/block_4": 0.013919460587203503, "attnres/block_norm/4": 15684.3212890625, "attnres/final_alpha/block_5": 0.6170368194580078, "attnres/block_norm/5": 6834.4541015625, "attnres/final_alpha/block_6": 0.10777710378170013, "attnres/block_norm/6": 40923.8046875, "geo/tier1_time_s": 1.358586072921753, "geo/step": 72075.0, "geo/rankme_slope": -2.7157386392056823e-05} {"step": 72080, "timestamp": 1778272455.2355797, "train/loss": 2.1937673091888428, "train/z_loss": 0.0013662537443451584, "train/perplexity": 8.968938311905292, "train/grad_norm": 0.158203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1697550.8267816883, "perf/iters_per_sec": 0.8094553121479455, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2353986501693726, "data/tokens_consumed": 151164813312, "data/tokens_consumed_B": 151.164813312, "train/loss_slope": 2.499453756067483e-06} {"step": 72090, "timestamp": 1778272465.59006, "train/loss": 2.145618748664856, "train/z_loss": 0.001379391027148813, "train/perplexity": 8.547328249401978, "train/grad_norm": 0.212890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026617.1604246588, "perf/iters_per_sec": 0.966366367542581, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348042249679565, "data/tokens_consumed": 151185784832, "data/tokens_consumed_B": 151.185784832, "train/loss_slope": 1.7029901828417284e-06} {"step": 72100, "timestamp": 1778272475.9306238, "grad/layer_0/attn": 0.0031022087205201387, "grad/layer_0/mlp": 0.0033775577321648598, "grad/layer_0/attn_mlp_ratio": 0.9184768624766271, "grad/layer_4/attn": 0.0023805100936442614, "grad/layer_4/mlp": 0.0027378343511372805, "grad/layer_4/attn_mlp_ratio": 0.869486499687887, "grad/layer_8/attn": 0.0034689330495893955, "grad/layer_8/mlp": 0.003789686132222414, "grad/layer_8/attn_mlp_ratio": 0.9153615463185941, "grad/layer_12/attn": 0.008840859867632389, "grad/layer_12/mlp": 0.007538891397416592, "grad/layer_12/attn_mlp_ratio": 1.172700239904229, "grad/layer_16/attn": 0.0035743732005357742, "grad/layer_16/mlp": 0.004794239066541195, "grad/layer_16/attn_mlp_ratio": 0.7455558799571751, "grad/layer_20/attn": 0.004452552180737257, "grad/layer_20/mlp": 0.006785156670957804, "grad/layer_20/attn_mlp_ratio": 0.6562194995693118, "grad/layer_24/attn": 0.01670399308204651, "grad/layer_24/mlp": 0.012675773352384567, "grad/layer_24/attn_mlp_ratio": 1.317788862730436, "grad/layer_27/attn": 0.01074575912207365, "grad/layer_27/mlp": 0.013138673268258572, "grad/layer_27/attn_mlp_ratio": 0.8178724610076761} {"step": 72100, "timestamp": 1778272475.9462683, "train/loss": 2.1296934843063355, "train/z_loss": 0.0013692998909391463, "train/perplexity": 8.41228791795818, "train/grad_norm": 0.26953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026191.2679008858, "perf/iters_per_sec": 0.9661632861618451, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350217342376709, "data/tokens_consumed": 151206756352, "data/tokens_consumed_B": 151.206756352, "train/loss_slope": 1.4733653364688526e-06} {"step": 72110, "timestamp": 1778272486.3065963, "train/loss": 2.1740601897239684, "train/z_loss": 0.0013626685715280474, "train/perplexity": 8.793916624176534, "train/grad_norm": 0.091796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025683.9131948813, "perf/iters_per_sec": 0.9659213605856329, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035280966758728, "data/tokens_consumed": 151227727872, "data/tokens_consumed_B": 151.227727872, "train/loss_slope": 4.12527522703141e-06} {"step": 72120, "timestamp": 1778272496.655846, "train/loss": 2.1638976097106934, "train/z_loss": 0.0013672217493876815, "train/perplexity": 8.705000317650166, "train/grad_norm": 0.134765625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027658.29962364, "perf/iters_per_sec": 0.9668628213995171, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034272885322571, "data/tokens_consumed": 151248699392, "data/tokens_consumed_B": 151.248699392, "train/loss_slope": 7.184395416699298e-06} {"step": 72130, "timestamp": 1778272507.0093045, "train/loss": 2.1749105215072633, "train/z_loss": 0.001381561381276697, "train/perplexity": 8.801397551165945, "train/grad_norm": 0.18359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026405.3822121592, "perf/iters_per_sec": 0.9662653838215634, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034912371635437, "data/tokens_consumed": 151269670912, "data/tokens_consumed_B": 151.269670912, "train/loss_slope": 8.37893679161317e-06} {"step": 72140, "timestamp": 1778272517.3648977, "train/loss": 2.1977246046066283, "train/z_loss": 0.0013627708773128687, "train/perplexity": 9.004501370743954, "train/grad_norm": 0.203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026570.8885221316, "perf/iters_per_sec": 0.9663443033800753, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348278522491454, "data/tokens_consumed": 151290642432, "data/tokens_consumed_B": 151.290642432, "train/loss_slope": 1.06767971165861e-05} {"step": 72150, "timestamp": 1778272527.7046502, "grad/layer_0/attn": 0.0026585133746266365, "grad/layer_0/mlp": 0.002978106029331684, "grad/layer_0/attn_mlp_ratio": 0.8926859081490265, "grad/layer_4/attn": 0.0029238758143037558, "grad/layer_4/mlp": 0.002566620474681258, "grad/layer_4/attn_mlp_ratio": 1.139192852713286, "grad/layer_8/attn": 0.00836107786744833, "grad/layer_8/mlp": 0.003790728747844696, "grad/layer_8/attn_mlp_ratio": 2.2056649797577075, "grad/layer_12/attn": 0.005324709229171276, "grad/layer_12/mlp": 0.006682824809104204, "grad/layer_12/attn_mlp_ratio": 0.7967752113207802, "grad/layer_16/attn": 0.003809473244473338, "grad/layer_16/mlp": 0.0045080408453941345, "grad/layer_16/attn_mlp_ratio": 0.8450396282148851, "grad/layer_20/attn": 0.003099681343883276, "grad/layer_20/mlp": 0.005956833716481924, "grad/layer_20/attn_mlp_ratio": 0.5203571963526644, "grad/layer_24/attn": 0.010652763769030571, "grad/layer_24/mlp": 0.008280199952423573, "grad/layer_24/attn_mlp_ratio": 1.2865345887280297, "grad/layer_27/attn": 0.005515763536095619, "grad/layer_27/mlp": 0.007489291951060295, "grad/layer_27/attn_mlp_ratio": 0.7364866396570456} {"step": 72150, "timestamp": 1778272528.3259587, "eos/sharpness": 63.23614120483397, "eos/L0_probe": 1.9673802852630615, "eos/L_plus": 2.244637966156006, "eos/L_minus": 2.322484016418457, "eos/grad_norm": 0.147884801030159, "eos/embed_grad_frac": 0.106088787317276, "eos/time_s": 0.6184983253479004} {"step": 72150, "timestamp": 1778272528.3455763, "train/loss": 2.1145452857017517, "train/z_loss": 0.0013615257106721402, "train/perplexity": 8.28581722981342, "train/grad_norm": 0.1474609375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1910806.3498421332, "perf/iters_per_sec": 0.9111434697352091, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.097521996498108, "data/tokens_consumed": 151311613952, "data/tokens_consumed_B": 151.311613952, "train/loss_slope": 8.660646082937962e-06} {"step": 72150, "timestamp": 1778272529.7109466, "geo/rankme_last": 438.0518798828125, "geo/layer_0/stable_rank_q_proj": 19.1696834564209, "geo/layer_0/stable_rank_k_proj": 15.896102905273438, "geo/layer_0/stable_rank_o_proj": 46.886962890625, "geo/layer_0/stable_rank_gate_proj": 129.0369873046875, "geo/layer_0/stable_rank_down_proj": 56.230262756347656, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06703825294971466, "geo/layer_0/attn_entropy_mean": 6.145383358001709, "geo/layer_0/attn_entropy_std": 0.430010586977005, "geo/layer_7/stable_rank_q_proj": 42.88393020629883, "geo/layer_7/stable_rank_k_proj": 40.274417877197266, "geo/layer_7/stable_rank_o_proj": 89.4029541015625, "geo/layer_7/stable_rank_gate_proj": 78.55973815917969, "geo/layer_7/stable_rank_down_proj": 139.70079040527344, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4415707290172577, "geo/layer_7/attn_entropy_mean": 4.638754844665527, "geo/layer_7/attn_entropy_std": 0.7979206442832947, "geo/layer_14/stable_rank_q_proj": 50.32538986206055, "geo/layer_14/stable_rank_k_proj": 41.093589782714844, "geo/layer_14/stable_rank_o_proj": 43.5419807434082, "geo/layer_14/stable_rank_gate_proj": 71.10345458984375, "geo/layer_14/stable_rank_down_proj": 126.99862670898438, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.37593546509742737, "geo/layer_14/attn_entropy_mean": 5.529515266418457, "geo/layer_14/attn_entropy_std": 0.4142620265483856, "geo/layer_21/stable_rank_q_proj": 39.91944122314453, "geo/layer_21/stable_rank_k_proj": 30.07476043701172, "geo/layer_21/stable_rank_o_proj": 68.76808166503906, "geo/layer_21/stable_rank_gate_proj": 64.38774108886719, "geo/layer_21/stable_rank_down_proj": 49.87850570678711, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14041118323802948, "geo/layer_21/attn_entropy_mean": 5.7002153396606445, "geo/layer_21/attn_entropy_std": 0.3124138116836548, "geo/layer_27/stable_rank_q_proj": 43.489830017089844, "geo/layer_27/stable_rank_k_proj": 32.092464447021484, "geo/layer_27/stable_rank_o_proj": 114.85435485839844, "geo/layer_27/stable_rank_gate_proj": 78.05864715576172, "geo/layer_27/stable_rank_down_proj": 127.47601318359375, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09832362085580826, "geo/layer_27/attn_entropy_mean": 4.170841217041016, "geo/layer_27/attn_entropy_std": 0.7601235508918762, "attnres/final_alpha/block_0": 0.23714467883110046, "attnres/block_norm/0": 1.7737171649932861, "attnres/final_alpha/block_1": 0.004125035367906094, "attnres/block_norm/1": 48111.5078125, "attnres/final_alpha/block_2": 0.010017560794949532, "attnres/block_norm/2": 28891.5078125, "attnres/final_alpha/block_3": 0.011736325919628143, "attnres/block_norm/3": 61737.62890625, "attnres/final_alpha/block_4": 0.013926065526902676, "attnres/block_norm/4": 15752.21484375, "attnres/final_alpha/block_5": 0.6147546768188477, "attnres/block_norm/5": 6869.392578125, "attnres/final_alpha/block_6": 0.10829564183950424, "attnres/block_norm/6": 40896.1875, "geo/tier1_time_s": 1.3609874248504639, "geo/step": 72150.0, "geo/rankme_slope": -4.4329763155262104e-05} {"step": 72160, "timestamp": 1778272540.5960698, "train/loss": 2.146531343460083, "train/z_loss": 0.001363424491137266, "train/perplexity": 8.555132056990969, "train/grad_norm": 0.12060546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1712472.4227315343, "perf/iters_per_sec": 0.8165704835565254, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2246340274810792, "data/tokens_consumed": 151332585472, "data/tokens_consumed_B": 151.332585472, "train/loss_slope": 8.317592563909613e-06} {"step": 72170, "timestamp": 1778272550.957675, "train/loss": 2.1121871709823608, "train/z_loss": 0.0013830412761308252, "train/perplexity": 8.266301341636707, "train/grad_norm": 0.181640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024912.6126177104, "perf/iters_per_sec": 0.9655535758102943, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035675311088562, "data/tokens_consumed": 151353556992, "data/tokens_consumed_B": 151.353556992, "train/loss_slope": 6.787307005618219e-06} {"step": 72180, "timestamp": 1778272561.3079507, "train/loss": 2.0940425276756285, "train/z_loss": 0.001379175391048193, "train/perplexity": 8.117664808684129, "train/grad_norm": 0.1298828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027455.7435611847, "perf/iters_per_sec": 0.9667662351423191, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343762159347534, "data/tokens_consumed": 151374528512, "data/tokens_consumed_B": 151.374528512, "train/loss_slope": 3.8107389664099562e-06} {"step": 72190, "timestamp": 1778272571.6613185, "train/loss": 2.1688066720962524, "train/z_loss": 0.0013832253287546336, "train/perplexity": 8.747838769562383, "train/grad_norm": 0.2177734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026849.8593451122, "perf/iters_per_sec": 0.9664773270345269, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346854209899903, "data/tokens_consumed": 151395500032, "data/tokens_consumed_B": 151.395500032, "train/loss_slope": 5.491321386605706e-06} {"step": 72200, "timestamp": 1778272582.0013318, "grad/layer_0/attn": 0.003164461348205805, "grad/layer_0/mlp": 0.003340725088492036, "grad/layer_0/attn_mlp_ratio": 0.9472378509631932, "grad/layer_4/attn": 0.0020484160631895065, "grad/layer_4/mlp": 0.0026375767774879932, "grad/layer_4/attn_mlp_ratio": 0.7766280030254137, "grad/layer_8/attn": 0.008139071054756641, "grad/layer_8/mlp": 0.0037063201889395714, "grad/layer_8/attn_mlp_ratio": 2.195997760648295, "grad/layer_12/attn": 0.005285928957164288, "grad/layer_12/mlp": 0.007581259589642286, "grad/layer_12/attn_mlp_ratio": 0.697236234287827, "grad/layer_16/attn": 0.0034382170997560024, "grad/layer_16/mlp": 0.0047094328328967094, "grad/layer_16/attn_mlp_ratio": 0.7300702969436282, "grad/layer_20/attn": 0.003513665171340108, "grad/layer_20/mlp": 0.00571795366704464, "grad/layer_20/attn_mlp_ratio": 0.6144969537163936, "grad/layer_24/attn": 0.007591418456286192, "grad/layer_24/mlp": 0.007868493907153606, "grad/layer_24/attn_mlp_ratio": 0.9647867113305911, "grad/layer_27/attn": 0.004987264983355999, "grad/layer_27/mlp": 0.006569108460098505, "grad/layer_27/attn_mlp_ratio": 0.7591996596995216} {"step": 72200, "timestamp": 1778272582.0168402, "train/loss": 2.122411775588989, "train/z_loss": 0.001369974494446069, "train/perplexity": 8.351254570902931, "train/grad_norm": 0.10205078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026572.6160930328, "perf/iters_per_sec": 0.9663451271500744, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034826970100403, "data/tokens_consumed": 151416471552, "data/tokens_consumed_B": 151.416471552, "train/loss_slope": 7.430911779475244e-06} {"step": 72210, "timestamp": 1778272592.3687084, "train/loss": 2.1020927906036375, "train/z_loss": 0.0013605721178464592, "train/perplexity": 8.183277891693272, "train/grad_norm": 0.08984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027329.2483422102, "perf/iters_per_sec": 0.966705917521577, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344407558441162, "data/tokens_consumed": 151437443072, "data/tokens_consumed_B": 151.437443072, "train/loss_slope": 4.196351010127722e-06} {"step": 72220, "timestamp": 1778272602.7182713, "train/loss": 2.0666579008102417, "train/z_loss": 0.0013716864050365984, "train/perplexity": 7.898381782323015, "train/grad_norm": 0.2265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027877.9602493092, "perf/iters_per_sec": 0.9669675637480303, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034160852432251, "data/tokens_consumed": 151458414592, "data/tokens_consumed_B": 151.458414592, "train/loss_slope": 2.448205614533475e-06} {"step": 72225, "timestamp": 1778272608.4896908, "eos/sharpness": 87.50619888305663, "eos/L0_probe": 1.9733755588531494, "eos/L_plus": 2.3365180492401123, "eos/L_minus": 2.485295057296753, "eos/grad_norm": 0.22830545902252197, "eos/embed_grad_frac": 0.04333537817001343, "eos/time_s": 0.6057052612304688} {"step": 72225, "timestamp": 1778272609.8700917, "geo/rankme_last": 437.4609375, "geo/layer_0/stable_rank_q_proj": 19.129234313964844, "geo/layer_0/stable_rank_k_proj": 15.887520790100098, "geo/layer_0/stable_rank_o_proj": 46.90221405029297, "geo/layer_0/stable_rank_gate_proj": 129.33253479003906, "geo/layer_0/stable_rank_down_proj": 56.24637985229492, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.062456000596284866, "geo/layer_0/attn_entropy_mean": 6.143185615539551, "geo/layer_0/attn_entropy_std": 0.4271462559700012, "geo/layer_7/stable_rank_q_proj": 42.94977569580078, "geo/layer_7/stable_rank_k_proj": 40.351654052734375, "geo/layer_7/stable_rank_o_proj": 89.42203521728516, "geo/layer_7/stable_rank_gate_proj": 78.5176010131836, "geo/layer_7/stable_rank_down_proj": 139.88278198242188, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.44526705145835876, "geo/layer_7/attn_entropy_mean": 4.656410217285156, "geo/layer_7/attn_entropy_std": 0.7795577645301819, "geo/layer_14/stable_rank_q_proj": 50.171119689941406, "geo/layer_14/stable_rank_k_proj": 41.10087966918945, "geo/layer_14/stable_rank_o_proj": 43.54293441772461, "geo/layer_14/stable_rank_gate_proj": 71.10179138183594, "geo/layer_14/stable_rank_down_proj": 126.93021392822266, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3849581778049469, "geo/layer_14/attn_entropy_mean": 5.532840728759766, "geo/layer_14/attn_entropy_std": 0.4457085132598877, "geo/layer_21/stable_rank_q_proj": 39.86758041381836, "geo/layer_21/stable_rank_k_proj": 30.053674697875977, "geo/layer_21/stable_rank_o_proj": 68.66673278808594, "geo/layer_21/stable_rank_gate_proj": 64.33747863769531, "geo/layer_21/stable_rank_down_proj": 49.85426330566406, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14327287673950195, "geo/layer_21/attn_entropy_mean": 5.675406455993652, "geo/layer_21/attn_entropy_std": 0.3065492510795593, "geo/layer_27/stable_rank_q_proj": 43.64267349243164, "geo/layer_27/stable_rank_k_proj": 32.11904525756836, "geo/layer_27/stable_rank_o_proj": 114.9140625, "geo/layer_27/stable_rank_gate_proj": 78.00442504882812, "geo/layer_27/stable_rank_down_proj": 127.53134155273438, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0972750335931778, "geo/layer_27/attn_entropy_mean": 4.187593460083008, "geo/layer_27/attn_entropy_std": 0.7789698839187622, "attnres/final_alpha/block_0": 0.2368064522743225, "attnres/block_norm/0": 1.7738440036773682, "attnres/final_alpha/block_1": 0.004122412763535976, "attnres/block_norm/1": 48380.2890625, "attnres/final_alpha/block_2": 0.010039115324616432, "attnres/block_norm/2": 28961.609375, "attnres/final_alpha/block_3": 0.011721670627593994, "attnres/block_norm/3": 61455.9609375, "attnres/final_alpha/block_4": 0.01393205113708973, "attnres/block_norm/4": 15709.4228515625, "attnres/final_alpha/block_5": 0.6147410273551941, "attnres/block_norm/5": 6878.56005859375, "attnres/final_alpha/block_6": 0.1086372658610344, "attnres/block_norm/6": 41160.9765625, "geo/tier1_time_s": 1.3596327304840088, "geo/step": 72225.0, "geo/rankme_slope": -8.773050235719287e-05} {"step": 72230, "timestamp": 1778272615.0522332, "train/loss": 2.1085479617118836, "train/z_loss": 0.0013666090904735028, "train/perplexity": 8.236273213623232, "train/grad_norm": 0.1396484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1701217.268852981, "perf/iters_per_sec": 0.8112036079659372, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2327361345291137, "data/tokens_consumed": 151479386112, "data/tokens_consumed_B": 151.479386112, "train/loss_slope": 2.122927427363404e-06} {"step": 72240, "timestamp": 1778272625.40146, "train/loss": 2.146969723701477, "train/z_loss": 0.0013671148684807123, "train/perplexity": 8.558883280018224, "train/grad_norm": 0.138671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027309.016178958, "perf/iters_per_sec": 0.9666962700743474, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344510793685913, "data/tokens_consumed": 151500357632, "data/tokens_consumed_B": 151.500357632, "train/loss_slope": 3.9977538918289986e-06} {"step": 72250, "timestamp": 1778272635.7412806, "grad/layer_0/attn": 0.0024626164231449366, "grad/layer_0/mlp": 0.002890530973672867, "grad/layer_0/attn_mlp_ratio": 0.8519598511064617, "grad/layer_4/attn": 0.0021211926359683275, "grad/layer_4/mlp": 0.002670852467417717, "grad/layer_4/attn_mlp_ratio": 0.7942005716995372, "grad/layer_8/attn": 0.0034673491027206182, "grad/layer_8/mlp": 0.0037780110724270344, "grad/layer_8/attn_mlp_ratio": 0.9177710029092255, "grad/layer_12/attn": 0.004165405873209238, "grad/layer_12/mlp": 0.007107459474354982, "grad/layer_12/attn_mlp_ratio": 0.5860611417669945, "grad/layer_16/attn": 0.004231675062328577, "grad/layer_16/mlp": 0.00465963501483202, "grad/layer_16/attn_mlp_ratio": 0.9081558873266259, "grad/layer_20/attn": 0.002936920616775751, "grad/layer_20/mlp": 0.005904620047658682, "grad/layer_20/attn_mlp_ratio": 0.4973936584117622, "grad/layer_24/attn": 0.012425687164068222, "grad/layer_24/mlp": 0.010750719346106052, "grad/layer_24/attn_mlp_ratio": 1.1558005235239255, "grad/layer_27/attn": 0.007412679027765989, "grad/layer_27/mlp": 0.010564848780632019, "grad/layer_27/attn_mlp_ratio": 0.7016360680137379} {"step": 72250, "timestamp": 1778272635.7571387, "train/loss": 2.1379800915718077, "train/z_loss": 0.0013839236344210804, "train/perplexity": 8.482286870496653, "train/grad_norm": 0.181640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026384.2348347155, "perf/iters_per_sec": 0.9662552999661996, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349231719970704, "data/tokens_consumed": 151521329152, "data/tokens_consumed_B": 151.521329152, "train/loss_slope": 2.32707027769503e-06} {"step": 72260, "timestamp": 1778272646.1172774, "train/loss": 2.182502508163452, "train/z_loss": 0.0013713588356040417, "train/perplexity": 8.868471935675695, "train/grad_norm": 0.1328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025476.947388908, "perf/iters_per_sec": 0.9658226715988674, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0353867530822753, "data/tokens_consumed": 151542300672, "data/tokens_consumed_B": 151.542300672, "train/loss_slope": 3.2851152437211694e-06} {"step": 72270, "timestamp": 1778272656.464879, "train/loss": 2.1272459030151367, "train/z_loss": 0.001384931628126651, "train/perplexity": 8.391723336442771, "train/grad_norm": 0.234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027590.2000031995, "perf/iters_per_sec": 0.9668303489700315, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343076229095458, "data/tokens_consumed": 151563272192, "data/tokens_consumed_B": 151.563272192, "train/loss_slope": 3.141579004225324e-06} {"step": 72280, "timestamp": 1778272666.8202178, "train/loss": 2.058840978145599, "train/z_loss": 0.0014018599409610032, "train/perplexity": 7.836881427673767, "train/grad_norm": 0.09130859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026407.2962349907, "perf/iters_per_sec": 0.966266296498771, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349113941192627, "data/tokens_consumed": 151584243712, "data/tokens_consumed_B": 151.584243712, "train/loss_slope": -4.85432405497557e-06} {"step": 72290, "timestamp": 1778272677.1744113, "train/loss": 2.170328426361084, "train/z_loss": 0.0013599496567621827, "train/perplexity": 8.761160964500615, "train/grad_norm": 0.1005859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026732.0322020259, "perf/iters_per_sec": 0.9664211426744584, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347455739974976, "data/tokens_consumed": 151605215232, "data/tokens_consumed_B": 151.605215232, "train/loss_slope": -3.6422270609744326e-06} {"step": 72300, "timestamp": 1778272687.5177495, "grad/layer_0/attn": 0.003765161382034421, "grad/layer_0/mlp": 0.0034456735011190176, "grad/layer_0/attn_mlp_ratio": 1.0927214292182656, "grad/layer_4/attn": 0.0028450365643948317, "grad/layer_4/mlp": 0.002637991216033697, "grad/layer_4/attn_mlp_ratio": 1.078485947661281, "grad/layer_8/attn": 0.01033215131610632, "grad/layer_8/mlp": 0.003794820513576269, "grad/layer_8/attn_mlp_ratio": 2.7226982163905813, "grad/layer_12/attn": 0.005503241438418627, "grad/layer_12/mlp": 0.007729697972536087, "grad/layer_12/attn_mlp_ratio": 0.7119607243097702, "grad/layer_16/attn": 0.004251738544553518, "grad/layer_16/mlp": 0.005082434043288231, "grad/layer_16/attn_mlp_ratio": 0.8365555607185361, "grad/layer_20/attn": 0.0039050152990967035, "grad/layer_20/mlp": 0.0069292825646698475, "grad/layer_20/attn_mlp_ratio": 0.5635526053810884, "grad/layer_24/attn": 0.01731194742023945, "grad/layer_24/mlp": 0.01292337104678154, "grad/layer_24/attn_mlp_ratio": 1.33958448021133, "grad/layer_27/attn": 0.01200879830867052, "grad/layer_27/mlp": 0.013211148791015148, "grad/layer_27/attn_mlp_ratio": 0.9089897031466854} {"step": 72300, "timestamp": 1778272688.1273053, "eos/sharpness": 79.8431634902954, "eos/L0_probe": 1.970089316368103, "eos/L_plus": 2.3188302516937256, "eos/L_minus": 2.4197800159454346, "eos/grad_norm": 0.25434938073158264, "eos/embed_grad_frac": 0.03733859583735466, "eos/time_s": 0.6068358421325684} {"step": 72300, "timestamp": 1778272688.1485934, "train/loss": 2.1043034553527833, "train/z_loss": 0.0013778852066025139, "train/perplexity": 8.201388386401467, "train/grad_norm": 0.25390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1911796.5231465881, "perf/iters_per_sec": 0.911615621159834, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.096953558921814, "data/tokens_consumed": 151626186752, "data/tokens_consumed_B": 151.626186752, "train/loss_slope": -4.458495092005746e-06} {"step": 72300, "timestamp": 1778272689.5126686, "geo/rankme_last": 437.60009765625, "geo/layer_0/stable_rank_q_proj": 19.15191078186035, "geo/layer_0/stable_rank_k_proj": 15.885865211486816, "geo/layer_0/stable_rank_o_proj": 46.968360900878906, "geo/layer_0/stable_rank_gate_proj": 129.32736206054688, "geo/layer_0/stable_rank_down_proj": 56.220481872558594, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.059499941766262054, "geo/layer_0/attn_entropy_mean": 6.14765739440918, "geo/layer_0/attn_entropy_std": 0.4262126088142395, "geo/layer_7/stable_rank_q_proj": 42.926727294921875, "geo/layer_7/stable_rank_k_proj": 40.26728439331055, "geo/layer_7/stable_rank_o_proj": 89.52340698242188, "geo/layer_7/stable_rank_gate_proj": 78.57603454589844, "geo/layer_7/stable_rank_down_proj": 139.78334045410156, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.450624942779541, "geo/layer_7/attn_entropy_mean": 4.661251068115234, "geo/layer_7/attn_entropy_std": 0.7902846336364746, "geo/layer_14/stable_rank_q_proj": 50.065914154052734, "geo/layer_14/stable_rank_k_proj": 41.093509674072266, "geo/layer_14/stable_rank_o_proj": 43.556068420410156, "geo/layer_14/stable_rank_gate_proj": 71.12609100341797, "geo/layer_14/stable_rank_down_proj": 127.16284942626953, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38660675287246704, "geo/layer_14/attn_entropy_mean": 5.529122352600098, "geo/layer_14/attn_entropy_std": 0.423805832862854, "geo/layer_21/stable_rank_q_proj": 39.85496139526367, "geo/layer_21/stable_rank_k_proj": 30.05600929260254, "geo/layer_21/stable_rank_o_proj": 68.72315979003906, "geo/layer_21/stable_rank_gate_proj": 64.29551696777344, "geo/layer_21/stable_rank_down_proj": 49.8121337890625, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14431661367416382, "geo/layer_21/attn_entropy_mean": 5.689342498779297, "geo/layer_21/attn_entropy_std": 0.3074028491973877, "geo/layer_27/stable_rank_q_proj": 43.60883331298828, "geo/layer_27/stable_rank_k_proj": 32.110618591308594, "geo/layer_27/stable_rank_o_proj": 114.78596496582031, "geo/layer_27/stable_rank_gate_proj": 77.94273376464844, "geo/layer_27/stable_rank_down_proj": 127.49188995361328, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09836630523204803, "geo/layer_27/attn_entropy_mean": 4.173244476318359, "geo/layer_27/attn_entropy_std": 0.7923988699913025, "attnres/final_alpha/block_0": 0.23665550351142883, "attnres/block_norm/0": 1.7738595008850098, "attnres/final_alpha/block_1": 0.004003222566097975, "attnres/block_norm/1": 48354.3984375, "attnres/final_alpha/block_2": 0.010018667206168175, "attnres/block_norm/2": 28972.81640625, "attnres/final_alpha/block_3": 0.011860162019729614, "attnres/block_norm/3": 61741.4375, "attnres/final_alpha/block_4": 0.013972467742860317, "attnres/block_norm/4": 15737.0625, "attnres/final_alpha/block_5": 0.6164518594741821, "attnres/block_norm/5": 6845.548828125, "attnres/final_alpha/block_6": 0.10703812539577484, "attnres/block_norm/6": 41137.484375, "geo/tier1_time_s": 1.3603980541229248, "geo/step": 72300.0, "geo/rankme_slope": -0.00012234669258328332} {"step": 72310, "timestamp": 1778272699.889944, "train/loss": 2.0966604471206667, "train/z_loss": 0.0013764083734713495, "train/perplexity": 8.138944042742288, "train/grad_norm": 0.095703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1786690.5801326619, "perf/iters_per_sec": 0.8519604588187513, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.173763394355774, "data/tokens_consumed": 151647158272, "data/tokens_consumed_B": 151.647158272, "train/loss_slope": -4.248258819316891e-06} {"step": 72320, "timestamp": 1778272710.2614603, "train/loss": 2.1118170022964478, "train/z_loss": 0.0013620578567497433, "train/perplexity": 8.263241982006212, "train/grad_norm": 0.185546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023123.7708567458, "perf/iters_per_sec": 0.9647005895885209, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0365910530090332, "data/tokens_consumed": 151668129792, "data/tokens_consumed_B": 151.668129792, "train/loss_slope": -8.85518695941179e-06} {"step": 72330, "timestamp": 1778272721.0373085, "train/loss": 2.1806523323059084, "train/z_loss": 0.0013692995999008417, "train/perplexity": 8.852078872707997, "train/grad_norm": 0.1962890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1947489.5036181607, "perf/iters_per_sec": 0.9286353605356983, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0768489360809326, "data/tokens_consumed": 151689101312, "data/tokens_consumed_B": 151.689101312, "train/loss_slope": -7.982399533040345e-06} {"step": 72340, "timestamp": 1778272731.8890758, "train/loss": 2.118153429031372, "train/z_loss": 0.0013656611437909306, "train/perplexity": 8.315767646186206, "train/grad_norm": 0.2041015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1933840.367212967, "perf/iters_per_sec": 0.9221269451203189, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0844493865966798, "data/tokens_consumed": 151710072832, "data/tokens_consumed_B": 151.710072832, "train/loss_slope": -1.0931610681972763e-05} {"step": 72350, "timestamp": 1778272742.2338216, "grad/layer_0/attn": 0.002794352127239108, "grad/layer_0/mlp": 0.0030101556330919266, "grad/layer_0/attn_mlp_ratio": 0.9283081591160227, "grad/layer_4/attn": 0.002595277735963464, "grad/layer_4/mlp": 0.002614902565255761, "grad/layer_4/attn_mlp_ratio": 0.9924949675744899, "grad/layer_8/attn": 0.004203193821012974, "grad/layer_8/mlp": 0.0036144061014056206, "grad/layer_8/attn_mlp_ratio": 1.1629002350035746, "grad/layer_12/attn": 0.005162729881703854, "grad/layer_12/mlp": 0.007487238384783268, "grad/layer_12/attn_mlp_ratio": 0.6895372562522695, "grad/layer_16/attn": 0.003989679738879204, "grad/layer_16/mlp": 0.005173738580197096, "grad/layer_16/attn_mlp_ratio": 0.7711405591762929, "grad/layer_20/attn": 0.0034340471029281616, "grad/layer_20/mlp": 0.006658324971795082, "grad/layer_20/attn_mlp_ratio": 0.5157523950692818, "grad/layer_24/attn": 0.016071656718850136, "grad/layer_24/mlp": 0.012060838751494884, "grad/layer_24/attn_mlp_ratio": 1.3325488315315754, "grad/layer_27/attn": 0.0060960594564676285, "grad/layer_27/mlp": 0.013254784047603607, "grad/layer_27/attn_mlp_ratio": 0.45991389890492956} {"step": 72350, "timestamp": 1778272742.2496653, "train/loss": 2.1907797336578367, "train/z_loss": 0.0013579785241745411, "train/perplexity": 8.942182918046225, "train/grad_norm": 0.2373046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025487.7680825375, "perf/iters_per_sec": 0.9658278313076675, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0353812217712401, "data/tokens_consumed": 151731044352, "data/tokens_consumed_B": 151.731044352, "train/loss_slope": -9.827008191579992e-06} {"step": 72360, "timestamp": 1778272752.6071892, "train/loss": 2.1326822519302366, "train/z_loss": 0.0013684890465810895, "train/perplexity": 8.437467901546333, "train/grad_norm": 0.255859375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026227.3472166217, "perf/iters_per_sec": 0.9661804901202306, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350033044815063, "data/tokens_consumed": 151752015872, "data/tokens_consumed_B": 151.752015872, "train/loss_slope": -1.0739059059104192e-05} {"step": 72370, "timestamp": 1778272762.9578416, "train/loss": 2.1231356382369997, "train/z_loss": 0.001376550563145429, "train/perplexity": 8.357301920612066, "train/grad_norm": 0.09521484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027062.804891991, "perf/iters_per_sec": 0.9665788673839526, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345767259597778, "data/tokens_consumed": 151772987392, "data/tokens_consumed_B": 151.772987392, "train/loss_slope": -1.117972021449126e-05} {"step": 72375, "timestamp": 1778272768.7309017, "eos/sharpness": 33.71284008026122, "eos/L0_probe": 1.9696261882781982, "eos/L_plus": 2.1732993125915527, "eos/L_minus": 2.103081464767456, "eos/grad_norm": 0.10515797883272171, "eos/embed_grad_frac": 0.1938757300376892, "eos/time_s": 0.6054596900939941} {"step": 72375, "timestamp": 1778272770.1087599, "geo/rankme_last": 439.5200500488281, "geo/layer_0/stable_rank_q_proj": 19.15151023864746, "geo/layer_0/stable_rank_k_proj": 15.87492561340332, "geo/layer_0/stable_rank_o_proj": 46.931396484375, "geo/layer_0/stable_rank_gate_proj": 129.4090118408203, "geo/layer_0/stable_rank_down_proj": 56.243778228759766, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06291341781616211, "geo/layer_0/attn_entropy_mean": 6.144654273986816, "geo/layer_0/attn_entropy_std": 0.42400893568992615, "geo/layer_7/stable_rank_q_proj": 42.826560974121094, "geo/layer_7/stable_rank_k_proj": 40.303924560546875, "geo/layer_7/stable_rank_o_proj": 89.47530364990234, "geo/layer_7/stable_rank_gate_proj": 78.42096710205078, "geo/layer_7/stable_rank_down_proj": 139.70419311523438, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.44450753927230835, "geo/layer_7/attn_entropy_mean": 4.644780158996582, "geo/layer_7/attn_entropy_std": 0.7952802181243896, "geo/layer_14/stable_rank_q_proj": 50.22809600830078, "geo/layer_14/stable_rank_k_proj": 41.1617431640625, "geo/layer_14/stable_rank_o_proj": 43.51359176635742, "geo/layer_14/stable_rank_gate_proj": 71.10372924804688, "geo/layer_14/stable_rank_down_proj": 127.2934341430664, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39706504344940186, "geo/layer_14/attn_entropy_mean": 5.530779838562012, "geo/layer_14/attn_entropy_std": 0.4412372410297394, "geo/layer_21/stable_rank_q_proj": 39.86174011230469, "geo/layer_21/stable_rank_k_proj": 30.058616638183594, "geo/layer_21/stable_rank_o_proj": 68.73352813720703, "geo/layer_21/stable_rank_gate_proj": 64.34036254882812, "geo/layer_21/stable_rank_down_proj": 49.84116744995117, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14865219593048096, "geo/layer_21/attn_entropy_mean": 5.69735050201416, "geo/layer_21/attn_entropy_std": 0.3094537556171417, "geo/layer_27/stable_rank_q_proj": 43.774654388427734, "geo/layer_27/stable_rank_k_proj": 32.127777099609375, "geo/layer_27/stable_rank_o_proj": 114.80036163330078, "geo/layer_27/stable_rank_gate_proj": 77.92132568359375, "geo/layer_27/stable_rank_down_proj": 127.22307586669922, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10027604550123215, "geo/layer_27/attn_entropy_mean": 4.1697282791137695, "geo/layer_27/attn_entropy_std": 0.7780224084854126, "attnres/final_alpha/block_0": 0.23490719497203827, "attnres/block_norm/0": 1.7739043235778809, "attnres/final_alpha/block_1": 0.004049445036798716, "attnres/block_norm/1": 48345.1875, "attnres/final_alpha/block_2": 0.01000507827848196, "attnres/block_norm/2": 28973.7421875, "attnres/final_alpha/block_3": 0.011736426502466202, "attnres/block_norm/3": 61253.296875, "attnres/final_alpha/block_4": 0.0134506206959486, "attnres/block_norm/4": 15734.2275390625, "attnres/final_alpha/block_5": 0.6207523345947266, "attnres/block_norm/5": 6765.83154296875, "attnres/final_alpha/block_6": 0.10509886592626572, "attnres/block_norm/6": 40831.1640625, "geo/tier1_time_s": 1.3576607704162598, "geo/step": 72375.0, "geo/rankme_slope": -0.00010122711193852541} {"step": 72380, "timestamp": 1778272775.8305254, "train/loss": 2.135800504684448, "train/z_loss": 0.0013738406007178128, "train/perplexity": 8.463819122600544, "train/grad_norm": 0.1904296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1630205.5097709547, "perf/iters_per_sec": 0.7773425625662588, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2864341259002685, "data/tokens_consumed": 151793958912, "data/tokens_consumed_B": 151.793958912, "train/loss_slope": -1.41050752633476e-05} {"step": 72390, "timestamp": 1778272786.6890638, "train/loss": 2.1312766790390016, "train/z_loss": 0.0013748521450906992, "train/perplexity": 8.425616756168719, "train/grad_norm": 0.1337890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1932278.4899231952, "perf/iters_per_sec": 0.9213821839920021, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0853259563446045, "data/tokens_consumed": 151814930432, "data/tokens_consumed_B": 151.814930432, "train/loss_slope": -1.4329860720447077e-05} {"step": 72400, "timestamp": 1778272797.4458613, "grad/layer_0/attn": 0.002833815524354577, "grad/layer_0/mlp": 0.0030880309641361237, "grad/layer_0/attn_mlp_ratio": 0.9176771429750292, "grad/layer_4/attn": 0.002026026602834463, "grad/layer_4/mlp": 0.002614579861983657, "grad/layer_4/attn_mlp_ratio": 0.7748956361224982, "grad/layer_8/attn": 0.004884771071374416, "grad/layer_8/mlp": 0.0037334386724978685, "grad/layer_8/attn_mlp_ratio": 1.3083838704836306, "grad/layer_12/attn": 0.005005789455026388, "grad/layer_12/mlp": 0.007570288144052029, "grad/layer_12/attn_mlp_ratio": 0.6612415926116729, "grad/layer_16/attn": 0.0036430559121072292, "grad/layer_16/mlp": 0.005090709775686264, "grad/layer_16/attn_mlp_ratio": 0.715628272101466, "grad/layer_20/attn": 0.00394782517105341, "grad/layer_20/mlp": 0.005847388878464699, "grad/layer_20/attn_mlp_ratio": 0.6751432452318159, "grad/layer_24/attn": 0.007188660558313131, "grad/layer_24/mlp": 0.007690866012126207, "grad/layer_24/attn_mlp_ratio": 0.9347010406251584, "grad/layer_27/attn": 0.005405464209616184, "grad/layer_27/mlp": 0.00699018407613039, "grad/layer_27/attn_mlp_ratio": 0.7732935318177165} {"step": 72400, "timestamp": 1778272797.4617689, "train/loss": 2.146532082557678, "train/z_loss": 0.001369025488384068, "train/perplexity": 8.555138380070835, "train/grad_norm": 0.09912109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1947701.9700761654, "perf/iters_per_sec": 0.9287366724377467, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0767314672470092, "data/tokens_consumed": 151835901952, "data/tokens_consumed_B": 151.835901952, "train/loss_slope": -1.3134409670997197e-05} {"step": 72410, "timestamp": 1778272807.8169212, "train/loss": 2.1206361413002015, "train/z_loss": 0.0013796487473882735, "train/perplexity": 8.336438954383445, "train/grad_norm": 0.09521484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026705.367714335, "perf/iters_per_sec": 0.966408428055923, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347591876983642, "data/tokens_consumed": 151856873472, "data/tokens_consumed_B": 151.856873472, "train/loss_slope": -1.2790444674808077e-05} {"step": 72420, "timestamp": 1778272818.166392, "train/loss": 2.16131386756897, "train/z_loss": 0.0013702315045520663, "train/perplexity": 8.68253787256481, "train/grad_norm": 0.2109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027732.2001316398, "perf/iters_per_sec": 0.9668980599077415, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342351913452148, "data/tokens_consumed": 151877844992, "data/tokens_consumed_B": 151.877844992, "train/loss_slope": -9.628014774820403e-06} {"step": 72430, "timestamp": 1778272828.520621, "train/loss": 2.1086215019226073, "train/z_loss": 0.0013843374559655785, "train/perplexity": 8.236878933163037, "train/grad_norm": 0.2080078125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026781.487110995, "perf/iters_per_sec": 0.9664447246127105, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347203254699706, "data/tokens_consumed": 151898816512, "data/tokens_consumed_B": 151.898816512, "train/loss_slope": -1.3222889667010848e-05} {"step": 72440, "timestamp": 1778272838.8832693, "train/loss": 2.1293276071548464, "train/z_loss": 0.0013611415983177722, "train/perplexity": 8.409210617008632, "train/grad_norm": 0.306640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025103.4711273971, "perf/iters_per_sec": 0.9656445842396723, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0355777025222779, "data/tokens_consumed": 151919788032, "data/tokens_consumed_B": 151.919788032, "train/loss_slope": -1.1967822434556697e-05} {"step": 72450, "timestamp": 1778272849.226944, "grad/layer_0/attn": 0.003049826482310891, "grad/layer_0/mlp": 0.0032077720388770103, "grad/layer_0/attn_mlp_ratio": 0.950761572291287, "grad/layer_4/attn": 0.002241550711914897, "grad/layer_4/mlp": 0.002620831597596407, "grad/layer_4/attn_mlp_ratio": 0.8552822044889976, "grad/layer_8/attn": 0.007234742399305105, "grad/layer_8/mlp": 0.0036142508033663034, "grad/layer_8/attn_mlp_ratio": 2.0017266627966217, "grad/layer_12/attn": 0.005195691715925932, "grad/layer_12/mlp": 0.007036981638520956, "grad/layer_12/attn_mlp_ratio": 0.7383409406172441, "grad/layer_16/attn": 0.004680483136326075, "grad/layer_16/mlp": 0.004680480808019638, "grad/layer_16/attn_mlp_ratio": 1.0000004760849324, "grad/layer_20/attn": 0.003854725742712617, "grad/layer_20/mlp": 0.005944374483078718, "grad/layer_20/attn_mlp_ratio": 0.6484661571774928, "grad/layer_24/attn": 0.010121452622115612, "grad/layer_24/mlp": 0.009910520166158676, "grad/layer_24/attn_mlp_ratio": 1.0212836814104709, "grad/layer_27/attn": 0.006649858318269253, "grad/layer_27/mlp": 0.009914563968777657, "grad/layer_27/attn_mlp_ratio": 0.6707161577795017} {"step": 72450, "timestamp": 1778272849.8438303, "eos/sharpness": 80.59780597686766, "eos/L0_probe": 1.969546914100647, "eos/L_plus": 2.3085970878601074, "eos/L_minus": 2.4364748001098633, "eos/grad_norm": 0.18426640331745148, "eos/embed_grad_frac": 0.07259304076433182, "eos/time_s": 0.6140477657318115} {"step": 72450, "timestamp": 1778272849.8646135, "train/loss": 2.1502324104309083, "train/z_loss": 0.0013717383844777942, "train/perplexity": 8.586853839689114, "train/grad_norm": 0.1845703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1910934.4555779025, "perf/iters_per_sec": 0.9112045553102028, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0974484205245971, "data/tokens_consumed": 151940759552, "data/tokens_consumed_B": 151.940759552, "train/loss_slope": -1.1977294340456992e-05} {"step": 72450, "timestamp": 1778272851.2249112, "geo/rankme_last": 437.9712829589844, "geo/layer_0/stable_rank_q_proj": 19.15772247314453, "geo/layer_0/stable_rank_k_proj": 15.88149356842041, "geo/layer_0/stable_rank_o_proj": 46.90400695800781, "geo/layer_0/stable_rank_gate_proj": 129.38424682617188, "geo/layer_0/stable_rank_down_proj": 56.348392486572266, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.060795433819293976, "geo/layer_0/attn_entropy_mean": 6.14760684967041, "geo/layer_0/attn_entropy_std": 0.4191325604915619, "geo/layer_7/stable_rank_q_proj": 42.73931884765625, "geo/layer_7/stable_rank_k_proj": 40.267303466796875, "geo/layer_7/stable_rank_o_proj": 89.431884765625, "geo/layer_7/stable_rank_gate_proj": 78.28501892089844, "geo/layer_7/stable_rank_down_proj": 139.7777862548828, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4424322247505188, "geo/layer_7/attn_entropy_mean": 4.64255952835083, "geo/layer_7/attn_entropy_std": 0.8019549250602722, "geo/layer_14/stable_rank_q_proj": 50.30498504638672, "geo/layer_14/stable_rank_k_proj": 41.271392822265625, "geo/layer_14/stable_rank_o_proj": 43.47566223144531, "geo/layer_14/stable_rank_gate_proj": 71.1689453125, "geo/layer_14/stable_rank_down_proj": 127.00711059570312, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3837086856365204, "geo/layer_14/attn_entropy_mean": 5.543581485748291, "geo/layer_14/attn_entropy_std": 0.43751442432403564, "geo/layer_21/stable_rank_q_proj": 39.92029571533203, "geo/layer_21/stable_rank_k_proj": 30.072185516357422, "geo/layer_21/stable_rank_o_proj": 68.64286804199219, "geo/layer_21/stable_rank_gate_proj": 64.36143493652344, "geo/layer_21/stable_rank_down_proj": 49.82747268676758, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1441478580236435, "geo/layer_21/attn_entropy_mean": 5.681297302246094, "geo/layer_21/attn_entropy_std": 0.2990323007106781, "geo/layer_27/stable_rank_q_proj": 43.774513244628906, "geo/layer_27/stable_rank_k_proj": 32.186973571777344, "geo/layer_27/stable_rank_o_proj": 114.93297576904297, "geo/layer_27/stable_rank_gate_proj": 77.9185791015625, "geo/layer_27/stable_rank_down_proj": 127.17867279052734, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10088407248258591, "geo/layer_27/attn_entropy_mean": 4.183917045593262, "geo/layer_27/attn_entropy_std": 0.773150622844696, "attnres/final_alpha/block_0": 0.23786522448062897, "attnres/block_norm/0": 1.7740788459777832, "attnres/final_alpha/block_1": 0.004079801961779594, "attnres/block_norm/1": 48236.37890625, "attnres/final_alpha/block_2": 0.010126074776053429, "attnres/block_norm/2": 29049.0078125, "attnres/final_alpha/block_3": 0.011912672780454159, "attnres/block_norm/3": 61208.046875, "attnres/final_alpha/block_4": 0.014073766767978668, "attnres/block_norm/4": 15767.5322265625, "attnres/final_alpha/block_5": 0.6126129031181335, "attnres/block_norm/5": 6881.7001953125, "attnres/final_alpha/block_6": 0.10932956635951996, "attnres/block_norm/6": 41056.3671875, "geo/tier1_time_s": 1.356339693069458, "geo/step": 72450.0, "geo/rankme_slope": -0.00011627832773734494} {"step": 72460, "timestamp": 1778272861.5742936, "train/loss": 2.185920572280884, "train/z_loss": 0.0013660118100233376, "train/perplexity": 8.898836806349783, "train/grad_norm": 0.2138671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1791508.0650326845, "perf/iters_per_sec": 0.8542576146281645, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.17060706615448, "data/tokens_consumed": 151961731072, "data/tokens_consumed_B": 151.961731072, "train/loss_slope": -9.129527821899908e-06} {"step": 72470, "timestamp": 1778272871.968249, "train/loss": 2.1702739715576174, "train/z_loss": 0.0013800551416352392, "train/perplexity": 8.760683890191766, "train/grad_norm": 0.0986328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2020399.2525355215, "perf/iters_per_sec": 0.963401438014756, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0379889011383057, "data/tokens_consumed": 151982702592, "data/tokens_consumed_B": 151.982702592, "train/loss_slope": -3.36500777877772e-06} {"step": 72480, "timestamp": 1778272882.3246171, "train/loss": 2.127172112464905, "train/z_loss": 0.0013671464519575238, "train/perplexity": 8.391104129406475, "train/grad_norm": 0.23046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026169.5182485087, "perf/iters_per_sec": 0.9661529151194137, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350328445434571, "data/tokens_consumed": 152003674112, "data/tokens_consumed_B": 152.003674112, "train/loss_slope": -4.559313360363546e-06} {"step": 72490, "timestamp": 1778272892.684931, "train/loss": 2.204982137680054, "train/z_loss": 0.0013556022546254098, "train/perplexity": 9.070089553567893, "train/grad_norm": 0.1796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025296.4640733106, "perf/iters_per_sec": 0.9657366104475549, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354790210723877, "data/tokens_consumed": 152024645632, "data/tokens_consumed_B": 152.024645632, "train/loss_slope": -1.0943065703969874e-06} {"step": 72500, "timestamp": 1778272903.0528972, "grad/layer_0/attn": 0.0028697866946458817, "grad/layer_0/mlp": 0.0031238491646945477, "grad/layer_0/attn_mlp_ratio": 0.9186700290183468, "grad/layer_4/attn": 0.0032516196370124817, "grad/layer_4/mlp": 0.0026870262809097767, "grad/layer_4/attn_mlp_ratio": 1.2101182407861377, "grad/layer_8/attn": 0.00442503672093153, "grad/layer_8/mlp": 0.0036536778789013624, "grad/layer_8/attn_mlp_ratio": 1.2111184254563425, "grad/layer_12/attn": 0.0073445457965135574, "grad/layer_12/mlp": 0.008070331998169422, "grad/layer_12/attn_mlp_ratio": 0.9100673562342629, "grad/layer_16/attn": 0.0038429293781518936, "grad/layer_16/mlp": 0.0046635218895971775, "grad/layer_16/attn_mlp_ratio": 0.8240401539274903, "grad/layer_20/attn": 0.0032189481426030397, "grad/layer_20/mlp": 0.005973251070827246, "grad/layer_20/attn_mlp_ratio": 0.538893820223743, "grad/layer_24/attn": 0.009620234370231628, "grad/layer_24/mlp": 0.009960233233869076, "grad/layer_24/attn_mlp_ratio": 0.9658643575666742, "grad/layer_27/attn": 0.004258839879184961, "grad/layer_27/mlp": 0.009696639142930508, "grad/layer_27/attn_mlp_ratio": 0.43920782989734697} {"step": 72500, "timestamp": 1778272903.0688946, "train/loss": 2.1549601793289184, "train/z_loss": 0.0013657239847816526, "train/perplexity": 8.627546617432278, "train/grad_norm": 0.1611328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2021177.1495720071, "perf/iters_per_sec": 0.9637723682270084, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0375894069671632, "data/tokens_consumed": 152045617152, "data/tokens_consumed_B": 152.045617152, "train/loss_slope": -3.003117704119343e-07} {"step": 72500, "timestamp": 1778272910.082856, "geo/ww_alpha_mean": 7.559074537461633, "geo/ww_alpha_std": 4.44268984459986, "geo/ww_alpha_min": 1.3521695684589856, "geo/ww_alpha_max": 31.84447451444818, "geo/ww_alpha_healthy_frac": 0.16751269035532995, "geo/ww_alpha_by_type/q_proj": 3.930311986719246, "geo/ww_alpha_by_type/k_proj": 4.544075378836608, "geo/ww_alpha_by_type/v_proj": 8.377089658363413, "geo/ww_alpha_by_type/o_proj": 8.271989308568608, "geo/ww_alpha_by_type/gate_proj": 7.751092823359684, "geo/ww_alpha_by_type/up_proj": 12.104408302914083, "geo/ww_alpha_by_type/down_proj": 8.021151800695327, "geo/twonn_id/layer_0": 0.6969926357269287, "geo/twonn_id/layer_7": 3.475339889526367, "geo/twonn_id/layer_14": 4.447617053985596, "geo/twonn_id/layer_21": 6.684454441070557, "geo/twonn_id/layer_27": 5.245347499847412, "geo/tier2_time_s": 7.006540536880493} {"step": 72500, "timestamp": 1778272910.7392676, "eoc/jacobian_sigma/layer_0/attn": 1220.3394775390625, "eoc/jacobian_sigma/layer_0/mlp": 8329.4287109375, "eoc/jacobian_sigma/layer_0": 8329.4287109375, "eoc/jacobian_sigma/layer_7/attn": 1.1396656036376953, "eoc/jacobian_sigma/layer_7/mlp": 1.735804796218872, "eoc/jacobian_sigma/layer_7": 1.735804796218872, "eoc/jacobian_sigma/layer_14/attn": 1.4514644145965576, "eoc/jacobian_sigma/layer_14/mlp": 7.268709659576416, "eoc/jacobian_sigma/layer_14": 7.268709659576416, "eoc/jacobian_sigma/layer_21/attn": 1.0911056995391846, "eoc/jacobian_sigma/layer_21/mlp": 4.196175575256348, "eoc/jacobian_sigma/layer_21": 4.196175575256348, "eoc/jacobian_sigma/layer_27/attn": 3.0089454650878906, "eoc/jacobian_sigma/layer_27/mlp": 25.89242935180664, "eoc/jacobian_sigma/layer_27": 25.89242935180664, "eoc/layer0_sigma": 8329.4287109375, "eoc/sigma_max": 25.89242935180664, "eoc/sigma_min": 1.735804796218872, "eoc/sigma_mean": 9.773279845714569, "eoc/time_s": 0.6504361629486084} {"step": 72510, "timestamp": 1778272921.149168, "train/loss": 2.1302669286727904, "train/z_loss": 0.0013866098830476402, "train/perplexity": 8.417113280479976, "train/grad_norm": 0.2353515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1160305.7458004626, "perf/iters_per_sec": 0.5532768944742501, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.8074132680892945, "data/tokens_consumed": 152066588672, "data/tokens_consumed_B": 152.066588672, "train/loss_slope": -3.1661475130359456e-07} {"step": 72520, "timestamp": 1778272931.507867, "train/loss": 2.144611048698425, "train/z_loss": 0.0013662567478604614, "train/perplexity": 8.538719445286281, "train/grad_norm": 0.083984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025844.4026098845, "perf/iters_per_sec": 0.9659978879022048, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035198950767517, "data/tokens_consumed": 152087560192, "data/tokens_consumed_B": 152.087560192, "train/loss_slope": 2.6104009727297466e-06} {"step": 72525, "timestamp": 1778272937.3122895, "eos/sharpness": 12.40870952606201, "eos/L0_probe": 1.9660497903823853, "eos/L_plus": 2.034248113632202, "eos/L_minus": 2.0219385623931885, "eos/grad_norm": 0.17580188810825348, "eos/embed_grad_frac": 0.19397437572479248, "eos/time_s": 0.631655216217041} {"step": 72525, "timestamp": 1778272938.6918414, "geo/rankme_last": 437.7564392089844, "geo/layer_0/stable_rank_q_proj": 19.174846649169922, "geo/layer_0/stable_rank_k_proj": 15.865721702575684, "geo/layer_0/stable_rank_o_proj": 46.914005279541016, "geo/layer_0/stable_rank_gate_proj": 129.62693786621094, "geo/layer_0/stable_rank_down_proj": 56.299217224121094, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.061716582626104355, "geo/layer_0/attn_entropy_mean": 6.146976947784424, "geo/layer_0/attn_entropy_std": 0.4258626401424408, "geo/layer_7/stable_rank_q_proj": 42.721981048583984, "geo/layer_7/stable_rank_k_proj": 40.2004280090332, "geo/layer_7/stable_rank_o_proj": 89.21977996826172, "geo/layer_7/stable_rank_gate_proj": 78.23624420166016, "geo/layer_7/stable_rank_down_proj": 139.35462951660156, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4376462697982788, "geo/layer_7/attn_entropy_mean": 4.652102947235107, "geo/layer_7/attn_entropy_std": 0.8010446429252625, "geo/layer_14/stable_rank_q_proj": 50.336204528808594, "geo/layer_14/stable_rank_k_proj": 41.21391677856445, "geo/layer_14/stable_rank_o_proj": 43.40972137451172, "geo/layer_14/stable_rank_gate_proj": 71.0988998413086, "geo/layer_14/stable_rank_down_proj": 126.65098571777344, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3821183443069458, "geo/layer_14/attn_entropy_mean": 5.570488929748535, "geo/layer_14/attn_entropy_std": 0.4184095859527588, "geo/layer_21/stable_rank_q_proj": 39.9307861328125, "geo/layer_21/stable_rank_k_proj": 30.064023971557617, "geo/layer_21/stable_rank_o_proj": 68.70880126953125, "geo/layer_21/stable_rank_gate_proj": 64.37480163574219, "geo/layer_21/stable_rank_down_proj": 49.88364791870117, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14508312940597534, "geo/layer_21/attn_entropy_mean": 5.705857276916504, "geo/layer_21/attn_entropy_std": 0.3081374764442444, "geo/layer_27/stable_rank_q_proj": 43.80970764160156, "geo/layer_27/stable_rank_k_proj": 32.20258712768555, "geo/layer_27/stable_rank_o_proj": 115.08712768554688, "geo/layer_27/stable_rank_gate_proj": 78.03904724121094, "geo/layer_27/stable_rank_down_proj": 127.23733520507812, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10486327856779099, "geo/layer_27/attn_entropy_mean": 4.1748552322387695, "geo/layer_27/attn_entropy_std": 0.7541284561157227, "attnres/final_alpha/block_0": 0.23602278530597687, "attnres/block_norm/0": 1.7739861011505127, "attnres/final_alpha/block_1": 0.00408262200653553, "attnres/block_norm/1": 48330.98828125, "attnres/final_alpha/block_2": 0.010103786364197731, "attnres/block_norm/2": 28931.015625, "attnres/final_alpha/block_3": 0.01182786375284195, "attnres/block_norm/3": 61286.7578125, "attnres/final_alpha/block_4": 0.014049599878489971, "attnres/block_norm/4": 15838.6962890625, "attnres/final_alpha/block_5": 0.6170767545700073, "attnres/block_norm/5": 6849.6708984375, "attnres/final_alpha/block_6": 0.10683660209178925, "attnres/block_norm/6": 41092.3671875, "geo/tier1_time_s": 1.3590805530548096, "geo/step": 72525.0, "geo/rankme_slope": -0.00013933557798119248} {"step": 72530, "timestamp": 1778272943.8740106, "train/loss": 2.170452904701233, "train/z_loss": 0.001362804404925555, "train/perplexity": 8.76225160715454, "train/grad_norm": 0.248046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1696793.6999634411, "perf/iters_per_sec": 0.8090942859475332, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2359498977661132, "data/tokens_consumed": 152108531712, "data/tokens_consumed_B": 152.108531712, "train/loss_slope": 4.312521196005453e-06} {"step": 72540, "timestamp": 1778272954.2275376, "train/loss": 2.108943748474121, "train/z_loss": 0.001387502404395491, "train/perplexity": 8.239533666710878, "train/grad_norm": 0.125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026867.7004000463, "perf/iters_per_sec": 0.966485834312461, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346763134002686, "data/tokens_consumed": 152129503232, "data/tokens_consumed_B": 152.129503232, "train/loss_slope": 2.364912034511738e-06} {"step": 72550, "timestamp": 1778272964.5689733, "grad/layer_0/attn": 0.0025829575024545193, "grad/layer_0/mlp": 0.0029021429363638163, "grad/layer_0/attn_mlp_ratio": 0.8900172975935692, "grad/layer_4/attn": 0.002365010790526867, "grad/layer_4/mlp": 0.0023598792031407356, "grad/layer_4/attn_mlp_ratio": 1.002174470270281, "grad/layer_8/attn": 0.004051488824188709, "grad/layer_8/mlp": 0.0036706747487187386, "grad/layer_8/attn_mlp_ratio": 1.1037449491345972, "grad/layer_12/attn": 0.00415944866836071, "grad/layer_12/mlp": 0.007005577906966209, "grad/layer_12/attn_mlp_ratio": 0.5937338309879122, "grad/layer_16/attn": 0.003858395619317889, "grad/layer_16/mlp": 0.004793897736817598, "grad/layer_16/attn_mlp_ratio": 0.8048556207612599, "grad/layer_20/attn": 0.003750846954062581, "grad/layer_20/mlp": 0.0069078062660992146, "grad/layer_20/attn_mlp_ratio": 0.5429866958156582, "grad/layer_24/attn": 0.016582539305090904, "grad/layer_24/mlp": 0.01152442954480648, "grad/layer_24/attn_mlp_ratio": 1.438903253018155, "grad/layer_27/attn": 0.008884798735380173, "grad/layer_27/mlp": 0.010598746128380299, "grad/layer_27/attn_mlp_ratio": 0.8382877128984669} {"step": 72550, "timestamp": 1778272964.5850837, "train/loss": 2.0655020594596865, "train/z_loss": 0.0013759089400991798, "train/perplexity": 7.889257780021887, "train/grad_norm": 0.236328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025742.1343224635, "perf/iters_per_sec": 0.9659491225826566, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035251212120056, "data/tokens_consumed": 152150474752, "data/tokens_consumed_B": 152.150474752, "train/loss_slope": -2.6401247021102167e-06} {"step": 72560, "timestamp": 1778272974.9371874, "train/loss": 2.109740900993347, "train/z_loss": 0.0013842297717928886, "train/perplexity": 8.246104450340955, "train/grad_norm": 0.162109375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026465.09209076, "perf/iters_per_sec": 0.966293855710392, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03488187789917, "data/tokens_consumed": 152171446272, "data/tokens_consumed_B": 152.171446272, "train/loss_slope": -4.553891837757801e-06} {"step": 72570, "timestamp": 1778272985.2920825, "train/loss": 2.146386504173279, "train/z_loss": 0.0013724841061048209, "train/perplexity": 8.55389302749756, "train/grad_norm": 0.2294921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026571.2620507255, "perf/iters_per_sec": 0.9663444814923885, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348276615142822, "data/tokens_consumed": 152192417792, "data/tokens_consumed_B": 152.192417792, "train/loss_slope": -1.2855892670680014e-06} {"step": 72580, "timestamp": 1778272995.6465247, "train/loss": 2.114198017120361, "train/z_loss": 0.0013598003308288753, "train/perplexity": 8.282940325376535, "train/grad_norm": 0.1865234375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026621.9698354471, "perf/iters_per_sec": 0.9663686608483539, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348017692565918, "data/tokens_consumed": 152213389312, "data/tokens_consumed_B": 152.213389312, "train/loss_slope": -4.910555967439078e-06} {"step": 72590, "timestamp": 1778273006.0091364, "train/loss": 2.1761043071746826, "train/z_loss": 0.0013787205796688795, "train/perplexity": 8.811910807454565, "train/grad_norm": 0.1435546875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024695.6911124142, "perf/iters_per_sec": 0.9654501395761558, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0357862710952759, "data/tokens_consumed": 152234360832, "data/tokens_consumed_B": 152.234360832, "train/loss_slope": -4.418928688294244e-06} {"step": 72600, "timestamp": 1778273016.3523035, "grad/layer_0/attn": 0.002806210657581687, "grad/layer_0/mlp": 0.0031784719321876764, "grad/layer_0/attn_mlp_ratio": 0.8828803995013385, "grad/layer_4/attn": 0.002432343317195773, "grad/layer_4/mlp": 0.002511979779228568, "grad/layer_4/attn_mlp_ratio": 0.9682972930271833, "grad/layer_8/attn": 0.005845983512699604, "grad/layer_8/mlp": 0.0036239095497876406, "grad/layer_8/attn_mlp_ratio": 1.6131703264296824, "grad/layer_12/attn": 0.0043428451754152775, "grad/layer_12/mlp": 0.006925974041223526, "grad/layer_12/attn_mlp_ratio": 0.6270374516079381, "grad/layer_16/attn": 0.0033948696218430996, "grad/layer_16/mlp": 0.004424887243658304, "grad/layer_16/attn_mlp_ratio": 0.7672217071715027, "grad/layer_20/attn": 0.0032709871884435415, "grad/layer_20/mlp": 0.005817958619445562, "grad/layer_20/attn_mlp_ratio": 0.5622224814883628, "grad/layer_24/attn": 0.005685919895768166, "grad/layer_24/mlp": 0.008596736006438732, "grad/layer_24/attn_mlp_ratio": 0.661404494143953, "grad/layer_27/attn": 0.009647990576922894, "grad/layer_27/mlp": 0.0073766340501606464, "grad/layer_27/attn_mlp_ratio": 1.3079123053313932} {"step": 72600, "timestamp": 1778273016.9640448, "eos/sharpness": 63.81542682647704, "eos/L0_probe": 1.9696568250656128, "eos/L_plus": 2.3760478496551514, "eos/L_minus": 2.2014200687408447, "eos/grad_norm": 0.14392594993114471, "eos/embed_grad_frac": 0.11319935321807861, "eos/time_s": 0.608945369720459} {"step": 72600, "timestamp": 1778273016.9852016, "train/loss": 2.09886234998703, "train/z_loss": 0.0013844861998222769, "train/perplexity": 8.156884951780002, "train/grad_norm": 0.14453125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1911857.8560905734, "perf/iters_per_sec": 0.9116448669865481, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0969183683395385, "data/tokens_consumed": 152255332352, "data/tokens_consumed_B": 152.255332352, "train/loss_slope": -5.342580683410908e-06} {"step": 72600, "timestamp": 1778273018.345891, "geo/rankme_last": 438.1734619140625, "geo/layer_0/stable_rank_q_proj": 19.153417587280273, "geo/layer_0/stable_rank_k_proj": 15.822052955627441, "geo/layer_0/stable_rank_o_proj": 46.890281677246094, "geo/layer_0/stable_rank_gate_proj": 129.85064697265625, "geo/layer_0/stable_rank_down_proj": 56.39399337768555, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06505991518497467, "geo/layer_0/attn_entropy_mean": 6.142349720001221, "geo/layer_0/attn_entropy_std": 0.4313499927520752, "geo/layer_7/stable_rank_q_proj": 42.706260681152344, "geo/layer_7/stable_rank_k_proj": 40.2015495300293, "geo/layer_7/stable_rank_o_proj": 89.16710662841797, "geo/layer_7/stable_rank_gate_proj": 78.21360778808594, "geo/layer_7/stable_rank_down_proj": 139.37242126464844, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4379953145980835, "geo/layer_7/attn_entropy_mean": 4.686234474182129, "geo/layer_7/attn_entropy_std": 0.7977407574653625, "geo/layer_14/stable_rank_q_proj": 50.31557083129883, "geo/layer_14/stable_rank_k_proj": 41.31989288330078, "geo/layer_14/stable_rank_o_proj": 43.46132278442383, "geo/layer_14/stable_rank_gate_proj": 71.10861206054688, "geo/layer_14/stable_rank_down_proj": 126.667236328125, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3908984363079071, "geo/layer_14/attn_entropy_mean": 5.540389060974121, "geo/layer_14/attn_entropy_std": 0.42307886481285095, "geo/layer_21/stable_rank_q_proj": 39.878990173339844, "geo/layer_21/stable_rank_k_proj": 30.127254486083984, "geo/layer_21/stable_rank_o_proj": 68.68146514892578, "geo/layer_21/stable_rank_gate_proj": 64.22357940673828, "geo/layer_21/stable_rank_down_proj": 49.82101821899414, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14241763949394226, "geo/layer_21/attn_entropy_mean": 5.665050029754639, "geo/layer_21/attn_entropy_std": 0.3072435259819031, "geo/layer_27/stable_rank_q_proj": 43.8164176940918, "geo/layer_27/stable_rank_k_proj": 32.15028381347656, "geo/layer_27/stable_rank_o_proj": 115.37403106689453, "geo/layer_27/stable_rank_gate_proj": 78.06664276123047, "geo/layer_27/stable_rank_down_proj": 127.21265411376953, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09832251816987991, "geo/layer_27/attn_entropy_mean": 4.169083595275879, "geo/layer_27/attn_entropy_std": 0.7898155450820923, "attnres/final_alpha/block_0": 0.2352217435836792, "attnres/block_norm/0": 1.774034023284912, "attnres/final_alpha/block_1": 0.004032492637634277, "attnres/block_norm/1": 48535.41015625, "attnres/final_alpha/block_2": 0.009853077121078968, "attnres/block_norm/2": 29035.95703125, "attnres/final_alpha/block_3": 0.011941452510654926, "attnres/block_norm/3": 61392.5625, "attnres/final_alpha/block_4": 0.013872666284441948, "attnres/block_norm/4": 15745.095703125, "attnres/final_alpha/block_5": 0.6188719272613525, "attnres/block_norm/5": 6866.1015625, "attnres/final_alpha/block_6": 0.10620664060115814, "attnres/block_norm/6": 41237.23046875, "geo/tier1_time_s": 1.3571856021881104, "geo/step": 72600.0, "geo/rankme_slope": -0.00014340808589060624} {"step": 72610, "timestamp": 1778273028.7017558, "train/loss": 2.094238042831421, "train/z_loss": 0.001372150215320289, "train/perplexity": 8.119252090347624, "train/grad_norm": 0.146484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1790428.4216652901, "perf/iters_per_sec": 0.8537428005529833, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1713129520416259, "data/tokens_consumed": 152276303872, "data/tokens_consumed_B": 152.276303872, "train/loss_slope": -1.0110925712493891e-05} {"step": 72620, "timestamp": 1778273039.060822, "train/loss": 2.160924994945526, "train/z_loss": 0.0013684186735190452, "train/perplexity": 8.67916212769408, "train/grad_norm": 0.103515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025423.2187604338, "perf/iters_per_sec": 0.9657970517923516, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035414218902588, "data/tokens_consumed": 152297275392, "data/tokens_consumed_B": 152.297275392, "train/loss_slope": -8.01219259193987e-06} {"step": 72630, "timestamp": 1778273049.4154499, "train/loss": 2.194637966156006, "train/z_loss": 0.0013642896665260196, "train/perplexity": 8.97675058094434, "train/grad_norm": 0.275390625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026608.7089910987, "perf/iters_per_sec": 0.9663623375850194, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348085403442382, "data/tokens_consumed": 152318246912, "data/tokens_consumed_B": 152.318246912, "train/loss_slope": -5.674073698282447e-06} {"step": 72640, "timestamp": 1778273059.7783918, "train/loss": 2.1981058597564695, "train/z_loss": 0.0013618340715765953, "train/perplexity": 9.007935037773331, "train/grad_norm": 0.111328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025101.3264499034, "perf/iters_per_sec": 0.9656435615777509, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0355787992477417, "data/tokens_consumed": 152339218432, "data/tokens_consumed_B": 152.339218432, "train/loss_slope": -6.39758874015902e-07} {"step": 72650, "timestamp": 1778273070.1237142, "grad/layer_0/attn": 0.0030970871448516846, "grad/layer_0/mlp": 0.0032997161615639925, "grad/layer_0/attn_mlp_ratio": 0.9385919574138577, "grad/layer_4/attn": 0.002703984035179019, "grad/layer_4/mlp": 0.0025920190382748842, "grad/layer_4/attn_mlp_ratio": 1.0431960147403283, "grad/layer_8/attn": 0.004940403159707785, "grad/layer_8/mlp": 0.003530313726514578, "grad/layer_8/attn_mlp_ratio": 1.3994232248143688, "grad/layer_12/attn": 0.004615728743374348, "grad/layer_12/mlp": 0.007027968764305115, "grad/layer_12/attn_mlp_ratio": 0.6567656790310389, "grad/layer_16/attn": 0.004082408733665943, "grad/layer_16/mlp": 0.005008060019463301, "grad/layer_16/attn_mlp_ratio": 0.8151676769614027, "grad/layer_20/attn": 0.00507459556683898, "grad/layer_20/mlp": 0.0068305497989058495, "grad/layer_20/attn_mlp_ratio": 0.7429263590698391, "grad/layer_24/attn": 0.014813737943768501, "grad/layer_24/mlp": 0.012797076255083084, "grad/layer_24/attn_mlp_ratio": 1.1575876811803492, "grad/layer_27/attn": 0.010874462313950062, "grad/layer_27/mlp": 0.012941586785018444, "grad/layer_27/attn_mlp_ratio": 0.8402727123470967} {"step": 72650, "timestamp": 1778273070.1397798, "train/loss": 2.1363739371299744, "train/z_loss": 0.001368174108210951, "train/perplexity": 8.468673942921221, "train/grad_norm": 0.25, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024950.4644497992, "perf/iters_per_sec": 0.9655716249703403, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035655951499939, "data/tokens_consumed": 152360189952, "data/tokens_consumed_B": 152.360189952, "train/loss_slope": 1.3538608671200244e-07} {"step": 72660, "timestamp": 1778273080.503848, "train/loss": 2.1546629667282104, "train/z_loss": 0.001354667020495981, "train/perplexity": 8.624982782885223, "train/grad_norm": 0.1376953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024891.6828403065, "perf/iters_per_sec": 0.9655435957147153, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0356860160827637, "data/tokens_consumed": 152381161472, "data/tokens_consumed_B": 152.381161472, "train/loss_slope": 1.270020276334879e-06} {"step": 72670, "timestamp": 1778273090.8563068, "train/loss": 2.1789103746414185, "train/z_loss": 0.0013659337419085205, "train/perplexity": 8.836672348721102, "train/grad_norm": 0.2734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026981.5729616238, "perf/iters_per_sec": 0.966540132981121, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346181869506836, "data/tokens_consumed": 152402132992, "data/tokens_consumed_B": 152.402132992, "train/loss_slope": 4.068305833612233e-06} {"step": 72675, "timestamp": 1778273096.6332812, "eos/sharpness": 51.31640434265136, "eos/L0_probe": 1.9693819284439087, "eos/L_plus": 2.1911795139312744, "eos/L_minus": 2.2607483863830566, "eos/grad_norm": 0.11893533915281296, "eos/embed_grad_frac": 0.1591504067182541, "eos/time_s": 0.612372636795044} {"step": 72675, "timestamp": 1778273098.0172927, "geo/rankme_last": 438.731689453125, "geo/layer_0/stable_rank_q_proj": 19.15208625793457, "geo/layer_0/stable_rank_k_proj": 15.77957534790039, "geo/layer_0/stable_rank_o_proj": 46.80339813232422, "geo/layer_0/stable_rank_gate_proj": 129.84910583496094, "geo/layer_0/stable_rank_down_proj": 56.40300369262695, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.060771163552999496, "geo/layer_0/attn_entropy_mean": 6.147229194641113, "geo/layer_0/attn_entropy_std": 0.4328676760196686, "geo/layer_7/stable_rank_q_proj": 42.716773986816406, "geo/layer_7/stable_rank_k_proj": 40.209320068359375, "geo/layer_7/stable_rank_o_proj": 89.01549530029297, "geo/layer_7/stable_rank_gate_proj": 78.14143371582031, "geo/layer_7/stable_rank_down_proj": 139.4095458984375, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.441762775182724, "geo/layer_7/attn_entropy_mean": 4.668735504150391, "geo/layer_7/attn_entropy_std": 0.8381685018539429, "geo/layer_14/stable_rank_q_proj": 50.20169448852539, "geo/layer_14/stable_rank_k_proj": 41.23402786254883, "geo/layer_14/stable_rank_o_proj": 43.5108757019043, "geo/layer_14/stable_rank_gate_proj": 71.20953369140625, "geo/layer_14/stable_rank_down_proj": 126.88280487060547, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3954004943370819, "geo/layer_14/attn_entropy_mean": 5.576171875, "geo/layer_14/attn_entropy_std": 0.4094325006008148, "geo/layer_21/stable_rank_q_proj": 39.84702682495117, "geo/layer_21/stable_rank_k_proj": 30.025054931640625, "geo/layer_21/stable_rank_o_proj": 68.65495300292969, "geo/layer_21/stable_rank_gate_proj": 64.2601318359375, "geo/layer_21/stable_rank_down_proj": 49.8594970703125, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1467512547969818, "geo/layer_21/attn_entropy_mean": 5.671289443969727, "geo/layer_21/attn_entropy_std": 0.3053392171859741, "geo/layer_27/stable_rank_q_proj": 43.775062561035156, "geo/layer_27/stable_rank_k_proj": 32.25353240966797, "geo/layer_27/stable_rank_o_proj": 115.27144622802734, "geo/layer_27/stable_rank_gate_proj": 78.09678649902344, "geo/layer_27/stable_rank_down_proj": 127.45028686523438, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09389804303646088, "geo/layer_27/attn_entropy_mean": 4.162673473358154, "geo/layer_27/attn_entropy_std": 0.7661475539207458, "attnres/final_alpha/block_0": 0.23705121874809265, "attnres/block_norm/0": 1.7742160558700562, "attnres/final_alpha/block_1": 0.004076649434864521, "attnres/block_norm/1": 48271.2109375, "attnres/final_alpha/block_2": 0.010071862488985062, "attnres/block_norm/2": 29045.609375, "attnres/final_alpha/block_3": 0.012080403044819832, "attnres/block_norm/3": 61130.25, "attnres/final_alpha/block_4": 0.01388443261384964, "attnres/block_norm/4": 15784.6767578125, "attnres/final_alpha/block_5": 0.6163454651832581, "attnres/block_norm/5": 6815.9716796875, "attnres/final_alpha/block_6": 0.10648993402719498, "attnres/block_norm/6": 40991.9140625, "geo/tier1_time_s": 1.362516164779663, "geo/step": 72675.0, "geo/rankme_slope": -0.0001224481394120148} {"step": 72680, "timestamp": 1778273103.2025518, "train/loss": 2.1261541843414307, "train/z_loss": 0.001386768533848226, "train/perplexity": 8.382566934388775, "train/grad_norm": 0.25, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1699234.8988340988, "perf/iters_per_sec": 0.8102583402796263, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2341742753982543, "data/tokens_consumed": 152423104512, "data/tokens_consumed_B": 152.423104512, "train/loss_slope": 3.2566164407578368e-06} {"step": 72690, "timestamp": 1778273113.566423, "train/loss": 2.1734062910079954, "train/z_loss": 0.0013746985234320164, "train/perplexity": 8.788168173044847, "train/grad_norm": 0.11767578125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025431.0073693271, "perf/iters_per_sec": 0.9658007656904827, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354102373123169, "data/tokens_consumed": 152444076032, "data/tokens_consumed_B": 152.444076032, "train/loss_slope": 3.2829786756179436e-06} {"step": 72700, "timestamp": 1778273123.9049747, "grad/layer_0/attn": 0.002638391684740782, "grad/layer_0/mlp": 0.0031067717354744673, "grad/layer_0/attn_mlp_ratio": 0.8492389607162271, "grad/layer_4/attn": 0.0020440362859517336, "grad/layer_4/mlp": 0.0026048619765788317, "grad/layer_4/attn_mlp_ratio": 0.7847003894487672, "grad/layer_8/attn": 0.00471292482689023, "grad/layer_8/mlp": 0.003685121890157461, "grad/layer_8/attn_mlp_ratio": 1.2789060550716944, "grad/layer_12/attn": 0.005470267031341791, "grad/layer_12/mlp": 0.006587598472833633, "grad/layer_12/attn_mlp_ratio": 0.8303886417579288, "grad/layer_16/attn": 0.005394285544753075, "grad/layer_16/mlp": 0.004023331683129072, "grad/layer_16/attn_mlp_ratio": 1.3407508591194954, "grad/layer_20/attn": 0.002747580409049988, "grad/layer_20/mlp": 0.005260061472654343, "grad/layer_20/attn_mlp_ratio": 0.5223475754226767, "grad/layer_24/attn": 0.006145107094198465, "grad/layer_24/mlp": 0.007003698498010635, "grad/layer_24/attn_mlp_ratio": 0.8774088445130909, "grad/layer_27/attn": 0.005525505635887384, "grad/layer_27/mlp": 0.0058142622001469135, "grad/layer_27/attn_mlp_ratio": 0.9503364916556598} {"step": 72700, "timestamp": 1778273123.9208775, "train/loss": 2.140704798698425, "train/z_loss": 0.0013729101978242398, "train/perplexity": 8.505430132970787, "train/grad_norm": 0.08447265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026311.5995224996, "perf/iters_per_sec": 0.9662206647503374, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349602699279785, "data/tokens_consumed": 152465047552, "data/tokens_consumed_B": 152.465047552, "train/loss_slope": 5.669583617621524e-06} {"step": 72710, "timestamp": 1778273134.2880332, "train/loss": 2.118913006782532, "train/z_loss": 0.001371198962442577, "train/perplexity": 8.322086517807401, "train/grad_norm": 0.09716796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2023919.368192253, "perf/iters_per_sec": 0.9650799599610581, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0361835718154908, "data/tokens_consumed": 152486019072, "data/tokens_consumed_B": 152.486019072, "train/loss_slope": 5.884320715186888e-06} {"step": 72720, "timestamp": 1778273144.6479514, "train/loss": 2.117958903312683, "train/z_loss": 0.0013633369118906558, "train/perplexity": 8.314150172833564, "train/grad_norm": 0.28515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025716.0556952718, "perf/iters_per_sec": 0.9659366873241767, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035264539718628, "data/tokens_consumed": 152506990592, "data/tokens_consumed_B": 152.506990592, "train/loss_slope": 4.049749264229237e-06} {"step": 72730, "timestamp": 1778273155.002874, "train/loss": 2.2177212238311768, "train/z_loss": 0.0013546838774345816, "train/perplexity": 9.186373307587877, "train/grad_norm": 0.1953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026467.8932660788, "perf/iters_per_sec": 0.9662951914148706, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348804473876954, "data/tokens_consumed": 152527962112, "data/tokens_consumed_B": 152.527962112, "train/loss_slope": 1.0384916608268493e-05} {"step": 72740, "timestamp": 1778273165.3557045, "train/loss": 2.1244423151016236, "train/z_loss": 0.0013686421094462275, "train/perplexity": 8.368229351438137, "train/grad_norm": 0.1572265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026541.0066807412, "perf/iters_per_sec": 0.9663300546077448, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034843111038208, "data/tokens_consumed": 152548933632, "data/tokens_consumed_B": 152.548933632, "train/loss_slope": 9.673287365624696e-06} {"step": 72750, "timestamp": 1778273175.7122612, "grad/layer_0/attn": 0.0023811529390513897, "grad/layer_0/mlp": 0.002764879958704114, "grad/layer_0/attn_mlp_ratio": 0.861213827903777, "grad/layer_4/attn": 0.0020159012638032436, "grad/layer_4/mlp": 0.0025316448882222176, "grad/layer_4/attn_mlp_ratio": 0.7962811820700256, "grad/layer_8/attn": 0.0052476744167506695, "grad/layer_8/mlp": 0.0037628780119121075, "grad/layer_8/attn_mlp_ratio": 1.3945905928066489, "grad/layer_12/attn": 0.005646364763379097, "grad/layer_12/mlp": 0.007188156712800264, "grad/layer_12/attn_mlp_ratio": 0.785509402538961, "grad/layer_16/attn": 0.003719910979270935, "grad/layer_16/mlp": 0.004466320853680372, "grad/layer_16/attn_mlp_ratio": 0.8328803545131761, "grad/layer_20/attn": 0.0038631802890449762, "grad/layer_20/mlp": 0.005698648747056723, "grad/layer_20/attn_mlp_ratio": 0.6779116230403034, "grad/layer_24/attn": 0.012870211154222488, "grad/layer_24/mlp": 0.008553014136850834, "grad/layer_24/attn_mlp_ratio": 1.5047573636404024, "grad/layer_27/attn": 0.00391085771843791, "grad/layer_27/mlp": 0.006809036247432232, "grad/layer_27/attn_mlp_ratio": 0.5743628788107059} {"step": 72750, "timestamp": 1778273176.3237922, "eos/sharpness": 4.886949062347411, "eos/L0_probe": 1.9673823118209839, "eos/L_plus": 1.9937459230422974, "eos/L_minus": 1.9898881912231445, "eos/grad_norm": 0.08989905565977097, "eos/embed_grad_frac": 0.3044533133506775, "eos/time_s": 0.6087689399719238} {"step": 72750, "timestamp": 1778273176.3511386, "train/loss": 2.1524868965148927, "train/z_loss": 0.0013645877712406217, "train/perplexity": 8.606234620817673, "train/grad_norm": 0.08984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1908952.1120918924, "perf/iters_per_sec": 0.9102593002757513, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0985880613327026, "data/tokens_consumed": 152569905152, "data/tokens_consumed_B": 152.569905152, "train/loss_slope": 9.811502986579499e-06} {"step": 72750, "timestamp": 1778273177.7150433, "geo/rankme_last": 437.2000732421875, "geo/layer_0/stable_rank_q_proj": 19.12566566467285, "geo/layer_0/stable_rank_k_proj": 15.811026573181152, "geo/layer_0/stable_rank_o_proj": 46.757389068603516, "geo/layer_0/stable_rank_gate_proj": 129.8733673095703, "geo/layer_0/stable_rank_down_proj": 56.34480285644531, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06636210530996323, "geo/layer_0/attn_entropy_mean": 6.143733978271484, "geo/layer_0/attn_entropy_std": 0.43103039264678955, "geo/layer_7/stable_rank_q_proj": 42.6656494140625, "geo/layer_7/stable_rank_k_proj": 40.2823486328125, "geo/layer_7/stable_rank_o_proj": 88.97603607177734, "geo/layer_7/stable_rank_gate_proj": 78.21444702148438, "geo/layer_7/stable_rank_down_proj": 139.3942413330078, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4407278597354889, "geo/layer_7/attn_entropy_mean": 4.631295204162598, "geo/layer_7/attn_entropy_std": 0.7992079854011536, "geo/layer_14/stable_rank_q_proj": 50.211219787597656, "geo/layer_14/stable_rank_k_proj": 41.24665451049805, "geo/layer_14/stable_rank_o_proj": 43.53353500366211, "geo/layer_14/stable_rank_gate_proj": 71.11194610595703, "geo/layer_14/stable_rank_down_proj": 126.78923797607422, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3802376985549927, "geo/layer_14/attn_entropy_mean": 5.522286415100098, "geo/layer_14/attn_entropy_std": 0.43856218457221985, "geo/layer_21/stable_rank_q_proj": 39.870296478271484, "geo/layer_21/stable_rank_k_proj": 30.08409881591797, "geo/layer_21/stable_rank_o_proj": 68.69168853759766, "geo/layer_21/stable_rank_gate_proj": 64.22567749023438, "geo/layer_21/stable_rank_down_proj": 49.86635208129883, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13642647862434387, "geo/layer_21/attn_entropy_mean": 5.692546844482422, "geo/layer_21/attn_entropy_std": 0.3011506497859955, "geo/layer_27/stable_rank_q_proj": 43.74245071411133, "geo/layer_27/stable_rank_k_proj": 32.2501220703125, "geo/layer_27/stable_rank_o_proj": 115.16392517089844, "geo/layer_27/stable_rank_gate_proj": 77.92662048339844, "geo/layer_27/stable_rank_down_proj": 127.3249282836914, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10756124556064606, "geo/layer_27/attn_entropy_mean": 4.167794227600098, "geo/layer_27/attn_entropy_std": 0.7589501738548279, "attnres/final_alpha/block_0": 0.23851922154426575, "attnres/block_norm/0": 1.7742315530776978, "attnres/final_alpha/block_1": 0.004155056085437536, "attnres/block_norm/1": 48368.953125, "attnres/final_alpha/block_2": 0.01014622300863266, "attnres/block_norm/2": 29005.0859375, "attnres/final_alpha/block_3": 0.012073269113898277, "attnres/block_norm/3": 61528.76171875, "attnres/final_alpha/block_4": 0.014172124676406384, "attnres/block_norm/4": 15777.4921875, "attnres/final_alpha/block_5": 0.6130719184875488, "attnres/block_norm/5": 6898.33056640625, "attnres/final_alpha/block_6": 0.10786217451095581, "attnres/block_norm/6": 41405.9609375, "geo/tier1_time_s": 1.3607888221740723, "geo/step": 72750.0, "geo/rankme_slope": -0.00017531792013680472} {"step": 72760, "timestamp": 1778273188.0676205, "train/loss": 2.1905922174453734, "train/z_loss": 0.00135202263481915, "train/perplexity": 8.940506270978446, "train/grad_norm": 0.1728515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1790352.403045993, "perf/iters_per_sec": 0.8537065520505872, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1713626861572266, "data/tokens_consumed": 152590876672, "data/tokens_consumed_B": 152.590876672, "train/loss_slope": 1.2489851144137702e-05} {"step": 72770, "timestamp": 1778273198.425442, "train/loss": 2.1514981508255007, "train/z_loss": 0.0013656460912898182, "train/perplexity": 8.597729448853404, "train/grad_norm": 0.146484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026013.0371979752, "perf/iters_per_sec": 0.9660782991399647, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351127862930298, "data/tokens_consumed": 152611848192, "data/tokens_consumed_B": 152.611848192, "train/loss_slope": 1.1494033858113011e-05} {"step": 72780, "timestamp": 1778273208.7883224, "train/loss": 2.123116374015808, "train/z_loss": 0.0013618611497804522, "train/perplexity": 8.357140925250034, "train/grad_norm": 0.09423828125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025204.5558626715, "perf/iters_per_sec": 0.9656927851975782, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0355260133743287, "data/tokens_consumed": 152632819712, "data/tokens_consumed_B": 152.632819712, "train/loss_slope": 1.2411360850822037e-05} {"step": 72790, "timestamp": 1778273219.1409278, "train/loss": 2.113449311256409, "train/z_loss": 0.001387902197893709, "train/perplexity": 8.276741160349344, "train/grad_norm": 0.3671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026697.5693015063, "perf/iters_per_sec": 0.9664047094829112, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347631692886352, "data/tokens_consumed": 152653791232, "data/tokens_consumed_B": 152.653791232, "train/loss_slope": 1.339952467155287e-05} {"step": 72800, "timestamp": 1778273229.4869523, "grad/layer_0/attn": 0.0030757447239011526, "grad/layer_0/mlp": 0.003434703918173909, "grad/layer_0/attn_mlp_ratio": 0.8954904724327307, "grad/layer_4/attn": 0.004184439778327942, "grad/layer_4/mlp": 0.00266056670807302, "grad/layer_4/attn_mlp_ratio": 1.5727625277557393, "grad/layer_8/attn": 0.0035218140110373497, "grad/layer_8/mlp": 0.0035106390714645386, "grad/layer_8/attn_mlp_ratio": 1.0031831353286442, "grad/layer_12/attn": 0.004834125284105539, "grad/layer_12/mlp": 0.007320909295231104, "grad/layer_12/attn_mlp_ratio": 0.660317594867999, "grad/layer_16/attn": 0.0036667324602603912, "grad/layer_16/mlp": 0.004630574025213718, "grad/layer_16/attn_mlp_ratio": 0.7918526647257068, "grad/layer_20/attn": 0.005766443442553282, "grad/layer_20/mlp": 0.006554550025612116, "grad/layer_20/attn_mlp_ratio": 0.8797618954839812, "grad/layer_24/attn": 0.012154276482760906, "grad/layer_24/mlp": 0.011122296564280987, "grad/layer_24/attn_mlp_ratio": 1.0927847772478594, "grad/layer_27/attn": 0.004768209997564554, "grad/layer_27/mlp": 0.010636272840201855, "grad/layer_27/attn_mlp_ratio": 0.44829707025871635} {"step": 72800, "timestamp": 1778273229.5028896, "train/loss": 2.180536723136902, "train/z_loss": 0.0013725891592912375, "train/perplexity": 8.851055550379405, "train/grad_norm": 0.1640625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025100.4406061773, "perf/iters_per_sec": 0.9656431391745459, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035579252243042, "data/tokens_consumed": 152674762752, "data/tokens_consumed_B": 152.674762752, "train/loss_slope": 1.5483702963764114e-05} {"step": 72810, "timestamp": 1778273239.858943, "train/loss": 2.1160200238227844, "train/z_loss": 0.0013801627443172037, "train/perplexity": 8.298045654992269, "train/grad_norm": 0.1826171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026337.4600842695, "perf/iters_per_sec": 0.9662329960271213, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349470615386962, "data/tokens_consumed": 152695734272, "data/tokens_consumed_B": 152.695734272, "train/loss_slope": 1.3847157256294952e-05} {"step": 72820, "timestamp": 1778273250.2184901, "train/loss": 2.1574149131774902, "train/z_loss": 0.001353525882586837, "train/perplexity": 8.648750963009016, "train/grad_norm": 0.08203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025391.7851033227, "perf/iters_per_sec": 0.9657820630566228, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354302883148194, "data/tokens_consumed": 152716705792, "data/tokens_consumed_B": 152.716705792, "train/loss_slope": 1.2930495217509593e-05} {"step": 72825, "timestamp": 1778273255.9968607, "eos/sharpness": 3.5893559455871573, "eos/L0_probe": 1.974579095840454, "eos/L_plus": 1.9975833892822266, "eos/L_minus": 1.9874683618545532, "eos/grad_norm": 0.08423110842704773, "eos/embed_grad_frac": 0.2908765971660614, "eos/time_s": 0.6078693866729736} {"step": 72825, "timestamp": 1778273257.376138, "geo/rankme_last": 437.9378662109375, "geo/layer_0/stable_rank_q_proj": 19.15816879272461, "geo/layer_0/stable_rank_k_proj": 15.842988967895508, "geo/layer_0/stable_rank_o_proj": 46.782447814941406, "geo/layer_0/stable_rank_gate_proj": 129.8279571533203, "geo/layer_0/stable_rank_down_proj": 56.22911071777344, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06181107088923454, "geo/layer_0/attn_entropy_mean": 6.148929119110107, "geo/layer_0/attn_entropy_std": 0.4293839633464813, "geo/layer_7/stable_rank_q_proj": 42.58891296386719, "geo/layer_7/stable_rank_k_proj": 40.16617965698242, "geo/layer_7/stable_rank_o_proj": 88.82239532470703, "geo/layer_7/stable_rank_gate_proj": 78.20996856689453, "geo/layer_7/stable_rank_down_proj": 139.33921813964844, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4460063874721527, "geo/layer_7/attn_entropy_mean": 4.687692642211914, "geo/layer_7/attn_entropy_std": 0.8020662069320679, "geo/layer_14/stable_rank_q_proj": 50.16740036010742, "geo/layer_14/stable_rank_k_proj": 41.192527770996094, "geo/layer_14/stable_rank_o_proj": 43.568153381347656, "geo/layer_14/stable_rank_gate_proj": 71.06524658203125, "geo/layer_14/stable_rank_down_proj": 126.74934387207031, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.40288302302360535, "geo/layer_14/attn_entropy_mean": 5.558845043182373, "geo/layer_14/attn_entropy_std": 0.4305838346481323, "geo/layer_21/stable_rank_q_proj": 39.793373107910156, "geo/layer_21/stable_rank_k_proj": 30.082664489746094, "geo/layer_21/stable_rank_o_proj": 68.66532897949219, "geo/layer_21/stable_rank_gate_proj": 64.29047393798828, "geo/layer_21/stable_rank_down_proj": 49.83802032470703, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14269766211509705, "geo/layer_21/attn_entropy_mean": 5.689159393310547, "geo/layer_21/attn_entropy_std": 0.31120678782463074, "geo/layer_27/stable_rank_q_proj": 43.77902603149414, "geo/layer_27/stable_rank_k_proj": 32.21590805053711, "geo/layer_27/stable_rank_o_proj": 115.277099609375, "geo/layer_27/stable_rank_gate_proj": 77.86038208007812, "geo/layer_27/stable_rank_down_proj": 127.07909393310547, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10558561235666275, "geo/layer_27/attn_entropy_mean": 4.153512477874756, "geo/layer_27/attn_entropy_std": 0.7705847024917603, "attnres/final_alpha/block_0": 0.23688460886478424, "attnres/block_norm/0": 1.77413809299469, "attnres/final_alpha/block_1": 0.004147539846599102, "attnres/block_norm/1": 48446.90625, "attnres/final_alpha/block_2": 0.010017944499850273, "attnres/block_norm/2": 29077.7109375, "attnres/final_alpha/block_3": 0.011884266510605812, "attnres/block_norm/3": 61706.078125, "attnres/final_alpha/block_4": 0.014057237654924393, "attnres/block_norm/4": 15780.130859375, "attnres/final_alpha/block_5": 0.6169314980506897, "attnres/block_norm/5": 6896.09814453125, "attnres/final_alpha/block_6": 0.1060769259929657, "attnres/block_norm/6": 41233.6875, "geo/tier1_time_s": 1.3584210872650146, "geo/step": 72825.0, "geo/rankme_slope": -0.0001748302055197079} {"step": 72830, "timestamp": 1778273262.5605257, "train/loss": 2.1147193193435667, "train/z_loss": 0.0013739418005570768, "train/perplexity": 8.287259366247826, "train/grad_norm": 0.1337890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1700392.5712203907, "perf/iters_per_sec": 0.810810361490436, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2333340167999267, "data/tokens_consumed": 152737677312, "data/tokens_consumed_B": 152.737677312, "train/loss_slope": 1.0950497861313397e-05} {"step": 72840, "timestamp": 1778273272.9126, "train/loss": 2.1210842609405516, "train/z_loss": 0.0013935827068053187, "train/perplexity": 8.340175513559732, "train/grad_norm": 0.091796875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026761.4526785815, "perf/iters_per_sec": 0.9664351714508922, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347305536270142, "data/tokens_consumed": 152758648832, "data/tokens_consumed_B": 152.758648832, "train/loss_slope": 1.180748448084325e-05} {"step": 72850, "timestamp": 1778273283.253519, "grad/layer_0/attn": 0.002772266510874033, "grad/layer_0/mlp": 0.0029525409918278456, "grad/layer_0/attn_mlp_ratio": 0.9389425666410594, "grad/layer_4/attn": 0.002537157153710723, "grad/layer_4/mlp": 0.002478433772921562, "grad/layer_4/attn_mlp_ratio": 1.0236937048959625, "grad/layer_8/attn": 0.00314279249869287, "grad/layer_8/mlp": 0.0036327247507870197, "grad/layer_8/attn_mlp_ratio": 0.8651336469955859, "grad/layer_12/attn": 0.0047021834179759026, "grad/layer_12/mlp": 0.00720300292596221, "grad/layer_12/attn_mlp_ratio": 0.6528087522700664, "grad/layer_16/attn": 0.006075420882552862, "grad/layer_16/mlp": 0.00455944798886776, "grad/layer_16/attn_mlp_ratio": 1.3324904164138782, "grad/layer_20/attn": 0.0035755226854234934, "grad/layer_20/mlp": 0.00642047356814146, "grad/layer_20/attn_mlp_ratio": 0.5568939100498644, "grad/layer_24/attn": 0.01795424520969391, "grad/layer_24/mlp": 0.011926467530429363, "grad/layer_24/attn_mlp_ratio": 1.505411808932025, "grad/layer_27/attn": 0.007473888341337442, "grad/layer_27/mlp": 0.011315534822642803, "grad/layer_27/attn_mlp_ratio": 0.6604980137865083} {"step": 72850, "timestamp": 1778273283.2694943, "train/loss": 2.086947965621948, "train/z_loss": 0.0013798279920592903, "train/perplexity": 8.060277342145014, "train/grad_norm": 0.21484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025821.6806155005, "perf/iters_per_sec": 0.965987053210974, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352105617523193, "data/tokens_consumed": 152779620352, "data/tokens_consumed_B": 152.779620352, "train/loss_slope": 9.320510596153129e-06} {"step": 72860, "timestamp": 1778273293.6265817, "train/loss": 2.1163203954696654, "train/z_loss": 0.0013729930389672518, "train/perplexity": 8.300538527006847, "train/grad_norm": 0.28515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026209.9375028189, "perf/iters_per_sec": 0.9661721885217757, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350121974945068, "data/tokens_consumed": 152800591872, "data/tokens_consumed_B": 152.800591872, "train/loss_slope": 5.718195005611985e-06} {"step": 72870, "timestamp": 1778273303.9832435, "train/loss": 2.178273415565491, "train/z_loss": 0.0013655711780302226, "train/perplexity": 8.831045542280542, "train/grad_norm": 0.2197265625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025813.1891591123, "perf/iters_per_sec": 0.9659830041690408, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035214900970459, "data/tokens_consumed": 152821563392, "data/tokens_consumed_B": 152.821563392, "train/loss_slope": 1.1897819306161543e-05} {"step": 72880, "timestamp": 1778273314.3355508, "train/loss": 2.137053656578064, "train/z_loss": 0.001378245826344937, "train/perplexity": 8.474432222085218, "train/grad_norm": 0.1220703125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026832.0186042592, "perf/iters_per_sec": 0.9664688199063584, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034694528579712, "data/tokens_consumed": 152842534912, "data/tokens_consumed_B": 152.842534912, "train/loss_slope": 1.211777248434069e-05} {"step": 72890, "timestamp": 1778273324.690001, "train/loss": 2.1941707372665404, "train/z_loss": 0.0013844348723068833, "train/perplexity": 8.972557363411864, "train/grad_norm": 0.21484375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026742.6328149848, "perf/iters_per_sec": 0.966426197440617, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034740161895752, "data/tokens_consumed": 152863506432, "data/tokens_consumed_B": 152.863506432, "train/loss_slope": 1.3836092366637142e-05} {"step": 72900, "timestamp": 1778273335.0331185, "grad/layer_0/attn": 0.003020006697624922, "grad/layer_0/mlp": 0.0032114749774336815, "grad/layer_0/attn_mlp_ratio": 0.9403799266093749, "grad/layer_4/attn": 0.002799217589199543, "grad/layer_4/mlp": 0.00261197448708117, "grad/layer_4/attn_mlp_ratio": 1.0716863797390974, "grad/layer_8/attn": 0.002745425794273615, "grad/layer_8/mlp": 0.0035811876878142357, "grad/layer_8/attn_mlp_ratio": 0.7666243595534193, "grad/layer_12/attn": 0.006005074363201857, "grad/layer_12/mlp": 0.00671354541555047, "grad/layer_12/attn_mlp_ratio": 0.8944713861390239, "grad/layer_16/attn": 0.003851447720080614, "grad/layer_16/mlp": 0.004926712717860937, "grad/layer_16/attn_mlp_ratio": 0.7817479651174031, "grad/layer_20/attn": 0.005665320437401533, "grad/layer_20/mlp": 0.0057486011646687984, "grad/layer_20/attn_mlp_ratio": 0.9855128537477259, "grad/layer_24/attn": 0.008573496714234352, "grad/layer_24/mlp": 0.008537176996469498, "grad/layer_24/attn_mlp_ratio": 1.004254288900704, "grad/layer_27/attn": 0.004932768177241087, "grad/layer_27/mlp": 0.00760338781401515, "grad/layer_27/attn_mlp_ratio": 0.6487592416728639} {"step": 72900, "timestamp": 1778273335.656471, "eos/sharpness": 47.894644737243645, "eos/L0_probe": 1.9706125259399414, "eos/L_plus": 2.2330281734466553, "eos/L_minus": 2.187143325805664, "eos/grad_norm": 0.1166393905878067, "eos/embed_grad_frac": 0.16907799243927002, "eos/time_s": 0.6206037998199463} {"step": 72900, "timestamp": 1778273335.6756306, "train/loss": 2.112163209915161, "train/z_loss": 0.0013816897990182042, "train/perplexity": 8.266103274607724, "train/grad_norm": 0.11669921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1909946.7922174877, "perf/iters_per_sec": 0.9107336007201613, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0980159282684325, "data/tokens_consumed": 152884477952, "data/tokens_consumed_B": 152.884477952, "train/loss_slope": 9.341261975585637e-06} {"step": 72900, "timestamp": 1778273337.0407298, "geo/rankme_last": 437.46783447265625, "geo/layer_0/stable_rank_q_proj": 19.1646785736084, "geo/layer_0/stable_rank_k_proj": 15.865135192871094, "geo/layer_0/stable_rank_o_proj": 46.75532150268555, "geo/layer_0/stable_rank_gate_proj": 130.05740356445312, "geo/layer_0/stable_rank_down_proj": 56.23129653930664, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06458194553852081, "geo/layer_0/attn_entropy_mean": 6.144343376159668, "geo/layer_0/attn_entropy_std": 0.43106088042259216, "geo/layer_7/stable_rank_q_proj": 42.65604019165039, "geo/layer_7/stable_rank_k_proj": 40.08222961425781, "geo/layer_7/stable_rank_o_proj": 88.7733154296875, "geo/layer_7/stable_rank_gate_proj": 78.11843872070312, "geo/layer_7/stable_rank_down_proj": 139.3180389404297, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4438167214393616, "geo/layer_7/attn_entropy_mean": 4.65701961517334, "geo/layer_7/attn_entropy_std": 0.8037915229797363, "geo/layer_14/stable_rank_q_proj": 50.225624084472656, "geo/layer_14/stable_rank_k_proj": 41.28334045410156, "geo/layer_14/stable_rank_o_proj": 43.49916458129883, "geo/layer_14/stable_rank_gate_proj": 71.03739166259766, "geo/layer_14/stable_rank_down_proj": 126.94816589355469, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.40416085720062256, "geo/layer_14/attn_entropy_mean": 5.531918525695801, "geo/layer_14/attn_entropy_std": 0.4175819754600525, "geo/layer_21/stable_rank_q_proj": 39.80464553833008, "geo/layer_21/stable_rank_k_proj": 30.12700080871582, "geo/layer_21/stable_rank_o_proj": 68.7198257446289, "geo/layer_21/stable_rank_gate_proj": 64.24849700927734, "geo/layer_21/stable_rank_down_proj": 49.87542724609375, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14565084874629974, "geo/layer_21/attn_entropy_mean": 5.693127155303955, "geo/layer_21/attn_entropy_std": 0.30634599924087524, "geo/layer_27/stable_rank_q_proj": 43.78492736816406, "geo/layer_27/stable_rank_k_proj": 32.16828918457031, "geo/layer_27/stable_rank_o_proj": 115.22345733642578, "geo/layer_27/stable_rank_gate_proj": 77.85124206542969, "geo/layer_27/stable_rank_down_proj": 127.18521118164062, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10059516876935959, "geo/layer_27/attn_entropy_mean": 4.170779228210449, "geo/layer_27/attn_entropy_std": 0.7728581428527832, "attnres/final_alpha/block_0": 0.23579785227775574, "attnres/block_norm/0": 1.774308681488037, "attnres/final_alpha/block_1": 0.004083677660673857, "attnres/block_norm/1": 48392.25, "attnres/final_alpha/block_2": 0.009937034919857979, "attnres/block_norm/2": 29019.845703125, "attnres/final_alpha/block_3": 0.011749282479286194, "attnres/block_norm/3": 61957.69921875, "attnres/final_alpha/block_4": 0.013798852451145649, "attnres/block_norm/4": 15725.861328125, "attnres/final_alpha/block_5": 0.6196694374084473, "attnres/block_norm/5": 6842.85205078125, "attnres/final_alpha/block_6": 0.10496388375759125, "attnres/block_norm/6": 41256.3984375, "geo/tier1_time_s": 1.361321210861206, "geo/step": 72900.0, "geo/rankme_slope": -0.00020982074470413166} {"step": 72910, "timestamp": 1778273347.3981898, "train/loss": 2.139811646938324, "train/z_loss": 0.0013622346683405339, "train/perplexity": 8.497836884543482, "train/grad_norm": 0.251953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1789557.1820965447, "perf/iters_per_sec": 0.8533273611529086, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1718832015991212, "data/tokens_consumed": 152905449472, "data/tokens_consumed_B": 152.905449472, "train/loss_slope": 7.2653441467765534e-06} {"step": 72920, "timestamp": 1778273357.7527914, "train/loss": 2.099826788902283, "train/z_loss": 0.001377368497196585, "train/perplexity": 8.164755563806937, "train/grad_norm": 0.171875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026604.8334954146, "perf/iters_per_sec": 0.9663604896046708, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034810519218445, "data/tokens_consumed": 152926420992, "data/tokens_consumed_B": 152.926420992, "train/loss_slope": 6.995869462568027e-06} {"step": 72930, "timestamp": 1778273368.1134138, "train/loss": 2.1776742458343508, "train/z_loss": 0.0013636628165841103, "train/perplexity": 8.825755831972693, "train/grad_norm": 0.1953125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025044.5407722497, "perf/iters_per_sec": 0.9656164840565918, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0356078386306762, "data/tokens_consumed": 152947392512, "data/tokens_consumed_B": 152.947392512, "train/loss_slope": 8.018461891812944e-06} {"step": 72940, "timestamp": 1778273378.4676077, "train/loss": 2.174272894859314, "train/z_loss": 0.001393040968105197, "train/perplexity": 8.795787334350074, "train/grad_norm": 0.1904296875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026410.3306688408, "perf/iters_per_sec": 0.9662677434295849, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349098443984985, "data/tokens_consumed": 152968364032, "data/tokens_consumed_B": 152.968364032, "train/loss_slope": 1.204142411692949e-05} {"step": 72950, "timestamp": 1778273388.8151886, "grad/layer_0/attn": 0.0026368778198957443, "grad/layer_0/mlp": 0.00300997169688344, "grad/layer_0/attn_mlp_ratio": 0.8760473512163797, "grad/layer_4/attn": 0.0019405132625252008, "grad/layer_4/mlp": 0.0025881524197757244, "grad/layer_4/attn_mlp_ratio": 0.7497677388399643, "grad/layer_8/attn": 0.0032069433946162462, "grad/layer_8/mlp": 0.003597014583647251, "grad/layer_8/attn_mlp_ratio": 0.8915569372556784, "grad/layer_12/attn": 0.004839907865971327, "grad/layer_12/mlp": 0.006793837063014507, "grad/layer_12/attn_mlp_ratio": 0.7123968016660267, "grad/layer_16/attn": 0.004341035149991512, "grad/layer_16/mlp": 0.004656072705984116, "grad/layer_16/attn_mlp_ratio": 0.932338331224609, "grad/layer_20/attn": 0.0036504368763417006, "grad/layer_20/mlp": 0.005634598899632692, "grad/layer_20/attn_mlp_ratio": 0.6478609882583775, "grad/layer_24/attn": 0.008900901302695274, "grad/layer_24/mlp": 0.007696503773331642, "grad/layer_24/attn_mlp_ratio": 1.1564863019866547, "grad/layer_27/attn": 0.004302619025111198, "grad/layer_27/mlp": 0.00665021687746048, "grad/layer_27/attn_mlp_ratio": 0.6469892696274462} {"step": 72950, "timestamp": 1778273388.831032, "train/loss": 2.1454302787780763, "train/z_loss": 0.001366108877118677, "train/perplexity": 8.545717487209396, "train/grad_norm": 0.0908203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025016.055888747, "perf/iters_per_sec": 0.9656029014056907, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0356224060058594, "data/tokens_consumed": 152989335552, "data/tokens_consumed_B": 152.989335552, "train/loss_slope": 1.0296041279485748e-05} {"step": 72960, "timestamp": 1778273399.1833797, "train/loss": 2.1318085551261903, "train/z_loss": 0.001372002041898668, "train/perplexity": 8.430099332222973, "train/grad_norm": 0.158203125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026827.021384988, "perf/iters_per_sec": 0.9664664370465221, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346970796585082, "data/tokens_consumed": 153010307072, "data/tokens_consumed_B": 153.010307072, "train/loss_slope": 1.0366300216065915e-05} {"step": 72970, "timestamp": 1778273409.5434508, "train/loss": 2.202054500579834, "train/z_loss": 0.0013598902383819222, "train/perplexity": 9.043574455120053, "train/grad_norm": 0.212890625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025149.2097875834, "perf/iters_per_sec": 0.9656663941324155, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0355543136596679, "data/tokens_consumed": 153031278592, "data/tokens_consumed_B": 153.031278592, "train/loss_slope": 1.17823355769453e-05} {"step": 72975, "timestamp": 1778273415.32972, "eos/sharpness": 33.174514770507805, "eos/L0_probe": 1.968113660812378, "eos/L_plus": 2.120265483856201, "eos/L_minus": 2.147706985473633, "eos/grad_norm": 0.09588596969842911, "eos/embed_grad_frac": 0.22084057331085205, "eos/time_s": 0.6195063591003418} {"step": 72975, "timestamp": 1778273416.7139611, "geo/rankme_last": 437.8781433105469, "geo/layer_0/stable_rank_q_proj": 19.158300399780273, "geo/layer_0/stable_rank_k_proj": 15.816791534423828, "geo/layer_0/stable_rank_o_proj": 46.797035217285156, "geo/layer_0/stable_rank_gate_proj": 129.86195373535156, "geo/layer_0/stable_rank_down_proj": 56.33715057373047, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06392166763544083, "geo/layer_0/attn_entropy_mean": 6.145658493041992, "geo/layer_0/attn_entropy_std": 0.4295092225074768, "geo/layer_7/stable_rank_q_proj": 42.67000961303711, "geo/layer_7/stable_rank_k_proj": 40.104454040527344, "geo/layer_7/stable_rank_o_proj": 88.73712921142578, "geo/layer_7/stable_rank_gate_proj": 78.11570739746094, "geo/layer_7/stable_rank_down_proj": 139.27259826660156, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4430221915245056, "geo/layer_7/attn_entropy_mean": 4.6521196365356445, "geo/layer_7/attn_entropy_std": 0.8036858439445496, "geo/layer_14/stable_rank_q_proj": 50.18016815185547, "geo/layer_14/stable_rank_k_proj": 41.34148025512695, "geo/layer_14/stable_rank_o_proj": 43.505062103271484, "geo/layer_14/stable_rank_gate_proj": 70.95166015625, "geo/layer_14/stable_rank_down_proj": 126.95609283447266, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3897581696510315, "geo/layer_14/attn_entropy_mean": 5.546687602996826, "geo/layer_14/attn_entropy_std": 0.4291267991065979, "geo/layer_21/stable_rank_q_proj": 39.77030563354492, "geo/layer_21/stable_rank_k_proj": 30.218114852905273, "geo/layer_21/stable_rank_o_proj": 68.69596862792969, "geo/layer_21/stable_rank_gate_proj": 64.29214477539062, "geo/layer_21/stable_rank_down_proj": 49.821964263916016, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1421297788619995, "geo/layer_21/attn_entropy_mean": 5.685298919677734, "geo/layer_21/attn_entropy_std": 0.3169211745262146, "geo/layer_27/stable_rank_q_proj": 43.814144134521484, "geo/layer_27/stable_rank_k_proj": 32.22122573852539, "geo/layer_27/stable_rank_o_proj": 115.18163299560547, "geo/layer_27/stable_rank_gate_proj": 77.87346649169922, "geo/layer_27/stable_rank_down_proj": 127.19883728027344, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09980949759483337, "geo/layer_27/attn_entropy_mean": 4.1552276611328125, "geo/layer_27/attn_entropy_std": 0.7688401341438293, "attnres/final_alpha/block_0": 0.23603199422359467, "attnres/block_norm/0": 1.7743157148361206, "attnres/final_alpha/block_1": 0.004100731573998928, "attnres/block_norm/1": 48423.546875, "attnres/final_alpha/block_2": 0.00994716864079237, "attnres/block_norm/2": 28907.63671875, "attnres/final_alpha/block_3": 0.01175293792039156, "attnres/block_norm/3": 61578.15625, "attnres/final_alpha/block_4": 0.013948849402368069, "attnres/block_norm/4": 15768.4501953125, "attnres/final_alpha/block_5": 0.6181407570838928, "attnres/block_norm/5": 6891.49658203125, "attnres/final_alpha/block_6": 0.10607751458883286, "attnres/block_norm/6": 41356.19921875, "geo/tier1_time_s": 1.3645682334899902, "geo/step": 72975.0, "geo/rankme_slope": -0.00023043799551070428} {"step": 72980, "timestamp": 1778273421.8966193, "train/loss": 2.119768214225769, "train/z_loss": 0.0013719253125600516, "train/perplexity": 8.329206672311297, "train/grad_norm": 0.11669921875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1698562.200880779, "perf/iters_per_sec": 0.8099375728992362, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2346630573272706, "data/tokens_consumed": 153052250112, "data/tokens_consumed_B": 153.052250112, "train/loss_slope": 9.182443541519083e-06} {"step": 72990, "timestamp": 1778273432.2486248, "train/loss": 2.1819877862930297, "train/z_loss": 0.0013573499745689332, "train/perplexity": 8.863908313811923, "train/grad_norm": 0.1611328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026787.6049303692, "perf/iters_per_sec": 0.9664476418163153, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347172021865845, "data/tokens_consumed": 153073221632, "data/tokens_consumed_B": 153.073221632, "train/loss_slope": 1.4413264961597475e-05} {"step": 73000, "timestamp": 1778273442.5930066, "grad/layer_0/attn": 0.002782916184514761, "grad/layer_0/mlp": 0.0031098434701561928, "grad/layer_0/attn_mlp_ratio": 0.8948733663716044, "grad/layer_4/attn": 0.0021498745772987604, "grad/layer_4/mlp": 0.002595246070995927, "grad/layer_4/attn_mlp_ratio": 0.8283894612100497, "grad/layer_8/attn": 0.007755598984658718, "grad/layer_8/mlp": 0.003624043194577098, "grad/layer_8/attn_mlp_ratio": 2.140040378729456, "grad/layer_12/attn": 0.004899770952761173, "grad/layer_12/mlp": 0.007830708287656307, "grad/layer_12/attn_mlp_ratio": 0.6257123506839782, "grad/layer_16/attn": 0.003837511409074068, "grad/layer_16/mlp": 0.004754312802106142, "grad/layer_16/attn_mlp_ratio": 0.8071642502482463, "grad/layer_20/attn": 0.0044067553244531155, "grad/layer_20/mlp": 0.0064762057736516, "grad/layer_20/attn_mlp_ratio": 0.6804532484648103, "grad/layer_24/attn": 0.014475961215794086, "grad/layer_24/mlp": 0.011434957385063171, "grad/layer_24/attn_mlp_ratio": 1.2659392249339978, "grad/layer_27/attn": 0.013883199542760849, "grad/layer_27/mlp": 0.011404689401388168, "grad/layer_27/attn_mlp_ratio": 1.217323763270451} {"step": 73000, "timestamp": 1778273442.6090112, "train/loss": 2.1083441615104674, "train/z_loss": 0.001363401545677334, "train/perplexity": 8.234594830516594, "train/grad_norm": 0.21875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2025157.0895411344, "perf/iters_per_sec": 0.9656701514917061, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0355502843856812, "data/tokens_consumed": 153094193152, "data/tokens_consumed_B": 153.094193152, "train/loss_slope": 9.04492851209253e-06} {"step": 73000, "timestamp": 1778273449.9342966, "geo/ww_alpha_mean": 7.707150058081421, "geo/ww_alpha_std": 4.827551870408704, "geo/ww_alpha_min": 1.344873490631174, "geo/ww_alpha_max": 36.662311664225555, "geo/ww_alpha_healthy_frac": 0.17766497461928935, "geo/ww_alpha_by_type/q_proj": 3.937256021589753, "geo/ww_alpha_by_type/k_proj": 4.549492895815706, "geo/ww_alpha_by_type/v_proj": 7.890533179194857, "geo/ww_alpha_by_type/o_proj": 8.739130612584699, "geo/ww_alpha_by_type/gate_proj": 7.923641932233635, "geo/ww_alpha_by_type/up_proj": 12.871064252871337, "geo/ww_alpha_by_type/down_proj": 8.13010139407265, "geo/twonn_id/layer_0": 0.6743676662445068, "geo/twonn_id/layer_7": 3.0158531665802, "geo/twonn_id/layer_14": 5.51018762588501, "geo/twonn_id/layer_21": 7.971231460571289, "geo/twonn_id/layer_27": 5.267942428588867, "geo/tier2_time_s": 7.319581508636475} {"step": 73000, "timestamp": 1778273450.5916731, "eoc/jacobian_sigma/layer_0/attn": 1349.467041015625, "eoc/jacobian_sigma/layer_0/mlp": 8828.0625, "eoc/jacobian_sigma/layer_0": 8828.0625, "eoc/jacobian_sigma/layer_7/attn": 1.1542270183563232, "eoc/jacobian_sigma/layer_7/mlp": 1.7556428909301758, "eoc/jacobian_sigma/layer_7": 1.7556428909301758, "eoc/jacobian_sigma/layer_14/attn": 1.4578557014465332, "eoc/jacobian_sigma/layer_14/mlp": 8.828564643859863, "eoc/jacobian_sigma/layer_14": 8.828564643859863, "eoc/jacobian_sigma/layer_21/attn": 1.1048152446746826, "eoc/jacobian_sigma/layer_21/mlp": 4.146200180053711, "eoc/jacobian_sigma/layer_21": 4.146200180053711, "eoc/jacobian_sigma/layer_27/attn": 3.13271164894104, "eoc/jacobian_sigma/layer_27/mlp": 25.299997329711914, "eoc/jacobian_sigma/layer_27": 25.299997329711914, "eoc/layer0_sigma": 8828.0625, "eoc/sigma_max": 25.299997329711914, "eoc/sigma_min": 1.7556428909301758, "eoc/sigma_mean": 10.007601261138916, "eoc/time_s": 0.6513195037841797} {"step": 73010, "timestamp": 1778273460.9648433, "train/loss": 2.1424649000167846, "train/z_loss": 0.0013821159140206874, "train/perplexity": 8.520413734220988, "train/grad_norm": 0.0888671875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1142778.418622383, "perf/iters_per_sec": 0.5449192135917582, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.8351344108581542, "data/tokens_consumed": 153115164672, "data/tokens_consumed_B": 153.115164672, "train/loss_slope": 7.767285665448552e-06} {"step": 73020, "timestamp": 1778273471.318262, "train/loss": 2.1246031522750854, "train/z_loss": 0.0013864447828382254, "train/perplexity": 8.36957538203688, "train/grad_norm": 0.11083984375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2026518.362434737, "perf/iters_per_sec": 0.9663192569898305, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348546743392943, "data/tokens_consumed": 153136136192, "data/tokens_consumed_B": 153.136136192, "train/loss_slope": 8.13276158987206e-06} {"step": 73030, "timestamp": 1778273481.6682801, "train/loss": 2.1705185174942017, "train/z_loss": 0.0013625237392261625, "train/perplexity": 8.762826541816509, "train/grad_norm": 0.1611328125, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027464.6226511272, "perf/iters_per_sec": 0.9667704690223347, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343716859817504, "data/tokens_consumed": 153157107712, "data/tokens_consumed_B": 153.157107712, "train/loss_slope": 7.530959261239853e-06} {"step": 73040, "timestamp": 1778273492.0137577, "train/loss": 2.170752763748169, "train/z_loss": 0.0013775194063782692, "train/perplexity": 8.764879441540742, "train/grad_norm": 0.1357421875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2028094.1613704902, "perf/iters_per_sec": 0.9670706564762546, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340506076812743, "data/tokens_consumed": 153178079232, "data/tokens_consumed_B": 153.178079232, "train/loss_slope": 9.51001636265922e-06} {"step": 73050, "timestamp": 1778273502.3502102, "grad/layer_0/attn": 0.0026357632596045732, "grad/layer_0/mlp": 0.0029879985377192497, "grad/layer_0/attn_mlp_ratio": 0.8821166202460056, "grad/layer_4/attn": 0.004233349114656448, "grad/layer_4/mlp": 0.0025548525154590607, "grad/layer_4/attn_mlp_ratio": 1.6569836901905952, "grad/layer_8/attn": 0.004128559958189726, "grad/layer_8/mlp": 0.0036527481861412525, "grad/layer_8/attn_mlp_ratio": 1.1302612813080308, "grad/layer_12/attn": 0.005544467829167843, "grad/layer_12/mlp": 0.006850885227322578, "grad/layer_12/attn_mlp_ratio": 0.8093067631792779, "grad/layer_16/attn": 0.003672647988423705, "grad/layer_16/mlp": 0.004687927663326263, "grad/layer_16/attn_mlp_ratio": 0.7834267450012544, "grad/layer_20/attn": 0.0036272320430725813, "grad/layer_20/mlp": 0.00684506306424737, "grad/layer_20/attn_mlp_ratio": 0.5299048315606602, "grad/layer_24/attn": 0.01149834506213665, "grad/layer_24/mlp": 0.011220569722354412, "grad/layer_24/attn_mlp_ratio": 1.0247558942353208, "grad/layer_27/attn": 0.007998393848538399, "grad/layer_27/mlp": 0.010767759755253792, "grad/layer_27/attn_mlp_ratio": 0.7428094567539815} {"step": 73050, "timestamp": 1778273502.9692907, "eos/sharpness": 58.932209014892564, "eos/L0_probe": 1.9704084396362305, "eos/L_plus": 2.2566261291503906, "eos/L_minus": 2.273512840270996, "eos/grad_norm": 0.16666631400585175, "eos/embed_grad_frac": 0.08021050691604614, "eos/time_s": 0.6163780689239502} {"step": 73050, "timestamp": 1778273502.9905136, "train/loss": 2.1564740180969237, "train/z_loss": 0.001365047402214259, "train/perplexity": 8.640617222872972, "train/grad_norm": 0.166015625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1911381.8395325865, "perf/iters_per_sec": 0.91141788460378, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0971915483474732, "data/tokens_consumed": 153199050752, "data/tokens_consumed_B": 153.199050752, "train/loss_slope": 9.9984729679862e-06} {"step": 73050, "timestamp": 1778273504.3529897, "geo/rankme_last": 437.955810546875, "geo/layer_0/stable_rank_q_proj": 19.1475830078125, "geo/layer_0/stable_rank_k_proj": 15.807082176208496, "geo/layer_0/stable_rank_o_proj": 46.7994499206543, "geo/layer_0/stable_rank_gate_proj": 129.9137420654297, "geo/layer_0/stable_rank_down_proj": 56.34275817871094, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06152366101741791, "geo/layer_0/attn_entropy_mean": 6.143301010131836, "geo/layer_0/attn_entropy_std": 0.43014681339263916, "geo/layer_7/stable_rank_q_proj": 42.674888610839844, "geo/layer_7/stable_rank_k_proj": 40.16523361206055, "geo/layer_7/stable_rank_o_proj": 88.72064971923828, "geo/layer_7/stable_rank_gate_proj": 77.9809799194336, "geo/layer_7/stable_rank_down_proj": 139.470458984375, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4434399902820587, "geo/layer_7/attn_entropy_mean": 4.641813278198242, "geo/layer_7/attn_entropy_std": 0.8063793182373047, "geo/layer_14/stable_rank_q_proj": 50.24545669555664, "geo/layer_14/stable_rank_k_proj": 41.31272888183594, "geo/layer_14/stable_rank_o_proj": 43.53639602661133, "geo/layer_14/stable_rank_gate_proj": 70.94255065917969, "geo/layer_14/stable_rank_down_proj": 126.87557220458984, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3887985348701477, "geo/layer_14/attn_entropy_mean": 5.537008285522461, "geo/layer_14/attn_entropy_std": 0.4286612272262573, "geo/layer_21/stable_rank_q_proj": 39.80870819091797, "geo/layer_21/stable_rank_k_proj": 30.19588279724121, "geo/layer_21/stable_rank_o_proj": 68.63019561767578, "geo/layer_21/stable_rank_gate_proj": 64.28014373779297, "geo/layer_21/stable_rank_down_proj": 49.763343811035156, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14125129580497742, "geo/layer_21/attn_entropy_mean": 5.689479827880859, "geo/layer_21/attn_entropy_std": 0.30275195837020874, "geo/layer_27/stable_rank_q_proj": 43.80167007446289, "geo/layer_27/stable_rank_k_proj": 32.2418212890625, "geo/layer_27/stable_rank_o_proj": 115.04729461669922, "geo/layer_27/stable_rank_gate_proj": 77.83929443359375, "geo/layer_27/stable_rank_down_proj": 127.29702758789062, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10258343815803528, "geo/layer_27/attn_entropy_mean": 4.165839672088623, "geo/layer_27/attn_entropy_std": 0.7760300636291504, "attnres/final_alpha/block_0": 0.23806023597717285, "attnres/block_norm/0": 1.7743066549301147, "attnres/final_alpha/block_1": 0.00406538276001811, "attnres/block_norm/1": 48526.3984375, "attnres/final_alpha/block_2": 0.010028330609202385, "attnres/block_norm/2": 29067.470703125, "attnres/final_alpha/block_3": 0.011961647309362888, "attnres/block_norm/3": 61733.53515625, "attnres/final_alpha/block_4": 0.014089731499552727, "attnres/block_norm/4": 15811.09375, "attnres/final_alpha/block_5": 0.6143265962600708, "attnres/block_norm/5": 6906.7880859375, "attnres/final_alpha/block_6": 0.10746802389621735, "attnres/block_norm/6": 41053.6171875, "geo/tier1_time_s": 1.3586053848266602, "geo/step": 73050.0, "geo/rankme_slope": -0.00022555711737820127} {"step": 73060, "timestamp": 1778273514.702977, "train/loss": 2.1344838738441467, "train/z_loss": 0.0013773532817140221, "train/perplexity": 8.452682730184607, "train/grad_norm": 0.1748046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 1791112.2963014126, "perf/iters_per_sec": 0.8540688973910392, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1708657264709472, "data/tokens_consumed": 153220022272, "data/tokens_consumed_B": 153.220022272, "train/loss_slope": 1.26186134076283e-05} {"step": 73070, "timestamp": 1778273525.0494936, "train/loss": 2.147107458114624, "train/z_loss": 0.0013724189018830657, "train/perplexity": 8.560062213972015, "train/grad_norm": 0.2177734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027890.2559193033, "perf/iters_per_sec": 0.9669734267803685, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341545820236206, "data/tokens_consumed": 153240993792, "data/tokens_consumed_B": 153.240993792, "train/loss_slope": 1.0081738464259346e-05} {"step": 73080, "timestamp": 1778273535.4114125, "train/loss": 2.16838743686676, "train/z_loss": 0.0013795862207189203, "train/perplexity": 8.744172136012947, "train/grad_norm": 0.1943359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024887.9537494332, "perf/iters_per_sec": 0.9655418175456205, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0356879234313965, "data/tokens_consumed": 153261965312, "data/tokens_consumed_B": 153.261965312, "train/loss_slope": 1.4786658586055151e-05} {"step": 73090, "timestamp": 1778273545.7607703, "train/loss": 2.15914546251297, "train/z_loss": 0.0013684183009900152, "train/perplexity": 8.663731011358616, "train/grad_norm": 0.103515625, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027772.5415178924, "perf/iters_per_sec": 0.9669172961797201, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342146158218384, "data/tokens_consumed": 153282936832, "data/tokens_consumed_B": 153.282936832, "train/loss_slope": 1.605491484388232e-05} {"step": 73100, "timestamp": 1778273556.0946915, "grad/layer_0/attn": 0.002737408736720681, "grad/layer_0/mlp": 0.0029812364373356104, "grad/layer_0/attn_mlp_ratio": 0.918212527734266, "grad/layer_4/attn": 0.002150386106222868, "grad/layer_4/mlp": 0.0026649259962141514, "grad/layer_4/attn_mlp_ratio": 0.8069214787148321, "grad/layer_8/attn": 0.0035463289823383093, "grad/layer_8/mlp": 0.0035094753839075565, "grad/layer_8/attn_mlp_ratio": 1.010501141438298, "grad/layer_12/attn": 0.004374820273369551, "grad/layer_12/mlp": 0.007670278195291758, "grad/layer_12/attn_mlp_ratio": 0.5703600449614643, "grad/layer_16/attn": 0.0050174579955637455, "grad/layer_16/mlp": 0.004451829940080643, "grad/layer_16/attn_mlp_ratio": 1.1270551549341845, "grad/layer_20/attn": 0.003089732490479946, "grad/layer_20/mlp": 0.006300339009612799, "grad/layer_20/attn_mlp_ratio": 0.4904073315301012, "grad/layer_24/attn": 0.016908245161175728, "grad/layer_24/mlp": 0.013135865330696106, "grad/layer_24/attn_mlp_ratio": 1.287181666893775, "grad/layer_27/attn": 0.01135531347244978, "grad/layer_27/mlp": 0.013981925323605537, "grad/layer_27/attn_mlp_ratio": 0.8121423286437163} {"step": 73100, "timestamp": 1778273556.1102524, "train/loss": 2.165716814994812, "train/z_loss": 0.0013715909444727004, "train/perplexity": 8.720850913598769, "train/grad_norm": 0.248046875, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027261.9183885439, "perf/iters_per_sec": 0.9666738120978088, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344751119613647, "data/tokens_consumed": 153303908352, "data/tokens_consumed_B": 153.303908352, "train/loss_slope": 1.6732147260002652e-05} {"step": 73110, "timestamp": 1778273566.4634037, "train/loss": 2.122319447994232, "train/z_loss": 0.0013744356925599276, "train/perplexity": 8.350483555248754, "train/grad_norm": 0.193359375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2027093.7765717357, "perf/iters_per_sec": 0.9665936358317069, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345609188079834, "data/tokens_consumed": 153324879872, "data/tokens_consumed_B": 153.324879872, "train/loss_slope": 1.7476977187522062e-05} {"step": 73120, "timestamp": 1778273576.8278592, "train/loss": 2.1297078847885134, "train/z_loss": 0.001376883639022708, "train/perplexity": 8.412409059832665, "train/grad_norm": 0.27734375, "optim/muon_lr": 0.02, "optim/adamw_lr": 0.0006, "perf/tokens_per_sec": 2024864.4141803007, "perf/iters_per_sec": 0.96553059300437, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0356999635696411, "data/tokens_consumed": 153345851392, "data/tokens_consumed_B": 153.345851392, "train/loss_slope": 1.8106905311712673e-05} {"step": 73125, "timestamp": 1778273582.6055784, "eos/sharpness": 28.602480888366692, "eos/L0_probe": 1.9649055004119873, "eos/L_plus": 2.090630531311035, "eos/L_minus": 2.1252052783966064, "eos/grad_norm": 0.10941310971975327, "eos/embed_grad_frac": 0.19889767467975616, "eos/time_s": 0.6139414310455322} {"step": 73125, "timestamp": 1778273583.9883826, "geo/rankme_last": 438.17181396484375, "geo/layer_0/stable_rank_q_proj": 19.138439178466797, "geo/layer_0/stable_rank_k_proj": 15.803383827209473, "geo/layer_0/stable_rank_o_proj": 46.7837028503418, "geo/layer_0/stable_rank_gate_proj": 129.8353271484375, "geo/layer_0/stable_rank_down_proj": 56.360836029052734, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06784703582525253, "geo/layer_0/attn_entropy_mean": 6.145018577575684, "geo/layer_0/attn_entropy_std": 0.4309031367301941, "geo/layer_7/stable_rank_q_proj": 42.64448928833008, "geo/layer_7/stable_rank_k_proj": 40.12200164794922, "geo/layer_7/stable_rank_o_proj": 88.55120849609375, "geo/layer_7/stable_rank_gate_proj": 77.75676727294922, "geo/layer_7/stable_rank_down_proj": 139.69296264648438, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4428883492946625, "geo/layer_7/attn_entropy_mean": 4.637578964233398, "geo/layer_7/attn_entropy_std": 0.7988356351852417, "geo/layer_14/stable_rank_q_proj": 50.22944641113281, "geo/layer_14/stable_rank_k_proj": 41.318992614746094, "geo/layer_14/stable_rank_o_proj": 43.54680633544922, "geo/layer_14/stable_rank_gate_proj": 70.98600769042969, "geo/layer_14/stable_rank_down_proj": 126.651123046875, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.38670802116394043, "geo/layer_14/attn_entropy_mean": 5.5491790771484375, "geo/layer_14/attn_entropy_std": 0.40231814980506897, "geo/layer_21/stable_rank_q_proj": 39.830604553222656, "geo/layer_21/stable_rank_k_proj": 30.218379974365234, "geo/layer_21/stable_rank_o_proj": 68.6014404296875, "geo/layer_21/stable_rank_gate_proj": 64.25247192382812, "geo/layer_21/stable_rank_down_proj": 49.76265335083008, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14336176216602325, "geo/layer_21/attn_entropy_mean": 5.6758713722229, "geo/layer_21/attn_entropy_std": 0.3143341839313507, "geo/layer_27/stable_rank_q_proj": 43.78744888305664, "geo/layer_27/stable_rank_k_proj": 32.27720642089844, "geo/layer_27/stable_rank_o_proj": 114.98553466796875, "geo/layer_27/stable_rank_gate_proj": 77.80420684814453, "geo/layer_27/stable_rank_down_proj": 127.51033782958984, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.093051478266716, "geo/layer_27/attn_entropy_mean": 4.156330108642578, "geo/layer_27/attn_entropy_std": 0.7693431377410889, "attnres/final_alpha/block_0": 0.23767328262329102, "attnres/block_norm/0": 1.7746236324310303, "attnres/final_alpha/block_1": 0.004046186339110136, "attnres/block_norm/1": 48589.53515625, "attnres/final_alpha/block_2": 0.009865587577223778, "attnres/block_norm/2": 29080.29296875, "attnres/final_alpha/block_3": 0.0117656160145998, "attnres/block_norm/3": 61980.78125, "attnres/final_alpha/block_4": 0.013912726193666458, "attnres/block_norm/4": 15810.916015625, "attnres/final_alpha/block_5": 0.6158370971679688, "attnres/block_norm/5": 6864.37060546875, "attnres/final_alpha/block_6": 0.10689947754144669, "attnres/block_norm/6": 41220.609375, "geo/tier1_time_s": 1.3633739948272705, "geo/step": 73125.0, "geo/rankme_slope": -0.0002329781131202481} {"step": 73130, "timestamp": 1778273589.1700888, "train/loss": 2.138561153411865, "train/z_loss": 0.001365298789460212, "train/perplexity": 8.487217035940304, "train/grad_norm": 0.08740234375, "optim/muon_lr": 0.019999993443489076, "optim/adamw_lr": 0.0005999998033046722, "perf/tokens_per_sec": 1699804.0283901375, "perf/iters_per_sec": 0.8105297223997772, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2337610483169557, "data/tokens_consumed": 153366822912, "data/tokens_consumed_B": 153.366822912, "train/loss_slope": 1.9971251158681403e-05} {"step": 73140, "timestamp": 1778273599.523007, "train/loss": 2.142548990249634, "train/z_loss": 0.0013560212100856006, "train/perplexity": 8.521130247921352, "train/grad_norm": 0.16796875, "optim/muon_lr": 0.019999873638153077, "optim/adamw_lr": 0.0005999962091445923, "perf/tokens_per_sec": 2026690.3780039416, "perf/iters_per_sec": 0.9664012804050167, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347668409347535, "data/tokens_consumed": 153387794432, "data/tokens_consumed_B": 153.387794432, "train/loss_slope": 2.3498793511000137e-05} {"step": 73150, "timestamp": 1778273609.861234, "grad/layer_0/attn": 0.003008409636095166, "grad/layer_0/mlp": 0.0031361228320747614, "grad/layer_0/attn_mlp_ratio": 0.9592766933102638, "grad/layer_4/attn": 0.004122623708099127, "grad/layer_4/mlp": 0.0025054686702787876, "grad/layer_4/attn_mlp_ratio": 1.6454500479127487, "grad/layer_8/attn": 0.006653769873082638, "grad/layer_8/mlp": 0.0035537765361368656, "grad/layer_8/attn_mlp_ratio": 1.872309532744191, "grad/layer_12/attn": 0.007952656596899033, "grad/layer_12/mlp": 0.006539438385516405, "grad/layer_12/attn_mlp_ratio": 1.2161069508509994, "grad/layer_16/attn": 0.004440108314156532, "grad/layer_16/mlp": 0.004618323408067226, "grad/layer_16/attn_mlp_ratio": 0.961411279742662, "grad/layer_20/attn": 0.0031475634314119816, "grad/layer_20/mlp": 0.005671759136021137, "grad/layer_20/attn_mlp_ratio": 0.5549536396788356, "grad/layer_24/attn": 0.00813367310911417, "grad/layer_24/mlp": 0.007932605221867561, "grad/layer_24/attn_mlp_ratio": 1.0253470050617963, "grad/layer_27/attn": 0.006330512464046478, "grad/layer_27/mlp": 0.008138046599924564, "grad/layer_27/attn_mlp_ratio": 0.7778908990661308} {"step": 73150, "timestamp": 1778273609.877033, "train/loss": 2.1179012775421144, "train/z_loss": 0.001374611514620483, "train/perplexity": 8.313671077327488, "train/grad_norm": 0.12158203125, "optim/muon_lr": 0.01999960482120514, "optim/adamw_lr": 0.0005999881446361542, "perf/tokens_per_sec": 2026229.167552499, "perf/iters_per_sec": 0.9661813581240173, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350023746490478, "data/tokens_consumed": 153408765952, "data/tokens_consumed_B": 153.408765952, "train/loss_slope": 2.058716182268102e-05} {"step": 73160, "timestamp": 1778273620.2332308, "train/loss": 2.123559367656708, "train/z_loss": 0.0013758959597907961, "train/perplexity": 8.360843905673855, "train/grad_norm": 0.1640625, "optim/muon_lr": 0.019999186396598815, "optim/adamw_lr": 0.0005999755918979644, "perf/tokens_per_sec": 2026198.5490047154, "perf/iters_per_sec": 0.9661667580627038, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035018014907837, "data/tokens_consumed": 153429737472, "data/tokens_consumed_B": 153.429737472, "train/loss_slope": 1.994617566166793e-05} {"step": 73170, "timestamp": 1778273630.595896, "train/loss": 2.148826372623444, "train/z_loss": 0.001369463128503412, "train/perplexity": 8.574788882424013, "train/grad_norm": 0.09130859375, "optim/muon_lr": 0.019998618364334107, "optim/adamw_lr": 0.0005999585509300231, "perf/tokens_per_sec": 2025058.3406111647, "perf/iters_per_sec": 0.9656230643325637, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0356007814407349, "data/tokens_consumed": 153450708992, "data/tokens_consumed_B": 153.450708992, "train/loss_slope": 1.8752303825925552e-05} {"step": 73180, "timestamp": 1778273640.9501827, "train/loss": 2.106614649295807, "train/z_loss": 0.0013704963494092226, "train/perplexity": 8.220365306788944, "train/grad_norm": 0.1240234375, "optim/muon_lr": 0.019997900724411013, "optim/adamw_lr": 0.0005999370217323303, "perf/tokens_per_sec": 2026387.876073547, "perf/iters_per_sec": 0.9662570362441764, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349213123321532, "data/tokens_consumed": 153471680512, "data/tokens_consumed_B": 153.471680512, "train/loss_slope": 1.3907158070772328e-05} {"step": 73190, "timestamp": 1778273651.3173573, "train/loss": 2.0888264894485475, "train/z_loss": 0.0013840776751749217, "train/perplexity": 8.075432995852694, "train/grad_norm": 0.1826171875, "optim/muon_lr": 0.019997034072875976, "optim/adamw_lr": 0.0005999110221862792, "perf/tokens_per_sec": 2023896.6893597173, "perf/iters_per_sec": 0.9650691458510005, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0361951828002929, "data/tokens_consumed": 153492652032, "data/tokens_consumed_B": 153.492652032, "train/loss_slope": 1.2521362161622099e-05} {"step": 73200, "timestamp": 1778273662.200895, "grad/layer_0/attn": 0.004095219075679779, "grad/layer_0/mlp": 0.004470391198992729, "grad/layer_0/attn_mlp_ratio": 0.9160762004441347, "grad/layer_4/attn": 0.0032861928921192884, "grad/layer_4/mlp": 0.003080095862969756, "grad/layer_4/attn_mlp_ratio": 1.066912502606191, "grad/layer_8/attn": 0.01414242573082447, "grad/layer_8/mlp": 0.00473713967949152, "grad/layer_8/attn_mlp_ratio": 2.98543559809048, "grad/layer_12/attn": 0.006337664555758238, "grad/layer_12/mlp": 0.007426628842949867, "grad/layer_12/attn_mlp_ratio": 0.8533702982124363, "grad/layer_16/attn": 0.004094318952411413, "grad/layer_16/mlp": 0.0056413509882986546, "grad/layer_16/attn_mlp_ratio": 0.7257692152689964, "grad/layer_20/attn": 0.007127926219254732, "grad/layer_20/mlp": 0.0072400979697704315, "grad/layer_20/attn_mlp_ratio": 0.9845068603443301, "grad/layer_24/attn": 0.020863225683569908, "grad/layer_24/mlp": 0.013797011226415634, "grad/layer_24/attn_mlp_ratio": 1.5121554364187093, "grad/layer_27/attn": 0.008723258040845394, "grad/layer_27/mlp": 0.01376140583306551, "grad/layer_27/attn_mlp_ratio": 0.6338929382124686} {"step": 73200, "timestamp": 1778273662.8121378, "eos/sharpness": 82.95423984527586, "eos/L0_probe": 1.9680626392364502, "eos/L_plus": 2.3519253730773926, "eos/L_minus": 2.4137423038482666, "eos/grad_norm": 0.279628187417984, "eos/embed_grad_frac": 0.0314084067940712, "eos/time_s": 0.6083438396453857} {"step": 73200, "timestamp": 1778273662.8321493, "train/loss": 2.107186770439148, "train/z_loss": 0.0013864594744518398, "train/perplexity": 8.225069697199235, "train/grad_norm": 0.279296875, "optim/muon_lr": 0.019996017813682557, "optim/adamw_lr": 0.0005998805344104766, "perf/tokens_per_sec": 1822670.163701858, "perf/iters_per_sec": 0.8691168612012186, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1505932569503785, "data/tokens_consumed": 153513623552, "data/tokens_consumed_B": 153.513623552, "train/loss_slope": 9.510449922517629e-06} {"step": 73200, "timestamp": 1778273664.194852, "geo/rankme_last": 439.0625915527344, "geo/layer_0/stable_rank_q_proj": 19.146093368530273, "geo/layer_0/stable_rank_k_proj": 15.788774490356445, "geo/layer_0/stable_rank_o_proj": 46.78985595703125, "geo/layer_0/stable_rank_gate_proj": 129.69471740722656, "geo/layer_0/stable_rank_down_proj": 56.297447204589844, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06536417454481125, "geo/layer_0/attn_entropy_mean": 6.148723125457764, "geo/layer_0/attn_entropy_std": 0.4290066063404083, "geo/layer_7/stable_rank_q_proj": 42.67912292480469, "geo/layer_7/stable_rank_k_proj": 40.11133575439453, "geo/layer_7/stable_rank_o_proj": 88.37405395507812, "geo/layer_7/stable_rank_gate_proj": 77.81956481933594, "geo/layer_7/stable_rank_down_proj": 139.80125427246094, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4332164227962494, "geo/layer_7/attn_entropy_mean": 4.618264675140381, "geo/layer_7/attn_entropy_std": 0.805091142654419, "geo/layer_14/stable_rank_q_proj": 50.18489456176758, "geo/layer_14/stable_rank_k_proj": 41.349613189697266, "geo/layer_14/stable_rank_o_proj": 43.53679275512695, "geo/layer_14/stable_rank_gate_proj": 70.88646697998047, "geo/layer_14/stable_rank_down_proj": 126.95889282226562, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4030968248844147, "geo/layer_14/attn_entropy_mean": 5.558736801147461, "geo/layer_14/attn_entropy_std": 0.4075203239917755, "geo/layer_21/stable_rank_q_proj": 39.805545806884766, "geo/layer_21/stable_rank_k_proj": 30.137638092041016, "geo/layer_21/stable_rank_o_proj": 68.6019058227539, "geo/layer_21/stable_rank_gate_proj": 64.27303314208984, "geo/layer_21/stable_rank_down_proj": 49.72368240356445, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13815952837467194, "geo/layer_21/attn_entropy_mean": 5.670413970947266, "geo/layer_21/attn_entropy_std": 0.30374160408973694, "geo/layer_27/stable_rank_q_proj": 43.77507400512695, "geo/layer_27/stable_rank_k_proj": 32.27874755859375, "geo/layer_27/stable_rank_o_proj": 115.1208724975586, "geo/layer_27/stable_rank_gate_proj": 77.80963134765625, "geo/layer_27/stable_rank_down_proj": 127.48106384277344, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0937257707118988, "geo/layer_27/attn_entropy_mean": 4.164742469787598, "geo/layer_27/attn_entropy_std": 0.762365460395813, "attnres/final_alpha/block_0": 0.2403004765510559, "attnres/block_norm/0": 1.7748422622680664, "attnres/final_alpha/block_1": 0.004046895541250706, "attnres/block_norm/1": 48360.015625, "attnres/final_alpha/block_2": 0.010155852884054184, "attnres/block_norm/2": 28927.59765625, "attnres/final_alpha/block_3": 0.012075258418917656, "attnres/block_norm/3": 61586.90234375, "attnres/final_alpha/block_4": 0.014248008839786053, "attnres/block_norm/4": 15806.0966796875, "attnres/final_alpha/block_5": 0.6121645569801331, "attnres/block_norm/5": 6888.0458984375, "attnres/final_alpha/block_6": 0.10700896382331848, "attnres/block_norm/6": 41053.5, "geo/tier1_time_s": 1.3582117557525635, "geo/step": 73200.0, "geo/rankme_slope": -0.00020689535579856942} {"step": 73210, "timestamp": 1778273674.5430522, "train/loss": 2.1624619245529173, "train/z_loss": 0.0013693980290554464, "train/perplexity": 8.692511644941831, "train/grad_norm": 0.119140625, "optim/muon_lr": 0.01999485194683075, "optim/adamw_lr": 0.0005998455584049225, "perf/tokens_per_sec": 1791365.8468650524, "perf/iters_per_sec": 0.854189799721266, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1707000017166138, "data/tokens_consumed": 153534595072, "data/tokens_consumed_B": 153.534595072, "train/loss_slope": 8.57002840291051e-06} {"step": 73220, "timestamp": 1778273684.9251878, "train/loss": 2.1014439344406126, "train/z_loss": 0.0013777171610854567, "train/perplexity": 8.17796984366557, "train/grad_norm": 0.11474609375, "optim/muon_lr": 0.019993537068367005, "optim/adamw_lr": 0.0005998061120510101, "perf/tokens_per_sec": 2021284.2524797237, "perf/iters_per_sec": 0.9638234388731592, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0375344276428222, "data/tokens_consumed": 153555566592, "data/tokens_consumed_B": 153.555566592, "train/loss_slope": 1.7847565033755743e-06} {"step": 73230, "timestamp": 1778273695.3073626, "train/loss": 2.1579888820648194, "train/z_loss": 0.001357970852404833, "train/perplexity": 8.653716501872129, "train/grad_norm": 0.09521484375, "optim/muon_lr": 0.019992072582244873, "optim/adamw_lr": 0.0005997621774673462, "perf/tokens_per_sec": 2021281.1869308227, "perf/iters_per_sec": 0.963821977105533, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0375360012054444, "data/tokens_consumed": 153576538112, "data/tokens_consumed_B": 153.576538112, "train/loss_slope": 8.556345579969947e-07} {"step": 73240, "timestamp": 1778273705.6967523, "train/loss": 2.1227381944656374, "train/z_loss": 0.0013658264302648603, "train/perplexity": 8.353981022997075, "train/grad_norm": 0.39453125, "optim/muon_lr": 0.019990458488464355, "optim/adamw_lr": 0.0005997137546539306, "perf/tokens_per_sec": 2019528.1490359278, "perf/iters_per_sec": 0.9629860634975089, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0384366273880006, "data/tokens_consumed": 153597509632, "data/tokens_consumed_B": 153.597509632, "train/loss_slope": 1.0166895700726875e-07} {"step": 73250, "timestamp": 1778273716.0709183, "grad/layer_0/attn": 0.0026929504238069057, "grad/layer_0/mlp": 0.002928011817857623, "grad/layer_0/attn_mlp_ratio": 0.9197197618571484, "grad/layer_4/attn": 0.001929135643877089, "grad/layer_4/mlp": 0.002420443342998624, "grad/layer_4/attn_mlp_ratio": 0.7970174430050444, "grad/layer_8/attn": 0.007286901120096445, "grad/layer_8/mlp": 0.0037486914079636335, "grad/layer_8/attn_mlp_ratio": 1.9438519026215737, "grad/layer_12/attn": 0.004613360855728388, "grad/layer_12/mlp": 0.007149871438741684, "grad/layer_12/attn_mlp_ratio": 0.6452368872266899, "grad/layer_16/attn": 0.007084902375936508, "grad/layer_16/mlp": 0.004596156999468803, "grad/layer_16/attn_mlp_ratio": 1.5414839446535324, "grad/layer_20/attn": 0.004133804235607386, "grad/layer_20/mlp": 0.006462009157985449, "grad/layer_20/attn_mlp_ratio": 0.6397088073649885, "grad/layer_24/attn": 0.014409157447516918, "grad/layer_24/mlp": 0.009888880886137486, "grad/layer_24/attn_mlp_ratio": 1.4571069737532574, "grad/layer_27/attn": 0.009586991742253304, "grad/layer_27/mlp": 0.009463767521083355, "grad/layer_27/attn_mlp_ratio": 1.0130206199162615} {"step": 73250, "timestamp": 1778273716.0866501, "train/loss": 2.1629783153533935, "train/z_loss": 0.0013735580258071422, "train/perplexity": 8.697001537158057, "train/grad_norm": 0.1787109375, "optim/muon_lr": 0.0199886953830719, "optim/adamw_lr": 0.0005996608614921569, "perf/tokens_per_sec": 2019814.1354834558, "perf/iters_per_sec": 0.9631224324624328, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0382895946502686, "data/tokens_consumed": 153618481152, "data/tokens_consumed_B": 153.618481152, "train/loss_slope": 1.2224601404537532e-06} {"step": 73260, "timestamp": 1778273726.4649036, "train/loss": 2.0999250173568726, "train/z_loss": 0.001378996076527983, "train/perplexity": 8.165557614519528, "train/grad_norm": 0.1826171875, "optim/muon_lr": 0.019986783266067506, "optim/adamw_lr": 0.0005996034979820251, "perf/tokens_per_sec": 2022149.201447564, "perf/iters_per_sec": 0.9642358786809749, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0370906352996827, "data/tokens_consumed": 153639452672, "data/tokens_consumed_B": 153.639452672, "train/loss_slope": 1.2658382585161146e-06} {"step": 73270, "timestamp": 1778273736.8437545, "train/loss": 2.116494393348694, "train/z_loss": 0.0013706534053198993, "train/perplexity": 8.301982928763117, "train/grad_norm": 0.171875, "optim/muon_lr": 0.019984721541404724, "optim/adamw_lr": 0.0005995416462421417, "perf/tokens_per_sec": 2021568.2276561444, "perf/iters_per_sec": 0.963958848789284, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373886823654175, "data/tokens_consumed": 153660424192, "data/tokens_consumed_B": 153.660424192, "train/loss_slope": -9.562465188168994e-07} {"step": 73275, "timestamp": 1778273742.6376345, "eos/sharpness": 16.210985183715817, "eos/L0_probe": 1.9673595428466797, "eos/L_plus": 2.0530946254730225, "eos/L_minus": 2.043734312057495, "eos/grad_norm": 0.10659999400377274, "eos/embed_grad_frac": 0.1760333627462387, "eos/time_s": 0.6154141426086426} {"step": 73275, "timestamp": 1778273744.0225992, "geo/rankme_last": 438.4639892578125, "geo/layer_0/stable_rank_q_proj": 19.168758392333984, "geo/layer_0/stable_rank_k_proj": 15.834327697753906, "geo/layer_0/stable_rank_o_proj": 46.75770568847656, "geo/layer_0/stable_rank_gate_proj": 129.57058715820312, "geo/layer_0/stable_rank_down_proj": 56.33570098876953, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06561963260173798, "geo/layer_0/attn_entropy_mean": 6.158141136169434, "geo/layer_0/attn_entropy_std": 0.429120272397995, "geo/layer_7/stable_rank_q_proj": 42.7027473449707, "geo/layer_7/stable_rank_k_proj": 40.14100646972656, "geo/layer_7/stable_rank_o_proj": 88.31651306152344, "geo/layer_7/stable_rank_gate_proj": 77.76194763183594, "geo/layer_7/stable_rank_down_proj": 139.8785400390625, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4471893310546875, "geo/layer_7/attn_entropy_mean": 4.622861862182617, "geo/layer_7/attn_entropy_std": 0.7935523390769958, "geo/layer_14/stable_rank_q_proj": 50.24953842163086, "geo/layer_14/stable_rank_k_proj": 41.41825866699219, "geo/layer_14/stable_rank_o_proj": 43.551063537597656, "geo/layer_14/stable_rank_gate_proj": 70.96269226074219, "geo/layer_14/stable_rank_down_proj": 127.14603424072266, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.40449419617652893, "geo/layer_14/attn_entropy_mean": 5.540960311889648, "geo/layer_14/attn_entropy_std": 0.4111047685146332, "geo/layer_21/stable_rank_q_proj": 39.76578140258789, "geo/layer_21/stable_rank_k_proj": 30.146730422973633, "geo/layer_21/stable_rank_o_proj": 68.53719329833984, "geo/layer_21/stable_rank_gate_proj": 64.33159637451172, "geo/layer_21/stable_rank_down_proj": 49.649261474609375, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13692069053649902, "geo/layer_21/attn_entropy_mean": 5.687446594238281, "geo/layer_21/attn_entropy_std": 0.29509660601615906, "geo/layer_27/stable_rank_q_proj": 43.77330017089844, "geo/layer_27/stable_rank_k_proj": 32.263275146484375, "geo/layer_27/stable_rank_o_proj": 115.49861145019531, "geo/layer_27/stable_rank_gate_proj": 77.86713409423828, "geo/layer_27/stable_rank_down_proj": 127.48722839355469, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.097341388463974, "geo/layer_27/attn_entropy_mean": 4.151875972747803, "geo/layer_27/attn_entropy_std": 0.7536541819572449, "attnres/final_alpha/block_0": 0.2368958294391632, "attnres/block_norm/0": 1.7746230363845825, "attnres/final_alpha/block_1": 0.004042693413794041, "attnres/block_norm/1": 48447.2578125, "attnres/final_alpha/block_2": 0.009952187538146973, "attnres/block_norm/2": 29166.19140625, "attnres/final_alpha/block_3": 0.011942434124648571, "attnres/block_norm/3": 61880.8515625, "attnres/final_alpha/block_4": 0.013871806673705578, "attnres/block_norm/4": 15809.822265625, "attnres/final_alpha/block_5": 0.6174911260604858, "attnres/block_norm/5": 6877.65234375, "attnres/final_alpha/block_6": 0.105803944170475, "attnres/block_norm/6": 41030.109375, "geo/tier1_time_s": 1.363440752029419, "geo/step": 73275.0, "geo/rankme_slope": -0.00021493792829631854} {"step": 73280, "timestamp": 1778273749.213567, "train/loss": 2.122137188911438, "train/z_loss": 0.0013635292882099748, "train/perplexity": 8.348961742461402, "train/grad_norm": 0.11181640625, "optim/muon_lr": 0.019982510805130006, "optim/adamw_lr": 0.0005994753241539, "perf/tokens_per_sec": 1696268.5845525023, "perf/iters_per_sec": 0.8088438914072524, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2363325119018556, "data/tokens_consumed": 153681395712, "data/tokens_consumed_B": 153.681395712, "train/loss_slope": -6.9759656386514106e-06} {"step": 73290, "timestamp": 1778273759.5903604, "train/loss": 2.162972164154053, "train/z_loss": 0.0013538307859562338, "train/perplexity": 8.696948040332472, "train/grad_norm": 0.09423828125, "optim/muon_lr": 0.0199801504611969, "optim/adamw_lr": 0.000599404513835907, "perf/tokens_per_sec": 2021913.9553705603, "perf/iters_per_sec": 0.9641237046101381, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037211298942566, "data/tokens_consumed": 153702367232, "data/tokens_consumed_B": 153.702367232, "train/loss_slope": -3.888992650924478e-06} {"step": 73300, "timestamp": 1778273769.9268029, "grad/layer_0/attn": 0.003165220143273473, "grad/layer_0/mlp": 0.0030874365475028753, "grad/layer_0/attn_mlp_ratio": 1.0251935520145843, "grad/layer_4/attn": 0.0026124268770217896, "grad/layer_4/mlp": 0.00256322487257421, "grad/layer_4/attn_mlp_ratio": 1.0191953125355855, "grad/layer_8/attn": 0.004911665339022875, "grad/layer_8/mlp": 0.0037424741312861443, "grad/layer_8/attn_mlp_ratio": 1.312411264708942, "grad/layer_12/attn": 0.004687111359089613, "grad/layer_12/mlp": 0.007484324276447296, "grad/layer_12/attn_mlp_ratio": 0.6262571106404289, "grad/layer_16/attn": 0.003474290482699871, "grad/layer_16/mlp": 0.004747924394905567, "grad/layer_16/attn_mlp_ratio": 0.7317493120262799, "grad/layer_20/attn": 0.004265728872269392, "grad/layer_20/mlp": 0.006142000667750835, "grad/layer_20/attn_mlp_ratio": 0.6945178018646646, "grad/layer_24/attn": 0.011859633959829807, "grad/layer_24/mlp": 0.009490029886364937, "grad/layer_24/attn_mlp_ratio": 1.2496940448944276, "grad/layer_27/attn": 0.005137298256158829, "grad/layer_27/mlp": 0.0073389168828725815, "grad/layer_27/attn_mlp_ratio": 0.7000076807174892} {"step": 73300, "timestamp": 1778273769.9424853, "train/loss": 2.146446704864502, "train/z_loss": 0.0013876792858354748, "train/perplexity": 8.554407993270956, "train/grad_norm": 0.12109375, "optim/muon_lr": 0.019977641105651856, "optim/adamw_lr": 0.0005993292331695556, "perf/tokens_per_sec": 2026693.6934659588, "perf/iters_per_sec": 0.9664028613405031, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347651481628417, "data/tokens_consumed": 153723338752, "data/tokens_consumed_B": 153.723338752, "train/loss_slope": -5.776415112996426e-06} {"step": 73310, "timestamp": 1778273780.2947872, "train/loss": 2.1349493741989134, "train/z_loss": 0.0013737965491600335, "train/perplexity": 8.456618372944721, "train/grad_norm": 0.07861328125, "optim/muon_lr": 0.019974982738494875, "optim/adamw_lr": 0.0005992494821548462, "perf/tokens_per_sec": 2027052.948341761, "perf/iters_per_sec": 0.9665741674145513, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034581756591797, "data/tokens_consumed": 153744310272, "data/tokens_consumed_B": 153.744310272, "train/loss_slope": -8.860636894816637e-06} {"step": 73320, "timestamp": 1778273790.6555517, "train/loss": 2.1285463333129884, "train/z_loss": 0.0013662175857461988, "train/perplexity": 8.40264328649871, "train/grad_norm": 0.09375, "optim/muon_lr": 0.019972175359725952, "optim/adamw_lr": 0.0005991652607917785, "perf/tokens_per_sec": 2025304.1584419487, "perf/iters_per_sec": 0.9657402794084304, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354750871658325, "data/tokens_consumed": 153765281792, "data/tokens_consumed_B": 153.765281792, "train/loss_slope": -1.1452609549189613e-05} {"step": 73330, "timestamp": 1778273801.0100899, "train/loss": 2.109543228149414, "train/z_loss": 0.001368863950483501, "train/perplexity": 8.244474580518695, "train/grad_norm": 0.359375, "optim/muon_lr": 0.019969218969345094, "optim/adamw_lr": 0.0005990765690803528, "perf/tokens_per_sec": 2026440.9089325336, "perf/iters_per_sec": 0.9662823242819469, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348942279815674, "data/tokens_consumed": 153786253312, "data/tokens_consumed_B": 153.786253312, "train/loss_slope": -1.1021718562561342e-05} {"step": 73340, "timestamp": 1778273811.3638442, "train/loss": 2.1138279914855955, "train/z_loss": 0.0013708279351703823, "train/perplexity": 8.279875992100807, "train/grad_norm": 0.1337890625, "optim/muon_lr": 0.019966112971305846, "optim/adamw_lr": 0.0005989833891391754, "perf/tokens_per_sec": 2026722.13217024, "perf/iters_per_sec": 0.9664164219714355, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347506284713746, "data/tokens_consumed": 153807224832, "data/tokens_consumed_B": 153.807224832, "train/loss_slope": -1.4038760040459443e-05} {"step": 73350, "timestamp": 1778273821.7016506, "grad/layer_0/attn": 0.0036221297923475504, "grad/layer_0/mlp": 0.003648326965048909, "grad/layer_0/attn_mlp_ratio": 0.99281937385704, "grad/layer_4/attn": 0.0035846673417836428, "grad/layer_4/mlp": 0.0025890569668263197, "grad/layer_4/attn_mlp_ratio": 1.3845455118444907, "grad/layer_8/attn": 0.004034542012959719, "grad/layer_8/mlp": 0.003583901561796665, "grad/layer_8/attn_mlp_ratio": 1.1257401551964292, "grad/layer_12/attn": 0.005334784742444754, "grad/layer_12/mlp": 0.007115655113011599, "grad/layer_12/attn_mlp_ratio": 0.7497250193755359, "grad/layer_16/attn": 0.003762135747820139, "grad/layer_16/mlp": 0.004437353927642107, "grad/layer_16/attn_mlp_ratio": 0.8478331285681163, "grad/layer_20/attn": 0.004842731636017561, "grad/layer_20/mlp": 0.005437652114778757, "grad/layer_20/attn_mlp_ratio": 0.8905923815531477, "grad/layer_24/attn": 0.007894227281212807, "grad/layer_24/mlp": 0.01004749909043312, "grad/layer_24/attn_mlp_ratio": 0.7856907606152799, "grad/layer_27/attn": 0.006622821092605591, "grad/layer_27/mlp": 0.007797887548804283, "grad/layer_27/attn_mlp_ratio": 0.8493096324132248} {"step": 73350, "timestamp": 1778273822.3045456, "eos/sharpness": 51.63218975067138, "eos/L0_probe": 1.9657282829284668, "eos/L_plus": 2.2050325870513916, "eos/L_minus": 2.242745876312256, "eos/grad_norm": 0.12346091866493225, "eos/embed_grad_frac": 0.16410985589027405, "eos/time_s": 0.600064754486084} {"step": 73350, "timestamp": 1778273822.3236814, "train/loss": 2.1663917541503905, "train/z_loss": 0.0013682329910807312, "train/perplexity": 8.726738944157987, "train/grad_norm": 0.12353515625, "optim/muon_lr": 0.01996285855770111, "optim/adamw_lr": 0.0005988857567310333, "perf/tokens_per_sec": 1914680.3060164861, "perf/iters_per_sec": 0.912990715988391, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0953013896942139, "data/tokens_consumed": 153828196352, "data/tokens_consumed_B": 153.828196352, "train/loss_slope": -9.526416492147414e-06} {"step": 73350, "timestamp": 1778273823.685681, "geo/rankme_last": 438.769775390625, "geo/layer_0/stable_rank_q_proj": 19.201292037963867, "geo/layer_0/stable_rank_k_proj": 15.822846412658691, "geo/layer_0/stable_rank_o_proj": 46.738037109375, "geo/layer_0/stable_rank_gate_proj": 129.418212890625, "geo/layer_0/stable_rank_down_proj": 56.34550094604492, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.07019837945699692, "geo/layer_0/attn_entropy_mean": 6.163205146789551, "geo/layer_0/attn_entropy_std": 0.42892730236053467, "geo/layer_7/stable_rank_q_proj": 42.65866470336914, "geo/layer_7/stable_rank_k_proj": 40.03328323364258, "geo/layer_7/stable_rank_o_proj": 88.36957550048828, "geo/layer_7/stable_rank_gate_proj": 77.87508392333984, "geo/layer_7/stable_rank_down_proj": 140.05169677734375, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4473031759262085, "geo/layer_7/attn_entropy_mean": 4.63798713684082, "geo/layer_7/attn_entropy_std": 0.8056567907333374, "geo/layer_14/stable_rank_q_proj": 50.28376007080078, "geo/layer_14/stable_rank_k_proj": 41.427452087402344, "geo/layer_14/stable_rank_o_proj": 43.49245834350586, "geo/layer_14/stable_rank_gate_proj": 70.84159851074219, "geo/layer_14/stable_rank_down_proj": 126.96414184570312, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39706528186798096, "geo/layer_14/attn_entropy_mean": 5.5620880126953125, "geo/layer_14/attn_entropy_std": 0.4043724238872528, "geo/layer_21/stable_rank_q_proj": 39.79869842529297, "geo/layer_21/stable_rank_k_proj": 30.084110260009766, "geo/layer_21/stable_rank_o_proj": 68.35824584960938, "geo/layer_21/stable_rank_gate_proj": 64.3418197631836, "geo/layer_21/stable_rank_down_proj": 49.615447998046875, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14322298765182495, "geo/layer_21/attn_entropy_mean": 5.672209739685059, "geo/layer_21/attn_entropy_std": 0.2992274761199951, "geo/layer_27/stable_rank_q_proj": 43.743412017822266, "geo/layer_27/stable_rank_k_proj": 32.22770309448242, "geo/layer_27/stable_rank_o_proj": 115.47901153564453, "geo/layer_27/stable_rank_gate_proj": 77.85479736328125, "geo/layer_27/stable_rank_down_proj": 127.62212371826172, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09815208613872528, "geo/layer_27/attn_entropy_mean": 4.173957824707031, "geo/layer_27/attn_entropy_std": 0.7563263773918152, "attnres/final_alpha/block_0": 0.23690727353096008, "attnres/block_norm/0": 1.774793267250061, "attnres/final_alpha/block_1": 0.004042183980345726, "attnres/block_norm/1": 48414.20703125, "attnres/final_alpha/block_2": 0.010135192424058914, "attnres/block_norm/2": 29107.15625, "attnres/final_alpha/block_3": 0.012049080803990364, "attnres/block_norm/3": 61939.40625, "attnres/final_alpha/block_4": 0.014104362577199936, "attnres/block_norm/4": 15785.8515625, "attnres/final_alpha/block_5": 0.614726185798645, "attnres/block_norm/5": 6879.1630859375, "attnres/final_alpha/block_6": 0.10803568363189697, "attnres/block_norm/6": 41216.8125, "geo/tier1_time_s": 1.3581113815307617, "geo/step": 73350.0, "geo/rankme_slope": -0.00018218756252501} {"step": 73360, "timestamp": 1778273834.0413158, "train/loss": 2.1611186861991882, "train/z_loss": 0.0013736114604398608, "train/perplexity": 8.680843368302874, "train/grad_norm": 0.095703125, "optim/muon_lr": 0.019959455132484438, "optim/adamw_lr": 0.000598783653974533, "perf/tokens_per_sec": 1790315.7444083043, "perf/iters_per_sec": 0.8536890718499681, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1713866710662841, "data/tokens_consumed": 153849167872, "data/tokens_consumed_B": 153.849167872, "train/loss_slope": -8.819115501676218e-06} {"step": 73370, "timestamp": 1778273844.8012884, "train/loss": 2.188827395439148, "train/z_loss": 0.0013557644677348434, "train/perplexity": 8.924741783613404, "train/grad_norm": 0.16796875, "optim/muon_lr": 0.01995590329170227, "optim/adamw_lr": 0.000598677098751068, "perf/tokens_per_sec": 1950315.6839926008, "perf/iters_per_sec": 0.9299829883540157, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0752884864807128, "data/tokens_consumed": 153870139392, "data/tokens_consumed_B": 153.870139392, "train/loss_slope": -7.0784623580689986e-06} {"step": 73380, "timestamp": 1778273855.6769571, "train/loss": 2.1157896757125854, "train/z_loss": 0.0013666247949004173, "train/perplexity": 8.29613443598859, "train/grad_norm": 0.13671875, "optim/muon_lr": 0.01995220184326172, "optim/adamw_lr": 0.0005985660552978515, "perf/tokens_per_sec": 1929762.7371731973, "perf/iters_per_sec": 0.9201825795999514, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0867408514022827, "data/tokens_consumed": 153891110912, "data/tokens_consumed_B": 153.891110912, "train/loss_slope": -8.987953982623617e-06} {"step": 73390, "timestamp": 1778273866.0259452, "train/loss": 2.127933084964752, "train/z_loss": 0.0013729409547522664, "train/perplexity": 8.397491959065407, "train/grad_norm": 0.1669921875, "optim/muon_lr": 0.019948351979255676, "optim/adamw_lr": 0.0005984505593776702, "perf/tokens_per_sec": 2027772.4012785435, "perf/iters_per_sec": 0.9669172293083875, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034214687347412, "data/tokens_consumed": 153912082432, "data/tokens_consumed_B": 153.912082432, "train/loss_slope": -1.0426210813467975e-05} {"step": 73400, "timestamp": 1778273876.3622618, "grad/layer_0/attn": 0.003576900577172637, "grad/layer_0/mlp": 0.0035806382074952126, "grad/layer_0/attn_mlp_ratio": 0.9989561273712703, "grad/layer_4/attn": 0.002248835051432252, "grad/layer_4/mlp": 0.0027631144039332867, "grad/layer_4/attn_mlp_ratio": 0.8138768944359854, "grad/layer_8/attn": 0.00700982753187418, "grad/layer_8/mlp": 0.00387841509655118, "grad/layer_8/attn_mlp_ratio": 1.8073948189218008, "grad/layer_12/attn": 0.004033220931887627, "grad/layer_12/mlp": 0.006780598778277636, "grad/layer_12/attn_mlp_ratio": 0.5948178036026399, "grad/layer_16/attn": 0.0045616808347404, "grad/layer_16/mlp": 0.004798481706529856, "grad/layer_16/attn_mlp_ratio": 0.9506508555545189, "grad/layer_20/attn": 0.0030643462669104338, "grad/layer_20/mlp": 0.005375283770263195, "grad/layer_20/attn_mlp_ratio": 0.5700808256588673, "grad/layer_24/attn": 0.011745361611247063, "grad/layer_24/mlp": 0.00867998506873846, "grad/layer_24/attn_mlp_ratio": 1.3531545714558082, "grad/layer_27/attn": 0.004744038451462984, "grad/layer_27/mlp": 0.008283101953566074, "grad/layer_27/attn_mlp_ratio": 0.5727369312588104} {"step": 73400, "timestamp": 1778273876.377815, "train/loss": 2.1633586406707765, "train/z_loss": 0.0013685737503692508, "train/perplexity": 8.700309856106806, "train/grad_norm": 0.14453125, "optim/muon_lr": 0.019944353699684142, "optim/adamw_lr": 0.0005983306109905242, "perf/tokens_per_sec": 2027128.0660812794, "perf/iters_per_sec": 0.9666099863439939, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345434188842773, "data/tokens_consumed": 153933053952, "data/tokens_consumed_B": 153.933053952, "train/loss_slope": -8.831396562145378e-06} {"step": 73410, "timestamp": 1778273886.7239501, "train/loss": 2.105385994911194, "train/z_loss": 0.0013880272745154798, "train/perplexity": 8.210271521068469, "train/grad_norm": 0.09765625, "optim/muon_lr": 0.019940206408500673, "optim/adamw_lr": 0.0005982061922550201, "perf/tokens_per_sec": 2027995.3128368717, "perf/iters_per_sec": 0.9670235218223914, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341010093688965, "data/tokens_consumed": 153954025472, "data/tokens_consumed_B": 153.954025472, "train/loss_slope": -1.2270146735323237e-05} {"step": 73420, "timestamp": 1778273897.0764327, "train/loss": 2.1537644624710084, "train/z_loss": 0.0013699041330255568, "train/perplexity": 8.617236679611015, "train/grad_norm": 0.09912109375, "optim/muon_lr": 0.01993591070175171, "optim/adamw_lr": 0.0005980773210525512, "perf/tokens_per_sec": 2026988.906456811, "perf/iters_per_sec": 0.9665436298641257, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346144437789917, "data/tokens_consumed": 153974996992, "data/tokens_consumed_B": 153.974996992, "train/loss_slope": -1.0351309789182018e-05} {"step": 73425, "timestamp": 1778273902.8424118, "eos/sharpness": 55.00102043151854, "eos/L0_probe": 1.967089295387268, "eos/L_plus": 2.296462297439575, "eos/L_minus": 2.1877264976501465, "eos/grad_norm": 0.1191047877073288, "eos/embed_grad_frac": 0.174613818526268, "eos/time_s": 0.6014866828918457} {"step": 73425, "timestamp": 1778273904.2236652, "geo/rankme_last": 438.8827819824219, "geo/layer_0/stable_rank_q_proj": 19.200937271118164, "geo/layer_0/stable_rank_k_proj": 15.783513069152832, "geo/layer_0/stable_rank_o_proj": 46.69966506958008, "geo/layer_0/stable_rank_gate_proj": 129.49862670898438, "geo/layer_0/stable_rank_down_proj": 56.27934646606445, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06099351495504379, "geo/layer_0/attn_entropy_mean": 6.1563615798950195, "geo/layer_0/attn_entropy_std": 0.42474332451820374, "geo/layer_7/stable_rank_q_proj": 42.66184997558594, "geo/layer_7/stable_rank_k_proj": 39.98164749145508, "geo/layer_7/stable_rank_o_proj": 88.3412857055664, "geo/layer_7/stable_rank_gate_proj": 77.77765655517578, "geo/layer_7/stable_rank_down_proj": 139.98814392089844, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.43577274680137634, "geo/layer_7/attn_entropy_mean": 4.641186237335205, "geo/layer_7/attn_entropy_std": 0.8006513118743896, "geo/layer_14/stable_rank_q_proj": 50.19664001464844, "geo/layer_14/stable_rank_k_proj": 41.442623138427734, "geo/layer_14/stable_rank_o_proj": 43.493778228759766, "geo/layer_14/stable_rank_gate_proj": 70.96403503417969, "geo/layer_14/stable_rank_down_proj": 126.75849151611328, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.40653035044670105, "geo/layer_14/attn_entropy_mean": 5.543495178222656, "geo/layer_14/attn_entropy_std": 0.42618298530578613, "geo/layer_21/stable_rank_q_proj": 39.76869583129883, "geo/layer_21/stable_rank_k_proj": 30.08541488647461, "geo/layer_21/stable_rank_o_proj": 68.29985046386719, "geo/layer_21/stable_rank_gate_proj": 64.34188842773438, "geo/layer_21/stable_rank_down_proj": 49.5923957824707, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14185450971126556, "geo/layer_21/attn_entropy_mean": 5.681931495666504, "geo/layer_21/attn_entropy_std": 0.2998330295085907, "geo/layer_27/stable_rank_q_proj": 43.78517532348633, "geo/layer_27/stable_rank_k_proj": 32.21356964111328, "geo/layer_27/stable_rank_o_proj": 115.38373565673828, "geo/layer_27/stable_rank_gate_proj": 77.93077850341797, "geo/layer_27/stable_rank_down_proj": 127.80435943603516, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09723437577486038, "geo/layer_27/attn_entropy_mean": 4.149139404296875, "geo/layer_27/attn_entropy_std": 0.7613852024078369, "attnres/final_alpha/block_0": 0.2361065149307251, "attnres/block_norm/0": 1.774857521057129, "attnres/final_alpha/block_1": 0.003941335715353489, "attnres/block_norm/1": 48367.7265625, "attnres/final_alpha/block_2": 0.010062750428915024, "attnres/block_norm/2": 29012.56640625, "attnres/final_alpha/block_3": 0.01190549973398447, "attnres/block_norm/3": 61915.9765625, "attnres/final_alpha/block_4": 0.01390434056520462, "attnres/block_norm/4": 15805.2236328125, "attnres/final_alpha/block_5": 0.618503212928772, "attnres/block_norm/5": 6836.5048828125, "attnres/final_alpha/block_6": 0.10557638853788376, "attnres/block_norm/6": 41044.8515625, "geo/tier1_time_s": 1.3615424633026123, "geo/step": 73425.0, "geo/rankme_slope": -0.00017367923731992797} {"step": 73430, "timestamp": 1778273909.8976345, "train/loss": 2.1385576009750364, "train/z_loss": 0.0013827662914991379, "train/perplexity": 8.487186885691486, "train/grad_norm": 0.115234375, "optim/muon_lr": 0.019931466579437257, "optim/adamw_lr": 0.0005979439973831176, "perf/tokens_per_sec": 1636538.9328063123, "perf/iters_per_sec": 0.7803625740081369, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2814556121826173, "data/tokens_consumed": 153995968512, "data/tokens_consumed_B": 153.995968512, "train/loss_slope": -1.2520267267395981e-05} {"step": 73440, "timestamp": 1778273920.243368, "train/loss": 2.1124568939208985, "train/z_loss": 0.0013700955547392367, "train/perplexity": 8.268531253441076, "train/grad_norm": 0.10498046875, "optim/muon_lr": 0.019926873445510866, "optim/adamw_lr": 0.0005978062033653259, "perf/tokens_per_sec": 2028046.6062610596, "perf/iters_per_sec": 0.9670479804330157, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034074854850769, "data/tokens_consumed": 154016940032, "data/tokens_consumed_B": 154.016940032, "train/loss_slope": -1.5020773241264533e-05} {"step": 73450, "timestamp": 1778273931.0550466, "grad/layer_0/attn": 0.0025486601516604424, "grad/layer_0/mlp": 0.002955088159069419, "grad/layer_0/attn_mlp_ratio": 0.8624649852126705, "grad/layer_4/attn": 0.00205701170489192, "grad/layer_4/mlp": 0.0025549777783453465, "grad/layer_4/attn_mlp_ratio": 0.8050996144921931, "grad/layer_8/attn": 0.004174884874373674, "grad/layer_8/mlp": 0.0037275494541972876, "grad/layer_8/attn_mlp_ratio": 1.1200078801561952, "grad/layer_12/attn": 0.004827935714274645, "grad/layer_12/mlp": 0.006502253003418446, "grad/layer_12/attn_mlp_ratio": 0.7425019662394321, "grad/layer_16/attn": 0.0034512600395828485, "grad/layer_16/mlp": 0.0043440996669232845, "grad/layer_16/attn_mlp_ratio": 0.7944707130948812, "grad/layer_20/attn": 0.003529063891619444, "grad/layer_20/mlp": 0.005725574679672718, "grad/layer_20/attn_mlp_ratio": 0.6163684917972847, "grad/layer_24/attn": 0.007476938888430595, "grad/layer_24/mlp": 0.008677658624947071, "grad/layer_24/attn_mlp_ratio": 0.8616308990045239, "grad/layer_27/attn": 0.007216328755021095, "grad/layer_27/mlp": 0.008252478204667568, "grad/layer_27/attn_mlp_ratio": 0.8744438323381679} {"step": 73450, "timestamp": 1778273931.0709043, "train/loss": 2.1710929393768312, "train/z_loss": 0.0013854346121661364, "train/perplexity": 8.767861547105978, "train/grad_norm": 0.1533203125, "optim/muon_lr": 0.01992213249206543, "optim/adamw_lr": 0.0005976639747619629, "perf/tokens_per_sec": 1938124.7700253974, "perf/iters_per_sec": 0.9241699075819957, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0820521116256714, "data/tokens_consumed": 154037911552, "data/tokens_consumed_B": 154.037911552, "train/loss_slope": -1.27507451272318e-05} {"step": 73460, "timestamp": 1778273941.426822, "train/loss": 2.154631769657135, "train/z_loss": 0.0013766984804533422, "train/perplexity": 8.62471371288144, "train/grad_norm": 0.146484375, "optim/muon_lr": 0.019917243123054505, "optim/adamw_lr": 0.0005975172936916351, "perf/tokens_per_sec": 2026477.417320089, "perf/iters_per_sec": 0.9662997328377194, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348755836486816, "data/tokens_consumed": 154058883072, "data/tokens_consumed_B": 154.058883072, "train/loss_slope": -9.320724521926477e-06} {"step": 73470, "timestamp": 1778273951.7782738, "train/loss": 2.148724675178528, "train/z_loss": 0.001375002518761903, "train/perplexity": 8.573916892644295, "train/grad_norm": 0.228515625, "optim/muon_lr": 0.019912205338478088, "optim/adamw_lr": 0.0005973661601543426, "perf/tokens_per_sec": 2026877.2749176705, "perf/iters_per_sec": 0.9664903997982361, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034671425819397, "data/tokens_consumed": 154079854592, "data/tokens_consumed_B": 154.079854592, "train/loss_slope": -7.1523489338336e-06} {"step": 73480, "timestamp": 1778273962.6894748, "train/loss": 2.1318332672119142, "train/z_loss": 0.0013792657176963986, "train/perplexity": 8.43030766013443, "train/grad_norm": 0.11669921875, "optim/muon_lr": 0.01990701973438263, "optim/adamw_lr": 0.0005972105920314788, "perf/tokens_per_sec": 1923101.1287338617, "perf/iters_per_sec": 0.9170060771626767, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.090505313873291, "data/tokens_consumed": 154100826112, "data/tokens_consumed_B": 154.100826112, "train/loss_slope": -8.573795466533124e-06} {"step": 73490, "timestamp": 1778273973.0498385, "train/loss": 2.1862907648086547, "train/z_loss": 0.0013850861112587153, "train/perplexity": 8.90213169907605, "train/grad_norm": 0.16796875, "optim/muon_lr": 0.01990168571472168, "optim/adamw_lr": 0.0005970505714416503, "perf/tokens_per_sec": 2025583.9933873995, "perf/iters_per_sec": 0.9658737151085851, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0353320360183715, "data/tokens_consumed": 154121797632, "data/tokens_consumed_B": 154.121797632, "train/loss_slope": -2.0499779732420476e-06} {"step": 73500, "timestamp": 1778273983.3980048, "grad/layer_0/attn": 0.003782447427511215, "grad/layer_0/mlp": 0.003758992999792099, "grad/layer_0/attn_mlp_ratio": 1.006239524015198, "grad/layer_4/attn": 0.0023198712151497602, "grad/layer_4/mlp": 0.0027298505883663893, "grad/layer_4/attn_mlp_ratio": 0.8498161547941763, "grad/layer_8/attn": 0.005330218467861414, "grad/layer_8/mlp": 0.0038277439307421446, "grad/layer_8/attn_mlp_ratio": 1.3925221814866133, "grad/layer_12/attn": 0.004458379931747913, "grad/layer_12/mlp": 0.007572958245873451, "grad/layer_12/attn_mlp_ratio": 0.5887236834172356, "grad/layer_16/attn": 0.003815832082182169, "grad/layer_16/mlp": 0.005151182413101196, "grad/layer_16/attn_mlp_ratio": 0.740768177496569, "grad/layer_20/attn": 0.003449870739132166, "grad/layer_20/mlp": 0.006841964554041624, "grad/layer_20/attn_mlp_ratio": 0.5042222393087473, "grad/layer_24/attn": 0.019009999930858612, "grad/layer_24/mlp": 0.014944355003535748, "grad/layer_24/attn_mlp_ratio": 1.2720522096240177, "grad/layer_27/attn": 0.0053836326114833355, "grad/layer_27/mlp": 0.015299912542104721, "grad/layer_27/attn_mlp_ratio": 0.3518734215950883} {"step": 73500, "timestamp": 1778273984.0118299, "eos/sharpness": 74.75523948669432, "eos/L0_probe": 1.9651106595993042, "eos/L_plus": 2.378103256225586, "eos/L_minus": 2.299670457839966, "eos/grad_norm": 0.2702617645263672, "eos/embed_grad_frac": 0.04222520813345909, "eos/time_s": 0.6110930442810059} {"step": 73500, "timestamp": 1778273984.0338683, "train/loss": 2.087045764923096, "train/z_loss": 0.0013770350138656794, "train/perplexity": 8.061065670184469, "train/grad_norm": 0.26953125, "optim/muon_lr": 0.01989620387554169, "optim/adamw_lr": 0.0005968861162662506, "perf/tokens_per_sec": 1910219.5496194693, "perf/iters_per_sec": 0.9108636615846011, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0978591442108154, "data/tokens_consumed": 154142769152, "data/tokens_consumed_B": 154.142769152, "train/loss_slope": -4.431104996237925e-06} {"step": 73500, "timestamp": 1778273985.398598, "geo/rankme_last": 437.3862609863281, "geo/layer_0/stable_rank_q_proj": 19.21767807006836, "geo/layer_0/stable_rank_k_proj": 15.779898643493652, "geo/layer_0/stable_rank_o_proj": 46.579166412353516, "geo/layer_0/stable_rank_gate_proj": 129.62171936035156, "geo/layer_0/stable_rank_down_proj": 56.384151458740234, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06816066056489944, "geo/layer_0/attn_entropy_mean": 6.154106616973877, "geo/layer_0/attn_entropy_std": 0.42888176441192627, "geo/layer_7/stable_rank_q_proj": 42.62000274658203, "geo/layer_7/stable_rank_k_proj": 39.95093536376953, "geo/layer_7/stable_rank_o_proj": 88.32241821289062, "geo/layer_7/stable_rank_gate_proj": 77.87200927734375, "geo/layer_7/stable_rank_down_proj": 140.13589477539062, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4376799464225769, "geo/layer_7/attn_entropy_mean": 4.625611305236816, "geo/layer_7/attn_entropy_std": 0.7976706624031067, "geo/layer_14/stable_rank_q_proj": 50.25957107543945, "geo/layer_14/stable_rank_k_proj": 41.42129135131836, "geo/layer_14/stable_rank_o_proj": 43.466651916503906, "geo/layer_14/stable_rank_gate_proj": 71.04216766357422, "geo/layer_14/stable_rank_down_proj": 126.665283203125, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39669227600097656, "geo/layer_14/attn_entropy_mean": 5.547971248626709, "geo/layer_14/attn_entropy_std": 0.4198911488056183, "geo/layer_21/stable_rank_q_proj": 39.834781646728516, "geo/layer_21/stable_rank_k_proj": 30.06847381591797, "geo/layer_21/stable_rank_o_proj": 68.17353057861328, "geo/layer_21/stable_rank_gate_proj": 64.32406616210938, "geo/layer_21/stable_rank_down_proj": 49.63737869262695, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13795220851898193, "geo/layer_21/attn_entropy_mean": 5.682578086853027, "geo/layer_21/attn_entropy_std": 0.3055938184261322, "geo/layer_27/stable_rank_q_proj": 43.73577117919922, "geo/layer_27/stable_rank_k_proj": 32.1622428894043, "geo/layer_27/stable_rank_o_proj": 115.37646484375, "geo/layer_27/stable_rank_gate_proj": 77.90254211425781, "geo/layer_27/stable_rank_down_proj": 127.92784118652344, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09327464550733566, "geo/layer_27/attn_entropy_mean": 4.133781909942627, "geo/layer_27/attn_entropy_std": 0.7903404831886292, "attnres/final_alpha/block_0": 0.23733501136302948, "attnres/block_norm/0": 1.7748112678527832, "attnres/final_alpha/block_1": 0.0040765004232525826, "attnres/block_norm/1": 48335.625, "attnres/final_alpha/block_2": 0.010140263475477695, "attnres/block_norm/2": 29218.125, "attnres/final_alpha/block_3": 0.012034907937049866, "attnres/block_norm/3": 61993.796875, "attnres/final_alpha/block_4": 0.014236118644475937, "attnres/block_norm/4": 15830.056640625, "attnres/final_alpha/block_5": 0.6165674924850464, "attnres/block_norm/5": 6896.7099609375, "attnres/final_alpha/block_6": 0.10560963302850723, "attnres/block_norm/6": 41401.53125, "geo/tier1_time_s": 1.3612205982208252, "geo/step": 73500.0, "geo/rankme_slope": -0.00018553888352215887} {"step": 73500, "timestamp": 1778273992.3241863, "geo/ww_alpha_mean": 8.028133402738233, "geo/ww_alpha_std": 5.810347447365712, "geo/ww_alpha_min": 1.3577758187667754, "geo/ww_alpha_max": 49.513149149331355, "geo/ww_alpha_healthy_frac": 0.17766497461928935, "geo/ww_alpha_by_type/q_proj": 3.9479862616019488, "geo/ww_alpha_by_type/k_proj": 4.533907367550809, "geo/ww_alpha_by_type/v_proj": 7.375110479674846, "geo/ww_alpha_by_type/o_proj": 9.50874876001703, "geo/ww_alpha_by_type/gate_proj": 8.137933213668399, "geo/ww_alpha_by_type/up_proj": 14.646200170667154, "geo/ww_alpha_by_type/down_proj": 8.150063251304752, "geo/twonn_id/layer_0": 0.7076170444488525, "geo/twonn_id/layer_7": 3.3883111476898193, "geo/twonn_id/layer_14": 4.578010559082031, "geo/twonn_id/layer_21": 7.385119915008545, "geo/twonn_id/layer_27": 5.8280534744262695, "geo/tier2_time_s": 6.919007778167725} {"step": 73500, "timestamp": 1778273992.941682, "eoc/jacobian_sigma/layer_0/attn": 1277.57666015625, "eoc/jacobian_sigma/layer_0/mlp": 9407.58984375, "eoc/jacobian_sigma/layer_0": 9407.58984375, "eoc/jacobian_sigma/layer_7/attn": 1.151185154914856, "eoc/jacobian_sigma/layer_7/mlp": 1.7960662841796875, "eoc/jacobian_sigma/layer_7": 1.7960662841796875, "eoc/jacobian_sigma/layer_14/attn": 1.4102610349655151, "eoc/jacobian_sigma/layer_14/mlp": 7.232786178588867, "eoc/jacobian_sigma/layer_14": 7.232786178588867, "eoc/jacobian_sigma/layer_21/attn": 1.1311720609664917, "eoc/jacobian_sigma/layer_21/mlp": 4.1622514724731445, "eoc/jacobian_sigma/layer_21": 4.1622514724731445, "eoc/jacobian_sigma/layer_27/attn": 3.099111557006836, "eoc/jacobian_sigma/layer_27/mlp": 26.8062801361084, "eoc/jacobian_sigma/layer_27": 26.8062801361084, "eoc/layer0_sigma": 9407.58984375, "eoc/sigma_max": 26.8062801361084, "eoc/sigma_min": 1.7960662841796875, "eoc/sigma_mean": 9.999346017837524, "eoc/time_s": 0.6089310646057129} {"step": 73510, "timestamp": 1778274003.3123572, "train/loss": 2.1738831281661986, "train/z_loss": 0.0013728392426855863, "train/perplexity": 8.792359697440162, "train/grad_norm": 0.1552734375, "optim/muon_lr": 0.019890574216842653, "optim/adamw_lr": 0.0005967172265052795, "perf/tokens_per_sec": 1088010.9575507168, "perf/iters_per_sec": 0.5188040530923447, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.9275099992752076, "data/tokens_consumed": 154163740672, "data/tokens_consumed_B": 154.163740672, "train/loss_slope": -3.0686319750634493e-06} {"step": 73520, "timestamp": 1778274013.6660676, "train/loss": 2.117847204208374, "train/z_loss": 0.001375973375979811, "train/perplexity": 8.313221541570774, "train/grad_norm": 0.1865234375, "optim/muon_lr": 0.019884796738624574, "optim/adamw_lr": 0.0005965439021587371, "perf/tokens_per_sec": 2026479.9850950702, "perf/iters_per_sec": 0.9663009572482444, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348742723464965, "data/tokens_consumed": 154184712192, "data/tokens_consumed_B": 154.184712192, "train/loss_slope": -4.218029789905978e-06} {"step": 73530, "timestamp": 1778274024.024048, "train/loss": 2.176190161705017, "train/z_loss": 0.0013677859446033836, "train/perplexity": 8.812667382395516, "train/grad_norm": 0.130859375, "optim/muon_lr": 0.01987887144088745, "optim/adamw_lr": 0.0005963661432266235, "perf/tokens_per_sec": 2026076.2707397095, "perf/iters_per_sec": 0.9661084512423084, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350804805755616, "data/tokens_consumed": 154205683712, "data/tokens_consumed_B": 154.205683712, "train/loss_slope": -3.032164211713335e-07} {"step": 73540, "timestamp": 1778274034.3814263, "train/loss": 2.1712727785110473, "train/z_loss": 0.0013703210512176155, "train/perplexity": 8.769438493529625, "train/grad_norm": 0.318359375, "optim/muon_lr": 0.019872798323631286, "optim/adamw_lr": 0.0005961839497089386, "perf/tokens_per_sec": 2026111.785921026, "perf/iters_per_sec": 0.9661253862004404, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350623369216918, "data/tokens_consumed": 154226655232, "data/tokens_consumed_B": 154.226655232, "train/loss_slope": -4.1523787340812707e-07} {"step": 73550, "timestamp": 1778274044.7299676, "grad/layer_0/attn": 0.002657813485711813, "grad/layer_0/mlp": 0.002882090862840414, "grad/layer_0/attn_mlp_ratio": 0.9221823738319602, "grad/layer_4/attn": 0.003937968052923679, "grad/layer_4/mlp": 0.002601990941911936, "grad/layer_4/attn_mlp_ratio": 1.5134441239390553, "grad/layer_8/attn": 0.004077850375324488, "grad/layer_8/mlp": 0.003820301964879036, "grad/layer_8/attn_mlp_ratio": 1.0674156928095178, "grad/layer_12/attn": 0.004814561922103167, "grad/layer_12/mlp": 0.0074014016427099705, "grad/layer_12/attn_mlp_ratio": 0.6504932564760834, "grad/layer_16/attn": 0.00539153628051281, "grad/layer_16/mlp": 0.004577362444251776, "grad/layer_16/attn_mlp_ratio": 1.1778696199809362, "grad/layer_20/attn": 0.0038383519276976585, "grad/layer_20/mlp": 0.006184754427522421, "grad/layer_20/attn_mlp_ratio": 0.6206150802941054, "grad/layer_24/attn": 0.006594757549464703, "grad/layer_24/mlp": 0.007839776575565338, "grad/layer_24/attn_mlp_ratio": 0.8411920163515557, "grad/layer_27/attn": 0.006777917966246605, "grad/layer_27/mlp": 0.006644975859671831, "grad/layer_27/attn_mlp_ratio": 1.0200063939104664} {"step": 73550, "timestamp": 1778274044.7458303, "train/loss": 2.15207462310791, "train/z_loss": 0.0013826159294694663, "train/perplexity": 8.60268723044686, "train/grad_norm": 0.0986328125, "optim/muon_lr": 0.019866577982902527, "optim/adamw_lr": 0.0005959973394870757, "perf/tokens_per_sec": 2024350.455777363, "perf/iters_per_sec": 0.9652855185400786, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0359629154205323, "data/tokens_consumed": 154247626752, "data/tokens_consumed_B": 154.247626752, "train/loss_slope": -4.375375758553605e-06} {"step": 73560, "timestamp": 1778274055.1072392, "train/loss": 2.1530229091644286, "train/z_loss": 0.0013749578618444502, "train/perplexity": 8.610848907986973, "train/grad_norm": 0.259765625, "optim/muon_lr": 0.019860209822654726, "optim/adamw_lr": 0.0005958062946796416, "perf/tokens_per_sec": 2025411.4660029209, "perf/iters_per_sec": 0.9657914476408581, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354202270507813, "data/tokens_consumed": 154268598272, "data/tokens_consumed_B": 154.268598272, "train/loss_slope": -5.7019353735529015e-06} {"step": 73570, "timestamp": 1778274065.4605772, "train/loss": 2.0813207387924195, "train/z_loss": 0.0013828998780809343, "train/perplexity": 8.015047711280195, "train/grad_norm": 0.185546875, "optim/muon_lr": 0.019853694438934328, "optim/adamw_lr": 0.0005956108331680297, "perf/tokens_per_sec": 2026865.084937791, "perf/iters_per_sec": 0.9664845871628719, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346776485443114, "data/tokens_consumed": 154289569792, "data/tokens_consumed_B": 154.289569792, "train/loss_slope": -9.119027819034018e-06} {"step": 73575, "timestamp": 1778274071.2445295, "eos/sharpness": 23.06399345397949, "eos/L0_probe": 1.968543529510498, "eos/L_plus": 2.0676212310791016, "eos/L_minus": 2.1001057624816895, "eos/grad_norm": 0.11048391461372375, "eos/embed_grad_frac": 0.2342703491449356, "eos/time_s": 0.6148405075073242} {"step": 73575, "timestamp": 1778274072.6246264, "geo/rankme_last": 438.11663818359375, "geo/layer_0/stable_rank_q_proj": 19.18418312072754, "geo/layer_0/stable_rank_k_proj": 15.793081283569336, "geo/layer_0/stable_rank_o_proj": 46.55930709838867, "geo/layer_0/stable_rank_gate_proj": 129.33885192871094, "geo/layer_0/stable_rank_down_proj": 56.34782028198242, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0670635849237442, "geo/layer_0/attn_entropy_mean": 6.1553826332092285, "geo/layer_0/attn_entropy_std": 0.43174105882644653, "geo/layer_7/stable_rank_q_proj": 42.54491424560547, "geo/layer_7/stable_rank_k_proj": 40.04965591430664, "geo/layer_7/stable_rank_o_proj": 88.22686767578125, "geo/layer_7/stable_rank_gate_proj": 77.755615234375, "geo/layer_7/stable_rank_down_proj": 140.0347137451172, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.44655582308769226, "geo/layer_7/attn_entropy_mean": 4.627295017242432, "geo/layer_7/attn_entropy_std": 0.798815906047821, "geo/layer_14/stable_rank_q_proj": 50.12384796142578, "geo/layer_14/stable_rank_k_proj": 41.38023376464844, "geo/layer_14/stable_rank_o_proj": 43.42981719970703, "geo/layer_14/stable_rank_gate_proj": 70.9856185913086, "geo/layer_14/stable_rank_down_proj": 126.95162200927734, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3999151885509491, "geo/layer_14/attn_entropy_mean": 5.510275840759277, "geo/layer_14/attn_entropy_std": 0.43363311886787415, "geo/layer_21/stable_rank_q_proj": 39.807334899902344, "geo/layer_21/stable_rank_k_proj": 30.00191879272461, "geo/layer_21/stable_rank_o_proj": 68.09408569335938, "geo/layer_21/stable_rank_gate_proj": 64.2818832397461, "geo/layer_21/stable_rank_down_proj": 49.65686798095703, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13722698390483856, "geo/layer_21/attn_entropy_mean": 5.6892290115356445, "geo/layer_21/attn_entropy_std": 0.3002607226371765, "geo/layer_27/stable_rank_q_proj": 43.7093391418457, "geo/layer_27/stable_rank_k_proj": 32.20171356201172, "geo/layer_27/stable_rank_o_proj": 115.47618103027344, "geo/layer_27/stable_rank_gate_proj": 77.84849548339844, "geo/layer_27/stable_rank_down_proj": 127.70355987548828, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.1109161376953125, "geo/layer_27/attn_entropy_mean": 4.140600681304932, "geo/layer_27/attn_entropy_std": 0.7769894599914551, "attnres/final_alpha/block_0": 0.23711591958999634, "attnres/block_norm/0": 1.7748632431030273, "attnres/final_alpha/block_1": 0.004102733451873064, "attnres/block_norm/1": 48259.234375, "attnres/final_alpha/block_2": 0.010102564468979836, "attnres/block_norm/2": 29097.880859375, "attnres/final_alpha/block_3": 0.012035507708787918, "attnres/block_norm/3": 61822.875, "attnres/final_alpha/block_4": 0.014014370739459991, "attnres/block_norm/4": 15754.291015625, "attnres/final_alpha/block_5": 0.6152325868606567, "attnres/block_norm/5": 6894.3837890625, "attnres/final_alpha/block_6": 0.10739636421203613, "attnres/block_norm/6": 41177.078125, "geo/tier1_time_s": 1.3611340522766113, "geo/step": 73575.0, "geo/rankme_slope": -0.0001713349011479592} {"step": 73580, "timestamp": 1778274077.8016531, "train/loss": 2.087502110004425, "train/z_loss": 0.0013764270581305027, "train/perplexity": 8.064745137342845, "train/grad_norm": 0.173828125, "optim/muon_lr": 0.019847031831741333, "optim/adamw_lr": 0.0005954109549522399, "perf/tokens_per_sec": 1700263.0706356731, "perf/iters_per_sec": 0.8107486107996336, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2334279537200927, "data/tokens_consumed": 154310541312, "data/tokens_consumed_B": 154.310541312, "train/loss_slope": -1.4041640763521691e-05} {"step": 73590, "timestamp": 1778274088.1722567, "train/loss": 2.1140657663345337, "train/z_loss": 0.0013812281773425638, "train/perplexity": 8.28184497244178, "train/grad_norm": 0.09228515625, "optim/muon_lr": 0.019840222001075745, "optim/adamw_lr": 0.0005952066600322723, "perf/tokens_per_sec": 2023113.394193381, "perf/iters_per_sec": 0.9646956416098504, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0365963697433471, "data/tokens_consumed": 154331512832, "data/tokens_consumed_B": 154.331512832, "train/loss_slope": -1.360228005880498e-05} {"step": 73600, "timestamp": 1778274098.5246036, "grad/layer_0/attn": 0.0026334624271839857, "grad/layer_0/mlp": 0.0030399078968912363, "grad/layer_0/attn_mlp_ratio": 0.8662967530192024, "grad/layer_4/attn": 0.0028873293194919825, "grad/layer_4/mlp": 0.002530456054955721, "grad/layer_4/attn_mlp_ratio": 1.141031159080686, "grad/layer_8/attn": 0.003683590330183506, "grad/layer_8/mlp": 0.0035561590921133757, "grad/layer_8/attn_mlp_ratio": 1.035833923957268, "grad/layer_12/attn": 0.004481439478695393, "grad/layer_12/mlp": 0.006715199910104275, "grad/layer_12/attn_mlp_ratio": 0.6673575577722524, "grad/layer_16/attn": 0.003370673395693302, "grad/layer_16/mlp": 0.004899498075246811, "grad/layer_16/attn_mlp_ratio": 0.6879629862345047, "grad/layer_20/attn": 0.0033606798388063908, "grad/layer_20/mlp": 0.0069618565030395985, "grad/layer_20/attn_mlp_ratio": 0.48272752951261494, "grad/layer_24/attn": 0.016137778759002686, "grad/layer_24/mlp": 0.013205581344664097, "grad/layer_24/attn_mlp_ratio": 1.22204227255161, "grad/layer_27/attn": 0.013033624738454819, "grad/layer_27/mlp": 0.014571414329111576, "grad/layer_27/attn_mlp_ratio": 0.8944653109594854} {"step": 73600, "timestamp": 1778274098.540649, "train/loss": 2.0986343026161194, "train/z_loss": 0.0013759625027887524, "train/perplexity": 8.155025007697668, "train/grad_norm": 0.287109375, "optim/muon_lr": 0.019833264946937563, "optim/adamw_lr": 0.0005949979484081268, "perf/tokens_per_sec": 2023965.286234929, "perf/iters_per_sec": 0.965101855390038, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0361600637435913, "data/tokens_consumed": 154352484352, "data/tokens_consumed_B": 154.352484352, "train/loss_slope": -1.8686517015291768e-05} {"step": 73610, "timestamp": 1778274108.9179912, "train/loss": 2.1452373027801515, "train/z_loss": 0.0013671368942596018, "train/perplexity": 8.544068527959213, "train/grad_norm": 0.126953125, "optim/muon_lr": 0.01982616126537323, "optim/adamw_lr": 0.0005947848379611969, "perf/tokens_per_sec": 2021972.285344258, "perf/iters_per_sec": 0.9641515185090341, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0371813774108887, "data/tokens_consumed": 154373455872, "data/tokens_consumed_B": 154.373455872, "train/loss_slope": -2.12822462680495e-05} {"step": 73620, "timestamp": 1778274119.3149252, "train/loss": 2.140136790275574, "train/z_loss": 0.0013730793842114508, "train/perplexity": 8.500600348824179, "train/grad_norm": 0.123046875, "optim/muon_lr": 0.019818910360336305, "optim/adamw_lr": 0.0005945673108100891, "perf/tokens_per_sec": 2018083.3179251533, "perf/iters_per_sec": 0.9622971143365637, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.039180088043213, "data/tokens_consumed": 154394427392, "data/tokens_consumed_B": 154.394427392, "train/loss_slope": -2.020054803465422e-05} {"step": 73630, "timestamp": 1778274129.6928382, "train/loss": 2.149184155464172, "train/z_loss": 0.0013740589027293026, "train/perplexity": 8.577857343637666, "train/grad_norm": 0.11572265625, "optim/muon_lr": 0.01981151282787323, "optim/adamw_lr": 0.0005943453848361968, "perf/tokens_per_sec": 2022001.8466796305, "perf/iters_per_sec": 0.9641656144521858, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037166213989258, "data/tokens_consumed": 154415398912, "data/tokens_consumed_B": 154.415398912, "train/loss_slope": -1.6513223845501602e-05} {"step": 73640, "timestamp": 1778274140.076092, "train/loss": 2.051000237464905, "train/z_loss": 0.0013916100841015577, "train/perplexity": 7.775674739108248, "train/grad_norm": 0.103515625, "optim/muon_lr": 0.01980396866798401, "optim/adamw_lr": 0.0005941190600395202, "perf/tokens_per_sec": 2021131.915239826, "perf/iters_per_sec": 0.963750798816598, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0376126289367675, "data/tokens_consumed": 154436370432, "data/tokens_consumed_B": 154.436370432, "train/loss_slope": -1.8393882258747418e-05} {"step": 73650, "timestamp": 1778274150.4470258, "grad/layer_0/attn": 0.0030084417667239904, "grad/layer_0/mlp": 0.003303651697933674, "grad/layer_0/attn_mlp_ratio": 0.9106412996084128, "grad/layer_4/attn": 0.0029147504828870296, "grad/layer_4/mlp": 0.002648020163178444, "grad/layer_4/attn_mlp_ratio": 1.1007281641373947, "grad/layer_8/attn": 0.00485871871933341, "grad/layer_8/mlp": 0.0037605524994432926, "grad/layer_8/attn_mlp_ratio": 1.292022539467388, "grad/layer_12/attn": 0.004297600127756596, "grad/layer_12/mlp": 0.006839357316493988, "grad/layer_12/attn_mlp_ratio": 0.6283631437936525, "grad/layer_16/attn": 0.004643899854272604, "grad/layer_16/mlp": 0.004879860673099756, "grad/layer_16/attn_mlp_ratio": 0.9516459731540114, "grad/layer_20/attn": 0.0028303158469498158, "grad/layer_20/mlp": 0.00613738689571619, "grad/layer_20/attn_mlp_ratio": 0.46115974907975277, "grad/layer_24/attn": 0.005254251882433891, "grad/layer_24/mlp": 0.007182810455560684, "grad/layer_24/attn_mlp_ratio": 0.7315036143291056, "grad/layer_27/attn": 0.003843628568574786, "grad/layer_27/mlp": 0.00634777033701539, "grad/layer_27/attn_mlp_ratio": 0.6055084390200464} {"step": 73650, "timestamp": 1778274151.0782025, "eos/sharpness": 8.862185478210447, "eos/L0_probe": 1.9680591821670532, "eos/L_plus": 2.0119922161102295, "eos/L_minus": 2.0127480030059814, "eos/grad_norm": 0.08989650011062622, "eos/embed_grad_frac": 0.27254125475883484, "eos/time_s": 0.6281132698059082} {"step": 73650, "timestamp": 1778274151.0997348, "train/loss": 2.068002450466156, "train/z_loss": 0.0013891723705455662, "train/perplexity": 7.909008691433253, "train/grad_norm": 0.08984375, "optim/muon_lr": 0.01979627788066864, "optim/adamw_lr": 0.0005938883364200591, "perf/tokens_per_sec": 1903274.521966479, "perf/iters_per_sec": 0.9075520143349071, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1018652200698853, "data/tokens_consumed": 154457341952, "data/tokens_consumed_B": 154.457341952, "train/loss_slope": -2.2829292459313387e-05} {"step": 73650, "timestamp": 1778274152.4689116, "geo/rankme_last": 437.5154113769531, "geo/layer_0/stable_rank_q_proj": 19.18444061279297, "geo/layer_0/stable_rank_k_proj": 15.824967384338379, "geo/layer_0/stable_rank_o_proj": 46.63341522216797, "geo/layer_0/stable_rank_gate_proj": 129.28466796875, "geo/layer_0/stable_rank_down_proj": 56.3846321105957, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06352318823337555, "geo/layer_0/attn_entropy_mean": 6.156633377075195, "geo/layer_0/attn_entropy_std": 0.432975172996521, "geo/layer_7/stable_rank_q_proj": 42.493412017822266, "geo/layer_7/stable_rank_k_proj": 39.98686981201172, "geo/layer_7/stable_rank_o_proj": 88.56221008300781, "geo/layer_7/stable_rank_gate_proj": 77.7713851928711, "geo/layer_7/stable_rank_down_proj": 140.23818969726562, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4424106776714325, "geo/layer_7/attn_entropy_mean": 4.626914024353027, "geo/layer_7/attn_entropy_std": 0.7884793877601624, "geo/layer_14/stable_rank_q_proj": 50.13799285888672, "geo/layer_14/stable_rank_k_proj": 41.42268371582031, "geo/layer_14/stable_rank_o_proj": 43.47021484375, "geo/layer_14/stable_rank_gate_proj": 70.79161071777344, "geo/layer_14/stable_rank_down_proj": 126.85165405273438, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3967403471469879, "geo/layer_14/attn_entropy_mean": 5.537627696990967, "geo/layer_14/attn_entropy_std": 0.43133747577667236, "geo/layer_21/stable_rank_q_proj": 39.70050811767578, "geo/layer_21/stable_rank_k_proj": 29.936290740966797, "geo/layer_21/stable_rank_o_proj": 68.12151336669922, "geo/layer_21/stable_rank_gate_proj": 64.19290924072266, "geo/layer_21/stable_rank_down_proj": 49.63839340209961, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14373640716075897, "geo/layer_21/attn_entropy_mean": 5.687304496765137, "geo/layer_21/attn_entropy_std": 0.30235275626182556, "geo/layer_27/stable_rank_q_proj": 43.72600173950195, "geo/layer_27/stable_rank_k_proj": 32.28556823730469, "geo/layer_27/stable_rank_o_proj": 115.47410583496094, "geo/layer_27/stable_rank_gate_proj": 77.87747192382812, "geo/layer_27/stable_rank_down_proj": 127.93904113769531, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0915471762418747, "geo/layer_27/attn_entropy_mean": 4.168262481689453, "geo/layer_27/attn_entropy_std": 0.7885016202926636, "attnres/final_alpha/block_0": 0.23757141828536987, "attnres/block_norm/0": 1.7750672101974487, "attnres/final_alpha/block_1": 0.004055148456245661, "attnres/block_norm/1": 48412.625, "attnres/final_alpha/block_2": 0.010053490288555622, "attnres/block_norm/2": 29153.56640625, "attnres/final_alpha/block_3": 0.01198677346110344, "attnres/block_norm/3": 61897.93359375, "attnres/final_alpha/block_4": 0.014174222946166992, "attnres/block_norm/4": 15789.8427734375, "attnres/final_alpha/block_5": 0.6145137548446655, "attnres/block_norm/5": 6913.10400390625, "attnres/final_alpha/block_6": 0.1076451987028122, "attnres/block_norm/6": 41251.6015625, "geo/tier1_time_s": 1.3657162189483643, "geo/step": 73650.0, "geo/rankme_slope": -0.0001803992300045018} {"step": 73660, "timestamp": 1778274162.8562531, "train/loss": 2.1126015186309814, "train/z_loss": 0.0013749618898145854, "train/perplexity": 8.269727173854152, "train/grad_norm": 0.15234375, "optim/muon_lr": 0.019788440465927124, "optim/adamw_lr": 0.0005936532139778136, "perf/tokens_per_sec": 1784315.8273383952, "perf/iters_per_sec": 0.8508280884448982, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1753255605697632, "data/tokens_consumed": 154478313472, "data/tokens_consumed_B": 154.478313472, "train/loss_slope": -2.3424773040277056e-05} {"step": 73670, "timestamp": 1778274173.233607, "train/loss": 2.1676031827926634, "train/z_loss": 0.0013712239218875767, "train/perplexity": 8.737317171758846, "train/grad_norm": 0.08447265625, "optim/muon_lr": 0.019780457019805908, "optim/adamw_lr": 0.0005934137105941772, "perf/tokens_per_sec": 2022189.320991825, "perf/iters_per_sec": 0.9642550091704488, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037070059776306, "data/tokens_consumed": 154499284992, "data/tokens_consumed_B": 154.499284992, "train/loss_slope": -1.9232809368354454e-05} {"step": 73680, "timestamp": 1778274183.6173496, "train/loss": 2.0819250345230103, "train/z_loss": 0.0013834799989126622, "train/perplexity": 8.019892634128263, "train/grad_norm": 0.2099609375, "optim/muon_lr": 0.019772327542304992, "optim/adamw_lr": 0.0005931698262691497, "perf/tokens_per_sec": 2020958.0094831004, "perf/iters_per_sec": 0.9636678740897657, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037701916694641, "data/tokens_consumed": 154520256512, "data/tokens_consumed_B": 154.520256512, "train/loss_slope": -2.3314411562196582e-05} {"step": 73690, "timestamp": 1778274193.9928653, "train/loss": 2.165026092529297, "train/z_loss": 0.0013684899196960032, "train/perplexity": 8.71482930582363, "train/grad_norm": 0.1591796875, "optim/muon_lr": 0.01976405143737793, "optim/adamw_lr": 0.0005929215431213379, "perf/tokens_per_sec": 2022250.828320445, "perf/iters_per_sec": 0.9642843381502366, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037038516998291, "data/tokens_consumed": 154541228032, "data/tokens_consumed_B": 154.541228032, "train/loss_slope": -1.9542472912605015e-05} {"step": 73700, "timestamp": 1778274204.3391595, "grad/layer_0/attn": 0.0027203524950891733, "grad/layer_0/mlp": 0.0029394107405096292, "grad/layer_0/attn_mlp_ratio": 0.925475424394068, "grad/layer_4/attn": 0.0023538481909781694, "grad/layer_4/mlp": 0.002648816676810384, "grad/layer_4/attn_mlp_ratio": 0.8886413781373713, "grad/layer_8/attn": 0.0039664097130298615, "grad/layer_8/mlp": 0.003807323519140482, "grad/layer_8/attn_mlp_ratio": 1.0417842321282091, "grad/layer_12/attn": 0.00511655118316412, "grad/layer_12/mlp": 0.007173673715442419, "grad/layer_12/attn_mlp_ratio": 0.7132400099025926, "grad/layer_16/attn": 0.003736834740266204, "grad/layer_16/mlp": 0.004905216861516237, "grad/layer_16/attn_mlp_ratio": 0.7618082481536397, "grad/layer_20/attn": 0.006811813451349735, "grad/layer_20/mlp": 0.005599235184490681, "grad/layer_20/attn_mlp_ratio": 1.2165613883413284, "grad/layer_24/attn": 0.012941204942762852, "grad/layer_24/mlp": 0.008537785150110722, "grad/layer_24/attn_mlp_ratio": 1.5157566703373133, "grad/layer_27/attn": 0.005117183085530996, "grad/layer_27/mlp": 0.008535435423254967, "grad/layer_27/attn_mlp_ratio": 0.599522200312934} {"step": 73700, "timestamp": 1778274204.355557, "train/loss": 2.110092520713806, "train/z_loss": 0.0013541871216148139, "train/perplexity": 8.249004453101861, "train/grad_norm": 0.140625, "optim/muon_lr": 0.019755629897117617, "optim/adamw_lr": 0.0005926688969135284, "perf/tokens_per_sec": 2025035.7761073583, "perf/iters_per_sec": 0.9656123047386924, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0356123208999635, "data/tokens_consumed": 154562199552, "data/tokens_consumed_B": 154.562199552, "train/loss_slope": -2.1005766147827597e-05} {"step": 73710, "timestamp": 1778274214.708864, "train/loss": 2.1128733158111572, "train/z_loss": 0.0013653567992150783, "train/perplexity": 8.271975167866207, "train/grad_norm": 0.1630859375, "optim/muon_lr": 0.0197470623254776, "optim/adamw_lr": 0.000592411869764328, "perf/tokens_per_sec": 2027015.2514250083, "perf/iters_per_sec": 0.9665561921238939, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346009969711303, "data/tokens_consumed": 154583171072, "data/tokens_consumed_B": 154.583171072, "train/loss_slope": -2.3587839888839686e-05} {"step": 73720, "timestamp": 1778274225.0665252, "train/loss": 2.0835678219795226, "train/z_loss": 0.001365380163770169, "train/perplexity": 8.033078440923754, "train/grad_norm": 0.154296875, "optim/muon_lr": 0.019738349318504333, "optim/adamw_lr": 0.0005921504795551299, "perf/tokens_per_sec": 2025556.472856322, "perf/iters_per_sec": 0.9658605922967539, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0353461027145385, "data/tokens_consumed": 154604142592, "data/tokens_consumed_B": 154.604142592, "train/loss_slope": -2.79614101542105e-05} {"step": 73725, "timestamp": 1778274230.8394465, "eos/sharpness": 81.18588924407958, "eos/L0_probe": 1.9684642553329468, "eos/L_plus": 2.329035758972168, "eos/L_minus": 2.4197516441345215, "eos/grad_norm": 0.21343427896499634, "eos/embed_grad_frac": 0.047764938324689865, "eos/time_s": 0.6082174777984619} {"step": 73725, "timestamp": 1778274232.21896, "geo/rankme_last": 438.96649169921875, "geo/layer_0/stable_rank_q_proj": 19.1512451171875, "geo/layer_0/stable_rank_k_proj": 15.803462982177734, "geo/layer_0/stable_rank_o_proj": 46.55344009399414, "geo/layer_0/stable_rank_gate_proj": 129.35968017578125, "geo/layer_0/stable_rank_down_proj": 56.46756362915039, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06033472344279289, "geo/layer_0/attn_entropy_mean": 6.152707099914551, "geo/layer_0/attn_entropy_std": 0.4332723021507263, "geo/layer_7/stable_rank_q_proj": 42.55752944946289, "geo/layer_7/stable_rank_k_proj": 40.0850944519043, "geo/layer_7/stable_rank_o_proj": 88.60603332519531, "geo/layer_7/stable_rank_gate_proj": 77.75353240966797, "geo/layer_7/stable_rank_down_proj": 140.1469268798828, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4240947663784027, "geo/layer_7/attn_entropy_mean": 4.654521465301514, "geo/layer_7/attn_entropy_std": 0.7895777225494385, "geo/layer_14/stable_rank_q_proj": 50.164390563964844, "geo/layer_14/stable_rank_k_proj": 41.45996856689453, "geo/layer_14/stable_rank_o_proj": 43.49115753173828, "geo/layer_14/stable_rank_gate_proj": 70.9235610961914, "geo/layer_14/stable_rank_down_proj": 126.55085754394531, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39975786209106445, "geo/layer_14/attn_entropy_mean": 5.507963180541992, "geo/layer_14/attn_entropy_std": 0.4173983633518219, "geo/layer_21/stable_rank_q_proj": 39.7017707824707, "geo/layer_21/stable_rank_k_proj": 29.941299438476562, "geo/layer_21/stable_rank_o_proj": 68.22362518310547, "geo/layer_21/stable_rank_gate_proj": 64.20276641845703, "geo/layer_21/stable_rank_down_proj": 49.59235382080078, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14324305951595306, "geo/layer_21/attn_entropy_mean": 5.686339378356934, "geo/layer_21/attn_entropy_std": 0.30286428332328796, "geo/layer_27/stable_rank_q_proj": 43.762123107910156, "geo/layer_27/stable_rank_k_proj": 32.330162048339844, "geo/layer_27/stable_rank_o_proj": 115.3392562866211, "geo/layer_27/stable_rank_gate_proj": 77.7375259399414, "geo/layer_27/stable_rank_down_proj": 127.79364776611328, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09952676296234131, "geo/layer_27/attn_entropy_mean": 4.18892240524292, "geo/layer_27/attn_entropy_std": 0.7934660911560059, "attnres/final_alpha/block_0": 0.23522767424583435, "attnres/block_norm/0": 1.775333046913147, "attnres/final_alpha/block_1": 0.0040474277921020985, "attnres/block_norm/1": 48362.6328125, "attnres/final_alpha/block_2": 0.009921843186020851, "attnres/block_norm/2": 29070.708984375, "attnres/final_alpha/block_3": 0.01170116662979126, "attnres/block_norm/3": 62272.75390625, "attnres/final_alpha/block_4": 0.013879593461751938, "attnres/block_norm/4": 15807.3681640625, "attnres/final_alpha/block_5": 0.6179720163345337, "attnres/block_norm/5": 6847.72607421875, "attnres/final_alpha/block_6": 0.10725022852420807, "attnres/block_norm/6": 41382.640625, "geo/tier1_time_s": 1.3606042861938477, "geo/step": 73725.0, "geo/rankme_slope": -0.00016107255402160865} {"step": 73730, "timestamp": 1778274237.395989, "train/loss": 2.1093037605285643, "train/z_loss": 0.0013677147100679577, "train/perplexity": 8.242500532175507, "train/grad_norm": 0.1591796875, "optim/muon_lr": 0.019729490876197815, "optim/adamw_lr": 0.0005918847262859344, "perf/tokens_per_sec": 1701823.5830104237, "perf/iters_per_sec": 0.8114927210857504, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2322969436645508, "data/tokens_consumed": 154625114112, "data/tokens_consumed_B": 154.625114112, "train/loss_slope": -2.471863811451722e-05} {"step": 73740, "timestamp": 1778274247.7462184, "train/loss": 2.1006011962890625, "train/z_loss": 0.0013790066237561404, "train/perplexity": 8.171080859688615, "train/grad_norm": 0.171875, "optim/muon_lr": 0.019720486402511596, "optim/adamw_lr": 0.0005916145920753478, "perf/tokens_per_sec": 2027386.3958713114, "perf/iters_per_sec": 0.9667331675869519, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344115972518921, "data/tokens_consumed": 154646085632, "data/tokens_consumed_B": 154.646085632, "train/loss_slope": -2.7516003317422392e-05} {"step": 73750, "timestamp": 1778274258.0971243, "grad/layer_0/attn": 0.002939429134130478, "grad/layer_0/mlp": 0.0030690960120409727, "grad/layer_0/attn_mlp_ratio": 0.9577507601010692, "grad/layer_4/attn": 0.002787821227684617, "grad/layer_4/mlp": 0.0026544537395238876, "grad/layer_4/attn_mlp_ratio": 1.050242873383193, "grad/layer_8/attn": 0.002983447164297104, "grad/layer_8/mlp": 0.0037147158291190863, "grad/layer_8/attn_mlp_ratio": 0.8031427493311994, "grad/layer_12/attn": 0.005151647608727217, "grad/layer_12/mlp": 0.006879209540784359, "grad/layer_12/attn_mlp_ratio": 0.7488720184052761, "grad/layer_16/attn": 0.003550373949110508, "grad/layer_16/mlp": 0.004994530696421862, "grad/layer_16/attn_mlp_ratio": 0.7108523490643077, "grad/layer_20/attn": 0.0035914334002882242, "grad/layer_20/mlp": 0.007474520709365606, "grad/layer_20/attn_mlp_ratio": 0.4804901199536622, "grad/layer_24/attn": 0.021106455475091934, "grad/layer_24/mlp": 0.015383763238787651, "grad/layer_24/attn_mlp_ratio": 1.371995591083714, "grad/layer_27/attn": 0.012161226011812687, "grad/layer_27/mlp": 0.01689060591161251, "grad/layer_27/attn_mlp_ratio": 0.7199993892138438} {"step": 73750, "timestamp": 1778274258.1128588, "train/loss": 2.0719109535217286, "train/z_loss": 0.001383599848486483, "train/perplexity": 7.939981565427113, "train/grad_norm": 0.328125, "optim/muon_lr": 0.019711337089538573, "optim/adamw_lr": 0.0005913401126861572, "perf/tokens_per_sec": 2024712.0028883377, "perf/iters_per_sec": 0.9654579176370324, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0357779264450073, "data/tokens_consumed": 154667057152, "data/tokens_consumed_B": 154.667057152, "train/loss_slope": -3.0289455573193306e-05} {"step": 73760, "timestamp": 1778274268.4865305, "train/loss": 2.1560511112213137, "train/z_loss": 0.001369479834102094, "train/perplexity": 8.63696381901916, "train/grad_norm": 0.162109375, "optim/muon_lr": 0.0197020423412323, "optim/adamw_lr": 0.0005910612702369689, "perf/tokens_per_sec": 2022635.2060097477, "perf/iters_per_sec": 0.9644676237152804, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0368414402008057, "data/tokens_consumed": 154688028672, "data/tokens_consumed_B": 154.688028672, "train/loss_slope": -2.5658368427689816e-05} {"step": 73770, "timestamp": 1778274278.8402188, "train/loss": 2.11517938375473, "train/z_loss": 0.0013758760876953601, "train/perplexity": 8.291072916520433, "train/grad_norm": 0.1572265625, "optim/muon_lr": 0.01969260275363922, "optim/adamw_lr": 0.0005907780826091766, "perf/tokens_per_sec": 2026474.522745185, "perf/iters_per_sec": 0.966298352596848, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034877061843872, "data/tokens_consumed": 154709000192, "data/tokens_consumed_B": 154.709000192, "train/loss_slope": -2.5783188594127996e-05} {"step": 73780, "timestamp": 1778274289.1974876, "train/loss": 2.1393683791160583, "train/z_loss": 0.0013576911529526114, "train/perplexity": 8.494070901624886, "train/grad_norm": 0.173828125, "optim/muon_lr": 0.01968301832675934, "optim/adamw_lr": 0.0005904905498027801, "perf/tokens_per_sec": 2026319.8617601895, "perf/iters_per_sec": 0.9662246044922779, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349560499191284, "data/tokens_consumed": 154729971712, "data/tokens_consumed_B": 154.729971712, "train/loss_slope": -2.6147559545364204e-05} {"step": 73790, "timestamp": 1778274299.548992, "train/loss": 2.1121021032333376, "train/z_loss": 0.0013781991205178202, "train/perplexity": 8.265598175897614, "train/grad_norm": 0.154296875, "optim/muon_lr": 0.019673289060592652, "optim/adamw_lr": 0.0005901986718177795, "perf/tokens_per_sec": 2026853.455553424, "perf/iters_per_sec": 0.9664790418402787, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346835851669312, "data/tokens_consumed": 154750943232, "data/tokens_consumed_B": 154.750943232, "train/loss_slope": -2.8737096209945256e-05} {"step": 73800, "timestamp": 1778274309.8924654, "grad/layer_0/attn": 0.003403388662263751, "grad/layer_0/mlp": 0.003447801573202014, "grad/layer_0/attn_mlp_ratio": 0.987118455425246, "grad/layer_4/attn": 0.0024809560272842646, "grad/layer_4/mlp": 0.0028252466581761837, "grad/layer_4/attn_mlp_ratio": 0.8781378193265591, "grad/layer_8/attn": 0.0034017907455563545, "grad/layer_8/mlp": 0.003579272422939539, "grad/layer_8/attn_mlp_ratio": 0.9504140083646314, "grad/layer_12/attn": 0.007073611486703157, "grad/layer_12/mlp": 0.0076987710781395435, "grad/layer_12/attn_mlp_ratio": 0.9187974707948835, "grad/layer_16/attn": 0.003645682940259576, "grad/layer_16/mlp": 0.004945314954966307, "grad/layer_16/attn_mlp_ratio": 0.7371993290090625, "grad/layer_20/attn": 0.0035783504135906696, "grad/layer_20/mlp": 0.006500730291008949, "grad/layer_20/attn_mlp_ratio": 0.5504535949590874, "grad/layer_24/attn": 0.012163732200860977, "grad/layer_24/mlp": 0.010613146238029003, "grad/layer_24/attn_mlp_ratio": 1.146100488342078, "grad/layer_27/attn": 0.013979179784655571, "grad/layer_27/mlp": 0.010353603400290012, "grad/layer_27/attn_mlp_ratio": 1.3501753070091977} {"step": 73800, "timestamp": 1778274310.5161572, "eos/sharpness": 83.69109630584715, "eos/L0_probe": 1.9667253494262695, "eos/L_plus": 2.4863736629486084, "eos/L_minus": 2.2839879989624023, "eos/grad_norm": 0.20990131795406342, "eos/embed_grad_frac": 0.054420243948698044, "eos/time_s": 0.6208028793334961} {"step": 73800, "timestamp": 1778274310.5373282, "train/loss": 2.1635475635528563, "train/z_loss": 0.0013582695741206408, "train/perplexity": 8.701953698994688, "train/grad_norm": 0.2099609375, "optim/muon_lr": 0.019663415551185608, "optim/adamw_lr": 0.0005899024665355682, "perf/tokens_per_sec": 1909728.8413917494, "perf/iters_per_sec": 0.9106296736677882, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0981412410736084, "data/tokens_consumed": 154771914752, "data/tokens_consumed_B": 154.771914752, "train/loss_slope": -2.4202946353309207e-05} {"step": 73800, "timestamp": 1778274311.9018197, "geo/rankme_last": 439.1088562011719, "geo/layer_0/stable_rank_q_proj": 19.13011932373047, "geo/layer_0/stable_rank_k_proj": 15.821466445922852, "geo/layer_0/stable_rank_o_proj": 46.59189987182617, "geo/layer_0/stable_rank_gate_proj": 129.53005981445312, "geo/layer_0/stable_rank_down_proj": 56.48042297363281, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06243119388818741, "geo/layer_0/attn_entropy_mean": 6.1515021324157715, "geo/layer_0/attn_entropy_std": 0.4349750578403473, "geo/layer_7/stable_rank_q_proj": 42.57429122924805, "geo/layer_7/stable_rank_k_proj": 40.03857421875, "geo/layer_7/stable_rank_o_proj": 88.5211410522461, "geo/layer_7/stable_rank_gate_proj": 77.89212036132812, "geo/layer_7/stable_rank_down_proj": 140.07643127441406, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.44511449337005615, "geo/layer_7/attn_entropy_mean": 4.641680717468262, "geo/layer_7/attn_entropy_std": 0.7814379930496216, "geo/layer_14/stable_rank_q_proj": 50.17829513549805, "geo/layer_14/stable_rank_k_proj": 41.4637336730957, "geo/layer_14/stable_rank_o_proj": 43.49361038208008, "geo/layer_14/stable_rank_gate_proj": 70.76512908935547, "geo/layer_14/stable_rank_down_proj": 126.74948120117188, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4017602503299713, "geo/layer_14/attn_entropy_mean": 5.5643815994262695, "geo/layer_14/attn_entropy_std": 0.4160168468952179, "geo/layer_21/stable_rank_q_proj": 39.760868072509766, "geo/layer_21/stable_rank_k_proj": 29.829082489013672, "geo/layer_21/stable_rank_o_proj": 68.2946548461914, "geo/layer_21/stable_rank_gate_proj": 64.20287322998047, "geo/layer_21/stable_rank_down_proj": 49.5451774597168, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14656631648540497, "geo/layer_21/attn_entropy_mean": 5.690919876098633, "geo/layer_21/attn_entropy_std": 0.30423274636268616, "geo/layer_27/stable_rank_q_proj": 43.80555725097656, "geo/layer_27/stable_rank_k_proj": 32.36644744873047, "geo/layer_27/stable_rank_o_proj": 115.35196685791016, "geo/layer_27/stable_rank_gate_proj": 77.70601654052734, "geo/layer_27/stable_rank_down_proj": 127.83489990234375, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10763102769851685, "geo/layer_27/attn_entropy_mean": 4.166855812072754, "geo/layer_27/attn_entropy_std": 0.7633364796638489, "attnres/final_alpha/block_0": 0.23307812213897705, "attnres/block_norm/0": 1.7754930257797241, "attnres/final_alpha/block_1": 0.003944020252674818, "attnres/block_norm/1": 48530.171875, "attnres/final_alpha/block_2": 0.009826523251831532, "attnres/block_norm/2": 29114.5234375, "attnres/final_alpha/block_3": 0.011440953239798546, "attnres/block_norm/3": 62252.6875, "attnres/final_alpha/block_4": 0.013629920780658722, "attnres/block_norm/4": 15768.5009765625, "attnres/final_alpha/block_5": 0.6228782534599304, "attnres/block_norm/5": 6804.7998046875, "attnres/final_alpha/block_6": 0.10520224273204803, "attnres/block_norm/6": 41424.90234375, "geo/tier1_time_s": 1.3603084087371826, "geo/step": 73800.0, "geo/rankme_slope": -0.00013720335790566227} {"step": 73810, "timestamp": 1778274322.259231, "train/loss": 2.182029676437378, "train/z_loss": 0.0013751686550676824, "train/perplexity": 8.86427963198791, "train/grad_norm": 0.1630859375, "optim/muon_lr": 0.01965339720249176, "optim/adamw_lr": 0.0005896019160747528, "perf/tokens_per_sec": 1789678.903587074, "perf/iters_per_sec": 0.8533854024825449, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1718034982681274, "data/tokens_consumed": 154792886272, "data/tokens_consumed_B": 154.792886272, "train/loss_slope": -2.2460563154933055e-05} {"step": 73820, "timestamp": 1778274332.6128767, "train/loss": 2.170493996143341, "train/z_loss": 0.0013602706836536526, "train/perplexity": 8.762611668106855, "train/grad_norm": 0.248046875, "optim/muon_lr": 0.019643234610557555, "optim/adamw_lr": 0.0005892970383167267, "perf/tokens_per_sec": 2026532.6025716234, "perf/iters_per_sec": 0.9663260472162358, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348474025726317, "data/tokens_consumed": 154813857792, "data/tokens_consumed_B": 154.813857792, "train/loss_slope": -1.897390622689207e-05} {"step": 73830, "timestamp": 1778274342.9681761, "train/loss": 2.1477426052093507, "train/z_loss": 0.00137791350716725, "train/perplexity": 8.565500839598698, "train/grad_norm": 0.1630859375, "optim/muon_lr": 0.019632928371429444, "optim/adamw_lr": 0.0005889878511428833, "perf/tokens_per_sec": 2026240.2296639308, "perf/iters_per_sec": 0.9661866329497961, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349967241287232, "data/tokens_consumed": 154834829312, "data/tokens_consumed_B": 154.834829312, "train/loss_slope": -1.9442125293824403e-05} {"step": 73840, "timestamp": 1778274353.3244605, "train/loss": 2.1687920451164246, "train/z_loss": 0.0013760764268226922, "train/perplexity": 8.747710816036953, "train/grad_norm": 0.2451171875, "optim/muon_lr": 0.019622477889060973, "optim/adamw_lr": 0.0005886743366718292, "perf/tokens_per_sec": 2025970.1992996011, "perf/iters_per_sec": 0.9660578724382406, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351346731185913, "data/tokens_consumed": 154855800832, "data/tokens_consumed_B": 154.855800832, "train/loss_slope": -1.831376045414754e-05} {"step": 73850, "timestamp": 1778274363.6636567, "grad/layer_0/attn": 0.003868818050250411, "grad/layer_0/mlp": 0.004018587525933981, "grad/layer_0/attn_mlp_ratio": 0.9627307926006066, "grad/layer_4/attn": 0.004508310463279486, "grad/layer_4/mlp": 0.00283472309820354, "grad/layer_4/attn_mlp_ratio": 1.590388248890246, "grad/layer_8/attn": 0.0034158937633037567, "grad/layer_8/mlp": 0.0037334260996431112, "grad/layer_8/attn_mlp_ratio": 0.9149487844785285, "grad/layer_12/attn": 0.0067353988997638226, "grad/layer_12/mlp": 0.007015137001872063, "grad/layer_12/attn_mlp_ratio": 0.9601236300807879, "grad/layer_16/attn": 0.004205442499369383, "grad/layer_16/mlp": 0.005778091493993998, "grad/layer_16/attn_mlp_ratio": 0.7278255166014856, "grad/layer_20/attn": 0.00457336800172925, "grad/layer_20/mlp": 0.00704199867323041, "grad/layer_20/attn_mlp_ratio": 0.6494417492821128, "grad/layer_24/attn": 0.012834714725613594, "grad/layer_24/mlp": 0.01228167675435543, "grad/layer_24/attn_mlp_ratio": 1.0450295084145649, "grad/layer_27/attn": 0.008216910995543003, "grad/layer_27/mlp": 0.013213151134550571, "grad/layer_27/attn_mlp_ratio": 0.6218736809775485} {"step": 73850, "timestamp": 1778274363.679442, "train/loss": 2.1386332273483277, "train/z_loss": 0.001384486840106547, "train/perplexity": 8.487828765126297, "train/grad_norm": 0.2451171875, "optim/muon_lr": 0.019611883163452148, "optim/adamw_lr": 0.0005883564949035644, "perf/tokens_per_sec": 2026246.344216176, "perf/iters_per_sec": 0.9661895485955124, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349936008453369, "data/tokens_consumed": 154876772352, "data/tokens_consumed_B": 154.876772352, "train/loss_slope": -2.110313000065267e-05} {"step": 73860, "timestamp": 1778274374.0348275, "train/loss": 2.1366066455841066, "train/z_loss": 0.0013820865890011192, "train/perplexity": 8.470644904263818, "train/grad_norm": 0.1787109375, "optim/muon_lr": 0.019601145386695863, "optim/adamw_lr": 0.0005880343616008758, "perf/tokens_per_sec": 2026444.4570045606, "perf/iters_per_sec": 0.9662840161345294, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348924160003663, "data/tokens_consumed": 154897743872, "data/tokens_consumed_B": 154.897743872, "train/loss_slope": -2.2294771796477453e-05} {"step": 73870, "timestamp": 1778274384.3908784, "train/loss": 2.1389854669570925, "train/z_loss": 0.0013728763326071203, "train/perplexity": 8.490819041225713, "train/grad_norm": 0.1689453125, "optim/muon_lr": 0.01959026336669922, "optim/adamw_lr": 0.0005877079010009765, "perf/tokens_per_sec": 2026549.4108595634, "perf/iters_per_sec": 0.9663340620324914, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034838819503784, "data/tokens_consumed": 154918715392, "data/tokens_consumed_B": 154.918715392, "train/loss_slope": -1.9614714915209473e-05} {"step": 73875, "timestamp": 1778274390.1769543, "eos/sharpness": 42.70296096801757, "eos/L0_probe": 1.9688507318496704, "eos/L_plus": 2.1595728397369385, "eos/L_minus": 2.205158233642578, "eos/grad_norm": 0.13043120503425598, "eos/embed_grad_frac": 0.18835531175136566, "eos/time_s": 0.6192295551300049} {"step": 73875, "timestamp": 1778274391.5591686, "geo/rankme_last": 437.54669189453125, "geo/layer_0/stable_rank_q_proj": 19.123952865600586, "geo/layer_0/stable_rank_k_proj": 15.825899124145508, "geo/layer_0/stable_rank_o_proj": 46.644676208496094, "geo/layer_0/stable_rank_gate_proj": 129.5177764892578, "geo/layer_0/stable_rank_down_proj": 56.39787292480469, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06504076719284058, "geo/layer_0/attn_entropy_mean": 6.151731491088867, "geo/layer_0/attn_entropy_std": 0.43122410774230957, "geo/layer_7/stable_rank_q_proj": 42.61027145385742, "geo/layer_7/stable_rank_k_proj": 40.100467681884766, "geo/layer_7/stable_rank_o_proj": 88.39176940917969, "geo/layer_7/stable_rank_gate_proj": 77.86730194091797, "geo/layer_7/stable_rank_down_proj": 140.00755310058594, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4420943558216095, "geo/layer_7/attn_entropy_mean": 4.675130367279053, "geo/layer_7/attn_entropy_std": 0.7993444800376892, "geo/layer_14/stable_rank_q_proj": 50.093990325927734, "geo/layer_14/stable_rank_k_proj": 41.51921844482422, "geo/layer_14/stable_rank_o_proj": 43.48857498168945, "geo/layer_14/stable_rank_gate_proj": 70.760009765625, "geo/layer_14/stable_rank_down_proj": 126.57459259033203, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.40680575370788574, "geo/layer_14/attn_entropy_mean": 5.528450012207031, "geo/layer_14/attn_entropy_std": 0.42633530497550964, "geo/layer_21/stable_rank_q_proj": 39.74686050415039, "geo/layer_21/stable_rank_k_proj": 29.99660301208496, "geo/layer_21/stable_rank_o_proj": 68.22567749023438, "geo/layer_21/stable_rank_gate_proj": 64.08847045898438, "geo/layer_21/stable_rank_down_proj": 49.515846252441406, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1415175199508667, "geo/layer_21/attn_entropy_mean": 5.699897766113281, "geo/layer_21/attn_entropy_std": 0.3047756552696228, "geo/layer_27/stable_rank_q_proj": 43.82381820678711, "geo/layer_27/stable_rank_k_proj": 32.36028289794922, "geo/layer_27/stable_rank_o_proj": 115.31146240234375, "geo/layer_27/stable_rank_gate_proj": 77.73666381835938, "geo/layer_27/stable_rank_down_proj": 128.0228729248047, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09384185075759888, "geo/layer_27/attn_entropy_mean": 4.15213680267334, "geo/layer_27/attn_entropy_std": 0.7943225502967834, "attnres/final_alpha/block_0": 0.23761698603630066, "attnres/block_norm/0": 1.775805950164795, "attnres/final_alpha/block_1": 0.004098653793334961, "attnres/block_norm/1": 48453.66796875, "attnres/final_alpha/block_2": 0.009995250031352043, "attnres/block_norm/2": 29109.2578125, "attnres/final_alpha/block_3": 0.011659033596515656, "attnres/block_norm/3": 62297.49609375, "attnres/final_alpha/block_4": 0.014016522094607353, "attnres/block_norm/4": 15797.44140625, "attnres/final_alpha/block_5": 0.6153026223182678, "attnres/block_norm/5": 6881.212890625, "attnres/final_alpha/block_6": 0.1073109358549118, "attnres/block_norm/6": 41404.078125, "geo/tier1_time_s": 1.3606386184692383, "geo/step": 73875.0, "geo/rankme_slope": -0.00014470960259103642} {"step": 73880, "timestamp": 1778274396.7435212, "train/loss": 2.08285893201828, "train/z_loss": 0.0013844529166817665, "train/perplexity": 8.027385890193532, "train/grad_norm": 0.130859375, "optim/muon_lr": 0.019579238295555114, "optim/adamw_lr": 0.0005873771488666534, "perf/tokens_per_sec": 1698572.1393218401, "perf/iters_per_sec": 0.8099423119172288, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2346558332443238, "data/tokens_consumed": 154939686912, "data/tokens_consumed_B": 154.939686912, "train/loss_slope": -2.271992459227078e-05} {"step": 73890, "timestamp": 1778274407.093857, "train/loss": 2.089416980743408, "train/z_loss": 0.001390681287739426, "train/perplexity": 8.080202876886993, "train/grad_norm": 0.1474609375, "optim/muon_lr": 0.019568070769309998, "optim/adamw_lr": 0.0005870421230792999, "perf/tokens_per_sec": 2027339.8084769566, "perf/iters_per_sec": 0.9667109529862197, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344353675842286, "data/tokens_consumed": 154960658432, "data/tokens_consumed_B": 154.960658432, "train/loss_slope": -2.1908865789018025e-05} {"step": 73900, "timestamp": 1778274417.432486, "grad/layer_0/attn": 0.0030906982719898224, "grad/layer_0/mlp": 0.0029922854155302048, "grad/layer_0/attn_mlp_ratio": 1.0328888255979742, "grad/layer_4/attn": 0.0022643005941063166, "grad/layer_4/mlp": 0.0026373642031103373, "grad/layer_4/attn_mlp_ratio": 0.8585467663439391, "grad/layer_8/attn": 0.006992540787905455, "grad/layer_8/mlp": 0.003805386135354638, "grad/layer_8/attn_mlp_ratio": 1.8375377308458136, "grad/layer_12/attn": 0.0053434353321790695, "grad/layer_12/mlp": 0.007389075588434935, "grad/layer_12/attn_mlp_ratio": 0.7231534169479934, "grad/layer_16/attn": 0.0035639782436192036, "grad/layer_16/mlp": 0.0047559840604662895, "grad/layer_16/attn_mlp_ratio": 0.7493671390339075, "grad/layer_20/attn": 0.0031282110139727592, "grad/layer_20/mlp": 0.005277504213154316, "grad/layer_20/attn_mlp_ratio": 0.5927443784698788, "grad/layer_24/attn": 0.007454290054738522, "grad/layer_24/mlp": 0.008707169443368912, "grad/layer_24/attn_mlp_ratio": 0.8561094414907149, "grad/layer_27/attn": 0.006410195026546717, "grad/layer_27/mlp": 0.006754818372428417, "grad/layer_27/attn_mlp_ratio": 0.9489810944160273} {"step": 73900, "timestamp": 1778274417.448038, "train/loss": 2.136930823326111, "train/z_loss": 0.0013867435627616942, "train/perplexity": 8.473391343945456, "train/grad_norm": 0.09716796875, "optim/muon_lr": 0.01955675959587097, "optim/adamw_lr": 0.0005867027878761291, "perf/tokens_per_sec": 2026557.9084890222, "perf/iters_per_sec": 0.966338114017974, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348344802856446, "data/tokens_consumed": 154981629952, "data/tokens_consumed_B": 154.981629952, "train/loss_slope": -2.3119638569176906e-05} {"step": 73910, "timestamp": 1778274427.8006876, "train/loss": 2.1003886222839356, "train/z_loss": 0.0013742609415203332, "train/perplexity": 8.169344084907182, "train/grad_norm": 0.09228515625, "optim/muon_lr": 0.019545305371284485, "optim/adamw_lr": 0.0005863591611385345, "perf/tokens_per_sec": 2026795.1705290694, "perf/iters_per_sec": 0.9664512493748996, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034713339805603, "data/tokens_consumed": 155002601472, "data/tokens_consumed_B": 155.002601472, "train/loss_slope": -2.485529643223874e-05} {"step": 73920, "timestamp": 1778274438.1480074, "train/loss": 2.1792454361915587, "train/z_loss": 0.0013628080836497248, "train/perplexity": 8.839633673941751, "train/grad_norm": 0.10546875, "optim/muon_lr": 0.019533708691596985, "optim/adamw_lr": 0.0005860112607479095, "perf/tokens_per_sec": 2028042.7252640312, "perf/iters_per_sec": 0.9670461298294216, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340768337249755, "data/tokens_consumed": 155023572992, "data/tokens_consumed_B": 155.023572992, "train/loss_slope": -2.428240346150322e-05} {"step": 73930, "timestamp": 1778274448.4934583, "train/loss": 2.1316189646720884, "train/z_loss": 0.0013698318623937666, "train/perplexity": 8.428501217360902, "train/grad_norm": 0.091796875, "optim/muon_lr": 0.01952196955680847, "optim/adamw_lr": 0.0005856590867042541, "perf/tokens_per_sec": 2027968.3813211608, "perf/iters_per_sec": 0.9670106798749737, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341147422790526, "data/tokens_consumed": 155044544512, "data/tokens_consumed_B": 155.044544512, "train/loss_slope": -2.191609004364334e-05} {"step": 73940, "timestamp": 1778274458.850816, "train/loss": 2.150877523422241, "train/z_loss": 0.0013710425468161703, "train/perplexity": 8.592395117838857, "train/grad_norm": 0.1650390625, "optim/muon_lr": 0.019510087966918947, "optim/adamw_lr": 0.0005853026390075683, "perf/tokens_per_sec": 2025731.824087272, "perf/iters_per_sec": 0.9659442062794075, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352564811706544, "data/tokens_consumed": 155065516032, "data/tokens_consumed_B": 155.065516032, "train/loss_slope": -1.855657444749908e-05} {"step": 73950, "timestamp": 1778274469.1888108, "grad/layer_0/attn": 0.0027433743234723806, "grad/layer_0/mlp": 0.0030473426450043917, "grad/layer_0/attn_mlp_ratio": 0.9002513182902329, "grad/layer_4/attn": 0.002502531511709094, "grad/layer_4/mlp": 0.002495142398402095, "grad/layer_4/attn_mlp_ratio": 1.002961359245707, "grad/layer_8/attn": 0.004719248972833157, "grad/layer_8/mlp": 0.0037076042499393225, "grad/layer_8/attn_mlp_ratio": 1.2728566824856569, "grad/layer_12/attn": 0.004347587935626507, "grad/layer_12/mlp": 0.006658940576016903, "grad/layer_12/attn_mlp_ratio": 0.6528948292458809, "grad/layer_16/attn": 0.006823503877967596, "grad/layer_16/mlp": 0.004505983553826809, "grad/layer_16/attn_mlp_ratio": 1.5143206017120325, "grad/layer_20/attn": 0.00331076025031507, "grad/layer_20/mlp": 0.005640874616801739, "grad/layer_20/attn_mlp_ratio": 0.5869232019023114, "grad/layer_24/attn": 0.009340368211269379, "grad/layer_24/mlp": 0.00965211819857359, "grad/layer_24/attn_mlp_ratio": 0.9677013814314438, "grad/layer_27/attn": 0.0065291947685182095, "grad/layer_27/mlp": 0.008802720345556736, "grad/layer_27/attn_mlp_ratio": 0.7417246530660744} {"step": 73950, "timestamp": 1778274469.8010497, "eos/sharpness": 77.29663848876952, "eos/L0_probe": 1.9659113883972168, "eos/L_plus": 2.30126953125, "eos/L_minus": 2.403519630432129, "eos/grad_norm": 0.17135100066661835, "eos/embed_grad_frac": 0.06970234215259552, "eos/time_s": 0.6094906330108643} {"step": 73950, "timestamp": 1778274469.8200345, "train/loss": 2.1274694204330444, "train/z_loss": 0.0013688080012798308, "train/perplexity": 8.393599242415748, "train/grad_norm": 0.1708984375, "optim/muon_lr": 0.019498064517974853, "optim/adamw_lr": 0.0005849419355392455, "perf/tokens_per_sec": 1913072.2770262915, "perf/iters_per_sec": 0.9122239480143983, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0962220430374146, "data/tokens_consumed": 155086487552, "data/tokens_consumed_B": 155.086487552, "train/loss_slope": -1.8307599311757654e-05} {"step": 73950, "timestamp": 1778274471.1830215, "geo/rankme_last": 439.2391052246094, "geo/layer_0/stable_rank_q_proj": 19.131053924560547, "geo/layer_0/stable_rank_k_proj": 15.830202102661133, "geo/layer_0/stable_rank_o_proj": 46.73237609863281, "geo/layer_0/stable_rank_gate_proj": 129.60302734375, "geo/layer_0/stable_rank_down_proj": 56.340396881103516, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06499498337507248, "geo/layer_0/attn_entropy_mean": 6.15099573135376, "geo/layer_0/attn_entropy_std": 0.43149784207344055, "geo/layer_7/stable_rank_q_proj": 42.63710403442383, "geo/layer_7/stable_rank_k_proj": 40.156517028808594, "geo/layer_7/stable_rank_o_proj": 88.30025482177734, "geo/layer_7/stable_rank_gate_proj": 77.7981185913086, "geo/layer_7/stable_rank_down_proj": 139.9121856689453, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4305705428123474, "geo/layer_7/attn_entropy_mean": 4.644023895263672, "geo/layer_7/attn_entropy_std": 0.7989864349365234, "geo/layer_14/stable_rank_q_proj": 49.96608352661133, "geo/layer_14/stable_rank_k_proj": 41.59049987792969, "geo/layer_14/stable_rank_o_proj": 43.490596771240234, "geo/layer_14/stable_rank_gate_proj": 70.76301574707031, "geo/layer_14/stable_rank_down_proj": 126.54442596435547, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.40292128920555115, "geo/layer_14/attn_entropy_mean": 5.535024642944336, "geo/layer_14/attn_entropy_std": 0.41611337661743164, "geo/layer_21/stable_rank_q_proj": 39.80726623535156, "geo/layer_21/stable_rank_k_proj": 30.058624267578125, "geo/layer_21/stable_rank_o_proj": 68.33341979980469, "geo/layer_21/stable_rank_gate_proj": 63.92775344848633, "geo/layer_21/stable_rank_down_proj": 49.535518646240234, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1438877284526825, "geo/layer_21/attn_entropy_mean": 5.684466361999512, "geo/layer_21/attn_entropy_std": 0.29631999135017395, "geo/layer_27/stable_rank_q_proj": 43.7955436706543, "geo/layer_27/stable_rank_k_proj": 32.375152587890625, "geo/layer_27/stable_rank_o_proj": 115.31165313720703, "geo/layer_27/stable_rank_gate_proj": 77.80304718017578, "geo/layer_27/stable_rank_down_proj": 127.79806518554688, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09886626154184341, "geo/layer_27/attn_entropy_mean": 4.16957950592041, "geo/layer_27/attn_entropy_std": 0.7689214944839478, "attnres/final_alpha/block_0": 0.2368062436580658, "attnres/block_norm/0": 1.775822401046753, "attnres/final_alpha/block_1": 0.004042287822812796, "attnres/block_norm/1": 48493.1015625, "attnres/final_alpha/block_2": 0.010151713155210018, "attnres/block_norm/2": 29030.595703125, "attnres/final_alpha/block_3": 0.011871099472045898, "attnres/block_norm/3": 61676.12109375, "attnres/final_alpha/block_4": 0.01398413721472025, "attnres/block_norm/4": 15828.90234375, "attnres/final_alpha/block_5": 0.616014838218689, "attnres/block_norm/5": 6895.04248046875, "attnres/final_alpha/block_6": 0.10712968558073044, "attnres/block_norm/6": 41218.0546875, "geo/tier1_time_s": 1.3590648174285889, "geo/step": 73950.0, "geo/rankme_slope": -0.00010409284807673069} {"step": 73960, "timestamp": 1778274481.5516775, "train/loss": 2.0864999055862428, "train/z_loss": 0.0013737859553657472, "train/perplexity": 8.056666662952228, "train/grad_norm": 0.10009765625, "optim/muon_lr": 0.01948589861392975, "optim/adamw_lr": 0.0005845769584178924, "perf/tokens_per_sec": 1788170.7337378915, "perf/iters_per_sec": 0.8526662510575731, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1727918148040772, "data/tokens_consumed": 155107459072, "data/tokens_consumed_B": 155.107459072, "train/loss_slope": -2.1296460564845384e-05} {"step": 73970, "timestamp": 1778274491.8982995, "train/loss": 2.1617568492889405, "train/z_loss": 0.0013901865342631937, "train/perplexity": 8.686384930150492, "train/grad_norm": 0.09912109375, "optim/muon_lr": 0.019473591446876527, "optim/adamw_lr": 0.0005842077434062957, "perf/tokens_per_sec": 2028331.409107928, "perf/iters_per_sec": 0.9671837850131645, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0339296579360961, "data/tokens_consumed": 155128430592, "data/tokens_consumed_B": 155.128430592, "train/loss_slope": -1.5502906446516982e-05} {"step": 73980, "timestamp": 1778274502.2500226, "train/loss": 2.1534311056137083, "train/z_loss": 0.0013730031088925899, "train/perplexity": 8.614364543422633, "train/grad_norm": 0.107421875, "optim/muon_lr": 0.019461142420768737, "optim/adamw_lr": 0.0005838342726230621, "perf/tokens_per_sec": 2027235.1465129757, "perf/iters_per_sec": 0.9666610462727431, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344887733459474, "data/tokens_consumed": 155149402112, "data/tokens_consumed_B": 155.149402112, "train/loss_slope": -1.514263660243205e-05} {"step": 73990, "timestamp": 1778274512.6045375, "train/loss": 2.1281882524490356, "train/z_loss": 0.0013714345870539546, "train/perplexity": 8.399634999368365, "train/grad_norm": 0.1171875, "optim/muon_lr": 0.019448552131652832, "optim/adamw_lr": 0.0005834565639495849, "perf/tokens_per_sec": 2026485.027290332, "perf/iters_per_sec": 0.9663033615543041, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348716974258423, "data/tokens_consumed": 155170373632, "data/tokens_consumed_B": 155.170373632, "train/loss_slope": -1.255145854789717e-05} {"step": 74000, "timestamp": 1778274522.9446616, "grad/layer_0/attn": 0.003081354545429349, "grad/layer_0/mlp": 0.003304895479232073, "grad/layer_0/attn_mlp_ratio": 0.9323606363821437, "grad/layer_4/attn": 0.0020320452749729156, "grad/layer_4/mlp": 0.0024278010241687298, "grad/layer_4/attn_mlp_ratio": 0.8369900049653698, "grad/layer_8/attn": 0.003998743370175362, "grad/layer_8/mlp": 0.003488165093585849, "grad/layer_8/attn_mlp_ratio": 1.1463744255944015, "grad/layer_12/attn": 0.00493102241307497, "grad/layer_12/mlp": 0.006878338288515806, "grad/layer_12/attn_mlp_ratio": 0.716891512826396, "grad/layer_16/attn": 0.0035161194391548634, "grad/layer_16/mlp": 0.0045229061506688595, "grad/layer_16/attn_mlp_ratio": 0.7774026796675015, "grad/layer_20/attn": 0.0032842233777046204, "grad/layer_20/mlp": 0.005274566356092691, "grad/layer_20/attn_mlp_ratio": 0.6226527630362851, "grad/layer_24/attn": 0.004098562523722649, "grad/layer_24/mlp": 0.007420896552503109, "grad/layer_24/attn_mlp_ratio": 0.5523001755239619, "grad/layer_27/attn": 0.0037261073011904955, "grad/layer_27/mlp": 0.00612641591578722, "grad/layer_27/attn_mlp_ratio": 0.6082034409006267} {"step": 74000, "timestamp": 1778274522.9602633, "train/loss": 2.1023243188858034, "train/z_loss": 0.0013877480872906745, "train/perplexity": 8.185172771316669, "train/grad_norm": 0.0927734375, "optim/muon_lr": 0.01943582057952881, "optim/adamw_lr": 0.0005830746173858642, "perf/tokens_per_sec": 2026941.2163325306, "perf/iters_per_sec": 0.9665208894407895, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346387863159179, "data/tokens_consumed": 155191345152, "data/tokens_consumed_B": 155.191345152, "train/loss_slope": -1.5895435883767807e-05} {"step": 74000, "timestamp": 1778274529.9870777, "geo/ww_alpha_mean": 7.876715401779405, "geo/ww_alpha_std": 4.88989196175118, "geo/ww_alpha_min": 1.349867211479801, "geo/ww_alpha_max": 28.741574531691175, "geo/ww_alpha_healthy_frac": 0.17766497461928935, "geo/ww_alpha_by_type/q_proj": 3.8902027176186036, "geo/ww_alpha_by_type/k_proj": 4.321230649955665, "geo/ww_alpha_by_type/v_proj": 9.085624487936142, "geo/ww_alpha_by_type/o_proj": 9.574151333675188, "geo/ww_alpha_by_type/gate_proj": 8.083166083718229, "geo/ww_alpha_by_type/up_proj": 12.269687595422974, "geo/ww_alpha_by_type/down_proj": 8.00992405614081, "geo/twonn_id/layer_0": 0.7119622230529785, "geo/twonn_id/layer_7": 3.5748443603515625, "geo/twonn_id/layer_14": 4.480838775634766, "geo/twonn_id/layer_21": 7.353641986846924, "geo/twonn_id/layer_27": 5.492020606994629, "geo/tier2_time_s": 7.016788482666016} {"step": 74000, "timestamp": 1778274530.611517, "eoc/jacobian_sigma/layer_0/attn": 1164.4251708984375, "eoc/jacobian_sigma/layer_0/mlp": 9080.3369140625, "eoc/jacobian_sigma/layer_0": 9080.3369140625, "eoc/jacobian_sigma/layer_7/attn": 1.1610051393508911, "eoc/jacobian_sigma/layer_7/mlp": 1.7289055585861206, "eoc/jacobian_sigma/layer_7": 1.7289055585861206, "eoc/jacobian_sigma/layer_14/attn": 1.4242703914642334, "eoc/jacobian_sigma/layer_14/mlp": 7.299202919006348, "eoc/jacobian_sigma/layer_14": 7.299202919006348, "eoc/jacobian_sigma/layer_21/attn": 1.0786659717559814, "eoc/jacobian_sigma/layer_21/mlp": 3.998420238494873, "eoc/jacobian_sigma/layer_21": 3.998420238494873, "eoc/jacobian_sigma/layer_27/attn": 3.0476605892181396, "eoc/jacobian_sigma/layer_27/mlp": 24.251127243041992, "eoc/jacobian_sigma/layer_27": 24.251127243041992, "eoc/layer0_sigma": 9080.3369140625, "eoc/sigma_max": 24.251127243041992, "eoc/sigma_min": 1.7289055585861206, "eoc/sigma_mean": 9.319413989782333, "eoc/time_s": 0.6184685230255127} {"step": 74010, "timestamp": 1778274540.9813015, "train/loss": 2.194630575180054, "train/z_loss": 0.0013668410596437753, "train/perplexity": 8.976684234241853, "train/grad_norm": 0.380859375, "optim/muon_lr": 0.019422947764396667, "optim/adamw_lr": 0.0005826884329319, "perf/tokens_per_sec": 1164296.3051148362, "perf/iters_per_sec": 0.555179741437357, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.8012184619903564, "data/tokens_consumed": 155212316672, "data/tokens_consumed_B": 155.212316672, "train/loss_slope": -1.1680725162321816e-05} {"step": 74020, "timestamp": 1778274551.334054, "train/loss": 2.125279176235199, "train/z_loss": 0.0013744598836638033, "train/perplexity": 8.375235328445251, "train/grad_norm": 0.14453125, "optim/muon_lr": 0.01940993428230286, "optim/adamw_lr": 0.0005822980284690857, "perf/tokens_per_sec": 2026690.8449697667, "perf/iters_per_sec": 0.9664015030716737, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347666025161744, "data/tokens_consumed": 155233288192, "data/tokens_consumed_B": 155.233288192, "train/loss_slope": -1.273103476357538e-05} {"step": 74025, "timestamp": 1778274557.1116285, "eos/sharpness": 47.555780410766594, "eos/L0_probe": 1.9648710489273071, "eos/L_plus": 2.1493608951568604, "eos/L_minus": 2.25593900680542, "eos/grad_norm": 0.11826388537883759, "eos/embed_grad_frac": 0.14683875441551208, "eos/time_s": 0.6129586696624756} {"step": 74025, "timestamp": 1778274558.4896302, "geo/rankme_last": 438.14068603515625, "geo/layer_0/stable_rank_q_proj": 19.12751579284668, "geo/layer_0/stable_rank_k_proj": 15.781784057617188, "geo/layer_0/stable_rank_o_proj": 46.809757232666016, "geo/layer_0/stable_rank_gate_proj": 129.6025390625, "geo/layer_0/stable_rank_down_proj": 56.43681335449219, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06892523914575577, "geo/layer_0/attn_entropy_mean": 6.146771430969238, "geo/layer_0/attn_entropy_std": 0.4360436201095581, "geo/layer_7/stable_rank_q_proj": 42.61322021484375, "geo/layer_7/stable_rank_k_proj": 40.09721374511719, "geo/layer_7/stable_rank_o_proj": 88.2933120727539, "geo/layer_7/stable_rank_gate_proj": 77.87737274169922, "geo/layer_7/stable_rank_down_proj": 139.86048889160156, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.44158756732940674, "geo/layer_7/attn_entropy_mean": 4.64125919342041, "geo/layer_7/attn_entropy_std": 0.7883687019348145, "geo/layer_14/stable_rank_q_proj": 50.02833938598633, "geo/layer_14/stable_rank_k_proj": 41.638099670410156, "geo/layer_14/stable_rank_o_proj": 43.42793273925781, "geo/layer_14/stable_rank_gate_proj": 70.81207275390625, "geo/layer_14/stable_rank_down_proj": 126.32447052001953, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39550888538360596, "geo/layer_14/attn_entropy_mean": 5.565250396728516, "geo/layer_14/attn_entropy_std": 0.41554102301597595, "geo/layer_21/stable_rank_q_proj": 39.77908706665039, "geo/layer_21/stable_rank_k_proj": 30.013973236083984, "geo/layer_21/stable_rank_o_proj": 68.30512237548828, "geo/layer_21/stable_rank_gate_proj": 63.99861526489258, "geo/layer_21/stable_rank_down_proj": 49.57716369628906, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14571011066436768, "geo/layer_21/attn_entropy_mean": 5.696128845214844, "geo/layer_21/attn_entropy_std": 0.3111974895000458, "geo/layer_27/stable_rank_q_proj": 43.697811126708984, "geo/layer_27/stable_rank_k_proj": 32.33107376098633, "geo/layer_27/stable_rank_o_proj": 115.34210968017578, "geo/layer_27/stable_rank_gate_proj": 77.72221374511719, "geo/layer_27/stable_rank_down_proj": 127.77841186523438, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10383918136358261, "geo/layer_27/attn_entropy_mean": 4.16767692565918, "geo/layer_27/attn_entropy_std": 0.7668864130973816, "attnres/final_alpha/block_0": 0.23731407523155212, "attnres/block_norm/0": 1.7758573293685913, "attnres/final_alpha/block_1": 0.004052942618727684, "attnres/block_norm/1": 48489.4921875, "attnres/final_alpha/block_2": 0.010008581914007664, "attnres/block_norm/2": 29069.943359375, "attnres/final_alpha/block_3": 0.011854065582156181, "attnres/block_norm/3": 61877.82421875, "attnres/final_alpha/block_4": 0.014039801433682442, "attnres/block_norm/4": 15858.62890625, "attnres/final_alpha/block_5": 0.6152069568634033, "attnres/block_norm/5": 6860.10595703125, "attnres/final_alpha/block_6": 0.10752356052398682, "attnres/block_norm/6": 41470.21875, "geo/tier1_time_s": 1.3586397171020508, "geo/step": 74025.0, "geo/rankme_slope": -0.00010057151766956783} {"step": 74030, "timestamp": 1778274563.6704087, "train/loss": 2.1051132678985596, "train/z_loss": 0.001377589290495962, "train/perplexity": 8.208032663555949, "train/grad_norm": 0.2890625, "optim/muon_lr": 0.019396780133247377, "optim/adamw_lr": 0.0005819034039974212, "perf/tokens_per_sec": 1700856.9304996717, "perf/iters_per_sec": 0.8110317852495535, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2329972982406616, "data/tokens_consumed": 155254259712, "data/tokens_consumed_B": 155.254259712, "train/loss_slope": -1.219738102016454e-05} {"step": 74040, "timestamp": 1778274574.0208695, "train/loss": 2.1665907382965086, "train/z_loss": 0.0013817258062772454, "train/perplexity": 8.728475599632908, "train/grad_norm": 0.10009765625, "optim/muon_lr": 0.019383485913276672, "optim/adamw_lr": 0.0005815045773983001, "perf/tokens_per_sec": 2027050.472540889, "perf/iters_per_sec": 0.9665729868606991, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345830202102662, "data/tokens_consumed": 155275231232, "data/tokens_consumed_B": 155.275231232, "train/loss_slope": -7.918909430825747e-06} {"step": 74050, "timestamp": 1778274584.3585496, "grad/layer_0/attn": 0.0027655642479658127, "grad/layer_0/mlp": 0.0031662669498473406, "grad/layer_0/attn_mlp_ratio": 0.8734463026734124, "grad/layer_4/attn": 0.003645087592303753, "grad/layer_4/mlp": 0.0026366577949374914, "grad/layer_4/attn_mlp_ratio": 1.382465127274378, "grad/layer_8/attn": 0.003674617502838373, "grad/layer_8/mlp": 0.003778387326747179, "grad/layer_8/attn_mlp_ratio": 0.9725359228187614, "grad/layer_12/attn": 0.0049634529277682304, "grad/layer_12/mlp": 0.007812222931534052, "grad/layer_12/attn_mlp_ratio": 0.6353444989644106, "grad/layer_16/attn": 0.005681419279426336, "grad/layer_16/mlp": 0.004648860078305006, "grad/layer_16/attn_mlp_ratio": 1.2221101649690411, "grad/layer_20/attn": 0.007593404967337847, "grad/layer_20/mlp": 0.00553106376901269, "grad/layer_20/attn_mlp_ratio": 1.3728651751572147, "grad/layer_24/attn": 0.007916267029941082, "grad/layer_24/mlp": 0.008891895413398743, "grad/layer_24/attn_mlp_ratio": 0.8902789082499298, "grad/layer_27/attn": 0.005180130247026682, "grad/layer_27/mlp": 0.00846104882657528, "grad/layer_27/attn_mlp_ratio": 0.612232631199712} {"step": 74050, "timestamp": 1778274584.3742056, "train/loss": 2.136082410812378, "train/z_loss": 0.0013786942814476787, "train/perplexity": 8.46620546142286, "train/grad_norm": 0.1484375, "optim/muon_lr": 0.0193700510263443, "optim/adamw_lr": 0.0005811015307903289, "perf/tokens_per_sec": 2027038.187052442, "perf/iters_per_sec": 0.966567128683301, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345892906188965, "data/tokens_consumed": 155296202752, "data/tokens_consumed_B": 155.296202752, "train/loss_slope": -6.3131973259162265e-06} {"step": 74060, "timestamp": 1778274594.7212002, "train/loss": 2.111677372455597, "train/z_loss": 0.0013816668186336755, "train/perplexity": 8.262088267391722, "train/grad_norm": 0.2041015625, "optim/muon_lr": 0.019356476068496706, "optim/adamw_lr": 0.0005806942820549011, "perf/tokens_per_sec": 2028263.685189319, "perf/iters_per_sec": 0.9671514917322727, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03396418094635, "data/tokens_consumed": 155317174272, "data/tokens_consumed_B": 155.317174272, "train/loss_slope": -7.465553119166151e-06} {"step": 74070, "timestamp": 1778274605.066945, "train/loss": 2.0830827713012696, "train/z_loss": 0.0013870498747564851, "train/perplexity": 8.029182935612647, "train/grad_norm": 0.1005859375, "optim/muon_lr": 0.019342761039733886, "optim/adamw_lr": 0.0005802828311920166, "perf/tokens_per_sec": 2027973.0101245183, "perf/iters_per_sec": 0.9670128870604126, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341123819351197, "data/tokens_consumed": 155338145792, "data/tokens_consumed_B": 155.338145792, "train/loss_slope": -9.524161837818461e-06} {"step": 74080, "timestamp": 1778274615.4151223, "train/loss": 2.1694232702255247, "train/z_loss": 0.0013820691965520382, "train/perplexity": 8.753234333859352, "train/grad_norm": 0.271484375, "optim/muon_lr": 0.019328907132148743, "optim/adamw_lr": 0.0005798672139644622, "perf/tokens_per_sec": 2027995.6868907374, "perf/iters_per_sec": 0.9670237001851737, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341008186340332, "data/tokens_consumed": 155359117312, "data/tokens_consumed_B": 155.359117312, "train/loss_slope": -5.087099016660568e-06} {"step": 74090, "timestamp": 1778274625.763544, "train/loss": 2.1146073222160338, "train/z_loss": 0.0013739842223003506, "train/perplexity": 8.28633126897677, "train/grad_norm": 0.083984375, "optim/muon_lr": 0.01931491315364838, "optim/adamw_lr": 0.0005794473946094512, "perf/tokens_per_sec": 2027909.7516408581, "perf/iters_per_sec": 0.9669827230648318, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034144639968872, "data/tokens_consumed": 155380088832, "data/tokens_consumed_B": 155.380088832, "train/loss_slope": -4.467791928709974e-06} {"step": 74100, "timestamp": 1778274636.0994992, "grad/layer_0/attn": 0.0025603766553103924, "grad/layer_0/mlp": 0.0028343673329800367, "grad/layer_0/attn_mlp_ratio": 0.903332653881938, "grad/layer_4/attn": 0.0025745427701622248, "grad/layer_4/mlp": 0.0026680275332182646, "grad/layer_4/attn_mlp_ratio": 0.964961058914043, "grad/layer_8/attn": 0.003458859631791711, "grad/layer_8/mlp": 0.0037981097120791674, "grad/layer_8/attn_mlp_ratio": 0.9106792070075131, "grad/layer_12/attn": 0.007539966143667698, "grad/layer_12/mlp": 0.007039532531052828, "grad/layer_12/attn_mlp_ratio": 1.0710890252013827, "grad/layer_16/attn": 0.003456569043919444, "grad/layer_16/mlp": 0.0047786785289645195, "grad/layer_16/attn_mlp_ratio": 0.7233315550806227, "grad/layer_20/attn": 0.004087425768375397, "grad/layer_20/mlp": 0.006093073636293411, "grad/layer_20/attn_mlp_ratio": 0.6708314957734113, "grad/layer_24/attn": 0.01267650444060564, "grad/layer_24/mlp": 0.00941601861268282, "grad/layer_24/attn_mlp_ratio": 1.3462700985854181, "grad/layer_27/attn": 0.006570638623088598, "grad/layer_27/mlp": 0.00905788317322731, "grad/layer_27/attn_mlp_ratio": 0.7254055307280958} {"step": 74100, "timestamp": 1778274636.7072418, "eos/sharpness": 71.01299762725829, "eos/L0_probe": 1.9625053405761719, "eos/L_plus": 2.3815596103668213, "eos/L_minus": 2.2535810470581055, "eos/grad_norm": 0.1566152125597, "eos/embed_grad_frac": 0.09413041174411774, "eos/time_s": 0.6050574779510498} {"step": 74100, "timestamp": 1778274636.727823, "train/loss": 2.104475808143616, "train/z_loss": 0.001376152760349214, "train/perplexity": 8.202802040398671, "train/grad_norm": 0.1572265625, "optim/muon_lr": 0.019300780296325683, "optim/adamw_lr": 0.0005790234088897705, "perf/tokens_per_sec": 1913612.038582675, "perf/iters_per_sec": 0.9124813263810515, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0959128379821776, "data/tokens_consumed": 155401060352, "data/tokens_consumed_B": 155.401060352, "train/loss_slope": -3.998642063627336e-06} {"step": 74100, "timestamp": 1778274638.0940073, "geo/rankme_last": 438.0539855957031, "geo/layer_0/stable_rank_q_proj": 19.138877868652344, "geo/layer_0/stable_rank_k_proj": 15.779845237731934, "geo/layer_0/stable_rank_o_proj": 46.771644592285156, "geo/layer_0/stable_rank_gate_proj": 129.37904357910156, "geo/layer_0/stable_rank_down_proj": 56.40964126586914, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0640937089920044, "geo/layer_0/attn_entropy_mean": 6.148536682128906, "geo/layer_0/attn_entropy_std": 0.4334418475627899, "geo/layer_7/stable_rank_q_proj": 42.710872650146484, "geo/layer_7/stable_rank_k_proj": 40.14847183227539, "geo/layer_7/stable_rank_o_proj": 88.25154113769531, "geo/layer_7/stable_rank_gate_proj": 77.73328399658203, "geo/layer_7/stable_rank_down_proj": 139.86607360839844, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4343873858451843, "geo/layer_7/attn_entropy_mean": 4.683930397033691, "geo/layer_7/attn_entropy_std": 0.8009975552558899, "geo/layer_14/stable_rank_q_proj": 49.93216323852539, "geo/layer_14/stable_rank_k_proj": 41.6541633605957, "geo/layer_14/stable_rank_o_proj": 43.420005798339844, "geo/layer_14/stable_rank_gate_proj": 70.8836669921875, "geo/layer_14/stable_rank_down_proj": 126.06118774414062, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39653608202934265, "geo/layer_14/attn_entropy_mean": 5.520839691162109, "geo/layer_14/attn_entropy_std": 0.43553897738456726, "geo/layer_21/stable_rank_q_proj": 39.720977783203125, "geo/layer_21/stable_rank_k_proj": 30.014745712280273, "geo/layer_21/stable_rank_o_proj": 68.27627563476562, "geo/layer_21/stable_rank_gate_proj": 63.954288482666016, "geo/layer_21/stable_rank_down_proj": 49.58199691772461, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14971908926963806, "geo/layer_21/attn_entropy_mean": 5.673404693603516, "geo/layer_21/attn_entropy_std": 0.3145769238471985, "geo/layer_27/stable_rank_q_proj": 43.67454147338867, "geo/layer_27/stable_rank_k_proj": 32.39177322387695, "geo/layer_27/stable_rank_o_proj": 115.29661560058594, "geo/layer_27/stable_rank_gate_proj": 77.72496795654297, "geo/layer_27/stable_rank_down_proj": 127.7658462524414, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09241845458745956, "geo/layer_27/attn_entropy_mean": 4.162875175476074, "geo/layer_27/attn_entropy_std": 0.776120126247406, "attnres/final_alpha/block_0": 0.23677361011505127, "attnres/block_norm/0": 1.7761216163635254, "attnres/final_alpha/block_1": 0.004082361236214638, "attnres/block_norm/1": 48542.1875, "attnres/final_alpha/block_2": 0.01021300908178091, "attnres/block_norm/2": 29050.0546875, "attnres/final_alpha/block_3": 0.011768696829676628, "attnres/block_norm/3": 62306.828125, "attnres/final_alpha/block_4": 0.014139189384877682, "attnres/block_norm/4": 15850.1240234375, "attnres/final_alpha/block_5": 0.6162815690040588, "attnres/block_norm/5": 6885.57958984375, "attnres/final_alpha/block_6": 0.10674156248569489, "attnres/block_norm/6": 41694.953125, "geo/tier1_time_s": 1.3624308109283447, "geo/step": 74100.0, "geo/rankme_slope": -0.00010277175323254301} {"step": 74110, "timestamp": 1778274648.4492478, "train/loss": 2.1402886390686033, "train/z_loss": 0.0013876379001885653, "train/perplexity": 8.501891252735794, "train/grad_norm": 0.1572265625, "optim/muon_lr": 0.019286507964134218, "optim/adamw_lr": 0.0005785952389240264, "perf/tokens_per_sec": 1789721.1440494405, "perf/iters_per_sec": 0.8534055443045809, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1717758417129516, "data/tokens_consumed": 155422031872, "data/tokens_consumed_B": 155.422031872, "train/loss_slope": -3.958644188813173e-06} {"step": 74120, "timestamp": 1778274658.7995052, "train/loss": 2.1398258447647094, "train/z_loss": 0.0013670442509464919, "train/perplexity": 8.497957536212711, "train/grad_norm": 0.19140625, "optim/muon_lr": 0.01927209734916687, "optim/adamw_lr": 0.000578162920475006, "perf/tokens_per_sec": 2027158.385701153, "perf/iters_per_sec": 0.9666244438653722, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345279455184937, "data/tokens_consumed": 155443003392, "data/tokens_consumed_B": 155.443003392, "train/loss_slope": -3.519920182592966e-06} {"step": 74130, "timestamp": 1778274669.1486132, "train/loss": 2.1314192414283752, "train/z_loss": 0.0013716628425754606, "train/perplexity": 8.426818017850758, "train/grad_norm": 0.265625, "optim/muon_lr": 0.019257547855377196, "optim/adamw_lr": 0.0005777264356613159, "perf/tokens_per_sec": 2027332.6593471575, "perf/iters_per_sec": 0.9667075440154826, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344390153884888, "data/tokens_consumed": 155463974912, "data/tokens_consumed_B": 155.463974912, "train/loss_slope": -3.0561793862683147e-06} {"step": 74140, "timestamp": 1778274679.4976013, "train/loss": 2.1183217763900757, "train/z_loss": 0.0013936125673353673, "train/perplexity": 8.31716770154954, "train/grad_norm": 0.0849609375, "optim/muon_lr": 0.019242860078811645, "optim/adamw_lr": 0.0005772858023643493, "perf/tokens_per_sec": 2027844.627110728, "perf/iters_per_sec": 0.9669516692689553, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341778516769409, "data/tokens_consumed": 155484946432, "data/tokens_consumed_B": 155.484946432, "train/loss_slope": -3.1202475873694187e-06} {"step": 74150, "timestamp": 1778274689.8392565, "grad/layer_0/attn": 0.0026825228706002235, "grad/layer_0/mlp": 0.0031184402760118246, "grad/layer_0/attn_mlp_ratio": 0.8602129741633553, "grad/layer_4/attn": 0.003073222003877163, "grad/layer_4/mlp": 0.0027447438333183527, "grad/layer_4/attn_mlp_ratio": 1.1196753061629634, "grad/layer_8/attn": 0.005305591970682144, "grad/layer_8/mlp": 0.003578506177291274, "grad/layer_8/attn_mlp_ratio": 1.4826275433274287, "grad/layer_12/attn": 0.005147702991962433, "grad/layer_12/mlp": 0.006549329962581396, "grad/layer_12/attn_mlp_ratio": 0.7859892451249408, "grad/layer_16/attn": 0.00388585077598691, "grad/layer_16/mlp": 0.004712844733148813, "grad/layer_16/attn_mlp_ratio": 0.8245233852502291, "grad/layer_20/attn": 0.003940804395824671, "grad/layer_20/mlp": 0.005353637970983982, "grad/layer_20/attn_mlp_ratio": 0.73609839581486, "grad/layer_24/attn": 0.012339572422206402, "grad/layer_24/mlp": 0.009812213480472565, "grad/layer_24/attn_mlp_ratio": 1.257572750634329, "grad/layer_27/attn": 0.0038627972826361656, "grad/layer_27/mlp": 0.009297779761254787, "grad/layer_27/attn_mlp_ratio": 0.41545372554291254} {"step": 74150, "timestamp": 1778274689.8549495, "train/loss": 2.1145261883735658, "train/z_loss": 0.0013742838869802653, "train/perplexity": 8.285658994353435, "train/grad_norm": 0.1357421875, "optim/muon_lr": 0.019228034019470215, "optim/adamw_lr": 0.0005768410205841064, "perf/tokens_per_sec": 2025995.164521229, "perf/iters_per_sec": 0.9660697767835755, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351219177246094, "data/tokens_consumed": 155505917952, "data/tokens_consumed_B": 155.505917952, "train/loss_slope": -4.874521470663647e-06} {"step": 74160, "timestamp": 1778274700.201999, "train/loss": 2.119544970989227, "train/z_loss": 0.0013767755939625204, "train/perplexity": 8.327347440794096, "train/grad_norm": 0.2041015625, "optim/muon_lr": 0.019213070273399354, "optim/adamw_lr": 0.0005763921082019806, "perf/tokens_per_sec": 2027709.7162317575, "perf/iters_per_sec": 0.9668873387488163, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342466592788697, "data/tokens_consumed": 155526889472, "data/tokens_consumed_B": 155.526889472, "train/loss_slope": -5.983684787584327e-06} {"step": 74170, "timestamp": 1778274710.5493963, "train/loss": 2.147667443752289, "train/z_loss": 0.0013764185132458806, "train/perplexity": 8.564857068268829, "train/grad_norm": 0.1572265625, "optim/muon_lr": 0.019197968840599062, "optim/adamw_lr": 0.0005759390652179718, "perf/tokens_per_sec": 2028147.0029967192, "perf/iters_per_sec": 0.9670958533271404, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340236663818358, "data/tokens_consumed": 155547860992, "data/tokens_consumed_B": 155.547860992, "train/loss_slope": -3.886054766060126e-06} {"step": 74175, "timestamp": 1778274716.3153744, "eos/sharpness": 51.25787258148193, "eos/L0_probe": 1.9630041122436523, "eos/L_plus": 2.219540596008301, "eos/L_minus": 2.2190463542938232, "eos/grad_norm": 0.15144847333431244, "eos/embed_grad_frac": 0.11887465417385101, "eos/time_s": 0.6035957336425781} {"step": 74175, "timestamp": 1778274717.6938097, "geo/rankme_last": 438.7833557128906, "geo/layer_0/stable_rank_q_proj": 19.131223678588867, "geo/layer_0/stable_rank_k_proj": 15.761236190795898, "geo/layer_0/stable_rank_o_proj": 46.712589263916016, "geo/layer_0/stable_rank_gate_proj": 129.2647247314453, "geo/layer_0/stable_rank_down_proj": 56.39567184448242, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06286903470754623, "geo/layer_0/attn_entropy_mean": 6.1503586769104, "geo/layer_0/attn_entropy_std": 0.43356743454933167, "geo/layer_7/stable_rank_q_proj": 42.76652908325195, "geo/layer_7/stable_rank_k_proj": 40.15712356567383, "geo/layer_7/stable_rank_o_proj": 88.17344665527344, "geo/layer_7/stable_rank_gate_proj": 77.80429077148438, "geo/layer_7/stable_rank_down_proj": 140.19000244140625, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4358326494693756, "geo/layer_7/attn_entropy_mean": 4.6287102699279785, "geo/layer_7/attn_entropy_std": 0.7992326021194458, "geo/layer_14/stable_rank_q_proj": 49.971893310546875, "geo/layer_14/stable_rank_k_proj": 41.65247344970703, "geo/layer_14/stable_rank_o_proj": 43.43958282470703, "geo/layer_14/stable_rank_gate_proj": 70.88627624511719, "geo/layer_14/stable_rank_down_proj": 126.21376037597656, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.393594890832901, "geo/layer_14/attn_entropy_mean": 5.552087783813477, "geo/layer_14/attn_entropy_std": 0.4092476963996887, "geo/layer_21/stable_rank_q_proj": 39.74397659301758, "geo/layer_21/stable_rank_k_proj": 30.028343200683594, "geo/layer_21/stable_rank_o_proj": 68.3405990600586, "geo/layer_21/stable_rank_gate_proj": 64.03126525878906, "geo/layer_21/stable_rank_down_proj": 49.58277893066406, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14193657040596008, "geo/layer_21/attn_entropy_mean": 5.659095287322998, "geo/layer_21/attn_entropy_std": 0.31394192576408386, "geo/layer_27/stable_rank_q_proj": 43.78943634033203, "geo/layer_27/stable_rank_k_proj": 32.38221740722656, "geo/layer_27/stable_rank_o_proj": 115.27134704589844, "geo/layer_27/stable_rank_gate_proj": 77.66211700439453, "geo/layer_27/stable_rank_down_proj": 127.8753433227539, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09453479945659637, "geo/layer_27/attn_entropy_mean": 4.154134750366211, "geo/layer_27/attn_entropy_std": 0.7927361726760864, "attnres/final_alpha/block_0": 0.23739296197891235, "attnres/block_norm/0": 1.7763996124267578, "attnres/final_alpha/block_1": 0.004094542004168034, "attnres/block_norm/1": 48472.15625, "attnres/final_alpha/block_2": 0.010143840685486794, "attnres/block_norm/2": 29108.064453125, "attnres/final_alpha/block_3": 0.011915180832147598, "attnres/block_norm/3": 62518.63671875, "attnres/final_alpha/block_4": 0.014048835262656212, "attnres/block_norm/4": 15829.015625, "attnres/final_alpha/block_5": 0.615455687046051, "attnres/block_norm/5": 6901.3623046875, "attnres/final_alpha/block_6": 0.10694893449544907, "attnres/block_norm/6": 41444.7421875, "geo/tier1_time_s": 1.3589622974395752, "geo/step": 74175.0, "geo/rankme_slope": -7.050800789065626e-05} {"step": 74180, "timestamp": 1778274722.8695576, "train/loss": 2.061116909980774, "train/z_loss": 0.0013754123821854592, "train/perplexity": 7.854737948007181, "train/grad_norm": 0.08984375, "optim/muon_lr": 0.019182729721069335, "optim/adamw_lr": 0.00057548189163208, "perf/tokens_per_sec": 1702854.3963781854, "perf/iters_per_sec": 0.8119842512026717, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2315509796142579, "data/tokens_consumed": 155568832512, "data/tokens_consumed_B": 155.568832512, "train/loss_slope": -9.486936020700935e-06} {"step": 74190, "timestamp": 1778274733.2193055, "train/loss": 2.171278190612793, "train/z_loss": 0.0013525690417736768, "train/perplexity": 8.769485954751438, "train/grad_norm": 0.2255859375, "optim/muon_lr": 0.01916735291481018, "optim/adamw_lr": 0.0005750205874443054, "perf/tokens_per_sec": 2027148.7618098431, "perf/iters_per_sec": 0.9666198548363891, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034532856941223, "data/tokens_consumed": 155589804032, "data/tokens_consumed_B": 155.589804032, "train/loss_slope": -9.567050328671017e-06} {"step": 74200, "timestamp": 1778274743.560997, "grad/layer_0/attn": 0.0029693585820496082, "grad/layer_0/mlp": 0.0029922828543931246, "grad/layer_0/attn_mlp_ratio": 0.9923388353665351, "grad/layer_4/attn": 0.004419167526066303, "grad/layer_4/mlp": 0.0026080221869051456, "grad/layer_4/attn_mlp_ratio": 1.6944515958528805, "grad/layer_8/attn": 0.009350752457976341, "grad/layer_8/mlp": 0.003641856834292412, "grad/layer_8/attn_mlp_ratio": 2.5675781961471063, "grad/layer_12/attn": 0.007650767918676138, "grad/layer_12/mlp": 0.006742099300026894, "grad/layer_12/attn_mlp_ratio": 1.1347753073242468, "grad/layer_16/attn": 0.0041475724428892136, "grad/layer_16/mlp": 0.004777976311743259, "grad/layer_16/attn_mlp_ratio": 0.8680604685898731, "grad/layer_20/attn": 0.0028488198295235634, "grad/layer_20/mlp": 0.005620946642011404, "grad/layer_20/attn_mlp_ratio": 0.506822064018372, "grad/layer_24/attn": 0.008057275786995888, "grad/layer_24/mlp": 0.008752654306590557, "grad/layer_24/attn_mlp_ratio": 0.9205522590871331, "grad/layer_27/attn": 0.00473603093996644, "grad/layer_27/mlp": 0.00867303367704153, "grad/layer_27/attn_mlp_ratio": 0.5460639335342188} {"step": 74200, "timestamp": 1778274743.576684, "train/loss": 2.134230041503906, "train/z_loss": 0.001388076727744192, "train/perplexity": 8.45053743822965, "train/grad_norm": 0.1435546875, "optim/muon_lr": 0.01915183961391449, "optim/adamw_lr": 0.0005745551884174347, "perf/tokens_per_sec": 2025790.6544853665, "perf/iters_per_sec": 0.9659722587992509, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352264165878295, "data/tokens_consumed": 155610775552, "data/tokens_consumed_B": 155.610775552, "train/loss_slope": -1.0834252415853611e-05} {"step": 74210, "timestamp": 1778274753.9256504, "train/loss": 2.126492464542389, "train/z_loss": 0.0013827765826135874, "train/perplexity": 8.385403070493185, "train/grad_norm": 0.1416015625, "optim/muon_lr": 0.019136189818382263, "optim/adamw_lr": 0.0005740856945514679, "perf/tokens_per_sec": 2027572.4864384474, "perf/iters_per_sec": 0.9668219024841535, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343166589736938, "data/tokens_consumed": 155631747072, "data/tokens_consumed_B": 155.631747072, "train/loss_slope": -9.243558370920418e-06} {"step": 74220, "timestamp": 1778274764.2766287, "train/loss": 2.1312392950057983, "train/z_loss": 0.0013939304975792766, "train/perplexity": 8.425301778519751, "train/grad_norm": 0.1552734375, "optim/muon_lr": 0.019120402932167054, "optim/adamw_lr": 0.0005736120879650115, "perf/tokens_per_sec": 2027113.6774264798, "perf/iters_per_sec": 0.9666031252987288, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345507621765138, "data/tokens_consumed": 155652718592, "data/tokens_consumed_B": 155.652718592, "train/loss_slope": -1.1025766735494675e-05} {"step": 74230, "timestamp": 1778274774.6280289, "train/loss": 2.1409146070480345, "train/z_loss": 0.0013894109637476505, "train/perplexity": 8.507214830445372, "train/grad_norm": 0.12255859375, "optim/muon_lr": 0.019104480147361755, "optim/adamw_lr": 0.0005731344044208526, "perf/tokens_per_sec": 2027034.6836134437, "perf/iters_per_sec": 0.9665654581134051, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345910787582397, "data/tokens_consumed": 155673690112, "data/tokens_consumed_B": 155.673690112, "train/loss_slope": -8.84199557345874e-06} {"step": 74240, "timestamp": 1778274785.5547457, "train/loss": 2.1641231536865235, "train/z_loss": 0.0013756112777628005, "train/perplexity": 8.706963899460112, "train/grad_norm": 0.12353515625, "optim/muon_lr": 0.019088420867919922, "optim/adamw_lr": 0.0005726526260375976, "perf/tokens_per_sec": 1920056.4743425343, "perf/iters_per_sec": 0.9155542728150055, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0922345399856568, "data/tokens_consumed": 155694661632, "data/tokens_consumed_B": 155.694661632, "train/loss_slope": -7.3954129984455456e-06} {"step": 74250, "timestamp": 1778274795.8911555, "grad/layer_0/attn": 0.002780987648293376, "grad/layer_0/mlp": 0.0033245449885725975, "grad/layer_0/attn_mlp_ratio": 0.836501709016496, "grad/layer_4/attn": 0.0029964130371809006, "grad/layer_4/mlp": 0.0026203496381640434, "grad/layer_4/attn_mlp_ratio": 1.143516452609241, "grad/layer_8/attn": 0.0042805662378668785, "grad/layer_8/mlp": 0.003843368496745825, "grad/layer_8/attn_mlp_ratio": 1.11375376316787, "grad/layer_12/attn": 0.004529875237494707, "grad/layer_12/mlp": 0.007346380036324263, "grad/layer_12/attn_mlp_ratio": 0.6166132371910196, "grad/layer_16/attn": 0.003426958806812763, "grad/layer_16/mlp": 0.004361174069344997, "grad/layer_16/attn_mlp_ratio": 0.7857881097483105, "grad/layer_20/attn": 0.0030026051681488752, "grad/layer_20/mlp": 0.005524531006813049, "grad/layer_20/attn_mlp_ratio": 0.5435040748428325, "grad/layer_24/attn": 0.0049632214941084385, "grad/layer_24/mlp": 0.008372420445084572, "grad/layer_24/attn_mlp_ratio": 0.5928060430531448, "grad/layer_27/attn": 0.007100454531610012, "grad/layer_27/mlp": 0.007316926959902048, "grad/layer_27/attn_mlp_ratio": 0.970414830362552} {"step": 74250, "timestamp": 1778274796.4945285, "eos/sharpness": 27.414059638977047, "eos/L0_probe": 1.9604803323745728, "eos/L_plus": 2.0868873596191406, "eos/L_minus": 2.1082139015197754, "eos/grad_norm": 0.11354532837867737, "eos/embed_grad_frac": 0.19035233557224274, "eos/time_s": 0.6006247997283936} {"step": 74250, "timestamp": 1778274796.5138688, "train/loss": 2.189809727668762, "train/z_loss": 0.0013726059347391128, "train/perplexity": 8.93351315260231, "train/grad_norm": 0.11328125, "optim/muon_lr": 0.019072225689888003, "optim/adamw_lr": 0.00057216677069664, "perf/tokens_per_sec": 1914462.1069594924, "perf/iters_per_sec": 0.9128866705701315, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0954262256622314, "data/tokens_consumed": 155715633152, "data/tokens_consumed_B": 155.715633152, "train/loss_slope": -2.033767047816868e-06} {"step": 74250, "timestamp": 1778274797.8766336, "geo/rankme_last": 438.5340576171875, "geo/layer_0/stable_rank_q_proj": 19.163286209106445, "geo/layer_0/stable_rank_k_proj": 15.765923500061035, "geo/layer_0/stable_rank_o_proj": 46.68108367919922, "geo/layer_0/stable_rank_gate_proj": 129.28977966308594, "geo/layer_0/stable_rank_down_proj": 56.3452262878418, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06318237632513046, "geo/layer_0/attn_entropy_mean": 6.149335861206055, "geo/layer_0/attn_entropy_std": 0.4340079724788666, "geo/layer_7/stable_rank_q_proj": 42.783084869384766, "geo/layer_7/stable_rank_k_proj": 40.129695892333984, "geo/layer_7/stable_rank_o_proj": 88.22123718261719, "geo/layer_7/stable_rank_gate_proj": 77.82568359375, "geo/layer_7/stable_rank_down_proj": 140.39590454101562, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4539920389652252, "geo/layer_7/attn_entropy_mean": 4.658642292022705, "geo/layer_7/attn_entropy_std": 0.7894728779792786, "geo/layer_14/stable_rank_q_proj": 49.87272262573242, "geo/layer_14/stable_rank_k_proj": 41.7036018371582, "geo/layer_14/stable_rank_o_proj": 43.50117874145508, "geo/layer_14/stable_rank_gate_proj": 70.78775024414062, "geo/layer_14/stable_rank_down_proj": 125.6932144165039, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.40036189556121826, "geo/layer_14/attn_entropy_mean": 5.567815780639648, "geo/layer_14/attn_entropy_std": 0.42587530612945557, "geo/layer_21/stable_rank_q_proj": 39.77939987182617, "geo/layer_21/stable_rank_k_proj": 30.08826446533203, "geo/layer_21/stable_rank_o_proj": 68.41943359375, "geo/layer_21/stable_rank_gate_proj": 63.97565841674805, "geo/layer_21/stable_rank_down_proj": 49.57429885864258, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14342492818832397, "geo/layer_21/attn_entropy_mean": 5.690910339355469, "geo/layer_21/attn_entropy_std": 0.30836692452430725, "geo/layer_27/stable_rank_q_proj": 43.86454391479492, "geo/layer_27/stable_rank_k_proj": 32.41109848022461, "geo/layer_27/stable_rank_o_proj": 115.26393127441406, "geo/layer_27/stable_rank_gate_proj": 77.68473052978516, "geo/layer_27/stable_rank_down_proj": 127.87568664550781, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10095098614692688, "geo/layer_27/attn_entropy_mean": 4.176959991455078, "geo/layer_27/attn_entropy_std": 0.783209502696991, "attnres/final_alpha/block_0": 0.23737630248069763, "attnres/block_norm/0": 1.776459813117981, "attnres/final_alpha/block_1": 0.004080062732100487, "attnres/block_norm/1": 48329.97265625, "attnres/final_alpha/block_2": 0.010077347978949547, "attnres/block_norm/2": 29106.23046875, "attnres/final_alpha/block_3": 0.011788724921643734, "attnres/block_norm/3": 61923.296875, "attnres/final_alpha/block_4": 0.013834918849170208, "attnres/block_norm/4": 15875.8681640625, "attnres/final_alpha/block_5": 0.6138601303100586, "attnres/block_norm/5": 6863.54345703125, "attnres/final_alpha/block_6": 0.10898251831531525, "attnres/block_norm/6": 41159.53125, "geo/tier1_time_s": 1.3590421676635742, "geo/step": 74250.0, "geo/rankme_slope": -6.287436849739896e-05} {"step": 74260, "timestamp": 1778274808.2276757, "train/loss": 2.100754976272583, "train/z_loss": 0.0013933148235082625, "train/perplexity": 8.172337504989533, "train/grad_norm": 0.2275390625, "optim/muon_lr": 0.01905589520931244, "optim/adamw_lr": 0.0005716768562793732, "perf/tokens_per_sec": 1790884.1949703358, "perf/iters_per_sec": 0.8539601302005462, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1710148572921752, "data/tokens_consumed": 155736604672, "data/tokens_consumed_B": 155.736604672, "train/loss_slope": -5.8161152829073996e-06} {"step": 74270, "timestamp": 1778274818.5770855, "train/loss": 2.1409992933273316, "train/z_loss": 0.0013686123420484364, "train/perplexity": 8.507935305323281, "train/grad_norm": 0.2578125, "optim/muon_lr": 0.019039429426193237, "optim/adamw_lr": 0.0005711828827857971, "perf/tokens_per_sec": 2027381.8164738703, "perf/iters_per_sec": 0.9667309839600898, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344139337539673, "data/tokens_consumed": 155757576192, "data/tokens_consumed_B": 155.757576192, "train/loss_slope": -6.204503454534e-06} {"step": 74280, "timestamp": 1778274828.929468, "train/loss": 2.1185163021087647, "train/z_loss": 0.0013827749760821461, "train/perplexity": 8.318785761946218, "train/grad_norm": 0.1279296875, "optim/muon_lr": 0.019022828340530398, "optim/adamw_lr": 0.0005706848502159118, "perf/tokens_per_sec": 2027184.8285137538, "perf/iters_per_sec": 0.9666370527809877, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345144510269164, "data/tokens_consumed": 155778547712, "data/tokens_consumed_B": 155.778547712, "train/loss_slope": -7.615936077860718e-06} {"step": 74290, "timestamp": 1778274839.28183, "train/loss": 2.114516806602478, "train/z_loss": 0.0013649900094605983, "train/perplexity": 8.28558126056208, "train/grad_norm": 0.1630859375, "optim/muon_lr": 0.019006092548370362, "optim/adamw_lr": 0.0005701827764511108, "perf/tokens_per_sec": 2027199.3116187, "perf/iters_per_sec": 0.9666439588635921, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345070600509643, "data/tokens_consumed": 155799519232, "data/tokens_consumed_B": 155.799519232, "train/loss_slope": -6.785770010526118e-06} {"step": 74300, "timestamp": 1778274849.6590748, "grad/layer_0/attn": 0.0024831793271005154, "grad/layer_0/mlp": 0.002849255921319127, "grad/layer_0/attn_mlp_ratio": 0.8715184976430695, "grad/layer_4/attn": 0.0036374421324580908, "grad/layer_4/mlp": 0.00246611749753356, "grad/layer_4/attn_mlp_ratio": 1.4749670235093446, "grad/layer_8/attn": 0.00700727803632617, "grad/layer_8/mlp": 0.0037562905345112085, "grad/layer_8/attn_mlp_ratio": 1.8654781320556681, "grad/layer_12/attn": 0.006241972558200359, "grad/layer_12/mlp": 0.00708660576492548, "grad/layer_12/attn_mlp_ratio": 0.8808127158721276, "grad/layer_16/attn": 0.006256625521928072, "grad/layer_16/mlp": 0.005397196859121323, "grad/layer_16/attn_mlp_ratio": 1.159236094090342, "grad/layer_20/attn": 0.0034774444065988064, "grad/layer_20/mlp": 0.006500902120023966, "grad/layer_20/attn_mlp_ratio": 0.5349171990139527, "grad/layer_24/attn": 0.01691628061234951, "grad/layer_24/mlp": 0.011069219559431076, "grad/layer_24/attn_mlp_ratio": 1.5282270234773674, "grad/layer_27/attn": 0.012267285026609898, "grad/layer_27/mlp": 0.010899686254560947, "grad/layer_27/attn_mlp_ratio": 1.1254713784930777} {"step": 74300, "timestamp": 1778274849.674958, "train/loss": 2.1917285919189453, "train/z_loss": 0.001367802603635937, "train/perplexity": 8.950671808920573, "train/grad_norm": 0.2353515625, "optim/muon_lr": 0.018989222049713136, "optim/adamw_lr": 0.000569676661491394, "perf/tokens_per_sec": 2019211.2332261873, "perf/iters_per_sec": 0.9628349462634026, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0385996103286743, "data/tokens_consumed": 155820490752, "data/tokens_consumed_B": 155.820490752, "train/loss_slope": -2.3121559544793992e-06} {"step": 74310, "timestamp": 1778274860.0480824, "train/loss": 2.1238328218460083, "train/z_loss": 0.0013695710804313421, "train/perplexity": 8.363130526094666, "train/grad_norm": 0.0810546875, "optim/muon_lr": 0.018972217440605163, "optim/adamw_lr": 0.0005691665232181549, "perf/tokens_per_sec": 2023132.1001837244, "perf/iters_per_sec": 0.9647045613211271, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0365867853164672, "data/tokens_consumed": 155841462272, "data/tokens_consumed_B": 155.841462272, "train/loss_slope": -2.623105599935287e-06} {"step": 74320, "timestamp": 1778274870.405746, "train/loss": 2.1358144283294678, "train/z_loss": 0.0013786609400995077, "train/perplexity": 8.463936970633952, "train/grad_norm": 0.158203125, "optim/muon_lr": 0.01895507872104645, "optim/adamw_lr": 0.0005686523616313933, "perf/tokens_per_sec": 2025823.2669394205, "perf/iters_per_sec": 0.965987809629164, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352097511291505, "data/tokens_consumed": 155862433792, "data/tokens_consumed_B": 155.862433792, "train/loss_slope": -2.5969985294656622e-06} {"step": 74325, "timestamp": 1778274876.1904466, "eos/sharpness": 65.77179431915282, "eos/L0_probe": 1.962561011314392, "eos/L_plus": 2.3113021850585938, "eos/L_minus": 2.2715377807617188, "eos/grad_norm": 0.16202513873577118, "eos/embed_grad_frac": 0.08345041424036026, "eos/time_s": 0.6182148456573486} {"step": 74325, "timestamp": 1778274877.5689633, "geo/rankme_last": 438.3642883300781, "geo/layer_0/stable_rank_q_proj": 19.147884368896484, "geo/layer_0/stable_rank_k_proj": 15.781323432922363, "geo/layer_0/stable_rank_o_proj": 46.6373176574707, "geo/layer_0/stable_rank_gate_proj": 129.30274963378906, "geo/layer_0/stable_rank_down_proj": 56.37228012084961, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0646779015660286, "geo/layer_0/attn_entropy_mean": 6.145003795623779, "geo/layer_0/attn_entropy_std": 0.4297807812690735, "geo/layer_7/stable_rank_q_proj": 42.75212097167969, "geo/layer_7/stable_rank_k_proj": 40.1177864074707, "geo/layer_7/stable_rank_o_proj": 88.27239227294922, "geo/layer_7/stable_rank_gate_proj": 77.67735290527344, "geo/layer_7/stable_rank_down_proj": 140.20013427734375, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.44268399477005005, "geo/layer_7/attn_entropy_mean": 4.639496326446533, "geo/layer_7/attn_entropy_std": 0.785041332244873, "geo/layer_14/stable_rank_q_proj": 49.82594299316406, "geo/layer_14/stable_rank_k_proj": 41.71951675415039, "geo/layer_14/stable_rank_o_proj": 43.437984466552734, "geo/layer_14/stable_rank_gate_proj": 70.74234771728516, "geo/layer_14/stable_rank_down_proj": 125.5550308227539, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3916715979576111, "geo/layer_14/attn_entropy_mean": 5.582740783691406, "geo/layer_14/attn_entropy_std": 0.41628819704055786, "geo/layer_21/stable_rank_q_proj": 39.75128936767578, "geo/layer_21/stable_rank_k_proj": 30.111766815185547, "geo/layer_21/stable_rank_o_proj": 68.2789306640625, "geo/layer_21/stable_rank_gate_proj": 63.928348541259766, "geo/layer_21/stable_rank_down_proj": 49.511474609375, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14267286658287048, "geo/layer_21/attn_entropy_mean": 5.6911468505859375, "geo/layer_21/attn_entropy_std": 0.2955535650253296, "geo/layer_27/stable_rank_q_proj": 43.91669845581055, "geo/layer_27/stable_rank_k_proj": 32.42216110229492, "geo/layer_27/stable_rank_o_proj": 115.28288269042969, "geo/layer_27/stable_rank_gate_proj": 77.67119598388672, "geo/layer_27/stable_rank_down_proj": 127.96678924560547, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09899657964706421, "geo/layer_27/attn_entropy_mean": 4.170268535614014, "geo/layer_27/attn_entropy_std": 0.7920828461647034, "attnres/final_alpha/block_0": 0.23525092005729675, "attnres/block_norm/0": 1.7765486240386963, "attnres/final_alpha/block_1": 0.00400254363194108, "attnres/block_norm/1": 48556.671875, "attnres/final_alpha/block_2": 0.009931819513440132, "attnres/block_norm/2": 29058.18359375, "attnres/final_alpha/block_3": 0.011581448838114738, "attnres/block_norm/3": 62714.53515625, "attnres/final_alpha/block_4": 0.013703776523470879, "attnres/block_norm/4": 15837.2314453125, "attnres/final_alpha/block_5": 0.6210286021232605, "attnres/block_norm/5": 6829.525390625, "attnres/final_alpha/block_6": 0.10450087487697601, "attnres/block_norm/6": 41373.6484375, "geo/tier1_time_s": 1.3589155673980713, "geo/step": 74325.0, "geo/rankme_slope": -3.5955046080932375e-05} {"step": 74330, "timestamp": 1778274882.7486176, "train/loss": 2.111377716064453, "train/z_loss": 0.0013746762415394187, "train/perplexity": 8.259612850743936, "train/grad_norm": 0.1259765625, "optim/muon_lr": 0.01893780589103699, "optim/adamw_lr": 0.0005681341767311095, "perf/tokens_per_sec": 1699688.8713792986, "perf/iters_per_sec": 0.8104748112579816, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2338446378707886, "data/tokens_consumed": 155883405312, "data/tokens_consumed_B": 155.883405312, "train/loss_slope": -5.183003265650955e-06} {"step": 74340, "timestamp": 1778274893.111854, "train/loss": 2.1512255311012267, "train/z_loss": 0.0013723012409172953, "train/perplexity": 8.595385857690777, "train/grad_norm": 0.1982421875, "optim/muon_lr": 0.01892039954662323, "optim/adamw_lr": 0.0005676119863986969, "perf/tokens_per_sec": 2024951.1636964628, "perf/iters_per_sec": 0.9655719583971323, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0356555938720704, "data/tokens_consumed": 155904376832, "data/tokens_consumed_B": 155.904376832, "train/loss_slope": -5.144330048181977e-06} {"step": 74350, "timestamp": 1778274903.4640715, "grad/layer_0/attn": 0.0029959476087242365, "grad/layer_0/mlp": 0.0033653767313808203, "grad/layer_0/attn_mlp_ratio": 0.8902264913658996, "grad/layer_4/attn": 0.00234194821678102, "grad/layer_4/mlp": 0.002548089949414134, "grad/layer_4/attn_mlp_ratio": 0.9190994711193544, "grad/layer_8/attn": 0.004316628910601139, "grad/layer_8/mlp": 0.003774243174120784, "grad/layer_8/attn_mlp_ratio": 1.1437071214246795, "grad/layer_12/attn": 0.005272947251796722, "grad/layer_12/mlp": 0.0069953217171132565, "grad/layer_12/attn_mlp_ratio": 0.7537819402242595, "grad/layer_16/attn": 0.0043651447631418705, "grad/layer_16/mlp": 0.004597701132297516, "grad/layer_16/attn_mlp_ratio": 0.9494189688703556, "grad/layer_20/attn": 0.003346641780808568, "grad/layer_20/mlp": 0.005873054265975952, "grad/layer_20/attn_mlp_ratio": 0.5698298657333204, "grad/layer_24/attn": 0.010953565128147602, "grad/layer_24/mlp": 0.009808157570660114, "grad/layer_24/attn_mlp_ratio": 1.1167811016042117, "grad/layer_27/attn": 0.004461539909243584, "grad/layer_27/mlp": 0.00871412456035614, "grad/layer_27/attn_mlp_ratio": 0.5119894519688044} {"step": 74350, "timestamp": 1778274903.479811, "train/loss": 2.1746246099472044, "train/z_loss": 0.0013762011774815619, "train/perplexity": 8.79888148956408, "train/grad_norm": 0.1259765625, "optim/muon_lr": 0.018902859687805175, "optim/adamw_lr": 0.0005670857906341552, "perf/tokens_per_sec": 2023803.6974907469, "perf/iters_per_sec": 0.9650248038724646, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0362427949905395, "data/tokens_consumed": 155925348352, "data/tokens_consumed_B": 155.925348352, "train/loss_slope": -5.748114927803098e-07} {"step": 74360, "timestamp": 1778274913.8439484, "train/loss": 2.1426345109939575, "train/z_loss": 0.0013799469801597298, "train/perplexity": 8.521859012484432, "train/grad_norm": 0.09423828125, "optim/muon_lr": 0.01888518691062927, "optim/adamw_lr": 0.0005665556073188781, "perf/tokens_per_sec": 2024456.5440494048, "perf/iters_per_sec": 0.9653361053702377, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0359086275100708, "data/tokens_consumed": 155946319872, "data/tokens_consumed_B": 155.946319872, "train/loss_slope": 1.7648448537785724e-06} {"step": 74370, "timestamp": 1778274924.2035072, "train/loss": 2.1290018796920775, "train/z_loss": 0.0013818330713547765, "train/perplexity": 8.406471952223846, "train/grad_norm": 0.126953125, "optim/muon_lr": 0.01886738121509552, "optim/adamw_lr": 0.0005660214364528656, "perf/tokens_per_sec": 2025945.421342211, "perf/iters_per_sec": 0.9660460573874526, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351473331451415, "data/tokens_consumed": 155967291392, "data/tokens_consumed_B": 155.967291392, "train/loss_slope": 4.9961408551591016e-06} {"step": 74380, "timestamp": 1778274934.5579314, "train/loss": 2.1733665466308594, "train/z_loss": 0.0013815960031934083, "train/perplexity": 8.787818899715512, "train/grad_norm": 0.0966796875, "optim/muon_lr": 0.018849443197250366, "optim/adamw_lr": 0.000565483295917511, "perf/tokens_per_sec": 2026493.8978799332, "perf/iters_per_sec": 0.9663075913810412, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348671674728394, "data/tokens_consumed": 155988262912, "data/tokens_consumed_B": 155.988262912, "train/loss_slope": 6.508230955579966e-06} {"step": 74390, "timestamp": 1778274944.9276593, "train/loss": 2.13178346157074, "train/z_loss": 0.0013827022863551973, "train/perplexity": 8.429887793712066, "train/grad_norm": 0.1796875, "optim/muon_lr": 0.01883137285709381, "optim/adamw_lr": 0.0005649411857128143, "perf/tokens_per_sec": 2023507.0374461017, "perf/iters_per_sec": 0.9648833453398236, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036394715309143, "data/tokens_consumed": 156009234432, "data/tokens_consumed_B": 156.009234432, "train/loss_slope": 6.216903817285242e-06} {"step": 74400, "timestamp": 1778274955.2783701, "grad/layer_0/attn": 0.0027324925176799297, "grad/layer_0/mlp": 0.0029900644440203905, "grad/layer_0/attn_mlp_ratio": 0.913857369114135, "grad/layer_4/attn": 0.002264486625790596, "grad/layer_4/mlp": 0.002652376191690564, "grad/layer_4/attn_mlp_ratio": 0.8537576786841448, "grad/layer_8/attn": 0.006894082762300968, "grad/layer_8/mlp": 0.003688619239255786, "grad/layer_8/attn_mlp_ratio": 1.869014427412267, "grad/layer_12/attn": 0.005316707771271467, "grad/layer_12/mlp": 0.006517922505736351, "grad/layer_12/attn_mlp_ratio": 0.815705876377282, "grad/layer_16/attn": 0.004076594486832619, "grad/layer_16/mlp": 0.00439427187666297, "grad/layer_16/attn_mlp_ratio": 0.9277064570610406, "grad/layer_20/attn": 0.008929090574383736, "grad/layer_20/mlp": 0.005644375924021006, "grad/layer_20/attn_mlp_ratio": 1.581944671365592, "grad/layer_24/attn": 0.014615348540246487, "grad/layer_24/mlp": 0.009084686636924744, "grad/layer_24/attn_mlp_ratio": 1.608789489773197, "grad/layer_27/attn": 0.005184049252420664, "grad/layer_27/mlp": 0.010301136411726475, "grad/layer_27/attn_mlp_ratio": 0.5032502235573049} {"step": 74400, "timestamp": 1778274955.8805556, "eos/sharpness": 53.5304307937622, "eos/L0_probe": 1.9644889831542969, "eos/L_plus": 2.192026138305664, "eos/L_minus": 2.2722561359405518, "eos/grad_norm": 0.162465438246727, "eos/embed_grad_frac": 0.14416760206222534, "eos/time_s": 0.5991644859313965} {"step": 74400, "timestamp": 1778274955.9008229, "train/loss": 2.2062004804611206, "train/z_loss": 0.0013710364466533065, "train/perplexity": 9.081146766068887, "train/grad_norm": 0.162109375, "optim/muon_lr": 0.018813170194625854, "optim/adamw_lr": 0.0005643951058387755, "perf/tokens_per_sec": 1912076.8335055618, "perf/iters_per_sec": 0.9117492835548219, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.09679274559021, "data/tokens_consumed": 156030205952, "data/tokens_consumed_B": 156.030205952, "train/loss_slope": 1.2488772144483573e-05} {"step": 74400, "timestamp": 1778274957.2643523, "geo/rankme_last": 438.12835693359375, "geo/layer_0/stable_rank_q_proj": 19.13580894470215, "geo/layer_0/stable_rank_k_proj": 15.786483764648438, "geo/layer_0/stable_rank_o_proj": 46.60377502441406, "geo/layer_0/stable_rank_gate_proj": 129.4275665283203, "geo/layer_0/stable_rank_down_proj": 56.401222229003906, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06356941163539886, "geo/layer_0/attn_entropy_mean": 6.14365816116333, "geo/layer_0/attn_entropy_std": 0.4294970631599426, "geo/layer_7/stable_rank_q_proj": 42.7363166809082, "geo/layer_7/stable_rank_k_proj": 40.10335159301758, "geo/layer_7/stable_rank_o_proj": 88.40953826904297, "geo/layer_7/stable_rank_gate_proj": 77.74200439453125, "geo/layer_7/stable_rank_down_proj": 139.94239807128906, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4384387135505676, "geo/layer_7/attn_entropy_mean": 4.634653091430664, "geo/layer_7/attn_entropy_std": 0.8132693767547607, "geo/layer_14/stable_rank_q_proj": 49.7757568359375, "geo/layer_14/stable_rank_k_proj": 41.59150314331055, "geo/layer_14/stable_rank_o_proj": 43.358360290527344, "geo/layer_14/stable_rank_gate_proj": 70.76158905029297, "geo/layer_14/stable_rank_down_proj": 125.5135498046875, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.40040960907936096, "geo/layer_14/attn_entropy_mean": 5.544721603393555, "geo/layer_14/attn_entropy_std": 0.4271031320095062, "geo/layer_21/stable_rank_q_proj": 39.810482025146484, "geo/layer_21/stable_rank_k_proj": 30.128690719604492, "geo/layer_21/stable_rank_o_proj": 68.23914337158203, "geo/layer_21/stable_rank_gate_proj": 63.961570739746094, "geo/layer_21/stable_rank_down_proj": 49.49284362792969, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14008310437202454, "geo/layer_21/attn_entropy_mean": 5.676636695861816, "geo/layer_21/attn_entropy_std": 0.3008185029029846, "geo/layer_27/stable_rank_q_proj": 43.90178298950195, "geo/layer_27/stable_rank_k_proj": 32.4169921875, "geo/layer_27/stable_rank_o_proj": 115.27323913574219, "geo/layer_27/stable_rank_gate_proj": 77.61773681640625, "geo/layer_27/stable_rank_down_proj": 127.80430603027344, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09086296707391739, "geo/layer_27/attn_entropy_mean": 4.176515579223633, "geo/layer_27/attn_entropy_std": 0.7820955514907837, "attnres/final_alpha/block_0": 0.2363128364086151, "attnres/block_norm/0": 1.7766594886779785, "attnres/final_alpha/block_1": 0.004003043286502361, "attnres/block_norm/1": 48526.4921875, "attnres/final_alpha/block_2": 0.010102584958076477, "attnres/block_norm/2": 29066.169921875, "attnres/final_alpha/block_3": 0.011855930089950562, "attnres/block_norm/3": 62395.0078125, "attnres/final_alpha/block_4": 0.013960001990199089, "attnres/block_norm/4": 15789.71484375, "attnres/final_alpha/block_5": 0.6158343553543091, "attnres/block_norm/5": 6901.3994140625, "attnres/final_alpha/block_6": 0.1079312413930893, "attnres/block_norm/6": 41323.546875, "geo/tier1_time_s": 1.3593018054962158, "geo/step": 74400.0, "geo/rankme_slope": -6.815693074104643e-05} {"step": 74410, "timestamp": 1778274967.9397483, "train/loss": 2.1535233855247498, "train/z_loss": 0.0013768267817795277, "train/perplexity": 8.615159512895671, "train/grad_norm": 0.140625, "optim/muon_lr": 0.018794836401939394, "optim/adamw_lr": 0.0005638450920581817, "perf/tokens_per_sec": 1742586.8674542934, "perf/iters_per_sec": 0.8309301697989909, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.20347056388855, "data/tokens_consumed": 156051177472, "data/tokens_consumed_B": 156.051177472, "train/loss_slope": 1.2066399067541942e-05} {"step": 74420, "timestamp": 1778274978.7006361, "train/loss": 2.0810141444206236, "train/z_loss": 0.001394668489228934, "train/perplexity": 8.012590719431424, "train/grad_norm": 0.1025390625, "optim/muon_lr": 0.018776370882987975, "optim/adamw_lr": 0.0005632911264896392, "perf/tokens_per_sec": 1950589.2802299666, "perf/iters_per_sec": 0.9301134492063363, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0751376628875733, "data/tokens_consumed": 156072148992, "data/tokens_consumed_B": 156.072148992, "train/loss_slope": 1.0210804012205867e-05} {"step": 74430, "timestamp": 1778274989.5768256, "train/loss": 2.1293097734451294, "train/z_loss": 0.00137042875867337, "train/perplexity": 8.40906065092477, "train/grad_norm": 0.16796875, "optim/muon_lr": 0.018757774233818053, "optim/adamw_lr": 0.0005627332270145415, "perf/tokens_per_sec": 1929238.9191735303, "perf/iters_per_sec": 0.9199328037135746, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0870359182357787, "data/tokens_consumed": 156093120512, "data/tokens_consumed_B": 156.093120512, "train/loss_slope": 1.038993736877119e-05} {"step": 74440, "timestamp": 1778274999.9290016, "train/loss": 2.1435878753662108, "train/z_loss": 0.0013832671218551695, "train/perplexity": 8.529987323257593, "train/grad_norm": 0.1357421875, "optim/muon_lr": 0.018739046454429628, "optim/adamw_lr": 0.0005621713936328887, "perf/tokens_per_sec": 2026924.027885903, "perf/iters_per_sec": 0.9665126933507456, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034647560119629, "data/tokens_consumed": 156114092032, "data/tokens_consumed_B": 156.114092032, "train/loss_slope": 9.846512269158257e-06} {"step": 74450, "timestamp": 1778275010.271425, "grad/layer_0/attn": 0.0027963118627667427, "grad/layer_0/mlp": 0.0031593216117471457, "grad/layer_0/attn_mlp_ratio": 0.8850988021793913, "grad/layer_4/attn": 0.0025937953032553196, "grad/layer_4/mlp": 0.0025251638144254684, "grad/layer_4/attn_mlp_ratio": 1.0271789836841012, "grad/layer_8/attn": 0.0029375848826020956, "grad/layer_8/mlp": 0.003485466819256544, "grad/layer_8/attn_mlp_ratio": 0.84280957204686, "grad/layer_12/attn": 0.008036324754357338, "grad/layer_12/mlp": 0.00744680967181921, "grad/layer_12/attn_mlp_ratio": 1.0791634271052575, "grad/layer_16/attn": 0.004150573164224625, "grad/layer_16/mlp": 0.0050157951191067696, "grad/layer_16/attn_mlp_ratio": 0.8275005224323679, "grad/layer_20/attn": 0.004438878037035465, "grad/layer_20/mlp": 0.005489233881235123, "grad/layer_20/attn_mlp_ratio": 0.8086516355851681, "grad/layer_24/attn": 0.012342013418674469, "grad/layer_24/mlp": 0.007845212705433369, "grad/layer_24/attn_mlp_ratio": 1.573190393271008, "grad/layer_27/attn": 0.004185742698609829, "grad/layer_27/mlp": 0.006621704902499914, "grad/layer_27/attn_mlp_ratio": 0.6321246109619156} {"step": 74450, "timestamp": 1778275010.2876172, "train/loss": 2.136374258995056, "train/z_loss": 0.001392213930375874, "train/perplexity": 8.46867666869209, "train/grad_norm": 0.11376953125, "optim/muon_lr": 0.01872018814086914, "optim/adamw_lr": 0.0005616056442260741, "perf/tokens_per_sec": 2025523.5424917578, "perf/iters_per_sec": 0.9658448898752965, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0353629350662232, "data/tokens_consumed": 156135063552, "data/tokens_consumed_B": 156.135063552, "train/loss_slope": 1.2390894314708402e-05} {"step": 74460, "timestamp": 1778275020.6436284, "train/loss": 2.107636260986328, "train/z_loss": 0.0013799419510178268, "train/perplexity": 8.228767619306275, "train/grad_norm": 0.08935546875, "optim/muon_lr": 0.018701199889183045, "optim/adamw_lr": 0.0005610359966754913, "perf/tokens_per_sec": 2026149.4492410014, "perf/iters_per_sec": 0.966143345470906, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350430965423585, "data/tokens_consumed": 156156035072, "data/tokens_consumed_B": 156.156035072, "train/loss_slope": 1.2272088491197733e-05} {"step": 74470, "timestamp": 1778275030.9948714, "train/loss": 2.115764093399048, "train/z_loss": 0.0013834118843078613, "train/perplexity": 8.295922204391, "train/grad_norm": 0.2275390625, "optim/muon_lr": 0.01868208050727844, "optim/adamw_lr": 0.0005604624152183533, "perf/tokens_per_sec": 2026941.823538422, "perf/iters_per_sec": 0.9665211789791212, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034638476371765, "data/tokens_consumed": 156177006592, "data/tokens_consumed_B": 156.177006592, "train/loss_slope": 1.2334518721609399e-05} {"step": 74475, "timestamp": 1778275037.309153, "eos/sharpness": 83.06081295013426, "eos/L0_probe": 1.9625775814056396, "eos/L_plus": 2.319237232208252, "eos/L_minus": 2.43652606010437, "eos/grad_norm": 0.19581352174282074, "eos/embed_grad_frac": 0.05489927530288696, "eos/time_s": 0.6072077751159668} {"step": 74475, "timestamp": 1778275038.693493, "geo/rankme_last": 438.67938232421875, "geo/layer_0/stable_rank_q_proj": 19.15900421142578, "geo/layer_0/stable_rank_k_proj": 15.78918170928955, "geo/layer_0/stable_rank_o_proj": 46.66606903076172, "geo/layer_0/stable_rank_gate_proj": 128.9730987548828, "geo/layer_0/stable_rank_down_proj": 56.46706008911133, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06490084528923035, "geo/layer_0/attn_entropy_mean": 6.147942543029785, "geo/layer_0/attn_entropy_std": 0.4315829575061798, "geo/layer_7/stable_rank_q_proj": 42.77873992919922, "geo/layer_7/stable_rank_k_proj": 40.09711456298828, "geo/layer_7/stable_rank_o_proj": 88.35983276367188, "geo/layer_7/stable_rank_gate_proj": 77.79022216796875, "geo/layer_7/stable_rank_down_proj": 139.79722595214844, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4593997895717621, "geo/layer_7/attn_entropy_mean": 4.638071060180664, "geo/layer_7/attn_entropy_std": 0.7757560014724731, "geo/layer_14/stable_rank_q_proj": 49.768775939941406, "geo/layer_14/stable_rank_k_proj": 41.50251770019531, "geo/layer_14/stable_rank_o_proj": 43.39749526977539, "geo/layer_14/stable_rank_gate_proj": 70.80242919921875, "geo/layer_14/stable_rank_down_proj": 125.81548309326172, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39287352561950684, "geo/layer_14/attn_entropy_mean": 5.554006099700928, "geo/layer_14/attn_entropy_std": 0.43126243352890015, "geo/layer_21/stable_rank_q_proj": 39.765357971191406, "geo/layer_21/stable_rank_k_proj": 30.0076847076416, "geo/layer_21/stable_rank_o_proj": 68.17835998535156, "geo/layer_21/stable_rank_gate_proj": 63.97154998779297, "geo/layer_21/stable_rank_down_proj": 49.469154357910156, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14286325871944427, "geo/layer_21/attn_entropy_mean": 5.673723220825195, "geo/layer_21/attn_entropy_std": 0.3049335479736328, "geo/layer_27/stable_rank_q_proj": 44.003868103027344, "geo/layer_27/stable_rank_k_proj": 32.44795227050781, "geo/layer_27/stable_rank_o_proj": 115.24078369140625, "geo/layer_27/stable_rank_gate_proj": 77.53887939453125, "geo/layer_27/stable_rank_down_proj": 127.82491302490234, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09933887422084808, "geo/layer_27/attn_entropy_mean": 4.161161422729492, "geo/layer_27/attn_entropy_std": 0.7807300686836243, "attnres/final_alpha/block_0": 0.23763726651668549, "attnres/block_norm/0": 1.7767484188079834, "attnres/final_alpha/block_1": 0.004026865586638451, "attnres/block_norm/1": 48382.2734375, "attnres/final_alpha/block_2": 0.010091196745634079, "attnres/block_norm/2": 29121.75390625, "attnres/final_alpha/block_3": 0.011633518151938915, "attnres/block_norm/3": 62463.765625, "attnres/final_alpha/block_4": 0.01379677839577198, "attnres/block_norm/4": 15839.7587890625, "attnres/final_alpha/block_5": 0.6156820058822632, "attnres/block_norm/5": 6897.68798828125, "attnres/final_alpha/block_6": 0.10713234543800354, "attnres/block_norm/6": 41282.1484375, "geo/tier1_time_s": 1.3617446422576904, "geo/step": 74475.0, "geo/rankme_slope": -3.61788856167467e-05} {"step": 74480, "timestamp": 1778275043.8741603, "train/loss": 2.097920370101929, "train/z_loss": 0.0013905320200137794, "train/perplexity": 8.149204948002769, "train/grad_norm": 0.1318359375, "optim/muon_lr": 0.018662832379341125, "optim/adamw_lr": 0.0005598849713802337, "perf/tokens_per_sec": 1628952.902370955, "perf/iters_per_sec": 0.7767452728132987, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.287423348426819, "data/tokens_consumed": 156197978112, "data/tokens_consumed_B": 156.197978112, "train/loss_slope": 1.0352760744710045e-05} {"step": 74490, "timestamp": 1778275054.224217, "train/loss": 2.1806622982025146, "train/z_loss": 0.0013650379609316588, "train/perplexity": 8.852167092050383, "train/grad_norm": 0.15625, "optim/muon_lr": 0.018643454909324647, "optim/adamw_lr": 0.0005593036472797393, "perf/tokens_per_sec": 2027216.6448969808, "perf/iters_per_sec": 0.9666522240147499, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344982147216797, "data/tokens_consumed": 156218949632, "data/tokens_consumed_B": 156.218949632, "train/loss_slope": 1.6627518779481103e-05} {"step": 74500, "timestamp": 1778275064.9711485, "grad/layer_0/attn": 0.0026166681200265884, "grad/layer_0/mlp": 0.002950426423922181, "grad/layer_0/attn_mlp_ratio": 0.886877913688253, "grad/layer_4/attn": 0.0026462809182703495, "grad/layer_4/mlp": 0.0024516326375305653, "grad/layer_4/attn_mlp_ratio": 1.0793953261269653, "grad/layer_8/attn": 0.004273996688425541, "grad/layer_8/mlp": 0.003595371264964342, "grad/layer_8/attn_mlp_ratio": 1.1887497158358018, "grad/layer_12/attn": 0.004546857438981533, "grad/layer_12/mlp": 0.006931871175765991, "grad/layer_12/attn_mlp_ratio": 0.6559350654530285, "grad/layer_16/attn": 0.004663064144551754, "grad/layer_16/mlp": 0.004675194155424833, "grad/layer_16/attn_mlp_ratio": 0.9974054316868216, "grad/layer_20/attn": 0.003777236444875598, "grad/layer_20/mlp": 0.005871361121535301, "grad/layer_20/attn_mlp_ratio": 0.6433323214761925, "grad/layer_24/attn": 0.01126419473439455, "grad/layer_24/mlp": 0.009837793186306953, "grad/layer_24/attn_mlp_ratio": 1.1449920125962576, "grad/layer_27/attn": 0.005378487054258585, "grad/layer_27/mlp": 0.008543010801076889, "grad/layer_27/attn_mlp_ratio": 0.6295774541948203} {"step": 74500, "timestamp": 1778275064.9869988, "train/loss": 2.1443171977996824, "train/z_loss": 0.0013858109712600708, "train/perplexity": 8.536210703519236, "train/grad_norm": 0.1513671875, "optim/muon_lr": 0.018623947501182556, "optim/adamw_lr": 0.0005587184250354767, "perf/tokens_per_sec": 1949430.417563092, "perf/iters_per_sec": 0.9295608604255161, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.075776791572571, "data/tokens_consumed": 156239921152, "data/tokens_consumed_B": 156.239921152, "train/loss_slope": 1.4735068296334642e-05} {"step": 74500, "timestamp": 1778275071.9938672, "geo/ww_alpha_mean": 7.8684555353842, "geo/ww_alpha_std": 4.75115325957858, "geo/ww_alpha_min": 1.3423825299855896, "geo/ww_alpha_max": 31.710481077546405, "geo/ww_alpha_healthy_frac": 0.17258883248730963, "geo/ww_alpha_by_type/q_proj": 3.9325318024649927, "geo/ww_alpha_by_type/k_proj": 4.516062155351784, "geo/ww_alpha_by_type/v_proj": 9.022166975207993, "geo/ww_alpha_by_type/o_proj": 9.147557176896608, "geo/ww_alpha_by_type/gate_proj": 8.099090004409971, "geo/ww_alpha_by_type/up_proj": 12.508140630163668, "geo/ww_alpha_by_type/down_proj": 7.9485283783551495, "geo/twonn_id/layer_0": 0.7285126447677612, "geo/twonn_id/layer_7": 3.3982651233673096, "geo/twonn_id/layer_14": 4.449829578399658, "geo/twonn_id/layer_21": 8.156532287597656, "geo/twonn_id/layer_27": 5.580106258392334, "geo/tier2_time_s": 7.000183582305908} {"step": 74500, "timestamp": 1778275072.6630328, "eoc/jacobian_sigma/layer_0/attn": 1317.3094482421875, "eoc/jacobian_sigma/layer_0/mlp": 9634.9873046875, "eoc/jacobian_sigma/layer_0": 9634.9873046875, "eoc/jacobian_sigma/layer_7/attn": 1.1487782001495361, "eoc/jacobian_sigma/layer_7/mlp": 1.7477866411209106, "eoc/jacobian_sigma/layer_7": 1.7477866411209106, "eoc/jacobian_sigma/layer_14/attn": 1.4009578227996826, "eoc/jacobian_sigma/layer_14/mlp": 6.884512424468994, "eoc/jacobian_sigma/layer_14": 6.884512424468994, "eoc/jacobian_sigma/layer_21/attn": 1.1006816625595093, "eoc/jacobian_sigma/layer_21/mlp": 3.857440710067749, "eoc/jacobian_sigma/layer_21": 3.857440710067749, "eoc/jacobian_sigma/layer_27/attn": 3.090895175933838, "eoc/jacobian_sigma/layer_27/mlp": 24.47948455810547, "eoc/jacobian_sigma/layer_27": 24.47948455810547, "eoc/layer0_sigma": 9634.9873046875, "eoc/sigma_max": 24.47948455810547, "eoc/sigma_min": 1.7477866411209106, "eoc/sigma_mean": 9.24230608344078, "eoc/time_s": 0.6621687412261963} {"step": 74510, "timestamp": 1778275083.0301578, "train/loss": 2.1139224648475645, "train/z_loss": 0.0013871444039978086, "train/perplexity": 8.280658256773473, "train/grad_norm": 0.12060546875, "optim/muon_lr": 0.01860431134700775, "optim/adamw_lr": 0.0005581293404102325, "perf/tokens_per_sec": 1162667.6471702124, "perf/iters_per_sec": 0.5544031368113577, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.8037415981292724, "data/tokens_consumed": 156260892672, "data/tokens_consumed_B": 156.260892672, "train/loss_slope": 1.623112801993317e-05} {"step": 74520, "timestamp": 1778275093.37859, "train/loss": 2.103713774681091, "train/z_loss": 0.0013852255535311997, "train/perplexity": 8.196553611815625, "train/grad_norm": 0.10009765625, "optim/muon_lr": 0.01858454704284668, "optim/adamw_lr": 0.0005575364112854003, "perf/tokens_per_sec": 2027597.2574727384, "perf/iters_per_sec": 0.9668337142337505, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343040227890015, "data/tokens_consumed": 156281864192, "data/tokens_consumed_B": 156.281864192, "train/loss_slope": 1.3796574290436074e-05} {"step": 74530, "timestamp": 1778275103.7421548, "train/loss": 2.103179121017456, "train/z_loss": 0.0013807179289869963, "train/perplexity": 8.19217246570016, "train/grad_norm": 0.1318359375, "optim/muon_lr": 0.018564653992652894, "optim/adamw_lr": 0.0005569396197795867, "perf/tokens_per_sec": 2024415.2628820182, "perf/iters_per_sec": 0.9653164209756938, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0359297513961792, "data/tokens_consumed": 156302835712, "data/tokens_consumed_B": 156.302835712, "train/loss_slope": 1.4883157586750926e-05} {"step": 74540, "timestamp": 1778275114.0910113, "train/loss": 2.188283920288086, "train/z_loss": 0.0013705438817851246, "train/perplexity": 8.919892726014874, "train/grad_norm": 0.185546875, "optim/muon_lr": 0.01854463279247284, "optim/adamw_lr": 0.0005563389837741851, "perf/tokens_per_sec": 2027381.863202311, "perf/iters_per_sec": 0.9667310062419467, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344139099121095, "data/tokens_consumed": 156323807232, "data/tokens_consumed_B": 156.323807232, "train/loss_slope": 2.0815070319955398e-05} {"step": 74550, "timestamp": 1778275124.4370346, "grad/layer_0/attn": 0.0025783409364521503, "grad/layer_0/mlp": 0.0029498005751520395, "grad/layer_0/attn_mlp_ratio": 0.8740729359007475, "grad/layer_4/attn": 0.0020164342131465673, "grad/layer_4/mlp": 0.002466541714966297, "grad/layer_4/attn_mlp_ratio": 0.8175147086140601, "grad/layer_8/attn": 0.003578771138563752, "grad/layer_8/mlp": 0.0035192063078284264, "grad/layer_8/attn_mlp_ratio": 1.016925614423418, "grad/layer_12/attn": 0.005819748621433973, "grad/layer_12/mlp": 0.007108877878636122, "grad/layer_12/attn_mlp_ratio": 0.8186592369321446, "grad/layer_16/attn": 0.0065023391507565975, "grad/layer_16/mlp": 0.005097529385238886, "grad/layer_16/attn_mlp_ratio": 1.2755863736709467, "grad/layer_20/attn": 0.0032564944121986628, "grad/layer_20/mlp": 0.006186529994010925, "grad/layer_20/attn_mlp_ratio": 0.5263846393232969, "grad/layer_24/attn": 0.013054470531642437, "grad/layer_24/mlp": 0.009745805524289608, "grad/layer_24/attn_mlp_ratio": 1.3394962956275875, "grad/layer_27/attn": 0.00393797317519784, "grad/layer_27/mlp": 0.009437494911253452, "grad/layer_27/attn_mlp_ratio": 0.4172689013877225} {"step": 74550, "timestamp": 1778275125.0626795, "eos/sharpness": 60.33549308776854, "eos/L0_probe": 1.96153724193573, "eos/L_plus": 2.236985445022583, "eos/L_minus": 2.2894439697265625, "eos/grad_norm": 0.15036523342132568, "eos/embed_grad_frac": 0.09996902942657471, "eos/time_s": 0.6228287220001221} {"step": 74550, "timestamp": 1778275125.082117, "train/loss": 2.1232754230499267, "train/z_loss": 0.0013751883641816675, "train/perplexity": 8.358470226151397, "train/grad_norm": 0.150390625, "optim/muon_lr": 0.018524484634399416, "optim/adamw_lr": 0.0005557345390319824, "perf/tokens_per_sec": 1908904.0975525617, "perf/iters_per_sec": 0.9102364051592644, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0986156940460206, "data/tokens_consumed": 156344778752, "data/tokens_consumed_B": 156.344778752, "train/loss_slope": 2.1701152146559977e-05} {"step": 74550, "timestamp": 1778275126.446494, "geo/rankme_last": 438.55224609375, "geo/layer_0/stable_rank_q_proj": 19.18092155456543, "geo/layer_0/stable_rank_k_proj": 15.782721519470215, "geo/layer_0/stable_rank_o_proj": 46.64712142944336, "geo/layer_0/stable_rank_gate_proj": 129.06109619140625, "geo/layer_0/stable_rank_down_proj": 56.46023941040039, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06167467311024666, "geo/layer_0/attn_entropy_mean": 6.1497602462768555, "geo/layer_0/attn_entropy_std": 0.42874041199684143, "geo/layer_7/stable_rank_q_proj": 42.83422088623047, "geo/layer_7/stable_rank_k_proj": 40.03721618652344, "geo/layer_7/stable_rank_o_proj": 88.31604766845703, "geo/layer_7/stable_rank_gate_proj": 77.566650390625, "geo/layer_7/stable_rank_down_proj": 139.9172821044922, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.45262610912323, "geo/layer_7/attn_entropy_mean": 4.651792526245117, "geo/layer_7/attn_entropy_std": 0.8043878078460693, "geo/layer_14/stable_rank_q_proj": 49.772361755371094, "geo/layer_14/stable_rank_k_proj": 41.52671813964844, "geo/layer_14/stable_rank_o_proj": 43.38145065307617, "geo/layer_14/stable_rank_gate_proj": 70.77014923095703, "geo/layer_14/stable_rank_down_proj": 125.77029418945312, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.40725454688072205, "geo/layer_14/attn_entropy_mean": 5.560616970062256, "geo/layer_14/attn_entropy_std": 0.42673853039741516, "geo/layer_21/stable_rank_q_proj": 39.78956604003906, "geo/layer_21/stable_rank_k_proj": 29.94930648803711, "geo/layer_21/stable_rank_o_proj": 68.1263198852539, "geo/layer_21/stable_rank_gate_proj": 63.91978454589844, "geo/layer_21/stable_rank_down_proj": 49.41371536254883, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14306646585464478, "geo/layer_21/attn_entropy_mean": 5.692594528198242, "geo/layer_21/attn_entropy_std": 0.30722323060035706, "geo/layer_27/stable_rank_q_proj": 43.9841194152832, "geo/layer_27/stable_rank_k_proj": 32.47172927856445, "geo/layer_27/stable_rank_o_proj": 115.18122100830078, "geo/layer_27/stable_rank_gate_proj": 77.50652313232422, "geo/layer_27/stable_rank_down_proj": 127.95722198486328, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09796232730150223, "geo/layer_27/attn_entropy_mean": 4.1558051109313965, "geo/layer_27/attn_entropy_std": 0.7873824238777161, "attnres/final_alpha/block_0": 0.2373301386833191, "attnres/block_norm/0": 1.7768163681030273, "attnres/final_alpha/block_1": 0.003950635902583599, "attnres/block_norm/1": 48468.1953125, "attnres/final_alpha/block_2": 0.00996265560388565, "attnres/block_norm/2": 29287.8359375, "attnres/final_alpha/block_3": 0.011800536885857582, "attnres/block_norm/3": 62279.4453125, "attnres/final_alpha/block_4": 0.01379782147705555, "attnres/block_norm/4": 15842.1962890625, "attnres/final_alpha/block_5": 0.6161212921142578, "attnres/block_norm/5": 6902.2724609375, "attnres/final_alpha/block_6": 0.10703691095113754, "attnres/block_norm/6": 41293.09375, "geo/tier1_time_s": 1.3606953620910645, "geo/step": 74550.0, "geo/rankme_slope": -2.604721576130452e-05} {"step": 74560, "timestamp": 1778275137.3516393, "train/loss": 2.089727258682251, "train/z_loss": 0.0013950144755654037, "train/perplexity": 8.082710374571555, "train/grad_norm": 0.240234375, "optim/muon_lr": 0.018504208326339724, "optim/adamw_lr": 0.0005551262497901916, "perf/tokens_per_sec": 1709758.5384650298, "perf/iters_per_sec": 0.8152764026951932, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.226577877998352, "data/tokens_consumed": 156365750272, "data/tokens_consumed_B": 156.365750272, "train/loss_slope": 2.0686308092231145e-05} {"step": 74570, "timestamp": 1778275147.6998978, "train/loss": 2.141069006919861, "train/z_loss": 0.0013773639220744371, "train/perplexity": 8.508528444733123, "train/grad_norm": 0.08349609375, "optim/muon_lr": 0.018483805060386656, "optim/adamw_lr": 0.0005545141518115997, "perf/tokens_per_sec": 2027635.1162605695, "perf/iters_per_sec": 0.966851766710553, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342847108840942, "data/tokens_consumed": 156386721792, "data/tokens_consumed_B": 156.386721792, "train/loss_slope": 1.8451845177365153e-05} {"step": 74580, "timestamp": 1778275158.0485501, "train/loss": 2.1363183736801146, "train/z_loss": 0.0013712685671634973, "train/perplexity": 8.468203407253627, "train/grad_norm": 0.1318359375, "optim/muon_lr": 0.01846327543258667, "optim/adamw_lr": 0.0005538982629776, "perf/tokens_per_sec": 2027402.8444898662, "perf/iters_per_sec": 0.96674101089948, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344032049179077, "data/tokens_consumed": 156407693312, "data/tokens_consumed_B": 156.407693312, "train/loss_slope": 1.6238089892515814e-05} {"step": 74590, "timestamp": 1778275168.4057224, "train/loss": 2.0719247341156004, "train/z_loss": 0.0013877102290280163, "train/perplexity": 7.94009098384234, "train/grad_norm": 0.1455078125, "optim/muon_lr": 0.01844261884689331, "optim/adamw_lr": 0.0005532785654067992, "perf/tokens_per_sec": 2025951.907415622, "perf/iters_per_sec": 0.9660491501882658, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351440191268921, "data/tokens_consumed": 156428664832, "data/tokens_consumed_B": 156.428664832, "train/loss_slope": 1.1750303071574496e-05} {"step": 74600, "timestamp": 1778275178.7510862, "grad/layer_0/attn": 0.002630437957122922, "grad/layer_0/mlp": 0.0028830058872699738, "grad/layer_0/attn_mlp_ratio": 0.9123942054708607, "grad/layer_4/attn": 0.0025080132763832808, "grad/layer_4/mlp": 0.0026436951011419296, "grad/layer_4/attn_mlp_ratio": 0.9486771679654894, "grad/layer_8/attn": 0.004275834187865257, "grad/layer_8/mlp": 0.003608287777751684, "grad/layer_8/attn_mlp_ratio": 1.1850036174301923, "grad/layer_12/attn": 0.004312271252274513, "grad/layer_12/mlp": 0.006626454181969166, "grad/layer_12/attn_mlp_ratio": 0.6507660158477766, "grad/layer_16/attn": 0.0033284707460552454, "grad/layer_16/mlp": 0.00466508511453867, "grad/layer_16/attn_mlp_ratio": 0.7134855191245201, "grad/layer_20/attn": 0.0043898578733205795, "grad/layer_20/mlp": 0.006798900198191404, "grad/layer_20/attn_mlp_ratio": 0.645671752899265, "grad/layer_24/attn": 0.013708824291825294, "grad/layer_24/mlp": 0.01205417513847351, "grad/layer_24/attn_mlp_ratio": 1.1372677118605854, "grad/layer_27/attn": 0.006339132785797119, "grad/layer_27/mlp": 0.01130107045173645, "grad/layer_27/attn_mlp_ratio": 0.5609320600890407} {"step": 74600, "timestamp": 1778275178.7667882, "train/loss": 2.1371931433677673, "train/z_loss": 0.0013689141254872085, "train/perplexity": 8.475614375875939, "train/grad_norm": 0.203125, "optim/muon_lr": 0.018421835899353027, "optim/adamw_lr": 0.0005526550769805907, "perf/tokens_per_sec": 2025138.1595855998, "perf/iters_per_sec": 0.965661124985504, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0355599641799926, "data/tokens_consumed": 156449636352, "data/tokens_consumed_B": 156.449636352, "train/loss_slope": 1.0255181564070049e-05} {"step": 74610, "timestamp": 1778275189.1224482, "train/loss": 2.11486337184906, "train/z_loss": 0.0013813224737532436, "train/perplexity": 8.28845325271231, "train/grad_norm": 0.103515625, "optim/muon_lr": 0.018400927782058717, "optim/adamw_lr": 0.0005520278334617614, "perf/tokens_per_sec": 2026109.219079058, "perf/iters_per_sec": 0.9661241622348108, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350636482238769, "data/tokens_consumed": 156470607872, "data/tokens_consumed_B": 156.470607872, "train/loss_slope": 1.0211687992186322e-05} {"step": 74620, "timestamp": 1778275199.5096183, "train/loss": 2.1094708681106566, "train/z_loss": 0.0013791272649541497, "train/perplexity": 8.243878031601927, "train/grad_norm": 0.09228515625, "optim/muon_lr": 0.01837989330291748, "optim/adamw_lr": 0.0005513967990875243, "perf/tokens_per_sec": 2020124.2813908744, "perf/iters_per_sec": 0.9632703215555546, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0381301879882812, "data/tokens_consumed": 156491579392, "data/tokens_consumed_B": 156.491579392, "train/loss_slope": 9.575178055944904e-06} {"step": 74625, "timestamp": 1778275205.310441, "eos/sharpness": 8.948028087615965, "eos/L0_probe": 1.9608006477355957, "eos/L_plus": 2.01320219039917, "eos/L_minus": 1.9978793859481812, "eos/grad_norm": 0.08594527095556259, "eos/embed_grad_frac": 0.2658683657646179, "eos/time_s": 0.620739221572876} {"step": 74625, "timestamp": 1778275206.6898074, "geo/rankme_last": 438.6020202636719, "geo/layer_0/stable_rank_q_proj": 19.15261459350586, "geo/layer_0/stable_rank_k_proj": 15.7791109085083, "geo/layer_0/stable_rank_o_proj": 46.66072082519531, "geo/layer_0/stable_rank_gate_proj": 129.11251831054688, "geo/layer_0/stable_rank_down_proj": 56.48611831665039, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06225413456559181, "geo/layer_0/attn_entropy_mean": 6.147123336791992, "geo/layer_0/attn_entropy_std": 0.4282229244709015, "geo/layer_7/stable_rank_q_proj": 42.827354431152344, "geo/layer_7/stable_rank_k_proj": 40.1143684387207, "geo/layer_7/stable_rank_o_proj": 88.32047271728516, "geo/layer_7/stable_rank_gate_proj": 77.53079223632812, "geo/layer_7/stable_rank_down_proj": 139.75926208496094, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4353342354297638, "geo/layer_7/attn_entropy_mean": 4.645009517669678, "geo/layer_7/attn_entropy_std": 0.7950364351272583, "geo/layer_14/stable_rank_q_proj": 49.696571350097656, "geo/layer_14/stable_rank_k_proj": 41.574859619140625, "geo/layer_14/stable_rank_o_proj": 43.38115310668945, "geo/layer_14/stable_rank_gate_proj": 70.69073486328125, "geo/layer_14/stable_rank_down_proj": 125.59525299072266, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4029786288738251, "geo/layer_14/attn_entropy_mean": 5.536832332611084, "geo/layer_14/attn_entropy_std": 0.4303138554096222, "geo/layer_21/stable_rank_q_proj": 39.751861572265625, "geo/layer_21/stable_rank_k_proj": 30.004209518432617, "geo/layer_21/stable_rank_o_proj": 68.07379150390625, "geo/layer_21/stable_rank_gate_proj": 63.86614227294922, "geo/layer_21/stable_rank_down_proj": 49.384151458740234, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14192934334278107, "geo/layer_21/attn_entropy_mean": 5.6960248947143555, "geo/layer_21/attn_entropy_std": 0.29084154963493347, "geo/layer_27/stable_rank_q_proj": 43.89537811279297, "geo/layer_27/stable_rank_k_proj": 32.495819091796875, "geo/layer_27/stable_rank_o_proj": 115.20376586914062, "geo/layer_27/stable_rank_gate_proj": 77.44686889648438, "geo/layer_27/stable_rank_down_proj": 128.24786376953125, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09447210282087326, "geo/layer_27/attn_entropy_mean": 4.177772045135498, "geo/layer_27/attn_entropy_std": 0.7716077566146851, "attnres/final_alpha/block_0": 0.23731693625450134, "attnres/block_norm/0": 1.777061104774475, "attnres/final_alpha/block_1": 0.003917134366929531, "attnres/block_norm/1": 48471.1640625, "attnres/final_alpha/block_2": 0.009940418414771557, "attnres/block_norm/2": 29158.677734375, "attnres/final_alpha/block_3": 0.011923154816031456, "attnres/block_norm/3": 62174.2265625, "attnres/final_alpha/block_4": 0.014077424071729183, "attnres/block_norm/4": 15843.296875, "attnres/final_alpha/block_5": 0.6166613101959229, "attnres/block_norm/5": 6831.8232421875, "attnres/final_alpha/block_6": 0.10616365075111389, "attnres/block_norm/6": 41463.578125, "geo/tier1_time_s": 1.3589987754821777, "geo/step": 74625.0, "geo/rankme_slope": -7.06933163890556e-06} {"step": 74630, "timestamp": 1778275211.8826327, "train/loss": 2.153245759010315, "train/z_loss": 0.0013744082069024443, "train/perplexity": 8.612768048171176, "train/grad_norm": 0.12060546875, "optim/muon_lr": 0.018358734250068665, "optim/adamw_lr": 0.0005507620275020599, "perf/tokens_per_sec": 1695715.026306824, "perf/iters_per_sec": 0.8085799342664833, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2367361068725586, "data/tokens_consumed": 156512550912, "data/tokens_consumed_B": 156.512550912, "train/loss_slope": 1.2124284637821965e-05} {"step": 74640, "timestamp": 1778275222.2662616, "train/loss": 2.1250006437301634, "train/z_loss": 0.0013860128587111832, "train/perplexity": 8.372902878015667, "train/grad_norm": 0.134765625, "optim/muon_lr": 0.01833745002746582, "optim/adamw_lr": 0.0005501235008239745, "perf/tokens_per_sec": 2020660.6512311818, "perf/iters_per_sec": 0.9635260826259526, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0378546237945556, "data/tokens_consumed": 156533522432, "data/tokens_consumed_B": 156.533522432, "train/loss_slope": 7.040048682793348e-06} {"step": 74650, "timestamp": 1778275232.630091, "grad/layer_0/attn": 0.003084785072132945, "grad/layer_0/mlp": 0.0032570024486631155, "grad/layer_0/attn_mlp_ratio": 0.9471239355950583, "grad/layer_4/attn": 0.00224094046279788, "grad/layer_4/mlp": 0.002627671230584383, "grad/layer_4/attn_mlp_ratio": 0.852823729023791, "grad/layer_8/attn": 0.003958196844905615, "grad/layer_8/mlp": 0.0035868363920599222, "grad/layer_8/attn_mlp_ratio": 1.1035342295830217, "grad/layer_12/attn": 0.003646701807156205, "grad/layer_12/mlp": 0.0072278776206076145, "grad/layer_12/attn_mlp_ratio": 0.5045328584847232, "grad/layer_16/attn": 0.00342307286337018, "grad/layer_16/mlp": 0.004662139806896448, "grad/layer_16/attn_mlp_ratio": 0.7342278292220739, "grad/layer_20/attn": 0.002957264892756939, "grad/layer_20/mlp": 0.00597766786813736, "grad/layer_20/attn_mlp_ratio": 0.49471882823201074, "grad/layer_24/attn": 0.00557539751753211, "grad/layer_24/mlp": 0.008666235022246838, "grad/layer_24/attn_mlp_ratio": 0.643347132738145, "grad/layer_27/attn": 0.010284924879670143, "grad/layer_27/mlp": 0.007873643189668655, "grad/layer_27/attn_mlp_ratio": 1.3062472481024678} {"step": 74650, "timestamp": 1778275232.6458416, "train/loss": 2.1837800264358522, "train/z_loss": 0.0013876280398108065, "train/perplexity": 8.879808810612364, "train/grad_norm": 0.09228515625, "optim/muon_lr": 0.018316041231155395, "optim/adamw_lr": 0.0005494812369346618, "perf/tokens_per_sec": 2021572.5485228589, "perf/iters_per_sec": 0.963960909139089, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373864650726319, "data/tokens_consumed": 156554493952, "data/tokens_consumed_B": 156.554493952, "train/loss_slope": 6.38928502807018e-06} {"step": 74660, "timestamp": 1778275243.0219245, "train/loss": 2.1242560148239136, "train/z_loss": 0.0013759731082245708, "train/perplexity": 8.366670493198393, "train/grad_norm": 0.12109375, "optim/muon_lr": 0.018294507861137392, "optim/adamw_lr": 0.0005488352358341217, "perf/tokens_per_sec": 2022155.5702608062, "perf/iters_per_sec": 0.9642389155677825, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037087368965149, "data/tokens_consumed": 156575465472, "data/tokens_consumed_B": 156.575465472, "train/loss_slope": 4.76646831791711e-06} {"step": 74670, "timestamp": 1778275253.4104166, "train/loss": 2.1121054530143737, "train/z_loss": 0.001386811037082225, "train/perplexity": 8.26562586388801, "train/grad_norm": 0.1328125, "optim/muon_lr": 0.018272850513458252, "optim/adamw_lr": 0.0005481855154037475, "perf/tokens_per_sec": 2019691.2353234107, "perf/iters_per_sec": 0.9630638290993742, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0383527755737305, "data/tokens_consumed": 156596436992, "data/tokens_consumed_B": 156.596436992, "train/loss_slope": 5.7412834522282255e-06} {"step": 74680, "timestamp": 1778275263.7898922, "train/loss": 2.0986053347587585, "train/z_loss": 0.0013879476813599468, "train/perplexity": 8.154788777518029, "train/grad_norm": 0.07666015625, "optim/muon_lr": 0.01825106978416443, "optim/adamw_lr": 0.0005475320935249328, "perf/tokens_per_sec": 2021776.6259736707, "perf/iters_per_sec": 0.9640582208507875, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0372817516326904, "data/tokens_consumed": 156617408512, "data/tokens_consumed_B": 156.617408512, "train/loss_slope": 7.881002529154062e-07} {"step": 74690, "timestamp": 1778275274.1622314, "train/loss": 2.124059820175171, "train/z_loss": 0.001377908606082201, "train/perplexity": 8.365029158235666, "train/grad_norm": 0.23828125, "optim/muon_lr": 0.018229165673255922, "optim/adamw_lr": 0.0005468749701976775, "perf/tokens_per_sec": 2022747.9987822247, "perf/iters_per_sec": 0.9645214075003742, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0367836236953736, "data/tokens_consumed": 156638380032, "data/tokens_consumed_B": 156.638380032, "train/loss_slope": 2.3634740979399756e-06} {"step": 74700, "timestamp": 1778275284.539628, "grad/layer_0/attn": 0.002601881045848131, "grad/layer_0/mlp": 0.002897818572819233, "grad/layer_0/attn_mlp_ratio": 0.8978757264052034, "grad/layer_4/attn": 0.0021419639233499765, "grad/layer_4/mlp": 0.0027000720147043467, "grad/layer_4/attn_mlp_ratio": 0.7932987832750971, "grad/layer_8/attn": 0.0034080399200320244, "grad/layer_8/mlp": 0.003840052057057619, "grad/layer_8/attn_mlp_ratio": 0.8874983413359111, "grad/layer_12/attn": 0.005828384775668383, "grad/layer_12/mlp": 0.008190681226551533, "grad/layer_12/attn_mlp_ratio": 0.7115872957692359, "grad/layer_16/attn": 0.003710012184455991, "grad/layer_16/mlp": 0.004802579991519451, "grad/layer_16/attn_mlp_ratio": 0.772503969482414, "grad/layer_20/attn": 0.003922078758478165, "grad/layer_20/mlp": 0.006356076803058386, "grad/layer_20/attn_mlp_ratio": 0.6170596766365364, "grad/layer_24/attn": 0.007187599316239357, "grad/layer_24/mlp": 0.007466276176273823, "grad/layer_24/attn_mlp_ratio": 0.962675241348885, "grad/layer_27/attn": 0.004760358948260546, "grad/layer_27/mlp": 0.006288946606218815, "grad/layer_27/attn_mlp_ratio": 0.7569405769575488} {"step": 74700, "timestamp": 1778275285.1355207, "eos/sharpness": 26.680254936218258, "eos/L0_probe": 1.9584490060806274, "eos/L_plus": 2.1062123775482178, "eos/L_minus": 2.0774881839752197, "eos/grad_norm": 0.0955786406993866, "eos/embed_grad_frac": 0.2146284282207489, "eos/time_s": 0.5928092002868652} {"step": 74700, "timestamp": 1778275285.1562061, "train/loss": 2.146745705604553, "train/z_loss": 0.0013788370764814318, "train/perplexity": 8.556966150017965, "train/grad_norm": 0.095703125, "optim/muon_lr": 0.018207138776779177, "optim/adamw_lr": 0.0005462141633033752, "perf/tokens_per_sec": 1908804.2234489953, "perf/iters_per_sec": 0.9101887814755417, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0986731767654419, "data/tokens_consumed": 156659351552, "data/tokens_consumed_B": 156.659351552, "train/loss_slope": 2.0063813656183383e-06} {"step": 74700, "timestamp": 1778275286.5257692, "geo/rankme_last": 438.2311706542969, "geo/layer_0/stable_rank_q_proj": 19.132741928100586, "geo/layer_0/stable_rank_k_proj": 15.75895881652832, "geo/layer_0/stable_rank_o_proj": 46.62016296386719, "geo/layer_0/stable_rank_gate_proj": 129.09217834472656, "geo/layer_0/stable_rank_down_proj": 56.44713592529297, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.059612471610307693, "geo/layer_0/attn_entropy_mean": 6.148237228393555, "geo/layer_0/attn_entropy_std": 0.4291875958442688, "geo/layer_7/stable_rank_q_proj": 42.7942008972168, "geo/layer_7/stable_rank_k_proj": 40.10136413574219, "geo/layer_7/stable_rank_o_proj": 88.40950012207031, "geo/layer_7/stable_rank_gate_proj": 77.5684814453125, "geo/layer_7/stable_rank_down_proj": 139.91705322265625, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.43361151218414307, "geo/layer_7/attn_entropy_mean": 4.630433082580566, "geo/layer_7/attn_entropy_std": 0.7788375020027161, "geo/layer_14/stable_rank_q_proj": 49.761505126953125, "geo/layer_14/stable_rank_k_proj": 41.636505126953125, "geo/layer_14/stable_rank_o_proj": 43.408267974853516, "geo/layer_14/stable_rank_gate_proj": 70.61129760742188, "geo/layer_14/stable_rank_down_proj": 125.82674407958984, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39999696612358093, "geo/layer_14/attn_entropy_mean": 5.5714311599731445, "geo/layer_14/attn_entropy_std": 0.4338652193546295, "geo/layer_21/stable_rank_q_proj": 39.75665283203125, "geo/layer_21/stable_rank_k_proj": 29.96627426147461, "geo/layer_21/stable_rank_o_proj": 68.04910278320312, "geo/layer_21/stable_rank_gate_proj": 63.8934326171875, "geo/layer_21/stable_rank_down_proj": 49.33234786987305, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14239062368869781, "geo/layer_21/attn_entropy_mean": 5.686501502990723, "geo/layer_21/attn_entropy_std": 0.2974931001663208, "geo/layer_27/stable_rank_q_proj": 43.891239166259766, "geo/layer_27/stable_rank_k_proj": 32.45664596557617, "geo/layer_27/stable_rank_o_proj": 115.39348602294922, "geo/layer_27/stable_rank_gate_proj": 77.48933410644531, "geo/layer_27/stable_rank_down_proj": 128.2506103515625, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09879095107316971, "geo/layer_27/attn_entropy_mean": 4.161624431610107, "geo/layer_27/attn_entropy_std": 0.7657209634780884, "attnres/final_alpha/block_0": 0.2361190915107727, "attnres/block_norm/0": 1.777166485786438, "attnres/final_alpha/block_1": 0.003893475513905287, "attnres/block_norm/1": 48354.97265625, "attnres/final_alpha/block_2": 0.009893910959362984, "attnres/block_norm/2": 29001.232421875, "attnres/final_alpha/block_3": 0.0118190823122859, "attnres/block_norm/3": 62171.0859375, "attnres/final_alpha/block_4": 0.013821439817547798, "attnres/block_norm/4": 15868.962890625, "attnres/final_alpha/block_5": 0.6185669302940369, "attnres/block_norm/5": 6856.9638671875, "attnres/final_alpha/block_6": 0.1058860495686531, "attnres/block_norm/6": 41471.94921875, "geo/tier1_time_s": 1.3648829460144043, "geo/step": 74700.0, "geo/rankme_slope": -1.6095129458033212e-05} {"step": 74710, "timestamp": 1778275296.900137, "train/loss": 2.1303624868392945, "train/z_loss": 0.0013622097671031953, "train/perplexity": 8.417917642823399, "train/grad_norm": 0.091796875, "optim/muon_lr": 0.018184988498687743, "optim/adamw_lr": 0.0005455496549606323, "perf/tokens_per_sec": 1786426.0518539406, "perf/iters_per_sec": 0.8518343219060615, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.173937201499939, "data/tokens_consumed": 156680323072, "data/tokens_consumed_B": 156.680323072, "train/loss_slope": 8.005729281003975e-07} {"step": 74720, "timestamp": 1778275307.278067, "train/loss": 2.135229504108429, "train/z_loss": 0.0013683398370631038, "train/perplexity": 8.458987656522446, "train/grad_norm": 0.11474609375, "optim/muon_lr": 0.018162716627120972, "optim/adamw_lr": 0.0005448814988136291, "perf/tokens_per_sec": 2021845.5904184105, "perf/iters_per_sec": 0.9640911056606343, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0372463703155517, "data/tokens_consumed": 156701294592, "data/tokens_consumed_B": 156.701294592, "train/loss_slope": -1.913185410051836e-06} {"step": 74730, "timestamp": 1778275317.655177, "train/loss": 2.103874659538269, "train/z_loss": 0.0013785786810331046, "train/perplexity": 8.197872419258042, "train/grad_norm": 0.1962890625, "optim/muon_lr": 0.01814032256603241, "optim/adamw_lr": 0.0005442096769809723, "perf/tokens_per_sec": 2022156.453530044, "perf/iters_per_sec": 0.9642393367433758, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0370869159698486, "data/tokens_consumed": 156722266112, "data/tokens_consumed_B": 156.722266112, "train/loss_slope": -4.991854149194431e-06} {"step": 74740, "timestamp": 1778275328.0304015, "train/loss": 2.14245982170105, "train/z_loss": 0.0013875341042876243, "train/perplexity": 8.520370464979724, "train/grad_norm": 0.10693359375, "optim/muon_lr": 0.01811780631542206, "optim/adamw_lr": 0.0005435341894626617, "perf/tokens_per_sec": 2022499.8247824565, "perf/iters_per_sec": 0.9644030689155848, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036910843849182, "data/tokens_consumed": 156743237632, "data/tokens_consumed_B": 156.743237632, "train/loss_slope": -6.299247600064471e-06} {"step": 74750, "timestamp": 1778275338.397286, "grad/layer_0/attn": 0.0027310531586408615, "grad/layer_0/mlp": 0.0032031431328505278, "grad/layer_0/attn_mlp_ratio": 0.8526166206468557, "grad/layer_4/attn": 0.002055036835372448, "grad/layer_4/mlp": 0.0027126031927764416, "grad/layer_4/attn_mlp_ratio": 0.7575884173129672, "grad/layer_8/attn": 0.0043845390900969505, "grad/layer_8/mlp": 0.0037250469904392958, "grad/layer_8/attn_mlp_ratio": 1.177042593998424, "grad/layer_12/attn": 0.006081417668610811, "grad/layer_12/mlp": 0.007077716290950775, "grad/layer_12/attn_mlp_ratio": 0.8592344384392425, "grad/layer_16/attn": 0.006647107657045126, "grad/layer_16/mlp": 0.004779573995620012, "grad/layer_16/attn_mlp_ratio": 1.390732212549339, "grad/layer_20/attn": 0.005439810920506716, "grad/layer_20/mlp": 0.005642589647322893, "grad/layer_20/attn_mlp_ratio": 0.9640628087639396, "grad/layer_24/attn": 0.00858650729060173, "grad/layer_24/mlp": 0.00894101895391941, "grad/layer_24/attn_mlp_ratio": 0.96034995997886, "grad/layer_27/attn": 0.006100187078118324, "grad/layer_27/mlp": 0.0074151260778307915, "grad/layer_27/attn_mlp_ratio": 0.822668007505553} {"step": 74750, "timestamp": 1778275338.4124596, "train/loss": 2.104444181919098, "train/z_loss": 0.001394774450454861, "train/perplexity": 8.202542620841918, "train/grad_norm": 0.146484375, "optim/muon_lr": 0.01809516966342926, "optim/adamw_lr": 0.0005428550899028778, "perf/tokens_per_sec": 2021097.503207677, "perf/iters_per_sec": 0.96373438988098, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037630295753479, "data/tokens_consumed": 156764209152, "data/tokens_consumed_B": 156.764209152, "train/loss_slope": -1.1654033855457731e-05} {"step": 74760, "timestamp": 1778275348.7951438, "train/loss": 2.121389377117157, "train/z_loss": 0.0013943764381110669, "train/perplexity": 8.342720624282126, "train/grad_norm": 0.08740234375, "optim/muon_lr": 0.01807241141796112, "optim/adamw_lr": 0.0005421723425388336, "perf/tokens_per_sec": 2021044.6567813752, "perf/iters_per_sec": 0.9637091907412411, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0376574277877808, "data/tokens_consumed": 156785180672, "data/tokens_consumed_B": 156.785180672, "train/loss_slope": -1.0941815133070506e-05} {"step": 74770, "timestamp": 1778275359.1729712, "train/loss": 2.1739374160766602, "train/z_loss": 0.0013846349203959108, "train/perplexity": 8.79283702923272, "train/grad_norm": 0.1005859375, "optim/muon_lr": 0.01804953217506409, "optim/adamw_lr": 0.0005414859652519225, "perf/tokens_per_sec": 2021888.6723478318, "perf/iters_per_sec": 0.9641116487254294, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0372242689132691, "data/tokens_consumed": 156806152192, "data/tokens_consumed_B": 156.806152192, "train/loss_slope": -9.54340688108579e-06} {"step": 74775, "timestamp": 1778275364.9396002, "eos/sharpness": 6.328308582305907, "eos/L0_probe": 1.9586552381515503, "eos/L_plus": 1.9964312314987183, "eos/L_minus": 1.9841623306274414, "eos/grad_norm": 0.08847039192914963, "eos/embed_grad_frac": 0.289035439491272, "eos/time_s": 0.5879790782928467} {"step": 74775, "timestamp": 1778275366.3158295, "geo/rankme_last": 438.498291015625, "geo/layer_0/stable_rank_q_proj": 19.110807418823242, "geo/layer_0/stable_rank_k_proj": 15.763569831848145, "geo/layer_0/stable_rank_o_proj": 46.633968353271484, "geo/layer_0/stable_rank_gate_proj": 129.25160217285156, "geo/layer_0/stable_rank_down_proj": 56.44868469238281, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06645876914262772, "geo/layer_0/attn_entropy_mean": 6.146637439727783, "geo/layer_0/attn_entropy_std": 0.4306207001209259, "geo/layer_7/stable_rank_q_proj": 42.83134460449219, "geo/layer_7/stable_rank_k_proj": 40.080326080322266, "geo/layer_7/stable_rank_o_proj": 88.43111419677734, "geo/layer_7/stable_rank_gate_proj": 77.58877563476562, "geo/layer_7/stable_rank_down_proj": 139.8928680419922, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4322574734687805, "geo/layer_7/attn_entropy_mean": 4.63644552230835, "geo/layer_7/attn_entropy_std": 0.7827532887458801, "geo/layer_14/stable_rank_q_proj": 49.823970794677734, "geo/layer_14/stable_rank_k_proj": 41.5663948059082, "geo/layer_14/stable_rank_o_proj": 43.432430267333984, "geo/layer_14/stable_rank_gate_proj": 70.64289093017578, "geo/layer_14/stable_rank_down_proj": 125.91544342041016, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3991614580154419, "geo/layer_14/attn_entropy_mean": 5.541104316711426, "geo/layer_14/attn_entropy_std": 0.4149312973022461, "geo/layer_21/stable_rank_q_proj": 39.7641487121582, "geo/layer_21/stable_rank_k_proj": 30.021799087524414, "geo/layer_21/stable_rank_o_proj": 68.00607299804688, "geo/layer_21/stable_rank_gate_proj": 63.95640563964844, "geo/layer_21/stable_rank_down_proj": 49.307708740234375, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15072748064994812, "geo/layer_21/attn_entropy_mean": 5.690834045410156, "geo/layer_21/attn_entropy_std": 0.2985714077949524, "geo/layer_27/stable_rank_q_proj": 43.91551208496094, "geo/layer_27/stable_rank_k_proj": 32.482452392578125, "geo/layer_27/stable_rank_o_proj": 115.33792114257812, "geo/layer_27/stable_rank_gate_proj": 77.44961547851562, "geo/layer_27/stable_rank_down_proj": 128.0889129638672, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09659986197948456, "geo/layer_27/attn_entropy_mean": 4.16584587097168, "geo/layer_27/attn_entropy_std": 0.7856796383857727, "attnres/final_alpha/block_0": 0.2360447347164154, "attnres/block_norm/0": 1.7772836685180664, "attnres/final_alpha/block_1": 0.003992326557636261, "attnres/block_norm/1": 48376.5078125, "attnres/final_alpha/block_2": 0.009959522634744644, "attnres/block_norm/2": 29202.046875, "attnres/final_alpha/block_3": 0.01181207038462162, "attnres/block_norm/3": 62460.6640625, "attnres/final_alpha/block_4": 0.013719653710722923, "attnres/block_norm/4": 15840.166015625, "attnres/final_alpha/block_5": 0.6178402900695801, "attnres/block_norm/5": 6897.55517578125, "attnres/final_alpha/block_6": 0.10663145780563354, "attnres/block_norm/6": 41271.58984375, "geo/tier1_time_s": 1.3570778369903564, "geo/step": 74775.0, "geo/rankme_slope": -3.1653403548919565e-05} {"step": 74780, "timestamp": 1778275371.5060797, "train/loss": 2.1319698810577394, "train/z_loss": 0.0013774010236375033, "train/perplexity": 8.431459435557814, "train/grad_norm": 0.1875, "optim/muon_lr": 0.018026533126831053, "optim/adamw_lr": 0.0005407959938049316, "perf/tokens_per_sec": 1701189.4995416836, "perf/iters_per_sec": 0.8111903665264528, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.23275625705719, "data/tokens_consumed": 156827123712, "data/tokens_consumed_B": 156.827123712, "train/loss_slope": -9.242636492901185e-06} {"step": 74790, "timestamp": 1778275381.8836997, "train/loss": 2.1078574895858764, "train/z_loss": 0.00137886080192402, "train/perplexity": 8.23058825942411, "train/grad_norm": 0.14453125, "optim/muon_lr": 0.018003414273262023, "optim/adamw_lr": 0.0005401024281978606, "perf/tokens_per_sec": 2021881.8869383237, "perf/iters_per_sec": 0.9641084131900424, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037227749824524, "data/tokens_consumed": 156848095232, "data/tokens_consumed_B": 156.848095232, "train/loss_slope": -1.2017907883145978e-05} {"step": 74800, "timestamp": 1778275392.2573636, "grad/layer_0/attn": 0.0028120744973421097, "grad/layer_0/mlp": 0.0029277962166815996, "grad/layer_0/attn_mlp_ratio": 0.9604747711853636, "grad/layer_4/attn": 0.0025251826737076044, "grad/layer_4/mlp": 0.0026845899410545826, "grad/layer_4/attn_mlp_ratio": 0.9406213369977494, "grad/layer_8/attn": 0.003597448579967022, "grad/layer_8/mlp": 0.0037037834990769625, "grad/layer_8/attn_mlp_ratio": 0.9712901641617396, "grad/layer_12/attn": 0.005702872760593891, "grad/layer_12/mlp": 0.006347668822854757, "grad/layer_12/attn_mlp_ratio": 0.8984200073921185, "grad/layer_16/attn": 0.0030079579446464777, "grad/layer_16/mlp": 0.004316720180213451, "grad/layer_16/attn_mlp_ratio": 0.6968155797432726, "grad/layer_20/attn": 0.002728966297581792, "grad/layer_20/mlp": 0.00516672944650054, "grad/layer_20/attn_mlp_ratio": 0.5281805972271452, "grad/layer_24/attn": 0.004952762275934219, "grad/layer_24/mlp": 0.007090632803738117, "grad/layer_24/attn_mlp_ratio": 0.698493680771877, "grad/layer_27/attn": 0.00516555504873395, "grad/layer_27/mlp": 0.006411898415535688, "grad/layer_27/attn_mlp_ratio": 0.8056202131425014} {"step": 74800, "timestamp": 1778275392.2722907, "train/loss": 2.143522620201111, "train/z_loss": 0.0013678264222107828, "train/perplexity": 8.529430715687472, "train/grad_norm": 0.080078125, "optim/muon_lr": 0.017980175614356993, "optim/adamw_lr": 0.0005394052684307098, "perf/tokens_per_sec": 2019668.0946646724, "perf/iters_per_sec": 0.9630527947734224, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0383646726608275, "data/tokens_consumed": 156869066752, "data/tokens_consumed_B": 156.869066752, "train/loss_slope": -9.551457910969815e-06} {"step": 74810, "timestamp": 1778275402.6549237, "train/loss": 2.1506306648254396, "train/z_loss": 0.0013570543960668147, "train/perplexity": 8.590274273022063, "train/grad_norm": 0.09228515625, "optim/muon_lr": 0.017956817746162416, "optim/adamw_lr": 0.0005387045323848724, "perf/tokens_per_sec": 2021484.740987022, "perf/iters_per_sec": 0.9639190392432317, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037431526184082, "data/tokens_consumed": 156890038272, "data/tokens_consumed_B": 156.890038272, "train/loss_slope": -5.5185874732378475e-06} {"step": 74820, "timestamp": 1778275413.0316346, "train/loss": 2.074436569213867, "train/z_loss": 0.0013858796562999486, "train/perplexity": 7.960060252314336, "train/grad_norm": 0.08837890625, "optim/muon_lr": 0.017933340668678285, "optim/adamw_lr": 0.0005380002200603485, "perf/tokens_per_sec": 2022157.0578725988, "perf/iters_per_sec": 0.9642396249163622, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0370866060256958, "data/tokens_consumed": 156911009792, "data/tokens_consumed_B": 156.911009792, "train/loss_slope": -6.6735485098650386e-06} {"step": 74830, "timestamp": 1778275423.4155583, "train/loss": 2.1050952196121218, "train/z_loss": 0.001382902020122856, "train/perplexity": 8.207884523968183, "train/grad_norm": 0.07958984375, "optim/muon_lr": 0.017909744977951052, "optim/adamw_lr": 0.0005372923493385315, "perf/tokens_per_sec": 2020724.5723891307, "perf/iters_per_sec": 0.963556562609258, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0378217935562133, "data/tokens_consumed": 156931981312, "data/tokens_consumed_B": 156.931981312, "train/loss_slope": -7.270795319697678e-06} {"step": 74840, "timestamp": 1778275433.8008482, "train/loss": 2.0918226838111877, "train/z_loss": 0.0013924634316936136, "train/perplexity": 8.09966484620966, "train/grad_norm": 0.08349609375, "optim/muon_lr": 0.01788603127002716, "optim/adamw_lr": 0.0005365809381008147, "perf/tokens_per_sec": 2021311.6570420987, "perf/iters_per_sec": 0.963836506386804, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0375203609466552, "data/tokens_consumed": 156952952832, "data/tokens_consumed_B": 156.952952832, "train/loss_slope": -7.3296039912066064e-06} {"step": 74850, "timestamp": 1778275444.1691415, "grad/layer_0/attn": 0.002724379999563098, "grad/layer_0/mlp": 0.002951045287773013, "grad/layer_0/attn_mlp_ratio": 0.9231914937164125, "grad/layer_4/attn": 0.002029628260061145, "grad/layer_4/mlp": 0.0024743671528995037, "grad/layer_4/attn_mlp_ratio": 0.8202615265307902, "grad/layer_8/attn": 0.004148539621382952, "grad/layer_8/mlp": 0.0035408884286880493, "grad/layer_8/attn_mlp_ratio": 1.171609777537969, "grad/layer_12/attn": 0.0043029217049479485, "grad/layer_12/mlp": 0.007081498391926289, "grad/layer_12/attn_mlp_ratio": 0.607628697493019, "grad/layer_16/attn": 0.003653218038380146, "grad/layer_16/mlp": 0.004708610009402037, "grad/layer_16/attn_mlp_ratio": 0.7758591077833118, "grad/layer_20/attn": 0.005452196579426527, "grad/layer_20/mlp": 0.005982478614896536, "grad/layer_20/attn_mlp_ratio": 0.9113607986352561, "grad/layer_24/attn": 0.010900226421654224, "grad/layer_24/mlp": 0.008673182688653469, "grad/layer_24/attn_mlp_ratio": 1.2567735152445125, "grad/layer_27/attn": 0.0039329626597464085, "grad/layer_27/mlp": 0.008502155542373657, "grad/layer_27/attn_mlp_ratio": 0.46258417572892047} {"step": 74850, "timestamp": 1778275444.7672317, "eos/sharpness": 77.144718170166, "eos/L0_probe": 1.9541478157043457, "eos/L_plus": 2.2888007164001465, "eos/L_minus": 2.390942096710205, "eos/grad_norm": 0.16141663491725922, "eos/embed_grad_frac": 0.08059419691562653, "eos/time_s": 0.5949501991271973} {"step": 74850, "timestamp": 1778275444.786016, "train/loss": 2.194969630241394, "train/z_loss": 0.001384478760883212, "train/perplexity": 8.97972834049618, "train/grad_norm": 0.162109375, "optim/muon_lr": 0.017862200140953063, "optim/adamw_lr": 0.0005358660042285918, "perf/tokens_per_sec": 1909990.0897420938, "perf/iters_per_sec": 0.9107542465887517, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0979910373687745, "data/tokens_consumed": 156973924352, "data/tokens_consumed_B": 156.973924352, "train/loss_slope": -2.996305957271122e-06} {"step": 74850, "timestamp": 1778275446.148132, "geo/rankme_last": 438.5143737792969, "geo/layer_0/stable_rank_q_proj": 19.0750789642334, "geo/layer_0/stable_rank_k_proj": 15.757311820983887, "geo/layer_0/stable_rank_o_proj": 46.64473342895508, "geo/layer_0/stable_rank_gate_proj": 129.3137664794922, "geo/layer_0/stable_rank_down_proj": 56.36431121826172, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06018060818314552, "geo/layer_0/attn_entropy_mean": 6.142864227294922, "geo/layer_0/attn_entropy_std": 0.4325692355632782, "geo/layer_7/stable_rank_q_proj": 42.78144836425781, "geo/layer_7/stable_rank_k_proj": 40.0764274597168, "geo/layer_7/stable_rank_o_proj": 88.44271087646484, "geo/layer_7/stable_rank_gate_proj": 77.58049011230469, "geo/layer_7/stable_rank_down_proj": 139.7461700439453, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.438571035861969, "geo/layer_7/attn_entropy_mean": 4.638195037841797, "geo/layer_7/attn_entropy_std": 0.7987411022186279, "geo/layer_14/stable_rank_q_proj": 49.71215057373047, "geo/layer_14/stable_rank_k_proj": 41.50709915161133, "geo/layer_14/stable_rank_o_proj": 43.35452651977539, "geo/layer_14/stable_rank_gate_proj": 70.63666534423828, "geo/layer_14/stable_rank_down_proj": 125.76317596435547, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4041174054145813, "geo/layer_14/attn_entropy_mean": 5.508098602294922, "geo/layer_14/attn_entropy_std": 0.4433741569519043, "geo/layer_21/stable_rank_q_proj": 39.77610397338867, "geo/layer_21/stable_rank_k_proj": 30.10287094116211, "geo/layer_21/stable_rank_o_proj": 68.11614227294922, "geo/layer_21/stable_rank_gate_proj": 64.06995391845703, "geo/layer_21/stable_rank_down_proj": 49.319366455078125, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14862890541553497, "geo/layer_21/attn_entropy_mean": 5.677924156188965, "geo/layer_21/attn_entropy_std": 0.3038254678249359, "geo/layer_27/stable_rank_q_proj": 43.91916275024414, "geo/layer_27/stable_rank_k_proj": 32.42341995239258, "geo/layer_27/stable_rank_o_proj": 115.4439697265625, "geo/layer_27/stable_rank_gate_proj": 77.36983489990234, "geo/layer_27/stable_rank_down_proj": 127.89654541015625, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09818486124277115, "geo/layer_27/attn_entropy_mean": 4.147433280944824, "geo/layer_27/attn_entropy_std": 0.7742683291435242, "attnres/final_alpha/block_0": 0.23746973276138306, "attnres/block_norm/0": 1.7773997783660889, "attnres/final_alpha/block_1": 0.004067578352987766, "attnres/block_norm/1": 48353.17578125, "attnres/final_alpha/block_2": 0.009935523383319378, "attnres/block_norm/2": 29127.59765625, "attnres/final_alpha/block_3": 0.011800529435276985, "attnres/block_norm/3": 62435.35546875, "attnres/final_alpha/block_4": 0.013569026254117489, "attnres/block_norm/4": 15836.884765625, "attnres/final_alpha/block_5": 0.6148890852928162, "attnres/block_norm/5": 6882.302734375, "attnres/final_alpha/block_6": 0.10826851427555084, "attnres/block_norm/6": 41282.2265625, "geo/tier1_time_s": 1.3579344749450684, "geo/step": 74850.0, "geo/rankme_slope": -4.370537277410964e-06} {"step": 74860, "timestamp": 1778275456.5385506, "train/loss": 2.13315327167511, "train/z_loss": 0.001393904397264123, "train/perplexity": 8.441443051638295, "train/grad_norm": 0.2099609375, "optim/muon_lr": 0.017838250398635864, "optim/adamw_lr": 0.0005351475119590759, "perf/tokens_per_sec": 1785026.7351820706, "perf/iters_per_sec": 0.8511670757208207, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1748574733734132, "data/tokens_consumed": 156994895872, "data/tokens_consumed_B": 156.994895872, "train/loss_slope": -2.525700429807038e-06} {"step": 74870, "timestamp": 1778275466.9175868, "train/loss": 2.1684547424316407, "train/z_loss": 0.0013757699984125794, "train/perplexity": 8.74476068726414, "train/grad_norm": 0.12255859375, "optim/muon_lr": 0.01781418442726135, "optim/adamw_lr": 0.0005344255328178406, "perf/tokens_per_sec": 2021688.289579008, "perf/iters_per_sec": 0.9640160987753906, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373270750045775, "data/tokens_consumed": 157015867392, "data/tokens_consumed_B": 157.015867392, "train/loss_slope": 1.9033762774171553e-07} {"step": 74880, "timestamp": 1778275477.301499, "train/loss": 2.117085802555084, "train/z_loss": 0.001378192484844476, "train/perplexity": 8.306894250055644, "train/grad_norm": 0.1455078125, "optim/muon_lr": 0.017790001630783082, "optim/adamw_lr": 0.0005337000489234923, "perf/tokens_per_sec": 2020683.1647579307, "perf/iters_per_sec": 0.9635368179120687, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0378430604934692, "data/tokens_consumed": 157036838912, "data/tokens_consumed_B": 157.036838912, "train/loss_slope": -3.5822193507420825e-06} {"step": 74890, "timestamp": 1778275487.6802027, "train/loss": 2.1250263452529907, "train/z_loss": 0.0013758548418991267, "train/perplexity": 8.373118077135578, "train/grad_norm": 0.138671875, "optim/muon_lr": 0.01776570200920105, "optim/adamw_lr": 0.0005329710602760315, "perf/tokens_per_sec": 2021884.7219326475, "perf/iters_per_sec": 0.9641097650206792, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0372262954711915, "data/tokens_consumed": 157057810432, "data/tokens_consumed_B": 157.057810432, "train/loss_slope": -6.52667977521623e-06} {"step": 74900, "timestamp": 1778275498.0465257, "grad/layer_0/attn": 0.00237083388492465, "grad/layer_0/mlp": 0.002876519924029708, "grad/layer_0/attn_mlp_ratio": 0.824202114054244, "grad/layer_4/attn": 0.0026357173919677734, "grad/layer_4/mlp": 0.0026279205922037363, "grad/layer_4/attn_mlp_ratio": 1.0029668702663546, "grad/layer_8/attn": 0.003617477836087346, "grad/layer_8/mlp": 0.0038223385345190763, "grad/layer_8/attn_mlp_ratio": 0.9464043304322497, "grad/layer_12/attn": 0.004852851387113333, "grad/layer_12/mlp": 0.007416540291160345, "grad/layer_12/attn_mlp_ratio": 0.6543281814924609, "grad/layer_16/attn": 0.0034756490495055914, "grad/layer_16/mlp": 0.004335035569965839, "grad/layer_16/attn_mlp_ratio": 0.8017578894646051, "grad/layer_20/attn": 0.003006560495123267, "grad/layer_20/mlp": 0.005657410714775324, "grad/layer_20/attn_mlp_ratio": 0.5314375415819381, "grad/layer_24/attn": 0.008389565162360668, "grad/layer_24/mlp": 0.00864492915570736, "grad/layer_24/attn_mlp_ratio": 0.970460823241775, "grad/layer_27/attn": 0.006586022209376097, "grad/layer_27/mlp": 0.008007509633898735, "grad/layer_27/attn_mlp_ratio": 0.8224806997729945} {"step": 74900, "timestamp": 1778275498.061934, "train/loss": 2.142472565174103, "train/z_loss": 0.0013763411086983978, "train/perplexity": 8.520479044782984, "train/grad_norm": 0.11669921875, "optim/muon_lr": 0.017741286158561707, "optim/adamw_lr": 0.0005322385847568511, "perf/tokens_per_sec": 2020962.0955767566, "perf/iters_per_sec": 0.9636698224910529, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037699818611145, "data/tokens_consumed": 157078781952, "data/tokens_consumed_B": 157.078781952, "train/loss_slope": -5.597839783234394e-06} {"step": 74910, "timestamp": 1778275508.4441555, "train/loss": 2.147050619125366, "train/z_loss": 0.0013865278335288167, "train/perplexity": 8.559575682514899, "train/grad_norm": 0.09423828125, "optim/muon_lr": 0.017716755270957948, "optim/adamw_lr": 0.0005315026581287384, "perf/tokens_per_sec": 2021029.1934552065, "perf/iters_per_sec": 0.9637018172527344, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0376653671264648, "data/tokens_consumed": 157099753472, "data/tokens_consumed_B": 157.099753472, "train/loss_slope": -6.618365796521446e-06} {"step": 74920, "timestamp": 1778275518.8244002, "train/loss": 2.1496939659118652, "train/z_loss": 0.0013853393495082856, "train/perplexity": 8.582231539840993, "train/grad_norm": 0.1201171875, "optim/muon_lr": 0.017692108154296875, "optim/adamw_lr": 0.0005307632446289062, "perf/tokens_per_sec": 2021411.806135329, "perf/iters_per_sec": 0.9638842611958165, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037468957901001, "data/tokens_consumed": 157120724992, "data/tokens_consumed_B": 157.120724992, "train/loss_slope": -2.7586604609633285e-06} {"step": 74925, "timestamp": 1778275524.5933046, "eos/sharpness": 13.06643486022949, "eos/L0_probe": 1.9574438333511353, "eos/L_plus": 2.0263195037841797, "eos/L_minus": 2.0192325115203857, "eos/grad_norm": 0.09491376578807831, "eos/embed_grad_frac": 0.24180395901203156, "eos/time_s": 0.5928452014923096} {"step": 74925, "timestamp": 1778275525.965866, "geo/rankme_last": 438.4054870605469, "geo/layer_0/stable_rank_q_proj": 19.077051162719727, "geo/layer_0/stable_rank_k_proj": 15.733613967895508, "geo/layer_0/stable_rank_o_proj": 46.57408905029297, "geo/layer_0/stable_rank_gate_proj": 129.2810516357422, "geo/layer_0/stable_rank_down_proj": 56.36990737915039, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.07044296711683273, "geo/layer_0/attn_entropy_mean": 6.144550323486328, "geo/layer_0/attn_entropy_std": 0.43348830938339233, "geo/layer_7/stable_rank_q_proj": 42.75447082519531, "geo/layer_7/stable_rank_k_proj": 40.18687438964844, "geo/layer_7/stable_rank_o_proj": 88.47427368164062, "geo/layer_7/stable_rank_gate_proj": 77.5493392944336, "geo/layer_7/stable_rank_down_proj": 139.8556671142578, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.44444790482521057, "geo/layer_7/attn_entropy_mean": 4.647672653198242, "geo/layer_7/attn_entropy_std": 0.7802598476409912, "geo/layer_14/stable_rank_q_proj": 49.68075180053711, "geo/layer_14/stable_rank_k_proj": 41.52007293701172, "geo/layer_14/stable_rank_o_proj": 43.303993225097656, "geo/layer_14/stable_rank_gate_proj": 70.73316955566406, "geo/layer_14/stable_rank_down_proj": 125.95682525634766, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39379554986953735, "geo/layer_14/attn_entropy_mean": 5.5272698402404785, "geo/layer_14/attn_entropy_std": 0.42712828516960144, "geo/layer_21/stable_rank_q_proj": 39.83346176147461, "geo/layer_21/stable_rank_k_proj": 30.075597763061523, "geo/layer_21/stable_rank_o_proj": 68.09920501708984, "geo/layer_21/stable_rank_gate_proj": 64.04422760009766, "geo/layer_21/stable_rank_down_proj": 49.35501480102539, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14221681654453278, "geo/layer_21/attn_entropy_mean": 5.685172080993652, "geo/layer_21/attn_entropy_std": 0.2930624485015869, "geo/layer_27/stable_rank_q_proj": 43.98421096801758, "geo/layer_27/stable_rank_k_proj": 32.44240188598633, "geo/layer_27/stable_rank_o_proj": 115.52255249023438, "geo/layer_27/stable_rank_gate_proj": 77.31149291992188, "geo/layer_27/stable_rank_down_proj": 127.92768096923828, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09731736034154892, "geo/layer_27/attn_entropy_mean": 4.1449809074401855, "geo/layer_27/attn_entropy_std": 0.7849740982055664, "attnres/final_alpha/block_0": 0.23798878490924835, "attnres/block_norm/0": 1.7774345874786377, "attnres/final_alpha/block_1": 0.00397100206464529, "attnres/block_norm/1": 48502.18359375, "attnres/final_alpha/block_2": 0.009928852319717407, "attnres/block_norm/2": 29173.4453125, "attnres/final_alpha/block_3": 0.01185908354818821, "attnres/block_norm/3": 62540.3046875, "attnres/final_alpha/block_4": 0.013894145376980305, "attnres/block_norm/4": 15793.521484375, "attnres/final_alpha/block_5": 0.6160233020782471, "attnres/block_norm/5": 6812.017578125, "attnres/final_alpha/block_6": 0.10633478313684464, "attnres/block_norm/6": 41132.53515625, "geo/tier1_time_s": 1.3535873889923096, "geo/step": 74925.0, "geo/rankme_slope": 2.0649568421118448e-05} {"step": 74930, "timestamp": 1778275531.1558774, "train/loss": 2.128348445892334, "train/z_loss": 0.001384988089557737, "train/perplexity": 8.40098067360258, "train/grad_norm": 0.24609375, "optim/muon_lr": 0.017667346596717835, "optim/adamw_lr": 0.000530020397901535, "perf/tokens_per_sec": 1701297.785239305, "perf/iters_per_sec": 0.8112420011707807, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2326777935028077, "data/tokens_consumed": 157141696512, "data/tokens_consumed_B": 157.141696512, "train/loss_slope": -3.0179933281299015e-06} {"step": 74940, "timestamp": 1778275541.5342364, "train/loss": 2.0801279187202453, "train/z_loss": 0.0013811369077302515, "train/perplexity": 8.005492901208264, "train/grad_norm": 0.16796875, "optim/muon_lr": 0.017642470598220824, "optim/adamw_lr": 0.0005292741179466247, "perf/tokens_per_sec": 2022033.3146483433, "perf/iters_per_sec": 0.9641806195489613, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0371500730514527, "data/tokens_consumed": 157162668032, "data/tokens_consumed_B": 157.162668032, "train/loss_slope": -4.970801574061474e-06} {"step": 74950, "timestamp": 1778275551.8980293, "grad/layer_0/attn": 0.002747239312157035, "grad/layer_0/mlp": 0.002994655165821314, "grad/layer_0/attn_mlp_ratio": 0.917380822932077, "grad/layer_4/attn": 0.00326324999332428, "grad/layer_4/mlp": 0.002491199178621173, "grad/layer_4/attn_mlp_ratio": 1.3099112629522038, "grad/layer_8/attn": 0.004424622282385826, "grad/layer_8/mlp": 0.003545318264514208, "grad/layer_8/attn_mlp_ratio": 1.2480183237344056, "grad/layer_12/attn": 0.004053880926221609, "grad/layer_12/mlp": 0.006632817443460226, "grad/layer_12/attn_mlp_ratio": 0.6111853521764101, "grad/layer_16/attn": 0.004107664804905653, "grad/layer_16/mlp": 0.004396002274006605, "grad/layer_16/attn_mlp_ratio": 0.9344091416315239, "grad/layer_20/attn": 0.002772085601463914, "grad/layer_20/mlp": 0.0055316779762506485, "grad/layer_20/attn_mlp_ratio": 0.5011292347914114, "grad/layer_24/attn": 0.006459417287260294, "grad/layer_24/mlp": 0.007513898424804211, "grad/layer_24/attn_mlp_ratio": 0.8596625661016106, "grad/layer_27/attn": 0.010761869139969349, "grad/layer_27/mlp": 0.006745233666151762, "grad/layer_27/attn_mlp_ratio": 1.5954775643171115} {"step": 74950, "timestamp": 1778275551.9131405, "train/loss": 2.1376577973365785, "train/z_loss": 0.001388908305671066, "train/perplexity": 8.479553518832132, "train/grad_norm": 0.1083984375, "optim/muon_lr": 0.01761748015880585, "optim/adamw_lr": 0.0005285244047641754, "perf/tokens_per_sec": 2021914.5130914247, "perf/iters_per_sec": 0.9641239705521701, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037211012840271, "data/tokens_consumed": 157183639552, "data/tokens_consumed_B": 157.183639552, "train/loss_slope": -4.839758160519827e-06} {"step": 74960, "timestamp": 1778275562.291054, "train/loss": 2.1059346199035645, "train/z_loss": 0.0013818961568176746, "train/perplexity": 8.214777117047344, "train/grad_norm": 0.0908203125, "optim/muon_lr": 0.01759237587451935, "optim/adamw_lr": 0.0005277712762355804, "perf/tokens_per_sec": 2021738.7532968116, "perf/iters_per_sec": 0.9640401617511805, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373011827468872, "data/tokens_consumed": 157204611072, "data/tokens_consumed_B": 157.204611072, "train/loss_slope": -9.08848811583652e-06} {"step": 74970, "timestamp": 1778275572.6841846, "train/loss": 2.1518264532089235, "train/z_loss": 0.0013748942874372005, "train/perplexity": 8.60055256731639, "train/grad_norm": 0.150390625, "optim/muon_lr": 0.017567158341407776, "optim/adamw_lr": 0.0005270147502422333, "perf/tokens_per_sec": 2018957.9946202545, "perf/iters_per_sec": 0.9627141926862023, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0387298822402955, "data/tokens_consumed": 157225582592, "data/tokens_consumed_B": 157.225582592, "train/loss_slope": -6.073267646569728e-06} {"step": 74980, "timestamp": 1778275583.0664563, "train/loss": 2.12224600315094, "train/z_loss": 0.0013759720837697386, "train/perplexity": 8.349870277813935, "train/grad_norm": 0.11865234375, "optim/muon_lr": 0.017541826963424684, "optim/adamw_lr": 0.0005262548089027405, "perf/tokens_per_sec": 2021033.558454239, "perf/iters_per_sec": 0.9637038986464687, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0376631259918212, "data/tokens_consumed": 157246554112, "data/tokens_consumed_B": 157.246554112, "train/loss_slope": -5.307974494901964e-06} {"step": 74990, "timestamp": 1778275593.4468462, "train/loss": 2.149577236175537, "train/z_loss": 0.0013699079514481126, "train/perplexity": 8.581229796683987, "train/grad_norm": 0.138671875, "optim/muon_lr": 0.017516383528709413, "optim/adamw_lr": 0.0005254915058612823, "perf/tokens_per_sec": 2021420.0284636647, "perf/iters_per_sec": 0.9638881819074939, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0374647378921509, "data/tokens_consumed": 157267525632, "data/tokens_consumed_B": 157.267525632, "train/loss_slope": -4.411487761038753e-06} {"step": 75000, "timestamp": 1778275603.8187594, "grad/layer_0/attn": 0.0023439335636794567, "grad/layer_0/mlp": 0.002668985165655613, "grad/layer_0/attn_mlp_ratio": 0.8782115037655294, "grad/layer_4/attn": 0.0023383095394819975, "grad/layer_4/mlp": 0.0024092765524983406, "grad/layer_4/attn_mlp_ratio": 0.9705442241584201, "grad/layer_8/attn": 0.005419041961431503, "grad/layer_8/mlp": 0.0035463059321045876, "grad/layer_8/attn_mlp_ratio": 1.5280807444064617, "grad/layer_12/attn": 0.004633303731679916, "grad/layer_12/mlp": 0.006978909485042095, "grad/layer_12/attn_mlp_ratio": 0.6639008107527975, "grad/layer_16/attn": 0.003912291023880243, "grad/layer_16/mlp": 0.004507944453507662, "grad/layer_16/attn_mlp_ratio": 0.8678658260860959, "grad/layer_20/attn": 0.0037318775430321693, "grad/layer_20/mlp": 0.0055357906967401505, "grad/layer_20/attn_mlp_ratio": 0.6741363032052715, "grad/layer_24/attn": 0.013042079284787178, "grad/layer_24/mlp": 0.010175411589443684, "grad/layer_24/attn_mlp_ratio": 1.2817249741666445, "grad/layer_27/attn": 0.003561669494956732, "grad/layer_27/mlp": 0.010455172508955002, "grad/layer_27/attn_mlp_ratio": 0.34066099414811307} {"step": 75000, "timestamp": 1778275604.4081907, "eos/sharpness": 60.43808460235594, "eos/L0_probe": 1.9548646211624146, "eos/L_plus": 2.2870380878448486, "eos/L_minus": 2.22707200050354, "eos/grad_norm": 0.154762864112854, "eos/embed_grad_frac": 0.0876586064696312, "eos/time_s": 0.5865552425384521} {"step": 75000, "timestamp": 1778275604.4272304, "train/loss": 2.1585421562194824, "train/z_loss": 0.0013781841611489654, "train/perplexity": 8.65850570430319, "train/grad_norm": 0.154296875, "optim/muon_lr": 0.017490828037261964, "optim/adamw_lr": 0.0005247248411178588, "perf/tokens_per_sec": 1911031.7294410921, "perf/iters_per_sec": 0.9112509391026936, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0973925590515137, "data/tokens_consumed": 157288497152, "data/tokens_consumed_B": 157.288497152, "train/loss_slope": -4.575611992065361e-06} {"step": 75000, "timestamp": 1778275605.7875376, "geo/rankme_last": 438.38037109375, "geo/layer_0/stable_rank_q_proj": 19.090227127075195, "geo/layer_0/stable_rank_k_proj": 15.722993850708008, "geo/layer_0/stable_rank_o_proj": 46.47772979736328, "geo/layer_0/stable_rank_gate_proj": 129.079345703125, "geo/layer_0/stable_rank_down_proj": 56.3978385925293, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06324609369039536, "geo/layer_0/attn_entropy_mean": 6.145001411437988, "geo/layer_0/attn_entropy_std": 0.4319653809070587, "geo/layer_7/stable_rank_q_proj": 42.80112838745117, "geo/layer_7/stable_rank_k_proj": 40.19551467895508, "geo/layer_7/stable_rank_o_proj": 88.38432312011719, "geo/layer_7/stable_rank_gate_proj": 77.46286010742188, "geo/layer_7/stable_rank_down_proj": 139.9268035888672, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.44056937098503113, "geo/layer_7/attn_entropy_mean": 4.629934310913086, "geo/layer_7/attn_entropy_std": 0.7873964309692383, "geo/layer_14/stable_rank_q_proj": 49.67902755737305, "geo/layer_14/stable_rank_k_proj": 41.584068298339844, "geo/layer_14/stable_rank_o_proj": 43.28268814086914, "geo/layer_14/stable_rank_gate_proj": 70.6781005859375, "geo/layer_14/stable_rank_down_proj": 126.22112274169922, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4001654088497162, "geo/layer_14/attn_entropy_mean": 5.541922092437744, "geo/layer_14/attn_entropy_std": 0.4249616861343384, "geo/layer_21/stable_rank_q_proj": 39.847774505615234, "geo/layer_21/stable_rank_k_proj": 30.088369369506836, "geo/layer_21/stable_rank_o_proj": 68.1137466430664, "geo/layer_21/stable_rank_gate_proj": 64.06248474121094, "geo/layer_21/stable_rank_down_proj": 49.36119842529297, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14249704778194427, "geo/layer_21/attn_entropy_mean": 5.698819160461426, "geo/layer_21/attn_entropy_std": 0.29281535744667053, "geo/layer_27/stable_rank_q_proj": 44.07294464111328, "geo/layer_27/stable_rank_k_proj": 32.401981353759766, "geo/layer_27/stable_rank_o_proj": 115.49339294433594, "geo/layer_27/stable_rank_gate_proj": 77.24121856689453, "geo/layer_27/stable_rank_down_proj": 127.75444030761719, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09511927515268326, "geo/layer_27/attn_entropy_mean": 4.164668560028076, "geo/layer_27/attn_entropy_std": 0.7713133692741394, "attnres/final_alpha/block_0": 0.2376912236213684, "attnres/block_norm/0": 1.777586817741394, "attnres/final_alpha/block_1": 0.0040055857971310616, "attnres/block_norm/1": 48400.78125, "attnres/final_alpha/block_2": 0.009889965876936913, "attnres/block_norm/2": 29057.75390625, "attnres/final_alpha/block_3": 0.01178817544132471, "attnres/block_norm/3": 62519.8046875, "attnres/final_alpha/block_4": 0.01371234841644764, "attnres/block_norm/4": 15833.861328125, "attnres/final_alpha/block_5": 0.6162642240524292, "attnres/block_norm/5": 6868.3017578125, "attnres/final_alpha/block_6": 0.10664844512939453, "attnres/block_norm/6": 41288.5859375, "geo/tier1_time_s": 1.3564813137054443, "geo/step": 75000.0, "geo/rankme_slope": 2.5293574461034412e-05} {"step": 75000, "timestamp": 1778275612.8093286, "geo/ww_alpha_mean": 7.7269675505863855, "geo/ww_alpha_std": 4.658355371172287, "geo/ww_alpha_min": 1.335636584686689, "geo/ww_alpha_max": 31.301774335942966, "geo/ww_alpha_healthy_frac": 0.17766497461928935, "geo/ww_alpha_by_type/q_proj": 3.9375857228300966, "geo/ww_alpha_by_type/k_proj": 4.497306806912186, "geo/ww_alpha_by_type/v_proj": 8.251212654383014, "geo/ww_alpha_by_type/o_proj": 8.661837216597263, "geo/ww_alpha_by_type/gate_proj": 8.049303771068812, "geo/ww_alpha_by_type/up_proj": 12.857134916647507, "geo/ww_alpha_by_type/down_proj": 7.9243393621112075, "geo/twonn_id/layer_0": 0.6986151337623596, "geo/twonn_id/layer_7": 2.828461170196533, "geo/twonn_id/layer_14": 5.043117523193359, "geo/twonn_id/layer_21": 7.219174385070801, "geo/twonn_id/layer_27": 4.756063938140869, "geo/tier2_time_s": 7.015143632888794} {"step": 75000, "timestamp": 1778275613.5339043, "eoc/jacobian_sigma/layer_0/attn": 1360.552001953125, "eoc/jacobian_sigma/layer_0/mlp": 9265.193359375, "eoc/jacobian_sigma/layer_0": 9265.193359375, "eoc/jacobian_sigma/layer_7/attn": 1.1488341093063354, "eoc/jacobian_sigma/layer_7/mlp": 1.7064005136489868, "eoc/jacobian_sigma/layer_7": 1.7064005136489868, "eoc/jacobian_sigma/layer_14/attn": 1.3975337743759155, "eoc/jacobian_sigma/layer_14/mlp": 6.324147701263428, "eoc/jacobian_sigma/layer_14": 6.324147701263428, "eoc/jacobian_sigma/layer_21/attn": 1.0838912725448608, "eoc/jacobian_sigma/layer_21/mlp": 4.008772850036621, "eoc/jacobian_sigma/layer_21": 4.008772850036621, "eoc/jacobian_sigma/layer_27/attn": 3.1185054779052734, "eoc/jacobian_sigma/layer_27/mlp": 26.865182876586914, "eoc/jacobian_sigma/layer_27": 26.865182876586914, "eoc/layer0_sigma": 9265.193359375, "eoc/sigma_max": 26.865182876586914, "eoc/sigma_min": 1.7064005136489868, "eoc/sigma_mean": 9.726125985383987, "eoc/time_s": 0.718148946762085} {"step": 75010, "timestamp": 1778275623.9328413, "train/loss": 2.110575294494629, "train/z_loss": 0.0013695611036382616, "train/perplexity": 8.252987817624318, "train/grad_norm": 0.10107421875, "optim/muon_lr": 0.01746515989303589, "optim/adamw_lr": 0.0005239547967910766, "perf/tokens_per_sec": 1075365.1107382067, "perf/iters_per_sec": 0.5127740434351953, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.95017671585083, "data/tokens_consumed": 157309468672, "data/tokens_consumed_B": 157.309468672, "train/loss_slope": -2.0624023495297406e-06} {"step": 75020, "timestamp": 1778275634.311264, "train/loss": 2.1188341856002806, "train/z_loss": 0.001391331886406988, "train/perplexity": 8.321430586960233, "train/grad_norm": 0.359375, "optim/muon_lr": 0.017439380288124084, "optim/adamw_lr": 0.0005231814086437224, "perf/tokens_per_sec": 2022073.9409190542, "perf/iters_per_sec": 0.9641999916644355, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0371292352676391, "data/tokens_consumed": 157330440192, "data/tokens_consumed_B": 157.330440192, "train/loss_slope": -3.160804175700995e-06} {"step": 75030, "timestamp": 1778275644.6882622, "train/loss": 2.1477102756500246, "train/z_loss": 0.0013735400163568557, "train/perplexity": 8.565223925207432, "train/grad_norm": 0.08642578125, "optim/muon_lr": 0.017413489818573, "optim/adamw_lr": 0.0005224046945571899, "perf/tokens_per_sec": 2022441.5573702154, "perf/iters_per_sec": 0.9643752848483159, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0369407176971435, "data/tokens_consumed": 157351411712, "data/tokens_consumed_B": 157.351411712, "train/loss_slope": -3.758236212376943e-06} {"step": 75040, "timestamp": 1778275655.066728, "train/loss": 2.1243141293525696, "train/z_loss": 0.0013860824983567, "train/perplexity": 8.36715673243917, "train/grad_norm": 0.123046875, "optim/muon_lr": 0.017387487888336182, "optim/adamw_lr": 0.0005216246366500855, "perf/tokens_per_sec": 2021642.3818657745, "perf/iters_per_sec": 0.9639942082718728, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037350630760193, "data/tokens_consumed": 157372383232, "data/tokens_consumed_B": 157.372383232, "train/loss_slope": -2.0707525507379298e-06} {"step": 75050, "timestamp": 1778275665.4342082, "grad/layer_0/attn": 0.002474582986906171, "grad/layer_0/mlp": 0.0028772936202585697, "grad/layer_0/attn_mlp_ratio": 0.8600383650383079, "grad/layer_4/attn": 0.003698085667565465, "grad/layer_4/mlp": 0.00262271985411644, "grad/layer_4/attn_mlp_ratio": 1.4100192671204586, "grad/layer_8/attn": 0.00447050342336297, "grad/layer_8/mlp": 0.003611972089856863, "grad/layer_8/attn_mlp_ratio": 1.2376904329211156, "grad/layer_12/attn": 0.0039251623675227165, "grad/layer_12/mlp": 0.006813535932451487, "grad/layer_12/attn_mlp_ratio": 0.5760830131121295, "grad/layer_16/attn": 0.005751383490860462, "grad/layer_16/mlp": 0.004509280901402235, "grad/layer_16/attn_mlp_ratio": 1.2754546654049672, "grad/layer_20/attn": 0.00296606682240963, "grad/layer_20/mlp": 0.0052036396227777, "grad/layer_20/attn_mlp_ratio": 0.5699984972876533, "grad/layer_24/attn": 0.005800815299153328, "grad/layer_24/mlp": 0.008686032146215439, "grad/layer_24/attn_mlp_ratio": 0.6678325770297228, "grad/layer_27/attn": 0.006546786054968834, "grad/layer_27/mlp": 0.007029270753264427, "grad/layer_27/attn_mlp_ratio": 0.9313606192779548} {"step": 75050, "timestamp": 1778275665.4597557, "train/loss": 2.1281265139579775, "train/z_loss": 0.001392428472172469, "train/perplexity": 8.399116434585933, "train/grad_norm": 0.0859375, "optim/muon_lr": 0.01736137628555298, "optim/adamw_lr": 0.0005208412885665893, "perf/tokens_per_sec": 2018978.5702032351, "perf/iters_per_sec": 0.9627240038887191, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0387192964553833, "data/tokens_consumed": 157393354752, "data/tokens_consumed_B": 157.393354752, "train/loss_slope": -1.9550431357679487e-06} {"step": 75060, "timestamp": 1778275675.839742, "train/loss": 2.1392932176589965, "train/z_loss": 0.0013889811700209976, "train/perplexity": 8.493432498871474, "train/grad_norm": 0.11181640625, "optim/muon_lr": 0.01733515441417694, "optim/adamw_lr": 0.0005200546324253081, "perf/tokens_per_sec": 2021922.8789413143, "perf/iters_per_sec": 0.9641279597002574, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0372067213058471, "data/tokens_consumed": 157414326272, "data/tokens_consumed_B": 157.414326272, "train/loss_slope": -2.645510386819695e-06} {"step": 75070, "timestamp": 1778275686.2090294, "train/loss": 2.0668640613555906, "train/z_loss": 0.0013942328747361898, "train/perplexity": 7.900010284879352, "train/grad_norm": 0.08447265625, "optim/muon_lr": 0.017308822870254516, "optim/adamw_lr": 0.0005192646861076354, "perf/tokens_per_sec": 2023525.005903151, "perf/iters_per_sec": 0.9648919133678202, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0363855123519898, "data/tokens_consumed": 157435297792, "data/tokens_consumed_B": 157.435297792, "train/loss_slope": -9.404848253551625e-06} {"step": 75075, "timestamp": 1778275691.9957922, "eos/sharpness": 6.576311588287353, "eos/L0_probe": 1.9534728527069092, "eos/L_plus": 1.98784339427948, "eos/L_minus": 1.984865427017212, "eos/grad_norm": 0.0904167890548706, "eos/embed_grad_frac": 0.263034462928772, "eos/time_s": 0.6131858825683594} {"step": 75075, "timestamp": 1778275693.377297, "geo/rankme_last": 438.13690185546875, "geo/layer_0/stable_rank_q_proj": 19.069467544555664, "geo/layer_0/stable_rank_k_proj": 15.685935020446777, "geo/layer_0/stable_rank_o_proj": 46.47712707519531, "geo/layer_0/stable_rank_gate_proj": 128.63265991210938, "geo/layer_0/stable_rank_down_proj": 56.401100158691406, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06549739837646484, "geo/layer_0/attn_entropy_mean": 6.142365455627441, "geo/layer_0/attn_entropy_std": 0.43590787053108215, "geo/layer_7/stable_rank_q_proj": 42.79587936401367, "geo/layer_7/stable_rank_k_proj": 40.204200744628906, "geo/layer_7/stable_rank_o_proj": 88.31800079345703, "geo/layer_7/stable_rank_gate_proj": 77.41182708740234, "geo/layer_7/stable_rank_down_proj": 139.96542358398438, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.44206735491752625, "geo/layer_7/attn_entropy_mean": 4.645876407623291, "geo/layer_7/attn_entropy_std": 0.7965468168258667, "geo/layer_14/stable_rank_q_proj": 49.713619232177734, "geo/layer_14/stable_rank_k_proj": 41.61725616455078, "geo/layer_14/stable_rank_o_proj": 43.30919647216797, "geo/layer_14/stable_rank_gate_proj": 70.67962646484375, "geo/layer_14/stable_rank_down_proj": 125.97539520263672, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4122643768787384, "geo/layer_14/attn_entropy_mean": 5.543451309204102, "geo/layer_14/attn_entropy_std": 0.4249194264411926, "geo/layer_21/stable_rank_q_proj": 39.873416900634766, "geo/layer_21/stable_rank_k_proj": 30.07556915283203, "geo/layer_21/stable_rank_o_proj": 68.12356567382812, "geo/layer_21/stable_rank_gate_proj": 63.99058532714844, "geo/layer_21/stable_rank_down_proj": 49.3786735534668, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14795123040676117, "geo/layer_21/attn_entropy_mean": 5.690075874328613, "geo/layer_21/attn_entropy_std": 0.30198612809181213, "geo/layer_27/stable_rank_q_proj": 43.935909271240234, "geo/layer_27/stable_rank_k_proj": 32.41148376464844, "geo/layer_27/stable_rank_o_proj": 115.59880828857422, "geo/layer_27/stable_rank_gate_proj": 77.25566101074219, "geo/layer_27/stable_rank_down_proj": 127.62933349609375, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.1053127869963646, "geo/layer_27/attn_entropy_mean": 4.141216278076172, "geo/layer_27/attn_entropy_std": 0.8064844608306885, "attnres/final_alpha/block_0": 0.2376018464565277, "attnres/block_norm/0": 1.7777860164642334, "attnres/final_alpha/block_1": 0.003961591981351376, "attnres/block_norm/1": 48513.6328125, "attnres/final_alpha/block_2": 0.010052470490336418, "attnres/block_norm/2": 29164.150390625, "attnres/final_alpha/block_3": 0.011745302937924862, "attnres/block_norm/3": 62657.3828125, "attnres/final_alpha/block_4": 0.013612116686999798, "attnres/block_norm/4": 15753.9013671875, "attnres/final_alpha/block_5": 0.6164348125457764, "attnres/block_norm/5": 6848.978515625, "attnres/final_alpha/block_6": 0.1065918430685997, "attnres/block_norm/6": 41315.2421875, "geo/tier1_time_s": 1.3595850467681885, "geo/step": 75075.0, "geo/rankme_slope": 9.035352422218889e-06} {"step": 75080, "timestamp": 1778275698.5637455, "train/loss": 2.141944146156311, "train/z_loss": 0.001388553623110056, "train/perplexity": 8.515977850977968, "train/grad_norm": 0.1337890625, "optim/muon_lr": 0.017282382249832154, "optim/adamw_lr": 0.0005184714674949646, "perf/tokens_per_sec": 1698311.5806958617, "perf/iters_per_sec": 0.8098180678824719, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.23484525680542, "data/tokens_consumed": 157456269312, "data/tokens_consumed_B": 157.456269312, "train/loss_slope": -6.451761153879444e-06} {"step": 75090, "timestamp": 1778275708.9303474, "train/loss": 2.1404034376144407, "train/z_loss": 0.0013880478916689753, "train/perplexity": 8.502867313512583, "train/grad_norm": 0.08837890625, "optim/muon_lr": 0.017255831956863404, "optim/adamw_lr": 0.0005176749587059021, "perf/tokens_per_sec": 2023916.5740581232, "perf/iters_per_sec": 0.96507862761408, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0361850023269654, "data/tokens_consumed": 157477240832, "data/tokens_consumed_B": 157.477240832, "train/loss_slope": -6.879401714852526e-06} {"step": 75100, "timestamp": 1778275719.284442, "grad/layer_0/attn": 0.002537931315600872, "grad/layer_0/mlp": 0.003010657848790288, "grad/layer_0/attn_mlp_ratio": 0.8429822845270875, "grad/layer_4/attn": 0.0020372378639876842, "grad/layer_4/mlp": 0.002715321723371744, "grad/layer_4/attn_mlp_ratio": 0.7502749200674671, "grad/layer_8/attn": 0.004616187419742346, "grad/layer_8/mlp": 0.0035957819782197475, "grad/layer_8/attn_mlp_ratio": 1.2837784157453156, "grad/layer_12/attn": 0.006319023668766022, "grad/layer_12/mlp": 0.006236516870558262, "grad/layer_12/attn_mlp_ratio": 1.0132296117524033, "grad/layer_16/attn": 0.0036320521030575037, "grad/layer_16/mlp": 0.004275971092283726, "grad/layer_16/attn_mlp_ratio": 0.8494098626323263, "grad/layer_20/attn": 0.0032633591908961535, "grad/layer_20/mlp": 0.005389644298702478, "grad/layer_20/attn_mlp_ratio": 0.6054869207478301, "grad/layer_24/attn": 0.008273539133369923, "grad/layer_24/mlp": 0.009416946209967136, "grad/layer_24/attn_mlp_ratio": 0.8785798348040912, "grad/layer_27/attn": 0.008171038702130318, "grad/layer_27/mlp": 0.008512435480952263, "grad/layer_27/attn_mlp_ratio": 0.9598943362830413} {"step": 75100, "timestamp": 1778275719.3002148, "train/loss": 2.149557662010193, "train/z_loss": 0.0013763675000518561, "train/perplexity": 8.58106182791702, "train/grad_norm": 0.1630859375, "optim/muon_lr": 0.01722917437553406, "optim/adamw_lr": 0.0005168752312660217, "perf/tokens_per_sec": 2023383.6409386818, "perf/iters_per_sec": 0.9648245052998933, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0364579200744628, "data/tokens_consumed": 157498212352, "data/tokens_consumed_B": 157.498212352, "train/loss_slope": -7.408216555412487e-06} {"step": 75110, "timestamp": 1778275729.6633332, "train/loss": 2.0853947401046753, "train/z_loss": 0.001384136639535427, "train/perplexity": 8.04776763141684, "train/grad_norm": 0.11376953125, "optim/muon_lr": 0.017202408313751222, "optim/adamw_lr": 0.0005160722494125365, "perf/tokens_per_sec": 2024907.8113164997, "perf/iters_per_sec": 0.9655512863714694, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0356777667999268, "data/tokens_consumed": 157519183872, "data/tokens_consumed_B": 157.519183872, "train/loss_slope": -9.632319156521912e-06} {"step": 75120, "timestamp": 1778275740.0314558, "train/loss": 2.1322515964508058, "train/z_loss": 0.001383444364182651, "train/perplexity": 8.433835042073575, "train/grad_norm": 0.08251953125, "optim/muon_lr": 0.01717553436756134, "optim/adamw_lr": 0.0005152660310268402, "perf/tokens_per_sec": 2023938.7410677334, "perf/iters_per_sec": 0.9650891976679484, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0361736536026, "data/tokens_consumed": 157540155392, "data/tokens_consumed_B": 157.540155392, "train/loss_slope": -9.035015006055293e-06} {"step": 75130, "timestamp": 1778275750.3963006, "train/loss": 2.1128671646118162, "train/z_loss": 0.0013786745141260326, "train/perplexity": 8.2719242854545, "train/grad_norm": 0.119140625, "optim/muon_lr": 0.017148553133010863, "optim/adamw_lr": 0.0005144565939903258, "perf/tokens_per_sec": 2024471.6871118653, "perf/iters_per_sec": 0.9653433261451079, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03590087890625, "data/tokens_consumed": 157561126912, "data/tokens_consumed_B": 157.561126912, "train/loss_slope": -1.0089662366180873e-05} {"step": 75140, "timestamp": 1778275760.7673635, "train/loss": 2.111529088020325, "train/z_loss": 0.0013875164557248354, "train/perplexity": 8.260863219128863, "train/grad_norm": 0.12353515625, "optim/muon_lr": 0.01712146520614624, "optim/adamw_lr": 0.0005136439561843871, "perf/tokens_per_sec": 2023445.8260563514, "perf/iters_per_sec": 0.9648541574746854, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036426067352295, "data/tokens_consumed": 157582098432, "data/tokens_consumed_B": 157.582098432, "train/loss_slope": -1.1995320466056101e-05} {"step": 75150, "timestamp": 1778275771.1165984, "grad/layer_0/attn": 0.002619327511638403, "grad/layer_0/mlp": 0.0029062649700790644, "grad/layer_0/attn_mlp_ratio": 0.9012693090541616, "grad/layer_4/attn": 0.002028613118454814, "grad/layer_4/mlp": 0.0024437629617750645, "grad/layer_4/attn_mlp_ratio": 0.8301185782640073, "grad/layer_8/attn": 0.0032669666688889265, "grad/layer_8/mlp": 0.003396945307031274, "grad/layer_8/attn_mlp_ratio": 0.9617365831451672, "grad/layer_12/attn": 0.005061519797891378, "grad/layer_12/mlp": 0.006943139713257551, "grad/layer_12/attn_mlp_ratio": 0.7289958050717457, "grad/layer_16/attn": 0.005487846210598946, "grad/layer_16/mlp": 0.004416195675730705, "grad/layer_16/attn_mlp_ratio": 1.2426637063414439, "grad/layer_20/attn": 0.004049236420542002, "grad/layer_20/mlp": 0.005612810607999563, "grad/layer_20/attn_mlp_ratio": 0.7214275754517957, "grad/layer_24/attn": 0.01195970457047224, "grad/layer_24/mlp": 0.008258232846856117, "grad/layer_24/attn_mlp_ratio": 1.4482159376510761, "grad/layer_27/attn": 0.00541248545050621, "grad/layer_27/mlp": 0.006903735920786858, "grad/layer_27/attn_mlp_ratio": 0.7839936860577296} {"step": 75150, "timestamp": 1778275771.7181063, "eos/sharpness": 63.455247879028306, "eos/L0_probe": 1.957074522972107, "eos/L_plus": 2.232396364212036, "eos/L_minus": 2.316305160522461, "eos/grad_norm": 0.128487691283226, "eos/embed_grad_frac": 0.1163552775979042, "eos/time_s": 0.5984923839569092} {"step": 75150, "timestamp": 1778275771.7357862, "train/loss": 2.097040867805481, "train/z_loss": 0.0013805648777633905, "train/perplexity": 8.142040854416956, "train/grad_norm": 0.1279296875, "optim/muon_lr": 0.01709427058696747, "optim/adamw_lr": 0.000512828117609024, "perf/tokens_per_sec": 1913010.325276606, "perf/iters_per_sec": 0.9121944071181326, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0962575435638429, "data/tokens_consumed": 157603069952, "data/tokens_consumed_B": 157.603069952, "train/loss_slope": -1.49835485114444e-05} {"step": 75150, "timestamp": 1778275773.1004632, "geo/rankme_last": 439.1075744628906, "geo/layer_0/stable_rank_q_proj": 19.066884994506836, "geo/layer_0/stable_rank_k_proj": 15.704715728759766, "geo/layer_0/stable_rank_o_proj": 46.458370208740234, "geo/layer_0/stable_rank_gate_proj": 128.6956024169922, "geo/layer_0/stable_rank_down_proj": 56.40142822265625, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06508603692054749, "geo/layer_0/attn_entropy_mean": 6.140927791595459, "geo/layer_0/attn_entropy_std": 0.4312964975833893, "geo/layer_7/stable_rank_q_proj": 42.82661056518555, "geo/layer_7/stable_rank_k_proj": 40.19919967651367, "geo/layer_7/stable_rank_o_proj": 88.26639556884766, "geo/layer_7/stable_rank_gate_proj": 77.30986022949219, "geo/layer_7/stable_rank_down_proj": 140.20069885253906, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4392290413379669, "geo/layer_7/attn_entropy_mean": 4.64826774597168, "geo/layer_7/attn_entropy_std": 0.7978315949440002, "geo/layer_14/stable_rank_q_proj": 49.67854690551758, "geo/layer_14/stable_rank_k_proj": 41.570831298828125, "geo/layer_14/stable_rank_o_proj": 43.325504302978516, "geo/layer_14/stable_rank_gate_proj": 70.62457275390625, "geo/layer_14/stable_rank_down_proj": 125.71898651123047, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.41214945912361145, "geo/layer_14/attn_entropy_mean": 5.524371147155762, "geo/layer_14/attn_entropy_std": 0.4243379533290863, "geo/layer_21/stable_rank_q_proj": 39.854164123535156, "geo/layer_21/stable_rank_k_proj": 29.990402221679688, "geo/layer_21/stable_rank_o_proj": 68.16785430908203, "geo/layer_21/stable_rank_gate_proj": 63.983707427978516, "geo/layer_21/stable_rank_down_proj": 49.40394592285156, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14648404717445374, "geo/layer_21/attn_entropy_mean": 5.67728328704834, "geo/layer_21/attn_entropy_std": 0.30154749751091003, "geo/layer_27/stable_rank_q_proj": 43.867774963378906, "geo/layer_27/stable_rank_k_proj": 32.44952392578125, "geo/layer_27/stable_rank_o_proj": 115.62476348876953, "geo/layer_27/stable_rank_gate_proj": 77.3246078491211, "geo/layer_27/stable_rank_down_proj": 127.58721923828125, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09235231578350067, "geo/layer_27/attn_entropy_mean": 4.164287567138672, "geo/layer_27/attn_entropy_std": 0.7725591063499451, "attnres/final_alpha/block_0": 0.23846665024757385, "attnres/block_norm/0": 1.7778147459030151, "attnres/final_alpha/block_1": 0.003959133289754391, "attnres/block_norm/1": 48458.5546875, "attnres/final_alpha/block_2": 0.009945661760866642, "attnres/block_norm/2": 29188.154296875, "attnres/final_alpha/block_3": 0.011791449971497059, "attnres/block_norm/3": 62542.1015625, "attnres/final_alpha/block_4": 0.013729618862271309, "attnres/block_norm/4": 15796.0439453125, "attnres/final_alpha/block_5": 0.6164513826370239, "attnres/block_norm/5": 6808.669921875, "attnres/final_alpha/block_6": 0.10565610975027084, "attnres/block_norm/6": 41217.28515625, "geo/tier1_time_s": 1.361128807067871, "geo/step": 75150.0, "geo/rankme_slope": 3.498670952756103e-05} {"step": 75160, "timestamp": 1778275783.4535074, "train/loss": 2.1269819021224974, "train/z_loss": 0.001368945138528943, "train/perplexity": 8.389508206402187, "train/grad_norm": 0.08935546875, "optim/muon_lr": 0.017066970467567444, "optim/adamw_lr": 0.0005120091140270233, "perf/tokens_per_sec": 1790244.2170152771, "perf/iters_per_sec": 0.8536549649311433, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1714334726333617, "data/tokens_consumed": 157624041472, "data/tokens_consumed_B": 157.624041472, "train/loss_slope": -1.5867948124367905e-05} {"step": 75170, "timestamp": 1778275793.7970338, "train/loss": 2.115336775779724, "train/z_loss": 0.0013746015960350634, "train/perplexity": 8.292377967975789, "train/grad_norm": 0.1259765625, "optim/muon_lr": 0.017039564847946168, "optim/adamw_lr": 0.000511186945438385, "perf/tokens_per_sec": 2029071.3816828832, "perf/iters_per_sec": 0.9675366314329544, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033552598953247, "data/tokens_consumed": 157645012992, "data/tokens_consumed_B": 157.645012992, "train/loss_slope": -1.574867034223101e-05} {"step": 75180, "timestamp": 1778275804.145632, "train/loss": 2.1270193099975585, "train/z_loss": 0.001393011212348938, "train/perplexity": 8.389822045946994, "train/grad_norm": 0.1171875, "optim/muon_lr": 0.01701205313205719, "optim/adamw_lr": 0.0005103615939617156, "perf/tokens_per_sec": 2027472.660631196, "perf/iters_per_sec": 0.9667743018299084, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343675851821899, "data/tokens_consumed": 157665984512, "data/tokens_consumed_B": 157.665984512, "train/loss_slope": -2.0142066839969467e-05} {"step": 75190, "timestamp": 1778275814.5022452, "train/loss": 2.1404418349266052, "train/z_loss": 0.0013894648058339954, "train/perplexity": 8.503193807031309, "train/grad_norm": 0.0908203125, "optim/muon_lr": 0.016984437108039856, "optim/adamw_lr": 0.0005095331132411957, "perf/tokens_per_sec": 2025947.6611326511, "perf/iters_per_sec": 0.966047125402761, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351461887359619, "data/tokens_consumed": 157686956032, "data/tokens_consumed_B": 157.686956032, "train/loss_slope": -1.7140735157824345e-05} {"step": 75200, "timestamp": 1778275824.8492882, "grad/layer_0/attn": 0.0026093972846865654, "grad/layer_0/mlp": 0.0030091695953160524, "grad/layer_0/attn_mlp_ratio": 0.8671485987474365, "grad/layer_4/attn": 0.0020971589256078005, "grad/layer_4/mlp": 0.0026366906240582466, "grad/layer_4/attn_mlp_ratio": 0.7953753947979044, "grad/layer_8/attn": 0.006170177366584539, "grad/layer_8/mlp": 0.00358444987796247, "grad/layer_8/attn_mlp_ratio": 1.7213735453191872, "grad/layer_12/attn": 0.00520078232511878, "grad/layer_12/mlp": 0.007174435071647167, "grad/layer_12/attn_mlp_ratio": 0.7249047765699917, "grad/layer_16/attn": 0.0037682426627725363, "grad/layer_16/mlp": 0.004677612334489822, "grad/layer_16/attn_mlp_ratio": 0.8055910393490159, "grad/layer_20/attn": 0.0036468093749135733, "grad/layer_20/mlp": 0.006142713129520416, "grad/layer_20/attn_mlp_ratio": 0.5936805510287989, "grad/layer_24/attn": 0.011450832709670067, "grad/layer_24/mlp": 0.010084947571158409, "grad/layer_24/attn_mlp_ratio": 1.135437989670279, "grad/layer_27/attn": 0.004246686119586229, "grad/layer_27/mlp": 0.008932704105973244, "grad/layer_27/attn_mlp_ratio": 0.47540879241769773} {"step": 75200, "timestamp": 1778275824.8638947, "train/loss": 2.1328409194946287, "train/z_loss": 0.0013824978261254728, "train/perplexity": 8.438806760241823, "train/grad_norm": 0.1630859375, "optim/muon_lr": 0.016956717371940613, "optim/adamw_lr": 0.0005087015211582183, "perf/tokens_per_sec": 2025313.4850277705, "perf/iters_per_sec": 0.9657447266711094, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354703187942504, "data/tokens_consumed": 157707927552, "data/tokens_consumed_B": 157.707927552, "train/loss_slope": -1.679927804897108e-05} {"step": 75210, "timestamp": 1778275835.2145169, "train/loss": 2.1383782386779786, "train/z_loss": 0.001375802350230515, "train/perplexity": 8.485664740867886, "train/grad_norm": 0.10546875, "optim/muon_lr": 0.016928892135620117, "optim/adamw_lr": 0.0005078667640686035, "perf/tokens_per_sec": 2027240.5662422199, "perf/iters_per_sec": 0.9666636306010341, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344860076904296, "data/tokens_consumed": 157728899072, "data/tokens_consumed_B": 157.728899072, "train/loss_slope": -1.659614823081272e-05} {"step": 75220, "timestamp": 1778275845.5726454, "train/loss": 2.1207573652267455, "train/z_loss": 0.0013720368500798942, "train/perplexity": 8.33744959150235, "train/grad_norm": 0.1435546875, "optim/muon_lr": 0.016900964379310608, "optim/adamw_lr": 0.0005070289313793182, "perf/tokens_per_sec": 2026100.818550816, "perf/iters_per_sec": 0.9661201565507965, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350679397583007, "data/tokens_consumed": 157749870592, "data/tokens_consumed_B": 157.749870592, "train/loss_slope": -1.716638063475991e-05} {"step": 75225, "timestamp": 1778275851.3408308, "eos/sharpness": 57.465934753417955, "eos/L0_probe": 1.9528721570968628, "eos/L_plus": 2.2861504554748535, "eos/L_minus": 2.1942532062530518, "eos/grad_norm": 0.13329030573368073, "eos/embed_grad_frac": 0.12019290030002594, "eos/time_s": 0.6003899574279785} {"step": 75225, "timestamp": 1778275852.7211077, "geo/rankme_last": 437.6739807128906, "geo/layer_0/stable_rank_q_proj": 19.05719757080078, "geo/layer_0/stable_rank_k_proj": 15.713621139526367, "geo/layer_0/stable_rank_o_proj": 46.43321990966797, "geo/layer_0/stable_rank_gate_proj": 128.83218383789062, "geo/layer_0/stable_rank_down_proj": 56.47362518310547, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06741652637720108, "geo/layer_0/attn_entropy_mean": 6.140565395355225, "geo/layer_0/attn_entropy_std": 0.430913507938385, "geo/layer_7/stable_rank_q_proj": 42.84281539916992, "geo/layer_7/stable_rank_k_proj": 40.190372467041016, "geo/layer_7/stable_rank_o_proj": 88.27953338623047, "geo/layer_7/stable_rank_gate_proj": 77.2839584350586, "geo/layer_7/stable_rank_down_proj": 140.33160400390625, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4419308304786682, "geo/layer_7/attn_entropy_mean": 4.629085063934326, "geo/layer_7/attn_entropy_std": 0.7789340019226074, "geo/layer_14/stable_rank_q_proj": 49.67167663574219, "geo/layer_14/stable_rank_k_proj": 41.67842483520508, "geo/layer_14/stable_rank_o_proj": 43.286033630371094, "geo/layer_14/stable_rank_gate_proj": 70.63307189941406, "geo/layer_14/stable_rank_down_proj": 125.65630340576172, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39924660325050354, "geo/layer_14/attn_entropy_mean": 5.519411087036133, "geo/layer_14/attn_entropy_std": 0.4245359003543854, "geo/layer_21/stable_rank_q_proj": 39.873592376708984, "geo/layer_21/stable_rank_k_proj": 30.03046989440918, "geo/layer_21/stable_rank_o_proj": 68.23100280761719, "geo/layer_21/stable_rank_gate_proj": 63.83198547363281, "geo/layer_21/stable_rank_down_proj": 49.3748779296875, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14659550786018372, "geo/layer_21/attn_entropy_mean": 5.694805145263672, "geo/layer_21/attn_entropy_std": 0.29987189173698425, "geo/layer_27/stable_rank_q_proj": 43.87743377685547, "geo/layer_27/stable_rank_k_proj": 32.36623764038086, "geo/layer_27/stable_rank_o_proj": 115.5453872680664, "geo/layer_27/stable_rank_gate_proj": 77.31596374511719, "geo/layer_27/stable_rank_down_proj": 127.56768035888672, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0928167849779129, "geo/layer_27/attn_entropy_mean": 4.166205883026123, "geo/layer_27/attn_entropy_std": 0.7807192206382751, "attnres/final_alpha/block_0": 0.2373957335948944, "attnres/block_norm/0": 1.777894139289856, "attnres/final_alpha/block_1": 0.003971956204622984, "attnres/block_norm/1": 48428.6171875, "attnres/final_alpha/block_2": 0.009863302111625671, "attnres/block_norm/2": 29103.029296875, "attnres/final_alpha/block_3": 0.01171480305492878, "attnres/block_norm/3": 62695.42578125, "attnres/final_alpha/block_4": 0.01346929557621479, "attnres/block_norm/4": 15824.0234375, "attnres/final_alpha/block_5": 0.6171783208847046, "attnres/block_norm/5": 6846.642578125, "attnres/final_alpha/block_6": 0.10640653967857361, "attnres/block_norm/6": 41530.7734375, "geo/tier1_time_s": 1.3615810871124268, "geo/step": 75225.0, "geo/rankme_slope": 9.897122911664642e-07} {"step": 75230, "timestamp": 1778275857.898492, "train/loss": 2.1553019285202026, "train/z_loss": 0.0013719479902647436, "train/perplexity": 8.630495578385363, "train/grad_norm": 0.12890625, "optim/muon_lr": 0.01687293350696564, "optim/adamw_lr": 0.0005061880052089691, "perf/tokens_per_sec": 1702524.3725821762, "perf/iters_per_sec": 0.8118268835936433, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2317897081375122, "data/tokens_consumed": 157770842112, "data/tokens_consumed_B": 157.770842112, "train/loss_slope": -1.5085498642141713e-05} {"step": 75240, "timestamp": 1778275868.2452054, "train/loss": 2.121286928653717, "train/z_loss": 0.0013935012160800398, "train/perplexity": 8.341865969153062, "train/grad_norm": 0.2197265625, "optim/muon_lr": 0.016844799518585207, "optim/adamw_lr": 0.0005053439855575561, "perf/tokens_per_sec": 2028117.4487566822, "perf/iters_per_sec": 0.9670817607673083, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340387344360351, "data/tokens_consumed": 157791813632, "data/tokens_consumed_B": 157.791813632, "train/loss_slope": -1.3635997639165888e-05} {"step": 75250, "timestamp": 1778275878.5872774, "grad/layer_0/attn": 0.0027151003014296293, "grad/layer_0/mlp": 0.002865901216864586, "grad/layer_0/attn_mlp_ratio": 0.9473809462497687, "grad/layer_4/attn": 0.002153892768546939, "grad/layer_4/mlp": 0.0024465052410960197, "grad/layer_4/attn_mlp_ratio": 0.8803956943670549, "grad/layer_8/attn": 0.0033065436873584986, "grad/layer_8/mlp": 0.003567771753296256, "grad/layer_8/attn_mlp_ratio": 0.926781146138476, "grad/layer_12/attn": 0.005202631466090679, "grad/layer_12/mlp": 0.0065037827007472515, "grad/layer_12/attn_mlp_ratio": 0.799939300785525, "grad/layer_16/attn": 0.003036265494301915, "grad/layer_16/mlp": 0.004324513953179121, "grad/layer_16/attn_mlp_ratio": 0.7021055908165766, "grad/layer_20/attn": 0.002660582307726145, "grad/layer_20/mlp": 0.005230231676250696, "grad/layer_20/attn_mlp_ratio": 0.5086930028239379, "grad/layer_24/attn": 0.005475825164467096, "grad/layer_24/mlp": 0.007040356285870075, "grad/layer_24/attn_mlp_ratio": 0.7777767011137423, "grad/layer_27/attn": 0.007311705965548754, "grad/layer_27/mlp": 0.006599771324545145, "grad/layer_27/attn_mlp_ratio": 1.1078726057626573} {"step": 75250, "timestamp": 1778275878.6018116, "train/loss": 2.161636304855347, "train/z_loss": 0.00138499477179721, "train/perplexity": 8.685337897907333, "train/grad_norm": 0.08740234375, "optim/muon_lr": 0.016816563606262207, "optim/adamw_lr": 0.0005044969081878662, "perf/tokens_per_sec": 2025994.4178888532, "perf/iters_per_sec": 0.9660694207615153, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351222991943358, "data/tokens_consumed": 157812785152, "data/tokens_consumed_B": 157.812785152, "train/loss_slope": -8.181333320118627e-06} {"step": 75260, "timestamp": 1778275888.9766858, "train/loss": 2.1112021923065187, "train/z_loss": 0.00139027590630576, "train/perplexity": 8.258163219683357, "train/grad_norm": 0.142578125, "optim/muon_lr": 0.016788225769996643, "optim/adamw_lr": 0.0005036467730998993, "perf/tokens_per_sec": 2022668.8332191743, "perf/iters_per_sec": 0.9644836584182617, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0368242025375367, "data/tokens_consumed": 157833756672, "data/tokens_consumed_B": 157.833756672, "train/loss_slope": -1.1086201066910692e-05} {"step": 75270, "timestamp": 1778275899.361774, "train/loss": 2.0999879360198976, "train/z_loss": 0.0013907207176089287, "train/perplexity": 8.16607139665056, "train/grad_norm": 0.171875, "optim/muon_lr": 0.016759786009788512, "optim/adamw_lr": 0.0005027935802936553, "perf/tokens_per_sec": 2020577.1002599478, "perf/iters_per_sec": 0.9634862424182643, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037897539138794, "data/tokens_consumed": 157854728192, "data/tokens_consumed_B": 157.854728192, "train/loss_slope": -1.2230750651034739e-05} {"step": 75280, "timestamp": 1778275909.7454956, "train/loss": 2.108433711528778, "train/z_loss": 0.0013751377467997373, "train/perplexity": 8.235332271652887, "train/grad_norm": 0.0947265625, "optim/muon_lr": 0.016731245517730715, "optim/adamw_lr": 0.0005019373655319213, "perf/tokens_per_sec": 2020535.0488851294, "perf/iters_per_sec": 0.9634661907601973, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0379191398620606, "data/tokens_consumed": 157875699712, "data/tokens_consumed_B": 157.875699712, "train/loss_slope": -1.4186957974781493e-05} {"step": 75290, "timestamp": 1778275920.6911182, "train/loss": 2.120434284210205, "train/z_loss": 0.001389493513852358, "train/perplexity": 8.33475635490321, "train/grad_norm": 0.10693359375, "optim/muon_lr": 0.016702604293823243, "optim/adamw_lr": 0.0005010781288146973, "perf/tokens_per_sec": 1917008.4101660696, "perf/iters_per_sec": 0.9141008425550793, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0939712047576904, "data/tokens_consumed": 157896671232, "data/tokens_consumed_B": 157.896671232, "train/loss_slope": -1.5660553339040504e-05} {"step": 75300, "timestamp": 1778275931.0529122, "grad/layer_0/attn": 0.0035483285319060087, "grad/layer_0/mlp": 0.0030986208003014326, "grad/layer_0/attn_mlp_ratio": 1.1451315427327133, "grad/layer_4/attn": 0.0021555342245846987, "grad/layer_4/mlp": 0.0025020381435751915, "grad/layer_4/attn_mlp_ratio": 0.8615113018834719, "grad/layer_8/attn": 0.003367717145010829, "grad/layer_8/mlp": 0.0035078884102404118, "grad/layer_8/attn_mlp_ratio": 0.960041100274314, "grad/layer_12/attn": 0.0048503577709198, "grad/layer_12/mlp": 0.006705138832330704, "grad/layer_12/attn_mlp_ratio": 0.7233791603530305, "grad/layer_16/attn": 0.005285295657813549, "grad/layer_16/mlp": 0.0046704174019396305, "grad/layer_16/attn_mlp_ratio": 1.1316537880432658, "grad/layer_20/attn": 0.003220859682187438, "grad/layer_20/mlp": 0.005370281171053648, "grad/layer_20/attn_mlp_ratio": 0.599756236148783, "grad/layer_24/attn": 0.005398924928158522, "grad/layer_24/mlp": 0.007894238457083702, "grad/layer_24/attn_mlp_ratio": 0.6839069897772381, "grad/layer_27/attn": 0.00676786107942462, "grad/layer_27/mlp": 0.006379640195518732, "grad/layer_27/attn_mlp_ratio": 1.0608530835474512} {"step": 75300, "timestamp": 1778275931.6601434, "eos/sharpness": 4.053330421447753, "eos/L0_probe": 1.950789451599121, "eos/L_plus": 1.9750806093215942, "eos/L_minus": 1.9670315980911255, "eos/grad_norm": 0.07908876240253448, "eos/embed_grad_frac": 0.32939112186431885, "eos/time_s": 0.6045377254486084} {"step": 75300, "timestamp": 1778275931.6795928, "train/loss": 2.075430142879486, "train/z_loss": 0.001394237787462771, "train/perplexity": 7.9679730888998455, "train/grad_norm": 0.0791015625, "optim/muon_lr": 0.01667386293411255, "optim/adamw_lr": 0.0005002158880233764, "perf/tokens_per_sec": 1909392.8069635574, "perf/iters_per_sec": 0.9104694399659907, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.098334503173828, "data/tokens_consumed": 157917642752, "data/tokens_consumed_B": 157.917642752, "train/loss_slope": -1.5135261592584527e-05} {"step": 75300, "timestamp": 1778275933.045136, "geo/rankme_last": 438.0723571777344, "geo/layer_0/stable_rank_q_proj": 19.029850006103516, "geo/layer_0/stable_rank_k_proj": 15.731115341186523, "geo/layer_0/stable_rank_o_proj": 46.43570327758789, "geo/layer_0/stable_rank_gate_proj": 128.3577117919922, "geo/layer_0/stable_rank_down_proj": 56.501766204833984, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06122162193059921, "geo/layer_0/attn_entropy_mean": 6.1449737548828125, "geo/layer_0/attn_entropy_std": 0.4339856505393982, "geo/layer_7/stable_rank_q_proj": 42.8344612121582, "geo/layer_7/stable_rank_k_proj": 40.213592529296875, "geo/layer_7/stable_rank_o_proj": 88.27375030517578, "geo/layer_7/stable_rank_gate_proj": 77.30661010742188, "geo/layer_7/stable_rank_down_proj": 140.29734802246094, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.44528287649154663, "geo/layer_7/attn_entropy_mean": 4.623331546783447, "geo/layer_7/attn_entropy_std": 0.7924439311027527, "geo/layer_14/stable_rank_q_proj": 49.65116882324219, "geo/layer_14/stable_rank_k_proj": 41.734336853027344, "geo/layer_14/stable_rank_o_proj": 43.26212692260742, "geo/layer_14/stable_rank_gate_proj": 70.67462921142578, "geo/layer_14/stable_rank_down_proj": 125.52262115478516, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3994477391242981, "geo/layer_14/attn_entropy_mean": 5.507185935974121, "geo/layer_14/attn_entropy_std": 0.42785629630088806, "geo/layer_21/stable_rank_q_proj": 39.908241271972656, "geo/layer_21/stable_rank_k_proj": 30.033184051513672, "geo/layer_21/stable_rank_o_proj": 68.22286224365234, "geo/layer_21/stable_rank_gate_proj": 63.83988952636719, "geo/layer_21/stable_rank_down_proj": 49.35182571411133, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14079129695892334, "geo/layer_21/attn_entropy_mean": 5.688608169555664, "geo/layer_21/attn_entropy_std": 0.29554155468940735, "geo/layer_27/stable_rank_q_proj": 43.899375915527344, "geo/layer_27/stable_rank_k_proj": 32.40937423706055, "geo/layer_27/stable_rank_o_proj": 115.44705963134766, "geo/layer_27/stable_rank_gate_proj": 77.34859466552734, "geo/layer_27/stable_rank_down_proj": 127.7136001586914, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09393676370382309, "geo/layer_27/attn_entropy_mean": 4.158417701721191, "geo/layer_27/attn_entropy_std": 0.7892675995826721, "attnres/final_alpha/block_0": 0.23639920353889465, "attnres/block_norm/0": 1.7779489755630493, "attnres/final_alpha/block_1": 0.0038980806712061167, "attnres/block_norm/1": 48330.296875, "attnres/final_alpha/block_2": 0.009866951033473015, "attnres/block_norm/2": 29079.65234375, "attnres/final_alpha/block_3": 0.011731794103980064, "attnres/block_norm/3": 62504.5, "attnres/final_alpha/block_4": 0.01364574208855629, "attnres/block_norm/4": 15806.708984375, "attnres/final_alpha/block_5": 0.6182141304016113, "attnres/block_norm/5": 6843.521484375, "attnres/final_alpha/block_6": 0.10624413937330246, "attnres/block_norm/6": 41408.3515625, "geo/tier1_time_s": 1.3613591194152832, "geo/step": 75300.0, "geo/rankme_slope": 1.8725224464785922e-05} {"step": 75310, "timestamp": 1778275943.4225612, "train/loss": 2.0906657695770265, "train/z_loss": 0.0013881822000257672, "train/perplexity": 8.090299647067813, "train/grad_norm": 0.1533203125, "optim/muon_lr": 0.01664502203464508, "optim/adamw_lr": 0.0004993506610393523, "perf/tokens_per_sec": 1786469.8804806203, "perf/iters_per_sec": 0.8518552210238554, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1739084005355835, "data/tokens_consumed": 157938614272, "data/tokens_consumed_B": 157.938614272, "train/loss_slope": -1.7680206171499805e-05} {"step": 75320, "timestamp": 1778275953.8104055, "train/loss": 2.1162733316421507, "train/z_loss": 0.0013993938453495502, "train/perplexity": 8.30014788108605, "train/grad_norm": 0.1142578125, "optim/muon_lr": 0.01661608159542084, "optim/adamw_lr": 0.0004984824478626251, "perf/tokens_per_sec": 2020338.3218290952, "perf/iters_per_sec": 0.9633723839898564, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0380202054977417, "data/tokens_consumed": 157959585792, "data/tokens_consumed_B": 157.959585792, "train/loss_slope": -1.7937947027754022e-05} {"step": 75330, "timestamp": 1778275964.1902652, "train/loss": 2.116391623020172, "train/z_loss": 0.0013869961723685265, "train/perplexity": 8.301129775090336, "train/grad_norm": 0.12451171875, "optim/muon_lr": 0.016587042212486268, "optim/adamw_lr": 0.000497611266374588, "perf/tokens_per_sec": 2021344.6828495245, "perf/iters_per_sec": 0.9638522543189643, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0375034093856812, "data/tokens_consumed": 157980557312, "data/tokens_consumed_B": 157.980557312, "train/loss_slope": -1.9646221876788153e-05} {"step": 75340, "timestamp": 1778275974.5698378, "train/loss": 2.11648690700531, "train/z_loss": 0.0013896649586968123, "train/perplexity": 8.301920777500788, "train/grad_norm": 0.1142578125, "optim/muon_lr": 0.016557904481887816, "optim/adamw_lr": 0.0004967371344566344, "perf/tokens_per_sec": 2021746.9318161106, "perf/iters_per_sec": 0.9640440615730813, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037296986579895, "data/tokens_consumed": 158001528832, "data/tokens_consumed_B": 158.001528832, "train/loss_slope": -1.893983448799969e-05} {"step": 75350, "timestamp": 1778275984.9396663, "grad/layer_0/attn": 0.0028081750497221947, "grad/layer_0/mlp": 0.003006614977493882, "grad/layer_0/attn_mlp_ratio": 0.933998858298451, "grad/layer_4/attn": 0.0021076237317174673, "grad/layer_4/mlp": 0.0025273493956774473, "grad/layer_4/attn_mlp_ratio": 0.8339265049500113, "grad/layer_8/attn": 0.003443102352321148, "grad/layer_8/mlp": 0.003573184832930565, "grad/layer_8/attn_mlp_ratio": 0.9635947808324802, "grad/layer_12/attn": 0.0049374643713235855, "grad/layer_12/mlp": 0.006888082250952721, "grad/layer_12/attn_mlp_ratio": 0.7168126221140001, "grad/layer_16/attn": 0.003832878079265356, "grad/layer_16/mlp": 0.004595454782247543, "grad/layer_16/attn_mlp_ratio": 0.8340584724424003, "grad/layer_20/attn": 0.002842789515852928, "grad/layer_20/mlp": 0.005436847452074289, "grad/layer_20/attn_mlp_ratio": 0.5228746049295304, "grad/layer_24/attn": 0.015496297739446163, "grad/layer_24/mlp": 0.008901212364435196, "grad/layer_24/attn_mlp_ratio": 1.7409198804501789, "grad/layer_27/attn": 0.004419827368110418, "grad/layer_27/mlp": 0.008796168491244316, "grad/layer_27/attn_mlp_ratio": 0.5024718799171156} {"step": 75350, "timestamp": 1778275984.956478, "train/loss": 2.097425651550293, "train/z_loss": 0.0013874404481612145, "train/perplexity": 8.14517438221395, "train/grad_norm": 0.1318359375, "optim/muon_lr": 0.016528668999671935, "optim/adamw_lr": 0.0004958600699901581, "perf/tokens_per_sec": 2020115.0953244935, "perf/iters_per_sec": 0.9632659412977664, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0381349086761475, "data/tokens_consumed": 158022500352, "data/tokens_consumed_B": 158.022500352, "train/loss_slope": -1.790598239978794e-05} {"step": 75360, "timestamp": 1778275995.3405318, "train/loss": 2.0970773935317992, "train/z_loss": 0.0013952748384326696, "train/perplexity": 8.142338253804207, "train/grad_norm": 0.0966796875, "optim/muon_lr": 0.016499335169792177, "optim/adamw_lr": 0.0004949800550937652, "perf/tokens_per_sec": 2020573.9904405074, "perf/iters_per_sec": 0.9634847595407998, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037899136543274, "data/tokens_consumed": 158043471872, "data/tokens_consumed_B": 158.043471872, "train/loss_slope": -1.8738964767810816e-05} {"step": 75370, "timestamp": 1778276005.7254844, "train/loss": 2.0967862606048584, "train/z_loss": 0.001380761165637523, "train/perplexity": 8.139968096068456, "train/grad_norm": 0.1181640625, "optim/muon_lr": 0.01646990478038788, "optim/adamw_lr": 0.0004940971434116363, "perf/tokens_per_sec": 2020458.3305223088, "perf/iters_per_sec": 0.963429608594088, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037958550453186, "data/tokens_consumed": 158064443392, "data/tokens_consumed_B": 158.064443392, "train/loss_slope": -2.0360788231742912e-05} {"step": 75375, "timestamp": 1778276011.5152044, "eos/sharpness": 81.55024051666258, "eos/L0_probe": 1.950434684753418, "eos/L_plus": 2.440558671951294, "eos/L_minus": 2.275813102722168, "eos/grad_norm": 0.18080461025238037, "eos/embed_grad_frac": 0.06544622033834457, "eos/time_s": 0.6102948188781738} {"step": 75375, "timestamp": 1778276012.8974776, "geo/rankme_last": 438.8677062988281, "geo/layer_0/stable_rank_q_proj": 19.045507431030273, "geo/layer_0/stable_rank_k_proj": 15.726161003112793, "geo/layer_0/stable_rank_o_proj": 46.486671447753906, "geo/layer_0/stable_rank_gate_proj": 128.53240966796875, "geo/layer_0/stable_rank_down_proj": 56.515262603759766, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06358213722705841, "geo/layer_0/attn_entropy_mean": 6.144992828369141, "geo/layer_0/attn_entropy_std": 0.4301499128341675, "geo/layer_7/stable_rank_q_proj": 42.785064697265625, "geo/layer_7/stable_rank_k_proj": 40.187564849853516, "geo/layer_7/stable_rank_o_proj": 88.28793334960938, "geo/layer_7/stable_rank_gate_proj": 77.28702545166016, "geo/layer_7/stable_rank_down_proj": 140.31466674804688, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4400073289871216, "geo/layer_7/attn_entropy_mean": 4.625715255737305, "geo/layer_7/attn_entropy_std": 0.7871190905570984, "geo/layer_14/stable_rank_q_proj": 49.6053352355957, "geo/layer_14/stable_rank_k_proj": 41.730140686035156, "geo/layer_14/stable_rank_o_proj": 43.289031982421875, "geo/layer_14/stable_rank_gate_proj": 70.68711853027344, "geo/layer_14/stable_rank_down_proj": 125.64591217041016, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.40404921770095825, "geo/layer_14/attn_entropy_mean": 5.490346431732178, "geo/layer_14/attn_entropy_std": 0.42681393027305603, "geo/layer_21/stable_rank_q_proj": 39.8570556640625, "geo/layer_21/stable_rank_k_proj": 29.98832130432129, "geo/layer_21/stable_rank_o_proj": 68.08242797851562, "geo/layer_21/stable_rank_gate_proj": 63.81126022338867, "geo/layer_21/stable_rank_down_proj": 49.29708480834961, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14761698246002197, "geo/layer_21/attn_entropy_mean": 5.691285133361816, "geo/layer_21/attn_entropy_std": 0.29725468158721924, "geo/layer_27/stable_rank_q_proj": 43.86623764038086, "geo/layer_27/stable_rank_k_proj": 32.35307693481445, "geo/layer_27/stable_rank_o_proj": 115.37461853027344, "geo/layer_27/stable_rank_gate_proj": 77.31996154785156, "geo/layer_27/stable_rank_down_proj": 127.646240234375, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09684804081916809, "geo/layer_27/attn_entropy_mean": 4.169557571411133, "geo/layer_27/attn_entropy_std": 0.7982423901557922, "attnres/final_alpha/block_0": 0.236974835395813, "attnres/block_norm/0": 1.7780219316482544, "attnres/final_alpha/block_1": 0.0038818446919322014, "attnres/block_norm/1": 48300.53125, "attnres/final_alpha/block_2": 0.009674123488366604, "attnres/block_norm/2": 29126.619140625, "attnres/final_alpha/block_3": 0.01158558577299118, "attnres/block_norm/3": 62536.2421875, "attnres/final_alpha/block_4": 0.013578535988926888, "attnres/block_norm/4": 15832.4296875, "attnres/final_alpha/block_5": 0.6199269890785217, "attnres/block_norm/5": 6794.29150390625, "attnres/final_alpha/block_6": 0.10437804460525513, "attnres/block_norm/6": 41120.6015625, "geo/tier1_time_s": 1.362149715423584, "geo/step": 75375.0, "geo/rankme_slope": 4.2184764530812324e-05} {"step": 75380, "timestamp": 1778276018.0873933, "train/loss": 2.1543986201286316, "train/z_loss": 0.0013852191856130957, "train/perplexity": 8.622703099341711, "train/grad_norm": 0.111328125, "optim/muon_lr": 0.016440377831459047, "optim/adamw_lr": 0.0004932113349437713, "perf/tokens_per_sec": 1697238.7383598506, "perf/iters_per_sec": 0.8093064967917684, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2356258153915405, "data/tokens_consumed": 158085414912, "data/tokens_consumed_B": 158.085414912, "train/loss_slope": -1.5832664985897057e-05} {"step": 75390, "timestamp": 1778276028.4633474, "train/loss": 2.1086011290550233, "train/z_loss": 0.0013859005412086844, "train/perplexity": 8.23671112602859, "train/grad_norm": 0.09228515625, "optim/muon_lr": 0.016410754323005677, "optim/adamw_lr": 0.0004923226296901702, "perf/tokens_per_sec": 2022066.8288796456, "perf/iters_per_sec": 0.9641966003797748, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0371328830718993, "data/tokens_consumed": 158106386432, "data/tokens_consumed_B": 158.106386432, "train/loss_slope": -1.652260795928603e-05} {"step": 75400, "timestamp": 1778276038.8315241, "grad/layer_0/attn": 0.0023486085701733828, "grad/layer_0/mlp": 0.0026163009461015463, "grad/layer_0/attn_mlp_ratio": 0.8976828464266212, "grad/layer_4/attn": 0.004137711133807898, "grad/layer_4/mlp": 0.0024526624474674463, "grad/layer_4/attn_mlp_ratio": 1.6870283023974852, "grad/layer_8/attn": 0.005978610832244158, "grad/layer_8/mlp": 0.0037297732196748257, "grad/layer_8/attn_mlp_ratio": 1.6029421414718559, "grad/layer_12/attn": 0.004217091016471386, "grad/layer_12/mlp": 0.006553458049893379, "grad/layer_12/attn_mlp_ratio": 0.643490950886746, "grad/layer_16/attn": 0.007633487693965435, "grad/layer_16/mlp": 0.004615966696292162, "grad/layer_16/attn_mlp_ratio": 1.6537137355704419, "grad/layer_20/attn": 0.0035811818670481443, "grad/layer_20/mlp": 0.00581037113443017, "grad/layer_20/attn_mlp_ratio": 0.6163430394649052, "grad/layer_24/attn": 0.01268700510263443, "grad/layer_24/mlp": 0.009796102531254292, "grad/layer_24/attn_mlp_ratio": 1.295107409568859, "grad/layer_27/attn": 0.006839415058493614, "grad/layer_27/mlp": 0.009279550053179264, "grad/layer_27/attn_mlp_ratio": 0.7370416610281872} {"step": 75400, "timestamp": 1778276038.848371, "train/loss": 2.1620986700057983, "train/z_loss": 0.0014026763383299112, "train/perplexity": 8.689354623996744, "train/grad_norm": 0.1748046875, "optim/muon_lr": 0.01638103485107422, "optim/adamw_lr": 0.0004914310455322265, "perf/tokens_per_sec": 2020550.1797521787, "perf/iters_per_sec": 0.9634734057198423, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0379113674163818, "data/tokens_consumed": 158127357952, "data/tokens_consumed_B": 158.127357952, "train/loss_slope": -9.496535254855252e-06} {"step": 75410, "timestamp": 1778276049.2237399, "train/loss": 2.1084205746650695, "train/z_loss": 0.0013870669296011328, "train/perplexity": 8.235224085925852, "train/grad_norm": 0.1259765625, "optim/muon_lr": 0.01635122060775757, "optim/adamw_lr": 0.000490536618232727, "perf/tokens_per_sec": 2022129.2584889508, "perf/iters_per_sec": 0.9642263691372637, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037100863456726, "data/tokens_consumed": 158148329472, "data/tokens_consumed_B": 158.148329472, "train/loss_slope": -8.798884036410048e-06} {"step": 75420, "timestamp": 1778276059.6063192, "train/loss": 2.117770051956177, "train/z_loss": 0.0013766270945779978, "train/perplexity": 8.312580182547292, "train/grad_norm": 0.10205078125, "optim/muon_lr": 0.01632131040096283, "optim/adamw_lr": 0.0004896393120288849, "perf/tokens_per_sec": 2021044.0066668014, "perf/iters_per_sec": 0.9637088807424552, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0376577615737914, "data/tokens_consumed": 158169300992, "data/tokens_consumed_B": 158.169300992, "train/loss_slope": -1.1886189000369736e-05} {"step": 75430, "timestamp": 1778276069.985285, "train/loss": 2.1088780403137206, "train/z_loss": 0.0013790055061690509, "train/perplexity": 8.238992279898042, "train/grad_norm": 0.298828125, "optim/muon_lr": 0.016291306614875795, "optim/adamw_lr": 0.0004887391984462737, "perf/tokens_per_sec": 2021711.1977500864, "perf/iters_per_sec": 0.9640270222425873, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373153209686279, "data/tokens_consumed": 158190272512, "data/tokens_consumed_B": 158.190272512, "train/loss_slope": -1.261883596024853e-05} {"step": 75440, "timestamp": 1778276080.3588042, "train/loss": 2.138831305503845, "train/z_loss": 0.0013739939080551268, "train/perplexity": 8.489510185113227, "train/grad_norm": 0.11376953125, "optim/muon_lr": 0.016261208653450012, "optim/adamw_lr": 0.00048783625960350033, "perf/tokens_per_sec": 2022696.926516929, "perf/iters_per_sec": 0.9644970543465275, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0368098020553589, "data/tokens_consumed": 158211244032, "data/tokens_consumed_B": 158.211244032, "train/loss_slope": -1.0682220985465248e-05} {"step": 75450, "timestamp": 1778276090.711886, "grad/layer_0/attn": 0.002762122079730034, "grad/layer_0/mlp": 0.0028398630674928427, "grad/layer_0/attn_mlp_ratio": 0.9726250586110309, "grad/layer_4/attn": 0.0020551520865410566, "grad/layer_4/mlp": 0.002634000265970826, "grad/layer_4/attn_mlp_ratio": 0.7802398637038834, "grad/layer_8/attn": 0.0031956275925040245, "grad/layer_8/mlp": 0.003545301267877221, "grad/layer_8/attn_mlp_ratio": 0.90136980214391, "grad/layer_12/attn": 0.004365064203739166, "grad/layer_12/mlp": 0.006634132470935583, "grad/layer_12/attn_mlp_ratio": 0.6579706023456177, "grad/layer_16/attn": 0.0035376206506043673, "grad/layer_16/mlp": 0.004790883976966143, "grad/layer_16/attn_mlp_ratio": 0.738406647660861, "grad/layer_20/attn": 0.003233118914067745, "grad/layer_20/mlp": 0.005178815219551325, "grad/layer_20/attn_mlp_ratio": 0.6242970089823269, "grad/layer_24/attn": 0.0054331435821950436, "grad/layer_24/mlp": 0.007256817538291216, "grad/layer_24/attn_mlp_ratio": 0.7486950689688824, "grad/layer_27/attn": 0.0045509785413742065, "grad/layer_27/mlp": 0.006283656228333712, "grad/layer_27/attn_mlp_ratio": 0.7242564366312227} {"step": 75450, "timestamp": 1778276091.3192832, "eos/sharpness": 7.0435047149658185, "eos/L0_probe": 1.9459551572799683, "eos/L_plus": 1.9779120683670044, "eos/L_minus": 1.9844332933425903, "eos/grad_norm": 0.0896388366818428, "eos/embed_grad_frac": 0.26569676399230957, "eos/time_s": 0.6043930053710938} {"step": 75450, "timestamp": 1778276091.3388288, "train/loss": 2.065763306617737, "train/z_loss": 0.0014024888747371732, "train/perplexity": 7.8913190954407195, "train/grad_norm": 0.08984375, "optim/muon_lr": 0.016231016516685487, "optim/adamw_lr": 0.0004869304955005645, "perf/tokens_per_sec": 1910944.7512757692, "perf/iters_per_sec": 0.9112094646815153, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0974425077438354, "data/tokens_consumed": 158232215552, "data/tokens_consumed_B": 158.232215552, "train/loss_slope": -1.3517759628135662e-05} {"step": 75450, "timestamp": 1778276092.707287, "geo/rankme_last": 439.1377258300781, "geo/layer_0/stable_rank_q_proj": 19.034259796142578, "geo/layer_0/stable_rank_k_proj": 15.776690483093262, "geo/layer_0/stable_rank_o_proj": 46.489593505859375, "geo/layer_0/stable_rank_gate_proj": 128.63265991210938, "geo/layer_0/stable_rank_down_proj": 56.55703353881836, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.060377612709999084, "geo/layer_0/attn_entropy_mean": 6.141895294189453, "geo/layer_0/attn_entropy_std": 0.42974743247032166, "geo/layer_7/stable_rank_q_proj": 42.78178405761719, "geo/layer_7/stable_rank_k_proj": 40.183387756347656, "geo/layer_7/stable_rank_o_proj": 88.24080657958984, "geo/layer_7/stable_rank_gate_proj": 77.22312927246094, "geo/layer_7/stable_rank_down_proj": 140.3409423828125, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.43020832538604736, "geo/layer_7/attn_entropy_mean": 4.625148296356201, "geo/layer_7/attn_entropy_std": 0.7937994003295898, "geo/layer_14/stable_rank_q_proj": 49.64194107055664, "geo/layer_14/stable_rank_k_proj": 41.74144744873047, "geo/layer_14/stable_rank_o_proj": 43.33767318725586, "geo/layer_14/stable_rank_gate_proj": 70.61161041259766, "geo/layer_14/stable_rank_down_proj": 125.76908111572266, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4071045517921448, "geo/layer_14/attn_entropy_mean": 5.524125099182129, "geo/layer_14/attn_entropy_std": 0.4190537631511688, "geo/layer_21/stable_rank_q_proj": 39.8238525390625, "geo/layer_21/stable_rank_k_proj": 29.9648380279541, "geo/layer_21/stable_rank_o_proj": 67.9953384399414, "geo/layer_21/stable_rank_gate_proj": 63.816158294677734, "geo/layer_21/stable_rank_down_proj": 49.2586555480957, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14886820316314697, "geo/layer_21/attn_entropy_mean": 5.678624153137207, "geo/layer_21/attn_entropy_std": 0.3063128590583801, "geo/layer_27/stable_rank_q_proj": 43.8310432434082, "geo/layer_27/stable_rank_k_proj": 32.301673889160156, "geo/layer_27/stable_rank_o_proj": 115.4393081665039, "geo/layer_27/stable_rank_gate_proj": 77.31971740722656, "geo/layer_27/stable_rank_down_proj": 127.76691436767578, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10471992939710617, "geo/layer_27/attn_entropy_mean": 4.15132999420166, "geo/layer_27/attn_entropy_std": 0.7959325909614563, "attnres/final_alpha/block_0": 0.23772448301315308, "attnres/block_norm/0": 1.778110384941101, "attnres/final_alpha/block_1": 0.003972766455262899, "attnres/block_norm/1": 48329.52734375, "attnres/final_alpha/block_2": 0.009827731177210808, "attnres/block_norm/2": 29102.25, "attnres/final_alpha/block_3": 0.011526836082339287, "attnres/block_norm/3": 62738.515625, "attnres/final_alpha/block_4": 0.013408305123448372, "attnres/block_norm/4": 15794.544921875, "attnres/final_alpha/block_5": 0.617821216583252, "attnres/block_norm/5": 6845.14208984375, "attnres/final_alpha/block_6": 0.10571867972612381, "attnres/block_norm/6": 41240.53125, "geo/tier1_time_s": 1.3645343780517578, "geo/step": 75450.0, "geo/rankme_slope": 7.667365774434773e-05} {"step": 75460, "timestamp": 1778276103.0474024, "train/loss": 2.114890694618225, "train/z_loss": 0.0013860817765817047, "train/perplexity": 8.288679719301102, "train/grad_norm": 0.11669921875, "optim/muon_lr": 0.016200731992721557, "optim/adamw_lr": 0.0004860219597816467, "perf/tokens_per_sec": 1791687.457036773, "perf/iters_per_sec": 0.8543431554015984, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1704898595809936, "data/tokens_consumed": 158253187072, "data/tokens_consumed_B": 158.253187072, "train/loss_slope": -1.5091794809707113e-05} {"step": 75470, "timestamp": 1778276114.290212, "train/loss": 2.1621334075927736, "train/z_loss": 0.0013747060555033385, "train/perplexity": 8.689656476451537, "train/grad_norm": 0.09619140625, "optim/muon_lr": 0.01617035448551178, "optim/adamw_lr": 0.00048511063456535336, "perf/tokens_per_sec": 1866230.8571915461, "perf/iters_per_sec": 0.8898882184941989, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1237366437911986, "data/tokens_consumed": 158274158592, "data/tokens_consumed_B": 158.274158592, "train/loss_slope": -1.3375442477986495e-05} {"step": 75480, "timestamp": 1778276124.6757026, "train/loss": 2.1251809358596803, "train/z_loss": 0.0014020485919900238, "train/perplexity": 8.374412582595621, "train/grad_norm": 0.0869140625, "optim/muon_lr": 0.0161398845911026, "optim/adamw_lr": 0.00048419653773307795, "perf/tokens_per_sec": 2021232.6038669567, "perf/iters_per_sec": 0.9637988108954223, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0375609397888184, "data/tokens_consumed": 158295130112, "data/tokens_consumed_B": 158.295130112, "train/loss_slope": -1.4991373004335316e-05} {"step": 75490, "timestamp": 1778276135.0619457, "train/loss": 2.1049697041511535, "train/z_loss": 0.0014010344166308641, "train/perplexity": 8.206854372209921, "train/grad_norm": 0.1005859375, "optim/muon_lr": 0.016109323501586913, "optim/adamw_lr": 0.0004832797050476074, "perf/tokens_per_sec": 2020464.2709980835, "perf/iters_per_sec": 0.9634324412336748, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0379554986953736, "data/tokens_consumed": 158316101632, "data/tokens_consumed_B": 158.316101632, "train/loss_slope": -1.2826024402271603e-05} {"step": 75500, "timestamp": 1778276145.4272017, "grad/layer_0/attn": 0.0027492516674101353, "grad/layer_0/mlp": 0.0029203929007053375, "grad/layer_0/attn_mlp_ratio": 0.9413978415734224, "grad/layer_4/attn": 0.002715494716539979, "grad/layer_4/mlp": 0.0025385210756212473, "grad/layer_4/attn_mlp_ratio": 1.0697152116036304, "grad/layer_8/attn": 0.004107537213712931, "grad/layer_8/mlp": 0.0034855902194976807, "grad/layer_8/attn_mlp_ratio": 1.1784337335160149, "grad/layer_12/attn": 0.008224636316299438, "grad/layer_12/mlp": 0.006530188489705324, "grad/layer_12/attn_mlp_ratio": 1.2594791411178183, "grad/layer_16/attn": 0.005304490216076374, "grad/layer_16/mlp": 0.004675500560551882, "grad/layer_16/attn_mlp_ratio": 1.1345288133164864, "grad/layer_20/attn": 0.0032109927851706743, "grad/layer_20/mlp": 0.005645190831273794, "grad/layer_20/attn_mlp_ratio": 0.5688014496342533, "grad/layer_24/attn": 0.0056266142055392265, "grad/layer_24/mlp": 0.007472222670912743, "grad/layer_24/attn_mlp_ratio": 0.7530040763027102, "grad/layer_27/attn": 0.003812728449702263, "grad/layer_27/mlp": 0.006771347019821405, "grad/layer_27/attn_mlp_ratio": 0.563067936443764} {"step": 75500, "timestamp": 1778276145.4415226, "train/loss": 2.13579740524292, "train/z_loss": 0.0013833286589942872, "train/perplexity": 8.46379288952872, "train/grad_norm": 0.0830078125, "optim/muon_lr": 0.016078670620918275, "optim/adamw_lr": 0.0004823601186275482, "perf/tokens_per_sec": 2021512.4297357197, "perf/iters_per_sec": 0.963932242267475, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0374173164367675, "data/tokens_consumed": 158337073152, "data/tokens_consumed_B": 158.337073152, "train/loss_slope": -1.0941220381365071e-05} {"step": 75500, "timestamp": 1778276152.4750493, "geo/ww_alpha_mean": 7.620821828452015, "geo/ww_alpha_std": 4.586534476580044, "geo/ww_alpha_min": 1.3352123696366234, "geo/ww_alpha_max": 33.58787693559059, "geo/ww_alpha_healthy_frac": 0.17258883248730963, "geo/ww_alpha_by_type/q_proj": 3.916107673300749, "geo/ww_alpha_by_type/k_proj": 4.530432698339312, "geo/ww_alpha_by_type/v_proj": 7.974460511141143, "geo/ww_alpha_by_type/o_proj": 9.30557568035981, "geo/ww_alpha_by_type/gate_proj": 7.892182804917344, "geo/ww_alpha_by_type/up_proj": 11.675242678185386, "geo/ww_alpha_by_type/down_proj": 8.137071294061501, "geo/twonn_id/layer_0": 0.7090638279914856, "geo/twonn_id/layer_7": 3.1507833003997803, "geo/twonn_id/layer_14": 4.315390586853027, "geo/twonn_id/layer_21": 5.96339225769043, "geo/twonn_id/layer_27": 6.637945652008057, "geo/tier2_time_s": 7.025240898132324} {"step": 75500, "timestamp": 1778276153.1435292, "eoc/jacobian_sigma/layer_0/attn": 1273.95849609375, "eoc/jacobian_sigma/layer_0/mlp": 9257.7509765625, "eoc/jacobian_sigma/layer_0": 9257.7509765625, "eoc/jacobian_sigma/layer_7/attn": 1.1553444862365723, "eoc/jacobian_sigma/layer_7/mlp": 1.782202124595642, "eoc/jacobian_sigma/layer_7": 1.782202124595642, "eoc/jacobian_sigma/layer_14/attn": 1.3913103342056274, "eoc/jacobian_sigma/layer_14/mlp": 6.606527328491211, "eoc/jacobian_sigma/layer_14": 6.606527328491211, "eoc/jacobian_sigma/layer_21/attn": 1.0974935293197632, "eoc/jacobian_sigma/layer_21/mlp": 4.464104175567627, "eoc/jacobian_sigma/layer_21": 4.464104175567627, "eoc/jacobian_sigma/layer_27/attn": 2.761761426925659, "eoc/jacobian_sigma/layer_27/mlp": 26.965890884399414, "eoc/jacobian_sigma/layer_27": 26.965890884399414, "eoc/layer0_sigma": 9257.7509765625, "eoc/sigma_max": 26.965890884399414, "eoc/sigma_min": 1.782202124595642, "eoc/sigma_mean": 9.954681128263474, "eoc/time_s": 0.6610479354858398} {"step": 75510, "timestamp": 1778276163.5367568, "train/loss": 2.1094265460968016, "train/z_loss": 0.001394720294047147, "train/perplexity": 8.243512654422776, "train/grad_norm": 0.09130859375, "optim/muon_lr": 0.01604792654514313, "optim/adamw_lr": 0.0004814377963542938, "perf/tokens_per_sec": 1159417.498017471, "perf/iters_per_sec": 0.5528533449256282, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.8087979555130005, "data/tokens_consumed": 158358044672, "data/tokens_consumed_B": 158.358044672, "train/loss_slope": -1.2454882306162767e-05} {"step": 75520, "timestamp": 1778276173.9204473, "train/loss": 2.0890177607536318, "train/z_loss": 0.001388668827712536, "train/perplexity": 8.076977742189046, "train/grad_norm": 0.10400390625, "optim/muon_lr": 0.016017093062400817, "optim/adamw_lr": 0.0004805127918720245, "perf/tokens_per_sec": 2020754.8400272196, "perf/iters_per_sec": 0.9635709953437899, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037806248664856, "data/tokens_consumed": 158379016192, "data/tokens_consumed_B": 158.379016192, "train/loss_slope": -1.579426017352637e-05} {"step": 75525, "timestamp": 1778276180.2112136, "eos/sharpness": 16.176676750183102, "eos/L0_probe": 1.9491307735443115, "eos/L_plus": 2.0315356254577637, "eos/L_minus": 2.0284926891326904, "eos/grad_norm": 0.09186886250972748, "eos/embed_grad_frac": 0.24705147743225098, "eos/time_s": 0.604839563369751} {"step": 75525, "timestamp": 1778276181.595482, "geo/rankme_last": 439.1637268066406, "geo/layer_0/stable_rank_q_proj": 19.02022933959961, "geo/layer_0/stable_rank_k_proj": 15.773124694824219, "geo/layer_0/stable_rank_o_proj": 46.51072692871094, "geo/layer_0/stable_rank_gate_proj": 128.6793670654297, "geo/layer_0/stable_rank_down_proj": 56.61171340942383, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06243559718132019, "geo/layer_0/attn_entropy_mean": 6.142355918884277, "geo/layer_0/attn_entropy_std": 0.4304601550102234, "geo/layer_7/stable_rank_q_proj": 42.828582763671875, "geo/layer_7/stable_rank_k_proj": 40.22200393676758, "geo/layer_7/stable_rank_o_proj": 88.0713882446289, "geo/layer_7/stable_rank_gate_proj": 77.13850402832031, "geo/layer_7/stable_rank_down_proj": 140.15643310546875, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4353940188884735, "geo/layer_7/attn_entropy_mean": 4.642212867736816, "geo/layer_7/attn_entropy_std": 0.7838678956031799, "geo/layer_14/stable_rank_q_proj": 49.655113220214844, "geo/layer_14/stable_rank_k_proj": 41.72290802001953, "geo/layer_14/stable_rank_o_proj": 43.389892578125, "geo/layer_14/stable_rank_gate_proj": 70.55614471435547, "geo/layer_14/stable_rank_down_proj": 125.79592895507812, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4022800326347351, "geo/layer_14/attn_entropy_mean": 5.533469200134277, "geo/layer_14/attn_entropy_std": 0.4188978672027588, "geo/layer_21/stable_rank_q_proj": 39.766517639160156, "geo/layer_21/stable_rank_k_proj": 29.978073120117188, "geo/layer_21/stable_rank_o_proj": 67.94464111328125, "geo/layer_21/stable_rank_gate_proj": 63.73114776611328, "geo/layer_21/stable_rank_down_proj": 49.264225006103516, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15032218396663666, "geo/layer_21/attn_entropy_mean": 5.6806159019470215, "geo/layer_21/attn_entropy_std": 0.2995813190937042, "geo/layer_27/stable_rank_q_proj": 43.839080810546875, "geo/layer_27/stable_rank_k_proj": 32.30263900756836, "geo/layer_27/stable_rank_o_proj": 115.50037384033203, "geo/layer_27/stable_rank_gate_proj": 77.26579284667969, "geo/layer_27/stable_rank_down_proj": 127.70364379882812, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08789999783039093, "geo/layer_27/attn_entropy_mean": 4.15886116027832, "geo/layer_27/attn_entropy_std": 0.7886804938316345, "attnres/final_alpha/block_0": 0.23750242590904236, "attnres/block_norm/0": 1.778300166130066, "attnres/final_alpha/block_1": 0.003906717523932457, "attnres/block_norm/1": 48270.6640625, "attnres/final_alpha/block_2": 0.009746527299284935, "attnres/block_norm/2": 29196.283203125, "attnres/final_alpha/block_3": 0.011573301628232002, "attnres/block_norm/3": 62579.921875, "attnres/final_alpha/block_4": 0.013384050689637661, "attnres/block_norm/4": 15780.79296875, "attnres/final_alpha/block_5": 0.6182345151901245, "attnres/block_norm/5": 6848.8818359375, "attnres/final_alpha/block_6": 0.10565247386693954, "attnres/block_norm/6": 41350.20703125, "geo/tier1_time_s": 1.3642852306365967, "geo/step": 75525.0, "geo/rankme_slope": 0.00014702531403186275} {"step": 75530, "timestamp": 1778276186.783134, "train/loss": 2.1528951644897463, "train/z_loss": 0.0013776799896731973, "train/perplexity": 8.60974898815043, "train/grad_norm": 0.126953125, "optim/muon_lr": 0.01598616898059845, "optim/adamw_lr": 0.00047958506941795346, "perf/tokens_per_sec": 1631026.8730174452, "perf/iters_per_sec": 0.7777342190825678, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2857862949371337, "data/tokens_consumed": 158399987712, "data/tokens_consumed_B": 158.399987712, "train/loss_slope": -1.535370706117015e-05} {"step": 75540, "timestamp": 1778276197.4863842, "train/loss": 2.14910249710083, "train/z_loss": 0.0013763875816948713, "train/perplexity": 8.577156918444176, "train/grad_norm": 0.10888671875, "optim/muon_lr": 0.01595515549182892, "optim/adamw_lr": 0.0004786546647548675, "perf/tokens_per_sec": 1960294.8706512128, "perf/iters_per_sec": 0.9347414353614868, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0698145627975464, "data/tokens_consumed": 158420959232, "data/tokens_consumed_B": 158.420959232, "train/loss_slope": -1.0040259518639116e-05} {"step": 75550, "timestamp": 1778276207.8583806, "grad/layer_0/attn": 0.0029691732488572598, "grad/layer_0/mlp": 0.003168975468724966, "grad/layer_0/attn_mlp_ratio": 0.936950501657513, "grad/layer_4/attn": 0.002172693610191345, "grad/layer_4/mlp": 0.0026493496261537075, "grad/layer_4/attn_mlp_ratio": 0.8200856190268373, "grad/layer_8/attn": 0.004649819806218147, "grad/layer_8/mlp": 0.003778490936383605, "grad/layer_8/attn_mlp_ratio": 1.230602312257565, "grad/layer_12/attn": 0.004726717248558998, "grad/layer_12/mlp": 0.006760068703442812, "grad/layer_12/attn_mlp_ratio": 0.6992114113027584, "grad/layer_16/attn": 0.0034642780665308237, "grad/layer_16/mlp": 0.005002247169613838, "grad/layer_16/attn_mlp_ratio": 0.6925443465329251, "grad/layer_20/attn": 0.0032183562871068716, "grad/layer_20/mlp": 0.005656623747199774, "grad/layer_20/attn_mlp_ratio": 0.5689535620615945, "grad/layer_24/attn": 0.017385192215442657, "grad/layer_24/mlp": 0.01030542142689228, "grad/layer_24/attn_mlp_ratio": 1.6869947697020953, "grad/layer_27/attn": 0.004467361606657505, "grad/layer_27/mlp": 0.00951760821044445, "grad/layer_27/attn_mlp_ratio": 0.46937859396410586} {"step": 75550, "timestamp": 1778276207.872655, "train/loss": 2.1108648061752318, "train/z_loss": 0.0013922528247348963, "train/perplexity": 8.255377499901167, "train/grad_norm": 0.173828125, "optim/muon_lr": 0.015924053192138674, "optim/adamw_lr": 0.00047772159576416013, "perf/tokens_per_sec": 2020102.05867673, "perf/iters_per_sec": 0.9632597249396944, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0381416082382202, "data/tokens_consumed": 158441930752, "data/tokens_consumed_B": 158.441930752, "train/loss_slope": -1.0891244485147257e-05} {"step": 75560, "timestamp": 1778276218.2575002, "train/loss": 2.1023067831993103, "train/z_loss": 0.0013928836560808122, "train/perplexity": 8.185029239951524, "train/grad_norm": 0.09130859375, "optim/muon_lr": 0.01589286208152771, "optim/adamw_lr": 0.00047678586244583125, "perf/tokens_per_sec": 2021046.792875063, "perf/iters_per_sec": 0.9637102093100848, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0376563310623168, "data/tokens_consumed": 158462902272, "data/tokens_consumed_B": 158.462902272, "train/loss_slope": -1.4268954709382417e-05} {"step": 75570, "timestamp": 1778276228.6393392, "train/loss": 2.102091336250305, "train/z_loss": 0.0013905451050959527, "train/perplexity": 8.183265990324454, "train/grad_norm": 0.10498046875, "optim/muon_lr": 0.01586158275604248, "optim/adamw_lr": 0.0004758474826812744, "perf/tokens_per_sec": 2021166.421328549, "perf/iters_per_sec": 0.9637672526018853, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0375949144363403, "data/tokens_consumed": 158483873792, "data/tokens_consumed_B": 158.483873792, "train/loss_slope": -1.4562939596076036e-05} {"step": 75580, "timestamp": 1778276239.0204074, "train/loss": 2.096158838272095, "train/z_loss": 0.0014049238990992308, "train/perplexity": 8.134862500146939, "train/grad_norm": 0.07763671875, "optim/muon_lr": 0.01583021640777588, "optim/adamw_lr": 0.0004749064922332763, "perf/tokens_per_sec": 2021220.1565744292, "perf/iters_per_sec": 0.9637928755638262, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0375673294067382, "data/tokens_consumed": 158504845312, "data/tokens_consumed_B": 158.504845312, "train/loss_slope": -1.5450489388691107e-05} {"step": 75590, "timestamp": 1778276249.4067051, "train/loss": 2.0835452914237975, "train/z_loss": 0.001388860063161701, "train/perplexity": 8.032897453241182, "train/grad_norm": 0.09130859375, "optim/muon_lr": 0.015798762440681458, "optim/adamw_lr": 0.0004739628732204437, "perf/tokens_per_sec": 2020359.7144459842, "perf/iters_per_sec": 0.9633825847845002, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0380092144012452, "data/tokens_consumed": 158525816832, "data/tokens_consumed_B": 158.525816832, "train/loss_slope": -2.0941807356032443e-05} {"step": 75600, "timestamp": 1778276259.7824848, "grad/layer_0/attn": 0.002786658937111497, "grad/layer_0/mlp": 0.0030255047604441643, "grad/layer_0/attn_mlp_ratio": 0.9210558454374443, "grad/layer_4/attn": 0.0025033552665263414, "grad/layer_4/mlp": 0.0025837277062237263, "grad/layer_4/attn_mlp_ratio": 0.9688927991935598, "grad/layer_8/attn": 0.008568290621042252, "grad/layer_8/mlp": 0.003646932076662779, "grad/layer_8/attn_mlp_ratio": 2.3494515954730177, "grad/layer_12/attn": 0.004130121786147356, "grad/layer_12/mlp": 0.006994794122874737, "grad/layer_12/attn_mlp_ratio": 0.5904565101630609, "grad/layer_16/attn": 0.003353823209181428, "grad/layer_16/mlp": 0.004517794586718082, "grad/layer_16/attn_mlp_ratio": 0.7423584827883773, "grad/layer_20/attn": 0.003311256878077984, "grad/layer_20/mlp": 0.00607329560443759, "grad/layer_20/attn_mlp_ratio": 0.5452158167860228, "grad/layer_24/attn": 0.013169432990252972, "grad/layer_24/mlp": 0.008640461601316929, "grad/layer_24/attn_mlp_ratio": 1.524158481976231, "grad/layer_27/attn": 0.010305196978151798, "grad/layer_27/mlp": 0.007473311387002468, "grad/layer_27/attn_mlp_ratio": 1.3789331538066527} {"step": 75600, "timestamp": 1778276260.3738945, "eos/sharpness": 77.09915637969969, "eos/L0_probe": 1.9490368366241455, "eos/L_plus": 2.2794320583343506, "eos/L_minus": 2.3896331787109375, "eos/grad_norm": 0.15489810705184937, "eos/embed_grad_frac": 0.07859934121370316, "eos/time_s": 0.5886728763580322} {"step": 75600, "timestamp": 1778276260.392087, "train/loss": 2.1190908908843995, "train/z_loss": 0.0013836281723342836, "train/perplexity": 8.32356701636796, "train/grad_norm": 0.154296875, "optim/muon_lr": 0.015767221450805665, "optim/adamw_lr": 0.00047301664352416986, "perf/tokens_per_sec": 1910140.2776914153, "perf/iters_per_sec": 0.9108258617837025, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0979047060012816, "data/tokens_consumed": 158546788352, "data/tokens_consumed_B": 158.546788352, "train/loss_slope": -2.037979045001521e-05} {"step": 75600, "timestamp": 1778276261.753379, "geo/rankme_last": 439.36834716796875, "geo/layer_0/stable_rank_q_proj": 18.997461318969727, "geo/layer_0/stable_rank_k_proj": 15.778029441833496, "geo/layer_0/stable_rank_o_proj": 46.5222282409668, "geo/layer_0/stable_rank_gate_proj": 128.59176635742188, "geo/layer_0/stable_rank_down_proj": 56.560577392578125, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06473303586244583, "geo/layer_0/attn_entropy_mean": 6.140419006347656, "geo/layer_0/attn_entropy_std": 0.4311416745185852, "geo/layer_7/stable_rank_q_proj": 42.846805572509766, "geo/layer_7/stable_rank_k_proj": 40.21403884887695, "geo/layer_7/stable_rank_o_proj": 88.03195190429688, "geo/layer_7/stable_rank_gate_proj": 77.06201934814453, "geo/layer_7/stable_rank_down_proj": 139.93177795410156, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.43621012568473816, "geo/layer_7/attn_entropy_mean": 4.637835502624512, "geo/layer_7/attn_entropy_std": 0.8021352291107178, "geo/layer_14/stable_rank_q_proj": 49.67271041870117, "geo/layer_14/stable_rank_k_proj": 41.75209426879883, "geo/layer_14/stable_rank_o_proj": 43.351341247558594, "geo/layer_14/stable_rank_gate_proj": 70.52799224853516, "geo/layer_14/stable_rank_down_proj": 125.66109466552734, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.40146297216415405, "geo/layer_14/attn_entropy_mean": 5.5384521484375, "geo/layer_14/attn_entropy_std": 0.4153042733669281, "geo/layer_21/stable_rank_q_proj": 39.774208068847656, "geo/layer_21/stable_rank_k_proj": 29.994882583618164, "geo/layer_21/stable_rank_o_proj": 68.00724792480469, "geo/layer_21/stable_rank_gate_proj": 63.735496520996094, "geo/layer_21/stable_rank_down_proj": 49.26139831542969, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14653295278549194, "geo/layer_21/attn_entropy_mean": 5.692981243133545, "geo/layer_21/attn_entropy_std": 0.2951542139053345, "geo/layer_27/stable_rank_q_proj": 43.83457565307617, "geo/layer_27/stable_rank_k_proj": 32.31029510498047, "geo/layer_27/stable_rank_o_proj": 115.4482192993164, "geo/layer_27/stable_rank_gate_proj": 77.23814392089844, "geo/layer_27/stable_rank_down_proj": 127.59394836425781, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09234412014484406, "geo/layer_27/attn_entropy_mean": 4.174962520599365, "geo/layer_27/attn_entropy_std": 0.7975298166275024, "attnres/final_alpha/block_0": 0.23938949406147003, "attnres/block_norm/0": 1.7785749435424805, "attnres/final_alpha/block_1": 0.003937716130167246, "attnres/block_norm/1": 48165.796875, "attnres/final_alpha/block_2": 0.009912015870213509, "attnres/block_norm/2": 29176.640625, "attnres/final_alpha/block_3": 0.011666051112115383, "attnres/block_norm/3": 62537.640625, "attnres/final_alpha/block_4": 0.013834461569786072, "attnres/block_norm/4": 15811.068359375, "attnres/final_alpha/block_5": 0.6149866580963135, "attnres/block_norm/5": 6872.203125, "attnres/final_alpha/block_6": 0.10627362132072449, "attnres/block_norm/6": 41435.7265625, "geo/tier1_time_s": 1.3574120998382568, "geo/step": 75600.0, "geo/rankme_slope": 0.00018898772399584835} {"step": 75610, "timestamp": 1778276272.1306767, "train/loss": 2.1059814572334288, "train/z_loss": 0.0014024745672941209, "train/perplexity": 8.215161884283601, "train/grad_norm": 0.177734375, "optim/muon_lr": 0.01573559522628784, "optim/adamw_lr": 0.0004720678567886352, "perf/tokens_per_sec": 1787101.7886143844, "perf/iters_per_sec": 0.8521565383026048, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1734933137893677, "data/tokens_consumed": 158567759872, "data/tokens_consumed_B": 158.567759872, "train/loss_slope": -2.192814639120397e-05} {"step": 75620, "timestamp": 1778276282.5109873, "train/loss": 2.1493431091308595, "train/z_loss": 0.0013850677525624633, "train/perplexity": 8.57922093388572, "train/grad_norm": 0.2001953125, "optim/muon_lr": 0.015703882575035095, "optim/adamw_lr": 0.0004711164772510528, "perf/tokens_per_sec": 2021759.38559759, "perf/iters_per_sec": 0.9640499999988508, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037290596961975, "data/tokens_consumed": 158588731392, "data/tokens_consumed_B": 158.588731392, "train/loss_slope": -2.1216721660626665e-05} {"step": 75630, "timestamp": 1778276292.889788, "train/loss": 2.087018299102783, "train/z_loss": 0.001412214885931462, "train/perplexity": 8.060844269443733, "train/grad_norm": 0.08642578125, "optim/muon_lr": 0.01567208468914032, "optim/adamw_lr": 0.00047016254067420955, "perf/tokens_per_sec": 2021701.1608404396, "perf/iters_per_sec": 0.9640222362711142, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373204708099366, "data/tokens_consumed": 158609702912, "data/tokens_consumed_B": 158.609702912, "train/loss_slope": -2.1602588675596595e-05} {"step": 75640, "timestamp": 1778276303.2704341, "train/loss": 2.09913227558136, "train/z_loss": 0.0013986101257614791, "train/perplexity": 8.159087000979842, "train/grad_norm": 0.1123046875, "optim/muon_lr": 0.015640202164649963, "optim/adamw_lr": 0.0004692060649394989, "perf/tokens_per_sec": 2021359.036173993, "perf/iters_per_sec": 0.9638590985174146, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0374960422515869, "data/tokens_consumed": 158630674432, "data/tokens_consumed_B": 158.630674432, "train/loss_slope": -2.290115779203736e-05} {"step": 75650, "timestamp": 1778276313.6346745, "grad/layer_0/attn": 0.002418426563963294, "grad/layer_0/mlp": 0.002889376599341631, "grad/layer_0/attn_mlp_ratio": 0.8370063219912977, "grad/layer_4/attn": 0.002538127824664116, "grad/layer_4/mlp": 0.0026108904276043177, "grad/layer_4/attn_mlp_ratio": 0.9721310785837632, "grad/layer_8/attn": 0.003034183057025075, "grad/layer_8/mlp": 0.0035528833977878094, "grad/layer_8/attn_mlp_ratio": 0.8540057840101705, "grad/layer_12/attn": 0.004369144793599844, "grad/layer_12/mlp": 0.006581498775631189, "grad/layer_12/attn_mlp_ratio": 0.6638525472939213, "grad/layer_16/attn": 0.0034143372904509306, "grad/layer_16/mlp": 0.004590342752635479, "grad/layer_16/attn_mlp_ratio": 0.7438087742162088, "grad/layer_20/attn": 0.00424187770113349, "grad/layer_20/mlp": 0.005380998365581036, "grad/layer_20/attn_mlp_ratio": 0.7883068037031034, "grad/layer_24/attn": 0.009357750415802002, "grad/layer_24/mlp": 0.009281955659389496, "grad/layer_24/attn_mlp_ratio": 1.008165806687436, "grad/layer_27/attn": 0.004672184120863676, "grad/layer_27/mlp": 0.0071821617893874645, "grad/layer_27/attn_mlp_ratio": 0.6505261497610362} {"step": 75650, "timestamp": 1778276313.6490445, "train/loss": 2.1311323165893556, "train/z_loss": 0.0013948376406915485, "train/perplexity": 8.424400501286899, "train/grad_norm": 0.107421875, "optim/muon_lr": 0.015608235597610475, "optim/adamw_lr": 0.00046824706792831416, "perf/tokens_per_sec": 2022143.2510597634, "perf/iters_per_sec": 0.9642330413149659, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0370936870574952, "data/tokens_consumed": 158651645952, "data/tokens_consumed_B": 158.651645952, "train/loss_slope": -1.8705302396408666e-05} {"step": 75660, "timestamp": 1778276324.4619205, "train/loss": 2.1381243228912354, "train/z_loss": 0.0013891255483031272, "train/perplexity": 8.483510370155113, "train/grad_norm": 0.08056640625, "optim/muon_lr": 0.015576184988021852, "optim/adamw_lr": 0.0004672855496406555, "perf/tokens_per_sec": 1940534.288610874, "perf/iters_per_sec": 0.9253188555769319, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0807085514068604, "data/tokens_consumed": 158672617472, "data/tokens_consumed_B": 158.672617472, "train/loss_slope": -1.7638412574873407e-05} {"step": 75670, "timestamp": 1778276334.850385, "train/loss": 2.131764316558838, "train/z_loss": 0.001394108752720058, "train/perplexity": 8.42972640495482, "train/grad_norm": 0.1923828125, "optim/muon_lr": 0.015544050931930542, "optim/adamw_lr": 0.0004663215279579162, "perf/tokens_per_sec": 2020013.1727092308, "perf/iters_per_sec": 0.963217340807548, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.038187289237976, "data/tokens_consumed": 158693588992, "data/tokens_consumed_B": 158.693588992, "train/loss_slope": -1.7702386231169254e-05} {"step": 75675, "timestamp": 1778276340.6233237, "eos/sharpness": 20.910120010375973, "eos/L0_probe": 1.9457753896713257, "eos/L_plus": 2.066594123840332, "eos/L_minus": 2.034057855606079, "eos/grad_norm": 0.09292451292276382, "eos/embed_grad_frac": 0.24751198291778564, "eos/time_s": 0.5904843807220459} {"step": 75675, "timestamp": 1778276341.99983, "geo/rankme_last": 439.53533935546875, "geo/layer_0/stable_rank_q_proj": 18.99445152282715, "geo/layer_0/stable_rank_k_proj": 15.775903701782227, "geo/layer_0/stable_rank_o_proj": 46.41917037963867, "geo/layer_0/stable_rank_gate_proj": 128.3600311279297, "geo/layer_0/stable_rank_down_proj": 56.513404846191406, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06802411377429962, "geo/layer_0/attn_entropy_mean": 6.141414642333984, "geo/layer_0/attn_entropy_std": 0.4349866509437561, "geo/layer_7/stable_rank_q_proj": 42.829315185546875, "geo/layer_7/stable_rank_k_proj": 40.25967025756836, "geo/layer_7/stable_rank_o_proj": 87.9953384399414, "geo/layer_7/stable_rank_gate_proj": 77.04979705810547, "geo/layer_7/stable_rank_down_proj": 139.78390502929688, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4447983205318451, "geo/layer_7/attn_entropy_mean": 4.650183200836182, "geo/layer_7/attn_entropy_std": 0.7972138524055481, "geo/layer_14/stable_rank_q_proj": 49.66094207763672, "geo/layer_14/stable_rank_k_proj": 41.669368743896484, "geo/layer_14/stable_rank_o_proj": 43.28642272949219, "geo/layer_14/stable_rank_gate_proj": 70.50215148925781, "geo/layer_14/stable_rank_down_proj": 125.67581176757812, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.40552330017089844, "geo/layer_14/attn_entropy_mean": 5.562901020050049, "geo/layer_14/attn_entropy_std": 0.4201929271221161, "geo/layer_21/stable_rank_q_proj": 39.72489547729492, "geo/layer_21/stable_rank_k_proj": 30.011276245117188, "geo/layer_21/stable_rank_o_proj": 68.08665466308594, "geo/layer_21/stable_rank_gate_proj": 63.673988342285156, "geo/layer_21/stable_rank_down_proj": 49.28245544433594, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14573371410369873, "geo/layer_21/attn_entropy_mean": 5.686366558074951, "geo/layer_21/attn_entropy_std": 0.30116018652915955, "geo/layer_27/stable_rank_q_proj": 43.82969665527344, "geo/layer_27/stable_rank_k_proj": 32.30398178100586, "geo/layer_27/stable_rank_o_proj": 115.3584976196289, "geo/layer_27/stable_rank_gate_proj": 77.21546173095703, "geo/layer_27/stable_rank_down_proj": 127.44036865234375, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09511851519346237, "geo/layer_27/attn_entropy_mean": 4.188065528869629, "geo/layer_27/attn_entropy_std": 0.7902740240097046, "attnres/final_alpha/block_0": 0.2379201054573059, "attnres/block_norm/0": 1.7785234451293945, "attnres/final_alpha/block_1": 0.003913551568984985, "attnres/block_norm/1": 48233.59375, "attnres/final_alpha/block_2": 0.009629601612687111, "attnres/block_norm/2": 29193.490234375, "attnres/final_alpha/block_3": 0.011397721245884895, "attnres/block_norm/3": 62858.171875, "attnres/final_alpha/block_4": 0.013306953012943268, "attnres/block_norm/4": 15786.3984375, "attnres/final_alpha/block_5": 0.618855893611908, "attnres/block_norm/5": 6831.84765625, "attnres/final_alpha/block_6": 0.10497617721557617, "attnres/block_norm/6": 41312.84765625, "geo/tier1_time_s": 1.3580706119537354, "geo/step": 75675.0, "geo/rankme_slope": 0.0002182459507240396} {"step": 75680, "timestamp": 1778276347.1874123, "train/loss": 2.077087414264679, "train/z_loss": 0.0014105242909863592, "train/perplexity": 7.9811891309573575, "train/grad_norm": 0.0888671875, "optim/muon_lr": 0.015511834621429443, "optim/adamw_lr": 0.00046535503864288325, "perf/tokens_per_sec": 1700883.5378158141, "perf/iters_per_sec": 0.8110444726065703, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2329780101776122, "data/tokens_consumed": 158714560512, "data/tokens_consumed_B": 158.714560512, "train/loss_slope": -2.185627467776079e-05} {"step": 75690, "timestamp": 1778276357.5668356, "train/loss": 2.0425467133522033, "train/z_loss": 0.0013954579830169679, "train/perplexity": 7.710219936877322, "train/grad_norm": 0.1044921875, "optim/muon_lr": 0.01547953486442566, "optim/adamw_lr": 0.00046438604593276975, "perf/tokens_per_sec": 2021430.4342180153, "perf/iters_per_sec": 0.9638931437578274, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037459397315979, "data/tokens_consumed": 158735532032, "data/tokens_consumed_B": 158.735532032, "train/loss_slope": -2.649356577799128e-05} {"step": 75700, "timestamp": 1778276367.93579, "grad/layer_0/attn": 0.0029322246555238962, "grad/layer_0/mlp": 0.003007735125720501, "grad/layer_0/attn_mlp_ratio": 0.9748945420624529, "grad/layer_4/attn": 0.002247519325464964, "grad/layer_4/mlp": 0.0026062913239002228, "grad/layer_4/attn_mlp_ratio": 0.8623438288039297, "grad/layer_8/attn": 0.003711988916620612, "grad/layer_8/mlp": 0.0037170930299907923, "grad/layer_8/attn_mlp_ratio": 0.9986268266110963, "grad/layer_12/attn": 0.00438135489821434, "grad/layer_12/mlp": 0.0061950902454555035, "grad/layer_12/attn_mlp_ratio": 0.7072301861470585, "grad/layer_16/attn": 0.004470800049602985, "grad/layer_16/mlp": 0.004697634372860193, "grad/layer_16/attn_mlp_ratio": 0.9517130537576506, "grad/layer_20/attn": 0.0040946840308606625, "grad/layer_20/mlp": 0.005543719977140427, "grad/layer_20/attn_mlp_ratio": 0.7386166642405205, "grad/layer_24/attn": 0.007862462662160397, "grad/layer_24/mlp": 0.007825695909559727, "grad/layer_24/attn_mlp_ratio": 1.0046981958608865, "grad/layer_27/attn": 0.0035529187880456448, "grad/layer_27/mlp": 0.006679450627416372, "grad/layer_27/attn_mlp_ratio": 0.5319178077715864} {"step": 75700, "timestamp": 1778276367.9501586, "train/loss": 2.139159345626831, "train/z_loss": 0.0013822725741192698, "train/perplexity": 8.492295541907858, "train/grad_norm": 0.10400390625, "optim/muon_lr": 0.015447154045104982, "optim/adamw_lr": 0.00046341462135314937, "perf/tokens_per_sec": 2021126.8996480207, "perf/iters_per_sec": 0.9637484071960548, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0376152038574218, "data/tokens_consumed": 158756503552, "data/tokens_consumed_B": 158.756503552, "train/loss_slope": -2.3918764926705602e-05} {"step": 75710, "timestamp": 1778276378.32299, "train/loss": 2.0982228755950927, "train/z_loss": 0.0013882822124287487, "train/perplexity": 8.151670500167192, "train/grad_norm": 0.09228515625, "optim/muon_lr": 0.01541469156742096, "optim/adamw_lr": 0.00046244074702262876, "perf/tokens_per_sec": 2022937.1467720822, "perf/iters_per_sec": 0.9646116002903377, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0366866827011108, "data/tokens_consumed": 158777475072, "data/tokens_consumed_B": 158.777475072, "train/loss_slope": -2.4759651389237964e-05} {"step": 75720, "timestamp": 1778276388.6953743, "train/loss": 2.1270800828933716, "train/z_loss": 0.0013853114331141114, "train/perplexity": 8.390331935221651, "train/grad_norm": 0.0888671875, "optim/muon_lr": 0.015382148027420044, "optim/adamw_lr": 0.00046146444082260127, "perf/tokens_per_sec": 2022915.6995382851, "perf/iters_per_sec": 0.9646013734523226, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0366976737976075, "data/tokens_consumed": 158798446592, "data/tokens_consumed_B": 158.798446592, "train/loss_slope": -2.35527061750823e-05} {"step": 75730, "timestamp": 1778276399.0712333, "train/loss": 2.095510411262512, "train/z_loss": 0.001386218296829611, "train/perplexity": 8.129589345395347, "train/grad_norm": 0.142578125, "optim/muon_lr": 0.015349524021148682, "optim/adamw_lr": 0.0004604857206344604, "perf/tokens_per_sec": 2022291.416838111, "perf/iters_per_sec": 0.9643036922636561, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0370177030563354, "data/tokens_consumed": 158819418112, "data/tokens_consumed_B": 158.819418112, "train/loss_slope": -2.611170032522974e-05} {"step": 75740, "timestamp": 1778276409.453717, "train/loss": 2.0893708944320677, "train/z_loss": 0.0013980870833620429, "train/perplexity": 8.079830498722348, "train/grad_norm": 0.1962890625, "optim/muon_lr": 0.01531682014465332, "optim/adamw_lr": 0.0004595046043395996, "perf/tokens_per_sec": 2021052.1795661517, "perf/iters_per_sec": 0.9637127778845557, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0376535654067993, "data/tokens_consumed": 158840389632, "data/tokens_consumed_B": 158.840389632, "train/loss_slope": -2.6686885469209533e-05} {"step": 75750, "timestamp": 1778276419.8178473, "grad/layer_0/attn": 0.002581313019618392, "grad/layer_0/mlp": 0.0029347864910960197, "grad/layer_0/attn_mlp_ratio": 0.8795573168590701, "grad/layer_4/attn": 0.0023872852325439453, "grad/layer_4/mlp": 0.0027294692117720842, "grad/layer_4/attn_mlp_ratio": 0.8746334762760242, "grad/layer_8/attn": 0.004976377822458744, "grad/layer_8/mlp": 0.003708644537255168, "grad/layer_8/attn_mlp_ratio": 1.3418319384037418, "grad/layer_12/attn": 0.00499036954715848, "grad/layer_12/mlp": 0.007264019455760717, "grad/layer_12/attn_mlp_ratio": 0.6869983634888313, "grad/layer_16/attn": 0.003180102678015828, "grad/layer_16/mlp": 0.004476058296859264, "grad/layer_16/attn_mlp_ratio": 0.71046943450229, "grad/layer_20/attn": 0.005720853805541992, "grad/layer_20/mlp": 0.005231624934822321, "grad/layer_20/attn_mlp_ratio": 1.0935137299526068, "grad/layer_24/attn": 0.005394801963120699, "grad/layer_24/mlp": 0.007161612622439861, "grad/layer_24/attn_mlp_ratio": 0.7532942888990464, "grad/layer_27/attn": 0.004121742676943541, "grad/layer_27/mlp": 0.00603602547198534, "grad/layer_27/attn_mlp_ratio": 0.6828570601280335} {"step": 75750, "timestamp": 1778276420.4094365, "eos/sharpness": 9.290182590484617, "eos/L0_probe": 1.9423960447311401, "eos/L_plus": 1.9881718158721924, "eos/L_minus": 1.989522099494934, "eos/grad_norm": 0.07817647606134415, "eos/embed_grad_frac": 0.33625689148902893, "eos/time_s": 0.5887720584869385} {"step": 75750, "timestamp": 1778276420.4279425, "train/loss": 2.097247588634491, "train/z_loss": 0.0013947587693110108, "train/perplexity": 8.143724157833157, "train/grad_norm": 0.078125, "optim/muon_lr": 0.015284036993980408, "optim/adamw_lr": 0.0004585211098194122, "perf/tokens_per_sec": 1911942.6732371068, "perf/iters_per_sec": 0.911685310953668, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.096869707107544, "data/tokens_consumed": 158861361152, "data/tokens_consumed_B": 158.861361152, "train/loss_slope": -2.9034413269894573e-05} {"step": 75750, "timestamp": 1778276421.7878897, "geo/rankme_last": 438.98992919921875, "geo/layer_0/stable_rank_q_proj": 18.989498138427734, "geo/layer_0/stable_rank_k_proj": 15.782164573669434, "geo/layer_0/stable_rank_o_proj": 46.453521728515625, "geo/layer_0/stable_rank_gate_proj": 128.55795288085938, "geo/layer_0/stable_rank_down_proj": 56.541107177734375, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06073416769504547, "geo/layer_0/attn_entropy_mean": 6.1384406089782715, "geo/layer_0/attn_entropy_std": 0.43482765555381775, "geo/layer_7/stable_rank_q_proj": 42.83233642578125, "geo/layer_7/stable_rank_k_proj": 40.34150314331055, "geo/layer_7/stable_rank_o_proj": 87.98838806152344, "geo/layer_7/stable_rank_gate_proj": 77.0016860961914, "geo/layer_7/stable_rank_down_proj": 139.71292114257812, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4396827816963196, "geo/layer_7/attn_entropy_mean": 4.631096363067627, "geo/layer_7/attn_entropy_std": 0.774163544178009, "geo/layer_14/stable_rank_q_proj": 49.58103561401367, "geo/layer_14/stable_rank_k_proj": 41.661109924316406, "geo/layer_14/stable_rank_o_proj": 43.319793701171875, "geo/layer_14/stable_rank_gate_proj": 70.46508026123047, "geo/layer_14/stable_rank_down_proj": 125.43546295166016, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39663636684417725, "geo/layer_14/attn_entropy_mean": 5.532046318054199, "geo/layer_14/attn_entropy_std": 0.4367745518684387, "geo/layer_21/stable_rank_q_proj": 39.77810287475586, "geo/layer_21/stable_rank_k_proj": 30.042369842529297, "geo/layer_21/stable_rank_o_proj": 68.07492065429688, "geo/layer_21/stable_rank_gate_proj": 63.62790298461914, "geo/layer_21/stable_rank_down_proj": 49.262786865234375, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14364053308963776, "geo/layer_21/attn_entropy_mean": 5.687216758728027, "geo/layer_21/attn_entropy_std": 0.297038733959198, "geo/layer_27/stable_rank_q_proj": 43.78972244262695, "geo/layer_27/stable_rank_k_proj": 32.304073333740234, "geo/layer_27/stable_rank_o_proj": 115.42263793945312, "geo/layer_27/stable_rank_gate_proj": 77.22027587890625, "geo/layer_27/stable_rank_down_proj": 127.43061065673828, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09930747747421265, "geo/layer_27/attn_entropy_mean": 4.173585891723633, "geo/layer_27/attn_entropy_std": 0.7921008467674255, "attnres/final_alpha/block_0": 0.2402614951133728, "attnres/block_norm/0": 1.7788535356521606, "attnres/final_alpha/block_1": 0.003962542396038771, "attnres/block_norm/1": 48235.3046875, "attnres/final_alpha/block_2": 0.009850619360804558, "attnres/block_norm/2": 29197.625, "attnres/final_alpha/block_3": 0.011814238503575325, "attnres/block_norm/3": 62789.03515625, "attnres/final_alpha/block_4": 0.013606581836938858, "attnres/block_norm/4": 15789.982421875, "attnres/final_alpha/block_5": 0.614612877368927, "attnres/block_norm/5": 6842.1396484375, "attnres/final_alpha/block_6": 0.105891652405262, "attnres/block_norm/6": 41108.25, "geo/tier1_time_s": 1.3562676906585693, "geo/step": 75750.0, "geo/rankme_slope": 0.0002332808904811925} {"step": 75760, "timestamp": 1778276432.1654854, "train/loss": 2.146471381187439, "train/z_loss": 0.0013847874477505683, "train/perplexity": 8.554619087209634, "train/grad_norm": 0.09423828125, "optim/muon_lr": 0.015251175165176392, "optim/adamw_lr": 0.0004575352549552917, "perf/tokens_per_sec": 1787232.3999717166, "perf/iters_per_sec": 0.8522188186510642, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1734075546264648, "data/tokens_consumed": 158882332672, "data/tokens_consumed_B": 158.882332672, "train/loss_slope": -2.7422137071590704e-05} {"step": 75770, "timestamp": 1778276442.5429125, "train/loss": 2.082195746898651, "train/z_loss": 0.0013996354769915341, "train/perplexity": 8.02206401221183, "train/grad_norm": 0.09814453125, "optim/muon_lr": 0.015218233466148376, "optim/adamw_lr": 0.00045654700398445124, "perf/tokens_per_sec": 2021950.9050428707, "perf/iters_per_sec": 0.9641413235868791, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0371923446655273, "data/tokens_consumed": 158903304192, "data/tokens_consumed_B": 158.903304192, "train/loss_slope": -2.6473587166608604e-05} {"step": 75780, "timestamp": 1778276452.9148011, "train/loss": 2.105561923980713, "train/z_loss": 0.0014054899220354855, "train/perplexity": 8.211716073563332, "train/grad_norm": 0.0791015625, "optim/muon_lr": 0.015185214877128601, "optim/adamw_lr": 0.00045555644631385797, "perf/tokens_per_sec": 2022923.096689021, "perf/iters_per_sec": 0.9646049006886582, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0366938829421997, "data/tokens_consumed": 158924275712, "data/tokens_consumed_B": 158.924275712, "train/loss_slope": -2.6570333491708012e-05} {"step": 75790, "timestamp": 1778276463.290205, "train/loss": 2.076550245285034, "train/z_loss": 0.0014041623566299678, "train/perplexity": 7.976903035017482, "train/grad_norm": 0.07568359375, "optim/muon_lr": 0.015152118802070617, "optim/adamw_lr": 0.00045456356406211847, "perf/tokens_per_sec": 2022421.2366441654, "perf/iters_per_sec": 0.9643655951710536, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0369511365890502, "data/tokens_consumed": 158945247232, "data/tokens_consumed_B": 158.945247232, "train/loss_slope": -2.9820210219073437e-05} {"step": 75800, "timestamp": 1778276473.656119, "grad/layer_0/attn": 0.002775255125015974, "grad/layer_0/mlp": 0.0029343911446630955, "grad/layer_0/attn_mlp_ratio": 0.9457686087577623, "grad/layer_4/attn": 0.0023352974094450474, "grad/layer_4/mlp": 0.0026345946826040745, "grad/layer_4/attn_mlp_ratio": 0.8863971889964813, "grad/layer_8/attn": 0.008280609734356403, "grad/layer_8/mlp": 0.0038010491989552975, "grad/layer_8/attn_mlp_ratio": 2.1785062710531786, "grad/layer_12/attn": 0.005083201453089714, "grad/layer_12/mlp": 0.006607408635318279, "grad/layer_12/attn_mlp_ratio": 0.769318450956229, "grad/layer_16/attn": 0.003614136716350913, "grad/layer_16/mlp": 0.004584429785609245, "grad/layer_16/attn_mlp_ratio": 0.7883503088782903, "grad/layer_20/attn": 0.0030803994741290808, "grad/layer_20/mlp": 0.005618661176413298, "grad/layer_20/attn_mlp_ratio": 0.5482443811055773, "grad/layer_24/attn": 0.006183055229485035, "grad/layer_24/mlp": 0.00838675070554018, "grad/layer_24/attn_mlp_ratio": 0.7372408424726997, "grad/layer_27/attn": 0.005017763935029507, "grad/layer_27/mlp": 0.006858567241579294, "grad/layer_27/attn_mlp_ratio": 0.7316052588140204} {"step": 75800, "timestamp": 1778276473.6705418, "train/loss": 2.1374582767486574, "train/z_loss": 0.0013837406411767005, "train/perplexity": 8.477861842096528, "train/grad_norm": 0.1005859375, "optim/muon_lr": 0.015118945837020874, "optim/adamw_lr": 0.00045356837511062616, "perf/tokens_per_sec": 2021470.3858772877, "perf/iters_per_sec": 0.9639121941935004, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0374388933181762, "data/tokens_consumed": 158966218752, "data/tokens_consumed_B": 158.966218752, "train/loss_slope": -2.7252692529613746e-05} {"step": 75810, "timestamp": 1778276484.0442314, "train/loss": 2.0879002809524536, "train/z_loss": 0.0013984647463075816, "train/perplexity": 8.067956923937414, "train/grad_norm": 0.1083984375, "optim/muon_lr": 0.01508569598197937, "optim/adamw_lr": 0.00045257087945938104, "perf/tokens_per_sec": 2022601.6729351096, "perf/iters_per_sec": 0.9644516338992641, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0368586301803588, "data/tokens_consumed": 158987190272, "data/tokens_consumed_B": 158.987190272, "train/loss_slope": -2.7191145671154394e-05} {"step": 75820, "timestamp": 1778276494.4254758, "train/loss": 2.1472012996673584, "train/z_loss": 0.0013899931684136391, "train/perplexity": 8.560865541193825, "train/grad_norm": 0.08837890625, "optim/muon_lr": 0.015052369236946106, "optim/adamw_lr": 0.00045157107710838316, "perf/tokens_per_sec": 2021262.143577767, "perf/iters_per_sec": 0.9638128965271793, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0375457763671876, "data/tokens_consumed": 159008161792, "data/tokens_consumed_B": 159.008161792, "train/loss_slope": -2.8149306005162654e-05} {"step": 75825, "timestamp": 1778276500.1936464, "eos/sharpness": 34.09075736999511, "eos/L0_probe": 1.9426342248916626, "eos/L_plus": 2.1016201972961426, "eos/L_minus": 2.124555826187134, "eos/grad_norm": 0.114833302795887, "eos/embed_grad_frac": 0.15337178111076355, "eos/time_s": 0.5901896953582764} {"step": 75825, "timestamp": 1778276501.5717402, "geo/rankme_last": 439.30487060546875, "geo/layer_0/stable_rank_q_proj": 18.984817504882812, "geo/layer_0/stable_rank_k_proj": 15.791200637817383, "geo/layer_0/stable_rank_o_proj": 46.39013671875, "geo/layer_0/stable_rank_gate_proj": 128.53762817382812, "geo/layer_0/stable_rank_down_proj": 56.525596618652344, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06167295575141907, "geo/layer_0/attn_entropy_mean": 6.134613037109375, "geo/layer_0/attn_entropy_std": 0.43419891595840454, "geo/layer_7/stable_rank_q_proj": 42.83371353149414, "geo/layer_7/stable_rank_k_proj": 40.347347259521484, "geo/layer_7/stable_rank_o_proj": 87.94579315185547, "geo/layer_7/stable_rank_gate_proj": 77.09330749511719, "geo/layer_7/stable_rank_down_proj": 139.7332763671875, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4522927403450012, "geo/layer_7/attn_entropy_mean": 4.659186840057373, "geo/layer_7/attn_entropy_std": 0.7930514216423035, "geo/layer_14/stable_rank_q_proj": 49.52251434326172, "geo/layer_14/stable_rank_k_proj": 41.71317672729492, "geo/layer_14/stable_rank_o_proj": 43.317562103271484, "geo/layer_14/stable_rank_gate_proj": 70.40931701660156, "geo/layer_14/stable_rank_down_proj": 125.22201538085938, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4019232988357544, "geo/layer_14/attn_entropy_mean": 5.537507057189941, "geo/layer_14/attn_entropy_std": 0.42113959789276123, "geo/layer_21/stable_rank_q_proj": 39.76241683959961, "geo/layer_21/stable_rank_k_proj": 30.082576751708984, "geo/layer_21/stable_rank_o_proj": 68.0816421508789, "geo/layer_21/stable_rank_gate_proj": 63.628883361816406, "geo/layer_21/stable_rank_down_proj": 49.239444732666016, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14465557038784027, "geo/layer_21/attn_entropy_mean": 5.6725850105285645, "geo/layer_21/attn_entropy_std": 0.2999221682548523, "geo/layer_27/stable_rank_q_proj": 43.796443939208984, "geo/layer_27/stable_rank_k_proj": 32.359771728515625, "geo/layer_27/stable_rank_o_proj": 115.31426239013672, "geo/layer_27/stable_rank_gate_proj": 77.11865997314453, "geo/layer_27/stable_rank_down_proj": 127.45037841796875, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09653928130865097, "geo/layer_27/attn_entropy_mean": 4.1496992111206055, "geo/layer_27/attn_entropy_std": 0.7871867418289185, "attnres/final_alpha/block_0": 0.24082428216934204, "attnres/block_norm/0": 1.7787721157073975, "attnres/final_alpha/block_1": 0.003944706171751022, "attnres/block_norm/1": 48175.109375, "attnres/final_alpha/block_2": 0.009778618812561035, "attnres/block_norm/2": 29145.9140625, "attnres/final_alpha/block_3": 0.011454479768872261, "attnres/block_norm/3": 62897.94140625, "attnres/final_alpha/block_4": 0.013396915048360825, "attnres/block_norm/4": 15779.833984375, "attnres/final_alpha/block_5": 0.613491415977478, "attnres/block_norm/5": 6888.34619140625, "attnres/final_alpha/block_6": 0.10710953921079636, "attnres/block_norm/6": 41200.58203125, "geo/tier1_time_s": 1.3598484992980957, "geo/step": 75825.0, "geo/rankme_slope": 0.0002685496268820028} {"step": 75830, "timestamp": 1778276506.7603343, "train/loss": 2.1439385414123535, "train/z_loss": 0.0013929629465565085, "train/perplexity": 8.532979024699289, "train/grad_norm": 0.1630859375, "optim/muon_lr": 0.015018967986106873, "optim/adamw_lr": 0.0004505690395832061, "perf/tokens_per_sec": 1700975.0417635622, "perf/iters_per_sec": 0.8110881050889789, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2329116821289063, "data/tokens_consumed": 159029133312, "data/tokens_consumed_B": 159.029133312, "train/loss_slope": -2.7530519947288972e-05} {"step": 75840, "timestamp": 1778276517.1334043, "train/loss": 2.1062058091163633, "train/z_loss": 0.001388227753341198, "train/perplexity": 8.217005178086438, "train/grad_norm": 0.08056640625, "optim/muon_lr": 0.014985491931438446, "optim/adamw_lr": 0.00044956475794315334, "perf/tokens_per_sec": 2022980.7404905157, "perf/iters_per_sec": 0.964632387395151, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0366643428802491, "data/tokens_consumed": 159050104832, "data/tokens_consumed_B": 159.050104832, "train/loss_slope": -3.0004295099615008e-05} {"step": 75850, "timestamp": 1778276527.5000806, "grad/layer_0/attn": 0.0024389110039919615, "grad/layer_0/mlp": 0.0027652420103549957, "grad/layer_0/attn_mlp_ratio": 0.8819882334566572, "grad/layer_4/attn": 0.0020364064257591963, "grad/layer_4/mlp": 0.002515234751626849, "grad/layer_4/attn_mlp_ratio": 0.8096287408080611, "grad/layer_8/attn": 0.005157931707799435, "grad/layer_8/mlp": 0.003336389549076557, "grad/layer_8/attn_mlp_ratio": 1.5459620279145279, "grad/layer_12/attn": 0.0038888929411768913, "grad/layer_12/mlp": 0.00644178269430995, "grad/layer_12/attn_mlp_ratio": 0.6036982408987718, "grad/layer_16/attn": 0.0030622491613030434, "grad/layer_16/mlp": 0.004192017484456301, "grad/layer_16/attn_mlp_ratio": 0.7304953043750683, "grad/layer_20/attn": 0.0029899843502789736, "grad/layer_20/mlp": 0.005223769228905439, "grad/layer_20/attn_mlp_ratio": 0.5723806244150292, "grad/layer_24/attn": 0.004390480928122997, "grad/layer_24/mlp": 0.006950220093131065, "grad/layer_24/attn_mlp_ratio": 0.6317038606146795, "grad/layer_27/attn": 0.004042732063680887, "grad/layer_27/mlp": 0.005987795535475016, "grad/layer_27/attn_mlp_ratio": 0.6751619978025811} {"step": 75850, "timestamp": 1778276527.514498, "train/loss": 2.1334025859832764, "train/z_loss": 0.0013800106476992368, "train/perplexity": 8.443547886544465, "train/grad_norm": 0.07861328125, "optim/muon_lr": 0.014951941370964051, "optim/adamw_lr": 0.00044855824112892146, "perf/tokens_per_sec": 2021160.1051865302, "perf/iters_per_sec": 0.9637642408306742, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037598156929016, "data/tokens_consumed": 159071076352, "data/tokens_consumed_B": 159.071076352, "train/loss_slope": -2.4628351530870004e-05} {"step": 75860, "timestamp": 1778276537.8932598, "train/loss": 2.078600859642029, "train/z_loss": 0.0014056155341677368, "train/perplexity": 7.9932773698906665, "train/grad_norm": 0.1044921875, "optim/muon_lr": 0.01491831660270691, "optim/adamw_lr": 0.00044754949808120724, "perf/tokens_per_sec": 2021778.7171444034, "perf/iters_per_sec": 0.964059217998697, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0372806787490845, "data/tokens_consumed": 159092047872, "data/tokens_consumed_B": 159.092047872, "train/loss_slope": -2.6180514193425302e-05} {"step": 75870, "timestamp": 1778276548.2725542, "train/loss": 2.1663329362869264, "train/z_loss": 0.0013931009685620665, "train/perplexity": 8.72622567111324, "train/grad_norm": 0.10888671875, "optim/muon_lr": 0.014884618222713471, "optim/adamw_lr": 0.0004465385466814041, "perf/tokens_per_sec": 2021944.119215654, "perf/iters_per_sec": 0.964138087852313, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0371958255767821, "data/tokens_consumed": 159113019392, "data/tokens_consumed_B": 159.113019392, "train/loss_slope": -2.0315917805083152e-05} {"step": 75880, "timestamp": 1778276558.6492379, "train/loss": 2.1257964849472044, "train/z_loss": 0.0013856489444151522, "train/perplexity": 8.379569031480276, "train/grad_norm": 0.09716796875, "optim/muon_lr": 0.01485084593296051, "optim/adamw_lr": 0.00044552537798881527, "perf/tokens_per_sec": 2022054.8361421882, "perf/iters_per_sec": 0.9641908817969266, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0371390342712403, "data/tokens_consumed": 159133990912, "data/tokens_consumed_B": 159.133990912, "train/loss_slope": -1.9970150098334315e-05} {"step": 75890, "timestamp": 1778276569.0286577, "train/loss": 2.104461395740509, "train/z_loss": 0.0013884078827686608, "train/perplexity": 8.202683819160987, "train/grad_norm": 0.09716796875, "optim/muon_lr": 0.014817002415657044, "optim/adamw_lr": 0.0004445100724697113, "perf/tokens_per_sec": 2021635.598109253, "perf/iters_per_sec": 0.9639909735246911, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373541116714478, "data/tokens_consumed": 159154962432, "data/tokens_consumed_B": 159.154962432, "train/loss_slope": -2.0421022285830018e-05} {"step": 75900, "timestamp": 1778276579.3966672, "grad/layer_0/attn": 0.002868413459509611, "grad/layer_0/mlp": 0.003292699111625552, "grad/layer_0/attn_mlp_ratio": 0.8711434829461759, "grad/layer_4/attn": 0.0026706161443144083, "grad/layer_4/mlp": 0.0026360524352639914, "grad/layer_4/attn_mlp_ratio": 1.013111881720125, "grad/layer_8/attn": 0.0039452724158763885, "grad/layer_8/mlp": 0.0035820703487843275, "grad/layer_8/attn_mlp_ratio": 1.1013944232211634, "grad/layer_12/attn": 0.0059694345109164715, "grad/layer_12/mlp": 0.006724602542817593, "grad/layer_12/attn_mlp_ratio": 0.8877007056011406, "grad/layer_16/attn": 0.007438679225742817, "grad/layer_16/mlp": 0.004876198247075081, "grad/layer_16/attn_mlp_ratio": 1.5255079256988808, "grad/layer_20/attn": 0.003059675917029381, "grad/layer_20/mlp": 0.005871653091162443, "grad/layer_20/attn_mlp_ratio": 0.5210927514647098, "grad/layer_24/attn": 0.011055159382522106, "grad/layer_24/mlp": 0.010820921510457993, "grad/layer_24/attn_mlp_ratio": 1.0216467488164531, "grad/layer_27/attn": 0.004075102508068085, "grad/layer_27/mlp": 0.010014829225838184, "grad/layer_27/attn_mlp_ratio": 0.4069068354020124} {"step": 75900, "timestamp": 1778276579.9871728, "eos/sharpness": 61.00392341613768, "eos/L0_probe": 1.9403355121612549, "eos/L_plus": 2.254096508026123, "eos/L_minus": 2.2366137504577637, "eos/grad_norm": 0.16952255368232727, "eos/embed_grad_frac": 0.07136772572994232, "eos/time_s": 0.5876193046569824} {"step": 75900, "timestamp": 1778276580.0078695, "train/loss": 2.1105960845947265, "train/z_loss": 0.0014014025451615452, "train/perplexity": 8.25315939985075, "train/grad_norm": 0.1689453125, "optim/muon_lr": 0.014783086478710175, "optim/adamw_lr": 0.0004434925943613052, "perf/tokens_per_sec": 1911106.2587976821, "perf/iters_per_sec": 0.9112864774692927, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.097349762916565, "data/tokens_consumed": 159175933952, "data/tokens_consumed_B": 159.175933952, "train/loss_slope": -1.942543044234773e-05} {"step": 75900, "timestamp": 1778276581.3688295, "geo/rankme_last": 439.4354248046875, "geo/layer_0/stable_rank_q_proj": 18.979219436645508, "geo/layer_0/stable_rank_k_proj": 15.749824523925781, "geo/layer_0/stable_rank_o_proj": 46.40304946899414, "geo/layer_0/stable_rank_gate_proj": 128.26927185058594, "geo/layer_0/stable_rank_down_proj": 56.56575012207031, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06432000547647476, "geo/layer_0/attn_entropy_mean": 6.137115955352783, "geo/layer_0/attn_entropy_std": 0.43415263295173645, "geo/layer_7/stable_rank_q_proj": 42.78614044189453, "geo/layer_7/stable_rank_k_proj": 40.37632751464844, "geo/layer_7/stable_rank_o_proj": 87.88265991210938, "geo/layer_7/stable_rank_gate_proj": 77.06073760986328, "geo/layer_7/stable_rank_down_proj": 139.7699432373047, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4525167942047119, "geo/layer_7/attn_entropy_mean": 4.655631065368652, "geo/layer_7/attn_entropy_std": 0.788671612739563, "geo/layer_14/stable_rank_q_proj": 49.472991943359375, "geo/layer_14/stable_rank_k_proj": 41.66598892211914, "geo/layer_14/stable_rank_o_proj": 43.302486419677734, "geo/layer_14/stable_rank_gate_proj": 70.3176498413086, "geo/layer_14/stable_rank_down_proj": 125.20204162597656, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39925459027290344, "geo/layer_14/attn_entropy_mean": 5.523963451385498, "geo/layer_14/attn_entropy_std": 0.4249885082244873, "geo/layer_21/stable_rank_q_proj": 39.76637268066406, "geo/layer_21/stable_rank_k_proj": 30.13841438293457, "geo/layer_21/stable_rank_o_proj": 68.10079956054688, "geo/layer_21/stable_rank_gate_proj": 63.474918365478516, "geo/layer_21/stable_rank_down_proj": 49.22166061401367, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14868387579917908, "geo/layer_21/attn_entropy_mean": 5.697088241577148, "geo/layer_21/attn_entropy_std": 0.2847951352596283, "geo/layer_27/stable_rank_q_proj": 43.85564041137695, "geo/layer_27/stable_rank_k_proj": 32.31586837768555, "geo/layer_27/stable_rank_o_proj": 115.22359466552734, "geo/layer_27/stable_rank_gate_proj": 77.04712677001953, "geo/layer_27/stable_rank_down_proj": 127.45030975341797, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10560818016529083, "geo/layer_27/attn_entropy_mean": 4.16977596282959, "geo/layer_27/attn_entropy_std": 0.790730357170105, "attnres/final_alpha/block_0": 0.23933301866054535, "attnres/block_norm/0": 1.7787435054779053, "attnres/final_alpha/block_1": 0.0039728740230202675, "attnres/block_norm/1": 48058.796875, "attnres/final_alpha/block_2": 0.009579114615917206, "attnres/block_norm/2": 29093.287109375, "attnres/final_alpha/block_3": 0.011329004541039467, "attnres/block_norm/3": 63005.0, "attnres/final_alpha/block_4": 0.013330616056919098, "attnres/block_norm/4": 15839.05859375, "attnres/final_alpha/block_5": 0.6152657270431519, "attnres/block_norm/5": 6891.33642578125, "attnres/final_alpha/block_6": 0.10718967020511627, "attnres/block_norm/6": 41141.23046875, "geo/tier1_time_s": 1.3573451042175293, "geo/step": 75900.0, "geo/rankme_slope": 0.0002894582637742597} {"step": 75910, "timestamp": 1778276591.7425215, "train/loss": 2.159389066696167, "train/z_loss": 0.0013822654960677027, "train/perplexity": 8.665841789561838, "train/grad_norm": 0.0830078125, "optim/muon_lr": 0.0147490993142128, "optim/adamw_lr": 0.0004424729794263839, "perf/tokens_per_sec": 1787655.3384538807, "perf/iters_per_sec": 0.8524204914349941, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.173129940032959, "data/tokens_consumed": 159196905472, "data/tokens_consumed_B": 159.196905472, "train/loss_slope": -1.5215532185256713e-05} {"step": 75920, "timestamp": 1778276602.1168494, "train/loss": 2.150762844085693, "train/z_loss": 0.001358691428322345, "train/perplexity": 8.591409804166025, "train/grad_norm": 0.115234375, "optim/muon_lr": 0.014715041220188142, "optim/adamw_lr": 0.0004414512366056442, "perf/tokens_per_sec": 2022369.2044268542, "perf/iters_per_sec": 0.9643407842764159, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0369778156280518, "data/tokens_consumed": 159217876992, "data/tokens_consumed_B": 159.217876992, "train/loss_slope": -1.1372687578415972e-05} {"step": 75930, "timestamp": 1778276612.489701, "train/loss": 2.081286668777466, "train/z_loss": 0.0013880767743103205, "train/perplexity": 8.014774643136562, "train/grad_norm": 0.09912109375, "optim/muon_lr": 0.014680912494659424, "optim/adamw_lr": 0.00044042737483978266, "perf/tokens_per_sec": 2022700.6475420457, "perf/iters_per_sec": 0.9644988286695698, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036807894706726, "data/tokens_consumed": 159238848512, "data/tokens_consumed_B": 159.238848512, "train/loss_slope": -1.295209117669663e-05} {"step": 75940, "timestamp": 1778276622.8478725, "train/loss": 2.152353346347809, "train/z_loss": 0.0013937235460616648, "train/perplexity": 8.605085333491566, "train/grad_norm": 0.09228515625, "optim/muon_lr": 0.014646712839603424, "optim/adamw_lr": 0.0004394013851881027, "perf/tokens_per_sec": 2026021.1103381172, "perf/iters_per_sec": 0.9660821487131678, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351086616516114, "data/tokens_consumed": 159259820032, "data/tokens_consumed_B": 159.259820032, "train/loss_slope": -1.3175688360748233e-05} {"step": 75950, "timestamp": 1778276633.184937, "grad/layer_0/attn": 0.0025812333915382624, "grad/layer_0/mlp": 0.002762076910585165, "grad/layer_0/attn_mlp_ratio": 0.934526221262531, "grad/layer_4/attn": 0.002161762909963727, "grad/layer_4/mlp": 0.0026248192880302668, "grad/layer_4/attn_mlp_ratio": 0.8235853940358047, "grad/layer_8/attn": 0.0044757104478776455, "grad/layer_8/mlp": 0.0037235456984490156, "grad/layer_8/attn_mlp_ratio": 1.2020022554152368, "grad/layer_12/attn": 0.004591096192598343, "grad/layer_12/mlp": 0.00708465650677681, "grad/layer_12/attn_mlp_ratio": 0.6480336941393521, "grad/layer_16/attn": 0.004508279729634523, "grad/layer_16/mlp": 0.004556507803499699, "grad/layer_16/attn_mlp_ratio": 0.9894155403904529, "grad/layer_20/attn": 0.004493835847824812, "grad/layer_20/mlp": 0.005653605330735445, "grad/layer_20/attn_mlp_ratio": 0.7948619518784201, "grad/layer_24/attn": 0.005893625319004059, "grad/layer_24/mlp": 0.007983746938407421, "grad/layer_24/attn_mlp_ratio": 0.7382029128242151, "grad/layer_27/attn": 0.00408291956409812, "grad/layer_27/mlp": 0.007486806251108646, "grad/layer_27/attn_mlp_ratio": 0.5453486269874632} {"step": 75950, "timestamp": 1778276633.1994936, "train/loss": 2.106313920021057, "train/z_loss": 0.0013986597303301096, "train/perplexity": 8.217893573971892, "train/grad_norm": 0.1240234375, "optim/muon_lr": 0.014612444639205933, "optim/adamw_lr": 0.00043837333917617796, "perf/tokens_per_sec": 2026949.9040822238, "perf/iters_per_sec": 0.9665250320826644, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346343517303467, "data/tokens_consumed": 159280791552, "data/tokens_consumed_B": 159.280791552, "train/loss_slope": -1.2734320841141798e-05} {"step": 75960, "timestamp": 1778276643.5539079, "train/loss": 2.0943180561065673, "train/z_loss": 0.0013942803256213666, "train/perplexity": 8.119901764290036, "train/grad_norm": 0.10009765625, "optim/muon_lr": 0.014578107595443725, "optim/adamw_lr": 0.00043734322786331175, "perf/tokens_per_sec": 2026587.4639860024, "perf/iters_per_sec": 0.9663522071771633, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348193883895873, "data/tokens_consumed": 159301763072, "data/tokens_consumed_B": 159.301763072, "train/loss_slope": -1.4890579300315965e-05} {"step": 75970, "timestamp": 1778276653.9034905, "train/loss": 2.09989013671875, "train/z_loss": 0.0013913617585785687, "train/perplexity": 8.165272799626598, "train/grad_norm": 0.076171875, "optim/muon_lr": 0.014543702006340028, "optim/adamw_lr": 0.00043631106019020074, "perf/tokens_per_sec": 2028173.0039580774, "perf/iters_per_sec": 0.967108251551665, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034010410308838, "data/tokens_consumed": 159322734592, "data/tokens_consumed_B": 159.322734592, "train/loss_slope": -1.3920558568346216e-05} {"step": 75975, "timestamp": 1778276659.671679, "eos/sharpness": 43.535304069519036, "eos/L0_probe": 1.9355700016021729, "eos/L_plus": 2.143671989440918, "eos/L_minus": 2.162821054458618, "eos/grad_norm": 0.10837563872337341, "eos/embed_grad_frac": 0.17445243895053864, "eos/time_s": 0.5976982116699219} {"step": 75975, "timestamp": 1778276661.0521185, "geo/rankme_last": 439.73577880859375, "geo/layer_0/stable_rank_q_proj": 18.975908279418945, "geo/layer_0/stable_rank_k_proj": 15.767411231994629, "geo/layer_0/stable_rank_o_proj": 46.4144172668457, "geo/layer_0/stable_rank_gate_proj": 128.224365234375, "geo/layer_0/stable_rank_down_proj": 56.65068817138672, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06495301425457001, "geo/layer_0/attn_entropy_mean": 6.137945175170898, "geo/layer_0/attn_entropy_std": 0.4300660192966461, "geo/layer_7/stable_rank_q_proj": 42.74054718017578, "geo/layer_7/stable_rank_k_proj": 40.377830505371094, "geo/layer_7/stable_rank_o_proj": 87.95426940917969, "geo/layer_7/stable_rank_gate_proj": 77.04814910888672, "geo/layer_7/stable_rank_down_proj": 139.79513549804688, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.43041297793388367, "geo/layer_7/attn_entropy_mean": 4.6462178230285645, "geo/layer_7/attn_entropy_std": 0.7794928550720215, "geo/layer_14/stable_rank_q_proj": 49.461647033691406, "geo/layer_14/stable_rank_k_proj": 41.61332702636719, "geo/layer_14/stable_rank_o_proj": 43.29295349121094, "geo/layer_14/stable_rank_gate_proj": 70.31534576416016, "geo/layer_14/stable_rank_down_proj": 125.04238891601562, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.40007516741752625, "geo/layer_14/attn_entropy_mean": 5.531184196472168, "geo/layer_14/attn_entropy_std": 0.4151177406311035, "geo/layer_21/stable_rank_q_proj": 39.766273498535156, "geo/layer_21/stable_rank_k_proj": 30.10443878173828, "geo/layer_21/stable_rank_o_proj": 68.07244873046875, "geo/layer_21/stable_rank_gate_proj": 63.41569519042969, "geo/layer_21/stable_rank_down_proj": 49.208648681640625, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1428251713514328, "geo/layer_21/attn_entropy_mean": 5.688487529754639, "geo/layer_21/attn_entropy_std": 0.29438516497612, "geo/layer_27/stable_rank_q_proj": 43.86216354370117, "geo/layer_27/stable_rank_k_proj": 32.30705261230469, "geo/layer_27/stable_rank_o_proj": 115.25978088378906, "geo/layer_27/stable_rank_gate_proj": 76.96361541748047, "geo/layer_27/stable_rank_down_proj": 127.5750503540039, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0962054431438446, "geo/layer_27/attn_entropy_mean": 4.152270793914795, "geo/layer_27/attn_entropy_std": 0.8023955821990967, "attnres/final_alpha/block_0": 0.2401704639196396, "attnres/block_norm/0": 1.7788572311401367, "attnres/final_alpha/block_1": 0.0039908443577587605, "attnres/block_norm/1": 48005.28515625, "attnres/final_alpha/block_2": 0.009623069316148758, "attnres/block_norm/2": 29028.640625, "attnres/final_alpha/block_3": 0.011361338198184967, "attnres/block_norm/3": 62821.4453125, "attnres/final_alpha/block_4": 0.013357867486774921, "attnres/block_norm/4": 15783.806640625, "attnres/final_alpha/block_5": 0.6140031814575195, "attnres/block_norm/5": 6891.5166015625, "attnres/final_alpha/block_6": 0.10749328136444092, "attnres/block_norm/6": 41113.46875, "geo/tier1_time_s": 1.3617198467254639, "geo/step": 75975.0, "geo/rankme_slope": 0.0002987220474127151} {"step": 75980, "timestamp": 1778276666.2361746, "train/loss": 2.0727904558181764, "train/z_loss": 0.0013947955914773047, "train/perplexity": 7.946967869232441, "train/grad_norm": 0.08544921875, "optim/muon_lr": 0.014509228765964508, "optim/adamw_lr": 0.0004352768629789352, "perf/tokens_per_sec": 1701414.476781122, "perf/iters_per_sec": 0.811297644033967, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2325932502746582, "data/tokens_consumed": 159343706112, "data/tokens_consumed_B": 159.343706112, "train/loss_slope": -1.6290844606273112e-05} {"step": 75990, "timestamp": 1778276676.5877671, "train/loss": 2.1301294803619384, "train/z_loss": 0.0013852713746018708, "train/perplexity": 8.415956441981894, "train/grad_norm": 0.11474609375, "optim/muon_lr": 0.014474687874317169, "optim/adamw_lr": 0.00043424063622951505, "perf/tokens_per_sec": 2027397.56408039, "perf/iters_per_sec": 0.9667384930040311, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344058990478515, "data/tokens_consumed": 159364677632, "data/tokens_consumed_B": 159.364677632, "train/loss_slope": -1.3539061001246349e-05} {"step": 76000, "timestamp": 1778276686.9184184, "grad/layer_0/attn": 0.003429821226745844, "grad/layer_0/mlp": 0.003479361766949296, "grad/layer_0/attn_mlp_ratio": 0.9857615729269088, "grad/layer_4/attn": 0.0025789709761738777, "grad/layer_4/mlp": 0.002540419576689601, "grad/layer_4/attn_mlp_ratio": 1.015175169613909, "grad/layer_8/attn": 0.005464669782668352, "grad/layer_8/mlp": 0.003339160233736038, "grad/layer_8/attn_mlp_ratio": 1.6365400988559826, "grad/layer_12/attn": 0.004650987684726715, "grad/layer_12/mlp": 0.006852325517684221, "grad/layer_12/attn_mlp_ratio": 0.6787458658887463, "grad/layer_16/attn": 0.003933275118470192, "grad/layer_16/mlp": 0.004583990201354027, "grad/layer_16/attn_mlp_ratio": 0.8580461257320668, "grad/layer_20/attn": 0.002984782448038459, "grad/layer_20/mlp": 0.00521327368915081, "grad/layer_20/attn_mlp_ratio": 0.5725351417855707, "grad/layer_24/attn": 0.007305924315005541, "grad/layer_24/mlp": 0.006837441120296717, "grad/layer_24/attn_mlp_ratio": 1.0685173121954084, "grad/layer_27/attn": 0.0036493672523647547, "grad/layer_27/mlp": 0.006144712679088116, "grad/layer_27/attn_mlp_ratio": 0.5939036344846579} {"step": 76000, "timestamp": 1778276686.9327993, "train/loss": 2.0725224375724793, "train/z_loss": 0.0013845094596035778, "train/perplexity": 7.9448382222503895, "train/grad_norm": 0.0966796875, "optim/muon_lr": 0.014440079331398011, "optim/adamw_lr": 0.00043320237994194026, "perf/tokens_per_sec": 2028316.0679345876, "perf/iters_per_sec": 0.9671764697716654, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0339374780654906, "data/tokens_consumed": 159385649152, "data/tokens_consumed_B": 159.385649152, "train/loss_slope": -1.3642809829517382e-05} {"step": 76000, "timestamp": 1778276694.1940897, "geo/ww_alpha_mean": 7.536949799831524, "geo/ww_alpha_std": 4.31451521746891, "geo/ww_alpha_min": 1.3302380437706516, "geo/ww_alpha_max": 24.723460520156273, "geo/ww_alpha_healthy_frac": 0.17258883248730963, "geo/ww_alpha_by_type/q_proj": 3.9272322655807943, "geo/ww_alpha_by_type/k_proj": 4.541749999173939, "geo/ww_alpha_by_type/v_proj": 8.060996542790532, "geo/ww_alpha_by_type/o_proj": 9.11848136870862, "geo/ww_alpha_by_type/gate_proj": 7.946191976071003, "geo/ww_alpha_by_type/up_proj": 11.312712190136661, "geo/ww_alpha_by_type/down_proj": 7.94100794998945, "geo/twonn_id/layer_0": 0.655661404132843, "geo/twonn_id/layer_7": 3.273960828781128, "geo/twonn_id/layer_14": 4.431028842926025, "geo/twonn_id/layer_21": 6.738846302032471, "geo/twonn_id/layer_27": 5.698905944824219, "geo/tier2_time_s": 7.252876281738281} {"step": 76000, "timestamp": 1778276694.861881, "eoc/jacobian_sigma/layer_0/attn": 1367.961669921875, "eoc/jacobian_sigma/layer_0/mlp": 9150.1962890625, "eoc/jacobian_sigma/layer_0": 9150.1962890625, "eoc/jacobian_sigma/layer_7/attn": 1.1695059537887573, "eoc/jacobian_sigma/layer_7/mlp": 1.8995826244354248, "eoc/jacobian_sigma/layer_7": 1.8995826244354248, "eoc/jacobian_sigma/layer_14/attn": 1.3956266641616821, "eoc/jacobian_sigma/layer_14/mlp": 5.626201629638672, "eoc/jacobian_sigma/layer_14": 5.626201629638672, "eoc/jacobian_sigma/layer_21/attn": 1.090417742729187, "eoc/jacobian_sigma/layer_21/mlp": 4.548619270324707, "eoc/jacobian_sigma/layer_21": 4.548619270324707, "eoc/jacobian_sigma/layer_27/attn": 2.8091490268707275, "eoc/jacobian_sigma/layer_27/mlp": 29.333580017089844, "eoc/jacobian_sigma/layer_27": 29.333580017089844, "eoc/layer0_sigma": 9150.1962890625, "eoc/sigma_max": 29.333580017089844, "eoc/sigma_min": 1.8995826244354248, "eoc/sigma_mean": 10.351995885372162, "eoc/time_s": 0.6611740589141846} {"step": 76010, "timestamp": 1778276705.226964, "train/loss": 2.148895728588104, "train/z_loss": 0.001393780461512506, "train/perplexity": 8.575383615802627, "train/grad_norm": 0.10205078125, "optim/muon_lr": 0.014405405223369598, "optim/adamw_lr": 0.0004321621567010879, "perf/tokens_per_sec": 1146802.8530901629, "perf/iters_per_sec": 0.5468382134867491, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.8286944389343263, "data/tokens_consumed": 159406620672, "data/tokens_consumed_B": 159.406620672, "train/loss_slope": -1.2013380011745417e-05} {"step": 76020, "timestamp": 1778276715.570295, "train/loss": 2.081221652030945, "train/z_loss": 0.0014011149061843753, "train/perplexity": 8.014253565504738, "train/grad_norm": 0.220703125, "optim/muon_lr": 0.01437066525220871, "optim/adamw_lr": 0.00043111995756626125, "perf/tokens_per_sec": 2028749.8729641447, "perf/iters_per_sec": 0.9673833241291736, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0337163925170898, "data/tokens_consumed": 159427592192, "data/tokens_consumed_B": 159.427592192, "train/loss_slope": -1.394964263062205e-05} {"step": 76030, "timestamp": 1778276725.9097726, "train/loss": 2.139476180076599, "train/z_loss": 0.0013860960491001606, "train/perplexity": 8.494986619983756, "train/grad_norm": 0.125, "optim/muon_lr": 0.014335860013961792, "optim/adamw_lr": 0.00043007580041885373, "perf/tokens_per_sec": 2029359.562537828, "perf/iters_per_sec": 0.9676740467728748, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0334058284759522, "data/tokens_consumed": 159448563712, "data/tokens_consumed_B": 159.448563712, "train/loss_slope": -1.0630034599224109e-05} {"step": 76040, "timestamp": 1778276736.25265, "train/loss": 2.1098448872566222, "train/z_loss": 0.0014030023710802197, "train/perplexity": 8.24696197651402, "train/grad_norm": 0.09521484375, "optim/muon_lr": 0.014300990104675294, "optim/adamw_lr": 0.00042902970314025873, "perf/tokens_per_sec": 2028761.3369524982, "perf/iters_per_sec": 0.9673887905848018, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0337105512619018, "data/tokens_consumed": 159469535232, "data/tokens_consumed_B": 159.469535232, "train/loss_slope": -1.0478767746388316e-05} {"step": 76050, "timestamp": 1778276746.5844955, "grad/layer_0/attn": 0.0027402311097830534, "grad/layer_0/mlp": 0.003069726750254631, "grad/layer_0/attn_mlp_ratio": 0.8926628470398771, "grad/layer_4/attn": 0.0021974199917167425, "grad/layer_4/mlp": 0.0025707315653562546, "grad/layer_4/attn_mlp_ratio": 0.8547838816978286, "grad/layer_8/attn": 0.005264573264867067, "grad/layer_8/mlp": 0.003702316200360656, "grad/layer_8/attn_mlp_ratio": 1.4219674489600542, "grad/layer_12/attn": 0.003947065677493811, "grad/layer_12/mlp": 0.00660555437207222, "grad/layer_12/attn_mlp_ratio": 0.5975373746718317, "grad/layer_16/attn": 0.003869003849104047, "grad/layer_16/mlp": 0.004584613721817732, "grad/layer_16/attn_mlp_ratio": 0.8439105232139366, "grad/layer_20/attn": 0.0027811217587441206, "grad/layer_20/mlp": 0.005338977091014385, "grad/layer_20/attn_mlp_ratio": 0.5209090916186736, "grad/layer_24/attn": 0.006781090050935745, "grad/layer_24/mlp": 0.008317987434566021, "grad/layer_24/attn_mlp_ratio": 0.8152320525555511, "grad/layer_27/attn": 0.003942909650504589, "grad/layer_27/mlp": 0.007734715472906828, "grad/layer_27/attn_mlp_ratio": 0.50976789170061} {"step": 76050, "timestamp": 1778276747.202091, "eos/sharpness": 44.62335109710693, "eos/L0_probe": 1.938925862312317, "eos/L_plus": 2.172528028488159, "eos/L_minus": 2.151557207107544, "eos/grad_norm": 0.10523491352796555, "eos/embed_grad_frac": 0.2014465034008026, "eos/time_s": 0.6146364212036133} {"step": 76050, "timestamp": 1778276747.222215, "train/loss": 2.115679943561554, "train/z_loss": 0.0013942619087174534, "train/perplexity": 8.295224133257333, "train/grad_norm": 0.10498046875, "optim/muon_lr": 0.014266055822372437, "optim/adamw_lr": 0.00042798167467117306, "perf/tokens_per_sec": 1912862.971959152, "perf/iters_per_sec": 0.912124143580986, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0963419914245605, "data/tokens_consumed": 159490506752, "data/tokens_consumed_B": 159.490506752, "train/loss_slope": -9.732445445891962e-06} {"step": 76050, "timestamp": 1778276748.594048, "geo/rankme_last": 439.59637451171875, "geo/layer_0/stable_rank_q_proj": 18.953432083129883, "geo/layer_0/stable_rank_k_proj": 15.754679679870605, "geo/layer_0/stable_rank_o_proj": 46.415618896484375, "geo/layer_0/stable_rank_gate_proj": 128.18365478515625, "geo/layer_0/stable_rank_down_proj": 56.66278839111328, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06405618786811829, "geo/layer_0/attn_entropy_mean": 6.137862205505371, "geo/layer_0/attn_entropy_std": 0.4291561543941498, "geo/layer_7/stable_rank_q_proj": 42.70941162109375, "geo/layer_7/stable_rank_k_proj": 40.39198303222656, "geo/layer_7/stable_rank_o_proj": 88.01449584960938, "geo/layer_7/stable_rank_gate_proj": 77.02717590332031, "geo/layer_7/stable_rank_down_proj": 139.9021759033203, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4242217242717743, "geo/layer_7/attn_entropy_mean": 4.634049415588379, "geo/layer_7/attn_entropy_std": 0.7754582166671753, "geo/layer_14/stable_rank_q_proj": 49.51092529296875, "geo/layer_14/stable_rank_k_proj": 41.640892028808594, "geo/layer_14/stable_rank_o_proj": 43.28998947143555, "geo/layer_14/stable_rank_gate_proj": 70.28072357177734, "geo/layer_14/stable_rank_down_proj": 124.88697052001953, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3970862329006195, "geo/layer_14/attn_entropy_mean": 5.5128631591796875, "geo/layer_14/attn_entropy_std": 0.41563719511032104, "geo/layer_21/stable_rank_q_proj": 39.738590240478516, "geo/layer_21/stable_rank_k_proj": 30.10883331298828, "geo/layer_21/stable_rank_o_proj": 68.0733642578125, "geo/layer_21/stable_rank_gate_proj": 63.35791778564453, "geo/layer_21/stable_rank_down_proj": 49.22651290893555, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14575837552547455, "geo/layer_21/attn_entropy_mean": 5.6843414306640625, "geo/layer_21/attn_entropy_std": 0.30162179470062256, "geo/layer_27/stable_rank_q_proj": 43.878814697265625, "geo/layer_27/stable_rank_k_proj": 32.29487228393555, "geo/layer_27/stable_rank_o_proj": 115.36237335205078, "geo/layer_27/stable_rank_gate_proj": 76.90782928466797, "geo/layer_27/stable_rank_down_proj": 127.713134765625, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0953511893749237, "geo/layer_27/attn_entropy_mean": 4.154760360717773, "geo/layer_27/attn_entropy_std": 0.7835590839385986, "attnres/final_alpha/block_0": 0.24050016701221466, "attnres/block_norm/0": 1.7788463830947876, "attnres/final_alpha/block_1": 0.003911254461854696, "attnres/block_norm/1": 48067.5703125, "attnres/final_alpha/block_2": 0.009870380163192749, "attnres/block_norm/2": 29124.4765625, "attnres/final_alpha/block_3": 0.011332469061017036, "attnres/block_norm/3": 62979.0546875, "attnres/final_alpha/block_4": 0.013481960631906986, "attnres/block_norm/4": 15797.138671875, "attnres/final_alpha/block_5": 0.6148320436477661, "attnres/block_norm/5": 6839.470703125, "attnres/final_alpha/block_6": 0.10607172548770905, "attnres/block_norm/6": 41354.078125, "geo/tier1_time_s": 1.3684487342834473, "geo/step": 76050.0, "geo/rankme_slope": 0.00030524340595613244} {"step": 76060, "timestamp": 1778276758.948752, "train/loss": 2.1123551726341248, "train/z_loss": 0.0013896491029299796, "train/perplexity": 8.267690210578952, "train/grad_norm": 0.11328125, "optim/muon_lr": 0.014231056571006775, "optim/adamw_lr": 0.0004269316971302032, "perf/tokens_per_sec": 1788869.2918407903, "perf/iters_per_sec": 0.8529993495181991, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.172333836555481, "data/tokens_consumed": 159511478272, "data/tokens_consumed_B": 159.511478272, "train/loss_slope": -8.491926990111912e-06} {"step": 76070, "timestamp": 1778276769.291755, "train/loss": 2.085342252254486, "train/z_loss": 0.0013903090148232877, "train/perplexity": 8.047345232480547, "train/grad_norm": 0.09130859375, "optim/muon_lr": 0.014195995330810547, "optim/adamw_lr": 0.00042587985992431637, "perf/tokens_per_sec": 2029053.9230620693, "perf/iters_per_sec": 0.9675283065138194, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0335614919662475, "data/tokens_consumed": 159532449792, "data/tokens_consumed_B": 159.532449792, "train/loss_slope": -1.3213453427328247e-05} {"step": 76080, "timestamp": 1778276779.6881635, "train/loss": 2.0618974208831786, "train/z_loss": 0.001396882685367018, "train/perplexity": 7.860871049776175, "train/grad_norm": 0.1416015625, "optim/muon_lr": 0.014160871505737305, "optim/adamw_lr": 0.0004248261451721191, "perf/tokens_per_sec": 2018739.3817825203, "perf/iters_per_sec": 0.9626099499619104, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0388423681259156, "data/tokens_consumed": 159553421312, "data/tokens_consumed_B": 159.553421312, "train/loss_slope": -1.4799610001168918e-05} {"step": 76090, "timestamp": 1778276790.0640776, "train/loss": 2.090784466266632, "train/z_loss": 0.001396619901061058, "train/perplexity": 8.09125999584782, "train/grad_norm": 0.11181640625, "optim/muon_lr": 0.014125685095787049, "optim/adamw_lr": 0.00042377055287361144, "perf/tokens_per_sec": 2022373.3892246804, "perf/iters_per_sec": 0.964342779743519, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0369756698608399, "data/tokens_consumed": 159574392832, "data/tokens_consumed_B": 159.574392832, "train/loss_slope": -1.4667015071391674e-05} {"step": 76100, "timestamp": 1778276800.4069057, "grad/layer_0/attn": 0.0024954969994723797, "grad/layer_0/mlp": 0.0029618495609611273, "grad/layer_0/attn_mlp_ratio": 0.842546815378396, "grad/layer_4/attn": 0.0032659517601132393, "grad/layer_4/mlp": 0.0024557027500122786, "grad/layer_4/attn_mlp_ratio": 1.329945827972187, "grad/layer_8/attn": 0.004901912063360214, "grad/layer_8/mlp": 0.0035266305785626173, "grad/layer_8/attn_mlp_ratio": 1.3899703456779697, "grad/layer_12/attn": 0.004070805851370096, "grad/layer_12/mlp": 0.006369401700794697, "grad/layer_12/attn_mlp_ratio": 0.6391190222702218, "grad/layer_16/attn": 0.00361596024595201, "grad/layer_16/mlp": 0.004341351334005594, "grad/layer_16/attn_mlp_ratio": 0.8329112030941265, "grad/layer_20/attn": 0.0060766772367060184, "grad/layer_20/mlp": 0.005203864071518183, "grad/layer_20/attn_mlp_ratio": 1.167724028994631, "grad/layer_24/attn": 0.010294734500348568, "grad/layer_24/mlp": 0.007045971695333719, "grad/layer_24/attn_mlp_ratio": 1.4610808557545436, "grad/layer_27/attn": 0.005575182847678661, "grad/layer_27/mlp": 0.005704584065824747, "grad/layer_27/attn_mlp_ratio": 0.9773162575247276} {"step": 76100, "timestamp": 1778276800.4213634, "train/loss": 2.0884047150611877, "train/z_loss": 0.001410037314053625, "train/perplexity": 8.07202770323129, "train/grad_norm": 0.08544921875, "optim/muon_lr": 0.014090437293052673, "optim/adamw_lr": 0.00042271311879158016, "perf/tokens_per_sec": 2025837.8705675104, "perf/iters_per_sec": 0.9659947731816818, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352022886276244, "data/tokens_consumed": 159595364352, "data/tokens_consumed_B": 159.595364352, "train/loss_slope": -1.4061441301333837e-05} {"step": 76110, "timestamp": 1778276810.770883, "train/loss": 2.1483240365982055, "train/z_loss": 0.0013940758653916419, "train/perplexity": 8.570482538765892, "train/grad_norm": 0.103515625, "optim/muon_lr": 0.014055128395557405, "optim/adamw_lr": 0.0004216538518667221, "perf/tokens_per_sec": 2027632.3585974206, "perf/iters_per_sec": 0.9668504517542937, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034286117553711, "data/tokens_consumed": 159616335872, "data/tokens_consumed_B": 159.616335872, "train/loss_slope": -1.3711574933852975e-05} {"step": 76120, "timestamp": 1778276821.1266418, "train/loss": 2.0803243279457093, "train/z_loss": 0.0013944294070824981, "train/perplexity": 8.007065408290842, "train/grad_norm": 0.13671875, "optim/muon_lr": 0.014019757509231567, "optim/adamw_lr": 0.00042059272527694697, "perf/tokens_per_sec": 2026737.0756516315, "perf/iters_per_sec": 0.9664235475786359, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347429990768433, "data/tokens_consumed": 159637307392, "data/tokens_consumed_B": 159.637307392, "train/loss_slope": -1.4637008451535635e-05} {"step": 76125, "timestamp": 1778276826.9260004, "eos/sharpness": 37.927627563476555, "eos/L0_probe": 1.9348653554916382, "eos/L_plus": 2.154085874557495, "eos/L_minus": 2.094921112060547, "eos/grad_norm": 0.09790254384279251, "eos/embed_grad_frac": 0.21217723190784454, "eos/time_s": 0.6276905536651611} {"step": 76125, "timestamp": 1778276828.3117242, "geo/rankme_last": 439.37274169921875, "geo/layer_0/stable_rank_q_proj": 18.94141387939453, "geo/layer_0/stable_rank_k_proj": 15.768802642822266, "geo/layer_0/stable_rank_o_proj": 46.43342971801758, "geo/layer_0/stable_rank_gate_proj": 128.08111572265625, "geo/layer_0/stable_rank_down_proj": 56.743247985839844, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06396735459566116, "geo/layer_0/attn_entropy_mean": 6.13534688949585, "geo/layer_0/attn_entropy_std": 0.4320555329322815, "geo/layer_7/stable_rank_q_proj": 42.71149826049805, "geo/layer_7/stable_rank_k_proj": 40.38035583496094, "geo/layer_7/stable_rank_o_proj": 88.03003692626953, "geo/layer_7/stable_rank_gate_proj": 76.9256362915039, "geo/layer_7/stable_rank_down_proj": 139.91990661621094, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4320906698703766, "geo/layer_7/attn_entropy_mean": 4.637127876281738, "geo/layer_7/attn_entropy_std": 0.7839353084564209, "geo/layer_14/stable_rank_q_proj": 49.51592254638672, "geo/layer_14/stable_rank_k_proj": 41.710472106933594, "geo/layer_14/stable_rank_o_proj": 43.2535400390625, "geo/layer_14/stable_rank_gate_proj": 70.26351165771484, "geo/layer_14/stable_rank_down_proj": 124.8637924194336, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.40956997871398926, "geo/layer_14/attn_entropy_mean": 5.520352363586426, "geo/layer_14/attn_entropy_std": 0.40977156162261963, "geo/layer_21/stable_rank_q_proj": 39.7200927734375, "geo/layer_21/stable_rank_k_proj": 30.09075927734375, "geo/layer_21/stable_rank_o_proj": 68.04225158691406, "geo/layer_21/stable_rank_gate_proj": 63.38951873779297, "geo/layer_21/stable_rank_down_proj": 49.233482360839844, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1492646038532257, "geo/layer_21/attn_entropy_mean": 5.680472373962402, "geo/layer_21/attn_entropy_std": 0.29948240518569946, "geo/layer_27/stable_rank_q_proj": 43.90668487548828, "geo/layer_27/stable_rank_k_proj": 32.315860748291016, "geo/layer_27/stable_rank_o_proj": 115.42205047607422, "geo/layer_27/stable_rank_gate_proj": 76.91571044921875, "geo/layer_27/stable_rank_down_proj": 127.52143096923828, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08998876065015793, "geo/layer_27/attn_entropy_mean": 4.167600631713867, "geo/layer_27/attn_entropy_std": 0.7897884845733643, "attnres/final_alpha/block_0": 0.23961478471755981, "attnres/block_norm/0": 1.7789520025253296, "attnres/final_alpha/block_1": 0.00396952498704195, "attnres/block_norm/1": 47928.734375, "attnres/final_alpha/block_2": 0.009573872201144695, "attnres/block_norm/2": 29137.015625, "attnres/final_alpha/block_3": 0.011136852204799652, "attnres/block_norm/3": 63143.26171875, "attnres/final_alpha/block_4": 0.013022428378462791, "attnres/block_norm/4": 15785.55078125, "attnres/final_alpha/block_5": 0.616847038269043, "attnres/block_norm/5": 6808.6044921875, "attnres/final_alpha/block_6": 0.1058354601264, "attnres/block_norm/6": 41199.265625, "geo/tier1_time_s": 1.3654265403747559, "geo/step": 76125.0, "geo/rankme_slope": 0.0003648848992722089} {"step": 76130, "timestamp": 1778276833.4933124, "train/loss": 2.0842674016952514, "train/z_loss": 0.0013873987831175328, "train/perplexity": 8.038700185855966, "train/grad_norm": 0.07568359375, "optim/muon_lr": 0.013984327912330627, "optim/adamw_lr": 0.00041952983736991877, "perf/tokens_per_sec": 1696767.3841023296, "perf/iters_per_sec": 0.8090817375671051, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.235969066619873, "data/tokens_consumed": 159658278912, "data/tokens_consumed_B": 159.658278912, "train/loss_slope": -1.6440695061041324e-05} {"step": 76140, "timestamp": 1778276843.8413043, "train/loss": 2.0693082094192503, "train/z_loss": 0.0013904267805628478, "train/perplexity": 7.919342695733217, "train/grad_norm": 0.09375, "optim/muon_lr": 0.013948838710784912, "optim/adamw_lr": 0.0004184651613235473, "perf/tokens_per_sec": 2028168.046888831, "perf/iters_per_sec": 0.9671058878368526, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340129375457763, "data/tokens_consumed": 159679250432, "data/tokens_consumed_B": 159.679250432, "train/loss_slope": -1.917981896141507e-05} {"step": 76150, "timestamp": 1778276854.1731484, "grad/layer_0/attn": 0.002326027490198612, "grad/layer_0/mlp": 0.00280087161809206, "grad/layer_0/attn_mlp_ratio": 0.8304655565529043, "grad/layer_4/attn": 0.0019660077523440123, "grad/layer_4/mlp": 0.0024997154250741005, "grad/layer_4/attn_mlp_ratio": 0.7864925959067813, "grad/layer_8/attn": 0.003759541315957904, "grad/layer_8/mlp": 0.0035958318039774895, "grad/layer_8/attn_mlp_ratio": 1.0455275486596938, "grad/layer_12/attn": 0.004506009630858898, "grad/layer_12/mlp": 0.006495400797575712, "grad/layer_12/attn_mlp_ratio": 0.6937230976059819, "grad/layer_16/attn": 0.0043952614068984985, "grad/layer_16/mlp": 0.004535493906587362, "grad/layer_16/attn_mlp_ratio": 0.9690810748542079, "grad/layer_20/attn": 0.002896522404626012, "grad/layer_20/mlp": 0.005392241757363081, "grad/layer_20/attn_mlp_ratio": 0.5371647788147381, "grad/layer_24/attn": 0.006548297591507435, "grad/layer_24/mlp": 0.007739074062556028, "grad/layer_24/attn_mlp_ratio": 0.8461344928299138, "grad/layer_27/attn": 0.003582874545827508, "grad/layer_27/mlp": 0.007113149389624596, "grad/layer_27/attn_mlp_ratio": 0.5036973496836489} {"step": 76150, "timestamp": 1778276854.1875, "train/loss": 2.146988129615784, "train/z_loss": 0.0013852552161552011, "train/perplexity": 8.559040815540225, "train/grad_norm": 0.1015625, "optim/muon_lr": 0.013913290500640869, "optim/adamw_lr": 0.00041739871501922604, "perf/tokens_per_sec": 2027919.80353824, "perf/iters_per_sec": 0.9669875161830139, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341395139694214, "data/tokens_consumed": 159700221952, "data/tokens_consumed_B": 159.700221952, "train/loss_slope": -1.813169796832362e-05} {"step": 76160, "timestamp": 1778276864.5704541, "train/loss": 2.1679381370544433, "train/z_loss": 0.0013887979555875063, "train/perplexity": 8.74024426357562, "train/grad_norm": 0.10205078125, "optim/muon_lr": 0.013877683579921724, "optim/adamw_lr": 0.00041633050739765164, "perf/tokens_per_sec": 2021425.6494158795, "perf/iters_per_sec": 0.9638908621863744, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0374618530273438, "data/tokens_consumed": 159721193472, "data/tokens_consumed_B": 159.721193472, "train/loss_slope": -1.4084356645904039e-05} {"step": 76170, "timestamp": 1778276874.949969, "train/loss": 2.062554407119751, "train/z_loss": 0.0014048932353034616, "train/perplexity": 7.866037230732415, "train/grad_norm": 0.1533203125, "optim/muon_lr": 0.013842018842697144, "optim/adamw_lr": 0.0004152605652809143, "perf/tokens_per_sec": 2021676.7659894128, "perf/iters_per_sec": 0.9640106038996757, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373329877853394, "data/tokens_consumed": 159742164992, "data/tokens_consumed_B": 159.742164992, "train/loss_slope": -1.7052352553618595e-05} {"step": 76180, "timestamp": 1778276885.3288631, "train/loss": 2.1416378021240234, "train/z_loss": 0.001401581394020468, "train/perplexity": 8.513369431541491, "train/grad_norm": 0.08447265625, "optim/muon_lr": 0.013806295692920686, "optim/adamw_lr": 0.0004141888707876205, "perf/tokens_per_sec": 2021706.9692250772, "perf/iters_per_sec": 0.9640250059247385, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373174905776978, "data/tokens_consumed": 159763136512, "data/tokens_consumed_B": 159.763136512, "train/loss_slope": -1.4550947441984694e-05} {"step": 76190, "timestamp": 1778276895.7087657, "train/loss": 2.091918933391571, "train/z_loss": 0.0013986360165290535, "train/perplexity": 8.100444473071128, "train/grad_norm": 0.0859375, "optim/muon_lr": 0.013770516812801361, "optim/adamw_lr": 0.0004131155043840408, "perf/tokens_per_sec": 2021453.708190294, "perf/iters_per_sec": 0.9639042416526289, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037447452545166, "data/tokens_consumed": 159784108032, "data/tokens_consumed_B": 159.784108032, "train/loss_slope": -1.4207196078284795e-05} {"step": 76200, "timestamp": 1778276906.0760572, "grad/layer_0/attn": 0.002440504729747772, "grad/layer_0/mlp": 0.0028199171647429466, "grad/layer_0/attn_mlp_ratio": 0.8654526004223885, "grad/layer_4/attn": 0.0019322270527482033, "grad/layer_4/mlp": 0.002617466961964965, "grad/layer_4/attn_mlp_ratio": 0.7382049160525659, "grad/layer_8/attn": 0.0050989422015845776, "grad/layer_8/mlp": 0.003631953150033951, "grad/layer_8/attn_mlp_ratio": 1.4039118486827753, "grad/layer_12/attn": 0.005234864540398121, "grad/layer_12/mlp": 0.006335352081805468, "grad/layer_12/attn_mlp_ratio": 0.8262941648977553, "grad/layer_16/attn": 0.004226068500429392, "grad/layer_16/mlp": 0.004288075026124716, "grad/layer_16/attn_mlp_ratio": 0.9855397529493936, "grad/layer_20/attn": 0.004630622453987598, "grad/layer_20/mlp": 0.005001381505280733, "grad/layer_20/attn_mlp_ratio": 0.9258686537932506, "grad/layer_24/attn": 0.004074468277394772, "grad/layer_24/mlp": 0.006953340955078602, "grad/layer_24/attn_mlp_ratio": 0.5859727352822496, "grad/layer_27/attn": 0.004253928083926439, "grad/layer_27/mlp": 0.006025217939168215, "grad/layer_27/attn_mlp_ratio": 0.7060206047769347} {"step": 76200, "timestamp": 1778276906.6937656, "eos/sharpness": 4.6091318130493155, "eos/L0_probe": 1.935669183731079, "eos/L_plus": 1.9598267078399658, "eos/L_minus": 1.9576029777526855, "eos/grad_norm": 0.08097493648529053, "eos/embed_grad_frac": 0.3055128753185272, "eos/time_s": 0.6144800186157227} {"step": 76200, "timestamp": 1778276906.713222, "train/loss": 2.1048373937606812, "train/z_loss": 0.001396817690692842, "train/perplexity": 8.205768591934977, "train/grad_norm": 0.0810546875, "optim/muon_lr": 0.0137346813082695, "optim/adamw_lr": 0.00041204043924808497, "perf/tokens_per_sec": 1906593.3101755646, "perf/iters_per_sec": 0.9091345358732055, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.099947214126587, "data/tokens_consumed": 159805079552, "data/tokens_consumed_B": 159.805079552, "train/loss_slope": -1.3498439661490164e-05} {"step": 76200, "timestamp": 1778276908.0806074, "geo/rankme_last": 438.919189453125, "geo/layer_0/stable_rank_q_proj": 18.92908477783203, "geo/layer_0/stable_rank_k_proj": 15.785067558288574, "geo/layer_0/stable_rank_o_proj": 46.46291732788086, "geo/layer_0/stable_rank_gate_proj": 127.93852233886719, "geo/layer_0/stable_rank_down_proj": 56.69314956665039, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06250600516796112, "geo/layer_0/attn_entropy_mean": 6.1371355056762695, "geo/layer_0/attn_entropy_std": 0.429861456155777, "geo/layer_7/stable_rank_q_proj": 42.71900939941406, "geo/layer_7/stable_rank_k_proj": 40.33747100830078, "geo/layer_7/stable_rank_o_proj": 88.00675964355469, "geo/layer_7/stable_rank_gate_proj": 76.91707611083984, "geo/layer_7/stable_rank_down_proj": 139.90902709960938, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.43692031502723694, "geo/layer_7/attn_entropy_mean": 4.653885841369629, "geo/layer_7/attn_entropy_std": 0.7903562188148499, "geo/layer_14/stable_rank_q_proj": 49.54263687133789, "geo/layer_14/stable_rank_k_proj": 41.76838302612305, "geo/layer_14/stable_rank_o_proj": 43.223731994628906, "geo/layer_14/stable_rank_gate_proj": 70.28958129882812, "geo/layer_14/stable_rank_down_proj": 124.73124694824219, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.41243669390678406, "geo/layer_14/attn_entropy_mean": 5.539846420288086, "geo/layer_14/attn_entropy_std": 0.41253921389579773, "geo/layer_21/stable_rank_q_proj": 39.687156677246094, "geo/layer_21/stable_rank_k_proj": 30.150774002075195, "geo/layer_21/stable_rank_o_proj": 68.03781127929688, "geo/layer_21/stable_rank_gate_proj": 63.345237731933594, "geo/layer_21/stable_rank_down_proj": 49.1945915222168, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14353258907794952, "geo/layer_21/attn_entropy_mean": 5.698178291320801, "geo/layer_21/attn_entropy_std": 0.29606544971466064, "geo/layer_27/stable_rank_q_proj": 43.89236068725586, "geo/layer_27/stable_rank_k_proj": 32.325164794921875, "geo/layer_27/stable_rank_o_proj": 115.38265228271484, "geo/layer_27/stable_rank_gate_proj": 76.87846374511719, "geo/layer_27/stable_rank_down_proj": 127.43063354492188, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09013307094573975, "geo/layer_27/attn_entropy_mean": 4.135878562927246, "geo/layer_27/attn_entropy_std": 0.7848477363586426, "attnres/final_alpha/block_0": 0.24014854431152344, "attnres/block_norm/0": 1.778955340385437, "attnres/final_alpha/block_1": 0.003931561950594187, "attnres/block_norm/1": 47987.6796875, "attnres/final_alpha/block_2": 0.009570563212037086, "attnres/block_norm/2": 29197.28125, "attnres/final_alpha/block_3": 0.011183352209627628, "attnres/block_norm/3": 62868.90234375, "attnres/final_alpha/block_4": 0.013227676972746849, "attnres/block_norm/4": 15839.5625, "attnres/final_alpha/block_5": 0.6159738302230835, "attnres/block_norm/5": 6891.5234375, "attnres/final_alpha/block_6": 0.10596448183059692, "attnres/block_norm/6": 41404.75, "geo/tier1_time_s": 1.3636212348937988, "geo/step": 76200.0, "geo/rankme_slope": 0.0003599139460471689} {"step": 76210, "timestamp": 1778276918.4620428, "train/loss": 2.039111351966858, "train/z_loss": 0.001403833704534918, "train/perplexity": 7.683777989860527, "train/grad_norm": 0.08984375, "optim/muon_lr": 0.013698790073394775, "optim/adamw_lr": 0.00041096370220184325, "perf/tokens_per_sec": 1785538.2597377338, "perf/iters_per_sec": 0.8514109896362942, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1745208978652955, "data/tokens_consumed": 159826051072, "data/tokens_consumed_B": 159.826051072, "train/loss_slope": -1.632499788532096e-05} {"step": 76220, "timestamp": 1778276928.8575478, "train/loss": 2.0821189284324646, "train/z_loss": 0.0014014664571732282, "train/perplexity": 8.021447793227566, "train/grad_norm": 0.1220703125, "optim/muon_lr": 0.013662843704223633, "optim/adamw_lr": 0.00040988531112670895, "perf/tokens_per_sec": 2021267.7172049335, "perf/iters_per_sec": 0.9638155542397182, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0375429153442384, "data/tokens_consumed": 159847022592, "data/tokens_consumed_B": 159.847022592, "train/loss_slope": -1.7545450123587545e-05} {"step": 76230, "timestamp": 1778276939.213448, "train/loss": 2.125088930130005, "train/z_loss": 0.0013760109199211002, "train/perplexity": 8.373642124099197, "train/grad_norm": 0.08740234375, "optim/muon_lr": 0.013626842498779297, "optim/adamw_lr": 0.0004088052749633789, "perf/tokens_per_sec": 2026198.6890264556, "perf/iters_per_sec": 0.9661668248302725, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350179433822633, "data/tokens_consumed": 159867994112, "data/tokens_consumed_B": 159.867994112, "train/loss_slope": -1.4073248335880999e-05} {"step": 76240, "timestamp": 1778276949.5759113, "train/loss": 2.0673784971237184, "train/z_loss": 0.0013968661078251898, "train/perplexity": 7.904075378263539, "train/grad_norm": 0.08642578125, "optim/muon_lr": 0.01359078586101532, "optim/adamw_lr": 0.00040772357583045957, "perf/tokens_per_sec": 2025003.8416670016, "perf/iters_per_sec": 0.9655970772109039, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035628652572632, "data/tokens_consumed": 159888965632, "data/tokens_consumed_B": 159.888965632, "train/loss_slope": -1.605464502243609e-05} {"step": 76250, "timestamp": 1778276959.9127421, "grad/layer_0/attn": 0.002549202647060156, "grad/layer_0/mlp": 0.002895187120884657, "grad/layer_0/attn_mlp_ratio": 0.8804966492913776, "grad/layer_4/attn": 0.002358933910727501, "grad/layer_4/mlp": 0.0025748582556843758, "grad/layer_4/attn_mlp_ratio": 0.9161412337575036, "grad/layer_8/attn": 0.006921286229044199, "grad/layer_8/mlp": 0.0036441292613744736, "grad/layer_8/attn_mlp_ratio": 1.8992976216501998, "grad/layer_12/attn": 0.004934113938361406, "grad/layer_12/mlp": 0.007214229088276625, "grad/layer_12/attn_mlp_ratio": 0.683941944397818, "grad/layer_16/attn": 0.003200122155249119, "grad/layer_16/mlp": 0.004332771524786949, "grad/layer_16/attn_mlp_ratio": 0.7385854673119254, "grad/layer_20/attn": 0.003327031619846821, "grad/layer_20/mlp": 0.005561898462474346, "grad/layer_20/attn_mlp_ratio": 0.598182721686803, "grad/layer_24/attn": 0.00487007899209857, "grad/layer_24/mlp": 0.006900237873196602, "grad/layer_24/attn_mlp_ratio": 0.7057842078803638, "grad/layer_27/attn": 0.0033768846187740564, "grad/layer_27/mlp": 0.006075692363083363, "grad/layer_27/attn_mlp_ratio": 0.5558024273434532} {"step": 76250, "timestamp": 1778276959.9272108, "train/loss": 2.132785665988922, "train/z_loss": 0.0013926813378930091, "train/perplexity": 8.438340499465728, "train/grad_norm": 0.08203125, "optim/muon_lr": 0.01355467677116394, "optim/adamw_lr": 0.00040664030313491815, "perf/tokens_per_sec": 2027306.5864775057, "perf/iters_per_sec": 0.9666951115024117, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344523191452026, "data/tokens_consumed": 159909937152, "data/tokens_consumed_B": 159.909937152, "train/loss_slope": -1.1640353993971823e-05} {"step": 76260, "timestamp": 1778276970.2723835, "train/loss": 2.0980086088180543, "train/z_loss": 0.001401138980872929, "train/perplexity": 8.149924055110901, "train/grad_norm": 0.123046875, "optim/muon_lr": 0.013518514335155487, "optim/adamw_lr": 0.00040555543005466456, "perf/tokens_per_sec": 2028544.198274584, "perf/iters_per_sec": 0.9672852507946892, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033821201324463, "data/tokens_consumed": 159930908672, "data/tokens_consumed_B": 159.930908672, "train/loss_slope": -1.2314015410520868e-05} {"step": 76270, "timestamp": 1778276980.6177082, "train/loss": 2.1090781331062316, "train/z_loss": 0.001388224505353719, "train/perplexity": 8.240641007814588, "train/grad_norm": 0.08935546875, "optim/muon_lr": 0.013482299447059632, "optim/adamw_lr": 0.0004044689834117889, "perf/tokens_per_sec": 2028283.1880917146, "perf/iters_per_sec": 0.9671607914408277, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0339542388916017, "data/tokens_consumed": 159951880192, "data/tokens_consumed_B": 159.951880192, "train/loss_slope": -1.2993899337386296e-05} {"step": 76275, "timestamp": 1778276986.3810072, "eos/sharpness": 11.24666929244995, "eos/L0_probe": 1.9348373413085938, "eos/L_plus": 1.9994585514068604, "eos/L_minus": 1.9826828241348267, "eos/grad_norm": 0.09347350150346756, "eos/embed_grad_frac": 0.21352767944335938, "eos/time_s": 0.5975282192230225} {"step": 76275, "timestamp": 1778276987.7612526, "geo/rankme_last": 439.0824890136719, "geo/layer_0/stable_rank_q_proj": 18.9320011138916, "geo/layer_0/stable_rank_k_proj": 15.8134183883667, "geo/layer_0/stable_rank_o_proj": 46.453304290771484, "geo/layer_0/stable_rank_gate_proj": 127.74337005615234, "geo/layer_0/stable_rank_down_proj": 56.69392395019531, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06294512003660202, "geo/layer_0/attn_entropy_mean": 6.135455131530762, "geo/layer_0/attn_entropy_std": 0.4280308485031128, "geo/layer_7/stable_rank_q_proj": 42.69752883911133, "geo/layer_7/stable_rank_k_proj": 40.273006439208984, "geo/layer_7/stable_rank_o_proj": 87.97331237792969, "geo/layer_7/stable_rank_gate_proj": 76.90644836425781, "geo/layer_7/stable_rank_down_proj": 139.99127197265625, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4486454725265503, "geo/layer_7/attn_entropy_mean": 4.640625476837158, "geo/layer_7/attn_entropy_std": 0.7875884175300598, "geo/layer_14/stable_rank_q_proj": 49.52764892578125, "geo/layer_14/stable_rank_k_proj": 41.709449768066406, "geo/layer_14/stable_rank_o_proj": 43.215858459472656, "geo/layer_14/stable_rank_gate_proj": 70.19821166992188, "geo/layer_14/stable_rank_down_proj": 124.70358276367188, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4085284173488617, "geo/layer_14/attn_entropy_mean": 5.51378059387207, "geo/layer_14/attn_entropy_std": 0.40814074873924255, "geo/layer_21/stable_rank_q_proj": 39.69129943847656, "geo/layer_21/stable_rank_k_proj": 30.14911460876465, "geo/layer_21/stable_rank_o_proj": 68.0982894897461, "geo/layer_21/stable_rank_gate_proj": 63.348060607910156, "geo/layer_21/stable_rank_down_proj": 49.18682861328125, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1446920931339264, "geo/layer_21/attn_entropy_mean": 5.675042152404785, "geo/layer_21/attn_entropy_std": 0.295175164937973, "geo/layer_27/stable_rank_q_proj": 43.956363677978516, "geo/layer_27/stable_rank_k_proj": 32.34729766845703, "geo/layer_27/stable_rank_o_proj": 115.38719940185547, "geo/layer_27/stable_rank_gate_proj": 76.87158966064453, "geo/layer_27/stable_rank_down_proj": 127.50830078125, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08730316907167435, "geo/layer_27/attn_entropy_mean": 4.142391681671143, "geo/layer_27/attn_entropy_std": 0.7895379066467285, "attnres/final_alpha/block_0": 0.23993436992168427, "attnres/block_norm/0": 1.779006004333496, "attnres/final_alpha/block_1": 0.003892923006787896, "attnres/block_norm/1": 47985.1015625, "attnres/final_alpha/block_2": 0.009522616863250732, "attnres/block_norm/2": 29065.396484375, "attnres/final_alpha/block_3": 0.011162773706018925, "attnres/block_norm/3": 63142.02734375, "attnres/final_alpha/block_4": 0.013321620412170887, "attnres/block_norm/4": 15734.3681640625, "attnres/final_alpha/block_5": 0.6149980425834656, "attnres/block_norm/5": 6869.744140625, "attnres/final_alpha/block_6": 0.10716763138771057, "attnres/block_norm/6": 41206.421875, "geo/tier1_time_s": 1.3621227741241455, "geo/step": 76275.0, "geo/rankme_slope": 0.0003518370043329832} {"step": 76280, "timestamp": 1778276992.9330575, "train/loss": 2.0991410613059998, "train/z_loss": 0.0013962562312372028, "train/perplexity": 8.15915868478644, "train/grad_norm": 0.0849609375, "optim/muon_lr": 0.01344603270292282, "optim/adamw_lr": 0.0004033809810876846, "perf/tokens_per_sec": 1704026.5477802365, "perf/iters_per_sec": 0.8125431765462096, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2307038307189941, "data/tokens_consumed": 159972851712, "data/tokens_consumed_B": 159.972851712, "train/loss_slope": -1.3763148506375164e-05} {"step": 76290, "timestamp": 1778277003.2796414, "train/loss": 2.061254620552063, "train/z_loss": 0.0013998970738612116, "train/perplexity": 7.855819702940162, "train/grad_norm": 0.08642578125, "optim/muon_lr": 0.013409714400768281, "optim/adamw_lr": 0.00040229143202304835, "perf/tokens_per_sec": 2028274.16152192, "perf/iters_per_sec": 0.9671564872369385, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0339588403701783, "data/tokens_consumed": 159993823232, "data/tokens_consumed_B": 159.993823232, "train/loss_slope": -1.6044617593377854e-05} {"step": 76300, "timestamp": 1778277013.6209118, "grad/layer_0/attn": 0.002746527548879385, "grad/layer_0/mlp": 0.003146218368783593, "grad/layer_0/attn_mlp_ratio": 0.8729614857105779, "grad/layer_4/attn": 0.004787901416420937, "grad/layer_4/mlp": 0.002483888529241085, "grad/layer_4/attn_mlp_ratio": 1.927582968115526, "grad/layer_8/attn": 0.003235784126445651, "grad/layer_8/mlp": 0.003665945027023554, "grad/layer_8/attn_mlp_ratio": 0.8826602729519963, "grad/layer_12/attn": 0.0056526619009673595, "grad/layer_12/mlp": 0.006724454462528229, "grad/layer_12/attn_mlp_ratio": 0.8406126992762516, "grad/layer_16/attn": 0.0033269461710006, "grad/layer_16/mlp": 0.004431272391229868, "grad/layer_16/attn_mlp_ratio": 0.7507879909405504, "grad/layer_20/attn": 0.0029391671996563673, "grad/layer_20/mlp": 0.005575940012931824, "grad/layer_20/attn_mlp_ratio": 0.5271159912280615, "grad/layer_24/attn": 0.005613365210592747, "grad/layer_24/mlp": 0.007210962008684874, "grad/layer_24/attn_mlp_ratio": 0.7784488568913733, "grad/layer_27/attn": 0.004834587685763836, "grad/layer_27/mlp": 0.006736536044627428, "grad/layer_27/attn_mlp_ratio": 0.7176667031794303} {"step": 76300, "timestamp": 1778277013.635557, "train/loss": 2.1614957809448243, "train/z_loss": 0.0013826496549881994, "train/perplexity": 8.684117486012244, "train/grad_norm": 0.10693359375, "optim/muon_lr": 0.013373343944549561, "optim/adamw_lr": 0.0004012003183364868, "perf/tokens_per_sec": 2026058.9569569065, "perf/iters_per_sec": 0.9661001953873188, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350893259048461, "data/tokens_consumed": 160014794752, "data/tokens_consumed_B": 160.014794752, "train/loss_slope": -1.5027667279362232e-05} {"step": 76310, "timestamp": 1778277023.9889095, "train/loss": 2.0585200548172, "train/z_loss": 0.0014061112655326725, "train/perplexity": 7.834366793125753, "train/grad_norm": 0.09326171875, "optim/muon_lr": 0.013336924016475678, "optim/adamw_lr": 0.0004001077204942703, "perf/tokens_per_sec": 2027016.5593490738, "perf/iters_per_sec": 0.9665568157906884, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346003293991088, "data/tokens_consumed": 160035766272, "data/tokens_consumed_B": 160.035766272, "train/loss_slope": -1.930800461389981e-05} {"step": 76320, "timestamp": 1778277034.3431733, "train/loss": 2.0969609022140503, "train/z_loss": 0.0013978237402625382, "train/perplexity": 8.141389797336009, "train/grad_norm": 0.0859375, "optim/muon_lr": 0.013300454318523407, "optim/adamw_lr": 0.00039901362955570215, "perf/tokens_per_sec": 2026901.1882543378, "perf/iters_per_sec": 0.9665018025657357, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034659218788147, "data/tokens_consumed": 160056737792, "data/tokens_consumed_B": 160.056737792, "train/loss_slope": -1.9714175077519053e-05} {"step": 76330, "timestamp": 1778277045.2219281, "train/loss": 2.1644426345825196, "train/z_loss": 0.0013876368408091365, "train/perplexity": 8.709746052486823, "train/grad_norm": 0.09619140625, "optim/muon_lr": 0.013263935446739197, "optim/adamw_lr": 0.00039791806340217585, "perf/tokens_per_sec": 1928699.0190463373, "perf/iters_per_sec": 0.9196753592712104, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0873402118682862, "data/tokens_consumed": 160077709312, "data/tokens_consumed_B": 160.077709312, "train/loss_slope": -1.6081183349410233e-05} {"step": 76340, "timestamp": 1778277055.5710497, "train/loss": 2.064402222633362, "train/z_loss": 0.0013989050756208598, "train/perplexity": 7.88058565361922, "train/grad_norm": 0.12451171875, "optim/muon_lr": 0.01322736769914627, "optim/adamw_lr": 0.0003968210309743881, "perf/tokens_per_sec": 2027986.2420728733, "perf/iters_per_sec": 0.9670191965450636, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341056346893311, "data/tokens_consumed": 160098680832, "data/tokens_consumed_B": 160.098680832, "train/loss_slope": -1.844307858415313e-05} {"step": 76350, "timestamp": 1778277065.9193668, "grad/layer_0/attn": 0.0024903968442231417, "grad/layer_0/mlp": 0.002764904871582985, "grad/layer_0/attn_mlp_ratio": 0.9007169757437704, "grad/layer_4/attn": 0.0019211372127756476, "grad/layer_4/mlp": 0.0024964571930468082, "grad/layer_4/attn_mlp_ratio": 0.7695453946384119, "grad/layer_8/attn": 0.0034148513805121183, "grad/layer_8/mlp": 0.0036164268385618925, "grad/layer_8/attn_mlp_ratio": 0.9442611280486887, "grad/layer_12/attn": 0.005061357282102108, "grad/layer_12/mlp": 0.006498386152088642, "grad/layer_12/attn_mlp_ratio": 0.7788637187386854, "grad/layer_16/attn": 0.0034975269809365273, "grad/layer_16/mlp": 0.004443971905857325, "grad/layer_16/attn_mlp_ratio": 0.7870272306681171, "grad/layer_20/attn": 0.0029199309647083282, "grad/layer_20/mlp": 0.005119075533002615, "grad/layer_20/attn_mlp_ratio": 0.5704019971659671, "grad/layer_24/attn": 0.007356642279773951, "grad/layer_24/mlp": 0.006585738155990839, "grad/layer_24/attn_mlp_ratio": 1.1170565840635778, "grad/layer_27/attn": 0.00558804627507925, "grad/layer_27/mlp": 0.006373474840074778, "grad/layer_27/attn_mlp_ratio": 0.8767660228712986} {"step": 76350, "timestamp": 1778277066.516836, "eos/sharpness": 50.926733016967766, "eos/L0_probe": 1.9306915998458862, "eos/L_plus": 2.157193422317505, "eos/L_minus": 2.2134571075439453, "eos/grad_norm": 0.10537154227495193, "eos/embed_grad_frac": 0.16657482087612152, "eos/time_s": 0.594618558883667} {"step": 76350, "timestamp": 1778277066.534859, "train/loss": 2.1195773839950562, "train/z_loss": 0.0013910994748584926, "train/perplexity": 8.32761735952965, "train/grad_norm": 0.10498046875, "optim/muon_lr": 0.013190751671791076, "optim/adamw_lr": 0.00039572255015373224, "perf/tokens_per_sec": 1914198.425853505, "perf/iters_per_sec": 0.9127609376208806, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0955771207809448, "data/tokens_consumed": 160119652352, "data/tokens_consumed_B": 160.119652352, "train/loss_slope": -1.8619961192076377e-05} {"step": 76350, "timestamp": 1778277067.9053764, "geo/rankme_last": 439.7609558105469, "geo/layer_0/stable_rank_q_proj": 18.920642852783203, "geo/layer_0/stable_rank_k_proj": 15.823051452636719, "geo/layer_0/stable_rank_o_proj": 46.44743728637695, "geo/layer_0/stable_rank_gate_proj": 127.69442749023438, "geo/layer_0/stable_rank_down_proj": 56.66777038574219, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06347266584634781, "geo/layer_0/attn_entropy_mean": 6.13651180267334, "geo/layer_0/attn_entropy_std": 0.42992904782295227, "geo/layer_7/stable_rank_q_proj": 42.668209075927734, "geo/layer_7/stable_rank_k_proj": 40.250553131103516, "geo/layer_7/stable_rank_o_proj": 87.9361801147461, "geo/layer_7/stable_rank_gate_proj": 76.92472076416016, "geo/layer_7/stable_rank_down_proj": 139.9178009033203, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4346529245376587, "geo/layer_7/attn_entropy_mean": 4.639047622680664, "geo/layer_7/attn_entropy_std": 0.7840118408203125, "geo/layer_14/stable_rank_q_proj": 49.52690124511719, "geo/layer_14/stable_rank_k_proj": 41.739585876464844, "geo/layer_14/stable_rank_o_proj": 43.16899490356445, "geo/layer_14/stable_rank_gate_proj": 70.1703109741211, "geo/layer_14/stable_rank_down_proj": 124.67530059814453, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.40735238790512085, "geo/layer_14/attn_entropy_mean": 5.5273518562316895, "geo/layer_14/attn_entropy_std": 0.4132283926010132, "geo/layer_21/stable_rank_q_proj": 39.69211959838867, "geo/layer_21/stable_rank_k_proj": 30.173871994018555, "geo/layer_21/stable_rank_o_proj": 68.10430908203125, "geo/layer_21/stable_rank_gate_proj": 63.30678939819336, "geo/layer_21/stable_rank_down_proj": 49.17889404296875, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.144791841506958, "geo/layer_21/attn_entropy_mean": 5.692255020141602, "geo/layer_21/attn_entropy_std": 0.3028718829154968, "geo/layer_27/stable_rank_q_proj": 43.98114776611328, "geo/layer_27/stable_rank_k_proj": 32.37581253051758, "geo/layer_27/stable_rank_o_proj": 115.35396575927734, "geo/layer_27/stable_rank_gate_proj": 76.80896759033203, "geo/layer_27/stable_rank_down_proj": 127.47579193115234, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.10007067769765854, "geo/layer_27/attn_entropy_mean": 4.1617279052734375, "geo/layer_27/attn_entropy_std": 0.7914156317710876, "attnres/final_alpha/block_0": 0.24194177985191345, "attnres/block_norm/0": 1.7790303230285645, "attnres/final_alpha/block_1": 0.003916559740900993, "attnres/block_norm/1": 47874.57421875, "attnres/final_alpha/block_2": 0.009714849293231964, "attnres/block_norm/2": 29079.94140625, "attnres/final_alpha/block_3": 0.011366767808794975, "attnres/block_norm/3": 62742.6796875, "attnres/final_alpha/block_4": 0.013284152373671532, "attnres/block_norm/4": 15838.71875, "attnres/final_alpha/block_5": 0.6125404238700867, "attnres/block_norm/5": 6867.55908203125, "attnres/final_alpha/block_6": 0.10723546147346497, "attnres/block_norm/6": 41032.8125, "geo/tier1_time_s": 1.3664400577545166, "geo/step": 76350.0, "geo/rankme_slope": 0.0003769643404236695} {"step": 76360, "timestamp": 1778277078.2658274, "train/loss": 2.126668381690979, "train/z_loss": 0.0013930520857684315, "train/perplexity": 8.38687833644961, "train/grad_norm": 0.0869140625, "optim/muon_lr": 0.013154086768627168, "optim/adamw_lr": 0.000394622603058815, "perf/tokens_per_sec": 1788327.8971887422, "perf/iters_per_sec": 0.8527411924308501, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1726887464523315, "data/tokens_consumed": 160140623872, "data/tokens_consumed_B": 160.140623872, "train/loss_slope": -1.8423287696105884e-05} {"step": 76370, "timestamp": 1778277088.6118934, "train/loss": 2.0781073808670043, "train/z_loss": 0.0013952925452031194, "train/perplexity": 7.989333830272319, "train/grad_norm": 0.15625, "optim/muon_lr": 0.01311737596988678, "optim/adamw_lr": 0.00039352127909660333, "perf/tokens_per_sec": 2028484.412592318, "perf/iters_per_sec": 0.9672567427598562, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0338516712188721, "data/tokens_consumed": 160161595392, "data/tokens_consumed_B": 160.161595392, "train/loss_slope": -2.1164583287151807e-05} {"step": 76380, "timestamp": 1778277098.965373, "train/loss": 2.108559763431549, "train/z_loss": 0.0014054532046429812, "train/perplexity": 8.236370416384366, "train/grad_norm": 0.091796875, "optim/muon_lr": 0.013080618381500245, "optim/adamw_lr": 0.0003924185514450073, "perf/tokens_per_sec": 2026509.0247486986, "perf/iters_per_sec": 0.9663148044341557, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348594427108764, "data/tokens_consumed": 160182566912, "data/tokens_consumed_B": 160.182566912, "train/loss_slope": -1.8582751398289698e-05} {"step": 76390, "timestamp": 1778277109.3095112, "train/loss": 2.114478659629822, "train/z_loss": 0.0014036984532140196, "train/perplexity": 8.285265196748771, "train/grad_norm": 0.0849609375, "optim/muon_lr": 0.013043814897537231, "optim/adamw_lr": 0.0003913144469261169, "perf/tokens_per_sec": 2028333.981583494, "perf/iters_per_sec": 0.967185011665103, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033928346633911, "data/tokens_consumed": 160203538432, "data/tokens_consumed_B": 160.203538432, "train/loss_slope": -1.836989530385379e-05} {"step": 76400, "timestamp": 1778277119.643321, "grad/layer_0/attn": 0.0025956809986382723, "grad/layer_0/mlp": 0.002865759190171957, "grad/layer_0/attn_mlp_ratio": 0.9057568120044447, "grad/layer_4/attn": 0.001952422084286809, "grad/layer_4/mlp": 0.0024757946375757456, "grad/layer_4/attn_mlp_ratio": 0.7886041821861961, "grad/layer_8/attn": 0.003995767328888178, "grad/layer_8/mlp": 0.0033830786123871803, "grad/layer_8/attn_mlp_ratio": 1.181103860887904, "grad/layer_12/attn": 0.004391187336295843, "grad/layer_12/mlp": 0.00737970182672143, "grad/layer_12/attn_mlp_ratio": 0.5950358672883026, "grad/layer_16/attn": 0.0032431399449706078, "grad/layer_16/mlp": 0.00455515505746007, "grad/layer_16/attn_mlp_ratio": 0.7119713451822277, "grad/layer_20/attn": 0.0029254525434225798, "grad/layer_20/mlp": 0.005582728423178196, "grad/layer_20/attn_mlp_ratio": 0.5240184134472559, "grad/layer_24/attn": 0.009176676161587238, "grad/layer_24/mlp": 0.007674515713006258, "grad/layer_24/attn_mlp_ratio": 1.195733566153479, "grad/layer_27/attn": 0.0062088510021567345, "grad/layer_27/mlp": 0.007136957254260778, "grad/layer_27/attn_mlp_ratio": 0.8699576996141133} {"step": 76400, "timestamp": 1778277119.6574605, "train/loss": 2.0696504950523376, "train/z_loss": 0.001397968444507569, "train/perplexity": 7.922053836927331, "train/grad_norm": 0.1259765625, "optim/muon_lr": 0.013006966114044189, "optim/adamw_lr": 0.00039020898342132567, "perf/tokens_per_sec": 2028230.2926014073, "perf/iters_per_sec": 0.9671355689055477, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.033981204032898, "data/tokens_consumed": 160224509952, "data/tokens_consumed_B": 160.224509952, "train/loss_slope": -1.7584877024174726e-05} {"step": 76410, "timestamp": 1778277129.9990573, "train/loss": 2.1063029050827025, "train/z_loss": 0.00139811149565503, "train/perplexity": 8.217803054879301, "train/grad_norm": 0.08251953125, "optim/muon_lr": 0.012970072031021119, "optim/adamw_lr": 0.0003891021609306335, "perf/tokens_per_sec": 2028961.8607201152, "perf/iters_per_sec": 0.9674844077683045, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0336083889007568, "data/tokens_consumed": 160245481472, "data/tokens_consumed_B": 160.245481472, "train/loss_slope": -1.7764756846206165e-05} {"step": 76420, "timestamp": 1778277140.3541512, "train/loss": 2.1301739692687987, "train/z_loss": 0.0013908541179262102, "train/perplexity": 8.416330867013, "train/grad_norm": 0.08837890625, "optim/muon_lr": 0.012933132648468017, "optim/adamw_lr": 0.0003879939794540405, "perf/tokens_per_sec": 2026768.5977803662, "perf/iters_per_sec": 0.9664385785009223, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034726905822754, "data/tokens_consumed": 160266452992, "data/tokens_consumed_B": 160.266452992, "train/loss_slope": -1.5957377202296954e-05} {"step": 76425, "timestamp": 1778277146.1190324, "eos/sharpness": 48.381471633911126, "eos/L0_probe": 1.9289488792419434, "eos/L_plus": 2.1804356575012207, "eos/L_minus": 2.1612768173217773, "eos/grad_norm": 0.11344030499458313, "eos/embed_grad_frac": 0.15485282242298126, "eos/time_s": 0.5974974632263184} {"step": 76425, "timestamp": 1778277147.500744, "geo/rankme_last": 440.01849365234375, "geo/layer_0/stable_rank_q_proj": 18.90086555480957, "geo/layer_0/stable_rank_k_proj": 15.807804107666016, "geo/layer_0/stable_rank_o_proj": 46.459163665771484, "geo/layer_0/stable_rank_gate_proj": 127.65350341796875, "geo/layer_0/stable_rank_down_proj": 56.668006896972656, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06428410112857819, "geo/layer_0/attn_entropy_mean": 6.137507438659668, "geo/layer_0/attn_entropy_std": 0.4299238920211792, "geo/layer_7/stable_rank_q_proj": 42.66941833496094, "geo/layer_7/stable_rank_k_proj": 40.22941970825195, "geo/layer_7/stable_rank_o_proj": 87.93939208984375, "geo/layer_7/stable_rank_gate_proj": 76.93025207519531, "geo/layer_7/stable_rank_down_proj": 139.95962524414062, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4344618022441864, "geo/layer_7/attn_entropy_mean": 4.625357151031494, "geo/layer_7/attn_entropy_std": 0.7972694039344788, "geo/layer_14/stable_rank_q_proj": 49.499568939208984, "geo/layer_14/stable_rank_k_proj": 41.715248107910156, "geo/layer_14/stable_rank_o_proj": 43.14727020263672, "geo/layer_14/stable_rank_gate_proj": 70.20960998535156, "geo/layer_14/stable_rank_down_proj": 124.78854370117188, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4216008186340332, "geo/layer_14/attn_entropy_mean": 5.524878025054932, "geo/layer_14/attn_entropy_std": 0.41366422176361084, "geo/layer_21/stable_rank_q_proj": 39.66629409790039, "geo/layer_21/stable_rank_k_proj": 30.152454376220703, "geo/layer_21/stable_rank_o_proj": 68.11188507080078, "geo/layer_21/stable_rank_gate_proj": 63.279632568359375, "geo/layer_21/stable_rank_down_proj": 49.20423126220703, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14768293499946594, "geo/layer_21/attn_entropy_mean": 5.680193901062012, "geo/layer_21/attn_entropy_std": 0.2946811616420746, "geo/layer_27/stable_rank_q_proj": 43.974205017089844, "geo/layer_27/stable_rank_k_proj": 32.352561950683594, "geo/layer_27/stable_rank_o_proj": 115.3221435546875, "geo/layer_27/stable_rank_gate_proj": 76.73048400878906, "geo/layer_27/stable_rank_down_proj": 127.4835205078125, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0904061570763588, "geo/layer_27/attn_entropy_mean": 4.163036346435547, "geo/layer_27/attn_entropy_std": 0.792560875415802, "attnres/final_alpha/block_0": 0.24157759547233582, "attnres/block_norm/0": 1.7790937423706055, "attnres/final_alpha/block_1": 0.0038862437941133976, "attnres/block_norm/1": 47968.7578125, "attnres/final_alpha/block_2": 0.009644398465752602, "attnres/block_norm/2": 29117.771484375, "attnres/final_alpha/block_3": 0.011379165574908257, "attnres/block_norm/3": 62652.74609375, "attnres/final_alpha/block_4": 0.013266037218272686, "attnres/block_norm/4": 15785.4541015625, "attnres/final_alpha/block_5": 0.6140809655189514, "attnres/block_norm/5": 6834.2998046875, "attnres/final_alpha/block_6": 0.10616560280323029, "attnres/block_norm/6": 41022.15625, "geo/tier1_time_s": 1.3631384372711182, "geo/step": 76425.0, "geo/rankme_slope": 0.00042636632778111244} {"step": 76430, "timestamp": 1778277152.6739638, "train/loss": 2.0851757764816283, "train/z_loss": 0.0013898301520384848, "train/perplexity": 8.046005655970129, "train/grad_norm": 0.1220703125, "optim/muon_lr": 0.012896150648593903, "optim/adamw_lr": 0.00038688451945781703, "perf/tokens_per_sec": 1703389.6674141672, "perf/iters_per_sec": 0.8122394883223377, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2311639785766602, "data/tokens_consumed": 160287424512, "data/tokens_consumed_B": 160.287424512, "train/loss_slope": -1.7376953511848997e-05} {"step": 76440, "timestamp": 1778277163.0150383, "train/loss": 2.092954897880554, "train/z_loss": 0.0013851187075488269, "train/perplexity": 8.10884059418096, "train/grad_norm": 0.107421875, "optim/muon_lr": 0.012859125137329102, "optim/adamw_lr": 0.000385773754119873, "perf/tokens_per_sec": 2028897.604564796, "perf/iters_per_sec": 0.967453768045805, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0336411237716674, "data/tokens_consumed": 160308396032, "data/tokens_consumed_B": 160.308396032, "train/loss_slope": -1.649060883108577e-05} {"step": 76450, "timestamp": 1778277173.3537247, "grad/layer_0/attn": 0.002748714527115226, "grad/layer_0/mlp": 0.0030555210541933775, "grad/layer_0/attn_mlp_ratio": 0.8995894279255464, "grad/layer_4/attn": 0.0027843457646667957, "grad/layer_4/mlp": 0.002674270886927843, "grad/layer_4/attn_mlp_ratio": 1.0411606670666553, "grad/layer_8/attn": 0.006663162261247635, "grad/layer_8/mlp": 0.0035084611736238003, "grad/layer_8/attn_mlp_ratio": 1.8991693912486667, "grad/layer_12/attn": 0.004329057410359383, "grad/layer_12/mlp": 0.0070077660493552685, "grad/layer_12/attn_mlp_ratio": 0.6177514086650375, "grad/layer_16/attn": 0.003845746861770749, "grad/layer_16/mlp": 0.0046671354211866856, "grad/layer_16/attn_mlp_ratio": 0.8240058263388318, "grad/layer_20/attn": 0.0036830506287515163, "grad/layer_20/mlp": 0.0052445014007389545, "grad/layer_20/attn_mlp_ratio": 0.7022689626902713, "grad/layer_24/attn": 0.004747576080262661, "grad/layer_24/mlp": 0.0070499274879693985, "grad/layer_24/attn_mlp_ratio": 0.6734219637041851, "grad/layer_27/attn": 0.006777267903089523, "grad/layer_27/mlp": 0.006667228881269693, "grad/layer_27/attn_mlp_ratio": 1.0165044461693102} {"step": 76450, "timestamp": 1778277173.3680704, "train/loss": 2.0682603240013124, "train/z_loss": 0.0014082142151892185, "train/perplexity": 7.911048478456339, "train/grad_norm": 0.12060546875, "optim/muon_lr": 0.012822057008743287, "optim/adamw_lr": 0.00038466171026229853, "perf/tokens_per_sec": 2026547.8700882278, "perf/iters_per_sec": 0.9663333273354663, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348396062850953, "data/tokens_consumed": 160329367552, "data/tokens_consumed_B": 160.329367552, "train/loss_slope": -2.1444574913175978e-05} {"step": 76460, "timestamp": 1778277183.7177057, "train/loss": 2.1073652505874634, "train/z_loss": 0.0014025142416357994, "train/perplexity": 8.226537839871957, "train/grad_norm": 0.1005859375, "optim/muon_lr": 0.01278494656085968, "optim/adamw_lr": 0.0003835483968257904, "perf/tokens_per_sec": 2027755.5259518265, "perf/iters_per_sec": 0.966909182525552, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342232942581178, "data/tokens_consumed": 160350339072, "data/tokens_consumed_B": 160.350339072, "train/loss_slope": -2.110105531551156e-05} {"step": 76470, "timestamp": 1778277194.071031, "train/loss": 2.076493668556213, "train/z_loss": 0.0013965435209684074, "train/perplexity": 7.976451740704137, "train/grad_norm": 0.09765625, "optim/muon_lr": 0.012747794687747956, "optim/adamw_lr": 0.0003824338406324386, "perf/tokens_per_sec": 2026730.4444545673, "perf/iters_per_sec": 0.9664203855774723, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347463846206666, "data/tokens_consumed": 160371310592, "data/tokens_consumed_B": 160.371310592, "train/loss_slope": -1.9719264924329495e-05} {"step": 76480, "timestamp": 1778277204.4252162, "train/loss": 2.090116763114929, "train/z_loss": 0.0013993653934448957, "train/perplexity": 8.085859239299078, "train/grad_norm": 0.076171875, "optim/muon_lr": 0.01271060049533844, "optim/adamw_lr": 0.00038131801486015316, "perf/tokens_per_sec": 2026410.7508219364, "perf/iters_per_sec": 0.966267943774193, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349096298217773, "data/tokens_consumed": 160392282112, "data/tokens_consumed_B": 160.392282112, "train/loss_slope": -1.9664947563844358e-05} {"step": 76490, "timestamp": 1778277214.7693198, "train/loss": 2.1038001537323, "train/z_loss": 0.00138728404417634, "train/perplexity": 8.197261652919314, "train/grad_norm": 0.08984375, "optim/muon_lr": 0.012673366963863373, "optim/adamw_lr": 0.00038020100891590115, "perf/tokens_per_sec": 2028921.33160048, "perf/iters_per_sec": 0.9674650819780731, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0336290359497071, "data/tokens_consumed": 160413253632, "data/tokens_consumed_B": 160.413253632, "train/loss_slope": -1.998059743165325e-05} {"step": 76500, "timestamp": 1778277225.1024325, "grad/layer_0/attn": 0.002858737949281931, "grad/layer_0/mlp": 0.0030426234006881714, "grad/layer_0/attn_mlp_ratio": 0.9395634881001051, "grad/layer_4/attn": 0.0028185334522277117, "grad/layer_4/mlp": 0.0024790146853774786, "grad/layer_4/attn_mlp_ratio": 1.1369570963646087, "grad/layer_8/attn": 0.004706413950771093, "grad/layer_8/mlp": 0.0036552068777382374, "grad/layer_8/attn_mlp_ratio": 1.2875916410302215, "grad/layer_12/attn": 0.005681681912392378, "grad/layer_12/mlp": 0.006720236502587795, "grad/layer_12/attn_mlp_ratio": 0.8454586123060763, "grad/layer_16/attn": 0.004794979467988014, "grad/layer_16/mlp": 0.004463274031877518, "grad/layer_16/attn_mlp_ratio": 1.074318835525113, "grad/layer_20/attn": 0.0027820745017379522, "grad/layer_20/mlp": 0.005216700490564108, "grad/layer_20/attn_mlp_ratio": 0.5333015482564072, "grad/layer_24/attn": 0.005690807942301035, "grad/layer_24/mlp": 0.006740309298038483, "grad/layer_24/attn_mlp_ratio": 0.8442947654535168, "grad/layer_27/attn": 0.0036005217116326094, "grad/layer_27/mlp": 0.0067269993014633656, "grad/layer_27/attn_mlp_ratio": 0.5352344331782408} {"step": 76500, "timestamp": 1778277225.6995144, "eos/sharpness": 19.804000854492184, "eos/L0_probe": 1.928032636642456, "eos/L_plus": 2.024672269821167, "eos/L_minus": 2.029433012008667, "eos/grad_norm": 0.09224852919578552, "eos/embed_grad_frac": 0.22627849876880646, "eos/time_s": 0.5942292213439941} {"step": 76500, "timestamp": 1778277225.717158, "train/loss": 2.139489436149597, "train/z_loss": 0.001380519673693925, "train/perplexity": 8.495099230892894, "train/grad_norm": 0.09228515625, "optim/muon_lr": 0.012636093199253082, "optim/adamw_lr": 0.0003790827959775924, "perf/tokens_per_sec": 1916453.2442879034, "perf/iters_per_sec": 0.9138361188354032, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0942881107330322, "data/tokens_consumed": 160434225152, "data/tokens_consumed_B": 160.434225152, "train/loss_slope": -1.6306342786759755e-05} {"step": 76500, "timestamp": 1778277227.0816243, "geo/rankme_last": 439.1917724609375, "geo/layer_0/stable_rank_q_proj": 18.889638900756836, "geo/layer_0/stable_rank_k_proj": 15.792865753173828, "geo/layer_0/stable_rank_o_proj": 46.44586944580078, "geo/layer_0/stable_rank_gate_proj": 127.81902313232422, "geo/layer_0/stable_rank_down_proj": 56.676368713378906, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06317870318889618, "geo/layer_0/attn_entropy_mean": 6.135849952697754, "geo/layer_0/attn_entropy_std": 0.4324222207069397, "geo/layer_7/stable_rank_q_proj": 42.66494369506836, "geo/layer_7/stable_rank_k_proj": 40.20189666748047, "geo/layer_7/stable_rank_o_proj": 87.99535369873047, "geo/layer_7/stable_rank_gate_proj": 76.90409088134766, "geo/layer_7/stable_rank_down_proj": 140.0718536376953, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4385520815849304, "geo/layer_7/attn_entropy_mean": 4.637921333312988, "geo/layer_7/attn_entropy_std": 0.7875615954399109, "geo/layer_14/stable_rank_q_proj": 49.464263916015625, "geo/layer_14/stable_rank_k_proj": 41.64702606201172, "geo/layer_14/stable_rank_o_proj": 43.1308708190918, "geo/layer_14/stable_rank_gate_proj": 70.2197265625, "geo/layer_14/stable_rank_down_proj": 124.85723114013672, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3999570310115814, "geo/layer_14/attn_entropy_mean": 5.529215335845947, "geo/layer_14/attn_entropy_std": 0.4025022089481354, "geo/layer_21/stable_rank_q_proj": 39.66417694091797, "geo/layer_21/stable_rank_k_proj": 30.1578369140625, "geo/layer_21/stable_rank_o_proj": 68.1173324584961, "geo/layer_21/stable_rank_gate_proj": 63.22489547729492, "geo/layer_21/stable_rank_down_proj": 49.21809768676758, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1475144773721695, "geo/layer_21/attn_entropy_mean": 5.686415672302246, "geo/layer_21/attn_entropy_std": 0.2946861684322357, "geo/layer_27/stable_rank_q_proj": 43.974727630615234, "geo/layer_27/stable_rank_k_proj": 32.346126556396484, "geo/layer_27/stable_rank_o_proj": 115.3636245727539, "geo/layer_27/stable_rank_gate_proj": 76.67369079589844, "geo/layer_27/stable_rank_down_proj": 127.41637420654297, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09931450337171555, "geo/layer_27/attn_entropy_mean": 4.166903018951416, "geo/layer_27/attn_entropy_std": 0.790306031703949, "attnres/final_alpha/block_0": 0.2400871068239212, "attnres/block_norm/0": 1.7790955305099487, "attnres/final_alpha/block_1": 0.003915858455002308, "attnres/block_norm/1": 47989.48828125, "attnres/final_alpha/block_2": 0.009571940638124943, "attnres/block_norm/2": 29085.091796875, "attnres/final_alpha/block_3": 0.011404434219002724, "attnres/block_norm/3": 62901.4375, "attnres/final_alpha/block_4": 0.013243213295936584, "attnres/block_norm/4": 15799.09375, "attnres/final_alpha/block_5": 0.6154602766036987, "attnres/block_norm/5": 6866.76904296875, "attnres/final_alpha/block_6": 0.1063171774148941, "attnres/block_norm/6": 41277.42578125, "geo/tier1_time_s": 1.3605437278747559, "geo/step": 76500.0, "geo/rankme_slope": 0.000398172237645058} {"step": 76500, "timestamp": 1778277233.9876964, "geo/ww_alpha_mean": 7.7329421212531075, "geo/ww_alpha_std": 4.7775126955124785, "geo/ww_alpha_min": 1.334568304683502, "geo/ww_alpha_max": 33.11141538664237, "geo/ww_alpha_healthy_frac": 0.16243654822335024, "geo/ww_alpha_by_type/q_proj": 3.9441155597672695, "geo/ww_alpha_by_type/k_proj": 4.465406283368831, "geo/ww_alpha_by_type/v_proj": 8.050787409600105, "geo/ww_alpha_by_type/o_proj": 9.74902105210371, "geo/ww_alpha_by_type/gate_proj": 7.972769917180718, "geo/ww_alpha_by_type/up_proj": 12.030020644776272, "geo/ww_alpha_by_type/down_proj": 8.014351241293097, "geo/twonn_id/layer_0": 0.6971582174301147, "geo/twonn_id/layer_7": 3.3102097511291504, "geo/twonn_id/layer_14": 4.463735103607178, "geo/twonn_id/layer_21": 7.874997615814209, "geo/twonn_id/layer_27": 5.515906810760498, "geo/tier2_time_s": 6.899608373641968} {"step": 76500, "timestamp": 1778277234.666227, "eoc/jacobian_sigma/layer_0/attn": 1267.89892578125, "eoc/jacobian_sigma/layer_0/mlp": 8614.7236328125, "eoc/jacobian_sigma/layer_0": 8614.7236328125, "eoc/jacobian_sigma/layer_7/attn": 1.168058156967163, "eoc/jacobian_sigma/layer_7/mlp": 1.926659107208252, "eoc/jacobian_sigma/layer_7": 1.926659107208252, "eoc/jacobian_sigma/layer_14/attn": 1.3943967819213867, "eoc/jacobian_sigma/layer_14/mlp": 6.361214637756348, "eoc/jacobian_sigma/layer_14": 6.361214637756348, "eoc/jacobian_sigma/layer_21/attn": 1.0970152616500854, "eoc/jacobian_sigma/layer_21/mlp": 4.495145320892334, "eoc/jacobian_sigma/layer_21": 4.495145320892334, "eoc/jacobian_sigma/layer_27/attn": 3.0867762565612793, "eoc/jacobian_sigma/layer_27/mlp": 31.00611114501953, "eoc/jacobian_sigma/layer_27": 31.00611114501953, "eoc/layer0_sigma": 8614.7236328125, "eoc/sigma_max": 31.00611114501953, "eoc/sigma_min": 1.926659107208252, "eoc/sigma_mean": 10.947282552719116, "eoc/time_s": 0.6699092388153076} {"step": 76510, "timestamp": 1778277245.4122007, "train/loss": 2.15411696434021, "train/z_loss": 0.0013815197860822081, "train/perplexity": 8.62027480708928, "train/grad_norm": 0.11083984375, "optim/muon_lr": 0.012598780393600463, "optim/adamw_lr": 0.0003779634118080139, "perf/tokens_per_sec": 1065006.6245844977, "perf/iters_per_sec": 0.5078347323343743, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.9691445589065553, "data/tokens_consumed": 160455196672, "data/tokens_consumed_B": 160.455196672, "train/loss_slope": -1.3365790838956332e-05} {"step": 76520, "timestamp": 1778277256.3176904, "train/loss": 2.073625922203064, "train/z_loss": 0.001411552238278091, "train/perplexity": 7.9536100680295165, "train/grad_norm": 0.1435546875, "optim/muon_lr": 0.012561428546905519, "optim/adamw_lr": 0.0003768428564071655, "perf/tokens_per_sec": 1924022.5238201357, "perf/iters_per_sec": 0.9174454325771979, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.089983081817627, "data/tokens_consumed": 160476168192, "data/tokens_consumed_B": 160.476168192, "train/loss_slope": -1.6497414905388995e-05} {"step": 76530, "timestamp": 1778277266.6772664, "train/loss": 2.068937158584595, "train/z_loss": 0.001403194572776556, "train/perplexity": 7.916404762111111, "train/grad_norm": 0.12353515625, "optim/muon_lr": 0.012524038553237915, "optim/adamw_lr": 0.0003757211565971374, "perf/tokens_per_sec": 2025647.5733380988, "perf/iters_per_sec": 0.9659040323915953, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03529953956604, "data/tokens_consumed": 160497139712, "data/tokens_consumed_B": 160.497139712, "train/loss_slope": -1.6017749529145238e-05} {"step": 76540, "timestamp": 1778277277.0258126, "train/loss": 2.0578678846359253, "train/z_loss": 0.0013961578253656626, "train/perplexity": 7.82925911843171, "train/grad_norm": 0.10498046875, "optim/muon_lr": 0.012486609667539597, "optim/adamw_lr": 0.0003745982900261879, "perf/tokens_per_sec": 2028081.863227707, "perf/iters_per_sec": 0.9670647922647987, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340568780899049, "data/tokens_consumed": 160518111232, "data/tokens_consumed_B": 160.518111232, "train/loss_slope": -1.6324763730092316e-05} {"step": 76550, "timestamp": 1778277287.368202, "grad/layer_0/attn": 0.002426272025331855, "grad/layer_0/mlp": 0.0029220841825008392, "grad/layer_0/attn_mlp_ratio": 0.8303223968801311, "grad/layer_4/attn": 0.0019232443301007152, "grad/layer_4/mlp": 0.0026075993664562702, "grad/layer_4/attn_mlp_ratio": 0.7375535832251131, "grad/layer_8/attn": 0.0032020199578255415, "grad/layer_8/mlp": 0.0035999231040477753, "grad/layer_8/attn_mlp_ratio": 0.889468962622639, "grad/layer_12/attn": 0.004203171469271183, "grad/layer_12/mlp": 0.007010936737060547, "grad/layer_12/attn_mlp_ratio": 0.5995163794733934, "grad/layer_16/attn": 0.005874375347048044, "grad/layer_16/mlp": 0.004640562459826469, "grad/layer_16/attn_mlp_ratio": 1.2658756931546922, "grad/layer_20/attn": 0.0040207174606621265, "grad/layer_20/mlp": 0.0052626654505729675, "grad/layer_20/attn_mlp_ratio": 0.7640077869330638, "grad/layer_24/attn": 0.006763906683772802, "grad/layer_24/mlp": 0.007041289936751127, "grad/layer_24/attn_mlp_ratio": 0.9606061742194176, "grad/layer_27/attn": 0.0036756605841219425, "grad/layer_27/mlp": 0.006153726484626532, "grad/layer_27/attn_mlp_ratio": 0.5973064505830685} {"step": 76550, "timestamp": 1778277287.382898, "train/loss": 2.106103551387787, "train/z_loss": 0.0013959560310468078, "train/perplexity": 8.216164968760909, "train/grad_norm": 0.0888671875, "optim/muon_lr": 0.012449144721031188, "optim/adamw_lr": 0.0003734743416309356, "perf/tokens_per_sec": 2026259.9737326137, "perf/iters_per_sec": 0.9661960476553982, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034986639022827, "data/tokens_consumed": 160539082752, "data/tokens_consumed_B": 160.539082752, "train/loss_slope": -1.5974236106929757e-05} {"step": 76560, "timestamp": 1778277297.7551956, "train/loss": 2.06761257648468, "train/z_loss": 0.001394571003038436, "train/perplexity": 7.905925775738556, "train/grad_norm": 0.08154296875, "optim/muon_lr": 0.012411643117666245, "optim/adamw_lr": 0.0003723492935299873, "perf/tokens_per_sec": 2022937.2398196403, "perf/iters_per_sec": 0.9646116446588708, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036686635017395, "data/tokens_consumed": 160560054272, "data/tokens_consumed_B": 160.560054272, "train/loss_slope": -1.8423255034262717e-05} {"step": 76570, "timestamp": 1778277308.6121466, "train/loss": 2.084792172908783, "train/z_loss": 0.0014032693463377655, "train/perplexity": 8.042919771369386, "train/grad_norm": 0.08251953125, "optim/muon_lr": 0.012374105602502824, "optim/adamw_lr": 0.00037122316807508467, "perf/tokens_per_sec": 1933015.1456613478, "perf/iters_per_sec": 0.9217334488207568, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0849123477935791, "data/tokens_consumed": 160581025792, "data/tokens_consumed_B": 160.581025792, "train/loss_slope": -1.982312405606556e-05} {"step": 76575, "timestamp": 1778277314.8187354, "eos/sharpness": 19.140863418579098, "eos/L0_probe": 1.9274557828903198, "eos/L_plus": 2.0202009677886963, "eos/L_minus": 2.0261192321777344, "eos/grad_norm": 0.08769901841878891, "eos/embed_grad_frac": 0.2581210434436798, "eos/time_s": 0.622014045715332} {"step": 76575, "timestamp": 1778277316.197473, "geo/rankme_last": 439.84521484375, "geo/layer_0/stable_rank_q_proj": 18.883304595947266, "geo/layer_0/stable_rank_k_proj": 15.79589557647705, "geo/layer_0/stable_rank_o_proj": 46.439456939697266, "geo/layer_0/stable_rank_gate_proj": 127.92769622802734, "geo/layer_0/stable_rank_down_proj": 56.705650329589844, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.061619751155376434, "geo/layer_0/attn_entropy_mean": 6.13612174987793, "geo/layer_0/attn_entropy_std": 0.43048009276390076, "geo/layer_7/stable_rank_q_proj": 42.65517807006836, "geo/layer_7/stable_rank_k_proj": 40.161102294921875, "geo/layer_7/stable_rank_o_proj": 87.9614486694336, "geo/layer_7/stable_rank_gate_proj": 76.88763427734375, "geo/layer_7/stable_rank_down_proj": 140.05035400390625, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4392167329788208, "geo/layer_7/attn_entropy_mean": 4.638331890106201, "geo/layer_7/attn_entropy_std": 0.7858079671859741, "geo/layer_14/stable_rank_q_proj": 49.42753219604492, "geo/layer_14/stable_rank_k_proj": 41.65840148925781, "geo/layer_14/stable_rank_o_proj": 43.140159606933594, "geo/layer_14/stable_rank_gate_proj": 70.15702056884766, "geo/layer_14/stable_rank_down_proj": 124.87759399414062, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4096250832080841, "geo/layer_14/attn_entropy_mean": 5.5231122970581055, "geo/layer_14/attn_entropy_std": 0.417746365070343, "geo/layer_21/stable_rank_q_proj": 39.693092346191406, "geo/layer_21/stable_rank_k_proj": 30.16380500793457, "geo/layer_21/stable_rank_o_proj": 68.0895004272461, "geo/layer_21/stable_rank_gate_proj": 63.206363677978516, "geo/layer_21/stable_rank_down_proj": 49.26136779785156, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14604103565216064, "geo/layer_21/attn_entropy_mean": 5.681862831115723, "geo/layer_21/attn_entropy_std": 0.29184260964393616, "geo/layer_27/stable_rank_q_proj": 43.95130157470703, "geo/layer_27/stable_rank_k_proj": 32.310245513916016, "geo/layer_27/stable_rank_o_proj": 115.2728271484375, "geo/layer_27/stable_rank_gate_proj": 76.56597900390625, "geo/layer_27/stable_rank_down_proj": 127.34820556640625, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08918158710002899, "geo/layer_27/attn_entropy_mean": 4.174626350402832, "geo/layer_27/attn_entropy_std": 0.7921473979949951, "attnres/final_alpha/block_0": 0.2412053346633911, "attnres/block_norm/0": 1.779109001159668, "attnres/final_alpha/block_1": 0.003956823144108057, "attnres/block_norm/1": 47937.9375, "attnres/final_alpha/block_2": 0.00956693571060896, "attnres/block_norm/2": 29039.2734375, "attnres/final_alpha/block_3": 0.011382056400179863, "attnres/block_norm/3": 62812.6796875, "attnres/final_alpha/block_4": 0.013111284002661705, "attnres/block_norm/4": 15790.365234375, "attnres/final_alpha/block_5": 0.6136570572853088, "attnres/block_norm/5": 6841.23388671875, "attnres/final_alpha/block_6": 0.10712051391601562, "attnres/block_norm/6": 40888.6796875, "geo/tier1_time_s": 1.358170986175537, "geo/step": 76575.0, "geo/rankme_slope": 0.00041201910451680674} {"step": 76580, "timestamp": 1778277321.3970883, "train/loss": 2.065618598461151, "train/z_loss": 0.0014020149479620158, "train/perplexity": 7.8901772398212895, "train/grad_norm": 0.08349609375, "optim/muon_lr": 0.012336532473564149, "optim/adamw_lr": 0.0003700959742069244, "perf/tokens_per_sec": 1641056.0382822459, "perf/iters_per_sec": 0.7825164977465848, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2779283285140992, "data/tokens_consumed": 160601997312, "data/tokens_consumed_B": 160.601997312, "train/loss_slope": -2.2700801562375908e-05} {"step": 76590, "timestamp": 1778277331.7721703, "train/loss": 2.096439778804779, "train/z_loss": 0.0013901245896704496, "train/perplexity": 8.137148233813626, "train/grad_norm": 0.09716796875, "optim/muon_lr": 0.012298924475908279, "optim/adamw_lr": 0.00036896773427724837, "perf/tokens_per_sec": 2022648.0893312995, "perf/iters_per_sec": 0.9644737669617174, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0368348360061646, "data/tokens_consumed": 160622968832, "data/tokens_consumed_B": 160.622968832, "train/loss_slope": -2.4475323291930765e-05} {"step": 76600, "timestamp": 1778277342.1352751, "grad/layer_0/attn": 0.002598914783447981, "grad/layer_0/mlp": 0.00289590610191226, "grad/layer_0/attn_mlp_ratio": 0.8974443929612892, "grad/layer_4/attn": 0.0024206633679568768, "grad/layer_4/mlp": 0.002442551776766777, "grad/layer_4/attn_mlp_ratio": 0.991038671883246, "grad/layer_8/attn": 0.007885522209107876, "grad/layer_8/mlp": 0.0034179214853793383, "grad/layer_8/attn_mlp_ratio": 2.307110333613081, "grad/layer_12/attn": 0.0038657160475850105, "grad/layer_12/mlp": 0.0062951198779046535, "grad/layer_12/attn_mlp_ratio": 0.6140813933893797, "grad/layer_16/attn": 0.004977148491889238, "grad/layer_16/mlp": 0.004161427728831768, "grad/layer_16/attn_mlp_ratio": 1.196019418480812, "grad/layer_20/attn": 0.0034715321380645037, "grad/layer_20/mlp": 0.005194277968257666, "grad/layer_20/attn_mlp_ratio": 0.6683377540526959, "grad/layer_24/attn": 0.004507001023739576, "grad/layer_24/mlp": 0.006764700170606375, "grad/layer_24/attn_mlp_ratio": 0.6662528779468861, "grad/layer_27/attn": 0.0035032681189477444, "grad/layer_27/mlp": 0.005744737572968006, "grad/layer_27/attn_mlp_ratio": 0.6098221221540633} {"step": 76600, "timestamp": 1778277342.1517272, "train/loss": 2.1194493770599365, "train/z_loss": 0.0013991838437505066, "train/perplexity": 8.32655143497893, "train/grad_norm": 0.0849609375, "optim/muon_lr": 0.01226128101348877, "optim/adamw_lr": 0.0003678384304046631, "perf/tokens_per_sec": 2021439.3070663908, "perf/iters_per_sec": 0.963897374661632, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0374548435211182, "data/tokens_consumed": 160643940352, "data/tokens_consumed_B": 160.643940352, "train/loss_slope": -2.2744134371608978e-05} {"step": 76610, "timestamp": 1778277352.5254881, "train/loss": 2.1155823707580566, "train/z_loss": 0.0013887842884287237, "train/perplexity": 8.29441478446887, "train/grad_norm": 0.095703125, "optim/muon_lr": 0.012223604768514633, "optim/adamw_lr": 0.00036670814305543895, "perf/tokens_per_sec": 2022368.785948024, "perf/iters_per_sec": 0.9643405847301597, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0369780302047729, "data/tokens_consumed": 160664911872, "data/tokens_consumed_B": 160.664911872, "train/loss_slope": -2.2037609954728675e-05} {"step": 76620, "timestamp": 1778277362.901832, "train/loss": 2.0910195827484133, "train/z_loss": 0.0014039523899555205, "train/perplexity": 8.093162608090203, "train/grad_norm": 0.08544921875, "optim/muon_lr": 0.012185895442962646, "optim/adamw_lr": 0.00036557686328887935, "perf/tokens_per_sec": 2022158.6384625265, "perf/iters_per_sec": 0.9642403786003716, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0370857954025268, "data/tokens_consumed": 160685883392, "data/tokens_consumed_B": 160.685883392, "train/loss_slope": -2.0173804408753527e-05} {"step": 76630, "timestamp": 1778277373.2735622, "train/loss": 2.072077441215515, "train/z_loss": 0.0013918731012381614, "train/perplexity": 7.941303584693563, "train/grad_norm": 0.134765625, "optim/muon_lr": 0.012148153334856034, "optim/adamw_lr": 0.00036444460004568096, "perf/tokens_per_sec": 2022908.1163518631, "perf/iters_per_sec": 0.964597757507259, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0367015600204468, "data/tokens_consumed": 160706854912, "data/tokens_consumed_B": 160.706854912, "train/loss_slope": -2.314254058004676e-05} {"step": 76640, "timestamp": 1778277383.650751, "train/loss": 2.0335168480873107, "train/z_loss": 0.001404569111764431, "train/perplexity": 7.640911085423925, "train/grad_norm": 0.078125, "optim/muon_lr": 0.012110379189252854, "optim/adamw_lr": 0.00036331137567758556, "perf/tokens_per_sec": 2021788.2901145413, "perf/iters_per_sec": 0.9640637827465731, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037275767326355, "data/tokens_consumed": 160727826432, "data/tokens_consumed_B": 160.727826432, "train/loss_slope": -2.7649893840797794e-05} {"step": 76650, "timestamp": 1778277394.0128806, "grad/layer_0/attn": 0.002480197697877884, "grad/layer_0/mlp": 0.00281003606505692, "grad/layer_0/attn_mlp_ratio": 0.8826212732488605, "grad/layer_4/attn": 0.001953377388417721, "grad/layer_4/mlp": 0.002419352764263749, "grad/layer_4/attn_mlp_ratio": 0.8073966461325464, "grad/layer_8/attn": 0.0030254083685576916, "grad/layer_8/mlp": 0.0033888823818415403, "grad/layer_8/attn_mlp_ratio": 0.892745140844679, "grad/layer_12/attn": 0.004741846118122339, "grad/layer_12/mlp": 0.006922638043761253, "grad/layer_12/attn_mlp_ratio": 0.684976741474742, "grad/layer_16/attn": 0.004165072459727526, "grad/layer_16/mlp": 0.004450393375009298, "grad/layer_16/attn_mlp_ratio": 0.935888586731946, "grad/layer_20/attn": 0.002714514033868909, "grad/layer_20/mlp": 0.005203851964324713, "grad/layer_20/attn_mlp_ratio": 0.5216355116008016, "grad/layer_24/attn": 0.004445314407348633, "grad/layer_24/mlp": 0.006524945143610239, "grad/layer_24/attn_mlp_ratio": 0.6812799558282653, "grad/layer_27/attn": 0.005402924958616495, "grad/layer_27/mlp": 0.005815536715090275, "grad/layer_27/attn_mlp_ratio": 0.9290500826332789} {"step": 76650, "timestamp": 1778277394.622866, "eos/sharpness": 15.883660316467282, "eos/L0_probe": 1.9252119064331055, "eos/L_plus": 2.0051422119140625, "eos/L_minus": 2.0041182041168213, "eos/grad_norm": 0.08342767506837845, "eos/embed_grad_frac": 0.2762473523616791, "eos/time_s": 0.6072001457214355} {"step": 76650, "timestamp": 1778277394.6420226, "train/loss": 2.109265851974487, "train/z_loss": 0.001412500999867916, "train/perplexity": 8.242188076820755, "train/grad_norm": 0.08349609375, "optim/muon_lr": 0.012072573453187943, "optim/adamw_lr": 0.0003621772035956382, "perf/tokens_per_sec": 1908981.029692523, "perf/iters_per_sec": 0.910273089262258, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0985714197158813, "data/tokens_consumed": 160748797952, "data/tokens_consumed_B": 160.748797952, "train/loss_slope": -2.563916346420941e-05} {"step": 76650, "timestamp": 1778277396.0058692, "geo/rankme_last": 439.74456787109375, "geo/layer_0/stable_rank_q_proj": 18.85986328125, "geo/layer_0/stable_rank_k_proj": 15.787775993347168, "geo/layer_0/stable_rank_o_proj": 46.43912124633789, "geo/layer_0/stable_rank_gate_proj": 127.89636993408203, "geo/layer_0/stable_rank_down_proj": 56.7298698425293, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06619984656572342, "geo/layer_0/attn_entropy_mean": 6.1347832679748535, "geo/layer_0/attn_entropy_std": 0.43095847964286804, "geo/layer_7/stable_rank_q_proj": 42.691314697265625, "geo/layer_7/stable_rank_k_proj": 40.17506408691406, "geo/layer_7/stable_rank_o_proj": 87.9427490234375, "geo/layer_7/stable_rank_gate_proj": 76.9310073852539, "geo/layer_7/stable_rank_down_proj": 139.8826904296875, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.44065165519714355, "geo/layer_7/attn_entropy_mean": 4.6285247802734375, "geo/layer_7/attn_entropy_std": 0.7762938141822815, "geo/layer_14/stable_rank_q_proj": 49.405887603759766, "geo/layer_14/stable_rank_k_proj": 41.603878021240234, "geo/layer_14/stable_rank_o_proj": 43.148277282714844, "geo/layer_14/stable_rank_gate_proj": 70.16531372070312, "geo/layer_14/stable_rank_down_proj": 124.87924194335938, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.40675869584083557, "geo/layer_14/attn_entropy_mean": 5.527715682983398, "geo/layer_14/attn_entropy_std": 0.4219605326652527, "geo/layer_21/stable_rank_q_proj": 39.682437896728516, "geo/layer_21/stable_rank_k_proj": 30.168996810913086, "geo/layer_21/stable_rank_o_proj": 67.99871063232422, "geo/layer_21/stable_rank_gate_proj": 63.182586669921875, "geo/layer_21/stable_rank_down_proj": 49.269813537597656, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15214930474758148, "geo/layer_21/attn_entropy_mean": 5.683687210083008, "geo/layer_21/attn_entropy_std": 0.29601800441741943, "geo/layer_27/stable_rank_q_proj": 43.91992950439453, "geo/layer_27/stable_rank_k_proj": 32.30049514770508, "geo/layer_27/stable_rank_o_proj": 115.2282943725586, "geo/layer_27/stable_rank_gate_proj": 76.59317016601562, "geo/layer_27/stable_rank_down_proj": 127.33311462402344, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09209836274385452, "geo/layer_27/attn_entropy_mean": 4.153571605682373, "geo/layer_27/attn_entropy_std": 0.8002191781997681, "attnres/final_alpha/block_0": 0.23971831798553467, "attnres/block_norm/0": 1.7791202068328857, "attnres/final_alpha/block_1": 0.00393451564013958, "attnres/block_norm/1": 47934.09375, "attnres/final_alpha/block_2": 0.009491527453064919, "attnres/block_norm/2": 29074.087890625, "attnres/final_alpha/block_3": 0.011342727579176426, "attnres/block_norm/3": 62978.2734375, "attnres/final_alpha/block_4": 0.013089662417769432, "attnres/block_norm/4": 15769.767578125, "attnres/final_alpha/block_5": 0.6164951324462891, "attnres/block_norm/5": 6811.1640625, "attnres/final_alpha/block_6": 0.10592813044786453, "attnres/block_norm/6": 41223.8046875, "geo/tier1_time_s": 1.360074758529663, "geo/step": 76650.0, "geo/rankme_slope": 0.00040491806097438975} {"step": 76660, "timestamp": 1778277406.392924, "train/loss": 2.0681001782417296, "train/z_loss": 0.001401083101518452, "train/perplexity": 7.909781659029248, "train/grad_norm": 0.10546875, "optim/muon_lr": 0.012034735530614854, "optim/adamw_lr": 0.00036104206591844556, "perf/tokens_per_sec": 1785229.288054801, "perf/iters_per_sec": 0.8512636604570394, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1747241735458374, "data/tokens_consumed": 160769769472, "data/tokens_consumed_B": 160.769769472, "train/loss_slope": -2.5623918316437946e-05} {"step": 76670, "timestamp": 1778277416.7764268, "train/loss": 2.0979080200195312, "train/z_loss": 0.0014086339273490013, "train/perplexity": 8.14910430527166, "train/grad_norm": 0.087890625, "optim/muon_lr": 0.011996868401765823, "optim/adamw_lr": 0.0003599060520529747, "perf/tokens_per_sec": 2020722.0191789211, "perf/iters_per_sec": 0.9635553451437574, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0378231048583983, "data/tokens_consumed": 160790740992, "data/tokens_consumed_B": 160.790740992, "train/loss_slope": -2.413932784269636e-05} {"step": 76680, "timestamp": 1778277427.1590273, "train/loss": 2.108558452129364, "train/z_loss": 0.001398687402252108, "train/perplexity": 8.236359616020923, "train/grad_norm": 0.09521484375, "optim/muon_lr": 0.011958971470594406, "optim/adamw_lr": 0.00035876914411783214, "perf/tokens_per_sec": 2020900.3880354222, "perf/iters_per_sec": 0.9636403980424033, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0377315044403077, "data/tokens_consumed": 160811712512, "data/tokens_consumed_B": 160.811712512, "train/loss_slope": -2.5295158466919346e-05} {"step": 76690, "timestamp": 1778277437.5445445, "train/loss": 2.109674561023712, "train/z_loss": 0.0013999379705637693, "train/perplexity": 8.245557422167229, "train/grad_norm": 0.1103515625, "optim/muon_lr": 0.011921045184135438, "optim/adamw_lr": 0.00035763135552406306, "perf/tokens_per_sec": 2020294.5171031533, "perf/iters_per_sec": 0.9633514962688223, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.038042712211609, "data/tokens_consumed": 160832684032, "data/tokens_consumed_B": 160.832684032, "train/loss_slope": -2.8515830427685384e-05} {"step": 76700, "timestamp": 1778277447.9148252, "grad/layer_0/attn": 0.002729899249970913, "grad/layer_0/mlp": 0.0030906551983207464, "grad/layer_0/attn_mlp_ratio": 0.8832752236893439, "grad/layer_4/attn": 0.0018789534224197268, "grad/layer_4/mlp": 0.0025525216478854418, "grad/layer_4/attn_mlp_ratio": 0.736116518488545, "grad/layer_8/attn": 0.0036262013018131256, "grad/layer_8/mlp": 0.003361332230269909, "grad/layer_8/attn_mlp_ratio": 1.0787988052112503, "grad/layer_12/attn": 0.004666452296078205, "grad/layer_12/mlp": 0.007012681569904089, "grad/layer_12/attn_mlp_ratio": 0.6654305037265477, "grad/layer_16/attn": 0.004450877197086811, "grad/layer_16/mlp": 0.004563857335597277, "grad/layer_16/attn_mlp_ratio": 0.9752445732355179, "grad/layer_20/attn": 0.00598115473985672, "grad/layer_20/mlp": 0.005529617890715599, "grad/layer_20/attn_mlp_ratio": 1.0816578559132413, "grad/layer_24/attn": 0.009469090960919857, "grad/layer_24/mlp": 0.006931636016815901, "grad/layer_24/attn_mlp_ratio": 1.3660686743131514, "grad/layer_27/attn": 0.006069655530154705, "grad/layer_27/mlp": 0.006375615485012531, "grad/layer_27/attn_mlp_ratio": 0.9520108998451756} {"step": 76700, "timestamp": 1778277447.9300618, "train/loss": 2.0745400905609133, "train/z_loss": 0.0014041822054423393, "train/perplexity": 7.960884331128363, "train/grad_norm": 0.09423828125, "optim/muon_lr": 0.011883090287446977, "optim/adamw_lr": 0.0003564927086234092, "perf/tokens_per_sec": 2020542.9391862976, "perf/iters_per_sec": 0.9634699531489838, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037915086746216, "data/tokens_consumed": 160853655552, "data/tokens_consumed_B": 160.853655552, "train/loss_slope": -2.804894907997419e-05} {"step": 76710, "timestamp": 1778277458.3134165, "train/loss": 2.071823167800903, "train/z_loss": 0.0014162033796310425, "train/perplexity": 7.939284579015226, "train/grad_norm": 0.0888671875, "optim/muon_lr": 0.011845107227563859, "optim/adamw_lr": 0.0003553532168269157, "perf/tokens_per_sec": 2020991.3023409615, "perf/iters_per_sec": 0.9636837493614967, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0376848220825194, "data/tokens_consumed": 160874627072, "data/tokens_consumed_B": 160.874627072, "train/loss_slope": -3.0146916406919298e-05} {"step": 76720, "timestamp": 1778277468.690899, "train/loss": 2.098239612579346, "train/z_loss": 0.001394704170525074, "train/perplexity": 8.151806935689747, "train/grad_norm": 0.08154296875, "optim/muon_lr": 0.011807095408439636, "optim/adamw_lr": 0.00035421286225318906, "perf/tokens_per_sec": 2021856.5117694088, "perf/iters_per_sec": 0.9640963133666081, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0372407674789428, "data/tokens_consumed": 160895598592, "data/tokens_consumed_B": 160.895598592, "train/loss_slope": -2.889498551257405e-05} {"step": 76725, "timestamp": 1778277474.485544, "eos/sharpness": 15.113973617553707, "eos/L0_probe": 1.9285287857055664, "eos/L_plus": 2.0017244815826416, "eos/L_minus": 2.0064728260040283, "eos/grad_norm": 0.09466369450092316, "eos/embed_grad_frac": 0.23765312135219574, "eos/time_s": 0.617300271987915} {"step": 76725, "timestamp": 1778277475.867548, "geo/rankme_last": 439.8404235839844, "geo/layer_0/stable_rank_q_proj": 18.845279693603516, "geo/layer_0/stable_rank_k_proj": 15.768166542053223, "geo/layer_0/stable_rank_o_proj": 46.45028305053711, "geo/layer_0/stable_rank_gate_proj": 127.7907943725586, "geo/layer_0/stable_rank_down_proj": 56.798011779785156, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06596947461366653, "geo/layer_0/attn_entropy_mean": 6.136113166809082, "geo/layer_0/attn_entropy_std": 0.42628151178359985, "geo/layer_7/stable_rank_q_proj": 42.66348648071289, "geo/layer_7/stable_rank_k_proj": 40.17233657836914, "geo/layer_7/stable_rank_o_proj": 88.06233215332031, "geo/layer_7/stable_rank_gate_proj": 76.94158935546875, "geo/layer_7/stable_rank_down_proj": 139.8322296142578, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.44192224740982056, "geo/layer_7/attn_entropy_mean": 4.635011196136475, "geo/layer_7/attn_entropy_std": 0.7804633378982544, "geo/layer_14/stable_rank_q_proj": 49.43465042114258, "geo/layer_14/stable_rank_k_proj": 41.65626907348633, "geo/layer_14/stable_rank_o_proj": 43.144718170166016, "geo/layer_14/stable_rank_gate_proj": 70.19506072998047, "geo/layer_14/stable_rank_down_proj": 124.77882385253906, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4160422384738922, "geo/layer_14/attn_entropy_mean": 5.551784038543701, "geo/layer_14/attn_entropy_std": 0.4160151481628418, "geo/layer_21/stable_rank_q_proj": 39.70368957519531, "geo/layer_21/stable_rank_k_proj": 30.154239654541016, "geo/layer_21/stable_rank_o_proj": 67.94422912597656, "geo/layer_21/stable_rank_gate_proj": 63.20051574707031, "geo/layer_21/stable_rank_down_proj": 49.27451705932617, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1483529657125473, "geo/layer_21/attn_entropy_mean": 5.685850143432617, "geo/layer_21/attn_entropy_std": 0.2931573688983917, "geo/layer_27/stable_rank_q_proj": 43.914737701416016, "geo/layer_27/stable_rank_k_proj": 32.289833068847656, "geo/layer_27/stable_rank_o_proj": 115.2395248413086, "geo/layer_27/stable_rank_gate_proj": 76.57229614257812, "geo/layer_27/stable_rank_down_proj": 127.28935241699219, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08386051654815674, "geo/layer_27/attn_entropy_mean": 4.1598663330078125, "geo/layer_27/attn_entropy_std": 0.7867057919502258, "attnres/final_alpha/block_0": 0.2417922019958496, "attnres/block_norm/0": 1.779152750968933, "attnres/final_alpha/block_1": 0.003932765685021877, "attnres/block_norm/1": 47824.8671875, "attnres/final_alpha/block_2": 0.009588100016117096, "attnres/block_norm/2": 29059.810546875, "attnres/final_alpha/block_3": 0.011240091174840927, "attnres/block_norm/3": 63363.6484375, "attnres/final_alpha/block_4": 0.013252416625618935, "attnres/block_norm/4": 15752.8544921875, "attnres/final_alpha/block_5": 0.6136667728424072, "attnres/block_norm/5": 6870.85302734375, "attnres/final_alpha/block_6": 0.10652764141559601, "attnres/block_norm/6": 41063.359375, "geo/tier1_time_s": 1.3595359325408936, "geo/step": 76725.0, "geo/rankme_slope": 0.0004113067101840736} {"step": 76730, "timestamp": 1778277481.061664, "train/loss": 2.080567145347595, "train/z_loss": 0.0014055861509405076, "train/perplexity": 8.009009899178572, "train/grad_norm": 0.142578125, "optim/muon_lr": 0.011769057661294937, "optim/adamw_lr": 0.0003530717298388481, "perf/tokens_per_sec": 1695926.949858279, "perf/iters_per_sec": 0.8086809872905154, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.236581563949585, "data/tokens_consumed": 160916570112, "data/tokens_consumed_B": 160.916570112, "train/loss_slope": -3.057160556334257e-05} {"step": 76740, "timestamp": 1778277491.4468508, "train/loss": 2.124205935001373, "train/z_loss": 0.001391849631909281, "train/perplexity": 8.366251502316421, "train/grad_norm": 0.08740234375, "optim/muon_lr": 0.011730993539094925, "optim/adamw_lr": 0.00035192980617284773, "perf/tokens_per_sec": 2020303.4263850537, "perf/iters_per_sec": 0.9633557445454853, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0380381345748901, "data/tokens_consumed": 160937541632, "data/tokens_consumed_B": 160.937541632, "train/loss_slope": -3.0009980484990792e-05} {"step": 76750, "timestamp": 1778277501.8136375, "grad/layer_0/attn": 0.0029892597813159227, "grad/layer_0/mlp": 0.0032692665699869394, "grad/layer_0/attn_mlp_ratio": 0.9143517745916593, "grad/layer_4/attn": 0.0037863359320908785, "grad/layer_4/mlp": 0.0026720231398940086, "grad/layer_4/attn_mlp_ratio": 1.4170295660456425, "grad/layer_8/attn": 0.006191118620336056, "grad/layer_8/mlp": 0.003639876376837492, "grad/layer_8/attn_mlp_ratio": 1.7009144842506336, "grad/layer_12/attn": 0.0035956809297204018, "grad/layer_12/mlp": 0.006744722370058298, "grad/layer_12/attn_mlp_ratio": 0.533110286699361, "grad/layer_16/attn": 0.0037274016067385674, "grad/layer_16/mlp": 0.0045557827688753605, "grad/layer_16/attn_mlp_ratio": 0.8181692837478695, "grad/layer_20/attn": 0.0037940393667668104, "grad/layer_20/mlp": 0.005667848978191614, "grad/layer_20/attn_mlp_ratio": 0.6693966819556421, "grad/layer_24/attn": 0.006675406359136105, "grad/layer_24/mlp": 0.006932481657713652, "grad/layer_24/attn_mlp_ratio": 0.9629172628847521, "grad/layer_27/attn": 0.003820622805505991, "grad/layer_27/mlp": 0.005840493366122246, "grad/layer_27/attn_mlp_ratio": 0.6541609587730036} {"step": 76750, "timestamp": 1778277501.8306506, "train/loss": 2.092635917663574, "train/z_loss": 0.0014016821165569126, "train/perplexity": 8.106254446935603, "train/grad_norm": 0.08837890625, "optim/muon_lr": 0.011692903488874435, "optim/adamw_lr": 0.000350787104666233, "perf/tokens_per_sec": 2020681.0294328285, "perf/iters_per_sec": 0.9635357997097151, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0378441572189332, "data/tokens_consumed": 160958513152, "data/tokens_consumed_B": 160.958513152, "train/loss_slope": -3.0888232804260366e-05} {"step": 76760, "timestamp": 1778277512.2100632, "train/loss": 2.0636232018470766, "train/z_loss": 0.0014143059263005853, "train/perplexity": 7.8744489042249715, "train/grad_norm": 0.12890625, "optim/muon_lr": 0.011654788255691528, "optim/adamw_lr": 0.0003496436476707458, "perf/tokens_per_sec": 2021828.3953389395, "perf/iters_per_sec": 0.9640829064078043, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0372551918029784, "data/tokens_consumed": 160979484672, "data/tokens_consumed_B": 160.979484672, "train/loss_slope": -3.0501218082452018e-05} {"step": 76770, "timestamp": 1778277523.0689933, "train/loss": 2.03796991109848, "train/z_loss": 0.0014128933078609407, "train/perplexity": 7.675012415283599, "train/grad_norm": 0.0859375, "optim/muon_lr": 0.011616648137569428, "optim/adamw_lr": 0.0003484994441270828, "perf/tokens_per_sec": 1932026.1739352087, "perf/iters_per_sec": 0.9212618703533214, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0854676961898804, "data/tokens_consumed": 161000456192, "data/tokens_consumed_B": 161.000456192, "train/loss_slope": -3.543422645611197e-05} {"step": 76780, "timestamp": 1778277533.4476624, "train/loss": 2.0694520831108094, "train/z_loss": 0.0014035385334864258, "train/perplexity": 7.9204821627692725, "train/grad_norm": 0.08447265625, "optim/muon_lr": 0.011578482836484908, "optim/adamw_lr": 0.00034735448509454724, "perf/tokens_per_sec": 2021772.1183536926, "perf/iters_per_sec": 0.9640560714500869, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0372840642929078, "data/tokens_consumed": 161021427712, "data/tokens_consumed_B": 161.021427712, "train/loss_slope": -3.702779853447686e-05} {"step": 76790, "timestamp": 1778277543.839553, "train/loss": 2.111441898345947, "train/z_loss": 0.0013876846642233431, "train/perplexity": 8.260142988553499, "train/grad_norm": 0.0791015625, "optim/muon_lr": 0.011540295034646987, "optim/adamw_lr": 0.0003462088510394096, "perf/tokens_per_sec": 2019018.1005969031, "perf/iters_per_sec": 0.9627428534492984, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.038698959350586, "data/tokens_consumed": 161042399232, "data/tokens_consumed_B": 161.042399232, "train/loss_slope": -3.784187355808335e-05} {"step": 76800, "timestamp": 1778277554.203029, "grad/layer_0/attn": 0.0023047050926834345, "grad/layer_0/mlp": 0.0027701787184923887, "grad/layer_0/attn_mlp_ratio": 0.8319697910107194, "grad/layer_4/attn": 0.0018923445604741573, "grad/layer_4/mlp": 0.0024143464397639036, "grad/layer_4/attn_mlp_ratio": 0.7837916095753212, "grad/layer_8/attn": 0.005190432071685791, "grad/layer_8/mlp": 0.0035660674329847097, "grad/layer_8/attn_mlp_ratio": 1.4555058264254264, "grad/layer_12/attn": 0.0058215828612446785, "grad/layer_12/mlp": 0.006707286927849054, "grad/layer_12/attn_mlp_ratio": 0.8679489690948249, "grad/layer_16/attn": 0.003425388131290674, "grad/layer_16/mlp": 0.004147874191403389, "grad/layer_16/attn_mlp_ratio": 0.8258177299128636, "grad/layer_20/attn": 0.0036521756555885077, "grad/layer_20/mlp": 0.004906690213829279, "grad/layer_20/attn_mlp_ratio": 0.7443256904343485, "grad/layer_24/attn": 0.0063507710583508015, "grad/layer_24/mlp": 0.007021062076091766, "grad/layer_24/attn_mlp_ratio": 0.9045313798781828, "grad/layer_27/attn": 0.006406689994037151, "grad/layer_27/mlp": 0.006017773412168026, "grad/layer_27/attn_mlp_ratio": 1.064627969311695} {"step": 76800, "timestamp": 1778277554.810944, "eos/sharpness": 21.299886703491207, "eos/L0_probe": 1.9251586198806763, "eos/L_plus": 2.045299768447876, "eos/L_minus": 2.0180163383483887, "eos/grad_norm": 0.09189809858798981, "eos/embed_grad_frac": 0.21671997010707855, "eos/time_s": 0.6052098274230957} {"step": 76800, "timestamp": 1778277554.83055, "train/loss": 2.1182428002357483, "train/z_loss": 0.0013944048201665281, "train/perplexity": 8.316510869566947, "train/grad_norm": 0.091796875, "optim/muon_lr": 0.011502084285020829, "optim/adamw_lr": 0.0003450625285506248, "perf/tokens_per_sec": 1909054.611950881, "perf/iters_per_sec": 0.9103081760172277, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.098529076576233, "data/tokens_consumed": 161063370752, "data/tokens_consumed_B": 161.063370752, "train/loss_slope": -3.46024129328483e-05} {"step": 76800, "timestamp": 1778277556.1943834, "geo/rankme_last": 439.2169189453125, "geo/layer_0/stable_rank_q_proj": 18.83503532409668, "geo/layer_0/stable_rank_k_proj": 15.754921913146973, "geo/layer_0/stable_rank_o_proj": 46.44799041748047, "geo/layer_0/stable_rank_gate_proj": 127.6944580078125, "geo/layer_0/stable_rank_down_proj": 56.83660888671875, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06236737594008446, "geo/layer_0/attn_entropy_mean": 6.13559627532959, "geo/layer_0/attn_entropy_std": 0.42944037914276123, "geo/layer_7/stable_rank_q_proj": 42.674102783203125, "geo/layer_7/stable_rank_k_proj": 40.1422233581543, "geo/layer_7/stable_rank_o_proj": 88.06310272216797, "geo/layer_7/stable_rank_gate_proj": 76.88786315917969, "geo/layer_7/stable_rank_down_proj": 139.85064697265625, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4360552132129669, "geo/layer_7/attn_entropy_mean": 4.620599269866943, "geo/layer_7/attn_entropy_std": 0.7751240730285645, "geo/layer_14/stable_rank_q_proj": 49.41493225097656, "geo/layer_14/stable_rank_k_proj": 41.639076232910156, "geo/layer_14/stable_rank_o_proj": 43.15481185913086, "geo/layer_14/stable_rank_gate_proj": 70.22045135498047, "geo/layer_14/stable_rank_down_proj": 124.74418640136719, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.399895042181015, "geo/layer_14/attn_entropy_mean": 5.517184734344482, "geo/layer_14/attn_entropy_std": 0.41877615451812744, "geo/layer_21/stable_rank_q_proj": 39.71086120605469, "geo/layer_21/stable_rank_k_proj": 30.14358139038086, "geo/layer_21/stable_rank_o_proj": 67.93242645263672, "geo/layer_21/stable_rank_gate_proj": 63.210025787353516, "geo/layer_21/stable_rank_down_proj": 49.24685287475586, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14407119154930115, "geo/layer_21/attn_entropy_mean": 5.68126106262207, "geo/layer_21/attn_entropy_std": 0.29662302136421204, "geo/layer_27/stable_rank_q_proj": 43.891910552978516, "geo/layer_27/stable_rank_k_proj": 32.314273834228516, "geo/layer_27/stable_rank_o_proj": 115.35116577148438, "geo/layer_27/stable_rank_gate_proj": 76.51361083984375, "geo/layer_27/stable_rank_down_proj": 127.28181457519531, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08834803849458694, "geo/layer_27/attn_entropy_mean": 4.145162582397461, "geo/layer_27/attn_entropy_std": 0.790101170539856, "attnres/final_alpha/block_0": 0.2409551739692688, "attnres/block_norm/0": 1.7791435718536377, "attnres/final_alpha/block_1": 0.003933853469789028, "attnres/block_norm/1": 47696.87109375, "attnres/final_alpha/block_2": 0.009489689022302628, "attnres/block_norm/2": 29077.880859375, "attnres/final_alpha/block_3": 0.011144677177071571, "attnres/block_norm/3": 63188.01171875, "attnres/final_alpha/block_4": 0.013199614360928535, "attnres/block_norm/4": 15791.42578125, "attnres/final_alpha/block_5": 0.614773154258728, "attnres/block_norm/5": 6856.01904296875, "attnres/final_alpha/block_6": 0.10650382936000824, "attnres/block_norm/6": 41065.2578125, "geo/tier1_time_s": 1.359776496887207, "geo/step": 76800.0, "geo/rankme_slope": 0.0003981577787364946} {"step": 76810, "timestamp": 1778277566.880056, "train/loss": 2.0834392786026, "train/z_loss": 0.0014081185800023377, "train/perplexity": 8.032045908257913, "train/grad_norm": 0.0986328125, "optim/muon_lr": 0.011463851034641266, "optim/adamw_lr": 0.00034391553103923795, "perf/tokens_per_sec": 1741044.950472387, "perf/iters_per_sec": 0.8301949264871535, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2045363903045654, "data/tokens_consumed": 161084342272, "data/tokens_consumed_B": 161.084342272, "train/loss_slope": -3.641094226505246e-05} {"step": 76820, "timestamp": 1778277577.2547634, "train/loss": 2.1109837174415587, "train/z_loss": 0.0013883528299629688, "train/perplexity": 8.256359215661062, "train/grad_norm": 0.08203125, "optim/muon_lr": 0.011425595879554749, "optim/adamw_lr": 0.0003427678763866424, "perf/tokens_per_sec": 2023135.3109481928, "perf/iters_per_sec": 0.9647060923329319, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0365851402282715, "data/tokens_consumed": 161105313792, "data/tokens_consumed_B": 161.105313792, "train/loss_slope": -3.2983813427462444e-05} {"step": 76830, "timestamp": 1778277587.6218467, "train/loss": 2.1378636717796327, "train/z_loss": 0.0013942117220722139, "train/perplexity": 8.481299421902415, "train/grad_norm": 0.08642578125, "optim/muon_lr": 0.011387319415807725, "optim/adamw_lr": 0.0003416195824742317, "perf/tokens_per_sec": 2023991.9718153584, "perf/iters_per_sec": 0.965114580066375, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0361464023590088, "data/tokens_consumed": 161126285312, "data/tokens_consumed_B": 161.126285312, "train/loss_slope": -2.811413309862404e-05} {"step": 76840, "timestamp": 1778277597.9908738, "train/loss": 2.0719269514083862, "train/z_loss": 0.0014016361790709197, "train/perplexity": 7.940108589368315, "train/grad_norm": 0.11083984375, "optim/muon_lr": 0.011349021047353745, "optim/adamw_lr": 0.0003404706314206123, "perf/tokens_per_sec": 2023485.671224691, "perf/iters_per_sec": 0.9648731571315246, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036405658721924, "data/tokens_consumed": 161147256832, "data/tokens_consumed_B": 161.147256832, "train/loss_slope": -2.944102735087352e-05} {"step": 76850, "timestamp": 1778277608.3486235, "grad/layer_0/attn": 0.002696831477805972, "grad/layer_0/mlp": 0.002939731115475297, "grad/layer_0/attn_mlp_ratio": 0.9173734876200058, "grad/layer_4/attn": 0.002152579603716731, "grad/layer_4/mlp": 0.002551676705479622, "grad/layer_4/attn_mlp_ratio": 0.8435941413482123, "grad/layer_8/attn": 0.003224962158128619, "grad/layer_8/mlp": 0.0036584932822734118, "grad/layer_8/attn_mlp_ratio": 0.8815000660530392, "grad/layer_12/attn": 0.005469615571200848, "grad/layer_12/mlp": 0.007144786883145571, "grad/layer_12/attn_mlp_ratio": 0.7655393483533627, "grad/layer_16/attn": 0.004049177747219801, "grad/layer_16/mlp": 0.004555858671665192, "grad/layer_16/attn_mlp_ratio": 0.8887847385444313, "grad/layer_20/attn": 0.0030840791296213865, "grad/layer_20/mlp": 0.005384536460042, "grad/layer_20/attn_mlp_ratio": 0.5727659372782364, "grad/layer_24/attn": 0.003791245399042964, "grad/layer_24/mlp": 0.006724124774336815, "grad/layer_24/attn_mlp_ratio": 0.563827333652379, "grad/layer_27/attn": 0.0053558857180178165, "grad/layer_27/mlp": 0.005916678812354803, "grad/layer_27/attn_mlp_ratio": 0.9052182478305564} {"step": 76850, "timestamp": 1778277608.3643448, "train/loss": 2.0950521349906923, "train/z_loss": 0.0014022906892932952, "train/perplexity": 8.12586460104488, "train/grad_norm": 0.07763671875, "optim/muon_lr": 0.011310703605413436, "optim/adamw_lr": 0.0003393211081624031, "perf/tokens_per_sec": 2022973.5755454428, "perf/iters_per_sec": 0.9646289708831037, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0366680145263671, "data/tokens_consumed": 161168228352, "data/tokens_consumed_B": 161.168228352, "train/loss_slope": -2.770472007079629e-05} {"step": 76860, "timestamp": 1778277618.7341404, "train/loss": 2.0913888454437255, "train/z_loss": 0.0013970484491437674, "train/perplexity": 8.096151662967733, "train/grad_norm": 0.08154296875, "optim/muon_lr": 0.011272366642951965, "optim/adamw_lr": 0.0003381709992885589, "perf/tokens_per_sec": 2023500.2877195582, "perf/iters_per_sec": 0.9648801268193999, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03639817237854, "data/tokens_consumed": 161189199872, "data/tokens_consumed_B": 161.189199872, "train/loss_slope": -2.9461325556173926e-05} {"step": 76870, "timestamp": 1778277629.1075008, "train/loss": 2.066523861885071, "train/z_loss": 0.0013974113971926272, "train/perplexity": 7.897323162668029, "train/grad_norm": 0.08740234375, "optim/muon_lr": 0.011234010756015779, "optim/adamw_lr": 0.0003370203226804733, "perf/tokens_per_sec": 2022541.5394293936, "perf/iters_per_sec": 0.9644229600092857, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0368894577026366, "data/tokens_consumed": 161210171392, "data/tokens_consumed_B": 161.210171392, "train/loss_slope": -2.739331034114685e-05} {"step": 76875, "timestamp": 1778277634.887142, "eos/sharpness": 42.44182109832763, "eos/L0_probe": 1.9249423742294312, "eos/L_plus": 2.121108293533325, "eos/L_minus": 2.1531946659088135, "eos/grad_norm": 0.09172321110963821, "eos/embed_grad_frac": 0.2211945354938507, "eos/time_s": 0.6057593822479248} {"step": 76875, "timestamp": 1778277636.264023, "geo/rankme_last": 439.65777587890625, "geo/layer_0/stable_rank_q_proj": 18.827659606933594, "geo/layer_0/stable_rank_k_proj": 15.751899719238281, "geo/layer_0/stable_rank_o_proj": 46.415348052978516, "geo/layer_0/stable_rank_gate_proj": 127.64460754394531, "geo/layer_0/stable_rank_down_proj": 56.88786697387695, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06422214955091476, "geo/layer_0/attn_entropy_mean": 6.134725570678711, "geo/layer_0/attn_entropy_std": 0.42913052439689636, "geo/layer_7/stable_rank_q_proj": 42.671775817871094, "geo/layer_7/stable_rank_k_proj": 40.16286849975586, "geo/layer_7/stable_rank_o_proj": 88.05911254882812, "geo/layer_7/stable_rank_gate_proj": 76.88359832763672, "geo/layer_7/stable_rank_down_proj": 140.21185302734375, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.43528085947036743, "geo/layer_7/attn_entropy_mean": 4.609305381774902, "geo/layer_7/attn_entropy_std": 0.7835559844970703, "geo/layer_14/stable_rank_q_proj": 49.37861633300781, "geo/layer_14/stable_rank_k_proj": 41.64205551147461, "geo/layer_14/stable_rank_o_proj": 43.12800598144531, "geo/layer_14/stable_rank_gate_proj": 70.19696044921875, "geo/layer_14/stable_rank_down_proj": 124.76051330566406, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4267139434814453, "geo/layer_14/attn_entropy_mean": 5.527914047241211, "geo/layer_14/attn_entropy_std": 0.4128657877445221, "geo/layer_21/stable_rank_q_proj": 39.6884765625, "geo/layer_21/stable_rank_k_proj": 30.09819793701172, "geo/layer_21/stable_rank_o_proj": 67.94978332519531, "geo/layer_21/stable_rank_gate_proj": 63.128684997558594, "geo/layer_21/stable_rank_down_proj": 49.23402404785156, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1498938351869583, "geo/layer_21/attn_entropy_mean": 5.682165145874023, "geo/layer_21/attn_entropy_std": 0.29464977979660034, "geo/layer_27/stable_rank_q_proj": 43.884620666503906, "geo/layer_27/stable_rank_k_proj": 32.31585693359375, "geo/layer_27/stable_rank_o_proj": 115.3218994140625, "geo/layer_27/stable_rank_gate_proj": 76.50628662109375, "geo/layer_27/stable_rank_down_proj": 127.21604919433594, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09003675729036331, "geo/layer_27/attn_entropy_mean": 4.157316207885742, "geo/layer_27/attn_entropy_std": 0.7863914370536804, "attnres/final_alpha/block_0": 0.24075034260749817, "attnres/block_norm/0": 1.7792603969573975, "attnres/final_alpha/block_1": 0.003915926441550255, "attnres/block_norm/1": 47773.11328125, "attnres/final_alpha/block_2": 0.009453728795051575, "attnres/block_norm/2": 29109.19140625, "attnres/final_alpha/block_3": 0.010984150692820549, "attnres/block_norm/3": 63456.94140625, "attnres/final_alpha/block_4": 0.013254260644316673, "attnres/block_norm/4": 15727.771484375, "attnres/final_alpha/block_5": 0.6149666905403137, "attnres/block_norm/5": 6832.9755859375, "attnres/final_alpha/block_6": 0.106674924492836, "attnres/block_norm/6": 41300.29296875, "geo/tier1_time_s": 1.356372356414795, "geo/step": 76875.0, "geo/rankme_slope": 0.0004042779611844738} {"step": 76880, "timestamp": 1778277641.4501169, "train/loss": 2.090066337585449, "train/z_loss": 0.0013883924228139221, "train/perplexity": 8.085451515845557, "train/grad_norm": 0.0888671875, "optim/muon_lr": 0.01119563639163971, "optim/adamw_lr": 0.00033586909174919123, "perf/tokens_per_sec": 1699691.8272968277, "perf/iters_per_sec": 0.810476220749296, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2338424921035767, "data/tokens_consumed": 161231142912, "data/tokens_consumed_B": 161.231142912, "train/loss_slope": -2.6263703972306856e-05} {"step": 76890, "timestamp": 1778277651.8154109, "train/loss": 2.123822808265686, "train/z_loss": 0.0013921547681093216, "train/perplexity": 8.36304678163469, "train/grad_norm": 0.12255859375, "optim/muon_lr": 0.011157244071364402, "optim/adamw_lr": 0.00033471732214093204, "perf/tokens_per_sec": 2024106.5461776063, "perf/iters_per_sec": 0.9651692133796722, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0360877513885498, "data/tokens_consumed": 161252114432, "data/tokens_consumed_B": 161.252114432, "train/loss_slope": -2.4378917956187696e-05} {"step": 76900, "timestamp": 1778277662.170808, "grad/layer_0/attn": 0.0027928133495151997, "grad/layer_0/mlp": 0.0028442321345210075, "grad/layer_0/attn_mlp_ratio": 0.9819216995076117, "grad/layer_4/attn": 0.0020752879790961742, "grad/layer_4/mlp": 0.0024871386121958494, "grad/layer_4/attn_mlp_ratio": 0.834407815261715, "grad/layer_8/attn": 0.003070669248700142, "grad/layer_8/mlp": 0.00336472992785275, "grad/layer_8/attn_mlp_ratio": 0.9126049410447747, "grad/layer_12/attn": 0.003763853572309017, "grad/layer_12/mlp": 0.006252227816730738, "grad/layer_12/attn_mlp_ratio": 0.6020019779248739, "grad/layer_16/attn": 0.004166734870523214, "grad/layer_16/mlp": 0.004478088580071926, "grad/layer_16/attn_mlp_ratio": 0.9304717186744702, "grad/layer_20/attn": 0.004518883768469095, "grad/layer_20/mlp": 0.005498084239661694, "grad/layer_20/attn_mlp_ratio": 0.8219015004682795, "grad/layer_24/attn": 0.00411373982205987, "grad/layer_24/mlp": 0.007075858768075705, "grad/layer_24/attn_mlp_ratio": 0.5813767485696913, "grad/layer_27/attn": 0.004247535951435566, "grad/layer_27/mlp": 0.006109130103141069, "grad/layer_27/attn_mlp_ratio": 0.6952767104638977} {"step": 76900, "timestamp": 1778277662.186738, "train/loss": 2.091028833389282, "train/z_loss": 0.0014005067176185548, "train/perplexity": 8.093237475377268, "train/grad_norm": 0.0869140625, "optim/muon_lr": 0.011118833348155022, "optim/adamw_lr": 0.00033356500044465063, "perf/tokens_per_sec": 2023384.3856480168, "perf/iters_per_sec": 0.9648248604049763, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0364575386047363, "data/tokens_consumed": 161273085952, "data/tokens_consumed_B": 161.273085952, "train/loss_slope": -2.409372555755331e-05} {"step": 76910, "timestamp": 1778277672.5536337, "train/loss": 2.110542869567871, "train/z_loss": 0.0013964628684334456, "train/perplexity": 8.252720219437249, "train/grad_norm": 0.08740234375, "optim/muon_lr": 0.011080407053232193, "optim/adamw_lr": 0.0003324122115969658, "perf/tokens_per_sec": 2024153.6837868155, "perf/iters_per_sec": 0.965191690343292, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0360636234283447, "data/tokens_consumed": 161294057472, "data/tokens_consumed_B": 161.294057472, "train/loss_slope": -1.9668650004801505e-05} {"step": 76920, "timestamp": 1778277682.9095407, "train/loss": 2.1102569103240967, "train/z_loss": 0.0013932042755186557, "train/perplexity": 8.250360615195618, "train/grad_norm": 0.07470703125, "optim/muon_lr": 0.01104196459054947, "optim/adamw_lr": 0.000331258937716484, "perf/tokens_per_sec": 2026205.0366990087, "perf/iters_per_sec": 0.9661698516364139, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350147008895874, "data/tokens_consumed": 161315028992, "data/tokens_consumed_B": 161.315028992, "train/loss_slope": -1.57247422015456e-05} {"step": 76930, "timestamp": 1778277693.2596712, "train/loss": 2.0923983097076415, "train/z_loss": 0.0013935476541519164, "train/perplexity": 8.10432856519767, "train/grad_norm": 0.11572265625, "optim/muon_lr": 0.011003506556153298, "optim/adamw_lr": 0.0003301051966845989, "perf/tokens_per_sec": 2027632.218377461, "perf/iters_per_sec": 0.9668503848922066, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342861890792847, "data/tokens_consumed": 161336000512, "data/tokens_consumed_B": 161.336000512, "train/loss_slope": -1.70038066752995e-05} {"step": 76940, "timestamp": 1778277703.605134, "train/loss": 2.0930046916007994, "train/z_loss": 0.0014059091452509164, "train/perplexity": 8.109244373573775, "train/grad_norm": 0.11865234375, "optim/muon_lr": 0.010965033546090127, "optim/adamw_lr": 0.00032895100638270373, "perf/tokens_per_sec": 2028427.2499226548, "perf/iters_per_sec": 0.9672294854748987, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0338808059692384, "data/tokens_consumed": 161356972032, "data/tokens_consumed_B": 161.356972032, "train/loss_slope": -1.3953112397078044e-05} {"step": 76950, "timestamp": 1778277713.9388378, "grad/layer_0/attn": 0.0023814374580979347, "grad/layer_0/mlp": 0.0029449418652802706, "grad/layer_0/attn_mlp_ratio": 0.8086534424698902, "grad/layer_4/attn": 0.0021174210123717785, "grad/layer_4/mlp": 0.002663121558725834, "grad/layer_4/attn_mlp_ratio": 0.7950898545824808, "grad/layer_8/attn": 0.008412130177021027, "grad/layer_8/mlp": 0.0033858208917081356, "grad/layer_8/attn_mlp_ratio": 2.4845171075559827, "grad/layer_12/attn": 0.005818597972393036, "grad/layer_12/mlp": 0.006616433151066303, "grad/layer_12/attn_mlp_ratio": 0.8794161070778296, "grad/layer_16/attn": 0.003384658833965659, "grad/layer_16/mlp": 0.004560413304716349, "grad/layer_16/attn_mlp_ratio": 0.7421824588238576, "grad/layer_20/attn": 0.004743179306387901, "grad/layer_20/mlp": 0.005194256082177162, "grad/layer_20/attn_mlp_ratio": 0.9131585235751325, "grad/layer_24/attn": 0.004522514063864946, "grad/layer_24/mlp": 0.006800062954425812, "grad/layer_24/attn_mlp_ratio": 0.665069430631452, "grad/layer_27/attn": 0.0067799086682498455, "grad/layer_27/mlp": 0.005872982554137707, "grad/layer_27/attn_mlp_ratio": 1.1544234109857587} {"step": 76950, "timestamp": 1778277714.5418289, "eos/sharpness": 8.232390880584715, "eos/L0_probe": 1.9215365648269653, "eos/L_plus": 1.9650607109069824, "eos/L_minus": 1.9603363275527954, "eos/grad_norm": 0.084335096180439, "eos/embed_grad_frac": 0.2736888527870178, "eos/time_s": 0.6001834869384766} {"step": 76950, "timestamp": 1778277714.5608778, "train/loss": 2.1387538433074953, "train/z_loss": 0.0014100986998528242, "train/perplexity": 8.488852594477892, "train/grad_norm": 0.08447265625, "optim/muon_lr": 0.010926546156406402, "optim/adamw_lr": 0.00032779638469219205, "perf/tokens_per_sec": 1915052.2258149271, "perf/iters_per_sec": 0.9131680611681591, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0950886726379394, "data/tokens_consumed": 161377943552, "data/tokens_consumed_B": 161.377943552, "train/loss_slope": -1.0903689484320136e-05} {"step": 76950, "timestamp": 1778277715.9251966, "geo/rankme_last": 440.21051025390625, "geo/layer_0/stable_rank_q_proj": 18.819026947021484, "geo/layer_0/stable_rank_k_proj": 15.752126693725586, "geo/layer_0/stable_rank_o_proj": 46.417354583740234, "geo/layer_0/stable_rank_gate_proj": 127.60086059570312, "geo/layer_0/stable_rank_down_proj": 56.8670654296875, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.060890644788742065, "geo/layer_0/attn_entropy_mean": 6.133414268493652, "geo/layer_0/attn_entropy_std": 0.42795246839523315, "geo/layer_7/stable_rank_q_proj": 42.67084503173828, "geo/layer_7/stable_rank_k_proj": 40.12820053100586, "geo/layer_7/stable_rank_o_proj": 88.0863265991211, "geo/layer_7/stable_rank_gate_proj": 76.86402893066406, "geo/layer_7/stable_rank_down_proj": 140.2696533203125, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.44169190526008606, "geo/layer_7/attn_entropy_mean": 4.623621463775635, "geo/layer_7/attn_entropy_std": 0.777347981929779, "geo/layer_14/stable_rank_q_proj": 49.42095184326172, "geo/layer_14/stable_rank_k_proj": 41.605201721191406, "geo/layer_14/stable_rank_o_proj": 43.14042282104492, "geo/layer_14/stable_rank_gate_proj": 70.24539184570312, "geo/layer_14/stable_rank_down_proj": 124.79598236083984, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.42398902773857117, "geo/layer_14/attn_entropy_mean": 5.516383171081543, "geo/layer_14/attn_entropy_std": 0.41191622614860535, "geo/layer_21/stable_rank_q_proj": 39.68070602416992, "geo/layer_21/stable_rank_k_proj": 30.096200942993164, "geo/layer_21/stable_rank_o_proj": 67.95951080322266, "geo/layer_21/stable_rank_gate_proj": 63.10033416748047, "geo/layer_21/stable_rank_down_proj": 49.248512268066406, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14499357342720032, "geo/layer_21/attn_entropy_mean": 5.683161735534668, "geo/layer_21/attn_entropy_std": 0.2955368757247925, "geo/layer_27/stable_rank_q_proj": 43.884273529052734, "geo/layer_27/stable_rank_k_proj": 32.33483123779297, "geo/layer_27/stable_rank_o_proj": 115.3370361328125, "geo/layer_27/stable_rank_gate_proj": 76.46895599365234, "geo/layer_27/stable_rank_down_proj": 127.27405548095703, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08277604728937149, "geo/layer_27/attn_entropy_mean": 4.148643493652344, "geo/layer_27/attn_entropy_std": 0.7982451915740967, "attnres/final_alpha/block_0": 0.241029754281044, "attnres/block_norm/0": 1.7791743278503418, "attnres/final_alpha/block_1": 0.0038748462684452534, "attnres/block_norm/1": 47877.6796875, "attnres/final_alpha/block_2": 0.009520243853330612, "attnres/block_norm/2": 29096.693359375, "attnres/final_alpha/block_3": 0.011221027001738548, "attnres/block_norm/3": 63287.89453125, "attnres/final_alpha/block_4": 0.013182921335101128, "attnres/block_norm/4": 15789.701171875, "attnres/final_alpha/block_5": 0.6142910718917847, "attnres/block_norm/5": 6852.4970703125, "attnres/final_alpha/block_6": 0.10688015073537827, "attnres/block_norm/6": 41087.89453125, "geo/tier1_time_s": 1.3606109619140625, "geo/step": 76950.0, "geo/rankme_slope": 0.00045492265265481194} {"step": 76960, "timestamp": 1778277726.271045, "train/loss": 2.10651181936264, "train/z_loss": 0.0014145280933007599, "train/perplexity": 8.219520050633403, "train/grad_norm": 0.07666015625, "optim/muon_lr": 0.010888043642044067, "optim/adamw_lr": 0.000326641309261322, "perf/tokens_per_sec": 1791438.850343445, "perf/iters_per_sec": 0.8542246104924417, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1706522941589355, "data/tokens_consumed": 161398915072, "data/tokens_consumed_B": 161.398915072, "train/loss_slope": -1.053558818721e-05} {"step": 76970, "timestamp": 1778277736.6137958, "train/loss": 2.129278039932251, "train/z_loss": 0.0013907678658142686, "train/perplexity": 8.408793806124292, "train/grad_norm": 0.07373046875, "optim/muon_lr": 0.010849529057741166, "optim/adamw_lr": 0.00032548587173223493, "perf/tokens_per_sec": 2028634.3044816891, "perf/iters_per_sec": 0.9673282167824216, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0337752819061279, "data/tokens_consumed": 161419886592, "data/tokens_consumed_B": 161.419886592, "train/loss_slope": -8.491970267411733e-06} {"step": 76980, "timestamp": 1778277746.9644687, "train/loss": 2.117625331878662, "train/z_loss": 0.0013979283510707318, "train/perplexity": 8.311377272343655, "train/grad_norm": 0.0791015625, "optim/muon_lr": 0.010811001807451249, "optim/adamw_lr": 0.0003243300542235374, "perf/tokens_per_sec": 2027191.3225255401, "perf/iters_per_sec": 0.9666401493671132, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034511137008667, "data/tokens_consumed": 161440858112, "data/tokens_consumed_B": 161.440858112, "train/loss_slope": -8.818266343827216e-06} {"step": 76990, "timestamp": 1778277757.316392, "train/loss": 2.110423970222473, "train/z_loss": 0.0014014504384249448, "train/perplexity": 8.251739034737668, "train/grad_norm": 0.10107421875, "optim/muon_lr": 0.010772462487220764, "optim/adamw_lr": 0.0003231738746166229, "perf/tokens_per_sec": 2026889.7919844545, "perf/iters_per_sec": 0.9664963684007905, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034665036201477, "data/tokens_consumed": 161461829632, "data/tokens_consumed_B": 161.461829632, "train/loss_slope": -6.151080918390766e-06} {"step": 77000, "timestamp": 1778277767.657282, "grad/layer_0/attn": 0.0026636600960046053, "grad/layer_0/mlp": 0.0030720036011189222, "grad/layer_0/attn_mlp_ratio": 0.8670758095227603, "grad/layer_4/attn": 0.001906485529616475, "grad/layer_4/mlp": 0.002579071559011936, "grad/layer_4/attn_mlp_ratio": 0.7392138651730467, "grad/layer_8/attn": 0.0047320155426859856, "grad/layer_8/mlp": 0.003470825497061014, "grad/layer_8/attn_mlp_ratio": 1.3633688614872805, "grad/layer_12/attn": 0.0036866848822683096, "grad/layer_12/mlp": 0.006413551047444344, "grad/layer_12/attn_mlp_ratio": 0.5748273924247679, "grad/layer_16/attn": 0.0038609469775110483, "grad/layer_16/mlp": 0.004264262039214373, "grad/layer_16/attn_mlp_ratio": 0.9054197071998887, "grad/layer_20/attn": 0.0028193483594805002, "grad/layer_20/mlp": 0.005112911574542522, "grad/layer_20/attn_mlp_ratio": 0.55141737994775, "grad/layer_24/attn": 0.005343564786016941, "grad/layer_24/mlp": 0.006506156176328659, "grad/layer_24/attn_mlp_ratio": 0.8213090124284944, "grad/layer_27/attn": 0.003809213638305664, "grad/layer_27/mlp": 0.006010752636939287, "grad/layer_27/attn_mlp_ratio": 0.6337332119646198} {"step": 77000, "timestamp": 1778277767.6734173, "train/loss": 2.074766182899475, "train/z_loss": 0.0014030623948201536, "train/perplexity": 7.9626844295703725, "train/grad_norm": 0.08251953125, "optim/muon_lr": 0.010733911544084549, "optim/adamw_lr": 0.0003220173463225364, "perf/tokens_per_sec": 2025834.1846479045, "perf/iters_per_sec": 0.9659930155982516, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352041721343994, "data/tokens_consumed": 161482801152, "data/tokens_consumed_B": 161.482801152, "train/loss_slope": -9.069866837948752e-06} {"step": 77000, "timestamp": 1778277774.6797051, "geo/ww_alpha_mean": 7.884520024179703, "geo/ww_alpha_std": 4.7832562434862576, "geo/ww_alpha_min": 1.318943401839873, "geo/ww_alpha_max": 30.67691822166033, "geo/ww_alpha_healthy_frac": 0.16751269035532995, "geo/ww_alpha_by_type/q_proj": 3.929503370664876, "geo/ww_alpha_by_type/k_proj": 4.504201015342795, "geo/ww_alpha_by_type/v_proj": 9.018103513889104, "geo/ww_alpha_by_type/o_proj": 9.746570568016809, "geo/ww_alpha_by_type/gate_proj": 8.06034293094702, "geo/ww_alpha_by_type/up_proj": 12.04216266189516, "geo/ww_alpha_by_type/down_proj": 7.983978152716111, "geo/twonn_id/layer_0": 0.6731339693069458, "geo/twonn_id/layer_7": 3.5624029636383057, "geo/twonn_id/layer_14": 4.372438907623291, "geo/twonn_id/layer_21": 6.143209934234619, "geo/twonn_id/layer_27": 5.506675720214844, "geo/tier2_time_s": 7.000426292419434} {"step": 77000, "timestamp": 1778277775.3433316, "eoc/jacobian_sigma/layer_0/attn": 1276.08251953125, "eoc/jacobian_sigma/layer_0/mlp": 8387.94140625, "eoc/jacobian_sigma/layer_0": 8387.94140625, "eoc/jacobian_sigma/layer_7/attn": 1.1618114709854126, "eoc/jacobian_sigma/layer_7/mlp": 1.9317506551742554, "eoc/jacobian_sigma/layer_7": 1.9317506551742554, "eoc/jacobian_sigma/layer_14/attn": 1.384454369544983, "eoc/jacobian_sigma/layer_14/mlp": 5.858770370483398, "eoc/jacobian_sigma/layer_14": 5.858770370483398, "eoc/jacobian_sigma/layer_21/attn": 1.094306468963623, "eoc/jacobian_sigma/layer_21/mlp": 4.480236530303955, "eoc/jacobian_sigma/layer_21": 4.480236530303955, "eoc/jacobian_sigma/layer_27/attn": 3.0672507286071777, "eoc/jacobian_sigma/layer_27/mlp": 27.311115264892578, "eoc/jacobian_sigma/layer_27": 27.311115264892578, "eoc/layer0_sigma": 8387.94140625, "eoc/sigma_max": 27.311115264892578, "eoc/sigma_min": 1.9317506551742554, "eoc/sigma_mean": 9.895468205213547, "eoc/time_s": 0.656958818435669} {"step": 77010, "timestamp": 1778277785.710701, "train/loss": 2.1204665899276733, "train/z_loss": 0.001396547700278461, "train/perplexity": 8.335025619536548, "train/grad_norm": 0.0810546875, "optim/muon_lr": 0.010695349648594856, "optim/adamw_lr": 0.00032086048945784563, "perf/tokens_per_sec": 1162978.2895573243, "perf/iters_per_sec": 0.5545512626444455, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.803259801864624, "data/tokens_consumed": 161503772672, "data/tokens_consumed_B": 161.503772672, "train/loss_slope": -4.64778555215676e-06} {"step": 77020, "timestamp": 1778277796.0736668, "train/loss": 2.1014097571372985, "train/z_loss": 0.0014075069106183947, "train/perplexity": 8.177690347485969, "train/grad_norm": 0.0849609375, "optim/muon_lr": 0.010656776130199433, "optim/adamw_lr": 0.00031970328390598294, "perf/tokens_per_sec": 2025136.0614596049, "perf/iters_per_sec": 0.9656601245210671, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0355610370635986, "data/tokens_consumed": 161524744192, "data/tokens_consumed_B": 161.524744192, "train/loss_slope": -5.4251341500727275e-06} {"step": 77025, "timestamp": 1778277801.8638554, "eos/sharpness": 19.94369029998779, "eos/L0_probe": 1.922897219657898, "eos/L_plus": 2.0107648372650146, "eos/L_minus": 2.034466505050659, "eos/grad_norm": 0.08471443504095078, "eos/embed_grad_frac": 0.264256089925766, "eos/time_s": 0.6255381107330322} {"step": 77025, "timestamp": 1778277803.2484057, "geo/rankme_last": 439.5810546875, "geo/layer_0/stable_rank_q_proj": 18.80567169189453, "geo/layer_0/stable_rank_k_proj": 15.743905067443848, "geo/layer_0/stable_rank_o_proj": 46.42526626586914, "geo/layer_0/stable_rank_gate_proj": 127.43780517578125, "geo/layer_0/stable_rank_down_proj": 56.850494384765625, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06141825392842293, "geo/layer_0/attn_entropy_mean": 6.132979869842529, "geo/layer_0/attn_entropy_std": 0.4282539188861847, "geo/layer_7/stable_rank_q_proj": 42.67198181152344, "geo/layer_7/stable_rank_k_proj": 40.135433197021484, "geo/layer_7/stable_rank_o_proj": 88.1009750366211, "geo/layer_7/stable_rank_gate_proj": 76.95372009277344, "geo/layer_7/stable_rank_down_proj": 140.1995391845703, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4372689425945282, "geo/layer_7/attn_entropy_mean": 4.6400651931762695, "geo/layer_7/attn_entropy_std": 0.7782102823257446, "geo/layer_14/stable_rank_q_proj": 49.39851760864258, "geo/layer_14/stable_rank_k_proj": 41.62070083618164, "geo/layer_14/stable_rank_o_proj": 43.124114990234375, "geo/layer_14/stable_rank_gate_proj": 70.22022247314453, "geo/layer_14/stable_rank_down_proj": 124.70439147949219, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.40547341108322144, "geo/layer_14/attn_entropy_mean": 5.529534339904785, "geo/layer_14/attn_entropy_std": 0.4178061783313751, "geo/layer_21/stable_rank_q_proj": 39.67081069946289, "geo/layer_21/stable_rank_k_proj": 30.089609146118164, "geo/layer_21/stable_rank_o_proj": 67.95439910888672, "geo/layer_21/stable_rank_gate_proj": 63.09412384033203, "geo/layer_21/stable_rank_down_proj": 49.284629821777344, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.13917678594589233, "geo/layer_21/attn_entropy_mean": 5.676648139953613, "geo/layer_21/attn_entropy_std": 0.2947094142436981, "geo/layer_27/stable_rank_q_proj": 43.84724426269531, "geo/layer_27/stable_rank_k_proj": 32.324485778808594, "geo/layer_27/stable_rank_o_proj": 115.36760711669922, "geo/layer_27/stable_rank_gate_proj": 76.45918273925781, "geo/layer_27/stable_rank_down_proj": 127.41844940185547, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09044183790683746, "geo/layer_27/attn_entropy_mean": 4.143374443054199, "geo/layer_27/attn_entropy_std": 0.7998850345611572, "attnres/final_alpha/block_0": 0.24031350016593933, "attnres/block_norm/0": 1.779118537902832, "attnres/final_alpha/block_1": 0.0038457377813756466, "attnres/block_norm/1": 47882.3359375, "attnres/final_alpha/block_2": 0.009485096670687199, "attnres/block_norm/2": 29164.533203125, "attnres/final_alpha/block_3": 0.011081686243414879, "attnres/block_norm/3": 63568.8359375, "attnres/final_alpha/block_4": 0.01318754255771637, "attnres/block_norm/4": 15733.009765625, "attnres/final_alpha/block_5": 0.6146847009658813, "attnres/block_norm/5": 6833.4365234375, "attnres/final_alpha/block_6": 0.10740175098180771, "attnres/block_norm/6": 40931.89453125, "geo/tier1_time_s": 1.3628594875335693, "geo/step": 77025.0, "geo/rankme_slope": 0.00046480359722013805} {"step": 77030, "timestamp": 1778277808.436806, "train/loss": 2.1273360729217528, "train/z_loss": 0.0014000499271787704, "train/perplexity": 8.392480051468315, "train/grad_norm": 0.080078125, "optim/muon_lr": 0.010618194043636323, "optim/adamw_lr": 0.00031854582130908963, "perf/tokens_per_sec": 1697043.0865360503, "perf/iters_per_sec": 0.8092132027321102, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2357682704925537, "data/tokens_consumed": 161545715712, "data/tokens_consumed_B": 161.545715712, "train/loss_slope": -1.1559562643047267e-06} {"step": 77040, "timestamp": 1778277818.7982476, "train/loss": 2.078329396247864, "train/z_loss": 0.0014091138378717004, "train/perplexity": 7.991107782180479, "train/grad_norm": 0.08447265625, "optim/muon_lr": 0.010579602718353271, "optim/adamw_lr": 0.00031738808155059813, "perf/tokens_per_sec": 2025515.2401042618, "perf/iters_per_sec": 0.9658409309884366, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0353671789169312, "data/tokens_consumed": 161566687232, "data/tokens_consumed_B": 161.566687232, "train/loss_slope": -1.5793324446771192e-06} {"step": 77050, "timestamp": 1778277829.1387885, "grad/layer_0/attn": 0.0029539659153670073, "grad/layer_0/mlp": 0.003056778572499752, "grad/layer_0/attn_mlp_ratio": 0.9663656521626188, "grad/layer_4/attn": 0.0024319919757544994, "grad/layer_4/mlp": 0.0025552965234965086, "grad/layer_4/attn_mlp_ratio": 0.9517454660221455, "grad/layer_8/attn": 0.005699857138097286, "grad/layer_8/mlp": 0.0035210212226957083, "grad/layer_8/attn_mlp_ratio": 1.6188079013771641, "grad/layer_12/attn": 0.005011121276766062, "grad/layer_12/mlp": 0.00674845464527607, "grad/layer_12/attn_mlp_ratio": 0.7425583286712941, "grad/layer_16/attn": 0.003420308930799365, "grad/layer_16/mlp": 0.004604300484061241, "grad/layer_16/attn_mlp_ratio": 0.7428509212972513, "grad/layer_20/attn": 0.003320888848975301, "grad/layer_20/mlp": 0.005100746173411608, "grad/layer_20/attn_mlp_ratio": 0.6510594079705403, "grad/layer_24/attn": 0.004191712010651827, "grad/layer_24/mlp": 0.006603769492357969, "grad/layer_24/attn_mlp_ratio": 0.6347453453710091, "grad/layer_27/attn": 0.0036870751064270735, "grad/layer_27/mlp": 0.005850863642990589, "grad/layer_27/attn_mlp_ratio": 0.6301762044696798} {"step": 77050, "timestamp": 1778277829.1548347, "train/loss": 2.0768359422683718, "train/z_loss": 0.0014114221441559494, "train/perplexity": 7.979182337730414, "train/grad_norm": 0.08837890625, "optim/muon_lr": 0.010541002713143826, "optim/adamw_lr": 0.00031623008139431476, "perf/tokens_per_sec": 2025933.522538537, "perf/iters_per_sec": 0.9660403835957226, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351534128189086, "data/tokens_consumed": 161587658752, "data/tokens_consumed_B": 161.587658752, "train/loss_slope": -1.6999665159310696e-06} {"step": 77060, "timestamp": 1778277839.5070097, "train/loss": 2.0675304174423217, "train/z_loss": 0.0014209680957719684, "train/perplexity": 7.905276259130062, "train/grad_norm": 0.07666015625, "optim/muon_lr": 0.010502394624054431, "optim/adamw_lr": 0.00031507183872163294, "perf/tokens_per_sec": 2026804.4174486292, "perf/iters_per_sec": 0.9664556586497446, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347086191177368, "data/tokens_consumed": 161608630272, "data/tokens_consumed_B": 161.608630272, "train/loss_slope": -2.528287847228784e-06} {"step": 77070, "timestamp": 1778277849.8590345, "train/loss": 2.0720136523246766, "train/z_loss": 0.0014056471991352737, "train/perplexity": 7.940797033902413, "train/grad_norm": 0.0986328125, "optim/muon_lr": 0.010463779009878636, "optim/adamw_lr": 0.00031391337029635903, "perf/tokens_per_sec": 2026876.9479814307, "perf/iters_per_sec": 0.9664902439028886, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346715927124024, "data/tokens_consumed": 161629601792, "data/tokens_consumed_B": 161.629601792, "train/loss_slope": -4.673629997372445e-06} {"step": 77080, "timestamp": 1778277860.211362, "train/loss": 2.098555266857147, "train/z_loss": 0.001414543658029288, "train/perplexity": 8.154380494576875, "train/grad_norm": 0.08349609375, "optim/muon_lr": 0.010425155274569988, "optim/adamw_lr": 0.00031275465823709964, "perf/tokens_per_sec": 2027228.1382853657, "perf/iters_per_sec": 0.9666577044894055, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344923496246339, "data/tokens_consumed": 161650573312, "data/tokens_consumed_B": 161.650573312, "train/loss_slope": -6.647145522810339e-06} {"step": 77090, "timestamp": 1778277870.561177, "train/loss": 2.075173819065094, "train/z_loss": 0.0014110440504737198, "train/perplexity": 7.9659309693778395, "train/grad_norm": 0.0791015625, "optim/muon_lr": 0.01038652639836073, "optim/adamw_lr": 0.00031159579195082183, "perf/tokens_per_sec": 2027429.0132967373, "perf/iters_per_sec": 0.966753489158982, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034389853477478, "data/tokens_consumed": 161671544832, "data/tokens_consumed_B": 161.671544832, "train/loss_slope": -8.302921738096659e-06} {"step": 77100, "timestamp": 1778277880.9085715, "grad/layer_0/attn": 0.002406527753919363, "grad/layer_0/mlp": 0.0027385582216084003, "grad/layer_0/attn_mlp_ratio": 0.8787571675690915, "grad/layer_4/attn": 0.0017943063285201788, "grad/layer_4/mlp": 0.0024995373096317053, "grad/layer_4/attn_mlp_ratio": 0.7178553605983281, "grad/layer_8/attn": 0.004424868617206812, "grad/layer_8/mlp": 0.0036739155184477568, "grad/layer_8/attn_mlp_ratio": 1.2044012646856423, "grad/layer_12/attn": 0.0036334695760160685, "grad/layer_12/mlp": 0.006223147734999657, "grad/layer_12/attn_mlp_ratio": 0.5838636124922251, "grad/layer_16/attn": 0.003908063285052776, "grad/layer_16/mlp": 0.004479831084609032, "grad/layer_16/attn_mlp_ratio": 0.8723684272923881, "grad/layer_20/attn": 0.002605832414701581, "grad/layer_20/mlp": 0.005156000144779682, "grad/layer_20/attn_mlp_ratio": 0.5053980393697459, "grad/layer_24/attn": 0.007349351420998573, "grad/layer_24/mlp": 0.006977797485888004, "grad/layer_24/attn_mlp_ratio": 1.0532480099253674, "grad/layer_27/attn": 0.005520269740372896, "grad/layer_27/mlp": 0.00628068670630455, "grad/layer_27/attn_mlp_ratio": 0.8789277209033338} {"step": 77100, "timestamp": 1778277881.5221736, "eos/sharpness": 18.461036682128903, "eos/L0_probe": 1.92251718044281, "eos/L_plus": 2.0114998817443848, "eos/L_minus": 2.0181448459625244, "eos/grad_norm": 0.08256931602954865, "eos/embed_grad_frac": 0.28424790501594543, "eos/time_s": 0.6106851100921631} {"step": 77100, "timestamp": 1778277881.5421262, "train/loss": 2.1359867811203004, "train/z_loss": 0.0013952158391475678, "train/perplexity": 8.465395879512169, "train/grad_norm": 0.08251953125, "optim/muon_lr": 0.010347891747951508, "optim/adamw_lr": 0.0003104367524385452, "perf/tokens_per_sec": 1910812.2026461384, "perf/iters_per_sec": 0.9111462605696384, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0975186347961425, "data/tokens_consumed": 161692516352, "data/tokens_consumed_B": 161.692516352, "train/loss_slope": -6.4715394378602505e-06} {"step": 77100, "timestamp": 1778277882.9040563, "geo/rankme_last": 439.8284606933594, "geo/layer_0/stable_rank_q_proj": 18.804140090942383, "geo/layer_0/stable_rank_k_proj": 15.734758377075195, "geo/layer_0/stable_rank_o_proj": 46.422325134277344, "geo/layer_0/stable_rank_gate_proj": 127.42105865478516, "geo/layer_0/stable_rank_down_proj": 56.819984436035156, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0636257603764534, "geo/layer_0/attn_entropy_mean": 6.128546237945557, "geo/layer_0/attn_entropy_std": 0.4312185049057007, "geo/layer_7/stable_rank_q_proj": 42.643768310546875, "geo/layer_7/stable_rank_k_proj": 40.09138870239258, "geo/layer_7/stable_rank_o_proj": 88.06713104248047, "geo/layer_7/stable_rank_gate_proj": 76.98570251464844, "geo/layer_7/stable_rank_down_proj": 140.24530029296875, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4394304156303406, "geo/layer_7/attn_entropy_mean": 4.630066871643066, "geo/layer_7/attn_entropy_std": 0.7820436358451843, "geo/layer_14/stable_rank_q_proj": 49.367950439453125, "geo/layer_14/stable_rank_k_proj": 41.63130569458008, "geo/layer_14/stable_rank_o_proj": 43.1361083984375, "geo/layer_14/stable_rank_gate_proj": 70.20496368408203, "geo/layer_14/stable_rank_down_proj": 124.63623809814453, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4106091856956482, "geo/layer_14/attn_entropy_mean": 5.536276340484619, "geo/layer_14/attn_entropy_std": 0.41048330068588257, "geo/layer_21/stable_rank_q_proj": 39.63154983520508, "geo/layer_21/stable_rank_k_proj": 30.11035919189453, "geo/layer_21/stable_rank_o_proj": 67.90809631347656, "geo/layer_21/stable_rank_gate_proj": 63.08988952636719, "geo/layer_21/stable_rank_down_proj": 49.28764343261719, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1446363478899002, "geo/layer_21/attn_entropy_mean": 5.6890645027160645, "geo/layer_21/attn_entropy_std": 0.2887064814567566, "geo/layer_27/stable_rank_q_proj": 43.84092330932617, "geo/layer_27/stable_rank_k_proj": 32.2785530090332, "geo/layer_27/stable_rank_o_proj": 115.37567138671875, "geo/layer_27/stable_rank_gate_proj": 76.47654724121094, "geo/layer_27/stable_rank_down_proj": 127.50826263427734, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08857789635658264, "geo/layer_27/attn_entropy_mean": 4.159841537475586, "geo/layer_27/attn_entropy_std": 0.7892777919769287, "attnres/final_alpha/block_0": 0.24014271795749664, "attnres/block_norm/0": 1.7791690826416016, "attnres/final_alpha/block_1": 0.00379699794575572, "attnres/block_norm/1": 47803.26171875, "attnres/final_alpha/block_2": 0.00940396822988987, "attnres/block_norm/2": 29127.853515625, "attnres/final_alpha/block_3": 0.010923913680016994, "attnres/block_norm/3": 63810.96875, "attnres/final_alpha/block_4": 0.0129863191395998, "attnres/block_norm/4": 15760.13671875, "attnres/final_alpha/block_5": 0.6169108152389526, "attnres/block_norm/5": 6764.3515625, "attnres/final_alpha/block_6": 0.10583525896072388, "attnres/block_norm/6": 41175.1015625, "geo/tier1_time_s": 1.3584516048431396, "geo/step": 77100.0, "geo/rankme_slope": 0.0004910007362319928} {"step": 77110, "timestamp": 1778277893.2544603, "train/loss": 2.108289432525635, "train/z_loss": 0.0014062103349715472, "train/perplexity": 8.23414417183317, "train/grad_norm": 0.10546875, "optim/muon_lr": 0.010309251863509417, "optim/adamw_lr": 0.0003092775559052825, "perf/tokens_per_sec": 1791055.803444659, "perf/iters_per_sec": 0.8540419594977661, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1709026575088501, "data/tokens_consumed": 161713487872, "data/tokens_consumed_B": 161.713487872, "train/loss_slope": -2.711174339994865e-06} {"step": 77120, "timestamp": 1778277903.6054275, "train/loss": 2.083650362491608, "train/z_loss": 0.0014018972055055202, "train/perplexity": 8.033741522697067, "train/grad_norm": 0.0849609375, "optim/muon_lr": 0.010270607378333808, "optim/adamw_lr": 0.0003081182213500142, "perf/tokens_per_sec": 2027278.6453293927, "perf/iters_per_sec": 0.9666817881247486, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344665765762329, "data/tokens_consumed": 161734459392, "data/tokens_consumed_B": 161.734459392, "train/loss_slope": -4.487664507131373e-06} {"step": 77130, "timestamp": 1778277913.9746528, "train/loss": 2.0943646907806395, "train/z_loss": 0.001393503532744944, "train/perplexity": 8.120280442092001, "train/grad_norm": 0.08154296875, "optim/muon_lr": 0.010231958851218224, "optim/adamw_lr": 0.00030695876553654667, "perf/tokens_per_sec": 2023544.138471161, "perf/iters_per_sec": 0.9649010364871793, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0363757133483886, "data/tokens_consumed": 161755430912, "data/tokens_consumed_B": 161.755430912, "train/loss_slope": -5.392677398404645e-06} {"step": 77140, "timestamp": 1778277924.350447, "train/loss": 2.1023630142211913, "train/z_loss": 0.0014003353076986968, "train/perplexity": 8.18548950545029, "train/grad_norm": 0.0859375, "optim/muon_lr": 0.010193305648863315, "optim/adamw_lr": 0.0003057991694658994, "perf/tokens_per_sec": 2022629.299270985, "perf/iters_per_sec": 0.9644648071627545, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0368444681167603, "data/tokens_consumed": 161776402432, "data/tokens_consumed_B": 161.776402432, "train/loss_slope": -6.741278034911791e-06} {"step": 77150, "timestamp": 1778277934.703578, "grad/layer_0/attn": 0.0026396019384264946, "grad/layer_0/mlp": 0.00294405035674572, "grad/layer_0/attn_mlp_ratio": 0.8965885528144938, "grad/layer_4/attn": 0.003003401216119528, "grad/layer_4/mlp": 0.002481750911101699, "grad/layer_4/attn_mlp_ratio": 1.2101944162344707, "grad/layer_8/attn": 0.005262046121060848, "grad/layer_8/mlp": 0.0034690892789512873, "grad/layer_8/attn_mlp_ratio": 1.516837863269921, "grad/layer_12/attn": 0.003999314270913601, "grad/layer_12/mlp": 0.0064387088641524315, "grad/layer_12/attn_mlp_ratio": 0.6211360527676928, "grad/layer_16/attn": 0.004480954259634018, "grad/layer_16/mlp": 0.0046088434755802155, "grad/layer_16/attn_mlp_ratio": 0.9722513220835235, "grad/layer_20/attn": 0.0026933688204735518, "grad/layer_20/mlp": 0.005488971713930368, "grad/layer_20/attn_mlp_ratio": 0.49068731117148334, "grad/layer_24/attn": 0.004907512106001377, "grad/layer_24/mlp": 0.006962880492210388, "grad/layer_24/attn_mlp_ratio": 0.7048106083409758, "grad/layer_27/attn": 0.004292372148483992, "grad/layer_27/mlp": 0.006562437396496534, "grad/layer_27/attn_mlp_ratio": 0.6540819856608989} {"step": 77150, "timestamp": 1778277934.7193751, "train/loss": 2.0965336322784425, "train/z_loss": 0.0014021062641404569, "train/perplexity": 8.137911969280136, "train/grad_norm": 0.09326171875, "optim/muon_lr": 0.010154650760814548, "optim/adamw_lr": 0.0003046395228244364, "perf/tokens_per_sec": 2024371.234598394, "perf/iters_per_sec": 0.9652954266540499, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0359522819519043, "data/tokens_consumed": 161797373952, "data/tokens_consumed_B": 161.797373952, "train/loss_slope": -3.7679743702405025e-06} {"step": 77160, "timestamp": 1778277945.0687032, "train/loss": 2.083267164230347, "train/z_loss": 0.0014008960919454695, "train/perplexity": 8.030663596679762, "train/grad_norm": 0.10302734375, "optim/muon_lr": 0.010115993563085795, "optim/adamw_lr": 0.0003034798068925738, "perf/tokens_per_sec": 2027426.9571517434, "perf/iters_per_sec": 0.9667525087126462, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343909025192262, "data/tokens_consumed": 161818345472, "data/tokens_consumed_B": 161.818345472, "train/loss_slope": -2.5252884585254654e-07} {"step": 77170, "timestamp": 1778277955.4172056, "train/loss": 2.0997472763061524, "train/z_loss": 0.001410913746803999, "train/perplexity": 8.164106388704429, "train/grad_norm": 0.0732421875, "optim/muon_lr": 0.01007733462844044, "optim/adamw_lr": 0.00030232003885321315, "perf/tokens_per_sec": 2027550.0995526712, "perf/iters_per_sec": 0.9668112275851589, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034328079223633, "data/tokens_consumed": 161839316992, "data/tokens_consumed_B": 161.839316992, "train/loss_slope": -2.0433441568987313e-06} {"step": 77175, "timestamp": 1778277961.1930869, "eos/sharpness": 47.36123085021972, "eos/L0_probe": 1.9230265617370605, "eos/L_plus": 2.1626617908477783, "eos/L_minus": 2.15700364112854, "eos/grad_norm": 0.10608366131782532, "eos/embed_grad_frac": 0.15783707797527313, "eos/time_s": 0.6097791194915771} {"step": 77175, "timestamp": 1778277962.5716276, "geo/rankme_last": 439.68756103515625, "geo/layer_0/stable_rank_q_proj": 18.792343139648438, "geo/layer_0/stable_rank_k_proj": 15.723838806152344, "geo/layer_0/stable_rank_o_proj": 46.43216323852539, "geo/layer_0/stable_rank_gate_proj": 127.41636657714844, "geo/layer_0/stable_rank_down_proj": 56.82371139526367, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06623583287000656, "geo/layer_0/attn_entropy_mean": 6.127784252166748, "geo/layer_0/attn_entropy_std": 0.4317004084587097, "geo/layer_7/stable_rank_q_proj": 42.6386833190918, "geo/layer_7/stable_rank_k_proj": 40.04230880737305, "geo/layer_7/stable_rank_o_proj": 88.03369903564453, "geo/layer_7/stable_rank_gate_proj": 76.97866821289062, "geo/layer_7/stable_rank_down_proj": 140.21571350097656, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.44669198989868164, "geo/layer_7/attn_entropy_mean": 4.632275581359863, "geo/layer_7/attn_entropy_std": 0.7786944508552551, "geo/layer_14/stable_rank_q_proj": 49.34765625, "geo/layer_14/stable_rank_k_proj": 41.62858963012695, "geo/layer_14/stable_rank_o_proj": 43.135433197021484, "geo/layer_14/stable_rank_gate_proj": 70.19071197509766, "geo/layer_14/stable_rank_down_proj": 124.62640380859375, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39961734414100647, "geo/layer_14/attn_entropy_mean": 5.527331829071045, "geo/layer_14/attn_entropy_std": 0.41936448216438293, "geo/layer_21/stable_rank_q_proj": 39.58037567138672, "geo/layer_21/stable_rank_k_proj": 30.100561141967773, "geo/layer_21/stable_rank_o_proj": 67.88957214355469, "geo/layer_21/stable_rank_gate_proj": 63.082035064697266, "geo/layer_21/stable_rank_down_proj": 49.30291748046875, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15158560872077942, "geo/layer_21/attn_entropy_mean": 5.684443473815918, "geo/layer_21/attn_entropy_std": 0.29100552201271057, "geo/layer_27/stable_rank_q_proj": 43.84965896606445, "geo/layer_27/stable_rank_k_proj": 32.30108642578125, "geo/layer_27/stable_rank_o_proj": 115.37368774414062, "geo/layer_27/stable_rank_gate_proj": 76.44878387451172, "geo/layer_27/stable_rank_down_proj": 127.50270080566406, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08725877851247787, "geo/layer_27/attn_entropy_mean": 4.151862144470215, "geo/layer_27/attn_entropy_std": 0.794289767742157, "attnres/final_alpha/block_0": 0.24090062081813812, "attnres/block_norm/0": 1.779088020324707, "attnres/final_alpha/block_1": 0.0038761859759688377, "attnres/block_norm/1": 47805.7109375, "attnres/final_alpha/block_2": 0.009333986788988113, "attnres/block_norm/2": 29019.779296875, "attnres/final_alpha/block_3": 0.010982598178088665, "attnres/block_norm/3": 63713.8359375, "attnres/final_alpha/block_4": 0.013031575828790665, "attnres/block_norm/4": 15748.787109375, "attnres/final_alpha/block_5": 0.6156158447265625, "attnres/block_norm/5": 6818.826171875, "attnres/final_alpha/block_6": 0.10625917464494705, "attnres/block_norm/6": 41193.21484375, "geo/tier1_time_s": 1.3590621948242188, "geo/step": 77175.0, "geo/rankme_slope": 0.0005151118455194578} {"step": 77180, "timestamp": 1778277967.7508624, "train/loss": 2.0623592138290405, "train/z_loss": 0.0014013656764291227, "train/perplexity": 7.864501982880413, "train/grad_norm": 0.07470703125, "optim/muon_lr": 0.010038674536626786, "optim/adamw_lr": 0.00030116023609880356, "perf/tokens_per_sec": 1701296.8309751428, "perf/iters_per_sec": 0.8112415461421694, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.232678484916687, "data/tokens_consumed": 161860288512, "data/tokens_consumed_B": 161.860288512, "train/loss_slope": -1.3069353624395725e-06} {"step": 77190, "timestamp": 1778277978.1120298, "train/loss": 2.0844239354133607, "train/z_loss": 0.0014072128455154599, "train/perplexity": 8.039958611975313, "train/grad_norm": 0.0771484375, "optim/muon_lr": 0.010000012675908465, "optim/adamw_lr": 0.00030000038027725394, "perf/tokens_per_sec": 2025185.5317652493, "perf/iters_per_sec": 0.965683713801026, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035535740852356, "data/tokens_consumed": 161881260032, "data/tokens_consumed_B": 161.881260032, "train/loss_slope": -2.1778719796456077e-06} {"step": 77200, "timestamp": 1778277988.4614654, "grad/layer_0/attn": 0.002777837449684739, "grad/layer_0/mlp": 0.0029353806748986244, "grad/layer_0/attn_mlp_ratio": 0.946329509765449, "grad/layer_4/attn": 0.0026754019781947136, "grad/layer_4/mlp": 0.0024427068419754505, "grad/layer_4/attn_mlp_ratio": 1.0952611351859831, "grad/layer_8/attn": 0.0031468637753278017, "grad/layer_8/mlp": 0.0033047599717974663, "grad/layer_8/attn_mlp_ratio": 0.9522215552598998, "grad/layer_12/attn": 0.00484815938398242, "grad/layer_12/mlp": 0.007110427133738995, "grad/layer_12/attn_mlp_ratio": 0.6818379859058106, "grad/layer_16/attn": 0.0032895090989768505, "grad/layer_16/mlp": 0.004611927550286055, "grad/layer_16/attn_mlp_ratio": 0.713261210585734, "grad/layer_20/attn": 0.0026962619740515947, "grad/layer_20/mlp": 0.005233077798038721, "grad/layer_20/attn_mlp_ratio": 0.5152344426331038, "grad/layer_24/attn": 0.0046868519857525826, "grad/layer_24/mlp": 0.007317409384995699, "grad/layer_24/attn_mlp_ratio": 0.6405069984620845, "grad/layer_27/attn": 0.004801253322511911, "grad/layer_27/mlp": 0.0066615017130970955, "grad/layer_27/attn_mlp_ratio": 0.7207463808044348} {"step": 77200, "timestamp": 1778277988.477451, "train/loss": 2.0860731840133666, "train/z_loss": 0.0014112842152826488, "train/perplexity": 8.053229442901804, "train/grad_norm": 0.099609375, "optim/muon_lr": 0.009961352008394897, "optim/adamw_lr": 0.0002988405602518469, "perf/tokens_per_sec": 2025721.420653163, "perf/iters_per_sec": 0.9659392455354514, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352617979049683, "data/tokens_consumed": 161902231552, "data/tokens_consumed_B": 161.902231552, "train/loss_slope": -2.1589015433640026e-06} {"step": 77210, "timestamp": 1778277998.8275635, "train/loss": 2.068651854991913, "train/z_loss": 0.0014112706412561238, "train/perplexity": 7.914146505551029, "train/grad_norm": 0.09716796875, "optim/muon_lr": 0.00992269191890955, "optim/adamw_lr": 0.00029768075756728645, "perf/tokens_per_sec": 2027283.0373652016, "perf/iters_per_sec": 0.9666838824106224, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344643354415894, "data/tokens_consumed": 161923203072, "data/tokens_consumed_B": 161.923203072, "train/loss_slope": -7.135738712726227e-06} {"step": 77220, "timestamp": 1778278009.1830752, "train/loss": 2.0686468839645387, "train/z_loss": 0.0014159304555505514, "train/perplexity": 7.914107164209891, "train/grad_norm": 0.1015625, "optim/muon_lr": 0.009884032979607582, "optim/adamw_lr": 0.0002965209893882274, "perf/tokens_per_sec": 2026234.2084996908, "perf/iters_per_sec": 0.9661837618349508, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349997997283935, "data/tokens_consumed": 161944174592, "data/tokens_consumed_B": 161.944174592, "train/loss_slope": -9.541803553695945e-06} {"step": 77230, "timestamp": 1778278019.534914, "train/loss": 2.081547403335571, "train/z_loss": 0.001412083685863763, "train/perplexity": 8.016864644317382, "train/grad_norm": 0.07177734375, "optim/muon_lr": 0.009845375781878829, "optim/adamw_lr": 0.0002953612734563648, "perf/tokens_per_sec": 2026954.2479849982, "perf/iters_per_sec": 0.9665271034169188, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034632134437561, "data/tokens_consumed": 161965146112, "data/tokens_consumed_B": 161.965146112, "train/loss_slope": -8.561090309985406e-06} {"step": 77240, "timestamp": 1778278029.8892076, "train/loss": 2.064740979671478, "train/z_loss": 0.0014186438987962902, "train/perplexity": 7.8832557096984734, "train/grad_norm": 0.0849609375, "optim/muon_lr": 0.009806720893830062, "optim/adamw_lr": 0.0002942016268149018, "perf/tokens_per_sec": 2026387.1758342937, "perf/iters_per_sec": 0.9662567023440808, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349216699600219, "data/tokens_consumed": 161986117632, "data/tokens_consumed_B": 161.986117632, "train/loss_slope": -1.202412541478449e-05} {"step": 77250, "timestamp": 1778278040.229633, "grad/layer_0/attn": 0.002286056522279978, "grad/layer_0/mlp": 0.002789186779409647, "grad/layer_0/attn_mlp_ratio": 0.8196139667643351, "grad/layer_4/attn": 0.0017739987233653665, "grad/layer_4/mlp": 0.0024744714610278606, "grad/layer_4/attn_mlp_ratio": 0.7169202310930873, "grad/layer_8/attn": 0.0037186972331255674, "grad/layer_8/mlp": 0.0032671643421053886, "grad/layer_8/attn_mlp_ratio": 1.1382032643356044, "grad/layer_12/attn": 0.005134351551532745, "grad/layer_12/mlp": 0.006677628494799137, "grad/layer_12/attn_mlp_ratio": 0.7688884577275875, "grad/layer_16/attn": 0.0029822555370628834, "grad/layer_16/mlp": 0.00408340897411108, "grad/layer_16/attn_mlp_ratio": 0.7303347479855646, "grad/layer_20/attn": 0.0024768547154963017, "grad/layer_20/mlp": 0.004941963125020266, "grad/layer_20/attn_mlp_ratio": 0.5011884149514578, "grad/layer_24/attn": 0.006547376047819853, "grad/layer_24/mlp": 0.006894237361848354, "grad/layer_24/attn_mlp_ratio": 0.9496882119381616, "grad/layer_27/attn": 0.004319548141211271, "grad/layer_27/mlp": 0.005947696976363659, "grad/layer_27/attn_mlp_ratio": 0.7262555718207802} {"step": 77250, "timestamp": 1778278040.8399699, "eos/sharpness": 9.61930751800537, "eos/L0_probe": 1.9180065393447876, "eos/L_plus": 1.9676201343536377, "eos/L_minus": 1.9645860195159912, "eos/grad_norm": 0.07924014329910278, "eos/embed_grad_frac": 0.2959786057472229, "eos/time_s": 0.6074938774108887} {"step": 77250, "timestamp": 1778278040.8602364, "train/loss": 2.120390605926514, "train/z_loss": 0.0014051646343432368, "train/perplexity": 8.33439231500102, "train/grad_norm": 0.0791015625, "optim/muon_lr": 0.009768067691475153, "optim/adamw_lr": 0.00029304203074425456, "perf/tokens_per_sec": 1912357.5586878958, "perf/iters_per_sec": 0.9118831437530021, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0966317415237428, "data/tokens_consumed": 162007089152, "data/tokens_consumed_B": 162.007089152, "train/loss_slope": -8.214005950403511e-06} {"step": 77250, "timestamp": 1778278042.226695, "geo/rankme_last": 439.8519592285156, "geo/layer_0/stable_rank_q_proj": 18.791582107543945, "geo/layer_0/stable_rank_k_proj": 15.72715950012207, "geo/layer_0/stable_rank_o_proj": 46.39830017089844, "geo/layer_0/stable_rank_gate_proj": 127.49722290039062, "geo/layer_0/stable_rank_down_proj": 56.801422119140625, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0604940764605999, "geo/layer_0/attn_entropy_mean": 6.128000736236572, "geo/layer_0/attn_entropy_std": 0.4308655261993408, "geo/layer_7/stable_rank_q_proj": 42.655174255371094, "geo/layer_7/stable_rank_k_proj": 40.04766845703125, "geo/layer_7/stable_rank_o_proj": 87.989013671875, "geo/layer_7/stable_rank_gate_proj": 76.96196746826172, "geo/layer_7/stable_rank_down_proj": 140.2273712158203, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.44927769899368286, "geo/layer_7/attn_entropy_mean": 4.650294303894043, "geo/layer_7/attn_entropy_std": 0.7829976081848145, "geo/layer_14/stable_rank_q_proj": 49.345340728759766, "geo/layer_14/stable_rank_k_proj": 41.59966278076172, "geo/layer_14/stable_rank_o_proj": 43.11476135253906, "geo/layer_14/stable_rank_gate_proj": 70.2185287475586, "geo/layer_14/stable_rank_down_proj": 124.6924057006836, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.40397152304649353, "geo/layer_14/attn_entropy_mean": 5.545586585998535, "geo/layer_14/attn_entropy_std": 0.41044092178344727, "geo/layer_21/stable_rank_q_proj": 39.5731315612793, "geo/layer_21/stable_rank_k_proj": 30.104413986206055, "geo/layer_21/stable_rank_o_proj": 67.89094543457031, "geo/layer_21/stable_rank_gate_proj": 63.10211181640625, "geo/layer_21/stable_rank_down_proj": 49.292236328125, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15009203553199768, "geo/layer_21/attn_entropy_mean": 5.682870864868164, "geo/layer_21/attn_entropy_std": 0.28751373291015625, "geo/layer_27/stable_rank_q_proj": 43.893272399902344, "geo/layer_27/stable_rank_k_proj": 32.29874801635742, "geo/layer_27/stable_rank_o_proj": 115.36448669433594, "geo/layer_27/stable_rank_gate_proj": 76.45378112792969, "geo/layer_27/stable_rank_down_proj": 127.48112487792969, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08918741345405579, "geo/layer_27/attn_entropy_mean": 4.161993026733398, "geo/layer_27/attn_entropy_std": 0.7910028696060181, "attnres/final_alpha/block_0": 0.2418031096458435, "attnres/block_norm/0": 1.7790287733078003, "attnres/final_alpha/block_1": 0.003844004124403, "attnres/block_norm/1": 47713.015625, "attnres/final_alpha/block_2": 0.009473580867052078, "attnres/block_norm/2": 29180.6328125, "attnres/final_alpha/block_3": 0.011061612516641617, "attnres/block_norm/3": 63620.16015625, "attnres/final_alpha/block_4": 0.013135973364114761, "attnres/block_norm/4": 15730.056640625, "attnres/final_alpha/block_5": 0.6146097183227539, "attnres/block_norm/5": 6819.34033203125, "attnres/final_alpha/block_6": 0.10607202351093292, "attnres/block_norm/6": 41116.109375, "geo/tier1_time_s": 1.3610610961914062, "geo/step": 77250.0, "geo/rankme_slope": 0.0004944840240783813} {"step": 77260, "timestamp": 1778278052.5757906, "train/loss": 2.0490491151809693, "train/z_loss": 0.0014185800217092037, "train/perplexity": 7.760518237754155, "train/grad_norm": 0.08447265625, "optim/muon_lr": 0.009729419164359569, "optim/adamw_lr": 0.00029188257493078704, "perf/tokens_per_sec": 1790848.4625360032, "perf/iters_per_sec": 0.8539430916481033, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1710382223129272, "data/tokens_consumed": 162028060672, "data/tokens_consumed_B": 162.028060672, "train/loss_slope": -1.0734819722111176e-05} {"step": 77270, "timestamp": 1778278062.9206874, "train/loss": 2.095381999015808, "train/z_loss": 0.0014050022582523525, "train/perplexity": 8.12854547358712, "train/grad_norm": 0.08154296875, "optim/muon_lr": 0.009690774660557509, "optim/adamw_lr": 0.00029072323981672524, "perf/tokens_per_sec": 2028258.1664504881, "perf/iters_per_sec": 0.9671488601925317, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0339669942855836, "data/tokens_consumed": 162049032192, "data/tokens_consumed_B": 162.049032192, "train/loss_slope": -9.773547421194113e-06} {"step": 77280, "timestamp": 1778278073.287798, "train/loss": 2.108494818210602, "train/z_loss": 0.0014014492859132587, "train/perplexity": 8.235835520857512, "train/grad_norm": 0.10791015625, "optim/muon_lr": 0.00965213481336832, "optim/adamw_lr": 0.0002895640444010496, "perf/tokens_per_sec": 2023819.9483328557, "perf/iters_per_sec": 0.9650325528778342, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036234474182129, "data/tokens_consumed": 162070003712, "data/tokens_consumed_B": 162.070003712, "train/loss_slope": -8.619105533333147e-06} {"step": 77290, "timestamp": 1778278083.645741, "train/loss": 2.090300953388214, "train/z_loss": 0.001410842570476234, "train/perplexity": 8.08734871309119, "train/grad_norm": 0.07861328125, "optim/muon_lr": 0.009613500125706196, "optim/adamw_lr": 0.0002884050037711858, "perf/tokens_per_sec": 2026260.7205607311, "perf/iters_per_sec": 0.9661964037707954, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349862575531006, "data/tokens_consumed": 162090975232, "data/tokens_consumed_B": 162.090975232, "train/loss_slope": -1.0852860853616909e-05} {"step": 77300, "timestamp": 1778278093.9972205, "grad/layer_0/attn": 0.0026523973792791367, "grad/layer_0/mlp": 0.003104077186435461, "grad/layer_0/attn_mlp_ratio": 0.8544881890891941, "grad/layer_4/attn": 0.0021516152191907167, "grad/layer_4/mlp": 0.0026035092305392027, "grad/layer_4/attn_mlp_ratio": 0.8264288489202779, "grad/layer_8/attn": 0.003156052902340889, "grad/layer_8/mlp": 0.0036750498693436384, "grad/layer_8/attn_mlp_ratio": 0.8587782285051645, "grad/layer_12/attn": 0.004401287995278835, "grad/layer_12/mlp": 0.006510546896606684, "grad/layer_12/attn_mlp_ratio": 0.6760243029614562, "grad/layer_16/attn": 0.0031622580718249083, "grad/layer_16/mlp": 0.004274369217455387, "grad/layer_16/attn_mlp_ratio": 0.7398186345085082, "grad/layer_20/attn": 0.0026455370243638754, "grad/layer_20/mlp": 0.0052300794050097466, "grad/layer_20/attn_mlp_ratio": 0.5058311296854646, "grad/layer_24/attn": 0.006414752919226885, "grad/layer_24/mlp": 0.007187402807176113, "grad/layer_24/attn_mlp_ratio": 0.8924994190630677, "grad/layer_27/attn": 0.006532629020512104, "grad/layer_27/mlp": 0.006694352254271507, "grad/layer_27/attn_mlp_ratio": 0.9758418252878173} {"step": 77300, "timestamp": 1778278094.0129406, "train/loss": 2.1058927178382874, "train/z_loss": 0.0014057392836548388, "train/perplexity": 8.214432908131931, "train/grad_norm": 0.10107421875, "optim/muon_lr": 0.009574871249496937, "optim/adamw_lr": 0.00028724613748490807, "perf/tokens_per_sec": 2023893.4296103707, "perf/iters_per_sec": 0.9650675914813855, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0361968517303466, "data/tokens_consumed": 162111946752, "data/tokens_consumed_B": 162.111946752, "train/loss_slope": -6.1200099845494335e-06} {"step": 77310, "timestamp": 1778278104.3612442, "train/loss": 2.1156924962997437, "train/z_loss": 0.001397667604032904, "train/perplexity": 8.295328261687652, "train/grad_norm": 0.0830078125, "optim/muon_lr": 0.00953624751418829, "optim/adamw_lr": 0.0002860874254256487, "perf/tokens_per_sec": 2027601.136764999, "perf/iters_per_sec": 0.9668355640244479, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034302043914795, "data/tokens_consumed": 162132918272, "data/tokens_consumed_B": 162.132918272, "train/loss_slope": -6.979216808246934e-06} {"step": 77320, "timestamp": 1778278114.710637, "train/loss": 2.0761465191841126, "train/z_loss": 0.0014093162724748253, "train/perplexity": 7.973683201066963, "train/grad_norm": 0.09521484375, "optim/muon_lr": 0.009497631900012493, "optim/adamw_lr": 0.00028492895700037477, "perf/tokens_per_sec": 2027460.6971491405, "perf/iters_per_sec": 0.9667685971971228, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034373688697815, "data/tokens_consumed": 162153889792, "data/tokens_consumed_B": 162.153889792, "train/loss_slope": -7.926555063285963e-06} {"step": 77325, "timestamp": 1778278120.4833221, "eos/sharpness": 11.769139766693113, "eos/L0_probe": 1.9197306632995605, "eos/L_plus": 1.9875904321670532, "eos/L_minus": 1.969562292098999, "eos/grad_norm": 0.08709156513214111, "eos/embed_grad_frac": 0.254887193441391, "eos/time_s": 0.6063442230224609} {"step": 77325, "timestamp": 1778278121.8651352, "geo/rankme_last": 439.936279296875, "geo/layer_0/stable_rank_q_proj": 18.783740997314453, "geo/layer_0/stable_rank_k_proj": 15.725452423095703, "geo/layer_0/stable_rank_o_proj": 46.39665222167969, "geo/layer_0/stable_rank_gate_proj": 127.48107147216797, "geo/layer_0/stable_rank_down_proj": 56.790138244628906, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06494022905826569, "geo/layer_0/attn_entropy_mean": 6.129207134246826, "geo/layer_0/attn_entropy_std": 0.4296930432319641, "geo/layer_7/stable_rank_q_proj": 42.647159576416016, "geo/layer_7/stable_rank_k_proj": 40.023067474365234, "geo/layer_7/stable_rank_o_proj": 88.00048065185547, "geo/layer_7/stable_rank_gate_proj": 76.92915344238281, "geo/layer_7/stable_rank_down_proj": 140.22325134277344, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.44869473576545715, "geo/layer_7/attn_entropy_mean": 4.640966415405273, "geo/layer_7/attn_entropy_std": 0.7771499156951904, "geo/layer_14/stable_rank_q_proj": 49.37331771850586, "geo/layer_14/stable_rank_k_proj": 41.61076354980469, "geo/layer_14/stable_rank_o_proj": 43.135032653808594, "geo/layer_14/stable_rank_gate_proj": 70.21031951904297, "geo/layer_14/stable_rank_down_proj": 124.68754577636719, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.40059149265289307, "geo/layer_14/attn_entropy_mean": 5.513469696044922, "geo/layer_14/attn_entropy_std": 0.41316619515419006, "geo/layer_21/stable_rank_q_proj": 39.530025482177734, "geo/layer_21/stable_rank_k_proj": 30.089323043823242, "geo/layer_21/stable_rank_o_proj": 67.88339233398438, "geo/layer_21/stable_rank_gate_proj": 63.11347961425781, "geo/layer_21/stable_rank_down_proj": 49.28730392456055, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14690683782100677, "geo/layer_21/attn_entropy_mean": 5.685305595397949, "geo/layer_21/attn_entropy_std": 0.28930360078811646, "geo/layer_27/stable_rank_q_proj": 43.88723373413086, "geo/layer_27/stable_rank_k_proj": 32.35431671142578, "geo/layer_27/stable_rank_o_proj": 115.33296203613281, "geo/layer_27/stable_rank_gate_proj": 76.49420166015625, "geo/layer_27/stable_rank_down_proj": 127.66849517822266, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.099617138504982, "geo/layer_27/attn_entropy_mean": 4.150225639343262, "geo/layer_27/attn_entropy_std": 0.7941750288009644, "attnres/final_alpha/block_0": 0.24210505187511444, "attnres/block_norm/0": 1.7790358066558838, "attnres/final_alpha/block_1": 0.0038644084706902504, "attnres/block_norm/1": 47736.640625, "attnres/final_alpha/block_2": 0.009586045518517494, "attnres/block_norm/2": 29039.73046875, "attnres/final_alpha/block_3": 0.011147293262183666, "attnres/block_norm/3": 63396.51171875, "attnres/final_alpha/block_4": 0.013118003495037556, "attnres/block_norm/4": 15772.923828125, "attnres/final_alpha/block_5": 0.6135523319244385, "attnres/block_norm/5": 6857.1064453125, "attnres/final_alpha/block_6": 0.10662685334682465, "attnres/block_norm/6": 40990.46875, "geo/tier1_time_s": 1.3625783920288086, "geo/step": 77325.0, "geo/rankme_slope": 0.0004971932327618548} {"step": 77330, "timestamp": 1778278127.0414636, "train/loss": 2.1522766232490538, "train/z_loss": 0.0013986875768750907, "train/perplexity": 8.604425150005714, "train/grad_norm": 0.083984375, "optim/muon_lr": 0.0094590238109231, "optim/adamw_lr": 0.00028377071432769295, "perf/tokens_per_sec": 1701828.0280349792, "perf/iters_per_sec": 0.8114948406386276, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.232293725013733, "data/tokens_consumed": 162174861312, "data/tokens_consumed_B": 162.174861312, "train/loss_slope": -2.3653115948643867e-07} {"step": 77340, "timestamp": 1778278137.3875594, "train/loss": 2.0789087772369386, "train/z_loss": 0.001395725691691041, "train/perplexity": 7.995739019607029, "train/grad_norm": 0.0771484375, "optim/muon_lr": 0.00942042376846075, "optim/adamw_lr": 0.0002826127130538225, "perf/tokens_per_sec": 2027891.8922364376, "perf/iters_per_sec": 0.9669742070371807, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341537475585938, "data/tokens_consumed": 162195832832, "data/tokens_consumed_B": 162.195832832, "train/loss_slope": -2.9534477533751296e-06} {"step": 77350, "timestamp": 1778278147.7243712, "grad/layer_0/attn": 0.00266195903532207, "grad/layer_0/mlp": 0.0029904856346547604, "grad/layer_0/attn_mlp_ratio": 0.8901426963768422, "grad/layer_4/attn": 0.0020668748766183853, "grad/layer_4/mlp": 0.002597563434392214, "grad/layer_4/attn_mlp_ratio": 0.7956975254897852, "grad/layer_8/attn": 0.0033552993554621935, "grad/layer_8/mlp": 0.00375740765593946, "grad/layer_8/attn_mlp_ratio": 0.8929824957534512, "grad/layer_12/attn": 0.004371337592601776, "grad/layer_12/mlp": 0.0066980281844735146, "grad/layer_12/attn_mlp_ratio": 0.6526305066126452, "grad/layer_16/attn": 0.004390020854771137, "grad/layer_16/mlp": 0.0043432326056063175, "grad/layer_16/attn_mlp_ratio": 1.010772655378199, "grad/layer_20/attn": 0.0025754785165190697, "grad/layer_20/mlp": 0.005031208507716656, "grad/layer_20/attn_mlp_ratio": 0.5119005625345985, "grad/layer_24/attn": 0.006310509517788887, "grad/layer_24/mlp": 0.006788823753595352, "grad/layer_24/attn_mlp_ratio": 0.9295438582409014, "grad/layer_27/attn": 0.006789486855268478, "grad/layer_27/mlp": 0.0062257070094347, "grad/layer_27/attn_mlp_ratio": 1.0905567409329944} {"step": 77350, "timestamp": 1778278147.7400038, "train/loss": 2.050692141056061, "train/z_loss": 0.0014066104544326664, "train/perplexity": 7.7732794506536225, "train/grad_norm": 0.10302734375, "optim/muon_lr": 0.0093818324431777, "optim/adamw_lr": 0.000281454973295331, "perf/tokens_per_sec": 2026803.2032018544, "perf/iters_per_sec": 0.9664550796517631, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347092390060424, "data/tokens_consumed": 162216804352, "data/tokens_consumed_B": 162.216804352, "train/loss_slope": -4.020060593038148e-06} {"step": 77360, "timestamp": 1778278158.0880938, "train/loss": 2.066342270374298, "train/z_loss": 0.0014007736346684397, "train/perplexity": 7.895889206024976, "train/grad_norm": 0.078125, "optim/muon_lr": 0.009343250319361687, "optim/adamw_lr": 0.0002802975095808506, "perf/tokens_per_sec": 2027733.7427043794, "perf/iters_per_sec": 0.9668987954637429, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342344045639038, "data/tokens_consumed": 162237775872, "data/tokens_consumed_B": 162.237775872, "train/loss_slope": -3.644534785433436e-06} {"step": 77370, "timestamp": 1778278168.8589225, "train/loss": 2.1049736976623534, "train/z_loss": 0.0013979491894133389, "train/perplexity": 8.206887146440215, "train/grad_norm": 0.0810546875, "optim/muon_lr": 0.00930467687547207, "optim/adamw_lr": 0.000279140306264162, "perf/tokens_per_sec": 1947935.059465758, "perf/iters_per_sec": 0.9288478181198874, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0766026258468628, "data/tokens_consumed": 162258747392, "data/tokens_consumed_B": 162.258747392, "train/loss_slope": -3.844765088882186e-06} {"step": 77380, "timestamp": 1778278179.2105572, "train/loss": 2.1089484214782717, "train/z_loss": 0.0014030850026756525, "train/perplexity": 8.239572170175865, "train/grad_norm": 0.07958984375, "optim/muon_lr": 0.009266114905476571, "optim/adamw_lr": 0.00027798344716429706, "perf/tokens_per_sec": 2026940.0486299163, "perf/iters_per_sec": 0.9665203326367933, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346393823623656, "data/tokens_consumed": 162279718912, "data/tokens_consumed_B": 162.279718912, "train/loss_slope": -1.9955170465738187e-06} {"step": 77390, "timestamp": 1778278189.5561743, "train/loss": 2.047335517406464, "train/z_loss": 0.0014073540107347072, "train/perplexity": 7.7472312185275705, "train/grad_norm": 0.0810546875, "optim/muon_lr": 0.00922756403684616, "optim/adamw_lr": 0.0002768269211053848, "perf/tokens_per_sec": 2028067.320749427, "perf/iters_per_sec": 0.9670578578707824, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340642929077148, "data/tokens_consumed": 162300690432, "data/tokens_consumed_B": 162.300690432, "train/loss_slope": -3.448186977968944e-06} {"step": 77400, "timestamp": 1778278199.9012349, "grad/layer_0/attn": 0.0023397679906338453, "grad/layer_0/mlp": 0.002743746852502227, "grad/layer_0/attn_mlp_ratio": 0.8527637683570043, "grad/layer_4/attn": 0.0022420622408390045, "grad/layer_4/mlp": 0.00255321292206645, "grad/layer_4/attn_mlp_ratio": 0.8781336384632666, "grad/layer_8/attn": 0.0037496532313525677, "grad/layer_8/mlp": 0.003509329864755273, "grad/layer_8/attn_mlp_ratio": 1.0684812397269272, "grad/layer_12/attn": 0.004915231838822365, "grad/layer_12/mlp": 0.006599243264645338, "grad/layer_12/attn_mlp_ratio": 0.7448174839490138, "grad/layer_16/attn": 0.0030184751376509666, "grad/layer_16/mlp": 0.0044389874674379826, "grad/layer_16/attn_mlp_ratio": 0.6799917980831647, "grad/layer_20/attn": 0.0029788108076900244, "grad/layer_20/mlp": 0.0054305437952280045, "grad/layer_20/attn_mlp_ratio": 0.5485289991500856, "grad/layer_24/attn": 0.007044630590826273, "grad/layer_24/mlp": 0.006561825051903725, "grad/layer_24/attn_mlp_ratio": 1.0735779189091128, "grad/layer_27/attn": 0.0056204465217888355, "grad/layer_27/mlp": 0.00588595075532794, "grad/layer_27/attn_mlp_ratio": 0.9548918534889277} {"step": 77400, "timestamp": 1778278200.506575, "eos/sharpness": 15.49206972122192, "eos/L0_probe": 1.9196317195892334, "eos/L_plus": 2.002695322036743, "eos/L_minus": 1.9914888143539429, "eos/grad_norm": 0.07931399345397949, "eos/embed_grad_frac": 0.29096102714538574, "eos/time_s": 0.6026854515075684} {"step": 77400, "timestamp": 1778278200.5268047, "train/loss": 2.066989231109619, "train/z_loss": 0.001406639686319977, "train/perplexity": 7.9009991891126905, "train/grad_norm": 0.0791015625, "optim/muon_lr": 0.009189024642109871, "optim/adamw_lr": 0.0002756707392632961, "perf/tokens_per_sec": 1913230.9400642156, "perf/iters_per_sec": 0.9122996044465139, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0961311340332032, "data/tokens_consumed": 162321661952, "data/tokens_consumed_B": 162.321661952, "train/loss_slope": -6.369588191252367e-06} {"step": 77400, "timestamp": 1778278201.8916845, "geo/rankme_last": 440.39215087890625, "geo/layer_0/stable_rank_q_proj": 18.77236557006836, "geo/layer_0/stable_rank_k_proj": 15.721662521362305, "geo/layer_0/stable_rank_o_proj": 46.37021255493164, "geo/layer_0/stable_rank_gate_proj": 127.43064880371094, "geo/layer_0/stable_rank_down_proj": 56.79615783691406, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06450684368610382, "geo/layer_0/attn_entropy_mean": 6.128223419189453, "geo/layer_0/attn_entropy_std": 0.4296383261680603, "geo/layer_7/stable_rank_q_proj": 42.63311767578125, "geo/layer_7/stable_rank_k_proj": 40.03140640258789, "geo/layer_7/stable_rank_o_proj": 88.01715087890625, "geo/layer_7/stable_rank_gate_proj": 76.91573333740234, "geo/layer_7/stable_rank_down_proj": 140.17037963867188, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4373054802417755, "geo/layer_7/attn_entropy_mean": 4.643777847290039, "geo/layer_7/attn_entropy_std": 0.7810603976249695, "geo/layer_14/stable_rank_q_proj": 49.34880447387695, "geo/layer_14/stable_rank_k_proj": 41.58464813232422, "geo/layer_14/stable_rank_o_proj": 43.1060791015625, "geo/layer_14/stable_rank_gate_proj": 70.22602081298828, "geo/layer_14/stable_rank_down_proj": 124.85728454589844, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.41538262367248535, "geo/layer_14/attn_entropy_mean": 5.501728057861328, "geo/layer_14/attn_entropy_std": 0.4059959352016449, "geo/layer_21/stable_rank_q_proj": 39.50686264038086, "geo/layer_21/stable_rank_k_proj": 30.093576431274414, "geo/layer_21/stable_rank_o_proj": 67.85319519042969, "geo/layer_21/stable_rank_gate_proj": 63.07192611694336, "geo/layer_21/stable_rank_down_proj": 49.3079948425293, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14925220608711243, "geo/layer_21/attn_entropy_mean": 5.675065994262695, "geo/layer_21/attn_entropy_std": 0.2921721041202545, "geo/layer_27/stable_rank_q_proj": 43.89377212524414, "geo/layer_27/stable_rank_k_proj": 32.38153076171875, "geo/layer_27/stable_rank_o_proj": 115.33829498291016, "geo/layer_27/stable_rank_gate_proj": 76.50060272216797, "geo/layer_27/stable_rank_down_proj": 127.6084213256836, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09162595123052597, "geo/layer_27/attn_entropy_mean": 4.146643161773682, "geo/layer_27/attn_entropy_std": 0.7911043763160706, "attnres/final_alpha/block_0": 0.2420118749141693, "attnres/block_norm/0": 1.7790648937225342, "attnres/final_alpha/block_1": 0.0038957749493420124, "attnres/block_norm/1": 47653.671875, "attnres/final_alpha/block_2": 0.009419501759111881, "attnres/block_norm/2": 29053.58984375, "attnres/final_alpha/block_3": 0.011058484204113483, "attnres/block_norm/3": 63603.99609375, "attnres/final_alpha/block_4": 0.013142352923750877, "attnres/block_norm/4": 15772.9091796875, "attnres/final_alpha/block_5": 0.6143510341644287, "attnres/block_norm/5": 6846.0126953125, "attnres/final_alpha/block_6": 0.1061210185289383, "attnres/block_norm/6": 40900.40625, "geo/tier1_time_s": 1.360931158065796, "geo/step": 77400.0, "geo/rankme_slope": 0.0004922423266181473} {"step": 77410, "timestamp": 1778278212.2368753, "train/loss": 2.0363098263740538, "train/z_loss": 0.0014182291575707494, "train/perplexity": 7.662281814275007, "train/grad_norm": 0.0732421875, "optim/muon_lr": 0.009150497317314148, "optim/adamw_lr": 0.0002745149195194244, "perf/tokens_per_sec": 1791477.1603961624, "perf/iters_per_sec": 0.85424287814911, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1706272602081298, "data/tokens_consumed": 162342633472, "data/tokens_consumed_B": 162.342633472, "train/loss_slope": -8.888976279944526e-06} {"step": 77420, "timestamp": 1778278222.5883486, "train/loss": 2.110065531730652, "train/z_loss": 0.0014038822613656522, "train/perplexity": 8.24878182386392, "train/grad_norm": 0.09423828125, "optim/muon_lr": 0.009111982807517053, "optim/adamw_lr": 0.0002733594842255115, "perf/tokens_per_sec": 2026845.6092971964, "perf/iters_per_sec": 0.9664753004537565, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03468759059906, "data/tokens_consumed": 162363604992, "data/tokens_consumed_B": 162.363604992, "train/loss_slope": -5.4961060604961285e-06} {"step": 77430, "timestamp": 1778278232.9478781, "train/loss": 2.1317794799804686, "train/z_loss": 0.0014055273844860495, "train/perplexity": 8.429854229419657, "train/grad_norm": 0.0888671875, "optim/muon_lr": 0.009073480293154716, "optim/adamw_lr": 0.0002722044087946415, "perf/tokens_per_sec": 2025518.691650348, "perf/iters_per_sec": 0.9658425768138638, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0353654146194458, "data/tokens_consumed": 162384576512, "data/tokens_consumed_B": 162.384576512, "train/loss_slope": -3.5163289821794863e-06} {"step": 77440, "timestamp": 1778278243.2960627, "train/loss": 2.0989362359046937, "train/z_loss": 0.0014066736795939506, "train/perplexity": 8.157487652975233, "train/grad_norm": 0.0888671875, "optim/muon_lr": 0.009034992828965187, "optim/adamw_lr": 0.00027104978486895557, "perf/tokens_per_sec": 2027565.5693520708, "perf/iters_per_sec": 0.9668186041603426, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343201875686645, "data/tokens_consumed": 162405548032, "data/tokens_consumed_B": 162.405548032, "train/loss_slope": -3.07210384458165e-06} {"step": 77450, "timestamp": 1778278253.6328359, "grad/layer_0/attn": 0.0025462659541517496, "grad/layer_0/mlp": 0.0030055276583880186, "grad/layer_0/attn_mlp_ratio": 0.8471942896037038, "grad/layer_4/attn": 0.0020320171024650335, "grad/layer_4/mlp": 0.0026244146283715963, "grad/layer_4/attn_mlp_ratio": 0.7742743860174359, "grad/layer_8/attn": 0.006995870266109705, "grad/layer_8/mlp": 0.0037041257601231337, "grad/layer_8/attn_mlp_ratio": 1.8886696970596852, "grad/layer_12/attn": 0.004836120177060366, "grad/layer_12/mlp": 0.00699882535263896, "grad/layer_12/attn_mlp_ratio": 0.690990253977097, "grad/layer_16/attn": 0.0043642558157444, "grad/layer_16/mlp": 0.004836004227399826, "grad/layer_16/attn_mlp_ratio": 0.9024507672620156, "grad/layer_20/attn": 0.002870543161407113, "grad/layer_20/mlp": 0.005424760282039642, "grad/layer_20/attn_mlp_ratio": 0.5291557523740477, "grad/layer_24/attn": 0.007130784913897514, "grad/layer_24/mlp": 0.007265379186719656, "grad/layer_24/attn_mlp_ratio": 0.9814745565908493, "grad/layer_27/attn": 0.006519597955048084, "grad/layer_27/mlp": 0.006691606715321541, "grad/layer_27/attn_mlp_ratio": 0.974294834556087} {"step": 77450, "timestamp": 1778278253.648608, "train/loss": 2.1034387707710267, "train/z_loss": 0.0013934327173046768, "train/perplexity": 8.19429983743591, "train/grad_norm": 0.0849609375, "optim/muon_lr": 0.008996519818902015, "optim/adamw_lr": 0.00026989559456706044, "perf/tokens_per_sec": 2026645.4102031162, "perf/iters_per_sec": 0.9663798380866605, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034789800643921, "data/tokens_consumed": 162426519552, "data/tokens_consumed_B": 162.426519552, "train/loss_slope": -3.86422055281923e-06} {"step": 77460, "timestamp": 1778278264.0006099, "train/loss": 2.1141493797302244, "train/z_loss": 0.0014160977443680167, "train/perplexity": 8.282537474573335, "train/grad_norm": 0.10986328125, "optim/muon_lr": 0.008958061784505844, "optim/adamw_lr": 0.0002687418535351753, "perf/tokens_per_sec": 2027366.72330019, "perf/iters_per_sec": 0.9667237869740438, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344216346740722, "data/tokens_consumed": 162447491072, "data/tokens_consumed_B": 162.447491072, "train/loss_slope": -1.6922862687841467e-06} {"step": 77470, "timestamp": 1778278274.3509247, "train/loss": 2.1015379190444947, "train/z_loss": 0.0014121459447778762, "train/perplexity": 8.178738483041455, "train/grad_norm": 0.10498046875, "optim/muon_lr": 0.00891961932182312, "optim/adamw_lr": 0.0002675885796546936, "perf/tokens_per_sec": 2027106.2495969625, "perf/iters_per_sec": 0.9665995834336102, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345545530319213, "data/tokens_consumed": 162468462592, "data/tokens_consumed_B": 162.468462592, "train/loss_slope": -2.148694409788987e-06} {"step": 77475, "timestamp": 1778278280.1224453, "eos/sharpness": 42.02349185943603, "eos/L0_probe": 1.9153655767440796, "eos/L_plus": 2.144146680831909, "eos/L_minus": 2.1068193912506104, "eos/grad_norm": 0.09360451996326447, "eos/embed_grad_frac": 0.2063925564289093, "eos/time_s": 0.6073951721191406} {"step": 77475, "timestamp": 1778278281.5016809, "geo/rankme_last": 441.0646667480469, "geo/layer_0/stable_rank_q_proj": 18.76471519470215, "geo/layer_0/stable_rank_k_proj": 15.717069625854492, "geo/layer_0/stable_rank_o_proj": 46.39035415649414, "geo/layer_0/stable_rank_gate_proj": 127.43553924560547, "geo/layer_0/stable_rank_down_proj": 56.767452239990234, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0636865422129631, "geo/layer_0/attn_entropy_mean": 6.127950668334961, "geo/layer_0/attn_entropy_std": 0.43036237359046936, "geo/layer_7/stable_rank_q_proj": 42.6116943359375, "geo/layer_7/stable_rank_k_proj": 40.05172348022461, "geo/layer_7/stable_rank_o_proj": 88.006591796875, "geo/layer_7/stable_rank_gate_proj": 76.8824462890625, "geo/layer_7/stable_rank_down_proj": 140.2967071533203, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4413275420665741, "geo/layer_7/attn_entropy_mean": 4.653807640075684, "geo/layer_7/attn_entropy_std": 0.768128514289856, "geo/layer_14/stable_rank_q_proj": 49.30217361450195, "geo/layer_14/stable_rank_k_proj": 41.58287811279297, "geo/layer_14/stable_rank_o_proj": 43.09867477416992, "geo/layer_14/stable_rank_gate_proj": 70.2311019897461, "geo/layer_14/stable_rank_down_proj": 124.92572784423828, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4099538326263428, "geo/layer_14/attn_entropy_mean": 5.531068801879883, "geo/layer_14/attn_entropy_std": 0.4085957705974579, "geo/layer_21/stable_rank_q_proj": 39.50988006591797, "geo/layer_21/stable_rank_k_proj": 30.107328414916992, "geo/layer_21/stable_rank_o_proj": 67.88516998291016, "geo/layer_21/stable_rank_gate_proj": 63.04018020629883, "geo/layer_21/stable_rank_down_proj": 49.27965545654297, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14869271218776703, "geo/layer_21/attn_entropy_mean": 5.682282447814941, "geo/layer_21/attn_entropy_std": 0.2855203151702881, "geo/layer_27/stable_rank_q_proj": 43.89219665527344, "geo/layer_27/stable_rank_k_proj": 32.380828857421875, "geo/layer_27/stable_rank_o_proj": 115.3081283569336, "geo/layer_27/stable_rank_gate_proj": 76.4678726196289, "geo/layer_27/stable_rank_down_proj": 127.62201690673828, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08668050169944763, "geo/layer_27/attn_entropy_mean": 4.148928642272949, "geo/layer_27/attn_entropy_std": 0.7902425527572632, "attnres/final_alpha/block_0": 0.24065056443214417, "attnres/block_norm/0": 1.7790892124176025, "attnres/final_alpha/block_1": 0.0038858240004628897, "attnres/block_norm/1": 47684.5859375, "attnres/final_alpha/block_2": 0.009285712614655495, "attnres/block_norm/2": 29128.0390625, "attnres/final_alpha/block_3": 0.010852435603737831, "attnres/block_norm/3": 63561.609375, "attnres/final_alpha/block_4": 0.012931986711919308, "attnres/block_norm/4": 15740.12109375, "attnres/final_alpha/block_5": 0.6160378456115723, "attnres/block_norm/5": 6826.87939453125, "attnres/final_alpha/block_6": 0.10635566711425781, "attnres/block_norm/6": 40878.01171875, "geo/tier1_time_s": 1.359896183013916, "geo/step": 77475.0, "geo/rankme_slope": 0.000552088472107593} {"step": 77480, "timestamp": 1778278286.6862285, "train/loss": 2.1263072967529295, "train/z_loss": 0.001405254378914833, "train/perplexity": 8.383850507689644, "train/grad_norm": 0.07861328125, "optim/muon_lr": 0.008881193026900292, "optim/adamw_lr": 0.0002664357908070087, "perf/tokens_per_sec": 1700800.7585785068, "perf/iters_per_sec": 0.8110050003902944, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2330380201339721, "data/tokens_consumed": 162489434112, "data/tokens_consumed_B": 162.489434112, "train/loss_slope": -3.380683818236573e-07} {"step": 77490, "timestamp": 1778278297.0347729, "train/loss": 2.0643330574035645, "train/z_loss": 0.0014163482584990561, "train/perplexity": 7.880040609950803, "train/grad_norm": 0.08349609375, "optim/muon_lr": 0.008842782303690911, "optim/adamw_lr": 0.00026528346911072726, "perf/tokens_per_sec": 2027620.299751769, "perf/iters_per_sec": 0.966844701648602, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342922687530518, "data/tokens_consumed": 162510405632, "data/tokens_consumed_B": 162.510405632, "train/loss_slope": -1.4232169176199132e-06} {"step": 77500, "timestamp": 1778278307.3736422, "grad/layer_0/attn": 0.002396184718236327, "grad/layer_0/mlp": 0.002889862284064293, "grad/layer_0/attn_mlp_ratio": 0.829169143641484, "grad/layer_4/attn": 0.0019703535363078117, "grad/layer_4/mlp": 0.0025056630838662386, "grad/layer_4/attn_mlp_ratio": 0.7863600938046095, "grad/layer_8/attn": 0.005012047477066517, "grad/layer_8/mlp": 0.003355514956638217, "grad/layer_8/attn_mlp_ratio": 1.4936745603781991, "grad/layer_12/attn": 0.003728402778506279, "grad/layer_12/mlp": 0.006615982390940189, "grad/layer_12/attn_mlp_ratio": 0.5635448376128396, "grad/layer_16/attn": 0.0032090151216834784, "grad/layer_16/mlp": 0.004293826408684254, "grad/layer_16/attn_mlp_ratio": 0.747355561570374, "grad/layer_20/attn": 0.002626194152981043, "grad/layer_20/mlp": 0.0048237210139632225, "grad/layer_20/attn_mlp_ratio": 0.5444332478880258, "grad/layer_24/attn": 0.004267112817615271, "grad/layer_24/mlp": 0.006722494959831238, "grad/layer_24/attn_mlp_ratio": 0.6347513504490985, "grad/layer_27/attn": 0.004201820120215416, "grad/layer_27/mlp": 0.00584173109382391, "grad/layer_27/attn_mlp_ratio": 0.7192765262218387} {"step": 77500, "timestamp": 1778278307.3893204, "train/loss": 2.088090968132019, "train/z_loss": 0.0014106239308603109, "train/perplexity": 8.069495526579333, "train/grad_norm": 0.0810546875, "optim/muon_lr": 0.008804389983415603, "optim/adamw_lr": 0.00026413169950246807, "perf/tokens_per_sec": 2026282.985626835, "perf/iters_per_sec": 0.9662070205816435, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349748849868774, "data/tokens_consumed": 162531377152, "data/tokens_consumed_B": 162.531377152, "train/loss_slope": 1.1133476452465962e-06} {"step": 77500, "timestamp": 1778278314.415673, "geo/ww_alpha_mean": 7.833796342294632, "geo/ww_alpha_std": 4.869766677918232, "geo/ww_alpha_min": 1.3127843360877132, "geo/ww_alpha_max": 33.009482688052614, "geo/ww_alpha_healthy_frac": 0.17766497461928935, "geo/ww_alpha_by_type/q_proj": 3.949936103140609, "geo/ww_alpha_by_type/k_proj": 4.457731639941614, "geo/ww_alpha_by_type/v_proj": 8.779024028494025, "geo/ww_alpha_by_type/o_proj": 9.52197235843487, "geo/ww_alpha_by_type/gate_proj": 7.9641943887847555, "geo/ww_alpha_by_type/up_proj": 12.19776649979077, "geo/ww_alpha_by_type/down_proj": 8.056779299560484, "geo/twonn_id/layer_0": 0.691682755947113, "geo/twonn_id/layer_7": 2.9821701049804688, "geo/twonn_id/layer_14": 5.1093573570251465, "geo/twonn_id/layer_21": 7.397437572479248, "geo/twonn_id/layer_27": 5.3076653480529785, "geo/tier2_time_s": 7.020065546035767} {"step": 77500, "timestamp": 1778278315.056524, "eoc/jacobian_sigma/layer_0/attn": 1322.3543701171875, "eoc/jacobian_sigma/layer_0/mlp": 9234.3447265625, "eoc/jacobian_sigma/layer_0": 9234.3447265625, "eoc/jacobian_sigma/layer_7/attn": 1.1540307998657227, "eoc/jacobian_sigma/layer_7/mlp": 1.8333802223205566, "eoc/jacobian_sigma/layer_7": 1.8333802223205566, "eoc/jacobian_sigma/layer_14/attn": 1.3856173753738403, "eoc/jacobian_sigma/layer_14/mlp": 6.552102565765381, "eoc/jacobian_sigma/layer_14": 6.552102565765381, "eoc/jacobian_sigma/layer_21/attn": 1.1120941638946533, "eoc/jacobian_sigma/layer_21/mlp": 4.502744674682617, "eoc/jacobian_sigma/layer_21": 4.502744674682617, "eoc/jacobian_sigma/layer_27/attn": 3.0059151649475098, "eoc/jacobian_sigma/layer_27/mlp": 28.5888671875, "eoc/jacobian_sigma/layer_27": 28.5888671875, "eoc/layer0_sigma": 9234.3447265625, "eoc/sigma_max": 28.5888671875, "eoc/sigma_min": 1.8333802223205566, "eoc/sigma_mean": 10.369273662567139, "eoc/time_s": 0.6332998275756836} {"step": 77510, "timestamp": 1778278325.4249284, "train/loss": 2.10412220954895, "train/z_loss": 0.0013887381181120872, "train/perplexity": 8.19990205387066, "train/grad_norm": 0.1083984375, "optim/muon_lr": 0.008766015619039535, "optim/adamw_lr": 0.00026298046857118604, "perf/tokens_per_sec": 1163112.5098737238, "perf/iters_per_sec": 0.5546152638786906, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.8030517101287842, "data/tokens_consumed": 162552348672, "data/tokens_consumed_B": 162.552348672, "train/loss_slope": 5.5504643734675705e-06} {"step": 77520, "timestamp": 1778278335.7757847, "train/loss": 1.9934865713119507, "train/z_loss": 0.0014233679976314306, "train/perplexity": 7.341084408648925, "train/grad_norm": 0.0830078125, "optim/muon_lr": 0.008727659583091737, "optim/adamw_lr": 0.00026182978749275204, "perf/tokens_per_sec": 2027151.7050290392, "perf/iters_per_sec": 0.9666212582726665, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345313549041748, "data/tokens_consumed": 162573320192, "data/tokens_consumed_B": 162.573320192, "train/loss_slope": -1.4030783232932523e-06} {"step": 77530, "timestamp": 1778278346.127786, "train/loss": 2.076527750492096, "train/z_loss": 0.0014026998775079846, "train/perplexity": 7.976723598253626, "train/grad_norm": 0.08349609375, "optim/muon_lr": 0.008689322620630265, "optim/adamw_lr": 0.0002606796786189079, "perf/tokens_per_sec": 2026640.6940696342, "perf/iters_per_sec": 0.9663775892589732, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347922086715697, "data/tokens_consumed": 162594291712, "data/tokens_consumed_B": 162.594291712, "train/loss_slope": -3.611472316092942e-06} {"step": 77540, "timestamp": 1778278356.476216, "train/loss": 2.0736115694046022, "train/z_loss": 0.0014190003857947886, "train/perplexity": 7.953495912286396, "train/grad_norm": 0.0849609375, "optim/muon_lr": 0.008651005327701569, "optim/adamw_lr": 0.00025953015983104706, "perf/tokens_per_sec": 2027653.7657387583, "perf/iters_per_sec": 0.9668606594747344, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034275197982788, "data/tokens_consumed": 162615263232, "data/tokens_consumed_B": 162.615263232, "train/loss_slope": -6.6730794983870895e-06} {"step": 77550, "timestamp": 1778278367.1969674, "grad/layer_0/attn": 0.0027035686653107405, "grad/layer_0/mlp": 0.00311827240511775, "grad/layer_0/attn_mlp_ratio": 0.8670084673079751, "grad/layer_4/attn": 0.001831444795243442, "grad/layer_4/mlp": 0.0025799323339015245, "grad/layer_4/attn_mlp_ratio": 0.7098809143903914, "grad/layer_8/attn": 0.002839491469785571, "grad/layer_8/mlp": 0.0036368437577039003, "grad/layer_8/attn_mlp_ratio": 0.7807570467372957, "grad/layer_12/attn": 0.0052842069417238235, "grad/layer_12/mlp": 0.006452408619225025, "grad/layer_12/attn_mlp_ratio": 0.8189510571423458, "grad/layer_16/attn": 0.0032519823871552944, "grad/layer_16/mlp": 0.004195916000753641, "grad/layer_16/attn_mlp_ratio": 0.7750351315583256, "grad/layer_20/attn": 0.0024206950329244137, "grad/layer_20/mlp": 0.004680061247199774, "grad/layer_20/attn_mlp_ratio": 0.517235748281973, "grad/layer_24/attn": 0.005556843243539333, "grad/layer_24/mlp": 0.006501044612377882, "grad/layer_24/attn_mlp_ratio": 0.8547615790057859, "grad/layer_27/attn": 0.005708959884941578, "grad/layer_27/mlp": 0.005879694130271673, "grad/layer_27/attn_mlp_ratio": 0.9709620366904341} {"step": 77550, "timestamp": 1778278367.824676, "eos/sharpness": 15.836250782012936, "eos/L0_probe": 1.9175883531570435, "eos/L_plus": 2.0042710304260254, "eos/L_minus": 1.989268183708191, "eos/grad_norm": 0.08078111708164215, "eos/embed_grad_frac": 0.29773756861686707, "eos/time_s": 0.6247982978820801} {"step": 77550, "timestamp": 1778278367.8455882, "train/loss": 2.0831586718559265, "train/z_loss": 0.001409977674484253, "train/perplexity": 8.029792378179124, "train/grad_norm": 0.0810546875, "optim/muon_lr": 0.008612706959247589, "optim/adamw_lr": 0.00025838120877742766, "perf/tokens_per_sec": 1845351.6861382846, "perf/iters_per_sec": 0.8799322539035247, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1364511251449585, "data/tokens_consumed": 162636234752, "data/tokens_consumed_B": 162.636234752, "train/loss_slope": -6.263052648228917e-06} {"step": 77550, "timestamp": 1778278369.2089179, "geo/rankme_last": 439.7412109375, "geo/layer_0/stable_rank_q_proj": 18.765884399414062, "geo/layer_0/stable_rank_k_proj": 15.710909843444824, "geo/layer_0/stable_rank_o_proj": 46.4033317565918, "geo/layer_0/stable_rank_gate_proj": 127.4259262084961, "geo/layer_0/stable_rank_down_proj": 56.76325607299805, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06983523815870285, "geo/layer_0/attn_entropy_mean": 6.1259918212890625, "geo/layer_0/attn_entropy_std": 0.43025538325309753, "geo/layer_7/stable_rank_q_proj": 42.59618377685547, "geo/layer_7/stable_rank_k_proj": 40.019378662109375, "geo/layer_7/stable_rank_o_proj": 87.96762084960938, "geo/layer_7/stable_rank_gate_proj": 76.88094329833984, "geo/layer_7/stable_rank_down_proj": 140.2843475341797, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.44198837876319885, "geo/layer_7/attn_entropy_mean": 4.638312339782715, "geo/layer_7/attn_entropy_std": 0.7723802328109741, "geo/layer_14/stable_rank_q_proj": 49.3111572265625, "geo/layer_14/stable_rank_k_proj": 41.57305145263672, "geo/layer_14/stable_rank_o_proj": 43.11009979248047, "geo/layer_14/stable_rank_gate_proj": 70.26728057861328, "geo/layer_14/stable_rank_down_proj": 124.92568969726562, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4049152135848999, "geo/layer_14/attn_entropy_mean": 5.503136157989502, "geo/layer_14/attn_entropy_std": 0.40500587224960327, "geo/layer_21/stable_rank_q_proj": 39.5183219909668, "geo/layer_21/stable_rank_k_proj": 30.10840606689453, "geo/layer_21/stable_rank_o_proj": 67.9035415649414, "geo/layer_21/stable_rank_gate_proj": 63.06308364868164, "geo/layer_21/stable_rank_down_proj": 49.26195526123047, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.148795023560524, "geo/layer_21/attn_entropy_mean": 5.6778974533081055, "geo/layer_21/attn_entropy_std": 0.28554022312164307, "geo/layer_27/stable_rank_q_proj": 43.89051055908203, "geo/layer_27/stable_rank_k_proj": 32.392024993896484, "geo/layer_27/stable_rank_o_proj": 115.2898941040039, "geo/layer_27/stable_rank_gate_proj": 76.43805694580078, "geo/layer_27/stable_rank_down_proj": 127.68559265136719, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0931236669421196, "geo/layer_27/attn_entropy_mean": 4.1483154296875, "geo/layer_27/attn_entropy_std": 0.7858484983444214, "attnres/final_alpha/block_0": 0.24124804139137268, "attnres/block_norm/0": 1.7791128158569336, "attnres/final_alpha/block_1": 0.0038864517118781805, "attnres/block_norm/1": 47672.0, "attnres/final_alpha/block_2": 0.009439711458981037, "attnres/block_norm/2": 29109.744140625, "attnres/final_alpha/block_3": 0.01101376861333847, "attnres/block_norm/3": 63565.59765625, "attnres/final_alpha/block_4": 0.01311999000608921, "attnres/block_norm/4": 15753.54296875, "attnres/final_alpha/block_5": 0.6143137216567993, "attnres/block_norm/5": 6844.1806640625, "attnres/final_alpha/block_6": 0.10697827488183975, "attnres/block_norm/6": 41126.46875, "geo/tier1_time_s": 1.3587441444396973, "geo/step": 77550.0, "geo/rankme_slope": 0.0005723763333458383} {"step": 77560, "timestamp": 1778278380.0698035, "train/loss": 2.052438747882843, "train/z_loss": 0.0014198009390383958, "train/perplexity": 7.786868177235409, "train/grad_norm": 0.0771484375, "optim/muon_lr": 0.008574430495500565, "optim/adamw_lr": 0.0002572329148650169, "perf/tokens_per_sec": 1716189.5450448361, "perf/iters_per_sec": 0.8183429455970936, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2219815731048584, "data/tokens_consumed": 162657206272, "data/tokens_consumed_B": 162.657206272, "train/loss_slope": -9.983221525334676e-06} {"step": 77570, "timestamp": 1778278390.4218445, "train/loss": 2.044516754150391, "train/z_loss": 0.0014276983914896847, "train/perplexity": 7.725424356461977, "train/grad_norm": 0.08251953125, "optim/muon_lr": 0.008536175340414048, "optim/adamw_lr": 0.0002560852602124214, "perf/tokens_per_sec": 2026789.3795719072, "perf/iters_per_sec": 0.9664484880313431, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034716296195984, "data/tokens_consumed": 162678177792, "data/tokens_consumed_B": 162.678177792, "train/loss_slope": -1.311460581406642e-05} {"step": 77580, "timestamp": 1778278400.7693634, "train/loss": 2.1188193798065185, "train/z_loss": 0.0014110703254118563, "train/perplexity": 8.321307382487229, "train/grad_norm": 0.0830078125, "optim/muon_lr": 0.008497941941022873, "optim/adamw_lr": 0.00025493825823068615, "perf/tokens_per_sec": 2027703.31238477, "perf/iters_per_sec": 0.9668842851566172, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342499256134032, "data/tokens_consumed": 162699149312, "data/tokens_consumed_B": 162.699149312, "train/loss_slope": -1.2945672216051935e-05} {"step": 77590, "timestamp": 1778278411.1212592, "train/loss": 2.103466248512268, "train/z_loss": 0.0013994449633173644, "train/perplexity": 8.194525001379981, "train/grad_norm": 0.07861328125, "optim/muon_lr": 0.008459731191396714, "optim/adamw_lr": 0.0002537919357419014, "perf/tokens_per_sec": 2026784.9896748194, "perf/iters_per_sec": 0.9664463947652909, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347185373306274, "data/tokens_consumed": 162720120832, "data/tokens_consumed_B": 162.720120832, "train/loss_slope": -1.1884702822126911e-05} {"step": 77600, "timestamp": 1778278421.4686666, "grad/layer_0/attn": 0.002886958420276642, "grad/layer_0/mlp": 0.003212699666619301, "grad/layer_0/attn_mlp_ratio": 0.8986082204981656, "grad/layer_4/attn": 0.0021102656610310078, "grad/layer_4/mlp": 0.002548271557316184, "grad/layer_4/attn_mlp_ratio": 0.8281164431477921, "grad/layer_8/attn": 0.0038525632116943598, "grad/layer_8/mlp": 0.003508092137053609, "grad/layer_8/attn_mlp_ratio": 1.098193249026447, "grad/layer_12/attn": 0.0046400451101362705, "grad/layer_12/mlp": 0.0064230430871248245, "grad/layer_12/attn_mlp_ratio": 0.7224060270118338, "grad/layer_16/attn": 0.0037945809308439493, "grad/layer_16/mlp": 0.0044090477749705315, "grad/layer_16/attn_mlp_ratio": 0.8606350029413803, "grad/layer_20/attn": 0.0038588098250329494, "grad/layer_20/mlp": 0.0048241703771054745, "grad/layer_20/attn_mlp_ratio": 0.7998908503225728, "grad/layer_24/attn": 0.005725071299821138, "grad/layer_24/mlp": 0.006903682369738817, "grad/layer_24/attn_mlp_ratio": 0.8292778998622936, "grad/layer_27/attn": 0.005087938159704208, "grad/layer_27/mlp": 0.0058050877414643764, "grad/layer_27/attn_mlp_ratio": 0.8764618725253843} {"step": 77600, "timestamp": 1778278421.484595, "train/loss": 2.04506995677948, "train/z_loss": 0.0014174054260365665, "train/perplexity": 7.729699263862785, "train/grad_norm": 0.07177734375, "optim/muon_lr": 0.008421543389558792, "optim/adamw_lr": 0.00025264630168676377, "perf/tokens_per_sec": 2024671.503562795, "perf/iters_per_sec": 0.9654386060537314, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0357986450195313, "data/tokens_consumed": 162741092352, "data/tokens_consumed_B": 162.741092352, "train/loss_slope": -1.2906731566329769e-05} {"step": 77610, "timestamp": 1778278432.310026, "train/loss": 2.0902884244918822, "train/z_loss": 0.001398468972183764, "train/perplexity": 8.08724738817231, "train/grad_norm": 0.1240234375, "optim/muon_lr": 0.008383377939462661, "optim/adamw_lr": 0.00025150133818387983, "perf/tokens_per_sec": 1938132.11522739, "perf/iters_per_sec": 0.9241734100472403, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0820480108261108, "data/tokens_consumed": 162762063872, "data/tokens_consumed_B": 162.762063872, "train/loss_slope": -1.1387614491391388e-05} {"step": 77620, "timestamp": 1778278443.1492825, "train/loss": 2.0700190782547, "train/z_loss": 0.001413050398696214, "train/perplexity": 7.924974311085628, "train/grad_norm": 0.08935546875, "optim/muon_lr": 0.008345237970352174, "optim/adamw_lr": 0.0002503571391105652, "perf/tokens_per_sec": 1935756.287033354, "perf/iters_per_sec": 0.9230405268828172, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0833760499954224, "data/tokens_consumed": 162783035392, "data/tokens_consumed_B": 162.783035392, "train/loss_slope": -1.2530915042569226e-05} {"step": 77625, "timestamp": 1778278448.945323, "eos/sharpness": 49.5617389678955, "eos/L0_probe": 1.9184551239013672, "eos/L_plus": 2.194648027420044, "eos/L_minus": 2.1378796100616455, "eos/grad_norm": 0.09978967905044556, "eos/embed_grad_frac": 0.18822810053825378, "eos/time_s": 0.6195285320281982} {"step": 77625, "timestamp": 1778278450.3254926, "geo/rankme_last": 440.17694091796875, "geo/layer_0/stable_rank_q_proj": 18.768857955932617, "geo/layer_0/stable_rank_k_proj": 15.710463523864746, "geo/layer_0/stable_rank_o_proj": 46.392784118652344, "geo/layer_0/stable_rank_gate_proj": 127.41638946533203, "geo/layer_0/stable_rank_down_proj": 56.766658782958984, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06559479981660843, "geo/layer_0/attn_entropy_mean": 6.128146171569824, "geo/layer_0/attn_entropy_std": 0.429560124874115, "geo/layer_7/stable_rank_q_proj": 42.585838317871094, "geo/layer_7/stable_rank_k_proj": 40.01344299316406, "geo/layer_7/stable_rank_o_proj": 87.9488296508789, "geo/layer_7/stable_rank_gate_proj": 76.87236785888672, "geo/layer_7/stable_rank_down_proj": 140.39793395996094, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4511605203151703, "geo/layer_7/attn_entropy_mean": 4.642240524291992, "geo/layer_7/attn_entropy_std": 0.7698003053665161, "geo/layer_14/stable_rank_q_proj": 49.29705047607422, "geo/layer_14/stable_rank_k_proj": 41.5665168762207, "geo/layer_14/stable_rank_o_proj": 43.10756301879883, "geo/layer_14/stable_rank_gate_proj": 70.26710510253906, "geo/layer_14/stable_rank_down_proj": 124.87431335449219, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.41012752056121826, "geo/layer_14/attn_entropy_mean": 5.517536163330078, "geo/layer_14/attn_entropy_std": 0.4141480028629303, "geo/layer_21/stable_rank_q_proj": 39.498897552490234, "geo/layer_21/stable_rank_k_proj": 30.11756706237793, "geo/layer_21/stable_rank_o_proj": 67.91297149658203, "geo/layer_21/stable_rank_gate_proj": 63.03806686401367, "geo/layer_21/stable_rank_down_proj": 49.267173767089844, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14814352989196777, "geo/layer_21/attn_entropy_mean": 5.680479049682617, "geo/layer_21/attn_entropy_std": 0.2904343605041504, "geo/layer_27/stable_rank_q_proj": 43.887142181396484, "geo/layer_27/stable_rank_k_proj": 32.41866683959961, "geo/layer_27/stable_rank_o_proj": 115.21680450439453, "geo/layer_27/stable_rank_gate_proj": 76.4377212524414, "geo/layer_27/stable_rank_down_proj": 127.66852569580078, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08135837316513062, "geo/layer_27/attn_entropy_mean": 4.148720741271973, "geo/layer_27/attn_entropy_std": 0.801780104637146, "attnres/final_alpha/block_0": 0.24146611988544464, "attnres/block_norm/0": 1.7791255712509155, "attnres/final_alpha/block_1": 0.0038946864660829306, "attnres/block_norm/1": 47606.0234375, "attnres/final_alpha/block_2": 0.00931241363286972, "attnres/block_norm/2": 29124.4296875, "attnres/final_alpha/block_3": 0.010887794196605682, "attnres/block_norm/3": 63520.92578125, "attnres/final_alpha/block_4": 0.01289369910955429, "attnres/block_norm/4": 15763.03515625, "attnres/final_alpha/block_5": 0.6141537427902222, "attnres/block_norm/5": 6833.07275390625, "attnres/final_alpha/block_6": 0.10739153623580933, "attnres/block_norm/6": 40935.55859375, "geo/tier1_time_s": 1.3601806163787842, "geo/step": 77625.0, "geo/rankme_slope": 0.0005545149700505203} {"step": 77630, "timestamp": 1778278455.5115566, "train/loss": 2.1164749383926393, "train/z_loss": 0.0014034834108315407, "train/perplexity": 8.301821415621193, "train/grad_norm": 0.1259765625, "optim/muon_lr": 0.008307122588157655, "optim/adamw_lr": 0.0002492136776447296, "perf/tokens_per_sec": 1697514.528486071, "perf/iters_per_sec": 0.8094380037718157, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.235425066947937, "data/tokens_consumed": 162804006912, "data/tokens_consumed_B": 162.804006912, "train/loss_slope": -1.2037266978193088e-05} {"step": 77640, "timestamp": 1778278465.8795898, "train/loss": 2.1325152039527895, "train/z_loss": 0.0014023515861481429, "train/perplexity": 8.436058557315933, "train/grad_norm": 0.08056640625, "optim/muon_lr": 0.008269032537937164, "optim/adamw_lr": 0.0002480709761381149, "perf/tokens_per_sec": 2023786.6553406983, "perf/iters_per_sec": 0.9650166775420658, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0362515211105348, "data/tokens_consumed": 162824978432, "data/tokens_consumed_B": 162.824978432, "train/loss_slope": -1.2981019819816356e-05} {"step": 77650, "timestamp": 1778278476.2428017, "grad/layer_0/attn": 0.0027312440797686577, "grad/layer_0/mlp": 0.002820795401930809, "grad/layer_0/attn_mlp_ratio": 0.9682531321037438, "grad/layer_4/attn": 0.0023522046394646168, "grad/layer_4/mlp": 0.002529788762331009, "grad/layer_4/attn_mlp_ratio": 0.9298027493477216, "grad/layer_8/attn": 0.0036558660212904215, "grad/layer_8/mlp": 0.003700908971950412, "grad/layer_8/attn_mlp_ratio": 0.9878291928322755, "grad/layer_12/attn": 0.003817848628386855, "grad/layer_12/mlp": 0.006816623732447624, "grad/layer_12/attn_mlp_ratio": 0.5600791128026779, "grad/layer_16/attn": 0.0030568400397896767, "grad/layer_16/mlp": 0.004322964232414961, "grad/layer_16/attn_mlp_ratio": 0.7071166460635631, "grad/layer_20/attn": 0.0024595174472779036, "grad/layer_20/mlp": 0.005176926963031292, "grad/layer_20/attn_mlp_ratio": 0.47509215743861777, "grad/layer_24/attn": 0.008378224447369576, "grad/layer_24/mlp": 0.0071394494734704494, "grad/layer_24/attn_mlp_ratio": 1.173511257576817, "grad/layer_27/attn": 0.004243968520313501, "grad/layer_27/mlp": 0.005973984953016043, "grad/layer_27/attn_mlp_ratio": 0.7104082923961921} {"step": 77650, "timestamp": 1778278476.2587295, "train/loss": 2.0689571619033815, "train/z_loss": 0.0014091236516833305, "train/perplexity": 7.91656311806303, "train/grad_norm": 0.09033203125, "optim/muon_lr": 0.008230968415737153, "optim/adamw_lr": 0.00024692905247211455, "perf/tokens_per_sec": 2022220.7018144343, "perf/iters_per_sec": 0.9642699727127239, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0370539665222167, "data/tokens_consumed": 162845949952, "data/tokens_consumed_B": 162.845949952, "train/loss_slope": -1.3228459119295927e-05} {"step": 77660, "timestamp": 1778278486.6289291, "train/loss": 2.0818573355674745, "train/z_loss": 0.0013949172687716782, "train/perplexity": 8.01934971415119, "train/grad_norm": 0.078125, "optim/muon_lr": 0.008192930817604065, "optim/adamw_lr": 0.00024578792452812194, "perf/tokens_per_sec": 2023295.5363844426, "perf/iters_per_sec": 0.9647824937746251, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036503052711487, "data/tokens_consumed": 162866921472, "data/tokens_consumed_B": 162.866921472, "train/loss_slope": -1.5156065519958266e-05} {"step": 77670, "timestamp": 1778278496.999077, "train/loss": 2.0671029448509217, "train/z_loss": 0.001402724941726774, "train/perplexity": 7.90189769237563, "train/grad_norm": 0.07080078125, "optim/muon_lr": 0.00815491884946823, "optim/adamw_lr": 0.0002446475654840469, "perf/tokens_per_sec": 2023207.4860387854, "perf/iters_per_sec": 0.9647405080980231, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0365481615066527, "data/tokens_consumed": 162887892992, "data/tokens_consumed_B": 162.887892992, "train/loss_slope": -1.617014475352325e-05} {"step": 77680, "timestamp": 1778278507.366053, "train/loss": 2.1371634006500244, "train/z_loss": 0.001410322228912264, "train/perplexity": 8.47536229181871, "train/grad_norm": 0.0908203125, "optim/muon_lr": 0.008116935789585113, "optim/adamw_lr": 0.00024350807368755337, "perf/tokens_per_sec": 2024296.460790729, "perf/iters_per_sec": 0.9652597717240948, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03599054813385, "data/tokens_consumed": 162908864512, "data/tokens_consumed_B": 162.908864512, "train/loss_slope": -1.2339766190783702e-05} {"step": 77690, "timestamp": 1778278517.734774, "train/loss": 2.1274396061897276, "train/z_loss": 0.0014064865419641138, "train/perplexity": 8.393348997336084, "train/grad_norm": 0.09912109375, "optim/muon_lr": 0.00807898074388504, "optim/adamw_lr": 0.00024236942231655118, "perf/tokens_per_sec": 2023605.728245462, "perf/iters_per_sec": 0.9649304047801314, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0363441705703735, "data/tokens_consumed": 162929836032, "data/tokens_consumed_B": 162.929836032, "train/loss_slope": -9.05372519196949e-06} {"step": 77700, "timestamp": 1778278528.0925698, "grad/layer_0/attn": 0.0023343495558947325, "grad/layer_0/mlp": 0.0026144422590732574, "grad/layer_0/attn_mlp_ratio": 0.8928670956517819, "grad/layer_4/attn": 0.001947094569914043, "grad/layer_4/mlp": 0.0025491260457783937, "grad/layer_4/attn_mlp_ratio": 0.7638282527283419, "grad/layer_8/attn": 0.0033636929001659155, "grad/layer_8/mlp": 0.0035987666342407465, "grad/layer_8/attn_mlp_ratio": 0.9346793356073338, "grad/layer_12/attn": 0.004405189771205187, "grad/layer_12/mlp": 0.006400248501449823, "grad/layer_12/attn_mlp_ratio": 0.6882841660568149, "grad/layer_16/attn": 0.003029903396964073, "grad/layer_16/mlp": 0.00427349004894495, "grad/layer_16/attn_mlp_ratio": 0.7089997382378671, "grad/layer_20/attn": 0.003000770229846239, "grad/layer_20/mlp": 0.005007949657738209, "grad/layer_20/attn_mlp_ratio": 0.599201344863633, "grad/layer_24/attn": 0.004850149154663086, "grad/layer_24/mlp": 0.006737495306879282, "grad/layer_24/attn_mlp_ratio": 0.7198742057339095, "grad/layer_27/attn": 0.0034471277613192797, "grad/layer_27/mlp": 0.005976008716970682, "grad/layer_27/attn_mlp_ratio": 0.5768277569353846} {"step": 77700, "timestamp": 1778278528.7397556, "eos/sharpness": 24.64208602905273, "eos/L0_probe": 1.9157310724258423, "eos/L_plus": 2.0257456302642822, "eos/L_minus": 2.0521373748779297, "eos/grad_norm": 0.08206827938556671, "eos/embed_grad_frac": 0.2732927203178406, "eos/time_s": 0.6443207263946533} {"step": 77700, "timestamp": 1778278528.760175, "train/loss": 2.1124198198318482, "train/z_loss": 0.0014087018091231585, "train/perplexity": 8.268224710859498, "train/grad_norm": 0.08203125, "optim/muon_lr": 0.008041054606437684, "optim/adamw_lr": 0.00024123163819313048, "perf/tokens_per_sec": 1903046.5215979, "perf/iters_per_sec": 0.9074432952870846, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1019972324371339, "data/tokens_consumed": 162950807552, "data/tokens_consumed_B": 162.950807552, "train/loss_slope": -8.810630761714987e-06} {"step": 77700, "timestamp": 1778278530.1222098, "geo/rankme_last": 440.0483093261719, "geo/layer_0/stable_rank_q_proj": 18.761703491210938, "geo/layer_0/stable_rank_k_proj": 15.703934669494629, "geo/layer_0/stable_rank_o_proj": 46.357177734375, "geo/layer_0/stable_rank_gate_proj": 127.48108673095703, "geo/layer_0/stable_rank_down_proj": 56.776241302490234, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.061932213604450226, "geo/layer_0/attn_entropy_mean": 6.126449108123779, "geo/layer_0/attn_entropy_std": 0.4300251603126526, "geo/layer_7/stable_rank_q_proj": 42.59395217895508, "geo/layer_7/stable_rank_k_proj": 40.022361755371094, "geo/layer_7/stable_rank_o_proj": 87.91189575195312, "geo/layer_7/stable_rank_gate_proj": 76.86827087402344, "geo/layer_7/stable_rank_down_proj": 140.3706512451172, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.45258229970932007, "geo/layer_7/attn_entropy_mean": 4.651120662689209, "geo/layer_7/attn_entropy_std": 0.7676506638526917, "geo/layer_14/stable_rank_q_proj": 49.280452728271484, "geo/layer_14/stable_rank_k_proj": 41.55512619018555, "geo/layer_14/stable_rank_o_proj": 43.11042022705078, "geo/layer_14/stable_rank_gate_proj": 70.29716491699219, "geo/layer_14/stable_rank_down_proj": 124.84969329833984, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4139328896999359, "geo/layer_14/attn_entropy_mean": 5.534140110015869, "geo/layer_14/attn_entropy_std": 0.4137769341468811, "geo/layer_21/stable_rank_q_proj": 39.49747848510742, "geo/layer_21/stable_rank_k_proj": 30.102869033813477, "geo/layer_21/stable_rank_o_proj": 67.93041229248047, "geo/layer_21/stable_rank_gate_proj": 63.00444030761719, "geo/layer_21/stable_rank_down_proj": 49.28032684326172, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14483845233917236, "geo/layer_21/attn_entropy_mean": 5.686923027038574, "geo/layer_21/attn_entropy_std": 0.2874293923377991, "geo/layer_27/stable_rank_q_proj": 43.87335968017578, "geo/layer_27/stable_rank_k_proj": 32.42118453979492, "geo/layer_27/stable_rank_o_proj": 115.19065856933594, "geo/layer_27/stable_rank_gate_proj": 76.44642639160156, "geo/layer_27/stable_rank_down_proj": 127.65443420410156, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08923143893480301, "geo/layer_27/attn_entropy_mean": 4.142387390136719, "geo/layer_27/attn_entropy_std": 0.7971031665802002, "attnres/final_alpha/block_0": 0.2409709095954895, "attnres/block_norm/0": 1.779119610786438, "attnres/final_alpha/block_1": 0.0038580032996833324, "attnres/block_norm/1": 47729.3515625, "attnres/final_alpha/block_2": 0.009297793731093407, "attnres/block_norm/2": 29157.4296875, "attnres/final_alpha/block_3": 0.010829017497599125, "attnres/block_norm/3": 63767.796875, "attnres/final_alpha/block_4": 0.012947075068950653, "attnres/block_norm/4": 15748.5419921875, "attnres/final_alpha/block_5": 0.6155803203582764, "attnres/block_norm/5": 6827.630859375, "attnres/final_alpha/block_6": 0.10651686787605286, "attnres/block_norm/6": 40997.0859375, "geo/tier1_time_s": 1.3582494258880615, "geo/step": 77700.0, "geo/rankme_slope": 0.0005845125159438776} {"step": 77710, "timestamp": 1778278540.487221, "train/loss": 2.056534540653229, "train/z_loss": 0.0014152768650092184, "train/perplexity": 7.818826979257702, "train/grad_norm": 0.083984375, "optim/muon_lr": 0.008003157526254655, "optim/adamw_lr": 0.0002400947257876396, "perf/tokens_per_sec": 1788843.280236948, "perf/iters_per_sec": 0.8529869462189427, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1723508834838867, "data/tokens_consumed": 162971779072, "data/tokens_consumed_B": 162.971779072, "train/loss_slope": -1.2097576113030364e-05} {"step": 77720, "timestamp": 1778278550.8524632, "train/loss": 2.132518696784973, "train/z_loss": 0.001401420112233609, "train/perplexity": 8.436088023104224, "train/grad_norm": 0.07861328125, "optim/muon_lr": 0.007965290397405624, "optim/adamw_lr": 0.0002389587119221687, "perf/tokens_per_sec": 2024268.6025539527, "perf/iters_per_sec": 0.9652464878816379, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0360048055648803, "data/tokens_consumed": 162992750592, "data/tokens_consumed_B": 162.992750592, "train/loss_slope": -9.251266358458717e-06} {"step": 77730, "timestamp": 1778278561.229463, "train/loss": 2.101478934288025, "train/z_loss": 0.0014084564754739404, "train/perplexity": 8.178256076371262, "train/grad_norm": 0.0830078125, "optim/muon_lr": 0.007927452623844146, "optim/adamw_lr": 0.00023782357871532438, "perf/tokens_per_sec": 2022509.6371031585, "perf/iters_per_sec": 0.9644077477947037, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0369058132171631, "data/tokens_consumed": 163013722112, "data/tokens_consumed_B": 163.013722112, "train/loss_slope": -9.361100525888832e-06} {"step": 77740, "timestamp": 1778278571.579056, "train/loss": 2.0597710251808166, "train/z_loss": 0.0014137546997517348, "train/perplexity": 7.8441734864641255, "train/grad_norm": 0.0888671875, "optim/muon_lr": 0.007889646887779236, "optim/adamw_lr": 0.00023668940663337706, "perf/tokens_per_sec": 2027423.078525949, "perf/iters_per_sec": 0.9667506592397447, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343928813934327, "data/tokens_consumed": 163034693632, "data/tokens_consumed_B": 163.034693632, "train/loss_slope": -9.328953789906353e-06} {"step": 77750, "timestamp": 1778278581.9261303, "grad/layer_0/attn": 0.0024852927308529615, "grad/layer_0/mlp": 0.0028635321650654078, "grad/layer_0/attn_mlp_ratio": 0.8679115514684774, "grad/layer_4/attn": 0.0018687160918489099, "grad/layer_4/mlp": 0.002531023696064949, "grad/layer_4/attn_mlp_ratio": 0.7383241891104513, "grad/layer_8/attn": 0.003247641259804368, "grad/layer_8/mlp": 0.0034315958619117737, "grad/layer_8/attn_mlp_ratio": 0.9463938341957592, "grad/layer_12/attn": 0.0034776844549924135, "grad/layer_12/mlp": 0.006280996836721897, "grad/layer_12/attn_mlp_ratio": 0.55368351394348, "grad/layer_16/attn": 0.003128073178231716, "grad/layer_16/mlp": 0.0041085039265453815, "grad/layer_16/attn_mlp_ratio": 0.7613654892440117, "grad/layer_20/attn": 0.004775605630129576, "grad/layer_20/mlp": 0.004872822668403387, "grad/layer_20/attn_mlp_ratio": 0.9800491126202677, "grad/layer_24/attn": 0.004654897376894951, "grad/layer_24/mlp": 0.006672217510640621, "grad/layer_24/attn_mlp_ratio": 0.6976537110347664, "grad/layer_27/attn": 0.00416501984000206, "grad/layer_27/mlp": 0.005543000530451536, "grad/layer_27/attn_mlp_ratio": 0.7514016536676409} {"step": 77750, "timestamp": 1778278581.9421663, "train/loss": 2.1007616400718687, "train/z_loss": 0.001404664048459381, "train/perplexity": 8.172391963987813, "train/grad_norm": 0.07080078125, "optim/muon_lr": 0.007851872593164444, "optim/adamw_lr": 0.0002355561777949333, "perf/tokens_per_sec": 2024569.0740649782, "perf/iters_per_sec": 0.9653897638630763, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0358510494232178, "data/tokens_consumed": 163055665152, "data/tokens_consumed_B": 163.055665152, "train/loss_slope": -8.697725780153955e-06} {"step": 77760, "timestamp": 1778278592.3048193, "train/loss": 2.1260722398757936, "train/z_loss": 0.0014098621672019363, "train/perplexity": 8.381880057563931, "train/grad_norm": 0.08203125, "optim/muon_lr": 0.007814130485057831, "optim/adamw_lr": 0.0002344239145517349, "perf/tokens_per_sec": 2024796.7351414098, "perf/iters_per_sec": 0.9654983211237954, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0357345819473267, "data/tokens_consumed": 163076636672, "data/tokens_consumed_B": 163.076636672, "train/loss_slope": -8.3309960372448e-06} {"step": 77770, "timestamp": 1778278602.6554334, "train/loss": 2.098202180862427, "train/z_loss": 0.0014116659294813871, "train/perplexity": 8.151501805270966, "train/grad_norm": 0.0771484375, "optim/muon_lr": 0.007776421159505845, "optim/adamw_lr": 0.0002332926347851753, "perf/tokens_per_sec": 2027149.4158578147, "perf/iters_per_sec": 0.9666201667107652, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345325231552125, "data/tokens_consumed": 163097608192, "data/tokens_consumed_B": 163.097608192, "train/loss_slope": -1.1249604598559455e-05} {"step": 77775, "timestamp": 1778278608.4280024, "eos/sharpness": 25.269913673400875, "eos/L0_probe": 1.9167859554290771, "eos/L_plus": 2.0494654178619385, "eos/L_minus": 2.0368056297302246, "eos/grad_norm": 0.08103804290294647, "eos/embed_grad_frac": 0.29007115960121155, "eos/time_s": 0.608844518661499} {"step": 77775, "timestamp": 1778278609.8099663, "geo/rankme_last": 439.7747497558594, "geo/layer_0/stable_rank_q_proj": 18.750059127807617, "geo/layer_0/stable_rank_k_proj": 15.694580078125, "geo/layer_0/stable_rank_o_proj": 46.35282897949219, "geo/layer_0/stable_rank_gate_proj": 127.38885498046875, "geo/layer_0/stable_rank_down_proj": 56.798038482666016, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06469607353210449, "geo/layer_0/attn_entropy_mean": 6.1275105476379395, "geo/layer_0/attn_entropy_std": 0.4302978217601776, "geo/layer_7/stable_rank_q_proj": 42.603858947753906, "geo/layer_7/stable_rank_k_proj": 40.00990676879883, "geo/layer_7/stable_rank_o_proj": 87.9185562133789, "geo/layer_7/stable_rank_gate_proj": 76.8825454711914, "geo/layer_7/stable_rank_down_proj": 140.2844696044922, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.45307570695877075, "geo/layer_7/attn_entropy_mean": 4.653745651245117, "geo/layer_7/attn_entropy_std": 0.7712077498435974, "geo/layer_14/stable_rank_q_proj": 49.26571273803711, "geo/layer_14/stable_rank_k_proj": 41.54781723022461, "geo/layer_14/stable_rank_o_proj": 43.11036682128906, "geo/layer_14/stable_rank_gate_proj": 70.28668975830078, "geo/layer_14/stable_rank_down_proj": 124.87419128417969, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.40154018998146057, "geo/layer_14/attn_entropy_mean": 5.504783630371094, "geo/layer_14/attn_entropy_std": 0.4153366684913635, "geo/layer_21/stable_rank_q_proj": 39.485984802246094, "geo/layer_21/stable_rank_k_proj": 30.079008102416992, "geo/layer_21/stable_rank_o_proj": 67.91130065917969, "geo/layer_21/stable_rank_gate_proj": 63.006038665771484, "geo/layer_21/stable_rank_down_proj": 49.26862716674805, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15239250659942627, "geo/layer_21/attn_entropy_mean": 5.679731369018555, "geo/layer_21/attn_entropy_std": 0.2885488271713257, "geo/layer_27/stable_rank_q_proj": 43.859474182128906, "geo/layer_27/stable_rank_k_proj": 32.416019439697266, "geo/layer_27/stable_rank_o_proj": 115.1824722290039, "geo/layer_27/stable_rank_gate_proj": 76.44131469726562, "geo/layer_27/stable_rank_down_proj": 127.6603012084961, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09134718775749207, "geo/layer_27/attn_entropy_mean": 4.151715278625488, "geo/layer_27/attn_entropy_std": 0.7938220500946045, "attnres/final_alpha/block_0": 0.24152252078056335, "attnres/block_norm/0": 1.7791590690612793, "attnres/final_alpha/block_1": 0.003923619166016579, "attnres/block_norm/1": 47646.18359375, "attnres/final_alpha/block_2": 0.009290553629398346, "attnres/block_norm/2": 29117.955078125, "attnres/final_alpha/block_3": 0.010828056372702122, "attnres/block_norm/3": 63697.65625, "attnres/final_alpha/block_4": 0.012884877622127533, "attnres/block_norm/4": 15723.3115234375, "attnres/final_alpha/block_5": 0.6137372851371765, "attnres/block_norm/5": 6838.42431640625, "attnres/final_alpha/block_6": 0.10781309008598328, "attnres/block_norm/6": 40954.12109375, "geo/tier1_time_s": 1.3629076480865479, "geo/step": 77775.0, "geo/rankme_slope": 0.0005690248560361645} {"step": 77780, "timestamp": 1778278614.987385, "train/loss": 2.116172218322754, "train/z_loss": 0.0014020512811839581, "train/perplexity": 8.299308668010834, "train/grad_norm": 0.07861328125, "optim/muon_lr": 0.007738744914531708, "optim/adamw_lr": 0.00023216234743595122, "perf/tokens_per_sec": 1701259.9774540951, "perf/iters_per_sec": 0.8112239730139232, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2327051877975463, "data/tokens_consumed": 163118579712, "data/tokens_consumed_B": 163.118579712, "train/loss_slope": -1.126496170935522e-05} {"step": 77790, "timestamp": 1778278625.3410902, "train/loss": 2.1358208060264587, "train/z_loss": 0.0014105835696682334, "train/perplexity": 8.463990951231438, "train/grad_norm": 0.08056640625, "optim/muon_lr": 0.007701101303100586, "optim/adamw_lr": 0.00023103303909301757, "perf/tokens_per_sec": 2026467.7998934435, "perf/iters_per_sec": 0.9662951468913286, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348804950714112, "data/tokens_consumed": 163139551232, "data/tokens_consumed_B": 163.139551232, "train/loss_slope": -7.624308468520745e-06} {"step": 77800, "timestamp": 1778278635.6816874, "grad/layer_0/attn": 0.0029263230971992016, "grad/layer_0/mlp": 0.003119863336905837, "grad/layer_0/attn_mlp_ratio": 0.9379651245573166, "grad/layer_4/attn": 0.0024200547486543655, "grad/layer_4/mlp": 0.002586822723969817, "grad/layer_4/attn_mlp_ratio": 0.9355316978920403, "grad/layer_8/attn": 0.003102011512964964, "grad/layer_8/mlp": 0.003569666761904955, "grad/layer_8/attn_mlp_ratio": 0.8689918787854544, "grad/layer_12/attn": 0.004296766594052315, "grad/layer_12/mlp": 0.006616073660552502, "grad/layer_12/attn_mlp_ratio": 0.6494435747786307, "grad/layer_16/attn": 0.004109612666070461, "grad/layer_16/mlp": 0.004383494611829519, "grad/layer_16/attn_mlp_ratio": 0.9375197043080842, "grad/layer_20/attn": 0.0027251695282757282, "grad/layer_20/mlp": 0.004978503566235304, "grad/layer_20/attn_mlp_ratio": 0.5473872695440787, "grad/layer_24/attn": 0.008461277931928635, "grad/layer_24/mlp": 0.006970034912228584, "grad/layer_24/attn_mlp_ratio": 1.2139505636748937, "grad/layer_27/attn": 0.005278490949422121, "grad/layer_27/mlp": 0.0064381822012364864, "grad/layer_27/attn_mlp_ratio": 0.8198728620046002} {"step": 77800, "timestamp": 1778278635.6974268, "train/loss": 2.1253722667694093, "train/z_loss": 0.0014007390011101962, "train/perplexity": 8.376015019866497, "train/grad_norm": 0.1064453125, "optim/muon_lr": 0.007663493305444718, "optim/adamw_lr": 0.0002299047991633415, "perf/tokens_per_sec": 2026076.7840908864, "perf/iters_per_sec": 0.9661086960272247, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350802183151244, "data/tokens_consumed": 163160522752, "data/tokens_consumed_B": 163.160522752, "train/loss_slope": -4.221442272954194e-06} {"step": 77810, "timestamp": 1778278646.0466554, "train/loss": 2.0788524985313415, "train/z_loss": 0.0014137746766209602, "train/perplexity": 7.995289042426899, "train/grad_norm": 0.115234375, "optim/muon_lr": 0.007625920176506043, "optim/adamw_lr": 0.00022877760529518124, "perf/tokens_per_sec": 2027419.9475978797, "perf/iters_per_sec": 0.9667491662969016, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343944787979127, "data/tokens_consumed": 163181494272, "data/tokens_consumed_B": 163.181494272, "train/loss_slope": -5.699987222652581e-06} {"step": 77820, "timestamp": 1778278656.399745, "train/loss": 2.103896975517273, "train/z_loss": 0.001420493412297219, "train/perplexity": 8.198055364848123, "train/grad_norm": 0.130859375, "optim/muon_lr": 0.007588382661342621, "optim/adamw_lr": 0.00022765147984027861, "perf/tokens_per_sec": 2027063.7391671285, "perf/iters_per_sec": 0.9665793128810541, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345762491226196, "data/tokens_consumed": 163202465792, "data/tokens_consumed_B": 163.202465792, "train/loss_slope": -4.015876844127406e-06} {"step": 77830, "timestamp": 1778278666.746863, "train/loss": 2.1453235387802123, "train/z_loss": 0.0013959660427644849, "train/perplexity": 8.544805366023837, "train/grad_norm": 0.09326171875, "optim/muon_lr": 0.007550881057977676, "optim/adamw_lr": 0.00022652643173933027, "perf/tokens_per_sec": 2027820.4110156698, "perf/iters_per_sec": 0.9669401221350049, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0341902017593383, "data/tokens_consumed": 163223437312, "data/tokens_consumed_B": 163.223437312, "train/loss_slope": 1.766810573116666e-06} {"step": 77840, "timestamp": 1778278677.0961905, "train/loss": 2.101747715473175, "train/z_loss": 0.001412812212947756, "train/perplexity": 8.180454533170606, "train/grad_norm": 0.083984375, "optim/muon_lr": 0.007513416111469269, "optim/adamw_lr": 0.00022540248334407804, "perf/tokens_per_sec": 2027311.9598635198, "perf/iters_per_sec": 0.9666976737325286, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034449577331543, "data/tokens_consumed": 163244408832, "data/tokens_consumed_B": 163.244408832, "train/loss_slope": 9.557176129867529e-07} {"step": 77850, "timestamp": 1778278687.9445798, "grad/layer_0/attn": 0.0024757306091487408, "grad/layer_0/mlp": 0.0027211192063987255, "grad/layer_0/attn_mlp_ratio": 0.9098206768542078, "grad/layer_4/attn": 0.0021538310684263706, "grad/layer_4/mlp": 0.0024582494515925646, "grad/layer_4/attn_mlp_ratio": 0.8761645322099304, "grad/layer_8/attn": 0.004321692977100611, "grad/layer_8/mlp": 0.0032356353476643562, "grad/layer_8/attn_mlp_ratio": 1.3356550968126601, "grad/layer_12/attn": 0.0041389246471226215, "grad/layer_12/mlp": 0.0066483234986662865, "grad/layer_12/attn_mlp_ratio": 0.6225516230817837, "grad/layer_16/attn": 0.005633458029478788, "grad/layer_16/mlp": 0.004420047160238028, "grad/layer_16/attn_mlp_ratio": 1.2745243880436286, "grad/layer_20/attn": 0.003953189123421907, "grad/layer_20/mlp": 0.005174961406737566, "grad/layer_20/attn_mlp_ratio": 0.7639069620662945, "grad/layer_24/attn": 0.00500060198828578, "grad/layer_24/mlp": 0.006829463876783848, "grad/layer_24/attn_mlp_ratio": 0.7322100248694304, "grad/layer_27/attn": 0.004374463576823473, "grad/layer_27/mlp": 0.006312939804047346, "grad/layer_27/attn_mlp_ratio": 0.6929360398344552} {"step": 77850, "timestamp": 1778278688.552765, "eos/sharpness": 31.138849258422844, "eos/L0_probe": 1.9155422449111938, "eos/L_plus": 2.0905356407165527, "eos/L_minus": 2.0519373416900635, "eos/grad_norm": 0.09005989134311676, "eos/embed_grad_frac": 0.2146443873643875, "eos/time_s": 0.6051757335662842} {"step": 77850, "timestamp": 1778278688.5730083, "train/loss": 2.0841054797172545, "train/z_loss": 0.0014092506491579116, "train/perplexity": 8.037398648997902, "train/grad_norm": 0.09033203125, "optim/muon_lr": 0.007475987076759338, "optim/adamw_lr": 0.00022427961230278012, "perf/tokens_per_sec": 1828092.0453747765, "perf/iters_per_sec": 0.8717022158502467, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1471807479858398, "data/tokens_consumed": 163265380352, "data/tokens_consumed_B": 163.265380352, "train/loss_slope": 4.6230882558243737e-07} {"step": 77850, "timestamp": 1778278689.9373505, "geo/rankme_last": 440.32537841796875, "geo/layer_0/stable_rank_q_proj": 18.750131607055664, "geo/layer_0/stable_rank_k_proj": 15.700582504272461, "geo/layer_0/stable_rank_o_proj": 46.36262893676758, "geo/layer_0/stable_rank_gate_proj": 127.33889770507812, "geo/layer_0/stable_rank_down_proj": 56.814353942871094, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.061256397515535355, "geo/layer_0/attn_entropy_mean": 6.1277008056640625, "geo/layer_0/attn_entropy_std": 0.42987099289894104, "geo/layer_7/stable_rank_q_proj": 42.581748962402344, "geo/layer_7/stable_rank_k_proj": 40.051727294921875, "geo/layer_7/stable_rank_o_proj": 87.91450500488281, "geo/layer_7/stable_rank_gate_proj": 76.88957214355469, "geo/layer_7/stable_rank_down_proj": 140.26097106933594, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4405922591686249, "geo/layer_7/attn_entropy_mean": 4.638552188873291, "geo/layer_7/attn_entropy_std": 0.7673248052597046, "geo/layer_14/stable_rank_q_proj": 49.2480354309082, "geo/layer_14/stable_rank_k_proj": 41.53664016723633, "geo/layer_14/stable_rank_o_proj": 43.1101188659668, "geo/layer_14/stable_rank_gate_proj": 70.26658630371094, "geo/layer_14/stable_rank_down_proj": 124.85549926757812, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4119081199169159, "geo/layer_14/attn_entropy_mean": 5.502440452575684, "geo/layer_14/attn_entropy_std": 0.415900856256485, "geo/layer_21/stable_rank_q_proj": 39.47692108154297, "geo/layer_21/stable_rank_k_proj": 30.061697006225586, "geo/layer_21/stable_rank_o_proj": 67.9214096069336, "geo/layer_21/stable_rank_gate_proj": 62.9976692199707, "geo/layer_21/stable_rank_down_proj": 49.278404235839844, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15089188516139984, "geo/layer_21/attn_entropy_mean": 5.6766037940979, "geo/layer_21/attn_entropy_std": 0.28987738490104675, "geo/layer_27/stable_rank_q_proj": 43.85222625732422, "geo/layer_27/stable_rank_k_proj": 32.401397705078125, "geo/layer_27/stable_rank_o_proj": 115.20668029785156, "geo/layer_27/stable_rank_gate_proj": 76.40233612060547, "geo/layer_27/stable_rank_down_proj": 127.7330551147461, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09042999893426895, "geo/layer_27/attn_entropy_mean": 4.151302337646484, "geo/layer_27/attn_entropy_std": 0.8020544052124023, "attnres/final_alpha/block_0": 0.24120663106441498, "attnres/block_norm/0": 1.7791755199432373, "attnres/final_alpha/block_1": 0.003887742990627885, "attnres/block_norm/1": 47554.7109375, "attnres/final_alpha/block_2": 0.009278370067477226, "attnres/block_norm/2": 29134.072265625, "attnres/final_alpha/block_3": 0.010812200605869293, "attnres/block_norm/3": 63616.1640625, "attnres/final_alpha/block_4": 0.012870553880929947, "attnres/block_norm/4": 15736.826171875, "attnres/final_alpha/block_5": 0.6154536008834839, "attnres/block_norm/5": 6832.1435546875, "attnres/final_alpha/block_6": 0.10649095475673676, "attnres/block_norm/6": 41022.2109375, "geo/tier1_time_s": 1.360187292098999, "geo/step": 77850.0, "geo/rankme_slope": 0.0005658883475265106} {"step": 77860, "timestamp": 1778278700.2908862, "train/loss": 2.1008192539215087, "train/z_loss": 0.0014161762548610568, "train/perplexity": 8.172862820513425, "train/grad_norm": 0.1103515625, "optim/muon_lr": 0.007438597083091736, "optim/adamw_lr": 0.00022315791249275205, "perf/tokens_per_sec": 1790310.5336104962, "perf/iters_per_sec": 0.8536865871479493, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1713900804519652, "data/tokens_consumed": 163286351872, "data/tokens_consumed_B": 163.286351872, "train/loss_slope": 7.529172674156829e-07} {"step": 77870, "timestamp": 1778278710.6498883, "train/loss": 2.074486553668976, "train/z_loss": 0.001413826015777886, "train/perplexity": 7.960458141532736, "train/grad_norm": 0.1123046875, "optim/muon_lr": 0.00740124523639679, "optim/adamw_lr": 0.00022203735709190367, "perf/tokens_per_sec": 2025478.1600470967, "perf/iters_per_sec": 0.965823249839352, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0353861331939698, "data/tokens_consumed": 163307323392, "data/tokens_consumed_B": 163.307323392, "train/loss_slope": -2.039079440094262e-06} {"step": 77880, "timestamp": 1778278721.0056756, "train/loss": 2.0639630913734437, "train/z_loss": 0.0014219239703379572, "train/perplexity": 7.87712580183239, "train/grad_norm": 0.072265625, "optim/muon_lr": 0.007363932430744171, "optim/adamw_lr": 0.00022091797292232511, "perf/tokens_per_sec": 2026126.5803915036, "perf/iters_per_sec": 0.9661324407537001, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350547790527345, "data/tokens_consumed": 163328294912, "data/tokens_consumed_B": 163.328294912, "train/loss_slope": -4.038971800221847e-06} {"step": 77890, "timestamp": 1778278731.3543038, "train/loss": 2.09541494846344, "train/z_loss": 0.00142019079066813, "train/perplexity": 8.128813309083018, "train/grad_norm": 0.1005859375, "optim/muon_lr": 0.0073266586661338805, "optim/adamw_lr": 0.0002197997599840164, "perf/tokens_per_sec": 2027561.409775555, "perf/iters_per_sec": 0.9668166207196974, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343223094940186, "data/tokens_consumed": 163349266432, "data/tokens_consumed_B": 163.349266432, "train/loss_slope": -2.093263270437891e-06} {"step": 77900, "timestamp": 1778278741.697088, "grad/layer_0/attn": 0.0024926471523940563, "grad/layer_0/mlp": 0.0029252562671899796, "grad/layer_0/attn_mlp_ratio": 0.8521123756371854, "grad/layer_4/attn": 0.0023398143239319324, "grad/layer_4/mlp": 0.0025554983876645565, "grad/layer_4/attn_mlp_ratio": 0.9155999642442616, "grad/layer_8/attn": 0.0035523485857993364, "grad/layer_8/mlp": 0.0036327887792140245, "grad/layer_8/attn_mlp_ratio": 0.9778571516019142, "grad/layer_12/attn": 0.00549384905025363, "grad/layer_12/mlp": 0.006672705989331007, "grad/layer_12/attn_mlp_ratio": 0.8233314905084382, "grad/layer_16/attn": 0.0029155185911804438, "grad/layer_16/mlp": 0.004145417828112841, "grad/layer_16/attn_mlp_ratio": 0.7033111357502388, "grad/layer_20/attn": 0.0025072048883885145, "grad/layer_20/mlp": 0.0048148431815207005, "grad/layer_20/attn_mlp_ratio": 0.5207240904415583, "grad/layer_24/attn": 0.005308220162987709, "grad/layer_24/mlp": 0.006074757315218449, "grad/layer_24/attn_mlp_ratio": 0.8738159897034877, "grad/layer_27/attn": 0.003741324646398425, "grad/layer_27/mlp": 0.005605911370366812, "grad/layer_27/attn_mlp_ratio": 0.6673891776877489} {"step": 77900, "timestamp": 1778278741.7129455, "train/loss": 2.051589608192444, "train/z_loss": 0.0014201121404767037, "train/perplexity": 7.780258844922539, "train/grad_norm": 0.07373046875, "optim/muon_lr": 0.007289425134658814, "optim/adamw_lr": 0.00021868275403976438, "perf/tokens_per_sec": 2025563.5628199351, "perf/iters_per_sec": 0.9658639730548549, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0353424787521361, "data/tokens_consumed": 163370237952, "data/tokens_consumed_B": 163.370237952, "train/loss_slope": -4.704461089133342e-06} {"step": 77910, "timestamp": 1778278752.0633883, "train/loss": 2.0737975358963014, "train/z_loss": 0.0014125745045021176, "train/perplexity": 7.95497513355648, "train/grad_norm": 0.10400390625, "optim/muon_lr": 0.007252230942249298, "optim/adamw_lr": 0.00021756692826747893, "perf/tokens_per_sec": 2027149.4158578147, "perf/iters_per_sec": 0.9666201667107652, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345325231552125, "data/tokens_consumed": 163391209472, "data/tokens_consumed_B": 163.391209472, "train/loss_slope": -4.766375411211288e-06} {"step": 77920, "timestamp": 1778278762.412843, "train/loss": 2.0861521005630492, "train/z_loss": 0.0014316122513264417, "train/perplexity": 8.05386500106094, "train/grad_norm": 0.119140625, "optim/muon_lr": 0.007215079069137573, "optim/adamw_lr": 0.00021645237207412719, "perf/tokens_per_sec": 2027382.5641291824, "perf/iters_per_sec": 0.9667313404699241, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344135522842408, "data/tokens_consumed": 163412180992, "data/tokens_consumed_B": 163.412180992, "train/loss_slope": -4.067587251603098e-06} {"step": 77925, "timestamp": 1778278768.2083502, "eos/sharpness": 16.02101325988769, "eos/L0_probe": 1.9139567613601685, "eos/L_plus": 2.001260280609131, "eos/L_minus": 1.986863374710083, "eos/grad_norm": 0.07677524536848068, "eos/embed_grad_frac": 0.2823788523674011, "eos/time_s": 0.6130461692810059} {"step": 77925, "timestamp": 1778278769.588183, "geo/rankme_last": 440.2854919433594, "geo/layer_0/stable_rank_q_proj": 18.749223709106445, "geo/layer_0/stable_rank_k_proj": 15.7051362991333, "geo/layer_0/stable_rank_o_proj": 46.3546257019043, "geo/layer_0/stable_rank_gate_proj": 127.32081604003906, "geo/layer_0/stable_rank_down_proj": 56.817420959472656, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06826126575469971, "geo/layer_0/attn_entropy_mean": 6.1281328201293945, "geo/layer_0/attn_entropy_std": 0.4286549985408783, "geo/layer_7/stable_rank_q_proj": 42.57741928100586, "geo/layer_7/stable_rank_k_proj": 40.06685256958008, "geo/layer_7/stable_rank_o_proj": 87.9489517211914, "geo/layer_7/stable_rank_gate_proj": 76.8847427368164, "geo/layer_7/stable_rank_down_proj": 140.2254638671875, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4438219666481018, "geo/layer_7/attn_entropy_mean": 4.644248008728027, "geo/layer_7/attn_entropy_std": 0.763751208782196, "geo/layer_14/stable_rank_q_proj": 49.238956451416016, "geo/layer_14/stable_rank_k_proj": 41.51636505126953, "geo/layer_14/stable_rank_o_proj": 43.10176467895508, "geo/layer_14/stable_rank_gate_proj": 70.27176666259766, "geo/layer_14/stable_rank_down_proj": 124.90608215332031, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.40048155188560486, "geo/layer_14/attn_entropy_mean": 5.509627342224121, "geo/layer_14/attn_entropy_std": 0.4194018244743347, "geo/layer_21/stable_rank_q_proj": 39.45665740966797, "geo/layer_21/stable_rank_k_proj": 30.07369613647461, "geo/layer_21/stable_rank_o_proj": 67.92549896240234, "geo/layer_21/stable_rank_gate_proj": 62.99365234375, "geo/layer_21/stable_rank_down_proj": 49.25090026855469, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15221965312957764, "geo/layer_21/attn_entropy_mean": 5.68203067779541, "geo/layer_21/attn_entropy_std": 0.2875061333179474, "geo/layer_27/stable_rank_q_proj": 43.85733413696289, "geo/layer_27/stable_rank_k_proj": 32.387245178222656, "geo/layer_27/stable_rank_o_proj": 115.19868469238281, "geo/layer_27/stable_rank_gate_proj": 76.3980484008789, "geo/layer_27/stable_rank_down_proj": 127.71298217773438, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08305330574512482, "geo/layer_27/attn_entropy_mean": 4.150793075561523, "geo/layer_27/attn_entropy_std": 0.7953370809555054, "attnres/final_alpha/block_0": 0.24122744798660278, "attnres/block_norm/0": 1.7791224718093872, "attnres/final_alpha/block_1": 0.003893108107149601, "attnres/block_norm/1": 47438.5234375, "attnres/final_alpha/block_2": 0.009284566156566143, "attnres/block_norm/2": 29059.421875, "attnres/final_alpha/block_3": 0.01081741601228714, "attnres/block_norm/3": 63748.33984375, "attnres/final_alpha/block_4": 0.012884154915809631, "attnres/block_norm/4": 15744.03515625, "attnres/final_alpha/block_5": 0.6144455075263977, "attnres/block_norm/5": 6817.9072265625, "attnres/final_alpha/block_6": 0.10744777321815491, "attnres/block_norm/6": 40881.73828125, "geo/tier1_time_s": 1.358818769454956, "geo/step": 77925.0, "geo/rankme_slope": 0.0005824081194977991} {"step": 77930, "timestamp": 1778278774.770289, "train/loss": 2.0847370266914367, "train/z_loss": 0.0014110509306192397, "train/perplexity": 8.042476246997031, "train/grad_norm": 0.087890625, "optim/muon_lr": 0.007177968621253967, "optim/adamw_lr": 0.000215339058637619, "perf/tokens_per_sec": 1698027.795588649, "perf/iters_per_sec": 0.8096827485984083, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2350516319274902, "data/tokens_consumed": 163433152512, "data/tokens_consumed_B": 163.433152512, "train/loss_slope": -4.506273653068727e-06} {"step": 77940, "timestamp": 1778278785.129226, "train/loss": 2.1102415323257446, "train/z_loss": 0.0014026606455445289, "train/perplexity": 8.250233742139201, "train/grad_norm": 0.07568359375, "optim/muon_lr": 0.007140900194644928, "optim/adamw_lr": 0.00021422700583934783, "perf/tokens_per_sec": 2025868.29158072, "perf/iters_per_sec": 0.9660092790511704, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351867437362672, "data/tokens_consumed": 163454124032, "data/tokens_consumed_B": 163.454124032, "train/loss_slope": -3.3838960311093083e-06} {"step": 77950, "timestamp": 1778278795.4759753, "grad/layer_0/attn": 0.002447261707857251, "grad/layer_0/mlp": 0.0028892119880765676, "grad/layer_0/attn_mlp_ratio": 0.8470342893679585, "grad/layer_4/attn": 0.002252086065709591, "grad/layer_4/mlp": 0.0026402815710753202, "grad/layer_4/attn_mlp_ratio": 0.8529718970447471, "grad/layer_8/attn": 0.005220944993197918, "grad/layer_8/mlp": 0.003568533807992935, "grad/layer_8/attn_mlp_ratio": 1.4630504088818803, "grad/layer_12/attn": 0.0056664361618459225, "grad/layer_12/mlp": 0.006737106014043093, "grad/layer_12/attn_mlp_ratio": 0.8410786569079827, "grad/layer_16/attn": 0.004534069914370775, "grad/layer_16/mlp": 0.004698886536061764, "grad/layer_16/attn_mlp_ratio": 0.9649243034666772, "grad/layer_20/attn": 0.0034422145690768957, "grad/layer_20/mlp": 0.005222859792411327, "grad/layer_20/attn_mlp_ratio": 0.6590669939429813, "grad/layer_24/attn": 0.005147730931639671, "grad/layer_24/mlp": 0.0071183196268975735, "grad/layer_24/attn_mlp_ratio": 0.7231665799146736, "grad/layer_27/attn": 0.005223847460001707, "grad/layer_27/mlp": 0.0059638419188559055, "grad/layer_27/attn_mlp_ratio": 0.8759198254221768} {"step": 77950, "timestamp": 1778278795.4917514, "train/loss": 2.120488965511322, "train/z_loss": 0.0014006047509610652, "train/perplexity": 8.335212122686062, "train/grad_norm": 0.08544921875, "optim/muon_lr": 0.0071038749814033506, "optim/adamw_lr": 0.0002131162494421005, "perf/tokens_per_sec": 2024622.9437918044, "perf/iters_per_sec": 0.9654154509505293, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0358234882354735, "data/tokens_consumed": 163475095552, "data/tokens_consumed_B": 163.475095552, "train/loss_slope": 1.099229581428297e-06} {"step": 77960, "timestamp": 1778278805.8561683, "train/loss": 2.082746982574463, "train/z_loss": 0.0014056846732273698, "train/perplexity": 8.026487279108261, "train/grad_norm": 0.07421875, "optim/muon_lr": 0.007066892683506012, "optim/adamw_lr": 0.00021200678050518033, "perf/tokens_per_sec": 2024513.063773052, "perf/iters_per_sec": 0.9653630560746441, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0358797073364259, "data/tokens_consumed": 163496067072, "data/tokens_consumed_B": 163.496067072, "train/loss_slope": 1.4081151858128497e-06} {"step": 77970, "timestamp": 1778278816.2115936, "train/loss": 2.137242555618286, "train/z_loss": 0.001400895440019667, "train/perplexity": 8.476033185403855, "train/grad_norm": 0.08154296875, "optim/muon_lr": 0.007029953300952911, "optim/adamw_lr": 0.0002108985990285873, "perf/tokens_per_sec": 2026488.482142312, "perf/iters_per_sec": 0.9663050089561043, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348699331283568, "data/tokens_consumed": 163517038592, "data/tokens_consumed_B": 163.517038592, "train/loss_slope": 6.3626531649022045e-06} {"step": 77980, "timestamp": 1778278826.5594769, "train/loss": 2.0667293548583983, "train/z_loss": 0.001410331716760993, "train/perplexity": 7.898946173839037, "train/grad_norm": 0.07275390625, "optim/muon_lr": 0.00699305921792984, "optim/adamw_lr": 0.00020979177653789518, "perf/tokens_per_sec": 2027605.203027038, "perf/iters_per_sec": 0.9668375029692831, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342999696731567, "data/tokens_consumed": 163538010112, "data/tokens_consumed_B": 163.538010112, "train/loss_slope": 6.412505023371431e-06} {"step": 77990, "timestamp": 1778278836.912661, "train/loss": 2.0784706115722655, "train/z_loss": 0.0014192002941854297, "train/perplexity": 7.992236328740427, "train/grad_norm": 0.12158203125, "optim/muon_lr": 0.006956210136413574, "optim/adamw_lr": 0.0002086863040924072, "perf/tokens_per_sec": 2027007.9644503987, "perf/iters_per_sec": 0.9665527174236291, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346047163009644, "data/tokens_consumed": 163558981632, "data/tokens_consumed_B": 163.558981632, "train/loss_slope": 6.784492400255464e-06} {"step": 78000, "timestamp": 1778278847.2533894, "grad/layer_0/attn": 0.0025493178982287645, "grad/layer_0/mlp": 0.0029720193706452847, "grad/layer_0/attn_mlp_ratio": 0.8577729464454877, "grad/layer_4/attn": 0.0020707070361822844, "grad/layer_4/mlp": 0.0025763949379324913, "grad/layer_4/attn_mlp_ratio": 0.8037226456716001, "grad/layer_8/attn": 0.0060238316655159, "grad/layer_8/mlp": 0.0035361498594284058, "grad/layer_8/attn_mlp_ratio": 1.7035000592818779, "grad/layer_12/attn": 0.004203855060040951, "grad/layer_12/mlp": 0.006500854156911373, "grad/layer_12/attn_mlp_ratio": 0.6466619453238209, "grad/layer_16/attn": 0.004065231885761023, "grad/layer_16/mlp": 0.004394962918013334, "grad/layer_16/attn_mlp_ratio": 0.9249752202917602, "grad/layer_20/attn": 0.00262877787463367, "grad/layer_20/mlp": 0.005153076257556677, "grad/layer_20/attn_mlp_ratio": 0.5101375745730461, "grad/layer_24/attn": 0.00562298484146595, "grad/layer_24/mlp": 0.0076692188158631325, "grad/layer_24/attn_mlp_ratio": 0.7331887253648843, "grad/layer_27/attn": 0.004497183952480555, "grad/layer_27/mlp": 0.0071288966573774815, "grad/layer_27/attn_mlp_ratio": 0.6308386985442809} {"step": 78000, "timestamp": 1778278847.8642313, "eos/sharpness": 35.16373634338378, "eos/L0_probe": 1.9126957654953003, "eos/L_plus": 2.087768316268921, "eos/L_minus": 2.0892605781555176, "eos/grad_norm": 0.0957215428352356, "eos/embed_grad_frac": 0.2001434862613678, "eos/time_s": 0.6080911159515381} {"step": 78000, "timestamp": 1778278847.8838332, "train/loss": 2.0792423248291017, "train/z_loss": 0.001416614872869104, "train/perplexity": 7.998406423932999, "train/grad_norm": 0.095703125, "optim/muon_lr": 0.006919406652450562, "optim/adamw_lr": 0.00020758219957351683, "perf/tokens_per_sec": 1912641.6934188688, "perf/iters_per_sec": 0.912018629750666, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0964688301086425, "data/tokens_consumed": 163579953152, "data/tokens_consumed_B": 163.579953152, "train/loss_slope": 5.079593974621884e-06} {"step": 78000, "timestamp": 1778278849.2466063, "geo/rankme_last": 440.190185546875, "geo/layer_0/stable_rank_q_proj": 18.74955940246582, "geo/layer_0/stable_rank_k_proj": 15.717676162719727, "geo/layer_0/stable_rank_o_proj": 46.34861373901367, "geo/layer_0/stable_rank_gate_proj": 127.29322814941406, "geo/layer_0/stable_rank_down_proj": 56.806678771972656, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0653386116027832, "geo/layer_0/attn_entropy_mean": 6.127312183380127, "geo/layer_0/attn_entropy_std": 0.4293029308319092, "geo/layer_7/stable_rank_q_proj": 42.57571792602539, "geo/layer_7/stable_rank_k_proj": 40.025550842285156, "geo/layer_7/stable_rank_o_proj": 87.9627685546875, "geo/layer_7/stable_rank_gate_proj": 76.86781311035156, "geo/layer_7/stable_rank_down_proj": 140.22816467285156, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4469550848007202, "geo/layer_7/attn_entropy_mean": 4.631519794464111, "geo/layer_7/attn_entropy_std": 0.7741600871086121, "geo/layer_14/stable_rank_q_proj": 49.22866439819336, "geo/layer_14/stable_rank_k_proj": 41.49089431762695, "geo/layer_14/stable_rank_o_proj": 43.10710144042969, "geo/layer_14/stable_rank_gate_proj": 70.2707290649414, "geo/layer_14/stable_rank_down_proj": 124.9421615600586, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.40899550914764404, "geo/layer_14/attn_entropy_mean": 5.493380546569824, "geo/layer_14/attn_entropy_std": 0.41229379177093506, "geo/layer_21/stable_rank_q_proj": 39.43999099731445, "geo/layer_21/stable_rank_k_proj": 30.070106506347656, "geo/layer_21/stable_rank_o_proj": 67.91240692138672, "geo/layer_21/stable_rank_gate_proj": 63.00410842895508, "geo/layer_21/stable_rank_down_proj": 49.245243072509766, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14787086844444275, "geo/layer_21/attn_entropy_mean": 5.674975395202637, "geo/layer_21/attn_entropy_std": 0.29148733615875244, "geo/layer_27/stable_rank_q_proj": 43.84675216674805, "geo/layer_27/stable_rank_k_proj": 32.38813400268555, "geo/layer_27/stable_rank_o_proj": 115.16944122314453, "geo/layer_27/stable_rank_gate_proj": 76.37911987304688, "geo/layer_27/stable_rank_down_proj": 127.74446868896484, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08380686491727829, "geo/layer_27/attn_entropy_mean": 4.13715124130249, "geo/layer_27/attn_entropy_std": 0.7935465574264526, "attnres/final_alpha/block_0": 0.24156178534030914, "attnres/block_norm/0": 1.7790868282318115, "attnres/final_alpha/block_1": 0.003937182016670704, "attnres/block_norm/1": 47546.04296875, "attnres/final_alpha/block_2": 0.0091912392526865, "attnres/block_norm/2": 29138.01171875, "attnres/final_alpha/block_3": 0.010755026713013649, "attnres/block_norm/3": 63644.03125, "attnres/final_alpha/block_4": 0.012836379930377007, "attnres/block_norm/4": 15747.06640625, "attnres/final_alpha/block_5": 0.6139779090881348, "attnres/block_norm/5": 6822.66748046875, "attnres/final_alpha/block_6": 0.10774049162864685, "attnres/block_norm/6": 41029.46484375, "geo/tier1_time_s": 1.35860013961792, "geo/step": 78000.0, "geo/rankme_slope": 0.0005858740371148459} {"step": 78000, "timestamp": 1778278856.2118843, "geo/ww_alpha_mean": 7.964562120816418, "geo/ww_alpha_std": 4.917587126575285, "geo/ww_alpha_min": 1.3449080902358808, "geo/ww_alpha_max": 31.49488577981328, "geo/ww_alpha_healthy_frac": 0.17258883248730963, "geo/ww_alpha_by_type/q_proj": 3.922162216176819, "geo/ww_alpha_by_type/k_proj": 4.4918351978109365, "geo/ww_alpha_by_type/v_proj": 9.093816801664262, "geo/ww_alpha_by_type/o_proj": 9.692729470429052, "geo/ww_alpha_by_type/gate_proj": 8.047117877515799, "geo/ww_alpha_by_type/up_proj": 12.595761708000081, "geo/ww_alpha_by_type/down_proj": 8.01157573331404, "geo/twonn_id/layer_0": 0.7186803817749023, "geo/twonn_id/layer_7": 3.0870656967163086, "geo/twonn_id/layer_14": 4.829063892364502, "geo/twonn_id/layer_21": 6.772797107696533, "geo/twonn_id/layer_27": 4.959611415863037, "geo/tier2_time_s": 6.959228992462158} {"step": 78000, "timestamp": 1778278856.8294513, "eoc/jacobian_sigma/layer_0/attn": 1332.00146484375, "eoc/jacobian_sigma/layer_0/mlp": 9061.421875, "eoc/jacobian_sigma/layer_0": 9061.421875, "eoc/jacobian_sigma/layer_7/attn": 1.1554114818572998, "eoc/jacobian_sigma/layer_7/mlp": 1.8766266107559204, "eoc/jacobian_sigma/layer_7": 1.8766266107559204, "eoc/jacobian_sigma/layer_14/attn": 1.3910669088363647, "eoc/jacobian_sigma/layer_14/mlp": 6.71507453918457, "eoc/jacobian_sigma/layer_14": 6.71507453918457, "eoc/jacobian_sigma/layer_21/attn": 1.1128400564193726, "eoc/jacobian_sigma/layer_21/mlp": 4.24371337890625, "eoc/jacobian_sigma/layer_21": 4.24371337890625, "eoc/jacobian_sigma/layer_27/attn": 3.1641523838043213, "eoc/jacobian_sigma/layer_27/mlp": 29.148357391357422, "eoc/jacobian_sigma/layer_27": 29.148357391357422, "eoc/layer0_sigma": 9061.421875, "eoc/sigma_max": 29.148357391357422, "eoc/sigma_min": 1.8766266107559204, "eoc/sigma_mean": 10.49594298005104, "eoc/time_s": 0.6123342514038086} {"step": 78010, "timestamp": 1778278867.1935909, "train/loss": 2.0962223768234254, "train/z_loss": 0.001398871827404946, "train/perplexity": 8.13537939394664, "train/grad_norm": 0.07763671875, "optim/muon_lr": 0.00688264936208725, "optim/adamw_lr": 0.00020647948086261747, "perf/tokens_per_sec": 1086276.3417785896, "perf/iters_per_sec": 0.5179769238369892, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.930587935447693, "data/tokens_consumed": 163600924672, "data/tokens_consumed_B": 163.600924672, "train/loss_slope": 7.147761239613462e-06} {"step": 78020, "timestamp": 1778278877.5480638, "train/loss": 2.0608449459075926, "train/z_loss": 0.0014097552630119026, "train/perplexity": 7.85260203194045, "train/grad_norm": 0.0751953125, "optim/muon_lr": 0.006845938265323639, "optim/adamw_lr": 0.00020537814795970914, "perf/tokens_per_sec": 2026936.8724856116, "perf/iters_per_sec": 0.966518818133169, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346410036087037, "data/tokens_consumed": 163621896192, "data/tokens_consumed_B": 163.621896192, "train/loss_slope": 5.9884353475172455e-06} {"step": 78030, "timestamp": 1778278887.898617, "train/loss": 2.0439927697181703, "train/z_loss": 0.001406516064889729, "train/perplexity": 7.721377414726722, "train/grad_norm": 0.07275390625, "optim/muon_lr": 0.006809273660182953, "optim/adamw_lr": 0.00020427820980548856, "perf/tokens_per_sec": 2027405.9753651114, "perf/iters_per_sec": 0.9667425038171346, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344016075134277, "data/tokens_consumed": 163642867712, "data/tokens_consumed_B": 163.642867712, "train/loss_slope": 5.44796458529591e-06} {"step": 78040, "timestamp": 1778278898.2484677, "train/loss": 2.0799516320228575, "train/z_loss": 0.0014094942715018987, "train/perplexity": 8.00408176368979, "train/grad_norm": 0.0791015625, "optim/muon_lr": 0.006772657632827759, "optim/adamw_lr": 0.00020317972898483276, "perf/tokens_per_sec": 2027547.6225372306, "perf/iters_per_sec": 0.9668100464521554, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034329342842102, "data/tokens_consumed": 163663839232, "data/tokens_consumed_B": 163.663839232, "train/loss_slope": 4.1735842104183625e-06} {"step": 78050, "timestamp": 1778278908.5887687, "grad/layer_0/attn": 0.0025718987453728914, "grad/layer_0/mlp": 0.0028281058184802532, "grad/layer_0/attn_mlp_ratio": 0.90940679716655, "grad/layer_4/attn": 0.003316788235679269, "grad/layer_4/mlp": 0.002485574223101139, "grad/layer_4/attn_mlp_ratio": 1.3344152314629085, "grad/layer_8/attn": 0.003172679338604212, "grad/layer_8/mlp": 0.003428627736866474, "grad/layer_8/attn_mlp_ratio": 0.9253495828534772, "grad/layer_12/attn": 0.0040856036357581615, "grad/layer_12/mlp": 0.006268461234867573, "grad/layer_12/attn_mlp_ratio": 0.6517713705965571, "grad/layer_16/attn": 0.00292609422467649, "grad/layer_16/mlp": 0.004330807365477085, "grad/layer_16/attn_mlp_ratio": 0.6756463426281983, "grad/layer_20/attn": 0.003556093666702509, "grad/layer_20/mlp": 0.0045484029687941074, "grad/layer_20/attn_mlp_ratio": 0.7818334507555673, "grad/layer_24/attn": 0.0036566394846886396, "grad/layer_24/mlp": 0.006553661078214645, "grad/layer_24/attn_mlp_ratio": 0.5579536972164291, "grad/layer_27/attn": 0.004138972144573927, "grad/layer_27/mlp": 0.005463569890707731, "grad/layer_27/attn_mlp_ratio": 0.7575581811184557} {"step": 78050, "timestamp": 1778278908.6043315, "train/loss": 2.0705935955047607, "train/z_loss": 0.0014179544639773666, "train/perplexity": 7.929528653682557, "train/grad_norm": 0.0732421875, "optim/muon_lr": 0.006736089587211609, "optim/adamw_lr": 0.00020208268761634824, "perf/tokens_per_sec": 2026410.5640872058, "perf/iters_per_sec": 0.9662678547321347, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034909725189209, "data/tokens_consumed": 163684810752, "data/tokens_consumed_B": 163.684810752, "train/loss_slope": 2.2508216340585016e-06} {"step": 78060, "timestamp": 1778278918.9481983, "train/loss": 2.044993555545807, "train/z_loss": 0.001424138166476041, "train/perplexity": 7.729108727862232, "train/grad_norm": 0.103515625, "optim/muon_lr": 0.006699570715427399, "optim/adamw_lr": 0.00020098712146282194, "perf/tokens_per_sec": 2028351.287497814, "perf/iters_per_sec": 0.9671932637681074, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0339195251464843, "data/tokens_consumed": 163705782272, "data/tokens_consumed_B": 163.705782272, "train/loss_slope": -1.7492150316144448e-06} {"step": 78070, "timestamp": 1778278929.2958577, "train/loss": 2.069814074039459, "train/z_loss": 0.0014136968413367868, "train/perplexity": 7.9233498244651726, "train/grad_norm": 0.091796875, "optim/muon_lr": 0.006663101017475128, "optim/adamw_lr": 0.00019989303052425384, "perf/tokens_per_sec": 2028096.6864829017, "perf/iters_per_sec": 0.967071860543681, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340493202209473, "data/tokens_consumed": 163726753792, "data/tokens_consumed_B": 163.726753792, "train/loss_slope": -3.976007306178824e-06} {"step": 78075, "timestamp": 1778278935.0779815, "eos/sharpness": 2.4208664894104, "eos/L0_probe": 1.9133962392807007, "eos/L_plus": 1.925450325012207, "eos/L_minus": 1.9255508184432983, "eos/grad_norm": 0.06946852058172226, "eos/embed_grad_frac": 0.3752346336841583, "eos/time_s": 0.6172773838043213} {"step": 78075, "timestamp": 1778278936.4544358, "geo/rankme_last": 440.71966552734375, "geo/layer_0/stable_rank_q_proj": 18.7437744140625, "geo/layer_0/stable_rank_k_proj": 15.718972206115723, "geo/layer_0/stable_rank_o_proj": 46.34640121459961, "geo/layer_0/stable_rank_gate_proj": 127.27404022216797, "geo/layer_0/stable_rank_down_proj": 56.796974182128906, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06237434595823288, "geo/layer_0/attn_entropy_mean": 6.125568389892578, "geo/layer_0/attn_entropy_std": 0.4292905926704407, "geo/layer_7/stable_rank_q_proj": 42.575439453125, "geo/layer_7/stable_rank_k_proj": 40.01218032836914, "geo/layer_7/stable_rank_o_proj": 87.95297241210938, "geo/layer_7/stable_rank_gate_proj": 76.87326049804688, "geo/layer_7/stable_rank_down_proj": 140.18211364746094, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4573529064655304, "geo/layer_7/attn_entropy_mean": 4.64653205871582, "geo/layer_7/attn_entropy_std": 0.7736846208572388, "geo/layer_14/stable_rank_q_proj": 49.21372604370117, "geo/layer_14/stable_rank_k_proj": 41.50416564941406, "geo/layer_14/stable_rank_o_proj": 43.11369323730469, "geo/layer_14/stable_rank_gate_proj": 70.25909423828125, "geo/layer_14/stable_rank_down_proj": 124.9880599975586, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4069162607192993, "geo/layer_14/attn_entropy_mean": 5.497066020965576, "geo/layer_14/attn_entropy_std": 0.4149872064590454, "geo/layer_21/stable_rank_q_proj": 39.43788528442383, "geo/layer_21/stable_rank_k_proj": 30.066375732421875, "geo/layer_21/stable_rank_o_proj": 67.91390228271484, "geo/layer_21/stable_rank_gate_proj": 62.99383544921875, "geo/layer_21/stable_rank_down_proj": 49.23957061767578, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14815020561218262, "geo/layer_21/attn_entropy_mean": 5.6813764572143555, "geo/layer_21/attn_entropy_std": 0.28800129890441895, "geo/layer_27/stable_rank_q_proj": 43.853641510009766, "geo/layer_27/stable_rank_k_proj": 32.3935546875, "geo/layer_27/stable_rank_o_proj": 115.16943359375, "geo/layer_27/stable_rank_gate_proj": 76.341552734375, "geo/layer_27/stable_rank_down_proj": 127.7677230834961, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0894932895898819, "geo/layer_27/attn_entropy_mean": 4.149129867553711, "geo/layer_27/attn_entropy_std": 0.7953848242759705, "attnres/final_alpha/block_0": 0.24104928970336914, "attnres/block_norm/0": 1.7791515588760376, "attnres/final_alpha/block_1": 0.003943463321775198, "attnres/block_norm/1": 47532.01953125, "attnres/final_alpha/block_2": 0.009187404066324234, "attnres/block_norm/2": 29139.24609375, "attnres/final_alpha/block_3": 0.01071973517537117, "attnres/block_norm/3": 63695.5234375, "attnres/final_alpha/block_4": 0.012898942455649376, "attnres/block_norm/4": 15743.205078125, "attnres/final_alpha/block_5": 0.6141729354858398, "attnres/block_norm/5": 6834.2978515625, "attnres/final_alpha/block_6": 0.10802824795246124, "attnres/block_norm/6": 40900.921875, "geo/tier1_time_s": 1.357475757598877, "geo/step": 78075.0, "geo/rankme_slope": 0.0005982869124212185} {"step": 78080, "timestamp": 1778278941.6296422, "train/loss": 2.127555823326111, "train/z_loss": 0.0014116063131950796, "train/perplexity": 8.394324505005475, "train/grad_norm": 0.1064453125, "optim/muon_lr": 0.006626681089401246, "optim/adamw_lr": 0.00019880043268203734, "perf/tokens_per_sec": 1701226.7119414601, "perf/iters_per_sec": 0.8112081107814122, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2327292919158936, "data/tokens_consumed": 163747725312, "data/tokens_consumed_B": 163.747725312, "train/loss_slope": -1.1613741959676215e-06} {"step": 78090, "timestamp": 1778278951.9761338, "train/loss": 2.076033687591553, "train/z_loss": 0.0014101047301664949, "train/perplexity": 7.972783568447264, "train/grad_norm": 0.08837890625, "optim/muon_lr": 0.006590310633182526, "optim/adamw_lr": 0.00019770931899547575, "perf/tokens_per_sec": 2027658.954000452, "perf/iters_per_sec": 0.9668631334306965, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03427255153656, "data/tokens_consumed": 163768696832, "data/tokens_consumed_B": 163.768696832, "train/loss_slope": -2.859323610602843e-06} {"step": 78100, "timestamp": 1778278962.3189166, "grad/layer_0/attn": 0.0026031865272670984, "grad/layer_0/mlp": 0.0029265712946653366, "grad/layer_0/attn_mlp_ratio": 0.8895004345399821, "grad/layer_4/attn": 0.00232183700427413, "grad/layer_4/mlp": 0.0025088011752814054, "grad/layer_4/attn_mlp_ratio": 0.9254766517980567, "grad/layer_8/attn": 0.0076080081053078175, "grad/layer_8/mlp": 0.003431451739743352, "grad/layer_8/attn_mlp_ratio": 2.2171397008086355, "grad/layer_12/attn": 0.004473099485039711, "grad/layer_12/mlp": 0.006649927701801062, "grad/layer_12/attn_mlp_ratio": 0.6726538420203931, "grad/layer_16/attn": 0.003124912967905402, "grad/layer_16/mlp": 0.004387499764561653, "grad/layer_16/attn_mlp_ratio": 0.7122308978618297, "grad/layer_20/attn": 0.0037721325643360615, "grad/layer_20/mlp": 0.0051985494792461395, "grad/layer_20/attn_mlp_ratio": 0.7256125014937479, "grad/layer_24/attn": 0.004969477653503418, "grad/layer_24/mlp": 0.0067274440079927444, "grad/layer_24/attn_mlp_ratio": 0.7386873192449536, "grad/layer_27/attn": 0.006842006929218769, "grad/layer_27/mlp": 0.006157631054520607, "grad/layer_27/attn_mlp_ratio": 1.111142703667096} {"step": 78100, "timestamp": 1778278962.3348687, "train/loss": 2.133311223983765, "train/z_loss": 0.0014018084271810948, "train/perplexity": 8.442776502364715, "train/grad_norm": 0.09912109375, "optim/muon_lr": 0.006553992331027984, "optim/adamw_lr": 0.00019661976993083952, "perf/tokens_per_sec": 2025453.5806819825, "perf/iters_per_sec": 0.9658115294847405, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0353986978530885, "data/tokens_consumed": 163789668352, "data/tokens_consumed_B": 163.789668352, "train/loss_slope": 2.5299550175773717e-06} {"step": 78110, "timestamp": 1778278972.689879, "train/loss": 2.1225929498672484, "train/z_loss": 0.0014049164718016982, "train/perplexity": 8.35276774049194, "train/grad_norm": 0.09130859375, "optim/muon_lr": 0.006517725288867951, "optim/adamw_lr": 0.0001955317586660385, "perf/tokens_per_sec": 2026420.9279168046, "perf/iters_per_sec": 0.9662727965911887, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349044322967529, "data/tokens_consumed": 163810639872, "data/tokens_consumed_B": 163.810639872, "train/loss_slope": 5.6070882924282975e-06} {"step": 78120, "timestamp": 1778278983.039186, "train/loss": 2.116549754142761, "train/z_loss": 0.0013899169280193746, "train/perplexity": 8.302442545852651, "train/grad_norm": 0.11083984375, "optim/muon_lr": 0.0064815104007720946, "optim/adamw_lr": 0.00019444531202316283, "perf/tokens_per_sec": 2027206.0393249383, "perf/iters_per_sec": 0.966647166883916, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345036268234253, "data/tokens_consumed": 163831611392, "data/tokens_consumed_B": 163.831611392, "train/loss_slope": 6.814776931909163e-06} {"step": 78130, "timestamp": 1778278993.4022717, "train/loss": 2.0710655450820923, "train/z_loss": 0.0014189696637913584, "train/perplexity": 7.93327187461539, "train/grad_norm": 0.08154296875, "optim/muon_lr": 0.006445348262786865, "optim/adamw_lr": 0.00019336044788360594, "perf/tokens_per_sec": 2024625.786478124, "perf/iters_per_sec": 0.9654168064489956, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0358220338821411, "data/tokens_consumed": 163852582912, "data/tokens_consumed_B": 163.852582912, "train/loss_slope": 5.930303361299221e-06} {"step": 78140, "timestamp": 1778279003.763404, "train/loss": 2.039884328842163, "train/z_loss": 0.0014178326935507357, "train/perplexity": 7.68971966865598, "train/grad_norm": 0.0830078125, "optim/muon_lr": 0.006409238874912262, "optim/adamw_lr": 0.00019227716624736784, "perf/tokens_per_sec": 2025665.953025681, "perf/iters_per_sec": 0.9659127965095906, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352901458740233, "data/tokens_consumed": 163873554432, "data/tokens_consumed_B": 163.873554432, "train/loss_slope": 3.7061889572898422e-06} {"step": 78150, "timestamp": 1778279014.102075, "grad/layer_0/attn": 0.002531635109335184, "grad/layer_0/mlp": 0.002821977948769927, "grad/layer_0/attn_mlp_ratio": 0.8971136789808479, "grad/layer_4/attn": 0.0019049885449931026, "grad/layer_4/mlp": 0.0024901311844587326, "grad/layer_4/attn_mlp_ratio": 0.7650153053706084, "grad/layer_8/attn": 0.004934626165777445, "grad/layer_8/mlp": 0.0034725405275821686, "grad/layer_8/attn_mlp_ratio": 1.4210420251334208, "grad/layer_12/attn": 0.003930808510631323, "grad/layer_12/mlp": 0.006496730260550976, "grad/layer_12/attn_mlp_ratio": 0.605044121039673, "grad/layer_16/attn": 0.0029760634060949087, "grad/layer_16/mlp": 0.004340259823948145, "grad/layer_16/attn_mlp_ratio": 0.6856878293564764, "grad/layer_20/attn": 0.0032309212256222963, "grad/layer_20/mlp": 0.005591080989688635, "grad/layer_20/attn_mlp_ratio": 0.577870571682627, "grad/layer_24/attn": 0.0062190773896873, "grad/layer_24/mlp": 0.0070910085923969746, "grad/layer_24/attn_mlp_ratio": 0.8770370562872719, "grad/layer_27/attn": 0.005185927264392376, "grad/layer_27/mlp": 0.00596960261464119, "grad/layer_27/attn_mlp_ratio": 0.868722344231258} {"step": 78150, "timestamp": 1778279014.708323, "eos/sharpness": 6.290602684020995, "eos/L0_probe": 1.911952018737793, "eos/L_plus": 1.9465690851211548, "eos/L_minus": 1.9402409791946411, "eos/grad_norm": 0.08389148861169815, "eos/embed_grad_frac": 0.260479599237442, "eos/time_s": 0.6034588813781738} {"step": 78150, "timestamp": 1778279014.7276707, "train/loss": 2.144719398021698, "train/z_loss": 0.001406658929772675, "train/perplexity": 8.539644659882079, "train/grad_norm": 0.083984375, "optim/muon_lr": 0.006373182237148285, "optim/adamw_lr": 0.00019119546711444854, "perf/tokens_per_sec": 1913484.739121652, "perf/iters_per_sec": 0.9124206252678165, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.095985746383667, "data/tokens_consumed": 163894525952, "data/tokens_consumed_B": 163.894525952, "train/loss_slope": 7.431586490462816e-06} {"step": 78150, "timestamp": 1778279016.0933685, "geo/rankme_last": 440.81085205078125, "geo/layer_0/stable_rank_q_proj": 18.736072540283203, "geo/layer_0/stable_rank_k_proj": 15.714790344238281, "geo/layer_0/stable_rank_o_proj": 46.341400146484375, "geo/layer_0/stable_rank_gate_proj": 127.2337646484375, "geo/layer_0/stable_rank_down_proj": 56.78651809692383, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.062641441822052, "geo/layer_0/attn_entropy_mean": 6.126427173614502, "geo/layer_0/attn_entropy_std": 0.4286174178123474, "geo/layer_7/stable_rank_q_proj": 42.57887268066406, "geo/layer_7/stable_rank_k_proj": 40.02919387817383, "geo/layer_7/stable_rank_o_proj": 87.9465103149414, "geo/layer_7/stable_rank_gate_proj": 76.84566497802734, "geo/layer_7/stable_rank_down_proj": 140.2491455078125, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.44843411445617676, "geo/layer_7/attn_entropy_mean": 4.6356916427612305, "geo/layer_7/attn_entropy_std": 0.7682176232337952, "geo/layer_14/stable_rank_q_proj": 49.20389938354492, "geo/layer_14/stable_rank_k_proj": 41.5169563293457, "geo/layer_14/stable_rank_o_proj": 43.12776184082031, "geo/layer_14/stable_rank_gate_proj": 70.25534057617188, "geo/layer_14/stable_rank_down_proj": 125.0118637084961, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.40312784910202026, "geo/layer_14/attn_entropy_mean": 5.479554176330566, "geo/layer_14/attn_entropy_std": 0.41678187251091003, "geo/layer_21/stable_rank_q_proj": 39.427337646484375, "geo/layer_21/stable_rank_k_proj": 30.049610137939453, "geo/layer_21/stable_rank_o_proj": 67.91758728027344, "geo/layer_21/stable_rank_gate_proj": 62.96178436279297, "geo/layer_21/stable_rank_down_proj": 49.24332809448242, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14959150552749634, "geo/layer_21/attn_entropy_mean": 5.681618690490723, "geo/layer_21/attn_entropy_std": 0.2857418656349182, "geo/layer_27/stable_rank_q_proj": 43.82831954956055, "geo/layer_27/stable_rank_k_proj": 32.38823318481445, "geo/layer_27/stable_rank_o_proj": 115.20244598388672, "geo/layer_27/stable_rank_gate_proj": 76.35334777832031, "geo/layer_27/stable_rank_down_proj": 127.79077911376953, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0826057568192482, "geo/layer_27/attn_entropy_mean": 4.145925521850586, "geo/layer_27/attn_entropy_std": 0.7950131893157959, "attnres/final_alpha/block_0": 0.24110953509807587, "attnres/block_norm/0": 1.7791703939437866, "attnres/final_alpha/block_1": 0.003938950598239899, "attnres/block_norm/1": 47535.68359375, "attnres/final_alpha/block_2": 0.009167401120066643, "attnres/block_norm/2": 29157.85546875, "attnres/final_alpha/block_3": 0.010752083733677864, "attnres/block_norm/3": 63777.4140625, "attnres/final_alpha/block_4": 0.012819979339838028, "attnres/block_norm/4": 15716.6171875, "attnres/final_alpha/block_5": 0.6154446601867676, "attnres/block_norm/5": 6814.1591796875, "attnres/final_alpha/block_6": 0.10676737129688263, "attnres/block_norm/6": 40791.359375, "geo/tier1_time_s": 1.3614208698272705, "geo/step": 78150.0, "geo/rankme_slope": 0.0006028405698216787} {"step": 78160, "timestamp": 1778279026.4415433, "train/loss": 2.0760940074920655, "train/z_loss": 0.0013931188033893705, "train/perplexity": 7.973264500463662, "train/grad_norm": 0.0830078125, "optim/muon_lr": 0.006337181031703949, "optim/adamw_lr": 0.00019011543095111846, "perf/tokens_per_sec": 1790951.6164771833, "perf/iters_per_sec": 0.8539922792802731, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1709707736968995, "data/tokens_consumed": 163915497472, "data/tokens_consumed_B": 163.915497472, "train/loss_slope": 6.218371082274977e-06} {"step": 78170, "timestamp": 1778279036.7904625, "train/loss": 2.124366855621338, "train/z_loss": 0.0014058182365261017, "train/perplexity": 8.367597913024674, "train/grad_norm": 0.078125, "optim/muon_lr": 0.006301234662532807, "optim/adamw_lr": 0.0001890370398759842, "perf/tokens_per_sec": 2027529.208687645, "perf/iters_per_sec": 0.9668012660444474, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343387365341186, "data/tokens_consumed": 163936468992, "data/tokens_consumed_B": 163.936468992, "train/loss_slope": 8.880252942572528e-06} {"step": 78180, "timestamp": 1778279047.1380265, "train/loss": 2.07502521276474, "train/z_loss": 0.001420996431261301, "train/perplexity": 7.964747269802392, "train/grad_norm": 0.07666015625, "optim/muon_lr": 0.006265343427658081, "optim/adamw_lr": 0.0001879603028297424, "perf/tokens_per_sec": 2027650.2134307493, "perf/iters_per_sec": 0.9668589656022784, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342770099639893, "data/tokens_consumed": 163957440512, "data/tokens_consumed_B": 163.957440512, "train/loss_slope": 6.315458468263503e-06} {"step": 78190, "timestamp": 1778279057.4875922, "train/loss": 2.1085992455482483, "train/z_loss": 0.0013965193880721926, "train/perplexity": 8.23669561214199, "train/grad_norm": 0.07470703125, "optim/muon_lr": 0.006229507923126221, "optim/adamw_lr": 0.0001868852376937866, "perf/tokens_per_sec": 2027128.766832671, "perf/iters_per_sec": 0.966610320488296, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345430612564086, "data/tokens_consumed": 163978412032, "data/tokens_consumed_B": 163.978412032, "train/loss_slope": 7.067216125795009e-06} {"step": 78200, "timestamp": 1778279067.828964, "grad/layer_0/attn": 0.0024612718261778355, "grad/layer_0/mlp": 0.002850201213732362, "grad/layer_0/attn_mlp_ratio": 0.863543151958898, "grad/layer_4/attn": 0.0029020891524851322, "grad/layer_4/mlp": 0.0026226171758025885, "grad/layer_4/attn_mlp_ratio": 1.1065622038187093, "grad/layer_8/attn": 0.00640972051769495, "grad/layer_8/mlp": 0.0034773563966155052, "grad/layer_8/attn_mlp_ratio": 1.8432739133688194, "grad/layer_12/attn": 0.005036446265876293, "grad/layer_12/mlp": 0.006528142839670181, "grad/layer_12/attn_mlp_ratio": 0.7714975472229393, "grad/layer_16/attn": 0.0034548987168818712, "grad/layer_16/mlp": 0.004622716456651688, "grad/layer_16/attn_mlp_ratio": 0.7473741196419608, "grad/layer_20/attn": 0.003475778503343463, "grad/layer_20/mlp": 0.005434057209640741, "grad/layer_20/attn_mlp_ratio": 0.6396286062675433, "grad/layer_24/attn": 0.004403563681989908, "grad/layer_24/mlp": 0.007002950645983219, "grad/layer_24/attn_mlp_ratio": 0.6288154581860685, "grad/layer_27/attn": 0.005067606922239065, "grad/layer_27/mlp": 0.005853394977748394, "grad/layer_27/attn_mlp_ratio": 0.8657551480684275} {"step": 78200, "timestamp": 1778279067.844712, "train/loss": 2.0894975900650024, "train/z_loss": 0.0014099186984822154, "train/perplexity": 8.080854242811972, "train/grad_norm": 0.08984375, "optim/muon_lr": 0.006193728744983674, "optim/adamw_lr": 0.0001858118623495102, "perf/tokens_per_sec": 2025889.5681697144, "perf/iters_per_sec": 0.9660194245194027, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03517587184906, "data/tokens_consumed": 163999383552, "data/tokens_consumed_B": 163.999383552, "train/loss_slope": 6.7551631595578195e-06} {"step": 78210, "timestamp": 1778279078.1957285, "train/loss": 2.1092031478881834, "train/z_loss": 0.0014153587864711881, "train/perplexity": 8.241671274151242, "train/grad_norm": 0.07666015625, "optim/muon_lr": 0.0061580055952072145, "optim/adamw_lr": 0.00018474016785621642, "perf/tokens_per_sec": 2027307.3340773322, "perf/iters_per_sec": 0.9666954679857884, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344519376754762, "data/tokens_consumed": 164020355072, "data/tokens_consumed_B": 164.020355072, "train/loss_slope": 6.553789569993192e-06} {"step": 78220, "timestamp": 1778279088.551863, "train/loss": 2.1150613903999327, "train/z_loss": 0.0014068473363295197, "train/perplexity": 8.29009468272582, "train/grad_norm": 0.0908203125, "optim/muon_lr": 0.006122340857982636, "optim/adamw_lr": 0.00018367022573947904, "perf/tokens_per_sec": 2026304.3642849433, "perf/iters_per_sec": 0.9662172147202222, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349639654159546, "data/tokens_consumed": 164041326592, "data/tokens_consumed_B": 164.041326592, "train/loss_slope": 6.651462694563486e-06} {"step": 78225, "timestamp": 1778279094.3255012, "eos/sharpness": 19.81129646301269, "eos/L0_probe": 1.909708023071289, "eos/L_plus": 2.0096850395202637, "eos/L_minus": 2.0078439712524414, "eos/grad_norm": 0.08396900445222855, "eos/embed_grad_frac": 0.2577657401561737, "eos/time_s": 0.6092696189880371} {"step": 78225, "timestamp": 1778279095.7010655, "geo/rankme_last": 440.5448303222656, "geo/layer_0/stable_rank_q_proj": 18.73151397705078, "geo/layer_0/stable_rank_k_proj": 15.710417747497559, "geo/layer_0/stable_rank_o_proj": 46.34885787963867, "geo/layer_0/stable_rank_gate_proj": 127.23031616210938, "geo/layer_0/stable_rank_down_proj": 56.77436065673828, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06614384055137634, "geo/layer_0/attn_entropy_mean": 6.126478672027588, "geo/layer_0/attn_entropy_std": 0.42816755175590515, "geo/layer_7/stable_rank_q_proj": 42.55975341796875, "geo/layer_7/stable_rank_k_proj": 40.027381896972656, "geo/layer_7/stable_rank_o_proj": 87.91029357910156, "geo/layer_7/stable_rank_gate_proj": 76.82501983642578, "geo/layer_7/stable_rank_down_proj": 140.24342346191406, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.44934552907943726, "geo/layer_7/attn_entropy_mean": 4.641575813293457, "geo/layer_7/attn_entropy_std": 0.7748180627822876, "geo/layer_14/stable_rank_q_proj": 49.19049835205078, "geo/layer_14/stable_rank_k_proj": 41.53739929199219, "geo/layer_14/stable_rank_o_proj": 43.12832260131836, "geo/layer_14/stable_rank_gate_proj": 70.2647705078125, "geo/layer_14/stable_rank_down_proj": 125.03704833984375, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.39694488048553467, "geo/layer_14/attn_entropy_mean": 5.490130424499512, "geo/layer_14/attn_entropy_std": 0.4129992127418518, "geo/layer_21/stable_rank_q_proj": 39.42877960205078, "geo/layer_21/stable_rank_k_proj": 30.064044952392578, "geo/layer_21/stable_rank_o_proj": 67.91034698486328, "geo/layer_21/stable_rank_gate_proj": 62.95161056518555, "geo/layer_21/stable_rank_down_proj": 49.252716064453125, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15221896767616272, "geo/layer_21/attn_entropy_mean": 5.681046962738037, "geo/layer_21/attn_entropy_std": 0.28813764452934265, "geo/layer_27/stable_rank_q_proj": 43.829288482666016, "geo/layer_27/stable_rank_k_proj": 32.37852478027344, "geo/layer_27/stable_rank_o_proj": 115.21177673339844, "geo/layer_27/stable_rank_gate_proj": 76.37459564208984, "geo/layer_27/stable_rank_down_proj": 127.69442749023438, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0878170058131218, "geo/layer_27/attn_entropy_mean": 4.144315719604492, "geo/layer_27/attn_entropy_std": 0.7937635779380798, "attnres/final_alpha/block_0": 0.2420051097869873, "attnres/block_norm/0": 1.7791171073913574, "attnres/final_alpha/block_1": 0.003941297996789217, "attnres/block_norm/1": 47584.21875, "attnres/final_alpha/block_2": 0.009185373783111572, "attnres/block_norm/2": 29089.1953125, "attnres/final_alpha/block_3": 0.010785629972815514, "attnres/block_norm/3": 63634.19921875, "attnres/final_alpha/block_4": 0.012798518873751163, "attnres/block_norm/4": 15732.423828125, "attnres/final_alpha/block_5": 0.6134223341941833, "attnres/block_norm/5": 6818.94775390625, "attnres/final_alpha/block_6": 0.10786174237728119, "attnres/block_norm/6": 40921.46875, "geo/tier1_time_s": 1.3560268878936768, "geo/step": 78225.0, "geo/rankme_slope": 0.0006136061455832333} {"step": 78230, "timestamp": 1778279100.878906, "train/loss": 2.0580644845962524, "train/z_loss": 0.0014022483141161502, "train/perplexity": 7.830798501780178, "train/grad_norm": 0.0693359375, "optim/muon_lr": 0.006086733937263489, "optim/adamw_lr": 0.00018260201811790464, "perf/tokens_per_sec": 1701849.1339869685, "perf/iters_per_sec": 0.8115049047407954, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2322784423828126, "data/tokens_consumed": 164062298112, "data/tokens_consumed_B": 164.062298112, "train/loss_slope": 4.089327699745328e-06} {"step": 78240, "timestamp": 1778279111.2290704, "train/loss": 2.0501429557800295, "train/z_loss": 0.0014181187027134002, "train/perplexity": 7.76901165204619, "train/grad_norm": 0.09326171875, "optim/muon_lr": 0.006051185727119446, "optim/adamw_lr": 0.00018153557181358336, "perf/tokens_per_sec": 2027433.2190608515, "perf/iters_per_sec": 0.9667554946235902, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343877077102661, "data/tokens_consumed": 164083269632, "data/tokens_consumed_B": 164.083269632, "train/loss_slope": 6.621802421860586e-08} {"step": 78250, "timestamp": 1778279121.5781987, "grad/layer_0/attn": 0.00238671712577343, "grad/layer_0/mlp": 0.0025613021571189165, "grad/layer_0/attn_mlp_ratio": 0.9318373570084348, "grad/layer_4/attn": 0.0018935592379420996, "grad/layer_4/mlp": 0.00236140307970345, "grad/layer_4/attn_mlp_ratio": 0.801878838064365, "grad/layer_8/attn": 0.004392425995320082, "grad/layer_8/mlp": 0.0034320438280701637, "grad/layer_8/attn_mlp_ratio": 1.2798280230025902, "grad/layer_12/attn": 0.003997977823019028, "grad/layer_12/mlp": 0.006167916115373373, "grad/layer_12/attn_mlp_ratio": 0.6481893857530313, "grad/layer_16/attn": 0.0030427398160099983, "grad/layer_16/mlp": 0.00409517390653491, "grad/layer_16/attn_mlp_ratio": 0.7430062339608816, "grad/layer_20/attn": 0.003913471009582281, "grad/layer_20/mlp": 0.005142735317349434, "grad/layer_20/attn_mlp_ratio": 0.7609707076082255, "grad/layer_24/attn": 0.007465060334652662, "grad/layer_24/mlp": 0.007117427419871092, "grad/layer_24/attn_mlp_ratio": 1.0488424804904606, "grad/layer_27/attn": 0.004080051090568304, "grad/layer_27/mlp": 0.005797834601253271, "grad/layer_27/attn_mlp_ratio": 0.7037198024438939} {"step": 78250, "timestamp": 1778279121.594032, "train/loss": 2.0889541029930117, "train/z_loss": 0.0014134895289316773, "train/perplexity": 8.076463596238263, "train/grad_norm": 0.0869140625, "optim/muon_lr": 0.0060156965255737304, "optim/adamw_lr": 0.0001804708957672119, "perf/tokens_per_sec": 2024522.9888317988, "perf/iters_per_sec": 0.9653677887114519, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0358746290206908, "data/tokens_consumed": 164104241152, "data/tokens_consumed_B": 164.104241152, "train/loss_slope": 1.7389450565387093e-06} {"step": 78260, "timestamp": 1778279131.9452548, "train/loss": 2.0458736419677734, "train/z_loss": 0.0014058691915124655, "train/perplexity": 7.735914005684577, "train/grad_norm": 0.07763671875, "optim/muon_lr": 0.005980266630649567, "optim/adamw_lr": 0.00017940799891948698, "perf/tokens_per_sec": 2027603.053047376, "perf/iters_per_sec": 0.9668364777790909, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343010663986205, "data/tokens_consumed": 164125212672, "data/tokens_consumed_B": 164.125212672, "train/loss_slope": -3.433562355144533e-06} {"step": 78270, "timestamp": 1778279142.2964284, "train/loss": 2.0897067546844483, "train/z_loss": 0.0014130081748589874, "train/perplexity": 8.082544648394826, "train/grad_norm": 0.07275390625, "optim/muon_lr": 0.005944896042346955, "optim/adamw_lr": 0.0001783468812704086, "perf/tokens_per_sec": 2027114.56503239, "perf/iters_per_sec": 0.9666035485422086, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345503091812134, "data/tokens_consumed": 164146184192, "data/tokens_consumed_B": 164.146184192, "train/loss_slope": -3.190257962029748e-06} {"step": 78280, "timestamp": 1778279152.6480289, "train/loss": 2.0963293313980103, "train/z_loss": 0.0014106375630944968, "train/perplexity": 8.13624955652191, "train/grad_norm": 0.07861328125, "optim/muon_lr": 0.005909586846828461, "optim/adamw_lr": 0.0001772876054048538, "perf/tokens_per_sec": 2026821.1835432155, "perf/iters_per_sec": 0.9664636533466413, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034700059890747, "data/tokens_consumed": 164167155712, "data/tokens_consumed_B": 164.167155712, "train/loss_slope": -1.7520058809584522e-06} {"step": 78290, "timestamp": 1778279162.9952128, "train/loss": 2.115871524810791, "train/z_loss": 0.0014105659443885087, "train/perplexity": 8.29681349490057, "train/grad_norm": 0.07958984375, "optim/muon_lr": 0.005874339044094086, "optim/adamw_lr": 0.00017623017132282256, "perf/tokens_per_sec": 2027756.4141199144, "perf/iters_per_sec": 0.9669096060370991, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342228412628174, "data/tokens_consumed": 164188127232, "data/tokens_consumed_B": 164.188127232, "train/loss_slope": -2.4088984883444907e-07} {"step": 78300, "timestamp": 1778279173.3320503, "grad/layer_0/attn": 0.002903223969042301, "grad/layer_0/mlp": 0.003137412015348673, "grad/layer_0/attn_mlp_ratio": 0.9253562688941342, "grad/layer_4/attn": 0.0024902592413127422, "grad/layer_4/mlp": 0.002548821968957782, "grad/layer_4/attn_mlp_ratio": 0.9770235716497127, "grad/layer_8/attn": 0.0036433276254683733, "grad/layer_8/mlp": 0.00361935468390584, "grad/layer_8/attn_mlp_ratio": 1.0066235124749674, "grad/layer_12/attn": 0.0049919323064386845, "grad/layer_12/mlp": 0.006516699213534594, "grad/layer_12/attn_mlp_ratio": 0.7660215802915586, "grad/layer_16/attn": 0.0033446813467890024, "grad/layer_16/mlp": 0.004305724054574966, "grad/layer_16/attn_mlp_ratio": 0.7767987977667298, "grad/layer_20/attn": 0.003305198159068823, "grad/layer_20/mlp": 0.004784774035215378, "grad/layer_20/attn_mlp_ratio": 0.6907741234310207, "grad/layer_24/attn": 0.004140151664614677, "grad/layer_24/mlp": 0.006831093691289425, "grad/layer_24/attn_mlp_ratio": 0.6060744869136382, "grad/layer_27/attn": 0.004090174566954374, "grad/layer_27/mlp": 0.005877618677914143, "grad/layer_27/attn_mlp_ratio": 0.6958897338363788} {"step": 78300, "timestamp": 1778279173.941557, "eos/sharpness": 17.040443420410153, "eos/L0_probe": 1.9108513593673706, "eos/L_plus": 1.995545744895935, "eos/L_minus": 1.9965614080429077, "eos/grad_norm": 0.07796606421470642, "eos/embed_grad_frac": 0.3186914622783661, "eos/time_s": 0.6065704822540283} {"step": 78300, "timestamp": 1778279173.9608848, "train/loss": 2.013278102874756, "train/z_loss": 0.0014280707691796124, "train/perplexity": 7.487823013473256, "train/grad_norm": 0.078125, "optim/muon_lr": 0.00583915263414383, "optim/adamw_lr": 0.00017517457902431488, "perf/tokens_per_sec": 1913167.4800133642, "perf/iters_per_sec": 0.9122693443362065, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0961674928665162, "data/tokens_consumed": 164209098752, "data/tokens_consumed_B": 164.209098752, "train/loss_slope": -3.910164878849806e-06} {"step": 78300, "timestamp": 1778279175.3294094, "geo/rankme_last": 440.729736328125, "geo/layer_0/stable_rank_q_proj": 18.729869842529297, "geo/layer_0/stable_rank_k_proj": 15.710765838623047, "geo/layer_0/stable_rank_o_proj": 46.34754943847656, "geo/layer_0/stable_rank_gate_proj": 127.19776153564453, "geo/layer_0/stable_rank_down_proj": 56.79220962524414, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06337732821702957, "geo/layer_0/attn_entropy_mean": 6.125842571258545, "geo/layer_0/attn_entropy_std": 0.4280517101287842, "geo/layer_7/stable_rank_q_proj": 42.56258773803711, "geo/layer_7/stable_rank_k_proj": 40.049373626708984, "geo/layer_7/stable_rank_o_proj": 87.90254974365234, "geo/layer_7/stable_rank_gate_proj": 76.82213592529297, "geo/layer_7/stable_rank_down_proj": 140.2902069091797, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4425959289073944, "geo/layer_7/attn_entropy_mean": 4.643362045288086, "geo/layer_7/attn_entropy_std": 0.7678314447402954, "geo/layer_14/stable_rank_q_proj": 49.181400299072266, "geo/layer_14/stable_rank_k_proj": 41.52153396606445, "geo/layer_14/stable_rank_o_proj": 43.130035400390625, "geo/layer_14/stable_rank_gate_proj": 70.2752456665039, "geo/layer_14/stable_rank_down_proj": 125.05958557128906, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.40984809398651123, "geo/layer_14/attn_entropy_mean": 5.49329948425293, "geo/layer_14/attn_entropy_std": 0.41147395968437195, "geo/layer_21/stable_rank_q_proj": 39.42099380493164, "geo/layer_21/stable_rank_k_proj": 30.04703140258789, "geo/layer_21/stable_rank_o_proj": 67.92578125, "geo/layer_21/stable_rank_gate_proj": 62.951171875, "geo/layer_21/stable_rank_down_proj": 49.24812698364258, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15003864467144012, "geo/layer_21/attn_entropy_mean": 5.684878349304199, "geo/layer_21/attn_entropy_std": 0.2861688733100891, "geo/layer_27/stable_rank_q_proj": 43.82085037231445, "geo/layer_27/stable_rank_k_proj": 32.37432098388672, "geo/layer_27/stable_rank_o_proj": 115.20199584960938, "geo/layer_27/stable_rank_gate_proj": 76.3529052734375, "geo/layer_27/stable_rank_down_proj": 127.69447326660156, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0904965028166771, "geo/layer_27/attn_entropy_mean": 4.148624420166016, "geo/layer_27/attn_entropy_std": 0.8023253083229065, "attnres/final_alpha/block_0": 0.24243606626987457, "attnres/block_norm/0": 1.7791163921356201, "attnres/final_alpha/block_1": 0.003942941315472126, "attnres/block_norm/1": 47573.98828125, "attnres/final_alpha/block_2": 0.009238146245479584, "attnres/block_norm/2": 29144.04296875, "attnres/final_alpha/block_3": 0.01084334310144186, "attnres/block_norm/3": 63723.8984375, "attnres/final_alpha/block_4": 0.012975700199604034, "attnres/block_norm/4": 15736.55078125, "attnres/final_alpha/block_5": 0.6127172708511353, "attnres/block_norm/5": 6848.68017578125, "attnres/final_alpha/block_6": 0.10784652829170227, "attnres/block_norm/6": 40848.078125, "geo/tier1_time_s": 1.3647935390472412, "geo/step": 78300.0, "geo/rankme_slope": 0.0006236325780312125} {"step": 78310, "timestamp": 1778279185.68346, "train/loss": 2.0229417204856874, "train/z_loss": 0.0014190569054335356, "train/perplexity": 7.5605332278286355, "train/grad_norm": 0.0791015625, "optim/muon_lr": 0.005804028809070588, "optim/adamw_lr": 0.0001741208642721176, "perf/tokens_per_sec": 1789546.2960573656, "perf/iters_per_sec": 0.8533221702849224, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1718903303146362, "data/tokens_consumed": 164230070272, "data/tokens_consumed_B": 164.230070272, "train/loss_slope": -6.300288994963954e-06} {"step": 78320, "timestamp": 1778279196.033738, "train/loss": 2.099384343624115, "train/z_loss": 0.0014111737255007029, "train/perplexity": 8.16114390529988, "train/grad_norm": 0.0810546875, "optim/muon_lr": 0.005768966376781464, "optim/adamw_lr": 0.0001730689913034439, "perf/tokens_per_sec": 2027042.1109184974, "perf/iters_per_sec": 0.96656899972844, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034587287902832, "data/tokens_consumed": 164251041792, "data/tokens_consumed_B": 164.251041792, "train/loss_slope": -6.434681005675332e-06} {"step": 78330, "timestamp": 1778279206.38165, "train/loss": 2.0555743932724, "train/z_loss": 0.001423431688454002, "train/perplexity": 7.811323355881035, "train/grad_norm": 0.1181640625, "optim/muon_lr": 0.00573396921157837, "optim/adamw_lr": 0.00017201907634735106, "perf/tokens_per_sec": 2027500.9342651318, "perf/iters_per_sec": 0.9667877837491664, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343531608581542, "data/tokens_consumed": 164272013312, "data/tokens_consumed_B": 164.272013312, "train/loss_slope": -4.58558680164968e-06} {"step": 78340, "timestamp": 1778279216.7645879, "train/loss": 2.1137660145759583, "train/z_loss": 0.0014235853450372815, "train/perplexity": 8.27936284687638, "train/grad_norm": 0.07177734375, "optim/muon_lr": 0.005699033737182617, "optim/adamw_lr": 0.0001709710121154785, "perf/tokens_per_sec": 2021180.2612483432, "perf/iters_per_sec": 0.9637738519899097, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0375878095626831, "data/tokens_consumed": 164292984832, "data/tokens_consumed_B": 164.292984832, "train/loss_slope": -3.6100464399390997e-06} {"step": 78350, "timestamp": 1778279227.1338084, "grad/layer_0/attn": 0.0027321618981659412, "grad/layer_0/mlp": 0.002714073285460472, "grad/layer_0/attn_mlp_ratio": 1.0066647102478403, "grad/layer_4/attn": 0.0024904662277549505, "grad/layer_4/mlp": 0.002608745824545622, "grad/layer_4/attn_mlp_ratio": 0.9546603233079233, "grad/layer_8/attn": 0.0026441693771630526, "grad/layer_8/mlp": 0.0034571546129882336, "grad/layer_8/attn_mlp_ratio": 0.7648397589003278, "grad/layer_12/attn": 0.0037330070044845343, "grad/layer_12/mlp": 0.0069862003438174725, "grad/layer_12/attn_mlp_ratio": 0.5343400943767805, "grad/layer_16/attn": 0.0032009875867515802, "grad/layer_16/mlp": 0.004359476733952761, "grad/layer_16/attn_mlp_ratio": 0.7342595702817919, "grad/layer_20/attn": 0.0032129453029483557, "grad/layer_20/mlp": 0.0055939070880413055, "grad/layer_20/attn_mlp_ratio": 0.574365143171666, "grad/layer_24/attn": 0.004837743006646633, "grad/layer_24/mlp": 0.006533993873745203, "grad/layer_24/attn_mlp_ratio": 0.7403960006828258, "grad/layer_27/attn": 0.004509531427174807, "grad/layer_27/mlp": 0.005673966836184263, "grad/layer_27/attn_mlp_ratio": 0.7947757676232534} {"step": 78350, "timestamp": 1778279227.1497452, "train/loss": 2.0266674518585206, "train/z_loss": 0.0014094073441810906, "train/perplexity": 7.5887542830614585, "train/grad_norm": 0.06982421875, "optim/muon_lr": 0.00566416472196579, "optim/adamw_lr": 0.00016992494165897368, "perf/tokens_per_sec": 2020607.9204805926, "perf/iters_per_sec": 0.9635009386446918, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0378817081451417, "data/tokens_consumed": 164313956352, "data/tokens_consumed_B": 164.313956352, "train/loss_slope": -9.560610151419649e-06} {"step": 78360, "timestamp": 1778279237.546346, "train/loss": 2.119420790672302, "train/z_loss": 0.0014228070736862718, "train/perplexity": 8.326313412354072, "train/grad_norm": 0.0732421875, "optim/muon_lr": 0.0056293585896492, "optim/adamw_lr": 0.000168880757689476, "perf/tokens_per_sec": 2018518.0370344575, "perf/iters_per_sec": 0.962504404561261, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0389562845230103, "data/tokens_consumed": 164334927872, "data/tokens_consumed_B": 164.334927872, "train/loss_slope": -9.02374933118617e-06} {"step": 78370, "timestamp": 1778279247.9289083, "train/loss": 2.1027884483337402, "train/z_loss": 0.0014072254183702172, "train/perplexity": 8.188972632781878, "train/grad_norm": 0.08154296875, "optim/muon_lr": 0.005594617426395416, "optim/adamw_lr": 0.00016783852279186247, "perf/tokens_per_sec": 2021206.4089966724, "perf/iters_per_sec": 0.9637863202079164, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0375743865966798, "data/tokens_consumed": 164355899392, "data/tokens_consumed_B": 164.355899392, "train/loss_slope": -7.197350422755905e-06} {"step": 78375, "timestamp": 1778279253.737365, "eos/sharpness": 18.49102973937988, "eos/L0_probe": 1.9106357097625732, "eos/L_plus": 2.019629955291748, "eos/L_minus": 1.9865517616271973, "eos/grad_norm": 0.07496609538793564, "eos/embed_grad_frac": 0.29126399755477905, "eos/time_s": 0.6265203952789307} {"step": 78375, "timestamp": 1778279255.124479, "geo/rankme_last": 440.667724609375, "geo/layer_0/stable_rank_q_proj": 18.726612091064453, "geo/layer_0/stable_rank_k_proj": 15.709694862365723, "geo/layer_0/stable_rank_o_proj": 46.34122848510742, "geo/layer_0/stable_rank_gate_proj": 127.24809265136719, "geo/layer_0/stable_rank_down_proj": 56.811885833740234, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.061258915811777115, "geo/layer_0/attn_entropy_mean": 6.125519752502441, "geo/layer_0/attn_entropy_std": 0.42817628383636475, "geo/layer_7/stable_rank_q_proj": 42.560272216796875, "geo/layer_7/stable_rank_k_proj": 40.05632019042969, "geo/layer_7/stable_rank_o_proj": 87.9045639038086, "geo/layer_7/stable_rank_gate_proj": 76.82616424560547, "geo/layer_7/stable_rank_down_proj": 140.29043579101562, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4310731589794159, "geo/layer_7/attn_entropy_mean": 4.648523807525635, "geo/layer_7/attn_entropy_std": 0.7710154056549072, "geo/layer_14/stable_rank_q_proj": 49.166709899902344, "geo/layer_14/stable_rank_k_proj": 41.49666213989258, "geo/layer_14/stable_rank_o_proj": 43.12567138671875, "geo/layer_14/stable_rank_gate_proj": 70.25650787353516, "geo/layer_14/stable_rank_down_proj": 125.06584930419922, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4020422697067261, "geo/layer_14/attn_entropy_mean": 5.501481533050537, "geo/layer_14/attn_entropy_std": 0.40471917390823364, "geo/layer_21/stable_rank_q_proj": 39.41738510131836, "geo/layer_21/stable_rank_k_proj": 30.040573120117188, "geo/layer_21/stable_rank_o_proj": 67.91936492919922, "geo/layer_21/stable_rank_gate_proj": 62.94654846191406, "geo/layer_21/stable_rank_down_proj": 49.238800048828125, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15134474635124207, "geo/layer_21/attn_entropy_mean": 5.676389694213867, "geo/layer_21/attn_entropy_std": 0.28522804379463196, "geo/layer_27/stable_rank_q_proj": 43.79949951171875, "geo/layer_27/stable_rank_k_proj": 32.360801696777344, "geo/layer_27/stable_rank_o_proj": 115.25023651123047, "geo/layer_27/stable_rank_gate_proj": 76.328857421875, "geo/layer_27/stable_rank_down_proj": 127.71302032470703, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09720511734485626, "geo/layer_27/attn_entropy_mean": 4.151345252990723, "geo/layer_27/attn_entropy_std": 0.7940979599952698, "attnres/final_alpha/block_0": 0.24084588885307312, "attnres/block_norm/0": 1.7791023254394531, "attnres/final_alpha/block_1": 0.003917255438864231, "attnres/block_norm/1": 47538.27734375, "attnres/final_alpha/block_2": 0.00910193845629692, "attnres/block_norm/2": 29168.30859375, "attnres/final_alpha/block_3": 0.010637450963258743, "attnres/block_norm/3": 63723.140625, "attnres/final_alpha/block_4": 0.012705208733677864, "attnres/block_norm/4": 15726.7548828125, "attnres/final_alpha/block_5": 0.6147584915161133, "attnres/block_norm/5": 6826.9951171875, "attnres/final_alpha/block_6": 0.10803376138210297, "attnres/block_norm/6": 40955.2890625, "geo/tier1_time_s": 1.3587422370910645, "geo/step": 78375.0, "geo/rankme_slope": 0.0006305505209896459} {"step": 78380, "timestamp": 1778279260.3169427, "train/loss": 2.086939549446106, "train/z_loss": 0.0014114286983385681, "train/perplexity": 8.060209505719028, "train/grad_norm": 0.08642578125, "optim/muon_lr": 0.0055599445104599, "optim/adamw_lr": 0.00016679833531379698, "perf/tokens_per_sec": 1693875.9832855724, "perf/iters_per_sec": 0.8077030102184164, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2380788326263428, "data/tokens_consumed": 164376870912, "data/tokens_consumed_B": 164.376870912, "train/loss_slope": -6.0689553461953184e-06} {"step": 78390, "timestamp": 1778279270.6978543, "train/loss": 2.0570164561271667, "train/z_loss": 0.0014171169023029506, "train/perplexity": 7.822595901044971, "train/grad_norm": 0.07568359375, "optim/muon_lr": 0.005525335669517517, "optim/adamw_lr": 0.0001657600700855255, "perf/tokens_per_sec": 2021165.4460393365, "perf/iters_per_sec": 0.9637667875477488, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0375954151153564, "data/tokens_consumed": 164397842432, "data/tokens_consumed_B": 164.397842432, "train/loss_slope": -1.042587188425415e-05} {"step": 78400, "timestamp": 1778279281.0671337, "grad/layer_0/attn": 0.0025767383631318808, "grad/layer_0/mlp": 0.0027532458771020174, "grad/layer_0/attn_mlp_ratio": 0.9358910844007032, "grad/layer_4/attn": 0.0029964421410113573, "grad/layer_4/mlp": 0.0024603509809821844, "grad/layer_4/attn_mlp_ratio": 1.2178920984785482, "grad/layer_8/attn": 0.006121940910816193, "grad/layer_8/mlp": 0.003398213069885969, "grad/layer_8/attn_mlp_ratio": 1.801517622575049, "grad/layer_12/attn": 0.0036646975204348564, "grad/layer_12/mlp": 0.00638255150988698, "grad/layer_12/attn_mlp_ratio": 0.5741743654305914, "grad/layer_16/attn": 0.0030025781597942114, "grad/layer_16/mlp": 0.004366351757198572, "grad/layer_16/attn_mlp_ratio": 0.6876628952483572, "grad/layer_20/attn": 0.0024272978771477938, "grad/layer_20/mlp": 0.004530208185315132, "grad/layer_20/attn_mlp_ratio": 0.5358027102232774, "grad/layer_24/attn": 0.0053094616159796715, "grad/layer_24/mlp": 0.006500800605863333, "grad/layer_24/attn_mlp_ratio": 0.8167396381173254, "grad/layer_27/attn": 0.00471765361726284, "grad/layer_27/mlp": 0.005587581545114517, "grad/layer_27/attn_mlp_ratio": 0.844310457886142} {"step": 78400, "timestamp": 1778279281.0832343, "train/loss": 2.136253809928894, "train/z_loss": 0.0014026191318407654, "train/perplexity": 8.467656685924938, "train/grad_norm": 0.07177734375, "optim/muon_lr": 0.005490795969963074, "optim/adamw_lr": 0.0001647238790988922, "perf/tokens_per_sec": 2020181.3946173391, "perf/iters_per_sec": 0.9632975552641578, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0381008386611938, "data/tokens_consumed": 164418813952, "data/tokens_consumed_B": 164.418813952, "train/loss_slope": -8.896103014003177e-06} {"step": 78410, "timestamp": 1778279291.4647565, "train/loss": 2.085775685310364, "train/z_loss": 0.001413506269454956, "train/perplexity": 8.050833973929683, "train/grad_norm": 0.11376953125, "optim/muon_lr": 0.005456321537494659, "optim/adamw_lr": 0.00016368964612483976, "perf/tokens_per_sec": 2021605.3040809496, "perf/iters_per_sec": 0.9639765282063244, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373696565628052, "data/tokens_consumed": 164439785472, "data/tokens_consumed_B": 164.439785472, "train/loss_slope": -1.2307518286065513e-05} {"step": 78420, "timestamp": 1778279302.349177, "train/loss": 2.0612610220909118, "train/z_loss": 0.0014126273686997592, "train/perplexity": 7.855869992436144, "train/grad_norm": 0.08447265625, "optim/muon_lr": 0.005421915054321289, "optim/adamw_lr": 0.00016265745162963866, "perf/tokens_per_sec": 1927655.2715177613, "perf/iters_per_sec": 0.9191776616658026, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0879289627075195, "data/tokens_consumed": 164460756992, "data/tokens_consumed_B": 164.460756992, "train/loss_slope": -1.2764572403361548e-05} {"step": 78430, "timestamp": 1778279312.7264762, "train/loss": 2.088900077342987, "train/z_loss": 0.001396022632252425, "train/perplexity": 8.076027271829036, "train/grad_norm": 0.07373046875, "optim/muon_lr": 0.0053875789046287535, "optim/adamw_lr": 0.0001616273671388626, "perf/tokens_per_sec": 2022041.9138943844, "perf/iters_per_sec": 0.9641847199890062, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0371456623077393, "data/tokens_consumed": 164481728512, "data/tokens_consumed_B": 164.481728512, "train/loss_slope": -1.020513431395708e-05} {"step": 78440, "timestamp": 1778279323.1074424, "train/loss": 2.071768653392792, "train/z_loss": 0.0014118672348558902, "train/perplexity": 7.938851785412425, "train/grad_norm": 0.07568359375, "optim/muon_lr": 0.00535330981016159, "optim/adamw_lr": 0.0001605992943048477, "perf/tokens_per_sec": 2021228.9346864333, "perf/iters_per_sec": 0.9637970612938086, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0375628232955934, "data/tokens_consumed": 164502700032, "data/tokens_consumed_B": 164.502700032, "train/loss_slope": -1.0602443789777696e-05} {"step": 78450, "timestamp": 1778279333.490065, "grad/layer_0/attn": 0.0028901253826916218, "grad/layer_0/mlp": 0.002857760526239872, "grad/layer_0/attn_mlp_ratio": 1.0113252160291444, "grad/layer_4/attn": 0.0018711121520027518, "grad/layer_4/mlp": 0.002617361955344677, "grad/layer_4/attn_mlp_ratio": 0.7148847245576606, "grad/layer_8/attn": 0.0038612165953963995, "grad/layer_8/mlp": 0.0036952286027371883, "grad/layer_8/attn_mlp_ratio": 1.0449195181170405, "grad/layer_12/attn": 0.004620869178324938, "grad/layer_12/mlp": 0.006146046798676252, "grad/layer_12/attn_mlp_ratio": 0.7518441128915222, "grad/layer_16/attn": 0.003512278664857149, "grad/layer_16/mlp": 0.004558838438242674, "grad/layer_16/attn_mlp_ratio": 0.7704327835683892, "grad/layer_20/attn": 0.0026558965910226107, "grad/layer_20/mlp": 0.005105312913656235, "grad/layer_20/attn_mlp_ratio": 0.5202220870529066, "grad/layer_24/attn": 0.003835426177829504, "grad/layer_24/mlp": 0.007064441218972206, "grad/layer_24/attn_mlp_ratio": 0.5429199570996669, "grad/layer_27/attn": 0.0036952761001884937, "grad/layer_27/mlp": 0.006331890355795622, "grad/layer_27/attn_mlp_ratio": 0.5835976042204241} {"step": 78450, "timestamp": 1778279334.1118689, "eos/sharpness": 8.26982259750366, "eos/L0_probe": 1.9111747741699219, "eos/L_plus": 1.9498119354248047, "eos/L_minus": 1.9552358388900757, "eos/grad_norm": 0.0790032222867012, "eos/embed_grad_frac": 0.29615849256515503, "eos/time_s": 0.6190073490142822} {"step": 78450, "timestamp": 1778279334.1318038, "train/loss": 2.0759504079818725, "train/z_loss": 0.0014225269202142953, "train/perplexity": 7.9721196257904445, "train/grad_norm": 0.0791015625, "optim/muon_lr": 0.005319112241268158, "optim/adamw_lr": 0.00015957336723804473, "perf/tokens_per_sec": 1903136.2822448479, "perf/iters_per_sec": 0.9074860964988937, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1019452571868897, "data/tokens_consumed": 164523671552, "data/tokens_consumed_B": 164.523671552, "train/loss_slope": -1.0445846940460351e-05} {"step": 78450, "timestamp": 1778279335.4966214, "geo/rankme_last": 440.56646728515625, "geo/layer_0/stable_rank_q_proj": 18.726295471191406, "geo/layer_0/stable_rank_k_proj": 15.704119682312012, "geo/layer_0/stable_rank_o_proj": 46.34171676635742, "geo/layer_0/stable_rank_gate_proj": 127.2704849243164, "geo/layer_0/stable_rank_down_proj": 56.81449890136719, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06835804879665375, "geo/layer_0/attn_entropy_mean": 6.125781059265137, "geo/layer_0/attn_entropy_std": 0.42870020866394043, "geo/layer_7/stable_rank_q_proj": 42.55641555786133, "geo/layer_7/stable_rank_k_proj": 40.05818176269531, "geo/layer_7/stable_rank_o_proj": 87.9284439086914, "geo/layer_7/stable_rank_gate_proj": 76.83216094970703, "geo/layer_7/stable_rank_down_proj": 140.2847442626953, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.44298455119132996, "geo/layer_7/attn_entropy_mean": 4.6479172706604, "geo/layer_7/attn_entropy_std": 0.7723158597946167, "geo/layer_14/stable_rank_q_proj": 49.15143585205078, "geo/layer_14/stable_rank_k_proj": 41.50365447998047, "geo/layer_14/stable_rank_o_proj": 43.11872863769531, "geo/layer_14/stable_rank_gate_proj": 70.26145935058594, "geo/layer_14/stable_rank_down_proj": 125.05410766601562, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4078054130077362, "geo/layer_14/attn_entropy_mean": 5.484976768493652, "geo/layer_14/attn_entropy_std": 0.4153803288936615, "geo/layer_21/stable_rank_q_proj": 39.40241241455078, "geo/layer_21/stable_rank_k_proj": 30.030941009521484, "geo/layer_21/stable_rank_o_proj": 67.92442321777344, "geo/layer_21/stable_rank_gate_proj": 62.93827438354492, "geo/layer_21/stable_rank_down_proj": 49.228824615478516, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15174008905887604, "geo/layer_21/attn_entropy_mean": 5.667549133300781, "geo/layer_21/attn_entropy_std": 0.28863367438316345, "geo/layer_27/stable_rank_q_proj": 43.78365707397461, "geo/layer_27/stable_rank_k_proj": 32.357025146484375, "geo/layer_27/stable_rank_o_proj": 115.2724380493164, "geo/layer_27/stable_rank_gate_proj": 76.31755828857422, "geo/layer_27/stable_rank_down_proj": 127.72274017333984, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08779478073120117, "geo/layer_27/attn_entropy_mean": 4.153897285461426, "geo/layer_27/attn_entropy_std": 0.7933391332626343, "attnres/final_alpha/block_0": 0.24158404767513275, "attnres/block_norm/0": 1.7790558338165283, "attnres/final_alpha/block_1": 0.0039253756403923035, "attnres/block_norm/1": 47572.04296875, "attnres/final_alpha/block_2": 0.0091645373031497, "attnres/block_norm/2": 29127.73046875, "attnres/final_alpha/block_3": 0.010678799822926521, "attnres/block_norm/3": 63832.9921875, "attnres/final_alpha/block_4": 0.012810411863029003, "attnres/block_norm/4": 15690.802734375, "attnres/final_alpha/block_5": 0.6140213012695312, "attnres/block_norm/5": 6832.708984375, "attnres/final_alpha/block_6": 0.10781551897525787, "attnres/block_norm/6": 40849.53515625, "geo/tier1_time_s": 1.3607451915740967, "geo/step": 78450.0, "geo/rankme_slope": 0.0006195372289540816} {"step": 78460, "timestamp": 1778279345.8746562, "train/loss": 2.0512733936309813, "train/z_loss": 0.0014365908456966281, "train/perplexity": 7.777799002723285, "train/grad_norm": 0.08251953125, "optim/muon_lr": 0.005284982323646546, "optim/adamw_lr": 0.00015854946970939634, "perf/tokens_per_sec": 1786485.9539348218, "perf/iters_per_sec": 0.8518628854440793, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1738978385925294, "data/tokens_consumed": 164544643072, "data/tokens_consumed_B": 164.544643072, "train/loss_slope": -1.1073094189721434e-05} {"step": 78470, "timestamp": 1778279356.250068, "train/loss": 2.0582523226737974, "train/z_loss": 0.0014107373775914312, "train/perplexity": 7.832269562072639, "train/grad_norm": 0.0849609375, "optim/muon_lr": 0.005250925123691559, "optim/adamw_lr": 0.00015752775371074675, "perf/tokens_per_sec": 2022129.444435466, "perf/iters_per_sec": 0.9642264578034716, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0371007680892945, "data/tokens_consumed": 164565614592, "data/tokens_consumed_B": 164.565614592, "train/loss_slope": -1.1974623816313095e-05} {"step": 78480, "timestamp": 1778279366.636265, "train/loss": 2.0562127470970153, "train/z_loss": 0.0014150464674457907, "train/perplexity": 7.816311335899247, "train/grad_norm": 0.08837890625, "optim/muon_lr": 0.005216936767101288, "optim/adamw_lr": 0.0001565081030130386, "perf/tokens_per_sec": 2020089.8572691623, "perf/iters_per_sec": 0.9632539068551838, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0381478786468505, "data/tokens_consumed": 164586586112, "data/tokens_consumed_B": 164.586586112, "train/loss_slope": -1.1444194031448488e-05} {"step": 78490, "timestamp": 1778279377.0121255, "train/loss": 2.1377586841583254, "train/z_loss": 0.0013932874076999723, "train/perplexity": 8.48040903719102, "train/grad_norm": 0.07666015625, "optim/muon_lr": 0.0051830199360847476, "optim/adamw_lr": 0.0001554905980825424, "perf/tokens_per_sec": 2022101.1344722575, "perf/iters_per_sec": 0.9642129585610664, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0371152877807617, "data/tokens_consumed": 164607557632, "data/tokens_consumed_B": 164.607557632, "train/loss_slope": -9.741343788080632e-06} {"step": 78500, "timestamp": 1778279387.3750174, "grad/layer_0/attn": 0.0028981403447687626, "grad/layer_0/mlp": 0.0029675608966499567, "grad/layer_0/attn_mlp_ratio": 0.9766068323584377, "grad/layer_4/attn": 0.0019086406100541353, "grad/layer_4/mlp": 0.0025242220144718885, "grad/layer_4/attn_mlp_ratio": 0.7561302149725665, "grad/layer_8/attn": 0.0041604056023061275, "grad/layer_8/mlp": 0.0036345082335174084, "grad/layer_8/attn_mlp_ratio": 1.144695573797121, "grad/layer_12/attn": 0.004140275530517101, "grad/layer_12/mlp": 0.0060546137392520905, "grad/layer_12/attn_mlp_ratio": 0.6838215682188805, "grad/layer_16/attn": 0.0037486322689801455, "grad/layer_16/mlp": 0.004244168289005756, "grad/layer_16/attn_mlp_ratio": 0.8832430585673121, "grad/layer_20/attn": 0.002682671183720231, "grad/layer_20/mlp": 0.004779899027198553, "grad/layer_20/attn_mlp_ratio": 0.5612401250175578, "grad/layer_24/attn": 0.0041337888687849045, "grad/layer_24/mlp": 0.006124404724687338, "grad/layer_24/attn_mlp_ratio": 0.6749698929309335, "grad/layer_27/attn": 0.003427086165174842, "grad/layer_27/mlp": 0.005720972549170256, "grad/layer_27/attn_mlp_ratio": 0.5990390752299596} {"step": 78500, "timestamp": 1778279387.3909063, "train/loss": 2.0066571474075316, "train/z_loss": 0.0014022935880348085, "train/perplexity": 7.438410231179223, "train/grad_norm": 0.07666015625, "optim/muon_lr": 0.005149177312850952, "optim/adamw_lr": 0.00015447531938552855, "perf/tokens_per_sec": 2021630.1618440787, "perf/iters_per_sec": 0.9639883813114541, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373569011688233, "data/tokens_consumed": 164628529152, "data/tokens_consumed_B": 164.628529152, "train/loss_slope": -1.4474949835300342e-05} {"step": 78500, "timestamp": 1778279394.4101312, "geo/ww_alpha_mean": 7.833897037896507, "geo/ww_alpha_std": 4.753463150162137, "geo/ww_alpha_min": 1.3429993794777995, "geo/ww_alpha_max": 31.263196281715206, "geo/ww_alpha_healthy_frac": 0.16751269035532995, "geo/ww_alpha_by_type/q_proj": 3.926224312515001, "geo/ww_alpha_by_type/k_proj": 4.469191714610375, "geo/ww_alpha_by_type/v_proj": 8.903658818369248, "geo/ww_alpha_by_type/o_proj": 9.628351080764569, "geo/ww_alpha_by_type/gate_proj": 8.04219202796352, "geo/ww_alpha_by_type/up_proj": 11.907729121492606, "geo/ww_alpha_by_type/down_proj": 8.058338837083966, "geo/twonn_id/layer_0": 0.6570327281951904, "geo/twonn_id/layer_7": 3.1845319271087646, "geo/twonn_id/layer_14": 4.730278968811035, "geo/twonn_id/layer_21": 6.834029197692871, "geo/twonn_id/layer_27": 5.384742259979248, "geo/tier2_time_s": 7.012158393859863} {"step": 78500, "timestamp": 1778279395.0702825, "eoc/jacobian_sigma/layer_0/attn": 1250.39697265625, "eoc/jacobian_sigma/layer_0/mlp": 8811.2041015625, "eoc/jacobian_sigma/layer_0": 8811.2041015625, "eoc/jacobian_sigma/layer_7/attn": 1.1519489288330078, "eoc/jacobian_sigma/layer_7/mlp": 1.918836236000061, "eoc/jacobian_sigma/layer_7": 1.918836236000061, "eoc/jacobian_sigma/layer_14/attn": 1.3956005573272705, "eoc/jacobian_sigma/layer_14/mlp": 6.937226295471191, "eoc/jacobian_sigma/layer_14": 6.937226295471191, "eoc/jacobian_sigma/layer_21/attn": 1.1052922010421753, "eoc/jacobian_sigma/layer_21/mlp": 4.1802520751953125, "eoc/jacobian_sigma/layer_21": 4.1802520751953125, "eoc/jacobian_sigma/layer_27/attn": 3.1568500995635986, "eoc/jacobian_sigma/layer_27/mlp": 27.03754425048828, "eoc/jacobian_sigma/layer_27": 27.03754425048828, "eoc/layer0_sigma": 8811.2041015625, "eoc/sigma_max": 27.03754425048828, "eoc/sigma_min": 1.918836236000061, "eoc/sigma_mean": 10.018464714288712, "eoc/time_s": 0.6533665657043457} {"step": 78510, "timestamp": 1778279405.4712405, "train/loss": 2.0778140902519224, "train/z_loss": 0.0014042865135706962, "train/perplexity": 7.986990977224344, "train/grad_norm": 0.095703125, "optim/muon_lr": 0.005115405023097992, "optim/adamw_lr": 0.00015346215069293975, "perf/tokens_per_sec": 1160246.5308135585, "perf/iters_per_sec": 0.5532486585681717, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.8075055122375487, "data/tokens_consumed": 164649500672, "data/tokens_consumed_B": 164.649500672, "train/loss_slope": -1.3912090016241218e-05} {"step": 78520, "timestamp": 1778279415.8452575, "train/loss": 2.103278696537018, "train/z_loss": 0.0014152114395983516, "train/perplexity": 8.19298824614498, "train/grad_norm": 0.0927734375, "optim/muon_lr": 0.005081707537174225, "optim/adamw_lr": 0.00015245122611522673, "perf/tokens_per_sec": 2022678.8332274328, "perf/iters_per_sec": 0.9644884267937817, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036819076538086, "data/tokens_consumed": 164670472192, "data/tokens_consumed_B": 164.670472192, "train/loss_slope": -1.8510098604693906e-05} {"step": 78525, "timestamp": 1778279421.718977, "eos/sharpness": 11.207413673400877, "eos/L0_probe": 1.9097111225128174, "eos/L_plus": 1.9721020460128784, "eos/L_minus": 1.9593943357467651, "eos/grad_norm": 0.09571098536252975, "eos/embed_grad_frac": 0.2146235704421997, "eos/time_s": 0.6963803768157959} {"step": 78525, "timestamp": 1778279423.1027887, "geo/rankme_last": 440.5118408203125, "geo/layer_0/stable_rank_q_proj": 18.728958129882812, "geo/layer_0/stable_rank_k_proj": 15.702520370483398, "geo/layer_0/stable_rank_o_proj": 46.33426284790039, "geo/layer_0/stable_rank_gate_proj": 127.27030944824219, "geo/layer_0/stable_rank_down_proj": 56.81877517700195, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06744889169931412, "geo/layer_0/attn_entropy_mean": 6.125881195068359, "geo/layer_0/attn_entropy_std": 0.42863187193870544, "geo/layer_7/stable_rank_q_proj": 42.55281448364258, "geo/layer_7/stable_rank_k_proj": 40.0566291809082, "geo/layer_7/stable_rank_o_proj": 87.932373046875, "geo/layer_7/stable_rank_gate_proj": 76.8449478149414, "geo/layer_7/stable_rank_down_proj": 140.31851196289062, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4529845714569092, "geo/layer_7/attn_entropy_mean": 4.649161338806152, "geo/layer_7/attn_entropy_std": 0.7707686424255371, "geo/layer_14/stable_rank_q_proj": 49.168453216552734, "geo/layer_14/stable_rank_k_proj": 41.496337890625, "geo/layer_14/stable_rank_o_proj": 43.11794662475586, "geo/layer_14/stable_rank_gate_proj": 70.25776672363281, "geo/layer_14/stable_rank_down_proj": 125.05883026123047, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4117692708969116, "geo/layer_14/attn_entropy_mean": 5.4967803955078125, "geo/layer_14/attn_entropy_std": 0.4144138991832733, "geo/layer_21/stable_rank_q_proj": 39.40900802612305, "geo/layer_21/stable_rank_k_proj": 30.021102905273438, "geo/layer_21/stable_rank_o_proj": 67.92973327636719, "geo/layer_21/stable_rank_gate_proj": 62.928497314453125, "geo/layer_21/stable_rank_down_proj": 49.23372268676758, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15035858750343323, "geo/layer_21/attn_entropy_mean": 5.671937465667725, "geo/layer_21/attn_entropy_std": 0.2903038263320923, "geo/layer_27/stable_rank_q_proj": 43.77059555053711, "geo/layer_27/stable_rank_k_proj": 32.3587760925293, "geo/layer_27/stable_rank_o_proj": 115.2785415649414, "geo/layer_27/stable_rank_gate_proj": 76.27227020263672, "geo/layer_27/stable_rank_down_proj": 127.73287963867188, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08459808677434921, "geo/layer_27/attn_entropy_mean": 4.140162467956543, "geo/layer_27/attn_entropy_std": 0.793914258480072, "attnres/final_alpha/block_0": 0.24054421484470367, "attnres/block_norm/0": 1.7790545225143433, "attnres/final_alpha/block_1": 0.0038998648524284363, "attnres/block_norm/1": 47543.734375, "attnres/final_alpha/block_2": 0.009123796597123146, "attnres/block_norm/2": 29202.82421875, "attnres/final_alpha/block_3": 0.010677339509129524, "attnres/block_norm/3": 63895.6328125, "attnres/final_alpha/block_4": 0.012706493958830833, "attnres/block_norm/4": 15737.880859375, "attnres/final_alpha/block_5": 0.6158756017684937, "attnres/block_norm/5": 6809.76416015625, "attnres/final_alpha/block_6": 0.10717266798019409, "attnres/block_norm/6": 40899.69140625, "geo/tier1_time_s": 1.3618438243865967, "geo/step": 78525.0, "geo/rankme_slope": 0.000612542087147359} {"step": 78530, "timestamp": 1778279428.2926047, "train/loss": 2.019648277759552, "train/z_loss": 0.0014201170182786883, "train/perplexity": 7.535674003349932, "train/grad_norm": 0.07421875, "optim/muon_lr": 0.005048081874847412, "optim/adamw_lr": 0.00015144245624542236, "perf/tokens_per_sec": 1685484.489789638, "perf/iters_per_sec": 0.803701634306735, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2442428350448609, "data/tokens_consumed": 164691443712, "data/tokens_consumed_B": 164.691443712, "train/loss_slope": -2.3175215871349158e-05} {"step": 78540, "timestamp": 1778279438.6652436, "train/loss": 2.1044902086257933, "train/z_loss": 0.0014047833159565926, "train/perplexity": 8.202920165553786, "train/grad_norm": 0.0693359375, "optim/muon_lr": 0.005014530122280121, "optim/adamw_lr": 0.00015043590366840362, "perf/tokens_per_sec": 2022746.4637858395, "perf/iters_per_sec": 0.96452067555706, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0367844104766846, "data/tokens_consumed": 164712415232, "data/tokens_consumed_B": 164.712415232, "train/loss_slope": -2.290869448730768e-05} {"step": 78550, "timestamp": 1778279449.0262048, "grad/layer_0/attn": 0.0024544214829802513, "grad/layer_0/mlp": 0.0027112106326967478, "grad/layer_0/attn_mlp_ratio": 0.9052861341172558, "grad/layer_4/attn": 0.002965555526316166, "grad/layer_4/mlp": 0.002561126137152314, "grad/layer_4/attn_mlp_ratio": 1.1579107204077266, "grad/layer_8/attn": 0.003159723710268736, "grad/layer_8/mlp": 0.0033896116074174643, "grad/layer_8/attn_mlp_ratio": 0.9321786632239716, "grad/layer_12/attn": 0.003970941063016653, "grad/layer_12/mlp": 0.006577608175575733, "grad/layer_12/attn_mlp_ratio": 0.6037059211570455, "grad/layer_16/attn": 0.0034232286270707846, "grad/layer_16/mlp": 0.004190230742096901, "grad/layer_16/attn_mlp_ratio": 0.8169546633754697, "grad/layer_20/attn": 0.002865920774638653, "grad/layer_20/mlp": 0.004815810360014439, "grad/layer_20/attn_mlp_ratio": 0.5951066385262307, "grad/layer_24/attn": 0.004953858442604542, "grad/layer_24/mlp": 0.00664946436882019, "grad/layer_24/attn_mlp_ratio": 0.7450011148767747, "grad/layer_27/attn": 0.004283247981220484, "grad/layer_27/mlp": 0.005885735619813204, "grad/layer_27/attn_mlp_ratio": 0.7277336572897334} {"step": 78550, "timestamp": 1778279449.0419927, "train/loss": 2.057926523685455, "train/z_loss": 0.0014078729902394116, "train/perplexity": 7.829718232205804, "train/grad_norm": 0.08837890625, "optim/muon_lr": 0.004981054663658142, "optim/adamw_lr": 0.00014943163990974425, "perf/tokens_per_sec": 2021942.3530489171, "perf/iters_per_sec": 0.9641372456783853, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037196731567383, "data/tokens_consumed": 164733386752, "data/tokens_consumed_B": 164.733386752, "train/loss_slope": -2.4866778407767916e-05} {"step": 78560, "timestamp": 1778279459.3986998, "train/loss": 2.086079788208008, "train/z_loss": 0.001402655232232064, "train/perplexity": 8.053282628172159, "train/grad_norm": 0.095703125, "optim/muon_lr": 0.004947652816772461, "optim/adamw_lr": 0.0001484295845031738, "perf/tokens_per_sec": 2025824.8066091305, "perf/iters_per_sec": 0.965988543800893, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352089643478393, "data/tokens_consumed": 164754358272, "data/tokens_consumed_B": 164.754358272, "train/loss_slope": -2.6983923136633635e-05} {"step": 78570, "timestamp": 1778279469.75822, "train/loss": 2.119553542137146, "train/z_loss": 0.0014072600402869283, "train/perplexity": 8.327418816026666, "train/grad_norm": 0.0859375, "optim/muon_lr": 0.00491432785987854, "optim/adamw_lr": 0.0001474298357963562, "perf/tokens_per_sec": 2025743.9537866272, "perf/iters_per_sec": 0.9659499901707779, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352502822875977, "data/tokens_consumed": 164775329792, "data/tokens_consumed_B": 164.775329792, "train/loss_slope": -2.7633022146113422e-05} {"step": 78580, "timestamp": 1778279480.1170726, "train/loss": 2.0615524291992187, "train/z_loss": 0.00141240464290604, "train/perplexity": 7.8581595823790655, "train/grad_norm": 0.10107421875, "optim/muon_lr": 0.004881077408790588, "optim/adamw_lr": 0.00014643232226371763, "perf/tokens_per_sec": 2025774.8385230866, "perf/iters_per_sec": 0.9659647171607431, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352344989776612, "data/tokens_consumed": 164796301312, "data/tokens_consumed_B": 164.796301312, "train/loss_slope": -2.7314595542367547e-05} {"step": 78590, "timestamp": 1778279490.8788953, "train/loss": 2.0639477014541625, "train/z_loss": 0.0014013797510415316, "train/perplexity": 7.877004574434975, "train/grad_norm": 0.07763671875, "optim/muon_lr": 0.00484790325164795, "optim/adamw_lr": 0.00014543709754943847, "perf/tokens_per_sec": 1949836.3188259893, "perf/iters_per_sec": 0.9297544092302271, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0755528450012206, "data/tokens_consumed": 164817272832, "data/tokens_consumed_B": 164.817272832, "train/loss_slope": -2.771564112959992e-05} {"step": 78600, "timestamp": 1778279501.2279873, "grad/layer_0/attn": 0.0022485738154500723, "grad/layer_0/mlp": 0.0026718045119196177, "grad/layer_0/attn_mlp_ratio": 0.8415936574922424, "grad/layer_4/attn": 0.0027802379336208105, "grad/layer_4/mlp": 0.0025795260444283485, "grad/layer_4/attn_mlp_ratio": 1.0778095580174636, "grad/layer_8/attn": 0.006386889610439539, "grad/layer_8/mlp": 0.0034265450667589903, "grad/layer_8/attn_mlp_ratio": 1.8639443811798928, "grad/layer_12/attn": 0.007208884693682194, "grad/layer_12/mlp": 0.006595130544155836, "grad/layer_12/attn_mlp_ratio": 1.093061696976424, "grad/layer_16/attn": 0.0029918677173554897, "grad/layer_16/mlp": 0.004308965522795916, "grad/layer_16/attn_mlp_ratio": 0.694335480776982, "grad/layer_20/attn": 0.0026987099554389715, "grad/layer_20/mlp": 0.0048613958060741425, "grad/layer_20/attn_mlp_ratio": 0.5551306677300297, "grad/layer_24/attn": 0.004302622750401497, "grad/layer_24/mlp": 0.006476816721260548, "grad/layer_24/attn_mlp_ratio": 0.6643113228519718, "grad/layer_27/attn": 0.003597862785682082, "grad/layer_27/mlp": 0.0056639849208295345, "grad/layer_27/attn_mlp_ratio": 0.6352175672165086} {"step": 78600, "timestamp": 1778279501.8438072, "eos/sharpness": 4.083228111267089, "eos/L0_probe": 1.9077261686325073, "eos/L_plus": 1.9301320314407349, "eos/L_minus": 1.9261525869369507, "eos/grad_norm": 0.07124610245227814, "eos/embed_grad_frac": 0.3499242961406708, "eos/time_s": 0.6129434108734131} {"step": 78600, "timestamp": 1778279501.8638337, "train/loss": 2.067559778690338, "train/z_loss": 0.0014124705689027906, "train/perplexity": 7.90550837131448, "train/grad_norm": 0.0712890625, "optim/muon_lr": 0.004814807772636414, "optim/adamw_lr": 0.00014444423317909238, "perf/tokens_per_sec": 1910166.2861231188, "perf/iters_per_sec": 0.9108382635703653, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0978897571563722, "data/tokens_consumed": 164838244352, "data/tokens_consumed_B": 164.838244352, "train/loss_slope": -3.13938500678758e-05} {"step": 78600, "timestamp": 1778279503.230592, "geo/rankme_last": 440.649658203125, "geo/layer_0/stable_rank_q_proj": 18.726774215698242, "geo/layer_0/stable_rank_k_proj": 15.70103931427002, "geo/layer_0/stable_rank_o_proj": 46.33976745605469, "geo/layer_0/stable_rank_gate_proj": 127.26283264160156, "geo/layer_0/stable_rank_down_proj": 56.814552307128906, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06262611597776413, "geo/layer_0/attn_entropy_mean": 6.124809741973877, "geo/layer_0/attn_entropy_std": 0.42819517850875854, "geo/layer_7/stable_rank_q_proj": 42.552215576171875, "geo/layer_7/stable_rank_k_proj": 40.053035736083984, "geo/layer_7/stable_rank_o_proj": 87.94320678710938, "geo/layer_7/stable_rank_gate_proj": 76.81802368164062, "geo/layer_7/stable_rank_down_proj": 140.3328399658203, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.45117396116256714, "geo/layer_7/attn_entropy_mean": 4.641571521759033, "geo/layer_7/attn_entropy_std": 0.7667334675788879, "geo/layer_14/stable_rank_q_proj": 49.15096664428711, "geo/layer_14/stable_rank_k_proj": 41.49855422973633, "geo/layer_14/stable_rank_o_proj": 43.1075325012207, "geo/layer_14/stable_rank_gate_proj": 70.23982238769531, "geo/layer_14/stable_rank_down_proj": 124.99938201904297, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3983922004699707, "geo/layer_14/attn_entropy_mean": 5.4898576736450195, "geo/layer_14/attn_entropy_std": 0.41308721899986267, "geo/layer_21/stable_rank_q_proj": 39.415496826171875, "geo/layer_21/stable_rank_k_proj": 30.022357940673828, "geo/layer_21/stable_rank_o_proj": 67.9284439086914, "geo/layer_21/stable_rank_gate_proj": 62.937660217285156, "geo/layer_21/stable_rank_down_proj": 49.24238204956055, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14906135201454163, "geo/layer_21/attn_entropy_mean": 5.675411224365234, "geo/layer_21/attn_entropy_std": 0.29116931557655334, "geo/layer_27/stable_rank_q_proj": 43.771644592285156, "geo/layer_27/stable_rank_k_proj": 32.3594856262207, "geo/layer_27/stable_rank_o_proj": 115.28734588623047, "geo/layer_27/stable_rank_gate_proj": 76.26649475097656, "geo/layer_27/stable_rank_down_proj": 127.69441986083984, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08620370924472809, "geo/layer_27/attn_entropy_mean": 4.139730930328369, "geo/layer_27/attn_entropy_std": 0.7950087189674377, "attnres/final_alpha/block_0": 0.24072930216789246, "attnres/block_norm/0": 1.7789976596832275, "attnres/final_alpha/block_1": 0.003931117709726095, "attnres/block_norm/1": 47538.8125, "attnres/final_alpha/block_2": 0.009129190817475319, "attnres/block_norm/2": 29117.048828125, "attnres/final_alpha/block_3": 0.010644247755408287, "attnres/block_norm/3": 63899.5, "attnres/final_alpha/block_4": 0.012787332758307457, "attnres/block_norm/4": 15716.7998046875, "attnres/final_alpha/block_5": 0.6154905557632446, "attnres/block_norm/5": 6799.478515625, "attnres/final_alpha/block_6": 0.10728824138641357, "attnres/block_norm/6": 40901.53125, "geo/tier1_time_s": 1.3626623153686523, "geo/step": 78600.0, "geo/rankme_slope": 0.0006078174043054722} {"step": 78610, "timestamp": 1778279514.0827267, "train/loss": 2.060698664188385, "train/z_loss": 0.001412292174063623, "train/perplexity": 7.851453423827199, "train/grad_norm": 0.0791015625, "optim/muon_lr": 0.0047817879915237425, "optim/adamw_lr": 0.00014345363974571228, "perf/tokens_per_sec": 1716842.0664111052, "perf/iters_per_sec": 0.8186540920310522, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2215171337127686, "data/tokens_consumed": 164859215872, "data/tokens_consumed_B": 164.859215872, "train/loss_slope": -3.2766127257314256e-05} {"step": 78620, "timestamp": 1778279524.4380121, "train/loss": 2.1041907787323, "train/z_loss": 0.001419525733217597, "train/perplexity": 8.20046433373536, "train/grad_norm": 0.0712890625, "optim/muon_lr": 0.004748848676681519, "optim/adamw_lr": 0.00014246546030044553, "perf/tokens_per_sec": 2026545.022001927, "perf/iters_per_sec": 0.9663319692620883, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348410606384277, "data/tokens_consumed": 164880187392, "data/tokens_consumed_B": 164.880187392, "train/loss_slope": -3.274764843685223e-05} {"step": 78630, "timestamp": 1778279534.7849312, "train/loss": 2.072993266582489, "train/z_loss": 0.001418821350671351, "train/perplexity": 7.9485797633091755, "train/grad_norm": 0.10595703125, "optim/muon_lr": 0.0047159850597381595, "optim/adamw_lr": 0.00014147955179214476, "perf/tokens_per_sec": 2027650.8210614999, "perf/iters_per_sec": 0.9668592553431987, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342767000198365, "data/tokens_consumed": 164901158912, "data/tokens_consumed_B": 164.901158912, "train/loss_slope": -3.180799063640498e-05} {"step": 78640, "timestamp": 1778279545.1360471, "train/loss": 2.0854533195495604, "train/z_loss": 0.001406488474458456, "train/perplexity": 8.048239078985686, "train/grad_norm": 0.080078125, "optim/muon_lr": 0.004683203101158142, "optim/adamw_lr": 0.00014049609303474426, "perf/tokens_per_sec": 2027034.496763704, "perf/iters_per_sec": 0.9665653690165062, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345911741256715, "data/tokens_consumed": 164922130432, "data/tokens_consumed_B": 164.922130432, "train/loss_slope": -2.9103811138426655e-05} {"step": 78650, "timestamp": 1778279556.0423877, "grad/layer_0/attn": 0.0023057302460074425, "grad/layer_0/mlp": 0.0026772713754326105, "grad/layer_0/attn_mlp_ratio": 0.861223924120309, "grad/layer_4/attn": 0.0020584187004715204, "grad/layer_4/mlp": 0.0025041461922228336, "grad/layer_4/attn_mlp_ratio": 0.8220041723857682, "grad/layer_8/attn": 0.004681818187236786, "grad/layer_8/mlp": 0.003578798845410347, "grad/layer_8/attn_mlp_ratio": 1.3082093346543016, "grad/layer_12/attn": 0.004509251564741135, "grad/layer_12/mlp": 0.006081630941480398, "grad/layer_12/attn_mlp_ratio": 0.7414543128291268, "grad/layer_16/attn": 0.003220659913495183, "grad/layer_16/mlp": 0.004312905482947826, "grad/layer_16/attn_mlp_ratio": 0.7467494596285356, "grad/layer_20/attn": 0.0029785078950226307, "grad/layer_20/mlp": 0.004967779386788607, "grad/layer_20/attn_mlp_ratio": 0.5995652389450303, "grad/layer_24/attn": 0.00472648860886693, "grad/layer_24/mlp": 0.006696628872305155, "grad/layer_24/attn_mlp_ratio": 0.705801176743401, "grad/layer_27/attn": 0.0036841477267444134, "grad/layer_27/mlp": 0.005627075210213661, "grad/layer_27/attn_mlp_ratio": 0.6547180415476839} {"step": 78650, "timestamp": 1778279556.0587778, "train/loss": 2.0879336833953857, "train/z_loss": 0.0014106169226579368, "train/perplexity": 8.068226417908997, "train/grad_norm": 0.0888671875, "optim/muon_lr": 0.004650498628616333, "optim/adamw_lr": 0.00013951495885848998, "perf/tokens_per_sec": 1921395.531384537, "perf/iters_per_sec": 0.9161927849695859, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0914733409881592, "data/tokens_consumed": 164943101952, "data/tokens_consumed_B": 164.943101952, "train/loss_slope": -3.0047805932345737e-05} {"step": 78660, "timestamp": 1778279566.412272, "train/loss": 2.0468570590019226, "train/z_loss": 0.0014163917861878873, "train/perplexity": 7.743525377255289, "train/grad_norm": 0.07470703125, "optim/muon_lr": 0.004617873430252075, "optim/adamw_lr": 0.00013853620290756226, "perf/tokens_per_sec": 2026854.57645213, "perf/iters_per_sec": 0.9664795763264322, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346830129623412, "data/tokens_consumed": 164964073472, "data/tokens_consumed_B": 164.964073472, "train/loss_slope": -3.2672941631073375e-05} {"step": 78670, "timestamp": 1778279577.2314405, "train/loss": 2.0513015270233153, "train/z_loss": 0.0014131724135950207, "train/perplexity": 7.778017821672169, "train/grad_norm": 0.072265625, "optim/muon_lr": 0.004585331082344055, "optim/adamw_lr": 0.00013755993247032165, "perf/tokens_per_sec": 1939271.0385054771, "perf/iters_per_sec": 0.9247164909865747, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0814125299453736, "data/tokens_consumed": 164985044992, "data/tokens_consumed_B": 164.985044992, "train/loss_slope": -3.588625049219097e-05} {"step": 78675, "timestamp": 1778279583.009105, "eos/sharpness": 3.0036807060241695, "eos/L0_probe": 1.9085513353347778, "eos/L_plus": 1.924539566040039, "eos/L_minus": 1.9225999116897583, "eos/grad_norm": 0.07061870396137238, "eos/embed_grad_frac": 0.3496935963630676, "eos/time_s": 0.6108148097991943} {"step": 78675, "timestamp": 1778279584.388333, "geo/rankme_last": 440.7148132324219, "geo/layer_0/stable_rank_q_proj": 18.724443435668945, "geo/layer_0/stable_rank_k_proj": 15.69912052154541, "geo/layer_0/stable_rank_o_proj": 46.32264709472656, "geo/layer_0/stable_rank_gate_proj": 127.25542449951172, "geo/layer_0/stable_rank_down_proj": 56.81792449951172, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06270714104175568, "geo/layer_0/attn_entropy_mean": 6.125571250915527, "geo/layer_0/attn_entropy_std": 0.4280741512775421, "geo/layer_7/stable_rank_q_proj": 42.55202865600586, "geo/layer_7/stable_rank_k_proj": 40.06084060668945, "geo/layer_7/stable_rank_o_proj": 87.92996215820312, "geo/layer_7/stable_rank_gate_proj": 76.82894134521484, "geo/layer_7/stable_rank_down_proj": 140.3367462158203, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.444559246301651, "geo/layer_7/attn_entropy_mean": 4.643332004547119, "geo/layer_7/attn_entropy_std": 0.7643678784370422, "geo/layer_14/stable_rank_q_proj": 49.15288543701172, "geo/layer_14/stable_rank_k_proj": 41.51128005981445, "geo/layer_14/stable_rank_o_proj": 43.100608825683594, "geo/layer_14/stable_rank_gate_proj": 70.23529815673828, "geo/layer_14/stable_rank_down_proj": 125.00535583496094, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.42341527342796326, "geo/layer_14/attn_entropy_mean": 5.481395721435547, "geo/layer_14/attn_entropy_std": 0.418677419424057, "geo/layer_21/stable_rank_q_proj": 39.409698486328125, "geo/layer_21/stable_rank_k_proj": 30.032846450805664, "geo/layer_21/stable_rank_o_proj": 67.92584991455078, "geo/layer_21/stable_rank_gate_proj": 62.94374465942383, "geo/layer_21/stable_rank_down_proj": 49.23928451538086, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15133711695671082, "geo/layer_21/attn_entropy_mean": 5.677957057952881, "geo/layer_21/attn_entropy_std": 0.2913703918457031, "geo/layer_27/stable_rank_q_proj": 43.76815414428711, "geo/layer_27/stable_rank_k_proj": 32.36273193359375, "geo/layer_27/stable_rank_o_proj": 115.3023681640625, "geo/layer_27/stable_rank_gate_proj": 76.25311279296875, "geo/layer_27/stable_rank_down_proj": 127.7547607421875, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0875154659152031, "geo/layer_27/attn_entropy_mean": 4.141162395477295, "geo/layer_27/attn_entropy_std": 0.7950912714004517, "attnres/final_alpha/block_0": 0.24002820253372192, "attnres/block_norm/0": 1.7789639234542847, "attnres/final_alpha/block_1": 0.0039019009564071894, "attnres/block_norm/1": 47553.984375, "attnres/final_alpha/block_2": 0.00907736737281084, "attnres/block_norm/2": 29148.384765625, "attnres/final_alpha/block_3": 0.010532989166676998, "attnres/block_norm/3": 63940.1875, "attnres/final_alpha/block_4": 0.012679997831583023, "attnres/block_norm/4": 15733.185546875, "attnres/final_alpha/block_5": 0.6169363856315613, "attnres/block_norm/5": 6812.3349609375, "attnres/final_alpha/block_6": 0.10684313625097275, "attnres/block_norm/6": 40965.2578125, "geo/tier1_time_s": 1.3585562705993652, "geo/step": 78675.0, "geo/rankme_slope": 0.0005988470583545918} {"step": 78680, "timestamp": 1778279589.5665872, "train/loss": 2.039205276966095, "train/z_loss": 0.0014153375988826156, "train/perplexity": 7.684499722596204, "train/grad_norm": 0.08349609375, "optim/muon_lr": 0.004552868008613586, "optim/adamw_lr": 0.0001365860402584076, "perf/tokens_per_sec": 1700987.6728445103, "perf/iters_per_sec": 0.8110941280577232, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2329025268554688, "data/tokens_consumed": 165006016512, "data/tokens_consumed_B": 165.006016512, "train/loss_slope": -3.555309663761713e-05} {"step": 78690, "timestamp": 1778279599.9159603, "train/loss": 2.0945505142211913, "train/z_loss": 0.001403489161748439, "train/perplexity": 8.121789520748747, "train/grad_norm": 0.0751953125, "optim/muon_lr": 0.004520487785339355, "optim/adamw_lr": 0.00013561463356018066, "perf/tokens_per_sec": 2027213.1408410794, "perf/iters_per_sec": 0.9666505531506917, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034500002861023, "data/tokens_consumed": 165026988032, "data/tokens_consumed_B": 165.026988032, "train/loss_slope": -3.240386627831336e-05} {"step": 78700, "timestamp": 1778279610.2735703, "grad/layer_0/attn": 0.0023648450151085854, "grad/layer_0/mlp": 0.002840276574715972, "grad/layer_0/attn_mlp_ratio": 0.8326107932231884, "grad/layer_4/attn": 0.0019061428029090166, "grad/layer_4/mlp": 0.002521226881071925, "grad/layer_4/attn_mlp_ratio": 0.7560377614627147, "grad/layer_8/attn": 0.00305287167429924, "grad/layer_8/mlp": 0.0033994142431765795, "grad/layer_8/attn_mlp_ratio": 0.8980581259319178, "grad/layer_12/attn": 0.0038770365063101053, "grad/layer_12/mlp": 0.006329352501779795, "grad/layer_12/attn_mlp_ratio": 0.6125486681244294, "grad/layer_16/attn": 0.004227852448821068, "grad/layer_16/mlp": 0.004339328035712242, "grad/layer_16/attn_mlp_ratio": 0.9743103809150223, "grad/layer_20/attn": 0.004082833416759968, "grad/layer_20/mlp": 0.004916761536151171, "grad/layer_20/attn_mlp_ratio": 0.8303907569446463, "grad/layer_24/attn": 0.003984787035733461, "grad/layer_24/mlp": 0.00653637433424592, "grad/layer_24/attn_mlp_ratio": 0.6096326145050736, "grad/layer_27/attn": 0.003488642629235983, "grad/layer_27/mlp": 0.005774191115051508, "grad/layer_27/attn_mlp_ratio": 0.6041785765844028} {"step": 78700, "timestamp": 1778279610.2894888, "train/loss": 2.1095492124557493, "train/z_loss": 0.0014094965532422065, "train/perplexity": 8.244523918127785, "train/grad_norm": 0.078125, "optim/muon_lr": 0.004488187432289124, "optim/adamw_lr": 0.00013464562296867369, "perf/tokens_per_sec": 2022480.4329405143, "perf/iters_per_sec": 0.9643938221647808, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0369207859039307, "data/tokens_consumed": 165047959552, "data/tokens_consumed_B": 165.047959552, "train/loss_slope": -2.9234443367546946e-05} {"step": 78710, "timestamp": 1778279620.6512673, "train/loss": 2.114936923980713, "train/z_loss": 0.0014093349454924464, "train/perplexity": 8.289062908537622, "train/grad_norm": 0.078125, "optim/muon_lr": 0.00445596992969513, "optim/adamw_lr": 0.00013367909789085386, "perf/tokens_per_sec": 2025445.5587033243, "perf/iters_per_sec": 0.965807704307234, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354027986526488, "data/tokens_consumed": 165068931072, "data/tokens_consumed_B": 165.068931072, "train/loss_slope": -2.9128499929517976e-05} {"step": 78720, "timestamp": 1778279631.0046282, "train/loss": 2.042587637901306, "train/z_loss": 0.0014173840172588825, "train/perplexity": 7.710535480608421, "train/grad_norm": 0.08935546875, "optim/muon_lr": 0.004423837065696716, "optim/adamw_lr": 0.00013271511197090148, "perf/tokens_per_sec": 2026478.7712369952, "perf/iters_per_sec": 0.9663003784356095, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348748922348023, "data/tokens_consumed": 165089902592, "data/tokens_consumed_B": 165.089902592, "train/loss_slope": -2.8785523379703888e-05} {"step": 78730, "timestamp": 1778279641.357254, "train/loss": 2.0918153047561647, "train/z_loss": 0.0014180065947584808, "train/perplexity": 8.099605078557605, "train/grad_norm": 0.119140625, "optim/muon_lr": 0.004391785860061646, "optim/adamw_lr": 0.00013175357580184937, "perf/tokens_per_sec": 2026631.3085623668, "perf/iters_per_sec": 0.9663731139003595, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347970008850098, "data/tokens_consumed": 165110874112, "data/tokens_consumed_B": 165.110874112, "train/loss_slope": -2.7291400640746937e-05} {"step": 78740, "timestamp": 1778279651.7122393, "train/loss": 2.0697036743164063, "train/z_loss": 0.001417144387960434, "train/perplexity": 7.92247513712241, "train/grad_norm": 0.07421875, "optim/muon_lr": 0.0043598198890686035, "optim/adamw_lr": 0.00013079459667205808, "perf/tokens_per_sec": 2026215.445101107, "perf/iters_per_sec": 0.9661748147492919, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350093841552734, "data/tokens_consumed": 165131845632, "data/tokens_consumed_B": 165.131845632, "train/loss_slope": -2.962699465804585e-05} {"step": 78750, "timestamp": 1778279662.0775104, "grad/layer_0/attn": 0.002549693686887622, "grad/layer_0/mlp": 0.0029673234093934298, "grad/layer_0/attn_mlp_ratio": 0.8592570640903329, "grad/layer_4/attn": 0.002200428396463394, "grad/layer_4/mlp": 0.002661453327164054, "grad/layer_4/attn_mlp_ratio": 0.8267769685558951, "grad/layer_8/attn": 0.0032609503250569105, "grad/layer_8/mlp": 0.003477556398138404, "grad/layer_8/attn_mlp_ratio": 0.9377131117215647, "grad/layer_12/attn": 0.0034906475339084864, "grad/layer_12/mlp": 0.006778244860470295, "grad/layer_12/attn_mlp_ratio": 0.5149780738621011, "grad/layer_16/attn": 0.003074746346101165, "grad/layer_16/mlp": 0.004262426868081093, "grad/layer_16/attn_mlp_ratio": 0.7213604758805731, "grad/layer_20/attn": 0.004359252285212278, "grad/layer_20/mlp": 0.004852868616580963, "grad/layer_20/attn_mlp_ratio": 0.8982835802497339, "grad/layer_24/attn": 0.005336523987352848, "grad/layer_24/mlp": 0.0065172757022082806, "grad/layer_24/attn_mlp_ratio": 0.8188273980279687, "grad/layer_27/attn": 0.006133029703050852, "grad/layer_27/mlp": 0.005721063353121281, "grad/layer_27/attn_mlp_ratio": 1.0720086839283023} {"step": 78750, "timestamp": 1778279662.6890194, "eos/sharpness": 32.65535831451415, "eos/L0_probe": 1.9091196060180664, "eos/L_plus": 2.05979323387146, "eos/L_minus": 2.0849995613098145, "eos/grad_norm": 0.0832185372710228, "eos/embed_grad_frac": 0.2545149028301239, "eos/time_s": 0.6087052822113037} {"step": 78750, "timestamp": 1778279662.7082152, "train/loss": 2.0988086700439452, "train/z_loss": 0.001397616253234446, "train/perplexity": 8.156447102412011, "train/grad_norm": 0.0830078125, "optim/muon_lr": 0.004327936768531799, "optim/adamw_lr": 0.00012983810305595397, "perf/tokens_per_sec": 1908005.7308562954, "perf/iters_per_sec": 0.9098080305367925, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0991329669952392, "data/tokens_consumed": 165152817152, "data/tokens_consumed_B": 165.152817152, "train/loss_slope": -2.7761219708797245e-05} {"step": 78750, "timestamp": 1778279664.0685174, "geo/rankme_last": 440.54986572265625, "geo/layer_0/stable_rank_q_proj": 18.72351837158203, "geo/layer_0/stable_rank_k_proj": 15.695456504821777, "geo/layer_0/stable_rank_o_proj": 46.317955017089844, "geo/layer_0/stable_rank_gate_proj": 127.20219421386719, "geo/layer_0/stable_rank_down_proj": 56.83175277709961, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06363553553819656, "geo/layer_0/attn_entropy_mean": 6.125679016113281, "geo/layer_0/attn_entropy_std": 0.42799264192581177, "geo/layer_7/stable_rank_q_proj": 42.5557746887207, "geo/layer_7/stable_rank_k_proj": 40.05529022216797, "geo/layer_7/stable_rank_o_proj": 87.91536712646484, "geo/layer_7/stable_rank_gate_proj": 76.83512878417969, "geo/layer_7/stable_rank_down_proj": 140.3386993408203, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4553721249103546, "geo/layer_7/attn_entropy_mean": 4.641475677490234, "geo/layer_7/attn_entropy_std": 0.7633203864097595, "geo/layer_14/stable_rank_q_proj": 49.15984344482422, "geo/layer_14/stable_rank_k_proj": 41.5050048828125, "geo/layer_14/stable_rank_o_proj": 43.09508514404297, "geo/layer_14/stable_rank_gate_proj": 70.22728729248047, "geo/layer_14/stable_rank_down_proj": 125.05782318115234, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.417568176984787, "geo/layer_14/attn_entropy_mean": 5.494112014770508, "geo/layer_14/attn_entropy_std": 0.4163424074649811, "geo/layer_21/stable_rank_q_proj": 39.40243148803711, "geo/layer_21/stable_rank_k_proj": 30.032451629638672, "geo/layer_21/stable_rank_o_proj": 67.91679382324219, "geo/layer_21/stable_rank_gate_proj": 62.926116943359375, "geo/layer_21/stable_rank_down_proj": 49.23774719238281, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1544344425201416, "geo/layer_21/attn_entropy_mean": 5.675719738006592, "geo/layer_21/attn_entropy_std": 0.2894359529018402, "geo/layer_27/stable_rank_q_proj": 43.77215576171875, "geo/layer_27/stable_rank_k_proj": 32.354312896728516, "geo/layer_27/stable_rank_o_proj": 115.30887603759766, "geo/layer_27/stable_rank_gate_proj": 76.2453842163086, "geo/layer_27/stable_rank_down_proj": 127.7437744140625, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08215903490781784, "geo/layer_27/attn_entropy_mean": 4.137452125549316, "geo/layer_27/attn_entropy_std": 0.7919443845748901, "attnres/final_alpha/block_0": 0.24039585888385773, "attnres/block_norm/0": 1.7790172100067139, "attnres/final_alpha/block_1": 0.003915219567716122, "attnres/block_norm/1": 47511.0859375, "attnres/final_alpha/block_2": 0.009117151610553265, "attnres/block_norm/2": 29124.388671875, "attnres/final_alpha/block_3": 0.010585560463368893, "attnres/block_norm/3": 63839.80859375, "attnres/final_alpha/block_4": 0.012682194821536541, "attnres/block_norm/4": 15715.2900390625, "attnres/final_alpha/block_5": 0.6165345311164856, "attnres/block_norm/5": 6825.89794921875, "attnres/final_alpha/block_6": 0.10676950961351395, "attnres/block_norm/6": 40872.75, "geo/tier1_time_s": 1.3563287258148193, "geo/step": 78750.0, "geo/rankme_slope": 0.0005809246550182573} {"step": 78760, "timestamp": 1778279674.4556842, "train/loss": 2.0940972566604614, "train/z_loss": 0.0014210369205102324, "train/perplexity": 8.11810909239581, "train/grad_norm": 0.07373046875, "optim/muon_lr": 0.00429614007472992, "optim/adamw_lr": 0.00012888420224189758, "perf/tokens_per_sec": 1785778.8141851625, "perf/iters_per_sec": 0.8515256949353993, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1743626832962035, "data/tokens_consumed": 165173788672, "data/tokens_consumed_B": 165.173788672, "train/loss_slope": -2.463901115901617e-05} {"step": 78770, "timestamp": 1778279684.8296354, "train/loss": 2.075369656085968, "train/z_loss": 0.001415921514853835, "train/perplexity": 7.967491146332588, "train/grad_norm": 0.08203125, "optim/muon_lr": 0.0042644268274307255, "optim/adamw_lr": 0.00012793280482292175, "perf/tokens_per_sec": 2022591.1621031498, "perf/iters_per_sec": 0.9644466219440221, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0368640184402467, "data/tokens_consumed": 165194760192, "data/tokens_consumed_B": 165.194760192, "train/loss_slope": -2.428005400485119e-05} {"step": 78780, "timestamp": 1778279695.2079673, "train/loss": 2.074185585975647, "train/z_loss": 0.0014206783729605378, "train/perplexity": 7.958062661307199, "train/grad_norm": 0.0693359375, "optim/muon_lr": 0.00423279881477356, "optim/adamw_lr": 0.00012698396444320678, "perf/tokens_per_sec": 2021612.3664112978, "perf/iters_per_sec": 0.9639798957878579, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373660326004028, "data/tokens_consumed": 165215731712, "data/tokens_consumed_B": 165.215731712, "train/loss_slope": -2.287494269999664e-05} {"step": 78790, "timestamp": 1778279705.6060836, "train/loss": 2.002052199840546, "train/z_loss": 0.0014313435298390687, "train/perplexity": 7.404235488922156, "train/grad_norm": 0.06884765625, "optim/muon_lr": 0.0042012596130371095, "optim/adamw_lr": 0.00012603778839111326, "perf/tokens_per_sec": 2018397.1008377406, "perf/iters_per_sec": 0.9624467376888945, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0390185356140136, "data/tokens_consumed": 165236703232, "data/tokens_consumed_B": 165.236703232, "train/loss_slope": -2.4513770551821728e-05} {"step": 78800, "timestamp": 1778279715.9723694, "grad/layer_0/attn": 0.0024615630973130465, "grad/layer_0/mlp": 0.0027781252283602953, "grad/layer_0/attn_mlp_ratio": 0.8860518538110409, "grad/layer_4/attn": 0.0039301421493291855, "grad/layer_4/mlp": 0.002525096060708165, "grad/layer_4/attn_mlp_ratio": 1.5564326660046777, "grad/layer_8/attn": 0.0037952829152345657, "grad/layer_8/mlp": 0.003371873637661338, "grad/layer_8/attn_mlp_ratio": 1.125570887439841, "grad/layer_12/attn": 0.005288449581712484, "grad/layer_12/mlp": 0.006418096832931042, "grad/layer_12/attn_mlp_ratio": 0.8239902944715007, "grad/layer_16/attn": 0.0035703093744814396, "grad/layer_16/mlp": 0.004354893229901791, "grad/layer_16/attn_mlp_ratio": 0.8198385365645582, "grad/layer_20/attn": 0.0034561341162770987, "grad/layer_20/mlp": 0.00495293689891696, "grad/layer_20/attn_mlp_ratio": 0.6977948875652645, "grad/layer_24/attn": 0.007292185910046101, "grad/layer_24/mlp": 0.006116604432463646, "grad/layer_24/attn_mlp_ratio": 1.1921950931015897, "grad/layer_27/attn": 0.0041399202309548855, "grad/layer_27/mlp": 0.005453535355627537, "grad/layer_27/attn_mlp_ratio": 0.7591259403444195} {"step": 78800, "timestamp": 1778279715.9881635, "train/loss": 2.1023391246795655, "train/z_loss": 0.0014180265134200454, "train/perplexity": 8.185293960193775, "train/grad_norm": 0.07470703125, "optim/muon_lr": 0.00416980504989624, "optim/adamw_lr": 0.00012509415149688718, "perf/tokens_per_sec": 2020972.589483899, "perf/iters_per_sec": 0.9636748263759132, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0376944303512574, "data/tokens_consumed": 165257674752, "data/tokens_consumed_B": 165.257674752, "train/loss_slope": -2.0667665726495435e-05} {"step": 78810, "timestamp": 1778279726.3646624, "train/loss": 2.088146781921387, "train/z_loss": 0.0014150696573778988, "train/perplexity": 8.069945928272153, "train/grad_norm": 0.0732421875, "optim/muon_lr": 0.004138439297676087, "optim/adamw_lr": 0.0001241531789302826, "perf/tokens_per_sec": 2022018.9982279935, "perf/iters_per_sec": 0.9641737929477661, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037157416343689, "data/tokens_consumed": 165278646272, "data/tokens_consumed_B": 165.278646272, "train/loss_slope": -2.045640773040698e-05} {"step": 78820, "timestamp": 1778279736.7409647, "train/loss": 2.1384639024734495, "train/z_loss": 0.0013989680912345648, "train/perplexity": 8.48639168625269, "train/grad_norm": 0.0908203125, "optim/muon_lr": 0.004107159376144409, "optim/adamw_lr": 0.00012321478128433228, "perf/tokens_per_sec": 2022059.8098571247, "perf/iters_per_sec": 0.9641932534490226, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037136483192444, "data/tokens_consumed": 165299617792, "data/tokens_consumed_B": 165.299617792, "train/loss_slope": -1.5749321025375707e-05} {"step": 78825, "timestamp": 1778279742.5255857, "eos/sharpness": 4.269349575042724, "eos/L0_probe": 1.906835913658142, "eos/L_plus": 1.9314966201782227, "eos/L_minus": 1.9248687028884888, "eos/grad_norm": 0.07049009203910828, "eos/embed_grad_frac": 0.34435784816741943, "eos/time_s": 0.6077258586883545} {"step": 78825, "timestamp": 1778279743.9042838, "geo/rankme_last": 440.7789001464844, "geo/layer_0/stable_rank_q_proj": 18.718162536621094, "geo/layer_0/stable_rank_k_proj": 15.693317413330078, "geo/layer_0/stable_rank_o_proj": 46.30375289916992, "geo/layer_0/stable_rank_gate_proj": 127.1523208618164, "geo/layer_0/stable_rank_down_proj": 56.82209396362305, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06379102170467377, "geo/layer_0/attn_entropy_mean": 6.125637531280518, "geo/layer_0/attn_entropy_std": 0.4282476305961609, "geo/layer_7/stable_rank_q_proj": 42.546443939208984, "geo/layer_7/stable_rank_k_proj": 40.056053161621094, "geo/layer_7/stable_rank_o_proj": 87.91329193115234, "geo/layer_7/stable_rank_gate_proj": 76.84241485595703, "geo/layer_7/stable_rank_down_proj": 140.3535614013672, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.44702786207199097, "geo/layer_7/attn_entropy_mean": 4.638951301574707, "geo/layer_7/attn_entropy_std": 0.7669966220855713, "geo/layer_14/stable_rank_q_proj": 49.14359664916992, "geo/layer_14/stable_rank_k_proj": 41.49863052368164, "geo/layer_14/stable_rank_o_proj": 43.092262268066406, "geo/layer_14/stable_rank_gate_proj": 70.23098754882812, "geo/layer_14/stable_rank_down_proj": 125.04147338867188, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3960770070552826, "geo/layer_14/attn_entropy_mean": 5.494235992431641, "geo/layer_14/attn_entropy_std": 0.41524890065193176, "geo/layer_21/stable_rank_q_proj": 39.41460418701172, "geo/layer_21/stable_rank_k_proj": 30.029682159423828, "geo/layer_21/stable_rank_o_proj": 67.90106201171875, "geo/layer_21/stable_rank_gate_proj": 62.90189743041992, "geo/layer_21/stable_rank_down_proj": 49.244384765625, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14919514954090118, "geo/layer_21/attn_entropy_mean": 5.680492401123047, "geo/layer_21/attn_entropy_std": 0.2889382839202881, "geo/layer_27/stable_rank_q_proj": 43.771793365478516, "geo/layer_27/stable_rank_k_proj": 32.35790252685547, "geo/layer_27/stable_rank_o_proj": 115.28339385986328, "geo/layer_27/stable_rank_gate_proj": 76.24205017089844, "geo/layer_27/stable_rank_down_proj": 127.74908447265625, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0967182070016861, "geo/layer_27/attn_entropy_mean": 4.143943786621094, "geo/layer_27/attn_entropy_std": 0.7926822900772095, "attnres/final_alpha/block_0": 0.24062895774841309, "attnres/block_norm/0": 1.7789556980133057, "attnres/final_alpha/block_1": 0.003917657770216465, "attnres/block_norm/1": 47529.609375, "attnres/final_alpha/block_2": 0.009140217676758766, "attnres/block_norm/2": 29138.10546875, "attnres/final_alpha/block_3": 0.010644864290952682, "attnres/block_norm/3": 63831.52734375, "attnres/final_alpha/block_4": 0.012760454788804054, "attnres/block_norm/4": 15710.58984375, "attnres/final_alpha/block_5": 0.6156436204910278, "attnres/block_norm/5": 6818.4404296875, "attnres/final_alpha/block_6": 0.10726422071456909, "attnres/block_norm/6": 40914.0, "geo/tier1_time_s": 1.3595538139343262, "geo/step": 78825.0, "geo/rankme_slope": 0.0005594595650760305} {"step": 78830, "timestamp": 1778279749.0939524, "train/loss": 2.1094427585601805, "train/z_loss": 0.0014013020205311476, "train/perplexity": 8.243646303153184, "train/grad_norm": 0.0751953125, "optim/muon_lr": 0.0040759676694869995, "optim/adamw_lr": 0.00012227903008460997, "perf/tokens_per_sec": 1698449.3437738847, "perf/iters_per_sec": 0.8098837584371017, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2347450971603393, "data/tokens_consumed": 165320589312, "data/tokens_consumed_B": 165.320589312, "train/loss_slope": -1.0297046328606727e-05} {"step": 78840, "timestamp": 1778279759.4678726, "train/loss": 2.0348538875579836, "train/z_loss": 0.0014214998693205417, "train/perplexity": 7.651134117913058, "train/grad_norm": 0.08154296875, "optim/muon_lr": 0.0040448659658432005, "optim/adamw_lr": 0.00012134597897529601, "perf/tokens_per_sec": 2022401.3813177245, "perf/iters_per_sec": 0.9643561274136183, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036961317062378, "data/tokens_consumed": 165341560832, "data/tokens_consumed_B": 165.341560832, "train/loss_slope": -1.1873691417489238e-05} {"step": 78850, "timestamp": 1778279769.841229, "grad/layer_0/attn": 0.0025345266330987215, "grad/layer_0/mlp": 0.00287814368493855, "grad/layer_0/attn_mlp_ratio": 0.8806115408000142, "grad/layer_4/attn": 0.002073293086141348, "grad/layer_4/mlp": 0.00241040694527328, "grad/layer_4/attn_mlp_ratio": 0.8601423109042929, "grad/layer_8/attn": 0.0032308429945260286, "grad/layer_8/mlp": 0.003505142405629158, "grad/layer_8/attn_mlp_ratio": 0.9217436921144826, "grad/layer_12/attn": 0.0036264711525291204, "grad/layer_12/mlp": 0.00638962397351861, "grad/layer_12/attn_mlp_ratio": 0.5675562616521994, "grad/layer_16/attn": 0.0034246318973600864, "grad/layer_16/mlp": 0.004399613942950964, "grad/layer_16/attn_mlp_ratio": 0.7783937099771313, "grad/layer_20/attn": 0.003921089693903923, "grad/layer_20/mlp": 0.005078685935586691, "grad/layer_20/attn_mlp_ratio": 0.7720677487107073, "grad/layer_24/attn": 0.003943321295082569, "grad/layer_24/mlp": 0.006595158483833075, "grad/layer_24/attn_mlp_ratio": 0.5979115202398559, "grad/layer_27/attn": 0.003738508792594075, "grad/layer_27/mlp": 0.005879695992916822, "grad/layer_27/attn_mlp_ratio": 0.6358336780531563} {"step": 78850, "timestamp": 1778279769.8570058, "train/loss": 2.1076220750808714, "train/z_loss": 0.0014092569006606936, "train/perplexity": 8.228650887614778, "train/grad_norm": 0.07275390625, "optim/muon_lr": 0.004013851881027221, "optim/adamw_lr": 0.00012041555643081664, "perf/tokens_per_sec": 2020019.1105676172, "perf/iters_per_sec": 0.9632201721990667, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0381842374801635, "data/tokens_consumed": 165362532352, "data/tokens_consumed_B": 165.362532352, "train/loss_slope": -1.0116419669139039e-05} {"step": 78860, "timestamp": 1778279780.2313006, "train/loss": 2.002779185771942, "train/z_loss": 0.001428648829460144, "train/perplexity": 7.409620221030435, "train/grad_norm": 0.08251953125, "optim/muon_lr": 0.003982928991317749, "optim/adamw_lr": 0.00011948786973953246, "perf/tokens_per_sec": 2022502.1034645583, "perf/iters_per_sec": 0.9644041554758827, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0369096755981446, "data/tokens_consumed": 165383503872, "data/tokens_consumed_B": 165.383503872, "train/loss_slope": -1.3602705099115604e-05} {"step": 78870, "timestamp": 1778279790.6084788, "train/loss": 2.143472695350647, "train/z_loss": 0.0014091967255808412, "train/perplexity": 8.529004895764034, "train/grad_norm": 0.0947265625, "optim/muon_lr": 0.0039520937204360965, "optim/adamw_lr": 0.00011856281161308287, "perf/tokens_per_sec": 2021941.7953126938, "perf/iters_per_sec": 0.9641369797290296, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0371970176696776, "data/tokens_consumed": 165404475392, "data/tokens_consumed_B": 165.404475392, "train/loss_slope": -1.0209221636751886e-05} {"step": 78880, "timestamp": 1778279800.979857, "train/loss": 2.0348654866218565, "train/z_loss": 0.0014251183369196952, "train/perplexity": 7.651222864421079, "train/grad_norm": 0.07421875, "optim/muon_lr": 0.003921351432800293, "optim/adamw_lr": 0.00011764054298400878, "perf/tokens_per_sec": 2023492.0019099207, "perf/iters_per_sec": 0.964876175837479, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036402416229248, "data/tokens_consumed": 165425446912, "data/tokens_consumed_B": 165.425446912, "train/loss_slope": -1.3988228807068838e-05} {"step": 78890, "timestamp": 1778279811.3299742, "train/loss": 2.0857669711112976, "train/z_loss": 0.0014204665436409413, "train/perplexity": 8.05076381766546, "train/grad_norm": 0.0859375, "optim/muon_lr": 0.0038906973600387573, "optim/adamw_lr": 0.00011672092080116272, "perf/tokens_per_sec": 2027297.9423945555, "perf/iters_per_sec": 0.9666909896824625, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034456729888916, "data/tokens_consumed": 165446418432, "data/tokens_consumed_B": 165.446418432, "train/loss_slope": -1.2802291638923454e-05} {"step": 78900, "timestamp": 1778279821.670535, "grad/layer_0/attn": 0.0023784195072948933, "grad/layer_0/mlp": 0.0026220399886369705, "grad/layer_0/attn_mlp_ratio": 0.9070873926001948, "grad/layer_4/attn": 0.0019581629894673824, "grad/layer_4/mlp": 0.0025157697964459658, "grad/layer_4/attn_mlp_ratio": 0.7783553624016585, "grad/layer_8/attn": 0.0033175700809806585, "grad/layer_8/mlp": 0.003606582060456276, "grad/layer_8/attn_mlp_ratio": 0.9198653831751183, "grad/layer_12/attn": 0.003587077371776104, "grad/layer_12/mlp": 0.006014338228851557, "grad/layer_12/attn_mlp_ratio": 0.5964209486799955, "grad/layer_16/attn": 0.0028659480158239603, "grad/layer_16/mlp": 0.004119676537811756, "grad/layer_16/attn_mlp_ratio": 0.6956730510155436, "grad/layer_20/attn": 0.0028521844651550055, "grad/layer_20/mlp": 0.004848624113947153, "grad/layer_20/attn_mlp_ratio": 0.5882461373167768, "grad/layer_24/attn": 0.005517120473086834, "grad/layer_24/mlp": 0.006439157295972109, "grad/layer_24/attn_mlp_ratio": 0.8568078296296908, "grad/layer_27/attn": 0.0040197246707975864, "grad/layer_27/mlp": 0.005678597372025251, "grad/layer_27/attn_mlp_ratio": 0.7078727961614025} {"step": 78900, "timestamp": 1778279822.2805269, "eos/sharpness": 28.614735603332512, "eos/L0_probe": 1.9065794944763184, "eos/L_plus": 2.0506880283355713, "eos/L_minus": 2.0486183166503906, "eos/grad_norm": 0.0771217867732048, "eos/embed_grad_frac": 0.2746950685977936, "eos/time_s": 0.6072494983673096} {"step": 78900, "timestamp": 1778279822.3012228, "train/loss": 2.034136116504669, "train/z_loss": 0.0014414746430702508, "train/perplexity": 7.645644325760834, "train/grad_norm": 0.0771484375, "optim/muon_lr": 0.0038601350784301758, "optim/adamw_lr": 0.00011580405235290526, "perf/tokens_per_sec": 1912349.9086345143, "perf/iters_per_sec": 0.9118794959232875, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.096636128425598, "data/tokens_consumed": 165467389952, "data/tokens_consumed_B": 165.467389952, "train/loss_slope": -1.732803645020954e-05} {"step": 78900, "timestamp": 1778279823.6602802, "geo/rankme_last": 440.6275939941406, "geo/layer_0/stable_rank_q_proj": 18.718555450439453, "geo/layer_0/stable_rank_k_proj": 15.691523551940918, "geo/layer_0/stable_rank_o_proj": 46.29635238647461, "geo/layer_0/stable_rank_gate_proj": 127.15556335449219, "geo/layer_0/stable_rank_down_proj": 56.82735824584961, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06524940580129623, "geo/layer_0/attn_entropy_mean": 6.125532150268555, "geo/layer_0/attn_entropy_std": 0.4285174310207367, "geo/layer_7/stable_rank_q_proj": 42.5350227355957, "geo/layer_7/stable_rank_k_proj": 40.062469482421875, "geo/layer_7/stable_rank_o_proj": 87.91366577148438, "geo/layer_7/stable_rank_gate_proj": 76.83612823486328, "geo/layer_7/stable_rank_down_proj": 140.3924102783203, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.44778817892074585, "geo/layer_7/attn_entropy_mean": 4.639799118041992, "geo/layer_7/attn_entropy_std": 0.7690985202789307, "geo/layer_14/stable_rank_q_proj": 49.13273620605469, "geo/layer_14/stable_rank_k_proj": 41.49375534057617, "geo/layer_14/stable_rank_o_proj": 43.088584899902344, "geo/layer_14/stable_rank_gate_proj": 70.2411880493164, "geo/layer_14/stable_rank_down_proj": 125.02349853515625, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4121071696281433, "geo/layer_14/attn_entropy_mean": 5.496978282928467, "geo/layer_14/attn_entropy_std": 0.417214035987854, "geo/layer_21/stable_rank_q_proj": 39.41440200805664, "geo/layer_21/stable_rank_k_proj": 30.030550003051758, "geo/layer_21/stable_rank_o_proj": 67.89397430419922, "geo/layer_21/stable_rank_gate_proj": 62.889827728271484, "geo/layer_21/stable_rank_down_proj": 49.24222946166992, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1488269418478012, "geo/layer_21/attn_entropy_mean": 5.681303977966309, "geo/layer_21/attn_entropy_std": 0.28816303610801697, "geo/layer_27/stable_rank_q_proj": 43.77623748779297, "geo/layer_27/stable_rank_k_proj": 32.35801315307617, "geo/layer_27/stable_rank_o_proj": 115.26622772216797, "geo/layer_27/stable_rank_gate_proj": 76.23371887207031, "geo/layer_27/stable_rank_down_proj": 127.75435638427734, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.0901518240571022, "geo/layer_27/attn_entropy_mean": 4.144711017608643, "geo/layer_27/attn_entropy_std": 0.7930915355682373, "attnres/final_alpha/block_0": 0.24026340246200562, "attnres/block_norm/0": 1.7789342403411865, "attnres/final_alpha/block_1": 0.0038978243246674538, "attnres/block_norm/1": 47502.28125, "attnres/final_alpha/block_2": 0.009137487038969994, "attnres/block_norm/2": 29151.115234375, "attnres/final_alpha/block_3": 0.01064118929207325, "attnres/block_norm/3": 63787.640625, "attnres/final_alpha/block_4": 0.012669965624809265, "attnres/block_norm/4": 15724.068359375, "attnres/final_alpha/block_5": 0.6161971688270569, "attnres/block_norm/5": 6792.31787109375, "attnres/final_alpha/block_6": 0.10719295591115952, "attnres/block_norm/6": 40952.921875, "geo/tier1_time_s": 1.355156660079956, "geo/step": 78900.0, "geo/rankme_slope": 0.0005615599364745898} {"step": 78910, "timestamp": 1778279834.0101936, "train/loss": 2.052897834777832, "train/z_loss": 0.0014115445199422538, "train/perplexity": 7.790443847077364, "train/grad_norm": 0.07373046875, "optim/muon_lr": 0.0038296663761138916, "optim/adamw_lr": 0.00011488999128341674, "perf/tokens_per_sec": 1791648.189157592, "perf/iters_per_sec": 0.8543244310176811, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.170515513420105, "data/tokens_consumed": 165488361472, "data/tokens_consumed_B": 165.488361472, "train/loss_slope": -1.9372342443308844e-05} {"step": 78920, "timestamp": 1778279844.3573267, "train/loss": 2.0763843297958373, "train/z_loss": 0.0014136733254417777, "train/perplexity": 7.975579653035972, "train/grad_norm": 0.099609375, "optim/muon_lr": 0.003799288272857666, "optim/adamw_lr": 0.00011397864818572997, "perf/tokens_per_sec": 2027646.567653893, "perf/iters_per_sec": 0.9668572271604028, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342788696289062, "data/tokens_consumed": 165509332992, "data/tokens_consumed_B": 165.509332992, "train/loss_slope": -1.924756746409432e-05} {"step": 78930, "timestamp": 1778279854.7134762, "train/loss": 2.066485917568207, "train/z_loss": 0.0014283459750004113, "train/perplexity": 7.897023509820664, "train/grad_norm": 0.07275390625, "optim/muon_lr": 0.0037690043449401856, "optim/adamw_lr": 0.00011307013034820555, "perf/tokens_per_sec": 2026043.6968060897, "perf/iters_per_sec": 0.9660929187803696, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350971221923828, "data/tokens_consumed": 165530304512, "data/tokens_consumed_B": 165.530304512, "train/loss_slope": -1.9784856529304553e-05} {"step": 78940, "timestamp": 1778279865.0897076, "train/loss": 2.0658368110656737, "train/z_loss": 0.0014136915910057724, "train/perplexity": 7.891899163812864, "train/grad_norm": 0.0732421875, "optim/muon_lr": 0.003738812208175659, "optim/adamw_lr": 0.00011216436624526977, "perf/tokens_per_sec": 2022164.5889408858, "perf/iters_per_sec": 0.9642432160095624, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0370827436447143, "data/tokens_consumed": 165551276032, "data/tokens_consumed_B": 165.551276032, "train/loss_slope": -1.8793075241342997e-05} {"step": 78950, "timestamp": 1778279875.4696925, "grad/layer_0/attn": 0.0026552698109298944, "grad/layer_0/mlp": 0.0028426635544747114, "grad/layer_0/attn_mlp_ratio": 0.934078080869738, "grad/layer_4/attn": 0.001976295839995146, "grad/layer_4/mlp": 0.0025738186668604612, "grad/layer_4/attn_mlp_ratio": 0.7678457649937129, "grad/layer_8/attn": 0.003594485577195883, "grad/layer_8/mlp": 0.0033798585645854473, "grad/layer_8/attn_mlp_ratio": 1.0635017419098973, "grad/layer_12/attn": 0.003607633290812373, "grad/layer_12/mlp": 0.006237694527953863, "grad/layer_12/attn_mlp_ratio": 0.5783600361975041, "grad/layer_16/attn": 0.003463681787252426, "grad/layer_16/mlp": 0.004143183119595051, "grad/layer_16/attn_mlp_ratio": 0.8359953213922705, "grad/layer_20/attn": 0.0024228838738054037, "grad/layer_20/mlp": 0.0050103929825127125, "grad/layer_20/attn_mlp_ratio": 0.4835716148223499, "grad/layer_24/attn": 0.0036022430285811424, "grad/layer_24/mlp": 0.006426111329346895, "grad/layer_24/attn_mlp_ratio": 0.5605634244265275, "grad/layer_27/attn": 0.004820207599550486, "grad/layer_27/mlp": 0.005636688321828842, "grad/layer_27/attn_mlp_ratio": 0.8551488460642204} {"step": 78950, "timestamp": 1778279875.4854815, "train/loss": 2.1206735372543335, "train/z_loss": 0.0014057868742384017, "train/perplexity": 8.336750709301356, "train/grad_norm": 0.07177734375, "optim/muon_lr": 0.0037087130546569827, "optim/adamw_lr": 0.00011126139163970946, "perf/tokens_per_sec": 2018406.8270736996, "perf/iters_per_sec": 0.9624513755196092, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0390135288238525, "data/tokens_consumed": 165572247552, "data/tokens_consumed_B": 165.572247552, "train/loss_slope": -1.3869319103732017e-05} {"step": 78960, "timestamp": 1778279886.4216802, "train/loss": 2.107997107505798, "train/z_loss": 0.0014087535440921784, "train/perplexity": 8.23173747726046, "train/grad_norm": 0.103515625, "optim/muon_lr": 0.003678709864616394, "optim/adamw_lr": 0.00011036129593849181, "perf/tokens_per_sec": 1918469.1977734128, "perf/iters_per_sec": 0.9147974003665031, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0931382179260254, "data/tokens_consumed": 165593219072, "data/tokens_consumed_B": 165.593219072, "train/loss_slope": -1.1986232612213916e-05} {"step": 78970, "timestamp": 1778279896.8007843, "train/loss": 2.0898303508758547, "train/z_loss": 0.0014164831023663282, "train/perplexity": 8.083543681867333, "train/grad_norm": 0.0908203125, "optim/muon_lr": 0.0036487996578216555, "optim/adamw_lr": 0.00010946398973464964, "perf/tokens_per_sec": 2021893.7381969523, "perf/iters_per_sec": 0.9641140643105279, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0372216701507568, "data/tokens_consumed": 165614190592, "data/tokens_consumed_B": 165.614190592, "train/loss_slope": -7.909900563420647e-06} {"step": 78975, "timestamp": 1778279902.584076, "eos/sharpness": 20.097374916076657, "eos/L0_probe": 1.9075570106506348, "eos/L_plus": 2.0078041553497314, "eos/L_minus": 2.0082836151123047, "eos/grad_norm": 0.07881763577461243, "eos/embed_grad_frac": 0.26878631114959717, "eos/time_s": 0.6081409454345703} {"step": 78975, "timestamp": 1778279903.964666, "geo/rankme_last": 440.4291076660156, "geo/layer_0/stable_rank_q_proj": 18.713764190673828, "geo/layer_0/stable_rank_k_proj": 15.6911039352417, "geo/layer_0/stable_rank_o_proj": 46.295284271240234, "geo/layer_0/stable_rank_gate_proj": 127.1332015991211, "geo/layer_0/stable_rank_down_proj": 56.824764251708984, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06629189103841782, "geo/layer_0/attn_entropy_mean": 6.125677108764648, "geo/layer_0/attn_entropy_std": 0.42818209528923035, "geo/layer_7/stable_rank_q_proj": 42.53715515136719, "geo/layer_7/stable_rank_k_proj": 40.066810607910156, "geo/layer_7/stable_rank_o_proj": 87.90541076660156, "geo/layer_7/stable_rank_gate_proj": 76.8504409790039, "geo/layer_7/stable_rank_down_proj": 140.3752899169922, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4435053765773773, "geo/layer_7/attn_entropy_mean": 4.636890888214111, "geo/layer_7/attn_entropy_std": 0.7689126133918762, "geo/layer_14/stable_rank_q_proj": 49.12906265258789, "geo/layer_14/stable_rank_k_proj": 41.49259948730469, "geo/layer_14/stable_rank_o_proj": 43.08649444580078, "geo/layer_14/stable_rank_gate_proj": 70.2343978881836, "geo/layer_14/stable_rank_down_proj": 125.01362609863281, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.41030973196029663, "geo/layer_14/attn_entropy_mean": 5.498961448669434, "geo/layer_14/attn_entropy_std": 0.413698673248291, "geo/layer_21/stable_rank_q_proj": 39.408504486083984, "geo/layer_21/stable_rank_k_proj": 30.0322208404541, "geo/layer_21/stable_rank_o_proj": 67.89463806152344, "geo/layer_21/stable_rank_gate_proj": 62.89059066772461, "geo/layer_21/stable_rank_down_proj": 49.24374008178711, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1496579647064209, "geo/layer_21/attn_entropy_mean": 5.6860151290893555, "geo/layer_21/attn_entropy_std": 0.28511843085289, "geo/layer_27/stable_rank_q_proj": 43.77367401123047, "geo/layer_27/stable_rank_k_proj": 32.349632263183594, "geo/layer_27/stable_rank_o_proj": 115.25357055664062, "geo/layer_27/stable_rank_gate_proj": 76.22913360595703, "geo/layer_27/stable_rank_down_proj": 127.77791595458984, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08029422163963318, "geo/layer_27/attn_entropy_mean": 4.140636444091797, "geo/layer_27/attn_entropy_std": 0.7924202680587769, "attnres/final_alpha/block_0": 0.24072465300559998, "attnres/block_norm/0": 1.7789018154144287, "attnres/final_alpha/block_1": 0.003911932464689016, "attnres/block_norm/1": 47507.2734375, "attnres/final_alpha/block_2": 0.009153096005320549, "attnres/block_norm/2": 29174.833984375, "attnres/final_alpha/block_3": 0.010693814605474472, "attnres/block_norm/3": 63891.1875, "attnres/final_alpha/block_4": 0.012769142165780067, "attnres/block_norm/4": 15720.072265625, "attnres/final_alpha/block_5": 0.61556077003479, "attnres/block_norm/5": 6805.21875, "attnres/final_alpha/block_6": 0.10718655586242676, "attnres/block_norm/6": 40894.8203125, "geo/tier1_time_s": 1.360501766204834, "geo/step": 78975.0, "geo/rankme_slope": 0.0005086756577631053} {"step": 78980, "timestamp": 1778279909.155913, "train/loss": 2.0845298171043396, "train/z_loss": 0.001418333943001926, "train/perplexity": 8.040809941457857, "train/grad_norm": 0.076171875, "optim/muon_lr": 0.0036189860105514528, "optim/adamw_lr": 0.00010856958031654357, "perf/tokens_per_sec": 1698059.0676240819, "perf/iters_per_sec": 0.8096976602669153, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2350288867950439, "data/tokens_consumed": 165635162112, "data/tokens_consumed_B": 165.635162112, "train/loss_slope": -8.365078692031399e-06} {"step": 78990, "timestamp": 1778279919.541281, "train/loss": 2.1003531575202943, "train/z_loss": 0.0014024117263033986, "train/perplexity": 8.169054366187538, "train/grad_norm": 0.0791015625, "optim/muon_lr": 0.0035892659425735473, "optim/adamw_lr": 0.00010767797827720641, "perf/tokens_per_sec": 2020490.725103406, "perf/iters_per_sec": 0.9634450555340796, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0379419088363648, "data/tokens_consumed": 165656133632, "data/tokens_consumed_B": 165.656133632, "train/loss_slope": -7.190027774864639e-06} {"step": 79000, "timestamp": 1778279929.8907852, "grad/layer_0/attn": 0.002564318012446165, "grad/layer_0/mlp": 0.002791040111333132, "grad/layer_0/attn_mlp_ratio": 0.9187678493608397, "grad/layer_4/attn": 0.0019968270789831877, "grad/layer_4/mlp": 0.0024350585881620646, "grad/layer_4/attn_mlp_ratio": 0.8200324241426618, "grad/layer_8/attn": 0.0028201646637171507, "grad/layer_8/mlp": 0.003324969206005335, "grad/layer_8/attn_mlp_ratio": 0.8481776534368483, "grad/layer_12/attn": 0.003790125949308276, "grad/layer_12/mlp": 0.006306488532572985, "grad/layer_12/attn_mlp_ratio": 0.6009883106317346, "grad/layer_16/attn": 0.0032956385985016823, "grad/layer_16/mlp": 0.004186233971267939, "grad/layer_16/attn_mlp_ratio": 0.7872561692431809, "grad/layer_20/attn": 0.00322977127507329, "grad/layer_20/mlp": 0.00483399722725153, "grad/layer_20/attn_mlp_ratio": 0.668136752344802, "grad/layer_24/attn": 0.007679729722440243, "grad/layer_24/mlp": 0.0065047661773860455, "grad/layer_24/attn_mlp_ratio": 1.180631154902365, "grad/layer_27/attn": 0.003500816412270069, "grad/layer_27/mlp": 0.005604934878647327, "grad/layer_27/attn_mlp_ratio": 0.6245953656209839} {"step": 79000, "timestamp": 1778279929.9068706, "train/loss": 2.122671401500702, "train/z_loss": 0.001402402261737734, "train/perplexity": 8.35342305446993, "train/grad_norm": 0.0712890625, "optim/muon_lr": 0.003559643626213074, "optim/adamw_lr": 0.0001067893087863922, "perf/tokens_per_sec": 2024403.2422716704, "perf/iters_per_sec": 0.9653106891020157, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03593590259552, "data/tokens_consumed": 165677105152, "data/tokens_consumed_B": 165.677105152, "train/loss_slope": -4.66863176252551e-06} {"step": 79000, "timestamp": 1778279936.7742312, "geo/ww_alpha_mean": 7.7139024349457745, "geo/ww_alpha_std": 4.6072434583023325, "geo/ww_alpha_min": 1.3384438333147766, "geo/ww_alpha_max": 31.210988028499845, "geo/ww_alpha_healthy_frac": 0.17258883248730963, "geo/ww_alpha_by_type/q_proj": 3.9328180994445328, "geo/ww_alpha_by_type/k_proj": 4.452842104458342, "geo/ww_alpha_by_type/v_proj": 8.696967892070074, "geo/ww_alpha_by_type/o_proj": 9.610741633455689, "geo/ww_alpha_by_type/gate_proj": 8.040850183183023, "geo/ww_alpha_by_type/up_proj": 11.357699246164755, "geo/ww_alpha_by_type/down_proj": 7.993364749690138, "geo/twonn_id/layer_0": 0.6613336801528931, "geo/twonn_id/layer_7": 3.1223933696746826, "geo/twonn_id/layer_14": 4.522486686706543, "geo/twonn_id/layer_21": 7.578427314758301, "geo/twonn_id/layer_27": 5.679220676422119, "geo/tier2_time_s": 6.860889196395874} {"step": 79000, "timestamp": 1778279937.4281085, "eoc/jacobian_sigma/layer_0/attn": 1279.0087890625, "eoc/jacobian_sigma/layer_0/mlp": 9590.251953125, "eoc/jacobian_sigma/layer_0": 9590.251953125, "eoc/jacobian_sigma/layer_7/attn": 1.1491343975067139, "eoc/jacobian_sigma/layer_7/mlp": 1.9131152629852295, "eoc/jacobian_sigma/layer_7": 1.9131152629852295, "eoc/jacobian_sigma/layer_14/attn": 1.3896428346633911, "eoc/jacobian_sigma/layer_14/mlp": 6.605847358703613, "eoc/jacobian_sigma/layer_14": 6.605847358703613, "eoc/jacobian_sigma/layer_21/attn": 1.0996793508529663, "eoc/jacobian_sigma/layer_21/mlp": 4.1014299392700195, "eoc/jacobian_sigma/layer_21": 4.1014299392700195, "eoc/jacobian_sigma/layer_27/attn": 3.1571810245513916, "eoc/jacobian_sigma/layer_27/mlp": 26.24587059020996, "eoc/jacobian_sigma/layer_27": 26.24587059020996, "eoc/layer0_sigma": 9590.251953125, "eoc/sigma_max": 26.24587059020996, "eoc/sigma_min": 1.9131152629852295, "eoc/sigma_mean": 9.716565787792206, "eoc/time_s": 0.6480860710144043} {"step": 79010, "timestamp": 1778279947.8009079, "train/loss": 2.0813470840454102, "train/z_loss": 0.0014134481316432357, "train/perplexity": 8.015258872521418, "train/grad_norm": 0.0732421875, "optim/muon_lr": 0.0035301154851913453, "optim/adamw_lr": 0.00010590346455574034, "perf/tokens_per_sec": 1172326.6533200191, "perf/iters_per_sec": 0.559008909854898, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.7888802528381347, "data/tokens_consumed": 165698076672, "data/tokens_consumed_B": 165.698076672, "train/loss_slope": -3.625171741779771e-06} {"step": 79020, "timestamp": 1778279958.1542232, "train/loss": 2.0651480913162232, "train/z_loss": 0.0014263981604017318, "train/perplexity": 7.8864657282700295, "train/grad_norm": 0.1181640625, "optim/muon_lr": 0.003500683903694153, "optim/adamw_lr": 0.00010502051711082457, "perf/tokens_per_sec": 2026842.5735595974, "perf/iters_per_sec": 0.9664738529012668, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346891403198242, "data/tokens_consumed": 165719048192, "data/tokens_consumed_B": 165.719048192, "train/loss_slope": -5.670262706412127e-06} {"step": 79030, "timestamp": 1778279968.5043578, "train/loss": 2.0502079486846925, "train/z_loss": 0.0014158838079310954, "train/perplexity": 7.769516599088628, "train/grad_norm": 0.080078125, "optim/muon_lr": 0.0034713518619537355, "optim/adamw_lr": 0.00010414055585861206, "perf/tokens_per_sec": 2027174.1298368713, "perf/iters_per_sec": 0.9666319512543065, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034519910812378, "data/tokens_consumed": 165740019712, "data/tokens_consumed_B": 165.740019712, "train/loss_slope": -9.629395199079559e-06} {"step": 79040, "timestamp": 1778279978.8555717, "train/loss": 2.02371461391449, "train/z_loss": 0.0014140940620563924, "train/perplexity": 7.5663789730567, "train/grad_norm": 0.07275390625, "optim/muon_lr": 0.0034421151876449585, "optim/adamw_lr": 0.00010326345562934875, "perf/tokens_per_sec": 2026933.8364746252, "perf/iters_per_sec": 0.9665173704503179, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346425533294679, "data/tokens_consumed": 165760991232, "data/tokens_consumed_B": 165.760991232, "train/loss_slope": -1.299052313573721e-05} {"step": 79050, "timestamp": 1778279989.201725, "grad/layer_0/attn": 0.0024515504483133554, "grad/layer_0/mlp": 0.002812502207234502, "grad/layer_0/attn_mlp_ratio": 0.8716616665548389, "grad/layer_4/attn": 0.002517441753298044, "grad/layer_4/mlp": 0.0025978197809308767, "grad/layer_4/attn_mlp_ratio": 0.9690593915987622, "grad/layer_8/attn": 0.0048215920105576515, "grad/layer_8/mlp": 0.003630587365478277, "grad/layer_8/attn_mlp_ratio": 1.3280473357009392, "grad/layer_12/attn": 0.006091849412769079, "grad/layer_12/mlp": 0.006518199574202299, "grad/layer_12/attn_mlp_ratio": 0.9345907945838746, "grad/layer_16/attn": 0.0033023455180227757, "grad/layer_16/mlp": 0.00451259221881628, "grad/layer_16/attn_mlp_ratio": 0.7318067498038533, "grad/layer_20/attn": 0.0031474344432353973, "grad/layer_20/mlp": 0.0049491203390061855, "grad/layer_20/attn_mlp_ratio": 0.6359583449271285, "grad/layer_24/attn": 0.0035808959510177374, "grad/layer_24/mlp": 0.005867306143045425, "grad/layer_24/attn_mlp_ratio": 0.610313456070613, "grad/layer_27/attn": 0.004591233562678099, "grad/layer_27/mlp": 0.00549269188195467, "grad/layer_27/attn_mlp_ratio": 0.8358803984934594} {"step": 79050, "timestamp": 1778279989.839678, "eos/sharpness": 4.637885093688964, "eos/L0_probe": 1.9070290327072144, "eos/L_plus": 1.9308481216430664, "eos/L_minus": 1.929588794708252, "eos/grad_norm": 0.07245562970638275, "eos/embed_grad_frac": 0.3397507965564728, "eos/time_s": 0.6350874900817871} {"step": 79050, "timestamp": 1778279989.859018, "train/loss": 2.1588508009910585, "train/z_loss": 0.0013998080044984817, "train/perplexity": 8.661178519272449, "train/grad_norm": 0.072265625, "optim/muon_lr": 0.0034129786491394042, "optim/adamw_lr": 0.00010238935947418212, "perf/tokens_per_sec": 1906997.980330985, "perf/iters_per_sec": 0.9093274976401258, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0997138023376465, "data/tokens_consumed": 165781962752, "data/tokens_consumed_B": 165.781962752, "train/loss_slope": -8.823421328338898e-06} {"step": 79050, "timestamp": 1778279991.2250836, "geo/rankme_last": 440.6416931152344, "geo/layer_0/stable_rank_q_proj": 18.713363647460938, "geo/layer_0/stable_rank_k_proj": 15.690793991088867, "geo/layer_0/stable_rank_o_proj": 46.295127868652344, "geo/layer_0/stable_rank_gate_proj": 127.09928131103516, "geo/layer_0/stable_rank_down_proj": 56.820709228515625, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06673931330442429, "geo/layer_0/attn_entropy_mean": 6.125394344329834, "geo/layer_0/attn_entropy_std": 0.42794615030288696, "geo/layer_7/stable_rank_q_proj": 42.53636169433594, "geo/layer_7/stable_rank_k_proj": 40.07149124145508, "geo/layer_7/stable_rank_o_proj": 87.89987182617188, "geo/layer_7/stable_rank_gate_proj": 76.84226989746094, "geo/layer_7/stable_rank_down_proj": 140.38211059570312, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.44635480642318726, "geo/layer_7/attn_entropy_mean": 4.644067287445068, "geo/layer_7/attn_entropy_std": 0.7674738764762878, "geo/layer_14/stable_rank_q_proj": 49.125267028808594, "geo/layer_14/stable_rank_k_proj": 41.47917175292969, "geo/layer_14/stable_rank_o_proj": 43.084232330322266, "geo/layer_14/stable_rank_gate_proj": 70.23529052734375, "geo/layer_14/stable_rank_down_proj": 124.99908447265625, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4090502858161926, "geo/layer_14/attn_entropy_mean": 5.497895240783691, "geo/layer_14/attn_entropy_std": 0.4149587154388428, "geo/layer_21/stable_rank_q_proj": 39.40713882446289, "geo/layer_21/stable_rank_k_proj": 30.03000831604004, "geo/layer_21/stable_rank_o_proj": 67.88204193115234, "geo/layer_21/stable_rank_gate_proj": 62.88917541503906, "geo/layer_21/stable_rank_down_proj": 49.24051284790039, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15172843635082245, "geo/layer_21/attn_entropy_mean": 5.676450252532959, "geo/layer_21/attn_entropy_std": 0.28765180706977844, "geo/layer_27/stable_rank_q_proj": 43.78122329711914, "geo/layer_27/stable_rank_k_proj": 32.34401321411133, "geo/layer_27/stable_rank_o_proj": 115.27786254882812, "geo/layer_27/stable_rank_gate_proj": 76.22882843017578, "geo/layer_27/stable_rank_down_proj": 127.75872039794922, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08821666985750198, "geo/layer_27/attn_entropy_mean": 4.135571479797363, "geo/layer_27/attn_entropy_std": 0.7916274070739746, "attnres/final_alpha/block_0": 0.2399502843618393, "attnres/block_norm/0": 1.778856873512268, "attnres/final_alpha/block_1": 0.0038849972188472748, "attnres/block_norm/1": 47481.46875, "attnres/final_alpha/block_2": 0.00914587639272213, "attnres/block_norm/2": 29116.580078125, "attnres/final_alpha/block_3": 0.010626446455717087, "attnres/block_norm/3": 63794.76171875, "attnres/final_alpha/block_4": 0.012670985423028469, "attnres/block_norm/4": 15702.0146484375, "attnres/final_alpha/block_5": 0.6168594360351562, "attnres/block_norm/5": 6811.9130859375, "attnres/final_alpha/block_6": 0.10686192661523819, "attnres/block_norm/6": 40924.578125, "geo/tier1_time_s": 1.362173318862915, "geo/step": 79050.0, "geo/rankme_slope": 0.00047194096388555423} {"step": 79060, "timestamp": 1778280001.579655, "train/loss": 2.0157748579978945, "train/z_loss": 0.0014052831334993242, "train/perplexity": 7.5065416321228176, "train/grad_norm": 0.1376953125, "optim/muon_lr": 0.0033839380741119383, "optim/adamw_lr": 0.00010151814222335814, "perf/tokens_per_sec": 1789841.7224069494, "perf/iters_per_sec": 0.8534630405459163, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1716969013214111, "data/tokens_consumed": 165802934272, "data/tokens_consumed_B": 165.802934272, "train/loss_slope": -1.4813317302608857e-05} {"step": 79070, "timestamp": 1778280011.933241, "train/loss": 2.1100265741348267, "train/z_loss": 0.001404323463793844, "train/perplexity": 8.248460477415058, "train/grad_norm": 0.080078125, "optim/muon_lr": 0.0033549970388412477, "optim/adamw_lr": 0.00010064991116523742, "perf/tokens_per_sec": 2026671.0458076496, "perf/iters_per_sec": 0.966392062095475, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347767114639281, "data/tokens_consumed": 165823905792, "data/tokens_consumed_B": 165.823905792, "train/loss_slope": -1.3664761635408077e-05} {"step": 79080, "timestamp": 1778280022.297284, "train/loss": 2.0700915575027468, "train/z_loss": 0.001415375666692853, "train/perplexity": 7.925548728080891, "train/grad_norm": 0.072265625, "optim/muon_lr": 0.003326156735420227, "optim/adamw_lr": 9.97847020626068e-05, "perf/tokens_per_sec": 2024649.6001147565, "perf/iters_per_sec": 0.9654281616758139, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0358098506927491, "data/tokens_consumed": 165844877312, "data/tokens_consumed_B": 165.844877312, "train/loss_slope": -1.1437343063205689e-05} {"step": 79090, "timestamp": 1778280032.6542573, "train/loss": 2.087073266506195, "train/z_loss": 0.001419544836971909, "train/perplexity": 8.061287365300336, "train/grad_norm": 0.0869140625, "optim/muon_lr": 0.003297414779663086, "optim/adamw_lr": 9.892244338989256e-05, "perf/tokens_per_sec": 2026018.777054009, "perf/iters_per_sec": 0.9660810361166043, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035109853744507, "data/tokens_consumed": 165865848832, "data/tokens_consumed_B": 165.865848832, "train/loss_slope": -1.125469976025158e-05} {"step": 79100, "timestamp": 1778280043.009734, "grad/layer_0/attn": 0.0030373313929885626, "grad/layer_0/mlp": 0.0034168462734669447, "grad/layer_0/attn_mlp_ratio": 0.8889282867894049, "grad/layer_4/attn": 0.0076188454404473305, "grad/layer_4/mlp": 0.0027969449292868376, "grad/layer_4/attn_mlp_ratio": 2.723988266008208, "grad/layer_8/attn": 0.004385120701044798, "grad/layer_8/mlp": 0.0035301854368299246, "grad/layer_8/attn_mlp_ratio": 1.242178535744214, "grad/layer_12/attn": 0.004376348573714495, "grad/layer_12/mlp": 0.006989738438278437, "grad/layer_12/attn_mlp_ratio": 0.6261104832102037, "grad/layer_16/attn": 0.004314721096307039, "grad/layer_16/mlp": 0.005152953788638115, "grad/layer_16/attn_mlp_ratio": 0.8373296539331903, "grad/layer_20/attn": 0.008850079029798508, "grad/layer_20/mlp": 0.007287829648703337, "grad/layer_20/attn_mlp_ratio": 1.2143641296468444, "grad/layer_24/attn": 0.0071108462288975716, "grad/layer_24/mlp": 0.009704002179205418, "grad/layer_24/attn_mlp_ratio": 0.7327745835484101, "grad/layer_27/attn": 0.004691669251769781, "grad/layer_27/mlp": 0.0076220775954425335, "grad/layer_27/attn_mlp_ratio": 0.6155367918349967} {"step": 79100, "timestamp": 1778280043.0255969, "train/loss": 2.079911780357361, "train/z_loss": 0.0014152050483971835, "train/perplexity": 8.003762794056515, "train/grad_norm": 0.09619140625, "optim/muon_lr": 0.003268774747848511, "optim/adamw_lr": 9.806324243545531e-05, "perf/tokens_per_sec": 2023361.5326300056, "perf/iters_per_sec": 0.9648139632368115, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0364692449569701, "data/tokens_consumed": 165886820352, "data/tokens_consumed_B": 165.886820352, "train/loss_slope": -8.03937425564757e-06} {"step": 79110, "timestamp": 1778280053.3866735, "train/loss": 2.0520486712455748, "train/z_loss": 0.0014128824928775429, "train/perplexity": 7.783831294229043, "train/grad_norm": 0.0732421875, "optim/muon_lr": 0.003240233063697815, "optim/adamw_lr": 9.720699191093444e-05, "perf/tokens_per_sec": 2025309.6611171942, "perf/iters_per_sec": 0.965742903288457, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354722738265991, "data/tokens_consumed": 165907791872, "data/tokens_consumed_B": 165.907791872, "train/loss_slope": -7.064789585476317e-06} {"step": 79120, "timestamp": 1778280063.74772, "train/loss": 2.0757335662841796, "train/z_loss": 0.0014196175266988575, "train/perplexity": 7.970391125248846, "train/grad_norm": 0.07080078125, "optim/muon_lr": 0.0032117950916290285, "optim/adamw_lr": 9.635385274887084e-05, "perf/tokens_per_sec": 2025388.1940799654, "perf/iters_per_sec": 0.9657803507232501, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354321241378783, "data/tokens_consumed": 165928763392, "data/tokens_consumed_B": 165.928763392, "train/loss_slope": -4.964774133491676e-06} {"step": 79125, "timestamp": 1778280069.515015, "eos/sharpness": 2.308261394500732, "eos/L0_probe": 1.9086852073669434, "eos/L_plus": 1.9228075742721558, "eos/L_minus": 1.9176454544067383, "eos/grad_norm": 0.06774267554283142, "eos/embed_grad_frac": 0.3825186789035797, "eos/time_s": 0.6021251678466797} {"step": 79125, "timestamp": 1778280070.8920128, "geo/rankme_last": 440.7450866699219, "geo/layer_0/stable_rank_q_proj": 18.712480545043945, "geo/layer_0/stable_rank_k_proj": 15.68889045715332, "geo/layer_0/stable_rank_o_proj": 46.290374755859375, "geo/layer_0/stable_rank_gate_proj": 127.07821655273438, "geo/layer_0/stable_rank_down_proj": 56.821903228759766, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06345999240875244, "geo/layer_0/attn_entropy_mean": 6.125516891479492, "geo/layer_0/attn_entropy_std": 0.4281518757343292, "geo/layer_7/stable_rank_q_proj": 42.54110336303711, "geo/layer_7/stable_rank_k_proj": 40.069026947021484, "geo/layer_7/stable_rank_o_proj": 87.9035415649414, "geo/layer_7/stable_rank_gate_proj": 76.84156799316406, "geo/layer_7/stable_rank_down_proj": 140.39247131347656, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4519027769565582, "geo/layer_7/attn_entropy_mean": 4.645218849182129, "geo/layer_7/attn_entropy_std": 0.7695952653884888, "geo/layer_14/stable_rank_q_proj": 49.12349319458008, "geo/layer_14/stable_rank_k_proj": 41.47219467163086, "geo/layer_14/stable_rank_o_proj": 43.0867919921875, "geo/layer_14/stable_rank_gate_proj": 70.24053955078125, "geo/layer_14/stable_rank_down_proj": 125.01007080078125, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.41466614603996277, "geo/layer_14/attn_entropy_mean": 5.502729892730713, "geo/layer_14/attn_entropy_std": 0.4121978282928467, "geo/layer_21/stable_rank_q_proj": 39.4074821472168, "geo/layer_21/stable_rank_k_proj": 30.032466888427734, "geo/layer_21/stable_rank_o_proj": 67.8747329711914, "geo/layer_21/stable_rank_gate_proj": 62.897579193115234, "geo/layer_21/stable_rank_down_proj": 49.23988342285156, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15269696712493896, "geo/layer_21/attn_entropy_mean": 5.680154323577881, "geo/layer_21/attn_entropy_std": 0.28756076097488403, "geo/layer_27/stable_rank_q_proj": 43.77821731567383, "geo/layer_27/stable_rank_k_proj": 32.34735870361328, "geo/layer_27/stable_rank_o_proj": 115.2673110961914, "geo/layer_27/stable_rank_gate_proj": 76.2197265625, "geo/layer_27/stable_rank_down_proj": 127.77984619140625, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08802498131990433, "geo/layer_27/attn_entropy_mean": 4.142435073852539, "geo/layer_27/attn_entropy_std": 0.7920820713043213, "attnres/final_alpha/block_0": 0.24034200608730316, "attnres/block_norm/0": 1.7788432836532593, "attnres/final_alpha/block_1": 0.003925088793039322, "attnres/block_norm/1": 47486.1328125, "attnres/final_alpha/block_2": 0.009177397936582565, "attnres/block_norm/2": 29115.2109375, "attnres/final_alpha/block_3": 0.010615957900881767, "attnres/block_norm/3": 63836.87109375, "attnres/final_alpha/block_4": 0.012725094333291054, "attnres/block_norm/4": 15726.0341796875, "attnres/final_alpha/block_5": 0.6157835721969604, "attnres/block_norm/5": 6805.5869140625, "attnres/final_alpha/block_6": 0.10743086785078049, "attnres/block_norm/6": 40883.6484375, "geo/tier1_time_s": 1.356238842010498, "geo/step": 79125.0, "geo/rankme_slope": 0.0004611305068902561} {"step": 79130, "timestamp": 1778280076.0689187, "train/loss": 2.1204602003097532, "train/z_loss": 0.0014021235867403448, "train/perplexity": 8.334972362077632, "train/grad_norm": 0.0712890625, "optim/muon_lr": 0.0031834560632705688, "optim/adamw_lr": 9.550368189811706e-05, "perf/tokens_per_sec": 1702645.846539071, "perf/iters_per_sec": 0.8118848068900447, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2317018270492555, "data/tokens_consumed": 165949734912, "data/tokens_consumed_B": 165.949734912, "train/loss_slope": -2.9153653246985143e-06} {"step": 79140, "timestamp": 1778280086.417072, "train/loss": 2.108236861228943, "train/z_loss": 0.0014073891914449632, "train/perplexity": 8.233711303575232, "train/grad_norm": 0.07080078125, "optim/muon_lr": 0.0031552189588546753, "optim/adamw_lr": 9.465656876564025e-05, "perf/tokens_per_sec": 2027545.4259436952, "perf/iters_per_sec": 0.9668089990347363, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034330463409424, "data/tokens_consumed": 165970706432, "data/tokens_consumed_B": 165.970706432, "train/loss_slope": -3.5411456666334546e-06} {"step": 79150, "timestamp": 1778280096.7514293, "grad/layer_0/attn": 0.002405488630756736, "grad/layer_0/mlp": 0.0029956635553389788, "grad/layer_0/attn_mlp_ratio": 0.8029902243763541, "grad/layer_4/attn": 0.002068053698167205, "grad/layer_4/mlp": 0.0026289403904229403, "grad/layer_4/attn_mlp_ratio": 0.7866491104309855, "grad/layer_8/attn": 0.003624835517257452, "grad/layer_8/mlp": 0.003690849058330059, "grad/layer_8/attn_mlp_ratio": 0.982114240316861, "grad/layer_12/attn": 0.00409915903583169, "grad/layer_12/mlp": 0.006707518827170134, "grad/layer_12/attn_mlp_ratio": 0.6111289554811742, "grad/layer_16/attn": 0.0038574638310819864, "grad/layer_16/mlp": 0.004351946525275707, "grad/layer_16/attn_mlp_ratio": 0.8863766408986246, "grad/layer_20/attn": 0.0027853762730956078, "grad/layer_20/mlp": 0.004975135438144207, "grad/layer_20/attn_mlp_ratio": 0.5598593750341505, "grad/layer_24/attn": 0.003241261001676321, "grad/layer_24/mlp": 0.006154101807624102, "grad/layer_24/attn_mlp_ratio": 0.5266830238317692, "grad/layer_27/attn": 0.003657824592664838, "grad/layer_27/mlp": 0.0056315199472010136, "grad/layer_27/attn_mlp_ratio": 0.6495270481160509} {"step": 79150, "timestamp": 1778280096.7670803, "train/loss": 2.089379096031189, "train/z_loss": 0.0014094802201725543, "train/perplexity": 8.079896766524817, "train/grad_norm": 0.07470703125, "optim/muon_lr": 0.003127086162567139, "optim/adamw_lr": 9.381258487701415e-05, "perf/tokens_per_sec": 2027269.861314866, "perf/iters_per_sec": 0.966677599580224, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03447105884552, "data/tokens_consumed": 165991677952, "data/tokens_consumed_B": 165.991677952, "train/loss_slope": 9.844200267042199e-07} {"step": 79160, "timestamp": 1778280107.1150267, "train/loss": 2.0529847502708436, "train/z_loss": 0.0014202059013769031, "train/perplexity": 7.791120986771651, "train/grad_norm": 0.0986328125, "optim/muon_lr": 0.0030990540981292725, "optim/adamw_lr": 9.297162294387816e-05, "perf/tokens_per_sec": 2027721.9631174738, "perf/iters_per_sec": 0.966893178518998, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342404127120972, "data/tokens_consumed": 166012649472, "data/tokens_consumed_B": 166.012649472, "train/loss_slope": -7.447541958213246e-07} {"step": 79170, "timestamp": 1778280117.4624836, "train/loss": 2.055067753791809, "train/z_loss": 0.001423372037243098, "train/perplexity": 7.807366833423157, "train/grad_norm": 0.0712890625, "optim/muon_lr": 0.003071126937866211, "optim/adamw_lr": 9.213380813598632e-05, "perf/tokens_per_sec": 2027665.3108257488, "perf/iters_per_sec": 0.9668661646012062, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342693090438844, "data/tokens_consumed": 166033620992, "data/tokens_consumed_B": 166.033620992, "train/loss_slope": 6.031754064803466e-07} {"step": 79180, "timestamp": 1778280127.812491, "train/loss": 2.0347105145454405, "train/z_loss": 0.0014174070791341364, "train/perplexity": 7.650037230399114, "train/grad_norm": 0.09228515625, "optim/muon_lr": 0.0030433017015457156, "optim/adamw_lr": 9.129905104637146e-05, "perf/tokens_per_sec": 2027215.5703185499, "perf/iters_per_sec": 0.9666517116158246, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344987630844116, "data/tokens_consumed": 166054592512, "data/tokens_consumed_B": 166.054592512, "train/loss_slope": -2.1654712949970523e-06} {"step": 79190, "timestamp": 1778280138.1677368, "train/loss": 2.0733332872390746, "train/z_loss": 0.001414163876324892, "train/perplexity": 7.951282904155039, "train/grad_norm": 0.07080078125, "optim/muon_lr": 0.003015580773353577, "optim/adamw_lr": 9.046742320060729e-05, "perf/tokens_per_sec": 2026176.7525231175, "perf/iters_per_sec": 0.9661563646903598, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035029149055481, "data/tokens_consumed": 166075564032, "data/tokens_consumed_B": 166.075564032, "train/loss_slope": -5.56523514480633e-07} {"step": 79200, "timestamp": 1778280148.520138, "grad/layer_0/attn": 0.002205418422818184, "grad/layer_0/mlp": 0.0027108993381261826, "grad/layer_0/attn_mlp_ratio": 0.813537526254609, "grad/layer_4/attn": 0.002102920087054372, "grad/layer_4/mlp": 0.0025205283891409636, "grad/layer_4/attn_mlp_ratio": 0.834317126790771, "grad/layer_8/attn": 0.002700977958738804, "grad/layer_8/mlp": 0.0034741307608783245, "grad/layer_8/attn_mlp_ratio": 0.7774542948724582, "grad/layer_12/attn": 0.003713410347700119, "grad/layer_12/mlp": 0.0067782229743897915, "grad/layer_12/attn_mlp_ratio": 0.5478442221428982, "grad/layer_16/attn": 0.002893822267651558, "grad/layer_16/mlp": 0.004109547007828951, "grad/layer_16/attn_mlp_ratio": 0.70417060365086, "grad/layer_20/attn": 0.002626346657052636, "grad/layer_20/mlp": 0.00491953594610095, "grad/layer_20/attn_mlp_ratio": 0.5338606389791951, "grad/layer_24/attn": 0.0037837387062609196, "grad/layer_24/mlp": 0.006462979130446911, "grad/layer_24/attn_mlp_ratio": 0.5854480683514863, "grad/layer_27/attn": 0.004830814432352781, "grad/layer_27/mlp": 0.00581918703392148, "grad/layer_27/attn_mlp_ratio": 0.8301527895868428} {"step": 79200, "timestamp": 1778280149.123141, "eos/sharpness": 24.831843376159664, "eos/L0_probe": 1.90718674659729, "eos/L_plus": 2.0240437984466553, "eos/L_minus": 2.0386481285095215, "eos/grad_norm": 0.08140715211629868, "eos/embed_grad_frac": 0.27340176701545715, "eos/time_s": 0.6001324653625488} {"step": 79200, "timestamp": 1778280149.1422954, "train/loss": 2.1045883893966675, "train/z_loss": 0.001408864394761622, "train/perplexity": 8.203725574116229, "train/grad_norm": 0.08154296875, "optim/muon_lr": 0.002987965941429138, "optim/adamw_lr": 8.963897824287414e-05, "perf/tokens_per_sec": 1912175.9280222505, "perf/iters_per_sec": 0.9117965355025532, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.096735906600952, "data/tokens_consumed": 166096535552, "data/tokens_consumed_B": 166.096535552, "train/loss_slope": 1.7938103004865518e-06} {"step": 79200, "timestamp": 1778280150.50341, "geo/rankme_last": 440.9296569824219, "geo/layer_0/stable_rank_q_proj": 18.712018966674805, "geo/layer_0/stable_rank_k_proj": 15.687469482421875, "geo/layer_0/stable_rank_o_proj": 46.286598205566406, "geo/layer_0/stable_rank_gate_proj": 127.0693359375, "geo/layer_0/stable_rank_down_proj": 56.82450485229492, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06561346352100372, "geo/layer_0/attn_entropy_mean": 6.125549793243408, "geo/layer_0/attn_entropy_std": 0.42779847979545593, "geo/layer_7/stable_rank_q_proj": 42.538116455078125, "geo/layer_7/stable_rank_k_proj": 40.06510543823242, "geo/layer_7/stable_rank_o_proj": 87.90288543701172, "geo/layer_7/stable_rank_gate_proj": 76.84001922607422, "geo/layer_7/stable_rank_down_proj": 140.39344787597656, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4639034569263458, "geo/layer_7/attn_entropy_mean": 4.640395641326904, "geo/layer_7/attn_entropy_std": 0.7686581611633301, "geo/layer_14/stable_rank_q_proj": 49.11969757080078, "geo/layer_14/stable_rank_k_proj": 41.4742546081543, "geo/layer_14/stable_rank_o_proj": 43.0875358581543, "geo/layer_14/stable_rank_gate_proj": 70.2406997680664, "geo/layer_14/stable_rank_down_proj": 125.01189422607422, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.41791871190071106, "geo/layer_14/attn_entropy_mean": 5.500744819641113, "geo/layer_14/attn_entropy_std": 0.41394850611686707, "geo/layer_21/stable_rank_q_proj": 39.40433120727539, "geo/layer_21/stable_rank_k_proj": 30.026884078979492, "geo/layer_21/stable_rank_o_proj": 67.87598419189453, "geo/layer_21/stable_rank_gate_proj": 62.90188217163086, "geo/layer_21/stable_rank_down_proj": 49.24016571044922, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15100491046905518, "geo/layer_21/attn_entropy_mean": 5.677570343017578, "geo/layer_21/attn_entropy_std": 0.29029300808906555, "geo/layer_27/stable_rank_q_proj": 43.780052185058594, "geo/layer_27/stable_rank_k_proj": 32.34523391723633, "geo/layer_27/stable_rank_o_proj": 115.27010345458984, "geo/layer_27/stable_rank_gate_proj": 76.21817779541016, "geo/layer_27/stable_rank_down_proj": 127.7659912109375, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08587727695703506, "geo/layer_27/attn_entropy_mean": 4.141485214233398, "geo/layer_27/attn_entropy_std": 0.7925124168395996, "attnres/final_alpha/block_0": 0.24032777547836304, "attnres/block_norm/0": 1.7787948846817017, "attnres/final_alpha/block_1": 0.003930932842195034, "attnres/block_norm/1": 47449.609375, "attnres/final_alpha/block_2": 0.009134632535278797, "attnres/block_norm/2": 29150.28125, "attnres/final_alpha/block_3": 0.010589361190795898, "attnres/block_norm/3": 63816.63671875, "attnres/final_alpha/block_4": 0.01269984245300293, "attnres/block_norm/4": 15734.5, "attnres/final_alpha/block_5": 0.6156607270240784, "attnres/block_norm/5": 6812.5224609375, "attnres/final_alpha/block_6": 0.10765676200389862, "attnres/block_norm/6": 40783.58203125, "geo/tier1_time_s": 1.3571531772613525, "geo/step": 79200.0, "geo/rankme_slope": 0.0004625234468787515} {"step": 79210, "timestamp": 1778280161.1521814, "train/loss": 2.0486698031425474, "train/z_loss": 0.0014307528617791831, "train/perplexity": 7.757575137974058, "train/grad_norm": 0.07177734375, "optim/muon_lr": 0.0029604536294937133, "optim/adamw_lr": 8.881360888481139e-05, "perf/tokens_per_sec": 1746792.5857676317, "perf/iters_per_sec": 0.832935612567726, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2005729913711547, "data/tokens_consumed": 166117507072, "data/tokens_consumed_B": 166.117507072, "train/loss_slope": 1.9984133637706116e-06} {"step": 79220, "timestamp": 1778280171.5140414, "train/loss": 2.147279381752014, "train/z_loss": 0.0014031535829417408, "train/perplexity": 8.561534017519412, "train/grad_norm": 0.0859375, "optim/muon_lr": 0.0029330486059188842, "optim/adamw_lr": 8.799145817756652e-05, "perf/tokens_per_sec": 2025667.2125596304, "perf/iters_per_sec": 0.9659133971021797, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03528950214386, "data/tokens_consumed": 166138478592, "data/tokens_consumed_B": 166.138478592, "train/loss_slope": 8.488703475068565e-06} {"step": 79230, "timestamp": 1778280181.8755276, "train/loss": 2.083689248561859, "train/z_loss": 0.0014032763778232039, "train/perplexity": 8.034053929408396, "train/grad_norm": 0.072265625, "optim/muon_lr": 0.0029057472944259643, "optim/adamw_lr": 8.717241883277892e-05, "perf/tokens_per_sec": 2024984.1686883296, "perf/iters_per_sec": 0.9655876964036606, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0356387138366698, "data/tokens_consumed": 166159450112, "data/tokens_consumed_B": 166.159450112, "train/loss_slope": 7.708338528039957e-06} {"step": 79240, "timestamp": 1778280192.224417, "train/loss": 2.0992738723754885, "train/z_loss": 0.00141657586209476, "train/perplexity": 8.160242383339485, "train/grad_norm": 0.07275390625, "optim/muon_lr": 0.002878553867340088, "optim/adamw_lr": 8.635661602020263e-05, "perf/tokens_per_sec": 2027792.034976138, "perf/iters_per_sec": 0.9669265913849535, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342046737670898, "data/tokens_consumed": 166180421632, "data/tokens_consumed_B": 166.180421632, "train/loss_slope": 7.34294736989798e-06} {"step": 79250, "timestamp": 1778280202.5636802, "grad/layer_0/attn": 0.0022678112145513296, "grad/layer_0/mlp": 0.002805750584229827, "grad/layer_0/attn_mlp_ratio": 0.8082725337279351, "grad/layer_4/attn": 0.0017885080305859447, "grad/layer_4/mlp": 0.0025264271534979343, "grad/layer_4/attn_mlp_ratio": 0.7079198611832925, "grad/layer_8/attn": 0.003270820714533329, "grad/layer_8/mlp": 0.003354291897267103, "grad/layer_8/attn_mlp_ratio": 0.9751150815725697, "grad/layer_12/attn": 0.004236307926476002, "grad/layer_12/mlp": 0.005869138985872269, "grad/layer_12/attn_mlp_ratio": 0.7217937527964383, "grad/layer_16/attn": 0.0028337985277175903, "grad/layer_16/mlp": 0.00402861088514328, "grad/layer_16/attn_mlp_ratio": 0.7034182595857675, "grad/layer_20/attn": 0.0024008245673030615, "grad/layer_20/mlp": 0.0045821224339306355, "grad/layer_20/attn_mlp_ratio": 0.5239546846521337, "grad/layer_24/attn": 0.003684493713080883, "grad/layer_24/mlp": 0.006402234546840191, "grad/layer_24/attn_mlp_ratio": 0.575501198616542, "grad/layer_27/attn": 0.004233523737639189, "grad/layer_27/mlp": 0.005561343394219875, "grad/layer_27/attn_mlp_ratio": 0.7612411896584446} {"step": 79250, "timestamp": 1778280202.5793185, "train/loss": 2.038825178146362, "train/z_loss": 0.0014221989782527088, "train/perplexity": 7.681579408360566, "train/grad_norm": 0.0732421875, "optim/muon_lr": 0.002851465344429016, "optim/adamw_lr": 8.554396033287048e-05, "perf/tokens_per_sec": 2026617.300504262, "perf/iters_per_sec": 0.9663664343377409, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348041534423829, "data/tokens_consumed": 166201393152, "data/tokens_consumed_B": 166.201393152, "train/loss_slope": 5.679772417358634e-06} {"step": 79260, "timestamp": 1778280212.9336834, "train/loss": 2.0765795350074767, "train/z_loss": 0.001410111563745886, "train/perplexity": 7.977136679715006, "train/grad_norm": 0.072265625, "optim/muon_lr": 0.0028244835138320924, "optim/adamw_lr": 8.473450541496276e-05, "perf/tokens_per_sec": 2026321.215466574, "perf/iters_per_sec": 0.9662252499897833, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349553585052491, "data/tokens_consumed": 166222364672, "data/tokens_consumed_B": 166.222364672, "train/loss_slope": 3.7086542397335224e-06} {"step": 79270, "timestamp": 1778280223.289424, "train/loss": 2.080032217502594, "train/z_loss": 0.0014139754814095796, "train/perplexity": 8.004726802448594, "train/grad_norm": 0.08544921875, "optim/muon_lr": 0.0027976107597351076, "optim/adamw_lr": 8.392832279205322e-05, "perf/tokens_per_sec": 2026637.5655575527, "perf/iters_per_sec": 0.9663760974681629, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347938060760498, "data/tokens_consumed": 166243336192, "data/tokens_consumed_B": 166.243336192, "train/loss_slope": 4.562347442439256e-06} {"step": 79275, "timestamp": 1778280229.0596704, "eos/sharpness": 6.8901419639587385, "eos/L0_probe": 1.908294677734375, "eos/L_plus": 1.9424959421157837, "eos/L_minus": 1.9429948329925537, "eos/grad_norm": 0.0743306428194046, "eos/embed_grad_frac": 0.31801894307136536, "eos/time_s": 0.6019067764282227} {"step": 79275, "timestamp": 1778280230.4353578, "geo/rankme_last": 440.65179443359375, "geo/layer_0/stable_rank_q_proj": 18.70941162109375, "geo/layer_0/stable_rank_k_proj": 15.68692684173584, "geo/layer_0/stable_rank_o_proj": 46.28606033325195, "geo/layer_0/stable_rank_gate_proj": 127.0727310180664, "geo/layer_0/stable_rank_down_proj": 56.826011657714844, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06340288370847702, "geo/layer_0/attn_entropy_mean": 6.1255950927734375, "geo/layer_0/attn_entropy_std": 0.4280247092247009, "geo/layer_7/stable_rank_q_proj": 42.53966522216797, "geo/layer_7/stable_rank_k_proj": 40.05841827392578, "geo/layer_7/stable_rank_o_proj": 87.89801025390625, "geo/layer_7/stable_rank_gate_proj": 76.84486389160156, "geo/layer_7/stable_rank_down_proj": 140.3777618408203, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.45857954025268555, "geo/layer_7/attn_entropy_mean": 4.640293121337891, "geo/layer_7/attn_entropy_std": 0.7689247131347656, "geo/layer_14/stable_rank_q_proj": 49.11937713623047, "geo/layer_14/stable_rank_k_proj": 41.472312927246094, "geo/layer_14/stable_rank_o_proj": 43.08579635620117, "geo/layer_14/stable_rank_gate_proj": 70.23261260986328, "geo/layer_14/stable_rank_down_proj": 125.01543426513672, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.41376280784606934, "geo/layer_14/attn_entropy_mean": 5.496527671813965, "geo/layer_14/attn_entropy_std": 0.4140178859233856, "geo/layer_21/stable_rank_q_proj": 39.40517807006836, "geo/layer_21/stable_rank_k_proj": 30.027698516845703, "geo/layer_21/stable_rank_o_proj": 67.86467742919922, "geo/layer_21/stable_rank_gate_proj": 62.89021682739258, "geo/layer_21/stable_rank_down_proj": 49.23908996582031, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14459174871444702, "geo/layer_21/attn_entropy_mean": 5.677141189575195, "geo/layer_21/attn_entropy_std": 0.28662291169166565, "geo/layer_27/stable_rank_q_proj": 43.78616714477539, "geo/layer_27/stable_rank_k_proj": 32.34247589111328, "geo/layer_27/stable_rank_o_proj": 115.27058410644531, "geo/layer_27/stable_rank_gate_proj": 76.21993255615234, "geo/layer_27/stable_rank_down_proj": 127.77842712402344, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08421613276004791, "geo/layer_27/attn_entropy_mean": 4.1414475440979, "geo/layer_27/attn_entropy_std": 0.7917554378509521, "attnres/final_alpha/block_0": 0.24044552445411682, "attnres/block_norm/0": 1.7789194583892822, "attnres/final_alpha/block_1": 0.003938395529985428, "attnres/block_norm/1": 47421.59375, "attnres/final_alpha/block_2": 0.0091644786298275, "attnres/block_norm/2": 29131.75, "attnres/final_alpha/block_3": 0.010587867349386215, "attnres/block_norm/3": 63913.734375, "attnres/final_alpha/block_4": 0.012692451477050781, "attnres/block_norm/4": 15721.271484375, "attnres/final_alpha/block_5": 0.6155147552490234, "attnres/block_norm/5": 6823.38525390625, "attnres/final_alpha/block_6": 0.10765650868415833, "attnres/block_norm/6": 40860.421875, "geo/tier1_time_s": 1.3564000129699707, "geo/step": 79275.0, "geo/rankme_slope": 0.00045375355220213085} {"step": 79280, "timestamp": 1778280235.6113439, "train/loss": 2.1011608839035034, "train/z_loss": 0.0014109950046986342, "train/perplexity": 8.175655392477635, "train/grad_norm": 0.0712890625, "optim/muon_lr": 0.002770843505859375, "optim/adamw_lr": 8.312530517578124e-05, "perf/tokens_per_sec": 1702907.1105179866, "perf/iters_per_sec": 0.8120093872632916, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2315128564834594, "data/tokens_consumed": 166264307712, "data/tokens_consumed_B": 166.264307712, "train/loss_slope": 7.084187837538337e-06} {"step": 79290, "timestamp": 1778280245.9675782, "train/loss": 2.0883811831474306, "train/z_loss": 0.0014102750457823277, "train/perplexity": 8.071837755206461, "train/grad_norm": 0.07763671875, "optim/muon_lr": 0.002744186520576477, "optim/adamw_lr": 8.23255956172943e-05, "perf/tokens_per_sec": 2025982.0985341037, "perf/iters_per_sec": 0.9660635464354056, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351285934448242, "data/tokens_consumed": 166285279232, "data/tokens_consumed_B": 166.285279232, "train/loss_slope": 1.0025415004211778e-05} {"step": 79300, "timestamp": 1778280256.312061, "grad/layer_0/attn": 0.002583745401352644, "grad/layer_0/mlp": 0.0028826147317886353, "grad/layer_0/attn_mlp_ratio": 0.8963200261303916, "grad/layer_4/attn": 0.001972511876374483, "grad/layer_4/mlp": 0.002431402448564768, "grad/layer_4/attn_mlp_ratio": 0.8112650361162271, "grad/layer_8/attn": 0.0027499361895024776, "grad/layer_8/mlp": 0.0034847313072532415, "grad/layer_8/attn_mlp_ratio": 0.7891386359874579, "grad/layer_12/attn": 0.004114650189876556, "grad/layer_12/mlp": 0.006559853442013264, "grad/layer_12/attn_mlp_ratio": 0.6272472645195281, "grad/layer_16/attn": 0.0031643379479646683, "grad/layer_16/mlp": 0.004271731246262789, "grad/layer_16/attn_mlp_ratio": 0.7407623961963465, "grad/layer_20/attn": 0.0024868263863027096, "grad/layer_20/mlp": 0.004762366879731417, "grad/layer_20/attn_mlp_ratio": 0.5221828550564491, "grad/layer_24/attn": 0.005240951664745808, "grad/layer_24/mlp": 0.006386815570294857, "grad/layer_24/attn_mlp_ratio": 0.8205891535466612, "grad/layer_27/attn": 0.004293805919587612, "grad/layer_27/mlp": 0.005895564798265696, "grad/layer_27/attn_mlp_ratio": 0.7283111955651147} {"step": 79300, "timestamp": 1778280256.3277614, "train/loss": 2.085059666633606, "train/z_loss": 0.0014016728498972952, "train/perplexity": 8.045071489710226, "train/grad_norm": 0.0771484375, "optim/muon_lr": 0.002717636227607727, "optim/adamw_lr": 8.15290868282318e-05, "perf/tokens_per_sec": 2025301.080687478, "perf/iters_per_sec": 0.965738811820735, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0354766607284547, "data/tokens_consumed": 166306250752, "data/tokens_consumed_B": 166.306250752, "train/loss_slope": 6.584532919711214e-06} {"step": 79310, "timestamp": 1778280266.6837878, "train/loss": 2.120187449455261, "train/z_loss": 0.001406738965306431, "train/perplexity": 8.332699301247441, "train/grad_norm": 0.068359375, "optim/muon_lr": 0.0026911944150924683, "optim/adamw_lr": 8.073583245277404e-05, "perf/tokens_per_sec": 2026028.2035548082, "perf/iters_per_sec": 0.9660855310224572, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035105037689209, "data/tokens_consumed": 166327222272, "data/tokens_consumed_B": 166.327222272, "train/loss_slope": 5.72997711815709e-06} {"step": 79320, "timestamp": 1778280277.0344412, "train/loss": 2.059936833381653, "train/z_loss": 0.001426648220513016, "train/perplexity": 7.845474222590344, "train/grad_norm": 0.0693359375, "optim/muon_lr": 0.0026648640632629395, "optim/adamw_lr": 7.994592189788818e-05, "perf/tokens_per_sec": 2027333.3135138014, "perf/iters_per_sec": 0.9667078559464461, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034438681602478, "data/tokens_consumed": 166348193792, "data/tokens_consumed_B": 166.348193792, "train/loss_slope": 5.8123565027744935e-06} {"step": 79330, "timestamp": 1778280287.3862846, "train/loss": 2.108453869819641, "train/z_loss": 0.0014106640359386803, "train/perplexity": 8.235498283549425, "train/grad_norm": 0.07470703125, "optim/muon_lr": 0.0026386409997940065, "optim/adamw_lr": 7.915922999382018e-05, "perf/tokens_per_sec": 2026865.2717562998, "perf/iters_per_sec": 0.9664846762448787, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.03467755317688, "data/tokens_consumed": 166369165312, "data/tokens_consumed_B": 166.369165312, "train/loss_slope": 6.169129312127884e-06} {"step": 79340, "timestamp": 1778280297.7373188, "train/loss": 2.0481790781021116, "train/z_loss": 0.0014214059570804237, "train/perplexity": 7.753769235502989, "train/grad_norm": 0.07275390625, "optim/muon_lr": 0.002612529993057251, "optim/adamw_lr": 7.837589979171753e-05, "perf/tokens_per_sec": 2026913.3319616404, "perf/iters_per_sec": 0.9665075931366159, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346530199050903, "data/tokens_consumed": 166390136832, "data/tokens_consumed_B": 166.390136832, "train/loss_slope": 6.4085246348503016e-06} {"step": 79350, "timestamp": 1778280308.0777934, "grad/layer_0/attn": 0.002558361040428281, "grad/layer_0/mlp": 0.002765551907941699, "grad/layer_0/attn_mlp_ratio": 0.9250815146782855, "grad/layer_4/attn": 0.001958168111741543, "grad/layer_4/mlp": 0.002500356873497367, "grad/layer_4/attn_mlp_ratio": 0.7831554184051411, "grad/layer_8/attn": 0.004474316257983446, "grad/layer_8/mlp": 0.0034762322902679443, "grad/layer_8/attn_mlp_ratio": 1.2871165548395833, "grad/layer_12/attn": 0.003957349807024002, "grad/layer_12/mlp": 0.00629604235291481, "grad/layer_12/attn_mlp_ratio": 0.6285456041027664, "grad/layer_16/attn": 0.003905551740899682, "grad/layer_16/mlp": 0.004463931545615196, "grad/layer_16/attn_mlp_ratio": 0.8749129805193151, "grad/layer_20/attn": 0.0031073486898094416, "grad/layer_20/mlp": 0.005870651453733444, "grad/layer_20/attn_mlp_ratio": 0.5293021841559173, "grad/layer_24/attn": 0.004248804412782192, "grad/layer_24/mlp": 0.006782503332942724, "grad/layer_24/attn_mlp_ratio": 0.6264360136031312, "grad/layer_27/attn": 0.003353330073878169, "grad/layer_27/mlp": 0.006065870635211468, "grad/layer_27/attn_mlp_ratio": 0.5528192439731022} {"step": 79350, "timestamp": 1778280308.6897378, "eos/sharpness": 8.514285087585447, "eos/L0_probe": 1.9070111513137817, "eos/L_plus": 1.9471006393432617, "eos/L_minus": 1.9520645141601562, "eos/grad_norm": 0.07631421834230423, "eos/embed_grad_frac": 0.3411623537540436, "eos/time_s": 0.6091263294219971} {"step": 79350, "timestamp": 1778280308.7104259, "train/loss": 2.073117768764496, "train/z_loss": 0.0014058759552426636, "train/perplexity": 7.949569440440768, "train/grad_norm": 0.076171875, "optim/muon_lr": 0.0025865280628204344, "optim/adamw_lr": 7.759584188461303e-05, "perf/tokens_per_sec": 1912131.6168723528, "perf/iters_per_sec": 0.9117754062997593, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0967613220214845, "data/tokens_consumed": 166411108352, "data/tokens_consumed_B": 166.411108352, "train/loss_slope": 2.9294377577902626e-06} {"step": 79350, "timestamp": 1778280310.0712776, "geo/rankme_last": 440.80645751953125, "geo/layer_0/stable_rank_q_proj": 18.70768165588379, "geo/layer_0/stable_rank_k_proj": 15.685470581054688, "geo/layer_0/stable_rank_o_proj": 46.287994384765625, "geo/layer_0/stable_rank_gate_proj": 127.06685638427734, "geo/layer_0/stable_rank_down_proj": 56.8291130065918, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06493359804153442, "geo/layer_0/attn_entropy_mean": 6.124899864196777, "geo/layer_0/attn_entropy_std": 0.4282994866371155, "geo/layer_7/stable_rank_q_proj": 42.53902816772461, "geo/layer_7/stable_rank_k_proj": 40.05316162109375, "geo/layer_7/stable_rank_o_proj": 87.90414428710938, "geo/layer_7/stable_rank_gate_proj": 76.8490982055664, "geo/layer_7/stable_rank_down_proj": 140.3706512451172, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4609452486038208, "geo/layer_7/attn_entropy_mean": 4.643685340881348, "geo/layer_7/attn_entropy_std": 0.7697412371635437, "geo/layer_14/stable_rank_q_proj": 49.11137390136719, "geo/layer_14/stable_rank_k_proj": 41.470951080322266, "geo/layer_14/stable_rank_o_proj": 43.0900993347168, "geo/layer_14/stable_rank_gate_proj": 70.23653411865234, "geo/layer_14/stable_rank_down_proj": 125.00651550292969, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4106338322162628, "geo/layer_14/attn_entropy_mean": 5.498898983001709, "geo/layer_14/attn_entropy_std": 0.4119538366794586, "geo/layer_21/stable_rank_q_proj": 39.403743743896484, "geo/layer_21/stable_rank_k_proj": 30.02346420288086, "geo/layer_21/stable_rank_o_proj": 67.86050415039062, "geo/layer_21/stable_rank_gate_proj": 62.89008331298828, "geo/layer_21/stable_rank_down_proj": 49.24188232421875, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15314622223377228, "geo/layer_21/attn_entropy_mean": 5.67836856842041, "geo/layer_21/attn_entropy_std": 0.2889762818813324, "geo/layer_27/stable_rank_q_proj": 43.78749084472656, "geo/layer_27/stable_rank_k_proj": 32.337432861328125, "geo/layer_27/stable_rank_o_proj": 115.28083801269531, "geo/layer_27/stable_rank_gate_proj": 76.21578216552734, "geo/layer_27/stable_rank_down_proj": 127.7810287475586, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07756523042917252, "geo/layer_27/attn_entropy_mean": 4.143632411956787, "geo/layer_27/attn_entropy_std": 0.7935723662376404, "attnres/final_alpha/block_0": 0.2399335354566574, "attnres/block_norm/0": 1.7789039611816406, "attnres/final_alpha/block_1": 0.003923707641661167, "attnres/block_norm/1": 47405.39453125, "attnres/final_alpha/block_2": 0.00911988690495491, "attnres/block_norm/2": 29123.96484375, "attnres/final_alpha/block_3": 0.01056028250604868, "attnres/block_norm/3": 63891.2265625, "attnres/final_alpha/block_4": 0.012640120461583138, "attnres/block_norm/4": 15730.0537109375, "attnres/final_alpha/block_5": 0.6170068979263306, "attnres/block_norm/5": 6792.58642578125, "attnres/final_alpha/block_6": 0.10681559145450592, "attnres/block_norm/6": 40911.27734375, "geo/tier1_time_s": 1.3567252159118652, "geo/step": 79350.0, "geo/rankme_slope": 0.0004546118251988295} {"step": 79360, "timestamp": 1778280320.4214869, "train/loss": 2.1036378026008604, "train/z_loss": 0.001411883719265461, "train/perplexity": 8.19593092624067, "train/grad_norm": 0.068359375, "optim/muon_lr": 0.0025606381893157957, "optim/adamw_lr": 7.681914567947387e-05, "perf/tokens_per_sec": 1791377.5211845564, "perf/iters_per_sec": 0.8541953664706022, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1706923723220826, "data/tokens_consumed": 166432079872, "data/tokens_consumed_B": 166.432079872, "train/loss_slope": 6.8290907164218344e-06} {"step": 79370, "timestamp": 1778280330.7671857, "train/loss": 2.053977870941162, "train/z_loss": 0.0014062200440093874, "train/perplexity": 7.798862353487949, "train/grad_norm": 0.06689453125, "optim/muon_lr": 0.002534857988357544, "optim/adamw_lr": 7.604573965072631e-05, "perf/tokens_per_sec": 2028051.001625029, "perf/iters_per_sec": 0.96705007630588, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0340726137161256, "data/tokens_consumed": 166453051392, "data/tokens_consumed_B": 166.453051392, "train/loss_slope": 6.789569457014396e-06} {"step": 79380, "timestamp": 1778280341.1165187, "train/loss": 2.125029468536377, "train/z_loss": 0.001403864671010524, "train/perplexity": 8.373144228796999, "train/grad_norm": 0.08154296875, "optim/muon_lr": 0.002509189248085022, "optim/adamw_lr": 7.527567744255065e-05, "perf/tokens_per_sec": 2027318.4079643362, "perf/iters_per_sec": 0.9667007484265977, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344462871551514, "data/tokens_consumed": 166474022912, "data/tokens_consumed_B": 166.474022912, "train/loss_slope": 1.0068974407664186e-05} {"step": 79390, "timestamp": 1778280351.5074432, "train/loss": 2.0535704851150514, "train/z_loss": 0.0014129912364296616, "train/perplexity": 7.795685854579597, "train/grad_norm": 0.0693359375, "optim/muon_lr": 0.002483634352684021, "optim/adamw_lr": 7.450903058052062e-05, "perf/tokens_per_sec": 2019744.0109937573, "perf/iters_per_sec": 0.9630889945000445, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0383256435394288, "data/tokens_consumed": 166494994432, "data/tokens_consumed_B": 166.494994432, "train/loss_slope": 7.244057952910649e-06} {"step": 79400, "timestamp": 1778280361.8837204, "grad/layer_0/attn": 0.0026198383420705795, "grad/layer_0/mlp": 0.0029690356459468603, "grad/layer_0/attn_mlp_ratio": 0.8823869317326398, "grad/layer_4/attn": 0.0018911882070824504, "grad/layer_4/mlp": 0.002580225234851241, "grad/layer_4/attn_mlp_ratio": 0.732954669322121, "grad/layer_8/attn": 0.0038159932009875774, "grad/layer_8/mlp": 0.0036149942316114902, "grad/layer_8/attn_mlp_ratio": 1.0556014341761044, "grad/layer_12/attn": 0.005093896295875311, "grad/layer_12/mlp": 0.006300658453255892, "grad/layer_12/attn_mlp_ratio": 0.808470456353014, "grad/layer_16/attn": 0.003176711965352297, "grad/layer_16/mlp": 0.004388590343296528, "grad/layer_16/attn_mlp_ratio": 0.7238570120400851, "grad/layer_20/attn": 0.0027517578564584255, "grad/layer_20/mlp": 0.004731695167720318, "grad/layer_20/attn_mlp_ratio": 0.5815585537029299, "grad/layer_24/attn": 0.005845262669026852, "grad/layer_24/mlp": 0.006600182503461838, "grad/layer_24/attn_mlp_ratio": 0.8856213562880783, "grad/layer_27/attn": 0.007078923285007477, "grad/layer_27/mlp": 0.005720030516386032, "grad/layer_27/attn_mlp_ratio": 1.2375673767774351} {"step": 79400, "timestamp": 1778280361.9006224, "train/loss": 2.131872606277466, "train/z_loss": 0.0014050822355784476, "train/perplexity": 8.430639307083391, "train/grad_norm": 0.0751953125, "optim/muon_lr": 0.0024581897258758546, "optim/adamw_lr": 7.374569177627563e-05, "perf/tokens_per_sec": 2018913.5083068942, "perf/iters_per_sec": 0.9626929799589606, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0387527704238892, "data/tokens_consumed": 166515965952, "data/tokens_consumed_B": 166.515965952, "train/loss_slope": 1.3877152101864137e-05} {"step": 79410, "timestamp": 1778280372.284459, "train/loss": 2.129342818260193, "train/z_loss": 0.0014014541869983078, "train/perplexity": 8.409338531370064, "train/grad_norm": 0.07421875, "optim/muon_lr": 0.0024328601360321046, "optim/adamw_lr": 7.298580408096313e-05, "perf/tokens_per_sec": 2021006.9508274402, "perf/iters_per_sec": 0.9636912111413194, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037676787376404, "data/tokens_consumed": 166536937472, "data/tokens_consumed_B": 166.536937472, "train/loss_slope": 1.730593950441574e-05} {"step": 79420, "timestamp": 1778280382.6628127, "train/loss": 2.1017436742782594, "train/z_loss": 0.0013991543441079558, "train/perplexity": 8.180421474426138, "train/grad_norm": 0.07470703125, "optim/muon_lr": 0.002407641410827637, "optim/adamw_lr": 7.22292423248291e-05, "perf/tokens_per_sec": 2021608.0918370346, "perf/iters_per_sec": 0.9639778575120137, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373682260513306, "data/tokens_consumed": 166557908992, "data/tokens_consumed_B": 166.557908992, "train/loss_slope": 1.7557150846195823e-05} {"step": 79425, "timestamp": 1778280388.4513042, "eos/sharpness": 34.427285194396966, "eos/L0_probe": 1.9066423177719116, "eos/L_plus": 2.0712454319000244, "eos/L_minus": 2.0863120555877686, "eos/grad_norm": 0.08389277756214142, "eos/embed_grad_frac": 0.2539674639701843, "eos/time_s": 0.6098659038543701} {"step": 79425, "timestamp": 1778280389.8321488, "geo/rankme_last": 440.7562255859375, "geo/layer_0/stable_rank_q_proj": 18.7073974609375, "geo/layer_0/stable_rank_k_proj": 15.685259819030762, "geo/layer_0/stable_rank_o_proj": 46.290008544921875, "geo/layer_0/stable_rank_gate_proj": 127.06056213378906, "geo/layer_0/stable_rank_down_proj": 56.8292350769043, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06766238063573837, "geo/layer_0/attn_entropy_mean": 6.125253200531006, "geo/layer_0/attn_entropy_std": 0.4282383918762207, "geo/layer_7/stable_rank_q_proj": 42.537662506103516, "geo/layer_7/stable_rank_k_proj": 40.05064392089844, "geo/layer_7/stable_rank_o_proj": 87.90414428710938, "geo/layer_7/stable_rank_gate_proj": 76.84626770019531, "geo/layer_7/stable_rank_down_proj": 140.36538696289062, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.45045456290245056, "geo/layer_7/attn_entropy_mean": 4.647243022918701, "geo/layer_7/attn_entropy_std": 0.7685695886611938, "geo/layer_14/stable_rank_q_proj": 49.10687255859375, "geo/layer_14/stable_rank_k_proj": 41.4762077331543, "geo/layer_14/stable_rank_o_proj": 43.087100982666016, "geo/layer_14/stable_rank_gate_proj": 70.24180603027344, "geo/layer_14/stable_rank_down_proj": 124.99322509765625, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.41344133019447327, "geo/layer_14/attn_entropy_mean": 5.503654479980469, "geo/layer_14/attn_entropy_std": 0.4116464853286743, "geo/layer_21/stable_rank_q_proj": 39.40224838256836, "geo/layer_21/stable_rank_k_proj": 30.020061492919922, "geo/layer_21/stable_rank_o_proj": 67.85638427734375, "geo/layer_21/stable_rank_gate_proj": 62.887508392333984, "geo/layer_21/stable_rank_down_proj": 49.24382781982422, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14754080772399902, "geo/layer_21/attn_entropy_mean": 5.680159091949463, "geo/layer_21/attn_entropy_std": 0.2886316776275635, "geo/layer_27/stable_rank_q_proj": 43.78728103637695, "geo/layer_27/stable_rank_k_proj": 32.338382720947266, "geo/layer_27/stable_rank_o_proj": 115.2851791381836, "geo/layer_27/stable_rank_gate_proj": 76.21324157714844, "geo/layer_27/stable_rank_down_proj": 127.79097747802734, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08612236380577087, "geo/layer_27/attn_entropy_mean": 4.142297744750977, "geo/layer_27/attn_entropy_std": 0.7920005321502686, "attnres/final_alpha/block_0": 0.23981228470802307, "attnres/block_norm/0": 1.7789466381072998, "attnres/final_alpha/block_1": 0.003920580260455608, "attnres/block_norm/1": 47433.87109375, "attnres/final_alpha/block_2": 0.009108046069741249, "attnres/block_norm/2": 29149.8359375, "attnres/final_alpha/block_3": 0.010574892163276672, "attnres/block_norm/3": 63893.3046875, "attnres/final_alpha/block_4": 0.012623117305338383, "attnres/block_norm/4": 15730.541015625, "attnres/final_alpha/block_5": 0.6166645288467407, "attnres/block_norm/5": 6809.00048828125, "attnres/final_alpha/block_6": 0.10729660838842392, "attnres/block_norm/6": 40916.4296875, "geo/tier1_time_s": 1.3612456321716309, "geo/step": 79425.0, "geo/rankme_slope": 0.00045750548266181474} {"step": 79430, "timestamp": 1778280395.0250723, "train/loss": 2.082770824432373, "train/z_loss": 0.0014044974581338466, "train/perplexity": 8.026678647758771, "train/grad_norm": 0.08203125, "optim/muon_lr": 0.00238253653049469, "optim/adamw_lr": 7.147609591484069e-05, "perf/tokens_per_sec": 1697172.850231, "perf/iters_per_sec": 0.8092750788836479, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2356737852096558, "data/tokens_consumed": 166578880512, "data/tokens_consumed_B": 166.578880512, "train/loss_slope": 1.8307772413803775e-05} {"step": 79440, "timestamp": 1778280405.4083495, "train/loss": 2.0623515844345093, "train/z_loss": 0.0014214288094080985, "train/perplexity": 7.864441981720881, "train/grad_norm": 0.08837890625, "optim/muon_lr": 0.0023575466871261597, "optim/adamw_lr": 7.072640061378478e-05, "perf/tokens_per_sec": 2021150.9561390525, "perf/iters_per_sec": 0.9637598782248747, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0376028537750244, "data/tokens_consumed": 166599852032, "data/tokens_consumed_B": 166.599852032, "train/loss_slope": 1.6814457544005805e-05} {"step": 79450, "timestamp": 1778280415.7702734, "grad/layer_0/attn": 0.010612281039357185, "grad/layer_0/mlp": 0.007235240191221237, "grad/layer_0/attn_mlp_ratio": 1.46674894159817, "grad/layer_4/attn": 0.0019361265003681183, "grad/layer_4/mlp": 0.002542090369388461, "grad/layer_4/attn_mlp_ratio": 0.761627693303096, "grad/layer_8/attn": 0.003923925571143627, "grad/layer_8/mlp": 0.0034914142452180386, "grad/layer_8/attn_mlp_ratio": 1.1238785154554831, "grad/layer_12/attn": 0.0041117253713309765, "grad/layer_12/mlp": 0.006123492028564215, "grad/layer_12/attn_mlp_ratio": 0.6714674053635241, "grad/layer_16/attn": 0.005124471615999937, "grad/layer_16/mlp": 0.0042328923009335995, "grad/layer_16/attn_mlp_ratio": 1.2106312021703394, "grad/layer_20/attn": 0.0030596053693443537, "grad/layer_20/mlp": 0.004708744119852781, "grad/layer_20/attn_mlp_ratio": 0.6497709848932954, "grad/layer_24/attn": 0.005596592556685209, "grad/layer_24/mlp": 0.006039055529981852, "grad/layer_24/attn_mlp_ratio": 0.9267330688096391, "grad/layer_27/attn": 0.005450968164950609, "grad/layer_27/mlp": 0.0053924876265227795, "grad/layer_27/attn_mlp_ratio": 1.0108447976877528} {"step": 79450, "timestamp": 1778280415.7868268, "train/loss": 2.078136348724365, "train/z_loss": 0.001405448536388576, "train/perplexity": 7.989565267507227, "train/grad_norm": 0.0927734375, "optim/muon_lr": 0.0023326700925827026, "optim/adamw_lr": 6.998010277748107e-05, "perf/tokens_per_sec": 2021649.8161717819, "perf/iters_per_sec": 0.9639977532252225, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373468160629273, "data/tokens_consumed": 166620823552, "data/tokens_consumed_B": 166.620823552, "train/loss_slope": 1.6523592728878956e-05} {"step": 79460, "timestamp": 1778280426.156861, "train/loss": 2.10196293592453, "train/z_loss": 0.0014144351473078132, "train/perplexity": 8.182215323759802, "train/grad_norm": 0.06787109375, "optim/muon_lr": 0.00230790913105011, "optim/adamw_lr": 6.92372739315033e-05, "perf/tokens_per_sec": 2023311.8256076386, "perf/iters_per_sec": 0.9647902610815232, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0364947080612184, "data/tokens_consumed": 166641795072, "data/tokens_consumed_B": 166.641795072, "train/loss_slope": 1.6149968718967436e-05} {"step": 79470, "timestamp": 1778280436.9407578, "train/loss": 2.0701037526130674, "train/z_loss": 0.001416074694134295, "train/perplexity": 7.925645381611331, "train/grad_norm": 0.0751953125, "optim/muon_lr": 0.002283262014389038, "optim/adamw_lr": 6.849786043167114e-05, "perf/tokens_per_sec": 1945628.233794798, "perf/iters_per_sec": 0.9277478379224767, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0778790950775146, "data/tokens_consumed": 166662766592, "data/tokens_consumed_B": 166.662766592, "train/loss_slope": 1.424585182030856e-05} {"step": 79480, "timestamp": 1778280447.328975, "train/loss": 2.0819564938545225, "train/z_loss": 0.0014070216449908913, "train/perplexity": 8.020144938557976, "train/grad_norm": 0.0712890625, "optim/muon_lr": 0.0022587311267852784, "optim/adamw_lr": 6.776193380355834e-05, "perf/tokens_per_sec": 2019890.5266294468, "perf/iters_per_sec": 0.963158858599399, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0382503271102905, "data/tokens_consumed": 166683738112, "data/tokens_consumed_B": 166.683738112, "train/loss_slope": 1.2908024384458268e-05} {"step": 79490, "timestamp": 1778280457.7035208, "train/loss": 2.143598794937134, "train/z_loss": 0.0014043044648133218, "train/perplexity": 8.53008046756769, "train/grad_norm": 0.068359375, "optim/muon_lr": 0.0022343152761459353, "optim/adamw_lr": 6.702945828437804e-05, "perf/tokens_per_sec": 2022318.6162410516, "perf/iters_per_sec": 0.9643166619496591, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037003755569458, "data/tokens_consumed": 166704709632, "data/tokens_consumed_B": 166.704709632, "train/loss_slope": 2.0143398242850894e-05} {"step": 79500, "timestamp": 1778280468.0711875, "grad/layer_0/attn": 0.0023005176335573196, "grad/layer_0/mlp": 0.00265373382717371, "grad/layer_0/attn_mlp_ratio": 0.8668983766610792, "grad/layer_4/attn": 0.0029987094458192587, "grad/layer_4/mlp": 0.0025875649880617857, "grad/layer_4/attn_mlp_ratio": 1.1588923732409144, "grad/layer_8/attn": 0.0036579901352524757, "grad/layer_8/mlp": 0.003456020262092352, "grad/layer_8/attn_mlp_ratio": 1.0584399835647516, "grad/layer_12/attn": 0.003357901005074382, "grad/layer_12/mlp": 0.006410582922399044, "grad/layer_12/attn_mlp_ratio": 0.5238058680999266, "grad/layer_16/attn": 0.0027342294342815876, "grad/layer_16/mlp": 0.004055993165820837, "grad/layer_16/attn_mlp_ratio": 0.6741208022514418, "grad/layer_20/attn": 0.0036933578085154295, "grad/layer_20/mlp": 0.00451949005946517, "grad/layer_20/attn_mlp_ratio": 0.8172067375299908, "grad/layer_24/attn": 0.003935619257390499, "grad/layer_24/mlp": 0.006468219216912985, "grad/layer_24/attn_mlp_ratio": 0.6084548257508388, "grad/layer_27/attn": 0.004099471494555473, "grad/layer_27/mlp": 0.005525856278836727, "grad/layer_27/attn_mlp_ratio": 0.7418708003805324} {"step": 79500, "timestamp": 1778280468.6773844, "eos/sharpness": 20.317292213439938, "eos/L0_probe": 1.9065704345703125, "eos/L_plus": 1.9988927841186523, "eos/L_minus": 2.017421007156372, "eos/grad_norm": 0.07416897267103195, "eos/embed_grad_frac": 0.3164997100830078, "eos/time_s": 0.6033205986022949} {"step": 79500, "timestamp": 1778280468.6983047, "train/loss": 2.102005994319916, "train/z_loss": 0.0014150621951557695, "train/perplexity": 8.18256764440747, "train/grad_norm": 0.07421875, "optim/muon_lr": 0.002210015058517456, "optim/adamw_lr": 6.630045175552368e-05, "perf/tokens_per_sec": 1908320.3279672842, "perf/iters_per_sec": 0.9099580421291753, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.098951768875122, "data/tokens_consumed": 166725681152, "data/tokens_consumed_B": 166.725681152, "train/loss_slope": 1.6955356142952277e-05} {"step": 79500, "timestamp": 1778280470.0596774, "geo/rankme_last": 440.646728515625, "geo/layer_0/stable_rank_q_proj": 18.706581115722656, "geo/layer_0/stable_rank_k_proj": 15.685306549072266, "geo/layer_0/stable_rank_o_proj": 46.287532806396484, "geo/layer_0/stable_rank_gate_proj": 127.04802703857422, "geo/layer_0/stable_rank_down_proj": 56.83197784423828, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0632433295249939, "geo/layer_0/attn_entropy_mean": 6.126428127288818, "geo/layer_0/attn_entropy_std": 0.42783644795417786, "geo/layer_7/stable_rank_q_proj": 42.535587310791016, "geo/layer_7/stable_rank_k_proj": 40.05042266845703, "geo/layer_7/stable_rank_o_proj": 87.90780639648438, "geo/layer_7/stable_rank_gate_proj": 76.84095001220703, "geo/layer_7/stable_rank_down_proj": 140.35787963867188, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.45709481835365295, "geo/layer_7/attn_entropy_mean": 4.647120475769043, "geo/layer_7/attn_entropy_std": 0.7680420279502869, "geo/layer_14/stable_rank_q_proj": 49.10555648803711, "geo/layer_14/stable_rank_k_proj": 41.47868347167969, "geo/layer_14/stable_rank_o_proj": 43.085411071777344, "geo/layer_14/stable_rank_gate_proj": 70.24254608154297, "geo/layer_14/stable_rank_down_proj": 124.98269653320312, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.413827508687973, "geo/layer_14/attn_entropy_mean": 5.499021530151367, "geo/layer_14/attn_entropy_std": 0.4109375476837158, "geo/layer_21/stable_rank_q_proj": 39.40253448486328, "geo/layer_21/stable_rank_k_proj": 30.023561477661133, "geo/layer_21/stable_rank_o_proj": 67.86092376708984, "geo/layer_21/stable_rank_gate_proj": 62.88959503173828, "geo/layer_21/stable_rank_down_proj": 49.24449157714844, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14386844635009766, "geo/layer_21/attn_entropy_mean": 5.679419994354248, "geo/layer_21/attn_entropy_std": 0.2879702150821686, "geo/layer_27/stable_rank_q_proj": 43.7864875793457, "geo/layer_27/stable_rank_k_proj": 32.33644104003906, "geo/layer_27/stable_rank_o_proj": 115.2960433959961, "geo/layer_27/stable_rank_gate_proj": 76.22096252441406, "geo/layer_27/stable_rank_down_proj": 127.76869201660156, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08736325055360794, "geo/layer_27/attn_entropy_mean": 4.141676902770996, "geo/layer_27/attn_entropy_std": 0.7928669452667236, "attnres/final_alpha/block_0": 0.23957276344299316, "attnres/block_norm/0": 1.7789480686187744, "attnres/final_alpha/block_1": 0.0039129918441176414, "attnres/block_norm/1": 47430.37109375, "attnres/final_alpha/block_2": 0.009097959846258163, "attnres/block_norm/2": 29189.556640625, "attnres/final_alpha/block_3": 0.010569778271019459, "attnres/block_norm/3": 63965.46875, "attnres/final_alpha/block_4": 0.012634914368391037, "attnres/block_norm/4": 15720.1455078125, "attnres/final_alpha/block_5": 0.6172069311141968, "attnres/block_norm/5": 6801.10888671875, "attnres/final_alpha/block_6": 0.10700462758541107, "attnres/block_norm/6": 40961.3203125, "geo/tier1_time_s": 1.3571557998657227, "geo/step": 79500.0, "geo/rankme_slope": 0.00043759126697554023} {"step": 79500, "timestamp": 1778280476.8131819, "geo/ww_alpha_mean": 7.732133418309897, "geo/ww_alpha_std": 4.619980379054061, "geo/ww_alpha_min": 1.3377146653762737, "geo/ww_alpha_max": 31.366595932364913, "geo/ww_alpha_healthy_frac": 0.17258883248730963, "geo/ww_alpha_by_type/q_proj": 3.9315033046211343, "geo/ww_alpha_by_type/k_proj": 4.456940460515187, "geo/ww_alpha_by_type/v_proj": 8.713466158248313, "geo/ww_alpha_by_type/o_proj": 9.606233616432386, "geo/ww_alpha_by_type/gate_proj": 8.03895459014667, "geo/ww_alpha_by_type/up_proj": 11.505795985798148, "geo/ww_alpha_by_type/down_proj": 7.959308128336586, "geo/twonn_id/layer_0": 0.7365629076957703, "geo/twonn_id/layer_7": 3.1931023597717285, "geo/twonn_id/layer_14": 4.5418901443481445, "geo/twonn_id/layer_21": 6.335668563842773, "geo/twonn_id/layer_27": 5.948830604553223, "geo/tier2_time_s": 6.747494697570801} {"step": 79500, "timestamp": 1778280477.4603844, "eoc/jacobian_sigma/layer_0/attn": 1326.1292724609375, "eoc/jacobian_sigma/layer_0/mlp": 8393.7890625, "eoc/jacobian_sigma/layer_0": 8393.7890625, "eoc/jacobian_sigma/layer_7/attn": 1.1541340351104736, "eoc/jacobian_sigma/layer_7/mlp": 1.9306902885437012, "eoc/jacobian_sigma/layer_7": 1.9306902885437012, "eoc/jacobian_sigma/layer_14/attn": 1.3776085376739502, "eoc/jacobian_sigma/layer_14/mlp": 6.620560646057129, "eoc/jacobian_sigma/layer_14": 6.620560646057129, "eoc/jacobian_sigma/layer_21/attn": 1.0978037118911743, "eoc/jacobian_sigma/layer_21/mlp": 4.061341762542725, "eoc/jacobian_sigma/layer_21": 4.061341762542725, "eoc/jacobian_sigma/layer_27/attn": 3.153249502182007, "eoc/jacobian_sigma/layer_27/mlp": 25.807369232177734, "eoc/jacobian_sigma/layer_27": 25.807369232177734, "eoc/layer0_sigma": 8393.7890625, "eoc/sigma_max": 25.807369232177734, "eoc/sigma_min": 1.9306902885437012, "eoc/sigma_mean": 9.604990482330322, "eoc/time_s": 0.641167163848877} {"step": 79510, "timestamp": 1778280487.8403635, "train/loss": 2.028847098350525, "train/z_loss": 0.0014198645832948387, "train/perplexity": 7.605313124367672, "train/grad_norm": 0.07421875, "optim/muon_lr": 0.0021858322620391846, "optim/adamw_lr": 6.557496786117553e-05, "perf/tokens_per_sec": 1095808.7939778792, "perf/iters_per_sec": 0.5225223512544056, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.9137937307357789, "data/tokens_consumed": 166746652672, "data/tokens_consumed_B": 166.746652672, "train/loss_slope": 1.3619352946437396e-05} {"step": 79520, "timestamp": 1778280498.2120056, "train/loss": 2.1014740228652955, "train/z_loss": 0.001401069387793541, "train/perplexity": 8.17821590959712, "train/grad_norm": 0.06884765625, "optim/muon_lr": 0.0021617656946182253, "optim/adamw_lr": 6.485297083854674e-05, "perf/tokens_per_sec": 2023740.9782065924, "perf/iters_per_sec": 0.9649948969872438, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0362749099731445, "data/tokens_consumed": 166767624192, "data/tokens_consumed_B": 166.767624192, "train/loss_slope": 1.6199896251908543e-05} {"step": 79530, "timestamp": 1778280508.5825243, "train/loss": 2.1409335851669313, "train/z_loss": 0.0014013136737048627, "train/perplexity": 8.507376282911933, "train/grad_norm": 0.0712890625, "optim/muon_lr": 0.002137817144393921, "optim/adamw_lr": 6.413451433181762e-05, "perf/tokens_per_sec": 2024095.0415806072, "perf/iters_per_sec": 0.9651637275603329, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0360936403274537, "data/tokens_consumed": 166788595712, "data/tokens_consumed_B": 166.788595712, "train/loss_slope": 1.6058227567389494e-05} {"step": 79540, "timestamp": 1778280518.9360938, "train/loss": 2.0685351967811583, "train/z_loss": 0.0014137727906927467, "train/perplexity": 7.913223309230305, "train/grad_norm": 0.076171875, "optim/muon_lr": 0.0021139848232269286, "optim/adamw_lr": 6.341954469680785e-05, "perf/tokens_per_sec": 2026762.9937747053, "perf/iters_per_sec": 0.9664359063027884, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347297668457032, "data/tokens_consumed": 166809567232, "data/tokens_consumed_B": 166.809567232, "train/loss_slope": 1.661204280275288e-05} {"step": 79550, "timestamp": 1778280529.29079, "grad/layer_0/attn": 0.002490212209522724, "grad/layer_0/mlp": 0.0027803699485957623, "grad/layer_0/attn_mlp_ratio": 0.8956405679813795, "grad/layer_4/attn": 0.002147659659385681, "grad/layer_4/mlp": 0.002375038806349039, "grad/layer_4/attn_mlp_ratio": 0.90426293802787, "grad/layer_8/attn": 0.0028231695760041475, "grad/layer_8/mlp": 0.0034222227986902, "grad/layer_8/attn_mlp_ratio": 0.8249519857647699, "grad/layer_12/attn": 0.0036214757710695267, "grad/layer_12/mlp": 0.005721499212086201, "grad/layer_12/attn_mlp_ratio": 0.632959225114203, "grad/layer_16/attn": 0.004816544707864523, "grad/layer_16/mlp": 0.0042227585799992085, "grad/layer_16/attn_mlp_ratio": 1.1406156668809277, "grad/layer_20/attn": 0.002584869507700205, "grad/layer_20/mlp": 0.004559358116239309, "grad/layer_20/attn_mlp_ratio": 0.5669371400767629, "grad/layer_24/attn": 0.003997187130153179, "grad/layer_24/mlp": 0.006078260950744152, "grad/layer_24/attn_mlp_ratio": 0.6576201806376529, "grad/layer_27/attn": 0.0035171916242688894, "grad/layer_27/mlp": 0.005364787299185991, "grad/layer_27/attn_mlp_ratio": 0.6556068978246108} {"step": 79550, "timestamp": 1778280529.3056061, "train/loss": 2.052502524852753, "train/z_loss": 0.0014114122721366584, "train/perplexity": 7.787364815929726, "train/grad_norm": 0.07568359375, "optim/muon_lr": 0.0020902705192565917, "optim/adamw_lr": 6.270811557769775e-05, "perf/tokens_per_sec": 2023692.602735881, "perf/iters_per_sec": 0.9649718297652631, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0362996816635133, "data/tokens_consumed": 166830538752, "data/tokens_consumed_B": 166.830538752, "train/loss_slope": 1.3434530899684219e-05} {"step": 79560, "timestamp": 1778280539.6948967, "train/loss": 2.0840343236923218, "train/z_loss": 0.0014020706876181066, "train/perplexity": 8.036826760006155, "train/grad_norm": 0.06787109375, "optim/muon_lr": 0.0020666760206222534, "optim/adamw_lr": 6.20002806186676e-05, "perf/tokens_per_sec": 2020449.6055118458, "perf/iters_per_sec": 0.9634254481848935, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0379630327224731, "data/tokens_consumed": 166851510272, "data/tokens_consumed_B": 166.851510272, "train/loss_slope": 1.384296307075931e-05} {"step": 79570, "timestamp": 1778280550.0753455, "train/loss": 2.0731921553611756, "train/z_loss": 0.001409183326177299, "train/perplexity": 7.950160803850994, "train/grad_norm": 0.07275390625, "optim/muon_lr": 0.002043198347091675, "optim/adamw_lr": 6.129595041275024e-05, "perf/tokens_per_sec": 2021359.5935887638, "perf/iters_per_sec": 0.9638593643134898, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037495756149292, "data/tokens_consumed": 166872481792, "data/tokens_consumed_B": 166.872481792, "train/loss_slope": 1.5638473196284353e-05} {"step": 79575, "timestamp": 1778280555.8805072, "eos/sharpness": 54.68125343322753, "eos/L0_probe": 1.905593752861023, "eos/L_plus": 2.2135162353515625, "eos/L_minus": 2.144483804702759, "eos/grad_norm": 0.09736384451389313, "eos/embed_grad_frac": 0.18409831821918488, "eos/time_s": 0.6284687519073486} {"step": 79575, "timestamp": 1778280557.2594852, "geo/rankme_last": 440.7408752441406, "geo/layer_0/stable_rank_q_proj": 18.706096649169922, "geo/layer_0/stable_rank_k_proj": 15.685647010803223, "geo/layer_0/stable_rank_o_proj": 46.28633117675781, "geo/layer_0/stable_rank_gate_proj": 127.05506896972656, "geo/layer_0/stable_rank_down_proj": 56.827606201171875, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06487882137298584, "geo/layer_0/attn_entropy_mean": 6.126654624938965, "geo/layer_0/attn_entropy_std": 0.42791515588760376, "geo/layer_7/stable_rank_q_proj": 42.534942626953125, "geo/layer_7/stable_rank_k_proj": 40.04739761352539, "geo/layer_7/stable_rank_o_proj": 87.90210723876953, "geo/layer_7/stable_rank_gate_proj": 76.83861541748047, "geo/layer_7/stable_rank_down_proj": 140.35484313964844, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.44396859407424927, "geo/layer_7/attn_entropy_mean": 4.648544788360596, "geo/layer_7/attn_entropy_std": 0.7685686945915222, "geo/layer_14/stable_rank_q_proj": 49.107215881347656, "geo/layer_14/stable_rank_k_proj": 41.484153747558594, "geo/layer_14/stable_rank_o_proj": 43.085060119628906, "geo/layer_14/stable_rank_gate_proj": 70.2394790649414, "geo/layer_14/stable_rank_down_proj": 124.99140167236328, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.40689247846603394, "geo/layer_14/attn_entropy_mean": 5.502826690673828, "geo/layer_14/attn_entropy_std": 0.4099941551685333, "geo/layer_21/stable_rank_q_proj": 39.405120849609375, "geo/layer_21/stable_rank_k_proj": 30.02251625061035, "geo/layer_21/stable_rank_o_proj": 67.86509704589844, "geo/layer_21/stable_rank_gate_proj": 62.89081954956055, "geo/layer_21/stable_rank_down_proj": 49.2430534362793, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1456058770418167, "geo/layer_21/attn_entropy_mean": 5.677541732788086, "geo/layer_21/attn_entropy_std": 0.2877795994281769, "geo/layer_27/stable_rank_q_proj": 43.78666305541992, "geo/layer_27/stable_rank_k_proj": 32.337310791015625, "geo/layer_27/stable_rank_o_proj": 115.29619598388672, "geo/layer_27/stable_rank_gate_proj": 76.22284698486328, "geo/layer_27/stable_rank_down_proj": 127.7782211303711, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08886963874101639, "geo/layer_27/attn_entropy_mean": 4.140137672424316, "geo/layer_27/attn_entropy_std": 0.7931403517723083, "attnres/final_alpha/block_0": 0.23961195349693298, "attnres/block_norm/0": 1.778891682624817, "attnres/final_alpha/block_1": 0.003913217224180698, "attnres/block_norm/1": 47435.109375, "attnres/final_alpha/block_2": 0.009105652570724487, "attnres/block_norm/2": 29139.859375, "attnres/final_alpha/block_3": 0.010586337186396122, "attnres/block_norm/3": 63878.453125, "attnres/final_alpha/block_4": 0.01262656133621931, "attnres/block_norm/4": 15736.611328125, "attnres/final_alpha/block_5": 0.61695796251297, "attnres/block_norm/5": 6805.9453125, "attnres/final_alpha/block_6": 0.1071983128786087, "attnres/block_norm/6": 40950.9375, "geo/tier1_time_s": 1.3594460487365723, "geo/step": 79575.0, "geo/rankme_slope": 0.00042879292341936777} {"step": 79580, "timestamp": 1778280562.4501305, "train/loss": 2.0630808115005492, "train/z_loss": 0.0014151036157272756, "train/perplexity": 7.870179037227095, "train/grad_norm": 0.06982421875, "optim/muon_lr": 0.0020198410749435423, "optim/adamw_lr": 6.059523224830627e-05, "perf/tokens_per_sec": 1695433.4480709704, "perf/iters_per_sec": 0.808445667300687, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2369415044784546, "data/tokens_consumed": 166893453312, "data/tokens_consumed_B": 166.893453312, "train/loss_slope": 1.3373729691217892e-05} {"step": 79590, "timestamp": 1778280572.8267658, "train/loss": 2.0024593830108643, "train/z_loss": 0.0014217045158147813, "train/perplexity": 7.407250982889854, "train/grad_norm": 0.06884765625, "optim/muon_lr": 0.001996601223945618, "optim/adamw_lr": 5.989803671836853e-05, "perf/tokens_per_sec": 2022090.303448039, "perf/iters_per_sec": 0.9642077939262577, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0371208429336547, "data/tokens_consumed": 166914424832, "data/tokens_consumed_B": 166.914424832, "train/loss_slope": 7.65104697983152e-06} {"step": 79600, "timestamp": 1778280583.1906378, "grad/layer_0/attn": 0.0023985854350030422, "grad/layer_0/mlp": 0.0027274105232208967, "grad/layer_0/attn_mlp_ratio": 0.8794368602152275, "grad/layer_4/attn": 0.0018706106347963214, "grad/layer_4/mlp": 0.0024388323072344065, "grad/layer_4/attn_mlp_ratio": 0.7670107340083944, "grad/layer_8/attn": 0.004335886798799038, "grad/layer_8/mlp": 0.0032916103955358267, "grad/layer_8/attn_mlp_ratio": 1.317253911019997, "grad/layer_12/attn": 0.0038852926809340715, "grad/layer_12/mlp": 0.005929670296609402, "grad/layer_12/attn_mlp_ratio": 0.655229114109899, "grad/layer_16/attn": 0.00406111404299736, "grad/layer_16/mlp": 0.004361676052212715, "grad/layer_16/attn_mlp_ratio": 0.9310902279934569, "grad/layer_20/attn": 0.0026374191511422396, "grad/layer_20/mlp": 0.004844382405281067, "grad/layer_20/attn_mlp_ratio": 0.5444283452570224, "grad/layer_24/attn": 0.003531097900122404, "grad/layer_24/mlp": 0.006347403861582279, "grad/layer_24/attn_mlp_ratio": 0.5563058411745033, "grad/layer_27/attn": 0.006837360095232725, "grad/layer_27/mlp": 0.005432219244539738, "grad/layer_27/attn_mlp_ratio": 1.2586678964105864} {"step": 79600, "timestamp": 1778280583.2073026, "train/loss": 2.0887087106704714, "train/z_loss": 0.00141269100131467, "train/perplexity": 8.074481937230367, "train/grad_norm": 0.07275390625, "optim/muon_lr": 0.001973482966423035, "optim/adamw_lr": 5.9204488992691036e-05, "perf/tokens_per_sec": 2021545.5085640827, "perf/iters_per_sec": 0.9639480154819883, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0374003410339356, "data/tokens_consumed": 166935396352, "data/tokens_consumed_B": 166.935396352, "train/loss_slope": 7.3447938370268644e-06} {"step": 79610, "timestamp": 1778280593.5859349, "train/loss": 2.0519656896591187, "train/z_loss": 0.0014108979958109558, "train/perplexity": 7.783185406358313, "train/grad_norm": 0.08447265625, "optim/muon_lr": 0.0019504833221435547, "optim/adamw_lr": 5.851449966430664e-05, "perf/tokens_per_sec": 2021583.2810781645, "perf/iters_per_sec": 0.9639660268202612, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373809576034545, "data/tokens_consumed": 166956367872, "data/tokens_consumed_B": 166.956367872, "train/loss_slope": 4.414580592943219e-06} {"step": 79620, "timestamp": 1778280603.9662082, "train/loss": 2.0957727670669555, "train/z_loss": 0.001421450253110379, "train/perplexity": 8.131722470154449, "train/grad_norm": 0.09326171875, "optim/muon_lr": 0.001927604079246521, "optim/adamw_lr": 5.782812237739563e-05, "perf/tokens_per_sec": 2021320.8539939737, "perf/iters_per_sec": 0.9638408918352002, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037515640258789, "data/tokens_consumed": 166977339392, "data/tokens_consumed_B": 166.977339392, "train/loss_slope": 6.73313433437999e-06} {"step": 79630, "timestamp": 1778280614.3398585, "train/loss": 2.0343497157096864, "train/z_loss": 0.0014206703752279281, "train/perplexity": 7.64727760373789, "train/grad_norm": 0.0673828125, "optim/muon_lr": 0.0019048464298248291, "optim/adamw_lr": 5.714539289474487e-05, "perf/tokens_per_sec": 2022621.53219457, "perf/iters_per_sec": 0.9644611035321092, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0368484497070312, "data/tokens_consumed": 166998310912, "data/tokens_consumed_B": 166.998310912, "train/loss_slope": 3.522138247932045e-06} {"step": 79640, "timestamp": 1778280625.0813134, "train/loss": 2.1097083687782288, "train/z_loss": 0.0014057545573450624, "train/perplexity": 8.245836190660652, "train/grad_norm": 0.06689453125, "optim/muon_lr": 0.0018822085857391358, "optim/adamw_lr": 5.646625757217407e-05, "perf/tokens_per_sec": 1953187.5506545485, "perf/iters_per_sec": 0.9313524010918371, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0737074375152589, "data/tokens_consumed": 167019282432, "data/tokens_consumed_B": 167.019282432, "train/loss_slope": 5.5894254529365624e-06} {"step": 79650, "timestamp": 1778280635.9563289, "grad/layer_0/attn": 0.002228250727057457, "grad/layer_0/mlp": 0.0025526895187795162, "grad/layer_0/attn_mlp_ratio": 0.8729031178192439, "grad/layer_4/attn": 0.0023091479670256376, "grad/layer_4/mlp": 0.0024961773306131363, "grad/layer_4/attn_mlp_ratio": 0.9250736500964362, "grad/layer_8/attn": 0.003113066777586937, "grad/layer_8/mlp": 0.003320497227832675, "grad/layer_8/attn_mlp_ratio": 0.937530276411597, "grad/layer_12/attn": 0.003396386280655861, "grad/layer_12/mlp": 0.0062973955646157265, "grad/layer_12/attn_mlp_ratio": 0.5393318859953057, "grad/layer_16/attn": 0.003380506532266736, "grad/layer_16/mlp": 0.004272191319614649, "grad/layer_16/attn_mlp_ratio": 0.7912816164430333, "grad/layer_20/attn": 0.0026799223851412535, "grad/layer_20/mlp": 0.004353955388069153, "grad/layer_20/attn_mlp_ratio": 0.6155144195857909, "grad/layer_24/attn": 0.00345922471024096, "grad/layer_24/mlp": 0.005881617311388254, "grad/layer_24/attn_mlp_ratio": 0.5881417420220248, "grad/layer_27/attn": 0.005164764821529388, "grad/layer_27/mlp": 0.005215401761233807, "grad/layer_27/attn_mlp_ratio": 0.990290865200474} {"step": 79650, "timestamp": 1778280636.5726578, "eos/sharpness": 20.45485973358154, "eos/L0_probe": 1.9063576459884644, "eos/L_plus": 2.00439190864563, "eos/L_minus": 2.0128719806671143, "eos/grad_norm": 0.06966380029916763, "eos/embed_grad_frac": 0.32001999020576477, "eos/time_s": 0.6134614944458008} {"step": 79650, "timestamp": 1778280636.5929382, "train/loss": 2.093601882457733, "train/z_loss": 0.0014044264447875322, "train/perplexity": 8.114088586486176, "train/grad_norm": 0.06982421875, "optim/muon_lr": 0.0018596935272216797, "optim/adamw_lr": 5.5790805816650385e-05, "perf/tokens_per_sec": 1822613.0221977662, "perf/iters_per_sec": 0.869089614008792, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1506293296813965, "data/tokens_consumed": 167040253952, "data/tokens_consumed_B": 167.040253952, "train/loss_slope": 6.821107800000456e-06} {"step": 79650, "timestamp": 1778280637.958426, "geo/rankme_last": 440.7566223144531, "geo/layer_0/stable_rank_q_proj": 18.70663833618164, "geo/layer_0/stable_rank_k_proj": 15.685083389282227, "geo/layer_0/stable_rank_o_proj": 46.2864875793457, "geo/layer_0/stable_rank_gate_proj": 127.0461654663086, "geo/layer_0/stable_rank_down_proj": 56.83346939086914, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06165696680545807, "geo/layer_0/attn_entropy_mean": 6.126730442047119, "geo/layer_0/attn_entropy_std": 0.42759665846824646, "geo/layer_7/stable_rank_q_proj": 42.536373138427734, "geo/layer_7/stable_rank_k_proj": 40.04669189453125, "geo/layer_7/stable_rank_o_proj": 87.90396118164062, "geo/layer_7/stable_rank_gate_proj": 76.84095001220703, "geo/layer_7/stable_rank_down_proj": 140.34703063964844, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4536624848842621, "geo/layer_7/attn_entropy_mean": 4.647917747497559, "geo/layer_7/attn_entropy_std": 0.7666803598403931, "geo/layer_14/stable_rank_q_proj": 49.106712341308594, "geo/layer_14/stable_rank_k_proj": 41.481292724609375, "geo/layer_14/stable_rank_o_proj": 43.080543518066406, "geo/layer_14/stable_rank_gate_proj": 70.24185943603516, "geo/layer_14/stable_rank_down_proj": 125.01007843017578, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.40726733207702637, "geo/layer_14/attn_entropy_mean": 5.4993791580200195, "geo/layer_14/attn_entropy_std": 0.4092310965061188, "geo/layer_21/stable_rank_q_proj": 39.40314865112305, "geo/layer_21/stable_rank_k_proj": 30.019758224487305, "geo/layer_21/stable_rank_o_proj": 67.86659240722656, "geo/layer_21/stable_rank_gate_proj": 62.888221740722656, "geo/layer_21/stable_rank_down_proj": 49.2445182800293, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14874693751335144, "geo/layer_21/attn_entropy_mean": 5.679294586181641, "geo/layer_21/attn_entropy_std": 0.28755512833595276, "geo/layer_27/stable_rank_q_proj": 43.78887939453125, "geo/layer_27/stable_rank_k_proj": 32.33574676513672, "geo/layer_27/stable_rank_o_proj": 115.30268859863281, "geo/layer_27/stable_rank_gate_proj": 76.21971130371094, "geo/layer_27/stable_rank_down_proj": 127.77790832519531, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08620043843984604, "geo/layer_27/attn_entropy_mean": 4.140198230743408, "geo/layer_27/attn_entropy_std": 0.7915230989456177, "attnres/final_alpha/block_0": 0.23936760425567627, "attnres/block_norm/0": 1.778926134109497, "attnres/final_alpha/block_1": 0.0039298818446695805, "attnres/block_norm/1": 47461.0, "attnres/final_alpha/block_2": 0.009134924039244652, "attnres/block_norm/2": 29110.34765625, "attnres/final_alpha/block_3": 0.01061317976564169, "attnres/block_norm/3": 63884.6875, "attnres/final_alpha/block_4": 0.01261512003839016, "attnres/block_norm/4": 15732.734375, "attnres/final_alpha/block_5": 0.6171824336051941, "attnres/block_norm/5": 6812.9853515625, "attnres/final_alpha/block_6": 0.10715686529874802, "attnres/block_norm/6": 40927.515625, "geo/tier1_time_s": 1.3612830638885498, "geo/step": 79650.0, "geo/rankme_slope": 0.00042291277839260705} {"step": 79660, "timestamp": 1778280648.3339958, "train/loss": 2.05832097530365, "train/z_loss": 0.0014121006475761532, "train/perplexity": 7.832807286433674, "train/grad_norm": 0.0693359375, "optim/muon_lr": 0.00183729887008667, "optim/adamw_lr": 5.511896610260009e-05, "perf/tokens_per_sec": 1786772.2041897455, "perf/iters_per_sec": 0.851999380202172, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.173709774017334, "data/tokens_consumed": 167061225472, "data/tokens_consumed_B": 167.061225472, "train/loss_slope": 3.460599765955022e-06} {"step": 79670, "timestamp": 1778280658.7128973, "train/loss": 2.088746666908264, "train/z_loss": 0.0014195668511092663, "train/perplexity": 8.07478842000326, "train/grad_norm": 0.0693359375, "optim/muon_lr": 0.001815025806427002, "optim/adamw_lr": 5.4450774192810055e-05, "perf/tokens_per_sec": 2021478.6551389059, "perf/iters_per_sec": 0.9639161372847108, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0374346494674682, "data/tokens_consumed": 167082196992, "data/tokens_consumed_B": 167.082196992, "train/loss_slope": 2.163162182803078e-06} {"step": 79680, "timestamp": 1778280669.0973825, "train/loss": 2.0721189975738525, "train/z_loss": 0.0014044345123693347, "train/perplexity": 7.941633603208133, "train/grad_norm": 0.072265625, "optim/muon_lr": 0.0017928767204284668, "optim/adamw_lr": 5.3786301612854e-05, "perf/tokens_per_sec": 2020590.8392783084, "perf/iters_per_sec": 0.9634927936927359, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0378904819488526, "data/tokens_consumed": 167103168512, "data/tokens_consumed_B": 167.103168512, "train/loss_slope": -9.001024628486566e-07} {"step": 79690, "timestamp": 1778280679.4766722, "train/loss": 2.063578248023987, "train/z_loss": 0.0014036192093044519, "train/perplexity": 7.874094925598407, "train/grad_norm": 0.07958984375, "optim/muon_lr": 0.0017708492279052736, "optim/adamw_lr": 5.3125476837158196e-05, "perf/tokens_per_sec": 2021545.462104295, "perf/iters_per_sec": 0.9639479933282351, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0374003648757935, "data/tokens_consumed": 167124140032, "data/tokens_consumed_B": 167.124140032, "train/loss_slope": -1.1559819803200644e-06} {"step": 79700, "timestamp": 1778280690.328515, "grad/layer_0/attn": 0.0024619351606816053, "grad/layer_0/mlp": 0.0029175197705626488, "grad/layer_0/attn_mlp_ratio": 0.8438452075415743, "grad/layer_4/attn": 0.0021370314061641693, "grad/layer_4/mlp": 0.002609117655083537, "grad/layer_4/attn_mlp_ratio": 0.8190628429860757, "grad/layer_8/attn": 0.004548318218439817, "grad/layer_8/mlp": 0.0035170642659068108, "grad/layer_8/attn_mlp_ratio": 1.2932143814397086, "grad/layer_12/attn": 0.003552085952833295, "grad/layer_12/mlp": 0.0060152048245072365, "grad/layer_12/attn_mlp_ratio": 0.5905178622196784, "grad/layer_16/attn": 0.0028003782499581575, "grad/layer_16/mlp": 0.003982123918831348, "grad/layer_16/attn_mlp_ratio": 0.7032373267922473, "grad/layer_20/attn": 0.003413387108594179, "grad/layer_20/mlp": 0.0050306604243814945, "grad/layer_20/attn_mlp_ratio": 0.6785166862385026, "grad/layer_24/attn": 0.005237876437604427, "grad/layer_24/mlp": 0.0062808808870613575, "grad/layer_24/attn_mlp_ratio": 0.8339397687035427, "grad/layer_27/attn": 0.0035966085270047188, "grad/layer_27/mlp": 0.005791370756924152, "grad/layer_27/attn_mlp_ratio": 0.6210288748310118} {"step": 79700, "timestamp": 1778280690.3456414, "train/loss": 2.0998626232147215, "train/z_loss": 0.0014223261270672083, "train/perplexity": 8.16504814745103, "train/grad_norm": 0.07373046875, "optim/muon_lr": 0.001748945713043213, "optim/adamw_lr": 5.2468371391296385e-05, "perf/tokens_per_sec": 1930336.783194046, "perf/iters_per_sec": 0.9204563060732107, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0864176750183105, "data/tokens_consumed": 167145111552, "data/tokens_consumed_B": 167.145111552, "train/loss_slope": 1.689828389977807e-06} {"step": 79710, "timestamp": 1778280701.2344139, "train/loss": 2.095085322856903, "train/z_loss": 0.001415387133602053, "train/perplexity": 8.126134285627208, "train/grad_norm": 0.06884765625, "optim/muon_lr": 0.001727164387702942, "optim/adamw_lr": 5.181493163108825e-05, "perf/tokens_per_sec": 1927005.5608086616, "perf/iters_per_sec": 0.9188678554576214, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0882957696914672, "data/tokens_consumed": 167166083072, "data/tokens_consumed_B": 167.166083072, "train/loss_slope": 4.589991779825251e-06} {"step": 79720, "timestamp": 1778280711.6171632, "train/loss": 2.0601255655288697, "train/z_loss": 0.0014158619684167205, "train/perplexity": 7.8469550555223, "train/grad_norm": 0.0888671875, "optim/muon_lr": 0.0017055076360702516, "optim/adamw_lr": 5.116522908210754e-05, "perf/tokens_per_sec": 2020923.046198189, "perf/iters_per_sec": 0.9636512022963472, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0377198696136474, "data/tokens_consumed": 167187054592, "data/tokens_consumed_B": 167.187054592, "train/loss_slope": 1.0523569895060128e-06} {"step": 79725, "timestamp": 1778280717.4030676, "eos/sharpness": 54.4739007949829, "eos/L0_probe": 1.9072914123535156, "eos/L_plus": 2.158557176589966, "eos/L_minus": 2.2007646560668945, "eos/grad_norm": 0.09434816986322403, "eos/embed_grad_frac": 0.19745291769504547, "eos/time_s": 0.6090826988220215} {"step": 79725, "timestamp": 1778280718.7806454, "geo/rankme_last": 440.7270202636719, "geo/layer_0/stable_rank_q_proj": 18.705778121948242, "geo/layer_0/stable_rank_k_proj": 15.685546875, "geo/layer_0/stable_rank_o_proj": 46.28668975830078, "geo/layer_0/stable_rank_gate_proj": 127.03950500488281, "geo/layer_0/stable_rank_down_proj": 56.83512496948242, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06325898319482803, "geo/layer_0/attn_entropy_mean": 6.1265974044799805, "geo/layer_0/attn_entropy_std": 0.42752644419670105, "geo/layer_7/stable_rank_q_proj": 42.53622055053711, "geo/layer_7/stable_rank_k_proj": 40.04435348510742, "geo/layer_7/stable_rank_o_proj": 87.89129638671875, "geo/layer_7/stable_rank_gate_proj": 76.83988189697266, "geo/layer_7/stable_rank_down_proj": 140.34156799316406, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.45827171206474304, "geo/layer_7/attn_entropy_mean": 4.644312858581543, "geo/layer_7/attn_entropy_std": 0.7678818702697754, "geo/layer_14/stable_rank_q_proj": 49.10585403442383, "geo/layer_14/stable_rank_k_proj": 41.479087829589844, "geo/layer_14/stable_rank_o_proj": 43.08236312866211, "geo/layer_14/stable_rank_gate_proj": 70.24394226074219, "geo/layer_14/stable_rank_down_proj": 125.00736999511719, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.41044169664382935, "geo/layer_14/attn_entropy_mean": 5.499096393585205, "geo/layer_14/attn_entropy_std": 0.4101063013076782, "geo/layer_21/stable_rank_q_proj": 39.40142059326172, "geo/layer_21/stable_rank_k_proj": 30.01904296875, "geo/layer_21/stable_rank_o_proj": 67.86466979980469, "geo/layer_21/stable_rank_gate_proj": 62.884586334228516, "geo/layer_21/stable_rank_down_proj": 49.24359893798828, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15523074567317963, "geo/layer_21/attn_entropy_mean": 5.678806781768799, "geo/layer_21/attn_entropy_std": 0.28756847977638245, "geo/layer_27/stable_rank_q_proj": 43.78938293457031, "geo/layer_27/stable_rank_k_proj": 32.33470153808594, "geo/layer_27/stable_rank_o_proj": 115.3054428100586, "geo/layer_27/stable_rank_gate_proj": 76.22370910644531, "geo/layer_27/stable_rank_down_proj": 127.77790832519531, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08545958250761032, "geo/layer_27/attn_entropy_mean": 4.143454551696777, "geo/layer_27/attn_entropy_std": 0.7909495830535889, "attnres/final_alpha/block_0": 0.23936831951141357, "attnres/block_norm/0": 1.7788927555084229, "attnres/final_alpha/block_1": 0.0039281826466321945, "attnres/block_norm/1": 47446.7734375, "attnres/final_alpha/block_2": 0.009102217853069305, "attnres/block_norm/2": 29169.14453125, "attnres/final_alpha/block_3": 0.010580673813819885, "attnres/block_norm/3": 63876.9140625, "attnres/final_alpha/block_4": 0.012616044841706753, "attnres/block_norm/4": 15730.0, "attnres/final_alpha/block_5": 0.616790771484375, "attnres/block_norm/5": 6810.8515625, "attnres/final_alpha/block_6": 0.10761377215385437, "attnres/block_norm/6": 40840.6015625, "geo/tier1_time_s": 1.3570072650909424, "geo/step": 79725.0, "geo/rankme_slope": 0.00042421960971888756} {"step": 79730, "timestamp": 1778280723.9726117, "train/loss": 2.0691409349441527, "train/z_loss": 0.0014134730910882354, "train/perplexity": 7.918018102629067, "train/grad_norm": 0.0712890625, "optim/muon_lr": 0.0016839736700057983, "optim/adamw_lr": 5.051921010017395e-05, "perf/tokens_per_sec": 1698028.320057734, "perf/iters_per_sec": 0.8096829986847562, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2350512504577638, "data/tokens_consumed": 167208026112, "data/tokens_consumed_B": 167.208026112, "train/loss_slope": 1.0127360420902314e-06} {"step": 79740, "timestamp": 1778280734.3472054, "train/loss": 2.1285121202468873, "train/z_loss": 0.0013998296577483415, "train/perplexity": 8.40235581122626, "train/grad_norm": 0.068359375, "optim/muon_lr": 0.0016625642776489258, "optim/adamw_lr": 4.987692832946777e-05, "perf/tokens_per_sec": 2022403.00878883, "perf/iters_per_sec": 0.9643569034523153, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036960482597351, "data/tokens_consumed": 167228997632, "data/tokens_consumed_B": 167.228997632, "train/loss_slope": 3.18722935220769e-06} {"step": 79750, "timestamp": 1778280744.726513, "grad/layer_0/attn": 0.0029980777762830257, "grad/layer_0/mlp": 0.003257047850638628, "grad/layer_0/attn_mlp_ratio": 0.9204892963565905, "grad/layer_4/attn": 0.001985374139621854, "grad/layer_4/mlp": 0.0026168792974203825, "grad/layer_4/attn_mlp_ratio": 0.758680029954364, "grad/layer_8/attn": 0.0031345451716333628, "grad/layer_8/mlp": 0.0035785390064120293, "grad/layer_8/attn_mlp_ratio": 0.8759287179555708, "grad/layer_12/attn": 0.004026089329272509, "grad/layer_12/mlp": 0.006506404373794794, "grad/layer_12/attn_mlp_ratio": 0.6187886636141349, "grad/layer_16/attn": 0.00292613310739398, "grad/layer_16/mlp": 0.004243733827024698, "grad/layer_16/attn_mlp_ratio": 0.6895185131093989, "grad/layer_20/attn": 0.0026747342199087143, "grad/layer_20/mlp": 0.004909916780889034, "grad/layer_20/attn_mlp_ratio": 0.5447616089632057, "grad/layer_24/attn": 0.003870616666972637, "grad/layer_24/mlp": 0.006830508820712566, "grad/layer_24/attn_mlp_ratio": 0.5666659266391604, "grad/layer_27/attn": 0.004018811043351889, "grad/layer_27/mlp": 0.005768968723714352, "grad/layer_27/attn_mlp_ratio": 0.6966255436902113} {"step": 79750, "timestamp": 1778280744.7439592, "train/loss": 2.10845068693161, "train/z_loss": 0.0014056591899134219, "train/perplexity": 8.235472070922226, "train/grad_norm": 0.07470703125, "optim/muon_lr": 0.0016412806510925292, "optim/adamw_lr": 4.9238419532775875e-05, "perf/tokens_per_sec": 2018141.149265083, "perf/iters_per_sec": 0.9623246904683509, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.039150309562683, "data/tokens_consumed": 167249969152, "data/tokens_consumed_B": 167.249969152, "train/loss_slope": 5.86331628873261e-06} {"step": 79760, "timestamp": 1778280755.1247077, "train/loss": 2.102877974510193, "train/z_loss": 0.001405897072982043, "train/perplexity": 8.189705793008793, "train/grad_norm": 0.0693359375, "optim/muon_lr": 0.001620120406150818, "optim/adamw_lr": 4.860361218452453e-05, "perf/tokens_per_sec": 2021181.979640874, "perf/iters_per_sec": 0.9637746713833208, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0375869274139404, "data/tokens_consumed": 167270940672, "data/tokens_consumed_B": 167.270940672, "train/loss_slope": 7.911239219720472e-06} {"step": 79770, "timestamp": 1778280765.5079763, "train/loss": 2.0534116625785828, "train/z_loss": 0.001416602684184909, "train/perplexity": 7.794447822294977, "train/grad_norm": 0.06591796875, "optim/muon_lr": 0.001599087119102478, "optim/adamw_lr": 4.7972613573074334e-05, "perf/tokens_per_sec": 2021044.0531035427, "perf/iters_per_sec": 0.9637089028852189, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0376577377319336, "data/tokens_consumed": 167291912192, "data/tokens_consumed_B": 167.291912192, "train/loss_slope": 5.87502535205685e-06} {"step": 79780, "timestamp": 1778280775.8891702, "train/loss": 2.088988494873047, "train/z_loss": 0.0014012923697009684, "train/perplexity": 8.076741365781857, "train/grad_norm": 0.07080078125, "optim/muon_lr": 0.0015781784057617189, "optim/adamw_lr": 4.734535217285156e-05, "perf/tokens_per_sec": 2021091.7447634437, "perf/iters_per_sec": 0.9637316440407961, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0376332521438598, "data/tokens_consumed": 167312883712, "data/tokens_consumed_B": 167.312883712, "train/loss_slope": 5.906877072766654e-06} {"step": 79790, "timestamp": 1778280786.2667189, "train/loss": 2.0457753419876097, "train/z_loss": 0.0014214302529580892, "train/perplexity": 7.735153602865673, "train/grad_norm": 0.07421875, "optim/muon_lr": 0.0015573948621749878, "optim/adamw_lr": 4.672184586524963e-05, "perf/tokens_per_sec": 2022125.0747013977, "perf/iters_per_sec": 0.9642243741518963, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037103009223938, "data/tokens_consumed": 167333855232, "data/tokens_consumed_B": 167.333855232, "train/loss_slope": -1.0178748387457568e-06} {"step": 79800, "timestamp": 1778280796.633584, "grad/layer_0/attn": 0.002445581369102001, "grad/layer_0/mlp": 0.002789021236822009, "grad/layer_0/attn_mlp_ratio": 0.8768600429169374, "grad/layer_4/attn": 0.0020201506558805704, "grad/layer_4/mlp": 0.002598751336336136, "grad/layer_4/attn_mlp_ratio": 0.7773543200917634, "grad/layer_8/attn": 0.0030165477655828, "grad/layer_8/mlp": 0.003416803665459156, "grad/layer_8/attn_mlp_ratio": 0.8828566030269025, "grad/layer_12/attn": 0.003272326895967126, "grad/layer_12/mlp": 0.006566163152456284, "grad/layer_12/attn_mlp_ratio": 0.4983620982531934, "grad/layer_16/attn": 0.004945312160998583, "grad/layer_16/mlp": 0.004369363654404879, "grad/layer_16/attn_mlp_ratio": 1.1318151655405377, "grad/layer_20/attn": 0.0030106790363788605, "grad/layer_20/mlp": 0.004438175819814205, "grad/layer_20/attn_mlp_ratio": 0.6783595537386666, "grad/layer_24/attn": 0.004166045226156712, "grad/layer_24/mlp": 0.0061883400194346905, "grad/layer_24/attn_mlp_ratio": 0.6732088323770548, "grad/layer_27/attn": 0.0031863462645560503, "grad/layer_27/mlp": 0.005522714462131262, "grad/layer_27/attn_mlp_ratio": 0.5769529148590313} {"step": 79800, "timestamp": 1778280797.2519872, "eos/sharpness": 5.970478057861327, "eos/L0_probe": 1.9070520401000977, "eos/L_plus": 1.9397755861282349, "eos/L_minus": 1.9340332746505737, "eos/grad_norm": 0.07109776884317398, "eos/embed_grad_frac": 0.34323084354400635, "eos/time_s": 0.6156647205352783} {"step": 79800, "timestamp": 1778280797.2716746, "train/loss": 2.0544066429138184, "train/z_loss": 0.0014121025102213024, "train/perplexity": 7.802207004078703, "train/grad_norm": 0.0712890625, "optim/muon_lr": 0.0015367388725280762, "optim/adamw_lr": 4.610216617584228e-05, "perf/tokens_per_sec": 1906596.4096516196, "perf/iters_per_sec": 0.9091360138185595, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0999454259872437, "data/tokens_consumed": 167354826752, "data/tokens_consumed_B": 167.354826752, "train/loss_slope": -1.4043537732755572e-06} {"step": 79800, "timestamp": 1778280798.634066, "geo/rankme_last": 440.71185302734375, "geo/layer_0/stable_rank_q_proj": 18.706104278564453, "geo/layer_0/stable_rank_k_proj": 15.684945106506348, "geo/layer_0/stable_rank_o_proj": 46.28572082519531, "geo/layer_0/stable_rank_gate_proj": 127.03528594970703, "geo/layer_0/stable_rank_down_proj": 56.832820892333984, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06700681895017624, "geo/layer_0/attn_entropy_mean": 6.126252174377441, "geo/layer_0/attn_entropy_std": 0.4278320074081421, "geo/layer_7/stable_rank_q_proj": 42.5382194519043, "geo/layer_7/stable_rank_k_proj": 40.044471740722656, "geo/layer_7/stable_rank_o_proj": 87.9030990600586, "geo/layer_7/stable_rank_gate_proj": 76.83442687988281, "geo/layer_7/stable_rank_down_proj": 140.34837341308594, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.44592559337615967, "geo/layer_7/attn_entropy_mean": 4.642618179321289, "geo/layer_7/attn_entropy_std": 0.7680584788322449, "geo/layer_14/stable_rank_q_proj": 49.1064453125, "geo/layer_14/stable_rank_k_proj": 41.47978973388672, "geo/layer_14/stable_rank_o_proj": 43.08476638793945, "geo/layer_14/stable_rank_gate_proj": 70.24114227294922, "geo/layer_14/stable_rank_down_proj": 125.0084228515625, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.41190075874328613, "geo/layer_14/attn_entropy_mean": 5.504050254821777, "geo/layer_14/attn_entropy_std": 0.4095248579978943, "geo/layer_21/stable_rank_q_proj": 39.39510726928711, "geo/layer_21/stable_rank_k_proj": 30.01966094970703, "geo/layer_21/stable_rank_o_proj": 67.86506652832031, "geo/layer_21/stable_rank_gate_proj": 62.88645553588867, "geo/layer_21/stable_rank_down_proj": 49.24383544921875, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14564616978168488, "geo/layer_21/attn_entropy_mean": 5.677947998046875, "geo/layer_21/attn_entropy_std": 0.28757089376449585, "geo/layer_27/stable_rank_q_proj": 43.78809356689453, "geo/layer_27/stable_rank_k_proj": 32.33366394042969, "geo/layer_27/stable_rank_o_proj": 115.30439758300781, "geo/layer_27/stable_rank_gate_proj": 76.22077178955078, "geo/layer_27/stable_rank_down_proj": 127.77788543701172, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08802878856658936, "geo/layer_27/attn_entropy_mean": 4.141726493835449, "geo/layer_27/attn_entropy_std": 0.795035183429718, "attnres/final_alpha/block_0": 0.23935869336128235, "attnres/block_norm/0": 1.7789249420166016, "attnres/final_alpha/block_1": 0.003911919891834259, "attnres/block_norm/1": 47435.75, "attnres/final_alpha/block_2": 0.009156552143394947, "attnres/block_norm/2": 29147.578125, "attnres/final_alpha/block_3": 0.01064656488597393, "attnres/block_norm/3": 64023.28125, "attnres/final_alpha/block_4": 0.012645220384001732, "attnres/block_norm/4": 15720.162109375, "attnres/final_alpha/block_5": 0.6171725988388062, "attnres/block_norm/5": 6810.41552734375, "attnres/final_alpha/block_6": 0.10710841417312622, "attnres/block_norm/6": 40930.62109375, "geo/tier1_time_s": 1.3585407733917236, "geo/step": 79800.0, "geo/rankme_slope": 0.00041922933235794316} {"step": 79810, "timestamp": 1778280809.0140896, "train/loss": 2.075038659572601, "train/z_loss": 0.0014155779499560595, "train/perplexity": 7.964854370948673, "train/grad_norm": 0.07275390625, "optim/muon_lr": 0.0015162086486816407, "optim/adamw_lr": 4.548625946044922e-05, "perf/tokens_per_sec": 1786534.9743137537, "perf/iters_per_sec": 0.8518862601822632, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1738656282424926, "data/tokens_consumed": 167375798272, "data/tokens_consumed_B": 167.375798272, "train/loss_slope": -1.367785606590272e-06} {"step": 79820, "timestamp": 1778280819.3949187, "train/loss": 2.0801098227500914, "train/z_loss": 0.0014138355967588724, "train/perplexity": 8.005348035358404, "train/grad_norm": 0.06787109375, "optim/muon_lr": 0.0014958059787750244, "optim/adamw_lr": 4.487417936325073e-05, "perf/tokens_per_sec": 2021214.6296541358, "perf/iters_per_sec": 0.9637902401228599, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0375701665878296, "data/tokens_consumed": 167396769792, "data/tokens_consumed_B": 167.396769792, "train/loss_slope": 2.0352935562110875e-06} {"step": 79830, "timestamp": 1778280829.7671995, "train/loss": 2.0424286007881163, "train/z_loss": 0.0014137662365101278, "train/perplexity": 7.709309316809796, "train/grad_norm": 0.07177734375, "optim/muon_lr": 0.0014755290746688843, "optim/adamw_lr": 4.426587224006653e-05, "perf/tokens_per_sec": 2022958.268787365, "perf/iters_per_sec": 0.9646216720520806, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0366758584976197, "data/tokens_consumed": 167417741312, "data/tokens_consumed_B": 167.417741312, "train/loss_slope": 1.5110589776209128e-06} {"step": 79840, "timestamp": 1778280840.1452854, "train/loss": 2.089126396179199, "train/z_loss": 0.0014157787314616143, "train/perplexity": 8.077855235765947, "train/grad_norm": 0.0859375, "optim/muon_lr": 0.0014553815126419067, "optim/adamw_lr": 4.36614453792572e-05, "perf/tokens_per_sec": 2021696.7000236965, "perf/iters_per_sec": 0.9640201091879351, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037322759628296, "data/tokens_consumed": 167438712832, "data/tokens_consumed_B": 167.438712832, "train/loss_slope": -6.791617467601796e-07} {"step": 79850, "timestamp": 1778280850.506253, "grad/layer_0/attn": 0.0024881043937057257, "grad/layer_0/mlp": 0.002806742675602436, "grad/layer_0/attn_mlp_ratio": 0.886473963817964, "grad/layer_4/attn": 0.0019227723823860288, "grad/layer_4/mlp": 0.002562097040936351, "grad/layer_4/attn_mlp_ratio": 0.7504681815785201, "grad/layer_8/attn": 0.0042482297867536545, "grad/layer_8/mlp": 0.0037015595007687807, "grad/layer_8/attn_mlp_ratio": 1.1476864470509498, "grad/layer_12/attn": 0.004087698180228472, "grad/layer_12/mlp": 0.005820045713335276, "grad/layer_12/attn_mlp_ratio": 0.7023481105359113, "grad/layer_16/attn": 0.003479801118373871, "grad/layer_16/mlp": 0.00406380370259285, "grad/layer_16/attn_mlp_ratio": 0.8562916143130817, "grad/layer_20/attn": 0.004362097010016441, "grad/layer_20/mlp": 0.004746101330965757, "grad/layer_20/attn_mlp_ratio": 0.919090557474416, "grad/layer_24/attn": 0.003644409356638789, "grad/layer_24/mlp": 0.006602736189961433, "grad/layer_24/attn_mlp_ratio": 0.5519544014168217, "grad/layer_27/attn": 0.0034113505389541388, "grad/layer_27/mlp": 0.005516452714800835, "grad/layer_27/attn_mlp_ratio": 0.618395670819728} {"step": 79850, "timestamp": 1778280850.5230393, "train/loss": 2.0651741147041323, "train/z_loss": 0.0014228319050744176, "train/perplexity": 7.8866709634973535, "train/grad_norm": 0.07470703125, "optim/muon_lr": 0.0014353597164154053, "optim/adamw_lr": 4.306079149246215e-05, "perf/tokens_per_sec": 2021894.6212374705, "perf/iters_per_sec": 0.9641144853770592, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0372212171554565, "data/tokens_consumed": 167459684352, "data/tokens_consumed_B": 167.459684352, "train/loss_slope": 5.2769377966214545e-08} {"step": 79860, "timestamp": 1778280860.901649, "train/loss": 2.0794513702392576, "train/z_loss": 0.001409327983856201, "train/perplexity": 8.000078628861777, "train/grad_norm": 0.08544921875, "optim/muon_lr": 0.001415466070175171, "optim/adamw_lr": 4.246398210525512e-05, "perf/tokens_per_sec": 2021717.8426108386, "perf/iters_per_sec": 0.964030190759105, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373119115829468, "data/tokens_consumed": 167480655872, "data/tokens_consumed_B": 167.480655872, "train/loss_slope": -4.6703175599962385e-06} {"step": 79870, "timestamp": 1778280871.279267, "train/loss": 2.1036110401153563, "train/z_loss": 0.0014053731341846288, "train/perplexity": 8.195711585693127, "train/grad_norm": 0.083984375, "optim/muon_lr": 0.0013957023620605469, "optim/adamw_lr": 4.18710708618164e-05, "perf/tokens_per_sec": 2021696.1424229878, "perf/iters_per_sec": 0.9640198433031978, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373230457305909, "data/tokens_consumed": 167501627392, "data/tokens_consumed_B": 167.501627392, "train/loss_slope": 4.7668696093618586e-07} {"step": 79875, "timestamp": 1778280877.062172, "eos/sharpness": 24.85566139221191, "eos/L0_probe": 1.9065279960632324, "eos/L_plus": 2.028132915496826, "eos/L_minus": 2.033479690551758, "eos/grad_norm": 0.08352230489253998, "eos/embed_grad_frac": 0.2551133334636688, "eos/time_s": 0.6072509288787842} {"step": 79875, "timestamp": 1778280878.4438763, "geo/rankme_last": 440.6379089355469, "geo/layer_0/stable_rank_q_proj": 18.706012725830078, "geo/layer_0/stable_rank_k_proj": 15.685519218444824, "geo/layer_0/stable_rank_o_proj": 46.283164978027344, "geo/layer_0/stable_rank_gate_proj": 127.02913665771484, "geo/layer_0/stable_rank_down_proj": 56.833038330078125, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06254199147224426, "geo/layer_0/attn_entropy_mean": 6.126907825469971, "geo/layer_0/attn_entropy_std": 0.42758679389953613, "geo/layer_7/stable_rank_q_proj": 42.53691482543945, "geo/layer_7/stable_rank_k_proj": 40.04401397705078, "geo/layer_7/stable_rank_o_proj": 87.90087890625, "geo/layer_7/stable_rank_gate_proj": 76.83285522460938, "geo/layer_7/stable_rank_down_proj": 140.35523986816406, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4508858621120453, "geo/layer_7/attn_entropy_mean": 4.646371841430664, "geo/layer_7/attn_entropy_std": 0.7674244046211243, "geo/layer_14/stable_rank_q_proj": 49.106300354003906, "geo/layer_14/stable_rank_k_proj": 41.47927474975586, "geo/layer_14/stable_rank_o_proj": 43.08301544189453, "geo/layer_14/stable_rank_gate_proj": 70.24433135986328, "geo/layer_14/stable_rank_down_proj": 125.0064926147461, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4097498655319214, "geo/layer_14/attn_entropy_mean": 5.501164436340332, "geo/layer_14/attn_entropy_std": 0.40931352972984314, "geo/layer_21/stable_rank_q_proj": 39.39411926269531, "geo/layer_21/stable_rank_k_proj": 30.0207576751709, "geo/layer_21/stable_rank_o_proj": 67.865234375, "geo/layer_21/stable_rank_gate_proj": 62.88478469848633, "geo/layer_21/stable_rank_down_proj": 49.24483871459961, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14944009482860565, "geo/layer_21/attn_entropy_mean": 5.680434226989746, "geo/layer_21/attn_entropy_std": 0.2866727113723755, "geo/layer_27/stable_rank_q_proj": 43.79204177856445, "geo/layer_27/stable_rank_k_proj": 32.335758209228516, "geo/layer_27/stable_rank_o_proj": 115.30162811279297, "geo/layer_27/stable_rank_gate_proj": 76.21928405761719, "geo/layer_27/stable_rank_down_proj": 127.76588439941406, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08855346590280533, "geo/layer_27/attn_entropy_mean": 4.1427130699157715, "geo/layer_27/attn_entropy_std": 0.7949234247207642, "attnres/final_alpha/block_0": 0.23958994448184967, "attnres/block_norm/0": 1.7789127826690674, "attnres/final_alpha/block_1": 0.003943155519664288, "attnres/block_norm/1": 47454.9375, "attnres/final_alpha/block_2": 0.00913564208894968, "attnres/block_norm/2": 29181.26953125, "attnres/final_alpha/block_3": 0.010626628063619137, "attnres/block_norm/3": 63850.96875, "attnres/final_alpha/block_4": 0.012647654861211777, "attnres/block_norm/4": 15719.6640625, "attnres/final_alpha/block_5": 0.616848349571228, "attnres/block_norm/5": 6811.1953125, "attnres/final_alpha/block_6": 0.10720865428447723, "attnres/block_norm/6": 40932.7421875, "geo/tier1_time_s": 1.3618488311767578, "geo/step": 79875.0, "geo/rankme_slope": 0.00040318856058048217} {"step": 79880, "timestamp": 1778280883.6357775, "train/loss": 2.0660364627838135, "train/z_loss": 0.0014228051761165262, "train/perplexity": 7.893474952339513, "train/grad_norm": 0.0693359375, "optim/muon_lr": 0.0013760656118392944, "optim/adamw_lr": 4.128196835517883e-05, "perf/tokens_per_sec": 1697928.3817691752, "perf/iters_per_sec": 0.8096353443952442, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2351239442825317, "data/tokens_consumed": 167522598912, "data/tokens_consumed_B": 167.522598912, "train/loss_slope": -3.1428779312486915e-06} {"step": 79890, "timestamp": 1778280894.0104911, "train/loss": 2.049998414516449, "train/z_loss": 0.0014166289125569164, "train/perplexity": 7.767888790437106, "train/grad_norm": 0.06494140625, "optim/muon_lr": 0.0013565593957901001, "optim/adamw_lr": 4.0696781873703e-05, "perf/tokens_per_sec": 2022363.1132520793, "perf/iters_per_sec": 0.9643378797779462, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036980938911438, "data/tokens_consumed": 167543570432, "data/tokens_consumed_B": 167.543570432, "train/loss_slope": -4.667668624429327e-06} {"step": 79900, "timestamp": 1778280904.3796513, "grad/layer_0/attn": 0.0025914122816175222, "grad/layer_0/mlp": 0.003086069831624627, "grad/layer_0/attn_mlp_ratio": 0.8397127540960493, "grad/layer_4/attn": 0.0024551486130803823, "grad/layer_4/mlp": 0.0025636213831603527, "grad/layer_4/attn_mlp_ratio": 0.9576876419578727, "grad/layer_8/attn": 0.003231292124837637, "grad/layer_8/mlp": 0.0035673058591783047, "grad/layer_8/attn_mlp_ratio": 0.9058073969023769, "grad/layer_12/attn": 0.0036882220301777124, "grad/layer_12/mlp": 0.006475278176367283, "grad/layer_12/attn_mlp_ratio": 0.5695851008656349, "grad/layer_16/attn": 0.0029670216608792543, "grad/layer_16/mlp": 0.004234603140503168, "grad/layer_16/attn_mlp_ratio": 0.7006610755171255, "grad/layer_20/attn": 0.003118800465017557, "grad/layer_20/mlp": 0.0048061697743833065, "grad/layer_20/attn_mlp_ratio": 0.6489159864366508, "grad/layer_24/attn": 0.007145728450268507, "grad/layer_24/mlp": 0.006506290286779404, "grad/layer_24/attn_mlp_ratio": 1.0982799760656892, "grad/layer_27/attn": 0.00505879707634449, "grad/layer_27/mlp": 0.005663145333528519, "grad/layer_27/attn_mlp_ratio": 0.8932839772035522} {"step": 79900, "timestamp": 1778280904.3963184, "train/loss": 2.087319016456604, "train/z_loss": 0.0014135228469967843, "train/perplexity": 8.06326866971335, "train/grad_norm": 0.07958984375, "optim/muon_lr": 0.0013371807336807252, "optim/adamw_lr": 4.011542201042175e-05, "perf/tokens_per_sec": 2020195.9170168517, "perf/iters_per_sec": 0.9633044800838717, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.038093376159668, "data/tokens_consumed": 167564541952, "data/tokens_consumed_B": 167.564541952, "train/loss_slope": -7.061609998680736e-06} {"step": 79910, "timestamp": 1778280914.7758448, "train/loss": 2.0955724477767945, "train/z_loss": 0.0014121547807008027, "train/perplexity": 8.130093692424687, "train/grad_norm": 0.06884765625, "optim/muon_lr": 0.0013179320096969604, "optim/adamw_lr": 3.953796029090881e-05, "perf/tokens_per_sec": 2021774.6277479012, "perf/iters_per_sec": 0.9640572680224901, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0372827768325805, "data/tokens_consumed": 167585513472, "data/tokens_consumed_B": 167.585513472, "train/loss_slope": -7.892000550019194e-06} {"step": 79920, "timestamp": 1778280925.1519482, "train/loss": 2.041937756538391, "train/z_loss": 0.0014132330776192247, "train/perplexity": 7.705526175204893, "train/grad_norm": 0.06982421875, "optim/muon_lr": 0.0012988138198852539, "optim/adamw_lr": 3.8964414596557615e-05, "perf/tokens_per_sec": 2022130.3276818807, "perf/iters_per_sec": 0.9642268789681819, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0371003150939941, "data/tokens_consumed": 167606484992, "data/tokens_consumed_B": 167.606484992, "train/loss_slope": -1.0536401099426265e-05} {"step": 79930, "timestamp": 1778280935.5328772, "train/loss": 2.1257815599441527, "train/z_loss": 0.0014060770743526518, "train/perplexity": 8.379443967320203, "train/grad_norm": 0.0693359375, "optim/muon_lr": 0.0012798243761062622, "optim/adamw_lr": 3.839473128318786e-05, "perf/tokens_per_sec": 2021214.8154324782, "perf/iters_per_sec": 0.9637903287088767, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037570071220398, "data/tokens_consumed": 167627456512, "data/tokens_consumed_B": 167.627456512, "train/loss_slope": -8.758545393991066e-06} {"step": 79940, "timestamp": 1778280945.910864, "train/loss": 2.059401977062225, "train/z_loss": 0.001426566718146205, "train/perplexity": 7.841279143105877, "train/grad_norm": 0.06884765625, "optim/muon_lr": 0.0012609666585922242, "optim/adamw_lr": 3.782899975776672e-05, "perf/tokens_per_sec": 2022275.3300142097, "perf/iters_per_sec": 0.9642960214682625, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0370259523391723, "data/tokens_consumed": 167648428032, "data/tokens_consumed_B": 167.648428032, "train/loss_slope": -1.103453292097025e-05} {"step": 79950, "timestamp": 1778280956.2732718, "grad/layer_0/attn": 0.002593838609755039, "grad/layer_0/mlp": 0.0028356413822621107, "grad/layer_0/attn_mlp_ratio": 0.9147272763430675, "grad/layer_4/attn": 0.0024864315055310726, "grad/layer_4/mlp": 0.002547405892983079, "grad/layer_4/attn_mlp_ratio": 0.9760640873029416, "grad/layer_8/attn": 0.0027706860564649105, "grad/layer_8/mlp": 0.0037090263795107603, "grad/layer_8/attn_mlp_ratio": 0.7470116678245927, "grad/layer_12/attn": 0.003657208289951086, "grad/layer_12/mlp": 0.005987019278109074, "grad/layer_12/attn_mlp_ratio": 0.6108562640240143, "grad/layer_16/attn": 0.0028730113990604877, "grad/layer_16/mlp": 0.004138919059187174, "grad/layer_16/attn_mlp_ratio": 0.6941453284206468, "grad/layer_20/attn": 0.0026086997240781784, "grad/layer_20/mlp": 0.004588596988469362, "grad/layer_20/attn_mlp_ratio": 0.5685179312503931, "grad/layer_24/attn": 0.0035080353263765574, "grad/layer_24/mlp": 0.006327748764306307, "grad/layer_24/attn_mlp_ratio": 0.5543891519091021, "grad/layer_27/attn": 0.004712855443358421, "grad/layer_27/mlp": 0.005573880858719349, "grad/layer_27/attn_mlp_ratio": 0.8455249543831743} {"step": 79950, "timestamp": 1778280956.879487, "eos/sharpness": 5.680871009826659, "eos/L0_probe": 1.9062589406967163, "eos/L_plus": 1.9348453283309937, "eos/L_minus": 1.9344812631607056, "eos/grad_norm": 0.06830708682537079, "eos/embed_grad_frac": 0.37221115827560425, "eos/time_s": 0.603461742401123} {"step": 79950, "timestamp": 1778280956.8990304, "train/loss": 2.0825042605400084, "train/z_loss": 0.0014119650004431605, "train/perplexity": 8.024539310203405, "train/grad_norm": 0.068359375, "optim/muon_lr": 0.0012422388792037965, "optim/adamw_lr": 3.726716637611389e-05, "perf/tokens_per_sec": 1909422.3182034704, "perf/iters_per_sec": 0.9104835120217659, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.098317527770996, "data/tokens_consumed": 167669399552, "data/tokens_consumed_B": 167.669399552, "train/loss_slope": -8.606947044191819e-06} {"step": 79950, "timestamp": 1778280958.2634487, "geo/rankme_last": 440.80499267578125, "geo/layer_0/stable_rank_q_proj": 18.705814361572266, "geo/layer_0/stable_rank_k_proj": 15.684626579284668, "geo/layer_0/stable_rank_o_proj": 46.28255844116211, "geo/layer_0/stable_rank_gate_proj": 127.03482055664062, "geo/layer_0/stable_rank_down_proj": 56.834564208984375, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06503179669380188, "geo/layer_0/attn_entropy_mean": 6.1264801025390625, "geo/layer_0/attn_entropy_std": 0.42733922600746155, "geo/layer_7/stable_rank_q_proj": 42.53499984741211, "geo/layer_7/stable_rank_k_proj": 40.04457473754883, "geo/layer_7/stable_rank_o_proj": 87.90355682373047, "geo/layer_7/stable_rank_gate_proj": 76.83290100097656, "geo/layer_7/stable_rank_down_proj": 140.35214233398438, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.45002874732017517, "geo/layer_7/attn_entropy_mean": 4.646304607391357, "geo/layer_7/attn_entropy_std": 0.766677975654602, "geo/layer_14/stable_rank_q_proj": 49.103328704833984, "geo/layer_14/stable_rank_k_proj": 41.48020553588867, "geo/layer_14/stable_rank_o_proj": 43.08320617675781, "geo/layer_14/stable_rank_gate_proj": 70.2430191040039, "geo/layer_14/stable_rank_down_proj": 125.0029525756836, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.41310954093933105, "geo/layer_14/attn_entropy_mean": 5.500885009765625, "geo/layer_14/attn_entropy_std": 0.4084203541278839, "geo/layer_21/stable_rank_q_proj": 39.39242935180664, "geo/layer_21/stable_rank_k_proj": 30.023229598999023, "geo/layer_21/stable_rank_o_proj": 67.86521911621094, "geo/layer_21/stable_rank_gate_proj": 62.88399124145508, "geo/layer_21/stable_rank_down_proj": 49.24156188964844, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15153096616268158, "geo/layer_21/attn_entropy_mean": 5.679457664489746, "geo/layer_21/attn_entropy_std": 0.2863662540912628, "geo/layer_27/stable_rank_q_proj": 43.78767395019531, "geo/layer_27/stable_rank_k_proj": 32.33523178100586, "geo/layer_27/stable_rank_o_proj": 115.2927017211914, "geo/layer_27/stable_rank_gate_proj": 76.2154312133789, "geo/layer_27/stable_rank_down_proj": 127.75431060791016, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07919120788574219, "geo/layer_27/attn_entropy_mean": 4.141358852386475, "geo/layer_27/attn_entropy_std": 0.7953007221221924, "attnres/final_alpha/block_0": 0.23931638896465302, "attnres/block_norm/0": 1.778944969177246, "attnres/final_alpha/block_1": 0.003913989756256342, "attnres/block_norm/1": 47455.5546875, "attnres/final_alpha/block_2": 0.009113389998674393, "attnres/block_norm/2": 29170.857421875, "attnres/final_alpha/block_3": 0.010606350377202034, "attnres/block_norm/3": 63942.74609375, "attnres/final_alpha/block_4": 0.012606402859091759, "attnres/block_norm/4": 15724.296875, "attnres/final_alpha/block_5": 0.617653489112854, "attnres/block_norm/5": 6811.2919921875, "attnres/final_alpha/block_6": 0.10678994655609131, "attnres/block_norm/6": 40874.22265625, "geo/tier1_time_s": 1.3602700233459473, "geo/step": 79950.0, "geo/rankme_slope": 0.0003759597002863645} {"step": 79960, "timestamp": 1778280968.6428704, "train/loss": 2.0843269348144533, "train/z_loss": 0.001404063741210848, "train/perplexity": 8.03917876899799, "train/grad_norm": 0.07373046875, "optim/muon_lr": 0.0012236416339874267, "optim/adamw_lr": 3.67092490196228e-05, "perf/tokens_per_sec": 1786349.901101518, "perf/iters_per_sec": 0.8517980103976812, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1739872455596925, "data/tokens_consumed": 167690371072, "data/tokens_consumed_B": 167.690371072, "train/loss_slope": -6.7935442445231185e-06} {"step": 79970, "timestamp": 1778280979.0179043, "train/loss": 2.0295801997184753, "train/z_loss": 0.001419945084489882, "train/perplexity": 7.610890634013016, "train/grad_norm": 0.06787109375, "optim/muon_lr": 0.0012051761150360107, "optim/adamw_lr": 3.615528345108032e-05, "perf/tokens_per_sec": 2022537.725978836, "perf/iters_per_sec": 0.9644211416143589, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0368914127349853, "data/tokens_consumed": 167711342592, "data/tokens_consumed_B": 167.711342592, "train/loss_slope": -9.3050312430803e-06} {"step": 79980, "timestamp": 1778280989.390124, "train/loss": 2.0620750188827515, "train/z_loss": 0.0014145146240480244, "train/perplexity": 7.862267248726916, "train/grad_norm": 0.076171875, "optim/muon_lr": 0.0011868417263031007, "optim/adamw_lr": 3.5605251789093016e-05, "perf/tokens_per_sec": 2022767.1631756327, "perf/iters_per_sec": 0.9645305457952655, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0367738008499146, "data/tokens_consumed": 167732314112, "data/tokens_consumed_B": 167.732314112, "train/loss_slope": -1.013507000362063e-05} {"step": 79990, "timestamp": 1778280999.76127, "train/loss": 2.044488620758057, "train/z_loss": 0.0014136144891381264, "train/perplexity": 7.725207017124871, "train/grad_norm": 1.0078125, "optim/muon_lr": 0.0011686396598815918, "optim/adamw_lr": 3.505918979644775e-05, "perf/tokens_per_sec": 2023024.522199021, "perf/iters_per_sec": 0.9646532641406159, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0366419076919555, "data/tokens_consumed": 167753285632, "data/tokens_consumed_B": 167.753285632, "train/loss_slope": -1.1023906505469194e-05} {"step": 80000, "timestamp": 1778281010.1100094, "grad/layer_0/attn": 0.002899721497669816, "grad/layer_0/mlp": 0.00318136322312057, "grad/layer_0/attn_mlp_ratio": 0.9114713420488879, "grad/layer_4/attn": 0.003355389228090644, "grad/layer_4/mlp": 0.002496343106031418, "grad/layer_4/attn_mlp_ratio": 1.3441217617768597, "grad/layer_8/attn": 0.004537719301879406, "grad/layer_8/mlp": 0.003555190982297063, "grad/layer_8/attn_mlp_ratio": 1.2763643913472906, "grad/layer_12/attn": 0.004045481327921152, "grad/layer_12/mlp": 0.006828012876212597, "grad/layer_12/attn_mlp_ratio": 0.5924829583679444, "grad/layer_16/attn": 0.004864112474024296, "grad/layer_16/mlp": 0.004434745758771896, "grad/layer_16/attn_mlp_ratio": 1.0968187645754526, "grad/layer_20/attn": 0.003918222151696682, "grad/layer_20/mlp": 0.004844629671424627, "grad/layer_20/attn_mlp_ratio": 0.8087763846904812, "grad/layer_24/attn": 0.003763971384614706, "grad/layer_24/mlp": 0.006499415263533592, "grad/layer_24/attn_mlp_ratio": 0.57912460953539, "grad/layer_27/attn": 0.0063142976723611355, "grad/layer_27/mlp": 0.0055569931864738464, "grad/layer_27/attn_mlp_ratio": 1.1362795214690338} {"step": 80000, "timestamp": 1778281010.1255808, "train/loss": 2.0575971722602846, "train/z_loss": 0.0014079423039220274, "train/perplexity": 7.82713992795474, "train/grad_norm": 0.0771484375, "optim/muon_lr": 0.0011505687236785889, "optim/adamw_lr": 3.4517061710357664e-05, "perf/tokens_per_sec": 2024429.8462172262, "perf/iters_per_sec": 0.9653233748518115, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0359222888946533, "data/tokens_consumed": 167774257152, "data/tokens_consumed_B": 167.774257152, "train/loss_slope": -9.71435217967527e-06} {"step": 80000, "timestamp": 1778281016.939293, "geo/ww_alpha_mean": 7.707260057288724, "geo/ww_alpha_std": 4.643957596003281, "geo/ww_alpha_min": 1.3394185307581177, "geo/ww_alpha_max": 31.42218637537657, "geo/ww_alpha_healthy_frac": 0.18274111675126903, "geo/ww_alpha_by_type/q_proj": 3.9226245471480836, "geo/ww_alpha_by_type/k_proj": 4.443151777630288, "geo/ww_alpha_by_type/v_proj": 8.679574977979842, "geo/ww_alpha_by_type/o_proj": 9.441836282692085, "geo/ww_alpha_by_type/gate_proj": 8.041355668087766, "geo/ww_alpha_by_type/up_proj": 11.488062910306112, "geo/ww_alpha_by_type/down_proj": 8.021930299863035, "geo/twonn_id/layer_0": 0.6874470710754395, "geo/twonn_id/layer_7": 3.270207643508911, "geo/twonn_id/layer_14": 5.024848461151123, "geo/twonn_id/layer_21": 6.4369282722473145, "geo/twonn_id/layer_27": 6.070405006408691, "geo/tier2_time_s": 6.8078835010528564} {"step": 80000, "timestamp": 1778281017.5415034, "eoc/jacobian_sigma/layer_0/attn": 1256.2509765625, "eoc/jacobian_sigma/layer_0/mlp": 9620.42578125, "eoc/jacobian_sigma/layer_0": 9620.42578125, "eoc/jacobian_sigma/layer_7/attn": 1.1495195627212524, "eoc/jacobian_sigma/layer_7/mlp": 1.9187068939208984, "eoc/jacobian_sigma/layer_7": 1.9187068939208984, "eoc/jacobian_sigma/layer_14/attn": 1.373885989189148, "eoc/jacobian_sigma/layer_14/mlp": 6.764155387878418, "eoc/jacobian_sigma/layer_14": 6.764155387878418, "eoc/jacobian_sigma/layer_21/attn": 1.1041204929351807, "eoc/jacobian_sigma/layer_21/mlp": 4.03214693069458, "eoc/jacobian_sigma/layer_21": 4.03214693069458, "eoc/jacobian_sigma/layer_27/attn": 3.199122667312622, "eoc/jacobian_sigma/layer_27/mlp": 29.166391372680664, "eoc/jacobian_sigma/layer_27": 29.166391372680664, "eoc/layer0_sigma": 9620.42578125, "eoc/sigma_max": 29.166391372680664, "eoc/sigma_min": 1.9187068939208984, "eoc/sigma_mean": 10.47035014629364, "eoc/time_s": 0.5966758728027344} {"step": 80010, "timestamp": 1778281027.9275916, "train/loss": 2.0771001100540163, "train/z_loss": 0.0014157877769321204, "train/perplexity": 7.981290459096443, "train/grad_norm": 0.06689453125, "optim/muon_lr": 0.001132631301879883, "optim/adamw_lr": 3.3978939056396485e-05, "perf/tokens_per_sec": 1178355.5287710691, "perf/iters_per_sec": 0.5618837016921373, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.7797277212142943, "data/tokens_consumed": 167795228672, "data/tokens_consumed_B": 167.795228672, "train/loss_slope": -9.672614693319808e-06} {"step": 80020, "timestamp": 1778281038.289572, "train/loss": 2.080210304260254, "train/z_loss": 0.001418086455669254, "train/perplexity": 8.006152465232862, "train/grad_norm": 0.07177734375, "optim/muon_lr": 0.0011148250102996826, "optim/adamw_lr": 3.3444750308990476e-05, "perf/tokens_per_sec": 2024968.5984027411, "perf/iters_per_sec": 0.9655802719129282, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035646677017212, "data/tokens_consumed": 167816200192, "data/tokens_consumed_B": 167.816200192, "train/loss_slope": -1.0422773451337322e-05} {"step": 80025, "timestamp": 1778281044.0725734, "eos/sharpness": 6.631648540496825, "eos/L0_probe": 1.9059827327728271, "eos/L_plus": 1.9379982948303223, "eos/L_minus": 1.9402836561203003, "eos/grad_norm": 0.0722896158695221, "eos/embed_grad_frac": 0.3254464566707611, "eos/time_s": 0.6139276027679443} {"step": 80025, "timestamp": 1778281045.4485343, "geo/rankme_last": 440.778076171875, "geo/layer_0/stable_rank_q_proj": 18.7050724029541, "geo/layer_0/stable_rank_k_proj": 15.684621810913086, "geo/layer_0/stable_rank_o_proj": 46.284637451171875, "geo/layer_0/stable_rank_gate_proj": 127.03194427490234, "geo/layer_0/stable_rank_down_proj": 56.83373260498047, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0642370954155922, "geo/layer_0/attn_entropy_mean": 6.1267290115356445, "geo/layer_0/attn_entropy_std": 0.42729589343070984, "geo/layer_7/stable_rank_q_proj": 42.53288269042969, "geo/layer_7/stable_rank_k_proj": 40.045082092285156, "geo/layer_7/stable_rank_o_proj": 87.90314483642578, "geo/layer_7/stable_rank_gate_proj": 76.83142852783203, "geo/layer_7/stable_rank_down_proj": 140.344970703125, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.448757529258728, "geo/layer_7/attn_entropy_mean": 4.646051406860352, "geo/layer_7/attn_entropy_std": 0.7669971585273743, "geo/layer_14/stable_rank_q_proj": 49.10467529296875, "geo/layer_14/stable_rank_k_proj": 41.48416519165039, "geo/layer_14/stable_rank_o_proj": 43.084014892578125, "geo/layer_14/stable_rank_gate_proj": 70.24249267578125, "geo/layer_14/stable_rank_down_proj": 124.99952697753906, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.410747766494751, "geo/layer_14/attn_entropy_mean": 5.501307964324951, "geo/layer_14/attn_entropy_std": 0.40921369194984436, "geo/layer_21/stable_rank_q_proj": 39.39208984375, "geo/layer_21/stable_rank_k_proj": 30.02393913269043, "geo/layer_21/stable_rank_o_proj": 67.86260223388672, "geo/layer_21/stable_rank_gate_proj": 62.88208770751953, "geo/layer_21/stable_rank_down_proj": 49.2402458190918, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1463451087474823, "geo/layer_21/attn_entropy_mean": 5.679904937744141, "geo/layer_21/attn_entropy_std": 0.2870762348175049, "geo/layer_27/stable_rank_q_proj": 43.78716278076172, "geo/layer_27/stable_rank_k_proj": 32.33679962158203, "geo/layer_27/stable_rank_o_proj": 115.29436492919922, "geo/layer_27/stable_rank_gate_proj": 76.22019958496094, "geo/layer_27/stable_rank_down_proj": 127.77790832519531, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08014243096113205, "geo/layer_27/attn_entropy_mean": 4.142536640167236, "geo/layer_27/attn_entropy_std": 0.7961872220039368, "attnres/final_alpha/block_0": 0.23932430148124695, "attnres/block_norm/0": 1.778941035270691, "attnres/final_alpha/block_1": 0.003928723745048046, "attnres/block_norm/1": 47431.5546875, "attnres/final_alpha/block_2": 0.009101142175495625, "attnres/block_norm/2": 29134.84375, "attnres/final_alpha/block_3": 0.010581748560070992, "attnres/block_norm/3": 63859.765625, "attnres/final_alpha/block_4": 0.0126033965498209, "attnres/block_norm/4": 15720.35546875, "attnres/final_alpha/block_5": 0.617267370223999, "attnres/block_norm/5": 6805.8271484375, "attnres/final_alpha/block_6": 0.10719330608844757, "attnres/block_norm/6": 40893.5546875, "geo/tier1_time_s": 1.3566694259643555, "geo/step": 80025.0, "geo/rankme_slope": 0.00035080383715986396} {"step": 80030, "timestamp": 1778281050.6309342, "train/loss": 2.0656601667404173, "train/z_loss": 0.001419118675403297, "train/perplexity": 7.890505227729155, "train/grad_norm": 0.08056640625, "optim/muon_lr": 0.0010971522331237793, "optim/adamw_lr": 3.291456699371338e-05, "perf/tokens_per_sec": 1699881.0931078377, "perf/iters_per_sec": 0.8105664697207631, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2337051153182983, "data/tokens_consumed": 167837171712, "data/tokens_consumed_B": 167.837171712, "train/loss_slope": -1.2960836467462548e-05} {"step": 80040, "timestamp": 1778281061.5168033, "train/loss": 2.058487057685852, "train/z_loss": 0.0014144794200547039, "train/perplexity": 7.834108285760678, "train/grad_norm": 0.06787109375, "optim/muon_lr": 0.001079612970352173, "optim/adamw_lr": 3.238838911056518e-05, "perf/tokens_per_sec": 1927784.9282362473, "perf/iters_per_sec": 0.9192394868069874, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0878557920455934, "data/tokens_consumed": 167858143232, "data/tokens_consumed_B": 167.858143232, "train/loss_slope": -1.754922594281602e-05} {"step": 80050, "timestamp": 1778281071.8897288, "grad/layer_0/attn": 0.0026022556703537703, "grad/layer_0/mlp": 0.0028718772809952497, "grad/layer_0/attn_mlp_ratio": 0.906116565969804, "grad/layer_4/attn": 0.0031103447545319796, "grad/layer_4/mlp": 0.0024398183450102806, "grad/layer_4/attn_mlp_ratio": 1.2748263137747011, "grad/layer_8/attn": 0.0028826911002397537, "grad/layer_8/mlp": 0.0033343483228236437, "grad/layer_8/attn_mlp_ratio": 0.8645440531972328, "grad/layer_12/attn": 0.005643799901008606, "grad/layer_12/mlp": 0.006482988595962524, "grad/layer_12/attn_mlp_ratio": 0.8705552586453609, "grad/layer_16/attn": 0.002988466527312994, "grad/layer_16/mlp": 0.0041923425160348415, "grad/layer_16/attn_mlp_ratio": 0.7128392884404842, "grad/layer_20/attn": 0.0026758932508528233, "grad/layer_20/mlp": 0.004608897026628256, "grad/layer_20/attn_mlp_ratio": 0.5805929655909752, "grad/layer_24/attn": 0.006022334098815918, "grad/layer_24/mlp": 0.006338810082525015, "grad/layer_24/attn_mlp_ratio": 0.9500732669702643, "grad/layer_27/attn": 0.004548101685941219, "grad/layer_27/mlp": 0.005458769388496876, "grad/layer_27/attn_mlp_ratio": 0.8331734277340923} {"step": 80050, "timestamp": 1778281071.9054148, "train/loss": 2.0834163546562197, "train/z_loss": 0.0014008627040311694, "train/perplexity": 8.031861784178622, "train/grad_norm": 0.06982421875, "optim/muon_lr": 0.0010622060298919678, "optim/adamw_lr": 3.186618089675903e-05, "perf/tokens_per_sec": 2019962.9342576992, "perf/iters_per_sec": 0.9631933852470871, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0382131099700929, "data/tokens_consumed": 167879114752, "data/tokens_consumed_B": 167.879114752, "train/loss_slope": -1.2508326244897902e-05} {"step": 80060, "timestamp": 1778281082.2859087, "train/loss": 2.1191850662231446, "train/z_loss": 0.0014201493817381562, "train/perplexity": 8.324350928023287, "train/grad_norm": 0.078125, "optim/muon_lr": 0.0010449337959289552, "optim/adamw_lr": 3.134801387786865e-05, "perf/tokens_per_sec": 2021421.7937180179, "perf/iters_per_sec": 0.9638890236463632, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0374638319015503, "data/tokens_consumed": 167900086272, "data/tokens_consumed_B": 167.900086272, "train/loss_slope": -1.3923291476181299e-05} {"step": 80070, "timestamp": 1778281092.6692553, "train/loss": 2.068249249458313, "train/z_loss": 0.0014094848535023629, "train/perplexity": 7.910960867694919, "train/grad_norm": 0.06787109375, "optim/muon_lr": 0.0010277944803237914, "optim/adamw_lr": 3.083383440971374e-05, "perf/tokens_per_sec": 2020615.2543462755, "perf/iters_per_sec": 0.9635044357043626, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0378779411315917, "data/tokens_consumed": 167921057792, "data/tokens_consumed_B": 167.921057792, "train/loss_slope": -1.2776026252222378e-05} {"step": 80080, "timestamp": 1778281103.0455651, "train/loss": 2.0818788766860963, "train/z_loss": 0.0014212895766831934, "train/perplexity": 8.019522461775232, "train/grad_norm": 0.06787109375, "optim/muon_lr": 0.0010107886791229249, "optim/adamw_lr": 3.032366037368774e-05, "perf/tokens_per_sec": 2022113.6391037516, "perf/iters_per_sec": 0.9642189212340124, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037108874320984, "data/tokens_consumed": 167942029312, "data/tokens_consumed_B": 167.942029312, "train/loss_slope": -1.3189246437766309e-05} {"step": 80090, "timestamp": 1778281113.4207425, "train/loss": 2.0295330762863157, "train/z_loss": 0.0014232843765057623, "train/perplexity": 7.610531991174858, "train/grad_norm": 0.068359375, "optim/muon_lr": 0.000993918776512146, "optim/adamw_lr": 2.9817563295364376e-05, "perf/tokens_per_sec": 2022395.8479355583, "perf/iters_per_sec": 0.9643534888913909, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0369641542434693, "data/tokens_consumed": 167963000832, "data/tokens_consumed_B": 167.963000832, "train/loss_slope": -1.56970698447904e-05} {"step": 80100, "timestamp": 1778281123.7903686, "grad/layer_0/attn": 0.0023008491843938828, "grad/layer_0/mlp": 0.0025508597027510405, "grad/layer_0/attn_mlp_ratio": 0.9019896671359487, "grad/layer_4/attn": 0.00253508728928864, "grad/layer_4/mlp": 0.00238321372307837, "grad/layer_4/attn_mlp_ratio": 1.0637263281790204, "grad/layer_8/attn": 0.0032729997765272856, "grad/layer_8/mlp": 0.0032928648870438337, "grad/layer_8/attn_mlp_ratio": 0.9939671955592743, "grad/layer_12/attn": 0.0036800464149564505, "grad/layer_12/mlp": 0.006217729300260544, "grad/layer_12/attn_mlp_ratio": 0.5918633922541313, "grad/layer_16/attn": 0.002945764223113656, "grad/layer_16/mlp": 0.004196755588054657, "grad/layer_16/attn_mlp_ratio": 0.7019146316994976, "grad/layer_20/attn": 0.002456412650644779, "grad/layer_20/mlp": 0.004530456382781267, "grad/layer_20/attn_mlp_ratio": 0.5421998114275621, "grad/layer_24/attn": 0.006542084738612175, "grad/layer_24/mlp": 0.00586360041052103, "grad/layer_24/attn_mlp_ratio": 1.1157111960260158, "grad/layer_27/attn": 0.003940884955227375, "grad/layer_27/mlp": 0.005480882246047258, "grad/layer_27/attn_mlp_ratio": 0.7190238188691445} {"step": 80100, "timestamp": 1778281124.4063432, "eos/sharpness": 7.843613624572752, "eos/L0_probe": 1.905593752861023, "eos/L_plus": 1.945786714553833, "eos/L_minus": 1.9438369274139404, "eos/grad_norm": 0.06784676760435104, "eos/embed_grad_frac": 0.35710230469703674, "eos/time_s": 0.6129140853881836} {"step": 80100, "timestamp": 1778281124.426156, "train/loss": 2.120017910003662, "train/z_loss": 0.001412338949739933, "train/perplexity": 8.331286699726796, "train/grad_norm": 0.06787109375, "optim/muon_lr": 0.0009771823883056641, "optim/adamw_lr": 2.9315471649169918e-05, "perf/tokens_per_sec": 1906846.0952163425, "perf/iters_per_sec": 0.9092550731736863, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0998013973236085, "data/tokens_consumed": 167983972352, "data/tokens_consumed_B": 167.983972352, "train/loss_slope": -1.3194530925126943e-05} {"step": 80100, "timestamp": 1778281125.791561, "geo/rankme_last": 440.6986389160156, "geo/layer_0/stable_rank_q_proj": 18.704641342163086, "geo/layer_0/stable_rank_k_proj": 15.68442440032959, "geo/layer_0/stable_rank_o_proj": 46.285369873046875, "geo/layer_0/stable_rank_gate_proj": 127.02709197998047, "geo/layer_0/stable_rank_down_proj": 56.83549880981445, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06256602704524994, "geo/layer_0/attn_entropy_mean": 6.126461029052734, "geo/layer_0/attn_entropy_std": 0.4271365702152252, "geo/layer_7/stable_rank_q_proj": 42.532962799072266, "geo/layer_7/stable_rank_k_proj": 40.04350662231445, "geo/layer_7/stable_rank_o_proj": 87.90235137939453, "geo/layer_7/stable_rank_gate_proj": 76.8290786743164, "geo/layer_7/stable_rank_down_proj": 140.34780883789062, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4597875773906708, "geo/layer_7/attn_entropy_mean": 4.64840030670166, "geo/layer_7/attn_entropy_std": 0.765418529510498, "geo/layer_14/stable_rank_q_proj": 49.10272216796875, "geo/layer_14/stable_rank_k_proj": 41.4854621887207, "geo/layer_14/stable_rank_o_proj": 43.082801818847656, "geo/layer_14/stable_rank_gate_proj": 70.23971557617188, "geo/layer_14/stable_rank_down_proj": 124.99938201904297, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.40089449286460876, "geo/layer_14/attn_entropy_mean": 5.498561859130859, "geo/layer_14/attn_entropy_std": 0.40787428617477417, "geo/layer_21/stable_rank_q_proj": 39.39239501953125, "geo/layer_21/stable_rank_k_proj": 30.02428436279297, "geo/layer_21/stable_rank_o_proj": 67.86219787597656, "geo/layer_21/stable_rank_gate_proj": 62.8814582824707, "geo/layer_21/stable_rank_down_proj": 49.24094009399414, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1477181762456894, "geo/layer_21/attn_entropy_mean": 5.67897891998291, "geo/layer_21/attn_entropy_std": 0.28798073530197144, "geo/layer_27/stable_rank_q_proj": 43.78852462768555, "geo/layer_27/stable_rank_k_proj": 32.33635330200195, "geo/layer_27/stable_rank_o_proj": 115.30250549316406, "geo/layer_27/stable_rank_gate_proj": 76.2162094116211, "geo/layer_27/stable_rank_down_proj": 127.7779312133789, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08497319370508194, "geo/layer_27/attn_entropy_mean": 4.143314838409424, "geo/layer_27/attn_entropy_std": 0.7953525185585022, "attnres/final_alpha/block_0": 0.2391127347946167, "attnres/block_norm/0": 1.7789273262023926, "attnres/final_alpha/block_1": 0.003940410912036896, "attnres/block_norm/1": 47458.0625, "attnres/final_alpha/block_2": 0.009090250357985497, "attnres/block_norm/2": 29180.80859375, "attnres/final_alpha/block_3": 0.010537510737776756, "attnres/block_norm/3": 63864.1015625, "attnres/final_alpha/block_4": 0.012583973817527294, "attnres/block_norm/4": 15722.9267578125, "attnres/final_alpha/block_5": 0.6173899173736572, "attnres/block_norm/5": 6812.1396484375, "attnres/final_alpha/block_6": 0.10734528303146362, "attnres/block_norm/6": 40949.71875, "geo/tier1_time_s": 1.3613474369049072, "geo/step": 80100.0, "geo/rankme_slope": 0.0003431386226365546} {"step": 80110, "timestamp": 1778281136.1670163, "train/loss": 2.0413023352622988, "train/z_loss": 0.0014197211130522192, "train/perplexity": 7.700631475192637, "train/grad_norm": 0.07861328125, "optim/muon_lr": 0.0009605818986892701, "optim/adamw_lr": 2.8817456960678097e-05, "perf/tokens_per_sec": 1786790.0252847616, "perf/iters_per_sec": 0.8520078779624756, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1736980676651, "data/tokens_consumed": 168004943872, "data/tokens_consumed_B": 168.004943872, "train/loss_slope": -1.7104970210670793e-05} {"step": 80120, "timestamp": 1778281146.5468218, "train/loss": 2.0818785786628724, "train/z_loss": 0.001420473074540496, "train/perplexity": 8.01952007177165, "train/grad_norm": 0.072265625, "optim/muon_lr": 0.0009441155195236207, "optim/adamw_lr": 2.8323465585708614e-05, "perf/tokens_per_sec": 2021434.6615863221, "perf/iters_per_sec": 0.9638951595241175, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0374572277069092, "data/tokens_consumed": 168025915392, "data/tokens_consumed_B": 168.025915392, "train/loss_slope": -1.715659451420298e-05} {"step": 80130, "timestamp": 1778281156.922762, "train/loss": 2.016532278060913, "train/z_loss": 0.001425441715400666, "train/perplexity": 7.512229391098338, "train/grad_norm": 0.07177734375, "optim/muon_lr": 0.0009277856349945068, "optim/adamw_lr": 2.7833569049835203e-05, "perf/tokens_per_sec": 2022433.8382362502, "perf/iters_per_sec": 0.9643716040784122, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0369446754455567, "data/tokens_consumed": 168046886912, "data/tokens_consumed_B": 168.046886912, "train/loss_slope": -1.8386846929207365e-05} {"step": 80140, "timestamp": 1778281167.297737, "train/loss": 2.069631266593933, "train/z_loss": 0.001413943013176322, "train/perplexity": 7.921901509509164, "train/grad_norm": 0.06884765625, "optim/muon_lr": 0.0009115898609161377, "optim/adamw_lr": 2.7347695827484127e-05, "perf/tokens_per_sec": 2022413.3781662413, "perf/iters_per_sec": 0.9643618479567725, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0369551658630372, "data/tokens_consumed": 168067858432, "data/tokens_consumed_B": 168.067858432, "train/loss_slope": -1.707878643565801e-05} {"step": 80150, "timestamp": 1778281177.669252, "grad/layer_0/attn": 0.002358510624617338, "grad/layer_0/mlp": 0.0027222849894315004, "grad/layer_0/attn_mlp_ratio": 0.8663716499692066, "grad/layer_4/attn": 0.0022865093778818846, "grad/layer_4/mlp": 0.0024910313077270985, "grad/layer_4/attn_mlp_ratio": 0.9178966474646634, "grad/layer_8/attn": 0.0031316617969423532, "grad/layer_8/mlp": 0.003497468773275614, "grad/layer_8/attn_mlp_ratio": 0.8954080537704234, "grad/layer_12/attn": 0.0033061609137803316, "grad/layer_12/mlp": 0.006388492416590452, "grad/layer_12/attn_mlp_ratio": 0.5175181633530088, "grad/layer_16/attn": 0.0029659655410796404, "grad/layer_16/mlp": 0.004009617958217859, "grad/layer_16/attn_mlp_ratio": 0.739712735231922, "grad/layer_20/attn": 0.005341202951967716, "grad/layer_20/mlp": 0.004611174110323191, "grad/layer_20/attn_mlp_ratio": 1.158317319699218, "grad/layer_24/attn": 0.006263883784413338, "grad/layer_24/mlp": 0.006227235775440931, "grad/layer_24/attn_mlp_ratio": 1.0058851005013218, "grad/layer_27/attn": 0.003946363925933838, "grad/layer_27/mlp": 0.005477739032357931, "grad/layer_27/attn_mlp_ratio": 0.720436630985583} {"step": 80150, "timestamp": 1778281177.6849694, "train/loss": 2.07443630695343, "train/z_loss": 0.0014276104746386408, "train/perplexity": 7.96005816470573, "train/grad_norm": 0.06982421875, "optim/muon_lr": 0.0008955305814743042, "optim/adamw_lr": 2.6865917444229122e-05, "perf/tokens_per_sec": 2020112.358078445, "perf/iters_per_sec": 0.9632646360771394, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0381363153457641, "data/tokens_consumed": 168088829952, "data/tokens_consumed_B": 168.088829952, "train/loss_slope": -1.658184151373357e-05} {"step": 80160, "timestamp": 1778281188.0685937, "train/loss": 2.0539013862609865, "train/z_loss": 0.0014092336874455213, "train/perplexity": 7.798265882805834, "train/grad_norm": 0.0703125, "optim/muon_lr": 0.0008796077966690064, "optim/adamw_lr": 2.638823390007019e-05, "perf/tokens_per_sec": 2021101.0325929425, "perf/iters_per_sec": 0.9637360728230202, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0376284837722778, "data/tokens_consumed": 168109801472, "data/tokens_consumed_B": 168.109801472, "train/loss_slope": -1.949257765523024e-05} {"step": 80170, "timestamp": 1778281198.444824, "train/loss": 2.0894066452980042, "train/z_loss": 0.0014130572788417339, "train/perplexity": 8.080119364822874, "train/grad_norm": 0.06982421875, "optim/muon_lr": 0.0008638209104537964, "optim/adamw_lr": 2.591462731361389e-05, "perf/tokens_per_sec": 2022198.1540137737, "perf/iters_per_sec": 0.9642592210835331, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0370655298233031, "data/tokens_consumed": 168130772992, "data/tokens_consumed_B": 168.130772992, "train/loss_slope": -2.016894792315836e-05} {"step": 80175, "timestamp": 1778281204.225576, "eos/sharpness": 1.6154527664184568, "eos/L0_probe": 1.9071457386016846, "eos/L_plus": 1.9164903163909912, "eos/L_minus": 1.9139556884765625, "eos/grad_norm": 0.06686908006668091, "eos/embed_grad_frac": 0.37983301281929016, "eos/time_s": 0.6012868881225586} {"step": 80175, "timestamp": 1778281205.6068232, "geo/rankme_last": 440.7223815917969, "geo/layer_0/stable_rank_q_proj": 18.705522537231445, "geo/layer_0/stable_rank_k_proj": 15.684599876403809, "geo/layer_0/stable_rank_o_proj": 46.28604507446289, "geo/layer_0/stable_rank_gate_proj": 127.02810668945312, "geo/layer_0/stable_rank_down_proj": 56.83272933959961, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06690334528684616, "geo/layer_0/attn_entropy_mean": 6.126461505889893, "geo/layer_0/attn_entropy_std": 0.4275383949279785, "geo/layer_7/stable_rank_q_proj": 42.533634185791016, "geo/layer_7/stable_rank_k_proj": 40.04293441772461, "geo/layer_7/stable_rank_o_proj": 87.8984603881836, "geo/layer_7/stable_rank_gate_proj": 76.8316650390625, "geo/layer_7/stable_rank_down_proj": 140.35556030273438, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.45517611503601074, "geo/layer_7/attn_entropy_mean": 4.644232749938965, "geo/layer_7/attn_entropy_std": 0.7664999961853027, "geo/layer_14/stable_rank_q_proj": 49.103084564208984, "geo/layer_14/stable_rank_k_proj": 41.48444747924805, "geo/layer_14/stable_rank_o_proj": 43.08189010620117, "geo/layer_14/stable_rank_gate_proj": 70.23915100097656, "geo/layer_14/stable_rank_down_proj": 124.99402618408203, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.411036878824234, "geo/layer_14/attn_entropy_mean": 5.493932247161865, "geo/layer_14/attn_entropy_std": 0.40796902775764465, "geo/layer_21/stable_rank_q_proj": 39.39060974121094, "geo/layer_21/stable_rank_k_proj": 30.023225784301758, "geo/layer_21/stable_rank_o_proj": 67.86222839355469, "geo/layer_21/stable_rank_gate_proj": 62.88097381591797, "geo/layer_21/stable_rank_down_proj": 49.24002456665039, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1474652886390686, "geo/layer_21/attn_entropy_mean": 5.6769256591796875, "geo/layer_21/attn_entropy_std": 0.2878826856613159, "geo/layer_27/stable_rank_q_proj": 43.78609085083008, "geo/layer_27/stable_rank_k_proj": 32.337379455566406, "geo/layer_27/stable_rank_o_proj": 115.30170440673828, "geo/layer_27/stable_rank_gate_proj": 76.21728515625, "geo/layer_27/stable_rank_down_proj": 127.77816009521484, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08425614982843399, "geo/layer_27/attn_entropy_mean": 4.143564224243164, "geo/layer_27/attn_entropy_std": 0.794969379901886, "attnres/final_alpha/block_0": 0.23904070258140564, "attnres/block_norm/0": 1.778878927230835, "attnres/final_alpha/block_1": 0.003940907306969166, "attnres/block_norm/1": 47441.609375, "attnres/final_alpha/block_2": 0.009109045378863811, "attnres/block_norm/2": 29170.2890625, "attnres/final_alpha/block_3": 0.010555192828178406, "attnres/block_norm/3": 63779.1171875, "attnres/final_alpha/block_4": 0.012608900666236877, "attnres/block_norm/4": 15733.33984375, "attnres/final_alpha/block_5": 0.6173422932624817, "attnres/block_norm/5": 6816.283203125, "attnres/final_alpha/block_6": 0.1074029952287674, "attnres/block_norm/6": 40873.71484375, "geo/tier1_time_s": 1.3606486320495605, "geo/step": 80175.0, "geo/rankme_slope": 0.0003434269606280012} {"step": 80180, "timestamp": 1778281210.836125, "train/loss": 2.0892791032791136, "train/z_loss": 0.0014140674844384193, "train/perplexity": 8.07908887580293, "train/grad_norm": 0.06884765625, "optim/muon_lr": 0.0008481711149215699, "optim/adamw_lr": 2.5445133447647092e-05, "perf/tokens_per_sec": 1693366.1680814864, "perf/iters_per_sec": 0.8074599113852913, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2384515762329102, "data/tokens_consumed": 168151744512, "data/tokens_consumed_B": 168.151744512, "train/loss_slope": -2.212787781110796e-05} {"step": 80190, "timestamp": 1778281221.2187996, "train/loss": 2.0823274731636046, "train/z_loss": 0.0014161089668050409, "train/perplexity": 8.023120798343093, "train/grad_norm": 0.072265625, "optim/muon_lr": 0.0008326572179794312, "optim/adamw_lr": 2.497971653938293e-05, "perf/tokens_per_sec": 2021107.719883039, "perf/iters_per_sec": 0.9637392615714259, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0376250505447389, "data/tokens_consumed": 168172716032, "data/tokens_consumed_B": 168.172716032, "train/loss_slope": -2.222449057745282e-05} {"step": 80200, "timestamp": 1778281231.5818799, "grad/layer_0/attn": 0.0029845237731933594, "grad/layer_0/mlp": 0.0028664618730545044, "grad/layer_0/attn_mlp_ratio": 1.0411872898537176, "grad/layer_4/attn": 0.0020940026734024286, "grad/layer_4/mlp": 0.00254884478636086, "grad/layer_4/attn_mlp_ratio": 0.8215496692669136, "grad/layer_8/attn": 0.0030836467631161213, "grad/layer_8/mlp": 0.003586852690204978, "grad/layer_8/attn_mlp_ratio": 0.8597082020028762, "grad/layer_12/attn": 0.005119159817695618, "grad/layer_12/mlp": 0.006396248005330563, "grad/layer_12/attn_mlp_ratio": 0.8003379064407097, "grad/layer_16/attn": 0.002760162577033043, "grad/layer_16/mlp": 0.00410206476226449, "grad/layer_16/attn_mlp_ratio": 0.67287150976675, "grad/layer_20/attn": 0.0030737740453332663, "grad/layer_20/mlp": 0.004432498011738062, "grad/layer_20/attn_mlp_ratio": 0.6934631370046956, "grad/layer_24/attn": 0.0032123690471053123, "grad/layer_24/mlp": 0.005869149696081877, "grad/layer_24/attn_mlp_ratio": 0.5473312419542987, "grad/layer_27/attn": 0.003246612846851349, "grad/layer_27/mlp": 0.005289042368531227, "grad/layer_27/attn_mlp_ratio": 0.6138375454854188} {"step": 80200, "timestamp": 1778281231.5974798, "train/loss": 1.981948733329773, "train/z_loss": 0.0014213516493327915, "train/perplexity": 7.256870921134055, "train/grad_norm": 0.0654296875, "optim/muon_lr": 0.0008172804117202759, "optim/adamw_lr": 2.4518412351608274e-05, "perf/tokens_per_sec": 2021588.810014766, "perf/iters_per_sec": 0.9639686632226782, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373781204223633, "data/tokens_consumed": 168193687552, "data/tokens_consumed_B": 168.193687552, "train/loss_slope": -2.640074228618082e-05} {"step": 80210, "timestamp": 1778281241.9838517, "train/loss": 2.0464280247688293, "train/z_loss": 0.00141596260946244, "train/perplexity": 7.740203852358514, "train/grad_norm": 0.06787109375, "optim/muon_lr": 0.0008020418882369996, "optim/adamw_lr": 2.4061256647109982e-05, "perf/tokens_per_sec": 2020360.5961470734, "perf/iters_per_sec": 0.963383005212342, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0380087614059448, "data/tokens_consumed": 168214659072, "data/tokens_consumed_B": 168.214659072, "train/loss_slope": -2.9988363960144796e-05} {"step": 80220, "timestamp": 1778281252.3686345, "train/loss": 2.0693035244941713, "train/z_loss": 0.001421197154559195, "train/perplexity": 7.919305594292921, "train/grad_norm": 0.0693359375, "optim/muon_lr": 0.0007869398593902588, "optim/adamw_lr": 2.3608195781707762e-05, "perf/tokens_per_sec": 2020459.676408291, "perf/iters_per_sec": 0.963430250362535, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0379578590393066, "data/tokens_consumed": 168235630592, "data/tokens_consumed_B": 168.235630592, "train/loss_slope": -2.6238016555733477e-05} {"step": 80230, "timestamp": 1778281262.7565298, "train/loss": 2.06341689825058, "train/z_loss": 0.0014167220797389746, "train/perplexity": 7.872824544656972, "train/grad_norm": 0.07275390625, "optim/muon_lr": 0.0007719767093658448, "optim/adamw_lr": 2.315930128097534e-05, "perf/tokens_per_sec": 2019857.3164109536, "perf/iters_per_sec": 0.9631430227331894, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.038267397880554, "data/tokens_consumed": 168256602112, "data/tokens_consumed_B": 168.256602112, "train/loss_slope": -2.6597736799570293e-05} {"step": 80240, "timestamp": 1778281273.1339226, "train/loss": 2.0677133917808534, "train/z_loss": 0.0014038455090485513, "train/perplexity": 7.906722854165336, "train/grad_norm": 0.06396484375, "optim/muon_lr": 0.0007571500539779664, "optim/adamw_lr": 2.271450161933899e-05, "perf/tokens_per_sec": 2021867.2008654862, "perf/iters_per_sec": 0.9641014103248053, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0372352838516234, "data/tokens_consumed": 168277573632, "data/tokens_consumed_B": 168.277573632, "train/loss_slope": -2.5733367895314584e-05} {"step": 80250, "timestamp": 1778281283.500534, "grad/layer_0/attn": 0.0023262633476406336, "grad/layer_0/mlp": 0.0026765726506710052, "grad/layer_0/attn_mlp_ratio": 0.8691201638578474, "grad/layer_4/attn": 0.001876625930890441, "grad/layer_4/mlp": 0.002526547759771347, "grad/layer_4/attn_mlp_ratio": 0.742762866585981, "grad/layer_8/attn": 0.003500334220007062, "grad/layer_8/mlp": 0.00334878615103662, "grad/layer_8/attn_mlp_ratio": 1.0452545960267037, "grad/layer_12/attn": 0.0035836142487823963, "grad/layer_12/mlp": 0.005981605499982834, "grad/layer_12/attn_mlp_ratio": 0.5991057399024571, "grad/layer_16/attn": 0.0034948796965181828, "grad/layer_16/mlp": 0.003975838888436556, "grad/layer_16/attn_mlp_ratio": 0.8790294845145368, "grad/layer_20/attn": 0.00378659856505692, "grad/layer_20/mlp": 0.0045578558929264545, "grad/layer_20/attn_mlp_ratio": 0.8307850381700335, "grad/layer_24/attn": 0.004127344582229853, "grad/layer_24/mlp": 0.005905959755182266, "grad/layer_24/attn_mlp_ratio": 0.6988439954613401, "grad/layer_27/attn": 0.003846519161015749, "grad/layer_27/mlp": 0.005316504742950201, "grad/layer_27/attn_mlp_ratio": 0.7235052491518589} {"step": 80250, "timestamp": 1778281284.105175, "eos/sharpness": 2.67566442489624, "eos/L0_probe": 1.9063466787338257, "eos/L_plus": 1.919119954109192, "eos/L_minus": 1.9203300476074219, "eos/grad_norm": 0.06929358839988708, "eos/embed_grad_frac": 0.3450917601585388, "eos/time_s": 0.6019892692565918} {"step": 80250, "timestamp": 1778281284.1253214, "train/loss": 2.0817157864570617, "train/z_loss": 0.0014120186096988618, "train/perplexity": 8.01821466266772, "train/grad_norm": 0.0693359375, "optim/muon_lr": 0.0007424628734588624, "optim/adamw_lr": 2.2273886203765866e-05, "perf/tokens_per_sec": 1908884.005903936, "perf/iters_per_sec": 0.9102268247146301, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.098627257347107, "data/tokens_consumed": 168298545152, "data/tokens_consumed_B": 168.298545152, "train/loss_slope": -2.7662854426407126e-05} {"step": 80250, "timestamp": 1778281285.490141, "geo/rankme_last": 440.713134765625, "geo/layer_0/stable_rank_q_proj": 18.704851150512695, "geo/layer_0/stable_rank_k_proj": 15.684041023254395, "geo/layer_0/stable_rank_o_proj": 46.284568786621094, "geo/layer_0/stable_rank_gate_proj": 127.02354431152344, "geo/layer_0/stable_rank_down_proj": 56.83155059814453, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06797075271606445, "geo/layer_0/attn_entropy_mean": 6.126193046569824, "geo/layer_0/attn_entropy_std": 0.4273233711719513, "geo/layer_7/stable_rank_q_proj": 42.53163146972656, "geo/layer_7/stable_rank_k_proj": 40.043312072753906, "geo/layer_7/stable_rank_o_proj": 87.90210723876953, "geo/layer_7/stable_rank_gate_proj": 76.83309173583984, "geo/layer_7/stable_rank_down_proj": 140.352294921875, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.44434019923210144, "geo/layer_7/attn_entropy_mean": 4.644196510314941, "geo/layer_7/attn_entropy_std": 0.7658442258834839, "geo/layer_14/stable_rank_q_proj": 49.10480499267578, "geo/layer_14/stable_rank_k_proj": 41.484527587890625, "geo/layer_14/stable_rank_o_proj": 43.08272171020508, "geo/layer_14/stable_rank_gate_proj": 70.23529052734375, "geo/layer_14/stable_rank_down_proj": 124.99235534667969, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.3996889591217041, "geo/layer_14/attn_entropy_mean": 5.494973182678223, "geo/layer_14/attn_entropy_std": 0.40875864028930664, "geo/layer_21/stable_rank_q_proj": 39.38849639892578, "geo/layer_21/stable_rank_k_proj": 30.023094177246094, "geo/layer_21/stable_rank_o_proj": 67.86454010009766, "geo/layer_21/stable_rank_gate_proj": 62.885520935058594, "geo/layer_21/stable_rank_down_proj": 49.238956451416016, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15242554247379303, "geo/layer_21/attn_entropy_mean": 5.676042556762695, "geo/layer_21/attn_entropy_std": 0.2881622612476349, "geo/layer_27/stable_rank_q_proj": 43.787784576416016, "geo/layer_27/stable_rank_k_proj": 32.33582305908203, "geo/layer_27/stable_rank_o_proj": 115.30353546142578, "geo/layer_27/stable_rank_gate_proj": 76.21623992919922, "geo/layer_27/stable_rank_down_proj": 127.78150939941406, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09316135942935944, "geo/layer_27/attn_entropy_mean": 4.140251159667969, "geo/layer_27/attn_entropy_std": 0.7949513792991638, "attnres/final_alpha/block_0": 0.23883792757987976, "attnres/block_norm/0": 1.77881920337677, "attnres/final_alpha/block_1": 0.003935609944164753, "attnres/block_norm/1": 47490.796875, "attnres/final_alpha/block_2": 0.009081363677978516, "attnres/block_norm/2": 29147.11328125, "attnres/final_alpha/block_3": 0.010543309152126312, "attnres/block_norm/3": 63872.03125, "attnres/final_alpha/block_4": 0.012585705146193504, "attnres/block_norm/4": 15724.916015625, "attnres/final_alpha/block_5": 0.6177431344985962, "attnres/block_norm/5": 6818.2451171875, "attnres/final_alpha/block_6": 0.10727296769618988, "attnres/block_norm/6": 40900.90625, "geo/tier1_time_s": 1.360471487045288, "geo/step": 80250.0, "geo/rankme_slope": 0.0003155288873361845} {"step": 80260, "timestamp": 1778281295.8647368, "train/loss": 2.0880693912506105, "train/z_loss": 0.001401581335812807, "train/perplexity": 8.069321413909739, "train/grad_norm": 0.07470703125, "optim/muon_lr": 0.0007279127836227417, "optim/adamw_lr": 2.183738350868225e-05, "perf/tokens_per_sec": 1787041.0464944725, "perf/iters_per_sec": 0.8521275742027629, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1735332012176514, "data/tokens_consumed": 168319516672, "data/tokens_consumed_B": 168.319516672, "train/loss_slope": -2.6978230125869002e-05} {"step": 80270, "timestamp": 1778281306.2406416, "train/loss": 2.121356797218323, "train/z_loss": 0.0014136197394691407, "train/perplexity": 8.342448823715829, "train/grad_norm": 0.0751953125, "optim/muon_lr": 0.0007135015726089477, "optim/adamw_lr": 2.1405047178268432e-05, "perf/tokens_per_sec": 2022185.6018477743, "perf/iters_per_sec": 0.9642532357443687, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0370719671249389, "data/tokens_consumed": 168340488192, "data/tokens_consumed_B": 168.340488192, "train/loss_slope": -2.4120671885742466e-05} {"step": 80280, "timestamp": 1778281316.6125278, "train/loss": 2.09871563911438, "train/z_loss": 0.001409023266751319, "train/perplexity": 8.155688335851051, "train/grad_norm": 0.07275390625, "optim/muon_lr": 0.0006992298364639283, "optim/adamw_lr": 2.0976895093917846e-05, "perf/tokens_per_sec": 2023162.0677150886, "perf/iters_per_sec": 0.9647188509536212, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0365714311599732, "data/tokens_consumed": 168361459712, "data/tokens_consumed_B": 168.361459712, "train/loss_slope": -2.13772021576051e-05} {"step": 80290, "timestamp": 1778281326.9935348, "train/loss": 2.0746127724647523, "train/z_loss": 0.0014092256664298476, "train/perplexity": 7.96146296438562, "train/grad_norm": 0.0703125, "optim/muon_lr": 0.0006850963830947876, "optim/adamw_lr": 2.0552891492843626e-05, "perf/tokens_per_sec": 2021194.1478013152, "perf/iters_per_sec": 0.9637804736143661, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037580680847168, "data/tokens_consumed": 168382431232, "data/tokens_consumed_B": 168.382431232, "train/loss_slope": -2.0837178625146247e-05} {"step": 80300, "timestamp": 1778281337.3543515, "grad/layer_0/attn": 0.0025776613038033247, "grad/layer_0/mlp": 0.0028037235606461763, "grad/layer_0/attn_mlp_ratio": 0.9193706712198766, "grad/layer_4/attn": 0.0022727872710675, "grad/layer_4/mlp": 0.002511288970708847, "grad/layer_4/attn_mlp_ratio": 0.9050281377706842, "grad/layer_8/attn": 0.004676183685660362, "grad/layer_8/mlp": 0.003614663379266858, "grad/layer_8/attn_mlp_ratio": 1.2936705484430895, "grad/layer_12/attn": 0.0036100733559578657, "grad/layer_12/mlp": 0.00677665276452899, "grad/layer_12/attn_mlp_ratio": 0.5327221901617626, "grad/layer_16/attn": 0.003452867502346635, "grad/layer_16/mlp": 0.00437141302973032, "grad/layer_16/attn_mlp_ratio": 0.7898744410276428, "grad/layer_20/attn": 0.0026601620484143496, "grad/layer_20/mlp": 0.005095027387142181, "grad/layer_20/attn_mlp_ratio": 0.5221094596893812, "grad/layer_24/attn": 0.004357513505965471, "grad/layer_24/mlp": 0.006302600726485252, "grad/layer_24/attn_mlp_ratio": 0.691383386943055, "grad/layer_27/attn": 0.005009569227695465, "grad/layer_27/mlp": 0.005443372298032045, "grad/layer_27/attn_mlp_ratio": 0.9203061744418931} {"step": 80300, "timestamp": 1778281337.370035, "train/loss": 2.1305344820022585, "train/z_loss": 0.0014024051022715866, "train/perplexity": 8.419365608458167, "train/grad_norm": 0.068359375, "optim/muon_lr": 0.0006711030006408691, "optim/adamw_lr": 2.0133090019226074e-05, "perf/tokens_per_sec": 2022401.3348185886, "perf/iters_per_sec": 0.9643561052411025, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0369613409042358, "data/tokens_consumed": 168403402752, "data/tokens_consumed_B": 168.403402752, "train/loss_slope": -1.7159853623931218e-05} {"step": 80310, "timestamp": 1778281347.734186, "train/loss": 2.0498220205307005, "train/z_loss": 0.0014197959564626217, "train/perplexity": 7.7665187024137055, "train/grad_norm": 0.083984375, "optim/muon_lr": 0.0006572479009628296, "optim/adamw_lr": 1.9717437028884887e-05, "perf/tokens_per_sec": 2024593.864967278, "perf/iters_per_sec": 0.9654015850864782, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0358383655548096, "data/tokens_consumed": 168424374272, "data/tokens_consumed_B": 168.424374272, "train/loss_slope": -1.6202947015416076e-05} {"step": 80320, "timestamp": 1778281358.0915704, "train/loss": 2.106705570220947, "train/z_loss": 0.0014106191112659872, "train/perplexity": 8.221112743985953, "train/grad_norm": 0.06982421875, "optim/muon_lr": 0.0006435328722000122, "optim/adamw_lr": 1.9305986166000365e-05, "perf/tokens_per_sec": 2025908.652157144, "perf/iters_per_sec": 0.9660285244737358, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035166120529175, "data/tokens_consumed": 168445345792, "data/tokens_consumed_B": 168.445345792, "train/loss_slope": -1.5433925163127083e-05} {"step": 80325, "timestamp": 1778281363.878835, "eos/sharpness": 3.366029262542724, "eos/L0_probe": 1.9066863059997559, "eos/L_plus": 1.9240374565124512, "eos/L_minus": 1.9229954481124878, "eos/grad_norm": 0.06715096533298492, "eos/embed_grad_frac": 0.3854888081550598, "eos/time_s": 0.6122126579284668} {"step": 80325, "timestamp": 1778281365.25578, "geo/rankme_last": 440.6778259277344, "geo/layer_0/stable_rank_q_proj": 18.70490074157715, "geo/layer_0/stable_rank_k_proj": 15.683979034423828, "geo/layer_0/stable_rank_o_proj": 46.2840576171875, "geo/layer_0/stable_rank_gate_proj": 127.03194427490234, "geo/layer_0/stable_rank_down_proj": 56.8321647644043, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06569146364927292, "geo/layer_0/attn_entropy_mean": 6.126280784606934, "geo/layer_0/attn_entropy_std": 0.426979660987854, "geo/layer_7/stable_rank_q_proj": 42.53446578979492, "geo/layer_7/stable_rank_k_proj": 40.04167938232422, "geo/layer_7/stable_rank_o_proj": 87.90171813964844, "geo/layer_7/stable_rank_gate_proj": 76.83135986328125, "geo/layer_7/stable_rank_down_proj": 140.3511505126953, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.45345714688301086, "geo/layer_7/attn_entropy_mean": 4.641977310180664, "geo/layer_7/attn_entropy_std": 0.76674485206604, "geo/layer_14/stable_rank_q_proj": 49.10631561279297, "geo/layer_14/stable_rank_k_proj": 41.48291015625, "geo/layer_14/stable_rank_o_proj": 43.08307647705078, "geo/layer_14/stable_rank_gate_proj": 70.23695373535156, "geo/layer_14/stable_rank_down_proj": 124.98722839355469, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4141636788845062, "geo/layer_14/attn_entropy_mean": 5.495580673217773, "geo/layer_14/attn_entropy_std": 0.4092070758342743, "geo/layer_21/stable_rank_q_proj": 39.38969421386719, "geo/layer_21/stable_rank_k_proj": 30.022598266601562, "geo/layer_21/stable_rank_o_proj": 67.8631362915039, "geo/layer_21/stable_rank_gate_proj": 62.88131332397461, "geo/layer_21/stable_rank_down_proj": 49.241939544677734, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14777708053588867, "geo/layer_21/attn_entropy_mean": 5.677474021911621, "geo/layer_21/attn_entropy_std": 0.2880696952342987, "geo/layer_27/stable_rank_q_proj": 43.78662872314453, "geo/layer_27/stable_rank_k_proj": 32.3370475769043, "geo/layer_27/stable_rank_o_proj": 115.30216979980469, "geo/layer_27/stable_rank_gate_proj": 76.21638488769531, "geo/layer_27/stable_rank_down_proj": 127.79068756103516, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08243753015995026, "geo/layer_27/attn_entropy_mean": 4.142004489898682, "geo/layer_27/attn_entropy_std": 0.796603798866272, "attnres/final_alpha/block_0": 0.23889626562595367, "attnres/block_norm/0": 1.7787832021713257, "attnres/final_alpha/block_1": 0.003932180814445019, "attnres/block_norm/1": 47442.515625, "attnres/final_alpha/block_2": 0.009072951972484589, "attnres/block_norm/2": 29165.095703125, "attnres/final_alpha/block_3": 0.010539297014474869, "attnres/block_norm/3": 63842.63671875, "attnres/final_alpha/block_4": 0.012574766762554646, "attnres/block_norm/4": 15717.1767578125, "attnres/final_alpha/block_5": 0.6175095438957214, "attnres/block_norm/5": 6807.05859375, "attnres/final_alpha/block_6": 0.10747499018907547, "attnres/block_norm/6": 40916.5234375, "geo/tier1_time_s": 1.357375144958496, "geo/step": 80325.0, "geo/rankme_slope": 0.0003059118569302721} {"step": 80330, "timestamp": 1778281370.448785, "train/loss": 2.127652311325073, "train/z_loss": 0.001406434189993888, "train/perplexity": 8.395134495656162, "train/grad_norm": 0.07177734375, "optim/muon_lr": 0.0006299585103988648, "optim/adamw_lr": 1.889875531196594e-05, "perf/tokens_per_sec": 1698028.71340976, "perf/iters_per_sec": 0.8096831862496185, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2350509643554688, "data/tokens_consumed": 168466317312, "data/tokens_consumed_B": 168.466317312, "train/loss_slope": -1.0536244099396937e-05} {"step": 80340, "timestamp": 1778281380.8273702, "train/loss": 2.1536041259765626, "train/z_loss": 0.0014120541629381478, "train/perplexity": 8.615855132849138, "train/grad_norm": 0.0751953125, "optim/muon_lr": 0.000616523027420044, "optim/adamw_lr": 1.8495690822601318e-05, "perf/tokens_per_sec": 2021467.877238262, "perf/iters_per_sec": 0.9639109979811964, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0374401807785034, "data/tokens_consumed": 168487288832, "data/tokens_consumed_B": 168.487288832, "train/loss_slope": -7.772929189872543e-06} {"step": 80350, "timestamp": 1778281391.1965303, "grad/layer_0/attn": 0.002561078406870365, "grad/layer_0/mlp": 0.002865689340978861, "grad/layer_0/attn_mlp_ratio": 0.8937041014449748, "grad/layer_4/attn": 0.001950297737494111, "grad/layer_4/mlp": 0.0026916395872831345, "grad/layer_4/attn_mlp_ratio": 0.724576081526977, "grad/layer_8/attn": 0.005326895508915186, "grad/layer_8/mlp": 0.0034917667508125305, "grad/layer_8/attn_mlp_ratio": 1.5255587605099186, "grad/layer_12/attn": 0.003651734907180071, "grad/layer_12/mlp": 0.006175363902002573, "grad/layer_12/attn_mlp_ratio": 0.5913392159548605, "grad/layer_16/attn": 0.0028395887929946184, "grad/layer_16/mlp": 0.004328021313995123, "grad/layer_16/attn_mlp_ratio": 0.6560939795289604, "grad/layer_20/attn": 0.002829259494319558, "grad/layer_20/mlp": 0.005043491255491972, "grad/layer_20/attn_mlp_ratio": 0.5609724087736789, "grad/layer_24/attn": 0.004639788530766964, "grad/layer_24/mlp": 0.006631212774664164, "grad/layer_24/attn_mlp_ratio": 0.6996892753200816, "grad/layer_27/attn": 0.004397853277623653, "grad/layer_27/mlp": 0.005730126518756151, "grad/layer_27/attn_mlp_ratio": 0.767496701247119} {"step": 80350, "timestamp": 1778281391.2126946, "train/loss": 2.077466344833374, "train/z_loss": 0.0014069218072108925, "train/perplexity": 7.984214020568989, "train/grad_norm": 0.07666015625, "optim/muon_lr": 0.0006032288074493408, "optim/adamw_lr": 1.8096864223480222e-05, "perf/tokens_per_sec": 2020400.6911571585, "perf/iters_per_sec": 0.9634021240030091, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0379881620407105, "data/tokens_consumed": 168508260352, "data/tokens_consumed_B": 168.508260352, "train/loss_slope": -8.147737695903954e-06} {"step": 80360, "timestamp": 1778281401.5955129, "train/loss": 2.056763458251953, "train/z_loss": 0.0014131237636320292, "train/perplexity": 7.820617051236305, "train/grad_norm": 0.07421875, "optim/muon_lr": 0.0005900746583938598, "optim/adamw_lr": 1.7702239751815793e-05, "perf/tokens_per_sec": 2021007.4616129822, "perf/iters_per_sec": 0.9636914547028457, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0376765251159668, "data/tokens_consumed": 168529231872, "data/tokens_consumed_B": 168.529231872, "train/loss_slope": -7.907940445095064e-06} {"step": 80370, "timestamp": 1778281411.9712555, "train/loss": 2.04376882314682, "train/z_loss": 0.0014119260013103485, "train/perplexity": 7.719648432335657, "train/grad_norm": 0.06640625, "optim/muon_lr": 0.0005770611763000488, "optim/adamw_lr": 1.7311835289001463e-05, "perf/tokens_per_sec": 2022193.040149556, "perf/iters_per_sec": 0.9642567826030521, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0370681524276733, "data/tokens_consumed": 168550203392, "data/tokens_consumed_B": 168.550203392, "train/loss_slope": -1.1393539704541125e-05} {"step": 80380, "timestamp": 1778281422.3544962, "train/loss": 2.0714676141738892, "train/z_loss": 0.0014229638851247727, "train/perplexity": 7.936462239363545, "train/grad_norm": 0.0712890625, "optim/muon_lr": 0.0005641883611679077, "optim/adamw_lr": 1.692565083503723e-05, "perf/tokens_per_sec": 2021175.1989738164, "perf/iters_per_sec": 0.9637714381093103, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0375904083251952, "data/tokens_consumed": 168571174912, "data/tokens_consumed_B": 168.571174912, "train/loss_slope": -8.915256717131362e-06} {"step": 80390, "timestamp": 1778281432.7377758, "train/loss": 2.0782689809799195, "train/z_loss": 0.0014225739054381848, "train/perplexity": 7.990625011846142, "train/grad_norm": 0.07177734375, "optim/muon_lr": 0.0005514562129974365, "optim/adamw_lr": 1.6543686389923093e-05, "perf/tokens_per_sec": 2021386.4427642531, "perf/iters_per_sec": 0.9638721669980302, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0374819755554199, "data/tokens_consumed": 168592146432, "data/tokens_consumed_B": 168.592146432, "train/loss_slope": -1.0299498964064177e-05} {"step": 80400, "timestamp": 1778281443.100501, "grad/layer_0/attn": 0.0023060522507876158, "grad/layer_0/mlp": 0.0028087443206459284, "grad/layer_0/attn_mlp_ratio": 0.8210260192549993, "grad/layer_4/attn": 0.002074676798656583, "grad/layer_4/mlp": 0.0023192819207906723, "grad/layer_4/attn_mlp_ratio": 0.8945340756573017, "grad/layer_8/attn": 0.003029898041859269, "grad/layer_8/mlp": 0.0031468262895941734, "grad/layer_8/attn_mlp_ratio": 0.9628424535520741, "grad/layer_12/attn": 0.0032840126659721136, "grad/layer_12/mlp": 0.006217877380549908, "grad/layer_12/attn_mlp_ratio": 0.5281565415601718, "grad/layer_16/attn": 0.0048932479694485664, "grad/layer_16/mlp": 0.004232133273035288, "grad/layer_16/attn_mlp_ratio": 1.1562130817108742, "grad/layer_20/attn": 0.0029975478537380695, "grad/layer_20/mlp": 0.004567091818898916, "grad/layer_20/attn_mlp_ratio": 0.6563362215973859, "grad/layer_24/attn": 0.004449395928531885, "grad/layer_24/mlp": 0.006199505180120468, "grad/layer_24/attn_mlp_ratio": 0.717701772559088, "grad/layer_27/attn": 0.003516790457069874, "grad/layer_27/mlp": 0.005358573514968157, "grad/layer_27/attn_mlp_ratio": 0.6562922728627612} {"step": 80400, "timestamp": 1778281443.706743, "eos/sharpness": 5.155253410339355, "eos/L0_probe": 1.9054697751998901, "eos/L_plus": 1.9307278394699097, "eos/L_minus": 1.9317642450332642, "eos/grad_norm": 0.06834137439727783, "eos/embed_grad_frac": 0.3566308915615082, "eos/time_s": 0.6036086082458496} {"step": 80400, "timestamp": 1778281443.7260754, "train/loss": 2.0724941968917845, "train/z_loss": 0.0014084637863561512, "train/perplexity": 7.944613857779101, "train/grad_norm": 0.068359375, "optim/muon_lr": 0.0005388659238815308, "optim/adamw_lr": 1.6165977716445923e-05, "perf/tokens_per_sec": 1909435.084584301, "perf/iters_per_sec": 0.9104895995065218, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0983101844787597, "data/tokens_consumed": 168613117952, "data/tokens_consumed_B": 168.613117952, "train/loss_slope": -7.310855749881612e-06} {"step": 80400, "timestamp": 1778281445.0880942, "geo/rankme_last": 440.7396240234375, "geo/layer_0/stable_rank_q_proj": 18.70487403869629, "geo/layer_0/stable_rank_k_proj": 15.683748245239258, "geo/layer_0/stable_rank_o_proj": 46.28247833251953, "geo/layer_0/stable_rank_gate_proj": 127.02257537841797, "geo/layer_0/stable_rank_down_proj": 56.83313751220703, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.05871963128447533, "geo/layer_0/attn_entropy_mean": 6.127480506896973, "geo/layer_0/attn_entropy_std": 0.4267323613166809, "geo/layer_7/stable_rank_q_proj": 42.5318489074707, "geo/layer_7/stable_rank_k_proj": 40.04148483276367, "geo/layer_7/stable_rank_o_proj": 87.9045181274414, "geo/layer_7/stable_rank_gate_proj": 76.83235931396484, "geo/layer_7/stable_rank_down_proj": 140.35475158691406, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4554790258407593, "geo/layer_7/attn_entropy_mean": 4.647751808166504, "geo/layer_7/attn_entropy_std": 0.766409695148468, "geo/layer_14/stable_rank_q_proj": 49.10538864135742, "geo/layer_14/stable_rank_k_proj": 41.48272705078125, "geo/layer_14/stable_rank_o_proj": 43.083221435546875, "geo/layer_14/stable_rank_gate_proj": 70.23721313476562, "geo/layer_14/stable_rank_down_proj": 124.99409484863281, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.42128780484199524, "geo/layer_14/attn_entropy_mean": 5.495965003967285, "geo/layer_14/attn_entropy_std": 0.4089243412017822, "geo/layer_21/stable_rank_q_proj": 39.38911819458008, "geo/layer_21/stable_rank_k_proj": 30.023170471191406, "geo/layer_21/stable_rank_o_proj": 67.86235046386719, "geo/layer_21/stable_rank_gate_proj": 62.88331985473633, "geo/layer_21/stable_rank_down_proj": 49.2409553527832, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15364807844161987, "geo/layer_21/attn_entropy_mean": 5.6781158447265625, "geo/layer_21/attn_entropy_std": 0.2877446413040161, "geo/layer_27/stable_rank_q_proj": 43.789512634277344, "geo/layer_27/stable_rank_k_proj": 32.3376350402832, "geo/layer_27/stable_rank_o_proj": 115.2977294921875, "geo/layer_27/stable_rank_gate_proj": 76.21409606933594, "geo/layer_27/stable_rank_down_proj": 127.78231048583984, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08235401660203934, "geo/layer_27/attn_entropy_mean": 4.143694877624512, "geo/layer_27/attn_entropy_std": 0.7949489951133728, "attnres/final_alpha/block_0": 0.23905134201049805, "attnres/block_norm/0": 1.7787632942199707, "attnres/final_alpha/block_1": 0.003941867500543594, "attnres/block_norm/1": 47423.328125, "attnres/final_alpha/block_2": 0.009096713736653328, "attnres/block_norm/2": 29178.744140625, "attnres/final_alpha/block_3": 0.010534391738474369, "attnres/block_norm/3": 63822.4140625, "attnres/final_alpha/block_4": 0.012583207339048386, "attnres/block_norm/4": 15737.6025390625, "attnres/final_alpha/block_5": 0.6172916889190674, "attnres/block_norm/5": 6811.4052734375, "attnres/final_alpha/block_6": 0.10750078409910202, "attnres/block_norm/6": 40896.3984375, "geo/tier1_time_s": 1.3582394123077393, "geo/step": 80400.0, "geo/rankme_slope": 0.0002938809898959584} {"step": 80410, "timestamp": 1778281455.4689627, "train/loss": 2.0733993530273436, "train/z_loss": 0.0014087799820117653, "train/perplexity": 7.95180822928067, "train/grad_norm": 0.0830078125, "optim/muon_lr": 0.0005264168977737426, "optim/adamw_lr": 1.579250693321228e-05, "perf/tokens_per_sec": 1786436.9362459367, "perf/iters_per_sec": 0.8518395119886096, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.173930048942566, "data/tokens_consumed": 168634089472, "data/tokens_consumed_B": 168.634089472, "train/loss_slope": -4.35050015259247e-06} {"step": 80420, "timestamp": 1778281465.845705, "train/loss": 2.0676610469818115, "train/z_loss": 0.0014223806909285485, "train/perplexity": 7.906308989178389, "train/grad_norm": 0.9375, "optim/muon_lr": 0.00051410973072052, "optim/adamw_lr": 1.5423291921615598e-05, "perf/tokens_per_sec": 2021977.8164088845, "perf/iters_per_sec": 0.9641541559261725, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0371785402297973, "data/tokens_consumed": 168655060992, "data/tokens_consumed_B": 168.655060992, "train/loss_slope": -3.3365705440325488e-06} {"step": 80430, "timestamp": 1778281476.2213817, "train/loss": 2.0659321188926696, "train/z_loss": 0.0014080724446102976, "train/perplexity": 7.892651359417532, "train/grad_norm": 0.07763671875, "optim/muon_lr": 0.000501943826675415, "optim/adamw_lr": 1.505831480026245e-05, "perf/tokens_per_sec": 2022105.0392476413, "perf/iters_per_sec": 0.9642148205030638, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0371132850646974, "data/tokens_consumed": 168676032512, "data/tokens_consumed_B": 168.676032512, "train/loss_slope": -3.5343159793770227e-06} {"step": 80440, "timestamp": 1778281486.6004808, "train/loss": 2.095945143699646, "train/z_loss": 0.0014048851560801267, "train/perplexity": 8.133124309910563, "train/grad_norm": 0.08837890625, "optim/muon_lr": 0.0004899197816848755, "optim/adamw_lr": 1.4697593450546263e-05, "perf/tokens_per_sec": 2021451.8035137453, "perf/iters_per_sec": 0.9639033334320761, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0374484300613402, "data/tokens_consumed": 168697004032, "data/tokens_consumed_B": 168.697004032, "train/loss_slope": -3.166430675336065e-06} {"step": 80450, "timestamp": 1778281496.9666789, "grad/layer_0/attn": 0.0024634781293570995, "grad/layer_0/mlp": 0.0028006043285131454, "grad/layer_0/attn_mlp_ratio": 0.8796237356037361, "grad/layer_4/attn": 0.0018477838020771742, "grad/layer_4/mlp": 0.002568559255450964, "grad/layer_4/attn_mlp_ratio": 0.7193852842667743, "grad/layer_8/attn": 0.004136607516556978, "grad/layer_8/mlp": 0.0035721552558243275, "grad/layer_8/attn_mlp_ratio": 1.1580144491230828, "grad/layer_12/attn": 0.0033897352404892445, "grad/layer_12/mlp": 0.006251417100429535, "grad/layer_12/attn_mlp_ratio": 0.5422346856415752, "grad/layer_16/attn": 0.0032014683820307255, "grad/layer_16/mlp": 0.0041791778057813644, "grad/layer_16/attn_mlp_ratio": 0.7660521887814106, "grad/layer_20/attn": 0.003413274185732007, "grad/layer_20/mlp": 0.004707592073827982, "grad/layer_20/attn_mlp_ratio": 0.7250573243596208, "grad/layer_24/attn": 0.00386337679810822, "grad/layer_24/mlp": 0.006541573908179998, "grad/layer_24/attn_mlp_ratio": 0.5905882580059798, "grad/layer_27/attn": 0.005663941614329815, "grad/layer_27/mlp": 0.005629371386021376, "grad/layer_27/attn_mlp_ratio": 1.0061410280693472} {"step": 80450, "timestamp": 1778281496.9823363, "train/loss": 2.0828224420547485, "train/z_loss": 0.0014018124551512302, "train/perplexity": 8.02709297651938, "train/grad_norm": 0.07275390625, "optim/muon_lr": 0.00047803878784179687, "optim/adamw_lr": 1.4341163635253905e-05, "perf/tokens_per_sec": 2021096.3886675227, "perf/iters_per_sec": 0.9637338584268201, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0376308679580688, "data/tokens_consumed": 168717975552, "data/tokens_consumed_B": 168.717975552, "train/loss_slope": -2.6617757271905e-06} {"step": 80460, "timestamp": 1778281507.3605082, "train/loss": 2.063366115093231, "train/z_loss": 0.0014205488841980696, "train/perplexity": 7.872424747920898, "train/grad_norm": 0.07666015625, "optim/muon_lr": 0.0004662990570068359, "optim/adamw_lr": 1.3988971710205076e-05, "perf/tokens_per_sec": 2021648.3757707223, "perf/iters_per_sec": 0.9639970663884746, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0373475551605225, "data/tokens_consumed": 168738947072, "data/tokens_consumed_B": 168.738947072, "train/loss_slope": -1.8745304691945352e-06} {"step": 80470, "timestamp": 1778281517.739573, "train/loss": 2.110509979724884, "train/z_loss": 0.0014041596674360334, "train/perplexity": 8.25244879322862, "train/grad_norm": 0.0712890625, "optim/muon_lr": 0.0004547029733657837, "optim/adamw_lr": 1.3641089200973509e-05, "perf/tokens_per_sec": 2021796.654919944, "perf/iters_per_sec": 0.9640677713966103, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0372714757919312, "data/tokens_consumed": 168759918592, "data/tokens_consumed_B": 168.759918592, "train/loss_slope": -1.7119836707107525e-07} {"step": 80475, "timestamp": 1778281523.5181463, "eos/sharpness": 12.1143102645874, "eos/L0_probe": 1.905857801437378, "eos/L_plus": 1.9665706157684326, "eos/L_minus": 1.9662880897521973, "eos/grad_norm": 0.06969765573740005, "eos/embed_grad_frac": 0.33718541264533997, "eos/time_s": 0.6003162860870361} {"step": 80475, "timestamp": 1778281524.8959703, "geo/rankme_last": 440.67218017578125, "geo/layer_0/stable_rank_q_proj": 18.704225540161133, "geo/layer_0/stable_rank_k_proj": 15.683917045593262, "geo/layer_0/stable_rank_o_proj": 46.28440856933594, "geo/layer_0/stable_rank_gate_proj": 127.0291519165039, "geo/layer_0/stable_rank_down_proj": 56.830467224121094, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06326371431350708, "geo/layer_0/attn_entropy_mean": 6.1271538734436035, "geo/layer_0/attn_entropy_std": 0.4268915057182312, "geo/layer_7/stable_rank_q_proj": 42.53572082519531, "geo/layer_7/stable_rank_k_proj": 40.04253005981445, "geo/layer_7/stable_rank_o_proj": 87.9043197631836, "geo/layer_7/stable_rank_gate_proj": 76.83158111572266, "geo/layer_7/stable_rank_down_proj": 140.35458374023438, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.454802006483078, "geo/layer_7/attn_entropy_mean": 4.643548965454102, "geo/layer_7/attn_entropy_std": 0.7667421102523804, "geo/layer_14/stable_rank_q_proj": 49.10527038574219, "geo/layer_14/stable_rank_k_proj": 41.484493255615234, "geo/layer_14/stable_rank_o_proj": 43.082847595214844, "geo/layer_14/stable_rank_gate_proj": 70.23675537109375, "geo/layer_14/stable_rank_down_proj": 124.9931869506836, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.40665602684020996, "geo/layer_14/attn_entropy_mean": 5.497256278991699, "geo/layer_14/attn_entropy_std": 0.40770259499549866, "geo/layer_21/stable_rank_q_proj": 39.38703155517578, "geo/layer_21/stable_rank_k_proj": 30.02261734008789, "geo/layer_21/stable_rank_o_proj": 67.86621856689453, "geo/layer_21/stable_rank_gate_proj": 62.8809814453125, "geo/layer_21/stable_rank_down_proj": 49.24160385131836, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14721614122390747, "geo/layer_21/attn_entropy_mean": 5.676079273223877, "geo/layer_21/attn_entropy_std": 0.2875096797943115, "geo/layer_27/stable_rank_q_proj": 43.7879524230957, "geo/layer_27/stable_rank_k_proj": 32.33590316772461, "geo/layer_27/stable_rank_o_proj": 115.29432678222656, "geo/layer_27/stable_rank_gate_proj": 76.21543884277344, "geo/layer_27/stable_rank_down_proj": 127.77803802490234, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08979279547929764, "geo/layer_27/attn_entropy_mean": 4.143011093139648, "geo/layer_27/attn_entropy_std": 0.7957696318626404, "attnres/final_alpha/block_0": 0.23879225552082062, "attnres/block_norm/0": 1.7787771224975586, "attnres/final_alpha/block_1": 0.003936671651899815, "attnres/block_norm/1": 47475.14453125, "attnres/final_alpha/block_2": 0.009078340604901314, "attnres/block_norm/2": 29174.5078125, "attnres/final_alpha/block_3": 0.010520312935113907, "attnres/block_norm/3": 63861.83203125, "attnres/final_alpha/block_4": 0.01252688653767109, "attnres/block_norm/4": 15710.53125, "attnres/final_alpha/block_5": 0.6178978681564331, "attnres/block_norm/5": 6804.66259765625, "attnres/final_alpha/block_6": 0.10724760591983795, "attnres/block_norm/6": 40910.92578125, "geo/tier1_time_s": 1.3581626415252686, "geo/step": 80475.0, "geo/rankme_slope": 0.00028159003835909364} {"step": 80480, "timestamp": 1778281530.0887382, "train/loss": 2.0621464371681215, "train/z_loss": 0.0014318703324534, "train/perplexity": 7.862828778424446, "train/grad_norm": 0.06787109375, "optim/muon_lr": 0.00044324815273284914, "optim/adamw_lr": 1.3297444581985472e-05, "perf/tokens_per_sec": 1698843.3757345038, "perf/iters_per_sec": 0.8100716475174445, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2344587087631225, "data/tokens_consumed": 168780890112, "data/tokens_consumed_B": 168.780890112, "train/loss_slope": -6.710923687316793e-07} {"step": 80490, "timestamp": 1778281540.4731858, "train/loss": 2.0773181557655334, "train/z_loss": 0.001421732222661376, "train/perplexity": 7.983030934998179, "train/grad_norm": 0.06884765625, "optim/muon_lr": 0.000431937575340271, "optim/adamw_lr": 1.295812726020813e-05, "perf/tokens_per_sec": 2020383.9847095576, "perf/iters_per_sec": 0.9633941577480114, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0379967451095582, "data/tokens_consumed": 168801861632, "data/tokens_consumed_B": 168.801861632, "train/loss_slope": 3.489975324093353e-06} {"step": 80500, "timestamp": 1778281550.8380473, "grad/layer_0/attn": 0.002359039383009076, "grad/layer_0/mlp": 0.0026774918660521507, "grad/layer_0/attn_mlp_ratio": 0.8810631041733359, "grad/layer_4/attn": 0.002056532772257924, "grad/layer_4/mlp": 0.0025137264747172594, "grad/layer_4/attn_mlp_ratio": 0.8181211086926747, "grad/layer_8/attn": 0.004686946514993906, "grad/layer_8/mlp": 0.003388200653716922, "grad/layer_8/attn_mlp_ratio": 1.3833142885209015, "grad/layer_12/attn": 0.0042009660974144936, "grad/layer_12/mlp": 0.006982062943279743, "grad/layer_12/attn_mlp_ratio": 0.6016797716339637, "grad/layer_16/attn": 0.003369794460013509, "grad/layer_16/mlp": 0.004373788367956877, "grad/layer_16/attn_mlp_ratio": 0.7704520885500523, "grad/layer_20/attn": 0.003030496882274747, "grad/layer_20/mlp": 0.004750189837068319, "grad/layer_20/attn_mlp_ratio": 0.6379738331358351, "grad/layer_24/attn": 0.006672847084701061, "grad/layer_24/mlp": 0.006705004256218672, "grad/layer_24/attn_mlp_ratio": 0.995203989466795, "grad/layer_27/attn": 0.003669098950922489, "grad/layer_27/mlp": 0.005603805650025606, "grad/layer_27/attn_mlp_ratio": 0.654751273436936} {"step": 80500, "timestamp": 1778281550.853711, "train/loss": 2.117813766002655, "train/z_loss": 0.0014031078899279237, "train/perplexity": 8.31294356700619, "train/grad_norm": 0.07421875, "optim/muon_lr": 0.0004207688570022583, "optim/adamw_lr": 1.2623065710067747e-05, "perf/tokens_per_sec": 2021232.0000768409, "perf/iters_per_sec": 0.9637985229858593, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0375612497329711, "data/tokens_consumed": 168822833152, "data/tokens_consumed_B": 168.822833152, "train/loss_slope": 7.6154917594801834e-06} {"step": 80500, "timestamp": 1778281557.6856782, "geo/ww_alpha_mean": 7.710588434013216, "geo/ww_alpha_std": 4.644690993407879, "geo/ww_alpha_min": 1.3391774304311062, "geo/ww_alpha_max": 31.375531408440605, "geo/ww_alpha_healthy_frac": 0.18274111675126903, "geo/ww_alpha_by_type/q_proj": 3.9227622890819025, "geo/ww_alpha_by_type/k_proj": 4.4437561158610555, "geo/ww_alpha_by_type/v_proj": 8.679619801771237, "geo/ww_alpha_by_type/o_proj": 9.439838557237996, "geo/ww_alpha_by_type/gate_proj": 8.040716198269402, "geo/ww_alpha_by_type/up_proj": 11.512858743931526, "geo/ww_alpha_by_type/down_proj": 8.021105193422827, "geo/twonn_id/layer_0": 0.7130151391029358, "geo/twonn_id/layer_7": 3.604696273803711, "geo/twonn_id/layer_14": 4.3655290603637695, "geo/twonn_id/layer_21": 7.748854637145996, "geo/twonn_id/layer_27": 5.768024444580078, "geo/tier2_time_s": 6.825928688049316} {"step": 80500, "timestamp": 1778281558.3149729, "eoc/jacobian_sigma/layer_0/attn": 1212.90185546875, "eoc/jacobian_sigma/layer_0/mlp": 10017.4072265625, "eoc/jacobian_sigma/layer_0": 10017.4072265625, "eoc/jacobian_sigma/layer_7/attn": 1.1525970697402954, "eoc/jacobian_sigma/layer_7/mlp": 1.915605902671814, "eoc/jacobian_sigma/layer_7": 1.915605902671814, "eoc/jacobian_sigma/layer_14/attn": 1.3658510446548462, "eoc/jacobian_sigma/layer_14/mlp": 6.290121078491211, "eoc/jacobian_sigma/layer_14": 6.290121078491211, "eoc/jacobian_sigma/layer_21/attn": 1.102003574371338, "eoc/jacobian_sigma/layer_21/mlp": 4.082266807556152, "eoc/jacobian_sigma/layer_21": 4.082266807556152, "eoc/jacobian_sigma/layer_27/attn": 3.12817645072937, "eoc/jacobian_sigma/layer_27/mlp": 26.92217445373535, "eoc/jacobian_sigma/layer_27": 26.92217445373535, "eoc/layer0_sigma": 10017.4072265625, "eoc/sigma_max": 26.92217445373535, "eoc/sigma_min": 1.915605902671814, "eoc/sigma_mean": 9.802542060613632, "eoc/time_s": 0.6237120628356934} {"step": 80510, "timestamp": 1778281569.2278025, "train/loss": 2.111919093132019, "train/z_loss": 0.0014020216651260853, "train/perplexity": 8.264085626348127, "train/grad_norm": 0.06982421875, "optim/muon_lr": 0.00040974378585815433, "optim/adamw_lr": 1.2292313575744628e-05, "perf/tokens_per_sec": 1141668.700696765, "perf/iters_per_sec": 0.5443900588496995, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.8369181871414184, "data/tokens_consumed": 168843804672, "data/tokens_consumed_B": 168.843804672, "train/loss_slope": 6.937985902357595e-06} {"step": 80520, "timestamp": 1778281579.6090689, "train/loss": 2.084754395484924, "train/z_loss": 0.0014130030875094235, "train/perplexity": 8.042615936319208, "train/grad_norm": 0.07568359375, "optim/muon_lr": 0.000398862361907959, "optim/adamw_lr": 1.1965870857238769e-05, "perf/tokens_per_sec": 2021108.2771592112, "perf/iters_per_sec": 0.9637395273014122, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037624764442444, "data/tokens_consumed": 168864776192, "data/tokens_consumed_B": 168.864776192, "train/loss_slope": 8.948671055955957e-06} {"step": 80530, "timestamp": 1778281589.9893444, "train/loss": 2.1038282752037047, "train/z_loss": 0.001421744446270168, "train/perplexity": 8.197492175219782, "train/grad_norm": 0.0693359375, "optim/muon_lr": 0.00038812398910522463, "optim/adamw_lr": 1.1643719673156738e-05, "perf/tokens_per_sec": 2021402.376089701, "perf/iters_per_sec": 0.9638797645996575, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0374737977981567, "data/tokens_consumed": 168885747712, "data/tokens_consumed_B": 168.885747712, "train/loss_slope": 1.4504012158780387e-05} {"step": 80540, "timestamp": 1778281600.3652365, "train/loss": 2.094446325302124, "train/z_loss": 0.0014103976194746793, "train/perplexity": 8.120943364358514, "train/grad_norm": 0.0703125, "optim/muon_lr": 0.0003775298595428467, "optim/adamw_lr": 1.13258957862854e-05, "perf/tokens_per_sec": 2022160.8698878074, "perf/iters_per_sec": 0.9642414426268613, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0370846509933471, "data/tokens_consumed": 168906719232, "data/tokens_consumed_B": 168.906719232, "train/loss_slope": 1.5158759413844165e-05} {"step": 80550, "timestamp": 1778281610.7363982, "grad/layer_0/attn": 0.002360609592869878, "grad/layer_0/mlp": 0.0027554037515074015, "grad/layer_0/attn_mlp_ratio": 0.856720001889544, "grad/layer_4/attn": 0.0021028758492320776, "grad/layer_4/mlp": 0.0025584648828953505, "grad/layer_4/attn_mlp_ratio": 0.8219287202642523, "grad/layer_8/attn": 0.0033053350634872913, "grad/layer_8/mlp": 0.003458252176642418, "grad/layer_8/attn_mlp_ratio": 0.9557819381229202, "grad/layer_12/attn": 0.00424545630812645, "grad/layer_12/mlp": 0.006533105857670307, "grad/layer_12/attn_mlp_ratio": 0.6498373569377055, "grad/layer_16/attn": 0.0030443358700722456, "grad/layer_16/mlp": 0.00424412963911891, "grad/layer_16/attn_mlp_ratio": 0.7173050913152, "grad/layer_20/attn": 0.002950561698526144, "grad/layer_20/mlp": 0.004885323345661163, "grad/layer_20/attn_mlp_ratio": 0.6039644521688011, "grad/layer_24/attn": 0.004230973776429892, "grad/layer_24/mlp": 0.006296687759459019, "grad/layer_24/attn_mlp_ratio": 0.671936400670399, "grad/layer_27/attn": 0.0044081006199121475, "grad/layer_27/mlp": 0.005548483692109585, "grad/layer_27/attn_mlp_ratio": 0.7944694055303615} {"step": 80550, "timestamp": 1778281611.353656, "eos/sharpness": 2.961921691894531, "eos/L0_probe": 1.9081809520721436, "eos/L_plus": 1.9265483617782593, "eos/L_minus": 1.9194327592849731, "eos/grad_norm": 0.07225655019283295, "eos/embed_grad_frac": 0.3418242335319519, "eos/time_s": 0.6145040988922119} {"step": 80550, "timestamp": 1778281611.373802, "train/loss": 2.1541502356529234, "train/z_loss": 0.0014062557485885919, "train/perplexity": 8.620561619719352, "train/grad_norm": 0.072265625, "optim/muon_lr": 0.0003670787811279297, "optim/adamw_lr": 1.1012363433837889e-05, "perf/tokens_per_sec": 1905850.057599137, "perf/iters_per_sec": 0.9087801254268346, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1003761768341065, "data/tokens_consumed": 168927690752, "data/tokens_consumed_B": 168.927690752, "train/loss_slope": 1.8357500069999015e-05} {"step": 80550, "timestamp": 1778281612.736053, "geo/rankme_last": 440.68499755859375, "geo/layer_0/stable_rank_q_proj": 18.705089569091797, "geo/layer_0/stable_rank_k_proj": 15.683331489562988, "geo/layer_0/stable_rank_o_proj": 46.285972595214844, "geo/layer_0/stable_rank_gate_proj": 127.02640533447266, "geo/layer_0/stable_rank_down_proj": 56.83419418334961, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0636436864733696, "geo/layer_0/attn_entropy_mean": 6.126886367797852, "geo/layer_0/attn_entropy_std": 0.4268609285354614, "geo/layer_7/stable_rank_q_proj": 42.531646728515625, "geo/layer_7/stable_rank_k_proj": 40.04142761230469, "geo/layer_7/stable_rank_o_proj": 87.90589141845703, "geo/layer_7/stable_rank_gate_proj": 76.82965850830078, "geo/layer_7/stable_rank_down_proj": 140.35317993164062, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4535861015319824, "geo/layer_7/attn_entropy_mean": 4.642484664916992, "geo/layer_7/attn_entropy_std": 0.7662896513938904, "geo/layer_14/stable_rank_q_proj": 49.105194091796875, "geo/layer_14/stable_rank_k_proj": 41.485389709472656, "geo/layer_14/stable_rank_o_proj": 43.082061767578125, "geo/layer_14/stable_rank_gate_proj": 70.23890686035156, "geo/layer_14/stable_rank_down_proj": 124.99142456054688, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4114706516265869, "geo/layer_14/attn_entropy_mean": 5.4974365234375, "geo/layer_14/attn_entropy_std": 0.40762054920196533, "geo/layer_21/stable_rank_q_proj": 39.38838195800781, "geo/layer_21/stable_rank_k_proj": 30.022319793701172, "geo/layer_21/stable_rank_o_proj": 67.8656234741211, "geo/layer_21/stable_rank_gate_proj": 62.880775451660156, "geo/layer_21/stable_rank_down_proj": 49.24059295654297, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1487487107515335, "geo/layer_21/attn_entropy_mean": 5.675961494445801, "geo/layer_21/attn_entropy_std": 0.2878265380859375, "geo/layer_27/stable_rank_q_proj": 43.78651428222656, "geo/layer_27/stable_rank_k_proj": 32.33545684814453, "geo/layer_27/stable_rank_o_proj": 115.30233001708984, "geo/layer_27/stable_rank_gate_proj": 76.21546936035156, "geo/layer_27/stable_rank_down_proj": 127.7793960571289, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07972214370965958, "geo/layer_27/attn_entropy_mean": 4.142172336578369, "geo/layer_27/attn_entropy_std": 0.7971932291984558, "attnres/final_alpha/block_0": 0.23869016766548157, "attnres/block_norm/0": 1.7787728309631348, "attnres/final_alpha/block_1": 0.003927941899746656, "attnres/block_norm/1": 47462.64453125, "attnres/final_alpha/block_2": 0.0090828537940979, "attnres/block_norm/2": 29136.05859375, "attnres/final_alpha/block_3": 0.01052229106426239, "attnres/block_norm/3": 64045.03125, "attnres/final_alpha/block_4": 0.01251714676618576, "attnres/block_norm/4": 15710.830078125, "attnres/final_alpha/block_5": 0.6180744171142578, "attnres/block_norm/5": 6800.74951171875, "attnres/final_alpha/block_6": 0.10718519240617752, "attnres/block_norm/6": 40893.76953125, "geo/tier1_time_s": 1.3584773540496826, "geo/step": 80550.0, "geo/rankme_slope": 0.00024827679118522407} {"step": 80560, "timestamp": 1778281623.1210124, "train/loss": 2.103800868988037, "train/z_loss": 0.0014060913235880435, "train/perplexity": 8.197267516059838, "train/grad_norm": 0.0703125, "optim/muon_lr": 0.00035677194595336916, "optim/adamw_lr": 1.0703158378601074e-05, "perf/tokens_per_sec": 1785822.3210056387, "perf/iters_per_sec": 0.8515464406040376, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1743340730667113, "data/tokens_consumed": 168948662272, "data/tokens_consumed_B": 168.948662272, "train/loss_slope": 2.035421790069475e-05} {"step": 80570, "timestamp": 1778281633.4952, "train/loss": 2.045465016365051, "train/z_loss": 0.0014079626067541539, "train/perplexity": 7.732753558925104, "train/grad_norm": 0.0693359375, "optim/muon_lr": 0.00034660935401916506, "optim/adamw_lr": 1.039828062057495e-05, "perf/tokens_per_sec": 2022382.270353001, "perf/iters_per_sec": 0.9643470145955091, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036971116065979, "data/tokens_consumed": 168969633792, "data/tokens_consumed_B": 168.969633792, "train/loss_slope": 1.82046161984095e-05} {"step": 80580, "timestamp": 1778281643.8796136, "train/loss": 2.1312670946121215, "train/z_loss": 0.001416496594902128, "train/perplexity": 8.425536001847991, "train/grad_norm": 0.06689453125, "optim/muon_lr": 0.0003365910053253174, "optim/adamw_lr": 1.009773015975952e-05, "perf/tokens_per_sec": 2020623.23810868, "perf/iters_per_sec": 0.9635082426589393, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0378738403320313, "data/tokens_consumed": 168990605312, "data/tokens_consumed_B": 168.990605312, "train/loss_slope": 2.0572634834875493e-05} {"step": 80590, "timestamp": 1778281654.2558486, "train/loss": 2.05855278968811, "train/z_loss": 0.0014202420716173947, "train/perplexity": 7.834623254308979, "train/grad_norm": 0.0791015625, "optim/muon_lr": 0.0003267174959182739, "optim/adamw_lr": 9.801524877548217e-06, "perf/tokens_per_sec": 2022158.87090243, "perf/iters_per_sec": 0.9642404894363547, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0370856761932372, "data/tokens_consumed": 169011576832, "data/tokens_consumed_B": 169.011576832, "train/loss_slope": 1.486513410070936e-05} {"step": 80600, "timestamp": 1778281664.619608, "grad/layer_0/attn": 0.0026220562867820263, "grad/layer_0/mlp": 0.0028812787495553493, "grad/layer_0/attn_mlp_ratio": 0.9100321154915889, "grad/layer_4/attn": 0.001822178135626018, "grad/layer_4/mlp": 0.0024728705175220966, "grad/layer_4/attn_mlp_ratio": 0.7368675589877423, "grad/layer_8/attn": 0.004777461290359497, "grad/layer_8/mlp": 0.003659067442640662, "grad/layer_8/attn_mlp_ratio": 1.3056499325813802, "grad/layer_12/attn": 0.003982372581958771, "grad/layer_12/mlp": 0.00613745441660285, "grad/layer_12/attn_mlp_ratio": 0.6488638850497028, "grad/layer_16/attn": 0.002951896982267499, "grad/layer_16/mlp": 0.004102244507521391, "grad/layer_16/attn_mlp_ratio": 0.719580928171677, "grad/layer_20/attn": 0.0025784645695239305, "grad/layer_20/mlp": 0.004629949573427439, "grad/layer_20/attn_mlp_ratio": 0.5569098481399165, "grad/layer_24/attn": 0.004657757002860308, "grad/layer_24/mlp": 0.006481364369392395, "grad/layer_24/attn_mlp_ratio": 0.7186383399446377, "grad/layer_27/attn": 0.0031980325002223253, "grad/layer_27/mlp": 0.00564622413367033, "grad/layer_27/attn_mlp_ratio": 0.566401964900966} {"step": 80600, "timestamp": 1778281664.6356478, "train/loss": 2.062879180908203, "train/z_loss": 0.0014133346034213901, "train/perplexity": 7.868592328335853, "train/grad_norm": 0.0703125, "optim/muon_lr": 0.00031698822975158693, "optim/adamw_lr": 9.509646892547606e-06, "perf/tokens_per_sec": 2021468.8528193654, "perf/iters_per_sec": 0.9639114631745174, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0374396800994874, "data/tokens_consumed": 169032548352, "data/tokens_consumed_B": 169.032548352, "train/loss_slope": 1.4574559865826171e-05} {"step": 80610, "timestamp": 1778281675.0179353, "train/loss": 2.0992486596107485, "train/z_loss": 0.001406866661272943, "train/perplexity": 8.160036643661696, "train/grad_norm": 0.06640625, "optim/muon_lr": 0.0003074038028717041, "optim/adamw_lr": 9.222114086151122e-06, "perf/tokens_per_sec": 2020686.1820728346, "perf/iters_per_sec": 0.9635382566799329, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037841510772705, "data/tokens_consumed": 169053519872, "data/tokens_consumed_B": 169.053519872, "train/loss_slope": 1.424869749948778e-05} {"step": 80620, "timestamp": 1778281685.4018583, "train/loss": 2.1078622460365297, "train/z_loss": 0.0014113586861640216, "train/perplexity": 8.230627407904118, "train/grad_norm": 0.0673828125, "optim/muon_lr": 0.00029796361923217773, "optim/adamw_lr": 8.938908576965332e-06, "perf/tokens_per_sec": 2020917.8923501323, "perf/iters_per_sec": 0.963648744750086, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0377225160598755, "data/tokens_consumed": 169074491392, "data/tokens_consumed_B": 169.074491392, "train/loss_slope": 1.7032762486549084e-05} {"step": 80625, "timestamp": 1778281691.19623, "eos/sharpness": 23.980855941772457, "eos/L0_probe": 1.9063899517059326, "eos/L_plus": 2.023832082748413, "eos/L_minus": 2.0287563800811768, "eos/grad_norm": 0.07921894639730453, "eos/embed_grad_frac": 0.3013882339000702, "eos/time_s": 0.6038072109222412} {"step": 80625, "timestamp": 1778281692.5755124, "geo/rankme_last": 440.6748962402344, "geo/layer_0/stable_rank_q_proj": 18.704662322998047, "geo/layer_0/stable_rank_k_proj": 15.683818817138672, "geo/layer_0/stable_rank_o_proj": 46.28262710571289, "geo/layer_0/stable_rank_gate_proj": 127.02632141113281, "geo/layer_0/stable_rank_down_proj": 56.835819244384766, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.0646679699420929, "geo/layer_0/attn_entropy_mean": 6.1265950202941895, "geo/layer_0/attn_entropy_std": 0.42694833874702454, "geo/layer_7/stable_rank_q_proj": 42.53425979614258, "geo/layer_7/stable_rank_k_proj": 40.04243087768555, "geo/layer_7/stable_rank_o_proj": 87.89848327636719, "geo/layer_7/stable_rank_gate_proj": 76.83164978027344, "geo/layer_7/stable_rank_down_proj": 140.3545684814453, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4482068121433258, "geo/layer_7/attn_entropy_mean": 4.641918182373047, "geo/layer_7/attn_entropy_std": 0.7660396099090576, "geo/layer_14/stable_rank_q_proj": 49.10493469238281, "geo/layer_14/stable_rank_k_proj": 41.483829498291016, "geo/layer_14/stable_rank_o_proj": 43.081787109375, "geo/layer_14/stable_rank_gate_proj": 70.23651123046875, "geo/layer_14/stable_rank_down_proj": 124.98805236816406, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.40848296880722046, "geo/layer_14/attn_entropy_mean": 5.49831485748291, "geo/layer_14/attn_entropy_std": 0.4085279107093811, "geo/layer_21/stable_rank_q_proj": 39.38815689086914, "geo/layer_21/stable_rank_k_proj": 30.022083282470703, "geo/layer_21/stable_rank_o_proj": 67.86400604248047, "geo/layer_21/stable_rank_gate_proj": 62.880943298339844, "geo/layer_21/stable_rank_down_proj": 49.24117660522461, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15078109502792358, "geo/layer_21/attn_entropy_mean": 5.676089286804199, "geo/layer_21/attn_entropy_std": 0.2875096797943115, "geo/layer_27/stable_rank_q_proj": 43.790069580078125, "geo/layer_27/stable_rank_k_proj": 32.336273193359375, "geo/layer_27/stable_rank_o_proj": 115.30040740966797, "geo/layer_27/stable_rank_gate_proj": 76.21598052978516, "geo/layer_27/stable_rank_down_proj": 127.76737213134766, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08250367641448975, "geo/layer_27/attn_entropy_mean": 4.140419006347656, "geo/layer_27/attn_entropy_std": 0.7957261800765991, "attnres/final_alpha/block_0": 0.23846708238124847, "attnres/block_norm/0": 1.7787479162216187, "attnres/final_alpha/block_1": 0.0039166053757071495, "attnres/block_norm/1": 47402.609375, "attnres/final_alpha/block_2": 0.009073339402675629, "attnres/block_norm/2": 29176.6171875, "attnres/final_alpha/block_3": 0.010539844632148743, "attnres/block_norm/3": 63823.7578125, "attnres/final_alpha/block_4": 0.012491308152675629, "attnres/block_norm/4": 15719.6787109375, "attnres/final_alpha/block_5": 0.6184605360031128, "attnres/block_norm/5": 6808.01123046875, "attnres/final_alpha/block_6": 0.10705132782459259, "attnres/block_norm/6": 40914.9765625, "geo/tier1_time_s": 1.3598434925079346, "geo/step": 80625.0, "geo/rankme_slope": 0.0002271620757678071} {"step": 80630, "timestamp": 1778281697.7668061, "train/loss": 2.1057578921318054, "train/z_loss": 0.0014011096907779574, "train/perplexity": 8.21332546606925, "train/grad_norm": 0.0771484375, "optim/muon_lr": 0.00028866887092590334, "optim/adamw_lr": 8.6600661277771e-06, "perf/tokens_per_sec": 1697016.7956791585, "perf/iters_per_sec": 0.8092006662746232, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2357874155044555, "data/tokens_consumed": 169095462912, "data/tokens_consumed_B": 169.095462912, "train/loss_slope": 1.595469835174358e-05} {"step": 80640, "timestamp": 1778281708.14854, "train/loss": 2.1121190667152403, "train/z_loss": 0.0013949708081781864, "train/perplexity": 8.265738390411945, "train/grad_norm": 0.0703125, "optim/muon_lr": 0.0002795195579528809, "optim/adamw_lr": 8.385586738586424e-06, "perf/tokens_per_sec": 2020978.115080574, "perf/iters_per_sec": 0.9636774611857291, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.037691593170166, "data/tokens_consumed": 169116434432, "data/tokens_consumed_B": 169.116434432, "train/loss_slope": 1.9736018489868526e-05} {"step": 80650, "timestamp": 1778281718.5196166, "grad/layer_0/attn": 0.0032276217825710773, "grad/layer_0/mlp": 0.0032698127906769514, "grad/layer_0/attn_mlp_ratio": 0.9870967821350967, "grad/layer_4/attn": 0.0022979502100497484, "grad/layer_4/mlp": 0.0025158452335745096, "grad/layer_4/attn_mlp_ratio": 0.9133908906811946, "grad/layer_8/attn": 0.0033499114215373993, "grad/layer_8/mlp": 0.0034873399417847395, "grad/layer_8/attn_mlp_ratio": 0.9605921365279287, "grad/layer_12/attn": 0.005010900553315878, "grad/layer_12/mlp": 0.00627380795776844, "grad/layer_12/attn_mlp_ratio": 0.7987016031055034, "grad/layer_16/attn": 0.00311989220790565, "grad/layer_16/mlp": 0.004141387064009905, "grad/layer_16/attn_mlp_ratio": 0.7533447331412525, "grad/layer_20/attn": 0.0026552672497928143, "grad/layer_20/mlp": 0.0045849839225411415, "grad/layer_20/attn_mlp_ratio": 0.5791224651468212, "grad/layer_24/attn": 0.003865252248942852, "grad/layer_24/mlp": 0.006659637670964003, "grad/layer_24/attn_mlp_ratio": 0.5803997727617166, "grad/layer_27/attn": 0.0050655086524784565, "grad/layer_27/mlp": 0.005502684041857719, "grad/layer_27/attn_mlp_ratio": 0.9205523199026155} {"step": 80650, "timestamp": 1778281718.5353913, "train/loss": 2.0794841170310976, "train/z_loss": 0.0014162075240164995, "train/perplexity": 8.000340610060839, "train/grad_norm": 0.06982421875, "optim/muon_lr": 0.0002705150842666626, "optim/adamw_lr": 8.115452527999877e-06, "perf/tokens_per_sec": 2020079.8364515095, "perf/iters_per_sec": 0.9632491285569713, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0381530284881593, "data/tokens_consumed": 169137405952, "data/tokens_consumed_B": 169.137405952, "train/loss_slope": 2.059958493283944e-05} {"step": 80660, "timestamp": 1778281728.921843, "train/loss": 2.0817102909088137, "train/z_loss": 0.0014099101652391254, "train/perplexity": 8.018170598303257, "train/grad_norm": 0.0791015625, "optim/muon_lr": 0.00026165664196014404, "optim/adamw_lr": 7.849699258804321e-06, "perf/tokens_per_sec": 2020314.7951140872, "perf/iters_per_sec": 0.9633611655779301, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0380322933197021, "data/tokens_consumed": 169158377472, "data/tokens_consumed_B": 169.158377472, "train/loss_slope": 1.9474105544538075e-05} {"step": 80670, "timestamp": 1778281739.297833, "train/loss": 2.1010928630828856, "train/z_loss": 0.0014050683821551502, "train/perplexity": 8.175099296602014, "train/grad_norm": 0.07080078125, "optim/muon_lr": 0.0002529430389404297, "optim/adamw_lr": 7.58829116821289e-06, "perf/tokens_per_sec": 2022508.5675090745, "perf/iters_per_sec": 0.9644072377725003, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.036906361579895, "data/tokens_consumed": 169179348992, "data/tokens_consumed_B": 169.179348992, "train/loss_slope": 2.1315977411015485e-05} {"step": 80680, "timestamp": 1778281749.9964848, "train/loss": 2.0534276604652404, "train/z_loss": 0.0014219989301636816, "train/perplexity": 7.794572517985228, "train/grad_norm": 0.078125, "optim/muon_lr": 0.00024437546730041507, "optim/adamw_lr": 7.33126401901245e-06, "perf/tokens_per_sec": 1961301.8942836458, "perf/iters_per_sec": 0.9352216216486195, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0692652702331542, "data/tokens_consumed": 169200320512, "data/tokens_consumed_B": 169.200320512, "train/loss_slope": 1.930369862986512e-05} {"step": 80690, "timestamp": 1778281760.955002, "train/loss": 2.0529504299163817, "train/z_loss": 0.0014105546521022915, "train/perplexity": 7.790853597326206, "train/grad_norm": 0.0673828125, "optim/muon_lr": 0.0002359539270401001, "optim/adamw_lr": 7.078617811203002e-06, "perf/tokens_per_sec": 1914768.833336896, "perf/iters_per_sec": 0.9130329291042786, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0952507495880126, "data/tokens_consumed": 169221292032, "data/tokens_consumed_B": 169.221292032, "train/loss_slope": 1.676788018195434e-05} {"step": 80700, "timestamp": 1778281771.327326, "grad/layer_0/attn": 0.002580545609816909, "grad/layer_0/mlp": 0.002852834528312087, "grad/layer_0/attn_mlp_ratio": 0.9045549237965184, "grad/layer_4/attn": 0.0019010603427886963, "grad/layer_4/mlp": 0.002547826385125518, "grad/layer_4/attn_mlp_ratio": 0.746149847286418, "grad/layer_8/attn": 0.003719821572303772, "grad/layer_8/mlp": 0.0033427239395678043, "grad/layer_8/attn_mlp_ratio": 1.1128114460757952, "grad/layer_12/attn": 0.003616787726059556, "grad/layer_12/mlp": 0.006243661977350712, "grad/layer_12/attn_mlp_ratio": 0.5792734586293015, "grad/layer_16/attn": 0.003255635965615511, "grad/layer_16/mlp": 0.004126668907701969, "grad/layer_16/attn_mlp_ratio": 0.7889258769092526, "grad/layer_20/attn": 0.002427203580737114, "grad/layer_20/mlp": 0.004447300452739, "grad/layer_20/attn_mlp_ratio": 0.5457700805137287, "grad/layer_24/attn": 0.003539071651175618, "grad/layer_24/mlp": 0.006235267501324415, "grad/layer_24/attn_mlp_ratio": 0.5675893766650679, "grad/layer_27/attn": 0.0034534959122538567, "grad/layer_27/mlp": 0.005425039678812027, "grad/layer_27/attn_mlp_ratio": 0.6365844404942045} {"step": 80700, "timestamp": 1778281771.930367, "eos/sharpness": 13.166475296020506, "eos/L0_probe": 1.9070768356323242, "eos/L_plus": 1.9747307300567627, "eos/L_minus": 1.9710876941680908, "eos/grad_norm": 0.0710788294672966, "eos/embed_grad_frac": 0.3337278664112091, "eos/time_s": 0.6003513336181641} {"step": 80700, "timestamp": 1778281771.9495041, "train/loss": 2.064514636993408, "train/z_loss": 0.0014153936528600753, "train/perplexity": 7.8814715944075635, "train/grad_norm": 0.0712890625, "optim/muon_lr": 0.00022767782211303712, "optim/adamw_lr": 6.830334663391113e-06, "perf/tokens_per_sec": 1908616.143859357, "perf/iters_per_sec": 0.9100990981385026, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.098781442642212, "data/tokens_consumed": 169242263552, "data/tokens_consumed_B": 169.242263552, "train/loss_slope": 1.7130852031736354e-05} {"step": 80700, "timestamp": 1778281773.3121622, "geo/rankme_last": 440.7282409667969, "geo/layer_0/stable_rank_q_proj": 18.70500373840332, "geo/layer_0/stable_rank_k_proj": 15.683759689331055, "geo/layer_0/stable_rank_o_proj": 46.283626556396484, "geo/layer_0/stable_rank_gate_proj": 127.02542877197266, "geo/layer_0/stable_rank_down_proj": 56.82951354980469, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06258832663297653, "geo/layer_0/attn_entropy_mean": 6.126934051513672, "geo/layer_0/attn_entropy_std": 0.4270766079425812, "geo/layer_7/stable_rank_q_proj": 42.535614013671875, "geo/layer_7/stable_rank_k_proj": 40.04155349731445, "geo/layer_7/stable_rank_o_proj": 87.90414428710938, "geo/layer_7/stable_rank_gate_proj": 76.83164978027344, "geo/layer_7/stable_rank_down_proj": 140.35562133789062, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.44935280084609985, "geo/layer_7/attn_entropy_mean": 4.638509750366211, "geo/layer_7/attn_entropy_std": 0.7661536335945129, "geo/layer_14/stable_rank_q_proj": 49.104896545410156, "geo/layer_14/stable_rank_k_proj": 41.48371887207031, "geo/layer_14/stable_rank_o_proj": 43.08216094970703, "geo/layer_14/stable_rank_gate_proj": 70.23905181884766, "geo/layer_14/stable_rank_down_proj": 124.99144744873047, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.41181883215904236, "geo/layer_14/attn_entropy_mean": 5.496101379394531, "geo/layer_14/attn_entropy_std": 0.4088188707828522, "geo/layer_21/stable_rank_q_proj": 39.38608932495117, "geo/layer_21/stable_rank_k_proj": 30.022125244140625, "geo/layer_21/stable_rank_o_proj": 67.86261749267578, "geo/layer_21/stable_rank_gate_proj": 62.882484436035156, "geo/layer_21/stable_rank_down_proj": 49.24147033691406, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1475754827260971, "geo/layer_21/attn_entropy_mean": 5.6751017570495605, "geo/layer_21/attn_entropy_std": 0.2875315248966217, "geo/layer_27/stable_rank_q_proj": 43.78939437866211, "geo/layer_27/stable_rank_k_proj": 32.336273193359375, "geo/layer_27/stable_rank_o_proj": 115.29811096191406, "geo/layer_27/stable_rank_gate_proj": 76.2144775390625, "geo/layer_27/stable_rank_down_proj": 127.77952575683594, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09207545965909958, "geo/layer_27/attn_entropy_mean": 4.1426682472229, "geo/layer_27/attn_entropy_std": 0.7960352897644043, "attnres/final_alpha/block_0": 0.2385915368795395, "attnres/block_norm/0": 1.7787739038467407, "attnres/final_alpha/block_1": 0.003928347025066614, "attnres/block_norm/1": 47423.13671875, "attnres/final_alpha/block_2": 0.009064778685569763, "attnres/block_norm/2": 29117.986328125, "attnres/final_alpha/block_3": 0.010517863556742668, "attnres/block_norm/3": 63880.73046875, "attnres/final_alpha/block_4": 0.012488191947340965, "attnres/block_norm/4": 15719.9921875, "attnres/final_alpha/block_5": 0.6182276010513306, "attnres/block_norm/5": 6807.228515625, "attnres/final_alpha/block_6": 0.10718169063329697, "attnres/block_norm/6": 40919.1875, "geo/tier1_time_s": 1.358614444732666, "geo/step": 80700.0, "geo/rankme_slope": 0.00022446695475065027} {"step": 80710, "timestamp": 1778281783.6939135, "train/loss": 2.085573709011078, "train/z_loss": 0.0014082220732234418, "train/perplexity": 8.04920806048098, "train/grad_norm": 0.06982421875, "optim/muon_lr": 0.00021954834461212157, "optim/adamw_lr": 6.586450338363647e-06, "perf/tokens_per_sec": 1786244.1207395655, "perf/iters_per_sec": 0.8517475703904941, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1740567684173584, "data/tokens_consumed": 169263235072, "data/tokens_consumed_B": 169.263235072, "train/loss_slope": 1.849774633339492e-05} {"step": 80720, "timestamp": 1778281794.0765102, "train/loss": 2.073677349090576, "train/z_loss": 0.0014153980067931116, "train/perplexity": 7.954019107957535, "train/grad_norm": 0.0966796875, "optim/muon_lr": 0.00021156489849090578, "optim/adamw_lr": 6.346946954727172e-06, "perf/tokens_per_sec": 2020933.9111531856, "perf/iters_per_sec": 0.9636563831106117, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0377142906188965, "data/tokens_consumed": 169284206592, "data/tokens_consumed_B": 169.284206592, "train/loss_slope": 1.7050568091057436e-05} {"step": 80730, "timestamp": 1778281804.457937, "train/loss": 2.0615297079086305, "train/z_loss": 0.001421540800947696, "train/perplexity": 7.857981036880106, "train/grad_norm": 0.08056640625, "optim/muon_lr": 0.00020372807979583742, "optim/adamw_lr": 6.111842393875122e-06, "perf/tokens_per_sec": 2021076.9772903062, "perf/iters_per_sec": 0.9637246023608714, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0376408338546752, "data/tokens_consumed": 169305178112, "data/tokens_consumed_B": 169.305178112, "train/loss_slope": 1.541187005682531e-05} {"step": 80740, "timestamp": 1778281815.3277671, "train/loss": 2.0905182123184205, "train/z_loss": 0.0014129365095868706, "train/perplexity": 8.089105952701889, "train/grad_norm": 0.0693359375, "optim/muon_lr": 0.000196036696434021, "optim/adamw_lr": 5.8811008930206296e-06, "perf/tokens_per_sec": 1930155.8297348465, "perf/iters_per_sec": 0.920370020739959, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0865195274353028, "data/tokens_consumed": 169326149632, "data/tokens_consumed_B": 169.326149632, "train/loss_slope": 1.9102649434064192e-05} {"step": 80750, "timestamp": 1778281825.7071867, "grad/layer_0/attn": 0.0022606910206377506, "grad/layer_0/mlp": 0.002739871386438608, "grad/layer_0/attn_mlp_ratio": 0.8251084154229018, "grad/layer_4/attn": 0.0020254291594028473, "grad/layer_4/mlp": 0.0024569351226091385, "grad/layer_4/attn_mlp_ratio": 0.8243722263267267, "grad/layer_8/attn": 0.004426029045134783, "grad/layer_8/mlp": 0.0033767942804843187, "grad/layer_8/attn_mlp_ratio": 1.310719145564314, "grad/layer_12/attn": 0.0035721559543162584, "grad/layer_12/mlp": 0.00599040137603879, "grad/layer_12/attn_mlp_ratio": 0.5963132802708878, "grad/layer_16/attn": 0.0033981488086283207, "grad/layer_16/mlp": 0.00417247973382473, "grad/layer_16/attn_mlp_ratio": 0.8144194685090632, "grad/layer_20/attn": 0.002437818795442581, "grad/layer_20/mlp": 0.004444478079676628, "grad/layer_20/attn_mlp_ratio": 0.548505065586789, "grad/layer_24/attn": 0.003682255744934082, "grad/layer_24/mlp": 0.006343398708850145, "grad/layer_24/attn_mlp_ratio": 0.5804862433994682, "grad/layer_27/attn": 0.003924105316400528, "grad/layer_27/mlp": 0.005893835797905922, "grad/layer_27/attn_mlp_ratio": 0.6657981973666355} {"step": 80750, "timestamp": 1778281825.7233853, "train/loss": 2.0750887751579286, "train/z_loss": 0.0014100666739977896, "train/perplexity": 7.9652535442898404, "train/grad_norm": 0.0771484375, "optim/muon_lr": 0.0001884925365447998, "optim/adamw_lr": 5.654776096343994e-06, "perf/tokens_per_sec": 2018698.3797725441, "perf/iters_per_sec": 0.9625903986799927, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.038863468170166, "data/tokens_consumed": 169347121152, "data/tokens_consumed_B": 169.347121152, "train/loss_slope": 2.0706581389836076e-05} {"step": 80760, "timestamp": 1778281836.6534882, "train/loss": 2.089469242095947, "train/z_loss": 0.0014062080066651107, "train/perplexity": 8.080625170252842, "train/grad_norm": 0.09130859375, "optim/muon_lr": 0.00018109500408172607, "optim/adamw_lr": 5.432850122451781e-06, "perf/tokens_per_sec": 1919667.2747675872, "perf/iters_per_sec": 0.9153686879957138, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0924559831619263, "data/tokens_consumed": 169368092672, "data/tokens_consumed_B": 169.368092672, "train/loss_slope": 2.286709666144837e-05} {"step": 80770, "timestamp": 1778281847.0132172, "train/loss": 2.1218264579772947, "train/z_loss": 0.0013972365180961789, "train/perplexity": 8.346367864799927, "train/grad_norm": 0.0771484375, "optim/muon_lr": 0.0001738440990447998, "optim/adamw_lr": 5.215322971343994e-06, "perf/tokens_per_sec": 2025767.607118181, "perf/iters_per_sec": 0.965961268958178, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352381944656373, "data/tokens_consumed": 169389064192, "data/tokens_consumed_B": 169.389064192, "train/loss_slope": 2.396795656671761e-05} {"step": 80775, "timestamp": 1778281852.8253007, "eos/sharpness": 1.8408656120300289, "eos/L0_probe": 1.906942367553711, "eos/L_plus": 1.9170480966567993, "eos/L_minus": 1.9152452945709229, "eos/grad_norm": 0.0674995705485344, "eos/embed_grad_frac": 0.37718522548675537, "eos/time_s": 0.6486854553222656} {"step": 80775, "timestamp": 1778281854.2058072, "geo/rankme_last": 440.70892333984375, "geo/layer_0/stable_rank_q_proj": 18.70560646057129, "geo/layer_0/stable_rank_k_proj": 15.683388710021973, "geo/layer_0/stable_rank_o_proj": 46.28511047363281, "geo/layer_0/stable_rank_gate_proj": 127.02912902832031, "geo/layer_0/stable_rank_down_proj": 56.8343391418457, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06567452847957611, "geo/layer_0/attn_entropy_mean": 6.1266937255859375, "geo/layer_0/attn_entropy_std": 0.42704856395721436, "geo/layer_7/stable_rank_q_proj": 42.53402328491211, "geo/layer_7/stable_rank_k_proj": 40.04192352294922, "geo/layer_7/stable_rank_o_proj": 87.90068054199219, "geo/layer_7/stable_rank_gate_proj": 76.8339614868164, "geo/layer_7/stable_rank_down_proj": 140.352783203125, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.45149168372154236, "geo/layer_7/attn_entropy_mean": 4.637740135192871, "geo/layer_7/attn_entropy_std": 0.7663549184799194, "geo/layer_14/stable_rank_q_proj": 49.104042053222656, "geo/layer_14/stable_rank_k_proj": 41.484188079833984, "geo/layer_14/stable_rank_o_proj": 43.082725524902344, "geo/layer_14/stable_rank_gate_proj": 70.23533630371094, "geo/layer_14/stable_rank_down_proj": 124.99231719970703, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4047556519508362, "geo/layer_14/attn_entropy_mean": 5.495637893676758, "geo/layer_14/attn_entropy_std": 0.40970683097839355, "geo/layer_21/stable_rank_q_proj": 39.38665771484375, "geo/layer_21/stable_rank_k_proj": 30.0224666595459, "geo/layer_21/stable_rank_o_proj": 67.86507415771484, "geo/layer_21/stable_rank_gate_proj": 62.88019943237305, "geo/layer_21/stable_rank_down_proj": 49.23786163330078, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15505695343017578, "geo/layer_21/attn_entropy_mean": 5.674996852874756, "geo/layer_21/attn_entropy_std": 0.2872874140739441, "geo/layer_27/stable_rank_q_proj": 43.78810119628906, "geo/layer_27/stable_rank_k_proj": 32.33580017089844, "geo/layer_27/stable_rank_o_proj": 115.30418395996094, "geo/layer_27/stable_rank_gate_proj": 76.21324157714844, "geo/layer_27/stable_rank_down_proj": 127.77788543701172, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08003177493810654, "geo/layer_27/attn_entropy_mean": 4.140575885772705, "geo/layer_27/attn_entropy_std": 0.7958795428276062, "attnres/final_alpha/block_0": 0.23865659534931183, "attnres/block_norm/0": 1.7787165641784668, "attnres/final_alpha/block_1": 0.003937490750104189, "attnres/block_norm/1": 47425.8984375, "attnres/final_alpha/block_2": 0.009065999649465084, "attnres/block_norm/2": 29113.16796875, "attnres/final_alpha/block_3": 0.010536383837461472, "attnres/block_norm/3": 63995.29296875, "attnres/final_alpha/block_4": 0.012504942715168, "attnres/block_norm/4": 15714.2783203125, "attnres/final_alpha/block_5": 0.6185834407806396, "attnres/block_norm/5": 6799.4033203125, "attnres/final_alpha/block_6": 0.10671517252922058, "attnres/block_norm/6": 40931.3203125, "geo/tier1_time_s": 1.3596765995025635, "geo/step": 80775.0, "geo/rankme_slope": 0.00019994917889030613} {"step": 80780, "timestamp": 1778281859.385195, "train/loss": 2.0368962049484254, "train/z_loss": 0.0014310221886262298, "train/perplexity": 7.666776129718059, "train/grad_norm": 0.07373046875, "optim/muon_lr": 0.00016673982143402099, "optim/adamw_lr": 5.00219464302063e-06, "perf/tokens_per_sec": 1695704.3040298577, "perf/iters_per_sec": 0.8085748214864052, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2367439270019531, "data/tokens_consumed": 169410035712, "data/tokens_consumed_B": 169.410035712, "train/loss_slope": 2.2097520595050387e-05} {"step": 80790, "timestamp": 1778281869.7346485, "train/loss": 2.0981525778770447, "train/z_loss": 0.0014054971397854388, "train/perplexity": 8.151097476474115, "train/grad_norm": 0.0673828125, "optim/muon_lr": 0.0001597827672958374, "optim/adamw_lr": 4.793483018875121e-06, "perf/tokens_per_sec": 2027122.6469535453, "perf/iters_per_sec": 0.9666074023025252, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034546184539795, "data/tokens_consumed": 169431007232, "data/tokens_consumed_B": 169.431007232, "train/loss_slope": 2.1309615111444007e-05} {"step": 80800, "timestamp": 1778281880.0723338, "grad/layer_0/attn": 0.0024191103875637054, "grad/layer_0/mlp": 0.0026968372985720634, "grad/layer_0/attn_mlp_ratio": 0.8970175172016636, "grad/layer_4/attn": 0.002170410705730319, "grad/layer_4/mlp": 0.0024987405631691217, "grad/layer_4/attn_mlp_ratio": 0.8686018272010726, "grad/layer_8/attn": 0.0038756791036576033, "grad/layer_8/mlp": 0.003435603342950344, "grad/layer_8/attn_mlp_ratio": 1.128092682410791, "grad/layer_12/attn": 0.006787223741412163, "grad/layer_12/mlp": 0.00625050812959671, "grad/layer_12/attn_mlp_ratio": 1.0858674994257358, "grad/layer_16/attn": 0.0035059223882853985, "grad/layer_16/mlp": 0.004417917691171169, "grad/layer_16/attn_mlp_ratio": 0.7935689512583699, "grad/layer_20/attn": 0.0028741632122546434, "grad/layer_20/mlp": 0.005238272715359926, "grad/layer_20/attn_mlp_ratio": 0.5486852849333999, "grad/layer_24/attn": 0.004696287214756012, "grad/layer_24/mlp": 0.006188956089317799, "grad/layer_24/attn_mlp_ratio": 0.7588173305963697, "grad/layer_27/attn": 0.005048480350524187, "grad/layer_27/mlp": 0.005442761350423098, "grad/layer_27/attn_mlp_ratio": 0.9275586292931761} {"step": 80800, "timestamp": 1778281880.0881512, "train/loss": 2.1121164560317993, "train/z_loss": 0.0014007727964781224, "train/perplexity": 8.26571681121377, "train/grad_norm": 0.07470703125, "optim/muon_lr": 0.00015297293663024903, "optim/adamw_lr": 4.58918809890747e-06, "perf/tokens_per_sec": 2026378.4929077618, "perf/iters_per_sec": 0.9662525620020684, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349261045455933, "data/tokens_consumed": 169451978752, "data/tokens_consumed_B": 169.451978752, "train/loss_slope": 2.1811497122040393e-05} {"step": 80810, "timestamp": 1778281890.44277, "train/loss": 2.0591862559318543, "train/z_loss": 0.0014091948280110956, "train/perplexity": 7.839587795941793, "train/grad_norm": 0.0791015625, "optim/muon_lr": 0.00014631032943725585, "optim/adamw_lr": 4.389309883117676e-06, "perf/tokens_per_sec": 2026147.7223914897, "perf/iters_per_sec": 0.9661425220448921, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035043978691101, "data/tokens_consumed": 169472950272, "data/tokens_consumed_B": 169.472950272, "train/loss_slope": 2.035017736030347e-05} {"step": 80820, "timestamp": 1778281900.7968295, "train/loss": 2.0770439982414244, "train/z_loss": 0.0014008923899382353, "train/perplexity": 7.980842626986414, "train/grad_norm": 0.0673828125, "optim/muon_lr": 0.00013979434967041016, "optim/adamw_lr": 4.1938304901123045e-06, "perf/tokens_per_sec": 2026342.8283341562, "perf/iters_per_sec": 0.9662355558081418, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0349443197250365, "data/tokens_consumed": 169493921792, "data/tokens_consumed_B": 169.493921792, "train/loss_slope": 2.0276081587078666e-05} {"step": 80830, "timestamp": 1778281911.145056, "train/loss": 2.078982102870941, "train/z_loss": 0.0014173585805110633, "train/perplexity": 7.996325333735625, "train/grad_norm": 0.08251953125, "optim/muon_lr": 0.0001334267854690552, "optim/adamw_lr": 4.002803564071655e-06, "perf/tokens_per_sec": 2027613.569295635, "perf/iters_per_sec": 0.9668414923170257, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342957019805907, "data/tokens_consumed": 169514893312, "data/tokens_consumed_B": 169.514893312, "train/loss_slope": 1.8037089677748253e-05} {"step": 80840, "timestamp": 1778281921.49391, "train/loss": 2.0742849111557007, "train/z_loss": 0.0014209948712959886, "train/perplexity": 7.958853136570311, "train/grad_norm": 0.0810546875, "optim/muon_lr": 0.00012720584869384765, "optim/adamw_lr": 3.81617546081543e-06, "perf/tokens_per_sec": 2027463.7347386333, "perf/iters_per_sec": 0.9667700456326643, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0343721389770508, "data/tokens_consumed": 169535864832, "data/tokens_consumed_B": 169.535864832, "train/loss_slope": 1.8305357504706413e-05} {"step": 80850, "timestamp": 1778281931.8343368, "grad/layer_0/attn": 0.0028357491828501225, "grad/layer_0/mlp": 0.0031067708041518927, "grad/layer_0/attn_mlp_ratio": 0.9127641755175522, "grad/layer_4/attn": 0.002181237330660224, "grad/layer_4/mlp": 0.0024496798869222403, "grad/layer_4/attn_mlp_ratio": 0.8904172554394398, "grad/layer_8/attn": 0.0031434455886483192, "grad/layer_8/mlp": 0.0033793291077017784, "grad/layer_8/attn_mlp_ratio": 0.9301980941910423, "grad/layer_12/attn": 0.0035382884088903666, "grad/layer_12/mlp": 0.00666580768302083, "grad/layer_12/attn_mlp_ratio": 0.5308116471499685, "grad/layer_16/attn": 0.0029216159600764513, "grad/layer_16/mlp": 0.004050713963806629, "grad/layer_16/attn_mlp_ratio": 0.721259490068989, "grad/layer_20/attn": 0.0028100700583308935, "grad/layer_20/mlp": 0.0047267829068005085, "grad/layer_20/attn_mlp_ratio": 0.5944994839593852, "grad/layer_24/attn": 0.003560506971552968, "grad/layer_24/mlp": 0.006330450065433979, "grad/layer_24/attn_mlp_ratio": 0.5624413554338249, "grad/layer_27/attn": 0.0030822260305285454, "grad/layer_27/mlp": 0.005440928507596254, "grad/layer_27/attn_mlp_ratio": 0.5664889677518193} {"step": 80850, "timestamp": 1778281932.444576, "eos/sharpness": 7.135415077209471, "eos/L0_probe": 1.9068161249160767, "eos/L_plus": 1.94353187084198, "eos/L_minus": 1.941454529762268, "eos/grad_norm": 0.07043062895536423, "eos/embed_grad_frac": 0.33533334732055664, "eos/time_s": 0.6074941158294678} {"step": 80850, "timestamp": 1778281932.4641268, "train/loss": 2.0609147191047668, "train/z_loss": 0.001426099590025842, "train/perplexity": 7.853149952205207, "train/grad_norm": 0.0703125, "optim/muon_lr": 0.0001211327314376831, "optim/adamw_lr": 3.6339819431304927e-06, "perf/tokens_per_sec": 1912572.5335806112, "perf/iters_per_sec": 0.9119856517699295, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0965084791183473, "data/tokens_consumed": 169556836352, "data/tokens_consumed_B": 169.556836352, "train/loss_slope": 1.6345514637408823e-05} {"step": 80850, "timestamp": 1778281933.8294787, "geo/rankme_last": 440.62615966796875, "geo/layer_0/stable_rank_q_proj": 18.704832077026367, "geo/layer_0/stable_rank_k_proj": 15.683059692382812, "geo/layer_0/stable_rank_o_proj": 46.28373718261719, "geo/layer_0/stable_rank_gate_proj": 127.03486633300781, "geo/layer_0/stable_rank_down_proj": 56.83447265625, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06258413195610046, "geo/layer_0/attn_entropy_mean": 6.12691593170166, "geo/layer_0/attn_entropy_std": 0.4272952377796173, "geo/layer_7/stable_rank_q_proj": 42.5355339050293, "geo/layer_7/stable_rank_k_proj": 40.0427131652832, "geo/layer_7/stable_rank_o_proj": 87.90129852294922, "geo/layer_7/stable_rank_gate_proj": 76.83148956298828, "geo/layer_7/stable_rank_down_proj": 140.35906982421875, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4581925868988037, "geo/layer_7/attn_entropy_mean": 4.640237331390381, "geo/layer_7/attn_entropy_std": 0.766436755657196, "geo/layer_14/stable_rank_q_proj": 49.10646057128906, "geo/layer_14/stable_rank_k_proj": 41.48395538330078, "geo/layer_14/stable_rank_o_proj": 43.08258819580078, "geo/layer_14/stable_rank_gate_proj": 70.2371826171875, "geo/layer_14/stable_rank_down_proj": 124.9950942993164, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.40748491883277893, "geo/layer_14/attn_entropy_mean": 5.495335578918457, "geo/layer_14/attn_entropy_std": 0.4100706875324249, "geo/layer_21/stable_rank_q_proj": 39.388057708740234, "geo/layer_21/stable_rank_k_proj": 30.02187728881836, "geo/layer_21/stable_rank_o_proj": 67.86521911621094, "geo/layer_21/stable_rank_gate_proj": 62.880699157714844, "geo/layer_21/stable_rank_down_proj": 49.241615295410156, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14943619072437286, "geo/layer_21/attn_entropy_mean": 5.675410270690918, "geo/layer_21/attn_entropy_std": 0.2871389091014862, "geo/layer_27/stable_rank_q_proj": 43.78901672363281, "geo/layer_27/stable_rank_k_proj": 32.33681869506836, "geo/layer_27/stable_rank_o_proj": 115.30233001708984, "geo/layer_27/stable_rank_gate_proj": 76.2175064086914, "geo/layer_27/stable_rank_down_proj": 127.79071807861328, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09190963953733444, "geo/layer_27/attn_entropy_mean": 4.141796112060547, "geo/layer_27/attn_entropy_std": 0.7953528165817261, "attnres/final_alpha/block_0": 0.23856037855148315, "attnres/block_norm/0": 1.7786824703216553, "attnres/final_alpha/block_1": 0.003934906795620918, "attnres/block_norm/1": 47418.296875, "attnres/final_alpha/block_2": 0.009058093652129173, "attnres/block_norm/2": 29149.8828125, "attnres/final_alpha/block_3": 0.01050675567239523, "attnres/block_norm/3": 64007.703125, "attnres/final_alpha/block_4": 0.012474335730075836, "attnres/block_norm/4": 15706.61328125, "attnres/final_alpha/block_5": 0.6186076402664185, "attnres/block_norm/5": 6802.0693359375, "attnres/final_alpha/block_6": 0.10685792565345764, "attnres/block_norm/6": 40941.02734375, "geo/tier1_time_s": 1.3611361980438232, "geo/step": 80850.0, "geo/rankme_slope": 0.00017946920955882353} {"step": 80860, "timestamp": 1778281944.1842234, "train/loss": 2.0789567470550536, "train/z_loss": 0.001406095689162612, "train/perplexity": 7.996122582953154, "train/grad_norm": 0.0693359375, "optim/muon_lr": 0.00011520743370056153, "optim/adamw_lr": 3.4562230110168456e-06, "perf/tokens_per_sec": 1789975.175082995, "perf/iters_per_sec": 0.8535266757407165, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1716095447540282, "data/tokens_consumed": 169577807872, "data/tokens_consumed_B": 169.577807872, "train/loss_slope": 1.632787541087019e-05} {"step": 80870, "timestamp": 1778281954.5429404, "train/loss": 2.0949323892593386, "train/z_loss": 0.0014085289323702455, "train/perplexity": 8.12489162170157, "train/grad_norm": 0.07080078125, "optim/muon_lr": 0.00010942935943603516, "optim/adamw_lr": 3.2828807830810545e-06, "perf/tokens_per_sec": 2026944.7661567405, "perf/iters_per_sec": 0.9665225821288779, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346369743347168, "data/tokens_consumed": 169598779392, "data/tokens_consumed_B": 169.598779392, "train/loss_slope": 1.872410025998627e-05} {"step": 80880, "timestamp": 1778281964.8969777, "train/loss": 2.0502379655838014, "train/z_loss": 0.0014150569797493518, "train/perplexity": 7.769749819384765, "train/grad_norm": 0.07275390625, "optim/muon_lr": 0.00010379970073699951, "optim/adamw_lr": 3.113991022109985e-06, "perf/tokens_per_sec": 2026521.2104461042, "perf/iters_per_sec": 0.9663206150274773, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034853219985962, "data/tokens_consumed": 169619750912, "data/tokens_consumed_B": 169.619750912, "train/loss_slope": 1.619837914291455e-05} {"step": 80890, "timestamp": 1778281975.249401, "train/loss": 2.0618423581123353, "train/z_loss": 0.0014123800559900702, "train/perplexity": 7.860438220351435, "train/grad_norm": 0.06689453125, "optim/muon_lr": 9.831786155700684e-05, "optim/adamw_lr": 2.9495358467102046e-06, "perf/tokens_per_sec": 2026783.635348705, "perf/iters_per_sec": 0.9664457489722753, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347192287445068, "data/tokens_consumed": 169640722432, "data/tokens_consumed_B": 169.640722432, "train/loss_slope": 1.340898504637753e-05} {"step": 80900, "timestamp": 1778281985.601622, "grad/layer_0/attn": 0.0026648747734725475, "grad/layer_0/mlp": 0.0029276590794324875, "grad/layer_0/attn_mlp_ratio": 0.9102407794575053, "grad/layer_4/attn": 0.002293469849973917, "grad/layer_4/mlp": 0.00273109576664865, "grad/layer_4/attn_mlp_ratio": 0.8397617520428722, "grad/layer_8/attn": 0.004142949357628822, "grad/layer_8/mlp": 0.003601955948397517, "grad/layer_8/attn_mlp_ratio": 1.1501943116357538, "grad/layer_12/attn": 0.005153613165020943, "grad/layer_12/mlp": 0.0065820529125630856, "grad/layer_12/attn_mlp_ratio": 0.7829795893749721, "grad/layer_16/attn": 0.003050131956115365, "grad/layer_16/mlp": 0.0042909192852675915, "grad/layer_16/attn_mlp_ratio": 0.710834131861732, "grad/layer_20/attn": 0.00376999843865633, "grad/layer_20/mlp": 0.0047782231122255325, "grad/layer_20/attn_mlp_ratio": 0.7889958821953809, "grad/layer_24/attn": 0.004226380493491888, "grad/layer_24/mlp": 0.006404005456715822, "grad/layer_24/attn_mlp_ratio": 0.6599589047919737, "grad/layer_27/attn": 0.0034983663354068995, "grad/layer_27/mlp": 0.005693640559911728, "grad/layer_27/attn_mlp_ratio": 0.6144339877362646} {"step": 80900, "timestamp": 1778281985.6176112, "train/loss": 2.0797307014465334, "train/z_loss": 0.0014063556678593158, "train/perplexity": 8.002313612619302, "train/grad_norm": 0.07861328125, "optim/muon_lr": 9.298384189605713e-05, "optim/adamw_lr": 2.7895152568817134e-06, "perf/tokens_per_sec": 2024095.507351225, "perf/iters_per_sec": 0.9651639496570706, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0360934019088746, "data/tokens_consumed": 169661693952, "data/tokens_consumed_B": 169.661693952, "train/loss_slope": 1.3929905320587275e-05} {"step": 80910, "timestamp": 1778281995.972999, "train/loss": 2.0783771753311155, "train/z_loss": 0.001413543487433344, "train/perplexity": 7.991489599105836, "train/grad_norm": 0.07275390625, "optim/muon_lr": 8.779823780059814e-05, "optim/adamw_lr": 2.633947134017944e-06, "perf/tokens_per_sec": 2026082.290965328, "perf/iters_per_sec": 0.966111321909584, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.035077404975891, "data/tokens_consumed": 169682665472, "data/tokens_consumed_B": 169.682665472, "train/loss_slope": 1.487973295029238e-05} {"step": 80920, "timestamp": 1778282006.3320577, "train/loss": 2.059592914581299, "train/z_loss": 0.001412627927493304, "train/perplexity": 7.842776480436216, "train/grad_norm": 0.07080078125, "optim/muon_lr": 8.275985717773438e-05, "optim/adamw_lr": 2.482795715332031e-06, "perf/tokens_per_sec": 2025641.6023438608, "perf/iters_per_sec": 0.9659011851996712, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0353025913238525, "data/tokens_consumed": 169703636992, "data/tokens_consumed_B": 169.703636992, "train/loss_slope": 1.1483712944582397e-05} {"step": 80925, "timestamp": 1778282012.1090019, "eos/sharpness": 1.3680934906005857, "eos/L0_probe": 1.906845211982727, "eos/L_plus": 1.914033055305481, "eos/L_minus": 1.913338303565979, "eos/grad_norm": 0.06760703772306442, "eos/embed_grad_frac": 0.3844158947467804, "eos/time_s": 0.6069090366363525} {"step": 80925, "timestamp": 1778282013.486372, "geo/rankme_last": 440.56396484375, "geo/layer_0/stable_rank_q_proj": 18.705503463745117, "geo/layer_0/stable_rank_k_proj": 15.683481216430664, "geo/layer_0/stable_rank_o_proj": 46.283626556396484, "geo/layer_0/stable_rank_gate_proj": 127.03192138671875, "geo/layer_0/stable_rank_down_proj": 56.830284118652344, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06273864954710007, "geo/layer_0/attn_entropy_mean": 6.1274333000183105, "geo/layer_0/attn_entropy_std": 0.4268394112586975, "geo/layer_7/stable_rank_q_proj": 42.53370666503906, "geo/layer_7/stable_rank_k_proj": 40.041526794433594, "geo/layer_7/stable_rank_o_proj": 87.90614318847656, "geo/layer_7/stable_rank_gate_proj": 76.83301544189453, "geo/layer_7/stable_rank_down_proj": 140.35401916503906, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4629581868648529, "geo/layer_7/attn_entropy_mean": 4.641277313232422, "geo/layer_7/attn_entropy_std": 0.7662138938903809, "geo/layer_14/stable_rank_q_proj": 49.104522705078125, "geo/layer_14/stable_rank_k_proj": 41.48412322998047, "geo/layer_14/stable_rank_o_proj": 43.08228302001953, "geo/layer_14/stable_rank_gate_proj": 70.23810577392578, "geo/layer_14/stable_rank_down_proj": 124.99513244628906, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4083183705806732, "geo/layer_14/attn_entropy_mean": 5.495562553405762, "geo/layer_14/attn_entropy_std": 0.40950125455856323, "geo/layer_21/stable_rank_q_proj": 39.38778305053711, "geo/layer_21/stable_rank_k_proj": 30.02349090576172, "geo/layer_21/stable_rank_o_proj": 67.86434173583984, "geo/layer_21/stable_rank_gate_proj": 62.88172912597656, "geo/layer_21/stable_rank_down_proj": 49.24171447753906, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1474035382270813, "geo/layer_21/attn_entropy_mean": 5.675243377685547, "geo/layer_21/attn_entropy_std": 0.2875649333000183, "geo/layer_27/stable_rank_q_proj": 43.7901725769043, "geo/layer_27/stable_rank_k_proj": 32.33627700805664, "geo/layer_27/stable_rank_o_proj": 115.30242919921875, "geo/layer_27/stable_rank_gate_proj": 76.2149429321289, "geo/layer_27/stable_rank_down_proj": 127.77792358398438, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08747057616710663, "geo/layer_27/attn_entropy_mean": 4.142055511474609, "geo/layer_27/attn_entropy_std": 0.7944045662879944, "attnres/final_alpha/block_0": 0.23882272839546204, "attnres/block_norm/0": 1.7786414623260498, "attnres/final_alpha/block_1": 0.003935754299163818, "attnres/block_norm/1": 47415.9609375, "attnres/final_alpha/block_2": 0.00906821247190237, "attnres/block_norm/2": 29127.7109375, "attnres/final_alpha/block_3": 0.01053602434694767, "attnres/block_norm/3": 63854.4453125, "attnres/final_alpha/block_4": 0.012483086436986923, "attnres/block_norm/4": 15725.90625, "attnres/final_alpha/block_5": 0.6178925037384033, "attnres/block_norm/5": 6804.3740234375, "attnres/final_alpha/block_6": 0.10726173222064972, "attnres/block_norm/6": 40964.37890625, "geo/tier1_time_s": 1.3580198287963867, "geo/step": 80925.0, "geo/rankme_slope": 0.00015141576161714686} {"step": 80930, "timestamp": 1778282018.6691747, "train/loss": 2.0704259157180784, "train/z_loss": 0.0014054284547455609, "train/perplexity": 7.928199143478525, "train/grad_norm": 0.07275390625, "optim/muon_lr": 7.787108421325684e-05, "optim/adamw_lr": 2.336132526397705e-06, "perf/tokens_per_sec": 1700749.7532156224, "perf/iters_per_sec": 0.8109806791380035, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.233074998855591, "data/tokens_consumed": 169724608512, "data/tokens_consumed_B": 169.724608512, "train/loss_slope": 1.3791491883029205e-05} {"step": 80940, "timestamp": 1778282029.0187266, "train/loss": 2.0971439361572264, "train/z_loss": 0.0014127849834039807, "train/perplexity": 8.142880084395946, "train/grad_norm": 0.0751953125, "optim/muon_lr": 7.312953472137452e-05, "optim/adamw_lr": 2.1938860416412352e-06, "perf/tokens_per_sec": 2027266.2169056495, "perf/iters_per_sec": 0.9666758617904899, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034472918510437, "data/tokens_consumed": 169745580032, "data/tokens_consumed_B": 169.745580032, "train/loss_slope": 1.3729908349740704e-05} {"step": 80950, "timestamp": 1778282039.358363, "grad/layer_0/attn": 0.0026842691004276276, "grad/layer_0/mlp": 0.002827436663210392, "grad/layer_0/attn_mlp_ratio": 0.9493648577235708, "grad/layer_4/attn": 0.0018092028331011534, "grad/layer_4/mlp": 0.002722575329244137, "grad/layer_4/attn_mlp_ratio": 0.664518901356367, "grad/layer_8/attn": 0.004710892681032419, "grad/layer_8/mlp": 0.003659014357253909, "grad/layer_8/attn_mlp_ratio": 1.2874758315570978, "grad/layer_12/attn": 0.004541098605841398, "grad/layer_12/mlp": 0.006590723525732756, "grad/layer_12/attn_mlp_ratio": 0.6890136597613012, "grad/layer_16/attn": 0.0029150641057640314, "grad/layer_16/mlp": 0.004271605983376503, "grad/layer_16/attn_mlp_ratio": 0.682428119275411, "grad/layer_20/attn": 0.0037250861059874296, "grad/layer_20/mlp": 0.004875910468399525, "grad/layer_20/attn_mlp_ratio": 0.7639775286547467, "grad/layer_24/attn": 0.003465942805632949, "grad/layer_24/mlp": 0.006575002800673246, "grad/layer_24/attn_mlp_ratio": 0.5271393576538277, "grad/layer_27/attn": 0.0037469230592250824, "grad/layer_27/mlp": 0.005572429392486811, "grad/layer_27/attn_mlp_ratio": 0.6724038526242423} {"step": 80950, "timestamp": 1778282039.3741488, "train/loss": 2.0727344155311584, "train/z_loss": 0.001411312399432063, "train/perplexity": 7.9465225313506735, "train/grad_norm": 0.07177734375, "optim/muon_lr": 6.853699684143067e-05, "optim/adamw_lr": 2.0561099052429197e-06, "perf/tokens_per_sec": 2026019.9903610747, "perf/iters_per_sec": 0.9660816146664976, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0351092338562011, "data/tokens_consumed": 169766551552, "data/tokens_consumed_B": 169.766551552, "train/loss_slope": 1.3573097796401425e-05} {"step": 80960, "timestamp": 1778282049.725797, "train/loss": 2.047254967689514, "train/z_loss": 0.0014177240431308747, "train/perplexity": 7.746607206378112, "train/grad_norm": 0.0673828125, "optim/muon_lr": 6.409227848052978e-05, "optim/adamw_lr": 1.9227683544158934e-06, "perf/tokens_per_sec": 2026863.7305046332, "perf/iters_per_sec": 0.9664839413188139, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034678339958191, "data/tokens_consumed": 169787523072, "data/tokens_consumed_B": 169.787523072, "train/loss_slope": 1.2024846772263367e-05} {"step": 80970, "timestamp": 1778282060.0755584, "train/loss": 2.0507100105285643, "train/z_loss": 0.001408811891451478, "train/perplexity": 7.77341835629811, "train/grad_norm": 0.07373046875, "optim/muon_lr": 5.9796571731567384e-05, "optim/adamw_lr": 1.7938971519470214e-06, "perf/tokens_per_sec": 2027111.3416251666, "perf/iters_per_sec": 0.9666020115018685, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345519542694093, "data/tokens_consumed": 169808494592, "data/tokens_consumed_B": 169.808494592, "train/loss_slope": 7.408352682191961e-06} {"step": 80980, "timestamp": 1778282070.4377065, "train/loss": 2.0539135694503785, "train/z_loss": 0.00142203465802595, "train/perplexity": 7.798360891134765, "train/grad_norm": 0.0693359375, "optim/muon_lr": 5.56492805480957e-05, "optim/adamw_lr": 1.6694784164428709e-06, "perf/tokens_per_sec": 2024988.5041697493, "perf/iters_per_sec": 0.9655897637223002, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0356364965438842, "data/tokens_consumed": 169829466112, "data/tokens_consumed_B": 169.829466112, "train/loss_slope": 4.926193693969535e-06} {"step": 80990, "timestamp": 1778282080.790701, "train/loss": 2.1207882881164553, "train/z_loss": 0.0013926622224971652, "train/perplexity": 8.33770741352281, "train/grad_norm": 0.07177734375, "optim/muon_lr": 5.165040493011475e-05, "optim/adamw_lr": 1.5495121479034422e-06, "perf/tokens_per_sec": 2026613.8452130372, "perf/iters_per_sec": 0.9663647867264925, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034805917739868, "data/tokens_consumed": 169850437632, "data/tokens_consumed_B": 169.850437632, "train/loss_slope": 5.360742678271675e-06} {"step": 81000, "timestamp": 1778282091.1340232, "grad/layer_0/attn": 0.0024262862280011177, "grad/layer_0/mlp": 0.0028479595202952623, "grad/layer_0/attn_mlp_ratio": 0.8519384231120427, "grad/layer_4/attn": 0.002067880006507039, "grad/layer_4/mlp": 0.002435294445604086, "grad/layer_4/attn_mlp_ratio": 0.8491293220525372, "grad/layer_8/attn": 0.00380469742231071, "grad/layer_8/mlp": 0.003373078303411603, "grad/layer_8/attn_mlp_ratio": 1.1279599722504399, "grad/layer_12/attn": 0.004338592756539583, "grad/layer_12/mlp": 0.006365613080561161, "grad/layer_12/attn_mlp_ratio": 0.6815671379135096, "grad/layer_16/attn": 0.002865297719836235, "grad/layer_16/mlp": 0.0041243634186685085, "grad/layer_16/attn_mlp_ratio": 0.6947248240526711, "grad/layer_20/attn": 0.0037830809596925974, "grad/layer_20/mlp": 0.0047422489151358604, "grad/layer_20/attn_mlp_ratio": 0.7977398377052995, "grad/layer_24/attn": 0.004143669269979, "grad/layer_24/mlp": 0.00610788119956851, "grad/layer_24/attn_mlp_ratio": 0.6784135229137032, "grad/layer_27/attn": 0.003916799556463957, "grad/layer_27/mlp": 0.005359909497201443, "grad/layer_27/attn_mlp_ratio": 0.7307585110220939} {"step": 81000, "timestamp": 1778282091.7504427, "eos/sharpness": 2.0286679267883296, "eos/L0_probe": 1.9064722061157227, "eos/L_plus": 1.916534423828125, "eos/L_minus": 1.9166966676712036, "eos/grad_norm": 0.06490898877382278, "eos/embed_grad_frac": 0.40121227502822876, "eos/time_s": 0.6135902404785156} {"step": 81000, "timestamp": 1778282091.7701259, "train/loss": 2.065945839881897, "train/z_loss": 0.0014088867930695415, "train/perplexity": 7.892759655144771, "train/grad_norm": 0.06494140625, "optim/muon_lr": 4.780054092407227e-05, "optim/adamw_lr": 1.4340162277221678e-06, "perf/tokens_per_sec": 1910902.4898035768, "perf/iters_per_sec": 0.9111893128412136, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.097466778755188, "data/tokens_consumed": 169871409152, "data/tokens_consumed_B": 169.871409152, "train/loss_slope": 3.2402133498147063e-06} {"step": 81000, "timestamp": 1778282093.1339195, "geo/rankme_last": 440.5479736328125, "geo/layer_0/stable_rank_q_proj": 18.70465850830078, "geo/layer_0/stable_rank_k_proj": 15.683950424194336, "geo/layer_0/stable_rank_o_proj": 46.28282165527344, "geo/layer_0/stable_rank_gate_proj": 127.02352142333984, "geo/layer_0/stable_rank_down_proj": 56.835540771484375, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06749489158391953, "geo/layer_0/attn_entropy_mean": 6.126950263977051, "geo/layer_0/attn_entropy_std": 0.4270012378692627, "geo/layer_7/stable_rank_q_proj": 42.533843994140625, "geo/layer_7/stable_rank_k_proj": 40.04212188720703, "geo/layer_7/stable_rank_o_proj": 87.90253448486328, "geo/layer_7/stable_rank_gate_proj": 76.83158111572266, "geo/layer_7/stable_rank_down_proj": 140.35678100585938, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4547335207462311, "geo/layer_7/attn_entropy_mean": 4.6415791511535645, "geo/layer_7/attn_entropy_std": 0.7664867639541626, "geo/layer_14/stable_rank_q_proj": 49.10525894165039, "geo/layer_14/stable_rank_k_proj": 41.4835090637207, "geo/layer_14/stable_rank_o_proj": 43.08086013793945, "geo/layer_14/stable_rank_gate_proj": 70.23721313476562, "geo/layer_14/stable_rank_down_proj": 124.99073028564453, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4054144620895386, "geo/layer_14/attn_entropy_mean": 5.494328498840332, "geo/layer_14/attn_entropy_std": 0.4090738892555237, "geo/layer_21/stable_rank_q_proj": 39.38726806640625, "geo/layer_21/stable_rank_k_proj": 30.022506713867188, "geo/layer_21/stable_rank_o_proj": 67.86632537841797, "geo/layer_21/stable_rank_gate_proj": 62.88136291503906, "geo/layer_21/stable_rank_down_proj": 49.24199676513672, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.14907969534397125, "geo/layer_21/attn_entropy_mean": 5.67411470413208, "geo/layer_21/attn_entropy_std": 0.28740161657333374, "geo/layer_27/stable_rank_q_proj": 43.79076385498047, "geo/layer_27/stable_rank_k_proj": 32.33681869506836, "geo/layer_27/stable_rank_o_proj": 115.30906677246094, "geo/layer_27/stable_rank_gate_proj": 76.21778869628906, "geo/layer_27/stable_rank_down_proj": 127.77801513671875, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.08725155144929886, "geo/layer_27/attn_entropy_mean": 4.141422271728516, "geo/layer_27/attn_entropy_std": 0.7958040833473206, "attnres/final_alpha/block_0": 0.23866122961044312, "attnres/block_norm/0": 1.7786173820495605, "attnres/final_alpha/block_1": 0.003938561305403709, "attnres/block_norm/1": 47400.9921875, "attnres/final_alpha/block_2": 0.00905777420848608, "attnres/block_norm/2": 29152.787109375, "attnres/final_alpha/block_3": 0.010531429201364517, "attnres/block_norm/3": 63858.76953125, "attnres/final_alpha/block_4": 0.012474281713366508, "attnres/block_norm/4": 15715.8408203125, "attnres/final_alpha/block_5": 0.6182175874710083, "attnres/block_norm/5": 6804.2353515625, "attnres/final_alpha/block_6": 0.1071191281080246, "attnres/block_norm/6": 40967.36328125, "geo/tier1_time_s": 1.3596382141113281, "geo/step": 81000.0, "geo/rankme_slope": 0.00012710648712610043} {"step": 81000, "timestamp": 1778282100.0121508, "geo/ww_alpha_mean": 7.710766624189326, "geo/ww_alpha_std": 4.645734278103707, "geo/ww_alpha_min": 1.3391955404728815, "geo/ww_alpha_max": 31.39584294028386, "geo/ww_alpha_healthy_frac": 0.18274111675126903, "geo/ww_alpha_by_type/q_proj": 3.922661014058247, "geo/ww_alpha_by_type/k_proj": 4.443847195145585, "geo/ww_alpha_by_type/v_proj": 8.67965739537012, "geo/ww_alpha_by_type/o_proj": 9.441526349916629, "geo/ww_alpha_by_type/gate_proj": 8.04093549427902, "geo/ww_alpha_by_type/up_proj": 11.512432316416675, "geo/ww_alpha_by_type/down_proj": 8.020847429358948, "geo/twonn_id/layer_0": 0.7260813117027283, "geo/twonn_id/layer_7": 3.232982635498047, "geo/twonn_id/layer_14": 5.178991794586182, "geo/twonn_id/layer_21": 6.991806507110596, "geo/twonn_id/layer_27": 5.755934715270996, "geo/tier2_time_s": 6.872282981872559} {"step": 81000, "timestamp": 1778282100.667326, "eoc/jacobian_sigma/layer_0/attn": 1350.3935546875, "eoc/jacobian_sigma/layer_0/mlp": 9279.1259765625, "eoc/jacobian_sigma/layer_0": 9279.1259765625, "eoc/jacobian_sigma/layer_7/attn": 1.1502351760864258, "eoc/jacobian_sigma/layer_7/mlp": 1.9356873035430908, "eoc/jacobian_sigma/layer_7": 1.9356873035430908, "eoc/jacobian_sigma/layer_14/attn": 1.359102487564087, "eoc/jacobian_sigma/layer_14/mlp": 6.7254743576049805, "eoc/jacobian_sigma/layer_14": 6.7254743576049805, "eoc/jacobian_sigma/layer_21/attn": 1.0927945375442505, "eoc/jacobian_sigma/layer_21/mlp": 4.2824249267578125, "eoc/jacobian_sigma/layer_21": 4.2824249267578125, "eoc/jacobian_sigma/layer_27/attn": 3.160684108734131, "eoc/jacobian_sigma/layer_27/mlp": 27.663375854492188, "eoc/jacobian_sigma/layer_27": 27.663375854492188, "eoc/layer0_sigma": 9279.1259765625, "eoc/sigma_max": 27.663375854492188, "eoc/sigma_min": 1.9356873035430908, "eoc/sigma_mean": 10.151740610599518, "eoc/time_s": 0.6465413570404053} {"step": 81010, "timestamp": 1778282111.0401976, "train/loss": 2.0567666888237, "train/z_loss": 0.0014152245479635895, "train/perplexity": 7.820642316341605, "train/grad_norm": 0.0673828125, "optim/muon_lr": 4.409968852996826e-05, "optim/adamw_lr": 1.3229906558990477e-06, "perf/tokens_per_sec": 1088512.8449787858, "perf/iters_per_sec": 0.519043371667283, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.9266212701797485, "data/tokens_consumed": 169892380672, "data/tokens_consumed_B": 169.892380672, "train/loss_slope": 1.7463647457274722e-06} {"step": 81020, "timestamp": 1778282121.4022253, "train/loss": 2.125382089614868, "train/z_loss": 0.0014042905531823635, "train/perplexity": 8.376097296571691, "train/grad_norm": 0.06982421875, "optim/muon_lr": 4.054725170135498e-05, "optim/adamw_lr": 1.2164175510406493e-06, "perf/tokens_per_sec": 2024752.4106194978, "perf/iters_per_sec": 0.9654771855447282, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0357572555541992, "data/tokens_consumed": 169913352192, "data/tokens_consumed_B": 169.913352192, "train/loss_slope": 4.541577729168762e-06} {"step": 81030, "timestamp": 1778282131.7522898, "train/loss": 2.123077464103699, "train/z_loss": 0.0013994423090480268, "train/perplexity": 8.356815755957342, "train/grad_norm": 0.0654296875, "optim/muon_lr": 3.714382648468017e-05, "optim/adamw_lr": 1.1143147945404053e-06, "perf/tokens_per_sec": 2027205.1983592166, "perf/iters_per_sec": 0.9666467658802111, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345040559768677, "data/tokens_consumed": 169934323712, "data/tokens_consumed_B": 169.934323712, "train/loss_slope": 6.2638441864663484e-06} {"step": 81040, "timestamp": 1778282142.1060007, "train/loss": 2.056182312965393, "train/z_loss": 0.0014101162203587591, "train/perplexity": 7.816073456871088, "train/grad_norm": 0.0693359375, "optim/muon_lr": 3.38888168334961e-05, "optim/adamw_lr": 1.0166645050048828e-06, "perf/tokens_per_sec": 2026797.9726169955, "perf/iters_per_sec": 0.9664525855145433, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0347119092941284, "data/tokens_consumed": 169955295232, "data/tokens_consumed_B": 169.955295232, "train/loss_slope": 3.5084997395155394e-06} {"step": 81050, "timestamp": 1778282152.4439626, "grad/layer_0/attn": 0.0024754542391747236, "grad/layer_0/mlp": 0.002736002206802368, "grad/layer_0/attn_mlp_ratio": 0.904770523409334, "grad/layer_4/attn": 0.0019405712373554707, "grad/layer_4/mlp": 0.002579842461273074, "grad/layer_4/attn_mlp_ratio": 0.7522052959688584, "grad/layer_8/attn": 0.00482328562065959, "grad/layer_8/mlp": 0.0034941337071359158, "grad/layer_8/attn_mlp_ratio": 1.3803952243641058, "grad/layer_12/attn": 0.0037854351103305817, "grad/layer_12/mlp": 0.0065038204193115234, "grad/layer_12/attn_mlp_ratio": 0.5820325298169972, "grad/layer_16/attn": 0.0034663639962673187, "grad/layer_16/mlp": 0.004147992003709078, "grad/layer_16/attn_mlp_ratio": 0.8356727567460275, "grad/layer_20/attn": 0.0024173050187528133, "grad/layer_20/mlp": 0.00480280164629221, "grad/layer_20/attn_mlp_ratio": 0.5033114307953658, "grad/layer_24/attn": 0.003976107574999332, "grad/layer_24/mlp": 0.006154767237603664, "grad/layer_24/attn_mlp_ratio": 0.6460207765623542, "grad/layer_27/attn": 0.003450040239840746, "grad/layer_27/mlp": 0.005532087292522192, "grad/layer_27/attn_mlp_ratio": 0.6236416735759126} {"step": 81050, "timestamp": 1778282152.4594986, "train/loss": 2.1094938039779665, "train/z_loss": 0.001399468444287777, "train/perplexity": 8.244067114262958, "train/grad_norm": 0.07861328125, "optim/muon_lr": 3.0783414840698244e-05, "optim/adamw_lr": 9.235024452209472e-07, "perf/tokens_per_sec": 2026587.2305268822, "perf/iters_per_sec": 0.9663520958551799, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348195075988769, "data/tokens_consumed": 169976266752, "data/tokens_consumed_B": 169.976266752, "train/loss_slope": 5.4338070139525685e-06} {"step": 81060, "timestamp": 1778282162.810024, "train/loss": 2.0769896626472475, "train/z_loss": 0.0014078063424676657, "train/perplexity": 7.980408994941178, "train/grad_norm": 0.08056640625, "optim/muon_lr": 2.7826428413391115e-05, "optim/adamw_lr": 8.347928524017334e-07, "perf/tokens_per_sec": 2027162.3100325752, "perf/iters_per_sec": 0.9666263151324154, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345259428024292, "data/tokens_consumed": 169997238272, "data/tokens_consumed_B": 169.997238272, "train/loss_slope": 7.564679831668775e-06} {"step": 81070, "timestamp": 1778282173.1614769, "train/loss": 2.0364492535591125, "train/z_loss": 0.0014204180799424649, "train/perplexity": 7.663350219140109, "train/grad_norm": 0.07275390625, "optim/muon_lr": 2.5019049644470215e-05, "optim/adamw_lr": 7.505714893341064e-07, "perf/tokens_per_sec": 2027710.8380777293, "perf/iters_per_sec": 0.9668878736866614, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0342460870742798, "data/tokens_consumed": 170018209792, "data/tokens_consumed_B": 170.018209792, "train/loss_slope": 4.250831865813684e-06} {"step": 81075, "timestamp": 1778282178.9386148, "eos/sharpness": 32.91704654693603, "eos/L0_probe": 1.905948519706726, "eos/L_plus": 2.0840394496917725, "eos/L_minus": 2.05702805519104, "eos/grad_norm": 0.0801272839307785, "eos/embed_grad_frac": 0.2796635329723358, "eos/time_s": 0.6139371395111084} {"step": 81075, "timestamp": 1778282180.3163073, "geo/rankme_last": 440.52130126953125, "geo/layer_0/stable_rank_q_proj": 18.704723358154297, "geo/layer_0/stable_rank_k_proj": 15.684211730957031, "geo/layer_0/stable_rank_o_proj": 46.28206253051758, "geo/layer_0/stable_rank_gate_proj": 127.03484344482422, "geo/layer_0/stable_rank_down_proj": 56.8353271484375, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06388454884290695, "geo/layer_0/attn_entropy_mean": 6.127157211303711, "geo/layer_0/attn_entropy_std": 0.4269618093967438, "geo/layer_7/stable_rank_q_proj": 42.53506088256836, "geo/layer_7/stable_rank_k_proj": 40.042724609375, "geo/layer_7/stable_rank_o_proj": 87.90540313720703, "geo/layer_7/stable_rank_gate_proj": 76.83222198486328, "geo/layer_7/stable_rank_down_proj": 140.3517608642578, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4533829391002655, "geo/layer_7/attn_entropy_mean": 4.643169403076172, "geo/layer_7/attn_entropy_std": 0.7662287950515747, "geo/layer_14/stable_rank_q_proj": 49.10676956176758, "geo/layer_14/stable_rank_k_proj": 41.484283447265625, "geo/layer_14/stable_rank_o_proj": 43.081336975097656, "geo/layer_14/stable_rank_gate_proj": 70.23861694335938, "geo/layer_14/stable_rank_down_proj": 124.99317932128906, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.41696134209632874, "geo/layer_14/attn_entropy_mean": 5.493953704833984, "geo/layer_14/attn_entropy_std": 0.4107866585254669, "geo/layer_21/stable_rank_q_proj": 39.38703155517578, "geo/layer_21/stable_rank_k_proj": 30.022212982177734, "geo/layer_21/stable_rank_o_proj": 67.86466217041016, "geo/layer_21/stable_rank_gate_proj": 62.8797721862793, "geo/layer_21/stable_rank_down_proj": 49.23975372314453, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15098756551742554, "geo/layer_21/attn_entropy_mean": 5.675267219543457, "geo/layer_21/attn_entropy_std": 0.28757014870643616, "geo/layer_27/stable_rank_q_proj": 43.78957748413086, "geo/layer_27/stable_rank_k_proj": 32.33574676513672, "geo/layer_27/stable_rank_o_proj": 115.30237579345703, "geo/layer_27/stable_rank_gate_proj": 76.21797180175781, "geo/layer_27/stable_rank_down_proj": 127.77811431884766, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09689007699489594, "geo/layer_27/attn_entropy_mean": 4.140342712402344, "geo/layer_27/attn_entropy_std": 0.7971565127372742, "attnres/final_alpha/block_0": 0.2384296953678131, "attnres/block_norm/0": 1.7786061763763428, "attnres/final_alpha/block_1": 0.003930324222892523, "attnres/block_norm/1": 47395.546875, "attnres/final_alpha/block_2": 0.009055019356310368, "attnres/block_norm/2": 29171.078125, "attnres/final_alpha/block_3": 0.01053742878139019, "attnres/block_norm/3": 63897.859375, "attnres/final_alpha/block_4": 0.012458959594368935, "attnres/block_norm/4": 15722.2646484375, "attnres/final_alpha/block_5": 0.6185095310211182, "attnres/block_norm/5": 6802.8056640625, "attnres/final_alpha/block_6": 0.10707898437976837, "attnres/block_norm/6": 40972.953125, "geo/tier1_time_s": 1.3584470748901367, "geo/step": 81075.0, "geo/rankme_slope": 0.00010382248602566027} {"step": 81080, "timestamp": 1778282185.4942305, "train/loss": 2.0677571654319764, "train/z_loss": 0.001418228109832853, "train/perplexity": 7.907068967868355, "train/grad_norm": 0.06689453125, "optim/muon_lr": 2.2360682487487795e-05, "optim/adamw_lr": 6.708204746246337e-07, "perf/tokens_per_sec": 1701173.1476580629, "perf/iters_per_sec": 0.8111825693407358, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2327681064605713, "data/tokens_consumed": 170039181312, "data/tokens_consumed_B": 170.039181312, "train/loss_slope": 3.6610616780672295e-06} {"step": 81090, "timestamp": 1778282195.8453302, "train/loss": 2.047633695602417, "train/z_loss": 0.001410658413078636, "train/perplexity": 7.7495416183942565, "train/grad_norm": 0.080078125, "optim/muon_lr": 1.985132694244385e-05, "optim/adamw_lr": 5.955398082733153e-07, "perf/tokens_per_sec": 2026912.8648932842, "perf/iters_per_sec": 0.9665073704210683, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346532583236694, "data/tokens_consumed": 170060152832, "data/tokens_consumed_B": 170.060152832, "train/loss_slope": -1.2796871661424693e-06} {"step": 81100, "timestamp": 1778282206.1855803, "grad/layer_0/attn": 0.0022434035781770945, "grad/layer_0/mlp": 0.0026450238656252623, "grad/layer_0/attn_mlp_ratio": 0.8481600194676391, "grad/layer_4/attn": 0.0018713524332270026, "grad/layer_4/mlp": 0.002520894631743431, "grad/layer_4/attn_mlp_ratio": 0.7423366036124761, "grad/layer_8/attn": 0.002908077323809266, "grad/layer_8/mlp": 0.003394104540348053, "grad/layer_8/attn_mlp_ratio": 0.8568024949021764, "grad/layer_12/attn": 0.003388918237760663, "grad/layer_12/mlp": 0.006746494211256504, "grad/layer_12/attn_mlp_ratio": 0.5023228481948416, "grad/layer_16/attn": 0.0029677606653422117, "grad/layer_16/mlp": 0.00424208864569664, "grad/layer_16/attn_mlp_ratio": 0.6995989106434508, "grad/layer_20/attn": 0.003451000200584531, "grad/layer_20/mlp": 0.004844768904149532, "grad/layer_20/attn_mlp_ratio": 0.7123147042983388, "grad/layer_24/attn": 0.004480297677218914, "grad/layer_24/mlp": 0.006717439740896225, "grad/layer_24/attn_mlp_ratio": 0.666965061591257, "grad/layer_27/attn": 0.004151867236942053, "grad/layer_27/mlp": 0.005524805746972561, "grad/layer_27/attn_mlp_ratio": 0.751495591327822} {"step": 81100, "timestamp": 1778282206.2010496, "train/loss": 2.1283897042274473, "train/z_loss": 0.0013986239209771156, "train/perplexity": 8.401327291228876, "train/grad_norm": 0.07666015625, "optim/muon_lr": 1.7490983009338378e-05, "optim/adamw_lr": 5.247294902801513e-07, "perf/tokens_per_sec": 2026148.5158084673, "perf/iters_per_sec": 0.966142900375589, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350435733795167, "data/tokens_consumed": 170081124352, "data/tokens_consumed_B": 170.081124352, "train/loss_slope": 4.039157041371682e-06} {"step": 81110, "timestamp": 1778282216.5660038, "train/loss": 2.034590411186218, "train/z_loss": 0.0014104624860920012, "train/perplexity": 7.649118490402551, "train/grad_norm": 0.07568359375, "optim/muon_lr": 1.5279650688171388e-05, "optim/adamw_lr": 4.5838952064514156e-07, "perf/tokens_per_sec": 2024365.17794732, "perf/iters_per_sec": 0.9652925386177635, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0359553813934326, "data/tokens_consumed": 170102095872, "data/tokens_consumed_B": 170.102095872, "train/loss_slope": -9.94922032486479e-07} {"step": 81120, "timestamp": 1778282226.92361, "train/loss": 2.0443952918052672, "train/z_loss": 0.0014086018200032413, "train/perplexity": 7.724486065287243, "train/grad_norm": 0.0849609375, "optim/muon_lr": 1.3217926025390624e-05, "optim/adamw_lr": 3.965377807617187e-07, "perf/tokens_per_sec": 2025698.7947192, "perf/iters_per_sec": 0.9659284566493989, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0352733612060547, "data/tokens_consumed": 170123067392, "data/tokens_consumed_B": 170.123067392, "train/loss_slope": -2.9793115696533897e-06} {"step": 81130, "timestamp": 1778282237.27873, "train/loss": 2.0771145820617676, "train/z_loss": 0.0014154882519505918, "train/perplexity": 7.981405965229634, "train/grad_norm": 0.06787109375, "optim/muon_lr": 1.1305809020996094e-05, "optim/adamw_lr": 3.391742706298828e-07, "perf/tokens_per_sec": 2026581.113917099, "perf/iters_per_sec": 0.9663491792283531, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348226308822632, "data/tokens_consumed": 170144038912, "data/tokens_consumed_B": 170.144038912, "train/loss_slope": -6.935378305553657e-06} {"step": 81140, "timestamp": 1778282248.1222503, "train/loss": 2.0737651109695436, "train/z_loss": 0.0014170258305966853, "train/perplexity": 7.954717198252204, "train/grad_norm": 0.07421875, "optim/muon_lr": 9.542703628540039e-06, "optim/adamw_lr": 2.8628110885620117e-07, "perf/tokens_per_sec": 1934870.4382371404, "perf/iters_per_sec": 0.9226181212602331, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0838720560073853, "data/tokens_consumed": 170165010432, "data/tokens_consumed_B": 170.165010432, "train/loss_slope": -7.945009040431953e-06} {"step": 81150, "timestamp": 1778282258.4700298, "grad/layer_0/attn": 0.00260789692401886, "grad/layer_0/mlp": 0.0031733873765915632, "grad/layer_0/attn_mlp_ratio": 0.8218022360193848, "grad/layer_4/attn": 0.0025619184598326683, "grad/layer_4/mlp": 0.0025762123987078667, "grad/layer_4/attn_mlp_ratio": 0.9944515295681673, "grad/layer_8/attn": 0.0028393124230206013, "grad/layer_8/mlp": 0.0037085991352796555, "grad/layer_8/attn_mlp_ratio": 0.7656023859387159, "grad/layer_12/attn": 0.00429015327244997, "grad/layer_12/mlp": 0.00624402891844511, "grad/layer_12/attn_mlp_ratio": 0.6870809312027037, "grad/layer_16/attn": 0.0028125226963311434, "grad/layer_16/mlp": 0.004172918852418661, "grad/layer_16/attn_mlp_ratio": 0.6739940862501008, "grad/layer_20/attn": 0.003171245800331235, "grad/layer_20/mlp": 0.004747486673295498, "grad/layer_20/attn_mlp_ratio": 0.667984125447051, "grad/layer_24/attn": 0.004636063706129789, "grad/layer_24/mlp": 0.0063078515231609344, "grad/layer_24/attn_mlp_ratio": 0.7349671462003424, "grad/layer_27/attn": 0.005466035101562738, "grad/layer_27/mlp": 0.0057446882128715515, "grad/layer_27/attn_mlp_ratio": 0.9514937632587547} {"step": 81150, "timestamp": 1778282259.085489, "eos/sharpness": 23.570036888122555, "eos/L0_probe": 1.9071717262268066, "eos/L_plus": 2.013803482055664, "eos/L_minus": 2.036240339279175, "eos/grad_norm": 0.07961701601743698, "eos/embed_grad_frac": 0.2914004921913147, "eos/time_s": 0.6126072406768799} {"step": 81150, "timestamp": 1778282259.1067877, "train/loss": 2.037951099872589, "train/z_loss": 0.0014203407103195787, "train/perplexity": 7.674868040249278, "train/grad_norm": 0.07958984375, "optim/muon_lr": 7.928609848022462e-06, "optim/adamw_lr": 2.378582954406738e-07, "perf/tokens_per_sec": 1910173.3379730626, "perf/iters_per_sec": 0.9108416261544526, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0978857040405274, "data/tokens_consumed": 170185981952, "data/tokens_consumed_B": 170.185981952, "train/loss_slope": -1.0795951333090337e-05} {"step": 81150, "timestamp": 1778282260.4675453, "geo/rankme_last": 440.5376892089844, "geo/layer_0/stable_rank_q_proj": 18.70525360107422, "geo/layer_0/stable_rank_k_proj": 15.683820724487305, "geo/layer_0/stable_rank_o_proj": 46.28334426879883, "geo/layer_0/stable_rank_gate_proj": 127.0299301147461, "geo/layer_0/stable_rank_down_proj": 56.83086013793945, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06457258760929108, "geo/layer_0/attn_entropy_mean": 6.127383232116699, "geo/layer_0/attn_entropy_std": 0.426633358001709, "geo/layer_7/stable_rank_q_proj": 42.53456115722656, "geo/layer_7/stable_rank_k_proj": 40.04158020019531, "geo/layer_7/stable_rank_o_proj": 87.90351104736328, "geo/layer_7/stable_rank_gate_proj": 76.83110046386719, "geo/layer_7/stable_rank_down_proj": 140.35731506347656, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4537893831729889, "geo/layer_7/attn_entropy_mean": 4.640323162078857, "geo/layer_7/attn_entropy_std": 0.7669457197189331, "geo/layer_14/stable_rank_q_proj": 49.105194091796875, "geo/layer_14/stable_rank_k_proj": 41.48394775390625, "geo/layer_14/stable_rank_o_proj": 43.083221435546875, "geo/layer_14/stable_rank_gate_proj": 70.23649597167969, "geo/layer_14/stable_rank_down_proj": 124.99669647216797, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.4135243594646454, "geo/layer_14/attn_entropy_mean": 5.494344711303711, "geo/layer_14/attn_entropy_std": 0.41078469157218933, "geo/layer_21/stable_rank_q_proj": 39.3870735168457, "geo/layer_21/stable_rank_k_proj": 30.022361755371094, "geo/layer_21/stable_rank_o_proj": 67.8643569946289, "geo/layer_21/stable_rank_gate_proj": 62.88190841674805, "geo/layer_21/stable_rank_down_proj": 49.242679595947266, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.15203972160816193, "geo/layer_21/attn_entropy_mean": 5.6747355461120605, "geo/layer_21/attn_entropy_std": 0.2880972921848297, "geo/layer_27/stable_rank_q_proj": 43.79100036621094, "geo/layer_27/stable_rank_k_proj": 32.33681106567383, "geo/layer_27/stable_rank_o_proj": 115.30253601074219, "geo/layer_27/stable_rank_gate_proj": 76.21367645263672, "geo/layer_27/stable_rank_down_proj": 127.79083251953125, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.07702266424894333, "geo/layer_27/attn_entropy_mean": 4.141204833984375, "geo/layer_27/attn_entropy_std": 0.7954135537147522, "attnres/final_alpha/block_0": 0.23874959349632263, "attnres/block_norm/0": 1.7785694599151611, "attnres/final_alpha/block_1": 0.00392775097861886, "attnres/block_norm/1": 47424.89453125, "attnres/final_alpha/block_2": 0.009046152234077454, "attnres/block_norm/2": 29169.30078125, "attnres/final_alpha/block_3": 0.01054577436298132, "attnres/block_norm/3": 64027.9296875, "attnres/final_alpha/block_4": 0.012463614344596863, "attnres/block_norm/4": 15715.029296875, "attnres/final_alpha/block_5": 0.6179887652397156, "attnres/block_norm/5": 6808.3271484375, "attnres/final_alpha/block_6": 0.10727835446596146, "attnres/block_norm/6": 40917.25, "geo/tier1_time_s": 1.3566510677337646, "geo/step": 81150.0, "geo/rankme_slope": 9.51890912615046e-05} {"step": 81160, "timestamp": 1778282270.820135, "train/loss": 2.032514274120331, "train/z_loss": 0.0014117292943410576, "train/perplexity": 7.633254345750955, "train/grad_norm": 0.08349609375, "optim/muon_lr": 6.4641237258911135e-06, "optim/adamw_lr": 1.939237117767334e-07, "perf/tokens_per_sec": 1790991.0361618565, "perf/iters_per_sec": 0.8540110760506899, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.1709450006484985, "data/tokens_consumed": 170206953472, "data/tokens_consumed_B": 170.206953472, "train/loss_slope": -1.5170627394274783e-05} {"step": 81170, "timestamp": 1778282281.176351, "train/loss": 2.0587793350219727, "train/z_loss": 0.0014158022589981557, "train/perplexity": 7.836398352712353, "train/grad_norm": 0.0703125, "optim/muon_lr": 5.148649215698242e-06, "optim/adamw_lr": 1.5445947647094725e-07, "perf/tokens_per_sec": 2026453.5606672564, "perf/iters_per_sec": 0.9662883570991785, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0348877668380738, "data/tokens_consumed": 170227924992, "data/tokens_consumed_B": 170.227924992, "train/loss_slope": -1.5807501825526642e-05} {"step": 81180, "timestamp": 1778282291.5278401, "train/loss": 2.0663739681243896, "train/z_loss": 0.0014112284872680903, "train/perplexity": 7.896139491914507, "train/grad_norm": 0.07470703125, "optim/muon_lr": 3.982782363891602e-06, "optim/adamw_lr": 1.1948347091674804e-07, "perf/tokens_per_sec": 2026855.2770144506, "perf/iters_per_sec": 0.9664799103805783, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0346826553344726, "data/tokens_consumed": 170248896512, "data/tokens_consumed_B": 170.248896512, "train/loss_slope": -1.596418330521332e-05} {"step": 81190, "timestamp": 1778282301.8772287, "train/loss": 2.05372668504715, "train/z_loss": 0.0014085076283663512, "train/perplexity": 7.7969036352869, "train/grad_norm": 0.0712890625, "optim/muon_lr": 2.9665231704711916e-06, "optim/adamw_lr": 8.899569511413573e-08, "perf/tokens_per_sec": 2027282.943917433, "perf/iters_per_sec": 0.966683837851254, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034464383125305, "data/tokens_consumed": 170269868032, "data/tokens_consumed_B": 170.269868032, "train/loss_slope": -1.7266010537077437e-05} {"step": 81200, "timestamp": 1778282312.2228162, "grad/layer_0/attn": 0.0024435529485344887, "grad/layer_0/mlp": 0.0027163776103407145, "grad/layer_0/attn_mlp_ratio": 0.8995630244028157, "grad/layer_4/attn": 0.0018523004837334156, "grad/layer_4/mlp": 0.0024673123843967915, "grad/layer_4/attn_mlp_ratio": 0.7507360723245665, "grad/layer_8/attn": 0.004283933900296688, "grad/layer_8/mlp": 0.003521910635754466, "grad/layer_8/attn_mlp_ratio": 1.2163664049761842, "grad/layer_12/attn": 0.004372906405478716, "grad/layer_12/mlp": 0.005922113545238972, "grad/layer_12/attn_mlp_ratio": 0.7384029870811875, "grad/layer_16/attn": 0.00418334174901247, "grad/layer_16/mlp": 0.004330638330429792, "grad/layer_16/attn_mlp_ratio": 0.9659873056170373, "grad/layer_20/attn": 0.0038255827967077494, "grad/layer_20/mlp": 0.004678265657275915, "grad/layer_20/attn_mlp_ratio": 0.8177352453220464, "grad/layer_24/attn": 0.005939212162047625, "grad/layer_24/mlp": 0.005926679354161024, "grad/layer_24/attn_mlp_ratio": 1.0021146255645397, "grad/layer_27/attn": 0.003198706777766347, "grad/layer_27/mlp": 0.005124362651258707, "grad/layer_27/attn_mlp_ratio": 0.6242155235752275} {"step": 81200, "timestamp": 1778282312.2385812, "train/loss": 2.07025260925293, "train/z_loss": 0.0014167658635415136, "train/perplexity": 7.926825254365347, "train/grad_norm": 0.0693359375, "optim/muon_lr": 2.099871635437012e-06, "optim/adamw_lr": 6.299614906311035e-08, "perf/tokens_per_sec": 2024937.4585498811, "perf/iters_per_sec": 0.9655654232739835, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0356626033782959, "data/tokens_consumed": 170290839552, "data/tokens_consumed_B": 170.290839552, "train/loss_slope": -2.3635335309062678e-05} {"step": 81210, "timestamp": 1778282322.589954, "train/loss": 2.040331280231476, "train/z_loss": 0.0014253223896957933, "train/perplexity": 7.693157367730272, "train/grad_norm": 0.07568359375, "optim/muon_lr": 1.3822317123413087e-06, "optim/adamw_lr": 4.146695137023926e-08, "perf/tokens_per_sec": 2026956.3965889513, "perf/iters_per_sec": 0.966528127951122, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034631037712097, "data/tokens_consumed": 170311811072, "data/tokens_consumed_B": 170.311811072, "train/loss_slope": -2.7980304179233074e-05} {"step": 81220, "timestamp": 1778282332.942955, "train/loss": 2.045227217674255, "train/z_loss": 0.0014171103131957353, "train/perplexity": 7.730914938871929, "train/grad_norm": 0.068359375, "optim/muon_lr": 8.141994476318359e-07, "optim/adamw_lr": 2.4425983428955075e-08, "perf/tokens_per_sec": 2026717.6958666225, "perf/iters_per_sec": 0.9664143065770257, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.034752893447876, "data/tokens_consumed": 170332782592, "data/tokens_consumed_B": 170.332782592, "train/loss_slope": -3.064071453265016e-05} {"step": 81225, "timestamp": 1778282338.71743, "eos/sharpness": 9.080505371093748, "eos/L0_probe": 1.9064635038375854, "eos/L_plus": 1.9489918947219849, "eos/L_minus": 1.9547401666641235, "eos/grad_norm": 0.06714098155498505, "eos/embed_grad_frac": 0.39240318536758423, "eos/time_s": 0.608130693435669} {"step": 81225, "timestamp": 1778282340.0979137, "geo/rankme_last": 440.52130126953125, "geo/layer_0/stable_rank_q_proj": 18.704648971557617, "geo/layer_0/stable_rank_k_proj": 15.684195518493652, "geo/layer_0/stable_rank_o_proj": 46.28178405761719, "geo/layer_0/stable_rank_gate_proj": 127.03201293945312, "geo/layer_0/stable_rank_down_proj": 56.834014892578125, "geo/layer_0/dead_units": 0.0, "geo/layer_0/anisotropy": 0.06595169752836227, "geo/layer_0/attn_entropy_mean": 6.127553939819336, "geo/layer_0/attn_entropy_std": 0.4267227351665497, "geo/layer_7/stable_rank_q_proj": 42.533721923828125, "geo/layer_7/stable_rank_k_proj": 40.0423698425293, "geo/layer_7/stable_rank_o_proj": 87.90637969970703, "geo/layer_7/stable_rank_gate_proj": 76.83291625976562, "geo/layer_7/stable_rank_down_proj": 140.354248046875, "geo/layer_7/dead_units": 0.0, "geo/layer_7/anisotropy": 0.4518471956253052, "geo/layer_7/attn_entropy_mean": 4.640636444091797, "geo/layer_7/attn_entropy_std": 0.7661572694778442, "geo/layer_14/stable_rank_q_proj": 49.10459899902344, "geo/layer_14/stable_rank_k_proj": 41.484092712402344, "geo/layer_14/stable_rank_o_proj": 43.08280563354492, "geo/layer_14/stable_rank_gate_proj": 70.23907470703125, "geo/layer_14/stable_rank_down_proj": 124.99420166015625, "geo/layer_14/dead_units": 0.0, "geo/layer_14/anisotropy": 0.41266384720802307, "geo/layer_14/attn_entropy_mean": 5.494502067565918, "geo/layer_14/attn_entropy_std": 0.40957632660865784, "geo/layer_21/stable_rank_q_proj": 39.38599395751953, "geo/layer_21/stable_rank_k_proj": 30.022586822509766, "geo/layer_21/stable_rank_o_proj": 67.86451721191406, "geo/layer_21/stable_rank_gate_proj": 62.88326644897461, "geo/layer_21/stable_rank_down_proj": 49.24200439453125, "geo/layer_21/dead_units": 0.0, "geo/layer_21/anisotropy": 0.1480329930782318, "geo/layer_21/attn_entropy_mean": 5.675074577331543, "geo/layer_21/attn_entropy_std": 0.2872699499130249, "geo/layer_27/stable_rank_q_proj": 43.78957748413086, "geo/layer_27/stable_rank_k_proj": 32.33627700805664, "geo/layer_27/stable_rank_o_proj": 115.29508972167969, "geo/layer_27/stable_rank_gate_proj": 76.21263122558594, "geo/layer_27/stable_rank_down_proj": 127.76644897460938, "geo/layer_27/dead_units": 0.0, "geo/layer_27/anisotropy": 0.09049651771783829, "geo/layer_27/attn_entropy_mean": 4.141091346740723, "geo/layer_27/attn_entropy_std": 0.794951856136322, "attnres/final_alpha/block_0": 0.2385348379611969, "attnres/block_norm/0": 1.7785837650299072, "attnres/final_alpha/block_1": 0.00392057653516531, "attnres/block_norm/1": 47388.8984375, "attnres/final_alpha/block_2": 0.009043690748512745, "attnres/block_norm/2": 29154.35546875, "attnres/final_alpha/block_3": 0.010553751140832901, "attnres/block_norm/3": 63874.33203125, "attnres/final_alpha/block_4": 0.012455714866518974, "attnres/block_norm/4": 15726.5966796875, "attnres/final_alpha/block_5": 0.6182796359062195, "attnres/block_norm/5": 6806.21484375, "attnres/final_alpha/block_6": 0.10721176862716675, "attnres/block_norm/6": 40999.625, "geo/tier1_time_s": 1.3603646755218506, "geo/step": 81225.0, "geo/rankme_slope": 0.00010781490330507203} {"step": 81230, "timestamp": 1778282345.2761438, "train/loss": 2.005422019958496, "train/z_loss": 0.0014201491372659802, "train/perplexity": 7.429228517985759, "train/grad_norm": 0.076171875, "optim/muon_lr": 3.957748413085938e-07, "optim/adamw_lr": 1.1873245239257812e-08, "perf/tokens_per_sec": 1701070.0755152253, "perf/iters_per_sec": 0.8111334207130553, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.2328428030014038, "data/tokens_consumed": 170353754112, "data/tokens_consumed_B": 170.353754112, "train/loss_slope": -3.59936608661591e-05} {"step": 81240, "timestamp": 1778282355.626485, "train/loss": 2.139371728897095, "train/z_loss": 0.0014093180885538458, "train/perplexity": 8.49409935495017, "train/grad_norm": 0.0673828125, "optim/muon_lr": 1.2636184692382813e-07, "optim/adamw_lr": 3.790855407714843e-09, "perf/tokens_per_sec": 2027368.5456838447, "perf/iters_per_sec": 0.9667246559542869, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0344207048416139, "data/tokens_consumed": 170374725632, "data/tokens_consumed_B": 170.374725632, "train/loss_slope": -3.305920438654887e-05} {"step": 81250, "timestamp": 1778282365.9667315, "grad/layer_0/attn": 0.0022056971210986376, "grad/layer_0/mlp": 0.0026227300986647606, "grad/layer_0/attn_mlp_ratio": 0.8409927647996597, "grad/layer_4/attn": 0.0020510542672127485, "grad/layer_4/mlp": 0.0024648746475577354, "grad/layer_4/attn_mlp_ratio": 0.8321129782537585, "grad/layer_8/attn": 0.0030965711921453476, "grad/layer_8/mlp": 0.0032765225041657686, "grad/layer_8/attn_mlp_ratio": 0.9450785379012399, "grad/layer_12/attn": 0.005163182038813829, "grad/layer_12/mlp": 0.006514118518680334, "grad/layer_12/attn_mlp_ratio": 0.792614065087414, "grad/layer_16/attn": 0.0033061434514820576, "grad/layer_16/mlp": 0.00452297693118453, "grad/layer_16/attn_mlp_ratio": 0.7309662261575107, "grad/layer_20/attn": 0.0024994865525513887, "grad/layer_20/mlp": 0.004650898277759552, "grad/layer_20/attn_mlp_ratio": 0.5374201604799309, "grad/layer_24/attn": 0.0035405647940933704, "grad/layer_24/mlp": 0.006356518715620041, "grad/layer_24/attn_mlp_ratio": 0.556997453605116, "grad/layer_27/attn": 0.0032146177254617214, "grad/layer_27/mlp": 0.005516473203897476, "grad/layer_27/attn_mlp_ratio": 0.582730586802721} {"step": 81250, "timestamp": 1778282365.982511, "train/loss": 2.1268896102905273, "train/z_loss": 0.0014010570244863628, "train/perplexity": 8.388733959049393, "train/grad_norm": 0.06787109375, "optim/muon_lr": 6.556510925292969e-09, "optim/adamw_lr": 1.9669532775878904e-10, "perf/tokens_per_sec": 2026069.9238739226, "perf/iters_per_sec": 0.966105424820863, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0350837230682373, "data/tokens_consumed": 170395697152, "data/tokens_consumed_B": 170.395697152, "train/loss_slope": -3.010362852739207e-05} {"step": 81252, "timestamp": 1778282368.0559435, "train/loss": 2.0812368392944336, "train/z_loss": 0.0014035851345397532, "train/perplexity": 8.014375281009562, "train/grad_norm": 0.06884765625, "optim/muon_lr": 5.960464477539063e-10, "optim/adamw_lr": 1.7881393432617186e-11, "perf/tokens_per_sec": 2027174.5970256436, "perf/iters_per_sec": 0.966632174027273, "perf/gpu_mem_gb": 78.330014208, "perf/step_time_s": 1.0345196723937988, "data/tokens_consumed": 170399891456, "data/tokens_consumed_B": 170.399891456, "train/loss_slope": -2.9558463845599375e-05}